Browse Source

filter_seccomp: binary match generation strategy

This commit introduces a new BPF program generation strategy.  Traced
syscalls are encoded in 32-bit bit arrays in the BPF program.  Syscalls
are then matched against bit arrays at runtime with two ALU operations:
a division to select the appropriate bit array to compare with,
and a shift to select the appropriate bit in the bit array.

Since there is no way to implement a jump table in BPF (jumps have fixed
offsets), we have to iterate over all bit arrays to select the appropriate
bit array.  The division and modulo are implemented as a right shift and
a bitwise AND, respectively, both to improve performance and because
seccomp-bpf does not allow the modulo operation in BPF programs.

Compared to the linear generation strategy, this strategy generates
programs of near constant size.  There is a single optimization that
depends on traced syscalls: if a bit array is all-0 or all-1, we don't
need to do a jset against it, we can simply jump to either RET_ALLOW
or RET_TRACE.

* filter_seccomp.c (JMP_PLACEHOLDER_ALLOW): New constant.
(binary_match_filter_generator): New prototype.
(filter_generators): Add binary_match_filter_generator.
(replace_jmp_placeholders): Handle JMP_PLACEHOLDER_ALLOW case.
(linear_filter_generator): New argument for replace_jmp_placeholders.
(bpf_syscalls_match, binary_match_filter_generator): New functions.
(dump_seccomp_bpf): Handle ldwimm, jset, rsh, lsh, and, tax, and txa
instructions.

Signed-off-by: Paul Chaignon <paul.chaignon@gmail.com>
Paul Chaignon 3 months ago
parent
commit
4e72a134ff
1 changed files with 166 additions and 3 deletions
  1. 166
    3
      filter_seccomp.c

+ 166
- 3
filter_seccomp.c View File

@@ -42,6 +42,7 @@ bool seccomp_before_sysentry;
42 42
 
43 43
 # define JMP_PLACEHOLDER_NEXT  ((unsigned char) -1)
44 44
 # define JMP_PLACEHOLDER_TRACE ((unsigned char) -2)
45
+# define JMP_PLACEHOLDER_ALLOW ((unsigned char) -3)
45 46
 
46 47
 # define SET_BPF(filter, code, jt, jf, k) \
47 48
 	(*(filter) = (struct sock_filter) { code, jt, jf, k })
@@ -77,8 +78,11 @@ typedef unsigned short (*filter_generator_t)(struct sock_filter *,
77 78
 					     bool *overflow);
78 79
 static unsigned short linear_filter_generator(struct sock_filter *,
79 80
 					      bool *overflow);
81
+static unsigned short binary_match_filter_generator(struct sock_filter *,
82
+						    bool *overflow);
80 83
 static filter_generator_t filter_generators[] = {
81 84
 	linear_filter_generator,
85
+	binary_match_filter_generator,
82 86
 };
83 87
 
84 88
 /*
@@ -295,7 +299,7 @@ traced_by_seccomp(unsigned int scno, unsigned int p)
295 299
 
296 300
 static void
297 301
 replace_jmp_placeholders(unsigned char *jmp_offset, unsigned char jmp_next,
298
-			 unsigned char jmp_trace)
302
+			 unsigned char jmp_trace, unsigned char jmp_allow)
299 303
 {
300 304
 	switch (*jmp_offset) {
301 305
 	case JMP_PLACEHOLDER_NEXT:
@@ -304,6 +308,9 @@ replace_jmp_placeholders(unsigned char *jmp_offset, unsigned char jmp_next,
304 308
 	case JMP_PLACEHOLDER_TRACE:
305 309
 		*jmp_offset = jmp_trace;
306 310
 		break;
311
+	case JMP_PLACEHOLDER_ALLOW:
312
+		*jmp_offset = jmp_allow;
313
+		break;
307 314
 	default:
308 315
 		break;
309 316
 	}
@@ -439,10 +446,11 @@ linear_filter_generator(struct sock_filter *filter, bool *overflow)
439 446
 				continue;
440 447
 			unsigned char jmp_next = pos - i - 1;
441 448
 			unsigned char jmp_trace = pos - i - 2;
449
+			unsigned char jmp_allow = pos - i - 3;
442 450
 			replace_jmp_placeholders(&filter[i].jt, jmp_next,
443
-						 jmp_trace);
451
+						 jmp_trace, jmp_allow);
444 452
 			replace_jmp_placeholders(&filter[i].jf, jmp_next,
445
-						 jmp_trace);
453
+						 jmp_trace, jmp_allow);
446 454
 			if (BPF_OP(filter[i].code) == BPF_JA)
447 455
 				filter[i].k = (unsigned int) jmp_next;
448 456
 		}
@@ -456,6 +464,138 @@ linear_filter_generator(struct sock_filter *filter, bool *overflow)
456 464
 	return pos;
457 465
 }
458 466
 
467
+static unsigned short
468
+bpf_syscalls_match(struct sock_filter *filter, unsigned int bitarray,
469
+		   unsigned int bitarray_idx)
470
+{
471
+	if (!bitarray) {
472
+		/* return RET_ALLOW; */
473
+		SET_BPF_JUMP(filter, BPF_JMP | BPF_JEQ | BPF_K, bitarray_idx,
474
+			     JMP_PLACEHOLDER_ALLOW, 0);
475
+		return 1;
476
+	}
477
+	if (bitarray == UINT_MAX) {
478
+		/* return RET_TRACE; */
479
+		SET_BPF_JUMP(filter, BPF_JMP | BPF_JEQ | BPF_K, bitarray_idx,
480
+			     JMP_PLACEHOLDER_TRACE, 0);
481
+		return 1;
482
+	}
483
+	/*
484
+	 * if (A == nr / 32)
485
+	 *   return (X & bitarray) ? RET_TRACE : RET_ALLOW;
486
+	 */
487
+	SET_BPF_JUMP(filter, BPF_JMP | BPF_JEQ | BPF_K, bitarray_idx,
488
+		     0, 2);
489
+	SET_BPF_STMT(filter + 1, BPF_MISC | BPF_TXA, 0);
490
+	SET_BPF_JUMP(filter + 2, BPF_JMP | BPF_JSET | BPF_K, bitarray,
491
+		     JMP_PLACEHOLDER_TRACE, JMP_PLACEHOLDER_ALLOW);
492
+	return 3;
493
+}
494
+
495
+static unsigned short
496
+binary_match_filter_generator(struct sock_filter *filter, bool *overflow)
497
+{
498
+	unsigned short pos = 0;
499
+
500
+#if SUPPORTED_PERSONALITIES > 1
501
+	SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
502
+		     offsetof(struct seccomp_data, arch));
503
+#endif
504
+
505
+	/* Personalities are iterated in reverse-order in the BPF program so that
506
+	 * the x86 case is naturally handled.  In x86, the first and third
507
+	 * personalities have the same arch identifier.  The third can be
508
+	 * distinguished based on its associated bit mask, so we check it first.
509
+	 * The only drawback here is that the first personality is more common,
510
+	 * which may make the BPF program slower to match syscalls on average. */
511
+	for (int p = SUPPORTED_PERSONALITIES - 1;
512
+		 p >= 0 && pos <= BPF_MAXINSNS;
513
+		 --p) {
514
+		unsigned short start = pos, end;
515
+		unsigned int bitarray = 0;
516
+		unsigned int i;
517
+
518
+#if SUPPORTED_PERSONALITIES > 1
519
+		SET_BPF_JUMP(&filter[pos++], BPF_JMP | BPF_JEQ | BPF_K,
520
+			     audit_arch_vec[p].arch, 0, JMP_PLACEHOLDER_NEXT);
521
+#endif
522
+		SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
523
+			     offsetof(struct seccomp_data, nr));
524
+
525
+#if SUPPORTED_PERSONALITIES > 1
526
+		if (audit_arch_vec[p].flag) {
527
+			SET_BPF_JUMP(&filter[pos++], BPF_JMP | BPF_JGE | BPF_K,
528
+				     audit_arch_vec[p].flag, 2, 0);
529
+			SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
530
+				     offsetof(struct seccomp_data, arch));
531
+			SET_BPF_JUMP(&filter[pos++], BPF_JMP | BPF_JA,
532
+				     JMP_PLACEHOLDER_NEXT, 0, 0);
533
+
534
+			/* nr = nr & ~mask */
535
+			SET_BPF_STMT(&filter[pos++], BPF_ALU | BPF_AND | BPF_K,
536
+				     ~audit_arch_vec[p].flag);
537
+		}
538
+#endif
539
+
540
+		/* X = 1 << nr % 32 = 1 << nr & 0x1F; */
541
+		SET_BPF_STMT(&filter[pos++], BPF_ALU | BPF_AND | BPF_K, 0x1F);
542
+		SET_BPF_STMT(&filter[pos++], BPF_MISC | BPF_TAX, 0);
543
+		SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_IMM, 1);
544
+		SET_BPF_STMT(&filter[pos++], BPF_ALU | BPF_LSH | BPF_X, 0);
545
+		SET_BPF_STMT(&filter[pos++], BPF_MISC | BPF_TAX, 0);
546
+
547
+		/* A = nr / 32 = n >> 5; */
548
+		SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
549
+			     offsetof(struct seccomp_data, nr));
550
+		if (audit_arch_vec[p].flag) {
551
+			/* nr = nr & ~mask */
552
+			SET_BPF_STMT(&filter[pos++], BPF_ALU | BPF_AND | BPF_K,
553
+				     ~audit_arch_vec[p].flag);
554
+		}
555
+		SET_BPF_STMT(&filter[pos++], BPF_ALU | BPF_RSH | BPF_K, 5);
556
+
557
+		for (i = 0; i < nsyscall_vec[p] && pos <= BPF_MAXINSNS; ++i) {
558
+			if (traced_by_seccomp(i, p))
559
+				bitarray |= (1 << i % 32);
560
+			if (i % 32 == 31) {
561
+				pos += bpf_syscalls_match(filter + pos,
562
+							  bitarray, i / 32);
563
+				bitarray = 0;
564
+			}
565
+		}
566
+		if (i % 32 != 0)
567
+			pos += bpf_syscalls_match(filter + pos, bitarray,
568
+						  i / 32);
569
+
570
+		end = pos;
571
+
572
+		SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
573
+			     SECCOMP_RET_ALLOW);
574
+		SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
575
+			     SECCOMP_RET_TRACE);
576
+
577
+		for (unsigned int i = start; i < end; ++i) {
578
+			if (BPF_CLASS(filter[i].code) != BPF_JMP)
579
+				continue;
580
+			unsigned char jmp_next = pos - i - 1;
581
+			unsigned char jmp_trace = pos - i - 2;
582
+			unsigned char jmp_allow = pos - i - 3;
583
+			replace_jmp_placeholders(&filter[i].jt, jmp_next,
584
+						 jmp_trace, jmp_allow);
585
+			replace_jmp_placeholders(&filter[i].jf, jmp_next,
586
+						 jmp_trace, jmp_allow);
587
+			if (BPF_OP(filter[i].code) == BPF_JA)
588
+				filter[i].k = (unsigned int)jmp_next;
589
+		}
590
+	}
591
+
592
+#if SUPPORTED_PERSONALITIES > 1
593
+	SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K, SECCOMP_RET_TRACE);
594
+#endif
595
+
596
+	return pos;
597
+}
598
+
459 599
 static void
460 600
 check_seccomp_filter_properties(void)
461 601
 {
@@ -509,6 +649,9 @@ dump_seccomp_bpf(void)
509 649
 					  filter[i].k);
510 650
 			}
511 651
 			break;
652
+		case BPF_LD + BPF_W + BPF_IMM:
653
+			error_msg("STMT(BPF_LDWIMM, 0x%x)", filter[i].k);
654
+			break;
512 655
 		case BPF_RET | BPF_K:
513 656
 			switch (filter[i].k) {
514 657
 			case SECCOMP_RET_TRACE:
@@ -531,9 +674,29 @@ dump_seccomp_bpf(void)
531 674
 				  filter[i].jt, filter[i].jf,
532 675
 				  filter[i].k);
533 676
 			break;
677
+		case BPF_JMP + BPF_JSET + BPF_K:
678
+			error_msg("JUMP(BPF_JSET, %u, %u, 0x%x)",
679
+				  filter[i].jt, filter[i].jf,
680
+				  filter[i].k);
681
+			break;
534 682
 		case BPF_JMP | BPF_JA:
535 683
 			error_msg("JUMP(BPF_JA, %u)", filter[i].k);
536 684
 			break;
685
+		case BPF_ALU + BPF_RSH + BPF_K:
686
+			error_msg("STMT(BPF_RSH, %u)", filter[i].k);
687
+			break;
688
+		case BPF_ALU + BPF_LSH + BPF_X:
689
+			error_msg("STMT(BPF_LSH, X)");
690
+			break;
691
+		case BPF_ALU + BPF_AND + BPF_K:
692
+			error_msg("STMT(BPF_AND, 0x%x)", filter[i].k);
693
+			break;
694
+		case BPF_MISC + BPF_TAX:
695
+			error_msg("STMT(BPF_TAX)");
696
+			break;
697
+		case BPF_MISC + BPF_TXA:
698
+			error_msg("STMT(BPF_TXA)");
699
+			break;
537 700
 		default:
538 701
 			error_msg("STMT(0x%x, %u, %u, 0x%x)", filter[i].code,
539 702
 				  filter[i].jt, filter[i].jf, filter[i].k);

Loading…
Cancel
Save