[PATCH RFC 8/9] filter_seccomp: binary match generation strategy

Paul Chaignon paul.chaignon at gmail.com
Fri Aug 23 09:44:21 UTC 2019


This commit introduces a new BPF program generation strategy.  Traced
syscalls are encoded as 32-bit bit arrays in the BPF program.  Syscalls
are then matched against those bit arrays at runtime with two ALU
operations: a division to select the appropriate bit array to compare
against and a shift to select the appropriate bit in that array.
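
In C terms, the match the generated program performs amounts to the
following (an illustrative sketch, not code from this patch; the
bitarrays name is made up):

    #include <stdbool.h>
    #include <stdint.h>

    /* One 32-bit word per group of 32 syscall numbers; bit (nr % 32)
     * of word (nr / 32) is set iff syscall nr is traced. */
    static bool
    syscall_is_traced(const uint32_t *bitarrays, unsigned int nr)
    {
            return bitarrays[nr / 32] & (1U << (nr % 32));
    }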

Since there is no way to implement a jump table in BPF (jumps have
fixed offsets), we have to iterate over all bit arrays to find the
appropriate one.  The division and modulo are converted into a shift
and a bitwise AND respectively, both to improve performance and because
seccomp-bpf disallows the modulo operation in BPF.
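
Concretely, nr / 32 becomes nr >> 5, nr % 32 becomes nr & 0x1F, and the
indexed load from the sketch above turns into an unrolled scan.
Roughly what the generated instructions compute, again as illustrative
C (nwords, bitarrays, and the return values are placeholders):

    uint32_t idx  = nr >> 5;            /* nr / 32: which bit array */
    uint32_t mask = 1U << (nr & 0x1F);  /* nr % 32: which bit */

    /* Seccomp BPF has no indexed loads from a table: each bit array
     * is an immediate operand of a jset, so the comparison is
     * unrolled over all words. */
    for (uint32_t i = 0; i < nwords; ++i)
            if (idx == i)
                    return (mask & bitarrays[i]) ? SECCOMP_RET_TRACE
                                                 : SECCOMP_RET_ALLOW;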

Compared to the linear generation strategy, this strategy generates
programs of near-constant size.  There is a single optimization that
depends on the traced syscalls: if a bit array is all-0 or all-1, we
don't need a jset against it and can jump straight to RET_ALLOW or
RET_TRACE.  With this strategy, however, programs are generally longer
than with the linear strategy, since more pre-processing is required.
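
As a rough size estimate (my numbers, for illustration only): an
architecture with about 430 syscall numbers needs ceil(430 / 32) = 14
bit arrays, and each bit array costs at most three instructions (jeq,
txa, jset), so the matching core stays around 42 instructions per
personality regardless of which syscalls are traced.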

* filter_seccomp.c (JMP_PLACEHOLDER_ALLOW): New constant.
(binary_match_filter_generator): New prototype.
(filter_generators): Add binary_match_filter_generator.
(replace_jmp_placeholders): Handle JMP_PLACEHOLDER_ALLOW case.
(__linear_filter_generator): Pass new argument to replace_jmp_placeholders.
(bpf_syscalls_match, binary_match_filter_generator): New functions.
(dump_seccomp_bpf): Handle ldwimm, jset, rsh, lsh, and, tax, and txa
instructions.

Signed-off-by: Paul Chaignon <paul.chaignon at gmail.com>
---
 filter_seccomp.c | 168 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 165 insertions(+), 3 deletions(-)

diff --git a/filter_seccomp.c b/filter_seccomp.c
index 22a011bc..e5bc2003 100644
--- a/filter_seccomp.c
+++ b/filter_seccomp.c
@@ -27,6 +27,7 @@
 
 #define JMP_PLACEHOLDER_NEXT  ((unsigned char) -1)
 #define JMP_PLACEHOLDER_TRACE ((unsigned char) -2)
+#define JMP_PLACEHOLDER_ALLOW ((unsigned char) -3)
 
 #define SET_BPF(filter, code, jt, jf, k) \
 	(*(filter) = (struct sock_filter) { code, jt, jf, k })
@@ -58,9 +59,12 @@ static unsigned short linear_filter_generator(struct sock_filter *,
 					      bool *overflow);
 static unsigned short reverse_linear_filter_generator(struct sock_filter *,
 						      bool *overflow);
+static unsigned short binary_match_filter_generator(struct sock_filter *,
+						    bool *overflow);
 static filter_generator_t filter_generators[] = {
 	linear_filter_generator,
 	reverse_linear_filter_generator,
+	binary_match_filter_generator,
 };
 
 bool seccomp_filtering = false;
@@ -173,7 +177,7 @@ traced_by_seccomp(unsigned int scno, unsigned int p)
 
 static void
 replace_jmp_placeholders(unsigned char *jmp_offset, unsigned char jmp_next,
-			 unsigned char jmp_trace) {
+			 unsigned char jmp_trace, unsigned char jmp_allow) {
 	switch (*jmp_offset) {
 	case JMP_PLACEHOLDER_NEXT:
 		*jmp_offset = jmp_next;
@@ -181,6 +185,9 @@ replace_jmp_placeholders(unsigned char *jmp_offset, unsigned char jmp_next,
 	case JMP_PLACEHOLDER_TRACE:
 		*jmp_offset = jmp_trace;
 		break;
+	case JMP_PLACEHOLDER_ALLOW:
+		*jmp_offset = jmp_allow;
+		break;
 	default:
 		break;
 	}
@@ -319,9 +326,9 @@ __linear_filter_generator(struct sock_filter *filter, bool *overflow,
 			unsigned char jmp_match = match_traced ?
 						  pos - i - 2 : pos - i - 3;
 			replace_jmp_placeholders(&filter[i].jt, jmp_next,
-						 jmp_match);
+						 jmp_match, 0);
 			replace_jmp_placeholders(&filter[i].jf, jmp_next,
-						 jmp_match);
+						 jmp_match, 0);
 			if (BPF_OP(filter[i].code) == BPF_JA)
 				filter[i].k = (unsigned int) jmp_next;
 		}
@@ -345,6 +352,138 @@ reverse_linear_filter_generator(struct sock_filter *filter, bool *overflow) {
 	return __linear_filter_generator(filter, overflow, false);
 }
 
+static unsigned short
+bpf_syscalls_match(struct sock_filter *filter, unsigned int bitarray,
+		   unsigned int bitarray_idx)
+{
+	if (!bitarray) {
+		/* if (nr / 32 == bitarray_idx) return RET_ALLOW; */
+		SET_BPF_JUMP(filter, BPF_JMP | BPF_JEQ | BPF_K, bitarray_idx,
+			     JMP_PLACEHOLDER_ALLOW, 0);
+		return 1;
+	}
+	if (bitarray == UINT_MAX) {
+		/* if (nr / 32 == bitarray_idx) return RET_TRACE; */
+		SET_BPF_JUMP(filter, BPF_JMP | BPF_JEQ | BPF_K, bitarray_idx,
+			     JMP_PLACEHOLDER_TRACE, 0);
+		return 1;
+	}
+	/*
+	 * if (nr / 32 == bitarray_idx)
+	 *   return (X & bitarray) ? RET_TRACE : RET_ALLOW;
+	 */
+	SET_BPF_JUMP(filter, BPF_JMP | BPF_JEQ | BPF_K, bitarray_idx,
+		     0, 2);
+	SET_BPF_STMT(filter + 1, BPF_MISC | BPF_TXA, 0);
+	SET_BPF_JUMP(filter + 2, BPF_JMP | BPF_JSET | BPF_K, bitarray,
+		     JMP_PLACEHOLDER_TRACE, JMP_PLACEHOLDER_ALLOW);
+	return 3;
+}
+
+static unsigned short
+binary_match_filter_generator(struct sock_filter *filter, bool *overflow)
+{
+	unsigned short pos = 0;
+
+#if SUPPORTED_PERSONALITIES > 1
+	SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
+		     offsetof(struct seccomp_data, arch));
+#endif
+
+	/* Personalities are iterated in reverse order in the BPF program so
+	 * that the x86 case is handled naturally.  On x86, the first and third
+	 * personalities have the same arch identifier; the third can be
+	 * distinguished based on its associated bit mask, so we check it first.
+	 * The only drawback is that the first personality is the most common,
+	 * which may make the BPF program slower to match syscalls on average. */
+	for (int p = SUPPORTED_PERSONALITIES - 1;
+		 p >= 0 && pos <= BPF_MAXINSNS;
+		 --p) {
+		unsigned short start = pos, end;
+		unsigned int bitarray = 0;
+		unsigned int i;
+
+#if SUPPORTED_PERSONALITIES > 1
+		SET_BPF_JUMP(&filter[pos++], BPF_JMP | BPF_JEQ | BPF_K,
+			     audit_arch_vec[p].arch, 0, JMP_PLACEHOLDER_NEXT);
+#endif
+		SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
+			     offsetof(struct seccomp_data, nr));
+
+#if SUPPORTED_PERSONALITIES > 1
+		if (audit_arch_vec[p].flag) {
+			SET_BPF_JUMP(&filter[pos++], BPF_JMP | BPF_JGE | BPF_K,
+				     audit_arch_vec[p].flag, 2, 0);
+			SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
+				     offsetof(struct seccomp_data, arch));
+			SET_BPF_JUMP(&filter[pos++], BPF_JMP | BPF_JA,
+				     JMP_PLACEHOLDER_NEXT, 0, 0);
+
+			/* nr = nr & ~mask */
+			SET_BPF_STMT(&filter[pos++], BPF_ALU | BPF_AND | BPF_K,
+				     ~audit_arch_vec[p].flag);
+		}
+#endif
+
+		/* X = 1 << (nr % 32) = 1 << (nr & 0x1F); */
+		SET_BPF_STMT(&filter[pos++], BPF_ALU | BPF_AND | BPF_K, 0x1F);
+		SET_BPF_STMT(&filter[pos++], BPF_MISC | BPF_TAX, 0);
+		SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_IMM, 1);
+		SET_BPF_STMT(&filter[pos++], BPF_ALU | BPF_LSH | BPF_X, 0);
+		SET_BPF_STMT(&filter[pos++], BPF_MISC | BPF_TAX, 0);
+
+		/* A = nr / 32 = nr >> 5; */
+		SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
+			     offsetof(struct seccomp_data, nr));
+		if (audit_arch_vec[p].flag) {
+			/* nr = nr & ~mask */
+			SET_BPF_STMT(&filter[pos++], BPF_ALU | BPF_AND | BPF_K,
+				     ~audit_arch_vec[p].flag);
+		}
+		SET_BPF_STMT(&filter[pos++], BPF_ALU | BPF_RSH | BPF_K, 5);
+
+		for (i = 0; i < nsyscall_vec[p] && pos <= BPF_MAXINSNS; ++i) {
+			if (traced_by_seccomp(i, p))
+				bitarray |= (1 << (i % 32));
+			if (i % 32 == 31) {
+				pos += bpf_syscalls_match(filter + pos,
+							  bitarray, i / 32);
+				bitarray = 0;
+			}
+		}
+		if (i % 32 != 0)
+			pos += bpf_syscalls_match(filter + pos, bitarray,
+						  i / 32);
+
+		end = pos;
+
+		SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
+			     SECCOMP_RET_ALLOW);
+		SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
+			     SECCOMP_RET_TRACE);
+
+		for (unsigned int i = start; i < end; ++i) {
+			if (BPF_CLASS(filter[i].code) != BPF_JMP)
+				continue;
+			unsigned char jmp_next = pos - i - 1;
+			unsigned char jmp_trace = pos - i - 2;
+			unsigned char jmp_allow = pos - i - 3;
+			replace_jmp_placeholders(&filter[i].jt, jmp_next,
+						 jmp_trace, jmp_allow);
+			replace_jmp_placeholders(&filter[i].jf, jmp_next,
+						 jmp_trace, jmp_allow);
+			if (BPF_OP(filter[i].code) == BPF_JA)
+				filter[i].k = (unsigned int)jmp_next;
+		}
+	}
+
+#if SUPPORTED_PERSONALITIES > 1
+	SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K, SECCOMP_RET_TRACE);
+#endif
+
+	return pos;
+}
+
 void
 check_seccomp_filter(void)
 {
@@ -418,6 +557,9 @@ dump_seccomp_bpf(void)
 					  filter[i].k);
 			}
 			break;
+		case BPF_LD | BPF_W | BPF_IMM:
+			error_msg("STMT(BPF_LDWIMM, 0x%x)", filter[i].k);
+			break;
 		case BPF_RET | BPF_K:
 			switch (filter[i].k) {
 			case SECCOMP_RET_TRACE:
@@ -440,9 +582,29 @@ dump_seccomp_bpf(void)
 				  filter[i].jt, filter[i].jf,
 				  filter[i].k);
 			break;
+		case BPF_JMP | BPF_JSET | BPF_K:
+			error_msg("JUMP(BPF_JSET, %u, %u, 0x%x)",
+				  filter[i].jt, filter[i].jf,
+				  filter[i].k);
+			break;
 		case BPF_JMP | BPF_JA:
 			error_msg("JUMP(BPF_JA, %u)", filter[i].k);
 			break;
+		case BPF_ALU | BPF_RSH | BPF_K:
+			error_msg("STMT(BPF_RSH, %u)", filter[i].k);
+			break;
+		case BPF_ALU | BPF_LSH | BPF_X:
+			error_msg("STMT(BPF_LSH, X)");
+			break;
+		case BPF_ALU | BPF_AND | BPF_K:
+			error_msg("STMT(BPF_AND, 0x%x)", filter[i].k);
+			break;
+		case BPF_MISC | BPF_TAX:
+			error_msg("STMT(BPF_TAX)");
+			break;
+		case BPF_MISC | BPF_TXA:
+			error_msg("STMT(BPF_TXA)");
+			break;
 		default:
 			error_msg("STMT(0x%x, %u, %u, 0x%x)", filter[i].code,
 				  filter[i].jt, filter[i].jf, filter[i].k);
-- 
2.17.1


