[PATCH RFC 2/9] Introduce seccomp-assisted syscall filtering

Fri Aug 23 09:42:44 UTC 2019

From: Chen Jingpiao <chenjingpiao at gmail.com>

With this patch, strace can rely on seccomp to only be stopped at syscalls
of interest, instead of stopping at all syscalls.  The seccomp filtering
of syscalls is opt-in only; it must be enabled with the -n option.  Kernel
support is first checked with check_seccomp_filter(), which also ensures
the BPF program derived from the syscalls to filter is not too larger than
the kernel's limit.

The -n option implies -f, but a warning is emitted if -f is not explicitly
specified.  Since a task's children inherit its seccomp filters, we want
to ensure all children are also traced to avoid their syscalls failing
with ENOSYS (cf. SECCOMP_RET_TRACE in seccomp man page).

The current BPF program implements a simple linear match of the syscall
numbers.  It can be improved in the future without impacting user-observed
behavior.

The behavior of SECCOMP_RET_TRACE changed between Linux 4.7 and 4.8 (cf.
PTRACE_EVENT_SECCOMP in ptrace man page).  This patch supports both
behaviors by checking the kernel's actual behavior before installing the
seccomp filter.

* filter_seccomp.c: New file.
* filter_seccomp.h: New file.
* Makefile.am (strace_SOURCES): Add filter_seccomp.c and filter_seccomp.h.
* linux/aarch64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Define for aarch64.
* linux/powerpc64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for powerpc64.
* linux/riscv/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for riscv.
* linux/s390x/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
* linux/sparc64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for sparc64.
PERSONALITY1_AUDIT_ARCH): Likewise for s390x.
* linux/tile/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for tile.
* linux/x32/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for x32.
* linux/x86_64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH, PERSONALITY2_AUDIT_ARCH): Likewise for x86_64.
* linux/ia64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH): Likewise for IA64.
* strace.c (usage): Document -n option.
(exec_or_die): Initialize seccomp filtering if requested.
(init): Handle -n option and check that seccomp can be enabled.
(print_debug_info): Handle PTRACE_EVENT_SECCOMP.
(next_event): Capture PTRACE_EVENT_SECCOMP event.
(dispatch_event): Handle PTRACE_EVENT_SECCOMP event.
* trace_event.h (trace_event): New enumeration entity.
* strace.1.in: Document new -n option.
* NEWS: Mention this change.

Co-authored-by: Paul Chaignon <paul.chaignon at gmail.com>
---
 Makefile.am                  |   2 +
 NEWS                         |   2 +
 filter_seccomp.c             | 486 +++++++++++++++++++++++++++++++++++
 filter_seccomp.h             |  21 ++
 linux/aarch64/arch_defs_.h   |   2 +
 linux/arch_defs_.h           |   4 +
 linux/ia64/arch_defs_.h      |   1 +
 linux/powerpc64/arch_defs_.h |   2 +
 linux/riscv/arch_defs_.h     |   2 +
 linux/s390x/arch_defs_.h     |   2 +
 linux/sparc64/arch_defs_.h   |   2 +
 linux/tile/arch_defs_.h      |   2 +
 linux/x32/arch_defs_.h       |   2 +
 linux/x86_64/arch_defs_.h    |   3 +
 strace.1.in                  |  17 +-
 strace.c                     |  76 +++++-
 trace_event.h                |   5 +
 17 files changed, 626 insertions(+), 5 deletions(-)
 create mode 100644 filter_seccomp.c
 create mode 100644 filter_seccomp.h

diff --git a/Makefile.am b/Makefile.am
index 8ce2147f..b0fef9c0 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -129,6 +129,8 @@ strace_SOURCES =	\
 	file_ioctl.c	\
 	filter.h	\
 	filter_qualify.c \
+	filter_seccomp.c \
+	filter_seccomp.h \
 	flock.c		\
 	flock.h		\
 	fs_x_ioctl.c	\
diff --git a/NEWS b/NEWS
index a23b361b..e2913785 100644
--- a/NEWS
+++ b/NEWS
@@ -7,6 +7,8 @@ Noteworthy changes in release ?.? (????-??-??)
   * Implemented decoding of UNIX_DIAG_UID netlink attribute.
   * Updated lists of BPF_*, ETH_*, KEYCTL_*, KVM_*, MAP_*, SO_*, TCP_*, V4L2_*,
     XDP_*, and *_MAGIC constants.
+  * Implemented filtering of system calls via seccomp-bpf.  Use -n option to
+    enable.
 
 * Bug fixes
   * Fixed syscall tampering on arc, avr32, csky, ia64, m68k, metag, mips,
diff --git a/filter_seccomp.c b/filter_seccomp.c
new file mode 100644
index 00000000..0a5bed53
--- /dev/null
+++ b/filter_seccomp.c
@@ -0,0 +1,486 @@
+/*
+ * Copyright (c) 2018 Chen Jingpiao <chenjingpiao at gmail.com>
+ * Copyright (c) 2019 Paul Chaignon <paul.chaignon at gmail.com>
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: LGPL-2.1-or-later
+ */
+
+#include "defs.h"
+
+#include "ptrace.h"
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <linux/audit.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h>
+#include <asm/unistd.h>
+#include <signal.h>
+
+#include "filter_seccomp.h"
+#include "number_set.h"
+#include "syscall.h"
+
+#ifndef BPF_MAXINSNS
+# define BPF_MAXINSNS 4096
+#endif
+
+#define JMP_PLACEHOLDER_NEXT  ((unsigned char) -1)
+#define JMP_PLACEHOLDER_TRACE ((unsigned char) -2)
+
+#define SET_BPF(filter, code, jt, jf, k) \
+	(*(filter) = (struct sock_filter) { code, jt, jf, k })
+
+#define SET_BPF_STMT(filter, code, k) \
+	SET_BPF(filter, code, 0, 0, k)
+
+#define SET_BPF_JUMP(filter, code, k, jt, jf) \
+	SET_BPF(filter, BPF_JMP | code, jt, jf, k)
+
+struct audit_arch_t {
+	unsigned int arch;
+	unsigned int flag;
+};
+
+static const struct audit_arch_t audit_arch_vec[SUPPORTED_PERSONALITIES] = {
+#if SUPPORTED_PERSONALITIES > 1
+	PERSONALITY0_AUDIT_ARCH,
+	PERSONALITY1_AUDIT_ARCH,
+# if SUPPORTED_PERSONALITIES > 2
+	PERSONALITY2_AUDIT_ARCH,
+# endif
+#endif
+};
+
+bool seccomp_filtering = false;
+bool seccomp_before_sysentry;
+
+static void
+check_seccomp_order_do_child(void)
+{
+	static struct sock_filter filter[] = {
+		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
+			 offsetof(struct seccomp_data, nr)),
+		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_gettid, 0, 1),
+		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRACE),
+		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW)
+	};
+	static const struct sock_fprog prog = {
+		.len = ARRAY_SIZE(filter),
+		.filter = filter
+	};
+
+	if (ptrace(PTRACE_TRACEME, 0, NULL, NULL) < 0)
+		perror_func_msg_and_die("ptrace(PTRACE_TRACEME, ...");
+	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
+		perror_func_msg_and_die("prctl(PR_SET_NO_NEW_PRIVS, 1, ...");
+	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0)
+		perror_func_msg_and_die("prctl(PR_SET_SECCOMP)");
+	kill(getpid(), SIGSTOP);
+	syscall(__NR_gettid);
+	pause();
+	_exit(0);
+}
+
+static int
+check_seccomp_order_tracer(int pid)
+{
+	int status, tracee_pid, step = 0;
+	unsigned int event;
+
+	while (1) {
+		errno = 0;
+		tracee_pid = waitpid(pid, &status, 0);
+		if (tracee_pid <= 0) {
+			if (errno == EINTR)
+				continue;
+			perror_func_msg("waitpid(%u)", pid);
+			return -1;
+		}
+		switch (step) {
+		case 0:
+			if (ptrace(PTRACE_SETOPTIONS, pid, 0,
+				   PTRACE_O_TRACESECCOMP) < 0) {
+				perror_func_msg("ptrace(PTRACE_SETOPTIONS)");
+				return -1;
+			}
+			if (ptrace(PTRACE_SYSCALL, pid, NULL, NULL) < 0) {
+				perror_func_msg("ptrace(PTRACE_SYSCALL)");
+				return -1;
+			}
+			break;
+		case 1:
+			event = (unsigned int) status >> 16;
+			seccomp_before_sysentry = event == PTRACE_EVENT_SECCOMP;
+			kill(pid, SIGKILL);
+			break;
+		default:
+			if (WIFSIGNALED(status))
+				return 0;
+
+			error_func_msg("unexpected wait status %#x", status);
+			return -1;
+		}
+		step++;
+	}
+	return 0;
+}
+
+static int
+check_seccomp_order(void)
+{
+	int pid;
+
+	pid = fork();
+	if (pid < 0) {
+		perror_func_msg("fork");
+		return -1;
+	}
+
+	if (pid == 0)
+		check_seccomp_order_do_child();
+
+	return check_seccomp_order_tracer(pid);
+}
+
+static bool
+traced_by_seccomp(unsigned int scno, unsigned int p)
+{
+	if (is_number_in_set_array(scno, trace_set, p)
+	    || sysent_vec[p][scno].sys_flags
+	    & (TRACE_INDIRECT_SUBCALL | TRACE_SECCOMP_DEFAULT))
+		return true;
+	return false;
+}
+
+static void
+check_bpf_program_size(void)
+{
+	unsigned int nb_insns = SUPPORTED_PERSONALITIES > 1 ? 1 : 0;
+
+	/*
+	 * Implements a simplified form of init_sock_filter()'s bytecode
+	 * generation algorithm, to count the number of instructions that will
+	 * be generated.
+	 */
+	for (int p = SUPPORTED_PERSONALITIES - 1;
+	     p >= 0 && nb_insns < BPF_MAXINSNS; --p) {
+		unsigned int nb_insns_personality = 0;
+		unsigned int lower = UINT_MAX;
+
+		nb_insns_personality++;
+#if SUPPORTED_PERSONALITIES > 1
+		nb_insns_personality++;
+		if (audit_arch_vec[p].flag)
+			nb_insns_personality += 3;
+#endif
+
+		for (unsigned int i = 0; i < nsyscall_vec[p]; ++i) {
+			if (traced_by_seccomp(i, p)) {
+				if (lower == UINT_MAX)
+					lower = i;
+				continue;
+			}
+			if (lower == UINT_MAX)
+				continue;
+			if (lower + 1 == i)
+				nb_insns_personality++;
+			else
+				nb_insns_personality += 2;
+			lower = UINT_MAX;
+		}
+		if (lower != UINT_MAX) {
+			if (lower + 1 == nsyscall_vec[p])
+				nb_insns_personality++;
+			else
+				nb_insns_personality += 2;
+		}
+
+		nb_insns_personality += 3;
+
+		/*
+		 * Within generated BPF programs, the origin and destination of
+		 * jumps are always in the same personality section.  The
+		 * largest jump is therefore the jump from the first
+		 * instruction of the section to the last, to skip the
+		 * personality and try to compare .arch to the next
+		 * personality.
+		 * If we have a personality section with more than 255
+		 * instructions, the jump offset will overflow.  Such program
+		 * is unlikely to happen, so we simply disable seccomp-filter
+		 * is such a case.
+		 */
+		if (nb_insns_personality > UCHAR_MAX) {
+			debug_msg("seccomp-filter disabled due to "
+				  "possibility of overflow");
+			seccomp_filtering = false;
+			return;
+		}
+		nb_insns += nb_insns_personality;
+	}
+
+#if SUPPORTED_PERSONALITIES > 1
+	nb_insns++;
+#endif
+
+	if (nb_insns > BPF_MAXINSNS) {
+		debug_msg("seccomp-filter disabled due to BPF program being "
+			  "oversized (%u > %d)", nb_insns, BPF_MAXINSNS);
+		seccomp_filtering = false;
+	}
+}
+
+void
+check_seccomp_filter(void)
+{
+	int rc;
+
+	if (!seccomp_filtering)
+		return;
+
+	if (NOMMU_SYSTEM) {
+		seccomp_filtering = false;
+		goto end;
+	}
+
+	rc = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0);
+	seccomp_filtering = rc >= 0 || errno != EINVAL;
+	if (seccomp_filtering)
+		check_bpf_program_size();
+	if (seccomp_filtering && check_seccomp_order() < 0)
+		seccomp_filtering = false;
+
+end:
+	if (!seccomp_filtering)
+		error_msg("seccomp-filter is requested but unavailable");
+}
+
+static void
+dump_seccomp_bpf(const struct sock_filter *filter, unsigned short len)
+{
+	for (unsigned int i = 0; i < len; ++i) {
+		switch (filter[i].code) {
+		case BPF_LD | BPF_W | BPF_ABS:
+			switch (filter[i].k) {
+			case offsetof(struct seccomp_data, arch):
+				error_msg("STMT(BPF_LDWABS, data->arch)");
+				break;
+			case offsetof(struct seccomp_data, nr):
+				error_msg("STMT(BPF_LDWABS, data->nr)");
+				break;
+			default:
+				error_msg("STMT(BPF_LDWABS, 0x%x)",
+					  filter[i].k);
+			}
+			break;
+		case BPF_RET | BPF_K:
+			switch (filter[i].k) {
+			case SECCOMP_RET_TRACE:
+				error_msg("STMT(BPF_RET, SECCOMP_RET_TRACE)");
+				break;
+			case SECCOMP_RET_ALLOW:
+				error_msg("STMT(BPF_RET, SECCOMP_RET_ALLOW)");
+				break;
+			default:
+				error_msg("STMT(BPF_RET, 0x%x)", filter[i].k);
+			}
+			break;
+		case BPF_JMP | BPF_JEQ | BPF_K:
+			error_msg("JUMP(BPF_JEQ, %u, %u, %u)",
+				  filter[i].jt, filter[i].jf,
+				  filter[i].k);
+			break;
+		case BPF_JMP | BPF_JGE | BPF_K:
+			error_msg("JUMP(BPF_JGE, %u, %u, %u)",
+				  filter[i].jt, filter[i].jf,
+				  filter[i].k);
+			break;
+		case BPF_JMP | BPF_JA:
+			error_msg("JUMP(BPF_JA, %u)", filter[i].k);
+			break;
+		default:
+			error_msg("STMT(0x%x, %u, %u, 0x%x)", filter[i].code,
+				  filter[i].jt, filter[i].jf, filter[i].k);
+		}
+	}
+}
+
+static void
+replace_jmp_placeholders(unsigned char *jmp_offset, unsigned char jmp_next,
+			 unsigned char jmp_trace) {
+	switch (*jmp_offset) {
+	case JMP_PLACEHOLDER_NEXT:
+		*jmp_offset = jmp_next;
+		break;
+	case JMP_PLACEHOLDER_TRACE:
+		*jmp_offset = jmp_trace;
+		break;
+	default:
+		break;
+	}
+}
+
+static unsigned short
+bpf_syscalls_cmp(struct sock_filter *filter,
+		 unsigned int lower, unsigned int upper)
+{
+	if (lower + 1 == upper) {
+		/* if (nr == lower) return RET_TRACE; */
+		SET_BPF_JUMP(filter, BPF_JEQ | BPF_K, lower,
+			     JMP_PLACEHOLDER_TRACE, 0);
+		return 1;
+	} else {
+		/* if (nr >= lower && nr < upper) return RET_TRACE; */
+		SET_BPF_JUMP(filter, BPF_JGE | BPF_K, lower, 0, 1);
+		SET_BPF_JUMP(filter + 1, BPF_JGE | BPF_K, upper, 0,
+			     JMP_PLACEHOLDER_TRACE);
+		return 2;
+	}
+}
+
+static unsigned short
+init_sock_filter(struct sock_filter *filter)
+{
+	/*
+	 * Generated program looks like:
+	 * if (arch == AUDIT_ARCH_A && nr >= flag) {
+	 *	if (nr == 59)
+	 *		return SECCOMP_RET_TRACE;
+	 *	if (nr >= 321 && nr <= 323)
+	 *		return SECCOMP_RET_TRACE;
+	 *	...
+	 *	return SECCOMP_RET_ALLOW;
+	 * }
+	 * if (arch == AUDIT_ARCH_A) {
+	 *	...
+	 * }
+	 * if (arch == AUDIT_ARCH_B) {
+	 *	...
+	 * }
+	 * return SECCOMP_RET_TRACE;
+	 */
+	unsigned short pos = 0;
+
+#if SUPPORTED_PERSONALITIES > 1
+	SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
+		     offsetof(struct seccomp_data, arch));
+#endif
+
+	/*
+	 * Personalities are iterated in reverse-order in the BPF program so
+	 * that the x86 case is naturally handled.  On x86, the first and third
+	 * personalities have the same arch identifier.  The third can be
+	 * distinguished based on its associated syscall flag, so we check it
+	 * first.  The only drawback here is that the first personality is more
+	 * common, which may make the BPF program slower to match syscalls on
+	 * average.
+	 */
+	for (int p = SUPPORTED_PERSONALITIES - 1; p >= 0; --p) {
+		unsigned int lower = UINT_MAX;
+		unsigned short start = pos, end;
+
+#if SUPPORTED_PERSONALITIES > 1
+		/* if (arch != audit_arch_vec[p].arch) goto next; */
+		SET_BPF_JUMP(&filter[pos++], BPF_JEQ | BPF_K,
+			     audit_arch_vec[p].arch, 0, JMP_PLACEHOLDER_NEXT);
+#endif
+		SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
+			     offsetof(struct seccomp_data, nr));
+
+#if SUPPORTED_PERSONALITIES > 1
+		if (audit_arch_vec[p].flag) {
+			/* if (nr < audit_arch_vec[p].flag) goto next; */
+			SET_BPF_JUMP(&filter[pos++], BPF_JGE | BPF_K,
+				     audit_arch_vec[p].flag, 2, 0);
+			SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
+				     offsetof(struct seccomp_data, arch));
+			SET_BPF_JUMP(&filter[pos++], BPF_JA,
+				     JMP_PLACEHOLDER_NEXT, 0, 0);
+		}
+#endif
+
+		for (unsigned int i = 0; i < nsyscall_vec[p]; ++i) {
+			if (traced_by_seccomp(i, p)) {
+				if (lower == UINT_MAX)
+					lower = i;
+				continue;
+			}
+			if (lower == UINT_MAX)
+				continue;
+			pos += bpf_syscalls_cmp(filter + pos,
+						lower | audit_arch_vec[p].flag,
+						i | audit_arch_vec[p].flag);
+			lower = UINT_MAX;
+		}
+		if (lower != UINT_MAX)
+			pos += bpf_syscalls_cmp(filter + pos,
+						lower | audit_arch_vec[p].flag,
+						nsyscall_vec[p]
+						| audit_arch_vec[p].flag);
+		end = pos;
+
+		/* if (nr >= max_nr) return RET_TRACE; */
+		SET_BPF_JUMP(&filter[pos++], BPF_JGE | BPF_K,
+			     nsyscall_vec[p] | audit_arch_vec[p].flag, 1, 0);
+
+		SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
+			     SECCOMP_RET_ALLOW);
+		SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
+			     SECCOMP_RET_TRACE);
+
+		for (unsigned int i = start; i < end; ++i) {
+			if (BPF_CLASS(filter[i].code) != BPF_JMP)
+				continue;
+			unsigned char jmp_next = pos - i - 1;
+			unsigned char jmp_trace = pos - i - 2;
+			replace_jmp_placeholders(&filter[i].jt, jmp_next,
+						 jmp_trace);
+			replace_jmp_placeholders(&filter[i].jf, jmp_next,
+						 jmp_trace);
+			if (BPF_OP(filter[i].code) == BPF_JA)
+				filter[i].k = (unsigned int) jmp_next;
+		}
+	}
+
+#if SUPPORTED_PERSONALITIES > 1
+	/* Jumps conditioned on .arch default to this RET_TRACE. */
+	SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K, SECCOMP_RET_TRACE);
+#endif
+
+	if (debug_flag)
+		dump_seccomp_bpf(filter, pos);
+
+	return pos;
+}
+
+void
+init_seccomp_filter(void)
+{
+	struct sock_filter filter[BPF_MAXINSNS];
+	unsigned short len;
+
+	len = init_sock_filter(filter);
+
+	struct sock_fprog prog = {
+		.len = len,
+		.filter = filter
+	};
+
+	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
+		perror_func_msg("prctl(PR_SET_NO_NEW_PRIVS)");
+		return;
+	}
+
+	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0)
+		perror_func_msg("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
+}
+
+int
+seccomp_filter_restart_operator(const struct tcb *tcp)
+{
+	if (tcp && exiting(tcp)
+	    && tcp->scno < nsyscall_vec[current_personality]
+	    && traced_by_seccomp(tcp->scno, current_personality))
+		return PTRACE_SYSCALL;
+	return PTRACE_CONT;
+}
diff --git a/filter_seccomp.h b/filter_seccomp.h
new file mode 100644
index 00000000..5e4d2f80
--- /dev/null
+++ b/filter_seccomp.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2018 Chen Jingpiao <chenjingpiao at gmail.com>
+ * Copyright (c) 2019 Paul Chaignon <paul.chaignon at gmail.com>
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: LGPL-2.1-or-later
+ */
+
+#ifndef STRACE_SECCOMP_FILTER_H
+#define STRACE_SECCOMP_FILTER_H
+
+#include "defs.h"
+
+extern bool seccomp_filtering;
+extern bool seccomp_before_sysentry;
+
+extern void check_seccomp_filter(void);
+extern void init_seccomp_filter(void);
+extern int seccomp_filter_restart_operator(const struct tcb *);
+
+#endif /* !STRACE_SECCOMP_FILTER_H */
diff --git a/linux/aarch64/arch_defs_.h b/linux/aarch64/arch_defs_.h
index ed9261f5..fb75722f 100644
--- a/linux/aarch64/arch_defs_.h
+++ b/linux/aarch64/arch_defs_.h
@@ -9,3 +9,5 @@
 #define HAVE_ARCH_OLD_SELECT 1
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define SUPPORTED_PERSONALITIES 2
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_AARCH64, 0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_ARM,     0 }
diff --git a/linux/arch_defs_.h b/linux/arch_defs_.h
index 5baf3f91..7dc40912 100644
--- a/linux/arch_defs_.h
+++ b/linux/arch_defs_.h
@@ -66,3 +66,7 @@
 #ifndef HAVE_ARCH_OLD_TIME64_SYSCALLS
 # define HAVE_ARCH_OLD_TIME64_SYSCALLS (SIZEOF_LONG == 8)
 #endif
+
+#ifndef SECCOMP_TRACED_SYSCALLS
+# define SECCOMP_TRACED_SYSCALLS
+#endif
diff --git a/linux/ia64/arch_defs_.h b/linux/ia64/arch_defs_.h
index 87ca0cdb..107a74df 100644
--- a/linux/ia64/arch_defs_.h
+++ b/linux/ia64/arch_defs_.h
@@ -9,3 +9,4 @@
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define HAVE_ARCH_SA_RESTORER 0
 #define HAVE_ARCH_DEDICATED_ERR_REG 1
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_IA64, SYSCALLENT_BASE_NR }
diff --git a/linux/powerpc64/arch_defs_.h b/linux/powerpc64/arch_defs_.h
index 871f4109..a4ac007e 100644
--- a/linux/powerpc64/arch_defs_.h
+++ b/linux/powerpc64/arch_defs_.h
@@ -8,3 +8,5 @@
 #define HAVE_ARCH_OLD_SELECT 1
 #define SUPPORTED_PERSONALITIES 2
 #define HAVE_ARCH_DEDICATED_ERR_REG 1
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_PPC64, 0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_PPC,   0 }
diff --git a/linux/riscv/arch_defs_.h b/linux/riscv/arch_defs_.h
index a9c27bc7..f53f076a 100644
--- a/linux/riscv/arch_defs_.h
+++ b/linux/riscv/arch_defs_.h
@@ -7,4 +7,6 @@
 
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define SUPPORTED_PERSONALITIES 2
+# define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_RISCV64, 0 }
+# define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_RISCV32, 0 }
 #define CAN_ARCH_BE_COMPAT_ON_64BIT_KERNEL 1
diff --git a/linux/s390x/arch_defs_.h b/linux/s390x/arch_defs_.h
index 1e520761..750ab512 100644
--- a/linux/s390x/arch_defs_.h
+++ b/linux/s390x/arch_defs_.h
@@ -9,3 +9,5 @@
 #define HAVE_ARCH_OLD_MMAP_PGOFF 1
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define SUPPORTED_PERSONALITIES 2
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_S390X, 0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_S390,  0 }
diff --git a/linux/sparc64/arch_defs_.h b/linux/sparc64/arch_defs_.h
index 68eef4fc..9eacaa40 100644
--- a/linux/sparc64/arch_defs_.h
+++ b/linux/sparc64/arch_defs_.h
@@ -9,4 +9,6 @@
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define HAVE_ARCH_SA_RESTORER 1
 #define SUPPORTED_PERSONALITIES 2
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_SPARC64, 0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_SPARC,   0 }
 #define HAVE_ARCH_DEDICATED_ERR_REG 1
diff --git a/linux/tile/arch_defs_.h b/linux/tile/arch_defs_.h
index a781208c..12ba0d8b 100644
--- a/linux/tile/arch_defs_.h
+++ b/linux/tile/arch_defs_.h
@@ -6,6 +6,8 @@
  */
 
 #define SUPPORTED_PERSONALITIES 2
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_TILEGX,   0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_TILEGX32, 0 }
 #define CAN_ARCH_BE_COMPAT_ON_64BIT_KERNEL 1
 
 #ifdef __tilepro__
diff --git a/linux/x32/arch_defs_.h b/linux/x32/arch_defs_.h
index 1055de12..9f48d313 100644
--- a/linux/x32/arch_defs_.h
+++ b/linux/x32/arch_defs_.h
@@ -11,3 +11,5 @@
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define HAVE_ARCH_OLD_TIME64_SYSCALLS 1
 #define SUPPORTED_PERSONALITIES 2
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_X86_64, __X32_SYSCALL_BIT }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_I386,   0 }
diff --git a/linux/x86_64/arch_defs_.h b/linux/x86_64/arch_defs_.h
index a8c1d991..c2924ac2 100644
--- a/linux/x86_64/arch_defs_.h
+++ b/linux/x86_64/arch_defs_.h
@@ -9,3 +9,6 @@
 #define HAVE_ARCH_OLD_SELECT 1
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define SUPPORTED_PERSONALITIES 3
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_X86_64, 0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_I386,   0 }
+#define PERSONALITY2_AUDIT_ARCH { AUDIT_ARCH_X86_64, __X32_SYSCALL_BIT }
diff --git a/strace.1.in b/strace.1.in
index 3ac46bac..36dabc9f 100644
--- a/strace.1.in
+++ b/strace.1.in
@@ -38,8 +38,8 @@
 strace \- trace system calls and signals
 .SH SYNOPSIS
 .SY strace
-.if '@ENABLE_STACKTRACE_FALSE@'#' .OP \-ACdffhikqqrtttTvVwxxyyzZ
-.if '@ENABLE_STACKTRACE_TRUE@'#' .OP \-ACdffhiqqrtttTvVwxxyyzZ
+.if '@ENABLE_STACKTRACE_FALSE@'#' .OP \-ACdffhiknqqrtttTvVwxxyyzZ
+.if '@ENABLE_STACKTRACE_TRUE@'#' .OP \-ACdffhinqqrtttTvVwxxyyzZ
 .OP \-I n
 .OP \-b execve
 .OM \-e expr
@@ -949,6 +949,19 @@ Show some debugging output of
 .B strace
 itself on the standard error.
 .TP
+.B \-n
+Enable (experimental) usage of seccomp-bpf to have ptrace(2)-stops only when
+system calls that are being traced occur in the traced processes.  Requires the
+.B \-f
+option.
+An attempt to rely on seccomp-bpf to filter system calls may fail for diverse
+reasons: there are too many system calls to filter, the seccomp API is
+unavailable, or
+.B strace
+itself is being traced.  In cases when seccomp-bpf filter setup failed,
+.B strace
+proceeds as usual and stops traced processes on every system call.
+.TP
 .B \-F
 This option is deprecated.  It is retained for backward compatibility only
 and may be removed in future releases.
diff --git a/strace.c b/strace.c
index 8d9e465c..2a047474 100644
--- a/strace.c
+++ b/strace.c
@@ -30,6 +30,7 @@
 #endif
 
 #include "kill_save_errno.h"
+#include "filter_seccomp.h"
 #include "largefile_wrappers.h"
 #include "mmap_cache.h"
 #include "number_set.h"
@@ -236,7 +237,7 @@ usage(void)
 #endif
 
 	printf("\
-usage: strace [-ACdffhi" K_OPT "qqrtttTvVwxxyyzZ] [-I n] [-b execve] [-e expr]...\n\
+usage: strace [-ACdffhi" K_OPT "nqqrtttTvVwxxyyzZ] [-I n] [-b execve] [-e expr]...\n\
               [-a column] [-o file] [-s strsize] [-X format] [-P path]...\n\
               [-p pid]...\n\
 	      { -p pid | [-D] [-E var=val]... [-u username] PROG [ARGS] }\n\
@@ -308,6 +309,7 @@ Startup:\n\
 \n\
 Miscellaneous:\n\
   -d             enable debug output to stderr\n\
+  -n             enable seccomp-bpf filtering\n\
   -h             print help message\n\
   -V             print version\n\
 "
@@ -1231,6 +1233,10 @@ exec_or_die(void)
 	if (params_for_tracee.child_sa.sa_handler != SIG_DFL)
 		sigaction(SIGCHLD, &params_for_tracee.child_sa, NULL);
 
+	debug_msg("seccomp-filter %s",
+		  seccomp_filtering ? "enabled" : "disabled");
+	if (seccomp_filtering)
+		init_seccomp_filter();
 	execv(params->pathname, params->argv);
 	perror_msg_and_die("exec");
 }
@@ -1606,7 +1612,7 @@ init(int argc, char *argv[])
 #ifdef ENABLE_STACKTRACE
 	    "k"
 #endif
-	    "a:Ab:cCdDe:E:fFhiI:o:O:p:P:qrs:S:tTu:vVwxX:yzZ")) != EOF) {
+	    "a:Ab:cCdDe:E:fFhiI:no:O:p:P:qrs:S:tTu:vVwxX:yzZ")) != EOF) {
 		switch (c) {
 		case 'a':
 			acolumn = string_to_uint(optarg);
@@ -1706,6 +1712,9 @@ init(int argc, char *argv[])
 		case 'u':
 			username = optarg;
 			break;
+		case 'n':
+			seccomp_filtering = true;
+			break;
 		case 'v':
 			qualify("abbrev=none");
 			break;
@@ -1759,6 +1768,11 @@ init(int argc, char *argv[])
 		error_msg_and_help("PROG [ARGS] must be specified with -D");
 	}
 
+	if (seccomp_filtering && !followfork) {
+		error_msg("-n was specified without -f, please use -f.");
+		followfork = 1;
+	}
+
 	if (optF) {
 		if (followfork) {
 			error_msg("deprecated option -F ignored");
@@ -1830,6 +1844,10 @@ init(int argc, char *argv[])
 		run_gid = getgid();
 	}
 
+	check_seccomp_filter();
+	if (seccomp_filtering)
+		ptrace_setoptions |= PTRACE_O_TRACESECCOMP;
+
 	if (followfork)
 		ptrace_setoptions |= PTRACE_O_TRACECLONE |
 				     PTRACE_O_TRACEFORK |
@@ -2021,6 +2039,7 @@ print_debug_info(const int pid, int status)
 			[PTRACE_EVENT_VFORK_DONE] = "VFORK_DONE",
 			[PTRACE_EVENT_EXEC]  = "EXEC",
 			[PTRACE_EVENT_EXIT]  = "EXIT",
+			[PTRACE_EVENT_SECCOMP]  = "SECCOMP",
 			/* [PTRACE_EVENT_STOP (=128)] would make biggish array */
 		};
 		const char *e = "??";
@@ -2544,6 +2563,9 @@ next_event(void)
 			case PTRACE_EVENT_EXIT:
 				wd->te = TE_STOP_BEFORE_EXIT;
 				break;
+			case PTRACE_EVENT_SECCOMP:
+				wd->te = TE_SECCOMP;
+				break;
 			default:
 				wd->te = TE_RESTART;
 			}
@@ -2629,7 +2651,7 @@ trace_syscall(struct tcb *tcp, unsigned int *sig)
 static bool
 dispatch_event(const struct tcb_wait_data *wd)
 {
-	unsigned int restart_op = PTRACE_SYSCALL;
+	unsigned int restart_op;
 	unsigned int restart_sig = 0;
 	enum trace_event te = wd ? wd->te : TE_BREAK;
 	/*
@@ -2638,6 +2660,11 @@ dispatch_event(const struct tcb_wait_data *wd)
 	 */
 	int status = wd ? wd->status : 0;
 
+	if (seccomp_filtering)
+		restart_op = seccomp_filter_restart_operator(current_tcp);
+	else
+		restart_op = PTRACE_SYSCALL;
+
 	switch (te) {
 	case TE_BREAK:
 		return false;
@@ -2648,6 +2675,13 @@ dispatch_event(const struct tcb_wait_data *wd)
 	case TE_RESTART:
 		break;
 
+	case TE_SECCOMP:
+		if (seccomp_before_sysentry) {
+			restart_op = PTRACE_SYSCALL;
+			break;
+		}
+		ATTRIBUTE_FALLTHROUGH;
+
 	case TE_SYSCALL_STOP:
 		if (trace_syscall(current_tcp, &restart_sig) < 0) {
 			/*
@@ -2663,6 +2697,42 @@ dispatch_event(const struct tcb_wait_data *wd)
 			 */
 			return true;
 		}
+		if (seccomp_filtering) {
+			/*
+			 * Syscall and seccomp stops can happen in different
+			 * orders depending on kernel.  strace tests this in
+			 * check_seccomp_order_tracer().
+			 *
+			 * Linux 3.5--4.7:
+			 * (seccomp-stop before syscall-entry-stop)
+			 *         +--> seccomp-stop ->-PTRACE_SYSCALL->-+
+			 *         |                                     |
+			 *     PTRACE_CONT                   syscall-entry-stop
+			 *         |                                     |
+			 * syscall-exit-stop <---PTRACE_SYSCALL-----<----+
+			 *
+			 * Linux 4.8+:
+			 * (seccomp-stop after syscall-entry-stop)
+			 *                 syscall-entry-stop
+			 * 
+			 *         +---->-----PTRACE_CONT---->----+
+			 *         |                              |
+			 *  syscall-exit-stop               seccomp-stop
+			 *         |                              |
+			 *         +----<----PTRACE_SYSCALL---<---+
+			 *
+			 * Note in Linux 4.8+, we restart in PTRACE_CONT after
+			 * syscall-exit to skip the syscall-entry-stop.  The
+			 * next seccomp-stop will be treated as a syscall
+			 * entry.
+			 * 
+			 * The below line implement this behavior. Note
+			 * exiting(current_tcp) actually marks a
+			 * syscall-entry-stop because the flag was inverted in
+			 * the above call to trace_syscall.
+			 */
+			restart_op = exiting(current_tcp) ? PTRACE_SYSCALL : PTRACE_CONT;
+		}
 		break;
 
 	case TE_SIGNAL_DELIVERY_STOP:
diff --git a/trace_event.h b/trace_event.h
index 53a711b8..9021fc55 100644
--- a/trace_event.h
+++ b/trace_event.h
@@ -66,6 +66,11 @@ enum trace_event {
 	 * Restart the tracee with signal 0.
 	 */
 	TE_STOP_BEFORE_EXIT,
+
+	/*
+	 * SECCOMP_RET_TRACE rule is triggered.
+	 */
+	TE_SECCOMP,
 };
 
 #endif /* !STRACE_TRACE_EVENT_H */
-- 
2.17.1