[PATCH v4 2/4] Introduce seccomp-assisted syscall filtering

Thu Aug 29 14:00:24 UTC 2019

From: Chen Jingpiao <chenjingpiao at gmail.com>

With this patch, strace can rely on seccomp to only be stopped at syscalls
of interest, instead of stopping at all syscalls.  The seccomp filtering
of syscalls is opt-in only; it must be enabled with the -n option.  Kernel
support is first checked with check_seccomp_filter(), which also ensures
the BPF program derived from the syscalls to filter is not larger than the
kernel's limit.

The -n option implies -f, but a warning is emitted if -f is not explicitly
specified.  Since a task's children inherit its seccomp filters, we want
to ensure all children are also traced to avoid their syscalls failing
with ENOSYS (cf. SECCOMP_RET_TRACE in seccomp man page).

The current BPF program implements a simple linear match of the syscall
numbers.  Contiguous sequences of syscall numbers are however matched as
an interval, with two instructions only.  The algorithm can be improved or
replaced in the future without impacting user-observed behavior.

The behavior of SECCOMP_RET_TRACE changed between Linux 4.7 and 4.8 (cf.
PTRACE_EVENT_SECCOMP in ptrace man page).  This patch supports both
behaviors by checking the kernel's actual behavior before installing the
seccomp filter.

* filter_seccomp.c: New file.
* filter_seccomp.h: New file.
* Makefile.am (strace_SOURCES): Add filter_seccomp.c and filter_seccomp.h.
* linux/aarch64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Define for aarch64.
* linux/powerpc64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for powerpc64.
* linux/riscv/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for riscv.
* linux/s390x/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for s390x.
* linux/sparc64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for sparc64.
* linux/tile/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for tile.
* linux/x32/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for x32.
* linux/x86_64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH, PERSONALITY2_AUDIT_ARCH): Likewise for x86_64.
* linux/ia64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH): Likewise for IA64.
* strace.c (usage): Document -n option.
(exec_or_die): Initialize seccomp filtering if requested.
(init): Handle -n option and check that seccomp can be enabled.
(print_debug_info): Handle PTRACE_EVENT_SECCOMP.
(next_event): Capture PTRACE_EVENT_SECCOMP event.
(dispatch_event): Handle PTRACE_EVENT_SECCOMP event.
* trace_event.h (trace_event): New enumeration entity.
* strace.1.in: Document new -n option.
* NEWS: Mention this change.

Co-authored-by: Paul Chaignon <paul.chaignon at gmail.com>
---
 Makefile.am                  |   2 +
 NEWS                         |   2 +
 filter_seccomp.c             | 488 +++++++++++++++++++++++++++++++++++
 filter_seccomp.h             |  21 ++
 linux/aarch64/arch_defs_.h   |   2 +
 linux/ia64/arch_defs_.h      |   1 +
 linux/powerpc64/arch_defs_.h |   2 +
 linux/riscv/arch_defs_.h     |   2 +
 linux/s390x/arch_defs_.h     |   2 +
 linux/sparc64/arch_defs_.h   |   2 +
 linux/tile/arch_defs_.h      |   2 +
 linux/x32/arch_defs_.h       |   2 +
 linux/x86_64/arch_defs_.h    |   3 +
 strace.1.in                  |  17 +-
 strace.c                     |  76 +++++-
 trace_event.h                |   5 +
 16 files changed, 624 insertions(+), 5 deletions(-)
 create mode 100644 filter_seccomp.c
 create mode 100644 filter_seccomp.h

diff --git a/Makefile.am b/Makefile.am
index b4f31568..f4c65b0a 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -129,6 +129,8 @@ strace_SOURCES =	\
 	file_ioctl.c	\
 	filter.h	\
 	filter_qualify.c \
+	filter_seccomp.c \
+	filter_seccomp.h \
 	flock.c		\
 	flock.h		\
 	fs_x_ioctl.c	\
diff --git a/NEWS b/NEWS
index a23b361b..e2913785 100644
--- a/NEWS
+++ b/NEWS
@@ -7,6 +7,8 @@ Noteworthy changes in release ?.? (????-??-??)
   * Implemented decoding of UNIX_DIAG_UID netlink attribute.
   * Updated lists of BPF_*, ETH_*, KEYCTL_*, KVM_*, MAP_*, SO_*, TCP_*, V4L2_*,
     XDP_*, and *_MAGIC constants.
+  * Implemented filtering of system calls via seccomp-bpf.  Use -n option to
+    enable.
 
 * Bug fixes
   * Fixed syscall tampering on arc, avr32, csky, ia64, m68k, metag, mips,
diff --git a/filter_seccomp.c b/filter_seccomp.c
new file mode 100644
index 00000000..f1d10a5d
--- /dev/null
+++ b/filter_seccomp.c
@@ -0,0 +1,488 @@
+/*
+ * Copyright (c) 2018 Chen Jingpiao <chenjingpiao at gmail.com>
+ * Copyright (c) 2019 Paul Chaignon <paul.chaignon at gmail.com>
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: LGPL-2.1-or-later
+ */
+
+#include "defs.h"
+
+#include "ptrace.h"
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <linux/audit.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h>
+#include <asm/unistd.h>
+#include <signal.h>
+
+#include "filter_seccomp.h"
+#include "number_set.h"
+#include "syscall.h"
+
+#ifndef BPF_MAXINSNS
+# define BPF_MAXINSNS 4096
+#endif
+
+#define JMP_PLACEHOLDER_NEXT  ((unsigned char) -1)
+#define JMP_PLACEHOLDER_TRACE ((unsigned char) -2)
+
+#define SET_BPF(filter, code, jt, jf, k) \
+	(*(filter) = (struct sock_filter) { code, jt, jf, k })
+
+#define SET_BPF_STMT(filter, code, k) \
+	SET_BPF(filter, code, 0, 0, k)
+
+#define SET_BPF_JUMP(filter, code, k, jt, jf) \
+	SET_BPF(filter, BPF_JMP | code, jt, jf, k)
+
+struct audit_arch_t {
+	unsigned int arch;
+	unsigned int flag;
+};
+
+static const struct audit_arch_t audit_arch_vec[SUPPORTED_PERSONALITIES] = {
+#if SUPPORTED_PERSONALITIES > 1
+	PERSONALITY0_AUDIT_ARCH,
+	PERSONALITY1_AUDIT_ARCH,
+# if SUPPORTED_PERSONALITIES > 2
+	PERSONALITY2_AUDIT_ARCH,
+# endif
+#endif
+};
+
+bool seccomp_filtering = false;
+bool seccomp_before_sysentry;
+
+static void
+check_seccomp_order_do_child(void)
+{ /* Child half of the probe: installs a filter that traces gettid only. */
+	static struct sock_filter filter[] = {
+		/* return (nr == __NR_gettid) ? RET_TRACE : RET_ALLOW; */
+		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
+			 offsetof(struct seccomp_data, nr)),
+		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_gettid, 0, 1),
+		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRACE),
+		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW)
+	};
+	static const struct sock_fprog prog = {
+		.len = ARRAY_SIZE(filter),
+		.filter = filter
+	};
+
+	if (ptrace(PTRACE_TRACEME, 0, NULL, NULL) < 0)
+		perror_func_msg_and_die("ptrace(PTRACE_TRACEME, ...");
+	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
+		perror_func_msg_and_die("prctl(PR_SET_NO_NEW_PRIVS, 1, ...");
+	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0)
+		perror_func_msg_and_die("prctl(PR_SET_SECCOMP)");
+	kill(getpid(), SIGSTOP); /* first stop seen by the tracer (step 0) */
+	syscall(__NR_gettid); /* matched by the filter: RET_TRACE */
+	pause(); /* the tracer sends SIGKILL after its second wait */
+	_exit(0);
+}
+
+static int
+check_seccomp_order_tracer(int pid)
+{ /* Tracer half of the probe; returns 0 on success, -1 on failure. */
+	int status, tracee_pid, step = 0;
+	unsigned int event;
+
+	while (1) {
+		errno = 0;
+		tracee_pid = waitpid(pid, &status, 0);
+		if (tracee_pid <= 0) {
+			if (errno == EINTR)
+				continue;
+			perror_func_msg("waitpid(%u)", pid);
+			return -1;
+		}
+		switch (step) {
+		case 0: /* first stop: request seccomp events, resume */
+			if (ptrace(PTRACE_SETOPTIONS, pid, 0,
+				   PTRACE_O_TRACESECCOMP) < 0) {
+				perror_func_msg("ptrace(PTRACE_SETOPTIONS)");
+				return -1; /* NOTE(review): child not killed */
+			}
+			if (ptrace(PTRACE_SYSCALL, pid, NULL, NULL) < 0) {
+				perror_func_msg("ptrace(PTRACE_SYSCALL)");
+				return -1; /* NOTE(review): child not killed */
+			}
+			break;
+		case 1: /* second stop: is it the seccomp-stop? */
+			event = (unsigned int) status >> 16;
+			seccomp_before_sysentry = event == PTRACE_EVENT_SECCOMP;
+			kill(pid, SIGKILL);
+			break;
+		default: /* expect death from the SIGKILL above */
+			if (WIFSIGNALED(status))
+				return 0;
+
+			error_func_msg("unexpected wait status %#x", status);
+			return -1;
+		}
+		step++;
+	}
+	return 0;
+}
+
+static int
+check_seccomp_order(void)
+{
+	int pid;
+
+	pid = fork();
+	if (pid < 0) {
+		perror_func_msg("fork");
+		return -1;
+	}
+
+	if (pid == 0)
+		check_seccomp_order_do_child();
+
+	return check_seccomp_order_tracer(pid);
+}
+
+static bool
+traced_by_seccomp(unsigned int scno, unsigned int p)
+{
+	if (is_number_in_set_array(scno, trace_set, p)
+	    || sysent_vec[p][scno].sys_flags
+	    & (TRACE_INDIRECT_SUBCALL | TRACE_SECCOMP_DEFAULT))
+		return true;
+	return false;
+}
+
+static void
+check_bpf_program_size(void)
+{ /* Clears seccomp_filtering if the generated filter would be too big. */
+	unsigned int nb_insns = SUPPORTED_PERSONALITIES > 1 ? 1 : 0;
+
+	/*
+	 * Mirror init_sock_filter()'s bytecode generation to count the
+	 * instructions it will emit; nb_insns starts at 1 on multi-personality
+	 * builds to account for the initial load of seccomp_data.arch.
+	 */
+	for (int p = SUPPORTED_PERSONALITIES - 1;
+	     p >= 0 && nb_insns < BPF_MAXINSNS; --p) {
+		unsigned int nb_insns_personality = 0;
+		unsigned int lower = UINT_MAX;
+
+		nb_insns_personality++; /* BPF_LD of seccomp_data.nr */
+#if SUPPORTED_PERSONALITIES > 1
+		nb_insns_personality++; /* BPF_JEQ on seccomp_data.arch */
+		if (audit_arch_vec[p].flag)
+			nb_insns_personality += 3; /* JGE + LD + JA */
+#endif
+
+		for (unsigned int i = 0; i < nsyscall_vec[p]; ++i) {
+			if (traced_by_seccomp(i, p)) {
+				if (lower == UINT_MAX)
+					lower = i;
+				continue;
+			}
+			if (lower == UINT_MAX)
+				continue;
+			if (lower + 1 == i)
+				nb_insns_personality++; /* single JEQ */
+			else
+				nb_insns_personality += 2; /* JGE pair */
+			lower = UINT_MAX;
+		}
+		if (lower != UINT_MAX) {
+			if (lower + 1 == nsyscall_vec[p])
+				nb_insns_personality++; /* single JEQ */
+			else
+				nb_insns_personality += 2; /* JGE pair */
+		}
+
+		nb_insns_personality += 3; /* JGE + RET_ALLOW + RET_TRACE */
+
+		/*
+		 * Within generated BPF programs, the origin and destination of
+		 * jumps are always in the same personality section.  The
+		 * largest jump is therefore the jump from the first
+		 * instruction of the section to the last, to skip the
+		 * personality and try to compare .arch to the next
+		 * personality.
+		 * If we have a personality section with more than 255
+		 * instructions, the jump offset will overflow.  Such a program
+		 * is unlikely to happen, so we simply disable seccomp-filter
+		 * in such a case.
+		 */
+		if (nb_insns_personality > UCHAR_MAX) {
+			debug_msg("seccomp-filter disabled due to "
+				  "possibility of overflow");
+			seccomp_filtering = false;
+			return;
+		}
+		nb_insns += nb_insns_personality;
+	}
+
+#if SUPPORTED_PERSONALITIES > 1
+	nb_insns++; /* final RET_TRACE */
+#endif
+
+	if (nb_insns > BPF_MAXINSNS) {
+		debug_msg("seccomp-filter disabled due to BPF program being "
+			  "oversized (%u > %d)", nb_insns, BPF_MAXINSNS);
+		seccomp_filtering = false;
+	}
+}
+
+void
+check_seccomp_filter(void)
+{
+	int rc;
+
+	if (!seccomp_filtering)
+		return;
+
+	if (NOMMU_SYSTEM) {
+		seccomp_filtering = false;
+		goto end;
+	}
+
+	rc = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0);
+	seccomp_filtering = rc >= 0 || errno != EINVAL;
+	if (seccomp_filtering)
+		check_bpf_program_size();
+	if (seccomp_filtering && check_seccomp_order() < 0)
+		seccomp_filtering = false;
+
+end:
+	if (!seccomp_filtering)
+		error_msg("seccomp-filter is requested but unavailable");
+}
+
+static void
+dump_seccomp_bpf(const struct sock_filter *filter, unsigned short len)
+{
+	for (unsigned int i = 0; i < len; ++i) {
+		switch (filter[i].code) {
+		case BPF_LD | BPF_W | BPF_ABS:
+			switch (filter[i].k) {
+			case offsetof(struct seccomp_data, arch):
+				error_msg("STMT(BPF_LDWABS, data->arch)");
+				break;
+			case offsetof(struct seccomp_data, nr):
+				error_msg("STMT(BPF_LDWABS, data->nr)");
+				break;
+			default:
+				error_msg("STMT(BPF_LDWABS, 0x%x)",
+					  filter[i].k);
+			}
+			break;
+		case BPF_RET | BPF_K:
+			switch (filter[i].k) {
+			case SECCOMP_RET_TRACE:
+				error_msg("STMT(BPF_RET, SECCOMP_RET_TRACE)");
+				break;
+			case SECCOMP_RET_ALLOW:
+				error_msg("STMT(BPF_RET, SECCOMP_RET_ALLOW)");
+				break;
+			default:
+				error_msg("STMT(BPF_RET, 0x%x)", filter[i].k);
+			}
+			break;
+		case BPF_JMP | BPF_JEQ | BPF_K:
+			error_msg("JUMP(BPF_JEQ, %u, %u, %u)",
+				  filter[i].jt, filter[i].jf,
+				  filter[i].k);
+			break;
+		case BPF_JMP | BPF_JGE | BPF_K:
+			error_msg("JUMP(BPF_JGE, %u, %u, %u)",
+				  filter[i].jt, filter[i].jf,
+				  filter[i].k);
+			break;
+		case BPF_JMP | BPF_JA:
+			error_msg("JUMP(BPF_JA, %u)", filter[i].k);
+			break;
+		default:
+			error_msg("STMT(0x%x, %u, %u, 0x%x)", filter[i].code,
+				  filter[i].jt, filter[i].jf, filter[i].k);
+		}
+	}
+}
+
+static void
+replace_jmp_placeholders(unsigned char *jmp_offset, unsigned char jmp_next,
+			 unsigned char jmp_trace)
+{
+	switch (*jmp_offset) {
+	case JMP_PLACEHOLDER_NEXT:
+		*jmp_offset = jmp_next;
+		break;
+	case JMP_PLACEHOLDER_TRACE:
+		*jmp_offset = jmp_trace;
+		break;
+	default:
+		break;
+	}
+}
+
+static unsigned short
+bpf_syscalls_cmp(struct sock_filter *filter,
+		 unsigned int lower, unsigned int upper)
+{
+	if (lower + 1 == upper) {
+		/* if (nr == lower) return RET_TRACE; */
+		SET_BPF_JUMP(filter, BPF_JEQ | BPF_K, lower,
+			     JMP_PLACEHOLDER_TRACE, 0);
+		return 1;
+	} else {
+		/* if (nr >= lower && nr < upper) return RET_TRACE; */
+		SET_BPF_JUMP(filter, BPF_JGE | BPF_K, lower, 0, 1);
+		SET_BPF_JUMP(filter + 1, BPF_JGE | BPF_K, upper, 0,
+			     JMP_PLACEHOLDER_TRACE);
+		return 2;
+	}
+}
+
+static unsigned short
+init_sock_filter(struct sock_filter *filter)
+{
+	/*
+	 * Generated program looks like:
+	 * if (arch == AUDIT_ARCH_A && nr >= flag) {
+	 *	if (nr == 59)
+	 *		return SECCOMP_RET_TRACE;
+	 *	if (nr >= 321 && nr <= 323)
+	 *		return SECCOMP_RET_TRACE;
+	 *	...
+	 *	return SECCOMP_RET_ALLOW;
+	 * }
+	 * if (arch == AUDIT_ARCH_A) {
+	 *	...
+	 * }
+	 * if (arch == AUDIT_ARCH_B) {
+	 *	...
+	 * }
+	 * return SECCOMP_RET_TRACE;
+	 */
+	unsigned short pos = 0;
+
+#if SUPPORTED_PERSONALITIES > 1
+	SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
+		     offsetof(struct seccomp_data, arch));
+#endif
+
+	/*
+	 * Personalities are iterated in reverse-order in the BPF program so
+	 * that the x86 case is naturally handled.  On x86, the first and third
+	 * personalities have the same arch identifier.  The third can be
+	 * distinguished based on its associated syscall flag, so we check it
+	 * first.  The only drawback here is that the first personality is more
+	 * common, which may make the BPF program slower to match syscalls on
+	 * average.
+	 */
+	for (int p = SUPPORTED_PERSONALITIES - 1; p >= 0; --p) {
+		unsigned int lower = UINT_MAX;
+		unsigned short start = pos, end;
+
+#if SUPPORTED_PERSONALITIES > 1
+		/* if (arch != audit_arch_vec[p].arch) goto next; */
+		SET_BPF_JUMP(&filter[pos++], BPF_JEQ | BPF_K,
+			     audit_arch_vec[p].arch, 0, JMP_PLACEHOLDER_NEXT);
+#endif
+		SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
+			     offsetof(struct seccomp_data, nr));
+
+#if SUPPORTED_PERSONALITIES > 1
+		if (audit_arch_vec[p].flag) {
+			/* if (nr < audit_arch_vec[p].flag) goto next; */
+			SET_BPF_JUMP(&filter[pos++], BPF_JGE | BPF_K,
+				     audit_arch_vec[p].flag, 2, 0);
+			SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
+				     offsetof(struct seccomp_data, arch));
+			SET_BPF_JUMP(&filter[pos++], BPF_JA,
+				     JMP_PLACEHOLDER_NEXT, 0, 0);
+		}
+#endif
+
+		for (unsigned int i = 0; i < nsyscall_vec[p]; ++i) {
+			if (traced_by_seccomp(i, p)) {
+				if (lower == UINT_MAX)
+					lower = i;
+				continue;
+			}
+			if (lower == UINT_MAX)
+				continue;
+			pos += bpf_syscalls_cmp(filter + pos,
+						lower | audit_arch_vec[p].flag,
+						i | audit_arch_vec[p].flag);
+			lower = UINT_MAX;
+		}
+		if (lower != UINT_MAX)
+			pos += bpf_syscalls_cmp(filter + pos,
+						lower | audit_arch_vec[p].flag,
+						nsyscall_vec[p]
+						| audit_arch_vec[p].flag);
+		end = pos;
+
+		/* if (nr >= max_nr) return RET_TRACE; */
+		SET_BPF_JUMP(&filter[pos++], BPF_JGE | BPF_K,
+			     nsyscall_vec[p] | audit_arch_vec[p].flag, 1, 0);
+
+		SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
+			     SECCOMP_RET_ALLOW);
+		SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
+			     SECCOMP_RET_TRACE);
+
+		for (unsigned int i = start; i < end; ++i) {
+			if (BPF_CLASS(filter[i].code) != BPF_JMP)
+				continue;
+			unsigned char jmp_next = pos - i - 1;
+			unsigned char jmp_trace = pos - i - 2;
+			replace_jmp_placeholders(&filter[i].jt, jmp_next,
+						 jmp_trace);
+			replace_jmp_placeholders(&filter[i].jf, jmp_next,
+						 jmp_trace);
+			if (BPF_OP(filter[i].code) == BPF_JA)
+				filter[i].k = (unsigned int) jmp_next;
+		}
+	}
+
+#if SUPPORTED_PERSONALITIES > 1
+	/* Jumps conditioned on .arch default to this RET_TRACE. */
+	SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K, SECCOMP_RET_TRACE);
+#endif
+
+	if (debug_flag)
+		dump_seccomp_bpf(filter, pos);
+
+	return pos;
+}
+
+void
+init_seccomp_filter(void)
+{
+	struct sock_filter filter[BPF_MAXINSNS];
+	unsigned short len;
+
+	len = init_sock_filter(filter);
+
+	struct sock_fprog prog = {
+		.len = len,
+		.filter = filter
+	};
+
+	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
+		perror_func_msg("prctl(PR_SET_NO_NEW_PRIVS)");
+		return;
+	}
+
+	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0)
+		perror_func_msg("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
+}
+
+int
+seccomp_filter_restart_operator(const struct tcb *tcp)
+{
+	if (tcp && exiting(tcp)
+	    && tcp->scno < nsyscall_vec[current_personality]
+	    && traced_by_seccomp(tcp->scno, current_personality))
+		return PTRACE_SYSCALL;
+	return PTRACE_CONT;
+}
diff --git a/filter_seccomp.h b/filter_seccomp.h
new file mode 100644
index 00000000..5e4d2f80
--- /dev/null
+++ b/filter_seccomp.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2018 Chen Jingpiao <chenjingpiao at gmail.com>
+ * Copyright (c) 2019 Paul Chaignon <paul.chaignon at gmail.com>
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: LGPL-2.1-or-later
+ */
+
+#ifndef STRACE_SECCOMP_FILTER_H
+#define STRACE_SECCOMP_FILTER_H
+
+#include "defs.h"
+
+extern bool seccomp_filtering;
+extern bool seccomp_before_sysentry;
+
+extern void check_seccomp_filter(void);
+extern void init_seccomp_filter(void);
+extern int seccomp_filter_restart_operator(const struct tcb *);
+
+#endif /* !STRACE_SECCOMP_FILTER_H */
diff --git a/linux/aarch64/arch_defs_.h b/linux/aarch64/arch_defs_.h
index ed9261f5..fb75722f 100644
--- a/linux/aarch64/arch_defs_.h
+++ b/linux/aarch64/arch_defs_.h
@@ -9,3 +9,5 @@
 #define HAVE_ARCH_OLD_SELECT 1
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define SUPPORTED_PERSONALITIES 2
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_AARCH64, 0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_ARM,     0 }
diff --git a/linux/ia64/arch_defs_.h b/linux/ia64/arch_defs_.h
index 87ca0cdb..107a74df 100644
--- a/linux/ia64/arch_defs_.h
+++ b/linux/ia64/arch_defs_.h
@@ -9,3 +9,4 @@
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define HAVE_ARCH_SA_RESTORER 0
 #define HAVE_ARCH_DEDICATED_ERR_REG 1
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_IA64, SYSCALLENT_BASE_NR }
diff --git a/linux/powerpc64/arch_defs_.h b/linux/powerpc64/arch_defs_.h
index 871f4109..a4ac007e 100644
--- a/linux/powerpc64/arch_defs_.h
+++ b/linux/powerpc64/arch_defs_.h
@@ -8,3 +8,5 @@
 #define HAVE_ARCH_OLD_SELECT 1
 #define SUPPORTED_PERSONALITIES 2
 #define HAVE_ARCH_DEDICATED_ERR_REG 1
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_PPC64, 0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_PPC,   0 }
diff --git a/linux/riscv/arch_defs_.h b/linux/riscv/arch_defs_.h
index a9c27bc7..f53f076a 100644
--- a/linux/riscv/arch_defs_.h
+++ b/linux/riscv/arch_defs_.h
@@ -7,4 +7,6 @@
 
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define SUPPORTED_PERSONALITIES 2
+# define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_RISCV64, 0 }
+# define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_RISCV32, 0 }
 #define CAN_ARCH_BE_COMPAT_ON_64BIT_KERNEL 1
diff --git a/linux/s390x/arch_defs_.h b/linux/s390x/arch_defs_.h
index 1e520761..750ab512 100644
--- a/linux/s390x/arch_defs_.h
+++ b/linux/s390x/arch_defs_.h
@@ -9,3 +9,5 @@
 #define HAVE_ARCH_OLD_MMAP_PGOFF 1
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define SUPPORTED_PERSONALITIES 2
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_S390X, 0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_S390,  0 }
diff --git a/linux/sparc64/arch_defs_.h b/linux/sparc64/arch_defs_.h
index 68eef4fc..9eacaa40 100644
--- a/linux/sparc64/arch_defs_.h
+++ b/linux/sparc64/arch_defs_.h
@@ -9,4 +9,6 @@
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define HAVE_ARCH_SA_RESTORER 1
 #define SUPPORTED_PERSONALITIES 2
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_SPARC64, 0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_SPARC,   0 }
 #define HAVE_ARCH_DEDICATED_ERR_REG 1
diff --git a/linux/tile/arch_defs_.h b/linux/tile/arch_defs_.h
index a781208c..12ba0d8b 100644
--- a/linux/tile/arch_defs_.h
+++ b/linux/tile/arch_defs_.h
@@ -6,6 +6,8 @@
  */
 
 #define SUPPORTED_PERSONALITIES 2
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_TILEGX,   0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_TILEGX32, 0 }
 #define CAN_ARCH_BE_COMPAT_ON_64BIT_KERNEL 1
 
 #ifdef __tilepro__
diff --git a/linux/x32/arch_defs_.h b/linux/x32/arch_defs_.h
index 1055de12..9f48d313 100644
--- a/linux/x32/arch_defs_.h
+++ b/linux/x32/arch_defs_.h
@@ -11,3 +11,5 @@
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define HAVE_ARCH_OLD_TIME64_SYSCALLS 1
 #define SUPPORTED_PERSONALITIES 2
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_X86_64, __X32_SYSCALL_BIT }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_I386,   0 }
diff --git a/linux/x86_64/arch_defs_.h b/linux/x86_64/arch_defs_.h
index a8c1d991..c2924ac2 100644
--- a/linux/x86_64/arch_defs_.h
+++ b/linux/x86_64/arch_defs_.h
@@ -9,3 +9,6 @@
 #define HAVE_ARCH_OLD_SELECT 1
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define SUPPORTED_PERSONALITIES 3
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_X86_64, 0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_I386,   0 }
+#define PERSONALITY2_AUDIT_ARCH { AUDIT_ARCH_X86_64, __X32_SYSCALL_BIT }
diff --git a/strace.1.in b/strace.1.in
index 78bbc628..aeb00b04 100644
--- a/strace.1.in
+++ b/strace.1.in
@@ -38,8 +38,8 @@
 strace \- trace system calls and signals
 .SH SYNOPSIS
 .SY strace
-.if '@ENABLE_STACKTRACE_FALSE@'#' .OP \-ACdffhikqqrtttTvVwxxyyzZ
-.if '@ENABLE_STACKTRACE_TRUE@'#' .OP \-ACdffhiqqrtttTvVwxxyyzZ
+.if '@ENABLE_STACKTRACE_FALSE@'#' .OP \-ACdffhiknqqrtttTvVwxxyyzZ
+.if '@ENABLE_STACKTRACE_TRUE@'#' .OP \-ACdffhinqqrtttTvVwxxyyzZ
 .OP \-I n
 .OP \-b execve
 .OM \-e expr
@@ -970,6 +970,19 @@ Show some debugging output of
 .B strace
 itself on the standard error.
 .TP
+.B \-n
+Enable (experimental) usage of seccomp-bpf to have ptrace(2)-stops only when
+system calls that are being traced occur in the traced processes.  Implies the
+.B \-f
+option; a warning is printed if it was not given explicitly.
+An attempt to rely on seccomp-bpf to filter system calls may fail for diverse
+reasons: there are too many system calls to filter, the seccomp API is
+unavailable, or
+.B strace
+itself is being traced.  In cases when seccomp-bpf filter setup failed,
+.B strace
+proceeds as usual and stops traced processes on every system call.
+.TP
 .B \-F
 This option is deprecated.  It is retained for backward compatibility only
 and may be removed in future releases.
diff --git a/strace.c b/strace.c
index 8d9e465c..ecd0976e 100644
--- a/strace.c
+++ b/strace.c
@@ -30,6 +30,7 @@
 #endif
 
 #include "kill_save_errno.h"
+#include "filter_seccomp.h"
 #include "largefile_wrappers.h"
 #include "mmap_cache.h"
 #include "number_set.h"
@@ -236,7 +237,7 @@ usage(void)
 #endif
 
 	printf("\
-usage: strace [-ACdffhi" K_OPT "qqrtttTvVwxxyyzZ] [-I n] [-b execve] [-e expr]...\n\
+usage: strace [-ACdffhi" K_OPT "nqqrtttTvVwxxyyzZ] [-I n] [-b execve] [-e expr]...\n\
               [-a column] [-o file] [-s strsize] [-X format] [-P path]...\n\
               [-p pid]...\n\
 	      { -p pid | [-D] [-E var=val]... [-u username] PROG [ARGS] }\n\
@@ -308,6 +309,7 @@ Startup:\n\
 \n\
 Miscellaneous:\n\
   -d             enable debug output to stderr\n\
+  -n             enable seccomp-bpf filtering\n\
   -h             print help message\n\
   -V             print version\n\
 "
@@ -1231,6 +1233,10 @@ exec_or_die(void)
 	if (params_for_tracee.child_sa.sa_handler != SIG_DFL)
 		sigaction(SIGCHLD, &params_for_tracee.child_sa, NULL);
 
+	debug_msg("seccomp-filter %s",
+		  seccomp_filtering ? "enabled" : "disabled");
+	if (seccomp_filtering)
+		init_seccomp_filter();
 	execv(params->pathname, params->argv);
 	perror_msg_and_die("exec");
 }
@@ -1606,7 +1612,7 @@ init(int argc, char *argv[])
 #ifdef ENABLE_STACKTRACE
 	    "k"
 #endif
-	    "a:Ab:cCdDe:E:fFhiI:o:O:p:P:qrs:S:tTu:vVwxX:yzZ")) != EOF) {
+	    "a:Ab:cCdDe:E:fFhiI:no:O:p:P:qrs:S:tTu:vVwxX:yzZ")) != EOF) {
 		switch (c) {
 		case 'a':
 			acolumn = string_to_uint(optarg);
@@ -1706,6 +1712,9 @@ init(int argc, char *argv[])
 		case 'u':
 			username = optarg;
 			break;
+		case 'n':
+			seccomp_filtering = true;
+			break;
 		case 'v':
 			qualify("abbrev=none");
 			break;
@@ -1759,6 +1768,11 @@ init(int argc, char *argv[])
 		error_msg_and_help("PROG [ARGS] must be specified with -D");
 	}
 
+	if (seccomp_filtering && !followfork) {
+		error_msg("-n was specified without -f, please use -f.");
+		followfork = 1;
+	}
+
 	if (optF) {
 		if (followfork) {
 			error_msg("deprecated option -F ignored");
@@ -1830,6 +1844,10 @@ init(int argc, char *argv[])
 		run_gid = getgid();
 	}
 
+	check_seccomp_filter();
+	if (seccomp_filtering)
+		ptrace_setoptions |= PTRACE_O_TRACESECCOMP;
+
 	if (followfork)
 		ptrace_setoptions |= PTRACE_O_TRACECLONE |
 				     PTRACE_O_TRACEFORK |
@@ -2021,6 +2039,7 @@ print_debug_info(const int pid, int status)
 			[PTRACE_EVENT_VFORK_DONE] = "VFORK_DONE",
 			[PTRACE_EVENT_EXEC]  = "EXEC",
 			[PTRACE_EVENT_EXIT]  = "EXIT",
+			[PTRACE_EVENT_SECCOMP]  = "SECCOMP",
 			/* [PTRACE_EVENT_STOP (=128)] would make biggish array */
 		};
 		const char *e = "??";
@@ -2544,6 +2563,9 @@ next_event(void)
 			case PTRACE_EVENT_EXIT:
 				wd->te = TE_STOP_BEFORE_EXIT;
 				break;
+			case PTRACE_EVENT_SECCOMP:
+				wd->te = TE_SECCOMP;
+				break;
 			default:
 				wd->te = TE_RESTART;
 			}
@@ -2629,7 +2651,7 @@ trace_syscall(struct tcb *tcp, unsigned int *sig)
 static bool
 dispatch_event(const struct tcb_wait_data *wd)
 {
-	unsigned int restart_op = PTRACE_SYSCALL;
+	unsigned int restart_op;
 	unsigned int restart_sig = 0;
 	enum trace_event te = wd ? wd->te : TE_BREAK;
 	/*
@@ -2638,6 +2660,11 @@ dispatch_event(const struct tcb_wait_data *wd)
 	 */
 	int status = wd ? wd->status : 0;
 
+	if (seccomp_filtering)
+		restart_op = seccomp_filter_restart_operator(current_tcp);
+	else
+		restart_op = PTRACE_SYSCALL;
+
 	switch (te) {
 	case TE_BREAK:
 		return false;
@@ -2648,6 +2675,13 @@ dispatch_event(const struct tcb_wait_data *wd)
 	case TE_RESTART:
 		break;
 
+	case TE_SECCOMP:
+		if (seccomp_before_sysentry) {
+			restart_op = PTRACE_SYSCALL;
+			break;
+		}
+		ATTRIBUTE_FALLTHROUGH;
+
 	case TE_SYSCALL_STOP:
 		if (trace_syscall(current_tcp, &restart_sig) < 0) {
 			/*
@@ -2663,6 +2697,42 @@ dispatch_event(const struct tcb_wait_data *wd)
 			 */
 			return true;
 		}
+		if (seccomp_filtering) {
+			/*
+			 * Syscall and seccomp stops can happen in different
+			 * orders depending on kernel.  strace tests this in
+			 * check_seccomp_order_tracer().
+			 *
+			 * Linux 3.5--4.7:
+			 * (seccomp-stop before syscall-entry-stop)
+			 *         +--> seccomp-stop ->-PTRACE_SYSCALL->-+
+			 *         |                                     |
+			 *     PTRACE_CONT                   syscall-entry-stop
+			 *         |                                     |
+			 * syscall-exit-stop <---PTRACE_SYSCALL-----<----+
+			 *
+			 * Linux 4.8+:
+			 * (seccomp-stop after syscall-entry-stop)
+			 *                 syscall-entry-stop
+			 *
+			 *         +---->-----PTRACE_CONT---->----+
+			 *         |                              |
+			 *  syscall-exit-stop               seccomp-stop
+			 *         |                              |
+			 *         +----<----PTRACE_SYSCALL---<---+
+			 *
+			 * Note in Linux 4.8+, we restart in PTRACE_CONT after
+			 * syscall-exit to skip the syscall-entry-stop.  The
+			 * next seccomp-stop will be treated as a syscall
+			 * entry.
+			 *
+			 * The below line implements this behavior. Note
+			 * exiting(current_tcp) actually marks a
+			 * syscall-entry-stop because the flag was inverted in
+			 * the above call to trace_syscall.
+			 */
+			restart_op = exiting(current_tcp) ? PTRACE_SYSCALL : PTRACE_CONT;
+		}
 		break;
 
 	case TE_SIGNAL_DELIVERY_STOP:
diff --git a/trace_event.h b/trace_event.h
index 53a711b8..9021fc55 100644
--- a/trace_event.h
+++ b/trace_event.h
@@ -66,6 +66,11 @@ enum trace_event {
 	 * Restart the tracee with signal 0.
 	 */
 	TE_STOP_BEFORE_EXIT,
+
+	/*
+	 * SECCOMP_RET_TRACE rule is triggered.
+	 */
+	TE_SECCOMP,
 };
 
 #endif /* !STRACE_TRACE_EVENT_H */
-- 
2.17.1