[PATCH v7 1/3] Introduce seccomp-assisted syscall filtering

Mon Sep 23 12:02:35 UTC 2019

From: Chen Jingpiao <chenjingpiao at gmail.com>

With this patch, strace can rely on seccomp to only be stopped at syscalls
of interest, instead of stopping at all syscalls.  The seccomp filtering
of syscalls is opt-in only; it must be enabled with the -n option.  Kernel
support is first checked with check_seccomp_filter(), which also ensures
the BPF program derived from the syscalls to filter is not larger than the
kernel's limit.

The -n option implies -f, but a warning is emitted if -f is not explicitly
specified.  Since a task's children inherit its seccomp filters, we want
to ensure all children are also traced to avoid their syscalls failing
with ENOSYS (cf. SECCOMP_RET_TRACE in seccomp man page).

Fork/vfork/clone children of traced processes are marked as not having a
seccomp filter until we receive a first seccomp-stop.  They are therefore
stopped at every syscall entry and exit until that first seccomp-stop.

The current BPF program implements a simple linear match of the syscall
numbers.  Contiguous sequences of syscall numbers are however matched as
an interval, with two instructions only.  The algorithm can be improved
or replaced in the future without impacting user-observed behavior.

The behavior of SECCOMP_RET_TRACE changed between Linux 4.7 and 4.8
(cf. PTRACE_EVENT_SECCOMP in ptrace man page).  This patch supports both
behaviors by checking the kernel's actual behavior before installing the
seccomp filter.

* filter_seccomp.c: New file.
* filter_seccomp.h: New file.
* Makefile.am (strace_SOURCES): Add filter_seccomp.c and filter_seccomp.h.
* linux/aarch64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Define for aarch64.
* linux/powerpc64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for powerpc64.
* linux/riscv/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for riscv.
* linux/s390x/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for s390x.
* linux/sparc64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for sparc64.
* linux/tile/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for tile.
* linux/x32/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for x32.
* linux/x86_64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH, PERSONALITY2_AUDIT_ARCH): Likewise for x86_64.
* linux/ia64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH): Likewise for IA64.
* strace.c (usage): Document -n option.
(startup_child): Mark process as having seccomp filter.
(exec_or_die): Initialize seccomp filtering if requested.
(init): Handle -n option and check that seccomp can be enabled.
(print_debug_info): Handle PTRACE_EVENT_SECCOMP.
(next_event): Capture PTRACE_EVENT_SECCOMP event.
(dispatch_event): Handle PTRACE_EVENT_SECCOMP event.
* trace_event.h (trace_event): New enumeration entity.
* strace.1.in: Document new -n option.
* NEWS: Mention this change.

Co-authored-by: Paul Chaignon <paul.chaignon at gmail.com>
Co-Authored-by: Dmitry V. Levin <ldv at altlinux.org>
---
 Makefile.am                  |   2 +
 NEWS                         |   2 +
 defs.h                       |   4 +
 filter_seccomp.c             | 617 +++++++++++++++++++++++++++++++++++
 filter_seccomp.h             |  21 ++
 linux/aarch64/arch_defs_.h   |   2 +
 linux/ia64/arch_defs_.h      |   1 +
 linux/powerpc64/arch_defs_.h |   2 +
 linux/riscv/arch_defs_.h     |   2 +
 linux/s390x/arch_defs_.h     |   2 +
 linux/sparc64/arch_defs_.h   |   2 +
 linux/tile/arch_defs_.h      |   2 +
 linux/x32/arch_defs_.h       |   2 +
 linux/x86_64/arch_defs_.h    |   3 +
 strace.1.in                  |  21 +-
 strace.c                     | 100 +++++-
 trace_event.h                |   5 +
 17 files changed, 785 insertions(+), 5 deletions(-)
 create mode 100644 filter_seccomp.c
 create mode 100644 filter_seccomp.h

diff --git a/Makefile.am b/Makefile.am
index b4f31568..f4c65b0a 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -129,6 +129,8 @@ strace_SOURCES =	\
 	file_ioctl.c	\
 	filter.h	\
 	filter_qualify.c \
+	filter_seccomp.c \
+	filter_seccomp.h \
 	flock.c		\
 	flock.h		\
 	fs_x_ioctl.c	\
diff --git a/NEWS b/NEWS
index 76fdf58d..6f7c3b67 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,8 @@ Noteworthy changes in release ?.? (????-??-??)
 ==============================================
 
 * Improvements
+  * Implemented filtering of system calls via seccomp-bpf.  Use -n option to
+    enable.
   * Implemented decoding of pidfd_open syscall.
   * Enhanced decoding of NETLINK_ROUTE protocol.
   * Implemented decoding of UNIX_DIAG_UID netlink attribute.
diff --git a/defs.h b/defs.h
index 337c9bd7..bdb319a2 100644
--- a/defs.h
+++ b/defs.h
@@ -332,6 +332,9 @@ struct tcb {
 # define TCB_DELAYED	0x2000	/* Current syscall has been delayed */
 # define TCB_TAMPERED_NO_FAIL 0x4000	/* We tamper tcb with syscall
 					   that should not fail. */
+# define TCB_SECCOMP_FILTER	0x8000	/* This process has a seccomp filter
+					 * attached.
+					 */
 
 /* qualifier flags */
 # define QUAL_TRACE	0x001	/* this system call should be traced */
@@ -358,6 +361,7 @@ struct tcb {
 # define inject_delay_exit(tcp)	((tcp)->flags & TCB_INJECT_DELAY_EXIT)
 # define syscall_delayed(tcp)	((tcp)->flags & TCB_DELAYED)
 # define syscall_tampered_nofail(tcp) ((tcp)->flags & TCB_TAMPERED_NO_FAIL)
+# define has_seccomp_filter(tcp)	((tcp)->flags & TCB_SECCOMP_FILTER)
 
 extern const struct_sysent stub_sysent;
 # define tcp_sysent(tcp) (tcp->s_ent ?: &stub_sysent)
diff --git a/filter_seccomp.c b/filter_seccomp.c
new file mode 100644
index 00000000..dd3aa173
--- /dev/null
+++ b/filter_seccomp.c
@@ -0,0 +1,617 @@
+/*
+ * Copyright (c) 2018 Chen Jingpiao <chenjingpiao at gmail.com>
+ * Copyright (c) 2019 Paul Chaignon <paul.chaignon at gmail.com>
+ * Copyright (c) 2019 The strace developers.
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: LGPL-2.1-or-later
+ */
+
+#include "defs.h"
+
+#include "ptrace.h"
+#include <signal.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <linux/audit.h>
+#include <linux/filter.h>
+
+#include "filter_seccomp.h"
+#include "number_set.h"
+#include "syscall.h"
+#include "scno.h"
+
+bool seccomp_filtering;
+bool seccomp_before_sysentry;
+
+#ifdef HAVE_LINUX_SECCOMP_H
+
+# include <linux/seccomp.h>
+
+# ifndef BPF_MAXINSNS
+#  define BPF_MAXINSNS 4096
+# endif
+
+/* Jump-offset placeholders; replace_jmp_placeholders() fills them in.  */
+# define JMP_PLACEHOLDER_NEXT  ((unsigned char) -1)
+# define JMP_PLACEHOLDER_TRACE ((unsigned char) -2)
+
+/* Instruction setters; SET_BPF_JUMP takes k before jt and jf.  */
+# define SET_BPF(filter, code, jt, jf, k) \
+	(*(filter) = (struct sock_filter) { code, jt, jf, k })
+# define SET_BPF_STMT(filter, code, k) \
+	SET_BPF(filter, code, 0, 0, k)
+# define SET_BPF_JUMP(filter, code, k, jt, jf) \
+	SET_BPF(filter, BPF_JMP | code, jt, jf, k)
+
+struct audit_arch_t {
+	unsigned int arch;	/* matched against seccomp_data.arch */
+	unsigned int flag;	/* OR'ed into syscall numbers, 0 if unused */
+};
+/* One entry per personality, indexed by personality number.  */
+static const struct audit_arch_t audit_arch_vec[SUPPORTED_PERSONALITIES] = {
+# if SUPPORTED_PERSONALITIES > 1
+	PERSONALITY0_AUDIT_ARCH,
+	PERSONALITY1_AUDIT_ARCH,
+#  if SUPPORTED_PERSONALITIES > 2
+	PERSONALITY2_AUDIT_ARCH,
+#  endif
+# endif
+};
+
+# ifdef ENABLE_COVERAGE_GCOV
+extern void __gcov_flush(void);
+# endif
+
+/* Probe child: install a filter that traces only gettid, request tracing,
+   stop, then call gettid once and exit with status 0.  */
+static void ATTRIBUTE_NORETURN
+check_seccomp_order_do_child(void)
+{
+	static const struct sock_filter filter[] = {
+		/* return (nr == __NR_gettid) ? RET_TRACE : RET_ALLOW; */
+		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
+			 offsetof(struct seccomp_data, nr)),
+		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_gettid, 0, 1),
+		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRACE),
+		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW)
+	};
+	static const struct sock_fprog prog = {
+		.len = ARRAY_SIZE(filter),
+		.filter = (struct sock_filter *) filter
+	};
+
+	/* Get everything ready before PTRACE_TRACEME.  */
+	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
+		perror_func_msg_and_die("prctl(PR_SET_NO_NEW_PRIVS)");
+	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0)
+		perror_func_msg_and_die("prctl(PR_SET_SECCOMP)");
+	int pid = getpid();
+	if (ptrace(PTRACE_TRACEME, 0L, 0L, 0L) < 0) {
+		/* Exit with a nonzero exit status.  */
+		perror_func_msg_and_die("PTRACE_TRACEME");
+	}
+# ifdef ENABLE_COVERAGE_GCOV
+	__gcov_flush();
+# endif
+
+	kill(pid, SIGSTOP);
+	syscall(__NR_gettid);
+	_exit(0);
+}
+
+/* Drives the probe child PID through its stops.  Records in
+   seccomp_before_sysentry which stop came first and sets seccomp_filtering
+   on the expected exit.  Returns 0 once the child is gone, PID otherwise.  */
+static int
+check_seccomp_order_tracer(int pid)
+{
+	for (unsigned int step = 0; ; ++step) {
+		int status;
+
+		for (;;) {
+			long rc = waitpid(pid, &status, 0);
+			if (rc < 0 && errno == EINTR)
+				continue;
+			if (rc == pid)
+				break;
+			/* Cannot happen.  */
+			perror_func_msg("#%u: unexpected wait result %ld",
+					step, rc);
+			return pid;
+		}
+
+		if (WIFEXITED(status)) {
+			/* The tracee is no more.  */
+			pid = 0;
+
+			int exitstatus = WEXITSTATUS(status);
+			if (step == 5 && exitstatus == 0) {
+				seccomp_filtering = true;
+			} else {
+				error_func_msg("#%u: unexpected exit status %d",
+					       step, exitstatus);
+			}
+			break;
+		}
+
+		if (WIFSIGNALED(status)) {
+			/* The tracee is no more.  */
+			pid = 0;
+
+			error_func_msg("#%u: unexpected signal %d",
+				       step, WTERMSIG(status));
+			break;
+		}
+
+		if (!WIFSTOPPED(status)) {
+			/* Cannot happen.  */
+			error_func_msg("#%u: unexpected wait status %#x",
+				       step, status);
+			break;
+		}
+
+		unsigned int event = (unsigned int) status >> 16;
+		switch (WSTOPSIG(status)) {
+		case SIGSTOP:
+			if (step != 0) {
+				error_func_msg("#%u: unexpected signal stop",
+					       step);
+				return pid;
+			}
+			if (ptrace(PTRACE_SETOPTIONS, pid, 0L,
+				   PTRACE_O_TRACESYSGOOD|
+				   PTRACE_O_TRACESECCOMP) < 0) {
+				perror_func_msg("PTRACE_SETOPTIONS");
+				return pid;
+			}
+			break;
+
+		case SIGTRAP:
+			if (event != PTRACE_EVENT_SECCOMP) {
+				error_func_msg("#%u: unexpected trap %#x",
+					       step, event);
+				return pid;
+			}
+
+			switch (step) {
+			case 1: /* Seccomp stop before entering gettid.  */
+				seccomp_before_sysentry = true;
+				break;
+			case 2: /* Seccomp stop after entering gettid.  */
+				if (!seccomp_before_sysentry)
+					break;
+				ATTRIBUTE_FALLTHROUGH;
+			default:
+				error_func_msg("#%u: unexpected seccomp stop",
+					       step);
+				return pid;
+			}
+			break;
+
+		case SIGTRAP | 0x80:
+			switch (step) {
+			case 3: /* Exiting gettid.  */
+			case 4: /* Entering exit_group.  */
+				break;
+			case 1: /* Entering gettid before seccomp stop.  */
+				seccomp_before_sysentry = false;
+				break;
+			case 2: /* Entering gettid after seccomp stop.  */
+				if (seccomp_before_sysentry)
+					break;
+				ATTRIBUTE_FALLTHROUGH;
+			default:
+				error_func_msg("#%u: unexpected syscall stop",
+					       step);
+				return pid;
+			}
+			break;
+
+		default:
+			error_func_msg("#%u: unexpected stop signal %#x",
+				       step, WSTOPSIG(status));
+			return pid;
+		}
+
+		if (ptrace(PTRACE_SYSCALL, pid, 0L, 0L) < 0) {
+			/* Cannot happen.  */
+			perror_func_msg("#%u: PTRACE_SYSCALL", step);
+			break;
+		}
+	}
+
+	return pid;
+}
+
+/* Fork a probe child and let check_seccomp_order_tracer() drive it.  */
+static void
+check_seccomp_order(void)
+{
+	/* Pessimistic default; only the tracer may set it back to true.  */
+	seccomp_filtering = false;
+
+	const int child = fork();
+	if (child == 0)
+		check_seccomp_order_do_child();	/* never returns */
+	if (child < 0) {
+		perror_func_msg("fork");
+		return;
+	}
+
+	/* A nonzero result is a tracee that is still around: kill and reap. */
+	const int leftover = check_seccomp_order_tracer(child);
+	if (!leftover)
+		return;
+
+	kill(leftover, SIGKILL);
+	/* Retry the reaping wait only when interrupted by a signal.  */
+	while (waitpid(leftover, NULL, 0) < 0 && errno == EINTR)
+		continue;
+}
+
+/* Tells whether syscall SCNO of personality P is given SECCOMP_RET_TRACE:
+   it is in trace_set or flagged as indirect subcall or seccomp default.  */
+static bool
+traced_by_seccomp(unsigned int scno, unsigned int p)
+{
+	return is_number_in_set_array(scno, trace_set, p)
+	       || (sysent_vec[p][scno].sys_flags
+		   & (TRACE_INDIRECT_SUBCALL | TRACE_SECCOMP_DEFAULT)) != 0;
+}
+
+/* Clears seccomp_filtering when a section or the whole program is too big. */
+static void
+check_bpf_program_size(void)
+{
+	unsigned int nb_insns = SUPPORTED_PERSONALITIES > 1 ? 1 : 0;
+	/*
+	 * Implements a simplified form of init_sock_filter()'s bytecode
+	 * generation algorithm, to count the number of instructions that will
+	 * be generated.
+	 */
+	for (int p = SUPPORTED_PERSONALITIES - 1;
+	     p >= 0 && nb_insns < BPF_MAXINSNS; --p) {
+		unsigned int nb_insns_personality = 0;
+		unsigned int lower = UINT_MAX;
+
+		nb_insns_personality++;		/* load of .nr */
+# if SUPPORTED_PERSONALITIES > 1
+		nb_insns_personality++;		/* comparison of .arch */
+		if (audit_arch_vec[p].flag)
+			nb_insns_personality += 3;	/* flag check */
+# endif
+
+		for (unsigned int i = 0; i < nsyscall_vec[p]; ++i) {
+			if (traced_by_seccomp(i, p)) {
+				if (lower == UINT_MAX)
+					lower = i;
+				continue;
+			}
+			if (lower == UINT_MAX)
+				continue;
+			if (lower + 1 == i)
+				nb_insns_personality++;
+			else
+				nb_insns_personality += 2;
+			lower = UINT_MAX;
+		}
+		if (lower != UINT_MAX) {
+			if (lower + 1 == nsyscall_vec[p])
+				nb_insns_personality++;
+			else
+				nb_insns_personality += 2;
+		}
+
+		nb_insns_personality += 3;	/* bound check, two returns */
+
+		/*
+		 * Within generated BPF programs, the origin and destination of
+		 * jumps are always in the same personality section.  The
+		 * largest jump is therefore the jump from the first
+		 * instruction of the section to the last, to skip the
+		 * personality and try to compare .arch to the next
+		 * personality.
+		 * If we have a personality section with more than 255
+		 * instructions, the jump offset will overflow.  Such a program
+		 * is unlikely to happen, so we simply disable seccomp filter
+		 * in such a case.
+		 */
+		if (nb_insns_personality > UCHAR_MAX) {
+			debug_msg("seccomp filter disabled due to "
+				  "possibility of overflow");
+			seccomp_filtering = false;
+			return;
+		}
+		nb_insns += nb_insns_personality;
+	}
+
+# if SUPPORTED_PERSONALITIES > 1
+	nb_insns++;	/* final RET_TRACE */
+# endif
+
+	if (nb_insns > BPF_MAXINSNS) {
+		debug_msg("seccomp filter disabled due to BPF program being "
+			  "oversized (%u > %d)", nb_insns, BPF_MAXINSNS);
+		seccomp_filtering = false;
+	}
+}
+
+/* Decides whether seccomp filtering is usable; sets seccomp_filtering.  */
+static void
+check_seccomp_filter_properties(void)
+{
+	seccomp_filtering = false;
+	if (NOMMU_SYSTEM)
+		return;
+	/* Probe with a NULL program: any failure except EINVAL passes.  */
+	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0
+	    || errno == EINVAL) {
+		debug_func_perror_msg("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
+		return;
+	}
+	seccomp_filtering = true;
+	check_bpf_program_size();
+	if (seccomp_filtering)
+		check_seccomp_order();
+}
+
+/*
+ * Debug aid: reports every instruction of FILTER, LEN instructions long,
+ * through error_msg(), one message per instruction.
+ */
+static void
+dump_seccomp_bpf(const struct sock_filter *filter, unsigned short len)
+{
+	const struct sock_filter *const end = filter + len;
+
+	for (const struct sock_filter *bpf = filter; bpf < end; ++bpf) {
+		const unsigned int k = bpf->k;
+
+		switch (bpf->code) {
+		case BPF_LD | BPF_W | BPF_ABS:
+			if (k == offsetof(struct seccomp_data, arch)) {
+				error_msg("STMT(BPF_LDWABS, data->arch)");
+			} else if (k == offsetof(struct seccomp_data, nr)) {
+				error_msg("STMT(BPF_LDWABS, data->nr)");
+			} else {
+				error_msg("STMT(BPF_LDWABS, 0x%x)", k);
+			}
+			break;
+		case BPF_RET | BPF_K:
+			if (k == SECCOMP_RET_TRACE) {
+				error_msg("STMT(BPF_RET, SECCOMP_RET_TRACE)");
+			} else if (k == SECCOMP_RET_ALLOW) {
+				error_msg("STMT(BPF_RET, SECCOMP_RET_ALLOW)");
+			} else {
+				error_msg("STMT(BPF_RET, 0x%x)", k);
+			}
+			break;
+		case BPF_JMP | BPF_JEQ | BPF_K:
+			error_msg("JUMP(BPF_JEQ, %u, %u, %u)",
+				  bpf->jt, bpf->jf, k);
+			break;
+		case BPF_JMP | BPF_JGE | BPF_K:
+			error_msg("JUMP(BPF_JGE, %u, %u, %u)",
+				  bpf->jt, bpf->jf, k);
+			break;
+		case BPF_JMP | BPF_JA:
+			error_msg("JUMP(BPF_JA, %u)", k);
+			break;
+		default:
+			error_msg("STMT(0x%x, %u, %u, 0x%x)",
+				  bpf->code, bpf->jt, bpf->jf, k);
+			break;
+		}
+	}
+}
+
+/*
+ * Patches *JMP_OFFSET if it still holds a placeholder: the "next" one
+ * becomes JMP_NEXT and the "trace" one JMP_TRACE; other values are kept.
+ */
+static void
+replace_jmp_placeholders(unsigned char *jmp_offset, unsigned char jmp_next,
+			 unsigned char jmp_trace)
+{
+	const unsigned char current = *jmp_offset;
+
+	if (current == JMP_PLACEHOLDER_NEXT)
+		*jmp_offset = jmp_next;
+	else if (current == JMP_PLACEHOLDER_TRACE)
+		*jmp_offset = jmp_trace;
+}
+
+/* Emits at FILTER the test for syscall numbers in [LOWER, UPPER);
+   returns the number of instructions written (one or two).  */
+static unsigned short
+bpf_syscalls_cmp(struct sock_filter *filter,
+		 unsigned int lower, unsigned int upper)
+{
+	if (upper - lower != 1) {
+		/* if (nr >= lower && nr < upper) return RET_TRACE; */
+		SET_BPF_JUMP(&filter[0], BPF_JGE | BPF_K, lower, 0, 1);
+		SET_BPF_JUMP(&filter[1], BPF_JGE | BPF_K, upper, 0,
+			     JMP_PLACEHOLDER_TRACE);
+		return 2;
+	}
+	/* if (nr == lower) return RET_TRACE; */
+	SET_BPF_JUMP(filter, BPF_JEQ | BPF_K, lower, JMP_PLACEHOLDER_TRACE, 0);
+	return 1;
+}
+
+/* Writes the seccomp program into FILTER; returns its instruction count.  */
+static unsigned short
+init_sock_filter(struct sock_filter *filter)
+{
+	/*
+	 * Generated program looks like:
+	 * if (arch == AUDIT_ARCH_A && nr >= flag) {
+	 *	if (nr == 59)
+	 *		return SECCOMP_RET_TRACE;
+	 *	if (nr >= 321 && nr <= 323)
+	 *		return SECCOMP_RET_TRACE;
+	 *	...
+	 *	return SECCOMP_RET_ALLOW;
+	 * }
+	 * if (arch == AUDIT_ARCH_A) {
+	 *	...
+	 * }
+	 * if (arch == AUDIT_ARCH_B) {
+	 *	...
+	 * }
+	 * return SECCOMP_RET_TRACE;
+	 */
+	unsigned short pos = 0;
+# if SUPPORTED_PERSONALITIES > 1
+	SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
+		     offsetof(struct seccomp_data, arch));
+# endif
+
+	/*
+	 * Personalities are iterated in reverse-order in the BPF program so
+	 * that the x86 case is naturally handled.  On x86, the first and third
+	 * personalities have the same arch identifier.  The third can be
+	 * distinguished based on its associated syscall flag, so we check it
+	 * first.  The only drawback here is that the first personality is more
+	 * common, which may make the BPF program slower to match syscalls on
+	 * average.
+	 */
+	for (int p = SUPPORTED_PERSONALITIES - 1; p >= 0; --p) {
+		unsigned int lower = UINT_MAX;
+		unsigned short start = pos, end;
+
+# if SUPPORTED_PERSONALITIES > 1
+		/* if (arch != audit_arch_vec[p].arch) goto next; */
+		SET_BPF_JUMP(&filter[pos++], BPF_JEQ | BPF_K,
+			     audit_arch_vec[p].arch, 0, JMP_PLACEHOLDER_NEXT);
+# endif
+		SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
+			     offsetof(struct seccomp_data, nr));
+
+# if SUPPORTED_PERSONALITIES > 1
+		if (audit_arch_vec[p].flag) {
+			/* if (nr < audit_arch_vec[p].flag) goto next; */
+			SET_BPF_JUMP(&filter[pos++], BPF_JGE | BPF_K,
+				     audit_arch_vec[p].flag, 2, 0);
+			SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
+				     offsetof(struct seccomp_data, arch));
+			SET_BPF_JUMP(&filter[pos++], BPF_JA,
+				     JMP_PLACEHOLDER_NEXT, 0, 0);
+		}
+# endif
+
+		for (unsigned int i = 0; i < nsyscall_vec[p]; ++i) {
+			if (traced_by_seccomp(i, p)) {
+				if (lower == UINT_MAX)
+					lower = i;
+				continue;
+			}
+			if (lower == UINT_MAX)
+				continue;
+			pos += bpf_syscalls_cmp(filter + pos,
+						lower | audit_arch_vec[p].flag,
+						i | audit_arch_vec[p].flag);
+			lower = UINT_MAX;
+		}
+		if (lower != UINT_MAX)
+			pos += bpf_syscalls_cmp(filter + pos,
+						lower | audit_arch_vec[p].flag,
+						nsyscall_vec[p]
+						| audit_arch_vec[p].flag);
+		end = pos;
+
+		/* if (nr >= max_nr) return RET_TRACE; */
+		SET_BPF_JUMP(&filter[pos++], BPF_JGE | BPF_K,
+			     nsyscall_vec[p] | audit_arch_vec[p].flag, 1, 0);
+
+		SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
+			     SECCOMP_RET_ALLOW);
+		SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
+			     SECCOMP_RET_TRACE);
+		/* Resolve placeholders relative to the section's end.  */
+		for (unsigned int i = start; i < end; ++i) {
+			if (BPF_CLASS(filter[i].code) != BPF_JMP)
+				continue;
+			unsigned char jmp_next = pos - i - 1;
+			unsigned char jmp_trace = pos - i - 2;
+			replace_jmp_placeholders(&filter[i].jt, jmp_next,
+						 jmp_trace);
+			replace_jmp_placeholders(&filter[i].jf, jmp_next,
+						 jmp_trace);
+			if (BPF_OP(filter[i].code) == BPF_JA)
+				filter[i].k = (unsigned int) jmp_next;
+		}
+	}
+
+# if SUPPORTED_PERSONALITIES > 1
+	/* Jumps conditioned on .arch default to this RET_TRACE. */
+	SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K, SECCOMP_RET_TRACE);
+# endif
+
+	if (debug_flag)
+		dump_seccomp_bpf(filter, pos);
+
+	return pos;
+}
+
+/*
+ * Generates the syscall filter with init_sock_filter() and installs it
+ * through prctl(); a failing prctl() call goes to perror_func_msg_and_die().
+ */
+void
+init_seccomp_filter(void)
+{
+	struct sock_filter insns[BPF_MAXINSNS];
+	struct sock_fprog prog = { .filter = insns };
+
+	prog.len = init_sock_filter(insns);
+
+	/* Same preparation as in check_seccomp_order_do_child().  */
+	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
+		perror_func_msg_and_die("prctl(PR_SET_NO_NEW_PRIVS)");
+
+	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0)
+		perror_func_msg_and_die("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
+}
+
+/* Chooses between PTRACE_SYSCALL and PTRACE_CONT for restarting TCP.  */
+int
+seccomp_filter_restart_operator(const struct tcb *tcp)
+{
+	return (exiting(tcp) && tcp->scno < nsyscall_vec[current_personality]
+		&& traced_by_seccomp(tcp->scno, current_personality))
+	       ? PTRACE_SYSCALL : PTRACE_CONT;
+}
+
+#else /* !HAVE_LINUX_SECCOMP_H */
+
+# warning <linux/seccomp.h> is not available, seccomp filtering is not supported
+
+static void
+check_seccomp_filter_properties(void)
+{
+	seccomp_filtering = false;	/* built without <linux/seccomp.h> */
+}
+
+void
+init_seccomp_filter(void)
+{
+}	/* stub: nothing is installed */
+
+int
+seccomp_filter_restart_operator(const struct tcb *tcp)
+{
+	return PTRACE_SYSCALL;	/* stub: TCP is not consulted */
+}
+
+#endif
+
+/* Runs the checks and reports when they leave filtering disabled.  */
+void
+check_seccomp_filter(void)
+{
+	check_seccomp_filter_properties();
+	if (!seccomp_filtering)
+		error_msg("seccomp filter is requested but unavailable");
+}
diff --git a/filter_seccomp.h b/filter_seccomp.h
new file mode 100644
index 00000000..5e4d2f80
--- /dev/null
+++ b/filter_seccomp.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2018 Chen Jingpiao <chenjingpiao at gmail.com>
+ * Copyright (c) 2019 Paul Chaignon <paul.chaignon at gmail.com>
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: LGPL-2.1-or-later
+ */
+
+#ifndef STRACE_SECCOMP_FILTER_H
+#define STRACE_SECCOMP_FILTER_H
+
+#include "defs.h"
+
+extern bool seccomp_filtering;		/* set by -n, cleared by failed checks */
+extern bool seccomp_before_sysentry;	/* seccomp-stop precedes entry-stop */
+
+extern void check_seccomp_filter(void);
+extern void init_seccomp_filter(void);
+extern int seccomp_filter_restart_operator(const struct tcb *);
+
+#endif /* !STRACE_SECCOMP_FILTER_H */
diff --git a/linux/aarch64/arch_defs_.h b/linux/aarch64/arch_defs_.h
index ed9261f5..fb75722f 100644
--- a/linux/aarch64/arch_defs_.h
+++ b/linux/aarch64/arch_defs_.h
@@ -9,3 +9,5 @@
 #define HAVE_ARCH_OLD_SELECT 1
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define SUPPORTED_PERSONALITIES 2
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_AARCH64, 0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_ARM,     0 }
diff --git a/linux/ia64/arch_defs_.h b/linux/ia64/arch_defs_.h
index 87ca0cdb..107a74df 100644
--- a/linux/ia64/arch_defs_.h
+++ b/linux/ia64/arch_defs_.h
@@ -9,3 +9,4 @@
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define HAVE_ARCH_SA_RESTORER 0
 #define HAVE_ARCH_DEDICATED_ERR_REG 1
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_IA64, SYSCALLENT_BASE_NR }
diff --git a/linux/powerpc64/arch_defs_.h b/linux/powerpc64/arch_defs_.h
index 871f4109..a4ac007e 100644
--- a/linux/powerpc64/arch_defs_.h
+++ b/linux/powerpc64/arch_defs_.h
@@ -8,3 +8,5 @@
 #define HAVE_ARCH_OLD_SELECT 1
 #define SUPPORTED_PERSONALITIES 2
 #define HAVE_ARCH_DEDICATED_ERR_REG 1
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_PPC64, 0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_PPC,   0 }
diff --git a/linux/riscv/arch_defs_.h b/linux/riscv/arch_defs_.h
index a9c27bc7..f53f076a 100644
--- a/linux/riscv/arch_defs_.h
+++ b/linux/riscv/arch_defs_.h
@@ -7,4 +7,6 @@
 
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define SUPPORTED_PERSONALITIES 2
+# define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_RISCV64, 0 }
+# define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_RISCV32, 0 }
 #define CAN_ARCH_BE_COMPAT_ON_64BIT_KERNEL 1
diff --git a/linux/s390x/arch_defs_.h b/linux/s390x/arch_defs_.h
index 1e520761..750ab512 100644
--- a/linux/s390x/arch_defs_.h
+++ b/linux/s390x/arch_defs_.h
@@ -9,3 +9,5 @@
 #define HAVE_ARCH_OLD_MMAP_PGOFF 1
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define SUPPORTED_PERSONALITIES 2
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_S390X, 0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_S390,  0 }
diff --git a/linux/sparc64/arch_defs_.h b/linux/sparc64/arch_defs_.h
index 68eef4fc..9eacaa40 100644
--- a/linux/sparc64/arch_defs_.h
+++ b/linux/sparc64/arch_defs_.h
@@ -9,4 +9,6 @@
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define HAVE_ARCH_SA_RESTORER 1
 #define SUPPORTED_PERSONALITIES 2
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_SPARC64, 0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_SPARC,   0 }
 #define HAVE_ARCH_DEDICATED_ERR_REG 1
diff --git a/linux/tile/arch_defs_.h b/linux/tile/arch_defs_.h
index a781208c..12ba0d8b 100644
--- a/linux/tile/arch_defs_.h
+++ b/linux/tile/arch_defs_.h
@@ -6,6 +6,8 @@
  */
 
 #define SUPPORTED_PERSONALITIES 2
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_TILEGX,   0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_TILEGX32, 0 }
 #define CAN_ARCH_BE_COMPAT_ON_64BIT_KERNEL 1
 
 #ifdef __tilepro__
diff --git a/linux/x32/arch_defs_.h b/linux/x32/arch_defs_.h
index 1055de12..9f48d313 100644
--- a/linux/x32/arch_defs_.h
+++ b/linux/x32/arch_defs_.h
@@ -11,3 +11,5 @@
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define HAVE_ARCH_OLD_TIME64_SYSCALLS 1
 #define SUPPORTED_PERSONALITIES 2
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_X86_64, __X32_SYSCALL_BIT }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_I386,   0 }
diff --git a/linux/x86_64/arch_defs_.h b/linux/x86_64/arch_defs_.h
index a8c1d991..c2924ac2 100644
--- a/linux/x86_64/arch_defs_.h
+++ b/linux/x86_64/arch_defs_.h
@@ -9,3 +9,6 @@
 #define HAVE_ARCH_OLD_SELECT 1
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define SUPPORTED_PERSONALITIES 3
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_X86_64, 0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_I386,   0 }
+#define PERSONALITY2_AUDIT_ARCH { AUDIT_ARCH_X86_64, __X32_SYSCALL_BIT }
diff --git a/strace.1.in b/strace.1.in
index 78bbc628..bf20d179 100644
--- a/strace.1.in
+++ b/strace.1.in
@@ -38,8 +38,8 @@
 strace \- trace system calls and signals
 .SH SYNOPSIS
 .SY strace
-.if '@ENABLE_STACKTRACE_FALSE@'#' .OP \-ACdffhikqqrtttTvVwxxyyzZ
-.if '@ENABLE_STACKTRACE_TRUE@'#' .OP \-ACdffhiqqrtttTvVwxxyyzZ
+.if '@ENABLE_STACKTRACE_FALSE@'#' .OP \-ACdffhiknqqrtttTvVwxxyyzZ
+.if '@ENABLE_STACKTRACE_TRUE@'#' .OP \-ACdffhinqqrtttTvVwxxyyzZ
 .OP \-I n
 .OP \-b execve
 .OM \-e expr
@@ -970,6 +970,23 @@ Show some debugging output of
 .B strace
 itself on the standard error.
 .TP
+.B \-n
+Enable (experimental) usage of seccomp-bpf to have ptrace(2)-stops only when
+system calls that are being traced occur in the traced processes.  Implies the
+.B \-f
+option.
+An attempt to rely on seccomp-bpf to filter system calls may fail for various
+reasons, e.g. there are too many system calls to filter, the seccomp API is not
+available, or
+.B strace
+itself is being traced.
+.B \-n
+is also ineffective on processes attached using
+.BR \-p .
+In cases when seccomp-bpf filter setup failed,
+.B strace
+proceeds as usual and stops traced processes on every system call.
+.TP
 .B \-F
 This option is deprecated.  It is retained for backward compatibility only
 and may be removed in future releases.
diff --git a/strace.c b/strace.c
index e8ced366..6ffa1d56 100644
--- a/strace.c
+++ b/strace.c
@@ -30,6 +30,7 @@
 #endif
 
 #include "kill_save_errno.h"
+#include "filter_seccomp.h"
 #include "largefile_wrappers.h"
 #include "mmap_cache.h"
 #include "number_set.h"
@@ -236,7 +237,7 @@ usage(void)
 #endif
 
 	printf("\
-usage: strace [-ACdffhi" K_OPT "qqrtttTvVwxxyyzZ] [-I n] [-b execve] [-e expr]...\n\
+usage: strace [-ACdffhi" K_OPT "nqqrtttTvVwxxyyzZ] [-I n] [-b execve] [-e expr]...\n\
               [-a column] [-o file] [-s strsize] [-X format] [-P path]...\n\
               [-p pid]...\n\
 	      { -p pid | [-D] [-E var=val]... [-u username] PROG [ARGS] }\n\
@@ -308,6 +309,7 @@ Startup:\n\
 \n\
 Miscellaneous:\n\
   -d             enable debug output to stderr\n\
+  -n             enable seccomp-bpf filtering\n\
   -h             print help message\n\
   -V             print version\n\
 "
@@ -1231,6 +1233,10 @@ exec_or_die(void)
 	if (params_for_tracee.child_sa.sa_handler != SIG_DFL)
 		sigaction(SIGCHLD, &params_for_tracee.child_sa, NULL);
 
+	debug_msg("seccomp filter %s",
+		  seccomp_filtering ? "enabled" : "disabled");
+	if (seccomp_filtering)
+		init_seccomp_filter();
 	execv(params->pathname, params->argv);
 	perror_msg_and_die("exec");
 }
@@ -1469,6 +1475,10 @@ startup_child(char **argv)
 		 * to create a genuine separate stack and execute on it.
 		 */
 	}
+
+	if (seccomp_filtering)
+		tcp->flags |= TCB_SECCOMP_FILTER;
+
 	/*
 	 * A case where straced process is part of a pipe:
 	 * { sleep 1; yes | head -n99999; } | strace -o/dev/null sh -c 'exec <&-; sleep 9'
@@ -1606,7 +1616,7 @@ init(int argc, char *argv[])
 #ifdef ENABLE_STACKTRACE
 	    "k"
 #endif
-	    "a:Ab:cCdDe:E:fFhiI:o:O:p:P:qrs:S:tTu:vVwxX:yzZ")) != EOF) {
+	    "a:Ab:cCdDe:E:fFhiI:no:O:p:P:qrs:S:tTu:vVwxX:yzZ")) != EOF) {
 		switch (c) {
 		case 'a':
 			acolumn = string_to_uint(optarg);
@@ -1706,6 +1716,9 @@ init(int argc, char *argv[])
 		case 'u':
 			username = optarg;
 			break;
+		case 'n':
+			seccomp_filtering = true;
+			break;
 		case 'v':
 			qualify("abbrev=none");
 			break;
@@ -1759,6 +1772,15 @@ init(int argc, char *argv[])
 		error_msg_and_help("PROG [ARGS] must be specified with -D");
 	}
 
+	if (seccomp_filtering) {
+		if (nprocs && (!argc || debug_flag))
+			error_msg("-n is not enabled for processes attached with -p");
+		if (!followfork) {
+			error_msg("-n implies -f");
+			followfork = 1;
+		}
+	}
+
 	if (optF) {
 		if (followfork) {
 			error_msg("deprecated option -F ignored");
@@ -1834,6 +1856,12 @@ init(int argc, char *argv[])
 		ptrace_setoptions |= PTRACE_O_TRACECLONE |
 				     PTRACE_O_TRACEFORK |
 				     PTRACE_O_TRACEVFORK;
+
+	if (seccomp_filtering)
+		check_seccomp_filter();
+	if (seccomp_filtering)
+		ptrace_setoptions |= PTRACE_O_TRACESECCOMP;
+
 	debug_msg("ptrace_setoptions = %#x", ptrace_setoptions);
 	test_ptrace_seize();
 	test_ptrace_get_syscall_info();
@@ -2021,6 +2049,7 @@ print_debug_info(const int pid, int status)
 			[PTRACE_EVENT_VFORK_DONE] = "VFORK_DONE",
 			[PTRACE_EVENT_EXEC]  = "EXEC",
 			[PTRACE_EVENT_EXIT]  = "EXIT",
+			[PTRACE_EVENT_SECCOMP]  = "SECCOMP",
 			/* [PTRACE_EVENT_STOP (=128)] would make biggish array */
 		};
 		const char *e = "??";
@@ -2546,6 +2575,9 @@ next_event(void)
 			case PTRACE_EVENT_EXIT:
 				wd->te = TE_STOP_BEFORE_EXIT;
 				break;
+			case PTRACE_EVENT_SECCOMP:
+				wd->te = TE_SECCOMP;
+				break;
 			default:
 				wd->te = TE_RESTART;
 			}
@@ -2631,7 +2663,7 @@ trace_syscall(struct tcb *tcp, unsigned int *sig)
 static bool
 dispatch_event(const struct tcb_wait_data *wd)
 {
-	unsigned int restart_op = PTRACE_SYSCALL;
+	unsigned int restart_op;
 	unsigned int restart_sig = 0;
 	enum trace_event te = wd ? wd->te : TE_BREAK;
 	/*
@@ -2640,6 +2672,11 @@ dispatch_event(const struct tcb_wait_data *wd)
 	 */
 	int status = wd ? wd->status : 0;
 
+	if (current_tcp && has_seccomp_filter(current_tcp))
+		restart_op = seccomp_filter_restart_operator(current_tcp);
+	else
+		restart_op = PTRACE_SYSCALL;
+
 	switch (te) {
 	case TE_BREAK:
 		return false;
@@ -2650,6 +2687,27 @@ dispatch_event(const struct tcb_wait_data *wd)
 	case TE_RESTART:
 		break;
 
+	case TE_SECCOMP:
+		if (!has_seccomp_filter(current_tcp)) {
+			/*
+			 * We don't know if forks/clones have a seccomp filter
+			 * when they are created, but we can detect it when we
+			 * have a seccomp-stop.
+			 * In such a case, if !seccomp_before_sysentry, we have
+			 * already processed the syscall entry, so we avoid
+			 * processing it a second time.
+			 */
+			current_tcp->flags |= TCB_SECCOMP_FILTER;
+			restart_op = PTRACE_SYSCALL;
+			break;
+		}
+
+		if (seccomp_before_sysentry) {
+			restart_op = PTRACE_SYSCALL;
+			break;
+		}
+		ATTRIBUTE_FALLTHROUGH;
+
 	case TE_SYSCALL_STOP:
 		if (trace_syscall(current_tcp, &restart_sig) < 0) {
 			/*
@@ -2665,6 +2723,42 @@ dispatch_event(const struct tcb_wait_data *wd)
 			 */
 			return true;
 		}
+		if (has_seccomp_filter(current_tcp)) {
+			/*
+			 * Syscall and seccomp stops can happen in different
+			 * orders depending on kernel.  strace tests this in
+			 * check_seccomp_order_tracer().
+			 *
+			 * Linux 3.5--4.7:
+			 * (seccomp-stop before syscall-entry-stop)
+			 *         +--> seccomp-stop ->-PTRACE_SYSCALL->-+
+			 *         |                                     |
+			 *     PTRACE_CONT                   syscall-entry-stop
+			 *         |                                     |
+			 * syscall-exit-stop <---PTRACE_SYSCALL-----<----+
+			 *
+			 * Linux 4.8+:
+			 * (seccomp-stop after syscall-entry-stop)
+			 *                 syscall-entry-stop
+			 *
+			 *         +---->-----PTRACE_CONT---->----+
+			 *         |                              |
+			 *  syscall-exit-stop               seccomp-stop
+			 *         |                              |
+			 *         +----<----PTRACE_SYSCALL---<---+
+			 *
+			 * Note in Linux 4.8+, we restart in PTRACE_CONT
+			 * after syscall-exit to skip the syscall-entry-stop.
+			 * The next seccomp-stop will be treated as a syscall
+			 * entry.
+			 *
+			 * The line below implements this behavior.
+			 * Note that exiting(current_tcp) actually marks
+			 * a syscall-entry-stop because the flag was inverted
+			 * in the above call to trace_syscall.
+			 */
+			restart_op = exiting(current_tcp) ? PTRACE_SYSCALL : PTRACE_CONT;
+		}
 		break;
 
 	case TE_SIGNAL_DELIVERY_STOP:
diff --git a/trace_event.h b/trace_event.h
index 53a711b8..9021fc55 100644
--- a/trace_event.h
+++ b/trace_event.h
@@ -66,6 +66,11 @@ enum trace_event {
 	 * Restart the tracee with signal 0.
 	 */
 	TE_STOP_BEFORE_EXIT,
+
+	/*
+	 * SECCOMP_RET_TRACE rule is triggered.
+	 */
+	TE_SECCOMP,
 };
 
 #endif /* !STRACE_TRACE_EVENT_H */
-- 
2.17.1