[PATCH v5 1/3] Introduce seccomp-assisted syscall filtering

Dmitry V. Levin ldv at altlinux.org
Sat Sep 21 16:02:24 UTC 2019


From: Chen Jingpiao <chenjingpiao at gmail.com>

With this patch, strace can rely on seccomp to only be stopped at syscalls
of interest, instead of stopping at all syscalls.  The seccomp filtering
of syscalls is opt-in only; it must be enabled with the -n option.  Kernel
support is first checked with check_seccomp_filter(), which also ensures
the BPF program derived from the syscalls to filter is not larger than the
kernel's limit.

The -n option implies -f, but a warning is emitted if -f is not explicitly
specified.  Since a task's children inherit its seccomp filters, we want
to ensure all children are also traced to avoid their syscalls failing
with ENOSYS (cf. SECCOMP_RET_TRACE in seccomp man page).

The current BPF program implements a simple linear match of the syscall
numbers.  Contiguous sequences of syscall numbers are however matched as
an interval, with two instructions only.  The algorithm can be improved
or replaced in the future without impacting user-observed behavior.

The behavior of SECCOMP_RET_TRACE changed between Linux 4.7 and 4.8
(cf. PTRACE_EVENT_SECCOMP in ptrace man page).  This patch supports both
behaviors by checking the kernel's actual behavior before installing the
seccomp filter.

* filter_seccomp.c: New file.
* filter_seccomp.h: New file.
* Makefile.am (strace_SOURCES): Add filter_seccomp.c and filter_seccomp.h.
* linux/aarch64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Define for aarch64.
* linux/powerpc64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for powerpc64.
* linux/riscv/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for riscv.
* linux/s390x/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for s390x.
* linux/sparc64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for sparc64.
* linux/tile/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for tile.
* linux/x32/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for x32.
* linux/x86_64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH, PERSONALITY2_AUDIT_ARCH): Likewise for x86_64.
* linux/ia64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH): Likewise for IA64.
* strace.c (usage): Document -n option.
(exec_or_die): Initialize seccomp filtering if requested.
(init): Handle -n option and check that seccomp can be enabled.
(print_debug_info): Handle PTRACE_EVENT_SECCOMP.
(next_event): Capture PTRACE_EVENT_SECCOMP event.
(dispatch_event): Handle PTRACE_EVENT_SECCOMP event.
* trace_event.h (trace_event): New enumeration entity.
* strace.1.in: Document new -n option.
* NEWS: Mention this change.

Co-authored-by: Paul Chaignon <paul.chaignon at gmail.com>
Co-authored-by: Dmitry V. Levin <ldv at altlinux.org>
---
 Makefile.am                  |   2 +
 NEWS                         |   2 +
 filter_seccomp.c             | 620 +++++++++++++++++++++++++++++++++++
 filter_seccomp.h             |  21 ++
 linux/aarch64/arch_defs_.h   |   2 +
 linux/ia64/arch_defs_.h      |   1 +
 linux/powerpc64/arch_defs_.h |   2 +
 linux/riscv/arch_defs_.h     |   2 +
 linux/s390x/arch_defs_.h     |   2 +
 linux/sparc64/arch_defs_.h   |   2 +
 linux/tile/arch_defs_.h      |   2 +
 linux/x32/arch_defs_.h       |   2 +
 linux/x86_64/arch_defs_.h    |   3 +
 strace.1.in                  |  17 +-
 strace.c                     |  78 ++++-
 trace_event.h                |   5 +
 16 files changed, 758 insertions(+), 5 deletions(-)
 create mode 100644 filter_seccomp.c
 create mode 100644 filter_seccomp.h

diff --git a/Makefile.am b/Makefile.am
index b4f31568f..f4c65b0ac 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -129,6 +129,8 @@ strace_SOURCES =	\
 	file_ioctl.c	\
 	filter.h	\
 	filter_qualify.c \
+	filter_seccomp.c \
+	filter_seccomp.h \
 	flock.c		\
 	flock.h		\
 	fs_x_ioctl.c	\
diff --git a/NEWS b/NEWS
index 76fdf58d6..6f7c3b67a 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,8 @@ Noteworthy changes in release ?.? (????-??-??)
 ==============================================
 
 * Improvements
+  * Implemented filtering of system calls via seccomp-bpf.  Use -n option to
+    enable.
   * Implemented decoding of pidfd_open syscall.
   * Enhanced decoding of NETLINK_ROUTE protocol.
   * Implemented decoding of UNIX_DIAG_UID netlink attribute.
diff --git a/filter_seccomp.c b/filter_seccomp.c
new file mode 100644
index 000000000..0fa6e84de
--- /dev/null
+++ b/filter_seccomp.c
@@ -0,0 +1,620 @@
+/*
+ * Copyright (c) 2018 Chen Jingpiao <chenjingpiao at gmail.com>
+ * Copyright (c) 2019 Paul Chaignon <paul.chaignon at gmail.com>
+ * Copyright (c) 2019 The strace developers.
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: LGPL-2.1-or-later
+ */
+
+#include "defs.h"
+
+#include "ptrace.h"
+#include <signal.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <linux/audit.h>
+#include <linux/filter.h>
+
+#include "filter_seccomp.h"
+#include "number_set.h"
+#include "syscall.h"
+#include "scno.h"
+
+bool seccomp_filtering;
+bool seccomp_before_sysentry;
+
+#ifdef HAVE_LINUX_SECCOMP_H
+
+# include <linux/seccomp.h>
+
+# ifndef BPF_MAXINSNS
+#  define BPF_MAXINSNS 4096
+# endif
+
+# define JMP_PLACEHOLDER_NEXT  ((unsigned char) -1)
+# define JMP_PLACEHOLDER_TRACE ((unsigned char) -2)
+
+# define SET_BPF(filter, code, jt, jf, k) \
+	(*(filter) = (struct sock_filter) { code, jt, jf, k })
+
+# define SET_BPF_STMT(filter, code, k) \
+	SET_BPF(filter, code, 0, 0, k)
+
+# define SET_BPF_JUMP(filter, code, k, jt, jf) \
+	SET_BPF(filter, BPF_JMP | code, jt, jf, k)
+
+struct audit_arch_t {
+	unsigned int arch;
+	unsigned int flag;
+};
+
+static const struct audit_arch_t audit_arch_vec[SUPPORTED_PERSONALITIES] = {
+# if SUPPORTED_PERSONALITIES > 1
+	PERSONALITY0_AUDIT_ARCH,
+	PERSONALITY1_AUDIT_ARCH,
+#  if SUPPORTED_PERSONALITIES > 2
+	PERSONALITY2_AUDIT_ARCH,
+#  endif
+# endif
+};
+
+# ifdef ENABLE_COVERAGE_GCOV
+extern void __gcov_flush(void);
+# endif
+
+static void ATTRIBUTE_NORETURN
+check_seccomp_order_do_child(void)
+{
+	static const struct sock_filter filter[] = {
+		/* return (nr == __NR_gettid) ? RET_TRACE : RET_ALLOW; */
+		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
+			 offsetof(struct seccomp_data, nr)),
+		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_gettid, 0, 1),
+		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRACE),
+		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW)
+	};
+	static const struct sock_fprog prog = {
+		.len = ARRAY_SIZE(filter),
+		.filter = (struct sock_filter *) filter
+	};
+
+	/* Get everything ready before PTRACE_TRACEME.  */
+	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
+		perror_func_msg_and_die("prctl(PR_SET_NO_NEW_PRIVS)");
+	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0)
+		perror_func_msg_and_die("prctl(PR_SET_SECCOMP)");
+	int pid = getpid();
+
+	if (ptrace(PTRACE_TRACEME, 0L, 0L, 0L) < 0) {
+		/* Exit with a nonzero exit status.  */
+		perror_func_msg_and_die("PTRACE_TRACEME");
+	}
+
+# ifdef ENABLE_COVERAGE_GCOV
+	__gcov_flush();
+# endif
+
+	kill(pid, SIGSTOP);
+	syscall(__NR_gettid);
+	_exit(0);
+}
+
+static int
+check_seccomp_order_tracer(int pid)
+{
+	unsigned int step;
+
+	for (step = 0; ; ++step) {
+		int status;
+
+		for (;;) {
+			long rc = waitpid(pid, &status, 0);
+			if (rc < 0 && errno == EINTR)
+				continue;
+			if (rc == pid)
+				break;
+			/* Cannot happen.  */
+			perror_func_msg("#%d: unexpected wait result %ld",
+					step, rc);
+			return pid;
+		}
+
+		if (WIFEXITED(status)) {
+			/* The tracee is no more.  */
+			pid = 0;
+
+			int exitstatus = WEXITSTATUS(status);
+			if (step == 5 && exitstatus == 0) {
+				seccomp_filtering = true;
+			} else {
+				error_func_msg("#%d: unexpected exit status %u",
+					       step, exitstatus);
+			}
+			break;
+		}
+
+		if (WIFSIGNALED(status)) {
+			/* The tracee is no more.  */
+			pid = 0;
+
+			error_func_msg("#%d: unexpected signal %u",
+				       step, WTERMSIG(status));
+			break;
+		}
+
+		if (!WIFSTOPPED(status)) {
+			/* Cannot happen.  */
+			error_func_msg("#%d: unexpected wait status %#x",
+				       step, status);
+			break;
+		}
+
+		unsigned int event = (unsigned int) status >> 16;
+
+		switch (WSTOPSIG(status)) {
+		case SIGSTOP:
+			if (step != 0) {
+				error_func_msg("#%d: unexpected signal stop",
+					       step);
+				return pid;
+			}
+			if (ptrace(PTRACE_SETOPTIONS, pid, 0L,
+				   PTRACE_O_TRACESYSGOOD|
+				   PTRACE_O_TRACESECCOMP) < 0) {
+				perror_func_msg("PTRACE_SETOPTIONS");
+				return pid;
+			}
+			break;
+
+		case SIGTRAP:
+			if (event != PTRACE_EVENT_SECCOMP) {
+				error_func_msg("#%d: unexpected trap %#x",
+					       step, event);
+				return pid;
+			}
+
+			switch (step) {
+			case 1: /* Seccomp stop before entering gettid.  */
+				seccomp_before_sysentry = true;
+				break;
+			case 2: /* Seccomp stop after entering gettid.  */
+				if (!seccomp_before_sysentry)
+					break;
+				ATTRIBUTE_FALLTHROUGH;
+			default:
+				error_func_msg("#%d: unexpected seccomp stop",
+					       step);
+				return pid;
+			}
+			break;
+
+		case SIGTRAP | 0x80:
+			switch (step) {
+			case 3: /* Exiting gettid.  */
+			case 4: /* Entering exit_group.  */
+				break;
+			case 1: /* Entering gettid before seccomp stop.  */
+				seccomp_before_sysentry = false;
+				break;
+			case 2: /* Entering gettid after seccomp stop.  */
+				if (seccomp_before_sysentry)
+					break;
+				ATTRIBUTE_FALLTHROUGH;
+			default:
+				error_func_msg("#%d: unexpected syscall stop",
+					       step);
+				return pid;
+			}
+			break;
+
+		default:
+			error_func_msg("#%d: unexpected stop signal %#x",
+				       step, WSTOPSIG(status));
+			return pid;
+		}
+
+		if (ptrace(PTRACE_SYSCALL, pid, 0L, 0L) < 0) {
+			/* Cannot happen.  */
+			perror_func_msg("#%d: PTRACE_SYSCALL", step);
+			break;
+		}
+	}
+
+	return pid;
+}
+
+static void
+check_seccomp_order(void)
+{
+	seccomp_filtering = false;
+
+	int pid = fork();
+	if (pid < 0) {
+		perror_func_msg("fork");
+		return;
+	}
+
+	if (pid == 0)
+		check_seccomp_order_do_child();
+
+	pid = check_seccomp_order_tracer(pid);
+	if (pid) {
+		kill(pid, SIGKILL);
+		for (;;) {
+			long rc = waitpid(pid, NULL, 0);
+			if (rc < 0 && errno == EINTR)
+				continue;
+			break;
+		}
+	}
+}
+
+static bool
+traced_by_seccomp(unsigned int scno, unsigned int p)
+{
+	if (is_number_in_set_array(scno, trace_set, p)
+	    || sysent_vec[p][scno].sys_flags
+	    & (TRACE_INDIRECT_SUBCALL | TRACE_SECCOMP_DEFAULT))
+		return true;
+	return false;
+}
+
+static void
+check_bpf_program_size(void)
+{
+	unsigned int nb_insns = SUPPORTED_PERSONALITIES > 1 ? 1 : 0;
+
+	/*
+	 * Implements a simplified form of init_sock_filter()'s bytecode
+	 * generation algorithm, to count the number of instructions that will
+	 * be generated.
+	 */
+	for (int p = SUPPORTED_PERSONALITIES - 1;
+	     p >= 0 && nb_insns < BPF_MAXINSNS; --p) {
+		unsigned int nb_insns_personality = 0;
+		unsigned int lower = UINT_MAX;
+
+		nb_insns_personality++;
+# if SUPPORTED_PERSONALITIES > 1
+		nb_insns_personality++;
+		if (audit_arch_vec[p].flag)
+			nb_insns_personality += 3;
+# endif
+
+		for (unsigned int i = 0; i < nsyscall_vec[p]; ++i) {
+			if (traced_by_seccomp(i, p)) {
+				if (lower == UINT_MAX)
+					lower = i;
+				continue;
+			}
+			if (lower == UINT_MAX)
+				continue;
+			if (lower + 1 == i)
+				nb_insns_personality++;
+			else
+				nb_insns_personality += 2;
+			lower = UINT_MAX;
+		}
+		if (lower != UINT_MAX) {
+			if (lower + 1 == nsyscall_vec[p])
+				nb_insns_personality++;
+			else
+				nb_insns_personality += 2;
+		}
+
+		nb_insns_personality += 3;
+
+		/*
+		 * Within generated BPF programs, the origin and destination of
+		 * jumps are always in the same personality section.  The
+		 * largest jump is therefore the jump from the first
+		 * instruction of the section to the last, to skip the
+		 * personality and try to compare .arch to the next
+		 * personality.
+		 * If we have a personality section with more than 255
+		 * instructions, the jump offset will overflow.  Such a program
+		 * is unlikely to happen, so we simply disable the seccomp filter
+		 * in such a case.
+		 */
+		if (nb_insns_personality > UCHAR_MAX) {
+			debug_msg("seccomp filter disabled due to "
+				  "possibility of overflow");
+			seccomp_filtering = false;
+			return;
+		}
+		nb_insns += nb_insns_personality;
+	}
+
+# if SUPPORTED_PERSONALITIES > 1
+	nb_insns++;
+# endif
+
+	if (nb_insns > BPF_MAXINSNS) {
+		debug_msg("seccomp filter disabled due to BPF program being "
+			  "oversized (%u > %d)", nb_insns, BPF_MAXINSNS);
+		seccomp_filtering = false;
+	}
+}
+
+static void
+check_seccomp_filter_properties(void)
+{
+	if (NOMMU_SYSTEM) {
+		seccomp_filtering = false;
+		return;
+	}
+
+	int rc = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0);
+	seccomp_filtering = rc < 0 && errno != EINVAL;
+	if (!seccomp_filtering)
+		debug_func_perror_msg("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
+
+	if (seccomp_filtering)
+		check_bpf_program_size();
+	if (seccomp_filtering)
+		check_seccomp_order();
+}
+
+static void
+dump_seccomp_bpf(const struct sock_filter *filter, unsigned short len)
+{
+	for (unsigned int i = 0; i < len; ++i) {
+		switch (filter[i].code) {
+		case BPF_LD | BPF_W | BPF_ABS:
+			switch (filter[i].k) {
+			case offsetof(struct seccomp_data, arch):
+				error_msg("STMT(BPF_LDWABS, data->arch)");
+				break;
+			case offsetof(struct seccomp_data, nr):
+				error_msg("STMT(BPF_LDWABS, data->nr)");
+				break;
+			default:
+				error_msg("STMT(BPF_LDWABS, 0x%x)",
+					  filter[i].k);
+			}
+			break;
+		case BPF_RET | BPF_K:
+			switch (filter[i].k) {
+			case SECCOMP_RET_TRACE:
+				error_msg("STMT(BPF_RET, SECCOMP_RET_TRACE)");
+				break;
+			case SECCOMP_RET_ALLOW:
+				error_msg("STMT(BPF_RET, SECCOMP_RET_ALLOW)");
+				break;
+			default:
+				error_msg("STMT(BPF_RET, 0x%x)", filter[i].k);
+			}
+			break;
+		case BPF_JMP | BPF_JEQ | BPF_K:
+			error_msg("JUMP(BPF_JEQ, %u, %u, %u)",
+				  filter[i].jt, filter[i].jf,
+				  filter[i].k);
+			break;
+		case BPF_JMP | BPF_JGE | BPF_K:
+			error_msg("JUMP(BPF_JGE, %u, %u, %u)",
+				  filter[i].jt, filter[i].jf,
+				  filter[i].k);
+			break;
+		case BPF_JMP | BPF_JA:
+			error_msg("JUMP(BPF_JA, %u)", filter[i].k);
+			break;
+		default:
+			error_msg("STMT(0x%x, %u, %u, 0x%x)", filter[i].code,
+				  filter[i].jt, filter[i].jf, filter[i].k);
+		}
+	}
+}
+
+static void
+replace_jmp_placeholders(unsigned char *jmp_offset, unsigned char jmp_next,
+			 unsigned char jmp_trace)
+{
+	switch (*jmp_offset) {
+	case JMP_PLACEHOLDER_NEXT:
+		*jmp_offset = jmp_next;
+		break;
+	case JMP_PLACEHOLDER_TRACE:
+		*jmp_offset = jmp_trace;
+		break;
+	default:
+		break;
+	}
+}
+
+static unsigned short
+bpf_syscalls_cmp(struct sock_filter *filter,
+		 unsigned int lower, unsigned int upper)
+{
+	if (lower + 1 == upper) {
+		/* if (nr == lower) return RET_TRACE; */
+		SET_BPF_JUMP(filter, BPF_JEQ | BPF_K, lower,
+			     JMP_PLACEHOLDER_TRACE, 0);
+		return 1;
+	} else {
+		/* if (nr >= lower && nr < upper) return RET_TRACE; */
+		SET_BPF_JUMP(filter, BPF_JGE | BPF_K, lower, 0, 1);
+		SET_BPF_JUMP(filter + 1, BPF_JGE | BPF_K, upper, 0,
+			     JMP_PLACEHOLDER_TRACE);
+		return 2;
+	}
+}
+
+static unsigned short
+init_sock_filter(struct sock_filter *filter)
+{
+	/*
+	 * Generated program looks like:
+	 * if (arch == AUDIT_ARCH_A && nr >= flag) {
+	 *	if (nr == 59)
+	 *		return SECCOMP_RET_TRACE;
+	 *	if (nr >= 321 && nr <= 323)
+	 *		return SECCOMP_RET_TRACE;
+	 *	...
+	 *	return SECCOMP_RET_ALLOW;
+	 * }
+	 * if (arch == AUDIT_ARCH_A) {
+	 *	...
+	 * }
+	 * if (arch == AUDIT_ARCH_B) {
+	 *	...
+	 * }
+	 * return SECCOMP_RET_TRACE;
+	 */
+	unsigned short pos = 0;
+
+# if SUPPORTED_PERSONALITIES > 1
+	SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
+		     offsetof(struct seccomp_data, arch));
+# endif
+
+	/*
+	 * Personalities are iterated in reverse-order in the BPF program so
+	 * that the x86 case is naturally handled.  On x86, the first and third
+	 * personalities have the same arch identifier.  The third can be
+	 * distinguished based on its associated syscall flag, so we check it
+	 * first.  The only drawback here is that the first personality is more
+	 * common, which may make the BPF program slower to match syscalls on
+	 * average.
+	 */
+	for (int p = SUPPORTED_PERSONALITIES - 1; p >= 0; --p) {
+		unsigned int lower = UINT_MAX;
+		unsigned short start = pos, end;
+
+# if SUPPORTED_PERSONALITIES > 1
+		/* if (arch != audit_arch_vec[p].arch) goto next; */
+		SET_BPF_JUMP(&filter[pos++], BPF_JEQ | BPF_K,
+			     audit_arch_vec[p].arch, 0, JMP_PLACEHOLDER_NEXT);
+# endif
+		SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
+			     offsetof(struct seccomp_data, nr));
+
+# if SUPPORTED_PERSONALITIES > 1
+		if (audit_arch_vec[p].flag) {
+			/* if (nr < audit_arch_vec[p].flag) goto next; */
+			SET_BPF_JUMP(&filter[pos++], BPF_JGE | BPF_K,
+				     audit_arch_vec[p].flag, 2, 0);
+			SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
+				     offsetof(struct seccomp_data, arch));
+			SET_BPF_JUMP(&filter[pos++], BPF_JA,
+				     JMP_PLACEHOLDER_NEXT, 0, 0);
+		}
+# endif
+
+		for (unsigned int i = 0; i < nsyscall_vec[p]; ++i) {
+			if (traced_by_seccomp(i, p)) {
+				if (lower == UINT_MAX)
+					lower = i;
+				continue;
+			}
+			if (lower == UINT_MAX)
+				continue;
+			pos += bpf_syscalls_cmp(filter + pos,
+						lower | audit_arch_vec[p].flag,
+						i | audit_arch_vec[p].flag);
+			lower = UINT_MAX;
+		}
+		if (lower != UINT_MAX)
+			pos += bpf_syscalls_cmp(filter + pos,
+						lower | audit_arch_vec[p].flag,
+						nsyscall_vec[p]
+						| audit_arch_vec[p].flag);
+		end = pos;
+
+		/* if (nr >= max_nr) return RET_TRACE; */
+		SET_BPF_JUMP(&filter[pos++], BPF_JGE | BPF_K,
+			     nsyscall_vec[p] | audit_arch_vec[p].flag, 1, 0);
+
+		SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
+			     SECCOMP_RET_ALLOW);
+		SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
+			     SECCOMP_RET_TRACE);
+
+		for (unsigned int i = start; i < end; ++i) {
+			if (BPF_CLASS(filter[i].code) != BPF_JMP)
+				continue;
+			unsigned char jmp_next = pos - i - 1;
+			unsigned char jmp_trace = pos - i - 2;
+			replace_jmp_placeholders(&filter[i].jt, jmp_next,
+						 jmp_trace);
+			replace_jmp_placeholders(&filter[i].jf, jmp_next,
+						 jmp_trace);
+			if (BPF_OP(filter[i].code) == BPF_JA)
+				filter[i].k = (unsigned int) jmp_next;
+		}
+	}
+
+# if SUPPORTED_PERSONALITIES > 1
+	/* Jumps conditioned on .arch default to this RET_TRACE. */
+	SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K, SECCOMP_RET_TRACE);
+# endif
+
+	if (debug_flag)
+		dump_seccomp_bpf(filter, pos);
+
+	return pos;
+}
+
+void
+init_seccomp_filter(void)
+{
+	struct sock_filter filter[BPF_MAXINSNS];
+	unsigned short len;
+
+	len = init_sock_filter(filter);
+
+	struct sock_fprog prog = {
+		.len = len,
+		.filter = filter
+	};
+
+	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
+		perror_func_msg("prctl(PR_SET_NO_NEW_PRIVS)");
+		return;
+	}
+
+	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0)
+		perror_func_msg("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
+}
+
+int
+seccomp_filter_restart_operator(const struct tcb *tcp)
+{
+	if (tcp && exiting(tcp)
+	    && tcp->scno < nsyscall_vec[current_personality]
+	    && traced_by_seccomp(tcp->scno, current_personality))
+		return PTRACE_SYSCALL;
+	return PTRACE_CONT;
+}
+
+#else /* !HAVE_LINUX_SECCOMP_H */
+
+# warning <linux/seccomp.h> is not available, seccomp filtering is not supported
+
+static void
+check_seccomp_filter_properties(void)
+{
+	seccomp_filtering = false;
+}
+
+void
+init_seccomp_filter(void)
+{
+}
+
+int
+seccomp_filter_restart_operator(const struct tcb *tcp)
+{
+	return PTRACE_SYSCALL;
+}
+
+#endif
+
+void
+check_seccomp_filter(void)
+{
+	check_seccomp_filter_properties();
+
+	if (!seccomp_filtering)
+		error_msg("seccomp filter is requested but unavailable");
+}
diff --git a/filter_seccomp.h b/filter_seccomp.h
new file mode 100644
index 000000000..5e4d2f803
--- /dev/null
+++ b/filter_seccomp.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2018 Chen Jingpiao <chenjingpiao at gmail.com>
+ * Copyright (c) 2019 Paul Chaignon <paul.chaignon at gmail.com>
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: LGPL-2.1-or-later
+ */
+
+#ifndef STRACE_SECCOMP_FILTER_H
+#define STRACE_SECCOMP_FILTER_H
+
+#include "defs.h"
+
+extern bool seccomp_filtering;
+extern bool seccomp_before_sysentry;
+
+extern void check_seccomp_filter(void);
+extern void init_seccomp_filter(void);
+extern int seccomp_filter_restart_operator(const struct tcb *);
+
+#endif /* !STRACE_SECCOMP_FILTER_H */
diff --git a/linux/aarch64/arch_defs_.h b/linux/aarch64/arch_defs_.h
index ed9261f5a..fb75722f6 100644
--- a/linux/aarch64/arch_defs_.h
+++ b/linux/aarch64/arch_defs_.h
@@ -9,3 +9,5 @@
 #define HAVE_ARCH_OLD_SELECT 1
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define SUPPORTED_PERSONALITIES 2
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_AARCH64, 0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_ARM,     0 }
diff --git a/linux/ia64/arch_defs_.h b/linux/ia64/arch_defs_.h
index 87ca0cdbd..107a74df2 100644
--- a/linux/ia64/arch_defs_.h
+++ b/linux/ia64/arch_defs_.h
@@ -9,3 +9,4 @@
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define HAVE_ARCH_SA_RESTORER 0
 #define HAVE_ARCH_DEDICATED_ERR_REG 1
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_IA64, SYSCALLENT_BASE_NR }
diff --git a/linux/powerpc64/arch_defs_.h b/linux/powerpc64/arch_defs_.h
index 871f4109d..a4ac007ef 100644
--- a/linux/powerpc64/arch_defs_.h
+++ b/linux/powerpc64/arch_defs_.h
@@ -8,3 +8,5 @@
 #define HAVE_ARCH_OLD_SELECT 1
 #define SUPPORTED_PERSONALITIES 2
 #define HAVE_ARCH_DEDICATED_ERR_REG 1
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_PPC64, 0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_PPC,   0 }
diff --git a/linux/riscv/arch_defs_.h b/linux/riscv/arch_defs_.h
index a9c27bc7a..f53f076a6 100644
--- a/linux/riscv/arch_defs_.h
+++ b/linux/riscv/arch_defs_.h
@@ -7,4 +7,6 @@
 
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define SUPPORTED_PERSONALITIES 2
+# define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_RISCV64, 0 }
+# define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_RISCV32, 0 }
 #define CAN_ARCH_BE_COMPAT_ON_64BIT_KERNEL 1
diff --git a/linux/s390x/arch_defs_.h b/linux/s390x/arch_defs_.h
index 1e520761d..750ab5121 100644
--- a/linux/s390x/arch_defs_.h
+++ b/linux/s390x/arch_defs_.h
@@ -9,3 +9,5 @@
 #define HAVE_ARCH_OLD_MMAP_PGOFF 1
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define SUPPORTED_PERSONALITIES 2
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_S390X, 0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_S390,  0 }
diff --git a/linux/sparc64/arch_defs_.h b/linux/sparc64/arch_defs_.h
index 68eef4fce..9eacaa401 100644
--- a/linux/sparc64/arch_defs_.h
+++ b/linux/sparc64/arch_defs_.h
@@ -9,4 +9,6 @@
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define HAVE_ARCH_SA_RESTORER 1
 #define SUPPORTED_PERSONALITIES 2
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_SPARC64, 0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_SPARC,   0 }
 #define HAVE_ARCH_DEDICATED_ERR_REG 1
diff --git a/linux/tile/arch_defs_.h b/linux/tile/arch_defs_.h
index a781208c2..12ba0d8b6 100644
--- a/linux/tile/arch_defs_.h
+++ b/linux/tile/arch_defs_.h
@@ -6,6 +6,8 @@
  */
 
 #define SUPPORTED_PERSONALITIES 2
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_TILEGX,   0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_TILEGX32, 0 }
 #define CAN_ARCH_BE_COMPAT_ON_64BIT_KERNEL 1
 
 #ifdef __tilepro__
diff --git a/linux/x32/arch_defs_.h b/linux/x32/arch_defs_.h
index 1055de123..9f48d3137 100644
--- a/linux/x32/arch_defs_.h
+++ b/linux/x32/arch_defs_.h
@@ -11,3 +11,5 @@
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define HAVE_ARCH_OLD_TIME64_SYSCALLS 1
 #define SUPPORTED_PERSONALITIES 2
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_X86_64, __X32_SYSCALL_BIT }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_I386,   0 }
diff --git a/linux/x86_64/arch_defs_.h b/linux/x86_64/arch_defs_.h
index a8c1d9918..c2924ac21 100644
--- a/linux/x86_64/arch_defs_.h
+++ b/linux/x86_64/arch_defs_.h
@@ -9,3 +9,6 @@
 #define HAVE_ARCH_OLD_SELECT 1
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define SUPPORTED_PERSONALITIES 3
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_X86_64, 0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_I386,   0 }
+#define PERSONALITY2_AUDIT_ARCH { AUDIT_ARCH_X86_64, __X32_SYSCALL_BIT }
diff --git a/strace.1.in b/strace.1.in
index 78bbc628e..26de6bd3d 100644
--- a/strace.1.in
+++ b/strace.1.in
@@ -38,8 +38,8 @@
 strace \- trace system calls and signals
 .SH SYNOPSIS
 .SY strace
-.if '@ENABLE_STACKTRACE_FALSE@'#' .OP \-ACdffhikqqrtttTvVwxxyyzZ
-.if '@ENABLE_STACKTRACE_TRUE@'#' .OP \-ACdffhiqqrtttTvVwxxyyzZ
+.if '@ENABLE_STACKTRACE_FALSE@'#' .OP \-ACdffhiknqqrtttTvVwxxyyzZ
+.if '@ENABLE_STACKTRACE_TRUE@'#' .OP \-ACdffhinqqrtttTvVwxxyyzZ
 .OP \-I n
 .OP \-b execve
 .OM \-e expr
@@ -970,6 +970,19 @@ Show some debugging output of
 .B strace
 itself on the standard error.
 .TP
+.B \-n
+Enable (experimental) usage of seccomp-bpf to have ptrace(2)-stops only when
+system calls that are being traced occur in the traced processes.  Implies the
+.B \-f
+option.
+An attempt to rely on seccomp-bpf to filter system calls may fail for various
+reasons, e.g. there are too many system calls to filter, the seccomp API is not
+available, or
+.B strace
itself is being traced.  In cases when seccomp-bpf filter setup fails,
+.B strace
+proceeds as usual and stops traced processes on every system call.
+.TP
 .B \-F
 This option is deprecated.  It is retained for backward compatibility only
 and may be removed in future releases.
diff --git a/strace.c b/strace.c
index e8ced366d..c79c0ab9a 100644
--- a/strace.c
+++ b/strace.c
@@ -30,6 +30,7 @@
 #endif
 
 #include "kill_save_errno.h"
+#include "filter_seccomp.h"
 #include "largefile_wrappers.h"
 #include "mmap_cache.h"
 #include "number_set.h"
@@ -236,7 +237,7 @@ usage(void)
 #endif
 
 	printf("\
-usage: strace [-ACdffhi" K_OPT "qqrtttTvVwxxyyzZ] [-I n] [-b execve] [-e expr]...\n\
+usage: strace [-ACdffhi" K_OPT "nqqrtttTvVwxxyyzZ] [-I n] [-b execve] [-e expr]...\n\
               [-a column] [-o file] [-s strsize] [-X format] [-P path]...\n\
               [-p pid]...\n\
 	      { -p pid | [-D] [-E var=val]... [-u username] PROG [ARGS] }\n\
@@ -308,6 +309,7 @@ Startup:\n\
 \n\
 Miscellaneous:\n\
   -d             enable debug output to stderr\n\
+  -n             enable seccomp-bpf filtering\n\
   -h             print help message\n\
   -V             print version\n\
 "
@@ -1231,6 +1233,10 @@ exec_or_die(void)
 	if (params_for_tracee.child_sa.sa_handler != SIG_DFL)
 		sigaction(SIGCHLD, &params_for_tracee.child_sa, NULL);
 
+	debug_msg("seccomp filter %s",
+		  seccomp_filtering ? "enabled" : "disabled");
+	if (seccomp_filtering)
+		init_seccomp_filter();
 	execv(params->pathname, params->argv);
 	perror_msg_and_die("exec");
 }
@@ -1606,7 +1612,7 @@ init(int argc, char *argv[])
 #ifdef ENABLE_STACKTRACE
 	    "k"
 #endif
-	    "a:Ab:cCdDe:E:fFhiI:o:O:p:P:qrs:S:tTu:vVwxX:yzZ")) != EOF) {
+	    "a:Ab:cCdDe:E:fFhiI:no:O:p:P:qrs:S:tTu:vVwxX:yzZ")) != EOF) {
 		switch (c) {
 		case 'a':
 			acolumn = string_to_uint(optarg);
@@ -1706,6 +1712,9 @@ init(int argc, char *argv[])
 		case 'u':
 			username = optarg;
 			break;
+		case 'n':
+			seccomp_filtering = true;
+			break;
 		case 'v':
 			qualify("abbrev=none");
 			break;
@@ -1759,6 +1768,11 @@ init(int argc, char *argv[])
 		error_msg_and_help("PROG [ARGS] must be specified with -D");
 	}
 
+	if (seccomp_filtering && !followfork) {
+		error_msg("-n implies -f");
+		followfork = 1;
+	}
+
 	if (optF) {
 		if (followfork) {
 			error_msg("deprecated option -F ignored");
@@ -1834,6 +1848,12 @@ init(int argc, char *argv[])
 		ptrace_setoptions |= PTRACE_O_TRACECLONE |
 				     PTRACE_O_TRACEFORK |
 				     PTRACE_O_TRACEVFORK;
+
+	if (seccomp_filtering)
+		check_seccomp_filter();
+	if (seccomp_filtering)
+		ptrace_setoptions |= PTRACE_O_TRACESECCOMP;
+
 	debug_msg("ptrace_setoptions = %#x", ptrace_setoptions);
 	test_ptrace_seize();
 	test_ptrace_get_syscall_info();
@@ -2021,6 +2041,7 @@ print_debug_info(const int pid, int status)
 			[PTRACE_EVENT_VFORK_DONE] = "VFORK_DONE",
 			[PTRACE_EVENT_EXEC]  = "EXEC",
 			[PTRACE_EVENT_EXIT]  = "EXIT",
+			[PTRACE_EVENT_SECCOMP]  = "SECCOMP",
 			/* [PTRACE_EVENT_STOP (=128)] would make biggish array */
 		};
 		const char *e = "??";
@@ -2546,6 +2567,9 @@ next_event(void)
 			case PTRACE_EVENT_EXIT:
 				wd->te = TE_STOP_BEFORE_EXIT;
 				break;
+			case PTRACE_EVENT_SECCOMP:
+				wd->te = TE_SECCOMP;
+				break;
 			default:
 				wd->te = TE_RESTART;
 			}
@@ -2631,7 +2655,7 @@ trace_syscall(struct tcb *tcp, unsigned int *sig)
 static bool
 dispatch_event(const struct tcb_wait_data *wd)
 {
-	unsigned int restart_op = PTRACE_SYSCALL;
+	unsigned int restart_op;
 	unsigned int restart_sig = 0;
 	enum trace_event te = wd ? wd->te : TE_BREAK;
 	/*
@@ -2640,6 +2664,11 @@ dispatch_event(const struct tcb_wait_data *wd)
 	 */
 	int status = wd ? wd->status : 0;
 
+	if (seccomp_filtering)
+		restart_op = seccomp_filter_restart_operator(current_tcp);
+	else
+		restart_op = PTRACE_SYSCALL;
+
 	switch (te) {
 	case TE_BREAK:
 		return false;
@@ -2650,6 +2679,13 @@ dispatch_event(const struct tcb_wait_data *wd)
 	case TE_RESTART:
 		break;
 
+	case TE_SECCOMP:
+		if (seccomp_before_sysentry) {
+			restart_op = PTRACE_SYSCALL;
+			break;
+		}
+		ATTRIBUTE_FALLTHROUGH;
+
 	case TE_SYSCALL_STOP:
 		if (trace_syscall(current_tcp, &restart_sig) < 0) {
 			/*
@@ -2665,6 +2701,42 @@ dispatch_event(const struct tcb_wait_data *wd)
 			 */
 			return true;
 		}
+		if (seccomp_filtering) {
+			/*
+			 * Syscall and seccomp stops can happen in different
+			 * orders depending on kernel.  strace tests this in
+			 * check_seccomp_order_tracer().
+			 *
+			 * Linux 3.5--4.7:
+			 * (seccomp-stop before syscall-entry-stop)
+			 *         +--> seccomp-stop ->-PTRACE_SYSCALL->-+
+			 *         |                                     |
+			 *     PTRACE_CONT                   syscall-entry-stop
+			 *         |                                     |
+			 * syscall-exit-stop <---PTRACE_SYSCALL-----<----+
+			 *
+			 * Linux 4.8+:
+			 * (seccomp-stop after syscall-entry-stop)
+			 *                 syscall-entry-stop
+			 *
+			 *         +---->-----PTRACE_CONT---->----+
+			 *         |                              |
+			 *  syscall-exit-stop               seccomp-stop
+			 *         |                              |
+			 *         +----<----PTRACE_SYSCALL---<---+
+			 *
+			 * Note in Linux 4.8+, we restart in PTRACE_CONT after
+			 * syscall-exit to skip the syscall-entry-stop.  The
+			 * next seccomp-stop will be treated as a syscall
+			 * entry.
+			 *
+			 * The below line implements this behavior. Note
+			 * exiting(current_tcp) actually marks a
+			 * syscall-entry-stop because the flag was inverted in
+			 * the above call to trace_syscall.
+			 */
+			restart_op = exiting(current_tcp) ? PTRACE_SYSCALL : PTRACE_CONT;
+		}
 		break;
 
 	case TE_SIGNAL_DELIVERY_STOP:
diff --git a/trace_event.h b/trace_event.h
index 53a711b82..9021fc550 100644
--- a/trace_event.h
+++ b/trace_event.h
@@ -66,6 +66,11 @@ enum trace_event {
 	 * Restart the tracee with signal 0.
 	 */
 	TE_STOP_BEFORE_EXIT,
+
+	/*
+	 * SECCOMP_RET_TRACE rule is triggered.
+	 */
+	TE_SECCOMP,
 };
 
 #endif /* !STRACE_TRACE_EVENT_H */
-- 
ldv


More information about the Strace-devel mailing list