[PATCH RFC v2 1/3] Introduce seccomp-assisted syscall filtering
Paul Chaignon
paul.chaignon at gmail.com
Wed Jul 31 15:35:56 UTC 2019
From: Chen Jingpiao <chenjingpiao at gmail.com>
With this patch, strace can rely on seccomp to only be stopped at syscalls
of interest, instead of stopping at all syscalls. The seccomp filtering
of syscalls is opt-in only; it must be enabled with the -n option. Kernel
support is first checked with check_seccomp_filter(), which also ensures
the BPF program derived from the syscalls to filter is not too larger than
the kernel's limit.
The -n option implies -f, but a warning is emitted if -f is not explicitly
specified. Since a task's children inherit its seccomp filters, we want
to ensure all children are also traced to avoid their syscalls failing
with ENOSYS (cf. SECCOMP_RET_TRACE in seccomp man page).
The current BPF program implements a simple linear match of the syscall
numbers. It can be improved in the future without impacting user-observed
behavior.
The behavior of SECCOMP_RET_TRACE changed between Linux 4.7 and 4.8 (cf.
PTRACE_EVENT_SECCOMP in ptrace man page). This patch supports both
behavior by checking the kernel's actual behavior before installing the
seccomp filter.
* filter_seccomp.c: New file.
* filter_seccomp.h: New file.
* Makefile.am (strace_SOURCES): Add filter_seccomp.c and filter_seccomp.h.
* defs.h (PERSONALITY0_AUDIT_ARCH): Default definition.
(PERSONALITY1_AUDIT_ARCH): Default definition if several personalities are
supported.
* linux/aarch64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Define for aarch64.
* linux/mips/arch_defs_.h (SECCOMP_TRACED_SYSCALLS): Likewise for mips.
* linux/powerpc64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for powerpc64.
* linux/riscv/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for riscv.
* linux/s390x/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
* linux/sparc/arch_defs_.h (SECCOMP_TRACED_SYSCALLS): Likewise for sparc.
* linux/sparc64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH, SECCOMP_TRACED_SYSCALLS): Likewise for sparc64.
PERSONALITY1_AUDIT_ARCH): Likewise for s390x.
* linux/tile/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for tile.
* linux/x32/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for x32.
* linux/x86_64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH, PERSONALITY2_AUDIT_ARCH): Likewise for x86_64.
* strace.c (exec_or_die): Initialize seccomp filtering if requested.
(init): Handle -n option and check that seccomp can be enabled.
(print_debug_info): Handle PTRACE_EVENT_SECCOMP.
(next_event): Capture PTRACE_EVENT_SECCOMP event.
(dispatch_event): Handle PTRACE_EVENT_SECCOMP event.
* trace_event.h (trace_event): New enumeration entity.
* strace.1.in: Document new -n option.
* NEWS: Mention this change.
Co-authored-by: Paul Chaignon <paul.chaignon at gmail.com>
---
Makefile.am | 2 +
NEWS | 1 +
defs.h | 9 +
filter_seccomp.c | 467 +++++++++++++++++++++++++++++++++++
filter_seccomp.h | 49 ++++
linux/aarch64/arch_defs_.h | 4 +
linux/mips/arch_defs_.h | 3 +
linux/powerpc64/arch_defs_.h | 2 +
linux/riscv/arch_defs_.h | 6 +
linux/s390x/arch_defs_.h | 2 +
linux/sparc/arch_defs_.h | 1 +
linux/sparc64/arch_defs_.h | 3 +
linux/tile/arch_defs_.h | 2 +
linux/x32/arch_defs_.h | 2 +
linux/x86_64/arch_defs_.h | 3 +
strace.1.in | 10 +
strace.c | 41 ++-
trace_event.h | 5 +
18 files changed, 609 insertions(+), 3 deletions(-)
create mode 100644 filter_seccomp.c
create mode 100644 filter_seccomp.h
diff --git a/Makefile.am b/Makefile.am
index 0fa7f25d..eb923439 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -129,6 +129,8 @@ strace_SOURCES = \
file_ioctl.c \
filter.h \
filter_qualify.c \
+ filter_seccomp.c \
+ filter_seccomp.h \
flock.c \
flock.h \
fs_x_ioctl.c \
diff --git a/NEWS b/NEWS
index e651c789..8b6fff43 100644
--- a/NEWS
+++ b/NEWS
@@ -24,6 +24,7 @@ Noteworthy changes in release 5.2 (2019-07-12)
* Updated lists of AT_*, AUDIT_*, BPF_*, CLONE_*, ETH_*, KEY_*, KVM_*, MPOL_*,
TIPC_*, and V4L2_* constants.
* Updated lists of ioctl commands from Linux 5.2.
+ * Implemented seccomp filtering of system calls. Use -n option to enable.
* Bug fixes
* Fixed syscall tampering on powerpc, powerpc64, sparc, and sparc64 when
diff --git a/defs.h b/defs.h
index 51622c05..1ff17ae8 100644
--- a/defs.h
+++ b/defs.h
@@ -147,6 +147,15 @@ extern char *stpcpy(char *dst, const char *src);
# define HAVE_PERSONALITY_2_MPERS 0
# endif
+# ifndef PERSONALITY0_AUDIT_ARCH
+# define PERSONALITY0_AUDIT_ARCH { 0, 0 }
+# endif
+# if SUPPORTED_PERSONALITIES > 1
+# ifndef PERSONALITY1_AUDIT_ARCH
+# define PERSONALITY1_AUDIT_ARCH { 0, 0 }
+# endif
+# endif
+
# ifdef WORDS_BIGENDIAN
# define is_bigendian true
# else
diff --git a/filter_seccomp.c b/filter_seccomp.c
new file mode 100644
index 00000000..ff097d35
--- /dev/null
+++ b/filter_seccomp.c
@@ -0,0 +1,467 @@
+/*
+ * Copyright (c) 2018 Chen Jingpiao <chenjingpiao at gmail.com>
+ * Copyright (c) 2018 The strace developers.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "defs.h"
+
+#include "ptrace.h"
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <linux/audit.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h>
+#include <asm/unistd.h>
+#include <signal.h>
+
+#include "filter_seccomp.h"
+#include "number_set.h"
+#include "syscall.h"
+
+#define JMP_PLACEHOLDER_NEXT ((unsigned char) -1)
+#define JMP_PLACEHOLDER_TRACE ((unsigned char) -2)
+
+#define SET_BPF(filter, code, jt, jf, k) \
+ (*(filter) = (struct sock_filter) { code, jt, jf, k })
+
+#define SET_BPF_STMT(filter, code, k) \
+ SET_BPF(filter, code, 0, 0, k)
+
+#define SET_BPF_JUMP(filter, code, k, jt, jf) \
+ SET_BPF(filter, code, jt, jf, k)
+
+struct audit_arch_t {
+ unsigned int arch;
+ unsigned int mask;
+};
+
+struct audit_arch_t audit_arch_vec[SUPPORTED_PERSONALITIES] = {
+ PERSONALITY0_AUDIT_ARCH,
+#if SUPPORTED_PERSONALITIES > 1
+ PERSONALITY1_AUDIT_ARCH,
+# if SUPPORTED_PERSONALITIES > 2
+ PERSONALITY2_AUDIT_ARCH,
+# endif
+#endif
+};
+
+int traced_by_default[] = {
+ SEN_socketcall,
+ SEN_ipc,
+ SEN_execve,
+ SEN_execveat,
+#ifdef SECCOMP_TRACED_SYSCALLS
+ SECCOMP_TRACED_SYSCALLS
+#endif
+};
+
+bool seccomp_filtering = false;
+bool seccomp_before_sysentry;
+
+static void
+check_seccomp_order_do_child(void)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_LD + BPF_W + BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, __NR_gettid, 0, 1),
+ BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_TRACE),
+ BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ALLOW)
+ };
+ struct sock_fprog prog = {
+ .len = ARRAY_SIZE(filter),
+ .filter = filter
+ };
+
+ if (ptrace(PTRACE_TRACEME, 0, NULL, NULL) < 0)
+ perror_func_msg_and_die("ptrace(PTRACE_TRACEME, ...");
+ if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
+ perror_func_msg_and_die("prctl(PR_SET_NO_NEW_PRIVS, 1, ...");
+ if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0)
+ perror_func_msg_and_die("prctl(PR_SET_SECCOMP)");
+ kill(getpid(), SIGSTOP);
+ syscall(__NR_gettid);
+ pause();
+ _exit(0);
+}
+
+static int
+check_seccomp_order_tracer(int pid)
+{
+ int status, tracee_pid, flags = 0;
+
+ while (1) {
+ errno = 0;
+ tracee_pid = waitpid(pid, &status, 0);
+ if (tracee_pid <= 0) {
+ if (errno == EINTR)
+ continue;
+ perror_func_msg("unexpected wait result %d",
+ tracee_pid);
+ return -1;
+ }
+ if (flags == 0) {
+ if (ptrace(PTRACE_SETOPTIONS, pid, 0,
+ PTRACE_O_TRACESECCOMP) < 0) {
+ perror_func_msg("ptrace(PTRACE_SETOPTIONS, ...");
+ return -1;
+ }
+ if (ptrace(PTRACE_SYSCALL, pid, NULL, NULL) < 0) {
+ perror_func_msg("ptrace(PTRACE_SYSCALL, ...");
+ return -1;
+ }
+ } else if (flags == 1) {
+ const unsigned int event = (unsigned int) status >> 16;
+ seccomp_before_sysentry = event == PTRACE_EVENT_SECCOMP;
+ kill(pid, SIGKILL);
+ } else {
+ if (WIFSIGNALED(status))
+ break;
+
+ error_func_msg("unexpected wait status %#x", status);
+ return -1;
+ }
+ flags++;
+ }
+ return 0;
+}
+
+static int
+check_seccomp_order(void)
+{
+ int pid;
+
+ pid = fork();
+ if (pid < 0) {
+ perror_func_msg("fork");
+ return -1;
+ }
+
+ if (pid == 0)
+ check_seccomp_order_do_child();
+
+ return check_seccomp_order_tracer(pid);
+}
+
+static bool
+traced_by_seccomp(unsigned int scno, unsigned int p)
+{
+ if (!sysent_vec[p][scno].sys_func
+ || is_number_in_set_array(scno, trace_set, p)
+ || sysent_vec[p][scno].sys_flags & TRACE_INDIRECT_SUBCALL)
+ return true;
+ for (unsigned int i = 0; i < ARRAY_SIZE(traced_by_default); ++i)
+ if (sysent_vec[p][scno].sen == traced_by_default[i])
+ return true;
+ return false;
+}
+
+static void
+check_bpf_instruction_number(void)
+{
+ unsigned int count = 0;
+#if SUPPORTED_PERSONALITIES > 1
+ count++;
+#endif
+
+ for (int p = SUPPORTED_PERSONALITIES - 1;
+ p >= 0 && count < BPF_MAXINSNS; --p) {
+ unsigned int lower = UINT_MAX;
+
+ count++;
+#if SUPPORTED_PERSONALITIES > 1
+ count++;
+ if (audit_arch_vec[p].mask) {
+ count += 3;
+ }
+#endif
+
+ for (unsigned int i = 0; i < nsyscall_vec[p]; ++i) {
+ if (traced_by_seccomp(i, p)) {
+ if (lower == UINT_MAX)
+ lower = i;
+ continue;
+ }
+ if (lower == UINT_MAX)
+ continue;
+ if (lower + 1 == i)
+ count++;
+ else
+ count += 2;
+ lower = UINT_MAX;
+ }
+ if (lower != UINT_MAX) {
+ if (lower + 1 == nsyscall_vec[p])
+ count++;
+ else
+ count += 2;
+ }
+
+ count += 3;
+ }
+
+#if SUPPORTED_PERSONALITIES > 1
+ count++;
+#endif
+
+ if (count > BPF_MAXINSNS) {
+ seccomp_filtering = false;
+ }
+}
+
+void
+check_seccomp_filter(void)
+{
+ if (!seccomp_filtering)
+ return;
+#ifdef SECCOMP_MODE_FILTER
+ int rc;
+
+ if (NOMMU_SYSTEM) {
+ seccomp_filtering = false;
+ goto end;
+ }
+
+#if SUPPORTED_PERSONALITIES > 1
+ for (unsigned int p = 0; p < SUPPORTED_PERSONALITIES; ++p)
+ if (!audit_arch_vec[p].arch) {
+ seccomp_filtering = false;
+ goto end;
+ }
+#endif
+
+ rc = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0);
+ seccomp_filtering = rc >= 0 || errno != EINVAL;
+ if (seccomp_filtering)
+ check_bpf_instruction_number();
+ if (seccomp_filtering && check_seccomp_order() < 0)
+ seccomp_filtering = false;
+#else
+ seccomp_filtering = false;
+#endif
+end:
+ error_msg("seccomp-filter requested but not available");
+}
+
+static unsigned short
+bpf_add_syscalls(struct sock_filter *filter,
+ unsigned int lower, unsigned int upper)
+{
+ if (lower + 1 == upper) {
+ SET_BPF_JUMP(filter, BPF_JMP + BPF_JEQ + BPF_K, lower, JMP_PLACEHOLDER_TRACE, 0);
+ return 1;
+ } else {
+ SET_BPF_JUMP(filter, BPF_JMP + BPF_JGE + BPF_K, lower, 0, 1);
+ ++filter;
+ SET_BPF_JUMP(filter, BPF_JMP + BPF_JGE + BPF_K, upper, 0, JMP_PLACEHOLDER_TRACE);
+ return 2;
+ }
+}
+
+static void
+dump_seccomp_bpf(const struct sock_filter *filter, unsigned short len)
+{
+ for (unsigned int i = 0; i < len; ++i) {
+ switch (filter[i].code) {
+ case BPF_LD + BPF_W + BPF_ABS:
+ switch (filter[i].k) {
+ case offsetof(struct seccomp_data, arch):
+ error_msg("STMT(BPF_LDWABS, data->arch)");
+ break;
+ case offsetof(struct seccomp_data, nr):
+ error_msg("STMT(BPF_LDWABS, data->nr)");
+ break;
+ default:
+ error_msg("STMT(BPF_LDWABS, 0x%x)",
+ filter[i].k);
+ }
+ break;
+ case BPF_RET + BPF_K:
+ switch (filter[i].k) {
+ case SECCOMP_RET_TRACE:
+ error_msg("STMT(BPF_RET, SECCOMP_RET_TRACE)");
+ break;
+ case SECCOMP_RET_ALLOW:
+ error_msg("STMT(BPF_RET, SECCOMP_RET_ALLOW)");
+ break;
+ default:
+ error_msg("STMT(BPF_RET, 0x%x)", filter[i].k);
+ }
+ break;
+ case BPF_JMP + BPF_JEQ + BPF_K:
+ error_msg("JUMP(BPF_JEQ, %u, %u, %u)",
+ filter[i].jt, filter[i].jf,
+ filter[i].k);
+ break;
+ case BPF_JMP + BPF_JGE + BPF_K:
+ error_msg("JUMP(BPF_JGE, %u, %u, %u)",
+ filter[i].jt, filter[i].jf,
+ filter[i].k);
+ break;
+ case BPF_JMP + BPF_JA:
+ error_msg("JUMP(BPF_JA, %u)", filter[i].k);
+ break;
+ default:
+ error_msg("STMT(0x%x, %u, %u, 0x%x)", filter[i].code,
+ filter[i].jt, filter[i].jf, filter[i].k);
+ }
+ }
+}
+
+static void
+replace_jmp_placeholders(unsigned char *jmp_offset, unsigned char jmp_next,
+ unsigned char jmp_trace) {
+ switch (*jmp_offset) {
+ case JMP_PLACEHOLDER_NEXT:
+ *jmp_offset = jmp_next;
+ break;
+ case JMP_PLACEHOLDER_TRACE:
+ *jmp_offset = jmp_trace;
+ break;
+ default:
+ break;
+ }
+}
+
+static unsigned short
+init_sock_filter(struct sock_filter *filter)
+{
+ unsigned short pos = 0;
+
+#if SUPPORTED_PERSONALITIES > 1
+ SET_BPF_STMT(&filter[pos++], BPF_LD + BPF_W + BPF_ABS,
+ offsetof(struct seccomp_data, arch));
+#endif
+
+ /* Personnalities are iterated in reverse-order in the BPF program so that
+ * the x86 case is naturally handled. In x86, the first and third
+ * personnalities have the same arch identifier. The third can be
+ * distinguished based on its associated bit mask, so we check it first.
+ * The only drawback here is that the first personnality is more common,
+ * which may make the BPF program slower to match syscalls on average. */
+ for (int p = SUPPORTED_PERSONALITIES - 1; p >= 0; --p) {
+ unsigned int lower = UINT_MAX;
+ unsigned short start = pos, end;
+
+#if SUPPORTED_PERSONALITIES > 1
+ SET_BPF_JUMP(&filter[pos++], BPF_JMP + BPF_JEQ + BPF_K,
+ audit_arch_vec[p].arch, 0, JMP_PLACEHOLDER_NEXT);
+#endif
+ SET_BPF_STMT(&filter[pos++], BPF_LD + BPF_W + BPF_ABS,
+ offsetof(struct seccomp_data, nr));
+
+#if SUPPORTED_PERSONALITIES > 1
+ if (audit_arch_vec[p].mask) {
+ SET_BPF_JUMP(&filter[pos++], BPF_JMP + BPF_JGE + BPF_K,
+ audit_arch_vec[p].mask, 2, 0);
+ SET_BPF_STMT(&filter[pos++], BPF_LD + BPF_W + BPF_ABS,
+ offsetof(struct seccomp_data, arch));
+ SET_BPF_JUMP(&filter[pos++], BPF_JMP + BPF_JA,
+ JMP_PLACEHOLDER_NEXT, 0, 0);
+ }
+#endif
+
+ for (unsigned int i = 0; i < nsyscall_vec[p]; ++i) {
+ if (traced_by_seccomp(i, p)) {
+ if (lower == UINT_MAX)
+ lower = i;
+ continue;
+ }
+ if (lower == UINT_MAX)
+ continue;
+ pos += bpf_add_syscalls(filter + pos,
+ lower + audit_arch_vec[p].mask,
+ i + audit_arch_vec[p].mask);
+ lower = UINT_MAX;
+ }
+ if (lower != UINT_MAX)
+ pos += bpf_add_syscalls(filter + pos,
+ lower + audit_arch_vec[p].mask,
+ nsyscall_vec[p] + audit_arch_vec[p].mask);
+ end = pos;
+
+ SET_BPF_JUMP(&filter[pos++], BPF_JMP + BPF_JGE + BPF_K,
+ nsyscall_vec[p] + audit_arch_vec[p].mask, 1, 0);
+ SET_BPF_STMT(&filter[pos++], BPF_RET + BPF_K,
+ SECCOMP_RET_ALLOW);
+ SET_BPF_STMT(&filter[pos++], BPF_RET + BPF_K,
+ SECCOMP_RET_TRACE);
+
+ for (unsigned int i = start; i < end; ++i) {
+ if (BPF_CLASS(filter[i].code) != BPF_JMP)
+ continue;
+ unsigned char jmp_next = pos - i - 1;
+ unsigned char jmp_trace = pos - i - 2;
+ replace_jmp_placeholders(&filter[i].jt, jmp_next,
+ jmp_trace);
+ replace_jmp_placeholders(&filter[i].jf, jmp_next,
+ jmp_trace);
+ if (BPF_OP(filter[i].code) == BPF_JA)
+ filter[i].k = (unsigned int)jmp_next;
+ }
+ }
+
+#if SUPPORTED_PERSONALITIES > 1
+ SET_BPF_STMT(&filter[pos++], BPF_RET + BPF_K, SECCOMP_RET_TRACE);
+#endif
+
+ if (debug_flag)
+ dump_seccomp_bpf(filter, pos);
+
+ return pos;
+}
+
+void
+init_seccomp_filter(void)
+{
+ struct sock_filter filter[BPF_MAXINSNS];
+ unsigned short len;
+
+ len = init_sock_filter(filter);
+
+ struct sock_fprog prog = {
+ .len = len,
+ .filter = filter
+ };
+
+ if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
+ perror_msg("prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)");
+ return;
+ }
+
+ if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0)
+ perror_msg("prctl");
+}
+
+int
+seccomp_filter_restart_operator(const struct tcb *tcp)
+{
+ if (tcp && exiting(tcp)
+ && tcp->scno < nsyscall_vec[current_personality]
+ && traced_by_seccomp(tcp->scno, current_personality))
+ return PTRACE_SYSCALL;
+ return PTRACE_CONT;
+}
diff --git a/filter_seccomp.h b/filter_seccomp.h
new file mode 100644
index 00000000..6942e05b
--- /dev/null
+++ b/filter_seccomp.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2018 Chen Jingpiao <chenjingpiao at gmail.com>
+ * Copyright (c) 2018 The strace developers.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef STRACE_SECCOMP_FILTER_H
+#define STRACE_SECCOMP_FILTER_H
+
+#include "defs.h"
+
+#ifdef HAVE_LINUX_SECCOMP_H
+# include <linux/seccomp.h>
+#endif
+
+#ifndef BPF_MAXINSNS
+# define BPF_MAXINSNS 4096
+#endif
+
+extern bool seccomp_filtering;
+extern bool seccomp_before_sysentry;
+
+extern void check_seccomp_filter(void);
+extern void init_seccomp_filter(void);
+extern int seccomp_filter_restart_operator(const struct tcb *);
+
+#endif /* !STRACE_SECCOMP_FILTER_H */
diff --git a/linux/aarch64/arch_defs_.h b/linux/aarch64/arch_defs_.h
index ed9261f5..213e7fad 100644
--- a/linux/aarch64/arch_defs_.h
+++ b/linux/aarch64/arch_defs_.h
@@ -9,3 +9,7 @@
#define HAVE_ARCH_OLD_SELECT 1
#define HAVE_ARCH_UID16_SYSCALLS 1
#define SUPPORTED_PERSONALITIES 2
+#ifdef AUDIT_ARCH_AARCH64
+# define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_AARCH64, 0 }
+# define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_ARM, 0 }
+#endif
diff --git a/linux/mips/arch_defs_.h b/linux/mips/arch_defs_.h
index f789e18d..3e95708b 100644
--- a/linux/mips/arch_defs_.h
+++ b/linux/mips/arch_defs_.h
@@ -8,3 +8,6 @@
#define HAVE_ARCH_GETRVAL2 1
#define HAVE_ARCH_DEDICATED_ERR_REG 1
#define CAN_ARCH_BE_COMPAT_ON_64BIT_KERNEL 1
+#ifdef LINUX_MIPSO32
+# define SECCOMP_TRACED_SYSCALLS SEN_syscall
+#endif
\ No newline at end of file
diff --git a/linux/powerpc64/arch_defs_.h b/linux/powerpc64/arch_defs_.h
index 871f4109..a4ac007e 100644
--- a/linux/powerpc64/arch_defs_.h
+++ b/linux/powerpc64/arch_defs_.h
@@ -8,3 +8,5 @@
#define HAVE_ARCH_OLD_SELECT 1
#define SUPPORTED_PERSONALITIES 2
#define HAVE_ARCH_DEDICATED_ERR_REG 1
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_PPC64, 0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_PPC, 0 }
diff --git a/linux/riscv/arch_defs_.h b/linux/riscv/arch_defs_.h
index a9c27bc7..a13448d8 100644
--- a/linux/riscv/arch_defs_.h
+++ b/linux/riscv/arch_defs_.h
@@ -7,4 +7,10 @@
#define HAVE_ARCH_UID16_SYSCALLS 1
#define SUPPORTED_PERSONALITIES 2
+#ifdef AUDIT_ARCH_RISCV64
+# define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_RISCV64, 0 }
+#endif
+#ifdef AUDIT_ARCH_RISCV32
+# define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_RISCV32, 0 }
+#endif
#define CAN_ARCH_BE_COMPAT_ON_64BIT_KERNEL 1
diff --git a/linux/s390x/arch_defs_.h b/linux/s390x/arch_defs_.h
index 1e520761..5db014a6 100644
--- a/linux/s390x/arch_defs_.h
+++ b/linux/s390x/arch_defs_.h
@@ -9,3 +9,5 @@
#define HAVE_ARCH_OLD_MMAP_PGOFF 1
#define HAVE_ARCH_UID16_SYSCALLS 1
#define SUPPORTED_PERSONALITIES 2
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_S390X, 0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_S390, 0 }
\ No newline at end of file
diff --git a/linux/sparc/arch_defs_.h b/linux/sparc/arch_defs_.h
index 3549c95d..6a1d4522 100644
--- a/linux/sparc/arch_defs_.h
+++ b/linux/sparc/arch_defs_.h
@@ -10,3 +10,4 @@
#define HAVE_ARCH_SA_RESTORER 1
#define HAVE_ARCH_DEDICATED_ERR_REG 1
#define CAN_ARCH_BE_COMPAT_ON_64BIT_KERNEL 1
+#define SECCOMP_TRACED_SYSCALLS SEN_execv
diff --git a/linux/sparc64/arch_defs_.h b/linux/sparc64/arch_defs_.h
index 68eef4fc..36a57acc 100644
--- a/linux/sparc64/arch_defs_.h
+++ b/linux/sparc64/arch_defs_.h
@@ -9,4 +9,7 @@
#define HAVE_ARCH_UID16_SYSCALLS 1
#define HAVE_ARCH_SA_RESTORER 1
#define SUPPORTED_PERSONALITIES 2
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_SPARC64, 0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_SPARC, 0 }
+#define SECCOMP_TRACED_SYSCALLS SEN_execv
#define HAVE_ARCH_DEDICATED_ERR_REG 1
diff --git a/linux/tile/arch_defs_.h b/linux/tile/arch_defs_.h
index a781208c..12ba0d8b 100644
--- a/linux/tile/arch_defs_.h
+++ b/linux/tile/arch_defs_.h
@@ -6,6 +6,8 @@
*/
#define SUPPORTED_PERSONALITIES 2
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_TILEGX, 0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_TILEGX32, 0 }
#define CAN_ARCH_BE_COMPAT_ON_64BIT_KERNEL 1
#ifdef __tilepro__
diff --git a/linux/x32/arch_defs_.h b/linux/x32/arch_defs_.h
index 1055de12..9f48d313 100644
--- a/linux/x32/arch_defs_.h
+++ b/linux/x32/arch_defs_.h
@@ -11,3 +11,5 @@
#define HAVE_ARCH_UID16_SYSCALLS 1
#define HAVE_ARCH_OLD_TIME64_SYSCALLS 1
#define SUPPORTED_PERSONALITIES 2
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_X86_64, __X32_SYSCALL_BIT }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_I386, 0 }
diff --git a/linux/x86_64/arch_defs_.h b/linux/x86_64/arch_defs_.h
index a8c1d991..c2924ac2 100644
--- a/linux/x86_64/arch_defs_.h
+++ b/linux/x86_64/arch_defs_.h
@@ -9,3 +9,6 @@
#define HAVE_ARCH_OLD_SELECT 1
#define HAVE_ARCH_UID16_SYSCALLS 1
#define SUPPORTED_PERSONALITIES 3
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_X86_64, 0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_I386, 0 }
+#define PERSONALITY2_AUDIT_ARCH { AUDIT_ARCH_X86_64, __X32_SYSCALL_BIT }
diff --git a/strace.1.in b/strace.1.in
index 56728371..292cf04d 100644
--- a/strace.1.in
+++ b/strace.1.in
@@ -951,6 +951,16 @@ Show some debugging output of
.B strace
itself on the standard error.
.TP
+.B \-n
+Enable use of seccomp-bpf to interrupt only system calls that are being traced.
+Requires the
+.B \-f
+option.
+The attempt to rely on seccomp-bpf to filter system calls may fail for diverse
+reasons: too many system calls to filter, seccomp API unavailable, or strace is
+itself being traced. In those cases, strace proceeds as usual and interrupts
+all system calls.
+.TP
.B \-F
This option is deprecated. It is retained for backward compatibility only
and may be removed in future releases.
diff --git a/strace.c b/strace.c
index afa841a7..f5a21b13 100644
--- a/strace.c
+++ b/strace.c
@@ -30,6 +30,7 @@
#endif
#include "kill_save_errno.h"
+#include "filter_seccomp.h"
#include "largefile_wrappers.h"
#include "mmap_cache.h"
#include "number_set.h"
@@ -293,6 +294,7 @@ Startup:\n\
\n\
Miscellaneous:\n\
-d enable debug output to stderr\n\
+ -n enable seccomp-bpf filtering\n\
-v verbose mode: print unabbreviated argv, stat, termios, etc. args\n\
-h print help message\n\
-V print version\n\
@@ -1207,6 +1209,10 @@ exec_or_die(void)
if (params_for_tracee.child_sa.sa_handler != SIG_DFL)
sigaction(SIGCHLD, ¶ms_for_tracee.child_sa, NULL);
+ debug_msg("seccomp-filter %s",
+ seccomp_filtering? "enabled" : "disabled");
+ if (seccomp_filtering)
+ init_seccomp_filter();
execv(params->pathname, params->argv);
perror_msg_and_die("exec");
}
@@ -1582,7 +1588,7 @@ init(int argc, char *argv[])
#ifdef ENABLE_STACKTRACE
"k"
#endif
- "a:Ab:cCdDe:E:fFhiI:o:O:p:P:qrs:S:tTu:vVwxX:yzZ")) != EOF) {
+ "a:Ab:cCdDe:E:fFhiI:no:O:p:P:qrs:S:tTu:vVwxX:yzZ")) != EOF) {
switch (c) {
case 'a':
acolumn = string_to_uint(optarg);
@@ -1684,6 +1690,9 @@ init(int argc, char *argv[])
case 'u':
username = optarg;
break;
+ case 'n':
+ seccomp_filtering = true;
+ break;
case 'v':
qualify("abbrev=none");
break;
@@ -1737,6 +1746,11 @@ init(int argc, char *argv[])
error_msg_and_help("PROG [ARGS] must be specified with -D");
}
+ if (seccomp_filtering && !followfork) {
+ error_msg("-n was specified without -f");
+ followfork = 1;
+ }
+
if (optF) {
if (followfork) {
error_msg("deprecated option -F ignored");
@@ -1808,6 +1822,10 @@ init(int argc, char *argv[])
run_gid = getgid();
}
+ check_seccomp_filter();
+ if (seccomp_filtering)
+ ptrace_setoptions |= PTRACE_O_TRACESECCOMP;
+
if (followfork)
ptrace_setoptions |= PTRACE_O_TRACECLONE |
PTRACE_O_TRACEFORK |
@@ -1999,6 +2017,7 @@ print_debug_info(const int pid, int status)
[PTRACE_EVENT_VFORK_DONE] = "VFORK_DONE",
[PTRACE_EVENT_EXEC] = "EXEC",
[PTRACE_EVENT_EXIT] = "EXIT",
+ [PTRACE_EVENT_SECCOMP] = "SECCOMP",
/* [PTRACE_EVENT_STOP (=128)] would make biggish array */
};
const char *e = "??";
@@ -2529,6 +2548,9 @@ next_event(void)
case PTRACE_EVENT_EXIT:
wd->te = TE_STOP_BEFORE_EXIT;
break;
+ case PTRACE_EVENT_SECCOMP:
+ wd->te = TE_SECCOMP;
+ break;
default:
wd->te = TE_RESTART;
}
@@ -2614,8 +2636,7 @@ trace_syscall(struct tcb *tcp, unsigned int *sig)
static bool
dispatch_event(const struct tcb_wait_data *wd)
{
- unsigned int restart_op = PTRACE_SYSCALL;
- unsigned int restart_sig = 0;
+ unsigned int restart_sig = 0, restart_op;
enum trace_event te = wd ? wd->te : TE_BREAK;
/*
* Copy wd->status to a non-const variable to workaround glibc bugs
@@ -2623,6 +2644,11 @@ dispatch_event(const struct tcb_wait_data *wd)
*/
int status = wd ? wd->status : 0;
+ if (seccomp_filtering)
+ restart_op = seccomp_filter_restart_operator(current_tcp);
+ else
+ restart_op = PTRACE_SYSCALL;
+
switch (te) {
case TE_BREAK:
return false;
@@ -2633,6 +2659,13 @@ dispatch_event(const struct tcb_wait_data *wd)
case TE_RESTART:
break;
+ case TE_SECCOMP:
+ if (seccomp_before_sysentry) {
+ restart_op = PTRACE_SYSCALL;
+ break;
+ }
+ ATTRIBUTE_FALLTHROUGH;
+
case TE_SYSCALL_STOP:
if (trace_syscall(current_tcp, &restart_sig) < 0) {
/*
@@ -2648,6 +2681,8 @@ dispatch_event(const struct tcb_wait_data *wd)
*/
return true;
}
+ if (seccomp_filtering)
+ restart_op = exiting(current_tcp)? PTRACE_SYSCALL : PTRACE_CONT;
break;
case TE_SIGNAL_DELIVERY_STOP:
diff --git a/trace_event.h b/trace_event.h
index 53a711b8..9021fc55 100644
--- a/trace_event.h
+++ b/trace_event.h
@@ -66,6 +66,11 @@ enum trace_event {
* Restart the tracee with signal 0.
*/
TE_STOP_BEFORE_EXIT,
+
+ /*
+ * SECCOMP_RET_TRACE rule is triggered.
+ */
+ TE_SECCOMP,
};
#endif /* !STRACE_TRACE_EVENT_H */
--
2.17.1
More information about the Strace-devel
mailing list