[PATCH v4 2/4] Introduce seccomp-assisted syscall filtering
Dmitry V. Levin
ldv at altlinux.org
Sat Aug 31 18:56:55 UTC 2019
On Thu, Aug 29, 2019 at 04:00:24PM +0200, Paul Chaignon wrote:
> From: Chen Jingpiao <chenjingpiao at gmail.com>
>
> With this patch, strace can rely on seccomp to only be stopped at syscalls
> of interest, instead of stopping at all syscalls. The seccomp filtering
> of syscalls is opt-in only; it must be enabled with the -n option. Kernel
> support is first checked with check_seccomp_filter(), which also ensures
> the BPF program derived from the syscalls to filter is not larger than the
> kernel's limit.
>
> The -n option implies -f, but a warning is emitted if -f is not explicitly
> specified. Since a task's children inherit its seccomp filters, we want
> to ensure all children are also traced to avoid their syscalls failing
> with ENOSYS (cf. SECCOMP_RET_TRACE in seccomp man page).
>
> The current BPF program implements a simple linear match of the syscall
> numbers. Contiguous sequences of syscall numbers are however matched as
> an interval, with two instructions only. The algorithm can be improved or
> replaced in the future without impacting user-observed behavior.
>
> The behavior of SECCOMP_RET_TRACE changed between Linux 4.7 and 4.8 (cf.
> PTRACE_EVENT_SECCOMP in ptrace man page). This patch supports both
> behaviors by checking the kernel's actual behavior before installing the
> seccomp filter.
>
> * filter_seccomp.c: New file.
> * filter_seccomp.h: New file.
> * Makefile.am (strace_SOURCES): Add filter_seccomp.c and filter_seccomp.h.
> * linux/aarch64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
> PERSONALITY1_AUDIT_ARCH): Define for aarch64.
> * linux/powerpc64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
> PERSONALITY1_AUDIT_ARCH): Likewise for powerpc64.
> * linux/riscv/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
> PERSONALITY1_AUDIT_ARCH): Likewise for riscv.
> * linux/s390x/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
> * linux/sparc64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
> PERSONALITY1_AUDIT_ARCH): Likewise for sparc64.
> PERSONALITY1_AUDIT_ARCH): Likewise for s390x.
> * linux/tile/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
> PERSONALITY1_AUDIT_ARCH): Likewise for tile.
> * linux/x32/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
> PERSONALITY1_AUDIT_ARCH): Likewise for x32.
> * linux/x86_64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
> PERSONALITY1_AUDIT_ARCH, PERSONALITY2_AUDIT_ARCH): Likewise for x86_64.
> * linux/ia64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH): Likewise for IA64.
> * strace.c (usage): Document -n option.
> (exec_or_die): Initialize seccomp filtering if requested.
> (init): Handle -n option and check that seccomp can be enabled.
> (print_debug_info): Handle PTRACE_EVENT_SECCOMP.
> (next_event): Capture PTRACE_EVENT_SECCOMP event.
> (dispatch_event): Handle PTRACE_EVENT_SECCOMP event.
> * trace_event.h (trace_event): New enumeration entity.
> * strace.1.in: Document new -n option.
> * NEWS: Mention this change.
>
> Co-authored-by: Paul Chaignon <paul.chaignon at gmail.com>
> ---
> Makefile.am | 2 +
> NEWS | 2 +
> filter_seccomp.c | 488 +++++++++++++++++++++++++++++++++++
> filter_seccomp.h | 21 ++
> linux/aarch64/arch_defs_.h | 2 +
> linux/ia64/arch_defs_.h | 1 +
> linux/powerpc64/arch_defs_.h | 2 +
> linux/riscv/arch_defs_.h | 2 +
> linux/s390x/arch_defs_.h | 2 +
> linux/sparc64/arch_defs_.h | 2 +
> linux/tile/arch_defs_.h | 2 +
> linux/x32/arch_defs_.h | 2 +
> linux/x86_64/arch_defs_.h | 3 +
> strace.1.in | 17 +-
> strace.c | 76 +++++-
> trace_event.h | 5 +
> 16 files changed, 624 insertions(+), 5 deletions(-)
> create mode 100644 filter_seccomp.c
> create mode 100644 filter_seccomp.h
>
> diff --git a/Makefile.am b/Makefile.am
> index b4f31568..f4c65b0a 100644
> --- a/Makefile.am
> +++ b/Makefile.am
> @@ -129,6 +129,8 @@ strace_SOURCES = \
> file_ioctl.c \
> filter.h \
> filter_qualify.c \
> + filter_seccomp.c \
> + filter_seccomp.h \
> flock.c \
> flock.h \
> fs_x_ioctl.c \
> diff --git a/NEWS b/NEWS
> index a23b361b..e2913785 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -7,6 +7,8 @@ Noteworthy changes in release ?.? (????-??-??)
> * Implemented decoding of UNIX_DIAG_UID netlink attribute.
> * Updated lists of BPF_*, ETH_*, KEYCTL_*, KVM_*, MAP_*, SO_*, TCP_*, V4L2_*,
> XDP_*, and *_MAGIC constants.
> + * Implemented filtering of system calls via seccomp-bpf. Use -n option to
> + enable.
>
> * Bug fixes
> * Fixed syscall tampering on arc, avr32, csky, ia64, m68k, metag, mips,
> diff --git a/filter_seccomp.c b/filter_seccomp.c
> new file mode 100644
> index 00000000..f1d10a5d
> --- /dev/null
> +++ b/filter_seccomp.c
> @@ -0,0 +1,488 @@
> +/*
> + * Copyright (c) 2018 Chen Jingpiao <chenjingpiao at gmail.com>
> + * Copyright (c) 2019 Paul Chaignon <paul.chaignon at gmail.com>
> + * All rights reserved.
> + *
> + * SPDX-License-Identifier: LGPL-2.1-or-later
> + */
> +
> +#include "defs.h"
> +
> +#include "ptrace.h"
> +#include <sys/prctl.h>
> +#include <sys/wait.h>
> +#include <linux/audit.h>
> +#include <linux/filter.h>
> +#include <linux/seccomp.h>
> +#include <asm/unistd.h>
> +#include <signal.h>
> +
> +#include "filter_seccomp.h"
> +#include "number_set.h"
> +#include "syscall.h"
> +
> +#ifndef BPF_MAXINSNS
> +# define BPF_MAXINSNS 4096
> +#endif
> +
> +#define JMP_PLACEHOLDER_NEXT ((unsigned char) -1)
> +#define JMP_PLACEHOLDER_TRACE ((unsigned char) -2)
> +
> +#define SET_BPF(filter, code, jt, jf, k) \
> + (*(filter) = (struct sock_filter) { code, jt, jf, k })
> +
> +#define SET_BPF_STMT(filter, code, k) \
> + SET_BPF(filter, code, 0, 0, k)
> +
> +#define SET_BPF_JUMP(filter, code, k, jt, jf) \
> + SET_BPF(filter, BPF_JMP | code, jt, jf, k)
> +
> +struct audit_arch_t {
> + unsigned int arch;
> + unsigned int flag;
> +};
> +
> +static const struct audit_arch_t audit_arch_vec[SUPPORTED_PERSONALITIES] = {
> +#if SUPPORTED_PERSONALITIES > 1
> + PERSONALITY0_AUDIT_ARCH,
> + PERSONALITY1_AUDIT_ARCH,
> +# if SUPPORTED_PERSONALITIES > 2
> + PERSONALITY2_AUDIT_ARCH,
> +# endif
> +#endif
> +};
> +
> +bool seccomp_filtering = false;
> +bool seccomp_before_sysentry;
> +
> +static void
ATTRIBUTE_NORETURN
> +check_seccomp_order_do_child(void)
> +{
> + static struct sock_filter filter[] = {
This really has to be static const struct sock_filter.
> + /* return (nr == __NR_gettid) ? RET_TRACE : RET_ALLOW; */
> + BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
> + offsetof(struct seccomp_data, nr)),
> + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_gettid, 0, 1),
> + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRACE),
> + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW)
> + };
> + static const struct sock_fprog prog = {
> + .len = ARRAY_SIZE(filter),
> + .filter = filter
Feel free to cast filter to (struct sock_filter *) to overcome
the limitation of linux/filter.h API.
> + };
> +
> + if (ptrace(PTRACE_TRACEME, 0, NULL, NULL) < 0)
> + perror_func_msg_and_die("ptrace(PTRACE_TRACEME, ...");
I suggest to make PTRACE_TRACEME the last syscall before raising SIGSTOP.
> + if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
> + perror_func_msg_and_die("prctl(PR_SET_NO_NEW_PRIVS, 1, ...");
No need for ", ...".
> + if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0)
> + perror_func_msg_and_die("prctl(PR_SET_SECCOMP)");
> + kill(getpid(), SIGSTOP);
> + syscall(__NR_gettid);
> + pause();
> + _exit(0);
> +}
No need for pause();
> +
> +static int
> +check_seccomp_order_tracer(int pid)
> +{
> + int status, tracee_pid, step = 0;
> + unsigned int event;
> +
> + while (1) {
> + errno = 0;
> + tracee_pid = waitpid(pid, &status, 0);
> + if (tracee_pid <= 0) {
> + if (errno == EINTR)
> + continue;
> + perror_func_msg("waitpid(%u)", pid);
> + return -1;
> + }
> + switch (step) {
> + case 0:
> + if (ptrace(PTRACE_SETOPTIONS, pid, 0,
> + PTRACE_O_TRACESECCOMP) < 0) {
> + perror_func_msg("ptrace(PTRACE_SETOPTIONS)");
> + return -1;
> + }
> + if (ptrace(PTRACE_SYSCALL, pid, NULL, NULL) < 0) {
> + perror_func_msg("ptrace(PTRACE_SYSCALL)");
> + return -1;
> + }
> + break;
> + case 1:
> + event = (unsigned int) status >> 16;
> + seccomp_before_sysentry = event == PTRACE_EVENT_SECCOMP;
> + kill(pid, SIGKILL);
> + break;
> + default:
> + if (WIFSIGNALED(status))
> + return 0;
> +
> + error_func_msg("unexpected wait status %#x", status);
> + return -1;
> + }
> + step++;
> + }
> + return 0;
> +}
> +
> +static int
> +check_seccomp_order(void)
> +{
> + int pid;
> +
> + pid = fork();
> + if (pid < 0) {
> + perror_func_msg("fork");
> + return -1;
> + }
> +
> + if (pid == 0)
> + check_seccomp_order_do_child();
> +
> + return check_seccomp_order_tracer(pid);
> +}
[...]
> +
> +void
> +check_seccomp_filter(void)
> +{
> + int rc;
> +
> + if (!seccomp_filtering)
> + return;
> +
> + if (NOMMU_SYSTEM) {
> + seccomp_filtering = false;
> + goto end;
> + }
> +
> + rc = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0);
> + seccomp_filtering = rc >= 0 || errno != EINVAL;
> + if (seccomp_filtering)
> + check_bpf_program_size();
> + if (seccomp_filtering && check_seccomp_order() < 0)
> + seccomp_filtering = false;
> +
> +end:
> + if (!seccomp_filtering)
> + error_msg("seccomp-filter is requested but unavailable");
> +}
This code doesn't test seccomp good enough. Also, it's kind of fragile.
For example, I've seen a system where it failed with
check_seccomp_order_tracer: unexpected wait status 0x57f
I suggest to change it this way (similar to test_ptrace_get_syscall_info):
static int
check_seccomp_order_tracer(int pid)
{
unsigned int step;
for (step = 0; ; ++step) {
int status;
for (;;) {
long rc = waitpid(pid, &status, 0);
if (rc < 0 && errno == EINTR)
continue;
if (rc == pid)
break;
/* Cannot happen. */
perror_func_msg("#%d: unexpected wait result %ld",
step, rc);
return pid;
}
if (WIFEXITED(status)) {
/* The tracee is no more. */
pid = 0;
int exitstatus = WEXITSTATUS(status);
if (step == 5 && exitstatus == 0) {
seccomp_filtering = true;
} else {
error_func_msg("#%d: unexpected exit status %u",
step, exitstatus);
}
break;
}
if (WIFSIGNALED(status)) {
/* The tracee is no more. */
pid = 0;
error_func_msg("#%d: unexpected signal %u",
step, WTERMSIG(status));
break;
}
if (!WIFSTOPPED(status)) {
/* Cannot happen. */
error_func_msg("#%d: unexpected wait status %#x",
step, status);
break;
}
unsigned int event = (unsigned int) status >> 16;
switch (WSTOPSIG(status)) {
case SIGSTOP:
if (step != 0) {
error_func_msg("#%d: unexpected signal stop",
step);
return pid;
}
if (ptrace(PTRACE_SETOPTIONS, pid, 0L,
PTRACE_O_TRACESYSGOOD|
PTRACE_O_TRACESECCOMP) < 0) {
perror_func_msg("PTRACE_SETOPTIONS");
return pid;
}
break;
case SIGTRAP:
if (event != PTRACE_EVENT_SECCOMP) {
error_func_msg("#%d: unexpected trap %#x",
step, event);
return pid;
}
switch (step) {
case 1: /* Seccomp stop before entering gettid. */
seccomp_before_sysentry = true;
break;
case 2: /* Seccomp stop after entering gettid. */
if (!seccomp_before_sysentry)
break;
ATTRIBUTE_FALLTHROUGH;
default:
error_func_msg("#%d: unexpected seccomp stop",
step);
return pid;
}
break;
case SIGTRAP | 0x80:
switch (step) {
case 3: /* Exiting gettid. */
case 4: /* Entering exit_group. */
break;
case 1: /* Entering gettid before seccomp stop. */
seccomp_before_sysentry = false;
break;
case 2: /* Entering gettid after seccomp stop. */
if (seccomp_before_sysentry)
break;
ATTRIBUTE_FALLTHROUGH;
default:
error_func_msg("#%d: unexpected syscall stop",
step);
return pid;
}
break;
default:
error_func_msg("#%d: unexpected stop signal %#x",
step, WSTOPSIG(status));
return pid;
}
if (ptrace(PTRACE_SYSCALL, pid, 0L, 0L) < 0) {
/* Cannot happen. */
perror_func_msg("#%d: PTRACE_SYSCALL", step);
break;
}
}
return pid;
}
static void
check_seccomp_order(void)
{
seccomp_filtering = false;
int pid = fork();
if (pid < 0) {
perror_func_msg("fork");
return;
}
if (pid == 0)
check_seccomp_order_do_child();
pid = check_seccomp_order_tracer(pid);
if (pid) {
kill(pid, SIGKILL);
for (;;) {
long rc = waitpid(pid, NULL, 0);
if (rc < 0 && errno == EINTR)
continue;
break;
}
}
}
void
check_seccomp_filter(void)
{
if (!seccomp_filtering)
return;
if (NOMMU_SYSTEM) {
seccomp_filtering = false;
goto end;
}
int rc = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0);
seccomp_filtering = rc < 0 && errno != EINVAL;
if (!seccomp_filtering)
debug_func_perror_msg("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
if (seccomp_filtering)
check_bpf_program_size();
if (seccomp_filtering)
check_seccomp_order();
end:
if (!seccomp_filtering)
error_msg("seccomp filter is requested but unavailable");
}
--
ldv
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 801 bytes
Desc: not available
URL: <http://lists.strace.io/pipermail/strace-devel/attachments/20190831/4b0420e2/attachment.bin>
More information about the Strace-devel
mailing list