[PATCH RFC 2/4] Introduce seccomp-assisted syscall filtering
Dmitry V. Levin
ldv at altlinux.org
Sun Jul 14 10:46:54 UTC 2019
On Sat, Jul 13, 2019 at 12:24:11PM +0200, Paul Chaignon wrote:
> From: Chen Jingpiao <chenjingpiao at gmail.com>
>
> With this patch, strace can rely on seccomp to only be stopped at syscalls
> of interest, instead of stopping at all syscalls. The seccomp filtering
> of syscalls is opt-in only; it must be enabled with the -n option. Kernel
> support is first checked with check_seccomp_filter(), which also ensures
> the BPF program derived from the syscalls to filter is not too larger than
> the kernel's limit.
>
> When using -n, -f is also required. Since a task's children inherit its
> seccomp filters, we want to ensure all children are also traced to avoid
> their syscalls failing with ENOSYS (cf. SECCOMP_RET_TRACE in seccomp man
> page).
>
> The current BPF program implements a simple linear match of the syscall
> numbers. It can be improved in the future without impacting user-observed
> behavior.
>
> The behavior of SECCOMP_RET_TRACE changed between Linux 4.7 and 4.8 (cf.
> PTRACE_EVENT_SECCOMP in ptrace man page). This patch supports both
> behavior by checking the kernel's actual behavior before installing the
> seccomp filter.
>
> * filter_seccomp.c: New file.
> * filter_seccomp.h: New file.
> * Makefile.am (strace_SOURCES): Add filter_seccomp.c and filter_seccomp.h.
> * strace.c (exec_or_die): Initialize seccomp filtering if requested.
> (init): Handle -n option and check that seccomp can be enabled.
> (print_debug_info): Handle PTRACE_EVENT_SECCOMP.
> (next_event): Capture PTRACE_EVENT_SECCOMP event.
> (dispatch_event): Handle PTRACE_EVENT_SECCOMP event.
> * trace_event.h (trace_event): New enumeration entity.
> * strace.1.in: Document new -n option.
> * NEWS: Mention this change.
>
> Co-authored-by: Paul Chaignon <paul.chaignon at gmail.com>
> ---
> Makefile.am | 2 +
> NEWS | 1 +
> filter_seccomp.c | 446 +++++++++++++++++++++++++++++++++++++++++++++++
> filter_seccomp.h | 57 ++++++
> strace.1.in | 10 ++
> strace.c | 38 +++-
> trace_event.h | 5 +
> 7 files changed, 556 insertions(+), 3 deletions(-)
> create mode 100644 filter_seccomp.c
> create mode 100644 filter_seccomp.h
I've had a cursory look at this patch, see my comments below.
[...]
> +static void
> +check_seccomp_order_do_child(void)
> +{
> + struct sock_filter filter[] = {
> + BPF_STMT(BPF_LD + BPF_W + BPF_ABS,
> + offsetof(struct seccomp_data, nr)),
> + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, __NR_getuid, 0, 1),
alpha has no __NR_getuid defined, this file won't compile there.
I suggest using something more portable, e.g. __NR_gettid.
> + BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_TRACE),
> + BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ALLOW)
> + };
> + struct sock_fprog prog = {
> + .len = ARRAY_SIZE(filter),
> + .filter = filter
> + };
> +
> + if (ptrace(PTRACE_TRACEME, 0, NULL, NULL) < 0) {
> + /* Check if strace is already being traced. */
> + if (errno == EPERM)
> + _exit(0);
We don't have a EPERM check in test_ptrace_get_syscall_info,
why do we need it here?
[...]
> +static bool
> +traced_by_seccomp(unsigned int scno, unsigned int p)
> +{
> + return !sysent_vec[p][scno].sys_func
> + || sysent_vec[p][scno].sys_flags & TRACE_INDIRECT_SUBCALL
> + || is_number_in_set_array(scno, trace_set, p)
> + || strcmp("execve", sysent_vec[p][scno].sys_name) == 0
> + || strcmp("execveat", sysent_vec[p][scno].sys_name) == 0
> +#if defined SPARC || defined SPARC64
> + || strcmp("execv", sysent_vec[p][scno].sys_name) == 0
> +#endif
> + || strcmp("socketcall", sysent_vec[p][scno].sys_name) == 0
> + || strcmp("ipc", sysent_vec[p][scno].sys_name) == 0
> +#ifdef LINUX_MIPSO32
> + || strcmp("syscall", sysent_vec[p][scno].sys_name) == 0
> +#endif
> + ;
> +}
struct tcb.sys_name is not the proper field to check,
struct tcb.sen should be used instead.
By the way, why do we unconditionally trace multiplexing syscalls like ipc
and socketcall?
[...]
> +static void
> +dump_seccomp_bpf(const struct sock_filter *filter, unsigned short len)
> +{
> + for (unsigned int i = 0; i < len; ++i) {
> + switch (filter[i].code) {
> + case BPF_LD + BPF_W + BPF_ABS:
> + switch (filter[i].k) {
> + case offsetof(struct seccomp_data, arch):
> + error_msg("STMT(BPF_LDWABS, data->arch)");
> + break;
> + case offsetof(struct seccomp_data, nr):
> + error_msg("STMT(BPF_LDWABS, data->nr)");
> + break;
> + default:
> + error_msg("STMT(BPF_LDWABS, 0x%x)",
> + filter[i].k);
> + }
> + break;
> + case BPF_RET + BPF_K:
> + switch (filter[i].k) {
> + case SECCOMP_RET_TRACE:
> + error_msg("STMT(BPF_RET, SECCOMP_RET_TRACE)");
> + break;
> + case SECCOMP_RET_ALLOW:
> + error_msg("STMT(BPF_RET, SECCOMP_RET_ALLOW)");
> + break;
> + default:
> + error_msg("STMT(BPF_RET, 0x%x)", filter[i].k);
> + }
> + break;
> + case BPF_JMP + BPF_JEQ + BPF_K:
> + switch (filter[i].k) {
> + case AUDIT_ARCH_X86_64:
> + error_msg("JUMP(BPF_JEQ, %u, %u, AUDIT_ARCH_X86_64)",
> + filter[i].jt, filter[i].jf);
> + break;
> + case AUDIT_ARCH_I386:
> + error_msg("JUMP(BPF_JEQ, %u, %u, AUDIT_ARCH_I386)",
> + filter[i].jt, filter[i].jf);
> + break;
> + case __X32_SYSCALL_BIT:
> + error_msg("JUMP(BPF_JEQ, %u, %u, __X32_SYSCALL_BIT)",
> + filter[i].jt, filter[i].jf);
> + break;
> + default:
> + error_msg("JUMP(BPF_JEQ, %u, %u, %u)",
> + filter[i].jt, filter[i].jf,
> + filter[i].k);
> + }
> + break;
> + case BPF_JMP + BPF_JGE + BPF_K:
> + switch (filter[i].k) {
> + case AUDIT_ARCH_X86_64:
> + error_msg("JUMP(BPF_JGE, %u, %u, AUDIT_ARCH_X86_64)",
> + filter[i].jt, filter[i].jf);
> + break;
> + case AUDIT_ARCH_I386:
> + error_msg("JUMP(BPF_JGE, %u, %u, AUDIT_ARCH_I386)",
> + filter[i].jt, filter[i].jf);
> + break;
> + case __X32_SYSCALL_BIT:
> + error_msg("JUMP(BPF_JGE, %u, %u, __X32_SYSCALL_BIT)",
> + filter[i].jt, filter[i].jf);
> + break;
> + default:
> + error_msg("JUMP(BPF_JGE, %u, %u, %u)",
> + filter[i].jt, filter[i].jf,
> + filter[i].k);
> + }
> + break;
> + default:
> + error_msg("STMT(0x%x, %u, %u, 0x%x)", filter[i].code,
> + filter[i].jt, filter[i].jf, filter[i].k);
> + }
> + }
> +}
> +
> +static unsigned short
> +init_sock_filter(struct sock_filter *filter)
> +{
> + unsigned short pos = 0;
> +#if SUPPORTED_PERSONALITIES > 1
> + unsigned int audit_arch_vec[] = {
> +# if defined X86_64
> + AUDIT_ARCH_X86_64,
> + AUDIT_ARCH_I386,
> + AUDIT_ARCH_X86_64
> +# elif SUPPORTED_PERSONALITIES == 2
> + AUDIT_ARCH_X86_64,
> + AUDIT_ARCH_I386
> +# endif
> + };
> +#endif
> + unsigned int syscall_bit_vec[] = {
> +#if defined X86_64
> + 0, 0, __X32_SYSCALL_BIT
> +#elif defined X32
> + __X32_SYSCALL_BIT, 0
> +#elif SUPPORTED_PERSONALITIES == 2
> + 0, 0
> +#else
> + 0
> +#endif
> + };
Please don't use any arch-specific ifdefs in the common code like this.
Use of arch-specific constants like AUDIT_ARCH_X86_64 in the common code
like this is also not acceptable.
--
ldv
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 801 bytes
Desc: not available
URL: <http://lists.strace.io/pipermail/strace-devel/attachments/20190714/840b3cce/attachment.bin>
More information about the Strace-devel
mailing list