[PATCH RFC 2/4] Introduce seccomp-assisted syscall filtering

Sun Jul 14 10:46:54 UTC 2019

On Sat, Jul 13, 2019 at 12:24:11PM +0200, Paul Chaignon wrote:
> From: Chen Jingpiao <chenjingpiao at gmail.com>
> 
> With this patch, strace can rely on seccomp to only be stopped at syscalls
> of interest, instead of stopping at all syscalls.  The seccomp filtering
> of syscalls is opt-in only; it must be enabled with the -n option.  Kernel
> support is first checked with check_seccomp_filter(), which also ensures
> the BPF program derived from the syscalls to filter is not too larger than
> the kernel's limit.
> 
> When using -n, -f is also required.  Since a task's children inherit its
> seccomp filters, we want to ensure all children are also traced to avoid
> their syscalls failing with ENOSYS (cf. SECCOMP_RET_TRACE in seccomp man
> page).
> 
> The current BPF program implements a simple linear match of the syscall
> numbers.  It can be improved in the future without impacting user-observed
> behavior.
> 
> The behavior of SECCOMP_RET_TRACE changed between Linux 4.7 and 4.8 (cf.
> PTRACE_EVENT_SECCOMP in ptrace man page).  This patch supports both
> behavior by checking the kernel's actual behavior before installing the
> seccomp filter.
> 
> * filter_seccomp.c: New file.
> * filter_seccomp.h: New file.
> * Makefile.am (strace_SOURCES): Add filter_seccomp.c and filter_seccomp.h.
> * strace.c (exec_or_die): Initialize seccomp filtering if requested.
> (init): Handle -n option and check that seccomp can be enabled.
> (print_debug_info): Handle PTRACE_EVENT_SECCOMP.
> (next_event): Capture PTRACE_EVENT_SECCOMP event.
> (dispatch_event): Handle PTRACE_EVENT_SECCOMP event.
> * trace_event.h (trace_event): New enumeration entity.
> * strace.1.in: Document new -n option.
> * NEWS: Mention this change.
> 
> Co-authored-by: Paul Chaignon <paul.chaignon at gmail.com>
> ---
>  Makefile.am      |   2 +
>  NEWS             |   1 +
>  filter_seccomp.c | 446 +++++++++++++++++++++++++++++++++++++++++++++++
>  filter_seccomp.h |  57 ++++++
>  strace.1.in      |  10 ++
>  strace.c         |  38 +++-
>  trace_event.h    |   5 +
>  7 files changed, 556 insertions(+), 3 deletions(-)
>  create mode 100644 filter_seccomp.c
>  create mode 100644 filter_seccomp.h

I've had a cursory look at this patch, see my comments below.

[...]
> +static void
> +check_seccomp_order_do_child(void)
> +{
> +	struct sock_filter filter[] = {
> +		BPF_STMT(BPF_LD + BPF_W + BPF_ABS,
> +			 offsetof(struct seccomp_data, nr)),
> +		BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, __NR_getuid, 0, 1),

alpha has no __NR_getuid defined, this file won't compile there.
I suggest using something more portable, e.g. __NR_gettid.

> +		BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_TRACE),
> +		BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ALLOW)
> +	};
> +	struct sock_fprog prog = {
> +		.len = ARRAY_SIZE(filter),
> +		.filter = filter
> +	};
> +
> +	if (ptrace(PTRACE_TRACEME, 0, NULL, NULL) < 0) {
> +		/* Check if strace is already being traced. */
> +		if (errno == EPERM)
> +			_exit(0);

We don't have a EPERM check in test_ptrace_get_syscall_info,
why do we need it here?

[...]
> +static bool
> +traced_by_seccomp(unsigned int scno, unsigned int p)
> +{
> +	return !sysent_vec[p][scno].sys_func
> +	       || sysent_vec[p][scno].sys_flags & TRACE_INDIRECT_SUBCALL
> +	       || is_number_in_set_array(scno, trace_set, p)
> +	       || strcmp("execve", sysent_vec[p][scno].sys_name) == 0
> +	       || strcmp("execveat", sysent_vec[p][scno].sys_name) == 0
> +#if defined SPARC || defined SPARC64
> +	       || strcmp("execv", sysent_vec[p][scno].sys_name) == 0
> +#endif
> +	       || strcmp("socketcall", sysent_vec[p][scno].sys_name) == 0
> +	       || strcmp("ipc", sysent_vec[p][scno].sys_name) == 0
> +#ifdef LINUX_MIPSO32
> +	       || strcmp("syscall", sysent_vec[p][scno].sys_name) == 0
> +#endif
> +	       ;
> +}

struct tcb.sys_name is not the proper field to check,
struct tcb.sen should be used instead.

By the way, why do we unconditionally trace multiplexing syscalls like ipc
and socketcall?

[...]
> +static void
> +dump_seccomp_bpf(const struct sock_filter *filter, unsigned short len)
> +{
> +	for (unsigned int i = 0; i < len; ++i) {
> +		switch (filter[i].code) {
> +		case BPF_LD + BPF_W + BPF_ABS:
> +			switch (filter[i].k) {
> +			case offsetof(struct seccomp_data, arch):
> +				error_msg("STMT(BPF_LDWABS, data->arch)");
> +				break;
> +			case offsetof(struct seccomp_data, nr):
> +				error_msg("STMT(BPF_LDWABS, data->nr)");
> +				break;
> +			default:
> +				error_msg("STMT(BPF_LDWABS, 0x%x)",
> +					  filter[i].k);
> +			}
> +			break;
> +		case BPF_RET + BPF_K:
> +			switch (filter[i].k) {
> +			case SECCOMP_RET_TRACE:
> +				error_msg("STMT(BPF_RET, SECCOMP_RET_TRACE)");
> +				break;
> +			case SECCOMP_RET_ALLOW:
> +				error_msg("STMT(BPF_RET, SECCOMP_RET_ALLOW)");
> +				break;
> +			default:
> +				error_msg("STMT(BPF_RET, 0x%x)", filter[i].k);
> +			}
> +			break;
> +		case BPF_JMP + BPF_JEQ + BPF_K:
> +			switch (filter[i].k) {
> +			case AUDIT_ARCH_X86_64:
> +				error_msg("JUMP(BPF_JEQ, %u, %u, AUDIT_ARCH_X86_64)",
> +					  filter[i].jt, filter[i].jf);
> +				break;
> +			case AUDIT_ARCH_I386:
> +				error_msg("JUMP(BPF_JEQ, %u, %u, AUDIT_ARCH_I386)",
> +					  filter[i].jt, filter[i].jf);
> +				break;
> +			case __X32_SYSCALL_BIT:
> +				error_msg("JUMP(BPF_JEQ, %u, %u, __X32_SYSCALL_BIT)",
> +					  filter[i].jt, filter[i].jf);
> +				break;
> +			default:
> +				error_msg("JUMP(BPF_JEQ, %u, %u, %u)",
> +					  filter[i].jt, filter[i].jf,
> +					  filter[i].k);
> +			}
> +			break;
> +		case BPF_JMP + BPF_JGE + BPF_K:
> +			switch (filter[i].k) {
> +			case AUDIT_ARCH_X86_64:
> +				error_msg("JUMP(BPF_JGE, %u, %u, AUDIT_ARCH_X86_64)",
> +					  filter[i].jt, filter[i].jf);
> +				break;
> +			case AUDIT_ARCH_I386:
> +				error_msg("JUMP(BPF_JGE, %u, %u, AUDIT_ARCH_I386)",
> +					  filter[i].jt, filter[i].jf);
> +				break;
> +			case __X32_SYSCALL_BIT:
> +				error_msg("JUMP(BPF_JGE, %u, %u, __X32_SYSCALL_BIT)",
> +					  filter[i].jt, filter[i].jf);
> +				break;
> +			default:
> +				error_msg("JUMP(BPF_JGE, %u, %u, %u)",
> +					  filter[i].jt, filter[i].jf,
> +					  filter[i].k);
> +			}
> +			break;
> +		default:
> +			error_msg("STMT(0x%x, %u, %u, 0x%x)", filter[i].code,
> +				  filter[i].jt, filter[i].jf, filter[i].k);
> +		}
> +	}
> +}
> +
> +static unsigned short
> +init_sock_filter(struct sock_filter *filter)
> +{
> +	unsigned short pos = 0;
> +#if SUPPORTED_PERSONALITIES > 1
> +	unsigned int audit_arch_vec[] = {
> +# if defined X86_64
> +		AUDIT_ARCH_X86_64,
> +		AUDIT_ARCH_I386,
> +		AUDIT_ARCH_X86_64
> +# elif SUPPORTED_PERSONALITIES == 2
> +		AUDIT_ARCH_X86_64,
> +		AUDIT_ARCH_I386
> +# endif
> +	};
> +#endif
> +	unsigned int syscall_bit_vec[] = {
> +#if defined X86_64
> +		0, 0, __X32_SYSCALL_BIT
> +#elif defined X32
> +		__X32_SYSCALL_BIT, 0
> +#elif SUPPORTED_PERSONALITIES == 2
> +		0, 0
> +#else
> +		0
> +#endif
> +	};

Please don't use any arch-specific ifdefs in the common code like this.
Use of arch-specific constants like AUDIT_ARCH_X86_64 in the common code
like this is also not acceptable.

-- 
ldv
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 801 bytes
Desc: not available
URL: <http://lists.strace.io/pipermail/strace-devel/attachments/20190714/840b3cce/attachment.bin>