[PATCH v4 2/4] Introduce seccomp-assisted syscall filtering

Dmitry V. Levin ldv at altlinux.org
Sat Aug 31 18:56:55 UTC 2019


On Thu, Aug 29, 2019 at 04:00:24PM +0200, Paul Chaignon wrote:
> From: Chen Jingpiao <chenjingpiao at gmail.com>
> 
> With this patch, strace can rely on seccomp to only be stopped at syscalls
> of interest, instead of stopping at all syscalls.  The seccomp filtering
> of syscalls is opt-in only; it must be enabled with the -n option.  Kernel
> support is first checked with check_seccomp_filter(), which also ensures
> the BPF program derived from the syscalls to filter is not larger than the
> kernel's limit.
> 
> The -n option implies -f, but a warning is emitted if -f is not explicitly
> specified.  Since a task's children inherit its seccomp filters, we want
> to ensure all children are also traced to avoid their syscalls failing
> with ENOSYS (cf. SECCOMP_RET_TRACE in seccomp man page).
> 
> The current BPF program implements a simple linear match of the syscall
> numbers.  Contiguous sequences of syscall numbers are however matched as
> an interval, with two instructions only.  The algorithm can be improved or
> replaced in the future without impacting user-observed behavior.
> 
> The behavior of SECCOMP_RET_TRACE changed between Linux 4.7 and 4.8 (cf.
> PTRACE_EVENT_SECCOMP in ptrace man page).  This patch supports both
> behaviors by checking the kernel's actual behavior before installing the
> seccomp filter.
> 
> * filter_seccomp.c: New file.
> * filter_seccomp.h: New file.
> * Makefile.am (strace_SOURCES): Add filter_seccomp.c and filter_seccomp.h.
> * linux/aarch64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
> PERSONALITY1_AUDIT_ARCH): Define for aarch64.
> * linux/powerpc64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
> PERSONALITY1_AUDIT_ARCH): Likewise for powerpc64.
> * linux/riscv/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
> PERSONALITY1_AUDIT_ARCH): Likewise for riscv.
> * linux/s390x/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
> * linux/sparc64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
> PERSONALITY1_AUDIT_ARCH): Likewise for sparc64.
> PERSONALITY1_AUDIT_ARCH): Likewise for s390x.
> * linux/tile/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
> PERSONALITY1_AUDIT_ARCH): Likewise for tile.
> * linux/x32/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
> PERSONALITY1_AUDIT_ARCH): Likewise for x32.
> * linux/x86_64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
> PERSONALITY1_AUDIT_ARCH, PERSONALITY2_AUDIT_ARCH): Likewise for x86_64.
> * linux/ia64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH): Likewise for IA64.
> * strace.c (usage): Document -n option.
> (exec_or_die): Initialize seccomp filtering if requested.
> (init): Handle -n option and check that seccomp can be enabled.
> (print_debug_info): Handle PTRACE_EVENT_SECCOMP.
> (next_event): Capture PTRACE_EVENT_SECCOMP event.
> (dispatch_event): Handle PTRACE_EVENT_SECCOMP event.
> * trace_event.h (trace_event): New enumeration entity.
> * strace.1.in: Document new -n option.
> * NEWS: Mention this change.
> 
> Co-authored-by: Paul Chaignon <paul.chaignon at gmail.com>
> ---
>  Makefile.am                  |   2 +
>  NEWS                         |   2 +
>  filter_seccomp.c             | 488 +++++++++++++++++++++++++++++++++++
>  filter_seccomp.h             |  21 ++
>  linux/aarch64/arch_defs_.h   |   2 +
>  linux/ia64/arch_defs_.h      |   1 +
>  linux/powerpc64/arch_defs_.h |   2 +
>  linux/riscv/arch_defs_.h     |   2 +
>  linux/s390x/arch_defs_.h     |   2 +
>  linux/sparc64/arch_defs_.h   |   2 +
>  linux/tile/arch_defs_.h      |   2 +
>  linux/x32/arch_defs_.h       |   2 +
>  linux/x86_64/arch_defs_.h    |   3 +
>  strace.1.in                  |  17 +-
>  strace.c                     |  76 +++++-
>  trace_event.h                |   5 +
>  16 files changed, 624 insertions(+), 5 deletions(-)
>  create mode 100644 filter_seccomp.c
>  create mode 100644 filter_seccomp.h
> 
> diff --git a/Makefile.am b/Makefile.am
> index b4f31568..f4c65b0a 100644
> --- a/Makefile.am
> +++ b/Makefile.am
> @@ -129,6 +129,8 @@ strace_SOURCES =	\
>  	file_ioctl.c	\
>  	filter.h	\
>  	filter_qualify.c \
> +	filter_seccomp.c \
> +	filter_seccomp.h \
>  	flock.c		\
>  	flock.h		\
>  	fs_x_ioctl.c	\
> diff --git a/NEWS b/NEWS
> index a23b361b..e2913785 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -7,6 +7,8 @@ Noteworthy changes in release ?.? (????-??-??)
>    * Implemented decoding of UNIX_DIAG_UID netlink attribute.
>    * Updated lists of BPF_*, ETH_*, KEYCTL_*, KVM_*, MAP_*, SO_*, TCP_*, V4L2_*,
>      XDP_*, and *_MAGIC constants.
> +  * Implemented filtering of system calls via seccomp-bpf.  Use -n option to
> +    enable.
>  
>  * Bug fixes
>    * Fixed syscall tampering on arc, avr32, csky, ia64, m68k, metag, mips,
> diff --git a/filter_seccomp.c b/filter_seccomp.c
> new file mode 100644
> index 00000000..f1d10a5d
> --- /dev/null
> +++ b/filter_seccomp.c
> @@ -0,0 +1,488 @@
> +/*
> + * Copyright (c) 2018 Chen Jingpiao <chenjingpiao at gmail.com>
> + * Copyright (c) 2019 Paul Chaignon <paul.chaignon at gmail.com>
> + * All rights reserved.
> + *
> + * SPDX-License-Identifier: LGPL-2.1-or-later
> + */
> +
> +#include "defs.h"
> +
> +#include "ptrace.h"
> +#include <sys/prctl.h>
> +#include <sys/wait.h>
> +#include <linux/audit.h>
> +#include <linux/filter.h>
> +#include <linux/seccomp.h>
> +#include <asm/unistd.h>
> +#include <signal.h>
> +
> +#include "filter_seccomp.h"
> +#include "number_set.h"
> +#include "syscall.h"
> +
> +#ifndef BPF_MAXINSNS
> +# define BPF_MAXINSNS 4096
> +#endif
> +
> +#define JMP_PLACEHOLDER_NEXT  ((unsigned char) -1)
> +#define JMP_PLACEHOLDER_TRACE ((unsigned char) -2)
> +
> +#define SET_BPF(filter, code, jt, jf, k) \
> +	(*(filter) = (struct sock_filter) { code, jt, jf, k })
> +
> +#define SET_BPF_STMT(filter, code, k) \
> +	SET_BPF(filter, code, 0, 0, k)
> +
> +#define SET_BPF_JUMP(filter, code, k, jt, jf) \
> +	SET_BPF(filter, BPF_JMP | code, jt, jf, k)
> +
> +struct audit_arch_t {
> +	unsigned int arch;
> +	unsigned int flag;
> +};
> +
> +static const struct audit_arch_t audit_arch_vec[SUPPORTED_PERSONALITIES] = {
> +#if SUPPORTED_PERSONALITIES > 1
> +	PERSONALITY0_AUDIT_ARCH,
> +	PERSONALITY1_AUDIT_ARCH,
> +# if SUPPORTED_PERSONALITIES > 2
> +	PERSONALITY2_AUDIT_ARCH,
> +# endif
> +#endif
> +};
> +
> +bool seccomp_filtering = false;
> +bool seccomp_before_sysentry;
> +
> +static void

ATTRIBUTE_NORETURN

> +check_seccomp_order_do_child(void)
> +{
> +	static struct sock_filter filter[] = {

This really has to be static const struct sock_filter.

> +		/* return (nr == __NR_gettid) ? RET_TRACE : RET_ALLOW; */
> +		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
> +			 offsetof(struct seccomp_data, nr)),
> +		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_gettid, 0, 1),
> +		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRACE),
> +		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW)
> +	};
> +	static const struct sock_fprog prog = {
> +		.len = ARRAY_SIZE(filter),
> +		.filter = filter

Feel free to cast filter to (struct sock_filter *) to overcome
the limitation of linux/filter.h API.

> +	};
> +
> +	if (ptrace(PTRACE_TRACEME, 0, NULL, NULL) < 0)
> +		perror_func_msg_and_die("ptrace(PTRACE_TRACEME, ...");

I suggest to make PTRACE_TRACEME the last syscall before raising SIGSTOP.

> +	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
> +		perror_func_msg_and_die("prctl(PR_SET_NO_NEW_PRIVS, 1, ...");

No need for ", ...".

> +	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0)
> +		perror_func_msg_and_die("prctl(PR_SET_SECCOMP)");
> +	kill(getpid(), SIGSTOP);
> +	syscall(__NR_gettid);
> +	pause();
> +	_exit(0);
> +}

No need for pause();

> +
> +static int
> +check_seccomp_order_tracer(int pid)
> +{
> +	int status, tracee_pid, step = 0;
> +	unsigned int event;
> +
> +	while (1) {
> +		errno = 0;
> +		tracee_pid = waitpid(pid, &status, 0);
> +		if (tracee_pid <= 0) {
> +			if (errno == EINTR)
> +				continue;
> +			perror_func_msg("waitpid(%u)", pid);
> +			return -1;
> +		}
> +		switch (step) {
> +		case 0:
> +			if (ptrace(PTRACE_SETOPTIONS, pid, 0,
> +				   PTRACE_O_TRACESECCOMP) < 0) {
> +				perror_func_msg("ptrace(PTRACE_SETOPTIONS)");
> +				return -1;
> +			}
> +			if (ptrace(PTRACE_SYSCALL, pid, NULL, NULL) < 0) {
> +				perror_func_msg("ptrace(PTRACE_SYSCALL)");
> +				return -1;
> +			}
> +			break;
> +		case 1:
> +			event = (unsigned int) status >> 16;
> +			seccomp_before_sysentry = event == PTRACE_EVENT_SECCOMP;
> +			kill(pid, SIGKILL);
> +			break;
> +		default:
> +			if (WIFSIGNALED(status))
> +				return 0;
> +
> +			error_func_msg("unexpected wait status %#x", status);
> +			return -1;
> +		}
> +		step++;
> +	}
> +	return 0;
> +}
> +
> +static int
> +check_seccomp_order(void)
> +{
> +	int pid;
> +
> +	pid = fork();
> +	if (pid < 0) {
> +		perror_func_msg("fork");
> +		return -1;
> +	}
> +
> +	if (pid == 0)
> +		check_seccomp_order_do_child();
> +
> +	return check_seccomp_order_tracer(pid);
> +}
[...]
> +
> +void
> +check_seccomp_filter(void)
> +{
> +	int rc;
> +
> +	if (!seccomp_filtering)
> +		return;
> +
> +	if (NOMMU_SYSTEM) {
> +		seccomp_filtering = false;
> +		goto end;
> +	}
> +
> +	rc = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0);
> +	seccomp_filtering = rc >= 0 || errno != EINVAL;
> +	if (seccomp_filtering)
> +		check_bpf_program_size();
> +	if (seccomp_filtering && check_seccomp_order() < 0)
> +		seccomp_filtering = false;
> +
> +end:
> +	if (!seccomp_filtering)
> +		error_msg("seccomp-filter is requested but unavailable");
> +}

This code doesn't test seccomp good enough.  Also, it's kind of fragile.
For example, I've seen a system where it failed with
check_seccomp_order_tracer: unexpected wait status 0x57f

I suggest to change it this way (similar to test_ptrace_get_syscall_info):

static int
check_seccomp_order_tracer(int pid)
{
	unsigned int step;

	for (step = 0; ; ++step) {
		int status;

		for (;;) {
			long rc = waitpid(pid, &status, 0);
			if (rc < 0 && errno == EINTR)
				continue;
			if (rc == pid)
				break;
			/* Cannot happen.  */
			perror_func_msg("#%d: unexpected wait result %ld",
					step, rc);
			return pid;
		}

		if (WIFEXITED(status)) {
			/* The tracee is no more.  */
			pid = 0;

			int exitstatus = WEXITSTATUS(status);
			if (step == 5 && exitstatus == 0) {
				seccomp_filtering = true;
			} else {
				error_func_msg("#%d: unexpected exit status %u",
					       step, exitstatus);
			}
			break;
		}

		if (WIFSIGNALED(status)) {
			/* The tracee is no more.  */
			pid = 0;

			error_func_msg("#%d: unexpected signal %u",
				       step, WTERMSIG(status));
			break;
		}

		if (!WIFSTOPPED(status)) {
			/* Cannot happen.  */
			error_func_msg("#%d: unexpected wait status %#x",
				       step, status);
			break;
		}

		unsigned int event = (unsigned int) status >> 16;

		switch (WSTOPSIG(status)) {
		case SIGSTOP:
			if (step != 0) {
				error_func_msg("#%d: unexpected signal stop",
					       step);
				return pid;
			}
			if (ptrace(PTRACE_SETOPTIONS, pid, 0L,
				   PTRACE_O_TRACESYSGOOD|
				   PTRACE_O_TRACESECCOMP) < 0) {
				perror_func_msg("PTRACE_SETOPTIONS");
				return pid;
			}
			break;

		case SIGTRAP:
			if (event != PTRACE_EVENT_SECCOMP) {
				error_func_msg("#%d: unexpected trap %#x",
					       step, event);
				return pid;
			}

			switch (step) {
			case 1: /* Seccomp stop before entering gettid.  */
				seccomp_before_sysentry = true;
				break;
			case 2: /* Seccomp stop after entering gettid.  */
				if (!seccomp_before_sysentry)
					break;
				ATTRIBUTE_FALLTHROUGH;
			default:
				error_func_msg("#%d: unexpected seccomp stop",
					       step);
				return pid;
			}
			break;

		case SIGTRAP | 0x80:
			switch (step) {
			case 3: /* Exiting gettid.  */
			case 4: /* Entering exit_group.  */
				break;
			case 1: /* Entering gettid before seccomp stop.  */
				seccomp_before_sysentry = false;
				break;
			case 2: /* Entering gettid after seccomp stop.  */
				if (seccomp_before_sysentry)
					break;
				ATTRIBUTE_FALLTHROUGH;
			default:
				error_func_msg("#%d: unexpected syscall stop",
					       step);
				return pid;
			}
			break;

		default:
			error_func_msg("#%d: unexpected stop signal %#x",
				       step, WSTOPSIG(status));
			return pid;
		}

		if (ptrace(PTRACE_SYSCALL, pid, 0L, 0L) < 0) {
			/* Cannot happen.  */
			perror_func_msg("#%d: PTRACE_SYSCALL", step);
			break;
		}
	}

	return pid;
}

static void
check_seccomp_order(void)
{
	seccomp_filtering = false;

	int pid = fork();
	if (pid < 0) {
		perror_func_msg("fork");
		return;
	}

	if (pid == 0)
		check_seccomp_order_do_child();

	pid = check_seccomp_order_tracer(pid);
	if (pid) {
		kill(pid, SIGKILL);
		for (;;) {
			long rc = waitpid(pid, NULL, 0);
			if (rc < 0 && errno == EINTR)
				continue;
			break;
		}
	}
}

void
check_seccomp_filter(void)
{
	if (!seccomp_filtering)
		return;

	if (NOMMU_SYSTEM) {
		seccomp_filtering = false;
		goto end;
	}

	int rc = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0);
	seccomp_filtering = rc < 0 && errno != EINVAL;
	if (!seccomp_filtering)
		debug_func_perror_msg("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");

	if (seccomp_filtering)
		check_bpf_program_size();
	if (seccomp_filtering)
		check_seccomp_order();

end:
	if (!seccomp_filtering)
		error_msg("seccomp filter is requested but unavailable");
}


-- 
ldv
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 801 bytes
Desc: not available
URL: <http://lists.strace.io/pipermail/strace-devel/attachments/20190831/4b0420e2/attachment.bin>


More information about the Strace-devel mailing list