[RFC v2] Add basic syscalls fault injection

Dmitry V. Levin ldv at altlinux.org
Mon May 16 11:46:38 UTC 2016


On Sun, May 15, 2016 at 01:33:47AM +0200, Nahim El Atmani wrote:
> From: Nahim El Atmani <naam at lse.epita.fr>
> 
> This patch adds a basic support for the faults injection in strace.
> As a reminder fault injection is something that could be really useful in many
> ways for developpers and testers. It could be used in test suites context to
> assert that an application behaves correctly on errors. By errors we don't
> necessarely mean that something went wrong, in fact an asynchroneous
> application may wants to have some tests for the code path when getting an
> EAGAIN value on a read for example. Normally the test have to setup a complex
> environment to create artificial errors or simply assert that the code works
> the way it should. strace would definetly get rid of that easily. On another
> hand if we considere the high proportion of bugs found by fuzzers nowadays it's
> pretty clear that tampering with the underneath of applications, that is with
> syscalls, looks promising.

Yes, this feature may appear to be useful in many ways.

> For that I reused the option parser and the custom parsing function to add a
> new -e keyword, `failwith' or in its shorter version `fw'. The API follows the
> following form: strace -e failwith=<syscall>:<occurrence>:<error>,...

follows the following? :)

> Since the syscall is directly passed to qual_syscall() everything you used
> before is still available (syscall number, name, classes...). The occurrence is
> actually an integer

Any integer?  Can it be negative or zero?

> but the room is made to support a percentage and and
> introduce non determinism if needed. Likewise the actual parser support error
> as an integer and further work is necessary to support the string
> representation. Here is a usage example with a dummy C program that print a
> message, try to kill himself and print a flag and a debug message:
> 
> strace -e kill -e failwith=kill:1:22 ./dummy
> Can you get the flag before I kill myself?
> kill(22978, SIGKILL)                    = -1 EINVAL (Invalid argument)
> CTF{Faults_Injection_Can_Save_lives}
> 
> Syscall returned: -1 Invalid argument(22)
> +++ exited with 0 +++
> 
> This currently only work in x86_64, further work is necessary to get it done on
> others.
> 
> Signed-off-by: Nahim El Atmani <nahim+dev at naam.me>
> ---
>  defs.h    |   8 +++++
>  syscall.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
>  2 files changed, 114 insertions(+), 2 deletions(-)
> 
> diff --git a/defs.h b/defs.h
> index a3b1dda..92aad87 100644
> --- a/defs.h
> +++ b/defs.h
> @@ -288,6 +288,13 @@ typedef struct ioctlent {
>  	unsigned int code;
>  } struct_ioctlent;
>  
> +typedef struct fail {
> +	int err;
> +	int cnt;
> +	int occ;
> +	int fail;
> +} struct_fail;

The name "fail" looks too short.  BTW, if there is going to be a typedef,
the structure doesn't have to be named.

> +
>  /* Trace Control Block */
>  struct tcb {
>  	int flags;		/* See below for TCB_ values */
> @@ -355,6 +362,7 @@ struct tcb {
>  #define QUAL_SIGNAL	0x010	/* report events with this signal */
>  #define QUAL_READ	0x020	/* dump data read on this file descriptor */
>  #define QUAL_WRITE	0x040	/* dump data written to this file descriptor */
> +#define QUAL_FAIL	0x080	/* this system call fail on purpose */
>  typedef uint8_t qualbits_t;
>  #define UNDEFINED_SCNO	0x100	/* Used only in tcp->qual_flg */
>  
> diff --git a/syscall.c b/syscall.c
> index d71ead3..580660c 100644
> --- a/syscall.c
> +++ b/syscall.c
> @@ -266,6 +266,8 @@ enum {
>  	MIN_QUALS = MAX_NSYSCALLS > 255 ? MAX_NSYSCALLS : 255
>  };
>  
> +struct_fail fails_vec[MAX_NSYSCALLS];

Shouldn't there be not just one vector but SUPPORTED_PERSONALITIES
vectors, like qual_vec?

> +
>  #if SUPPORTED_PERSONALITIES > 1
>  unsigned current_personality;
>  
> @@ -359,7 +361,7 @@ update_personality(struct tcb *tcp, unsigned int personality)
>  }
>  #endif
>  
> -static int qual_syscall(), qual_signal(), qual_desc();
> +static int qual_syscall(), qual_signal(), qual_desc(), qual_fail();
>  
>  static const struct qual_options {
>  	unsigned int bitflag;
> @@ -384,6 +386,8 @@ static const struct qual_options {
>  	{ QUAL_WRITE,	"write",	qual_desc,	"descriptor"	},
>  	{ QUAL_WRITE,	"writes",	qual_desc,	"descriptor"	},
>  	{ QUAL_WRITE,	"w",		qual_desc,	"descriptor"	},
> +	{ QUAL_FAIL,	"failwith",	qual_fail,	"system call"	},
> +	{ QUAL_FAIL,	"fw",		qual_fail,	"system call"	},
>  	{ 0,		NULL,		NULL,		NULL		},
>  };
>  
> @@ -448,6 +452,64 @@ qual_syscall(const char *s, const unsigned int bitflag, const int not)
>  }
>  
>  static int
> +qual_fail(const char *s, const unsigned int bitflag, const int not)
> +{
> +	char *ms, *ss, *end;
> +	char *saveptr;
> +	int occ, err;
> +	unsigned int i;
> +
> +	ms = ss = xstrdup(s);
> +	ss = strtok_r(ss, ":", &saveptr);
> +	if (!ss)
> +		goto bad_format;
> +
> +	for (i = 0; i <  nsyscall_vec[current_personality]; ++i)
> +		if (sysent_vec[current_personality][i].sys_name
> +		    && !strcmp(ss, sysent_vec[current_personality][i].sys_name))
> +			break;

Why limit this to current_personality?

> +	if (i == nsyscall_vec[current_personality])
> +		goto bad_format;
> +	qual_syscall(ss, bitflag, not);
> +
> +	ss = strtok_r(NULL, ":", &saveptr);
> +	if (!ss)
> +		goto bad_format;
> +
> +	occ = strtol(ss, &end, 10);
> +	if (end == ss || (*end != '\0' && *end != '%') || errno == ERANGE)
> +		goto bad_format;

What if occ is negative or zero?

> +	fails_vec[i].occ = occ;
> +	if (*end == '%')
> +		fails_vec[i].cnt = -1;
> +
> +	ss = strtok_r(NULL, ":", &saveptr);
> +	if (!ss)
> +		goto bad_format;
> +
> +	if (*ss >= '0' && *ss <= '9') {
> +		err = strtol(ss, &end, 10);
> +		if (end == ss || *end != '\0' || errno == ERANGE)
> +			goto bad_format;

What if err is negative or zero?

> +	}
> +	else {
> +		/* TODO: support string form. */
> +	}
> +	fails_vec[i].err = err;
> +	fails_vec[i].fail = 0;
> +
> +	free(ms);
> +
> +	return 0;
> +
> +bad_format:
> +	free(ms);
> +
> +	return -1;
> +}

I suggest a simple change to this procedure: parse the string passed to
the function and handle errors, then loop over personalities initializing
qual_vec and fails_vec.

> +static int
>  qual_signal(const char *s, const unsigned int bitflag, const int not)
>  {
>  	unsigned int i;
> @@ -789,6 +851,35 @@ static void get_error(struct tcb *, const bool);
>  static int getregs_old(pid_t);
>  #endif
>  
> +static long
> +fail_syscall_enter(struct tcb *tcp)
> +{
> +	if (fails_vec[tcp->scno].cnt != -1) {
> +		if (fails_vec[tcp->scno].cnt++ % fails_vec[tcp->scno].occ)

What of fails_vec[tcp->scno].occ is zero?

> +			return 0;
> +	}
> +	else { /* TODO: Support the non deterministic way */
> +		return 0;
> +	}
> +
> +	fails_vec[tcp->scno].fail = 1;
> +	return ptrace(PTRACE_POKEUSER, tcp->pid,
> +		      offsetof(struct user, regs.orig_rax),
> +		      (unsigned long)-1);
> +}

As this ptrace call is arch specific, I suggest moving it to a separate
function, e.g. arch_set_scno.

> +
> +static inline long
> +fail_syscall_exit(struct tcb *tcp)
> +{
> +	if (!fails_vec[tcp->scno].fail)
> +		return 0;
> +
> +	tcp->u_error = fails_vec[tcp->scno].err;
> +	return ptrace(PTRACE_POKEUSER, tcp->pid,
> +		      offsetof(struct user, regs.rax),
> +		      (unsigned long)-fails_vec[tcp->scno].err);
> +}

Likewise, I suggest moving arch specific code to a separate function,
e.g. arch_set_error.

> +
>  static int
>  trace_syscall_entering(struct tcb *tcp)
>  {
> @@ -842,6 +933,8 @@ trace_syscall_entering(struct tcb *tcp)
>  # endif
>  	}
>  #endif
> +	if (tcp->qual_flg & QUAL_FAIL)
> +		fail_syscall_enter(tcp);
>  
>  	if (!(tcp->qual_flg & QUAL_TRACE)
>  	 || (tracing_paths && !pathtrace_match(tcp))
> @@ -908,6 +1001,9 @@ trace_syscall_exiting(struct tcb *tcp)
>  	update_personality(tcp, tcp->currpers);
>  #endif
>  	res = (get_regs_error ? -1 : get_syscall_result(tcp));
> +
> +	if (tcp->qual_flg & QUAL_FAIL && fails_vec[tcp->scno].fail)
> +		fail_syscall_exit(tcp);

The check for hide_log_until_execve should go first.

It's important to make fail flag not per-scno but per-tcb, otherwise
force-failing syscalls when tracing mulitple processes will be unreliable.

struct_fail.fail is probably not needed at all.
struct call_counts is better place for accounting purposes.

>  	if (filtered(tcp) || hide_log_until_execve)
>  		goto ret;
>  
> @@ -973,7 +1069,15 @@ trace_syscall_exiting(struct tcb *tcp)
>  	tprints(") ");
>  	tabto();
>  	u_error = tcp->u_error;
> -	if (tcp->qual_flg & QUAL_RAW) {
> +	if (tcp->qual_flg & QUAL_FAIL && fails_vec[tcp->scno].fail) {
> +		if ((unsigned int)u_error < nerrnos && errnoent[u_error])
> +			tprintf("= -1 %s ", errnoent[tcp->u_error]);
> +		else
> +			tprintf("= -1 %ld ", u_error);
> +		tprints("(DISCARDED)");

Why DISCARDED?

> +		fails_vec[tcp->scno].fail = 0;
> +	}
> +	else if (tcp->qual_flg & QUAL_RAW) {
>  		if (u_error)
>  			tprintf("= -1 (errno %ld)", u_error);
>  		else

What if both QUAL_FAIL and QUAL_RAW flags are set?


-- 
ldv

-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 819 bytes
Desc: not available
URL: <http://lists.strace.io/pipermail/strace-devel/attachments/20160516/ab4b26fe/attachment.bin>


More information about the Strace-devel mailing list