[RFC v2] Add basic syscalls fault injection
Dmitry V. Levin
ldv at altlinux.org
Mon May 16 11:46:38 UTC 2016
On Sun, May 15, 2016 at 01:33:47AM +0200, Nahim El Atmani wrote:
> From: Nahim El Atmani <naam at lse.epita.fr>
>
> This patch adds a basic support for the faults injection in strace.
> As a reminder fault injection is something that could be really useful in many
> ways for developpers and testers. It could be used in test suites context to
> assert that an application behaves correctly on errors. By errors we don't
> necessarely mean that something went wrong, in fact an asynchroneous
> application may wants to have some tests for the code path when getting an
> EAGAIN value on a read for example. Normally the test have to setup a complex
> environment to create artificial errors or simply assert that the code works
> the way it should. strace would definetly get rid of that easily. On another
> hand if we considere the high proportion of bugs found by fuzzers nowadays it's
> pretty clear that tampering with the underneath of applications, that is with
> syscalls, looks promising.
Yes, this feature may appear to be useful in many ways.
> For that I reused the option parser and the custom parsing function to add a
> new -e keyword, `failwith' or in its shorter version `fw'. The API follows the
> following form: strace -e failwith=<syscall>:<occurrence>:<error>,...
follows the following? :)
> Since the syscall is directly passed to qual_syscall() everything you used
> before is still available (syscall number, name, classes...). The occurrence is
> actually an integer
Any integer? Can it be negative or zero?
> but the room is made to support a percentage and and
> introduce non determinism if needed. Likewise the actual parser support error
> as an integer and further work is necessary to support the string
> representation. Here is a usage example with a dummy C program that print a
> message, try to kill himself and print a flag and a debug message:
>
> strace -e kill -e failwith=kill:1:22 ./dummy
> Can you get the flag before I kill myself?
> kill(22978, SIGKILL) = -1 EINVAL (Invalid argument)
> CTF{Faults_Injection_Can_Save_lives}
>
> Syscall returned: -1 Invalid argument(22)
> +++ exited with 0 +++
>
> This currently only work in x86_64, further work is necessary to get it done on
> others.
>
> Signed-off-by: Nahim El Atmani <nahim+dev at naam.me>
> ---
> defs.h | 8 +++++
> syscall.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
> 2 files changed, 114 insertions(+), 2 deletions(-)
>
> diff --git a/defs.h b/defs.h
> index a3b1dda..92aad87 100644
> --- a/defs.h
> +++ b/defs.h
> @@ -288,6 +288,13 @@ typedef struct ioctlent {
> unsigned int code;
> } struct_ioctlent;
>
> +typedef struct fail {
> + int err;
> + int cnt;
> + int occ;
> + int fail;
> +} struct_fail;
The name "fail" looks too short. BTW, if there is going to be a typedef,
the structure doesn't have to be named.
> +
> /* Trace Control Block */
> struct tcb {
> int flags; /* See below for TCB_ values */
> @@ -355,6 +362,7 @@ struct tcb {
> #define QUAL_SIGNAL 0x010 /* report events with this signal */
> #define QUAL_READ 0x020 /* dump data read on this file descriptor */
> #define QUAL_WRITE 0x040 /* dump data written to this file descriptor */
> +#define QUAL_FAIL 0x080 /* this system call fail on purpose */
> typedef uint8_t qualbits_t;
> #define UNDEFINED_SCNO 0x100 /* Used only in tcp->qual_flg */
>
> diff --git a/syscall.c b/syscall.c
> index d71ead3..580660c 100644
> --- a/syscall.c
> +++ b/syscall.c
> @@ -266,6 +266,8 @@ enum {
> MIN_QUALS = MAX_NSYSCALLS > 255 ? MAX_NSYSCALLS : 255
> };
>
> +struct_fail fails_vec[MAX_NSYSCALLS];
Shouldn't there be not just one vector but SUPPORTED_PERSONALITIES
vectors, like qual_vec?
> +
> #if SUPPORTED_PERSONALITIES > 1
> unsigned current_personality;
>
> @@ -359,7 +361,7 @@ update_personality(struct tcb *tcp, unsigned int personality)
> }
> #endif
>
> -static int qual_syscall(), qual_signal(), qual_desc();
> +static int qual_syscall(), qual_signal(), qual_desc(), qual_fail();
>
> static const struct qual_options {
> unsigned int bitflag;
> @@ -384,6 +386,8 @@ static const struct qual_options {
> { QUAL_WRITE, "write", qual_desc, "descriptor" },
> { QUAL_WRITE, "writes", qual_desc, "descriptor" },
> { QUAL_WRITE, "w", qual_desc, "descriptor" },
> + { QUAL_FAIL, "failwith", qual_fail, "system call" },
> + { QUAL_FAIL, "fw", qual_fail, "system call" },
> { 0, NULL, NULL, NULL },
> };
>
> @@ -448,6 +452,64 @@ qual_syscall(const char *s, const unsigned int bitflag, const int not)
> }
>
> static int
> +qual_fail(const char *s, const unsigned int bitflag, const int not)
> +{
> + char *ms, *ss, *end;
> + char *saveptr;
> + int occ, err;
> + unsigned int i;
> +
> + ms = ss = xstrdup(s);
> + ss = strtok_r(ss, ":", &saveptr);
> + if (!ss)
> + goto bad_format;
> +
> + for (i = 0; i < nsyscall_vec[current_personality]; ++i)
> + if (sysent_vec[current_personality][i].sys_name
> + && !strcmp(ss, sysent_vec[current_personality][i].sys_name))
> + break;
Why limit this to current_personality?
> + if (i == nsyscall_vec[current_personality])
> + goto bad_format;
> + qual_syscall(ss, bitflag, not);
> +
> + ss = strtok_r(NULL, ":", &saveptr);
> + if (!ss)
> + goto bad_format;
> +
> + occ = strtol(ss, &end, 10);
> + if (end == ss || (*end != '\0' && *end != '%') || errno == ERANGE)
> + goto bad_format;
What if occ is negative or zero?
> + fails_vec[i].occ = occ;
> + if (*end == '%')
> + fails_vec[i].cnt = -1;
> +
> + ss = strtok_r(NULL, ":", &saveptr);
> + if (!ss)
> + goto bad_format;
> +
> + if (*ss >= '0' && *ss <= '9') {
> + err = strtol(ss, &end, 10);
> + if (end == ss || *end != '\0' || errno == ERANGE)
> + goto bad_format;
What if err is negative or zero?
> + }
> + else {
> + /* TODO: support string form. */
> + }
> + fails_vec[i].err = err;
> + fails_vec[i].fail = 0;
> +
> + free(ms);
> +
> + return 0;
> +
> +bad_format:
> + free(ms);
> +
> + return -1;
> +}
I suggest a simple change to this procedure: parse the string passed to
the function and handle errors, then loop over personalities initializing
qual_vec and fails_vec.
> +static int
> qual_signal(const char *s, const unsigned int bitflag, const int not)
> {
> unsigned int i;
> @@ -789,6 +851,35 @@ static void get_error(struct tcb *, const bool);
> static int getregs_old(pid_t);
> #endif
>
> +static long
> +fail_syscall_enter(struct tcb *tcp)
> +{
> + if (fails_vec[tcp->scno].cnt != -1) {
> + if (fails_vec[tcp->scno].cnt++ % fails_vec[tcp->scno].occ)
What of fails_vec[tcp->scno].occ is zero?
> + return 0;
> + }
> + else { /* TODO: Support the non deterministic way */
> + return 0;
> + }
> +
> + fails_vec[tcp->scno].fail = 1;
> + return ptrace(PTRACE_POKEUSER, tcp->pid,
> + offsetof(struct user, regs.orig_rax),
> + (unsigned long)-1);
> +}
As this ptrace call is arch specific, I suggest moving it to a separate
function, e.g. arch_set_scno.
> +
> +static inline long
> +fail_syscall_exit(struct tcb *tcp)
> +{
> + if (!fails_vec[tcp->scno].fail)
> + return 0;
> +
> + tcp->u_error = fails_vec[tcp->scno].err;
> + return ptrace(PTRACE_POKEUSER, tcp->pid,
> + offsetof(struct user, regs.rax),
> + (unsigned long)-fails_vec[tcp->scno].err);
> +}
Likewise, I suggest moving arch specific code to a separate function,
e.g. arch_set_error.
> +
> static int
> trace_syscall_entering(struct tcb *tcp)
> {
> @@ -842,6 +933,8 @@ trace_syscall_entering(struct tcb *tcp)
> # endif
> }
> #endif
> + if (tcp->qual_flg & QUAL_FAIL)
> + fail_syscall_enter(tcp);
>
> if (!(tcp->qual_flg & QUAL_TRACE)
> || (tracing_paths && !pathtrace_match(tcp))
> @@ -908,6 +1001,9 @@ trace_syscall_exiting(struct tcb *tcp)
> update_personality(tcp, tcp->currpers);
> #endif
> res = (get_regs_error ? -1 : get_syscall_result(tcp));
> +
> + if (tcp->qual_flg & QUAL_FAIL && fails_vec[tcp->scno].fail)
> + fail_syscall_exit(tcp);
The check for hide_log_until_execve should go first.
It's important to make fail flag not per-scno but per-tcb, otherwise
force-failing syscalls when tracing mulitple processes will be unreliable.
struct_fail.fail is probably not needed at all.
struct call_counts is better place for accounting purposes.
> if (filtered(tcp) || hide_log_until_execve)
> goto ret;
>
> @@ -973,7 +1069,15 @@ trace_syscall_exiting(struct tcb *tcp)
> tprints(") ");
> tabto();
> u_error = tcp->u_error;
> - if (tcp->qual_flg & QUAL_RAW) {
> + if (tcp->qual_flg & QUAL_FAIL && fails_vec[tcp->scno].fail) {
> + if ((unsigned int)u_error < nerrnos && errnoent[u_error])
> + tprintf("= -1 %s ", errnoent[tcp->u_error]);
> + else
> + tprintf("= -1 %ld ", u_error);
> + tprints("(DISCARDED)");
Why DISCARDED?
> + fails_vec[tcp->scno].fail = 0;
> + }
> + else if (tcp->qual_flg & QUAL_RAW) {
> if (u_error)
> tprintf("= -1 (errno %ld)", u_error);
> else
What if both QUAL_FAIL and QUAL_RAW flags are set?
--
ldv
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 819 bytes
Desc: not available
URL: <http://lists.strace.io/pipermail/strace-devel/attachments/20160516/ab4b26fe/attachment.bin>
More information about the Strace-devel
mailing list