[PATCH] check process death if ptrace() fails, v3
Denys Vlasenko
dvlasenk at redhat.com
Thu Dec 11 19:08:00 UTC 2008
Hi Roland,
After testing on ia64 with a real example of SIGKILL,
I made a few fixes, and now it works like this:
Unpatched:
[root at hp-rx6600-03 strace.2]# strace ./k
execve("./k", ["./k"], [/* 22 vars */]) = 0
uname({sys="Linux", node="hp-rx6600-03.rhts.bos.redhat.com", ...}) = 0
brk(0) = 0x600000000001c000
brk(0x600000000001cf38) = 0x600000000001cf38
brk(0x6000000000040f38) = 0x6000000000040f38
brk(0x6000000000044000) = 0x6000000000044000
getpid() = 7413
kill(7413, SIGKILLupeek: ptrace(PTRACE_PEEKUSER,7413,2096,0): No such process
upeek: ptrace(PTRACE_PEEKUSER,7413,2240,0): No such process
Patched:
[root at hp-rx6600-03 strace.2]# ./strace ./k
execve("./k", ["./k"], [/* 22 vars */]) = 0
uname({sys="Linux", node="hp-rx6600-03.rhts.bos.redhat.com", ...}) = 0
brk(0) = 0x600000000001c000
brk(0x600000000001cf38) = 0x600000000001cf38
brk(0x6000000000040f38) = 0x6000000000040f38
brk(0x6000000000044000) = 0x6000000000044000
getpid() = 7416
kill(7416, SIGKILL) = ? <unavailable>
+++ killed by SIGKILL +++
On x86_64:
# strace ./k
execve("./k", ["./k"], [/* 55 vars */]) = 0
uname({sys="Linux", node="localhost.localdomain", ...}) = 0
brk(0) = 0x263b000
brk(0x263bf10) = 0x263bf10
arch_prctl(ARCH_SET_FS, 0x263b850) = 0
brk(0x265cf10) = 0x265cf10
brk(0x265d000) = 0x265d000
getpid() = 26619
kill(26619, SIGKILL <unfinished ...>
+++ killed by SIGKILL +++
The patch itself is below.
--
vda
diff -d -urpN strace.1/defs.h strace.2/defs.h
--- strace.1/defs.h 2008-12-11 15:32:06.000000000 +0100
+++ strace.2/defs.h 2008-12-11 16:04:09.000000000 +0100
@@ -336,6 +336,7 @@ struct tcb {
prstatus_t status; /* procfs status structure */
#endif
#endif
+ int ptrace_errno;
#ifdef FREEBSD
struct procfs_status status;
int pfd_reg;
@@ -466,6 +467,8 @@ extern void set_overhead P((int));
extern void qualify P((char *));
extern int get_scno P((struct tcb *));
extern long known_scno P((struct tcb *));
+extern long do_ptrace P((int request, struct tcb *tcp, void *addr, void *data));
+extern int ptrace_restart P((int request, struct tcb *tcp, int sig));
extern int trace_syscall P((struct tcb *));
extern int count_syscall P((struct tcb *, struct timeval *));
extern void printxval P((const struct xlat *, int, const char *));
diff -d -urpN strace.1/process.c strace.2/process.c
--- strace.1/process.c 2008-12-11 15:32:06.000000000 +0100
+++ strace.2/process.c 2008-12-11 15:40:41.000000000 +0100
@@ -918,10 +918,8 @@ struct tcb *tcp;
clearbpt(tcpchild);
tcpchild->flags &= ~(TCB_SUSPENDED|TCB_STARTUP);
- if (ptrace(PTRACE_SYSCALL, pid, (char *) 1, 0) < 0) {
- perror("resume: ptrace(PTRACE_SYSCALL, ...)");
+ if (ptrace_restart(PTRACE_SYSCALL, tcpchild, 0) < 0)
return -1;
- }
if (!qflag)
fprintf(stderr, "\
diff -d -urpN strace.1/strace.c strace.2/strace.c
--- strace.1/strace.c 2008-12-11 15:32:06.000000000 +0100
+++ strace.2/strace.c 2008-12-11 17:55:01.000000000 +0100
@@ -1358,10 +1358,8 @@ struct tcb *tcp;
tcp->parent->nclone_waiting--;
#endif
- if (ptrace(PTRACE_SYSCALL, tcp->pid, (char *) 1, 0) < 0) {
- perror("resume: ptrace(PTRACE_SYSCALL, ...)");
+ if (ptrace_restart(PTRACE_SYSCALL, tcp, 0) < 0)
return -1;
- }
if (!qflag)
fprintf(stderr, "Process %u resumed\n", tcp->pid);
@@ -1533,21 +1531,14 @@ int sig;
break;
}
if (WSTOPSIG(status) == SIGSTOP) {
- if ((error = ptrace(PTRACE_DETACH,
- tcp->pid, (char *) 1, sig)) < 0) {
- if (errno != ESRCH)
- perror("detach: ptrace(PTRACE_DETACH, ...)");
- /* I died trying. */
- }
+ ptrace_restart(PTRACE_DETACH, tcp, sig);
break;
}
- if ((error = ptrace(PTRACE_CONT, tcp->pid, (char *) 1,
- WSTOPSIG(status) == SIGTRAP ?
- 0 : WSTOPSIG(status))) < 0) {
- if (errno != ESRCH)
- perror("detach: ptrace(PTRACE_CONT, ...)");
+ error = ptrace_restart(PTRACE_CONT, tcp,
+ WSTOPSIG(status) == SIGTRAP ? 0
+ : WSTOPSIG(status));
+ if (error < 0)
break;
- }
}
#endif /* LINUX */
@@ -1556,8 +1547,7 @@ int sig;
if (sig && kill(tcp->pid, sig) < 0)
perror("detach: kill");
sig = 0;
- if ((error = ptrace(PTRACE_DETACH, tcp->pid, (char *) 1, sig)) < 0)
- perror("detach: ptrace(PTRACE_DETACH, ...)");
+ error = ptrace_restart(PTRACE_DETACH, tcp, sig);
#endif /* SUNOS4 */
#ifndef USE_PROCFS
@@ -2160,17 +2150,16 @@ handle_group_exit(struct tcb *tcp, int s
detach(tcp, sig);
if (leader != NULL && leader != tcp)
leader->flags |= TCB_GROUP_EXITING;
- }
- else if (ptrace(PTRACE_CONT, tcp->pid, (char *) 1, sig) < 0) {
- perror("strace: ptrace(PTRACE_CONT, ...)");
- cleanup();
- return -1;
- }
- else {
- if (leader != NULL)
+ } else {
+ if (ptrace_restart(PTRACE_CONT, tcp, sig) < 0) {
+ cleanup();
+ return -1;
+ }
+ if (leader != NULL) {
leader->flags |= TCB_GROUP_EXITING;
- if (leader != NULL && leader != tcp)
- droptcb(tcp);
+ if (leader != tcp)
+ droptcb(tcp);
+ }
/* The leader will report to us as parent now,
and then we'll get to the SIG==-1 case. */
return 0;
@@ -2411,9 +2400,7 @@ Process %d attached (waiting for parent)
* Hope we are back in control now.
*/
tcp->flags &= ~(TCB_INSYSCALL | TCB_SIGTRAPPED);
- if (ptrace(PTRACE_SYSCALL,
- pid, (char *) 1, 0) < 0) {
- perror("trace: ptrace(PTRACE_SYSCALL, ...)");
+ if (ptrace_restart(PTRACE_SYSCALL, tcp, 0) < 0) {
cleanup();
return -1;
}
@@ -2460,9 +2447,7 @@ Process %d attached (waiting for parent)
#endif
continue;
}
- if (ptrace(PTRACE_SYSCALL, pid, (char *) 1,
- WSTOPSIG(status)) < 0) {
- perror("trace: ptrace(PTRACE_SYSCALL, ...)");
+ if (ptrace_restart(PTRACE_SYSCALL, tcp, WSTOPSIG(status)) < 0) {
cleanup();
return -1;
}
@@ -2472,7 +2457,7 @@ Process %d attached (waiting for parent)
/* we handled the STATUS, we are permitted to interrupt now. */
if (interrupted)
return 0;
- if (trace_syscall(tcp) < 0) {
+ if (trace_syscall(tcp) < 0 && !tcp->ptrace_errno) {
if (tcp->flags & TCB_ATTACHED)
detach(tcp, 0);
else {
@@ -2492,8 +2477,7 @@ Process %d attached (waiting for parent)
#endif
if (tcp->flags & TCB_ATTACHED)
detach(tcp, 0);
- else if (ptrace(PTRACE_CONT, pid, (char *) 1, 0) < 0) {
- perror("strace: ptrace(PTRACE_CONT, ...)");
+ else if (ptrace_restart(PTRACE_CONT, tcp, 0) < 0) {
cleanup();
return -1;
}
@@ -2505,8 +2489,7 @@ Process %d attached (waiting for parent)
continue;
}
tracing:
- if (ptrace(PTRACE_SYSCALL, pid, (char *) 1, 0) < 0) {
- perror("trace: ptrace(PTRACE_SYSCALL, ...)");
+ if (ptrace_restart(PTRACE_SYSCALL, tcp, 0) < 0) {
cleanup();
return -1;
}
@@ -2554,9 +2537,18 @@ void
printleader(tcp)
struct tcb *tcp;
{
- if (tcp_last && (!outfname || followfork < 2 || tcp_last == tcp)) {
- tcp_last->flags |= TCB_REPRINT;
- tprintf(" <unfinished ...>\n");
+ if (tcp_last) {
+ if (tcp_last->ptrace_errno) {
+ if (tcp_last->flags & TCB_INSYSCALL) {
+ tprintf(" <unavailable>)");
+ tabto(acolumn);
+ }
+ tprintf("= ? <unavailable>\n");
+ tcp_last->ptrace_errno = 0;
+ } else if (!outfname || followfork < 2 || tcp_last == tcp) {
+ tcp_last->flags |= TCB_REPRINT;
+ tprintf(" <unfinished ...>\n");
+ }
}
curcol = 0;
if ((followfork == 1 || pflag_seen > 1) && outfname)
diff -d -urpN strace.1/syscall.c strace.2/syscall.c
--- strace.1/syscall.c 2008-12-11 15:32:06.000000000 +0100
+++ strace.2/syscall.c 2008-12-11 19:58:32.000000000 +0100
@@ -2278,28 +2278,30 @@ trace_syscall(struct tcb *tcp)
{
int sys_res;
struct timeval tv;
- int res;
-
- /* Measure the exit time as early as possible to avoid errors. */
- if (dtime && (tcp->flags & TCB_INSYSCALL))
- gettimeofday(&tv, NULL);
-
- res = get_scno(tcp);
- if (res != 1)
- return res;
-
- res = syscall_fixup(tcp);
- if (res != 1)
- return res;
+ int res, scno_good;
if (tcp->flags & TCB_INSYSCALL) {
long u_error;
- res = get_error(tcp);
- if (res != 1)
+
+ /* Measure the exit time as early as possible to avoid errors. */
+ if (dtime)
+ gettimeofday(&tv, NULL);
+
+ scno_good = res = get_scno(tcp);
+ if (res == 0)
+ return res;
+ if (res == 1)
+ res = syscall_fixup(tcp);
+ if (res == 0)
return res;
+ if (res == 1)
+ res = get_error(tcp);
+ if (res == 0)
+ return res;
+ if (res == 1)
+ internal_syscall(tcp);
- internal_syscall(tcp);
- if (tcp->scno >= 0 && tcp->scno < nsyscalls &&
+ if (res == 1 && tcp->scno >= 0 && tcp->scno < nsyscalls &&
!(qual_flags[tcp->scno] & QUAL_TRACE)) {
tcp->flags &= ~TCB_INSYSCALL;
return 0;
@@ -2308,7 +2310,9 @@ trace_syscall(struct tcb *tcp)
if (tcp->flags & TCB_REPRINT) {
printleader(tcp);
tprintf("<... ");
- if (tcp->scno >= nsyscalls || tcp->scno < 0)
+ if (scno_good != 1)
+ tprintf("????");
+ else if (tcp->scno >= nsyscalls || tcp->scno < 0)
tprintf("syscall_%lu", tcp->scno);
else
tprintf("%s", sysent[tcp->scno].sys_name);
@@ -2318,6 +2322,13 @@ trace_syscall(struct tcb *tcp)
if (cflag)
return count_syscall(tcp, &tv);
+ if (res != 1) {
+ tprintf(") ");
+ tabto(acolumn);
+ tcp->flags &= ~TCB_INSYSCALL;
+ return res;
+ }
+
if (tcp->scno >= nsyscalls || tcp->scno < 0
|| (qual_flags[tcp->scno] & QUAL_RAW))
sys_res = printargs(tcp);
@@ -2420,10 +2431,36 @@ trace_syscall(struct tcb *tcp)
}
/* Entering system call */
- res = syscall_enter(tcp);
- if (res != 1)
+ scno_good = res = get_scno(tcp);
+ if (res == 0)
+ return res;
+ if (res == 1)
+ res = syscall_fixup(tcp);
+ if (res == 0)
+ return res;
+ if (res == 1)
+ res = syscall_enter(tcp);
+ if (res == 0)
return res;
+ if (res != 1) {
+ printleader(tcp);
+ tcp->flags &= ~TCB_REPRINT;
+ tcp_last = tcp;
+ if (scno_good != 1)
+ tprintf("????" /* anti-trigraph gap */ "(");
+ else if (tcp->scno >= nsyscalls || tcp->scno < 0)
+ tprintf("syscall_%lu(", tcp->scno);
+ else
+ tprintf("%s(", sysent[tcp->scno].sys_name);
+ /*
+ * " <unavailable>" will be added later by the code which
+ * detects ptrace errors.
+ */
+ tcp->flags |= TCB_INSYSCALL;
+ return res;
+ }
+
switch (known_scno(tcp)) {
#ifdef SYS_socket_subcall
case SYS_socketcall:
diff -d -urpN strace.1/util.c strace.2/util.c
--- strace.1/util.c 2008-12-11 15:32:06.000000000 +0100
+++ strace.2/util.c 2008-12-11 20:00:17.000000000 +0100
@@ -241,6 +241,61 @@ xlookup(const struct xlat *xlat, int val
}
/*
+ * Generic ptrace wrapper which tracks ESRCH errors
+ * by setting tcp->ptrace_errno to it.
+ *
+ * We assume that ESRCH indicates likely process death (SIGKILL?),
+ * modulo bugs where process somehow ended up not stopped.
+ * Unfortunately kernel uses ESRCH for that case too. Oh well.
+ *
+ * Currently used by upeek() only.
+ * TODO: use this in all other ptrace() calls while decoding.
+ */
+long
+do_ptrace(int request, struct tcb *tcp, void *addr, void *data)
+{
+ long l;
+
+ errno = 0;
+ l = ptrace(request, tcp->pid, addr, data);
+ /* Non-ESRCH errors might be our invalid reg/mem accesses,
+ * we do not record them. */
+ if (errno == ESRCH)
+ tcp->ptrace_errno = ESRCH;
+ return l;
+}
+
+/*
+ * Used when we want to unblock stopped traced process.
+ * Should be only used with PTRACE_CONT, PTRACE_DETACH and PTRACE_SYSCALL.
+ * Returns 0 on success or if error was ESRCH
+ * (presumably process was killed while we talk to it).
+ * Otherwise prints error message and returns -1.
+ */
+int
+ptrace_restart(int op, struct tcb *tcp, int sig)
+{
+ int err;
+ const char *msg;
+
+ errno = 0;
+ ptrace(op, tcp->pid, (void *) 1, (void *) (long) sig);
+ err = errno;
+ if (!err || err == ESRCH)
+ return 0;
+
+ tcp->ptrace_errno = err;
+ msg = "SYSCALL";
+ if (op == PTRACE_CONT)
+ msg = "CONT";
+ if (op == PTRACE_DETACH)
+ msg = "DETACH";
+ fprintf(stderr, "strace: ptrace(PTRACE_%s,1,%d): %s\n",
+ msg, sig, strerror(err));
+ return -1;
+}
+
+/*
* Print entry in struct xlat table, if there.
*/
void
@@ -1035,11 +1090,13 @@ long *res;
}
#endif /* SUNOS4_KERNEL_ARCH_KLUDGE */
errno = 0;
- val = ptrace(PTRACE_PEEKUSER, tcp->pid, (char *) off, 0);
+ val = do_ptrace(PTRACE_PEEKUSER, tcp, (char *) off, 0);
if (val == -1 && errno) {
- char buf[60];
- sprintf(buf,"upeek: ptrace(PTRACE_PEEKUSER,%d,%lu,0)", tcp->pid, off);
- perror(buf);
+ if (errno != ESRCH) {
+ char buf[60];
+ sprintf(buf,"upeek: ptrace(PTRACE_PEEKUSER,%d,%lu,0)", tcp->pid, off);
+ perror(buf);
+ }
return -1;
}
*res = val;
More information about the Strace-devel
mailing list