[PATCH] do not suspend waitpid under strace

Denys Vlasenko dvlasenk at redhat.com
Tue Jun 7 11:16:28 UTC 2011


Hi,

strace suspends waitpid until there is a child
for waitpid'ing process to collect status from.
Apparently, it is done because in some very old kernels
there were ptrace bugs which were making waitpid
not seeing children. Comment in strace code:

               /* There are children that this parent should block for.
                  But ptrace made us the parent of the traced children
                  and the real parent will get ECHILD from the wait call.
                  ...

Well. Whoever wrote this comment had to specify *which kernels*
have this bug. Apparently, this code (internal_wait function)
was added in or before 1999:

commit 76baf7c9f6dd61a15524ad43c1b690c252cf5b7c
Author: Wichert Akkerman <wichert at deephackmode.org>
Date:   Fri Feb 19 00:21:36 1999 +0000

    Initial revision


and the comment was added in 2002:

commit b69f81b8e552ab9ecdc5605887458adb0cb9542a
Author: Roland McGrath <roland at redhat.com>
Date:   Sat Dec 21 23:25:18 2002 +0000

    2002-12-21  Roland McGrath  <roland at redhat.com>

        * syscall.c (force_result): New function.
        * process.c (internal_wait): Handle ECHILD exit from wait call with
        WNOHANG flag set; force the return value to 0 in the inferior when it
        has live children we are tracing.


The newest kernel at that time was *linux-2.4.20*. Today, last 2.4.x is 2.4.37.11.
The bug may be fixed there. (I just don't have 2.4.x machine to test it).
It is definitely fixed in 2.6.x.

Oleg, can you tell approximately how many years ago was it fixed?


The problem with waitpid suspend is that it is broken:

                         * XXX doesn't handle pgrp matches (u_arg[0]==0,<-1)

Not only this: waitpid may also be interrupted by signals,
and waitpid suspend breaks that as well!
That's how I stumbled upon this bug - I noticed that waitpid isn't
interrupted by syscall under strace. Here's the test program:


#include <unistd.h>
#include <sys/types.h>
#include <signal.h>
#include <sys/wait.h>
static void handler(int s)
{
        write(1, "ho\n", 3);
}
int main()
{
        signal(SIGALRM, handler);
        if (fork() == 0) {
                sleep(1);
                kill(getppid(), SIGALRM);
                sleep(1);
                return 0;
        }
        wait(NULL);
        return 0;
}


It prints "ho" one second after it is started.
Under strace -tt -f -s99 -oLOG ./a.out, it prints it after *two* seconds.
Here is the LOG:

12464 22:51:06.663692 nanosleep({1, 0}, 0xbfa23e94) = 0
12464 22:51:07.663976 getppid()         = 12463
12464 22:51:07.664056 kill(12463, SIGALRM) = 0
<<====== why parent doesn't wake up now???
12464 22:51:07.664130 rt_sigprocmask(SIG_BLOCK, [CHLD], [], 8) = 0
12464 22:51:07.664280 rt_sigaction(SIGCHLD, NULL, {SIG_DFL, [], 0}, 8) = 0
12464 22:51:07.664388 rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
12464 22:51:07.664467 nanosleep({1, 0}, 0xbfa23e94) = 0
12464 22:51:08.664768 exit_group(0)     = ?
12463 22:51:08.664852 <... wait4 resumed> NULL, 0, NULL) = ? ERESTARTSYS (To be restarted)
<<====== why only now??
12463 22:51:08.664881 --- {si_signo=SIGALRM, si_code=SI_USER, si_pid=12464, si_uid=0, si_value={int=15, ptr=0xf}} (Alarm clock) ---
12463 22:51:08.664935 --- {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=12464, si_status=0, si_utime=0, si_stime=0} (Child exited) ---
12463 22:51:08.664993 write(1, "ho\n", 3) = 3
12463 22:51:08.665095 sigreturn()       = ? (mask now [])
12463 22:51:08.665214 wait4(-1, NULL, 0, NULL) = 12464
12463 22:51:08.665298 exit_group(0)     = ?


The patch below fixes this by deleting waitpid suspend code altogether.
Run-tested.

Dmitry, please take a look.

-- 
vda


diff -d -urpN strace.0/defs.h strace.1/defs.h
--- strace.0/defs.h	2011-06-07 12:12:14.000000000 +0200
+++ strace.1/defs.h	2011-06-07 12:45:50.391062006 +0200
@@ -367,11 +367,9 @@ struct tcb {
 				/* Support for tracing forked processes */
 	struct tcb *parent;	/* Parent of this process */
 	int nchildren;		/* # of traced children */
-	int waitpid;		/* pid(s) this process is waiting for */
 	int nzombies;		/* # of formerly traced children now dead */
 #ifdef LINUX
 	int nclone_threads;	/* # of nchildren with CLONE_THREAD */
-	int nclone_waiting;	/* clone threads in wait4 (TCB_SUSPENDED) */
 #endif
 				/* (1st arg of wait4()) */
 	long baddr;		/* `Breakpoint' address */
@@ -593,7 +591,6 @@ extern int pathtrace_match(struct tcb *)
 extern int change_syscall(struct tcb *, int);
 extern int internal_fork(struct tcb *);
 extern int internal_exec(struct tcb *);
-extern int internal_wait(struct tcb *, int);
 extern int internal_exit(struct tcb *);
 #ifdef LINUX
 extern int handle_new_child(struct tcb *, int, int);
diff -d -urpN strace.0/process.c strace.1/process.c
--- strace.0/process.c	2011-06-07 12:12:15.000000000 +0200
+++ strace.1/process.c	2011-06-07 13:15:15.954528019 +0200
@@ -1929,92 +1929,6 @@ printwaitn(struct tcb *tcp, int n, int b
 	return 0;
 }
 
-int
-internal_wait(struct tcb *tcp, int flagarg)
-{
-	int got_kids;
-
-#ifdef TCB_CLONE_THREAD
-	if (tcp->flags & TCB_CLONE_THREAD)
-		/* The children we wait for are our parent's children.  */
-		got_kids = (tcp->parent->nchildren
-			    > tcp->parent->nclone_threads);
-	else
-		got_kids = (tcp->nchildren > tcp->nclone_threads);
-#else
-	got_kids = tcp->nchildren > 0;
-#endif
-
-	if (entering(tcp) && got_kids) {
-		/* There are children that this parent should block for.
-		   But ptrace made us the parent of the traced children
-		   and the real parent will get ECHILD from the wait call.
-
-		   XXX If we attached with strace -f -p PID, then there
-		   may be untraced dead children the parent could be reaping
-		   now, but we make him block.  */
-
-		/* ??? WTA: fix bug with hanging children */
-
-		if (!(tcp->u_arg[flagarg] & WNOHANG)) {
-			/*
-			 * There are traced children.  We'll make the parent
-			 * block to avoid a false ECHILD error due to our
-			 * ptrace having stolen the children.  However,
-			 * we shouldn't block if there are zombies to reap.
-			 * XXX doesn't handle pgrp matches (u_arg[0]==0,<-1)
-			 */
-			struct tcb *child = NULL;
-			if (tcp->nzombies > 0 &&
-			    (tcp->u_arg[0] == -1 ||
-			     (child = pid2tcb(tcp->u_arg[0])) == NULL))
-				return 0;
-			if (tcp->u_arg[0] > 0) {
-				/*
-				 * If the parent waits for a specified child
-				 * PID, then it must get ECHILD right away
-				 * if that PID is not one of its children.
-				 * Make sure that the requested PID matches
-				 * one of the parent's children that we are
-				 * tracing, and don't suspend it otherwise.
-				 */
-				if (child == NULL)
-					child = pid2tcb(tcp->u_arg[0]);
-				if (child == NULL || child->parent != (
-#ifdef TCB_CLONE_THREAD
-					    (tcp->flags & TCB_CLONE_THREAD)
-					    ? tcp->parent :
-#endif
-					    tcp) ||
-				    (child->flags & TCB_EXITING))
-					return 0;
-			}
-			tcp->flags |= TCB_SUSPENDED;
-			tcp->waitpid = tcp->u_arg[0];
-#ifdef TCB_CLONE_THREAD
-			if (tcp->flags & TCB_CLONE_THREAD)
-				tcp->parent->nclone_waiting++;
-#endif
-		}
-	}
-	if (exiting(tcp) && tcp->u_error == ECHILD && got_kids) {
-		if (tcp->u_arg[flagarg] & WNOHANG) {
-			/* We must force a fake result of 0 instead of
-			   the ECHILD error.  */
-			return force_result(tcp, 0, 0);
-		}
-	}
-	else if (exiting(tcp) && tcp->u_error == 0 && tcp->u_rval > 0 &&
-		 tcp->nzombies > 0 && pid2tcb(tcp->u_rval) == NULL) {
-		/*
-		 * We just reaped a child we don't know about,
-		 * presumably a zombie we already droptcb'd.
-		 */
-		tcp->nzombies--;
-	}
-	return 0;
-}
-
 #ifdef SVR4
 
 int
diff -d -urpN strace.0/strace.c strace.1/strace.c
--- strace.0/strace.c	2011-06-07 12:12:15.000000000 +0200
+++ strace.1/strace.c	2011-06-07 12:45:56.633065169 +0200
@@ -1293,7 +1293,6 @@ alloc_tcb(int pid, int command_options_p
 			tcp->nzombies = 0;
 #ifdef TCB_CLONE_THREAD
 			tcp->nclone_threads = 0;
-			tcp->nclone_waiting = 0;
 #endif
 			tcp->flags = TCB_INUSE | TCB_STARTUP;
 			tcp->outf = outf; /* Initialise to current out file */
@@ -1713,99 +1712,6 @@ droptcb(struct tcb *tcp)
 	tcp->outf = 0;
 }
 
-#ifndef USE_PROCFS
-
-static int
-resume(struct tcb *tcp)
-{
-	if (tcp == NULL)
-		return -1;
-
-	if (!(tcp->flags & TCB_SUSPENDED)) {
-		fprintf(stderr, "PANIC: pid %u not suspended\n", tcp->pid);
-		return -1;
-	}
-	tcp->flags &= ~TCB_SUSPENDED;
-#ifdef TCB_CLONE_THREAD
-	if (tcp->flags & TCB_CLONE_THREAD)
-		tcp->parent->nclone_waiting--;
-#endif
-
-	if (ptrace_restart(PTRACE_SYSCALL, tcp, 0) < 0)
-		return -1;
-
-	if (!qflag)
-		fprintf(stderr, "Process %u resumed\n", tcp->pid);
-	return 0;
-}
-
-static int
-resume_from_tcp(struct tcb *tcp)
-{
-	int error = 0;
-	int resumed = 0;
-
-	/* XXX This won't always be quite right (but it never was).
-	   A waiter with argument 0 or < -1 is waiting for any pid in
-	   a particular pgrp, which this child might or might not be
-	   in.  The waiter will only wake up if it's argument is -1
-	   or if it's waiting for tcp->pid's pgrp.  It makes a
-	   difference to wake up a waiter when there might be more
-	   traced children, because it could get a false ECHILD
-	   error.  OTOH, if this was the last child in the pgrp, then
-	   it ought to wake up and get ECHILD.  We would have to
-	   search the system for all pid's in the pgrp to be sure.
-
-	     && (t->waitpid == -1 ||
-		 (t->waitpid == 0 && getpgid (tcp->pid) == getpgid (t->pid))
-		 || (t->waitpid < 0 && t->waitpid == -getpid (t->pid)))
-	*/
-
-	if (tcp->parent &&
-	    (tcp->parent->flags & TCB_SUSPENDED) &&
-	    (tcp->parent->waitpid <= 0 || tcp->parent->waitpid == tcp->pid)) {
-		error = resume(tcp->parent);
-		++resumed;
-	}
-#ifdef TCB_CLONE_THREAD
-	if (tcp->parent && tcp->parent->nclone_waiting > 0) {
-		/* Some other threads of our parent are waiting too.  */
-		unsigned int i;
-
-		/* Resume all the threads that were waiting for this PID.  */
-		for (i = 0; i < tcbtabsize; i++) {
-			struct tcb *t = tcbtab[i];
-			if (t->parent == tcp->parent && t != tcp
-			    && ((t->flags & (TCB_CLONE_THREAD|TCB_SUSPENDED))
-				== (TCB_CLONE_THREAD|TCB_SUSPENDED))
-			    && t->waitpid == tcp->pid) {
-				error |= resume(t);
-				++resumed;
-			}
-		}
-		if (resumed == 0)
-			/* Noone was waiting for this PID in particular,
-			   so now we might need to resume some wildcarders.  */
-			for (i = 0; i < tcbtabsize; i++) {
-				struct tcb *t = tcbtab[i];
-				if (t->parent == tcp->parent && t != tcp
-				    && ((t->flags
-					 & (TCB_CLONE_THREAD|TCB_SUSPENDED))
-					== (TCB_CLONE_THREAD|TCB_SUSPENDED))
-				    && t->waitpid <= 0
-					) {
-					error |= resume(t);
-					break;
-				}
-			}
-	}
-#endif
-
-	return error;
-}
-
-#endif /* !USE_PROCFS */
-
 /* detach traced process; continue with sig
    Never call DETACH twice on the same process as both unattached and
    attached-unstopped processes give the same ESRCH.  For unattached process we
@@ -1922,10 +1828,6 @@ detach(struct tcb *tcp, int sig)
 	error = ptrace_restart(PTRACE_DETACH, tcp, sig);
 #endif /* SUNOS4 */
 
-#ifndef USE_PROCFS
-	error |= resume_from_tcp(tcp);
-#endif
-
 	if (!qflag)
 		fprintf(stderr, "Process %u detached\n", tcp->pid);
 
@@ -2502,9 +2404,6 @@ handle_group_exit(struct tcb *tcp, int s
 				tcp->pid, leader ? leader->pid : -1);
 		}
 		/* TCP no longer exists therefore you must not detach() it.  */
-#ifndef USE_PROCFS
-		resume_from_tcp(tcp);
-#endif
 		droptcb(tcp);	/* Already died.  */
 	}
 	else {
diff -d -urpN strace.0/syscall.c strace.1/syscall.c
--- strace.0/syscall.c	2011-06-07 12:12:17.000000000 +0200
+++ strace.1/syscall.c	2011-06-07 12:36:14.160624975 +0200
@@ -653,22 +653,6 @@ internal_syscall(struct tcb *tcp)
 	   )
 		return internal_exec(tcp);
 
-	if (   sys_waitpid == func
-	    || sys_wait4 == func
-#if defined(SVR4) || defined(FREEBSD) || defined(SUNOS4)
-	    || sys_wait == func
-#endif
-#ifdef ALPHA
-	    || sys_osf_wait4 == func
-#endif
-	   )
-		return internal_wait(tcp, 2);
-
-#if defined(LINUX) || defined(SVR4)
-	if (sys_waitid == func)
-		return internal_wait(tcp, 3);
-#endif
-
 	return 0;
 }
 





More information about the Strace-devel mailing list