[patch] test/leaderkill fix

Jan Kratochvil jan.kratochvil at redhat.com
Thu Aug 2 08:47:14 UTC 2007


On Wed, 11 Jul 2007 10:40:48 +0200, Roland McGrath wrote:
...
> What I do see is that after PTRACE_KILL, the leader is a zombie but the other
> thread is still alive, and so the wait4 in detach called on the leader in
> handle_group_exit blocks.

I also have reproducible this behavior now.


> I am still concerned about other cases where there are more threads.  I think
> that the synchronous wait in detach will bite again on the leader because the
> other threads still exist.

This happened (reproducer in the updated `test/leaderkill.c' testcase).


> They should be killed by the group exit, but they will still stick around as
> zombies until we see them with wait because they are ptraced.  I think that
> is enough to prevent the zombie leader from being reported to wait.

Yes, waitpid() hangs there.


> If I'm right about that case, then I think the right solution is simply to
> punt the detach call on the leader in handle_group_exit.  It should be seen
> shortly along with all the other threads.  But I may be overlooking
> something, some reason that detach was there other than ancient kernels.

It works the way you describe.  There were two other bugs affecting it.


Regards,
Jan
-------------- next part --------------
2007-08-02  Jan Kratochvil  <jan.kratochvil at redhat.com>

	* strace.c (detach): Moved the resume notification code to ...
	(resume_from_tcp): ... a new function here.
	(handle_group_exit): No longer detach also the thread group leader.
	(trace): Fixed panic on exit of the TCB_GROUP_EXITING leader itself.

	* test/leaderkill.c (start): Renamed to ...
	(start0): ... here.
	(start1): New function.
	(main): Created a new spare thread.

--- strace.c	11 Jul 2007 08:35:11 -0000	1.81
+++ strace.c	2 Aug 2007 08:38:39 -0000
@@ -1337,6 +1337,71 @@ struct tcb *tcp;
 	return 0;
 }
 
+static int
+resume_from_tcp (struct tcb *tcp)
+{
+	int error = 0;
+	int resumed = 0;
+
+	/* XXX This won't always be quite right (but it never was).
+	   A waiter with argument 0 or < -1 is waiting for any pid in
+	   a particular pgrp, which this child might or might not be
+	   in.  The waiter will only wake up if it's argument is -1
+	   or if it's waiting for tcp->pid's pgrp.  It makes a
+	   difference to wake up a waiter when there might be more
+	   traced children, because it could get a false ECHILD
+	   error.  OTOH, if this was the last child in the pgrp, then
+	   it ought to wake up and get ECHILD.  We would have to
+	   search the system for all pid's in the pgrp to be sure.
+
+	     && (t->waitpid == -1 ||
+		 (t->waitpid == 0 && getpgid (tcp->pid) == getpgid (t->pid))
+		 || (t->waitpid < 0 && t->waitpid == -getpid (t->pid)))
+	*/
+
+	if (tcp->parent &&
+	    (tcp->parent->flags & TCB_SUSPENDED) &&
+	    (tcp->parent->waitpid <= 0 || tcp->parent->waitpid == tcp->pid)) {
+ 		error = resume(tcp->parent);
+		++resumed;
+	}
+#ifdef TCB_CLONE_THREAD
+	if (tcp->parent && tcp->parent->nclone_waiting > 0) {
+		/* Some other threads of our parent are waiting too.  */
+		unsigned int i;
+
+		/* Resume all the threads that were waiting for this PID.  */
+		for (i = 0; i < tcbtabsize; i++) {
+			struct tcb *t = tcbtab[i];
+			if (t->parent == tcp->parent && t != tcp
+			    && ((t->flags & (TCB_CLONE_THREAD|TCB_SUSPENDED))
+				== (TCB_CLONE_THREAD|TCB_SUSPENDED))
+			    && t->waitpid == tcp->pid) {
+				error |= resume (t);
+				++resumed;
+			}
+		}
+		if (resumed == 0)
+			/* Noone was waiting for this PID in particular,
+			   so now we might need to resume some wildcarders.  */
+			for (i = 0; i < tcbtabsize; i++) {
+				struct tcb *t = tcbtab[i];
+				if (t->parent == tcp->parent && t != tcp
+				    && ((t->flags
+					 & (TCB_CLONE_THREAD|TCB_SUSPENDED))
+					== (TCB_CLONE_THREAD|TCB_SUSPENDED))
+				    && t->waitpid <= 0
+					) {
+					error |= resume (t);
+					break;
+				}
+			}
+	}
+
+	return error;
+}
+#endif
+
 #endif /* !USE_PROCFS */
 
 /* detach traced process; continue with sig
@@ -1351,7 +1416,7 @@ int sig;
 {
 	int error = 0;
 #ifdef LINUX
-	int status, resumed, catch_sigstop;
+	int status, catch_sigstop;
 	struct tcb *zombie = NULL;
 
 	/* If the group leader is lingering only because of this other
@@ -1465,66 +1530,9 @@ int sig;
 #endif /* SUNOS4 */
 
 #ifndef USE_PROCFS
-	resumed = 0;
-
-	/* XXX This won't always be quite right (but it never was).
-	   A waiter with argument 0 or < -1 is waiting for any pid in
-	   a particular pgrp, which this child might or might not be
-	   in.  The waiter will only wake up if it's argument is -1
-	   or if it's waiting for tcp->pid's pgrp.  It makes a
-	   difference to wake up a waiter when there might be more
-	   traced children, because it could get a false ECHILD
-	   error.  OTOH, if this was the last child in the pgrp, then
-	   it ought to wake up and get ECHILD.  We would have to
-	   search the system for all pid's in the pgrp to be sure.
-
-	     && (t->waitpid == -1 ||
-		 (t->waitpid == 0 && getpgid (tcp->pid) == getpgid (t->pid))
-		 || (t->waitpid < 0 && t->waitpid == -getpid (t->pid)))
-	*/
-
-	if (tcp->parent &&
-	    (tcp->parent->flags & TCB_SUSPENDED) &&
-	    (tcp->parent->waitpid <= 0 || tcp->parent->waitpid == tcp->pid)) {
- 		error = resume(tcp->parent);
-		++resumed;
-	}
-#ifdef TCB_CLONE_THREAD
-	if (tcp->parent && tcp->parent->nclone_waiting > 0) {
-		/* Some other threads of our parent are waiting too.  */
-		unsigned int i;
-
-		/* Resume all the threads that were waiting for this PID.  */
-		for (i = 0; i < tcbtabsize; i++) {
-			struct tcb *t = tcbtab[i];
-			if (t->parent == tcp->parent && t != tcp
-			    && ((t->flags & (TCB_CLONE_THREAD|TCB_SUSPENDED))
-				== (TCB_CLONE_THREAD|TCB_SUSPENDED))
-			    && t->waitpid == tcp->pid) {
-				error |= resume (t);
-				++resumed;
-			}
-		}
-		if (resumed == 0)
-			/* Noone was waiting for this PID in particular,
-			   so now we might need to resume some wildcarders.  */
-			for (i = 0; i < tcbtabsize; i++) {
-				struct tcb *t = tcbtab[i];
-				if (t->parent == tcp->parent && t != tcp
-				    && ((t->flags
-					 & (TCB_CLONE_THREAD|TCB_SUSPENDED))
-					== (TCB_CLONE_THREAD|TCB_SUSPENDED))
-				    && t->waitpid <= 0
-					) {
-					error |= resume (t);
-					break;
-				}
-			}
-	}
+	error |= resume_from_tcp (tcp);
 #endif
 
-#endif /* !USE_PROCFS */
-
 	if (!qflag)
 		fprintf(stderr, "Process %u detached\n", tcp->pid);
 
@@ -2109,6 +2117,9 @@ handle_group_exit(struct tcb *tcp, int s
 				"PANIC: handle_group_exit: %d leader %d\n",
 				tcp->pid, leader ? leader->pid : -1);
 		/* TCP no longer exists therefore you must not detach () it.  */
+#ifndef USE_PROCFS
+		resume_from_tcp (tcp);
+#endif
 		droptcb(tcp);	/* Already died.  */
 	}
 	else {
@@ -2116,21 +2127,8 @@ handle_group_exit(struct tcb *tcp, int s
 		tcp->flags |= TCB_EXITING | TCB_GROUP_EXITING;
 		if (tcp->flags & TCB_ATTACHED) {
 			detach(tcp, sig);
-		  	if (leader != NULL && leader != tcp) {
-				if ((leader->flags & TCB_ATTACHED) &&
-				    !(leader->flags & TCB_EXITING)) {
-					/* We need to detach the leader so
-					   that the process death will be
-					   reported to its real parent.  */
-					if (debug)
-						fprintf(stderr,
-							" [%d exit %d detaches %d]\n",
-							tcp->pid, sig, leader->pid);
-					detach(leader, sig);
-				}
-				else
-					leader->flags |= TCB_GROUP_EXITING;
-			}
+		  	if (leader != NULL && leader != tcp)
+				leader->flags |= TCB_GROUP_EXITING;
 		}
 		else if (ptrace(PTRACE_CONT, tcp->pid, (char *) 1, sig) < 0) {
 			perror("strace: ptrace(PTRACE_CONT, ...)");
@@ -2318,6 +2316,7 @@ Process %d attached (waiting for parent)
 #ifdef TCB_GROUP_EXITING
 			    && !(tcp->parent && (tcp->parent->flags &
 						 TCB_GROUP_EXITING))
+			    && !(tcp->flags & TCB_GROUP_EXITING)
 #endif
 				)
 				fprintf(stderr,
--- test/leaderkill.c	5 Jul 2007 18:43:18 -0000	1.1
+++ test/leaderkill.c	2 Aug 2007 08:38:40 -0000
@@ -13,15 +13,23 @@
 #include <stdio.h>
 #include <sys/wait.h>
 
-static void *start (void *arg)
+static void *start0 (void *arg)
 {
   sleep (1);
 
   exit (42);
 }
 
+static void *start1 (void *arg)
+{
+  pause ();
+  /* NOTREACHED */
+  assert (0);
+}
+
 int main (void)
 {
+  pthread_t thread0;
   pthread_t thread1;
   int i;
   pid_t child, got_pid;
@@ -35,16 +43,13 @@ int main (void)
       case -1:
 	abort ();
       case 0:
-	i = pthread_create (&thread1, NULL, start, NULL);
+	i = pthread_create (&thread0, NULL, start0, NULL);
+	assert (i == 0);
+	i = pthread_create (&thread1, NULL, start1, NULL);
 	assert (i == 0);
-/* Two possible testcases; the second one passed even in the older versions.  */
-#if 1
 	pause ();
-#else
-	pthread_exit (NULL);
-#endif
 	/* NOTREACHED */
-	abort ();
+	assert (0);
 	break;
       default:
 	got_pid = waitpid (child, &status, 0);


More information about the Strace-devel mailing list