[patch] test/leaderkill fix
Jan Kratochvil
jan.kratochvil at redhat.com
Thu Aug 2 08:47:14 UTC 2007
On Wed, 11 Jul 2007 10:40:48 +0200, Roland McGrath wrote:
...
> What I do see is that after PTRACE_KILL, the leader is a zombie but the other
> thread is still alive, and so the wait4 in detach called on the leader in
> handle_group_exit blocks.
I also have reproducible this behavior now.
> I am still concerned about other cases where there are more threads. I think
> that the synchronous wait in detach will bite again on the leader because the
> other threads still exist.
This happened (reproducer in the updated `test/leaderkill.c' testcase).
> They should be killed by the group exit, but they will still stick around as
> zombies until we see them with wait because they are ptraced. I think that
> is enough to prevent the zombie leader from being reported to wait.
Yes, waitpid() hangs there.
> If I'm right about that case, then I think the right solution is simply to
> punt the detach call on the leader in handle_group_exit. It should be seen
> shortly along with all the other threads. But I may be overlooking
> something, some reason that detach was there other than ancient kernels.
It works the way you describe. There were two other bugs affecting it.
Regards,
Jan
-------------- next part --------------
2007-08-02 Jan Kratochvil <jan.kratochvil at redhat.com>
* strace.c (detach): Moved the resume notification code to ...
(resume_from_tcp): ... a new function here.
(handle_group_exit): No longer detach also the thread group leader.
(trace): Fixed panic on exit of the TCB_GROUP_EXITING leader itself.
* test/leaderkill.c (start): Renamed to ...
(start0): ... here.
(start1): New function.
(main): Created a new spare thread.
--- strace.c 11 Jul 2007 08:35:11 -0000 1.81
+++ strace.c 2 Aug 2007 08:38:39 -0000
@@ -1337,6 +1337,71 @@ struct tcb *tcp;
return 0;
}
+static int
+resume_from_tcp (struct tcb *tcp)
+{
+ int error = 0;
+ int resumed = 0;
+
+ /* XXX This won't always be quite right (but it never was).
+ A waiter with argument 0 or < -1 is waiting for any pid in
+ a particular pgrp, which this child might or might not be
+ in. The waiter will only wake up if it's argument is -1
+ or if it's waiting for tcp->pid's pgrp. It makes a
+ difference to wake up a waiter when there might be more
+ traced children, because it could get a false ECHILD
+ error. OTOH, if this was the last child in the pgrp, then
+ it ought to wake up and get ECHILD. We would have to
+ search the system for all pid's in the pgrp to be sure.
+
+ && (t->waitpid == -1 ||
+ (t->waitpid == 0 && getpgid (tcp->pid) == getpgid (t->pid))
+ || (t->waitpid < 0 && t->waitpid == -getpid (t->pid)))
+ */
+
+ if (tcp->parent &&
+ (tcp->parent->flags & TCB_SUSPENDED) &&
+ (tcp->parent->waitpid <= 0 || tcp->parent->waitpid == tcp->pid)) {
+ error = resume(tcp->parent);
+ ++resumed;
+ }
+#ifdef TCB_CLONE_THREAD
+ if (tcp->parent && tcp->parent->nclone_waiting > 0) {
+ /* Some other threads of our parent are waiting too. */
+ unsigned int i;
+
+ /* Resume all the threads that were waiting for this PID. */
+ for (i = 0; i < tcbtabsize; i++) {
+ struct tcb *t = tcbtab[i];
+ if (t->parent == tcp->parent && t != tcp
+ && ((t->flags & (TCB_CLONE_THREAD|TCB_SUSPENDED))
+ == (TCB_CLONE_THREAD|TCB_SUSPENDED))
+ && t->waitpid == tcp->pid) {
+ error |= resume (t);
+ ++resumed;
+ }
+ }
+ if (resumed == 0)
+ /* Noone was waiting for this PID in particular,
+ so now we might need to resume some wildcarders. */
+ for (i = 0; i < tcbtabsize; i++) {
+ struct tcb *t = tcbtab[i];
+ if (t->parent == tcp->parent && t != tcp
+ && ((t->flags
+ & (TCB_CLONE_THREAD|TCB_SUSPENDED))
+ == (TCB_CLONE_THREAD|TCB_SUSPENDED))
+ && t->waitpid <= 0
+ ) {
+ error |= resume (t);
+ break;
+ }
+ }
+ }
+
+ return error;
+}
+#endif
+
#endif /* !USE_PROCFS */
/* detach traced process; continue with sig
@@ -1351,7 +1416,7 @@ int sig;
{
int error = 0;
#ifdef LINUX
- int status, resumed, catch_sigstop;
+ int status, catch_sigstop;
struct tcb *zombie = NULL;
/* If the group leader is lingering only because of this other
@@ -1465,66 +1530,9 @@ int sig;
#endif /* SUNOS4 */
#ifndef USE_PROCFS
- resumed = 0;
-
- /* XXX This won't always be quite right (but it never was).
- A waiter with argument 0 or < -1 is waiting for any pid in
- a particular pgrp, which this child might or might not be
- in. The waiter will only wake up if it's argument is -1
- or if it's waiting for tcp->pid's pgrp. It makes a
- difference to wake up a waiter when there might be more
- traced children, because it could get a false ECHILD
- error. OTOH, if this was the last child in the pgrp, then
- it ought to wake up and get ECHILD. We would have to
- search the system for all pid's in the pgrp to be sure.
-
- && (t->waitpid == -1 ||
- (t->waitpid == 0 && getpgid (tcp->pid) == getpgid (t->pid))
- || (t->waitpid < 0 && t->waitpid == -getpid (t->pid)))
- */
-
- if (tcp->parent &&
- (tcp->parent->flags & TCB_SUSPENDED) &&
- (tcp->parent->waitpid <= 0 || tcp->parent->waitpid == tcp->pid)) {
- error = resume(tcp->parent);
- ++resumed;
- }
-#ifdef TCB_CLONE_THREAD
- if (tcp->parent && tcp->parent->nclone_waiting > 0) {
- /* Some other threads of our parent are waiting too. */
- unsigned int i;
-
- /* Resume all the threads that were waiting for this PID. */
- for (i = 0; i < tcbtabsize; i++) {
- struct tcb *t = tcbtab[i];
- if (t->parent == tcp->parent && t != tcp
- && ((t->flags & (TCB_CLONE_THREAD|TCB_SUSPENDED))
- == (TCB_CLONE_THREAD|TCB_SUSPENDED))
- && t->waitpid == tcp->pid) {
- error |= resume (t);
- ++resumed;
- }
- }
- if (resumed == 0)
- /* Noone was waiting for this PID in particular,
- so now we might need to resume some wildcarders. */
- for (i = 0; i < tcbtabsize; i++) {
- struct tcb *t = tcbtab[i];
- if (t->parent == tcp->parent && t != tcp
- && ((t->flags
- & (TCB_CLONE_THREAD|TCB_SUSPENDED))
- == (TCB_CLONE_THREAD|TCB_SUSPENDED))
- && t->waitpid <= 0
- ) {
- error |= resume (t);
- break;
- }
- }
- }
+ error |= resume_from_tcp (tcp);
#endif
-#endif /* !USE_PROCFS */
-
if (!qflag)
fprintf(stderr, "Process %u detached\n", tcp->pid);
@@ -2109,6 +2117,9 @@ handle_group_exit(struct tcb *tcp, int s
"PANIC: handle_group_exit: %d leader %d\n",
tcp->pid, leader ? leader->pid : -1);
/* TCP no longer exists therefore you must not detach () it. */
+#ifndef USE_PROCFS
+ resume_from_tcp (tcp);
+#endif
droptcb(tcp); /* Already died. */
}
else {
@@ -2116,21 +2127,8 @@ handle_group_exit(struct tcb *tcp, int s
tcp->flags |= TCB_EXITING | TCB_GROUP_EXITING;
if (tcp->flags & TCB_ATTACHED) {
detach(tcp, sig);
- if (leader != NULL && leader != tcp) {
- if ((leader->flags & TCB_ATTACHED) &&
- !(leader->flags & TCB_EXITING)) {
- /* We need to detach the leader so
- that the process death will be
- reported to its real parent. */
- if (debug)
- fprintf(stderr,
- " [%d exit %d detaches %d]\n",
- tcp->pid, sig, leader->pid);
- detach(leader, sig);
- }
- else
- leader->flags |= TCB_GROUP_EXITING;
- }
+ if (leader != NULL && leader != tcp)
+ leader->flags |= TCB_GROUP_EXITING;
}
else if (ptrace(PTRACE_CONT, tcp->pid, (char *) 1, sig) < 0) {
perror("strace: ptrace(PTRACE_CONT, ...)");
@@ -2318,6 +2316,7 @@ Process %d attached (waiting for parent)
#ifdef TCB_GROUP_EXITING
&& !(tcp->parent && (tcp->parent->flags &
TCB_GROUP_EXITING))
+ && !(tcp->flags & TCB_GROUP_EXITING)
#endif
)
fprintf(stderr,
--- test/leaderkill.c 5 Jul 2007 18:43:18 -0000 1.1
+++ test/leaderkill.c 2 Aug 2007 08:38:40 -0000
@@ -13,15 +13,23 @@
#include <stdio.h>
#include <sys/wait.h>
-static void *start (void *arg)
+static void *start0 (void *arg)
{
sleep (1);
exit (42);
}
+static void *start1 (void *arg)
+{
+ pause ();
+ /* NOTREACHED */
+ assert (0);
+}
+
int main (void)
{
+ pthread_t thread0;
pthread_t thread1;
int i;
pid_t child, got_pid;
@@ -35,16 +43,13 @@ int main (void)
case -1:
abort ();
case 0:
- i = pthread_create (&thread1, NULL, start, NULL);
+ i = pthread_create (&thread0, NULL, start0, NULL);
+ assert (i == 0);
+ i = pthread_create (&thread1, NULL, start1, NULL);
assert (i == 0);
-/* Two possible testcases; the second one passed even in the older versions. */
-#if 1
pause ();
-#else
- pthread_exit (NULL);
-#endif
/* NOTREACHED */
- abort ();
+ assert (0);
break;
default:
got_pid = waitpid (child, &status, 0);
More information about the Strace-devel
mailing list