[patch] Multithreaded process left Stopped (T) on CTRL-C of strace

Jan Kratochvil jan.kratochvil at redhat.com
Thu May 24 17:02:51 UTC 2007


On Thu, 24 May 2007 07:08:42 +0200, Roland McGrath wrote:
...
> Calling detach on something with TCB_EXITING set is wrong because that
> means it has already been through detach, or else is a natural child
> (without -p) that was continued and has had or soon will have a !WIFSTOPPED
> death report seen by wait4.  All the process diddling has already been done
> and only droptcb is needed to clean up the data structure.

Probably, I miss there an assertion, stderr print could be used.


> The call you changed in handle_group_exit is also a case where it's already
> detached or dead, so detach does not make sense.

It looked so.


> I think the (leader->flags & TCB_ATTACHED) case there may also be wrong if
> leader->flags has TCB_EXITING.  It should skip the PTRACE_KILL and detach
> when leader is already TCB_EXITING.  

That PTRACE_KILL should not harm even if the leader is already dead.
But it definitely should not try to `detach(leader)' as either if it was
already TCB_EXITING or if it was alive `detach(leader)' would get locked up.
Testcase for the latter case is attached.

While I believe your intention was a changed code condition I cannot find
a counterexample failing the patched code.  In the worst case there should be
double (triple...) PTRACE_KILL.


Thanks,
Jan


Debug dumps about the PTRACE_KILL change:

Hang of the unpatched strace:
-----------------------------

strace of strace:
wait4(4294967295, [{WIFSTOPPED(s) && WSTOPSIG(s) == SIGALRM}], __WALL, NULL) = 17211
rt_sigprocmask(SIG_BLOCK, [HUP INT QUIT PIPE TERM], NULL, 8) = 0
write(2, "[pid 17211] --- SIGALRM (Alarm c"..., 50) = 50
open("/proc/17211/status", O_RDONLY)    = 3
read(3, "Name:\tselftkill\nState:\tT (tracin"..., 2048) = 720
close(3)                                = 0
ptrace(PTRACE_KILL, 17203, 0, 0)        = 0
--- SIGCHLD (Child exited) @ 0 (0) ---
ptrace(PTRACE_DETACH, 17203, 0x1, SIGALRM) = -1 ESRCH (No such process)
kill(17203, SIG_0)                      = 0
kill(17203, SIGSTOP)                    = 0
wait4(17203,

strace itself:
[pid 17211] tgkill(17203, 17211, SIGALRM) = 0
[pid 17211] --- SIGALRM (Alarm clock) @ 0 (0) ---
HANG

10177 pts/15   S      0:22 -bash
17203 pts/15   Zl+    0:00  \_ [selftkill] <defunct>

UID        PID  PPID   LWP  C NLWP STIME TTY          TIME CMD
jkratoch 17203 10177 17203  0    2 17:37 pts/15   00:00:00 [selftkill] <defunct>
jkratoch 17203 10177 17211  0    2 17:37 pts/15   00:00:00 [selftkill] <defunct>


Pass of the patched strace:
---------------------------

strace of strace:
wait4(4294967295, [{WIFSTOPPED(s) && WSTOPSIG(s) == SIGALRM}], __WALL, NULL) = 17498
rt_sigprocmask(SIG_BLOCK, [HUP INT QUIT PIPE TERM], NULL, 8) = 0
write(2, "[pid 17498] --- SIGALRM (Alarm c"..., 50) = 50
open("/proc/17498/status", O_RDONLY)    = 3
read(3, "Name:\tselftkill\nState:\tT (tracin"..., 2048) = 720
close(3)                                = 0
ptrace(PTRACE_KILL, 17490, 0, 0)        = 0
--- SIGCHLD (Child exited) @ 0 (0) ---
ptrace(PTRACE_DETACH, 17498, 0x1, SIGALRM) = -1 ESRCH (No such process)
kill(17498, SIG_0)                      = 0
kill(17498, SIGSTOP)                    = 0
wait4(17498, [{WIFSIGNALED(s) && WTERMSIG(s) == SIGKILL}], __WALL, NULL) = 17498
--- SIGCHLD (Child exited) @ 0 (0) ---
write(2, "Process 17498 detached\n", 23) = 23
exit_group(0)                           = ?

strace itself:
[pid 17498] tgkill(17490, 17498, SIGALRM) = 0
[pid 17498] --- SIGALRM (Alarm clock) @ 0 (0) ---
Process 17498 detached

inferior:
$ ./selftkill
17490
17498
Killed
$ _
-------------- next part --------------
2007-05-24  Jan Kratochvil  <jan.kratochvil at redhat.com>
	    Roland McGrath  <roland at redhat.com>

	* strace.c (detach): New prototype.
	[LINUX] (detach): Extended function comment.
	Call droptcb() instead of the wrong parametrized detach() call.
	(handle_group_exit): Call droptcb() instead of the wrong parametrized
	detach() call and instead of the inappropriate detach() call 

diff -u -rup strace-4.5.15-orig/strace.c strace-4.5.15-detach-zombie/strace.c
--- strace-4.5.15-orig/strace.c	2006-12-13 22:55:39.000000000 +0100
+++ strace-4.5.15-detach-zombie/strace.c	2007-05-24 18:11:05.000000000 +0200
@@ -83,6 +83,7 @@ unsigned int nprocs, tcbtabsize;
 char *progname;
 extern char **environ;
 
+static int detach P((struct tcb *tcp, int sig));
 static int trace P((void));
 static void cleanup P((void));
 static void interrupt P((int sig));
@@ -1282,7 +1283,10 @@ struct tcb *tcp;
 
 #endif /* !USE_PROCFS */
 
-/* detach traced process; continue with sig */
+/* detach traced process; continue with sig
+   Never call DETACH twice on the same process as both unattached and
+   attached-unstopped processes give the same ESRCH.  For unattached process we
+   would SIGSTOP it and wait for its SIGSTOP notification forever.  */
 
 static int
 detach(tcp, sig)
@@ -1460,8 +1464,10 @@ int sig;
 	droptcb(tcp);
 
 #ifdef LINUX
-	if (zombie != NULL)
-		error = detach(zombie) || error;
+	if (zombie != NULL) {
+		/* TCP no longer exists therefore you must not detach () it.  */
+		droptcb(zombie);
+	}
 #endif
 
 	return error;
@@ -2035,7 +2041,8 @@ handle_group_exit(struct tcb *tcp, int s
 			fprintf(stderr,
 				"PANIC: handle_group_exit: %d leader %d\n",
 				tcp->pid, leader ? leader->pid : -1);
-		detach(tcp);	/* Already died.  */
+		/* TCP no longer exists therefore you must not detach () it.  */
+		droptcb(tcp);	/* Already died.  */
 	}
 	else {
 		/* Mark that we are taking the process down.  */
@@ -2060,7 +2067,9 @@ handle_group_exit(struct tcb *tcp, int s
 						fprintf(stderr,
 							" [%d exit %d kills %d]\n",
 							tcp->pid, sig, leader->pid);
-					detach(leader, sig);
+					/* LEADER no longer exists therefore you
+					   must not detach () it.  */
+					droptcb(leader);
 				}
 				else
 					leader->flags |= TCB_GROUP_EXITING;
-------------- next part --------------
#include <pthread.h>
#include <assert.h>
#include <stdio.h>
#include <signal.h>
#include <stdlib.h>

#include <asm/unistd.h>
#include <unistd.h>
#define tgkill(pid, tid, sig) syscall (__NR_tgkill, (pid), (tid), (sig))
#define gettid() syscall (__NR_gettid)

static pid_t parent;

void *start (void *arg)
{
  assert (getpid () == parent);
  assert (getpid () != gettid ());
  printf ("%d\n", (int) gettid ());

  assert (signal (SIGALRM, SIG_DFL) == SIG_DFL);
  tgkill (getpid (), gettid (), SIGALRM);
  /* NOTREACHED */
  abort ();
}

int main (void)
{
  pthread_t thread_id;
  int i;

  sleep (5);

  parent = getpid ();
  assert (getpid () == gettid ());
  printf("%d\n", (int) getpid ());

  i = pthread_create (&thread_id, NULL, start, NULL);
  assert (i == 0);
  i = pthread_join (thread_id, NULL);
  /* NOTREACHED */
  abort ();
  assert (i == 0);

  return EXIT_SUCCESS;
}


More information about the Strace-devel mailing list