[PATCH/RFC] strace: use process_vm_readv instead of PTRACE_PEEKDATA to read data blocks

Denys Vlasenko dvlasenk at redhat.com
Fri Jan 20 11:24:07 UTC 2012


Currently, we use PTRACE_PEEKDATA to read things like filenames and
data passed by I/O syscalls.
PTRACE_PEEKDATA gets one word per syscall. This is VERY expensive.
For example, in order to print fstat syscall, we need to perform this:

write(3, "fstat64(1, ", 11)             = 11
ptrace(PTRACE_SYSCALL, 3983, 0x1, SIG_0) = 0
--- {si_signo=SIGCHLD, si_code=CLD_TRAPPED, si_pid=3983, si_status=SIGTRAP, si_utime=0, si_stime=0} (Child exited) ---
wait4(-1, [{WIFSTOPPED(s) && WSTOPSIG(s) == 133}], __WALL, NULL) = 3983
ptrace(PTRACE_GETREGS, 3983, 0, 0x80885a0) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ecc, [0xa]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ed0, [0]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ed4, [0]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ed8, [0x3]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1edc, [0x2190]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ee0, [0x1]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ee4, [0]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ee8, [0x5]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1eec, [0x8800]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ef0, [0]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ef4, [0]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ef8, [0]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1efc, [0]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f00, [0x400]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f04, [0]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f08, [0]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f0c, [0x4f192228]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f10, [0x18f0e6ba]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f14, [0x4f194c1f]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f18, [0x1b365bd4]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f1c, [0x4f192153]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f20, [0x246674e9]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f24, [0x3]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f28, [0]) = 0
write(3, "{st_mode=S_IFCHR|0620, st_rdev=m"..., 58) = 58

Twenty four trips into kernel to fetch one struct stat!

Kernel just got a new syscall, process_vm_readv(), which can be used to
copy data out of process' address space. Looks like it's ideal
for this goal.

This is a patch which uses process_vm_readv() in umoven() and umovestr()
functions if possible, with fallback to old method if process_vm_readv()
returns ENOSYS.

There is a slight change in API: since I read data in blocks of 256 bytes,
now umovestr() may overwrite data in the buffer past terminating NUL.
One caller used to depend on guarantee that this never happens, but
I just committed a small patch which reworks that place to remove
this assumption.

I don't have a kernel where process_vm_readv() syscall actually works,
so the patch definitely needs to wait until it can be tested for real.
So far I only tested that on ENOSYS it indeed falls back to old code
and everything continues to work :)

What do you guys think about it?

-- 
vda


diff -d -urpN strace.5/linux/i386/syscallent.h strace.6/linux/i386/syscallent.h
--- strace.5/linux/i386/syscallent.h	2012-01-04 15:09:05.000000000 +0100
+++ strace.6/linux/i386/syscallent.h	2012-01-20 12:01:46.824097896 +0100
@@ -377,8 +377,8 @@
  	{ 1,	TD,	sys_syncfs,		"syncfs"	}, /* 344 */
  	{ 4,	TN,	sys_sendmmsg,		"sendmmsg"	}, /* 345 */
  	{ 2,	TD,	sys_setns,		"setns"		}, /* 346 */
-	{ 5,	0,	printargs,		"SYS_347"	}, /* 347 */
-	{ 5,	0,	printargs,		"SYS_348"	}, /* 348 */
+	{ 6,	0,	printargs,		"process_vm_readv"	}, /* 347 */
+	{ 6,	0,	printargs,		"process_vm_writev"	}, /* 348 */
  	{ 5,	0,	printargs,		"SYS_349"	}, /* 349 */
  	{ 5,	0,	printargs,		"SYS_350"	}, /* 350 */
  	{ 5,	0,	printargs,		"SYS_351"	}, /* 351 */
diff -d -urpN strace.5/linux/powerpc/syscallent.h strace.6/linux/powerpc/syscallent.h
--- strace.5/linux/powerpc/syscallent.h	2012-01-10 16:48:13.000000000 +0100
+++ strace.6/linux/powerpc/syscallent.h	2012-01-20 12:01:46.824097896 +0100
@@ -379,8 +379,8 @@
  	{ 1,	TD,	sys_syncfs,		"syncfs"		}, /* 348 */
  	{ 4,	TN,	sys_sendmmsg,		"sendmmsg"		}, /* 349 */
  	{ 2,	TD,	sys_setns,		"setns"			}, /* 350 */
-	{ 5,	0,	printargs,		"SYS_351"		}, /* 351 */
-	{ 5,	0,	printargs,		"SYS_352"		}, /* 352 */
+	{ 6,	0,	printargs,		"process_vm_readv"	}, /* 351 */
+	{ 6,	0,	printargs,		"process_vm_writev"	}, /* 352 */
  	{ 5,	0,	printargs,		"SYS_353"		}, /* 353 */
  	{ 5,	0,	printargs,		"SYS_354"		}, /* 354 */
  	{ 5,	0,	printargs,		"SYS_355"		}, /* 355 */
diff -d -urpN strace.5/linux/x86_64/syscallent.h strace.6/linux/x86_64/syscallent.h
--- strace.5/linux/x86_64/syscallent.h	2012-01-04 15:09:05.000000000 +0100
+++ strace.6/linux/x86_64/syscallent.h	2012-01-20 12:01:46.824097896 +0100
@@ -308,3 +308,5 @@
  	{ 4,	TN,	sys_sendmmsg,		"sendmmsg"	}, /* 307 */
  	{ 2,	TD,	sys_setns,		"setns"		}, /* 308 */
  	{ 3,	0,	sys_getcpu,		"getcpu"	}, /* 309 */
+	{ 6,	0,	printargs,		"process_vm_readv"	}, /* 310 */
+	{ 6,	0,	printargs,		"process_vm_writev"	}, /* 311 */
diff -d -urpN strace.5/util.c strace.6/util.c
--- strace.5/util.c	2012-01-20 11:55:31.000000000 +0100
+++ strace.6/util.c	2012-01-20 12:08:23.222153067 +0100
@@ -769,6 +769,35 @@ dumpstr(struct tcb *tcp, long addr, int
  	}
  }

+
+
+#if !defined(__NR_process_vm_readv)
+# if defined(I386)
+#  define __NR_process_vm_readv  347
+#  define __NR_process_vm_writev 348
+# elif defined(X86_64)
+#  define __NR_process_vm_readv  310
+#  define __NR_process_vm_writev 311
+# endif
+#endif
+
+#if defined(__NR_process_vm_readv)
+static bool process_vm_readv_not_supported = 0;
+static ssize_t process_vm_readv(pid_t pid,
+		 const struct iovec *lvec,
+		 unsigned long liovcnt,
+		 const struct iovec *rvec,
+		 unsigned long riovcnt,
+		 unsigned long flags)
+{
+	return syscall(__NR_process_vm_readv, (long)pid, lvec, liovcnt, rvec, riovcnt, flags);
+}
+#else
+static bool process_vm_readv_not_supported = 1;
+# define process_vm_readv(...) (errno = ENOSYS, -1)
+#endif
+
+
  #define PAGMASK	(~(PAGSIZ - 1))
  /*
   * move `len' bytes of data from process `pid'
@@ -786,6 +815,29 @@ umoven(struct tcb *tcp, long addr, int l
  		char x[sizeof(long)];
  	} u;

+	if (!process_vm_readv_not_supported) {
+		struct iovec local[1], remote[1];
+		int r;
+
+		local[0].iov_base = laddr;
+		remote[0].iov_base = (void*)addr;
+		local[0].iov_len = remote[0].iov_len = len;
+		r = process_vm_readv(pid,
+				local, 1,
+				remote, 1,
+				/*flags:*/ 0
+		);
+		if (r < 0) {
+			if (errno == ENOSYS) {
+				process_vm_readv_not_supported = 1;
+				goto vm_readv_not_supported;
+			}
+			perror("process_vm_readv");
+		}
+		return r;
+	}
+ vm_readv_not_supported:
+
  #if SUPPORTED_PERSONALITIES > 1
  	if (personality_wordsize[current_personality] < sizeof(addr))
  		addr &= (1ul << 8 * personality_wordsize[current_personality]) - 1;
@@ -921,6 +973,55 @@ umovestr(struct tcb *tcp, long addr, int
  		addr &= (1ul << 8 * personality_wordsize[current_personality]) - 1;
  #endif

+	if (!process_vm_readv_not_supported) {
+		struct iovec local[1], remote[1];
+
+		local[0].iov_base = laddr;
+		remote[0].iov_base = (void*)addr;
+
+		while (len > 0) {
+			int end_in_page;
+			int r;
+			int chunk_len;
+
+			/* Don't read kilobytes: most strings are short */
+			chunk_len = len;
+			if (chunk_len > 256)
+				chunk_len = 256;
+			/* Don't cross pages. I guess otherwise we can get EFAULT
+			 * and fail to notice that terminating NUL lies
+			 * in the existing (first) page.
+			 * (I hope there aren't arches with pages < 4K)
+			 */
+			end_in_page = ((addr + chunk_len) & 4095);
+			r = chunk_len - end_in_page;
+			if (r > 0) /* if chunk_len > end_in_page */
+				chunk_len = r; /* chunk_len -= end_in_page */
+
+			local[0].iov_len = remote[0].iov_len = chunk_len;
+			r = process_vm_readv(pid,
+					local, 1,
+					remote, 1,
+					/*flags:*/ 0
+			);
+			if (r < 0) {
+				if (errno == ENOSYS) {
+					process_vm_readv_not_supported = 1;
+					goto vm_readv_not_supported;
+				}
+				perror("process_vm_readv");
+				return -1;
+			}
+			if (memchr(local[0].iov_base, 0, r))
+				return 1;
+			local[0].iov_base += r;
+			remote[0].iov_base += r;
+			len -= r;
+		}
+		return 0;
+	}
+ vm_readv_not_supported:
+
  	if (addr & (sizeof(long) - 1)) {
  		/* addr not a multiple of sizeof(long) */
  		n = addr - (addr & -sizeof(long)); /* residue */




More information about the Strace-devel mailing list