[PATCH/RFC] strace: use process_vm_readv instead of PTRACE_PEEKDATA to read data blocks
Denys Vlasenko
dvlasenk at redhat.com
Fri Jan 20 11:24:07 UTC 2012
Currently, we use PTRACE_PEEKDATA to read things like filenames and
data passed by I/O syscalls.
PTRACE_PEEKDATA gets one word per syscall. This is VERY expensive.
For example, in order to print fstat syscall, we need to perform this:
write(3, "fstat64(1, ", 11) = 11
ptrace(PTRACE_SYSCALL, 3983, 0x1, SIG_0) = 0
--- {si_signo=SIGCHLD, si_code=CLD_TRAPPED, si_pid=3983, si_status=SIGTRAP, si_utime=0, si_stime=0} (Child exited) ---
wait4(-1, [{WIFSTOPPED(s) && WSTOPSIG(s) == 133}], __WALL, NULL) = 3983
ptrace(PTRACE_GETREGS, 3983, 0, 0x80885a0) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ecc, [0xa]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ed0, [0]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ed4, [0]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ed8, [0x3]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1edc, [0x2190]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ee0, [0x1]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ee4, [0]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ee8, [0x5]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1eec, [0x8800]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ef0, [0]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ef4, [0]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ef8, [0]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1efc, [0]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f00, [0x400]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f04, [0]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f08, [0]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f0c, [0x4f192228]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f10, [0x18f0e6ba]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f14, [0x4f194c1f]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f18, [0x1b365bd4]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f1c, [0x4f192153]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f20, [0x246674e9]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f24, [0x3]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f28, [0]) = 0
write(3, "{st_mode=S_IFCHR|0620, st_rdev=m"..., 58) = 58
Twenty four trips into kernel to fetch one struct stat!
Kernel just got a new syscall, process_vm_readv(), which can be used to
copy data out of process' address space. Looks like it's ideal
for this goal.
This is a patch which uses process_vm_readv() in umoven() and umovestr()
functions if possible, with fallback to old method if process_vm_readv()
returns ENOSYS.
There is a slight change in API: since I read data in blocks of 256 bytes,
now umovestr() may overwrite data in the buffer past terminating NUL.
One caller used to depend on guarantee that this never happens, but
I just committed a small patch which reworks that place to remove
this assumption.
I don't have a kernel where process_vm_readv() syscall actually works,
so the patch definitely needs to wait until it can be tested for real.
So far I only tested that on ENOSYS it indeed falls back to old code
and everything continues to work :)
What do you guys think about it?
--
vda
diff -d -urpN strace.5/linux/i386/syscallent.h strace.6/linux/i386/syscallent.h
--- strace.5/linux/i386/syscallent.h 2012-01-04 15:09:05.000000000 +0100
+++ strace.6/linux/i386/syscallent.h 2012-01-20 12:01:46.824097896 +0100
@@ -377,8 +377,8 @@
{ 1, TD, sys_syncfs, "syncfs" }, /* 344 */
{ 4, TN, sys_sendmmsg, "sendmmsg" }, /* 345 */
{ 2, TD, sys_setns, "setns" }, /* 346 */
- { 5, 0, printargs, "SYS_347" }, /* 347 */
- { 5, 0, printargs, "SYS_348" }, /* 348 */
+ { 6, 0, printargs, "process_vm_readv" }, /* 347 */
+ { 6, 0, printargs, "process_vm_writev" }, /* 348 */
{ 5, 0, printargs, "SYS_349" }, /* 349 */
{ 5, 0, printargs, "SYS_350" }, /* 350 */
{ 5, 0, printargs, "SYS_351" }, /* 351 */
diff -d -urpN strace.5/linux/powerpc/syscallent.h strace.6/linux/powerpc/syscallent.h
--- strace.5/linux/powerpc/syscallent.h 2012-01-10 16:48:13.000000000 +0100
+++ strace.6/linux/powerpc/syscallent.h 2012-01-20 12:01:46.824097896 +0100
@@ -379,8 +379,8 @@
{ 1, TD, sys_syncfs, "syncfs" }, /* 348 */
{ 4, TN, sys_sendmmsg, "sendmmsg" }, /* 349 */
{ 2, TD, sys_setns, "setns" }, /* 350 */
- { 5, 0, printargs, "SYS_351" }, /* 351 */
- { 5, 0, printargs, "SYS_352" }, /* 352 */
+ { 6, 0, printargs, "process_vm_readv" }, /* 351 */
+ { 6, 0, printargs, "process_vm_writev" }, /* 352 */
{ 5, 0, printargs, "SYS_353" }, /* 353 */
{ 5, 0, printargs, "SYS_354" }, /* 354 */
{ 5, 0, printargs, "SYS_355" }, /* 355 */
diff -d -urpN strace.5/linux/x86_64/syscallent.h strace.6/linux/x86_64/syscallent.h
--- strace.5/linux/x86_64/syscallent.h 2012-01-04 15:09:05.000000000 +0100
+++ strace.6/linux/x86_64/syscallent.h 2012-01-20 12:01:46.824097896 +0100
@@ -308,3 +308,5 @@
{ 4, TN, sys_sendmmsg, "sendmmsg" }, /* 307 */
{ 2, TD, sys_setns, "setns" }, /* 308 */
{ 3, 0, sys_getcpu, "getcpu" }, /* 309 */
+ { 6, 0, printargs, "process_vm_readv" }, /* 310 */
+ { 6, 0, printargs, "process_vm_writev" }, /* 311 */
diff -d -urpN strace.5/util.c strace.6/util.c
--- strace.5/util.c 2012-01-20 11:55:31.000000000 +0100
+++ strace.6/util.c 2012-01-20 12:08:23.222153067 +0100
@@ -769,6 +769,35 @@ dumpstr(struct tcb *tcp, long addr, int
}
}
+
+
+#if !defined(__NR_process_vm_readv)
+# if defined(I386)
+# define __NR_process_vm_readv 347
+# define __NR_process_vm_writev 348
+# elif defined(X86_64)
+# define __NR_process_vm_readv 310
+# define __NR_process_vm_writev 311
+# endif
+#endif
+
+#if defined(__NR_process_vm_readv)
+static bool process_vm_readv_not_supported = 0;
+static ssize_t process_vm_readv(pid_t pid,
+ const struct iovec *lvec,
+ unsigned long liovcnt,
+ const struct iovec *rvec,
+ unsigned long riovcnt,
+ unsigned long flags)
+{
+ return syscall(__NR_process_vm_readv, (long)pid, lvec, liovcnt, rvec, riovcnt, flags);
+}
+#else
+static bool process_vm_readv_not_supported = 1;
+# define process_vm_readv(...) (errno = ENOSYS, -1)
+#endif
+
+
#define PAGMASK (~(PAGSIZ - 1))
/*
* move `len' bytes of data from process `pid'
@@ -786,6 +815,29 @@ umoven(struct tcb *tcp, long addr, int l
char x[sizeof(long)];
} u;
+ if (!process_vm_readv_not_supported) {
+ struct iovec local[1], remote[1];
+ int r;
+
+ local[0].iov_base = laddr;
+ remote[0].iov_base = (void*)addr;
+ local[0].iov_len = remote[0].iov_len = len;
+ r = process_vm_readv(pid,
+ local, 1,
+ remote, 1,
+ /*flags:*/ 0
+ );
+ if (r < 0) {
+ if (errno == ENOSYS) {
+ process_vm_readv_not_supported = 1;
+ goto vm_readv_not_supported;
+ }
+ perror("process_vm_readv");
+ }
+ return r;
+ }
+ vm_readv_not_supported:
+
#if SUPPORTED_PERSONALITIES > 1
if (personality_wordsize[current_personality] < sizeof(addr))
addr &= (1ul << 8 * personality_wordsize[current_personality]) - 1;
@@ -921,6 +973,55 @@ umovestr(struct tcb *tcp, long addr, int
addr &= (1ul << 8 * personality_wordsize[current_personality]) - 1;
#endif
+ if (!process_vm_readv_not_supported) {
+ struct iovec local[1], remote[1];
+
+ local[0].iov_base = laddr;
+ remote[0].iov_base = (void*)addr;
+
+ while (len > 0) {
+ int end_in_page;
+ int r;
+ int chunk_len;
+
+ /* Don't read kilobytes: most strings are short */
+ chunk_len = len;
+ if (chunk_len > 256)
+ chunk_len = 256;
+ /* Don't cross pages. I guess otherwise we can get EFAULT
+ * and fail to notice that terminating NUL lies
+ * in the existing (first) page.
+ * (I hope there aren't arches with pages < 4K)
+ */
+ end_in_page = ((addr + chunk_len) & 4095);
+ r = chunk_len - end_in_page;
+ if (r > 0) /* if chunk_len > end_in_page */
+ chunk_len = r; /* chunk_len -= end_in_page */
+
+ local[0].iov_len = remote[0].iov_len = chunk_len;
+ r = process_vm_readv(pid,
+ local, 1,
+ remote, 1,
+ /*flags:*/ 0
+ );
+ if (r < 0) {
+ if (errno == ENOSYS) {
+ process_vm_readv_not_supported = 1;
+ goto vm_readv_not_supported;
+ }
+ perror("process_vm_readv");
+ return -1;
+ }
+ if (memchr(local[0].iov_base, 0, r))
+ return 1;
+ local[0].iov_base += r;
+ remote[0].iov_base += r;
+ len -= r;
+ }
+ return 0;
+ }
+ vm_readv_not_supported:
+
if (addr & (sizeof(long) - 1)) {
/* addr not a multiple of sizeof(long) */
n = addr - (addr & -sizeof(long)); /* residue */
More information about the Strace-devel
mailing list