bugfix for strace for less-aligned kernel memory

Bolo (Josef Burger) bolo at cs.wisc.edu
Thu Mar 19 20:48:35 UTC 2015


This bugfix / patch consists of two portions

1) A 'Z' option which forces strace to use ptrace() memory access
routines all the time instead of vm-based memory access routines if
they are present..  This allows a fallback if you find that VM based
requests are failing or producing incorrect output.   The existing
ptrace based methods are quite robust and tested :-)

2) A correction for a long-term bug in th VM based memory access
method of strace.  This bug has been in strace in various forms since
at least 4.5.19.

The ptrace based code is correct.

What happens is that the vm based access code in strace doesn't deal
correctly with alignments of data in the kernel.   It assumes that
everything is at a certain alignment.  This is an incorrrect assumption;
when there are a lot of objects the kernel starts putting string data at
lesser aligned  addresses.   I discovered this through debugging
of strace failures and finding the kernel relaxing alignment constraints.
This is true of older (such as rhel 5.9 and newer such as rhel -6.6 kernels).

There was also an issue with the code not treating page boundaries
correctly due to incorrect page arithmetic.

When this alignment constraint is relaxed, or strace incorrectly issues
reads across page boundaries, strace fails with a error of

	"umovestr: short read (%d < %d) @0x%lx"

due to the incorect code.  In addition, since the vm-based code doesn't
correctly update the address and lengths of the region to be accessed,
the fallback code -- which is implemented correctly, fails to work.

The new code also deals with arbitrary page sizes correctly in extracting
data using the vm mode, instead of relying upon a 4k pagesize.


3) Reproducing this error depends upon where the kernel is putting items
in memory, and may also be based on the load of the kernel.   As such,
this error can be difficult to reproduce.  The project I'm working with
uses strace a lot -- we easily run 100000+ big straces a year,
so we happen to run into it often enough to notice.

However, if you run strace on something complex as a software build
and request a lot of output, you will often run into this error.  We've
experienced it on the major releases: debian, rhel, ubuntu,  and
scientific (rhel).

	download-some-large-software-package.tar.gz
	tar xfz large-software-package.tar.gz
	cd large-software-package.tar.gz
	./configure
	strace -f -v -ttt -s 500000 -o strace.out make >& made.out

will eventually result in the short read error.


Bolo -- Josef T. Burger

------------------------- diff against strace-4.10 -----------------------
diff -cr strace-4.10/defs.h strace-4.10-swamp/defs.h
*** strace-4.10/defs.h	Thu Mar  5 20:19:01 2015
--- strace-4.10-swamp/defs.h	Wed Mar 11 15:40:05 2015
***************
*** 630,632 ****
--- 630,635 ----
  /* Only ensures that sysent[scno] isn't out of range */
  #define SCNO_IN_RANGE(scno) \
  	((unsigned long)(scno) < nsyscalls)
+ 
+ unsigned long get_pagesize();
+ extern	bool	process_vm_readv_not_supported;
diff -cr strace-4.10/mem.c strace-4.10-swamp/mem.c
*** strace-4.10/mem.c	Wed Jan 14 01:17:11 2015
--- strace-4.10-swamp/mem.c	Wed Mar 11 15:40:07 2015
***************
*** 34,40 ****
  #include <asm/mman.h>
  #include <sys/mman.h>
  
! static unsigned long
  get_pagesize()
  {
  	static unsigned long pagesize;
--- 34,40 ----
  #include <asm/mman.h>
  #include <sys/mman.h>
  
! unsigned long
  get_pagesize()
  {
  	static unsigned long pagesize;
diff -cr strace-4.10/strace.1 strace-4.10-swamp/strace.1
*** strace-4.10/strace.1	Tue Mar  3 18:56:33 2015
--- strace-4.10-swamp/strace.1	Wed Mar 11 15:40:09 2015
***************
*** 327,332 ****
--- 327,335 ----
  .B \-yy
  Print ip:port pairs associated with socket file descriptors.
  .TP
+ .B \-Z
+ Force use of ptrace when process_vm_readv() is available.
+ .TP
  .BI "\-a " column
  Align return values in a specific column (default column 40).
  .TP
diff -cr strace-4.10/strace.c strace-4.10-swamp/strace.c
*** strace-4.10/strace.c	Sat Feb 28 08:50:09 2015
--- strace-4.10-swamp/strace.c	Thu Mar 12 15:30:01 2015
***************
*** 1469,1474 ****
--- 1469,1475 ----
  		"k"
  #endif
  		"D"
+ 		"Z"
  		"a:e:o:O:p:s:S:u:E:P:I:")) != EOF) {
  		switch (c) {
  		case 'b':
***************
*** 1538,1543 ****
--- 1539,1547 ----
  		case 'z':
  			not_failing_only = 1;
  			break;
+ 		case 'Z':
+ 			process_vm_readv_not_supported++;
+ 			break;
  		case 'a':
  			acolumn = string_to_uint(optarg);
  			if (acolumn < 0)
diff -cr strace-4.10/util.c strace-4.10-swamp/util.c
*** strace-4.10/util.c	Sat Feb 28 06:20:21 2015
--- strace-4.10-swamp/util.c	Thu Mar 12 15:35:23 2015
***************
*** 926,932 ****
  
  #ifdef HAVE_PROCESS_VM_READV
  /* C library supports this, but the kernel might not. */
! static bool process_vm_readv_not_supported = 0;
  #else
  
  /* Need to do this since process_vm_readv() is not yet available in libc.
--- 926,932 ----
  
  #ifdef HAVE_PROCESS_VM_READV
  /* C library supports this, but the kernel might not. */
! bool process_vm_readv_not_supported = 0;
  #else
  
  /* Need to do this since process_vm_readv() is not yet available in libc.
***************
*** 944,950 ****
  #endif
  
  #if defined(__NR_process_vm_readv)
! static bool process_vm_readv_not_supported = 0;
  /* Have to avoid duplicating with the C library headers. */
  static ssize_t strace_process_vm_readv(pid_t pid,
  		 const struct iovec *lvec,
--- 944,950 ----
  #endif
  
  #if defined(__NR_process_vm_readv)
! bool process_vm_readv_not_supported = 0;
  /* Have to avoid duplicating with the C library headers. */
  static ssize_t strace_process_vm_readv(pid_t pid,
  		 const struct iovec *lvec,
***************
*** 957,963 ****
  }
  #define process_vm_readv strace_process_vm_readv
  #else
! static bool process_vm_readv_not_supported = 1;
  # define process_vm_readv(...) (errno = ENOSYS, -1)
  #endif
  
--- 957,963 ----
  }
  #define process_vm_readv strace_process_vm_readv
  #else
! bool process_vm_readv_not_supported = 1;
  # define process_vm_readv(...) (errno = ENOSYS, -1)
  #endif
  
***************
*** 1117,1125 ****
  	nread = 0;
  	if (!process_vm_readv_not_supported) {
  		struct iovec local[1], remote[1];
! 
! 		local[0].iov_base = laddr;
! 		remote[0].iov_base = (void*)addr;
  
  		while (len > 0) {
  			unsigned int chunk_len;
--- 1117,1129 ----
  	nread = 0;
  	if (!process_vm_readv_not_supported) {
  		struct iovec local[1], remote[1];
! 		const unsigned long pagemask = get_pagesize() - 1;
! 		/* Don't read kilobytes: most strings are short */
! 		unsigned int	max_chunk_len = 256;
! 
! 		/* unlikely, but it will work correctly */
! 		if (max_chunk_len > get_pagesize())
! 			max_chunk_len = get_pagesize();
  
  		while (len > 0) {
  			unsigned int chunk_len;
***************
*** 1128,1153 ****
  
  			/* Don't read kilobytes: most strings are short */
  			chunk_len = len;
! 			if (chunk_len > 256)
! 				chunk_len = 256;
! 			/* Don't cross pages. I guess otherwise we can get EFAULT
  			 * and fail to notice that terminating NUL lies
  			 * in the existing (first) page.
  			 * (I hope there aren't arches with pages < 4K)
  			 */
! 			end_in_page = ((addr + chunk_len) & 4095);
! 			if (chunk_len > end_in_page) /* crosses to the next page */
! 				chunk_len -= end_in_page;
  
  			local[0].iov_len = remote[0].iov_len = chunk_len;
  			r = process_vm_readv(pid, local, 1, remote, 1, 0);
  			if (r > 0) {
  				if (memchr(local[0].iov_base, '\0', r))
  					return 1;
! 				local[0].iov_base += r;
! 				remote[0].iov_base += r;
! 				len -= r;
  				nread += r;
  				continue;
  			}
  			switch (errno) {
--- 1132,1164 ----
  
  			/* Don't read kilobytes: most strings are short */
  			chunk_len = len;
! 			if (chunk_len > max_chunk_len)
! 				chunk_len = max_chunk_len;
! 			/*
! 			 * Don't cross pages.
! 			 * EFAULT when a read crosses a page boundary. 
  			 * and fail to notice that terminating NUL lies
  			 * in the existing (first) page.
  			 * (I hope there aren't arches with pages < 4K)
  			 */
! 			end_in_page = ((addr + chunk_len) & pagemask);
! 			r = chunk_len - end_in_page;
! 			if (r > 0) /* if chunk_len > end_in_page */
! 				chunk_len = r; /* chunk_len -= end_in_page */
  
  			local[0].iov_len = remote[0].iov_len = chunk_len;
+ 			local[0].iov_base = laddr;
+ 			remote[0].iov_base = (void *) addr;
  			r = process_vm_readv(pid, local, 1, remote, 1, 0);
  			if (r > 0) {
  				if (memchr(local[0].iov_base, '\0', r))
  					return 1;
! 
! 				/* keep all the data updated properly */
! 				addr += r;
! 				laddr += r;
  				nread += r;
+ 				len -= r;
  				continue;
  			}
  			switch (errno) {




More information about the Strace-devel mailing list