[PATCH 0/8] mmap_cache subsystem (re-factoring)

Masatake YAMATO yamato at redhat.com
Tue Apr 24 17:34:32 UTC 2018


On Tue, 24 Apr 2018 16:56:34 +0300, "Dmitry V. Levin" <ldv at altlinux.org> wrote:
> On Tue, Apr 24, 2018 at 02:25:14PM +0900, Masatake YAMATO wrote:
>> > On Tue, Feb 27, 2018 at 02:30:06AM +0300, Dmitry V. Levin wrote:
>> >> On Sat, Jan 27, 2018 at 05:36:36AM +0900, Masatake YAMATO wrote:
>> >> > When we added unwinding feature (activated with -k option), we also added
>> >> > a code for caching the entries of /proc/$pid/maps.
>> >> > 
>> >> > The code for caching was tightly integrated with the code for
>> >> > unwinding. However, while prototyping kvm vcpu state dump feature to
>> >> > strace[1], I found the code for caching is useful for other purposes
>> >> > than unwinding feature.
>> >> > 
>> >> > This patch set makes the code for caching usable independently of
>> >> > unwinding feature. In the changes, I call the newly separated
>> >> > code for caching "mmap_cache subsystem".
>> >> > 
>> >> > [1] https://marc.info/?l=kvm&m=151531408406144&w=2
>> >> > 
>> >> > Masatake YAMATO (8):
>> >> >   unwind: lift up unw_flush_cache from mmap cache management code
>> >> >   Introduce mmap_cache subsystem derived from unwind.c
>> >> >   mmap_cache: Move code for searching an mmap cache from unwind
>> >> >   mmap_cache: record protection bits
>> >> >   Lift up mmap_cache_delete invocation from unwind.c
>> >> >   mmap_cache: add function to enable mmap_cache
>> >> >   mmap_cache: record device major and minor numbers
>> >> >   mmap_cache: add customizable search function
>> >> 
>> >> Thanks, I'll merge the first 7 of them, with minor corrections
>> >> (mostly related to commit messages) and a single build fix.
>> > 
>> > With introduction of libdw-based unwinder this mmap_cache subsystem looks
>> > very odd: despite the fact that libdw-based unwinder does not use mmap_cache,
>> > its use is forced by unwind.c for no visible purpose.
>> 
>> mmap_cache calls tcb_flush_cache (and dwfl_linux_proc_report) when
>> the memory mapping of a target process is changed.
> 
> Well, it's not mmap_cache.c but unwind.c that calls tcb_flush_cache.
> Generic unwind uses just the return code of mmap_cache_rebuild_if_invalid.

Oh, I see. What unwind-libdw needs is tcp->mmap_cache_generation and
mmap_cache.c::mmap_cache_generation. How about introducing a function
that allows the libdw unwinder to access (or compare) mmap_cache.c::mmap_cache_generation?

> mmap_cache_rebuild_if_invalid calls build_mmap_cache but the mmap cache
> it builds is not used because the sole user of mmap_cache_search is
> unwind-libunwind.c, all the rest don't need any mmap cache any more.

The reason I introduced mmap_cache is not unwinding but kvm vcpu status
decoding.

See https://www.spinics.net/lists/kvm/msg161468.html
I described the idea on that page.

The status of a kvm vcpu is exposed via the memory where the vcpu's fd is
mmap'ed. The code I am testing prints the reason a vcpu exited as auxstr if
the -K option is given. If -KK is given, strace dumps the whole memory
area with decoding, like:

    # ./strace -KK -f -p 617 2>&1 | grep 'KVM_RUN\| K'
    ...
    [pid   664] ioctl(18, KVM_RUN, 0)       = 0 (KVM_EXIT_MMIO)
     K ready_for_interrupt_injection=1, if_flag=0, flags=0, cr8=0000000000000000, apic_base=0x000000fee00d00
     K phys_addr=0, len=1634035803, [33, 0, 0, 0, 0, 0, 0, 0], is_write=112
    [pid   664] ioctl(18, KVM_RUN, 0)       = 0 (KVM_EXIT_MMIO)
     K ready_for_interrupt_injection=1, if_flag=1, flags=0, cr8=0000000000000000, apic_base=0x000000fee00d00
     K phys_addr=0, len=1634035803, [33, 0, 0, 0, 0, 0, 0, 0], is_write=112
    ...

This feature uses the mmap_cache subsystem to find and verify the memory
area for a vcpu. A kvm virtual machine can have more than one vcpu.

Masatake YAMATO

diff --git a/defs.h b/defs.h
index 21069741..5526c4b7 100644
--- a/defs.h
+++ b/defs.h
@@ -381,6 +381,9 @@ extern unsigned followfork;
 /* if this is true do the stack trace for every system call */
 extern bool stack_trace_enabled;
 #endif
+#ifdef HAVE_LINUX_KVM_H
+extern int dump_kvm_run_structure_level;
+#endif
 extern unsigned ptrace_setoptions;
 extern unsigned max_strlen;
 extern unsigned os_release;
@@ -740,6 +743,10 @@ extern void unwind_tcb_print(struct tcb *);
 extern void unwind_tcb_capture(struct tcb *);
 #endif
 
+#ifdef HAVE_LINUX_KVM_H
+extern void kvm_run_structure_decoder_init(void);
+#endif
+
 static inline int
 printstrn(struct tcb *tcp, kernel_ulong_t addr, kernel_ulong_t len)
 {
diff --git a/kvm.c b/kvm.c
index 86fd9e50..d89b176f 100644
--- a/kvm.c
+++ b/kvm.c
@@ -34,13 +34,142 @@
 # include <linux/kvm.h>
 # include "print_fields.h"
 # include "arch_kvm.c"
+# include "xmalloc.h"
+# include "mmap_cache.h"
+# include <sys/mman.h>
+
+enum vcpu_info_source { VCPU_INFO_SOURCE_UNKNOWN,	/* mapping not located yet */
+			VCPU_INFO_SOURCE_PROC_MAPS };	/* filled from an mmap cache entry */
+struct vcpu_info {	/* one node per vcpu fd, chained from vcpu_info_list */
+	int fd;		/* descriptor used as the lookup key */
+	long mmap_addr;	/* start address of the mapping; 0 until filled */
+	long mmap_len;	/* length of the mapping; 0 until filled */
+	enum vcpu_info_source info_source;	/* where addr/len came from */
+	struct vcpu_info *next;	/* next node of the singly linked list */
+};
+
+static struct vcpu_info *vcpu_info_list;
+static struct kvm_run vcpu_run_struct;
+
+/*
+ * Walk the list starting at HEAD and return the node registered for FD,
+ * or NULL when no node carries that descriptor.
+ */
+static struct vcpu_info *
+vcpu_search(int fd, struct vcpu_info *head)
+{
+	struct vcpu_info *node = head;
+
+	while (node && node->fd != fd)
+		node = node->next;
+	return node;
+}
+
+/* Allocate a node for FD with no known mapping and push it on the list head. */
+static struct vcpu_info *
+vcpu_alloc(int fd)
+{
+	struct vcpu_info *node = xmalloc(sizeof(*node));
+
+	node->fd = fd;
+	node->mmap_addr = 0L;
+	node->mmap_len = 0L;
+	node->info_source = VCPU_INFO_SOURCE_UNKNOWN;
+
+	node->next = vcpu_info_list;
+	vcpu_info_list = node;
+	return node;
+}
+
+/* Return the node for FD, creating one first if FD is not yet on the list. */
+static struct vcpu_info *
+vcpu_register(int fd)
+{
+	struct vcpu_info *found = vcpu_search(fd, vcpu_info_list);
+
+	if (found)
+		return found;
+	return vcpu_alloc(fd);
+}
+
+/*
+ * mmap_cache_find() predicate: DATA is the path to look for.  An entry
+ * matches when it has a file name equal to that path and device major 0;
+ * the zero major for an anon inode may be given in get_anon_bdev():
+ *	*p = MKDEV(0, dev & MINORMASK);
+ */
+static bool
+find_map_for_file(struct mmap_cache_t *map_cache, void *data)
+{
+	const char *wanted = data;
+	const char *name = map_cache->binary_filename;
+
+	if (!name || map_cache->major != 0)
+		return false;
+	return strcmp(name, wanted) == 0;
+}
+
+static void
+vcpu_fill(struct vcpu_info *vcpu_info, unsigned long addr, unsigned long len,
+	  enum vcpu_info_source info_source)
+{	/* Record the mapping start, its full length, and where both came from. */
+	vcpu_info->mmap_addr = addr;
+	vcpu_info->mmap_len = len;	/* unsigned long: map_len() is not narrowed */
+	vcpu_info->info_source = info_source;
+}
+
+/* Length of the mapping (end_addr - start_addr); 0 if end is not past start. */
+static unsigned long
+map_len(struct mmap_cache_t *map_info)
+{
+	return map_info->end_addr > map_info->start_addr
+		? map_info->end_addr - map_info->start_addr
+		: 0;
+}
+
+/*
+ * Return FD's node, or NULL; look its mapping up in the mmap cache when FD
+ * is unknown or still VCPU_INFO_SOURCE_UNKNOWN and names a kvm-vcpu inode.
+ */
+static struct vcpu_info*
+vcpu_get_info(struct tcb *const tcp, int fd)
+{
+	struct vcpu_info *node = vcpu_search(fd, vcpu_info_list);
+	struct mmap_cache_t *entry;
+	char path[PATH_MAX + 1];
+
+	if (node && node->info_source != VCPU_INFO_SOURCE_UNKNOWN)
+		return node;
+	if (getfdpath(tcp, fd, path, sizeof(path)) < 0
+	    || strncmp(path, "anon_inode:kvm-vcpu:", 19 + 1) != 0)
+		return node;
+	mmap_cache_rebuild_if_invalid(tcp, __func__);
+	entry = mmap_cache_find(tcp, find_map_for_file, path);
+	if (!entry)
+		return node;
+	if (!node)
+		node = vcpu_alloc(fd);
+	vcpu_fill(node, entry->start_addr, map_len(entry),
+		  VCPU_INFO_SOURCE_PROC_MAPS);
+	return node;
+}
 
 static int
 kvm_ioctl_create_vcpu(struct tcb *const tcp, const kernel_ulong_t arg)
 {
-	uint32_t cpuid = arg;
+	uint32_t cpuid;
+
+	if (entering(tcp))	/* nothing is printed on entry; u_rval is read later */
+		return 0;
 
+	cpuid = arg;
 	tprintf(", %u", cpuid);
+
+	if (dump_kvm_run_structure_level && tcp->u_rval >= 0) {
+		int fd = tcp->u_rval;	/* non-negative result, reported as RVAL_FD */
+		vcpu_register(fd);
+	}
+
 	return RVAL_IOCTL_DECODED | RVAL_FD;
 }
 
@@ -103,6 +232,74 @@ kvm_ioctl_decode_sregs(struct tcb *const tcp, const unsigned int code,
 }
 # endif /* HAVE_STRUCT_KVM_SREGS */
 
+# include "xlat/kvm_exit_reason.h"
+/*
+ * Read the kvm_run structure at MMAP_ADDR with umoven() and store the name
+ * of its exit_reason in tcp->auxstr.
+ *
+ * The address is accepted only if the mmap cache has an entry for it that
+ * is at least sizeof(struct kvm_run) long and whose file name starts with
+ * "anon_inode:kvm-vcpu".
+ *
+ * Returns true when a known exit reason was stored.  An unknown reason
+ * stores "KVM_EXIT_???" but still returns false; every other failure
+ * returns false and leaves tcp->auxstr alone.
+ */
+static bool
+kvm_ioctl_run_attach_auxstr(struct tcb *const tcp,
+			    long mmap_addr)
+{
+	struct mmap_cache_t *entry;
+	const char *known;
+
+	mmap_cache_rebuild_if_invalid(tcp, __func__);
+
+	entry = mmap_cache_search(tcp, mmap_addr);
+
+	/* Validate the mapping before reading through it. */
+	if (!entry
+	    || map_len(entry) < sizeof(vcpu_run_struct)
+	    || !entry->binary_filename
+	    || strncmp(entry->binary_filename, "anon_inode:kvm-vcpu", 19))
+		return false;
+
+	if (umoven(tcp, mmap_addr, sizeof(vcpu_run_struct), &vcpu_run_struct) < 0)
+		return false;
+
+	known = xlookup(kvm_exit_reason, vcpu_run_struct.exit_reason);
+	tcp->auxstr = known ? known : "KVM_EXIT_???";
+	return known ? true : false;
+}
+
+/*
+ * Decode KVM_RUN; all work happens once entering(tcp) is false.
+ *
+ * When dump_kvm_run_structure_level is non-zero and u_rval is not negative,
+ * look up the vcpu behind the fd argument and try to attach its exit
+ * reason as auxstr; RVAL_STR is added only if that attachment succeeded.
+ */
+static int
+kvm_ioctl_decode_run(struct tcb *const tcp)
+{
+	struct vcpu_info *vcpu;
+	int vcpu_fd;
+
+	if (entering(tcp))
+		return 0;
+
+	if (tcp->u_rval < 0 || !dump_kvm_run_structure_level)
+		return RVAL_DECODED;
+
+	/* Clear auxstr before deciding whether to set a new one. */
+	tcp->auxstr = NULL;
+	vcpu_fd = tcp->u_arg[0];
+	vcpu = vcpu_get_info(tcp, vcpu_fd);
+	if (vcpu && kvm_ioctl_run_attach_auxstr(tcp, vcpu->mmap_addr))
+		return RVAL_DECODED | RVAL_STR;
+
+	return RVAL_DECODED;
+}
+
 int
 kvm_ioctl(struct tcb *const tcp, const unsigned int code, const kernel_ulong_t arg)
 {
@@ -129,7 +326,10 @@ kvm_ioctl(struct tcb *const tcp, const unsigned int code, const kernel_ulong_t a
 
 	case KVM_CREATE_VM:
 		return RVAL_DECODED | RVAL_FD;
+
 	case KVM_RUN:
+		return kvm_ioctl_decode_run(tcp);
+
 	case KVM_GET_VCPU_MMAP_SIZE:
 	case KVM_GET_API_VERSION:
 	default:
@@ -137,4 +337,10 @@ kvm_ioctl(struct tcb *const tcp, const unsigned int code, const kernel_ulong_t a
 	}
 }
 
+void
+kvm_run_structure_decoder_init(void)
+{
+	mmap_cache_enable();	/* the vcpu mapping lookup goes through the mmap cache */
+}
+
 #endif /* HAVE_LINUX_KVM_H */
diff --git a/mmap_cache.c b/mmap_cache.c
index 19f88abe..fdd3720b 100644
--- a/mmap_cache.c
+++ b/mmap_cache.c
@@ -231,3 +231,15 @@ mmap_cache_search(struct tcb *tcp, unsigned long ip)
 	}
 	return NULL;
 }
+
+/* Return the first entry accepted by FIND_FN(entry, DATA), or NULL if none. */
+struct mmap_cache_t *
+mmap_cache_find(struct tcb *tcp, mmap_cache_find_fn find_fn, void *data)
+{
+	int i;
+
+	for (i = 0; i < (int) tcp->mmap_cache_size; i++)	/* size is a count */
+		if (find_fn(tcp->mmap_cache + i, data))
+			return tcp->mmap_cache + i;
+	return NULL;
+}
diff --git a/mmap_cache.h b/mmap_cache.h
index 265f8ec2..1d9fe049 100644
--- a/mmap_cache.h
+++ b/mmap_cache.h
@@ -65,6 +65,8 @@ enum mmap_cache_rebuild_result {
 	MMAP_CACHE_REBUILD_RENEWED,
 };
 
+typedef bool (* mmap_cache_find_fn) (struct mmap_cache_t *, void *);
+
 extern void
 mmap_cache_enable(void);
 
@@ -83,4 +85,7 @@ mmap_cache_rebuild_if_invalid(struct tcb *, const char *caller);
 extern struct mmap_cache_t *
 mmap_cache_search(struct tcb *, unsigned long ip);
 
+extern struct mmap_cache_t *
+mmap_cache_find(struct tcb *, mmap_cache_find_fn find_fn, void * data);
+
 #endif /* !STRACE_MMAP_CACHE_H */
diff --git a/strace.c b/strace.c
index 4a691bab..77c0d897 100644
--- a/strace.c
+++ b/strace.c
@@ -70,6 +70,10 @@ extern char *optarg;
 bool stack_trace_enabled;
 #endif
 
+#ifdef HAVE_LINUX_KVM_H
+int dump_kvm_run_structure_level = 0;
+#endif
+
 #define my_tkill(tid, sig) syscall(__NR_tkill, (tid), (sig))
 
 /* Glue for systems without a MMU that cannot provide fork() */
@@ -253,6 +257,13 @@ Output format:\n\
   -k             obtain stack trace between each syscall (experimental)\n\
 "
 #endif
+"\
+"
+#ifdef HAVE_LINUX_KVM_H
+"\
+  -K             dump KVM run struct\n\
+"
+#endif
 "\
   -o file        send trace output to FILE instead of stderr\n\
   -q             suppress messages about attaching, detaching, etc.\n\
@@ -1594,6 +1605,9 @@ init(int argc, char *argv[])
 	while ((c = getopt(argc, argv, "+"
 #ifdef CAN_UNWIND
 	    "k"
+#endif
+#ifdef HAVE_LINUX_KVM_H
+	    "K"
 #endif
 	    "a:Ab:cCdDe:E:fFhiI:o:O:p:P:qrs:S:tTu:vVwxyz")) != EOF) {
 		switch (c) {
@@ -1657,6 +1671,11 @@ init(int argc, char *argv[])
 		case 'k':
 			stack_trace_enabled = true;
 			break;
+#endif
+#ifdef HAVE_LINUX_KVM_H
+		case 'K':
+			dump_kvm_run_structure_level++;
+			break;
 #endif
 		case 'o':
 			outfname = optarg;
@@ -1785,6 +1804,11 @@ init(int argc, char *argv[])
 	}
 #endif
 
+#ifdef HAVE_LINUX_KVM_H
+	if (dump_kvm_run_structure_level)
+		kvm_run_structure_decoder_init();
+#endif
+
 	/* See if they want to run as another user. */
 	if (username != NULL) {
 		struct passwd *pent;
diff --git a/xlat/kvm_exit_reason.in b/xlat/kvm_exit_reason.in
new file mode 100644
index 00000000..085790a0
--- /dev/null
+++ b/xlat/kvm_exit_reason.in
@@ -0,0 +1,29 @@
+KVM_EXIT_UNKNOWN          0
+KVM_EXIT_EXCEPTION        1
+KVM_EXIT_IO               2
+KVM_EXIT_HYPERCALL        3
+KVM_EXIT_DEBUG            4
+KVM_EXIT_HLT              5
+KVM_EXIT_MMIO             6
+KVM_EXIT_IRQ_WINDOW_OPEN  7
+KVM_EXIT_SHUTDOWN         8
+KVM_EXIT_FAIL_ENTRY       9
+KVM_EXIT_INTR             10
+KVM_EXIT_SET_TPR          11
+KVM_EXIT_TPR_ACCESS       12
+KVM_EXIT_S390_SIEIC       13
+KVM_EXIT_S390_RESET       14
+# /* deprecated */
+KVM_EXIT_DCR              15
+KVM_EXIT_NMI              16
+KVM_EXIT_INTERNAL_ERROR   17
+KVM_EXIT_OSI              18
+KVM_EXIT_PAPR_HCALL	  19
+KVM_EXIT_S390_UCONTROL	  20
+KVM_EXIT_WATCHDOG         21
+KVM_EXIT_S390_TSCH        22
+KVM_EXIT_EPR              23
+KVM_EXIT_SYSTEM_EVENT     24
+KVM_EXIT_S390_STSI        25
+KVM_EXIT_IOAPIC_EOI       26
+KVM_EXIT_HYPERV           27


More information about the Strace-devel mailing list