[PATCH 0/8] mmap_cache subsystem (re-factoring)
Masatake YAMATO
yamato at redhat.com
Tue Apr 24 17:34:32 UTC 2018
On Tue, 24 Apr 2018 16:56:34 +0300, "Dmitry V. Levin" <ldv at altlinux.org> wrote:
> On Tue, Apr 24, 2018 at 02:25:14PM +0900, Masatake YAMATO wrote:
>> > On Tue, Feb 27, 2018 at 02:30:06AM +0300, Dmitry V. Levin wrote:
>> >> On Sat, Jan 27, 2018 at 05:36:36AM +0900, Masatake YAMATO wrote:
>> >> > When we added the unwinding feature (activated with the -k option), we also added
>> >> > code for caching the entries of /proc/$pid/maps.
>> >> >
>> >> > The code for caching was tightly integrated with the code for
>> >> > unwinding. However, while prototyping kvm vcpu state dump feature to
>> >> > strace[1], I found the code for caching is useful for other purposes
>> >> > than unwinding feature.
>> >> >
>> >> > This patch set makes the code for caching usable independently of
>> >> > unwinding feature. In the changes, I call the newly separated
>> >> > code for caching "mmap_cache subsystem".
>> >> >
>> >> > [1] https://marc.info/?l=kvm&m=151531408406144&w=2
>> >> >
>> >> > Masatake YAMATO (8):
>> >> > unwind: lift up unw_flush_cache from mmap cache management code
>> >> > Introduce mmap_cache subsystem derived from unwind.c
>> >> > mmap_cache: Move code for searching an mmap cache from unwind
>> >> > mmap_cache: record protection bits
>> >> > Lift up mmap_cache_delete invocation from unwind.c
>> >> > mmap_cache: add function to enable mmap_cache
>> >> > mmap_cache: record device major and minor numbers
>> >> > mmap_cache: add customizable search function
>> >>
>> >> Thanks, I'll merge the first 7 of them, with minor corrections
>> >> (mostly related to commit messages) and a single build fix.
>> >
>> > With introduction of libdw-based unwinder this mmap_cache subsystem looks
>> > very odd: despite the fact that libdw-based unwinder does not use mmap_cache,
>> > its use is forced by unwind.c for no visible purpose.
>>
>> mmap_cache calls tcb_flush_cache (and dwfl_linux_proc_report) when
>> the memory mapping of a target process is changed.
>
> Well, it's not mmap_cache.c but unwind.c that calls tcb_flush_cache.
> Generic unwind uses just the return code of mmap_cache_rebuild_if_invalid.
Oh, I see. What unwind-libdw needs is tcp->mmap_cache_generation and
mmap_cache.c::mmap_cache_generation. How about introducing a function
that allows the libdw unwinder to access (or compare) mmap_cache.c::mmap_cache_generation?
> mmap_cache_rebuild_if_invalid calls build_mmap_cache but the mmap cache
> it builds is not used because the sole user of mmap_cache_search is
> unwind-libunwind.c, all the rest don't need any mmap cache any more.
The reason I introduced mmap_cache is not unwinding but kvm vcpu status
decoding.
See https://www.spinics.net/lists/kvm/msg161468.html .
I wrote the idea in the page.
The status of a kvm vcpu is exposed via the memory where an fd of the kvm vcpu is
mmap'ed. My code under testing prints the reason a vcpu exited as auxstr if
the -K option is given. If -KK is given, strace dumps the whole memory
area with decoding, like:
# ./strace -KK -f -p 617 2>&1 | grep 'KVM_RUN\| K'
...
[pid 664] ioctl(18, KVM_RUN, 0) = 0 (KVM_EXIT_MMIO)
K ready_for_interrupt_injection=1, if_flag=0, flags=0, cr8=0000000000000000, apic_base=0x000000fee00d00
K phys_addr=0, len=1634035803, [33, 0, 0, 0, 0, 0, 0, 0], is_write=112
[pid 664] ioctl(18, KVM_RUN, 0) = 0 (KVM_EXIT_MMIO)
K ready_for_interrupt_injection=1, if_flag=1, flags=0, cr8=0000000000000000, apic_base=0x000000fee00d00
K phys_addr=0, len=1634035803, [33, 0, 0, 0, 0, 0, 0, 0], is_write=112
...
This feature uses the mmap_cache subsystem to find and verify the memory
area for a vcpu. A kvm virtual machine can have more than one vcpu.
Masatake YAMATO
diff --git a/defs.h b/defs.h
index 21069741..5526c4b7 100644
--- a/defs.h
+++ b/defs.h
@@ -381,6 +381,9 @@ extern unsigned followfork;
/* if this is true do the stack trace for every system call */
extern bool stack_trace_enabled;
#endif
+#ifdef HAVE_LINUX_KVM_H
+extern int dump_kvm_run_structure_level;
+#endif
extern unsigned ptrace_setoptions;
extern unsigned max_strlen;
extern unsigned os_release;
@@ -740,6 +743,10 @@ extern void unwind_tcb_print(struct tcb *);
extern void unwind_tcb_capture(struct tcb *);
#endif
+#ifdef HAVE_LINUX_KVM_H
+extern void kvm_run_structure_decoder_init(void);
+#endif
+
static inline int
printstrn(struct tcb *tcp, kernel_ulong_t addr, kernel_ulong_t len)
{
diff --git a/kvm.c b/kvm.c
index 86fd9e50..d89b176f 100644
--- a/kvm.c
+++ b/kvm.c
@@ -34,13 +34,142 @@
# include <linux/kvm.h>
# include "print_fields.h"
# include "arch_kvm.c"
+# include "xmalloc.h"
+# include "mmap_cache.h"
+# include <sys/mman.h>
+
+/* Records how the mmap address/length stored in a vcpu_info was obtained. */
+enum vcpu_info_source { VCPU_INFO_SOURCE_UNKNOWN,
+ VCPU_INFO_SOURCE_PROC_MAPS };
+/*
+ * Bookkeeping for one vcpu file descriptor.  Entries are chained through
+ * "next" into the singly linked list headed by vcpu_info_list.
+ */
+struct vcpu_info {
+ int fd;
+ long mmap_addr;
+ long mmap_len;
+ enum vcpu_info_source info_source;
+ struct vcpu_info *next;
+};
+
+static struct vcpu_info *vcpu_info_list;
+static struct kvm_run vcpu_run_struct;
+
+/*
+ * Walk the singly linked list starting at HEAD and return the first
+ * entry whose fd equals FD, or NULL when there is none.
+ */
+static struct vcpu_info *
+vcpu_search(int fd, struct vcpu_info *head)
+{
+	struct vcpu_info *cur = head;
+
+	while (cur && cur->fd != fd)
+		cur = cur->next;
+
+	return cur;
+}
+
+/*
+ * Allocate a new entry for FD with an unknown (zeroed) mmap area and
+ * push it onto the front of vcpu_info_list.  Returns the new entry.
+ */
+static struct vcpu_info *
+vcpu_alloc(int fd)
+{
+	struct vcpu_info *entry = xmalloc(sizeof(*entry));
+
+	entry->fd = fd;
+	entry->mmap_addr = 0L;
+	entry->mmap_len = 0L;
+	entry->info_source = VCPU_INFO_SOURCE_UNKNOWN;
+
+	/* Link in at the head of the global list. */
+	entry->next = vcpu_info_list;
+	vcpu_info_list = entry;
+
+	return entry;
+}
+
+/*
+ * Return the entry for FD from vcpu_info_list, creating one with
+ * vcpu_alloc() when the list does not contain it yet.
+ */
+static struct vcpu_info *
+vcpu_register(int fd)
+{
+	struct vcpu_info *found = vcpu_search(fd, vcpu_info_list);
+
+	return found ? found : vcpu_alloc(fd);
+}
+
+/*
+ * mmap_cache_find() predicate: DATA is a path string; select the cache
+ * entry whose binary_filename equals that path and whose device major
+ * number is 0.
+ */
+static bool
+find_map_for_file(struct mmap_cache_t *map_cache, void *data)
+{
+	const char *wanted = data;
+
+	/*
+	 * The major number of an anon inode is expected to be 0, as
+	 * assigned in get_anon_bdev():
+	 *
+	 *	*p = MKDEV(0, dev & MINORMASK);
+	 *	-----------^
+	 */
+	return map_cache->binary_filename
+	       && map_cache->major == 0
+	       && strcmp(map_cache->binary_filename, wanted) == 0;
+}
+
+/*
+ * Store the location of the mmap'ed area in VCPU_INFO.
+ *
+ * ADDR and LEN are the start address and length of the mapping;
+ * INFO_SOURCE records where these values came from.  LEN is taken as
+ * unsigned long so that the value produced by map_len() is not
+ * narrowed on its way into the structure.
+ */
+static void
+vcpu_fill(struct vcpu_info *vcpu_info, unsigned long addr, unsigned long len,
+	  enum vcpu_info_source info_source)
+{
+	vcpu_info->mmap_addr = addr;
+	vcpu_info->mmap_len = len;
+	vcpu_info->info_source = info_source;
+}
+
+/*
+ * Length in bytes of the mapping described by MAP_INFO, or 0 when its
+ * end address does not lie above its start address.
+ */
+static unsigned long
+map_len(struct mmap_cache_t *map_info)
+{
+	return map_info->start_addr < map_info->end_addr
+	       ? map_info->end_addr - map_info->start_addr
+	       : 0;
+}
+
+/*
+ * Return the vcpu_info for FD, resolving its mmap'ed area on demand.
+ *
+ * When no entry exists yet, or the entry's area is still unknown, and
+ * the path of FD starts with "anon_inode:kvm-vcpu:", the mmap cache is
+ * searched for a mapping of that path; on success the entry is created
+ * if needed and filled in.  The result may be NULL, or an entry whose
+ * info_source is still VCPU_INFO_SOURCE_UNKNOWN.
+ */
+static struct vcpu_info*
+vcpu_get_info(struct tcb *const tcp, int fd)
+{
+	static const char vcpu_prefix[] = "anon_inode:kvm-vcpu:";
+	struct vcpu_info *info = vcpu_search(fd, vcpu_info_list);
+	struct mmap_cache_t *map;
+	char path[PATH_MAX + 1];
+
+	/* Already resolved: nothing to look up. */
+	if (info && info->info_source != VCPU_INFO_SOURCE_UNKNOWN)
+		return info;
+
+	if (getfdpath(tcp, fd, path, sizeof(path)) < 0)
+		return info;
+	if (strncmp(path, vcpu_prefix, sizeof(vcpu_prefix) - 1) != 0)
+		return info;
+
+	mmap_cache_rebuild_if_invalid(tcp, __func__);
+	map = mmap_cache_find(tcp, find_map_for_file, path);
+	if (!map)
+		return info;
+
+	if (!info)
+		info = vcpu_alloc(fd);
+	vcpu_fill(info, map->start_addr, map_len(map),
+		  VCPU_INFO_SOURCE_PROC_MAPS);
+
+	return info;
+}
+/*
+ * Decoder for the vcpu-creating ioctl: does nothing on syscall entry; on
+ * exit prints ARG as the cpu id and, when dump_kvm_run_structure_level is
+ * non-zero and the call returned a non-negative value, registers that
+ * returned value as a vcpu fd.
+ */
static int
kvm_ioctl_create_vcpu(struct tcb *const tcp, const kernel_ulong_t arg)
{
- uint32_t cpuid = arg;
+ uint32_t cpuid;
+
+ if (entering(tcp))
+ return 0;
+ cpuid = arg;
tprintf(", %u", cpuid);
+
+ if (dump_kvm_run_structure_level && tcp->u_rval >= 0) {
+ int fd = tcp->u_rval;
+ vcpu_register(fd);
+ }
+
return RVAL_IOCTL_DECODED | RVAL_FD;
}
@@ -103,6 +232,74 @@ kvm_ioctl_decode_sregs(struct tcb *const tcp, const unsigned int code,
}
# endif /* HAVE_STRUCT_KVM_SREGS */
+# include "xlat/kvm_exit_reason.h"
+/*
+ * Read the kvm_run structure mapped at MMAP_ADDR in the tracee and set
+ * tcp->auxstr to the name of its exit_reason.
+ *
+ * The address is accepted only if the mmap cache has a mapping for it
+ * that is at least sizeof(vcpu_run_struct) long and whose file name
+ * starts with "anon_inode:kvm-vcpu".  Returns true when auxstr holds a
+ * known exit reason.  Returns false otherwise: auxstr is set to
+ * "KVM_EXIT_???" when the reason value is not in the table, and is
+ * left untouched on the earlier failures.
+ */
+static bool
+kvm_ioctl_run_attach_auxstr(struct tcb *const tcp,
+			    long mmap_addr)
+{
+	struct mmap_cache_t *map;
+	const char *name;
+
+	mmap_cache_rebuild_if_invalid(tcp, __func__);
+
+	map = mmap_cache_search(tcp, mmap_addr);
+	if (!map
+	    || map_len(map) < sizeof(vcpu_run_struct)
+	    || !map->binary_filename
+	    || strncmp(map->binary_filename, "anon_inode:kvm-vcpu", 19) != 0)
+		return false;
+
+	if (umoven(tcp, mmap_addr, sizeof(vcpu_run_struct), &vcpu_run_struct) < 0)
+		return false;
+
+	name = xlookup(kvm_exit_reason, vcpu_run_struct.exit_reason);
+	tcp->auxstr = name ? name : "KVM_EXIT_???";
+
+	return name != NULL;
+}
+
+/*
+ * Decoder for KVM_RUN.  Nothing is done on syscall entry.  On exit the
+ * result is RVAL_DECODED, with RVAL_STR added when the run-structure
+ * dump is enabled, the call succeeded, and an exit reason string could
+ * be attached to tcp->auxstr for the vcpu fd in u_arg[0].
+ */
+static int
+kvm_ioctl_decode_run(struct tcb *const tcp)
+{
+	struct vcpu_info *vcpu;
+
+	if (entering(tcp))
+		return 0;
+
+	if (tcp->u_rval < 0 || !dump_kvm_run_structure_level)
+		return RVAL_DECODED;
+
+	tcp->auxstr = NULL;
+	vcpu = vcpu_get_info(tcp, tcp->u_arg[0]);
+	if (vcpu && kvm_ioctl_run_attach_auxstr(tcp, vcpu->mmap_addr))
+		return RVAL_DECODED | RVAL_STR;
+
+	return RVAL_DECODED;
+}
+
int
kvm_ioctl(struct tcb *const tcp, const unsigned int code, const kernel_ulong_t arg)
{
@@ -129,7 +326,10 @@ kvm_ioctl(struct tcb *const tcp, const unsigned int code, const kernel_ulong_t a
case KVM_CREATE_VM:
return RVAL_DECODED | RVAL_FD;
+
case KVM_RUN:
+ return kvm_ioctl_decode_run(tcp);
+
case KVM_GET_VCPU_MMAP_SIZE:
case KVM_GET_API_VERSION:
default:
@@ -137,4 +337,10 @@ kvm_ioctl(struct tcb *const tcp, const unsigned int code, const kernel_ulong_t a
}
}
+/* Prepare the KVM run structure decoder by calling mmap_cache_enable(). */
+void
+kvm_run_structure_decoder_init(void)
+{
+ mmap_cache_enable();
+}
+
#endif /* HAVE_LINUX_KVM_H */
diff --git a/mmap_cache.c b/mmap_cache.c
index 19f88abe..fdd3720b 100644
--- a/mmap_cache.c
+++ b/mmap_cache.c
@@ -231,3 +231,15 @@ mmap_cache_search(struct tcb *tcp, unsigned long ip)
}
return NULL;
}
+
+/*
+ * Linear search of the mmap cache of TCP with a caller-supplied
+ * predicate.
+ *
+ * FIND_FN is called as find_fn(entry, data) for each of the
+ * tcp->mmap_cache_size entries in order; the first entry for which it
+ * returns true is returned.  Returns NULL when no entry matches,
+ * including when the cache is empty.
+ */
+struct mmap_cache_t *
+mmap_cache_find(struct tcb *tcp, mmap_cache_find_fn find_fn, void *data)
+{
+	int i;
+
+	/* Valid indices are 0 .. mmap_cache_size - 1; stop before the end. */
+	for (i = 0; i < (int) tcp->mmap_cache_size; i++) {
+		if (find_fn(tcp->mmap_cache + i, data))
+			return tcp->mmap_cache + i;
+	}
+	return NULL;
+}
diff --git a/mmap_cache.h b/mmap_cache.h
index 265f8ec2..1d9fe049 100644
--- a/mmap_cache.h
+++ b/mmap_cache.h
@@ -65,6 +65,8 @@ enum mmap_cache_rebuild_result {
MMAP_CACHE_REBUILD_RENEWED,
};
+/*
+ * Predicate type for mmap_cache_find(): receives a cache entry and the
+ * caller-supplied data pointer, and returns true to select the entry.
+ */
+typedef bool (* mmap_cache_find_fn) (struct mmap_cache_t *, void *);
+
extern void
mmap_cache_enable(void);
@@ -83,4 +85,7 @@ mmap_cache_rebuild_if_invalid(struct tcb *, const char *caller);
extern struct mmap_cache_t *
mmap_cache_search(struct tcb *, unsigned long ip);
+/*
+ * Return the first entry in the tcb's mmap cache for which
+ * find_fn(entry, data) is true, or NULL if there is none.
+ */
+extern struct mmap_cache_t *
+mmap_cache_find(struct tcb *, mmap_cache_find_fn find_fn, void * data);
+
#endif /* !STRACE_MMAP_CACHE_H */
diff --git a/strace.c b/strace.c
index 4a691bab..77c0d897 100644
--- a/strace.c
+++ b/strace.c
@@ -70,6 +70,10 @@ extern char *optarg;
bool stack_trace_enabled;
#endif
+#ifdef HAVE_LINUX_KVM_H
+int dump_kvm_run_structure_level = 0;
+#endif
+
#define my_tkill(tid, sig) syscall(__NR_tkill, (tid), (sig))
/* Glue for systems without a MMU that cannot provide fork() */
@@ -253,6 +257,13 @@ Output format:\n\
-k obtain stack trace between each syscall (experimental)\n\
"
#endif
+"\
+"
+#ifdef HAVE_LINUX_KVM_H
+"\
+ -K dump KVM run struct\n\
+"
+#endif
"\
-o file send trace output to FILE instead of stderr\n\
-q suppress messages about attaching, detaching, etc.\n\
@@ -1594,6 +1605,9 @@ init(int argc, char *argv[])
while ((c = getopt(argc, argv, "+"
#ifdef CAN_UNWIND
"k"
+#endif
+#ifdef HAVE_LINUX_KVM_H
+ "K"
#endif
"a:Ab:cCdDe:E:fFhiI:o:O:p:P:qrs:S:tTu:vVwxyz")) != EOF) {
switch (c) {
@@ -1657,6 +1671,11 @@ init(int argc, char *argv[])
case 'k':
stack_trace_enabled = true;
break;
+#endif
+#ifdef HAVE_LINUX_KVM_H
+ case 'K':
+ dump_kvm_run_structure_level++;
+ break;
#endif
case 'o':
outfname = optarg;
@@ -1785,6 +1804,11 @@ init(int argc, char *argv[])
}
#endif
+#ifdef HAVE_LINUX_KVM_H
+ if (dump_kvm_run_structure_level)
+ kvm_run_structure_decoder_init();
+#endif
+
/* See if they want to run as another user. */
if (username != NULL) {
struct passwd *pent;
diff --git a/xlat/kvm_exit_reason.in b/xlat/kvm_exit_reason.in
new file mode 100644
index 00000000..085790a0
--- /dev/null
+++ b/xlat/kvm_exit_reason.in
@@ -0,0 +1,29 @@
+KVM_EXIT_UNKNOWN 0
+KVM_EXIT_EXCEPTION 1
+KVM_EXIT_IO 2
+KVM_EXIT_HYPERCALL 3
+KVM_EXIT_DEBUG 4
+KVM_EXIT_HLT 5
+KVM_EXIT_MMIO 6
+KVM_EXIT_IRQ_WINDOW_OPEN 7
+KVM_EXIT_SHUTDOWN 8
+KVM_EXIT_FAIL_ENTRY 9
+KVM_EXIT_INTR 10
+KVM_EXIT_SET_TPR 11
+KVM_EXIT_TPR_ACCESS 12
+KVM_EXIT_S390_SIEIC 13
+KVM_EXIT_S390_RESET 14
+# /* deprecated */
+KVM_EXIT_DCR 15
+KVM_EXIT_NMI 16
+KVM_EXIT_INTERNAL_ERROR 17
+KVM_EXIT_OSI 18
+KVM_EXIT_PAPR_HCALL 19
+KVM_EXIT_S390_UCONTROL 20
+KVM_EXIT_WATCHDOG 21
+KVM_EXIT_S390_TSCH 22
+KVM_EXIT_EPR 23
+KVM_EXIT_SYSTEM_EVENT 24
+KVM_EXIT_S390_STSI 25
+KVM_EXIT_IOAPIC_EOI 26
+KVM_EXIT_HYPERV 27
More information about the Strace-devel
mailing list