[PATCH 2/4] kvm: attach the exit reason of vcpu as auxstr to KVM_RUN output

Dmitry V. Levin ldv at altlinux.org
Mon May 14 12:38:39 UTC 2018


On Mon, May 07, 2018 at 08:26:02PM +0900, Masatake YAMATO wrote:
> In KVM, a virtual machine implementation like Qemu can access a vcpu
> via ioctl. KVM_RUN is an ioctl command to enter vcpu. The command
> returns the control for some reasons: Needs of device emulation or
> consuming time slices are typical ones. The vmi takes a different
> action for the reason.
> 
> We, strace users, want to know the reason to understand kvm.  This
> change prints the reason as auxstr if -K option is given, and if
> strace runs on Linux 4.16.0 or higher, which includes
> e46b469278a59781f9b25ff608af84892963821b, "kvm: embed vcpu id to
> dentry of vcpu anon inode."
> 
> The way to get the reason is a bit complicated because the ioctl
> doesn't return it to the userspace directly. Instead, the vmi and kvm
> communicate via an area of the process virtual memory where the fd of
> vcpu is mmap'ed. strace must peek the area to know the reason.
> 
> The change does three things: (1) recording the area for given vcpu
> when the target calls VCPU_CREATE to vcpu_info_list per tcb data
> field, (2) verifying the data recorded in vcpu_info_list before
> doing (3), and (3) decoding the exit reason field of the area.
> 
> The change is not so simple because there is a case that strace
> doesn't have a chance to do (1) if -p option is used. In the case,
> vcpu_info data created in the step (2).
> 
> The area has more fields than "exit reason." Dumping them is future
> work.
> 
> * defs.h [HAVE_LINUX_KVM_H]: (kvm_run_structure_decoder_init,
> kvm_vcpu_info_free): New declarations.
> (struct tcb): Add new field vcpu_info_list.
> * strace.c (usage): Describe -K option.
> (init): Introduce -K option. Call
> kvm_run_structure_decoder_init to enable for attaching the exit reason
> to auxstr.
> (droptcb): Call kvm_vcpu_info_free.
> * xlat/kvm_exit_reason.in: The list for decoding the exit reason.
> * kvm.c: Include xmalloc.h, mmap_cache.h, and sys/mman.h.
> (dump_kvm_run_structure_level): New file static variable.
> (kvm_run_structure_decoder_init): New function.
> (vcpu_info): New struct definition representing the 3-tuple: vcpu file
> descriptor, id of the vcpu, and mmap'ed entry.
> (vcpu_find, vcpu_alloc, vcpu_register, vcpu_getinfo,
> kvm_vcpu_info_free): New functions to access tcb's vcpu_info_list
> field and vcpu_info data type.
> (is_map_for_file, extrac_cpuid, map_len): New helper functions.
> (kvm_ioclt_run_attach_auxstr, kvm_ioctl_decode_run): New functions
> decoding vcpu exit reason and attaching the decoded data to auxstr
> field of tcb.
> (kvm_ioctl_create_vcpu): Call vcpu_register to make an entry mapping a
> file descriptor and the vcpu id associated with the fd.
> (kvm_ioctl): Call kvm_ioctl_decode_run.
> 
> Signed-off-by: Masatake YAMATO <yamato at redhat.com>
> ---
>  defs.h                  |   9 ++
>  kvm.c                   | 235 ++++++++++++++++++++++++++++++++++++++++
>  strace.c                |  30 +++++
>  xlat/kvm_exit_reason.in |  29 +++++
>  4 files changed, 303 insertions(+)
>  create mode 100644 xlat/kvm_exit_reason.in
> 
> diff --git a/defs.h b/defs.h
> index 5116fc21..407e3a02 100644
> --- a/defs.h
> +++ b/defs.h
> @@ -220,6 +220,10 @@ struct tcb {
>  
>  	struct mmap_cache_t *mmap_cache;
>  
> +#ifdef HAVE_LINUX_KVM_H
> +	struct vcpu_info *vcpu_info_list;
> +#endif
> +
>  #ifdef ENABLE_STACKTRACE
>  	void *unwind_ctx;
>  	struct unwind_queue_t *unwind_queue;
> @@ -845,6 +849,11 @@ extern void unwind_tcb_print(struct tcb *);
>  extern void unwind_tcb_capture(struct tcb *);
>  #endif
>  
> +#ifdef HAVE_LINUX_KVM_H
> +extern void kvm_run_structure_decoder_init(void);
> +extern void kvm_vcpu_info_free(struct tcb *);
> +#endif
> +
>  static inline int
>  printstrn(struct tcb *tcp, kernel_ulong_t addr, kernel_ulong_t len)
>  {
> diff --git a/kvm.c b/kvm.c
> index 86fd9e50..ab9aeb65 100644
> --- a/kvm.c
> +++ b/kvm.c
> @@ -34,13 +34,189 @@
>  # include <linux/kvm.h>
>  # include "print_fields.h"
>  # include "arch_kvm.c"
> +# include "xmalloc.h"
> +# include "mmap_cache.h"
> +# include <sys/mman.h>

Do you need <sys/mman.h>?

> +
> +struct vcpu_info {
> +	int fd;
> +	int cpuid;
> +	long mmap_addr;
> +	unsigned long mmap_len;
> +	bool resolved;
> +	struct vcpu_info *next;
> +};
> +
> +static int dump_kvm_run_structure_level;

Is it expected to have other values than 0 or 1?

> +static struct vcpu_info *
> +vcpu_find(struct tcb *const tcp, int fd)
> +{
> +	struct vcpu_info *head = tcp->vcpu_info_list;
> +	for (struct vcpu_info *vcpu_info = head;

	for (struct vcpu_info *vcpu_info = tcp->vcpu_info_list;

> +	     vcpu_info;
> +	     vcpu_info = vcpu_info->next)
> +		if (vcpu_info->fd == fd)
> +			return vcpu_info;
> +
> +	return NULL;
> +}
> +
> +static struct vcpu_info *
> +vcpu_alloc(struct tcb *const tcp, int fd, int cpuid)
> +{
> +	struct vcpu_info *vcpu_info = xmalloc(sizeof(struct vcpu_info));

I'd just use xcalloc:

	struct vcpu_info *vcpu_info = xcalloc(1, sizeof(*vcpu_info));
> +
> +	vcpu_info->fd = fd;
> +
> +	vcpu_info->cpuid = cpuid;
> +	vcpu_info->resolved = false;
> +	vcpu_info->mmap_addr = 0L;
> +	vcpu_info->mmap_len  = 0L;
> +
> +	vcpu_info->next = tcp->vcpu_info_list;
> +	tcp->vcpu_info_list = vcpu_info;
> +
> +	return vcpu_info;
> +}
> +
> +void
> +kvm_vcpu_info_free(struct tcb *tcp)
> +{
> +	struct vcpu_info *head = tcp->vcpu_info_list;
> +	struct vcpu_info *next;
> +
> +	while (head) {
> +		next = head->next;
> +		free (head);

We don't use space before function name and parenthesis.

> +		head = next;
> +	}

	for (head = tcp->vcpu_info_list; head; head = next)

> +
> +	tcp->vcpu_info_list = NULL;
> +}
> +
> +static void
> +vcpu_register(struct tcb *const tcp, int fd, int cpuid)
> +{
> +	struct vcpu_info *vcpu_info = vcpu_find(tcp, fd);
> +
> +	if (!vcpu_info)
> +		vcpu_info = vcpu_alloc(tcp, fd, cpuid);
> +	else if (vcpu_info->cpuid != cpuid)
> +	{
> +		vcpu_info->cpuid = cpuid;
> +		vcpu_info->resolved = false;
> +	}
> +}
> +
> +static bool
> +is_map_for_file(struct mmap_cache_entry_t *map_info, void *data)
> +{
> +	/* major version for anon inode may be given in get_anon_bdev()
> +	 * in linux kernel.
> +	 *
> +	 * 	*p = MKDEV(0, dev & MINORMASK);
> +	 *-----------------^
> +	 */
> +	if (map_info->binary_filename &&
> +	    map_info->major == 0 &&
> +	    strcmp(map_info->binary_filename, (const char *)data) == 0)
> +		return true;
> +	return false;

	return map_info->binary_filename &&
	       map_info->major == 0 &&
	       strcmp(map_info->binary_filename, (const char *) data) == 0;

> +}
> +
> +static unsigned long
> +map_len(struct mmap_cache_entry_t *map_info)
> +{
> +	if (map_info->start_addr >= map_info->end_addr)
> +		return 0;
> +	else
> +		return map_info->end_addr - map_info->start_addr;

	return map_info->end_addr > map_info->start_addr ?
	       map_info->end_addr - map_info->start_addr : 0;

> +}
> +
> +static int
> +extract_cpuid(const char *num)
> +{
> +	unsigned long cpuid;
> +
> +	errno = 0;
> +	cpuid = strtoul(num, NULL, 10);
> +	if (errno)
> +		return -1;
> +	else
> +		return (int)cpuid;
> +}

Wouldn't string_to_uint() be OK?

> +
> +static struct vcpu_info*
> +vcpu_get_info(struct tcb *const tcp, int fd)
> +{
> +	struct vcpu_info *vcpu_info = vcpu_find(tcp, fd);
> +	struct mmap_cache_entry_t *map_info;
> +	enum mmap_cache_rebuild_result mc_stat;
> +
> +	mc_stat = mmap_cache_rebuild_if_invalid(tcp, __func__);
> +	if (mc_stat == MMAP_CACHE_REBUILD_NOCACHE)
> +		return NULL;
> +
> +	if (vcpu_info && vcpu_info->resolved) {
> +		if (mc_stat == MMAP_CACHE_REBUILD_READY)
> +			return vcpu_info;
> +		else {
> +			map_info = mmap_cache_search(tcp, vcpu_info->mmap_addr);
> +			if ((map_info
> +			     && strncmp(map_info->binary_filename,
> +					"anon_inode:kvm-vcpu:", 20) == 0)) {
> +				int cpuid = extract_cpuid (map_info->binary_filename);

Did you mean map_info->binary_filename + 20?
Please don't hardcode 20, use STR_STRIP_PREFIX instead.

> +				if (cpuid < 0)
> +					return NULL;
> +				else if (vcpu_info->cpuid == cpuid)
> +					return vcpu_info;
> +				else
> +					vcpu_info->resolved = false;
> +			} else
> +				/* The vcpu vma may be mremap'ed. */
> +				vcpu_info->resolved = false;
> +		}
> +	}
> +
> +	/* Slow path */
> +	char path[PATH_MAX + 1];
> +	if (getfdpath(tcp, fd, path, sizeof(path)) >= 0
> +	    && strncmp(path, "anon_inode:kvm-vcpu:", 20) == 0) {
> +		map_info = mmap_cache_search_custom(tcp, is_map_for_file, path);
> +		if (map_info) {
> +			int cpuid = extract_cpuid(path + 20);
> +			if (cpuid < 0)
> +				return NULL;
> +			else if (!vcpu_info)
> +				vcpu_info = vcpu_alloc(tcp, fd, cpuid);
> +			else if (vcpu_info->cpuid != cpuid)
> +				vcpu_info->cpuid = cpuid;
> +			vcpu_info->mmap_addr = map_info->start_addr;
> +			vcpu_info->mmap_len  = map_len(map_info);
> +			vcpu_info->resolved  = true;
> +		} else
> +			vcpu_info->resolved = false;
> +	}
> +
> +	if (!vcpu_info->resolved)
> +		vcpu_info = NULL;
> +	return vcpu_info;
> +}
>  
>  static int
>  kvm_ioctl_create_vcpu(struct tcb *const tcp, const kernel_ulong_t arg)
>  {
>  	uint32_t cpuid = arg;
>  
> +	if (entering(tcp))
> +		return 0;
> +
>  	tprintf(", %u", cpuid);

Let's print cpuid on entering syscall.

> +
> +	if (dump_kvm_run_structure_level && tcp->u_rval >= 0)
> +		vcpu_register(tcp, tcp->u_rval, (int)cpuid);
> +
>  	return RVAL_IOCTL_DECODED | RVAL_FD;
>  }
>  
> @@ -103,6 +279,55 @@ kvm_ioctl_decode_sregs(struct tcb *const tcp, const unsigned int code,
>  }
>  # endif /* HAVE_STRUCT_KVM_SREGS */
>  
> +# include "xlat/kvm_exit_reason.h"
> +static void
> +kvm_ioctl_run_attach_auxstr(struct tcb *const tcp,
> +			    struct vcpu_info *info)
> +
> +{
> +	static struct kvm_run vcpu_run_struct;
> +
> +	if (info->mmap_len < sizeof(vcpu_run_struct))
> +		return;
> +
> +	if (umoven(tcp, info->mmap_addr,
> +		   sizeof(vcpu_run_struct), &vcpu_run_struct) < 0)
> +		return;

This looks like a use case for umove().

> +
> +	tcp->auxstr = xlookup(kvm_exit_reason, vcpu_run_struct.exit_reason);
> +	if (!tcp->auxstr)
> +		tcp->auxstr = "KVM_EXIT_???";

This looks like a use case for sprintxval().

> +}
> +
> +static int
> +kvm_ioctl_decode_run(struct tcb *const tcp)
> +{
> +	int fd;
> +	struct vcpu_info *info;
> +	int r;
> +
> +	if (entering(tcp))
> +		return 0;
> +
> +	r = RVAL_DECODED;
> +
> +	if (tcp->u_rval < 0)
> +		return r;

Did you mean syserror(tcp)?

> +	if (dump_kvm_run_structure_level) {
> +		tcp->auxstr = NULL;
> +		fd = tcp->u_arg[0];
> +		info = vcpu_get_info(tcp, fd);
> +
> +		if (info) {
> +			kvm_ioctl_run_attach_auxstr(tcp, info);
> +			if (tcp->auxstr)
> +				r |= RVAL_STR;
> +		}
> +	}
> +
> +	return r;
> +}
> +
>  int
>  kvm_ioctl(struct tcb *const tcp, const unsigned int code, const kernel_ulong_t arg)
>  {
> @@ -129,7 +354,10 @@ kvm_ioctl(struct tcb *const tcp, const unsigned int code, const kernel_ulong_t a
>  
>  	case KVM_CREATE_VM:
>  		return RVAL_DECODED | RVAL_FD;
> +
>  	case KVM_RUN:
> +		return kvm_ioctl_decode_run(tcp);
> +
>  	case KVM_GET_VCPU_MMAP_SIZE:
>  	case KVM_GET_API_VERSION:
>  	default:
> @@ -137,4 +365,11 @@ kvm_ioctl(struct tcb *const tcp, const unsigned int code, const kernel_ulong_t a
>  	}
>  }
>  
> +void
> +kvm_run_structure_decoder_init(void)
> +{
> +	dump_kvm_run_structure_level = 1;
> +	mmap_cache_enable();
> +}
> +
>  #endif /* HAVE_LINUX_KVM_H */
> diff --git a/strace.c b/strace.c
> index 4b374857..db07b4f1 100644
> --- a/strace.c
> +++ b/strace.c
> @@ -256,6 +256,13 @@ Output format:\n\
>    -k             obtain stack trace between each syscall\n\
>  "
>  #endif
> +"\
> +"
> +#ifdef HAVE_LINUX_KVM_H
> +"\
> +  -K             show the exit reason of KVM run\n\
> +"
> +#endif
>  "\
>    -o file        send trace output to FILE instead of stderr\n\
>    -q             suppress messages about attaching, detaching, etc.\n\
> @@ -819,6 +826,10 @@ droptcb(struct tcb *tcp)
>  		unwind_tcb_fin(tcp);
>  #endif
>  
> +#ifdef HAVE_LINUX_KVM_H
> +	kvm_vcpu_info_free(tcp);
> +#endif
> +
>  	if (tcp->mmap_cache)
>  		tcp->mmap_cache->free_fn(tcp, __func__);
>  
> @@ -1572,6 +1583,9 @@ init(int argc, char *argv[])
>  {
>  	int c, i;
>  	int optF = 0;
> +#ifdef HAVE_LINUX_KVM_H
> +	int kvm_vcpu = 0;
> +#endif

Is it expected to have other values than 0 or 1?


-- 
ldv
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 801 bytes
Desc: not available
URL: <http://lists.strace.io/pipermail/strace-devel/attachments/20180514/eb9797d8/attachment.bin>


More information about the Strace-devel mailing list