[RFC PATCH 01/15] [wip] PID namespace translation support

Ákos Uzonyi uzonyi.akos at gmail.com
Mon Jun 1 11:44:30 UTC 2020


From: Eugene Syromyatnikov <evgsyr at gmail.com>

---
 Makefile.am   |   2 +
 btree.c       | 298 +++++++++++++++++++++++++
 btree.h       |  89 ++++++++
 defs.h        |  24 +-
 getpid.c      |  30 +++
 linux/dummy.h |   9 +-
 pidns.c       | 602 ++++++++++++++++++++++++++++++++++++++++++++++++++
 strace.c      |   7 +-
 syscall.c     |  15 ++
 9 files changed, 1067 insertions(+), 9 deletions(-)
 create mode 100644 btree.c
 create mode 100644 btree.h
 create mode 100644 getpid.c
 create mode 100644 pidns.c

diff --git a/Makefile.am b/Makefile.am
index f5447811..e9e537e5 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -135,6 +135,7 @@ libstrace_a_SOURCES =	\
 	getcpu.c	\
 	getcwd.c	\
 	getpagesize.c \
+	getpid.c	\
 	getrandom.c	\
 	hdio.c		\
 	hostname.c	\
@@ -233,6 +234,7 @@ libstrace_a_SOURCES =	\
 	personality.c	\
 	pidfd_getfd.c	\
 	pidfd_open.c	\
+	pidns.c		\
 	pkeys.c		\
 	poll.c		\
 	prctl.c		\
diff --git a/btree.c b/btree.c
new file mode 100644
index 00000000..ac279f82
--- /dev/null
+++ b/btree.c
@@ -0,0 +1,298 @@
+/* Simple B-tree implementation for key-value mapping storage */
+
+#include "defs.h"
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "btree.h"
+
+static const uint8_t ptr_sz_lg = (sizeof(uint64_t *) == 8 ? 3 : 2);
+
+bool
+btree_check(uint8_t item_size_lg, uint8_t ptr_block_size_lg,
+	    uint8_t data_block_size_lg, uint8_t key_size)
+{
+	if (item_size_lg > 6)
+		return false;
+	if (key_size < 1 || key_size > 64)
+		return false;
+	if (ptr_block_size_lg < 1 || ptr_block_size_lg > PTR_BLOCK_SIZE_LG_MAX)
+		return false;
+	if (data_block_size_lg > DATA_BLOCK_SIZE_LG_MAX ||
+	    data_block_size_lg < 3 ||
+	    item_size_lg > (data_block_size_lg + 3))
+		return false;
+
+	return true;
+}
+
+void
+btree_init(struct btree *b, uint8_t item_size_lg, uint8_t ptr_block_size_lg,
+	   uint8_t data_block_size_lg, uint8_t key_size, uint64_t set_value)
+{
+	assert(btree_check(item_size_lg, ptr_block_size_lg, data_block_size_lg,
+			   key_size));
+
+	b->set_value = set_value;
+	b->data = BTREE_UNSET;
+	b->item_size_lg = item_size_lg;
+	b->ptr_block_size_lg = ptr_block_size_lg;
+	b->data_block_size_lg = data_block_size_lg;
+	b->key_size = key_size;
+}
+
+static uint8_t
+btree_get_depth(struct btree *b)
+{
+	return (b->key_size - (b->data_block_size_lg + 3 - b->item_size_lg) +
+		b->ptr_block_size_lg - 1) / b->ptr_block_size_lg;
+}
+
+/**
+ * Returns lg2 of the block size, in bytes, for the given level of the B-tree.
+ * If max_depth is less than zero, it is calculated via btree_get_depth call.
+ */
+static uint8_t
+btree_get_block_size(struct btree *b, uint8_t depth, int max_depth)
+{
+	if (max_depth < 0)
+		max_depth = btree_get_depth(b);
+
+	/* Last level contains data and we allow it having a different size */
+	if (depth == max_depth)
+		return b->data_block_size_lg;
+	/* Lowest pointer level takes the remainder bits, so can be smaller */
+	if (depth == max_depth - 1)
+		return (b->key_size -
+			(b->data_block_size_lg + 3 - b->item_size_lg) +
+			b->ptr_block_size_lg - 1) %
+			b->ptr_block_size_lg + 1 + ptr_sz_lg;
+
+	return b->ptr_block_size_lg + ptr_sz_lg;
+}
+
+#define round_down(a, b) (((a) / (b)) * (b))
+
+/**
+ * Provides starting offset of bits in key corresponding to the block index
+ * at the specific level.
+ */
+static uint8_t
+btree_get_block_bit_offs(struct btree *b, uint8_t depth, int max_depth)
+{
+	uint8_t offs;
+
+	if (max_depth < 0)
+		max_depth = btree_get_depth(b);
+
+	if (depth == max_depth)
+		return 0;
+
+	offs = b->data_block_size_lg + 3 - b->item_size_lg;
+
+	if (depth == max_depth - 1)
+		return offs;
+
+	/* data_block_size + remainder */
+	offs = b->key_size - round_down(b->key_size - offs - 1,
+		b->ptr_block_size_lg);
+
+	return offs + (max_depth - depth - 2) * b->ptr_block_size_lg;
+}
+
+struct btree *
+btree_create(uint8_t item_size_lg, uint8_t ptr_block_size_lg,
+	     uint8_t data_block_size_lg, uint8_t key_size, uint64_t set_value)
+{
+	struct btree *b;
+
+	if (!btree_check(item_size_lg, ptr_block_size_lg, data_block_size_lg,
+	    key_size))
+		return NULL;
+
+	b = malloc(sizeof(*b));
+	if (!b)
+		return NULL;
+
+	btree_init(b, item_size_lg, ptr_block_size_lg, data_block_size_lg,
+		   key_size, set_value);
+
+	return b;
+}
+
+static uint64_t
+btree_filler(uint64_t val, uint8_t item_size)
+{
+	/* Mask narrow items only: a shift by the full 64-bit width is undefined */
+	if (item_size < 6)
+		val &= ((uint64_t) 1 << (1 << item_size)) - 1;
+	for (; item_size < 6; item_size++)
+		val |= val << (1 << item_size);
+	return val;
+}
+
+/* Returns the data block for key (allocating the path if auto_create), a */
+/* BTREE_SET/BTREE_UNSET marker if not allocated, or NULL for a bad key. */
+static uint64_t *
+btree_get_block(struct btree *b, uint64_t key, bool auto_create)
+{
+	uint64_t ***cur_block = &(b->data);
+	unsigned i;
+	uint8_t cur_depth;
+	uint8_t max_depth = btree_get_depth(b);
+	uint8_t sz;
+
+	if (b->key_size < 64 && key >= ((uint64_t) 1 << b->key_size))
+		return NULL;
+
+	for (cur_depth = 0; cur_depth <= max_depth; cur_depth++) {
+		sz = btree_get_block_size(b, cur_depth, max_depth);
+
+		if (*cur_block == BTREE_SET || *cur_block == BTREE_UNSET) {
+			uint64_t **old_val = *cur_block;
+
+			if (!auto_create)
+				return (uint64_t *) (*cur_block);
+
+			*cur_block = xcalloc(1 << sz, 1);
+
+			if (old_val == BTREE_SET) {
+				uint64_t filler = (cur_depth == max_depth) ?
+					btree_filler(b->set_value,
+						     b->item_size_lg) :
+					btree_filler((uintptr_t) BTREE_SET,
+						     ptr_sz_lg + 3);
+
+				for (i = 0; i < (1 << (sz - 3)); i++)
+					((uint64_t *) *cur_block)[i] = filler;
+			}
+		}
+
+		if (cur_depth < max_depth) {
+			size_t pos = (key >> btree_get_block_bit_offs(b,
+				cur_depth, max_depth)) & ((1 << (sz - ptr_sz_lg)) - 1);
+
+			cur_block = (uint64_t ***) ((*cur_block) + pos);
+		}
+	}
+
+	return (uint64_t *) (*cur_block);
+}
+
+bool
+btree_set(struct btree *b, uint64_t key, uint64_t val)
+{
+	uint64_t *data = btree_get_block(b, key, true);
+	/* The low data_block_size_lg + 3 - item_size_lg key bits select the item */
+	size_t mask = (1 << (b->data_block_size_lg + 3 - b->item_size_lg)) - 1;
+	size_t pos = (key & mask) >> (6 - b->item_size_lg);
+	if (!data)
+		return false;
+
+	if (b->item_size_lg == 6) {
+		data[pos] = val;
+	} else {
+		size_t offs = (key & ((1 << (6 - b->item_size_lg)) - 1)) <<
+			b->item_size_lg;
+		uint64_t item_mask =
+			(((uint64_t) 1 << (1 << b->item_size_lg)) - 1) << offs;
+
+		data[pos] &= ~item_mask;
+		data[pos] |= (val << offs) & item_mask;
+	}
+
+	return true;
+}
+
+#if 0
+int
+btree_mask_set(struct btree *b, uint64_t key, uint8_t mask_bits)
+{
+}
+
+/**
+ * Sets to 0 all keys with 0-ed bits of mask equivalent to corresponding bits in
+ * key.
+ */
+int
+btree_mask_unset(struct btree *b, uint64_t key, uint8_t mask_bits)
+{
+}
+
+int
+btree_interval_set(struct btree *b, uint64_t begin, uint64_t end, uint64_t val)
+{
+}
+
+uint64_t
+btree_get_next_set_key(struct btree *b, uint64_t key)
+{
+}
+
+uint64_t
+btree_iterate_set_keys(struct btree *b, uint64_t start, uint64_t end,
+		       btree_iterate_fn fn, void *fn_data)
+{
+}
+#endif
+
+uint64_t
+btree_get(struct btree *b, uint64_t key)
+{
+	uint64_t *data = btree_get_block(b, key, false);
+	size_t mask;
+	size_t pos;
+	size_t offs;
+
+	if (!data)
+		return 0;
+	if ((void *) data == (void *) BTREE_SET)
+		return b->set_value;
+	/* The low data_block_size_lg + 3 - item_size_lg key bits select the item */
+	mask = (1 << (b->data_block_size_lg + 3 - b->item_size_lg)) - 1;
+	pos = (key & mask) >> (6 - b->item_size_lg);
+
+	if (b->item_size_lg == 6)
+		return data[pos];
+
+	offs = (key & ((1 << (6 - b->item_size_lg)) - 1)) << b->item_size_lg;
+	return (data[pos] >> offs) &
+		(((uint64_t) 1 << (1 << b->item_size_lg)) - 1);
+}
+
+void
+btree_free_block(struct btree *b, uint64_t **block, uint8_t depth,
+		 int max_depth)
+{
+	size_t count;
+	size_t sz;
+	size_t i;
+
+	if (block == BTREE_SET || block == BTREE_UNSET)
+		return;
+	if (max_depth < 0)
+		max_depth = btree_get_depth(b);
+	if (depth >= max_depth)
+		goto free_block;
+
+	sz = 1 << (btree_get_block_size(b, depth, max_depth) - ptr_sz_lg);
+
+	for (i = 0; i < sz; i++)
+		if (((void *) block[i] != (void *) BTREE_SET) &&
+		    ((void *) block[i] != (void *) BTREE_UNSET))
+			btree_free_block(b, (uint64_t **) (block[i]), depth + 1,
+					 max_depth);
+
+free_block:
+	free(block);
+}
+
+void
+btree_free(struct btree *b)
+{
+	btree_free_block(b, b->data, 0, -1);
+	free(b);
+}
diff --git a/btree.h b/btree.h
new file mode 100644
index 00000000..bd416c87
--- /dev/null
+++ b/btree.h
@@ -0,0 +1,89 @@
+#ifndef STRACE_BTREE_H
+#define STRACE_BTREE_H
+
+/* Simple B-tree interface */
+
+#define BTREE_SET   ((uint64_t **) ~(intptr_t) 0)
+#define BTREE_UNSET ((uint64_t **) NULL)
+
+#define PTR_BLOCK_SIZE_LG_MAX   18
+#define DATA_BLOCK_SIZE_LG_MAX  20
+
+enum btree_iterate_flags {
+	BTREE_ITERATE_KEYS_SET   = 1 << 0,
+	BTREE_ITERATE_KEYS_UNSET = 1 << 1,
+};
+
+/**
+ * B-tree control structure.
+ * B-tree implemented here has the following properties:
+ *  * It allows storing values of the same size, the size can vary from 1 bit to
+ *    64 bit values (only power of 2 sizes are allowed).
+ *  * The key can be up to 64 bits in size.
+ *  * It has separate configuration for pointer block size and data block size.
+ *  * It can be used for mask storage - supports storing the flag that all keys
+ *    are set/unset in the middle tree layers. See also btree_mask_set() and
+ *    btree_mask_unset().
+ *
+ * How bits of key are used for different block levels:
+ *
+ *     highest bits                                         lowest bits
+ *     | ptr_block_size_lg | ... | < remainder > | data_block_size_lg |
+ *     \______________________________________________________________/
+ *                                 key_size
+ *
+ * So, the remainder is used on the lowest non-data node level.
+ *
+ * As of now, it doesn't implement any mechanisms for resizing/changing key
+ * size.  De-fragmentation is also unsupported currently.
+ */
+struct btree {
+	uint64_t set_value;         /**< Default set value */
+	uint64_t **data;
+	uint8_t item_size_lg;       /**< Item size log2, in bits, 0..6. */
+	/** Pointer block size log2, in pointers sizes. 8-14, usually. */
+	uint8_t ptr_block_size_lg;
+	/** Data block size log2, in bytes. 8-14, usually. */
+	uint8_t data_block_size_lg;
+	uint8_t key_size;           /**< Key size, in bits, 1..64. */
+};
+
+
+bool btree_check(uint8_t item_size_lg, uint8_t ptr_block_size_lg,
+		 uint8_t data_block_size_lg, uint8_t key_size);
+void btree_init(struct btree *b, uint8_t item_size_lg,
+		uint8_t ptr_block_size_lg, uint8_t data_block_size_lg,
+		uint8_t key_size, uint64_t set_value);
+struct btree * btree_create(uint8_t item_size_lg, uint8_t ptr_block_size_lg,
+			    uint8_t data_block_size_lg, uint8_t key_size,
+			    uint64_t set_value);
+
+bool btree_set(struct btree *b, uint64_t key, uint64_t val);
+#if 0
+/**
+ * Sets to the value b->set_value all keys with 0-ed bits of mask equivalent to
+ * corresponding bits in key.
+ */
+int btree_mask_set(struct btree *b, uint64_t key, uint8_t mask_bits);
+/**
+ * Sets to 0 all keys with 0-ed bits of mask equivalent to corresponding bits in
+ * key.
+ */
+int btree_mask_unset(struct btree *b, uint64_t key, uint8_t mask_bits);
+int btree_interval_set(struct btree *b, uint64_t begin, uint64_t end,
+		       uint64_t val);
+
+uint64_t btree_get_next_set_key(struct btree *b, uint64_t key);
+uint64_t btree_iterate_keys(struct btree *b, uint64_t start, uint64_t end,
+			    enum btree_iterate_flags flags, btree_iterate_fn fn,
+			    void *fn_data);
+#endif
+
+
+uint64_t btree_get(struct btree *b, uint64_t key);
+
+void btree_free_block(struct btree *b, uint64_t **block, uint8_t depth,
+		      int max_depth);
+void btree_free(struct btree *b);
+
+#endif /* !STRACE_BTREE_H */
diff --git a/defs.h b/defs.h
index 3aa07fb8..41416455 100644
--- a/defs.h
+++ b/defs.h
@@ -280,6 +280,9 @@ struct tcb {
 	struct timespec etime;	/* Syscall entry time (CLOCK_MONOTONIC) */
 	struct timespec delay_expiration_time; /* When does the delay end */
 
+	uint64_t pid_ns;
+	bool pid_ns_inited;
+
 	struct mmap_cache_t *mmap_cache;
 
 	/*
@@ -413,7 +416,11 @@ extern const struct xlat whence_codes[];
 # define RVAL_HEX	001	/* hex format */
 # define RVAL_OCTAL	002	/* octal format */
 # define RVAL_FD		010	/* file descriptor */
-# define RVAL_MASK	013	/* mask for these values */
+# define RVAL_TID	011	/* task ID */
+# define RVAL_SID	012	/* session ID */
+# define RVAL_TGID	013	/* thread group ID */
+# define RVAL_PGID	014	/* process group ID */
+# define RVAL_MASK	017	/* mask for these values */
 
 # define RVAL_STR	020	/* Print `auxstr' field after return val */
 # define RVAL_NONE	040	/* Print nothing */
@@ -428,6 +435,16 @@ extern const struct xlat whence_codes[];
 
 # define indirect_ipccall(tcp) (tcp_sysent(tcp)->sys_flags & TRACE_INDIRECT_SUBCALL)
 
+enum pid_type {
+	PT_TID,
+	PT_TGID,
+	PT_PGID,
+	PT_SID,
+
+	PT_COUNT,
+	PT_NONE = -1
+};
+
 enum sock_proto {
 	SOCK_PROTO_UNKNOWN,
 	SOCK_PROTO_UNIX,
@@ -469,6 +486,7 @@ extern int Tflag_scale;
 extern int Tflag_width;
 extern bool iflag;
 extern bool count_wallclock;
+extern unsigned int perform_ns_resolution;
 /* are we filtering traces based on paths? */
 extern struct path_set {
 	const char **paths_selected;
@@ -984,6 +1002,9 @@ print_local_array_ex(struct tcb *tcp,
 extern kernel_ulong_t *
 fetch_indirect_syscall_args(struct tcb *, kernel_ulong_t addr, unsigned int n_args);
 
+extern int find_pid(struct tcb *tcp, int dest_id, enum pid_type type,
+		    int *proc_pid_ptr);
+
 extern void
 dumpiov_in_msghdr(struct tcb *, kernel_ulong_t addr, kernel_ulong_t data_size);
 
@@ -1059,6 +1080,7 @@ printfd(struct tcb *tcp, int fd)
  * of the tracee the descriptor tcp).  This is a stub.
  */
 extern void printfd_pid_tracee_ns(struct tcb *tcp, pid_t pid, int fd);
+extern void printpid(struct tcb *tcp, int pid, enum pid_type type);
 extern void print_sockaddr(const void *sa, int len);
 extern bool
 print_inet_addr(int af, const void *addr, unsigned int len, const char *var_name);
diff --git a/getpid.c b/getpid.c
new file mode 100644
index 00000000..2afaa997
--- /dev/null
+++ b/getpid.c
@@ -0,0 +1,30 @@
+/*#include "defs.h"
+
+SYS_FUNC(getpid)
+{
+	return RVAL_DECODED | RVAL_TGID;
+}
+
+SYS_FUNC(gettid)
+{
+	return RVAL_DECODED | RVAL_TID;
+}
+
+SYS_FUNC(getpgrp)
+{
+	return RVAL_DECODED | RVAL_PGID;
+}
+
+SYS_FUNC(getpgid)
+{
+	printpid(tcp, tcp->u_arg[0], PT_TGID);
+
+	return RVAL_DECODED | RVAL_PGID;
+}
+
+SYS_FUNC(getsid)
+{
+	printpid(tcp, tcp->u_arg[0], PT_TGID);
+
+	return RVAL_DECODED | RVAL_SID;
+}*/
diff --git a/linux/dummy.h b/linux/dummy.h
index 2f859a60..c6921cb2 100644
--- a/linux/dummy.h
+++ b/linux/dummy.h
@@ -53,9 +53,10 @@
 # define sys_getgid		sys_getuid
 # define sys_getgid16		sys_getuid16
 # define sys_getpeername		sys_getsockname
+# define sys_getppid		sys_getpid
 # define sys_getresgid		sys_getresuid
 # define sys_getresgid16		sys_getresuid16
-# define sys_lstat		sys_stat
+# define sys_lstat		sys_statq
 # define sys_lstat64		sys_stat64
 # define sys_mkdir		sys_chmod
 # define sys_mkdirat		sys_fchmodat
@@ -87,10 +88,6 @@
 # define sys_vfork		sys_fork
 
 /* printargs does the right thing */
-# define sys_getpgrp		printargs
-# define sys_getpid		printargs
-# define sys_getppid		printargs
-# define sys_gettid		printargs
 # define sys_idle		printargs
 # define sys_munlockall		printargs
 # define sys_pause		printargs
@@ -111,8 +108,6 @@
 # define sys_getpgid		printargs_d
 # define sys_getsid		printargs_d
 # define sys_nice		printargs_d
-# define sys_setpgid		printargs_d
-# define sys_setpgrp		printargs_d
 # define sys_timer_delete	printargs_d
 # define sys_timer_getoverrun	printargs_d
 
diff --git a/pidns.c b/pidns.c
new file mode 100644
index 00000000..cd5dbbb2
--- /dev/null
+++ b/pidns.c
@@ -0,0 +1,602 @@
+#if 0
+#include "defs.h"
+
+
+#include <dirent.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <asm/unistd.h>
+
+#include <sys/ioctl.h>
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+//#include "btree.h"
+#include "nsfs.h"
+#include "xmalloc.h"
+
+/* key - NS ID, value - parent NS ID. */
+// struct btree *ns_hierarchy;
+/*
+ * key - NS ID, value - struct btree * with PID tree;
+ * PID tree has PID in NS as a key and PID in parent NS as a value.
+ */
+// struct btree *ns_pid_tree;
+
+static const char tid_str[]  = "NSpid:\t";
+static const char tgid_str[] = "NStgid:\t";
+static const char pgid_str[] = "NSpgid:\t";
+static const char sid_str[]  = "NSsid:\t";
+
+static const struct {
+	const char *str;
+	size_t size;
+} id_strs[PT_COUNT] = {
+	[PT_TID] =  { tid_str,  sizeof(tid_str)  },
+	[PT_TGID] = { tgid_str, sizeof(tgid_str) },
+	[PT_PGID] = { pgid_str, sizeof(pgid_str) },
+	[PT_SID] =  { sid_str,  sizeof(sid_str)  },
+};
+
+
+/**
+ * Limit on PID NS hierarchy depth, imposed since Linux 3.7. NS traversal
+ * is not possible before Linux 4.9, so we consider this limit pretty universal.
+ */
+#define MAX_NS_DEPTH 32
+
+struct proc_data {
+	int proc_pid;
+	short ns_count;
+	short refcount;
+	uint64_t ns_hierarchy[MAX_NS_DEPTH];
+	int id_count[PT_COUNT];
+	int *id_hierarchy[PT_COUNT];
+};
+
+/**
+ * Helper function, converts pid to string, or to "self" for pid == 0.
+ * Uses static buffer for operation.
+ */
+static const char *
+pid_to_str(pid_t pid)
+{
+	static char buf[sizeof("-2147483648")];
+	ssize_t ret;
+
+	if (!pid)
+		return "self";
+
+	ret = snprintf(buf, sizeof(buf), "%d", pid);
+
+	if ((ret < 0) || ((size_t) ret >= sizeof(buf)))
+		perror_msg_and_die("pid_to_str: snprintf");
+
+	return buf;
+}
+
+/**
+ * Returns a list of PID NS IDs for the specified PID.
+ *
+ * @param proc_pid PID (as present in /proc) to get information for.
+ * @param ns_buf   Pointer to buffer that is able to contain at least
+ *                 MAX_NS_DEPTH items.
+ * @param last     ID of NS on which ascension can be interrupted.
+ *                 0 for no interruption.
+ * @return         Amount of NS in list. 0 indicates error, MAX_NS_DEPTH + 1
+ *                 indicates that ascension limit hasn't been reached (only
+ *                 MAX_NS_DEPTH values have been written to the array, however).
+ */
+static size_t
+get_ns_hierarchy(int proc_pid, uint64_t *ns_buf, size_t ns_buf_size,
+		 uint64_t last)
+{
+	char path[PATH_MAX + 1];
+	struct stat st;
+	ssize_t ret;
+	size_t n = 0;
+	int fd;
+	int parent_fd;
+
+	ret = snprintf(path, sizeof(path), "/proc/%s/ns/pid",
+		       pid_to_str(proc_pid));
+
+	if ((ret < 0) || ((size_t) ret >= sizeof(path)))
+		return 0;
+
+	fd = open(path, O_RDONLY | O_NONBLOCK);
+	if (fd < 0)
+		return 0;
+
+	while (1) {
+		ret = fstat(fd, &st);
+		if (ret)
+			break;
+
+		/* 32 is the hierarchy depth on modern Linux */
+		if ((n >= MAX_NS_DEPTH) || (n >= ns_buf_size)) {
+			n++;
+			break;
+		}
+
+		ns_buf[n] = st.st_ino;
+		if (debug_flag)
+			error_msg("Got NS: %" PRIu64, ns_buf[n]);
+
+		n++;
+
+		if (!last || ns_buf[n - 1] == last)
+			break;
+
+		parent_fd = ioctl(fd, NS_GET_PARENT);
+		if (parent_fd == -1) {
+			switch (errno) {
+			case EPERM:
+				if (debug_flag)
+					error_msg("Terminating NS ascending "
+						  "after %zu levels on NS %"
+						  PRIu64, n, ns_buf[n - 1]);
+				break;
+
+			case ENOTTY:
+				error_msg("NS_* ioctl commands are not "
+					  "supported by the kernel");
+				break;
+			default:
+				perror_msg("get_ns_hierarchy: "
+					   "ioctl(NS_GET_PARENT)");
+				break;
+			}
+
+			break;
+		}
+
+		close(fd);
+		fd = parent_fd;
+	}
+
+	//update_ns_hierarchy 
+
+	//parent_fd = ge
+
+	close(fd);
+
+	return n;
+}
+
+/**
+ * Get list of IDs present in NS* proc status record. IDs are placed as they are
+ * stored in /proc (from top to bottom of NS hierarchy).
+ *
+ * @param proc_pid    PID (as present in /proc) to get information for.
+ * @param id_buf      Pointer to buffer that is able to contain at least
+ *                    MAX_NS_DEPTH items. Can be NULL.
+ * @param type        Type of ID requested.
+ * @return            Number of items stored in id_list. 0 indicates error,
+ *                    MAX_NS_DEPTH + 1 indicates that status record contains
+ *                    more than MAX_NS_DEPTH records and the id_buf provided
+ *                    is unusable.
+ */
+static size_t
+get_id_list(int proc_pid, int *id_buf, enum pid_type type)
+{
+	const char *ns_str = id_strs[type].str;
+	size_t ns_str_size = id_strs[type].size;
+	char *buf;
+	char *p;
+	char *endp;
+	FILE *f;
+	size_t idx;
+	ssize_t ret;
+
+	ret = asprintf(&buf, "/proc/%s/status", pid_to_str(proc_pid));
+
+	if (ret < 0)
+		return 0;
+
+	f = fopen(buf, "r");
+	free(buf);
+
+	if (!f)
+		return 0;
+
+	while (fscanf(f, "%m[^\n]", &buf) == 1) {
+		if (strncmp(buf, ns_str, ns_str_size)) {
+			free(buf);
+			continue;
+		}
+
+		p = buf + ns_str_size;
+
+		for (idx = 0; idx < MAX_NS_DEPTH; idx++) {
+			errno = 0;
+			ret = strtol(p, &endp, 10);
+
+			if (errno && (p[0] != '\t'))
+				return 0;
+
+			if (debug_flag)
+				error_msg("PID %d: %s[%zu]: %zd",
+					  proc_pid, ns_str, idx, ret);
+
+			if (id_buf)
+				id_buf[idx] = ret;
+
+			strsep(&p, "\t");
+
+			/* In order to distinguish MAX_NS_DEPTH items */
+			if (!p)
+				break;
+		}
+	}
+
+	free(buf);
+
+	return idx + 1;
+}
+
+static bool
+is_proc_ours(void)
+{
+	static int cached_val = -1;
+
+	if (cached_val < 0)
+		cached_val = get_id_list(0, NULL, PT_TID) == 1;
+
+	return cached_val;
+}
+
+static uint64_t
+get_ns(struct tcb *tcp)
+{
+	if (!tcp->pid_ns_inited) {
+		int pid = tcp->pid;
+
+		if (!is_proc_ours())
+			if (find_pid(NULL, tcp->pid, PT_TID, &pid) < 1)
+				pid = -1;
+
+		if ((pid == -1) || !get_ns_hierarchy(pid, &tcp->pid_ns, 1, 0))
+			tcp->pid_ns = -1ULL;
+
+		tcp->pid_ns_inited = true;
+	}
+
+	return tcp->pid_ns;
+}
+
+static uint64_t
+get_our_ns(void)
+{
+	static uint64_t our_ns = 0;
+	static bool our_ns_initialised = false;
+
+	if (!our_ns_initialised) {
+		uint64_t ns_buf[MAX_NS_DEPTH];
+		size_t ret;
+
+		if (!(ret = get_ns_hierarchy(0, ns_buf, ARRAY_SIZE(ns_buf), 0)))
+			our_ns = -1ULL;
+		else
+			our_ns = ns_buf[0];
+
+		our_ns_initialised = true;
+	}
+
+	return our_ns;
+}
+
+
+/**
+ * Returns ID in our NS. If orig_ns_id is provided, also returns ID in orig_ns.
+ */
+/* static int
+dens_id(int proc_pid,
+	uint64_t *ns_buf, size_t ns_count,
+	int *id_buf, size_t id_count,
+	uint64_t orig_ns, uint64_t our_ns, int *orig_ns_id)
+{
+	bool orig_idx_found = false;
+	size_t idx;
+
+	if (!ns_count || (ns_count > MAX_NS_DEPTH) ||
+	    !id_count || (id_count > MAX_NS_DEPTH))
+		return -1;
+
+	if (is_proc_ours()) {
+	}
+
+	for (idx = 0; idx < ns_count; idx++) {
+		if (ns_buf[idx] != orig_ns)
+			continue;
+
+		orig_idx = idx;
+		orig_idx_found = true;
+		break;
+	}
+
+	if (!orig_idx_found) {
+		free(ns_buf);
+
+		return -1;
+	}
+
+} */
+
+/**
+ * Checks whether the proc data record is up to date; updates it if it is not.
+ * Automatically removes invalid entries if found.
+ *
+ * -1 - error
+ *  0 - cache is invalid
+ *  1 - cache is valid
+ *  2 - only NS cache is valid
+ */
+static int
+check_proc_data_validity(struct proc_data *pd, enum pid_type type)
+{
+	/* ns_cnt = get_ns_hierarchy(proc_pid, &ns_buf, our_ns);
+	if (!ns_cnt || (ns_cnt >= MAX_NS_DEPTH) ||
+	    (ns_buf[ns_cnt - 1] != our_ns)) */
+	return 0;
+
+}
+
+static struct proc_data *
+get_proc_data(int proc_pid)
+{
+	struct proc_data *pd = calloc(1, sizeof(*pd));
+
+	if (!pd)
+		return NULL;
+
+	pd->proc_pid = proc_pid;
+
+	return pd;
+}
+
+static struct proc_data *
+find_proc_data(int id, uint64_t ns, enum pid_type type)
+{
+	return NULL;
+}
+
+static void
+put_proc_data(struct proc_data *pd)
+{
+	free(pd);
+}
+
+static void
+update_proc_data_cache(struct proc_data *pd, enum pid_type type)
+{
+}
+
+/**
+ * Removes references to the proc_data entry from all caches.
+ */
+static void
+invalidate_proc_data(struct proc_data *pd)
+{
+}
+
+/**
+ * Caches:
+ *  * tidns:ns -> tid in our ns
+ *   * How to check validity: get cached proc path, with additional check for
+ *     ns and that it also has tidns at the relevant level in NSpid
+ *  * tid (in our ns) -> proc_tid
+ *   * How to check validity: open cached /proc/pid/status and check relevant
+ *     NSpid record, check that /proc/pid/ns/pid is accessible [and leads to our
+ *     ns]
+ *
+ *  Tracees have fixed pid ns.
+ */
+
+/**
+ * tcp == NULL - strace's view
+ * dest_pid == 0 - use the data from tcb
+ */
+int
+find_pid(struct tcb *tcp, int dest_id, enum pid_type type, int *proc_pid_ptr)
+{
+	static long name_max = -1;
+
+	const uint64_t our_ns = get_our_ns();
+	uint64_t dest_ns;
+
+	struct proc_data *pd;
+	int pd_valid = 0;
+
+	DIR *dp = NULL;
+	struct dirent *entry;
+	struct dirent *entry_buf;
+	struct dirent *entry_ret;
+	const char *id_str;
+	size_t idx;
+	size_t entry_size;
+	long proc_pid = -1;
+	int ret;
+	int res = -1;
+
+	if ((type >= PT_COUNT) || (type < 0))
+		goto find_pid_exit;
+
+	if (is_proc_ours() && (!tcp || get_ns(tcp) == our_ns)) {
+		if (proc_pid_ptr)
+			*proc_pid_ptr =
+				dest_id ? dest_id : syscall(__NR_gettid);
+
+		if (dest_id) {
+			return dest_id;
+
+		switch (type) {
+		case PT_TID:	return syscall(__NR_gettid);
+		case PT_TGID:	return getpid();
+		case PT_PGID:	return getpgrp();
+		case PT_SID:	return getsid(getpid());
+		default:	return -1;
+		}
+	}
+
+	dest_ns = tcp ? get_ns(tcp) : our_ns;
+
+	pd = find_proc_data(dest_id, dest_ns, type);
+	if (pd) {
+		pd_valid = check_proc_data_validity(pd, type);
+		if (pd_valid == -1)
+			goto find_pid_pd;
+		if (pd_valid == 0)
+			put_proc_data(pd);
+		if (pd_valid == 2)
+			goto find_pid_get_ids;
+	}
+
+	if (pd_valid)
+		goto find_pid_get_pid;
+
+	dp = opendir("/proc");
+	if (!dp)
+		goto find_pid_pd;
+
+
+	if (name_max == -1) {
+		name_max = pathconf("/proc", _PC_NAME_MAX);
+		if (name_max == -1)
+			name_max = 255;
+	}
+
+	entry_size = offsetof(struct dirent, d_name) + name_max + 1;
+	entry_buf = malloc(entry_size);
+	if (!entry_buf)
+		goto find_pid_dir;
+
+	do {
+		ret = readdir_r(dp, entry_buf, &entry);
+		if (ret) {
+			perror_msg("find_pid: readdir");
+			goto find_pid_entry;
+		}
+
+		if (!entry)
+			goto find_pid_entry;
+
+		if (entry->d_type != DT_DIR)
+			continue;
+
+		errno = 0;
+		proc_pid = strtol(entry->d_name, NULL, 10);
+		if (errno)
+			continue;
+		if ((proc_pid < 1) || (proc_pid > INT_MAX))
+			continue;
+
+		pd = get_proc_data(proc_pid);
+		pd_valid = check_proc_data_validity(pd, type);
+		if (pd_valid == -1)
+			goto find_pid_entry;
+		if (pd_valid == 1)
+			goto find_pid_get_pid;
+		if (pd_valid == 0)
+			pd->ns_count = get_ns_hierarchy(proc_pid,
+				pd->ns_hierarchy, ARRAY_SIZE(pd->ns_hierarchy),
+				our_ns);
+find_pid_get_ids:
+		if (!pd->id_hierarchy[type])
+			pd->id_hierarchy[type] = calloc(MAX_NS_DEPTH,
+				sizeof(pd->id_hierarchy[type][0]));
+		if (!pd->id_hierarchy[type])
+			goto find_pid_entry;
+
+		pd->id_count[type] = get_id_list(proc_pid,
+			pd->id_hierarchy[type], type);
+
+		update_proc_data_cache(pd, type);
+
+find_pid_get_pid:
+		if (!pd->ns_count || (pd->ns_count > pd->id_count[type])) {
+			continue;
+		}
+
+		if (pd->ns_hierarchy[pd->ns_count - 1] != dest_ns)
+			continue;
+
+		if (dest_ns == our_ns) {
+			if (pd->id_hierarchy[type][pd->id_count[type] -
+			    pd->ns_count] == dest_id) {
+				res = dest_id;
+				goto find_pid_entry;
+			}
+		} else {
+			for (idx = 0; idx < pd->ns_count - 1; idx++) {
+				if (pr->ns_hierarchy[idx] != dest_ns)
+					continue;
+				if (pr->id_hierarchy[type][pd->id_count[type] -
+				    idx + 1] != dest_id)
+					break;
+
+				res = pd->id_hierarchy[type][pd->id_count[type] -
+							     pd->ns_count]
+
+				goto find_pid_entry;
+			}
+		}
+
+		put_proc_data(pd);
+	} while (1)
+
+find_pid_entry:
+	free(entry_buf);
+find_pid_dir:
+	closedir(dp);
+find_pid_pd:
+	put_proc_data(pd);
+
+find_pid_exit:
+	if (proc_pid_ptr)
+		*proc_pid_ptr = proc_pid;
+
+	return res;
+}
+
+int
+get_proc_pid(struct tcb *tcp)
+{
+	if (!is_proc_ours()) {
+		int ret;
+
+		/* find_pid() stores the matching /proc entry's PID in ret */
+		if (find_pid(NULL, tcp->pid, PT_TID, &ret) < 0)
+			return -1;
+		return ret;
+	}
+
+	return tcp->pid;
+}
+
+/* To be called on tracee exits or other clear indications that pid is no more
+ * relevant */
+void
+clear_proc_pid(struct tcb *tcp, int pid)
+{
+}
+
+void
+printpid(struct tcb *tcp, int pid, enum pid_type type)
+{
+	int strace_pid;
+
+	tprintf("%d", pid);
+
+	if (perform_ns_resolution) {
+		/* Look the printed ID up to get its value in strace's PID NS */
+		strace_pid = find_pid(tcp, pid, type, NULL);
+		if ((strace_pid > 0) && (pid != strace_pid))
+			tprintf_comment("%d in strace's PID NS", strace_pid);
+	}
+}
+#endif
diff --git a/strace.c b/strace.c
index 311e4d62..d32d6487 100644
--- a/strace.c
+++ b/strace.c
@@ -133,6 +133,8 @@ static unsigned int daemonized_tracer;
 static int post_attach_sigstop = TCB_IGNORE_ONE_SIGSTOP;
 #define use_seize (post_attach_sigstop == 0)
 
+unsigned int perform_ns_resolution;
+
 static bool detach_on_execve;
 
 static int exit_code;
@@ -2013,7 +2015,7 @@ init(int argc, char *argv[])
 	qualify_signals("all");
 
 	static const char optstring[] =
-		"+a:Ab:cCdDe:E:fFhiI:ko:O:p:P:qrs:S:tTu:U:vVwxX:yzZ";
+		"+a:Ab:cCdDe:E:fFhiI:ko:O:p:P:qrs:S:tTu:U:vVwxX:yYzZ";
 
 	enum {
 		GETOPT_SECCOMP = 0x100,
@@ -2285,6 +2287,9 @@ init(int argc, char *argv[])
 		case 'y':
 			yflag_short++;
 			break;
+		case 'Y':
+			perform_ns_resolution++;
+			break;
 		case 'z':
 			clear_number_set_array(status_set, 1);
 			add_number_to_set(STATUS_SUCCESSFUL, status_set);
diff --git a/syscall.c b/syscall.c
index 022cab5b..e0433e56 100644
--- a/syscall.c
+++ b/syscall.c
@@ -930,6 +930,21 @@ syscall_exiting_trace(struct tcb *tcp, struct timespec *ts, int res)
 					tprintf("= %" PRI_kld, tcp->u_rval);
 				}
 				break;
+			case RVAL_TID:
+			case RVAL_SID:
+			case RVAL_TGID:
+			case RVAL_PGID: {
+				#define _(_t) [RVAL_##_t - RVAL_TID] = PT_##_t
+				static const enum pid_type types[] = {
+					_(TID), _(SID), _(TGID), _(PGID),
+				};
+				#undef _
+
+				tprints("= ");
+				printpid(tcp, tcp->u_rval,
+					 types[(sys_res & RVAL_MASK) - RVAL_TID]);
+				break;
+			}
 			default:
 				error_msg("invalid rval format");
 				break;
-- 
2.26.2



More information about the Strace-devel mailing list