[RFC PATCH v3 07/14] PID namespace translation support
Ákos Uzonyi
uzonyi.akos at gmail.com
Sat Jun 13 11:25:29 UTC 2020
Co-Authored-by: Eugene Syromyatnikov <evgsyr at gmail.com>
---
Makefile.am | 2 +
btree.c | 344 +++++++++++++++++++++++++++
btree.h | 90 +++++++
defs.h | 26 ++-
linux/dummy.h | 7 +-
pidns.c | 633 ++++++++++++++++++++++++++++++++++++++++++++++++++
strace.c | 8 +-
syscall.c | 15 ++
util.c | 8 +-
9 files changed, 1119 insertions(+), 14 deletions(-)
create mode 100644 btree.c
create mode 100644 btree.h
create mode 100644 pidns.c
diff --git a/Makefile.am b/Makefile.am
index f5447811..2717d292 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -68,6 +68,7 @@ libstrace_a_SOURCES = \
bpf_fprog.h \
bpf_seccomp_filter.c \
bpf_sock_filter.c \
+ btree.c \
btrfs.c \
cacheflush.c \
capability.c \
@@ -233,6 +234,7 @@ libstrace_a_SOURCES = \
personality.c \
pidfd_getfd.c \
pidfd_open.c \
+ pidns.c \
pkeys.c \
poll.c \
prctl.c \
diff --git a/btree.c b/btree.c
new file mode 100644
index 00000000..dd64e212
--- /dev/null
+++ b/btree.c
@@ -0,0 +1,344 @@
+/* Simple B-tree implementation for key-value mapping storage */
+
+#include "defs.h"
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "btree.h"
+
+/* log2 of the pointer size in bits: 6 when pointers are 8 bytes, 5 otherwise */
+static const uint8_t ptr_sz_lg = (sizeof(uint64_t *) == 8 ? 6 : 5);
+
+/**
+ * Checks whether a set of B-tree parameters is acceptable.
+ *
+ * @param item_size_lg       log2 of the item size in bits, at most 6.
+ * @param ptr_block_size_lg  log2 of the pointer block size, between ptr_sz_lg
+ *                           and PTR_BLOCK_SIZE_LG_MAX.
+ * @param data_block_size_lg log2 of the data block size, between 6 and
+ *                           DATA_BLOCK_SIZE_LG_MAX, not less than item_size_lg.
+ * @param key_size           Key size in bits, 1..64.
+ * @return true if every parameter is within its limits.
+ */
+bool
+btree_check(uint8_t item_size_lg, uint8_t ptr_block_size_lg,
+	    uint8_t data_block_size_lg, uint8_t key_size)
+{
+	const bool item_ok = item_size_lg <= 6;
+	const bool key_ok = key_size >= 1 && key_size <= 64;
+	const bool ptr_block_ok = ptr_block_size_lg >= ptr_sz_lg &&
+		ptr_block_size_lg <= PTR_BLOCK_SIZE_LG_MAX;
+	const bool data_block_ok = data_block_size_lg >= 6 &&
+		data_block_size_lg <= DATA_BLOCK_SIZE_LG_MAX &&
+		item_size_lg <= data_block_size_lg;
+
+	return item_ok && key_ok && ptr_block_ok && data_block_ok;
+}
+
+/**
+ * Initialises an already allocated B-tree control structure as an empty tree.
+ * The parameters must satisfy btree_check(); this is enforced with assert().
+ */
+void
+btree_init(struct btree *b, uint8_t item_size_lg, uint8_t ptr_block_size_lg,
+	   uint8_t data_block_size_lg, uint8_t key_size, uint64_t set_value)
+{
+	assert(btree_check(item_size_lg, ptr_block_size_lg, data_block_size_lg,
+			   key_size));
+
+	*b = (struct btree) {
+		.set_value = set_value,
+		.data = BTREE_UNSET,
+		.item_size_lg = item_size_lg,
+		.ptr_block_size_lg = ptr_block_size_lg,
+		.data_block_size_lg = data_block_size_lg,
+		.key_size = key_size,
+	};
+}
+
+/**
+ * Returns the depth of the data level: the number of pointer levels needed to
+ * index the key bits that are not resolved inside a data block (rounded up).
+ */
+static uint8_t
+btree_get_depth(struct btree *b)
+{
+	/* Key bits resolved inside a data block */
+	int data_key_bits = b->data_block_size_lg - b->item_size_lg;
+	/* Key bits resolved by one full pointer block */
+	int ptr_key_bits = b->ptr_block_size_lg - ptr_sz_lg;
+
+	return (b->key_size - data_key_bits + ptr_key_bits - 1) / ptr_key_bits;
+}
+
+/**
+ * Returns lg2 of block size (in bits) for the specific level of B-tree.
+ *
+ * @param b         The B-tree.
+ * @param depth     Level of the block; 0 is the root level.
+ * @param max_depth Depth of the data level. If the value provided is less than
+ *                  zero, it is calculated via btree_get_depth call.
+ */
+static uint8_t
+btree_get_block_size(struct btree *b, uint8_t depth, int max_depth)
+{
+ if (max_depth < 0)
+ max_depth = btree_get_depth(b);
+
+ /* Last level contains data and we allow it having a different size */
+ if (depth == max_depth)
+ return b->data_block_size_lg;
+ /*
+  * The pointer level right above the data level indexes only the key
+  * bits left over by the other levels (1..ptr_block_size_lg - ptr_sz_lg
+  * of them), so it can be smaller than a regular pointer block.
+  */
+ if (depth == max_depth - 1)
+ return (b->key_size -
+ (b->data_block_size_lg - b->item_size_lg) - 1) %
+ (b->ptr_block_size_lg - ptr_sz_lg) + 1 + ptr_sz_lg;
+
+ return b->ptr_block_size_lg;
+}
+
+#define round_down(a, b) (((a) / (b)) * (b))
+
+/**
+ * Provides starting offset of bits in key corresponding to the block index
+ * at the specific level.
+ *
+ * @param b         The B-tree.
+ * @param depth     Level of the block; 0 is the root level.
+ * @param max_depth Depth of the data level. If the value provided is less than
+ *                  zero, it is calculated via btree_get_depth call.
+ * @return Number of key bits located below the bits that index a block of
+ *         this level; 0 for the data level.
+ */
+static uint8_t
+btree_get_block_bit_offs(struct btree *b, uint8_t depth, int max_depth)
+{
+ uint8_t offs;
+
+ if (max_depth < 0)
+ max_depth = btree_get_depth(b);
+
+ if (depth == max_depth)
+ return 0;
+
+ /* Key bits consumed by the data level */
+ offs = b->data_block_size_lg - b->item_size_lg;
+
+ if (depth == max_depth - 1)
+ return offs;
+
+ /* data_block_size + remainder */
+ offs += btree_get_block_size(b, max_depth - 1, max_depth) - ptr_sz_lg;
+ /* plus the full-sized pointer levels located between this one and the remainder level */
+ offs += (max_depth - depth - 2) * (b->ptr_block_size_lg - ptr_sz_lg);
+
+ return offs;
+}
+
+/**
+ * Allocates and initialises a new empty B-tree.
+ *
+ * @return Pointer to the new tree, or NULL if the parameters do not pass
+ *         btree_check() or the allocation fails.
+ */
+struct btree *
+btree_create(uint8_t item_size_lg, uint8_t ptr_block_size_lg,
+	     uint8_t data_block_size_lg, uint8_t key_size, uint64_t set_value)
+{
+	struct btree *b = NULL;
+
+	if (btree_check(item_size_lg, ptr_block_size_lg, data_block_size_lg,
+			key_size))
+		b = malloc(sizeof(*b));
+
+	if (b)
+		btree_init(b, item_size_lg, ptr_block_size_lg,
+			   data_block_size_lg, key_size, set_value);
+
+	return b;
+}
+
+/**
+ * Builds a 64-bit word in which every item-sized slot holds val.
+ *
+ * @param val       Value to replicate; only its lowest (1 << item_size) bits
+ *                  are used.
+ * @param item_size log2 of the item size in bits (0..6).
+ * @return The 64-bit fill pattern.
+ */
+static uint64_t
+btree_filler(uint64_t val, uint8_t item_size)
+{
+	/* A 64-bit item occupies the whole word, nothing to mask or replicate */
+	if (item_size >= 6)
+		return val;
+
+	/*
+	 * The shift has to be done in 64 bits: for item_size == 5 an int
+	 * shift by 32 is undefined behaviour.
+	 */
+	val &= ((uint64_t) 1 << (1 << item_size)) - 1;
+
+	/* Double the filled part of the word until all 64 bits are covered */
+	for (; item_size < 6; item_size++)
+		val |= val << (1 << item_size);
+
+	return val;
+}
+
+/**
+ * Walks the tree from the root down to the data block that holds key.
+ *
+ * @param b           The B-tree.
+ * @param key         Key to look up; valid keys are 0 .. 2^key_size - 1.
+ * @param auto_create If true, blocks missing on the path are allocated
+ *                    (and pre-filled when they replace a BTREE_SET marker).
+ *                    If false, the walk stops at the first marker.
+ * @return Pointer to the data block; BTREE_SET or BTREE_UNSET (i.e. NULL) if
+ *         a marker has been met and auto_create is false; NULL if key is out
+ *         of range.
+ */
+static uint64_t *
+btree_get_block(struct btree *b, uint64_t key, bool auto_create)
+{
+	void **cur_block = &(b->data);
+	uint8_t max_depth;
+
+	/* 2^key_size itself is already out of range, hence ">=" */
+	if (b->key_size < 64 && key >= (uint64_t) 1 << b->key_size)
+		return NULL;
+
+	max_depth = btree_get_depth(b);
+
+	for (uint8_t cur_depth = 0; cur_depth <= max_depth; cur_depth++) {
+		uint8_t sz = btree_get_block_size(b, cur_depth, max_depth);
+
+		if (*cur_block == BTREE_SET || *cur_block == BTREE_UNSET) {
+			void *old_val = *cur_block;
+			/* sz is lg2 of the block size in bits: 2^(sz - 6) 64-bit words */
+			size_t words = sz > 6 ? (size_t) 1 << (sz - 6) : 1;
+
+			if (!auto_create)
+				return (uint64_t *) (*cur_block);
+
+			*cur_block = xcalloc(words, sizeof(uint64_t));
+
+			if (old_val == BTREE_SET) {
+				/*
+				 * The block replaces an "everything is set"
+				 * marker: data blocks get the set value in
+				 * every item, pointer blocks get the marker
+				 * in every slot.
+				 */
+				uint64_t fill_value = cur_depth == max_depth ?
+					b->set_value : (uintptr_t) BTREE_SET;
+				uint8_t fill_size = cur_depth == max_depth ?
+					b->item_size_lg : ptr_sz_lg;
+				uint64_t filler = btree_filler(fill_value, fill_size);
+
+				for (size_t i = 0; i < words; i++)
+					((uint64_t *) *cur_block)[i] = filler;
+			}
+		}
+
+		if (cur_depth < max_depth) {
+			/* Index of the child in the pointer block */
+			size_t pos = (key >> btree_get_block_bit_offs(b,
+				cur_depth, max_depth)) & ((1 << (sz - ptr_sz_lg)) - 1);
+
+			cur_block = (((void **) (*cur_block)) + pos);
+		}
+	}
+
+	return (uint64_t *) (*cur_block);
+}
+
+/**
+ * Stores val for key, allocating the blocks on the path as needed.
+ *
+ * @return true on success, false if no data block could be obtained for key.
+ */
+bool
+btree_set(struct btree *b, uint64_t key, uint64_t val)
+{
+	uint64_t *block = btree_get_block(b, key, true);
+
+	if (!block)
+		return false;
+
+	const uint8_t item_lg = b->item_size_lg;
+	/* Part of the key that selects the item inside the data block */
+	const size_t in_block_mask = (1 << (b->data_block_size_lg - item_lg)) - 1;
+	/* 64-bit word of the block that holds the item */
+	const size_t word = (key & in_block_mask) >> (6 - item_lg);
+
+	if (item_lg == 6) {
+		block[word] = val;
+		return true;
+	}
+
+	/* Bit offset of the item inside its word and the bits it occupies */
+	const size_t shift = (key & ((1 << (6 - item_lg)) - 1)) * (1 << item_lg);
+	const uint64_t item_mask = (((uint64_t) 1 << (1 << item_lg)) - 1) << shift;
+
+	block[word] &= ~item_mask;
+	block[word] |= (val << shift) & item_mask;
+
+	return true;
+}
+
+#if 0
+int
+btree_mask_set(struct btree *b, uint64_t key, uint8_t mask_bits)
+{
+}
+
+/**
+ * Sets to 0 all keys with 0-ed bits of mask equivalent to corresponding bits in
+ * key.
+ */
+int
+btree_mask_unset(struct btree *b, uint64_t key, uint8_t mask_bits)
+{
+}
+
+int
+btree_interval_set(struct btree *b, uint64_t begin, uint64_t end, uint64_t val)
+{
+}
+
+uint64_t
+btree_get_next_set_key(struct btree *b, uint64_t key)
+{
+}
+#endif
+
+/**
+ * Reads the item stored for key from a data block.
+ *
+ * @param b    The B-tree the block belongs to.
+ * @param data The data block; NULL yields 0, BTREE_SET yields b->set_value.
+ * @param key  The key; only its bits that index inside the block are used.
+ */
+static uint64_t
+btree_data_block_get(struct btree *b, uint64_t *data, uint64_t key)
+{
+	if (!data)
+		return 0;
+	if ((void *) data == (void *) BTREE_SET)
+		return b->set_value;
+
+	const uint8_t item_lg = b->item_size_lg;
+	const size_t in_block_mask = (1 << (b->data_block_size_lg - item_lg)) - 1;
+	const uint64_t word = data[(key & in_block_mask) >> (6 - item_lg)];
+
+	if (item_lg == 6)
+		return word;
+
+	/* Bit offset of the item inside its 64-bit word */
+	const size_t shift = (key & ((1 << (6 - item_lg)) - 1)) * (1 << item_lg);
+
+	return (word >> shift) & (((uint64_t) 1 << (1 << item_lg)) - 1);
+}
+
+/**
+ * Returns the value stored for key without allocating anything: 0 when
+ * nothing is stored, b->set_value when the key is covered by a BTREE_SET
+ * marker.
+ */
+uint64_t
+btree_get(struct btree *b, uint64_t key)
+{
+	uint64_t *block = btree_get_block(b, key, false);
+
+	return btree_data_block_get(b, block, key);
+}
+
+/**
+ * Calls fn for every key in [start, end] located under block.
+ *
+ * @param b         The B-tree.
+ * @param flags     Which marker-only subtrees (BTREE_SET / BTREE_UNSET) are
+ *                  to be visited; allocated blocks are always visited.
+ * @param fn        Callback, receives fn_data, the key and its value.
+ * @param fn_data   Opaque pointer passed to fn.
+ * @param block     Block to iterate, or one of the markers.
+ * @param start     First key (inclusive), has to belong to block.
+ * @param end       Last key (inclusive), has to belong to block.
+ * @param depth     Level of block; 0 is the root level.
+ * @param max_depth Depth of the data level.
+ * @return Number of keys fn has been called for (modulo 2^64).
+ */
+static uint64_t
+btree_iterate_keys_block(struct btree *b, enum btree_iterate_flags flags,
+			 btree_iterate_fn fn, void *fn_data,
+			 uint64_t **block, uint64_t start, uint64_t end,
+			 uint8_t depth, uint8_t max_depth)
+{
+	if (start > end)
+		return 0;
+
+	if ((block == BTREE_SET && !(flags & BTREE_ITERATE_KEYS_SET)) ||
+	    (block == BTREE_UNSET && !(flags & BTREE_ITERATE_KEYS_UNSET)))
+		return 0;
+
+	if (block == BTREE_SET || block == BTREE_UNSET || depth == max_depth) {
+		/*
+		 * The exit check is done after the call so that the loop
+		 * terminates even when end is UINT64_MAX.
+		 */
+		for (uint64_t i = start; ; i++) {
+			fn(fn_data, i, btree_data_block_get(b, (uint64_t *) block, i));
+			if (i == end)
+				break;
+		}
+
+		/* Wraps around to 0 only for the complete 64-bit key range */
+		return end - start + 1;
+	}
+
+	uint8_t parent_block_bit_off = depth == 0 ? b->key_size :
+		btree_get_block_bit_offs(b, depth - 1, max_depth);
+	/*
+	 * Shifting a 64-bit value by 64 is undefined, and with a 64-bit key
+	 * the root block starts at key 0 anyway.
+	 */
+	uint64_t first_key_in_block = parent_block_bit_off < 64 ?
+		start & ((uint64_t) -1 << parent_block_bit_off) : 0;
+
+	uint8_t block_bit_off = btree_get_block_bit_offs(b, depth, max_depth);
+	uint8_t block_key_bits = parent_block_bit_off - block_bit_off;
+	uint64_t mask = ((uint64_t) 1 << (block_key_bits)) - 1;
+	uint64_t start_index = (start >> block_bit_off) & mask;
+	uint64_t end_index = (end >> block_bit_off) & mask;
+	/* Number of keys covered by each child of this block */
+	uint64_t child_key_count = (uint64_t) 1 << block_bit_off;
+
+	uint64_t count = 0;
+
+	for (uint64_t i = start_index; i <= end_index; i++) {
+		uint64_t child_start = first_key_in_block + i * child_key_count;
+		uint64_t child_end = first_key_in_block + (i + 1) * child_key_count - 1;
+
+		/* Clamp the child's key range to the requested one */
+		if (child_start < start)
+			child_start = start;
+		if (child_end > end)
+			child_end = end;
+
+		count += btree_iterate_keys_block(b, flags, fn, fn_data,
+			(uint64_t **) block[i], child_start, child_end,
+			depth + 1, max_depth);
+	}
+
+	return count;
+}
+
+/**
+ * Calls fn for the keys in [start, end] (both inclusive).
+ *
+ * The range is clamped to the key space of the tree (0 .. 2^key_size - 1).
+ * Without the clamp, an end that has bits set above key_size would be
+ * reduced modulo the root block size and most of the tree would be skipped.
+ *
+ * @param flags Which marker-only subtrees (BTREE_SET / BTREE_UNSET) are to
+ *              be visited; allocated blocks are always visited.
+ * @return Number of keys fn has been called for.
+ */
+uint64_t btree_iterate_keys(struct btree *b, uint64_t start, uint64_t end,
+			    enum btree_iterate_flags flags, btree_iterate_fn fn,
+			    void *fn_data)
+{
+	if (b->key_size < 64) {
+		const uint64_t max_key = ((uint64_t) 1 << b->key_size) - 1;
+
+		if (start > max_key)
+			return 0;
+		if (end > max_key)
+			end = max_key;
+	}
+
+	return btree_iterate_keys_block(b, flags, fn, fn_data, b->data,
+					start, end, 0, btree_get_depth(b));
+}
+
+/**
+ * Frees block together with all the blocks below it.
+ *
+ * @param b         The B-tree the block belongs to.
+ * @param block     Block to free; BTREE_SET and BTREE_UNSET are ignored.
+ * @param depth     Level of block; 0 is the root level.
+ * @param max_depth Depth of the data level; calculated via btree_get_depth
+ *                  call if less than zero.
+ */
+void
+btree_free_block(struct btree *b, uint64_t **block, uint8_t depth,
+		 int max_depth)
+{
+	if (block == BTREE_SET || block == BTREE_UNSET)
+		return;
+	if (max_depth < 0)
+		max_depth = btree_get_depth(b);
+
+	/* Only pointer blocks have children to descend into */
+	if (depth < max_depth) {
+		size_t children = 1 << (btree_get_block_size(b, depth, max_depth) - ptr_sz_lg);
+
+		for (size_t idx = 0; idx < children; idx++)
+			btree_free_block(b, (uint64_t **) (block[idx]), depth + 1, max_depth);
+	}
+
+	free(block);
+}
+
+/**
+ * Frees all the blocks of the tree and then the control structure itself,
+ * so b must be a pointer that can be passed to free().
+ */
+void
+btree_free(struct btree *b)
+{
+ /* -1: let btree_free_block calculate the depth of the tree */
+ btree_free_block(b, b->data, 0, -1);
+ free(b);
+}
diff --git a/btree.h b/btree.h
new file mode 100644
index 00000000..08e8f867
--- /dev/null
+++ b/btree.h
@@ -0,0 +1,90 @@
+#ifndef STRACE_BTREE_H
+#define STRACE_BTREE_H
+
+/* Simple B-tree interface */
+
+#define BTREE_SET ((void *) ~(intptr_t) 0)
+#define BTREE_UNSET ((void *) NULL)
+
+#define PTR_BLOCK_SIZE_LG_MAX 24
+#define DATA_BLOCK_SIZE_LG_MAX 23
+
+/** Flags for btree_iterate_keys() */
+enum btree_iterate_flags {
+ /** Visit the keys covered by a BTREE_SET marker */
+ BTREE_ITERATE_KEYS_SET = 1 << 0,
+ /** Visit the keys covered by a BTREE_UNSET marker */
+ BTREE_ITERATE_KEYS_UNSET = 1 << 1,
+};
+
+/**
+ * B-tree control structure.
+ * B-tree implemented here has the following properties:
+ * * It allows storing values of the same size, the size can vary from 1 bit to
+ * 64 bit values (only power of 2 sizes are allowed).
+ * * The key can be up to 64 bits in size.
+ * * It has separate configuration for pointer block size and data block size.
+ * * It can be used for mask storage - supports storing the flag that all keys
+ * are set/unset in the middle tree layers. See also btree_mask_set() and
+ * btree_mask_unset() (both are currently compiled out with #if 0).
+ *
+ * How bits of key are used for different block levels:
+ *
+ * highest bits lowest bits
+ * | ptr_block_size_lg | ... | < remainder > | data_block_size_lg |
+ * \______________________________________________________________/
+ * key_size
+ *
+ * So, the remainder is used on the lowest non-data node level.
+ *
+ * As of now, it doesn't implement any mechanisms for resizing/changing key
+ * size. De-fragmentation is also unsupported currently.
+ */
+struct btree {
+ uint64_t set_value; /**< Default set value */
+ /** Root block, or BTREE_SET/BTREE_UNSET while no block is allocated */
+ void *data;
+ uint8_t item_size_lg; /**< Item size log2, in bits, 0..6. */
+ /** Pointer block size log2, in bits. 14-20, usually. */
+ uint8_t ptr_block_size_lg;
+ /** Data block size log2, in bits. 11-17, usually. */
+ uint8_t data_block_size_lg;
+ uint8_t key_size; /**< Key size, in bits, 1..64. */
+};
+
+typedef void (*btree_iterate_fn)(void *data, uint64_t key, uint64_t val);
+
+bool btree_check(uint8_t item_size_lg, uint8_t ptr_block_size_lg,
+ uint8_t data_block_size_lg, uint8_t key_size);
+void btree_init(struct btree *b, uint8_t item_size_lg,
+ uint8_t ptr_block_size_lg, uint8_t data_block_size_lg,
+ uint8_t key_size, uint64_t set_value);
+struct btree * btree_create(uint8_t item_size_lg, uint8_t ptr_block_size_lg,
+ uint8_t data_block_size_lg, uint8_t key_size,
+ uint64_t set_value);
+
+bool btree_set(struct btree *b, uint64_t key, uint64_t val);
+#if 0
+/**
+ * Sets to the value b->set_value all keys with 0-ed bits of mask equivalent to
+ * corresponding bits in key.
+ */
+int btree_mask_set(struct btree *b, uint64_t key, uint8_t mask_bits);
+/**
+ * Sets to 0 all keys with 0-ed bits of mask equivalent to corresponding bits in
+ * key.
+ */
+int btree_mask_unset(struct btree *b, uint64_t key, uint8_t mask_bits);
+int btree_interval_set(struct btree *b, uint64_t begin, uint64_t end,
+ uint64_t val);
+
+uint64_t btree_get_next_set_key(struct btree *b, uint64_t key);
+#endif
+
+uint64_t btree_iterate_keys(struct btree *b, uint64_t start, uint64_t end,
+ enum btree_iterate_flags flags, btree_iterate_fn fn,
+ void *fn_data);
+
+uint64_t btree_get(struct btree *b, uint64_t key);
+
+void btree_free_block(struct btree *b, uint64_t **block, uint8_t depth,
+ int max_depth);
+void btree_free(struct btree *b);
+
+#endif /* !STRACE_BTREE_H */
diff --git a/defs.h b/defs.h
index d8bd5135..cc8dae04 100644
--- a/defs.h
+++ b/defs.h
@@ -280,6 +280,9 @@ struct tcb {
struct timespec etime; /* Syscall entry time (CLOCK_MONOTONIC) */
struct timespec delay_expiration_time; /* When does the delay end */
+ uint64_t pid_ns;
+ bool pid_ns_inited;
+
struct mmap_cache_t *mmap_cache;
/*
@@ -413,7 +416,11 @@ extern const struct xlat whence_codes[];
# define RVAL_HEX 001 /* hex format */
# define RVAL_OCTAL 002 /* octal format */
# define RVAL_FD 010 /* file descriptor */
-# define RVAL_MASK 013 /* mask for these values */
+# define RVAL_TID 011 /* task ID */
+# define RVAL_SID 012 /* session ID */
+# define RVAL_TGID 013 /* thread group ID */
+# define RVAL_PGID 014 /* process group ID */
+# define RVAL_MASK 017 /* mask for these values */
# define RVAL_STR 020 /* Print `auxstr' field after return val */
# define RVAL_NONE 040 /* Print nothing */
@@ -428,6 +435,16 @@ extern const struct xlat whence_codes[];
# define indirect_ipccall(tcp) (tcp_sysent(tcp)->sys_flags & TRACE_INDIRECT_SUBCALL)
+/** Kinds of process-related IDs that can be translated between PID namespaces */
+enum pid_type {
+ PT_TID, /* thread ID */
+ PT_TGID, /* thread group ID */
+ PT_PGID, /* process group ID */
+ PT_SID, /* session ID */
+
+ PT_COUNT, /* number of the ID types above, used as an array size */
+ PT_NONE = -1
+};
+
enum sock_proto {
SOCK_PROTO_UNKNOWN,
SOCK_PROTO_UNIX,
@@ -469,6 +486,7 @@ extern int Tflag_scale;
extern int Tflag_width;
extern bool iflag;
extern bool count_wallclock;
+extern unsigned int perform_ns_resolution;
/* are we filtering traces based on paths? */
extern struct path_set {
const char **paths_selected;
@@ -984,6 +1002,11 @@ print_local_array_ex(struct tcb *tcp,
extern kernel_ulong_t *
fetch_indirect_syscall_args(struct tcb *, kernel_ulong_t addr, unsigned int n_args);
+extern void pidns_init(void);
+
+extern int translate_pid(struct tcb *tcp, int dest_id, enum pid_type type,
+ int *proc_pid_ptr);
+
extern void
dumpiov_in_msghdr(struct tcb *, kernel_ulong_t addr, kernel_ulong_t data_size);
@@ -1059,6 +1082,7 @@ printfd(struct tcb *tcp, int fd)
* of the tracee the descriptor tcp). This is a stub.
*/
extern void printfd_pid_tracee_ns(struct tcb *tcp, pid_t pid, int fd);
+extern void printpid(struct tcb *tcp, int pid, enum pid_type type);
extern void print_sockaddr(const void *sa, int len);
extern bool
print_inet_addr(int af, const void *addr, unsigned int len, const char *var_name);
diff --git a/linux/dummy.h b/linux/dummy.h
index 2f859a60..1ec110fe 100644
--- a/linux/dummy.h
+++ b/linux/dummy.h
@@ -53,6 +53,7 @@
# define sys_getgid sys_getuid
# define sys_getgid16 sys_getuid16
# define sys_getpeername sys_getsockname
+# define sys_getppid sys_getpid
# define sys_getresgid sys_getresuid
# define sys_getresgid16 sys_getresuid16
# define sys_lstat sys_stat
@@ -87,10 +88,6 @@
# define sys_vfork sys_fork
/* printargs does the right thing */
-# define sys_getpgrp printargs
-# define sys_getpid printargs
-# define sys_getppid printargs
-# define sys_gettid printargs
# define sys_idle printargs
# define sys_munlockall printargs
# define sys_pause printargs
@@ -111,8 +108,6 @@
# define sys_getpgid printargs_d
# define sys_getsid printargs_d
# define sys_nice printargs_d
-# define sys_setpgid printargs_d
-# define sys_setpgrp printargs_d
# define sys_timer_delete printargs_d
# define sys_timer_getoverrun printargs_d
diff --git a/pidns.c b/pidns.c
new file mode 100644
index 00000000..44bda6ed
--- /dev/null
+++ b/pidns.c
@@ -0,0 +1,633 @@
+#include "defs.h"
+
+
+#include <dirent.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <asm/unistd.h>
+
+#include <sys/ioctl.h>
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "btree.h"
+#include "nsfs.h"
+#include "xmalloc.h"
+
+/*
+ * For each ID type: maps a PID NS id to a pointer to a btree, which in turn
+ * maps IDs in that NS to PIDs as present in /proc.
+ */
+struct btree *ns_pid_to_proc_pid[PT_COUNT];
+/* Maps PIDs as present in /proc to pointers to struct proc_data */
+struct btree *proc_data_cache;
+
+/* Prefixes of the /proc/<pid>/status lines that list the IDs of each type */
+static const char tid_str[] = "NSpid:\t";
+static const char tgid_str[] = "NStgid:\t";
+static const char pgid_str[] = "NSpgid:\t";
+static const char sid_str[] = "NSsid:\t";
+
+/* The prefixes above and their lengths (without the NUL), indexed by pid_type */
+static const struct {
+ const char *str;
+ size_t size;
+} id_strs[PT_COUNT] = {
+ [PT_TID] = { tid_str, sizeof(tid_str) - 1 },
+ [PT_TGID] = { tgid_str, sizeof(tgid_str) - 1 },
+ [PT_PGID] = { pgid_str, sizeof(pgid_str) - 1 },
+ [PT_SID] = { sid_str, sizeof(sid_str) - 1 },
+};
+
+
+/**
+ * Limit on PID NS hierarchy depth, imposed since Linux 3.7. NS traversal
+ * is not possible before Linux 4.9, so we consider this limit pretty universal.
+ */
+#define MAX_NS_DEPTH 32
+
+/** Cached NS-related information about a single process (task) */
+struct proc_data {
+ int proc_pid; /* PID as present in /proc */
+ int ns_count; /* value returned by get_ns_hierarchy() */
+ uint64_t ns_hierarchy[MAX_NS_DEPTH]; /* from bottom to top of NS hierarchy */
+ int id_count[PT_COUNT]; /* value returned by get_id_list(), per ID type */
+ /* Lazily allocated arrays of MAX_NS_DEPTH items, per ID type */
+ int *id_hierarchy[PT_COUNT]; /* from top to bottom of NS hierarchy */
+};
+
+/**
+ * Returns the number of bits needed to represent n, that is the 1-based
+ * position of its highest set bit; 0 for n == 0.
+ */
+static uint8_t
+lg2(uint64_t n)
+{
+	uint8_t bits;
+
+	for (bits = 0; n != 0; n >>= 1)
+		bits++;
+
+	return bits;
+}
+
+/**
+ * Returns the value of /proc/sys/kernel/pid_max, or INT_MAX if it can not be
+ * read or does not contain a positive number. The result is cached after the
+ * first call.
+ */
+static int
+get_pid_max(void)
+{
+	static int pid_max = -1;
+
+	if (pid_max < 0) {
+		int val = INT_MAX;
+		FILE *f = fopen("/proc/sys/kernel/pid_max", "r");
+
+		if (!f) {
+			perror_msg("get_pid_max: opening /proc/sys/kernel/pid_max");
+		} else {
+			/* Do not trust a value that failed to parse or makes no sense */
+			if (fscanf(f, "%d", &val) != 1 || val < 1)
+				val = INT_MAX;
+			fclose(f);
+		}
+
+		pid_max = val;
+	}
+
+	return pid_max;
+}
+
+/**
+ * Creates the btrees used for caching the PID translation data.
+ * Only the first call does the work, subsequent calls return immediately.
+ */
+void
+pidns_init(void)
+{
+	static bool inited = false;
+
+	if (!inited) {
+		int type;
+
+		/* Keys are 64-bit NS ids, values are 64-bit (pointers to btrees) */
+		for (type = 0; type < PT_COUNT; type++)
+			ns_pid_to_proc_pid[type] = btree_create(6, 16, 16, 64, 0);
+
+		/* Keys are PIDs, values are 64-bit (pointers to proc_data) */
+		proc_data_cache = btree_create(6, 16, 16, lg2(get_pid_max() - 1), 0);
+
+		inited = true;
+	}
+}
+
+/**
+ * Remembers that the ID ns_pid of the given type in the namespace ns
+ * corresponds to the process proc_pid (PID as present in /proc).
+ *
+ * This is only a cache: if the per-NS btree can not be created, the entry is
+ * silently not stored.
+ */
+static void
+put_proc_pid(uint64_t ns, int ns_pid, enum pid_type type, int proc_pid)
+{
+	struct btree *b = (struct btree *) (uintptr_t)
+		btree_get(ns_pid_to_proc_pid[type], ns);
+
+	if (!b) {
+		int pid_max = get_pid_max();
+		/* Number of bits in a PID, used as the key size */
+		uint8_t pid_max_size = lg2(pid_max - 1);
+		/* Items have to be able to hold a PID as well */
+		uint8_t pid_max_size_lg = lg2(pid_max_size - 1);
+
+		b = btree_create(pid_max_size_lg, 16, 16, pid_max_size, 0);
+		/* btree_create returns NULL on bad parameters or failed allocation */
+		if (!b)
+			return;
+
+		btree_set(ns_pid_to_proc_pid[type], ns, (uint64_t) (uintptr_t) b);
+	}
+
+	btree_set(b, ns_pid, proc_pid);
+}
+
+/**
+ * Looks up the proc PID stored by put_proc_pid() for the ID ns_pid of the
+ * given type in the namespace ns. Returns 0 if nothing is cached.
+ */
+static int
+get_cached_proc_pid(uint64_t ns, int ns_pid, enum pid_type type)
+{
+	struct btree *per_ns =
+		(struct btree *) btree_get(ns_pid_to_proc_pid[type], ns);
+
+	return per_ns ? btree_get(per_ns, ns_pid) : 0;
+}
+
+/**
+ * Formats pid for use in a /proc path: "self" for pid == 0, the decimal
+ * representation otherwise.
+ * The decimal representation is placed in a static buffer, so it is
+ * overwritten by the next call.
+ */
+static const char *
+pid_to_str(pid_t pid)
+{
+	static char buf[sizeof("-2147483648")];
+
+	if (pid == 0)
+		return "self";
+
+	const ssize_t len = snprintf(buf, sizeof(buf), "%d", pid);
+
+	if (len < 0 || (size_t) len >= sizeof(buf))
+		perror_msg_and_die("pid_to_str: snprintf");
+
+	return buf;
+}
+
+/**
+ * Returns a list of PID NS IDs for the specified PID.
+ *
+ * The first item stored is the PID NS of the process itself, each following
+ * item is the parent of the previous one.
+ *
+ * @param proc_pid    PID (as present in /proc) to get information for,
+ *                    0 stands for the calling process ("/proc/self").
+ * @param ns_buf      Pointer to buffer that is able to contain at least
+ *                    ns_buf_size items.
+ * @param ns_buf_size Capacity of ns_buf, in items. At most MAX_NS_DEPTH items
+ *                    are stored regardless of this value.
+ * @return Amount of NS in list. 0 indicates error. If the ascension has been
+ *         stopped because the buffer (or MAX_NS_DEPTH) is exhausted, the
+ *         value returned is greater by one than the number of items
+ *         written to the array.
+ */
+static size_t
+get_ns_hierarchy(int proc_pid, uint64_t *ns_buf, size_t ns_buf_size)
+{
+ char path[PATH_MAX + 1];
+ struct stat st;
+ ssize_t ret;
+ size_t n = 0;
+ int fd;
+ int parent_fd;
+
+ ret = snprintf(path, sizeof(path), "/proc/%s/ns/pid",
+ pid_to_str(proc_pid));
+
+ if ((ret < 0) || ((size_t) ret >= sizeof(path)))
+ return 0;
+
+ fd = open(path, O_RDONLY | O_NONBLOCK);
+ if (fd < 0) {
+ perror_msg("get_ns_hierarchy: opening /proc/<pid>/ns/pid");
+ return 0;
+ }
+
+ while (1) {
+ /* The inode number of the NS file is used as the NS id */
+ ret = fstat(fd, &st);
+ if (ret)
+ break;
+
+ /* 32 is the hierarchy depth on modern Linux */
+ if ((n >= MAX_NS_DEPTH) || (n >= ns_buf_size)) {
+ n++;
+ break;
+ }
+
+ ns_buf[n] = st.st_ino;
+ if (debug_flag)
+ error_msg("Got NS: %" PRIu64, ns_buf[n]);
+
+ n++;
+
+ /* On success the ioctl returns a descriptor referring to the parent NS */
+ parent_fd = ioctl(fd, NS_GET_PARENT);
+ if (parent_fd == -1) {
+ switch (errno) {
+ case EPERM:
+ if (debug_flag)
+ error_msg("Terminating NS ascending "
+ "after %zu levels on NS %"
+ PRIu64, n, ns_buf[n - 1]);
+ break;
+
+ case ENOTTY:
+ error_msg("NS_* ioctl commands are not "
+ "supported by the kernel");
+ break;
+ default:
+ perror_msg("get_ns_hierarchy: "
+ "ioctl(NS_GET_PARENT)");
+ break;
+ }
+
+ break;
+ }
+
+ close(fd);
+ fd = parent_fd;
+ }
+
+ /* fd is the last descriptor obtained; all the previous ones are closed in the loop */
+ close(fd);
+
+ return n;
+}
+
+/**
+ * Get list of IDs present in NS* proc status record. IDs are placed as they are
+ * stored in /proc (from top to bottom of NS hierarchy).
+ *
+ * @param proc_pid PID (as present in /proc) to get information for,
+ *                 0 stands for the calling process ("/proc/self").
+ * @param id_buf   Pointer to buffer that is able to contain at least
+ *                 MAX_NS_DEPTH items. Can be NULL.
+ * @param type     Type of ID requested.
+ * @return Number of items stored in id_buf. 0 indicates error,
+ *         MAX_NS_DEPTH + 1 indicates that status record contains
+ *         more than MAX_NS_DEPTH records and the id_buf provided
+ *         is unusable.
+ */
+static size_t
+get_id_list(int proc_pid, int *id_buf, enum pid_type type)
+{
+	const char *ns_str = id_strs[type].str;
+	size_t ns_str_size = id_strs[type].size;
+	char *buf;
+	size_t bufsize = 0;
+	char *p;
+	char *endp;
+	FILE *f;
+	int idx = 0;
+	ssize_t ret;
+
+	ret = asprintf(&buf, "/proc/%s/status", pid_to_str(proc_pid));
+	if (ret < 0)
+		return 0;
+
+	f = fopen(buf, "r");
+	if (!f) {
+		/* Report first: free() is allowed to clobber errno */
+		perror_msg("get_id_list: opening /proc/<pid>/status");
+		free(buf);
+		return 0;
+	}
+
+	/* The path is not needed anymore, the pointer is reused for getline() */
+	free(buf);
+	buf = NULL;
+
+	while (getline(&buf, &bufsize, f) > 0) {
+		if (strncmp(buf, ns_str, ns_str_size))
+			continue;
+
+		p = buf + ns_str_size;
+
+		for (idx = 0; idx < MAX_NS_DEPTH; idx++) {
+			/* p becomes NULL when there are no more fields */
+			if (!p)
+				break;
+
+			errno = 0;
+			int id = strtol(p, &endp, 10);
+
+			if (errno && (p[0] != '\t')) {
+				perror_msg("get_id_list: converting pid to int");
+				idx = 0;
+				goto get_id_list_exit;
+			}
+
+			if (debug_flag)
+				error_msg("PID %d: %s[%d]: %d",
+					  proc_pid, ns_str, idx, id);
+
+			if (id_buf)
+				id_buf[idx] = id;
+
+			/* Advance to the field after the next tab, if any */
+			strsep(&p, "\t");
+		}
+
+		/* Fields are left: signal the overflow with MAX_NS_DEPTH + 1 */
+		if (p)
+			idx++;
+
+		break;
+	}
+
+get_id_list_exit:
+	fclose(f);
+	free(buf);
+
+	return idx;
+}
+
+/**
+ * Tells whether /proc/self/status lists exactly one NSpid entry for strace
+ * itself. The answer is calculated once and then cached.
+ */
+static bool
+is_proc_ours(void)
+{
+	static int cached_val = -1;
+
+	if (cached_val >= 0)
+		return cached_val;
+
+	cached_val = (get_id_list(0, NULL, PT_TID) == 1);
+
+	return cached_val;
+}
+
+/**
+ * Returns the id of the PID NS of the tracee, 0 if it can not be determined.
+ * The value is obtained on the first call and then cached in the tcb.
+ */
+static uint64_t
+get_ns(struct tcb *tcp)
+{
+	if (tcp->pid_ns_inited)
+		return tcp->pid_ns;
+
+	int proc_pid = tcp->pid;
+
+	/* Find out the PID the tracee has in /proc */
+	if (!is_proc_ours() &&
+	    translate_pid(NULL, tcp->pid, PT_TID, &proc_pid) < 1)
+		proc_pid = -1;
+
+	if (proc_pid == -1 || !get_ns_hierarchy(proc_pid, &tcp->pid_ns, 1))
+		tcp->pid_ns = 0;
+
+	tcp->pid_ns_inited = true;
+
+	return tcp->pid_ns;
+}
+
+/**
+ * Returns the id of the PID NS of strace itself, 0 if it can not be
+ * determined. Only the first call reads /proc.
+ */
+static uint64_t
+get_our_ns(void)
+{
+	static bool our_ns_initialised = false;
+	static uint64_t our_ns = 0;
+
+	if (our_ns_initialised)
+		return our_ns;
+
+	get_ns_hierarchy(0, &our_ns, 1);
+	our_ns_initialised = true;
+
+	return our_ns;
+}
+
+/**
+ * Returns the proc_data struct cached for proc_pid. If there is none,
+ * a zeroed one is allocated and put into the cache.
+ *
+ * @return The proc_data, or NULL if the allocation has failed.
+ */
+static struct proc_data *
+get_or_create_proc_data(int proc_pid)
+{
+	struct proc_data *pd =
+		(struct proc_data *) btree_get(proc_data_cache, proc_pid);
+
+	if (pd)
+		return pd;
+
+	pd = calloc(1, sizeof(*pd));
+	if (pd) {
+		pd->proc_pid = proc_pid;
+		btree_set(proc_data_cache, proc_pid, (uint64_t) pd);
+	}
+
+	return pd;
+}
+
+/**
+ * Updates the proc_data from /proc.
+ *
+ * @param pd   The proc_data to update, must not be NULL.
+ * @param type Type of the ID list to refresh.
+ * @return true on success. If the information can not be read (e.g. the
+ *         process does not exist), the proc_data is removed from the cache
+ *         and freed, and false is returned; pd must not be used afterwards.
+ */
+static bool
+update_proc_data(struct proc_data *pd, enum pid_type type)
+{
+	pd->ns_count = get_ns_hierarchy(pd->proc_pid,
+					pd->ns_hierarchy, MAX_NS_DEPTH);
+	if (!pd->ns_count)
+		goto fail;
+
+	if (!pd->id_hierarchy[type])
+		pd->id_hierarchy[type] = calloc(MAX_NS_DEPTH,
+			sizeof(pd->id_hierarchy[type][0]));
+	if (!pd->id_hierarchy[type])
+		goto fail;
+
+	pd->id_count[type] = get_id_list(pd->proc_pid,
+					 pd->id_hierarchy[type], type);
+	if (!pd->id_count[type])
+		goto fail;
+
+	return true;
+
+fail:
+	/* Drop the cache entry while pd->proc_pid can still be read */
+	btree_set(proc_data_cache, pd->proc_pid, 0);
+
+	/* The ID arrays are owned by pd, release them along with it */
+	for (int i = 0; i < PT_COUNT; i++)
+		free(pd->id_hierarchy[i]);
+	free(pd);
+
+	return false;
+}
+
+/**
+ * Parameters for id translation
+ */
+struct translate_id_params {
+ /* The result (output) */
+ int result_id;
+ /* The proc data of the process (output) */
+ struct proc_data *pd;
+
+ /* The namespace to be translated from */
+ uint64_t from_ns;
+ /* The id to be translated */
+ int from_id;
+ /* The type of the id */
+ enum pid_type type;
+};
+
+/**
+ * Checks whether the given process is the one tip->from_id refers to in
+ * tip->from_ns, and if so, translates the id to our namespace.
+ *
+ * On a match tip->result_id and tip->pd are set; otherwise they are reset to
+ * 0 and NULL.
+ *
+ * @param tip      The parameters
+ * @param proc_pid The proc pid of the process, its data is refreshed from
+ *                 /proc. If 0, use the cached values in tip->pd.
+ */
+static void
+translate_id_proc_pid(struct translate_id_params *tip, int proc_pid)
+{
+	struct proc_data *pd;
+
+	if (proc_pid)
+		pd = get_or_create_proc_data(proc_pid);
+	else
+		pd = tip->pd;
+
+	tip->result_id = 0;
+	tip->pd = NULL;
+
+	if (!pd)
+		return;
+
+	if (proc_pid && !update_proc_data(pd, tip->type))
+		return;
+
+	const int ns_count = pd->ns_count;
+	const int id_count = pd->id_count[tip->type];
+
+	if (!ns_count || id_count < ns_count)
+		return;
+
+	const int *ids = pd->id_hierarchy[tip->type];
+
+	/*
+	 * NS ids are stored from bottom to top and IDs from top to bottom,
+	 * so the NS at index "level" corresponds to the ID at index
+	 * id_count - level - 1.
+	 */
+	for (int level = 0; level < ns_count; level++) {
+		if (pd->ns_hierarchy[level] != tip->from_ns)
+			continue;
+
+		if (ids[id_count - level - 1] == tip->from_id) {
+			tip->result_id = ids[id_count - ns_count];
+			tip->pd = pd;
+		}
+
+		return;
+	}
+}
+
+/**
+ * Translates an id to our namespace, by reading all proc entries in dir.
+ * The scan stops as soon as tip->result_id becomes non-zero.
+ *
+ * @param tip           The parameters
+ * @param path          The path of the dir to be read.
+ * @param read_task_dir Whether to recurse into the "task" subdirectory of
+ *                      each entry.
+ */
+static void
+translate_id_dir(struct translate_id_params *tip, const char *path, bool read_task_dir)
+{
+	DIR *dir = opendir(path);
+	if (!dir) {
+		perror_msg("translate_id_dir: opening dir: %s", path);
+		return;
+	}
+
+	while (!tip->result_id) {
+		/* readdir() returns NULL both on error and at the end of the dir */
+		errno = 0;
+		struct dirent *entry = readdir(dir);
+		if (!entry) {
+			if (errno)
+				perror_msg("translate_id_dir: readdir");
+
+			break;
+		}
+
+		if (entry->d_type != DT_DIR)
+			continue;
+
+		/*
+		 * Only entries whose whole name is a number that fits into
+		 * a PID are of interest. The range is checked before the
+		 * value is narrowed to int.
+		 */
+		char *endp;
+		errno = 0;
+		long val = strtol(entry->d_name, &endp, 10);
+		if (errno || endp == entry->d_name || *endp != '\0')
+			continue;
+		if ((val < 1) || (val > INT_MAX))
+			continue;
+
+		int proc_pid = (int) val;
+
+		if (read_task_dir) {
+			char task_dir_path[PATH_MAX + 1];
+			snprintf(task_dir_path, sizeof(task_dir_path), "/proc/%d/task", proc_pid);
+			translate_id_dir(tip, task_dir_path, false);
+		}
+
+		if (tip->result_id)
+			break;
+
+		translate_id_proc_pid(tip, proc_pid);
+	}
+
+	closedir(dir);
+}
+
+/**
+ * Iterator function of the proc_data_cache for id translation.
+ * If the cached data of a process matches the id we are looking for, the data
+ * is re-read from /proc, and the result is kept only if it still matches.
+ *
+ * @param fn_data Pointer to struct translate_id_params.
+ * @param key     Key of the cache entry (not used).
+ * @param val     Value of the cache entry: pointer to proc_data, or 0.
+ */
+static void
+proc_data_cache_iterator_fn(void* fn_data, uint64_t key, uint64_t val)
+{
+	struct translate_id_params *tip = fn_data;
+	struct proc_data *cached = (struct proc_data *) val;
+
+	/* Nothing stored for this key, or the result has already been found */
+	if (!cached || tip->result_id)
+		return;
+
+	/* Translate from cache */
+	tip->pd = cached;
+	translate_id_proc_pid(tip, 0);
+
+	/* On a match, redo the translation from /proc to check cache validity */
+	if (tip->result_id)
+		translate_id_proc_pid(tip, cached->proc_pid);
+}
+
+/**
+ * Translates an ID from tcp's namespace to our namespace
+ *
+ * @param tcp The tcb whose namespace from_id is in (NULL means strace's namespace)
+ * @param from_id The id to be translated
+ * @param type The type of ID
+ * @param proc_pid_ptr If not NULL, writes the proc PID to this location
+ * (it is left untouched when the translation fails)
+ * @return The translated id, 0 if the translation has failed
+ */
+int
+translate_pid(struct tcb *tcp, int from_id, enum pid_type type, int *proc_pid_ptr)
+{
+ const uint64_t our_ns = get_our_ns();
+
+ struct translate_id_params tip = {
+ .result_id = 0,
+ .pd = NULL,
+ .from_ns = tcp ? get_ns(tcp) : our_ns,
+ .from_id = from_id,
+ .type = type,
+ };
+
+ if ((tip.type >= PT_COUNT) || (tip.type < 0))
+ goto translate_pid_exit;
+
+ /* If translation is trivial */
+ if (tip.from_ns == our_ns && (is_proc_ours() || !proc_pid_ptr)) {
+ if (proc_pid_ptr)
+ *proc_pid_ptr = from_id;
+
+ tip.result_id = tip.from_id;
+ goto translate_pid_exit;
+ }
+
+ /* Look for a cached proc_pid for this (from_ns, from_id) pair */
+ int cached_proc_pid = get_cached_proc_pid(tip.from_ns, tip.from_id, tip.type);
+ if (cached_proc_pid) {
+ /* The cached value is verified against the current data in /proc */
+ translate_id_proc_pid(&tip, cached_proc_pid);
+ if (tip.result_id)
+ goto translate_pid_exit;
+ }
+
+ /* Iterate through the cache, find potential proc_data */
+ btree_iterate_keys(proc_data_cache, 0, get_pid_max(), 0, proc_data_cache_iterator_fn, &tip);
+ /* (proc_data_cache_iterator_fn takes care about updating proc_data) */
+ if (tip.result_id)
+ goto translate_pid_exit;
+
+ /* No cache helped, read all entries in /proc */
+ translate_id_dir(&tip, "/proc", true);
+
+translate_pid_exit:
+ /* tip.pd is set only when the translation went through a proc_data */
+ if (tip.pd) {
+ if (tip.pd->proc_pid)
+ put_proc_pid(tip.from_ns, tip.from_id, tip.type, tip.pd->proc_pid);
+
+ if (proc_pid_ptr)
+ *proc_pid_ptr = tip.pd->proc_pid;
+ }
+
+ return tip.result_id;
+}
+
+/**
+ * Returns the PID of the tracee as it is present in /proc.
+ *
+ * @return The proc PID, or -1 if it can not be determined.
+ */
+int
+get_proc_pid(struct tcb *tcp)
+{
+	if (is_proc_ours())
+		return tcp->pid;
+
+	int proc_pid = -1;
+
+	/*
+	 * translate_pid() returns 0 on failure and leaves proc_pid
+	 * untouched in that case, so anything below 1 is an error.
+	 */
+	if (translate_pid(NULL, tcp->pid, PT_TID, &proc_pid) < 1)
+		return -1;
+
+	return proc_pid;
+}
+
+/**
+ * Prints pid, and, if NS resolution is enabled and the ID is different in
+ * strace's PID NS, appends the translated value as a comment.
+ *
+ * @param tcp  The tcb whose namespace pid is in.
+ * @param pid  The ID to print.
+ * @param type The type of the ID.
+ */
+void
+printpid(struct tcb *tcp, int pid, enum pid_type type)
+{
+	tprintf("%d", pid);
+
+	if (!perform_ns_resolution)
+		return;
+
+	const int strace_pid = translate_pid(tcp, pid, type, NULL);
+
+	if (strace_pid > 0 && strace_pid != pid)
+		tprintf_comment("%d in strace's PID NS", strace_pid);
+}
diff --git a/strace.c b/strace.c
index 311e4d62..bd8562bf 100644
--- a/strace.c
+++ b/strace.c
@@ -133,6 +133,8 @@ static unsigned int daemonized_tracer;
static int post_attach_sigstop = TCB_IGNORE_ONE_SIGSTOP;
#define use_seize (post_attach_sigstop == 0)
+unsigned int perform_ns_resolution;
+
static bool detach_on_execve;
static int exit_code;
@@ -2013,7 +2015,7 @@ init(int argc, char *argv[])
qualify_signals("all");
static const char optstring[] =
- "+a:Ab:cCdDe:E:fFhiI:ko:O:p:P:qrs:S:tTu:U:vVwxX:yzZ";
+ "+a:Ab:cCdDe:E:fFhiI:ko:O:p:P:qrs:S:tTu:U:vVwxX:yYzZ";
enum {
GETOPT_SECCOMP = 0x100,
@@ -2285,6 +2287,10 @@ init(int argc, char *argv[])
case 'y':
yflag_short++;
break;
+ case 'Y':
+ perform_ns_resolution++;
+ pidns_init();
+ break;
case 'z':
clear_number_set_array(status_set, 1);
add_number_to_set(STATUS_SUCCESSFUL, status_set);
diff --git a/syscall.c b/syscall.c
index 022cab5b..e0433e56 100644
--- a/syscall.c
+++ b/syscall.c
@@ -930,6 +930,21 @@ syscall_exiting_trace(struct tcb *tcp, struct timespec *ts, int res)
tprintf("= %" PRI_kld, tcp->u_rval);
}
break;
+ case RVAL_TID:
+ case RVAL_SID:
+ case RVAL_TGID:
+ case RVAL_PGID: {
+ #define _(_t) [RVAL_##_t - RVAL_TID] = PT_##_t
+ static const enum pid_type types[] = {
+ _(TID), _(SID), _(TGID), _(PGID),
+ };
+ #undef _
+
+ tprints("= ");
+ printpid(tcp, tcp->u_rval,
+ types[(sys_res & RVAL_MASK) - RVAL_TID]);
+ break;
+ }
default:
error_msg("invalid rval format");
break;
diff --git a/util.c b/util.c
index cde76c13..6d599e98 100644
--- a/util.c
+++ b/util.c
@@ -641,12 +641,8 @@ printed:
void
printfd_pid_tracee_ns(struct tcb *tcp, pid_t pid, int fd)
{
- /*
- * TODO: We want to have the same formatting as printfd here,
- * but we should figure out first which process in strace's
- * PID NS is referred to by pid in tracee's PID NS.
- */
- tprintf("%d", fd);
+ int strace_pid = translate_pid(tcp, pid, PT_TGID, NULL);
+ printfd_pid(tcp, strace_pid, fd);
}
/*
--
2.27.0
More information about the Strace-devel
mailing list