[PATCH v3 6/6] Implement new filtering language parsing

Mon Jul 3 18:52:25 UTC 2017

On Thu, Jun 29, 2017 at 02:46:15PM +0700, Nikolay Marchuk wrote:
> * defs.h (parse_action): Add definition.
> * filter.c: Add filters parsing function.
> * filter.h: Add new definitions.
> * filter_action.c: Add action parsing function.
> * filter_expression.c: Add expression parsing function.
> * strace.c (init): Add strace -m option.
> ---
>  defs.h              |   1 +
>  filter.c            | 104 ++++++++++++++++++++++++++++++++++++++++++++++++
>  filter.h            |   3 ++
>  filter_action.c     |  31 +++++++++++++++
>  filter_expression.c | 112 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>  strace.c            |   5 ++-
>  6 files changed, 253 insertions(+), 3 deletions(-)
> 
> diff --git a/defs.h b/defs.h
> index 4b3516b..372c7f8 100644
> --- a/defs.h
> +++ b/defs.h
> @@ -674,6 +674,7 @@ extern bool is_number_in_set(const unsigned int, const struct number_set *);
>  extern void sort_filter_actions(void);
>  extern void filter_syscall(struct tcb *);
>  extern void parse_qualify_filter(const char *);
> +extern void parse_action(const char *);
>  
>  #define DECL_IOCTL(name)						\
>  extern int								\
> diff --git a/filter.c b/filter.c
> index 1f2d2eb..a7ec155 100644
> --- a/filter.c
> +++ b/filter.c
> @@ -121,6 +121,110 @@ free_filter(struct filter *filter)
>  	filter->type->free_priv_data(filter->_priv_data);
>  }
>  
> +static unsigned int
> +escaped_argument_len(const char *str) {
Curly bracket should be on the separate line.

> +	unsigned int len = 0;
> +	const char *p;
> +	bool escaped = false;
> +	for (p = str; *p && *p != ';'; ++p) {
> +		if (!escaped) {
> +			if (*p == '\\') {
> +				escaped = true;
> +				continue;
> +			} else if (isspace(*p) || *p == ')' || *p == '|'
> +			           || *p == '&') {
> +				break;
> +			}
> +		}
> +		escaped = false;
> +		++len;
> +	}
> +	return len;
> +}
> +
> +static char *
> +unescape_argument(const char *str)
> +{
> +	const char *p;
> +	char *p_new;
> +	bool escaped = false;
> +	char *new_str = xcalloc(escaped_argument_len(str) + 1, sizeof(char));
> +	for (p = str, p_new = new_str; *p && *p != ';'; ++p) {
> +		if (!escaped) {
> +			if (*p == '\\') {
> +				escaped = true;
> +				continue;
> +			} else if (isspace(*p) || *p == ')' || *p == '|'
> +			           || *p == '&') {
> +				break;
> +			}
> +		}
> +		escaped = false;
> +		*(p_new++) = *p;
> +	}
> +	return new_str;
> +}
Missing empty line.

> +/*
> + * Try to parse filter from str. Return positive number of characters
> + * to be skipped in str on success or negative number of characters
> + * to be copied to new string on failure.
> +*/
> +static int
> +try_parse_filter(const char *str, struct filter **filters,
> +                 unsigned int *nfilters)
> +{
> +	int res = 0;
> +	const char *p = str;
> +	if (!lookup_filter_type(str)) {
> +		for (; isalpha(*p); ++p, --res);
Note that isalpha and isspace are locale-specific, things could be ugly
in exotic locales without C/POSIX locale enforcement.

> +	} else {
> +		struct filter *filter = add_filter_to_array(filters, nfilters,
> +		                                            str);
> +		/*
> +		 * Find and parse filter argument.
> +		*/
> +		for (; isalpha(*p); ++p, ++res);
> +		for (; isspace(*p); ++p, ++res);
> +		char *arg = unescape_argument(p);
> +		res += strlen(arg);
> +		parse_filter(filter, arg, filter->type->name);
> +		free(arg);
> +	}
> +	return res;
> +}
Missing empty line.

> +/* Parses all filters from expression, replaces the filter
> + * with filter terminal and returns resulting string.
> +*/
> +char *
> +parse_filters_from_expression(const char *str, struct filter **filters,
> +                                   unsigned int *nfilters)
> +{
> +	char *new_expr = xcalloc(strlen(str) + 1, sizeof(char));
> +	const char *p;
> +	char *p_new;
> +	for (p = str, p_new = new_expr; *p && *p != ';'; ++p) {
> +		if (isalpha(*p)) {
> +			int res = try_parse_filter(p, filters, nfilters);
This approach would fail once you'd try to parse syscall read, or
syscall syscall, for example.

> +			if (res > 0) {
> +				/* Skip successfully parsed filter. */
> +				p += res - 1;
> +				int rc = snprintf(p_new, 2, "F");
Why snprintf and a variable for writing two characters to a string?

	*p_new++ = 'F';
	*p_new = '\0';

> +				p_new += rc;
> +			} else {
> +				/* Filter is not parsed. */
> +				strncpy(p_new, p, -res);
> +				p += -res - 1;
> +				p_new += -res;
> +			}
> +		} else {
> +			*(p_new++) = *p;
> +		}
> +	}
> +	unsigned int new_len = (p_new - new_expr) + 1;
> +	new_expr = xreallocarray(new_expr, new_len, sizeof(char));
> +	return new_expr;
> +}
> +
>  void *
>  get_filter_priv_data(struct filter *filter)
>  {
> diff --git a/filter.h b/filter.h
> index 7e03d75..8fcc8e8 100644
> --- a/filter.h
> +++ b/filter.h
> @@ -48,6 +48,8 @@ struct filter* add_filter_to_array(struct filter **, unsigned int *nfilters,
>                                     const char *name);
>  void parse_filter(struct filter *, const char *str, const char *const name);
>  bool *run_filters(struct tcb *, struct filter *, unsigned int);
> +char *parse_filters_from_expression(const char *str, struct filter **,
> +                                          unsigned int *nfilters);
>  void free_filter(struct filter *);
>  void *get_filter_priv_data(struct filter *);
>  void set_filter_priv_data(struct filter *, void *);
> @@ -62,5 +64,6 @@ void set_filter_action_priv_data(struct filter_action *, void *);
>  struct bool_expression *create_expression();
>  void set_expression_qualify_mode(struct bool_expression *);
>  bool run_expression(struct bool_expression *, unsigned int, bool *);
> +void parse_expression(const char *, struct bool_expression *, unsigned int);
>  
>  #endif
> diff --git a/filter_action.c b/filter_action.c
> index 0c235da..3663d49 100644
> --- a/filter_action.c
> +++ b/filter_action.c
> @@ -169,6 +169,40 @@ find_or_add_action(const char *name)
>  	return add_action(type);
>  }
>  
> +void
> +parse_action(const char *str)
> +{
> +	const struct filter_action_type *type;
> +	char *buf = xstrdup(str);
> +	char *expr = strchr(buf, '(');
> +	if (!expr || (expr == buf)) {
> +		type = lookup_filter_action_type("trace");
> +		expr = buf;
> +	} else if (expr[strlen(expr) - 1] == ')'){
Missing space before the opening curly bracket.

> +		type = lookup_filter_action_type(buf);
> +		++expr;
> +		expr[strlen(expr) - 1] = '\0';
Second strlen call.

> +		if (!type)
> +			error_msg_and_die("invalid filter action '%s'", buf);
> +	} else
> +		error_msg_and_die("invalid filter expression");
It's better to enclose both branches of a conditional statements in
curly brackets if at least one branch requires doing so.

> +
> +	struct filter_action *action = find_or_add_action(type->name);
> +	const char *args = strchr(expr, ';');
> +	if (!args && type->parse_args != &parse_null) {
parse_null comparison looks redundant.

> +		action->_priv_data = type->parse_args(NULL);
> +	}
Enclosing single statement here in curly brackets is not really needed.

> +	if (args) {
> +		action->_priv_data = type->parse_args(args + 1);
> +	}
Ditto here.

> +	unsigned int start_id = action->nfilters;
> +	expr = parse_filters_from_expression(expr, &action->filters,
> +	                                     &action->nfilters);
> +	parse_expression(expr, action->expr, start_id);
> +	free(expr);
> +	free(buf);
> +}
> +
>  static void
>  run_filter_action(struct tcb *tcp, struct filter_action *action)
>  {
> diff --git a/filter_expression.c b/filter_expression.c
> index ca02280..81dde02 100644
> --- a/filter_expression.c
> +++ b/filter_expression.c
> @@ -25,6 +25,8 @@
>   * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
>   */
>  #include "defs.h"
> +#include <ctype.h>
> +#include "filter.h"
>  
>  struct expression_token{
>  	enum token_type{
> @@ -55,17 +57,45 @@ create_expression(void)
>  }
>  
>  static void
> -reallocate_expression(struct bool_expression *const expr, const unsigned int new_ntokens)
> +reallocate_expression(struct bool_expression *expr, unsigned int new_ntokens)
Why const specifier removal?

>  {
>  	if (new_ntokens <= expr->ntokens)
>  		return;
>  	expr->tokens = xreallocarray(expr->tokens, new_ntokens,
> -	                             sizeof(*expr->tokens));
> +	                             sizeof(struct expression_token));
Why this change in this patch?

>  	memset(expr->tokens + expr->ntokens, 0,
>  	       sizeof(*expr->tokens) * (new_ntokens - expr->ntokens));
>  	expr->ntokens = new_ntokens;
>  }
>  
> +static void
> +add_variable_token(struct bool_expression *expr, unsigned int id)
> +{
> +	struct expression_token token;
> +	token.type = TOK_VARIABLE;
> +	token.data.variable_id = id;
> +	reallocate_expression(expr, expr->ntokens + 1);
> +	expr->tokens[expr->ntokens - 1] = token;
> +}
> +
> +static void
> +add_operator_token(struct bool_expression *expr, char c) {
Curly bracket should be on the separate line.

> +	struct expression_token token;
> +	token.type = TOK_OPERATOR;
> +	switch (c){
Missing space before the opening curly bracket.

> +	case '!':
> +		token.data.operator_id = OP_NOT;
> +		break;
> +	case '&':
> +		token.data.operator_id = OP_AND;
> +		break;
> +	case '|':
> +		token.data.operator_id = OP_OR;
> +	}
> +	reallocate_expression(expr, expr->ntokens + 1);
> +	expr->tokens[expr->ntokens - 1] = token;
> +}
> +
>  #define STACK_SIZE 32
>  static bool stack[STACK_SIZE];
>  
> @@ -129,3 +159,81 @@ run_expression(struct bool_expression *expr,
>  		error_msg_and_die("corrupted filter expression");
>  	return stack[0];
>  }
> +
> +void parse_expression(const char *str, struct bool_expression *expr,
> +                      unsigned int start_id)
> +{
> +	unsigned int variable_id = 0;
> +	/* Current stack index */
> +	unsigned int index = 0;
> +	char op_stack[STACK_SIZE];
> +	const char *p;
> +	for (p = str; *p; ++p) {
> +		if (isspace(*p))
> +			continue;
> +		switch (*p) {
> +		case 'F':
> +			add_variable_token(expr, variable_id++);
> +			break;
> +		case '(':
> +			if (index == STACK_SIZE)
> +				error_msg_and_die("stack overflow");
> +			op_stack[index++] = '(';
> +			break;
> +		case ')':
> +			while ((index > 0) && (op_stack[index - 1] != '('))
> +				add_operator_token(expr, op_stack[--index]);
> +			if (index == 0)
> +				error_msg_and_die("invalid filter expression");
> +			--index;
> +			break;
> +		case '!':
> +			op_stack[index++] = '!';
> +			break;
> +		case 'n':
> +			if (*(p + 1) != 'o' || *(p + 2) != 't')
> +				error_msg_and_die("invalid filter expression");
> +			p += 2;
> +			op_stack[index++] = '!';
> +			break;
> +		case '&':
> +			if (*(p + 1) != '&')
> +				error_msg_and_die("invalid filter expression");
> +			++p;
> +			while ((index > 0) && (op_stack[index - 1] == '!'))
> +				add_operator_token(expr, op_stack[--index]);
> +			op_stack[index++] = '&';
> +			break;
> +		case 'a':
> +			if (*(p + 1) != 'n' || *(p + 2) != 'd')
> +				error_msg_and_die("invalid filter expression");
> +			p += 2;
> +			while ((index > 0) && (op_stack[index - 1] == '!'))
> +				add_operator_token(expr, op_stack[--index]);
> +			op_stack[index++] = '&';
> +			break;
> +		case '|':
> +			if (*(p + 1) != '|')
> +				error_msg_and_die("invalid filter expression");
> +			++p;
> +			while ((index > 0) && ((op_stack[index - 1] == '!')
> +			       || (op_stack[index - 1] == '&')))
> +				add_operator_token(expr, op_stack[--index]);
> +			op_stack[index++] = '|';
> +			break;
> +		case 'o':
> +			if (*(p + 1) != 'r')
> +				error_msg_and_die("invalid filter expression");
> +			++p;
> +			while ((index > 0) && ((op_stack[index - 1] == '!')
> +			       || (op_stack[index - 1] == '&')))
> +				add_operator_token(expr, op_stack[--index]);
> +			op_stack[index++] = '|';
> +			break;
> +		}
> +	}
> +	while (index > 0)
> +		add_operator_token(expr, op_stack[--index]);
> +	if (start_id > 0)
> +		add_operator_token(expr, '&');
> +}

This expression parsing code successfully parses (and then evaluates) expression
'trace(syscall read syscall write syscall read || && !)', for example.

The whole idea of converting filter instances to special characters
along with spreading parser states across the code (like the condition in
escaped_argument_len/unescape_argument, not to mention its duplication)
makes this code highly unmaintainable. I'd prefer having plain dumb
DFA-based tokenizer/parser instead, with state table defined statically
or dynamically.

> diff --git a/strace.c b/strace.c
> index 0733b84..7d7ac1b 100644
> --- a/strace.c
> +++ b/strace.c
> @@ -1632,7 +1632,7 @@ init(int argc, char *argv[])
>  		"k"
>  #endif
>  		"D"
> -		"a:e:o:O:p:s:S:u:E:P:I:")) != EOF) {
> +		"a:e:m:o:O:p:s:S:u:E:P:I:")) != EOF) {
>  		switch (c) {
>  		case 'b':
>  			if (strcmp(optarg, "execve") != 0)
> @@ -1709,6 +1709,9 @@ init(int argc, char *argv[])
>  		case 'e':
>  			parse_qualify_filter(optarg);
>  			break;
> +		case 'm':
> +			parse_action(optarg);
> +			break;
I assume this change should be documented in strace.1 and NEWS, at
least.

Btw, I see no reason for using a separate option for the new syntax, it
could be supported along the old one (as the new one requires presence of
parentheses and the old one avoids them at all, for example).

>  		case 'o':
>  			outfname = xstrdup(optarg);
>  			break;
> -- 
> 2.1.4