From a1283dda10ece6316124a827b00f42a7352e6185 Mon Sep 17 00:00:00 2001 From: Paul Coccoli Date: Mon, 17 Jul 2023 15:00:55 -0400 Subject: [PATCH] Change some terminals to rules in the grammar --- src/kestrel/syntax/kestrel.lark | 58 +++++++++------------ src/kestrel/syntax/parser.py | 89 +++++++++++++++++++++++++++------ 2 files changed, 99 insertions(+), 48 deletions(-) diff --git a/src/kestrel/syntax/kestrel.lark b/src/kestrel/syntax/kestrel.lark index 71b04641..52f5c256 100644 --- a/src/kestrel/syntax/kestrel.lark +++ b/src/kestrel/syntax/kestrel.lark @@ -10,7 +10,7 @@ start: statement* statement: assignment | command_no_result - + // If no VARIABLE is given, default to _ in post-parsing // For assign or merge, the result variable is required // This eliminates meaningless huntflows like `var1 var2 var3` @@ -46,7 +46,7 @@ join: "JOIN"i VARIABLE "," VARIABLE (BY ATTRIBUTE "," ATTRIBUTE)? load: "LOAD"i stdpath ("AS"i ENTITY_TYPE)? -new: "NEW"i ENTITY_TYPE? VAR_DATA +new: "NEW"i ENTITY_TYPE? var_data sort: "SORT"i VARIABLE BY ATTRIBUTE (ASC|DESC)? @@ -104,8 +104,8 @@ offset_clause: "OFFSET"i INT | comparison_null | "(" disjunction ")" -comparison_std: ENTITY_ATTRIBUTE_PATH OP value -comparison_null: ENTITY_ATTRIBUTE_PATH NULL_OP NULL +comparison_std: ENTITY_ATTRIBUTE_PATH op value +comparison_null: ENTITY_ATTRIBUTE_PATH null_op NULL // // Timespan @@ -114,15 +114,15 @@ comparison_null: ENTITY_ATTRIBUTE_PATH NULL_OP NULL ?timespan: "start"i timestamp "stop"i timestamp -> timespan_absolute | "last"i INT timeunit -> timespan_relative -?timeunit: DAY - | HOUR - | MINUTE - | SECOND +?timeunit: day + | hour + | minute + | second -DAY: "days"i | "day"i | "d"i -HOUR: "hours"i | "hour"i | "h"i -MINUTE: "minutes"i | "minute"i | "m"i -SECOND: "seconds"i | "second"i | "s"i +day: "days"i | "day"i | "d"i +hour: "hours"i | "hour"i | "h"i +minute: "minutes"i | "minute"i | "m"i +second: "seconds"i | "second"i | "s"i timestamp: ISOTIMESTAMP | "\"" ISOTIMESTAMP "\"" @@ -189,17 +189,15 @@ ANALYTICS_ESCAPED: PATH_ESCAPED // Two-level JSON in command NEW // -// use terminal to load the entire VAR_DATA without parsing into it -// add `WS*` since `%ignore WS` doesn't apply to spaces inside terminals -// https://github.com/lark-parser/lark/issues/99 -VAR_DATA: "[" (RAW_VALUES | JSON_OBJS) "]" +// use terminal to load the entire var_data without parsing into it +var_data: "[" (RAW_VALUES | json_objs) "]" RAW_VALUES: ESCAPED_STRING_WS ("," ESCAPED_STRING_WS)* -JSON_OBJS: JSON_OBJ ("," JSON_OBJ)* -JSON_OBJ: WS* "{" JSON_PAIR ("," JSON_PAIR)* "}" WS* -JSON_PAIR: ESCAPED_STRING_WS ":" JSON_VALUE -JSON_VALUE: WS* (NUMBER|ESCAPED_STRING|TRUE|FALSE|NULL) WS* +json_objs: json_obj ("," json_obj)* +json_obj: "{" json_pair ("," json_pair)* "}" +json_pair: ESCAPED_STRING ":" json_value +json_value: (NUMBER|ESCAPED_STRING|TRUE|FALSE|NULL) // // Arguments @@ -228,24 +226,18 @@ NOT: "NOT"i ISSUBSET: "ISSUBSET"i ISSUPERSET: "ISSUPERSET"i -OP: OP_SIGN - | (NOT WS+)? OP_KEYWORD +op: OP_SIGN + | NOT? op_keyword -OP_SIGN: "=" - | "==" - | "!=" - | ">" - | "<" - | ">=" - | ">=" +OP_SIGN: /([!=]?=|[<>]=?)/ -OP_KEYWORD: IN +op_keyword: IN | LIKE | MATCHES | ISSUBSET | ISSUPERSET -NULL_OP: IS (WS+ NOT)? +null_op: IS (NOT)? // // Common language constructs @@ -263,7 +255,7 @@ literal_list: "(" literal ("," literal)* ")" reference_or_simple_string: ECNAME ("." ATTRIBUTE)? -string: ADVANCED_STRING +?string: advanced_string number: NUMBER @@ -295,7 +287,7 @@ SIMPLE_STRING: ECNAME // nearly Python string, but no [ubf]? as prefix options // check Lark example of Python parser for reference -ADVANCED_STRING: /(r?)("(?!"").*?(? COMMENT diff --git a/src/kestrel/syntax/parser.py b/src/kestrel/syntax/parser.py index b3196934..31a536ea 100644 --- a/src/kestrel/syntax/parser.py +++ b/src/kestrel/syntax/parser.py @@ -195,10 +195,20 @@ def save(self, args): return packet def new(self, args): + if len(args) == 1: + # Try to get entity type from first entity + data = args[0] + if isinstance(data, list): + entity_type = data[0].get("type") + else: + entity_type = None + else: + entity_type = _first(args) + data = args[1] return { "command": "new", - "type": self._extract_entity_type(args), - "data": self._assert_and_extract_single("VAR_DATA", args), + "type": entity_type, + "data": data, } def expression(self, args): @@ -236,13 +246,13 @@ def expression_and(self, args): def comparison_std(self, args): etype, attr = _extract_entity_and_attribute(args[0].value) # remove more than one spaces; capitalize op - op = " ".join(_second(args).split()).upper() + op = args[1] value = args[2] return ECGPComparison(attr, op, value, etype) def comparison_null(self, args): etype, attr = _extract_entity_and_attribute(args[0].value) - op = _second(args) + op = args[1] if "NOT" in op: op = "!=" else: @@ -271,12 +281,9 @@ def reference_or_simple_string(self, args): v = _first(args) return v - def string(self, args): + def advanced_string(self, args): raw = _first(args) - if args[0].type == self.token_prefix + "SIMPLE_STRING": - value = raw - elif args[0].type == self.token_prefix + "ADVANCED_STRING": - value = unescape_quoted_string(raw) + value = unescape_quoted_string(raw) return value def number(self, args): @@ -311,13 +318,13 @@ def offset_clause(self, args): def timespan_relative(self, args): num = int(args[0]) unit = args[1] - if unit.type == self.token_prefix + "DAY": + if unit == "DAY": delta = timedelta(days=num) - elif unit.type == self.token_prefix + "HOUR": + elif unit == "HOUR": delta = timedelta(hours=num) - elif unit.type == self.token_prefix + "MINUTE": + elif unit == "MINUTE": delta = timedelta(minutes=num) - elif unit.type == self.token_prefix + "SECOND": + elif unit == "SECOND": delta = timedelta(seconds=num) stop = datetime.utcnow() start = stop - delta @@ -328,8 +335,48 @@ def timespan_absolute(self, args): stop = to_datetime(args[1]) return {"timerange": (start, stop)} + def day(self, _args): + return "DAY" + + def hour(self, _args): + return "HOUR" + + def minute(self, _args): + return "MINUTE" + + def second(self, _args): + return "SECOND" + def timestamp(self, args): - return self._assert_and_extract_single("ISOTIMESTAMP", args) + return args[0] + + def var_data(self, args): + if isinstance(args[0], Token): + # Restore the brackets + v = "[" + _first(args) + "]" + else: + v = args[0] + return v + + def json_objs(self, args): + return args + + def json_obj(self, args): + return dict(args) + + def json_pair(self, args): + v = _first(args) + if "ESCAPED_STRING" in args[0].type: + v = unescape_quoted_string(v) + return v, args[1] + + def json_value(self, args): + v = _first(args) + if "ESCAPED_STRING" in args[0].type: + v = unescape_quoted_string(v) + elif args[0].type == self.token_prefix + "NUMBER": + v = float(v) if "." in v else int(v) + return v def entity_type(self, args): return _first(args) @@ -374,7 +421,7 @@ def bin_func(self, args): attr = _first(args) num = int(_second(args)) if len(args) >= 3: - unit = _third(args) + unit = args[2][0] # Only pass 1st letter (d, h, m, or s) else: unit = None alias = f"{attr}_bin" @@ -398,6 +445,18 @@ def args(self, args): def arg_kv_pair(self, args): return {_first(args): args[1]} + def op(self, args): + return " ".join([arg.upper() for arg in args]) + + def op_sign(self, args): + return _first(args) if args else "" + + def op_keyword(self, args): + return _first(args) + + def null_op(self, args): + return " ".join([arg.upper() for arg in args]) + def _extract_vars(self, args): var_names = [] for arg in args: