
Commit
Merge pull request #550 from opencybersecurityalliance/k2-complete
K2 auto complete
subbyte authored Jul 23, 2024
2 parents 8f2963f + 2fd4584 commit 22b4621
Showing 16 changed files with 380 additions and 44 deletions.
14 changes: 7 additions & 7 deletions packages/kestrel_core/pyproject.toml
@@ -30,14 +30,14 @@ classifiers = [
 ]
 
 dependencies = [
-    "typeguard>=4.1.5",
+    "typeguard>=4.3.0",
     "pyyaml>=6.0.1",
-    "lark>=1.1.7",
-    "pandas>=2.0.3",
-    "pyarrow>=13.0.0",
-    "mashumaro>=3.10",
-    "networkx>=3.1", # networkx==3.2.1 only for Python>=3.9
-    "SQLAlchemy>=2.0.23",
+    "lark>=1.1.9",
+    "pandas>=2.0.3", # any higher version drops Python 3.8 support
+    "pyarrow>=17.0.0",
+    "mashumaro>=3.13.1",
+    "networkx>=3.1", # any higher version drops Python 3.8 support
+    "SQLAlchemy>=2.0.31",
 ]
 
 [project.optional-dependencies]
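The hunk above only raises minimum versions. A small sketch (not part of the commit) to check an installed environment against the new floors; package names and minimums are copied from the diff, and the packaging distribution is assumed to be available:

# Print whether each installed dependency meets the raised floor from pyproject.toml.
from importlib.metadata import version
from packaging.version import Version

FLOORS = {
    "typeguard": "4.3.0",
    "lark": "1.1.9",
    "pandas": "2.0.3",
    "pyarrow": "17.0.0",
    "mashumaro": "3.13.1",
    "networkx": "3.1",
    "SQLAlchemy": "2.0.31",
}

for pkg, floor in FLOORS.items():
    installed = version(pkg)
    status = "OK" if Version(installed) >= Version(floor) else "too old"
    print(f"{pkg}: installed {installed}, requires >={floor} ({status})")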
9 changes: 6 additions & 3 deletions packages/kestrel_core/src/kestrel/analytics/interface.py
@@ -107,7 +107,7 @@ def my_analytic(df: pd.DataFrame, x: int = 0, y: float = 0.5)
 import traceback
 from contextlib import AbstractContextManager
 from importlib.util import module_from_spec, spec_from_file_location
-from typing import Any, Iterable, Mapping, MutableMapping, Optional
+from typing import Any, Iterable, List, Mapping, MutableMapping, Optional
 from uuid import UUID
 
 from kestrel.analytics.config import get_profile, load_profiles
@@ -119,7 +119,7 @@ def my_analytic(df: pd.DataFrame, x: int = 0, y: float = 0.5)
     InvalidAnalyticsInterfaceImplementation,
     InvalidAnalyticsOutput,
 )
-from kestrel.interface import AbstractInterface
+from kestrel.interface import AnalyticsInterface
 from kestrel.ir.graph import IRGraphEvaluable
 from kestrel.ir.instructions import (
     Analytic,
@@ -157,7 +157,7 @@ def run(self, config: dict) -> DataFrame:
         return df
 
 
-class PythonAnalyticsInterface(AbstractInterface):
+class PythonAnalyticsInterface(AnalyticsInterface):
     def __init__(
         self,
         serialized_cache_catalog: Optional[str] = None,
@@ -171,6 +171,9 @@ def __init__(
     def schemes() -> Iterable[str]:
         return ["python"]
 
+    def get_datasources(self) -> List[str]:
+        return list(self.config)
+
     def get_storage_of_datasource(self, datasource: str) -> str:
         return "local"
 
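The functional addition in this file is get_datasources(), which exposes the configured analytics profile names for enumeration (used by the new auto-complete code to offer python:// targets). A toy stand-in of the pattern, with made-up class and profile names; only the return list(self.config) line mirrors the diff:

from typing import Dict, List


class ToyAnalyticsInterface:
    """Illustrative stand-in; the real PythonAnalyticsInterface loads profiles from config."""

    def __init__(self, config: Dict[str, dict]):
        self.config = config  # profile name -> profile settings

    def get_datasources(self) -> List[str]:
        # datasource names are simply the configured profile names
        return list(self.config)


itf = ToyAnalyticsInterface({"domain_enrich": {}, "threat_lookup": {}})  # hypothetical profiles
print(itf.get_datasources())  # ['domain_enrich', 'threat_lookup']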
16 changes: 7 additions & 9 deletions packages/kestrel_core/src/kestrel/cache/base.py
@@ -1,26 +1,24 @@
 from __future__ import annotations
 
 from abc import abstractmethod
-from typing import Iterable, MutableMapping
+from typing import Iterable, List, MutableMapping
 from uuid import UUID
 
 from kestrel.config.internal import CACHE_INTERFACE_IDENTIFIER, CACHE_STORAGE_IDENTIFIER
-from kestrel.interface import AbstractInterface
+from kestrel.interface import DatasourceInterface
 from pandas import DataFrame
 
 
-class AbstractCache(AbstractInterface, MutableMapping):
-    """Base class for Kestrel cache
-
-    Additional @abstractmethod from AbstractInterface:
-
-        - evaluate_graph()
-    """
+class AbstractCache(DatasourceInterface, MutableMapping):
+    """Base class for Kestrel cache"""
 
     @staticmethod
     def schemes() -> Iterable[str]:
         return [CACHE_INTERFACE_IDENTIFIER]
 
+    def get_datasources(self) -> List[str]:
+        return []
+
     def get_storage_of_datasource(self, datasource: str) -> str:
         return CACHE_STORAGE_IDENTIFIER
 
@@ -7,3 +7,4 @@ network_endpoint,RESPONDED,dst_endpoint
 reg_key,RESPONDED,reg_key
 reg_value,RESPONDED,reg_value
 user,ORIGINATED,actor.user
+endpoint,RESPONDED,device
35 changes: 34 additions & 1 deletion packages/kestrel_core/src/kestrel/config/utils.py
@@ -1,7 +1,7 @@
 import logging
 import os
 from pathlib import Path
-from typing import Mapping, Union
+from typing import List, Mapping, Union
 
 import pandas
 import yaml
@@ -17,6 +17,9 @@
 CONFIG_PATH_DEFAULT = CONFIG_DIR_DEFAULT / "kestrel.yaml"
 CONFIG_PATH_ENV_VAR = "KESTREL_CONFIG" # override CONFIG_PATH_DEFAULT if provided
 
+relations = []
+entity_types = []
+
 _logger = logging.getLogger(__name__)
 
 
@@ -105,3 +108,33 @@ def load_relation_configs(table_name: str) -> pandas.DataFrame:
     except:
         raise InvalidKestrelRelationTable(filepaths[0])
     return table
+
+
+@typechecked
+def get_all_relations() -> List[str]:
+    global relations
+    if not relations:
+        _relations = set()
+        for filepath in list_folder_files(
+            "kestrel.config", "relations", extension="csv"
+        ):
+            table = pandas.read_csv(filepath)
+            _relations |= set(table["Relation"].to_list())
+        relations = list(_relations)
+    return relations
+
+
+@typechecked
+def get_all_entity_types() -> List[str]:
+    global entity_types
+    if not entity_types:
+        _entity_types = {"event"}
+        for filepath in list_folder_files(
+            "kestrel.config", "relations", extension="csv"
+        ):
+            table = pandas.read_csv(filepath)
+            for typecol in ("InputType", "OutputType"):
+                if typecol in table:
+                    _entity_types |= set(table[typecol].to_list())
+        entity_types = list(_entity_types)
+    return entity_types
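A standalone sketch of what the two new helpers compute; the sample rows below are made up, but the column names (Relation, InputType, OutputType) and the implicit "event" type follow the code above:

from io import StringIO

import pandas

SAMPLE_RELATION_CSV = """InputType,Relation,OutputType
process,CREATED,file
process,OWNED_BY,user
"""

table = pandas.read_csv(StringIO(SAMPLE_RELATION_CSV))

# get_all_relations(): union of the "Relation" column across relation tables
relations = sorted(set(table["Relation"].to_list()))

# get_all_entity_types(): "event" plus the union of InputType/OutputType columns
entity_types = {"event"}
for typecol in ("InputType", "OutputType"):
    if typecol in table:
        entity_types |= set(table[typecol].to_list())

print(relations)             # ['CREATED', 'OWNED_BY']
print(sorted(entity_types))  # ['event', 'file', 'process', 'user']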
229 changes: 229 additions & 0 deletions packages/kestrel_core/src/kestrel/frontend/completor.py
@@ -0,0 +1,229 @@
import logging
import re
from datetime import datetime
from typing import Callable, Iterable, List, Tuple

import lark
from kestrel.config.utils import get_all_entity_types, get_all_relations
from kestrel.frontend.parser import get_keywords, parse_without_transform
from kestrel.interface import InterfaceManager
from kestrel.interface.base import AnalyticsInterface, DatasourceInterface
from kestrel.utils import timefmt
from typeguard import typechecked

_logger = logging.getLogger(__name__)

ISO_TS_RE = re.compile(r"\d{4}(-\d{2}(-\d{2}(T\d{2}(:\d{2}(:\d{2}Z?)?)?)?)?)?")


@typechecked
def do_complete(
    code: str,
    cursor_pos: int,
    itf_manager: InterfaceManager,
    varnames: List[str],
) -> Iterable[str]:
    _logger.debug("auto_complete function starts...")

    # do not care code after cursor position in the current version
    line = code[:cursor_pos]
    _logger.debug(f"line to auto-complete: {line}")

    # if the last char is a space, `line_to_parse = line`
    # otherwise, exclude the last token in `line_to_parse` to prompt the expected token
    last_word_prefix, line_to_parse = _split_last_token(line)
    _logger.debug(f"last word prefix: {last_word_prefix}")
    _logger.debug(f"line to parse: {line_to_parse}")

    try:
        ast = parse_without_transform(line_to_parse)

    except lark.exceptions.UnexpectedCharacters as e:
        suggestions = ["% illegal char in huntflow %"]
        _logger.debug(f"illegal character in `line_to_parse`, err: {str(e)}")

    except lark.exceptions.UnexpectedEOF as e:
        suggestions = ["% EOF auto-complete internal error, report to developers %"]
        # https://github.com/lark-parser/lark/issues/791
        # Lark updates may break this, check if it is the case
        # no need to use KestrelInternalError; not to break huntflow execution
        _logger.debug(f"Lark with LALR should not give this error: {str(e)}")

    except lark.exceptions.UnexpectedToken as e:
        error_token = e.token
        expected_tokens = e.accepts or e.expected
        expected_values = []
        keywords = set(get_keywords(False))
        relations = get_all_relations()
        entity_types = get_all_entity_types()
        for token in expected_tokens:
            _logger.debug("token: %s", token)
            if token == "VARIABLE":
                expected_values.extend(varnames)
            elif token == "ISOTIMESTAMP":
                if last_word_prefix:
                    if last_word_prefix.startswith("t'"):
                        ts_prefix = last_word_prefix[2:]
                        ts_complete = _do_complete_timestamp(ts_prefix)
                        exp_value = "t'" + ts_complete + "'"
                    else:
                        exp_value = _do_complete_timestamp(last_word_prefix)
                else:
                    exp_value = timefmt(datetime.now())
                expected_values.append(exp_value)
            elif token == "DATASRC_SIMPLE":
                _logger.debug("auto-complete data source")
                expected_values.extend(
                    _do_complete_interface(
                        last_word_prefix,
                        itf_manager.schemes(DatasourceInterface),
                        itf_manager.list_datasources_from_scheme,
                    )
                )
            elif token == "ANALYTICS_SIMPLE":
                _logger.debug("auto-complete analytics")
                expected_values.extend(
                    _do_complete_interface(
                        last_word_prefix,
                        itf_manager.schemes(AnalyticsInterface),
                        itf_manager.list_datasources_from_scheme,
                    )
                )
            elif token == "ENTITY_TYPE":
                expected_values.extend(entity_types)
            elif token == "PROJECT_FIELD": # not precise
                expected_values.extend(entity_types)
            elif token == "RELATION":
                expected_values.extend(relations)
            elif token == "REVERSED":
                expected_values.append("BY")
            elif token == "EQUAL":
                expected_values.append("=")
            elif token == "ATTRIBUTE":
                # TODO: attribute completion
                # https://github.com/opencybersecurityalliance/kestrel-lang/issues/79
                _logger.debug(f"TODO: ATTRIBUTE COMPLETION")
            elif token == "ENTITY_ATTRIBUTE_PATH":
                # TODO: attribute completion
                # https://github.com/opencybersecurityalliance/kestrel-lang/issues/79
                _logger.debug(f"TODO: ATTRIBUTE COMPLETION")
            elif token == "COMMA":
                expected_values.append(",")
            elif token in keywords:
                if last_word_prefix and last_word_prefix.islower():
                    token = token.lower()
                expected_values.append(token)
            else:
                # token not handled
                continue
        expected_values = sorted(expected_values)
        _logger.debug(f"expected values: {expected_values}")

        # turn `expected_values` into `suggestions`
        _p = last_word_prefix
        _e = expected_values
        suggestions = [t[len(_p) :] for t in _e if t.startswith(_p)] if _p else _e
        suggestions = [x for x in set(suggestions) if x]
        _logger.debug(f"suggestions: {suggestions}")

    else:
        suggestions = []

        # handle optional components
        if ast:
            stmt = ast.children[-1].children[0]
            cmd = (
                stmt.children[1].data.value
                if stmt.data.value == "assignment"
                else stmt.data.value
            )
            if cmd == "disp":
                for clause in ("attr_clause", "limit_clause", "offset_clause"):
                    if not list(stmt.find_data(clause)):
                        suggestions.append("ATTR")
            elif cmd in ("expression", "find") and not list(
                stmt.find_data("where_clause")
            ):
                suggestions.append("WHERE")
            elif cmd in ("get", "find") and not list(stmt.find_data("timerange")):
                suggestions.append("START")
            elif cmd == "apply" and not list(stmt.find_data("args")):
                suggestions.append("WITH")

            suggestions = [x for x in set(suggestions) if x]
            _p = last_word_prefix
            suggestions = (
                [t[len(_p) :] for t in suggestions if t.startswith(_p)]
                if _p
                else suggestions
            )
            _logger.debug(f"suggestions from optional components: {suggestions}")

    return suggestions


@typechecked
def _end_with_blank_or_comma(s: str) -> bool:
    return s[-1] in [" ", "\t", "\n", "\r", "\f", "\v", ","] if s else True


@typechecked
def _split_last_token(s: str) -> Tuple[str, str]:
    last = ""
    if not _end_with_blank_or_comma(s):
        while not _end_with_blank_or_comma(s):
            last = s[-1] + last
            s = s[:-1]
    return last, s


@typechecked
def _do_complete_timestamp(ts_prefix: str) -> str:
    valid_ts_formats = [
        "%Y",
        "%Y-%m",
        "%Y-%m-%d",
        "%Y-%m-%dT%H",
        "%Y-%m-%dT%H:%M",
        "%Y-%m-%dT%H:%M:%S",
    ]
    matched = ISO_TS_RE.match(ts_prefix)
    if matched:
        for ts_format in valid_ts_formats:
            _logger.debug(f"Match timestamp {ts_prefix} with format {ts_format}")
            try:
                ts = datetime.strptime(matched.group(), ts_format)
            except:
                _logger.debug(f"Timestamp match failed")
            else:
                ts_complete = timefmt(ts)
                _logger.debug(f"Timestamp completed: {ts_complete}")
                break
        else:
            ts_complete = "% TS auto-complete internal error, report to developers %"
            # no need to use KestrelInternalError; not to break huntflow execution
            _logger.debug(
                f"TS auto-complete internal error: `valid_ts_formats` is incomplete"
            )
    else:
        ts_complete = "% illegal ISO 8601 timestamp prefix %"
        _logger.debug(f"illegal ISO 8601 timestamp prefix: {ts_prefix}")
    return ts_complete


@typechecked
def _do_complete_interface(
    last_word_prefix: str,
    schemes: Iterable[str],
    list_names_from_scheme: Callable,
) -> Iterable[str]:
    if last_word_prefix and "://" in last_word_prefix:
        scheme, _ = last_word_prefix.split("://")
        if scheme in schemes:
            names = list_names_from_scheme(scheme)
            paths = [scheme + "://" + name for name in names]
            _logger.debug(f"auto-complete interface {scheme}: {paths}")
            expected_values = paths
    else:
        expected_values = [scheme + "://" for scheme in schemes]
    return expected_values
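The core mechanism in do_complete() is to parse the truncated input with the LALR parser and read the expected terminal names off Lark's UnexpectedToken error (e.accepts or e.expected). A self-contained sketch of that trick with a toy grammar (not Kestrel's real grammar); it assumes, as the code above does, that a prematurely ended line surfaces as an UnexpectedToken at $END with LALR:

import lark

# Toy grammar, loosely shaped like a GET command; not the Kestrel grammar.
GRAMMAR = r"""
start: "GET" NAME "FROM" NAME "WHERE" NAME "=" STRING
NAME: /[a-z_]+/
STRING: /'[^']*'/
%import common.WS
%ignore WS
"""

parser = lark.Lark(GRAMMAR, parser="lalr")


def expected_next(truncated_line: str) -> set:
    """Return the terminal names the parser would accept after the truncated input."""
    try:
        parser.parse(truncated_line)
    except lark.exceptions.UnexpectedToken as e:
        return set(e.accepts or e.expected)
    return set()


print(expected_next("GET process "))          # e.g. {'FROM'}
print(expected_next("GET process FROM ds "))  # e.g. {'WHERE'}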