
Commit
Merge pull request #550 from opencybersecurityalliance/k2-complete
K2 auto complete
subbyte authored Jul 23, 2024
2 parents 8f2963f + 2fd4584 commit 22b4621
Showing 16 changed files with 380 additions and 44 deletions.
14 changes: 7 additions & 7 deletions packages/kestrel_core/pyproject.toml
@@ -30,14 +30,14 @@ classifiers = [
 ]
 
 dependencies = [
-    "typeguard>=4.1.5",
+    "typeguard>=4.3.0",
     "pyyaml>=6.0.1",
-    "lark>=1.1.7",
-    "pandas>=2.0.3",
-    "pyarrow>=13.0.0",
-    "mashumaro>=3.10",
-    "networkx>=3.1", # networkx==3.2.1 only for Python>=3.9
-    "SQLAlchemy>=2.0.23",
+    "lark>=1.1.9",
+    "pandas>=2.0.3", # any higher version drops Python 3.8 support
+    "pyarrow>=17.0.0",
+    "mashumaro>=3.13.1",
+    "networkx>=3.1", # any higher version drops Python 3.8 support
+    "SQLAlchemy>=2.0.31",
 ]
 
 [project.optional-dependencies]
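The hunk above only raises minimum versions. A small sketch (not part of the commit) to check an installed environment against the new floors; package names and minimums are copied from the diff, and the packaging distribution is assumed to be available:

# Print whether each installed dependency meets the raised floor from pyproject.toml.
from importlib.metadata import version
from packaging.version import Version

FLOORS = {
    "typeguard": "4.3.0",
    "lark": "1.1.9",
    "pandas": "2.0.3",
    "pyarrow": "17.0.0",
    "mashumaro": "3.13.1",
    "networkx": "3.1",
    "SQLAlchemy": "2.0.31",
}

for pkg, floor in FLOORS.items():
    installed = version(pkg)
    status = "OK" if Version(installed) >= Version(floor) else "too old"
    print(f"{pkg}: installed {installed}, requires >={floor} ({status})")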
9 changes: 6 additions & 3 deletions packages/kestrel_core/src/kestrel/analytics/interface.py
@@ -107,7 +107,7 @@ def my_analytic(df: pd.DataFrame, x: int = 0, y: float = 0.5)
 import traceback
 from contextlib import AbstractContextManager
 from importlib.util import module_from_spec, spec_from_file_location
-from typing import Any, Iterable, Mapping, MutableMapping, Optional
+from typing import Any, Iterable, List, Mapping, MutableMapping, Optional
 from uuid import UUID
 
 from kestrel.analytics.config import get_profile, load_profiles
@@ -119,7 +119,7 @@ def my_analytic(df: pd.DataFrame, x: int = 0, y: float = 0.5)
     InvalidAnalyticsInterfaceImplementation,
     InvalidAnalyticsOutput,
 )
-from kestrel.interface import AbstractInterface
+from kestrel.interface import AnalyticsInterface
 from kestrel.ir.graph import IRGraphEvaluable
 from kestrel.ir.instructions import (
     Analytic,
@@ -157,7 +157,7 @@ def run(self, config: dict) -> DataFrame:
         return df
 
 
-class PythonAnalyticsInterface(AbstractInterface):
+class PythonAnalyticsInterface(AnalyticsInterface):
     def __init__(
         self,
         serialized_cache_catalog: Optional[str] = None,
@@ -171,6 +171,9 @@ def __init__(
     def schemes() -> Iterable[str]:
         return ["python"]
 
+    def get_datasources(self) -> List[str]:
+        return list(self.config)
+
     def get_storage_of_datasource(self, datasource: str) -> str:
         return "local"
 
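The functional addition in this file is get_datasources(), which exposes the configured analytics profile names for enumeration (used by the new auto-complete code to offer python:// targets). A toy stand-in of the pattern, with made-up class and profile names; only the return list(self.config) line mirrors the diff:

from typing import Dict, List


class ToyAnalyticsInterface:
    """Illustrative stand-in; the real PythonAnalyticsInterface loads profiles from config."""

    def __init__(self, config: Dict[str, dict]):
        self.config = config  # profile name -> profile settings

    def get_datasources(self) -> List[str]:
        # datasource names are simply the configured profile names
        return list(self.config)


itf = ToyAnalyticsInterface({"domain_enrich": {}, "threat_lookup": {}})  # hypothetical profiles
print(itf.get_datasources())  # ['domain_enrich', 'threat_lookup']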
16 changes: 7 additions & 9 deletions packages/kestrel_core/src/kestrel/cache/base.py
@@ -1,26 +1,24 @@
 from __future__ import annotations
 
 from abc import abstractmethod
-from typing import Iterable, MutableMapping
+from typing import Iterable, List, MutableMapping
 from uuid import UUID
 
 from kestrel.config.internal import CACHE_INTERFACE_IDENTIFIER, CACHE_STORAGE_IDENTIFIER
-from kestrel.interface import AbstractInterface
+from kestrel.interface import DatasourceInterface
 from pandas import DataFrame
 
 
-class AbstractCache(AbstractInterface, MutableMapping):
-    """Base class for Kestrel cache
-
-    Additional @abstractmethod from AbstractInterface:
-
-        - evaluate_graph()
-    """
+class AbstractCache(DatasourceInterface, MutableMapping):
+    """Base class for Kestrel cache"""
 
     @staticmethod
     def schemes() -> Iterable[str]:
         return [CACHE_INTERFACE_IDENTIFIER]
 
+    def get_datasources(self) -> List[str]:
+        return []
+
     def get_storage_of_datasource(self, datasource: str) -> str:
         return CACHE_STORAGE_IDENTIFIER
 
@@ -7,3 +7,4 @@ network_endpoint,RESPONDED,dst_endpoint
 reg_key,RESPONDED,reg_key
 reg_value,RESPONDED,reg_value
 user,ORIGINATED,actor.user
+endpoint,RESPONDED,device
35 changes: 34 additions & 1 deletion packages/kestrel_core/src/kestrel/config/utils.py
@@ -1,7 +1,7 @@
 import logging
 import os
 from pathlib import Path
-from typing import Mapping, Union
+from typing import List, Mapping, Union
 
 import pandas
 import yaml
@@ -17,6 +17,9 @@
 CONFIG_PATH_DEFAULT = CONFIG_DIR_DEFAULT / "kestrel.yaml"
 CONFIG_PATH_ENV_VAR = "KESTREL_CONFIG" # override CONFIG_PATH_DEFAULT if provided
 
+relations = []
+entity_types = []
+
 _logger = logging.getLogger(__name__)
 
 
@@ -105,3 +108,33 @@ def load_relation_configs(table_name: str) -> pandas.DataFrame:
     except:
         raise InvalidKestrelRelationTable(filepaths[0])
     return table
+
+
+@typechecked
+def get_all_relations() -> List[str]:
+    global relations
+    if not relations:
+        _relations = set()
+        for filepath in list_folder_files(
+            "kestrel.config", "relations", extension="csv"
+        ):
+            table = pandas.read_csv(filepath)
+            _relations |= set(table["Relation"].to_list())
+        relations = list(_relations)
+    return relations
+
+
+@typechecked
+def get_all_entity_types() -> List[str]:
+    global entity_types
+    if not entity_types:
+        _entity_types = {"event"}
+        for filepath in list_folder_files(
+            "kestrel.config", "relations", extension="csv"
+        ):
+            table = pandas.read_csv(filepath)
+            for typecol in ("InputType", "OutputType"):
+                if typecol in table:
+                    _entity_types |= set(table[typecol].to_list())
+        entity_types = list(_entity_types)
+    return entity_types
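A standalone sketch of what the two new helpers compute; the sample rows below are made up, but the column names (Relation, InputType, OutputType) and the implicit "event" type follow the code above:

from io import StringIO

import pandas

SAMPLE_RELATION_CSV = """InputType,Relation,OutputType
process,CREATED,file
process,OWNED_BY,user
"""

table = pandas.read_csv(StringIO(SAMPLE_RELATION_CSV))

# get_all_relations(): union of the "Relation" column across relation tables
relations = sorted(set(table["Relation"].to_list()))

# get_all_entity_types(): "event" plus the union of InputType/OutputType columns
entity_types = {"event"}
for typecol in ("InputType", "OutputType"):
    if typecol in table:
        entity_types |= set(table[typecol].to_list())

print(relations)             # ['CREATED', 'OWNED_BY']
print(sorted(entity_types))  # ['event', 'file', 'process', 'user']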
229 changes: 229 additions & 0 deletions packages/kestrel_core/src/kestrel/frontend/completor.py
@@ -0,0 +1,229 @@
import logging
import re
from datetime import datetime
from typing import Callable, Iterable, List, Tuple

import lark
from kestrel.config.utils import get_all_entity_types, get_all_relations
from kestrel.frontend.parser import get_keywords, parse_without_transform
from kestrel.interface import InterfaceManager
from kestrel.interface.base import AnalyticsInterface, DatasourceInterface
from kestrel.utils import timefmt
from typeguard import typechecked

_logger = logging.getLogger(__name__)

ISO_TS_RE = re.compile(r"\d{4}(-\d{2}(-\d{2}(T\d{2}(:\d{2}(:\d{2}Z?)?)?)?)?)?")


@typechecked
def do_complete(
    code: str,
    cursor_pos: int,
    itf_manager: InterfaceManager,
    varnames: List[str],
) -> Iterable[str]:
    _logger.debug("auto_complete function starts...")

    # do not care code after cursor position in the current version
    line = code[:cursor_pos]
    _logger.debug(f"line to auto-complete: {line}")

    # if the last char is a space, `line_to_parse = line`
    # otherwise, exclude the last token in `line_to_parse` to prompt the expected token
    last_word_prefix, line_to_parse = _split_last_token(line)
    _logger.debug(f"last word prefix: {last_word_prefix}")
    _logger.debug(f"line to parse: {line_to_parse}")

    try:
        ast = parse_without_transform(line_to_parse)

    except lark.exceptions.UnexpectedCharacters as e:
        suggestions = ["% illegal char in huntflow %"]
        _logger.debug(f"illegal character in `line_to_parse`, err: {str(e)}")

    except lark.exceptions.UnexpectedEOF as e:
        suggestions = ["% EOF auto-complete internal error, report to developers %"]
        # https://github.com/lark-parser/lark/issues/791
        # Lark updates may break this, check if it is the case
        # no need to use KestrelInternalError; not to break huntflow execution
        _logger.debug(f"Lark with LALR should not give this error: {str(e)}")

    except lark.exceptions.UnexpectedToken as e:
        error_token = e.token
        expected_tokens = e.accepts or e.expected
        expected_values = []
        keywords = set(get_keywords(False))
        relations = get_all_relations()
        entity_types = get_all_entity_types()
        for token in expected_tokens:
            _logger.debug("token: %s", token)
            if token == "VARIABLE":
                expected_values.extend(varnames)
            elif token == "ISOTIMESTAMP":
                if last_word_prefix:
                    if last_word_prefix.startswith("t'"):
                        ts_prefix = last_word_prefix[2:]
                        ts_complete = _do_complete_timestamp(ts_prefix)
                        exp_value = "t'" + ts_complete + "'"
                    else:
                        exp_value = _do_complete_timestamp(last_word_prefix)
                else:
                    exp_value = timefmt(datetime.now())
                expected_values.append(exp_value)
            elif token == "DATASRC_SIMPLE":
                _logger.debug("auto-complete data source")
                expected_values.extend(
                    _do_complete_interface(
                        last_word_prefix,
                        itf_manager.schemes(DatasourceInterface),
                        itf_manager.list_datasources_from_scheme,
                    )
                )
            elif token == "ANALYTICS_SIMPLE":
                _logger.debug("auto-complete analytics")
                expected_values.extend(
                    _do_complete_interface(
                        last_word_prefix,
                        itf_manager.schemes(AnalyticsInterface),
                        itf_manager.list_datasources_from_scheme,
                    )
                )
            elif token == "ENTITY_TYPE":
                expected_values.extend(entity_types)
            elif token == "PROJECT_FIELD": # not precise
                expected_values.extend(entity_types)
            elif token == "RELATION":
                expected_values.extend(relations)
            elif token == "REVERSED":
                expected_values.append("BY")
            elif token == "EQUAL":
                expected_values.append("=")
            elif token == "ATTRIBUTE":
                # TODO: attribute completion
                # https://github.com/opencybersecurityalliance/kestrel-lang/issues/79
                _logger.debug(f"TODO: ATTRIBUTE COMPLETION")
            elif token == "ENTITY_ATTRIBUTE_PATH":
                # TODO: attribute completion
                # https://github.com/opencybersecurityalliance/kestrel-lang/issues/79
                _logger.debug(f"TODO: ATTRIBUTE COMPLETION")
            elif token == "COMMA":
                expected_values.append(",")
            elif token in keywords:
                if last_word_prefix and last_word_prefix.islower():
                    token = token.lower()
                expected_values.append(token)
            else:
                # token not handled
                continue
        expected_values = sorted(expected_values)
        _logger.debug(f"expected values: {expected_values}")

        # turn `expected_values` into `suggestions`
        _p = last_word_prefix
        _e = expected_values
        suggestions = [t[len(_p) :] for t in _e if t.startswith(_p)] if _p else _e
        suggestions = [x for x in set(suggestions) if x]
        _logger.debug(f"suggestions: {suggestions}")

    else:
        suggestions = []

        # handle optional components
        if ast:
            stmt = ast.children[-1].children[0]
            cmd = (
                stmt.children[1].data.value
                if stmt.data.value == "assignment"
                else stmt.data.value
            )
            if cmd == "disp":
                for clause in ("attr_clause", "limit_clause", "offset_clause"):
                    if not list(stmt.find_data(clause)):
                        suggestions.append("ATTR")
            elif cmd in ("expression", "find") and not list(
                stmt.find_data("where_clause")
            ):
                suggestions.append("WHERE")
            elif cmd in ("get", "find") and not list(stmt.find_data("timerange")):
                suggestions.append("START")
            elif cmd == "apply" and not list(stmt.find_data("args")):
                suggestions.append("WITH")

            suggestions = [x for x in set(suggestions) if x]
            _p = last_word_prefix
            suggestions = (
                [t[len(_p) :] for t in suggestions if t.startswith(_p)]
                if _p
                else suggestions
            )
            _logger.debug(f"suggestions from optional components: {suggestions}")

    return suggestions


@typechecked
def _end_with_blank_or_comma(s: str) -> bool:
    return s[-1] in [" ", "\t", "\n", "\r", "\f", "\v", ","] if s else True


@typechecked
def _split_last_token(s: str) -> Tuple[str, str]:
    last = ""
    if not _end_with_blank_or_comma(s):
        while not _end_with_blank_or_comma(s):
            last = s[-1] + last
            s = s[:-1]
    return last, s


@typechecked
def _do_complete_timestamp(ts_prefix: str) -> str:
    valid_ts_formats = [
        "%Y",
        "%Y-%m",
        "%Y-%m-%d",
        "%Y-%m-%dT%H",
        "%Y-%m-%dT%H:%M",
        "%Y-%m-%dT%H:%M:%S",
    ]
    matched = ISO_TS_RE.match(ts_prefix)
    if matched:
        for ts_format in valid_ts_formats:
            _logger.debug(f"Match timestamp {ts_prefix} with format {ts_format}")
            try:
                ts = datetime.strptime(matched.group(), ts_format)
            except:
                _logger.debug(f"Timestamp match failed")
            else:
                ts_complete = timefmt(ts)
                _logger.debug(f"Timestamp completed: {ts_complete}")
                break
        else:
            ts_complete = "% TS auto-complete internal error, report to developers %"
            # no need to use KestrelInternalError; not to break huntflow execution
            _logger.debug(
                f"TS auto-complete internal error: `valid_ts_formats` is incomplete"
            )
    else:
        ts_complete = "% illegal ISO 8601 timestamp prefix %"
        _logger.debug(f"illegal ISO 8601 timestamp prefix: {ts_prefix}")
    return ts_complete


@typechecked
def _do_complete_interface(
    last_word_prefix: str,
    schemes: Iterable[str],
    list_names_from_scheme: Callable,
) -> Iterable[str]:
    if last_word_prefix and "://" in last_word_prefix:
        scheme, _ = last_word_prefix.split("://")
        if scheme in schemes:
            names = list_names_from_scheme(scheme)
            paths = [scheme + "://" + name for name in names]
            _logger.debug(f"auto-complete interface {scheme}: {paths}")
            expected_values = paths
    else:
        expected_values = [scheme + "://" for scheme in schemes]
    return expected_values
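The core mechanism in do_complete() is to parse the truncated input with the LALR parser and read the expected terminal names off Lark's UnexpectedToken error (e.accepts or e.expected). A self-contained sketch of that trick with a toy grammar (not Kestrel's real grammar); it assumes, as the code above does, that a prematurely ended line surfaces as an UnexpectedToken at $END with LALR:

import lark

# Toy grammar, loosely shaped like a GET command; not the Kestrel grammar.
GRAMMAR = r"""
start: "GET" NAME "FROM" NAME "WHERE" NAME "=" STRING
NAME: /[a-z_]+/
STRING: /'[^']*'/
%import common.WS
%ignore WS
"""

parser = lark.Lark(GRAMMAR, parser="lalr")


def expected_next(truncated_line: str) -> set:
    """Return the terminal names the parser would accept after the truncated input."""
    try:
        parser.parse(truncated_line)
    except lark.exceptions.UnexpectedToken as e:
        return set(e.accepts or e.expected)
    return set()


print(expected_next("GET process "))          # e.g. {'FROM'}
print(expected_next("GET process FROM ds "))  # e.g. {'WHERE'}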