feat: improve handling of IndexedDB databases and restructure code (#36)
* feat: extend gh action to run on MS Teams 2.0 datasets
* refactor: disable continue-on-error 🪲
* fix: unsupported operand type(s) for +: 'NoneType' and 'int'
* refactor: use WrappedIndexDB instead of FastIndexedDB
* feat: implement basic parser for the object stores
* feat: add option to pass .blob folder
* chore: update path of submodule
* chore: update path to chromedb package
* fix: parse v1 and v2 separately where necessary
* fix: skip unnecessary object stores in parser mode
---------

Co-authored-by: Alexander Bilz <mail@alexbilz.com>
KarelZe and lxndrblz authored Jan 21, 2024
1 parent cf53e38 commit 9db96bf
Showing 8 changed files with 286 additions and 293 deletions.
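At the heart of the change, the hand-rolled FastIndexedDB record reader is replaced by chromedb's WrappedIndexDB wrapper. The sketch below condenses the access pattern that the new parse_db in the backend.py diff further down is built around; the names come from that diff, the placeholder paths are hypothetical, and the exact chromedb behaviour may differ from this simplification.

```python
from pathlib import Path

from chromedb import ccl_chromium_indexeddb

# Placeholder capture paths -- the .blob directory is the optional companion
# folder this commit adds support for (it may be omitted).
leveldb_dir = Path("https_teams.live.com_0.indexeddb.leveldb")
blob_dir = Path("https_teams.live.com_0.indexeddb.blob")

wrapper = ccl_chromium_indexeddb.WrappedIndexDB(leveldb_dir, blob_dir)

for db_info in wrapper.database_ids:
    if db_info.dbid_no is None:  # skip databases without a valid dbid_no
        continue
    db = wrapper[db_info.dbid_no]
    for obj_store_name in db.object_store_names:
        if obj_store_name is None:  # skip empty object stores
            continue
        for record in db[obj_store_name].iterate_records():
            print(obj_store_name, record.key.raw_key, record.value)
```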
20 changes: 5 additions & 15 deletions .github/workflows/build.yaml
@@ -1,7 +1,7 @@
on:
push:
push:
jobs:
build:
build:
name: Build exectuable 📦
runs-on: windows-latest
steps:
@@ -24,21 +24,11 @@ jobs:
run: |
.\dist\ms_teams_parser.exe -f ".\forensicsim-data\jane_doe_v_1_4_00_11161\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "jane_doe_v_1_4_00_11161.json"
.\dist\ms_teams_parser.exe -f ".\forensicsim-data\john_doe_v_1_4_00_11161\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "john_doe_v_1_4_00_11161.json"
- name: Upload results📲
uses: actions/upload-artifact@v4
with:
name: generated-outputs
path: |
jane_doe_v_1_4_00_11161.json
john_doe_v_1_4_00_11161.json
retention-days: 1
- name: Test calling script 📞
.\dist\ms_teams_parser.exe -f ".\forensicsim-data\karelze_v_23306_3309_2530_1346\IndexedDB\https_teams.live.com_0.indexeddb.leveldb" -o "karelze.json"
.\dist\ms_teams_parser.exe -f ".\forensicsim-data\mboufahja_v_23231_413_2355_7555\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "mboufahja.json"
- name: Test calling script 🖱️
run: |
python tools/main.py --help
python tools/dump_leveldb.py --help
python tools/dump_localstorage.py --help
python tools/dump_sessionstorage.py --help
- name: Calculate diff 👽
run: |
git diff --no-index --word-diff .\forensicsim-data\expected-result\jane_doe_v_1_4_00_11161.json jane_doe_v_1_4_00_11161.json
git diff --no-index --word-diff .\forensicsim-data\expected-result\john_doe_v_1_4_00_11161.json john_doe_v_1_4_00_11161.json
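The artifact-upload and `git diff --no-index` verification steps disappear from the workflow in this commit. For local spot checks, something roughly equivalent to the removed comparison can be done in a few lines of Python; this is an illustrative sketch, not part of the commit, and the paths simply mirror the file names used above.

```python
import json
from pathlib import Path

EXPECTED_DIR = Path("forensicsim-data/expected-result")


def matches_expected(name: str) -> bool:
    """Compare a generated JSON output against its expected counterpart."""
    with (EXPECTED_DIR / f"{name}.json").open(encoding="utf-8") as f:
        expected = json.load(f)
    with Path(f"{name}.json").open(encoding="utf-8") as f:
        actual = json.load(f)
    return expected == actual


for dataset in ("jane_doe_v_1_4_00_11161", "john_doe_v_1_4_00_11161"):
    print(dataset, "OK" if matches_expected(dataset) else "differs")
```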
7 changes: 7 additions & 0 deletions .pre-commit-config.yaml
@@ -25,6 +25,13 @@ repos:
- id: debug-statements
- id: end-of-file-fixer
- id: mixed-line-ending
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.8.0
hooks:
- id: mypy
# yaml requires additional stubs.
# Similar to: https://stackoverflow.com/a/73603491/5755604
additional_dependencies: ['types-PyYAML']
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.1.11
hooks:
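The new mirrors-mypy hook runs in an isolated environment, so third-party stub packages have to be declared as additional_dependencies; without `types-PyYAML`, type-checking a module that imports yaml typically fails. A small hypothetical illustration (error text is approximate):

```python
# Without the types-PyYAML stubs, mypy reports something along the lines of:
#   error: Library stubs not installed for "yaml"
# Declaring the stubs in the hook's additional_dependencies resolves this.
import yaml


def load_config(path: str) -> dict:
    """Hypothetical helper; any yaml-importing module triggers the stub requirement."""
    with open(path, encoding="utf-8") as f:
        return yaml.safe_load(f)
```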
19 changes: 17 additions & 2 deletions pyproject.toml
@@ -10,13 +10,13 @@ readme = "README.md"
license = {file = "LICENSE.md"}
requires-python = ">=3.9"
authors = [
{ name = "Alexander Bilz", email = "github@markusbilz.com" },
{ name = "Alexander Bilz", email = "github@alexbilz.com" },
{ name = "Markus Bilz", email = "github@markusbilz.com" }
]
dependencies = [
"beautifulsoup4",
"click",
"chromedb @ git+https://github.com/karelze/ccl_chrome_indexeddb@master",
"chromedb @ git+https://github.com/lxndrblz/chromedb@main",
"dataclasses-json",
"pause",
"pyautogui",
@@ -36,10 +36,24 @@ version = {attr = "forensicsim.__version__"}
dev=[
"build",
"pre-commit",
"mypy",
"ruff",
"tox",
]

[tool.mypy]

python_version = "3.9"

exclude = [
"tools/**.py"
]

# https://github.com/python/mypy/issues/2410
ignore_missing_imports = true
disallow_untyped_defs = true
disallow_untyped_calls = true
disallow_incomplete_defs = true

[tool.ruff]

@@ -105,6 +119,7 @@ commands =
# Syntax Checks
[testenv:lint]
commands =
python -m mypy src/forensicsim/backend.py
python -m ruff --output-format=github src
python -m ruff format src --check
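With `disallow_untyped_defs`, `disallow_untyped_calls`, and `disallow_incomplete_defs` enabled, every function mypy checks in `src/forensicsim/backend.py` has to carry full annotations — which is why the signatures in the backend.py diff below gain `Path`, `list[dict[str, Any]]`, and `-> None` annotations. Locally, the same check runs via `tox -e lint` or `python -m mypy src/forensicsim/backend.py`. A tiny hypothetical illustration of what the strict flags reject and accept (error text is approximate):

```python
from pathlib import Path


def read_raw(path):  # rejected: "Function is missing a type annotation"
    return path.read_bytes()


def read_raw_typed(path: Path) -> bytes:  # accepted: fully annotated
    return path.read_bytes()
```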
242 changes: 41 additions & 201 deletions src/forensicsim/backend.py
@@ -22,26 +22,20 @@
SOFTWARE.
"""

import io
import json
import os
from pathlib import Path
from typing import Any, Optional

from chromedb import (
ccl_blink_value_deserializer,
ccl_chromium_indexeddb,
ccl_chromium_localstorage,
ccl_chromium_sessionstorage,
ccl_leveldb,
ccl_v8_value_deserializer,
)
from chromedb.ccl_chromium_indexeddb import (
DatabaseMetadataType,
ObjectStoreMetadataType,
)

TEAMS_DB_OBJECT_STORES = ["replychains", "conversations", "people", "buddylist"]

ENCODING = "iso-8859-1"

"""
The following code is heavily adopted from the RawLevelDb and IndexedDB processing proposed by CCL Group
@@ -53,197 +47,52 @@
"""


class FastIndexedDB:
def __init__(self, leveldb_dir: os.PathLike):
self._db = ccl_leveldb.RawLevelDb(leveldb_dir)
self._fetched_records = []
self.global_metadata = None
self.database_metadata = None
self.object_store_meta = None
self.fetch_data()

def fetch_data(self):
global_metadata_raw = {}

database_metadata_raw = {}
objectstore_metadata_raw = {}

self._fetched_records = []
# Fetch the records only once
for record in self._db.iterate_records_raw():
self._fetched_records.append(record)

for record in self._fetched_records:
# Global Metadata
if (
record.key.startswith(b"\x00\x00\x00\x00")
and record.state == ccl_leveldb.KeyState.Live
) and (
record.key not in global_metadata_raw
or global_metadata_raw[record.key].seq < record.seq
):
global_metadata_raw[record.key] = record

# Convert the raw metadata to a nice GlobalMetadata Object
global_metadata = ccl_chromium_indexeddb.GlobalMetadata(global_metadata_raw)

# Loop through the database IDs
for db_id in global_metadata.db_ids:
if db_id.dbid_no == None:
continue

if db_id.dbid_no > 0x7F:
raise NotImplementedError(
"there could be this many dbs, but I don't support it yet"
)

# Database keys end with 0
prefix_database = bytes([0, db_id.dbid_no, 0, 0])

# Objetstore keys end with 50
prefix_objectstore = bytes([0, db_id.dbid_no, 0, 0, 50])

for record in reversed(self._fetched_records):
if (
record.key.startswith(prefix_database)
and record.state == ccl_leveldb.KeyState.Live
):
# we only want live keys and the newest version thereof (highest seq)
meta_type = record.key[len(prefix_database)]
old_version = database_metadata_raw.get((db_id.dbid_no, meta_type))
if old_version is None or old_version.seq < record.seq:
database_metadata_raw[(db_id.dbid_no, meta_type)] = record
if (
record.key.startswith(prefix_objectstore)
and record.state == ccl_leveldb.KeyState.Live
):
# we only want live keys and the newest version thereof (highest seq)
try:
(
objstore_id,
varint_raw,
) = ccl_chromium_indexeddb.custom_le_varint_from_bytes(
record.key[len(prefix_objectstore) :]
)
except TypeError:
continue

meta_type = record.key[len(prefix_objectstore) + len(varint_raw)]

old_version = objectstore_metadata_raw.get((
db_id.dbid_no,
objstore_id,
meta_type,
))
def parse_db(
filepath: Path,
blobpath: Optional[Path] = None,
do_not_filter: Optional[bool] = False,
) -> list[dict[str, Any]]:
# Open raw access to a LevelDB and deserialize the records.

if old_version is None or old_version.seq < record.seq:
objectstore_metadata_raw[
(db_id.dbid_no, objstore_id, meta_type)
] = record
wrapper = ccl_chromium_indexeddb.WrappedIndexDB(filepath, blobpath)

self.global_metadata = global_metadata
self.database_metadata = ccl_chromium_indexeddb.DatabaseMetadata(
database_metadata_raw
)
self.object_store_meta = ccl_chromium_indexeddb.ObjectStoreMetadata(
objectstore_metadata_raw
)
extracted_values = []

def get_database_metadata(self, db_id: int, meta_type: DatabaseMetadataType):
return self.database_metadata.get_meta(db_id, meta_type)
for db_info in wrapper.database_ids:
# Skip databases without a valid dbid_no
if db_info.dbid_no is None:
continue

def get_object_store_metadata(
self, db_id: int, obj_store_id: int, meta_type: ObjectStoreMetadataType
):
return self.object_store_meta.get_meta(db_id, obj_store_id, meta_type)
db = wrapper[db_info.dbid_no]

def iterate_records(self, do_not_filter=False):
blink_deserializer = ccl_blink_value_deserializer.BlinkV8Deserializer()
# Loop through the databases and object stores based on their ids
for global_id in self.global_metadata.db_ids:
# print(f"Processing database: {global_id.name}")
if global_id.dbid_no == None:
print(f"WARNING: Skipping database {global_id.name}")
for obj_store_name in db.object_store_names:
# Skip empty object stores
if obj_store_name is None:
continue

for object_store_id in range(
1,
self.database_metadata.get_meta(
global_id.dbid_no, DatabaseMetadataType.MaximumObjectStoreId
)
+ 1,
):
datastore = self.object_store_meta.get_meta(
global_id.dbid_no,
object_store_id,
ObjectStoreMetadataType.StoreName,
)

# print(f"\t Processing object store: {datastore}")
if obj_store_name in TEAMS_DB_OBJECT_STORES or do_not_filter is False:
obj_store = db[obj_store_name]
records_per_object_store = 0
if datastore in TEAMS_DB_OBJECT_STORES or do_not_filter:
prefix = bytes([0, global_id.dbid_no, object_store_id, 1])
for record in self._fetched_records:
if record.key.startswith(prefix):
records_per_object_store += 1
# Skip records with empty values as these cant properly decoded
if record.value == b"":
continue
(
_value_version,
varint_raw,
) = ccl_chromium_indexeddb.custom_le_varint_from_bytes(
record.value
)
val_idx = len(varint_raw)
# read the blink envelope
blink_type_tag = record.value[val_idx]
if blink_type_tag != 0xFF:
print("Blink type tag not present")
val_idx += 1

(
_,
varint_raw,
) = ccl_chromium_indexeddb.custom_le_varint_from_bytes(
record.value[val_idx:]
)

val_idx += len(varint_raw)

# read the raw value of the record.
obj_raw = io.BytesIO(record.value[val_idx:])
try:
# Initialize deserializer and try deserialization.
deserializer = ccl_v8_value_deserializer.Deserializer(
obj_raw,
host_object_delegate=blink_deserializer.read,
)
value = deserializer.read()
yield {
"key": record.key,
"value": value,
"origin_file": record.origin_file,
"store": datastore,
"state": record.state,
"seq": record.seq,
}
except Exception:
# TODO Some proper error handling wouldn't hurt
continue
# print(f"{datastore} {global_id.name} {records_per_object_store}")


def parse_db(filepath, do_not_filter=False):
# Open raw access to a LevelDB and deserialize the records.
db = FastIndexedDB(filepath)
extracted_values = []
for record in db.iterate_records(do_not_filter):
extracted_values.append(record)
for record in obj_store.iterate_records():
records_per_object_store += 1
sourcefile = str(filepath)
# TODO: Fix None values
state = None
seq = None
extracted_values.append({
"key": record.key.raw_key,
"value": record.value,
"origin_file": sourcefile,
"store": obj_store_name,
"state": state,
"seq": seq,
})
print(
f"{obj_store_name} {db.name} (Records: {records_per_object_store})"
)
return extracted_values


def parse_localstorage(filepath):
def parse_localstorage(filepath: Path) -> list[dict[str, Any]]:
local_store = ccl_chromium_localstorage.LocalStoreDb(filepath)
extracted_values = []
for record in local_store.iter_all_records():
@@ -254,7 +103,7 @@ def parse_localstorage(filepath):
return extracted_values


def parse_sessionstorage(filepath):
def parse_sessionstorage(filepath: Path) -> list[dict[str, Any]]:
session_storage = ccl_chromium_sessionstorage.SessionStoreDb(filepath)
extracted_values = []
for host in session_storage:
@@ -275,7 +124,7 @@ def parse_sessionstorage(filepath):
return extracted_values


def write_results_to_json(data, outputpath):
def write_results_to_json(data: list[dict[str, Any]], outputpath: Path) -> None:
# Dump messages into a json file
try:
with open(outputpath, "w", encoding="utf-8") as f:
@@ -284,12 +133,3 @@ def write_results_to_json(data, outputpath):
)
except OSError as e:
print(e)


def parse_json():
# read data from a file. This is only for testing purpose.
try:
with Path("teams.json").open() as json_file:
return json.load(json_file)
except OSError as e:
print(e)
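After the restructuring, the module's public surface is essentially parse_db, parse_localstorage, parse_sessionstorage, and write_results_to_json. A hedged usage sketch for the IndexedDB path — placeholder locations, with the optional companion `.blob` directory passed alongside the `.leveldb` folder:

```python
from pathlib import Path

from forensicsim.backend import parse_db, write_results_to_json

# Placeholder acquisition paths -- substitute a real capture.
leveldb_dir = Path("IndexedDB/https_teams.live.com_0.indexeddb.leveldb")
blob_dir = Path("IndexedDB/https_teams.live.com_0.indexeddb.blob")  # optional

records = parse_db(leveldb_dir, blob_dir, do_not_filter=False)
write_results_to_json(records, Path("teams_records.json"))
```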