From 897dd8e8942b4884de7a81cb75e073a9424c9980 Mon Sep 17 00:00:00 2001 From: Markus Bilz Date: Mon, 8 Jan 2024 14:34:58 +0100 Subject: [PATCH 01/25] feat: extend gh action to run on MS Teams 2.0 datasets --- .github/workflows/build.yaml | 5 +++-- .gitignore | 2 ++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index f351794..0b709ea 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -20,8 +20,9 @@ jobs: run: pyinstaller "main.spec" - name: Run conversion ↩️ run: | - .\dist\ms_teams_parser.exe -f ".\forensicsim-data\jane_doe_old_teams\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "jane_doe.json" - .\dist\ms_teams_parser.exe -f ".\forensicsim-data\john_doe_old_teams\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "john_doe.json" + .\dist\ms_teams_parser.exe -f ".\forensicsim-data\jane_doe_old_teams\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "jane_doe_old_teams.json" + .\dist\ms_teams_parser.exe -f ".\forensicsim-data\john_doe_old_teams\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "john_doe_old_teams.json" + .\dist\ms_teams_parser.exe -f ".\forensicsim-data\mboufahja_new_teams\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "mboufahja_new_teams.json" - name: Test calling script 📞 run: | python utils/dump_leveldb.py --help python utils/dump_localstorage.py --help python utils/dump_sessionstorage.py --help diff --git a/.gitignore b/.gitignore index 34dd43e..efe303c 100644 --- a/.gitignore +++ b/.gitignore @@ -129,3 +129,5 @@ dmypy.json venv/ env/ + +testing/ From d4adb8d58e87baff96506e0edccde5c48e95570b Mon Sep 17 00:00:00 2001 From: Markus Bilz Date: Mon, 8 Jan 2024 17:46:43 +0100 Subject: [PATCH 02/25] feat: add new datasets for teams v2 --- .github/workflows/build.yaml | 63 +++++++++++++++++++----------------- forensicsim-data | 2 +- 2 files changed, 34 insertions(+), 31 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 0b709ea..91c220b 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -1,35 +1,38 @@ on: - push: + push: jobs: - build: - name: Build exectuable 📦 - runs-on: windows-latest - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: "3.9" - - name: Install dependencies - run: | - python -m pip install --upgrade pip - python -m pip install -r requirements.txt pyinstaller - - name: Build binary 🔒 - run: pyinstaller "main.spec" - - name: Run conversion ↩️ - run: | - .\dist\ms_teams_parser.exe -f ".\forensicsim-data\jane_doe_old_teams\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "jane_doe_old_teams.json" - .\dist\ms_teams_parser.exe -f ".\forensicsim-data\john_doe_old_teams\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "john_doe_old_teams.json" - .\dist\ms_teams_parser.exe -f ".\forensicsim-data\mboufahja_new_teams\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "mboufahja_new_teams.json" - - name: Test calling script 📞 - run: | - python utils/dump_leveldb.py --help - python utils/dump_localstorage.py --help - python utils/dump_sessionstorage.py --help + build: + name: Build exectuable 📦 + runs-on: windows-latest + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.9" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install -r 
requirements.txt pyinstaller + - name: Build binary 🔒 + run: pyinstaller "main.spec" + - name: Run conversion ↩️ + run: | + .\dist\ms_teams_parser.exe -f ".\forensicsim-data\jane_doe_v_1_4_00_11161\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "jane_doe.json" + .\dist\ms_teams_parser.exe -f ".\forensicsim-data\john_doe_v_1_4_00_11161\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "john_doe.json" + .\dist\ms_teams_parser.exe -f ".\forensicsim-data\karelze_v_23306_3309_2530_1346\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "john_doe.json" + .\dist\ms_teams_parser.exe -f ".\forensicsim-data\mboufahja_v_23231_413_2355_7555\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "john_doe.json" + continue-on-error: true + - name: Test calling script 📞 + run: | + python utils/dump_leveldb.py --help + python utils/dump_localstorage.py --help + python utils/dump_sessionstorage.py --help + continue-on-error: true # python utils/populate_teams.py --help # python utils/populate_teams_2.py --help # python utils/populate_skype.py --help - # - name: Calculate diff 👽 - # run: git diff --no-index --word-diff expected_output/john_doe.json current_output.json + # - name: Calculate diff 👽 + # run: git diff --no-index --word-diff expected_output/john_doe.json current_output.json diff --git a/forensicsim-data b/forensicsim-data index 68a144b..36e337f 160000 --- a/forensicsim-data +++ b/forensicsim-data @@ -1 +1 @@ -Subproject commit 68a144b3406fa3a532eee24b9181bb72a8ad691d +Subproject commit 36e337f7f0f05cd358f67b116621cb4ca995184b From f28295ee6da11973f3078edef0fba872a053c442 Mon Sep 17 00:00:00 2001 From: Markus Bilz Date: Mon, 8 Jan 2024 17:51:00 +0100 Subject: [PATCH 03/25] =?UTF-8?q?refactor:=20disable=20continue-on-error?= =?UTF-8?q?=20=F0=9F=AA=B2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/build.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 91c220b..8fc6af8 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -24,13 +24,13 @@ jobs: .\dist\ms_teams_parser.exe -f ".\forensicsim-data\john_doe_v_1_4_00_11161\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "john_doe.json" .\dist\ms_teams_parser.exe -f ".\forensicsim-data\karelze_v_23306_3309_2530_1346\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "john_doe.json" .\dist\ms_teams_parser.exe -f ".\forensicsim-data\mboufahja_v_23231_413_2355_7555\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "john_doe.json" - continue-on-error: true + continue-on-error: false - name: Test calling script 📞 run: | python utils/dump_leveldb.py --help python utils/dump_localstorage.py --help python utils/dump_sessionstorage.py --help - continue-on-error: true + continue-on-error: false # python utils/populate_teams.py --help # python utils/populate_teams_2.py --help # python utils/populate_skype.py --help From cfff8f3371f98d41c4ed27f7c4dc7479c21972b3 Mon Sep 17 00:00:00 2001 From: Markus Bilz Date: Mon, 8 Jan 2024 17:55:00 +0100 Subject: [PATCH 04/25] Feat: fix paths of datasets in build job --- .github/workflows/build.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 8fc6af8..e05f08a 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -22,8 +22,8 @@ jobs: run: | 
.\dist\ms_teams_parser.exe -f ".\forensicsim-data\jane_doe_v_1_4_00_11161\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "jane_doe.json" .\dist\ms_teams_parser.exe -f ".\forensicsim-data\john_doe_v_1_4_00_11161\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "john_doe.json" - .\dist\ms_teams_parser.exe -f ".\forensicsim-data\karelze_v_23306_3309_2530_1346\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "john_doe.json" - .\dist\ms_teams_parser.exe -f ".\forensicsim-data\mboufahja_v_23231_413_2355_7555\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "john_doe.json" + .\dist\ms_teams_parser.exe -f ".\forensicsim-data\karelze_v_23306_3309_2530_1346\IndexedDB\https_teams.live.com_0.indexeddb.leveldb" -o "karelze.json" + .\dist\ms_teams_parser.exe -f ".\forensicsim-data\mboufahja_v_23231_413_2355_7555\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "mboufahja.json" continue-on-error: false - name: Test calling script πŸ“ž run: | From 18e69993220a2ec97b0e1f1d8cfc9e035d633848 Mon Sep 17 00:00:00 2001 From: Markus Bilz Date: Tue, 9 Jan 2024 10:48:50 +0100 Subject: [PATCH 05/25] feat: add mypy for type checking --- .pre-commit-config.yaml | 7 +++++++ pyproject.toml | 15 +++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 88fb5be..c6b876e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -14,6 +14,13 @@ repos: - id: debug-statements - id: end-of-file-fixer - id: mixed-line-ending + - repo: https://github.com/pre-commit/mirrors-mypy + rev: v1.8.0 + hooks: + - id: mypy + # yaml requires additional stubs. + # Similar to: https://stackoverflow.com/a/73603491/5755604 + additional_dependencies: ['types-PyYAML'] - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.1.11 hooks: diff --git a/pyproject.toml b/pyproject.toml index 31e963c..d8a30d2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,10 +35,24 @@ version = {attr = "forensicsim.__version__"} dev=[ "build", "pre-commit", + "mypy", "ruff", "tox", ] +[tool.mypy] + +python_version = "3.9" + +exclude = [ + "tools/" +] + +# https://github.com/python/mypy/issues/2410 +ignore_missing_imports = true +disallow_untyped_defs = true +disallow_untyped_calls = true +disallow_incomplete_defs = true [tool.ruff] @@ -103,6 +117,7 @@ commands = # Syntax Checks [testenv:lint] commands = + python -m mypy src/ python -m ruff --output-format=github src python -m ruff format src --check From ecfb39f9a4906226d815b37f234c7448cfe4d12b Mon Sep 17 00:00:00 2001 From: Markus Bilz Date: Tue, 9 Jan 2024 12:11:50 +0100 Subject: [PATCH 06/25] refactor: add typehints for mypy to backend.py --- pyproject.toml | 2 +- src/forensicsim/backend.py | 85 ++++++++++++++++++-------------------- 2 files changed, 41 insertions(+), 46 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d8a30d2..5c3b452 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -117,7 +117,7 @@ commands = # Syntax Checks [testenv:lint] commands = - python -m mypy src/ + python -m mypy src/backend.py python -m ruff --output-format=github src python -m ruff format src --check diff --git a/src/forensicsim/backend.py b/src/forensicsim/backend.py index 06ec4ff..7260792 100644 --- a/src/forensicsim/backend.py +++ b/src/forensicsim/backend.py @@ -24,7 +24,9 @@ import io import json -import os +from collections.abc import Iterator +from pathlib import Path +from typing import Any, Optional, Union from chromedb import ( ccl_blink_value_deserializer, @@ -35,7 
+37,10 @@ ccl_v8_value_deserializer, ) from chromedb.ccl_chromium_indexeddb import ( + DatabaseMetadata, DatabaseMetadataType, + GlobalMetadata, + ObjectStoreMetadata, ObjectStoreMetadataType, ) @@ -53,21 +58,21 @@ class FastIndexedDB: - def __init__(self, leveldb_dir: os.PathLike): + def __init__(self, leveldb_dir: Path): self._db = ccl_leveldb.RawLevelDb(leveldb_dir) - self._fetched_records = [] - self.global_metadata = None - self.database_metadata = None - self.object_store_meta = None + self._fetched_records: list[ccl_leveldb.Record] = [] + self.global_metadata: GlobalMetadata + self.database_metadata: DatabaseMetadata + self.object_store_meta: ObjectStoreMetadata self.fetch_data() - def fetch_data(self): - global_metadata_raw = {} + def fetch_data(self) -> None: + global_metadata_raw: dict[bytes, ccl_leveldb.Records] = {} + database_metadata_raw: dict[tuple, Any] = {} + objectstore_metadata_raw: dict[tuple, Any] = {} - database_metadata_raw = {} - objectstore_metadata_raw = {} + self._fetched_records.clear() - self._fetched_records = [] # Fetch the records only once for record in self._db.iterate_records_raw(): self._fetched_records.append(record) @@ -148,15 +153,21 @@ def fetch_data(self): objectstore_metadata_raw ) - def get_database_metadata(self, db_id: int, meta_type: DatabaseMetadataType): - return self.database_metadata.get_meta(db_id, meta_type) + def get_database_metadata( + self, db_id: int, meta_type: DatabaseMetadataType + ) -> Optional[Union[str, int]]: + if self.database_metadata: + return self.database_metadata.get_meta(db_id, meta_type) + return None def get_object_store_metadata( self, db_id: int, obj_store_id: int, meta_type: ObjectStoreMetadataType - ): - return self.object_store_meta.get_meta(db_id, obj_store_id, meta_type) + ) -> Optional[Any]: + if self.object_store_meta: + return self.object_store_meta.get_meta(db_id, obj_store_id, meta_type) + return None - def iterate_records(self, do_not_filter=False): + def __iter__(self) -> Iterator[dict[str, Any]]: blink_deserializer = ccl_blink_value_deserializer.BlinkV8Deserializer() # Loop through the databases and object stores based on their ids for global_id in self.global_metadata.db_ids: @@ -179,12 +190,10 @@ def iterate_records(self, do_not_filter=False): ) # print(f"\t Processing object store: {datastore}") - records_per_object_store = 0 - if datastore in TEAMS_DB_OBJECT_STORES or do_not_filter: + if datastore in TEAMS_DB_OBJECT_STORES: prefix = bytes([0, global_id.dbid_no, object_store_id, 1]) for record in self._fetched_records: if record.key.startswith(prefix): - records_per_object_store += 1 # Skip records with empty values as these cant properly decoded if record.value == b"": continue @@ -202,7 +211,7 @@ def iterate_records(self, do_not_filter=False): val_idx += 1 ( - _blink_version, + _, varint_raw, ) = ccl_chromium_indexeddb.le_varint_from_bytes( record.value[val_idx:] @@ -230,19 +239,14 @@ def iterate_records(self, do_not_filter=False): except Exception: # TODO Some proper error handling wouldn't hurt continue - # print(f"{datastore} {global_id.name} {records_per_object_store}") -def parse_db(filepath, do_not_filter=False): - # Open raw access to a LevelDB and deserialize the records. 
+def parse_db(filepath: Path) -> list[dict]: db = FastIndexedDB(filepath) - extracted_values = [] - for record in db.iterate_records(do_not_filter): - extracted_values.append(record) - return extracted_values + return list(db) -def parse_localstorage(filepath): +def parse_localstorage(filepath: Path) -> list[dict]: local_store = ccl_chromium_localstorage.LocalStoreDb(filepath) extracted_values = [] for record in local_store.iter_all_records(): @@ -253,7 +257,7 @@ def parse_localstorage(filepath): return extracted_values -def parse_sessionstorage(filepath): +def parse_sessionstorage(filepath: Path) -> list[dict]: session_storage = ccl_chromium_sessionstorage.SessionStoreDb(filepath) extracted_values = [] for host in session_storage: @@ -274,22 +278,13 @@ def parse_sessionstorage(filepath): return extracted_values -def write_results_to_json(data, outputpath): - # Dump messages into a json file - try: - with open(outputpath, "w", encoding="utf-8") as f: - json.dump( - data, f, indent=4, sort_keys=True, default=str, ensure_ascii=False - ) - except OSError as e: - print(e) +def write_results_to_json(data: object, outputpath: Path) -> None: + with outputpath.open("w", encoding="utf-8") as f: + json.dump(data, f, indent=4, sort_keys=True, default=str, ensure_ascii=False) -def parse_json(): +def parse_json() -> Any: # read data from a file. This is only for testing purpose. - try: - with open("teams.json") as json_file: - data = json.load(json_file) - return data - except OSError as e: - print(e) + with Path("teams.json").open() as json_file: + data = json.load(json_file) + return data From 6837e062702cac83ce5dd0d2fafd816e3270d156 Mon Sep 17 00:00:00 2001 From: Markus Bilz Date: Tue, 9 Jan 2024 12:16:06 +0100 Subject: [PATCH 07/25] feat: fix some typos in linting --- pyproject.toml | 2 +- src/forensicsim/backend.py | 8 ++------ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 5c3b452..0c60eaa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -117,7 +117,7 @@ commands = # Syntax Checks [testenv:lint] commands = - python -m mypy src/backend.py + python -m mypy src/forensicsim/backend.py python -m ruff --output-format=github src python -m ruff format src --check diff --git a/src/forensicsim/backend.py b/src/forensicsim/backend.py index 7260792..dc2fccf 100644 --- a/src/forensicsim/backend.py +++ b/src/forensicsim/backend.py @@ -156,16 +156,12 @@ def fetch_data(self) -> None: def get_database_metadata( self, db_id: int, meta_type: DatabaseMetadataType ) -> Optional[Union[str, int]]: - if self.database_metadata: - return self.database_metadata.get_meta(db_id, meta_type) - return None + return self.database_metadata.get_meta(db_id, meta_type) def get_object_store_metadata( self, db_id: int, obj_store_id: int, meta_type: ObjectStoreMetadataType ) -> Optional[Any]: - if self.object_store_meta: - return self.object_store_meta.get_meta(db_id, obj_store_id, meta_type) - return None + return self.object_store_meta.get_meta(db_id, obj_store_id, meta_type) def __iter__(self) -> Iterator[dict[str, Any]]: blink_deserializer = ccl_blink_value_deserializer.BlinkV8Deserializer() From 87245125474cf06082741a318b16dffe3bb53dc0 Mon Sep 17 00:00:00 2001 From: Markus Bilz Date: Tue, 9 Jan 2024 12:24:42 +0100 Subject: [PATCH 08/25] fix: unsupported operand type(s) for +: 'NoneType' and 'int' --- pyproject.toml | 2 +- src/forensicsim/backend.py | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 
0c60eaa..d770e2d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,7 @@ dependencies = [ "chromedb @ git+https://github.com/karelze/ccl_chrome_indexeddb@master", "pause~=0.3", "pyautogui~=0.9.54", -"pywinauto~=0.6.8" +#"pywinauto~=0.6.8" ] dynamic = ["version"] diff --git a/src/forensicsim/backend.py b/src/forensicsim/backend.py index dc2fccf..8536bca 100644 --- a/src/forensicsim/backend.py +++ b/src/forensicsim/backend.py @@ -172,12 +172,14 @@ def __iter__(self) -> Iterator[dict[str, Any]]: print(f"WARNING: Skipping database {global_id.name}") continue + max_object_stores = self.database_metadata.get_meta( + global_id.dbid_no, DatabaseMetadataType.MaximumObjectStoreId + ) + max_object_stores = max_object_stores if not max_object_stores else 0 + for object_store_id in range( 1, - self.database_metadata.get_meta( - global_id.dbid_no, DatabaseMetadataType.MaximumObjectStoreId - ) - + 1, + max_object_stores + 1, ): datastore = self.object_store_meta.get_meta( global_id.dbid_no, From b2c2a4a8fcff8a2b9bd276529e557894d2d753d7 Mon Sep 17 00:00:00 2001 From: Markus Bilz Date: Tue, 9 Jan 2024 12:30:14 +0100 Subject: [PATCH 09/25] feat: fix typo in 'backend.py' --- src/forensicsim/backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/forensicsim/backend.py b/src/forensicsim/backend.py index 8536bca..c95606e 100644 --- a/src/forensicsim/backend.py +++ b/src/forensicsim/backend.py @@ -175,7 +175,7 @@ def __iter__(self) -> Iterator[dict[str, Any]]: max_object_stores = self.database_metadata.get_meta( global_id.dbid_no, DatabaseMetadataType.MaximumObjectStoreId ) - max_object_stores = max_object_stores if not max_object_stores else 0 + max_object_stores = max_object_stores if max_object_stores else 0 for object_store_id in range( 1, From 48c2b89ab61febe08d8e7dc7d5134cfc58c7cd0e Mon Sep 17 00:00:00 2001 From: Markus Bilz Date: Tue, 16 Jan 2024 05:32:08 +0100 Subject: [PATCH 10/25] refactor: match api of updated chromedb --- pyproject.toml | 2 +- src/forensicsim/backend.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0f9bfb0..802acd5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ dependencies = [ "dataclasses-json", "pause", "pyautogui", -"pywinauto" +# "pywinauto" ] dynamic = ["version"] diff --git a/src/forensicsim/backend.py b/src/forensicsim/backend.py index 95afd6e..7c2b6b0 100644 --- a/src/forensicsim/backend.py +++ b/src/forensicsim/backend.py @@ -126,7 +126,7 @@ def fetch_data(self) -> None: ( objstore_id, varint_raw, - ) = ccl_chromium_indexeddb.custom_le_varint_from_bytes( + ) = ccl_chromium_indexeddb.le_varint_from_bytes( record.key[len(prefix_objectstore) :] ) except TypeError: @@ -198,7 +198,7 @@ def __iter__(self) -> Iterator[dict[str, Any]]: ( _value_version, varint_raw, - ) = ccl_chromium_indexeddb.custom_le_varint_from_bytes( + ) = ccl_chromium_indexeddb.le_varint_from_bytes( record.value ) val_idx = len(varint_raw) @@ -211,7 +211,7 @@ def __iter__(self) -> Iterator[dict[str, Any]]: ( _, varint_raw, - ) = ccl_chromium_indexeddb.custom_le_varint_from_bytes( + ) = ccl_chromium_indexeddb.le_varint_from_bytes( record.value[val_idx:] ) From fab391399b4d7ab60dcb58a5648baab41fe121b8 Mon Sep 17 00:00:00 2001 From: Markus Bilz Date: Tue, 16 Jan 2024 05:44:27 +0100 Subject: [PATCH 11/25] feat: fix ruff linting --- .github/workflows/build.yaml | 2 +- pyproject.toml | 2 +- src/forensicsim/backend.py | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git 
a/.github/workflows/build.yaml b/.github/workflows/build.yaml index d37b384..cc9b8f1 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -41,6 +41,6 @@ jobs: python tools/dump_localstorage.py --help python tools/dump_sessionstorage.py --help - name: Calculate diff πŸ‘½ - run: | + run: | git diff --no-index --word-diff .\forensicsim-data\expected-result\jane_doe_v_1_4_00_11161.json jane_doe_v_1_4_00_11161.json git diff --no-index --word-diff .\forensicsim-data\expected-result\john_doe_v_1_4_00_11161.json john_doe_v_1_4_00_11161.json diff --git a/pyproject.toml b/pyproject.toml index 802acd5..ef39a50 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,7 @@ dev=[ python_version = "3.9" exclude = [ - "tools/" + "tools/**.py" ] # https://github.com/python/mypy/issues/2410 diff --git a/src/forensicsim/backend.py b/src/forensicsim/backend.py index 7c2b6b0..002dfd3 100644 --- a/src/forensicsim/backend.py +++ b/src/forensicsim/backend.py @@ -284,5 +284,4 @@ def write_results_to_json(data: object, outputpath: Path) -> None: def parse_json() -> Any: # read data from a file. This is only for testing purpose. with Path("teams.json").open() as json_file: - data = json.load(json_file) - return data + return json.load(json_file) From 36c7fa5d76280418a76b8721489431034e430ee7 Mon Sep 17 00:00:00 2001 From: Alexander Bilz Date: Tue, 16 Jan 2024 17:35:09 +0100 Subject: [PATCH 12/25] refactor: use WrappedIndexDB instead of --- pyproject.toml | 2 +- src/forensicsim/backend.py | 245 +++++++------------------------------ 2 files changed, 45 insertions(+), 202 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ef39a50..0361102 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ dependencies = [ "dataclasses-json", "pause", "pyautogui", -# "pywinauto" +"pywinauto" ] dynamic = ["version"] diff --git a/src/forensicsim/backend.py b/src/forensicsim/backend.py index 002dfd3..9df24d2 100644 --- a/src/forensicsim/backend.py +++ b/src/forensicsim/backend.py @@ -24,28 +24,19 @@ import io import json -from collections.abc import Iterator +import os from pathlib import Path -from typing import Any, Optional, Union from chromedb import ( - ccl_blink_value_deserializer, ccl_chromium_indexeddb, ccl_chromium_localstorage, ccl_chromium_sessionstorage, - ccl_leveldb, - ccl_v8_value_deserializer, -) -from chromedb.ccl_chromium_indexeddb import ( - DatabaseMetadata, - DatabaseMetadataType, - GlobalMetadata, - ObjectStoreMetadata, - ObjectStoreMetadataType, ) TEAMS_DB_OBJECT_STORES = ["replychains", "conversations", "people", "buddylist"] +ENCODING = "iso-8859-1" + """ The following code is heavily adopted from the RawLevelDb and IndexedDB processing proposed by CCL Group @@ -56,195 +47,38 @@ Additionally, it has a flag to filter for datastores, which are interesting for us. """ +def parse_db(filepath, do_not_filter=False): + # Open raw access to a LevelDB and deserialize the records. 
+ wrapper = ccl_chromium_indexeddb.WrappedIndexDB(filepath) -class FastIndexedDB: - def __init__(self, leveldb_dir: Path): - self._db = ccl_leveldb.RawLevelDb(leveldb_dir) - self._fetched_records: list[ccl_leveldb.Record] = [] - self.global_metadata: GlobalMetadata - self.database_metadata: DatabaseMetadata - self.object_store_meta: ObjectStoreMetadata - self.fetch_data() - - def fetch_data(self) -> None: - global_metadata_raw: dict[bytes, ccl_leveldb.Records] = {} - database_metadata_raw: dict[tuple, Any] = {} - objectstore_metadata_raw: dict[tuple, Any] = {} - - self._fetched_records.clear() - - # Fetch the records only once - for record in self._db.iterate_records_raw(): - self._fetched_records.append(record) - - for record in self._fetched_records: - # Global Metadata - if ( - record.key.startswith(b"\x00\x00\x00\x00") - and record.state == ccl_leveldb.KeyState.Live - ) and ( - record.key not in global_metadata_raw - or global_metadata_raw[record.key].seq < record.seq - ): - global_metadata_raw[record.key] = record - - # Convert the raw metadata to a nice GlobalMetadata Object - global_metadata = ccl_chromium_indexeddb.GlobalMetadata(global_metadata_raw) - - # Loop through the database IDs - for db_id in global_metadata.db_ids: - if db_id.dbid_no == None: - continue - - if db_id.dbid_no > 0x7F: - raise NotImplementedError( - "there could be this many dbs, but I don't support it yet" - ) - - # Database keys end with 0 - prefix_database = bytes([0, db_id.dbid_no, 0, 0]) - - # Objetstore keys end with 50 - prefix_objectstore = bytes([0, db_id.dbid_no, 0, 0, 50]) - - for record in reversed(self._fetched_records): - if ( - record.key.startswith(prefix_database) - and record.state == ccl_leveldb.KeyState.Live - ): - # we only want live keys and the newest version thereof (highest seq) - meta_type = record.key[len(prefix_database)] - old_version = database_metadata_raw.get((db_id.dbid_no, meta_type)) - if old_version is None or old_version.seq < record.seq: - database_metadata_raw[(db_id.dbid_no, meta_type)] = record - if ( - record.key.startswith(prefix_objectstore) - and record.state == ccl_leveldb.KeyState.Live - ): - # we only want live keys and the newest version thereof (highest seq) - try: - ( - objstore_id, - varint_raw, - ) = ccl_chromium_indexeddb.le_varint_from_bytes( - record.key[len(prefix_objectstore) :] - ) - except TypeError: - continue - - meta_type = record.key[len(prefix_objectstore) + len(varint_raw)] - - old_version = objectstore_metadata_raw.get(( - db_id.dbid_no, - objstore_id, - meta_type, - )) - - if old_version is None or old_version.seq < record.seq: - objectstore_metadata_raw[ - (db_id.dbid_no, objstore_id, meta_type) - ] = record - - self.global_metadata = global_metadata - self.database_metadata = ccl_chromium_indexeddb.DatabaseMetadata( - database_metadata_raw - ) - self.object_store_meta = ccl_chromium_indexeddb.ObjectStoreMetadata( - objectstore_metadata_raw - ) + extracted_values = [] - def get_database_metadata( - self, db_id: int, meta_type: DatabaseMetadataType - ) -> Optional[Union[str, int]]: - return self.database_metadata.get_meta(db_id, meta_type) + for db_info in wrapper.database_ids: + # Skip databases without a valid dbid_no + if db_info.dbid_no is None: + continue - def get_object_store_metadata( - self, db_id: int, obj_store_id: int, meta_type: ObjectStoreMetadataType - ) -> Optional[Any]: - return self.object_store_meta.get_meta(db_id, obj_store_id, meta_type) + db = wrapper[db_info.dbid_no] - def __iter__(self) -> Iterator[dict[str, Any]]: - 
blink_deserializer = ccl_blink_value_deserializer.BlinkV8Deserializer() - # Loop through the databases and object stores based on their ids - for global_id in self.global_metadata.db_ids: - # print(f"Processing database: {global_id.name}") - if global_id.dbid_no == None: - print(f"WARNING: Skipping database {global_id.name}") + for obj_store_name in db.object_store_names: + # Skip empty object stores + if obj_store_name is None: continue - - max_object_stores = self.database_metadata.get_meta( - global_id.dbid_no, DatabaseMetadataType.MaximumObjectStoreId - ) - max_object_stores = max_object_stores if max_object_stores else 0 - - for object_store_id in range( - 1, - max_object_stores + 1, - ): - datastore = self.object_store_meta.get_meta( - global_id.dbid_no, - object_store_id, - ObjectStoreMetadataType.StoreName, - ) - - # print(f"\t Processing object store: {datastore}") - if datastore in TEAMS_DB_OBJECT_STORES: - prefix = bytes([0, global_id.dbid_no, object_store_id, 1]) - for record in self._fetched_records: - if record.key.startswith(prefix): - # Skip records with empty values as these cant properly decoded - if record.value == b"": - continue - ( - _value_version, - varint_raw, - ) = ccl_chromium_indexeddb.le_varint_from_bytes( - record.value - ) - val_idx = len(varint_raw) - # read the blink envelope - blink_type_tag = record.value[val_idx] - if blink_type_tag != 0xFF: - print("Blink type tag not present") - val_idx += 1 - - ( - _, - varint_raw, - ) = ccl_chromium_indexeddb.le_varint_from_bytes( - record.value[val_idx:] - ) - - val_idx += len(varint_raw) - - # read the raw value of the record. - obj_raw = io.BytesIO(record.value[val_idx:]) - try: - # Initialize deserializer and try deserialization. - deserializer = ccl_v8_value_deserializer.Deserializer( - obj_raw, - host_object_delegate=blink_deserializer.read, - ) - value = deserializer.read() - yield { - "key": record.key, - "value": value, - "origin_file": record.origin_file, - "store": datastore, - "state": record.state, - "seq": record.seq, - } - except Exception: - # TODO Some proper error handling wouldn't hurt - continue - - -def parse_db(filepath: Path) -> list[dict]: - db = FastIndexedDB(filepath) - return list(db) + if obj_store_name in TEAMS_DB_OBJECT_STORES or do_not_filter: + obj_store = db[obj_store_name] + records_per_object_store = 0 + for record in obj_store.iterate_records(): + records_per_object_store += 1 + sourcefile = str(filepath) + # TODO: Replace None values with actual values + state = None + seq = None + extracted_values.append({"key": record.key.raw_key, "value": record.value, "origin_file": sourcefile, "store": obj_store_name, "state": state, "seq": seq}) + print(f"{obj_store_name} {db.name} (Records: {records_per_object_store})") + return extracted_values -def parse_localstorage(filepath: Path) -> list[dict]: +def parse_localstorage(filepath): local_store = ccl_chromium_localstorage.LocalStoreDb(filepath) extracted_values = [] for record in local_store.iter_all_records(): @@ -255,7 +89,7 @@ def parse_localstorage(filepath: Path) -> list[dict]: return extracted_values -def parse_sessionstorage(filepath: Path) -> list[dict]: +def parse_sessionstorage(filepath): session_storage = ccl_chromium_sessionstorage.SessionStoreDb(filepath) extracted_values = [] for host in session_storage: @@ -276,12 +110,21 @@ def parse_sessionstorage(filepath: Path) -> list[dict]: return extracted_values -def write_results_to_json(data: object, outputpath: Path) -> None: - with outputpath.open("w", encoding="utf-8") as f: - 
json.dump(data, f, indent=4, sort_keys=True, default=str, ensure_ascii=False) +def write_results_to_json(data, outputpath): + # Dump messages into a json file + try: + with open(outputpath, "w", encoding="utf-8") as f: + json.dump( + data, f, indent=4, sort_keys=True, default=str, ensure_ascii=False + ) + except OSError as e: + print(e) -def parse_json() -> Any: +def parse_json(): # read data from a file. This is only for testing purpose. - with Path("teams.json").open() as json_file: - return json.load(json_file) + try: + with Path("teams.json").open() as json_file: + return json.load(json_file) + except OSError as e: + print(e) From ccdd881285b81fee12c26ee7ddd218199d9d79a9 Mon Sep 17 00:00:00 2001 From: Alexander Bilz Date: Tue, 16 Jan 2024 20:16:13 +0100 Subject: [PATCH 13/25] feat: implement basic parser for the object stores --- src/forensicsim/backend.py | 395 +++++++++++++++++++++++++------------ 1 file changed, 273 insertions(+), 122 deletions(-) diff --git a/src/forensicsim/backend.py b/src/forensicsim/backend.py index 9df24d2..bcfb656 100644 --- a/src/forensicsim/backend.py +++ b/src/forensicsim/backend.py @@ -1,130 +1,281 @@ -""" -MIT License - -Copyright (c) 2021 Alexander Bilz - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" - -import io import json -import os +import warnings +from dataclasses import dataclass, field +from datetime import datetime from pathlib import Path +from typing import Any, Optional -from chromedb import ( - ccl_chromium_indexeddb, - ccl_chromium_localstorage, - ccl_chromium_sessionstorage, +from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning +from dataclasses_json import ( + DataClassJsonMixin, + LetterCase, + Undefined, + config, ) -TEAMS_DB_OBJECT_STORES = ["replychains", "conversations", "people", "buddylist"] - -ENCODING = "iso-8859-1" - -""" -The following code is heavily adopted from the RawLevelDb and IndexedDB processing proposed by CCL Group - -https://github.com/cclgroupltd/ccl_chrome_indexeddb/blob/35b6a9efba1078cf339f9e64d2796b1f5f7c556f/ccl_chromium_indexeddb.py - -It uses an optimized enumeration approach for processing the metadata, which makes the original IndexedDB super slow. - -Additionally, it has a flag to filter for datastores, which are interesting for us. -""" - -def parse_db(filepath, do_not_filter=False): - # Open raw access to a LevelDB and deserialize the records. 
- wrapper = ccl_chromium_indexeddb.WrappedIndexDB(filepath) - - extracted_values = [] - - for db_info in wrapper.database_ids: - # Skip databases without a valid dbid_no - if db_info.dbid_no is None: - continue - - db = wrapper[db_info.dbid_no] - - for obj_store_name in db.object_store_names: - # Skip empty object stores - if obj_store_name is None: - continue - if obj_store_name in TEAMS_DB_OBJECT_STORES or do_not_filter: - obj_store = db[obj_store_name] - records_per_object_store = 0 - for record in obj_store.iterate_records(): - records_per_object_store += 1 - sourcefile = str(filepath) - # TODO: Replace None values with actual values - state = None - seq = None - extracted_values.append({"key": record.key.raw_key, "value": record.value, "origin_file": sourcefile, "store": obj_store_name, "state": state, "seq": seq}) - print(f"{obj_store_name} {db.name} (Records: {records_per_object_store})") - return extracted_values - - -def parse_localstorage(filepath): - local_store = ccl_chromium_localstorage.LocalStoreDb(filepath) - extracted_values = [] - for record in local_store.iter_all_records(): - try: - extracted_values.append(json.loads(record.value, strict=False)) - except json.decoder.JSONDecodeError: - continue - return extracted_values - - -def parse_sessionstorage(filepath): - session_storage = ccl_chromium_sessionstorage.SessionStoreDb(filepath) - extracted_values = [] - for host in session_storage: - print(host) - # Hosts can have multiple sessions associated with them - for session_store_values in session_storage.get_all_for_host(host).values(): - for session_store_value in session_store_values: - # response is of type SessionStoreValue - - # Make a nice dictionary out of it - entry = { - "key": host, - "value": session_store_value.value, - "guid": session_store_value.guid, - "leveldb_sequence_number": session_store_value.leveldb_sequence_number, - } - extracted_values.append(entry) - return extracted_values - - -def write_results_to_json(data, outputpath): - # Dump messages into a json file - try: - with open(outputpath, "w", encoding="utf-8") as f: - json.dump( - data, f, indent=4, sort_keys=True, default=str, ensure_ascii=False +from forensicsim.backend import parse_db, write_results_to_json + +# Suppress Beautiful Soup warnings +warnings.filterwarnings('ignore', category=MarkupResemblesLocatorWarning) + +def strip_html_tags(value): + # Get the text of any embedded html, such as divs, a href links + soup = BeautifulSoup(value, features="html.parser") + return soup.get_text() + + +def decode_dict(properties): + if isinstance(properties, bytes): + soup = BeautifulSoup(properties, features="html.parser") + properties = properties.decode(soup.original_encoding) + if isinstance(properties, dict): + # handle case where nested childs are dicts or list but provided with "" but have to be expanded. 
+ for key, value in properties.items(): + if isinstance(value, str) and value.startswith(("[", "{")): + properties[key] = json.loads(value, strict=False) + return properties + + return json.loads(properties, strict=False) + + +def decode_timestamp(content_utf8_encoded) -> datetime: + return datetime.utcfromtimestamp(int(content_utf8_encoded) / 1000) + + +def encode_timestamp(timestamp) -> Optional[str]: + if timestamp is not None: + return timestamp.strftime("%Y-%m-%dT%H:%M:%S.%f") + return None + + +JSON_CONFIG = config(letter_case=LetterCase.CAMEL, undefined=Undefined.EXCLUDE)[ + "dataclasses_json" +] + + +@dataclass() +class Meeting(DataClassJsonMixin): + dataclass_json_config = JSON_CONFIG + + client_update_time: Optional[str] = None + cached_deduplication_key: Optional[str] = None + id: Optional[str] = None + members: Optional[list[dict]] = None + thread_properties: dict[str, Any] = field( + default_factory=dict, metadata=config(decoder=decode_dict) + ) + type: Optional[str] = None + version: Optional[float] = None + + record_type: Optional[str] = field( + default="meeting", metadata=config(field_name="record_type") + ) + + def __eq__(self, other): + return self.cached_deduplication_key == other.cachedDeduplicationKey + + def __hash__(self): + return hash(self.cached_deduplication_key) + + def __lt__(self, other): + return self.cached_deduplication_key < other.cached_deduplication_key + + +@dataclass() +class Message(DataClassJsonMixin): + dataclass_json_config = JSON_CONFIG + + attachments: list[Any] = field(default_factory=list) + cached_deduplication_key: Optional[str] = None + client_arrival_time: Optional[str] = None + clientmessageid: Optional[str] = None + composetime: Optional[str] = None + conversation_id: Optional[str] = None + content: Optional[str] = field( + default=None, metadata=config(decoder=strip_html_tags) + ) + contenttype: Optional[str] = None + created_time: Optional[datetime] = field( + default=None, + metadata=config(decoder=decode_timestamp, encoder=encode_timestamp), + ) + creator: Optional[str] = None + is_from_me: Optional[bool] = None + message_kind: Optional[str] = None + messagetype: Optional[str] = None + originalarrivaltime: Optional[str] = None + properties: dict[str, Any] = field( + default_factory=dict, metadata=config(decoder=decode_dict) + ) + version: Optional[datetime] = field( + default=None, + metadata=config(decoder=decode_timestamp, encoder=encode_timestamp), + ) + + origin_file: Optional[str] = field( + default=None, metadata=config(field_name="origin_file") + ) + record_type: str = field( + default="message", metadata=config(field_name="record_type") + ) + + def __post_init__(self): + if self.cached_deduplication_key is None: + self.cached_deduplication_key = str(self.creator) + str( + self.clientmessageid ) - except OSError as e: - print(e) + def __eq__(self, other): + return self.cached_deduplication_key == other.cached_deduplication_key + + def __hash__(self): + return hash(self.cached_deduplication_key) + + def __lt__(self, other): + return self.cached_deduplication_key < other.cached_deduplication_key + + +@dataclass() +class Contact(DataClassJsonMixin): + dataclass_json_config = JSON_CONFIG + + display_name: Optional[str] = None + email: Optional[str] = None + mri: Optional[str] = field(default=None, compare=True) + user_principal_name: Optional[str] = None + + origin_file: Optional[str] = field( + default=None, metadata=config(field_name="origin_file") + ) + record_type: Optional[str] = field( + default="contact", 
metadata=config(field_name="record_type") + ) + + def __eq__(self, other): + return self.mri == other.mri + + def __hash__(self): + return hash(self.mri) + + def __lt__(self, other): + return self.mri < other.mri + + +def _parse_people(people: list[dict]) -> set[Contact]: + parsed_people = set() + for p in people: + + p |= p.get("value", {}) + p |= {"display_name": p.get("displayName")} + p |= {"email": p.get("email")} + p |= {"mri": p.get("mri")} + p |= {"user_principal_name": p.get("userPrincipalName")} + p |= {"origin_file": p.get("origin_file")} + + parsed_people.add(Contact.from_dict(p)) + return parsed_people + + +def _parse_buddies(buddies: list[dict]) -> set[Contact]: + parsed_buddies = set() + for b in buddies: + buddies_of_b = b.get("value", {}).get("buddies", []) + for b_of_b in buddies_of_b: + + b_of_b |= {"origin_file": b.get("origin_file")} + parsed_buddies.add(Contact.from_dict(b_of_b)) + return parsed_buddies + + +def _parse_conversations(conversations: list[dict]) -> set[Meeting]: + cleaned_conversations = set() + for c in conversations: + + + if c.get("value", {}).get("type", "") == "Meeting" and "meeting" in c.get("value", {}).get( + "threadProperties", {} + ): + last_message = c.get("value", {}).get("lastMessage", {}) + meeting_properties = c.get("value", {}).get("threadProperties", {}) + c |= c.get("value", {}) + c |= {"client_update_time": c.get("clientUpdateTime")} + c |= {"id": c.get("id")} + c |= {"members": c.get("members")} + c |= {"thread_properties": meeting_properties} + c |= {"client_update_time": c.get("clientUpdateTime")} + c |= {"version": c.get("version")} + c |= {"last_message": last_message} + c |= {"cached_deduplication_key": c.get("id")} + cleaned_conversations.add(Meeting.from_dict(c)) + return cleaned_conversations + + +def _parse_reply_chains(reply_chains: list[dict]) -> set[Message]: + cleaned_reply_chains = set() + for rc in reply_chains: + rc |= {"origin_file": rc.get("origin_file")} + + message_dict = {} + if rc.get("value", {}).get("messageMap", {}) or rc.get("value", {}).get("messages", {}): + if rc.get("value", {}).get("messageMap", {}): + message_dict = rc.get("value", {}).get("messageMap", {}) + else: + message_dict = rc.get("value", {}).get("messages", {}) + + for k in message_dict: + md = message_dict[k] + + if md.get("messageType", "") == "RichText/Html" or md.get("messageType", "") == "Text": + rc |= rc.get("value", {}) + rc |= {"cached_deduplication_key": md.get("dedupeKey")} + rc |= {"clientmessageid": md.get("clientMessageId")} + rc |= {"composetime": md.get("clientArrivalTime")} + rc |= {"conversation_id": md.get("conversationId")} + rc |= {"content": md.get("content")} + rc |= {"contenttype": md.get("contentType")} + rc |= {"created_time": md.get("clientArrivalTime")} + rc |= {"creator": md.get("version")} + rc |= {"is_from_me": md.get("isSentByCurrentUser")} + rc |= {"messagetype": md.get("messageType")} + rc |= {"originalArrivalTime": md.get("version")} + rc |= {"client_arrival_time": md.get("clientArrivalTime")} + rc |= {"original_arrival_time": md.get("clientArrivalTime")} + rc |= {"version": md.get("version")} + rc |= {"properties": md.get("properties")} + + cleaned_reply_chains.add(Message.from_dict(rc)) + + return cleaned_reply_chains + + +def parse_records(records: list[dict]) -> list[dict]: + people, buddies, reply_chains, conversations = [], [], [], [] + + for r in records: + store = r.get("store", "other") + if store == "people": + people.append(r) + elif store == "buddylist": + buddies.append(r) + elif store == 
"replychains": + reply_chains.append(r) + elif store == "conversations": + conversations.append(r) + + # sort within groups i.e., Contacts, Meetings, Conversations + parsed_records = ( + sorted(_parse_people(people)) + # + sorted(_parse_buddies(buddies)) + + sorted(_parse_reply_chains(reply_chains)) + + sorted(_parse_conversations(conversations)) + ) + return [r.to_dict() for r in parsed_records] + + +def process_db(input_path: Path, output_path: Path): + if not input_path.parts[-1].endswith(".leveldb"): + raise ValueError(f"Expected a leveldb folder. Path: {input_path}") -def parse_json(): - # read data from a file. This is only for testing purpose. - try: - with Path("teams.json").open() as json_file: - return json.load(json_file) - except OSError as e: - print(e) + extracted_values = parse_db(input_path) + parsed_records = parse_records(extracted_values) + write_results_to_json(parsed_records, output_path) From d74dfeb342d66b7985cde6943bfa8426338ac044 Mon Sep 17 00:00:00 2001 From: Alexander Bilz Date: Wed, 17 Jan 2024 17:08:15 +0100 Subject: [PATCH 14/25] fix: wrong file locations --- src/forensicsim/backend.py | 402 ++++++++++++------------------------- src/forensicsim/parser.py | 76 +++++-- 2 files changed, 186 insertions(+), 292 deletions(-) diff --git a/src/forensicsim/backend.py b/src/forensicsim/backend.py index bcfb656..6533254 100644 --- a/src/forensicsim/backend.py +++ b/src/forensicsim/backend.py @@ -1,281 +1,137 @@ +""" +MIT License + +Copyright (c) 2021 Alexander Bilz + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+""" + +import io import json -import warnings -from dataclasses import dataclass, field -from datetime import datetime +import os from pathlib import Path -from typing import Any, Optional -from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning -from dataclasses_json import ( - DataClassJsonMixin, - LetterCase, - Undefined, - config, +from chromedb import ( + ccl_blink_value_deserializer, + ccl_chromium_indexeddb, + ccl_chromium_localstorage, + ccl_chromium_sessionstorage, + ccl_leveldb, + ccl_v8_value_deserializer, +) +from chromedb.ccl_chromium_indexeddb import ( + DatabaseMetadataType, + ObjectStoreMetadataType, ) -from forensicsim.backend import parse_db, write_results_to_json - -# Suppress Beautiful Soup warnings -warnings.filterwarnings('ignore', category=MarkupResemblesLocatorWarning) - -def strip_html_tags(value): - # Get the text of any embedded html, such as divs, a href links - soup = BeautifulSoup(value, features="html.parser") - return soup.get_text() - - -def decode_dict(properties): - if isinstance(properties, bytes): - soup = BeautifulSoup(properties, features="html.parser") - properties = properties.decode(soup.original_encoding) - if isinstance(properties, dict): - # handle case where nested childs are dicts or list but provided with "" but have to be expanded. - for key, value in properties.items(): - if isinstance(value, str) and value.startswith(("[", "{")): - properties[key] = json.loads(value, strict=False) - return properties - - return json.loads(properties, strict=False) - - -def decode_timestamp(content_utf8_encoded) -> datetime: - return datetime.utcfromtimestamp(int(content_utf8_encoded) / 1000) - - -def encode_timestamp(timestamp) -> Optional[str]: - if timestamp is not None: - return timestamp.strftime("%Y-%m-%dT%H:%M:%S.%f") - return None - - -JSON_CONFIG = config(letter_case=LetterCase.CAMEL, undefined=Undefined.EXCLUDE)[ - "dataclasses_json" -] - - -@dataclass() -class Meeting(DataClassJsonMixin): - dataclass_json_config = JSON_CONFIG - - client_update_time: Optional[str] = None - cached_deduplication_key: Optional[str] = None - id: Optional[str] = None - members: Optional[list[dict]] = None - thread_properties: dict[str, Any] = field( - default_factory=dict, metadata=config(decoder=decode_dict) - ) - type: Optional[str] = None - version: Optional[float] = None - - record_type: Optional[str] = field( - default="meeting", metadata=config(field_name="record_type") - ) - - def __eq__(self, other): - return self.cached_deduplication_key == other.cachedDeduplicationKey - - def __hash__(self): - return hash(self.cached_deduplication_key) - - def __lt__(self, other): - return self.cached_deduplication_key < other.cached_deduplication_key - - -@dataclass() -class Message(DataClassJsonMixin): - dataclass_json_config = JSON_CONFIG - - attachments: list[Any] = field(default_factory=list) - cached_deduplication_key: Optional[str] = None - client_arrival_time: Optional[str] = None - clientmessageid: Optional[str] = None - composetime: Optional[str] = None - conversation_id: Optional[str] = None - content: Optional[str] = field( - default=None, metadata=config(decoder=strip_html_tags) - ) - contenttype: Optional[str] = None - created_time: Optional[datetime] = field( - default=None, - metadata=config(decoder=decode_timestamp, encoder=encode_timestamp), - ) - creator: Optional[str] = None - is_from_me: Optional[bool] = None - message_kind: Optional[str] = None - messagetype: Optional[str] = None - originalarrivaltime: Optional[str] = None - properties: dict[str, 
Any] = field( - default_factory=dict, metadata=config(decoder=decode_dict) - ) - version: Optional[datetime] = field( - default=None, - metadata=config(decoder=decode_timestamp, encoder=encode_timestamp), - ) - - origin_file: Optional[str] = field( - default=None, metadata=config(field_name="origin_file") - ) - record_type: str = field( - default="message", metadata=config(field_name="record_type") - ) - - def __post_init__(self): - if self.cached_deduplication_key is None: - self.cached_deduplication_key = str(self.creator) + str( - self.clientmessageid +TEAMS_DB_OBJECT_STORES = ["replychains", "conversations", "people", "buddylist"] + +ENCODING = "iso-8859-1" + +""" +The following code is heavily adopted from the RawLevelDb and IndexedDB processing proposed by CCL Group + +https://github.com/cclgroupltd/ccl_chrome_indexeddb/blob/35b6a9efba1078cf339f9e64d2796b1f5f7c556f/ccl_chromium_indexeddb.py + +It uses an optimized enumeration approach for processing the metadata, which makes the original IndexedDB super slow. + +Additionally, it has a flag to filter for datastores, which are interesting for us. +""" + +def parse_db(filepath, do_not_filter=False): + # Open raw access to a LevelDB and deserialize the records. + wrapper = ccl_chromium_indexeddb.WrappedIndexDB(filepath) + + extracted_values = [] + + for db_info in wrapper.database_ids: + # Skip databases without a valid dbid_no + if db_info.dbid_no is None: + continue + + db = wrapper[db_info.dbid_no] + + for obj_store_name in db.object_store_names: + # Skip empty object stores + if obj_store_name is None: + continue + if obj_store_name in TEAMS_DB_OBJECT_STORES or do_not_filter: + obj_store = db[obj_store_name] + records_per_object_store = 0 + for record in obj_store.iterate_records(): + records_per_object_store += 1 + sourcefile = str(filepath) + # TODO: Fix None values + state = None + seq = None + extracted_values.append({"key": record.key.raw_key, "value": record.value, "origin_file": sourcefile, "store": obj_store_name, "state": state, "seq": seq}) + print(f"{obj_store_name} {db.name} (Records: {records_per_object_store})") + return extracted_values + + +def parse_localstorage(filepath): + local_store = ccl_chromium_localstorage.LocalStoreDb(filepath) + extracted_values = [] + for record in local_store.iter_all_records(): + try: + extracted_values.append(json.loads(record.value, strict=False)) + except json.decoder.JSONDecodeError: + continue + return extracted_values + + +def parse_sessionstorage(filepath): + session_storage = ccl_chromium_sessionstorage.SessionStoreDb(filepath) + extracted_values = [] + for host in session_storage: + print(host) + # Hosts can have multiple sessions associated with them + for session_store_values in session_storage.get_all_for_host(host).values(): + for session_store_value in session_store_values: + # response is of type SessionStoreValue + + # Make a nice dictionary out of it + entry = { + "key": host, + "value": session_store_value.value, + "guid": session_store_value.guid, + "leveldb_sequence_number": session_store_value.leveldb_sequence_number, + } + extracted_values.append(entry) + return extracted_values + + +def write_results_to_json(data, outputpath): + # Dump messages into a json file + try: + with open(outputpath, "w", encoding="utf-8") as f: + json.dump( + data, f, indent=4, sort_keys=True, default=str, ensure_ascii=False ) + except OSError as e: + print(e) - def __eq__(self, other): - return self.cached_deduplication_key == other.cached_deduplication_key - - def __hash__(self): - 
return hash(self.cached_deduplication_key) - - def __lt__(self, other): - return self.cached_deduplication_key < other.cached_deduplication_key - - -@dataclass() -class Contact(DataClassJsonMixin): - dataclass_json_config = JSON_CONFIG - - display_name: Optional[str] = None - email: Optional[str] = None - mri: Optional[str] = field(default=None, compare=True) - user_principal_name: Optional[str] = None - - origin_file: Optional[str] = field( - default=None, metadata=config(field_name="origin_file") - ) - record_type: Optional[str] = field( - default="contact", metadata=config(field_name="record_type") - ) - - def __eq__(self, other): - return self.mri == other.mri - - def __hash__(self): - return hash(self.mri) - - def __lt__(self, other): - return self.mri < other.mri - - -def _parse_people(people: list[dict]) -> set[Contact]: - parsed_people = set() - for p in people: - - p |= p.get("value", {}) - p |= {"display_name": p.get("displayName")} - p |= {"email": p.get("email")} - p |= {"mri": p.get("mri")} - p |= {"user_principal_name": p.get("userPrincipalName")} - p |= {"origin_file": p.get("origin_file")} - - parsed_people.add(Contact.from_dict(p)) - return parsed_people - - -def _parse_buddies(buddies: list[dict]) -> set[Contact]: - parsed_buddies = set() - for b in buddies: - buddies_of_b = b.get("value", {}).get("buddies", []) - for b_of_b in buddies_of_b: - - b_of_b |= {"origin_file": b.get("origin_file")} - parsed_buddies.add(Contact.from_dict(b_of_b)) - return parsed_buddies - - -def _parse_conversations(conversations: list[dict]) -> set[Meeting]: - cleaned_conversations = set() - for c in conversations: - - - if c.get("value", {}).get("type", "") == "Meeting" and "meeting" in c.get("value", {}).get( - "threadProperties", {} - ): - last_message = c.get("value", {}).get("lastMessage", {}) - meeting_properties = c.get("value", {}).get("threadProperties", {}) - c |= c.get("value", {}) - c |= {"client_update_time": c.get("clientUpdateTime")} - c |= {"id": c.get("id")} - c |= {"members": c.get("members")} - c |= {"thread_properties": meeting_properties} - c |= {"client_update_time": c.get("clientUpdateTime")} - c |= {"version": c.get("version")} - c |= {"last_message": last_message} - c |= {"cached_deduplication_key": c.get("id")} - cleaned_conversations.add(Meeting.from_dict(c)) - return cleaned_conversations - - -def _parse_reply_chains(reply_chains: list[dict]) -> set[Message]: - cleaned_reply_chains = set() - for rc in reply_chains: - rc |= {"origin_file": rc.get("origin_file")} - - message_dict = {} - if rc.get("value", {}).get("messageMap", {}) or rc.get("value", {}).get("messages", {}): - if rc.get("value", {}).get("messageMap", {}): - message_dict = rc.get("value", {}).get("messageMap", {}) - else: - message_dict = rc.get("value", {}).get("messages", {}) - - for k in message_dict: - md = message_dict[k] - - if md.get("messageType", "") == "RichText/Html" or md.get("messageType", "") == "Text": - rc |= rc.get("value", {}) - rc |= {"cached_deduplication_key": md.get("dedupeKey")} - rc |= {"clientmessageid": md.get("clientMessageId")} - rc |= {"composetime": md.get("clientArrivalTime")} - rc |= {"conversation_id": md.get("conversationId")} - rc |= {"content": md.get("content")} - rc |= {"contenttype": md.get("contentType")} - rc |= {"created_time": md.get("clientArrivalTime")} - rc |= {"creator": md.get("version")} - rc |= {"is_from_me": md.get("isSentByCurrentUser")} - rc |= {"messagetype": md.get("messageType")} - rc |= {"originalArrivalTime": md.get("version")} - rc |= 
{"client_arrival_time": md.get("clientArrivalTime")} - rc |= {"original_arrival_time": md.get("clientArrivalTime")} - rc |= {"version": md.get("version")} - rc |= {"properties": md.get("properties")} - - cleaned_reply_chains.add(Message.from_dict(rc)) - - return cleaned_reply_chains - - -def parse_records(records: list[dict]) -> list[dict]: - people, buddies, reply_chains, conversations = [], [], [], [] - - for r in records: - store = r.get("store", "other") - if store == "people": - people.append(r) - elif store == "buddylist": - buddies.append(r) - elif store == "replychains": - reply_chains.append(r) - elif store == "conversations": - conversations.append(r) - - # sort within groups i.e., Contacts, Meetings, Conversations - parsed_records = ( - sorted(_parse_people(people)) - # + sorted(_parse_buddies(buddies)) - + sorted(_parse_reply_chains(reply_chains)) - + sorted(_parse_conversations(conversations)) - ) - return [r.to_dict() for r in parsed_records] - - -def process_db(input_path: Path, output_path: Path): - if not input_path.parts[-1].endswith(".leveldb"): - raise ValueError(f"Expected a leveldb folder. Path: {input_path}") - extracted_values = parse_db(input_path) - parsed_records = parse_records(extracted_values) - write_results_to_json(parsed_records, output_path) +def parse_json(): + # read data from a file. This is only for testing purpose. + try: + with Path("teams.json").open() as json_file: + return json.load(json_file) + except OSError as e: + print(e) diff --git a/src/forensicsim/parser.py b/src/forensicsim/parser.py index 5c2c3dd..bcfb656 100644 --- a/src/forensicsim/parser.py +++ b/src/forensicsim/parser.py @@ -1,10 +1,11 @@ import json +import warnings from dataclasses import dataclass, field from datetime import datetime from pathlib import Path from typing import Any, Optional -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning from dataclasses_json import ( DataClassJsonMixin, LetterCase, @@ -14,6 +15,8 @@ from forensicsim.backend import parse_db, write_results_to_json +# Suppress Beautiful Soup warnings +warnings.filterwarnings('ignore', category=MarkupResemblesLocatorWarning) def strip_html_tags(value): # Get the text of any embedded html, such as divs, a href links @@ -121,10 +124,6 @@ def __post_init__(self): self.cached_deduplication_key = str(self.creator) + str( self.clientmessageid ) - if "call-log" in self.properties: - self.record_type = "call" - if "activity" in self.properties: - self.record_type = "reaction" def __eq__(self, other): return self.cached_deduplication_key == other.cached_deduplication_key @@ -165,8 +164,14 @@ def __lt__(self, other): def _parse_people(people: list[dict]) -> set[Contact]: parsed_people = set() for p in people: - p |= {"origin_file": p.get("origin_file")} + p |= p.get("value", {}) + p |= {"display_name": p.get("displayName")} + p |= {"email": p.get("email")} + p |= {"mri": p.get("mri")} + p |= {"user_principal_name": p.get("userPrincipalName")} + p |= {"origin_file": p.get("origin_file")} + parsed_people.add(Contact.from_dict(p)) return parsed_people @@ -176,6 +181,7 @@ def _parse_buddies(buddies: list[dict]) -> set[Contact]: for b in buddies: buddies_of_b = b.get("value", {}).get("buddies", []) for b_of_b in buddies_of_b: + b_of_b |= {"origin_file": b.get("origin_file")} parsed_buddies.add(Contact.from_dict(b_of_b)) return parsed_buddies @@ -184,29 +190,61 @@ def _parse_buddies(buddies: list[dict]) -> set[Contact]: def _parse_conversations(conversations: list[dict]) -> set[Meeting]: 
cleaned_conversations = set() for c in conversations: - last_message = c.get("value", {}).get("lastMessage", {}) - c |= { - "cachedDeduplicationKey": last_message.get("cachedDeduplicationKey"), - } - if c.get("type", "") == "Meeting" and "meeting" in c.get( + if c.get("value", {}).get("type", "") == "Meeting" and "meeting" in c.get("value", {}).get( "threadProperties", {} ): + last_message = c.get("value", {}).get("lastMessage", {}) + meeting_properties = c.get("value", {}).get("threadProperties", {}) + c |= c.get("value", {}) + c |= {"client_update_time": c.get("clientUpdateTime")} + c |= {"id": c.get("id")} + c |= {"members": c.get("members")} + c |= {"thread_properties": meeting_properties} + c |= {"client_update_time": c.get("clientUpdateTime")} + c |= {"version": c.get("version")} + c |= {"last_message": last_message} + c |= {"cached_deduplication_key": c.get("id")} cleaned_conversations.add(Meeting.from_dict(c)) - return cleaned_conversations def _parse_reply_chains(reply_chains: list[dict]) -> set[Message]: cleaned_reply_chains = set() - for rc in reply_chains: - for message_values in rc.get("value", {}).get("messages", {}).values(): - message_values |= { - "origin_file": rc.get("origin_file"), - } - cleaned_reply_chains.add(Message.from_dict(message_values)) + rc |= {"origin_file": rc.get("origin_file")} + + message_dict = {} + if rc.get("value", {}).get("messageMap", {}) or rc.get("value", {}).get("messages", {}): + if rc.get("value", {}).get("messageMap", {}): + message_dict = rc.get("value", {}).get("messageMap", {}) + else: + message_dict = rc.get("value", {}).get("messages", {}) + + for k in message_dict: + md = message_dict[k] + + if md.get("messageType", "") == "RichText/Html" or md.get("messageType", "") == "Text": + rc |= rc.get("value", {}) + rc |= {"cached_deduplication_key": md.get("dedupeKey")} + rc |= {"clientmessageid": md.get("clientMessageId")} + rc |= {"composetime": md.get("clientArrivalTime")} + rc |= {"conversation_id": md.get("conversationId")} + rc |= {"content": md.get("content")} + rc |= {"contenttype": md.get("contentType")} + rc |= {"created_time": md.get("clientArrivalTime")} + rc |= {"creator": md.get("version")} + rc |= {"is_from_me": md.get("isSentByCurrentUser")} + rc |= {"messagetype": md.get("messageType")} + rc |= {"originalArrivalTime": md.get("version")} + rc |= {"client_arrival_time": md.get("clientArrivalTime")} + rc |= {"original_arrival_time": md.get("clientArrivalTime")} + rc |= {"version": md.get("version")} + rc |= {"properties": md.get("properties")} + + cleaned_reply_chains.add(Message.from_dict(rc)) + return cleaned_reply_chains @@ -227,7 +265,7 @@ def parse_records(records: list[dict]) -> list[dict]: # sort within groups i.e., Contacts, Meetings, Conversations parsed_records = ( sorted(_parse_people(people)) - + sorted(_parse_buddies(buddies)) + # + sorted(_parse_buddies(buddies)) + sorted(_parse_reply_chains(reply_chains)) + sorted(_parse_conversations(conversations)) ) From 10b0a875c6903243ecf92b9a02f868437b67b37f Mon Sep 17 00:00:00 2001 From: Markus Bilz Date: Wed, 17 Jan 2024 18:14:23 +0100 Subject: [PATCH 15/25] feat: minor refactorings --- src/forensicsim/backend.py | 23 +++++----- src/forensicsim/parser.py | 89 +++++++++++++++++++------------------ tools/Forensicsim_Parser.py | 2 +- tools/main.py | 4 +- 4 files changed, 61 insertions(+), 57 deletions(-) diff --git a/src/forensicsim/backend.py b/src/forensicsim/backend.py index 6533254..3aab92e 100644 --- a/src/forensicsim/backend.py +++ b/src/forensicsim/backend.py @@ -22,22 
+22,13 @@ SOFTWARE. """ -import io import json -import os from pathlib import Path from chromedb import ( - ccl_blink_value_deserializer, ccl_chromium_indexeddb, ccl_chromium_localstorage, ccl_chromium_sessionstorage, - ccl_leveldb, - ccl_v8_value_deserializer, -) -from chromedb.ccl_chromium_indexeddb import ( - DatabaseMetadataType, - ObjectStoreMetadataType, ) TEAMS_DB_OBJECT_STORES = ["replychains", "conversations", "people", "buddylist"] @@ -54,6 +45,7 @@ Additionally, it has a flag to filter for datastores, which are interesting for us. """ + def parse_db(filepath, do_not_filter=False): # Open raw access to a LevelDB and deserialize the records. wrapper = ccl_chromium_indexeddb.WrappedIndexDB(filepath) @@ -80,8 +72,17 @@ def parse_db(filepath, do_not_filter=False): # TODO: Fix None values state = None seq = None - extracted_values.append({"key": record.key.raw_key, "value": record.value, "origin_file": sourcefile, "store": obj_store_name, "state": state, "seq": seq}) - print(f"{obj_store_name} {db.name} (Records: {records_per_object_store})") + extracted_values.append({ + "key": record.key.raw_key, + "value": record.value, + "origin_file": sourcefile, + "store": obj_store_name, + "state": state, + "seq": seq, + }) + print( + f"{obj_store_name} {db.name} (Records: {records_per_object_store})" + ) return extracted_values diff --git a/src/forensicsim/parser.py b/src/forensicsim/parser.py index bcfb656..218d391 100644 --- a/src/forensicsim/parser.py +++ b/src/forensicsim/parser.py @@ -3,7 +3,7 @@ from dataclasses import dataclass, field from datetime import datetime from pathlib import Path -from typing import Any, Optional +from typing import Any, Optional, Union from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning from dataclasses_json import ( @@ -16,15 +16,16 @@ from forensicsim.backend import parse_db, write_results_to_json # Suppress Beautiful Soup warnings -warnings.filterwarnings('ignore', category=MarkupResemblesLocatorWarning) +warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning) -def strip_html_tags(value): + +def strip_html_tags(value: str) -> str: # Get the text of any embedded html, such as divs, a href links soup = BeautifulSoup(value, features="html.parser") return soup.get_text() -def decode_dict(properties): +def decode_dict(properties: Union[bytes, str, dict]) -> dict[str, Any]: if isinstance(properties, bytes): soup = BeautifulSoup(properties, features="html.parser") properties = properties.decode(soup.original_encoding) @@ -38,11 +39,11 @@ def decode_dict(properties): return json.loads(properties, strict=False) -def decode_timestamp(content_utf8_encoded) -> datetime: +def decode_timestamp(content_utf8_encoded: str) -> datetime: return datetime.utcfromtimestamp(int(content_utf8_encoded) / 1000) -def encode_timestamp(timestamp) -> Optional[str]: +def encode_timestamp(timestamp: Optional[datetime]) -> Optional[str]: if timestamp is not None: return timestamp.strftime("%Y-%m-%dT%H:%M:%S.%f") return None @@ -71,13 +72,17 @@ class Meeting(DataClassJsonMixin): default="meeting", metadata=config(field_name="record_type") ) - def __eq__(self, other): + def __eq__(self, other: object) -> bool: + if not isinstance(other, Meeting): + return NotImplemented return self.cached_deduplication_key == other.cachedDeduplicationKey - def __hash__(self): + def __hash__(self) -> int: return hash(self.cached_deduplication_key) - def __lt__(self, other): + def __lt__(self, other: object) -> bool: + if not isinstance(other, Meeting): + return NotImplemented return 
self.cached_deduplication_key < other.cached_deduplication_key @@ -119,19 +124,23 @@ class Message(DataClassJsonMixin): default="message", metadata=config(field_name="record_type") ) - def __post_init__(self): + def __post_init__(self) -> None: if self.cached_deduplication_key is None: self.cached_deduplication_key = str(self.creator) + str( self.clientmessageid ) - def __eq__(self, other): + def __eq__(self, other: object) -> bool: + if not isinstance(other, Message): + return NotImplemented return self.cached_deduplication_key == other.cached_deduplication_key - def __hash__(self): + def __hash__(self) -> int: return hash(self.cached_deduplication_key) - def __lt__(self, other): + def __lt__(self, other: object) -> bool: + if not isinstance(other, Message): + return NotImplemented return self.cached_deduplication_key < other.cached_deduplication_key @@ -151,25 +160,24 @@ class Contact(DataClassJsonMixin): default="contact", metadata=config(field_name="record_type") ) - def __eq__(self, other): + def __eq__(self, other: object) -> bool: + if not isinstance(other, Contact): + return NotImplemented return self.mri == other.mri - def __hash__(self): + def __hash__(self) -> int: return hash(self.mri) - def __lt__(self, other): + def __lt__(self, other: object) -> bool: + if not isinstance(other, Contact): + return NotImplemented return self.mri < other.mri def _parse_people(people: list[dict]) -> set[Contact]: parsed_people = set() for p in people: - p |= p.get("value", {}) - p |= {"display_name": p.get("displayName")} - p |= {"email": p.get("email")} - p |= {"mri": p.get("mri")} - p |= {"user_principal_name": p.get("userPrincipalName")} p |= {"origin_file": p.get("origin_file")} parsed_people.add(Contact.from_dict(p)) @@ -181,7 +189,6 @@ def _parse_buddies(buddies: list[dict]) -> set[Contact]: for b in buddies: buddies_of_b = b.get("value", {}).get("buddies", []) for b_of_b in buddies_of_b: - b_of_b |= {"origin_file": b.get("origin_file")} parsed_buddies.add(Contact.from_dict(b_of_b)) return parsed_buddies @@ -190,21 +197,12 @@ def _parse_buddies(buddies: list[dict]) -> set[Contact]: def _parse_conversations(conversations: list[dict]) -> set[Meeting]: cleaned_conversations = set() for c in conversations: - - - if c.get("value", {}).get("type", "") == "Meeting" and "meeting" in c.get("value", {}).get( - "threadProperties", {} - ): - last_message = c.get("value", {}).get("lastMessage", {}) - meeting_properties = c.get("value", {}).get("threadProperties", {}) - c |= c.get("value", {}) - c |= {"client_update_time": c.get("clientUpdateTime")} - c |= {"id": c.get("id")} - c |= {"members": c.get("members")} - c |= {"thread_properties": meeting_properties} - c |= {"client_update_time": c.get("clientUpdateTime")} - c |= {"version": c.get("version")} - c |= {"last_message": last_message} + if c.get("value", {}).get("type", "") == "Meeting" and "meeting" in c.get( + "value", {} + ).get("threadProperties", {}): + c_value = c.get("value", {}) + c |= c_value + c |= {"thread_properties": c_value.get("threadProperties", {})} c |= {"cached_deduplication_key": c.get("id")} cleaned_conversations.add(Meeting.from_dict(c)) return cleaned_conversations @@ -214,9 +212,11 @@ def _parse_reply_chains(reply_chains: list[dict]) -> set[Message]: cleaned_reply_chains = set() for rc in reply_chains: rc |= {"origin_file": rc.get("origin_file")} - + message_dict = {} - if rc.get("value", {}).get("messageMap", {}) or rc.get("value", {}).get("messages", {}): + if rc.get("value", {}).get("messageMap", {}) or rc.get("value", 
{}).get( + "messages", {} + ): if rc.get("value", {}).get("messageMap", {}): message_dict = rc.get("value", {}).get("messageMap", {}) else: @@ -225,7 +225,10 @@ def _parse_reply_chains(reply_chains: list[dict]) -> set[Message]: for k in message_dict: md = message_dict[k] - if md.get("messageType", "") == "RichText/Html" or md.get("messageType", "") == "Text": + if ( + md.get("messageType", "") == "RichText/Html" + or md.get("messageType", "") == "Text" + ): rc |= rc.get("value", {}) rc |= {"cached_deduplication_key": md.get("dedupeKey")} rc |= {"clientmessageid": md.get("clientMessageId")} @@ -242,7 +245,7 @@ def _parse_reply_chains(reply_chains: list[dict]) -> set[Message]: rc |= {"original_arrival_time": md.get("clientArrivalTime")} rc |= {"version": md.get("version")} rc |= {"properties": md.get("properties")} - + cleaned_reply_chains.add(Message.from_dict(rc)) return cleaned_reply_chains @@ -265,14 +268,14 @@ def parse_records(records: list[dict]) -> list[dict]: # sort within groups i.e., Contacts, Meetings, Conversations parsed_records = ( sorted(_parse_people(people)) - # + sorted(_parse_buddies(buddies)) + # + sorted(_parse_buddies(buddies)) + sorted(_parse_reply_chains(reply_chains)) + sorted(_parse_conversations(conversations)) ) return [r.to_dict() for r in parsed_records] -def process_db(input_path: Path, output_path: Path): +def process_db(input_path: Path, output_path: Path) -> None: if not input_path.parts[-1].endswith(".leveldb"): raise ValueError(f"Expected a leveldb folder. Path: {input_path}") diff --git a/tools/Forensicsim_Parser.py b/tools/Forensicsim_Parser.py index 3b35d94..ce35ec1 100644 --- a/tools/Forensicsim_Parser.py +++ b/tools/Forensicsim_Parser.py @@ -88,7 +88,7 @@ # Factory that defines the name and details of the module and allows Autopsy # to create instances of the modules that will do the analysis. class ForensicIMIngestModuleFactory(IngestModuleFactoryAdapter): - def __init__(self): + def __init__(self) -> None: self.settings = None moduleName = "Microsoft Teams Parser" diff --git a/tools/main.py b/tools/main.py index 10bda5a..71f30e4 100644 --- a/tools/main.py +++ b/tools/main.py @@ -31,7 +31,7 @@ from forensicsim.parser import parse_records -def process_db(input_path: Path, output_path: Path): +def process_db(input_path: Path, output_path: Path) -> None: if not input_path.parts[-1].endswith(".leveldb"): raise ValueError(f"Expected a leveldb folder. Path: {input_path}") @@ -57,7 +57,7 @@ def process_db(input_path: Path, output_path: Path): required=True, help="File path to the processed output.", ) -def process_cmd(filepath, outputpath): +def process_cmd(filepath: Path, outputpath: Path) -> None: click.echo(XTRACT_HEADER) process_db(filepath, outputpath) From b5accff81297d942a458ea6c86396a3bfe2fb75e Mon Sep 17 00:00:00 2001 From: Alexander Bilz Date: Sat, 20 Jan 2024 11:35:12 +0100 Subject: [PATCH 16/25] feat: add option to pass .blob folder --- src/forensicsim/backend.py | 5 +++-- src/forensicsim/parser.py | 7 +++++-- tools/dump_leveldb.py | 29 +++++++++++++++++++---------- tools/main.py | 30 ++++++++++++++++-------------- 4 files changed, 43 insertions(+), 28 deletions(-) diff --git a/src/forensicsim/backend.py b/src/forensicsim/backend.py index 3aab92e..eea7b1c 100644 --- a/src/forensicsim/backend.py +++ b/src/forensicsim/backend.py @@ -46,10 +46,11 @@ """ -def parse_db(filepath, do_not_filter=False): +def parse_db(filepath : Path, blobpath: Path = None, do_not_filter: bool=False): # Open raw access to a LevelDB and deserialize the records. 
- wrapper = ccl_chromium_indexeddb.WrappedIndexDB(filepath) + wrapper = ccl_chromium_indexeddb.WrappedIndexDB(filepath, blobpath) + extracted_values = [] for db_info in wrapper.database_ids: diff --git a/src/forensicsim/parser.py b/src/forensicsim/parser.py index 218d391..20cfc91 100644 --- a/src/forensicsim/parser.py +++ b/src/forensicsim/parser.py @@ -275,10 +275,13 @@ def parse_records(records: list[dict]) -> list[dict]: return [r.to_dict() for r in parsed_records] -def process_db(input_path: Path, output_path: Path) -> None: +def process_db(input_path: Path, output_path: Path, blob_path: Path=None, do_not_filter: bool = True) -> None: if not input_path.parts[-1].endswith(".leveldb"): raise ValueError(f"Expected a leveldb folder. Path: {input_path}") - extracted_values = parse_db(input_path) + if blob_path is not None and not blob_path.parts[-1].endswith(".blob"): + raise ValueError(f"Expected a .blob folder. Path: {blob_path}") + + extracted_values = parse_db(input_path, blob_path, do_not_filter) parsed_records = parse_records(extracted_values) write_results_to_json(parsed_records, output_path) diff --git a/tools/dump_leveldb.py b/tools/dump_leveldb.py index bdcb814..3f6fe63 100644 --- a/tools/dump_leveldb.py +++ b/tools/dump_leveldb.py @@ -26,17 +26,14 @@ import click -from forensicsim.backend import parse_db, write_results_to_json +from forensicsim.backend import write_results_to_json +from forensicsim.parser import parse_db from forensicsim.consts import DUMP_HEADER - -def process_db(input_path, output_path): - # Do some basic error handling - if not input_path.parts[-1].endswith(".leveldb"): - raise ValueError(f"Expected a leveldb folder. Path: {input_path}") +def process_level_db(input_path: Path, output_path: Path, blob_path: Path=None): # convert the database to a python list with nested dictionaries - extracted_values = parse_db(input_path, do_not_filter=True) + extracted_values = parse_db(input_path, blob_path, do_not_filter=True) # write the output to a json file write_results_to_json(extracted_values, output_path) @@ -50,8 +47,9 @@ def process_db(input_path, output_path): exists=True, readable=True, writable=False, dir_okay=True, path_type=Path ), required=True, - help="File path to the IndexedDB.", + help="File path to the .leveldb folder of the IndexedDB.", ) + @click.option( "-o", "--outputpath", @@ -59,9 +57,20 @@ def process_db(input_path, output_path): required=True, help="File path to the processed output.", ) -def process_cmd(filepath, outputpath): + +@click.option( + "-b", + "--blobpath", + type=click.Path( + exists=True, readable=True, writable=False, dir_okay=True, path_type=Path + ), + required=False, + help="File path to the .blob folder of the IndexedDB.", +) + +def process_cmd(filepath: Path, outputpath: Path, blobpath: Path) -> None: click.echo(DUMP_HEADER) - process_db(filepath, outputpath) + process_level_db(filepath, outputpath, blobpath) if __name__ == "__main__": diff --git a/tools/main.py b/tools/main.py index 71f30e4..4b05f3d 100644 --- a/tools/main.py +++ b/tools/main.py @@ -28,17 +28,7 @@ from forensicsim.backend import parse_db, write_results_to_json from forensicsim.consts import XTRACT_HEADER -from forensicsim.parser import parse_records - - -def process_db(input_path: Path, output_path: Path) -> None: - if not input_path.parts[-1].endswith(".leveldb"): - raise ValueError(f"Expected a leveldb folder. 
Path: {input_path}") - - extracted_values = parse_db(input_path) - parsed_records = parse_records(extracted_values) - write_results_to_json(parsed_records, output_path) - +from forensicsim.parser import process_db @click.command() @click.option( @@ -48,8 +38,9 @@ def process_db(input_path: Path, output_path: Path) -> None: exists=True, readable=True, writable=False, dir_okay=True, path_type=Path ), required=True, - help="File path to the IndexedDB.", + help="File path to the .leveldb folder of the IndexedDB.", ) + @click.option( "-o", "--outputpath", @@ -57,9 +48,20 @@ def process_db(input_path: Path, output_path: Path) -> None: required=True, help="File path to the processed output.", ) -def process_cmd(filepath: Path, outputpath: Path) -> None: + +@click.option( + "-b", + "--blobpath", + type=click.Path( + exists=True, readable=True, writable=False, dir_okay=True, path_type=Path + ), + required=False, + help="File path to the .blob folder of the IndexedDB.", +) + +def process_cmd(filepath: Path, outputpath: Path, blobpath: Path) -> None: click.echo(XTRACT_HEADER) - process_db(filepath, outputpath) + process_db(filepath, outputpath, blobpath, False) if __name__ == "__main__": From 6b6b42f9b5f8689058f59fca2f4ca0cff402f4ab Mon Sep 17 00:00:00 2001 From: Alexander Bilz Date: Sat, 20 Jan 2024 13:06:03 +0100 Subject: [PATCH 17/25] chore: update path of submodule --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index 4099ff0..5298878 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "forensicsim-data"] path = forensicsim-data - url = https://github.com/KarelZe/forensicsim-data.git + url = https://github.com/lxndrblz/forensicsim-data.git From 08306f24dd3f201b7359d576736f090260f35ff3 Mon Sep 17 00:00:00 2001 From: Alexander Bilz Date: Sat, 20 Jan 2024 13:36:11 +0100 Subject: [PATCH 18/25] chore: update path to chromedb package --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0361102..6ee0eed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,13 +10,13 @@ readme = "README.md" license = {file = "LICENSE.md"} requires-python = ">=3.9" authors = [ - { name = "Alexander Bilz", email = "github@markusbilz.com" }, + { name = "Alexander Bilz", email = "github@alexbilz.com" }, { name = "Markus Bilz", email = "github@markusbilz.com" } ] dependencies = [ "beautifulsoup4", "click", -"chromedb @ git+https://github.com/karelze/ccl_chrome_indexeddb@master", +"chromedb @ git+https://github.com/lxndrblz/chromedb@main", "dataclasses-json", "pause", "pyautogui", From 4e3bda91376daae4abbf8b127defeb3610588596 Mon Sep 17 00:00:00 2001 From: Alexander Bilz Date: Sat, 20 Jan 2024 17:02:24 +0100 Subject: [PATCH 19/25] fix: restore parser capability for MS Teams v1 --- src/forensicsim/parser.py | 50 ++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/src/forensicsim/parser.py b/src/forensicsim/parser.py index 20cfc91..9664e5a 100644 --- a/src/forensicsim/parser.py +++ b/src/forensicsim/parser.py @@ -75,7 +75,7 @@ class Meeting(DataClassJsonMixin): def __eq__(self, other: object) -> bool: if not isinstance(other, Meeting): return NotImplemented - return self.cached_deduplication_key == other.cachedDeduplicationKey + return self.cached_deduplication_key == other.cached_deduplication_key def __hash__(self) -> int: return hash(self.cached_deduplication_key) @@ -108,7 +108,7 @@ class Message(DataClassJsonMixin): 
is_from_me: Optional[bool] = None message_kind: Optional[str] = None messagetype: Optional[str] = None - originalarrivaltime: Optional[str] = None + original_arrival_time: Optional[str] = None properties: dict[str, Any] = field( default_factory=dict, metadata=config(decoder=decode_dict) ) @@ -129,6 +129,11 @@ def __post_init__(self) -> None: self.cached_deduplication_key = str(self.creator) + str( self.clientmessageid ) + # change record type depending on properties + if "call-log" in self.properties: + self.record_type = "call" + if "activity" in self.properties: + self.record_type = "reaction" def __eq__(self, other: object) -> bool: if not isinstance(other, Message): @@ -193,7 +198,8 @@ def _parse_buddies(buddies: list[dict]) -> set[Contact]: parsed_buddies.add(Contact.from_dict(b_of_b)) return parsed_buddies - +# Conversations can contain multiple artefacts +# -> If type:Meeting then its a meeting def _parse_conversations(conversations: list[dict]) -> set[Meeting]: cleaned_conversations = set() for c in conversations: @@ -211,6 +217,13 @@ def _parse_conversations(conversations: list[dict]) -> set[Meeting]: def _parse_reply_chains(reply_chains: list[dict]) -> set[Message]: cleaned_reply_chains = set() for rc in reply_chains: + + + # Skip empty records + if rc["value"] is None: + continue + + rc |= rc.get("value", {}) rc |= {"origin_file": rc.get("origin_file")} message_dict = {} @@ -226,27 +239,26 @@ def _parse_reply_chains(reply_chains: list[dict]) -> set[Message]: md = message_dict[k] if ( - md.get("messageType", "") == "RichText/Html" - or md.get("messageType", "") == "Text" + md.get("messagetype", "") == "RichText/Html" + or md.get("messagetype", "") == "Text" ): - rc |= rc.get("value", {}) - rc |= {"cached_deduplication_key": md.get("dedupeKey")} - rc |= {"clientmessageid": md.get("clientMessageId")} - rc |= {"composetime": md.get("clientArrivalTime")} + rc |= {"cached_deduplication_key": md.get("cachedDeduplicationKey")} + rc |= {"clientmessageid": md.get("clientmessageid")} + rc |= {"composetime": md.get("composetime")} rc |= {"conversation_id": md.get("conversationId")} rc |= {"content": md.get("content")} - rc |= {"contenttype": md.get("contentType")} - rc |= {"created_time": md.get("clientArrivalTime")} - rc |= {"creator": md.get("version")} - rc |= {"is_from_me": md.get("isSentByCurrentUser")} - rc |= {"messagetype": md.get("messageType")} - rc |= {"originalArrivalTime": md.get("version")} + rc |= {"contenttype": md.get("contenttype")} + rc |= {"created_time": md.get("createdTime")} + rc |= {"creator": md.get("creator")} + rc |= {"is_from_me": md.get("isFromMe")} + rc |= {"messagetype": md.get("messagetype")} + rc |= {"messageKind": md.get("messageKind")} rc |= {"client_arrival_time": md.get("clientArrivalTime")} - rc |= {"original_arrival_time": md.get("clientArrivalTime")} + rc |= {"original_arrival_time": md.get("originalarrivaltime")} rc |= {"version": md.get("version")} rc |= {"properties": md.get("properties")} - cleaned_reply_chains.add(Message.from_dict(rc)) + cleaned_reply_chains.add(Message.from_dict(rc)) return cleaned_reply_chains @@ -268,7 +280,7 @@ def parse_records(records: list[dict]) -> list[dict]: # sort within groups i.e., Contacts, Meetings, Conversations parsed_records = ( sorted(_parse_people(people)) - # + sorted(_parse_buddies(buddies)) + + sorted(_parse_buddies(buddies)) + sorted(_parse_reply_chains(reply_chains)) + sorted(_parse_conversations(conversations)) ) @@ -284,4 +296,4 @@ def process_db(input_path: Path, output_path: Path, blob_path: 
Path=None, do_not extracted_values = parse_db(input_path, blob_path, do_not_filter) parsed_records = parse_records(extracted_values) - write_results_to_json(parsed_records, output_path) + write_results_to_json(parsed_records, output_path) \ No newline at end of file From 2147d4107fab9291dba6f92d528936d31e465139 Mon Sep 17 00:00:00 2001 From: Alexander Bilz Date: Sun, 21 Jan 2024 15:47:30 +0100 Subject: [PATCH 20/25] fix: parse v1 and v2 separately where necessary --- src/forensicsim/parser.py | 150 ++++++++++++++++++++++++++++---------- 1 file changed, 111 insertions(+), 39 deletions(-) diff --git a/src/forensicsim/parser.py b/src/forensicsim/parser.py index 9664e5a..a9d6b38 100644 --- a/src/forensicsim/parser.py +++ b/src/forensicsim/parser.py @@ -179,42 +179,76 @@ def __lt__(self, other: object) -> bool: return self.mri < other.mri -def _parse_people(people: list[dict]) -> set[Contact]: +def _parse_people(people: list[dict], version: str) -> set[Contact]: parsed_people = set() + for p in people: + # Skip empty records + if p["value"] is None: + continue + + # Fetch relevant data p |= p.get("value", {}) p |= {"origin_file": p.get("origin_file")} + # Skip contacts without an MRI + if p.get("mri") is None: + continue + + if version == "v1" or version == "v2": + p |= {"display_name": p.get("displayName")} + p |= {"email": p.get("email")} + p |= {"mri": p.get("mri")} + p |= {"user_principal_name": p.get("userPrincipalName")} + else: + print("Teams Version is unknown. Can not extract records of type people.") + + parsed_people.add(Contact.from_dict(p)) return parsed_people -def _parse_buddies(buddies: list[dict]) -> set[Contact]: +def _parse_buddies(buddies: list[dict], version: str) -> set[Contact]: parsed_buddies = set() + for b in buddies: - buddies_of_b = b.get("value", {}).get("buddies", []) - for b_of_b in buddies_of_b: - b_of_b |= {"origin_file": b.get("origin_file")} - parsed_buddies.add(Contact.from_dict(b_of_b)) + # Skip empty records + if b["value"] is None: + continue + # Fetch relevant data + if version == "v1" or version == "v2": + buddies_of_b = b.get("value", {}).get("buddies", []) + for b_of_b in buddies_of_b: + b_of_b |= {"origin_file": b.get("origin_file")} + parsed_buddies.add(Contact.from_dict(b_of_b)) + else: + print("Teams Version is unknown. Can not extract records of type buddies.") return parsed_buddies # Conversations can contain multiple artefacts # -> If type:Meeting then its a meeting -def _parse_conversations(conversations: list[dict]) -> set[Meeting]: +def _parse_conversations(conversations: list[dict], version: str) -> set[Meeting]: cleaned_conversations = set() for c in conversations: - if c.get("value", {}).get("type", "") == "Meeting" and "meeting" in c.get( - "value", {} - ).get("threadProperties", {}): - c_value = c.get("value", {}) - c |= c_value - c |= {"thread_properties": c_value.get("threadProperties", {})} - c |= {"cached_deduplication_key": c.get("id")} - cleaned_conversations.add(Meeting.from_dict(c)) + # Skip empty records + if c["value"] is None: + continue + # Fetch relevant data + if version == "v1" or version == "v2": + if c.get("value", {}).get("type", "") == "Meeting" and "meeting" in c.get( + "value", {} + ).get("threadProperties", {}): + c_value = c.get("value", {}) + c |= c_value + c |= {"thread_properties": c_value.get("threadProperties", {})} + c |= {"cached_deduplication_key": c.get("id")} + cleaned_conversations.add(Meeting.from_dict(c)) + else: + print("Teams Version is unknown. 
Can not extract records of type meeting.") return cleaned_conversations -def _parse_reply_chains(reply_chains: list[dict]) -> set[Message]: +def _parse_reply_chains(reply_chains: list[dict], version: str) -> set[Message]: cleaned_reply_chains = set() for rc in reply_chains: @@ -223,49 +257,84 @@ def _parse_reply_chains(reply_chains: list[dict]) -> set[Message]: if rc["value"] is None: continue + # Fetch relevant data rc |= rc.get("value", {}) rc |= {"origin_file": rc.get("origin_file")} message_dict = {} - if rc.get("value", {}).get("messageMap", {}) or rc.get("value", {}).get( - "messages", {} - ): - if rc.get("value", {}).get("messageMap", {}): - message_dict = rc.get("value", {}).get("messageMap", {}) - else: - message_dict = rc.get("value", {}).get("messages", {}) + if version == "v1": + message_dict = rc.get("value", {}).get("messages", {}) + elif version == "v2": + message_dict = rc.get("value", {}).get("messageMap", {}) + else: + print("Teams Version is unknown. Can not extract records of type reply_chains.") + continue for k in message_dict: md = message_dict[k] - if ( md.get("messagetype", "") == "RichText/Html" or md.get("messagetype", "") == "Text" + or md.get("messageType", "") == "RichText/Html" + or md.get("messageType", "") == "Text" ): - rc |= {"cached_deduplication_key": md.get("cachedDeduplicationKey")} - rc |= {"clientmessageid": md.get("clientmessageid")} - rc |= {"composetime": md.get("composetime")} + if version == "v1": + + rc |= {"cached_deduplication_key": md.get("cachedDeduplicationKey")} + rc |= {"clientmessageid": md.get("clientmessageid")} + rc |= {"composetime": md.get("composetime")} + rc |= {"contenttype": md.get("contenttype")} + rc |= {"created_time": md.get("createdTime")} + rc |= {"is_from_me": md.get("isFromMe")} + rc |= {"messagetype": md.get("messagetype")} + rc |= {"messageKind": md.get("messageKind")} + rc |= {"original_arrival_time": md.get("originalarrivaltime")} + + elif version == "v2": + rc |= {"cached_deduplication_key": md.get("dedupeKey")} + rc |= {"clientmessageid": md.get("clientMessageId")} + # set to clientArrivalTime as compose Time is no longer present + rc |= {"composetime": md.get("clientArrivalTime")} + rc |= {"contenttype": md.get("contentType")} + # set to clientArrivalTime as created time is no longer present + rc |= {"created_time": md.get("clientArrivalTime")} + rc |= {"is_from_me": md.get("isSentByCurrentUser")} + rc |= {"messagetype": md.get("messageType")} + rc |= {"original_arrival_time": md.get("originalArrivalTime")} + + # Similar across versions + rc |= {"creator": md.get("creator")} rc |= {"conversation_id": md.get("conversationId")} rc |= {"content": md.get("content")} - rc |= {"contenttype": md.get("contenttype")} - rc |= {"created_time": md.get("createdTime")} - rc |= {"creator": md.get("creator")} - rc |= {"is_from_me": md.get("isFromMe")} - rc |= {"messagetype": md.get("messagetype")} - rc |= {"messageKind": md.get("messageKind")} rc |= {"client_arrival_time": md.get("clientArrivalTime")} - rc |= {"original_arrival_time": md.get("originalarrivaltime")} rc |= {"version": md.get("version")} rc |= {"properties": md.get("properties")} cleaned_reply_chains.add(Message.from_dict(rc)) - + return cleaned_reply_chains +def identify_teams_version(reply_chains: list[dict]) -> str: + # Identify version based on reply chain structure + fingerprint_teams_version = "" + for rc in reply_chains: + rc |= rc.get("value", {}) + if rc.get("value", {}).get("messages", {}): + fingerprint_teams_version = "v1" + break + elif 
rc.get("value", {}).get("messageMap", {}): + fingerprint_teams_version = "v2" + break + else: + fingerprint_teams_version = "unknown" + + return fingerprint_teams_version + def parse_records(records: list[dict]) -> list[dict]: people, buddies, reply_chains, conversations = [], [], [], [] + for r in records: store = r.get("store", "other") if store == "people": @@ -277,12 +346,15 @@ def parse_records(records: list[dict]) -> list[dict]: elif store == "conversations": conversations.append(r) + # identify version + version = identify_teams_version(reply_chains) + # sort within groups i.e., Contacts, Meetings, Conversations parsed_records = ( - sorted(_parse_people(people)) - + sorted(_parse_buddies(buddies)) - + sorted(_parse_reply_chains(reply_chains)) - + sorted(_parse_conversations(conversations)) + sorted(_parse_people(people, version)) + + sorted(_parse_buddies(buddies, version)) + + sorted(_parse_reply_chains(reply_chains, version)) + + sorted(_parse_conversations(conversations, version)) ) return [r.to_dict() for r in parsed_records] From 7895b2206a69019854acae09460d60527656c284 Mon Sep 17 00:00:00 2001 From: Alexander Bilz Date: Sun, 21 Jan 2024 15:48:00 +0100 Subject: [PATCH 21/25] fix: skip unnecessary object stores in parser mode --- src/forensicsim/backend.py | 2 +- tools/main.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/forensicsim/backend.py b/src/forensicsim/backend.py index eea7b1c..59fe643 100644 --- a/src/forensicsim/backend.py +++ b/src/forensicsim/backend.py @@ -64,7 +64,7 @@ def parse_db(filepath : Path, blobpath: Path = None, do_not_filter: bool=False): # Skip empty object stores if obj_store_name is None: continue - if obj_store_name in TEAMS_DB_OBJECT_STORES or do_not_filter: + if obj_store_name in TEAMS_DB_OBJECT_STORES or do_not_filter is False: obj_store = db[obj_store_name] records_per_object_store = 0 for record in obj_store.iterate_records(): diff --git a/tools/main.py b/tools/main.py index 4b05f3d..e21c7e2 100644 --- a/tools/main.py +++ b/tools/main.py @@ -61,7 +61,7 @@ def process_cmd(filepath: Path, outputpath: Path, blobpath: Path) -> None: click.echo(XTRACT_HEADER) - process_db(filepath, outputpath, blobpath, False) + process_db(filepath, outputpath, blobpath, True) if __name__ == "__main__": From 68c7570029ae83c6fee577b2d0bb55701a085671 Mon Sep 17 00:00:00 2001 From: Alexander Bilz Date: Sun, 21 Jan 2024 16:00:50 +0100 Subject: [PATCH 22/25] chore: fix ruff errors --- src/forensicsim/backend.py | 9 +++++-- src/forensicsim/parser.py | 51 ++++++++++++++++++++------------------ tools/dump_leveldb.py | 14 ++++++----- tools/main.py | 5 +--- 4 files changed, 43 insertions(+), 36 deletions(-) diff --git a/src/forensicsim/backend.py b/src/forensicsim/backend.py index 59fe643..514c2a0 100644 --- a/src/forensicsim/backend.py +++ b/src/forensicsim/backend.py @@ -24,6 +24,7 @@ import json from pathlib import Path +from typing import Optional from chromedb import ( ccl_chromium_indexeddb, @@ -46,11 +47,15 @@ """ -def parse_db(filepath : Path, blobpath: Path = None, do_not_filter: bool=False): +def parse_db( + filepath: Path, + blobpath: Optional[Path] = None, + do_not_filter: Optional[bool] = False, +): # Open raw access to a LevelDB and deserialize the records. 
wrapper = ccl_chromium_indexeddb.WrappedIndexDB(filepath, blobpath) - + extracted_values = [] for db_info in wrapper.database_ids: diff --git a/src/forensicsim/parser.py b/src/forensicsim/parser.py index a9d6b38..3ec3084 100644 --- a/src/forensicsim/parser.py +++ b/src/forensicsim/parser.py @@ -186,8 +186,8 @@ def _parse_people(people: list[dict], version: str) -> set[Contact]: # Skip empty records if p["value"] is None: continue - - # Fetch relevant data + + # Fetch relevant data p |= p.get("value", {}) p |= {"origin_file": p.get("origin_file")} @@ -201,8 +201,7 @@ def _parse_people(people: list[dict], version: str) -> set[Contact]: p |= {"mri": p.get("mri")} p |= {"user_principal_name": p.get("userPrincipalName")} else: - print("Teams Version is unknown. Can not extract records of type people.") - + print("Teams Version is unknown. Can not extract records of type people.") parsed_people.add(Contact.from_dict(p)) return parsed_people @@ -215,16 +214,17 @@ def _parse_buddies(buddies: list[dict], version: str) -> set[Contact]: # Skip empty records if b["value"] is None: continue - # Fetch relevant data + # Fetch relevant data if version == "v1" or version == "v2": buddies_of_b = b.get("value", {}).get("buddies", []) for b_of_b in buddies_of_b: b_of_b |= {"origin_file": b.get("origin_file")} parsed_buddies.add(Contact.from_dict(b_of_b)) else: - print("Teams Version is unknown. Can not extract records of type buddies.") + print("Teams Version is unknown. Can not extract records of type buddies.") return parsed_buddies + # Conversations can contain multiple artefacts # -> If type:Meeting then its a meeting def _parse_conversations(conversations: list[dict], version: str) -> set[Meeting]: @@ -233,7 +233,7 @@ def _parse_conversations(conversations: list[dict], version: str) -> set[Meeting # Skip empty records if c["value"] is None: continue - # Fetch relevant data + # Fetch relevant data if version == "v1" or version == "v2": if c.get("value", {}).get("type", "") == "Meeting" and "meeting" in c.get( "value", {} @@ -244,20 +244,18 @@ def _parse_conversations(conversations: list[dict], version: str) -> set[Meeting c |= {"cached_deduplication_key": c.get("id")} cleaned_conversations.add(Meeting.from_dict(c)) else: - print("Teams Version is unknown. Can not extract records of type meeting.") + print("Teams Version is unknown. Can not extract records of type meeting.") return cleaned_conversations def _parse_reply_chains(reply_chains: list[dict], version: str) -> set[Message]: cleaned_reply_chains = set() for rc in reply_chains: - - # Skip empty records if rc["value"] is None: continue - - # Fetch relevant data + + # Fetch relevant data rc |= rc.get("value", {}) rc |= {"origin_file": rc.get("origin_file")} @@ -267,7 +265,9 @@ def _parse_reply_chains(reply_chains: list[dict], version: str) -> set[Message]: elif version == "v2": message_dict = rc.get("value", {}).get("messageMap", {}) else: - print("Teams Version is unknown. Can not extract records of type reply_chains.") + print( + "Teams Version is unknown. Can not extract records of type reply_chains." 
+ ) continue for k in message_dict: @@ -279,7 +279,6 @@ def _parse_reply_chains(reply_chains: list[dict], version: str) -> set[Message]: or md.get("messageType", "") == "Text" ): if version == "v1": - rc |= {"cached_deduplication_key": md.get("cachedDeduplicationKey")} rc |= {"clientmessageid": md.get("clientmessageid")} rc |= {"composetime": md.get("composetime")} @@ -289,7 +288,7 @@ def _parse_reply_chains(reply_chains: list[dict], version: str) -> set[Message]: rc |= {"messagetype": md.get("messagetype")} rc |= {"messageKind": md.get("messageKind")} rc |= {"original_arrival_time": md.get("originalarrivaltime")} - + elif version == "v2": rc |= {"cached_deduplication_key": md.get("dedupeKey")} rc |= {"clientmessageid": md.get("clientMessageId")} @@ -301,7 +300,7 @@ def _parse_reply_chains(reply_chains: list[dict], version: str) -> set[Message]: rc |= {"is_from_me": md.get("isSentByCurrentUser")} rc |= {"messagetype": md.get("messageType")} rc |= {"original_arrival_time": md.get("originalArrivalTime")} - + # Similar across versions rc |= {"creator": md.get("creator")} rc |= {"conversation_id": md.get("conversationId")} @@ -311,7 +310,7 @@ def _parse_reply_chains(reply_chains: list[dict], version: str) -> set[Message]: rc |= {"properties": md.get("properties")} cleaned_reply_chains.add(Message.from_dict(rc)) - + return cleaned_reply_chains @@ -321,20 +320,19 @@ def identify_teams_version(reply_chains: list[dict]) -> str: for rc in reply_chains: rc |= rc.get("value", {}) if rc.get("value", {}).get("messages", {}): - fingerprint_teams_version = "v1" - break + fingerprint_teams_version = "v1" elif rc.get("value", {}).get("messageMap", {}): - fingerprint_teams_version = "v2" - break + fingerprint_teams_version = "v2" else: fingerprint_teams_version = "unknown" + break return fingerprint_teams_version + def parse_records(records: list[dict]) -> list[dict]: people, buddies, reply_chains, conversations = [], [], [], [] - for r in records: store = r.get("store", "other") if store == "people": @@ -359,7 +357,12 @@ def parse_records(records: list[dict]) -> list[dict]: return [r.to_dict() for r in parsed_records] -def process_db(input_path: Path, output_path: Path, blob_path: Path=None, do_not_filter: bool = True) -> None: +def process_db( + input_path: Path, + output_path: Path, + blob_path: Optional[Path] = None, + do_not_filter: Optional[bool] = True, +) -> None: if not input_path.parts[-1].endswith(".leveldb"): raise ValueError(f"Expected a leveldb folder. 
Path: {input_path}") @@ -368,4 +371,4 @@ def process_db(input_path: Path, output_path: Path, blob_path: Path=None, do_not extracted_values = parse_db(input_path, blob_path, do_not_filter) parsed_records = parse_records(extracted_values) - write_results_to_json(parsed_records, output_path) \ No newline at end of file + write_results_to_json(parsed_records, output_path) diff --git a/tools/dump_leveldb.py b/tools/dump_leveldb.py index 3f6fe63..5b9241e 100644 --- a/tools/dump_leveldb.py +++ b/tools/dump_leveldb.py @@ -23,15 +23,18 @@ """ from pathlib import Path +from typing import Optional import click from forensicsim.backend import write_results_to_json -from forensicsim.parser import parse_db from forensicsim.consts import DUMP_HEADER +from forensicsim.parser import parse_db -def process_level_db(input_path: Path, output_path: Path, blob_path: Path=None): +def process_level_db( + input_path: Path, output_path: Path, blob_path: Optional[Path] = None +) -> None: # convert the database to a python list with nested dictionaries extracted_values = parse_db(input_path, blob_path, do_not_filter=True) @@ -49,7 +52,6 @@ def process_level_db(input_path: Path, output_path: Path, blob_path: Path=None): required=True, help="File path to the .leveldb folder of the IndexedDB.", ) - @click.option( "-o", "--outputpath", @@ -57,7 +59,6 @@ def process_level_db(input_path: Path, output_path: Path, blob_path: Path=None): required=True, help="File path to the processed output.", ) - @click.option( "-b", "--blobpath", @@ -67,8 +68,9 @@ def process_level_db(input_path: Path, output_path: Path, blob_path: Path=None): required=False, help="File path to the .blob folder of the IndexedDB.", ) - -def process_cmd(filepath: Path, outputpath: Path, blobpath: Path) -> None: +def process_cmd( + filepath: Path, outputpath: Path, blobpath: Optional[Path] = None +) -> None: click.echo(DUMP_HEADER) process_level_db(filepath, outputpath, blobpath) diff --git a/tools/main.py b/tools/main.py index e21c7e2..db4ec0d 100644 --- a/tools/main.py +++ b/tools/main.py @@ -26,10 +26,10 @@ import click -from forensicsim.backend import parse_db, write_results_to_json from forensicsim.consts import XTRACT_HEADER from forensicsim.parser import process_db + @click.command() @click.option( "-f", @@ -40,7 +40,6 @@ required=True, help="File path to the .leveldb folder of the IndexedDB.", ) - @click.option( "-o", "--outputpath", @@ -48,7 +47,6 @@ required=True, help="File path to the processed output.", ) - @click.option( "-b", "--blobpath", @@ -58,7 +56,6 @@ required=False, help="File path to the .blob folder of the IndexedDB.", ) - def process_cmd(filepath: Path, outputpath: Path, blobpath: Path) -> None: click.echo(XTRACT_HEADER) process_db(filepath, outputpath, blobpath, True) From fc8d6c1fb67b05a7143272c9d5f14e425555ff52 Mon Sep 17 00:00:00 2001 From: Alexander Bilz Date: Sun, 21 Jan 2024 16:14:47 +0100 Subject: [PATCH 23/25] fix: mypi errors --- src/forensicsim/backend.py | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/src/forensicsim/backend.py b/src/forensicsim/backend.py index 514c2a0..3cdfc84 100644 --- a/src/forensicsim/backend.py +++ b/src/forensicsim/backend.py @@ -24,7 +24,7 @@ import json from pathlib import Path -from typing import Optional +from typing import Optional, List, Any from chromedb import ( ccl_chromium_indexeddb, @@ -51,7 +51,7 @@ def parse_db( filepath: Path, blobpath: Optional[Path] = None, do_not_filter: Optional[bool] = False, -): +) -> List[dict[str, Any]]: # Open raw access 
to a LevelDB and deserialize the records. wrapper = ccl_chromium_indexeddb.WrappedIndexDB(filepath, blobpath) @@ -92,7 +92,7 @@ def parse_db( return extracted_values -def parse_localstorage(filepath): +def parse_localstorage(filepath: Path) -> List[dict[str, Any]]: local_store = ccl_chromium_localstorage.LocalStoreDb(filepath) extracted_values = [] for record in local_store.iter_all_records(): @@ -103,7 +103,7 @@ def parse_localstorage(filepath): return extracted_values -def parse_sessionstorage(filepath): +def parse_sessionstorage(filepath: Path) -> List[dict[str, Any]]: session_storage = ccl_chromium_sessionstorage.SessionStoreDb(filepath) extracted_values = [] for host in session_storage: @@ -124,7 +124,7 @@ def parse_sessionstorage(filepath): return extracted_values -def write_results_to_json(data, outputpath): +def write_results_to_json(data: List[dict[str, Any]], outputpath: Path) -> None: # Dump messages into a json file try: with open(outputpath, "w", encoding="utf-8") as f: @@ -133,12 +133,3 @@ def write_results_to_json(data, outputpath): ) except OSError as e: print(e) - - -def parse_json(): - # read data from a file. This is only for testing purpose. - try: - with Path("teams.json").open() as json_file: - return json.load(json_file) - except OSError as e: - print(e) From cdf21846efc31f4942dbe75bd77ac814ac487034 Mon Sep 17 00:00:00 2001 From: Alexander Bilz Date: Sun, 21 Jan 2024 16:22:15 +0100 Subject: [PATCH 24/25] fix: fix more ruff errors --- src/forensicsim/backend.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/forensicsim/backend.py b/src/forensicsim/backend.py index 3cdfc84..fc58e50 100644 --- a/src/forensicsim/backend.py +++ b/src/forensicsim/backend.py @@ -24,7 +24,7 @@ import json from pathlib import Path -from typing import Optional, List, Any +from typing import Any, Optional from chromedb import ( ccl_chromium_indexeddb, @@ -51,7 +51,7 @@ def parse_db( filepath: Path, blobpath: Optional[Path] = None, do_not_filter: Optional[bool] = False, -) -> List[dict[str, Any]]: +) -> list[dict[str, Any]]: # Open raw access to a LevelDB and deserialize the records. 
wrapper = ccl_chromium_indexeddb.WrappedIndexDB(filepath, blobpath) @@ -92,7 +92,7 @@ def parse_db( return extracted_values -def parse_localstorage(filepath: Path) -> List[dict[str, Any]]: +def parse_localstorage(filepath: Path) -> list[dict[str, Any]]: local_store = ccl_chromium_localstorage.LocalStoreDb(filepath) extracted_values = [] for record in local_store.iter_all_records(): @@ -103,7 +103,7 @@ def parse_localstorage(filepath: Path) -> List[dict[str, Any]]: return extracted_values -def parse_sessionstorage(filepath: Path) -> List[dict[str, Any]]: +def parse_sessionstorage(filepath: Path) -> list[dict[str, Any]]: session_storage = ccl_chromium_sessionstorage.SessionStoreDb(filepath) extracted_values = [] for host in session_storage: @@ -124,7 +124,7 @@ def parse_sessionstorage(filepath: Path) -> List[dict[str, Any]]: return extracted_values -def write_results_to_json(data: List[dict[str, Any]], outputpath: Path) -> None: +def write_results_to_json(data: list[dict[str, Any]], outputpath: Path) -> None: # Dump messages into a json file try: with open(outputpath, "w", encoding="utf-8") as f: From 98f381033cd2737051a13e18941d6fd12740460b Mon Sep 17 00:00:00 2001 From: Alexander Bilz Date: Sun, 21 Jan 2024 16:31:29 +0100 Subject: [PATCH 25/25] chore: remove upload of files and git diff --- .github/workflows/build.yaml | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index cc9b8f1..8e208b3 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -26,21 +26,9 @@ jobs: .\dist\ms_teams_parser.exe -f ".\forensicsim-data\john_doe_v_1_4_00_11161\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "john_doe_v_1_4_00_11161.json" .\dist\ms_teams_parser.exe -f ".\forensicsim-data\karelze_v_23306_3309_2530_1346\IndexedDB\https_teams.live.com_0.indexeddb.leveldb" -o "karelze.json" .\dist\ms_teams_parser.exe -f ".\forensicsim-data\mboufahja_v_23231_413_2355_7555\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "mboufahja.json" - - name: Upload resultsπŸ“² - uses: actions/upload-artifact@v4 - with: - name: generated-outputs - path: | - jane_doe_v_1_4_00_11161.json - john_doe_v_1_4_00_11161.json - retention-days: 1 - - name: Test calling script πŸ“ž + - name: Test calling script πŸ–±οΈ run: | python tools/main.py --help python tools/dump_leveldb.py --help python tools/dump_localstorage.py --help python tools/dump_sessionstorage.py --help - - name: Calculate diff πŸ‘½ - run: | - git diff --no-index --word-diff .\forensicsim-data\expected-result\jane_doe_v_1_4_00_11161.json jane_doe_v_1_4_00_11161.json - git diff --no-index --word-diff .\forensicsim-data\expected-result\john_doe_v_1_4_00_11161.json john_doe_v_1_4_00_11161.json
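
The v1/v2 detection introduced in PATCH 20/25 rests on a single structural difference: classic Teams (v1) reply-chain records keep their messages under a "messages" key, whereas the New Teams client (v2) stores them under "messageMap". The following is a minimal sketch of that fingerprinting heuristic, not the shipped identify_teams_version itself; the record shape (a dict with a "value" sub-dictionary, as produced by parse_db) is taken from the diffs above, and the sample records are invented for illustration.

from typing import Any


def fingerprint_teams_version(reply_chains: list[dict[str, Any]]) -> str:
    # Return the schema version of the first reply chain that reveals one.
    for rc in reply_chains:
        value = rc.get("value") or {}
        if value.get("messages"):
            return "v1"  # classic Teams 1.x
        if value.get("messageMap"):
            return "v2"  # New Teams 2.x
    return "unknown"


v1_record = {"value": {"messages": {"1": {"messagetype": "Text"}}}}
v2_record = {"value": {"messageMap": {"1": {"messageType": "Text"}}}}
print(fingerprint_teams_version([v1_record]))  # -> v1
print(fingerprint_teams_version([v2_record]))  # -> v2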
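
PATCH 19/25 additionally reclassifies parsed messages by inspecting their decoded properties: a record whose properties contain a call log is reported as a "call" and one that contains an activity as a "reaction". A self-contained sketch of that rule follows; the marker keys "call-log" and "activity" come from the diff, while the helper name and the sample payloads are illustrative only.

from typing import Any


def classify_message(properties: dict[str, Any]) -> str:
    # Mirrors the checks added to Message.__post_init__ in PATCH 19/25:
    # the record type stays "message" unless a marker key is present.
    if "call-log" in properties:
        return "call"
    if "activity" in properties:
        return "reaction"
    return "message"


print(classify_message({"call-log": {"callDuration": 42}}))   # -> call
print(classify_message({"activity": {"activityType": "*"}}))  # -> reaction
print(classify_message({}))                                   # -> message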
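
With the .blob support and extended process_db signature from PATCH 16/25, the conversion can also be driven from Python instead of the ms_teams_parser.exe calls in the workflow. The snippet below is a hypothetical invocation under the assumption that the forensicsim package from this series is installed and that the dataset paths exist; the sibling .blob folder naming follows Chromium's usual IndexedDB layout and is passed only if present.

from pathlib import Path

from forensicsim.parser import process_db

leveldb_path = Path(
    r".\forensicsim-data\karelze_v_23306_3309_2530_1346\IndexedDB"
    r"\https_teams.live.com_0.indexeddb.leveldb"
)
# Chromium keeps large values in a sibling *.indexeddb.blob folder.
blob_path = leveldb_path.with_suffix(".blob")
output_path = Path("karelze.json")

process_db(
    leveldb_path,
    output_path,
    blob_path if blob_path.exists() else None,
    True,  # same do_not_filter value that tools/main.py passes after PATCH 21/25
)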