From 897dd8e8942b4884de7a81cb75e073a9424c9980 Mon Sep 17 00:00:00 2001 From: Markus Bilz Date: Mon, 8 Jan 2024 14:34:58 +0100 Subject: [PATCH 01/25] feat: extend gh action to run on MS Teams 2.0 datasets --- .github/workflows/build.yaml | 5 +++-- .gitignore | 2 ++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index f351794..0b709ea 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -20,8 +20,9 @@ jobs: run: pyinstaller "main.spec" - name: Run conversion ↩️ run: | - .\dist\ms_teams_parser.exe -f ".\forensicsim-data\jane_doe_old_teams\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "jane_doe.json" - .\dist\ms_teams_parser.exe -f ".\forensicsim-data\john_doe_old_teams\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "john_doe.json" + .\dist\ms_teams_parser.exe -f ".\forensicsim-data\jane_doe_old_teams\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "jane_doe_old_teams.json" + .\dist\ms_teams_parser.exe -f ".\forensicsim-data\john_doe_old_teams\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "john_doe_old_teams.json" + .\dist\ms_teams_parser.exe -f ".\forensicsim-data\mboufahja_new_teams\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "mboufahja_new_teams.json" - name: Test calling script 📞 run: | python utils/dump_leveldb.py --help python utils/dump_localstorage.py --help python utils/dump_sessionstorage.py --help diff --git a/.gitignore b/.gitignore index 34dd43e..efe303c 100644 --- a/.gitignore +++ b/.gitignore @@ -129,3 +129,5 @@ dmypy.json venv/ env/ + +testing/ From d4adb8d58e87baff96506e0edccde5c48e95570b Mon Sep 17 00:00:00 2001 From: Markus Bilz Date: Mon, 8 Jan 2024 17:46:43 +0100 Subject: [PATCH 02/25] feat: add new datasets for teams v2 --- .github/workflows/build.yaml | 63 +++++++++++++++++++----------------- forensicsim-data | 2 +- 2 files changed, 34 insertions(+), 31 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 0b709ea..91c220b 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -1,35 +1,38 @@ on: - push: + push: jobs: - build: - name: Build exectuable 📦 - runs-on: windows-latest - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: "3.9" - - name: Install dependencies - run: | - python -m pip install --upgrade pip - python -m pip install -r requirements.txt pyinstaller - - name: Build binary 🔒 - run: pyinstaller "main.spec" - - name: Run conversion ↩️ - run: | - .\dist\ms_teams_parser.exe -f ".\forensicsim-data\jane_doe_old_teams\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "jane_doe_old_teams.json" - .\dist\ms_teams_parser.exe -f ".\forensicsim-data\john_doe_old_teams\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "john_doe_old_teams.json" - .\dist\ms_teams_parser.exe -f ".\forensicsim-data\mboufahja_new_teams\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "mboufahja_new_teams.json" - - name: Test calling script 📞 - run: | - python utils/dump_leveldb.py --help - python utils/dump_localstorage.py --help - python utils/dump_sessionstorage.py --help + build: + name: Build exectuable 📦 + runs-on: windows-latest + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.9" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install -r 
requirements.txt pyinstaller + - name: Build binary 🔒 + run: pyinstaller "main.spec" + - name: Run conversion ↩️ + run: | + .\dist\ms_teams_parser.exe -f ".\forensicsim-data\jane_doe_v_1_4_00_11161\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "jane_doe.json" + .\dist\ms_teams_parser.exe -f ".\forensicsim-data\john_doe_v_1_4_00_11161\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "john_doe.json" + .\dist\ms_teams_parser.exe -f ".\forensicsim-data\karelze_v_23306_3309_2530_1346\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "john_doe.json" + .\dist\ms_teams_parser.exe -f ".\forensicsim-data\mboufahja_v_23231_413_2355_7555\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "john_doe.json" + continue-on-error: true + - name: Test calling script 📞 + run: | + python utils/dump_leveldb.py --help + python utils/dump_localstorage.py --help + python utils/dump_sessionstorage.py --help + continue-on-error: true # python utils/populate_teams.py --help # python utils/populate_teams_2.py --help # python utils/populate_skype.py --help - # - name: Calculate diff 👽 - # run: git diff --no-index --word-diff expected_output/john_doe.json current_output.json + # - name: Calculate diff 👽 + # run: git diff --no-index --word-diff expected_output/john_doe.json current_output.json diff --git a/forensicsim-data b/forensicsim-data index 68a144b..36e337f 160000 --- a/forensicsim-data +++ b/forensicsim-data @@ -1 +1 @@ -Subproject commit 68a144b3406fa3a532eee24b9181bb72a8ad691d +Subproject commit 36e337f7f0f05cd358f67b116621cb4ca995184b From f28295ee6da11973f3078edef0fba872a053c442 Mon Sep 17 00:00:00 2001 From: Markus Bilz Date: Mon, 8 Jan 2024 17:51:00 +0100 Subject: [PATCH 03/25] =?UTF-8?q?refactor:=20disable=20continue-on-error?= =?UTF-8?q?=20=F0=9F=AA=B2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/build.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 91c220b..8fc6af8 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -24,13 +24,13 @@ jobs: .\dist\ms_teams_parser.exe -f ".\forensicsim-data\john_doe_v_1_4_00_11161\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "john_doe.json" .\dist\ms_teams_parser.exe -f ".\forensicsim-data\karelze_v_23306_3309_2530_1346\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "john_doe.json" .\dist\ms_teams_parser.exe -f ".\forensicsim-data\mboufahja_v_23231_413_2355_7555\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "john_doe.json" - continue-on-error: true + continue-on-error: false - name: Test calling script 📞 run: | python utils/dump_leveldb.py --help python utils/dump_localstorage.py --help python utils/dump_sessionstorage.py --help - continue-on-error: true + continue-on-error: false # python utils/populate_teams.py --help # python utils/populate_teams_2.py --help # python utils/populate_skype.py --help From cfff8f3371f98d41c4ed27f7c4dc7479c21972b3 Mon Sep 17 00:00:00 2001 From: Markus Bilz Date: Mon, 8 Jan 2024 17:55:00 +0100 Subject: [PATCH 04/25] Feat: fix paths of datasets in build job --- .github/workflows/build.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 8fc6af8..e05f08a 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -22,8 +22,8 @@ jobs: run: | 
.\dist\ms_teams_parser.exe -f ".\forensicsim-data\jane_doe_v_1_4_00_11161\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "jane_doe.json" .\dist\ms_teams_parser.exe -f ".\forensicsim-data\john_doe_v_1_4_00_11161\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "john_doe.json" - .\dist\ms_teams_parser.exe -f ".\forensicsim-data\karelze_v_23306_3309_2530_1346\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "john_doe.json" - .\dist\ms_teams_parser.exe -f ".\forensicsim-data\mboufahja_v_23231_413_2355_7555\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "john_doe.json" + .\dist\ms_teams_parser.exe -f ".\forensicsim-data\karelze_v_23306_3309_2530_1346\IndexedDB\https_teams.live.com_0.indexeddb.leveldb" -o "karelze.json" + .\dist\ms_teams_parser.exe -f ".\forensicsim-data\mboufahja_v_23231_413_2355_7555\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "mboufahja.json" continue-on-error: false - name: Test calling script πŸ“ž run: | From 18e69993220a2ec97b0e1f1d8cfc9e035d633848 Mon Sep 17 00:00:00 2001 From: Markus Bilz Date: Tue, 9 Jan 2024 10:48:50 +0100 Subject: [PATCH 05/25] feat: add mypy for type checking --- .pre-commit-config.yaml | 7 +++++++ pyproject.toml | 15 +++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 88fb5be..c6b876e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -14,6 +14,13 @@ repos: - id: debug-statements - id: end-of-file-fixer - id: mixed-line-ending + - repo: https://github.com/pre-commit/mirrors-mypy + rev: v1.8.0 + hooks: + - id: mypy + # yaml requires additional stubs. + # Similar to: https://stackoverflow.com/a/73603491/5755604 + additional_dependencies: ['types-PyYAML'] - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.1.11 hooks: diff --git a/pyproject.toml b/pyproject.toml index 31e963c..d8a30d2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,10 +35,24 @@ version = {attr = "forensicsim.__version__"} dev=[ "build", "pre-commit", + "mypy", "ruff", "tox", ] +[tool.mypy] + +python_version = "3.9" + +exclude = [ + "tools/" +] + +# https://github.com/python/mypy/issues/2410 +ignore_missing_imports = true +disallow_untyped_defs = true +disallow_untyped_calls = true +disallow_incomplete_defs = true [tool.ruff] @@ -103,6 +117,7 @@ commands = # Syntax Checks [testenv:lint] commands = + python -m mypy src/ python -m ruff --output-format=github src python -m ruff format src --check From ecfb39f9a4906226d815b37f234c7448cfe4d12b Mon Sep 17 00:00:00 2001 From: Markus Bilz Date: Tue, 9 Jan 2024 12:11:50 +0100 Subject: [PATCH 06/25] refactor: add typehints for mypy to backend.py --- pyproject.toml | 2 +- src/forensicsim/backend.py | 85 ++++++++++++++++++-------------------- 2 files changed, 41 insertions(+), 46 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d8a30d2..5c3b452 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -117,7 +117,7 @@ commands = # Syntax Checks [testenv:lint] commands = - python -m mypy src/ + python -m mypy src/backend.py python -m ruff --output-format=github src python -m ruff format src --check diff --git a/src/forensicsim/backend.py b/src/forensicsim/backend.py index 06ec4ff..7260792 100644 --- a/src/forensicsim/backend.py +++ b/src/forensicsim/backend.py @@ -24,7 +24,9 @@ import io import json -import os +from collections.abc import Iterator +from pathlib import Path +from typing import Any, Optional, Union from chromedb import ( ccl_blink_value_deserializer, @@ -35,7 
+37,10 @@ ccl_v8_value_deserializer, ) from chromedb.ccl_chromium_indexeddb import ( + DatabaseMetadata, DatabaseMetadataType, + GlobalMetadata, + ObjectStoreMetadata, ObjectStoreMetadataType, ) @@ -53,21 +58,21 @@ class FastIndexedDB: - def __init__(self, leveldb_dir: os.PathLike): + def __init__(self, leveldb_dir: Path): self._db = ccl_leveldb.RawLevelDb(leveldb_dir) - self._fetched_records = [] - self.global_metadata = None - self.database_metadata = None - self.object_store_meta = None + self._fetched_records: list[ccl_leveldb.Record] = [] + self.global_metadata: GlobalMetadata + self.database_metadata: DatabaseMetadata + self.object_store_meta: ObjectStoreMetadata self.fetch_data() - def fetch_data(self): - global_metadata_raw = {} + def fetch_data(self) -> None: + global_metadata_raw: dict[bytes, ccl_leveldb.Records] = {} + database_metadata_raw: dict[tuple, Any] = {} + objectstore_metadata_raw: dict[tuple, Any] = {} - database_metadata_raw = {} - objectstore_metadata_raw = {} + self._fetched_records.clear() - self._fetched_records = [] # Fetch the records only once for record in self._db.iterate_records_raw(): self._fetched_records.append(record) @@ -148,15 +153,21 @@ def fetch_data(self): objectstore_metadata_raw ) - def get_database_metadata(self, db_id: int, meta_type: DatabaseMetadataType): - return self.database_metadata.get_meta(db_id, meta_type) + def get_database_metadata( + self, db_id: int, meta_type: DatabaseMetadataType + ) -> Optional[Union[str, int]]: + if self.database_metadata: + return self.database_metadata.get_meta(db_id, meta_type) + return None def get_object_store_metadata( self, db_id: int, obj_store_id: int, meta_type: ObjectStoreMetadataType - ): - return self.object_store_meta.get_meta(db_id, obj_store_id, meta_type) + ) -> Optional[Any]: + if self.object_store_meta: + return self.object_store_meta.get_meta(db_id, obj_store_id, meta_type) + return None - def iterate_records(self, do_not_filter=False): + def __iter__(self) -> Iterator[dict[str, Any]]: blink_deserializer = ccl_blink_value_deserializer.BlinkV8Deserializer() # Loop through the databases and object stores based on their ids for global_id in self.global_metadata.db_ids: @@ -179,12 +190,10 @@ def iterate_records(self, do_not_filter=False): ) # print(f"\t Processing object store: {datastore}") - records_per_object_store = 0 - if datastore in TEAMS_DB_OBJECT_STORES or do_not_filter: + if datastore in TEAMS_DB_OBJECT_STORES: prefix = bytes([0, global_id.dbid_no, object_store_id, 1]) for record in self._fetched_records: if record.key.startswith(prefix): - records_per_object_store += 1 # Skip records with empty values as these cant properly decoded if record.value == b"": continue @@ -202,7 +211,7 @@ def iterate_records(self, do_not_filter=False): val_idx += 1 ( - _blink_version, + _, varint_raw, ) = ccl_chromium_indexeddb.le_varint_from_bytes( record.value[val_idx:] @@ -230,19 +239,14 @@ def iterate_records(self, do_not_filter=False): except Exception: # TODO Some proper error handling wouldn't hurt continue - # print(f"{datastore} {global_id.name} {records_per_object_store}") -def parse_db(filepath, do_not_filter=False): - # Open raw access to a LevelDB and deserialize the records. 
+def parse_db(filepath: Path) -> list[dict]: db = FastIndexedDB(filepath) - extracted_values = [] - for record in db.iterate_records(do_not_filter): - extracted_values.append(record) - return extracted_values + return list(db) -def parse_localstorage(filepath): +def parse_localstorage(filepath: Path) -> list[dict]: local_store = ccl_chromium_localstorage.LocalStoreDb(filepath) extracted_values = [] for record in local_store.iter_all_records(): @@ -253,7 +257,7 @@ def parse_localstorage(filepath): return extracted_values -def parse_sessionstorage(filepath): +def parse_sessionstorage(filepath: Path) -> list[dict]: session_storage = ccl_chromium_sessionstorage.SessionStoreDb(filepath) extracted_values = [] for host in session_storage: @@ -274,22 +278,13 @@ def parse_sessionstorage(filepath): return extracted_values -def write_results_to_json(data, outputpath): - # Dump messages into a json file - try: - with open(outputpath, "w", encoding="utf-8") as f: - json.dump( - data, f, indent=4, sort_keys=True, default=str, ensure_ascii=False - ) - except OSError as e: - print(e) +def write_results_to_json(data: object, outputpath: Path) -> None: + with outputpath.open("w", encoding="utf-8") as f: + json.dump(data, f, indent=4, sort_keys=True, default=str, ensure_ascii=False) -def parse_json(): +def parse_json() -> Any: # read data from a file. This is only for testing purpose. - try: - with open("teams.json") as json_file: - data = json.load(json_file) - return data - except OSError as e: - print(e) + with Path("teams.json").open() as json_file: + data = json.load(json_file) + return data From 6837e062702cac83ce5dd0d2fafd816e3270d156 Mon Sep 17 00:00:00 2001 From: Markus Bilz Date: Tue, 9 Jan 2024 12:16:06 +0100 Subject: [PATCH 07/25] feat: fix some typos in linting --- pyproject.toml | 2 +- src/forensicsim/backend.py | 8 ++------ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 5c3b452..0c60eaa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -117,7 +117,7 @@ commands = # Syntax Checks [testenv:lint] commands = - python -m mypy src/backend.py + python -m mypy src/forensicsim/backend.py python -m ruff --output-format=github src python -m ruff format src --check diff --git a/src/forensicsim/backend.py b/src/forensicsim/backend.py index 7260792..dc2fccf 100644 --- a/src/forensicsim/backend.py +++ b/src/forensicsim/backend.py @@ -156,16 +156,12 @@ def fetch_data(self) -> None: def get_database_metadata( self, db_id: int, meta_type: DatabaseMetadataType ) -> Optional[Union[str, int]]: - if self.database_metadata: - return self.database_metadata.get_meta(db_id, meta_type) - return None + return self.database_metadata.get_meta(db_id, meta_type) def get_object_store_metadata( self, db_id: int, obj_store_id: int, meta_type: ObjectStoreMetadataType ) -> Optional[Any]: - if self.object_store_meta: - return self.object_store_meta.get_meta(db_id, obj_store_id, meta_type) - return None + return self.object_store_meta.get_meta(db_id, obj_store_id, meta_type) def __iter__(self) -> Iterator[dict[str, Any]]: blink_deserializer = ccl_blink_value_deserializer.BlinkV8Deserializer() From 87245125474cf06082741a318b16dffe3bb53dc0 Mon Sep 17 00:00:00 2001 From: Markus Bilz Date: Tue, 9 Jan 2024 12:24:42 +0100 Subject: [PATCH 08/25] fix: unsupported operand type(s) for +: 'NoneType' and 'int' --- pyproject.toml | 2 +- src/forensicsim/backend.py | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 
0c60eaa..d770e2d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,7 @@ dependencies = [ "chromedb @ git+https://github.com/karelze/ccl_chrome_indexeddb@master", "pause~=0.3", "pyautogui~=0.9.54", -"pywinauto~=0.6.8" +#"pywinauto~=0.6.8" ] dynamic = ["version"] diff --git a/src/forensicsim/backend.py b/src/forensicsim/backend.py index dc2fccf..8536bca 100644 --- a/src/forensicsim/backend.py +++ b/src/forensicsim/backend.py @@ -172,12 +172,14 @@ def __iter__(self) -> Iterator[dict[str, Any]]: print(f"WARNING: Skipping database {global_id.name}") continue + max_object_stores = self.database_metadata.get_meta( + global_id.dbid_no, DatabaseMetadataType.MaximumObjectStoreId + ) + max_object_stores = max_object_stores if not max_object_stores else 0 + for object_store_id in range( 1, - self.database_metadata.get_meta( - global_id.dbid_no, DatabaseMetadataType.MaximumObjectStoreId - ) - + 1, + max_object_stores + 1, ): datastore = self.object_store_meta.get_meta( global_id.dbid_no, From b2c2a4a8fcff8a2b9bd276529e557894d2d753d7 Mon Sep 17 00:00:00 2001 From: Markus Bilz Date: Tue, 9 Jan 2024 12:30:14 +0100 Subject: [PATCH 09/25] feat: fix typo in 'backend.py' --- src/forensicsim/backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/forensicsim/backend.py b/src/forensicsim/backend.py index 8536bca..c95606e 100644 --- a/src/forensicsim/backend.py +++ b/src/forensicsim/backend.py @@ -175,7 +175,7 @@ def __iter__(self) -> Iterator[dict[str, Any]]: max_object_stores = self.database_metadata.get_meta( global_id.dbid_no, DatabaseMetadataType.MaximumObjectStoreId ) - max_object_stores = max_object_stores if not max_object_stores else 0 + max_object_stores = max_object_stores if max_object_stores else 0 for object_store_id in range( 1, From 48c2b89ab61febe08d8e7dc7d5134cfc58c7cd0e Mon Sep 17 00:00:00 2001 From: Markus Bilz Date: Tue, 16 Jan 2024 05:32:08 +0100 Subject: [PATCH 10/25] refactor: match api of updated chromedb --- pyproject.toml | 2 +- src/forensicsim/backend.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0f9bfb0..802acd5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ dependencies = [ "dataclasses-json", "pause", "pyautogui", -"pywinauto" +# "pywinauto" ] dynamic = ["version"] diff --git a/src/forensicsim/backend.py b/src/forensicsim/backend.py index 95afd6e..7c2b6b0 100644 --- a/src/forensicsim/backend.py +++ b/src/forensicsim/backend.py @@ -126,7 +126,7 @@ def fetch_data(self) -> None: ( objstore_id, varint_raw, - ) = ccl_chromium_indexeddb.custom_le_varint_from_bytes( + ) = ccl_chromium_indexeddb.le_varint_from_bytes( record.key[len(prefix_objectstore) :] ) except TypeError: @@ -198,7 +198,7 @@ def __iter__(self) -> Iterator[dict[str, Any]]: ( _value_version, varint_raw, - ) = ccl_chromium_indexeddb.custom_le_varint_from_bytes( + ) = ccl_chromium_indexeddb.le_varint_from_bytes( record.value ) val_idx = len(varint_raw) @@ -211,7 +211,7 @@ def __iter__(self) -> Iterator[dict[str, Any]]: ( _, varint_raw, - ) = ccl_chromium_indexeddb.custom_le_varint_from_bytes( + ) = ccl_chromium_indexeddb.le_varint_from_bytes( record.value[val_idx:] ) From fab391399b4d7ab60dcb58a5648baab41fe121b8 Mon Sep 17 00:00:00 2001 From: Markus Bilz Date: Tue, 16 Jan 2024 05:44:27 +0100 Subject: [PATCH 11/25] feat: fix ruff linting --- .github/workflows/build.yaml | 2 +- pyproject.toml | 2 +- src/forensicsim/backend.py | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git 
a/.github/workflows/build.yaml b/.github/workflows/build.yaml index d37b384..cc9b8f1 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -41,6 +41,6 @@ jobs: python tools/dump_localstorage.py --help python tools/dump_sessionstorage.py --help - name: Calculate diff πŸ‘½ - run: | + run: | git diff --no-index --word-diff .\forensicsim-data\expected-result\jane_doe_v_1_4_00_11161.json jane_doe_v_1_4_00_11161.json git diff --no-index --word-diff .\forensicsim-data\expected-result\john_doe_v_1_4_00_11161.json john_doe_v_1_4_00_11161.json diff --git a/pyproject.toml b/pyproject.toml index 802acd5..ef39a50 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,7 @@ dev=[ python_version = "3.9" exclude = [ - "tools/" + "tools/**.py" ] # https://github.com/python/mypy/issues/2410 diff --git a/src/forensicsim/backend.py b/src/forensicsim/backend.py index 7c2b6b0..002dfd3 100644 --- a/src/forensicsim/backend.py +++ b/src/forensicsim/backend.py @@ -284,5 +284,4 @@ def write_results_to_json(data: object, outputpath: Path) -> None: def parse_json() -> Any: # read data from a file. This is only for testing purpose. with Path("teams.json").open() as json_file: - data = json.load(json_file) - return data + return json.load(json_file) From 36c7fa5d76280418a76b8721489431034e430ee7 Mon Sep 17 00:00:00 2001 From: Alexander Bilz Date: Tue, 16 Jan 2024 17:35:09 +0100 Subject: [PATCH 12/25] refactor: use WrappedIndexDB instead of --- pyproject.toml | 2 +- src/forensicsim/backend.py | 245 +++++++------------------------------ 2 files changed, 45 insertions(+), 202 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ef39a50..0361102 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ dependencies = [ "dataclasses-json", "pause", "pyautogui", -# "pywinauto" +"pywinauto" ] dynamic = ["version"] diff --git a/src/forensicsim/backend.py b/src/forensicsim/backend.py index 002dfd3..9df24d2 100644 --- a/src/forensicsim/backend.py +++ b/src/forensicsim/backend.py @@ -24,28 +24,19 @@ import io import json -from collections.abc import Iterator +import os from pathlib import Path -from typing import Any, Optional, Union from chromedb import ( - ccl_blink_value_deserializer, ccl_chromium_indexeddb, ccl_chromium_localstorage, ccl_chromium_sessionstorage, - ccl_leveldb, - ccl_v8_value_deserializer, -) -from chromedb.ccl_chromium_indexeddb import ( - DatabaseMetadata, - DatabaseMetadataType, - GlobalMetadata, - ObjectStoreMetadata, - ObjectStoreMetadataType, ) TEAMS_DB_OBJECT_STORES = ["replychains", "conversations", "people", "buddylist"] +ENCODING = "iso-8859-1" + """ The following code is heavily adopted from the RawLevelDb and IndexedDB processing proposed by CCL Group @@ -56,195 +47,38 @@ Additionally, it has a flag to filter for datastores, which are interesting for us. """ +def parse_db(filepath, do_not_filter=False): + # Open raw access to a LevelDB and deserialize the records. 
+ wrapper = ccl_chromium_indexeddb.WrappedIndexDB(filepath) -class FastIndexedDB: - def __init__(self, leveldb_dir: Path): - self._db = ccl_leveldb.RawLevelDb(leveldb_dir) - self._fetched_records: list[ccl_leveldb.Record] = [] - self.global_metadata: GlobalMetadata - self.database_metadata: DatabaseMetadata - self.object_store_meta: ObjectStoreMetadata - self.fetch_data() - - def fetch_data(self) -> None: - global_metadata_raw: dict[bytes, ccl_leveldb.Records] = {} - database_metadata_raw: dict[tuple, Any] = {} - objectstore_metadata_raw: dict[tuple, Any] = {} - - self._fetched_records.clear() - - # Fetch the records only once - for record in self._db.iterate_records_raw(): - self._fetched_records.append(record) - - for record in self._fetched_records: - # Global Metadata - if ( - record.key.startswith(b"\x00\x00\x00\x00") - and record.state == ccl_leveldb.KeyState.Live - ) and ( - record.key not in global_metadata_raw - or global_metadata_raw[record.key].seq < record.seq - ): - global_metadata_raw[record.key] = record - - # Convert the raw metadata to a nice GlobalMetadata Object - global_metadata = ccl_chromium_indexeddb.GlobalMetadata(global_metadata_raw) - - # Loop through the database IDs - for db_id in global_metadata.db_ids: - if db_id.dbid_no == None: - continue - - if db_id.dbid_no > 0x7F: - raise NotImplementedError( - "there could be this many dbs, but I don't support it yet" - ) - - # Database keys end with 0 - prefix_database = bytes([0, db_id.dbid_no, 0, 0]) - - # Objetstore keys end with 50 - prefix_objectstore = bytes([0, db_id.dbid_no, 0, 0, 50]) - - for record in reversed(self._fetched_records): - if ( - record.key.startswith(prefix_database) - and record.state == ccl_leveldb.KeyState.Live - ): - # we only want live keys and the newest version thereof (highest seq) - meta_type = record.key[len(prefix_database)] - old_version = database_metadata_raw.get((db_id.dbid_no, meta_type)) - if old_version is None or old_version.seq < record.seq: - database_metadata_raw[(db_id.dbid_no, meta_type)] = record - if ( - record.key.startswith(prefix_objectstore) - and record.state == ccl_leveldb.KeyState.Live - ): - # we only want live keys and the newest version thereof (highest seq) - try: - ( - objstore_id, - varint_raw, - ) = ccl_chromium_indexeddb.le_varint_from_bytes( - record.key[len(prefix_objectstore) :] - ) - except TypeError: - continue - - meta_type = record.key[len(prefix_objectstore) + len(varint_raw)] - - old_version = objectstore_metadata_raw.get(( - db_id.dbid_no, - objstore_id, - meta_type, - )) - - if old_version is None or old_version.seq < record.seq: - objectstore_metadata_raw[ - (db_id.dbid_no, objstore_id, meta_type) - ] = record - - self.global_metadata = global_metadata - self.database_metadata = ccl_chromium_indexeddb.DatabaseMetadata( - database_metadata_raw - ) - self.object_store_meta = ccl_chromium_indexeddb.ObjectStoreMetadata( - objectstore_metadata_raw - ) + extracted_values = [] - def get_database_metadata( - self, db_id: int, meta_type: DatabaseMetadataType - ) -> Optional[Union[str, int]]: - return self.database_metadata.get_meta(db_id, meta_type) + for db_info in wrapper.database_ids: + # Skip databases without a valid dbid_no + if db_info.dbid_no is None: + continue - def get_object_store_metadata( - self, db_id: int, obj_store_id: int, meta_type: ObjectStoreMetadataType - ) -> Optional[Any]: - return self.object_store_meta.get_meta(db_id, obj_store_id, meta_type) + db = wrapper[db_info.dbid_no] - def __iter__(self) -> Iterator[dict[str, Any]]: - 
blink_deserializer = ccl_blink_value_deserializer.BlinkV8Deserializer() - # Loop through the databases and object stores based on their ids - for global_id in self.global_metadata.db_ids: - # print(f"Processing database: {global_id.name}") - if global_id.dbid_no == None: - print(f"WARNING: Skipping database {global_id.name}") + for obj_store_name in db.object_store_names: + # Skip empty object stores + if obj_store_name is None: continue - - max_object_stores = self.database_metadata.get_meta( - global_id.dbid_no, DatabaseMetadataType.MaximumObjectStoreId - ) - max_object_stores = max_object_stores if max_object_stores else 0 - - for object_store_id in range( - 1, - max_object_stores + 1, - ): - datastore = self.object_store_meta.get_meta( - global_id.dbid_no, - object_store_id, - ObjectStoreMetadataType.StoreName, - ) - - # print(f"\t Processing object store: {datastore}") - if datastore in TEAMS_DB_OBJECT_STORES: - prefix = bytes([0, global_id.dbid_no, object_store_id, 1]) - for record in self._fetched_records: - if record.key.startswith(prefix): - # Skip records with empty values as these cant properly decoded - if record.value == b"": - continue - ( - _value_version, - varint_raw, - ) = ccl_chromium_indexeddb.le_varint_from_bytes( - record.value - ) - val_idx = len(varint_raw) - # read the blink envelope - blink_type_tag = record.value[val_idx] - if blink_type_tag != 0xFF: - print("Blink type tag not present") - val_idx += 1 - - ( - _, - varint_raw, - ) = ccl_chromium_indexeddb.le_varint_from_bytes( - record.value[val_idx:] - ) - - val_idx += len(varint_raw) - - # read the raw value of the record. - obj_raw = io.BytesIO(record.value[val_idx:]) - try: - # Initialize deserializer and try deserialization. - deserializer = ccl_v8_value_deserializer.Deserializer( - obj_raw, - host_object_delegate=blink_deserializer.read, - ) - value = deserializer.read() - yield { - "key": record.key, - "value": value, - "origin_file": record.origin_file, - "store": datastore, - "state": record.state, - "seq": record.seq, - } - except Exception: - # TODO Some proper error handling wouldn't hurt - continue - - -def parse_db(filepath: Path) -> list[dict]: - db = FastIndexedDB(filepath) - return list(db) + if obj_store_name in TEAMS_DB_OBJECT_STORES or do_not_filter: + obj_store = db[obj_store_name] + records_per_object_store = 0 + for record in obj_store.iterate_records(): + records_per_object_store += 1 + sourcefile = str(filepath) + # TODO: Replace None values with actual values + state = None + seq = None + extracted_values.append({"key": record.key.raw_key, "value": record.value, "origin_file": sourcefile, "store": obj_store_name, "state": state, "seq": seq}) + print(f"{obj_store_name} {db.name} (Records: {records_per_object_store})") + return extracted_values -def parse_localstorage(filepath: Path) -> list[dict]: +def parse_localstorage(filepath): local_store = ccl_chromium_localstorage.LocalStoreDb(filepath) extracted_values = [] for record in local_store.iter_all_records(): @@ -255,7 +89,7 @@ def parse_localstorage(filepath: Path) -> list[dict]: return extracted_values -def parse_sessionstorage(filepath: Path) -> list[dict]: +def parse_sessionstorage(filepath): session_storage = ccl_chromium_sessionstorage.SessionStoreDb(filepath) extracted_values = [] for host in session_storage: @@ -276,12 +110,21 @@ def parse_sessionstorage(filepath: Path) -> list[dict]: return extracted_values -def write_results_to_json(data: object, outputpath: Path) -> None: - with outputpath.open("w", encoding="utf-8") as f: - 
json.dump(data, f, indent=4, sort_keys=True, default=str, ensure_ascii=False) +def write_results_to_json(data, outputpath): + # Dump messages into a json file + try: + with open(outputpath, "w", encoding="utf-8") as f: + json.dump( + data, f, indent=4, sort_keys=True, default=str, ensure_ascii=False + ) + except OSError as e: + print(e) -def parse_json() -> Any: +def parse_json(): # read data from a file. This is only for testing purpose. - with Path("teams.json").open() as json_file: - return json.load(json_file) + try: + with Path("teams.json").open() as json_file: + return json.load(json_file) + except OSError as e: + print(e) From ccdd881285b81fee12c26ee7ddd218199d9d79a9 Mon Sep 17 00:00:00 2001 From: Alexander Bilz Date: Tue, 16 Jan 2024 20:16:13 +0100 Subject: [PATCH 13/25] feat: implement basic parser for the object stores --- src/forensicsim/backend.py | 395 +++++++++++++++++++++++++------------ 1 file changed, 273 insertions(+), 122 deletions(-) diff --git a/src/forensicsim/backend.py b/src/forensicsim/backend.py index 9df24d2..bcfb656 100644 --- a/src/forensicsim/backend.py +++ b/src/forensicsim/backend.py @@ -1,130 +1,281 @@ -""" -MIT License - -Copyright (c) 2021 Alexander Bilz - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" - -import io import json -import os +import warnings +from dataclasses import dataclass, field +from datetime import datetime from pathlib import Path +from typing import Any, Optional -from chromedb import ( - ccl_chromium_indexeddb, - ccl_chromium_localstorage, - ccl_chromium_sessionstorage, +from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning +from dataclasses_json import ( + DataClassJsonMixin, + LetterCase, + Undefined, + config, ) -TEAMS_DB_OBJECT_STORES = ["replychains", "conversations", "people", "buddylist"] - -ENCODING = "iso-8859-1" - -""" -The following code is heavily adopted from the RawLevelDb and IndexedDB processing proposed by CCL Group - -https://github.com/cclgroupltd/ccl_chrome_indexeddb/blob/35b6a9efba1078cf339f9e64d2796b1f5f7c556f/ccl_chromium_indexeddb.py - -It uses an optimized enumeration approach for processing the metadata, which makes the original IndexedDB super slow. - -Additionally, it has a flag to filter for datastores, which are interesting for us. -""" - -def parse_db(filepath, do_not_filter=False): - # Open raw access to a LevelDB and deserialize the records. 
- wrapper = ccl_chromium_indexeddb.WrappedIndexDB(filepath) - - extracted_values = [] - - for db_info in wrapper.database_ids: - # Skip databases without a valid dbid_no - if db_info.dbid_no is None: - continue - - db = wrapper[db_info.dbid_no] - - for obj_store_name in db.object_store_names: - # Skip empty object stores - if obj_store_name is None: - continue - if obj_store_name in TEAMS_DB_OBJECT_STORES or do_not_filter: - obj_store = db[obj_store_name] - records_per_object_store = 0 - for record in obj_store.iterate_records(): - records_per_object_store += 1 - sourcefile = str(filepath) - # TODO: Replace None values with actual values - state = None - seq = None - extracted_values.append({"key": record.key.raw_key, "value": record.value, "origin_file": sourcefile, "store": obj_store_name, "state": state, "seq": seq}) - print(f"{obj_store_name} {db.name} (Records: {records_per_object_store})") - return extracted_values - - -def parse_localstorage(filepath): - local_store = ccl_chromium_localstorage.LocalStoreDb(filepath) - extracted_values = [] - for record in local_store.iter_all_records(): - try: - extracted_values.append(json.loads(record.value, strict=False)) - except json.decoder.JSONDecodeError: - continue - return extracted_values - - -def parse_sessionstorage(filepath): - session_storage = ccl_chromium_sessionstorage.SessionStoreDb(filepath) - extracted_values = [] - for host in session_storage: - print(host) - # Hosts can have multiple sessions associated with them - for session_store_values in session_storage.get_all_for_host(host).values(): - for session_store_value in session_store_values: - # response is of type SessionStoreValue - - # Make a nice dictionary out of it - entry = { - "key": host, - "value": session_store_value.value, - "guid": session_store_value.guid, - "leveldb_sequence_number": session_store_value.leveldb_sequence_number, - } - extracted_values.append(entry) - return extracted_values - - -def write_results_to_json(data, outputpath): - # Dump messages into a json file - try: - with open(outputpath, "w", encoding="utf-8") as f: - json.dump( - data, f, indent=4, sort_keys=True, default=str, ensure_ascii=False +from forensicsim.backend import parse_db, write_results_to_json + +# Suppress Beautiful Soup warnings +warnings.filterwarnings('ignore', category=MarkupResemblesLocatorWarning) + +def strip_html_tags(value): + # Get the text of any embedded html, such as divs, a href links + soup = BeautifulSoup(value, features="html.parser") + return soup.get_text() + + +def decode_dict(properties): + if isinstance(properties, bytes): + soup = BeautifulSoup(properties, features="html.parser") + properties = properties.decode(soup.original_encoding) + if isinstance(properties, dict): + # handle case where nested childs are dicts or list but provided with "" but have to be expanded. 
+ for key, value in properties.items(): + if isinstance(value, str) and value.startswith(("[", "{")): + properties[key] = json.loads(value, strict=False) + return properties + + return json.loads(properties, strict=False) + + +def decode_timestamp(content_utf8_encoded) -> datetime: + return datetime.utcfromtimestamp(int(content_utf8_encoded) / 1000) + + +def encode_timestamp(timestamp) -> Optional[str]: + if timestamp is not None: + return timestamp.strftime("%Y-%m-%dT%H:%M:%S.%f") + return None + + +JSON_CONFIG = config(letter_case=LetterCase.CAMEL, undefined=Undefined.EXCLUDE)[ + "dataclasses_json" +] + + +@dataclass() +class Meeting(DataClassJsonMixin): + dataclass_json_config = JSON_CONFIG + + client_update_time: Optional[str] = None + cached_deduplication_key: Optional[str] = None + id: Optional[str] = None + members: Optional[list[dict]] = None + thread_properties: dict[str, Any] = field( + default_factory=dict, metadata=config(decoder=decode_dict) + ) + type: Optional[str] = None + version: Optional[float] = None + + record_type: Optional[str] = field( + default="meeting", metadata=config(field_name="record_type") + ) + + def __eq__(self, other): + return self.cached_deduplication_key == other.cachedDeduplicationKey + + def __hash__(self): + return hash(self.cached_deduplication_key) + + def __lt__(self, other): + return self.cached_deduplication_key < other.cached_deduplication_key + + +@dataclass() +class Message(DataClassJsonMixin): + dataclass_json_config = JSON_CONFIG + + attachments: list[Any] = field(default_factory=list) + cached_deduplication_key: Optional[str] = None + client_arrival_time: Optional[str] = None + clientmessageid: Optional[str] = None + composetime: Optional[str] = None + conversation_id: Optional[str] = None + content: Optional[str] = field( + default=None, metadata=config(decoder=strip_html_tags) + ) + contenttype: Optional[str] = None + created_time: Optional[datetime] = field( + default=None, + metadata=config(decoder=decode_timestamp, encoder=encode_timestamp), + ) + creator: Optional[str] = None + is_from_me: Optional[bool] = None + message_kind: Optional[str] = None + messagetype: Optional[str] = None + originalarrivaltime: Optional[str] = None + properties: dict[str, Any] = field( + default_factory=dict, metadata=config(decoder=decode_dict) + ) + version: Optional[datetime] = field( + default=None, + metadata=config(decoder=decode_timestamp, encoder=encode_timestamp), + ) + + origin_file: Optional[str] = field( + default=None, metadata=config(field_name="origin_file") + ) + record_type: str = field( + default="message", metadata=config(field_name="record_type") + ) + + def __post_init__(self): + if self.cached_deduplication_key is None: + self.cached_deduplication_key = str(self.creator) + str( + self.clientmessageid ) - except OSError as e: - print(e) + def __eq__(self, other): + return self.cached_deduplication_key == other.cached_deduplication_key + + def __hash__(self): + return hash(self.cached_deduplication_key) + + def __lt__(self, other): + return self.cached_deduplication_key < other.cached_deduplication_key + + +@dataclass() +class Contact(DataClassJsonMixin): + dataclass_json_config = JSON_CONFIG + + display_name: Optional[str] = None + email: Optional[str] = None + mri: Optional[str] = field(default=None, compare=True) + user_principal_name: Optional[str] = None + + origin_file: Optional[str] = field( + default=None, metadata=config(field_name="origin_file") + ) + record_type: Optional[str] = field( + default="contact", 
metadata=config(field_name="record_type") + ) + + def __eq__(self, other): + return self.mri == other.mri + + def __hash__(self): + return hash(self.mri) + + def __lt__(self, other): + return self.mri < other.mri + + +def _parse_people(people: list[dict]) -> set[Contact]: + parsed_people = set() + for p in people: + + p |= p.get("value", {}) + p |= {"display_name": p.get("displayName")} + p |= {"email": p.get("email")} + p |= {"mri": p.get("mri")} + p |= {"user_principal_name": p.get("userPrincipalName")} + p |= {"origin_file": p.get("origin_file")} + + parsed_people.add(Contact.from_dict(p)) + return parsed_people + + +def _parse_buddies(buddies: list[dict]) -> set[Contact]: + parsed_buddies = set() + for b in buddies: + buddies_of_b = b.get("value", {}).get("buddies", []) + for b_of_b in buddies_of_b: + + b_of_b |= {"origin_file": b.get("origin_file")} + parsed_buddies.add(Contact.from_dict(b_of_b)) + return parsed_buddies + + +def _parse_conversations(conversations: list[dict]) -> set[Meeting]: + cleaned_conversations = set() + for c in conversations: + + + if c.get("value", {}).get("type", "") == "Meeting" and "meeting" in c.get("value", {}).get( + "threadProperties", {} + ): + last_message = c.get("value", {}).get("lastMessage", {}) + meeting_properties = c.get("value", {}).get("threadProperties", {}) + c |= c.get("value", {}) + c |= {"client_update_time": c.get("clientUpdateTime")} + c |= {"id": c.get("id")} + c |= {"members": c.get("members")} + c |= {"thread_properties": meeting_properties} + c |= {"client_update_time": c.get("clientUpdateTime")} + c |= {"version": c.get("version")} + c |= {"last_message": last_message} + c |= {"cached_deduplication_key": c.get("id")} + cleaned_conversations.add(Meeting.from_dict(c)) + return cleaned_conversations + + +def _parse_reply_chains(reply_chains: list[dict]) -> set[Message]: + cleaned_reply_chains = set() + for rc in reply_chains: + rc |= {"origin_file": rc.get("origin_file")} + + message_dict = {} + if rc.get("value", {}).get("messageMap", {}) or rc.get("value", {}).get("messages", {}): + if rc.get("value", {}).get("messageMap", {}): + message_dict = rc.get("value", {}).get("messageMap", {}) + else: + message_dict = rc.get("value", {}).get("messages", {}) + + for k in message_dict: + md = message_dict[k] + + if md.get("messageType", "") == "RichText/Html" or md.get("messageType", "") == "Text": + rc |= rc.get("value", {}) + rc |= {"cached_deduplication_key": md.get("dedupeKey")} + rc |= {"clientmessageid": md.get("clientMessageId")} + rc |= {"composetime": md.get("clientArrivalTime")} + rc |= {"conversation_id": md.get("conversationId")} + rc |= {"content": md.get("content")} + rc |= {"contenttype": md.get("contentType")} + rc |= {"created_time": md.get("clientArrivalTime")} + rc |= {"creator": md.get("version")} + rc |= {"is_from_me": md.get("isSentByCurrentUser")} + rc |= {"messagetype": md.get("messageType")} + rc |= {"originalArrivalTime": md.get("version")} + rc |= {"client_arrival_time": md.get("clientArrivalTime")} + rc |= {"original_arrival_time": md.get("clientArrivalTime")} + rc |= {"version": md.get("version")} + rc |= {"properties": md.get("properties")} + + cleaned_reply_chains.add(Message.from_dict(rc)) + + return cleaned_reply_chains + + +def parse_records(records: list[dict]) -> list[dict]: + people, buddies, reply_chains, conversations = [], [], [], [] + + for r in records: + store = r.get("store", "other") + if store == "people": + people.append(r) + elif store == "buddylist": + buddies.append(r) + elif store == 
"replychains": + reply_chains.append(r) + elif store == "conversations": + conversations.append(r) + + # sort within groups i.e., Contacts, Meetings, Conversations + parsed_records = ( + sorted(_parse_people(people)) + # + sorted(_parse_buddies(buddies)) + + sorted(_parse_reply_chains(reply_chains)) + + sorted(_parse_conversations(conversations)) + ) + return [r.to_dict() for r in parsed_records] + + +def process_db(input_path: Path, output_path: Path): + if not input_path.parts[-1].endswith(".leveldb"): + raise ValueError(f"Expected a leveldb folder. Path: {input_path}") -def parse_json(): - # read data from a file. This is only for testing purpose. - try: - with Path("teams.json").open() as json_file: - return json.load(json_file) - except OSError as e: - print(e) + extracted_values = parse_db(input_path) + parsed_records = parse_records(extracted_values) + write_results_to_json(parsed_records, output_path) From d74dfeb342d66b7985cde6943bfa8426338ac044 Mon Sep 17 00:00:00 2001 From: Alexander Bilz Date: Wed, 17 Jan 2024 17:08:15 +0100 Subject: [PATCH 14/25] fix: wrong file locations --- src/forensicsim/backend.py | 402 ++++++++++++------------------------- src/forensicsim/parser.py | 76 +++++-- 2 files changed, 186 insertions(+), 292 deletions(-) diff --git a/src/forensicsim/backend.py b/src/forensicsim/backend.py index bcfb656..6533254 100644 --- a/src/forensicsim/backend.py +++ b/src/forensicsim/backend.py @@ -1,281 +1,137 @@ +""" +MIT License + +Copyright (c) 2021 Alexander Bilz + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+""" + +import io import json -import warnings -from dataclasses import dataclass, field -from datetime import datetime +import os from pathlib import Path -from typing import Any, Optional -from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning -from dataclasses_json import ( - DataClassJsonMixin, - LetterCase, - Undefined, - config, +from chromedb import ( + ccl_blink_value_deserializer, + ccl_chromium_indexeddb, + ccl_chromium_localstorage, + ccl_chromium_sessionstorage, + ccl_leveldb, + ccl_v8_value_deserializer, +) +from chromedb.ccl_chromium_indexeddb import ( + DatabaseMetadataType, + ObjectStoreMetadataType, ) -from forensicsim.backend import parse_db, write_results_to_json - -# Suppress Beautiful Soup warnings -warnings.filterwarnings('ignore', category=MarkupResemblesLocatorWarning) - -def strip_html_tags(value): - # Get the text of any embedded html, such as divs, a href links - soup = BeautifulSoup(value, features="html.parser") - return soup.get_text() - - -def decode_dict(properties): - if isinstance(properties, bytes): - soup = BeautifulSoup(properties, features="html.parser") - properties = properties.decode(soup.original_encoding) - if isinstance(properties, dict): - # handle case where nested childs are dicts or list but provided with "" but have to be expanded. - for key, value in properties.items(): - if isinstance(value, str) and value.startswith(("[", "{")): - properties[key] = json.loads(value, strict=False) - return properties - - return json.loads(properties, strict=False) - - -def decode_timestamp(content_utf8_encoded) -> datetime: - return datetime.utcfromtimestamp(int(content_utf8_encoded) / 1000) - - -def encode_timestamp(timestamp) -> Optional[str]: - if timestamp is not None: - return timestamp.strftime("%Y-%m-%dT%H:%M:%S.%f") - return None - - -JSON_CONFIG = config(letter_case=LetterCase.CAMEL, undefined=Undefined.EXCLUDE)[ - "dataclasses_json" -] - - -@dataclass() -class Meeting(DataClassJsonMixin): - dataclass_json_config = JSON_CONFIG - - client_update_time: Optional[str] = None - cached_deduplication_key: Optional[str] = None - id: Optional[str] = None - members: Optional[list[dict]] = None - thread_properties: dict[str, Any] = field( - default_factory=dict, metadata=config(decoder=decode_dict) - ) - type: Optional[str] = None - version: Optional[float] = None - - record_type: Optional[str] = field( - default="meeting", metadata=config(field_name="record_type") - ) - - def __eq__(self, other): - return self.cached_deduplication_key == other.cachedDeduplicationKey - - def __hash__(self): - return hash(self.cached_deduplication_key) - - def __lt__(self, other): - return self.cached_deduplication_key < other.cached_deduplication_key - - -@dataclass() -class Message(DataClassJsonMixin): - dataclass_json_config = JSON_CONFIG - - attachments: list[Any] = field(default_factory=list) - cached_deduplication_key: Optional[str] = None - client_arrival_time: Optional[str] = None - clientmessageid: Optional[str] = None - composetime: Optional[str] = None - conversation_id: Optional[str] = None - content: Optional[str] = field( - default=None, metadata=config(decoder=strip_html_tags) - ) - contenttype: Optional[str] = None - created_time: Optional[datetime] = field( - default=None, - metadata=config(decoder=decode_timestamp, encoder=encode_timestamp), - ) - creator: Optional[str] = None - is_from_me: Optional[bool] = None - message_kind: Optional[str] = None - messagetype: Optional[str] = None - originalarrivaltime: Optional[str] = None - properties: dict[str, 
Any] = field( - default_factory=dict, metadata=config(decoder=decode_dict) - ) - version: Optional[datetime] = field( - default=None, - metadata=config(decoder=decode_timestamp, encoder=encode_timestamp), - ) - - origin_file: Optional[str] = field( - default=None, metadata=config(field_name="origin_file") - ) - record_type: str = field( - default="message", metadata=config(field_name="record_type") - ) - - def __post_init__(self): - if self.cached_deduplication_key is None: - self.cached_deduplication_key = str(self.creator) + str( - self.clientmessageid +TEAMS_DB_OBJECT_STORES = ["replychains", "conversations", "people", "buddylist"] + +ENCODING = "iso-8859-1" + +""" +The following code is heavily adopted from the RawLevelDb and IndexedDB processing proposed by CCL Group + +https://github.com/cclgroupltd/ccl_chrome_indexeddb/blob/35b6a9efba1078cf339f9e64d2796b1f5f7c556f/ccl_chromium_indexeddb.py + +It uses an optimized enumeration approach for processing the metadata, which makes the original IndexedDB super slow. + +Additionally, it has a flag to filter for datastores, which are interesting for us. +""" + +def parse_db(filepath, do_not_filter=False): + # Open raw access to a LevelDB and deserialize the records. + wrapper = ccl_chromium_indexeddb.WrappedIndexDB(filepath) + + extracted_values = [] + + for db_info in wrapper.database_ids: + # Skip databases without a valid dbid_no + if db_info.dbid_no is None: + continue + + db = wrapper[db_info.dbid_no] + + for obj_store_name in db.object_store_names: + # Skip empty object stores + if obj_store_name is None: + continue + if obj_store_name in TEAMS_DB_OBJECT_STORES or do_not_filter: + obj_store = db[obj_store_name] + records_per_object_store = 0 + for record in obj_store.iterate_records(): + records_per_object_store += 1 + sourcefile = str(filepath) + # TODO: Fix None values + state = None + seq = None + extracted_values.append({"key": record.key.raw_key, "value": record.value, "origin_file": sourcefile, "store": obj_store_name, "state": state, "seq": seq}) + print(f"{obj_store_name} {db.name} (Records: {records_per_object_store})") + return extracted_values + + +def parse_localstorage(filepath): + local_store = ccl_chromium_localstorage.LocalStoreDb(filepath) + extracted_values = [] + for record in local_store.iter_all_records(): + try: + extracted_values.append(json.loads(record.value, strict=False)) + except json.decoder.JSONDecodeError: + continue + return extracted_values + + +def parse_sessionstorage(filepath): + session_storage = ccl_chromium_sessionstorage.SessionStoreDb(filepath) + extracted_values = [] + for host in session_storage: + print(host) + # Hosts can have multiple sessions associated with them + for session_store_values in session_storage.get_all_for_host(host).values(): + for session_store_value in session_store_values: + # response is of type SessionStoreValue + + # Make a nice dictionary out of it + entry = { + "key": host, + "value": session_store_value.value, + "guid": session_store_value.guid, + "leveldb_sequence_number": session_store_value.leveldb_sequence_number, + } + extracted_values.append(entry) + return extracted_values + + +def write_results_to_json(data, outputpath): + # Dump messages into a json file + try: + with open(outputpath, "w", encoding="utf-8") as f: + json.dump( + data, f, indent=4, sort_keys=True, default=str, ensure_ascii=False ) + except OSError as e: + print(e) - def __eq__(self, other): - return self.cached_deduplication_key == other.cached_deduplication_key - - def __hash__(self): - 
return hash(self.cached_deduplication_key) - - def __lt__(self, other): - return self.cached_deduplication_key < other.cached_deduplication_key - - -@dataclass() -class Contact(DataClassJsonMixin): - dataclass_json_config = JSON_CONFIG - - display_name: Optional[str] = None - email: Optional[str] = None - mri: Optional[str] = field(default=None, compare=True) - user_principal_name: Optional[str] = None - - origin_file: Optional[str] = field( - default=None, metadata=config(field_name="origin_file") - ) - record_type: Optional[str] = field( - default="contact", metadata=config(field_name="record_type") - ) - - def __eq__(self, other): - return self.mri == other.mri - - def __hash__(self): - return hash(self.mri) - - def __lt__(self, other): - return self.mri < other.mri - - -def _parse_people(people: list[dict]) -> set[Contact]: - parsed_people = set() - for p in people: - - p |= p.get("value", {}) - p |= {"display_name": p.get("displayName")} - p |= {"email": p.get("email")} - p |= {"mri": p.get("mri")} - p |= {"user_principal_name": p.get("userPrincipalName")} - p |= {"origin_file": p.get("origin_file")} - - parsed_people.add(Contact.from_dict(p)) - return parsed_people - - -def _parse_buddies(buddies: list[dict]) -> set[Contact]: - parsed_buddies = set() - for b in buddies: - buddies_of_b = b.get("value", {}).get("buddies", []) - for b_of_b in buddies_of_b: - - b_of_b |= {"origin_file": b.get("origin_file")} - parsed_buddies.add(Contact.from_dict(b_of_b)) - return parsed_buddies - - -def _parse_conversations(conversations: list[dict]) -> set[Meeting]: - cleaned_conversations = set() - for c in conversations: - - - if c.get("value", {}).get("type", "") == "Meeting" and "meeting" in c.get("value", {}).get( - "threadProperties", {} - ): - last_message = c.get("value", {}).get("lastMessage", {}) - meeting_properties = c.get("value", {}).get("threadProperties", {}) - c |= c.get("value", {}) - c |= {"client_update_time": c.get("clientUpdateTime")} - c |= {"id": c.get("id")} - c |= {"members": c.get("members")} - c |= {"thread_properties": meeting_properties} - c |= {"client_update_time": c.get("clientUpdateTime")} - c |= {"version": c.get("version")} - c |= {"last_message": last_message} - c |= {"cached_deduplication_key": c.get("id")} - cleaned_conversations.add(Meeting.from_dict(c)) - return cleaned_conversations - - -def _parse_reply_chains(reply_chains: list[dict]) -> set[Message]: - cleaned_reply_chains = set() - for rc in reply_chains: - rc |= {"origin_file": rc.get("origin_file")} - - message_dict = {} - if rc.get("value", {}).get("messageMap", {}) or rc.get("value", {}).get("messages", {}): - if rc.get("value", {}).get("messageMap", {}): - message_dict = rc.get("value", {}).get("messageMap", {}) - else: - message_dict = rc.get("value", {}).get("messages", {}) - - for k in message_dict: - md = message_dict[k] - - if md.get("messageType", "") == "RichText/Html" or md.get("messageType", "") == "Text": - rc |= rc.get("value", {}) - rc |= {"cached_deduplication_key": md.get("dedupeKey")} - rc |= {"clientmessageid": md.get("clientMessageId")} - rc |= {"composetime": md.get("clientArrivalTime")} - rc |= {"conversation_id": md.get("conversationId")} - rc |= {"content": md.get("content")} - rc |= {"contenttype": md.get("contentType")} - rc |= {"created_time": md.get("clientArrivalTime")} - rc |= {"creator": md.get("version")} - rc |= {"is_from_me": md.get("isSentByCurrentUser")} - rc |= {"messagetype": md.get("messageType")} - rc |= {"originalArrivalTime": md.get("version")} - rc |= 
{"client_arrival_time": md.get("clientArrivalTime")} - rc |= {"original_arrival_time": md.get("clientArrivalTime")} - rc |= {"version": md.get("version")} - rc |= {"properties": md.get("properties")} - - cleaned_reply_chains.add(Message.from_dict(rc)) - - return cleaned_reply_chains - - -def parse_records(records: list[dict]) -> list[dict]: - people, buddies, reply_chains, conversations = [], [], [], [] - - for r in records: - store = r.get("store", "other") - if store == "people": - people.append(r) - elif store == "buddylist": - buddies.append(r) - elif store == "replychains": - reply_chains.append(r) - elif store == "conversations": - conversations.append(r) - - # sort within groups i.e., Contacts, Meetings, Conversations - parsed_records = ( - sorted(_parse_people(people)) - # + sorted(_parse_buddies(buddies)) - + sorted(_parse_reply_chains(reply_chains)) - + sorted(_parse_conversations(conversations)) - ) - return [r.to_dict() for r in parsed_records] - - -def process_db(input_path: Path, output_path: Path): - if not input_path.parts[-1].endswith(".leveldb"): - raise ValueError(f"Expected a leveldb folder. Path: {input_path}") - extracted_values = parse_db(input_path) - parsed_records = parse_records(extracted_values) - write_results_to_json(parsed_records, output_path) +def parse_json(): + # read data from a file. This is only for testing purpose. + try: + with Path("teams.json").open() as json_file: + return json.load(json_file) + except OSError as e: + print(e) diff --git a/src/forensicsim/parser.py b/src/forensicsim/parser.py index 5c2c3dd..bcfb656 100644 --- a/src/forensicsim/parser.py +++ b/src/forensicsim/parser.py @@ -1,10 +1,11 @@ import json +import warnings from dataclasses import dataclass, field from datetime import datetime from pathlib import Path from typing import Any, Optional -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning from dataclasses_json import ( DataClassJsonMixin, LetterCase, @@ -14,6 +15,8 @@ from forensicsim.backend import parse_db, write_results_to_json +# Suppress Beautiful Soup warnings +warnings.filterwarnings('ignore', category=MarkupResemblesLocatorWarning) def strip_html_tags(value): # Get the text of any embedded html, such as divs, a href links @@ -121,10 +124,6 @@ def __post_init__(self): self.cached_deduplication_key = str(self.creator) + str( self.clientmessageid ) - if "call-log" in self.properties: - self.record_type = "call" - if "activity" in self.properties: - self.record_type = "reaction" def __eq__(self, other): return self.cached_deduplication_key == other.cached_deduplication_key @@ -165,8 +164,14 @@ def __lt__(self, other): def _parse_people(people: list[dict]) -> set[Contact]: parsed_people = set() for p in people: - p |= {"origin_file": p.get("origin_file")} + p |= p.get("value", {}) + p |= {"display_name": p.get("displayName")} + p |= {"email": p.get("email")} + p |= {"mri": p.get("mri")} + p |= {"user_principal_name": p.get("userPrincipalName")} + p |= {"origin_file": p.get("origin_file")} + parsed_people.add(Contact.from_dict(p)) return parsed_people @@ -176,6 +181,7 @@ def _parse_buddies(buddies: list[dict]) -> set[Contact]: for b in buddies: buddies_of_b = b.get("value", {}).get("buddies", []) for b_of_b in buddies_of_b: + b_of_b |= {"origin_file": b.get("origin_file")} parsed_buddies.add(Contact.from_dict(b_of_b)) return parsed_buddies @@ -184,29 +190,61 @@ def _parse_buddies(buddies: list[dict]) -> set[Contact]: def _parse_conversations(conversations: list[dict]) -> set[Meeting]: 
cleaned_conversations = set() for c in conversations: - last_message = c.get("value", {}).get("lastMessage", {}) - c |= { - "cachedDeduplicationKey": last_message.get("cachedDeduplicationKey"), - } - if c.get("type", "") == "Meeting" and "meeting" in c.get( + if c.get("value", {}).get("type", "") == "Meeting" and "meeting" in c.get("value", {}).get( "threadProperties", {} ): + last_message = c.get("value", {}).get("lastMessage", {}) + meeting_properties = c.get("value", {}).get("threadProperties", {}) + c |= c.get("value", {}) + c |= {"client_update_time": c.get("clientUpdateTime")} + c |= {"id": c.get("id")} + c |= {"members": c.get("members")} + c |= {"thread_properties": meeting_properties} + c |= {"client_update_time": c.get("clientUpdateTime")} + c |= {"version": c.get("version")} + c |= {"last_message": last_message} + c |= {"cached_deduplication_key": c.get("id")} cleaned_conversations.add(Meeting.from_dict(c)) - return cleaned_conversations def _parse_reply_chains(reply_chains: list[dict]) -> set[Message]: cleaned_reply_chains = set() - for rc in reply_chains: - for message_values in rc.get("value", {}).get("messages", {}).values(): - message_values |= { - "origin_file": rc.get("origin_file"), - } - cleaned_reply_chains.add(Message.from_dict(message_values)) + rc |= {"origin_file": rc.get("origin_file")} + + message_dict = {} + if rc.get("value", {}).get("messageMap", {}) or rc.get("value", {}).get("messages", {}): + if rc.get("value", {}).get("messageMap", {}): + message_dict = rc.get("value", {}).get("messageMap", {}) + else: + message_dict = rc.get("value", {}).get("messages", {}) + + for k in message_dict: + md = message_dict[k] + + if md.get("messageType", "") == "RichText/Html" or md.get("messageType", "") == "Text": + rc |= rc.get("value", {}) + rc |= {"cached_deduplication_key": md.get("dedupeKey")} + rc |= {"clientmessageid": md.get("clientMessageId")} + rc |= {"composetime": md.get("clientArrivalTime")} + rc |= {"conversation_id": md.get("conversationId")} + rc |= {"content": md.get("content")} + rc |= {"contenttype": md.get("contentType")} + rc |= {"created_time": md.get("clientArrivalTime")} + rc |= {"creator": md.get("version")} + rc |= {"is_from_me": md.get("isSentByCurrentUser")} + rc |= {"messagetype": md.get("messageType")} + rc |= {"originalArrivalTime": md.get("version")} + rc |= {"client_arrival_time": md.get("clientArrivalTime")} + rc |= {"original_arrival_time": md.get("clientArrivalTime")} + rc |= {"version": md.get("version")} + rc |= {"properties": md.get("properties")} + + cleaned_reply_chains.add(Message.from_dict(rc)) + return cleaned_reply_chains @@ -227,7 +265,7 @@ def parse_records(records: list[dict]) -> list[dict]: # sort within groups i.e., Contacts, Meetings, Conversations parsed_records = ( sorted(_parse_people(people)) - + sorted(_parse_buddies(buddies)) + # + sorted(_parse_buddies(buddies)) + sorted(_parse_reply_chains(reply_chains)) + sorted(_parse_conversations(conversations)) ) From 10b0a875c6903243ecf92b9a02f868437b67b37f Mon Sep 17 00:00:00 2001 From: Markus Bilz Date: Wed, 17 Jan 2024 18:14:23 +0100 Subject: [PATCH 15/25] feat: minor refactorings --- src/forensicsim/backend.py | 23 +++++----- src/forensicsim/parser.py | 89 +++++++++++++++++++------------------ tools/Forensicsim_Parser.py | 2 +- tools/main.py | 4 +- 4 files changed, 61 insertions(+), 57 deletions(-) diff --git a/src/forensicsim/backend.py b/src/forensicsim/backend.py index 6533254..3aab92e 100644 --- a/src/forensicsim/backend.py +++ b/src/forensicsim/backend.py @@ -22,22 
+22,13 @@ SOFTWARE. """ -import io import json -import os from pathlib import Path from chromedb import ( - ccl_blink_value_deserializer, ccl_chromium_indexeddb, ccl_chromium_localstorage, ccl_chromium_sessionstorage, - ccl_leveldb, - ccl_v8_value_deserializer, -) -from chromedb.ccl_chromium_indexeddb import ( - DatabaseMetadataType, - ObjectStoreMetadataType, ) TEAMS_DB_OBJECT_STORES = ["replychains", "conversations", "people", "buddylist"] @@ -54,6 +45,7 @@ Additionally, it has a flag to filter for datastores, which are interesting for us. """ + def parse_db(filepath, do_not_filter=False): # Open raw access to a LevelDB and deserialize the records. wrapper = ccl_chromium_indexeddb.WrappedIndexDB(filepath) @@ -80,8 +72,17 @@ def parse_db(filepath, do_not_filter=False): # TODO: Fix None values state = None seq = None - extracted_values.append({"key": record.key.raw_key, "value": record.value, "origin_file": sourcefile, "store": obj_store_name, "state": state, "seq": seq}) - print(f"{obj_store_name} {db.name} (Records: {records_per_object_store})") + extracted_values.append({ + "key": record.key.raw_key, + "value": record.value, + "origin_file": sourcefile, + "store": obj_store_name, + "state": state, + "seq": seq, + }) + print( + f"{obj_store_name} {db.name} (Records: {records_per_object_store})" + ) return extracted_values diff --git a/src/forensicsim/parser.py b/src/forensicsim/parser.py index bcfb656..218d391 100644 --- a/src/forensicsim/parser.py +++ b/src/forensicsim/parser.py @@ -3,7 +3,7 @@ from dataclasses import dataclass, field from datetime import datetime from pathlib import Path -from typing import Any, Optional +from typing import Any, Optional, Union from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning from dataclasses_json import ( @@ -16,15 +16,16 @@ from forensicsim.backend import parse_db, write_results_to_json # Suppress Beautiful Soup warnings -warnings.filterwarnings('ignore', category=MarkupResemblesLocatorWarning) +warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning) -def strip_html_tags(value): + +def strip_html_tags(value: str) -> str: # Get the text of any embedded html, such as divs, a href links soup = BeautifulSoup(value, features="html.parser") return soup.get_text() -def decode_dict(properties): +def decode_dict(properties: Union[bytes, str, dict]) -> dict[str, Any]: if isinstance(properties, bytes): soup = BeautifulSoup(properties, features="html.parser") properties = properties.decode(soup.original_encoding) @@ -38,11 +39,11 @@ def decode_dict(properties): return json.loads(properties, strict=False) -def decode_timestamp(content_utf8_encoded) -> datetime: +def decode_timestamp(content_utf8_encoded: str) -> datetime: return datetime.utcfromtimestamp(int(content_utf8_encoded) / 1000) -def encode_timestamp(timestamp) -> Optional[str]: +def encode_timestamp(timestamp: Optional[datetime]) -> Optional[str]: if timestamp is not None: return timestamp.strftime("%Y-%m-%dT%H:%M:%S.%f") return None @@ -71,13 +72,17 @@ class Meeting(DataClassJsonMixin): default="meeting", metadata=config(field_name="record_type") ) - def __eq__(self, other): + def __eq__(self, other: object) -> bool: + if not isinstance(other, Meeting): + return NotImplemented return self.cached_deduplication_key == other.cachedDeduplicationKey - def __hash__(self): + def __hash__(self) -> int: return hash(self.cached_deduplication_key) - def __lt__(self, other): + def __lt__(self, other: object) -> bool: + if not isinstance(other, Meeting): + return NotImplemented return 
self.cached_deduplication_key < other.cached_deduplication_key @@ -119,19 +124,23 @@ class Message(DataClassJsonMixin): default="message", metadata=config(field_name="record_type") ) - def __post_init__(self): + def __post_init__(self) -> None: if self.cached_deduplication_key is None: self.cached_deduplication_key = str(self.creator) + str( self.clientmessageid ) - def __eq__(self, other): + def __eq__(self, other: object) -> bool: + if not isinstance(other, Message): + return NotImplemented return self.cached_deduplication_key == other.cached_deduplication_key - def __hash__(self): + def __hash__(self) -> int: return hash(self.cached_deduplication_key) - def __lt__(self, other): + def __lt__(self, other: object) -> bool: + if not isinstance(other, Message): + return NotImplemented return self.cached_deduplication_key < other.cached_deduplication_key @@ -151,25 +160,24 @@ class Contact(DataClassJsonMixin): default="contact", metadata=config(field_name="record_type") ) - def __eq__(self, other): + def __eq__(self, other: object) -> bool: + if not isinstance(other, Contact): + return NotImplemented return self.mri == other.mri - def __hash__(self): + def __hash__(self) -> int: return hash(self.mri) - def __lt__(self, other): + def __lt__(self, other: object) -> bool: + if not isinstance(other, Contact): + return NotImplemented return self.mri < other.mri def _parse_people(people: list[dict]) -> set[Contact]: parsed_people = set() for p in people: - p |= p.get("value", {}) - p |= {"display_name": p.get("displayName")} - p |= {"email": p.get("email")} - p |= {"mri": p.get("mri")} - p |= {"user_principal_name": p.get("userPrincipalName")} p |= {"origin_file": p.get("origin_file")} parsed_people.add(Contact.from_dict(p)) @@ -181,7 +189,6 @@ def _parse_buddies(buddies: list[dict]) -> set[Contact]: for b in buddies: buddies_of_b = b.get("value", {}).get("buddies", []) for b_of_b in buddies_of_b: - b_of_b |= {"origin_file": b.get("origin_file")} parsed_buddies.add(Contact.from_dict(b_of_b)) return parsed_buddies @@ -190,21 +197,12 @@ def _parse_buddies(buddies: list[dict]) -> set[Contact]: def _parse_conversations(conversations: list[dict]) -> set[Meeting]: cleaned_conversations = set() for c in conversations: - - - if c.get("value", {}).get("type", "") == "Meeting" and "meeting" in c.get("value", {}).get( - "threadProperties", {} - ): - last_message = c.get("value", {}).get("lastMessage", {}) - meeting_properties = c.get("value", {}).get("threadProperties", {}) - c |= c.get("value", {}) - c |= {"client_update_time": c.get("clientUpdateTime")} - c |= {"id": c.get("id")} - c |= {"members": c.get("members")} - c |= {"thread_properties": meeting_properties} - c |= {"client_update_time": c.get("clientUpdateTime")} - c |= {"version": c.get("version")} - c |= {"last_message": last_message} + if c.get("value", {}).get("type", "") == "Meeting" and "meeting" in c.get( + "value", {} + ).get("threadProperties", {}): + c_value = c.get("value", {}) + c |= c_value + c |= {"thread_properties": c_value.get("threadProperties", {})} c |= {"cached_deduplication_key": c.get("id")} cleaned_conversations.add(Meeting.from_dict(c)) return cleaned_conversations @@ -214,9 +212,11 @@ def _parse_reply_chains(reply_chains: list[dict]) -> set[Message]: cleaned_reply_chains = set() for rc in reply_chains: rc |= {"origin_file": rc.get("origin_file")} - + message_dict = {} - if rc.get("value", {}).get("messageMap", {}) or rc.get("value", {}).get("messages", {}): + if rc.get("value", {}).get("messageMap", {}) or rc.get("value", 
{}).get( + "messages", {} + ): if rc.get("value", {}).get("messageMap", {}): message_dict = rc.get("value", {}).get("messageMap", {}) else: @@ -225,7 +225,10 @@ def _parse_reply_chains(reply_chains: list[dict]) -> set[Message]: for k in message_dict: md = message_dict[k] - if md.get("messageType", "") == "RichText/Html" or md.get("messageType", "") == "Text": + if ( + md.get("messageType", "") == "RichText/Html" + or md.get("messageType", "") == "Text" + ): rc |= rc.get("value", {}) rc |= {"cached_deduplication_key": md.get("dedupeKey")} rc |= {"clientmessageid": md.get("clientMessageId")} @@ -242,7 +245,7 @@ def _parse_reply_chains(reply_chains: list[dict]) -> set[Message]: rc |= {"original_arrival_time": md.get("clientArrivalTime")} rc |= {"version": md.get("version")} rc |= {"properties": md.get("properties")} - + cleaned_reply_chains.add(Message.from_dict(rc)) return cleaned_reply_chains @@ -265,14 +268,14 @@ def parse_records(records: list[dict]) -> list[dict]: # sort within groups i.e., Contacts, Meetings, Conversations parsed_records = ( sorted(_parse_people(people)) - # + sorted(_parse_buddies(buddies)) + # + sorted(_parse_buddies(buddies)) + sorted(_parse_reply_chains(reply_chains)) + sorted(_parse_conversations(conversations)) ) return [r.to_dict() for r in parsed_records] -def process_db(input_path: Path, output_path: Path): +def process_db(input_path: Path, output_path: Path) -> None: if not input_path.parts[-1].endswith(".leveldb"): raise ValueError(f"Expected a leveldb folder. Path: {input_path}") diff --git a/tools/Forensicsim_Parser.py b/tools/Forensicsim_Parser.py index 3b35d94..ce35ec1 100644 --- a/tools/Forensicsim_Parser.py +++ b/tools/Forensicsim_Parser.py @@ -88,7 +88,7 @@ # Factory that defines the name and details of the module and allows Autopsy # to create instances of the modules that will do the analysis. class ForensicIMIngestModuleFactory(IngestModuleFactoryAdapter): - def __init__(self): + def __init__(self) -> None: self.settings = None moduleName = "Microsoft Teams Parser" diff --git a/tools/main.py b/tools/main.py index 10bda5a..71f30e4 100644 --- a/tools/main.py +++ b/tools/main.py @@ -31,7 +31,7 @@ from forensicsim.parser import parse_records -def process_db(input_path: Path, output_path: Path): +def process_db(input_path: Path, output_path: Path) -> None: if not input_path.parts[-1].endswith(".leveldb"): raise ValueError(f"Expected a leveldb folder. Path: {input_path}") @@ -57,7 +57,7 @@ def process_db(input_path: Path, output_path: Path): required=True, help="File path to the processed output.", ) -def process_cmd(filepath, outputpath): +def process_cmd(filepath: Path, outputpath: Path) -> None: click.echo(XTRACT_HEADER) process_db(filepath, outputpath) From b5accff81297d942a458ea6c86396a3bfe2fb75e Mon Sep 17 00:00:00 2001 From: Alexander Bilz Date: Sat, 20 Jan 2024 11:35:12 +0100 Subject: [PATCH 16/25] feat: add option to pass .blob folder --- src/forensicsim/backend.py | 5 +++-- src/forensicsim/parser.py | 7 +++++-- tools/dump_leveldb.py | 29 +++++++++++++++++++---------- tools/main.py | 30 ++++++++++++++++-------------- 4 files changed, 43 insertions(+), 28 deletions(-) diff --git a/src/forensicsim/backend.py b/src/forensicsim/backend.py index 3aab92e..eea7b1c 100644 --- a/src/forensicsim/backend.py +++ b/src/forensicsim/backend.py @@ -46,10 +46,11 @@ """ -def parse_db(filepath, do_not_filter=False): +def parse_db(filepath : Path, blobpath: Path = None, do_not_filter: bool=False): # Open raw access to a LevelDB and deserialize the records. 
- wrapper = ccl_chromium_indexeddb.WrappedIndexDB(filepath) + wrapper = ccl_chromium_indexeddb.WrappedIndexDB(filepath, blobpath) + extracted_values = [] for db_info in wrapper.database_ids: diff --git a/src/forensicsim/parser.py b/src/forensicsim/parser.py index 218d391..20cfc91 100644 --- a/src/forensicsim/parser.py +++ b/src/forensicsim/parser.py @@ -275,10 +275,13 @@ def parse_records(records: list[dict]) -> list[dict]: return [r.to_dict() for r in parsed_records] -def process_db(input_path: Path, output_path: Path) -> None: +def process_db(input_path: Path, output_path: Path, blob_path: Path=None, do_not_filter: bool = True) -> None: if not input_path.parts[-1].endswith(".leveldb"): raise ValueError(f"Expected a leveldb folder. Path: {input_path}") - extracted_values = parse_db(input_path) + if blob_path is not None and not blob_path.parts[-1].endswith(".blob"): + raise ValueError(f"Expected a .blob folder. Path: {blob_path}") + + extracted_values = parse_db(input_path, blob_path, do_not_filter) parsed_records = parse_records(extracted_values) write_results_to_json(parsed_records, output_path) diff --git a/tools/dump_leveldb.py b/tools/dump_leveldb.py index bdcb814..3f6fe63 100644 --- a/tools/dump_leveldb.py +++ b/tools/dump_leveldb.py @@ -26,17 +26,14 @@ import click -from forensicsim.backend import parse_db, write_results_to_json +from forensicsim.backend import write_results_to_json +from forensicsim.parser import parse_db from forensicsim.consts import DUMP_HEADER - -def process_db(input_path, output_path): - # Do some basic error handling - if not input_path.parts[-1].endswith(".leveldb"): - raise ValueError(f"Expected a leveldb folder. Path: {input_path}") +def process_level_db(input_path: Path, output_path: Path, blob_path: Path=None): # convert the database to a python list with nested dictionaries - extracted_values = parse_db(input_path, do_not_filter=True) + extracted_values = parse_db(input_path, blob_path, do_not_filter=True) # write the output to a json file write_results_to_json(extracted_values, output_path) @@ -50,8 +47,9 @@ def process_db(input_path, output_path): exists=True, readable=True, writable=False, dir_okay=True, path_type=Path ), required=True, - help="File path to the IndexedDB.", + help="File path to the .leveldb folder of the IndexedDB.", ) + @click.option( "-o", "--outputpath", @@ -59,9 +57,20 @@ def process_db(input_path, output_path): required=True, help="File path to the processed output.", ) -def process_cmd(filepath, outputpath): + +@click.option( + "-b", + "--blobpath", + type=click.Path( + exists=True, readable=True, writable=False, dir_okay=True, path_type=Path + ), + required=False, + help="File path to the .blob folder of the IndexedDB.", +) + +def process_cmd(filepath: Path, outputpath: Path, blobpath: Path) -> None: click.echo(DUMP_HEADER) - process_db(filepath, outputpath) + process_level_db(filepath, outputpath, blobpath) if __name__ == "__main__": diff --git a/tools/main.py b/tools/main.py index 71f30e4..4b05f3d 100644 --- a/tools/main.py +++ b/tools/main.py @@ -28,17 +28,7 @@ from forensicsim.backend import parse_db, write_results_to_json from forensicsim.consts import XTRACT_HEADER -from forensicsim.parser import parse_records - - -def process_db(input_path: Path, output_path: Path) -> None: - if not input_path.parts[-1].endswith(".leveldb"): - raise ValueError(f"Expected a leveldb folder. 
Path: {input_path}") - - extracted_values = parse_db(input_path) - parsed_records = parse_records(extracted_values) - write_results_to_json(parsed_records, output_path) - +from forensicsim.parser import process_db @click.command() @click.option( @@ -48,8 +38,9 @@ def process_db(input_path: Path, output_path: Path) -> None: exists=True, readable=True, writable=False, dir_okay=True, path_type=Path ), required=True, - help="File path to the IndexedDB.", + help="File path to the .leveldb folder of the IndexedDB.", ) + @click.option( "-o", "--outputpath", @@ -57,9 +48,20 @@ def process_db(input_path: Path, output_path: Path) -> None: required=True, help="File path to the processed output.", ) -def process_cmd(filepath: Path, outputpath: Path) -> None: + +@click.option( + "-b", + "--blobpath", + type=click.Path( + exists=True, readable=True, writable=False, dir_okay=True, path_type=Path + ), + required=False, + help="File path to the .blob folder of the IndexedDB.", +) + +def process_cmd(filepath: Path, outputpath: Path, blobpath: Path) -> None: click.echo(XTRACT_HEADER) - process_db(filepath, outputpath) + process_db(filepath, outputpath, blobpath, False) if __name__ == "__main__": From 6b6b42f9b5f8689058f59fca2f4ca0cff402f4ab Mon Sep 17 00:00:00 2001 From: Alexander Bilz Date: Sat, 20 Jan 2024 13:06:03 +0100 Subject: [PATCH 17/25] chore: update path of submodule --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index 4099ff0..5298878 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "forensicsim-data"] path = forensicsim-data - url = https://github.com/KarelZe/forensicsim-data.git + url = https://github.com/lxndrblz/forensicsim-data.git From 08306f24dd3f201b7359d576736f090260f35ff3 Mon Sep 17 00:00:00 2001 From: Alexander Bilz Date: Sat, 20 Jan 2024 13:36:11 +0100 Subject: [PATCH 18/25] chore: update path to chromedb package --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0361102..6ee0eed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,13 +10,13 @@ readme = "README.md" license = {file = "LICENSE.md"} requires-python = ">=3.9" authors = [ - { name = "Alexander Bilz", email = "github@markusbilz.com" }, + { name = "Alexander Bilz", email = "github@alexbilz.com" }, { name = "Markus Bilz", email = "github@markusbilz.com" } ] dependencies = [ "beautifulsoup4", "click", -"chromedb @ git+https://github.com/karelze/ccl_chrome_indexeddb@master", +"chromedb @ git+https://github.com/lxndrblz/chromedb@main", "dataclasses-json", "pause", "pyautogui", From 4e3bda91376daae4abbf8b127defeb3610588596 Mon Sep 17 00:00:00 2001 From: Alexander Bilz Date: Sat, 20 Jan 2024 17:02:24 +0100 Subject: [PATCH 19/25] fix: restore parser capability for MS Teams v1 --- src/forensicsim/parser.py | 50 ++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/src/forensicsim/parser.py b/src/forensicsim/parser.py index 20cfc91..9664e5a 100644 --- a/src/forensicsim/parser.py +++ b/src/forensicsim/parser.py @@ -75,7 +75,7 @@ class Meeting(DataClassJsonMixin): def __eq__(self, other: object) -> bool: if not isinstance(other, Meeting): return NotImplemented - return self.cached_deduplication_key == other.cachedDeduplicationKey + return self.cached_deduplication_key == other.cached_deduplication_key def __hash__(self) -> int: return hash(self.cached_deduplication_key) @@ -108,7 +108,7 @@ class Message(DataClassJsonMixin): 
is_from_me: Optional[bool] = None message_kind: Optional[str] = None messagetype: Optional[str] = None - originalarrivaltime: Optional[str] = None + original_arrival_time: Optional[str] = None properties: dict[str, Any] = field( default_factory=dict, metadata=config(decoder=decode_dict) ) @@ -129,6 +129,11 @@ def __post_init__(self) -> None: self.cached_deduplication_key = str(self.creator) + str( self.clientmessageid ) + # change record type depending on properties + if "call-log" in self.properties: + self.record_type = "call" + if "activity" in self.properties: + self.record_type = "reaction" def __eq__(self, other: object) -> bool: if not isinstance(other, Message): @@ -193,7 +198,8 @@ def _parse_buddies(buddies: list[dict]) -> set[Contact]: parsed_buddies.add(Contact.from_dict(b_of_b)) return parsed_buddies - +# Conversations can contain multiple artefacts +# -> If type:Meeting then its a meeting def _parse_conversations(conversations: list[dict]) -> set[Meeting]: cleaned_conversations = set() for c in conversations: @@ -211,6 +217,13 @@ def _parse_conversations(conversations: list[dict]) -> set[Meeting]: def _parse_reply_chains(reply_chains: list[dict]) -> set[Message]: cleaned_reply_chains = set() for rc in reply_chains: + + + # Skip empty records + if rc["value"] is None: + continue + + rc |= rc.get("value", {}) rc |= {"origin_file": rc.get("origin_file")} message_dict = {} @@ -226,27 +239,26 @@ def _parse_reply_chains(reply_chains: list[dict]) -> set[Message]: md = message_dict[k] if ( - md.get("messageType", "") == "RichText/Html" - or md.get("messageType", "") == "Text" + md.get("messagetype", "") == "RichText/Html" + or md.get("messagetype", "") == "Text" ): - rc |= rc.get("value", {}) - rc |= {"cached_deduplication_key": md.get("dedupeKey")} - rc |= {"clientmessageid": md.get("clientMessageId")} - rc |= {"composetime": md.get("clientArrivalTime")} + rc |= {"cached_deduplication_key": md.get("cachedDeduplicationKey")} + rc |= {"clientmessageid": md.get("clientmessageid")} + rc |= {"composetime": md.get("composetime")} rc |= {"conversation_id": md.get("conversationId")} rc |= {"content": md.get("content")} - rc |= {"contenttype": md.get("contentType")} - rc |= {"created_time": md.get("clientArrivalTime")} - rc |= {"creator": md.get("version")} - rc |= {"is_from_me": md.get("isSentByCurrentUser")} - rc |= {"messagetype": md.get("messageType")} - rc |= {"originalArrivalTime": md.get("version")} + rc |= {"contenttype": md.get("contenttype")} + rc |= {"created_time": md.get("createdTime")} + rc |= {"creator": md.get("creator")} + rc |= {"is_from_me": md.get("isFromMe")} + rc |= {"messagetype": md.get("messagetype")} + rc |= {"messageKind": md.get("messageKind")} rc |= {"client_arrival_time": md.get("clientArrivalTime")} - rc |= {"original_arrival_time": md.get("clientArrivalTime")} + rc |= {"original_arrival_time": md.get("originalarrivaltime")} rc |= {"version": md.get("version")} rc |= {"properties": md.get("properties")} - cleaned_reply_chains.add(Message.from_dict(rc)) + cleaned_reply_chains.add(Message.from_dict(rc)) return cleaned_reply_chains @@ -268,7 +280,7 @@ def parse_records(records: list[dict]) -> list[dict]: # sort within groups i.e., Contacts, Meetings, Conversations parsed_records = ( sorted(_parse_people(people)) - # + sorted(_parse_buddies(buddies)) + + sorted(_parse_buddies(buddies)) + sorted(_parse_reply_chains(reply_chains)) + sorted(_parse_conversations(conversations)) ) @@ -284,4 +296,4 @@ def process_db(input_path: Path, output_path: Path, blob_path: 
Path=None, do_not extracted_values = parse_db(input_path, blob_path, do_not_filter) parsed_records = parse_records(extracted_values) - write_results_to_json(parsed_records, output_path) + write_results_to_json(parsed_records, output_path) \ No newline at end of file From 2147d4107fab9291dba6f92d528936d31e465139 Mon Sep 17 00:00:00 2001 From: Alexander Bilz Date: Sun, 21 Jan 2024 15:47:30 +0100 Subject: [PATCH 20/25] fix: parse v1 and v2 separately where necessary --- src/forensicsim/parser.py | 150 ++++++++++++++++++++++++++++---------- 1 file changed, 111 insertions(+), 39 deletions(-) diff --git a/src/forensicsim/parser.py b/src/forensicsim/parser.py index 9664e5a..a9d6b38 100644 --- a/src/forensicsim/parser.py +++ b/src/forensicsim/parser.py @@ -179,42 +179,76 @@ def __lt__(self, other: object) -> bool: return self.mri < other.mri -def _parse_people(people: list[dict]) -> set[Contact]: +def _parse_people(people: list[dict], version: str) -> set[Contact]: parsed_people = set() + for p in people: + # Skip empty records + if p["value"] is None: + continue + + # Fetch relevant data p |= p.get("value", {}) p |= {"origin_file": p.get("origin_file")} + # Skip contacts without an MRI + if p.get("mri") is None: + continue + + if version == "v1" or version == "v2": + p |= {"display_name": p.get("displayName")} + p |= {"email": p.get("email")} + p |= {"mri": p.get("mri")} + p |= {"user_principal_name": p.get("userPrincipalName")} + else: + print("Teams Version is unknown. Can not extract records of type people.") + + parsed_people.add(Contact.from_dict(p)) return parsed_people -def _parse_buddies(buddies: list[dict]) -> set[Contact]: +def _parse_buddies(buddies: list[dict], version: str) -> set[Contact]: parsed_buddies = set() + for b in buddies: - buddies_of_b = b.get("value", {}).get("buddies", []) - for b_of_b in buddies_of_b: - b_of_b |= {"origin_file": b.get("origin_file")} - parsed_buddies.add(Contact.from_dict(b_of_b)) + # Skip empty records + if b["value"] is None: + continue + # Fetch relevant data + if version == "v1" or version == "v2": + buddies_of_b = b.get("value", {}).get("buddies", []) + for b_of_b in buddies_of_b: + b_of_b |= {"origin_file": b.get("origin_file")} + parsed_buddies.add(Contact.from_dict(b_of_b)) + else: + print("Teams Version is unknown. Can not extract records of type buddies.") return parsed_buddies # Conversations can contain multiple artefacts # -> If type:Meeting then its a meeting -def _parse_conversations(conversations: list[dict]) -> set[Meeting]: +def _parse_conversations(conversations: list[dict], version: str) -> set[Meeting]: cleaned_conversations = set() for c in conversations: - if c.get("value", {}).get("type", "") == "Meeting" and "meeting" in c.get( - "value", {} - ).get("threadProperties", {}): - c_value = c.get("value", {}) - c |= c_value - c |= {"thread_properties": c_value.get("threadProperties", {})} - c |= {"cached_deduplication_key": c.get("id")} - cleaned_conversations.add(Meeting.from_dict(c)) + # Skip empty records + if c["value"] is None: + continue + # Fetch relevant data + if version == "v1" or version == "v2": + if c.get("value", {}).get("type", "") == "Meeting" and "meeting" in c.get( + "value", {} + ).get("threadProperties", {}): + c_value = c.get("value", {}) + c |= c_value + c |= {"thread_properties": c_value.get("threadProperties", {})} + c |= {"cached_deduplication_key": c.get("id")} + cleaned_conversations.add(Meeting.from_dict(c)) + else: + print("Teams Version is unknown. 
Can not extract records of type meeting.") return cleaned_conversations -def _parse_reply_chains(reply_chains: list[dict]) -> set[Message]: +def _parse_reply_chains(reply_chains: list[dict], version: str) -> set[Message]: cleaned_reply_chains = set() for rc in reply_chains: @@ -223,49 +257,84 @@ def _parse_reply_chains(reply_chains: list[dict]) -> set[Message]: if rc["value"] is None: continue + # Fetch relevant data rc |= rc.get("value", {}) rc |= {"origin_file": rc.get("origin_file")} message_dict = {} - if rc.get("value", {}).get("messageMap", {}) or rc.get("value", {}).get( - "messages", {} - ): - if rc.get("value", {}).get("messageMap", {}): - message_dict = rc.get("value", {}).get("messageMap", {}) - else: - message_dict = rc.get("value", {}).get("messages", {}) + if version == "v1": + message_dict = rc.get("value", {}).get("messages", {}) + elif version == "v2": + message_dict = rc.get("value", {}).get("messageMap", {}) + else: + print("Teams Version is unknown. Can not extract records of type reply_chains.") + continue for k in message_dict: md = message_dict[k] - if ( md.get("messagetype", "") == "RichText/Html" or md.get("messagetype", "") == "Text" + or md.get("messageType", "") == "RichText/Html" + or md.get("messageType", "") == "Text" ): - rc |= {"cached_deduplication_key": md.get("cachedDeduplicationKey")} - rc |= {"clientmessageid": md.get("clientmessageid")} - rc |= {"composetime": md.get("composetime")} + if version == "v1": + + rc |= {"cached_deduplication_key": md.get("cachedDeduplicationKey")} + rc |= {"clientmessageid": md.get("clientmessageid")} + rc |= {"composetime": md.get("composetime")} + rc |= {"contenttype": md.get("contenttype")} + rc |= {"created_time": md.get("createdTime")} + rc |= {"is_from_me": md.get("isFromMe")} + rc |= {"messagetype": md.get("messagetype")} + rc |= {"messageKind": md.get("messageKind")} + rc |= {"original_arrival_time": md.get("originalarrivaltime")} + + elif version == "v2": + rc |= {"cached_deduplication_key": md.get("dedupeKey")} + rc |= {"clientmessageid": md.get("clientMessageId")} + # set to clientArrivalTime as compose Time is no longer present + rc |= {"composetime": md.get("clientArrivalTime")} + rc |= {"contenttype": md.get("contentType")} + # set to clientArrivalTime as created time is no longer present + rc |= {"created_time": md.get("clientArrivalTime")} + rc |= {"is_from_me": md.get("isSentByCurrentUser")} + rc |= {"messagetype": md.get("messageType")} + rc |= {"original_arrival_time": md.get("originalArrivalTime")} + + # Similar across versions + rc |= {"creator": md.get("creator")} rc |= {"conversation_id": md.get("conversationId")} rc |= {"content": md.get("content")} - rc |= {"contenttype": md.get("contenttype")} - rc |= {"created_time": md.get("createdTime")} - rc |= {"creator": md.get("creator")} - rc |= {"is_from_me": md.get("isFromMe")} - rc |= {"messagetype": md.get("messagetype")} - rc |= {"messageKind": md.get("messageKind")} rc |= {"client_arrival_time": md.get("clientArrivalTime")} - rc |= {"original_arrival_time": md.get("originalarrivaltime")} rc |= {"version": md.get("version")} rc |= {"properties": md.get("properties")} cleaned_reply_chains.add(Message.from_dict(rc)) - + return cleaned_reply_chains +def identify_teams_version(reply_chains: list[dict]) -> str: + # Identify version based on reply chain structure + fingerprint_teams_version = "" + for rc in reply_chains: + rc |= rc.get("value", {}) + if rc.get("value", {}).get("messages", {}): + fingerprint_teams_version = "v1" + break + elif 
rc.get("value", {}).get("messageMap", {}): + fingerprint_teams_version = "v2" + break + else: + fingerprint_teams_version = "unknown" + + return fingerprint_teams_version + def parse_records(records: list[dict]) -> list[dict]: people, buddies, reply_chains, conversations = [], [], [], [] + for r in records: store = r.get("store", "other") if store == "people": @@ -277,12 +346,15 @@ def parse_records(records: list[dict]) -> list[dict]: elif store == "conversations": conversations.append(r) + # identify version + version = identify_teams_version(reply_chains) + # sort within groups i.e., Contacts, Meetings, Conversations parsed_records = ( - sorted(_parse_people(people)) - + sorted(_parse_buddies(buddies)) - + sorted(_parse_reply_chains(reply_chains)) - + sorted(_parse_conversations(conversations)) + sorted(_parse_people(people, version)) + + sorted(_parse_buddies(buddies, version)) + + sorted(_parse_reply_chains(reply_chains, version)) + + sorted(_parse_conversations(conversations, version)) ) return [r.to_dict() for r in parsed_records] From 7895b2206a69019854acae09460d60527656c284 Mon Sep 17 00:00:00 2001 From: Alexander Bilz Date: Sun, 21 Jan 2024 15:48:00 +0100 Subject: [PATCH 21/25] fix: skip unnecessary object stores in parser mode --- src/forensicsim/backend.py | 2 +- tools/main.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/forensicsim/backend.py b/src/forensicsim/backend.py index eea7b1c..59fe643 100644 --- a/src/forensicsim/backend.py +++ b/src/forensicsim/backend.py @@ -64,7 +64,7 @@ def parse_db(filepath : Path, blobpath: Path = None, do_not_filter: bool=False): # Skip empty object stores if obj_store_name is None: continue - if obj_store_name in TEAMS_DB_OBJECT_STORES or do_not_filter: + if obj_store_name in TEAMS_DB_OBJECT_STORES or do_not_filter is False: obj_store = db[obj_store_name] records_per_object_store = 0 for record in obj_store.iterate_records(): diff --git a/tools/main.py b/tools/main.py index 4b05f3d..e21c7e2 100644 --- a/tools/main.py +++ b/tools/main.py @@ -61,7 +61,7 @@ def process_cmd(filepath: Path, outputpath: Path, blobpath: Path) -> None: click.echo(XTRACT_HEADER) - process_db(filepath, outputpath, blobpath, False) + process_db(filepath, outputpath, blobpath, True) if __name__ == "__main__": From 68c7570029ae83c6fee577b2d0bb55701a085671 Mon Sep 17 00:00:00 2001 From: Alexander Bilz Date: Sun, 21 Jan 2024 16:00:50 +0100 Subject: [PATCH 22/25] chore: fix ruff errors --- src/forensicsim/backend.py | 9 +++++-- src/forensicsim/parser.py | 51 ++++++++++++++++++++------------------ tools/dump_leveldb.py | 14 ++++++----- tools/main.py | 5 +--- 4 files changed, 43 insertions(+), 36 deletions(-) diff --git a/src/forensicsim/backend.py b/src/forensicsim/backend.py index 59fe643..514c2a0 100644 --- a/src/forensicsim/backend.py +++ b/src/forensicsim/backend.py @@ -24,6 +24,7 @@ import json from pathlib import Path +from typing import Optional from chromedb import ( ccl_chromium_indexeddb, @@ -46,11 +47,15 @@ """ -def parse_db(filepath : Path, blobpath: Path = None, do_not_filter: bool=False): +def parse_db( + filepath: Path, + blobpath: Optional[Path] = None, + do_not_filter: Optional[bool] = False, +): # Open raw access to a LevelDB and deserialize the records. 
wrapper = ccl_chromium_indexeddb.WrappedIndexDB(filepath, blobpath) - + extracted_values = [] for db_info in wrapper.database_ids: diff --git a/src/forensicsim/parser.py b/src/forensicsim/parser.py index a9d6b38..3ec3084 100644 --- a/src/forensicsim/parser.py +++ b/src/forensicsim/parser.py @@ -186,8 +186,8 @@ def _parse_people(people: list[dict], version: str) -> set[Contact]: # Skip empty records if p["value"] is None: continue - - # Fetch relevant data + + # Fetch relevant data p |= p.get("value", {}) p |= {"origin_file": p.get("origin_file")} @@ -201,8 +201,7 @@ def _parse_people(people: list[dict], version: str) -> set[Contact]: p |= {"mri": p.get("mri")} p |= {"user_principal_name": p.get("userPrincipalName")} else: - print("Teams Version is unknown. Can not extract records of type people.") - + print("Teams Version is unknown. Can not extract records of type people.") parsed_people.add(Contact.from_dict(p)) return parsed_people @@ -215,16 +214,17 @@ def _parse_buddies(buddies: list[dict], version: str) -> set[Contact]: # Skip empty records if b["value"] is None: continue - # Fetch relevant data + # Fetch relevant data if version == "v1" or version == "v2": buddies_of_b = b.get("value", {}).get("buddies", []) for b_of_b in buddies_of_b: b_of_b |= {"origin_file": b.get("origin_file")} parsed_buddies.add(Contact.from_dict(b_of_b)) else: - print("Teams Version is unknown. Can not extract records of type buddies.") + print("Teams Version is unknown. Can not extract records of type buddies.") return parsed_buddies + # Conversations can contain multiple artefacts # -> If type:Meeting then its a meeting def _parse_conversations(conversations: list[dict], version: str) -> set[Meeting]: @@ -233,7 +233,7 @@ def _parse_conversations(conversations: list[dict], version: str) -> set[Meeting # Skip empty records if c["value"] is None: continue - # Fetch relevant data + # Fetch relevant data if version == "v1" or version == "v2": if c.get("value", {}).get("type", "") == "Meeting" and "meeting" in c.get( "value", {} @@ -244,20 +244,18 @@ def _parse_conversations(conversations: list[dict], version: str) -> set[Meeting c |= {"cached_deduplication_key": c.get("id")} cleaned_conversations.add(Meeting.from_dict(c)) else: - print("Teams Version is unknown. Can not extract records of type meeting.") + print("Teams Version is unknown. Can not extract records of type meeting.") return cleaned_conversations def _parse_reply_chains(reply_chains: list[dict], version: str) -> set[Message]: cleaned_reply_chains = set() for rc in reply_chains: - - # Skip empty records if rc["value"] is None: continue - - # Fetch relevant data + + # Fetch relevant data rc |= rc.get("value", {}) rc |= {"origin_file": rc.get("origin_file")} @@ -267,7 +265,9 @@ def _parse_reply_chains(reply_chains: list[dict], version: str) -> set[Message]: elif version == "v2": message_dict = rc.get("value", {}).get("messageMap", {}) else: - print("Teams Version is unknown. Can not extract records of type reply_chains.") + print( + "Teams Version is unknown. Can not extract records of type reply_chains." 
+ ) continue for k in message_dict: @@ -279,7 +279,6 @@ def _parse_reply_chains(reply_chains: list[dict], version: str) -> set[Message]: or md.get("messageType", "") == "Text" ): if version == "v1": - rc |= {"cached_deduplication_key": md.get("cachedDeduplicationKey")} rc |= {"clientmessageid": md.get("clientmessageid")} rc |= {"composetime": md.get("composetime")} @@ -289,7 +288,7 @@ def _parse_reply_chains(reply_chains: list[dict], version: str) -> set[Message]: rc |= {"messagetype": md.get("messagetype")} rc |= {"messageKind": md.get("messageKind")} rc |= {"original_arrival_time": md.get("originalarrivaltime")} - + elif version == "v2": rc |= {"cached_deduplication_key": md.get("dedupeKey")} rc |= {"clientmessageid": md.get("clientMessageId")} @@ -301,7 +300,7 @@ def _parse_reply_chains(reply_chains: list[dict], version: str) -> set[Message]: rc |= {"is_from_me": md.get("isSentByCurrentUser")} rc |= {"messagetype": md.get("messageType")} rc |= {"original_arrival_time": md.get("originalArrivalTime")} - + # Similar across versions rc |= {"creator": md.get("creator")} rc |= {"conversation_id": md.get("conversationId")} @@ -311,7 +310,7 @@ def _parse_reply_chains(reply_chains: list[dict], version: str) -> set[Message]: rc |= {"properties": md.get("properties")} cleaned_reply_chains.add(Message.from_dict(rc)) - + return cleaned_reply_chains @@ -321,20 +320,19 @@ def identify_teams_version(reply_chains: list[dict]) -> str: for rc in reply_chains: rc |= rc.get("value", {}) if rc.get("value", {}).get("messages", {}): - fingerprint_teams_version = "v1" - break + fingerprint_teams_version = "v1" elif rc.get("value", {}).get("messageMap", {}): - fingerprint_teams_version = "v2" - break + fingerprint_teams_version = "v2" else: fingerprint_teams_version = "unknown" + break return fingerprint_teams_version + def parse_records(records: list[dict]) -> list[dict]: people, buddies, reply_chains, conversations = [], [], [], [] - for r in records: store = r.get("store", "other") if store == "people": @@ -359,7 +357,12 @@ def parse_records(records: list[dict]) -> list[dict]: return [r.to_dict() for r in parsed_records] -def process_db(input_path: Path, output_path: Path, blob_path: Path=None, do_not_filter: bool = True) -> None: +def process_db( + input_path: Path, + output_path: Path, + blob_path: Optional[Path] = None, + do_not_filter: Optional[bool] = True, +) -> None: if not input_path.parts[-1].endswith(".leveldb"): raise ValueError(f"Expected a leveldb folder. 
Path: {input_path}") @@ -368,4 +371,4 @@ def process_db(input_path: Path, output_path: Path, blob_path: Path=None, do_not extracted_values = parse_db(input_path, blob_path, do_not_filter) parsed_records = parse_records(extracted_values) - write_results_to_json(parsed_records, output_path) \ No newline at end of file + write_results_to_json(parsed_records, output_path) diff --git a/tools/dump_leveldb.py b/tools/dump_leveldb.py index 3f6fe63..5b9241e 100644 --- a/tools/dump_leveldb.py +++ b/tools/dump_leveldb.py @@ -23,15 +23,18 @@ """ from pathlib import Path +from typing import Optional import click from forensicsim.backend import write_results_to_json -from forensicsim.parser import parse_db from forensicsim.consts import DUMP_HEADER +from forensicsim.parser import parse_db -def process_level_db(input_path: Path, output_path: Path, blob_path: Path=None): +def process_level_db( + input_path: Path, output_path: Path, blob_path: Optional[Path] = None +) -> None: # convert the database to a python list with nested dictionaries extracted_values = parse_db(input_path, blob_path, do_not_filter=True) @@ -49,7 +52,6 @@ def process_level_db(input_path: Path, output_path: Path, blob_path: Path=None): required=True, help="File path to the .leveldb folder of the IndexedDB.", ) - @click.option( "-o", "--outputpath", @@ -57,7 +59,6 @@ def process_level_db(input_path: Path, output_path: Path, blob_path: Path=None): required=True, help="File path to the processed output.", ) - @click.option( "-b", "--blobpath", @@ -67,8 +68,9 @@ def process_level_db(input_path: Path, output_path: Path, blob_path: Path=None): required=False, help="File path to the .blob folder of the IndexedDB.", ) - -def process_cmd(filepath: Path, outputpath: Path, blobpath: Path) -> None: +def process_cmd( + filepath: Path, outputpath: Path, blobpath: Optional[Path] = None +) -> None: click.echo(DUMP_HEADER) process_level_db(filepath, outputpath, blobpath) diff --git a/tools/main.py b/tools/main.py index e21c7e2..db4ec0d 100644 --- a/tools/main.py +++ b/tools/main.py @@ -26,10 +26,10 @@ import click -from forensicsim.backend import parse_db, write_results_to_json from forensicsim.consts import XTRACT_HEADER from forensicsim.parser import process_db + @click.command() @click.option( "-f", @@ -40,7 +40,6 @@ required=True, help="File path to the .leveldb folder of the IndexedDB.", ) - @click.option( "-o", "--outputpath", @@ -48,7 +47,6 @@ required=True, help="File path to the processed output.", ) - @click.option( "-b", "--blobpath", @@ -58,7 +56,6 @@ required=False, help="File path to the .blob folder of the IndexedDB.", ) - def process_cmd(filepath: Path, outputpath: Path, blobpath: Path) -> None: click.echo(XTRACT_HEADER) process_db(filepath, outputpath, blobpath, True) From fc8d6c1fb67b05a7143272c9d5f14e425555ff52 Mon Sep 17 00:00:00 2001 From: Alexander Bilz Date: Sun, 21 Jan 2024 16:14:47 +0100 Subject: [PATCH 23/25] fix: mypi errors --- src/forensicsim/backend.py | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/src/forensicsim/backend.py b/src/forensicsim/backend.py index 514c2a0..3cdfc84 100644 --- a/src/forensicsim/backend.py +++ b/src/forensicsim/backend.py @@ -24,7 +24,7 @@ import json from pathlib import Path -from typing import Optional +from typing import Optional, List, Any from chromedb import ( ccl_chromium_indexeddb, @@ -51,7 +51,7 @@ def parse_db( filepath: Path, blobpath: Optional[Path] = None, do_not_filter: Optional[bool] = False, -): +) -> List[dict[str, Any]]: # Open raw access 
to a LevelDB and deserialize the records. wrapper = ccl_chromium_indexeddb.WrappedIndexDB(filepath, blobpath) @@ -92,7 +92,7 @@ def parse_db( return extracted_values -def parse_localstorage(filepath): +def parse_localstorage(filepath: Path) -> List[dict[str, Any]]: local_store = ccl_chromium_localstorage.LocalStoreDb(filepath) extracted_values = [] for record in local_store.iter_all_records(): @@ -103,7 +103,7 @@ def parse_localstorage(filepath): return extracted_values -def parse_sessionstorage(filepath): +def parse_sessionstorage(filepath: Path) -> List[dict[str, Any]]: session_storage = ccl_chromium_sessionstorage.SessionStoreDb(filepath) extracted_values = [] for host in session_storage: @@ -124,7 +124,7 @@ def parse_sessionstorage(filepath): return extracted_values -def write_results_to_json(data, outputpath): +def write_results_to_json(data: List[dict[str, Any]], outputpath: Path) -> None: # Dump messages into a json file try: with open(outputpath, "w", encoding="utf-8") as f: @@ -133,12 +133,3 @@ def write_results_to_json(data, outputpath): ) except OSError as e: print(e) - - -def parse_json(): - # read data from a file. This is only for testing purpose. - try: - with Path("teams.json").open() as json_file: - return json.load(json_file) - except OSError as e: - print(e) From cdf21846efc31f4942dbe75bd77ac814ac487034 Mon Sep 17 00:00:00 2001 From: Alexander Bilz Date: Sun, 21 Jan 2024 16:22:15 +0100 Subject: [PATCH 24/25] fix: fix more ruff errors --- src/forensicsim/backend.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/forensicsim/backend.py b/src/forensicsim/backend.py index 3cdfc84..fc58e50 100644 --- a/src/forensicsim/backend.py +++ b/src/forensicsim/backend.py @@ -24,7 +24,7 @@ import json from pathlib import Path -from typing import Optional, List, Any +from typing import Any, Optional from chromedb import ( ccl_chromium_indexeddb, @@ -51,7 +51,7 @@ def parse_db( filepath: Path, blobpath: Optional[Path] = None, do_not_filter: Optional[bool] = False, -) -> List[dict[str, Any]]: +) -> list[dict[str, Any]]: # Open raw access to a LevelDB and deserialize the records. 
wrapper = ccl_chromium_indexeddb.WrappedIndexDB(filepath, blobpath) @@ -92,7 +92,7 @@ def parse_db( return extracted_values -def parse_localstorage(filepath: Path) -> List[dict[str, Any]]: +def parse_localstorage(filepath: Path) -> list[dict[str, Any]]: local_store = ccl_chromium_localstorage.LocalStoreDb(filepath) extracted_values = [] for record in local_store.iter_all_records(): @@ -103,7 +103,7 @@ def parse_localstorage(filepath: Path) -> List[dict[str, Any]]: return extracted_values -def parse_sessionstorage(filepath: Path) -> List[dict[str, Any]]: +def parse_sessionstorage(filepath: Path) -> list[dict[str, Any]]: session_storage = ccl_chromium_sessionstorage.SessionStoreDb(filepath) extracted_values = [] for host in session_storage: @@ -124,7 +124,7 @@ def parse_sessionstorage(filepath: Path) -> List[dict[str, Any]]: return extracted_values -def write_results_to_json(data: List[dict[str, Any]], outputpath: Path) -> None: +def write_results_to_json(data: list[dict[str, Any]], outputpath: Path) -> None: # Dump messages into a json file try: with open(outputpath, "w", encoding="utf-8") as f: From 98f381033cd2737051a13e18941d6fd12740460b Mon Sep 17 00:00:00 2001 From: Alexander Bilz Date: Sun, 21 Jan 2024 16:31:29 +0100 Subject: [PATCH 25/25] chore: remove upload of files and git diff --- .github/workflows/build.yaml | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index cc9b8f1..8e208b3 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -26,21 +26,9 @@ jobs: .\dist\ms_teams_parser.exe -f ".\forensicsim-data\john_doe_v_1_4_00_11161\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "john_doe_v_1_4_00_11161.json" .\dist\ms_teams_parser.exe -f ".\forensicsim-data\karelze_v_23306_3309_2530_1346\IndexedDB\https_teams.live.com_0.indexeddb.leveldb" -o "karelze.json" .\dist\ms_teams_parser.exe -f ".\forensicsim-data\mboufahja_v_23231_413_2355_7555\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "mboufahja.json" - - name: Upload resultsπŸ“² - uses: actions/upload-artifact@v4 - with: - name: generated-outputs - path: | - jane_doe_v_1_4_00_11161.json - john_doe_v_1_4_00_11161.json - retention-days: 1 - - name: Test calling script πŸ“ž + - name: Test calling script πŸ–±οΈ run: | python tools/main.py --help python tools/dump_leveldb.py --help python tools/dump_localstorage.py --help python tools/dump_sessionstorage.py --help - - name: Calculate diff πŸ‘½ - run: | - git diff --no-index --word-diff .\forensicsim-data\expected-result\jane_doe_v_1_4_00_11161.json jane_doe_v_1_4_00_11161.json - git diff --no-index --word-diff .\forensicsim-data\expected-result\john_doe_v_1_4_00_11161.json john_doe_v_1_4_00_11161.json
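
The v1/v2 detection introduced in PATCH 20/25 rests on a single structural difference: classic Teams (v1) reply-chain records keep their messages under a "messages" key, whereas the New Teams client (v2) stores them under "messageMap". The following is a minimal sketch of that fingerprinting heuristic, not the shipped identify_teams_version itself; the record shape (a dict with a "value" sub-dictionary, as produced by parse_db) is taken from the diffs above, and the sample records are invented for illustration.

from typing import Any


def fingerprint_teams_version(reply_chains: list[dict[str, Any]]) -> str:
    # Return the schema version of the first reply chain that reveals one.
    for rc in reply_chains:
        value = rc.get("value") or {}
        if value.get("messages"):
            return "v1"  # classic Teams 1.x
        if value.get("messageMap"):
            return "v2"  # New Teams 2.x
    return "unknown"


v1_record = {"value": {"messages": {"1": {"messagetype": "Text"}}}}
v2_record = {"value": {"messageMap": {"1": {"messageType": "Text"}}}}
print(fingerprint_teams_version([v1_record]))  # -> v1
print(fingerprint_teams_version([v2_record]))  # -> v2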
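
PATCH 19/25 additionally reclassifies parsed messages by inspecting their decoded properties: a record whose properties contain a call log is reported as a "call" and one that contains an activity as a "reaction". A self-contained sketch of that rule follows; the marker keys "call-log" and "activity" come from the diff, while the helper name and the sample payloads are illustrative only.

from typing import Any


def classify_message(properties: dict[str, Any]) -> str:
    # Mirrors the checks added to Message.__post_init__ in PATCH 19/25:
    # the record type stays "message" unless a marker key is present.
    if "call-log" in properties:
        return "call"
    if "activity" in properties:
        return "reaction"
    return "message"


print(classify_message({"call-log": {"callDuration": 42}}))   # -> call
print(classify_message({"activity": {"activityType": "*"}}))  # -> reaction
print(classify_message({}))                                   # -> message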
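
With the .blob support and extended process_db signature from PATCH 16/25, the conversion can also be driven from Python instead of the ms_teams_parser.exe calls in the workflow. The snippet below is a hypothetical invocation under the assumption that the forensicsim package from this series is installed and that the dataset paths exist; the sibling .blob folder naming follows Chromium's usual IndexedDB layout and is passed only if present.

from pathlib import Path

from forensicsim.parser import process_db

leveldb_path = Path(
    r".\forensicsim-data\karelze_v_23306_3309_2530_1346\IndexedDB"
    r"\https_teams.live.com_0.indexeddb.leveldb"
)
# Chromium keeps large values in a sibling *.indexeddb.blob folder.
blob_path = leveldb_path.with_suffix(".blob")
output_path = Path("karelze.json")

process_db(
    leveldb_path,
    output_path,
    blob_path if blob_path.exists() else None,
    True,  # same do_not_filter value that tools/main.py passes after PATCH 21/25
)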