feat: improve handling of IndexedDB databases and restructure code (#36)
* feat: extend gh action to run on MS Teams 2.0 datasets
* refactor: disable continue-on-error 🪲
* fix: unsupported operand type(s) for +: 'NoneType' and 'int'
* refactor: use WrappedIndexDB instead of FastIndexedDB
* feat: implement basic parser for the object stores
* feat: add option to pass .blob folder
* chore: update path of submodule
* chore: update path to chromedb package
* fix: parse v1 and v2 separately where necessary
* fix: skip unnecessary object stores in parser mode
---------

Co-authored-by: Alexander Bilz <mail@alexbilz.com>
KarelZe and lxndrblz authored Jan 21, 2024
1 parent cf53e38 commit 9db96bf
Showing 8 changed files with 286 additions and 293 deletions.
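At the heart of the change, the hand-rolled FastIndexedDB record reader is replaced by chromedb's WrappedIndexDB wrapper. The sketch below condenses the access pattern that the new parse_db in the backend.py diff further down is built around; the names come from that diff, the placeholder paths are hypothetical, and the exact chromedb behaviour may differ from this simplification.

```python
from pathlib import Path

from chromedb import ccl_chromium_indexeddb

# Placeholder capture paths -- the .blob directory is the optional companion
# folder this commit adds support for (it may be omitted).
leveldb_dir = Path("https_teams.live.com_0.indexeddb.leveldb")
blob_dir = Path("https_teams.live.com_0.indexeddb.blob")

wrapper = ccl_chromium_indexeddb.WrappedIndexDB(leveldb_dir, blob_dir)

for db_info in wrapper.database_ids:
    if db_info.dbid_no is None:  # skip databases without a valid dbid_no
        continue
    db = wrapper[db_info.dbid_no]
    for obj_store_name in db.object_store_names:
        if obj_store_name is None:  # skip empty object stores
            continue
        for record in db[obj_store_name].iterate_records():
            print(obj_store_name, record.key.raw_key, record.value)
```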
20 changes: 5 additions & 15 deletions .github/workflows/build.yaml
@@ -1,7 +1,7 @@
on:
push:
push:
jobs:
build:
build:
name: Build exectuable 📦
runs-on: windows-latest
steps:
@@ -24,21 +24,11 @@ jobs:
run: |
.\dist\ms_teams_parser.exe -f ".\forensicsim-data\jane_doe_v_1_4_00_11161\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "jane_doe_v_1_4_00_11161.json"
.\dist\ms_teams_parser.exe -f ".\forensicsim-data\john_doe_v_1_4_00_11161\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "john_doe_v_1_4_00_11161.json"
- name: Upload results📲
uses: actions/upload-artifact@v4
with:
name: generated-outputs
path: |
jane_doe_v_1_4_00_11161.json
john_doe_v_1_4_00_11161.json
retention-days: 1
- name: Test calling script 📞
.\dist\ms_teams_parser.exe -f ".\forensicsim-data\karelze_v_23306_3309_2530_1346\IndexedDB\https_teams.live.com_0.indexeddb.leveldb" -o "karelze.json"
.\dist\ms_teams_parser.exe -f ".\forensicsim-data\mboufahja_v_23231_413_2355_7555\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "mboufahja.json"
- name: Test calling script 🖱️
run: |
python tools/main.py --help
python tools/dump_leveldb.py --help
python tools/dump_localstorage.py --help
python tools/dump_sessionstorage.py --help
- name: Calculate diff 👽
run: |
git diff --no-index --word-diff .\forensicsim-data\expected-result\jane_doe_v_1_4_00_11161.json jane_doe_v_1_4_00_11161.json
git diff --no-index --word-diff .\forensicsim-data\expected-result\john_doe_v_1_4_00_11161.json john_doe_v_1_4_00_11161.json
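The artifact-upload and `git diff --no-index` verification steps disappear from the workflow in this commit. For local spot checks, something roughly equivalent to the removed comparison can be done in a few lines of Python; this is an illustrative sketch, not part of the commit, and the paths simply mirror the file names used above.

```python
import json
from pathlib import Path

EXPECTED_DIR = Path("forensicsim-data/expected-result")


def matches_expected(name: str) -> bool:
    """Compare a generated JSON output against its expected counterpart."""
    with (EXPECTED_DIR / f"{name}.json").open(encoding="utf-8") as f:
        expected = json.load(f)
    with Path(f"{name}.json").open(encoding="utf-8") as f:
        actual = json.load(f)
    return expected == actual


for dataset in ("jane_doe_v_1_4_00_11161", "john_doe_v_1_4_00_11161"):
    print(dataset, "OK" if matches_expected(dataset) else "differs")
```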
7 changes: 7 additions & 0 deletions .pre-commit-config.yaml
@@ -25,6 +25,13 @@ repos:
- id: debug-statements
- id: end-of-file-fixer
- id: mixed-line-ending
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.8.0
hooks:
- id: mypy
# yaml requires additional stubs.
# Similar to: https://stackoverflow.com/a/73603491/5755604
additional_dependencies: ['types-PyYAML']
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.1.11
hooks:
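The new mirrors-mypy hook runs in an isolated environment, so third-party stub packages have to be declared as additional_dependencies; without `types-PyYAML`, type-checking a module that imports yaml typically fails. A small hypothetical illustration (error text is approximate):

```python
# Without the types-PyYAML stubs, mypy reports something along the lines of:
#   error: Library stubs not installed for "yaml"
# Declaring the stubs in the hook's additional_dependencies resolves this.
import yaml


def load_config(path: str) -> dict:
    """Hypothetical helper; any yaml-importing module triggers the stub requirement."""
    with open(path, encoding="utf-8") as f:
        return yaml.safe_load(f)
```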
19 changes: 17 additions & 2 deletions pyproject.toml
@@ -10,13 +10,13 @@ readme = "README.md"
license = {file = "LICENSE.md"}
requires-python = ">=3.9"
authors = [
{ name = "Alexander Bilz", email = "github@markusbilz.com" },
{ name = "Alexander Bilz", email = "github@alexbilz.com" },
{ name = "Markus Bilz", email = "github@markusbilz.com" }
]
dependencies = [
"beautifulsoup4",
"click",
"chromedb @ git+https://github.com/karelze/ccl_chrome_indexeddb@master",
"chromedb @ git+https://github.com/lxndrblz/chromedb@main",
"dataclasses-json",
"pause",
"pyautogui",
@@ -36,10 +36,24 @@ version = {attr = "forensicsim.__version__"}
dev=[
"build",
"pre-commit",
"mypy",
"ruff",
"tox",
]

[tool.mypy]

python_version = "3.9"

exclude = [
"tools/**.py"
]

# https://github.com/python/mypy/issues/2410
ignore_missing_imports = true
disallow_untyped_defs = true
disallow_untyped_calls = true
disallow_incomplete_defs = true

[tool.ruff]

@@ -105,6 +119,7 @@ commands =
# Syntax Checks
[testenv:lint]
commands =
python -m mypy src/forensicsim/backend.py
python -m ruff --output-format=github src
python -m ruff format src --check
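With `disallow_untyped_defs`, `disallow_untyped_calls`, and `disallow_incomplete_defs` enabled, every function mypy checks in `src/forensicsim/backend.py` has to carry full annotations — which is why the signatures in the backend.py diff below gain `Path`, `list[dict[str, Any]]`, and `-> None` annotations. Locally, the same check runs via `tox -e lint` or `python -m mypy src/forensicsim/backend.py`. A tiny hypothetical illustration of what the strict flags reject and accept (error text is approximate):

```python
from pathlib import Path


def read_raw(path):  # rejected: "Function is missing a type annotation"
    return path.read_bytes()


def read_raw_typed(path: Path) -> bytes:  # accepted: fully annotated
    return path.read_bytes()
```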
242 changes: 41 additions & 201 deletions src/forensicsim/backend.py
@@ -22,26 +22,20 @@
SOFTWARE.
"""

import io
import json
import os
from pathlib import Path
from typing import Any, Optional

from chromedb import (
ccl_blink_value_deserializer,
ccl_chromium_indexeddb,
ccl_chromium_localstorage,
ccl_chromium_sessionstorage,
ccl_leveldb,
ccl_v8_value_deserializer,
)
from chromedb.ccl_chromium_indexeddb import (
DatabaseMetadataType,
ObjectStoreMetadataType,
)

TEAMS_DB_OBJECT_STORES = ["replychains", "conversations", "people", "buddylist"]

ENCODING = "iso-8859-1"

"""
The following code is heavily adopted from the RawLevelDb and IndexedDB processing proposed by CCL Group
@@ -53,197 +47,52 @@
"""


class FastIndexedDB:
def __init__(self, leveldb_dir: os.PathLike):
self._db = ccl_leveldb.RawLevelDb(leveldb_dir)
self._fetched_records = []
self.global_metadata = None
self.database_metadata = None
self.object_store_meta = None
self.fetch_data()

def fetch_data(self):
global_metadata_raw = {}

database_metadata_raw = {}
objectstore_metadata_raw = {}

self._fetched_records = []
# Fetch the records only once
for record in self._db.iterate_records_raw():
self._fetched_records.append(record)

for record in self._fetched_records:
# Global Metadata
if (
record.key.startswith(b"\x00\x00\x00\x00")
and record.state == ccl_leveldb.KeyState.Live
) and (
record.key not in global_metadata_raw
or global_metadata_raw[record.key].seq < record.seq
):
global_metadata_raw[record.key] = record

# Convert the raw metadata to a nice GlobalMetadata Object
global_metadata = ccl_chromium_indexeddb.GlobalMetadata(global_metadata_raw)

# Loop through the database IDs
for db_id in global_metadata.db_ids:
if db_id.dbid_no == None:
continue

if db_id.dbid_no > 0x7F:
raise NotImplementedError(
"there could be this many dbs, but I don't support it yet"
)

# Database keys end with 0
prefix_database = bytes([0, db_id.dbid_no, 0, 0])

# Objetstore keys end with 50
prefix_objectstore = bytes([0, db_id.dbid_no, 0, 0, 50])

for record in reversed(self._fetched_records):
if (
record.key.startswith(prefix_database)
and record.state == ccl_leveldb.KeyState.Live
):
# we only want live keys and the newest version thereof (highest seq)
meta_type = record.key[len(prefix_database)]
old_version = database_metadata_raw.get((db_id.dbid_no, meta_type))
if old_version is None or old_version.seq < record.seq:
database_metadata_raw[(db_id.dbid_no, meta_type)] = record
if (
record.key.startswith(prefix_objectstore)
and record.state == ccl_leveldb.KeyState.Live
):
# we only want live keys and the newest version thereof (highest seq)
try:
(
objstore_id,
varint_raw,
) = ccl_chromium_indexeddb.custom_le_varint_from_bytes(
record.key[len(prefix_objectstore) :]
)
except TypeError:
continue

meta_type = record.key[len(prefix_objectstore) + len(varint_raw)]

old_version = objectstore_metadata_raw.get((
db_id.dbid_no,
objstore_id,
meta_type,
))
def parse_db(
filepath: Path,
blobpath: Optional[Path] = None,
do_not_filter: Optional[bool] = False,
) -> list[dict[str, Any]]:
# Open raw access to a LevelDB and deserialize the records.

if old_version is None or old_version.seq < record.seq:
objectstore_metadata_raw[
(db_id.dbid_no, objstore_id, meta_type)
] = record
wrapper = ccl_chromium_indexeddb.WrappedIndexDB(filepath, blobpath)

self.global_metadata = global_metadata
self.database_metadata = ccl_chromium_indexeddb.DatabaseMetadata(
database_metadata_raw
)
self.object_store_meta = ccl_chromium_indexeddb.ObjectStoreMetadata(
objectstore_metadata_raw
)
extracted_values = []

def get_database_metadata(self, db_id: int, meta_type: DatabaseMetadataType):
return self.database_metadata.get_meta(db_id, meta_type)
for db_info in wrapper.database_ids:
# Skip databases without a valid dbid_no
if db_info.dbid_no is None:
continue

def get_object_store_metadata(
self, db_id: int, obj_store_id: int, meta_type: ObjectStoreMetadataType
):
return self.object_store_meta.get_meta(db_id, obj_store_id, meta_type)
db = wrapper[db_info.dbid_no]

def iterate_records(self, do_not_filter=False):
blink_deserializer = ccl_blink_value_deserializer.BlinkV8Deserializer()
# Loop through the databases and object stores based on their ids
for global_id in self.global_metadata.db_ids:
# print(f"Processing database: {global_id.name}")
if global_id.dbid_no == None:
print(f"WARNING: Skipping database {global_id.name}")
for obj_store_name in db.object_store_names:
# Skip empty object stores
if obj_store_name is None:
continue

for object_store_id in range(
1,
self.database_metadata.get_meta(
global_id.dbid_no, DatabaseMetadataType.MaximumObjectStoreId
)
+ 1,
):
datastore = self.object_store_meta.get_meta(
global_id.dbid_no,
object_store_id,
ObjectStoreMetadataType.StoreName,
)

# print(f"\t Processing object store: {datastore}")
if obj_store_name in TEAMS_DB_OBJECT_STORES or do_not_filter is False:
obj_store = db[obj_store_name]
records_per_object_store = 0
if datastore in TEAMS_DB_OBJECT_STORES or do_not_filter:
prefix = bytes([0, global_id.dbid_no, object_store_id, 1])
for record in self._fetched_records:
if record.key.startswith(prefix):
records_per_object_store += 1
# Skip records with empty values as these cant properly decoded
if record.value == b"":
continue
(
_value_version,
varint_raw,
) = ccl_chromium_indexeddb.custom_le_varint_from_bytes(
record.value
)
val_idx = len(varint_raw)
# read the blink envelope
blink_type_tag = record.value[val_idx]
if blink_type_tag != 0xFF:
print("Blink type tag not present")
val_idx += 1

(
_,
varint_raw,
) = ccl_chromium_indexeddb.custom_le_varint_from_bytes(
record.value[val_idx:]
)

val_idx += len(varint_raw)

# read the raw value of the record.
obj_raw = io.BytesIO(record.value[val_idx:])
try:
# Initialize deserializer and try deserialization.
deserializer = ccl_v8_value_deserializer.Deserializer(
obj_raw,
host_object_delegate=blink_deserializer.read,
)
value = deserializer.read()
yield {
"key": record.key,
"value": value,
"origin_file": record.origin_file,
"store": datastore,
"state": record.state,
"seq": record.seq,
}
except Exception:
# TODO Some proper error handling wouldn't hurt
continue
# print(f"{datastore} {global_id.name} {records_per_object_store}")


def parse_db(filepath, do_not_filter=False):
# Open raw access to a LevelDB and deserialize the records.
db = FastIndexedDB(filepath)
extracted_values = []
for record in db.iterate_records(do_not_filter):
extracted_values.append(record)
for record in obj_store.iterate_records():
records_per_object_store += 1
sourcefile = str(filepath)
# TODO: Fix None values
state = None
seq = None
extracted_values.append({
"key": record.key.raw_key,
"value": record.value,
"origin_file": sourcefile,
"store": obj_store_name,
"state": state,
"seq": seq,
})
print(
f"{obj_store_name} {db.name} (Records: {records_per_object_store})"
)
return extracted_values


def parse_localstorage(filepath):
def parse_localstorage(filepath: Path) -> list[dict[str, Any]]:
local_store = ccl_chromium_localstorage.LocalStoreDb(filepath)
extracted_values = []
for record in local_store.iter_all_records():
@@ -254,7 +103,7 @@ def parse_localstorage(filepath):
return extracted_values


def parse_sessionstorage(filepath):
def parse_sessionstorage(filepath: Path) -> list[dict[str, Any]]:
session_storage = ccl_chromium_sessionstorage.SessionStoreDb(filepath)
extracted_values = []
for host in session_storage:
@@ -275,7 +124,7 @@ def parse_sessionstorage(filepath):
return extracted_values


def write_results_to_json(data, outputpath):
def write_results_to_json(data: list[dict[str, Any]], outputpath: Path) -> None:
# Dump messages into a json file
try:
with open(outputpath, "w", encoding="utf-8") as f:
@@ -284,12 +133,3 @@ def write_results_to_json(data, outputpath):
)
except OSError as e:
print(e)


def parse_json():
# read data from a file. This is only for testing purpose.
try:
with Path("teams.json").open() as json_file:
return json.load(json_file)
except OSError as e:
print(e)
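After the restructuring, the module's public surface is essentially parse_db, parse_localstorage, parse_sessionstorage, and write_results_to_json. A hedged usage sketch for the IndexedDB path — placeholder locations, with the optional companion `.blob` directory passed alongside the `.leveldb` folder:

```python
from pathlib import Path

from forensicsim.backend import parse_db, write_results_to_json

# Placeholder acquisition paths -- substitute a real capture.
leveldb_dir = Path("IndexedDB/https_teams.live.com_0.indexeddb.leveldb")
blob_dir = Path("IndexedDB/https_teams.live.com_0.indexeddb.blob")  # optional

records = parse_db(leveldb_dir, blob_dir, do_not_filter=False)
write_results_to_json(records, Path("teams_records.json"))
```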