From d226f89760488a010a903638d5d7c53de46ffc72 Mon Sep 17 00:00:00 2001
From: ppinchuk
Date: Thu, 20 Nov 2025 14:43:33 -0700
Subject: [PATCH] Added test_utilities_finalize.py exercising metadata
 persistence, document aggregation, DB splitting, and summary helpers to
 fully validate finalize.py.

---
 .../unit/utilities/test_utilities_finalize.py | 407 ++++++++++++++++++
 1 file changed, 407 insertions(+)
 create mode 100644 tests/python/unit/utilities/test_utilities_finalize.py

diff --git a/tests/python/unit/utilities/test_utilities_finalize.py b/tests/python/unit/utilities/test_utilities_finalize.py
new file mode 100644
index 000000000..2d01fca3a
--- /dev/null
+++ b/tests/python/unit/utilities/test_utilities_finalize.py
@@ -0,0 +1,407 @@
+"""Test COMPASS finalize utilities"""

import json
from datetime import datetime, timedelta
from pathlib import Path
from types import SimpleNamespace

import pandas as pd
import pytest

from compass.utilities import finalize


class DummyModelConfig:
    """Lightweight LLM model config for grouping tests"""

    def __init__(
        self,
        *,
        name,
        llm_call_kwargs=None,
        llm_service_rate_limit,
        text_splitter_chunk_size,
        text_splitter_chunk_overlap,
        client_type,
    ):
        self.name = name
        self.llm_call_kwargs = llm_call_kwargs or {}
        self.llm_service_rate_limit = llm_service_rate_limit
        self.text_splitter_chunk_size = text_splitter_chunk_size
        self.text_splitter_chunk_overlap = text_splitter_chunk_overlap
        self.client_type = client_type

    def __hash__(self):
        return id(self)


def test_save_run_meta_writes_meta_file(tmp_path, monkeypatch):
    """Save run metadata with populated manifest entries"""

    logs = tmp_path / "logs"
    clean_files = tmp_path / "clean"
    jurisdiction_dbs = tmp_path / "jurisdictions"
    ordinance_files = tmp_path / "ordinances"
    for path in (logs, clean_files, jurisdiction_dbs, ordinance_files):
        path.mkdir()

    (tmp_path / "usage.json").write_text("{}", encoding="utf-8")
    (tmp_path / "jurisdictions.json").write_text("{}", encoding="utf-8")
    (tmp_path / "quantitative_ordinances.csv").write_text(
        "header\n",
        encoding="utf-8",
    )

    dirs = SimpleNamespace(
        logs=logs,
        clean_files=clean_files,
        jurisdiction_dbs=jurisdiction_dbs,
        ordinance_files=ordinance_files,
        out=tmp_path,
    )

    monkeypatch.setattr(finalize.getpass, "getuser", lambda: "testuser")

    model = DummyModelConfig(
        name="gpt",
        llm_call_kwargs={"temperature": 0},
        llm_service_rate_limit=5,
        text_splitter_chunk_size=1000,
        text_splitter_chunk_overlap=200,
        client_type="openai",
    )

    start = datetime(2025, 1, 1, 12, 0, 0)
    end = start + timedelta(hours=1, minutes=2, seconds=3)

    seconds = finalize.save_run_meta(
        dirs,
        "wind",
        start,
        end,
        num_jurisdictions_searched=4,
        num_jurisdictions_found=2,
        total_cost=12.34,
        models={"task": model},
    )

    assert seconds == 3723

    meta = json.loads((tmp_path / "meta.json").read_text(encoding="utf-8"))
    assert meta["username"] == "testuser"
    assert meta["technology"] == "wind"
    assert meta["total_time"] == 3723
    assert meta["total_time_string"] == str(end - start)
    assert meta["cost"] == 12.34

    manifest = meta["manifest"]
    assert manifest["LOG_DIR"] == "logs"
    assert manifest["CLEAN_FILE_DIR"] == "clean"
    assert manifest["JURISDICTION_DBS_DIR"] == "jurisdictions"
    assert manifest["ORDINANCE_FILES_DIR"] == "ordinances"
    assert manifest["USAGE_FILE"] == "usage.json"
    assert manifest["JURISDICTION_FILE"] == "jurisdictions.json"
    assert 
manifest["QUAL_DATA_FILE"] == "quantitative_ordinances.csv" + assert manifest["META_FILE"] == "meta.json" + + model_info = meta["models"][0] + assert model_info["name"] == "gpt" + assert model_info["tasks"] == ["task"] + assert model_info["llm_call_kwargs"] == {"temperature": 0} + + +def test_save_run_meta_handles_getuser_error(tmp_path, monkeypatch): + """Fallback to unknown username when getpass fails""" + + def _raise_os_error(): + raise OSError("unavailable") + + monkeypatch.setattr(finalize.getpass, "getuser", _raise_os_error) + + dirs = SimpleNamespace( + logs=tmp_path / "missing_logs", + clean_files=tmp_path / "missing_clean", + jurisdiction_dbs=tmp_path / "missing_jurisdictions", + ordinance_files=tmp_path / "missing_ordinances", + out=tmp_path, + ) + + start = datetime(2025, 1, 1, 0, 0, 0) + end = start + timedelta(days=1, seconds=42) + + seconds = finalize.save_run_meta( + dirs, + "solar", + start, + end, + num_jurisdictions_searched=0, + num_jurisdictions_found=0, + total_cost=0, + models={}, + ) + + assert seconds == 42 + + meta = json.loads((tmp_path / "meta.json").read_text(encoding="utf-8")) + assert meta["username"] == "Unknown" + assert meta["cost"] is None + assert meta["manifest"]["LOG_DIR"] is None + assert meta["models"] == [] + + +def test_doc_infos_to_db_empty_input(): + """No documents returns empty DataFrame""" + + db, count = finalize.doc_infos_to_db([]) + assert count == 0 + assert db.empty + assert list(db.columns) == finalize._PARSED_COLS + + +def test_doc_infos_to_db_compiles_and_formats(tmp_path): + """Compile document info into formatted DataFrame""" + + empty_csv = tmp_path / "empty.csv" + pd.DataFrame(columns=["feature", "summary"]).to_csv(empty_csv, index=False) + + valid_csv = tmp_path / "valid.csv" + pd.DataFrame( + [ + { + "feature": "Height Limit", + "summary": "Maximum 100 ft", + "value": 100, + "units": "ft", + "adder": 300, + } + ] + ).to_csv(valid_csv, index=False) + + jurisdiction = SimpleNamespace( + code="12345", + county="Example", + state="EX", + subdivision_name="Example Township", + type="county", + ) + + doc_infos = [ + None, + {"ord_db_fp": None}, + { + "ord_db_fp": empty_csv, + "source": "http://example.com/empty", + "date": (2020, 1, 1), + "jurisdiction": jurisdiction, + }, + { + "ord_db_fp": valid_csv, + "source": "http://example.com/valid", + "date": (2022, 3, 4), + "jurisdiction": jurisdiction, + }, + ] + + db, count = finalize.doc_infos_to_db(doc_infos) + assert count == 1 + assert len(db) == 1 + + row = db.iloc[0] + assert row["source"] == "http://example.com/valid" + assert row["ord_year"] == 2022 + assert row["FIPS"] == "12345" + assert bool(row["quantitative"]) is True + assert pd.isna(row["adder"]) + + +def test_save_db_writes_csvs(tmp_path): + """Split qualitative and quantitative outputs""" + + row_true = dict.fromkeys(finalize._PARSED_COLS) + row_true.update( + { + "county": "County A", + "state": "ST", + "subdivision": "Subdivision", + "jurisdiction_type": "county", + "FIPS": "00001", + "feature": "Height", + "value": 100, + "units": "ft", + "summary": "Maximum height", + "ord_year": 2020, + "source": "http://source", + "quantitative": True, + } + ) + + row_false = row_true.copy() + row_false.update( + { + "feature": "Setback", + "summary": "Setback distance", + "quantitative": False, + } + ) + + df = pd.DataFrame([row_true, row_false]) + finalize.save_db(df, tmp_path) + + quant_path = tmp_path / "quantitative_ordinances.csv" + qual_path = tmp_path / "qualitative_ordinances.csv" + assert quant_path.exists() + assert 
qual_path.exists() + + quant = pd.read_csv(quant_path) + qual = pd.read_csv(qual_path) + assert list(quant.columns) == finalize.QUANT_OUT_COLS + assert len(quant) == 1 + assert list(qual.columns) == finalize.QUAL_OUT_COLS + assert len(qual) == 1 + assert quant.iloc[0]["feature"] == "Height" + assert qual.iloc[0]["feature"] == "Setback" + + +def test_save_db_with_empty_df(tmp_path): + """Do nothing when DataFrame is empty""" + + empty_df = pd.DataFrame(columns=finalize._PARSED_COLS) + finalize.save_db(empty_df, tmp_path) + assert not (tmp_path / "qualitative_ordinances.csv").exists() + assert not (tmp_path / "quantitative_ordinances.csv").exists() + + +def test_db_results_populates_jurisdiction_fields(): + """Populate DataFrame fields from jurisdiction metadata""" + + base_df = pd.DataFrame( + [ + { + "feature": "Height", + "summary": "Max height", + } + ] + ) + jurisdiction = SimpleNamespace( + code="54321", + county="County B", + state="SB", + subdivision_name="Subdivision B", + type="city", + ) + doc_info = { + "source": "http://example.com", + "date": (2021, 5, 6), + "jurisdiction": jurisdiction, + } + + result = finalize._db_results(base_df.copy(), doc_info) + row = result.iloc[0] + assert row["source"] == "http://example.com" + assert row["ord_year"] == 2021 + assert row["FIPS"] == "54321" + assert row["county"] == "County B" + assert row["jurisdiction_type"] == "city" + + +def test_empirical_adjustments_caps_adder(): + """Clamp adder values above empirical limit""" + + db = pd.DataFrame({"adder": [300, 150]}) + adjusted = finalize._empirical_adjustments(db.copy()) + assert pd.isna(adjusted.loc[0, "adder"]) + assert adjusted.loc[1, "adder"] == 150 + + no_adder_df = pd.DataFrame({"feature": ["Height"]}) + adjusted_no_adder = finalize._empirical_adjustments(no_adder_df.copy()) + pd.testing.assert_frame_equal(no_adder_df, adjusted_no_adder) + + +def test_formatted_db_adds_missing_columns(): + """Ensure formatted DataFrame contains expected columns""" + + df = pd.DataFrame( + [ + { + "feature": "Height", + "summary": "Max height", + } + ] + ) + formatted = finalize._formatted_db(df) + assert list(formatted.columns) == finalize._PARSED_COLS + assert len(formatted) == 1 + assert bool(formatted.iloc[0]["quantitative"]) is True + + +def test_extract_model_info_from_all_models_groups_tasks(): + """Group tasks by shared model configuration""" + + shared_model = DummyModelConfig( + name="gpt", + llm_call_kwargs={}, + llm_service_rate_limit=3, + text_splitter_chunk_size=1500, + text_splitter_chunk_overlap=100, + client_type="openai", + ) + other_model = DummyModelConfig( + name="gpt-4", + llm_call_kwargs={"temperature": 0.2}, + llm_service_rate_limit=1, + text_splitter_chunk_size=1200, + text_splitter_chunk_overlap=50, + client_type="azure", + ) + + models = { + "task_one": shared_model, + "task_two": shared_model, + "task_three": other_model, + } + + info = finalize._extract_model_info_from_all_models(models) + assert len(info) == 2 + + first, second = info + assert first["name"] == "gpt" + assert first["tasks"] == ["task_one", "task_two"] + assert first["llm_call_kwargs"] is None + + assert second["name"] == "gpt-4" + assert second["tasks"] == ["task_three"] + assert second["llm_call_kwargs"] == {"temperature": 0.2} + + +def test_compile_run_summary_message_includes_cost(tmp_path): + """Include cost details when provided""" + + message = finalize.compile_run_summary_message(3661, 42.5, tmp_path, 3) + assert "Total runtime: 1:01:01" in message + assert "Total cost" in message + assert "$42.50" 
in message + assert "Number of documents found: 3" in message + + +def test_compile_run_summary_message_without_cost(tmp_path): + """Omit cost line when not provided""" + + message = finalize.compile_run_summary_message(5, None, tmp_path, 0) + assert "Total cost" not in message + assert "Total runtime: 0:00:05" in message + + +def test_elapsed_time_as_str_basic(): + """Format elapsed time without days""" + + assert finalize._elapsed_time_as_str(65) == "0:01:05" + + +def test_elapsed_time_as_str_with_days(): + """Format elapsed time spanning days""" + + assert finalize._elapsed_time_as_str(90061) == "1 day, 1:01:01" + + +if __name__ == "__main__": + pytest.main(["-q", "--show-capture=all", Path(__file__), "-rapP"])
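
Notes on the behaviour these tests pin down (illustrative sketches, not part of the patch):

The save_db tests only assert the observable contract: a combined ordinance table is
split on its boolean "quantitative" column and written to quantitative_ordinances.csv
and qualitative_ordinances.csv with the QUANT_OUT_COLS / QUAL_OUT_COLS column sets,
and nothing is written for an empty frame. A minimal sketch of that contract, assuming
the column lists are plain lists of existing columns (this is not the actual
compass.utilities.finalize.save_db implementation):

from pathlib import Path

import pandas as pd


def split_and_save(db: pd.DataFrame, out_dir: Path, quant_cols, qual_cols):
    """Illustrative sketch of the save_db contract exercised above."""
    if db.empty:
        return  # mirrors test_save_db_with_empty_df: no CSVs are created
    is_quant = db["quantitative"].astype(bool)
    db.loc[is_quant, quant_cols].to_csv(
        out_dir / "quantitative_ordinances.csv", index=False
    )
    db.loc[~is_quant, qual_cols].to_csv(
        out_dir / "qualitative_ordinances.csv", index=False
    )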
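Similarly, test_extract_model_info_from_all_models_groups_tasks asserts only the
grouping behaviour: tasks that share one model-config object collapse into a single
record listing all of their task names. One way such grouping can be expressed, purely
as an illustration (the real helper in finalize.py may differ, for example in how an
empty llm_call_kwargs ends up reported as None):

from collections import defaultdict


def group_tasks_by_model(models):
    """Illustrative grouping of task names by shared model config object."""
    tasks_by_model = defaultdict(list)
    for task_name, model in models.items():
        # DummyModelConfig hashes by identity, so the same instance lands in
        # the same bucket while distinct configs stay separate.
        tasks_by_model[model].append(task_name)
    return [
        {"name": model.name, "tasks": task_names}
        for model, task_names in tasks_by_model.items()
    ]

With the shared_model/other_model fixtures above this yields two records, the first
listing ["task_one", "task_two"].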
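Finally, the runtime strings asserted throughout ("0:01:05", "1:01:01",
"1 day, 1:01:01", and the 3723-second total) match Python's built-in timedelta
formatting, which is presumably what _elapsed_time_as_str builds on; the fixture
arithmetic can be sanity-checked in isolation:

from datetime import timedelta

# Second counts used in the tests above and the strings they are expected to
# render as; str(timedelta(...)) is the reference formatting assumed here.
cases = {
    65: "0:01:05",            # test_elapsed_time_as_str_basic
    3661: "1:01:01",          # runtime in the cost-summary test
    3723: "1:02:03",          # 1 h 2 min 3 s elapsed in the meta-file test
    90061: "1 day, 1:01:01",  # test_elapsed_time_as_str_with_days
}

for seconds, expected in cases.items():
    assert str(timedelta(seconds=seconds)) == expected, (seconds, expected)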