Merge pull request #468 from johnpaulalex/retro_date

First look at script to extract data from expected.zip and update the…
ProjectDrawdown · Sep 28, 2021 · d95ad29 · d95ad29
2 parents 09ffd1d + f51e9ba
commit d95ad29
Show file tree

Hide file tree

Showing 2 changed files with 85 additions and 0 deletions.
diff --git a/tools/_README.md b/tools/_README.md
@@ -17,6 +17,7 @@ Core extraction code:
  * world_data_xls_extract.py
 
 Support code:
+ * copy_expected_to_scenario.py: Template for adding additional datapoints to scenarios via extracting from expected.zip
  * util<area>.py:  Utilities for reading Excel workbooks
  * rrs<area>.py: Copied here from ../solution/rrs.py because python hates relative dependencies.
 

diff --git a/tools/copy_expected_to_scenario.py b/tools/copy_expected_to_scenario.py
@@ -0,0 +1,84 @@
+"""Template for extracting data from the expected csv's back into scenario json.
+"""
+import datetime
+import glob
+import os
+import json
+import pathlib
+
+import pandas as pd
+import zipfile
+import importlib
+
+from solution import factory
+from tools import solution_xls_extract as sxe
+
+# Note that this timestamp could be one second off from the original Excel because our exported 
+# float precision is a bit too low. We export floats like
+# 43782.55755787 whereas one second in days = 1/86400 = 
+#      .000011574074074074073...
+# so rounding errors can occur. 
+def from_excel_timestamp(days_float, _epoch0=datetime.datetime(1899, 12, 31)):
+    if days_float >= 60:
+        days_float -= 1  # Excel leap year bug, 1900 is not a leap year!
+    return (_epoch0 + datetime.timedelta(days=days_float)).replace(microsecond=0)
+
+def find_scenario_in_record(df_expected, scenario_name):
+    """Find scenario start via 'Name of Scenario' row (or None if not found)."""
+    name_header_col = df_expected[:][3]
+    names_col = df_expected[:][4]
+    scenario_start_row = None
+    for row_idx, val in enumerate(name_header_col):
+        if val == 'Name of Scenario:' and names_col[row_idx] == scenario_name:
+            scenario_start_row = row_idx
+            break
+    return scenario_start_row
+
+def copy_expected_to_ac_json(solution_basedir, solution_name):
+    solution_dir = os.path.join(solution_basedir, solution_name)
+    expected_filename = os.path.join(solution_dir, 'tests', 'expected.zip')
+    ac_json_glob = os.path.join(solution_dir, 'ac', '*.json')
+    ac_jsons = glob.glob(ac_json_glob)
+    with zipfile.ZipFile(expected_filename) as zf:
+        for ac_json in ac_jsons:
+            print(ac_json)
+            jsonfile = pathlib.Path(ac_json).resolve()
+            d = json.loads( jsonfile.read_text(encoding='utf-8') )
+            scenario_name = d['name']
+
+            # Read exported tab for the scenario from expected.zip. Note that the ScenarioRecord tab is especially
+            # confusing: there's a separate exported file for each scenario, and each one holds _all_ the scenarios. We conservatively read only scenario x's
+            # rows from scenario x's ScenarioRecord in case the exports happened
+            # at different times.
+            sr_file = zf.open(scenario_name + "/" + 'ScenarioRecord')
+            df_expected = pd.read_csv(sr_file, header=None, na_values=['#REF!', '#DIV/0!', '#VALUE!', '(N/A)'])
+
+            scenario_start_row = find_scenario_in_record(df_expected, scenario_name)
+            assert scenario_start_row is not None, f'Could not find rows for scenario {scenario_name} in {sr_file}'
+
+            # Actually extract the datapoint we want and update the json dict.
+            # Creation date could be a date string or an excel float, handle both.
+            date_format = "%Y-%m-%d %H:%M:%S"
+            creation_date_str = df_expected.iloc[scenario_start_row,1]
+            if ':' in creation_date_str and '-' in creation_date_str:
+                creation_date = datetime.datetime.strptime(creation_date_str, date_format)
+            else:
+                creation_date_float = float(creation_date_str)
+                creation_date = from_excel_timestamp(creation_date_float)
+            d['creation_date'] = creation_date.strftime(date_format)
+
+            # Rewrite the json.            
+            sxe.write_scenario(jsonfile, d)
+    return ac_jsons
+
+if __name__ == "__main__":
+    # Run over all solutions. 
+    solution_basedir = pathlib.Path(__file__).parents[1]/"solution"
+    all_jsons_modified = []
+    for solution_name in sorted(factory.all_solutions()):
+        if solution_name == 'hfc_replacement':
+            # subdir PDS2-82p2050-Median\ has a space at the end. TODO
+            continue
+        all_jsons_modified.extend(copy_expected_to_ac_json(solution_basedir, solution_name))
+    print('Rewrote', len(all_jsons_modified), 'json files.')
+