Skip to content

Commit

Permalink
Merge pull request #224 from NREL/wenyi/comstock-check-sg-meta
Browse files Browse the repository at this point in the history
[WIP] Check the metadata in the postprocess when the pl.data in the memory
  • Loading branch information
rHorsey authored Oct 16, 2024
2 parents 92f46d3 + 191d216 commit 1f5866c
Showing 1 changed file with 64 additions and 0 deletions.
64 changes: 64 additions & 0 deletions postprocessing/comstockpostproc/comstock.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import polars as pl
import re
import datetime
import pytest

from comstockpostproc.naming_mixin import NamingMixin
from comstockpostproc.units_mixin import UnitsMixin
Expand Down Expand Up @@ -226,6 +227,7 @@ def __init__(self, s3_base_dir, comstock_run_name, comstock_run_version, comstoc
self.cached_parquet.append((upgrade_id, file_path)) #cached_parquet is a list of parquets used to export and reload
logger.info(f'Exporting to: {file_path}')
self.data = self.reorder_data_columns(self.data)
self._sightGlass_metadata_check(self.data)
self.data.write_parquet(file_path)
up_lazyframes.append(pl.scan_parquet(file_path))

Expand Down Expand Up @@ -2879,3 +2881,65 @@ def export_data_and_enumeration_dictionary(self):
file_path = os.path.abspath(os.path.join(self.output_dir, file_name))
logger.info(f'Exporting enumeration dictionary to: {file_path}')
enum_dictionary.write_csv(file_path, separator='\t')


def _sightGlass_metadata_check(self, row_segment: pl.DataFrame):
# Check that the metadata columns are present in the data
# when the columns are in memory
err_log = ""

#check no na values in any columns
if row_segment.null_count().pipe(sum).item() > 0:
err_log += 'Null values found in data\n'
for c in row_segment.columns:
if c.startswith("out.qoi.") or c.startswith("out.utility_bills.") or c.startswith('applicability.upgrade_add_pvwatts'):
continue
if row_segment[c].null_count() > 0:
err_log += f'Column {c} has null values\n'

SIGHTGLASS_REQUIRED_COLS = [self.BLDG_ID, self.META_IDX, self.UPGRADE_ID,
self.BLDG_WEIGHT, self.UPGRADE_APPL, self.FLR_AREA]

for col in SIGHTGLASS_REQUIRED_COLS:
if col not in row_segment.columns:
err_log += f'{col} not found in data, which is needed for sightglass\n'

#Skip pattern, may need delete later:
pattern = r'out\.electricity\.total\.[a-zA-Z]{3}\.energy_consumption'

for c in row_segment.columns:
if re.search('[^a-z0-9._]', c):
# (f'Column {c} violates name rules: may only contain . _ 0-9 lowercaseletters (no spaces)')
err_log += f'Column {c} violates name rules: may only contain . _ 0-9 lowercaseletters (no spaces)\n'

#Actually that's the perfect case to use regex to check the summary.
TOTAL_PATTERN = r'out\.([a-zA-Z_]+)\.total\.energy_consumption\.\.kwh'
ENDUSE_PATTERN = r'out\.([a-zA-Z_]+)\.(?!total)([a-zA-Z_]+)\.energy_consumption\.\.kwh'
MONTH_PATTERN = r'out\.electricity\.total\.([a-zA-Z]{3})\.energy_consumption'

#Find the sum of total culmns for each type fuels, and for each fuel type find the sum of different
#enduse columns. And record them in a dictionary like: {fuel_type: total_energy}
fuel_total, end_use_total, month_total = {}, {}, {}
for c in row_segment.columns:
if re.match(TOTAL_PATTERN, c):
fuel_type = re.match(TOTAL_PATTERN, c).group(1)
fuel_total[fuel_type] = row_segment[c].sum()
elif re.match(ENDUSE_PATTERN, c):
fuel_type = re.match(ENDUSE_PATTERN, c).group(1)
end_use_total[fuel_type] = end_use_total.get(fuel_type, 0) + row_segment[c].sum()
elif re.match(MONTH_PATTERN, c):
month = re.match(MONTH_PATTERN, c).group(1)
month_total[month] = row_segment[c].sum()

logger.info(f"Fuel total: {fuel_total}, Enduse total: {end_use_total}, Month total: {month_total}")
# Check that the total site energy is the sum of the fuel totals
for fuel, total in end_use_total.items():
if not total == pytest.approx(fuel_total[fuel], rel=0.001):
err_log += f'Fuel total for {fuel} does not match sum of enduse columns\n'
if not sum(fuel_total.values()) == pytest.approx(row_segment[self.ANN_TOT_ENGY_KBTU].sum(), rel=0.001):
err_log += 'Site total does not match sum of fuel totals\n'
if not sum(month_total.values()) == pytest.approx(row_segment[self.ANN_TOT_ELEC_KBTU].sum(), rel=0.01):
err_log += 'Electricity total does not match sum of month totals\n'

if err_log:
raise ValueError(err_log)

0 comments on commit 1f5866c

Please sign in to comment.