Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Check the metadata during postprocessing while the data is still in memory #224

Merged
merged 3 commits into from
Oct 16, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 64 additions & 0 deletions postprocessing/comstockpostproc/comstock.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import polars as pl
import re
import datetime
import pytest

from comstockpostproc.naming_mixin import NamingMixin
from comstockpostproc.units_mixin import UnitsMixin
Expand Down Expand Up @@ -226,6 +227,7 @@ def __init__(self, s3_base_dir, comstock_run_name, comstock_run_version, comstoc
self.cached_parquet.append((upgrade_id, file_path)) #cached_parquet is a list of parquets used to export and reload
logger.info(f'Exporting to: {file_path}')
self.data = self.reorder_data_columns(self.data)
self._sightGlass_metadata_check(self.data)
self.data.write_parquet(file_path)
up_lazyframes.append(pl.scan_parquet(file_path))

Expand Down Expand Up @@ -2879,3 +2881,65 @@ def export_data_and_enumeration_dictionary(self):
file_path = os.path.abspath(os.path.join(self.output_dir, file_name))
logger.info(f'Exporting enumeration dictionary to: {file_path}')
enum_dictionary.write_csv(file_path, separator='\t')


def _sightGlass_metadata_check(self, row_segment: pl.DataFrame):
    """Validate an in-memory results table before it is exported.

    Every problem found is collected and reported together in one
    exception, so a single run surfaces all issues. Checks performed:

    * null values anywhere in the table, with a per-column message for
      each offending column outside the exempt prefixes;
    * presence of the columns in ``SIGHTGLASS_REQUIRED_COLS``;
    * column names contain only lowercase letters, digits, ``.`` and ``_``;
    * for each fuel, the sum of its end-use columns matches the fuel's
      total column (relative tolerance 0.001);
    * the sum of the fuel totals matches ``self.ANN_TOT_ENGY_KBTU``
      (relative tolerance 0.001);
    * the sum of the monthly electricity columns matches
      ``self.ANN_TOT_ELEC_KBTU`` (relative tolerance 0.01).

    Args:
        row_segment: The table to validate.

    Raises:
        ValueError: If any check fails; the message has one line per problem.
    """

    def _approx_equal(actual, expected, rel):
        # Same tolerance rule as ``pytest.approx(expected, rel=rel)``:
        # exact equality short-circuits, an infinite expected value only
        # matches exactly, otherwise the allowed error is
        # max(rel * |expected|, 1e-12). Implemented locally so production
        # code does not depend on a test framework at runtime.
        if actual == expected:
            return True
        if abs(expected) == float('inf'):
            return False
        return abs(actual - expected) <= max(rel * abs(expected), 1e-12)

    errors = []
    columns = row_segment.columns

    # Columns with these prefixes do not get a per-column null message.
    # NOTE(review): the general 'Null values found in data' line is still
    # emitted (and therefore still raises) when nulls occur only in exempt
    # columns -- confirm whether that is intended.
    null_exempt_prefixes = (
        "out.qoi.",
        "out.utility_bills.",
        'applicability.upgrade_add_pvwatts',
    )
    if row_segment.null_count().pipe(sum).item() > 0:
        errors.append('Null values found in data')
        for c in columns:
            if c.startswith(null_exempt_prefixes):
                continue
            if row_segment[c].null_count() > 0:
                errors.append(f'Column {c} has null values')

    SIGHTGLASS_REQUIRED_COLS = [self.BLDG_ID, self.META_IDX, self.UPGRADE_ID,
                                self.BLDG_WEIGHT, self.UPGRADE_APPL, self.FLR_AREA]
    for col in SIGHTGLASS_REQUIRED_COLS:
        if col not in columns:
            errors.append(f'{col} not found in data, which is needed for sightglass')

    # Any character outside [a-z0-9._] breaks the column naming rules.
    invalid_name_char = re.compile('[^a-z0-9._]')
    for c in columns:
        if invalid_name_char.search(c):
            errors.append(
                f'Column {c} violates name rules: may only contain . _ 0-9 lowercaseletters (no spaces)'
            )

    # Patterns are compiled once instead of being re-matched twice per column.
    total_re = re.compile(r'out\.([a-zA-Z_]+)\.total\.energy_consumption\.\.kwh')
    enduse_re = re.compile(r'out\.([a-zA-Z_]+)\.(?!total)([a-zA-Z_]+)\.energy_consumption\.\.kwh')
    month_re = re.compile(r'out\.electricity\.total\.([a-zA-Z]{3})\.energy_consumption')

    # fuel_total:    {fuel_type: sum of that fuel's total column}
    # end_use_total: {fuel_type: sum over all of that fuel's end-use columns}
    # month_total:   {month: sum of that month's electricity column}
    fuel_total, end_use_total, month_total = {}, {}, {}
    for c in columns:
        m = total_re.match(c)
        if m:
            fuel_total[m.group(1)] = row_segment[c].sum()
            continue
        m = enduse_re.match(c)
        if m:
            fuel_type = m.group(1)
            end_use_total[fuel_type] = end_use_total.get(fuel_type, 0) + row_segment[c].sum()
            continue
        m = month_re.match(c)
        if m:
            month_total[m.group(1)] = row_segment[c].sum()

    logger.info(f"Fuel total: {fuel_total}, Enduse total: {end_use_total}, Month total: {month_total}")

    # Each fuel's end uses must add up to that fuel's total column.
    for fuel, enduse_sum in end_use_total.items():
        if fuel not in fuel_total:
            # Report the gap instead of dying on a KeyError, which would
            # discard every problem collected so far.
            errors.append(f'Fuel total column for {fuel} not found, cannot check sum of enduse columns')
            continue
        if not _approx_equal(enduse_sum, fuel_total[fuel], 0.001):
            errors.append(f'Fuel total for {fuel} does not match sum of enduse columns')

    # The fuel totals must add up to the annual total energy column.
    if self.ANN_TOT_ENGY_KBTU not in columns:
        errors.append(f'{self.ANN_TOT_ENGY_KBTU} not found in data, cannot check site total')
    elif not _approx_equal(sum(fuel_total.values()),
                           row_segment[self.ANN_TOT_ENGY_KBTU].sum(), 0.001):
        errors.append('Site total does not match sum of fuel totals')

    # The monthly electricity columns must add up to the annual electricity
    # column (looser 1% tolerance, as in the original check).
    if self.ANN_TOT_ELEC_KBTU not in columns:
        errors.append(f'{self.ANN_TOT_ELEC_KBTU} not found in data, cannot check electricity total')
    elif not _approx_equal(sum(month_total.values()),
                           row_segment[self.ANN_TOT_ELEC_KBTU].sum(), 0.01):
        errors.append('Electricity total does not match sum of month totals')

    if errors:
        # One problem per line, each newline-terminated.
        raise ValueError(''.join(f'{line}\n' for line in errors))