From 0f73f8c517aceff70c24e8d90ffac49cdf9f693c Mon Sep 17 00:00:00 2001 From: Wenyi Kuang Date: Wed, 18 Sep 2024 17:19:47 -0400 Subject: [PATCH 1/3] sight glass check --- postprocessing/comstockpostproc/comstock.py | 40 +++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/postprocessing/comstockpostproc/comstock.py b/postprocessing/comstockpostproc/comstock.py index df2ff0a2..109d9dd3 100644 --- a/postprocessing/comstockpostproc/comstock.py +++ b/postprocessing/comstockpostproc/comstock.py @@ -2879,3 +2879,43 @@ def export_data_and_enumeration_dictionary(self): file_path = os.path.abspath(os.path.join(self.output_dir, file_name)) logger.info(f'Exporting enumeration dictionary to: {file_path}') enum_dictionary.write_csv(file_path, separator='\t') + + + def _sightGlass_metadata_check(self, row_segment: pl.DataFrame): + # Check that the metadata columns are present in the data + # when the columns are in memory + err_log = "" + upgrade_id = row_segment[self.UPGRADE_ID].unique().item() + + #check no na values in any columns + if row_segment.null_count().pipe(sum).item() > 0: + err_log += 'Null values found in data\n' + + if self.BLDG_ID not in row_segment.columns: + err_log += f'{self.BLDG_ID} not found in data\n' + + SIGHTGLASS_REQUIRED_COLS = [self.META_IDX, self.UPGRADE_ID, + self.BLDG_WEIGHT, self.UPGRADE_APPL, self.FLR_AREA] + + for col in SIGHTGLASS_REQUIRED_COLS: + if col not in row_segment.columns: + err_log += f'{col} not found in data, which is needed for sightglass\n' + + for c in row_segment.columns: + if re.search('[^a-z0-9._]', c): + # (f'Column {c} violates name rules: may only contain . _ 0-9 lowercaseletters (no spaces)') + err_log += f'Column {c} violates name rules: may only contain . _ 0-9 lowercaseletters (no spaces)\n' + + site_total_col, fuel_total_cols, enduse_cols = None, [], [] + for c in row_segment.columns: + if ("out." 
in c) and (".energy_consumption" in c): + if "intensity" not in c: + o, fuel, end_use, ec = c.split('.') + if fuel == "site_energy": + site_total_col = c + elif end_use == "total": + fuel_total_cols.append(c) + else: + enduse_cols.append(c) + + \ No newline at end of file From 7d47e3df06c603a765771d14a56eddb2362417c4 Mon Sep 17 00:00:00 2001 From: Wenyi Kuang Date: Tue, 24 Sep 2024 18:08:07 -0600 Subject: [PATCH 2/3] Check the columns for nulls, etc. --- postprocessing/comstockpostproc/comstock.py | 87 ++++++++++++++++----- 1 file changed, 67 insertions(+), 20 deletions(-) diff --git a/postprocessing/comstockpostproc/comstock.py b/postprocessing/comstockpostproc/comstock.py index 109d9dd3..039b53c4 100644 --- a/postprocessing/comstockpostproc/comstock.py +++ b/postprocessing/comstockpostproc/comstock.py @@ -13,6 +13,7 @@ import polars as pl import re import datetime +import pytest from comstockpostproc.naming_mixin import NamingMixin from comstockpostproc.units_mixin import UnitsMixin @@ -226,6 +227,7 @@ def __init__(self, s3_base_dir, comstock_run_name, comstock_run_version, comstoc self.cached_parquet.append((upgrade_id, file_path)) #cached_parquet is a list of parquets used to export and reload logger.info(f'Exporting to: {file_path}') self.data = self.reorder_data_columns(self.data) + self._sightGlass_metadata_check(self.data) self.data.write_parquet(file_path) up_lazyframes.append(pl.scan_parquet(file_path)) @@ -2885,37 +2887,82 @@ def _sightGlass_metadata_check(self, row_segment: pl.DataFrame): # Check that the metadata columns are present in the data # when the columns are in memory err_log = "" - upgrade_id = row_segment[self.UPGRADE_ID].unique().item() - + #check no na values in any columns if row_segment.null_count().pipe(sum).item() > 0: err_log += 'Null values found in data\n' + for c in row_segment.columns: + if row_segment[c].null_count() > 0: + err_log += f'Column {c} has null values\n' - if self.BLDG_ID not in row_segment.columns: - err_log += 
f'{self.BLDG_ID} not found in data\n' - - SIGHTGLASS_REQUIRED_COLS = [self.META_IDX, self.UPGRADE_ID, + SIGHTGLASS_REQUIRED_COLS = [self.BLDG_ID, self.META_IDX, self.UPGRADE_ID, self.BLDG_WEIGHT, self.UPGRADE_APPL, self.FLR_AREA] for col in SIGHTGLASS_REQUIRED_COLS: if col not in row_segment.columns: err_log += f'{col} not found in data, which is needed for sightglass\n' - + + #Skip pattern, may need delete later: + pattern = r'out\.electricity\.total\.[a-zA-Z]{3}\.energy_consumption' + for c in row_segment.columns: if re.search('[^a-z0-9._]', c): # (f'Column {c} violates name rules: may only contain . _ 0-9 lowercaseletters (no spaces)') - err_log += f'Column {c} violates name rules: may only contain . _ 0-9 lowercaseletters (no spaces)\n' - + err_log += f'Column {c} violates name rules: may only contain . _ 0-9 lowercaseletters (no spaces)\n' site_total_col, fuel_total_cols, enduse_cols = None, [], [] for c in row_segment.columns: - if ("out." in c) and (".energy_consumption" in c): - if "intensity" not in c: - o, fuel, end_use, ec = c.split('.') - if fuel == "site_energy": - site_total_col = c - elif end_use == "total": - fuel_total_cols.append(c) - else: - enduse_cols.append(c) - - \ No newline at end of file + if 'out.' 
in c: + if '.energy_consumption' in c: + if re.match(pattern, c): + continue + if not 'intensity' in c: + col, unit = c.split("..") + o, fuel, end_use, ec = col.split('.') + if fuel == 'site_energy': + site_total_col = c + elif end_use == 'total': + fuel_total_cols.append(c) + else: + enduse_cols.append(c) + + for ft in fuel_total_cols: + col, unit = ft.split("..") + tgt_o, tgt_fuel, tgt_end_use, tgt_ec = col.split('.') + # Get the total according to the column + tot_col_val_kwh = row_segment[ft].sum() + + # Calculate total by summing enduse columns + calc_tot_val_kwh = 0 + for eu in enduse_cols: + col, unit = eu.split("..") + o, fuel, end_use, ec = col.split('.') + if fuel == tgt_fuel: + calc_tot_val_kwh += row_segment[eu].sum() + logger.debug(f'{eu} = {calc_tot_val_kwh}') + + # Compare + if calc_tot_val_kwh != pytest.approx(tot_col_val_kwh, rel=0.001): + logging_info = f"Checking {ft} total col against sum of enduse cols for the fuel\n" + logging_info += f'{ft}; total col = {tot_col_val_kwh}; sum of enduse cols = {calc_tot_val_kwh}' + err_log += logging_info + logger.error(logging_info) + + # Check total site energy against sum of fuel total cols + tot_col_val_kwh = row_segment[site_total_col].sum() + + # Calculate total by summing fuel total columns + calc_tot_val_kwh = 0 + for ft in fuel_total_cols: + logger.debug(f'adding {row_segment[ft].sum()} for {ft}') + calc_tot_val_kwh += row_segment[ft].sum() + + # Compare + if not calc_tot_val_kwh == pytest.approx(tot_col_val_kwh, rel=0.001): + logging_info = f'site total col = {tot_col_val_kwh}; sum of fuel total cols = {calc_tot_val_kwh}' + logger.error(logging_info) + err_log += logging_info + else: + logger.debug(f'site total col: {tot_col_val_kwh}; sum of fuel total cols: {calc_tot_val_kwh}') + + if err_log: + raise ValueError(err_log) \ No newline at end of file From 191d216e33bb3273ac7b090dde2fe919636ffa90 Mon Sep 17 00:00:00 2001 From: Wenyi Kuang Date: Wed, 25 Sep 2024 14:05:22 -0600 Subject: [PATCH 3/3] 
Refactored part of the metadata check logic and skipped several columns. --- postprocessing/comstockpostproc/comstock.py | 85 ++++++++------------- 1 file changed, 31 insertions(+), 54 deletions(-) diff --git a/postprocessing/comstockpostproc/comstock.py b/postprocessing/comstockpostproc/comstock.py index 039b53c4..e6d0dd6e 100644 --- a/postprocessing/comstockpostproc/comstock.py +++ b/postprocessing/comstockpostproc/comstock.py @@ -2892,6 +2892,8 @@ def _sightGlass_metadata_check(self, row_segment: pl.DataFrame): if row_segment.null_count().pipe(sum).item() > 0: err_log += 'Null values found in data\n' for c in row_segment.columns: + if c.startswith("out.qoi.") or c.startswith("out.utility_bills.") or c.startswith('applicability.upgrade_add_pvwatts'): + continue if row_segment[c].null_count() > 0: err_log += f'Column {c} has null values\n' @@ -2909,60 +2911,35 @@ def _sightGlass_metadata_check(self, row_segment: pl.DataFrame): if re.search('[^a-z0-9._]', c): # (f'Column {c} violates name rules: may only contain . _ 0-9 lowercaseletters (no spaces)') err_log += f'Column {c} violates name rules: may only contain . _ 0-9 lowercaseletters (no spaces)\n' - site_total_col, fuel_total_cols, enduse_cols = None, [], [] + + #Actually that's the perfect case to use regex to check the summary. + TOTAL_PATTERN = r'out\.([a-zA-Z_]+)\.total\.energy_consumption\.\.kwh' + ENDUSE_PATTERN = r'out\.([a-zA-Z_]+)\.(?!total)([a-zA-Z_]+)\.energy_consumption\.\.kwh' + MONTH_PATTERN = r'out\.electricity\.total\.([a-zA-Z]{3})\.energy_consumption' + + #Find the sum of total columns for each fuel type, and for each fuel type find the sum of different + #enduse columns. And record them in a dictionary like: {fuel_type: total_energy} + fuel_total, end_use_total, month_total = {}, {}, {} for c in row_segment.columns: - if 'out.' 
in c: - if '.energy_consumption' in c: - if re.match(pattern, c): - continue - if not 'intensity' in c: - col, unit = c.split("..") - o, fuel, end_use, ec = col.split('.') - if fuel == 'site_energy': - site_total_col = c - elif end_use == 'total': - fuel_total_cols.append(c) - else: - enduse_cols.append(c) - - for ft in fuel_total_cols: - col, unit = ft.split("..") - tgt_o, tgt_fuel, tgt_end_use, tgt_ec = col.split('.') - # Get the total according to the column - tot_col_val_kwh = row_segment[ft].sum() - - # Calculate total by summing enduse columns - calc_tot_val_kwh = 0 - for eu in enduse_cols: - col, unit = eu.split("..") - o, fuel, end_use, ec = col.split('.') - if fuel == tgt_fuel: - calc_tot_val_kwh += row_segment[eu].sum() - logger.debug(f'{eu} = {calc_tot_val_kwh}') - - # Compare - if calc_tot_val_kwh != pytest.approx(tot_col_val_kwh, rel=0.001): - logging_info = f"Checking {ft} total col against sum of enduse cols for the fuel\n" - logging_info += f'{ft}; total col = {tot_col_val_kwh}; sum of enduse cols = {calc_tot_val_kwh}' - err_log += logging_info - logger.error(logging_info) - - # Check total site energy against sum of fuel total cols - tot_col_val_kwh = row_segment[site_total_col].sum() - - # Calculate total by summing fuel total columns - calc_tot_val_kwh = 0 - for ft in fuel_total_cols: - logger.debug(f'adding {row_segment[ft].sum()} for {ft}') - calc_tot_val_kwh += row_segment[ft].sum() - - # Compare - if not calc_tot_val_kwh == pytest.approx(tot_col_val_kwh, rel=0.001): - logging_info = f'site total col = {tot_col_val_kwh}; sum of fuel total cols = {calc_tot_val_kwh}' - logger.error(logging_info) - err_log += logging_info - else: - logger.debug(f'site total col: {tot_col_val_kwh}; sum of fuel total cols: {calc_tot_val_kwh}') - + if re.match(TOTAL_PATTERN, c): + fuel_type = re.match(TOTAL_PATTERN, c).group(1) + fuel_total[fuel_type] = row_segment[c].sum() + elif re.match(ENDUSE_PATTERN, c): + fuel_type = re.match(ENDUSE_PATTERN, c).group(1) + 
end_use_total[fuel_type] = end_use_total.get(fuel_type, 0) + row_segment[c].sum() + elif re.match(MONTH_PATTERN, c): + month = re.match(MONTH_PATTERN, c).group(1) + month_total[month] = row_segment[c].sum() + + logger.info(f"Fuel total: {fuel_total}, Enduse total: {end_use_total}, Month total: {month_total}") + # Check that the total site energy is the sum of the fuel totals + for fuel, total in end_use_total.items(): + if not total == pytest.approx(fuel_total[fuel], rel=0.001): + err_log += f'Fuel total for {fuel} does not match sum of enduse columns\n' + if not sum(fuel_total.values()) == pytest.approx(row_segment[self.ANN_TOT_ENGY_KBTU].sum(), rel=0.001): + err_log += 'Site total does not match sum of fuel totals\n' + if not sum(month_total.values()) == pytest.approx(row_segment[self.ANN_TOT_ELEC_KBTU].sum(), rel=0.01): + err_log += 'Electricity total does not match sum of month totals\n' + if err_log: raise ValueError(err_log) \ No newline at end of file