From 0f73f8c517aceff70c24e8d90ffac49cdf9f693c Mon Sep 17 00:00:00 2001
From: Wenyi Kuang <wkuang@nrel.gov>
Date: Wed, 18 Sep 2024 17:19:47 -0400
Subject: [PATCH 1/3] sight glass check

---
 postprocessing/comstockpostproc/comstock.py | 40 +++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/postprocessing/comstockpostproc/comstock.py b/postprocessing/comstockpostproc/comstock.py
index df2ff0a2..109d9dd3 100644
--- a/postprocessing/comstockpostproc/comstock.py
+++ b/postprocessing/comstockpostproc/comstock.py
@@ -2879,3 +2879,43 @@ def export_data_and_enumeration_dictionary(self):
         file_path = os.path.abspath(os.path.join(self.output_dir, file_name))
         logger.info(f'Exporting enumeration dictionary to: {file_path}')
         enum_dictionary.write_csv(file_path, separator='\t')
+
+
+    def _sightGlass_metadata_check(self, row_segment: pl.DataFrame):
+        # Check that the metadata columns are present in the data
+        # when the columns are in memory
+        err_log = "" 
+        upgrade_id = row_segment[self.UPGRADE_ID].unique().item()
+        
+        #check no na values in any columns
+        if row_segment.null_count().pipe(sum).item() > 0:
+            err_log += 'Null values found in data\n'
+
+        if self.BLDG_ID not in row_segment.columns:
+            err_log += f'{self.BLDG_ID} not found in data\n'
+
+        SIGHTGLASS_REQUIRED_COLS = [self.META_IDX, self.UPGRADE_ID, 
+                                    self.BLDG_WEIGHT, self.UPGRADE_APPL, self.FLR_AREA]
+        
+        for col in SIGHTGLASS_REQUIRED_COLS:
+            if col not in row_segment.columns:
+                err_log += f'{col} not found in data, which is needed for sightglass\n'
+        
+        for c in row_segment.columns:
+            if re.search('[^a-z0-9._]', c):
+                # (f'Column {c} violates name rules: may only contain . _ 0-9 lowercaseletters (no spaces)')
+                err_log += f'Column {c} violates name rules: may only contain . _ 0-9 lowercaseletters (no spaces)\n'
+
+        site_total_col, fuel_total_cols, enduse_cols = None, [], []
+        for c in row_segment.columns:
+            if ("out." in c) and (".energy_consumption" in c):
+                if "intensity" not in c:
+                    o, fuel, end_use, ec = c.split('.')
+                    if fuel == "site_energy":
+                        site_total_col = c
+                    elif end_use == "total":
+                        fuel_total_cols.append(c)
+                    else:
+                        enduse_cols.append(c)
+        
+        
\ No newline at end of file

From 7d47e3df06c603a765771d14a56eddb2362417c4 Mon Sep 17 00:00:00 2001
From: Wenyi Kuang <wkuang@nrel.gov>
Date: Tue, 24 Sep 2024 18:08:07 -0600
Subject: [PATCH 2/3] Check the columns for nulls, etc.

---
 postprocessing/comstockpostproc/comstock.py | 87 ++++++++++++++++-----
 1 file changed, 67 insertions(+), 20 deletions(-)

diff --git a/postprocessing/comstockpostproc/comstock.py b/postprocessing/comstockpostproc/comstock.py
index 109d9dd3..039b53c4 100644
--- a/postprocessing/comstockpostproc/comstock.py
+++ b/postprocessing/comstockpostproc/comstock.py
@@ -13,6 +13,7 @@
 import polars as pl
 import re
 import datetime
+import pytest
 
 from comstockpostproc.naming_mixin import NamingMixin
 from comstockpostproc.units_mixin import UnitsMixin
@@ -226,6 +227,7 @@ def __init__(self, s3_base_dir, comstock_run_name, comstock_run_version, comstoc
                 self.cached_parquet.append((upgrade_id, file_path)) #cached_parquet is a list of parquets used to export and reload
                 logger.info(f'Exporting to: {file_path}')
                 self.data = self.reorder_data_columns(self.data)
+                self._sightGlass_metadata_check(self.data)
                 self.data.write_parquet(file_path)
                 up_lazyframes.append(pl.scan_parquet(file_path))
 
@@ -2885,37 +2887,82 @@ def _sightGlass_metadata_check(self, row_segment: pl.DataFrame):
         # Check that the metadata columns are present in the data
         # when the columns are in memory
         err_log = "" 
-        upgrade_id = row_segment[self.UPGRADE_ID].unique().item()
-        
+
         #check no na values in any columns
         if row_segment.null_count().pipe(sum).item() > 0:
             err_log += 'Null values found in data\n'
+            for c in row_segment.columns:
+                if row_segment[c].null_count() > 0:
+                    err_log += f'Column {c} has null values\n'
 
-        if self.BLDG_ID not in row_segment.columns:
-            err_log += f'{self.BLDG_ID} not found in data\n'
-
-        SIGHTGLASS_REQUIRED_COLS = [self.META_IDX, self.UPGRADE_ID, 
+        SIGHTGLASS_REQUIRED_COLS = [self.BLDG_ID, self.META_IDX, self.UPGRADE_ID, 
                                     self.BLDG_WEIGHT, self.UPGRADE_APPL, self.FLR_AREA]
         
         for col in SIGHTGLASS_REQUIRED_COLS:
             if col not in row_segment.columns:
                 err_log += f'{col} not found in data, which is needed for sightglass\n'
-        
+
+        #Skip pattern, may need delete later:
+        pattern = r'out\.electricity\.total\.[a-zA-Z]{3}\.energy_consumption'
+
         for c in row_segment.columns:
             if re.search('[^a-z0-9._]', c):
                 # (f'Column {c} violates name rules: may only contain . _ 0-9 lowercaseletters (no spaces)')
-                err_log += f'Column {c} violates name rules: may only contain . _ 0-9 lowercaseletters (no spaces)\n'
-
+                err_log += f'Column {c} violates name rules: may only contain . _ 0-9 lowercaseletters (no spaces)\n' 
         site_total_col, fuel_total_cols, enduse_cols = None, [], []
         for c in row_segment.columns:
-            if ("out." in c) and (".energy_consumption" in c):
-                if "intensity" not in c:
-                    o, fuel, end_use, ec = c.split('.')
-                    if fuel == "site_energy":
-                        site_total_col = c
-                    elif end_use == "total":
-                        fuel_total_cols.append(c)
-                    else:
-                        enduse_cols.append(c)
-        
-        
\ No newline at end of file
+            if 'out.' in c:
+                if '.energy_consumption' in c:
+                    if re.match(pattern, c):
+                        continue
+                    if not 'intensity' in c:
+                        col, unit = c.split("..")
+                        o, fuel, end_use, ec = col.split('.')
+                        if fuel == 'site_energy':
+                            site_total_col = c
+                        elif end_use == 'total':
+                            fuel_total_cols.append(c)
+                        else:
+                            enduse_cols.append(c)
+
+        for ft in fuel_total_cols:
+            col, unit = ft.split("..")
+            tgt_o, tgt_fuel, tgt_end_use, tgt_ec = col.split('.')
+            # Get the total according to the column
+            tot_col_val_kwh = row_segment[ft].sum()
+
+            # Calculate total by summing enduse columns
+            calc_tot_val_kwh = 0
+            for eu in enduse_cols:
+                col, unit = eu.split("..")
+                o, fuel, end_use, ec = col.split('.')
+                if fuel == tgt_fuel:
+                    calc_tot_val_kwh += row_segment[eu].sum()
+                    logger.debug(f'{eu} = {calc_tot_val_kwh}')
+
+            # Compare
+            if calc_tot_val_kwh != pytest.approx(tot_col_val_kwh, rel=0.001):
+                logging_info = f"Checking {ft} total col against sum of enduse cols for the fuel\n"
+                logging_info += f'{ft}; total col = {tot_col_val_kwh}; sum of enduse cols = {calc_tot_val_kwh}'
+                err_log += logging_info
+                logger.error(logging_info)
+
+        # Check total site energy against sum of fuel total cols
+        tot_col_val_kwh = row_segment[site_total_col].sum()
+
+        # Calculate total by summing fuel total columns
+        calc_tot_val_kwh = 0
+        for ft in fuel_total_cols:
+            logger.debug(f'adding {row_segment[ft].sum()} for {ft}')
+            calc_tot_val_kwh += row_segment[ft].sum()
+
+        # Compare
+        if not calc_tot_val_kwh == pytest.approx(tot_col_val_kwh, rel=0.001):
+            logging_info = f'site total col = {tot_col_val_kwh}; sum of fuel total cols = {calc_tot_val_kwh}'
+            logger.error(logging_info)
+            err_log += logging_info
+        else:
+            logger.debug(f'site total col: {tot_col_val_kwh}; sum of fuel total cols: {calc_tot_val_kwh}')
+
+        if err_log:
+            raise ValueError(err_log)
\ No newline at end of file

From 191d216e33bb3273ac7b090dde2fe919636ffa90 Mon Sep 17 00:00:00 2001
From: Wenyi Kuang <wkuang@nrel.gov>
Date: Wed, 25 Sep 2024 14:05:22 -0600
Subject: [PATCH 3/3] Refactored part of the logic of the meta check and skip
 several columns.

---
 postprocessing/comstockpostproc/comstock.py | 85 ++++++++-------------
 1 file changed, 31 insertions(+), 54 deletions(-)

diff --git a/postprocessing/comstockpostproc/comstock.py b/postprocessing/comstockpostproc/comstock.py
index 039b53c4..e6d0dd6e 100644
--- a/postprocessing/comstockpostproc/comstock.py
+++ b/postprocessing/comstockpostproc/comstock.py
@@ -2892,6 +2892,8 @@ def _sightGlass_metadata_check(self, row_segment: pl.DataFrame):
         if row_segment.null_count().pipe(sum).item() > 0:
             err_log += 'Null values found in data\n'
             for c in row_segment.columns:
+                if c.startswith("out.qoi.") or c.startswith("out.utility_bills.") or c.startswith('applicability.upgrade_add_pvwatts'):
+                    continue
                 if row_segment[c].null_count() > 0:
                     err_log += f'Column {c} has null values\n'
 
@@ -2909,60 +2911,35 @@ def _sightGlass_metadata_check(self, row_segment: pl.DataFrame):
             if re.search('[^a-z0-9._]', c):
                 # (f'Column {c} violates name rules: may only contain . _ 0-9 lowercaseletters (no spaces)')
                 err_log += f'Column {c} violates name rules: may only contain . _ 0-9 lowercaseletters (no spaces)\n' 
-        site_total_col, fuel_total_cols, enduse_cols = None, [], []
+        
+        #Actually that's the perfect case to use regex to check the summary.
+        TOTAL_PATTERN = r'out\.([a-zA-Z_]+)\.total\.energy_consumption\.\.kwh'
+        ENDUSE_PATTERN = r'out\.([a-zA-Z_]+)\.(?!total)([a-zA-Z_]+)\.energy_consumption\.\.kwh'
+        MONTH_PATTERN = r'out\.electricity\.total\.([a-zA-Z]{3})\.energy_consumption'
+
+        #Find the sum of the total column for each fuel type, and for each fuel type the sum of its
+        #enduse columns. Record each in a dictionary like: {fuel_type: total_energy}
+        fuel_total, end_use_total, month_total = {}, {}, {}
         for c in row_segment.columns:
-            if 'out.' in c:
-                if '.energy_consumption' in c:
-                    if re.match(pattern, c):
-                        continue
-                    if not 'intensity' in c:
-                        col, unit = c.split("..")
-                        o, fuel, end_use, ec = col.split('.')
-                        if fuel == 'site_energy':
-                            site_total_col = c
-                        elif end_use == 'total':
-                            fuel_total_cols.append(c)
-                        else:
-                            enduse_cols.append(c)
-
-        for ft in fuel_total_cols:
-            col, unit = ft.split("..")
-            tgt_o, tgt_fuel, tgt_end_use, tgt_ec = col.split('.')
-            # Get the total according to the column
-            tot_col_val_kwh = row_segment[ft].sum()
-
-            # Calculate total by summing enduse columns
-            calc_tot_val_kwh = 0
-            for eu in enduse_cols:
-                col, unit = eu.split("..")
-                o, fuel, end_use, ec = col.split('.')
-                if fuel == tgt_fuel:
-                    calc_tot_val_kwh += row_segment[eu].sum()
-                    logger.debug(f'{eu} = {calc_tot_val_kwh}')
-
-            # Compare
-            if calc_tot_val_kwh != pytest.approx(tot_col_val_kwh, rel=0.001):
-                logging_info = f"Checking {ft} total col against sum of enduse cols for the fuel\n"
-                logging_info += f'{ft}; total col = {tot_col_val_kwh}; sum of enduse cols = {calc_tot_val_kwh}'
-                err_log += logging_info
-                logger.error(logging_info)
-
-        # Check total site energy against sum of fuel total cols
-        tot_col_val_kwh = row_segment[site_total_col].sum()
-
-        # Calculate total by summing fuel total columns
-        calc_tot_val_kwh = 0
-        for ft in fuel_total_cols:
-            logger.debug(f'adding {row_segment[ft].sum()} for {ft}')
-            calc_tot_val_kwh += row_segment[ft].sum()
-
-        # Compare
-        if not calc_tot_val_kwh == pytest.approx(tot_col_val_kwh, rel=0.001):
-            logging_info = f'site total col = {tot_col_val_kwh}; sum of fuel total cols = {calc_tot_val_kwh}'
-            logger.error(logging_info)
-            err_log += logging_info
-        else:
-            logger.debug(f'site total col: {tot_col_val_kwh}; sum of fuel total cols: {calc_tot_val_kwh}')
-
+            if re.match(TOTAL_PATTERN, c):
+                fuel_type = re.match(TOTAL_PATTERN, c).group(1)
+                fuel_total[fuel_type] = row_segment[c].sum()
+            elif re.match(ENDUSE_PATTERN, c):
+                fuel_type = re.match(ENDUSE_PATTERN, c).group(1)
+                end_use_total[fuel_type] = end_use_total.get(fuel_type, 0) + row_segment[c].sum()
+            elif re.match(MONTH_PATTERN, c):
+                month = re.match(MONTH_PATTERN, c).group(1)
+                month_total[month] = row_segment[c].sum()
+        
+        logger.info(f"Fuel total: {fuel_total}, Enduse total: {end_use_total}, Month total: {month_total}")
+        # Check each fuel total against the sum of its enduse columns, then the site and electricity totals
+        for fuel, total in end_use_total.items():
+            if not total == pytest.approx(fuel_total[fuel], rel=0.001):
+                err_log += f'Fuel total for {fuel} does not match sum of enduse columns\n'
+        if not sum(fuel_total.values()) == pytest.approx(row_segment[self.ANN_TOT_ENGY_KBTU].sum(), rel=0.001):
+            err_log += 'Site total does not match sum of fuel totals\n'
+        if not sum(month_total.values()) == pytest.approx(row_segment[self.ANN_TOT_ELEC_KBTU].sum(), rel=0.01):
+            err_log += 'Electricity total does not match sum of month totals\n'
+    
         if err_log:
             raise ValueError(err_log)
\ No newline at end of file