Merge pull request #75 from uchicago-dsi/dev

Dev
uchicago-dsi · Sep 11, 2024 · 3f1d9a8 · 3f1d9a8
2 parents ee7c879 + fad0ad7
commit 3f1d9a8
Showing 1 changed file with 100 additions and 3 deletions.
diff --git a/scripts/pipeline-template.py b/scripts/pipeline-template.py
@@ -5,6 +5,7 @@
 from pathlib import Path
 from typing import Any, Dict, Optional
 
+import numpy as np
 import pandas as pd
 
 CURRENT_DIR = Path(__file__).resolve().parent
@@ -19,11 +20,11 @@
     "Item Brand",
     "Item Name",
     "Item Description Refined",
-    "Item Description Refined (Trial)",
+    # "Item Description Refined (Trial)",
     "Material Class I",
     "Material Class II",
     "Material Class III",
-    "Start Weight",
+    # "Start Weight",
     "% Residuals (Mass)",
     "% Residuals (Area)",
 ]
@@ -252,6 +253,17 @@ def calculate_results(self, data: pd.DataFrame) -> pd.DataFrame:
         """
         return data
 
+    def merge_with_trials(self, data: pd.DataFrame) -> pd.DataFrame:
+        """Attaches trial information to the data with an inner join.
+
+        Args:
+            data: Rows to enrich; matched on their "Trial ID" column.
+
+        Returns:
+            The rows of ``data`` joined to ``self.trials`` via "Public Trial ID".
+        """
+        return pd.merge(data, self.trials, left_on="Trial ID", right_on="Public Trial ID")
+
     def run(self, save: bool = False) -> pd.DataFrame:
         """Runs the data pipeline.
 
@@ -270,7 +282,7 @@ def run(self, save: bool = False) -> pd.DataFrame:
         data = self.preprocess_data(data)
         data = self.join_with_items(data)
         data = self.calculate_results(data)
-        data = data.merge(self.trials, left_on="Trial ID", right_on="Public Trial ID")
+        data = self.merge_with_trials(data)
         data = data[TRIAL_COLS]
         if save:
             data.to_csv(self.output_filepath, index=False)
@@ -279,6 +291,91 @@ def run(self, save: bool = False) -> pd.DataFrame:
         return data
 
 
+class NewTemplatePipeline(AbstractDataPipeline):
+    """Pipeline for processing CSV data entered in the new data-input template."""
+
+    def load_data(self, data_filepath: Path, sheet_name: int = 0, skiprows: int = 0) -> pd.DataFrame:
+        """Loads data from the specified CSV file, dropping trailing comment rows.
+
+        Args:
+            data_filepath: Path to the CSV data file.
+            sheet_name: Unused by this implementation (CSV files have no sheets).
+            skiprows: Unused by this implementation; no rows are skipped.
+
+        Returns:
+            Rows above the first completely empty row, or all rows if none is empty.
+        """
+        data = pd.read_csv(data_filepath)
+
+        # The template puts free-form comments below the data, after a fully empty row.
+        # ``.index.min()`` is NaN when no row is entirely empty.
+        first_empty_row_index = data[data.isna().all(axis=1)].index.min()
+
+        # Truncate at that row; ``read_csv`` yields a RangeIndex, so label == position.
+        if pd.notna(first_empty_row_index):
+            data = data.iloc[: int(first_empty_row_index)]
+
+        return data
+
+    def preprocess_data(self, data: pd.DataFrame) -> pd.DataFrame:
+        """Renames "Trial" to "Trial ID" and converts residual percentages to fractions.
+
+        Args:
+            data: Raw template data; must contain the three "% Residuals (...)" input columns.
+
+        Returns:
+            The preprocessed data, with an added "% Residuals (Mass)" column.
+        """
+        data = data.rename(columns={"Trial": "Trial ID"})
+        percentage_cols = ["% Residuals (Dry Weight)", "% Residuals (Wet Weight)", "% Residuals (Area)"]
+        # Treat the literal "no data" as missing so the float conversion below succeeds.
+        data[percentage_cols] = data[percentage_cols].replace("no data", np.nan)
+        # TODO: Depending on how the data actually comes in, maybe we don't want to do it this way?
+        # Strip "%" signs and rescale to a fraction, e.g. "12.5%" -> 0.125.
+        data[percentage_cols] = data[percentage_cols].replace("%", "", regex=True).astype(float) / 100
+
+        # Prefer dry weight to wet weight if available
+        data["% Residuals (Mass)"] = data["% Residuals (Dry Weight)"].fillna(data["% Residuals (Wet Weight)"])
+
+        return data
+
+    def join_with_items(self, data: pd.DataFrame) -> pd.DataFrame:
+        """Joins the data with the items table on "Item Name".
+
+        Args:
+            data: Data to join.
+
+        Returns:
+            The joined data, using one items row per distinct "Item Name".
+        """
+        return self.items.drop_duplicates(subset="Item Name").merge(data, on="Item Name")
+
+    def merge_with_trials(self, data: pd.DataFrame) -> pd.DataFrame:
+        """Joins the data with the trials table plus a placeholder dummy trial.
+
+        Args:
+            data: Data to join; matched on its "Trial ID" column.
+
+        Returns:
+            The joined data.
+        """
+        dummy_trial = {
+            "Trial ID": "44547-01-21",
+            "Test Method": "Mesh Bag",
+            "Technology": "Windrow",
+        }
+        # Build the lookup locally: reassigning self.trials would append another
+        # dummy row on every call and duplicate the merged rows on repeated runs.
+        trials = pd.concat([self.trials, pd.DataFrame(dummy_trial, index=[0])], ignore_index=True)
+        return data.merge(trials, on="Trial ID")
+
+
+NEW_TEMPLATE_PATH = DATA_DIR / "CFTP_DisintegrationDataInput_Template_sept92024.csv"
+new_template_pipeline = NewTemplatePipeline(NEW_TEMPLATE_PATH, trial_name="Dummy Data for New Template")
+# TODO: Kept disabled on purpose so this dummy new-template data is not appended to the "real" data.
+# processed_data.append(new_template_pipeline.run())
+
+
 class CASP004Pipeline(AbstractDataPipeline):
     """Pipeline for processing CASP004 trial data."""