Skip to content

Commit

Permalink
Merge pull request #75 from uchicago-dsi/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
toddnief authored Sep 11, 2024
2 parents ee7c879 + fad0ad7 commit 3f1d9a8
Showing 1 changed file with 100 additions and 3 deletions.
103 changes: 100 additions & 3 deletions scripts/pipeline-template.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from pathlib import Path
from typing import Any, Dict, Optional

import numpy as np
import pandas as pd

CURRENT_DIR = Path(__file__).resolve().parent
Expand All @@ -19,11 +20,11 @@
"Item Brand",
"Item Name",
"Item Description Refined",
"Item Description Refined (Trial)",
# "Item Description Refined (Trial)",
"Material Class I",
"Material Class II",
"Material Class III",
"Start Weight",
# "Start Weight",
"% Residuals (Mass)",
"% Residuals (Area)",
]
Expand Down Expand Up @@ -252,6 +253,17 @@ def calculate_results(self, data: pd.DataFrame) -> pd.DataFrame:
"""
return data

def merge_with_trials(self, data: pd.DataFrame) -> pd.DataFrame:
    """Attaches trial information to the given data.

    Rows are matched by comparing the "Trial ID" column of ``data`` with the
    "Public Trial ID" column of ``self.trials``.

    Args:
        data: Data to merge; must contain a "Trial ID" column.

    Returns:
        The result of merging ``data`` with ``self.trials``.
    """
    trial_info = self.trials
    merged = data.merge(
        right=trial_info,
        left_on="Trial ID",
        right_on="Public Trial ID",
    )
    return merged

def run(self, save: bool = False) -> pd.DataFrame:
"""Runs the data pipeline.
Expand All @@ -270,7 +282,7 @@ def run(self, save: bool = False) -> pd.DataFrame:
data = self.preprocess_data(data)
data = self.join_with_items(data)
data = self.calculate_results(data)
data = data.merge(self.trials, left_on="Trial ID", right_on="Public Trial ID")
data = self.merge_with_trials(data)
data = data[TRIAL_COLS]
if save:
data.to_csv(self.output_filepath, index=False)
Expand All @@ -279,6 +291,91 @@ def run(self, save: bool = False) -> pd.DataFrame:
return data


class NewTemplatePipeline(AbstractDataPipeline):
    """Pipeline for processing data from the new template."""

    def load_data(self, data_filepath: Path, sheet_name: int = 0, skiprows: int = 0) -> pd.DataFrame:
        """Loads data from the specified CSV file.

        The template is formatted with comments below the data, separated from
        it by a completely empty row. Everything from the first completely
        empty row onward is dropped.

        Args:
            data_filepath: Path to the CSV data file.
            sheet_name: Not used by this loader; kept in the signature only.
            skiprows: Not used by this loader; kept in the signature only.

        Returns:
            The rows above the first completely empty row, or all rows if no
            completely empty row exists.
        """
        data = pd.read_csv(data_filepath)

        # Boolean mask of the rows in which every cell is missing.
        is_blank_row = data.isna().all(axis=1).to_numpy()

        if is_blank_row.any():
            # argmax on a boolean array gives the position of the first True,
            # i.e. the first completely empty row; keep only the rows above it.
            data = data.iloc[: int(is_blank_row.argmax())]

        return data

    def preprocess_data(self, data: pd.DataFrame) -> pd.DataFrame:
        """Preprocesses the data.

        Renames "Trial" to "Trial ID", converts the percentage columns from
        values such as "12%" or "no data" to fractions (0.12) or NaN, and
        derives "% Residuals (Mass)".

        Args:
            data: Data to preprocess.

        Returns:
            The preprocessed data.
        """
        # rename returns a new frame, so the caller's DataFrame is not modified
        # by the column assignments below.
        data = data.rename(columns={"Trial": "Trial ID"})

        percentage_cols = ["% Residuals (Dry Weight)", "% Residuals (Wet Weight)", "% Residuals (Area)"]
        # TODO: Depending on how the data actually comes in, maybe we don't want to do it this way?
        data[percentage_cols] = (
            data[percentage_cols]
            .replace("no data", np.nan)  # explicit "no data" markers become missing values
            .replace("%", "", regex=True)  # strip the percent sign before the numeric cast
            .astype(float)
            / 100
        )

        # Prefer dry weight to wet weight if available
        dry_weight = data["% Residuals (Dry Weight)"]
        wet_weight = data["% Residuals (Wet Weight)"]
        data["% Residuals (Mass)"] = dry_weight.fillna(wet_weight)

        return data

    def join_with_items(self, data: pd.DataFrame) -> pd.DataFrame:
        """Joins the data with the items table.

        Args:
            data: Data to join; must contain an "Item Name" column.

        Returns:
            The items table, reduced to one row per "Item Name", merged with
            ``data`` on "Item Name".
        """
        unique_items = self.items.drop_duplicates(subset="Item Name")
        return unique_items.merge(data, on="Item Name")

    def merge_with_trials(self, data: pd.DataFrame) -> pd.DataFrame:
        """Joins the data with the trials table plus a dummy trial.

        The dummy trial is appended to a local copy of the trials table, so
        ``self.trials`` is left untouched. Reassigning ``self.trials`` here
        would add another dummy row on every call and multiply the merged rows
        when the pipeline is run more than once.

        Args:
            data: Data to join; must contain a "Trial ID" column.

        Returns:
            The data merged with the augmented trials table on "Trial ID".
        """
        dummy_trial = {
            "Trial ID": "44547-01-21",
            "Test Method": "Mesh Bag",
            "Technology": "Windrow",
        }
        # NOTE(review): the merge key is "Trial ID", which the dummy row supplies;
        # whether the stored trials table also carries that column cannot be
        # verified from this block - confirm against the trials schema.
        trials_with_dummy = pd.concat(
            [self.trials, pd.DataFrame(dummy_trial, index=[0])],
            ignore_index=True,
        )
        return data.merge(trials_with_dummy, on="Trial ID")


# Pipeline instance for the dummy data file that follows the new input template.
NEW_TEMPLATE_PATH = DATA_DIR / "CFTP_DisintegrationDataInput_Template_sept92024.csv"
new_template_pipeline = NewTemplatePipeline(
    NEW_TEMPLATE_PATH,
    trial_name="Dummy Data for New Template",
)
# TODO: The line below stays commented out so the dummy data is not added to the "real" data.
# processed_data.append(new_template_pipeline.run())


class CASP004Pipeline(AbstractDataPipeline):
"""Pipeline for processing CASP004 trial data."""

Expand Down

0 comments on commit 3f1d9a8

Please sign in to comment.