Initial refactoring of process_report

This refactor commit is the first of a few, to lay out the initial structure. A new submodule, `invoices`, is added, containing a base class `Invoice` which is inherited by all other invoices. Currently, only the Lenovo and nonbillable invoices have classes which inherit from `Invoice`. Also created and partially populated a `util.py` file, containing functions placed above `main()` in `process_report.py`. After the refactoring process is fully complete, these utility functions will be completely removed from `process_report.py`.
CCI-MOC · May 30, 2024 · 02c84ed · 02c84ed
1 parent 18ff61f
commit 02c84ed
Show file tree

Hide file tree

Showing 6 changed files with 188 additions and 56 deletions.
diff --git a/process_report/invoices/invoice.py b/process_report/invoices/invoice.py
@@ -0,0 +1,78 @@
+from dataclasses import dataclass
+import pandas
+
+import process_report.util as util
+
+
+### Invoice field names
+INVOICE_DATE_FIELD = "Invoice Month"
+PROJECT_FIELD = "Project - Allocation"
+PROJECT_ID_FIELD = "Project - Allocation ID"
+PI_FIELD = "Manager (PI)"
+INVOICE_EMAIL_FIELD = "Invoice Email"
+INVOICE_ADDRESS_FIELD = "Invoice Address"
+INSTITUTION_FIELD = "Institution"
+INSTITUTION_ID_FIELD = "Institution - Specific Code"
+SU_HOURS_FIELD = "SU Hours (GBhr or SUhr)"
+SU_TYPE_FIELD = "SU Type"
+COST_FIELD = "Cost"
+CREDIT_FIELD = "Credit"
+CREDIT_CODE_FIELD = "Credit Code"
+SUBSIDY_FIELD = "Subsidy"
+BALANCE_FIELD = "Balance"
+###
+
+
+@dataclass
+class Invoice:
+    """Base class for an invoice backed by a pandas DataFrame.
+
+    Subclasses customise behaviour by overriding the ``_prepare``,
+    ``_process`` and ``_prepare_export`` hooks, which ``process()`` runs in
+    that order. The base implementations of the hooks do nothing.
+
+    Attributes:
+        name: Label used to build the output file name and S3 keys.
+        invoice_month: Month string used in the output file name and S3 keys.
+        data: The DataFrame that the hooks operate on and ``export()`` writes.
+    """
+
+    name: str
+    invoice_month: str
+    data: pandas.DataFrame
+
+    def process(self):
+        """Run the three processing hooks in their fixed order."""
+        for stage in (self._prepare, self._process, self._prepare_export):
+            stage()
+
+    @property
+    def output_path(self) -> str:
+        """Local CSV file name: ``"<name> <invoice_month>.csv"``."""
+        file_name = f"{self.name} {self.invoice_month}.csv"
+        return file_name
+
+    @property
+    def output_s3_key(self) -> str:
+        """S3 key of the current invoice, under ``Invoices/<invoice_month>/``."""
+        file_name = f"{self.name} {self.invoice_month}.csv"
+        return f"Invoices/{self.invoice_month}/{file_name}"
+
+    @property
+    def output_s3_archive_key(self):
+        """S3 key of the archived copy, suffixed with the current timestamp.
+
+        The timestamp comes from ``util.get_iso8601_time()`` and is read each
+        time the property is accessed.
+        """
+        timestamp = util.get_iso8601_time()
+        archived_name = f"{self.name} {self.invoice_month} {timestamp}.csv"
+        return f"Invoices/{self.invoice_month}/Archive/{archived_name}"
+
+    def _prepare(self):
+        """Hook: get the data ready for processing.
+
+        Override in a subclass when needed. Typical uses are adding or
+        dropping columns or rows required for processing, validating the
+        data, or performing simple substitutions.
+        """
+
+    def _process(self):
+        """Hook: perform the actual processing.
+
+        Override in a subclass when needed. This is where calculations on
+        the data belong, e.g. applying subsidies or credits.
+        """
+
+    def _prepare_export(self):
+        """Hook: get the processed data ready for export.
+
+        Override in a subclass when needed. May add or drop columns or rows
+        depending on what should be exported after processing.
+        """
+
+    def export(self):
+        """Write ``data`` as CSV to ``output_path``."""
+        destination = self.output_path
+        self.data.to_csv(destination)
+
+    def export_s3(self, s3_bucket):
+        """Upload the exported CSV to both its current and its archive S3 key.
+
+        Args:
+            s3_bucket: Object exposing ``upload_file(local_path, key)``.
+        """
+        # Current copy first, then the timestamped archive copy.
+        s3_bucket.upload_file(self.output_path, self.output_s3_key)
+        archive_key = self.output_s3_archive_key
+        s3_bucket.upload_file(self.output_path, archive_key)
diff --git a/process_report/invoices/lenovo_invoice.py b/process_report/invoices/lenovo_invoice.py
@@ -0,0 +1,28 @@
+from dataclasses import dataclass
+
+import process_report.invoices.invoice as invoice
+
+
+@dataclass
+class LenovoInvoice(invoice.Invoice):
+    """Invoice limited to rows whose SU type is one of ``LENOVO_SU_TYPES``."""
+
+    # SU types whose rows are kept on this invoice.
+    LENOVO_SU_TYPES = ["OpenShift GPUA100SXM4", "OpenStack GPUA100SXM4"]
+    # Value written into every row of the "SU Charge" column.
+    SU_CHARGE_MULTIPLIER = 1
+
+    def _prepare(self):
+        """Keep only Lenovo SU rows and the columns this invoice reports.
+
+        The SU hours column is renamed to "SU Hours" and a constant
+        "SU Charge" column is appended as the last column.
+        """
+        exported_columns = [
+            invoice.INVOICE_DATE_FIELD,
+            invoice.PROJECT_FIELD,
+            invoice.INSTITUTION_FIELD,
+            invoice.SU_HOURS_FIELD,
+            invoice.SU_TYPE_FIELD,
+        ]
+        is_lenovo_su = self.data[invoice.SU_TYPE_FIELD].isin(self.LENOVO_SU_TYPES)
+
+        # Copy so later column additions do not touch the caller's frame.
+        lenovo_rows = self.data[is_lenovo_su][exported_columns].copy()
+        lenovo_rows = lenovo_rows.rename(columns={invoice.SU_HOURS_FIELD: "SU Hours"})
+        lenovo_rows.insert(lenovo_rows.shape[1], "SU Charge", self.SU_CHARGE_MULTIPLIER)
+        self.data = lenovo_rows
+
+    def _process(self):
+        """Add a "Charge" column equal to "SU Hours" times "SU Charge"."""
+        su_hours = self.data["SU Hours"]
+        su_charge = self.data["SU Charge"]
+        self.data["Charge"] = su_hours * su_charge
diff --git a/process_report/invoices/nonbillable_invoice.py b/process_report/invoices/nonbillable_invoice.py
@@ -0,0 +1,15 @@
+from dataclasses import dataclass
+
+import process_report.invoices.invoice as invoice
+
+
+@dataclass
+class NonbillableInvoice(invoice.Invoice):
+    """Invoice that exports only rows for nonbillable PIs or projects.
+
+    Attributes:
+        nonbillable_pis: PI values whose rows are kept.
+        nonbillable_projects: Project values whose rows are kept.
+    """
+
+    nonbillable_pis: list[str]
+    nonbillable_projects: list[str]
+
+    def _prepare_export(self):
+        """Keep rows whose PI or project appears in the nonbillable lists."""
+        pi_is_nonbillable = self.data[invoice.PI_FIELD].isin(self.nonbillable_pis)
+        project_is_nonbillable = self.data[invoice.PROJECT_FIELD].isin(
+            self.nonbillable_projects
+        )
+        self.data = self.data[pi_is_nonbillable | project_is_nonbillable]
diff --git a/process_report/process_report.py b/process_report/process_report.py
@@ -9,6 +9,8 @@
 import boto3
 import pyarrow
 
+from process_report.invoices import lenovo_invoice, nonbillable_invoice
+
 
 ### Invoice field names
 INVOICE_DATE_FIELD = "Invoice Month"
@@ -150,7 +152,7 @@ def main():
     parser.add_argument(
         "--nonbillable-file",
         required=False,
-        default="nonbillable.csv",
+        default="nonbillable",
         help="Name of nonbillable file",
     )
     parser.add_argument(
@@ -180,7 +182,7 @@ def main():
     parser.add_argument(
         "--Lenovo-file",
         required=False,
-        default="Lenovo.csv",
+        default="Lenovo",
         help="Name of output csv for Lenovo SU Types invoice",
     )
     parser.add_argument(
@@ -225,8 +227,22 @@ def main():
     projects = list(set(projects + timed_projects_list))
 
     merged_dataframe = add_institution(merged_dataframe)
-    export_lenovo(merged_dataframe, args.Lenovo_file)
-    remove_billables(merged_dataframe, pi, projects, args.nonbillable_file)
+    lenovo_inv = lenovo_invoice.LenovoInvoice(
+        name=args.Lenovo_file, invoice_month=invoice_month, data=merged_dataframe.copy()
+    )
+    nonbillable_inv = nonbillable_invoice.NonbillableInvoice(
+        name=args.nonbillable_file,
+        invoice_month=invoice_month,
+        data=merged_dataframe.copy(),
+        nonbillable_pis=pi,
+        nonbillable_projects=projects,
+    )
+    for invoice in [lenovo_inv, nonbillable_inv]:
+        invoice.process()
+        invoice.export()
+        if args.upload_to_s3:
+            bucket = get_invoice_bucket()
+            invoice.export_s3(bucket)
 
     billable_projects = remove_non_billables(merged_dataframe, pi, projects)
     billable_projects = validate_pi_names(billable_projects)
@@ -239,9 +255,7 @@ def main():
 
     if args.upload_to_s3:
         invoice_list = [
-            args.nonbillable_file,
             args.output_file,
-            args.Lenovo_file,
         ]
 
         for pi_invoice in os.listdir(args.output_folder):
@@ -315,17 +329,6 @@ def remove_non_billables(dataframe, pi, projects):
     return filtered_dataframe
 
 
-def remove_billables(dataframe, pi, projects, output_file):
-    """Removes projects and PIs that should be billed from the dataframe
-
-    So this *keeps* the projects/pis that should not be billed.
-    """
-    filtered_dataframe = dataframe[
-        dataframe[PI_FIELD].isin(pi) | dataframe[PROJECT_FIELD].isin(projects)
-    ]
-    filtered_dataframe.to_csv(output_file, index=False)
-
-
 def validate_pi_names(dataframe):
     invalid_pi_projects = dataframe[pandas.isna(dataframe[PI_FIELD])]
     for i, row in invalid_pi_projects.iterrows():
@@ -499,26 +502,6 @@ def export_HU_BU(dataframe, output_file):
     HU_BU_projects.to_csv(output_file)
 
 
-def export_lenovo(dataframe: pandas.DataFrame, output_file):
-    LENOVO_SU_TYPES = ["OpenShift GPUA100SXM4", "OpenStack GPUA100SXM4"]
-    SU_CHARGE_MULTIPLIER = 1
-
-    lenovo_df = dataframe[dataframe[SU_TYPE_FIELD].isin(LENOVO_SU_TYPES)][
-        [
-            INVOICE_DATE_FIELD,
-            PROJECT_FIELD,
-            INSTITUTION_FIELD,
-            SU_HOURS_FIELD,
-            SU_TYPE_FIELD,
-        ]
-    ].copy()
-
-    lenovo_df.rename(columns={SU_HOURS_FIELD: "SU Hours"}, inplace=True)
-    lenovo_df.insert(len(lenovo_df.columns), "SU Charge", SU_CHARGE_MULTIPLIER)
-    lenovo_df["Charge"] = lenovo_df["SU Hours"] * lenovo_df["SU Charge"]
-    lenovo_df.to_csv(output_file)
-
-
 def upload_to_s3(invoice_list: list, invoice_month):
     invoice_bucket = get_invoice_bucket()
     for invoice_filename in invoice_list:

diff --git a/process_report/tests/unit_tests.py b/process_report/tests/unit_tests.py
@@ -6,6 +6,7 @@
 from textwrap import dedent
 
 from process_report import process_report
+from process_report.invoices import lenovo_invoice, nonbillable_invoice
 
 
 class TestGetInvoiceDate(TestCase):
@@ -72,6 +73,9 @@ def setUp(self):
 
         self.pi_to_exclude = ["PI2", "PI3"]
         self.projects_to_exclude = ["ProjectB", "ProjectD"]
+        self.nonbillable_invoice = nonbillable_invoice.NonbillableInvoice(
+            "Foo", "Foo", self.dataframe, self.pi_to_exclude, self.projects_to_exclude
+        )
 
         self.output_file = tempfile.NamedTemporaryFile(delete=False)
         self.output_file2 = tempfile.NamedTemporaryFile(delete=False)
@@ -105,14 +109,8 @@ def test_remove_non_billables(self):
         self.assertIn("ProjectE", result_df["Project - Allocation"].tolist())
 
     def test_remove_billables(self):
-        process_report.remove_billables(
-            self.dataframe,
-            self.pi_to_exclude,
-            self.projects_to_exclude,
-            self.output_file2.name,
-        )
-
-        result_df = pandas.read_csv(self.output_file2.name)
+        self.nonbillable_invoice.process()
+        result_df = self.nonbillable_invoice.data
 
         self.assertIn("PI2", result_df["Manager (PI)"].tolist())
         self.assertIn("PI3", result_df["Manager (PI)"].tolist())
@@ -568,18 +566,13 @@ def setUp(self):
                 "OpenStack GPUK80",
             ],
         }
-        self.dataframe = pandas.DataFrame(data)
-
-        output_file = tempfile.NamedTemporaryFile(delete=False, mode="w", suffix=".csv")
-        self.output_file = output_file.name
-
-    def tearDown(self):
-        os.remove(self.output_file)
-
-    def test_apply_credit_0002(self):
-        process_report.export_lenovo(self.dataframe, self.output_file)
-        output_df = pandas.read_csv(self.output_file)
+        self.lenovo_invoice = lenovo_invoice.LenovoInvoice(
+            "Lenovo", "2023-01", pandas.DataFrame(data)
+        )
+        self.lenovo_invoice.process()
 
+    def test_process_lenovo(self):
+        output_df = self.lenovo_invoice.data
         self.assertTrue(
             set(
                 [

diff --git a/process_report/util.py b/process_report/util.py
@@ -0,0 +1,35 @@
+import datetime
+import json
+import logging
+
+
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+# NOTE(review): this configures the root logger as a side effect of importing
+# this module; confirm that is intended for every importer.
+logging.basicConfig(level=logging.INFO)
+
+
+def get_institution_from_pi(institute_map, pi_uname):
+    """Return the institution name mapped to a PI's username.
+
+    The lookup key is the part of ``pi_uname`` after its last ``"@"``; when
+    there is no ``"@"``, the whole string is used as the key.
+
+    Args:
+        institute_map: Mapping from lookup key to institution name.
+        pi_uname: PI username.
+
+    Returns:
+        The mapped institution name, or ``""`` if the key is missing or is
+        mapped to an empty string. A warning is logged in that case.
+    """
+    institution_key = pi_uname.split("@")[-1]
+    institution_name = institute_map.get(institution_key, "")
+
+    if institution_name == "":
+        # Logger.warn is a deprecated alias; warning() is the supported name.
+        logger.warning("PI name %s does not match any institution!", pi_uname)
+
+    return institution_name
+
+
+def load_institute_map() -> dict:
+    """Load and return the parsed JSON from ``process_report/institute_map.json``.
+
+    The path is relative, so it is resolved against the current working
+    directory of the process.
+    """
+    with open("process_report/institute_map.json", "r") as map_file:
+        return json.load(map_file)
+
+
+def get_iso8601_time():
+    """Return the current UTC time as a compact ISO 8601 timestamp.
+
+    The result has the form ``YYYYMMDDTHHMMSSZ``. The trailing ``Z`` is the
+    UTC designator, so the clock is read in UTC rather than in the host's
+    local time zone; otherwise the timestamp would be mislabelled on any
+    machine not running in UTC.
+    """
+    return datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%dT%H%M%SZ")
+
+
+def compare_invoice_month(month_1, month_2):
+    """Return True when ``month_1`` is strictly later than ``month_2``.
+
+    Both arguments are parsed with the ``"%Y-%m"`` format (e.g. "2024-05").
+    Equal months give False.
+    """
+    month_format = "%Y-%m"
+    first, second = (
+        datetime.datetime.strptime(month, month_format)
+        for month in (month_1, month_2)
+    )
+    return first > second