Implemented S3 integration

This commit gives the user the option to fetch invoices from S3 storage. Several environment variables are needed to authenticate to S3 storage. More details on these variables can be found in the function `get_invoice_bucket()` in `process_report.py`.
CCI-MOC · Apr 22, 2024 · dba44b4 · dba44b4
1 parent c54a956
commit dba44b4
Show file tree

Hide file tree

Showing 4 changed files with 112 additions and 51 deletions.
diff --git a/process_report/process_report.py b/process_report/process_report.py
@@ -1,9 +1,12 @@
 import argparse
 import os
 import sys
+import datetime
 
 import json
 import pandas
+import boto3
+from botocore.config import Config
 
 
 ### Invoice field names
@@ -35,7 +38,7 @@ def get_institution_from_pi(institute_map, pi_uname):
 
 
 def load_institute_map() -> dict:
+    """Load the institute map from its JSON file and return the parsed object."""
+    # The path is relative, so it is resolved against the current working
+    # directory of the process, not against this module's location.
-    with open("institute_map.json", "r") as f:
+    with open("process_report/institute_map.json", "r") as f:
         institute_map = json.load(f)
 
     return institute_map
@@ -62,15 +65,33 @@ def is_old_pi(old_pi_dict, pi, invoice_month):
     return False
 
 
+def get_invoice_bucket():
+    """Return the S3 bucket resource that holds the invoices.
+
+    Connection settings are read from environment variables:
+
+    - ``B2_ENDPOINT``: endpoint URL of the S3-compatible service
+    - ``B2_KEY_ID``: access key ID
+    - ``B2_APP_KEY``: secret access key
+    - ``B2_BUCKET_NAME``: name of the bucket to return
+
+    Raises:
+        KeyError: If any of these variables is not set.
+    """
+    endpoint = os.environ["B2_ENDPOINT"]
+    key_id = os.environ["B2_KEY_ID"]
+    app_key = os.environ["B2_APP_KEY"]
+    # Requests are signed with AWS Signature Version 4.
+    s3v4_config = Config(signature_version="s3v4")
+
+    s3 = boto3.resource(
+        service_name="s3",
+        endpoint_url=endpoint,
+        aws_access_key_id=key_id,
+        aws_secret_access_key=app_key,
+        config=s3v4_config,
+    )
+    bucket_name = os.environ["B2_BUCKET_NAME"]
+    return s3.Bucket(bucket_name)
+
+
+def get_iso8601_time():
+    """Return the current UTC time as a compact ISO 8601 string.
+
+    The format is ``YYYYMMDDTHHMMSSZ``. The trailing ``Z`` designates UTC, so
+    the timestamp is taken from an aware UTC clock rather than local time.
+    """
+    return datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%dT%H%M%SZ")
+
+
 def main():
     """Remove non-billable PIs and projects"""
 
     parser = argparse.ArgumentParser()
 
+    # store_true makes uploading opt-in: it only happens when the flag is given.
+    parser.add_argument(
+        "--upload-to-s3",
+        action="store_true",
+        help="Upload the billable and non-billable invoices to S3 storage",
+    )
     parser.add_argument(
-        "csv_files",
-        nargs="+",
-        help="One or more CSV files that need to be processed",
+        "--invoice-month",
+        required=True,
+        help="Invoice month to process",
     )
     parser.add_argument(
         "--pi-file",
@@ -117,7 +138,10 @@ def main():
         help="Name of csv file listing previously billed PIs",
     )
     args = parser.parse_args()
-    merged_dataframe = merge_csv(args.csv_files)
+
+    invoice_month = args.invoice_month
+    csv_files = fetch_S3_invoices(invoice_month)
+    merged_dataframe = merge_csv(csv_files)
 
     pi = []
     projects = []
@@ -126,26 +150,49 @@ def main():
     with open(args.projects_file) as file:
         projects = [line.rstrip() for line in file]
 
-    invoice_date = get_invoice_date(merged_dataframe)
-    print("Invoice date: " + str(invoice_date))
+    print("Invoice date: " + str(invoice_month))
 
-    timed_projects_list = timed_projects(args.timed_projects_file, invoice_date)
+    timed_projects_list = timed_projects(args.timed_projects_file, invoice_month)
     print("The following timed-projects will not be billed for this period: ")
     print(timed_projects_list)
 
     projects = list(set(projects + timed_projects_list))
 
     merged_dataframe = add_institution(merged_dataframe)
-    remove_billables(merged_dataframe, pi, projects, "non_billable.csv")
+    remove_billables(
+        merged_dataframe,
+        pi,
+        projects,
+        "nonbillable.csv",
+        invoice_month,
+        args.upload_to_s3,
+    )
 
     billable_projects = remove_non_billables(merged_dataframe, pi, projects)
     billable_projects = validate_pi_names(billable_projects)
     credited_projects = apply_credits_new_pi(billable_projects, args.old_pi_file)
-    export_billables(credited_projects, args.output_file)
-    export_pi_billables(billable_projects, args.output_folder)
-    export_HU_only(billable_projects, args.HU_invoice_file)
-    export_HU_BU(billable_projects, args.HU_BU_invoice_file)
-    export_lenovo(billable_projects)
+
+    export_billables(
+        credited_projects, args.output_file, invoice_month, args.upload_to_s3
+    )
+    export_pi_billables(billable_projects, args.output_folder, invoice_month)
+    export_HU_only(billable_projects, args.HU_invoice_file, invoice_month)
+    export_HU_BU(billable_projects, args.HU_BU_invoice_file, invoice_month)
+    export_lenovo(billable_projects, invoice_month)
+
+
+def fetch_S3_invoices(invoice_month):
+    """Download the service invoices for ``invoice_month`` from S3.
+
+    Every object under ``Invoices/{invoice_month}/Service Invoices/`` is
+    downloaded into the current working directory under its base name.
+
+    Args:
+        invoice_month: Invoice month used to build the key prefix.
+
+    Returns:
+        list: Local file names of the downloaded invoices.
+    """
+    s3_invoice_list = []
+    invoice_bucket = get_invoice_bucket()
+    for obj in invoice_bucket.objects.filter(
+        Prefix=f"Invoices/{invoice_month}/Service Invoices/"
+    ):
+        local_name = obj.key.split("/")[-1]
+        if not local_name:
+            # Keys ending in "/" are folder placeholder objects, not invoices;
+            # they have no file name to download to.
+            continue
+        invoice_bucket.download_file(obj.key, local_name)
+        # Record the file only once it has actually been downloaded.
+        s3_invoice_list.append(local_name)
+
+    return s3_invoice_list
 
 
 def merge_csv(files):
@@ -195,16 +242,26 @@ def remove_non_billables(dataframe, pi, projects):
     return filtered_dataframe
 
 
-def remove_billables(dataframe, pi, projects, output_file):
+def remove_billables(dataframe, pi, projects, output_file, invoice_month, upload_to_s3):
     """Removes projects and PIs that should be billed from the dataframe
 
     So this *keeps* the projects/pis that should not be billed.
+
+    The kept rows are written to ``output_file``; when ``upload_to_s3`` is
+    truthy the file is also uploaded to the invoice bucket under
+    ``Invoices/{invoice_month}/`` and its ``Archive/`` subfolder.
     """
     filtered_dataframe = dataframe[
         dataframe[PI_FIELD].isin(pi) | dataframe[PROJECT_FIELD].isin(projects)
     ]
+
+    # Bucket key of the current non-billable invoice for this month.
+    invoice_b2_path = (
+        f"Invoices/{invoice_month}/NERC (Non-Billable) {invoice_month}.csv"
+    )
+    # Bucket key of the timestamped copy placed under Archive/.
+    invoice_b2_path_archive = f"Invoices/{invoice_month}/Archive/NERC (Non-Billable) {invoice_month} {get_iso8601_time()}.csv"
     filtered_dataframe.to_csv(output_file, index=False)
 
+    # The local CSV is always written; the S3 uploads are optional.
+    if upload_to_s3:
+        invoice_bucket = get_invoice_bucket()
+        invoice_bucket.upload_file(output_file, invoice_b2_path)
+        invoice_bucket.upload_file(output_file, invoice_b2_path_archive)
+
 
 def validate_pi_names(dataframe):
     invalid_pi_projects = dataframe[pandas.isna(dataframe[PI_FIELD])]
@@ -215,15 +272,24 @@ def validate_pi_names(dataframe):
     return dataframe
 
 
-def export_billables(dataframe, output_file):
-    dataframe.to_csv(output_file, index=False)
+def export_billables(dataframe, output_file, invoice_month, upload_to_s3):
+    """Write the billable invoice to ``output_file`` and optionally upload it.
+
+    Args:
+        dataframe: Billable rows to export.
+        output_file: Local path of the CSV file to write.
+        invoice_month: Invoice month used to build the bucket keys.
+        upload_to_s3: When truthy, upload the written file to the invoice
+            bucket as ``Invoices/{invoice_month}/NERC {invoice_month}.csv``
+            and as a timestamped copy under ``Archive/``.
+    """
+    invoice_b2_path = f"Invoices/{invoice_month}/NERC {invoice_month}.csv"
+    invoice_b2_path_archive = (
+        f"Invoices/{invoice_month}/Archive/"
+        + f"NERC {invoice_month} {get_iso8601_time()}.csv"
+    )
+    # index=False keeps the pandas row index out of the invoice, as before
+    # this change and as remove_billables() does.
+    dataframe.to_csv(output_file, index=False)
+
+    if upload_to_s3:
+        invoice_bucket = get_invoice_bucket()
+        invoice_bucket.upload_file(output_file, invoice_b2_path)
+        invoice_bucket.upload_file(output_file, invoice_b2_path_archive)
 
 
-def export_pi_billables(dataframe: pandas.DataFrame, output_folder):
+def export_pi_billables(dataframe: pandas.DataFrame, output_folder, invoice_month):
     if not os.path.exists(output_folder):
         os.mkdir(output_folder)
 
-    invoice_month = dataframe[INVOICE_DATE_FIELD].iat[0]
     pi_list = dataframe[PI_FIELD].unique()
 
     for pi in pi_list:
@@ -234,6 +300,7 @@ def export_pi_billables(dataframe: pandas.DataFrame, output_folder):
         pi_projects.to_csv(
             output_folder + f"/{pi_instituition}_{pi}_{invoice_month}.csv"
         )
+        # TODO (Quan Pham) Where to place these
 
 
 def apply_credits_new_pi(dataframe, old_pi_file):
@@ -297,23 +364,23 @@ def add_institution(dataframe: pandas.DataFrame):
     return dataframe
 
 
-def export_HU_only(dataframe, output_file):
+def export_HU_only(dataframe, output_file, invoice_month):
+    """Write the rows whose institution is "Harvard University" to ``output_file``.
+
+    ``invoice_month`` is accepted but not used in this function yet.
+    """
     HU_projects = dataframe[dataframe[INSTITUTION_FIELD] == "Harvard University"]
     HU_projects.to_csv(output_file)
+    # TODO (Quan Pham) Where to place these
 
 
-def export_HU_BU(dataframe, output_file):
+def export_HU_BU(dataframe, output_file, invoice_month):
+    """Write the Harvard University and Boston University rows to ``output_file``.
+
+    ``invoice_month`` is accepted but not used in this function yet.
+    """
     HU_BU_projects = dataframe[
         (dataframe[INSTITUTION_FIELD] == "Harvard University")
         | (dataframe[INSTITUTION_FIELD] == "Boston University")
     ]
     HU_BU_projects.to_csv(output_file)
+    # TODO (Quan Pham) Where to place these
 
 
-def export_lenovo(dataframe: pandas.DataFrame, output_file=None):
-    lenovo_file_name = (
-        output_file or f"Lenovo_{dataframe[INVOICE_DATE_FIELD].iat[0]}.csv"
-    )
+def export_lenovo(dataframe: pandas.DataFrame, invoice_month, output_file=None):
+    lenovo_file_name = output_file or f"Lenovo_{invoice_month}.csv"
 
     LENOVO_SU_TYPES = ["OpenShift GPUA100SXM4", "OpenStack GPUA100SXM4"]
     SU_CHARGE_MULTIPLIER = 1
@@ -332,6 +399,7 @@ def export_lenovo(dataframe: pandas.DataFrame, output_file=None):
     lenovo_df.insert(len(lenovo_df.columns), "SU Charge", SU_CHARGE_MULTIPLIER)
     lenovo_df["Charge"] = lenovo_df["SU Hours"] * lenovo_df["SU Charge"]
     lenovo_df.to_csv(lenovo_file_name)
+    # TODO (Quan Pham) Where to place these
 
 
 if __name__ == "__main__":

diff --git a/process_report/tests/unit_tests.py b/process_report/tests/unit_tests.py
@@ -1,4 +1,5 @@
 from unittest import TestCase
+from unittest.mock import patch
 import tempfile
 import pandas
 import os
@@ -55,6 +56,7 @@ def test_timed_projects(self):
         self.assertEqual(excluded_projects, expected_projects)
 
 
+@patch("process_report.process_report.get_invoice_bucket")
 class TestRemoveNonBillables(TestCase):
     def setUp(self):
         data = {
@@ -69,7 +71,7 @@ def setUp(self):
             "Untouch Data Column": ["DataA", "DataB", "DataC", "DataD", "DataE"],
         }
         self.dataframe = pandas.DataFrame(data)
-
+        self.invoice_month = "2024-03"
         self.pi_to_exclude = ["PI2", "PI3"]
         self.projects_to_exclude = ["ProjectB", "ProjectD"]
 
@@ -80,11 +82,13 @@ def tearDown(self):
         os.remove(self.output_file.name)
         os.remove(self.output_file2.name)
 
-    def test_remove_non_billables(self):
+    def test_remove_non_billables(self, mock_bucket):
         billables_df = process_report.remove_non_billables(
             self.dataframe, self.pi_to_exclude, self.projects_to_exclude
         )
-        process_report.export_billables(billables_df, self.output_file.name)
+        process_report.export_billables(
+            billables_df, self.output_file.name, self.invoice_month, False
+        )
 
         result_df = pandas.read_csv(self.output_file.name)
 
@@ -104,12 +108,14 @@ def test_remove_non_billables(self):
         self.assertIn("ProjectA", result_df["Project - Allocation"].tolist())
         self.assertIn("ProjectE", result_df["Project - Allocation"].tolist())
 
-    def test_remove_billables(self):
+    def test_remove_billables(self, mock_bucket):
         process_report.remove_billables(
             self.dataframe,
             self.pi_to_exclude,
             self.projects_to_exclude,
             self.output_file2.name,
+            self.invoice_month,
+            False,
         )
 
         result_df = pandas.read_csv(self.output_file2.name)
@@ -181,10 +187,13 @@ def setUp(self):
             "Untouch Data Column": ["DataA", "DataB", "DataC", "DataD", "DataE"],
         }
         self.dataframe = pandas.DataFrame(data)
+        self.invoice_month = data["Invoice Month"][0]
 
     def test_export_pi(self):
         output_dir = tempfile.TemporaryDirectory()
-        process_report.export_pi_billables(self.dataframe, output_dir.name)
+        process_report.export_pi_billables(
+            self.dataframe, output_dir.name, self.invoice_month
+        )
 
         pi_csv_1 = f'{self.dataframe["Institution"][0]}_{self.dataframe["Manager (PI)"][0]}_{self.dataframe["Invoice Month"][0]}.csv'
         pi_csv_2 = f'{self.dataframe["Institution"][3]}_{self.dataframe["Manager (PI)"][3]}_{self.dataframe["Invoice Month"][3]}.csv'
@@ -381,6 +390,7 @@ def setUp(self):
             ],
         }
         self.dataframe = pandas.DataFrame(data)
+        self.invoice_month = "2024-03"
 
         output_file = tempfile.NamedTemporaryFile(delete=False, mode="w", suffix=".csv")
         self.output_file = output_file.name
@@ -389,7 +399,9 @@ def tearDown(self):
         os.remove(self.output_file)
 
     def test_apply_credit_0002(self):
-        process_report.export_lenovo(self.dataframe, self.output_file)
+        process_report.export_lenovo(
+            self.dataframe, self.invoice_month, self.output_file
+        )
         output_df = pandas.read_csv(self.output_file)
 
         self.assertTrue(

diff --git a/requirements.txt b/requirements.txt
@@ -1 +1,2 @@
 pandas
+boto3
diff --git a/tools/clone_nonbillables_and_process.sh b/tools/clone_nonbillables_and_process.sh