Containerized billing and implemented S3 integration

The container expects the user to provide several environment variables. More details are in the Dockerfile and `clone_nonbillables_and_process_invoice.sh`. Most importantly, the `old_pi.csv` file must be located in the repo home directory when building the Docker image. The shell script clones the non-billable repo, while the connection to Backblaze B2 is made in the Python script through `boto3`.
CCI-MOC · Apr 22, 2024 · dda6c6a · dda6c6a
1 parent c54a956
commit dda6c6a
Show file tree

Hide file tree

Showing 7 changed files with 222 additions and 51 deletions.
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
@@ -0,0 +1,68 @@
+name: Build
+
+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third-party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
+
+on:
+  push:
+    branches: [main]
+    # Publish semver tags as releases.
+    tags: ['v*.*.*']
+  pull_request:
+    branches: [main]
+
+env:
+  # Use docker.io for Docker Hub if empty
+  REGISTRY: ghcr.io
+  # github.repository as <account>/<repo>
+  IMAGE_NAME: ${{ github.repository }}
+
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+
+      # Login against a Docker registry except on PR
+      # https://github.com/docker/login-action
+      - name: Log into registry ${{ env.REGISTRY }}
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@v2
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      # Extract metadata (tags, labels) for Docker
+      # https://github.com/docker/metadata-action
+      - name: Extract Docker metadata
+        id: meta
+        uses: docker/metadata-action@v4
+        with:
+          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+          tags: |
+            type=semver,pattern=v{{version}}
+            type=semver,pattern=v{{major}}.{{minor}}
+            type=semver,pattern=v{{major}}
+            type=ref,event=branch
+            type=ref,event=pr
+            type=sha
+
+      # Build and push Docker image with Buildx (don't push on PR)
+      # https://github.com/docker/build-push-action
+      - name: Build and push Docker image
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,10 @@
+FROM python:3.11-slim
+
+WORKDIR /app
+
+# git is required because the entrypoint script clones the non-billable repo.
+# Remove the apt package lists in the same layer so they do not bloat the image.
+RUN apt-get update \
+    && apt-get install -y git \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install dependencies before copying the sources so this layer stays cached
+# when only the code changes.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY . .
+
+CMD ["./tools/clone_nonbillables_and_process_invoice.sh"]
diff --git a/process_report/process_report.py b/process_report/process_report.py
@@ -1,9 +1,12 @@
 import argparse
 import os
 import sys
+import datetime
 
 import json
 import pandas
+import boto3
+from botocore.config import Config
 
 
 ### Invoice field names
@@ -35,7 +38,7 @@ def get_institution_from_pi(institute_map, pi_uname):
 
 
 def load_institute_map() -> dict:
-    with open("institute_map.json", "r") as f:
+    with open("process_report/institute_map.json", "r") as f:
         institute_map = json.load(f)
 
     return institute_map
@@ -62,15 +65,42 @@ def is_old_pi(old_pi_dict, pi, invoice_month):
     return False
 
 
+def get_invoice_bucket():
+    """Return the boto3 ``Bucket`` that holds the invoices on Backblaze B2.
+
+    Connection settings are read from the ``B2_ENDPOINT``, ``B2_KEY_ID``,
+    ``B2_APP_KEY`` and ``B2_BUCKET_NAME`` environment variables.
+
+    Raises:
+        KeyError: if any of those variables is not set. All missing names are
+            reported together instead of one per run.
+    """
+    required = ("B2_ENDPOINT", "B2_KEY_ID", "B2_APP_KEY", "B2_BUCKET_NAME")
+    s3_vars = load_S3_env_vars()
+    missing = [name for name in required if name not in s3_vars]
+    if missing:
+        raise KeyError(
+            "Missing required environment variable(s): " + ", ".join(missing)
+        )
+
+    b2_resource = boto3.resource(
+        service_name="s3",
+        endpoint_url=s3_vars["B2_ENDPOINT"],  # Backblaze endpoint
+        aws_access_key_id=s3_vars["B2_KEY_ID"],  # Backblaze keyID
+        aws_secret_access_key=s3_vars["B2_APP_KEY"],  # Backblaze applicationKey
+        config=Config(
+            signature_version="s3v4",
+        ),
+    )
+    return b2_resource.Bucket(s3_vars["B2_BUCKET_NAME"])
+
+
+def load_S3_env_vars() -> dict:
+    """Collect every environment variable whose name starts with ``B2_``.
+
+    Returns a new dict mapping variable name to value; it is empty when no
+    such variable is set.
+    """
+    return {
+        key: value for key, value in os.environ.items() if key.startswith("B2_")
+    }
+
+
+def get_iso8601_time():
+    """Return the current UTC time as a compact ISO 8601 stamp.
+
+    The format is ``YYYYMMDDTHHMMSSZ``. The trailing ``Z`` designates UTC, so
+    the clock is read in UTC rather than in the host's local time zone.
+    """
+    return datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%dT%H%M%SZ")
+
+
 def main():
     """Remove non-billable PIs and projects"""
 
     parser = argparse.ArgumentParser()
 
     parser.add_argument(
-        "csv_files",
-        nargs="+",
-        help="One or more CSV files that need to be processed",
+        "--invoice-month",
+        required=True,
+        help="Invoice month to process",
     )
     parser.add_argument(
         "--pi-file",
@@ -117,7 +147,10 @@ def main():
         help="Name of csv file listing previously billed PIs",
     )
     args = parser.parse_args()
-    merged_dataframe = merge_csv(args.csv_files)
+
+    invoice_month = args.invoice_month
+    csv_files = fetch_S3_invoices(invoice_month)
+    merged_dataframe = merge_csv(csv_files)
 
     pi = []
     projects = []
@@ -126,26 +159,40 @@ def main():
     with open(args.projects_file) as file:
         projects = [line.rstrip() for line in file]
 
-    invoice_date = get_invoice_date(merged_dataframe)
-    print("Invoice date: " + str(invoice_date))
+    print("Invoice date: " + str(invoice_month))
 
-    timed_projects_list = timed_projects(args.timed_projects_file, invoice_date)
+    timed_projects_list = timed_projects(args.timed_projects_file, invoice_month)
     print("The following timed-projects will not be billed for this period: ")
     print(timed_projects_list)
 
     projects = list(set(projects + timed_projects_list))
 
     merged_dataframe = add_institution(merged_dataframe)
-    remove_billables(merged_dataframe, pi, projects, "non_billable.csv")
+    remove_billables(merged_dataframe, pi, projects, "nonbillable.csv", invoice_month)
 
     billable_projects = remove_non_billables(merged_dataframe, pi, projects)
     billable_projects = validate_pi_names(billable_projects)
     credited_projects = apply_credits_new_pi(billable_projects, args.old_pi_file)
-    export_billables(credited_projects, args.output_file)
-    export_pi_billables(billable_projects, args.output_folder)
-    export_HU_only(billable_projects, args.HU_invoice_file)
-    export_HU_BU(billable_projects, args.HU_BU_invoice_file)
-    export_lenovo(billable_projects)
+
+    export_billables(credited_projects, args.output_file, invoice_month)
+    export_pi_billables(billable_projects, args.output_folder, invoice_month)
+    export_HU_only(billable_projects, args.HU_invoice_file, invoice_month)
+    export_HU_BU(billable_projects, args.HU_BU_invoice_file, invoice_month)
+    export_lenovo(billable_projects, invoice_month)
+
+
+def fetch_S3_invoices(invoice_month):
+    """Download the month's service invoices from the invoice bucket.
+
+    Every object under ``Invoices/<invoice_month>/Service Invoices/`` is saved
+    to the current working directory under its base name.
+
+    Args:
+        invoice_month: Month identifier used in the bucket path.
+
+    Returns:
+        List of the local file names that were downloaded.
+    """
+    s3_invoice_list = []
+    invoice_bucket = get_invoice_bucket()
+    prefix = f"Invoices/{invoice_month}/Service Invoices/"
+    for obj in invoice_bucket.objects.filter(Prefix=prefix):
+        local_name = obj.key.split("/")[-1]
+        if not local_name:
+            # A key ending in "/" is a folder placeholder, not an invoice
+            # file: it has no usable local name and nothing to download.
+            continue
+        invoice_bucket.download_file(obj.key, local_name)
+        s3_invoice_list.append(local_name)
+
+    return s3_invoice_list
 
 
 def merge_csv(files):
@@ -195,15 +242,23 @@ def remove_non_billables(dataframe, pi, projects):
     return filtered_dataframe
 
 
-def remove_billables(dataframe, pi, projects, output_file):
+def remove_billables(dataframe, pi, projects, output_file, invoice_month):
     """Removes projects and PIs that should be billed from the dataframe
 
     So this *keeps* the projects/pis that should not be billed.
     """
     filtered_dataframe = dataframe[
         dataframe[PI_FIELD].isin(pi) | dataframe[PROJECT_FIELD].isin(projects)
     ]
+
+    invoice_b2_path = (
+        f"Invoices/{invoice_month}/NERC (Non-Billable) {invoice_month}.csv"
+    )
+    invoice_b2_path_archive = f"Invoices/{invoice_month}/Archive/NERC (Non-Billable) {invoice_month} {get_iso8601_time()}.csv"
     filtered_dataframe.to_csv(output_file, index=False)
+    invoice_bucket = get_invoice_bucket()
+    invoice_bucket.upload_file(output_file, invoice_b2_path)
+    invoice_bucket.upload_file(output_file, invoice_b2_path_archive)
 
 
 def validate_pi_names(dataframe):
@@ -215,15 +270,22 @@ def validate_pi_names(dataframe):
     return dataframe
 
 
-def export_billables(dataframe, output_file):
-    dataframe.to_csv(output_file, index=False)
+def export_billables(dataframe, output_file, invoice_month):
+    """Write the billable invoice to ``output_file`` and upload it to the bucket.
+
+    The CSV is uploaded twice: once to the month's canonical path and once to
+    a timestamped copy under ``Archive/``.
+
+    Args:
+        dataframe: Billable rows to export.
+        output_file: Local path the CSV is written to before uploading.
+        invoice_month: Month identifier used in the bucket paths.
+    """
+    invoice_b2_path = f"Invoices/{invoice_month}/NERC {invoice_month}.csv"
+    invoice_b2_path_archive = (
+        f"Invoices/{invoice_month}/Archive/"
+        + f"NERC {invoice_month} {get_iso8601_time()}.csv"
+    )
+    # index=False keeps the pandas row index out of the invoice, matching the
+    # non-billable export in remove_billables.
+    dataframe.to_csv(output_file, index=False)
+    invoice_bucket = get_invoice_bucket()
+    invoice_bucket.upload_file(output_file, invoice_b2_path)
+    invoice_bucket.upload_file(output_file, invoice_b2_path_archive)
 
 
-def export_pi_billables(dataframe: pandas.DataFrame, output_folder):
+def export_pi_billables(dataframe: pandas.DataFrame, output_folder, invoice_month):
     if not os.path.exists(output_folder):
         os.mkdir(output_folder)
 
-    invoice_month = dataframe[INVOICE_DATE_FIELD].iat[0]
     pi_list = dataframe[PI_FIELD].unique()
 
     for pi in pi_list:
@@ -234,6 +296,7 @@ def export_pi_billables(dataframe: pandas.DataFrame, output_folder):
         pi_projects.to_csv(
             output_folder + f"/{pi_instituition}_{pi}_{invoice_month}.csv"
         )
+        # TODO (Quan Pham) Where to place these
 
 
 def apply_credits_new_pi(dataframe, old_pi_file):
@@ -297,23 +360,23 @@ def add_institution(dataframe: pandas.DataFrame):
     return dataframe
 
 
-def export_HU_only(dataframe, output_file):
+def export_HU_only(dataframe, output_file, invoice_month):
     HU_projects = dataframe[dataframe[INSTITUTION_FIELD] == "Harvard University"]
     HU_projects.to_csv(output_file)
+    # TODO (Quan Pham) Where to place these
 
 
-def export_HU_BU(dataframe, output_file):
+def export_HU_BU(dataframe, output_file, invoice_month):
     HU_BU_projects = dataframe[
         (dataframe[INSTITUTION_FIELD] == "Harvard University")
         | (dataframe[INSTITUTION_FIELD] == "Boston University")
     ]
     HU_BU_projects.to_csv(output_file)
+    # TODO (Quan Pham) Where to place these
 
 
-def export_lenovo(dataframe: pandas.DataFrame, output_file=None):
-    lenovo_file_name = (
-        output_file or f"Lenovo_{dataframe[INVOICE_DATE_FIELD].iat[0]}.csv"
-    )
+def export_lenovo(dataframe: pandas.DataFrame, invoice_month, output_file=None):
+    lenovo_file_name = output_file or f"Lenovo_{invoice_month}.csv"
 
     LENOVO_SU_TYPES = ["OpenShift GPUA100SXM4", "OpenStack GPUA100SXM4"]
     SU_CHARGE_MULTIPLIER = 1
@@ -332,6 +395,7 @@ def export_lenovo(dataframe: pandas.DataFrame, output_file=None):
     lenovo_df.insert(len(lenovo_df.columns), "SU Charge", SU_CHARGE_MULTIPLIER)
     lenovo_df["Charge"] = lenovo_df["SU Hours"] * lenovo_df["SU Charge"]
     lenovo_df.to_csv(lenovo_file_name)
+    # TODO (Quan Pham) Where to place these
 
 
 if __name__ == "__main__":

diff --git a/process_report/tests/unit_tests.py b/process_report/tests/unit_tests.py
@@ -1,4 +1,5 @@
 from unittest import TestCase
+from unittest.mock import patch
 import tempfile
 import pandas
 import os
@@ -55,6 +56,7 @@ def test_timed_projects(self):
         self.assertEqual(excluded_projects, expected_projects)
 
 
+@patch("process_report.process_report.get_invoice_bucket")
 class TestRemoveNonBillables(TestCase):
     def setUp(self):
         data = {
@@ -69,7 +71,7 @@ def setUp(self):
             "Untouch Data Column": ["DataA", "DataB", "DataC", "DataD", "DataE"],
         }
         self.dataframe = pandas.DataFrame(data)
-
+        self.invoice_month = "2024-03"
         self.pi_to_exclude = ["PI2", "PI3"]
         self.projects_to_exclude = ["ProjectB", "ProjectD"]
 
@@ -80,11 +82,13 @@ def tearDown(self):
         os.remove(self.output_file.name)
         os.remove(self.output_file2.name)
 
-    def test_remove_non_billables(self):
+    def test_remove_non_billables(self, mock_bucket):
         billables_df = process_report.remove_non_billables(
             self.dataframe, self.pi_to_exclude, self.projects_to_exclude
         )
-        process_report.export_billables(billables_df, self.output_file.name)
+        process_report.export_billables(
+            billables_df, self.output_file.name, self.invoice_month
+        )
 
         result_df = pandas.read_csv(self.output_file.name)
 
@@ -104,12 +108,13 @@ def test_remove_non_billables(self):
         self.assertIn("ProjectA", result_df["Project - Allocation"].tolist())
         self.assertIn("ProjectE", result_df["Project - Allocation"].tolist())
 
-    def test_remove_billables(self):
+    def test_remove_billables(self, mock_bucket):
         process_report.remove_billables(
             self.dataframe,
             self.pi_to_exclude,
             self.projects_to_exclude,
             self.output_file2.name,
+            self.invoice_month,
         )
 
         result_df = pandas.read_csv(self.output_file2.name)
@@ -181,10 +186,13 @@ def setUp(self):
             "Untouch Data Column": ["DataA", "DataB", "DataC", "DataD", "DataE"],
         }
         self.dataframe = pandas.DataFrame(data)
+        self.invoice_month = data["Invoice Month"][0]
 
     def test_export_pi(self):
         output_dir = tempfile.TemporaryDirectory()
-        process_report.export_pi_billables(self.dataframe, output_dir.name)
+        process_report.export_pi_billables(
+            self.dataframe, output_dir.name, self.invoice_month
+        )
 
         pi_csv_1 = f'{self.dataframe["Institution"][0]}_{self.dataframe["Manager (PI)"][0]}_{self.dataframe["Invoice Month"][0]}.csv'
         pi_csv_2 = f'{self.dataframe["Institution"][3]}_{self.dataframe["Manager (PI)"][3]}_{self.dataframe["Invoice Month"][3]}.csv'
@@ -381,6 +389,7 @@ def setUp(self):
             ],
         }
         self.dataframe = pandas.DataFrame(data)
+        self.invoice_month = "2024-03"
 
         output_file = tempfile.NamedTemporaryFile(delete=False, mode="w", suffix=".csv")
         self.output_file = output_file.name
@@ -389,7 +398,9 @@ def tearDown(self):
         os.remove(self.output_file)
 
     def test_apply_credit_0002(self):
-        process_report.export_lenovo(self.dataframe, self.output_file)
+        process_report.export_lenovo(
+            self.dataframe, self.invoice_month, self.output_file
+        )
         output_df = pandas.read_csv(self.output_file)
 
         self.assertTrue(

diff --git a/requirements.txt b/requirements.txt
@@ -1 +1,2 @@
 pandas
+boto3