From dda6c6aba52bbceef5de1f716cdea74803fbbb9b Mon Sep 17 00:00:00 2001 From: QuanMPhm Date: Fri, 19 Apr 2024 16:36:05 -0400 Subject: [PATCH] Containerized billing and implemented S3 integration The container expects the user to provide several environment variables. More details are in the Dockerfile and `clone_nonbillables_and_process_invoice.sh`. Most importantly, the `old_pi.csv` file must be located in the repository root directory when building the Docker image. The shell script clones the non-billable repo, while the connection to Backblaze B2 is done in the Python script through `boto3` --- .github/workflows/build.yaml | 68 +++++++++++ Dockerfile | 10 ++ process_report/process_report.py | 114 ++++++++++++++---- process_report/tests/unit_tests.py | 23 +++- requirements.txt | 1 + tools/clone_nonbillables_and_process.sh | 20 --- .../clone_nonbillables_and_process_invoice.sh | 37 ++++++ 7 files changed, 222 insertions(+), 51 deletions(-) create mode 100644 .github/workflows/build.yaml create mode 100644 Dockerfile delete mode 100644 tools/clone_nonbillables_and_process.sh create mode 100755 tools/clone_nonbillables_and_process_invoice.sh diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml new file mode 100644 index 0000000..c758d0e --- /dev/null +++ b/.github/workflows/build.yaml @@ -0,0 +1,68 @@ +name: Build + +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. + +on: + push: + branches: [main] + # Publish semver tags as releases. 
+ tags: ['v*.*.*'] + pull_request: + branches: [main] + +env: + # Use docker.io for Docker Hub if empty + REGISTRY: ghcr.io + # github.repository as <account>/<repo> + IMAGE_NAME: ${{ github.repository }} + + +jobs: + build: + + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + # Login against a Docker registry except on PR + # https://github.com/docker/login-action + - name: Log into registry ${{ env.REGISTRY }} + if: github.event_name != 'pull_request' + uses: docker/login-action@v2 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + # Extract metadata (tags, labels) for Docker + # https://github.com/docker/metadata-action + - name: Extract Docker metadata + id: meta + uses: docker/metadata-action@v4 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + tags: | + type=semver,pattern=v{{version}} + type=semver,pattern=v{{major}}.{{minor}} + type=semver,pattern=v{{major}} + type=ref,event=branch + type=ref,event=pr + type=sha + + # Build and push Docker image with Buildx (don't push on PR) + # https://github.com/docker/build-push-action + - name: Build and push Docker image + uses: docker/build-push-action@v4 + with: + context: . + push: ${{ github.event_name != 'pull_request' }} + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..a68855e --- /dev/null +++ b/Dockerfile @@ -0,0 +1,10 @@ +FROM python:3.11-slim + +WORKDIR /app + +RUN apt-get update && apt-get install -y git + +COPY . . 
+RUN pip install -r requirements.txt + +CMD ["./tools/clone_nonbillables_and_process_invoice.sh"] diff --git a/process_report/process_report.py b/process_report/process_report.py index b2d2dd9..254dcfb 100644 --- a/process_report/process_report.py +++ b/process_report/process_report.py @@ -1,9 +1,12 @@ import argparse import os import sys +import datetime import json import pandas +import boto3 +from botocore.config import Config ### Invoice field names @@ -35,7 +38,7 @@ def get_institution_from_pi(institute_map, pi_uname): def load_institute_map() -> dict: - with open("institute_map.json", "r") as f: + with open("process_report/institute_map.json", "r") as f: institute_map = json.load(f) return institute_map @@ -62,15 +65,42 @@ def is_old_pi(old_pi_dict, pi, invoice_month): return False +def get_invoice_bucket(): + s3_vars = load_S3_env_vars() + b2_resource = boto3.resource( + service_name="s3", + endpoint_url=s3_vars["B2_ENDPOINT"], # Backblaze endpoint + aws_access_key_id=s3_vars["B2_KEY_ID"], # Backblaze keyID + aws_secret_access_key=s3_vars["B2_APP_KEY"], # Backblaze applicationKey + config=Config( + signature_version="s3v4", + ), + ) + return b2_resource.Bucket(s3_vars["B2_BUCKET_NAME"]) + + +def load_S3_env_vars() -> dict: + s3_vars = dict() + for name, val in os.environ.items(): + if name.startswith("B2_"): + s3_vars[name] = val + + return s3_vars + + +def get_iso8601_time(): + return datetime.datetime.now().strftime("%Y%m%dT%H%M%SZ") + + def main(): """Remove non-billable PIs and projects""" parser = argparse.ArgumentParser() parser.add_argument( - "csv_files", - nargs="+", - help="One or more CSV files that need to be processed", + "--invoice-month", + required=True, + help="Invoice month to process", ) parser.add_argument( "--pi-file", @@ -117,7 +147,10 @@ def main(): help="Name of csv file listing previously billed PIs", ) args = parser.parse_args() - merged_dataframe = merge_csv(args.csv_files) + + invoice_month = args.invoice_month + csv_files = 
fetch_S3_invoices(invoice_month) + merged_dataframe = merge_csv(csv_files) pi = [] projects = [] @@ -126,26 +159,40 @@ def main(): with open(args.projects_file) as file: projects = [line.rstrip() for line in file] - invoice_date = get_invoice_date(merged_dataframe) - print("Invoice date: " + str(invoice_date)) + print("Invoice date: " + str(invoice_month)) - timed_projects_list = timed_projects(args.timed_projects_file, invoice_date) + timed_projects_list = timed_projects(args.timed_projects_file, invoice_month) print("The following timed-projects will not be billed for this period: ") print(timed_projects_list) projects = list(set(projects + timed_projects_list)) merged_dataframe = add_institution(merged_dataframe) - remove_billables(merged_dataframe, pi, projects, "non_billable.csv") + remove_billables(merged_dataframe, pi, projects, "nonbillable.csv", invoice_month) billable_projects = remove_non_billables(merged_dataframe, pi, projects) billable_projects = validate_pi_names(billable_projects) credited_projects = apply_credits_new_pi(billable_projects, args.old_pi_file) - export_billables(credited_projects, args.output_file) - export_pi_billables(billable_projects, args.output_folder) - export_HU_only(billable_projects, args.HU_invoice_file) - export_HU_BU(billable_projects, args.HU_BU_invoice_file) - export_lenovo(billable_projects) + + export_billables(credited_projects, args.output_file, invoice_month) + export_pi_billables(billable_projects, args.output_folder, invoice_month) + export_HU_only(billable_projects, args.HU_invoice_file, invoice_month) + export_HU_BU(billable_projects, args.HU_BU_invoice_file, invoice_month) + export_lenovo(billable_projects, invoice_month) + + +def fetch_S3_invoices(invoice_month): + """Fetches usage invoices from S3 given invoice month""" + s3_invoice_list = list() + invoice_bucket = get_invoice_bucket() + for obj in invoice_bucket.objects.filter( + Prefix=f"Invoices/{invoice_month}/Service Invoices/" + ): + local_name = 
obj.key.split("/")[-1] + s3_invoice_list.append(local_name) + invoice_bucket.download_file(obj.key, local_name) + + return s3_invoice_list def merge_csv(files): @@ -195,7 +242,7 @@ def remove_non_billables(dataframe, pi, projects): return filtered_dataframe -def remove_billables(dataframe, pi, projects, output_file): +def remove_billables(dataframe, pi, projects, output_file, invoice_month): """Removes projects and PIs that should be billed from the dataframe So this *keeps* the projects/pis that should not be billed. @@ -203,7 +250,15 @@ def remove_billables(dataframe, pi, projects, output_file): filtered_dataframe = dataframe[ dataframe[PI_FIELD].isin(pi) | dataframe[PROJECT_FIELD].isin(projects) ] + + invoice_b2_path = ( + f"Invoices/{invoice_month}/NERC (Non-Billable) {invoice_month}.csv" + ) + invoice_b2_path_archive = f"Invoices/{invoice_month}/Archive/NERC (Non-Billable) {invoice_month} {get_iso8601_time()}.csv" filtered_dataframe.to_csv(output_file, index=False) + invoice_bucket = get_invoice_bucket() + invoice_bucket.upload_file(output_file, invoice_b2_path) + invoice_bucket.upload_file(output_file, invoice_b2_path_archive) def validate_pi_names(dataframe): @@ -215,15 +270,22 @@ def validate_pi_names(dataframe): return dataframe -def export_billables(dataframe, output_file): - dataframe.to_csv(output_file, index=False) +def export_billables(dataframe, output_file, invoice_month): + invoice_b2_path = f"Invoices/{invoice_month}/NERC {invoice_month}.csv" + invoice_b2_path_archive = ( + f"Invoices/{invoice_month}/Archive/" + + f"NERC {invoice_month} {get_iso8601_time()}.csv" + ) + dataframe.to_csv(output_file) + invoice_bucket = get_invoice_bucket() + invoice_bucket.upload_file(output_file, invoice_b2_path) + invoice_bucket.upload_file(output_file, invoice_b2_path_archive) -def export_pi_billables(dataframe: pandas.DataFrame, output_folder): +def export_pi_billables(dataframe: pandas.DataFrame, output_folder, invoice_month): if not 
os.path.exists(output_folder): os.mkdir(output_folder) - invoice_month = dataframe[INVOICE_DATE_FIELD].iat[0] pi_list = dataframe[PI_FIELD].unique() for pi in pi_list: @@ -234,6 +296,7 @@ def export_pi_billables(dataframe: pandas.DataFrame, output_folder): pi_projects.to_csv( output_folder + f"/{pi_instituition}_{pi}_{invoice_month}.csv" ) + # TODO (Quan Pham) Where to place these def apply_credits_new_pi(dataframe, old_pi_file): @@ -297,23 +360,23 @@ def add_institution(dataframe: pandas.DataFrame): return dataframe -def export_HU_only(dataframe, output_file): +def export_HU_only(dataframe, output_file, invoice_month): HU_projects = dataframe[dataframe[INSTITUTION_FIELD] == "Harvard University"] HU_projects.to_csv(output_file) + # TODO (Quan Pham) Where to place these -def export_HU_BU(dataframe, output_file): +def export_HU_BU(dataframe, output_file, invoice_month): HU_BU_projects = dataframe[ (dataframe[INSTITUTION_FIELD] == "Harvard University") | (dataframe[INSTITUTION_FIELD] == "Boston University") ] HU_BU_projects.to_csv(output_file) + # TODO (Quan Pham) Where to place these -def export_lenovo(dataframe: pandas.DataFrame, output_file=None): - lenovo_file_name = ( - output_file or f"Lenovo_{dataframe[INVOICE_DATE_FIELD].iat[0]}.csv" - ) +def export_lenovo(dataframe: pandas.DataFrame, invoice_month, output_file=None): + lenovo_file_name = output_file or f"Lenovo_{invoice_month}.csv" LENOVO_SU_TYPES = ["OpenShift GPUA100SXM4", "OpenStack GPUA100SXM4"] SU_CHARGE_MULTIPLIER = 1 @@ -332,6 +395,7 @@ def export_lenovo(dataframe: pandas.DataFrame, output_file=None): lenovo_df.insert(len(lenovo_df.columns), "SU Charge", SU_CHARGE_MULTIPLIER) lenovo_df["Charge"] = lenovo_df["SU Hours"] * lenovo_df["SU Charge"] lenovo_df.to_csv(lenovo_file_name) + # TODO (Quan Pham) Where to place these if __name__ == "__main__": diff --git a/process_report/tests/unit_tests.py b/process_report/tests/unit_tests.py index 8f0c381..075ed9e 100644 --- a/process_report/tests/unit_tests.py +++ 
b/process_report/tests/unit_tests.py @@ -1,4 +1,5 @@ from unittest import TestCase +from unittest.mock import patch import tempfile import pandas import os @@ -55,6 +56,7 @@ def test_timed_projects(self): self.assertEqual(excluded_projects, expected_projects) +@patch("process_report.process_report.get_invoice_bucket") class TestRemoveNonBillables(TestCase): def setUp(self): data = { @@ -69,7 +71,7 @@ def setUp(self): "Untouch Data Column": ["DataA", "DataB", "DataC", "DataD", "DataE"], } self.dataframe = pandas.DataFrame(data) - + self.invoice_month = "2024-03" self.pi_to_exclude = ["PI2", "PI3"] self.projects_to_exclude = ["ProjectB", "ProjectD"] @@ -80,11 +82,13 @@ def tearDown(self): os.remove(self.output_file.name) os.remove(self.output_file2.name) - def test_remove_non_billables(self): + def test_remove_non_billables(self, mock_bucket): billables_df = process_report.remove_non_billables( self.dataframe, self.pi_to_exclude, self.projects_to_exclude ) - process_report.export_billables(billables_df, self.output_file.name) + process_report.export_billables( + billables_df, self.output_file.name, self.invoice_month + ) result_df = pandas.read_csv(self.output_file.name) @@ -104,12 +108,13 @@ def test_remove_non_billables(self): self.assertIn("ProjectA", result_df["Project - Allocation"].tolist()) self.assertIn("ProjectE", result_df["Project - Allocation"].tolist()) - def test_remove_billables(self): + def test_remove_billables(self, mock_bucket): process_report.remove_billables( self.dataframe, self.pi_to_exclude, self.projects_to_exclude, self.output_file2.name, + self.invoice_month, ) result_df = pandas.read_csv(self.output_file2.name) @@ -181,10 +186,13 @@ def setUp(self): "Untouch Data Column": ["DataA", "DataB", "DataC", "DataD", "DataE"], } self.dataframe = pandas.DataFrame(data) + self.invoice_month = data["Invoice Month"][0] def test_export_pi(self): output_dir = tempfile.TemporaryDirectory() - process_report.export_pi_billables(self.dataframe, 
output_dir.name) + process_report.export_pi_billables( + self.dataframe, output_dir.name, self.invoice_month + ) pi_csv_1 = f'{self.dataframe["Institution"][0]}_{self.dataframe["Manager (PI)"][0]}_{self.dataframe["Invoice Month"][0]}.csv' pi_csv_2 = f'{self.dataframe["Institution"][3]}_{self.dataframe["Manager (PI)"][3]}_{self.dataframe["Invoice Month"][3]}.csv' @@ -381,6 +389,7 @@ def setUp(self): ], } self.dataframe = pandas.DataFrame(data) + self.invoice_month = "2024-03" output_file = tempfile.NamedTemporaryFile(delete=False, mode="w", suffix=".csv") self.output_file = output_file.name @@ -389,7 +398,9 @@ def tearDown(self): os.remove(self.output_file) def test_apply_credit_0002(self): - process_report.export_lenovo(self.dataframe, self.output_file) + process_report.export_lenovo( + self.dataframe, self.invoice_month, self.output_file + ) output_df = pandas.read_csv(self.output_file) self.assertTrue( diff --git a/requirements.txt b/requirements.txt index fb6c7ed..b650973 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ pandas +boto3 diff --git a/tools/clone_nonbillables_and_process.sh b/tools/clone_nonbillables_and_process.sh deleted file mode 100644 index bcc77e2..0000000 --- a/tools/clone_nonbillables_and_process.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/sh -set -xe - -# Add deploy key to ssh config -mkdir -p ~/.ssh -if [ ! -e ~/.ssh/config ]; then - touch ~/.ssh/config - touch ~/.ssh/id_nonbillable - echo " - Host github-nonbillable - HostName github.com - IdentityFile ~/.ssh/id_nonbillable - " > ~/.ssh/config - echo "$GH_NONBILLABLE_DEPLOYKEY" > ~/.ssh/id_nonbillable - chmod 600 ~/.ssh/id_nonbillable -fi - -if [ ! 
-d ./non-billable-projects ]; then - git clone git@github-nonbillable:CCI-MOC/non-billable-projects.git ./non-billable-projects -fi diff --git a/tools/clone_nonbillables_and_process_invoice.sh b/tools/clone_nonbillables_and_process_invoice.sh new file mode 100755 index 0000000..0fa4830 --- /dev/null +++ b/tools/clone_nonbillables_and_process_invoice.sh @@ -0,0 +1,37 @@ +#!/bin/sh +set -xe + +# Add deploy key to ssh config +mkdir -p ~/.ssh +if [ ! -e ~/.ssh/config ]; then + touch ~/.ssh/config + touch ~/.ssh/id_nonbillable + echo " + Host github-nonbillable + HostName github.com + IdentityFile ~/.ssh/id_nonbillable + " > ~/.ssh/config + echo "$GH_NONBILLABLE_DEPLOYKEY" > ~/.ssh/id_nonbillable + chmod 600 ~/.ssh/id_nonbillable +fi + +if [ ! -d ./non-billable-projects ]; then + touch ~/.ssh/known_hosts + echo "github.com ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOMqqnkVzrm0SdG6UOoqKLsabgH5C9okWi0dh2l9GKJl + github.com ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBEmKSENjQEezOmxkZMy7opKgwFB9nkt5YRrYMjNuG5N87uRgg6CLrbo5wAdT/y6v0mKV0U2w0WZ2YB/++Tpockg= + github.com ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQCj7ndNxQowgcQnjshcLrqPEiiphnt+VTTvDP6mHBL9j1aNUkY4Ue1gvwnGLVlOhGeYrnZaMgRK6+PKCUXaDbC7qtbW8gIkhL7aGCsOr/C56SJMy/BCZfxd1nWzAOxSDPgVsmerOBYfNqltV9/hWCqBywINIR+5dIg6JTJ72pcEpEjcYgXkE2YEFXV1JHnsKgbLWNlhScqb2UmyRkQyytRLtL+38TGxkxCflmO+5Z8CSSNY7GidjMIZ7Q4zMjA2n1nGrlTDkzwDCsw+wqFPGQA179cnfGWOWRVruj16z6XyvxvjJwbz0wQZ75XK5tKSb7FNyeIEs4TT4jk+S4dhPeAUC5y+bDYirYgM4GC7uEnztnZyaVWQ7B381AK4Qdrwt51ZqExKbQpTUNn+EjqoTwvqNj4kqx5QUCI0ThS/YkOxJCXmPUWZbhjpCg56i+2aB6CmK2JGhn57K5mj0MNdBXA4/WnwH6XoPWJzK5Nyu2zB3nAZp+S5hpQs+p1vN1/wsjk=" \ + >> ~/.ssh/known_hosts + git clone git@github-nonbillable:CCI-MOC/non-billable-projects.git ./non-billable-projects +fi + +INVOICE_MONTH=$(date --date="$(date +%Y-%m-01) -1 month" +%Y-%m) +export B2_ENDPOINT +export B2_APP_KEY +export B2_KEY_ID +export B2_BUCKET_NAME +python process_report/process_report.py \ + --invoice-month $INVOICE_MONTH 
\ + --pi-file ./non-billable-projects/pi.txt \ + --projects-file ./non-billable-projects/projects.txt \ + --timed-projects-file ./non-billable-projects/timed_projects.txt \ + --old-pi-file old_pi.csv