diff --git a/process_report/process_report.py b/process_report/process_report.py index b2d2dd9..6af8fe7 100644 --- a/process_report/process_report.py +++ b/process_report/process_report.py @@ -1,9 +1,12 @@ import argparse import os import sys +import datetime import json import pandas +import boto3 +from botocore.config import Config ### Invoice field names @@ -35,7 +38,7 @@ def get_institution_from_pi(institute_map, pi_uname): def load_institute_map() -> dict: - with open("institute_map.json", "r") as f: + with open("process_report/institute_map.json", "r") as f: institute_map = json.load(f) return institute_map @@ -62,15 +65,33 @@ def is_old_pi(old_pi_dict, pi, invoice_month): return False +def get_invoice_bucket(): + b2_resource = boto3.resource( + service_name="s3", + endpoint_url=os.environ["B2_ENDPOINT"], + aws_access_key_id=os.environ["B2_KEY_ID"], + aws_secret_access_key=os.environ["B2_APP_KEY"], + config=Config( + signature_version="s3v4", + ), + ) + return b2_resource.Bucket(os.environ["B2_BUCKET_NAME"]) + + +def get_iso8601_time(): + return datetime.datetime.now().strftime("%Y%m%dT%H%M%SZ") + + def main(): """Remove non-billable PIs and projects""" parser = argparse.ArgumentParser() + parser.add_argument("--upload-to-s3", action="store_false") parser.add_argument( - "csv_files", - nargs="+", - help="One or more CSV files that need to be processed", + "--invoice-month", + required=True, + help="Invoice month to process", ) parser.add_argument( "--pi-file", @@ -117,7 +138,10 @@ def main(): help="Name of csv file listing previously billed PIs", ) args = parser.parse_args() - merged_dataframe = merge_csv(args.csv_files) + + invoice_month = args.invoice_month + csv_files = fetch_S3_invoices(invoice_month) + merged_dataframe = merge_csv(csv_files) pi = [] projects = [] @@ -126,26 +150,49 @@ def main(): with open(args.projects_file) as file: projects = [line.rstrip() for line in file] - invoice_date = get_invoice_date(merged_dataframe) - print("Invoice date: " + str(invoice_date)) + print("Invoice date: " + str(invoice_month)) - timed_projects_list = timed_projects(args.timed_projects_file, invoice_date) + timed_projects_list = timed_projects(args.timed_projects_file, invoice_month) print("The following timed-projects will not be billed for this period: ") print(timed_projects_list) projects = list(set(projects + timed_projects_list)) merged_dataframe = add_institution(merged_dataframe) - remove_billables(merged_dataframe, pi, projects, "non_billable.csv") + remove_billables( + merged_dataframe, + pi, + projects, + "nonbillable.csv", + invoice_month, + args.upload_to_s3, + ) billable_projects = remove_non_billables(merged_dataframe, pi, projects) billable_projects = validate_pi_names(billable_projects) credited_projects = apply_credits_new_pi(billable_projects, args.old_pi_file) - export_billables(credited_projects, args.output_file) - export_pi_billables(billable_projects, args.output_folder) - export_HU_only(billable_projects, args.HU_invoice_file) - export_HU_BU(billable_projects, args.HU_BU_invoice_file) - export_lenovo(billable_projects) + + export_billables( + credited_projects, args.output_file, invoice_month, args.upload_to_s3 + ) + export_pi_billables(billable_projects, args.output_folder, invoice_month) + export_HU_only(billable_projects, args.HU_invoice_file, invoice_month) + export_HU_BU(billable_projects, args.HU_BU_invoice_file, invoice_month) + export_lenovo(billable_projects, invoice_month) + + +def fetch_S3_invoices(invoice_month): + """Fetches usage invoices from S3 given invoice month""" + s3_invoice_list = list() + invoice_bucket = get_invoice_bucket() + for obj in invoice_bucket.objects.filter( + Prefix=f"Invoices/{invoice_month}/Service Invoices/" + ): + local_name = obj.key.split("/")[-1] + s3_invoice_list.append(local_name) + invoice_bucket.download_file(obj.key, local_name) + + return s3_invoice_list def merge_csv(files): @@ -195,7 +242,7 @@ def remove_non_billables(dataframe, pi, projects): return filtered_dataframe -def remove_billables(dataframe, pi, projects, output_file): +def remove_billables(dataframe, pi, projects, output_file, invoice_month, upload_to_s3): """Removes projects and PIs that should be billed from the dataframe So this *keeps* the projects/pis that should not be billed. @@ -203,8 +250,18 @@ def remove_billables(dataframe, pi, projects, output_file): filtered_dataframe = dataframe[ dataframe[PI_FIELD].isin(pi) | dataframe[PROJECT_FIELD].isin(projects) ] + + invoice_b2_path = ( + f"Invoices/{invoice_month}/NERC (Non-Billable) {invoice_month}.csv" + ) + invoice_b2_path_archive = f"Invoices/{invoice_month}/Archive/NERC (Non-Billable) {invoice_month} {get_iso8601_time()}.csv" filtered_dataframe.to_csv(output_file, index=False) + if upload_to_s3: + invoice_bucket = get_invoice_bucket() + invoice_bucket.upload_file(output_file, invoice_b2_path) + invoice_bucket.upload_file(output_file, invoice_b2_path_archive) + def validate_pi_names(dataframe): invalid_pi_projects = dataframe[pandas.isna(dataframe[PI_FIELD])] @@ -215,15 +272,24 @@ def validate_pi_names(dataframe): return dataframe -def export_billables(dataframe, output_file): - dataframe.to_csv(output_file, index=False) +def export_billables(dataframe, output_file, invoice_month, upload_to_s3): + invoice_b2_path = f"Invoices/{invoice_month}/NERC {invoice_month}.csv" + invoice_b2_path_archive = ( + f"Invoices/{invoice_month}/Archive/" + + f"NERC {invoice_month} {get_iso8601_time()}.csv" + ) + dataframe.to_csv(output_file) + + if upload_to_s3: + invoice_bucket = get_invoice_bucket() + invoice_bucket.upload_file(output_file, invoice_b2_path) + invoice_bucket.upload_file(output_file, invoice_b2_path_archive) -def export_pi_billables(dataframe: pandas.DataFrame, output_folder): +def export_pi_billables(dataframe: pandas.DataFrame, output_folder, invoice_month): if not os.path.exists(output_folder): os.mkdir(output_folder) - invoice_month = dataframe[INVOICE_DATE_FIELD].iat[0] pi_list = dataframe[PI_FIELD].unique() for pi in pi_list: @@ -234,6 +300,7 @@ def export_pi_billables(dataframe: pandas.DataFrame, output_folder): pi_projects.to_csv( output_folder + f"/{pi_instituition}_{pi}_{invoice_month}.csv" ) + # TODO (Quan Pham) Where to place these def apply_credits_new_pi(dataframe, old_pi_file): @@ -297,23 +364,23 @@ def add_institution(dataframe: pandas.DataFrame): return dataframe -def export_HU_only(dataframe, output_file): +def export_HU_only(dataframe, output_file, invoice_month): HU_projects = dataframe[dataframe[INSTITUTION_FIELD] == "Harvard University"] HU_projects.to_csv(output_file) + # TODO (Quan Pham) Where to place these -def export_HU_BU(dataframe, output_file): +def export_HU_BU(dataframe, output_file, invoice_month): HU_BU_projects = dataframe[ (dataframe[INSTITUTION_FIELD] == "Harvard University") | (dataframe[INSTITUTION_FIELD] == "Boston University") ] HU_BU_projects.to_csv(output_file) + # TODO (Quan Pham) Where to place these -def export_lenovo(dataframe: pandas.DataFrame, output_file=None): - lenovo_file_name = ( - output_file or f"Lenovo_{dataframe[INVOICE_DATE_FIELD].iat[0]}.csv" - ) +def export_lenovo(dataframe: pandas.DataFrame, invoice_month, output_file=None): + lenovo_file_name = output_file or f"Lenovo_{invoice_month}.csv" LENOVO_SU_TYPES = ["OpenShift GPUA100SXM4", "OpenStack GPUA100SXM4"] SU_CHARGE_MULTIPLIER = 1 @@ -332,6 +399,7 @@ def export_lenovo(dataframe: pandas.DataFrame, output_file=None): lenovo_df.insert(len(lenovo_df.columns), "SU Charge", SU_CHARGE_MULTIPLIER) lenovo_df["Charge"] = lenovo_df["SU Hours"] * lenovo_df["SU Charge"] lenovo_df.to_csv(lenovo_file_name) + # TODO (Quan Pham) Where to place these if __name__ == "__main__": diff --git a/process_report/tests/unit_tests.py b/process_report/tests/unit_tests.py index 8f0c381..e8a2b96 100644 --- a/process_report/tests/unit_tests.py +++ b/process_report/tests/unit_tests.py @@ -1,4 +1,5 @@ from unittest import TestCase +from unittest.mock import patch import tempfile import pandas import os @@ -55,6 +56,7 @@ def test_timed_projects(self): self.assertEqual(excluded_projects, expected_projects) +@patch("process_report.process_report.get_invoice_bucket") class TestRemoveNonBillables(TestCase): def setUp(self): data = { @@ -69,7 +71,7 @@ def setUp(self): "Untouch Data Column": ["DataA", "DataB", "DataC", "DataD", "DataE"], } self.dataframe = pandas.DataFrame(data) - + self.invoice_month = "2024-03" self.pi_to_exclude = ["PI2", "PI3"] self.projects_to_exclude = ["ProjectB", "ProjectD"] @@ -80,11 +82,13 @@ def tearDown(self): os.remove(self.output_file.name) os.remove(self.output_file2.name) - def test_remove_non_billables(self): + def test_remove_non_billables(self, mock_bucket): billables_df = process_report.remove_non_billables( self.dataframe, self.pi_to_exclude, self.projects_to_exclude ) - process_report.export_billables(billables_df, self.output_file.name) + process_report.export_billables( + billables_df, self.output_file.name, self.invoice_month, False + ) result_df = pandas.read_csv(self.output_file.name) @@ -104,12 +108,14 @@ def test_remove_non_billables(self): self.assertIn("ProjectA", result_df["Project - Allocation"].tolist()) self.assertIn("ProjectE", result_df["Project - Allocation"].tolist()) - def test_remove_billables(self): + def test_remove_billables(self, mock_bucket): process_report.remove_billables( self.dataframe, self.pi_to_exclude, self.projects_to_exclude, self.output_file2.name, + self.invoice_month, + False, ) result_df = pandas.read_csv(self.output_file2.name) @@ -181,10 +187,13 @@ def setUp(self): "Untouch Data Column": ["DataA", "DataB", "DataC", "DataD", "DataE"], } self.dataframe = pandas.DataFrame(data) + self.invoice_month = data["Invoice Month"][0] def test_export_pi(self): output_dir = tempfile.TemporaryDirectory() - process_report.export_pi_billables(self.dataframe, output_dir.name) + process_report.export_pi_billables( + self.dataframe, output_dir.name, self.invoice_month + ) pi_csv_1 = f'{self.dataframe["Institution"][0]}_{self.dataframe["Manager (PI)"][0]}_{self.dataframe["Invoice Month"][0]}.csv' pi_csv_2 = f'{self.dataframe["Institution"][3]}_{self.dataframe["Manager (PI)"][3]}_{self.dataframe["Invoice Month"][3]}.csv' @@ -381,6 +390,7 @@ def setUp(self): ], } self.dataframe = pandas.DataFrame(data) + self.invoice_month = "2024-03" output_file = tempfile.NamedTemporaryFile(delete=False, mode="w", suffix=".csv") self.output_file = output_file.name @@ -389,7 +399,9 @@ def tearDown(self): os.remove(self.output_file) def test_apply_credit_0002(self): - process_report.export_lenovo(self.dataframe, self.output_file) + process_report.export_lenovo( + self.dataframe, self.invoice_month, self.output_file + ) output_df = pandas.read_csv(self.output_file) self.assertTrue( diff --git a/requirements.txt b/requirements.txt index fb6c7ed..b650973 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ pandas +boto3 diff --git a/tools/clone_nonbillables_and_process.sh b/tools/clone_nonbillables_and_process.sh deleted file mode 100644 index bcc77e2..0000000 --- a/tools/clone_nonbillables_and_process.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/sh -set -xe - -# Add deploy key to ssh config -mkdir -p ~/.ssh -if [ ! -e ~/.ssh/config ]; then - touch ~/.ssh/config - touch ~/.ssh/id_nonbillable - echo " - Host github-nonbillable - HostName github.com - IdentityFile ~/.ssh/id_nonbillable - " > ~/.ssh/config - echo "$GH_NONBILLABLE_DEPLOYKEY" > ~/.ssh/id_nonbillable - chmod 600 ~/.ssh/id_nonbillable -fi - -if [ ! -d ./non-billable-projects ]; then - git clone git@github-nonbillable:CCI-MOC/non-billable-projects.git ./non-billable-projects -fi