From ca2772f3b3a65f65ae324fcb2cc8a77d3920a690 Mon Sep 17 00:00:00 2001 From: QuanMPhm Date: Fri, 19 Apr 2024 16:36:05 -0400 Subject: [PATCH] Implemented S3 integration This commit gives the user the option to fetch invoices from S3 storage. Several env vars are needed to authenticate to S3 storage. More details on these vars can be found in the function `get_invoice_bucket()` in `process_report.py`. Note that the user can provide filenames containing "{}" to inject the invoice month, allowing for some convenient formatting. --- process_report/process_report.py | 162 ++++++++++++++---- process_report/tests/unit_tests.py | 33 ++-- requirements.txt | 1 + tools/clone_nonbillables_and_process.sh | 20 --- .../clone_nonbillables_and_process_invoice.sh | 40 +++++ 5 files changed, 181 insertions(+), 75 deletions(-) delete mode 100644 tools/clone_nonbillables_and_process.sh create mode 100644 tools/clone_nonbillables_and_process_invoice.sh diff --git a/process_report/process_report.py b/process_report/process_report.py index b2d2dd9..9077d11 100644 --- a/process_report/process_report.py +++ b/process_report/process_report.py @@ -1,9 +1,17 @@ import argparse import os import sys +import datetime import json import pandas +import boto3 +from botocore.config import Config + + +EXPORT_DATAFRAME = "dataframe" +EXPORT_LOCAL_PATH = "local_path" +EXPORT_S3_PATHS = "s3_paths" ### Invoice field names @@ -35,7 +43,7 @@ def get_institution_from_pi(institute_map, pi_uname): def load_institute_map() -> dict: - with open("institute_map.json", "r") as f: + with open("process_report/institute_map.json", "r") as f: institute_map = json.load(f) return institute_map @@ -62,6 +70,23 @@ def is_old_pi(old_pi_dict, pi, invoice_month): return False +def get_invoice_bucket(): + b2_resource = boto3.resource( + service_name="s3", + endpoint_url=os.environ["B2_ENDPOINT"], + aws_access_key_id=os.environ["B2_KEY_ID"], + aws_secret_access_key=os.environ["B2_APP_KEY"], + config=Config( + 
signature_version="s3v4", + ), + ) + return b2_resource.Bucket(os.environ["B2_BUCKET_NAME"]) + + +def get_iso8601_time(): + return datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%dT%H%M%SZ") + + def main(): """Remove non-billable PIs and projects""" @@ -72,6 +97,12 @@ def main(): nargs="+", help="One or more CSV files that need to be processed", ) + parser.add_argument("--upload-to-s3", action="store_true") + parser.add_argument( + "--invoice-month", + required=True, + help="Invoice month to process", + ) parser.add_argument( "--pi-file", required=True, @@ -117,7 +148,15 @@ def main(): help="Name of csv file listing previously billed PIs", ) args = parser.parse_args() - merged_dataframe = merge_csv(args.csv_files) + + invoice_month = args.invoice_month + + if args.upload_to_s3: + csv_files = fetch_S3_invoices(invoice_month) + else: + csv_files = args.csv_files + + merged_dataframe = merge_csv(csv_files) pi = [] projects = [] @@ -126,26 +165,45 @@ def main(): with open(args.projects_file) as file: projects = [line.rstrip() for line in file] - invoice_date = get_invoice_date(merged_dataframe) - print("Invoice date: " + str(invoice_date)) + print("Invoice date: " + str(invoice_month)) - timed_projects_list = timed_projects(args.timed_projects_file, invoice_date) + timed_projects_list = timed_projects(args.timed_projects_file, invoice_month) print("The following timed-projects will not be billed for this period: ") print(timed_projects_list) projects = list(set(projects + timed_projects_list)) + invoice_list = list() + merged_dataframe = add_institution(merged_dataframe) - remove_billables(merged_dataframe, pi, projects, "non_billable.csv") + invoice_list.append( + remove_billables(merged_dataframe, pi, projects, "nonbillable.csv") + ) billable_projects = remove_non_billables(merged_dataframe, pi, projects) billable_projects = validate_pi_names(billable_projects) credited_projects = apply_credits_new_pi(billable_projects, args.old_pi_file) - export_billables(credited_projects, 
args.output_file) - export_pi_billables(billable_projects, args.output_folder) - export_HU_only(billable_projects, args.HU_invoice_file) - export_HU_BU(billable_projects, args.HU_BU_invoice_file) - export_lenovo(billable_projects) + + invoice_list.append(export_billables(credited_projects, args.output_file)) + export_pi_billables(credited_projects, args.output_folder, invoice_month) + export_HU_only(credited_projects, args.HU_invoice_file) + export_HU_BU(credited_projects, args.HU_BU_invoice_file) + export_lenovo(credited_projects, invoice_month) + export_invoices(invoice_list, args.upload_to_s3, invoice_month) + + +def fetch_S3_invoices(invoice_month): + """Fetches usage invoices from S3 given invoice month""" + s3_invoice_list = list() + invoice_bucket = get_invoice_bucket() + for obj in invoice_bucket.objects.filter( + Prefix=f"Invoices/{invoice_month}/Service Invoices/" + ): + local_name = obj.key.split("/")[-1] + s3_invoice_list.append(local_name) + invoice_bucket.download_file(obj.key, local_name) + + return s3_invoice_list def merge_csv(files): @@ -203,7 +261,17 @@ def remove_billables(dataframe, pi, projects, output_file): filtered_dataframe = dataframe[ dataframe[PI_FIELD].isin(pi) | dataframe[PROJECT_FIELD].isin(projects) ] - filtered_dataframe.to_csv(output_file, index=False) + + invoice_b2_path = f"Invoices/{{}}/NERC (Non-Billable) {{}}.csv" # noqa: F541 + invoice_b2_path_archive = ( + f"Invoices/{{}}/Archive/NERC (Non-Billable) {{}} {get_iso8601_time()}.csv" + ) + + return { + EXPORT_DATAFRAME: filtered_dataframe, + EXPORT_LOCAL_PATH: output_file, + EXPORT_S3_PATHS: [invoice_b2_path, invoice_b2_path_archive], + } def validate_pi_names(dataframe): @@ -215,27 +283,6 @@ def validate_pi_names(dataframe): return dataframe -def export_billables(dataframe, output_file): - dataframe.to_csv(output_file, index=False) - - -def export_pi_billables(dataframe: pandas.DataFrame, output_folder): - if not os.path.exists(output_folder): - os.mkdir(output_folder) - - 
invoice_month = dataframe[INVOICE_DATE_FIELD].iat[0] - pi_list = dataframe[PI_FIELD].unique() - - for pi in pi_list: - if pandas.isna(pi): - continue - pi_projects = dataframe[dataframe[PI_FIELD] == pi] - pi_instituition = pi_projects[INSTITUTION_FIELD].iat[0] - pi_projects.to_csv( - output_folder + f"/{pi_instituition}_{pi}_{invoice_month}.csv" - ) - - def apply_credits_new_pi(dataframe, old_pi_file): new_pi_credit_code = "0002" new_pi_credit_amount = 1000 @@ -297,9 +344,40 @@ def add_institution(dataframe: pandas.DataFrame): return dataframe +def export_billables(dataframe, output_file): + invoice_b2_path = f"Invoices/{{}}/NERC {{}}.csv" # noqa: F541 + invoice_b2_path_archive = ( + f"Invoices/{{}}/Archive/NERC {{}} {get_iso8601_time()}.csv" + ) + + return { + EXPORT_DATAFRAME: dataframe, + EXPORT_LOCAL_PATH: output_file, + EXPORT_S3_PATHS: [invoice_b2_path, invoice_b2_path_archive], + } + + +def export_pi_billables(dataframe: pandas.DataFrame, output_folder, invoice_month): + if not os.path.exists(output_folder): + os.mkdir(output_folder) + + pi_list = dataframe[PI_FIELD].unique() + + for pi in pi_list: + if pandas.isna(pi): + continue + pi_projects = dataframe[dataframe[PI_FIELD] == pi] + pi_instituition = pi_projects[INSTITUTION_FIELD].iat[0] + pi_projects.to_csv( + output_folder + f"/{pi_instituition}_{pi}_{invoice_month}.csv" + ) + # TODO (Quan Pham) Where to place these + + def export_HU_only(dataframe, output_file): HU_projects = dataframe[dataframe[INSTITUTION_FIELD] == "Harvard University"] HU_projects.to_csv(output_file) + # TODO (Quan Pham) Where to place these def export_HU_BU(dataframe, output_file): @@ -308,12 +386,11 @@ def export_HU_BU(dataframe, output_file): | (dataframe[INSTITUTION_FIELD] == "Boston University") ] HU_BU_projects.to_csv(output_file) + # TODO (Quan Pham) Where to place these -def export_lenovo(dataframe: pandas.DataFrame, output_file=None): - lenovo_file_name = ( - output_file or 
f"Lenovo_{dataframe[INVOICE_DATE_FIELD].iat[0]}.csv" - ) +def export_lenovo(dataframe: pandas.DataFrame, invoice_month, output_file=None): + lenovo_file_name = output_file or f"Lenovo_{invoice_month}.csv" LENOVO_SU_TYPES = ["OpenShift GPUA100SXM4", "OpenStack GPUA100SXM4"] SU_CHARGE_MULTIPLIER = 1 @@ -332,6 +409,17 @@ def export_lenovo(dataframe: pandas.DataFrame, output_file=None): lenovo_df.insert(len(lenovo_df.columns), "SU Charge", SU_CHARGE_MULTIPLIER) lenovo_df["Charge"] = lenovo_df["SU Hours"] * lenovo_df["SU Charge"] lenovo_df.to_csv(lenovo_file_name) + # TODO (Quan Pham) Where to place these + + +def export_invoices(invoice_list: list, upload_to_s3, invoice_month): + for invoice in invoice_list: + local_path = invoice[EXPORT_LOCAL_PATH].format(invoice_month) + invoice[EXPORT_DATAFRAME].to_csv(local_path, index=False) + if upload_to_s3: + invoice_bucket = get_invoice_bucket() + for s3_path in invoice[EXPORT_S3_PATHS]: + invoice_bucket.upload_file(local_path, s3_path.format(invoice_month, invoice_month)) if __name__ == "__main__": diff --git a/process_report/tests/unit_tests.py b/process_report/tests/unit_tests.py index 8f0c381..54a3f6e 100644 --- a/process_report/tests/unit_tests.py +++ b/process_report/tests/unit_tests.py @@ -69,24 +69,17 @@ def setUp(self): "Untouch Data Column": ["DataA", "DataB", "DataC", "DataD", "DataE"], } self.dataframe = pandas.DataFrame(data) - + self.invoice_month = "2024-03" self.pi_to_exclude = ["PI2", "PI3"] self.projects_to_exclude = ["ProjectB", "ProjectD"] - self.output_file = tempfile.NamedTemporaryFile(delete=False) - self.output_file2 = tempfile.NamedTemporaryFile(delete=False) - - def tearDown(self): - os.remove(self.output_file.name) - os.remove(self.output_file2.name) - def test_remove_non_billables(self): billables_df = process_report.remove_non_billables( self.dataframe, self.pi_to_exclude, self.projects_to_exclude ) - process_report.export_billables(billables_df, self.output_file.name) - - result_df = pandas.read_csv(self.output_file.name) + 
result_df = process_report.export_billables(billables_df, "fake_invoice")[ + "dataframe" + ] self.assertNotIn("PI2", result_df["Manager (PI)"].tolist()) self.assertNotIn("PI3", result_df["Manager (PI)"].tolist()) @@ -105,14 +98,12 @@ def test_remove_non_billables(self): self.assertIn("ProjectE", result_df["Project - Allocation"].tolist()) def test_remove_billables(self): - process_report.remove_billables( + result_df = process_report.remove_billables( self.dataframe, self.pi_to_exclude, self.projects_to_exclude, - self.output_file2.name, - ) - - result_df = pandas.read_csv(self.output_file2.name) + "fake_invoice", + )["dataframe"] self.assertIn("PI2", result_df["Manager (PI)"].tolist()) self.assertIn("PI3", result_df["Manager (PI)"].tolist()) @@ -181,10 +172,13 @@ def setUp(self): "Untouch Data Column": ["DataA", "DataB", "DataC", "DataD", "DataE"], } self.dataframe = pandas.DataFrame(data) + self.invoice_month = data["Invoice Month"][0] def test_export_pi(self): output_dir = tempfile.TemporaryDirectory() - process_report.export_pi_billables(self.dataframe, output_dir.name) + process_report.export_pi_billables( + self.dataframe, output_dir.name, self.invoice_month + ) pi_csv_1 = f'{self.dataframe["Institution"][0]}_{self.dataframe["Manager (PI)"][0]}_{self.dataframe["Invoice Month"][0]}.csv' pi_csv_2 = f'{self.dataframe["Institution"][3]}_{self.dataframe["Manager (PI)"][3]}_{self.dataframe["Invoice Month"][3]}.csv' @@ -381,6 +375,7 @@ def setUp(self): ], } self.dataframe = pandas.DataFrame(data) + self.invoice_month = "2024-03" output_file = tempfile.NamedTemporaryFile(delete=False, mode="w", suffix=".csv") self.output_file = output_file.name @@ -389,7 +384,9 @@ def tearDown(self): os.remove(self.output_file) def test_apply_credit_0002(self): - process_report.export_lenovo(self.dataframe, self.output_file) + process_report.export_lenovo( + self.dataframe, self.invoice_month, self.output_file + ) output_df = pandas.read_csv(self.output_file) self.assertTrue( diff 
--git a/requirements.txt b/requirements.txt index fb6c7ed..b650973 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ pandas +boto3 diff --git a/tools/clone_nonbillables_and_process.sh b/tools/clone_nonbillables_and_process.sh deleted file mode 100644 index bcc77e2..0000000 --- a/tools/clone_nonbillables_and_process.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/sh -set -xe - -# Add deploy key to ssh config -mkdir -p ~/.ssh -if [ ! -e ~/.ssh/config ]; then - touch ~/.ssh/config - touch ~/.ssh/id_nonbillable - echo " - Host github-nonbillable - HostName github.com - IdentityFile ~/.ssh/id_nonbillable - " > ~/.ssh/config - echo "$GH_NONBILLABLE_DEPLOYKEY" > ~/.ssh/id_nonbillable - chmod 600 ~/.ssh/id_nonbillable -fi - -if [ ! -d ./non-billable-projects ]; then - git clone git@github-nonbillable:CCI-MOC/non-billable-projects.git ./non-billable-projects -fi diff --git a/tools/clone_nonbillables_and_process_invoice.sh b/tools/clone_nonbillables_and_process_invoice.sh new file mode 100644 index 0000000..75503b5 --- /dev/null +++ b/tools/clone_nonbillables_and_process_invoice.sh @@ -0,0 +1,40 @@ +#!/bin/sh +set -e + +# Add deploy key to ssh config +mkdir -p ~/.ssh +if [ ! -e ~/.ssh/config ]; then + touch ~/.ssh/config + touch ~/.ssh/id_nonbillable + echo " + Host github-nonbillable + HostName github.com + IdentityFile ~/.ssh/id_nonbillable + " > ~/.ssh/config + echo "$GH_NONBILLABLE_DEPLOYKEY" > ~/.ssh/id_nonbillable + chmod 600 ~/.ssh/id_nonbillable +fi + +if ! 
grep -qs '^github.com ' ~/.ssh/known_hosts; then + touch ~/.ssh/known_hosts + echo "github.com ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOMqqnkVzrm0SdG6UOoqKLsabgH5C9okWi0dh2l9GKJl + github.com ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBEmKSENjQEezOmxkZMy7opKgwFB9nkt5YRrYMjNuG5N87uRgg6CLrbo5wAdT/y6v0mKV0U2w0WZ2YB/++Tpockg= + github.com ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQCj7ndNxQowgcQnjshcLrqPEiiphnt+VTTvDP6mHBL9j1aNUkY4Ue1gvwnGLVlOhGeYrnZaMgRK6+PKCUXaDbC7qtbW8gIkhL7aGCsOr/C56SJMy/BCZfxd1nWzAOxSDPgVsmerOBYfNqltV9/hWCqBywINIR+5dIg6JTJ72pcEpEjcYgXkE2YEFXV1JHnsKgbLWNlhScqb2UmyRkQyytRLtL+38TGxkxCflmO+5Z8CSSNY7GidjMIZ7Q4zMjA2n1nGrlTDkzwDCsw+wqFPGQA179cnfGWOWRVruj16z6XyvxvjJwbz0wQZ75XK5tKSb7FNyeIEs4TT4jk+S4dhPeAUC5y+bDYirYgM4GC7uEnztnZyaVWQ7B381AK4Qdrwt51ZqExKbQpTUNn+EjqoTwvqNj4kqx5QUCI0ThS/YkOxJCXmPUWZbhjpCg56i+2aB6CmK2JGhn57K5mj0MNdBXA4/WnwH6XoPWJzK5Nyu2zB3nAZp+S5hpQs+p1vN1/wsjk= + " >> ~/.ssh/known_hosts +fi + +if [ ! -d ./non-billable-projects ]; then + git clone git@github-nonbillable:CCI-MOC/non-billable-projects.git ./non-billable-projects +fi + +INVOICE_MONTH=$(date --date="$(date +%Y-%m-01) -1 month" +%Y-%m) +export B2_ENDPOINT +export B2_APP_KEY +export B2_KEY_ID +export B2_BUCKET_NAME +python process_report/process_report.py \ + --invoice-month "$INVOICE_MONTH" \ + --pi-file ./non-billable-projects/pi.txt \ + --projects-file ./non-billable-projects/projects.txt \ + --timed-projects-file ./non-billable-projects/timed_projects.txt \ + --old-pi-file old_pi.csv