diff --git a/Dockerfile b/Dockerfile index 464519d..8c4af28 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,6 +9,6 @@ RUN pip install -r requirements.txt COPY tools/ tools/ COPY process_report/process_report.py process_report/ -COPY process_report/institute_map.json process_report/ +COPY process_report/institute_list.yaml process_report/ CMD ["tools/clone_nonbillables_and_process_invoice.sh"] diff --git a/process_report/institute_list.yaml b/process_report/institute_list.yaml new file mode 100644 index 0000000..b459bd1 --- /dev/null +++ b/process_report/institute_list.yaml @@ -0,0 +1,67 @@ +- display_name: Northeastern University + domains: + - northeastern.edu +- display_name: Boston University + domains: + - bu.edu +- display_name: Bentley + domains: + - bentley.edu +- display_name: University of Rhode Island + domains: + - uri.edu +- display_name: Red Hat + domains: + - redhat.com +- display_name: Boston Childrens Hospital + domains: + - childrens.harvard.edu +- display_name: McLean Hospital + domains: + - mclean.harvard.edu +- display_name: Massachusetts Eye & Ear + domains: + - meei.harvard.edu +- display_name: Dana-Farber Cancer Institute + domains: + - dfci.harvard.edu +- display_name: Brigham and Women's Hospital + domains: + - bwh.harvard.edu +- display_name: Beth Israel Deaconess Medical Center + domains: + - bidmc.harvard.edu +- display_name: Harvard University + domains: + - harvard.edu + - chemistry.harvard.edu +- display_name: Worcester Polytechnic Institute + domains: + - wpi.edu +- display_name: Massachusetts Institute of Technology + domains: + - mit.edu +- display_name: University of Massachusetts Amherst + domains: + - umass.edu +- display_name: University of Massachusetts Lowell + domains: + - uml.edu +- display_name: Code For Boston + domains: + - codeforboston.org +- display_name: Yale University + domains: + - yale.edu +- display_name: Dartmouth College + domains: + - dartmouth.edu +- display_name: Photrek + domains: + - photrek.io +- display_name: Positron Networks + domains: + - positronnetworks.com +- display_name: Next Generation Justice + domains: + - nextgenjustice.llc diff --git a/process_report/invoices/billable_invoice.py b/process_report/invoices/billable_invoice.py index 0cf7d3e..efaddd1 100644 --- a/process_report/invoices/billable_invoice.py +++ b/process_report/invoices/billable_invoice.py @@ -23,6 +23,7 @@ class BillableInvoice(discount_invoice.DiscountInvoice): nonbillable_pis: list[str] nonbillable_projects: list[str] old_pi_filepath: str + limit_new_pi_credit_to_partners: bool = False @staticmethod def _load_old_pis(old_pi_filepath) -> pandas.DataFrame: @@ -115,6 +116,28 @@ def export_s3(self, s3_bucket): super().export_s3(s3_bucket) s3_bucket.upload_file(self.old_pi_filepath, self.PI_S3_FILEPATH) + def _filter_partners(self, data): + active_partnerships = list() + institute_list = util.load_institute_list() + for institute_info in institute_list: + if partnership_start_date := institute_info.get( + "mghpcc_partnership_start_date" + ): + if util.get_month_diff(self.invoice_month, partnership_start_date) >= 0: + active_partnerships.append(institute_info["display_name"]) + + return data[data[invoice.INSTITUTION_FIELD].isin(active_partnerships)] + + def _filter_excluded_su_types(self, data): + return data[~(data[invoice.SU_TYPE_FIELD].isin(self.EXCLUDE_SU_TYPES))] + + def _get_credit_eligible_projects(self, data: pandas.DataFrame): + filtered_data = self._filter_excluded_su_types(data) + if self.limit_new_pi_credit_to_partners: + filtered_data = self._filter_partners(filtered_data) + + return filtered_data + def _apply_credits_new_pi( self, data: pandas.DataFrame, old_pi_df: pandas.DataFrame ): @@ -140,11 +163,11 @@ def get_initial_credit_amount( ) print(f"New PI Credit set at {new_pi_credit_amount} for {self.invoice_month}") - current_pi_set = set(data[invoice.PI_FIELD]) + credit_eligible_projects = self._get_credit_eligible_projects(data) + current_pi_set = set(credit_eligible_projects[invoice.PI_FIELD]) for pi in current_pi_set: - credit_eligible_projects = data[ - (data[invoice.PI_FIELD] == pi) - & ~(data[invoice.SU_TYPE_FIELD].isin(self.EXCLUDE_SU_TYPES)) + pi_projects = credit_eligible_projects[ + credit_eligible_projects[invoice.PI_FIELD] == pi ] pi_age = self._get_pi_age(old_pi_df, pi, self.invoice_month) pi_old_pi_entry = old_pi_df.loc[ @@ -152,7 +175,7 @@ def get_initial_credit_amount( ].squeeze() if pi_age > 1: - for i, row in credit_eligible_projects.iterrows(): + for i, row in pi_projects.iterrows(): data.at[i, invoice.BALANCE_FIELD] = row[invoice.COST_FIELD] else: if pi_age == 0: @@ -180,7 +203,7 @@ def get_initial_credit_amount( credits_used = self.apply_flat_discount( data, - credit_eligible_projects, + pi_projects, remaining_credit, invoice.CREDIT_FIELD, invoice.BALANCE_FIELD, diff --git a/process_report/process_report.py b/process_report/process_report.py index 4207152..d88808e 100644 --- a/process_report/process_report.py +++ b/process_report/process_report.py @@ -2,11 +2,11 @@ import sys import datetime -import json import pandas import pyarrow +from nerc_rates import load_from_url -from process_report.util import get_invoice_bucket, process_and_export_invoices +from process_report import util from process_report.invoices import ( lenovo_invoice, nonbillable_invoice, @@ -51,26 +51,6 @@ ALIAS_S3_FILEPATH = "PIs/alias.csv" -def get_institution_from_pi(institute_map, pi_uname): - institution_domain = pi_uname.split("@")[-1] - for i in range(institution_domain.count(".") + 1): - if institution_name := institute_map.get(institution_domain, ""): - break - institution_domain = institution_domain[institution_domain.find(".") + 1 :] - - if institution_name == "": - print(f"Warning: PI name {pi_uname} does not match any institution!") - - return institution_name - - -def load_institute_map() -> dict: - with open("process_report/institute_map.json", "r") as f: - institute_map = json.load(f) - - return institute_map - - def load_alias(alias_file): alias_dict = dict() @@ -236,6 +216,7 @@ def main(): if args.upload_to_s3: backup_to_s3_old_pi_file(old_pi_file) + rates_info = load_from_url() billable_inv = billable_invoice.BillableInvoice( name=args.output_file, invoice_month=invoice_month, @@ -243,9 +224,12 @@ def main(): nonbillable_pis=pi, nonbillable_projects=projects, old_pi_filepath=old_pi_file, + limit_new_pi_credit_to_partners=rates_info.get_value_at( + "Limit New PI Credit to MGHPCC Partners", invoice_month + ), ) - process_and_export_invoices( + util.process_and_export_invoices( [lenovo_inv, nonbillable_inv, billable_inv], args.upload_to_s3 ) @@ -266,7 +250,7 @@ def main(): name=args.output_folder, invoice_month=invoice_month, data=billable_inv.data ) - process_and_export_invoices( + util.process_and_export_invoices( [nerc_total_inv, bu_internal_inv, pi_inv], args.upload_to_s3 ) @@ -274,7 +258,7 @@ def main(): def fetch_s3_invoices(invoice_month): """Fetches usage invoices from S3 given invoice month""" s3_invoice_list = list() - invoice_bucket = get_invoice_bucket() + invoice_bucket = util.get_invoice_bucket() for obj in invoice_bucket.objects.filter( Prefix=f"Invoices/{invoice_month}/Service Invoices/" ): @@ -339,20 +323,20 @@ def validate_pi_aliases(dataframe: pandas.DataFrame, alias_dict: dict): def fetch_s3_alias_file(): local_name = "alias.csv" - invoice_bucket = get_invoice_bucket() + invoice_bucket = util.get_invoice_bucket() invoice_bucket.download_file(ALIAS_S3_FILEPATH, local_name) return local_name def fetch_s3_old_pi_file(): local_name = "PI.csv" - invoice_bucket = get_invoice_bucket() + invoice_bucket = util.get_invoice_bucket() invoice_bucket.download_file(PI_S3_FILEPATH, local_name) return local_name def backup_to_s3_old_pi_file(old_pi_file): - invoice_bucket = get_invoice_bucket() + invoice_bucket = util.get_invoice_bucket() invoice_bucket.upload_file(old_pi_file, f"PIs/Archive/PI {get_iso8601_time()}.csv") @@ -368,14 +352,15 @@ def add_institution(dataframe: pandas.DataFrame): The list of mappings are defined in `institute_map.json`. """ - institute_map = load_institute_map() + institute_list = util.load_institute_list() + institute_map = util.get_institute_mapping(institute_list) dataframe = dataframe.astype({INSTITUTION_FIELD: "str"}) for i, row in dataframe.iterrows(): pi_name = row[PI_FIELD] if pandas.isna(pi_name): print(f"Project {row[PROJECT_FIELD]} has no PI") else: - dataframe.at[i, INSTITUTION_FIELD] = get_institution_from_pi( + dataframe.at[i, INSTITUTION_FIELD] = util.get_institution_from_pi( institute_map, pi_name ) diff --git a/process_report/tests/unit_tests.py b/process_report/tests/unit_tests.py index b2eeb3a..9bce39f 100644 --- a/process_report/tests/unit_tests.py +++ b/process_report/tests/unit_tests.py @@ -250,7 +250,7 @@ def test_get_pi_institution(self): for pi_email, answer in answers.items(): self.assertEqual( - process_report.get_institution_from_pi(institute_map, pi_email), answer + util.get_institution_from_pi(institute_map, pi_email), answer ) @@ -789,7 +789,7 @@ def test_process_lenovo(self): class TestUploadToS3(TestCase): - @mock.patch("process_report.process_report.get_invoice_bucket") + @mock.patch("process_report.util.get_invoice_bucket") @mock.patch("process_report.util.get_iso8601_time") def test_upload_to_s3(self, mock_get_time, mock_get_bucket): mock_bucket = mock.MagicMock() @@ -833,3 +833,38 @@ def test_upload_to_s3(self, mock_get_time, mock_get_bucket): for i, call_args in enumerate(mock_bucket.upload_file.call_args_list): self.assertTrue(answers[i] in call_args) + + +class TestNERCRates(TestCase): + @mock.patch("process_report.util.load_institute_list") + def test_flag_limit_new_pi_credit(self, mock_load_institute_list): + mock_load_institute_list.return_value = [ + {"display_name": "BU", "mghpcc_partnership_start_date": "2024-02"}, + {"display_name": "HU", "mghpcc_partnership_start_date": "2024-6"}, + {"display_name": "NEU", "mghpcc_partnership_start_date": "2024-11"}, + ] + sample_df = pandas.DataFrame( + { + "Institution": ["BU", "HU", "NEU", "MIT", "BC"], + } + ) + sample_inv = test_utils.new_billable_invoice( + limit_new_pi_credit_to_partners=True + ) + + # When no partnerships are active + sample_inv.invoice_month = "2024-01" + output_df = sample_inv._filter_partners(sample_df) + self.assertTrue(output_df.empty) + + # When some partnerships are active + sample_inv.invoice_month = "2024-06" + output_df = sample_inv._filter_partners(sample_df) + answer_df = pandas.DataFrame({"Institution": ["BU", "HU"]}) + self.assertTrue(output_df.equals(answer_df)) + + # When all partnerships are active + sample_inv.invoice_month = "2024-12" + output_df = sample_inv._filter_partners(sample_df) + answer_df = pandas.DataFrame({"Institution": ["BU", "HU", "NEU"]}) + self.assertTrue(output_df.equals(answer_df)) diff --git a/process_report/tests/util.py b/process_report/tests/util.py index 95148d1..c546441 100644 --- a/process_report/tests/util.py +++ b/process_report/tests/util.py @@ -23,6 +23,7 @@ def new_billable_invoice( nonbillable_pis=[], nonbillable_projects=[], old_pi_filepath="", + limit_new_pi_credit_to_partners=False, ): return billable_invoice.BillableInvoice( name, @@ -31,6 +32,7 @@ def new_billable_invoice( nonbillable_pis, nonbillable_projects, old_pi_filepath, + limit_new_pi_credit_to_partners, ) diff --git a/process_report/util.py b/process_report/util.py index 45b9ed4..8063bc4 100644 --- a/process_report/util.py +++ b/process_report/util.py @@ -1,6 +1,6 @@ import os import datetime -import json +import yaml import logging import functools @@ -27,9 +27,21 @@ def get_invoice_bucket(): return s3_resource.Bucket(os.environ.get("S3_BUCKET_NAME", "nerc-invoicing")) +def get_institute_mapping(institute_list: list): + institute_map = dict() + for institute_info in institute_list: + for domain in institute_info["domains"]: + institute_map[domain] = institute_info["display_name"] + + return institute_map + + def get_institution_from_pi(institute_map, pi_uname): - institution_key = pi_uname.split("@")[-1] - institution_name = institute_map.get(institution_key, "") + institution_domain = pi_uname.split("@")[-1] + for i in range(institution_domain.count(".") + 1): + if institution_name := institute_map.get(institution_domain, ""): + break + institution_domain = institution_domain[institution_domain.find(".") + 1 :] if institution_name == "": logger.warn(f"PI name {pi_uname} does not match any institution!") @@ -37,11 +49,11 @@ def get_institution_from_pi(institute_map, pi_uname): return institution_name -def load_institute_map() -> dict: - with open("process_report/institute_map.json", "r") as f: - institute_map = json.load(f) +def load_institute_list(): + with open("process_report/institute_list.yaml", "r") as f: + institute_list = yaml.safe_load(f) - return institute_map + return institute_list def get_iso8601_time(): diff --git a/requirements.txt b/requirements.txt index cc0d852..748b45a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +git+https://github.com/CCI-MOC/nerc-rates@74eb4a7#egg=nerc_rates pandas pyarrow boto3