From 90e7e9b31e546eeab9550c55d0e816aabe8f373d Mon Sep 17 00:00:00 2001 From: Quan Pham Date: Thu, 12 Sep 2024 09:37:33 -0400 Subject: [PATCH 1/2] Restructured institute_map.json into a YAML list As prerequisite to adding metadata about our list of institutions, such as whether MGHPCC has a partnership with them, and to make the file more human-friendly, `institute.json` has been converted to a YAML and formatted as a list. Each element of the YAML list must be a dict with 2 attributes: - `display_name`: The name of the institute, as will appear on invoices - `domains`: A list containing domains for that institute Additional metadata can be freely added to each institute dict as our billing needs change. There is also some small cleanup. Namely, some functions in `process_report.py` that have been moved to `util.py` were not removed during refactoring. --- Dockerfile | 2 +- process_report/institute_list.yaml | 74 ++++++++++++++++++++++++++++++ process_report/process_report.py | 40 ++++------------ process_report/tests/unit_tests.py | 4 +- process_report/util.py | 28 +++++++---- 5 files changed, 107 insertions(+), 41 deletions(-) create mode 100644 process_report/institute_list.yaml diff --git a/Dockerfile b/Dockerfile index 464519d..8c4af28 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,6 +9,6 @@ RUN pip install -r requirements.txt COPY tools/ tools/ COPY process_report/process_report.py process_report/ -COPY process_report/institute_map.json process_report/ +COPY process_report/institute_list.yaml process_report/ CMD ["tools/clone_nonbillables_and_process_invoice.sh"] diff --git a/process_report/institute_list.yaml b/process_report/institute_list.yaml new file mode 100644 index 0000000..b0d85aa --- /dev/null +++ b/process_report/institute_list.yaml @@ -0,0 +1,74 @@ +- display_name: Northeastern University + domains: + - northeastern.edu +- display_name: Boston University + domains: + - bu.edu + - robbaron +- display_name: Bentley + domains: + - bentley.edu +- display_name: University of Rhode Island + domains: + - uri.edu +- display_name: Red Hat + domains: + - redhat.com +- display_name: Boston Childrens Hospital + domains: + - childrens.harvard.edu + - rudolph +- display_name: McLean Hospital + domains: + - mclean.harvard.edu +- display_name: Massachusetts Eye & Ear + domains: + - meei.harvard.edu +- display_name: Dana-Farber Cancer Institute + domains: + - dfci.harvard.edu +- display_name: Brigham and Women's Hospital + domains: + - bwh.harvard.edu +- display_name: Beth Israel Deaconess Medical Center + domains: + - bidmc.harvard.edu +- display_name: Harvard University + domains: + - harvard.edu + - mmsh + - kmdalton + - francesco.pontiggia + - chemistry.harvard.edu +- display_name: Worcester Polytechnic Institute + domains: + - wpi.edu +- display_name: Massachusetts Institute of Technology + domains: + - mit.edu +- display_name: University of Massachusetts Amherst + domains: + - umass.edu + - gstuart + - mzink +- display_name: University of Massachusetts Lowell + domains: + - uml.edu +- display_name: Code For Boston + domains: + - codeforboston.org +- display_name: Yale University + domains: + - yale.edu +- display_name: Dartmouth College + domains: + - dartmouth.edu +- display_name: Photrek + domains: + - photrek.io +- display_name: Positron Networks + domains: + - positronnetworks.com +- display_name: Next Generation Justice + domains: + - nextgenjustice.llc diff --git a/process_report/process_report.py b/process_report/process_report.py index 4207152..49d31d7 100644 --- a/process_report/process_report.py +++ b/process_report/process_report.py @@ -2,11 +2,10 @@ import sys import datetime -import json import pandas import pyarrow -from process_report.util import get_invoice_bucket, process_and_export_invoices +from process_report import util from process_report.invoices import ( lenovo_invoice, nonbillable_invoice, @@ -51,26 +50,6 @@ ALIAS_S3_FILEPATH = "PIs/alias.csv" -def get_institution_from_pi(institute_map, pi_uname): - institution_domain = pi_uname.split("@")[-1] - for i in range(institution_domain.count(".") + 1): - if institution_name := institute_map.get(institution_domain, ""): - break - institution_domain = institution_domain[institution_domain.find(".") + 1 :] - - if institution_name == "": - print(f"Warning: PI name {pi_uname} does not match any institution!") - - return institution_name - - -def load_institute_map() -> dict: - with open("process_report/institute_map.json", "r") as f: - institute_map = json.load(f) - - return institute_map - - def load_alias(alias_file): alias_dict = dict() @@ -245,7 +224,7 @@ def main(): old_pi_filepath=old_pi_file, ) - process_and_export_invoices( + util.process_and_export_invoices( [lenovo_inv, nonbillable_inv, billable_inv], args.upload_to_s3 ) @@ -266,7 +245,7 @@ def main(): name=args.output_folder, invoice_month=invoice_month, data=billable_inv.data ) - process_and_export_invoices( + util.process_and_export_invoices( [nerc_total_inv, bu_internal_inv, pi_inv], args.upload_to_s3 ) @@ -274,7 +253,7 @@ def main(): def fetch_s3_invoices(invoice_month): """Fetches usage invoices from S3 given invoice month""" s3_invoice_list = list() - invoice_bucket = get_invoice_bucket() + invoice_bucket = util.get_invoice_bucket() for obj in invoice_bucket.objects.filter( Prefix=f"Invoices/{invoice_month}/Service Invoices/" ): @@ -339,20 +318,20 @@ def validate_pi_aliases(dataframe: pandas.DataFrame, alias_dict: dict): def fetch_s3_alias_file(): local_name = "alias.csv" - invoice_bucket = get_invoice_bucket() + invoice_bucket = util.get_invoice_bucket() invoice_bucket.download_file(ALIAS_S3_FILEPATH, local_name) return local_name def fetch_s3_old_pi_file(): local_name = "PI.csv" - invoice_bucket = get_invoice_bucket() + invoice_bucket = util.get_invoice_bucket() invoice_bucket.download_file(PI_S3_FILEPATH, local_name) return local_name def backup_to_s3_old_pi_file(old_pi_file): - invoice_bucket = get_invoice_bucket() + invoice_bucket = util.get_invoice_bucket() invoice_bucket.upload_file(old_pi_file, f"PIs/Archive/PI {get_iso8601_time()}.csv") @@ -368,14 +347,15 @@ def add_institution(dataframe: pandas.DataFrame): The list of mappings are defined in `institute_map.json`. """ - institute_map = load_institute_map() + institute_list = util.load_institute_list() + institute_map = util.get_institute_mapping(institute_list) dataframe = dataframe.astype({INSTITUTION_FIELD: "str"}) for i, row in dataframe.iterrows(): pi_name = row[PI_FIELD] if pandas.isna(pi_name): print(f"Project {row[PROJECT_FIELD]} has no PI") else: - dataframe.at[i, INSTITUTION_FIELD] = get_institution_from_pi( + dataframe.at[i, INSTITUTION_FIELD] = util.get_institution_from_pi( institute_map, pi_name ) diff --git a/process_report/tests/unit_tests.py b/process_report/tests/unit_tests.py index b2eeb3a..d56f7f9 100644 --- a/process_report/tests/unit_tests.py +++ b/process_report/tests/unit_tests.py @@ -250,7 +250,7 @@ def test_get_pi_institution(self): for pi_email, answer in answers.items(): self.assertEqual( - process_report.get_institution_from_pi(institute_map, pi_email), answer + util.get_institution_from_pi(institute_map, pi_email), answer ) @@ -789,7 +789,7 @@ def test_process_lenovo(self): class TestUploadToS3(TestCase): - @mock.patch("process_report.process_report.get_invoice_bucket") + @mock.patch("process_report.util.get_invoice_bucket") @mock.patch("process_report.util.get_iso8601_time") def test_upload_to_s3(self, mock_get_time, mock_get_bucket): mock_bucket = mock.MagicMock() diff --git a/process_report/util.py b/process_report/util.py index 45b9ed4..4f1ef79 100644 --- a/process_report/util.py +++ b/process_report/util.py @@ -1,6 +1,6 @@ import os import datetime -import json +import yaml import logging import functools @@ -27,21 +27,33 @@ def get_invoice_bucket(): return s3_resource.Bucket(os.environ.get("S3_BUCKET_NAME", "nerc-invoicing")) +def get_institute_mapping(institute_list: list): + institute_map = dict() + for institute_info in institute_list: + for domain in institute_info["domains"]: + institute_map[domain] = institute_info["display_name"] + + return institute_map + + def get_institution_from_pi(institute_map, pi_uname): - institution_key = pi_uname.split("@")[-1] - institution_name = institute_map.get(institution_key, "") + institution_domain = pi_uname.split("@")[-1] + for i in range(institution_domain.count(".") + 1): + if institution_name := institute_map.get(institution_domain, ""): + break + institution_domain = institution_domain[institution_domain.find(".") + 1 :] if institution_name == "": - logger.warn(f"PI name {pi_uname} does not match any institution!") + print(f"Warning: PI name {pi_uname} does not match any institution!") return institution_name -def load_institute_map() -> dict: - with open("process_report/institute_map.json", "r") as f: - institute_map = json.load(f) +def load_institute_list(): + with open("process_report/institute_list.yaml", "r") as f: + institute_list = yaml.safe_load(f) - return institute_map + return institute_list def get_iso8601_time(): From 696407c13f7c377649169f372b120f4851605a0f Mon Sep 17 00:00:00 2001 From: Quan Pham Date: Wed, 11 Sep 2024 13:44:40 -0400 Subject: [PATCH 2/2] Allow limiting New-PI credit to partner institutions `process_report.py` will now use `nerc_rates` to determine if the New-PI credit is limited to institutions that are MGHPCC partners for the given invoice month. The script determines whether an institution's MGHPCC partnership is active by first seeing if the `mghpcc_partnership_start_date` field is set in `institute_list.yaml`, then checks if the start date is within or before the invoice month. Also removed usernames from `institute_list.yaml`, as they are no longer nessecary given the alias validation step. --- process_report/institute_list.yaml | 7 ----- process_report/invoices/billable_invoice.py | 35 +++++++++++++++++---- process_report/process_report.py | 5 +++ process_report/tests/unit_tests.py | 35 +++++++++++++++++++++ process_report/tests/util.py | 2 ++ process_report/util.py | 2 +- requirements.txt | 1 + 7 files changed, 73 insertions(+), 14 deletions(-) diff --git a/process_report/institute_list.yaml b/process_report/institute_list.yaml index b0d85aa..b459bd1 100644 --- a/process_report/institute_list.yaml +++ b/process_report/institute_list.yaml @@ -4,7 +4,6 @@ - display_name: Boston University domains: - bu.edu - - robbaron - display_name: Bentley domains: - bentley.edu @@ -17,7 +16,6 @@ - display_name: Boston Childrens Hospital domains: - childrens.harvard.edu - - rudolph - display_name: McLean Hospital domains: - mclean.harvard.edu @@ -36,9 +34,6 @@ - display_name: Harvard University domains: - harvard.edu - - mmsh - - kmdalton - - francesco.pontiggia - chemistry.harvard.edu - display_name: Worcester Polytechnic Institute domains: @@ -49,8 +44,6 @@ - display_name: University of Massachusetts Amherst domains: - umass.edu - - gstuart - - mzink - display_name: University of Massachusetts Lowell domains: - uml.edu diff --git a/process_report/invoices/billable_invoice.py b/process_report/invoices/billable_invoice.py index 0cf7d3e..efaddd1 100644 --- a/process_report/invoices/billable_invoice.py +++ b/process_report/invoices/billable_invoice.py @@ -23,6 +23,7 @@ class BillableInvoice(discount_invoice.DiscountInvoice): nonbillable_pis: list[str] nonbillable_projects: list[str] old_pi_filepath: str + limit_new_pi_credit_to_partners: bool = False @staticmethod def _load_old_pis(old_pi_filepath) -> pandas.DataFrame: @@ -115,6 +116,28 @@ def export_s3(self, s3_bucket): super().export_s3(s3_bucket) s3_bucket.upload_file(self.old_pi_filepath, self.PI_S3_FILEPATH) + def _filter_partners(self, data): + active_partnerships = list() + institute_list = util.load_institute_list() + for institute_info in institute_list: + if partnership_start_date := institute_info.get( + "mghpcc_partnership_start_date" + ): + if util.get_month_diff(self.invoice_month, partnership_start_date) >= 0: + active_partnerships.append(institute_info["display_name"]) + + return data[data[invoice.INSTITUTION_FIELD].isin(active_partnerships)] + + def _filter_excluded_su_types(self, data): + return data[~(data[invoice.SU_TYPE_FIELD].isin(self.EXCLUDE_SU_TYPES))] + + def _get_credit_eligible_projects(self, data: pandas.DataFrame): + filtered_data = self._filter_excluded_su_types(data) + if self.limit_new_pi_credit_to_partners: + filtered_data = self._filter_partners(filtered_data) + + return filtered_data + def _apply_credits_new_pi( self, data: pandas.DataFrame, old_pi_df: pandas.DataFrame ): @@ -140,11 +163,11 @@ def get_initial_credit_amount( ) print(f"New PI Credit set at {new_pi_credit_amount} for {self.invoice_month}") - current_pi_set = set(data[invoice.PI_FIELD]) + credit_eligible_projects = self._get_credit_eligible_projects(data) + current_pi_set = set(credit_eligible_projects[invoice.PI_FIELD]) for pi in current_pi_set: - credit_eligible_projects = data[ - (data[invoice.PI_FIELD] == pi) - & ~(data[invoice.SU_TYPE_FIELD].isin(self.EXCLUDE_SU_TYPES)) + pi_projects = credit_eligible_projects[ + credit_eligible_projects[invoice.PI_FIELD] == pi ] pi_age = self._get_pi_age(old_pi_df, pi, self.invoice_month) pi_old_pi_entry = old_pi_df.loc[ @@ -152,7 +175,7 @@ def get_initial_credit_amount( ].squeeze() if pi_age > 1: - for i, row in credit_eligible_projects.iterrows(): + for i, row in pi_projects.iterrows(): data.at[i, invoice.BALANCE_FIELD] = row[invoice.COST_FIELD] else: if pi_age == 0: @@ -180,7 +203,7 @@ def get_initial_credit_amount( credits_used = self.apply_flat_discount( data, - credit_eligible_projects, + pi_projects, remaining_credit, invoice.CREDIT_FIELD, invoice.BALANCE_FIELD, diff --git a/process_report/process_report.py b/process_report/process_report.py index 49d31d7..d88808e 100644 --- a/process_report/process_report.py +++ b/process_report/process_report.py @@ -4,6 +4,7 @@ import pandas import pyarrow +from nerc_rates import load_from_url from process_report import util from process_report.invoices import ( @@ -215,6 +216,7 @@ def main(): if args.upload_to_s3: backup_to_s3_old_pi_file(old_pi_file) + rates_info = load_from_url() billable_inv = billable_invoice.BillableInvoice( name=args.output_file, invoice_month=invoice_month, @@ -222,6 +224,9 @@ def main(): nonbillable_pis=pi, nonbillable_projects=projects, old_pi_filepath=old_pi_file, + limit_new_pi_credit_to_partners=rates_info.get_value_at( + "Limit New PI Credit to MGHPCC Partners", invoice_month + ), ) util.process_and_export_invoices( diff --git a/process_report/tests/unit_tests.py b/process_report/tests/unit_tests.py index d56f7f9..9bce39f 100644 --- a/process_report/tests/unit_tests.py +++ b/process_report/tests/unit_tests.py @@ -833,3 +833,38 @@ def test_upload_to_s3(self, mock_get_time, mock_get_bucket): for i, call_args in enumerate(mock_bucket.upload_file.call_args_list): self.assertTrue(answers[i] in call_args) + + +class TestNERCRates(TestCase): + @mock.patch("process_report.util.load_institute_list") + def test_flag_limit_new_pi_credit(self, mock_load_institute_list): + mock_load_institute_list.return_value = [ + {"display_name": "BU", "mghpcc_partnership_start_date": "2024-02"}, + {"display_name": "HU", "mghpcc_partnership_start_date": "2024-6"}, + {"display_name": "NEU", "mghpcc_partnership_start_date": "2024-11"}, + ] + sample_df = pandas.DataFrame( + { + "Institution": ["BU", "HU", "NEU", "MIT", "BC"], + } + ) + sample_inv = test_utils.new_billable_invoice( + limit_new_pi_credit_to_partners=True + ) + + # When no partnerships are active + sample_inv.invoice_month = "2024-01" + output_df = sample_inv._filter_partners(sample_df) + self.assertTrue(output_df.empty) + + # When some partnerships are active + sample_inv.invoice_month = "2024-06" + output_df = sample_inv._filter_partners(sample_df) + answer_df = pandas.DataFrame({"Institution": ["BU", "HU"]}) + self.assertTrue(output_df.equals(answer_df)) + + # When all partnerships are active + sample_inv.invoice_month = "2024-12" + output_df = sample_inv._filter_partners(sample_df) + answer_df = pandas.DataFrame({"Institution": ["BU", "HU", "NEU"]}) + self.assertTrue(output_df.equals(answer_df)) diff --git a/process_report/tests/util.py b/process_report/tests/util.py index 95148d1..c546441 100644 --- a/process_report/tests/util.py +++ b/process_report/tests/util.py @@ -23,6 +23,7 @@ def new_billable_invoice( nonbillable_pis=[], nonbillable_projects=[], old_pi_filepath="", + limit_new_pi_credit_to_partners=False, ): return billable_invoice.BillableInvoice( name, @@ -31,6 +32,7 @@ def new_billable_invoice( nonbillable_pis, nonbillable_projects, old_pi_filepath, + limit_new_pi_credit_to_partners, ) diff --git a/process_report/util.py b/process_report/util.py index 4f1ef79..8063bc4 100644 --- a/process_report/util.py +++ b/process_report/util.py @@ -44,7 +44,7 @@ def get_institution_from_pi(institute_map, pi_uname): institution_domain = institution_domain[institution_domain.find(".") + 1 :] if institution_name == "": - print(f"Warning: PI name {pi_uname} does not match any institution!") + logger.warn(f"PI name {pi_uname} does not match any institution!") return institution_name diff --git a/requirements.txt b/requirements.txt index cc0d852..748b45a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +git+https://github.com/CCI-MOC/nerc-rates@74eb4a7#egg=nerc_rates pandas pyarrow boto3