Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow limiting New-PI credit to partner institutions #96

Merged
merged 2 commits into from
Sep 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,6 @@ RUN pip install -r requirements.txt

COPY tools/ tools/
COPY process_report/process_report.py process_report/
COPY process_report/institute_map.json process_report/
COPY process_report/institute_list.yaml process_report/

CMD ["tools/clone_nonbillables_and_process_invoice.sh"]
67 changes: 67 additions & 0 deletions process_report/institute_list.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
- display_name: Northeastern University
domains:
- northeastern.edu
- display_name: Boston University
domains:
- bu.edu
- display_name: Bentley
domains:
- bentley.edu
- display_name: University of Rhode Island
domains:
- uri.edu
- display_name: Red Hat
domains:
- redhat.com
- display_name: Boston Childrens Hospital
domains:
- childrens.harvard.edu
- display_name: McLean Hospital
domains:
- mclean.harvard.edu
- display_name: Massachusetts Eye & Ear
domains:
- meei.harvard.edu
- display_name: Dana-Farber Cancer Institute
domains:
- dfci.harvard.edu
- display_name: Brigham and Women's Hospital
domains:
- bwh.harvard.edu
- display_name: Beth Israel Deaconess Medical Center
domains:
- bidmc.harvard.edu
- display_name: Harvard University
domains:
- harvard.edu
- chemistry.harvard.edu
- display_name: Worcester Polytechnic Institute
domains:
- wpi.edu
- display_name: Massachusetts Institute of Technology
domains:
- mit.edu
- display_name: University of Massachusetts Amherst
domains:
- umass.edu
- display_name: University of Massachusetts Lowell
domains:
- uml.edu
- display_name: Code For Boston
domains:
- codeforboston.org
- display_name: Yale University
domains:
- yale.edu
- display_name: Dartmouth College
domains:
- dartmouth.edu
- display_name: Photrek
domains:
- photrek.io
- display_name: Positron Networks
domains:
- positronnetworks.com
- display_name: Next Generation Justice
domains:
- nextgenjustice.llc
35 changes: 29 additions & 6 deletions process_report/invoices/billable_invoice.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ class BillableInvoice(discount_invoice.DiscountInvoice):
nonbillable_pis: list[str]
nonbillable_projects: list[str]
old_pi_filepath: str
limit_new_pi_credit_to_partners: bool = False

@staticmethod
def _load_old_pis(old_pi_filepath) -> pandas.DataFrame:
Expand Down Expand Up @@ -115,6 +116,28 @@ def export_s3(self, s3_bucket):
super().export_s3(s3_bucket)
s3_bucket.upload_file(self.old_pi_filepath, self.PI_S3_FILEPATH)

def _filter_partners(self, data):
    """Return the rows of `data` whose institution is an active partner.

    An institute counts as an active partner when its entry in the
    institute list has a truthy `mghpcc_partnership_start_date` and
    `util.get_month_diff(self.invoice_month, start_date)` is non-negative.
    Institutes without a start date are never considered partners.
    """
    partner_names = [
        info["display_name"]
        for info in util.load_institute_list()
        if (start_date := info.get("mghpcc_partnership_start_date"))
        and util.get_month_diff(self.invoice_month, start_date) >= 0
    ]

    is_partner_row = data[invoice.INSTITUTION_FIELD].isin(partner_names)
    return data[is_partner_row]

def _filter_excluded_su_types(self, data):
    """Return the rows of `data` whose SU type is not in `self.EXCLUDE_SU_TYPES`."""
    is_excluded = data[invoice.SU_TYPE_FIELD].isin(self.EXCLUDE_SU_TYPES)
    return data[~is_excluded]

def _get_credit_eligible_projects(self, data: pandas.DataFrame):
    """Return the rows of `data` that may receive the New-PI credit.

    Rows with an excluded SU type are always dropped. When
    `self.limit_new_pi_credit_to_partners` is set, the result is further
    restricted by `_filter_partners`.
    """
    eligible = self._filter_excluded_su_types(data)
    if not self.limit_new_pi_credit_to_partners:
        return eligible

    return self._filter_partners(eligible)

def _apply_credits_new_pi(
self, data: pandas.DataFrame, old_pi_df: pandas.DataFrame
):
Expand All @@ -140,19 +163,19 @@ def get_initial_credit_amount(
)
print(f"New PI Credit set at {new_pi_credit_amount} for {self.invoice_month}")

current_pi_set = set(data[invoice.PI_FIELD])
credit_eligible_projects = self._get_credit_eligible_projects(data)
naved001 marked this conversation as resolved.
Show resolved Hide resolved
current_pi_set = set(credit_eligible_projects[invoice.PI_FIELD])
for pi in current_pi_set:
credit_eligible_projects = data[
(data[invoice.PI_FIELD] == pi)
& ~(data[invoice.SU_TYPE_FIELD].isin(self.EXCLUDE_SU_TYPES))
pi_projects = credit_eligible_projects[
credit_eligible_projects[invoice.PI_FIELD] == pi
]
pi_age = self._get_pi_age(old_pi_df, pi, self.invoice_month)
pi_old_pi_entry = old_pi_df.loc[
old_pi_df[invoice.PI_PI_FIELD] == pi
].squeeze()

if pi_age > 1:
for i, row in credit_eligible_projects.iterrows():
for i, row in pi_projects.iterrows():
data.at[i, invoice.BALANCE_FIELD] = row[invoice.COST_FIELD]
else:
if pi_age == 0:
Expand Down Expand Up @@ -180,7 +203,7 @@ def get_initial_credit_amount(

credits_used = self.apply_flat_discount(
data,
credit_eligible_projects,
pi_projects,
remaining_credit,
invoice.CREDIT_FIELD,
invoice.BALANCE_FIELD,
Expand Down
45 changes: 15 additions & 30 deletions process_report/process_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@
import sys
import datetime

import json
import pandas
import pyarrow
from nerc_rates import load_from_url

from process_report.util import get_invoice_bucket, process_and_export_invoices
from process_report import util
from process_report.invoices import (
lenovo_invoice,
nonbillable_invoice,
Expand Down Expand Up @@ -51,26 +51,6 @@
ALIAS_S3_FILEPATH = "PIs/alias.csv"


def get_institution_from_pi(institute_map, pi_uname):
    """Look up the institution name for a PI username.

    The text after the last "@" is treated as the domain. The full domain
    is tried first, then each parent domain in turn (leading labels are
    dropped one at a time) until `institute_map` yields a truthy name.
    Prints a warning and returns "" when nothing matches.
    """
    labels = pi_uname.split("@")[-1].split(".")
    institution_name = ""
    for start in range(len(labels)):
        candidate = ".".join(labels[start:])
        institution_name = institute_map.get(candidate, "")
        if institution_name:
            break

    if institution_name == "":
        print(f"Warning: PI name {pi_uname} does not match any institution!")

    return institution_name


def load_institute_map() -> dict:
    """Load and return the parsed contents of `process_report/institute_map.json`."""
    with open("process_report/institute_map.json", "r") as map_file:
        return json.load(map_file)


def load_alias(alias_file):
alias_dict = dict()

Expand Down Expand Up @@ -236,16 +216,20 @@ def main():
if args.upload_to_s3:
backup_to_s3_old_pi_file(old_pi_file)

rates_info = load_from_url()
billable_inv = billable_invoice.BillableInvoice(
name=args.output_file,
invoice_month=invoice_month,
data=merged_dataframe.copy(),
nonbillable_pis=pi,
nonbillable_projects=projects,
old_pi_filepath=old_pi_file,
limit_new_pi_credit_to_partners=rates_info.get_value_at(
"Limit New PI Credit to MGHPCC Partners", invoice_month
),
)

process_and_export_invoices(
util.process_and_export_invoices(
[lenovo_inv, nonbillable_inv, billable_inv], args.upload_to_s3
)

Expand All @@ -266,15 +250,15 @@ def main():
name=args.output_folder, invoice_month=invoice_month, data=billable_inv.data
)

process_and_export_invoices(
util.process_and_export_invoices(
[nerc_total_inv, bu_internal_inv, pi_inv], args.upload_to_s3
)


def fetch_s3_invoices(invoice_month):
"""Fetches usage invoices from S3 given invoice month"""
s3_invoice_list = list()
invoice_bucket = get_invoice_bucket()
invoice_bucket = util.get_invoice_bucket()
for obj in invoice_bucket.objects.filter(
Prefix=f"Invoices/{invoice_month}/Service Invoices/"
):
Expand Down Expand Up @@ -339,20 +323,20 @@ def validate_pi_aliases(dataframe: pandas.DataFrame, alias_dict: dict):

def fetch_s3_alias_file():
local_name = "alias.csv"
invoice_bucket = get_invoice_bucket()
invoice_bucket = util.get_invoice_bucket()
invoice_bucket.download_file(ALIAS_S3_FILEPATH, local_name)
return local_name


def fetch_s3_old_pi_file():
local_name = "PI.csv"
invoice_bucket = get_invoice_bucket()
invoice_bucket = util.get_invoice_bucket()
invoice_bucket.download_file(PI_S3_FILEPATH, local_name)
return local_name


def backup_to_s3_old_pi_file(old_pi_file):
invoice_bucket = get_invoice_bucket()
invoice_bucket = util.get_invoice_bucket()
invoice_bucket.upload_file(old_pi_file, f"PIs/Archive/PI {get_iso8601_time()}.csv")


Expand All @@ -368,14 +352,15 @@ def add_institution(dataframe: pandas.DataFrame):

The mappings are defined in `institute_list.yaml`.
"""
institute_map = load_institute_map()
institute_list = util.load_institute_list()
institute_map = util.get_institute_mapping(institute_list)
dataframe = dataframe.astype({INSTITUTION_FIELD: "str"})
for i, row in dataframe.iterrows():
pi_name = row[PI_FIELD]
if pandas.isna(pi_name):
print(f"Project {row[PROJECT_FIELD]} has no PI")
else:
dataframe.at[i, INSTITUTION_FIELD] = get_institution_from_pi(
dataframe.at[i, INSTITUTION_FIELD] = util.get_institution_from_pi(
institute_map, pi_name
)

Expand Down
39 changes: 37 additions & 2 deletions process_report/tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ def test_get_pi_institution(self):

for pi_email, answer in answers.items():
self.assertEqual(
process_report.get_institution_from_pi(institute_map, pi_email), answer
util.get_institution_from_pi(institute_map, pi_email), answer
)


Expand Down Expand Up @@ -789,7 +789,7 @@ def test_process_lenovo(self):


class TestUploadToS3(TestCase):
@mock.patch("process_report.process_report.get_invoice_bucket")
@mock.patch("process_report.util.get_invoice_bucket")
@mock.patch("process_report.util.get_iso8601_time")
def test_upload_to_s3(self, mock_get_time, mock_get_bucket):
mock_bucket = mock.MagicMock()
Expand Down Expand Up @@ -833,3 +833,38 @@ def test_upload_to_s3(self, mock_get_time, mock_get_bucket):

for i, call_args in enumerate(mock_bucket.upload_file.call_args_list):
self.assertTrue(answers[i] in call_args)


class TestNERCRates(TestCase):
    """Tests for the partner filter used when New-PI credit is limited to partners."""

    @mock.patch("process_report.util.load_institute_list")
    def test_flag_limit_new_pi_credit(self, mock_load_institute_list):
        """`_filter_partners` keeps only institutions whose partnership has started."""
        partnership_starts = {"BU": "2024-02", "HU": "2024-6", "NEU": "2024-11"}
        mock_load_institute_list.return_value = [
            {"display_name": name, "mghpcc_partnership_start_date": start}
            for name, start in partnership_starts.items()
        ]
        institutions = pandas.DataFrame(
            {"Institution": ["BU", "HU", "NEU", "MIT", "BC"]}
        )
        billable_inv = test_utils.new_billable_invoice(
            limit_new_pi_credit_to_partners=True
        )

        # Invoice month precedes every start date: nothing passes the filter
        billable_inv.invoice_month = "2024-01"
        self.assertTrue(billable_inv._filter_partners(institutions).empty)

        # Some, then all, partnerships are active
        for month, active_partners in (
            ("2024-06", ["BU", "HU"]),
            ("2024-12", ["BU", "HU", "NEU"]),
        ):
            billable_inv.invoice_month = month
            filtered = billable_inv._filter_partners(institutions)
            expected = pandas.DataFrame({"Institution": active_partners})
            self.assertTrue(filtered.equals(expected))
2 changes: 2 additions & 0 deletions process_report/tests/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def new_billable_invoice(
nonbillable_pis=[],
nonbillable_projects=[],
old_pi_filepath="",
limit_new_pi_credit_to_partners=False,
):
return billable_invoice.BillableInvoice(
name,
Expand All @@ -31,6 +32,7 @@ def new_billable_invoice(
nonbillable_pis,
nonbillable_projects,
old_pi_filepath,
limit_new_pi_credit_to_partners,
)


Expand Down
26 changes: 19 additions & 7 deletions process_report/util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os
import datetime
import json
import yaml
import logging
import functools

Expand All @@ -27,21 +27,33 @@ def get_invoice_bucket():
return s3_resource.Bucket(os.environ.get("S3_BUCKET_NAME", "nerc-invoicing"))


def get_institute_mapping(institute_list: list):
    """Build a domain -> display name lookup from the institute list.

    Each entry of `institute_list` must provide a `display_name` and an
    iterable of `domains`; every domain maps to its entry's display name.
    If a domain appears more than once, the later entry wins.
    """
    return {
        domain: institute_info["display_name"]
        for institute_info in institute_list
        for domain in institute_info["domains"]
    }


def get_institution_from_pi(institute_map, pi_uname):
    """Resolve a PI username to an institution display name.

    The text after the last "@" in `pi_uname` is taken as the domain. The
    full domain is looked up in `institute_map` first; on a miss the
    leftmost label is stripped and the parent domain is tried, until a
    truthy name is found or no "." remains.

    Args:
        institute_map: mapping of domain -> institution display name.
        pi_uname: PI username, normally an email address.

    Returns:
        The matched institution name, or "" (after logging a warning)
        when neither the domain nor any parent domain is in the map.
    """
    institution_domain = pi_uname.split("@")[-1]
    while True:
        institution_name = institute_map.get(institution_domain, "")
        if institution_name or "." not in institution_domain:
            break
        # Drop the leftmost label: "chemistry.harvard.edu" -> "harvard.edu"
        institution_domain = institution_domain.partition(".")[2]

    if institution_name == "":
        # Logger.warn is a deprecated alias; use Logger.warning
        logger.warning(f"PI name {pi_uname} does not match any institution!")

    return institution_name


def load_institute_map() -> dict:
with open("process_report/institute_map.json", "r") as f:
institute_map = json.load(f)
def load_institute_list():
with open("process_report/institute_list.yaml", "r") as f:
institute_list = yaml.safe_load(f)

return institute_map
return institute_list


def get_iso8601_time():
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
git+https://github.com/CCI-MOC/nerc-rates@74eb4a7#egg=nerc_rates
pandas
pyarrow
boto3
Loading