Skip to content

Commit

Permalink
Added processing to apply project credits, determine institution name…
Browse files Browse the repository at this point in the history
… for each PI, and exporting HU and BU invoices
  • Loading branch information
QuanMPhm committed Apr 10, 2024
1 parent 1ec80b2 commit af098d6
Show file tree
Hide file tree
Showing 3 changed files with 308 additions and 16 deletions.
36 changes: 36 additions & 0 deletions process_report/institute_map.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
{
"northeastern.edu" : "Northeastern University",
"bu.edu" : "Boston University",
"bentley.edu" : "Bentley",
"uri.edu" : "University of Rhode Island",
"redhat.com" : "Red Hat",
"childrens.harvard.edu" : "Boston Childrens Hospital",
"mclean.harvard.edu" : "McLean Hospital",
"meei.harvard.edu" : "Massachusetts Eye & Ear",
"dfci.harvard.edu" : "Dana-Farber Cancer Institute",
"bwh.harvard.edu" : "Brigham and Women's Hospital",
"bidmc.harvard.edu" : "Beth Israel Deaconess Medical Center",
"fas.harvard.edu" : "Harvard University",
"cga.harvard.edu" : "Harvard University",
"iq.harvard.edu" : "Harvard University",
"hks.harvard.edu" : "Harvard University",
"hsph.harvard.edu" : "Harvard University",
"seas.harvard.edu" : "Harvard University",
"gse.harvard.edu" : "Harvard University",
"gov.harvard.edu" : "Harvard University",
"oeb.harvard.edu" : "Harvard University",
"harvard.edu" : "Harvard University",
"wpi.edu" : "Worcester Polytechnic Institute",
"mit.edu" : "Massachusetts Institute of Technology",
"umass.edu" : "University of Massachusetts Amherst",
"uml.edu" : "University of Massachusetts Lowell",
"codeforboston.org" : "Code For Boston",
"yale.edu" : "Yale University",
"mmsh" : "Harvard University",
"gstuart" : "University of Massachusetts Amherst",
"rudolph" : "Boston Childrens Hospital",
"robbaron" : "Boston University",
"kmdalton" : "Harvard University",
"mzink" : "University of Massachusetts Amherst",
"francesco.pontiggia" : "Harvard University"
}
198 changes: 183 additions & 15 deletions process_report/process_report.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,68 @@
import argparse
import os
import sys

import json
import pandas


### Invoice field names
# Column headers of the invoice data. The functions in this module use these
# constants as DataFrame column keys instead of repeating the string literals.
INVOICE_DATE_FIELD = 'Invoice Month'  # parsed with format '%Y-%m' by get_invoice_date()
PROJECT_FIELD = 'Project - Allocation'
PROJECT_ID_FIELD = 'Project - Allocation ID'
PI_FIELD = 'Manager (PI)'  # PI username; may be empty (NaN) in the input
INVOICE_EMAIL_FIELD = 'Invoice Email'
INVOICE_ADDRESS_FIELD = 'Invoice Address'
INSTITUTION_FIELD = 'Institution'  # filled in by add_institution()
INSTITUTION_ID_FIELD = 'Institution - Specific Code'
SU_HOURS_FIELD = 'SU Hours (GBhr or SUhr)'
SU_TYPE_FIELD = 'SU Type'
COST_FIELD = 'Cost'  # read by apply_credits_new_pi()
CREDIT_FIELD = 'Credit'  # written by apply_credits_new_pi()
CREDIT_CODE_FIELD = 'Credit Code'  # written by apply_credits_new_pi()
BALANCE_FIELD = 'Balance'  # written by apply_credits_new_pi()
###


def get_institution_from_pi(pi_uname):
    """Return the institution name mapped to a PI username.

    The lookup table is read from `institute_map.json`, located in the same
    directory as this module. If `pi_uname` contains an '@', the text after
    the last '@' (the email domain, lower-cased) is the lookup key; otherwise
    the whole username is the key. Only exact key matches count.

    Args:
        pi_uname: PI username, either an email address or a bare username.

    Returns:
        The institution name, or '' when no entry matches. A warning is
        printed in the no-match case.
    """
    # Resolve the directory to an absolute path and join with os.path.join:
    # when __file__ is a bare relative filename, os.path.dirname() returns ''
    # and an f-string like f'{dir_path}/institute_map.json' would point at
    # the filesystem root instead of this module's directory.
    dir_path = os.path.dirname(os.path.abspath(__file__))
    map_path = os.path.join(dir_path, 'institute_map.json')
    with open(map_path, 'r', encoding='utf-8') as f:
        institute_map = json.load(f)

    if '@' in pi_uname:
        # The domain is whatever follows the last '@'. Domain names are
        # case-insensitive and the map keys are lower-case.
        lookup_key = pi_uname.rpartition('@')[2].lower()
    else:
        lookup_key = pi_uname
    institute_name = institute_map.get(lookup_key, '')

    if institute_name == '':
        print(f"Warning: PI name {pi_uname} does not match any institution!")

    return institute_name


def load_old_pis(old_pi_file):
    """Load the previously billed PIs from a comma-separated file.

    Every non-blank line must hold exactly two comma-separated fields: the PI
    name and the month recorded for that PI (the value `is_old_pi()` compares
    against the invoice month). Surrounding whitespace is stripped from both.

    Args:
        old_pi_file: Path of the old-PI file, or None when none was given.

    Returns:
        dict mapping PI name -> recorded month string.

    Raises:
        SystemExit: with exit code 1 when no file was given or the file does
            not exist.
        ValueError: when a non-blank line does not have exactly two fields.
    """
    if old_pi_file is None:
        # `--old-pi-file` is optional and has no default, so None can reach
        # this function; open(None) would raise an uncaught TypeError.
        print('Applying credit 0002 failed. Old PI file was not specified')
        sys.exit(1)

    old_pi_dict = dict()

    try:
        with open(old_pi_file) as f:
            for pi_info in f:
                pi_info = pi_info.strip()
                if not pi_info:
                    # Tolerate blank lines, e.g. a trailing newline at EOF.
                    continue
                pi, first_month = (field.strip() for field in pi_info.split(','))
                old_pi_dict[pi] = first_month
    except FileNotFoundError:
        print('Applying credit 0002 failed. Old PI file does not exist')
        sys.exit(1)

    return old_pi_dict


def is_old_pi(old_pi_dict, pi, invoice_month):
    """Tell whether `pi` counts as a previously billed ("old") PI.

    A PI is old when it has an entry in `old_pi_dict` and the month recorded
    there differs from `invoice_month`. A PI that is absent, or whose
    recorded month equals `invoice_month`, is not old.
    """
    if pi not in old_pi_dict:
        return False
    return bool(old_pi_dict[pi] != invoice_month)


def main():
"""Remove non-billable PIs and projects"""

Expand Down Expand Up @@ -41,6 +100,23 @@ def main():
default="pi_invoices",
help="Name of output folder containing pi-specific invoice csvs"
)
parser.add_argument(
"--HU-invoice-file",
required=False,
default="HU_only.csv",
help="Name of output csv for HU invoices"
)
parser.add_argument(
"--HU-BU-invoice-file",
required=False,
default="HU_BU.csv",
help="Name of output csv for HU and BU invoices"
)
parser.add_argument(
"--old-pi-file",
required=False,
help="Name of csv file listing previously billed PIs"
)
args = parser.parse_args()
merged_dataframe = merge_csv(args.csv_files)

Expand All @@ -54,15 +130,22 @@ def main():
invoice_date = get_invoice_date(merged_dataframe)
print("Invoice date: " + str(invoice_date))

timed_projects_list = timed_projects(args.timed_projects_file, invoice_date)
print("The following timed-projects will not be billed for this period: ")
print(timed_projects_list)
# timed_projects_list = timed_projects(args.timed_projects_file, invoice_date)
# print("The following timed-projects will not be billed for this period: ")
# print(timed_projects_list)

projects = list(set(projects + timed_projects_list))
# projects = list(set(projects + timed_projects_list))

billable_projects = remove_non_billables(merged_dataframe, pi, projects, args.output_file)
merged_dataframe = add_institution(merged_dataframe)
remove_billables(merged_dataframe, pi, projects, "non_billable.csv")

billable_projects = remove_non_billables(merged_dataframe, pi, projects)
billable_projects = validate_billables(billable_projects)
credited_projects = apply_credits_new_pi(billable_projects, args.old_pi_file)
export_billables(credited_projects, args.output_file)
export_pi_billables(billable_projects, args.output_folder)
export_HU_only(billable_projects, args.HU_invoice_file)
export_HU_BU(billable_projects, args.HU_BU_invoice_file)


def merge_csv(files):
Expand All @@ -83,7 +166,7 @@ def get_invoice_date(dataframe):
Note that it only checks the first entry because it should
be the same for every row.
"""
invoice_date_str = dataframe['Invoice Month'][0]
invoice_date_str = dataframe[INVOICE_DATE_FIELD][0]
invoice_date = pandas.to_datetime(invoice_date_str, format='%Y-%m')
return invoice_date

Expand All @@ -100,10 +183,9 @@ def timed_projects(timed_projects_file, invoice_date):
return dataframe[mask]['Project'].to_list()


def remove_non_billables(dataframe, pi, projects):
    """Removes projects and PIs that should not be billed from the dataframe

    A row is dropped when its PI is listed in `pi` or its project is listed
    in `projects`. The input dataframe is not modified and nothing is written
    to disk; the filtered rows are returned.
    """
    # ~(A | B) is the De Morgan form of (~A & ~B).
    is_excluded = dataframe[PI_FIELD].isin(pi) | dataframe[PROJECT_FIELD].isin(projects)
    return dataframe[~is_excluded]


Expand All @@ -112,21 +194,107 @@ def remove_billables(dataframe, pi, projects, output_file):
So this *keeps* the projects/pis that should not be billed.
"""
filtered_dataframe = dataframe[dataframe['Manager (PI)'].isin(pi) | dataframe['Project - Allocation'].isin(projects)]
filtered_dataframe = dataframe[dataframe[PI_FIELD].isin(pi) | dataframe[PROJECT_FIELD].isin(projects)]
filtered_dataframe.to_csv(output_file, index=False)


def validate_billables(dataframe):
    """Drop rows that have no PI, printing a warning for each dropped project.

    Returns the rows of `dataframe` whose PI field is not missing.
    """
    # Validate PI name
    has_pi = dataframe[PI_FIELD].notna()
    for _, project_row in dataframe[~has_pi].iterrows():
        print(f'Warning: Project {project_row[PROJECT_FIELD]} has empty PI field')

    return dataframe[has_pi]


def export_billables(dataframe, output_file):
    """Write `dataframe` to `output_file` as CSV, without the index column."""
    dataframe.to_csv(path_or_buf=output_file, index=False)


def export_pi_billables(dataframe: pandas.DataFrame, output_folder):
    """Write one invoice CSV per PI into `output_folder`.

    Each file is named "{institution}_{pi}_{invoice month}.csv". The
    institution comes from the first row of that PI's projects and the
    invoice month from the first row of `dataframe`. Rows whose PI is
    missing are skipped.

    Args:
        dataframe: Projects to export.
        output_folder: Destination folder; created if it does not exist.
    """
    # makedirs(exist_ok=True) also creates missing parent folders and avoids
    # the race between checking for the folder and creating it.
    os.makedirs(output_folder, exist_ok=True)

    if dataframe.empty:
        # Nothing to export; `.iat[0]` below would raise IndexError.
        return

    invoice_month = dataframe[INVOICE_DATE_FIELD].iat[0]

    for pi in dataframe[PI_FIELD].unique():
        if pandas.isna(pi):
            continue
        pi_projects = dataframe[dataframe[PI_FIELD] == pi]
        pi_institution = pi_projects[INSTITUTION_FIELD].iat[0]
        file_name = f"{pi_institution}_{pi}_{invoice_month}.csv"
        # NOTE(review): the index column is written here, whereas
        # export_billables() passes index=False. Confirm which is intended.
        pi_projects.to_csv(os.path.join(output_folder, file_name))



def apply_credits_new_pi(dataframe, old_pi_file):
    """Apply the new-PI credit (code "0002") and compute each project's balance.

    Adds the Credit, Credit Code and Balance columns to `dataframe` in place.
    For every PI that `is_old_pi()` does not consider old, a credit of up to
    1000 is spread over that PI's projects in row order until it is used up.
    Every project's balance is its cost minus whatever credit it received.

    Args:
        dataframe: Billable projects; modified in place.
        old_pi_file: Path of the old-PI file passed to `load_old_pis()`.

    Returns:
        The same `dataframe` object, with the three columns filled in.
    """
    new_pi_credit_code = "0002"
    new_pi_credit_amount = 1000

    dataframe[CREDIT_FIELD] = None
    dataframe[CREDIT_CODE_FIELD] = None
    # Start every balance at the full cost. Initialising it to 0 left a new
    # PI's projects at a balance of 0 (i.e. unbilled) whenever the credit ran
    # out before the loop reached them.
    dataframe[BALANCE_FIELD] = dataframe[COST_FIELD]

    old_pi_dict = load_old_pis(old_pi_file)

    if dataframe.empty:
        # No rows to credit; `.iat[0]` below would raise IndexError.
        return dataframe

    invoice_month = dataframe[INVOICE_DATE_FIELD].iat[0]

    for pi in dataframe[PI_FIELD].unique():
        if is_old_pi(old_pi_dict, pi, invoice_month):
            # Old PIs get no credit; their balance already equals the cost.
            continue

        remaining_credit = new_pi_credit_amount
        pi_projects = dataframe[dataframe[PI_FIELD] == pi]
        for i, row in pi_projects.iterrows():
            if remaining_credit == 0:
                # Credit exhausted: the remaining projects stay at full cost.
                break

            project_cost = row[COST_FIELD]
            applied_credit = min(project_cost, remaining_credit)

            dataframe.at[i, CREDIT_FIELD] = applied_credit
            dataframe.at[i, CREDIT_CODE_FIELD] = new_pi_credit_code
            dataframe.at[i, BALANCE_FIELD] = project_cost - applied_credit
            remaining_credit -= applied_credit

    return dataframe


def add_institution(dataframe: pandas.DataFrame):
    """Fill in the institution name for every row that has a PI.

    The name comes from `get_institution_from_pi()`, which looks the PI's
    username up in `institute_map.json`: by email domain when the username is
    an email address (e.g. "foo@bu.edu" matches "bu.edu", which maps to
    "Boston University"), or by the username itself for the listed special
    cases (e.g. "rudolph"). Only exact matches are mapped.

    Rows without a PI are reported and left untouched. The dataframe is
    modified in place and also returned.
    """
    for position, (row_label, pi_name) in enumerate(dataframe[PI_FIELD].items()):
        if not pandas.isna(pi_name):
            dataframe.at[row_label, INSTITUTION_FIELD] = get_institution_from_pi(pi_name)
            continue

        project_name = dataframe[PROJECT_FIELD].iat[position]
        print(f"Project {project_name} has no PI")  # Nan check

    return dataframe


def export_HU_only(dataframe, output_file):
    """Write the rows whose institution is Harvard University to `output_file`."""
    is_harvard = dataframe[INSTITUTION_FIELD].eq('Harvard University')
    dataframe.loc[is_harvard].to_csv(path_or_buf=output_file)


def export_HU_BU(dataframe, output_file):
    """Write the Harvard University and Boston University rows to `output_file`."""
    wanted_institutions = ['Harvard University', 'Boston University']
    is_hu_or_bu = dataframe[INSTITUTION_FIELD].isin(wanted_institutions)
    dataframe.loc[is_hu_or_bu].to_csv(path_or_buf=output_file)


# Run the report processing only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Loading

0 comments on commit af098d6

Please sign in to comment.