Skip to content

Commit

Permalink
Merge pull request #106 from QuanMPhm/99/implement_processor
Browse files Browse the repository at this point in the history
Implemented `Processor` class and refactored some preliminary processing
  • Loading branch information
QuanMPhm authored Oct 21, 2024
2 parents 244be32 + 8d56a6d commit 6276e84
Show file tree
Hide file tree
Showing 7 changed files with 146 additions and 74 deletions.
65 changes: 25 additions & 40 deletions process_report/process_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,10 @@
bu_internal_invoice,
pi_specific_invoice,
)

from process_report.processors import (
validate_pi_alias_processor,
add_institution_processor,
)

### PI file field names
PI_PI_FIELD = "PI"
Expand Down Expand Up @@ -200,15 +203,31 @@ def main():

projects = list(set(projects + timed_projects_list))

merged_dataframe = validate_pi_aliases(merged_dataframe, alias_dict)
merged_dataframe = add_institution(merged_dataframe)
### Preliminary processing

validate_pi_alias_proc = validate_pi_alias_processor.ValidatePIAliasProcessor(
"", invoice_month, merged_dataframe, alias_dict
)
validate_pi_alias_proc.process()

add_institute_proc = add_institution_processor.AddInstitutionProcessor(
"", invoice_month, validate_pi_alias_proc.data
)
add_institute_proc.process()

preliminary_processed_data = add_institute_proc.data

### Finish preliminary processing

lenovo_inv = lenovo_invoice.LenovoInvoice(
name=args.Lenovo_file, invoice_month=invoice_month, data=merged_dataframe.copy()
name=args.Lenovo_file,
invoice_month=invoice_month,
data=preliminary_processed_data.copy(),
)
nonbillable_inv = nonbillable_invoice.NonbillableInvoice(
name=args.nonbillable_file,
invoice_month=invoice_month,
data=merged_dataframe.copy(),
data=preliminary_processed_data.copy(),
nonbillable_pis=pi,
nonbillable_projects=projects,
)
Expand All @@ -220,7 +239,7 @@ def main():
billable_inv = billable_invoice.BillableInvoice(
name=args.output_file,
invoice_month=invoice_month,
data=merged_dataframe.copy(),
data=preliminary_processed_data.copy(),
nonbillable_pis=pi,
nonbillable_projects=projects,
old_pi_filepath=old_pi_file,
Expand Down Expand Up @@ -314,13 +333,6 @@ def timed_projects(timed_projects_file, invoice_date):
return dataframe[mask]["Project"].to_list()


def validate_pi_aliases(dataframe: pandas.DataFrame, alias_dict: dict):
for pi, pi_aliases in alias_dict.items():
dataframe.loc[dataframe[PI_FIELD].isin(pi_aliases), PI_FIELD] = pi

return dataframe


def fetch_s3_alias_file():
local_name = "alias.csv"
invoice_bucket = util.get_invoice_bucket()
Expand All @@ -340,33 +352,6 @@ def backup_to_s3_old_pi_file(old_pi_file):
invoice_bucket.upload_file(old_pi_file, f"PIs/Archive/PI {get_iso8601_time()}.csv")


def add_institution(dataframe: pandas.DataFrame):
"""Determine every PI's institution name, logging any PI whose institution cannot be determined
This is performed by `get_institution_from_pi()`, which tries to match the PI's username to
a list of known institution email domains (i.e bu.edu), or to several edge cases (i.e rudolph) if
the username is not an email address.
Exact matches are then mapped to the corresponding institution name.
I.e "foo@bu.edu" would match with "bu.edu", which maps to the instition name "Boston University"
The list of mappings are defined in `institute_map.json`.
"""
institute_list = util.load_institute_list()
institute_map = util.get_institute_mapping(institute_list)
dataframe = dataframe.astype({INSTITUTION_FIELD: "str"})
for i, row in dataframe.iterrows():
pi_name = row[PI_FIELD]
if pandas.isna(pi_name):
print(f"Project {row[PROJECT_FIELD]} has no PI")
else:
dataframe.at[i, INSTITUTION_FIELD] = util.get_institution_from_pi(
institute_map, pi_name
)

return dataframe


def export_billables(dataframe, output_file):
dataframe.to_csv(output_file, index=False)

Expand Down
59 changes: 59 additions & 0 deletions process_report/processors/add_institution_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from dataclasses import dataclass

import pandas

from process_report.invoices import invoice
from process_report.processors import processor
from process_report import util


@dataclass
class AddInstitutionProcessor(processor.Processor):
@staticmethod
def _get_institute_mapping(institute_list: list):
institute_map = dict()
for institute_info in institute_list:
for domain in institute_info["domains"]:
institute_map[domain] = institute_info["display_name"]

return institute_map

@staticmethod
def _get_institution_from_pi(institute_map, pi_uname):
institution_domain = pi_uname.split("@")[-1]
for i in range(institution_domain.count(".") + 1):
if institution_name := institute_map.get(institution_domain, ""):
break
institution_domain = institution_domain[institution_domain.find(".") + 1 :]

if institution_name == "":
print(f"Warning: PI name {pi_uname} does not match any institution!")

return institution_name

def _add_institution(self):
"""Determine every PI's institution name, logging any PI whose institution cannot be determined
This is performed by `get_institution_from_pi()`, which tries to match the PI's username to
a list of known institution email domains (i.e bu.edu), or to several edge cases (i.e rudolph) if
the username is not an email address.
Exact matches are then mapped to the corresponding institution name.
I.e "foo@bu.edu" would match with "bu.edu", which maps to the instition name "Boston University"
The list of mappings are defined in `institute_map.json`.
"""
institute_list = util.load_institute_list()
institute_map = self._get_institute_mapping(institute_list)
self.data = self.data.astype({invoice.INSTITUTION_FIELD: "str"})
for i, row in self.data.iterrows():
pi_name = row[invoice.PI_FIELD]
if pandas.isna(pi_name):
print(f"Project {row[invoice.PROJECT_FIELD]} has no PI")
else:
self.data.at[
i, invoice.INSTITUTION_FIELD
] = self._get_institution_from_pi(institute_map, pi_name)

def _process(self):
self._add_institution()
8 changes: 8 additions & 0 deletions process_report/processors/processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from dataclasses import dataclass

from process_report.invoices import invoice


@dataclass
class Processor(invoice.Invoice):
pass
18 changes: 18 additions & 0 deletions process_report/processors/validate_pi_alias_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from dataclasses import dataclass

from process_report.invoices import invoice
from process_report.processors import processor


@dataclass
class ValidatePIAliasProcessor(processor.Processor):
alias_map: dict

def _validate_pi_aliases(self):
for pi, pi_aliases in self.alias_map.items():
self.data.loc[
self.data[invoice.PI_FIELD].isin(pi_aliases), invoice.PI_FIELD
] = pi

def _process(self):
self._validate_pi_aliases()
27 changes: 15 additions & 12 deletions process_report/tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ def test_export_pi(self, mock_filter_cols):
self.assertNotIn("ProjectC", pi_df["Project - Allocation"].tolist())


class TestGetInstitute(TestCase):
class TestAddInstituteProcessor(TestCase):
def test_get_pi_institution(self):
institute_map = {
"harvard.edu": "Harvard University",
Expand Down Expand Up @@ -251,31 +251,34 @@ def test_get_pi_institution(self):
"g@bidmc.harvard.edu": "Beth Israel Deaconess Medical Center",
}

add_institute_proc = test_utils.new_add_institution_processor()

for pi_email, answer in answers.items():
self.assertEqual(
util.get_institution_from_pi(institute_map, pi_email), answer
add_institute_proc._get_institution_from_pi(institute_map, pi_email),
answer,
)


class TestAlias(TestCase):
def setUp(self):
self.alias_dict = {"PI1": ["PI1_1", "PI1_2"], "PI2": ["PI2_1"]}

self.data = pandas.DataFrame(
class TestValidateAliasProcessor(TestCase):
def test_validate_alias(self):
alias_map = {"PI1": ["PI1_1", "PI1_2"], "PI2": ["PI2_1"]}
test_data = pandas.DataFrame(
{
"Manager (PI)": ["PI1", "PI1_1", "PI1_2", "PI2_1", "PI2_1"],
}
)

self.answer = pandas.DataFrame(
answer_data = pandas.DataFrame(
{
"Manager (PI)": ["PI1", "PI1", "PI1", "PI2", "PI2"],
}
)

def test_validate_alias(self):
output = process_report.validate_pi_aliases(self.data, self.alias_dict)
self.assertTrue(self.answer.equals(output))
validate_pi_alias_proc = test_utils.new_validate_pi_alias_processor(
data=test_data, alias_map=alias_map
)
validate_pi_alias_proc.process()
self.assertTrue(answer_data.equals(validate_pi_alias_proc.data))


class TestMonthUtils(TestCase):
Expand Down
21 changes: 21 additions & 0 deletions process_report/tests/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@
pi_specific_invoice,
)

from process_report.processors import (
add_institution_processor,
validate_pi_alias_processor,
)


def new_base_invoice(
name="",
Expand Down Expand Up @@ -54,3 +59,19 @@ def new_pi_specific_invoice(
invoice_month,
data,
)


def new_add_institution_processor(
name="",
invoice_month="0000-00",
data=pandas.DataFrame(),
):
return add_institution_processor.AddInstitutionProcessor(name, invoice_month, data)


def new_validate_pi_alias_processor(
name="", invoice_month="0000-00", data=pandas.DataFrame(), alias_map={}
):
return validate_pi_alias_processor.ValidatePIAliasProcessor(
name, invoice_month, data, alias_map
)
22 changes: 0 additions & 22 deletions process_report/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,28 +27,6 @@ def get_invoice_bucket():
return s3_resource.Bucket(os.environ.get("S3_BUCKET_NAME", "nerc-invoicing"))


def get_institute_mapping(institute_list: list):
institute_map = dict()
for institute_info in institute_list:
for domain in institute_info["domains"]:
institute_map[domain] = institute_info["display_name"]

return institute_map


def get_institution_from_pi(institute_map, pi_uname):
institution_domain = pi_uname.split("@")[-1]
for i in range(institution_domain.count(".") + 1):
if institution_name := institute_map.get(institution_domain, ""):
break
institution_domain = institution_domain[institution_domain.find(".") + 1 :]

if institution_name == "":
logger.warn(f"PI name {pi_uname} does not match any institution!")

return institution_name


def load_institute_list():
with open("process_report/institute_list.yaml", "r") as f:
institute_list = yaml.safe_load(f)
Expand Down

0 comments on commit 6276e84

Please sign in to comment.