-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #106 from QuanMPhm/99/implement_processor
Implemented `Processor` class and refactored some preliminary processing
- Loading branch information
Showing
7 changed files
with
146 additions
and
74 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
from dataclasses import dataclass | ||
|
||
import pandas | ||
|
||
from process_report.invoices import invoice | ||
from process_report.processors import processor | ||
from process_report import util | ||
|
||
|
||
@dataclass
class AddInstitutionProcessor(processor.Processor):
    """Processor that fills the institution column from each PI's username."""

    @staticmethod
    def _get_institute_mapping(institute_list: list):
        """Flatten the institute list into a ``{domain: display_name}`` lookup.

        Every entry of ``institute_list`` is read through its ``"domains"``
        and ``"display_name"`` keys. If a domain is listed more than once,
        the entry that appears last wins.
        """
        return {
            domain: entry["display_name"]
            for entry in institute_list
            for domain in entry["domains"]
        }

    @staticmethod
    def _get_institution_from_pi(institute_map, pi_uname):
        """Return the institution name for ``pi_uname``, or ``""`` if unknown.

        The text after the last ``@`` (or the whole username when there is no
        ``@``) is looked up in ``institute_map``. On a miss, the leading label
        up to the first ``.`` is dropped and the lookup is retried, so
        ``"foo@cs.bu.edu"`` tries ``"cs.bu.edu"``, then ``"bu.edu"``, then
        ``"edu"``. A warning is printed when nothing matches.
        """
        candidate = pi_uname.split("@")[-1]
        while True:
            institution_name = institute_map.get(candidate, "")
            if institution_name:
                break
            _, dot, remainder = candidate.partition(".")
            if not dot:
                # No dot left: every suffix of the domain has been tried.
                break
            candidate = remainder

        if institution_name == "":
            print(f"Warning: PI name {pi_uname} does not match any institution!")

        return institution_name

    def _add_institution(self):
        """Populate the institution field of every row that has a PI.

        The institute list is loaded through ``util.load_institute_list()``
        (the original documentation states the mappings are defined in
        `institute_map.json`) and turned into a domain lookup. Each PI
        username is then resolved with `_get_institution_from_pi()`; e.g.
        "foo@bu.edu" matches the domain "bu.edu", which maps to that
        domain's display name.

        Rows whose PI is missing are reported on stdout and their institution
        value is left as it is after the column's cast to ``str``.
        """
        domain_lookup = self._get_institute_mapping(util.load_institute_list())
        self.data = self.data.astype({invoice.INSTITUTION_FIELD: "str"})

        for row_label, record in self.data.iterrows():
            pi_uname = record[invoice.PI_FIELD]
            if pandas.isna(pi_uname):
                print(f"Project {record[invoice.PROJECT_FIELD]} has no PI")
                continue

            resolved = self._get_institution_from_pi(domain_lookup, pi_uname)
            self.data.at[row_label, invoice.INSTITUTION_FIELD] = resolved

    def _process(self):
        """Run the institution lookup over ``self.data``."""
        self._add_institution()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
from dataclasses import dataclass | ||
|
||
from process_report.invoices import invoice | ||
|
||
|
||
@dataclass
class Processor(invoice.Invoice):
    """Common base class for processors.

    Adds no fields or methods of its own; everything is inherited unchanged
    from ``invoice.Invoice``.
    """
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
from dataclasses import dataclass | ||
|
||
from process_report.invoices import invoice | ||
from process_report.processors import processor | ||
|
||
|
||
@dataclass
class ValidatePIAliasProcessor(processor.Processor):
    """Processor that rewrites PI aliases in ``self.data`` to one PI name each."""

    # Maps a PI name to the collection of aliases that should be replaced by it.
    alias_map: dict

    def _validate_pi_aliases(self):
        """Replace every alias found in the PI column with its mapped PI name.

        Entries of ``alias_map`` are applied one after another in dict order,
        so a later entry sees the replacements made by earlier ones.
        """
        for canonical_pi, known_aliases in self.alias_map.items():
            uses_alias = self.data[invoice.PI_FIELD].isin(known_aliases)
            self.data.loc[uses_alias, invoice.PI_FIELD] = canonical_pi

    def _process(self):
        """Run the alias normalisation over ``self.data``."""
        self._validate_pi_aliases()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters