Skip to content

Commit

Permalink
Implemented S3 integration
Browse files Browse the repository at this point in the history
This commit gives the user the option to fetch invoices from S3 storage. Several env vars are needed to authenticate to S3 storage.
More details on these vars can be found in the function `get_invoice_bucket()` from `process_report.py`
Note that the user can provide filenames containing "{}" to inject the invoice month, allowing for some convenient formatting
  • Loading branch information
QuanMPhm committed Apr 23, 2024
1 parent c54a956 commit c852c30
Show file tree
Hide file tree
Showing 4 changed files with 188 additions and 42 deletions.
169 changes: 129 additions & 40 deletions process_report/process_report.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import argparse
import os
import sys
import datetime

import json
import pandas
import boto3
from botocore.config import Config


### Invoice field names
Expand Down Expand Up @@ -35,7 +38,7 @@ def get_institution_from_pi(institute_map, pi_uname):


def load_institute_map() -> dict:
    """Load the institution lookup table from ``process_report/institute_map.json``.

    The path is relative to the current working directory, so the program
    has to be started from the directory that contains ``process_report/``.

    Returns:
        The parsed JSON content of the map file.

    Raises:
        FileNotFoundError: If the map file is not found at the relative path.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open("process_report/institute_map.json", "r", encoding="utf-8") as f:
        institute_map = json.load(f)

    return institute_map
Expand All @@ -62,16 +65,45 @@ def is_old_pi(old_pi_dict, pi, invoice_month):
return False


def get_invoice_bucket():
    """Return the boto3 ``Bucket`` resource that holds the invoices.

    The connection is configured entirely from environment variables:

    * ``S3_KEY_ID`` (required): access key id.
    * ``S3_APP_KEY`` (required): secret access key.
    * ``S3_BUCKET_NAME`` (required): name of the invoice bucket.
    * ``S3_ENDPOINT`` (optional): endpoint URL; defaults to
      ``https://s3.us-east-005.backblazeb2.com``.

    Raises:
        SystemExit: If any required variable is missing.
    """
    # Read every required variable up front so a missing one is reported
    # before any client object is built, instead of surfacing later as an
    # unbound-variable error.
    try:
        key_id = os.environ["S3_KEY_ID"]
        app_key = os.environ["S3_APP_KEY"]
        bucket_name = os.environ["S3_BUCKET_NAME"]
    except KeyError as missing:
        sys.exit(
            "Error: Environment variables for S3 authentication not set "
            f"(missing {missing.args[0]})"
        )

    s3_resource = boto3.resource(
        service_name="s3",
        endpoint_url=os.environ.get(
            "S3_ENDPOINT", "https://s3.us-east-005.backblazeb2.com"
        ),
        aws_access_key_id=key_id,
        aws_secret_access_key=app_key,
        config=Config(
            signature_version="s3v4",
        ),
    )
    return s3_resource.Bucket(bucket_name)


def get_iso8601_time():
    """Return the current UTC time in ISO 8601 basic format (``YYYYMMDDTHHMMSSZ``).

    The trailing ``Z`` designates UTC, so the timestamp is taken from a
    UTC-aware clock rather than the machine's local time.
    """
    return datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%dT%H%M%SZ")


def main():
"""Remove non-billable PIs and projects"""

parser = argparse.ArgumentParser()

parser.add_argument(
"csv_files",
nargs="+",
nargs="*",
help="One or more CSV files that need to be processed",
)
parser.add_argument("--fetch-from-s3", action="store_true")
parser.add_argument("--upload-to-s3", action="store_true")
parser.add_argument(
"--invoice-month",
required=True,
help="Invoice month to process",
)
parser.add_argument(
"--pi-file",
required=True,
Expand All @@ -87,6 +119,13 @@ def main():
required=True,
help="File containing list of projects that are non-billable within a specified duration",
)

parser.add_argument(
"--nonbillable-file",
required=False,
default="nonbillable.csv",
help="Name of nonbillable file",
)
parser.add_argument(
"--output-file",
required=False,
Expand All @@ -103,21 +142,35 @@ def main():
"--HU-invoice-file",
required=False,
default="HU_only.csv",
help="Name of output csv for HU invoices",
help="Name of output csv for HU invoice",
)
parser.add_argument(
"--HU-BU-invoice-file",
required=False,
default="HU_BU.csv",
help="Name of output csv for HU and BU invoices",
help="Name of output csv for HU and BU invoice",
)
parser.add_argument(
"--Lenovo-file",
required=False,
default="Lenovo.csv",
help="Name of output csv for Lenovo SU Types invoice",
)
parser.add_argument(
"--old-pi-file",
required=False,
help="Name of csv file listing previously billed PIs",
)
args = parser.parse_args()
merged_dataframe = merge_csv(args.csv_files)

invoice_month = args.invoice_month

if args.fetch_from_s3:
csv_files = fetch_S3_invoices(invoice_month)
else:
csv_files = args.csv_files

merged_dataframe = merge_csv(csv_files)

pi = []
projects = []
Expand All @@ -126,26 +179,54 @@ def main():
with open(args.projects_file) as file:
projects = [line.rstrip() for line in file]

invoice_date = get_invoice_date(merged_dataframe)
print("Invoice date: " + str(invoice_date))
print("Invoice date: " + str(invoice_month))

timed_projects_list = timed_projects(args.timed_projects_file, invoice_date)
timed_projects_list = timed_projects(args.timed_projects_file, invoice_month)
print("The following timed-projects will not be billed for this period: ")
print(timed_projects_list)

projects = list(set(projects + timed_projects_list))

merged_dataframe = add_institution(merged_dataframe)
remove_billables(merged_dataframe, pi, projects, "non_billable.csv")
remove_billables(merged_dataframe, pi, projects, args.nonbillable_file)

billable_projects = remove_non_billables(merged_dataframe, pi, projects)
billable_projects = validate_pi_names(billable_projects)
credited_projects = apply_credits_new_pi(billable_projects, args.old_pi_file)

export_billables(credited_projects, args.output_file)
export_pi_billables(billable_projects, args.output_folder)
export_HU_only(billable_projects, args.HU_invoice_file)
export_HU_BU(billable_projects, args.HU_BU_invoice_file)
export_lenovo(billable_projects)
export_pi_billables(credited_projects, args.output_folder, invoice_month)
export_HU_only(credited_projects, args.HU_invoice_file)
export_HU_BU(credited_projects, args.HU_BU_invoice_file)
export_lenovo(credited_projects, args.Lenovo_file)

if args.upload_to_s3:
invoice_list = [
args.nonbillable_file,
args.output_file,
args.HU_invoice_file,
args.HU_BU_invoice_file,
args.Lenovo_file,
]

for pi_invoice in os.listdir(args.output_folder):
invoice_list.append(os.path.join(args.output_folder, pi_invoice))

upload_to_s3(invoice_list, invoice_month)


def fetch_S3_invoices(invoice_month):
    """Download the usage invoices for ``invoice_month`` from S3.

    Every object under ``Invoices/<invoice_month>/Service Invoices/`` is
    downloaded into the current working directory under its base name.

    Args:
        invoice_month: Month identifier used in the S3 prefix.

    Returns:
        The local filenames of the downloaded invoices, in listing order.
    """
    invoice_bucket = get_invoice_bucket()
    s3_invoice_list = []
    for obj in invoice_bucket.objects.filter(
        Prefix=f"Invoices/{invoice_month}/Service Invoices/"
    ):
        local_name = obj.key.split("/")[-1]
        if not local_name:
            # A key ending in "/" (a folder placeholder object) has no base
            # name; there is nothing to download for it.
            continue
        invoice_bucket.download_file(obj.key, local_name)
        s3_invoice_list.append(local_name)

    return s3_invoice_list


def merge_csv(files):
Expand Down Expand Up @@ -215,27 +296,6 @@ def validate_pi_names(dataframe):
return dataframe


def export_billables(dataframe, output_file):
    """Write ``dataframe`` to ``output_file`` as CSV without the index column."""
    dataframe.to_csv(path_or_buf=output_file, index=False)


def export_pi_billables(dataframe: pandas.DataFrame, output_folder):
    """Write one CSV per PI into ``output_folder``.

    Files are named ``<institution>_<pi>_<invoice month>.csv``; the invoice
    month is taken from the first row of the invoice-date column. Rows
    whose PI value is missing are not exported.
    """
    if not os.path.exists(output_folder):
        os.mkdir(output_folder)

    invoice_month = dataframe[INVOICE_DATE_FIELD].iat[0]
    named_pis = (
        name for name in dataframe[PI_FIELD].unique() if not pandas.isna(name)
    )

    for name in named_pis:
        rows = dataframe[dataframe[PI_FIELD] == name]
        # All rows of one PI are assumed to share an institution; use the first.
        institution = rows[INSTITUTION_FIELD].iat[0]
        csv_path = output_folder + f"/{institution}_{name}_{invoice_month}.csv"
        rows.to_csv(csv_path)


def apply_credits_new_pi(dataframe, old_pi_file):
new_pi_credit_code = "0002"
new_pi_credit_amount = 1000
Expand Down Expand Up @@ -297,6 +357,26 @@ def add_institution(dataframe: pandas.DataFrame):
return dataframe


def export_billables(dataframe, output_file):
    """Save the billable-projects dataframe as CSV at ``output_file``, omitting the index."""
    dataframe.to_csv(path_or_buf=output_file, index=False)


def export_pi_billables(dataframe: pandas.DataFrame, output_folder, invoice_month):
    """Write one CSV per PI into ``output_folder``.

    Args:
        dataframe: Invoice rows; must contain the PI and institution columns.
        output_folder: Directory that receives the per-PI files. It is
            created, including missing parent directories, if absent.
        invoice_month: Month identifier appended to every filename.

    Each file is named ``<institution>_<pi>_<invoice_month>.csv``. Rows
    whose PI value is missing are not exported.
    """
    # exist_ok avoids the check-then-create race and supports nested folders.
    os.makedirs(output_folder, exist_ok=True)

    for pi in dataframe[PI_FIELD].unique():
        if pandas.isna(pi):
            continue
        pi_projects = dataframe[dataframe[PI_FIELD] == pi]
        # All rows of one PI are assumed to share an institution; use the first.
        pi_institution = pi_projects[INSTITUTION_FIELD].iat[0]
        pi_projects.to_csv(
            os.path.join(output_folder, f"{pi_institution}_{pi}_{invoice_month}.csv")
        )


def export_HU_only(dataframe, output_file):
    """Write only the rows whose institution is "Harvard University" to ``output_file`` as CSV."""
    is_harvard = dataframe[INSTITUTION_FIELD] == "Harvard University"
    dataframe[is_harvard].to_csv(output_file)
Expand All @@ -310,11 +390,7 @@ def export_HU_BU(dataframe, output_file):
HU_BU_projects.to_csv(output_file)


def export_lenovo(dataframe: pandas.DataFrame, output_file=None):
lenovo_file_name = (
output_file or f"Lenovo_{dataframe[INVOICE_DATE_FIELD].iat[0]}.csv"
)

def export_lenovo(dataframe: pandas.DataFrame, output_file):
LENOVO_SU_TYPES = ["OpenShift GPUA100SXM4", "OpenStack GPUA100SXM4"]
SU_CHARGE_MULTIPLIER = 1

Expand All @@ -331,7 +407,20 @@ def export_lenovo(dataframe: pandas.DataFrame, output_file=None):
lenovo_df.rename(columns={SU_HOURS_FIELD: "SU Hours"}, inplace=True)
lenovo_df.insert(len(lenovo_df.columns), "SU Charge", SU_CHARGE_MULTIPLIER)
lenovo_df["Charge"] = lenovo_df["SU Hours"] * lenovo_df["SU Charge"]
lenovo_df.to_csv(lenovo_file_name)
lenovo_df.to_csv(output_file)
# TODO (Quan Pham) Where to place these


def upload_to_s3(invoice_list: list, invoice_month):
    """Upload each local invoice file to the invoice bucket twice.

    For every path in ``invoice_list`` the file extension is dropped (any
    directory part is kept) and two objects are written:

    * ``Invoices/<month>/<name> <month>.csv``
    * ``Invoices/<month>/Archive/<name> <month> <timestamp>.csv``
    """
    bucket = get_invoice_bucket()
    for local_path in invoice_list:
        stem, _extension = os.path.splitext(local_path)
        current_key = f"Invoices/{invoice_month}/{stem} {invoice_month}.csv"
        archive_key = f"Invoices/{invoice_month}/Archive/{stem} {invoice_month} {get_iso8601_time()}.csv"
        bucket.upload_file(local_path, current_key)
        bucket.upload_file(local_path, archive_key)


if __name__ == "__main__":
Expand Down
40 changes: 38 additions & 2 deletions process_report/tests/unit_tests.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from unittest import TestCase
from unittest import TestCase, mock
import tempfile
import pandas
import os
Expand Down Expand Up @@ -181,10 +181,13 @@ def setUp(self):
"Untouch Data Column": ["DataA", "DataB", "DataC", "DataD", "DataE"],
}
self.dataframe = pandas.DataFrame(data)
self.invoice_month = data["Invoice Month"][0]

def test_export_pi(self):
output_dir = tempfile.TemporaryDirectory()
process_report.export_pi_billables(self.dataframe, output_dir.name)
process_report.export_pi_billables(
self.dataframe, output_dir.name, self.invoice_month
)

pi_csv_1 = f'{self.dataframe["Institution"][0]}_{self.dataframe["Manager (PI)"][0]}_{self.dataframe["Invoice Month"][0]}.csv'
pi_csv_2 = f'{self.dataframe["Institution"][3]}_{self.dataframe["Manager (PI)"][3]}_{self.dataframe["Invoice Month"][3]}.csv'
Expand Down Expand Up @@ -412,3 +415,36 @@ def test_apply_credit_0002(self):
["OpenShift GPUA100SXM4", "OpenStack GPUA100SXM4"],
)
self.assertEqual(row["Charge"], row["SU Charge"] * row["SU Hours"])


class TestUploadToS3(TestCase):
    """Checks the S3 object keys that ``upload_to_s3`` builds from local filenames."""

    @mock.patch("process_report.process_report.get_invoice_bucket")
    @mock.patch("process_report.process_report.get_iso8601_time")
    def test_remove_prefix(self, mock_get_time, mock_get_bucket):
        """Each file is uploaded to the month folder and to its Archive subfolder."""
        mock_bucket = mock.MagicMock()
        mock_get_bucket.return_value = mock_bucket
        # Freeze the archive timestamp so the expected keys are deterministic.
        mock_get_time.return_value = "0"

        invoice_month = "2024-03"
        filenames = ["test.csv", "test2.test.csv", "test3"]
        answers = [
            ("test.csv", f"Invoices/{invoice_month}/test {invoice_month}.csv"),
            (
                "test.csv",
                f"Invoices/{invoice_month}/Archive/test {invoice_month} 0.csv",
            ),
            (
                "test2.test.csv",
                f"Invoices/{invoice_month}/test2.test {invoice_month}.csv",
            ),
            (
                "test2.test.csv",
                f"Invoices/{invoice_month}/Archive/test2.test {invoice_month} 0.csv",
            ),
            ("test3", f"Invoices/{invoice_month}/test3 {invoice_month}.csv"),
            ("test3", f"Invoices/{invoice_month}/Archive/test3 {invoice_month} 0.csv"),
        ]

        process_report.upload_to_s3(filenames, invoice_month)

        # Compare the whole call list: iterating over the recorded calls
        # would pass vacuously if upload_file were called too few times.
        expected_calls = [mock.call(*answer) for answer in answers]
        self.assertEqual(mock_bucket.upload_file.call_args_list, expected_calls)
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
pandas
boto3
20 changes: 20 additions & 0 deletions tools/clone_nonbillables_and_process.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,26 @@ if [ ! -e ~/.ssh/config ]; then
chmod 600 ~/.ssh/id_nonbillable
fi

# Seed known_hosts with GitHub's public host keys so the clone below does not
# prompt for host verification. known_hosts is a regular file, so test for
# existence (-e) rather than for a directory (-d), which was always false and
# re-appended the keys on every run.
if [ ! -e ~/.ssh/known_hosts ]; then
    touch ~/.ssh/known_hosts
    echo "github.com ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOMqqnkVzrm0SdG6UOoqKLsabgH5C9okWi0dh2l9GKJl
github.com ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBEmKSENjQEezOmxkZMy7opKgwFB9nkt5YRrYMjNuG5N87uRgg6CLrbo5wAdT/y6v0mKV0U2w0WZ2YB/++Tpockg=
github.com ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQCj7ndNxQowgcQnjshcLrqPEiiphnt+VTTvDP6mHBL9j1aNUkY4Ue1gvwnGLVlOhGeYrnZaMgRK6+PKCUXaDbC7qtbW8gIkhL7aGCsOr/C56SJMy/BCZfxd1nWzAOxSDPgVsmerOBYfNqltV9/hWCqBywINIR+5dIg6JTJ72pcEpEjcYgXkE2YEFXV1JHnsKgbLWNlhScqb2UmyRkQyytRLtL+38TGxkxCflmO+5Z8CSSNY7GidjMIZ7Q4zMjA2n1nGrlTDkzwDCsw+wqFPGQA179cnfGWOWRVruj16z6XyvxvjJwbz0wQZ75XK5tKSb7FNyeIEs4TT4jk+S4dhPeAUC5y+bDYirYgM4GC7uEnztnZyaVWQ7B381AK4Qdrwt51ZqExKbQpTUNn+EjqoTwvqNj4kqx5QUCI0ThS/YkOxJCXmPUWZbhjpCg56i+2aB6CmK2JGhn57K5mj0MNdBXA4/WnwH6XoPWJzK5Nyu2zB3nAZp+S5hpQs+p1vN1/wsjk=
" >> ~/.ssh/known_hosts
fi

# Clone the non-billable project lists only when no local checkout exists yet.
[ -d ./non-billable-projects ] ||
    git clone git@github-nonbillable:CCI-MOC/non-billable-projects.git ./non-billable-projects

# Invoice month is the calendar month before the current one (GNU date syntax).
INVOICE_MONTH=$(date --date="$(date +%Y-%m-01) -1 month" +%Y-%m)

# process_report.py reads its S3 settings from S3_* variables (see
# get_invoice_bucket()). Fall back to the legacy B2_* names so environments
# that still define only those keep working. A variable that is unset under
# both names stays unset, so the Python-side default/error handling applies.
if [ -z "${S3_ENDPOINT:-}" ] && [ -n "${B2_ENDPOINT:-}" ]; then S3_ENDPOINT=$B2_ENDPOINT; fi
if [ -z "${S3_APP_KEY:-}" ] && [ -n "${B2_APP_KEY:-}" ]; then S3_APP_KEY=$B2_APP_KEY; fi
if [ -z "${S3_KEY_ID:-}" ] && [ -n "${B2_KEY_ID:-}" ]; then S3_KEY_ID=$B2_KEY_ID; fi
if [ -z "${S3_BUCKET_NAME:-}" ] && [ -n "${B2_BUCKET_NAME:-}" ]; then S3_BUCKET_NAME=$B2_BUCKET_NAME; fi
export S3_ENDPOINT
export S3_APP_KEY
export S3_KEY_ID
export S3_BUCKET_NAME

# NOTE(review): neither CSV paths nor --fetch-from-s3 are passed, so
# process_report.py receives an empty invoice list; confirm whether
# --fetch-from-s3 / --upload-to-s3 are intended here.
python process_report/process_report.py \
    --invoice-month "$INVOICE_MONTH" \
    --pi-file ./non-billable-projects/pi.txt \
    --projects-file ./non-billable-projects/projects.txt \
    --timed-projects-file ./non-billable-projects/timed_projects.txt \
    --old-pi-file old_pi.csv

0 comments on commit c852c30

Please sign in to comment.