Skip to content

Commit

Permalink
Containerized billing and implemented S3 integration
Browse files Browse the repository at this point in the history
The container expects the user to provide several environment variables. More details in the Dockerfile and `clone_nonbillables_and_process_invoice.sh`.
Most importantly, the `old_pi.csv` file must be located in the repo's home directory when building the Docker image.
The shell script clones the non-billable repo, while the connection to Backblaze B2 is done in the Python script through `boto3`
  • Loading branch information
QuanMPhm committed Apr 22, 2024
1 parent c54a956 commit dda6c6a
Show file tree
Hide file tree
Showing 7 changed files with 222 additions and 51 deletions.
68 changes: 68 additions & 0 deletions .github/workflows/build.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
name: Build

# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

on:
push:
branches: [main]
# Publish semver tags as releases.
tags: ['v*.*.*']
pull_request:
branches: [main]

env:
# Use docker.io for Docker Hub if empty
REGISTRY: ghcr.io
# github.repository as <account>/<repo>
IMAGE_NAME: ${{ github.repository }}


jobs:
build:

runs-on: ubuntu-latest
permissions:
contents: read
packages: write

steps:
- name: Checkout repository
uses: actions/checkout@v3

# Login against a Docker registry except on PR
# https://github.com/docker/login-action
- name: Log into registry ${{ env.REGISTRY }}
if: github.event_name != 'pull_request'
uses: docker/login-action@v2
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

# Extract metadata (tags, labels) for Docker
# https://github.com/docker/metadata-action
- name: Extract Docker metadata
id: meta
uses: docker/metadata-action@v4
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
tags: |
type=semver,pattern=v{{version}}
type=semver,pattern=v{{major}}.{{minor}}
type=semver,pattern=v{{major}}
type=ref,event=branch
type=ref,event=pr
type=sha
# Build and push Docker image with Buildx (don't push on PR)
# https://github.com/docker/build-push-action
- name: Build and push Docker image
uses: docker/build-push-action@v4
with:
context: .
push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
10 changes: 10 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
FROM python:3.11-slim

WORKDIR /app

# git is required because the entrypoint script clones the non-billable repo.
# The apt package lists are removed in the same layer so they do not bloat
# the final image.
RUN apt-get update \
    && apt-get install -y git \
    && rm -rf /var/lib/apt/lists/*

# The whole build context is copied in; old_pi.csv must be present in the
# repo's home directory at build time.
COPY . .
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir -r requirements.txt

CMD ["./tools/clone_nonbillables_and_process_invoice.sh"]
114 changes: 89 additions & 25 deletions process_report/process_report.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import argparse
import os
import sys
import datetime

import json
import pandas
import boto3
from botocore.config import Config


### Invoice field names
Expand Down Expand Up @@ -35,7 +38,7 @@ def get_institution_from_pi(institute_map, pi_uname):


def load_institute_map() -> dict:
with open("institute_map.json", "r") as f:
with open("process_report/institute_map.json", "r") as f:
institute_map = json.load(f)

return institute_map
Expand All @@ -62,15 +65,42 @@ def is_old_pi(old_pi_dict, pi, invoice_month):
return False


def get_invoice_bucket():
    """Return the boto3 ``Bucket`` resource that stores the invoices.

    Connection settings are read from the ``B2_*`` environment variables
    gathered by ``load_S3_env_vars``: ``B2_ENDPOINT``, ``B2_KEY_ID``,
    ``B2_APP_KEY`` and ``B2_BUCKET_NAME``. A missing variable surfaces as
    a ``KeyError``.
    """
    env = load_S3_env_vars()
    sigv4_config = Config(signature_version="s3v4")
    resource = boto3.resource(
        service_name="s3",
        endpoint_url=env["B2_ENDPOINT"],  # Backblaze endpoint
        aws_access_key_id=env["B2_KEY_ID"],  # Backblaze keyID
        aws_secret_access_key=env["B2_APP_KEY"],  # Backblaze applicationKey
        config=sigv4_config,
    )
    bucket_name = env["B2_BUCKET_NAME"]
    return resource.Bucket(bucket_name)


def load_S3_env_vars() -> dict:
    """Collect the Backblaze B2 settings from the process environment.

    Returns:
        A new dict mapping every environment variable whose name starts
        with ``B2_`` to its value; empty when no such variable is set.
    """
    return {
        name: value
        for name, value in os.environ.items()
        if name.startswith("B2_")
    }


def get_iso8601_time():
    """Return the current UTC time as a compact ISO 8601 timestamp.

    The result has the form ``YYYYMMDDTHHMMSSZ``. The trailing ``Z``
    designates UTC, so the clock is read in UTC rather than in the
    machine's local timezone; otherwise the stamp would be wrong on any
    host not running in UTC.
    """
    now_utc = datetime.datetime.now(datetime.timezone.utc)
    return now_utc.strftime("%Y%m%dT%H%M%SZ")


def main():
"""Remove non-billable PIs and projects"""

parser = argparse.ArgumentParser()

parser.add_argument(
"csv_files",
nargs="+",
help="One or more CSV files that need to be processed",
"--invoice-month",
required=True,
help="Invoice month to process",
)
parser.add_argument(
"--pi-file",
Expand Down Expand Up @@ -117,7 +147,10 @@ def main():
help="Name of csv file listing previously billed PIs",
)
args = parser.parse_args()
merged_dataframe = merge_csv(args.csv_files)

invoice_month = args.invoice_month
csv_files = fetch_S3_invoices(invoice_month)
merged_dataframe = merge_csv(csv_files)

pi = []
projects = []
Expand All @@ -126,26 +159,40 @@ def main():
with open(args.projects_file) as file:
projects = [line.rstrip() for line in file]

invoice_date = get_invoice_date(merged_dataframe)
print("Invoice date: " + str(invoice_date))
print("Invoice date: " + str(invoice_month))

timed_projects_list = timed_projects(args.timed_projects_file, invoice_date)
timed_projects_list = timed_projects(args.timed_projects_file, invoice_month)
print("The following timed-projects will not be billed for this period: ")
print(timed_projects_list)

projects = list(set(projects + timed_projects_list))

merged_dataframe = add_institution(merged_dataframe)
remove_billables(merged_dataframe, pi, projects, "non_billable.csv")
remove_billables(merged_dataframe, pi, projects, "nonbillable.csv", invoice_month)

billable_projects = remove_non_billables(merged_dataframe, pi, projects)
billable_projects = validate_pi_names(billable_projects)
credited_projects = apply_credits_new_pi(billable_projects, args.old_pi_file)
export_billables(credited_projects, args.output_file)
export_pi_billables(billable_projects, args.output_folder)
export_HU_only(billable_projects, args.HU_invoice_file)
export_HU_BU(billable_projects, args.HU_BU_invoice_file)
export_lenovo(billable_projects)

export_billables(credited_projects, args.output_file, invoice_month)
export_pi_billables(billable_projects, args.output_folder, invoice_month)
export_HU_only(billable_projects, args.HU_invoice_file, invoice_month)
export_HU_BU(billable_projects, args.HU_BU_invoice_file, invoice_month)
export_lenovo(billable_projects, invoice_month)


def fetch_S3_invoices(invoice_month):
    """Download the usage invoices for ``invoice_month`` from the bucket.

    Every object listed under ``Invoices/<invoice_month>/Service Invoices/``
    is saved under the last path component of its key, as a relative path
    (i.e. into the current working directory).

    Returns:
        The local file names, in the order the bucket listed them.
    """
    downloaded = []
    bucket = get_invoice_bucket()
    key_prefix = f"Invoices/{invoice_month}/Service Invoices/"
    for remote in bucket.objects.filter(Prefix=key_prefix):
        file_name = remote.key.rsplit("/", 1)[-1]
        downloaded.append(file_name)
        bucket.download_file(remote.key, file_name)

    return downloaded


def merge_csv(files):
Expand Down Expand Up @@ -195,15 +242,23 @@ def remove_non_billables(dataframe, pi, projects):
return filtered_dataframe


def remove_billables(dataframe, pi, projects, output_file):
def remove_billables(dataframe, pi, projects, output_file, invoice_month):
    """Export and upload the rows that should NOT be billed.

    Despite the name, billable rows are the ones dropped: only rows whose
    PI is in ``pi`` or whose project is in ``projects`` are kept. The kept
    rows are written to ``output_file`` (without the index column) and
    uploaded twice to the invoice bucket: once under the month's canonical
    name and once, timestamped, under the month's ``Archive/`` folder.
    """
    is_listed_pi = dataframe[PI_FIELD].isin(pi)
    is_listed_project = dataframe[PROJECT_FIELD].isin(projects)
    non_billable = dataframe[is_listed_pi | is_listed_project]

    current_key = f"Invoices/{invoice_month}/NERC (Non-Billable) {invoice_month}.csv"
    archive_key = f"Invoices/{invoice_month}/Archive/NERC (Non-Billable) {invoice_month} {get_iso8601_time()}.csv"

    non_billable.to_csv(output_file, index=False)

    bucket = get_invoice_bucket()
    for remote_key in (current_key, archive_key):
        bucket.upload_file(output_file, remote_key)


def validate_pi_names(dataframe):
Expand All @@ -215,15 +270,22 @@ def validate_pi_names(dataframe):
return dataframe


def export_billables(dataframe, output_file):
dataframe.to_csv(output_file, index=False)
def export_billables(dataframe, output_file, invoice_month):
    """Write the billable invoice to ``output_file`` and upload it.

    The CSV is uploaded twice to the invoice bucket: once under the
    month's canonical name (``Invoices/<month>/NERC <month>.csv``) and
    once, timestamped, under the month's ``Archive/`` folder.

    Args:
        dataframe: Rows to export.
        output_file: Local path the CSV is written to before upload.
        invoice_month: Month identifier used to build the bucket keys.
    """
    invoice_b2_path = f"Invoices/{invoice_month}/NERC {invoice_month}.csv"
    invoice_b2_path_archive = (
        f"Invoices/{invoice_month}/Archive/"
        f"NERC {invoice_month} {get_iso8601_time()}.csv"
    )
    # index=False keeps the dataframe's row index out of the invoice; without
    # it the CSV gains a leading unnamed column, unlike the non-billable export.
    dataframe.to_csv(output_file, index=False)
    invoice_bucket = get_invoice_bucket()
    invoice_bucket.upload_file(output_file, invoice_b2_path)
    invoice_bucket.upload_file(output_file, invoice_b2_path_archive)


def export_pi_billables(dataframe: pandas.DataFrame, output_folder):
def export_pi_billables(dataframe: pandas.DataFrame, output_folder, invoice_month):
if not os.path.exists(output_folder):
os.mkdir(output_folder)

invoice_month = dataframe[INVOICE_DATE_FIELD].iat[0]
pi_list = dataframe[PI_FIELD].unique()

for pi in pi_list:
Expand All @@ -234,6 +296,7 @@ def export_pi_billables(dataframe: pandas.DataFrame, output_folder):
pi_projects.to_csv(
output_folder + f"/{pi_instituition}_{pi}_{invoice_month}.csv"
)
# TODO (Quan Pham) Where to place these


def apply_credits_new_pi(dataframe, old_pi_file):
Expand Down Expand Up @@ -297,23 +360,23 @@ def add_institution(dataframe: pandas.DataFrame):
return dataframe


def export_HU_only(dataframe, output_file):
def export_HU_only(dataframe, output_file, invoice_month):
    """Write the rows whose institution is Harvard University to ``output_file``.

    ``invoice_month`` is accepted for signature parity with the other
    exporters but is not used here yet.
    """
    is_harvard = dataframe[INSTITUTION_FIELD] == "Harvard University"
    dataframe[is_harvard].to_csv(output_file)
    # TODO (Quan Pham) Where to place these


def export_HU_BU(dataframe, output_file):
def export_HU_BU(dataframe, output_file, invoice_month):
    """Write the Harvard University and Boston University rows to ``output_file``.

    ``invoice_month`` is accepted for signature parity with the other
    exporters but is not used here yet.
    """
    institutions = dataframe[INSTITUTION_FIELD]
    is_harvard = institutions == "Harvard University"
    is_boston = institutions == "Boston University"
    dataframe[is_harvard | is_boston].to_csv(output_file)
    # TODO (Quan Pham) Where to place these


def export_lenovo(dataframe: pandas.DataFrame, output_file=None):
lenovo_file_name = (
output_file or f"Lenovo_{dataframe[INVOICE_DATE_FIELD].iat[0]}.csv"
)
def export_lenovo(dataframe: pandas.DataFrame, invoice_month, output_file=None):
lenovo_file_name = output_file or f"Lenovo_{invoice_month}.csv"

LENOVO_SU_TYPES = ["OpenShift GPUA100SXM4", "OpenStack GPUA100SXM4"]
SU_CHARGE_MULTIPLIER = 1
Expand All @@ -332,6 +395,7 @@ def export_lenovo(dataframe: pandas.DataFrame, output_file=None):
lenovo_df.insert(len(lenovo_df.columns), "SU Charge", SU_CHARGE_MULTIPLIER)
lenovo_df["Charge"] = lenovo_df["SU Hours"] * lenovo_df["SU Charge"]
lenovo_df.to_csv(lenovo_file_name)
# TODO (Quan Pham) Where to place these


if __name__ == "__main__":
Expand Down
23 changes: 17 additions & 6 deletions process_report/tests/unit_tests.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from unittest import TestCase
from unittest.mock import patch
import tempfile
import pandas
import os
Expand Down Expand Up @@ -55,6 +56,7 @@ def test_timed_projects(self):
self.assertEqual(excluded_projects, expected_projects)


@patch("process_report.process_report.get_invoice_bucket")
class TestRemoveNonBillables(TestCase):
def setUp(self):
data = {
Expand All @@ -69,7 +71,7 @@ def setUp(self):
"Untouch Data Column": ["DataA", "DataB", "DataC", "DataD", "DataE"],
}
self.dataframe = pandas.DataFrame(data)

self.invoice_month = "2024-03"
self.pi_to_exclude = ["PI2", "PI3"]
self.projects_to_exclude = ["ProjectB", "ProjectD"]

Expand All @@ -80,11 +82,13 @@ def tearDown(self):
os.remove(self.output_file.name)
os.remove(self.output_file2.name)

def test_remove_non_billables(self):
def test_remove_non_billables(self, mock_bucket):
billables_df = process_report.remove_non_billables(
self.dataframe, self.pi_to_exclude, self.projects_to_exclude
)
process_report.export_billables(billables_df, self.output_file.name)
process_report.export_billables(
billables_df, self.output_file.name, self.invoice_month
)

result_df = pandas.read_csv(self.output_file.name)

Expand All @@ -104,12 +108,13 @@ def test_remove_non_billables(self):
self.assertIn("ProjectA", result_df["Project - Allocation"].tolist())
self.assertIn("ProjectE", result_df["Project - Allocation"].tolist())

def test_remove_billables(self):
def test_remove_billables(self, mock_bucket):
process_report.remove_billables(
self.dataframe,
self.pi_to_exclude,
self.projects_to_exclude,
self.output_file2.name,
self.invoice_month,
)

result_df = pandas.read_csv(self.output_file2.name)
Expand Down Expand Up @@ -181,10 +186,13 @@ def setUp(self):
"Untouch Data Column": ["DataA", "DataB", "DataC", "DataD", "DataE"],
}
self.dataframe = pandas.DataFrame(data)
self.invoice_month = data["Invoice Month"][0]

def test_export_pi(self):
output_dir = tempfile.TemporaryDirectory()
process_report.export_pi_billables(self.dataframe, output_dir.name)
process_report.export_pi_billables(
self.dataframe, output_dir.name, self.invoice_month
)

pi_csv_1 = f'{self.dataframe["Institution"][0]}_{self.dataframe["Manager (PI)"][0]}_{self.dataframe["Invoice Month"][0]}.csv'
pi_csv_2 = f'{self.dataframe["Institution"][3]}_{self.dataframe["Manager (PI)"][3]}_{self.dataframe["Invoice Month"][3]}.csv'
Expand Down Expand Up @@ -381,6 +389,7 @@ def setUp(self):
],
}
self.dataframe = pandas.DataFrame(data)
self.invoice_month = "2024-03"

output_file = tempfile.NamedTemporaryFile(delete=False, mode="w", suffix=".csv")
self.output_file = output_file.name
Expand All @@ -389,7 +398,9 @@ def tearDown(self):
os.remove(self.output_file)

def test_apply_credit_0002(self):
process_report.export_lenovo(self.dataframe, self.output_file)
process_report.export_lenovo(
self.dataframe, self.invoice_month, self.output_file
)
output_df = pandas.read_csv(self.output_file)

self.assertTrue(
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
pandas
boto3
Loading

0 comments on commit dda6c6a

Please sign in to comment.