opensciencegrid
diff --git a/‎.github/scripts/check_project_fos_precision/field_of_science.py
Lines changed: 188 additions & 0 deletions b/‎.github/scripts/check_project_fos_precision/field_of_science.py
Lines changed: 188 additions & 0 deletions
diff --git a/‎.github/scripts/check_project_fos_precision/main.py
Lines changed: 88 additions & 0 deletions b/‎.github/scripts/check_project_fos_precision/main.py
Lines changed: 88 additions & 0 deletions
diff --git a/‎.github/scripts/check_project_fos_precision/requirements.txt
Lines changed: 12 additions & 0 deletions b/‎.github/scripts/check_project_fos_precision/requirements.txt
Lines changed: 12 additions & 0 deletions
diff --git a/‎.github/workflows/check_project_fos_precision.yml
Lines changed: 22 additions & 0 deletions b/‎.github/workflows/check_project_fos_precision.yml
Lines changed: 22 additions & 0 deletions
diff --git a/‎Dockerfile
Lines changed: 3 additions & 1 deletion b/‎Dockerfile
Lines changed: 3 additions & 1 deletion
diff --git a/‎Procfile
Lines changed: 0 additions & 2 deletions b/‎Procfile
Lines changed: 0 additions & 2 deletions
diff --git a/‎bin/osg-notify
Lines changed: 7 additions & 4 deletions b/‎bin/osg-notify
Lines changed: 7 additions & 4 deletions
@@ -0,0 +1,188 @@
+from functools import lru_cache
+from typing import Union
+import string
+
+import pandas as pd
+
+
+@lru_cache()
+def get_cip_df():
+    """Load the SED-CIP 2022 spreadsheet and add parsed id columns.
+
+    The workbook is read from ``data/SED-CIP-2022.xlsx``, a path that is
+    relative to the current working directory.  The row at position 2 supplies
+    the column titles; that row and the rows before it are discarded.  Three
+    columns are then derived from ``SED-CIP code`` with ``get_id``:
+    ``BroadFieldId``, ``MajorFieldId`` and ``DetailedFieldId``.
+
+    The result is memoised by ``lru_cache``, so every caller receives the same
+    DataFrame object and should treat it as read-only.
+
+    Returns:
+        pandas.DataFrame: the data rows plus the three derived id columns.
+    """
+
+    raw_df = pd.read_excel("data/SED-CIP-2022.xlsx")
+
+    # The row at position 2 holds the real column titles; everything up to and
+    # including it is dropped from the data.
+    header = raw_df.iloc[2]
+
+    # Take an explicit copy so the derived columns below are written to an
+    # independent frame rather than to a slice of the raw sheet.
+    cip_df = raw_df.iloc[3:].copy()
+    cip_df.columns = header
+
+    codes = cip_df['SED-CIP code']
+    cip_df["BroadFieldId"] = codes.apply(lambda x: get_id(x, 0))
+    cip_df["MajorFieldId"] = codes.apply(lambda x: get_id(x, 1))
+    cip_df["DetailedFieldId"] = codes.apply(lambda x: get_id(x, 2))
+
+    return cip_df
+
+
+def get_matching_rows(cip_df, broad_id, major_id, detailed_id):
+    """Select the rows of ``cip_df`` that match the ids at the finest grain possible.
+
+    The broad + major + detailed combination is tried first, then broad +
+    major, then broad alone.  The first selection that contains at least one
+    row is returned.
+
+    Args:
+        cip_df: frame with ``BroadFieldId``, ``MajorFieldId`` and
+            ``DetailedFieldId`` columns.
+        broad_id: value compared against ``BroadFieldId``.
+        major_id: value compared against ``MajorFieldId``.
+        detailed_id: value compared against ``DetailedFieldId``.
+
+    Returns:
+        The matching subset of ``cip_df``.
+
+    Raises:
+        ValueError: if not even the broad id matches any row.
+    """
+
+    same_broad = cip_df["BroadFieldId"] == broad_id
+    same_major = same_broad & (cip_df['MajorFieldId'] == major_id)
+    same_detailed = same_major & (cip_df["DetailedFieldId"] == detailed_id)
+
+    # Walk from the finest grain to the coarsest; the first hit wins
+    for mask in (same_detailed, same_major, same_broad):
+        rows = cip_df[mask]
+        if len(rows) != 0:
+            return rows
+
+    raise ValueError(f"No matching rows for {broad_id}.{major_id}{detailed_id}")
+
+
+def _most_common_label(cip_df, id_mask, column, candidates):
+    """Return the candidate that labels the most rows of ``cip_df`` selected by ``id_mask``.
+
+    ``candidates`` is scanned in order and only a strictly larger row count
+    replaces the current pick, so ties go to the earliest candidate.  ``None``
+    is returned when no candidate labels any selected row.
+    """
+    best_option = None
+    max_rows = 0
+    for candidate in candidates:
+        row_count = int((id_mask & (cip_df[column] == candidate)).sum())
+        if row_count > max_rows:
+            max_rows = row_count
+            best_option = candidate
+
+    return best_option
+
+
+def map_id_to_fields_of_science(id: str):
+    """Map a SED-CIP code to its ``[broad, major, detailed]`` field-of-science names.
+
+    If a row's ``SED-CIP code`` equals ``id`` its three ``New ... field``
+    values are returned directly.  Otherwise the code is split with ``get_id``,
+    the closest rows are fetched with ``get_matching_rows`` and, for every
+    level whose id component is present, the label carried by the most rows
+    sharing that id prefix is chosen.
+
+    Candidate labels are considered in the order they first appear in the
+    matching rows, so the choice between equally frequent labels is the same
+    on every run.
+
+    Args:
+        id: the code to look up.
+
+    Returns:
+        list: ``[broad, major, detailed]``; an entry is ``None`` when that
+        level could not be determined.  When no row matches at all, ``id`` is
+        printed and all three entries are ``None``.
+    """
+
+    cip_df = get_cip_df()
+
+    # If we have a direct match, return it
+    direct_match = cip_df[cip_df["SED-CIP code"] == id]
+    if len(direct_match) > 0:
+        return [
+            direct_match["New broad field"].values[0],
+            direct_match["New major field"].values[0],
+            direct_match["New detailed field"].values[0],
+        ]
+
+    broad_id = get_id(id, 0)
+    major_id = get_id(id, 1)
+    detailed_id = get_id(id, 2)
+
+    try:
+        matching_rows = get_matching_rows(cip_df, broad_id, major_id, detailed_id)
+    except ValueError:
+        # Nothing in the table shares even the broad id; report the code and give up
+        print(id)
+        return [None, None, None]
+
+    # Row selections for each id prefix, from coarsest to finest
+    broad_mask = cip_df["BroadFieldId"] == broad_id
+    major_mask = broad_mask & (cip_df['MajorFieldId'] == major_id)
+    detailed_mask = major_mask & (cip_df["DetailedFieldId"] == detailed_id)
+
+    broad_field_of_science = None
+    major_field_of_science = None
+    detailed_field_of_science = None
+
+    # dict.fromkeys de-duplicates while keeping first-appearance order
+    possible_broad_fields = list(dict.fromkeys(matching_rows["New broad field"]))
+    if broad_id is not None:
+        broad_field_of_science = _most_common_label(
+            cip_df, broad_mask, "New broad field", possible_broad_fields)
+        print(f"Broad Field: {broad_id}.{major_id}{detailed_id} has possible values {possible_broad_fields} we picked {broad_field_of_science}")
+
+    possible_major_fields = list(dict.fromkeys(matching_rows["New major field"]))
+    if major_id is not None:
+        major_field_of_science = _most_common_label(
+            cip_df, major_mask, "New major field", possible_major_fields)
+        print(f"Major Field: {broad_id}.{major_id}{detailed_id} has rows {possible_major_fields} we picked {major_field_of_science}")
+
+    possible_detailed_fields = list(dict.fromkeys(matching_rows["New detailed field"]))
+    if detailed_id is not None:
+        detailed_field_of_science = _most_common_label(
+            cip_df, detailed_mask, "New detailed field", possible_detailed_fields)
+        print(f"Detailed Field: {broad_id}.{major_id}{detailed_id} has rows {possible_detailed_fields} we picked {detailed_field_of_science}")
+
+    return [broad_field_of_science, major_field_of_science, detailed_field_of_science]
+
+
+def get_id(id: Union[float, str], granularity: int):
+    """Extract one component of a SED-CIP style code such as ``10.2320``.
+
+    The code is reduced to its digits and then repaired for the zeros that are
+    lost when it has been read as a float: a single character before the ``.``
+    gets a leading ``0`` (``1.23`` -> ``0123``) and an odd digit count gets a
+    trailing ``0`` (``10.232`` -> ``102320``).
+
+    Args:
+        id: the code, as text or as a float; missing values are accepted.
+        granularity: ``0`` for the first two digits, ``1`` for digits three
+            and four, ``2`` for everything from the fifth digit on.
+
+    Returns:
+        The requested digits as a string, or ``None`` when ``id`` is missing,
+        when the code has too few digits for the requested component, or when
+        ``granularity`` is not 0, 1 or 2.
+    """
+
+    # Check if None
+    if pd.isna(id):
+        return None
+
+    text = str(id)
+
+    # Fix up issues from reading the id as a float
+    digits = [x for x in text if x in string.digits]
+
+    # If the first part is preceded with a 0, (01.2023)
+    if len(text.split(".")[0]) == 1:
+        digits = ['0', *digits]
+
+    # If the number ends with a 0, (10.2320); after this the count is always even
+    if len(digits) % 2 == 1:
+        digits = [*digits, '0']
+
+    if granularity == 0:
+        return "".join(digits[:2])
+
+    if granularity == 1:
+        return "".join(digits[2:4]) if len(digits) >= 4 else None
+
+    if granularity == 2:
+        return "".join(digits[4:]) if len(digits) >= 6 else None
+
+    # Any other granularity has no defined component
+    return None
+
+
+def tests():
+    """Run the module's self-checks.
+
+    The ``get_id`` cases need no external data.  The final
+    ``map_id_to_fields_of_science`` case goes through ``get_cip_df`` and so
+    needs the spreadsheet that function reads.
+
+    Raises:
+        ValueError: on the first failing case; the message names the call,
+            the value it returned and the value that was expected.
+    """
+
+    # (arguments, expected result).  1.23 stands for the code "01.23": a float
+    # cannot keep the leading zero, which is exactly what get_id repairs.
+    get_id_cases = [
+        ((1.0, 0), "01"),
+        ((1.0, 1), "00"),
+        ((10.2320, 2), "20"),
+        ((10.2320, 1), "23"),
+        ((10.2320, 0), "10"),
+        ((1.23, 2), None),
+        ((1.23, 0), "01"),
+    ]
+
+    for args, expected in get_id_cases:
+        actual = get_id(*args)
+        if actual != expected:
+            raise ValueError(f"Test failed: get_id{args} returned {actual!r}, expected {expected!r}")
+
+    expected_fields = ['Biological and biomedical sciences', 'Neurobiology and neurosciences', None]
+    actual_fields = map_id_to_fields_of_science("26.15")
+    if actual_fields != expected_fields:
+        raise ValueError(
+            f"Test failed: map_id_to_fields_of_science('26.15') returned {actual_fields!r}, "
+            f"expected {expected_fields!r}")
+
+
+# Executing this module directly runs the self-checks in tests(); any failure
+# raises before the success message is printed.
+if __name__ == "__main__":
+    tests()
+    print("All tests passed")
@@ -0,0 +1,88 @@
+import sys
+import datetime
+
+import yaml
+import requests
+
+from field_of_science import get_id
+
+
+def get_active_projects(start_date: datetime.datetime):
+    """Return the names of projects with ``Payload`` records since ``start_date``.
+
+    Queries the ``gracc.osg.summary`` index for records whose ``ResourceType``
+    is ``Payload`` and whose ``EndTime`` lies between ``start_date`` and now,
+    aggregated by ``ProjectName``.
+
+    Args:
+        start_date: lower bound of the ``EndTime`` window.
+
+    Returns:
+        list: the key of every bucket in the ``projects`` aggregation.
+
+    Raises:
+        requests.RequestException: if the request times out, cannot be sent,
+            or the server answers with an error status.
+    """
+    # The range filter is expressed in epoch milliseconds
+    window_start_ms = int(start_date.timestamp() * 1000)
+    window_end_ms = int(datetime.datetime.now().timestamp() * 1000)
+
+    query = {
+        "size": 0,
+        "query": {
+            "bool": {
+                "filter": [
+                    {
+                        "term": {
+                            "ResourceType": "Payload"
+                        }
+                    },
+                    {
+                        "range": {
+                            "EndTime": {
+                                "lte": window_end_ms,
+                                "gte": window_start_ms
+                            }
+                        }
+                    }
+                ]
+            },
+        },
+        "aggs": {
+            "projects": {
+                "terms": {
+                    "field": "ProjectName",
+                    "size": 99999999
+                },
+                "aggs": {
+                    "projectJobsRan": {
+                        "sum": {
+                            "field": "Njobs"
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    response = requests.get(
+        "https://gracc.opensciencegrid.org/q/gracc.osg.summary/_search",
+        json=query,
+        # Without a timeout a stalled connection would hang the job indefinitely
+        timeout=60,
+    )
+
+    # Fail on an HTTP error here instead of on a confusing JSON/KeyError below
+    response.raise_for_status()
+
+    data = response.json()
+
+    return [bucket['key'] for bucket in data['aggregations']['projects']['buckets']]
+
+
+
+def has_detailed_precision(id: str):
+    """Tell whether ``id`` is precise enough for the check in ``main``.
+
+    True when ``get_id(id, granularity=1)`` yields a value, i.e. when the code
+    carries a component at granularity 1; False when that call returns ``None``.
+    """
+    granularity_one_component = get_id(id, granularity=1)
+    return granularity_one_component is not None
+
+
+def main():
+    """Fail if a project active in the last year lacks a precise field-of-science id.
+
+    Fetches the project names active during the past 365 days, loads each
+    project's YAML file and records every project whose file has no
+    ``FieldOfScienceID`` or whose id fails ``has_detailed_precision``.
+    Projects without a YAML file are skipped.
+
+    Raises:
+        Exception: if at least one project was recorded; the individual
+            messages are written to stderr first.
+    """
+    # Imported here because only this entry point needs it
+    from pathlib import Path
+
+    # The project files live in <repository root>/projects, three levels above
+    # this script's directory.  Anchoring on __file__ keeps the lookup correct
+    # whatever the current working directory is.
+    projects_dir = Path(__file__).resolve().parents[3] / "projects"
+
+    one_year_ago = datetime.datetime.now() - datetime.timedelta(days=365)
+    active_project_names = get_active_projects(one_year_ago)
+
+    print(active_project_names)
+
+    exceptions = []
+    for project_name in active_project_names:
+        try:
+            with open(projects_dir / f"{project_name}.yaml") as project_file:
+                # NOTE(review): yaml.Loader can construct arbitrary Python
+                # objects.  If these files can come from untrusted pull
+                # requests, yaml.safe_load would be the safer choice.
+                project_data = yaml.load(project_file, Loader=yaml.Loader)
+        except FileNotFoundError:
+            # No project file for this name; nothing to check
+            continue
+
+        # An empty file loads as None and is treated like a missing id
+        if not project_data or "FieldOfScienceID" not in project_data or not has_detailed_precision(project_data["FieldOfScienceID"]):
+            exceptions.append(f"Project {project_name} is running in the OSPool without detailed precision.")
+
+    if exceptions:
+        print("\n".join(exceptions), file=sys.stderr)
+        raise Exception("Projects without detailed precision need to be updated.")
+
+
+# Script entry point: run the precision check; an uncaught exception from
+# main() makes the process exit with a failure status.
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,12 @@
+certifi==2024.2.2
+charset-normalizer==3.3.2
+idna==3.7
+numpy==1.26.4
+pandas==2.2.2
+python-dateutil==2.9.0.post0
+pytz==2024.1
+PyYAML==6.0.1
+requests==2.31.0
+six==1.16.0
+tzdata==2024.1
+urllib3==2.2.1
@@ -0,0 +1,22 @@
+# Runs the project field-of-science precision check script.
+name: Check Project FOS Precision
+on:
+  # On every pull request that targets main ...
+  pull_request:
+    branches:
+      - main
+  # ... and once a day at minute 0 of hour 0.
+  schedule:
+    - cron: '0 0 * * *'
+
+jobs:
+  check:
+    name: Check
+    runs-on: ubuntu-latest
+    # Only run in repositories whose full name starts with 'opensciencegrid/'
+    if: startsWith(github.repository, 'opensciencegrid/')
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.9.15
+          cache: 'pip' # caching pip dependencies
+      # Both commands below use paths relative to the job's default working
+      # directory; no working-directory is set for either step.
+      - run: pip install -r ./.github/scripts/check_project_fos_precision/requirements.txt
+      - run: python ./.github/scripts/check_project_fos_precision/main.py
@@ -30,7 +30,9 @@ RUN pip3 install --no-cache-dir -r requirements-apache.txt
 # Create data directory, and gather SSH keys for git
 RUN mkdir                  /data && \
     chown -v apache:apache /data && \
-    ssh-keyscan github.com bitbucket.org >> /etc/ssh/ssh_known_hosts
+    ssh-keyscan github.com bitbucket.org >> /etc/ssh/ssh_known_hosts && \
+    git config --global --add safe.directory /data/app/topology && \
+    git config --global --add safe.directory /data/app/contact
 
 # Add fetch-crl cronjob
 # Add daily restart of httpd to load renewed certificates
 
@@ -22,6 +22,7 @@ if __name__ == "__main__" and __package__ is None:
     sys.path.append(_parent + "/src")
 
 import topology_utils
+from topology_utils import TopologyPoolManager
 import net_name_addr_utils
 
 # Parts of this implementation are from the following StackOverflow answer:
@@ -182,13 +183,14 @@ def has_non_printable_ascii_characters(contents):
 
 def main():
     args = parseargs()
-
+    pm = TopologyPoolManager()
     recipients = set(args.recipients.split())
     if args.oim_recipients and 'vos' in args.oim_recipients:
         attempts = 3
         while attempts > 0:
             try:
-                results = topology_utils.get_vo_contacts(args)
+                results = pm.get_vo_contacts(args)
+                break
             except topology_utils.InvalidPathError as exc:
                 print(exc)
                 exit(1)
@@ -211,9 +213,10 @@ def main():
         while attempts > 0:
             try:
                 if args.fqdn_filter:
-                    results = topology_utils.get_resource_contacts_by_fqdn(args)
+                    results = pm.get_resource_contacts_by_fqdn(args)
                 else:
-                    results = topology_utils.get_resource_contacts(args)
+                    results = pm.get_resource_contacts(args)
+                break
             except topology_utils.InvalidPathError as exc:
                 exit(str(exc))
             except topology_utils.IncorrectPasswordError as exc: