Skip to content

Commit 71f3b70

Browse files
Merge branch 'master' into pr/bgcache
2 parents bc07037 + 8835bd2 commit 71f3b70

File tree

1,274 files changed

+8609
-645
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

1,274 files changed

+8609
-645
lines changed
Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
from functools import lru_cache
2+
from typing import Union
3+
import string
4+
5+
import pandas as pd
6+
7+
8+
@lru_cache()
def get_cip_df():
    """Load the SED-CIP 2022 spreadsheet and add per-level id columns.

    The workbook is read from ``data/SED-CIP-2022.xlsx`` (a path relative to
    the current working directory). The row at position 2 supplies the column
    titles and only the rows after it are kept. Three columns are derived
    from ``SED-CIP code`` through ``get_id``: ``BroadFieldId``,
    ``MajorFieldId`` and ``DetailedFieldId``.

    The result is memoised by ``lru_cache``, so repeated calls return the
    same DataFrame object without re-reading the file.
    """
    frame = pd.read_excel("data/SED-CIP-2022.xlsx")

    # The third row carries the real header; it and everything above it is
    # dropped from the data. The name is rebound on purpose so the unsliced
    # frame is not kept alive next to the slice.
    frame.columns = frame.iloc[2]
    frame = frame.iloc[3:]

    derived_columns = ("BroadFieldId", "MajorFieldId", "DetailedFieldId")
    for level, column in enumerate(derived_columns):
        # Bind ``level`` as a default so each lambda keeps its own granularity.
        frame[column] = frame['SED-CIP code'].apply(lambda code, level=level: get_id(code, level))

    return frame
22+
23+
24+
def get_matching_rows(cip_df, broad_id, major_id, detailed_id):
    """Select the rows of ``cip_df`` matching the id at the finest grain available.

    The lookup is attempted at three levels, most specific first:
    broad + major + detailed, then broad + major, then broad only. The rows
    of the first level that matches anything are returned.

    Args:
        cip_df: DataFrame with ``BroadFieldId``, ``MajorFieldId`` and
            ``DetailedFieldId`` columns.
        broad_id: Broad-field part of the code.
        major_id: Major-field part of the code (may be ``None``).
        detailed_id: Detailed-field part of the code (may be ``None``).

    Returns:
        The non-empty subset of ``cip_df`` found at the finest matching level.

    Raises:
        ValueError: If not even the broad id matches a row.
    """
    # Build the masks cumulatively: each level narrows the previous one.
    broad_mask = cip_df["BroadFieldId"] == broad_id
    major_mask = broad_mask & (cip_df['MajorFieldId'] == major_id)
    detailed_mask = major_mask & (cip_df["DetailedFieldId"] == detailed_id)

    # Walk from the finest grain to the broadest and stop at the first hit.
    for mask in (detailed_mask, major_mask, broad_mask):
        selected = cip_df[mask]
        if len(selected) > 0:
            return selected

    raise ValueError(f"No matching rows for {broad_id}.{major_id}{detailed_id}")
46+
47+
48+
def _most_common_value(cip_df, scope_mask, column, candidates):
    """Return the candidate that labels the most rows of ``cip_df`` within ``scope_mask``.

    ``candidates`` is walked in the order given and only a strictly larger
    count replaces the current winner, so on a tie the earliest candidate
    wins. Returns ``None`` when no candidate matches any row.
    """
    column_values = cip_df[column]
    best_option = None
    max_rows = 0
    for candidate in candidates:
        row_count = int((scope_mask & (column_values == candidate)).sum())
        if row_count > max_rows:
            max_rows = row_count
            best_option = candidate
    return best_option


def map_id_to_fields_of_science(id: str):
    """Map a field-of-science id onto its broad, major and detailed field names.

    An exact match on ``SED-CIP code`` is returned directly. Otherwise the id
    is split with ``get_id`` and the closest rows are located with
    ``get_matching_rows``. For every level the id actually specifies, the
    label used most often among the rows sharing that id prefix is chosen.
    Ties go to the label that appears first in the matching rows, which keeps
    the result stable between runs.

    Args:
        id: Field-of-science id, e.g. ``"26.15"``.

    Returns:
        A three-item list ``[broad, major, detailed]``. Levels the id does not
        specify, or for which nothing could be determined, are ``None``.
    """
    cip_df = get_cip_df()

    # If we have a direct match, return it
    direct_match = cip_df[cip_df["SED-CIP code"] == id]
    if len(direct_match) > 0:
        return [
            direct_match["New broad field"].values[0],
            direct_match["New major field"].values[0],
            direct_match["New detailed field"].values[0],
        ]

    broad_id = get_id(id, 0)
    major_id = get_id(id, 1)
    detailed_id = get_id(id, 2)

    try:
        matching_rows = get_matching_rows(cip_df, broad_id, major_id, detailed_id)
    except ValueError:
        # Nothing matches even at the broad level: report the id and give up.
        print(id)
        return [None, None, None]

    code_label = f"{broad_id}.{major_id}{detailed_id}"

    # Each level is scored against all rows sharing the id prefix up to that level.
    broad_mask = cip_df["BroadFieldId"] == broad_id
    major_mask = broad_mask & (cip_df['MajorFieldId'] == major_id)
    detailed_mask = major_mask & (cip_df["DetailedFieldId"] == detailed_id)

    broad_field_of_science = None
    if broad_id is not None:
        # Series.unique() keeps first-appearance order, unlike a set.
        candidates = list(matching_rows["New broad field"].unique())
        broad_field_of_science = _most_common_value(cip_df, broad_mask, "New broad field", candidates)
        print(f"Broad Field: {code_label} has possible values {set(candidates)} we picked {broad_field_of_science}")

    major_field_of_science = None
    if major_id is not None:
        candidates = list(matching_rows["New major field"].unique())
        major_field_of_science = _most_common_value(cip_df, major_mask, "New major field", candidates)
        print(f"Major Field: {code_label} has rows {set(candidates)} we picked {major_field_of_science}")

    detailed_field_of_science = None
    if detailed_id is not None:
        candidates = list(matching_rows["New detailed field"].unique())
        detailed_field_of_science = _most_common_value(cip_df, detailed_mask, "New detailed field", candidates)
        print(f"Detailed Field: {code_label} has rows {set(candidates)} we picked {detailed_field_of_science}")

    return [broad_field_of_science, major_field_of_science, detailed_field_of_science]
119+
120+
121+
def get_id(id: Union[float, str], granularity: int):
    """Extract one two-digit-style component from a field-of-science code.

    The code is treated as ``BB.MMDD...``: granularity 0 is the broad part
    (first two digits), 1 the major part (digits three and four) and 2 the
    detailed part (everything from the fifth digit on). Codes that were read
    as floats lose leading and trailing zeros, so they are padded back first.

    Args:
        id: The code, as a string or as a float (e.g. ``"26.15"`` or ``1.0``).
        granularity: 0 for broad, 1 for major, 2 for detailed.

    Returns:
        The requested component as a string of digits, or ``None`` when
        ``id`` is missing (``pd.isna``), when the code does not carry that
        level, or when ``granularity`` is not 0, 1 or 2.
    """
    if pd.isna(id):
        return None

    text = str(id)

    # Fix up issues from reading the id as a float
    digits = [char for char in text if char in string.digits]

    # A single character before the dot means the leading zero was lost (01.2023 -> 1.2023)
    if len(text.split(".")[0]) == 1:
        digits = ['0', *digits]

    # An odd digit count means a trailing zero was lost (10.2320 -> 10.232).
    # After this the count is always even, so no further padding is needed.
    if len(digits) % 2 == 1:
        digits = [*digits, '0']

    if granularity == 0:
        return "".join(digits[:2])

    if granularity == 1:
        if len(digits) < 4:
            return None
        return "".join(digits[2:4])

    if granularity == 2:
        if len(digits) < 6:
            return None
        return "".join(digits[4:])

    # Unknown granularity: keep the historical behaviour of yielding nothing.
    return None
158+
159+
160+
def tests():
    """Run the module's self-checks.

    Exercises ``get_id`` on float inputs that lost leading or trailing zeros
    and ``map_id_to_fields_of_science`` on one known id.

    Raises:
        ValueError: On the first failing check; the message names the call,
            the value it returned and the value that was expected.
    """
    # ((id, granularity), expected component)
    get_id_cases = [
        ((1.0, 0), "01"),
        ((1.0, 1), "00"),
        ((10.2320, 2), "20"),
        ((10.2320, 1), "23"),
        ((10.2320, 0), "10"),
        ((1.23, 2), None),
        ((1.23, 0), "01"),
    ]

    for (value, granularity), expected in get_id_cases:
        actual = get_id(value, granularity)
        if actual != expected:
            raise ValueError(
                f"Test failed: get_id({value!r}, {granularity}) returned {actual!r}, expected {expected!r}"
            )

    expected_fields = ['Biological and biomedical sciences', 'Neurobiology and neurosciences', None]
    actual_fields = map_id_to_fields_of_science("26.15")
    if actual_fields != expected_fields:
        raise ValueError(
            f"Test failed: map_id_to_fields_of_science('26.15') returned {actual_fields!r}, "
            f"expected {expected_fields!r}"
        )
185+
186+
# When executed as a script, run the self-checks; tests() raises on the first
# failure, so the success message is only printed when every check passed.
if __name__ == "__main__":
    tests()
    print("All tests passed")
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
import sys
2+
import datetime
3+
4+
import yaml
5+
import requests
6+
7+
from field_of_science import get_id
8+
9+
10+
def get_active_projects(start_date: datetime.datetime):
    """Return the names of projects with payload records since ``start_date``.

    Queries the GRACC summary index for records whose ``ResourceType`` is
    ``Payload`` and whose ``EndTime`` lies between ``start_date`` and now,
    aggregated by ``ProjectName``.

    Args:
        start_date: Lower bound of the ``EndTime`` window.

    Returns:
        A list with the key of every ``projects`` aggregation bucket.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        requests.Timeout: If the service does not answer in time.
    """
    # The index stores timestamps as epoch milliseconds.
    window_end_ms = int(datetime.datetime.now().timestamp() * 1000)
    window_start_ms = int(start_date.timestamp() * 1000)

    response = requests.get(
        "https://gracc.opensciencegrid.org/q/gracc.osg.summary/_search",
        json={
            "size": 0,
            "query": {
                "bool": {
                    "filter": [
                        {
                            "term": {
                                "ResourceType": "Payload"
                            }
                        },
                        {
                            "range": {
                                "EndTime": {
                                    "lte": window_end_ms,
                                    "gte": window_start_ms
                                }
                            }
                        }
                    ]
                },
            },
            "aggs": {
                "projects": {
                    "terms": {
                        "field": "ProjectName",
                        "size": 99999999
                    },
                    "aggs": {
                        "projectJobsRan": {
                            "sum": {
                                "field": "Njobs"
                            }
                        }
                    }
                }
            }
        },
        # Without a timeout requests waits forever on an unresponsive server.
        timeout=60,
    )

    # Fail with the HTTP error itself rather than a confusing KeyError below.
    response.raise_for_status()

    data = response.json()

    return [bucket['key'] for bucket in data['aggregations']['projects']['buckets']]
57+
58+
59+
60+
def has_detailed_precision(id: str):
    """Tell whether ``id`` is specific enough for ``get_id`` to yield a granularity-1 part.

    Returns ``True`` when ``get_id(id, granularity=1)`` is not ``None``.
    """
    major_part = get_id(id, granularity=1)
    return major_part is not None
62+
63+
64+
def main():
    """Fail when a recently active project lacks a precise field-of-science id.

    Fetches the projects active during the last 365 days, loads each
    project's YAML file and collects every project whose file has no
    ``FieldOfScienceID`` or whose id fails ``has_detailed_precision``.
    Active projects without a YAML file are skipped.

    Raises:
        Exception: If at least one project was collected; the individual
            findings are written to standard error first.
    """
    # Local import so this block does not depend on the file-level import list.
    import os

    # Resolve the projects directory from this script's location rather than
    # the working directory, so the check behaves the same wherever it is
    # launched from.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    projects_dir = os.path.join(script_dir, "..", "..", "..", "projects")

    one_year_ago = datetime.datetime.now() - datetime.timedelta(days=365)
    active_project_names = get_active_projects(one_year_ago)

    print(active_project_names)

    exceptions = []
    for project_name in active_project_names:
        project_path = os.path.join(projects_dir, f"{project_name}.yaml")

        try:
            with open(project_path) as project_file:
                # NOTE(review): yaml.Loader can construct arbitrary Python
                # objects; if these files can come from untrusted pull
                # requests, consider yaml.safe_load instead.
                project_data = yaml.load(project_file, Loader=yaml.Loader)
        except FileNotFoundError:
            # Best effort: an active project without a file here is not checked.
            continue

        if "FieldOfScienceID" not in project_data or not has_detailed_precision(project_data["FieldOfScienceID"]):
            exceptions.append(f"Project {project_name} is running in the OSPool without detailed precision.")

    if exceptions:
        print("\n".join(exceptions), file=sys.stderr)
        raise Exception("Projects without detailed precision need to be updated.")
85+
86+
87+
# Script entry point: run the precision check when executed directly.
if __name__ == "__main__":
    main()
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
certifi==2024.2.2
2+
charset-normalizer==3.3.2
3+
idna==3.7
4+
numpy==1.26.4
5+
pandas==2.2.2
6+
python-dateutil==2.9.0.post0
7+
pytz==2024.1
8+
PyYAML==6.0.1
9+
requests==2.31.0
10+
six==1.16.0
11+
tzdata==2024.1
12+
urllib3==2.2.1
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# Runs the check_project_fos_precision script on pull requests and on a
# daily schedule.
name: Check Project FOS Precision
on:
  pull_request:
    branches:
      # NOTE(review): confirm this is the branch pull requests actually
      # target; the filter only fires for PRs whose base branch is `main`.
      - main
  schedule:
    # Once a day at midnight.
    - cron: '0 0 * * *'

jobs:
  check:
    name: Check
    runs-on: ubuntu-latest
    # Skip the job in repositories outside the opensciencegrid organisation (e.g. forks).
    if: startsWith(github.repository, 'opensciencegrid/')
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: 3.9.15
          cache: 'pip' # caching pip dependencies
      - run: pip install -r ./.github/scripts/check_project_fos_precision/requirements.txt
      # The script is started from the repository root (the checkout directory).
      - run: python ./.github/scripts/check_project_fos_precision/main.py

Dockerfile

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,9 @@ RUN pip3 install --no-cache-dir -r requirements-apache.txt
3030
# Create data directory, and gather SSH keys for git
3131
RUN mkdir /data && \
3232
chown -v apache:apache /data && \
33-
ssh-keyscan github.com bitbucket.org >> /etc/ssh/ssh_known_hosts
33+
ssh-keyscan github.com bitbucket.org >> /etc/ssh/ssh_known_hosts && \
34+
git config --global --add safe.directory /data/app/topology && \
35+
git config --global --add safe.directory /data/app/contact
3436

3537
# Add fetch-crl cronjob
3638
# Add daily restart of httpd to load renewed certificates

Procfile

Lines changed: 0 additions & 2 deletions
This file was deleted.

bin/osg-notify

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ if __name__ == "__main__" and __package__ is None:
2222
sys.path.append(_parent + "/src")
2323

2424
import topology_utils
25+
from topology_utils import TopologyPoolManager
2526
import net_name_addr_utils
2627

2728
# Parts of this implementation are from the following StackOverflow answer:
@@ -182,13 +183,14 @@ def has_non_printable_ascii_characters(contents):
182183

183184
def main():
184185
args = parseargs()
185-
186+
pm = TopologyPoolManager()
186187
recipients = set(args.recipients.split())
187188
if args.oim_recipients and 'vos' in args.oim_recipients:
188189
attempts = 3
189190
while attempts > 0:
190191
try:
191-
results = topology_utils.get_vo_contacts(args)
192+
results = pm.get_vo_contacts(args)
193+
break
192194
except topology_utils.InvalidPathError as exc:
193195
print(exc)
194196
exit(1)
@@ -211,9 +213,10 @@ def main():
211213
while attempts > 0:
212214
try:
213215
if args.fqdn_filter:
214-
results = topology_utils.get_resource_contacts_by_fqdn(args)
216+
results = pm.get_resource_contacts_by_fqdn(args)
215217
else:
216-
results = topology_utils.get_resource_contacts(args)
218+
results = pm.get_resource_contacts(args)
219+
break
217220
except topology_utils.InvalidPathError as exc:
218221
exit(str(exc))
219222
except topology_utils.IncorrectPasswordError as exc:

0 commit comments

Comments
 (0)