Skip to content

Commit 834548e

Browse files
committed
normalize affiliations for authorship data drawn from datatracker
1 parent 375751f commit 834548e

File tree

3 files changed

+1081
-1020
lines changed

3 files changed

+1081
-1020
lines changed

bigbang/analysis/datatracker.py

Lines changed: 42 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -2,20 +2,35 @@
22
Scripts for processing data from the IETF DataTracker
33
"""
44

5-
from ietfdata.datatracker import *
6-
from ietfdata.datatracker_ext import *
5+
from bigbang.config import CONFIG
6+
7+
import bigbang.datasets.organizations as bdo
8+
79
from datetime import date, datetime, timezone
810
from dateutil.parser import *
911
import json as json
1012

1113
import pandas as pd
1214
import re
1315

16+
17+
from ietfdata.datatracker import *
18+
from ietfdata.datatracker_ext import *
19+
from ietfdata.rfcindex import *
20+
21+
import sys
22+
23+
# adding the cache configuration path here
24+
cache_path = os.path.abspath(os.path.join(os.path.dirname(__file__), CONFIG.ietfdata_cache_path))
25+
sys.path.insert(0, cache_path)
26+
print(f"cache path: {cache_path}")
27+
1428
dt = DataTrackerExt()
1529
ri = RFCIndex()
1630

31+
odf = bdo.load_data()
1732

18-
def rfc_author_data(rfc):
33+
def rfc_author_data(rfc, normalize = True):
1934
record = {}
2035

2136
record["title"] = rfc.title
@@ -39,11 +54,16 @@ def rfc_author_data(rfc):
3954
for author in dt.document_authors(draft):
4055
person = dt.person(author.person)
4156

57+
affiliation = author.affiliation
58+
59+
if normalize:
60+
affiliation = normalize_affiliation(affiliation)
61+
4262
author = {
4363
"id": person.id,
4464
"country": author.country,
4565
"name": person.name,
46-
"affiliation": author.affiliation,
66+
"affiliation": affiliation,
4767
}
4868

4969
record["authors"].append(author)
@@ -164,7 +184,7 @@ def email_from_uri(email_uri):
164184
return m.group(1) if m else None
165185

166186

167-
dt = DataTracker(use_cache=True)
187+
dt = DataTracker()
168188

169189

170190
def get_group_histories(wg_name):
@@ -178,7 +198,7 @@ def get_group_histories(wg_name):
178198
group_role_histories = [
179199
dt.group_role_histories(
180200
group=grp_hist,
181-
name=dt.role_name(RoleNameURI("/api/v1/name/rolename/chair/")),
201+
name=dt.role_name(RoleNameURI(uri="/api/v1/name/rolename/chair/")),
182202
)
183203
for grp_hist in group_histories
184204
]
@@ -210,7 +230,7 @@ def leadership_ranges(group_acronym):
210230
for r in list(
211231
dt.group_role_histories(
212232
group=h,
213-
name=dt.role_name(RoleNameURI("/api/v1/name/rolename/chair/")),
233+
name=dt.role_name(RoleNameURI(uri="/api/v1/name/rolename/chair/")),
214234
)
215235
)
216236
]
@@ -234,3 +254,18 @@ def leadership_ranges(group_acronym):
234254
agged = agged.sort_values(by="datetime_max")
235255

236256
return ghcr_df, agged
257+
258+
259+
def normalize_affiliation(affil):
    """
    Normalize an author affiliation using the organizations dataset.

    Surrounding whitespace is stripped from ``affil`` and the result is
    passed to ``bdo.lookup_normalized`` together with the module-level
    ``odf`` table. When the lookup returns a value, that value is
    returned; otherwise the stripped affiliation is returned as is.

    ``None`` is returned unchanged, so a record that carries no
    affiliation does not raise ``AttributeError`` on ``.strip()``.

    Probably should be somewhere else.
    """
    # Nothing to normalize; keep the missing value as the caller had it.
    if affil is None:
        return None

    affil = affil.strip()

    lookup = bdo.lookup_normalized(affil, odf)

    # Fall back to the stripped input when the dataset has no match.
    return affil if lookup is None else lookup

0 commit comments

Comments
 (0)