Commit: rough fixes

sbenthall committed Jun 3, 2024
1 parent c04095d commit 5fdb473
Showing 4 changed files with 158 additions and 65 deletions.
37 changes: 21 additions & 16 deletions bigbang/analysis/affiliation.py
@@ -1,37 +1,42 @@
from bigbang.analysis.influence import *
from bigbang.analysis.utils import localize_to_utc

affil_start_date_col_name = 'Time start (mm/yyyy)'
affil_end_date_col_name = 'Time end (mm/yyyy)'
affil_affiliation_col_name = 'Affiliation'
affil_start_date_col_name = "Time start (mm/yyyy)"
affil_end_date_col_name = "Time end (mm/yyyy)"
affil_affiliation_col_name = "Affiliation"

def affiliated_influence(arx, affiliations, top_n = 50):

def affiliated_influence(arx, affiliations, top_n=50):
## this is defined in influence.py, and builds a sender_cat column
## based on email domain
augment(arx)

## this further looks up the email author in the affiliations table
## and modifies the sender_cat column
arx.data['sender_cat'] = arx.data.apply(
lambda mrow: lookup_affiliation(mrow['sender_cat'], mrow['Date'], affiliations),
axis=1)

arx.data["sender_cat"] = arx.data.apply(
lambda mrow: lookup_affiliation(mrow["sender_cat"], mrow["Date"], affiliations),
axis=1,
)

top_ddd = aggregate_activity(arx, top_n)

return top_ddd


def lookup_affiliation(name, date, affiliation_data):
"""
Find the affiliation of a name on a particular date,
given an affiliation data file.
"""
name_affils = affiliation_data[affiliation_data['Name'] == name]
date = localize_to_utc(date)
name_affils = affiliation_data[affiliation_data["Name"] == name]

date = localize_to_utc(date)

for na_row in name_affils.iterrows():
if date > na_row[1][affil_start_date_col_name] \
and date < na_row[1][affil_end_date_col_name]:
return na_row[1][affil_affiliation_col_name]

return name
if (
date > na_row[1][affil_start_date_col_name]
and date < na_row[1][affil_end_date_col_name]
):
return na_row[1][affil_affiliation_col_name]

return name
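
For orientation, a minimal usage sketch of affiliated_influence. This is hypothetical, not part of the commit: it assumes BigBang's Archive loader and an affiliation CSV using the column names defined above, and the list name and filename are illustrative.

# Hypothetical sketch; assumes bigbang.archive.Archive and an affiliations CSV.
import pandas as pd
from bigbang.archive import Archive
from bigbang.analysis.affiliation import affiliated_influence

arx = Archive("example-list")  # illustrative archive name
affiliations = pd.read_csv(
    "affiliations.csv",  # illustrative filename
    parse_dates=["Time start (mm/yyyy)", "Time end (mm/yyyy)"],
)

# lookup_affiliation compares these dates against a UTC-localized message
# date, so the table's dates presumably need to be timezone-aware as well.
for col in ["Time start (mm/yyyy)", "Time end (mm/yyyy)"]:
    affiliations[col] = affiliations[col].dt.tz_localize("UTC")

top_ddd = affiliated_influence(arx, affiliations, top_n=25)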
65 changes: 37 additions & 28 deletions bigbang/analysis/datatracker.py
@@ -18,11 +18,11 @@
def rfc_author_data(rfc):
record = {}

record['title'] = rfc.title
record['draft'] = rfc.draft
record['date'] = rfc.date()
record['wg'] = rfc.wg
record['docid'] = rfc.doc_id
record["title"] = rfc.title
record["draft"] = rfc.draft
record["date"] = rfc.date()
record["wg"] = rfc.wg
record["docid"] = rfc.doc_id

draft = None
if rfc.draft is not None:
@@ -33,45 +33,48 @@ def rfc_author_data(rfc):
draft = dt.document_from_rfc(rfc.doc_id)
if draft is not None:

record['draft-date'] = draft.time
record['authors'] = []

record["draft-date"] = draft.time
record["authors"] = []

for author in dt.document_authors(draft):
person = dt.person(author.person)

author = {
"id" : person.id,
"country" : author.country,
"name" : person.name,
"affiliation" : author.affiliation
"id": person.id,
"country": author.country,
"name": person.name,
"affiliation": author.affiliation,
}

record['authors'].append(author)
record["authors"].append(author)

record['revision'] = draft.rev
record["revision"] = draft.rev

return record

else:
return None


def authorship_from_rfc_data(rfc_data):
records = []

for author in rfc_data['authors']:
for author in rfc_data["authors"]:
author_record = author.copy()

author_record['draft'] = rfc_data['draft']
author_record['title'] = rfc_data['title']
author_record['date'] = rfc_data['date'].strftime('%Y-%m-%d') # format this to string!
author_record['wg'] = rfc_data['wg']
author_record['docid'] = rfc_data['docid']
author_record["draft"] = rfc_data["draft"]
author_record["title"] = rfc_data["title"]
author_record["date"] = rfc_data["date"].strftime(
"%Y-%m-%d"
) # format this to string!
author_record["wg"] = rfc_data["wg"]
author_record["docid"] = rfc_data["docid"]

records.append(author_record)

return records
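
Illustratively, each flat record produced here combines one author's fields with the RFC metadata (all values below are hypothetical):

# {"id": 12345, "country": "US", "name": "J. Doe",
#  "affiliation": "Example Corp.", "draft": "draft-example-00",
#  "title": "An Example Protocol", "date": "2024-01-01",
#  "wg": "example", "docid": "RFC9999"}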


def rfc_authors_from_working_group(acr):
"""
Get a dataframe of all authors of RFCs published
@@ -93,6 +96,7 @@ def rfc_authors_from_working_group(acr):

return df


def draft_authors_from_working_group(acr):
"""
Get a dataframe of all authors of drafts published
@@ -111,13 +115,15 @@ def draft_authors_from_working_group(acr):
# get drafts.
# filter by rfc status here?
for draft in dt.documents(
group=g, doctype=dt.document_type_from_slug("rfc") #"draft"
group=g, doctype=dt.document_type_from_slug("rfc") # "draft"
): # status argument
# interested in all submissions, or just the most recent?

if draft.rfc:
submissions = [dt.submission(sub_url) for sub_url in draft.submissions]
submissions = sorted(submissions, key=lambda s: s.submission_date, reverse=True)
submissions = sorted(
submissions, key=lambda s: s.submission_date, reverse=True
)

print(f"len(submissions) == {len(submissions)}")
if len(submissions) > 0:
@@ -142,12 +148,13 @@ def draft_authors_from_working_group(acr):
records.append(authors)

records = sum(records, [])
records = sorted(records, key=lambda x: x['rfc'])
records = sorted(records, key=lambda x: x["rfc"])

df = pd.DataFrame.from_records(records)

return df


em_re = "/api/v1/person/email/([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7})/"
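
Illustratively, this pattern captures the address portion of a datatracker email URI (the address shown is hypothetical):

import re

m = re.match(em_re, "/api/v1/person/email/jdoe@example.org/")
if m:
    print(m.group(1))  # jdoe@example.org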


@@ -195,10 +202,10 @@ def leadership_ranges(group_acronym):
{
"datetime_max": h.time,
"datetime_min": h.time,
#"email": email_from_uri(r.email.uri),
# "email": email_from_uri(r.email.uri),
"person_uri": r.person.uri,
"name": dt.person(r.person).name,
#"biography": dt.person(r.person).biography,
# "biography": dt.person(r.person).biography,
}
for r in list(
dt.group_role_histories(
@@ -213,15 +220,17 @@ def leadership_ranges(group_acronym):
gh_chair_records = sum(gh_chair_records, [])
ghcr_df = pd.DataFrame.from_records(gh_chair_records)

agged = ghcr_df.groupby(["name", "person_uri"]).agg( # "email", "biography"
agged = ghcr_df.groupby(["name", "person_uri"]).agg( # "email", "biography"
{"datetime_min": "min", "datetime_max": "max"}
)

## Minimum time is the first record.
#agged["datetime_min"].replace({ghcr_df["datetime_min"].min(): None}, inplace=True)
# agged["datetime_min"].replace({ghcr_df["datetime_min"].min(): None}, inplace=True)

## TODO: replace with current time
agged["datetime_max"].replace({ghcr_df["datetime_max"].max(): datetime.now(timezone.utc)}, inplace=True)
agged["datetime_max"].replace(
{ghcr_df["datetime_max"].max(): datetime.now(timezone.utc)}, inplace=True
)
agged = agged.sort_values(by="datetime_max")

return ghcr_df, agged
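
A sketch of how the two return values might be used; the call itself and the group acronym are illustrative, not part of this commit:

# ghcr_df holds one record per chair role history entry; agged holds the
# aggregated tenure range per (name, person_uri).
ghcr_df, agged = leadership_ranges("httpbis")
print(agged[["datetime_min", "datetime_max"]])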
52 changes: 31 additions & 21 deletions bigbang/analysis/influence.py
@@ -20,52 +20,57 @@
dd = domains.load_data()
odf = organizations.load_data()

good_categories = ["company", "academic", "sdo"] # not "generic"
good_categories = ["company", "academic", "sdo"] # not "generic"


def lookup_stakeholder_by_domain(domain):
"""
For an email domain, use the organization data provided in BigBang
to look up the organization name associated with that email domain.
"""
search = odf['email domain names'].apply(lambda dn: domain in str(dn))
search = odf["email domain names"].apply(lambda dn: domain in str(dn))

orgs = odf[search]
top_orgs = orgs[orgs['subsidiary of / alias of'].isna()]
if top_orgs.shape[0] > 0:
return top_orgs['name'].iloc[0]

top_orgs = orgs[orgs["subsidiary of / alias of"].isna()]

if top_orgs.shape[0] > 0:
return top_orgs["name"].iloc[0]
else:
return domain


def normalize_senders_by_domain(row):
try:
if dd.loc[row['domain']]['category'] in good_categories:
return lookup_stakeholder_by_domain(row['domain'])
if dd.loc[row["domain"]]["category"] in good_categories:
return lookup_stakeholder_by_domain(row["domain"])
else:
return parse.clean_from(row['From'])
return parse.clean_from(row["From"])
except Exception as e:
return parse.clean_from(row['From'])
return parse.clean_from(row["From"])


def is_affiliation(domain):
try:
if dd.loc[domain]['category'] in good_categories:
if dd.loc[domain]["category"] in good_categories:
return lookup_stakeholder_by_domain(domain)
else:
return "Unaffiliated"
except:
return "Unaffiliated"


def augment(arx):
"""
Add to an email archive's data three new columns: an email address,
an email domain, and the 'category' of the sender, which may be an
organization name, 'Unaffiliated', or a cleaned version of the email's
From field.
"""
arx.data['email'] = arx.data['From'].apply(utils.extract_email)
arx.data['domain'] = arx.data['From'].apply(utils.extract_domain)
arx.data['sender_cat'] = arx.data.apply(normalize_senders_by_domain, axis=1)
arx.data["email"] = arx.data["From"].apply(utils.extract_email)
arx.data["domain"] = arx.data["From"].apply(utils.extract_domain)
arx.data["sender_cat"] = arx.data.apply(normalize_senders_by_domain, axis=1)


def aggregate_activity(aarx, top_n):
"""
@@ -76,15 +81,20 @@ def aggregate_activity(aarx, top_n):
TODO: generalize this, with more flexible frequency.
TODO: Internalize the 'augment' preprocessing.
"""
grouped = aarx.data.groupby(['sender_cat', pd.Grouper(key='Date', freq='Y')]) \
.count().reset_index().sort_values('Date')

grouped = (
aarx.data.groupby(["sender_cat", pd.Grouper(key="Date", freq="Y")])
.count()
.reset_index()
.sort_values("Date")
)

ddd = grouped.pivot(columns="sender_cat", index="Date", values="From").fillna(0)

top_ddd = ddd[ddd.sum().sort_values(ascending=False)[:top_n].index]

return top_ddd


def influence_from_arx(arx, top_n):
"""
Return a dataframe with the annual influence of each organizational
@@ -93,5 +103,5 @@ def influence_from_arx(arx, top_n):
top_n = 50
augment(arx)
aaarx = aggregate_activity(arx, top_n)

return aaarx
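
A usage sketch under assumptions: arx is a BigBang Archive whose .data frame has 'From' and 'Date' columns, and the plot call is optional.

# Hypothetical: rank organizational senders by annual message count and plot.
top = influence_from_arx(arx, top_n=50)
top.plot(figsize=(12, 6))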