-
Notifications
You must be signed in to change notification settings - Fork 0
/
assign_labels.py
155 lines (127 loc) · 6.99 KB
/
assign_labels.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
"""
Map arXiv subjects to class labels.
"""
from sklearn.model_selection import train_test_split
from write_partitions import FACTORS, SUBJECTS
# arXiv subject codes that count as AI-relevant when labeling articles
RELEVANT_SUBJECTS = [
    'cs.AI',
    'cs.CL',
    'cs.CV',
    'cs.LG',
    'cs.MA',
    'cs.RO',
    'stat.ML',
]
def label(df):
    """Attach all label and split columns derived from each article's arXiv categories.

    The steps, in order:

    1. ``indicate_subjects``: per-subject indicator columns named by arXiv abbreviation (e.g. 'cs_AI'), their
       'split_<subject>' columns, plus 'split_categories' and 'category_n'.
    2. ``label_any_ai``: the 'Any_AI' indicator and its 'split_Any_AI' train/dev/test assignment.
    3. ``label_multiclass``: 'primary_label' (e.g. 'cs_RO') and 'first_ai_label'.
    4. 'coarse_multiclass', 'cs_LG_MA' and 'multilabel' (explored but not used in paper).
    5. ``undersample``: one extra split column per factor in FACTORS, for 'Any_AI' and for each entry of SUBJECTS
       (used in ongoing work, not in paper).

    :param df: DataFrame of articles; the helpers called here read its 'categories', 'year' and 'id' columns.
    :return: The DataFrame with the new columns added. The helpers reset the index, so the returned frame has a
        default integer index.
    """
    # Create indicators for each relevant subject, named by arXiv abbreviation, e.g. 'cs_AI' (also creating
    # 'split_categories' and 'category_n' columns)
    df = indicate_subjects(df)
    df = label_any_ai(df)
    # We now have e.g. 'Any_AI' taking true/false and 'split_Any_AI' giving whether an article is in the
    # train/dev/test set; similarly for each subject

    # Add a 'primary_label' column giving the primary label like 'cs_RO'
    df = label_multiclass(df)

    # Collapse MA, which is small, into LG for 'coarse multiclass' label (explored but not used in paper)
    df['coarse_multiclass'] = df['primary_label'].apply(lambda x: 'cs_LG' if x == 'cs_MA' else x)

    # Add a 'first_ai_label' column giving the first AI-relevant label (used for descriptives)
    df = label_multiclass(df, primary=False)
    # regex=False: the period must be replaced literally, not interpreted as a regex wildcard (the default for
    # 'regex' differs between pandas versions)
    df['first_ai_label'] = df['first_ai_label'].str.replace('.', '_', regex=False)
    df['first_ai_label'] = df['first_ai_label'].apply(lambda x: 'cs_LG' if x == 'cs_MA' else x)

    # Add a 'multilabel' column giving an array of 0-K labels (explored but not used in paper)
    df['cs_LG_MA'] = df['cs_LG'] | df['cs_MA']
    df = add_multilabel(df)

    # Add undersample columns (used in ongoing work, not in paper)
    for f in FACTORS:
        df = undersample(df, label_col='Any_AI', new_split_col=f'split_Any_AI_{f}_1', factor=f)
        for subj in SUBJECTS:
            print(subj)
            df = undersample(df, label_col=subj, factor=f)
    return df
def label_any_ai(df):
    """Add the boolean "Any_AI" column and its train/dev/test split column.

    An article is positive when at least one entry of RELEVANT_SUBJECTS appears in its 'split_categories' list.
    The split assignment is delegated to ``split_label``, whose result is returned.
    """
    def has_relevant_subject(categories):
        # True as soon as one AI-relevant subject is found among the article's categories
        for subject in RELEVANT_SUBJECTS:
            if subject in categories:
                return True
        return False

    df['Any_AI'] = df['split_categories'].apply(has_relevant_subject)
    return split_label(df, label_col='Any_AI')
def label_multiclass(d, labels=('cs.AI', 'cs.CL', 'cs.CV', 'cs.LG', 'cs.MA', 'cs.RO'), primary=True):
    """Add a single-valued subject label column computed from 'split_categories'.

    With ``primary=True`` the column "primary_label" is written: the article's first category with the period
    replaced by an underscore (e.g. 'cs_RO') if that category is in ``labels``, else "Other".

    With ``primary=False`` the column "first_ai_label" is written instead: the first category, in listed order,
    that is in ``labels``, else "Other". In this mode the period is left as-is (e.g. 'cs.RO').

    The frame is modified in place and also returned.
    """
    categories = d['split_categories']

    if not primary:
        d['first_ai_label'] = categories.apply(
            lambda cats: next((c for c in cats if c in labels), 'Other'))
        return d

    def primary_or_other(cats):
        lead = cats[0]
        if lead in labels:
            return lead.replace('.', '_')
        return 'Other'

    d['primary_label'] = categories.apply(primary_or_other)
    return d
def indicate_subjects(df):
    """Create indicator columns for each label of interest.

    We consider stat.ML and cs.LG interchangeable. Ostensibly (see https://arxiv.org/corr/subjectclasses) papers are
    automatically cross-posted between the two, but in the data this isn't true for a small number of papers.

    Columns written: 'categories' (rewritten with stat.ML mapped to cs.LG), 'split_categories' (list of category
    strings), 'category_n' (number of distinct categories), and for each relevant subject other than stat.ML a
    boolean indicator such as 'cs_AI' together with its 'split_cs_AI' assignment from ``split_label``.

    :param df: DataFrame with a space-separated 'categories' string column (plus whatever ``split_label`` reads).
    :return: The DataFrame with the columns above added; ``split_label`` resets its index.
    """
    # Quick solution: replace all 'stat.ML' with 'cs.LG' - sometimes results in repeated 'cs.LG', but we don't
    # assume uniqueness. regex=False so the period is matched literally rather than as a regex wildcard.
    df['categories'] = df['categories'].str.replace('stat.ML', 'cs.LG', regex=False)
    df['split_categories'] = df['categories'].str.split(' ')
    # Count distinct categories, so a repeated 'cs.LG' is only counted once
    df['category_n'] = df['split_categories'].apply(lambda x: len(set(x)))
    for cat in RELEVANT_SUBJECTS:
        if cat == 'stat.ML':
            # Already folded into cs.LG above
            continue
        # Period is an illegal character for labels or columns somewhere downstream
        col = cat.replace('.', '_')
        # Literal substring match; with the default regex=True the '.' in e.g. 'cs.AI' would match any character
        df[col] = df['categories'].str.contains(cat, regex=False)
        df = split_label(df, col)
    return df
def add_multilabel(d, columns=('cs_AI', 'cs_CL', 'cs_CV', 'cs_LG_MA', 'cs_RO')):
    """Add a "multilabel" column listing, per row, the names of the ``columns`` whose value is truthy.

    Names appear in the order the columns occur in the frame. The frame is modified in place and also returned.
    """
    def active_labels(row):
        # Walk every cell of the row, keeping the names of the requested columns that are set
        return [name for name, flag in row.items() if name in columns and flag]

    d['multilabel'] = d.apply(active_labels, axis=1)
    return d
def undersample(d, label_col='Any_AI', split_col=None, new_split_col=None, split_value='train', factor=4, to_test=True):
    """Undersample negative examples to adjust class balance.

    The result is a new split column. From e.g. "split_cs_AI", which gives the assignment of papers to train/dev/test
    stratified by year and the cs_AI label, we create "split_cs_AI_4_1" for factor=4, representing new assignments in
    which negative class examples have been undersampled to achieve a 4:1 imbalance.

    :param d: DataFrame holding the boolean ``label_col``, the ``split_col`` and a 'year' column (used to stratify
        the choice of negatives to remove).
    :param label_col: Name of the boolean label column.
    :param split_col: Existing split column; defaults to ``f'split_{label_col}'``.
    :param new_split_col: Column to create; defaults to ``f'split_{label_col}_{factor}_1'``.
    :param split_value: The split whose negatives are undersampled.
    :param factor: Target number of negatives per positive within ``split_value``.
    :param to_test: If true, removed negatives are reassigned to 'test'; otherwise their split is set to None.
    :return: ``d`` with ``new_split_col`` added. When ``factor`` is at least the existing neg:pos ratio there is
        nothing to remove and the new column is a plain copy of ``split_col``.
    :raises KeyError: If the ``split_value`` rows lack either positive or negative examples.
    :raises AssertionError: If the resulting ratio does not equal ``factor``.
    """
    if split_col is None:
        split_col = f'split_{label_col}'
    # We keep the original split_col and modify a copy
    if new_split_col is None:
        new_split_col = f'split_{label_col}_{factor}_1'
    d[new_split_col] = d[split_col]

    # Get the original ratio of negative to positive examples
    counts = d.loc[d[split_col] == split_value, label_col].value_counts()
    ratio = counts[False] / counts[True]
    print(f'Original class ratio is {ratio.round(1)}:1 neg:pos')
    if factor >= ratio:
        # We only undersample. Equality is included: the ratio is already at the target, and asking
        # train_test_split for a test set of 0 rows would raise.
        print(f'Doing nothing for "factor" >= {ratio.round(1)}')
        return d

    # Number of negative examples to remove so that the remaining ones give the ratio indicated by factor
    n_drop = counts[False] - counts[True] * factor
    negative_idx = d.loc[(d[split_col] == split_value) & (~d[label_col])].index
    # The "test" side of this split is the set of negatives to remove, sampled proportionally across years
    _, drop_idx = train_test_split(negative_idx, test_size=n_drop, random_state=20200212,
                                   stratify=d.loc[negative_idx, 'year'])

    # Either reassign the excluded examples to the test set or assign them to no set
    if to_test:
        d.loc[drop_idx, new_split_col] = 'test'
    else:
        d.loc[drop_idx, new_split_col] = None

    # Verify the new ratio of negative to positive examples
    new_counts = d.loc[d[new_split_col] == split_value, label_col].value_counts()
    new_ratio = new_counts[False] / new_counts[True]
    assert float(factor) == new_ratio
    print(f'New class ratio using "{new_split_col}" column is {new_ratio.round(1)}:1 neg:pos')
    return d
def split_label(df, label_col):
    """Assign each paper to the train, dev or test set for one label.

    The assignment is written to a new column named after the label: "cs_AI" gets "split_cs_AI". Both splitting
    steps are stratified jointly on 'year' and ``label_col`` and use a fixed random seed. A per-year, per-label
    count of the training rows (counting the 'id' column) is printed as a balance check.

    Returns the frame with a fresh default integer index and the new split column.
    """
    # The index labels returned by the splits are used with .loc below, so start from a default numeric index
    df = df.reset_index(drop=True)

    strata = ['year', label_col]
    seed = 20200126
    # 80/20 train/holdout first, then the holdout in half -> 80/10/10 train/dev/test
    train_i, holdout_i = train_test_split(df.index, train_size=.8, stratify=df[strata], random_state=seed)
    dev_i, test_i = train_test_split(holdout_i, train_size=.5, stratify=df.loc[holdout_i, strata],
                                     random_state=seed)

    split_col = f'split_{label_col}'
    df[split_col] = None
    for set_name, members in (('train', train_i), ('dev', dev_i), ('test', test_i)):
        df.loc[members, split_col] = set_name

    # Check balance
    in_train = df[split_col] == 'train'
    year_balance = (
        df.loc[in_train]
        .groupby(['year', label_col])
        .agg({'id': 'count'})
        .rename(columns={'id': 'count'})
    )
    print(year_balance)
    return df