authors_extractor.py
import json

from plot_utils import *  # presumably provides tsne_plot
from utility import *     # presumably provides md5, list_to_file, lnfi, data_field_stats
from pyspark import SparkConf
from pyspark import SparkContext
# import pandas as pd  # needed only if the commented-out t-SNE example in __main__ is enabled

def authors_flat_map(x):
    """
    Create the list of author records for one publication.

    Arguments:
        x: a joined pair (id, (publication, topics))
    Returns:
        authors: one record per author of the publication, each carrying the
            publication metadata, the LDA topic vector and the coauthor list
    """
    authors = []
    topics_vector = x[1][1]['topics']
    publication = x[1][0]
    for author in publication['authors']:
        # Every other author of the same publication is a coauthor.
        coauthors = publication['authors'].copy()
        coauthors.remove(author)
        authors.append(dict(id=md5(author['name'] + publication['id']),
                            name=author['name'],
                            org=author['org'],
                            pub_id=publication['id'],
                            gt_id=author.get('id', ''),
                            keywords=publication['keywords'],
                            venue=publication['venue'],
                            year=publication['year'],
                            topics=topics_vector,
                            coauthors=coauthors))
    return authors
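
# A minimal sketch of the shape authors_flat_map expects and produces (the
# field names mirror the ones read above; the concrete values are hypothetical):
#
#   x = ('pub1',
#        ({'id': 'pub1',
#          'authors': [{'name': 'Alice', 'org': 'X', 'id': 'a1'},
#                      {'name': 'Bob', 'org': 'Y'}],
#          'keywords': ['lda'], 'venue': 'KDD', 'year': 2015},
#         {'id': 'pub1', 'topics': [0.1, 0.9]}))
#
#   authors_flat_map(x)  ->  two dicts, one per author; Alice's record lists
#   Bob as her only coauthor and vice versa, and both share the topic vector.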

def author_extractor(sc, input_data_path, lda_topics_path, output_data_path):
    """
    Extract the authors from the input publications and build the author dataset.

    Arguments:
        sc: the Spark context
        input_data_path: location of the JSON input file (the publication data)
        lda_topics_path: location of the JSON input file (the topics data)
        output_data_path: location of the JSON output file
    Returns:
        authors: an RDD of author records, as built by authors_flat_map
    """
    # Key both datasets by publication id, join them, and expand each
    # (publication, topics) pair into one record per author.
    data = sc.textFile(input_data_path).map(json.loads).map(lambda x: (x['id'], x))
    lda_topics = sc.textFile(lda_topics_path).map(json.loads).map(lambda x: (x['id'], x))
    authors = data.join(lda_topics).flatMap(authors_flat_map)
    print("Authors extracted: %s" % authors.count())
    list_to_file(authors.collect(), output_data_path)
    return authors
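
# Note: authors.collect() above materialises every record on the driver, so
# the output step only scales to datasets that fit in driver memory. A sketch
# of a distributed alternative, assuming JSON-lines part files are an
# acceptable output format:
#
#   authors.map(json.dumps).saveAsTextFile(output_data_path)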

if __name__ == '__main__':
    conf = SparkConf().setAppName('author extractor').setMaster('local[*]')
    sc = SparkContext(conf=conf)

    # author_extractor(sc=sc,
    #                  input_data_path="datasets/processed/aminer_wiw_pubs.json",
    #                  lda_topics_path="datasets/processed/aminer_wiw_pubs_lda_topics.json",
    #                  output_data_path="datasets/processed/aminer_wiw_authors.json")

    # papers = sc.parallelize(pd.read_json("datasets/processed/aminer_wiw_authors.json", lines=True).to_dict('records'))\
    #     .filter(lambda x: lnfi(x['name']) == 'zzhang')
    # tsne_plot(element=papers.collect(), output_path="results/tsne_plot_author_wiw_hyang")

    data = sc.textFile("datasets/processed/aminer_wiw_authors.json").map(json.loads)
    data_field_stats(data, "org", "string", "results/authors_orgs_stats.txt")
    data_field_stats(data, "keywords", "list", "results/authors_keywords.txt")