pubmed_data.py
import requests
import xml.etree.ElementTree as et
import traceback
import json
import utils
from os.path import join
import os
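
# Europe PMC REST endpoints: the first returns the full-text XML of an article
# given its PMCID; the second searches for open-access (open_access:y) articles
# with full text available (HAS_FT:y) and returns JSON results.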
europepmc_full_text_url = 'http://www.ebi.ac.uk/europepmc/webservices/rest/{}/fullTextXML'
europepmc_search_url = 'http://www.ebi.ac.uk/europepmc/webservices/rest/search?' \
'query={term}' \
'%20open_access:y' \
'%20HAS_FT:y' \
'&format=json' \
'&pageSize={pageSize}'


def get_pmc_paper_fulltext(pmcid):
    s = ''
    full_xml = requests.get(europepmc_full_text_url.format(pmcid)).content
    try:
        root = et.fromstring(full_xml)
        # print(iter_element_text(root))
        text_data = []
        # article title, terminated with a full stop so the sentence tokenizer splits it off
        telem = root.find('.//article-title')
        title = ''
        if telem is not None:
            title = ''.join(telem.itertext())
            title += '.'
        text_data.append(title)
        # abstract
        elem = root.find('.//abstract')
        if elem is not None:
            abstract = iterate_get_text(elem)
            text_data.append('abstract\n' + abstract)
        # article body
        elem = root.find('body')
        if elem is not None:
            text_data.append(iterate_get_text(elem))
        s = '\n'.join(text_data)
    except Exception:
        traceback.print_exc()
    return s
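
# Illustrative usage only (same PMCID as the commented-out call under __main__):
#   text = get_pmc_paper_fulltext('PMC5309427')
#   # -> title, abstract and body text joined by newlines, or '' if parsing fails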


def iterate_get_text(elem):
    """
    Get all inner text values of this element, taking care to
    1) ignore irrelevant nodes, e.g. tables
    2) make section/title texts identifiable by the sentence tokenizer

    Args:
        elem: an xml.etree.ElementTree.Element to traverse

    Returns:
        str: the concatenated text of the element and its descendants
    """
    remove_tags = ['table']
    line_break_tags = ['sec', 'title']
    if elem.tag in remove_tags:
        # skip the element and everything inside it, but keep any trailing text
        return elem.tail.strip() + ' ' if elem.tail is not None else ''
    s = ''
    local_str = elem.text.strip() + ' ' if elem.text is not None else ''
    if elem.tag in line_break_tags and elem.text is not None:
        # put section/title headings on their own line, terminated with a full stop
        local_str = '\n' + local_str + '.\n'
    s += local_str
    for e in list(elem):
        ct = iterate_get_text(e)
        s += (' ' + ct) if len(ct) > 0 else ''
    s += elem.tail.strip() + ' ' if elem.tail is not None else ''
    return s
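
# Illustrative example only (not part of the original module): on a small
# JATS-like fragment, the <title> text is placed on its own line and the
# <table> content is dropped.
#   frag = et.fromstring('<sec><title>Methods</title><p>We did X.</p>'
#                        '<table><tr><td>n=10</td></tr></table></sec>')
#   iterate_get_text(frag)  # -> roughly ' \nMethods .\n We did X. '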


def search_pmc(term, page_size=50):
    result_json = requests.get(europepmc_search_url.format(**{'term': term, 'pageSize': page_size})).content
    print(result_json)
    # the hits live under resultList.result in the Europe PMC search response
    return json.loads(result_json)['resultList']['result']


def do_download_pmc_full_text(pmcid, data_folder):
    text = get_pmc_paper_fulltext(pmcid)
    utils.save_string(text, join(data_folder, pmcid))


def dump_pmc_data(term, page_size, data_path):
    docs = search_pmc(term, page_size)
    utils.save_json_array(docs, join(data_path, 'pmc_docs.json'))
    # download the full text of every hit that has a PMCID, 10 downloads at a time
    utils.multi_thread_tasking([d['pmcid'] for d in docs if 'pmcid' in d], 10, do_download_pmc_full_text,
                               args=[join(data_path, 'fulltext')])
    print('done')


def dump_pmc_doc_ids(doc_json, file_path):
    docs = utils.load_json_data(doc_json)
    utils.save_string('\n'.join([d['pmcid'] for d in docs if 'pmcid' in d]), file_path)
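
# Expected on-disk layout (an assumption based on the calls above, since utils
# is an external helper module not shown here): dump_pmc_data writes the raw
# search hits to <data_path>/pmc_docs.json and one plain-text file per PMCID
# under <data_path>/fulltext/; dump_pmc_doc_ids writes the PMCIDs, one per
# line, to file_path.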


if __name__ == "__main__":
    # print(get_pmc_paper_fulltext('PMC5309427'))
    # search_pmc("parkinson's")
    dump_pmc_data("parkinson's", 20, './index_settings')
    # dump_pmc_doc_ids('./pumed_test/pmc_docs.json', './pumed_test/pmc_doc_ids.txt')