pub_crawler.py
'''
Part 1: Crawler

Biopython must be installed on the system; use either
pip install biopython or conda install biopython.

The link below gives a nice example of how to query by key term:
http://biopython.org/DIST/docs/tutorial/Tutorial.html#sec158

This one shows how to query by date:
https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
'''
import re
import csv

from Bio import Entrez, Medline

# Enter your email (required by NCBI Entrez)
Entrez.email = "garret.munoz@uth.tmc.edu"
def date_converter(input_date):
    '''
    input_date: string date in the format MM/DD/YYYY
    Returns the date as a string in the format YYYY/MM/DD.
    Basic range checks are applied, but impossible calendar dates
    such as 02/30 are not caught.
    '''
    # Check that the input matches the required date format exactly
    match = re.fullmatch("[0-1][0-9]/[0-3][0-9]/[0-9]{4}", input_date)
    if match is not None:
        output_date = input_date.split('/')
        output_date = output_date[2] + '/' + output_date[0] + '/' + output_date[1]
    else:
        raise Exception("Date must be in format MM/DD/YYYY")
    if int(input_date[0:2]) > 12 or int(input_date[0:2]) < 1:
        raise Exception("Month must be between 1 and 12")
    if int(input_date[3:5]) > 31 or int(input_date[3:5]) < 1:
        raise Exception("Day must be between 1 and 31")
    return output_date
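
# The manual checks above still let impossible calendar dates (e.g. 02/30/2020)
# through. Below is a minimal alternative sketch, not part of the original script,
# that lets datetime.strptime do the validation instead. The name
# date_converter_strict is illustrative only.
from datetime import datetime

def date_converter_strict(input_date):
    '''
    Same contract as date_converter (MM/DD/YYYY in, YYYY/MM/DD out),
    but rejects dates that do not exist on the calendar.
    '''
    try:
        parsed = datetime.strptime(input_date, "%m/%d/%Y")
    except ValueError as err:
        raise Exception("Date must be a valid date in format MM/DD/YYYY") from err
    return parsed.strftime("%Y/%m/%d")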
def pubmed_crawl(key_term, mindate, maxdate):
    '''
    Searches PubMed for publications matching a key term between mindate and maxdate.
    key_term: string value of the topic to be searched
    mindate: minimum publication date in the format 'YYYY/MM/DD'
    maxdate: maximum publication date in the format 'YYYY/MM/DD'
    Note: mindate and maxdate use a different date format than the one given in
    the specifications (MM/DD/YYYY); date_converter() cleans user input into the
    YYYY/MM/DD format that Entrez requires and enforces the 10-character length.
    '''
    # mindate must be less than or equal to maxdate
    # (string comparison works because the dates are YYYY/MM/DD)
    assert (maxdate >= mindate), 'End date must be after or on start date'
    # query for the IDs matching the key term
    handle = Entrez.esearch(db="pubmed", term=key_term, retmax=100000,
                            datetype='edat', mindate=mindate, maxdate=maxdate)
    record = Entrez.read(handle)
    idlist = record['IdList']
    handle.close()
    article_count = len(idlist)
    # retrieve publication data for the IDs in batches of 1000
    records = []
    retstart_list = [x for x in range(0, article_count, 1000)]
    for retstart in retstart_list:
        handle = Entrez.efetch(db="pubmed", id=idlist,
                               rettype="medline", retmode="text",
                               retstart=retstart, retmax=1000)
        txt_output = Medline.parse(handle)
        for record in txt_output:
            records.append(record)
        handle.close()
    return records
# Parse the PMID, authors, abstract, and publication date from the fetched
# records and export the results to CSV (handled by the functions below).
def pubmed_parser(input_record):
    '''
    Extracts the PMID, title, authors, abstract, and publication date from a
    single Medline record. Returns None if a required field is missing.
    '''
    pmid = int(input_record['PMID'])
    try:
        title = input_record['TI']
    except KeyError:
        # some articles have no title; skip them
        return None
    try:
        abstract = input_record['AB']
    except KeyError:
        # some articles have no abstract
        abstract = ''
    try:
        pub_date = input_record['EDAT'][0:10]
    except KeyError:
        return None
    try:
        authors = input_record['FAU']
    except KeyError:
        return None
    output_record = {'PMID': pmid, 'Authors': authors,
                     'Pub_Date': pub_date, 'Abstract': abstract, 'Title': title}
    return output_record
# test function
def hiv_crawl_test():
    key_word = 'HIV'
    min_date = '01/01/2020'
    max_date = '03/01/2020'
    min_date = date_converter(min_date)
    max_date = date_converter(max_date)
    pub_data = pubmed_crawl(key_word, min_date, max_date)
    return pub_data
def data_outputter(pub_data):
    '''Parses each fetched record and drops the ones missing required fields.'''
    output_pub_data = []
    for record in pub_data:
        parsed_record = pubmed_parser(record)
        if parsed_record is not None:
            output_pub_data.append(parsed_record)
    return output_pub_data
def pub_prompt(file_name):
    '''
    Prompts the user for a key word and a date range, runs the crawl, and
    writes the parsed results to file_name as CSV.
    '''
    assert (file_name.endswith('.csv')), 'file_name must be a .csv file'
    key_word = input('Key Word to search: ')
    assert (len(key_word) > 0), 'A key word must be used'
    min_date = input('Start date (MM/DD/YYYY): ')
    max_date = input('End date (MM/DD/YYYY): ')
    min_date = date_converter(min_date)
    max_date = date_converter(max_date)
    pub_data = pubmed_crawl(key_word, min_date, max_date)
    output_pub_data = data_outputter(pub_data)
    if len(output_pub_data) == 0:
        print('query outputted 0 publications; no file written')
        return False
    with open(file_name, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=output_pub_data[0].keys())
        writer.writeheader()
        writer.writerows(output_pub_data)
    print('query outputted {} publications'.format(len(output_pub_data)))
    print('file written to {}'.format(file_name))
    return True
if __name__ == "__main__":
    pub_prompt('publication_output.csv')
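
# A minimal, non-interactive usage sketch (not part of the original script):
# it reuses the same helpers to write a CSV without prompting. The key word,
# date range, and output file name below are illustrative assumptions.
def example_batch_run():
    min_date = date_converter('01/01/2020')
    max_date = date_converter('03/01/2020')
    pub_data = pubmed_crawl('HIV', min_date, max_date)
    rows = data_outputter(pub_data)
    with open('hiv_2020_q1.csv', 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=['PMID', 'Authors', 'Pub_Date',
                                               'Abstract', 'Title'])
        writer.writeheader()
        writer.writerows(rows)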