# main.py
import importlib
import subprocess
import sys

# Install missing third-party dependencies. Biopython is imported as 'Bio'
# but installed from PyPI as 'biopython'; stdlib modules such as argparse
# and time never need installing, so they are not checked here.
packages_to_check = {'Bio': 'biopython', 'tweepy': 'tweepy'}
for import_name, pip_name in packages_to_check.items():
    try:
        importlib.import_module(import_name)
        print(f"{import_name} is already installed.")
    except ImportError:
        print(f"{import_name} is not installed. Installing...")
        subprocess.run([sys.executable, '-m', 'pip', 'install', pip_name])
import time
import argparse
import tweepy
from Bio import Entrez
def get_args():
    parser = argparse.ArgumentParser(
        description='PubMed scraping bot',
        prog='pubmed_bot',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        '-doi',
        dest='doi_db',
        default='doi_db.txt',
        help='DOI database location')
    parser.add_argument(
        '-topic',
        dest='topic',
        default='topics.txt',
        help='Topic .txt file location')
    args = parser.parse_args()
    return args.doi_db, args.topic
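# Example invocation (the file names shown are just the argparse defaults
# above; any paths would work):
#
#   python main.py -doi doi_db.txt -topic topics.txt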
def publish_twitter(tweet_string):
    # Twitter/X API credentials; fill these in before running.
    api_key = 'INPUT YOUR TWITTER API KEY HERE'
    api_secret = 'INPUT YOUR SECRET TWITTER API KEY HERE'
    bearer_token = 'INPUT YOUR TWITTER BEARER TOKEN HERE'
    access_token = 'INPUT YOUR TWITTER ACCESS TOKEN HERE'
    secret_access = 'INPUT YOUR SECRET ACCESS TOKEN HERE'
    # Posting goes through the v2 Client with OAuth 1.0a user-context
    # credentials; no separate v1.1 API object is needed.
    client = tweepy.Client(bearer_token, api_key, api_secret,
                           access_token, secret_access)
    tweet = client.create_tweet(text=tweet_string)
    return tweet
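# create_tweet returns a tweepy Response whose .data mapping includes the
# new tweet's 'id' and 'text', e.g. (hypothetical value):
#
#   resp = publish_twitter('Hello from pubmed_bot')
#   print(resp.data['id'])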
def pubmed_scrape(query, bot_email, max_scrapes):
    article_list = {}
    Entrez.email = bot_email  # NCBI requires a contact email for Entrez use
    ncbi_scrape = Entrez.esearch(db='pubmed', term=query, retmax=max_scrapes)
    scrape_list = Entrez.read(ncbi_scrape)
    for id_num in scrape_list['IdList']:
        summary = Entrez.esummary(db='pubmed', id=id_num)
        read_summary = Entrez.read(summary)[0]  # esummary returns a one-item list
        doi = read_summary.get('DOI', 'Unknown DOI')
        title = read_summary.get('Title', 'Unknown Title')
        pub_date = read_summary.get('PubDate', 'Unknown PubDate')
        article_list[doi] = {'Title': title, 'PubDate': pub_date}
        time.sleep(3)  # stay well under NCBI's request rate limit
    return article_list
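# The returned mapping is keyed by DOI, e.g. (hypothetical entry):
#
#   {'10.1000/example123': {'Title': 'An Example Article',
#                           'PubDate': '2023 Jan 15'}}
#
# Note that articles whose summary lacks a DOI all share the 'Unknown DOI'
# key, so only one such article survives per query.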
def doi_checker(doi_to_check, doi_db_filename):
    # Load every DOI already recorded in the database file.
    with open(doi_db_filename, 'r') as doi_database:
        temp_dois = [entry.rstrip() for entry in doi_database]
    if doi_to_check in temp_dois:
        return True, doi_to_check
    # New DOI: record it so it is never tweeted twice.
    with open(doi_db_filename, 'a') as doi_database:
        doi_database.write(doi_to_check + '\n')
    return False, doi_to_check
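# doi_db.txt is a plain text file holding one DOI per line, e.g.
# (hypothetical contents):
#
#   10.1000/example123
#   10.1000/example456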
def string_formatter(title, doi):
    url = 'http://dx.doi.org/' + str(doi)
    prelim_tweet = f'{title} ({url})'
    # Keep a safety margin below Twitter's 280-character limit.
    if len(prelim_tweet) > 260:
        short_title = f'{title[:151]}...'
        tweet = f'{short_title} ({url})'
        if len(tweet) > 260 and len(url) > 110:
            # Even the truncated title cannot fit alongside the URL.
            tweet = f'{short_title} (URL too long to tweet)'
    else:
        tweet = prelim_tweet
    return tweet
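# Worked example (hypothetical title): a 300-character title with a
# 40-character URL gives a 343-character preliminary tweet, so the title
# is cut to 151 characters plus '...', yielding 154 + 3 + 40 = 197
# characters including the space and parentheses, which fits.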
if __name__ == '__main__':
    doi_db, topic_list = get_args()
    t_list = []
    new_tweets = 0
    # Read the search topics, one per line.
    with open(topic_list, 'r') as topic_file:
        for line in topic_file:
            t_list.append(line.rstrip())
    print(f'Topic list: {t_list}')
    for t in t_list:
        print(f'Performing PubMed search for "{t}"...')
        bot_search = pubmed_scrape(t, 'INPUT YOUR EMAIL HERE', 15)
        for doi, info in bot_search.items():
            status, doi = doi_checker(doi, doi_db)
            if not status:
                print(f'{doi} not found in database - preparing to tweet')
                to_tweet = string_formatter(info['Title'], doi)
                publish_twitter(to_tweet)
                print('Published to Twitter')
                new_tweets += 1
                time.sleep(10)  # pause between tweets
            else:
                print(f'{doi} already in database')
                time.sleep(0.5)
    print(f'Query complete, {new_tweets} new tweets published - returning to sleep')
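# topics.txt is a plain text file holding one PubMed query per line, e.g.
# (hypothetical contents):
#
#   CRISPR gene editing
#   antibiotic resistance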