# To run this script, open a terminal and use "python3.9 run-search.py"
# To fix text encoding issues in MySQL, follow this guide: https://sebhastian.com/mysql-incorrect-string-value/ OR USE: ALTER TABLE tweepy_cashtags_main CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
import mysql.connector # For connecting to dataset
import tweepy
import sys # Needed for sys.exit() method
import pandas
import time # for the time.sleep() function, used to create a delay between searches; also used to track compute times
from mysql.connector import errorcode # Used in error checking on table creation.
from api_keys import apikey, apikeysecret, access_token, access_token_secret, config, createDBconfig, DB_name
from modules.CleanTickerList import cleaning, merge_and_export
from modules.createTables import create_tables
from modules.cursorModule import database_startup, cursor, db
# from modules.cursorModule import cursor2 -- cannot be imported if it has not been created; consider wrapping the import in a try/except, as sketched below.
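# A minimal sketch of the guarded import suggested above (an assumption, not part of the original
# flow): fall back to None if cursor2 has not been created in modules.cursorModule yet.
try:
    from modules.cursorModule import cursor2
except ImportError:
    cursor2 = None  # cursor2 does not exist yet; downstream code should check for None before using it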
##################### Set parameters and start counters #####################
print("#### TWEEPY Cashtag Scraper, MySQL version ####")
loop = 0 # start loop counter
starttime = int(time.perf_counter()) # start the time counter and convert the variable to an integer
################ Start Twitter connection ####################
auth = tweepy.OAuthHandler(apikey, apikeysecret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
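# Optional sanity check (an illustrative sketch, not part of the original flow): verify the OAuth
# credentials before scraping. Uncomment to use; it costs one extra API call per run.
# try:
#     api.verify_credentials()
#     print("Twitter authentication OK.")
# except tweepy.TweepError as auth_error:
#     sys.exit(f"Twitter authentication failed: {auth_error}")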
################ Functions ####################
def setup_user_table():
try:
cursor.execute("""CREATE TABLE `users` (
`user_id` BIGINT UNIQUE NOT NULL,
`id_str` varchar(255),
`screen_name` varchar(255),
`majority_lang` varchar(255),
`date_updated_ISO` varchar(255),
`date_updated_unix` BIGINT,
`cap_english` FLOAT(12),
`cap_universal` FLOAT(12),
`display_eng_astroturf` FLOAT(12),
`display_eng_fakefollower` FLOAT(12),
`display_eng_financial` FLOAT(12),
`display_eng_other` FLOAT(12),
`display_eng_overall` FLOAT(12),
`display_eng_selfdeclared` FLOAT(12),
`display_eng_spammer` FLOAT(12),
`display_universal_astroturf` FLOAT(12),
`display_universal_fakefollower` FLOAT(12),
`display_universal_financial` FLOAT(12),
`display_universal_other` FLOAT(12),
`display_universal_overall` FLOAT(12),
`display_universal_selfdeclared` FLOAT(12),
`display_universal_spammer` FLOAT(12),
`raw_eng_astroturf` FLOAT(12),
`raw_eng_fakefollower` FLOAT(12),
`raw_eng_financial` FLOAT(12),
`raw_eng_other` FLOAT(12),
`raw_eng_overall` FLOAT(12),
`raw_eng_selfdeclared` FLOAT(12),
`raw_eng_spammer` FLOAT(12),
`raw_universal_astroturf` FLOAT(12),
`raw_universal_fakefollower` FLOAT(12),
`raw_universal_financial` FLOAT(12),
`raw_universal_other` FLOAT(12),
`raw_universal_overall` FLOAT(12),
`raw_universal_selfdeclared` FLOAT(12),
`raw_universal_spammer` FLOAT(12),
`botolite_bs` FLOAT(12),
PRIMARY KEY (`user_id`)
) Engine = InnoDB DEFAULT CHARSET=utf8mb4
""")
print("Added a table.\n")
except Exception as e:
print (e,"\n")
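# A minimal sketch (an assumption, not used by the script) of how the imported `errorcode` module
# could distinguish "table already exists" from genuine failures when creating tables:
def table_exists_error(err):
    # True when a mysql.connector error simply means the table is already there.
    return isinstance(err, mysql.connector.Error) and err.errno == errorcode.ER_TABLE_EXISTS_ERROR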
def tweepy1_search(ST):
tweets_found = 0
for result2 in api.search(q=ST, result_type="recent", count=100, lang="en"): # the v1.1 search endpoint returns at most 100 tweets per request
#Data dictionary: https://developer.twitter.com/en/docs/twitter-api/v1/data-dictionary/object-model/tweet
# Details on possible parameters: https://developer.twitter.com/en/docs/twitter-api/v1/tweets/search/api-reference/get-search-tweets
#NOTE: This v1.1 api gets a "sample" of tweets from the past 7 days, upgrade API to get a "complete" list of tweets - https://developer.twitter.com/en/docs/twitter-api/v1/tweets/search/overview
#NOTE: Results must be consumed in a loop because the 'SearchResults' object is an iterable collection of status objects.
# Values that may be absent (potentially N/A) are computed outside the dict below so they can be wrapped in try/except.
# Parse the hashtag text out of each entity dictionary, then convert the list to a string so it can be stored in MySQL.
hashes = str([j["text"] for j in result2.entities["hashtags"]])
# Parse the cashtag text the same way: each item in entities["symbols"] is a dict whose "text" field is a cashtag.
cash = str([i["text"] for i in result2.entities["symbols"]])
# To build this parsing, indexing was tried first, then for-loops, then list comprehensions.
try: # only gets the first user mentioned
parsed_user_mentions_id = result2.entities["user_mentions"][0]["id"]
parsed_user_mentions_screen_name = result2.entities["user_mentions"][0]["screen_name"]
except (IndexError, KeyError): # no user mentions on this tweet
parsed_user_mentions_id = ""
parsed_user_mentions_screen_name = ""
try:
possibly_sensitive = result2.possibly_sensitive
except AttributeError: # the attribute is only present on tweets that contain a link or media
possibly_sensitive = ""
try: # only gets the first URL mentioned
url_in_tweet = result2.entities["urls"][0]["expanded_url"] # references the entities sub-object, its "urls" list, the first entry (0), then the "expanded_url" key.
except (IndexError, KeyError): # no URLs on this tweet
url_in_tweet = ""
df_dict = {"tweet_id": result2.id,
"tweet_id_str": result2.id_str,
"tweet_text_short": result2.text,
"created_at": result2.created_at,
"derived_URL": "https://www.twitter.com/"+result2.user.screen_name+"/status/"+result2.id_str,
"source": result2.source,
"truncated": result2.truncated,
"in_reply_to_screen_name": result2.in_reply_to_screen_name,
"in_reply_to_status_id": result2.in_reply_to_status_id,
"in_reply_to_user_id": result2.in_reply_to_user_id,
"is_quote_status": result2.is_quote_status,
"retweet_count": result2.retweet_count,
"tweet_language": result2.lang,
"user_id": result2.user.id,
"user_id_str": result2.user.id_str,
"name": result2.user.name,
"screen_name": result2.user.screen_name,
"location": result2.user.location,
"derived_account_url": "https://www.twitter.com/"+result2.user.screen_name,
"profile_link": result2.user.url,
"user_description": result2.user.description,
"protected": result2.user.protected,
"verified": result2.user.verified,
"followers_count": result2.user.followers_count,
"friends_count": result2.user.friends_count,
"listed_count": result2.user.listed_count,
"favourites_count": result2.user.favourites_count,
"statuses_count": result2.user.statuses_count,
"date_user_created": result2.user.created_at,
"user_utc_offset": result2.user.utc_offset,
"user_time_zone": result2.user.time_zone,
"geo_enabled": result2.user.geo_enabled,
"user_lang": result2.user.lang,
"contributors_enabled": result2.user.contributors_enabled,
"is_translator": result2.user.is_translator,
"default_profile": result2.user.default_profile,
"default_profile_image": result2.user.default_profile_image,
"favorite_count": result2.favorite_count,
"parsed_user_mentions_id" : parsed_user_mentions_id,
"parsed_user_mentions_screen_name": parsed_user_mentions_screen_name,
"possibly_sensitive" : possibly_sensitive,
"url_in_tweet" : url_in_tweet,
"cashtags" : cash,
"hashtags" : hashes,
"tweepy_search_term" : ST}
sql = """INSERT INTO `tweepy_cashtags_temp` (
`tweet_id`,
`tweet_id_str`,
`tweet_text_short`,
`created_at`,
`derived_URL`,
`source`,
`truncated`,
`in_reply_to_screen_name`,
`in_reply_to_status_id`,
`in_reply_to_user_id`,
`is_quote_status`,
`retweet_count`,
`tweet_language`,
`user_id`,
`user_id_str`,
`name`,
`screen_name`,
`location`,
`derived_account_url`,
`profile_link`,
`user_description`,
`protected`,
`verified`,
`followers_count`,
`friends_count`,
`listed_count`,
`favourites_count`,
`statuses_count`,
`date_user_created`,
`user_utc_offset`,
`user_time_zone`,
`geo_enabled`,
`user_lang`,
`contributors_enabled`,
`is_translator`,
`default_profile`,
`default_profile_image`,
`favorite_count`,
`parsed_user_mentions_id`,
`parsed_user_mentions_screen_name`,
`possibly_sensitive`,
`url_in_tweet`,
`cashtags`,
`hashtags`,
`tweepy_search_term`
)
VALUES (
%(tweet_id)s,
%(tweet_id_str)s,
%(tweet_text_short)s,
%(created_at)s,
%(derived_URL)s,
%(source)s,
%(truncated)s,
%(in_reply_to_screen_name)s,
%(in_reply_to_status_id)s,
%(in_reply_to_user_id)s,
%(is_quote_status)s,
%(retweet_count)s,
%(tweet_language)s,
%(user_id)s,
%(user_id_str)s,
%(name)s,
%(screen_name)s,
%(location)s,
%(derived_account_url)s,
%(profile_link)s,
%(user_description)s,
%(protected)s,
%(verified)s,
%(followers_count)s,
%(friends_count)s,
%(listed_count)s,
%(favourites_count)s,
%(statuses_count)s,
%(date_user_created)s,
%(user_utc_offset)s,
%(user_time_zone)s,
%(geo_enabled)s,
%(user_lang)s,
%(contributors_enabled)s,
%(is_translator)s,
%(default_profile)s,
%(default_profile_image)s,
%(favorite_count)s,
%(parsed_user_mentions_id)s,
%(parsed_user_mentions_screen_name)s,
%(possibly_sensitive)s,
%(url_in_tweet)s,
%(cashtags)s,
%(hashtags)s,
%(tweepy_search_term)s
);"""
# Data needs to be in the following dictionary format for mySQL
data = {
'tweet_id':df_dict["tweet_id"],
'tweet_id_str':df_dict["tweet_id_str"],
'tweet_text_short':df_dict["tweet_text_short"],
'created_at':df_dict["created_at"],
'derived_URL':df_dict["derived_URL"],
'source':df_dict["source"],
'truncated':df_dict["truncated"],
'in_reply_to_screen_name':df_dict["in_reply_to_screen_name"],
'in_reply_to_status_id':df_dict["in_reply_to_status_id"],
'in_reply_to_user_id':df_dict["in_reply_to_user_id"],
'is_quote_status':df_dict["is_quote_status"],
'retweet_count':df_dict["retweet_count"],
'tweet_language':df_dict["tweet_language"],
'user_id':df_dict["user_id"],
'user_id_str':df_dict["user_id_str"],
'name':df_dict["name"],
'screen_name':df_dict["screen_name"],
'location':df_dict["location"],
'derived_account_url':df_dict["derived_account_url"],
'profile_link':df_dict["profile_link"],
'user_description':df_dict["user_description"],
'protected':df_dict["protected"],
'verified':df_dict["verified"],
'followers_count':df_dict["followers_count"],
'friends_count':df_dict["friends_count"],
'listed_count':df_dict["listed_count"],
'favourites_count':df_dict["favourites_count"],
'statuses_count':df_dict["statuses_count"],
'date_user_created':df_dict["date_user_created"],
'user_utc_offset':df_dict["user_utc_offset"],
'user_time_zone':df_dict["user_time_zone"],
'geo_enabled':df_dict["geo_enabled"],
'user_lang':df_dict["user_lang"],
'contributors_enabled':df_dict["contributors_enabled"],
'is_translator':df_dict["is_translator"],
'default_profile':df_dict["default_profile"],
'default_profile_image':df_dict["default_profile_image"],
'favorite_count':df_dict["favorite_count"],
'parsed_user_mentions_id':df_dict["parsed_user_mentions_id"],
'parsed_user_mentions_screen_name':df_dict["parsed_user_mentions_screen_name"],
'possibly_sensitive':df_dict["possibly_sensitive"],
'url_in_tweet':df_dict["url_in_tweet"],
'cashtags':df_dict["cashtags"],
'hashtags':df_dict["hashtags"],
'tweepy_search_term':df_dict["tweepy_search_term"]
}
try:
cursor.execute(sql, data)
# print ("Row added!")
tweets_found += 1
except Exception as e:
print (e)
db.commit()
print("Tweets found:", tweets_found)
# Clear out the Temp table from retweets before merging with the main table.
cursor.execute("DELETE FROM tweepy_cashtags_temp WHERE `tweet_text_short` LIKE 'RT @%'")
# print ("Done with:", ST)
merge_sql = """INSERT IGNORE tweepy_cashtags_main
SELECT * FROM tweepy_cashtags_temp
"""
# with db: # Combine temp table and main table
cursor.execute(merge_sql, params=None)
print ("Saved results from", ST, "to main table successful.")
# Delete the temp table to make way for a fresh extract
clear_sql = """DELETE FROM tweepy_cashtags_temp"""
cursor.execute(clear_sql, params=None)
# print ("Deleted temp table for:", ST)
def delete_RTs():
try:
cursor.execute("DELETE FROM tweepy_cashtags_main WHERE `tweet_text_short` LIKE 'RT @%'")
db.commit()
print ("Deleted retweets.")
except Exception as e:
print("Error with delete_RTs:", e)
def populate_users_table():
try:
cursor.execute("INSERT IGNORE INTO users (`user_id`) SELECT DISTINCT `user_id` FROM tweepy_cashtags_main")
count = cursor.rowcount
print ("Users table updated.\n")
print ("Affected rows:", count)
db.commit()
except Exception as e:
print("Populate users table error:", e)
def count_rows_users():
sql = ("SELECT COUNT(`user_id`) FROM users")
cursor.execute(sql)
result = cursor.fetchall() # Gets the result as a tuple. Can use "cursor.fetchone()" to get a single row.
print ("The number of unique accounts are:", result[0][0],"\n")
def count_rows_tweets():
sql = ("SELECT COUNT(`tweet_id`) FROM tweepy_cashtags_main")
cursor.execute(sql)
result = cursor.fetchall() # Gets the result as a tuple. Can use "cursor.fetchone()" to get a single row.
print ("The number of Tweets saved is:", result[0][0])
##################### Main Block #####################
stocks_df = pandas.read_csv(r'tickerlist.csv') # Import the stock list and use it as the search terms
ST = stocks_df.loc[:, 'cashtags'] # select the 'cashtags' column (a pandas Series) to iterate over. ST is shorthand for "search term".
#print (ST) # Uncomment for debugging.
# ST = ["$QAN"] # Uncomment for debugging. NOTE: This MUST be a tuple or a list. If left as a string, the loop below will iterate over each letter.
for master_loop in range(1,300): # range(x, y) runs from x up to, but not including, y, in increments of 1.
print ("Checking/Creating database tables...")
create_tables()
print ("Counting the number of users...")
count_rows_users()
print ("Counting the number of rows...")
count_rows_tweets()
print ("Deleting retweets...")
delete_RTs()
count_rows_tweets()
setup_user_table()
populate_users_table()
for item in ST: #Loop through each ticker symbol.
try_search = 0
max_tries = 10
while try_search < max_tries:
try:
tweepy1_search(item)
print (f"Success! on {item}, search try number: {try_search}")
break
except tweepy.error.TweepError as save_error:
time.sleep(10)
print (save_error)
try_search += 1
print (f"Failed on {item}, retry number: {try_search}")
loop += 1
midtime = int(time.perf_counter()) # Print an interim search time
print ("Scraped tweets for:", item, "Searching for:", int((midtime-starttime)/60), "mins, or", int(((midtime-starttime)/60)/60), "hour/s.")
count_rows_tweets()
print ("Loop number:", loop, "Master loop:", master_loop, "\n")
print ("Sleeping 15mins before starting another master loop...")
time.sleep (900)
#show the total scan time
finishtime = int(time.perf_counter()) # convert the perf_counter float to an integer for easier reading
print("\nFinished scanning! Scantime:", int((finishtime-starttime)/60), "minutes. Or", int(((finishtime-starttime)/60)/60), "hour/s.")
db.commit()
db.close()