# Twitter Sentiment.py
import numpy as np
import pandas as pd
import io
import requests
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from textblob import TextBlob, Word
import sklearn.ensemble as skens
from wordcloud import WordCloud
from scipy import stats
import pandas_datareader as pdr  # connect to Yahoo Finance
from datetime import datetime
import gensim  # word2vec model for text similarity
# ----------------------------------------------------------------------------
# 1. load datasets
# ----------------------------------------------------------------------------
# get adjusted IBM minute-level stock prices for April 7, 2009
ibm_url = 'https://raw.githubusercontent.com/xinyexu/SI618-Individual-Project/master/IBM_Adjusted_20090407.csv'
ibm_string_file = requests.get(ibm_url).content
ibm = pd.read_csv(io.StringIO(ibm_string_file.decode('utf-8')))
# get Twitter data (dated April-June 2009)
twitter_pos = '/Users/xuxinye/Desktop/SI 618 Project/trainingandtestdata/twitter information.csv'
twitter = pd.read_csv(twitter_pos, header=None, engine='python')
twitter.columns = ['Polarity', 'ID', 'Date', 'Query', 'User', 'Tweet']
# engine='python' avoids decoding errors; alternatively use encoding='latin1':
# https://stackoverflow.com/questions/18171739/unicodedecodeerror-when-reading-csv-file-in-pandas-with-python
# convert the date strings to datetime (slow)
twitter.loc[:,'Date'] = pd.to_datetime(twitter.Date, errors = 'ignore')
print(twitter.Date.describe())
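# A possible speed-up for the parse above (an added sketch, assuming the raw
# strings follow the Sentiment140 layout 'Mon Apr 06 22:19:45 PDT 2009'):
# strip the constant timezone token and pass an explicit format so pandas
# does not have to infer it row by row.
# twitter.loc[:, 'Date'] = pd.to_datetime(
#     twitter.Date.str.replace(' PDT ', ' ', regex=False),
#     format='%a %b %d %H:%M:%S %Y')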
# ----------------------------------------------------------------------------
# 2. explore dataframe
# ----------------------------------------------------------------------------
# show the distribution of tweet lengths
sns.set_style("whitegrid", {'axes.grid': True})
sns.distplot(twitter.Tweet.str.len(), kde=False).set_title('Tweet Length Dist')  # distplot is deprecated in newer seaborn
plt.xlim(0, 175)
plt.show(block=True)
# group by day to check which dates are present
check_date = twitter.set_index('Date').groupby(pd.Grouper(freq='D')).count()
check_date_wona = check_date[(check_date.T != 0).any()].copy()  # drop empty days
# find weekdays: Monday is 0 and Sunday is 6
check_date_wona.loc[:, 'Weekdays'] = [d.weekday() for d in check_date_wona.index]
print('Check weekdays', '\n', check_date_wona.Weekdays.value_counts())
# show the number of tweets per date
plt.bar(check_date_wona.index, check_date_wona.Tweet)
plt.xlabel('Date')
plt.ylabel('Number of Tweets')
plt.title('Number of tweets per date')
plt.show(block=True)
# show common words in people's tweets with a word cloud
text = ' '.join(twitter.Tweet)  # join with spaces so adjacent tweets do not run together
wordcloud = WordCloud(max_font_size=40).generate(text)
plt.figure(figsize=(25,10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show(block=True)
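# A quick tabular complement to the word cloud (an added sketch): count the
# most frequent tokens directly.
from collections import Counter
print(Counter(text.lower().split()).most_common(20))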
# show the distribution of sentiment on each date
twitter_cate = twitter.copy()
twitter_cate.loc[:, 'Polarity'] = twitter_cate.loc[:, 'Polarity'].astype(object)
twitter_cate = pd.get_dummies(twitter_cate, columns=['Polarity'])
check_senti = twitter_cate.set_index('Date').groupby(pd.Grouper(freq='D'))[['Polarity_0', 'Polarity_4']].sum()
check_summary = pd.concat([check_date_wona, check_senti], axis = 1, join = 'inner')
check_summary.loc[:, 'Pol_0_rat'] = check_summary.Polarity_0 / check_summary.Polarity
check_summary.loc[:, 'Pol_4_rat'] = check_summary.Polarity_4 / check_summary.Polarity
x = check_summary.index
ax = plt.subplot(111)
ax.set_title('Polarity Neg vs Pos', fontsize=10)
ax.bar(x, check_summary.Pol_0_rat,color='b',align='center', label='Polarity Neg')
ax.bar(x, check_summary.Pol_4_rat, bottom=check_summary.Pol_0_rat, color='y',align='center', label='Polarity Pos')
ax.xaxis_date()
plt.legend(bbox_to_anchor=(0.05,0.8), loc='center left')
# plt.plot(check_summary.index, check_summary[['Polarity_0', 'Polarity_4']], kind="bar")
# sns.barplot(check_summary.index, check_summary[['Polarity_0', 'Polarity_4']]).set_title('Polarity dist during this period')
plt.show(block=True)
# We found that the data from 2009-05-29 onward contain only negative sentiment
check_short = check_summary[check_summary.index<='2009-05-28']
print(len(check_short))
print(check_short.Pol_4_rat.mean())
print('check weekdays', '\n', check_short.Weekdays.value_counts())
# ----------------------------------------------------------------------------
# 3. find the best way to estimate sentiment
# ----------------------------------------------------------------------------
# sort the Twitter dataset by time, ascending
twitter_asc = twitter.sort_values(by=['Date'], ascending=True)
twitter_asc = twitter_asc.reset_index(drop=True)  # drop the original index
# split train (first 80% of dates) and test by index:
train_end = check_date_wona.index[round(len(check_date_wona)*0.8)] # Timestamp('2009-06-16 00:00:00')
train_end_index = twitter_asc[twitter_asc.loc[:, 'Date'] > train_end].index[0] # 758080
twitter_train = twitter_asc.loc[0:train_end_index-1,:]
twitter_test = twitter_asc.loc[train_end_index:,:]
print('length of twitter_train:', len(twitter_train), '; length of twitter_test:', len(twitter_test),
      '; train share of all data:', len(twitter_train)/len(twitter_asc))
# create text vectorizer
# vectorizer = CountVectorizer(min_df=.001, max_df=.8, stop_words='english')
vectorizer = CountVectorizer(stop_words='english')
train_dtm = vectorizer.fit_transform(twitter_train.Tweet)
test_dtm = vectorizer.transform(twitter_test.Tweet)
# (a) Naive Bayes
# train Naive Bayes
nb = MultinomialNB()
nb.fit(train_dtm, twitter_train.Polarity)
# Evaluate Results
# train accuracy
pred_polarity_train = nb.predict(train_dtm)
accuracy_score(twitter_train.Polarity, pred_polarity_train) # 0.8509537252005065
# test accuracy
pred_polarity = nb.predict(test_dtm)
accuracy_score(twitter_test.Polarity, pred_polarity) # 0.9242709021810972
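# roc_auc_score is imported above but never used; an added sketch scoring the
# NB probability of the positive class (label 4):
prob_pos = nb.predict_proba(test_dtm)[:, list(nb.classes_).index(4)]
print('NB test AUC:', roc_auc_score((twitter_test.Polarity == 4).astype(int), prob_pos))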
# class_log_prior_
print(nb.class_log_prior_)
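# TfidfVectorizer is imported above but never used; as an added sketch (not
# part of the original analysis), the same Naive Bayes can be refit on TF-IDF
# features for comparison with raw counts.
tfidf = TfidfVectorizer(stop_words='english')
train_tfidf = tfidf.fit_transform(twitter_train.Tweet)
test_tfidf = tfidf.transform(twitter_test.Tweet)
nb_tfidf = MultinomialNB().fit(train_tfidf, twitter_train.Polarity)
print('TF-IDF NB test accuracy:',
      accuracy_score(twitter_test.Polarity, nb_tfidf.predict(test_tfidf)))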
# (b) TextBlob package for sentiment
# example:
sample = twitter_train.Tweet[0]
print('Tweet: ',sample)
sample_est = TextBlob(sample).polarity
print('Real Polarity: ', twitter_train.Polarity[0], '; Est Polarity: ', sample_est)
# other example
# train[['text']].sample(10).assign(sentiment=lambda x: x.text.apply(estimate_polarity)).sort_values('sentiment')
# Evaluate results for TextBlob
# map TextBlob polarity in [-1, 1] to the labels 0 (negative) or 4 (positive);
# neutral polarity (exactly 0) falls into the negative class here
def textblob_scale(text):
    textblob_pol = TextBlob(text).polarity
    if textblob_pol > 0:
        return 4
    else:
        return 0
# train
pred_polarity_textblob_train = twitter_train.Tweet.apply(textblob_scale)
accuracy_score(twitter_train.Polarity, pred_polarity_textblob_train)  # 0.6460162515829464
# test
pred_polarity_textblob = twitter_test.Tweet.apply(textblob_scale)
accuracy_score(twitter_test.Polarity, pred_polarity_textblob)  # 0.688956130204891
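# confusion_matrix is imported above but never used; an added check to see
# where TextBlob disagrees with the labels (rows: true 0/4, cols: predicted):
print(confusion_matrix(twitter_test.Polarity, pred_polarity_textblob))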
# another way, without the helper function:
# test_set = twitter_test.Tweet.apply(lambda x: TextBlob(x).sentiment.polarity)
# accuracy_score(twitter_test.Polarity, (test_set > 0).astype(int) * 4)  # map to 0/4
# (c) Random Forest
# quick test on a small subset:
# train_dtm = vectorizer.fit_transform(twitter_train.Tweet[:100])
rf_model = skens.RandomForestClassifier(n_estimators=10, oob_score=False, criterion='entropy')
# oob_score: whether to use out-of-bag samples to estimate generalization accuracy on unseen data.
rf_model.fit(train_dtm, twitter_train.Polarity)
# Evaluate results for Random Forest
# train accuracy
pred_polarity_train_RF = rf_model.predict(train_dtm)
accuracy_score(twitter_train.Polarity, pred_polarity_train_RF) # 0.8242256753904601
# test accuracy
pred_polarity_RF = rf_model.predict(test_dtm)
accuracy_score(twitter_test.Polarity, pred_polarity_RF) # 0.8706316093853271
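# An added sketch (not in the original analysis): inspect which tokens the
# forest relies on most via feature_importances_.
top = np.argsort(rf_model.feature_importances_)[::-1][:10]
vocab = np.array(vectorizer.get_feature_names_out())  # get_feature_names() in sklearn < 1.0
print(vocab[top])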
# find the best RF model with optimal parameters
params = {
    'n_estimators': [5, 10, 15, 20, 25],
    # 'max_depth': [2, 5, 7, 9],
}
RF_CV = GridSearchCV(rf_model, cv=10, param_grid=params) # , iid = True
RF_CV.fit(train_dtm,twitter_train.Polarity)
print(RF_CV.best_estimator_)
print(RF_CV.cv_results_['mean_test_score'])
print(RF_CV.cv_results_['mean_test_score'].max())
# # find the best NB model with optimal parameters
# # (the tree parameters above do not apply to MultinomialNB; tune the
# # smoothing parameter alpha instead)
# params = {
#     'alpha': [0.1, 0.5, 1.0, 2.0],
# }
# NB_CV = GridSearchCV(nb, cv=10, param_grid=params)
# NB_CV.fit(train_dtm, twitter_train.Polarity)
# print(NB_CV.cv_results_['mean_test_score'])
# ----------------------------------------------------------------------------
# 4. relationship between IBM stock price and all tweet sentiment
# ----------------------------------------------------------------------------
# for the training dates, use the true polarity labels
# add Date and Time together
ibm.loc[:, 'DateTime'] = pd.to_datetime(ibm.Date + ' ' + ibm.Time)
ibm = ibm.set_index('DateTime')
# average tweet polarity per minute; with labels 0 (neg) and 4 (pos), a balanced minute averages 2
# pd.groupby frequencies: http://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html
twitter_1min = twitter.set_index('Date').groupby(pd.Grouper(freq='min')).Polarity.mean()
# the data from 2009-05-29 onward contain only negative sentiment, so drop them
twitter_1min_short = twitter_1min[twitter_1min.index <= '2009-05-28']
twitter_1min_wona = twitter_1min_short.dropna()
# merge with the per-minute Twitter sentiment averages
ibm_and_twe = pd.merge(pd.DataFrame(ibm.Close), pd.DataFrame(twitter_1min_wona),
                       left_index=True, right_index=True, how='outer')
ibm_and_twe_wona = ibm_and_twe.dropna()
ibm_and_twe = pd.concat([ibm.Close, twitter_1min_wona], axis=1, join='inner')
# most tweets in the merged dataset were posted at night, outside trading hours,
# so aggregate to a daily sentiment score instead
twitter_1d = twitter.set_index('Date').groupby(pd.Grouper(freq='D')).Polarity.mean()
# the data from 2009-05-29 onward contain only negative sentiment, so drop them
twitter_1d_short = twitter_1d[twitter_1d.index <= '2009-05-28']
twitter_1d_wona = twitter_1d_short.dropna()
# get IBM daily prices to merge with
ibm_daily = pdr.get_data_yahoo(symbols='IBM', start=datetime(2009, 4, 6), end=datetime(2009, 5, 28))
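# Note: pdr.get_data_yahoo may fail against the current Yahoo API; a common
# substitute (an assumption, requires the yfinance package) would be:
# import yfinance as yf
# ibm_daily = yf.download('IBM', start='2009-04-06', end='2009-05-29')  # end is exclusive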
# merge with the Twitter daily sentiment averages
ibm_daily_twe = pd.concat([ibm_daily[['Adj Close']], twitter_1d_wona], axis=1, join='inner')
# find the optimal lag (in days) of Polarity against Adj Close by Spearman correlation,
# smoothing Polarity with a moving average over the previous lag days
spearman = []
for i in range(10):
    pol_lag = ibm_daily_twe[['Polarity']].rolling(window=i+1).mean()
    spearman.append(stats.spearmanr(pol_lag, ibm_daily_twe[['Adj Close']], nan_policy='omit')[0])
optimal_roll = spearman.index(max(spearman)) + 1
print('max spearman: ', max(spearman), 'optimal lag: ', optimal_roll)
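# spearmanr also returns a p-value, worth checking with so few daily
# observations (an added check, not in the original analysis):
print(stats.spearmanr(ibm_daily_twe[['Polarity']].rolling(window=optimal_roll).mean(),
                      ibm_daily_twe[['Adj Close']], nan_policy='omit'))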
# Spearman correlation plots
# lag Polarity by one day
ibm_daily_twe.loc[:, 'Polarity'] = ibm_daily_twe[['Polarity']].shift(1)
g = sns.JointGrid(data=ibm_daily_twe, x='Adj Close', y='Polarity')
g = g.plot(sns.regplot, sns.distplot)
# g = g.set(xlabel='Adj Close', ylabel='Lag 1 Polarity')
g = g.annotate(stats.spearmanr)  # JointGrid.annotate was removed in seaborn >= 0.11
plt.ylabel("Lag 1 Polarity")
plt.show(block=True)
# smooth Polarity with the optimal rolling window (8 days on the original run)
pol_lag = ibm_daily_twe[['Polarity']].rolling(window=optimal_roll).mean()
g = sns.JointGrid(ibm_daily_twe[['Adj Close']], pol_lag)
g = g.plot(sns.regplot, sns.distplot)
g = g.annotate(stats.spearmanr)
plt.xlabel("Adj Close")
plt.ylabel("Lag 8 Polarity")
plt.show(block=True)
# ----------------------------------------------------------------------------
# 5. relationship between IBM stock price and relevant tweet sentiment
# ----------------------------------------------------------------------------
# Find More Relevant Tweets with IBM
# set IBM key words to filter relevant tweets (add 'IBM' itself)
# IBM industries: https://www.ibm.com/industries?lnk=min
ibm_key = 'Aerospace, defense, Automotive, Banking, financial markets, Chemicals, Construction, Education, ' \
          'Electronics, Energy, utilities, Government, Healthcare, Insurance, Life, sciences, Manufacturing,' \
          ' Metals, mining, Oil, gas, Retail, Consumer, Products, Telecommunications, media, entertainment, ' \
          'Travel, transportation, IBM'
# split into individual words: iterating over the raw string would loop over characters
ibm_keywords = ibm_key.replace(',', ' ').split()
# restrict the tweet dataset to dates on or before 2009-05-28
twitter_bef_0528 = twitter_asc[twitter_asc.Date<='2009-05-28']
print(len(twitter_bef_0528))
# load the pre-trained word2vec model
# from https://github.com/eyaler/word2vec-slim
w2v_mod = gensim.models.KeyedVectors.load_word2vec_format("/Users/xuxinye/Downloads/618_07_NLP/"
                                                          "GoogleNews-vectors-negative300-SLIM.bin", binary=True)
# train my own word2vec model;
# input is a sequence of sentences, each a list of words (utf-8 strings):
sentences = [sens.split() for sens in twitter_bef_0528.Tweet]
sentences.append(ibm_keywords)  # add the IBM key words so they get vectors
mymodel = gensim.models.Word2Vec(sentences, min_count=10)
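# Sanity check on the self-trained embeddings (an added sketch): nearest
# neighbours of one keyword, guarded in case it fell below min_count.
try:
    print(mymodel.wv.most_similar('IBM', topn=5))
except KeyError:
    print("'IBM' not in the trained vocabulary")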
# score similarity with the pre-trained model
total_score = []
for row in range(3):
    score = []
    for i in twitter_bef_0528.Tweet[row].split():
        for ibm_word in ibm_keywords:  # iterate over key words, not over characters of the raw string
            try:
                score.append(w2v_mod.similarity(i, ibm_word))
            except KeyError:
                score.append(0)
    total_score.append(np.mean(score))
print(total_score)
# original run printed: [0.0370971685863098, 0.030408827317550534, 0.03274327167493441]
# words not in the vocabulary score 0 above; a more sophisticated approach is
# implemented in fastText (now also integrated into gensim): break the unknown
# word into smaller character n-grams and assemble its vector from the vectors
# of these n-grams.
# https://radimrehurek.com/gensim/models/fasttext.html
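# A sketch of that fastText route (an added example, not run here): trained
# like Word2Vec, but able to build vectors for unseen words from character
# n-grams.
# from gensim.models import FastText
# ft_model = FastText(sentences, min_count=10)
# ft_model.wv.similarity(...) then works even for out-of-vocabulary words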
# score the same tweets with my own model
total_score = []
for row in range(3):
    score = []
    for i in twitter_bef_0528.Tweet[row].split():
        for ibm_word in ibm_keywords:
            try:
                score.append(mymodel.similarity(i, ibm_word))  # gensim >= 4: mymodel.wv.similarity
            except KeyError:
                score.append(0)
    total_score.append(np.mean(score))
print(total_score)
# original run printed: [0.050589945092463284, 0.05649464393900006, 0.036852518259192654]
# score all tweets with my model (slow; see the faster sketch below)
total_score = []
for row in range(len(twitter_bef_0528)):
    score = []
    for i in twitter_bef_0528.Tweet[row].split():
        for ibm_word in ibm_keywords:
            try:
                score.append(mymodel.similarity(i, ibm_word))
            except KeyError:
                score.append(0)
    total_score.append(np.mean(score))
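# A faster alternative (an added sketch): keep only in-vocabulary key words and
# compare each tweet against them with gensim's n_similarity (cosine between
# mean vectors) instead of the word-by-word double loop.
# kw_in_vocab = [w for w in ibm_keywords if w in mymodel.wv]
# total_score = []
# for tweet in twitter_bef_0528.Tweet:
#     tokens = [t for t in tweet.split() if t in mymodel.wv]
#     total_score.append(mymodel.wv.n_similarity(tokens, kw_in_vocab) if tokens else 0.0)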
twitter_simi_ibm = pd.concat([twitter_bef_0528, pd.Series(total_score)], axis = 1, join = 'inner')
# twitter_simi_ibm.to_csv('twitter_simi_score.csv', index=False)
twitter_simi_ibm = twitter_simi_ibm.rename(columns={0: 'Similarity_ibm'})
# summary statistics of the similarity distribution
print(twitter_simi_ibm.Similarity_ibm.describe())
benchmark = np.mean(twitter_simi_ibm.Similarity_ibm)
twitter_simi_ibm_overmean = twitter_simi_ibm[twitter_simi_ibm.Similarity_ibm >= benchmark]
# aggregate to a daily sentiment score
twitter_1d_ibm = twitter_simi_ibm_overmean.set_index('Date').groupby(pd.Grouper(freq='D')).Polarity.mean()
# drop days without any relevant tweets (the data are already cut at 2009-05-28)
twitter_1d_ibm_wona = twitter_1d_ibm.dropna()
# merge with the IBM-related daily sentiment averages
ibm_related_twe = pd.concat([ibm_daily[['Adj Close']], twitter_1d_ibm_wona], axis=1, join='inner')
# find the optimal lag (in days) of Polarity against Adj Close by Spearman correlation,
# smoothing Polarity with a moving average over the previous lag days
spearman = []
for i in range(12):
    pol_lag = ibm_related_twe[['Polarity']].rolling(window=i+1).mean()
    spearman.append(stats.spearmanr(pol_lag, ibm_related_twe[['Adj Close']], nan_policy='omit')[0])
optimal_roll = spearman.index(max(spearman)) + 1
print('max spearman: ', max(spearman), 'optimal lag: ', optimal_roll)
# Spearman correlation plots
# lag Polarity by one day
ibm_related_twe.loc[:, 'Polarity'] = ibm_related_twe[['Polarity']].shift(1)
g = sns.JointGrid(data=ibm_related_twe, x='Adj Close', y='Polarity')
g = g.plot(sns.regplot, sns.distplot)
# g = g.set(xlabel='Adj Close', ylabel='Lag 1 Polarity')
g = g.annotate(stats.spearmanr)
plt.ylabel("Lag 1 Polarity")
plt.show(block=True)
# smooth Polarity with the optimal rolling window (10 days on the original run)
pol_lag = ibm_related_twe[['Polarity']].rolling(window=optimal_roll).mean()
g = sns.JointGrid(ibm_related_twe[['Adj Close']], pol_lag)
g = g.plot(sns.regplot, sns.distplot)
g = g.annotate(stats.spearmanr)
plt.xlabel("Adj Close")
plt.ylabel("Lag 10 Polarity")
plt.show(block=True)
# alternative dataset:
# load a second dataset with sentiment scores and IBM prices
ibm_url_new = 'https://raw.githubusercontent.com/aficionado/upordown/master/data/ibm_sentiment.csv'
ibm_string_file_new = requests.get(ibm_url_new).content
ibm_new = pd.read_csv(io.StringIO(ibm_string_file_new.decode('utf-8')))
sns.pairplot(ibm_new[['Volume', 'Close', 'Bullish', 'Bearish']])
plt.show(block=True)