-
Notifications
You must be signed in to change notification settings - Fork 0
/
Assignment1_Vinit_Shah.py
227 lines (169 loc) · 6.94 KB
/
Assignment1_Vinit_Shah.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
# coding: utf-8
# In[ ]:
#!/usr/bin/python3
import requests
def getData():
url = 'https://api.nytimes.com/svc/mostpopular/v2/mostshared/all-sections/1.json?api-key=INSERT KEY'
results = requests.get(url).json()
data = results['results']
return data
# #Deliverable
# An iPython notebook with the code, and with some text explaining the data source and your objective (use the "File" => "Download As" => "Notebook (.ipynb)". Ensure that you upload the ipynb file.
# The name of your database together with the IP address of your machine, so that we can connect and inspect your database
# A few screenshots of your database, showing the data that you store and how they change over time (this is as a fallback, just in case we cannot connect to your database).
#
# API used: New York Times Most Pupular (By Shares) & Watson's Alchemy API to extract URL Text Sentiment and URL Entity
# Database Name: NYTimes
# Table Name: Articles, Articles_Constant
# IP Address: 54.165.54.4
#
# The python code collects the most shared articles on New York Times using the NYT Most Popular API. The next step is to analyze the text using the Watson API. One of the thins I do is to extract the most relevant entity and then get the overall sentiment of the article. This is done to determine what type of sentiment is positive and what's negative. What's the opinion of the media about certain topics and what is shared commonly, positive sentiment articles or negative. What I observed is that more negative sentiment articles are shared in terms of number of articles shared, however positive sentiment articles though fewer in quantity are shared more often.
#
#
#
# In[ ]:
#SETTING UP THE DATABASE CONNECTION
import MySQLdb as mdb
import sys
con = mdb.connect(host = 'localhost',
user = 'root',
passwd = 'YEAH RIGHT',
charset='utf8', use_unicode=True);
# Check for existing database
##db_name = 'NYTimes'
#drop_db_query = "DROP DATABASE IF EXISTS {db}".format(db=db_name)
#cursor = con.cursor()
#cursor.execute(drop_db_query)
#cursor.close()
# Create the database
def createDB():
db_name = 'NYTimes'
create_db_query = "CREATE DATABASE IF NOT EXISTS {db} DEFAULT CHARACTER SET 'utf8'".format(db=db_name)
cursor = con.cursor()
cursor.execute(create_db_query)
cursor.close()
return
# In[ ]:
#Create time invariant table
def createInvariant():
cursor = con.cursor()
db_name = 'NYTimes'
table1_name = 'Articles_Constant'
# The {db} and {table} are placeholders for the parameters in the format(....) statement
create_table_query = '''CREATE TABLE IF NOT EXISTS {db}.{table}
(url varchar(250),
title varchar(250),
published_date datetime,
section varchar(250),
entity varchar(250),
sentiment_type varchar(250),
Shares int,
PRIMARY KEY(title)
)'''.format(db=db_name, table=table1_name)
cursor.execute(create_table_query)
cursor.close()
return
#CREATE timevaryingtable
def createVariant():
cursor = con.cursor()
db_name = 'NYTimes'
table2_name = 'Articles'
# The {db} and {table} are placeholders for the parameters in the format(....) statement
create2_table_query = '''CREATE TABLE IF NOT EXISTS {db}.{table}
(url varchar(250),
title varchar(250),
published_date datetime,
section varchar(250),
entity varchar(250),
sentiment_type varchar(250),
Shares int,
Time int,
PRIMARY KEY(title, Time),
FOREIGN KEY(title) REFERENCES {db}.Articles_Constant(title)
)'''.format(db=db_name, table=table2_name)
cursor.execute(create2_table_query)
cursor.close()
return
# In[ ]:
def getEntity(url):
watson_url = "http://gateway-a.watsonplatform.net/calls/url/URLGetRankedNamedEntities"
api_key = 'KEY??
headers = {
"Accept": "application/json"
}
parameters = {
'outputMode': 'json',
'apikey' : api_key,
'sentiment' :1,
'knowledgeGraph': 1,
'url': url
}
watson_resp = requests.post(watson_url, params=parameters, headers=headers)
watson_result = watson_resp.json()
watson_entity = str(watson_result["entities"][0]['text'])
return watson_entity
def getSentiment(url):
watson_url = "http://gateway-a.watsonplatform.net/calls/url/URLGetTextSentiment"
api_key = 'KEYY??'
headers = {
"Accept": "application/json"
}
parameters = {
'outputMode': 'json',
'apikey' : api_key,
'url': url
}
watson_resp = requests.post(watson_url, params=parameters, headers=headers)
watson_result = watson_resp.json()
watson_sentiment = str(watson_result["docSentiment"]["type"])
return watson_sentiment
# In[ ]:
#import data
import datetime
import time
def insertVariant():
query_template = '''INSERT IGNORE INTO NYTimes.Articles(url, title, published_date, section, entity, sentiment_type, Shares, Time)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s)'''
cursor = con.cursor()
data = getData()
for entry in data:
url = entry["url"]
title = entry["title"]
date = entry["published_date"]
section = entry["section"]
Shares = entry['total_shares']
entity = getEntity(url)
sentiment_type = getSentiment(url)
Time = int(time.time())
query_parameters = (url, title, date, section, entity, sentiment_type, Shares, Time)
cursor.execute(query_template, query_parameters)
con.commit()
cursor.close()
return
###################
def insertInvariant():
query_template = '''INSERT IGNORE INTO NYTimes.Articles_Constant(url, title, published_date, section, entity, sentiment_type, Shares)
VALUES (%s, %s, %s, %s, %s, %s, %s)'''
cursor = con.cursor()
data = getData()
for entry in data:
url = entry["url"]
title = entry["title"]
date = entry["published_date"]
section = entry["section"]
Shares = entry['total_shares']
entity = getEntity(url)
sentiment_type = getSentiment(url)
query_parameters = (url, title, date, section, entity, sentiment_type, Shares)
cursor.execute(query_template, query_parameters)
con.commit()
cursor.close()
return
# In[ ]:
createDB()
createInvariant()
insertInvariant()
createVariant()
insertVariant()
# In[ ]:
# In[ ]: