-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathnyt.py
162 lines (111 loc) · 4.34 KB
/
nyt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import http.client
import json
import os
from nltk import word_tokenize
class NYT:
    """Client for the New York Times archive API with an on-disk cache.

    Each (year, month) archive is fetched from ``api.nytimes.com`` at most
    once; the raw JSON text is stored in ``data_folder`` as
    ``<year>.<month>.json`` and reused on later calls.

    Attributes:
        conn: ``http.client.HTTPConnection`` to ``api.nytimes.com``.
        cfg: Parsed contents of the JSON config file; the ``'apikey'``
            entry is sent with every API request.
        data_folder: Folder holding the cached archive files.
    """

    def __init__(self, config_file='config.json', data_folder='nytapi_data/'):
        """Load the config file and make sure the cache folder exists.

        Args:
            config_file: Path to a JSON file; it must provide an
                ``'apikey'`` entry for API requests to work.
            data_folder: Folder for cached archives; created if missing.

        Raises:
            Exception: If the config file cannot be read or is not JSON.
        """
        # NOTE(review): this is a plain-HTTP connection, so the api key
        # travels unencrypted -- confirm whether HTTPSConnection should be
        # used instead.
        self.conn = http.client.HTTPConnection("api.nytimes.com")
        try:
            with open(config_file, 'r') as f:
                self.cfg = json.load(f)
        except (OSError, ValueError) as err:
            # json.JSONDecodeError and UnicodeDecodeError are ValueErrors.
            raise Exception(
                'Error loading config.json file. Does it exist and is it json?'
            ) from err
        # data folder
        self.data_folder = data_folder
        os.makedirs(data_folder, exist_ok=True)

    def get_archive(self, year=1851, month=12):
        """Return the parsed archive for ``year``/``month``.

        Uses the cached file when there is one; otherwise requests the
        archive from the API and caches the response text.

        Args:
            year: Archive year (int or numeric string).
            month: Archive month (int or numeric string).

        Returns:
            The decoded JSON document.
        """
        # os.path.join keeps the file inside data_folder whether or not the
        # folder name was given with a trailing slash.
        fname = os.path.join(self.data_folder, '{}.{}.json'.format(year, month))
        if os.path.isfile(fname):
            return json.loads(self.archives_local(year, month, fname))
        json_string = self.archives_apirequest(year, month)
        # Parse before caching so a malformed response is never stored.
        archive = json.loads(json_string)
        with open(fname, 'w', encoding='utf-8') as f:
            f.write(json_string)
        return archive

    def archives_local(self, year, month, fname):
        """Return the text of the cached archive file ``fname``.

        ``year`` and ``month`` are accepted for signature symmetry with
        ``archives_apirequest`` but are not used.
        """
        with open(fname, 'r', encoding='utf-8') as f:
            return f.read()

    def archives_apirequest(self, year, month):
        """Request one archive month from the API and return the body text.

        Args:
            year: Archive year.
            month: Archive month.

        Returns:
            The response body decoded as UTF-8.

        Raises:
            Exception: If the API answers with any status other than 200.
        """
        # make request to api using get variable "api-key"
        # NOTE(review): the leading "//" is kept exactly as in the original
        # request path -- confirm it is intended.
        self.conn.request(
            "GET",
            "//svc/archive/v1/{}/{}.json?api-key={}".format(
                year, month, self.cfg['apikey']),
        )
        r = self.conn.getresponse()
        # Read the body before checking the status so the connection can be
        # reused for the next request even when this one failed.
        body = r.read()
        self._raise_for_status(r.status)
        return body.decode("utf-8")

    @staticmethod
    def _raise_for_status(status):
        """Raise ``Exception`` unless ``status`` is 200."""
        # Compare by value: ``is`` on ints tests object identity, which is
        # not guaranteed to hold for equal numbers.
        if status == 403:
            # if there was an error it was probably the api key
            raise Exception('Api key was rejected. Did you paste it in config.json?')
        if status != 200:
            raise Exception('Error making request to nyt.com.')

    def get_headline_list(self, *args):
        """Return the tokenized headlines of one archive month.

        Args:
            *args: Passed straight through to ``get_archive`` (year, month).

        Returns:
            A list with one ``word_tokenize`` result per kept headline.
            Headlines whose text starts with ``'Front'`` or ``'Article'``
            are skipped (presumably untitled placeholder entries -- verify).
        """
        d = self.get_archive(*args)
        headlineList = []
        for doc in d['response']['docs']:
            title = doc['headline']['main']
            if title.startswith(('Front', 'Article')):
                continue
            headlineList.append(word_tokenize(title))
        return headlineList

    def get_year_headline_list(self, year=1853):
        """Return the twelve monthly headline lists (January..December) of ``year``."""
        return [self.get_headline_list(year, month) for month in range(1, 13)]

    def get_year_range_headline_list(self, startYear=1853, endYear=1862):
        """Return one year headline list per year in ``startYear..endYear`` inclusive."""
        return [self.get_year_headline_list(year)
                for year in range(startYear, endYear + 1)]

    @staticmethod
    def _year_month_spans(startYear, endYear, startMonth, endMonth):
        """Return ``(year, first_month, last_month)`` spans covering a period."""
        if startYear == endYear:
            # A period inside a single year is one span, not a
            # start-to-December span plus a January-to-end span.
            return [(startYear, startMonth, endMonth)]
        spans = [(startYear, startMonth, 12)]
        spans.extend((year, 1, 12) for year in range(startYear + 1, endYear))
        spans.append((endYear, 1, endMonth))
        return spans

    def get_time_period_headline_list(self, startYear=1990, endYear=1999,
                                      startMonth=1, endMonth=12):
        """Return year headline lists for a period bounded by specific months.

        Args:
            startYear: First year of the period.
            endYear: Last year of the period.
            startMonth: First month taken from ``startYear``.
            endMonth: Last month taken from ``endYear``.

        Returns:
            A list with one entry per year, each entry being the list of
            monthly headline lists that fall inside the period. When
            ``startYear == endYear`` the result has a single entry covering
            ``startMonth..endMonth``.
        """
        print("Compiling List of NYT headline Year Lists from "
              + str(startMonth) + ", " + str(startYear) + " to "
              + str(endMonth) + ", " + str(endYear) + "...")
        timePeriodHeadlineList = []
        for year, first, last in self._year_month_spans(
                startYear, endYear, startMonth, endMonth):
            timePeriodHeadlineList.append(
                [self.get_headline_list(year, month)
                 for month in range(first, last + 1)])
        return timePeriodHeadlineList
if __name__ == "__main__":
    import sys

    # Usage: nyt.py [YEAR [MONTH]]
    # Each argument falls back to its default on its own, so passing only a
    # year no longer fails on the missing month argument.
    yr = sys.argv[1] if len(sys.argv) > 1 else 1853
    mo = sys.argv[2] if len(sys.argv) > 2 else 12

    # initiate nytapi library
    nytapi = NYT()

    # make a request to get the archive
    d = nytapi.get_archive(year=yr, month=mo)

    # Print the first 11 headlines as a quick preview of the archive.
    for doc in d['response']['docs'][:11]:
        print(doc['headline']['main'])