-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathnyt_headline.py
114 lines (85 loc) · 3.83 KB
/
nyt_headline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import os
import pandas as pd
import requests
import json
import time
import dateutil
import datetime
import urllib3
# Silence urllib3's InsecureRequestWarning: send_request() below calls
# requests.get(..., verify=False), which would otherwise warn on every request.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
def send_request(date):
    '''Send a request to the NYT Archive API for the given month.

    The API has two rate limits: 4,000 requests per day and 10 requests per
    minute. To stay under the per-minute limit this function always sleeps
    6 seconds after the call, whether it succeeded or failed.

    Args:
        date: Two-item sequence of strings ``(year, month)``,
            e.g. ``['2011', '10']``.

    Returns:
        The decoded JSON body, or None when the request failed, the server
        answered with an HTTP error status, or the body was not valid JSON.
    '''
    # base_url already ends with '/', so the year is appended directly
    # (an extra separator would produce '.../v1//<year>/...').
    base_url = 'https://api.nytimes.com/svc/archive/v1/'
    url = base_url + date[0] + '/' + date[1] + '.json?api-key=' + 'YOUR-KEY'
    try:
        # NOTE(review): verify=False disables TLS certificate verification, so
        # the API key travels over an unauthenticated connection. Kept as in
        # the original; confirm whether it is still required.
        response = requests.get(url, verify=False, timeout=60)
        # Treat 4xx/5xx answers (bad key, rate limit, ...) as failures instead
        # of handing an error payload to the parser.
        response.raise_for_status()
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as error:
        # ValueError covers JSON decoding errors on older versions of requests.
        # The URL is deliberately not printed because it contains the API key.
        print('Request for ' + str(date) + ' failed: ' + str(error))
        return None
    finally:
        # Sleep on every path so failed calls also count against the
        # 10-requests-per-minute budget.
        time.sleep(6)
def is_valid(article, date, start_date=None, end_date=None):
    '''Check that an article has a main headline and was published in range.

    Args:
        article: Mapping describing one article.
        date: Publication date of the article (``datetime.date``).
        start_date: Exclusive lower bound. Defaults to the module-level
            ``start`` set by the script entry point.
        end_date: Exclusive upper bound. Defaults to the module-level ``end``
            set by the script entry point.

    Returns:
        True when ``start_date < date < end_date`` and ``article['headline']``
        is a dict containing a ``'main'`` entry; False otherwise.
    '''
    # Fall back to the globals defined under ``if __name__ == "__main__"`` so
    # existing two-argument calls keep working unchanged.
    lower = start if start_date is None else start_date
    upper = end if end_date is None else end_date
    is_in_range = lower < date < upper
    # .get() keeps an article without any 'headline' key from raising KeyError.
    headline = article.get('headline')
    has_headline = isinstance(headline, dict) and 'main' in headline
    return is_in_range and has_headline
def parse_response(response):
    '''Parse an Archive API response into a pandas data frame.

    Only articles accepted by ``is_valid`` are kept.

    Args:
        response: Decoded JSON body; the articles are read from
            ``response['response']['docs']``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``headline``, ``date``,
        ``doc_type``, ``material_type``, ``section`` and ``keywords`` (the
        list of keyword values whose name is ``'subject'``).
    '''
    # ``import dateutil`` alone does not guarantee that the ``parser``
    # submodule is loaded, so import it explicitly where it is used.
    from dateutil import parser as date_parser

    # One list per output column, filled in lockstep for every kept article.
    data = {'headline': [],
            'date': [],
            'doc_type': [],
            'material_type': [],
            'section': [],
            'keywords': []}
    articles = response['response']['docs']
    for article in articles:
        date = date_parser.parse(article['pub_date']).date()
        if not is_valid(article, date):
            continue
        data['date'].append(date)
        data['headline'].append(article['headline']['main'])
        # The value is stored under 'section_name'; the original tested for a
        # different key ('section') before reading this one.
        data['section'].append(article.get('section_name'))
        # Optional fields fall back to None so that one incomplete article
        # does not abort the whole month.
        data['doc_type'].append(article.get('document_type'))
        data['material_type'].append(article.get('type_of_material'))
        keywords = [keyword['value']
                    for keyword in article.get('keywords') or []
                    if keyword['name'] == 'subject']
        data['keywords'].append(keywords)
    return pd.DataFrame(data)
def get_data(dates):
    '''Download, parse and save NYT Archive API data for the given months.

    Each month that is not already on disk is requested, parsed and written
    to ``./output/headlines/<year>-<month>.csv``. All months downloaded during
    this call are also written together to ``./output/unified_headlines.csv``.

    Args:
        dates: Sequence of ``[year, month]`` string pairs,
            e.g. ``[['2011', '10'], ['2011', '11']]``.
    '''
    if not dates:
        # Nothing to do; also avoids indexing into an empty sequence below.
        print('No dates given; nothing to download.')
        return

    total = 0
    frames = []
    output_dir = './output/headlines'
    unified_csv_path = './output/unified_headlines.csv'
    print('Date range: ' + str(dates[0]) + ' to ' + str(dates[-1]))
    # makedirs also creates the parent './output' directory when it is missing.
    os.makedirs(output_dir, exist_ok=True)
    for date in dates:
        print('Working on ' + str(date) + '...')
        csv_path = output_dir + '/' + date[0] + '-' + date[1] + '.csv'
        if os.path.exists(csv_path):
            # This month was already downloaded; skip it.
            continue
        response = send_request(date)
        if response is None:
            continue
        df = parse_response(response)
        frames.append(df)
        total += len(df)
        df.to_csv(csv_path, index=False)
        print('Saving ' + csv_path + '...')
    # NOTE(review): as in the original, the unified file only contains months
    # downloaded by this call, not months that were already on disk. It is left
    # untouched when nothing new was downloaded.
    if frames:
        # Concatenate once at the end instead of re-copying inside the loop.
        pd.concat(frames).to_csv(unified_csv_path, index=False)
    print('Number of articles collected: ' + str(total))
if __name__ == "__main__":
    # Time interval used for downloading headlines from the NYT.
    # `start` and `end` are module-level names that is_valid() reads.
    start = datetime.date(2011, 10, 1)
    end = datetime.date.today()
    print('Start date: ' + str(start))
    print('End date: ' + str(end))
    # List of [year, month] string pairs for every month that starts between
    # `start` and `end`. Necessary because the Archive API serves one month per
    # request. Built from the year/month attributes rather than
    # strftime("%-m"), a platform-specific directive that fails on Windows.
    months = [[str(month.year), str(month.month)]
              for month in pd.date_range(start, end, freq='MS')]
    get_data(months)