# Scrape_Guardian.py
# coding: utf-8
# In[ ]:
import json
import requests
import os
from os import makedirs
from os.path import join, exists
from datetime import date, timedelta
import re
from collections import defaultdict
import codecs
# Directory that holds one JSON file per downloaded day.
# NOTE(review): on Windows, join('C:', 'fashion') yields the drive-relative
# path 'C:fashion' (relative to the current directory on drive C), not
# 'C:\fashion' -- confirm which location is intended before changing it.
ARTICLES_DIR = join('C:','fashion')
# exist_ok=True lets the script be re-run: without it makedirs raises
# FileExistsError as soon as the directory is already there.
makedirs(ARTICLES_DIR, exist_ok=True)
# Sample URL
#
# http://content.guardianapis.com/search?from-date=2016-01-02&
# to-date=2016-01-02&order-by=newest&show-fields=all&page-size=200
# &api-key=your-api-key-goes-here
# Guardian content API credentials and default query parameters.
# The key is taken from the GUARDIAN_API_KEY environment variable when it is
# set and non-empty; the literal is kept only as a backward-compatible fallback.
# NOTE(review): a key committed to source control should be treated as
# leaked -- rotate it and remove the fallback once callers set the variable.
MY_API_KEY = os.environ.get('GUARDIAN_API_KEY') or "0044543a-4602-4f38-b4a9-52ed97c2d4e9"
# HTTPS so the api-key query parameter is not sent in clear text.
API_ENDPOINT = 'https://content.guardianapis.com/search'
my_params = {
    # NOTE(review): "section_name" looks like a placeholder -- presumably it
    # should be a real section id; confirm before running.
    'section': "section_name",
    # from-date / to-date are overwritten for each day before a request is made.
    'from-date': "",
    'to-date': "",
    'order-by': "newest",
    'show-fields': 'all',
    'page-size': 200,
    'show-elements': "image",
    'api-key': MY_API_KEY,
}
# day iteration from here:
# http://stackoverflow.com/questions/7274267/print-all-day-dates-between-two-dates
# Walk every day from start_date to end_date (inclusive) and store that day's
# search results in ARTICLES_DIR as <YYYY-MM-DD>.json. Days whose file already
# exists are skipped, so an interrupted run can simply be restarted.
start_date = date(2012, 1, 1)
end_date = date(2017, 10, 20)
# +1 so that end_date itself is part of the range.
dayrange = range((end_date - start_date).days + 1)
for daycount in dayrange:
    dt = start_date + timedelta(days=daycount)
    datestr = dt.strftime('%Y-%m-%d')
    fname = join(ARTICLES_DIR, datestr + '.json')
    if not exists(fname):
        # then let's download it
        print("Downloading", datestr)
        all_results = []
        # Restrict the query to this single day.
        my_params['from-date'] = datestr
        my_params['to-date'] = datestr
        current_page = 1
        # Real page count is unknown until the first response arrives.
        total_pages = 1
        while current_page <= total_pages:
            print("...page", current_page)
            my_params['page'] = current_page
            # A timeout keeps one stalled connection from hanging the whole run.
            resp = requests.get(API_ENDPOINT, params=my_params, timeout=30)
            # Fail loudly on HTTP errors instead of hitting a confusing
            # KeyError on an error payload below.
            resp.raise_for_status()
            data = resp.json()
            all_results.extend(data['response']['results'])
            # if there is more than one page
            current_page += 1
            total_pages = data['response']['pages']

        # The file is written only after every page was fetched, so a
        # failed day leaves no partial file behind and is retried next run.
        with open(fname, 'w', encoding='utf-8') as f:
            print("Writing to", fname)
            # re-serialize it for pretty indentation
            f.write(json.dumps(all_results, indent=2))