-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathScrapeComtrade.py
103 lines (79 loc) · 2.71 KB
/
ScrapeComtrade.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
"""
Alice Lepissier
alice.lepissier@gmail.com
July 2018
Risk-based IFF
Scrape the Comtrade website for data
This code must be run in batches, as Comtrade limits the API usage to 100 requests per hour.
"""
import os

import pandas as pd
import requests

try:
    # Legacy location of StringIO; this shim was removed in pandas 0.25.
    from pandas.compat import StringIO
except ImportError:
    from io import StringIO
# Every 'Data/Comtrade/...' path below is relative to the project folder.
os.chdir('C:/cloudstorage/googledrive/Projects/Tax Justice Network/Consultancy 2 - summer 18/Risk-based IFF')


def _download_batch(urls_csv, out_csv):
    """Download every request listed in one batch file and save the result.

    Parameters
    ----------
    urls_csv : str
        Path to a header-less CSV whose first column holds one GET URL
        per row.
    out_csv : str
        Path the stacked responses of this batch are written to.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV responses stacked in request order, each keeping its
        own row index. Empty if the batch file yields no URLs.

    Raises
    ------
    requests.HTTPError
        If a request returns a 4xx/5xx status. Failing here keeps an error
        body from being parsed as CSV and merged into the data.
    """
    urls = pd.read_csv(urls_csv, header=None)[0].tolist()

    # Collect the per-request frames and stack them once at the end:
    # DataFrame.append no longer exists in pandas 2.x, and calling it inside
    # the loop re-copied the accumulated data on every request.
    frames = []
    for url in urls:
        response = requests.get(url)
        response.raise_for_status()
        frames.append(pd.read_csv(StringIO(response.text)))

    # pd.concat rejects an empty list, so fall back to an empty frame.
    batch = pd.concat(frames, sort=False) if frames else pd.DataFrame()
    batch.to_csv(out_csv)
    return batch


# The module docstring notes a limit of 100 requests per hour, so the batches
# below are meant to be run one at a time rather than straight through.

# First batch
df1 = _download_batch('Data/Comtrade/Comtrade_GET_1-100.csv',
                      'Data/Comtrade/comtrade_1.csv')

# Second batch
df2 = _download_batch('Data/Comtrade/Comtrade_GET_101-200.csv',
                      'Data/Comtrade/comtrade_2.csv')

# Third batch
df3 = _download_batch('Data/Comtrade/Comtrade_GET_201-300.csv',
                      'Data/Comtrade/comtrade_3.csv')

# Fourth batch
df4 = _download_batch('Data/Comtrade/Comtrade_GET_301-400.csv',
                      'Data/Comtrade/comtrade_4.csv')

# Fifth batch
df5 = _download_batch('Data/Comtrade/Comtrade_GET_401-500.csv',
                      'Data/Comtrade/comtrade_5.csv')

# Sixth batch
df6 = _download_batch('Data/Comtrade/Comtrade_GET_501-508.csv',
                      'Data/Comtrade/comtrade_6.csv')

# Stack all batches into the final dataset; sort=False keeps the columns in
# order of first appearance instead of sorting them alphabetically.
df = pd.concat([df1, df2, df3, df4, df5, df6], sort=False)
df.to_csv('Data/Comtrade/comtrade.csv')