crawler.py
# -*- coding:utf-8 -*-
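# Crawler for the Caltrans PeMS site (http://pems.dot.ca.gov/): signs in with
# a user account, downloads each station's metadata page, and fetches
# 5-minute loop-detector time series for every station in a given list.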
import os
import requests
from datetime import datetime
from datetime import timedelta
from urllib.parse import quote_plus
import concurrent.futures

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
}
directory = os.path.dirname(os.path.abspath(__file__))

class spider:
    def __init__(self, username, password, proxies=False):
        self.home = "http://pems.dot.ca.gov/"
        self.s = requests.Session()
        if proxies:
            self.s.proxies = proxies
        self.username = username
        self.password = password

    def get(self, url):
        '''HTTP GET request with the shared headers.'''
        return self.s.get(url, headers=headers)

    def post(self, url, data):
        '''HTTP POST request with the shared headers.'''
        return self.s.post(url, data=data, headers=headers)

    def login(self):
        '''Sign in to PeMS with the stored credentials.'''
        print('trying to log in')
        data = {'username': self.username,
                'password': self.password,
                'login': 'Login',
                'redirect': ""}
        self.post(self.home, data)

    def download_station_time_series_5min(self, station_id, start_time, end_time, areaname):
        '''
        Download one station's time series data at a 5-minute interval.

        Parameters
        ----------
        station_id: str, e.g. 402000
        start_time: str, %Y%m%d%H%M, e.g. 201701010000
        end_time: str, %Y%m%d%H%M, e.g. 201701012359
        areaname: str, e.g. Bay; used as the output subdirectory name
        '''
        print('trying to download %s %s %s' % (station_id, start_time, end_time))
        s_time_id_f = datetime.strptime(start_time, "%Y%m%d%H%M").strftime("%m/%d/%Y+%H:%M")
        e_time_id_f = datetime.strptime(end_time, "%Y%m%d%H%M").strftime("%m/%d/%Y+%H:%M")
        delta = timedelta(days=1)
        # .timestamp() interprets these naive datetimes in local time; the
        # hard-coded 0800/0759 offsets reproduce the template's epoch values
        # (e.g. 1483228800 = 2017-01-01 00:00 UTC) only on a UTC+8 clock,
        # so they may need adjusting on machines in other timezones.
        s_time_id = str(int(datetime.strptime(datetime.strptime(start_time, "%Y%m%d%H%M").strftime("%Y%m%d") + "0800",
                                              "%Y%m%d%H%M").timestamp()))
        e_time_id = str(int(datetime.strptime((datetime.strptime(end_time, "%Y%m%d%H%M") + delta).strftime("%Y%m%d") + "0759",
                                              "%Y%m%d%H%M").timestamp()))
        # query-string template captured from a PeMS export request; the
        # station and time fields are overwritten below for each job
        data_str = '''report_form=1
dnode=VDS
content=loops
tab=det_timeseries
export=text
station_id=405572
s_time_id=1483228800
s_time_id_f=01%2F01%2F2017+00%3A00
e_time_id=1483315140
e_time_id_f=01%2F01%2F2017+23%3A59
tod=all
tod_from=0
tod_to=0
dow_0=on
dow_1=on
dow_2=on
dow_3=on
dow_4=on
dow_5=on
dow_6=on
holidays=on
q=flow
q2=
gn=5min
agg=on'''
        data = dict(line.strip().split('=') for line in data_str.split('\n'))
        data['station_id'] = station_id
        data['s_time_id'] = s_time_id
        data['e_time_id'] = e_time_id
        data['s_time_id_f'] = s_time_id_f
        data['e_time_id_f'] = e_time_id_f
        url = self.home + '?' + '&'.join('='.join((k, quote_plus(v, safe="+"))) for k, v in data.items())
        response = self.get(url)
        out_dir = os.path.normpath(os.path.join(directory, 'time_series/%s' % areaname))
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        # save as time_series/<areaname>/<station>_<start>_<end>.txt, the
        # same path that start() checks before scheduling a job
        with open(os.path.join(out_dir, '%s_%s_%s.txt' % (station_id, start_time, end_time)), 'w') as f:
            f.write(response.text)
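
    # Example (hypothetical values): after login(), one week of 5-minute flow
    # data for a single detector can be fetched directly, e.g.
    #   s = spider('user', 'pass')
    #   s.login()
    #   s.download_station_time_series_5min('402000', '201701010000', '201701072359', 'Bay')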

    def download_station_metadata(self, station, areaname):
        '''Download one station's configuration page as raw HTML.'''
        print('trying to download metadata of station %s' % station)
        url = "%s?station_id=%s&dnode=VDS&content=sta_cfg" % (self.home, station)
        r = self.get(url)
        out_dir = os.path.normpath(os.path.join(directory, 'station_metadata/%s' % areaname))
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        with open(os.path.join(out_dir, "%s.html" % station), "w") as f:
            f.write(r.text)

    def start(self, areaname, filename, start_time, end_time):
        '''
        Parameters
        ----------
        areaname: str, e.g. Bay
        filename: str, e.g. station.txt, one station id per line
        start_time, end_time: str, %Y%m%d
        '''
        # download every station's metadata, skipping files that already exist
        with open(filename, 'r') as f:
            station_list = f.read().strip().split('\n')
        meta_dir = os.path.normpath(os.path.join(directory, 'station_metadata/%s' % areaname))
        if not os.path.exists(meta_dir):
            os.makedirs(meta_dir)
        station_metadatas = os.listdir(meta_dir)
        for station in station_list:
            if "%s.html" % station in station_metadatas:
                continue
            self.download_station_metadata(station, areaname)
        # split [start_time, end_time) into one-week (start, end) string pairs
        start_time = start_time + "0000"
        t = datetime.strptime(start_time, "%Y%m%d%H%M")
        delta = timedelta(days=7)
        end_time = datetime.strptime(end_time, "%Y%m%d")
        timelist = []
        while t < end_time:
            tmp = [t.strftime("%Y%m%d") + "0000"]
            t = t + delta
            tmp.append((t - timedelta(days=1)).strftime("%Y%m%d") + "2359")
            timelist.append(tmp)
        # one job per (station, week) whose output file does not exist yet
        jobs = [(station, s, e)
                for station in station_list
                for s, e in timelist
                if not os.path.exists(os.path.normpath(os.path.join(directory, 'time_series/%s/%s_%s_%s.txt' % (areaname, station, s, e))))]
        # multi-threaded downloader; map each future to a readable job name
        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
            future_to_job = {executor.submit(self.download_station_time_series_5min, station, s, e, areaname): '_'.join((station, s, e))
                             for station, s, e in jobs}
            for future in concurrent.futures.as_completed(future_to_job):
                job_name = future_to_job[future]
                try:
                    future.result()
                except Exception as exc:
                    print('%r generated an exception: %s' % (job_name, exc))

if __name__ == "__main__":
    # optional proxy; pass the dict to spider() to route traffic through it
    proxies = {'http': 'http://127.0.0.1:1080'}
    username = None
    password = None
    if username is None:
        username = input('please input your username of PeMS: ')
    if password is None:
        password = input('please input your password: ')
    # spider initialization (proxies=False keeps a direct connection)
    a = spider(username, password, proxies=False)
    # sign in to PeMS
    a.login()
    a.start('Bay', 'BayStations.txt', '20170101', '20170601')
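
# Hypothetical example of the station list file ('BayStations.txt' above):
# one PeMS VDS id per line and nothing else, e.g.
#   402000
#   405572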