-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawler.py
127 lines (99 loc) · 4.59 KB
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
from requests_html import HTMLSession
import json
import datetime  # NOTE(review): imported but never used in this file — confirm before removing
# Base URL of the CP (Comboios de Portugal) site; scraped hrefs are relative to it.
root_url = 'https://www.cp.pt'
# Index page whose table rows link to each individual station detail page.
index_url = root_url + '/passageiros/pt/consultar-horarios/estacoes'
# Single shared HTTP session reused for every request in this script.
session = HTMLSession()
# Accumulator for all scraped station records; dumped to JSON at the end.
stations = {'stations' : []}
def get_stations_url():
    """Fetch the station index page and seed ``stations['stations']``.

    Appends one ``{'url': <absolute station page URL>}`` dict per link found
    in the index table. Mutates the module-level ``stations`` accumulator;
    returns None.
    """
    page = session.get(index_url)
    # Each table row holds an anchor whose href is a site-relative station page.
    for link in page.html.find('tbody tr a'):
        stations['stations'].append({'url': root_url + link.attrs['href']})
def _parse_labeled_list(text):
    """Split ``'Label: a | b | c'``-shaped *text* into ``['a', 'b', 'c']``.

    Raises IndexError when *text* contains no ':' (caller treats that as
    "field absent on this page").
    """
    return [item.strip() for item in text.split(':')[1].split('|')]


def get_stations_info(url, index):
    """Scrape one station detail page and fill ``stations['stations'][index]``.

    Extracts name, address, coordinates, CP services, railway lines and the
    service tabs. Each field is best-effort: a missing/unparseable field is
    reported to stdout and skipped rather than aborting the crawl.

    :param url: absolute URL of the station detail page.
    :param index: position of this station's record in ``stations['stations']``.
    """
    page = session.get(url)
    station_data = page.html.find('div.station-data')[0]
    entry = stations['stations'][index]
    entry['name'] = station_data.find('h1', first = True).text
    print(entry['name'])
    station_main_data = station_data.find('ul li')
    # Address — text after the first ':' of the first <li>.
    try:
        entry['address'] = station_main_data[0].text.split(':')[1].strip()
    except (IndexError, AttributeError):
        print('Error: Address not set.')
    # Location — "lat | lon" after the ':' of the second <li>.
    try:
        coords = station_main_data[1].text.split(':')[1].split('|')
        # The site uses decimal commas ("38,7169"). The previous code stripped
        # the comma entirely, inflating coordinates by several orders of
        # magnitude; map ',' -> '.' so float() parses the intended value.
        entry['location'] = [float(coords[0].replace(',', '.')),
                             float(coords[1].replace(',', '.'))]
    except (IndexError, ValueError):
        print('Error: Location not set.')
    # CP Services — pipe-separated list in the third <li>.
    try:
        entry['cp_services'] = _parse_labeled_list(station_main_data[2].text)
    except (IndexError, AttributeError):
        print('Error: CP Services not set.')
    # Lines — pipe-separated list in the fourth <li>.
    try:
        entry['lines'] = _parse_labeled_list(station_main_data[3].text)
    except (IndexError, AttributeError):
        print('Error: Line(s) not set.')
    # Service tabs — one <ul> per category, consumed in fixed page order.
    try:
        station_services = page.html.find('div.tab-content', first=True)
        if station_services.find('ul'):
            entry['services'] = dict()
            categories = [
                ('cp_services', 'Serviços CP'),
                ('access_connections', 'Acessos e Ligações'),
                ('facilities', 'Mobilidade Condicionada'),
                ('complementary_services', 'Serviços Complementares'),
            ]
            # NOTE(review): as in the original, the heading check scans the
            # WHOLE page, and <ul>s are paired with categories strictly by
            # position — a page missing one section would shift the later
            # categories. Preserved as-is; verify against such a page.
            for (key, heading), services_ul in zip(categories,
                                                   station_services.find('ul')):
                if page.html.find('ul li', containing=heading):
                    entry['services'][key] = [li.text
                                              for li in services_ul.find('li')]
    except Exception:
        print('Error: Services not set')
def parse_station_to_file(data, filename):
    """Serialize *data* to *filename* as pretty-printed UTF-8 JSON.

    :param data: JSON-serializable object (here the ``stations`` accumulator).
        Renamed from ``dict``, which shadowed the builtin; in-file call sites
        pass it positionally, so they are unaffected.
    :param filename: path of the output file (overwritten if present).
    """
    # ensure_ascii=False writes raw accented characters (station names are
    # Portuguese), so the file must be opened as UTF-8 explicitly — the
    # platform default encoding (e.g. cp1252 on Windows) would raise or mangle.
    with open(filename, 'w', encoding='utf-8') as fp:
        json.dump(data, fp, indent = 4, ensure_ascii = False)
def _main():
    """Crawl every CP station page and dump the result to ``data.json``."""
    get_stations_url()
    total = len(stations['stations'])
    for index, station in enumerate(stations['stations']):
        get_stations_info(station['url'], index)
        # Progress indicator, e.g. "37/312".
        print(f"{index + 1}/{total}")
    parse_station_to_file(stations, 'data.json')


# Guard so importing this module (e.g. to reuse the parsers) does not
# trigger a full site crawl as a side effect.
if __name__ == '__main__':
    _main()