forked from jiangq28/Subway_Data
-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathnjSpider.py
92 lines (87 loc) · 3.97 KB
/
njSpider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import csv
def get_html(url, executable_path=r"E:/spider/phantomjs-2.1.1-windows/bin/phantomjs.exe"):
    """Fetch a page with a headless PhantomJS browser and return its source.

    Args:
        url: Address of the page to load.
        executable_path: Location of the PhantomJS binary. Defaults to the
            path the script was originally written against.

    Returns:
        The page source reported by the browser after loading ``url``.
    """
    # A fresh headless browser is started for every request.
    browser = webdriver.PhantomJS(executable_path=executable_path)
    try:
        browser.get(url)  # send the request
        print("### Handling <" + url + ">")
        return browser.page_source
    finally:
        # Always shut the browser down; otherwise every call (and every
        # retry by the caller) leaves a PhantomJS process running.
        browser.quit()
def minutes(time):
    """Convert a 24-hour "HH:MM" string to minutes since midnight, as a string.

    Hour 0 is counted as hour 24, so "00:15" becomes "1455" rather than "15".

    Args:
        time: Text of a timetable cell.

    Returns:
        The minute count as a string, or '-1' when the text is not a usable
        time (no colon, or non-numeric hour/minute such as "--:--").
    """
    if ":" not in time:
        return '-1'
    parts = time.split(":")
    try:
        hour = int(parts[0])
        minute = int(parts[1])
    except ValueError:
        # A placeholder cell that still contains a colon must not abort the
        # crawl; report it with the same sentinel as any other non-time cell.
        return '-1'
    if hour == 0:
        hour = 24
    return str(60 * hour + minute)
def write_csv_row(path_to_file, fieldnames, row):
    """Append one dict as a row to a CSV file and report success on stdout.

    Args:
        path_to_file: Path of the CSV file to append to.
        fieldnames: Column order used to serialise ``row``.
        row: Mapping from column name to value.
    """
    # newline='' lets the csv module control line endings itself; without it
    # Windows output gets an extra blank line after every row.
    with open(path_to_file, 'a', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writerow(row)
    print(" [success]")
def get_all_lines_time():
    """Reset both output CSV files to a header row, then crawl every line.

    Truncates ``times_file`` and ``stations_file``, writes their headers, and
    calls ``get_one_line_time`` once per entry in ``urls``, passing the
    entry's position so the callee can look up the matching line name.
    """
    outputs = ((times_file, times_header), (stations_file, stations_header))
    for path, header in outputs:
        # newline='' is what the csv module expects of the file object; it
        # prevents doubled line endings on Windows.
        with open(path, 'w', newline='') as csvfile:
            csv.DictWriter(csvfile, fieldnames=header).writeheader()
    for i, url in enumerate(urls):
        get_one_line_time(url, i)  # fetch and store the data for this line
def get_one_line_time(url, i):
    """Scrape one line's timetable page and append its data to the CSV files.

    Every table row contributes a station. Stations not seen before are given
    the next free number and appended to ``stations_file``. Each pair of
    adjacent stations then yields two rows in ``times_file``, one per travel
    direction.

    Args:
        url: Page holding the timetable of a single line.
        i: Position of this line in ``line_name``; selects the line label
            written to the ``line`` column.
    """
    global number
    global station_names

    line_name = [1, 2, 3, 4, 10, "s1", "s8", "s3"]
    line = line_name[i]

    # Keep fetching until the page contains the timetable table.
    while True:
        html = get_html(url)
        tbody = BeautifulSoup(html, "lxml").find("tbody")
        if tbody is not None:
            break
        print(" retry ...")
        time.sleep(1)  # brief pause so retries do not hit the site back to back

    stations = []
    for tr in tbody.find_all("tr"):
        tds = tr.find_all("td")
        # Rows without the five data cells (e.g. title rows) carry no station.
        if len(tds) < 5:
            continue
        station_name = tds[0].get_text()
        first_time_1 = minutes(tds[1].get_text())  # direction 1, first train
        last_time_1 = minutes(tds[3].get_text())   # direction 1, last train
        first_time_2 = minutes(tds[2].get_text())  # direction 2, first train
        last_time_2 = minutes(tds[4].get_text())   # direction 2, last train
        stations.append([line, station_name, first_time_1, last_time_1,
                         first_time_2, last_time_2])

        # Give each station name a unique number the first time it appears.
        if station_name not in station_names:
            station_names[station_name] = number
            row = {'station_name': station_name, 'number': number}
            write_csv_row(stations_file, stations_header, row)
            number += 1

    # Direction 1: from each station to the next, using the departure
    # station's direction-1 times.
    for here, following in zip(stations, stations[1:]):
        row = {
            "line": line,
            "from_station": here[1],
            "to_station": following[1],
            "first_time": here[2],
            "last_time": here[3],
        }
        write_csv_row(times_file, times_header, row)

    # Direction 2: from each station back to the previous one, using the
    # departure station's direction-2 times.
    for here, following in zip(stations, stations[1:]):
        row = {
            "line": line,
            "from_station": following[1],
            "to_station": here[1],
            "first_time": following[4],
            "last_time": following[5],
        }
        write_csv_row(times_file, times_header, row)
# Output CSV files and their column layouts.
stations_file = 'stations.csv'
stations_header = ['station_name', 'number']
times_file = 'time.csv'
times_header = ['line', 'from_station', 'to_station', 'first_time', 'last_time']

# Crawl state shared across lines: station name -> assigned number, and the
# next number to hand out.
station_names = {}
number = 1

# Page identifiers, one per line. Their order must match ``line_name`` in
# get_one_line_time, which is indexed by position in ``urls``.
lines = ["x_44fac405", "x_0bd59b0e", "x_b4a482e2", "x_3717a1ad", "x_40e26542", "x_d8fdd83d", "x_854e2f53", "x_8cdaf2fa"]
urls = ["http://njdt.8684.cn/%s" % page_id for page_id in lines]

# Run the crawl.
get_all_lines_time()