Skip to content

Commit c326aff

Browse files
committed
修复一些错误
1 parent e00c950 commit c326aff

15 files changed

+104
-55
lines changed

CrawlBase.py

+18-10
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from collections import defaultdict
2-
import random,re
2+
import random,re,os
33
from PyQt5.QtCore import QThread
44
import requests
55
from bs4 import BeautifulSoup
@@ -113,21 +113,14 @@ def positionHandle(self):
113113
f.write(str(position) + ',')
114114
f.write(str(num) + '\n')
115115

116-
116+
#保存文件到txt
117117
def staffHandle(self):
    """Write every job title and its detail-page URL to ``staff.txt``.

    The file is created (or overwritten) at ``self.file_path + 'staff.txt'``
    and encoded as UTF-8. Each entry of ``self.job_infos`` produces one line
    of the form ``<staff>,<details_url>``.
    """
    fileName = 'staff.txt'
    with open(self.file_path + fileName, 'w', encoding='utf-8') as f:
        # Convert each field with str() *before* concatenating: the previous
        # form, str(url + '\n'), raised TypeError for a non-string URL
        # (e.g. None) because the addition ran before the conversion.
        f.writelines(
            str(job_info['staff']) + ',' + str(job_info['details_url']) + '\n'
            for job_info in self.job_infos
        )
122122

123123

124-
#保存文件,用户可提取
125-
def saveFileCSV(self, fileName):
126-
with open(self.file_path + fileName + '.csv', 'w', encoding='utf-8') as f:
127-
f.write(str(fileName) + '\n')
128-
for job_info in self.job_infos:
129-
f.write(str(job_info[fileName]) + '\n')
130-
131124
#保存所有信息,使用sqlite3数据库存储
132125
def saveAll(self, tableName, db):
133126
cursor = db.cursor()
@@ -140,5 +133,20 @@ def saveAll(self, tableName, db):
140133
cursor.execute("UPDATE latestType SET latest_type = ? WHERE id = ?",(tableName,1))
141134
cursor.close()
142135
db.commit()
143-
136+
137+
#初始化数据库(若文件不存在,则创建数据库)
138+
def InitDB(self):
    """Open the jobs database, creating the file and its schema if needed.

    The database is stored at ``<current working directory>/resource/jobs.db``.
    Schema creation is idempotent, so an empty or partially initialised file
    left behind by an interrupted first run is repaired instead of being
    treated as a valid database.

    Returns:
        sqlite3.Connection: an open connection to ``jobs.db``.
    """
    db_dir = os.path.join(os.getcwd(), 'resource')
    # sqlite3.connect() creates the file but not missing parent directories.
    os.makedirs(db_dir, exist_ok=True)

    db = sqlite3.connect(os.path.join(db_dir, 'jobs.db'))
    cursor = db.cursor()
    try:
        # One table per crawled site, both with the same columns.
        for table in ('zhilian', 'lagou'):
            cursor.execute(
                'CREATE TABLE IF NOT EXISTS %s (staff text, salary varchar(20), '
                'position varchar(20), details_url text)' % table
            )
        cursor.execute('CREATE TABLE IF NOT EXISTS latestType (id INTEGER, latest_type varchar(20))')

        # Seed the row with id 1 only when it is absent, so an existing
        # latest_type value is never overwritten.
        cursor.execute('SELECT 1 FROM latestType WHERE id = ?', (1,))
        if cursor.fetchone() is None:
            cursor.execute('INSERT INTO latestType (id, latest_type) values (?, ?)', (1, 'lagou'))
    finally:
        cursor.close()
    db.commit()
    return db
144152

jobs.db

-296 KB
Binary file not shown.

lagou/lagou.py

+3-4
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ def __init__(self, position, keyword, progressBar, page_number):
3737

3838

3939

40+
4041
def generateUrl(self):
4142
q = Queue()
4243
for i in range(self.page_number):
@@ -73,12 +74,10 @@ def run(self):
7374
t3.join()
7475
t4.join()
7576

76-
db = sqlite3.connect('jobs.db')
77-
77+
db = self.InitDB()
7878
self.salaryHandle() #保存文件,单独存放薪水,用于方便生成图像,下同
7979
self.positionHandle()
80-
self.saveAll('lagou',db)
81-
self.staffHandle() #保存文件,单独存放职位名称和对应的URL
80+
self.saveAll('lagou',db) #保存文件,单独存放职位名称和对应的URL
8281

8382

8483
self.trigger.emit()

main.py

+46-7
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,12 @@ def __init__(self):
1818
QMainWindow.__init__(self)
1919
self.setupUi(self)
2020
self.type = ""
21+
self.db = self.InitDB()
2122
self.staff_list = []
2223
self.InitImage()
2324
self.showStaff()
2425
self.ItemNumber = 0
26+
2527

2628
def typeMap(self, type):
2729
if type == "拉勾网":
@@ -33,7 +35,6 @@ def typeMap(self, type):
3335

3436
#初始化图片展示到GUI上
3537
def InitImage(self):
36-
self.db = sqlite3.connect('jobs.db')
3738
cursor = self.db.cursor()
3839
cursor.execute("SELECT * FROM latestType")
3940
self.type = cursor.fetchall()[0][1]
@@ -45,6 +46,22 @@ def InitImage(self):
4546
PixMapPosition = QtGui.QPixmap(os.getcwd() + '/resource/%s/images/2.png' % self.type).scaled(500,500)
4647
self.PositionImage.setPixmap(PixMapPosition)
4748

49+
#这里代码和CrawlBase里的代码重复了,违背了软件开发的一些原则,但是因为sqlite3对象不能传递到另一个线程,所以暂时只能这样做
50+
def InitDB(self):
    """Return a connection to ``resource/jobs.db``, initialising it on demand.

    The path is resolved against the current working directory. The
    ``resource`` directory, the three tables and the initial ``latestType``
    row are each created only when missing, so calling this on a healthy
    database changes nothing, while an empty or half-initialised file is
    completed rather than left without tables.

    Returns:
        sqlite3.Connection: an open connection to the jobs database.
    """
    resource_dir = os.path.join(os.getcwd(), 'resource')
    # The directory must exist before sqlite3 can create the file inside it.
    os.makedirs(resource_dir, exist_ok=True)

    db = sqlite3.connect(os.path.join(resource_dir, 'jobs.db'))
    cursor = db.cursor()
    try:
        job_columns = '(staff text, salary varchar(20), position varchar(20), details_url text)'
        cursor.execute('CREATE TABLE IF NOT EXISTS zhilian ' + job_columns)
        cursor.execute('CREATE TABLE IF NOT EXISTS lagou ' + job_columns)
        cursor.execute('CREATE TABLE IF NOT EXISTS latestType (id INTEGER, latest_type varchar(20))')

        # Insert the default row only if no row with id 1 exists yet;
        # a stored latest_type must survive restarts.
        cursor.execute('SELECT 1 FROM latestType WHERE id = ?', (1,))
        if cursor.fetchone() is None:
            cursor.execute('INSERT INTO latestType (id, latest_type) values (?, ?)', (1, 'lagou'))
    finally:
        cursor.close()
    db.commit()
    return db
64+
4865
#开启主线程外的另一个线程,防止UI阻塞,注意到在那个线程里爬数据的时候再次开启了多线程,这是可以的,也是python和Qt 灵活的地方
4966
def work(self):
5067
position = self.positionEdit.text()
@@ -66,14 +83,13 @@ def work(self):
6683
self.workTheard.start()
6784
self.workTheard.trigger.connect(self.showImage)
6885

86+
#展示staff信息到listwidget
6987
def showStaff(self):
7088
cursor = self.db.cursor()
7189
self.staff_list = []
7290
self.listWidget.clear() #staff_list 常驻内存,切记每次都要初始化为空,否则列表将无限增长,最终程序崩溃
7391

74-
#从数据库中得到所有职位信息
75-
76-
cursor.execute('SELECT * FROM %s' % (self.type))
92+
cursor.execute('SELECT * FROM %s' % (self.type)) #从数据库中得到所有职位信息
7793
values = cursor.fetchall()
7894
N = 50 if len(values) >= 50 else len(values)
7995

@@ -103,6 +119,7 @@ def showImage(self):
103119

104120
del image
105121
gc.collect()
122+
106123
#读取新的数据
107124
self.showStaff()
108125

@@ -115,19 +132,34 @@ def showItem(self):
115132
webbrowser.open_new(url)
116133

117134

135+
#打开文件目录浏览器
118136
def openFile(self):
    """Let the user pick an export directory and show it in the path field.

    Opens a directory-only chooser parented to ``self.Filedialog``, starting
    at ``C:``. The chosen path is written to ``self.DirlineEdit``.
    """
    DirName = QtWidgets.QFileDialog.getExistingDirectory(
        self.Filedialog, "浏览文件", "C:", QtWidgets.QFileDialog.ShowDirsOnly)
    # An empty result means nothing was selected (dialog cancelled). Keep the
    # current directory text in that case; clearing it would make the export
    # path start at the filesystem root.
    if DirName:
        self.DirlineEdit.setText(DirName)
122140

123-
141+
142+
#保存数据到EXCEL文件
124143
def saveFileToExcel(self):
125144
fileName = self.FilelineEdit.text()
126145
if fileName == '':
127146
fileName = 'default'
128147

129-
fileName = self.DirlineEdit.text() + '/' + fileName
148+
fileName = self.DirlineEdit.text() + '/' + fileName + '.xls'
149+
150+
if os.path.isfile(fileName):
151+
existMessage = QtWidgets.QMessageBox.warning( self,
152+
'文件已存在',
153+
fileName + ' 已存在,是否覆盖该文件',
154+
QtWidgets.QMessageBox.Yes | QtWidgets.QMessageBox.No)
155+
156+
if existMessage == QtWidgets.QMessageBox.No:
157+
return
130158

159+
160+
161+
162+
131163
cursor = self.db.cursor()
132164
cursor.execute('SELECT * FROM %s' % (self.type))
133165
values = cursor.fetchall()
@@ -143,13 +175,19 @@ def saveFileToExcel(self):
143175
workSheet.write(i+1, 1, values[i][1])
144176
workSheet.write(i+1, 2, values[i][2])
145177

146-
workBook.save(fileName + '.xls')
178+
workBook.save(fileName)
147179

148180
SucessMessage = QtWidgets.QMessageBox.information( self,
149181
'导出EXCEL',
150182
'导出成功',
151183
QtWidgets.QMessageBox.Yes)
152184

185+
186+
if SucessMessage == QtWidgets.QMessageBox.Yes:
187+
self.Filedialog.close()
188+
189+
190+
153191

154192

155193

@@ -166,6 +204,7 @@ def networkError(self):
166204
pass
167205

168206

207+
169208
if __name__ == '__main__':
170209
app = QtWidgets.QApplication(sys.argv)
171210
MainWindow = Main()

mainWIndow.py

+1
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,7 @@ def outPutFile(self):
159159
self.DirlineEdit = QtWidgets.QLineEdit(self.Filedialog)
160160
self.DirlineEdit.move(100,40)
161161
self.DirlineEdit.setFixedSize(350,30)
162+
self.DirlineEdit.setText(os.getcwd())
162163

163164
self.Filelabel = QtWidgets.QLabel(self.Filedialog)
164165
self.Filelabel.move(20,100)

resource/jobs.db

140 KB
Binary file not shown.

resource/lagou/images/1.png

336 Bytes
Loading

resource/lagou/images/2.png

-9.81 KB
Loading

resource/lagou/position_for_image.csv

+11-10
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
位置
2-
房山区,1
3-
西城区,3
4-
None,5
5-
丰台区,4
6-
东城区,6
7-
朝阳区,92
8-
昌平区,7
9-
通州区,1
2+
朝阳区,118
3+
昌平区,3
4+
None,13
5+
海淀区,108
6+
顺义区,1
7+
石景山区,2
8+
丰台区,3
9+
东城区,28
10+
西城区,11
1011
大兴区,10
11-
海淀区,168
12-
石景山区,3
12+
房山区,2
13+
通州区,1

resource/lagou/salary_for_image.csv

+4-5
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
薪水
2-
5K-8K,10
3-
8K-12K,61
4-
0-5K,9
5-
15K-~,161
6-
12K-15K,59
2+
12K-15K,46
3+
15K-~,249
4+
8K-12K,4
5+
0-5K,1

resource/zhilian/images/1.png

2.7 KB
Loading

resource/zhilian/images/2.png

233 Bytes
Loading
+14-11
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,15 @@
11
位置
2-
昌平区,14
3-
石景山区,1
4-
丰台区,11
5-
顺义区,2
6-
大兴区,3
7-
海淀区,100
8-
房山区,3
9-
朝阳区,25
10-
东城区,3
11-
北京,428
12-
西城区,10
2+
东城区,12
3+
顺义区,5
4+
房山区,4
5+
海淀区,164
6+
西城区,34
7+
石景山区,6
8+
通州区,5
9+
崇文区,3
10+
延庆县,1
11+
昌平区,26
12+
大兴区,9
13+
北京,835
14+
丰台区,27
15+
朝阳区,69

resource/zhilian/salary_for_image.csv

+5-5
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
薪水
2-
8K-12K,171
3-
5K-8K,121
4-
12K-15K,150
5-
15K-~,144
6-
0-5K,14
2+
5K-8K,325
3+
15K-~,238
4+
12K-15K,292
5+
8K-12K,319
6+
0-5K,26

zhilian/zhilian.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -70,11 +70,10 @@ def run(self):
7070
t3.join()
7171
t4.join()
7272

73-
db = sqlite3.connect('jobs.db')
73+
db = self.InitDB()
7474
self.salaryHandle() #保存文件,单独存放薪水,用于方便生成图像,下同
7575
self.positionHandle()
76-
self.saveAll('zhilian',db)
77-
self.staffHandle() #保存文件,单独存放职位名称和对应的URL
76+
self.saveAll('zhilian',db) #保存文件,单独存放职位名称和对应的URL
7877

7978
self.trigger.emit() #爬取完毕,要发送信号给UI主线程,并执行相应的槽函数
8079

0 commit comments

Comments
 (0)