Skip to content

Commit fc7a644

Browse files
committed
[feature] Change URL crawler to generators; add province selector
1 parent 37932ee commit fc7a644

File tree

3 files changed

+34
-17
lines changed

3 files changed

+34
-17
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,3 +105,5 @@ venv.bak/
105105

106106
# idea
107107
.idea/
108+
109+
test1.py

test.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,15 @@
2828
# 登录
2929
login_driver = tyc_login.TianyanchaLogin(PHONE, PASSWORD)
3030

31+
"""
32+
爬取省份列表
33+
省份缩写可以查看天眼查 url 格式
34+
"""
35+
36+
PROVINCE = ['gd']
37+
3138
# 初始化 URL 爬虫
32-
url_crawler = tyc_urls_crawler.TianyanchaUrlsCrawler(login_driver.driver, KEYWORDS, STATUS)
39+
url_crawler = tyc_urls_crawler.TianyanchaUrlsCrawler(login_driver.driver, KEYWORDS, STATUS, PROVINCE)
3340

3441
# 爬取返回 URL 列表
3542
urls = []

tyc_spider/tyc_urls_crawler.py

Lines changed: 24 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ class TianyanchaUrlsCrawler:
1111
"""
1212
爬详细页链接,传入登录后的 driver 以及关键字
1313
"""
14-
def __init__(self, driver, keywords, status=None):
14+
def __init__(self, driver, keywords, status=None, province=None):
1515
"""
1616
:param driver: selenium webdriver
1717
:param keywords: str
@@ -32,31 +32,39 @@ def __init__(self, driver, keywords, status=None):
3232
if not status:
3333
self.status = ['1', '2', '3', '4', '5']
3434
print(self.status)
35-
self.url_fmt = 'https://www.tianyancha.com/search/os{status}/p{page_num}?key={keyword}&searchType=company'
35+
if not province:
36+
self.province = ['']
37+
else:
38+
self.province = province
39+
self.url_fmt = 'https://{province}.tianyancha.com/search/' \
40+
'os{status}/p{page_num}?key={keyword}&searchType=company'
3641
self.url_list = []
3742

3843
def crawl_urls(self):
39-
for keyword in self.keywords:
40-
for status in self.status:
41-
pages = self.get_page_num(keyword, status)
42-
for page_num in range(1, int(pages)+1):
43-
url = self.url_fmt.format(page_num=page_num, keyword=keyword, status=status)
44-
self.driver.get(url)
45-
sleep(randint(300, 600)/100)
44+
for province in self.province:
45+
for keyword in self.keywords:
46+
for status in self.status:
47+
pages = self.get_page_num(keyword, status, province)
48+
for page_num in range(1, int(pages)+1):
49+
url = self.url_fmt.format(page_num=page_num, keyword=keyword, status=status, province=province)
50+
self.driver.get(url)
51+
sleep(randint(300, 600)/100)
4652

47-
soup = BeautifulSoup(self.driver.page_source, 'lxml')
48-
links = soup.findAll('a', class_="name ")
49-
print(keyword, page_num)
50-
for link in links:
51-
yield link['href']
53+
soup = BeautifulSoup(self.driver.page_source, 'lxml')
54+
links = soup.findAll('a', class_="name ")
55+
print(keyword, page_num)
56+
for link in links:
57+
yield link['href']
5258

53-
def get_page_num(self, keyword, status):
59+
def get_page_num(self, keyword, status, province):
5460
"""
5561
爬取页数
5662
:param keyword: str
63+
:param status: str
64+
:param province: str
5765
:return: int
5866
"""
59-
url = self.url_fmt.format(page_num=1, keyword=keyword, status=status)
67+
url = self.url_fmt.format(page_num=1, keyword=keyword, status=status, province=province)
6068

6169
try:
6270
self.driver.get(url)

0 commit comments

Comments
 (0)