@@ -11,7 +11,7 @@ class TianyanchaUrlsCrawler:
11
11
"""
12
12
爬详细页链接,传入登录后的 driver 以及关键字
13
13
"""
14
- def __init__ (self , driver , keywords , status = None ):
14
+ def __init__ (self , driver , keywords , status = None , province = None ):
15
15
"""
16
16
:param driver: selenium webdriver
17
17
:param keywords: str
@@ -32,31 +32,39 @@ def __init__(self, driver, keywords, status=None):
32
32
if not status :
33
33
self .status = ['1' , '2' , '3' , '4' , '5' ]
34
34
print (self .status )
35
- self .url_fmt = 'https://www.tianyancha.com/search/os{status}/p{page_num}?key={keyword}&searchType=company'
35
+ if not province :
36
+ self .province = ['' ]
37
+ else :
38
+ self .province = province
39
+ self .url_fmt = 'https://{province}.tianyancha.com/search/' \
40
+ 'os{status}/p{page_num}?key={keyword}&searchType=company'
36
41
self .url_list = []
37
42
38
43
def crawl_urls(self):
    """
    Yield the detail-page links found on the search-result pages.

    Iterates over every (province, keyword, status) combination, asks
    ``get_page_num`` how many result pages there are, loads each page with
    the selenium driver and yields the ``href`` of every
    ``<a class="name ">`` anchor parsed from the page source.

    :return: generator of str (the ``href`` attribute of each matched link)
    """
    for province in self.province:
        # ``url_fmt`` puts the province in the host name
        # ('https://{province}.tianyancha.com/...'). An empty province would
        # produce 'https://.tianyancha.com/...', which is not a valid host,
        # so fall back to the site's default 'www' sub-domain.
        subdomain = province or 'www'
        for keyword in self.keywords:
            for status in self.status:
                # Pass the resolved sub-domain so the page-count request
                # goes to the same host as the result-page requests below.
                pages = self.get_page_num(keyword, status, subdomain)
                for page_num in range(1, int(pages) + 1):
                    url = self.url_fmt.format(
                        page_num=page_num,
                        keyword=keyword,
                        status=status,
                        province=subdomain,
                    )
                    self.driver.get(url)
                    # Random 3-6 second pause after each page load
                    # (presumably to avoid hammering the site -- confirm).
                    sleep(randint(300, 600) / 100)

                    soup = BeautifulSoup(self.driver.page_source, 'lxml')
                    # NOTE: the trailing space in "name " is kept on purpose;
                    # it is part of the class string being matched.
                    links = soup.findAll('a', class_="name ")
                    print(keyword, page_num)
                    for link in links:
                        yield link['href']
52
58
53
- def get_page_num (self , keyword , status ):
59
+ def get_page_num (self , keyword , status , province ):
54
60
"""
55
61
爬取页数
56
62
:param keyword: str
63
+ :param status: str
64
+ :param province: str
57
65
:return: int
58
66
"""
59
- url = self .url_fmt .format (page_num = 1 , keyword = keyword , status = status )
67
+ url = self .url_fmt .format (page_num = 1 , keyword = keyword , status = status , province = province )
60
68
61
69
try :
62
70
self .driver .get (url )
0 commit comments