forked from TurboWay/glidedsky
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcrawler-ip-block-2.py
76 lines (60 loc) · 1.76 KB
/
crawler-ip-block-2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Time : 2020/9/4 18:02
# @Author : way
# @Site :
# @Describe: Crawler challenge - IP block 2 (爬虫-IP屏蔽2)
import os
import json
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from env import headers, proxies
# Retry decorator
def retry(func):
    """Re-invoke ``func`` until it reports a positive score.

    ``func`` must return a ``(url, score)`` pair. The wrapped callable calls
    it with the same arguments up to ``max_retry + 1`` times (one initial
    attempt plus ``max_retry`` retries) and stops at the first attempt whose
    score is greater than zero.

    Returns:
        A wrapper that returns the ``(url, score)`` tuple of the first
        successful attempt, or of the last attempt when none succeeded.
        Exceptions raised by ``func`` are not caught here.
    """
    # Local import so this block does not depend on the file's import header.
    from functools import wraps

    max_retry = 10  # number of retries allowed after the first attempt

    @wraps(func)
    def run(*args, **kwargs):
        # range() is never empty here, so url/score are always bound.
        for _ in range(max_retry + 1):
            url, score = func(*args, **kwargs)
            if score > 0:
                break
        # Return the last observed result instead of issuing yet another call.
        return url, score

    return run
@retry
def crawler(url, timeout=10):
    """Fetch one page and return ``(url, score)``.

    The score is the sum of the integer text of every ``<div>`` whose class
    is ``col-md-1`` in the response body.

    Args:
        url: Page address to download.
        timeout: Seconds passed to ``requests.get`` so that an unresponsive
            proxy or server cannot block a worker thread indefinitely.

    Returns:
        ``(url, score)``. Any failure (network error, timeout, non-numeric
        cell text, ...) yields a score of 0; the ``retry`` decorator
        re-invokes the function while the score is not positive.
    """
    try:
        response = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
        rows = BeautifulSoup(response.text, 'lxml').find_all('div', class_="col-md-1")
        score = sum(int(row.text) for row in rows)
    except Exception:
        # Best-effort by design: a failed attempt is reported as score 0.
        # ``Exception`` (not a bare except) lets SystemExit/KeyboardInterrupt through.
        score = 0
    return url, score
def main(result_path):
    """Run one crawl round over the 1000 challenge pages.

    Scores are kept in a JSON object at ``result_path`` that maps page URL to
    score; the file is created (as an empty object) when it does not exist.
    Pages with no entry or a non-positive score are crawled with
    ``crawler`` on five worker threads, and the merged scores are written
    back to ``result_path``.

    Args:
        result_path: Path of the JSON progress file.

    Returns:
        True when every page already has a positive score (the total over
        the 1000 pages is printed and nothing is fetched); False after a
        crawl round, meaning the caller should invoke ``main`` again.
    """
    urls = [
        f'http://www.glidedsky.com/level/web/crawler-ip-block-2?page={i}'
        for i in range(1, 1001)
    ]

    if not os.path.exists(result_path):
        with open(result_path, 'w') as f:
            json.dump({}, f)
    with open(result_path, 'r') as f:
        dt = json.load(f)

    # Filter instead of list.remove(): linear time, and keys in the file that
    # are not among the expected pages no longer raise ValueError.
    pending = [url for url in urls if dt.get(url, 0) <= 0]

    if not pending:
        # Sum only the known pages so stray entries cannot skew the total.
        print(sum(dt[url] for url in urls))
        return True
    print(f"剩余待采集页数:{len(pending)}")

    # The context manager shuts the pool down, so repeated rounds do not
    # accumulate idle worker threads.
    with ThreadPoolExecutor(max_workers=5) as pool:
        for url, score in pool.map(crawler, pending):
            dt[url] = score

    with open(result_path, 'w') as f:
        json.dump(dt, f)
    return False
if __name__ == '__main__':
    result_path = 'crawler-ip-block-2.json'
    # Each main() call is one crawl round; keep going until it returns a
    # truthy value, i.e. until every page has a positive score.
    while not main(result_path):
        pass  # 3103341 (number left by the original author; presumably the final total)