-
Notifications
You must be signed in to change notification settings - Fork 48
/
keywords.py
92 lines (80 loc) · 2.96 KB
/
keywords.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import json
import random
import re
import time
from typing import List, Optional
from urllib.parse import quote_plus

import chardet
import requests
import urllib3
from bs4 import BeautifulSoup
from my_fake_useragent import UserAgent
from v2ray_pool import Net

class Keywords(Net):
    """Fetch search keywords and document titles from the web or local JSON files.

    Keyword source page: https://www.5118.com/ciku/index#129
    """

    # Local JSON files, resolved relative to the current working directory.
    _KEYS_FILE = 'test/key_tag.json'
    _TITLES_FILE = 'test/key_title.json'

    # Keyword markup on the index page looks like: <span>法律</span><br>
    _KEY_RE = re.compile(r'<span>(.*?)</span><br', re.DOTALL)
    # Everything in front of a literal ".doc" extension (the dot is escaped so
    # that e.g. "mydoc" is not mistaken for an extension boundary).
    _TITLE_RE = re.compile(r'(.*?)\.doc', re.DOTALL)
    # Substrings that disqualify a candidate title.
    # NOTE(review): the original condition tested ',' and ')' twice each; the
    # duplicates may have been full-width variants lost in transit - confirm.
    _TITLE_REJECT = ('公众号', '【', '[', ',', ')')

    @staticmethod
    def _load_json(path):
        """Parse and return the JSON document stored at ``path``."""
        # The file is opened with the platform default encoding, matching how
        # this module's script section writes it.
        with open(path, 'r') as f:
            return json.load(f)

    def get_keys_by_net(self) -> Optional[List[str]]:
        """Scrape keyword strings from the 5118.com keyword index page.

        Returns:
            The text captured from every ``<span>...</span><br`` fragment found
            inside the page's ``<a>`` elements, or ``None`` when the response
            status is not 200 or any exception is raised (the exception is
            printed, not propagated).
        """
        try:
            r = self.request(r'https://www.5118.com/ciku/index#129')
            if r.status_code != 200:
                return None
            r.encoding = r.apparent_encoding
            soup = BeautifulSoup(r.text, "html.parser")
            results = []
            for a in soup.find_all(name='a'):
                # The pattern needs the anchor's markup, not its plain text, so
                # serialize the tag itself. (Previously this stringified the
                # un-called ``a.get_text`` bound method and only worked because
                # its repr happens to embed the tag's markup.)
                results += self._KEY_RE.findall(str(a))
            return results
        except Exception as e:
            # Best-effort scraper: report the problem and signal failure.
            print(e)
            return None

    def get_keys_by_local(self) -> list:
        """Return the JSON content of ``test/key_tag.json`` (expected to be a list)."""
        return self._load_json(self._KEYS_FILE)

    def get_titles_by_local(self) -> list:
        """Return the JSON content of ``test/key_title.json`` (expected to be a list)."""
        return self._load_json(self._TITLES_FILE)

    def get_titles_by_net(self, key) -> Optional[List[str]]:
        """Collect candidate document titles for ``key`` via a net-disk search.

        Example search URL:
        https://www.alipanso.com/search.html?page=1&keyword=%E7%90%86%E8%B4%A2&search_folder_or_file=2&is_search_folder_content=1&is_search_path_title=1&category=doc&file_extension=doc&search_model=1

        Args:
            key: Search keyword, interpolated into the URL as-is.
                NOTE(review): no percent-encoding is applied here; confirm that
                ``request_en`` encodes the URL, otherwise ``quote_plus`` is
                needed for non-ASCII keywords.

        Returns:
            Unique titles (text preceding ``.doc`` in each link's text) that
            are at least 4 characters long and contain none of the rejected
            substrings, in order of first appearance. ``None`` when the
            response status is not 200 or any exception is raised.
        """
        results = []
        try:
            # Random 1-4 second pause before each request.
            time.sleep(random.randint(1, 4))
            r = self.request_en(
                r'https://www.alipanso.com/search.html?page=1&keyword=%s&search_folder_or_file=2&is_search_folder_content=1&is_search_path_title=1&category=doc&file_extension=doc&search_model=1' % key)
            if r.status_code != 200:
                print(r.status_code)
                return None
            r.encoding = r.apparent_encoding
            soup = BeautifulSoup(r.text, "html.parser")
            for a in soup.find_all(name='a'):
                text = str(a.get_text()).replace('\n', '')
                for t in self._TITLE_RE.findall(text):
                    if len(t) < 4 or t in results:
                        continue
                    if any(mark in t for mark in self._TITLE_REJECT):
                        continue
                    results.append(t)
            return results
        except Exception as e:
            # Best-effort scraper: report the problem and signal failure.
            print(e)
            return None


def test():
    """Round-trip a small list through ``test/key_tag.json`` and print the result.

    Note that this overwrites ``test/key_tag.json`` with the sample data.
    """
    sample = ['a', 'b', 'c']
    path = 'test/key_tag.json'
    # The ``with`` blocks close the file; no explicit close() is needed.
    with open(path, 'w') as f:
        json.dump(sample, f)
    with open(path, 'r') as f:
        loaded = json.load(f)
    print(loaded)


if __name__ == "__main__":
    # Scrape the keyword list and store it in test/key_tag.json.
    keys = Keywords().get_keys_by_net()
    print(keys)
    # get_keys_by_net() returns None on failure; skip the write in that case so
    # an existing keyword file is not overwritten with ``null``.
    if keys is not None:
        # ensure_ascii=False writes non-ASCII keywords verbatim, so the bytes
        # on disk use open()'s platform default encoding.
        with open('test/key_tag.json', 'w') as f:
            json.dump(keys, f, ensure_ascii=False)