-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathzhihupic.py
138 lines (128 loc) · 5.11 KB
/
zhihupic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import json
import os
import time
import urllib
import urllib.request

import requests
from lxml import etree
# Default request headers: a desktop Chrome user-agent string.
headers = {
    'user-agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36'
}
# Root directory; each crawl stores its images in a sub-folder of it.
total_path = 'D:/zhihupic'
# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of os.path.exists() followed by os.makedirs().
os.makedirs(total_path, exist_ok=True)
# Image URLs whose download failed; the download functions append to it.
Error = []
def zhihuPicSpider(question_name_id_type_dict):
    """Route a crawl request to the downloader matching its ``'type'``.

    ``question_name_id_type_dict`` must contain the keys ``'name'``,
    ``'id'`` and ``'type'``.  A ``'question'`` or ``'article'`` request
    forwards the name and the id to its downloader; a ``'topic'`` request
    forwards the id only.  Any other type value does nothing.
    """
    spec = question_name_id_type_dict
    label, ident, kind = spec['name'], spec['id'], spec['type']

    if kind == 'question':
        downloadQuestionPic(label, ident)
        return
    if kind == 'article':
        downloadArticlePic(label, ident)
        return
    if kind == 'topic':
        # Topics have no folder of their own; only the id is passed on.
        downloadTopicPic(ident)
def downloadQuestionPic(question_name, question_id):
    """Download the images found in the answers to a Zhihu question.

    Pages through the question's answers API five answers at a time,
    fetches each answer's HTML page and saves the ``src`` of every
    ``<noscript><img>`` element as a ``.jpg`` file inside
    ``<total_path>/<question_name>/``.  URLs that fail to download are
    appended to the module-level ``Error`` list, which is printed once
    paging has finished.

    Args:
        question_name: Name of the sub-folder (under ``total_path``) that
            receives the images.
        question_id: Id of the question, as used in the API and page URLs.
    """
    pic_path = total_path + ('/{}'.format(question_name))
    # exist_ok avoids the race between an exists() check and makedirs().
    os.makedirs(pic_path, exist_ok=True)

    offset = 0
    while True:
        url = 'https://www.zhihu.com/api/v4/questions/{}/answers?limit=5&offset={}'.format(
            question_id, offset)
        offset += 5
        response = requests.get(url, headers=headers)
        content = json.loads(response.content.decode())['data']
        if not content:
            # An empty page means every answer has been visited.
            break

        for item in content:
            answer_url = 'https://www.zhihu.com/question/{}/answer/{}'.format(
                question_id, item['id'])
            r = requests.get(answer_url, headers=headers)
            element = etree.HTML(r.content.decode())
            pic_urls = element.xpath('//noscript/img//@src')
            for pic_url in pic_urls:
                # The stamp only has 0.1 s resolution, so two images saved in
                # quick succession would get the same name and the second
                # would overwrite the first.  Bump it until the path is free.
                stamp = int(time.time() * 10)
                path = pic_path + ('/{}.jpg').format(stamp)
                while os.path.exists(path):
                    stamp += 1
                    path = pic_path + ('/{}.jpg').format(stamp)
                try:
                    urllib.request.urlretrieve(pic_url, path)
                except Exception:
                    # Best effort: remember the failed URL and keep going.
                    Error.append(pic_url)
    print(Error)
def downloadArticlePic(question_name, question_id):
    """Download the images embedded in a Zhihu column (zhuanlan) article.

    Fetches ``https://zhuanlan.zhihu.com/p/<question_id>``, collects the
    ``data-original`` URL of every ``<figure><noscript><img>`` element and
    saves each one as a ``.jpg`` file inside
    ``<total_path>/<question_name>/``.  URLs that fail to download are
    appended to the module-level ``Error`` list, which is printed once all
    images have been attempted.

    Args:
        question_name: Name of the sub-folder (under ``total_path``) that
            receives the images.
        question_id: Id of the article, as used in the article URL.
    """
    pic_path = total_path + ('/{}'.format(question_name))
    # These headers are local to this function and shadow the module-level
    # ``headers``.
    # NOTE(review): the cookie is hard-coded; presumably it was copied from a
    # browser session and may stop working -- confirm.
    headers = {
        'cookie':
        'd_c0="APCXZd6qohGPTmue1c1c7hfx3Hud6CL6mKQ=|1595754204";',
        'origin':
        'https://zhuanlan.zhihu.com',
        'referer':
        'https://zhuanlan.zhihu.com/52dmtp',
        'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36',
    }
    url = 'https://zhuanlan.zhihu.com/p/{}'.format(question_id)
    res = requests.get(url, headers=headers)
    print(res)
    element = etree.HTML(res.content.decode())
    imgs_url = element.xpath('//figure/noscript/img//@data-original')

    # exist_ok avoids the race between an exists() check and makedirs().
    os.makedirs(pic_path, exist_ok=True)
    for img_url in imgs_url:
        # The stamp only has 0.1 s resolution, so two images saved in quick
        # succession would get the same name and the second would overwrite
        # the first.  Bump it until the path is free.
        stamp = int(time.time() * 10)
        path = pic_path + ('/{}.jpg'.format(stamp))
        while os.path.exists(path):
            stamp += 1
            path = pic_path + ('/{}.jpg'.format(stamp))
        try:
            urllib.request.urlretrieve(img_url, path)
        except Exception:
            # Best effort: remember the failed URL and keep going.
            Error.append(img_url)
    print(Error)
def downloadTopicPic(question_id):
    """Crawl a topic's ``top_activity`` feed and download its images.

    Requests the feed ten items at a time, increasing ``after_id`` by 10 per
    request, until a page with empty ``data`` comes back.  Each feed item
    whose target is an ``answer`` is handed to ``zhihuPicSpider`` as a
    ``'question'`` request (using the parent question's title and id); each
    ``article`` target is handed over as an ``'article'`` request.  Other
    items are skipped.

    Args:
        question_id: Id of the topic, as used in the feed API URL.
    """
    headers = {
        'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36'
    }
    # (type, id) pairs already dispatched: several answers to the same
    # question may appear in the feed, and each dispatch crawls the whole
    # question, so repeating it only produces duplicate files.
    seen = set()
    offset = 0
    while True:
        url = 'https://www.zhihu.com/api/v4/topics/{}/feeds/top_activity?limit=10&after_id={}.00000'.format(
            question_id, offset)
        offset += 10
        response = requests.get(url, headers=headers)
        data = json.loads(response.content.decode())['data']
        if not data:
            break

        for item in data:
            if 'target' not in item:
                continue
            target = item['target']
            if 'type' not in target:
                continue

            if target['type'] == 'answer':
                title = target['question']['title']
                ident = target['question']['id']
                kind = 'question'
            elif target['type'] == 'article':
                title = target['title']
                ident = target['id']
                kind = 'article'
            else:
                continue

            if (kind, ident) in seen:
                continue
            seen.add((kind, ident))

            # Titles may carry <em> highlight markup; drop it.
            name = title.replace('<em>', '').replace('</em>', '')
            # The title becomes a folder name.  Remove the characters that
            # Windows forbids in file names (and the path separators, which
            # would otherwise create nested folders), then trailing dots and
            # spaces, which Windows also rejects.
            for bad in '\\/:*?"<>|':
                name = name.replace(bad, '')
            name = name.rstrip('. ').lstrip()
            if not name:
                # Nothing usable left in the title: fall back to the id.
                name = str(ident)

            zhihuPicSpider({'name': name, 'id': ident, 'type': kind})
if __name__ == '__main__':
    # Fill in the three fields before running:
    #   name - the question title, or any label you like; it mainly serves
    #          to keep downloads in separate folders
    #   id   - the ID of the question / topic / article (look it up yourself)
    #   type - 'question', 'topic' or 'article'
    #          (question: a question, topic: a topic, article: a column article)
    question_name_id_type_dict = dict(name='', id=0, type='')
    zhihuPicSpider(question_name_id_type_dict)