-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpixivCrawler_MAX_PLUS.py
257 lines (231 loc) · 12.4 KB
/
pixivCrawler_MAX_PLUS.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium import webdriver
import requests
from requests.exceptions import ChunkedEncodingError, ConnectionError
import re
import pyautogui
import time
from selenium.common.exceptions import WebDriverException, NoSuchElementException, NoSuchWindowException
# Configure the browser driver
def openUrl(arguments=None, browserDriver=None, url=None):
    """Start a Chrome WebDriver session and optionally open a URL.

    Args:
        arguments: Optional iterable of extra Chrome command-line switches,
            added after the built-in ``--disable-gpu``.
        browserDriver: Path to the ChromeDriver executable. ``None`` or an
            empty string is handed to ``Service`` as ``None`` (no explicit
            path).
        url: Page to load once the browser is running; skipped when falsy.

    Returns:
        The ``webdriver.Chrome`` instance, or ``None`` when a
        ``WebDriverException`` was raised while starting the browser or
        loading ``url``.
    """
    driver = None
    try:
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("--disable-gpu")
        # A None default (instead of a shared mutable list) means "no extras".
        for arg in arguments or ():
            chrome_options.add_argument(arg)
        # Treat an empty path the same as a missing one.
        services = Service(executable_path=browserDriver or None)
        driver = webdriver.Chrome(service=services, options=chrome_options)
        if url:
            driver.get(url)
        return driver
    except WebDriverException as e:
        print(f"WebDriverException: {e}")
        if driver is not None:
            # The browser started but navigation failed: shut it down so the
            # process is not orphaned when we report failure with None.
            try:
                driver.quit()
            except WebDriverException:
                # Best-effort cleanup; the original failure is already reported.
                pass
        return None
# Scroll the page
def Scroll_tobottom(driver = None):
    """Scroll to the bottom of the page repeatedly until its height stops changing.

    Each round sleeps two seconds, scrolls to ``document.body.scrollHeight``
    and re-reads the height; the loop ends when two consecutive readings match.

    Args:
        driver: The WebDriver whose current page is scrolled.
    """
    height_script = "return document.body.scrollHeight"
    previous_height = driver.execute_script(height_script)
    page_grew = True
    while page_grew:
        # Give lazily loaded content time to arrive before scrolling again.
        time.sleep(2)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # NOTE(review): implicitly_wait sets the driver's element-lookup
        # timeout (3 s from here on); it does not pause by itself.
        driver.implicitly_wait(3)
        current_height = driver.execute_script(height_script)
        page_grew = current_height != previous_height
        previous_height = current_height
# Fetch images
def get_imgElement(driver=None, mode=0, path=None, start_index=4):
    """Download the original image(s) of each artwork linked from the current page.

    Every ``<li>`` element from ``start_index`` onward is scanned for an
    ``href``. Each hit is opened in a new tab, its images are saved, and the
    tab is always closed again before moving on, even when processing fails.

    Args:
        driver: WebDriver positioned on a listing page. Required.
        mode: 1 saves through ``download_image1`` (GUI "save as"), 2 saves
            through ``download_image2`` (HTTP). 0 is rejected; any other
            value opens the pages but saves nothing.
        path: Directory prefix the image file name is appended to. Required.
        start_index: Index of the first ``<li>`` to treat as an artwork; the
            earlier ones are skipped. The right value depends on the page
            layout (the original note suggests 2-5 for an author page and 5
            for search results).

    Returns:
        None. Errors are printed rather than raised.
    """
    if driver is None or mode == 0 or path is None:
        print("driver is None or mode == 0 or path is wrong ")
        return
    try:
        # Remember the listing tab explicitly instead of assuming it is
        # window_handles[0]; handle ordering is not something to rely on.
        main_window = driver.current_window_handle
        imgElements = driver.find_elements(By.TAG_NAME, 'li')
        for imgElement in imgElements[start_index:]:
            match = re.search(r'href="(.*?)"', imgElement.get_attribute('outerHTML'))
            if not match:
                print("未找到 href")
                continue
            url = 'https://www.pixiv.net' + match.group(1)
            print("href:", url)
            handles_before = set(driver.window_handles)
            # Pass the URL as a script argument so quotes in it cannot break
            # out of the JavaScript string.
            driver.execute_script('window.open(arguments[0])', url)
            opened = [h for h in driver.window_handles if h not in handles_before]
            if not opened:
                # Without a new tab there is nothing to close; closing now
                # would kill the listing tab instead.
                print("未能打开详情页")
                continue
            driver.switch_to.window(opened[0])
            try:
                time.sleep(1)
                driver.implicitly_wait(5)
                _download_artwork_images(driver, mode, path)
            except Exception as e:
                # One broken artwork page must not abort the whole listing.
                print(f"处理图片时发生错误:{e}")
            finally:
                driver.close()
                driver.switch_to.window(main_window)
                time.sleep(3)
    except Exception as e:
        print(f"获取图片元素时发生错误:{e}")


def _download_artwork_images(driver, mode, path):
    """Save every original image of the artwork page in the current tab."""
    # "Show all" button of a multi-image artwork; the class name is
    # site-specific and may need updating.
    elements = driver.find_elements(By.CSS_SELECTOR, ".sc-emr523-0.guczbC")
    if not elements:
        _download_single_image(driver, mode, path)
        return
    elements[0].click()
    time.sleep(1)
    oringalImages = driver.find_elements(By.CSS_SELECTOR, '.gtm-expand-full-size-illust')
    print(f'{len(oringalImages)} 张原图')
    for image in oringalImages:
        # The href holds the original image URL; its last path segment is
        # used as the file name.
        image_url = image.get_attribute('href')
        match_title = re.search(r'([^\/]+)$', image_url or '')
        if not match_title:
            continue
        title = match_title.group(1)
        print(title)
        print(fr'{path}\{title}')
        _save_image(driver, image, image_url, mode, path, title)


def _download_single_image(driver, mode, path):
    """Save the one original image of an artwork page without a "show all" button."""
    oringalImage = driver.find_element(By.CSS_SELECTOR, '[role="presentation"]')
    print('1 张原图')
    match_url = re.search(r'href="(.*?)"', oringalImage.get_attribute('outerHTML'))
    if not match_url:
        return
    image_url = match_url.group(1)
    match_title = re.search(r'([^\/]+)$', image_url)
    if not match_title:
        return
    title = match_title.group(1)
    print(f'{title}正在下载')
    _save_image(driver, oringalImage, image_url, mode, path, title)


def _save_image(driver, element, image_url, mode, path, title):
    """Persist one image using the strategy selected by ``mode``."""
    if mode == 1:
        download_image1(driver, element, path, title)
    elif mode == 2:
        target = fr'{path}\{title}'
        print(target)
        download_image2(image_url, target)
# Save using pyautogui
def download_image1(driver = None,element = None,path = None,title = None):
    """Save an image through the browser's context menu, driven by pyautogui.

    Right-clicks ``element``, presses ``v`` in the context menu, types
    ``path\\title`` into whatever dialog has focus and confirms with Enter.

    Args:
        driver: WebDriver that owns ``element``. Required.
        element: The image element to right-click. Required.
        path: Directory prefix typed into the dialog.
        title: File name typed after ``path`` and a backslash.

    Raises:
        ValueError: If ``driver`` or ``element`` is None.
    """
    arguments_missing = driver is None or element is None
    if arguments_missing:
        raise ValueError("driver 和 element 参数不能为 None")
    # Hover over the image and open its context menu in one action chain.
    ActionChains(driver).move_to_element(element).context_click(element).perform()
    # 'v' is the menu shortcut used here; presumably "save image as" - verify
    # against the browser/locale in use.
    pyautogui.typewrite(['v'])
    time.sleep(1)
    # Type the destination into the dialog and confirm.
    destination = fr'{path}\{title}'
    pyautogui.write(destination, interval=0.1)
    pyautogui.press('enter')
    time.sleep(1)
# Save using requests; can run in the background (the AI-added error handling works really well)
def download_image2(image_url, save_path, retries=3, timeout=10):
    """Download an image over HTTP and write it to ``save_path``.

    Args:
        image_url: URL of the image to fetch.
        save_path: File path the response body is written to (binary mode).
        retries: Maximum number of attempts.
        timeout: Per-request timeout in seconds, passed to ``requests.get``.

    Returns:
        True when the file was written, False when every attempt failed
        (or ``retries`` is 0).
    """
    headers = {
        'referer': "https://www.pixiv.net/",  # pixiv image links are hotlink-protected, so a referer is sent
        "user-agent": '',  # fill in your own user-agent
        "cookie": ''  # fill in your own cookies
    }
    for attempt in range(retries):
        try:
            response = requests.get(image_url, headers=headers, timeout=timeout)
            response.raise_for_status()  # raises HTTPError for 4xx/5xx responses
            with open(save_path, 'wb') as file:
                file.write(response.content)
            print("Download complete")
            return True
        except requests.RequestException:
            # RequestException also covers read timeouts, which the narrower
            # (ChunkedEncodingError, ConnectionError, HTTPError) tuple let
            # escape even though a timeout is explicitly configured.
            print(f"下载失败,正在尝试第 {attempt + 1} 次重试...")
            if attempt == retries - 1:
                print(f"尝试了 {retries} 次后仍然失败,放弃下载。")
                return False  # nothing left to retry, so skip the back-off sleep
            time.sleep(1)
    return False
if __name__ == '__main__':
    # Interactive entry point: ask how to save and what to crawl, open the
    # matching pixiv page, then download page by page until no next page is left.

    # Chrome profile switch and ChromeDriver location; both are placeholders
    # to fill in before running.
    option = [r'--user-data-dir=']  # append the Chrome user-data directory after '=' to reuse a logged-in session
    browserDriver = r''  # path to the ChromeDriver executable
    mode = int(input("输入你想要的执行方式(输入1为使用pyautogui保存,2为使用requests保存,可在后台运行)"))
    path = input("输入保存路径:")
    way = int(input('你想要怎么爬取(输入0为输入关键词来爬取,输入1为输入作者id来爬取作者的全部作品)'))
    if way == 0:
        driver = openUrl(option, browserDriver, 'https://www.pixiv.net/')
    elif way == 1:
        authorId = input('请输入作者id:')
        driver = openUrl(option, browserDriver, f'https://www.pixiv.net/users/{authorId}/artworks')
    else:
        # Previously fell through with `driver` undefined and crashed with a
        # NameError; fail with a clear message instead.
        raise SystemExit(f'无效的爬取方式: {way}')
    if driver is not None:
        try:
            driver.implicitly_wait(30)
            if way == 0:
                # The search runs inside the try block so the browser is still
                # shut down by `finally` if the search box cannot be found.
                content = input('请输入想要搜索的内容:')
                # Fill in the search box's class selector from pixiv: start
                # with '.', replace spaces with '.', e.g. '.button.active'.
                searchContent = driver.find_element(By.CSS_SELECTOR, '')
                searchContent.send_keys(f'{content}\n')
            # Explicit wait used for locating the next-page button.
            wait = WebDriverWait(driver, 10)
            # Site-specific class chain of the pagination buttons; check it
            # against the live page if paging stops working.
            next_button_selector = '.sc-d98f2c-0.sc-xhhh7v-2.cCkJiq.sc-xhhh7v-1-filterProps-Styled-Component.kKBslM'
            while True:
                Scroll_tobottom(driver=driver)
                get_imgElement(driver=driver, mode=mode, path=path)
                next_buttons = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, next_button_selector)))
                if len(next_buttons) > 1:
                    # The second match is used as the "next" button.
                    nextButton = next_buttons[1]
                    isnext = nextButton.get_attribute('outerHTML')
                    if 'hidden' in isnext:
                        print("没有更多图片了")
                        break
                    nextButton.click()
                    time.sleep(5)
                else:
                    print("没有找到下一页按钮")
                    break
        except WebDriverException as e:
            print(f"发生错误:{e}")
        finally:
            driver.quit()
    else:
        print("浏览器打开失败")