meizispider.py
# -*- coding: utf-8 -*-
"""
Mzitu image spider; parses page URLs with XPath. 2018.11.28
"""
__author__ = "open_china"
import os
import time
from threading import Thread
import numpy as np
import requests
from lxml import html
from requests.cookies import RequestsCookieJar
def dev_list(lis, n):
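    """Split lis into batches of n items; any remainder becomes a shorter final batch."""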
size = int(len(lis) / n)
res_list = np.array(lis[:size * n]).reshape(size, n).tolist()
if size * n < len(lis):
res_list.append(lis[size * n:])
return res_list
class Spider(object):
def __init__(self):
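        """Set up request headers, the site base URL, and a pre-seeded cookie jar."""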
self.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
'Referer': "http://www.mzitu.com/"
}
self.baseUrl = "https://www.mzitu.com"
self.cookie_jar = RequestsCookieJar()
self.cookie_jar.set("MZITU", "B1CCDD4B4BC886BF99364C72C8AE1C01:FG=1", domain="mzitu.com")
    def get_page(self):
        """Fetch the homepage and return the list of gallery URLs found in the #pins list."""
        print(self.baseUrl)
        resp = requests.get(self.baseUrl, headers=self.headers)
        print(resp)
        selector = html.fromstring(resp.content)
        urls = selector.xpath('//ul[@id="pins"]/li/a/@href')
        print(urls)
        return urls
def get_pic_link(self, url):
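        """Visit one gallery, read its page count, and collect each page's image URL; returns (title, jpg_list)."""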
print("we visit:" + url)
try:
sel = html.fromstring(requests.get(url, headers=self.headers).content)
        except Exception as e:
            print(e)
            return None, []
total = sel.xpath('//div[@class="pagenavi"]/a/span/text()')[-2]
title = sel.xpath('//h2[@class="main-title"]/text()')[0]
        print(title + "  total: " + str(total) + " images")
jpg_list = []
print("total:", total)
        for i in range(int(total)):
            link = "{}/{}".format(url, i + 1)
            print(link)
            s = html.fromstring(requests.get(link, headers=self.headers).content)
            src = s.xpath('//div[@class="main-image"]/p/a/img/@src')
            if src:  # some pages may lack the image node
                jpg_list.append(src[0])
            else:
                print("pass-----")
return title, jpg_list
def get_pic(self, title, pic_list):
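        """Create a folder named after the gallery and download its images in threaded batches."""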
count = len(pic_list)
dir_name = u"【%sP】%s" % (str(count), title)
if not os.path.exists(dir_name):
os.makedirs(dir_name)
k = 1
        down_list = dev_list(pic_list, 60)  # split the image list into batches of 60 per download round
for per in down_list:
threads = []
            for url in per:  # one download thread per image in this batch
filename = '%s/%s/%s.jpg' % (os.path.abspath('.'), dir_name, k)
k = k + 1
t = Thread(target=self.download, args=[filename, url])
t.start()
threads.append(t)
            for t in threads:  # wait for every thread in this batch to finish
t.join()
def download(self, filename, url):
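        """Download a single image to filename; files that already exist are skipped."""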
        if os.path.exists(filename):
            return
        try:
            content = requests.get(url, headers=self.headers, timeout=60).content
        except Exception as e:
            print(e)
            return
        with open(filename, "wb") as jpg:  # write only after the request succeeds, so no empty files are left behind
            jpg.write(content)
        print("downloaded: " + filename)
        time.sleep(0.1)
def start(self):
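        """Crawl every gallery linked from the homepage and download its images."""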
urls = self.get_page()
        for item in urls:
            title, pic_list = self.get_pic_link(item)
            if not pic_list:  # skip galleries that failed to load or had no images
                continue
            self.get_pic(title, pic_list)
# break
if __name__ == "__main__":
Spider().start()