# pinterest_crawler.py
import os
import re
import time
from pattern.web import URL, DOM
from selenium import webdriver
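# Third-party dependencies (assumed): the Pattern library supplies pattern.web's
# URL/DOM helpers, and Selenium drives Chrome, so a matching chromedriver must
# be available to Selenium.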
class PinterestCrawler(object):
    def __init__(self, search_key=''):
        """ Pinterest image search class.
        Args:
            search_key (str or list): search keyword(s) to be entered.
        """
        if isinstance(search_key, str):
            ## convert a single keyword to a list to standardize the processing
            self.g_search_key_list = [search_key]
        elif isinstance(search_key, list):
            self.g_search_key_list = search_key
        else:
            raise TypeError('keyword not of type str or list')
        self.g_search_key = ''
        ## user options
        self.image_dl_per_search = 200  # by default (if not explicitly set via set_num_image_to_dl), this number of images will be downloaded
        ## url construct string text
        self.prefix_of_search_url = "https://pl.pinterest.com/search/pins/?q="
        self.target_url_str = ''
        ## storage
        self.pic_url_list = []
        self.pic_info_list = []
        ## file and folder path
        # save to current dir "."
        self.folder_main_dir_prefix = r'.'
    def reformat_search_for_spaces(self):
        """
        Replace spaces in the search term with "+" so it can be used in a url.
        Eg: "Cookie fast" becomes "Cookie+fast".
        Steps:
            strip any trailing spaces if present
            replace spaces in self.g_search_key with "+"
        """
        self.g_search_key = self.g_search_key.rstrip().replace(' ', '+')
    def set_num_image_to_dl(self, num_image):
        """ Set the number of images to download. Stored in self.image_dl_per_search.
        Args:
            num_image (int): number of images to download per search.
        """
        self.image_dl_per_search = num_image
    def get_searchlist_fr_file(self, filename):
        """Get the search list from a file, one phrase per line.
        Allows a large number of phrases to be queued up.
        Replaces self.g_search_key_list.
        Args:
            filename (str): full file path
        """
        with open(filename, 'r') as f:
            self.g_search_key_list = [line.strip() for line in f if line.strip()]
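    # Hypothetical search_list.txt contents, one phrase per line:
    #   cookie fast
    #   chocolate cake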
    def formed_search_url(self):
        ''' Form the search url for the currently selected key phrase.
        Reads self.g_search_key and sets self.target_url_str.
        '''
        self.reformat_search_for_spaces()
        self.target_url_str = self.prefix_of_search_url + self.g_search_key
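    # Example: the phrase "Cookie fast" yields the search url
    #   https://pl.pinterest.com/search/pins/?q=Cookie+fast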
    def multi_search_download(self):
        """ Run the full search-and-download cycle for every keyword in the list."""
        self.create_folder()
        for indiv_search in self.g_search_key_list:
            self.pic_url_list = []
            self.pic_info_list = []
            self.g_search_key = indiv_search
            self.formed_search_url()
            self.retrieve_source_fr_html()
            self.extract_pic_url()
            indiv_search_dir_path = os.path.join(self.gs_raw_dirpath, self.g_search_key)
            if not os.path.exists(indiv_search_dir_path):
                os.makedirs(indiv_search_dir_path)
            self.downloading_all_photos()
            self.save_infolist_to_file()
    def retrieve_source_fr_html(self):
        """ Use selenium to load the search page, scroll down to trigger
        lazy loading, then capture the final page source.
        """
        options = webdriver.ChromeOptions()
        # Path to your chrome profile (user specific; adjust as needed)
        options.add_argument(r"user-data-dir=C:\Users\pan\AppData\Local\Google\Chrome\User Data")
        driver = webdriver.Chrome(options=options)
        driver.get(self.target_url_str)
        ## wait for log in, then get the page source.
        try:
            driver.execute_script("window.scrollTo(0, 10000000)")
            time.sleep(1)
            self.temp_page_source = driver.page_source
            # here we assume that one scroll gathers roughly 20 new images
            for i in range(0, int(self.image_dl_per_search / 20)):
                time.sleep(2)
                driver.execute_script("window.scrollTo(0, 10000000)")
        except Exception as e:
            print(e)
            print('not able to scroll the page')
        # grab the page source before shutting the browser down
        self.page_source = driver.page_source
        driver.quit()
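    # A sketch of an alternative to the fixed sleeps above: keep scrolling
    # until the page height stops growing (plain Selenium, no extra deps):
    #   last_height = driver.execute_script("return document.body.scrollHeight")
    #   while True:
    #       driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
    #       time.sleep(2)
    #       new_height = driver.execute_script("return document.body.scrollHeight")
    #       if new_height == last_height:
    #           break
    #       last_height = new_height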
    def extract_pic_url(self):
        """ Extract all the raw pic urls into self.pic_url_list.
        """
        dom = DOM(self.page_source)
        tag_list = dom('img')
        # start the loop from index 1, because index 0 contains an unrelated
        # image (connected with the username account?)
        for tag in tag_list[1:self.image_dl_per_search + 1]:
            try:
                tar_str = re.search("3x,.*jpg", tag.source)
                self.pic_url_list.append(tar_str.group(0).replace("3x, ", ""))
            except AttributeError:
                # re.search returned None: no "3x, ... jpg" fragment in this tag
                print('error parsing', tag)
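    # Hypothetical srcset fragment the regex above targets:
    #   srcset="https://i.pinimg.com/236x/aa.jpg 1x, https://i.pinimg.com/474x/aa.jpg 2x,
    #           https://i.pinimg.com/736x/aa.jpg 3x, https://i.pinimg.com/originals/aa.jpg 4x"
    # re.search("3x,.*jpg", ...) matches from "3x," greedily up to the last "jpg",
    # and the replace() strips the leading "3x, ", leaving a bare image url.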
    def create_folder(self):
        """
        Create a folder for the downloaded data, segregated by date.
        """
        self.gs_raw_dirpath = os.path.join(self.folder_main_dir_prefix, time.strftime("_%d_%b%y", time.localtime()))
        if not os.path.exists(self.gs_raw_dirpath):
            os.makedirs(self.gs_raw_dirpath)
    def downloading_all_photos(self):
        """ Download all photos to the search-specific folder.
        """
        print("For " + str(self.g_search_key) + " I've found " + str(len(self.pic_url_list)) + " photos. Now I will download them!")
        self.create_folder()
        pic_counter = 1
        for url_link in self.pic_url_list:
            print(pic_counter)
            pic_prefix_str = self.g_search_key + str(pic_counter)
            try:
                self.download_single_image(url_link, pic_prefix_str)
                pic_counter = pic_counter + 1
            except Exception:
                continue
    def download_single_image(self, url_link, pic_prefix_str):
        """ Download the image at the given url link.
        Args:
            url_link (str): url of the image.
            pic_prefix_str (str): prefix used to give the pic a unique filename.
        """
        self.download_fault = 0
        file_ext = os.path.splitext(url_link)[1]  # used to check for a valid pic ext
        temp_filename = pic_prefix_str + file_ext
        temp_filename_full_path = os.path.join(self.gs_raw_dirpath, self.g_search_key, temp_filename)
        valid_image_ext_list = ['.jpg']  # not comprehensive
        if file_ext not in valid_image_ext_list:
            return  # return if not a valid image extension
        url = URL(url_link)
        print(url_link)
        self.pic_info_list.append(pic_prefix_str + ': ' + url_link)
        try:
            with open(temp_filename_full_path, 'wb') as f:
                f.write(url.download())  # if there is a problem, skip this image
        except Exception:
            print('Problem with processing this data: ', url_link)
            self.download_fault = 1
    def save_infolist_to_file(self):
        """ Save the info list to file.
        """
        temp_filename_full_path = os.path.join(self.gs_raw_dirpath, self.g_search_key, self.g_search_key + '_info.txt')
        with open(temp_filename_full_path, 'w') as f:
            for n in self.pic_info_list:
                f.write(n)
                f.write('\n')
if __name__ == '__main__':
    crawler = PinterestCrawler('')  # leave blank when the search list comes from a file
    searchlist_filename = r'search_list.txt'
    crawler.set_num_image_to_dl(3000)  # number of images to download per single search
    crawler.get_searchlist_fr_file(searchlist_filename)  # replace the search list
    crawler.multi_search_download()
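# Minimal single-keyword usage (a sketch; assumes Chrome plus a matching
# chromedriver are installed and the Pattern library is importable):
#   crawler = PinterestCrawler('cookie fast')
#   crawler.set_num_image_to_dl(40)
#   crawler.multi_search_download()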