fix: parse and remove some symbols appearing in the url
iamatulsingh committed Oct 26, 2024
1 parent 448d673 · commit efb0bf8
Showing 3 changed files with 34 additions and 25 deletions.
pinscrape/__init__.py (2 changes: 1 addition & 1 deletion)
@@ -4,7 +4,7 @@
 __version__ = _version
 __author__ = "Atul Singh"
-__copyright__ = "Copyright © 2021-2023"
+__copyright__ = "Copyright © 2021-2024"
 __license__ = "MIT"
 __maintainer__ = "Atul Singh"
 __email__ = "atulsingh0401@gmail.com"
pinscrape/_version.py (2 changes: 1 addition & 1 deletion)
@@ -1 +1 @@
-__version__ = "3.2.3"
+__version__ = "3.2.4"
pinscrape/pinscrape.py (55 changes: 32 additions & 23 deletions)
@@ -15,36 +15,43 @@ class PinterestImageScraper:
     def __init__(self):
         self.json_data_list = []
         self.unique_img = []
+        self.error_stack = []
 
     # ---------------------------------------- GET GOOGLE RESULTS ---------------------------------
     @staticmethod
-    def get_pinterest_links(body, max_images: int):
+    def get_pinterest_links(body):
         searched_urls = []
         html = soup(body, 'html.parser')
         all_urls = []
         links = html.select('#b_results cite')
         for link in links:
             link = link.text
-            all_urls.append(link)
-            if "pinterest" in link:
-                searched_urls.append(link)
-                # stops adding links if the limit has been reached
-                if max_images is not None and max_images == len(searched_urls):
-                    break
+            all_urls.append(link.replace(' › ', '/'))
+            if "pinterest" in link and not "ideas" in link:
+                searched_urls.append(link.replace(' › ', '/'))
 
         return searched_urls, all_urls

     # -------------------------- save json data from source code of given pinterest url -------------
-    def get_source(self, url: str, proxies: dict) -> None:
-        try:
-            res = get(url, proxies=proxies)
-        except Exception:
-            return
-        html = soup(res.text, 'html.parser')
-        json_data = html.find_all("script", attrs={"id": "__PWS_INITIAL_PROPS__"})
-        if not len(json_data):
-            json_data = html.find_all("script", attrs={"id": "__PWS_DATA__"})
-
-        self.json_data_list.append(json.loads(json_data[0].string)) if len(json_data) else self.json_data_list.append({})
+    def get_source(self, urls: list, proxies: dict, max_images: int) -> None:
+        counter = 1
+        for extracted_url in urls:
+            try:
+                res = get(extracted_url, proxies=proxies)
+            except Exception as e:
+                self.error_stack.append(e.args)
+                continue
+            html = soup(res.text, 'html.parser')
+            json_data = html.find_all("script", attrs={"id": "__PWS_INITIAL_PROPS__"})
+            if not len(json_data):
+                json_data = html.find_all("script", attrs={"id": "__PWS_DATA__"})
+
+            self.json_data_list.append(json.loads(json_data[0].string)) if len(json_data) else self.json_data_list.append({})
+            # stops adding links if the limit has been reached
+            if max_images is not None and max_images == counter:
+                break
+
+            counter += 1
 
     # --------------------------- READ JSON OF PINTEREST WEBSITE ----------------------
     def save_image_url(self, max_images: int) -> list:
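
Note: the heart of this fix is that Bing renders result URLs inside <cite> elements with " › " breadcrumb separators, so the scraped text is not a fetchable URL until those symbols are replaced. A minimal sketch of the behaviour the reworked get_pinterest_links relies on (the sample markup is illustrative, not taken from this commit):

    from bs4 import BeautifulSoup

    # Illustrative Bing SERP fragment; real markup carries more attributes.
    body = '<ol id="b_results"><li><cite>https://www.pinterest.com › pin › 804103795578831123</cite></li></ol>'
    html = BeautifulSoup(body, 'html.parser')
    for cite in html.select('#b_results cite'):
        text = cite.text
        # " › " is display decoration, not part of the URL; swap it for "/"
        # and skip aggregated "ideas" pages, as the new filter does.
        if "pinterest" in text and "ideas" not in text:
            print(text.replace(' › ', '/'))
    # -> https://www.pinterest.com/pin/804103795578831123
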
@@ -67,7 +74,8 @@ def save_image_url(self, max_images: int) -> list:
                     url_list.append(url)
                     if max_images is not None and max_images == len(url_list):
                         return list(set(url_list))
-            except Exception:
+            except Exception as e:
+                self.error_stack.append(e.args)
                 continue
 
         return list(set(url_list))
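
Note: both loops above now append e.args to self.error_stack and carry on instead of aborting, and get_source walks the whole URL list itself, honouring max_images. A hedged usage sketch (the pin URL is a placeholder and live network access is assumed):

    scraper = PinterestImageScraper()
    scraper.get_source(
        ["https://www.pinterest.com/pin/804103795578831123/"],  # placeholder URL
        proxies={},
        max_images=5,
    )
    # Failed fetches no longer abort the run; they accumulate here instead.
    print(len(scraper.json_data_list), "pages parsed,", len(scraper.error_stack), "errors recorded")
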
@@ -110,7 +118,7 @@ def start_scraping(max_images, key=None, proxies: dict = {}):
         keyword = keyword.replace("+", "%20")
         url = f'https://www.bing.com/search?q={keyword}&first=1&FORM=PERE'
         res = get(url, proxies=proxies, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0"})
-        searched_urls, links = PinterestImageScraper.get_pinterest_links(res.content, max_images)
+        searched_urls, links = PinterestImageScraper.get_pinterest_links(res.content)
 
         return searched_urls, key.replace(" ", "_"), res.status_code, links

@@ -119,8 +127,8 @@ def scrape(self, key: str = None, output_folder: str = "", proxies: dict = {}, t
         self.unique_img = []
         self.json_data_list = []
 
-        for i in extracted_urls:
-            self.get_source(i, proxies)
+        # for i in extracted_urls:
+        self.get_source(extracted_urls, proxies, max_images)
 
         # get all urls of images and save in a list
         urls_list = self.save_image_url(max_images)
@@ -131,7 +139,8 @@
"urls_list": urls_list,
"searched_urls": links,
"extracted_urls": extracted_urls,
"keyword": key
"keyword": key,
"error_stack": self.error_stack,
}

# download images from saved images url
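
Note: with error_stack surfaced in the returned details, callers can check for partial failures. A hedged sketch (the keyword and image cap are example values, and it assumes scrape accepts max_images as its body suggests; the rest of scrape's signature is unchanged by this commit):

    details = PinterestImageScraper().scrape(key="rose flowers", max_images=2)
    if details["error_stack"]:
        print("some page fetches failed:", details["error_stack"])
    else:
        print(len(details["urls_list"]), "image urls collected")
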