From efb0bf85fab9bebc0c836592c601e9c36e45d466 Mon Sep 17 00:00:00 2001 From: iamatulsingh Date: Sat, 26 Oct 2024 10:29:20 +0200 Subject: [PATCH] fix: parse and remove some symbols appeared in the url --- pinscrape/__init__.py | 2 +- pinscrape/_version.py | 2 +- pinscrape/pinscrape.py | 55 ++++++++++++++++++++++++------------------ 3 files changed, 34 insertions(+), 25 deletions(-) diff --git a/pinscrape/__init__.py b/pinscrape/__init__.py index 044e1ef..b980640 100644 --- a/pinscrape/__init__.py +++ b/pinscrape/__init__.py @@ -4,7 +4,7 @@ __version__ = _version __author__ = "Atul Singh" -__copyright__ = "Copyright © 2021-2023" +__copyright__ = "Copyright © 2021-2024" __license__ = "MIT" __maintainer__ = "Atul Singh" __email__ = "atulsingh0401@gmail.com" diff --git a/pinscrape/_version.py b/pinscrape/_version.py index 3348d7f..79e4386 100644 --- a/pinscrape/_version.py +++ b/pinscrape/_version.py @@ -1 +1 @@ -__version__ = "3.2.3" +__version__ = "3.2.4" diff --git a/pinscrape/pinscrape.py b/pinscrape/pinscrape.py index ca98095..bb3db64 100644 --- a/pinscrape/pinscrape.py +++ b/pinscrape/pinscrape.py @@ -15,36 +15,43 @@ class PinterestImageScraper: def __init__(self): self.json_data_list = [] self.unique_img = [] + self.error_stack = [] # ---------------------------------------- GET GOOGLE RESULTS --------------------------------- @staticmethod - def get_pinterest_links(body, max_images: int): + def get_pinterest_links(body): searched_urls = [] html = soup(body, 'html.parser') all_urls = [] links = html.select('#b_results cite') for link in links: link = link.text - all_urls.append(link) - if "pinterest" in link: - searched_urls.append(link) - # stops adding links if the limit has been reached - if max_images is not None and max_images == len(searched_urls): - break + all_urls.append(link.replace(' › ', '/')) + if "pinterest" in link and not "ideas" in link: + searched_urls.append(link.replace(' › ', '/')) + return searched_urls, all_urls # --------------------------
save json data from source code of given pinterest url ------------- - def get_source(self, url: str, proxies: dict) -> None: - try: - res = get(url, proxies=proxies) - except Exception: - return - html = soup(res.text, 'html.parser') - json_data = html.find_all("script", attrs={"id": "__PWS_INITIAL_PROPS__"}) - if not len(json_data): - json_data = html.find_all("script", attrs={"id": "__PWS_DATA__"}) - - self.json_data_list.append(json.loads(json_data[0].string)) if len(json_data) else self.json_data_list.append({}) + def get_source(self, urls: list, proxies: dict, max_images: int) -> None: + counter = 1 + for extracted_url in urls: + try: + res = get(extracted_url, proxies=proxies) + except Exception as e: + self.error_stack.append(e.args) + continue + html = soup(res.text, 'html.parser') + json_data = html.find_all("script", attrs={"id": "__PWS_INITIAL_PROPS__"}) + if not len(json_data): + json_data = html.find_all("script", attrs={"id": "__PWS_DATA__"}) + + self.json_data_list.append(json.loads(json_data[0].string)) if len(json_data) else self.json_data_list.append({}) + # stops adding links if the limit has been reached + if max_images is not None and max_images == counter: + break + + counter += 1 # --------------------------- READ JSON OF PINTEREST WEBSITE ---------------------- def save_image_url(self, max_images: int) -> list: @@ -67,7 +74,8 @@ def save_image_url(self, max_images: int) -> list: url_list.append(url) if max_images is not None and max_images == len(url_list): return list(set(url_list)) - except Exception: + except Exception as e: + self.error_stack.append(e.args) continue return list(set(url_list)) @@ -110,7 +118,7 @@ def start_scraping(max_images, key=None, proxies: dict = {}): keyword = keyword.replace("+", "%20") url = f'https://www.bing.com/search?q={keyword}&first=1&FORM=PERE' res = get(url, proxies=proxies, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 
Safari/537.36 Edg/125.0.0.0"}) - searched_urls, links = PinterestImageScraper.get_pinterest_links(res.content, max_images) + searched_urls, links = PinterestImageScraper.get_pinterest_links(res.content) return searched_urls, key.replace(" ", "_"), res.status_code, links @@ -119,8 +127,8 @@ def scrape(self, key: str = None, output_folder: str = "", proxies: dict = {}, t self.unique_img = [] self.json_data_list = [] - for i in extracted_urls: - self.get_source(i, proxies) + # for i in extracted_urls: + self.get_source(extracted_urls, proxies, max_images) # get all urls of images and save in a list urls_list = self.save_image_url(max_images) @@ -131,7 +139,8 @@ def scrape(self, key: str = None, output_folder: str = "", proxies: dict = {}, t "urls_list": urls_list, "searched_urls": links, "extracted_urls": extracted_urls, - "keyword": key + "keyword": key, + "error_stack": self.error_stack, } # download images from saved images url