From efb0bf85fab9bebc0c836592c601e9c36e45d466 Mon Sep 17 00:00:00 2001 From: iamatulsingh Date: Sat, 26 Oct 2024 10:29:20 +0200 Subject: [PATCH] fix: parse and remove some symbols appeared in the url --- pinscrape/__init__.py | 2 +- pinscrape/_version.py | 2 +- pinscrape/pinscrape.py | 55 ++++++++++++++++++++++++------------------ 3 files changed, 34 insertions(+), 25 deletions(-) diff --git a/pinscrape/__init__.py b/pinscrape/__init__.py index 044e1ef..b980640 100644 --- a/pinscrape/__init__.py +++ b/pinscrape/__init__.py @@ -4,7 +4,7 @@ __version__ = _version __author__ = "Atul Singh" -__copyright__ = "Copyright © 2021-2023" +__copyright__ = "Copyright © 2021-2024" __license__ = "MIT" __maintainer__ = "Atul Singh" __email__ = "atulsingh0401@gmail.com" diff --git a/pinscrape/_version.py b/pinscrape/_version.py index 3348d7f..79e4386 100644 --- a/pinscrape/_version.py +++ b/pinscrape/_version.py @@ -1 +1 @@ -__version__ = "3.2.3" +__version__ = "3.2.4" diff --git a/pinscrape/pinscrape.py b/pinscrape/pinscrape.py index ca98095..bb3db64 100644 --- a/pinscrape/pinscrape.py +++ b/pinscrape/pinscrape.py @@ -15,36 +15,43 @@ class PinterestImageScraper: def __init__(self): self.json_data_list = [] self.unique_img = [] + self.error_stack = [] # ---------------------------------------- GET GOOGLE RESULTS --------------------------------- @staticmethod - def get_pinterest_links(body, max_images: int): + def get_pinterest_links(body): searched_urls = [] html = soup(body, 'html.parser') all_urls = [] links = html.select('#b_results cite') for link in links: link = link.text - all_urls.append(link) - if "pinterest" in link: - searched_urls.append(link) - # stops adding links if the limit has been reached - if max_images is not None and max_images == len(searched_urls): - break + all_urls.append(link.replace(' › ', '/')) + if "pinterest" in link and not "ideas" in link: + searched_urls.append(link.replace(' › ', '/')) + return searched_urls, all_urls # --------------------------
save json data from source code of given pinterest url ------------- - def get_source(self, url: str, proxies: dict) -> None: - try: - res = get(url, proxies=proxies) - except Exception: - return - html = soup(res.text, 'html.parser') - json_data = html.find_all("script", attrs={"id": "__PWS_INITIAL_PROPS__"}) - if not len(json_data): - json_data = html.find_all("script", attrs={"id": "__PWS_DATA__"}) - - self.json_data_list.append(json.loads(json_data[0].string)) if len(json_data) else self.json_data_list.append({}) + def get_source(self, urls: list, proxies: dict, max_images: int) -> None: + counter = 1 + for extracted_url in urls: + try: + res = get(extracted_url, proxies=proxies) + except Exception as e: + self.error_stack.append(e.args) + continue + html = soup(res.text, 'html.parser') + json_data = html.find_all("script", attrs={"id": "__PWS_INITIAL_PROPS__"}) + if not len(json_data): + json_data = html.find_all("script", attrs={"id": "__PWS_DATA__"}) + + self.json_data_list.append(json.loads(json_data[0].string)) if len(json_data) else self.json_data_list.append({}) + # stops adding links if the limit has been reached + if max_images is not None and max_images == counter: + break + + counter += 1 # --------------------------- READ JSON OF PINTEREST WEBSITE ---------------------- def save_image_url(self, max_images: int) -> list: @@ -67,7 +74,8 @@ def save_image_url(self, max_images: int) -> list: url_list.append(url) if max_images is not None and max_images == len(url_list): return list(set(url_list)) - except Exception: + except Exception as e: + self.error_stack.append(e.args) continue return list(set(url_list)) @@ -110,7 +118,7 @@ def start_scraping(max_images, key=None, proxies: dict = {}): keyword = keyword.replace("+", "%20") url = f'https://www.bing.com/search?q={keyword}&first=1&FORM=PERE' res = get(url, proxies=proxies, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 
Safari/537.36 Edg/125.0.0.0"}) - searched_urls, links = PinterestImageScraper.get_pinterest_links(res.content, max_images) + searched_urls, links = PinterestImageScraper.get_pinterest_links(res.content) return searched_urls, key.replace(" ", "_"), res.status_code, links @@ -119,8 +127,8 @@ def scrape(self, key: str = None, output_folder: str = "", proxies: dict = {}, t self.unique_img = [] self.json_data_list = [] - for i in extracted_urls: - self.get_source(i, proxies) + # for i in extracted_urls: + self.get_source(extracted_urls, proxies, max_images) # get all urls of images and save in a list urls_list = self.save_image_url(max_images) @@ -131,7 +139,8 @@ def scrape(self, key: str = None, output_folder: str = "", proxies: dict = {}, t "urls_list": urls_list, "searched_urls": links, "extracted_urls": extracted_urls, - "keyword": key + "keyword": key, + "error_stack": self.error_stack, } # download images from saved images url