diff --git a/.gitignore b/.gitignore
index c46238f..a459820 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,4 +9,5 @@ pinscrape.egg-info
**/pinscrape/*.pyc
venv
output
-.idea
\ No newline at end of file
+.idea
+data
\ No newline at end of file
diff --git a/README.md b/README.md
index 258fb23..df6875c 100644
--- a/README.md
+++ b/README.md
@@ -1,30 +1,38 @@
# pinscrape
-
+
[![built with Python3](https://img.shields.io/badge/built%20with-Python3.6+-red.svg)](https://www.python.org/)
-### This package can be use to scrape images from pinterest just by using any search keywords. Install it just by using
+### This package can be used to scrape images from pinterest just by using any search keywords. Install it just by using
`pip install pinscrape`
+
### How to use?
```python
-from pinscrape import pinscrape
-details = pinscrape.scraper.scrape("messi", "output", {}, 10, 15)
-
-if details["isDownloaded"]:
- print("\nDownloading completed !!")
- print(f"\nTotal urls found: {len(details['extracted_urls'])}")
- print(f"\nTotal images downloaded (including duplicate images): {len(details['urls_list'])}")
- print(details)
-else:
- print("\nNothing to download !!", details)
-```
+from pinscrape import scraper, Pinterest
+
+
+keyword = "messi"
+output_folder = "output"
+proxies = {}
+number_of_workers = 10
+images_to_download = 1
-`scrape("messi", "output", {}, 10, 15)`
-- `"messi"` is keyword
-- `"output"` is path to a folder where you want to save images
-- `{}` is proxy list if you want to add one (optional)
-- `10` is a number of threads you want to use for downloading those images (optional)
-- `15` is the maximum number of images you want to download (optional)
+def using_search_engine():
+ details = scraper.scrape(keyword, output_folder, proxies, number_of_workers, images_to_download)
+ if details["isDownloaded"]:
+ print("\nDownloading completed !!")
+ print(f"\nTotal urls found: {len(details['extracted_urls'])}")
+ print(f"\nTotal images downloaded (including duplicate images): {len(details['urls_list'])}")
+ print(details)
+ else:
+ print("\nNothing to download !!", details)
+
+
+def using_pinterest_apis():
+ p = Pinterest(proxies=proxies) # you can also pass `user_agent` here.
+ images_url = p.search(keyword, images_to_download)
+ p.download(url_list=images_url, number_of_workers=number_of_workers, output_folder=output_folder)
+```
diff --git a/e2e.py b/e2e.py
index c897b91..e136181 100644
--- a/e2e.py
+++ b/e2e.py
@@ -1,10 +1,15 @@
-from pinscrape import scraper
+from pinscrape import scraper, Pinterest
-details = scraper.scrape("messi", "output", {}, 10, 1)
+keyword = "messi"
+output_folder = "output"
+proxies = {}
+number_of_workers = 10
+images_to_download = 1
def test_single_data():
+ details = scraper.scrape(keyword, output_folder, proxies, number_of_workers, images_to_download)
if details["isDownloaded"]:
print("\nDownloading completed !!")
print(f"\nTotal urls found: {len(details['extracted_urls'])}")
@@ -14,3 +19,9 @@ def test_single_data():
print("\nNothing to download !!", details)
assert len(details['extracted_urls']) > 0
+
+def test_v2():
+ p = Pinterest()
+ images_url = p.search(keyword, images_to_download)
+ p.download(url_list=images_url, number_of_workers=number_of_workers, output_folder=output_folder)
+ assert len(images_url) == images_to_download
diff --git a/pinscrape/__init__.py b/pinscrape/__init__.py
index b980640..7aa6798 100644
--- a/pinscrape/__init__.py
+++ b/pinscrape/__init__.py
@@ -1,5 +1,6 @@
from ._version import __version__ as _version
from .pinscrape import scraper
+from .v2 import Pinterest
__version__ = _version
diff --git a/pinscrape/_version.py b/pinscrape/_version.py
index 79e4386..ce1305b 100644
--- a/pinscrape/_version.py
+++ b/pinscrape/_version.py
@@ -1 +1 @@
-__version__ = "3.2.4"
+__version__ = "4.0.0"
diff --git a/pinscrape/v2.py b/pinscrape/v2.py
new file mode 100644
index 0000000..b967888
--- /dev/null
+++ b/pinscrape/v2.py
@@ -0,0 +1,220 @@
+import requests
+import json
+import time
+import cv2
+import numpy as np
+import logging
+
+from urllib.parse import quote_plus, quote
+from concurrent.futures import ThreadPoolExecutor
+from os import path, makedirs, getcwd
+
+
+class Pinterest:
+ def __init__(self, user_agent: str = "", proxies: dict = None):
+ self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.6668.71 Safari/537.36" \
+ if not user_agent else user_agent
+ self.BASE_URL = "https://www.pinterest.com"
+ self.BASE_HEADERS = {
+ 'Host': 'www.pinterest.com',
+ 'Sec-Ch-Ua': '"Chromium";v="129", "Not=A?Brand";v="8"',
+ 'Sec-Ch-Ua-Mobile': '?0',
+ 'Sec-Ch-Ua-Platform': '"Windows"',
+ 'Accept-Language': 'en-GB,en;q=0.9',
+ 'Upgrade-Insecure-Requests': '1',
+ 'User-Agent': self.user_agent,
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+ 'Sec-Fetch-Site': 'none',
+ 'Sec-Fetch-Mode': 'navigate',
+ 'Sec-Fetch-User': '?1',
+ 'Sec-Fetch-Dest': 'document',
+ 'Accept-Encoding': 'gzip, deflate, br',
+ 'Priority': 'u=0, i',
+ 'Connection': 'keep-alive',
+ }
+ self.data_dir = "data"
+ self.client_context = {}
+ self.unique_images = []
+ self.cookies = self.read_file("cookies.json").get("cookies", "")
+ self.proxies = proxies if proxies else {}
+
+ if not self.cookies:
+ self.cookies = self.fetch_cookies()
+ else:
+ logging.debug("Using saved cookies")
+
+ self.time_epoch = self.read_file("time_epoch.json").get('time_epoch', '')
+ if not self.time_epoch:
+ self.time_epoch = self.get_current_epoch()
+ self.save_file("time_epoch.json", {"time_epoch": self.time_epoch})
+ logging.info(f"New time epoch saved")
+ else:
+ current_epoch = self.get_current_epoch()
+ if float(self.time_epoch) < current_epoch:
+ self.update_time_epoch()
+ else:
+ logging.info(f"Using saved time epoch")
+
+ def update_time_epoch(self) -> None:
+ """
+ update_time_epoch will update current time epoch
+ :return: None
+ """
+ self.time_epoch = self.get_current_epoch()
+ self.save_file("time_epoch.json", {"time_epoch": self.time_epoch})
+ logging.info(f"New time epoch saved")
+
+ def save_file(self, file_name: str, content: dict) -> None:
+ """
+ save_file will save file with dict/list as content
+ :param file_name: file name that will be used to save a file
+ :param content: content should be dict/list
+ :return: None
+ """
+ makedirs(self.data_dir, exist_ok=True)
+ if path.exists(path.join(self.data_dir, file_name)):
+ with open(path.join(self.data_dir, file_name), "r") as f:
+ data = json.load(f)
+ for key in list(content.keys()):
+ data[key] = content[key]
+ else:
+ data = content
+
+ with open(path.join(self.data_dir, file_name), "w") as f:
+ json.dump(data, f)
+
+ def read_file(self, file_name: str) -> dict:
+ """
+ read_file will read file and return its content.
+ :param file_name: file that needs to be read
+ :return: dict content of the file
+ """
+ if not path.exists(path.join(self.data_dir, file_name)):
+ return {}
+
+ with open(path.join(self.data_dir, file_name), "r") as f:
+ data = json.load(f)
+ return data
+
+ @staticmethod
+ def image_hash(image: cv2.Mat, hash_size: int = 8) -> int:
+ """
+ image_hash will return the hash of an image
+ :param image: multi-dimension array of image data
+ :param hash_size: image hash size
+ :return: integer hash value representing the differences between adjacent pixels
+ """
+ resized = cv2.resize(image, (hash_size + 1, hash_size))
+ diff = resized[:, 1:] > resized[:, :-1]
+ return sum([2 ** i for (i, v) in enumerate(diff.flatten()) if v])
+
+ def saving_image(self, var: list) -> None:
+ """
+ saving_image downloads and save image(s) on the disk
+ :param var: list of params passed through thread
+ :return: None
+ """
+ url_list, folder_name = var
+ makedirs(path.join(getcwd(), folder_name), exist_ok=True)
+ for img in url_list:
+ result = requests.request("GET", img, stream=True, proxies=self.proxies).content
+ file_name = img.split("/")[-1]
+ file_path = path.join(getcwd(), folder_name, file_name)
+ img_arr = np.asarray(bytearray(result), dtype="uint8")
+ image = cv2.imdecode(img_arr, cv2.IMREAD_COLOR)
+ if not self.image_hash(image) in self.unique_images:
+ cv2.imwrite(file_path, image)
+ self.unique_images.append(self.image_hash(image))
+
+ def download(self, url_list: list, number_of_workers: int, output_folder: str) -> None:
+ """
+ download create number of workers to initiate download
+ :param url_list: list of urls
+ :param number_of_workers: number of workers you want to use
+ :param output_folder: output folder name to which all files will be saved
+ :return: None
+ """
+ idx = len(url_list) // number_of_workers if len(url_list) > 9 else len(url_list)
+ param = []
+ for i in range(number_of_workers):
+ param.append((url_list[(i * idx):(idx * (i + 1))], output_folder))
+ with ThreadPoolExecutor(max_workers=number_of_workers) as executor:
+ executor.map(self.saving_image, param)
+
+ @staticmethod
+ def get_current_epoch() -> int:
+ """
+ get_current_epoch will set current time epoch
+ :return: time epoch integer
+ """
+ current_time_seconds = time.time()
+ return int(current_time_seconds * 1000)
+
+ def fetch_cookies(self) -> str:
+ """
+ fetch_cookies will get the fresh cookies
+ :return: cookie string
+ """
+ cookie_res = requests.request("GET", f"{self.BASE_URL}/ideas/", data={}, proxies=self.proxies)
+ if cookie_res.status_code != 200:
+ logging.error(f"Failed attempt to get Cookies. Status code for cookie is {cookie_res.status_code}")
+ exit()
+
+ # Extract cookies from the response
+ cookies = cookie_res.cookies
+
+ # Format cookies for the Cookie header
+ cookie_header = '; '.join([f"{name}={value}" for name, value in cookies.items()])
+
+ logging.info("Saving cookies")
+ self.save_file("cookies.json", {"cookies": cookie_header})
+ return cookie_header
+
+ def search(self, query: str, page_size=26) -> list:
+ """
+ search query about the keyword on pinterest
+ :param query: keyword that will be searched on pinterest
+ :param page_size: total number of images to get (try to avoid big numbers here).
+ :return: list of image urls
+ """
+ source_url = f"/search/pins/?q={quote(query)}&rs=typed"
+ data = quote_plus(json.dumps({"options":
+ {"applied_unified_filters": None, "appliedProductFilters": "---" , "article": None,
+ "auto_correction_disabled": False, "corpus": None, "customized_rerank_type": None,
+ "domains":None, "filters": None, "journey_depth": None, "page_size": f"{page_size}", "price_max": None,
+ "price_min": None, "query_pin_sigs": None,"query": quote(query), "redux_normalize_feed": True,
+ "request_params": None, "rs": "typed", "scope": "pins", "selected_one_bar_modules": None,
+ "source_id": None, "source_module_id": None,
+ "source_url": quote_plus(source_url), "top_pin_id": None, "top_pin_ids": None},
+ "context":{}}).replace(" ", ""))
+
+ data = data.replace("%2520", "%20").replace("%252F", "%2F").replace("%253F", "%3F")\
+ .replace("%252520", "%2520").replace("%253D", "%3D").replace("%2526", "%26")
+
+ ts = self.time_epoch
+ url = (f"{self.BASE_URL}/resource/BaseSearchResource/get/?source_url={quote_plus(source_url)}"
+ f"&data={data}&_={ts}")
+ payload = {}
+ headers = self.BASE_HEADERS
+ headers['Cookies'] = self.cookies
+ response = requests.request("GET", url, headers=headers, data=payload, proxies=self.proxies)
+ image_urls = []
+ if response.status_code != 200:
+ logging.warning(f"Image search has failed!, {response.status_code}, {response.text}")
+ return []
+
+ json_data = response.json()
+ results = json_data.get('resource_response', {}).get('data', {}).get('results', [])
+ for result in results:
+ image_urls.append(result['images']['orig']['url'])
+ self.client_context = json_data['client_context']
+ logging.info(f"Total {len(image_urls)} image(s) found.")
+ return image_urls
+
+
+if __name__ == "__main__":
+ download_limit = 26
+ keyword = "loki"
+ p = Pinterest()
+ images_url = p.search(keyword, download_limit)
+ p.download(url_list=images_url, number_of_workers=1, output_folder="output")
diff --git a/requirements.txt b/requirements.txt
index 785395a..1fb971b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
-requests==2.32.0
+requests~=2.32.2
beautifulsoup4==4.11.1
-pydotmap
-opencv-python
+pydotmap~=0.1.3
+opencv-python~=4.9.0.80
pytest==7.2.0
+numpy~=1.24.2