Skip to content

Commit

Permalink
feat: add extracted_urls list to print the URLs obtained from the search engine
Browse files Browse the repository at this point in the history
iamatulsingh committed Jul 4, 2024

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
1 parent cc58da9 commit 89d48ee
Showing 4 changed files with 15 additions and 12 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -11,10 +11,10 @@ details = pinscrape.scraper.scrape("messi", "output", {}, 10, 15)
if details["isDownloaded"]:
print("\nDownloading completed !!")
print(f"\nTotal urls found: {len(details['extracted_urls'])}")
print(f"\nTotal images downloaded (including duplicate images): {len(details['url_list'])}")
print(f"\nTotal images downloaded (including duplicate images): {len(details['urls_list'])}")
print(details)
else:
print("\nNothing to download !!")
print("\nNothing to download !!", details)
```

`scrape("messi", "output", {}, 10, 15)` <br/>
2 changes: 1 addition & 1 deletion e2e.py
Original file line number Diff line number Diff line change
@@ -8,7 +8,7 @@ def test_single_data():
if details["isDownloaded"]:
print("\nDownloading completed !!")
print(f"\nTotal urls found: {len(details['extracted_urls'])}")
print(f"\nTotal images downloaded (including duplicate images): {len(details['url_list'])}")
print(f"\nTotal images downloaded (including duplicate images): {len(details['urls_list'])}")
print(details)
else:
print("\nNothing to download !!", details)
2 changes: 1 addition & 1 deletion pinscrape/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "3.2.2"
__version__ = "3.2.3"
19 changes: 11 additions & 8 deletions pinscrape/pinscrape.py
Original file line number Diff line number Diff line change
@@ -21,15 +21,17 @@ def __init__(self):
def get_pinterest_links(body, max_images: int):
    """Extract Pinterest result URLs from a Bing search results page.

    Parses *body* (HTML bytes/str of a Bing results page) and collects the
    text of every ``cite`` element under ``#b_results``.

    Args:
        body: Raw HTML of the search results page (passed to BeautifulSoup).
        max_images: Stop collecting once this many Pinterest links are found;
            ``None`` means collect every Pinterest link on the page.

    Returns:
        A ``(searched_urls, all_urls)`` tuple: ``searched_urls`` holds only
        links containing "pinterest" (capped at ``max_images``); ``all_urls``
        holds every cite URL seen, for caller-side diagnostics.
    """
    # NOTE(review): the captured diff contained both the old single-value
    # return and the new tuple return; this is the post-commit behavior.
    searched_urls = []
    all_urls = []
    html = soup(body, 'html.parser')
    links = html.select('#b_results cite')
    for link in links:
        link = link.text
        all_urls.append(link)
        if "pinterest" in link:
            searched_urls.append(link)
            # stops adding links if the limit has been reached
            if max_images is not None and max_images == len(searched_urls):
                break
    return searched_urls, all_urls

# -------------------------- save json data from source code of given pinterest url -------------
def get_source(self, url: str, proxies: dict) -> None:
@@ -108,34 +110,35 @@ def start_scraping(max_images, key=None, proxies: dict = {}):
keyword = keyword.replace("+", "%20")
url = f'https://www.bing.com/search?q={keyword}&first=1&FORM=PERE'
res = get(url, proxies=proxies, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0"})
searched_urls = PinterestImageScraper.get_pinterest_links(res.content, max_images)
searched_urls, links = PinterestImageScraper.get_pinterest_links(res.content, max_images)

return searched_urls, key.replace(" ", "_"), res.status_code
return searched_urls, key.replace(" ", "_"), res.status_code, links

def scrape(self, key: str = None, output_folder: str = "", proxies: dict = {}, threads: int = 10, max_images: int = None) -> dict:
extracted_urls, keyword, search_engine_status_code = PinterestImageScraper.start_scraping(max_images, key, proxies)
extracted_urls, keyword, search_engine_status_code, links = PinterestImageScraper.start_scraping(max_images, key, proxies)
self.unique_img = []
self.json_data_list = []

for i in extracted_urls:
self.get_source(i, proxies)

# get all urls of images and save in a list
url_list = self.save_image_url(max_images)
urls_list = self.save_image_url(max_images)

return_data = {
"isDownloaded": False,
"search_engine_status_code": search_engine_status_code,
"url_list": url_list,
"urls_list": urls_list,
"searched_urls": links,
"extracted_urls": extracted_urls,
"keyword": key
}

# download images from saved images url
if len(url_list):
if len(urls_list):
try:
out_folder = output_folder if output_folder else key
self.download(url_list, threads, out_folder)
self.download(urls_list, threads, out_folder)
except KeyboardInterrupt:
return return_data

0 comments on commit 89d48ee

Please sign in to comment.