Re-tweak some snippets for failed requests.
sushil-rgb committed Jan 16, 2024
1 parent 5d5d028 commit b621005
Showing 3 changed files with 37 additions and 32 deletions.
5 changes: 3 additions & 2 deletions main.py
@@ -9,11 +9,12 @@


async def main():
base_url = "https://www.amazon.com/s?k=gaming+keyboard&_encoding=UTF8&content-id=amzn1.sym.12129333-2117-4490-9c17-6d31baf0582a&pd_rd_r=34c04b08-58c2-4dec-8cce-e1ba5b33f1b4&pd_rd_w=6uYt1&pd_rd_wg=yxxCi&pf_rd_p=12129333-2117-4490-9c17-6d31baf0582a&pf_rd_r=0FTRXQKJYSVRXBPV695G&ref=pd_gw_unk"
base_url = "https://www.amazon.com/s?i=specialty-aps&bbn=4954955011&rh=n%3A4954955011%2Cn%3A%212617942011%2Cn%3A2747968011&ref=nav_em__nav_desktop_sa_intl_painting_drawing_supplies_0_2_8_2"
status = await Amazon(base_url, None).status()

if status == 503:
return "503 response. Please try again later."
return "503 response. Please try again in few minutes."

# Set to True if you want to export to CSV and skip MongoDB
csv = True
# Set to True if you want to use a proxy:
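For context, a minimal sketch of how this guard fits into a runnable entry point, assuming `Amazon` is importable from `scrapers.scraper`, takes `(base_url, proxy)` and exposes the async `status()` and `export_csv()` methods referenced elsewhere in this commit; the search URL below is a placeholder, not the one from the diff.

```python
# Minimal sketch (not the repo's exact main.py): assumes Amazon lives in
# scrapers.scraper, takes (base_url, proxy) and exposes async status() /
# export_csv() methods as the hunks in this commit suggest.
import asyncio

from scrapers.scraper import Amazon  # assumed import path

async def main():
    base_url = "https://www.amazon.com/s?k=gaming+keyboard"  # placeholder category URL
    scraper = Amazon(base_url, None)  # None = no proxy (assumption)

    status = await scraper.status()
    if status == 503:
        return "503 response. Please try again in a few minutes."

    # On a healthy response, hand off to the CSV exporter.
    return await scraper.export_csv()

if __name__ == "__main__":
    print(asyncio.run(main()))
```

Returning early on a 503 keeps the scraper from hammering Amazon while it is throttling requests.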
62 changes: 33 additions & 29 deletions scrapers/scraper.py
@@ -128,8 +128,8 @@ async def split_url(self):

# Get the URL of the next button on the search result page and construct the URL of the next search result page:
next_link = f"""https://www.amazon.{self.country_domain}{await self.catch.attributes(soup.select_one(self.scrape['next_button']), 'href')}"""
for num in range(1, total_pages):

for num in range(1, total_pages):
# Replace the 'page' number in the URL with the current page number incremented by 1:
next_url = re.sub(r'page=\d+', f'page={num+1}', next_link)

@@ -197,6 +197,7 @@ async def product_urls(self, url, max_retries = 13):
Raises:
- Exception: If there is an error while loading the content of the Amazon search results page.
"""
url_lists = []
for retry in range(max_retries):
try:
# Use the 'static_connection' method to download the HTML content of the search results page
@@ -210,17 +211,15 @@
return f"Content loading error. Please try again in few minutes. Error message: {e}"
# Get product card contents from current page:
card_contents = [f"""https://www.amazon.{self.country_domain}{prod.select_one(self.scrape['hyperlink']).get('href')}""" for prod in soup.select(self.scrape['main_content'])]
return card_contents
except ConnectionResetError as se:
print(f"Connection lost: {str(e)}. Retrying... ({retry + 1} / {max_retries})")
if retry < max_retries - 1:
await asyncio.sleep(5) # Delay before retrying.
url_lists.append(card_contents)
except Exception as e:
print(f"Retry {retry + 1} failed: {str(e)}")
print(f"Retry {retry + 1} || Error: {str(e)}\n URL: {url}")
if retry < max_retries - 1:
await asyncio.sleep(4) # Delay before retrying.
await asyncio.sleep(5)
else:
return f"Failed to retrieve valid data after {max_retries} retries. Scraped URLS are saved and ready for crawling process."

raise Exception(f"Failed to retrieve valid data after {max_retries} retries.")
return flat(url_lists)
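As shown above, failed attempts now back off and retry while successful page results accumulate in `url_lists`, and whatever was collected is flattened and returned at the end. A standalone sketch of that shape, with `fetch_product_links()` as a hypothetical stand-in for the `static_connection()` + parsing step and an explicit `break` on success added as a simplification:

```python
# Illustrative only: fetch_product_links() stands in for the download + parse step,
# and the final list comprehension mirrors what flat() is assumed to do.
import asyncio
from typing import Awaitable, Callable

async def collect_urls(fetch_product_links: Callable[[str], Awaitable[list[str]]],
                       url: str, max_retries: int = 13) -> list[str]:
    url_lists: list[list[str]] = []
    for retry in range(max_retries):
        try:
            url_lists.append(await fetch_product_links(url))
            break  # simplification: stop once the page has been scraped successfully
        except Exception as e:
            print(f"Retry {retry + 1} || Error: {e}\nURL: {url}")
            if retry < max_retries - 1:
                await asyncio.sleep(5)  # back off before the next attempt
            else:
                print(f"Failed after {max_retries} retries; returning what was collected.")
    # Flatten the list of per-page link lists, as flat() is assumed to do.
    return [link for page in url_lists for link in page]
```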


async def scrape_product_info(self, url, max_retries = 13):
@@ -314,17 +313,14 @@ async def scrape_product_info(self, url, max_retries = 13):
'Store link': store_link,
}
amazon_dicts.append(datas)
return amazon_dicts
except ConnectionResetError as se:
print(f"Connection lost: {str(e)}. Retrying... ({retry + 1} / {max_retries})")
if retry < max_retries - 1:
await asyncio.sleep(5) # Delay before retrying.
break
except Exception as e:
print(f"Retry {retry + 1} failed: {str(e)} | Error URL : {url}")
print(f"Retry {retry + 1} || Error: {str(e)}\nURL: {url}")
if retry < max_retries - 1:
await asyncio.sleep(4) # Delay before retrying.
return amazon_dicts
raise Exception(f"Failed to retrieve valid data after {max_retries} retries.")
await asyncio.sleep(5)
else:
raise Exception(f"Failed to retrieve valid data after {max_retries} retries. Scraped datas are saved and exported.")
return amazon_dicts
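The reworked error handling relies on Python's `for`/`else`: a `break` on a successful attempt skips the `else` clause, while exhausting every retry without a `break` falls through to it and raises. A generic sketch of that pattern, with the attempt callable as a placeholder rather than anything from this repo:

```python
import asyncio
from typing import Awaitable, Callable, TypeVar

T = TypeVar("T")

async def retry_until_success(attempt: Callable[[], Awaitable[T]],
                              max_retries: int = 13, delay: float = 5.0) -> T:
    for retry in range(max_retries):
        try:
            result = await attempt()
            break  # success: leave the loop and skip the else clause
        except Exception as e:
            print(f"Retry {retry + 1} || Error: {e}")
            if retry < max_retries - 1:
                await asyncio.sleep(delay)
    else:
        # Reached only when the loop never hit `break`, i.e. every attempt failed.
        raise Exception(f"Failed to retrieve valid data after {max_retries} retries.")
    return result
```

Keeping the `raise` inside the `else` clause is what lets the success path fall straight through to the `return` that follows the loop.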


async def crawl_url(self):
@@ -351,18 +347,8 @@ async def concurrency(self):
if await verify_amazon(self.base_url):
return "I'm sorry, the link you provided is invalid. Could you please provide a valid Amazon link for the product category of your choice?"

# Print welcome and category scraping message:
print(f"----------------------- | Welcome to Amazon {self.region}. |---------------------------------")
searches = await self.category_name()
print(f"Scraping category || {searches}.")

# Pull the total number of pages for the category:
number_pages = await self.num_of_pages()
print(f"Total pages || {number_pages}.")

# Split the pagination and convert it to a list of URLs:
product_urls = await self.crawl_url()
print(f"The extraction process has begun and is currently in progress. The web scraper is scanning through all the links and collecting relevant information. Please be patient while the data is being gathered.")

# Use coroutines to scrape and save data from each URL concurrently:
coroutines = [self.scrape_product_info(url) for url in product_urls]
@@ -378,7 +364,25 @@ async def export_csv(self):
- None
"""
# Check if the provided Amazon link is valid:
categ_name = f"{self.region} - {await self.category_name()}"
# Print welcome and category scraping message:
try:
searches = await self.category_name()
except Exception as e:
return "Content loading error. Please try again in few minutes."

print(f"----------------------- | Welcome to Amazon {self.region}. |---------------------------------")
await asyncio.sleep(2)
print(f"Scraping category || {searches}.")

# Pull the total number of pages for the category:
number_pages = await self.num_of_pages()
print(f"Total pages || {number_pages}.")

await asyncio.sleep(2)

print(f"The extraction process has begun and is currently in progress. The web scraper is scanning through all the links and collecting relevant information. Please be patient while the data is being gathered.")

categ_name = f"{self.region} - {searches}."
concurrency_results = await self.concurrency()
results_dataframes = [pd.DataFrame(result) for result in concurrency_results]

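For reference, the concurrency-plus-export flow above reduces to running one scrape coroutine per URL and stacking the resulting record lists into a single DataFrame. The sketch below assumes the coroutine list is awaited with `asyncio.gather` (the actual await is outside the visible hunk) and that each successful result is a list of product dicts:

```python
import asyncio
import pandas as pd

async def gather_and_frame(scrape_one, urls):
    # Run one scrape coroutine per URL concurrently (assumption: asyncio.gather or similar).
    results = await asyncio.gather(*(scrape_one(url) for url in urls))
    # Keep only list results (error paths in the scraper can return strings instead),
    # turn each list of product dicts into a DataFrame, and stack them.
    frames = [pd.DataFrame(rows) for rows in results if isinstance(rows, list) and rows]
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
```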
2 changes: 1 addition & 1 deletion scrapers/selector.yaml
@@ -2,7 +2,7 @@
product_name: "div.a-section.a-spacing-none.a-spacing-top-small.s-title-instructions-style h2 a span"
searches: div[cel_widget_id="UPPER-RESULT_INFO_BAR-0"]
searches_I: "div#departments span.a-size-base.a-color-base.a-text-bold"
searches_II: "span.a-color-state.a-text-bold"
searches_II: "h1.a-size-base.s-desktop-toolbar.a-text-normal div.s-desktop-width-max.sg-row-align-items-center.s-wide-grid-style-t1.s-wide-grid-style.sg-row div.sg-col-14-of-20.sg-col-18-of-24.sg-col.s-breadcrumb.sg-col-10-of-16.sg-col-6-of-12 div.sg-col-inner span.a-color-state.a-text-bold span.a-color-state.a-text-bold"
searches_III: "span.a-list-item span.a-size-base.a-color-base.a-text-bold"
searches_IV: "a.a-link-normal.s-navigation-item span.a-size-base.a-color-base"

