diff --git a/main.py b/main.py
index 3fcefdc..864ef3c 100644
--- a/main.py
+++ b/main.py
@@ -9,7 +9,7 @@
 async def main():
-    base_url = "https://www.amazon.com/s?k=gaming+headsets&_encoding=UTF8&content-id=amzn1.sym.12129333-2117-4490-9c17-6d31baf0582a&pd_rd_r=56793f4a-5a4c-4c75-b342-04eaeb38676b&pd_rd_w=t37se&pd_rd_wg=pRvMP&pf_rd_p=12129333-2117-4490-9c17-6d31baf0582a&pf_rd_r=ZVVAJVJTHSE47KE7FH42&ref=pd_gw_unk"
+    base_url = "https://www.amazon.com/s?k=gaming+keyboard&_encoding=UTF8&content-id=amzn1.sym.12129333-2117-4490-9c17-6d31baf0582a&pd_rd_r=34c04b08-58c2-4dec-8cce-e1ba5b33f1b4&pd_rd_w=6uYt1&pd_rd_wg=yxxCi&pf_rd_p=12129333-2117-4490-9c17-6d31baf0582a&pf_rd_r=0FTRXQKJYSVRXBPV695G&ref=pd_gw_unk"
     status = await Amazon(base_url, None).status()
     if status == 503:
diff --git a/scrapers/scraper.py b/scrapers/scraper.py
index 6670c30..110b7f4 100644
--- a/scrapers/scraper.py
+++ b/scrapers/scraper.py
@@ -197,7 +196,6 @@ async def product_urls(self, url, max_retries = 13):
         Raises:
         -Expecation: If there is an error while loading the content of the Amazon search results page.
         """
-        url_lists = []
         for retry in range(max_retries):
             try:
                 # Use the 'static_connection' method to download the HTML content of the search results bage
@@ -211,15 +210,17 @@ async def product_urls(self, url, max_retries = 13):
                     return f"Content loading error. Please try again in few minutes. Error message: {e}"
                 # Get product card contents from current page:
                 card_contents = [f"""https://www.amazon.{self.country_domain}{prod.select_one(self.scrape['hyperlink']).get('href')}""" for prod in soup.select(self.scrape['main_content'])]
-                url_lists.append(card_contents)
-                break
-            except Exception as e:
-                print(f"Retry {retry + 1} failed: {str(e)} || Retrying... {retry + 1} / {max_retries}")
+                return card_contents
+            except ConnectionResetError as se:
+                print(f"Connection lost: {str(se)}. Retrying... ({retry + 1} / {max_retries})")
                 if retry < max_retries - 1:
                     await asyncio.sleep(5) # Delay before retrying.
-                else:
-                    raise Exception(f"Failed to retrieve valid data after {max_retries} retries.")
-        return flat(url_lists)
+            except Exception as e:
+                print(f"Retry {retry + 1} failed: {str(e)}")
+                if retry < max_retries - 1:
+                    await asyncio.sleep(4) # Delay before retrying.
+
+        raise Exception(f"Failed to retrieve valid data after {max_retries} retries.")


    async def scrape_product_info(self, url, max_retries = 13):
@@ -313,14 +314,17 @@ async def scrape_product_info(self, url, max_retries = 13):
                     'Store link': store_link,
                 }
                 amazon_dicts.append(datas)
-                break
-            except Exception as e:
-                print(f"Retry {retry + 1} failed: {str(e)} || Retrying... {retry + 1} / {max_retries}")
+                return amazon_dicts
+            except ConnectionResetError as se:
+                print(f"Connection lost: {str(se)}. Retrying... ({retry + 1} / {max_retries})")
                 if retry < max_retries - 1:
                     await asyncio.sleep(5) # Delay before retrying.
-                else:
-                    raise Exception(f"Failed to retrieve valid data after {max_retries} retries.")
-        return amazon_dicts
+            except Exception as e:
+                print(f"Retry {retry + 1} failed: {str(e)} | Error URL : {url}")
+                if retry < max_retries - 1:
+                    await asyncio.sleep(4) # Delay before retrying.
+
+        raise Exception(f"Failed to retrieve valid data after {max_retries} retries.")


    async def crawl_url(self):
diff --git a/scrapers/selector.yaml b/scrapers/selector.yaml
index 1b43853..98921e3 100644
--- a/scrapers/selector.yaml
+++ b/scrapers/selector.yaml
@@ -2,7 +2,7 @@
 product_name: "div.a-section.a-spacing-none.a-spacing-top-small.s-title-instructions-style h2 a span"
 searches: div[cel_widget_id="UPPER-RESULT_INFO_BAR-0"]
 searches_I: "div#departments span.a-size-base.a-color-base.a-text-bold"
-searches_II: "div.a-section.a-spacing-small.a-spacing-top-small span.a-color-state.a-text-bold"
+searches_II: "span.a-color-state.a-text-bold"
 searches_III: "span.a-list-item span.a-size-base.a-color-base.a-text-bold"
 searches_IV: "a.a-link-normal.s-navigation-item span.a-size-base.a-color-base"
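
Note: after this change, product_urls and scrape_product_info share the same retry shape: return on the first successful attempt, back off after a failure, and raise once every attempt is exhausted. A minimal standalone sketch of that flow is shown below (illustrative only; fetch_page and fetch_with_retries are hypothetical names, not part of this repository):

    import asyncio

    async def fetch_with_retries(fetch_page, url, max_retries=13):
        # Retry loop mirroring the pattern introduced in this diff.
        for retry in range(max_retries):
            try:
                return await fetch_page(url)        # success: stop retrying immediately
            except ConnectionResetError as se:      # transient network drop
                print(f"Connection lost: {se}. Retrying... ({retry + 1} / {max_retries})")
                if retry < max_retries - 1:
                    await asyncio.sleep(5)          # longer pause after a reset
            except Exception as e:                  # any other failure
                print(f"Retry {retry + 1} failed: {e}")
                if retry < max_retries - 1:
                    await asyncio.sleep(4)          # shorter pause before retrying
        raise Exception(f"Failed to retrieve valid data after {max_retries} retries.")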