From 7a7eec92233f45da0ca22fdca279c32456f4dcf0 Mon Sep 17 00:00:00 2001 From: Puneet Saraswat Date: Sat, 2 Sep 2023 18:02:52 -0500 Subject: [PATCH] cleanup --- querent/collectors/webscaper/web_scraper_collector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/querent/collectors/webscaper/web_scraper_collector.py b/querent/collectors/webscaper/web_scraper_collector.py index 04d47502..d4b3e027 100644 --- a/querent/collectors/webscaper/web_scraper_collector.py +++ b/querent/collectors/webscaper/web_scraper_collector.py @@ -31,7 +31,7 @@ async def poll(self): content = await self.scrape_website(url) yield CollectedBytes(file=None, data=content.data, error=None) # Find and add links from this page to the list of URLs to scrape - new_urls = self.extract_links(content.data, url) + new_urls = self.extract_links(url) urls_to_scrape.extend(new_urls) async def scrape_website(self, website_url: str): @@ -44,7 +44,7 @@ async def scrape_website(self, website_url: str): data=content[:max_length], file=None, error=None ) - def extract_links(self, content: str, base_url: str): + def extract_links(self, base_url: str): # Use a proper HTML parser to extract links extractor = WebpageExtractor() links = extractor.extract_links(base_url)