diff --git a/querent/collectors/webscaper/web_scraper_collector.py b/querent/collectors/webscaper/web_scraper_collector.py
index a6c3e9f4..04d47502 100644
--- a/querent/collectors/webscaper/web_scraper_collector.py
+++ b/querent/collectors/webscaper/web_scraper_collector.py
@@ -6,6 +6,7 @@ from querent.config.collector_config import CollectorBackend, WebScraperConfig
 from querent.common.uri import Uri
 from querent.tools.web_page_extractor import WebpageExtractor
+from urllib.parse import urlparse, urljoin
 
 
 class WebScraperCollector(Collector):
@@ -24,8 +25,14 @@ async def disconnect(self):
 
     async def poll(self):
         async with self.poll_lock:
-            content = await self.scrape_website(self.website_url)
-            yield CollectedBytes(file=None, data=content.data, error=None)
+            urls_to_scrape = [self.website_url]
+            while urls_to_scrape:
+                url = urls_to_scrape.pop()
+                content = await self.scrape_website(url)
+                yield CollectedBytes(file=None, data=content.data, error=None)
+                # Find and add links from this page to the list of URLs to scrape
+                new_urls = self.extract_links(content.data, url)
+                urls_to_scrape.extend(new_urls)
 
     async def scrape_website(self, website_url: str):
         async with self.semaphore:
@@ -37,6 +44,13 @@ async def scrape_website(self, website_url: str):
                 data=content[:max_length], file=None, error=None
             )
 
+    def extract_links(self, content: str, base_url: str):
+        # Use a proper HTML parser to extract links
+        extractor = WebpageExtractor()
+        links = extractor.extract_links(base_url)
+        # Join relative links with the base URL
+        return [urljoin(base_url, link) for link in links]
+
 
 class WebScraperFactory(CollectorFactory):
     def __init__(self):
diff --git a/querent/tools/web_page_extractor.py b/querent/tools/web_page_extractor.py
index fbc01fa6..59e74c11 100644
--- a/querent/tools/web_page_extractor.py
+++ b/querent/tools/web_page_extractor.py
@@ -243,3 +243,44 @@ def extract_with_lxml(self, url):
                 f"Unknown error while extracting text from HTML (lxml): {str(e)}"
             )
             return ""
+
+    def extract_links(self, url):
+        """
+        Extract internal links from a webpage.
+
+        Args:
+            url (str): The URL of the webpage to extract links from.
+
+        Returns:
+            list: A list of internal links (URLs).
+        """
+        try:
+            headers = {"User-Agent": random.choice(USER_AGENTS)}
+            response = requests.get(url, headers=headers, timeout=10)
+            if response.status_code == 200:
+                soup = BeautifulSoup(response.text, "html.parser")
+                links = []
+                for link in soup.find_all("a", href=True):
+                    link_href = link.get("href")
+                    if (
+                        link_href.startswith("/")
+                        or link_href.startswith(".")
+                        or link_href.startswith("#")
+                    ):
+                        link_href = urljoin(url, link_href)
+                    if (
+                        link_href.startswith(url)
+                        and link_href not in self.crawled_urls
+                    ):
+                        links.append(link_href)
+                return links
+            else:
+                logger.error(
+                    f"Error while extracting links from HTML (bs4): {response.status_code} for url - {url}"
+                )
+                return []
+        except Exception as e:
+            logger.error(
+                f"Unknown error while extracting links from HTML (bs4): {str(e)}"
+            )
+            return []