diff --git a/querent/collectors/webscaper/web_scraper_collector.py b/querent/collectors/webscaper/web_scraper_collector.py index 3df46ec4..c3ea458e 100644 --- a/querent/collectors/webscaper/web_scraper_collector.py +++ b/querent/collectors/webscaper/web_scraper_collector.py @@ -22,8 +22,8 @@ async def poll(self): async def scrape_website(self, website_url: str): content = WebpageExtractor().extract_with_bs4(website_url) - max_length = len(" ".join(content.split(" ")[:600])) - return CollectedBytes(data=content[:max_length], file=None, error=None) + max_length = len(' '.join(content.split(" ")[:600])) + return CollectorResult({"content": content[:max_length]}) class WebScraperFactory(CollectorFactory): diff --git a/querent/common/uri.py b/querent/common/uri.py index 6a347826..05ae153d 100644 --- a/querent/common/uri.py +++ b/querent/common/uri.py @@ -13,7 +13,7 @@ class Protocol(enum.Enum): PostgreSQL = "postgresql" Ram = "ram" S3 = "s3" - Webscraper = "webscraper" + Webscraper = "https" def is_azure(self) -> bool: return self == Protocol.Azure diff --git a/querent/ingestors/pdf_ingestor.py b/querent/ingestors/pdf_ingestor.py index 3232708f..13f5df3b 100644 --- a/querent/ingestors/pdf_ingestor.py +++ b/querent/ingestors/pdf_ingestor.py @@ -1,4 +1,3 @@ -import PyPDF2 import pypdf diff --git a/querent/tools/web_page_extractor.py b/querent/tools/web_page_extractor.py index f123bf54..6e52405d 100644 --- a/querent/tools/web_page_extractor.py +++ b/querent/tools/web_page_extractor.py @@ -77,7 +77,7 @@ def extract_with_3k(self, url): article.set_html(html_content) article.parse() content = article.text.replace( - "\t", " ").replace("\n", " ").strip() + '\t', ' ').replace('\n', ' ').strip() return content[:1500] diff --git a/tests/test_webscrapper.py b/tests/test_webscrapper.py index 6415bb9b..52eeb264 100644 --- a/tests/test_webscrapper.py +++ b/tests/test_webscrapper.py @@ -29,8 +29,12 @@ def test_scrapping_data(): collector = resolver.resolve(uri, webscrapperConfig) assert collector is not None + print("REached here") + async def poll_and_print(): + print("Part 2") async for result in collector.poll(): + print("Hola...") assert not result.is_error() print(result.unwrap())