Commit c1e6ac8
Merge branch 'ayush' of https://github.com/Querent-ai/querent-ai into ayush
the-non-expert committed Aug 29, 2023
2 parents 3f0c2da + ede0117 commit c1e6ac8
Showing 5 changed files with 8 additions and 5 deletions.
4 changes: 2 additions & 2 deletions querent/collectors/webscaper/web_scraper_collector.py
@@ -22,8 +22,8 @@ async def poll(self):
 
     async def scrape_website(self, website_url: str):
         content = WebpageExtractor().extract_with_bs4(website_url)
-        max_length = len(" ".join(content.split(" ")[:600]))
-        return CollectedBytes(data=content[:max_length], file=None, error=None)
+        max_length = len(' '.join(content.split(" ")[:600]))
+        return CollectorResult({"content": content[:max_length]})
 
 
 class WebScraperFactory(CollectorFactory):
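The max_length computation above caps the scraped text at roughly 600 words: it takes the first 600 space-separated tokens, re-joins them, and uses that string's character length to slice the raw content. A standalone sketch of that behavior:

# Illustration of the truncation in scrape_website: slice the raw
# string to the character length of its first 600 space-separated tokens.
content = "word " * 1000
max_length = len(" ".join(content.split(" ")[:600]))
truncated = content[:max_length]
assert len(truncated.split(" ")) <= 600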
2 changes: 1 addition & 1 deletion querent/common/uri.py
@@ -13,7 +13,7 @@ class Protocol(enum.Enum):
     PostgreSQL = "postgresql"
     Ram = "ram"
     S3 = "s3"
-    Webscraper = "webscraper"
+    Webscraper = "https"
 
     def is_azure(self) -> bool:
         return self == Protocol.Azure
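Rebinding the Webscraper member's value to "https" changes how Protocol resolves by value: a URI with an https scheme now maps to the web-scraper collector. A minimal sketch of that lookup (the assert is illustrative, not from the repo, and assumes Uri parsing looks the scheme up by value):

import enum

class Protocol(enum.Enum):
    S3 = "s3"
    Webscraper = "https"  # after this change, an "https" scheme resolves here

# Enum lookup by value, as the URI resolver presumably does with the scheme:
assert Protocol("https") is Protocol.Webscraper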
1 change: 0 additions & 1 deletion querent/ingestors/pdf_ingestor.py
@@ -1,4 +1,3 @@
-import PyPDF2
 import pypdf
 
 
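The ingestor drops the legacy PyPDF2 import and keeps pypdf, its maintained successor. The ingestor's body is outside this hunk, but a minimal pypdf text-extraction sketch looks like this (the function name and byte-stream input are assumptions, not the repo's code):

import io
import pypdf

def extract_pdf_text(pdf_bytes: bytes) -> str:
    # Read the PDF from memory and concatenate per-page text.
    reader = pypdf.PdfReader(io.BytesIO(pdf_bytes))
    return "\n".join(page.extract_text() or "" for page in reader.pages)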
2 changes: 1 addition & 1 deletion querent/tools/web_page_extractor.py
@@ -77,7 +77,7 @@ def extract_with_3k(self, url):
         article.set_html(html_content)
         article.parse()
         content = article.text.replace(
-            "\t", " ").replace("\n", " ").strip()
+            '\t', ' ').replace('\n', ' ').strip()
 
         return content[:1500]
 
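The set_html/parse/text sequence matches newspaper3k's Article API, so the surrounding method plausibly looks like the sketch below (the Article construction and the html_content source are assumptions; only the quoted lines appear in the diff):

from newspaper import Article

def extract_with_3k(html_content: str) -> str:
    # Assumed setup; the repo's method fetches html_content itself.
    article = Article(url="")
    article.set_html(html_content)
    article.parse()
    # Flatten tabs/newlines to spaces and cap the result at 1500 chars.
    content = article.text.replace("\t", " ").replace("\n", " ").strip()
    return content[:1500]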
4 changes: 4 additions & 0 deletions tests/test_webscrapper.py
@@ -29,8 +29,12 @@ def test_scrapping_data():
     collector = resolver.resolve(uri, webscrapperConfig)
     assert collector is not None
 
+    print("REached here")
+
     async def poll_and_print():
+        print("Part 2")
         async for result in collector.poll():
+            print("Hola...")
             assert not result.is_error()
             print(result.unwrap())
 
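poll_and_print is an async helper, so the test has to drive it on an event loop; the call sits outside this hunk, but asyncio.run is the usual pattern (an assumption about the elided code, shown for completeness):

import asyncio

# Drive the async polling helper to completion inside the sync test.
asyncio.run(poll_and_print())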
