From c532bcd2ad0d01b8c9cec18c7815f9ae11892f4f Mon Sep 17 00:00:00 2001 From: Ansh5461 Date: Sun, 20 Aug 2023 01:22:08 +0530 Subject: [PATCH 1/2] Adding test cases for webscrapping --- .../webscaper/web_scraper_collector.py | 3 +- querent/common/uri.py | 2 +- querent/ingestors/pdf_ingestor.py | 4 +-- querent/tools/web_page_extractor.py | 6 ++-- tests/test_webscrapper.py | 28 +++++++++++++++++++ 5 files changed, 35 insertions(+), 8 deletions(-) create mode 100644 tests/test_webscrapper.py diff --git a/querent/collectors/webscaper/web_scraper_collector.py b/querent/collectors/webscaper/web_scraper_collector.py index 4b42f8ef..2524c01a 100644 --- a/querent/collectors/webscaper/web_scraper_collector.py +++ b/querent/collectors/webscaper/web_scraper_collector.py @@ -3,6 +3,7 @@ from querent.collectors.collector_result import CollectorResult from querent.config.collector_config import CollectorBackend, WebScraperConfig from querent.tools.web_page_extractor import WebpageExtractor +from querent.common.uri import Uri class WebScraperCollector(Collector): def __init__(self, config: WebScraperConfig): @@ -30,5 +31,5 @@ def __init__(self): def backend(self) -> CollectorBackend: return CollectorBackend.WebScraper - def resolve(self, config: WebScraperConfig) -> Collector: + def resolve(self,uri: Uri, config: WebScraperConfig) -> Collector: return WebScraperCollector(config) diff --git a/querent/common/uri.py b/querent/common/uri.py index 59aa1c49..522aabdf 100644 --- a/querent/common/uri.py +++ b/querent/common/uri.py @@ -13,7 +13,7 @@ class Protocol(enum.Enum): PostgreSQL = "postgresql" Ram = "ram" S3 = "s3" - Webscraper = "webscraper" + Webscraper = "https" def is_azure(self) -> bool: return self == Protocol.Azure diff --git a/querent/ingestors/pdf_ingestor.py b/querent/ingestors/pdf_ingestor.py index e7b4c466..13f5df3b 100644 --- a/querent/ingestors/pdf_ingestor.py +++ b/querent/ingestors/pdf_ingestor.py @@ -1,4 +1,4 @@ -import PyPDF2 +import pypdf class PDFConnector: @@ 
-10,7 +10,7 @@ def __init__(self, file_path): def open_pdf(self): """Open the PDF file.""" self.pdf_file = open(self.file_path, 'rb') - self.pdf_reader = PyPDF2.PdfReader(self.pdf_file) + self.pdf_reader = pypdf.PdfReader(self.pdf_file) def authenticate(self, password): """Authenticate the connection if the PDF is encrypted.""" diff --git a/querent/tools/web_page_extractor.py b/querent/tools/web_page_extractor.py index 02c1a370..f7170edd 100644 --- a/querent/tools/web_page_extractor.py +++ b/querent/tools/web_page_extractor.py @@ -1,6 +1,5 @@ from io import BytesIO -from PyPDF2 import PdfFileReader -from PyPDF2 import PdfReader +from pypdf import PdfReader import requests import re from requests.exceptions import RequestException @@ -69,8 +68,7 @@ def extract_with_3k(self, url): article = Article(url, config=config) article.set_html(html_content) article.parse() - content = article.text.replace( - '\t', ' ').replace('\n', ' ').strip() + content = article.text.replace('\t', ' ').replace('\n', ' ').strip() return content[:1500] diff --git a/tests/test_webscrapper.py b/tests/test_webscrapper.py new file mode 100644 index 00000000..22c7a4a1 --- /dev/null +++ b/tests/test_webscrapper.py @@ -0,0 +1,28 @@ +import asyncio +from pathlib import Path +import tempfile +from querent.collectors.collector_resolver import CollectorResolver +from querent.collectors.webscaper.web_scraper_collector import WebScraperFactory +import pytest + +from querent.common.uri import Uri +from querent.config.collector_config import CollectorBackend, WebScraperConfig + + +def test_webscrapper_collector(): + uri = Uri("https://asecuritysite.com/") + resolver = CollectorResolver() + webscrapperConfig = WebScraperConfig(website_url = uri.uri) + collector = resolver.resolve(uri, webscrapperConfig) + assert collector is not None + +def test_fs_collector_factory(): + factory = WebScraperFactory() + assert factory.backend() == CollectorBackend.WebScraper + +def test_scrapping_data(): + uri = 
Uri("https://asecuritysite.com/") + resolver = CollectorResolver() + webscrapperConfig = WebScraperConfig(website_url = uri.uri) + collector = resolver.resolve(uri, webscrapperConfig) + assert collector is not None From ede011705adfe7851dd0f399640d38a62b672153 Mon Sep 17 00:00:00 2001 From: Ansh5461 Date: Sun, 20 Aug 2023 01:33:22 +0530 Subject: [PATCH 2/2] Added test cases for webscrapper --- querent/collectors/webscaper/web_scraper_collector.py | 2 +- tests/test_webscrapper.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/querent/collectors/webscaper/web_scraper_collector.py b/querent/collectors/webscaper/web_scraper_collector.py index 2524c01a..6a65038d 100644 --- a/querent/collectors/webscaper/web_scraper_collector.py +++ b/querent/collectors/webscaper/web_scraper_collector.py @@ -22,7 +22,7 @@ async def poll(self): async def scrape_website(self, website_url: str): content = WebpageExtractor().extract_with_bs4(website_url) max_length = len(' '.join(content.split(" ")[:600])) - return content[:max_length] + return CollectorResult({"content": content[:max_length]}) class WebScraperFactory(CollectorFactory): def __init__(self): diff --git a/tests/test_webscrapper.py b/tests/test_webscrapper.py index 22c7a4a1..784d330a 100644 --- a/tests/test_webscrapper.py +++ b/tests/test_webscrapper.py @@ -26,3 +26,13 @@ def test_scrapping_data(): webscrapperConfig = WebScraperConfig(website_url = uri.uri) collector = resolver.resolve(uri, webscrapperConfig) assert collector is not None + + print("REached here") + async def poll_and_print(): + print("Part 2") + async for result in collector.poll(): + print("Hola...") + assert not result.is_error() + print(result.unwrap()) + + asyncio.run(poll_and_print())