From 6e85a91f4929d60207d05c720586a8efc67d1459 Mon Sep 17 00:00:00 2001 From: Ansh5461 Date: Sat, 2 Sep 2023 00:06:01 +0530 Subject: [PATCH] Added tests for text --- querent/ingestors/texts/text_ingestor.py | 6 ++-- tests/data/text/asyncgenerator.txt | 12 +++++++ tests/test_text_ingestor.py | 41 ++++++++++++++++++++++++ 3 files changed, 57 insertions(+), 2 deletions(-) create mode 100644 tests/data/text/asyncgenerator.txt create mode 100644 tests/test_text_ingestor.py diff --git a/querent/ingestors/texts/text_ingestor.py b/querent/ingestors/texts/text_ingestor.py index 9eff2d34..e4fd22c7 100644 --- a/querent/ingestors/texts/text_ingestor.py +++ b/querent/ingestors/texts/text_ingestor.py @@ -23,8 +23,8 @@ async def create( class TextIngestor(BaseIngestor): def __init__(self, processors: List[AsyncProcessor]): + super().__init__(IngestorBackend.TEXT) self.processors = processors - super.__init__(IngestorBackend.TEXT) async def ingest( self, poll_function: AsyncGenerator[CollectedBytes, None] @@ -56,6 +56,7 @@ async def ingest( yield text except Exception as e: + print(e) yield [] async def extract_and_process_text( @@ -64,8 +65,9 @@ async def extract_and_process_text( text = await self.extract_text_from_file(collected_bytes) return await self.process_data(text=text) - async def extract_text_from_file(collected_bytes: CollectedBytes) -> str: + async def extract_text_from_file(self, collected_bytes: CollectedBytes) -> str: text = collected_bytes.data.decode("utf-8") + print(text) return text async def process_data(self, text: str) -> List[str]: diff --git a/tests/data/text/asyncgenerator.txt b/tests/data/text/asyncgenerator.txt new file mode 100644 index 00000000..d9426417 --- /dev/null +++ b/tests/data/text/asyncgenerator.txt @@ -0,0 +1,12 @@ +Asynchronous generator functions are part of Python version 3.6, they were introduced by PEP-525. 
Asynchronous generator +functions are much like regular asynchronous functions except that they contain the yield keyword in the function body. +This, in turn, makes them much like regular generators, except that you can use the await keyword in them as well. + +When calling an asynchronous generator function, the result that is returned is an asynchronous generator object, in +contrast to calling regular asynchronous functions, which return a coroutine object. +Since the asynchronous generator is, no surprise, asynchronous, you are allowed to use the await keyword inside the +asynchronous generator. + +You can use this, for example, to send out HTTP requests in the asynchronous generator and yield the response. + +Besides asynchronous iterables, you can use asynchronous generators with the async for-loop as well. diff --git a/tests/test_text_ingestor.py b/tests/test_text_ingestor.py new file mode 100644 index 00000000..fbea22d8 --- /dev/null +++ b/tests/test_text_ingestor.py @@ -0,0 +1,41 @@ +import asyncio +from pathlib import Path +from querent.collectors.fs.fs_collector import FSCollectorFactory +from querent.config.collector_config import FSCollectorConfig +from querent.common.uri import Uri +from querent.ingestors.ingestor_manager import IngestorFactoryManager +import pytest + + +@pytest.mark.asyncio +async def test_collect_and_ingest_txt(): + # Set up the collector + collector_factory = FSCollectorFactory() + uri = Uri("file://" + str(Path("./tests/data/text/").resolve())) + config = FSCollectorConfig(root_path=uri.path) + collector = collector_factory.resolve(uri, config) + + # Set up the ingestor + ingestor_factory_manager = IngestorFactoryManager() + ingestor_factory = await ingestor_factory_manager.get_factory( + "txt" + ) # Notice the use of await here + ingestor = await ingestor_factory.create("txt", []) + + # Collect and ingest the text files + ingested_call = ingestor.ingest(collector.poll()) + counter = 0 + + async def poll_and_print(): + counter = 0 
+ async for ingested in ingested_call: + assert ingested is not None + if len(ingested) == 0: + counter += 1 + assert counter == 0 + + await poll_and_print() # Notice the use of await here + + +if __name__ == "__main__": + asyncio.run(test_collect_and_ingest_txt())