Skip to content

Commit

Permalink
tweaks in test pdf ingestor
Browse files: browse the repository at this point in the history
  • Loading branch information
the-non-expert committed Sep 1, 2023
1 parent 8cd2553 commit ab3d4e0
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 8 deletions.
6 changes: 4 additions & 2 deletions querent/ingestors/pdfs/pdf_ingestor_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ async def ingest(
if current_file:
# Process the collected bytes of the previous file
text = await self.extract_and_process_pdf(
CollectedBytes(file=current_file, data=collected_bytes)
CollectedBytes(file=current_file,
data=collected_bytes)
)
yield text
collected_bytes = b"" # Reset collected bytes for the new file
Expand All @@ -56,6 +57,7 @@ async def ingest(
yield text

except Exception as e:
print(e)
yield []

async def extract_and_process_pdf(self, collected_bytes: CollectedBytes) -> List[str]:
Expand All @@ -66,7 +68,7 @@ async def extract_text_from_pdf(self, collected_bytes: CollectedBytes) -> str:
pdf = fitz.open(stream=collected_bytes.data, filetype="pdf")
text = ""
for page in pdf:
text += page.getText()
text += page.get_text()
return text

async def process_data(self, text: str) -> List[str]:
Expand Down
15 changes: 9 additions & 6 deletions tests/test_pdf_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,30 +6,33 @@
from querent.ingestors.ingestor_manager import IngestorFactoryManager
import pytest


@pytest.mark.asyncio
async def test_collect_and_ingest_pdf():
# Set up the collector
collector_factory = FSCollectorFactory()
uri = Uri("file://" + str(Path("./tests/data/pdf/").resolve()))
config = FSCollectorConfig(root_path=uri.path)
collector = collector_factory.resolve(uri, config)

# Set up the ingestor
ingestor_factory_manager = IngestorFactoryManager()
ingestor_factory = await ingestor_factory_manager.get_factory("pdf") # Notice the use of await here
# Notice the use of await here
ingestor_factory = await ingestor_factory_manager.get_factory("pdf")
ingestor = await ingestor_factory.create("pdf", [])

# Collect and ingest the PDF
ingested_call = ingestor.ingest(collector.poll())
counter = 0

async def poll_and_print():
counter = 0
async for ingested in ingested_call:
assert ingested is not None
if len(ingested) == 0:
if len(ingested) == 0:
counter += 1
assert counter == 1
assert counter == 0

await poll_and_print() # Notice the use of await here


Expand Down

0 comments on commit ab3d4e0

Please sign in to comment.