From 30ec9b02f7668b30f00767e554494dcc4cc83bbc Mon Sep 17 00:00:00 2001 From: James Braza Date: Fri, 13 Sep 2024 11:33:17 -0700 Subject: [PATCH] Fixing crash in `chunk_text` for empty file (#389) --- paperqa/agents/search.py | 7 ++++++- paperqa/readers.py | 18 +++++++++++++----- tests/stub_data/empty.txt | 0 tests/test_agents.py | 14 +++++++++----- tests/test_paperqa.py | 6 ++++-- 5 files changed, 32 insertions(+), 13 deletions(-) create mode 100644 tests/stub_data/empty.txt diff --git a/paperqa/agents/search.py b/paperqa/agents/search.py index f9625bf1..585f5391 100644 --- a/paperqa/agents/search.py +++ b/paperqa/agents/search.py @@ -353,6 +353,9 @@ async def maybe_get_manifest(filename: anyio.Path | None) -> dict[str, DocDetail return {} +FAILED_DOCUMENT_ADD_ID = "ERROR" + + async def process_file( file_path: anyio.Path, search_index: SearchIndex, @@ -382,7 +385,9 @@ async def process_file( logger.exception( f"Error parsing {file_name}, skipping index for this file." ) - (await search_index.index_files)[str(file_path)] = "ERROR" + (await search_index.index_files)[ + str(file_path) + ] = FAILED_DOCUMENT_ADD_ID await search_index.save_index() return diff --git a/paperqa/readers.py b/paperqa/readers.py index 0a4962bf..496b1f62 100644 --- a/paperqa/readers.py +++ b/paperqa/readers.py @@ -48,7 +48,8 @@ def chunk_pdf( if not parsed_text.content: raise ImpossibleParsingError( - "No text was parsed from the document: either empty or corrupted." + f"No text was parsed from the document named {doc.docname!r}, either empty" + " or corrupted." ) for page_num, page_text in parsed_text.content.items(): @@ -117,17 +118,19 @@ def parse_text( def chunk_text( - parsed_text: ParsedText, doc: Doc, chunk_chars: int, overlap: int, use_tiktoken=True + parsed_text: ParsedText, + doc: Doc, + chunk_chars: int, + overlap: int, + use_tiktoken: bool = True, ) -> list[Text]: """Parse a document into chunks, based on tiktoken encoding. NOTE: We get some byte continuation errors. - Currently ignored, but should explore more to make sure we - don't miss anything. + Currently ignored, but should explore more to make sure we don't miss anything. """ texts: list[Text] = [] enc = tiktoken.get_encoding("cl100k_base") - split = [] if not isinstance(parsed_text.content, str): raise NotImplementedError( @@ -135,6 +138,11 @@ def chunk_text( ) content = parsed_text.content if not use_tiktoken else parsed_text.encode_content() + if not content: # Avoid div0 in token calculations + raise ImpossibleParsingError( + f"No text was parsed from the document named {doc.docname!r}, either empty" + " or corrupted." + ) # convert from characters to chunks char_count = parsed_text.metadata.total_parsed_text_length # e.g., 25,000 diff --git a/tests/stub_data/empty.txt b/tests/stub_data/empty.txt new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_agents.py b/tests/test_agents.py index 425c5685..c456a892 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -21,7 +21,7 @@ from paperqa.agents import agent_query from paperqa.agents.env import settings_to_tools from paperqa.agents.models import AgentStatus, AnswerResponse, QueryRequest -from paperqa.agents.search import get_directory_index +from paperqa.agents.search import FAILED_DOCUMENT_ADD_ID, get_directory_index from paperqa.agents.tools import ( EnvironmentState, GatherEvidence, @@ -44,8 +44,12 @@ async def test_get_directory_index(agent_test_settings: Settings) -> None: "title", "year", ], "Incorrect fields in index" - # paper.pdf + flag_day.html + bates.txt + obama.txt - assert len(await index.index_files) == 4, "Incorrect number of index files" + # paper.pdf + empty.txt + flag_day.html + bates.txt + obama.txt, + # but empty.txt fails to be added + path_to_id = await index.index_files + assert ( + sum(id_ != FAILED_DOCUMENT_ADD_ID for id_ in path_to_id.values()) == 4 + ), "Incorrect number of parsed index files" results = await index.query(query="who is Frederick Bates?") paper_dir = cast(Path, agent_test_settings.paper_directory) assert results[0].docs.keys() == {md5sum((paper_dir / "bates.txt").absolute())} @@ -63,8 +67,8 @@ async def test_get_directory_index_w_manifest( "title", "year", ], "Incorrect fields in index" - # paper.pdf + flag_day.html + bates.txt + obama.txt - assert len(await index.index_files) == 4, "Incorrect number of index files" + # paper.pdf + empty.txt + flag_day.html + bates.txt + obama.txt + assert len(await index.index_files) == 5, "Incorrect number of index files" results = await index.query(query="who is Frederick Bates?") top_result = next(iter(results[0].docs.values())) paper_dir = cast(Path, agent_test_settings.paper_directory) diff --git a/tests/test_paperqa.py b/tests/test_paperqa.py index 2284a3bc..c44adedd 100644 --- a/tests/test_paperqa.py +++ b/tests/test_paperqa.py @@ -837,13 +837,15 @@ def test_pdf_reader_match_doc_details(stub_data_dir: Path) -> None: assert "yes" in answer.answer or "Yes" in answer.answer -@pytest.mark.flaky(reruns=3, only_rerun=["AssertionError"]) +@pytest.mark.flaky(reruns=5, only_rerun=["AssertionError"]) def test_fileio_reader_pdf(stub_data_dir: Path) -> None: with (stub_data_dir / "paper.pdf").open("rb") as f: docs = Docs() docs.add_file(f, "Wellawatte et al, XAI Review, 2023") answer = docs.query("Are counterfactuals actionable?[yes/no]") - assert "yes" in answer.answer or "Yes" in answer.answer + assert any( + w in answer.answer for w in ("yes", "Yes") + ), f"Incorrect answer: {answer.answer}" def test_fileio_reader_txt(stub_data_dir: Path) -> None: