Fixing crash in chunk_text for empty file (#389)
jamesbraza authored Sep 13, 2024
1 parent 586ab7c commit 30ec9b0
Showing 5 changed files with 32 additions and 13 deletions.
7 changes: 6 additions & 1 deletion paperqa/agents/search.py
@@ -353,6 +353,9 @@ async def maybe_get_manifest(filename: anyio.Path | None) -> dict[str, DocDetail
return {}


FAILED_DOCUMENT_ADD_ID = "ERROR"


async def process_file(
file_path: anyio.Path,
search_index: SearchIndex,
@@ -382,7 +385,9 @@ async def process_file(
logger.exception(
f"Error parsing {file_name}, skipping index for this file."
)
(await search_index.index_files)[str(file_path)] = "ERROR"
(await search_index.index_files)[
str(file_path)
] = FAILED_DOCUMENT_ADD_ID
await search_index.save_index()
return

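The new FAILED_DOCUMENT_ADD_ID sentinel makes failed adds visible to callers instead of leaving them indistinguishable from indexed files. As a rough sketch (not paper-qa's actual API: count_successful and the example mapping are invented here for illustration), a caller holding the dict that await search_index.index_files resolves to can filter on the sentinel, which is what the updated test in tests/test_agents.py does further down:

FAILED_DOCUMENT_ADD_ID = "ERROR"


def count_successful(index_files: dict[str, str]) -> int:
    # Count files whose document add succeeded, i.e. whose ID is not the failure sentinel.
    return sum(doc_id != FAILED_DOCUMENT_ADD_ID for doc_id in index_files.values())


# Hypothetical mapping of file path -> document ID: one successfully added
# paper plus one file (the empty stub) that failed to parse.
print(count_successful({"paper.pdf": "some-doc-id", "empty.txt": FAILED_DOCUMENT_ADD_ID}))  # prints 1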
18 changes: 13 additions & 5 deletions paperqa/readers.py
@@ -48,7 +48,8 @@ def chunk_pdf(

if not parsed_text.content:
raise ImpossibleParsingError(
"No text was parsed from the document: either empty or corrupted."
f"No text was parsed from the document named {doc.docname!r}, either empty"
" or corrupted."
)

for page_num, page_text in parsed_text.content.items():
@@ -117,24 +118,31 @@ def parse_text(


def chunk_text(
parsed_text: ParsedText, doc: Doc, chunk_chars: int, overlap: int, use_tiktoken=True
parsed_text: ParsedText,
doc: Doc,
chunk_chars: int,
overlap: int,
use_tiktoken: bool = True,
) -> list[Text]:
"""Parse a document into chunks, based on tiktoken encoding.
NOTE: We get some byte continuation errors.
Currently ignored, but should explore more to make sure we
don't miss anything.
Currently ignored, but should explore more to make sure we don't miss anything.
"""
texts: list[Text] = []
enc = tiktoken.get_encoding("cl100k_base")
split = []

if not isinstance(parsed_text.content, str):
raise NotImplementedError(
f"ParsedText.content must be a `str`, not {type(parsed_text.content)}."
)

content = parsed_text.content if not use_tiktoken else parsed_text.encode_content()
if not content: # Avoid div0 in token calculations
raise ImpossibleParsingError(
f"No text was parsed from the document named {doc.docname!r}, either empty"
" or corrupted."
)

# convert from characters to chunks
char_count = parsed_text.metadata.total_parsed_text_length # e.g., 25,000
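For intuition about the crash being fixed: an empty file yields empty parsed content, and the downstream chunk-size arithmetic then divides by a zero chunk count. Below is a minimal standalone sketch of the guard, under stated assumptions: the character-only chunk_plain_text helper is hypothetical and simplified, while the real chunk_text also handles tiktoken encoding, overlap bookkeeping, and ParsedText metadata.

from math import ceil


class ImpossibleParsingError(Exception):
    """Stand-in for paper-qa's exception of the same name."""


def chunk_plain_text(
    content: str, docname: str, chunk_chars: int = 3000, overlap: int = 100
) -> list[str]:
    # The guard this commit adds: without it, empty content would turn the
    # average-chunk-size division below into a ZeroDivisionError.
    if not content:
        raise ImpossibleParsingError(
            f"No text was parsed from the document named {docname!r}, either empty"
            " or corrupted."
        )
    chunk_count = ceil(len(content) / chunk_chars)  # e.g., 25,000 / 3,000 -> 9
    average_chunk_size = ceil(len(content) / chunk_count)  # div0 here if content were empty
    step = max(average_chunk_size - overlap, 1)
    return [content[i : i + chunk_chars] for i in range(0, len(content), step)]


try:
    chunk_plain_text("", "empty.txt")  # the empty stub file now fails fast with a clear message
except ImpossibleParsingError as exc:
    print(exc)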
Empty file added tests/stub_data/empty.txt
Empty file.
14 changes: 9 additions & 5 deletions tests/test_agents.py
@@ -21,7 +21,7 @@
from paperqa.agents import agent_query
from paperqa.agents.env import settings_to_tools
from paperqa.agents.models import AgentStatus, AnswerResponse, QueryRequest
from paperqa.agents.search import get_directory_index
from paperqa.agents.search import FAILED_DOCUMENT_ADD_ID, get_directory_index
from paperqa.agents.tools import (
EnvironmentState,
GatherEvidence,
@@ -44,8 +44,12 @@ async def test_get_directory_index(agent_test_settings: Settings) -> None:
"title",
"year",
], "Incorrect fields in index"
# paper.pdf + flag_day.html + bates.txt + obama.txt
assert len(await index.index_files) == 4, "Incorrect number of index files"
# paper.pdf + empty.txt + flag_day.html + bates.txt + obama.txt,
# but empty.txt fails to be added
path_to_id = await index.index_files
assert (
sum(id_ != FAILED_DOCUMENT_ADD_ID for id_ in path_to_id.values()) == 4
), "Incorrect number of parsed index files"
results = await index.query(query="who is Frederick Bates?")
paper_dir = cast(Path, agent_test_settings.paper_directory)
assert results[0].docs.keys() == {md5sum((paper_dir / "bates.txt").absolute())}
@@ -63,8 +67,8 @@ async def test_get_directory_index_w_manifest(
"title",
"year",
], "Incorrect fields in index"
# paper.pdf + flag_day.html + bates.txt + obama.txt
assert len(await index.index_files) == 4, "Incorrect number of index files"
# paper.pdf + empty.txt + flag_day.html + bates.txt + obama.txt
assert len(await index.index_files) == 5, "Incorrect number of index files"
results = await index.query(query="who is Frederick Bates?")
top_result = next(iter(results[0].docs.values()))
paper_dir = cast(Path, agent_test_settings.paper_directory)
6 changes: 4 additions & 2 deletions tests/test_paperqa.py
@@ -837,13 +837,15 @@ def test_pdf_reader_match_doc_details(stub_data_dir: Path) -> None:
assert "yes" in answer.answer or "Yes" in answer.answer


@pytest.mark.flaky(reruns=3, only_rerun=["AssertionError"])
@pytest.mark.flaky(reruns=5, only_rerun=["AssertionError"])
def test_fileio_reader_pdf(stub_data_dir: Path) -> None:
with (stub_data_dir / "paper.pdf").open("rb") as f:
docs = Docs()
docs.add_file(f, "Wellawatte et al, XAI Review, 2023")
answer = docs.query("Are counterfactuals actionable?[yes/no]")
assert "yes" in answer.answer or "Yes" in answer.answer
assert any(
w in answer.answer for w in ("yes", "Yes")
), f"Incorrect answer: {answer.answer}"


def test_fileio_reader_txt(stub_data_dir: Path) -> None:
