Skip to content

Commit

Permalink
fix tests
Browse files Browse the repository at this point in the history
  • Loading branch information
iammosespaulr committed Nov 15, 2024
1 parent afe6295 commit 3e5a020
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 4 deletions.
4 changes: 4 additions & 0 deletions tests/test_document_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@ def test_document_builder(pdf_document):
first_block = first_page.get_block(first_page.structure[0])
assert first_block.block_type == 'Section-header'
assert first_block.text_extraction_method == 'pdftext'

first_text_block: Line = first_page.get_block(first_block.structure[0])
assert first_text_block.block_type == 'Line'

first_span = first_page.get_block(first_text_block.structure[0])
assert first_span.block_type == 'Span'
assert first_span.text == 'Subspace Adversarial Training'
Expand All @@ -18,8 +20,10 @@ def test_document_builder(pdf_document):

last_block = first_page.get_block(first_page.structure[-1])
assert last_block.block_type == 'Text-inline-math'

last_text_block: Line = first_page.get_block(last_block.structure[-1])
assert last_text_block.block_type == 'Line'

last_span = first_page.get_block(last_text_block.structure[-1])
assert last_span.block_type == 'Span'
assert last_span.text == 'prove the quality of single-step AT solutions. However,'
Expand Down
10 changes: 6 additions & 4 deletions tests/test_ocr_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@
from tests.utils import setup_pdf_document


- def test_document_builder():
+ def test_ocr_pipeline():
pdf_document = setup_pdf_document(
"adversarial.pdf",
document_builder_config={
- "force_ocr": False
+ "force_ocr": True
}
)

Expand All @@ -16,12 +16,14 @@ def test_document_builder():
first_block = first_page.get_block(first_page.structure[0])
assert first_block.text_extraction_method == 'surya'
assert first_block.block_type == 'Section-header'

first_text_block: Line = first_page.get_block(first_block.structure[0])
assert first_text_block.block_type == 'Line'

first_span = first_page.get_block(first_text_block.structure[0])
assert first_span.block_type == 'Span'
- assert first_span.text == 'Subspace Adversarial Training'
+ assert first_span.text.strip() == 'Subspace Adversarial Training'


if __name__ == "__main__":
- test_document_builder()
+ test_ocr_pipeline()

0 comments on commit 3e5a020

Please sign in to comment.