Skip to content

Commit 1a7b193

Browse files
committed
Ignore unsupported image file formats and normalize texts
1 parent dfef0b6 commit 1a7b193

File tree

3 files changed

+35
-2
lines changed

3 files changed

+35
-2
lines changed

ingestors/support/pdf.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import os
55
from typing import Dict, List
66
import uuid
7+
import unicodedata
78

89
import pikepdf
910
from PIL import Image
@@ -147,8 +148,12 @@ def _extract_images(
147148
pdfimages = []
148149
for r in raw_images:
149150
if isinstance(r, list):
150-
base_image = pikepdf.PdfImage(r[0]).as_pil_image()
151-
soft_mask = pikepdf.PdfImage(r[1]).as_pil_image()
151+
try:
152+
base_image = pikepdf.PdfImage(r[0]).as_pil_image()
153+
soft_mask = pikepdf.PdfImage(r[1]).as_pil_image()
154+
except NotImplementedError:
155+
# Skip unsupported image file formats
156+
continue
152157

153158
if base_image.size != soft_mask.size:
154159
log.debug(
@@ -218,4 +223,5 @@ def pdf_extract_page(
218223
if text is not None:
219224
texts += text
220225

226+
texts = unicodedata.normalize("NFKD", texts.strip())
221227
return PdfPageModel(number=page_number, text=texts.strip())

tests/fixtures/106972554.pdf

204 KB
Binary file not shown.

tests/test_pdf.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,3 +206,30 @@ def test_ingest_pdf_ocr_greek(self):
206206
page = emitted[1]
207207
assert page.schema.name == "Page"
208208
assert "IRIDECEA HOLDINGS LIMITED" in "\n".join(page.get("bodyText"))
209+
210+
def test_ingest_pdf_normalized(self):
    """Ingest a PDF whose text must be normalized and whose images are skipped.

    The text of the fixture contains characters such as the non-breaking
    space (U+00A0) which need to be normalized in order for search to work.
    The document also embeds some images in unsupported formats, which need
    to be skipped.
    """
    fixture_path, entity = self.fixture("106972554.pdf")
    self.manager.ingest(fixture_path, entity)

    emitted = self.get_emitted()
    assert len(emitted) == 4

    # Phrase expected in the body text of each page, keyed by the value of
    # the page's "index" property.
    expected = {
        "1": "UPON THE APPLICATION of the Plaintiff in this action",
        "2": "The 1st, 2nd and 4th Defendants shall jointly",
        "3": "On or around 6 February 2014",
    }

    # The "Pages" entity carries no page text of its own; everything else
    # emitted must be a single "Page".
    page_entities = [item for item in emitted if item.schema.name != "Pages"]
    for page in page_entities:
        assert page.schema.name == "Page"

        page_no = page.properties["index"][0]
        body_lines = page.get("bodyText")
        page_text = "\n".join(body_lines)

        assert expected[page_no] in page_text

0 commit comments

Comments
 (0)