Ignore unsupported image file formats and normalize texts

stchris · stchris · commit 1a7b19332a83 · 2023-01-17T11:56:46.000+01:00
diff --git a/ingestors/support/pdf.py b/ingestors/support/pdf.py
@@ -4,6 +4,7 @@
 import os
 from typing import Dict, List
 import uuid
+import unicodedata
 
 import pikepdf
 from PIL import Image
@@ -147,8 +148,12 @@ def _extract_images(
         pdfimages = []
         for r in raw_images:
             if isinstance(r, list):
-                base_image = pikepdf.PdfImage(r[0]).as_pil_image()
-                soft_mask = pikepdf.PdfImage(r[1]).as_pil_image()
+                try:
+                    base_image = pikepdf.PdfImage(r[0]).as_pil_image()
+                    soft_mask = pikepdf.PdfImage(r[1]).as_pil_image()
+                except NotImplementedError:
+                    # Skip unsupported image file formats
+                    continue
 
                 if base_image.size != soft_mask.size:
                     log.debug(
@@ -218,4 +223,5 @@ def pdf_extract_page(
                 if text is not None:
                     texts += text
 
+        texts = unicodedata.normalize("NFKD", texts.strip())
         return PdfPageModel(number=page_number, text=texts.strip())
diff --git a/tests/fixtures/106972554.pdf b/tests/fixtures/106972554.pdf
diff --git a/tests/test_pdf.py b/tests/test_pdf.py
@@ -206,3 +206,30 @@ def test_ingest_pdf_ocr_greek(self):
         page = emitted[1]
         assert page.schema.name == "Page"
         assert "IRIDECEA HOLDINGS LIMITED" in "\n".join(page.get("bodyText"))
+
+    def test_ingest_pdf_normalized(self):
+        """The text in this document contains characters like the no-break
+        space (U+00A0) which need to be normalized in order for search to work.
+        There are also some unsupported images embedded which need to be skipped."""
+
+        fixture_path, entity = self.fixture("106972554.pdf")
+        self.manager.ingest(fixture_path, entity)
+
+        emitted = self.get_emitted()
+        assert len(emitted) == 4  # presumably 1 "Pages" + 3 "Page" entities; confirm
+
+        expected = {  # page index -> snippet that must appear in that page's text
+            "1": "UPON THE APPLICATION of the Plaintiff in this action",
+            "2": "The 1st, 2nd and 4th Defendants shall jointly",
+            "3": "On or around 6 February 2014",
+        }
+
+        for page in emitted:
+            if page.schema.name == "Pages":
+                continue  # skip the "Pages" entity; only "Page" entities are checked
+
+            assert page.schema.name == "Page"
+            page_no = page.properties["index"][0]  # first "index" value keys into expected
+            page_text = "\n".join(page.get("bodyText"))
+
+            assert expected[page_no] in page_text