3 files changed: +35 -2 lines changed
Original file line number | Diff line number | Diff line change
44import os
55from typing import Dict , List
66import uuid
7+ import unicodedata
78
89import pikepdf
910from PIL import Image
@@ -147,8 +148,12 @@ def _extract_images(
147148 pdfimages = []
148149 for r in raw_images :
149150 if isinstance (r , list ):
150- base_image = pikepdf .PdfImage (r [0 ]).as_pil_image ()
151- soft_mask = pikepdf .PdfImage (r [1 ]).as_pil_image ()
151+ try :
152+ base_image = pikepdf .PdfImage (r [0 ]).as_pil_image ()
153+ soft_mask = pikepdf .PdfImage (r [1 ]).as_pil_image ()
154+ except NotImplementedError :
155+ # Skip unsupported image file formats
156+ continue
152157
153158 if base_image .size != soft_mask .size :
154159 log .debug (
@@ -218,4 +223,5 @@ def pdf_extract_page(
218223 if text is not None :
219224 texts += text
220225
226+ texts = unicodedata .normalize ("NFKD" , texts .strip ())
221227 return PdfPageModel (number = page_number , text = texts .strip ())
Original file line number | Diff line number | Diff line change
@@ -206,3 +206,30 @@ def test_ingest_pdf_ocr_greek(self):
206206 page = emitted [1 ]
207207 assert page .schema .name == "Page"
208208 assert "IRIDECEA HOLDINGS LIMITED" in "\n " .join (page .get ("bodyText" ))
209+
210+ def test_ingest_pdf_normalized (self ):
211+ """The text in this document contains escape sequences like
212+ \xa0 which need to be normalized in order for search to work. There are
213+ also some unsupported images embedded which need to be skipped."""
214+
215+ fixture_path , entity = self .fixture ("106972554.pdf" )
216+ self .manager .ingest (fixture_path , entity )
217+
218+ emitted = self .get_emitted ()
219+ assert len (emitted ) == 4
220+
221+ expected = {
222+ "1" : "UPON THE APPLICATION of the Plaintiff in this action" ,
223+ "2" : "The 1st, 2nd and 4th Defendants shall jointly" ,
224+ "3" : "On or around 6 February 2014" ,
225+ }
226+
227+ for page in emitted :
228+ if page .schema .name == "Pages" :
229+ continue
230+
231+ assert page .schema .name == "Page"
232+ page_no = page .properties ["index" ][0 ]
233+ page_text = "\n " .join (page .get ("bodyText" ))
234+
235+ assert expected [page_no ] in page_text
You can’t perform that action at this time.
0 commit comments