|
1 | 1 | from dataclasses import dataclass |
2 | | -from io import StringIO |
3 | 2 | import logging |
4 | 3 | import os |
5 | 4 | from typing import Dict, List |
6 | 5 | import uuid |
7 | 6 | import unicodedata |
8 | 7 |
|
9 | | -import pikepdf |
10 | | -from PIL import Image |
11 | | -from pdfminer.converter import TextConverter |
12 | | -from pdfminer.layout import LAParams |
13 | | -from pdfminer.pdfdocument import PDFDocument |
14 | | -from pdfminer.pdfpage import PDFPage |
15 | | -from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter |
16 | | -from pdfminer.pdfparser import PDFParser |
| 8 | +import fitz |
17 | 9 |
|
18 | 10 | from normality import collapse_spaces # noqa |
19 | 11 |
|
20 | 12 | from followthemoney import model |
| 13 | +from ingestors.exc import UnauthorizedError |
21 | 14 | from ingestors.support.ocr import OCRSupport |
22 | 15 | from ingestors.support.convert import DocumentConvertSupport |
23 | 16 |
|
24 | | -# silence some shouty debug output from pdfminer |
25 | | -logging.getLogger("pdfminer").setLevel(logging.WARNING) |
26 | | - |
27 | 17 | log = logging.getLogger(__name__) |
28 | 18 |
|
29 | 19 |
|
@@ -85,143 +75,63 @@ def extract_pages(self, pdf_model: PdfModel, entity, manager): |
85 | 75 | def parse(self, file_path: str) -> PdfModel: |
86 | 76 | """Takes a file_path to a pdf and returns a `PdfModel`""" |
87 | 77 | pdf_model = PdfModel(metadata=None, xmp_metadata=None, pages=[]) |
88 | | - with open(file_path, "rb") as pdf_file: |
89 | | - parser = PDFParser(pdf_file) |
90 | | - pike_doc = pikepdf.Pdf.open(pdf_file) |
91 | | - pdf_doc = PDFDocument(parser) |
92 | | - for page_number, page in enumerate(PDFPage.create_pages(pdf_doc), 1): |
| 78 | + with fitz.open(file_path) as pdf_doc: |
| 79 | + if pdf_doc.needs_pass: |
| 80 | + raise UnauthorizedError |
| 81 | + # print(f"\n[IF] number of pages: {pdf_doc.page_count}") |
| 82 | + for page_num in range(pdf_doc.page_count): |
93 | 83 | pdf_model.pages.append( |
94 | | - self.pdf_extract_page(page, pike_doc, page_number) |
| 84 | + self.pdf_extract_page(pdf_doc, pdf_doc[page_num], page_num) |
95 | 85 | ) |
96 | 86 | return pdf_model |
97 | 87 |
|
98 | 88 | def parse_and_ingest(self, file_path: str, entity, manager): |
99 | | - try: |
100 | | - pdf_model: PdfModel = self.parse(file_path) |
101 | | - self.extract_metadata(pdf_model, entity) |
102 | | - self.extract_xmp_metadata(pdf_model, entity) |
103 | | - self.extract_pages(pdf_model, entity, manager) |
104 | | - except pikepdf._core.PasswordError as pwe: |
105 | | - log.info(f"Failed to ingest password protected pdf: {file_path}") |
106 | | - raise pwe |
| 89 | + pdf_model: PdfModel = self.parse(file_path) |
| 90 | + self.extract_metadata(pdf_model, entity) |
| 91 | + self.extract_xmp_metadata(pdf_model, entity) |
| 92 | + self.extract_pages(pdf_model, entity, manager) |
107 | 93 |
|
108 | 94 | def pdf_alternative_extract(self, entity, pdf_path: str, manager): |
109 | 95 | checksum = self.manager.store(pdf_path) |
110 | 96 | entity.set("pdfHash", checksum) |
111 | 97 | self.parse_and_ingest(pdf_path, entity, manager) |
112 | 98 |
|
113 | | - def _find_images(self, container: pikepdf.Pdf, depth: int = 0): |
114 | | - if "/Resources" not in container: |
115 | | - return [] |
116 | | - resources = container["/Resources"] |
117 | | - |
118 | | - if "/XObject" not in resources: |
119 | | - return [] |
120 | | - xobjects = resources["/XObject"].as_dict() |
121 | | - |
122 | | - if depth > 0: |
123 | | - allow_recursion = False |
124 | | - else: |
125 | | - allow_recursion = True |
126 | | - |
127 | | - images = [] |
128 | | - |
129 | | - for xobject in xobjects: |
130 | | - candidate = xobjects[xobject] |
131 | | - if candidate["/Subtype"] == "/Image": |
132 | | - if "/SMask" in candidate: |
133 | | - images.append([candidate, candidate["/SMask"]]) |
134 | | - else: |
135 | | - images.append(candidate) |
136 | | - elif allow_recursion and candidate["/Subtype"] == "/Form": |
137 | | - images.extend(self._find_images(candidate, depth=depth + 1)) |
138 | | - |
139 | | - return images |
140 | | - |
141 | | - def _extract_images( |
142 | | - self, pike_doc: pikepdf.Pdf, image_path: str, prefix: str = "img" |
143 | | - ): |
144 | | - raw_images = [] |
145 | | - found_imgs = self._find_images(pike_doc) |
146 | | - raw_images.extend(found_imgs) |
147 | | - |
148 | | - pdfimages = [] |
149 | | - for r in raw_images: |
150 | | - if isinstance(r, list): |
151 | | - try: |
152 | | - base_image = pikepdf.PdfImage(r[0]).as_pil_image() |
153 | | - soft_mask = pikepdf.PdfImage(r[1]).as_pil_image() |
154 | | - except NotImplementedError: |
155 | | - # Skip unsupported image file formats |
156 | | - continue |
157 | | - |
158 | | - if base_image.size != soft_mask.size: |
159 | | - log.debug( |
160 | | - "Warning: Image and /SMask have a different size. This is unexpected.", |
161 | | - ) |
162 | | - soft_mask = soft_mask.resize(base_image.size) |
163 | | - |
164 | | - if base_image.mode in ("L", "LA"): |
165 | | - transparency = Image.new("LA", base_image.size, (0, 0)) |
166 | | - else: |
167 | | - if base_image.mode not in ("RGB", "RGBA"): |
168 | | - base_image = base_image.convert("RGB") |
169 | | - transparency = Image.new("RGBA", base_image.size, (0, 0, 0, 0)) |
170 | | - |
171 | | - composite = Image.composite(base_image, transparency, soft_mask) |
172 | | - |
173 | | - pdfimages.append(composite) |
174 | | - |
175 | | - else: |
176 | | - pdfimages.append(pikepdf.PdfImage(r)) |
177 | | - |
178 | | - n_images = len(pdfimages) |
179 | | - |
180 | | - n_digits = len(str(n_images)) |
181 | | - for i, image in enumerate(pdfimages): |
182 | | - filepath_prefix = os.path.join(image_path, prefix + f"{i+1:0{n_digits}}") |
183 | | - if isinstance(image, Image.Image): |
184 | | - image.save(filepath_prefix + ".png", "PNG") |
185 | | - else: |
186 | | - pil_image = image.as_pil_image() |
187 | | - if pil_image.format == "TIFF": |
188 | | - pil_image.save(filepath_prefix + ".png", "PNG") |
189 | | - image.extract_to(fileprefix=filepath_prefix) |
190 | | - |
191 | | - def pdf_extract_page( |
192 | | - self, page: PDFPage, pike_doc: pikepdf._core.Pdf, page_number: int |
193 | | - ) -> PdfPageModel: |
| 99 | + def pdf_extract_page(self, pdf_doc, page, page_number: int) -> PdfPageModel: |
194 | 100 | """Extract the contents of a single PDF page, using OCR if need be.""" |
195 | | - buf = StringIO() |
196 | | - rsrcmgr = PDFResourceManager() |
197 | | - device = TextConverter( |
198 | | - rsrcmgr, |
199 | | - buf, |
200 | | - laparams=LAParams( |
201 | | - line_overlap=0.5, # default: 0.5 |
202 | | - char_margin=2.0, # default: 2.0 |
203 | | - word_margin=0.2, # default: 0.1 |
204 | | - line_margin=0.5, # default: 0.5 |
205 | | - boxes_flow=0.5, # default: 0.5 |
206 | | - detect_vertical=True, # default: False |
207 | | - all_texts=True, # default: False |
208 | | - ), |
209 | | - ) |
210 | | - interpreter = PDFPageInterpreter(rsrcmgr, device) |
211 | | - interpreter.process_page(page) |
212 | | - texts = buf.getvalue() |
| 101 | + # Extract text |
| 102 | + full_text = page.get_text() |
| 103 | + # print(f"[IF] extracted text: \n{full_text}") |
| 104 | + |
| 105 | + # Extract images |
| 106 | + images = page.get_images() |
| 107 | + |
| 108 | + # Create a temporary location to store all extracted images |
213 | 109 | temp_dir = self.make_empty_directory() |
214 | | - image_path = temp_dir.joinpath(str(uuid.uuid4())) |
215 | | - os.mkdir(image_path) |
216 | | - pike_page = pike_doc.pages[page_number - 1] |
217 | | - self._extract_images(pike_page, image_path) |
| 110 | + image_dir = temp_dir.joinpath(str(uuid.uuid4())) |
| 111 | + os.mkdir(image_dir) |
| 112 | + |
| 113 | + # Extract images from PDF and store them on the disk |
| 114 | + extracted_images = [] |
| 115 | + for image_index, image in enumerate(images, start=1): |
| 116 | + xref = image[0] |
| 117 | + img = pdf_doc.extract_image(xref) |
| 118 | + if img: |
| 119 | + image_path = os.path.join( |
| 120 | + image_dir, f"image{page_number+1}_{image_index}.{img['ext']}" |
| 121 | + ) |
| 122 | + with open(image_path, "wb") as image_file: |
| 123 | + image_file.write(img["image"]) |
| 124 | + extracted_images.append(image_path) |
| 125 | + |
| 126 | + # Attempt to OCR the images and extract text |
218 | 127 | languages = self.manager.context.get("languages") |
219 | | - for image_file in image_path.glob("*.png"): |
220 | | - with open(image_file, "rb") as fh: |
| 128 | + for image_path in extracted_images: |
| 129 | + with open(image_path, "rb") as fh: |
221 | 130 | data = fh.read() |
222 | 131 | text = self.extract_ocr_text(data, languages=languages) |
223 | 132 | if text is not None: |
224 | | - texts += text |
| 133 | + # print(f"[IF] extracted text from images: \n{text}") |
| 134 | + full_text += text |
225 | 135 |
|
226 | | - texts = unicodedata.normalize("NFKD", texts.strip()) |
227 | | - return PdfPageModel(number=page_number, text=texts.strip()) |
| 136 | + full_text = unicodedata.normalize("NFKD", full_text.strip()) |
| 137 | + return PdfPageModel(number=page_number + 1, text=full_text.strip()) |
0 commit comments