Skip to content

Commit

Permalink
🧪 ✨ add **experimental** pdf-fixing utility (#190)
Browse files Browse the repository at this point in the history
  • Loading branch information
sebastianMindee authored Nov 13, 2023
1 parent 486280b commit 6e644e5
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 29 deletions.
42 changes: 20 additions & 22 deletions mindee/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,60 +447,58 @@ def create_endpoint(
return self._build_endpoint(endpoint_name, account_name, version)

def source_from_path(
self,
input_path: Union[Path, str],
self, input_path: Union[Path, str], fix_pdf: bool = False
) -> PathInput:
"""
Load a document from an absolute path, as a string.
:param input_path: Path of file to open
:param fix_pdf: If True, calls ``fix_pdf()`` on the loaded source before returning it
(experimental: this alters the data of the file).
"""
return PathInput(input_path)
input_doc = PathInput(input_path)
if fix_pdf:
input_doc.fix_pdf()
return input_doc

def source_from_file(
self,
input_file: BinaryIO,
self, input_file: BinaryIO, fix_pdf: bool = False
) -> FileInput:
"""
Load a document from a normal Python file object/handle.
:param input_file: Input file handle
:param fix_pdf: If True, calls ``fix_pdf()`` on the loaded source before returning it
(experimental: this alters the data of the file).
"""
return FileInput(
input_file,
)
input_doc = FileInput(input_file)
if fix_pdf:
input_doc.fix_pdf()
return input_doc

def source_from_b64string(
self,
input_string: str,
filename: str,
self, input_string: str, filename: str, fix_pdf: bool = False
) -> Base64Input:
"""
Load a document from a base64 encoded string.
:param input_string: Input to parse as base64 string
:param filename: The name of the file (without the path)
:param fix_pdf: If True, calls ``fix_pdf()`` on the loaded source before returning it
(experimental: this alters the data of the file).
"""
return Base64Input(
input_string,
filename,
)
input_doc = Base64Input(input_string, filename)
if fix_pdf:
input_doc.fix_pdf()
return input_doc

def source_from_bytes(
self,
input_bytes: bytes,
filename: str,
self, input_bytes: bytes, filename: str, fix_pdf: bool = False
) -> BytesInput:
"""
Load a document from raw bytes.
:param input_bytes: Raw byte input
:param filename: The name of the file (without the path)
:param fix_pdf: If True, calls ``fix_pdf()`` on the loaded source before returning it
(experimental: this alters the data of the file).
"""
return BytesInput(
input_bytes,
filename,
)
input_doc = BytesInput(input_bytes, filename)
if fix_pdf:
input_doc.fix_pdf()
return input_doc

def source_from_url(
self,
Expand Down
52 changes: 45 additions & 7 deletions mindee/input/sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,15 @@
import io
import mimetypes
import os
import tempfile
from enum import Enum
from pathlib import Path
from typing import BinaryIO, Optional, Sequence, Tuple, Union

import pikepdf

from mindee.error.mimetype_error import MimeTypeError
from mindee.error.mindee_error import MindeeSourceError
from mindee.error.mindee_error import MindeeError, MindeeSourceError
from mindee.input.page_options import KEEP_ONLY, REMOVE
from mindee.logger import logger

Expand Down Expand Up @@ -46,10 +47,7 @@ class LocalInputSource:
input_type: InputType
filepath: Optional[str] = None

def __init__(
self,
input_type: InputType,
):
def __init__(self, input_type: InputType):
self.input_type = input_type
self._check_mimetype()

Expand All @@ -60,11 +58,51 @@ def _check_mimetype(self) -> None:
if file_mimetype:
self.file_mimetype = file_mimetype
else:
raise MimeTypeError(f"Could not determine MIME type of '{self.filename}'")
raise MimeTypeError(f"Could not determine MIME type of '{self.filename}'.")

if self.file_mimetype not in ALLOWED_MIME_TYPES:
file_types = ", ".join(ALLOWED_MIME_TYPES)
raise MimeTypeError(f"File type not allowed, must be one of {file_types}")
raise MimeTypeError(f"File type not allowed, must be one of {file_types}.")

def fix_pdf(self, maximum_offset: int = 500) -> None:
    """
    Fix a potentially broken PDF file.

    WARNING: this feature alters the data of the enqueued file by removing unnecessary headers.

    Reads the whole content of the file and looks for the first ``%PDF-`` tag. If the tag
    starts before ``maximum_offset``, all bytes preceding it are dropped, the file object is
    replaced by an in-memory stream positioned at its start, and the MIME type is set to
    ``application/pdf``. On failure the original file object is left rewound and untouched.

    :param maximum_offset: Byte offset before which the PDF tag must start for the superfluous
        headers to be removed. Cannot be less than 0.
    :raises MindeeError: if ``maximum_offset`` is negative.
    :raises MimeTypeError: if no PDF tag is found, or if it starts at or after
        ``maximum_offset``.
    """
    if maximum_offset < 0:
        raise MindeeError("Can't set maximum offset for pdf-fixing to less than 0.")

    try:
        buf = self.file_object.read()
        # Rewind so the original stream stays usable if the file cannot be fixed.
        self.file_object.seek(0)
        pos: int = buf.find(b"%PDF-")
    except Exception as exc:
        # Unexpected I/O or type errors are reported, then propagated unchanged.
        logger.error("Attempt to fix pdf raised exception %s.", exc)
        raise

    if pos < 0:
        raise MimeTypeError("Provided stream isn't a valid PDF-like object.")
    if pos >= maximum_offset:
        raise MimeTypeError(
            f"PDF couldn't be fixed. PDF tag was found at position {pos}."
        )

    # The content is already in memory: slice it rather than re-reading the
    # stream and round-tripping through a temporary file.
    self.file_object = io.BytesIO(buf[pos:])
    self.file_mimetype = "application/pdf"

def is_pdf(self) -> bool:
""":return: True if the file is a PDF."""
Expand Down
21 changes: 21 additions & 0 deletions tests/test_inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,27 @@ def test_pdf_blank_check():
assert input_not_blank.count_doc_pages() == 1


#
# Fixing broken PDFs
#


def test_broken_unfixable_pdf():
    """``fix_pdf`` must raise ``MimeTypeError`` on the unfixable sample file."""
    # Build the input outside of the ``raises`` block, so that only the call to
    # ``fix_pdf`` (and not the constructor) can satisfy the expectation.
    input_doc = PathInput(FILE_TYPES_DIR / "pdf" / "broken_unfixable.pdf")
    with pytest.raises(MimeTypeError):
        input_doc.fix_pdf()


def test_broken_fixable_pdf():
    """``fix_pdf`` must repair the fixable sample so its data starts with the PDF tag."""
    input_doc = PathInput(FILE_TYPES_DIR / "pdf" / "broken_fixable.pdf")
    input_doc.fix_pdf()
    # A successful fix flags the source as a PDF and leaves the tag at offset 0.
    assert input_doc.file_mimetype == "application/pdf"
    assert input_doc.file_object.read(5) == b"%PDF-"


def test_broken_fixable_invoice_pdf():
    """``fix_pdf`` must repair the broken invoice sample so its data starts with the PDF tag."""
    input_doc = PathInput(FILE_TYPES_DIR / "pdf" / "broken_invoice.pdf")
    input_doc.fix_pdf()
    # A successful fix flags the source as a PDF and leaves the tag at offset 0.
    assert input_doc.file_mimetype == "application/pdf"
    assert input_doc.file_object.read(5) == b"%PDF-"


#
# Images
#
Expand Down

0 comments on commit 6e644e5

Please sign in to comment.