Skip to content

Commit 2ce38c4

Browse files
fix function names
1 parent 4296608 commit 2ce38c4

File tree

4 files changed

+42
-39
lines changed

4 files changed

+42
-39
lines changed

mindee/image_operations/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from mindee.image_operations.image_compressor import compress_image

mindee/pdf/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,5 @@
33
from mindee.pdf.pdf_utils import (
44
extract_text_from_pdf,
55
has_source_text,
6+
lerp,
67
)

mindee/pdf/pdf_compressor.py

Lines changed: 11 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from mindee.pdf.pdf_utils import (
1515
extract_text_from_pdf,
1616
has_source_text,
17+
lerp,
1718
)
1819

1920
logger = logging.getLogger(__name__)
@@ -61,15 +62,15 @@ def compress_pdf(
6162
extract_text_from_pdf(pdf_bytes) if not disable_source_text else None
6263
)
6364

64-
compressed_pages = compress_pdf_pages(pdf_bytes, image_quality)
65+
compressed_pages = _compress_pdf_pages(pdf_bytes, image_quality)
6566

6667
if not compressed_pages:
6768
logger.warning(
6869
"Could not compress PDF to a smaller size. Returning original PDF."
6970
)
7071
return pdf_bytes
7172

72-
out_pdf = collect_images_as_pdf(
73+
out_pdf = _collect_images_as_pdf(
7374
[compressed_page_image[0] for compressed_page_image in compressed_pages]
7475
)
7576

@@ -83,7 +84,7 @@ def compress_pdf(
8384
return out_buffer.read()
8485

8586

86-
def compress_pdf_pages(
87+
def _compress_pdf_pages(
8788
pdf_data: bytes,
8889
image_quality: int,
8990
) -> Optional[List[Tuple[bytes, int, int]]]:
@@ -98,10 +99,10 @@ def compress_pdf_pages(
9899
image_quality_loop = image_quality
99100

100101
while image_quality_loop >= MIN_QUALITY:
101-
compressed_pages = compress_pages_with_quality(pdf_data, image_quality_loop)
102+
compressed_pages = _compress_pages_with_quality(pdf_data, image_quality_loop)
102103
total_compressed_size = sum(len(page) for page in compressed_pages)
103104

104-
if is_compression_successful(
105+
if _is_compression_successful(
105106
total_compressed_size, original_size, image_quality
106107
):
107108
return compressed_pages
@@ -146,7 +147,7 @@ def add_text_to_pdf_page( # type: ignore
146147
pdfium_c.FPDFPage_GenerateContent(page.raw)
147148

148149

149-
def compress_pages_with_quality(
150+
def _compress_pages_with_quality(
150151
pdf_data: bytes,
151152
image_quality: int,
152153
) -> List[Tuple[bytes, int, int]]:
@@ -160,15 +161,15 @@ def compress_pages_with_quality(
160161
pdf_document = pdfium.PdfDocument(pdf_data)
161162
compressed_pages = []
162163
for page in pdf_document:
163-
rasterized_page = rasterize_page(page, image_quality)
164+
rasterized_page = _rasterize_page(page, image_quality)
164165
compressed_image = compress_image(rasterized_page, image_quality)
165166
image = Image.open(io.BytesIO(compressed_image))
166167
compressed_pages.append((compressed_image, image.size[0], image.size[1]))
167168

168169
return compressed_pages
169170

170171

171-
def is_compression_successful(
172+
def _is_compression_successful(
172173
total_compressed_size: int, original_size: int, image_quality: int
173174
) -> bool:
174175
"""
@@ -183,7 +184,7 @@ def is_compression_successful(
183184
return total_compressed_size + total_compressed_size * overhead < original_size
184185

185186

186-
def rasterize_page( # type: ignore
187+
def _rasterize_page( # type: ignore
187188
page: pdfium.PdfPage,
188189
quality: int = 85,
189190
) -> bytes:
@@ -200,19 +201,7 @@ def rasterize_page( # type: ignore
200201
return buffer.getvalue()
201202

202203

203-
def lerp(start: float, end: float, t: float) -> float:
204-
"""
205-
Performs linear interpolation between two numbers.
206-
207-
:param start: The starting value.
208-
:param end: The ending value.
209-
:param t: The interpolation factor (0 to 1).
210-
:return: The interpolated value.
211-
"""
212-
return start * (1 - t) + end * t
213-
214-
215-
def collect_images_as_pdf(image_list: List[bytes]) -> pdfium.PdfDocument: # type: ignore
204+
def _collect_images_as_pdf(image_list: List[bytes]) -> pdfium.PdfDocument: # type: ignore
216205
"""
217206
Converts a list of JPEG images into pages in a PdfDocument.
218207

mindee/pdf/pdf_utils.py

Lines changed: 29 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -37,12 +37,12 @@ def extract_text_from_pdf(pdf_bytes: bytes) -> List[List[PDFCharData]]:
3737
char_data_list: List[List[PDFCharData]] = []
3838

3939
for i, page in enumerate(pdf):
40-
char_data_list.append(process_page(page, i, pdfium_lock))
40+
char_data_list.append(_process_page(page, i, pdfium_lock))
4141

4242
return char_data_list
4343

4444

45-
def process_page(page, page_id: int, pdfium_lock: RLock) -> List[PDFCharData]:
45+
def _process_page(page, page_id: int, pdfium_lock: RLock) -> List[PDFCharData]:
4646
"""
4747
Processes a single page of the PDF.
4848
@@ -59,7 +59,7 @@ def process_page(page, page_id: int, pdfium_lock: RLock) -> List[PDFCharData]:
5959
count_chars = pdfium_c.FPDFText_CountChars(text_handler)
6060

6161
for i in range(count_chars):
62-
concatenated_chars = process_char(
62+
concatenated_chars = _process_char(
6363
i, text_handler, page, pdfium_lock, internal_height, internal_width, page_id
6464
)
6565
for concatenated_char in concatenated_chars:
@@ -70,7 +70,7 @@ def process_page(page, page_id: int, pdfium_lock: RLock) -> List[PDFCharData]:
7070
return char_data_list
7171

7272

73-
def process_char(
73+
def _process_char(
7474
i: int,
7575
text_handler,
7676
page,
@@ -91,21 +91,21 @@ def process_char(
9191
:param page_id: ID of the page the character was found on.
9292
:return: List of character data for a page.
9393
"""
94-
char_info = get_char_info(i, text_handler, pdfium_lock)
94+
char_info = _get_char_info(i, text_handler, pdfium_lock)
9595
if not char_info:
9696
return []
97-
char_box = get_char_box(i, text_handler, pdfium_lock)
98-
rotation = get_page_rotation(page, pdfium_lock)
97+
char_box = _get_char_box(i, text_handler, pdfium_lock)
98+
rotation = _get_page_rotation(page, pdfium_lock)
9999

100-
adjusted_box = adjust_char_box(char_box, rotation, internal_height, internal_width)
100+
adjusted_box = _adjust_char_box(char_box, rotation, internal_height, internal_width)
101101
char_data_list: List[PDFCharData] = []
102102
for c in char_info["char"] or " ":
103103
if c in (
104104
"\n",
105105
"\r",
106106
): # Removes duplicated carriage returns in the PDF due to weird extraction.
107107
# No cleaner approach is known (Claude, GPT-4 and o1 did not suggest one either), so this look-ahead check is kept as is.
108-
next_char_info = get_char_info(i + 1, text_handler, pdfium_lock)
108+
next_char_info = _get_char_info(i + 1, text_handler, pdfium_lock)
109109
if not next_char_info or next_char_info["char"] in ("\n", "\r"):
110110
continue
111111

@@ -128,7 +128,7 @@ def process_char(
128128
return char_data_list
129129

130130

131-
def get_char_info(i: int, text_handler, pdfium_lock: RLock) -> dict:
131+
def _get_char_info(i: int, text_handler, pdfium_lock: RLock) -> dict:
132132
"""
133133
Retrieves information about a specific character.
134134
@@ -145,8 +145,8 @@ def get_char_info(i: int, text_handler, pdfium_lock: RLock) -> dict:
145145
if unicode_char == 0xFF:
146146
return {}
147147
char = chr(unicode_char)
148-
font_name = get_font_name(text_handler, i)
149-
font_flags = get_font_flags(text_handler, i)
148+
font_name = _get_font_name(text_handler, i)
149+
font_flags = _get_font_flags(text_handler, i)
150150
font_size = pdfium_c.FPDFText_GetFontSize(text_handler, i)
151151
font_weight = pdfium_c.FPDFText_GetFontWeight(text_handler, i)
152152
_ = pdfium_c.FPDFText_GetStrokeColor(
@@ -167,7 +167,7 @@ def get_char_info(i: int, text_handler, pdfium_lock: RLock) -> dict:
167167
}
168168

169169

170-
def get_font_name(text_handler, i: int) -> str:
170+
def _get_font_name(text_handler, i: int) -> str:
171171
"""
172172
Retrieves the font name for a specific character.
173173
@@ -186,7 +186,7 @@ def get_font_name(text_handler, i: int) -> str:
186186
)
187187

188188

189-
def get_font_flags(text_handler, i: int) -> int:
189+
def _get_font_flags(text_handler, i: int) -> int:
190190
"""
191191
Retrieves the font flags for a specific character.
192192
@@ -199,7 +199,7 @@ def get_font_flags(text_handler, i: int) -> int:
199199
return flags.value
200200

201201

202-
def get_char_box(
202+
def _get_char_box(
203203
i: int, text_handler, pdfium_lock: RLock
204204
) -> Tuple[float, float, float, float]:
205205
"""
@@ -218,7 +218,7 @@ def get_char_box(
218218
return left.value, right.value, bottom.value, top.value
219219

220220

221-
def get_page_rotation(page, pdfium_lock: RLock) -> int:
221+
def _get_page_rotation(page, pdfium_lock: RLock) -> int:
222222
"""
223223
Retrieves the rotation value for a specific page.
224224
@@ -232,7 +232,7 @@ def get_page_rotation(page, pdfium_lock: RLock) -> int:
232232
)
233233

234234

235-
def adjust_char_box(
235+
def _adjust_char_box(
236236
char_box: Tuple[float, float, float, float],
237237
rotation: int,
238238
internal_height: float,
@@ -263,3 +263,15 @@ def adjust_char_box(
263263
internal_height - left,
264264
)
265265
return left, right, top, bottom
266+
267+
268+
def lerp(start: float, end: float, t: float) -> float:
269+
"""
270+
Performs linear interpolation between two numbers.
271+
272+
:param start: The starting value.
273+
:param end: The ending value.
274+
:param t: The interpolation factor (0 to 1).
275+
:return: The interpolated value.
276+
"""
277+
return start * (1 - t) + end * t

0 commit comments

Comments
 (0)