@@ -37,12 +37,12 @@ def extract_text_from_pdf(pdf_bytes: bytes) -> List[List[PDFCharData]]:
37
37
char_data_list : List [List [PDFCharData ]] = []
38
38
39
39
for i , page in enumerate (pdf ):
40
- char_data_list .append (process_page (page , i , pdfium_lock ))
40
+ char_data_list .append (_process_page (page , i , pdfium_lock ))
41
41
42
42
return char_data_list
43
43
44
44
45
- def process_page (page , page_id : int , pdfium_lock : RLock ) -> List [PDFCharData ]:
45
+ def _process_page (page , page_id : int , pdfium_lock : RLock ) -> List [PDFCharData ]:
46
46
"""
47
47
Processes a single page of the PDF.
48
48
@@ -59,7 +59,7 @@ def process_page(page, page_id: int, pdfium_lock: RLock) -> List[PDFCharData]:
59
59
count_chars = pdfium_c .FPDFText_CountChars (text_handler )
60
60
61
61
for i in range (count_chars ):
62
- concatenated_chars = process_char (
62
+ concatenated_chars = _process_char (
63
63
i , text_handler , page , pdfium_lock , internal_height , internal_width , page_id
64
64
)
65
65
for concatenated_char in concatenated_chars :
@@ -70,7 +70,7 @@ def process_page(page, page_id: int, pdfium_lock: RLock) -> List[PDFCharData]:
70
70
return char_data_list
71
71
72
72
73
- def process_char (
73
+ def _process_char (
74
74
i : int ,
75
75
text_handler ,
76
76
page ,
@@ -91,21 +91,21 @@ def process_char(
91
91
:param page_id: ID of the page the character was found on.
92
92
:return: List of character data for a page.
93
93
"""
94
- char_info = get_char_info (i , text_handler , pdfium_lock )
94
+ char_info = _get_char_info (i , text_handler , pdfium_lock )
95
95
if not char_info :
96
96
return []
97
- char_box = get_char_box (i , text_handler , pdfium_lock )
98
- rotation = get_page_rotation (page , pdfium_lock )
97
+ char_box = _get_char_box (i , text_handler , pdfium_lock )
98
+ rotation = _get_page_rotation (page , pdfium_lock )
99
99
100
- adjusted_box = adjust_char_box (char_box , rotation , internal_height , internal_width )
100
+ adjusted_box = _adjust_char_box (char_box , rotation , internal_height , internal_width )
101
101
char_data_list : List [PDFCharData ] = []
102
102
for c in char_info ["char" ] or " " :
103
103
if c in (
104
104
"\n " ,
105
105
"\r " ,
106
106
): # Removes duplicated carriage returns in the PDF due to weird extraction.
107
107
# IDK how to make this better, and neither does Claude, GPT4 nor GPT-o1, so I'm leaving this weird check.
108
- next_char_info = get_char_info (i + 1 , text_handler , pdfium_lock )
108
+ next_char_info = _get_char_info (i + 1 , text_handler , pdfium_lock )
109
109
if not next_char_info or next_char_info ["char" ] in ("\n " , "\r " ):
110
110
continue
111
111
@@ -128,7 +128,7 @@ def process_char(
128
128
return char_data_list
129
129
130
130
131
- def get_char_info (i : int , text_handler , pdfium_lock : RLock ) -> dict :
131
+ def _get_char_info (i : int , text_handler , pdfium_lock : RLock ) -> dict :
132
132
"""
133
133
Retrieves information about a specific character.
134
134
@@ -145,8 +145,8 @@ def get_char_info(i: int, text_handler, pdfium_lock: RLock) -> dict:
145
145
if unicode_char == 0xFF :
146
146
return {}
147
147
char = chr (unicode_char )
148
- font_name = get_font_name (text_handler , i )
149
- font_flags = get_font_flags (text_handler , i )
148
+ font_name = _get_font_name (text_handler , i )
149
+ font_flags = _get_font_flags (text_handler , i )
150
150
font_size = pdfium_c .FPDFText_GetFontSize (text_handler , i )
151
151
font_weight = pdfium_c .FPDFText_GetFontWeight (text_handler , i )
152
152
_ = pdfium_c .FPDFText_GetStrokeColor (
@@ -167,7 +167,7 @@ def get_char_info(i: int, text_handler, pdfium_lock: RLock) -> dict:
167
167
}
168
168
169
169
170
- def get_font_name (text_handler , i : int ) -> str :
170
+ def _get_font_name (text_handler , i : int ) -> str :
171
171
"""
172
172
Retrieves the font name for a specific character.
173
173
@@ -186,7 +186,7 @@ def get_font_name(text_handler, i: int) -> str:
186
186
)
187
187
188
188
189
- def get_font_flags (text_handler , i : int ) -> int :
189
+ def _get_font_flags (text_handler , i : int ) -> int :
190
190
"""
191
191
Retrieves the font flags for a specific character.
192
192
@@ -199,7 +199,7 @@ def get_font_flags(text_handler, i: int) -> int:
199
199
return flags .value
200
200
201
201
202
- def get_char_box (
202
+ def _get_char_box (
203
203
i : int , text_handler , pdfium_lock : RLock
204
204
) -> Tuple [float , float , float , float ]:
205
205
"""
@@ -218,7 +218,7 @@ def get_char_box(
218
218
return left .value , right .value , bottom .value , top .value
219
219
220
220
221
- def get_page_rotation (page , pdfium_lock : RLock ) -> int :
221
+ def _get_page_rotation (page , pdfium_lock : RLock ) -> int :
222
222
"""
223
223
Retrieves the rotation value for a specific page.
224
224
@@ -232,7 +232,7 @@ def get_page_rotation(page, pdfium_lock: RLock) -> int:
232
232
)
233
233
234
234
235
- def adjust_char_box (
235
+ def _adjust_char_box (
236
236
char_box : Tuple [float , float , float , float ],
237
237
rotation : int ,
238
238
internal_height : float ,
@@ -263,3 +263,15 @@ def adjust_char_box(
263
263
internal_height - left ,
264
264
)
265
265
return left , right , top , bottom
266
+
267
+
268
def lerp(start: float, end: float, t: float) -> float:
    """
    Blends linearly from one number towards another.

    :param start: The value the blend begins at (fully weighted when ``t`` is 0).
    :param end: The value the blend finishes at (fully weighted when ``t`` is 1).
    :param t: The blend weight, nominally between 0 and 1. It is not clamped,
        so values outside that range extrapolate along the same line.
    :return: The weighted sum ``start * (1 - t) + end * t``.
    """
    # Weight each endpoint separately, then add the two contributions.
    start_weight = 1 - t
    start_contribution = start * start_weight
    end_contribution = end * t
    return start_contribution + end_contribution
0 commit comments