import re
import time
from types import SimpleNamespace
-from typing import Iterable, List, Optional, Union
+from typing import Iterable, List, Optional, Tuple, Union

# External Dependencies:
import boto3
@@ -274,6 +274,94 @@ def trp_page_has_content(page: trp.Page) -> bool:
    return len(page.lines) > 0


+def find_cleaned_page_imgs_by_textract_uri(
+    rel_filepath: str,
+    imgs_s3uri: str,
+) -> Tuple[List[str], List[Union[int, None]]]:
+    """Find cleaned page images (and their expected page numbers) on S3 for a doc in the corpus
+
+    This function essentially reconstructs logic applied by the image cleaning pre-processing job
+    to locate cleaned images in S3 for a given raw document in the corpus: including multi-page
+    PDFs or TIFFs, and single-page input images like JPEGs. Returned objects are verified to
+    actually exist in S3 at the time the function is called.
+
+    Parameters
+    ----------
+    rel_filepath : str
+        Relative path to a raw document or image in the corpus (i.e. within the data/raw folder)
+    imgs_s3uri : str
+        's3://...' root URI under which cleaned page images are stored, with filenames generated
+        from documents as per `clean_dataset_for_img_ocr()`
+
+    Returns
+    -------
+    img_candidate_s3keys : List[str]
+        List of S3 object keys which (have been tested to exist and) are expected to correspond
+        to cleaned page images of the input document. Not necessarily in page number order.
+    img_candidate_pagenums : List[Union[int, None]]
+        Inferred (1-based) page number for each entry in `img_candidate_s3keys`, or `None` if the
+        page number could not be inferred for that object.
+    """
+    # pdf2image outputs look like {MyOriginalFileBaseName}-0000-00.{FileExt}:
+    PDF2IMAGE_REGEX = re.compile(r"^-\d{4,}-\d+\.(?:png|jpg|jpeg)$", re.IGNORECASE)
+    NONPDF_REGEX = re.compile(r"^(-\d{4,})?\.(?:png|jpg|jpeg)$", re.IGNORECASE)
+
+    imgs_bucket_name, _, imgs_s3key_root = imgs_s3uri[len("s3://"):].partition("/")
+    imgs_bucket = s3.Bucket(imgs_bucket_name)
+
+    rel_filedir, _, filename = rel_filepath.rpartition("/")
+    filename_root, _, extension = filename.rpartition(".")
+    extension = extension.lower()
+    file_img_s3key_prefix = "".join(
+        (
+            imgs_s3key_root,
+            "/",
+            rel_filedir + "/" if rel_filedir else "",
+            filename_root,
+        )
+    )
+
+    raw_candidate_objs = imgs_bucket.objects.filter(Prefix=file_img_s3key_prefix)
+
+    if extension == "pdf":
+        # Use the pdf2image regex to find images and associate page numbers:
+        img_candidate_s3keys = list(
+            map(
+                lambda o: o.key,
+                filter(
+                    lambda o: PDF2IMAGE_REGEX.match(o.key[len(file_img_s3key_prefix):]),
+                    raw_candidate_objs,
+                ),
+            )
+        )
+        img_candidate_pagenums = list(
+            map(
+                lambda f: int(f.rpartition(".")[0].rpartition("-")[2]),
+                img_candidate_s3keys,
+            )
+        )
+    else:
+        # Could be a single-page (e.g. PNG) or multi-page (e.g. TIFF) image:
+        raw_candidate_s3keys = [o.key for o in raw_candidate_objs]
+        regex_matches = [
+            NONPDF_REGEX.match(k[len(file_img_s3key_prefix):])
+            for k in raw_candidate_s3keys
+        ]
+
+        img_candidate_s3keys = [
+            raw_candidate_s3keys[ix]
+            for ix in range(len(regex_matches))
+            if regex_matches[ix]
+        ]
+
+        if len(img_candidate_s3keys) == 1:
+            img_candidate_pagenums = [1]
+        else:
+            # group(1) includes the leading "-" and may be None for bare extensions:
+            img_candidate_pagenums = [
+                int(match.group(1)[1:]) if match.group(1) else None
+                for match in regex_matches
+                if match
+            ]
+
+    return img_candidate_s3keys, img_candidate_pagenums
+
+
def build_data_manifest(
    manifest_file: str,
    rel_doc_paths: Iterable[str],
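
For illustration, a minimal sketch of how the new helper might be called and how a caller could recover page order from its outputs. The bucket name and relative path are placeholders, not values from this changeset:

    # Hypothetical inputs; assumes the module-level boto3 `s3` resource is configured:
    img_keys, page_nums = find_cleaned_page_imgs_by_textract_uri(
        "reports/example-doc.pdf",
        imgs_s3uri="s3://doc-example-bucket/data/imgs-clean",
    )
    # Keys are not necessarily returned in page order, so sort by the inferred page
    # number, dropping any entries whose number could not be inferred:
    ordered_keys = [
        key
        for num, key in sorted(
            (num, key) for num, key in zip(page_nums, img_keys) if num is not None
        )
    ]
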
@@ -342,11 +430,7 @@ def build_data_manifest(
            f"`no_content` option must be 'omit', 'flag', or None. Got: {no_content}"
        )

-    # pdf2image outputs look like {MyOriginalFileBaseName}-0000-00.{FileExt}:
-    pdf2image_regex = re.compile(r"-\d{4,}-\d+.(?:png|jpg|jpeg)", re.IGNORECASE)
-
-    imgs_bucket_name, _, imgs_s3key_root = imgs_s3uri[len("s3://"):].partition("/")
-    imgs_bucket = s3.Bucket(imgs_bucket_name)
+    imgs_bucket_name = imgs_s3uri[len("s3://"):].partition("/")[0]

    textract_bucket_name, _, textract_s3key_root = textract_s3uri[len("s3://"):].partition("/")
    textract_bucket = s3.Bucket(textract_bucket_name)

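
As a quick trace of the bucket-name idiom used above, on a placeholder URI:

    >>> "s3://doc-example-bucket/data/imgs-clean"[len("s3://"):].partition("/")[0]
    'doc-example-bucket'
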
@@ -367,32 +451,9 @@ def build_data_manifest(
        pages_have_content = [trp_page_has_content(p) for p in doc.pages]

        # List the matching page images in S3:
-        rel_filedir, _, filename = rel_filepath.rpartition("/")
-        filename_root = filename.rpartition(".")[0]
-        file_img_s3key_prefix = "".join(
-            (
-                imgs_s3key_root,
-                "/",
-                rel_filedir + "/" if rel_filedir else "",
-                filename_root,
-            )
-        )
-        img_candidate_s3keys = list(
-            map(
-                lambda o: o.key,
-                filter(
-                    lambda o: pdf2image_regex.match(o.key[len(file_img_s3key_prefix):]),
-                    imgs_bucket.objects.filter(Prefix=file_img_s3key_prefix),
-                ),
-            )
-        )
-
-        # Validate that we have one image per page of the Textract doc:
-        img_candidate_pagenums = list(
-            map(
-                lambda f: int(f.rpartition(".")[0].rpartition("-")[2]),
-                img_candidate_s3keys,
-            )
+        img_candidate_s3keys, img_candidate_pagenums = find_cleaned_page_imgs_by_textract_uri(
+            rel_filepath,
+            imgs_s3uri=imgs_s3uri,
        )
        if img_candidate_pagenums != list(range(1, len(doc.pages) + 1)):
            if len(img_candidate_pagenums) == 0:
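
Finally, the filename-suffix conventions the two new patterns encode can be checked in isolation. A short sketch with illustrative suffixes (i.e. what remains of an object key after stripping `file_img_s3key_prefix`):

    import re

    PDF2IMAGE_REGEX = re.compile(r"^-\d{4,}-\d+\.(?:png|jpg|jpeg)$", re.IGNORECASE)
    NONPDF_REGEX = re.compile(r"^(-\d{4,})?\.(?:png|jpg|jpeg)$", re.IGNORECASE)

    assert PDF2IMAGE_REGEX.match("-0001-2.png")   # rendered PDF page 2
    assert NONPDF_REGEX.match(".png")             # single-page image (page 1 implied)
    assert NONPDF_REGEX.match("-0001.jpg")        # multi-frame image, e.g. TIFF
    assert not NONPDF_REGEX.match("-0001-2.png")  # PDF-style names only match PDF2IMAGE_REGEX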