Commit cd3f3d5

Merge #9: fix data prep for non-PDF (image) corpus

Fix data preparation notebook errors when using image data instead of PDFs. Also resolve the iframe warning in notebook 1.

2 parents: f3ae99e + a706d0b

File tree

2 files changed: +99 -38 lines

- notebooks/1. Data Preparation.ipynb
- notebooks/util/preproc.py

notebooks/1. Data Preparation.ipynb

Lines changed: 6 additions & 6 deletions

@@ -82,7 +82,7 @@
 "\n",
 "# External Dependencies:\n",
 "import boto3  # AWS SDK for Python\n",
-"from IPython.display import HTML  # To display rich content in notebook\n",
+"from IPython import display  # To display rich content in notebook\n",
 "import pandas as pd  # For tabular data analysis\n",
 "import sagemaker  # High-level SDK for SageMaker\n",
 "from tqdm.notebook import tqdm  # Progress bars\n",
@@ -238,11 +238,11 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"HTML(\n",
-"    '<iframe src=\"{}\" width=100% height=600 type=\"application/pdf\"></iframe>'.format(\n",
-"        # Edit the below (e.g. 0, 1, 2) to see different documents:\n",
-"        \"data/raw/\" + rel_filepaths[0]\n",
-"    )\n",
+"display.IFrame(\n",
+"    # Edit the below (e.g. 0, 1, 2) to see different documents:\n",
+"    \"data/raw/\" + rel_filepaths[0],\n",
+"    height=\"600\",\n",
+"    width=\"100%\",\n",
 ")"
 ]
 },
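
For reference, here is the new display pattern as plain Python: a minimal sketch assuming a Jupyter environment, with a placeholder file path that is not from the commit:

    from IPython import display

    # IFrame takes the source path plus width/height and renders the document
    # as the cell output. This avoids hand-building raw <iframe> HTML, which
    # triggered the notebook warning this merge resolves.
    display.IFrame("data/raw/example.pdf", width="100%", height="600")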

notebooks/util/preproc.py

Lines changed: 93 additions & 32 deletions

@@ -12,7 +12,7 @@
 import re
 import time
 from types import SimpleNamespace
-from typing import Iterable, List, Optional, Union
+from typing import Iterable, List, Optional, Tuple, Union
 
 # External Dependencies:
 import boto3
@@ -274,6 +274,94 @@ def trp_page_has_content(page: trp.Page) -> bool:
     return len(page.lines) > 0
 
 
+def find_cleaned_page_imgs_by_textract_uri(
+    rel_filepath: str,
+    imgs_s3uri: str,
+) -> Tuple[List[str], List[Union[int, None]]]:
+    """Find cleaned page images (and their expected page numbers) on S3 for a doc in the corpus
+
+    This function essentially reconstructs logic applied by the image cleaning pre-processing
+    job to locate cleaned images in S3 for a given raw document in the corpus: including
+    multi-page PDFs and TIFFs, and single-page input images like JPEGs. Returned objects are
+    verified to actually exist in S3 at the time the function is called.
+
+    Parameters
+    ----------
+    rel_filepath : str
+        Relative path to a raw document or image in the corpus (i.e. within the data/raw folder)
+    imgs_s3uri : str
+        's3://...' root URI under which cleaned page images are stored, with filenames generated
+        from documents as per `clean_dataset_for_img_ocr()`
+
+    Returns
+    -------
+    img_candidate_s3keys : List[str]
+        List of S3 object keys which (have been tested to exist and) are expected to correspond
+        to cleaned page images of the input document. Not necessarily in page number order.
+    img_candidate_pagenums : List[Union[int, None]]
+        Inferred (1-based) page number for each entry in `img_candidate_s3keys`, or `None` if
+        the page number could not be inferred for that object.
+    """
+    # pdf2image outputs look like {MyOriginalFileBaseName}-0000-00.{FileExt}:
+    PDF2IMAGE_REGEX = re.compile(r"^-\d{4,}-\d+\.(?:png|jpg|jpeg)$", re.IGNORECASE)
+    NONPDF_REGEX = re.compile(r"^(-\d{4,})?\.(?:png|jpg|jpeg)$", re.IGNORECASE)
+
+    imgs_bucket_name, _, imgs_s3key_root = imgs_s3uri[len("s3://") :].partition("/")
+    imgs_bucket = s3.Bucket(imgs_bucket_name)
+
+    rel_filedir, _, filename = rel_filepath.rpartition("/")
+    filename_root, _, extension = filename.rpartition(".")
+    extension = extension.lower()
+    file_img_s3key_prefix = "".join(
+        (
+            imgs_s3key_root,
+            "/",
+            rel_filedir + "/" if rel_filedir else "",
+            filename_root,
+        )
+    )
+
+    raw_candidate_objs = imgs_bucket.objects.filter(Prefix=file_img_s3key_prefix)
+
+    if extension == "pdf":
+        # Use the pdf2image regex to find images and associate page numbers:
+        img_candidate_s3keys = list(
+            map(
+                lambda o: o.key,
+                filter(
+                    lambda o: PDF2IMAGE_REGEX.match(o.key[len(file_img_s3key_prefix) :]),
+                    raw_candidate_objs,
+                ),
+            )
+        )
+        img_candidate_pagenums = list(
+            map(
+                lambda f: int(f.rpartition(".")[0].rpartition("-")[2]),
+                img_candidate_s3keys,
+            )
+        )
+    else:
+        # Could be a single-page (e.g. PNG) or multi-page (e.g. TIFF) image:
+        raw_candidate_s3keys = [o.key for o in raw_candidate_objs]
+        regex_matches = [
+            NONPDF_REGEX.match(k[len(file_img_s3key_prefix) :])
+            for k in raw_candidate_s3keys
+        ]
+
+        img_candidate_s3keys = [
+            raw_candidate_s3keys[ix]
+            for ix in range(len(regex_matches))
+            if regex_matches[ix]
+        ]
+
+        if len(img_candidate_s3keys) == 1:
+            img_candidate_pagenums = [1]
+        else:
+            img_candidate_pagenums = [int(match.group(1)) for match in regex_matches if match]
+
+    return img_candidate_s3keys, img_candidate_pagenums
+
+
 def build_data_manifest(
     manifest_file: str,
     rel_doc_paths: Iterable[str],
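
For orientation, a hypothetical call to the new helper (the bucket, prefix, and file path are made-up placeholders; `s3` is the module-level boto3 S3 resource that the diff's `s3.Bucket(...)` calls imply):

    # Locate cleaned page images for one raw multi-page TIFF in the corpus:
    img_keys, page_nums = find_cleaned_page_imgs_by_textract_uri(
        "receipts/scan-001.tiff",  # Relative path under data/raw (placeholder)
        imgs_s3uri="s3://example-bucket/imgs-clean",  # Placeholder root URI
    )
    # Keys are not guaranteed to come back in page-number order, so sort them:
    for page, key in sorted(zip(page_nums, img_keys), key=lambda pair: pair[0] or 0):
        print(f"Page {page}: s3://example-bucket/{key}")

The remaining hunks switch build_data_manifest() over to this shared helper:
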
@@ -342,11 +430,7 @@ def build_data_manifest(
             f"`no_content` option must be 'omit', 'flag', or None. Got: {no_content}"
         )
 
-    # pdf2image outputs look like {MyOriginalFileBaseName}-0000-00.{FileExt}:
-    pdf2image_regex = re.compile(r"-\d{4,}-\d+.(?:png|jpg|jpeg)", re.IGNORECASE)
-
-    imgs_bucket_name, _, imgs_s3key_root = imgs_s3uri[len("s3://") :].partition("/")
-    imgs_bucket = s3.Bucket(imgs_bucket_name)
+    imgs_bucket_name = imgs_s3uri[len("s3://") :].partition("/")[0]
     textract_bucket_name, _, textract_s3key_root = textract_s3uri[len("s3://") :].partition("/")
     textract_bucket = s3.Bucket(textract_bucket_name)
 
@@ -367,32 +451,9 @@ def build_data_manifest(
         pages_have_content = [trp_page_has_content(p) for p in doc.pages]
 
         # List the matching page images in S3:
-        rel_filedir, _, filename = rel_filepath.rpartition("/")
-        filename_root = filename.rpartition(".")[0]
-        file_img_s3key_prefix = "".join(
-            (
-                imgs_s3key_root,
-                "/",
-                rel_filedir + "/" if rel_filedir else "",
-                filename_root,
-            )
-        )
-        img_candidate_s3keys = list(
-            map(
-                lambda o: o.key,
-                filter(
-                    lambda o: pdf2image_regex.match(o.key[len(file_img_s3key_prefix) :]),
-                    imgs_bucket.objects.filter(Prefix=file_img_s3key_prefix),
-                ),
-            )
-        )
-
-        # Validate that we have one image per page of the Textract doc:
-        img_candidate_pagenums = list(
-            map(
-                lambda f: int(f.rpartition(".")[0].rpartition("-")[2]),
-                img_candidate_s3keys,
-            )
+        img_candidate_s3keys, img_candidate_pagenums = find_cleaned_page_imgs_by_textract_uri(
+            rel_filepath,
+            imgs_s3uri=imgs_s3uri,
         )
         if img_candidate_pagenums != list(range(1, len(doc.pages) + 1)):
             if len(img_candidate_pagenums) == 0:
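
To make the filename conventions concrete, a standalone check of the two patterns against the suffix left once the shared S3 key prefix is stripped (the example suffixes are illustrative, not taken from the repository):

    import re

    # Same patterns as find_cleaned_page_imgs_by_textract_uri above:
    PDF2IMAGE_REGEX = re.compile(r"^-\d{4,}-\d+\.(?:png|jpg|jpeg)$", re.IGNORECASE)
    NONPDF_REGEX = re.compile(r"^(-\d{4,})?\.(?:png|jpg|jpeg)$", re.IGNORECASE)

    print(bool(PDF2IMAGE_REGEX.match("-0001-2.png")))  # True: page rendered from a PDF
    print(bool(NONPDF_REGEX.match(".png")))            # True: single-page image, no suffix
    print(bool(NONPDF_REGEX.match("-0002.png")))       # True: page from a multi-page TIFF
    print(bool(NONPDF_REGEX.match("-2.png")))          # False: page suffix needs 4+ digits

The PDF branch then reads the page number from the digits after the last "-", while the non-PDF branch falls back to page 1 whenever exactly one image matches.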
