Merge pull request #71 from dataiku/feature/text-extraction-pandoc

Text extraction with pandoc
dataiku · Jul 25, 2023 · 606ea31 · 606ea31
2 parents 9df4c1e + 20adced
commit 606ea31
Show file tree

Hide file tree

Showing 19 changed files with 245 additions and 60 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,10 @@
 # Changelog
 
+## [Version 2.1.0](https://github.com/dataiku/dss-plugin-tesseract-ocr/releases/tag/v2.1.0) - Feature release - 2023-06
+
+- Add a recipe to extract text content from various file types
+- Breaking change: the 'file' column of the output of the OCR recipe now contains the extension
+
 ## [Version 2.0.0](https://github.com/dataiku/dss-plugin-tesseract-ocr/releases/tag/v2.0.0) - Feature release - 2023-06
 
 - Add EasyOCR

diff --git a/README.md b/README.md
@@ -1,9 +1,10 @@
 # dss-plugin-tesseract-ocr
-Plugin for optical character recognition (OCR) in python using the tesseract engine
+Plugin for text extraction and optical character recognition (OCR) in Python.
 
-The plugin has four components (three recipes and a notebook template):
+The plugin has five components (four recipes and a notebook template):
+- OCR recipe: extract text from images using either the tesseract or the EasyOCR engine and output a dataset with a filename column and a text column. This recipe has parameters to recombine multi-page PDFs (for instance those that were split into multiple images in the Image Conversion recipe) into a single text row, to specify the language used in tesseract, and to remove special characters from the extracted text. It takes as input a folder of PDF/JPG/JPEG/PNG/TIFF files (ideally the output of the Image Conversion recipe or of the Image Processing recipe).
+- Text Extraction recipe: extract text content from various file types (using pypdfium2, docx or pandoc). It takes as input a folder of files and outputs a dataset with a filename column, a text column and an error column for when text failed to be extracted.
 - Image Conversion recipe: convert all types of images into jpg images and split multi-page PDF documents into multiple jpg images. It also converts images to grayscale. With advanced parameters, it is possible to set the image quality used by PIL and the DPI of images converted from PDFs.
-- Text Extraction recipe:: extract text from images using tesseract and output a dataset with a filename column and a text column. This recipe has parameters to recombine multi-page PDF (that were split into multiple images in the Image Conversion recipe) into a single text, to specify the language used in tesseract, and to remove special charaters from the extracted text. It takes as input a folder of JPG images (ideally the output of the Image Conversion recipe or of the Image Processing recipe).
 - Image Processing notebook: notebook to explore different types of image processing to improve (or not) text extraction from tesseract. Then, the functions that were tested in the notebook can be used in the Image Processing recipe.
 - Image Processing recipe: recipe to process images using functions defined by the user in the python editor area of the recipe parameter's form.
 
@@ -32,7 +33,7 @@ When you are done exploring different type of image processing functions, you ca
 
 ### tesseract
 
-Tesseract must be installed on DSS instance server:
+Tesseract must be installed on the DSS instance server in order to use the Tesseract OCR engine.
 
 #### Linux
 Tesseract is available directly from many Linux distributions.

diff --git a/code-env/python/spec/requirements.txt b/code-env/python/spec/requirements.txt
@@ -9,4 +9,6 @@ deskew==0.10.33
 torch==1.11.0; python_version >= '3.10'
 torch==1.9.1; python_version <= '3.9'
 easyocr==1.7.0
-packaging==21.3
+packaging==21.3
+python-docx==0.8.11
+pypandoc==1.11
diff --git a/custom-recipes/image-conversion/recipe.json b/custom-recipes/image-conversion/recipe.json
@@ -3,7 +3,7 @@
         "label": "Image conversion",
         "description": "Convert PDF/JPG/JPEG/PNG/TIFF files into greyscale JPG images. Split multipage PDF into multiple images as well.",
         "icon": "icon-picture",
-        "displayOrderRank": 2
+        "displayOrderRank": 3
     },
 
     "kind": "PYTHON",

diff --git a/custom-recipes/image-conversion/recipe.py b/custom-recipes/image-conversion/recipe.py
@@ -2,11 +2,12 @@
 from PIL import Image
 from io import BytesIO
 import logging
-from ocr_recipes_io_utils import get_input_output
-from ocr_utils import convert_image_to_greyscale_bytes
-from ocr_utils import image_conversion_parameters
-from ocr_utils import pdf_to_pil_images_iterator
-from ocr_constants import Constants
+from text_extraction_ocr_utils.recipes_io_utils import get_input_output
+from text_extraction_ocr_utils import convert_image_to_greyscale_bytes
+from text_extraction_ocr_utils import image_conversion_parameters
+from text_extraction_ocr_utils import pdf_to_pil_images_iterator
+from text_extraction_ocr_utils import Constants
+
 
 logger = logging.getLogger(__name__)
 
@@ -22,7 +23,7 @@
     prefix = sample_file.split('.')[0]
     suffix = sample_file.split('.')[-1]
 
-    if suffix in Constants.TYPES:
+    if suffix in Constants.OCR_TYPES:
         with input_folder.get_download_stream(sample_file) as stream:
             img_bytes = stream.read()
 
@@ -39,5 +40,5 @@
         logger.info("OCR - Converted {}/{} images".format(i+1, total_images))
 
     else:
-        logger.info("OCR - Rejecting {} because it is not a {} file.".format(sample_file, '/'.join(Constants.TYPES)))
+        logger.info("OCR - Rejecting {} because it is not a {} file.".format(sample_file, '/'.join(Constants.OCR_TYPES)))
         logger.info("OCR - Rejected {}/{} images".format(i+1, total_images))
diff --git a/custom-recipes/image-processing-custom/recipe.json b/custom-recipes/image-processing-custom/recipe.json
@@ -3,7 +3,7 @@
         "label": "Image processing",
         "description": "For advanced users. Use the notebook template to find good image processing functions for your images. All images in the input folder are processed using the image processing functions specified by the user in the recipe parameter's form. Inputs must be greyscale JPG images.",
         "icon": "icon-cogs",
-        "displayOrderRank": 3
+        "displayOrderRank": 4
     },
 
     "kind": "PYTHON",

diff --git a/custom-recipes/image-processing-custom/recipe.py b/custom-recipes/image-processing-custom/recipe.py
@@ -3,9 +3,9 @@
 from io import BytesIO
 import numpy as np
 import logging
-from ocr_recipes_io_utils import get_input_output
-from ocr_utils import image_processing_parameters
-from ocr_constants import Constants
+from text_extraction_ocr_utils.recipes_io_utils import get_input_output
+from text_extraction_ocr_utils import image_processing_parameters
+from text_extraction_ocr_utils import Constants
 
 logger = logging.getLogger(__name__)
 

diff --git a/custom-recipes/ocr-text-extraction-dataset/recipe.json b/custom-recipes/ocr-text-extraction-dataset/recipe.json
@@ -1,8 +1,8 @@
 {
     "meta": {
-        "label": "OCR - Text extraction",
+        "label": "Optical Character Recognition (OCR)",
         "description": "Extract text from PDF/JPG/JPEG/PNG/TIFF files into a dataset of filename and text.",
-        "icon": "icon-file-text-alt",
+        "icon": "icon-search",
         "displayOrderRank": 1
     },
 

diff --git a/custom-recipes/ocr-text-extraction-dataset/recipe.py b/custom-recipes/ocr-text-extraction-dataset/recipe.py
@@ -5,19 +5,21 @@
 from time import perf_counter
 
 from dataiku.customrecipe import get_recipe_config
-from ocr_constants import Constants
-from ocr_recipes_io_utils import get_input_output
-from ocr_utils import convert_image_to_greyscale_bytes
-from ocr_utils import pdf_to_pil_images_iterator
-from ocr_utils import text_extraction_parameters
-from tesseractocr.extract_text import text_extraction
+from text_extraction_ocr_utils import Constants
+from text_extraction_ocr_utils.recipes_io_utils import get_input_output
+from text_extraction_ocr_utils import convert_image_to_greyscale_bytes
+from text_extraction_ocr_utils import pdf_to_pil_images_iterator
+from text_extraction_ocr_utils import ocr_parameters
+from ocr import extract_text_ocr
+from ocr import get_multi_page_pdf_base_name
+from ocr import get_multi_page_pdf_page_nb
 
 
 logger = logging.getLogger(__name__)
 
 input_folder, output_dataset = get_input_output('folder', 'dataset')
 
-params = text_extraction_parameters(get_recipe_config())
+params = ocr_parameters(get_recipe_config())
 
 input_filenames = input_folder.list_paths_in_partition()
 total_files = len(input_filenames)
@@ -26,10 +28,10 @@
 
 for i, sample_file in enumerate(input_filenames):
     prefix, suffix = os.path.splitext(sample_file)
-    suffix = suffix[1:]  # removing the dot from the extension
+    suffix = suffix[1:].lower()  # removing the dot from the extension and accepting capital letters
 
-    if suffix not in Constants.TYPES:
-        logger.info("OCR - Rejecting {} because it is not a {} file.".format(sample_file, '/'.join(Constants.TYPES)))
+    if suffix not in Constants.OCR_TYPES:
+        logger.info("OCR - Rejecting {} because it is not a {} file.".format(sample_file, '/'.join(Constants.OCR_TYPES)))
         logger.info("OCR - Rejected {}/{} files".format(i+1, total_files))
         continue
 
@@ -41,22 +43,21 @@
     if suffix == "pdf":
         for j, img in enumerate(pdf_to_pil_images_iterator(img_bytes)):
             img_bytes = convert_image_to_greyscale_bytes(img)
-            img_text = text_extraction(img_bytes, params)
+            img_text = extract_text_ocr(img_bytes, params)
 
-            pdf_image_name = "{}{}{:05d}".format(prefix, Constants.PDF_MULTI_SUFFIX, j+1)
+            pdf_image_name = "{}{}{:05d}.jpg".format(prefix, Constants.PDF_MULTI_SUFFIX, j+1)
             rows.append({'file': pdf_image_name, 'text': img_text})
     else:
-        img_text = text_extraction(img_bytes, params)
-        rows.append({'file': prefix, 'text': img_text})
+        img_text = extract_text_ocr(img_bytes, params)
+        rows.append({'file': sample_file, 'text': img_text})
 
     logger.info("OCR - Extracted text from {}/{} files (in {:.2f} seconds)".format(i+1, total_files, perf_counter() - start))
 
 df = pd.DataFrame(rows)
 
 if params['recombine_pdf']:
-    pdf_multi_page_pattern = "^.*{}\d{{5}}$".format(Constants.PDF_MULTI_SUFFIX)
-    df['page_nb'] = df.apply(lambda row: int(row['file'].split(Constants.PDF_MULTI_SUFFIX)[1]) if re.match(pdf_multi_page_pattern, row['file']) else 1, axis=1)
-    df['file'] = df.apply(lambda row: row['file'].split(Constants.PDF_MULTI_SUFFIX)[0] if re.match(pdf_multi_page_pattern, row['file']) else row['file'], axis=1)
+    df['page_nb'] = df.apply(lambda row: get_multi_page_pdf_page_nb(row['file']), axis=1)
+    df['file'] = df.apply(lambda row: get_multi_page_pdf_base_name(row['file']), axis=1)
 
     df = df.sort_values(['file', 'page_nb'], ascending=True)
 

diff --git a/custom-recipes/text-extraction/recipe.json b/custom-recipes/text-extraction/recipe.json
@@ -0,0 +1,46 @@
+{
+    "meta": {
+        "label": "Text extraction",
+        "description": "Extract text content from various file types (PDF, Docx, HTML, etc) into a dataset with columns filename and text",
+        "icon": "icon-file-text-alt",
+        "displayOrderRank": 2
+    },
+
+    "kind": "PYTHON",
+    "selectableFromFolder": "input_folder",
+
+    "inputRoles" : [
+        {
+            "name": "input_folder",
+            "label": "Input folder",
+            "arity": "UNARY",
+            "required": true,
+            "description": "Folder of input files",
+            "acceptsManagedFolder": true,
+            "acceptsDataset": false
+        }
+    ],
+
+    "outputRoles" : [
+        {
+            "name": "output_dataset",
+            "label": "Output dataset",
+            "arity": "UNARY",
+            "required": true,
+            "description": "Dataset of extracted text",
+            "acceptsManagedFolder": false,
+            "acceptsDataset": true
+        }
+    ],
+    "params": [
+        {
+            "name": "description",
+            "label": "",
+            "type": "SEPARATOR",
+            "description": "The Text extraction recipe is ready to use 'out of the box' - no settings required."
+        }
+    ],
+
+    "resourceKeys": []
+
+}
diff --git a/custom-recipes/text-extraction/recipe.py b/custom-recipes/text-extraction/recipe.py
@@ -0,0 +1,48 @@
+import logging
+import os
+import pandas as pd
+from time import perf_counter
+
+from text_extraction_ocr_utils.recipes_io_utils import get_input_output
+from text_extraction import extract_text_content
+from text_extraction import download_pandoc_binaries
+
+
+# Recipe script: read every file of the input folder, extract its text content and
+# write one row per file ('file', 'text', 'error_message') to the output dataset.
+
+# call this method to download pandoc binaries
+# NOTE(review): the returned value is only forwarded to extract_text_content below;
+# presumably it tells whether pandoc is usable - confirm in text_extraction.
+with_pandoc = download_pandoc_binaries()
+
+logger = logging.getLogger(__name__)
+
+input_folder, output_dataset = get_input_output('folder', 'dataset')
+
+input_filenames = input_folder.list_paths_in_partition()
+total_files = len(input_filenames)
+
+rows = []
+
+for i, sample_file in enumerate(input_filenames):
+    # only the extension is needed here, the full path is kept in the 'file' column
+    _, suffix = os.path.splitext(sample_file)
+    suffix = suffix[1:].lower()  # removing the dot from the extension and accepting capital letters
+
+    start = perf_counter()
+
+    logger.info("Extracting text from file {} ...".format(sample_file))
+
+    with input_folder.get_download_stream(sample_file) as stream:
+        file_bytes = stream.read()
+
+    # a failure on one file must not abort the whole recipe: the error is reported in the output row instead
+    try:
+        extracted_text = extract_text_content(file_bytes, suffix, with_pandoc)
+
+        if not extracted_text.strip():
+            logger.warning("Extracted text is empty")
+
+        rows.append({'file': sample_file, 'text': extracted_text, 'error_message': ""})
+        logger.info("Extracted text from {}/{} files (in {:.2f} seconds)".format(i+1, total_files, perf_counter() - start))
+    except Exception as e:
+        # store the message as a string so that the 'error_message' column only contains text
+        # (the success branch stores an empty string, not an exception object)
+        rows.append({'file': sample_file, 'text': "", 'error_message': str(e)})
+        logger.warning("Failed extracting text from file {} because: {}".format(sample_file, e))
+
+df = pd.DataFrame(rows)
+
+output_dataset.write_with_schema(df)
diff --git a/plugin.json b/plugin.json
@@ -1,9 +1,9 @@
 {
     "id": "tesseract-ocr",
-    "version": "2.0.0",
+    "version": "2.1.0",
     "meta": {
-        "label": "OCR",
-        "description": "Extract text from images using OCR engines",
+        "label": "Text extraction and OCR",
+        "description": "Extract text content from files or use OCR engines",
         "author": "Dataiku",
         "icon": "icon-file-text-alt",
         "tags": [

diff --git a/python-lib/tesseractocr/extract_text.py → python-lib/ocr/__init__.py b/python-lib/tesseractocr/extract_text.py → python-lib/ocr/__init__.py
@@ -3,12 +3,13 @@
 import numpy as np
 import pytesseract
 import logging
-from ocr_constants import Constants
+import re
+from text_extraction_ocr_utils import Constants
 
 logger = logging.getLogger(__name__)
 
 
-def text_extraction(img_bytes, params):
+def extract_text_ocr(img_bytes, params):
     """
     extract text from bytes images using the selected OCR engine (with specified language)
     """
@@ -36,3 +37,17 @@ def text_extraction(img_bytes, params):
         raise NotImplementedError("OCR engine {} not implemented".format(params[Constants.OCR_ENGINE]))
 
     return img_text
+
+
+def get_multi_page_pdf_page_nb(file_name):
+    """
+    return the page number encoded in the name of an image coming from a multi-page PDF
+
+    A matching name has the form '<base><Constants.PDF_MULTI_SUFFIX><5 digits>.jpg';
+    any other file name yields 1.
+    """
+    # build the pattern from the shared constant so that it cannot drift from the names generated with it
+    pattern = r".*{}(\d{{5}})\.jpg".format(re.escape(Constants.PDF_MULTI_SUFFIX))
+    matched = re.fullmatch(pattern, file_name)
+    if matched is not None:
+        return int(matched.group(1))
+    return 1
+
+
+def get_multi_page_pdf_base_name(file_name):
+    """
+    return the name of the PDF file that an image of a multi-page PDF belongs to
+
+    A name of the form '<base><Constants.PDF_MULTI_SUFFIX><5 digits>.jpg' is mapped to '<base>.pdf';
+    any other file name is returned unchanged.
+    """
+    # build the pattern from the shared constant so that it cannot drift from the names generated with it
+    pattern = r"(.*){}\d{{5}}\.jpg".format(re.escape(Constants.PDF_MULTI_SUFFIX))
+    matched = re.fullmatch(pattern, file_name)
+    if matched is not None:
+        return "{}.pdf".format(matched.group(1))
+    return file_name
diff --git a/python-lib/ocr_constants.py b/python-lib/ocr_constants.py