Merge pull request #70 from dataiku/feature/add-easyocr

add easyocr and accept pdf
dataiku · Jun 29, 2023 · 9df4c1e · 9df4c1e
2 parents f046670 + ee2fd15
commit 9df4c1e
Show file tree

Hide file tree

Showing 22 changed files with 340 additions and 134 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,13 @@
 # Changelog
 
+## [Version 2.0.0](https://github.com/dataiku/dss-plugin-tesseract-ocr/releases/tag/v2.0.0) - Feature release - 2023-06
+
+- Add EasyOCR
+- Add an option for a default OCR engine that falls back to EasyOCR if Tesseract is not installed on the system
+- Support of PDFs in the text extraction recipe
+- Remove "Tesseract" from the plugin name
+- Use pypdfium2 instead of pdf2image to avoid depending on any system packages
+
 ## [Version 1.0.3](https://github.com/dataiku/dss-plugin-tesseract-ocr/releases/tag/v1.0.3) - Update release - 2023-04
 
 - Update code env description to support python versions 3.8, 3.9, 3.10 and 3.11

diff --git a/Jenkinsfile b/Jenkinsfile
@@ -0,0 +1,54 @@
+pipeline {
+   options {
+        disableConcurrentBuilds()
+   }
+   agent { label 'dss-plugin-tests'}
+   environment {
+        PLUGIN_INTEGRATION_TEST_INSTANCE="$HOME/instance_config.json"
+        UNIT_TEST_FILES_STATUS_CODE = sh(script: 'ls ./tests/*/unit/test*', returnStatus: true)
+        INTEGRATION_TEST_FILES_STATUS_CODE = sh(script: 'ls ./tests/*/integration/test*', returnStatus: true)
+   }
+   stages {
+      stage('Run Unit Tests') {
+         when { environment name: 'UNIT_TEST_FILES_STATUS_CODE', value: "0"}
+         steps {
+            sh 'echo "Running unit tests"'
+            catchError(stageResult: 'FAILURE') {
+            sh """
+               make unit-tests
+               """
+            }
+            sh 'echo "Done with unit tests"'
+         }
+      }
+      stage('Run Integration Tests') {
+         when { environment name: 'INTEGRATION_TEST_FILES_STATUS_CODE', value: "0"}
+         steps {
+            sh 'echo "Running integration tests"'
+            catchError(stageResult: 'FAILURE') {
+            sh """
+               make integration-tests
+               """
+            }
+            sh 'echo "Done with integration tests"'
+         }
+      }
+   }
+   post {
+     always {
+        script {
+           allure([
+                    includeProperties: false,
+                    jdk: '',
+                    properties: [],
+                    reportBuildPolicy: 'ALWAYS',
+                    results: [[path: 'tests/allure_report']]
+            ])
+
+            def status = currentBuild.currentResult
+            sh "file_name=\$(echo ${env.JOB_NAME} | tr '/' '-').status; touch \$file_name; echo \"${env.BUILD_URL};${env.CHANGE_TITLE};${env.CHANGE_AUTHOR};${env.CHANGE_URL};${env.BRANCH_NAME};${status};\" >> $HOME/daily-statuses/\$file_name"
+            cleanWs()
+        }
+     }
+   }
+}
diff --git a/Makefile b/Makefile
@@ -18,13 +18,18 @@ plugin:
 	@echo "[SUCCESS] Archiving plugin to dist/ folder: Done!"
 
 unit-tests:
-	@echo "[START] Running unit tests..."
-	@echo "[SUCCESS] Running unit tests: Done!"
+	@echo "No unit tests"
 
 integration-tests:
-	@echo "[START] Running integration tests..."
-	# TODO add integration tests
-	@echo "[SUCCESS] Running integration tests: Done!"
+	@echo "Running integration tests..."
+	@( \
+		rm -rf ./env/; \
+		python3 -m venv env/; \
+		source env/bin/activate; \
+		pip3 install --upgrade pip;\
+		pip install --no-cache-dir -r tests/python/integration/requirements.txt; \
+		pytest tests/python/integration --alluredir=tests/allure_report || ret=$$?; exit $$ret \
+	)
 
 tests: unit-tests integration-tests
 

diff --git a/README.md b/README.md
@@ -47,19 +47,6 @@ Using macports: `sudo port install tesseract`
 
 For more informations, go to: <https://github.com/tesseract-ocr/tessdoc/blob/master/Home.md>.
 
-### pdf2image
-
-To be able to use the python package pdf2image:
-
-#### Linux
-Most distros ship with pdftoppm and pdftocairo. If they are not installed, refer to your package manager to install poppler-utils
-
-For more informations, go to: <https://github.com/Belval/pdf2image>.
-
-#### Mac
-For macOS using brew: `brew install poppler`.
-Mac users will have to install poppler for Mac (<http://macappstore.org/poppler/>).
-
 ### Specific languages
 
 If you want to specify languages in tesseract, you must install them on the machine with your DSS instance, you can find instructions on how to install them and the code for each language here <https://tesseract-ocr.github.io/tessdoc/Data-Files>.

diff --git a/code-env/python/desc.json b/code-env/python/desc.json
@@ -4,8 +4,7 @@
     "PYTHON37",
     "PYTHON38",
     "PYTHON39",
-    "PYTHON310",
-    "PYTHON311"
+    "PYTHON310"
   ],
   "corePackagesSet": "AUTO",
   "forceConda": false,

diff --git a/code-env/python/spec/requirements.txt b/code-env/python/spec/requirements.txt
@@ -1,8 +1,12 @@
-pdf2image==1.14.0
+pypdfium2==4.17.0
 pytesseract==0.3.7
-Pillow==8.1.0
+Pillow==8.2.0
 matplotlib==3.3.4; python_version <= '3.9'
 matplotlib==3.7.1; python_version >= '3.10'
 opencv-python==4.5.1.48; python_version <= '3.9'
 opencv-python==4.7.0.72; python_version >= '3.10'
-deskew==0.10.3
+deskew==0.10.33
+torch==1.11.0; python_version >= '3.10'
+torch==1.9.1; python_version <= '3.9'
+easyocr==1.7.0
+packaging==21.3
diff --git a/custom-recipes/image-conversion/recipe.json b/custom-recipes/image-conversion/recipe.json
@@ -1,8 +1,9 @@
 {
     "meta": {
-        "label": "OCR - Image conversion",
+        "label": "Image conversion",
         "description": "Convert PDF/JPG/JPEG/PNG/TIFF files into greyscale JPG images. Split multipage PDF into multiple images as well.",
-        "icon": "icon-picture"
+        "icon": "icon-picture",
+        "displayOrderRank": 2
     },
 
     "kind": "PYTHON",

diff --git a/custom-recipes/image-conversion/recipe.py b/custom-recipes/image-conversion/recipe.py
@@ -1,10 +1,12 @@
 from dataiku.customrecipe import get_recipe_config
-from pdf2image import convert_from_bytes
 from PIL import Image
 from io import BytesIO
 import logging
-from utils import get_input_output, convert_image_to_greyscale_bytes, image_conversion_parameters
-from constants import Constants
+from ocr_recipes_io_utils import get_input_output
+from ocr_utils import convert_image_to_greyscale_bytes
+from ocr_utils import image_conversion_parameters
+from ocr_utils import pdf_to_pil_images_iterator
+from ocr_constants import Constants
 
 logger = logging.getLogger(__name__)
 
@@ -25,9 +27,7 @@
             img_bytes = stream.read()
 
         if suffix == "pdf":
-            pdf_images = convert_from_bytes(img_bytes, fmt='jpg', dpi=params[Constants.DPI])
-
-            for j, img in enumerate(pdf_images):
+            for j, img in enumerate(pdf_to_pil_images_iterator(img_bytes)):
                 img_bytes = convert_image_to_greyscale_bytes(img, quality=params[Constants.QUALITY])
                 output_folder.upload_data("{0}/{0}{1}{2:05d}.jpg".format(prefix, Constants.PDF_MULTI_SUFFIX, j+1), img_bytes)
 

diff --git a/custom-recipes/image-processing-custom/recipe.json b/custom-recipes/image-processing-custom/recipe.json
@@ -1,8 +1,9 @@
 {
     "meta": {
-        "label": "OCR - Image processing",
+        "label": "Image processing",
         "description": "For advanced users. Use the notebook template to find good image processing functions for your images. All images in the input folder are processed using the image processing functions specified by the user in the recipe parameter's form. Inputs must be greyscale JPG images.",
-        "icon": "icon-cogs"
+        "icon": "icon-cogs",
+        "displayOrderRank": 3
     },
 
     "kind": "PYTHON",

diff --git a/custom-recipes/image-processing-custom/recipe.py b/custom-recipes/image-processing-custom/recipe.py
@@ -3,8 +3,9 @@
 from io import BytesIO
 import numpy as np
 import logging
-from utils import get_input_output, image_processing_parameters
-from constants import Constants
+from ocr_recipes_io_utils import get_input_output
+from ocr_utils import image_processing_parameters
+from ocr_constants import Constants
 
 logger = logging.getLogger(__name__)
 

diff --git a/custom-recipes/ocr-text-extraction-dataset/recipe.json b/custom-recipes/ocr-text-extraction-dataset/recipe.json
@@ -1,8 +1,9 @@
 {
     "meta": {
         "label": "OCR - Text extraction",
-        "description": "Extract text from JPG images into a dataset of filename and text.",
-        "icon": "icon-file-text-alt"
+        "description": "Extract text from PDF/JPG/JPEG/PNG/TIFF files into a dataset of filename and text.",
+        "icon": "icon-file-text-alt",
+        "displayOrderRank": 1
     },
 
     "kind": "PYTHON",
@@ -31,25 +32,42 @@
             "acceptsDataset": true
         }
     ],
-
+    "paramsPythonSetup": "select_ocr_engine.py",
     "params": [
         {
             "name": "recombine_pdf",
             "label" : "Recombine multiple-page PDF together",
             "type": "BOOLEAN",
-            "description": "Text of images that are from the same original multiple-page PDF (images with name pattern <PDF_NAME>_pdf_page_XXXXX.jpg) are concatenated."
+            "description": "Multiple-page PDFs and images with name pattern $FILENAME_pdf_page_XXXXX.jpg are extracted into a single row."
+        },
+        {
+            "name": "ocr_engine",
+            "label": "OCR Engine",
+            "type": "SELECT",
+            "mandatory": true,
+            "description": "",
+            "defaultValue": "default",
+            "getChoicesFromPython": true
         },
         {
             "name": "advanced_parameters",
             "label" : "Advanced preprocessing parameters",
-            "type": "BOOLEAN"
+            "type": "BOOLEAN",
+            "visibilityCondition" : "model.ocr_engine != 'default'"
         },
         {
             "name": "language",
             "label": "Specify language",
             "type": "STRING",
-            "description": "Enter language code found at https://tesseract-ocr.github.io/tessdoc/Data-Files. Languages must be installed beforehand.",
-            "visibilityCondition" : "model.advanced_parameters"
+            "description": "Enter language code found at https://tesseract-ocr.github.io/tessdoc/Data-Files. Languages must be installed beforehand",
+            "visibilityCondition" : "model.advanced_parameters && model.ocr_engine == 'tesseract'"
+        },
+        {
+            "name": "language_easyocr",
+            "label": "Specify language",
+            "type": "STRING",
+            "description": "Enter language code found at https://www.jaided.ai/easyocr/.",
+            "visibilityCondition" : "model.advanced_parameters && model.ocr_engine == 'easyocr'"
         }
     ],
 

diff --git a/custom-recipes/ocr-text-extraction-dataset/recipe.py b/custom-recipes/ocr-text-extraction-dataset/recipe.py
@@ -1,9 +1,17 @@
-from dataiku.customrecipe import get_recipe_config
 import logging
-from utils import get_input_output, text_extraction_parameters
-from tesseractocr.extract_text import text_extraction
+import os
 import pandas as pd
-from constants import Constants
+import re
+from time import perf_counter
+
+from dataiku.customrecipe import get_recipe_config
+from ocr_constants import Constants
+from ocr_recipes_io_utils import get_input_output
+from ocr_utils import convert_image_to_greyscale_bytes
+from ocr_utils import pdf_to_pil_images_iterator
+from ocr_utils import text_extraction_parameters
+from tesseractocr.extract_text import text_extraction
+
 
 logger = logging.getLogger(__name__)
 
@@ -12,27 +20,43 @@
 params = text_extraction_parameters(get_recipe_config())
 
 input_filenames = input_folder.list_paths_in_partition()
-total_images = len(input_filenames)
+total_files = len(input_filenames)
 
-df = pd.DataFrame()
+rows = []
 
 for i, sample_file in enumerate(input_filenames):
-    if sample_file.split('.')[-1] != "jpg":
-        logger.info("OCR - Rejecting {} because it is not a JPG file.".format(sample_file))
-        logger.info("OCR - Rejected {}/{} images".format(i+1, total_images))
+    prefix, suffix = os.path.splitext(sample_file)
+    suffix = suffix[1:]  # removing the dot from the extension
+
+    if suffix not in Constants.TYPES:
+        logger.info("OCR - Rejecting {} because it is not a {} file.".format(sample_file, '/'.join(Constants.TYPES)))
+        logger.info("OCR - Rejected {}/{} files".format(i+1, total_files))
         continue
 
     with input_folder.get_download_stream(sample_file) as stream:
         img_bytes = stream.read()
 
-    img_text = text_extraction(img_bytes, params)
-    logger.info("OCR - Extracted text from {}/{} images".format(i+1, total_images))
+    start = perf_counter()
+
+    if suffix == "pdf":
+        for j, img in enumerate(pdf_to_pil_images_iterator(img_bytes)):
+            img_bytes = convert_image_to_greyscale_bytes(img)
+            img_text = text_extraction(img_bytes, params)
+
+            pdf_image_name = "{}{}{:05d}".format(prefix, Constants.PDF_MULTI_SUFFIX, j+1)
+            rows.append({'file': pdf_image_name, 'text': img_text})
+    else:
+        img_text = text_extraction(img_bytes, params)
+        rows.append({'file': prefix, 'text': img_text})
+
+    logger.info("OCR - Extracted text from {}/{} files (in {:.2f} seconds)".format(i+1, total_files, perf_counter() - start))
 
-    df = df.append({'file': sample_file.split('/')[-1].split('.')[0], 'text': img_text}, ignore_index=True)
+df = pd.DataFrame(rows)
 
 if params['recombine_pdf']:
-    df['page_nb'] = df.apply(lambda row: int(row['file'].split(Constants.PDF_MULTI_SUFFIX)[1]) if Constants.PDF_MULTI_SUFFIX in row['file'] else 1, axis=1)
-    df['file'] = df.apply(lambda row: row['file'].split(Constants.PDF_MULTI_SUFFIX)[0] if Constants.PDF_MULTI_SUFFIX in row['file'] else row['file'], axis=1)
+    pdf_multi_page_pattern = "^.*{}\d{{5}}$".format(Constants.PDF_MULTI_SUFFIX)
+    df['page_nb'] = df.apply(lambda row: int(row['file'].split(Constants.PDF_MULTI_SUFFIX)[1]) if re.match(pdf_multi_page_pattern, row['file']) else 1, axis=1)
+    df['file'] = df.apply(lambda row: row['file'].split(Constants.PDF_MULTI_SUFFIX)[0] if re.match(pdf_multi_page_pattern, row['file']) else row['file'], axis=1)
 
     df = df.sort_values(['file', 'page_nb'], ascending=True)
 

diff --git a/plugin.json b/plugin.json
@@ -1,10 +1,10 @@
 {
     "id": "tesseract-ocr",
-    "version": "1.0.3",
+    "version": "2.0.0",
     "meta": {
-        "label": "Tesseract OCR",
-        "description": "Extract text from images using the Tesseract Optical Character Recognition (OCR) engine",
-        "author": "Dataiku (Stanislas GUINEL)",
+        "label": "OCR",
+        "description": "Extract text from images using OCR engines",
+        "author": "Dataiku",
         "icon": "icon-file-text-alt",
         "tags": [
             "NLP",

diff --git a/python-lib/constants.py → python-lib/ocr_constants.py b/python-lib/constants.py → python-lib/ocr_constants.py
@@ -6,5 +6,10 @@ class Constants:
     DPI = "dpi"
     QUALITY = "quality"
     RECOMBINE_PDF = "recombine_pdf"
-    LANGUAGE = "language"
-    DEFAULT_LANGUAGE = "eng"
+    LANGUAGE_TESSERACT = "language"
+    LANGUAGE_EASYOCR = "language_easyocr"
+    OCR_ENGINE = "ocr_engine"
+    DEFAULT_ENGINE = "default"
+    TESSERACT = "tesseract"
+    EASYOCR = "easyocr"
+    EASYOCR_READER = "easyocr_reader"
diff --git a/python-lib/ocr_recipes_io_utils.py b/python-lib/ocr_recipes_io_utils.py
@@ -0,0 +1,21 @@
+import dataiku
+from dataiku.customrecipe import get_input_names_for_role
+from dataiku.customrecipe import get_output_names_for_role
+
+
+def get_input_output(input_type='dataset', output_type='dataset'):
+    if input_type == 'folder':
+        input_names = get_input_names_for_role('input_folder')[0]
+        input_obj = dataiku.Folder(input_names)
+    else:
+        input_names = get_input_names_for_role('input_dataset')[0]
+        input_obj = dataiku.Dataset(input_names)
+
+    if output_type == 'folder':
+        output_names = get_output_names_for_role('output_folder')[0]
+        output_obj = dataiku.Folder(output_names)
+    else:
+        output_names = get_output_names_for_role('output_dataset')[0]
+        output_obj = dataiku.Dataset(output_names)
+
+    return input_obj, output_obj