diff --git a/CHANGELOG.md b/CHANGELOG.md index 51b0d82..5f79351 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,13 @@ # Changelog +## [Version 2.0.0](https://github.com/dataiku/dss-plugin-tesseract-ocr/releases/tag/v2.0.0) - Feature release - 2023-06 + +- Add EasyOCR +- Add an option for a default OCR engine that fallbacks to EasyOCR if tesseract is not installed on the system +- Support of PDFs in the text extraction recipe +- Remove "Tesseract" from the plugin name +- Use pypdfium2 instead of pdf2images to not depend on any system packages + ## [Version 1.0.3](https://github.com/dataiku/dss-plugin-tesseract-ocr/releases/tag/v1.0.3) - Update release - 2023-04 - Update code env description to support python versions 3.8, 3.9, 3.10 and 3.11 diff --git a/Jenkinsfile b/Jenkinsfile new file mode 100644 index 0000000..325695d --- /dev/null +++ b/Jenkinsfile @@ -0,0 +1,54 @@ +pipeline { + options { + disableConcurrentBuilds() + } + agent { label 'dss-plugin-tests'} + environment { + PLUGIN_INTEGRATION_TEST_INSTANCE="$HOME/instance_config.json" + UNIT_TEST_FILES_STATUS_CODE = sh(script: 'ls ./tests/*/unit/test*', returnStatus: true) + INTEGRATION_TEST_FILES_STATUS_CODE = sh(script: 'ls ./tests/*/integration/test*', returnStatus: true) + } + stages { + stage('Run Unit Tests') { + when { environment name: 'UNIT_TEST_FILES_STATUS_CODE', value: "0"} + steps { + sh 'echo "Running unit tests"' + catchError(stageResult: 'FAILURE') { + sh """ + make unit-tests + """ + } + sh 'echo "Done with unit tests"' + } + } + stage('Run Integration Tests') { + when { environment name: 'INTEGRATION_TEST_FILES_STATUS_CODE', value: "0"} + steps { + sh 'echo "Running integration tests"' + catchError(stageResult: 'FAILURE') { + sh """ + make integration-tests + """ + } + sh 'echo "Done with integration tests"' + } + } + } + post { + always { + script { + allure([ + includeProperties: false, + jdk: '', + properties: [], + reportBuildPolicy: 'ALWAYS', + results: [[path: 'tests/allure_report']] + ]) + + def status = currentBuild.currentResult + sh "file_name=\$(echo ${env.JOB_NAME} | tr '/' '-').status; touch \$file_name; echo \"${env.BUILD_URL};${env.CHANGE_TITLE};${env.CHANGE_AUTHOR};${env.CHANGE_URL};${env.BRANCH_NAME};${status};\" >> $HOME/daily-statuses/\$file_name" + cleanWs() + } + } + } +} \ No newline at end of file diff --git a/Makefile b/Makefile index fed3978..1cc23c0 100644 --- a/Makefile +++ b/Makefile @@ -18,13 +18,18 @@ plugin: @echo "[SUCCESS] Archiving plugin to dist/ folder: Done!" unit-tests: - @echo "[START] Running unit tests..." - @echo "[SUCCESS] Running unit tests: Done!" + @echo "No unit tests" integration-tests: - @echo "[START] Running integration tests..." - # TODO add integration tests - @echo "[SUCCESS] Running integration tests: Done!" + @echo "Running integration tests..." + @( \ + rm -rf ./env/; \ + python3 -m venv env/; \ + source env/bin/activate; \ + pip3 install --upgrade pip;\ + pip install --no-cache-dir -r tests/python/integration/requirements.txt; \ + pytest tests/python/integration --alluredir=tests/allure_report || ret=$$?; exit $$ret \ + ) tests: unit-tests integration-tests diff --git a/README.md b/README.md index d9d9da5..c8c142f 100644 --- a/README.md +++ b/README.md @@ -47,19 +47,6 @@ Using macports: `sudo port install tesseract` For more informations, go to: . -### pdf2image - -To be able to use the python package pdf2image: - -#### Linux -Most distros ship with pdftoppm and pdftocairo. 
If they are not installed, refer to your package manager to install poppler-utils - -For more informations, go to: . - -#### Mac -For macOS using brew: `brew install poppler`. -Mac users will have to install poppler for Mac (). - ### Specific languages If you want to specify languages in tesseract, you must install them on the machine with your DSS instance, you can find instructions on how to install them and the code for each language here . diff --git a/code-env/python/desc.json b/code-env/python/desc.json index 33fb722..100f24e 100644 --- a/code-env/python/desc.json +++ b/code-env/python/desc.json @@ -4,8 +4,7 @@ "PYTHON37", "PYTHON38", "PYTHON39", - "PYTHON310", - "PYTHON311" + "PYTHON310" ], "corePackagesSet": "AUTO", "forceConda": false, diff --git a/code-env/python/spec/requirements.txt b/code-env/python/spec/requirements.txt index 7fbc340..80bca16 100644 --- a/code-env/python/spec/requirements.txt +++ b/code-env/python/spec/requirements.txt @@ -1,8 +1,12 @@ -pdf2image==1.14.0 +pypdfium2==4.17.0 pytesseract==0.3.7 -Pillow==8.1.0 +Pillow==8.2.0 matplotlib==3.3.4; python_version <= '3.9' matplotlib==3.7.1; python_version >= '3.10' opencv-python==4.5.1.48; python_version <= '3.9' opencv-python==4.7.0.72; python_version >= '3.10' -deskew==0.10.3 +deskew==0.10.33 +torch==1.11.0; python_version >= '3.10' +torch==1.9.1; python_version <= '3.9' +easyocr==1.7.0 +packaging==21.3 \ No newline at end of file diff --git a/custom-recipes/image-conversion/recipe.json b/custom-recipes/image-conversion/recipe.json index ad5cb69..461c078 100644 --- a/custom-recipes/image-conversion/recipe.json +++ b/custom-recipes/image-conversion/recipe.json @@ -1,8 +1,9 @@ { "meta": { - "label": "OCR - Image conversion", + "label": "Image conversion", "description": "Convert PDF/JPG/JPEG/PNG/TIFF files into greyscale JPG images. 
Split multipage PDF into multiple images as well.", - "icon": "icon-picture" + "icon": "icon-picture", + "displayOrderRank": 2 }, "kind": "PYTHON", diff --git a/custom-recipes/image-conversion/recipe.py b/custom-recipes/image-conversion/recipe.py index ab1cb7f..35a96ad 100644 --- a/custom-recipes/image-conversion/recipe.py +++ b/custom-recipes/image-conversion/recipe.py @@ -1,10 +1,12 @@ from dataiku.customrecipe import get_recipe_config -from pdf2image import convert_from_bytes from PIL import Image from io import BytesIO import logging -from utils import get_input_output, convert_image_to_greyscale_bytes, image_conversion_parameters -from constants import Constants +from ocr_recipes_io_utils import get_input_output +from ocr_utils import convert_image_to_greyscale_bytes +from ocr_utils import image_conversion_parameters +from ocr_utils import pdf_to_pil_images_iterator +from ocr_constants import Constants logger = logging.getLogger(__name__) @@ -25,9 +27,7 @@ img_bytes = stream.read() if suffix == "pdf": - pdf_images = convert_from_bytes(img_bytes, fmt='jpg', dpi=params[Constants.DPI]) - - for j, img in enumerate(pdf_images): + for j, img in enumerate(pdf_to_pil_images_iterator(img_bytes)): img_bytes = convert_image_to_greyscale_bytes(img, quality=params[Constants.QUALITY]) output_folder.upload_data("{0}/{0}{1}{2:05d}.jpg".format(prefix, Constants.PDF_MULTI_SUFFIX, j+1), img_bytes) diff --git a/custom-recipes/image-processing-custom/recipe.json b/custom-recipes/image-processing-custom/recipe.json index c82eb55..bdadee9 100644 --- a/custom-recipes/image-processing-custom/recipe.json +++ b/custom-recipes/image-processing-custom/recipe.json @@ -1,8 +1,9 @@ { "meta": { - "label": "OCR - Image processing", + "label": "Image processing", "description": "For advanced users. Use the notebook template to find good image processing functions for your images. All images in the input folder are processed using the image processing functions specified by the user in the recipe parameter's form. 
Inputs must be greyscale JPG images.", - "icon": "icon-cogs" + "icon": "icon-cogs", + "displayOrderRank": 3 }, "kind": "PYTHON", diff --git a/custom-recipes/image-processing-custom/recipe.py b/custom-recipes/image-processing-custom/recipe.py index d00bba7..df02be2 100644 --- a/custom-recipes/image-processing-custom/recipe.py +++ b/custom-recipes/image-processing-custom/recipe.py @@ -3,8 +3,9 @@ from io import BytesIO import numpy as np import logging -from utils import get_input_output, image_processing_parameters -from constants import Constants +from ocr_recipes_io_utils import get_input_output +from ocr_utils import image_processing_parameters +from ocr_constants import Constants logger = logging.getLogger(__name__) diff --git a/custom-recipes/ocr-text-extraction-dataset/recipe.json b/custom-recipes/ocr-text-extraction-dataset/recipe.json index f2a193e..f5c280a 100644 --- a/custom-recipes/ocr-text-extraction-dataset/recipe.json +++ b/custom-recipes/ocr-text-extraction-dataset/recipe.json @@ -1,8 +1,9 @@ { "meta": { "label": "OCR - Text extraction", - "description": "Extract text from JPG images into a dataset of filename and text.", - "icon": "icon-file-text-alt" + "description": "Extract text from PDF/JPG/JPEG/PNG/TIFF files into a dataset of filename and text.", + "icon": "icon-file-text-alt", + "displayOrderRank": 1 }, "kind": "PYTHON", @@ -31,25 +32,42 @@ "acceptsDataset": true } ], - + "paramsPythonSetup": "select_ocr_engine.py", "params": [ { "name": "recombine_pdf", "label" : "Recombine multiple-page PDF together", "type": "BOOLEAN", - "description": "Text of images that are from the same original multiple-page PDF (images with name pattern _pdf_page_XXXXX.jpg) are concatenated." + "description": "Multiple-page PDFs and images with name pattern $FILENAME_pdf_page_XXXXX.jpg are extracted into a single row." + }, + { + "name": "ocr_engine", + "label": "OCR Engine", + "type": "SELECT", + "mandatory": true, + "description": "", + "defaultValue": "default", + "getChoicesFromPython": true }, { "name": "advanced_parameters", "label" : "Advanced preprocessing parameters", - "type": "BOOLEAN" + "type": "BOOLEAN", + "visibilityCondition" : "model.ocr_engine != 'default'" }, { "name": "language", "label": "Specify language", "type": "STRING", - "description": "Enter language code found at https://tesseract-ocr.github.io/tessdoc/Data-Files. Languages must be installed beforehand.", - "visibilityCondition" : "model.advanced_parameters" + "description": "Enter language code found at https://tesseract-ocr.github.io/tessdoc/Data-Files. 
Languages must be installed beforehand", + "visibilityCondition" : "model.advanced_parameters && model.ocr_engine == 'tesseract'" + }, + { + "name": "language_easyocr", + "label": "Specify language", + "type": "STRING", + "description": "Enter language code found at https://www.jaided.ai/easyocr/.", + "visibilityCondition" : "model.advanced_parameters && model.ocr_engine == 'easyocr'" } ], diff --git a/custom-recipes/ocr-text-extraction-dataset/recipe.py b/custom-recipes/ocr-text-extraction-dataset/recipe.py index 2165f19..629b870 100644 --- a/custom-recipes/ocr-text-extraction-dataset/recipe.py +++ b/custom-recipes/ocr-text-extraction-dataset/recipe.py @@ -1,9 +1,17 @@ -from dataiku.customrecipe import get_recipe_config import logging -from utils import get_input_output, text_extraction_parameters -from tesseractocr.extract_text import text_extraction +import os import pandas as pd -from constants import Constants +import re +from time import perf_counter + +from dataiku.customrecipe import get_recipe_config +from ocr_constants import Constants +from ocr_recipes_io_utils import get_input_output +from ocr_utils import convert_image_to_greyscale_bytes +from ocr_utils import pdf_to_pil_images_iterator +from ocr_utils import text_extraction_parameters +from tesseractocr.extract_text import text_extraction + logger = logging.getLogger(__name__) @@ -12,27 +20,43 @@ params = text_extraction_parameters(get_recipe_config()) input_filenames = input_folder.list_paths_in_partition() -total_images = len(input_filenames) +total_files = len(input_filenames) -df = pd.DataFrame() +rows = [] for i, sample_file in enumerate(input_filenames): - if sample_file.split('.')[-1] != "jpg": - logger.info("OCR - Rejecting {} because it is not a JPG file.".format(sample_file)) - logger.info("OCR - Rejected {}/{} images".format(i+1, total_images)) + prefix, suffix = os.path.splitext(sample_file) + suffix = suffix[1:] # removing the dot from the extension + + if suffix not in Constants.TYPES: + logger.info("OCR - Rejecting {} because it is not a {} file.".format(sample_file, '/'.join(Constants.TYPES))) + logger.info("OCR - Rejected {}/{} files".format(i+1, total_files)) continue with input_folder.get_download_stream(sample_file) as stream: img_bytes = stream.read() - img_text = text_extraction(img_bytes, params) - logger.info("OCR - Extracted text from {}/{} images".format(i+1, total_images)) + start = perf_counter() + + if suffix == "pdf": + for j, img in enumerate(pdf_to_pil_images_iterator(img_bytes)): + img_bytes = convert_image_to_greyscale_bytes(img) + img_text = text_extraction(img_bytes, params) + + pdf_image_name = "{}{}{:05d}".format(prefix, Constants.PDF_MULTI_SUFFIX, j+1) + rows.append({'file': pdf_image_name, 'text': img_text}) + else: + img_text = text_extraction(img_bytes, params) + rows.append({'file': prefix, 'text': img_text}) + + logger.info("OCR - Extracted text from {}/{} files (in {:.2f} seconds)".format(i+1, total_files, perf_counter() - start)) - df = df.append({'file': sample_file.split('/')[-1].split('.')[0], 'text': img_text}, ignore_index=True) +df = pd.DataFrame(rows) if params['recombine_pdf']: - df['page_nb'] = df.apply(lambda row: int(row['file'].split(Constants.PDF_MULTI_SUFFIX)[1]) if Constants.PDF_MULTI_SUFFIX in row['file'] else 1, axis=1) - df['file'] = df.apply(lambda row: row['file'].split(Constants.PDF_MULTI_SUFFIX)[0] if Constants.PDF_MULTI_SUFFIX in row['file'] else row['file'], axis=1) + pdf_multi_page_pattern = "^.*{}\d{{5}}$".format(Constants.PDF_MULTI_SUFFIX) + 
df['page_nb'] = df.apply(lambda row: int(row['file'].split(Constants.PDF_MULTI_SUFFIX)[1]) if re.match(pdf_multi_page_pattern, row['file']) else 1, axis=1) + df['file'] = df.apply(lambda row: row['file'].split(Constants.PDF_MULTI_SUFFIX)[0] if re.match(pdf_multi_page_pattern, row['file']) else row['file'], axis=1) df = df.sort_values(['file', 'page_nb'], ascending=True) diff --git a/plugin.json b/plugin.json index 9d00ee1..6991fa8 100644 --- a/plugin.json +++ b/plugin.json @@ -1,10 +1,10 @@ { "id": "tesseract-ocr", - "version": "1.0.3", + "version": "2.0.0", "meta": { - "label": "Tesseract OCR", - "description": "Extract text from images using the Tesseract Optical Character Recognition (OCR) engine", - "author": "Dataiku (Stanislas GUINEL)", + "label": "OCR", + "description": "Extract text from images using OCR engines", + "author": "Dataiku", "icon": "icon-file-text-alt", "tags": [ "NLP", diff --git a/python-lib/constants.py b/python-lib/ocr_constants.py similarity index 53% rename from python-lib/constants.py rename to python-lib/ocr_constants.py index 38ec1d1..a4cb3f3 100644 --- a/python-lib/constants.py +++ b/python-lib/ocr_constants.py @@ -6,5 +6,10 @@ class Constants: DPI = "dpi" QUALITY = "quality" RECOMBINE_PDF = "recombine_pdf" - LANGUAGE = "language" - DEFAULT_LANGUAGE = "eng" + LANGUAGE_TESSERACT = "language" + LANGUAGE_EASYOCR = "language_easyocr" + OCR_ENGINE = "ocr_engine" + DEFAULT_ENGINE = "default" + TESSERACT = "tesseract" + EASYOCR = "easyocr" + EASYOCR_READER = "easyocr_reader" diff --git a/python-lib/ocr_recipes_io_utils.py b/python-lib/ocr_recipes_io_utils.py new file mode 100644 index 0000000..0a43ec7 --- /dev/null +++ b/python-lib/ocr_recipes_io_utils.py @@ -0,0 +1,21 @@ +import dataiku +from dataiku.customrecipe import get_input_names_for_role +from dataiku.customrecipe import get_output_names_for_role + + +def get_input_output(input_type='dataset', output_type='dataset'): + if input_type == 'folder': + input_names = get_input_names_for_role('input_folder')[0] + input_obj = dataiku.Folder(input_names) + else: + input_names = get_input_names_for_role('input_dataset')[0] + input_obj = dataiku.Dataset(input_names) + + if output_type == 'folder': + output_names = get_output_names_for_role('output_folder')[0] + output_obj = dataiku.Folder(output_names) + else: + output_names = get_output_names_for_role('output_dataset')[0] + output_obj = dataiku.Dataset(output_names) + + return input_obj, output_obj diff --git a/python-lib/ocr_utils.py b/python-lib/ocr_utils.py new file mode 100644 index 0000000..33ea0c1 --- /dev/null +++ b/python-lib/ocr_utils.py @@ -0,0 +1,82 @@ +from io import BytesIO +from ocr_constants import Constants +import os +import pypdfium2 as pdfium +from shutil import which + + +def pdf_to_pil_images_iterator(pdf_bytes, dpi=None): + """ iterator over the multiple images of pdf bytes """ + pdf_pages = pdfium.PdfDocument(pdf_bytes) + # scale is DPI / 72 according to pypdfium2 doc + scale = dpi / 72 if dpi else 2 + for pdf_page in pdf_pages: + yield pdf_page.render(scale=scale).to_pil() + + +def convert_image_to_greyscale_bytes(img, quality=75): + """ convert a PIL image to greyscale with a specified dpi and output image as bytes """ + img = img.convert('L') + buf = BytesIO() + img.save(buf, format='JPEG', quality=quality) + return buf.getvalue() + + +def image_conversion_parameters(recipe_config): + """ retrieve image conversion recipe parameters """ + params = {} + dpi = recipe_config.get(Constants.DPI, 200) + assert dpi > 0 and dpi <= 4000 + 
params[Constants.DPI] = dpi + + quality = recipe_config.get(Constants.QUALITY, 75) + assert quality > 0 and quality <= 95 + params[Constants.QUALITY] = quality + + return params + + +def image_processing_parameters(recipe_config): + """ retrieve image processing recipe parameters """ + params = {} + params[Constants.FUNCTIONS_DEF] = recipe_config.get(Constants.FUNCTIONS_DEF, None) + params[Constants.PIPELINE_DEF] = recipe_config.get(Constants.PIPELINE_DEF, None) + return params + + +def text_extraction_parameters(recipe_config): + """ retrieve text extraction recipe parameters """ + params = {} + params[Constants.RECOMBINE_PDF] = recipe_config.get(Constants.RECOMBINE_PDF, False) + selected_ocr_engine = recipe_config.get(Constants.OCR_ENGINE, Constants.DEFAULT_ENGINE) + advanced = recipe_config.get('advanced_parameters', False) + + if selected_ocr_engine == Constants.DEFAULT_ENGINE: + advanced = False + selected_ocr_engine = get_default_ocr_engine() + + params[Constants.OCR_ENGINE] = selected_ocr_engine + + if params[Constants.OCR_ENGINE] == Constants.TESSERACT: + params[Constants.LANGUAGE_TESSERACT] = recipe_config.get(Constants.LANGUAGE_TESSERACT, "eng") if advanced else "eng" + elif params[Constants.OCR_ENGINE] == Constants.EASYOCR: + import easyocr + language = recipe_config.get(Constants.LANGUAGE_EASYOCR, "en") if advanced else "en" + # instantiate the easyocr.Reader only once here because it takes some time + # use tmp folders inside the job temporary folder to store the model and the custom network model (note that this one isn't used) + model_storage_directory = os.path.join(os.getcwd(), "easyocr_model_tmp") + user_network_directory = os.path.join(os.getcwd(), "easyocr_user_network_tmp") + params[Constants.EASYOCR_READER] = easyocr.Reader( + lang_list=[language], gpu=False, + model_storage_directory=model_storage_directory, + user_network_directory=user_network_directory, + verbose=False + ) + + return params + + +def get_default_ocr_engine(): + if which("tesseract") is not None: # check if tesseract is in the path + return Constants.TESSERACT + return Constants.EASYOCR diff --git a/python-lib/tesseractocr/extract_text.py b/python-lib/tesseractocr/extract_text.py index 35f0b5c..3a3e9f0 100644 --- a/python-lib/tesseractocr/extract_text.py +++ b/python-lib/tesseractocr/extract_text.py @@ -3,14 +3,14 @@ import numpy as np import pytesseract import logging -from constants import Constants +from ocr_constants import Constants logger = logging.getLogger(__name__) def text_extraction(img_bytes, params): """ - extract text from bytes images using pytesseract (with specified language) + extract text from bytes images using the selected OCR engine (with specified language) """ img = Image.open(BytesIO(img_bytes)) @@ -19,10 +19,20 @@ def text_extraction(img_bytes, params): logger.info("OCR - converting image to greyscale.") img = img.convert('L') - try: - img = np.array(img) - img_text = pytesseract.image_to_string(img, lang=params[Constants.LANGUAGE]) - except Exception as e: - raise Exception("OCR - Error calling pytesseract: {}".format(e)) + if params[Constants.OCR_ENGINE] == Constants.TESSERACT: + try: + img = np.array(img) + img_text = pytesseract.image_to_string(img, lang=params[Constants.LANGUAGE_TESSERACT]) + except Exception as e: + raise Exception("OCR - Error calling pytesseract: {}".format(e)) + elif params[Constants.OCR_ENGINE] == Constants.EASYOCR: + try: + img = np.array(img) + reader = params[Constants.EASYOCR_READER] + img_text = " ".join(reader.readtext(img_bytes, detail=0)) 
+ except Exception as e: + raise Exception("OCR - Error calling easyocr: {}".format(e)) + else: + raise NotImplementedError("OCR engine {} not implemented".format(params[Constants.OCR_ENGINE])) return img_text diff --git a/python-lib/utils.py b/python-lib/utils.py deleted file mode 100644 index 8532ebe..0000000 --- a/python-lib/utils.py +++ /dev/null @@ -1,65 +0,0 @@ -import dataiku -from dataiku.customrecipe import get_input_names_for_role, get_output_names_for_role -from io import BytesIO -from constants import Constants - - -def get_input_output(input_type='dataset', output_type='dataset'): - if input_type == 'folder': - input_names = get_input_names_for_role('input_folder')[0] - input_obj = dataiku.Folder(input_names) - else: - input_names = get_input_names_for_role('input_dataset')[0] - input_obj = dataiku.Dataset(input_names) - - if output_type == 'folder': - output_names = get_output_names_for_role('output_folder')[0] - output_obj = dataiku.Folder(output_names) - else: - output_names = get_output_names_for_role('output_dataset')[0] - output_obj = dataiku.Dataset(output_names) - - return input_obj, output_obj - - -def convert_image_to_greyscale_bytes(img, quality): - """ convert a PIL image to greyscale with a specified dpi and output image as bytes """ - img = img.convert('L') - buf = BytesIO() - img.save(buf, format='JPEG', quality=quality) - return buf.getvalue() - - -def image_conversion_parameters(recipe_config): - """ retrieve image conversion recipe parameters """ - params = {} - dpi = recipe_config.get(Constants.DPI, 200) - assert dpi > 0 and dpi <= 4000 - params[Constants.DPI] = dpi - - quality = recipe_config.get(Constants.QUALITY, 75) - assert quality > 0 and quality <= 95 - params[Constants.QUALITY] = quality - - return params - - -def image_processing_parameters(recipe_config): - """ retrieve image processing recipe parameters """ - params = {} - params[Constants.FUNCTIONS_DEF] = recipe_config.get(Constants.FUNCTIONS_DEF, None) - params[Constants.PIPELINE_DEF] = recipe_config.get(Constants.PIPELINE_DEF, None) - return params - - -def text_extraction_parameters(recipe_config): - """ retrieve text extraction recipe parameters """ - params = {} - params[Constants.RECOMBINE_PDF] = recipe_config.get(Constants.RECOMBINE_PDF, False) - params['advanced'] = recipe_config.get('advanced_parameters', False) - if params['advanced']: - params[Constants.LANGUAGE] = recipe_config.get(Constants.LANGUAGE, Constants.DEFAULT_LANGUAGE) - else: - params[Constants.LANGUAGE] = Constants.DEFAULT_LANGUAGE - - return params diff --git a/resource/select_ocr_engine.py b/resource/select_ocr_engine.py new file mode 100644 index 0000000..58ee115 --- /dev/null +++ b/resource/select_ocr_engine.py @@ -0,0 +1,32 @@ +from ocr_utils import get_default_ocr_engine +from ocr_constants import Constants + + +OCR_ENGINES = { + Constants.TESSERACT: "Tesseract", + Constants.EASYOCR: "EasyOCR" +} + + +def do(payload, config, plugin_config, inputs): + """ + Retrieve a list of OCR engines including a default engine that points to an available engine. 
+ """ + choices = [] + if payload.get("parameterName") == Constants.OCR_ENGINE: + default_ocr_engine = get_default_ocr_engine() + choices.append({ + "label": "Default ({})".format(OCR_ENGINES[default_ocr_engine]), + "value": "default" + }) + + if default_ocr_engine != Constants.TESSERACT: + OCR_ENGINES[Constants.TESSERACT] += " (not installed)" + + for engine_value, engine_label in OCR_ENGINES.items(): + choices.append({ + "label": engine_label, + "value": engine_value + }) + + return {"choices": choices} diff --git a/tests/python/integration/pytest.ini b/tests/python/integration/pytest.ini new file mode 100644 index 0000000..f45b532 --- /dev/null +++ b/tests/python/integration/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +usefixtures = plugin dss_target diff --git a/tests/python/integration/requirements.txt b/tests/python/integration/requirements.txt new file mode 100644 index 0000000..9c9c9f7 --- /dev/null +++ b/tests/python/integration/requirements.txt @@ -0,0 +1,4 @@ +pandas>=1.0,<1.1 +pytest==6.2.1 +dataiku-api-client +git+git://github.com/dataiku/dataiku-plugin-tests-utils.git@master#egg=dataiku-plugin-tests-utils \ No newline at end of file diff --git a/tests/python/integration/test_scenario.py b/tests/python/integration/test_scenario.py new file mode 100644 index 0000000..8fdd276 --- /dev/null +++ b/tests/python/integration/test_scenario.py @@ -0,0 +1,13 @@ +# -*- coding: utf-8 -*- +from dku_plugin_test_utils import dss_scenario + + +TEST_PROJECT_KEY = "TESTOCRPLUGIN" + + +def test_run_image_processing(user_dss_clients): + dss_scenario.run(user_dss_clients, project_key=TEST_PROJECT_KEY, scenario_id="IMAGE_PROCESSING") + + +def test_run_text_extraction(user_dss_clients): + dss_scenario.run(user_dss_clients, project_key=TEST_PROJECT_KEY, scenario_id="TEXT_EXTRACTION")