6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,11 @@
 # Changelog
 
+## [Version 2.4.0](https://github.com/dataiku/dss-plugin-tesseract-ocr/releases/tag/v2.4.0) - Minor release - 2025-10
+
+- Add support for partitioned input folders
+- Drop support for Python 3.6, 3.7 and 3.8
+- Add support for Python 3.11 and 3.12
+
 ## [Version 2.3.3](https://github.com/dataiku/dss-plugin-tesseract-ocr/releases/tag/v2.3.3) - Patch release - 2024-04
 
 - Add support for .tif extension in OCR
9 changes: 4 additions & 5 deletions code-env/python/desc.json
@@ -1,13 +1,12 @@
 {
     "acceptedPythonInterpreters": [
-        "PYTHON36",
-        "PYTHON37",
-        "PYTHON38",
         "PYTHON39",
-        "PYTHON310"
+        "PYTHON310",
+        "PYTHON311",
+        "PYTHON312"
     ],
     "corePackagesSet": "AUTO",
     "forceConda": false,
     "installCorePackages": true,
     "installJupyterSupport": true
-}
+}
19 changes: 8 additions & 11 deletions code-env/python/spec/requirements.txt
@@ -1,14 +1,11 @@
+Pillow==10.3.0
+opencv-python==4.8.1.78
+pytesseract==0.3.13
 pypdfium2==4.17.0
-pytesseract==0.3.7
-Pillow==8.2.0
-matplotlib==3.3.4; python_version <= '3.9'
-matplotlib==3.7.1; python_version >= '3.10'
-opencv-python==4.5.1.48; python_version <= '3.9'
-opencv-python==4.7.0.72; python_version >= '3.10'
 deskew==0.10.33
-torch==1.11.0; python_version >= '3.10'
-torch==1.9.1; python_version <= '3.9'
-easyocr==1.7.0
-packaging==21.3
+matplotlib==3.7.1
+packaging==24.0
+torch==2.8.0
+easyocr==1.7.2
 python-docx==0.8.11
-pypandoc==1.11
+pypandoc==1.12
4 changes: 2 additions & 2 deletions custom-recipes/image-conversion/recipe.py
@@ -2,7 +2,7 @@
 from PIL import Image
 from io import BytesIO
 import logging
-from text_extraction_ocr_utils.recipes_io_utils import get_input_output
+from text_extraction_ocr_utils.recipes_io_utils import get_input_output, list_input_paths
 from text_extraction_ocr_utils import convert_image_to_greyscale_bytes
 from text_extraction_ocr_utils import image_conversion_parameters
 from text_extraction_ocr_utils import pdf_to_pil_images_iterator
@@ -15,7 +15,7 @@
 
 params = image_conversion_parameters(get_recipe_config())
 
-input_filenames = input_folder.list_paths_in_partition()
+input_filenames = list_input_paths(input_folder)
 total_images = len(input_filenames)
 
 # check if pdf and split pdf into multiple images
4 changes: 2 additions & 2 deletions custom-recipes/image-processing-custom/recipe.py
@@ -3,15 +3,15 @@
 from io import BytesIO
 import numpy as np
 import logging
-from text_extraction_ocr_utils.recipes_io_utils import get_input_output
+from text_extraction_ocr_utils.recipes_io_utils import get_input_output, list_input_paths
 from text_extraction_ocr_utils import image_processing_parameters
 from text_extraction_ocr_utils import Constants
 
 logger = logging.getLogger(__name__)
 
 input_folder, output_folder = get_input_output('folder', 'folder')
 
-input_filenames = input_folder.list_paths_in_partition()
+input_filenames = list_input_paths(input_folder)
 total_images = len(input_filenames)
 
 params = image_processing_parameters(get_recipe_config())
4 changes: 2 additions & 2 deletions custom-recipes/ocr-text-extraction-dataset/recipe.py
@@ -6,7 +6,7 @@
 
 from dataiku.customrecipe import get_recipe_config
 from text_extraction_ocr_utils import Constants
-from text_extraction_ocr_utils.recipes_io_utils import get_input_output
+from text_extraction_ocr_utils.recipes_io_utils import get_input_output, list_input_paths
 from text_extraction_ocr_utils import convert_image_to_greyscale_bytes
 from text_extraction_ocr_utils import pdf_to_pil_images_iterator
 from text_extraction_ocr_utils import ocr_parameters
@@ -21,7 +21,7 @@
 
 params = ocr_parameters(get_recipe_config())
 
-input_filenames = input_folder.list_paths_in_partition()
+input_filenames = list_input_paths(input_folder)
 total_files = len(input_filenames)
 
 rows = []
4 changes: 2 additions & 2 deletions custom-recipes/text-extraction/recipe.py
@@ -4,7 +4,7 @@
 from time import perf_counter
 
 from dataiku.customrecipe import get_recipe_config
-from text_extraction_ocr_utils.recipes_io_utils import get_input_output
+from text_extraction_ocr_utils.recipes_io_utils import get_input_output, list_input_paths
 from text_extraction_ocr_utils import text_extraction_parameters
 from text_extraction_ocr_utils import Constants
 from text_extraction import extract_text_content
@@ -21,7 +21,7 @@
 
 params = text_extraction_parameters(get_recipe_config())
 
-input_filenames = input_folder.list_paths_in_partition()
+input_filenames = list_input_paths(input_folder)
 total_files = len(input_filenames)
 
 rows = []
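All four recipes above now call the shared list_input_paths(input_folder) helper (added in recipes_io_utils.py below) instead of input_folder.list_paths_in_partition(). As a rough, hedged sketch of how the listed paths are then consumed, the collapsed loop bodies look roughly like this; the per-file processing differs per recipe, and get_download_stream is the standard dataiku.Folder read API:

# Illustrative only: the real loop bodies are collapsed in the diffs above.
# Reuses the names defined earlier in each recipe (input_folder, input_filenames)
# and the standard dataiku.Folder streaming API.
for path in input_filenames:
    with input_folder.get_download_stream(path) as stream:
        file_bytes = stream.read()
    # ...per-recipe processing: greyscale conversion, OCR, or raw text
    # extraction, with results written to the recipe's output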
2 changes: 1 addition & 1 deletion plugin.json
@@ -1,6 +1,6 @@
 {
     "id": "tesseract-ocr",
-    "version": "2.3.3",
+    "version": "2.4.0",
     "meta": {
         "label": "Text extraction and OCR",
         "description": "Extract text from documents & images.",
9 changes: 9 additions & 0 deletions python-lib/text_extraction_ocr_utils/recipes_io_utils.py
@@ -1,6 +1,7 @@
 import dataiku
 from dataiku.customrecipe import get_input_names_for_role
 from dataiku.customrecipe import get_output_names_for_role
+from dataiku.core import flow
 
 
 def get_input_output(input_type='dataset', output_type='dataset'):
@@ -19,3 +20,11 @@ def get_input_output(input_type='dataset', output_type='dataset'):
         output_obj = dataiku.Dataset(output_names)
 
     return input_obj, output_obj
+
+def list_input_paths(input_folder):
+    partitions = flow.FLOW['in'][0].get("partitions", [""])
+    return [
+        path
+        for partition in partitions
+        for path in input_folder.list_paths_in_partition(partition)
+    ]
Comment on lines +24 to +30

@clairebehue (Sep 29, 2025):

i'm pretty sure we got the same issue in the native* recipe 😢: always listing all files. could you quickly double check if you already have your partitioned folder in place please? (i'll take care of opening the card if needed)

Contributor Author:

I tested it and found no issue with the embed doc recipe. The KB does not support partitions, so we read all partitions by default, with the ability to customize the selection in the I/O settings.

Reply:

i was missing the I/O selection settings indeed! thanks for checking this 🙏
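To make the behaviour discussed in this thread concrete, here is a minimal standalone sketch of the logic the new list_input_paths helper implements. FakeFolder and list_input_paths_like are hypothetical stand-ins used purely for illustration; the real helper reads the activated partitions from flow.FLOW inside a running DSS recipe, and the [""] default means an unpartitioned (or unconfigured) input falls back to listing the whole folder.

# Standalone illustration of the new helper's behaviour.
# FakeFolder is a hypothetical stand-in for dataiku.Folder; the real helper
# reads the partition list from flow.FLOW['in'][0] at recipe runtime.
class FakeFolder:
    def __init__(self, paths_by_partition):
        self._paths = paths_by_partition

    def list_paths_in_partition(self, partition=""):
        if partition == "":
            # unpartitioned case: list every path in the folder
            return [p for paths in self._paths.values() for p in paths]
        return self._paths.get(partition, [])

def list_input_paths_like(input_folder, partitions=None):
    # same shape as list_input_paths() above, with the partitions passed
    # explicitly instead of being read from flow.FLOW
    partitions = partitions or [""]
    return [
        path
        for partition in partitions
        for path in input_folder.list_paths_in_partition(partition)
    ]

folder = FakeFolder({"2025-01": ["/2025-01/a.pdf"], "2025-02": ["/2025-02/b.png"]})
print(list_input_paths_like(folder, ["2025-01"]))  # only the 2025-01 files
print(list_input_paths_like(folder))               # no partitions: whole folder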

6 changes: 3 additions & 3 deletions tests/python/integration/requirements.txt
@@ -1,4 +1,4 @@
-pandas>=1.0,<1.1
-pytest==6.2.1
+pytest
 requests<2.22,>=2
+dataiku-api-client
-git+git://github.com/dataiku/dataiku-plugin-tests-utils.git@master#egg=dataiku-plugin-tests-utils
+git+https://github.com/dataiku/dataiku-plugin-tests-utils.git@master#egg=dataiku-plugin-tests-utils