diff --git a/README.md b/README.md index 292cfbc..916c556 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ :warning: Development is currently focused on achieving the best possible quality of results for a wide variety of historical documents and therefore processing can be very slow. We aim to improve this, but contributions are welcome. ## Installation -Python `3.8-3.11` with Tensorflow `2.12-2.15` on Linux are currently supported. +Python `3.8-3.11` with Tensorflow `<2.13` on Linux are currently supported. For (limited) GPU support the CUDA toolkit needs to be installed. diff --git a/requirements.txt b/requirements.txt index f01d319..af6aaa3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,8 @@ # ocrd includes opencv, numpy, shapely, click -ocrd >= 2.23.3 +ocrd >= 3.0.0b4 numpy <1.24.0 scikit-learn >= 0.23.2 -tensorflow == 2.12.1 +tensorflow < 2.13 imutils >= 0.5.3 matplotlib setuptools >= 50 diff --git a/src/eynollah/cli.py b/src/eynollah/cli.py index 564b8b0..772efb4 100644 --- a/src/eynollah/cli.py +++ b/src/eynollah/cli.py @@ -1,6 +1,6 @@ import sys import click -from ocrd_utils import initLogging, setOverrideLogLevel +from ocrd_utils import initLogging, setOverrideLogLevel, getLogger from eynollah.eynollah import Eynollah @@ -186,10 +186,11 @@ def main( print('Error: You used -eoi which can not be enabled alongside light_version -light or allow_scaling -as or allow_enhancement -ae or curved_line -cl or textline_light -tll or full_layout -fl or tables -tab or right2left -r2l or headers_off -ho') sys.exit(1) eynollah = Eynollah( + model, + getLogger('Eynollah'), image_filename=image, dir_out=out, dir_in=dir_in, - dir_models=model, dir_of_cropped_images=save_images, extract_only_images=extract_only_images, dir_of_layout=save_layout, diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 511e994..4755e4b 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -6,14 +6,18 @@ document layout analysis 
(segmentation) with output in PAGE-XML """ +from logging import Logger import math import os import sys import time +from typing import Optional import warnings from pathlib import Path from multiprocessing import Process, Queue, cpu_count import gc +from PIL.Image import Image +from ocrd import OcrdPage from ocrd_utils import getLogger import cv2 import numpy as np @@ -142,33 +146,33 @@ def get_config(self): class Eynollah: def __init__( self, - dir_models, - image_filename=None, - image_pil=None, - image_filename_stem=None, - dir_out=None, - dir_in=None, - dir_of_cropped_images=None, - extract_only_images=False, - dir_of_layout=None, - dir_of_deskewed=None, - dir_of_all=None, - dir_save_page=None, - enable_plotting=False, - allow_enhancement=False, - curved_line=False, - textline_light=False, - full_layout=False, - tables=False, - right2left=False, - input_binary=False, - allow_scaling=False, - headers_off=False, - light_version=False, - ignore_page_extraction=False, - override_dpi=None, - logger=None, - pcgts=None, + dir_models : str, + logger : Logger, + image_filename : Optional[str] = None, + image_pil : Optional[Image] = None, + image_filename_stem : Optional[str] = None, + dir_out : Optional[str] = None, + dir_in : Optional[str] = None, + dir_of_cropped_images : Optional[str] = None, + extract_only_images : bool =False, + dir_of_layout : Optional[str] = None, + dir_of_deskewed : Optional[str] = None, + dir_of_all : Optional[str] = None, + dir_save_page : Optional[str] = None, + enable_plotting : bool = False, + allow_enhancement : bool = False, + curved_line : bool = False, + textline_light : bool = False, + full_layout : bool = False, + tables : bool = False, + right2left : bool = False, + input_binary : bool = False, + allow_scaling : bool = False, + headers_off : bool = False, + light_version : bool = False, + ignore_page_extraction : bool = False, + override_dpi : Optional[int] = None, + pcgts : Optional[OcrdPage] = None, ): if not dir_in: if image_pil: 
@@ -215,7 +219,7 @@ def __init__( curved_line=self.curved_line, textline_light = self.textline_light, pcgts=pcgts) - self.logger = logger if logger else getLogger('eynollah') + self.logger = logger self.dir_models = dir_models self.model_dir_of_enhancement = dir_models + "/eynollah-enhancement_20210425" diff --git a/src/eynollah/ocrd-tool.json b/src/eynollah/ocrd-tool.json index b840005..3295049 100644 --- a/src/eynollah/ocrd-tool.json +++ b/src/eynollah/ocrd-tool.json @@ -6,8 +6,8 @@ "executable": "ocrd-eynollah-segment", "categories": ["Layout analysis"], "description": "Segment page into regions and lines and do reading order detection with eynollah", - "input_file_grp": ["OCR-D-IMG", "OCR-D-SEG-PAGE", "OCR-D-GT-SEG-PAGE"], - "output_file_grp": ["OCR-D-SEG-LINE"], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "steps": ["layout/segmentation/region", "layout/segmentation/line"], "parameters": { "models": { @@ -29,21 +29,46 @@ "default": true, "description": "Try to detect all element subtypes, including drop-caps and headings" }, - "tables": { - "type": "boolean", - "default": false, - "description": "Try to detect table regions" - }, + "tables": { + "type": "boolean", + "default": false, + "description": "Try to detect table regions" + }, "curved_line": { "type": "boolean", "default": false, "description": "try to return contour of textlines instead of just rectangle bounding box. 
Needs more processing time" }, + "ignore_page_extraction": { + "type": "boolean", + "default": false, + "description": "if this parameter is set to true, this tool will ignore page extraction" + }, "allow_scaling": { "type": "boolean", "default": false, "description": "check the resolution against the number of detected columns and if needed, scale the image up or down during layout detection (heuristic to improve quality and performance)" }, + "allow_enhancement": { + "type": "boolean", + "default": false, + "description": "if this parameter is set to true, this tool will check whether the input image needs resizing and enhancement." + }, + "light_mode": { + "type": "boolean", + "default": false, + "description": "lighter and faster but simpler method for main region detection and deskewing" + }, + "textline_light": { + "type": "boolean", + "default": false, + "description": "if this parameter is set to true, this tool will try to return the contours of textlines instead of their rectangular bounding boxes, using a faster method." + }, + "right_to_left": { + "type": "boolean", + "default": false, + "description": "if this parameter is set to true, this tool will extract right-to-left reading order."
+ }, "headers_off": { "type": "boolean", "default": false, @@ -51,14 +76,14 @@ } }, "resources": [ - { - "description": "models for eynollah (TensorFlow SavedModel format)", - "url": "https://github.com/qurator-spk/eynollah/releases/download/v0.3.1/models_eynollah.tar.gz", - "name": "default", - "size": 1894627041, - "type": "archive", - "path_in_archive": "models_eynollah" - } + { + "description": "models for eynollah (TensorFlow SavedModel format)", + "url": "https://github.com/qurator-spk/eynollah/releases/download/v0.3.1/models_eynollah.tar.gz", + "name": "default", + "size": 1894627041, + "type": "archive", + "path_in_archive": "models_eynollah" + } ] } } diff --git a/src/eynollah/processor.py b/src/eynollah/processor.py index 1bd190e..fd7dd2a 100644 --- a/src/eynollah/processor.py +++ b/src/eynollah/processor.py @@ -1,68 +1,55 @@ -from json import loads -from pkg_resources import resource_string -from tempfile import NamedTemporaryFile -from pathlib import Path -from os.path import join - -from PIL import Image - -from ocrd import Processor -from ocrd_modelfactory import page_from_file, exif_from_filename -from ocrd_models import OcrdFile, OcrdExif -from ocrd_models.ocrd_page import to_xml -from ocrd_utils import ( - getLogger, - MIMETYPE_PAGE, - assert_file_grp_cardinality, - make_file_id -) +from typing import Optional +from ocrd_models import OcrdPage +from ocrd import Processor, OcrdPageResult from .eynollah import Eynollah -from .utils.pil_cv2 import pil2cv - -OCRD_TOOL = loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8')) class EynollahProcessor(Processor): - def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-eynollah-segment'] - kwargs['version'] = OCRD_TOOL['version'] - super().__init__(*args, **kwargs) + def setup(self) -> None: + # for caching models + self.models = None + if self.parameter['textline_light'] and not self.parameter['light_mode']: + raise ValueError("Error: You set parameter 
'textline_light' to enable light textline detection but parameter 'light_mode' is not enabled") - def process(self): - LOG = getLogger('eynollah') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - for n, input_file in enumerate(self.input_files): - page_id = input_file.pageId or input_file.ID - LOG.info("INPUT FILE %s (%d/%d) ", page_id, n + 1, len(self.input_files)) - pcgts = page_from_file(self.workspace.download_file(input_file)) - LOG.debug('width %s height %s', pcgts.get_Page().imageWidth, pcgts.get_Page().imageHeight) - self.add_metadata(pcgts) - page = pcgts.get_Page() - # XXX loses DPI information - # page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized') - image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(local_filename=page.imageFilename))).local_filename - eynollah_kwargs = { - 'dir_models': self.resolve_resource(self.parameter['models']), - 'allow_enhancement': False, - 'curved_line': self.parameter['curved_line'], - 'full_layout': self.parameter['full_layout'], - 'allow_scaling': self.parameter['allow_scaling'], - 'headers_off': self.parameter['headers_off'], - 'tables': self.parameter['tables'], - 'override_dpi': self.parameter['dpi'], - 'logger': LOG, - 'pcgts': pcgts, - 'image_filename': image_filename - } - Eynollah(**eynollah_kwargs).run() - file_id = make_file_id(input_file, self.output_file_grp) - pcgts.set_pcGtsId(file_id) - self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=page_id, - mimetype=MIMETYPE_PAGE, - local_filename=join(self.output_file_grp, file_id) + '.xml', - content=to_xml(pcgts)) + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: + assert input_pcgts + assert input_pcgts[0] + assert self.parameter + pcgts = input_pcgts[0] + page = pcgts.get_Page() + # if not('://' in page.imageFilename): + # image_filename = 
next(self.workspace.mets.find_files(local_filename=page.imageFilename)).local_filename + # else: + # # could be a URL with file:// or truly remote + # image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(url=page.imageFilename))).local_filename + page_image, _, _ = self.workspace.image_from_page( + page, page_id, + # avoid any features that would change the coordinate system: cropped,deskewed + # (the PAGE builder merely adds regions, so afterwards we would not know which to transform) + # also avoid binarization as models usually fare better on grayscale/RGB + feature_filter='cropped,deskewed,binarized') + eynollah = Eynollah( + self.resolve_resource(self.parameter['models']), + self.logger, + allow_enhancement=self.parameter['allow_enhancement'], + curved_line=self.parameter['curved_line'], + light_version=self.parameter['light_mode'], + right2left=self.parameter['right_to_left'], + ignore_page_extraction=self.parameter['ignore_page_extraction'], + textline_light=self.parameter['textline_light'], + full_layout=self.parameter['full_layout'], + allow_scaling=self.parameter['allow_scaling'], + headers_off=self.parameter['headers_off'], + tables=self.parameter['tables'], + override_dpi=self.parameter['dpi'], + pcgts=pcgts, + image_filename=page.imageFilename, + image_pil=page_image + ) + if self.models is not None: + # reuse loaded models from previous page + eynollah.models = self.models + eynollah.run() + self.models = eynollah.models + return OcrdPageResult(pcgts)