-
Notifications
You must be signed in to change notification settings - Fork 30
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
port processor to core v3 #130
base: machine_based_reading_order_integration
Are you sure you want to change the base?
Changes from all commits
0a3f525
4a13781
9ce02a5
0d83db7
87adc4b
39b16e5
ddcc019
d7caeb2
8dfecb7
3381e5a
49c1a8f
c37d95d
61bcb43
d98fa2a
ecd202e
d26079d
7b92620
aef46a4
dfc4ac2
1e90257
17eafc1
9b274dc
f9c2d85
fdedae2
c6e0e05
2189391
bc9dddd
869110f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,8 @@ | ||
# ocrd includes opencv, numpy, shapely, click | ||
ocrd >= 2.23.3 | ||
ocrd >= 3.0.0b4 | ||
numpy <1.24.0 | ||
scikit-learn >= 0.23.2 | ||
tensorflow == 2.12.1 | ||
tensorflow < 2.13 | ||
imutils >= 0.5.3 | ||
matplotlib | ||
setuptools >= 50 |
Original file line number | Diff line number | Diff line change | ||||||
---|---|---|---|---|---|---|---|---|
|
@@ -6,8 +6,8 @@ | |||||||
"executable": "ocrd-eynollah-segment", | ||||||||
"categories": ["Layout analysis"], | ||||||||
"description": "Segment page into regions and lines and do reading order detection with eynollah", | ||||||||
"input_file_grp": ["OCR-D-IMG", "OCR-D-SEG-PAGE", "OCR-D-GT-SEG-PAGE"], | ||||||||
"output_file_grp": ["OCR-D-SEG-LINE"], | ||||||||
"input_file_grp_cardinality": 1, | ||||||||
"output_file_grp_cardinality": 1, | ||||||||
"steps": ["layout/segmentation/region", "layout/segmentation/line"], | ||||||||
"parameters": { | ||||||||
"models": { | ||||||||
|
@@ -29,36 +29,61 @@ | |||||||
"default": true, | ||||||||
"description": "Try to detect all element subtypes, including drop-caps and headings" | ||||||||
}, | ||||||||
"tables": { | ||||||||
"type": "boolean", | ||||||||
"default": false, | ||||||||
"description": "Try to detect table regions" | ||||||||
}, | ||||||||
"tables": { | ||||||||
"type": "boolean", | ||||||||
"default": false, | ||||||||
"description": "Try to detect table regions" | ||||||||
}, | ||||||||
"curved_line": { | ||||||||
"type": "boolean", | ||||||||
"default": false, | ||||||||
"description": "try to return contour of textlines instead of just rectangle bounding box. Needs more processing time" | ||||||||
}, | ||||||||
"ignore_page_extraction": { | ||||||||
"type": "boolean", | ||||||||
"default": false, | ||||||||
"description": "if this parameter is set to true, this tool will ignore page extraction" | ||||||||
}, | ||||||||
"allow_scaling": { | ||||||||
"type": "boolean", | ||||||||
"default": false, | ||||||||
"description": "check the resolution against the number of detected columns and if needed, scale the image up or down during layout detection (heuristic to improve quality and performance)" | ||||||||
}, | ||||||||
"allow_enhancement": { | ||||||||
"type": "boolean", | ||||||||
"default": false, | ||||||||
"description": "if this parameter is set to true, this tool will check whether the input image needs resizing and enhancement." | ||||||||
}, | ||||||||
"light_mode": { | ||||||||
"type": "boolean", | ||||||||
"default": false, | ||||||||
"description": "lighter and faster but simpler method for main region detection and deskewing" | ||||||||
}, | ||||||||
"textline_light": { | ||||||||
"type": "boolean", | ||||||||
"default": false, | ||||||||
"description": "if this parameter is set to true, this tool will try to return the contour of each textline instead of its rectangular bounding box, using a faster method." | ||||||||
}, | ||||||||
"right_to_left": { | ||||||||
"type": "boolean", | ||||||||
"default": false, | ||||||||
"description": "if this parameter is set to true, this tool will extract right-to-left reading order." | ||||||||
}, | ||||||||
"headers_off": { | ||||||||
"type": "boolean", | ||||||||
"default": false, | ||||||||
"description": "ignore the special role of headings during reading order detection" | ||||||||
} | ||||||||
}, | ||||||||
"resources": [ | ||||||||
{ | ||||||||
"description": "models for eynollah (TensorFlow SavedModel format)", | ||||||||
"url": "https://github.com/qurator-spk/eynollah/releases/download/v0.3.1/models_eynollah.tar.gz", | ||||||||
"name": "default", | ||||||||
"size": 1894627041, | ||||||||
"type": "archive", | ||||||||
"path_in_archive": "models_eynollah" | ||||||||
} | ||||||||
{ | ||||||||
"description": "models for eynollah (TensorFlow SavedModel format)", | ||||||||
"url": "https://github.com/qurator-spk/eynollah/releases/download/v0.3.1/models_eynollah.tar.gz", | ||||||||
"name": "default", | ||||||||
"size": 1894627041, | ||||||||
"type": "archive", | ||||||||
"path_in_archive": "models_eynollah" | ||||||||
} | ||||||||
] | ||||||||
} | ||||||||
} | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||
|
Original file line number | Diff line number | Diff line change | ||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
@@ -1,68 +1,55 @@ | ||||||||||||||||
from json import loads | ||||||||||||||||
from pkg_resources import resource_string | ||||||||||||||||
from tempfile import NamedTemporaryFile | ||||||||||||||||
from pathlib import Path | ||||||||||||||||
from os.path import join | ||||||||||||||||
|
||||||||||||||||
from PIL import Image | ||||||||||||||||
|
||||||||||||||||
from ocrd import Processor | ||||||||||||||||
from ocrd_modelfactory import page_from_file, exif_from_filename | ||||||||||||||||
from ocrd_models import OcrdFile, OcrdExif | ||||||||||||||||
from ocrd_models.ocrd_page import to_xml | ||||||||||||||||
from ocrd_utils import ( | ||||||||||||||||
getLogger, | ||||||||||||||||
MIMETYPE_PAGE, | ||||||||||||||||
assert_file_grp_cardinality, | ||||||||||||||||
make_file_id | ||||||||||||||||
) | ||||||||||||||||
from typing import Optional | ||||||||||||||||
from ocrd_models import OcrdPage | ||||||||||||||||
from ocrd import Processor, OcrdPageResult | ||||||||||||||||
|
||||||||||||||||
from .eynollah import Eynollah | ||||||||||||||||
from .utils.pil_cv2 import pil2cv | ||||||||||||||||
|
||||||||||||||||
OCRD_TOOL = loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8')) | ||||||||||||||||
|
||||||||||||||||
class EynollahProcessor(Processor): | ||||||||||||||||
|
||||||||||||||||
def __init__(self, *args, **kwargs): | ||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||||||
kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-eynollah-segment'] | ||||||||||||||||
kwargs['version'] = OCRD_TOOL['version'] | ||||||||||||||||
super().__init__(*args, **kwargs) | ||||||||||||||||
def setup(self) -> None: | ||||||||||||||||
# for caching models | ||||||||||||||||
self.models = None | ||||||||||||||||
if self.parameter['textline_light'] and not self.parameter['light_mode']: | ||||||||||||||||
raise ValueError("Error: You set parameter 'textline_light' to enable light textline detection but parameter 'light_mode' is not enabled") | ||||||||||||||||
|
||||||||||||||||
def process(self): | ||||||||||||||||
LOG = getLogger('eynollah') | ||||||||||||||||
assert_file_grp_cardinality(self.input_file_grp, 1) | ||||||||||||||||
assert_file_grp_cardinality(self.output_file_grp, 1) | ||||||||||||||||
for n, input_file in enumerate(self.input_files): | ||||||||||||||||
page_id = input_file.pageId or input_file.ID | ||||||||||||||||
LOG.info("INPUT FILE %s (%d/%d) ", page_id, n + 1, len(self.input_files)) | ||||||||||||||||
pcgts = page_from_file(self.workspace.download_file(input_file)) | ||||||||||||||||
LOG.debug('width %s height %s', pcgts.get_Page().imageWidth, pcgts.get_Page().imageHeight) | ||||||||||||||||
self.add_metadata(pcgts) | ||||||||||||||||
page = pcgts.get_Page() | ||||||||||||||||
# XXX loses DPI information | ||||||||||||||||
# page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized') | ||||||||||||||||
image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(local_filename=page.imageFilename))).local_filename | ||||||||||||||||
eynollah_kwargs = { | ||||||||||||||||
'dir_models': self.resolve_resource(self.parameter['models']), | ||||||||||||||||
'allow_enhancement': False, | ||||||||||||||||
'curved_line': self.parameter['curved_line'], | ||||||||||||||||
'full_layout': self.parameter['full_layout'], | ||||||||||||||||
'allow_scaling': self.parameter['allow_scaling'], | ||||||||||||||||
'headers_off': self.parameter['headers_off'], | ||||||||||||||||
'tables': self.parameter['tables'], | ||||||||||||||||
'override_dpi': self.parameter['dpi'], | ||||||||||||||||
'logger': LOG, | ||||||||||||||||
'pcgts': pcgts, | ||||||||||||||||
'image_filename': image_filename | ||||||||||||||||
} | ||||||||||||||||
Eynollah(**eynollah_kwargs).run() | ||||||||||||||||
file_id = make_file_id(input_file, self.output_file_grp) | ||||||||||||||||
pcgts.set_pcGtsId(file_id) | ||||||||||||||||
self.workspace.add_file( | ||||||||||||||||
ID=file_id, | ||||||||||||||||
file_grp=self.output_file_grp, | ||||||||||||||||
pageId=page_id, | ||||||||||||||||
mimetype=MIMETYPE_PAGE, | ||||||||||||||||
local_filename=join(self.output_file_grp, file_id) + '.xml', | ||||||||||||||||
content=to_xml(pcgts)) | ||||||||||||||||
def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: | ||||||||||||||||
assert input_pcgts | ||||||||||||||||
assert input_pcgts[0] | ||||||||||||||||
assert self.parameter | ||||||||||||||||
pcgts = input_pcgts[0] | ||||||||||||||||
page = pcgts.get_Page() | ||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||||||
# if not('://' in page.imageFilename): | ||||||||||||||||
# image_filename = next(self.workspace.mets.find_files(local_filename=page.imageFilename)).local_filename | ||||||||||||||||
# else: | ||||||||||||||||
# # could be a URL with file:// or truly remote | ||||||||||||||||
# image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(url=page.imageFilename))).local_filename | ||||||||||||||||
page_image, _, _ = self.workspace.image_from_page( | ||||||||||||||||
Comment on lines
+21
to
+25
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||||||
page, page_id, | ||||||||||||||||
# avoid any features that would change the coordinate system: cropped,deskewed | ||||||||||||||||
# (the PAGE builder merely adds regions, so afterwards we would not know which to transform) | ||||||||||||||||
# also avoid binarization as models usually fare better on grayscale/RGB | ||||||||||||||||
feature_filter='cropped,deskewed,binarized') | ||||||||||||||||
eynollah = Eynollah( | ||||||||||||||||
self.resolve_resource(self.parameter['models']), | ||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||||||
self.logger, | ||||||||||||||||
allow_enhancement=self.parameter['allow_enhancement'], | ||||||||||||||||
curved_line=self.parameter['curved_line'], | ||||||||||||||||
light_version=self.parameter['light_mode'], | ||||||||||||||||
right2left=self.parameter['right_to_left'], | ||||||||||||||||
ignore_page_extraction=self.parameter['ignore_page_extraction'], | ||||||||||||||||
textline_light=self.parameter['textline_light'], | ||||||||||||||||
full_layout=self.parameter['full_layout'], | ||||||||||||||||
allow_scaling=self.parameter['allow_scaling'], | ||||||||||||||||
headers_off=self.parameter['headers_off'], | ||||||||||||||||
tables=self.parameter['tables'], | ||||||||||||||||
override_dpi=self.parameter['dpi'], | ||||||||||||||||
pcgts=pcgts, | ||||||||||||||||
image_filename=page.imageFilename, | ||||||||||||||||
image_pil=page_image | ||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||||||
) | ||||||||||||||||
if self.models is not None: | ||||||||||||||||
# reuse loaded models from previous page | ||||||||||||||||
eynollah.models = self.models | ||||||||||||||||
eynollah.run() | ||||||||||||||||
self.models = eynollah.models | ||||||||||||||||
return OcrdPageResult(pcgts) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.