bertsky · bertsky · Mar 6, 2025 · Aug 13, 2024 · Aug 13, 2024 · Aug 13, 2024
diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py
@@ -28,8 +28,6 @@
 
 #sys.path.append(os.path.dirname(os.path.abspath(__file__)))
 
-TOOL = 'ocrd-cis-ocropy-binarize'
-
 def binarize(pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zoom=1.0):
     LOG = getLogger('processor.OcropyBinarize')
     LOG.debug('binarizing %dx%d image with method=%s', pil_image.width, pil_image.height, method)
@@ -70,19 +68,25 @@ def binarize(pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zo
 class OcropyBinarize(Processor):
 
     def __init__(self, *args, **kwargs):
+        self.logger = getLogger('processor.OcropyBinarize')
         self.ocrd_tool = get_ocrd_tool()
-        kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL]
+        kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable]
         kwargs['version'] = self.ocrd_tool['version']
         super(OcropyBinarize, self).__init__(*args, **kwargs)
         if hasattr(self, 'output_file_grp'):
             # processing context
             self.setup()
-
+
+    @property
+    def executable(self):
+        return 'ocrd-cis-ocropy-binarize'
+
     def setup(self):
-        self.logger = getLogger('processor.OcropyBinarize')
-        if self.parameter['grayscale'] and self.parameter['method'] != 'ocropy':
-            self.logger.critical('requested method %s does not support grayscale normalized output',
-                                 self.parameter['method'])
+        assert_file_grp_cardinality(self.input_file_grp, 1)
+        assert_file_grp_cardinality(self.output_file_grp, 1)
+        method = self.parameter['method']
+        if self.parameter['grayscale'] and method != 'ocropy':
+            self.logger.critical('Requested method %s does not support grayscale normalized output', method)
             raise Exception('only method=ocropy allows grayscale=true')
 
     def process(self):

diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py
@@ -31,16 +31,23 @@
     pil2array, array2pil
 )
 
-TOOL = 'ocrd-cis-ocropy-clip'
-
 class OcropyClip(Processor):
 
     def __init__(self, *args, **kwargs):
+        self.logger = getLogger('processor.OcropyClip')
         self.ocrd_tool = get_ocrd_tool()
-        kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL]
+        kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable]
         kwargs['version'] = self.ocrd_tool['version']
         super(OcropyClip, self).__init__(*args, **kwargs)
 
+    @property
+    def executable(self):
+        return 'ocrd-cis-ocropy-clip'
+
+    def setup(self):
+        assert_file_grp_cardinality(self.input_file_grp, 1)
+        assert_file_grp_cardinality(self.output_file_grp, 1)
+
     def process(self):
         """Clip text regions / lines of the workspace at intersections with neighbours.
 
@@ -76,13 +83,12 @@ def process(self):
         # too. However, region-level clipping _must_ be run before region-level
         # deskewing, because that would make segments incomensurable with their
         # neighbours.
-        LOG = getLogger('processor.OcropyClip')
         level = self.parameter['level-of-operation']
         assert_file_grp_cardinality(self.input_file_grp, 1)
         assert_file_grp_cardinality(self.output_file_grp, 1)
 
         for (n, input_file) in enumerate(self.input_files):
-            LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID)
+            self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID)
             file_id = make_file_id(input_file, self.output_file_grp)
 
             pcgts = page_from_file(self.workspace.download_file(input_file))
@@ -98,7 +104,7 @@ def process(self):
                 dpi = page_image_info.resolution
                 if page_image_info.resolutionUnit == 'cm':
                     dpi *= 2.54
-                LOG.info('Page "%s" uses %f DPI', page_id, dpi)
+                self.logger.info('Page "%s" uses %f DPI', page_id, dpi)
                 zoom = 300.0/dpi
             else:
                 zoom = 1
@@ -120,7 +126,7 @@ def process(self):
                 page.get_TableRegion() +
                 page.get_UnknownRegion())
             if not num_texts:
-                LOG.warning('Page "%s" contains no text regions', page_id)
+                self.logger.warning('Page "%s" contains no text regions', page_id)
             background = ImageStat.Stat(page_image)
             # workaround for Pillow#4925
             if len(background.bands) > 1:
@@ -151,7 +157,7 @@ def process(self):
                 if level == 'region':
                     if region.get_AlternativeImage():
                         # FIXME: This should probably be an exception (bad workflow configuration).
-                        LOG.warning('Page "%s" region "%s" already contains image data: skipping',
+                        self.logger.warning('Page "%s" region "%s" already contains image data: skipping',
                                     page_id, region.id)
                         continue
                     shape = prep(shapes[i])
@@ -169,7 +175,7 @@ def process(self):
                 # level == 'line':
                 lines = region.get_TextLine()
                 if not lines:
-                    LOG.warning('Page "%s" region "%s" contains no text lines', page_id, region.id)
+                    self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id)
                     continue
                 region_image, region_coords = self.workspace.image_from_segment(
                     region, page_image, page_coords, feature_selector='binarized')
@@ -187,7 +193,7 @@ def process(self):
                 for j, line in enumerate(lines):
                     if line.get_AlternativeImage():
                         # FIXME: This should probably be an exception (bad workflow configuration).
-                        LOG.warning('Page "%s" region "%s" line "%s" already contains image data: skipping',
+                        self.logger.warning('Page "%s" region "%s" line "%s" already contains image data: skipping',
                                     page_id, region.id, line.id)
                         continue
                     shape = prep(shapes[j])
@@ -212,13 +218,12 @@ def process(self):
                 local_filename=file_path,
                 mimetype=MIMETYPE_PAGE,
                 content=to_xml(pcgts))
-            LOG.info('created file ID: %s, file_grp: %s, path: %s',
+            self.logger.info('created file ID: %s, file_grp: %s, path: %s',
                      file_id, self.output_file_grp, out.local_filename)
 
     def process_segment(self, segment, segment_mask, segment_polygon, neighbours,
                         background_image, parent_image, parent_coords, parent_bin,
                         page_id, file_id):
-        LOG = getLogger('processor.OcropyClip')
         # initialize AlternativeImage@comments classes from parent, except
         # for those operations that can apply on multiple hierarchy levels:
         features = ','.join(
@@ -230,7 +235,7 @@ def process_segment(self, segment, segment_mask, segment_polygon, neighbours,
         segment_bbox = bbox_from_polygon(segment_polygon)
         for neighbour, neighbour_mask in neighbours:
             if not np.any(segment_mask > neighbour_mask):
-                LOG.info('Ignoring enclosing neighbour "%s" of segment "%s" on page "%s"',
+                self.logger.info('Ignoring enclosing neighbour "%s" of segment "%s" on page "%s"',
                          neighbour.id, segment.id, page_id)
                 continue
             # find connected components that (only) belong to the neighbour:
@@ -240,7 +245,7 @@ def process_segment(self, segment, segment_mask, segment_polygon, neighbours,
             num_foreground = np.count_nonzero(segment_mask * parent_bin)
             if not num_intruders:
                 continue
-            LOG.debug('segment "%s" vs neighbour "%s": suppressing %d of %d pixels on page "%s"',
+            self.logger.debug('segment "%s" vs neighbour "%s": suppressing %d of %d pixels on page "%s"',
                       segment.id, neighbour.id, num_intruders, num_foreground, page_id)
             # suppress in segment_mask so these intruders can stay in the neighbours
             # (are not removed from both sides)

diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py
@@ -19,16 +19,23 @@
     # binarize,
     remove_noise)
 
-TOOL = 'ocrd-cis-ocropy-denoise'
-
 class OcropyDenoise(Processor):
 
     def __init__(self, *args, **kwargs):
+        self.logger = getLogger('processor.OcropyDenoise')
         self.ocrd_tool = get_ocrd_tool()
-        kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL]
+        kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable]
         kwargs['version'] = self.ocrd_tool['version']
         super(OcropyDenoise, self).__init__(*args, **kwargs)
 
+    @property
+    def executable(self):
+        return 'ocrd-cis-ocropy-denoise'
+
+    def setup(self):
+        assert_file_grp_cardinality(self.input_file_grp, 1)
+        assert_file_grp_cardinality(self.output_file_grp, 1)
+
     def process(self):
         """Despeckle the pages / regions / lines of the workspace.
 
@@ -50,13 +57,12 @@ def process(self):
 
         Produce a new output file by serialising the resulting hierarchy.
         """
-        LOG = getLogger('processor.OcropyDenoise')
         level = self.parameter['level-of-operation']
         assert_file_grp_cardinality(self.input_file_grp, 1)
         assert_file_grp_cardinality(self.output_file_grp, 1)
 
         for (n, input_file) in enumerate(self.input_files):
-            LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID)
+            self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID)
             file_id = make_file_id(input_file, self.output_file_grp)
 
             pcgts = page_from_file(self.workspace.download_file(input_file))
@@ -73,7 +79,7 @@ def process(self):
                 dpi = page_image_info.resolution
                 if page_image_info.resolutionUnit == 'cm':
                     dpi *= 2.54
-                LOG.info('Page "%s" uses %f DPI', page_id, dpi)
+                self.logger.info('Page "%s" uses %f DPI', page_id, dpi)
                 zoom = 300.0/dpi
             else:
                 zoom = 1
@@ -84,7 +90,7 @@ def process(self):
             else:
                 regions = page.get_AllRegions(classes=['Text'], order='reading-order')
                 if not regions:
-                    LOG.warning('Page "%s" contains no text regions', page_id)
+                    self.logger.warning('Page "%s" contains no text regions', page_id)
                 for region in regions:
                     region_image, region_xywh = self.workspace.image_from_segment(
                         region, page_image, page_xywh,
@@ -95,7 +101,7 @@ def process(self):
                         continue
                     lines = region.get_TextLine()
                     if not lines:
-                        LOG.warning('Page "%s" region "%s" contains no text lines', page_id, region.id)
+                        self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id)
                     for line in lines:
                         line_image, line_xywh = self.workspace.image_from_segment(
                             line, region_image, region_xywh,
@@ -114,15 +120,14 @@ def process(self):
                 local_filename=file_path,
                 mimetype=MIMETYPE_PAGE,
                 content=to_xml(pcgts))
-            LOG.info('created file ID: %s, file_grp: %s, path: %s',
+            self.logger.info('created file ID: %s, file_grp: %s, path: %s',
                      file_id, self.output_file_grp, out.local_filename)
 
     def process_segment(self, segment, segment_image, segment_xywh, zoom, page_id, file_id):
-        LOG = getLogger('processor.OcropyDenoise')
         if not segment_image.width or not segment_image.height:
-            LOG.warning("Skipping '%s' with zero size", file_id)
+            self.logger.warning("Skipping '%s' with zero size", file_id)
             return
-        LOG.info("About to despeckle '%s'", file_id)
+        self.logger.info("About to despeckle '%s'", file_id)
         bin_image = remove_noise(segment_image,
                                  maxsize=self.parameter['noise_maxsize']/zoom*300/72) # in pt
         # update METS (add the image file):

diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py
@@ -33,11 +33,20 @@ def deskew(pil_image, maxskew=2):
 class OcropyDeskew(Processor):
 
     def __init__(self, *args, **kwargs):
+        self.logger = getLogger('processor.OcropyDeskew')
         ocrd_tool = get_ocrd_tool()
-        kwargs['ocrd_tool'] = ocrd_tool['tools'][TOOL]
+        kwargs['ocrd_tool'] = ocrd_tool['tools'][self.executable]
         kwargs['version'] = ocrd_tool['version']
         super(OcropyDeskew, self).__init__(*args, **kwargs)
 
+    @property
+    def executable(self):
+        return 'ocrd-cis-ocropy-deskew'
+
+    def setup(self):
+        assert_file_grp_cardinality(self.input_file_grp, 1)
+        assert_file_grp_cardinality(self.output_file_grp, 1)
+
     def process(self):
         """Deskew the pages or regions of the workspace.
 
@@ -56,13 +65,12 @@ def process(self):
 
         Produce a new output file by serialising the resulting hierarchy.
         """
-        LOG = getLogger('processor.OcropyDeskew')
         level = self.parameter['level-of-operation']
         assert_file_grp_cardinality(self.input_file_grp, 1)
         assert_file_grp_cardinality(self.output_file_grp, 1)
 
         for (n, input_file) in enumerate(self.input_files):
-            LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID)
+            self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID)
             file_id = make_file_id(input_file, self.output_file_grp)
 
             pcgts = page_from_file(self.workspace.download_file(input_file))
@@ -86,7 +94,7 @@ def process(self):
                 else: # region
                     regions = page.get_AllRegions(classes=['Text'], order='reading-order')
                 if not regions:
-                    LOG.warning('Page "%s" contains no text regions', page_id)
+                    self.logger.warning('Page "%s" contains no text regions', page_id)
                 for region in regions:
                     # process region:
                     region_image, region_coords = self.workspace.image_from_segment(
@@ -109,23 +117,22 @@ def process(self):
                 local_filename=file_path,
                 mimetype=MIMETYPE_PAGE,
                 content=to_xml(pcgts))
-            LOG.info('created file ID: %s, file_grp: %s, path: %s',
+            self.logger.info('created file ID: %s, file_grp: %s, path: %s',
                      file_id, self.output_file_grp, out.local_filename)
 
     def _process_segment(self, segment, segment_image, segment_coords, segment_id, page_id, file_id):
-        LOG = getLogger('processor.OcropyDeskew')
         if not segment_image.width or not segment_image.height:
-            LOG.warning("Skipping %s with zero size", segment_id)
+            self.logger.warning("Skipping %s with zero size", segment_id)
             return
         angle0 = segment_coords['angle'] # deskewing (w.r.t. top image) already applied to segment_image
-        LOG.info("About to deskew %s", segment_id)
+        self.logger.info("About to deskew %s", segment_id)
         angle = deskew(segment_image, maxskew=self.parameter['maxskew']) # additional angle to be applied
         # segment angle: PAGE orientation is defined clockwise,
         # whereas PIL/ndimage rotation is in mathematical direction:
         orientation = -(angle + angle0)
         orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180]
         segment.set_orientation(orientation) # also removes all deskewed AlternativeImages
-        LOG.info("Found angle for %s: %.1f", segment_id, angle)
+        self.logger.info("Found angle for %s: %.1f", segment_id, angle)
         # delegate reflection, rotation and re-cropping to core:
         if isinstance(segment, PageType):
             segment_image, segment_coords, _ = self.workspace.image_from_page(

diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py
@@ -24,8 +24,6 @@
 
 #sys.path.append(os.path.dirname(os.path.abspath(__file__)))
 
-TOOL = 'ocrd-cis-ocropy-dewarp'
-
 class InvalidLine(Exception):
     """Line image does not allow dewarping and should be ignored."""
 
@@ -71,15 +69,22 @@ def padvert(image, range_):
 class OcropyDewarp(Processor):
 
     def __init__(self, *args, **kwargs):
+        self.logger = getLogger('processor.OcropyDewarp')
         self.ocrd_tool = get_ocrd_tool()
-        kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL]
+        kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable]
         kwargs['version'] = self.ocrd_tool['version']
         super(OcropyDewarp, self).__init__(*args, **kwargs)
         if hasattr(self, 'output_file_grp'):
             # processing context
             self.setup()
-
+
+    @property
+    def executable(self):
+        return 'ocrd-cis-ocropy-dewarp'
+
     def setup(self):
+        assert_file_grp_cardinality(self.input_file_grp, 1)
+        assert_file_grp_cardinality(self.output_file_grp, 1)
         # defaults from ocrolib.lineest:
         self.lnorm = lineest.CenterNormalizer(
             params=(self.parameter['range'],
@@ -89,7 +94,6 @@ def setup(self):
                     #  dependency between smoothness
                     #  and extra params)
                     0.3))
-        self.logger = getLogger('processor.OcropyDewarp')
 
     def process(self):
         """Dewarp the lines of the workspace.