Skip to content
This repository has been archived by the owner on Feb 19, 2024. It is now read-only.

Commit

Permalink
Adapt to 1.0.0b6, write out result
Browse files (browse the repository at this point in the history)
  • Loading branch information
kba committed Mar 21, 2019
1 parent 256751d commit 485523e
Show file tree
Hide file tree
Showing 5 changed files with 29 additions and 23 deletions.
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ help:
@echo " install pip install -e ."
@echo " uninstall pip uninstall $(PKG_NAME)"
@echo " assets Fetch test assets"
@echo " test Run tests"
@echo ""
@echo " Variables"
@echo ""
Expand Down
38 changes: 24 additions & 14 deletions ocrd_typegroups_classifier/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,15 @@
"""

from ocrd import Processor
from ocrd.model.ocrd_page import (
from_file,
from ocrd_utils import getLogger, concat_padded, MIMETYPE_PAGE
from ocrd_models.ocrd_page import (
to_xml
)
from ocrd_modelfactory import page_from_file

from .typegroups_classifier import TypegroupsClassifier
from .constants import OCRD_TOOL

from ocrd.utils import getLogger


class TypegroupsClassifierProcessor(Processor):

Expand All @@ -26,20 +25,21 @@ def process(self):
network_file = self.parameter['network']
stride = self.parameter['stride']
classifier = TypegroupsClassifier.load(network_file)

ignore_type = ('Adornment', 'Book covers and other irrelevant data', 'Empty Pages', 'Woodcuts - Engravings')

self.log.debug('Processing: ', self.input_files)

ignore_type = ('Adornment', 'Book covers and other irrelevant data',
'Empty Pages', 'Woodcuts - Engravings')

self.log.debug('Processing: %s', self.input_files)
for (_, input_file) in enumerate(self.input_files):
pcgts = from_file(self.workspace.download_file(input_file))
pcgts = page_from_file(self.workspace.download_file(input_file))
image_url = pcgts.get_Page().imageFilename
pil_image = self.workspace.resolve_image_as_pil(image_url)
result = classifier.run(pil_image, stride)
score_sum = 0
for typegroup in classifier.classMap.cl2id:
if not typegroup in ignore_type:
score_sum += max(0, result[typegroup])

script_highscore = 0
noise_highscore = 0
result_map = {}
Expand All @@ -52,13 +52,23 @@ def process(self):
script_highscore = max(script_highscore, score)
normalised_score = max(0, score / score_sum)
result_map[normalised_score] = typegroup
if noise_highscore>script_highscore:
if noise_highscore > script_highscore:
pcgts.get_Page().set_primaryScript(None)
self.log.debug('Detected only noise (such as empty page or book cover)')
self.log.debug(
'Detected only noise (such as empty page or book cover). noise_highscore=%s > script_highscore=%s',
noise_highscore, script_highscore)
else:
for k in sorted(result_map, reverse=True):
if output!='':
if output != '':
output = '%s, ' % output
output = '%s%s:%d' % (output, result_map[k], round(100*k))
pcgts.get_Page().set_primaryScript(output)
self.log.debug('Detected %s' % output)
pcgts.get_Page().set_primaryScript(output)
ID = concat_padded(self.output_file_grp, input_file.ID)
self.workspace.add_file(
ID=ID,
file_grp=self.output_file_grp,
mimetype=MIMETYPE_PAGE,
local_filename="%s/%s" % (self.output_file_grp, ID),
content=to_xml(pcgts)
)
8 changes: 1 addition & 7 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1 @@
click
ocrd >= 0.11.0
pandas
Pillow >= 5.3.0
scikit-image
torch
torchvision
.
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
version='0.0.1',
description='Typegroups classifier for OCR',
long_description=README,
long_description_content_type='text/markdown',
author='Matthias Seuret, Konstantin Baierer',
author_email='seuretm@users.noreply.github.com',
url='https://github.com/seuretm/ocrd_typegroups_classifier',
Expand All @@ -19,7 +20,7 @@
include_package_data=True,
install_requires=[
'click',
'ocrd >= 0.11.0',
'ocrd >= 1.0.0b6',
'pandas',
'Pillow >= 5.3.0',
'scikit-image',
Expand Down
2 changes: 1 addition & 1 deletion tests/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ net="${SCRIPTDIR}/../ocrd_typegroups_classifier/models/classifier.tgc"
cd "$SCRIPTDIR/assets/pembroke_werke_1766/data"
ocrd-typegroups-classifier \
-l DEBUG \
-g FILE_0010_DEFAULT \
-g PHYS_0011 \
-m mets.xml \
-I DEFAULT \
-O "OCR-D-FONTIDENT" \
Expand Down

0 comments on commit 485523e

Please sign in to comment.