Skip to content

Commit

Permalink
Add support for Tesseract version 3.05.00
Browse files Browse the repository at this point in the history
This is a bit more involved, because Tesseract 3.05.00 comes not only
with improvements but also with a few quirks we need to deal with.

The first quirk is that the order of the arguments to the `tesseract'
command now matters: the list of configurations has to be at the end of
the command line. So we add a new attribute tesseract_flags to the
BaseBuilder class that contains a list of all the flags to pass to
`tesseract', the tesseract_configs attribute however remains pretty much
the same but now only really contains a list of configs instead of being
mixed with flag arguments.

Another quirk has to do with Leptonica >= 1.74 which Tesseract 3.05.00
now requires. Leptonica has special handling of files that reside in
/tmp and assumes that they are internal temporary files of Leptonica. In
order to deal with this, we now run Tesseract in a temporary directory
that contains the input/output files, and we use the relative names of
these files because Leptonica only searches for path names beginning
with /tmp.

Fortunately the last item we need to address is not really a quirk, but
an API change. In Tesseract 3.05.00 there is now a new function called
TessBaseAPIDetectOrientationScript(), which doesn't fill the OSResults
object anymore but now allows us to pass the values we're interested in
directly by reference. We need to use this new function because the old
function TessBaseAPIDetectOS() now *always* returns false.

Ran the test suite successfully with Python 3.5 and both Tesseract
3.04.01 and 3.05.00 except the following tests, which also didn't
succeed prior to this commit:

 * cuneiform:TestTxt.test_basic
 * cuneiform:TestTxt.test_european
 * cuneiform:TestTxt.test_french
 * cuneiform:TestWordBox.test_basic
 * cuneiform:TestWordBox.test_european
 * cuneiform:TestWordBox.test_french
 * libtesseract:TestBasicDoc.test_basic
 * libtesseract:TestDigitLineBox.test_digits
 * libtesseract:TestLineBox.test_japanese
 * libtesseract:TestTxt.test_japanese
 * libtesseract:TestWordBox.test_japanese
 * tesseract:TestDigitLineBox.test_digits
 * tesseract:TestTxt.test_japanese

The failure of these test cases is probably related to issue openpaperwork#52, but
from looking at the failures it doesn't seem to be related to this
change anyway.

Signed-off-by: aszlig <aszlig@redmoonstudios.org>
  • Loading branch information
aszlig committed Apr 9, 2017
1 parent f232402 commit d385269
Show file tree
Hide file tree
Showing 5 changed files with 101 additions and 46 deletions.
20 changes: 13 additions & 7 deletions src/pyocr/builders.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,8 +240,10 @@ class BaseBuilder(object):
cuneiform_args : Arguments passed to the Cuneiform command line.
"""

def __init__(self, file_extensions, tesseract_configs, cuneiform_args):
def __init__(self, file_extensions, tesseract_flags, tesseract_configs,
cuneiform_args):
self.file_extensions = file_extensions
self.tesseract_flags = tesseract_flags
self.tesseract_configs = tesseract_configs
self.cuneiform_args = cuneiform_args

Expand Down Expand Up @@ -298,15 +300,15 @@ class TextBuilder(BaseBuilder):
def __init__(self, tesseract_layout=3, cuneiform_dotmatrix=False,
cuneiform_fax=False, cuneiform_singlecolumn=False):
file_ext = ["txt"]
tess_conf = ["-psm", str(tesseract_layout)]
tess_flags = ["-psm", str(tesseract_layout)]
cun_args = ["-f", "text"]
# Add custom cuneiform parameters if needed
for par, arg in [(cuneiform_dotmatrix, "--dotmatrix"),
(cuneiform_fax, "--fax"),
(cuneiform_singlecolumn, "--singlecolumn")]:
if par:
cun_args.append(arg)
super(TextBuilder, self).__init__(file_ext, tess_conf, cun_args)
super(TextBuilder, self).__init__(file_ext, tess_flags, [], cun_args)
self.tesseract_layout = tesseract_layout
self.built_text = []

Expand Down Expand Up @@ -540,9 +542,11 @@ class WordBoxBuilder(BaseBuilder):

def __init__(self, tesseract_layout=1):
file_ext = ["html", "hocr"]
tess_conf = ["hocr", "-psm", str(tesseract_layout)]
tess_flags = ["-psm", str(tesseract_layout)]
tess_conf = ["hocr"]
cun_args = ["-f", "hocr"]
super(WordBoxBuilder, self).__init__(file_ext, tess_conf, cun_args)
super(WordBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf,
cun_args)
self.word_boxes = []
self.tesseract_layout = tesseract_layout

Expand Down Expand Up @@ -614,9 +618,11 @@ class LineBoxBuilder(BaseBuilder):

def __init__(self, tesseract_layout=1):
file_ext = ["html", "hocr"]
tess_conf = ["hocr", "-psm", str(tesseract_layout)]
tess_flags = ["-psm", str(tesseract_layout)]
tess_conf = ["hocr"]
cun_args = ["-f", "hocr"]
super(LineBoxBuilder, self).__init__(file_ext, tess_conf, cun_args)
super(LineBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf,
cun_args)
self.lines = []
self.tesseract_layout = tesseract_layout

Expand Down
67 changes: 50 additions & 17 deletions src/pyocr/libtesseract/tesseract_raw.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,11 +263,22 @@ class OSResults(ctypes.Structure):
]
g_libtesseract.TessDeleteText.restype = None

g_libtesseract.TessBaseAPIDetectOS.argtypes = [
ctypes.c_void_p, # TessBaseAPI*
ctypes.POINTER(OSResults),
]
g_libtesseract.TessBaseAPIDetectOS.restype = ctypes.c_bool
if hasattr(g_libtesseract, 'TessBaseAPIDetectOrientationScript'):
g_libtesseract.TessBaseAPIDetectOrientationScript.argtypes = [
ctypes.c_void_p, # TessBaseAPI*
ctypes.POINTER(ctypes.c_int), # orient_deg
ctypes.POINTER(ctypes.c_float), # orient_conf
ctypes.POINTER(ctypes.c_char_p), # script_name
ctypes.POINTER(ctypes.c_float), # script_conf
]
g_libtesseract.TessBaseAPIDetectOrientationScript.restype = \
ctypes.c_bool
else:
g_libtesseract.TessBaseAPIDetectOS.argtypes = [
ctypes.c_void_p, # TessBaseAPI*
ctypes.POINTER(OSResults),
]
g_libtesseract.TessBaseAPIDetectOS.restype = ctypes.c_bool


def init(lang=None):
Expand Down Expand Up @@ -526,15 +537,37 @@ def detect_os(handle):
global g_libtesseract
assert(g_libtesseract)

results = OSResults()
r = g_libtesseract.TessBaseAPIDetectOS(
ctypes.c_void_p(handle),
ctypes.pointer(results)
)
if not r:
raise TesseractError("detect_orientation failed",
"TessBaseAPIDetectOS() failed")
return {
"orientation": results.best_orientation_id,
"confidence": results.best_oconfidence,
}
# Use the new API function if it is available, because since Tesseract
# 3.05.00 the old API function _always_ returns False.
if hasattr(g_libtesseract, 'TessBaseAPIDetectOrientationScript'):
orientation_deg = ctypes.c_int(0)
orientation_confidence = ctypes.c_float(0.0)

r = g_libtesseract.TessBaseAPIDetectOrientationScript(
ctypes.c_void_p(handle),
ctypes.byref(orientation_deg),
ctypes.byref(orientation_confidence),
None, # script_name
None # script_confidence
)

if not r:
raise TesseractError("detect_orientation failed",
"TessBaseAPIDetectOrientationScript() failed")
return {
"orientation": round(orientation_deg.value / 90),
"confidence": orientation_confidence.value,
}
else: # old API (before Tesseract 3.05.00)
results = OSResults()
r = g_libtesseract.TessBaseAPIDetectOS(
ctypes.c_void_p(handle),
ctypes.pointer(results)
)
if not r:
raise TesseractError("detect_orientation failed",
"TessBaseAPIDetectOS() failed")
return {
"orientation": results.best_orientation_id,
"confidence": results.best_oconfidence,
}
54 changes: 34 additions & 20 deletions src/pyocr/tesseract.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
import subprocess
import sys
import tempfile
import contextlib
import shutil

from . import builders
from . import error
Expand Down Expand Up @@ -62,9 +64,11 @@ class CharBoxBuilder(builders.BaseBuilder):

def __init__(self):
file_ext = ["box"]
tess_flags = []
tess_conf = ["batch.nochop", "makebox"]
cun_args = []
super(CharBoxBuilder, self).__init__(file_ext, tess_conf, cun_args)
super(CharBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf,
cun_args)
self.tesseract_layout = 1

@staticmethod
Expand Down Expand Up @@ -173,18 +177,19 @@ def detect_orientation(image, lang=None):
TesseractError --- if no script detected on the image
"""
_set_environment()
with temp_file(".bmp") as input_file:
command = [TESSERACT_CMD, input_file.name, 'stdout', "-psm", "0"]
with temp_dir() as tmpdir:
command = [TESSERACT_CMD, "input.bmp", 'stdout', "-psm", "0"]
if lang is not None:
command += ['-l', lang]

if image.mode != "RGB":
image = image.convert("RGB")
image.save(input_file.name)
image.save(os.path.join(tmpdir, "input.bmp"))

proc = subprocess.Popen(command, stdin=subprocess.PIPE, shell=False,
startupinfo=g_subprocess_startup_info,
creationflags=g_creation_flags,
cwd=tmpdir,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT)
proc.stdin.close()
Expand Down Expand Up @@ -224,8 +229,8 @@ def get_available_builders():
]


def run_tesseract(input_filename, output_filename_base, lang=None,
configs=None):
def run_tesseract(input_filename, output_filename_base, cwd=None, lang=None,
flags=None, configs=None):
'''
Runs Tesseract:
`TESSERACT_CMD` \
Expand All @@ -238,6 +243,8 @@ def run_tesseract(input_filename, output_filename_base, lang=None,
input_filename --- image to read
output_filename_base --- file name in which must be stored the result
(without the extension)
cwd --- Run Tesseract in the specified working directory or use current
one if None
lang --- Tesseract language to use (if None, none will be specified)
config --- List of Tesseract configs to use (if None, none will be
specified)
Expand All @@ -252,10 +259,13 @@ def run_tesseract(input_filename, output_filename_base, lang=None,
if lang is not None:
command += ['-l', lang]

if flags is not None:
command += flags

if configs is not None:
command += configs

proc = subprocess.Popen(command,
proc = subprocess.Popen(command, cwd=cwd,
startupinfo=g_subprocess_startup_info,
creationflags=g_creation_flags,
stdout=subprocess.PIPE,
Expand Down Expand Up @@ -301,11 +311,18 @@ def close(self):
self.name = None


def temp_file(suffix):
''' Returns a temporary file '''
if os.name == 'nt': # Windows
return ReOpenableTempfile(suffix)
return tempfile.NamedTemporaryFile(prefix='tess_', suffix=suffix)
@contextlib.contextmanager
def temp_dir():
"""
A context manager for maintaining a temporary directory
"""
# NOTE: Drop this as soon as we don't support Python 2.7 anymore, because
# since Python 3.2 there is a context manager called TemporaryDirectory().
path = tempfile.mkdtemp(prefix='tess_')
try:
yield path
finally:
shutil.rmtree(path)


def image_to_string(image, lang=None, builder=None):
Expand All @@ -329,23 +346,20 @@ def image_to_string(image, lang=None, builder=None):

if builder is None:
builder = builders.TextBuilder()
with temp_file(".bmp") as input_file:
with temp_file('') as output_file:
output_file_name_base = output_file.name

with temp_dir() as tmpdir:
if image.mode != "RGB":
image = image.convert("RGB")
image.save(input_file.name)
(status, errors) = run_tesseract(input_file.name,
output_file_name_base,
image.save(os.path.join(tmpdir, "input.bmp"))
(status, errors) = run_tesseract("input.bmp", "output", cwd=tmpdir,
lang=lang,
flags=builder.tesseract_flags,
configs=builder.tesseract_configs)
if status:
raise TesseractError(status, errors)

output_file_name = "ERROR"
for file_extension in builder.file_extensions:
output_file_name = ('%s.%s' % (output_file_name_base,
output_file_name = ('%s.%s' % (os.path.join(tmpdir, "output"),
file_extension))
if not os.access(output_file_name, os.F_OK):
continue
Expand Down
3 changes: 2 additions & 1 deletion tests/tests_libtesseract.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,9 @@ def test_version(self):
(3, 3, 0),
(3, 4, 0),
(3, 4, 1),
(3, 5, 0),
), ("Tesseract does not have the expected version"
" (3.4.0) ! Some tests will be skipped !"))
" (3.5.0) ! Some tests will be skipped !"))

def test_langs(self):
langs = libtesseract.get_available_languages()
Expand Down
3 changes: 2 additions & 1 deletion tests/tests_tesseract.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,9 @@ def test_version(self):
(3, 3, 0),
(3, 4, 0),
(3, 4, 1),
(3, 5, 0),
), ("Tesseract does not have the expected version"
" (3.4.0) ! Some tests will be skipped !"))
" (3.5.0) ! Some tests will be skipped !"))

def test_langs(self):
langs = tesseract.get_available_languages()
Expand Down

0 comments on commit d385269

Please sign in to comment.