-
Notifications
You must be signed in to change notification settings - Fork 152
[Enhancement]: Propagate ocr confidence to output hocr file #86
Changes from 1 commit
c2460ad
1475c9e
3ef33b3
57b96f0
ec5e511
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -42,7 +42,7 @@ class Box(object): | |
was used. | ||
""" | ||
|
||
def __init__(self, content, position): | ||
def __init__(self, content, position, confidence=None): | ||
""" | ||
Arguments: | ||
content --- a single string | ||
|
@@ -53,15 +53,17 @@ def __init__(self, content, position): | |
content = to_unicode(content) | ||
self.content = content | ||
self.position = position | ||
self.confidence = confidence | ||
|
||
def get_unicode_string(self): | ||
""" | ||
Return the string corresponding to the box, in unicode (utf8). | ||
This string can be stored in a file as-is (see write_box_file()) | ||
and reread using read_box_file(). | ||
""" | ||
return to_unicode("%s %d %d %d %d") % ( | ||
return to_unicode("%s %s %d %d %d %d") % ( | ||
self.content, | ||
self.confidence, | ||
self.position[0][0], | ||
self.position[0][1], | ||
self.position[1][0], | ||
|
@@ -71,9 +73,10 @@ def get_unicode_string(self): | |
def get_xml_tag(self, parent_doc): | ||
span_tag = parent_doc.createElement("span") | ||
span_tag.setAttribute("class", "ocrx_word") | ||
span_tag.setAttribute("title", ("bbox %d %d %d %d" % ( | ||
span_tag.setAttribute("title", ("bbox %d %d %d %d; x_wconf %d" % ( | ||
(self.position[0][0], self.position[0][1], | ||
self.position[1][0], self.position[1][1])))) | ||
self.position[1][0], self.position[1][1], | ||
self.confidence)))) | ||
txt = xml.dom.minidom.Text() | ||
txt.data = self.content | ||
span_tag.appendChild(txt) | ||
|
@@ -268,7 +271,7 @@ def start_line(self, box): | |
""" | ||
raise NotImplementedError("Implement in subclasses") | ||
|
||
def add_word(self, word, box): | ||
def add_word(self, word, box, confidence): | ||
""" | ||
Add a word to output. | ||
""" | ||
|
@@ -329,7 +332,7 @@ def write_file(file_descriptor, text): | |
def start_line(self, box): | ||
self.built_text.append(u"") | ||
|
||
def add_word(self, word, box): | ||
def add_word(self, word, box, confidence=None): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think you missed my point 1 :-) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ah, sorry, I did miss your point! The parameter has now been made optional for all |
||
if self.built_text[-1] != u"": | ||
self.built_text[-1] += u" " | ||
self.built_text[-1] += word | ||
|
@@ -381,12 +384,23 @@ def __init__(self): | |
|
||
self.__current_box_position = None | ||
self.__current_box_text = None | ||
self.__current_box_confidence = None | ||
self.boxes = [] | ||
|
||
self.__current_line_position = None | ||
self.__current_line_content = [] | ||
self.lines = [] | ||
|
||
@staticmethod | ||
def __parse_confidence(title): | ||
for piece in title.split("; "): | ||
piece = piece.strip() | ||
if not piece.startswith("x_wconf"): | ||
continue | ||
confidence = piece.split(" ")[1] | ||
return int(confidence) | ||
raise Exception("Invalid hocr confidence measure: %s" % title) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Corrected! |
||
|
||
@staticmethod | ||
def __parse_position(title): | ||
for piece in title.split("; "): | ||
|
@@ -413,7 +427,9 @@ def handle_starttag(self, tag, attrs): | |
return | ||
if tag_type == 'ocr_word' or tag_type == 'ocrx_word': | ||
try: | ||
confidence = self.__parse_confidence(position) | ||
position = self.__parse_position(position) | ||
self.__current_box_confidence = confidence | ||
self.__current_box_position = position | ||
except Exception: | ||
# invalid position --> old format --> we ignore this tag | ||
|
@@ -439,7 +455,7 @@ def handle_endtag(self, tag): | |
if self.__current_box_text is None: | ||
return | ||
box_position = self.__current_box_position | ||
box = Box(self.__current_box_text, box_position) | ||
box = Box(self.__current_box_text, box_position, self.__current_box_confidence) | ||
self.boxes.append(box) | ||
self.__current_line_content.append(box) | ||
self.__current_box_text = None | ||
|
@@ -596,8 +612,8 @@ def write_file(file_descriptor, boxes): | |
def start_line(self, box): | ||
pass | ||
|
||
def add_word(self, word, box): | ||
self.word_boxes.append(Box(word, box)) | ||
def add_word(self, word, box, confidence): | ||
self.word_boxes.append(Box(word, box, confidence)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I suggest you try running the tests. Make sure you have Unfortunately, the outputs of the tests slightly vary based on the exact version of Tesseract and Cuneiform you're using (and wind direction I guess ....). So you will have to filter the failed tests manually: ignore those that failed just because the output has slightly changed, and just focus on the ones that failed because the API broke due to your changes. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fair point! Here is what I did:
All in all, this basically means that the confidence measure is propagated to the output hocr files for the tesseract and libtesseract interfaces and a value of 0 is used for all the words when using cuneiform. Do you think that makes sense? I've done some manual testing with Cuneiform, Tesseract and libtesseract to verify everything was working as expected (looking at the output hocr files). There are however still many failing unit-tests when running the test suite. I must say that it's quite hard to know if it's because I broke the API or just because tesseract feels like spitting out a different output 😅 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, I know, they are a pain. Unfortunately I haven't found a better way :/ |
||
|
||
def end_line(self): | ||
pass | ||
|
@@ -680,8 +696,8 @@ def start_line(self, box): | |
return | ||
self.lines.append(LineBox([], box)) | ||
|
||
def add_word(self, word, box): | ||
self.lines[-1].word_boxes.append(Box(word, box)) | ||
def add_word(self, word, box, confidence): | ||
self.lines[-1].word_boxes.append(Box(word, box, confidence)) | ||
|
||
def end_line(self): | ||
pass | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As is, it will break
tesseract.CharBoxBuilder
.CharBoxBuilder
corresponds to a file format specific to Tesseract (configuration 'makebox'). If you modify this function, the format won't be the same as Tesseract's anymore, and CharBoxBuilder.read_file()
won't be able to read files written by CharBoxBuilder.write_file()
anymore. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Reverted changes