diff --git a/easyocr/cli.py b/easyocr/cli.py index e8520b8b1de..b70006c65e0 100644 --- a/easyocr/cli.py +++ b/easyocr/cli.py @@ -229,7 +229,7 @@ def parse_args(): parser.add_argument( "--output_format", type=str, - choices=["standard", 'dict', 'json'], + choices=["standard", 'dict', 'json', "hocr"], default='standard', help="output format.", ) diff --git a/easyocr/easyocr.py b/easyocr/easyocr.py index 681c05b3ce3..310727b32db 100644 --- a/easyocr/easyocr.py +++ b/easyocr/easyocr.py @@ -4,7 +4,7 @@ from .utils import group_text_box, get_image_list, calculate_md5, get_paragraph,\ download_and_unzip, printProgressBar, diff, reformat_input,\ make_rotated_img_list, set_result_with_confidence,\ - reformat_input_batched, merge_to_free + reformat_input_batched, merge_to_free, to_hocr from .config import * from bidi.algorithm import get_display import numpy as np @@ -434,6 +434,8 @@ def recognize(self, img_cv_grey, horizontal_list=None, free_list=None,\ return [json.dumps({'boxes':[list(map(int, lst)) for lst in item[0]],'text':item[1],'confident':item[2]}, ensure_ascii=False) for item in result] elif output_format == 'free_merge': return merge_to_free(result, free_list) + elif output_format == "hocr": + return to_hocr(result) else: return result diff --git a/easyocr/utils.py b/easyocr/utils.py index 987baf2c9a6..56687e907f2 100644 --- a/easyocr/utils.py +++ b/easyocr/utils.py @@ -8,6 +8,7 @@ from PIL import Image, JpegImagePlugin from scipy import ndimage import hashlib +import html import sys, os from zipfile import ZipFile from .imgproc import loadImage @@ -383,6 +384,49 @@ def decode_wordbeamsearch(self, mat, beamWidth=5): texts.append(string) return texts +OCR_PREAMBLE = """ + + + +
+