How to get bboxes and respective texts, confidence scores like easyocr or paddleocr #29
-
Hi team, it's a great repo. I have tested it, and the detection and recognition are working very well, but I need to get the area/location of the text in the document together with the confidence score. I have extracted the geometries from the JSON export, but they appear to be normalized (that is just my guess). Please help me with this. Thanks |
Beta Was this translation helpful? Give feedback.
Replies: 2 comments
-
Hi @saichandrareddy1 👋 Here is a detailed script to help you understand how to get the information you need 🤗 If you like the repo, feel free to give it a ⭐
import requests
import cv2
import numpy as np
from onnxtr.io import DocumentFile
from onnxtr.models import ocr_predictor
# Fetch an example image
image_url = "https://huggingface.co/datasets/huggingfacejs/tasks/resolve/main/document-question-answering/document-question-answering-input.png"
# A timeout keeps the script from hanging forever on a stalled connection
response = requests.get(image_url, timeout=30)
# Fail early on an HTTP error instead of handing an error page to the OCR model
response.raise_for_status()
bytes_data = response.content
# Convert relative coordinates to absolute pixel values
def _to_absolute(geom, img_shape: tuple[int, int]) -> list[list[int]]:
h, w = img_shape
if len(geom) == 2: # Assume straight pages = True -> [[xmin, ymin], [xmax, ymax]]
(xmin, ymin), (xmax, ymax) = geom
xmin, xmax = int(round(w * xmin)), int(round(w * xmax))
ymin, ymax = int(round(h * ymin)), int(round(h * ymax))
return [[xmin, ymin], [xmax, ymin], [xmax, ymax], [xmin, ymax]]
# Assume straight pages = False -> [[x1, y1], [x2, y2], [x3, y3], [x4, y4]]
else: # For polygons, convert each point to absolute coordinates
return [[int(point[0] * w), int(point[1] * h)] for point in geom]
def _draw_polygon(canvas, points, color) -> None:
    """Draw a closed, 2 px wide polygon outline on ``canvas`` (in place)."""
    # Build the (n_points, 1, 2) contour with an explicit int32 dtype rather
    # than relying on an implicit integer conversion inside OpenCV.
    contour = np.array(points, dtype=np.int32).reshape(-1, 1, 2)
    cv2.polylines(canvas, [contour], True, color, 2)


# Load the document and model
doc = DocumentFile.from_images(bytes_data)
# NOTE: Change me to False if the page is not straight or contains rotated text
model = ocr_predictor(assume_straight_pages=True)
res = model(doc)
json_res = res.export()

# Decode the image (only for visualization purposes)
image = cv2.imdecode(np.frombuffer(bytes_data, np.uint8), cv2.IMREAD_COLOR)

for page in json_res["pages"]:
    page_idx = page["page_idx"]  # The index of the page
    shape = page["dimensions"]  # The shape of the page (height, width)
    # Dict with the orientation of the page (angle in degrees, confidence)
    # (if detect_orientation is True and/or assume_straight_pages is False)
    orientation = page["orientation"]
    language = page["language"]  # The detected language of the page (if detect_language is True)

    for block in page["blocks"]:
        # The geometry of the block (now in absolute coordinates)
        block_geom = _to_absolute(block["geometry"], shape)
        # The average objectness score of the block (over lines in the block)
        block_objectness_score = block["objectness_score"]
        # Draw the block on the image
        _draw_polygon(image, block_geom, (0, 255, 0))

        for line in block["lines"]:
            # The geometry of the line (now in absolute coordinates)
            line_geom = _to_absolute(line["geometry"], shape)
            # The average objectness score of the line (over words in the line)
            line_objectness_score = line["objectness_score"]
            # Draw the line on the image
            _draw_polygon(image, line_geom, (0, 0, 255))

            for word in line["words"]:
                # The geometry of the word (now in absolute coordinates)
                word_geom = _to_absolute(word["geometry"], shape)
                word_objectness_score = word["objectness_score"]  # The objectness score of the word crop
                value = word["value"]  # The text value of the word
                confidence = word["confidence"]  # The confidence of the word
                # Dict with the orientation of the word crop (angle in degrees, confidence)
                word_crop_orientation = word["crop_orientation"]
                # Draw the word on the image
                _draw_polygon(image, word_geom, (255, 0, 0))
# Save the final image with drawn polygons
cv2.imwrite("output.png", image)

Does this answer your question? :) |
Beta Was this translation helpful? Give feedback.
-
Thanks for your reply, it's helpful :) |
Beta Was this translation helpful? Give feedback.
Hi @saichandrareddy1 👋
Here is a detailed script to help you understand how to get the information you need 🤗
If you like the repo, feel free to give it a ⭐