How to get bboxes and respective texts, confidence scores like easyocr or paddleocr #29

saichandrareddy1 · 2024-08-14T10:44:09Z

saichandrareddy1
Aug 14, 2024

Hi Team,

It's a great repo. I have tested it, and detection and recognition are working very well, but I need to get the area/location of the text in the document together with its confidence score.

I have extracted the geometries from the JSON export, but they seem to be normalized (that is just my guess).

Could you please help me with this?

Thanks

Answered by felixdittrich92

Aug 15, 2024

Hi @saichandrareddy1 👋

Here is a detailed script showing how you can get the information you need 🤗

If you like the repo feel free to give a ⭐

import requests
import cv2
import numpy as np

from onnxtr.io import DocumentFile
from onnxtr.models import ocr_predictor

# Fetch an example image
image_url = "https://huggingface.co/datasets/huggingfacejs/tasks/resolve/main/document-question-answering/document-question-answering-input.png"
# Use a timeout so a stalled connection cannot hang the script forever, and
# fail loudly on HTTP errors instead of passing an error page on as image bytes.
response = requests.get(image_url, timeout=30)
response.raise_for_status()
bytes_data = response.content


# Convert relative coordinates to absolute pixel values
def _to_absolute(geom, img_shape: tuple[int, int]) -> list[list[int]]:
    h, w = img_shape
    if len(geom) == 2:  # Assume straight pages = True -> […

View full answer

felixdittrich92 · 2024-08-15T07:10:06Z

felixdittrich92
Aug 15, 2024
Maintainer

Hi @saichandrareddy1 👋

Here is a detailed script showing how you can get the information you need 🤗

If you like the repo feel free to give a ⭐

import requests
import cv2
import numpy as np

from onnxtr.io import DocumentFile
from onnxtr.models import ocr_predictor

# Fetch an example image
image_url = "https://huggingface.co/datasets/huggingfacejs/tasks/resolve/main/document-question-answering/document-question-answering-input.png"
# Use a timeout so a stalled connection cannot hang the script forever, and
# fail loudly on HTTP errors instead of passing an error page on as image bytes.
response = requests.get(image_url, timeout=30)
response.raise_for_status()
bytes_data = response.content


# Convert relative coordinates to absolute pixel values
def _to_absolute(geom, img_shape: tuple[int, int]) -> list[list[int]]:
    h, w = img_shape
    if len(geom) == 2:  # Assume straight pages = True -> [[xmin, ymin], [xmax, ymax]]
        (xmin, ymin), (xmax, ymax) = geom
        xmin, xmax = int(round(w * xmin)), int(round(w * xmax))
        ymin, ymax = int(round(h * ymin)), int(round(h * ymax))
        return [[xmin, ymin], [xmax, ymin], [xmax, ymax], [xmin, ymax]]
    # Assume straight pages = False -> [[x1, y1], [x2, y2], [x3, y3], [x4, y4]]
    else:  # For polygons, convert each point to absolute coordinates
        return [[int(point[0] * w), int(point[1] * h)] for point in geom]


# Load the document and model
doc = DocumentFile.from_images(bytes_data)
# NOTE: Set assume_straight_pages=False if the page is not straight or contains rotated text
model = ocr_predictor(assume_straight_pages=True)
res = model(doc)
json_res = res.export()

# Decode the image (only for visualization purposes)
image = cv2.imdecode(np.frombuffer(bytes_data, np.uint8), cv2.IMREAD_COLOR)
if image is None:
    # cv2.imdecode signals failure by returning None instead of raising
    raise ValueError("Could not decode the downloaded bytes as an image")


def _draw_polygon(img, points, color) -> None:
    """Draw a closed, 2px-wide outline through ``points`` on ``img`` (in place)."""
    # cv2.polylines requires int32 points; np.array() on Python ints yields
    # int64 on most platforms, so the dtype is set explicitly.
    pts = np.array(points, dtype=np.int32).reshape(-1, 1, 2)
    cv2.polylines(img, [pts], True, color, 2)


for page in json_res["pages"]:
    page_idx = page["page_idx"]  # The index of the page
    shape = page["dimensions"]  # The shape of the page (height, width)
    # Dict with the orientation of the page (angle in degrees, confidence)
    # (if detect_orientation is True and/or assume_straight_pages is False)
    orientation = page["orientation"]
    language = page["language"]  # The detected language of the page (if detect_language is True)
    for block in page["blocks"]:
        block_geom = _to_absolute(block["geometry"], shape)  # The geom of the block (now absolute coordinates)
        # The average objectness score of the block (over lines in the block)
        block_objectness_score = block["objectness_score"]
        # Draw block on image (green in BGR)
        _draw_polygon(image, block_geom, (0, 255, 0))
        for line in block["lines"]:
            line_geom = _to_absolute(line["geometry"], shape)  # The geom of the line (now absolute coordinates)
            # The average objectness score of the line (over words in the line)
            line_objectness_score = line["objectness_score"]
            # Draw line on image (red in BGR)
            _draw_polygon(image, line_geom, (0, 0, 255))
            for word in line["words"]:
                word_geom = _to_absolute(word["geometry"], shape)  # The geom of the word (now absolute coordinates)
                word_objectness_score = word["objectness_score"]  # The objectness score of the word crop
                value = word["value"]  # The text value of the word
                confidence = word["confidence"]  # The confidence of the word
                # Dict with the orientation of the word crop (angle in degrees, confidence)
                word_crop_orientation = word["crop_orientation"]
                # Draw word on image (blue in BGR)
                _draw_polygon(image, word_geom, (255, 0, 0))

# Save the final image with drawn polygons
# cv2.imwrite reports failure through its return value rather than an exception
if not cv2.imwrite("output.png", image):
    raise OSError("Failed to write output.png")

Does this answer your question? :)

0 replies

saichandrareddy1 · 2024-08-20T11:04:44Z

saichandrareddy1
Aug 20, 2024
Author

Thanks for your reply, it's helpful :)

0 replies

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

How to get bboxes and respective texts, confidence scores like easyocr or paddleocr #29

{{title}}

Replies: 2 comments

{{title}}

{{editor}}'s edit

{{editor}}'s edit

{{title}}

Select a reply

How to get bboxes and respective texts, confidence scores like easyocr or paddleocr #29

saichandrareddy1 Aug 14, 2024

Replies: 2 comments

felixdittrich92 Aug 15, 2024 Maintainer

saichandrareddy1 Aug 20, 2024 Author

saichandrareddy1
Aug 14, 2024

felixdittrich92
Aug 15, 2024
Maintainer

saichandrareddy1
Aug 20, 2024
Author