-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathservice.py
79 lines (64 loc) · 2.28 KB
/
service.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
from __future__ import annotations
import io
import os
import typing as t
import asyncio
import numpy as np
import torch
import bentoml
from warmup import convert_pdf_to_images
from warmup import download_models
if t.TYPE_CHECKING:
import PIL.Image
from detectron2.structures import Boxes
from detectron2.structures import Instances
# Fetch the two models, then wrap each one in a BentoML runner.
reader_model, predictor_model = download_models()
en_reader = reader_model.to_runner()
processor = predictor_model.to_runner()

# Score cut-off handed to preprocess(); override with the OCR_THRESHOLD env var.
THRESHOLD = float(os.environ.get("OCR_THRESHOLD", "0.8"))

# The service owns both runners so they are started alongside the API server.
svc = bentoml.Service(
    name="document-processing",
    runners=[en_reader, processor],
)
async def segmentation(im: PIL.Image.Image) -> tuple[list[int], list[float], Boxes]:
    """Run the ``processor`` runner on a page image and unpack its instances.

    Args:
        im: Page image; it is converted to a NumPy array before inference.

    Returns:
        A ``(classes, scores, boxes)`` triple taken from the ``"instances"``
        entry of the runner output: predicted classes and scores as plain
        Python lists, and the predicted boxes as returned by the model.
    """
    pixels = np.asarray(im)
    prediction = await processor.async_run(pixels)
    instances: Instances = prediction["instances"]

    pred_classes = instances.get("pred_classes").tolist()
    pred_scores = instances.get("scores").tolist()
    pred_boxes = instances.get("pred_boxes")
    return pred_classes, pred_scores, pred_boxes
async def preprocess(im: PIL.Image.Image, res: list[str], threshold: float = 0.8):
    """Extract text from the confident, non-table regions of a page image.

    The page is segmented, every accepted region is cropped and sent to the
    ``en_reader`` runner concurrently, and the recognised text is appended to
    ``res``.

    Args:
        im: Page image to process.
        res: Output list, mutated in place. One string is appended per kept
            region, in the order the regions are returned by
            ``segmentation`` (not in task-completion order).
        threshold: Minimum detection score a region needs to be read.

    Returns:
        A list with one entry per detected region: the extracted text, or
        ``None`` when the region was skipped.
    """

    async def _proc_cls_scores(
        cls: int, score: float, box: torch.Tensor, im: PIL.Image.Image
    ) -> str | None:
        """Read one region; return its text, or ``None`` if it is skipped."""
        # Class 4 is the table class (per the original note); tables and
        # low-confidence detections are not read.
        if cls != 4 and score >= threshold:
            # Class 0 fragments are concatenated directly; every other class
            # is joined with spaces.
            join_char = "" if cls == 0 else " "
            detections = await en_reader.readtext.async_run(
                np.asarray(im.crop(box.numpy()))
            )
            # The second element of each detection is used as its text.
            text = join_char.join([detection[1] for detection in detections])
            # Drop figure/table captions. The previous
            # `not startswith(A) or not startswith(B)` test was always true,
            # so captions were never actually filtered out.
            if text.startswith(("Figure", "Table")):
                return None
            print("Extract text:", text)
            return text
        return None

    classes, scores, boxes = await segmentation(im)
    texts = await asyncio.gather(
        *[
            _proc_cls_scores(cls, score, box, im)
            for cls, score, box in zip(classes, scores, boxes)
        ]
    )
    # gather() returns results in submission order, so appending here keeps
    # the output deterministic regardless of which OCR call finishes first.
    res.extend(text for text in texts if text is not None)
    return texts
@svc.api(input=bentoml.io.File(mime_type="application/pdf"), output=bentoml.io.JSON())
async def image_to_text(file: io.BytesIO) -> dict[t.Literal["parsed"], str]:
    """Convert an uploaded PDF into text, one extracted region per line.

    Every page is rendered to an image and processed concurrently with
    ``preprocess``. Each page writes into its own list so that the final text
    follows page order; sharing a single list across the concurrent tasks
    would interleave pages in whatever order they happen to finish.

    Args:
        file: The uploaded PDF as a binary file-like object; it is closed
            once all pages have been processed.

    Returns:
        ``{"parsed": text}`` where ``text`` is the extracted regions joined
        with newlines.
    """
    per_page: list[list[str]] = []
    with file:
        tasks = []
        for im in convert_pdf_to_images(file.read()):
            page_res: list[str] = []
            per_page.append(page_res)
            tasks.append(preprocess(im, page_res, THRESHOLD))
        await asyncio.gather(*tasks)
    # Flatten page by page to restore document order.
    return {"parsed": "\n".join(text for page_res in per_page for text in page_res)}