Skip to content

Commit 20ff0d7

Browse files
oksidgy and NastyBoget authored
ESL-167 extract only word boxes (#360)
* ESL-167 extract only word boxes
* ESL-167 extract only words bboxes for tabby reader

---------

Co-authored-by: Nasty <bogatenkova.anastasiya@mail.ru>
1 parent: 29eebb7 · commit: 20ff0d7

File tree

4 files changed

+18
-7
lines changed

4 files changed

+18
-7
lines changed

dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_line_extractor.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,11 @@ def __split_image2bboxes(self, image: np.ndarray, page_num: int, language: str,
2828
output_dict = get_text_with_bbox_from_document_page(image, language, ocr_conf_threshold)
2929

3030
height, width = image.shape[:2]
31+
extract_line_bbox = self.config.get("labeling_mode", False)
32+
3133
line_boxes = [
32-
TextWithBBox(text=line.text, page_num=page_num, bbox=line.bbox, line_num=line_num, annotations=line.get_annotations(width, height))
33-
for line_num, line in enumerate(output_dict.lines)
34+
TextWithBBox(text=line.text, page_num=page_num, bbox=line.bbox, line_num=line_num,
35+
annotations=line.get_annotations(width, height, extract_line_bbox)) for line_num, line in enumerate(output_dict.lines)
3436
]
3537

3638
return line_boxes

dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_line.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def __init__(self, order: int, bbox: BBox, words: List[OcrWord]) -> None:
2323
def text(self) -> str:
2424
return " ".join(word.text for word in self.words if word.text != "") + "\n"
2525

26-
def get_annotations(self, page_width: int, page_height: int) -> List[Annotation]:
26+
def get_annotations(self, page_width: int, page_height: int, extract_line_bbox: bool) -> List[Annotation]:
2727
start = 0
2828
annotations = []
2929

@@ -35,8 +35,8 @@ def get_annotations(self, page_width: int, page_height: int) -> List[Annotation]
3535
annotations.append(ConfidenceAnnotation(start, end, str(word.confidence / 100)))
3636
annotations.append(BBoxAnnotation(start, end, word.bbox, page_width, page_height))
3737
start += len(word.text) + 1
38-
39-
annotations.append(BBoxAnnotation(0, start, self.bbox, page_width, page_height))
38+
if extract_line_bbox:
39+
annotations.append(BBoxAnnotation(0, start, self.bbox, page_width, page_height))
4040
return annotations
4141

4242
@staticmethod

dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -199,10 +199,12 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith
199199
if annotation["metadata"] == "LINK":
200200
annotations.append(LinkedTextAnnotation(start, end, annotation["url"]))
201201

202+
bbox = BBox(x_top_left=int(block["x_top_left"]), y_top_left=int(block["y_top_left"]), width=int(block["width"]), height=int(block["height"]))
203+
if self.config.get("labeling_mode", False):
204+
annotations.append(BBoxAnnotation(0, len_block, bbox, page_width=page_width, page_height=page_height))
205+
202206
meta = block["metadata"].lower()
203207
uid = f"txt_{file_hash}_{order}"
204-
bbox = BBox(x_top_left=int(block["x_top_left"]), y_top_left=int(block["y_top_left"]), width=int(block["width"]), height=int(block["height"]))
205-
annotations.append(BBoxAnnotation(0, len_block, bbox, page_width=page_width, page_height=page_height))
206208

207209
metadata = LineMetadata(page_id=page_number, line_id=order)
208210
line_with_location = LineWithLocation(line=block_text,

dedoc/scripts/test_words_bbox_extraction.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,13 @@ def test_table_word_extraction(self):
171171

172172
image = cv2.imread(self._get_abs_path(file_name))
173173
image = rotate_image(image, page_angle)
174+
175+
# draw boxes of content's words
176+
structure = result["content"]["structure"]
177+
word_annotations = self.__get_words_annotation(structure)
178+
image = self.__draw_word_annotations(image, word_annotations)
179+
180+
# draw boxes of table's words
174181
tables = result["content"]["tables"]
175182
if len(tables) > 0:
176183
image = self.__draw_tables_words(tables, image)

0 commit comments

Comments
 (0)