ESL-167 extract only word boxes (#360)

oksidgy · NastyBoget · web-flow · commit 20ff0d76020a · 2023-10-20T15:16:05.000Z
* ESL-167 extract only word boxes

* ESL-167 extract only word bboxes for tabby reader

---------

Co-authored-by: Nasty <bogatenkova.anastasiya@mail.ru>
diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_line_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_line_extractor.py
@@ -28,9 +28,11 @@ def __split_image2bboxes(self, image: np.ndarray, page_num: int, language: str,
             output_dict = get_text_with_bbox_from_document_page(image, language, ocr_conf_threshold)
 
         height, width = image.shape[:2]
+        extract_line_bbox = self.config.get("labeling_mode", False)
+
         line_boxes = [
-            TextWithBBox(text=line.text, page_num=page_num, bbox=line.bbox, line_num=line_num, annotations=line.get_annotations(width, height))
-            for line_num, line in enumerate(output_dict.lines)
+            TextWithBBox(text=line.text, page_num=page_num, bbox=line.bbox, line_num=line_num,
+                         annotations=line.get_annotations(width, height, extract_line_bbox)) for line_num, line in enumerate(output_dict.lines)
         ]
 
         return line_boxes
diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_line.py b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_line.py
@@ -23,7 +23,7 @@ def __init__(self, order: int, bbox: BBox, words: List[OcrWord]) -> None:
     def text(self) -> str:
         return " ".join(word.text for word in self.words if word.text != "") + "\n"
 
-    def get_annotations(self, page_width: int, page_height: int) -> List[Annotation]:
+    def get_annotations(self, page_width: int, page_height: int, extract_line_bbox: bool) -> List[Annotation]:
         start = 0
         annotations = []
 
@@ -35,8 +35,8 @@ def get_annotations(self, page_width: int, page_height: int) -> List[Annotation]
             annotations.append(ConfidenceAnnotation(start, end, str(word.confidence / 100)))
             annotations.append(BBoxAnnotation(start, end, word.bbox, page_width, page_height))
             start += len(word.text) + 1
-
-        annotations.append(BBoxAnnotation(0, start, self.bbox, page_width, page_height))
+        if extract_line_bbox:
+            annotations.append(BBoxAnnotation(0, start, self.bbox, page_width, page_height))
         return annotations
 
     @staticmethod
diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
@@ -199,10 +199,12 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith
                 if annotation["metadata"] == "LINK":
                     annotations.append(LinkedTextAnnotation(start, end, annotation["url"]))
 
+            bbox = BBox(x_top_left=int(block["x_top_left"]), y_top_left=int(block["y_top_left"]), width=int(block["width"]), height=int(block["height"]))
+            if self.config.get("labeling_mode", False):
+                annotations.append(BBoxAnnotation(0, len_block, bbox, page_width=page_width, page_height=page_height))
+
             meta = block["metadata"].lower()
             uid = f"txt_{file_hash}_{order}"
-            bbox = BBox(x_top_left=int(block["x_top_left"]), y_top_left=int(block["y_top_left"]), width=int(block["width"]), height=int(block["height"]))
-            annotations.append(BBoxAnnotation(0, len_block, bbox, page_width=page_width, page_height=page_height))
 
             metadata = LineMetadata(page_id=page_number, line_id=order)
             line_with_location = LineWithLocation(line=block_text,
diff --git a/dedoc/scripts/test_words_bbox_extraction.py b/dedoc/scripts/test_words_bbox_extraction.py
@@ -171,6 +171,13 @@ def test_table_word_extraction(self):
 
             image = cv2.imread(self._get_abs_path(file_name))
             image = rotate_image(image, page_angle)
+
+            # draw boxes of content's words
+            structure = result["content"]["structure"]
+            word_annotations = self.__get_words_annotation(structure)
+            image = self.__draw_word_annotations(image, word_annotations)
+
+            # draw boxes of table's words
             tables = result["content"]["tables"]
             if len(tables) > 0:
                 image = self.__draw_tables_words(tables, image)