Skip to content

Commit 6a60e97

Browse files
oksidgy, NastyBoget, GabrieleConte, alexander1999-hub, sunveil
authored
new version 2.3.1 (#509)
Co-authored-by: Zykina (Bogatenkova) Anastasiya <bogatenkova.anastasiya@mail.ru> Co-authored-by: Gabriele Conte <100914559+GabrieleConte@users.noreply.github.com> Co-authored-by: Alexander Golodkov <55749660+alexander1999-hub@users.noreply.github.com> Co-authored-by: Andrey Mikhailov <mikhailov@icc.ru> Co-authored-by: alexander1999-hub <golodkov.ao@phystech.edu>
1 parent 724e2d2 commit 6a60e97

29 files changed

+414
-126
lines changed

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.3
1+
2.3.1

dedoc/api/web/index.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ <h4>Tables handling </h4>
122122

123123
<div class="parameters">
124124
<h4>PDF handling</h4>
125-
<details><summary>pdf_with_text_layer, fast_textual_layer_detection, language, pages, is_one_column_document, document_orientation, need_header_footer_analysis, need_binarization</summary>
125+
<details><summary>pdf_with_text_layer, fast_textual_layer_detection, language, pages, is_one_column_document, document_orientation, need_header_footer_analysis, need_binarization, need_gost_frame_analysis</summary>
126126
<br>
127127
<p>
128128
<label>

dedoc/data_structures/cell_with_meta.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import List
1+
from typing import List, Optional
22

33
from dedoc.api.schema.cell_with_meta import CellWithMeta as ApiCellWithMeta
44
from dedoc.data_structures.annotation import Annotation
@@ -20,14 +20,14 @@ class CellWithMeta(Serializable):
2020
:vartype rowspan: int
2121
:vartype invisible: bool
2222
"""
23-
def __init__(self, lines: List[LineWithMeta], colspan: int = 1, rowspan: int = 1, invisible: bool = False) -> None:
23+
def __init__(self, lines: Optional[List[LineWithMeta]], colspan: int = 1, rowspan: int = 1, invisible: bool = False) -> None:
2424
"""
2525
:param lines: textual lines of the cell
2626
:param colspan: number of columns to span like in HTML format
2727
:param rowspan: number of rows to span like in HTML format
2828
:param invisible: indicator for displaying or hiding cell text
2929
"""
30-
self.lines: List[LineWithMeta] = lines
30+
self.lines: List[LineWithMeta] = [] if lines is None else lines
3131
self.colspan: int = colspan
3232
self.rowspan: int = rowspan
3333
self.invisible: bool = invisible

dedoc/readers/article_reader/article_reader.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@ def __init__(self, config: Optional[dict] = None) -> None:
2828
else:
2929
self.grobid_url = f"http://{os.environ.get('GROBID_HOST', 'localhost')}:{os.environ.get('GROBID_PORT', '8070')}"
3030
self.url = f"{self.grobid_url}/api/processFulltextDocument"
31+
32+
auth_key = os.environ.get("GROBID_AUTH_KEY", "")
33+
self.request_headers = {"Authorization": auth_key} if auth_key else {}
3134
self.grobid_is_alive = False
3235

3336
def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
@@ -48,7 +51,7 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
4851
with open(file_path, "rb") as file:
4952
files = {"input": file}
5053
try:
51-
response = requests.post(self.url, files=files, data={"teiCoordinates": "figure"})
54+
response = requests.post(self.url, files=files, data={"teiCoordinates": "figure"}, headers=self.request_headers)
5255
if response.status_code != 200:
5356
warning = f"GROBID returns code {response.status_code}."
5457
self.logger.warning(warning)
@@ -106,7 +109,7 @@ def __update_grobid_alive(self, grobid_url: str, max_attempts: int = 2) -> None:
106109
attempt = max_attempts
107110
while attempt > 0:
108111
try:
109-
response = requests.get(f"{grobid_url}/api/isalive")
112+
response = requests.get(f"{grobid_url}/api/isalive", headers=self.request_headers)
110113
if response.status_code == 200:
111114
self.logger.info(f"GROBID up on {grobid_url}.")
112115
self.grobid_is_alive = True

dedoc/readers/docx_reader/numbering_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,9 +75,9 @@ def parse(self, xml: Tag, paragraph_properties: BaseProperties, run_properties:
7575
self.styles_extractor.parse(lvl_info.style_id, paragraph_properties, StyleType.NUMBERING)
7676
if lvl_info.pPr:
7777
change_paragraph_properties(paragraph_properties, lvl_info.pPr)
78+
# run properties are applied only to the numbering text ("lvlText" content)
7879
if lvl_info.rPr:
7980
change_run_properties(run_properties, lvl_info.rPr)
80-
change_run_properties(paragraph_properties, lvl_info.rPr)
8181

8282
run_properties.text = text
8383
paragraph_properties.list_level = self.state.levels_count

dedoc/readers/pdf_reader/data_classes/tables/cell.py

Lines changed: 9 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,11 @@
33
from dedocutils.data_structures import BBox
44

55
from dedoc.data_structures.annotation import Annotation
6+
from dedoc.data_structures.cell_with_meta import CellWithMeta
67
from dedoc.data_structures.line_with_meta import LineWithMeta
78

89

9-
class Cell:
10+
class Cell(CellWithMeta):
1011

1112
@staticmethod
1213
def copy_from(cell: "Cell",
@@ -41,35 +42,27 @@ def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int)
4142
if self.con_coord:
4243
self.con_coord.shift(shift_x=shift_x, shift_y=shift_y)
4344

44-
def __init__(self,
45-
x_top_left: int,
46-
x_bottom_right: int,
47-
y_top_left: int,
48-
y_bottom_right: int,
49-
id_con: int = -1,
50-
lines: Optional[List[LineWithMeta]] = None,
51-
is_attribute: bool = False,
52-
is_attribute_required: bool = False,
53-
rotated_angle: int = 0,
54-
uid: str = None,
45+
def __init__(self, x_top_left: int, x_bottom_right: int, y_top_left: int, y_bottom_right: int, id_con: int = -1, lines: Optional[List[LineWithMeta]] = None,
             is_attribute: bool = False, is_attribute_required: bool = False, rotated_angle: int = 0, uid: Optional[str] = None,
             contour_coord: Optional[BBox] = None) -> None:
    """
    :param x_top_left: x coordinate of the top left corner, must not exceed x_bottom_right
    :param x_bottom_right: x coordinate of the bottom right corner
    :param y_top_left: y coordinate of the top left corner, must not exceed y_bottom_right
    :param y_bottom_right: y coordinate of the bottom right corner
    :param id_con: stored as-is in self.id_con (-1 by default)
    :param lines: textual lines of the cell, None is treated as an empty list
    :param is_attribute: stored as-is in self.is_attribute
    :param is_attribute_required: stored as-is in self.is_attribute_required
    :param rotated_angle: stored as-is in self.rotated_angle
    :param uid: unique identifier of the cell, a new "cell_<uuid1>" value is generated when None
    :param contour_coord: stored in self.con_coord, a zero BBox is used when not given
    """
    import uuid

    assert x_top_left <= x_bottom_right
    assert y_top_left <= y_bottom_right

    # the parent constructor substitutes an empty list for None and sets the default colspan, rowspan and invisible values,
    # so self.lines does not have to be assigned here separately
    super().__init__(lines)

    self.x_top_left = x_top_left
    self.x_bottom_right = x_bottom_right
    self.y_top_left = y_top_left
    self.y_bottom_right = y_bottom_right
    self.id_con = id_con
    self.is_attribute = is_attribute
    self.is_attribute_required = is_attribute_required
    self.rotated_angle = rotated_angle
    self.cell_uid = f"cell_{uuid.uuid1()}" if uid is None else uid
    self.con_coord = contour_coord or BBox(0, 0, 0, 0)
7467

7568
def __str__(self) -> str:

dedoc/readers/pdf_reader/data_classes/tables/location.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ def __init__(self, page_number: int, bbox: BBox, name: str = "", rotated_angle:
1010
self.page_number = page_number
1111
self.bbox = bbox
1212
self.name = name
13+
# TODO put self.order (change LineWithLocation, PdfImageAttachment, ScanTable)
1314
self.rotated_angle = rotated_angle
1415

1516
def shift(self, shift_x: int, shift_y: int) -> None:

dedoc/readers/pdf_reader/data_classes/tables/scantable.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import Any, List
1+
from typing import Any, List, Optional
22

33
from dedocutils.data_structures import BBox
44

@@ -10,7 +10,8 @@
1010

1111

1212
class ScanTable:
13-
def __init__(self, page_number: int, matrix_cells: List[List[Cell]] = None, bbox: BBox = None, name: str = "", order: int = -1) -> None:
13+
def __init__(self, page_number: int, matrix_cells: Optional[List[List[CellWithMeta]]] = None, bbox: Optional[BBox] = None,
14+
name: str = "", order: int = -1) -> None:
1415
self.matrix_cells = matrix_cells
1516
self.page_number = page_number
1617
self.locations = []
@@ -27,6 +28,15 @@ def extended(self, table: "ScanTable") -> None:
2728
# extend order
2829
self.order = max(self.order, table.order)
2930

31+
def check_on_cell_instance(self) -> bool:
32+
if len(self.matrix_cells) == 0:
33+
return False
34+
if len(self.matrix_cells[0]) == 0:
35+
return False
36+
if not isinstance(self.matrix_cells[0][0], Cell):
37+
return False
38+
return True
39+
3040
def to_table(self) -> Table:
3141
metadata = TableMetadata(page_id=self.page_number, uid=self.name, rotated_angle=self.location.rotated_angle)
3242
cells_with_meta = [[CellWithMeta.create_from_cell(cell) for cell in row] for row in self.matrix_cells]

dedoc/readers/pdf_reader/pdf_base_reader.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
from collections import namedtuple
33
from typing import Dict, Iterator, List, Optional, Set, Tuple
44

5-
import numpy as np
65
from dedocutils.data_structures.bbox import BBox
76
from numpy import ndarray
87

@@ -13,7 +12,7 @@
1312
from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation
1413
from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment
1514
from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable
16-
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.gost_frame_recognizer import GOSTFrameRecognizer
15+
1716

1817
ParametersForParseDoc = namedtuple("ParametersForParseDoc", [
1918
"orient_analysis_cells",
@@ -45,6 +44,7 @@ def __init__(self, *, config: Optional[dict] = None, recognized_extensions: Opti
4544

4645
from dedoc.readers.pdf_reader.pdf_image_reader.line_metadata_extractor.metadata_extractor import LineMetadataExtractor
4746
from dedoc.readers.pdf_reader.pdf_image_reader.paragraph_extractor.scan_paragraph_classifier_extractor import ScanParagraphClassifierExtractor
47+
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.gost_frame_recognizer import GOSTFrameRecognizer
4848
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer import TableRecognizer
4949
from dedoc.readers.pdf_reader.utils.line_object_linker import LineObjectLinker
5050
from dedoc.attachments_extractors.concrete_attachments_extractors.pdf_attachments_extractor import PDFAttachmentsExtractor
@@ -153,24 +153,24 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
153153
metadata["rotated_page_angles"] = page_angles
154154
return all_lines_with_paragraphs, mp_tables, attachments, warnings, metadata
155155

156-
def _process_document_with_gost_frame(self, images: Iterator[np.ndarray], first_page: int, parameters: ParametersForParseDoc, path: str) -> \
157-
Tuple[Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[float]], Dict[int, Tuple[np.ndarray, BBox, Tuple[int, ...]]]]:
156+
def _process_document_with_gost_frame(self, images: Iterator[ndarray], first_page: int, parameters: ParametersForParseDoc, path: str) -> \
157+
Tuple[Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[float]], Dict[int, Tuple[ndarray, BBox, Tuple[int, ...]]]]:
158158
from joblib import Parallel, delayed
159159
from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader
160160

161161
gost_analyzed_images = Parallel(n_jobs=self.config["n_jobs"])(delayed(self.gost_frame_recognizer.rec_and_clean_frame)(image) for image in images)
162162
page_range = range(first_page, first_page + len(gost_analyzed_images))
163163
gost_analyzed_images = dict(zip(page_range, gost_analyzed_images))
164164
if isinstance(self, PdfTxtlayerReader):
165-
self.gost_frame_boxes = dict(zip(page_range, [item[1] for item in gost_analyzed_images.values()]))
165+
self.gost_frame_boxes = dict(zip(page_range, [(item[1], item[2]) for item in gost_analyzed_images.values()]))
166166
result = Parallel(n_jobs=self.config["n_jobs"])(
167167
delayed(self._process_one_page)(image, parameters, page_number, path) for page_number, (image, box, original_image_shape) in
168168
gost_analyzed_images.items()
169169
)
170170
return result, gost_analyzed_images
171171

172172
def _shift_all_contents(self, lines: List[LineWithMeta], unref_tables: List[ScanTable], attachments: List[PdfImageAttachment],
173-
gost_analyzed_images: Dict[int, Tuple[np.ndarray, BBox, Tuple[int, ...]]]) -> None:
173+
gost_analyzed_images: Dict[int, Tuple[ndarray, BBox, Tuple[int, ...]]]) -> None:
174174
# shift unref_tables
175175
for scan_table in unref_tables:
176176
for location in scan_table.locations:

dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
from numpy import ndarray
55

6+
from dedoc.data_structures.unstructured_document import UnstructuredDocument
67
from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation
78
from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment
89
from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable
@@ -53,6 +54,9 @@ def __init__(self, *, config: Optional[dict] = None) -> None:
5354
self.binarizer = AdaptiveBinarizer()
5455
self.ocr = OCRLineExtractor(config=self.config)
5556

57+
def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
    """
    Read the document by delegating to the `read` method of the parent class with the same arguments.

    :param file_path: path to the file, passed to the parent method unchanged
    :param parameters: reading parameters, passed to the parent method unchanged
    :return: the document returned by the parent method
    """
    document = super().read(file_path, parameters)
    return document
59+
5660
def _process_one_page(self,
5761
image: ndarray,
5862
parameters: ParametersForParseDoc,

dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/multipage_table_extractor.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import copy
12
import logging
23
from typing import List
34

@@ -155,24 +156,26 @@ def __is_one_table(self, t1: ScanTable, t2: ScanTable) -> bool:
155156
# condition 2. Exclusion of the duplicated header (if any)
156157
attr1 = TableAttributeExtractor.get_header_table(t1.matrix_cells)
157158
attr2 = TableAttributeExtractor.get_header_table(t2.matrix_cells)
159+
t2_update = copy.deepcopy(t2)
158160
if TableAttributeExtractor.is_equal_attributes(attr1, attr2):
159-
t2.matrix_cells = t2.matrix_cells[len(attr2):]
161+
t2_update.matrix_cells = t2_update.matrix_cells[len(attr2):]
160162

161-
if len(t2.matrix_cells) == 0 or len(t1.matrix_cells) == 0:
163+
if len(t2_update.matrix_cells) == 0 or len(t1.matrix_cells) == 0:
162164
return False
163165

164-
TableAttributeExtractor.clear_attributes(t2.matrix_cells)
166+
TableAttributeExtractor.clear_attributes(t2_update.matrix_cells)
165167

166168
# condition 3. Number of columns should be equal
167-
if len(t1.matrix_cells[-1]) != len(t2.matrix_cells[0]):
169+
if len(t1.matrix_cells[-1]) != len(t2_update.matrix_cells[0]):
168170
if self.config.get("debug_mode", False):
169171
self.logger.debug("Different count column")
170172
return False
171173

172174
# condition 4. Comparison of the widths of last and first rows
173-
if not self.__is_equal_width_cells(t1.matrix_cells, t2.matrix_cells):
175+
if t1.check_on_cell_instance() and t2_update.check_on_cell_instance() and not self.__is_equal_width_cells(t1.matrix_cells, t2_update.matrix_cells):
174176
if self.config.get("debug_mode", False):
175177
self.logger.debug("Different width columns")
176178
return False
177179

180+
t2.matrix_cells = copy.deepcopy(t2_update.matrix_cells) # save changes
178181
return True

dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/table_attribute_extractor.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,22 @@ def is_equal_attributes(attr1: List[List[Cell]], attr2: List[List[Cell]], thr_si
3131

3232
return True
3333

34+
@staticmethod
35+
def check_have_attributes(matrix_table: List[List[Cell]]) -> bool:
36+
if len(matrix_table) == 0:
37+
return False
38+
if len(matrix_table[0]) == 0:
39+
return False
40+
if not hasattr(matrix_table[0][0], "is_attribute"):
41+
return False
42+
return True
43+
3444
@staticmethod
3545
def get_header_table(matrix_table: List[List[Cell]]) -> List[List[Cell]]:
46+
47+
if not TableAttributeExtractor.check_have_attributes(matrix_table):
48+
return matrix_table[:1]
49+
3650
header_rows = len(matrix_table)
3751
for (i, row) in enumerate(matrix_table):
3852
attrs = [cell for cell in row if cell.is_attribute]
@@ -44,6 +58,9 @@ def get_header_table(matrix_table: List[List[Cell]]) -> List[List[Cell]]:
4458

4559
@staticmethod
4660
def clear_attributes(matrix_table: List[List[Cell]]) -> None:
61+
if not TableAttributeExtractor.check_have_attributes(matrix_table):
62+
return
63+
4764
for row in matrix_table:
4865
for cell in row:
4966
cell.is_attribute = False

0 commit comments

Comments
 (0)