Skip to content

Commit

Permalink
update master (#363)
Browse files Browse the repository at this point in the history
* ESL-155 Add table bbox annotations to tabby reader (#354)

* Add table bbox annotations to tabby reader

* Fix tests

* Review fixes

---------

Co-authored-by: Nasty <bogatenkova.anastasiya@mail.ru>

* TLDR-476 change swagger (#357)

* Use fastapi swagger, add pydantic classes and documentation

* Fix documentation and examples

* TLDR-465 pdf miner new params (#356)

* set char_margin to 3

* add pdf miner test script

* fix test_pdf_miner script

* fix TestApiPdfWithText

* add caching

* rename test to benchmark

* add benchmark script again

* change name

* change name

* Try to fix documentation pipeline

* fix benchmark

---------

Co-authored-by: Nikita Shevtsov <shevtsov@ispras.ru>
Co-authored-by: Nasty <bogatenkova.anastasiya@mail.ru>

* ESL-165 table bboxes bug (#358)

* ESL-165 Added test with hard tables

* ESL-165 fixed bug box extraction in payment_order

* ESL-165 after rebase

* ESL-165 update README.md

* ESL-165 after review

---------

Co-authored-by: Nasty <bogatenkova.anastasiya@mail.ru>

* TLDR-367 refactor metadata extractor (#359)

* change add_metadata to extract_metadata in metadata readers

* fix usage of extract_metadata

* fix docs

* change output type to dict

* fix code style

* fix pr

---------

Co-authored-by: Nikita Shevtsov <shevtsov@ispras.ru>

* ESL-167 extract only word boxes (#360)

* ESL-167 extract only word boxes

* ESL-167 extract only words bboxes for tabby reader

---------

Co-authored-by: Nasty <bogatenkova.anastasiya@mail.ru>

* TLDR-502 increase converter timeout (#361)

* new version 1.1.0 (#362)

---------

Co-authored-by: Andrey Mikhailov <mikhailov@icc.ru>
Co-authored-by: Nikita Shevtsov <61932814+Travvy88@users.noreply.github.com>
Co-authored-by: Nikita Shevtsov <shevtsov@ispras.ru>
Co-authored-by: Oksana Belyaeva <belyaeva@ispras.ru>
  • Loading branch information
5 people authored Oct 24, 2023
1 parent ff26829 commit b79dd4c
Show file tree
Hide file tree
Showing 96 changed files with 719 additions and 765 deletions.
1 change: 1 addition & 0 deletions .github/workflows/docs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ jobs:

- name: Install dependencies
run: |
sudo apt update
sudo apt-get install -y libreoffice djvulibre-bin poppler-utils tesseract-ocr libtesseract-dev tesseract-ocr-rus tesseract-ocr-eng
python -m pip install --upgrade --no-cache-dir pip setuptools
python -m pip install --exists-action=w --no-cache-dir -r requirements.txt
Expand Down
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,12 @@ It extracts a document’s logical structure and content, its tables, text forma
The document’s content is represented as a tree storing headings and lists of any level.
Dedoc can be integrated in a document contents and structure analysis system as a separate module.

## Workflow

![Workflow](https://github.com/ispras/dedoc/raw/master/docs/source/_static/workflow.png)

Workflow description is given [`here`](https://dedoc.readthedocs.io/en/latest/?badge=latest#workflow)

## Features and advantages
Dedoc is implemented in Python and works with semi-structured data formats (DOC/DOCX, ODT, XLS/XLSX, CSV, TXT, JSON) and non-structured data formats like images (PNG, JPG etc.), archives (ZIP, RAR etc.), PDF and HTML formats.
Document structure extraction is fully automatic regardless of input data type.
Expand Down Expand Up @@ -53,6 +59,8 @@ still, the docker application should be installed and configured properly.

If you don't need to change the application configuration, you may use the built docker image as well.

## Work with dedoc as service

### 1. Pull the image
```shell
docker pull dedocproject/dedoc
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.0
1.1.0
11 changes: 6 additions & 5 deletions dedoc/api/dedoc_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import dedoc
from dedoc.api.api_args import QueryParameters
from dedoc.api.api_utils import json2collapsed_tree, json2html, json2tree, json2txt
from dedoc.api.schema.parsed_document import ParsedDocument
from dedoc.common.exceptions.dedoc_error import DedocError
from dedoc.common.exceptions.missing_file_error import MissingFileError
from dedoc.config import get_config
Expand Down Expand Up @@ -60,7 +61,7 @@ def _get_static_file_path(request: Request) -> str:
return os.path.abspath(os.path.join(directory, file))


@app.post("/upload")
@app.post("/upload", response_model=ParsedDocument)
async def upload(file: UploadFile = File(...), query_params: QueryParameters = Depends()) -> Response: # noqa
parameters = dataclasses.asdict(query_params)
if not file or file.filename == "":
Expand All @@ -81,15 +82,15 @@ async def upload(file: UploadFile = File(...), query_params: QueryParameters = D
html_content = json2tree(paragraph=document_tree.content.structure)
return HTMLResponse(content=html_content)
elif return_format == "ujson":
return UJSONResponse(content=document_tree.to_dict())
return UJSONResponse(content=document_tree.to_api_schema().model_dump())
elif return_format == "collapsed_tree":
html_content = json2collapsed_tree(paragraph=document_tree.content.structure)
return HTMLResponse(content=html_content)
elif return_format == "pretty_json":
return PlainTextResponse(content=json.dumps(document_tree.to_dict(), ensure_ascii=False, indent=2))
return PlainTextResponse(content=json.dumps(document_tree.to_api_schema().model_dump(), ensure_ascii=False, indent=2))
else:
logger.info(f"Send result. File {file.filename} with parameters {parameters}")
return ORJSONResponse(content=document_tree.to_dict())
return ORJSONResponse(content=document_tree.to_api_schema().model_dump())


@app.get("/upload_example")
Expand All @@ -100,7 +101,7 @@ async def upload_example(file_name: str, return_format: Optional[str] = None) ->

if return_format == "html":
return HTMLResponse(content=json2html(text="", paragraph=document_tree.content.structure, tables=document_tree.content.tables, tabs=0))
return ORJSONResponse(content=document_tree.to_dict(), status_code=200)
return ORJSONResponse(content=document_tree.to_api_schema().model_dump(), status_code=200)


@app.exception_handler(DedocError)
Expand Down
Empty file removed dedoc/api/models/__init__.py
Empty file.
27 changes: 0 additions & 27 deletions dedoc/api/models/custom_fields.py

This file was deleted.

13 changes: 13 additions & 0 deletions dedoc/api/schema/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from .annotation import Annotation
from .cell_with_meta import CellWithMeta
from .document_content import DocumentContent
from .document_metadata import DocumentMetadata
from .line_metadata import LineMetadata
from .line_with_meta import LineWithMeta
from .parsed_document import ParsedDocument
from .table import Table
from .table_metadata import TableMetadata
from .tree_node import TreeNode

__all__ = ["Annotation", "CellWithMeta", "DocumentContent", "DocumentMetadata", "LineMetadata", "LineWithMeta", "ParsedDocument", "Table", "TableMetadata",
"TreeNode"]
12 changes: 12 additions & 0 deletions dedoc/api/schema/annotation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from pydantic import BaseModel, Field


class Annotation(BaseModel):
    """
    The piece of information about the text line: its appearance or links to another document object.
    For example Annotation(start=1, end=13, name="italic", value="True") says that text between 1st and 13th symbol was written in italic.
    """
    # Fields must be passed by keyword: a pydantic model does not accept positional arguments.
    # [start, end) is a half-open range over the annotated text (see the `end` description).
    start: int = Field(description="Start of the annotated text", example=0)
    end: int = Field(description="End of the annotated text (end isn't included)", example=5)
    name: str = Field(description="Annotation name", example="italic")
    # The value is always carried as a string, even for boolean-like or numeric annotations (e.g. "True").
    value: str = Field(description="Annotation value. For example, it may be font size value for size type", example="True")
15 changes: 15 additions & 0 deletions dedoc/api/schema/cell_with_meta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from typing import List

from pydantic import BaseModel, Field

from dedoc.api.schema.line_with_meta import LineWithMeta


class CellWithMeta(BaseModel):
"""
Holds the information about the cell: list of lines and cell properties (rowspan, colspan, invisible).
"""
# API schema of a single table cell; every field below is declared without a default value.
# Cell text is a list of LineWithMeta objects (text + annotations), not a plain string.
lines: List[LineWithMeta] = Field(description="Textual lines of the cell with annotations")
# rowspan/colspan are described as following the HTML table attributes of the same names.
rowspan: int = Field(description="Number of rows to span like in HTML format", example=1)
colspan: int = Field(description="Number of columns to span like in HTML format", example=2)
invisible: bool = Field(description="Indicator for displaying or hiding cell text", example=False)
14 changes: 14 additions & 0 deletions dedoc/api/schema/document_content.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from typing import List

from pydantic import BaseModel, Field

from dedoc.api.schema.table import Table
from dedoc.api.schema.tree_node import TreeNode


class DocumentContent(BaseModel):
"""
Content of the document - structured text and tables.
"""
# API schema of the document body: one TreeNode for the text structure plus a list of tables.
# A single TreeNode (imported from dedoc.api.schema.tree_node) holds the whole structure.
structure: TreeNode = Field(description="Tree structure where content of the document is organized")
# Tables are kept in a separate flat list rather than inside the tree.
tables: List[Table] = Field(description="List of document tables")
20 changes: 20 additions & 0 deletions dedoc/api/schema/document_metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from typing import Optional

from pydantic import BaseModel, ConfigDict, Field


class DocumentMetadata(BaseModel):
"""
Document metadata like its name, size, author, etc.
"""
# extra="allow" lets an instance carry keys beyond the fields declared below.
model_config = ConfigDict(extra="allow")

uid: str = Field(description="Document unique identifier (useful for attached files)", example="doc_uid_auto_ba73d76a-326a-11ec-8092-417272234cb0")
# file_name is the original name; temporary_file_name is the name used while parsing.
file_name: str = Field(description="Original document name before rename and conversion", example="example.odt")
temporary_file_name: str = Field(description="File name during parsing (unique name after rename and conversion)", example="123.odt")
size: int = Field(description="File size in bytes", example=20060)
# All three timestamps are integers in Unix time, per their descriptions.
modified_time: int = Field(description="Modification time of the document in the UnixTime format", example=1590579805)
created_time: int = Field(description="Creation time of the document in the UnixTime format", example=1590579805)
access_time: int = Field(description="File access time in the UnixTime format", example=1590579805)
file_type: str = Field(description="Mime type of the file", example="application/vnd.oasis.opendocument.text")
# NOTE(review): no default is given; assuming pydantic v2 (ConfigDict is imported), the key is
# then required and only its value may be None - confirm this is the intended contract.
other_fields: Optional[dict] = Field(description="Other optional fields")
15 changes: 15 additions & 0 deletions dedoc/api/schema/line_metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from typing import Optional

from pydantic import BaseModel, ConfigDict, Field


class LineMetadata(BaseModel):
"""
Holds information about document node/line metadata, such as page number or line type.
"""
# extra="allow" lets an instance carry keys beyond the fields declared below.
model_config = ConfigDict(extra="allow")

paragraph_type: str = Field(description="Type of the document line/paragraph (header, list_item, list) and etc.", example="raw_text")
# The example value 0 suggests page numbering is zero-based - TODO confirm against the readers.
page_id: int = Field(description="Page number of the line/paragraph beginning", example=0)
# NOTE(review): line_id and other_fields have no default; assuming pydantic v2 (ConfigDict is
# imported), both keys are required and only their values may be None - confirm.
line_id: Optional[int] = Field(description="Line number", example=1)
other_fields: Optional[dict] = Field(description="Some other fields")
13 changes: 13 additions & 0 deletions dedoc/api/schema/line_with_meta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from typing import List

from pydantic import BaseModel, Field

from dedoc.api.schema.annotation import Annotation


class LineWithMeta(BaseModel):
"""
Textual line with text annotations.
"""
# API schema of one text line: the raw text plus the annotations attached to it.
text: str = Field(description="Text of the line", example="Some text")
# Each Annotation carries start/end positions and a name/value pair (dedoc.api.schema.annotation).
annotations: List[Annotation] = Field(description="Text annotations (font, size, bold, italic and etc)")
17 changes: 17 additions & 0 deletions dedoc/api/schema/parsed_document.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from typing import List

from pydantic import BaseModel, Field

from dedoc.api.schema.document_content import DocumentContent
from dedoc.api.schema.document_metadata import DocumentMetadata


class ParsedDocument(BaseModel):
"""
Holds information about the document content, metadata and attachments.
"""
# Top-level API schema of a parsing result; all fields are declared without default values.
content: DocumentContent = Field(description="Document text and tables")
metadata: DocumentMetadata = Field(description="Document metadata such as size, creation date and so on")
version: str = Field(description="Version of the program that parsed this document", example="0.9.1")
warnings: List[str] = Field(description="List of warnings and possible errors, arising in the process of document parsing")
# Recursive field: the quoted name is a forward reference to this class, which is not yet bound
# while its own body is being evaluated.
attachments: List["ParsedDocument"] = Field(description="Result of analysis of attached files - list of `ParsedDocument`")
16 changes: 16 additions & 0 deletions dedoc/api/schema/table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from typing import List

from pydantic import BaseModel, Field

from dedoc.api.schema.cell_with_meta import CellWithMeta
from dedoc.api.schema.table_metadata import TableMetadata


class Table(BaseModel):
"""
Holds information about tables in the document.
We assume that a table has rectangle form (has the same number of columns in each row).
Table representation is row-based i.e. external list contains list of rows.
"""
# Outer list = rows, inner list = cells of a row. The rectangular shape mentioned in the
# docstring is an assumption; nothing in this class enforces it.
cells: List[List[CellWithMeta]] = Field(description="List of lists of table cells (cell has text, colspan and rowspan attributes)")
metadata: TableMetadata = Field(description="Table meta information")
12 changes: 12 additions & 0 deletions dedoc/api/schema/table_metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from typing import Optional

from pydantic import BaseModel, Field


class TableMetadata(BaseModel):
"""
Holds the information about table unique identifier, rotation angle (if table has been rotated - for images) and so on.
"""
# NOTE(review): page_id may be None but has no default; assuming pydantic v2, the key itself
# is still required - confirm this is intended.
page_id: Optional[int] = Field(description="Number of the page where the table starts", example=0)
uid: str = Field(description="Unique identifier of the table", example="e8ba5523-8546-4804-898c-2f4835a1804f")
# Angle is expressed in degrees, per the field description.
rotated_angle: float = Field(description="Value of the rotation angle (in degrees) by which the table was rotated during recognition", example=1.0)
20 changes: 20 additions & 0 deletions dedoc/api/schema/tree_node.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from typing import List

from pydantic import BaseModel, Field

from dedoc.api.schema.annotation import Annotation
from dedoc.api.schema.line_metadata import LineMetadata


class TreeNode(BaseModel):
    """
    Helps to represent document as recursive tree structure.
    It has list of children `TreeNode` nodes (empty list for a leaf node).
    """
    # Dot-separated path of sibling indices, e.g. "0.2.1" (see the description below).
    node_id: str = Field(description="Document element identifier. It is unique within a document content tree. "
                                     "The identifier consists of numbers separated by dots where each number "
                                     "means node's number among nodes with the same level in the document hierarchy.", example="0.2.1")
    text: str = Field(description="Text of the node", example="Some text")
    # Annotations describe parts of `text`; `metadata` describes the node as a whole.
    annotations: List[Annotation] = Field(description="Some metadata related to the part of the text (as font size)")
    metadata: LineMetadata = Field(description="Metadata for the entire node (as node type)")
    # Recursive field: the quoted name is a forward reference to this class, which is not yet
    # bound while its own body is being evaluated.
    subparagraphs: List["TreeNode"] = Field(description="List of children of this node, each child is `TreeNode`")
11 changes: 5 additions & 6 deletions dedoc/attachments_handler/attachments_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,11 +95,10 @@ def _handle_attachments(self, document: UnstructuredDocument, parameters: dict)
attachment.tmp_file_path = new_path

def __get_empty_document(self, document_parser: "DedocManager", attachment: AttachedFile, parameters: dict) -> ParsedDocument: # noqa
unstructured_document = UnstructuredDocument(lines=[], tables=[], attachments=[])
attachment_dir, attachment_name = os.path.split(attachment.get_filename_in_path())
unstructured_document = document_parser.document_metadata_extractor.add_metadata(document=unstructured_document, directory=attachment_dir,
filename=attachment_name, converted_filename=attachment_name,
original_filename=attachment.get_original_filename(),
parameters=parameters)
metadata = DocumentMetadata(**unstructured_document.metadata)
metadata = document_parser.document_metadata_extractor.extract_metadata(directory=attachment_dir,
filename=attachment_name, converted_filename=attachment_name,
original_filename=attachment.get_original_filename(),
parameters=parameters)
metadata = DocumentMetadata(**metadata)
return ParsedDocument(content=get_empty_content(), metadata=metadata)
2 changes: 1 addition & 1 deletion dedoc/converters/concrete_converters/abstract_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def __init__(self, *, config: dict) -> None:
"""
:param config: configuration of the converter, e.g. logger for logging
"""
self.timeout = 10
self.timeout = 60
self.period_checking = 0.05
self.config = config
self.logger = config.get("logger", logging.getLogger())
Expand Down
1 change: 0 additions & 1 deletion dedoc/converters/concrete_converters/pdf_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ class PDFConverter(AbstractConverter):
"""
def __init__(self, *, config: dict) -> None:
super().__init__(config=config)
self.timeout = 60

def can_convert(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool:
"""
Expand Down
1 change: 0 additions & 1 deletion dedoc/data_structures/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
# noqa
import dedoc.data_structures.concrete_annotations as annotations
from .annotation import Annotation
from .attached_file import AttachedFile
Expand Down
30 changes: 3 additions & 27 deletions dedoc/data_structures/annotation.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
from collections import OrderedDict

from flask_restx import Api, Model, fields

from dedoc.api.schema.annotation import Annotation as ApiAnnotation
from dedoc.data_structures.serializable import Serializable


Expand Down Expand Up @@ -40,26 +37,5 @@ def __str__(self) -> str:
def __repr__(self) -> str:
return f"{self.name.capitalize()}(...)"

def to_dict(self) -> dict:
res = OrderedDict()
res["start"] = self.start
res["end"] = self.end
res["name"] = self.name
res["value"] = self.value
return res

@staticmethod
def get_api_dict(api: Api) -> Model:
names = [
"style", "bold", "italic", "underlined", "size", "indentation", "alignment", "table",
"attachment", "spacing", "strike", "subscript", "superscript"
]
return api.model("Annotation", {
"start": fields.Integer(description="annotation start index", required=True, example=0),
"end": fields.Integer(description="annotation end index", required=True, example=4),
"name": fields.String(description="annotation name", required=True, example="bold", enum=names),
"value": fields.String(description="annotation value. For example, it may be font size value for size type "
"or type of alignment for alignment type",
required=True,
example="left")
})
def to_api_schema(self) -> ApiAnnotation:
return ApiAnnotation(start=self.start, end=self.end, name=self.name, value=self.value)
Loading

0 comments on commit b79dd4c

Please sign in to comment.