Skip to content

Commit

Permalink
TLDR-476 change swagger (#357)
Browse files Browse the repository at this point in the history
* Use fastapi swagger, add pydantic classes and documentation

* Fix documentation and examples
  • Loading branch information
NastyBoget authored Oct 18, 2023
1 parent 3d7f22a commit e7c1067
Show file tree
Hide file tree
Showing 58 changed files with 424 additions and 565 deletions.
11 changes: 6 additions & 5 deletions dedoc/api/dedoc_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import dedoc
from dedoc.api.api_args import QueryParameters
from dedoc.api.api_utils import json2collapsed_tree, json2html, json2tree, json2txt
from dedoc.api.schema.parsed_document import ParsedDocument
from dedoc.common.exceptions.dedoc_error import DedocError
from dedoc.common.exceptions.missing_file_error import MissingFileError
from dedoc.config import get_config
Expand Down Expand Up @@ -60,7 +61,7 @@ def _get_static_file_path(request: Request) -> str:
return os.path.abspath(os.path.join(directory, file))


@app.post("/upload")
@app.post("/upload", response_model=ParsedDocument)
async def upload(file: UploadFile = File(...), query_params: QueryParameters = Depends()) -> Response: # noqa
parameters = dataclasses.asdict(query_params)
if not file or file.filename == "":
Expand All @@ -81,15 +82,15 @@ async def upload(file: UploadFile = File(...), query_params: QueryParameters = D
html_content = json2tree(paragraph=document_tree.content.structure)
return HTMLResponse(content=html_content)
elif return_format == "ujson":
return UJSONResponse(content=document_tree.to_dict())
return UJSONResponse(content=document_tree.to_api_schema().model_dump())
elif return_format == "collapsed_tree":
html_content = json2collapsed_tree(paragraph=document_tree.content.structure)
return HTMLResponse(content=html_content)
elif return_format == "pretty_json":
return PlainTextResponse(content=json.dumps(document_tree.to_dict(), ensure_ascii=False, indent=2))
return PlainTextResponse(content=json.dumps(document_tree.to_api_schema().model_dump(), ensure_ascii=False, indent=2))
else:
logger.info(f"Send result. File {file.filename} with parameters {parameters}")
return ORJSONResponse(content=document_tree.to_dict())
return ORJSONResponse(content=document_tree.to_api_schema().model_dump())


@app.get("/upload_example")
Expand All @@ -100,7 +101,7 @@ async def upload_example(file_name: str, return_format: Optional[str] = None) ->

if return_format == "html":
return HTMLResponse(content=json2html(text="", paragraph=document_tree.content.structure, tables=document_tree.content.tables, tabs=0))
return ORJSONResponse(content=document_tree.to_dict(), status_code=200)
return ORJSONResponse(content=document_tree.to_api_schema().model_dump(), status_code=200)


@app.exception_handler(DedocError)
Expand Down
Empty file removed dedoc/api/models/__init__.py
Empty file.
27 changes: 0 additions & 27 deletions dedoc/api/models/custom_fields.py

This file was deleted.

13 changes: 13 additions & 0 deletions dedoc/api/schema/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from .annotation import Annotation
from .cell_with_meta import CellWithMeta
from .document_content import DocumentContent
from .document_metadata import DocumentMetadata
from .line_metadata import LineMetadata
from .line_with_meta import LineWithMeta
from .parsed_document import ParsedDocument
from .table import Table
from .table_metadata import TableMetadata
from .tree_node import TreeNode

__all__ = ["Annotation", "CellWithMeta", "DocumentContent", "DocumentMetadata", "LineMetadata", "LineWithMeta", "ParsedDocument", "Table", "TableMetadata",
"TreeNode"]
12 changes: 12 additions & 0 deletions dedoc/api/schema/annotation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from pydantic import BaseModel, Field


class Annotation(BaseModel):
    """
    The piece of information about the text line: its appearance or links to another document object.
    For example Annotation(start=1, end=13, name="italic", value="True") says that text between 1st and 13th symbol was written in italic.
    """
    # Example values are passed through `json_schema_extra`: pydantic v2 deprecates arbitrary extra
    # keyword arguments on `Field` and maps them onto `json_schema_extra`, so the schema is unchanged.
    start: int = Field(description="Start of the annotated text", json_schema_extra={"example": 0})
    end: int = Field(description="End of the annotated text (end isn't included)", json_schema_extra={"example": 5})
    name: str = Field(description="Annotation name", json_schema_extra={"example": "italic"})
    value: str = Field(
        description="Annotation value. For example, it may be font size value for size type",
        json_schema_extra={"example": "True"},
    )
15 changes: 15 additions & 0 deletions dedoc/api/schema/cell_with_meta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from typing import List

from pydantic import BaseModel, Field

from dedoc.api.schema.line_with_meta import LineWithMeta


class CellWithMeta(BaseModel):
    """
    Holds the information about the cell: list of lines and cell properties (rowspan, colspan, invisible).
    """
    lines: List[LineWithMeta] = Field(description="Textual lines of the cell with annotations")
    # Example values are passed through `json_schema_extra`: pydantic v2 deprecates arbitrary extra
    # keyword arguments on `Field` and maps them onto `json_schema_extra`, so the schema is unchanged.
    rowspan: int = Field(description="Number of rows to span like in HTML format", json_schema_extra={"example": 1})
    colspan: int = Field(description="Number of columns to span like in HTML format", json_schema_extra={"example": 2})
    invisible: bool = Field(description="Indicator for displaying or hiding cell text", json_schema_extra={"example": False})
14 changes: 14 additions & 0 deletions dedoc/api/schema/document_content.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from typing import List

from pydantic import BaseModel, Field

from dedoc.api.schema.table import Table
from dedoc.api.schema.tree_node import TreeNode


class DocumentContent(BaseModel):
    """
    Content of the document - structured text and tables.
    """
    # Document text organised as a tree of `TreeNode` objects (required: the field has no default).
    structure: TreeNode = Field(description="Tree structure where content of the document is organized")
    # Tables of the document as `Table` objects (required: the field has no default).
    tables: List[Table] = Field(description="List of document tables")
20 changes: 20 additions & 0 deletions dedoc/api/schema/document_metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from typing import Optional

from pydantic import BaseModel, ConfigDict, Field


class DocumentMetadata(BaseModel):
    """
    Document metadata like its name, size, author, etc.
    """
    # extra="allow" lets the model carry keys that are not declared as fields below.
    model_config = ConfigDict(extra="allow")

    # Example values are passed through `json_schema_extra`: pydantic v2 deprecates arbitrary extra
    # keyword arguments on `Field` and maps them onto `json_schema_extra`, so the schema is unchanged.
    uid: str = Field(
        description="Document unique identifier (useful for attached files)",
        json_schema_extra={"example": "doc_uid_auto_ba73d76a-326a-11ec-8092-417272234cb0"},
    )
    file_name: str = Field(
        description="Original document name before rename and conversion",
        json_schema_extra={"example": "example.odt"},
    )
    temporary_file_name: str = Field(
        description="File name during parsing (unique name after rename and conversion)",
        json_schema_extra={"example": "123.odt"},
    )
    size: int = Field(description="File size in bytes", json_schema_extra={"example": 20060})
    modified_time: int = Field(
        description="Modification time of the document in the UnixTime format",
        json_schema_extra={"example": 1590579805},
    )
    created_time: int = Field(
        description="Creation time of the document in the UnixTime format",
        json_schema_extra={"example": 1590579805},
    )
    access_time: int = Field(description="File access time in the UnixTime format", json_schema_extra={"example": 1590579805})
    file_type: str = Field(
        description="Mime type of the file",
        json_schema_extra={"example": "application/vnd.oasis.opendocument.text"},
    )
    # NOTE(review): pydantic v2 treats Optional[...] without a default as nullable but still required -
    # confirm that every caller passes `other_fields` explicitly.
    other_fields: Optional[dict] = Field(description="Other optional fields")
15 changes: 15 additions & 0 deletions dedoc/api/schema/line_metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from typing import Optional

from pydantic import BaseModel, ConfigDict, Field


class LineMetadata(BaseModel):
    """
    Holds information about document node/line metadata, such as page number or line type.
    """
    # extra="allow" lets the model carry keys that are not declared as fields below.
    model_config = ConfigDict(extra="allow")

    # Example values are passed through `json_schema_extra`: pydantic v2 deprecates arbitrary extra
    # keyword arguments on `Field` and maps them onto `json_schema_extra`, so the schema is unchanged.
    paragraph_type: str = Field(
        description="Type of the document line/paragraph (header, list_item, list, etc.)",
        json_schema_extra={"example": "raw_text"},
    )
    page_id: int = Field(description="Page number of the line/paragraph beginning", json_schema_extra={"example": 0})
    # NOTE(review): pydantic v2 treats Optional[...] without a default as nullable but still required -
    # confirm that every caller passes `line_id` and `other_fields` explicitly.
    line_id: Optional[int] = Field(description="Line number", json_schema_extra={"example": 1})
    other_fields: Optional[dict] = Field(description="Some other fields")
13 changes: 13 additions & 0 deletions dedoc/api/schema/line_with_meta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from typing import List

from pydantic import BaseModel, Field

from dedoc.api.schema.annotation import Annotation


class LineWithMeta(BaseModel):
    """
    Textual line with text annotations.
    """
    # The example value is passed through `json_schema_extra`: pydantic v2 deprecates arbitrary extra
    # keyword arguments on `Field` and maps them onto `json_schema_extra`, so the schema is unchanged.
    text: str = Field(description="Text of the line", json_schema_extra={"example": "Some text"})
    annotations: List[Annotation] = Field(description="Text annotations (font, size, bold, italic, etc.)")
17 changes: 17 additions & 0 deletions dedoc/api/schema/parsed_document.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from typing import List

from pydantic import BaseModel, Field

from dedoc.api.schema.document_content import DocumentContent
from dedoc.api.schema.document_metadata import DocumentMetadata


class ParsedDocument(BaseModel):
    """
    Holds information about the document content, metadata and attachments.
    """
    content: DocumentContent = Field(description="Document text and tables")
    metadata: DocumentMetadata = Field(description="Document metadata such as size, creation date and so on")
    # The example value is passed through `json_schema_extra`: pydantic v2 deprecates arbitrary extra
    # keyword arguments on `Field` and maps them onto `json_schema_extra`, so the schema is unchanged.
    version: str = Field(
        description="Version of the program that parsed this document",
        json_schema_extra={"example": "0.9.1"},
    )
    warnings: List[str] = Field(description="List of warnings and possible errors, arising in the process of document parsing")
    # The element type is a string forward reference because the model refers to itself.
    attachments: List["ParsedDocument"] = Field(description="Result of analysis of attached files - list of `ParsedDocument`")
16 changes: 16 additions & 0 deletions dedoc/api/schema/table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from typing import List

from pydantic import BaseModel, Field

from dedoc.api.schema.cell_with_meta import CellWithMeta
from dedoc.api.schema.table_metadata import TableMetadata


class Table(BaseModel):
    """
    Holds information about tables in the document.
    We assume that a table has rectangle form (has the same number of columns in each row).
    Table representation is row-based i.e. external list contains list of rows.
    """
    # Row-based grid: the outer list holds rows, each inner list holds the `CellWithMeta` cells of one row.
    cells: List[List[CellWithMeta]] = Field(description="List of lists of table cells (cell has text, colspan and rowspan attributes)")
    # Table-level properties, declared in `TableMetadata`.
    metadata: TableMetadata = Field(description="Table meta information")
12 changes: 12 additions & 0 deletions dedoc/api/schema/table_metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from typing import Optional

from pydantic import BaseModel, Field


class TableMetadata(BaseModel):
    """
    Holds the information about table unique identifier, rotation angle (if table has been rotated - for images) and so on.
    """
    # Example values are passed through `json_schema_extra`: pydantic v2 deprecates arbitrary extra
    # keyword arguments on `Field` and maps them onto `json_schema_extra`, so the schema is unchanged.
    # NOTE(review): pydantic v2 treats Optional[...] without a default as nullable but still required -
    # confirm that every caller passes `page_id` explicitly.
    page_id: Optional[int] = Field(description="Number of the page where the table starts", json_schema_extra={"example": 0})
    uid: str = Field(
        description="Unique identifier of the table",
        json_schema_extra={"example": "e8ba5523-8546-4804-898c-2f4835a1804f"},
    )
    rotated_angle: float = Field(
        description="Value of the rotation angle (in degrees) by which the table was rotated during recognition",
        json_schema_extra={"example": 1.0},
    )
20 changes: 20 additions & 0 deletions dedoc/api/schema/tree_node.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from typing import List

from pydantic import BaseModel, Field

from dedoc.api.schema.annotation import Annotation
from dedoc.api.schema.line_metadata import LineMetadata


class TreeNode(BaseModel):
    """
    Helps to represent document as recursive tree structure.
    It has list of children `TreeNode` nodes (empty list for a leaf node).
    """
    # Example values are passed through `json_schema_extra`: pydantic v2 deprecates arbitrary extra
    # keyword arguments on `Field` and maps them onto `json_schema_extra`, so the schema is unchanged.
    node_id: str = Field(
        description="Document element identifier. It is unique within a document content tree. "
                    "The identifier consists of numbers separated by dots where each number "
                    "means node's number among nodes with the same level in the document hierarchy.",
        json_schema_extra={"example": "0.2.1"},
    )
    text: str = Field(description="Text of the node", json_schema_extra={"example": "Some text"})
    annotations: List[Annotation] = Field(description="Some metadata related to the part of the text (as font size)")
    metadata: LineMetadata = Field(description="Metadata for the entire node (as node type)")
    # The element type is a string forward reference because the model refers to itself.
    subparagraphs: List["TreeNode"] = Field(description="List of children of this node, each child is `TreeNode`")
1 change: 0 additions & 1 deletion dedoc/data_structures/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
# noqa
import dedoc.data_structures.concrete_annotations as annotations
from .annotation import Annotation
from .attached_file import AttachedFile
Expand Down
30 changes: 3 additions & 27 deletions dedoc/data_structures/annotation.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
from collections import OrderedDict

from flask_restx import Api, Model, fields

from dedoc.api.schema.annotation import Annotation as ApiAnnotation
from dedoc.data_structures.serializable import Serializable


Expand Down Expand Up @@ -40,26 +37,5 @@ def __str__(self) -> str:
def __repr__(self) -> str:
    """
    Return a short debug form: the capitalized annotation name followed by a literal "(...)".
    Field values (start, end, value) are not included.
    """
    return f"{self.name.capitalize()}(...)"

def to_dict(self) -> dict:
res = OrderedDict()
res["start"] = self.start
res["end"] = self.end
res["name"] = self.name
res["value"] = self.value
return res

@staticmethod
def get_api_dict(api: Api) -> Model:
names = [
"style", "bold", "italic", "underlined", "size", "indentation", "alignment", "table",
"attachment", "spacing", "strike", "subscript", "superscript"
]
return api.model("Annotation", {
"start": fields.Integer(description="annotation start index", required=True, example=0),
"end": fields.Integer(description="annotation end index", required=True, example=4),
"name": fields.String(description="annotation name", required=True, example="bold", enum=names),
"value": fields.String(description="annotation value. For example, it may be font size value for size type "
"or type of alignment for alignment type",
required=True,
example="left")
})
def to_api_schema(self) -> ApiAnnotation:
    """
    Convert this annotation into its API schema counterpart.

    :return: ``ApiAnnotation`` built from this object's ``start``, ``end``, ``name`` and ``value``
    """
    field_values = {"start": self.start, "end": self.end, "name": self.name, "value": self.value}
    return ApiAnnotation(**field_values)
28 changes: 7 additions & 21 deletions dedoc/data_structures/cell_with_meta.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
from collections import OrderedDict
from typing import List

import numpy as np
from flask_restx import Api, Model, fields

from dedoc.data_structures import Annotation
from dedoc.api.schema.cell_with_meta import CellWithMeta as ApiCellWithMeta
from dedoc.data_structures.annotation import Annotation
from dedoc.data_structures.line_with_meta import LineWithMeta
from dedoc.data_structures.serializable import Serializable


class CellWithMeta:
class CellWithMeta(Serializable):
"""
This class holds the information about the cell: list of lines and cell properties (rowspan, colspan, invisible).
"""
Expand Down Expand Up @@ -43,20 +43,6 @@ def get_annotations(self) -> List[Annotation]:
def create_from_cell(cell: "Cell") -> "CellWithMeta": # noqa
return CellWithMeta(lines=cell.lines, colspan=cell.colspan, rowspan=cell.rowspan, invisible=cell.invisible)

def to_dict(self) -> dict:
res = OrderedDict()

res["lines"] = [line.to_dict() for line in self.lines]
res["colspan"] = int(np.int8(self.colspan))
res["rowspan"] = int(np.int8(self.rowspan))
res["invisible"] = self.invisible
return res

@staticmethod
def get_api_dict(api: Api) -> Model:
return api.model("CellWithMeta", {
"colspan": fields.Integer(description="attribute of union column count"),
"rowspan": fields.Integer(description="attribute of union row count"),
"invisible": fields.Boolean(description='flag for cell display (for example: if invisible==true then style="display: none")'),
"lines": fields.List(fields.Nested(LineWithMeta.get_api_dict(api), description="Text annotations (font, size, bold, italic and etc)")),
})
def to_api_schema(self) -> ApiCellWithMeta:
    """
    Convert this cell into its API schema counterpart.

    :return: ``ApiCellWithMeta`` whose lines are converted through each line's own ``to_api_schema``
    """
    lines = [line.to_api_schema() for line in self.lines]
    # Plain int() already turns numpy integers into Python ints. The former np.int8 round-trip could
    # only represent -128..127, so a span above 127 was wrapped or rejected by numpy.
    return ApiCellWithMeta(lines=lines, colspan=int(self.colspan), rowspan=int(self.rowspan), invisible=self.invisible)
10 changes: 0 additions & 10 deletions dedoc/data_structures/concrete_annotations/alignment_annotation.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
from flask_restx import Api, Model, fields

from dedoc.data_structures.annotation import Annotation


Expand All @@ -19,11 +17,3 @@ def __init__(self, start: int, end: int, value: str) -> None:
if value not in ["left", "right", "both", "center"]:
raise ValueError("the value of alignment annotation should be left, right, both, or center")
super().__init__(start=start, end=end, name=AlignmentAnnotation.name, value=value)

@staticmethod
def get_api_dict(api: Api) -> Model:
return api.model("AlignmentAnnotation", {
"start": fields.Integer(description="annotation start index", required=True, example=0),
"end": fields.Integer(description="annotation end index", required=True, example=4),
"value": fields.String(description="alignment of the text", required=True, example="left", enum=AlignmentAnnotation.valid_values)
})
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
from flask_restx import Api, Model, fields

from dedoc.data_structures.annotation import Annotation


Expand All @@ -18,9 +16,3 @@ def __init__(self, attach_uid: str, start: int, end: int) -> None:
:param end: end of the annotated text (usually end of the line)
"""
super().__init__(start=start, end=end, name=AttachAnnotation.name, value=attach_uid)

@staticmethod
def get_api_dict(api: Api) -> Model:
return api.model("AttachAnnotation", {
"value": fields.String(description="ref to attachment", required=True, example="attach fafffa145agh")
})
11 changes: 0 additions & 11 deletions dedoc/data_structures/concrete_annotations/bbox_annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from typing import Tuple

from dedocutils.data_structures import BBox
from flask_restx import Api, Model, fields

from dedoc.data_structures.annotation import Annotation

Expand Down Expand Up @@ -37,13 +36,3 @@ def get_bbox_from_value(value: str) -> Tuple[BBox, int, int]:
width=int(bbox_dict["width"] * bbox_dict["page_width"]),
height=int(bbox_dict["height"] * bbox_dict["page_height"]))
return bbox, bbox_dict["page_width"], bbox_dict["page_height"]

@staticmethod
def get_api_dict(api: Api) -> Model:
return api.model("BBoxAnnotation", {
"start": fields.Integer(description="annotation start index", required=True, example=0),
"end": fields.Integer(description="annotation end index", required=True, example=4),
"value": fields.String(description="bounding box of text chunk",
required=True,
example='{"x_top_left": 0, "y_top_left": 0, "width": 0.5, "height": 0.2, "page_width": 1000, "page_height": 400}')
})
Loading

0 comments on commit e7c1067

Please sign in to comment.