diff --git a/dedoc/api/dedoc_api.py b/dedoc/api/dedoc_api.py index f2dcc520..ee730ab1 100644 --- a/dedoc/api/dedoc_api.py +++ b/dedoc/api/dedoc_api.py @@ -14,6 +14,7 @@ import dedoc from dedoc.api.api_args import QueryParameters from dedoc.api.api_utils import json2collapsed_tree, json2html, json2tree, json2txt +from dedoc.api.schema.parsed_document import ParsedDocument from dedoc.common.exceptions.dedoc_error import DedocError from dedoc.common.exceptions.missing_file_error import MissingFileError from dedoc.config import get_config @@ -60,7 +61,7 @@ def _get_static_file_path(request: Request) -> str: return os.path.abspath(os.path.join(directory, file)) -@app.post("/upload") +@app.post("/upload", response_model=ParsedDocument) async def upload(file: UploadFile = File(...), query_params: QueryParameters = Depends()) -> Response: # noqa parameters = dataclasses.asdict(query_params) if not file or file.filename == "": @@ -81,15 +82,15 @@ async def upload(file: UploadFile = File(...), query_params: QueryParameters = D html_content = json2tree(paragraph=document_tree.content.structure) return HTMLResponse(content=html_content) elif return_format == "ujson": - return UJSONResponse(content=document_tree.to_dict()) + return UJSONResponse(content=document_tree.to_api_schema().model_dump()) elif return_format == "collapsed_tree": html_content = json2collapsed_tree(paragraph=document_tree.content.structure) return HTMLResponse(content=html_content) elif return_format == "pretty_json": - return PlainTextResponse(content=json.dumps(document_tree.to_dict(), ensure_ascii=False, indent=2)) + return PlainTextResponse(content=json.dumps(document_tree.to_api_schema().model_dump(), ensure_ascii=False, indent=2)) else: logger.info(f"Send result. 
File {file.filename} with parameters {parameters}") - return ORJSONResponse(content=document_tree.to_dict()) + return ORJSONResponse(content=document_tree.to_api_schema().model_dump()) @app.get("/upload_example") @@ -100,7 +101,7 @@ async def upload_example(file_name: str, return_format: Optional[str] = None) -> if return_format == "html": return HTMLResponse(content=json2html(text="", paragraph=document_tree.content.structure, tables=document_tree.content.tables, tabs=0)) - return ORJSONResponse(content=document_tree.to_dict(), status_code=200) + return ORJSONResponse(content=document_tree.to_api_schema().model_dump(), status_code=200) @app.exception_handler(DedocError) diff --git a/dedoc/api/models/__init__.py b/dedoc/api/models/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/dedoc/api/models/custom_fields.py b/dedoc/api/models/custom_fields.py deleted file mode 100644 index b3796873..00000000 --- a/dedoc/api/models/custom_fields.py +++ /dev/null @@ -1,27 +0,0 @@ -from typing import Optional, TypeVar -from weakref import WeakSet - -from flask_restx import fields - -T = TypeVar("T") - - -class AnyNotNullField(fields.Raw): - __schema_type__ = "any" - - def format(self, value: T) -> Optional[T]: # noqa - if not isinstance(value, WeakSet): - return value - - -class ForbiddenField(fields.Raw): - __schema_type__ = "any" - - def format(self, value: T) -> None: # noqa - return - - -# ==================== Wildcard fields ======================= - -wild_any_fields = fields.Wildcard(AnyNotNullField, description="other fields", skip_none=False, allow_null=False) -wild_forbid_fields = fields.Wildcard(ForbiddenField, description="forbidden fields for output") diff --git a/dedoc/api/schema/__init__.py b/dedoc/api/schema/__init__.py new file mode 100644 index 00000000..d724e445 --- /dev/null +++ b/dedoc/api/schema/__init__.py @@ -0,0 +1,13 @@ +from .annotation import Annotation +from .cell_with_meta import CellWithMeta +from .document_content import 
DocumentContent +from .document_metadata import DocumentMetadata +from .line_metadata import LineMetadata +from .line_with_meta import LineWithMeta +from .parsed_document import ParsedDocument +from .table import Table +from .table_metadata import TableMetadata +from .tree_node import TreeNode + +__all__ = ["Annotation", "CellWithMeta", "DocumentContent", "DocumentMetadata", "LineMetadata", "LineWithMeta", "ParsedDocument", "Table", "TableMetadata", + "TreeNode"] diff --git a/dedoc/api/schema/annotation.py b/dedoc/api/schema/annotation.py new file mode 100644 index 00000000..9add75dd --- /dev/null +++ b/dedoc/api/schema/annotation.py @@ -0,0 +1,12 @@ +from pydantic import BaseModel, Field + + +class Annotation(BaseModel): + """ + The piece of information about the text line: it's appearance or links to another document object. + For example Annotation(1, 13, "italic", "True") says that text between 1st and 13th symbol was written in italic. + """ + start: int = Field(description="Start of the annotated text", example=0) + end: int = Field(description="End of the annotated text (end isn't included)", example=5) + name: str = Field(description="Annotation name", example="italic") + value: str = Field(description="Annotation value. For example, it may be font size value for size type", example="True") diff --git a/dedoc/api/schema/cell_with_meta.py b/dedoc/api/schema/cell_with_meta.py new file mode 100644 index 00000000..efeb0fdf --- /dev/null +++ b/dedoc/api/schema/cell_with_meta.py @@ -0,0 +1,15 @@ +from typing import List + +from pydantic import BaseModel, Field + +from dedoc.api.schema.line_with_meta import LineWithMeta + + +class CellWithMeta(BaseModel): + """ + Holds the information about the cell: list of lines and cell properties (rowspan, colspan, invisible). 
+ """ + lines: List[LineWithMeta] = Field(description="Textual lines of the cell with annotations") + rowspan: int = Field(description="Number of rows to span like in HTML format", example=1) + colspan: int = Field(description="Number of columns to span like in HTML format", example=2) + invisible: bool = Field(description="Indicator for displaying or hiding cell text", example=False) diff --git a/dedoc/api/schema/document_content.py b/dedoc/api/schema/document_content.py new file mode 100644 index 00000000..5127650e --- /dev/null +++ b/dedoc/api/schema/document_content.py @@ -0,0 +1,14 @@ +from typing import List + +from pydantic import BaseModel, Field + +from dedoc.api.schema.table import Table +from dedoc.api.schema.tree_node import TreeNode + + +class DocumentContent(BaseModel): + """ + Content of the document - structured text and tables. + """ + structure: TreeNode = Field(description="Tree structure where content of the document is organized") + tables: List[Table] = Field(description="List of document tables") diff --git a/dedoc/api/schema/document_metadata.py b/dedoc/api/schema/document_metadata.py new file mode 100644 index 00000000..a68f3cce --- /dev/null +++ b/dedoc/api/schema/document_metadata.py @@ -0,0 +1,20 @@ +from typing import Optional + +from pydantic import BaseModel, ConfigDict, Field + + +class DocumentMetadata(BaseModel): + """ + Document metadata like its name, size, author, etc. 
+ """ + model_config = ConfigDict(extra="allow") + + uid: str = Field(description="Document unique identifier (useful for attached files)", example="doc_uid_auto_ba73d76a-326a-11ec-8092-417272234cb0") + file_name: str = Field(description="Original document name before rename and conversion", example="example.odt") + temporary_file_name: str = Field(description="File name during parsing (unique name after rename and conversion)", example="123.odt") + size: int = Field(description="File size in bytes", example=20060) + modified_time: int = Field(description="Modification time of the document in the UnixTime format", example=1590579805) + created_time: int = Field(description="Creation time of the document in the UnixTime format", example=1590579805) + access_time: int = Field(description="File access time in the UnixTime format", example=1590579805) + file_type: str = Field(description="Mime type of the file", example="application/vnd.oasis.opendocument.text") + other_fields: Optional[dict] = Field(description="Other optional fields") diff --git a/dedoc/api/schema/line_metadata.py b/dedoc/api/schema/line_metadata.py new file mode 100644 index 00000000..2f5126f1 --- /dev/null +++ b/dedoc/api/schema/line_metadata.py @@ -0,0 +1,15 @@ +from typing import Optional + +from pydantic import BaseModel, ConfigDict, Field + + +class LineMetadata(BaseModel): + """ + Holds information about document node/line metadata, such as page number or line type. 
+ """ + model_config = ConfigDict(extra="allow") + + paragraph_type: str = Field(description="Type of the document line/paragraph (header, list_item, list) and etc.", example="raw_text") + page_id: int = Field(description="Page number of the line/paragraph beginning", example=0) + line_id: Optional[int] = Field(description="Line number", example=1) + other_fields: Optional[dict] = Field(description="Some other fields") diff --git a/dedoc/api/schema/line_with_meta.py b/dedoc/api/schema/line_with_meta.py new file mode 100644 index 00000000..1c155ab5 --- /dev/null +++ b/dedoc/api/schema/line_with_meta.py @@ -0,0 +1,13 @@ +from typing import List + +from pydantic import BaseModel, Field + +from dedoc.api.schema.annotation import Annotation + + +class LineWithMeta(BaseModel): + """ + Textual line with text annotations. + """ + text: str = Field(description="Text of the line", example="Some text") + annotations: List[Annotation] = Field(description="Text annotations (font, size, bold, italic and etc)") diff --git a/dedoc/api/schema/parsed_document.py b/dedoc/api/schema/parsed_document.py new file mode 100644 index 00000000..076540a4 --- /dev/null +++ b/dedoc/api/schema/parsed_document.py @@ -0,0 +1,17 @@ +from typing import List + +from pydantic import BaseModel, Field + +from dedoc.api.schema.document_content import DocumentContent +from dedoc.api.schema.document_metadata import DocumentMetadata + + +class ParsedDocument(BaseModel): + """ + Holds information about the document content, metadata and attachments. 
+ """ + content: DocumentContent = Field(description="Document text and tables") + metadata: DocumentMetadata = Field(description="Document metadata such as size, creation date and so on") + version: str = Field(description="Version of the program that parsed this document", example="0.9.1") + warnings: List[str] = Field(description="List of warnings and possible errors, arising in the process of document parsing") + attachments: List["ParsedDocument"] = Field(description="Result of analysis of attached files - list of `ParsedDocument`") diff --git a/dedoc/api/schema/table.py b/dedoc/api/schema/table.py new file mode 100644 index 00000000..52b2b59c --- /dev/null +++ b/dedoc/api/schema/table.py @@ -0,0 +1,16 @@ +from typing import List + +from pydantic import BaseModel, Field + +from dedoc.api.schema.cell_with_meta import CellWithMeta +from dedoc.api.schema.table_metadata import TableMetadata + + +class Table(BaseModel): + """ + Holds information about tables in the document. + We assume that a table has rectangle form (has the same number of columns in each row). + Table representation is row-based i.e. external list contains list of rows. + """ + cells: List[List[CellWithMeta]] = Field(description="List of lists of table cells (cell has text, colspan and rowspan attributes)") + metadata: TableMetadata = Field(description="Table meta information") diff --git a/dedoc/api/schema/table_metadata.py b/dedoc/api/schema/table_metadata.py new file mode 100644 index 00000000..779af066 --- /dev/null +++ b/dedoc/api/schema/table_metadata.py @@ -0,0 +1,12 @@ +from typing import Optional + +from pydantic import BaseModel, Field + + +class TableMetadata(BaseModel): + """ + Holds the information about table unique identifier, rotation angle (if table has been rotated - for images) and so on. 
+ """ + page_id: Optional[int] = Field(description="Number of the page where the table starts", example=0) + uid: str = Field(description="Unique identifier of the table", example="e8ba5523-8546-4804-898c-2f4835a1804f") + rotated_angle: float = Field(description="Value of the rotation angle (in degrees) by which the table was rotated during recognition", example=1.0) diff --git a/dedoc/api/schema/tree_node.py b/dedoc/api/schema/tree_node.py new file mode 100644 index 00000000..5eeedd42 --- /dev/null +++ b/dedoc/api/schema/tree_node.py @@ -0,0 +1,20 @@ +from typing import List + +from pydantic import BaseModel, Field + +from dedoc.api.schema.annotation import Annotation +from dedoc.api.schema.line_metadata import LineMetadata + + +class TreeNode(BaseModel): + """ + Helps to represent document as recursive tree structure. + It has list of children `TreeNode` nodes (empty list for a leaf node). + """ + node_id: str = Field(description="Document element identifier. It is unique within a document content tree. 
" + "The identifier consists of numbers separated by dots where each number " + "means node's number among nodes with the same level in the document hierarchy.)", example="0.2.1") + text: str = Field(description="Text of the node", example="Some text") + annotations: List[Annotation] = Field(description="Some metadata related to the part of the text (as font size)") + metadata: LineMetadata = Field(description="Metadata for the entire node (as node type)") + subparagraphs: List["TreeNode"] = Field(description="List of children of this node, each child is `TreeNode`") diff --git a/dedoc/data_structures/__init__.py b/dedoc/data_structures/__init__.py index d22a6435..a51e7d68 100644 --- a/dedoc/data_structures/__init__.py +++ b/dedoc/data_structures/__init__.py @@ -1,4 +1,3 @@ -# noqa import dedoc.data_structures.concrete_annotations as annotations from .annotation import Annotation from .attached_file import AttachedFile diff --git a/dedoc/data_structures/annotation.py b/dedoc/data_structures/annotation.py index 23c27937..2820f5a4 100644 --- a/dedoc/data_structures/annotation.py +++ b/dedoc/data_structures/annotation.py @@ -1,7 +1,4 @@ -from collections import OrderedDict - -from flask_restx import Api, Model, fields - +from dedoc.api.schema.annotation import Annotation as ApiAnnotation from dedoc.data_structures.serializable import Serializable @@ -40,26 +37,5 @@ def __str__(self) -> str: def __repr__(self) -> str: return f"{self.name.capitalize()}(...)" - def to_dict(self) -> dict: - res = OrderedDict() - res["start"] = self.start - res["end"] = self.end - res["name"] = self.name - res["value"] = self.value - return res - - @staticmethod - def get_api_dict(api: Api) -> Model: - names = [ - "style", "bold", "italic", "underlined", "size", "indentation", "alignment", "table", - "attachment", "spacing", "strike", "subscript", "superscript" - ] - return api.model("Annotation", { - "start": fields.Integer(description="annotation start index", required=True, example=0), 
- "end": fields.Integer(description="annotation end index", required=True, example=4), - "name": fields.String(description="annotation name", required=True, example="bold", enum=names), - "value": fields.String(description="annotation value. For example, it may be font size value for size type " - "or type of alignment for alignment type", - required=True, - example="left") - }) + def to_api_schema(self) -> ApiAnnotation: + return ApiAnnotation(start=self.start, end=self.end, name=self.name, value=self.value) diff --git a/dedoc/data_structures/cell_with_meta.py b/dedoc/data_structures/cell_with_meta.py index 20d5deb3..f544e9e7 100644 --- a/dedoc/data_structures/cell_with_meta.py +++ b/dedoc/data_structures/cell_with_meta.py @@ -1,14 +1,14 @@ -from collections import OrderedDict from typing import List import numpy as np -from flask_restx import Api, Model, fields -from dedoc.data_structures import Annotation +from dedoc.api.schema.cell_with_meta import CellWithMeta as ApiCellWithMeta +from dedoc.data_structures.annotation import Annotation from dedoc.data_structures.line_with_meta import LineWithMeta +from dedoc.data_structures.serializable import Serializable -class CellWithMeta: +class CellWithMeta(Serializable): """ This class holds the information about the cell: list of lines and cell properties (rowspan, colspan, invisible). 
""" @@ -43,20 +43,6 @@ def get_annotations(self) -> List[Annotation]: def create_from_cell(cell: "Cell") -> "CellWithMeta": # noqa return CellWithMeta(lines=cell.lines, colspan=cell.colspan, rowspan=cell.rowspan, invisible=cell.invisible) - def to_dict(self) -> dict: - res = OrderedDict() - - res["lines"] = [line.to_dict() for line in self.lines] - res["colspan"] = int(np.int8(self.colspan)) - res["rowspan"] = int(np.int8(self.rowspan)) - res["invisible"] = self.invisible - return res - - @staticmethod - def get_api_dict(api: Api) -> Model: - return api.model("CellWithMeta", { - "colspan": fields.Integer(description="attribute of union column count"), - "rowspan": fields.Integer(description="attribute of union row count"), - "invisible": fields.Boolean(description='flag for cell display (for example: if invisible==true then style="display: none")'), - "lines": fields.List(fields.Nested(LineWithMeta.get_api_dict(api), description="Text annotations (font, size, bold, italic and etc)")), - }) + def to_api_schema(self) -> ApiCellWithMeta: + lines = [line.to_api_schema() for line in self.lines] + return ApiCellWithMeta(lines=lines, colspan=int(np.int8(self.colspan)), rowspan=int(np.int8(self.rowspan)), invisible=self.invisible) diff --git a/dedoc/data_structures/concrete_annotations/alignment_annotation.py b/dedoc/data_structures/concrete_annotations/alignment_annotation.py index 615f8786..1f194e7a 100644 --- a/dedoc/data_structures/concrete_annotations/alignment_annotation.py +++ b/dedoc/data_structures/concrete_annotations/alignment_annotation.py @@ -1,5 +1,3 @@ -from flask_restx import Api, Model, fields - from dedoc.data_structures.annotation import Annotation @@ -19,11 +17,3 @@ def __init__(self, start: int, end: int, value: str) -> None: if value not in ["left", "right", "both", "center"]: raise ValueError("the value of alignment annotation should be left, right, both, or center") super().__init__(start=start, end=end, name=AlignmentAnnotation.name, value=value) - 
- @staticmethod - def get_api_dict(api: Api) -> Model: - return api.model("AlignmentAnnotation", { - "start": fields.Integer(description="annotation start index", required=True, example=0), - "end": fields.Integer(description="annotation end index", required=True, example=4), - "value": fields.String(description="alignment of the text", required=True, example="left", enum=AlignmentAnnotation.valid_values) - }) diff --git a/dedoc/data_structures/concrete_annotations/attach_annotation.py b/dedoc/data_structures/concrete_annotations/attach_annotation.py index 6b276cbc..7c34be22 100644 --- a/dedoc/data_structures/concrete_annotations/attach_annotation.py +++ b/dedoc/data_structures/concrete_annotations/attach_annotation.py @@ -1,5 +1,3 @@ -from flask_restx import Api, Model, fields - from dedoc.data_structures.annotation import Annotation @@ -18,9 +16,3 @@ def __init__(self, attach_uid: str, start: int, end: int) -> None: :param end: end of the annotated text (usually end of the line) """ super().__init__(start=start, end=end, name=AttachAnnotation.name, value=attach_uid) - - @staticmethod - def get_api_dict(api: Api) -> Model: - return api.model("AttachAnnotation", { - "value": fields.String(description="ref to attachment", required=True, example="attach fafffa145agh") - }) diff --git a/dedoc/data_structures/concrete_annotations/bbox_annotation.py b/dedoc/data_structures/concrete_annotations/bbox_annotation.py index f33566e5..c08f359a 100644 --- a/dedoc/data_structures/concrete_annotations/bbox_annotation.py +++ b/dedoc/data_structures/concrete_annotations/bbox_annotation.py @@ -2,7 +2,6 @@ from typing import Tuple from dedocutils.data_structures import BBox -from flask_restx import Api, Model, fields from dedoc.data_structures.annotation import Annotation @@ -37,13 +36,3 @@ def get_bbox_from_value(value: str) -> Tuple[BBox, int, int]: width=int(bbox_dict["width"] * bbox_dict["page_width"]), height=int(bbox_dict["height"] * bbox_dict["page_height"])) return bbox, 
bbox_dict["page_width"], bbox_dict["page_height"] - - @staticmethod - def get_api_dict(api: Api) -> Model: - return api.model("BBoxAnnotation", { - "start": fields.Integer(description="annotation start index", required=True, example=0), - "end": fields.Integer(description="annotation end index", required=True, example=4), - "value": fields.String(description="bounding box of text chunk", - required=True, - example='{"x_top_left": 0, "y_top_left": 0, "width": 0.5, "height": 0.2, "page_width": 1000, "page_height": 400}') - }) diff --git a/dedoc/data_structures/concrete_annotations/bold_annotation.py b/dedoc/data_structures/concrete_annotations/bold_annotation.py index 871ab166..71167b9a 100644 --- a/dedoc/data_structures/concrete_annotations/bold_annotation.py +++ b/dedoc/data_structures/concrete_annotations/bold_annotation.py @@ -1,5 +1,3 @@ -from flask_restx import Api, Model, fields - from dedoc.data_structures.annotation import Annotation @@ -21,11 +19,3 @@ def __init__(self, start: int, end: int, value: str) -> None: except ValueError: raise ValueError("the value of bold annotation should be True or False") super().__init__(start=start, end=end, name=BoldAnnotation.name, value=value) - - @staticmethod - def get_api_dict(api: Api) -> Model: - return api.model("BoldAnnotation", { - "start": fields.Integer(description="annotation start index", required=True, example=0), - "end": fields.Integer(description="annotation end index", required=True, example=4), - "value": fields.String(description="indicator if the text is bold or not", required=True, example="True", enum=BoldAnnotation.valid_values) - }) diff --git a/dedoc/data_structures/concrete_annotations/confidence_annotation.py b/dedoc/data_structures/concrete_annotations/confidence_annotation.py index b7b7ad65..f0cec1e6 100644 --- a/dedoc/data_structures/concrete_annotations/confidence_annotation.py +++ b/dedoc/data_structures/concrete_annotations/confidence_annotation.py @@ -1,5 +1,3 @@ -from flask_restx import 
Api, Model, fields - from dedoc.data_structures.annotation import Annotation @@ -22,11 +20,3 @@ def __init__(self, start: int, end: int, value: str) -> None: except AssertionError: raise ValueError("the value of confidence annotation should be in range [0, 1]") super().__init__(start=start, end=end, name=ConfidenceAnnotation.name, value=value, is_mergeable=False) - - @staticmethod - def get_api_dict(api: Api) -> Model: - return api.model("BoldAnnotation", { - "start": fields.Integer(description="annotation start index", required=True, example=0), - "end": fields.Integer(description="annotation end index", required=True, example=4), - "value": fields.String(description="confidence value", required=True, example="95") - }) diff --git a/dedoc/data_structures/concrete_annotations/indentation_annotation.py b/dedoc/data_structures/concrete_annotations/indentation_annotation.py index 4ecbfd16..2736a2bb 100644 --- a/dedoc/data_structures/concrete_annotations/indentation_annotation.py +++ b/dedoc/data_structures/concrete_annotations/indentation_annotation.py @@ -1,5 +1,3 @@ -from flask_restx import Api, Model, fields - from dedoc.data_structures.annotation import Annotation @@ -21,11 +19,3 @@ def __init__(self, start: int, end: int, value: str) -> None: except ValueError: raise ValueError("the value of indentation annotation should be a number") super().__init__(start=start, end=end, name=IndentationAnnotation.name, value=value) - - @staticmethod - def get_api_dict(api: Api) -> Model: - return api.model("IndentationAnnotation", { - "start": fields.Integer(description="annotation start index", required=True, example=0), - "end": fields.Integer(description="annotation end index", required=True, example=4), - "value": fields.String(description="text indentation in twentieths of a point (1/1440 of an inch)", required=True, example="720") - }) diff --git a/dedoc/data_structures/concrete_annotations/italic_annotation.py 
b/dedoc/data_structures/concrete_annotations/italic_annotation.py index 0cfc83a1..db8b02b0 100644 --- a/dedoc/data_structures/concrete_annotations/italic_annotation.py +++ b/dedoc/data_structures/concrete_annotations/italic_annotation.py @@ -1,5 +1,3 @@ -from flask_restx import Api, Model, fields - from dedoc.data_structures.annotation import Annotation @@ -21,11 +19,3 @@ def __init__(self, start: int, end: int, value: str) -> None: except ValueError: raise ValueError("the value of italic annotation should be True or False") super().__init__(start=start, end=end, name=ItalicAnnotation.name, value=value) - - @staticmethod - def get_api_dict(api: Api) -> Model: - return api.model("ItalicAnnotation", { - "start": fields.Integer(description="annotation start index", required=True, example=0), - "end": fields.Integer(description="annotation end index", required=True, example=4), - "value": fields.String(description="indicator if the text is italic or not", required=True, example="True", enum=ItalicAnnotation.valid_values) - }) diff --git a/dedoc/data_structures/concrete_annotations/linked_text_annotation.py b/dedoc/data_structures/concrete_annotations/linked_text_annotation.py index 9bd9228e..3c62b8f2 100644 --- a/dedoc/data_structures/concrete_annotations/linked_text_annotation.py +++ b/dedoc/data_structures/concrete_annotations/linked_text_annotation.py @@ -1,5 +1,3 @@ -from flask_restx import Api, Model, fields - from dedoc.data_structures.annotation import Annotation @@ -17,11 +15,3 @@ def __init__(self, start: int, end: int, value: str) -> None: :param value: text, linked to given one, for example text of the footnote """ super().__init__(start=start, end=end, name=LinkedTextAnnotation.name, value=value) - - @staticmethod - def get_api_dict(api: Api) -> Model: - return api.model("LinkedTextAnnotation", { - "start": fields.Integer(description="annotation start index", required=True, example=0), - "end": fields.Integer(description="annotation end index", 
required=True, example=4), - "value": fields.String(description="text, linked to given, for example text of the footnote", required=True) - }) diff --git a/dedoc/data_structures/concrete_annotations/size_annotation.py b/dedoc/data_structures/concrete_annotations/size_annotation.py index c82c1df0..edb44dae 100644 --- a/dedoc/data_structures/concrete_annotations/size_annotation.py +++ b/dedoc/data_structures/concrete_annotations/size_annotation.py @@ -1,5 +1,3 @@ -from flask_restx import Api, Model, fields - from dedoc.data_structures.annotation import Annotation @@ -21,11 +19,3 @@ def __init__(self, start: int, end: int, value: str) -> None: except ValueError: raise ValueError("the value of size annotation should be a number") super().__init__(start=start, end=end, name=SizeAnnotation.name, value=value) - - @staticmethod - def get_api_dict(api: Api) -> Model: - return api.model("SizeAnnotation", { - "start": fields.Integer(description="annotation start index", required=True, example=0), - "end": fields.Integer(description="annotation end index", required=True, example=4), - "value": fields.String(description="the size of the text in points (1/72 of an inch)", required=True, example="18.5") - }) diff --git a/dedoc/data_structures/concrete_annotations/spacing_annotation.py b/dedoc/data_structures/concrete_annotations/spacing_annotation.py index ba0c4e1b..d13e1bd5 100644 --- a/dedoc/data_structures/concrete_annotations/spacing_annotation.py +++ b/dedoc/data_structures/concrete_annotations/spacing_annotation.py @@ -1,5 +1,3 @@ -from flask_restx import Api, Model, fields - from dedoc.data_structures.annotation import Annotation @@ -21,14 +19,3 @@ def __init__(self, start: int, end: int, value: str) -> None: except ValueError: raise ValueError(f"the value of spacing annotation should be a number, get {value}") super().__init__(start=start, end=end, name=SpacingAnnotation.name, value=value) - - @staticmethod - def get_api_dict(api: Api) -> Model: - return 
api.model("SpacingAnnotation", { - "start": fields.Integer(description="annotation start index", required=True, example=0), - "end": fields.Integer(description="annotation end index", required=True, example=4), - "value": fields.String(description="spacing between the current line and the previous one in " - "twentieths of a point or one hundredths of a line", - required=True, - example="240") - }) diff --git a/dedoc/data_structures/concrete_annotations/strike_annotation.py b/dedoc/data_structures/concrete_annotations/strike_annotation.py index 25cc9806..56a3f133 100644 --- a/dedoc/data_structures/concrete_annotations/strike_annotation.py +++ b/dedoc/data_structures/concrete_annotations/strike_annotation.py @@ -1,5 +1,3 @@ -from flask_restx import Api, Model, fields - from dedoc.data_structures.annotation import Annotation @@ -21,14 +19,3 @@ def __init__(self, start: int, end: int, value: str) -> None: except ValueError: raise ValueError("the value of strike annotation should be True or False") super().__init__(start=start, end=end, name=StrikeAnnotation.name, value=value) - - @staticmethod - def get_api_dict(api: Api) -> Model: - return api.model("StrikeAnnotation", { - "start": fields.Integer(description="annotation start index", required=True, example=0), - "end": fields.Integer(description="annotation end index", required=True, example=4), - "value": fields.String(description="indicator if the text is strikethrough or not", - required=True, - example="True", - enum=StrikeAnnotation.valid_values) - }) diff --git a/dedoc/data_structures/concrete_annotations/style_annotation.py b/dedoc/data_structures/concrete_annotations/style_annotation.py index 234750a3..a692a54d 100644 --- a/dedoc/data_structures/concrete_annotations/style_annotation.py +++ b/dedoc/data_structures/concrete_annotations/style_annotation.py @@ -1,5 +1,3 @@ -from flask_restx import Api, Model, fields - from dedoc.data_structures.annotation import Annotation @@ -17,11 +15,3 @@ def __init__(self, 
start: int, end: int, value: str) -> None: :param value: style name of the text procured from the document formatting if exist (e.g. Heading 1) """ super().__init__(start=start, end=end, name=StyleAnnotation.name, value=value) - - @staticmethod - def get_api_dict(api: Api) -> Model: - return api.model("StyleAnnotation", { - "start": fields.Integer(description="annotation start index", required=True, example=0), - "end": fields.Integer(description="annotation end index", required=True, example=4), - "value": fields.String(description="style name", required=True, example="heading 1") - }) diff --git a/dedoc/data_structures/concrete_annotations/subscript_annotation.py b/dedoc/data_structures/concrete_annotations/subscript_annotation.py index db3edbfe..b6c54d84 100644 --- a/dedoc/data_structures/concrete_annotations/subscript_annotation.py +++ b/dedoc/data_structures/concrete_annotations/subscript_annotation.py @@ -1,5 +1,3 @@ -from flask_restx import Api, Model, fields - from dedoc.data_structures.annotation import Annotation @@ -21,14 +19,3 @@ def __init__(self, start: int, end: int, value: str) -> None: except ValueError: raise ValueError("the value of subscript annotation should be True or False") super().__init__(start=start, end=end, name=SubscriptAnnotation.name, value=value) - - @staticmethod - def get_api_dict(api: Api) -> Model: - return api.model("SuperscriptAnnotation", { - "start": fields.Integer(description="annotation start index", required=True, example=0), - "end": fields.Integer(description="annotation end index", required=True, example=4), - "value": fields.String(description="indicator if the text is subscript ($a_1$ in tex) or not", - required=True, - example="True", - enum=SubscriptAnnotation.valid_values) - }) diff --git a/dedoc/data_structures/concrete_annotations/superscript_annotation.py b/dedoc/data_structures/concrete_annotations/superscript_annotation.py index 98611918..05d55b5b 100644 --- 
a/dedoc/data_structures/concrete_annotations/superscript_annotation.py +++ b/dedoc/data_structures/concrete_annotations/superscript_annotation.py @@ -1,5 +1,3 @@ -from flask_restx import Api, Model, fields - from dedoc.data_structures.annotation import Annotation @@ -21,14 +19,3 @@ def __init__(self, start: int, end: int, value: str) -> None: except ValueError: raise ValueError("the value of superscript annotation should be True or False") super().__init__(start=start, end=end, name=SuperscriptAnnotation.name, value=value) - - @staticmethod - def get_api_dict(api: Api) -> Model: - return api.model("SuperscriptAnnotation", { - "start": fields.Integer(description="annotation start index", required=True, example=0), - "end": fields.Integer(description="annotation end index", required=True, example=4), - "value": fields.String(description="indicator if the text is superscript ($a^1$ in tex) or not", - required=True, - example="True", - enum=SuperscriptAnnotation.valid_values) - }) diff --git a/dedoc/data_structures/concrete_annotations/table_annotation.py b/dedoc/data_structures/concrete_annotations/table_annotation.py index 8842a84d..8575d564 100644 --- a/dedoc/data_structures/concrete_annotations/table_annotation.py +++ b/dedoc/data_structures/concrete_annotations/table_annotation.py @@ -1,5 +1,3 @@ -from flask_restx import Api, Model, fields - from dedoc.data_structures.annotation import Annotation @@ -17,9 +15,3 @@ def __init__(self, name: str, start: int, end: int) -> None: :param end: end of the annotated text (usually end of the line) """ super().__init__(start=start, end=end, name=TableAnnotation.name, value=name) - - @staticmethod - def get_api_dict(api: Api) -> Model: - return api.model("TableAnnotation", { - "value": fields.String(description="ref to table", required=True, example="table fafffa145agh") - }) diff --git a/dedoc/data_structures/concrete_annotations/underlined_annotation.py b/dedoc/data_structures/concrete_annotations/underlined_annotation.py 
index e77e397c..c5cff271 100644 --- a/dedoc/data_structures/concrete_annotations/underlined_annotation.py +++ b/dedoc/data_structures/concrete_annotations/underlined_annotation.py @@ -1,5 +1,3 @@ -from flask_restx import Api, Model, fields - from dedoc.data_structures.annotation import Annotation @@ -21,14 +19,3 @@ def __init__(self, start: int, end: int, value: str) -> None: except ValueError: raise ValueError("the value of underlined annotation should be True or False") super().__init__(start=start, end=end, name=UnderlinedAnnotation.name, value=value) - - @staticmethod - def get_api_dict(api: Api) -> Model: - return api.model("UnderlinedAnnotation", { - "start": fields.Integer(description="annotation start index", required=True, example=0), - "end": fields.Integer(description="annotation end index", required=True, example=4), - "value": fields.String(description="indicator if the text is underlined or not", - required=True, - example="True", - enum=UnderlinedAnnotation.valid_values) - }) diff --git a/dedoc/data_structures/document_content.py b/dedoc/data_structures/document_content.py index 3cf7c1bb..ad4fa81e 100644 --- a/dedoc/data_structures/document_content.py +++ b/dedoc/data_structures/document_content.py @@ -1,8 +1,6 @@ -from collections import OrderedDict from typing import List -from flask_restx import Api, Model, fields - +from dedoc.api.schema.document_content import DocumentContent as ApiDocumentContent from dedoc.data_structures.serializable import Serializable from dedoc.data_structures.table import Table from dedoc.data_structures.tree_node import TreeNode @@ -22,15 +20,7 @@ def __init__(self, tables: List[Table], structure: TreeNode, warnings: List[str] self.structure = structure self.warnings = warnings if warnings is not None else [] - def to_dict(self) -> dict: - res = OrderedDict() - res["structure"] = self.structure.to_dict() - res["tables"] = [table.to_dict() for table in self.tables] - return res - - @staticmethod - def get_api_dict(api: 
Api) -> Model: - return api.model("DocumentContent", { - "structure": fields.Nested(TreeNode.get_api_dict(api), readonly=True, description="document content structure"), - "tables": fields.List(fields.Nested(Table.get_api_dict(api), description="tables structure")) - }) + def to_api_schema(self) -> ApiDocumentContent: + structure = self.structure.to_api_schema() + tables = [table.to_api_schema() for table in self.tables] + return ApiDocumentContent(structure=structure, tables=tables) diff --git a/dedoc/data_structures/document_metadata.py b/dedoc/data_structures/document_metadata.py index 67be8956..134ba6a4 100644 --- a/dedoc/data_structures/document_metadata.py +++ b/dedoc/data_structures/document_metadata.py @@ -1,9 +1,6 @@ import uuid -from collections import OrderedDict -from flask_restx import Api, Model, fields - -from dedoc.api.models.custom_fields import wild_any_fields +from dedoc.api.schema.document_metadata import DocumentMetadata as ApiDocumentMetadata from dedoc.data_structures.serializable import Serializable @@ -61,32 +58,11 @@ def extend_other_fields(self, new_fields: dict) -> None: setattr(self, key, value) self.other_fields[key] = value - def to_dict(self) -> dict: - res = OrderedDict() - res["uid"] = self.uid - res["file_name"] = self.file_name - res["temporary_file_name"] = self.temporary_file_name - res["size"] = self.size - res["modified_time"] = self.modified_time - res["created_time"] = self.created_time - res["access_time"] = self.access_time - res["file_type"] = self.file_type + def to_api_schema(self) -> ApiDocumentMetadata: + api_document_metadata = ApiDocumentMetadata(uid=self.uid, file_name=self.file_name, temporary_file_name=self.temporary_file_name, size=self.size, + modified_time=self.modified_time, created_time=self.created_time, access_time=self.access_time, + file_type=self.file_type, other_fields=self.other_fields) if self.other_fields is not None: for (key, value) in self.other_fields.items(): - res[key] = value - 
res["other_fields"] = self.other_fields - return res - - @staticmethod - def get_api_dict(api: Api) -> Model: - return api.model("DocumentMetadata", { - "uid": fields.String(description="unique document identifier", example="doc_uid_auto_ba73d76a-326a-11ec-8092-417272234cb0"), - "file_name": fields.String(description="file name", example="example.odt"), - "temporary_file_name": fields.String(description="file name", example="123.odt"), - "size": fields.Integer(description="file size in bytes", example="20060"), - "modified_time": fields.Integer(description="modification time of the document in the format UnixTime", example="1590579805"), - "created_time": fields.Integer(description="creation time of the document in the format UnixTime", example="1590579805"), - "access_time": fields.Integer(description="file access time in format UnixTime", example="1590579805"), - "file_type": fields.String(description="mime-type file", example="application/vnd.oasis.opendocument.text"), - "[a-z]*": wild_any_fields - }) + setattr(api_document_metadata, key, value) + return api_document_metadata diff --git a/dedoc/data_structures/line_metadata.py b/dedoc/data_structures/line_metadata.py index ccca5c0a..504c5110 100644 --- a/dedoc/data_structures/line_metadata.py +++ b/dedoc/data_structures/line_metadata.py @@ -1,9 +1,6 @@ -from collections import OrderedDict from typing import Optional -from flask_restx import Api, Model, fields - -from dedoc.api.models.custom_fields import wild_any_fields, wild_forbid_fields +from dedoc.api.schema.line_metadata import LineMetadata as ApiLineMetadata from dedoc.data_structures.hierarchy_level import HierarchyLevel from dedoc.data_structures.serializable import Serializable @@ -50,24 +47,9 @@ def extend_other_fields(self, new_fields: dict) -> None: setattr(self, key, value) self.__other_fields[key] = value - def to_dict(self) -> dict: - res = OrderedDict() - res["page_id"] = self.page_id - res["line_id"] = self.line_id - res["paragraph_type"] = 
self.hierarchy_level.line_type if self.hierarchy_level is not None else HierarchyLevel.raw_text - res["other_fields"] = self.__other_fields + def to_api_schema(self) -> ApiLineMetadata: + paragraph_type = self.hierarchy_level.line_type if self.hierarchy_level is not None else HierarchyLevel.raw_text + api_line_metadata = ApiLineMetadata(page_id=self.page_id, line_id=self.line_id, paragraph_type=paragraph_type, other_fields=self.__other_fields) for key, value in self.__other_fields.items(): - res[key] = value - return res - - @staticmethod - def get_api_dict(api: Api) -> Model: - return api.model("LineMetadata", { - "paragraph_type": fields.String(description="paragraph type (header, list_item, list) and etc.", required=True, example="header"), - "page_id": fields.Integer(description="page number of begin paragraph", required=False, example=0), - "line_id": fields.Integer(description="line number of begin paragraph", required=True, example=13), - "_*": wild_forbid_fields, # don't get private fields - "tag_hierarchy_level": wild_forbid_fields, - "hierarchy_level": wild_forbid_fields, - "[a-z]*": wild_any_fields - }) + setattr(api_line_metadata, key, value) + return api_line_metadata diff --git a/dedoc/data_structures/line_with_meta.py b/dedoc/data_structures/line_with_meta.py index 5f68cd1d..2d906cf7 100644 --- a/dedoc/data_structures/line_with_meta.py +++ b/dedoc/data_structures/line_with_meta.py @@ -1,10 +1,8 @@ import re -from collections import OrderedDict from typing import List, Optional, Sized, Union from uuid import uuid1 -from flask_restx import Api, Model, fields - +from dedoc.api.schema.line_with_meta import LineWithMeta as ApiLineWithMeta from dedoc.data_structures.annotation import Annotation from dedoc.data_structures.line_metadata import LineMetadata from dedoc.data_structures.serializable import Serializable @@ -155,16 +153,6 @@ def __add__(self, other: Union["LineWithMeta", str]) -> "LineWithMeta": annotations = 
AnnotationMerger().merge_annotations(self.annotations + other_annotations, text=line) return LineWithMeta(line=line, metadata=self._metadata, annotations=annotations, uid=self.uid) - def to_dict(self) -> dict: - res = OrderedDict() - res["text"] = self._line - res["annotations"] = [annotation.to_dict() for annotation in self.annotations] - - return res - - @staticmethod - def get_api_dict(api: Api) -> Model: - return api.model("LineWithMeta", { - "text": fields.String(description="line's text"), - "annotations": fields.List(fields.Nested(Annotation.get_api_dict(api), description="Text annotations (font, size, bold, italic and etc)")), - }) + def to_api_schema(self) -> ApiLineWithMeta: + annotations = [annotation.to_api_schema() for annotation in self.annotations] + return ApiLineWithMeta(text=self._line, annotations=annotations) diff --git a/dedoc/data_structures/parsed_document.py b/dedoc/data_structures/parsed_document.py index f4a05710..9b671e04 100644 --- a/dedoc/data_structures/parsed_document.py +++ b/dedoc/data_structures/parsed_document.py @@ -1,9 +1,7 @@ -from collections import OrderedDict from typing import List, Optional -from flask_restx import Api, Model, fields - import dedoc +from dedoc.api.schema.parsed_document import ParsedDocument as ApiParsedDocument from dedoc.data_structures.document_content import DocumentContent from dedoc.data_structures.document_metadata import DocumentMetadata from dedoc.data_structures.serializable import Serializable @@ -11,7 +9,7 @@ class ParsedDocument(Serializable): """ - This class hold information about the document content, metadata and attachments. + This class holds information about the document content, metadata and attachments. 
""" def __init__(self, metadata: DocumentMetadata, @@ -37,26 +35,8 @@ def add_attachments(self, new_attachment: List["ParsedDocument"]) -> None: def set_metadata(self, metadata: DocumentMetadata) -> None: self.metadata = metadata - def to_dict(self, depth: int = 0) -> dict: - res = OrderedDict() - res["version"] = dedoc.__version__ - res["warnings"] = self.warnings - res["content"] = self.content.to_dict() if self.content is not None else [] - res["metadata"] = self.metadata.to_dict() - res["attachments"] = [attachment.to_dict(depth=depth + 1) for attachment in self.attachments] if self.attachments is not None and depth < 10 else [] - - return res - - @staticmethod - def get_api_dict(api: Api, depth: int = 0, name: str = "ParsedDocument") -> Model: - return api.model(name, { - "content": fields.Nested(DocumentContent.get_api_dict(api), description="Document content structure"), - "metadata": fields.Nested(DocumentMetadata.get_api_dict(api), allow_null=False, skip_none=True, description="Document meta information"), - "version": fields.String(description="the version of the program that parsed this document", example="0.9.1"), - "warnings": fields.List(fields.String(description="list of warnings and possible errors", example="DOCX: seems that document corrupted")), - "attachments": fields.List(fields.Nested(api.model("others_ParsedDocument", {})), description="structure of attachments", required=False) - if depth == 10 # TODO delete this - else fields.List(fields.Nested(ParsedDocument.get_api_dict(api, depth=depth + 1, name="refParsedDocument" + str(depth)), - description="Attachment structure", - required=False)) - }) + def to_api_schema(self) -> ApiParsedDocument: + content = self.content.to_api_schema() + metadata = self.metadata.to_api_schema() + attachments = [attachment.to_api_schema() for attachment in self.attachments] if self.attachments is not None else [] + return ApiParsedDocument(content=content, metadata=metadata, version=dedoc.__version__, 
warnings=self.warnings, attachments=attachments) diff --git a/dedoc/data_structures/serializable.py b/dedoc/data_structures/serializable.py index 9de5d23e..08809394 100644 --- a/dedoc/data_structures/serializable.py +++ b/dedoc/data_structures/serializable.py @@ -1,16 +1,17 @@ from abc import ABC, abstractmethod +from pydantic import BaseModel + class Serializable(ABC): """ - Base class for the serializable objects which we need convert to dict. + Base class for the API schema objects which we later need convert to dict. """ @abstractmethod - def to_dict(self) -> dict: + def to_api_schema(self) -> BaseModel: """ - Convert class data into dictionary representation. - Dictionary key should be string and dictionary value should be json serializable. + Convert class data into the corresponding API schema class. - :return: dict with all class data. + :return: API schema class """ pass diff --git a/dedoc/data_structures/table.py b/dedoc/data_structures/table.py index c92125ea..65ac6d49 100644 --- a/dedoc/data_structures/table.py +++ b/dedoc/data_structures/table.py @@ -1,8 +1,6 @@ -from collections import OrderedDict from typing import List -from flask_restx import Api, Model, fields - +from dedoc.api.schema.table import Table as ApiTable from dedoc.data_structures.cell_with_meta import CellWithMeta from dedoc.data_structures.serializable import Serializable from dedoc.data_structures.table_metadata import TableMetadata @@ -22,15 +20,6 @@ def __init__(self, cells: List[List[CellWithMeta]], metadata: TableMetadata) -> self.metadata = metadata self.cells = cells - def to_dict(self) -> dict: - res = OrderedDict() - res["cells"] = [[cell.to_dict() for cell in row] for row in self.cells] - res["metadata"] = self.metadata.to_dict() - return res - - @staticmethod - def get_api_dict(api: Api) -> Model: - return api.model("Table", { - "cells": fields.List(fields.List(CellWithMeta.get_api_dict(api), description="Cell contains text"), description="matrix of cells"), - "metadata": 
fields.Nested(TableMetadata.get_api_dict(api), readonly=True, description="Table meta information") - }) + def to_api_schema(self) -> ApiTable: + cells = [[cell.to_api_schema() for cell in row] for row in self.cells] + return ApiTable(cells=cells, metadata=self.metadata.to_api_schema()) diff --git a/dedoc/data_structures/table_metadata.py b/dedoc/data_structures/table_metadata.py index a7d9dd6a..a70ab2b4 100644 --- a/dedoc/data_structures/table_metadata.py +++ b/dedoc/data_structures/table_metadata.py @@ -1,9 +1,7 @@ import uuid -from collections import OrderedDict from typing import Optional -from flask_restx import Api, Model, fields - +from dedoc.api.schema.table_metadata import TableMetadata as ApiTableMetadata from dedoc.data_structures.serializable import Serializable @@ -21,17 +19,5 @@ def __init__(self, page_id: Optional[int], uid: Optional[str] = None, rotated_an self.uid = str(uuid.uuid4()) if not uid else uid self.rotated_angle = rotated_angle - def to_dict(self) -> dict: - res = OrderedDict() - res["uid"] = self.uid - res["page_id"] = self.page_id - res["rotated_angle"] = self.rotated_angle - return res - - @staticmethod - def get_api_dict(api: Api) -> Model: - return api.model("TableMetadata", { - "page_id": fields.Integer(readonly=False, description="table start page number"), - "uid": fields.String(description="table unique id"), - "rotated_angle": fields.Float(readonly=False, description="At what angle should the table be rotated to use boxes") - }) + def to_api_schema(self) -> ApiTableMetadata: + return ApiTableMetadata(uid=self.uid, page_id=self.page_id, rotated_angle=self.rotated_angle) diff --git a/dedoc/data_structures/tree_node.py b/dedoc/data_structures/tree_node.py index 595bd89a..9d8ba676 100644 --- a/dedoc/data_structures/tree_node.py +++ b/dedoc/data_structures/tree_node.py @@ -1,8 +1,6 @@ -from collections import OrderedDict from typing import List, Optional -from flask_restx import Api, Model, fields - +from dedoc.api.schema.tree_node 
import TreeNode as ApiTreeNode from dedoc.data_structures.annotation import Annotation from dedoc.data_structures.hierarchy_level import HierarchyLevel from dedoc.data_structures.line_metadata import LineMetadata @@ -38,32 +36,11 @@ def __init__(self, self.subparagraphs = subparagraphs self.parent = parent - def to_dict(self) -> dict: - res = OrderedDict() - res["node_id"] = self.node_id - res["text"] = self.text - res["annotations"] = [annotation.to_dict() for annotation in self.annotations] - res["metadata"] = self.metadata.to_dict() - res["subparagraphs"] = [node.to_dict() for node in self.subparagraphs] - return res - - @staticmethod - def get_api_dict(api: Api, depth: int = 0, name: str = "TreeNode") -> Model: - return api.model(name, { - "node_id": fields.String(description="Document element identifier. It is unique within one tree (i.e. " - "there will be no other such node_id in this tree, but in attachment " - "it may occur) The identifier has the form 0.2.1 where each number " - "means a serial number at the corresponding level of the hierarchy.", - required=True, - example="0.2.1"), - "text": fields.String(description="text of node", required=True, example="Закон"), - "annotations": fields.List(fields.Nested(Annotation.get_api_dict(api), description="Text annotations (font, size, bold, italic and etc)")), - "metadata": fields.Nested(LineMetadata.get_api_dict(api), skip_none=True, allow_null=False, description="Line meta information"), - "subparagraphs": fields.List(fields.Nested(api.model("others_TreeNode", {})), description='Node childes (with type "TreeNode") of structure tree') - if depth == 30 # TODO delete this - else fields.List(fields.Nested(TreeNode.get_api_dict(api, depth=depth + 1, name="refTreeNode" + str(depth))), - description='Node childes (with type "TreeNode") of structure tree') - }) + def to_api_schema(self) -> ApiTreeNode: + annotations = [annotation.to_api_schema() for annotation in self.annotations] + metadata = 
self.metadata.to_api_schema() + subparagraphs = [node.to_api_schema() for node in self.subparagraphs] + return ApiTreeNode(node_id=self.node_id, text=self.text, annotations=annotations, metadata=metadata, subparagraphs=subparagraphs) @staticmethod def create(lines: List[LineWithMeta] = None) -> "TreeNode": diff --git a/dedoc/readers/pdf_reader/data_classes/text_with_bbox.py b/dedoc/readers/pdf_reader/data_classes/text_with_bbox.py index 13ba6150..e5a9b959 100644 --- a/dedoc/readers/pdf_reader/data_classes/text_with_bbox.py +++ b/dedoc/readers/pdf_reader/data_classes/text_with_bbox.py @@ -5,10 +5,9 @@ from dedocutils.data_structures import BBox from dedoc.data_structures.annotation import Annotation -from dedoc.data_structures.serializable import Serializable -class TextWithBBox(Serializable): +class TextWithBBox: def __init__(self, bbox: BBox, diff --git a/dedoc/train_dataset/data_structures/task_item.py b/dedoc/train_dataset/data_structures/task_item.py index 17017803..87308ea0 100644 --- a/dedoc/train_dataset/data_structures/task_item.py +++ b/dedoc/train_dataset/data_structures/task_item.py @@ -1,10 +1,8 @@ from collections import OrderedDict from typing import List, Optional -from dedoc.data_structures.serializable import Serializable - -class TaskItem(Serializable): +class TaskItem: def __init__(self, task_id: int, task_path: str, data: any, labeled: Optional[List[str]], additional_info: str = "", default_label: str = None) -> None: """ @@ -22,7 +20,7 @@ def __init__(self, task_id: int, task_path: str, data: any, labeled: Optional[Li self.additional_info = additional_info self.default_label = default_label - def to_dict(self, old_version: bool = False) -> dict: + def to_dict(self) -> dict: result = OrderedDict() result["id"] = self.task_id result["task_path"] = self.task_path diff --git a/docs/source/_static/code_examples/dedoc_add_new_doc_type_tutorial.py b/docs/source/_static/code_examples/dedoc_add_new_doc_type_tutorial.py index 9a0ff1a5..d23f9967 100644 
--- a/docs/source/_static/code_examples/dedoc_add_new_doc_type_tutorial.py +++ b/docs/source/_static/code_examples/dedoc_add_new_doc_type_tutorial.py @@ -58,7 +58,7 @@ result = manager.parse(file_path=file_path, parameters={"with_attachments": "true"}) result # -result.to_dict() # OrderedDict([('version', '0.11.2'), ('warnings', []), ('content', OrderedDict([('structure', OrderedDict([('node_id', '0'), ... +result.to_api_schema().model_dump() # {'content': {'structure': {'node_id': '0', 'text': '', 'annotations': [], 'metadata': {'paragraph_type': 'root', ... os.remove("test_dir/The_New_Yorker_Case_Study.pdf") [os.remove("test_dir/" + file) for file in os.listdir("test_dir/") if file[-4] == "_"] diff --git a/docs/source/_static/code_examples/dedoc_usage_tutorial.py b/docs/source/_static/code_examples/dedoc_usage_tutorial.py index 3e0df596..52bb29b4 100644 --- a/docs/source/_static/code_examples/dedoc_usage_tutorial.py +++ b/docs/source/_static/code_examples/dedoc_usage_tutorial.py @@ -113,5 +113,4 @@ result = manager.parse(file_path=file_path, parameters={}) result # -result.to_dict() # OrderedDict([('version', ''), ('warnings', []), ('content', OrderedDict([('structure', OrderedDict([('node_id', '0'), ('text', ''), -# ('annotations', []), ('metadata', OrderedDict([('page_id', 0), ('line_id', 0), ('paragraph_type', 'root'), ('other_fields', {})])), ... +result.to_api_schema().model_dump() # {'content': {'structure': {'node_id': '0', 'text': '', 'annotations': [], 'metadata': {'paragraph_type': 'root', ... diff --git a/docs/source/dedoc_api_usage/api.rst b/docs/source/dedoc_api_usage/api.rst index 73888c4c..0a7ee82b 100644 --- a/docs/source/dedoc_api_usage/api.rst +++ b/docs/source/dedoc_api_usage/api.rst @@ -3,8 +3,8 @@ Using dedoc via API =================== -Dedoc can be used as a web application that runs on the `localhost:1231`. -It's possible to change the port via `config.py` file (if you clone the repository and run dedoc as a docker container). 
+Dedoc can be used as a web application that runs on the ``localhost:1231``. +It's possible to change the port via ``config.py`` file (if you clone the repository and run dedoc as a docker container). There are two ways to install and run dedoc as a web application: @@ -20,14 +20,14 @@ There are two ways to install and run dedoc as a web application: Application usage ----------------- -Once you run the dedoc application, you can go to `localhost:1231` and +Once you run the dedoc application, you can go to ``localhost:1231`` and look to the main page with the information about dedoc. From this page you can go to the upload page and manually choose settings for the file parsing. -Then you can get the result after pressing the `upload` button. +Then you can get the result after pressing the ``upload`` button. If you want to use the application in your program, you can send requests e.g. using `requests `_ python library. -Post-requests should be sent to `http://localhost:1231/upload`. +Post-requests should be sent to ``http://localhost:1231/upload``. .. code-block:: python @@ -47,7 +47,7 @@ Post-requests should be sent to `http://localhost:1231/upload`. r = requests.post("http://localhost:1231/upload", files=files, data=data) result = r.content.decode('utf-8') -The `data` dictionary in the example contains some parameters to parse the given file. +The ``data`` dictionary in the example contains some parameters to parse the given file. They are described in the section :ref:`api_parameters`. .. _api_parameters: @@ -119,15 +119,15 @@ Api parameters description - The option to enable attached files extraction. Some documents can have attached files (attachments), e.g. images or videos. Dedoc allows to find attachments of the given file, get their metadata and save them in the directory where the given file is located. - If the option is `false`, all attached files will be ignored. + If the option is ``false``, all attached files will be ignored. 
* - need_content_analysis - true, false - false - The option to enable file's attachments parsing along with the given file. The content of the parsed attachments will be represented as :class:`~dedoc.data_structures.ParsedDocument` - and saved in the specified return format in the `attachments` field (see :ref:`json_format` for examples). - Use `true` value to enable this behaviour. + and saved in the specified return format in the ``attachments`` field (see :ref:`json_format` for examples). + Use ``true`` value to enable this behaviour. * - recursion_deep_attachments - integer value >= 0 @@ -139,8 +139,8 @@ Api parameters description - true, false - false - Attached files can be encoded in base64 and their contents will be saved instead of saving attached file on disk. - The encoded contents will be saved in the attachment's metadata in the `base64_encode` field. - Use `true` value to enable this behaviour. + The encoded contents will be saved in the attachment's metadata in the ``base64_encode`` field. + Use ``true`` value to enable this behaviour. * - attachments_dir - optional string with a valid path @@ -153,26 +153,26 @@ Api parameters description - true, false - true - This option is used for PDF documents which are images with text (PDF without a textual layer). - It is also used for PDF documents when `pdf_with_text_layer` is `true`, `false`, `auto` or `auto_tabby`. - Since costly table recognition methods are used to get tables, you may need to use `need_pdf_table_analysis=false` + It is also used for PDF documents when ``pdf_with_text_layer`` is ``true``, ``false``, ``auto`` or ``auto_tabby``. + Since costly table recognition methods are used to get tables, you may need to use ``need_pdf_table_analysis=false`` to increase parsing speed and get text without tables. 
- If the document has a textual layer, it is recommended to use `pdf_with_text_layer=tabby`, + If the document has a textual layer, it is recommended to use ``pdf_with_text_layer=tabby``, in this case tables will be parsed much easier and faster. * - orient_analysis_cells - true, false - false - This option is used for a table recognition in case of PDF documents without a textual layer - (images, scanned documents or when `pdf_with_text_layer` is `true`, `false` or `auto`). - When set to `true`, it enables analysis of rotated cells in table headers. + (images, scanned documents or when ``pdf_with_text_layer`` is ``true``, ``false`` or ``auto``). + When set to ``true``, it enables analysis of rotated cells in table headers. Use this option if you are sure that the cells of the table header are rotated. * - orient_cell_angle - 90, 270 - 90 - This option is used for a table recognition in case of PDF documents without a textual layer - (images, scanned documents or when `pdf_with_text_layer` is `true`, `false` or `auto`). - It is ignored when `orient_analysis_cells=false`. + (images, scanned documents or when ``pdf_with_text_layer`` is ``true``, ``false`` or ``auto``). + It is ignored when ``orient_analysis_cells=false``. The option is used to set orientation of cells in table headers: * **270** -- cells are rotated 90 degrees clockwise; @@ -188,8 +188,8 @@ Api parameters description * **true** -- use this option if you are sure that the PDF file has a textual layer (its text is copiable). In this case tables will be parsed using table recognition method for documents without a textual layer - (if you set `need_pdf_table_analysis=false` parsing will be faster but tables will be ignored). - It is recommended to use `pdf_with_text_layer=tabby` instead of `pdf_with_text_layer=true`, + (if you set ``need_pdf_table_analysis=false`` parsing will be faster but tables will be ignored). 
+ It is recommended to use ``pdf_with_text_layer=tabby`` instead of ``pdf_with_text_layer=true``, but you can try this option as well. * **false** -- this value forces to use PDF reader for scanned documents (images, PDF without a textual layer) @@ -200,17 +200,17 @@ Api parameters description * **tabby** -- use this option if you are sure that the PDF file has a textual layer (its text is copiable). This option value forces to use PDF reader for documents with a textual layer only, it also allows to extract tables easily and quickly. - The method enabled by this option is much faster than the method enabled by `pdf_with_text_layer=true`. + The method enabled by this option is much faster than the method enabled by ``pdf_with_text_layer=true``. * **auto** -- automatic detection of textual layer presence in the PDF document. - If the document has a textual layer (is copyable), PDF document parsing works like with `need_pdf_table_analysis=true`. - If the document doesn't have a textual layer (it is an image, scanned document), PDF document parsing works like with `need_pdf_table_analysis=false`. - It is recommended to use `pdf_with_text_layer=auto_tabby` instead of `pdf_with_text_layer=auto`, + If the document has a textual layer (is copyable), PDF document parsing works like with ``pdf_with_text_layer=true``. + If the document doesn't have a textual layer (it is an image, scanned document), PDF document parsing works like with ``pdf_with_text_layer=false``. + It is recommended to use ``pdf_with_text_layer=auto_tabby`` instead of ``pdf_with_text_layer=auto``, but you can try this option as well. * **auto_tabby** -- automatic detection of textual layer presence in the PDF document. - If the document has a textual layer (is copyable), PDF document parsing works like with `need_pdf_table_analysis=tabby`. - If the document doesn't have a textual layer (it is an image, scanned document), PDF document parsing works like with `need_pdf_table_analysis=false`.
+ If the document has a textual layer (is copyable), PDF document parsing works like with ``pdf_with_text_layer=tabby``. + If the document doesn't have a textual layer (it is an image, scanned document), PDF document parsing works like with ``pdf_with_text_layer=false``. It is highly recommended to use this option value for any PDF document parsing. * - language @@ -226,17 +226,17 @@ Api parameters description - :, start:, :end, start:end - : - If you need to read a part of the PDF document, you can use page slice to define the reading range. - If the range is set like `start_page:end_page`, document will be processed from `start_page` to `end_page` - (`start_page` to `end_page` are included to the range). + If the range is set like ``start_page:end_page``, document will be processed from ``start_page`` to ``end_page`` + (``start_page`` and ``end_page`` are included in the range). * using **:** means reading all document pages; - * using empty `end` -- **start:** (e.g. 5:) means reading the document from `start` up to the end of the document; - * using empty `start` -- **:end** (e.g. :5) means reading the document from the beginning up to the `end` page; - * using **start:end** means reading document pages from `start` to `end` inclusively. + * using empty ``end`` -- **start:** (e.g. 5:) means reading the document from ``start`` up to the end of the document; + * using empty ``start`` -- **:end** (e.g. :5) means reading the document from the beginning up to the ``end`` page; + * using **start:end** means reading document pages from ``start`` to ``end`` inclusively. - If `start` > `end` or `start` > the number of pages in the document, the empty document will be returned. - If `end` > the number of pages in the document, the document will be read up to its end. - For example, if `1:3` is given, 1, 2 and 3 document pages will be processed. + If ``start`` > ``end`` or ``start`` > the number of pages in the document, the empty document will be returned.
+ If ``end`` > the number of pages in the document, the document will be read up to its end. + For example, if ``1:3`` is given, document pages 1, 2 and 3 will be processed. * - is_one_column_document - true, false, auto @@ -248,7 +248,7 @@ Api parameters description * **false** -- the document is multi-column (two columns parsing is supported); * **auto** -- automatic detection of the number of columns in the document. - If you are not sure about the number of columns in the documents you need to parse, it is recommended to use `auto`. + If you are not sure about the number of columns in the documents you need to parse, it is recommended to use ``auto``. * - document_orientation - auto, no_change @@ -259,20 +259,20 @@ Api parameters description * **auto** -- automatic detection of rotated document pages (rotation angle 0, 90, 180, 270 degrees) and rotation of document pages; * **no_change** -- parse document pages as they are without rotated pages detection. - If you are sure that the documents you need to parse consist of vertical (not rotated) pages, you can use `no_change`. + If you are sure that the documents you need to parse consist of vertical (not rotated) pages, you can use ``no_change``. * - need_header_footer_analysis - true, false - false - This option is used to **remove** headers and footers of PDF documents from the output result. - If `need_header_footer_analysis=false`, header and footer lines will present in the output as well as all other document lines. + If ``need_header_footer_analysis=false``, header and footer lines will be present in the output as well as all other document lines. * - need_binarization - true, false - false - This option is used to clean background (binarize) for pages of PDF documents without a textual layer. If the document's background is heterogeneous, this option may help to improve the result of document text recognition.
- By default `need_binarization=false` because its usage may decrease the quality of the document page (and the recognised text on it). + By default ``need_binarization=false`` because its usage may decrease the quality of the document page (and the recognised text on it). * - :cspan:`3` **Other formats handling** @@ -286,7 +286,7 @@ Api parameters description - any string - None - The encoding of documents of textual formats like TXT, CSV, TSV. - Look `here `_ to get the list of possible values for the `encoding` parameter. + Look `here `_ to get the list of possible values for the ``encoding`` parameter. By default the encoding of the document is detected automatically. * - handle_invisible_table diff --git a/docs/source/dedoc_api_usage/api_schema.rst b/docs/source/dedoc_api_usage/api_schema.rst new file mode 100644 index 00000000..be327ba9 --- /dev/null +++ b/docs/source/dedoc_api_usage/api_schema.rst @@ -0,0 +1,77 @@ +.. _dedoc_api_schema: + +API schema +========== + +The output json format has a strict schema: serialized :class:`~dedoc.api.schema.ParsedDocument` is returned. +The JSON schema of the output is also available at ``http://localhost:1231/docs`` while the dedoc application is running. + +.. autoclass:: dedoc.api.schema.ParsedDocument + + .. autoattribute:: content + .. autoattribute:: metadata + .. autoattribute:: version + .. autoattribute:: warnings + .. autoattribute:: attachments + +.. autoclass:: dedoc.api.schema.DocumentContent + + .. autoattribute:: structure + .. autoattribute:: tables + +.. autoclass:: dedoc.api.schema.DocumentMetadata + + .. autoattribute:: uid + .. autoattribute:: file_name + .. autoattribute:: temporary_file_name + .. autoattribute:: size + .. autoattribute:: modified_time + .. autoattribute:: created_time + .. autoattribute:: access_time + .. autoattribute:: file_type + .. autoattribute:: other_fields + +.. autoclass:: dedoc.api.schema.TreeNode + + .. autoattribute:: node_id + .. autoattribute:: text + ..
autoattribute:: annotations + .. autoattribute:: metadata + .. autoattribute:: subparagraphs + +.. autoclass:: dedoc.api.schema.LineWithMeta + + .. autoattribute:: text + .. autoattribute:: annotations + +.. autoclass:: dedoc.api.schema.LineMetadata + + .. autoattribute:: paragraph_type + .. autoattribute:: page_id + .. autoattribute:: line_id + .. autoattribute:: other_fields + +.. autoclass:: dedoc.api.schema.Table + + .. autoattribute:: cells + .. autoattribute:: metadata + +.. autoclass:: dedoc.api.schema.TableMetadata + + .. autoattribute:: page_id + .. autoattribute:: uid + .. autoattribute:: rotated_angle + +.. autoclass:: dedoc.api.schema.CellWithMeta + + .. autoattribute:: lines + .. autoattribute:: rowspan + .. autoattribute:: colspan + .. autoattribute:: invisible + +.. autoclass:: dedoc.api.schema.Annotation + + .. autoattribute:: start + .. autoattribute:: end + .. autoattribute:: name + .. autoattribute:: value diff --git a/docs/source/dedoc_api_usage/return_format.rst b/docs/source/dedoc_api_usage/return_format.rst index 9665d946..250b7c8a 100644 --- a/docs/source/dedoc_api_usage/return_format.rst +++ b/docs/source/dedoc_api_usage/return_format.rst @@ -21,8 +21,8 @@ The instruction about :ref:`dedoc API running ` may be useful. JSON output format ------------------ -Dedoc allows to get json representation of the class :class:`dedoc.data_structures.ParsedDocument`. -This format is used as a return format by default or if you use `return_format="json"` in the dictionary with API parameters. +Dedoc allows to get json representation of the class :class:`~dedoc.api.schema.ParsedDocument`. +This format is used as a return format by default or if you use ``return_format="json"`` in the dictionary with API parameters. The output structure may vary depending on the other API parameters (see :ref:`table_parameters` for more details). 
Basic example @@ -35,7 +35,7 @@ Let's parse the example file using default parameters: :lines: 9-12 The full :download:`output json file<../_static/json_format_examples/basic_example.json>` contains -serialized class :class:`dedoc.data_structures.ParsedDocument` with its content, tables, metadata and attachments. +serialized class :class:`~dedoc.api.schema.ParsedDocument` with its content, tables, metadata and attachments. The beginning of the document's content: @@ -72,8 +72,8 @@ The document's attachments: :language: json :lines: 706 -As we see, the `attachments` field is empty because the option -`with_attachments` is set to `"false"` by default (see :ref:`table_parameters`). +As we see, the ``attachments`` field is empty because the option +``with_attachments`` is set to ``"false"`` by default (see :ref:`table_parameters`). Example of linear structure type ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -105,7 +105,7 @@ All remaining document lines have the same level as well. Example with attachments ~~~~~~~~~~~~~~~~~~~~~~~~ -Let's parse the example file using `with_attachments` parameter: +Let's parse the example file using ``with_attachments`` parameter: .. literalinclude:: ../_static/code_examples/dedoc_return_format.py :language: python @@ -114,7 +114,7 @@ Let's parse the example file using `with_attachments` parameter: The full :download:`output json file<../_static/json_format_examples/with_attachments.json>` has the same document content, tables and metadata. -Unlike the previous examples, in this case we have `attachments` field filled: +Unlike the previous examples, in this case we have ``attachments`` field filled: .. 
literalinclude:: ../_static/json_format_examples/with_attachments.json :language: json @@ -132,7 +132,7 @@ Let's parse the example file with attachments in base64 format: The full :download:`output json file<../_static/json_format_examples/with_base64_attachments.json>` has the same document content, tables, metadata and filled attachments as the previous example output. -The only difference is in the attachment's metadata: attachment's content is encoded and stored in the `"base64_encode"` field: +The only difference is in the attachment's metadata: attachment's content is encoded and stored in the ``"base64_encode"`` field: .. literalinclude:: ../_static/json_format_examples/with_base64_attachments.json :language: json @@ -149,7 +149,7 @@ Let's parse the example file with attachments and their content: The full :download:`output json file<../_static/json_format_examples/with_parsed_attachments.json>` has the same document content, tables and metadata. -The `attachments` field is filled and attachments are also parsed. +The ``attachments`` field is filled and attachments are also parsed. In the document example the attached image has some text on it, this text has been also parsed and saved in the attachment's content. The beginning of the document's attachments: diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst index 70373dfc..7e9ead61 100644 --- a/docs/source/getting_started/installation.rst +++ b/docs/source/getting_started/installation.rst @@ -33,7 +33,7 @@ still, the docker application should be installed and configured properly. docker-compose up --build -If you need to change some application settings, you may update `config.py` according to your needs and re-build the image. +If you need to change some application settings, you may update ``config.py`` according to your needs and re-build the image. If you don't need to change the application configuration, you may use the built docker image as well. 
@@ -59,7 +59,7 @@ Install dedoc using pip If you don't want to use docker for running the application, it's possible to run dedoc locally. However, it isn't suitable for any operating system (Ubuntu 20+ is recommended) and there may be not enough machine's resources for its work. -You should have `python` (python3.8, python3.9 are recommended) and `pip` installed. +You should have ``python`` (python3.8, python3.9 are recommended) and ``pip`` installed. .. _install_packages: @@ -70,9 +70,9 @@ You should have `python` (python3.8, python3.9 are recommended) and `pip` instal sudo apt-get install -y libreoffice djvulibre-bin unzip unrar -`libreoffice` and `djvulibre-bin` packages are used by converters (doc, odt to docx; xls, ods to xlsx; ppt, odp to pptx; djvu to pdf). +``libreoffice`` and ``djvulibre-bin`` packages are used by converters (doc, odt to docx; xls, ods to xlsx; ppt, odp to pptx; djvu to pdf). If you don't need converters, you can skip this step. -`unzip` and `unrar` packages are used in the process of extracting archives. +``unzip`` and ``unrar`` packages are used in the process of extracting archives. .. _install_tesseract: @@ -105,7 +105,7 @@ to get the example of Tesseract installing for dedoc container or use next comma 3. Install the dedoc library via pip. ************************************* -You need `torch~=1.11.0` and `torchvision~=0.12.0` installed. +You need ``torch~=1.11.0`` and ``torchvision~=0.12.0`` installed. If you already have torch and torchvision in your environment: .. code-block:: bash @@ -124,15 +124,15 @@ Install and run dedoc from sources If you want to run dedoc as a service from sources. it's possible to run dedoc locally. However, it isn't suitable for any operating system (Ubuntu 20+ is recommended) and there may be not enough machine's resources for its work. -You should have `python` (python3.8, python3.9 are recommended) and `pip` installed. 
+You should have ``python`` (python3.8, python3.9 are recommended) and ``pip`` installed. 1. Install necessary packages: according to instructions :ref:`install_packages` 2. Build Tesseract from sources according to instructions :ref:`install_tesseract` -3. We recommend to install python's virtual environment (for example, via `virtualenvwrapper`) +3. We recommend to install python's virtual environment (for example, via ``virtualenvwrapper``) -Below are the instructions for installing the package `virtualenvwrapper`: +Below are the instructions for installing the package ``virtualenvwrapper``: .. code-block:: bash @@ -144,7 +144,7 @@ Below are the instructions for installing the package `virtualenvwrapper`: source ~/.bashrc mkvirtualenv dedoc_env -4. Install python's requirements and launch dedoc service on default port `1231`: +4. Install python's requirements and launch dedoc service on default port ``1231``: .. code-block:: bash @@ -162,7 +162,7 @@ Below are the instructions for installing the package `virtualenvwrapper`: Install trusted torch (verified version) ---------------------------------------------- -You can install a trusted library `torch` (as a verified version of the library, verified by tools developed by the Ivannikov Institute for System Programming of the Russian Academy of Sciences). +You can install a trusted library ``torch`` (as a verified version of the library, verified by tools developed by the Ivannikov Institute for System Programming of the Russian Academy of Sciences). First you need to install two required packages.: @@ -172,13 +172,13 @@ First you need to install two required packages.: Second you need to install torch and torchvision from built wheels: -For `python3.8`: +For ``python3.8``: .. 
code-block:: bash pip install https://github.com/ispras/dedockerfiles/raw/master/wheels/torch-1.11.0a0+git137096a-cp38-cp38-linux_x86_64.whl pip install https://github.com/ispras/dedockerfiles/raw/master/wheels/torchvision-0.12.0a0%2B9b5a3fe-cp38-cp38-linux_x86_64.whl -For `python3.9`: +For ``python3.9``: .. code-block:: bash pip install https://github.com/ispras/dedockerfiles/raw/master/wheels/torch-1.11.0a0+git137096a-cp39-cp39-linux_x86_64.whl diff --git a/docs/source/getting_started/usage.rst b/docs/source/getting_started/usage.rst index 57329ce9..7eea3c3e 100644 --- a/docs/source/getting_started/usage.rst +++ b/docs/source/getting_started/usage.rst @@ -17,21 +17,21 @@ In the context of this tutorial, you'll need to include certain import statement Using converters ---------------- -Assume we have a file :download:`example.odt <../_static/code_examples/test_dir/example.odt>` and we need to convert it to `example.docx` using dedoc library. -For this purpose one can use :class:`dedoc.converters.DocxConverter` class: +Assume we have a file :download:`example.odt <../_static/code_examples/test_dir/example.odt>` and we need to convert it to ``example.docx`` using dedoc library. +For this purpose one can use :class:`~dedoc.converters.DocxConverter` class: .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python :lines: 13 -Method :meth:`dedoc.converters.DocxConverter.can_convert` allows to check if the converter can convert the given file: +Method :meth:`~dedoc.converters.DocxConverter.can_convert` allows to check if the converter can convert the given file: .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python :lines: 15-21 Since we have checked if the converter is able to convert the file, -we can convert it using :meth:`dedoc.converters.DocxConverter.do_convert` method: +we can convert it using :meth:`~dedoc.converters.DocxConverter.do_convert` method: .. 
literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python @@ -60,21 +60,21 @@ Assume we need to parse file :download:`example.docx <../_static/code_examples/t As we see, the file contains text of different styles, two tables and an attached image. -To read the contents of this file in the intermediate representation (see :class:`dedoc.data_structures.UnstructuredDocument`) -one can use :class:`dedoc.converters.DocxReader` class: +To read the contents of this file in the intermediate representation (see :class:`~dedoc.data_structures.UnstructuredDocument`) +one can use :class:`~dedoc.readers.DocxReader` class: .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python :lines: 25 -Method :meth:`dedoc.readers.DocxReader.can_read` allows to check if the reader can parse the given file: +Method :meth:`~dedoc.readers.DocxReader.can_read` allows to check if the reader can parse the given file: .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python :lines: 27-32 Since we have checked if the reader is able to read the file, -we can get its content (:class:`dedoc.data_structures.UnstructuredDocument`) using :meth:`dedoc.readers.DocxReader.read` method: +we can get its content (:class:`~dedoc.data_structures.UnstructuredDocument`) using :meth:`~dedoc.readers.DocxReader.read` method: .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python @@ -86,15 +86,15 @@ Let's save the document in the variable and look at it in more detail: :language: python :lines: 36-38 -As we see, the document object has the following attributes: `lines`, `tables`, `attachments`, `metadata` and `warnings`. +As we see, the document object has the following attributes: ``lines``, ``tables``, ``attachments``, ``metadata`` and ``warnings``.
Document metadata is the empty dict on this stage, because it should be filled by one of the metadata extractors (see :ref:`dedoc_metadata_extractors` and :ref:`using_metadata_extractors`). Document warnings -- the list of strings with some warnings that occurred while document parsing. -So the most useful information is stored in `lines`, `tables` and `attachments`. +So the most useful information is stored in ``lines``, ``tables`` and ``attachments``. Document lines ~~~~~~~~~~~~~~ -The attribute `lines` in the :class:`dedoc.data_structures.UnstructuredDocument` is a list of :class:`dedoc.data_structures.LineWithMeta`. +The attribute ``lines`` in the :class:`~dedoc.data_structures.UnstructuredDocument` is a list of :class:`~dedoc.data_structures.LineWithMeta`. We can get the text of any line: @@ -109,7 +109,7 @@ Also some of the readers can detect line types based of their styles, e.g.: :language: python :lines: 41 -Formatting of each line is stored in the `annotations` attribute: +Formatting of each line is stored in the ``annotations`` attribute: .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python @@ -120,9 +120,9 @@ See :ref:`dedoc_data_structures` to get more information about main classes form Document tables ~~~~~~~~~~~~~~~ -The attribute `tables` in the :class:`dedoc.data_structures.UnstructuredDocument` is a list of :class:`dedoc.data_structures.Table`. +The attribute ``tables`` in the :class:`~dedoc.data_structures.UnstructuredDocument` is a list of :class:`~dedoc.data_structures.Table`. -Each table is represented as a list of table rows, each row is a list of cells with additional metadata :class:`dedoc.data_structures.CellWithMeta`. +Each table is represented as a list of table rows, each row is a list of cells with additional metadata :class:`~dedoc.data_structures.CellWithMeta`. .. 
literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python @@ -153,12 +153,12 @@ The unique identifier links the table with the previous non-empty line in the do :lines: 64-66 In the current example (:ref:`docx_example_image`), the line with the text "Bold, italic, small text." is the first non-empty line -before the first table, so the table uid is linked to this line using :class:`dedoc.data_structures.TableAnnotation`. +before the first table, so the table uid is linked to this line using :class:`~dedoc.data_structures.TableAnnotation`. Document attachments ~~~~~~~~~~~~~~~~~~~~ -The attribute `attachments` in the :class:`dedoc.data_structures.UnstructuredDocument` is a list of :class:`dedoc.data_structures.AttachedFile`. +The attribute ``attachments`` in the :class:`~dedoc.data_structures.UnstructuredDocument` is a list of :class:`~dedoc.data_structures.AttachedFile`. In the :ref:`docx_example_image` there is an image attached to the file: @@ -166,17 +166,17 @@ In the :ref:`docx_example_image` there is an image attached to the file: :language: python :lines: 68-71 -The `tmp_file_path` contains the path to the image saved on disk, +The ``tmp_file_path`` contains the path to the image saved on disk, the image is saved in the same directory as the parent docx file. -The unique identifier of the attachent links it with the previous non-empty line in the document. +The unique identifier of the attachment links it with the previous non-empty line in the document. In our :ref:`docx_example_image` it is a line with text "More text.". .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python :lines: 72-74 -The annotation uid is linked to the line using :class:`dedoc.data_structures.AttachAnnotation`. +The annotation uid is linked to the line using :class:`~dedoc.data_structures.AttachAnnotation`. .. 
_using_metadata_extractors: @@ -185,28 +185,28 @@ Using metadata extractors Continue the example from the :ref:`previous section `. -The reader returned the intermediate representation of the document -- :class:`dedoc.data_structures.UnstructuredDocument`. +The reader returned the intermediate representation of the document -- :class:`~dedoc.data_structures.UnstructuredDocument`. If we need to get some additional information about the file e.g. document subject or author, -we can add some metadata using :class:`dedoc.metadata_extractors.DocxMetadataExtractor`. +we can add some metadata using :class:`~dedoc.metadata_extractors.DocxMetadataExtractor`. .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python :lines: 77 -Method :meth:`dedoc.metadata_extractors.DocxMetadataExtractor.can_extract` allows to check if +Method :meth:`~dedoc.metadata_extractors.DocxMetadataExtractor.can_extract` allows to check if the metadata extractor can extract metadata from the given file: .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python :lines: 78 -To extract metadata, one can add them to the document using :meth:`dedoc.metadata_extractors.DocxMetadataExtractor.add_metadata` method. +To extract metadata, one can add them to the document using :meth:`~dedoc.metadata_extractors.DocxMetadataExtractor.add_metadata` method. .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python :lines: 79-83 -As we see, the attribute `metadata` has been filled with some metadata fields. +As we see, the attribute ``metadata`` has been filled with some metadata fields. The list of common fields for any metadata extractor along with the specific fields for different document formats are enlisted in :ref:`dedoc_metadata_extractors`. 
@@ -217,27 +217,27 @@ In the section :ref:`using_readers` we already got the attachments of the file a If there is a need to extract attachments without reading the whole content of the document, one can use :ref:`dedoc_attachments_extractors`. -For example, in the :ref:`docx_example_image` we can use :class:`dedoc.attachments_extractors.DocxAttachmentsExtractor`. +For example, in the :ref:`docx_example_image` we can use :class:`~dedoc.attachments_extractors.DocxAttachmentsExtractor`. .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python :lines: 87 -Method :meth:`dedoc.attachments_extractors.DocxAttachmentsExtractor.can_extract` allows to check if the attachments extractor can extract attachments from the given file: +Method :meth:`~dedoc.attachments_extractors.DocxAttachmentsExtractor.can_extract` allows to check if the attachments extractor can extract attachments from the given file: .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python :lines: 88 Since we have checked if the extractor can extract attachments from the file, -we can extract them it using :meth:`dedoc.attachments_extractors.DocxAttachmentsExtractor.get_attachments` method: +we can extract them using :meth:`~dedoc.attachments_extractors.DocxAttachmentsExtractor.get_attachments` method: .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python :lines: 89-90 -As we see, attachment extractors return the same list of :class:`dedoc.data_structures.AttachedFile`, -as in the attribute `attachments` of the :class:`dedoc.data_structures.UnstructuredDocument`, +As we see, attachment extractors return the same list of :class:`~dedoc.data_structures.AttachedFile`, +as in the attribute ``attachments`` of the :class:`~dedoc.data_structures.UnstructuredDocument`, that we can get via readers (see :ref:`using_readers`).
See :ref:`dedoc_attachments_extractors` to get more information about available extractors, their methods and parameters. @@ -249,7 +249,7 @@ Using structure extractors -------------------------- After sections :ref:`using_readers` and :ref:`using_metadata_extractors` we got an intermediate representation of the document content and its metadata. -The next step is to extract document structure, i.e. to find the :class:`dedoc.data_structures.HierarchyLevel` for each document line. +The next step is to extract document structure, i.e. to find the :class:`~dedoc.data_structures.HierarchyLevel` for each document line. This class contains information about the type and the level of the line (or its importance in the document). Let's extract the default structure based on the document styles: @@ -258,7 +258,7 @@ Let's extract the default structure based on the document styles: :language: python :lines: 94-97 -As we see, the `hierarchy_level` has been filled. +As we see, the ``hierarchy_level`` has been filled. See :ref:`other_structure` for more details about the default document structure. Use :ref:`dedoc_structure_extractors` to get the information about available structure extractors, their methods and parameters. @@ -268,7 +268,7 @@ Using structure constructors ---------------------------- After we got the document content with hierarchy levels of each line (see :ref:`using_readers`, :ref:`using_metadata_extractors` and :ref:`using_structure_extractors`), -it's possible to make the result class :class:`dedoc.data_structures.ParsedDocument`. +it's possible to make the result class :class:`~dedoc.data_structures.ParsedDocument`. Let's construct the tree structure of the document: @@ -276,14 +276,14 @@ Let's construct the tree structure of the document: :language: python :lines: 101-104 -As we see, parsed document has similar attributes as :class:`dedoc.data_structures.UnstructuredDocument`. 
-The main difference is in the `content` attribute, that contains hierarchical document structure and tables. +As we see, parsed document has similar attributes as :class:`~dedoc.data_structures.UnstructuredDocument`. +The main difference is in the ``content`` attribute, that contains hierarchical document structure and tables. .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python :lines: 106-108 -To get more information about :class:`dedoc.data_structures.ParsedDocument`, :class:`dedoc.data_structures.DocumentContent` +To get more information about :class:`~dedoc.data_structures.ParsedDocument`, :class:`~dedoc.data_structures.DocumentContent` and other classes, that form the output format, see :ref:`dedoc_data_structures`. See :ref:`dedoc_structure_constructors` for the description of available structure constructors and structure types. @@ -298,7 +298,7 @@ one may use manager class (see :ref:`dedoc_manager` for more details). .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 112-117 + :lines: 112-116 Manager allows to run workflow (see :ref:`dedoc_workflow`) for a file of any format supported by dedoc (see :ref:`table_formats`). -One can also make a custom `config` and `manager_config` (parameters of the manager constructor) for more flexible usage of the library. \ No newline at end of file +One can also make a custom ``config`` and ``manager_config`` (parameters of the manager constructor) for more flexible usage of the library. \ No newline at end of file diff --git a/docs/source/index.rst b/docs/source/index.rst index e1a3cfdb..5ebf91ab 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -203,7 +203,7 @@ Currently the following domains can be handled: * Russian thesis for bachelor or master degree (:ref:`structure description `). 
For a document of unknown or unsupported domain there is an option to use default structure extractor -(`document_type=other` at :ref:`api_parameters`), the default document structure described :ref:`here `. +(``document_type=other`` at :ref:`api_parameters`), the default document structure described :ref:`here `. .. toctree:: @@ -226,6 +226,7 @@ For a document of unknown or unsupported domain there is an option to use defaul :caption: Dedoc API usage dedoc_api_usage/api + dedoc_api_usage/api_schema dedoc_api_usage/return_format diff --git a/docs/source/modules/data_structures.rst b/docs/source/modules/data_structures.rst index e4ddcc4f..e8e07812 100644 --- a/docs/source/modules/data_structures.rst +++ b/docs/source/modules/data_structures.rst @@ -12,31 +12,26 @@ Main classes defining a document .. autoclass:: dedoc.data_structures.ParsedDocument :show-inheritance: :special-members: __init__ - :exclude-members: to_dict :members: .. autoclass:: dedoc.data_structures.DocumentContent :show-inheritance: :special-members: __init__ - :exclude-members: to_dict :members: .. autoclass:: dedoc.data_structures.DocumentMetadata :show-inheritance: :special-members: __init__ - :exclude-members: to_dict :members: .. autoclass:: dedoc.data_structures.TreeNode :show-inheritance: :special-members: __init__ - :exclude-members: to_dict :members: .. autoclass:: dedoc.data_structures.LineWithMeta :show-inheritance: - :special-members: __init__ - :exclude-members: to_dict, get_api_dict + :special-members: __init__, __lt__ :members: :undoc-members: line, uid, metadata, annotations @@ -47,30 +42,25 @@ Main classes defining a document .. autoclass:: dedoc.data_structures.LineMetadata :show-inheritance: :special-members: __init__ - :exclude-members: to_dict :members: .. autoclass:: dedoc.data_structures.HierarchyLevel :special-members: __init__, __eq__, __lt__ - :exclude-members: to_dict :members: .. 
autoclass:: dedoc.data_structures.Table :show-inheritance: :special-members: __init__ - :exclude-members: to_dict :members: .. autoclass:: dedoc.data_structures.TableMetadata :show-inheritance: :special-members: __init__ - :exclude-members: to_dict :members: .. autoclass:: dedoc.data_structures.CellWithMeta :show-inheritance: :special-members: __init__ - :exclude-members: to_dict :members: diff --git a/docs/source/readers_output/line_types.rst b/docs/source/readers_output/line_types.rst index 826a4807..d7c42425 100644 --- a/docs/source/readers_output/line_types.rst +++ b/docs/source/readers_output/line_types.rst @@ -4,8 +4,8 @@ Types of textual lines ====================== Each reader returns :class:`~dedoc.data_structures.UnstructuredDocument` with textual lines. -Readers don't fill `hierarchy_level` metadata field (structure extractors do this), but they can fill `hierarchy_level_tag` with information about line types. -Below the readers are enlisted that can return non-empty `hierarchy_level_tag` in document lines metadata: +Readers don't fill ``hierarchy_level`` metadata field (structure extractors do this), but they can fill ``hierarchy_level_tag`` with information about line types. +Below the readers are enlisted that can return non-empty ``hierarchy_level_tag`` in document lines metadata: * `+` means that the reader can return lines of this type. * `-` means that the reader doesn't return lines of this type due to complexity of the task or lack of information provided by the format. diff --git a/docs/source/structure_types/diploma.rst b/docs/source/structure_types/diploma.rst index c39457ba..ccb16d61 100644 --- a/docs/source/structure_types/diploma.rst +++ b/docs/source/structure_types/diploma.rst @@ -100,7 +100,7 @@ The detailed description of each line type: **In pdf documents footnotes are separate nodes with footnote type, but in the formats without strict page separation (docx, html, etc.) 
footnote's text is stored in the annotation of the line referring to this footnote.** - The documentation of the class :class:`dedoc.data_structures.LinkedTextAnnotation` may be helpful. + The documentation of the class :class:`~dedoc.data_structures.LinkedTextAnnotation` may be helpful. * **page_id**: node containing the text with page number. diff --git a/examples/create_structured_document.py b/examples/create_structured_document.py index 907df01a..434563c5 100644 --- a/examples/create_structured_document.py +++ b/examples/create_structured_document.py @@ -7,4 +7,4 @@ structure_constructor = TreeConstructor() parsed_document = structure_constructor.structure_document(document=unstructured_document, structure_type="tree") -print(parsed_document.to_dict()) +print(parsed_document.to_api_schema().model_dump()) diff --git a/examples/example_manager_input.py b/examples/example_manager_input.py index afb9bfff..e43fd0ac 100644 --- a/examples/example_manager_input.py +++ b/examples/example_manager_input.py @@ -10,25 +10,25 @@ # save the result with open("result_docx.json", "w") as outfile: - outfile.write(json.dumps(parsed_docx_document.to_dict())) + outfile.write(json.dumps(parsed_docx_document.to_api_schema().model_dump())) filename_jpg = "example.jpg" parsed_jpg_document = manager.parse(file_path=filename_jpg, parameters={}) # save the result with open("result_jpg.json", "w") as outfile: - outfile.write(json.dumps(parsed_jpg_document.to_dict())) + outfile.write(json.dumps(parsed_jpg_document.to_api_schema().model_dump())) filename_pdf_no_text_layer = "example_without_text_layer.pdf" parsed_pdf_no_text_layer_document = manager.parse(file_path=filename_pdf_no_text_layer, parameters={"pdf_with_text_layer": "false"}) # save the result with open("result_pdf_no_text_layer.json", "w") as outfile: - outfile.write(json.dumps(parsed_pdf_no_text_layer_document.to_dict())) + outfile.write(json.dumps(parsed_pdf_no_text_layer_document.to_api_schema().model_dump())) 
filename_pdf_with_text_layer = "example_with_text_layer.pdf" parsed_pdf_with_text_layer_document = manager.parse(file_path=filename_pdf_with_text_layer, parameters={"pdf_with_text_layer": "true"}) # save the result with open("result_pdf_with_text_layer.json", "w") as outfile: - outfile.write(json.dumps(parsed_pdf_with_text_layer_document.to_dict())) + outfile.write(json.dumps(parsed_pdf_with_text_layer_document.to_api_schema().model_dump())) diff --git a/requirements.txt b/requirements.txt index 9604d42e..e169bf81 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,6 +15,7 @@ pandas>=1.4.1,<=1.9.0 pdf2image==1.10.0 #1.14.0 - there are converting artifacts '№' != '№\n\x0c' pdfminer.six==20211012 piexif==1.1.3 +pydantic>=2.0,<=2.5 pylzma==0.5.0 PyPDF2==1.27.0 pytesseract==0.3.10 @@ -30,16 +31,11 @@ scikit-image>=0.19.3,<=0.21.0 scikit_learn>=1.0.2,<=1.3.0 scipy>=1.8.0,<=1.11.2 six==1.14.0 -starlette>=0.26.1,<0.27.0 +starlette>=0.26.1,<=0.27.0 texttable==1.6.7 ujson>=5.4.0,<=5.8.0 uvicorn>=0.18.0,<=0.23.2 wget==3.2 xgbfir==0.3.1 xgboost>=1.1.1,<1.2.0 -xlrd==1.2.0 -# TODO remove all flask -Werkzeug==2.0.3 -Flask==2.0.3 -flask_cors==3.0.10 -flask-restx==0.5.1 \ No newline at end of file +xlrd==1.2.0 \ No newline at end of file