diff --git a/dedoc/api/web/index.html b/dedoc/api/web/index.html index 4362832a..ede62117 100644 --- a/dedoc/api/web/index.html +++ b/dedoc/api/web/index.html @@ -122,7 +122,7 @@
diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py index 8b635ad7..64d96fe6 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py @@ -3,6 +3,7 @@ from numpy import ndarray +from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable @@ -53,6 +54,9 @@ def __init__(self, *, config: Optional[dict] = None) -> None: self.binarizer = AdaptiveBinarizer() self.ocr = OCRLineExtractor(config=self.config) + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: + return super().read(file_path, parameters) + def _process_one_page(self, image: ndarray, parameters: ParametersForParseDoc, diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py index d277815b..4cebbaf4 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py @@ -3,6 +3,7 @@ from dedocutils.data_structures import BBox from numpy import ndarray +from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable @@ -37,6 +38,9 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, from dedoc.utils.parameter_utils import get_param_pdf_with_txt_layer return 
super().can_read(file_path=file_path, mime=mime, extension=extension) and get_param_pdf_with_txt_layer(parameters) == "true" + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: + return super().read(file_path, parameters) + def _process_one_page(self, image: ndarray, parameters: ParametersForParseDoc, diff --git a/docs/source/_static/code_examples/test_dir/example.docx b/docs/source/_static/code_examples/test_dir/example.docx index 45ae9a18..e41ef095 100644 Binary files a/docs/source/_static/code_examples/test_dir/example.docx and b/docs/source/_static/code_examples/test_dir/example.docx differ diff --git a/docs/source/_static/gost_frame_data/document_with_gost_frame.pdf b/docs/source/_static/gost_frame_data/document_with_gost_frame.pdf new file mode 100644 index 00000000..295df746 Binary files /dev/null and b/docs/source/_static/gost_frame_data/document_with_gost_frame.pdf differ diff --git a/docs/source/_static/gost_frame_data/page_with_gost_frame_1.png b/docs/source/_static/gost_frame_data/page_with_gost_frame_1.png new file mode 100644 index 00000000..dd62facd Binary files /dev/null and b/docs/source/_static/gost_frame_data/page_with_gost_frame_1.png differ diff --git a/docs/source/_static/gost_frame_data/page_with_gost_frame_2.png b/docs/source/_static/gost_frame_data/page_with_gost_frame_2.png new file mode 100644 index 00000000..272fb522 Binary files /dev/null and b/docs/source/_static/gost_frame_data/page_with_gost_frame_2.png differ diff --git a/docs/source/_static/gost_frame_data/result_gost_frame.png b/docs/source/_static/gost_frame_data/result_gost_frame.png new file mode 100644 index 00000000..30611a31 Binary files /dev/null and b/docs/source/_static/gost_frame_data/result_gost_frame.png differ diff --git a/docs/source/conf.py b/docs/source/conf.py index edd4500c..5ffa1b1a 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -38,6 +38,7 @@ ("py:class", "abc.ABC"), ("py:class", 
"pydantic.main.BaseModel"), ("py:class", "scipy.stats._multivariate.dirichlet_multinomial_gen.cov"), + ("py:class", "scipy.stats._multivariate.random_table_gen.rvs"), ("py:class", "pandas.core.series.Series"), ("py:class", "numpy.ndarray"), ("py:class", "pandas.core.frame.DataFrame"), diff --git a/docs/source/parameters/gost_frame_handling.rst b/docs/source/parameters/gost_frame_handling.rst new file mode 100644 index 00000000..7093b799 --- /dev/null +++ b/docs/source/parameters/gost_frame_handling.rst @@ -0,0 +1,66 @@ +.. _gost_frame_handling: + +GOST frame handling +==================== + +.. flat-table:: Parameters for GOST frame handling + :widths: 5 5 3 15 72 + :header-rows: 1 + :class: tight-table + + * - Parameter + - Possible values + - Default value + - Where can be used + - Description + + * - need_gost_frame_analysis + - True, False + - False + - * :meth:`dedoc.DedocManager.parse` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfTabbyReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read`, :meth:`dedoc.readers.PdfImageReader.read` + * :meth:`dedoc.readers.ReaderComposition.read` + - This option is used to enable GOST (Russian government standard "ГОСТ Р 21.1101") frame recognition for PDF documents or images. + + +The content of each page of some technical documents is placed in special GOST frames. An example of GOST frames is shown in the example below (:ref:`example_gost_frame`). +Such frames contain meta-information and are not part of the text content of the document. Based on this, we have implemented the functionality for ignoring GOST frames in documents, which works for: + + * Copyable PDF documents (:class:`dedoc.readers.PdfTxtlayerReader` and :class:`dedoc.readers.PdfTabbyReader`); + * Non-copyable PDF documents and Images (:class:`dedoc.readers.PdfImageReader`). + +If parameter ``need_gost_frame_analysis=True``, the GOST frame itself is ignored and only the contents inside the frame are extracted. + +.. 
_example_gost_frame: + +Examples of GOST frame +---------------------- +For example, suppose you send a :download:`PDF document with two pages <../_static/gost_frame_data/document_with_gost_frame.pdf>`: + +.. image:: ../_static/gost_frame_data/page_with_gost_frame_1.png + :width: 30% +.. image:: ../_static/gost_frame_data/page_with_gost_frame_2.png + :width: 30% + +Parameter's usage +----------------- + +.. code-block:: python + + import requests + + data = { + "pdf_with_text_layer": "auto_tabby", + "need_gost_frame_analysis": "true", + "return_format": "html" + } + with open(filename, "rb") as file: + files = {"file": (filename, file)} + r = requests.post("http://localhost:1231/upload", files=files, data=data) + result = r.content.decode("utf-8") + +Request's result +---------------- + +.. image:: ../_static/gost_frame_data/result_gost_frame.png + :width: 50% diff --git a/docs/source/parameters/pdf_handling.rst b/docs/source/parameters/pdf_handling.rst index d8788089..20fabec9 100644 --- a/docs/source/parameters/pdf_handling.rst +++ b/docs/source/parameters/pdf_handling.rst @@ -62,7 +62,7 @@ PDF and images handling - rus, eng, rus+eng, fra, spa - rus+eng - * :meth:`dedoc.DedocManager.parse` - * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfImageReader.read` * :meth:`dedoc.readers.ReaderComposition.read` * :meth:`dedoc.structure_extractors.FintocStructureExtractor.extract` - Language of the document without a textual layer. 
The following values are available: @@ -77,7 +77,7 @@ PDF and images handling - :, start:, :end, start:end - : - * :meth:`dedoc.DedocManager.parse` - * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read`, :meth:`dedoc.readers.PdfTabbyReader.read` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfImageReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read`, :meth:`dedoc.readers.PdfTabbyReader.read` * :meth:`dedoc.readers.ReaderComposition.read` - If you need to read a part of the PDF document, you can use page slice to define the reading range. If the range is set like ``start_page:end_page``, document will be processed from ``start_page`` to ``end_page`` @@ -96,7 +96,7 @@ PDF and images handling - true, false, auto - auto - * :meth:`dedoc.DedocManager.parse` - * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfImageReader.read` * :meth:`dedoc.readers.ReaderComposition.read` - This option is used to set the number of columns if the PDF document is without a textual layer in case it's known beforehand. The following values are available: @@ -111,7 +111,7 @@ PDF and images handling - auto, no_change - auto - * :meth:`dedoc.DedocManager.parse` - * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfImageReader.read` * :meth:`dedoc.readers.ReaderComposition.read` - This option is used to control document orientation analysis for PDF documents without a textual layer. 
The following values are available: @@ -125,7 +125,7 @@ PDF and images handling - True, False - False - * :meth:`dedoc.DedocManager.parse` - * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfImageReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read` * :meth:`dedoc.readers.ReaderComposition.read` - This option is used to **remove** headers and footers of PDF documents from the output result. If ``need_header_footer_analysis=False``, header and footer lines will present in the output as well as all other document lines. @@ -134,7 +134,7 @@ PDF and images handling - True, False - False - * :meth:`dedoc.DedocManager.parse` - * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfImageReader.read` * :meth:`dedoc.readers.ReaderComposition.read` - This option is used to clean background (binarize) for pages of PDF documents without a textual layer. If the document's background is heterogeneous, this option may help to improve the result of document text recognition. @@ -144,7 +144,7 @@ PDF and images handling - True, False - True - * :meth:`dedoc.DedocManager.parse` - * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read`, :meth:`dedoc.readers.PdfImageReader.read` * :meth:`dedoc.readers.ReaderComposition.read` - This option is used to enable table recognition for PDF documents or images. The table recognition method is used in :class:`dedoc.readers.PdfImageReader` and :class:`dedoc.readers.PdfTxtlayerReader`. 
@@ -155,18 +155,17 @@ PDF and images handling - True, False - False - * :meth:`dedoc.DedocManager.parse` - * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfTabbyReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read`, :meth:`dedoc.readers.PdfImageReader.read` * :meth:`dedoc.readers.ReaderComposition.read` - This option is used to enable GOST (Russian government standard) frame recognition for PDF documents or images. - The GOST frame recognizer is used in :meth:`dedoc.readers.PdfBaseReader.read`. Its main function is to recognize and - ignore the GOST frame on the document. It allows :class:`dedoc.readers.PdfImageReader` and :class:`dedoc.readers.PdfTxtlayerReader` - to properly process the content of the document containing GOST frame. + It allows :class:`dedoc.readers.PdfImageReader`, :class:`dedoc.readers.PdfTxtlayerReader` and :class:`dedoc.readers.PdfTabbyReader` + to properly process the content of the document containing GOST frame, see :ref:`gost_frame_handling` for more details. * - orient_analysis_cells - True, False - False - * :meth:`dedoc.DedocManager.parse` - * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read`, :meth:`dedoc.readers.PdfImageReader.read` * :meth:`dedoc.readers.ReaderComposition.read` - This option is used for a table recognition for PDF documents or images. It is ignored when ``need_pdf_table_analysis=False``. 
@@ -177,7 +176,7 @@ PDF and images handling - 90, 270 - 90 - * :meth:`dedoc.DedocManager.parse` - * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read`, :meth:`dedoc.readers.PdfImageReader.read` * :meth:`dedoc.readers.ReaderComposition.read` - This option is used for a table recognition for PDF documents or images. It is ignored when ``need_pdf_table_analysis=False`` or ``orient_analysis_cells=False``. @@ -185,3 +184,9 @@ PDF and images handling * **270** -- cells are rotated 90 degrees clockwise; * **90** -- cells are rotated 90 degrees counterclockwise (or 270 clockwise). + + +.. toctree:: + :maxdepth: 1 + + gost_frame_handling \ No newline at end of file