TLDR-465 pdf miner new params (#356)

* set char_margin to 3 * add pdf miner test script * fix test_pdf_miner script * fix TestApiPdfWithText * add caching * rename test to benchmark * add benchmark script again * change name * change name * Try to fix documentation pipeline * fix benchmark --------- Co-authored-by: Nikita Shevtsov <shevtsov@ispras.ru> Co-authored-by: Nasty <bogatenkova.anastasiya@mail.ru>
ispras · Oct 19, 2023 · 62445da · 62445da
1 parent e7c1067
commit 62445da
Show file tree

Hide file tree

Showing 5 changed files with 89 additions and 3 deletions.
diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml
@@ -19,6 +19,7 @@ jobs:
 
     - name: Install dependencies
       run: |
+        sudo apt-get update  # apt-get (not apt) has a stable CLI for scripts; refresh package lists before installing
         sudo apt-get install -y libreoffice djvulibre-bin poppler-utils tesseract-ocr libtesseract-dev tesseract-ocr-rus tesseract-ocr-eng
         python -m pip install --upgrade --no-cache-dir pip setuptools
         python -m pip install --exists-action=w --no-cache-dir -r requirements.txt

diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py
@@ -142,7 +142,7 @@ def __get_image(path: str, page_num: int) -> np.ndarray:
 
     def __get_interpreter(self) -> Tuple[PDFPageAggregator, PDFPageInterpreter]:
         rsrcmgr = PDFResourceManager()
-        laparams = LAParams(line_margin=1.5, line_overlap=0.5, boxes_flow=0.5, word_margin=0.1, detect_vertical=False)  # TODO find the best parameters
+        laparams = LAParams(line_margin=1.5, line_overlap=0.5, boxes_flow=0.5, word_margin=0.1, char_margin=3, detect_vertical=False)
         device = PDFPageAggregator(rsrcmgr, laparams=laparams)
         interpreter = PDFPageInterpreter(rsrcmgr, device)
         return device, interpreter

diff --git a/dedoc/scripts/benchmark_pdf_miner.py b/dedoc/scripts/benchmark_pdf_miner.py
@@ -0,0 +1,72 @@
+import json
+import os
+import re
+import zipfile
+from pathlib import Path
+from tempfile import TemporaryDirectory
+
+import wget
+
+from dedoc.api.api_utils import json2txt
+from dedoc.config import get_config
+from dedoc.dedoc_manager import DedocManager
+
+
+URL = "https://at.ispras.ru/owncloud/index.php/s/uImxYhliBHU8ei7/download"
+URL_GT = "https://at.ispras.ru/owncloud/index.php/s/SXsOTqxGaGO9wL9/download"
+
+if __name__ == "__main__":
+    data_dir = Path(get_config()["intermediate_data_path"]) / "benchmark_pdfminer_data"
+
+    if not os.path.isdir(data_dir):
+        data_dir.mkdir(parents=True)
+        pdfs_zip_path = str(data_dir / "pdfs.zip")
+        pdfs_zip_gt_path = str(data_dir / "pdfs_gt.zip")
+        wget.download(URL, pdfs_zip_path)
+        wget.download(URL_GT, pdfs_zip_gt_path)
+
+        with zipfile.ZipFile(pdfs_zip_path, 'r') as zip_ref:
+            zip_ref.extractall(data_dir)
+        os.remove(pdfs_zip_path)
+        with zipfile.ZipFile(pdfs_zip_gt_path, 'r') as zip_ref:
+            zip_ref.extractall(data_dir)
+        os.remove(pdfs_zip_gt_path)
+
+        print(f"Benchmark data downloaded to {data_dir}")
+    else:
+        print(f"Use cached benchmark data from {data_dir}")
+
+    pdfs_path = data_dir / "PdfMiner Params"
+    pdfs_gt_path = data_dir / "PdfMiner Params GT"
+
+    info = dict()
+    with TemporaryDirectory() as tmpdir:
+        manager = DedocManager()
+        for file in os.listdir(pdfs_path):
+            result = manager.parse(file_path=str(pdfs_path / file), parameters={"pdf_with_text_layer": "true"})
+            txt_content = json2txt(paragraph=result.content.structure)
+            with (Path(tmpdir) / "ocr.txt").open("w") as f:
+                f.write(txt_content)
+
+            accuracy_script_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "accuracy"))
+            gt_path = pdfs_gt_path / (file[:-3] + "txt")
+            tmp_ocr_path = Path(tmpdir) / "ocr.txt"
+            accuracy_path = Path(tmpdir) / "accuracy.txt"
+            if accuracy_path.exists():
+                accuracy_path.unlink()
+            command = f"{accuracy_script_path} \"{gt_path}\" {tmp_ocr_path} >> {accuracy_path}"
+            os.system(command)
+
+            with open(accuracy_path, "r") as f:
+                lines = f.readlines()
+                matched = [line for line in lines if "Accuracy After Correction" in line]
+                if not matched:
+                    matched = [line for line in lines if "Accuracy\n" in line]
+                acc_percent = re.findall(r"\d+\.\d+", matched[0])[0][:-1]
+                info[str(file)] = acc_percent
+
+    output_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources", "benchmarks"))
+    with (Path(output_dir) / "benchmark_pdf_miner.json").open("w") as f:
+        json.dump(info, f, ensure_ascii=False, indent=2)
+
+    print(f"save result in {output_dir}")
diff --git a/resources/benchmarks/benchmark_pdf_miner.json b/resources/benchmarks/benchmark_pdf_miner.json
@@ -0,0 +1,13 @@
+{
+  "Образец примерного заполнения уведомления об отсутствии цифровых финансовых активов.pdf": "100.0",
+  "2023 Гоночная Инструкция  CR 2023.pdf": "100.0",
+  "support_182_poisk-dokumentov.pdf": "100.0",
+  "6.1 Описание проекта Thalamus.pdf": "100.0",
+  "ECPPM2020_Instructions.pdf": "100.0",
+  "NOR CHR 2023.pdf": "100.0",
+  "2-column-state.pdf": "100.0",
+  "ba-2017.pdf": "100.0",
+  "Международное и национальное спортивное право портфолио_рус.pdf": "100.0",
+  "Uvedoml_ESN.pdf": "100.0",
+  "instruction_gibdd.pdf": "100.0"
+}
diff --git a/tests/api_tests/test_api_format_pdf_with_text.py b/tests/api_tests/test_api_format_pdf_with_text.py
@@ -115,12 +115,12 @@ def test_pdf_with_2_columns_text(self) -> None:
         self.assertIn("Keywords", self._get_by_tree_path(tree, "0.4.1.3")["text"])
         self.assertIn("Anonymizing Networks, Privacy, Tor, BitTorrent", self._get_by_tree_path(tree, "0.4.1.4")["text"])
 
-        self.assertIn("INTRODUCTION\n", self._get_by_tree_path(tree, "0.5.0.0")["text"])
+        self.assertIn("INTRODUCTION\n", self._get_by_tree_path(tree, "0.5.0")["text"])
         self.assertIn("The Tor network was designed to provide freedom\n"
                       "of speech by guaranteeing anonymous communications.\n"
                       "Whereas the cryptographic foundations of Tor, based on\n"
                       "onion-routing [3, 9, 22, 24], are known to be robust, identity",
-                      self._get_by_tree_path(tree, "0.5.0.1")["text"])
+                      self._get_by_tree_path(tree, "0.5.0.0")["text"])
 
     def test_pdf_with_2_columns_text_2(self) -> None:
         file_name = "liters_state.pdf"