-
Notifications
You must be signed in to change notification settings - Fork 22
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
TLDR-465 pdf miner new params (#356)
* set char_margin to 3 * add pdf miner test script * fix test_pdf_miner script * fix TestApiPdfWithText * add caching * rename test to benchmark * add benchmark script again * change name * change name * Try to fix documentation pipeline * fix benchmark --------- Co-authored-by: Nikita Shevtsov <shevtsov@ispras.ru> Co-authored-by: Nasty <bogatenkova.anastasiya@mail.ru>
- Loading branch information
1 parent
e7c1067
commit 62445da
Showing
5 changed files
with
89 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
import json | ||
import os | ||
import re | ||
import zipfile | ||
from pathlib import Path | ||
from tempfile import TemporaryDirectory | ||
|
||
import wget | ||
|
||
from dedoc.api.api_utils import json2txt | ||
from dedoc.config import get_config | ||
from dedoc.dedoc_manager import DedocManager | ||
|
||
|
||
# Archive with the benchmark documents (extracted into the "PdfMiner Params" directory used below).
URL = "https://at.ispras.ru/owncloud/index.php/s/uImxYhliBHU8ei7/download"
# Archive with the ground-truth texts (extracted into the "PdfMiner Params GT" directory used below).
URL_GT = "https://at.ispras.ru/owncloud/index.php/s/SXsOTqxGaGO9wL9/download"

if __name__ == "__main__":
    # Benchmark entry point:
    #   1. download and unpack the benchmark archives (skipped when the data directory already exists);
    #   2. parse every file with dedoc using the "pdf_with_text_layer" parameter and dump the text;
    #   3. run the external "accuracy" executable located next to this script against the ground truth;
    #   4. store the per-file accuracy strings in resources/benchmarks/benchmark_pdf_miner.json.

    # Imported locally so that this block is self-sufficient on top of the file's existing imports.
    import shutil
    import subprocess

    data_dir = Path(get_config()["intermediate_data_path"]) / "benchmark_pdfminer_data"

    if not data_dir.is_dir():
        data_dir.mkdir(parents=True)
        try:
            for url, archive_name in ((URL, "pdfs.zip"), (URL_GT, "pdfs_gt.zip")):
                archive_path = str(data_dir / archive_name)
                wget.download(url, archive_path)
                with zipfile.ZipFile(archive_path, "r") as zip_ref:
                    zip_ref.extractall(data_dir)
                os.remove(archive_path)
        except BaseException:
            # The mere existence of data_dir is what marks the cache as valid on the next run,
            # so a half-downloaded directory must not be left behind. The error is re-raised.
            shutil.rmtree(data_dir, ignore_errors=True)
            raise

        print(f"Benchmark data downloaded to {data_dir}")
    else:
        print(f"Use cached benchmark data from {data_dir}")

    pdfs_path = data_dir / "PdfMiner Params"
    pdfs_gt_path = data_dir / "PdfMiner Params GT"
    # Loop-invariant: the accuracy executable is looked up next to this script.
    accuracy_script_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "accuracy"))

    info = dict()
    with TemporaryDirectory() as tmpdir:
        tmp_ocr_path = Path(tmpdir) / "ocr.txt"
        manager = DedocManager()
        # Sorted so that the key order of the resulting JSON does not depend on directory listing order.
        for file in sorted(os.listdir(pdfs_path)):
            result = manager.parse(file_path=str(pdfs_path / file), parameters={"pdf_with_text_layer": "true"})
            txt_content = json2txt(paragraph=result.content.structure)
            # Explicit encoding: the output must not depend on the locale of the machine.
            # NOTE(review): assumes the accuracy tool and the ground-truth files use UTF-8 - confirm.
            tmp_ocr_path.write_text(txt_content, encoding="utf-8")

            # Ground truth has the same base name with a ".txt" extension.
            gt_path = pdfs_gt_path / Path(file).with_suffix(".txt").name

            # Argument list instead of a shell string: names with spaces, quotes or "$" need no escaping.
            completed = subprocess.run(
                [accuracy_script_path, str(gt_path), str(tmp_ocr_path)],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                encoding="utf-8",
                errors="replace",
            )

            lines = completed.stdout.splitlines(keepends=True)
            matched = [line for line in lines if "Accuracy After Correction" in line]
            if not matched:
                matched = [line for line in lines if "Accuracy\n" in line]

            number = re.search(r"\d+\.\d+", matched[0]) if matched else None
            if number is None:
                raise RuntimeError(
                    f"Cannot read accuracy for {file!r}: {accuracy_script_path} exited with code "
                    f"{completed.returncode}, stderr: {completed.stderr.strip()!r}"
                )
            # The last digit of the reported number is dropped (truncation, not rounding).
            info[str(file)] = number.group(0)[:-1]

    output_dir = Path(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources", "benchmarks")))
    output_dir.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False keeps non-ASCII file names readable, hence the explicit UTF-8 encoding.
    with (output_dir / "benchmark_pdf_miner.json").open("w", encoding="utf-8") as f:
        json.dump(info, f, ensure_ascii=False, indent=2)

    print(f"save result in {output_dir}")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
{ | ||
"Образец примерного заполнения уведомления об отсутствии цифровых финансовых активов.pdf": "100.0", | ||
"2023 Гоночная Инструкция CR 2023.pdf": "100.0", | ||
"support_182_poisk-dokumentov.pdf": "100.0", | ||
"6.1 Описание проекта Thalamus.pdf": "100.0", | ||
"ECPPM2020_Instructions.pdf": "100.0", | ||
"NOR CHR 2023.pdf": "100.0", | ||
"2-column-state.pdf": "100.0", | ||
"ba-2017.pdf": "100.0", | ||
"Международное и национальное спортивное право портфолио_рус.pdf": "100.0", | ||
"Uvedoml_ESN.pdf": "100.0", | ||
"instruction_gibdd.pdf": "100.0" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters