diff --git a/.gitignore b/.gitignore index 3f2cbc3..b3c13fd 100644 --- a/.gitignore +++ b/.gitignore @@ -439,3 +439,8 @@ pip-selfcheck.json # Docs # docs/**/*.png # docs/**/*.svg + + +# AI tools +**/.bob/ + diff --git a/packages/docling-metric-teds/.gitignore b/packages/docling-metric-teds/.gitignore new file mode 100644 index 0000000..8e6bab5 --- /dev/null +++ b/packages/docling-metric-teds/.gitignore @@ -0,0 +1,329 @@ +# Created by https://www.toptal.com/developers/gitignore/api/c++,python,cmake,vim,visualstudiocode,virtualenv,venv +# Edit at https://www.toptal.com/developers/gitignore?templates=c++,python,cmake,vim,visualstudiocode,virtualenv,venv + +### C++ ### +# Prerequisites +*.d + +# Compiled Object files +*.slo +*.lo +*.o +*.obj + +# Precompiled Headers +*.gch +*.pch + +# Compiled Dynamic libraries +*.so +*.dylib +*.dll + +# Fortran module files +*.mod +*.smod + +# Compiled Static libraries +*.lai +*.la +*.a +*.lib + +# Executables +*.exe +*.out +*.app + +### CMake ### +CMakeLists.txt.user +CMakeCache.txt +CMakeFiles +CMakeScripts +Testing +Makefile +cmake_install.cmake +install_manifest.txt +compile_commands.json +CTestTestfile.cmake +_deps + +### CMake Patch ### +CMakeUserPresets.json + +# External projects +*-prefix/ + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +### Python Patch ### +# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration +poetry.toml + +# ruff +.ruff_cache/ + +# LSP config files +pyrightconfig.json + +### venv ### +# Virtualenv +# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ +[Bb]in +[Ii]nclude +[Ll]ib +[Ll]ib64 +[Ll]ocal +[Ss]cripts +pyvenv.cfg +pip-selfcheck.json + +### Vim ### +# Swap +[._]*.s[a-v][a-z] +!*.svg # comment out if you don't need vector files +[._]*.sw[a-p] +[._]s[a-rt-v][a-z] +[._]ss[a-gi-z] +[._]sw[a-p] + +# Session +Session.vim +Sessionx.vim + +# Temporary +.netrwhist +*~ +# Auto-generated tag files +tags +# Persistent undo +[._]*.un~ + +### VirtualEnv ### +# Virtualenv +# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ + +### VisualStudioCode ### +.vscode/* +*/.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +!.vscode/*.code-snippets + +# Local History for Visual Studio Code +.history/ + +# Built Visual Studio Code Extensions +*.vsix + +### VisualStudioCode Patch ### 
+# Ignore all local history of files +.history +.ionide + +### macOS ### +# General +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +### macOS Patch ### +# iCloud generated files +*.icloud + +# End of https://www.toptal.com/developers/gitignore/api/c++,python,cmake,vim,visualstudiocode,virtualenv,venv,macos + + +# AI +.bob/ + +# Additional dirs/files to ignore +externals/ +install_root/ + +**/docling_metric_teds_cli + diff --git a/packages/docling-metric-teds/CMakeLists.txt b/packages/docling-metric-teds/CMakeLists.txt new file mode 100644 index 0000000..552fdc9 --- /dev/null +++ b/packages/docling-metric-teds/CMakeLists.txt @@ -0,0 +1,207 @@ +cmake_minimum_required(VERSION 3.14) + + +################################################################################# +# Names and versions +# +set(teds_project_name "docling_metric_teds_cpp") +set(teds_cli_name "docling_metric_teds_cli") +set(install_root "docling_metric_teds") +set(externals_root "externals") + +project(${teds_project_name} VERSION 1.0.0 LANGUAGES CXX C) + +add_compile_definitions(PROJECT_VERSION_MAJOR=${PROJECT_VERSION_MAJOR}) +add_compile_definitions(PROJECT_VERSION_MINOR=${PROJECT_VERSION_MINOR}) +add_compile_definitions(PROJECT_VERSION_PATCH=${PROJECT_VERSION_PATCH}) + +# Debug: Show make commands +# set(CMAKE_VERBOSE_MAKEFILE ON) + + +########################################################################################### +# Path variables +# External dependencies are downloaded inside the externals/ dir +# + +if(NOT DEFINED TOPLEVEL_PREFIX_PATH) + set(TOPLEVEL_PREFIX_PATH "${CMAKE_CURRENT_SOURCE_DIR}") +endif() + 
+if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) + set(CMAKE_INSTALL_PREFIX "${TOPLEVEL_PREFIX_PATH}/${install_root}" CACHE PATH "Install prefix" FORCE) +endif() + +if(NOT DEFINED EXTERNALS_PREFIX_PATH) + set(EXTERNALS_PREFIX_PATH "${TOPLEVEL_PREFIX_PATH}/${externals_root}" CACHE INTERNAL "") +endif() + +if(NOT "${TOPLEVEL_PREFIX_PATH}/cmake" IN_LIST CMAKE_MODULE_PATH) + set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH}" "${TOPLEVEL_PREFIX_PATH}/cmake") +endif() + +include(GNUInstallDirs) + + +# set(ENV_ARCH $ENV{CIBW_ARCHS}) +# message(STATUS "cibw arch: " ${ENV_ARCH}) + +# set(CMAKE_OSX_ARCHITECTURES $ENV{CIBW_ARCHS}) +# message(STATUS "cibw arch: " ${ENV_ARCH}) + +set(ENV_ARCHFLAGS $ENV{ARCHFLAGS}) +message(STATUS "arch flags: " ${ENV_ARCHFLAGS}) +message(STATUS " top path: " ${TOPLEVEL_PREFIX_PATH}) +message(STATUS " lib path: " ${EXTERNALS_PREFIX_PATH}) +message(STATUS " install path: " ${CMAKE_INSTALL_PREFIX}) +message(STATUS " cmake path: " ${CMAKE_MODULE_PATH}) +message(STATUS " cmake system: " ${CMAKE_SYSTEM_PROCESSOR}) +message(STATUS " make osx arch: " ${CMAKE_OSX_ARCHITECTURES}) + + +################################################################################# +# Compiler +# +message(STATUS "cmake osx-deployment: " ${CMAKE_OSX_DEPLOYMENT_TARGET}) + +# minimum macosx, ignored on other platforms +if(APPLE) + if(NOT CMAKE_OSX_DEPLOYMENT_TARGET) + message(STATUS "cmake is not set") + elseif(CMAKE_OSX_DEPLOYMENT_TARGET VERSION_LESS 10.15) + set(CMAKE_OSX_DEPLOYMENT_TARGET "10.15" CACHE STRING "Minimum macOS version" FORCE) + endif() +endif() + +message(STATUS "cmake system-version: " ${CMAKE_SYSTEM_VERSION}) +message(STATUS "cmake osx-deployment: " ${CMAKE_OSX_DEPLOYMENT_TARGET}) + +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +if (WIN32) + set(TEST_PATH "\\\"${TOPLEVEL_PREFIX_PATH}\\\"") + add_definitions(-DROOT_PATH="\\\"${TOPLEVEL_PREFIX_PATH}\\\"") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wno-sign-compare -O3 ${ENV_ARCHFLAGS}") 
+else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wno-sign-compare -O3 -DROOT_PATH='\"${TOPLEVEL_PREFIX_PATH}\"' ${ENV_ARCHFLAGS}") +endif() + +message(STATUS "cxx-compiler: " ${CMAKE_CXX_COMPILER}) +message(STATUS "cxx-flags : " ${CMAKE_CXX_FLAGS}) +message(STATUS "cxx-standard: " ${CMAKE_CXX_STANDARD}) + + +################################################################################# +# Source files, include dirs +# + +# List of header files +# Get all *.h except those begining with underscore +file(GLOB_RECURSE teds_headers "${PROJECT_SOURCE_DIR}/cpp_src/[!_]*.h") +# message(STATUS "Header files: ${teds_headers}") + +# List all header directories +set(teds_include_dirs "") +foreach(header ${teds_headers}) + get_filename_component(dir "${header}" DIRECTORY) + list(APPEND teds_include_dirs "${dir}") +endforeach() +list(REMOVE_DUPLICATES teds_include_dirs) +list(APPEND teds_include_dirs "${pybind11_INCLUDE_DIRS}") +list(APPEND teds_include_dirs "${EXTERNALS_PREFIX_PATH}/include") + +# Debug +# string(REPLACE ";" "\n" teds_include_dirs_str "${teds_include_dirs}") +# message(STATUS "Include dirs: ${teds_include_dirs_str}") + + +########################################################################################### +# Dependencies +# + +# Interface library for our source code +add_library(teds_interface INTERFACE) +target_include_directories(teds_interface INTERFACE "${teds_include_dirs}") + +# Find Python first (before pybind11) to ensure we use the correct version +# This will use the Python from the activated virtual environment +find_package(Python3 COMPONENTS Interpreter Development REQUIRED) +message(STATUS "Python3 executable: ${Python3_EXECUTABLE}") +message(STATUS "Python3 version: ${Python3_VERSION}") + +# Add pybind11 - it will now use the Python3 found above +set(PYBIND11_FINDPYTHON ON) +set(PYBIND11_PYTHON_VERSION ${Python3_VERSION}) +find_package(pybind11 CONFIG REQUIRED) +message(STATUS "PyBind11 + Python include dirs: 
${pybind11_INCLUDE_DIRS}")
+message(STATUS "PyBind11 libs: ${pybind11_LIBRARIES}")
+# option(<variable> "<help text>" [value]): the help text comes second, the default value last
+option(USE_SYSTEM_DEPS "If enabled, the build will find and link to system dependencies, otherwise they are sourced from the original repos and compiled on the fly." OFF)
+
+# Directory structure to building external packages
+if(NOT USE_SYSTEM_DEPS)
+    if(NOT EXISTS ${EXTERNALS_PREFIX_PATH})
+        file(MAKE_DIRECTORY ${EXTERNALS_PREFIX_PATH})
+        file(MAKE_DIRECTORY ${EXTERNALS_PREFIX_PATH}/bin)
+        file(MAKE_DIRECTORY ${EXTERNALS_PREFIX_PATH}/lib)
+        file(MAKE_DIRECTORY ${EXTERNALS_PREFIX_PATH}/include)
+    endif()
+endif()
+
+# Include external dependencies
+include(cmake/extlib_cxxopts.cmake)
+include(cmake/extlib_loguru.cmake)
+include(cmake/extlib_json.cmake)
+
+# define LIB_LINK and OS_DEPENDENCIES
+# include(cmake/os_opts.cmake)
+# list(APPEND DEPENDENCIES ${OS_DEPENDENCIES})
+
+# define subdirlist utility
+# include(cmake/subdirlist.cmake)
+
+set(dependencies
+    teds_interface
+    "${ext_name_json}"
+    "${ext_name_loguru}"
+    "${ext_name_cxxopts}"
+)
+
+
+###########################################################################################
+# Executables
+#
+
+# Compile CLI
+add_executable("${teds_cli_name}" "cpp_src/command_line/main.cpp")
+target_link_libraries("${teds_cli_name}" PUBLIC ${dependencies})
+
+
+###########################################################################################
+# Python bindings
+#
+
+pybind11_add_module(
+    "${teds_project_name}"
+    "${TOPLEVEL_PREFIX_PATH}/cpp_src/pybind/pybind_module.cpp"
+)
+target_link_libraries("${teds_project_name}" PUBLIC ${dependencies})
+
+
+###########################################################################################
+# Install
+#
+
+# CLI
+install(TARGETS "${teds_cli_name}" RUNTIME DESTINATION "${CMAKE_INSTALL_PREFIX}")
+
+# Python bindings
+install(TARGETS "${teds_project_name}" LIBRARY DESTINATION "${CMAKE_INSTALL_PREFIX}")
+
+
+########################################################################################### +# Tests +# +include(CTest) +add_subdirectory(cpp_tests/) diff --git a/packages/docling-metric-teds/LICENSE b/packages/docling-metric-teds/LICENSE new file mode 100644 index 0000000..cd51b95 --- /dev/null +++ b/packages/docling-metric-teds/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 Database Research Group Salzburg + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/packages/docling-metric-teds/README.md b/packages/docling-metric-teds/README.md new file mode 100644 index 0000000..d72bb71 --- /dev/null +++ b/packages/docling-metric-teds/README.md @@ -0,0 +1,88 @@ +# Docling metric for the Tree-Edit-Distance Score + +This is an optimized implementation of the Tree Edit Distance Score. + + +## Overview + +Main features: + +- Parallelized C++ implementation. +- Python bindings. +- Docling API. 
+
+This repo builds on top of the [tree-similarity](https://github.com/DatabaseGroup/tree-similarity) library.
+
+
+## Directory structure
+
+Before building the C++ code, we have only the source code directories:
+
+```
+.
+├── cmake                # CMake files required to compile the C++ code.
+├── cpp_src              # C++ source code
+├── cpp_tests            # C++ source code for the tests
+├── docling_metric_teds  # Python wrapper for the C++ bindings that implements the docling-metrics-core API
+└── test                 # Python tests
+```
+
+
+After building the C++ code we have the following directories:
+
+```
+.
+├── build                # C++ build directory required during the build process
+├── cmake
+├── cpp_src
+├── cpp_tests
+├── docling_metric_teds
+├── externals            # C++ code of external libraries. Required during the compilation.
+└── tests
+
+```
+
+## Installation
+
+TODO: This is the manual installation via a local bash script
+
+```bash
+devtools/build_cpp.sh
+```
+
+Testing:
+
+```bash
+devtools/test_cpp.sh
+```
+
+
+## Usage
+
+```python
+from docling_metric_teds.docling_metric_teds import (
+    TEDSMetric,
+    TEDSMetricInputSample,
+    TEDSMetricSampleEvaluation,
+)
+
+sample = TEDSMetricInputSample(
+    id="s1",
+    gt_bracket="{x{a}{b}}",
+    pred_bracket="{x{a}{c}}",
+)
+teds_metric = TEDSMetric()
+
+sample_evaluation: TEDSMetricSampleEvaluation = teds_metric.evaluate_sample(sample)
+print(sample_evaluation)
+```
+
+
+## Links
+
+[tree-similarity](https://github.com/DatabaseGroup/tree-similarity)
+
+
+## License
+
+MIT
diff --git a/packages/docling-metric-teds/cmake/_extlib_pybind11.cmake b/packages/docling-metric-teds/cmake/_extlib_pybind11.cmake
new file mode 100644
index 0000000..ad97c68
--- /dev/null
+++ b/packages/docling-metric-teds/cmake/_extlib_pybind11.cmake
@@ -0,0 +1,28 @@
+message(STATUS "entering in extlib_pybind11.cmake")
+
+include(ExternalProject)
+include(CMakeParseArguments)
+
+set(PYBIND11_URL https://github.com/pybind/pybind11.git)
+set(PYBIND11_TAG v3.0.1)
+
+ExternalProject_Add(extlib_pybind11
+    PREFIX 
extlib_pybind11
+
+    GIT_REPOSITORY ${PYBIND11_URL}
+    GIT_TAG ${PYBIND11_TAG}
+
+    UPDATE_COMMAND ""
+    CONFIGURE_COMMAND ""
+
+    BUILD_COMMAND ""
+    BUILD_ALWAYS OFF
+    # Header-only install: <SOURCE_DIR> is expanded by ExternalProject to the checked-out sources
+    INSTALL_DIR ${EXTERNALS_PREFIX_PATH}
+    INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory <SOURCE_DIR>/include/ ${EXTERNALS_PREFIX_PATH}/include/
+    )
+
+add_library(pybind11 INTERFACE)
+add_custom_target(install_extlib_pybind11 DEPENDS extlib_pybind11)
+add_dependencies(pybind11 install_extlib_pybind11)
+
diff --git a/packages/docling-metric-teds/cmake/extlib_cxxopts.cmake b/packages/docling-metric-teds/cmake/extlib_cxxopts.cmake
new file mode 100644
index 0000000..5dba659
--- /dev/null
+++ b/packages/docling-metric-teds/cmake/extlib_cxxopts.cmake
@@ -0,0 +1,53 @@
+message(STATUS "entering in extlib_cxxopts.cmake")
+
+set(ext_name_cxxopts "cxxopts")
+
+if(USE_SYSTEM_DEPS)
+    message(STATUS "using system-deps in extlib_cxxopts.cmake")
+
+    # this will define the cxxopts target
+    # find_package(cxxopts REQUIRED)
+
+    # add_library(${ext_name_cxxopts} INTERFACE IMPORTED)
+    # add_dependencies(${ext_name_cxxopts} cxxopts)
+
+    find_package(PkgConfig)
+    pkg_check_modules(libcxxopts REQUIRED IMPORTED_TARGET cxxopts)
+    add_library(${ext_name_cxxopts} ALIAS PkgConfig::libcxxopts)
+
+else()
+    message(STATUS "ignoring system-deps extlib_cxxopts.cmake")
+
+    include(ExternalProject)
+    include(CMakeParseArguments)
+
+    set(CXXOPTS_URL https://github.com/jarro2783/cxxopts.git)
+    # set(CXXOPTS_TAG v3.2.0)
+    set(CXXOPTS_TAG v3.3.1)
+
+    ExternalProject_Add(extlib_cxxopts
+
+        PREFIX extlib_cxxopts
+
+        UPDATE_COMMAND ""
+        GIT_REPOSITORY ${CXXOPTS_URL}
+        GIT_TAG ${CXXOPTS_TAG}
+
+        BUILD_ALWAYS OFF
+
+        INSTALL_DIR ${EXTERNALS_PREFIX_PATH}
+
+        CMAKE_ARGS \\
+        -DCMAKE_POSITION_INDEPENDENT_CODE=ON \\
+        -DCMAKE_INSTALL_PREFIX=${EXTERNALS_PREFIX_PATH}
+
+        BUILD_IN_SOURCE ON
+        LOG_DOWNLOAD ON
+    )
+
+    add_library(${ext_name_cxxopts} INTERFACE)
+    add_dependencies(${ext_name_cxxopts} extlib_cxxopts)
+    set_target_properties(${ext_name_cxxopts} 
PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${EXTERNALS_PREFIX_PATH}/include
+    )
+endif()
+
diff --git a/packages/docling-metric-teds/cmake/extlib_json.cmake b/packages/docling-metric-teds/cmake/extlib_json.cmake
new file mode 100644
index 0000000..9277ccf
--- /dev/null
+++ b/packages/docling-metric-teds/cmake/extlib_json.cmake
@@ -0,0 +1,42 @@
+message(STATUS "entering in extlib_json.cmake")
+
+set(ext_name_json "json")
+
+if(USE_SYSTEM_DEPS)
+    # find_package defines the nlohmann_json::nlohmann_json target; the wrapper links it so its include dirs propagate
+    find_package(nlohmann_json REQUIRED)
+
+    add_library(${ext_name_json} INTERFACE IMPORTED)
+    target_link_libraries(${ext_name_json} INTERFACE nlohmann_json::nlohmann_json)
+
+else()
+
+    include(ExternalProject)
+    include(CMakeParseArguments)
+
+    set(JSON_URL https://github.com/nlohmann/json.git)
+    # set(JSON_TAG v3.11.3)
+    set(JSON_TAG v3.12.0)
+    ExternalProject_Add(extlib_json
+
+        PREFIX extlib_json
+
+        GIT_REPOSITORY ${JSON_URL}
+        GIT_TAG ${JSON_TAG}
+
+        UPDATE_COMMAND ""
+        CONFIGURE_COMMAND ""
+
+        BUILD_COMMAND ""
+        BUILD_ALWAYS OFF
+        # Header-only install: <SOURCE_DIR> is expanded by ExternalProject to the checked-out sources
+        INSTALL_DIR ${EXTERNALS_PREFIX_PATH}
+        INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory <SOURCE_DIR>/include/ ${EXTERNALS_PREFIX_PATH}/include/
+    )
+
+    add_library(${ext_name_json} INTERFACE IMPORTED)
+    add_dependencies(${ext_name_json} extlib_json)
+    set_target_properties(${ext_name_json} PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${EXTERNALS_PREFIX_PATH}/include)
+
+endif()
+
diff --git a/packages/docling-metric-teds/cmake/extlib_loguru.cmake b/packages/docling-metric-teds/cmake/extlib_loguru.cmake
new file mode 100644
index 0000000..8609ac8
--- /dev/null
+++ b/packages/docling-metric-teds/cmake/extlib_loguru.cmake
@@ -0,0 +1,43 @@
+message(STATUS "entering in extlib_loguru.cmake")
+
+set(ext_name_loguru "loguru")
+
+if(USE_SYSTEM_DEPS)
+  message(STATUS "using system-deps in extlib_loguru.cmake")
+
+  # this will define the loguru target
+  # find_package(loguru REQUIRED)
+
+  # add_library(${ext_name_loguru} INTERFACE IMPORTED)
+  # 
add_dependencies(${ext_name_loguru} loguru) + + find_package(loguru CONFIG REQUIRED) + add_library(${ext_name_loguru} ALIAS loguru::loguru) + +else() + message(STATUS "ignoring system-deps extlib_loguru.cmake") + + include(FetchContent) + + # pin current master, because tag v2.1.0 branch does not have full make support, + # that was introduced later, this SHA also matches the system package .rpm + # https://koji.fedoraproject.org/koji/rpminfo?rpmID=40293153 + FetchContent_Declare(LoguruGitRepo + GIT_REPOSITORY "https://github.com/emilk/loguru" + GIT_TAG "4adaa185883e3c04da25913579c451d3c32cfac1" + ) + + set(LOGURU_WITH_STREAMS TRUE) + set(STACKTRACES TRUE) + set(CMAKE_POSITION_INDEPENDENT_CODE TRUE) + + if(WIN32) + # https://digitalmars.com/rtl/constants.html + # value of _SH_DENYNO is 0x40 = 64 + add_compile_definitions(_SH_DENYNO=64) + endif() + + FetchContent_MakeAvailable(LoguruGitRepo) # defines target 'loguru::loguru' + +endif() + diff --git a/packages/docling-metric-teds/cpp_src/command_line/main.cpp b/packages/docling-metric-teds/cpp_src/command_line/main.cpp new file mode 100644 index 0000000..22638d5 --- /dev/null +++ b/packages/docling-metric-teds/cpp_src/command_line/main.cpp @@ -0,0 +1,79 @@ +#include + +#include "nlohmann/json.hpp" +#include "loguru.hpp" +#include "cxxopts.hpp" + +#include "teds_manager.h" + + +int main(int argc, char* argv[]) { + int orig_argc = argc; + + // Initialize loguru + loguru::init(argc, argv); + + try { + cxxopts::Options options("docling-metric-teds", "Compute Tree Edit Distance Score"); + + // Define the options + options.add_options() + ("g,gt-file", "Input ground truth file in bracket notation", cxxopts::value()) + ("p,pred-file", "Input predictions file in bracket notation", cxxopts::value()) + // ("l,loglevel", "loglevel [error;warning;success;info]", cxxopts::value()) + ("V,version", "Show version") + ("h,help", "Print usage"); + + // Parse command line arguments + auto result = options.parse(argc, argv); + + // 
TODO: Check if orig_argc is needed + if (orig_argc == 1) { + LOG_S(INFO) << argc; + LOG_F(ERROR, "Either input (-i) or config (-c) must be specified."); + LOG_F(INFO, "%s", options.help().c_str()); + return 1; + } + + // Help option or no arguments provided + if (result.count("help")) { + LOG_F(INFO, "%s", options.help().c_str()); + return 0; + } + + // Show version + if (result.count("version")) { + LOG_F(INFO, "Version: %d.%d.%d", PROJECT_VERSION_MAJOR, PROJECT_VERSION_MINOR, PROJECT_VERSION_PATCH); + return 0; + } + + // Load bracket files and compute TEDS + std::string gt_file_fn = result["gt-file"].as(); + std::string pred_file_fn = result["pred-file"].as(); + LOG_F(INFO, "GT file: %s", gt_file_fn.c_str()); + LOG_F(INFO, "Pred file: %s", pred_file_fn.c_str()); + std::string gt_bracket, pred_bracket; + std::ifstream gt_file(gt_file_fn); + std::getline(gt_file, gt_bracket); + gt_file.close(); + std::ifstream pred_file(pred_file_fn); + std::getline(pred_file, pred_bracket); + pred_file.close(); + + // Compute TEDs + docling::TEDSManager manager; + docling::TEDSSampleEvaluation eval_sample = manager.evaluate_sample("test", gt_bracket, pred_bracket); + LOG_F(INFO, "eval_sample error_id: %d", eval_sample.error_id); + LOG_F(INFO, "eval_sample error_msg: %s", eval_sample.error_msg.c_str()); + LOG_F(INFO, "eval_sample gt_tree_size: %d", eval_sample.gt_tree_size); + LOG_F(INFO, "eval_sample pred_tree_size: %d", eval_sample.pred_tree_size); + LOG_F(INFO, "eval_sample TEDS: %f", eval_sample.teds); + + } catch (const cxxopts::exceptions::exception& e) { + LOG_F(ERROR, "Error parsing options: %s", e.what()); + return 1; + } + + return 0; +} + diff --git a/packages/docling-metric-teds/cpp_src/cost_model/unit_cost_model.h b/packages/docling-metric-teds/cpp_src/cost_model/unit_cost_model.h new file mode 100644 index 0000000..1007d09 --- /dev/null +++ b/packages/docling-metric-teds/cpp_src/cost_model/unit_cost_model.h @@ -0,0 +1,127 @@ +// The MIT License (MIT) +// Copyright 
(c) 2017 Mateusz Pawlik, Nikolaus Augsten, Daniel Kocher, and +// Thomas Huetter. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +/// Contains the declaration of a basic cost model, i.e., the unit costs. + +#pragma once + +#include +#include "../node/node.h" +#include "../label/label_dictionary.h" + +namespace cost_model { + +// TODO: Deprecated. Substitute with UnitCostModelLD. +template +struct UnitCostModel { + int ren(const node::Node