OCR-D · kba · Mar 7, 2025 · Mar 7, 2025 · Mar 7, 2025 · Mar 7, 2025
diff --git a/Dockerfile b/Dockerfile
@@ -1,9 +1,26 @@
-ARG BASE_IMAGE
-FROM $BASE_IMAGE as ocrd_core_base
+ARG BASE_IMAGE=ubuntu:20.04
+FROM $BASE_IMAGE AS ocrd_core_base
+ARG BASE_IMAGE=ubuntu:20.04
 ARG FIXUP=echo
-MAINTAINER OCR-D
-ENV DEBIAN_FRONTEND noninteractive
-ENV PYTHONIOENCODING utf8
+ARG VCS_REF=unknown
+ARG BUILD_DATE=unknown
+LABEL \
+    maintainer="https://ocr-d.de/en/contact" \
+    org.label-schema.vcs-ref=$VCS_REF \
+    org.label-schema.vcs-url="https://github.com/OCR-D/core" \
+    org.label-schema.build-date=$BUILD_DATE \
+    org.opencontainers.image.vendor="DFG-Funded Initiative for Optical Character Recognition Development" \
+    org.opencontainers.image.title="core" \
+    org.opencontainers.image.description="OCR-D framework" \
+    org.opencontainers.image.source="https://github.com/OCR-D/core" \
+    org.opencontainers.image.documentation="https://github.com/OCR-D/core/blob/${VCS_REF}/README.md" \
+    org.opencontainers.image.revision=$VCS_REF \
+    org.opencontainers.image.created=$BUILD_DATE \
+    org.opencontainers.image.base.name=$BASE_IMAGE
+
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV PYTHONIOENCODING=utf8
 ENV LC_ALL=C.UTF-8
 ENV LANG=C.UTF-8
 ENV PIP=pip
@@ -45,7 +62,7 @@ WORKDIR /data
 
 CMD ["/usr/local/bin/ocrd", "--help"]
 
-FROM ocrd_core_base as ocrd_core_test
+FROM ocrd_core_base AS ocrd_core_test
 # Optionally skip make assets with this arg
 ARG SKIP_ASSETS
 WORKDIR /build/core

diff --git a/Dockerfile.cuda b/Dockerfile.cuda
@@ -1,4 +1,4 @@
-ARG BASE_IMAGE
+ARG BASE_IMAGE=docker.io/ocrd/core
 FROM $BASE_IMAGE AS ocrd_core_base
 
 ENV MAMBA_EXE=/usr/local/bin/conda
@@ -13,6 +13,8 @@ WORKDIR /build/core
 COPY Makefile .
 
 RUN make deps-cuda
+# Smoke Test
+RUN ocrd --version
 
 WORKDIR /data
 

diff --git a/Dockerfile.cuda-tf1 b/Dockerfile.cuda-tf1
@@ -1,11 +1,13 @@
-ARG BASE_IMAGE
+ARG BASE_IMAGE=docker.io/ocrd/core-cuda
 FROM $BASE_IMAGE AS ocrd_core_base
 
 WORKDIR /build/core
 
 COPY Makefile .
 
 RUN make deps-tf1
+# Smoke Test
+RUN ocrd --version
 
 WORKDIR /data
 

diff --git a/Dockerfile.cuda-tf2 b/Dockerfile.cuda-tf2
@@ -1,11 +1,13 @@
-ARG BASE_IMAGE
+ARG BASE_IMAGE=docker.io/ocrd/core-cuda
 FROM $BASE_IMAGE AS ocrd_core_base
 
 WORKDIR /build/core
 
 COPY Makefile .
 
 RUN make deps-tf2
+# Smoke Test
+RUN ocrd --version
 
 WORKDIR /data
 

diff --git a/Dockerfile.cuda-torch b/Dockerfile.cuda-torch
@@ -1,11 +1,13 @@
-ARG BASE_IMAGE
+ARG BASE_IMAGE=docker.io/ocrd/core-cuda
 FROM $BASE_IMAGE AS ocrd_core_base
 
 WORKDIR /build
 
 COPY Makefile .
 
 RUN make deps-torch
+# Smoke Test
+RUN ocrd --version
 
 WORKDIR /data
 

diff --git a/Makefile b/Makefile
@@ -57,20 +57,34 @@ help:
 PIP_INSTALL ?= $(PIP) install
 PIP_INSTALL_CONFIG_OPTION ?=
 
-.PHONY: deps-cuda deps-ubuntu deps-test
-
-deps-cuda: CONDA_EXE ?= /usr/local/bin/conda
-deps-cuda: export CONDA_PREFIX ?= /conda
-deps-cuda: PYTHON_PREFIX != $(PYTHON) -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])'
-deps-cuda:
-	curl --retry 6 -Ls https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj bin/micromamba
+.PHONY: get-conda deps-cuda deps-conda deps-ubuntu deps-test
+
+ifeq ($(shell command -v conda 2>/dev/null),)
+# No conda found on PATH (stderr of the probe is discarded): get the Micromamba distribution
+get-conda: CONDA_EXE ?= /usr/local/bin/conda
+get-conda: export CONDA_PREFIX ?= /conda
+# platform detection follows micro.mamba.pm/install.sh; a preset ARCH wins, else uname -m is used
+get-conda: OS != uname
+get-conda: PLATFORM = $(subst Darwin,osx,$(subst Linux,linux,$(OS)))
+get-conda: MACHINE = $(or $(filter aarch64 arm64 ppc64le, $(or $(ARCH),$(shell uname -m))), 64)
+get-conda: URL = https://micro.mamba.pm/api/micromamba/$(PLATFORM)-$(MACHINE)/latest
+get-conda:
+	curl --retry 6 -Ls $(URL) | tar -xvj bin/micromamba
 	mv bin/micromamba $(CONDA_EXE)
 # Install Conda system-wide (for interactive / login shells)
 	echo 'export MAMBA_EXE=$(CONDA_EXE) MAMBA_ROOT_PREFIX=$(CONDA_PREFIX) CONDA_PREFIX=$(CONDA_PREFIX) PATH=$(CONDA_PREFIX)/bin:$$PATH' >> /etc/profile.d/98-conda.sh
 # workaround for tf-keras#62
 	echo 'export XLA_FLAGS=--xla_gpu_cuda_data_dir=$(CONDA_PREFIX)/' >> /etc/profile.d/98-conda.sh
 	mkdir -p $(CONDA_PREFIX)/lib $(CONDA_PREFIX)/include
 	echo $(CONDA_PREFIX)/lib >> /etc/ld.so.conf.d/conda.conf
+else
+# Conda installation already present: do nothing
+get-conda: ;
+endif
+
+# Dependencies for CUDA installation via Conda
+deps-cuda: PYTHON_PREFIX != $(PYTHON) -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])'
+deps-cuda: get-conda
 # Get CUDA toolkit, including compiler and libraries with dev,
 # however, the Nvidia channels do not provide (recent) cudnn (needed for Torch, TF etc):
 #MAMBA_ROOT_PREFIX=$(CONDA_PREFIX) \
@@ -79,7 +93,6 @@ deps-cuda:
 # The conda-forge channel has cudnn and cudatoolkit but no cudatoolkit-dev anymore (and we need both!),
 # so let's combine nvidia and conda-forge (will be same lib versions, no waste of space),
 # but omitting cuda-cudart-dev and cuda-libraries-dev (as these will be pulled by pip for torch anyway):
-	MAMBA_ROOT_PREFIX=$(CONDA_PREFIX) \
 	conda install -c nvidia/label/cuda-11.8.0 \
 	                 cuda-nvcc \
 	                 cuda-cccl \
@@ -145,25 +158,33 @@ deps-tf1:
 	  pushd $$name && for path in $$name*; do mv $$path $${path/$$name/$$newname}; done && popd && \
 	  $(PYTHON) -m wheel pack $$name && \
 	  $(PIP) install $$newname*.whl && popd && rm -fr $$OLDPWD; \
-	  $(PIP) install "numpy<1.24"; \
+	  $(PIP) install "numpy<1.24" -r requirements.txt; \
 	else \
-	$(PIP) install "tensorflow-gpu<2.0"; \
+	  $(PIP) install "tensorflow-gpu<2.0" -r requirements.txt; \
 	fi
 
 deps-tf2:
 	if $(PYTHON) -c 'import sys; print("%u.%u" % (sys.version_info.major, sys.version_info.minor))' | fgrep 3.8; then \
-	$(PIP) install tensorflow; \
+	  $(PIP) install tensorflow -r requirements.txt; \
 	else \
-	$(PIP) install "tensorflow[and-cuda]"; \
+	  $(PIP) install "tensorflow[and-cuda]" -r requirements.txt; \
 	fi
 
 deps-torch:
-	$(PIP) install -i https://download.pytorch.org/whl/cu118 torchvision==0.16.2+cu118 torch==2.1.2+cu118
+	$(PIP) install -i https://download.pytorch.org/whl/cu118 torchvision==0.16.2+cu118 torch==2.1.2+cu118 -r requirements.txt
+
+# deps-*: always install the additional deps together with core's requirements.txt,
+# so that pip does not ignore the version constraints core itself declares,
+# but instead tries to resolve a mutually compatible set.
 
 # Dependencies for deployment in an ubuntu/debian linux
 deps-ubuntu:
 	apt-get install -y python3 imagemagick libgeos-dev libxml2-dev libxslt-dev libssl-dev
 
+# Dependencies for deployment via Conda (version spec is quoted so the shell never globs it)
+deps-conda: get-conda
+	conda install -c conda-forge 'python==3.8.*' imagemagick geos pkgconfig
+
 # Install test python deps via pip
 deps-test:
 	$(PIP) install -U pip
@@ -395,8 +416,16 @@ docker-cuda-torch: DOCKER_FILE = Dockerfile.cuda-torch
 
 docker-cuda-torch: docker-cuda
 
+# if the current ref is a release (exactly vMAJOR.MINOR.PATCH), additionally tag the image with that version
+docker docker-cuda docker-cuda-tf1 docker-cuda-tf2 docker-cuda-torch: GIT_TAG := $(strip $(shell git describe --tags 2>/dev/null | grep -xE "v[0-9]+\.[0-9]+\.[0-9]+"))
 docker docker-cuda docker-cuda-tf1 docker-cuda-tf2 docker-cuda-torch:
-	$(DOCKER_BUILD) -f $(DOCKER_FILE) $(DOCKER_TAG:%=-t %) --target ocrd_core_base --build-arg BASE_IMAGE=$(lastword $(DOCKER_BASE_IMAGE)) $(DOCKER_ARGS) .
+	$(DOCKER_BUILD) -f $(DOCKER_FILE) $(DOCKER_TAG:%=-t %) \
+	$(if $(GIT_TAG),$(DOCKER_TAG:%=-t %:$(GIT_TAG))) \
+	--target ocrd_core_base \
+	--build-arg BASE_IMAGE=$(lastword $(DOCKER_BASE_IMAGE)) \
+	--build-arg VCS_REF=$$(git rev-parse --short HEAD) \
+	--build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \
+	$(DOCKER_ARGS) .
 
 # Build wheels and source dist and twine upload them
 pypi: build

diff --git a/repo/spec b/repo/spec
diff --git a/src/ocrd/ocrd-all-tool.json b/src/ocrd/ocrd-all-tool.json
@@ -1,21 +1,45 @@
 {
-  "ocrd-dummy": {
-    "executable": "ocrd-dummy",
-    "description": "Bare-bones processor creates PAGE-XML and optionally copies file from input group to output group",
-    "steps": [
-      "preprocessing/optimization"
-    ],
-    "categories": [
-      "Image preprocessing"
-    ],
-    "input_file_grp": "DUMMY_INPUT",
-    "output_file_grp": "DUMMY_OUTPUT",
-    "parameters": {
-      "copy_files": {
-        "type": "boolean",
-        "default": false,
-        "description": "Whether to actually copy files (true) or just create PAGE-XML as a side effect (false)"
-      }
-    }
+ "ocrd-dummy": {
+  "executable": "ocrd-dummy",
+  "description": "Bare-bones processor creates PAGE-XML and optionally copies file from input group to output group",
+  "steps": [
+   "preprocessing/optimization"
+  ],
+  "categories": [
+   "Image preprocessing"
+  ],
+  "input_file_grp_cardinality": 1,
+  "output_file_grp_cardinality": 1,
+  "parameters": {
+   "copy_files": {
+    "type": "boolean",
+    "default": false,
+    "description": "Whether to actually copy files (true) or just create PAGE-XML as a side effect (false)"
+   }
   }
-}
+ },
+ "ocrd-filter": {
+  "executable": "ocrd-filter",
+  "description": "Bare-bones processor can be dynamically configured to remove segments based on XPath queries",
+  "steps": [
+   "recognition/post-correction"
+  ],
+  "categories": [
+   "Quality assurance"
+  ],
+  "input_file_grp_cardinality": 1,
+  "output_file_grp_cardinality": 1,
+  "parameters": {
+   "select": {
+    "type": "string",
+    "default": "//*[ends-with(local-name(),'Region')]",
+    "description": "Which segments to select for removal. An XPath 2.0 query expression (path and optional predicates), with 'pc' as namespace prefix for PAGE-XML and our extension functions (see help text). Only selection of segment hierarchy elements is allowed (so e.g. `*` would be equivalent to `pc:NoiseRegion|pc:LineDrawingRegion|pc:AdvertRegion|pc:ImageRegion|pc:ChartRegion|pc:MusicRegion|pc:GraphicRegion|pc:UnknownRegion|pc:CustomRegion|pc:SeparatorRegion|pc:MathsRegion|pc:TextRegion|pc:MapRegion|pc:ChemRegion|pc:TableRegion|pc:TextLine|pc:Word|pc:Glyph`, but `pc:MetadataItem` or `pc:Border` or `pc:Coords` would not match).\nFor example, to remove words or glyphs with low text confidence, select '(pc:Word|pc:Glyph)[pc:TextEquiv/@conf < 0.7]'. Or low layout confidence, '*[pc:Coords/@conf < 0.7]'.\nTo remove high pixel-to-character rate, select '*[pc:pixelarea(.) div string-length(pc:textequiv(.)) > 10000]'."
+   },
+   "plot": {
+    "type": "boolean",
+    "default": false,
+    "description": "Whether to extract an image for each filtered segment and write to the output fileGrp."
+   }
+  }
+ }
+}
diff --git a/tests/cli/test_resmgr.py b/tests/cli/test_resmgr.py
@@ -8,7 +8,7 @@
 from ocrd.resource_manager import OcrdResourceManager
 
 runner = CliRunner()
-executable = 'ocrd-dummy'
+executable = 'ocrd-test-dummy'
 
 @fixture
 def mgr_with_tmp_path(tmp_path):

diff --git a/tests/network/docker-compose.yml b/tests/network/docker-compose.yml
@@ -52,6 +52,7 @@ services:
       args:
         BASE_IMAGE: 'ubuntu:22.04'
       target: ocrd_core_test
+    pull_policy: build
     hostname: ${OCRD_PS_HOST}
     container_name: ocrd_network_processing_server
     depends_on:
@@ -84,6 +85,13 @@ services:
 
   ocrd_dummy_processing_worker:
     image: "ocrd_core_test"
+    build:
+      context: ../../
+      dockerfile: Dockerfile
+      args:
+        BASE_IMAGE: 'ubuntu:22.04'
+      target: ocrd_core_test
+    pull_policy: build
     depends_on:
       ocrd_network_processing_server:
         condition: service_healthy
@@ -100,6 +108,13 @@ services:
 
   ocrd_network_core_test:
     image: "ocrd_core_test"
+    build:
+      context: ../../
+      dockerfile: Dockerfile
+      args:
+        BASE_IMAGE: 'ubuntu:22.04'
+      target: ocrd_core_test
+    pull_policy: build
     container_name: core_test
     depends_on:
       ocrd_network_processing_server:

diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py
@@ -525,7 +525,7 @@ def test_run_output_parallel(start_mets_server):
                   parameter={"sleep": 2},
                   mets_server_url=mets_server_url)
     run_time = time.time() - start_time
-    assert run_time < 3.2, f"run_processor took {run_time}s"
+    assert run_time < 3.5, f"run_processor took {run_time}s"
     assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG"))
     config.reset_defaults()
 
@@ -547,14 +547,14 @@ def test_run_output_parallel_caching(start_mets_server):
     start_time = time.time()
     proc1 = run_processor(DummyProcessorWithOutputSleep, **kwargs)
     run_time = time.time() - start_time
-    assert run_time < 3.2, f"run_processor took {run_time}s"
+    assert run_time < 3.5, f"run_processor took {run_time}s"
     assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG"))
     start_time = time.time()
     proc2 = run_processor(DummyProcessorWithOutputSleep, **kwargs)
     assert proc1 is proc2, "instance_caching must yield identical processor objects for equal parameters"
     run_time = time.time() - start_time
     # should be faster with default config.OCRD_EXISTING_OUTPUT==SKIP
-    assert run_time < 1.2, f"run_processor took {run_time}s"
+    assert run_time < 1.5, f"run_processor took {run_time}s"
     config.reset_defaults()
 
 if __name__ == "__main__":