Skip to content

Commit

Permalink
Add docker config
Browse files Browse the repository at this point in the history
  • Loading branch information
gardner committed Mar 7, 2024
1 parent ad6c97c commit 10fa67e
Show file tree
Hide file tree
Showing 5 changed files with 159 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Editor, OS and VCS clutter that the image never needs
.DS_Store
.git
.vscode

# Local data; kept out of the build context
data/

# Host directories that docker-compose.yml bind-mounts into the container at
# runtime; copying them into the image would only bloat it and bust the cache
input/
output/

# Local settings files; supply configuration through environment variables
# instead of baking it into an image layer
.env
local.env
27 changes: 27 additions & 0 deletions .env.sample
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# See marker/settings.py for more options
# The following are the default values. Uncomment and change as needed.

# Please note the order of precedence for settings:
# 1. Environment variables
# 2. local.env file
# 3. Default values in marker/settings.py

# See https://docs.pydantic.dev/latest/concepts/pydantic_settings/#dotenv-env-support

# TESSDATA_PREFIX setting is set in the Dockerfile

## General settings:

# TORCH_DEVICE=cpu

# How much VRAM each GPU has (in GB).
# INFERENCE_RAM=12

# How much VRAM to allocate per task (in GB). Peak marker VRAM usage is around 3GB, but avg across workers is lower.
# VRAM_PER_TASK=2.5

# Enable debug logging
# DEBUG=False

# Default language we assume files to be in, should be one of the keys in TESSERACT_LANGUAGES
# DEFAULT_LANG=English
32 changes: 32 additions & 0 deletions .github/workflows/docker.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Builds the CPU and CUDA variants of the image on every push to master and
# publishes every tag that was built to Docker Hub.
name: Publish Docker image

on:
  push:
    branches:
      - master

jobs:
  build-and-push:
    runs-on: ubuntu-latest
    steps:
      - name: Check out the repository
        uses: actions/checkout@v4

      - name: Log in to Docker Hub
        uses: docker/login-action@v3.0.0
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}

      - name: Build the Docker image (CPU version)
        run: docker build . --file Dockerfile --tag gardner/marker:cpu-${{ github.sha }} --tag gardner/marker:latest --tag gardner/marker:cpu

      # Push every tag applied by the build step above; a tag that is only
      # built locally is lost when the runner is torn down.
      - name: Push the Docker image (CPU version)
        run: |
          docker push gardner/marker:cpu-${{ github.sha }}
          docker push gardner/marker:latest
          docker push gardner/marker:cpu

      - name: Build the Docker image (CUDA version)
        run: docker build . --build-arg BASE_IMAGE=pytorch/pytorch:2.1.1-cuda12.1-cudnn8-devel --file Dockerfile --tag gardner/marker:cuda-${{ github.sha }} --tag gardner/marker:cuda-latest --tag gardner/marker:cuda

      # Same as for the CPU variant: publish all three CUDA tags.
      - name: Push the Docker image (CUDA version)
        run: |
          docker push gardner/marker:cuda-${{ github.sha }}
          docker push gardner/marker:cuda-latest
          docker push gardner/marker:cuda

67 changes: 67 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Image for running marker. The base image is overridable: the default yields a
# CPU-only build, while a pytorch/pytorch:<tag> base reuses the PyTorch that
# image already ships (see the torch installation step below).
ARG BASE_IMAGE=python:3.10-bookworm
FROM ${BASE_IMAGE}

WORKDIR /app

# Pinned tool versions; override with --build-arg.
ARG PIP_VERSION=24.0
ARG POETRY_VERSION=1.7.1
ARG GS_VERSION=10.02.1
# Release tag segment of the Ghostscript download URL. It must be overridden
# together with GS_VERSION, otherwise the URL points at the wrong release.
ARG GS_TAG=gs10021
ARG TORCH_VERSION=2.1.2

# Build-time only (ARG rather than ENV) so that the noninteractive frontend is
# not baked into the environment of running containers.
ARG DEBIAN_FRONTEND=noninteractive
ENV GS_URL=https://github.com/ArtifexSoftware/ghostpdl-downloads/releases/download/${GS_TAG}/ghostscript-${GS_VERSION}.tar.gz

# Add the notesalexp.org repository (Tesseract 5 packages) and install the OS
# level dependencies. The repository key is stored as a dedicated keyring file
# and referenced with signed-by, instead of the deprecated `apt-key add`, so it
# is trusted for this repository only and a failed download aborts the build.
RUN apt-get update \
    && apt-get -y install apt-transport-https lsb-release wget gnupg2 \
    && install -m 0755 -d /etc/apt/keyrings \
    && wget -q -O /etc/apt/keyrings/notesalexp.asc https://notesalexp.org/debian/alexp_key.asc \
    && echo "deb [signed-by=/etc/apt/keyrings/notesalexp.asc] https://notesalexp.org/tesseract-ocr5/$(lsb_release -cs)/ $(lsb_release -cs) main" > /etc/apt/sources.list.d/notesalexp.list \
    && apt-get update \
    && apt-get install -y \
        build-essential \
        cmake \
        libmagic1 \
        libtesseract-dev \
        ocrmypdf \
        python3-dev \
        python3-pip \
        tesseract-ocr \
        tesseract-ocr-deu \
        tesseract-ocr-eng \
        tesseract-ocr-fra \
        tesseract-ocr-por \
        tesseract-ocr-rus \
        tesseract-ocr-spa \
    && rm -rf /var/lib/apt/lists/*

# Build Ghostscript from source and remove the sources in the same layer so
# they do not end up in the image.
RUN wget -q "${GS_URL}" \
    && tar -xf ghostscript-${GS_VERSION}.tar.gz \
    && cd ghostscript-${GS_VERSION} \
    && ./configure \
    && make -j "$(nproc)" \
    && make install \
    && cd .. \
    && rm -rf ghostscript-${GS_VERSION} ghostscript-${GS_VERSION}.tar.gz

# Poetry installs straight into the image's Python (no virtualenv).
RUN pip install --no-cache-dir pip==${PIP_VERSION} \
    && pip install --no-cache-dir poetry==${POETRY_VERSION} \
    && poetry config virtualenvs.create false

# If BASE_IMAGE is pytorch/pytorch:tag then pytorch will be installed with cuda support.
# If pytorch is not installed, install the cpu version.
RUN python -c "import torch" \
    || pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu/ \
        torch==${TORCH_VERSION} \
        torchvision \
        torchaudio==${TORCH_VERSION}

# Copy only the dependency manifests first so the dependency layer stays cached
# until they change.
COPY ./pyproject.toml ./poetry.lock ./

RUN poetry install --no-dev --no-interaction --no-ansi --no-root

ARG TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata
ENV TESSDATA_PREFIX=${TESSDATA_PREFIX}

# Fail the build early if TESSDATA_PREFIX does not point at an existing directory.
RUN test -d "${TESSDATA_PREFIX}" \
    || { echo "TESSDATA_PREFIX=${TESSDATA_PREFIX} is not a directory" >&2; exit 1; }

COPY . .
29 changes: 29 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Example service definition that builds and runs the CUDA variant of the image.
services:
  marker:
    build:
      context: .
      dockerfile: Dockerfile
      args:
        BASE_IMAGE: pytorch/pytorch:2.1.1-cuda12.1-cudnn8-devel
    command: python convert_single.py /input/thinkpython.pdf /output/thinkpython.md --parallel_factor 2 --max_pages 10
    shm_size: '12gb'  # ideally the same size as the GPU's VRAM
    volumes:
      # Host directories holding the source PDFs and the converted output
      - ./input:/input
      - ./output:/output
      # Named volume that persists /root/.cache between runs
      - xdg_cache:/root/.cache
    environment:
      NVIDIA_VISIBLE_DEVICES: all
      NVIDIA_DRIVER_CAPABILITIES: compute,utility
      TORCH_DEVICE: cuda
      INFERENCE_RAM: "12"
    deploy:
      resources:
        reservations:
          devices:
            # Reserve every NVIDIA GPU on the host for this service
            - driver: nvidia
              count: all
              capabilities:
                - gpu

volumes:
  xdg_cache:

0 comments on commit 10fa67e

Please sign in to comment.