From f6a35cb3cdbdeacacf3e0a86037397d9374affad Mon Sep 17 00:00:00 2001 From: Alifais Farrel Ramdhani Date: Wed, 3 Sep 2025 16:24:45 -0700 Subject: [PATCH 01/15] Add Dockerfile for CUDA GPU deployment Introduces a multi-stage Dockerfile for building and running kolosal-server with CUDA support. Updates README with instructions for building and running the server in a Docker container, including health checks and config mounting. --- Dockerfile | 145 +++++++++++++++++++++++++++++++++++++++++++++++++++++ README.md | 34 +++++++++++++ 2 files changed, 179 insertions(+) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..c61d00a0 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,145 @@ +# syntax=docker/dockerfile:1.7-labs + +######################################## +# Kolosal Server – CUDA Docker image +# - Multi-stage: build (devel) -> runtime (slim) +# - Defaults to GPU (CUDA) build +# - Uses config_rms.yaml as default config +# - Copies only required runtime bits +######################################## + +ARG CUDA_VERSION=12.4.1 +ARG BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 +FROM ${BASE_IMAGE} AS build + +ARG DEBIAN_FRONTEND=noninteractive +ARG TZ=UTC +ARG BUILD_TYPE=Release +ARG ENABLE_CUDA=ON +ARG ENABLE_NATIVE_OPTIMIZATION=OFF +ARG USE_PODOFO=ON + +ENV TZ=${TZ} \ + CC=gcc \ + CXX=g++ \ + BUILD_TYPE=${BUILD_TYPE} + +# Build dependencies (system CURL required by inference/CMakeLists on Linux) +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential git pkg-config ca-certificates curl \ + cmake ninja-build ccache \ + libcurl4-openssl-dev libssl-dev libbz2-dev \ + libomp-dev libblas-dev liblapack-dev \ + # PDF (PoDoFo) optional deps – safe to install even if disabled + libfreetype6-dev libjpeg-dev libpng-dev libtiff-dev libxml2-dev libfontconfig1-dev \ + && rm -rf /var/lib/apt/lists/* + +# Speed up rebuilds +ENV PATH=/usr/lib/ccache:${PATH} \ + CCACHE_DIR=/root/.ccache \ + CCACHE_MAXSIZE=1G + +WORKDIR /src + +# Copy repository (rely on .dockerignore to keep context small) +COPY . . + +# Initialize submodules when available (no-op if not a git context) +RUN if [ -d .git ]; then git submodule update --init --recursive; else echo "No .git directory – skipping submodules"; fi + +# Configure & build (CUDA by default) +RUN set -eux; \ + cmake -S . 
-B build -G Ninja \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DCMAKE_C_COMPILER=${CC} -DCMAKE_CXX_COMPILER=${CXX} \ + -DENABLE_NATIVE_OPTIMIZATION=${ENABLE_NATIVE_OPTIMIZATION} \ + -DUSE_CUDA=${ENABLE_CUDA} \ + -DUSE_PODOFO=${USE_PODOFO}; \ + cmake --build build --config ${BUILD_TYPE} + +# Determine single-config output dir (project sets output to build/) +RUN set -eux; \ + OUTDIR="build/${BUILD_TYPE}"; \ + test -x "${OUTDIR}/kolosal-server" || { echo "Build output not found at ${OUTDIR}"; ls -la build || true; exit 1; }; + +# Collect runtime payload +RUN set -eux; \ + OUTDIR="build/${BUILD_TYPE}"; \ + strip -s "${OUTDIR}/kolosal-server" || true; \ + mkdir -p /out/bin /out/config /out/libs /out/licenses; \ + cp "${OUTDIR}/kolosal-server" /out/bin/; \ + # Prefer configs/config_rms.yaml by default + if [ -f configs/config_rms.yaml ]; then \ + cp configs/config_rms.yaml /out/config/config_rms.yaml; \ + cp configs/config_rms.yaml /out/config/config.yaml; \ + elif [ -f config_rms.yaml ]; then \ + cp config_rms.yaml /out/config/config_rms.yaml; \ + cp config_rms.yaml /out/config/config.yaml; \ + elif [ -f config.yaml ]; then \ + cp config.yaml /out/config/config.yaml; \ + else \ + echo "No config found; you can mount one at runtime"; \ + fi; \ + # Shared libs placed by post-build step alongside the exe + for p in libllama-*.so* libkolosal_server.so*; do \ + if ls "${OUTDIR}/$p" 1>/dev/null 2>&1; then \ + cp -n "${OUTDIR}/"$p /out/libs/ || true; \ + fi; \ + done; \ + # Non-system dependencies referenced by the binary + ldd "${OUTDIR}/kolosal-server" | awk '{for(i=1;i<=NF;i++) if ($i ~ /\//) print $i}' | sort -u > /tmp/libs.txt || true; \ + while read -r lib; do case "$lib" in /lib/*|/usr/lib/*) ;; *) cp -n "$lib" /out/libs/ 2>/dev/null || true ;; esac; done < /tmp/libs.txt; \ + cp LICENSE /out/licenses/ 2>/dev/null || true; \ + echo "Collected libs:"; ls -1 /out/libs || true + +######################################## +# Runtime image +######################################## +FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04 AS runtime + +ARG DEBIAN_FRONTEND=noninteractive +ENV LD_LIBRARY_PATH=/usr/local/lib:/app/libs:/usr/local/cuda/lib64 \ + KOL_MODELS_DIR=/app/models + +# Minimal runtime deps (keep in sync with ldd if needed) +RUN apt-get update && apt-get install -y --no-install-recommends \ + libcurl4 libssl3 libbz2-1.0 zlib1g libgomp1 ca-certificates curl tini \ + # PoDoFo runtime libs (safe if unused) + libfreetype6 libjpeg-turbo8 libpng16-16 libtiff5 libxml2 fontconfig \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +COPY --from=build /out/bin/kolosal-server /usr/local/bin/kolosal-server +COPY --from=build /out/config /app/config +COPY --from=build /out/libs /app/libs +COPY --from=build /out/licenses /licenses + +# Make inference libs discoverable and refresh linker cache +RUN set -eux; \ + mkdir -p /usr/local/lib; \ + if ls /app/libs/libllama-*.so* 1>/dev/null 2>&1; then cp /app/libs/libllama-*.so* /usr/local/lib/ || true; fi; \ + if ls /app/libs/libkolosal_server.so* 1>/dev/null 2>&1; then cp /app/libs/libkolosal_server.so* /usr/local/lib/ || true; fi; \ + ldconfig || true + +# Simple entrypoint wrapper – use config_rms.yaml by default when present +RUN printf '%s\n' '#!/bin/sh' \ + 'set -e' \ + 'CONFIG="/app/config/config_rms.yaml"' \ + 'if [ ! 
-f "$CONFIG" ]; then CONFIG="/app/config/config.yaml"; fi' \ + 'echo "Starting kolosal-server with: $CONFIG"' \ + 'exec kolosal-server --config "$CONFIG"' \ + > /usr/local/bin/kolosal-entry.sh && chmod +x /usr/local/bin/kolosal-entry.sh + +# Non-root runtime +RUN useradd -r -u 10001 -d /app kolosal && chown -R kolosal:kolosal /app +USER kolosal + +VOLUME ["/app/models", "/app/data"] + +EXPOSE 8080 + +HEALTHCHECK --interval=30s --timeout=5s --start-period=15s --retries=3 \ + CMD curl -fsS http://localhost:8080/v1/health || exit 1 + +ENTRYPOINT ["/usr/bin/tini", "--", "/usr/local/bin/kolosal-entry.sh"] diff --git a/README.md b/README.md index e83b5782..66e9b864 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,40 @@ A high-performance inference server for large language models with OpenAI-compat ## Quick Start +### Docker (CUDA GPU) + +Prerequisites: +- NVIDIA GPU + drivers on host +- NVIDIA Container Toolkit installed + +Build (CUDA by default): + +```powershell +docker build -t kolosal-server:cuda . --build-arg BUILD_TYPE=Release --build-arg ENABLE_CUDA=ON +``` + +Run on GPU (uses configs/config_rms.yaml by default inside the image): + +```powershell +# expose port 8080; mount models dir (optional) +docker run --rm --gpus all -p 8080:8080 -v ${PWD}\models:/app/models kolosal-server:cuda +``` + +Use a custom config (for example, your edited config_rms.yaml in the local configs folder): + +```powershell +docker run --rm --gpus all -p 8080:8080 ^ + -v ${PWD}\models:/app/models ^ + -v ${PWD}\configs:/app/config ^ + kolosal-server:cuda +``` + +Health check: + +```powershell +curl http://localhost:8080/v1/health +``` + ### Linux (Recommended) #### Prerequisites From ed8bb4bff6718938df7c06784ede1153bb46fc92 Mon Sep 17 00:00:00 2001 From: Alifais Farrel Ramdhani Date: Wed, 3 Sep 2025 16:37:56 -0700 Subject: [PATCH 02/15] Update config for Docker and CUDA inference engine Changed server port to 8080 for Docker compatibility and updated SearXNG URL to a public instance. Added CUDA-based inference engine as default, adjusted library paths for Docker/Linux, and set model inference engines to use CUDA. Updated comments and parameters for better clarity and GPU layer offloading. 
--- configs/config_rms.yaml | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/configs/config_rms.yaml b/configs/config_rms.yaml index 555ac174..4493948c 100644 --- a/configs/config_rms.yaml +++ b/configs/config_rms.yaml @@ -1,6 +1,6 @@ # Server Configuration server: - port: "8084" # Port number to run the server on + port: "8080" # Port number to run the server on (Docker exposes 8080) host: "0.0.0.0" # Host to bind the server; 0.0.0.0 means all available interfaces idle_timeout: 300 # Idle timeout in seconds allow_public_access: false # Allow access from other devices on the same network @@ -54,7 +54,7 @@ auth: # Search Integration (e.g., with SearXNG) search: enabled: true - searxng_url: http://localhost:8090 # URL of SearXNG or compatible search engine + searxng_url: https://searx.stream # URL of SearXNG or compatible search engine timeout: 30 # Search timeout in seconds max_results: 20 # Maximum number of results returned default_engine: "" # Optional default search engine @@ -79,14 +79,22 @@ database: # Inference Engine Definitions inference_engines: + # CUDA (GPU) engine used inside Docker/Linux runtime + - name: llama-cuda + library_path: /usr/local/lib/libllama-cuda.so # Path inside Docker image; present at runtime + version: 1.0.0 + description: NVIDIA CUDA-accelerated inference engine for LLaMA models + load_on_startup: true + + # Optional CPU fallback (useful for Windows dev; will be ignored in Docker if not present) - name: llama-cpu - library_path: ./build/Release/llama-cpu.dll # Path to the inference engine library + library_path: /usr/local/lib/libllama-cpu.so version: 1.0.0 description: CPU-based inference engine for LLaMA models - load_on_startup: true # Whether to load this engine when the server starts + load_on_startup: false # Default inference engine to use -default_inference_engine: llama-cpu +default_inference_engine: llama-cuda # General feature toggles features: @@ -111,7 +119,7 @@ models: type: embedding load_immediately: true main_gpu_id: 0 - inference_engine: llama-cpu + inference_engine: llama-cuda load_params: n_ctx: 4096 n_keep: 1024 @@ -120,7 +128,7 @@ models: n_parallel: 1 cont_batching: true warmup: false - n_gpu_layers: 100 + n_gpu_layers: 100 # Increase or set to -1 to offload all layers if VRAM allows n_batch: 2048 n_ubatch: 512 @@ -129,7 +137,7 @@ models: type: llm load_immediately: true main_gpu_id: 0 - inference_engine: llama-cpu + inference_engine: llama-cuda load_params: n_ctx: 2048 n_keep: 1024 @@ -138,6 +146,6 @@ models: n_parallel: 1 cont_batching: true warmup: false - n_gpu_layers: 100 + n_gpu_layers: 100 # Increase or set to -1 to offload all layers if VRAM allows n_batch: 2048 n_ubatch: 512 From 0f111495241eb4215b71f2008152b640d728eebd Mon Sep 17 00:00:00 2001 From: Alifais Farrel Ramdhani Date: Thu, 4 Sep 2025 11:53:59 -0700 Subject: [PATCH 03/15] Fix indentation for llama-cpu library_path in YAML Corrected the indentation of the 'library_path' field under the 'llama-cpu' inference engine to ensure proper YAML parsing and configuration loading. 
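Indentation slips like this are easy to catch before committing. A minimal pre-commit sanity check, assuming Python with PyYAML is available on the dev machine:

```bash
# Parse the config; PyYAML raises on any syntax or indentation error
python3 -c "import yaml; yaml.safe_load(open('configs/config_rms.yaml')); print('config OK')"
```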
--- configs/config_rms.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/config_rms.yaml b/configs/config_rms.yaml index 4493948c..4395a0e3 100644 --- a/configs/config_rms.yaml +++ b/configs/config_rms.yaml @@ -88,7 +88,7 @@ inference_engines: # Optional CPU fallback (useful for Windows dev; will be ignored in Docker if not present) - name: llama-cpu - library_path: /usr/local/lib/libllama-cpu.so + library_path: /usr/local/lib/libllama-cpu.so version: 1.0.0 description: CPU-based inference engine for LLaMA models load_on_startup: false From 2558bc77ab97dcc2c99e71bae01d272b4efde468 Mon Sep 17 00:00:00 2001 From: Alifais Farrel Ramdhani Date: Thu, 4 Sep 2025 12:02:28 -0700 Subject: [PATCH 04/15] Update Dockerfile --- Dockerfile | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Dockerfile b/Dockerfile index c61d00a0..580819dd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -47,6 +47,16 @@ COPY . . # Initialize submodules when available (no-op if not a git context) RUN if [ -d .git ]; then git submodule update --init --recursive; else echo "No .git directory – skipping submodules"; fi +# Ensure llama.cpp source is present (fallback when submodules are not in context) +RUN set -eux; \ + if [ ! -f inference/external/llama.cpp/CMakeLists.txt ] && [ ! -f external/llama.cpp/CMakeLists.txt ]; then \ + echo "[Docker build] llama.cpp not found in repo – cloning shallow copy..."; \ + mkdir -p inference/external; \ + git clone --depth=1 https://github.com/ggerganov/llama.cpp.git inference/external/llama.cpp; \ + else \ + echo "[Docker build] Found llama.cpp sources in repo"; \ + fi + # Configure & build (CUDA by default) RUN set -eux; \ cmake -S . -B build -G Ninja \ From cc488b47adde9ec2af321a0caf21fac78fc04387 Mon Sep 17 00:00:00 2001 From: Alifais Farrel Ramdhani Date: Thu, 4 Sep 2025 12:04:32 -0700 Subject: [PATCH 05/15] Pin and upgrade CMake version in Dockerfile Added CMAKE_VERSION argument and logic to ensure CMake is upgraded to version 3.27.9 or higher, as required by PoDoFo. This improves build reliability by guaranteeing a compatible CMake version. 
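The upgrade gate in the diff below hinges on `sort -V`. The core idea, shown standalone (a sketch mirroring the Dockerfile logic):

```bash
# If the smaller of (required, installed) is not the required version,
# the installed CMake predates the pin and must be replaced.
need=3.27.9
have="$(cmake --version | awk 'NR==1{print $3}')"
if [ "$(printf '%s\n' "$need" "$have" | sort -V | head -n1)" != "$need" ]; then
  echo "cmake $have is older than $need; installing pinned release"
fi
```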
--- Dockerfile | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 580819dd..a86f9e2f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,6 +18,7 @@ ARG BUILD_TYPE=Release ARG ENABLE_CUDA=ON ARG ENABLE_NATIVE_OPTIMIZATION=OFF ARG USE_PODOFO=ON +ARG CMAKE_VERSION=3.27.9 ENV TZ=${TZ} \ CC=gcc \ @@ -27,13 +28,27 @@ ENV TZ=${TZ} \ # Build dependencies (system CURL required by inference/CMakeLists on Linux) RUN apt-get update && apt-get install -y --no-install-recommends \ build-essential git pkg-config ca-certificates curl \ - cmake ninja-build ccache \ + cmake ninja-build ccache \ libcurl4-openssl-dev libssl-dev libbz2-dev \ libomp-dev libblas-dev liblapack-dev \ # PDF (PoDoFo) optional deps – safe to install even if disabled libfreetype6-dev libjpeg-dev libpng-dev libtiff-dev libxml2-dev libfontconfig1-dev \ && rm -rf /var/lib/apt/lists/* +# Upgrade to pinned CMake (>=3.23 required by PoDoFo) +RUN set -eux; \ + ver="$(${SHELL:-/bin/sh} -c 'cmake --version 2>/dev/null | awk "NR==1{print \$3}"' || true)"; \ + need="${CMAKE_VERSION}"; \ + if [ -z "$ver" ] || [ "$(printf '%s\n' "$need" "$ver" | sort -V | head -n1)" != "$need" ] || [ "$ver" != "$need" ]; then \ + cd /tmp; \ + curl -fsSL -o cmake.tar.gz https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz; \ + tar -xf cmake.tar.gz; \ + cp -r cmake-${CMAKE_VERSION}-linux-x86_64/bin/* /usr/local/bin/; \ + cp -r cmake-${CMAKE_VERSION}-linux-x86_64/share/cmake* /usr/local/share/ || true; \ + rm -rf cmake-* cmake.tar.gz; \ + fi; \ + cmake --version + # Speed up rebuilds ENV PATH=/usr/lib/ccache:${PATH} \ CCACHE_DIR=/root/.ccache \ From 11c0731a9a2845f7c112d6eb71efc69cc3d7db92 Mon Sep 17 00:00:00 2001 From: Alifais Farrel Ramdhani Date: Thu, 4 Sep 2025 12:07:40 -0700 Subject: [PATCH 06/15] Clone zlib and pugixml if not present in Docker build Adds steps to the Dockerfile to automatically clone zlib and pugixml repositories if their sources are not found in the build context. This ensures required external dependencies are available for the build process. --- Dockerfile | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index a86f9e2f..bfba0d63 100644 --- a/Dockerfile +++ b/Dockerfile @@ -72,6 +72,23 @@ RUN set -eux; \ echo "[Docker build] Found llama.cpp sources in repo"; \ fi +# Ensure other required externals are present when not vendored in context +RUN set -eux; \ + if [ ! -f external/zlib/contrib/minizip/ioapi.c ]; then \ + echo "[Docker build] zlib not found – cloning..."; \ + rm -rf external/zlib && mkdir -p external; \ + git clone --depth=1 https://github.com/madler/zlib.git external/zlib; \ + else \ + echo "[Docker build] Found zlib"; \ + fi; \ + if [ ! -f external/pugixml/src/pugixml.cpp ]; then \ + echo "[Docker build] pugixml not found – cloning..."; \ + rm -rf external/pugixml && mkdir -p external; \ + git clone --depth=1 https://github.com/zeux/pugixml.git external/pugixml; \ + else \ + echo "[Docker build] Found pugixml"; \ + fi + # Configure & build (CUDA by default) RUN set -eux; \ cmake -S . 
-B build -G Ninja \ @@ -79,7 +96,8 @@ RUN set -eux; \ -DCMAKE_C_COMPILER=${CC} -DCMAKE_CXX_COMPILER=${CXX} \ -DENABLE_NATIVE_OPTIMIZATION=${ENABLE_NATIVE_OPTIMIZATION} \ -DUSE_CUDA=${ENABLE_CUDA} \ - -DUSE_PODOFO=${USE_PODOFO}; \ + -DUSE_PODOFO=${USE_PODOFO} \ + -DCUDA_CUDA_LIBRARY=/usr/local/cuda/lib64/stubs/libcuda.so; \ cmake --build build --config ${BUILD_TYPE} # Determine single-config output dir (project sets output to build/) From 473972eba84164ccd3ce0bebbdb5287aab55d583 Mon Sep 17 00:00:00 2001 From: Alifais Farrel Ramdhani Date: Thu, 4 Sep 2025 12:48:26 -0700 Subject: [PATCH 07/15] Remove zlib and pugixml cloning from Dockerfile Eliminates the steps that clone zlib and pugixml if not present, assuming these dependencies are now handled elsewhere or are always available in the build context. --- Dockerfile | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/Dockerfile b/Dockerfile index bfba0d63..cfc22c2d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -72,23 +72,6 @@ RUN set -eux; \ echo "[Docker build] Found llama.cpp sources in repo"; \ fi -# Ensure other required externals are present when not vendored in context -RUN set -eux; \ - if [ ! -f external/zlib/contrib/minizip/ioapi.c ]; then \ - echo "[Docker build] zlib not found – cloning..."; \ - rm -rf external/zlib && mkdir -p external; \ - git clone --depth=1 https://github.com/madler/zlib.git external/zlib; \ - else \ - echo "[Docker build] Found zlib"; \ - fi; \ - if [ ! -f external/pugixml/src/pugixml.cpp ]; then \ - echo "[Docker build] pugixml not found – cloning..."; \ - rm -rf external/pugixml && mkdir -p external; \ - git clone --depth=1 https://github.com/zeux/pugixml.git external/pugixml; \ - else \ - echo "[Docker build] Found pugixml"; \ - fi - # Configure & build (CUDA by default) RUN set -eux; \ cmake -S . -B build -G Ninja \ @@ -97,7 +80,8 @@ RUN set -eux; \ -DENABLE_NATIVE_OPTIMIZATION=${ENABLE_NATIVE_OPTIMIZATION} \ -DUSE_CUDA=${ENABLE_CUDA} \ -DUSE_PODOFO=${USE_PODOFO} \ - -DCUDA_CUDA_LIBRARY=/usr/local/cuda/lib64/stubs/libcuda.so; \ + -DBUILD_TESTING=OFF \ + -DBUILD_INFERENCE_TESTS=OFF; \ cmake --build build --config ${BUILD_TYPE} # Determine single-config output dir (project sets output to build/) From 68fc324ae7350702fcbcbfeb36ae7ce0534dbd62 Mon Sep 17 00:00:00 2001 From: Alifais Farrel Ramdhani Date: Thu, 4 Sep 2025 12:52:01 -0700 Subject: [PATCH 08/15] Add CUDA library path to CMake build Sets the CUDA_CUDA_LIBRARY variable in the CMake configuration to explicitly specify the location of libcuda.so. This helps ensure correct linking when building with CUDA support. --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index cfc22c2d..0c99a9c2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -80,6 +80,7 @@ RUN set -eux; \ -DENABLE_NATIVE_OPTIMIZATION=${ENABLE_NATIVE_OPTIMIZATION} \ -DUSE_CUDA=${ENABLE_CUDA} \ -DUSE_PODOFO=${USE_PODOFO} \ + -DCUDA_CUDA_LIBRARY=/usr/local/cuda/lib64/stubs/libcuda.so \ -DBUILD_TESTING=OFF \ -DBUILD_INFERENCE_TESTS=OFF; \ cmake --build build --config ${BUILD_TYPE} From 0269d97cd6f5f8f4fb1a8212b0e8881794898df2 Mon Sep 17 00:00:00 2001 From: Alifais Farrel Ramdhani Date: Thu, 4 Sep 2025 15:43:53 -0700 Subject: [PATCH 09/15] Enable GGML_CUDA_NO_VMM in Docker build Adds the -DGGML_CUDA_NO_VMM=ON flag to the CMake configuration in the Dockerfile to disable CUDA VMM support during build. 
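For context, a plausible reading not spelled out in the commit: the build stage links against the CUDA driver stub (`/usr/local/cuda/lib64/stubs/libcuda.so`, set via `CUDA_CUDA_LIBRARY` in an earlier patch), and GGML's VMM allocator depends on driver virtual-memory entry points that are not guaranteed in every deployment, so it is switched off for portability. To inspect what the built binary actually links against:

```bash
# Run inside the build stage; libcuda.so itself is supplied at runtime
# by the NVIDIA Container Toolkit, so it may show as "not found" here.
ldd build/Release/kolosal-server | grep -Ei 'cuda|cublas' || echo "no CUDA linkage found"
```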
--- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index 0c99a9c2..863e54fd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -80,6 +80,7 @@ RUN set -eux; \ -DENABLE_NATIVE_OPTIMIZATION=${ENABLE_NATIVE_OPTIMIZATION} \ -DUSE_CUDA=${ENABLE_CUDA} \ -DUSE_PODOFO=${USE_PODOFO} \ + -DGGML_CUDA_NO_VMM=ON \ -DCUDA_CUDA_LIBRARY=/usr/local/cuda/lib64/stubs/libcuda.so \ -DBUILD_TESTING=OFF \ -DBUILD_INFERENCE_TESTS=OFF; \ From 4bf9f83d7526c165a02e45d1d6580d31e6b02216 Mon Sep 17 00:00:00 2001 From: Alifais Farrel Ramdhani Date: Sat, 6 Sep 2025 14:42:37 -0700 Subject: [PATCH 10/15] Add BLAS, LAPACK, and gfortran to Docker runtime deps Added libblas3, liblapack3, and libgfortran5 to the minimal runtime dependencies in the Dockerfile to support applications requiring these scientific libraries. --- Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 863e54fd..f17ba3c2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -132,7 +132,8 @@ ENV LD_LIBRARY_PATH=/usr/local/lib:/app/libs:/usr/local/cuda/lib64 \ # Minimal runtime deps (keep in sync with ldd if needed) RUN apt-get update && apt-get install -y --no-install-recommends \ - libcurl4 libssl3 libbz2-1.0 zlib1g libgomp1 ca-certificates curl tini \ + libcurl4 libssl3 libbz2-1.0 zlib1g libgomp1 ca-certificates curl tini \ + libblas3 liblapack3 libgfortran5 \ # PoDoFo runtime libs (safe if unused) libfreetype6 libjpeg-turbo8 libpng16-16 libtiff5 libxml2 fontconfig \ && rm -rf /var/lib/apt/lists/* From 63ff476676e71193835c1ad88862c9dfb9535038 Mon Sep 17 00:00:00 2001 From: Alifais Farrel Ramdhani Date: Sat, 6 Sep 2025 15:02:43 -0700 Subject: [PATCH 11/15] Enable public access in server config Changed 'allow_public_access' to true in config_rms.yaml to permit access from other devices on the same network. --- configs/config_rms.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/config_rms.yaml b/configs/config_rms.yaml index 4395a0e3..836eb9ed 100644 --- a/configs/config_rms.yaml +++ b/configs/config_rms.yaml @@ -3,7 +3,7 @@ server: port: "8080" # Port number to run the server on (Docker exposes 8080) host: "0.0.0.0" # Host to bind the server; 0.0.0.0 means all available interfaces idle_timeout: 300 # Idle timeout in seconds - allow_public_access: false # Allow access from other devices on the same network + allow_public_access: true # Allow access from other devices on the same network allow_internet_access: false # Allow internet access (requires proper port forwarding) # Logging Configuration From e711342f4b7dc0b4d62c7035076d9c3eced66652 Mon Sep 17 00:00:00 2001 From: Alifais Farrel Ramdhani Date: Sat, 6 Sep 2025 15:08:41 -0700 Subject: [PATCH 12/15] Add basic config and update Dockerfile defaults Introduces configs/config_basic.yaml for minimal embedding-only deployments. Updates Dockerfile to prefer config_basic.yaml as the default configuration, with fallback logic for other config files and adjusts entrypoint to use config_basic.yaml by default. 
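To see which file the entrypoint's fallback chain would pick without starting the server, the same checks can be replayed by hand (image tag illustrative):

```bash
# Override the entrypoint and walk the fallback order used by kolosal-entry.sh
docker run --rm --entrypoint /bin/sh kolosal-server:cuda -c '
  for c in /app/config/config_basic.yaml /app/config/config_rms.yaml /app/config/config.yaml; do
    [ -f "$c" ] && { echo "entrypoint would use: $c"; exit 0; }
  done
  echo "no config baked into the image; mount one at runtime"'
```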
--- Dockerfile | 13 +++-- configs/config_basic.yaml | 102 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+), 3 deletions(-) create mode 100644 configs/config_basic.yaml diff --git a/Dockerfile b/Dockerfile index f17ba3c2..057dcf1e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -97,10 +97,16 @@ RUN set -eux; \ strip -s "${OUTDIR}/kolosal-server" || true; \ mkdir -p /out/bin /out/config /out/libs /out/licenses; \ cp "${OUTDIR}/kolosal-server" /out/bin/; \ - # Prefer configs/config_rms.yaml by default - if [ -f configs/config_rms.yaml ]; then \ + # Prefer basic (vanilla) config by default, then fall back + if [ -f configs/config_basic.yaml ]; then \ + cp configs/config_basic.yaml /out/config/config_basic.yaml; \ + cp configs/config_basic.yaml /out/config/config.yaml; \ + elif [ -f configs/config_rms.yaml ]; then \ cp configs/config_rms.yaml /out/config/config_rms.yaml; \ cp configs/config_rms.yaml /out/config/config.yaml; \ + elif [ -f config_basic.yaml ]; then \ + cp config_basic.yaml /out/config/config_basic.yaml; \ + cp config_basic.yaml /out/config/config.yaml; \ elif [ -f config_rms.yaml ]; then \ cp config_rms.yaml /out/config/config_rms.yaml; \ cp config_rms.yaml /out/config/config.yaml; \ @@ -155,7 +161,8 @@ RUN set -eux; \ # Simple entrypoint wrapper – use config_rms.yaml by default when present RUN printf '%s\n' '#!/bin/sh' \ 'set -e' \ - 'CONFIG="/app/config/config_rms.yaml"' \ + 'CONFIG="/app/config/config_basic.yaml"' \ + 'if [ ! -f "$CONFIG" ]; then CONFIG="/app/config/config_rms.yaml"; fi' \ 'if [ ! -f "$CONFIG" ]; then CONFIG="/app/config/config.yaml"; fi' \ 'echo "Starting kolosal-server with: $CONFIG"' \ 'exec kolosal-server --config "$CONFIG"' \ diff --git a/configs/config_basic.yaml b/configs/config_basic.yaml new file mode 100644 index 00000000..1956607e --- /dev/null +++ b/configs/config_basic.yaml @@ -0,0 +1,102 @@ +# Minimal config for deployments that only need embeddings into Qdrant. +# Assumptions: +# - Exposes HTTP on 8080 and binds to all interfaces. +# - Only an embedding model is loaded (no LLMs). +# - Qdrant is reachable at the configured host/port (use "localhost" on a single VM, +# or the service DNS name like "qdrant" when running in Kubernetes/Helm). 
+ +server: + port: "8080" + host: "0.0.0.0" + idle_timeout: 300 + allow_public_access: true + allow_internet_access: false + +logging: + level: INFO + file: "" + access_log: false + quiet_mode: false + show_request_details: false + +auth: + enabled: true + require_api_key: false + api_key_header: X-API-Key + api_keys: + - change_me_if_enabled + rate_limit: + enabled: true + max_requests: 100 + window_size: 60 + cors: + enabled: true + allow_credentials: false + max_age: 86400 + allowed_origins: ["*"] + allowed_methods: [GET, POST, PUT, DELETE, OPTIONS, HEAD, PATCH] + allowed_headers: [Content-Type, Authorization, X-Requested-With, Accept, Origin] + +# Disable internet search for this minimal embedding-only setup +search: + enabled: false + +database: + qdrant: + enabled: true + host: "localhost" # On K8s/Helm, set to the Qdrant service name, e.g., "qdrant" + port: 6333 + collection_name: "documents" + default_embedding_model: "qwen3-embedding-4b" + timeout: 30 + api_key: "" + max_connections: 10 + connection_timeout: 5 + +inference_engines: + - name: llama-cuda + library_path: /usr/local/lib/libllama-cuda.so + version: 1.0.0 + description: CUDA-accelerated inference engine for embeddings + load_on_startup: true + - name: llama-cpu + library_path: /usr/local/lib/libllama-cpu.so + version: 1.0.0 + description: CPU fallback (optional) + load_on_startup: false + +default_inference_engine: llama-cuda + +features: + health_check: true + metrics: true + +embedding_autoscaling: + enabled: true + min_instances: 1 + max_instances: 4 + scale_up_threshold: 10 + scale_down_threshold: 2 + scale_up_delay: 30 + scale_down_delay: 300 + check_interval: 15 + +# Only the embedding model is defined here; no LLMs. +models: + - id: qwen3-embedding-4b + path: https://huggingface.co/kolosal/qwen3-embedding-4b/resolve/main/Qwen3-Embedding-4B-Q4_K_M.gguf + type: embedding + load_immediately: true + main_gpu_id: 0 + inference_engine: llama-cuda + load_params: + n_ctx: 4096 + n_keep: 1024 + use_mmap: true + use_mlock: false + n_parallel: 1 + cont_batching: true + warmup: false + n_gpu_layers: 100 # Set to -1 to offload all layers if VRAM allows + n_batch: 2048 + n_ubatch: 512 From ea16d7906f2bb167222cc441c62d14a33ee25232 Mon Sep 17 00:00:00 2001 From: Alifais Farrel Ramdhani Date: Sun, 7 Sep 2025 00:54:04 -0700 Subject: [PATCH 13/15] Add Docker usage instructions to README Expanded the README with detailed steps for using prebuilt Docker images from GitHub Container Registry, including prerequisites, image pulling, running with GPU support, mounting directories, configuration options, updating/rollback, and troubleshooting. This helps users deploy the server more easily using Docker. --- README.md | 132 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) diff --git a/README.md b/README.md index 66e9b864..a3a056e3 100644 --- a/README.md +++ b/README.md @@ -58,6 +58,138 @@ Health check: curl http://localhost:8080/v1/health ``` +## Docker (prebuilt images from GHCR) + +Use a ready-made image from GitHub Container Registry so you don’t rebuild on every VM. 
+### Prerequisites
+
+- NVIDIA GPU + drivers on the host
+- NVIDIA Container Toolkit installed (for `--gpus all`)
+- Open port 8080/tcp on your VM firewall/security group if accessing remotely
+
+### Pull the image
+
+Windows PowerShell:
+
+```powershell
+docker pull ghcr.io/kolosalai/kolosal-server:v0.0.1
+```
+
+Linux/macOS:
+
+```bash
+docker pull ghcr.io/kolosalai/kolosal-server:v0.0.1
+```
+
+If your package is private, log in first with a Personal Access Token (scope: `read:packages`):
+
+```powershell
+$env:GHCR_TOKEN = "<your-token>"
+echo $env:GHCR_TOKEN | docker login ghcr.io -u <your-username> --password-stdin
+```
+
+### Run (GPU)
+
+Minimal run (exposes 8080 and runs with GPU):
+
+Windows PowerShell:
+
+```powershell
+docker run -d --name kolosal-server --gpus all --restart unless-stopped -p 8080:8080 ghcr.io/kolosalai/kolosal-server:v0.0.1
+```
+
+Linux/macOS:
+
+```bash
+docker run -d --name kolosal-server --gpus all --restart unless-stopped -p 8080:8080 ghcr.io/kolosalai/kolosal-server:v0.0.1
+```
+
+Mount a models directory (recommended):
+
+```powershell
+docker run -d --name kolosal-server --gpus all --restart unless-stopped `
+  -p 8080:8080 `
+  -v C:\kolosal\models:/app/models `
+  ghcr.io/kolosalai/kolosal-server:v0.0.1
+```
+
+Linux/macOS:
+
+```bash
+docker run -d --name kolosal-server --gpus all --restart unless-stopped \
+  -p 8080:8080 \
+  -v $PWD/models:/app/models \
+  ghcr.io/kolosalai/kolosal-server:v0.0.1
+```
+
+### Choose a config
+
+The image resolves its config in this order inside the container:
+
+- `/app/config/config_basic.yaml` (embedding-only “vanilla”)
+- `/app/config/config_rms.yaml`
+- `/app/config/config.yaml`
+
+To use your own config, bind-mount it to `/app/config/config.yaml`:
+
+Windows PowerShell:
+
+```powershell
+docker run -d --name kolosal-server --gpus all --restart unless-stopped `
+  -p 8080:8080 `
+  -v ${PWD}\configs\config_basic.yaml:/app/config/config.yaml:ro `
+  -v ${PWD}\models:/app/models `
+  ghcr.io/kolosalai/kolosal-server:v0.0.1
+```
+
+Linux/macOS:
+
+```bash
+docker run -d --name kolosal-server --gpus all --restart unless-stopped \
+  -p 8080:8080 \
+  -v $PWD/configs/config_basic.yaml:/app/config/config.yaml:ro \
+  -v $PWD/models:/app/models \
+  ghcr.io/kolosalai/kolosal-server:v0.0.1
+```
+
+Note: Ensure `server.allow_public_access: true` in your config if you’ll call the API from outside the container/host.
+
+### Verify and logs
+
+```powershell
+curl http://localhost:8080/v1/health
+docker logs -f kolosal-server
+```
+
+If a model shows as “unloaded”, that’s okay when `load_immediately` is false; it will load on first use.
+
+### Persist data
+
+- Models: mount a host folder to `/app/models`
+- App data/cache: mount a host folder to `/app/data` if desired
+
+### Update or rollback
+
+```powershell
+# Update to a new tag
+docker pull ghcr.io/kolosalai/kolosal-server:v0.0.2
+docker rm -f kolosal-server
+docker run -d --name kolosal-server --gpus all --restart unless-stopped -p 8080:8080 ghcr.io/kolosalai/kolosal-server:v0.0.2
+
+# Rollback to the previous tag
+docker rm -f kolosal-server
+docker run -d --name kolosal-server --gpus all --restart unless-stopped -p 8080:8080 ghcr.io/kolosalai/kolosal-server:v0.0.1
+```
+
+### Troubleshooting
+
+- Connection reset: confirm `-p 8080:8080` and `server.allow_public_access: true` in the config.
+- GPU not used: ensure NVIDIA drivers + the NVIDIA Container Toolkit are installed; run with `--gpus all`; check `nvidia-smi` on the host.
+- Missing models: mount a models directory to `/app/models` or use paths/URLs in your config.
+- 404s: check the path (`/v1/health`, `/v1/models`, etc.).
+- Authentication: set `auth.enabled` and `api_keys` in your config and send the `X-API-Key` header.
+
 ### Linux (Recommended)
 
 #### Prerequisites

From 8c2781459dd14faca4e2ba22f79cd7e23d66566b Mon Sep 17 00:00:00 2001
From: Alifais Farrel Ramdhani
Date: Mon, 8 Sep 2025 12:43:33 -0700
Subject: [PATCH 14/15] Disable rate limiting in basic config

Set 'rate_limit.enabled' to false in config_basic.yaml to turn off API
rate limiting, allowing unrestricted access during testing and development.
---
 configs/config_basic.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configs/config_basic.yaml b/configs/config_basic.yaml
index 1956607e..e34fcfe8 100644
--- a/configs/config_basic.yaml
+++ b/configs/config_basic.yaml
@@ -26,7 +26,7 @@ auth:
   api_keys:
     - change_me_if_enabled
   rate_limit:
-    enabled: true
+    enabled: false
     max_requests: 100
     window_size: 60
   cors:

From 40d67094fca326c18693243fa404420506950127 Mon Sep 17 00:00:00 2001
From: Alifais Farrel Ramdhani
Date: Mon, 8 Sep 2025 12:44:44 -0700
Subject: [PATCH 15/15] Revert "Disable rate limiting in basic config"

This reverts commit 8c2781459dd14faca4e2ba22f79cd7e23d66566b.
---
 configs/config_basic.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configs/config_basic.yaml b/configs/config_basic.yaml
index e34fcfe8..1956607e 100644
--- a/configs/config_basic.yaml
+++ b/configs/config_basic.yaml
@@ -26,7 +26,7 @@ auth:
   api_keys:
     - change_me_if_enabled
   rate_limit:
-    enabled: false
+    enabled: true
     max_requests: 100
     window_size: 60
   cors:
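Taken together, the 15-patch series leaves the repository in a state where an end-to-end smoke test looks roughly like this (tag, container name, and wait time are illustrative):

```bash
# Build the CUDA image, run it, probe the health endpoint, clean up.
docker build -t kolosal-server:cuda .
docker run -d --name kolosal-smoke --gpus all -p 8080:8080 kolosal-server:cuda
sleep 20   # give the server time to start (HEALTHCHECK start period is 15s)
curl -fsS http://localhost:8080/v1/health && echo "server healthy"
docker rm -f kolosal-smoke
```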