From f6a35cb3cdbdeacacf3e0a86037397d9374affad Mon Sep 17 00:00:00 2001 From: Alifais Farrel Ramdhani Date: Wed, 3 Sep 2025 16:24:45 -0700 Subject: [PATCH 01/15] Add Dockerfile for CUDA GPU deployment Introduces a multi-stage Dockerfile for building and running kolosal-server with CUDA support. Updates README with instructions for building and running the server in a Docker container, including health checks and config mounting. --- Dockerfile | 145 +++++++++++++++++++++++++++++++++++++++++++++++++++++ README.md | 34 +++++++++++++ 2 files changed, 179 insertions(+) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..c61d00a0 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,145 @@ +# syntax=docker/dockerfile:1.7-labs + +######################################## +# Kolosal Server – CUDA Docker image +# - Multi-stage: build (devel) -> runtime (slim) +# - Defaults to GPU (CUDA) build +# - Uses config_rms.yaml as default config +# - Copies only required runtime bits +######################################## + +ARG CUDA_VERSION=12.4.1 +ARG BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 +FROM ${BASE_IMAGE} AS build + +ARG DEBIAN_FRONTEND=noninteractive +ARG TZ=UTC +ARG BUILD_TYPE=Release +ARG ENABLE_CUDA=ON +ARG ENABLE_NATIVE_OPTIMIZATION=OFF +ARG USE_PODOFO=ON + +ENV TZ=${TZ} \ + CC=gcc \ + CXX=g++ \ + BUILD_TYPE=${BUILD_TYPE} + +# Build dependencies (system CURL required by inference/CMakeLists on Linux) +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential git pkg-config ca-certificates curl \ + cmake ninja-build ccache \ + libcurl4-openssl-dev libssl-dev libbz2-dev \ + libomp-dev libblas-dev liblapack-dev \ + # PDF (PoDoFo) optional deps – safe to install even if disabled + libfreetype6-dev libjpeg-dev libpng-dev libtiff-dev libxml2-dev libfontconfig1-dev \ + && rm -rf /var/lib/apt/lists/* + +# Speed up rebuilds +ENV PATH=/usr/lib/ccache:${PATH} \ + CCACHE_DIR=/root/.ccache \ + CCACHE_MAXSIZE=1G + +WORKDIR /src + +# Copy repository (rely on .dockerignore to keep context small) +COPY . . + +# Initialize submodules when available (no-op if not a git context) +RUN if [ -d .git ]; then git submodule update --init --recursive; else echo "No .git directory – skipping submodules"; fi + +# Configure & build (CUDA by default) +RUN set -eux; \ + cmake -S . 
-B build -G Ninja \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DCMAKE_C_COMPILER=${CC} -DCMAKE_CXX_COMPILER=${CXX} \ + -DENABLE_NATIVE_OPTIMIZATION=${ENABLE_NATIVE_OPTIMIZATION} \ + -DUSE_CUDA=${ENABLE_CUDA} \ + -DUSE_PODOFO=${USE_PODOFO}; \ + cmake --build build --config ${BUILD_TYPE} + +# Determine single-config output dir (project sets output to build/) +RUN set -eux; \ + OUTDIR="build/${BUILD_TYPE}"; \ + test -x "${OUTDIR}/kolosal-server" || { echo "Build output not found at ${OUTDIR}"; ls -la build || true; exit 1; }; + +# Collect runtime payload +RUN set -eux; \ + OUTDIR="build/${BUILD_TYPE}"; \ + strip -s "${OUTDIR}/kolosal-server" || true; \ + mkdir -p /out/bin /out/config /out/libs /out/licenses; \ + cp "${OUTDIR}/kolosal-server" /out/bin/; \ + # Prefer configs/config_rms.yaml by default + if [ -f configs/config_rms.yaml ]; then \ + cp configs/config_rms.yaml /out/config/config_rms.yaml; \ + cp configs/config_rms.yaml /out/config/config.yaml; \ + elif [ -f config_rms.yaml ]; then \ + cp config_rms.yaml /out/config/config_rms.yaml; \ + cp config_rms.yaml /out/config/config.yaml; \ + elif [ -f config.yaml ]; then \ + cp config.yaml /out/config/config.yaml; \ + else \ + echo "No config found; you can mount one at runtime"; \ + fi; \ + # Shared libs placed by post-build step alongside the exe + for p in libllama-*.so* libkolosal_server.so*; do \ + if ls "${OUTDIR}/$p" 1>/dev/null 2>&1; then \ + cp -n "${OUTDIR}/"$p /out/libs/ || true; \ + fi; \ + done; \ + # Non-system dependencies referenced by the binary + ldd "${OUTDIR}/kolosal-server" | awk '{for(i=1;i<=NF;i++) if ($i ~ /\//) print $i}' | sort -u > /tmp/libs.txt || true; \ + while read -r lib; do case "$lib" in /lib/*|/usr/lib/*) ;; *) cp -n "$lib" /out/libs/ 2>/dev/null || true ;; esac; done < /tmp/libs.txt; \ + cp LICENSE /out/licenses/ 2>/dev/null || true; \ + echo "Collected libs:"; ls -1 /out/libs || true + +######################################## +# Runtime image +######################################## +FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04 AS runtime + +ARG DEBIAN_FRONTEND=noninteractive +ENV LD_LIBRARY_PATH=/usr/local/lib:/app/libs:/usr/local/cuda/lib64 \ + KOL_MODELS_DIR=/app/models + +# Minimal runtime deps (keep in sync with ldd if needed) +RUN apt-get update && apt-get install -y --no-install-recommends \ + libcurl4 libssl3 libbz2-1.0 zlib1g libgomp1 ca-certificates curl tini \ + # PoDoFo runtime libs (safe if unused) + libfreetype6 libjpeg-turbo8 libpng16-16 libtiff5 libxml2 fontconfig \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +COPY --from=build /out/bin/kolosal-server /usr/local/bin/kolosal-server +COPY --from=build /out/config /app/config +COPY --from=build /out/libs /app/libs +COPY --from=build /out/licenses /licenses + +# Make inference libs discoverable and refresh linker cache +RUN set -eux; \ + mkdir -p /usr/local/lib; \ + if ls /app/libs/libllama-*.so* 1>/dev/null 2>&1; then cp /app/libs/libllama-*.so* /usr/local/lib/ || true; fi; \ + if ls /app/libs/libkolosal_server.so* 1>/dev/null 2>&1; then cp /app/libs/libkolosal_server.so* /usr/local/lib/ || true; fi; \ + ldconfig || true + +# Simple entrypoint wrapper – use config_rms.yaml by default when present +RUN printf '%s\n' '#!/bin/sh' \ + 'set -e' \ + 'CONFIG="/app/config/config_rms.yaml"' \ + 'if [ ! 
-f "$CONFIG" ]; then CONFIG="/app/config/config.yaml"; fi' \ + 'echo "Starting kolosal-server with: $CONFIG"' \ + 'exec kolosal-server --config "$CONFIG"' \ + > /usr/local/bin/kolosal-entry.sh && chmod +x /usr/local/bin/kolosal-entry.sh + +# Non-root runtime +RUN useradd -r -u 10001 -d /app kolosal && chown -R kolosal:kolosal /app +USER kolosal + +VOLUME ["/app/models", "/app/data"] + +EXPOSE 8080 + +HEALTHCHECK --interval=30s --timeout=5s --start-period=15s --retries=3 \ + CMD curl -fsS http://localhost:8080/v1/health || exit 1 + +ENTRYPOINT ["/usr/bin/tini", "--", "/usr/local/bin/kolosal-entry.sh"] diff --git a/README.md b/README.md index e83b5782..66e9b864 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,40 @@ A high-performance inference server for large language models with OpenAI-compat ## Quick Start +### Docker (CUDA GPU) + +Prerequisites: +- NVIDIA GPU + drivers on host +- NVIDIA Container Toolkit installed + +Build (CUDA by default): + +```powershell +docker build -t kolosal-server:cuda . --build-arg BUILD_TYPE=Release --build-arg ENABLE_CUDA=ON +``` + +Run on GPU (uses configs/config_rms.yaml by default inside the image): + +```powershell +# expose port 8080; mount models dir (optional) +docker run --rm --gpus all -p 8080:8080 -v ${PWD}\models:/app/models kolosal-server:cuda +``` + +Use a custom config (for example, your edited config_rms.yaml in the local configs folder): + +```powershell +docker run --rm --gpus all -p 8080:8080 ^ + -v ${PWD}\models:/app/models ^ + -v ${PWD}\configs:/app/config ^ + kolosal-server:cuda +``` + +Health check: + +```powershell +curl http://localhost:8080/v1/health +``` + ### Linux (Recommended) #### Prerequisites From ed8bb4bff6718938df7c06784ede1153bb46fc92 Mon Sep 17 00:00:00 2001 From: Alifais Farrel Ramdhani Date: Wed, 3 Sep 2025 16:37:56 -0700 Subject: [PATCH 02/15] Update config for Docker and CUDA inference engine Changed server port to 8080 for Docker compatibility and updated SearXNG URL to a public instance. Added CUDA-based inference engine as default, adjusted library paths for Docker/Linux, and set model inference engines to use CUDA. Updated comments and parameters for better clarity and GPU layer offloading. 
--- configs/config_rms.yaml | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/configs/config_rms.yaml b/configs/config_rms.yaml index 555ac174..4493948c 100644 --- a/configs/config_rms.yaml +++ b/configs/config_rms.yaml @@ -1,6 +1,6 @@ # Server Configuration server: - port: "8084" # Port number to run the server on + port: "8080" # Port number to run the server on (Docker exposes 8080) host: "0.0.0.0" # Host to bind the server; 0.0.0.0 means all available interfaces idle_timeout: 300 # Idle timeout in seconds allow_public_access: false # Allow access from other devices on the same network @@ -54,7 +54,7 @@ auth: # Search Integration (e.g., with SearXNG) search: enabled: true - searxng_url: http://localhost:8090 # URL of SearXNG or compatible search engine + searxng_url: https://searx.stream # URL of SearXNG or compatible search engine timeout: 30 # Search timeout in seconds max_results: 20 # Maximum number of results returned default_engine: "" # Optional default search engine @@ -79,14 +79,22 @@ database: # Inference Engine Definitions inference_engines: + # CUDA (GPU) engine used inside Docker/Linux runtime + - name: llama-cuda + library_path: /usr/local/lib/libllama-cuda.so # Path inside Docker image; present at runtime + version: 1.0.0 + description: NVIDIA CUDA-accelerated inference engine for LLaMA models + load_on_startup: true + + # Optional CPU fallback (useful for Windows dev; will be ignored in Docker if not present) - name: llama-cpu - library_path: ./build/Release/llama-cpu.dll # Path to the inference engine library + library_path: /usr/local/lib/libllama-cpu.so version: 1.0.0 description: CPU-based inference engine for LLaMA models - load_on_startup: true # Whether to load this engine when the server starts + load_on_startup: false # Default inference engine to use -default_inference_engine: llama-cpu +default_inference_engine: llama-cuda # General feature toggles features: @@ -111,7 +119,7 @@ models: type: embedding load_immediately: true main_gpu_id: 0 - inference_engine: llama-cpu + inference_engine: llama-cuda load_params: n_ctx: 4096 n_keep: 1024 @@ -120,7 +128,7 @@ models: n_parallel: 1 cont_batching: true warmup: false - n_gpu_layers: 100 + n_gpu_layers: 100 # Increase or set to -1 to offload all layers if VRAM allows n_batch: 2048 n_ubatch: 512 @@ -129,7 +137,7 @@ models: type: llm load_immediately: true main_gpu_id: 0 - inference_engine: llama-cpu + inference_engine: llama-cuda load_params: n_ctx: 2048 n_keep: 1024 @@ -138,6 +146,6 @@ models: n_parallel: 1 cont_batching: true warmup: false - n_gpu_layers: 100 + n_gpu_layers: 100 # Increase or set to -1 to offload all layers if VRAM allows n_batch: 2048 n_ubatch: 512 From 0f111495241eb4215b71f2008152b640d728eebd Mon Sep 17 00:00:00 2001 From: Alifais Farrel Ramdhani Date: Thu, 4 Sep 2025 11:53:59 -0700 Subject: [PATCH 03/15] Fix indentation for llama-cpu library_path in YAML Corrected the indentation of the 'library_path' field under the 'llama-cpu' inference engine to ensure proper YAML parsing and configuration loading. 
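Indentation slips like this are easy to catch before committing. A minimal pre-commit sanity check, assuming Python with PyYAML is available on the dev machine:

```bash
# Parse the config; PyYAML raises on any syntax or indentation error
python3 -c "import yaml; yaml.safe_load(open('configs/config_rms.yaml')); print('config OK')"
```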
--- configs/config_rms.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/config_rms.yaml b/configs/config_rms.yaml index 4493948c..4395a0e3 100644 --- a/configs/config_rms.yaml +++ b/configs/config_rms.yaml @@ -88,7 +88,7 @@ inference_engines: # Optional CPU fallback (useful for Windows dev; will be ignored in Docker if not present) - name: llama-cpu - library_path: /usr/local/lib/libllama-cpu.so + library_path: /usr/local/lib/libllama-cpu.so version: 1.0.0 description: CPU-based inference engine for LLaMA models load_on_startup: false From 2558bc77ab97dcc2c99e71bae01d272b4efde468 Mon Sep 17 00:00:00 2001 From: Alifais Farrel Ramdhani Date: Thu, 4 Sep 2025 12:02:28 -0700 Subject: [PATCH 04/15] Update Dockerfile --- Dockerfile | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Dockerfile b/Dockerfile index c61d00a0..580819dd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -47,6 +47,16 @@ COPY . . # Initialize submodules when available (no-op if not a git context) RUN if [ -d .git ]; then git submodule update --init --recursive; else echo "No .git directory – skipping submodules"; fi +# Ensure llama.cpp source is present (fallback when submodules are not in context) +RUN set -eux; \ + if [ ! -f inference/external/llama.cpp/CMakeLists.txt ] && [ ! -f external/llama.cpp/CMakeLists.txt ]; then \ + echo "[Docker build] llama.cpp not found in repo – cloning shallow copy..."; \ + mkdir -p inference/external; \ + git clone --depth=1 https://github.com/ggerganov/llama.cpp.git inference/external/llama.cpp; \ + else \ + echo "[Docker build] Found llama.cpp sources in repo"; \ + fi + # Configure & build (CUDA by default) RUN set -eux; \ cmake -S . -B build -G Ninja \ From cc488b47adde9ec2af321a0caf21fac78fc04387 Mon Sep 17 00:00:00 2001 From: Alifais Farrel Ramdhani Date: Thu, 4 Sep 2025 12:04:32 -0700 Subject: [PATCH 05/15] Pin and upgrade CMake version in Dockerfile Added CMAKE_VERSION argument and logic to ensure CMake is upgraded to version 3.27.9 or higher, as required by PoDoFo. This improves build reliability by guaranteeing a compatible CMake version. 
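The upgrade gate in the diff below hinges on `sort -V`. The core idea, shown standalone (a sketch mirroring the Dockerfile logic):

```bash
# If the smaller of (required, installed) is not the required version,
# the installed CMake predates the pin and must be replaced.
need=3.27.9
have="$(cmake --version | awk 'NR==1{print $3}')"
if [ "$(printf '%s\n' "$need" "$have" | sort -V | head -n1)" != "$need" ]; then
  echo "cmake $have is older than $need; installing pinned release"
fi
```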
--- Dockerfile | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 580819dd..a86f9e2f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,6 +18,7 @@ ARG BUILD_TYPE=Release ARG ENABLE_CUDA=ON ARG ENABLE_NATIVE_OPTIMIZATION=OFF ARG USE_PODOFO=ON +ARG CMAKE_VERSION=3.27.9 ENV TZ=${TZ} \ CC=gcc \ @@ -27,13 +28,27 @@ ENV TZ=${TZ} \ # Build dependencies (system CURL required by inference/CMakeLists on Linux) RUN apt-get update && apt-get install -y --no-install-recommends \ build-essential git pkg-config ca-certificates curl \ - cmake ninja-build ccache \ + cmake ninja-build ccache \ libcurl4-openssl-dev libssl-dev libbz2-dev \ libomp-dev libblas-dev liblapack-dev \ # PDF (PoDoFo) optional deps – safe to install even if disabled libfreetype6-dev libjpeg-dev libpng-dev libtiff-dev libxml2-dev libfontconfig1-dev \ && rm -rf /var/lib/apt/lists/* +# Upgrade to pinned CMake (>=3.23 required by PoDoFo) +RUN set -eux; \ + ver="$(${SHELL:-/bin/sh} -c 'cmake --version 2>/dev/null | awk "NR==1{print \$3}"' || true)"; \ + need="${CMAKE_VERSION}"; \ + if [ -z "$ver" ] || [ "$(printf '%s\n' "$need" "$ver" | sort -V | head -n1)" != "$need" ] || [ "$ver" != "$need" ]; then \ + cd /tmp; \ + curl -fsSL -o cmake.tar.gz https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz; \ + tar -xf cmake.tar.gz; \ + cp -r cmake-${CMAKE_VERSION}-linux-x86_64/bin/* /usr/local/bin/; \ + cp -r cmake-${CMAKE_VERSION}-linux-x86_64/share/cmake* /usr/local/share/ || true; \ + rm -rf cmake-* cmake.tar.gz; \ + fi; \ + cmake --version + # Speed up rebuilds ENV PATH=/usr/lib/ccache:${PATH} \ CCACHE_DIR=/root/.ccache \ From 11c0731a9a2845f7c112d6eb71efc69cc3d7db92 Mon Sep 17 00:00:00 2001 From: Alifais Farrel Ramdhani Date: Thu, 4 Sep 2025 12:07:40 -0700 Subject: [PATCH 06/15] Clone zlib and pugixml if not present in Docker build Adds steps to the Dockerfile to automatically clone zlib and pugixml repositories if their sources are not found in the build context. This ensures required external dependencies are available for the build process. --- Dockerfile | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index a86f9e2f..bfba0d63 100644 --- a/Dockerfile +++ b/Dockerfile @@ -72,6 +72,23 @@ RUN set -eux; \ echo "[Docker build] Found llama.cpp sources in repo"; \ fi +# Ensure other required externals are present when not vendored in context +RUN set -eux; \ + if [ ! -f external/zlib/contrib/minizip/ioapi.c ]; then \ + echo "[Docker build] zlib not found – cloning..."; \ + rm -rf external/zlib && mkdir -p external; \ + git clone --depth=1 https://github.com/madler/zlib.git external/zlib; \ + else \ + echo "[Docker build] Found zlib"; \ + fi; \ + if [ ! -f external/pugixml/src/pugixml.cpp ]; then \ + echo "[Docker build] pugixml not found – cloning..."; \ + rm -rf external/pugixml && mkdir -p external; \ + git clone --depth=1 https://github.com/zeux/pugixml.git external/pugixml; \ + else \ + echo "[Docker build] Found pugixml"; \ + fi + # Configure & build (CUDA by default) RUN set -eux; \ cmake -S . 
-B build -G Ninja \ @@ -79,7 +96,8 @@ RUN set -eux; \ -DCMAKE_C_COMPILER=${CC} -DCMAKE_CXX_COMPILER=${CXX} \ -DENABLE_NATIVE_OPTIMIZATION=${ENABLE_NATIVE_OPTIMIZATION} \ -DUSE_CUDA=${ENABLE_CUDA} \ - -DUSE_PODOFO=${USE_PODOFO}; \ + -DUSE_PODOFO=${USE_PODOFO} \ + -DCUDA_CUDA_LIBRARY=/usr/local/cuda/lib64/stubs/libcuda.so; \ cmake --build build --config ${BUILD_TYPE} # Determine single-config output dir (project sets output to build/) From 473972eba84164ccd3ce0bebbdb5287aab55d583 Mon Sep 17 00:00:00 2001 From: Alifais Farrel Ramdhani Date: Thu, 4 Sep 2025 12:48:26 -0700 Subject: [PATCH 07/15] Remove zlib and pugixml cloning from Dockerfile Eliminates the steps that clone zlib and pugixml if not present, assuming these dependencies are now handled elsewhere or are always available in the build context. --- Dockerfile | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/Dockerfile b/Dockerfile index bfba0d63..cfc22c2d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -72,23 +72,6 @@ RUN set -eux; \ echo "[Docker build] Found llama.cpp sources in repo"; \ fi -# Ensure other required externals are present when not vendored in context -RUN set -eux; \ - if [ ! -f external/zlib/contrib/minizip/ioapi.c ]; then \ - echo "[Docker build] zlib not found – cloning..."; \ - rm -rf external/zlib && mkdir -p external; \ - git clone --depth=1 https://github.com/madler/zlib.git external/zlib; \ - else \ - echo "[Docker build] Found zlib"; \ - fi; \ - if [ ! -f external/pugixml/src/pugixml.cpp ]; then \ - echo "[Docker build] pugixml not found – cloning..."; \ - rm -rf external/pugixml && mkdir -p external; \ - git clone --depth=1 https://github.com/zeux/pugixml.git external/pugixml; \ - else \ - echo "[Docker build] Found pugixml"; \ - fi - # Configure & build (CUDA by default) RUN set -eux; \ cmake -S . -B build -G Ninja \ @@ -97,7 +80,8 @@ RUN set -eux; \ -DENABLE_NATIVE_OPTIMIZATION=${ENABLE_NATIVE_OPTIMIZATION} \ -DUSE_CUDA=${ENABLE_CUDA} \ -DUSE_PODOFO=${USE_PODOFO} \ - -DCUDA_CUDA_LIBRARY=/usr/local/cuda/lib64/stubs/libcuda.so; \ + -DBUILD_TESTING=OFF \ + -DBUILD_INFERENCE_TESTS=OFF; \ cmake --build build --config ${BUILD_TYPE} # Determine single-config output dir (project sets output to build/) From 68fc324ae7350702fcbcbfeb36ae7ce0534dbd62 Mon Sep 17 00:00:00 2001 From: Alifais Farrel Ramdhani Date: Thu, 4 Sep 2025 12:52:01 -0700 Subject: [PATCH 08/15] Add CUDA library path to CMake build Sets the CUDA_CUDA_LIBRARY variable in the CMake configuration to explicitly specify the location of libcuda.so. This helps ensure correct linking when building with CUDA support. --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index cfc22c2d..0c99a9c2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -80,6 +80,7 @@ RUN set -eux; \ -DENABLE_NATIVE_OPTIMIZATION=${ENABLE_NATIVE_OPTIMIZATION} \ -DUSE_CUDA=${ENABLE_CUDA} \ -DUSE_PODOFO=${USE_PODOFO} \ + -DCUDA_CUDA_LIBRARY=/usr/local/cuda/lib64/stubs/libcuda.so \ -DBUILD_TESTING=OFF \ -DBUILD_INFERENCE_TESTS=OFF; \ cmake --build build --config ${BUILD_TYPE} From 0269d97cd6f5f8f4fb1a8212b0e8881794898df2 Mon Sep 17 00:00:00 2001 From: Alifais Farrel Ramdhani Date: Thu, 4 Sep 2025 15:43:53 -0700 Subject: [PATCH 09/15] Enable GGML_CUDA_NO_VMM in Docker build Adds the -DGGML_CUDA_NO_VMM=ON flag to the CMake configuration in the Dockerfile to disable CUDA VMM support during build. 
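For context, a plausible reading not spelled out in the commit: the build stage links against the CUDA driver stub (`/usr/local/cuda/lib64/stubs/libcuda.so`, set via `CUDA_CUDA_LIBRARY` in an earlier patch), and GGML's VMM allocator depends on driver virtual-memory entry points that are not guaranteed in every deployment, so it is switched off for portability. To inspect what the built binary actually links against:

```bash
# Run inside the build stage; libcuda.so itself is supplied at runtime
# by the NVIDIA Container Toolkit, so it may show as "not found" here.
ldd build/Release/kolosal-server | grep -Ei 'cuda|cublas' || echo "no CUDA linkage found"
```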
--- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index 0c99a9c2..863e54fd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -80,6 +80,7 @@ RUN set -eux; \ -DENABLE_NATIVE_OPTIMIZATION=${ENABLE_NATIVE_OPTIMIZATION} \ -DUSE_CUDA=${ENABLE_CUDA} \ -DUSE_PODOFO=${USE_PODOFO} \ + -DGGML_CUDA_NO_VMM=ON \ -DCUDA_CUDA_LIBRARY=/usr/local/cuda/lib64/stubs/libcuda.so \ -DBUILD_TESTING=OFF \ -DBUILD_INFERENCE_TESTS=OFF; \ From 4bf9f83d7526c165a02e45d1d6580d31e6b02216 Mon Sep 17 00:00:00 2001 From: Alifais Farrel Ramdhani Date: Sat, 6 Sep 2025 14:42:37 -0700 Subject: [PATCH 10/15] Add BLAS, LAPACK, and gfortran to Docker runtime deps Added libblas3, liblapack3, and libgfortran5 to the minimal runtime dependencies in the Dockerfile to support applications requiring these scientific libraries. --- Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 863e54fd..f17ba3c2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -132,7 +132,8 @@ ENV LD_LIBRARY_PATH=/usr/local/lib:/app/libs:/usr/local/cuda/lib64 \ # Minimal runtime deps (keep in sync with ldd if needed) RUN apt-get update && apt-get install -y --no-install-recommends \ - libcurl4 libssl3 libbz2-1.0 zlib1g libgomp1 ca-certificates curl tini \ + libcurl4 libssl3 libbz2-1.0 zlib1g libgomp1 ca-certificates curl tini \ + libblas3 liblapack3 libgfortran5 \ # PoDoFo runtime libs (safe if unused) libfreetype6 libjpeg-turbo8 libpng16-16 libtiff5 libxml2 fontconfig \ && rm -rf /var/lib/apt/lists/* From 63ff476676e71193835c1ad88862c9dfb9535038 Mon Sep 17 00:00:00 2001 From: Alifais Farrel Ramdhani Date: Sat, 6 Sep 2025 15:02:43 -0700 Subject: [PATCH 11/15] Enable public access in server config Changed 'allow_public_access' to true in config_rms.yaml to permit access from other devices on the same network. --- configs/config_rms.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/config_rms.yaml b/configs/config_rms.yaml index 4395a0e3..836eb9ed 100644 --- a/configs/config_rms.yaml +++ b/configs/config_rms.yaml @@ -3,7 +3,7 @@ server: port: "8080" # Port number to run the server on (Docker exposes 8080) host: "0.0.0.0" # Host to bind the server; 0.0.0.0 means all available interfaces idle_timeout: 300 # Idle timeout in seconds - allow_public_access: false # Allow access from other devices on the same network + allow_public_access: true # Allow access from other devices on the same network allow_internet_access: false # Allow internet access (requires proper port forwarding) # Logging Configuration From e711342f4b7dc0b4d62c7035076d9c3eced66652 Mon Sep 17 00:00:00 2001 From: Alifais Farrel Ramdhani Date: Sat, 6 Sep 2025 15:08:41 -0700 Subject: [PATCH 12/15] Add basic config and update Dockerfile defaults Introduces configs/config_basic.yaml for minimal embedding-only deployments. Updates Dockerfile to prefer config_basic.yaml as the default configuration, with fallback logic for other config files and adjusts entrypoint to use config_basic.yaml by default. 
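To see which file the entrypoint's fallback chain would pick without starting the server, the same checks can be replayed by hand (image tag illustrative):

```bash
# Override the entrypoint and walk the fallback order used by kolosal-entry.sh
docker run --rm --entrypoint /bin/sh kolosal-server:cuda -c '
  for c in /app/config/config_basic.yaml /app/config/config_rms.yaml /app/config/config.yaml; do
    [ -f "$c" ] && { echo "entrypoint would use: $c"; exit 0; }
  done
  echo "no config baked into the image; mount one at runtime"'
```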
--- Dockerfile | 13 +++-- configs/config_basic.yaml | 102 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+), 3 deletions(-) create mode 100644 configs/config_basic.yaml diff --git a/Dockerfile b/Dockerfile index f17ba3c2..057dcf1e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -97,10 +97,16 @@ RUN set -eux; \ strip -s "${OUTDIR}/kolosal-server" || true; \ mkdir -p /out/bin /out/config /out/libs /out/licenses; \ cp "${OUTDIR}/kolosal-server" /out/bin/; \ - # Prefer configs/config_rms.yaml by default - if [ -f configs/config_rms.yaml ]; then \ + # Prefer basic (vanilla) config by default, then fall back + if [ -f configs/config_basic.yaml ]; then \ + cp configs/config_basic.yaml /out/config/config_basic.yaml; \ + cp configs/config_basic.yaml /out/config/config.yaml; \ + elif [ -f configs/config_rms.yaml ]; then \ cp configs/config_rms.yaml /out/config/config_rms.yaml; \ cp configs/config_rms.yaml /out/config/config.yaml; \ + elif [ -f config_basic.yaml ]; then \ + cp config_basic.yaml /out/config/config_basic.yaml; \ + cp config_basic.yaml /out/config/config.yaml; \ elif [ -f config_rms.yaml ]; then \ cp config_rms.yaml /out/config/config_rms.yaml; \ cp config_rms.yaml /out/config/config.yaml; \ @@ -155,7 +161,8 @@ RUN set -eux; \ # Simple entrypoint wrapper – use config_rms.yaml by default when present RUN printf '%s\n' '#!/bin/sh' \ 'set -e' \ - 'CONFIG="/app/config/config_rms.yaml"' \ + 'CONFIG="/app/config/config_basic.yaml"' \ + 'if [ ! -f "$CONFIG" ]; then CONFIG="/app/config/config_rms.yaml"; fi' \ 'if [ ! -f "$CONFIG" ]; then CONFIG="/app/config/config.yaml"; fi' \ 'echo "Starting kolosal-server with: $CONFIG"' \ 'exec kolosal-server --config "$CONFIG"' \ diff --git a/configs/config_basic.yaml b/configs/config_basic.yaml new file mode 100644 index 00000000..1956607e --- /dev/null +++ b/configs/config_basic.yaml @@ -0,0 +1,102 @@ +# Minimal config for deployments that only need embeddings into Qdrant. +# Assumptions: +# - Exposes HTTP on 8080 and binds to all interfaces. +# - Only an embedding model is loaded (no LLMs). +# - Qdrant is reachable at the configured host/port (use "localhost" on a single VM, +# or the service DNS name like "qdrant" when running in Kubernetes/Helm). 
+ +server: + port: "8080" + host: "0.0.0.0" + idle_timeout: 300 + allow_public_access: true + allow_internet_access: false + +logging: + level: INFO + file: "" + access_log: false + quiet_mode: false + show_request_details: false + +auth: + enabled: true + require_api_key: false + api_key_header: X-API-Key + api_keys: + - change_me_if_enabled + rate_limit: + enabled: true + max_requests: 100 + window_size: 60 + cors: + enabled: true + allow_credentials: false + max_age: 86400 + allowed_origins: ["*"] + allowed_methods: [GET, POST, PUT, DELETE, OPTIONS, HEAD, PATCH] + allowed_headers: [Content-Type, Authorization, X-Requested-With, Accept, Origin] + +# Disable internet search for this minimal embedding-only setup +search: + enabled: false + +database: + qdrant: + enabled: true + host: "localhost" # On K8s/Helm, set to the Qdrant service name, e.g., "qdrant" + port: 6333 + collection_name: "documents" + default_embedding_model: "qwen3-embedding-4b" + timeout: 30 + api_key: "" + max_connections: 10 + connection_timeout: 5 + +inference_engines: + - name: llama-cuda + library_path: /usr/local/lib/libllama-cuda.so + version: 1.0.0 + description: CUDA-accelerated inference engine for embeddings + load_on_startup: true + - name: llama-cpu + library_path: /usr/local/lib/libllama-cpu.so + version: 1.0.0 + description: CPU fallback (optional) + load_on_startup: false + +default_inference_engine: llama-cuda + +features: + health_check: true + metrics: true + +embedding_autoscaling: + enabled: true + min_instances: 1 + max_instances: 4 + scale_up_threshold: 10 + scale_down_threshold: 2 + scale_up_delay: 30 + scale_down_delay: 300 + check_interval: 15 + +# Only the embedding model is defined here; no LLMs. +models: + - id: qwen3-embedding-4b + path: https://huggingface.co/kolosal/qwen3-embedding-4b/resolve/main/Qwen3-Embedding-4B-Q4_K_M.gguf + type: embedding + load_immediately: true + main_gpu_id: 0 + inference_engine: llama-cuda + load_params: + n_ctx: 4096 + n_keep: 1024 + use_mmap: true + use_mlock: false + n_parallel: 1 + cont_batching: true + warmup: false + n_gpu_layers: 100 # Set to -1 to offload all layers if VRAM allows + n_batch: 2048 + n_ubatch: 512 From ea16d7906f2bb167222cc441c62d14a33ee25232 Mon Sep 17 00:00:00 2001 From: Alifais Farrel Ramdhani Date: Sun, 7 Sep 2025 00:54:04 -0700 Subject: [PATCH 13/15] Add Docker usage instructions to README Expanded the README with detailed steps for using prebuilt Docker images from GitHub Container Registry, including prerequisites, image pulling, running with GPU support, mounting directories, configuration options, updating/rollback, and troubleshooting. This helps users deploy the server more easily using Docker. --- README.md | 132 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) diff --git a/README.md b/README.md index 66e9b864..a3a056e3 100644 --- a/README.md +++ b/README.md @@ -58,6 +58,138 @@ Health check: curl http://localhost:8080/v1/health ``` +## Docker (prebuilt images from GHCR) + +Use a ready-made image from GitHub Container Registry so you don’t rebuild on every VM. 
+### Prerequisites
+
+- NVIDIA GPU + drivers on the host
+- NVIDIA Container Toolkit installed (for `--gpus all`)
+- Open port 8080/tcp on your VM firewall/security group if accessing remotely
+
+### Pull the image
+
+Windows PowerShell:
+
+```powershell
+docker pull ghcr.io/kolosalai/kolosal-server:v0.0.1
+```
+
+Linux/macOS:
+
+```bash
+docker pull ghcr.io/kolosalai/kolosal-server:v0.0.1
+```
+
+If your package is private, log in first with a Personal Access Token (scope: `read:packages`):
+
+```powershell
+$env:GHCR_TOKEN = "<your-token>"
+echo $env:GHCR_TOKEN | docker login ghcr.io -u <your-username> --password-stdin
+```
+
+### Run (GPU)
+
+Minimal run (exposes 8080 and runs with GPU):
+
+Windows PowerShell:
+
+```powershell
+docker run -d --name kolosal-server --gpus all --restart unless-stopped -p 8080:8080 ghcr.io/kolosalai/kolosal-server:v0.0.1
+```
+
+Linux/macOS:
+
+```bash
+docker run -d --name kolosal-server --gpus all --restart unless-stopped -p 8080:8080 ghcr.io/kolosalai/kolosal-server:v0.0.1
+```
+
+Mount a models directory (recommended):
+
+```powershell
+docker run -d --name kolosal-server --gpus all --restart unless-stopped `
+  -p 8080:8080 `
+  -v C:\kolosal\models:/app/models `
+  ghcr.io/kolosalai/kolosal-server:v0.0.1
+```
+
+Linux/macOS:
+
+```bash
+docker run -d --name kolosal-server --gpus all --restart unless-stopped \
+  -p 8080:8080 \
+  -v $PWD/models:/app/models \
+  ghcr.io/kolosalai/kolosal-server:v0.0.1
+```
+
+### Choose a config
+
+The image resolves its config in this order inside the container:
+
+- `/app/config/config_basic.yaml` (embedding-only “vanilla”)
+- `/app/config/config_rms.yaml`
+- `/app/config/config.yaml`
+
+To use your own config, bind-mount it to `/app/config/config.yaml`:
+
+Windows PowerShell:
+
+```powershell
+docker run -d --name kolosal-server --gpus all --restart unless-stopped `
+  -p 8080:8080 `
+  -v ${PWD}\configs\config_basic.yaml:/app/config/config.yaml:ro `
+  -v ${PWD}\models:/app/models `
+  ghcr.io/kolosalai/kolosal-server:v0.0.1
+```
+
+Linux/macOS:
+
+```bash
+docker run -d --name kolosal-server --gpus all --restart unless-stopped \
+  -p 8080:8080 \
+  -v $PWD/configs/config_basic.yaml:/app/config/config.yaml:ro \
+  -v $PWD/models:/app/models \
+  ghcr.io/kolosalai/kolosal-server:v0.0.1
+```
+
+Note: Ensure `server.allow_public_access: true` in your config if you’ll call the API from outside the container/host.
+
+### Verify and logs
+
+```powershell
+curl http://localhost:8080/v1/health
+docker logs -f kolosal-server
+```
+
+If a model shows as “unloaded”, that’s okay when `load_immediately` is false; it will load on first use.
+
+### Persist data
+
+- Models: mount a host folder to `/app/models`
+- App data/cache: mount a host folder to `/app/data` if desired
+
+### Update or rollback
+
+```powershell
+# Update to a new tag
+docker pull ghcr.io/kolosalai/kolosal-server:v0.0.2
+docker rm -f kolosal-server
+docker run -d --name kolosal-server --gpus all --restart unless-stopped -p 8080:8080 ghcr.io/kolosalai/kolosal-server:v0.0.2
+
+# Rollback to the previous tag
+docker rm -f kolosal-server
+docker run -d --name kolosal-server --gpus all --restart unless-stopped -p 8080:8080 ghcr.io/kolosalai/kolosal-server:v0.0.1
+```
+
+### Troubleshooting
+
+- Connection reset: confirm `-p 8080:8080` and `server.allow_public_access: true` in the config.
+- GPU not used: ensure NVIDIA drivers + the NVIDIA Container Toolkit are installed; run with `--gpus all`; check `nvidia-smi` on the host.
+- Missing models: mount a models directory to `/app/models` or use paths/URLs in your config.
+- 404s: check the path (`/v1/health`, `/v1/models`, etc.).
+- Authentication: set `auth.enabled` and `api_keys` in your config and send the `X-API-Key` header.
+
 ### Linux (Recommended)
 
 #### Prerequisites

From 8c2781459dd14faca4e2ba22f79cd7e23d66566b Mon Sep 17 00:00:00 2001
From: Alifais Farrel Ramdhani
Date: Mon, 8 Sep 2025 12:43:33 -0700
Subject: [PATCH 14/15] Disable rate limiting in basic config

Set 'rate_limit.enabled' to false in config_basic.yaml to turn off API
rate limiting, allowing unrestricted access during testing and development.
---
 configs/config_basic.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configs/config_basic.yaml b/configs/config_basic.yaml
index 1956607e..e34fcfe8 100644
--- a/configs/config_basic.yaml
+++ b/configs/config_basic.yaml
@@ -26,7 +26,7 @@ auth:
   api_keys:
     - change_me_if_enabled
   rate_limit:
-    enabled: true
+    enabled: false
     max_requests: 100
     window_size: 60
   cors:

From 40d67094fca326c18693243fa404420506950127 Mon Sep 17 00:00:00 2001
From: Alifais Farrel Ramdhani
Date: Mon, 8 Sep 2025 12:44:44 -0700
Subject: [PATCH 15/15] Revert "Disable rate limiting in basic config"

This reverts commit 8c2781459dd14faca4e2ba22f79cd7e23d66566b.
---
 configs/config_basic.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configs/config_basic.yaml b/configs/config_basic.yaml
index e34fcfe8..1956607e 100644
--- a/configs/config_basic.yaml
+++ b/configs/config_basic.yaml
@@ -26,7 +26,7 @@ auth:
   api_keys:
     - change_me_if_enabled
   rate_limit:
-    enabled: false
+    enabled: true
     max_requests: 100
     window_size: 60
   cors:
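Taken together, the 15-patch series leaves the repository in a state where an end-to-end smoke test looks roughly like this (tag, container name, and wait time are illustrative):

```bash
# Build the CUDA image, run it, probe the health endpoint, clean up.
docker build -t kolosal-server:cuda .
docker run -d --name kolosal-smoke --gpus all -p 8080:8080 kolosal-server:cuda
sleep 20   # give the server time to start (HEALTHCHECK start period is 15s)
curl -fsS http://localhost:8080/v1/health && echo "server healthy"
docker rm -f kolosal-smoke
```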