diff --git a/docker-compose.yml b/docker-compose.yml
index 02c2fdab7..231ce87ba 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -44,6 +44,7 @@ services:
     restart: "no"
     environment:
       - TORCH_DEVICE=cuda
+      - WARMUP_ENABLED=true
     labels:
       ca.mcgill.a11y.image.cacheTimeout: 3600
     deploy:
@@ -59,6 +60,7 @@ services:
     restart: "no"
     environment:
       - TORCH_DEVICE=cuda
+      - WARMUP_ENABLED=true
     labels:
       ca.mcgill.a11y.image.cacheTimeout: 3600
     deploy:
@@ -80,6 +82,8 @@ services:
           devices:
             - driver: nvidia
               capabilities: ["gpu", "compute", "utility"]
+    environment:
+      - WARMUP_ENABLED=true
 
   autour-preprocessor:
     profiles: [production, test, default]
@@ -110,6 +114,7 @@ services:
       ./config/ollama.env
     environment:
       - PII_LOGGING_ENABLED=${PII_LOGGING_ENABLED}
+      - WARMUP_ENABLED=true
 
   graphic-caption:
     profiles: [production, test, default]
@@ -125,6 +130,7 @@ services:
       ./config/ollama.env
     environment:
       - PII_LOGGING_ENABLED=${PII_LOGGING_ENABLED}
+      - WARMUP_ENABLED=true
 
   text-followup:
     profiles: [production, test, default]
@@ -134,6 +140,7 @@ services:
       - MAX_HISTORY_LENGTH=100
       - HISTORY_EXPIRY=3600
       - PII_LOGGING_ENABLED=${PII_LOGGING_ENABLED}
+      - WARMUP_ENABLED=true
     labels:
       ca.mcgill.a11y.image.preprocessor: 1
       ca.mcgill.a11y.image.port: 5000
@@ -179,6 +186,7 @@ services:
       - PII_LOGGING_ENABLED=${PII_LOGGING_ENABLED}
       - YOLO_MODEL_PATH=/usr/src/app/models/yolo11x.pt
       - CONF_THRESHOLD=0.75
+      - WARMUP_ENABLED=true
 
   multistage-diagram-segmentation:
     profiles: [production, test, default]
@@ -201,6 +209,7 @@ services:
       - SAM_MODEL_PATH=/usr/src/app/models/sam2.1_l.pt
       - GEMINI_MODEL=gemini-2.5-pro-preview-06-05
       - BASE_SCHEMA=/usr/src/app/base_schema.json
+      - WARMUP_ENABLED=true
     env_file:
       ./config/gemini.env   
 
@@ -263,6 +272,7 @@ services:
       ca.mcgill.a11y.image.optional_dependencies: "content-categoriser,graphic-tagger"
     environment:
       - PII_LOGGING_ENABLED=${PII_LOGGING_ENABLED}
+      - WARMUP_ENABLED=true
   
   supercollider:
     profiles: [production, test, default]
@@ -376,6 +386,7 @@ services:
         ca.mcgill.a11y.image.optional_dependencies: ""
     environment:
       - PII_LOGGING_ENABLED=${PII_LOGGING_ENABLED}
+      - WARMUP_ENABLED=true
 
   svg-depth-map:
     profiles: [production, test, default]
diff --git a/preprocessors/content-categoriser/categoriser.py b/preprocessors/content-categoriser/categoriser.py
index b28237f7c..6ec1b9c7c 100644
--- a/preprocessors/content-categoriser/categoriser.py
+++ b/preprocessors/content-categoriser/categoriser.py
@@ -198,5 +198,44 @@ def health():
     }), 200
 
 
+@app.route("/warmup", methods=["GET"])
+def warmup():
+    """
+    Send a dummy /generate request so Ollama loads the model into memory,
+    cutting first-request latency. Responds 200 on success, 500 on error.
+    """
+    try:
+        # construct the target Ollama endpoint for generate
+        api_url = f"{os.environ['OLLAMA_URL']}/generate"
+
+        # authorization headers with API key
+        headers = {
+            "Authorization": f"Bearer {os.environ['OLLAMA_API_KEY']}",
+            "Content-Type": "application/json"
+        }
+
+        # prepare the warmup request data using the configured model
+        data = {
+            "model": os.environ["OLLAMA_MODEL"],
+            "prompt": "ping",
+            "stream": False,
+            "keep_alive": -1  # instruct Ollama to keep the model in memory
+        }
+
+        logging.info("[WARMUP] Warmup endpoint triggered.")
+        logging.pii(f"[WARMUP] Posting to {api_url} with model "
+                    f"{data['model']}")
+
+        # send warmup request (with timeout)
+        r = requests.post(api_url, headers=headers, json=data, timeout=60)
+        r.raise_for_status()
+
+        return jsonify({"status": "warmed"}), 200
+
+    except Exception as e:
+        logging.exception(f"[WARMUP] Exception details: {str(e)}")
+        return jsonify({"status": "error", "message": str(e)}), 500
+
+
 if __name__ == "__main__":
     app.run(host='0.0.0.0', port=5000, debug=True)
diff --git a/preprocessors/depth-map-gen/depth-map-generator.py b/preprocessors/depth-map-gen/depth-map-generator.py
index eff12c28d..5424523d1 100644
--- a/preprocessors/depth-map-gen/depth-map-generator.py
+++ b/preprocessors/depth-map-gen/depth-map-generator.py
@@ -36,7 +36,6 @@
 configure_logging()
 
 app = Flask(__name__)
-logging.basicConfig(level=logging.DEBUG)
 
 
 def parse_args():
@@ -223,6 +222,30 @@ def health():
     }), 200
 
 
+@app.route("/warmup", methods=["GET"])
+def warmup():
+    """Load RelDepthModel on the GPU and run one dummy inference."""
+    try:
+        logging.pii("[WARMUP] Initializing RelDepthModel with resnext101 "
+                    "and loading weights from /app/res101.pth")
+        model = RelDepthModel(backbone='resnext101').eval().cuda()
+        model.load_state_dict(
+            strip_prefix_if_present(
+                torch.load("/app/res101.pth")['depth_model'], "module."),
+            strict=True
+        )
+        # one synthetic RGB image: batch of 1, 3 channels, 448x448 pixels
+        dummy = torch.ones((1, 3, 448, 448), dtype=torch.float32).cuda()
+        model.inference(dummy)
+        del model, dummy  # throwaway objects; lets empty_cache() free them
+        torch.cuda.empty_cache()
+        return jsonify({"status": "warmed"}), 200
+    except Exception as e:
+        logging.error("Warmup failed")
+        logging.pii(f"Warmup error: {e}")
+        return jsonify({"status": "warmup failed"}), 500
+
+
 if __name__ == '__main__':
     app.run(host='0.0.0.0', port=5000, debug=True)
     depthgenerator()
diff --git a/preprocessors/graphic-caption/caption.py b/preprocessors/graphic-caption/caption.py
index 835b62445..64979dbe2 100644
--- a/preprocessors/graphic-caption/caption.py
+++ b/preprocessors/graphic-caption/caption.py
@@ -28,7 +28,6 @@
 configure_logging()
 
 app = Flask(__name__)
-logging.basicConfig(level=logging.DEBUG)
 
 PROMPT = """Describe this image to a person who cannot see it.
     Use simple, descriptive, clear, and concise language.
@@ -173,5 +172,44 @@ def health():
     }), 200
 
 
+@app.route("/warmup", methods=["GET"])
+def warmup():
+    """
+    Send a dummy /generate request so Ollama loads the model into memory,
+    cutting first-request latency. Responds 200 on success, 500 on error.
+    """
+    try:
+        # construct the target Ollama endpoint for generate
+        api_url = f"{os.environ['OLLAMA_URL']}/generate"
+
+        # authorization headers with API key
+        headers = {
+            "Authorization": f"Bearer {os.environ['OLLAMA_API_KEY']}",
+            "Content-Type": "application/json"
+        }
+
+        # prepare the warmup request data using the configured model
+        data = {
+            "model": os.environ["OLLAMA_MODEL"],
+            "prompt": "ping",
+            "stream": False,
+            "keep_alive": -1  # instruct Ollama to keep the model in memory
+        }
+
+        logging.info("[WARMUP] Warmup endpoint triggered.")
+        logging.pii(f"[WARMUP] Posting to {api_url} with model "
+                    f"{data['model']}")
+
+        # send warmup request (with timeout)
+        r = requests.post(api_url, headers=headers, json=data, timeout=60)
+        r.raise_for_status()
+
+        return jsonify({"status": "warmed"}), 200
+
+    except Exception as e:
+        logging.exception(f"[WARMUP] Exception details: {str(e)}")
+        return jsonify({"status": "error", "message": str(e)}), 500
+
+
 if __name__ == "__main__":
     app.run(host='0.0.0.0', port=5000, debug=True)
diff --git a/preprocessors/mmsemseg/Dockerfile b/preprocessors/mmsemseg/Dockerfile
index 99d0a4c1d..ab4f536d3 100644
--- a/preprocessors/mmsemseg/Dockerfile
+++ b/preprocessors/mmsemseg/Dockerfile
@@ -52,7 +52,7 @@ EXPOSE 5000
 ENV FLASK_APP=segment.py
 USER python
 
-HEALTHCHECK --interval=60s --timeout=10s --start-period=120s --retries=5 CMD curl -f http://localhost:5000/health || exit 1
-HEALTHCHECK --interval=3600s --timeout=30s --start-period=120s --retries=3 CMD curl -f http://localhost:5000/health/gpu || exit 1
+HEALTHCHECK --interval=60s --timeout=10s --start-period=120s --retries=5 \
+  CMD curl -f http://localhost:5000/health && curl -f http://localhost:5000/health/gpu || exit 1
 
 CMD [ "gunicorn", "segment:app", "-b", "0.0.0.0:5000", "--capture-output", "--log-level=debug" ]
\ No newline at end of file
diff --git a/preprocessors/mmsemseg/segment.py b/preprocessors/mmsemseg/segment.py
index 1e5c86236..615a99458 100644
--- a/preprocessors/mmsemseg/segment.py
+++ b/preprocessors/mmsemseg/segment.py
@@ -284,10 +284,13 @@ def gpu_driver_health_check():
     try:
         # Get installed NVIDIA driver version from nvidia-smi
         nvidia_smi_version = subprocess.check_output(
-            ["nvidia-smi", "--query-gpu=driver_version",
-             "--format=csv,noheader"],
+            [
+                "nvidia-smi",
+                "--query-gpu=driver_version",
+                "--format=csv,noheader"
+            ],
             text=True
-        ).strip()
+        ).strip().split("\n")[0]
 
         # Get loaded driver version from /proc/driver/nvidia/version
         loaded_driver_version = subprocess.check_output(
@@ -320,5 +323,34 @@ def gpu_driver_health_check():
         }), 500
 
 
+@app.route("/warmup", methods=["GET"])
+def warmup():
+    """
+    Warms up the segmentation model by running a dummy inference.
+    """
+    try:
+        # dummy black image (512×512)
+        dummy_img = np.zeros((512, 512, 3), dtype=np.uint8)
+
+        # runs inference_segmentor(): model weight loading/memory allocation
+        model = init_segmentor(BEIT_CONFIG, BEIT_CHECKPOINT, device='cuda:0')
+        _ = inference_segmentor(model, dummy_img)
+        del model  # still-referenced weights cannot be freed by empty_cache()
+        torch.cuda.empty_cache()
+
+        return jsonify({
+            "status": "warmup successful",
+            "timestamp": datetime.now().isoformat()
+        }), 200
+
+    except Exception as e:
+        logging.pii(f"[WARMUP] Warmup failed: {e}")
+        logging.exception("Warmup failed")
+        return jsonify({
+            "status": "warmup failed",
+            "message": str(e)
+        }), 500
+
+
 if __name__ == "__main__":
     app.run(host='0.0.0.0', port=5000, debug=True)
diff --git a/preprocessors/multistage-diagram-segmentation/multistage-diagram-segmentation.py b/preprocessors/multistage-diagram-segmentation/multistage-diagram-segmentation.py
index 8b1870363..9e94a6c4e 100644
--- a/preprocessors/multistage-diagram-segmentation/multistage-diagram-segmentation.py
+++ b/preprocessors/multistage-diagram-segmentation/multistage-diagram-segmentation.py
@@ -819,5 +819,35 @@ def health():
     }), 200
 
 
+@app.route("/warmup", methods=["GET"])
+def warmup():
+    """Send one dummy request through Gemini and SAM to warm them up."""
+    try:
+        logging.info("Warming up Gemini and SAM...")
+        # Gemini: dummy image + prompt
+        dummy_img = Image.new("RGB", (512, 512), color="white")
+        response = client.models.generate_content(
+            model=GEMINI_MODEL,
+            contents=["{}", dummy_img],
+            config=types.GenerateContentConfig(
+                temperature=0.1,
+                safety_settings=safety_settings,
+                response_mime_type='application/json',
+                response_schema=BASE_SCHEMA_GEMINI,
+            )
+        )
+        _ = validate_gemini_response(response)
+
+        # SAM: dummy box
+        dummy_cv2 = np.zeros((512, 512, 3), dtype=np.uint8)
+        dummy_pil = Image.fromarray(dummy_cv2)
+        _ = sam_model(dummy_pil, bboxes=[[100, 100, 200, 200]])
+        return jsonify({"status": "ok"}), 200
+    except Exception as e:
+        logging.error("Warmup failed")  # pii() may be disabled; keep a trace
+        logging.pii(f"Warmup failed: {str(e)}")
+        return jsonify({"status": "error", "message": str(e)}), 500
+
+
 if __name__ == "__main__":
     app.run(host='0.0.0.0', port=5000, debug=True)
diff --git a/preprocessors/text-followup/text-followup.py b/preprocessors/text-followup/text-followup.py
index 2e674557e..1aefd135e 100644
--- a/preprocessors/text-followup/text-followup.py
+++ b/preprocessors/text-followup/text-followup.py
@@ -401,5 +401,44 @@ def health():
     }), 200
 
 
+@app.route("/warmup", methods=["GET"])
+def warmup():
+    """
+    Pre-load the configured Ollama model by posting a throwaway chat turn.
+
+    Responds with {"status": "warmed"} and HTTP 200 when the call succeeds,
+    otherwise with {"status": "error", "message": ...} and HTTP 500.
+    """
+    try:
+        # chat endpoint under the configured Ollama base URL
+        chat_url = os.environ["OLLAMA_URL"] + "/chat"
+        # bearer-token header built from OLLAMA_API_KEY
+        auth = {"Authorization": "Bearer " + os.environ["OLLAMA_API_KEY"]}
+
+        # smallest useful body: one user message, no streaming
+        model_name = os.environ["OLLAMA_MODEL"]
+        payload = {
+            "model": model_name,
+            "messages": [{"role": "user", "content": "warmup"}],
+            "stream": False
+        }
+
+        logging.info("[WARMUP] Warmup endpoint triggered.")
+        logging.pii(f"[WARMUP] Posting to {chat_url} with model {model_name}")
+
+        # timeout=60 bounds how long requests waits on Ollama
+        reply = requests.post(
+            chat_url, headers=auth, json=payload, timeout=60
+        )
+        reply.raise_for_status()
+        return jsonify({"status": "warmed"}), 200
+
+    except Exception as err:
+        failure = str(err)
+        logging.pii(f"[WARMUP] Warmup failed: {failure}")
+        logging.exception("[WARMUP] Exception details:")
+        return jsonify({"status": "error", "message": failure}), 500
+
+
 if __name__ == "__main__":
     app.run(host='0.0.0.0', port=5000, debug=True)
diff --git a/preprocessors/yolo/detect.py b/preprocessors/yolo/detect.py
index 0c83b8470..83cfea757 100644
--- a/preprocessors/yolo/detect.py
+++ b/preprocessors/yolo/detect.py
@@ -239,5 +239,29 @@ def health():
     }), 200
 
 
+@app.route("/warmup", methods=["GET"])
+def warmup():
+    """Run a single YOLO prediction on a blank 8x8 image as a warmup."""
+    try:
+        # all-black 8x8 RGB placeholder
+        placeholder = Image.new("RGB", (8, 8), color=(0, 0, 0))
+        predict_options = dict(
+            device=device,
+            conf=CONF_THRESHOLD,
+            # imgsz=MAX_IMAGE_SIZE,
+            verbose=False
+        )
+        # inference only, so gradient tracking is switched off
+        with torch.no_grad():
+            model.predict(placeholder, **predict_options)
+
+        logging.info("YOLO warmup completed successfully with 8x8 image.")
+        return jsonify({"status": "ok"}), 200
+    except Exception as err:
+        logging.error(f"YOLO warmup failed: {str(err)}")
+        logging.pii(traceback.format_exc())
+        return jsonify({"status": "error", "error": str(err)}), 500
+
+
 if __name__ == "__main__":
     app.run(debug=True)
diff --git a/scripts/imageup b/scripts/imageup
index b2bf80156..dd5927ce0 100755
--- a/scripts/imageup
+++ b/scripts/imageup
@@ -104,4 +104,7 @@ docker network rm -f image || true
 # Bring all the containers back up
 docker compose --env-file "$COMPOSE_ENV_FILE" up -d --force-recreate
 
+echo "Running warmup for critical services..."
+"${SCRIPT_DIR}/warmup"
+
 cd -
\ No newline at end of file
diff --git a/scripts/warmup b/scripts/warmup
new file mode 100755
index 000000000..909ab4f20
--- /dev/null
+++ b/scripts/warmup
@@ -0,0 +1,62 @@
+#!/bin/bash
+# Call /warmup on every healthy container on the 'image' network that sets WARMUP_ENABLED=true.
+
+WARMUP_LOG_DIR="/var/docker/image/testing/warmup"
+timestamp=$(date +"%Y%m%d_%H%M%S")
+logfile="${WARMUP_LOG_DIR}/warmup_${timestamp}.log"
+mkdir -p "$WARMUP_LOG_DIR"
+# Max seconds to wait for one container to become healthy (override via WARMUP_HEALTH_TIMEOUT)
+HEALTH_TIMEOUT="${WARMUP_HEALTH_TIMEOUT:-600}"
+
+echo "[Warmup] $(date) Starting warmup..." | tee -a "$logfile"
+
+# Restrict to containers on the 'image' Docker network
+if docker network inspect image &> /dev/null; then
+  containers=$(docker network inspect image | jq -r '.[0].Containers | to_entries[] | .value.Name')
+else
+  echo "[Warmup] No Docker network named 'image' found. Aborting." | tee -a "$logfile"
+  exit 1
+fi
+
+for container in $containers; do
+  # Check if WARMUP_ENABLED=true is present in the environment
+  if ! docker inspect -f '{{range .Config.Env}}{{println .}}{{end}}' "$container" | grep -q "^WARMUP_ENABLED=true$"; then
+    continue
+  fi
+
+  # Get EXPOSED port (assume first one is the correct one)
+  exposed_port=$(docker inspect -f '{{range $p, $_ := .Config.ExposedPorts}}{{println $p}}{{end}}' "$container" | head -n1 | cut -d'/' -f1)
+  if [ -z "$exposed_port" ]; then
+    echo "[Warmup] $container has no EXPOSEd port. Skipping." | tee -a "$logfile"
+    continue
+  fi
+  endpoint="http://localhost:${exposed_port}/warmup"
+
+  echo "[Warmup] Waiting for $container to be healthy..." | tee -a "$logfile"
+  # Bounded wait: an unhealthy or exited container, or one without a healthcheck, never reports "healthy" (inspect errors are silenced).
+  waited=0
+  until [[ "$(docker inspect -f '{{.State.Health.Status}}' "$container" 2> /dev/null)" == "healthy" ]]; do
+    if (( waited >= HEALTH_TIMEOUT )); then
+      echo "[Warmup] $container not healthy after ${HEALTH_TIMEOUT}s. Skipping." | tee -a "$logfile"
+      continue 2
+    fi
+    sleep 2
+    waited=$((waited + 2))
+  done
+
+  echo "[Warmup] $container marked healthy. Waiting 10s before hitting warmup..." | tee -a "$logfile"
+  # The healthcheck can pass before the model is ready, so pause first; jitter staggers warmups. A failed warmup is not fatal: the model is often at least partly loaded.
+  jitter=$((RANDOM % 5))
+  sleep $((10 + jitter))
+
+  echo "[Warmup] Hitting warmup endpoint at $endpoint..." | tee -a "$logfile"
+  resp=$(docker exec "$container" curl -s -w "%{http_code}" -o /tmp/warmup_resp.txt "$endpoint")
+  if [[ "$resp" == "200" ]]; then
+    echo "[Warmup] $container warmed successfully." | tee -a "$logfile"
+  else
+    echo "[Warmup] $container warmup failed with HTTP $resp. Response was:" | tee -a "$logfile"
+    { docker exec "$container" cat /tmp/warmup_resp.txt || echo "[Warmup] (no response body)"; } | tee -a "$logfile"
+  fi
+done
+
+echo "[Warmup] Completed at $(date)! " | tee -a "$logfile"
\ No newline at end of file
diff --git a/services/espnet-tts-fr/src/app.py b/services/espnet-tts-fr/src/app.py
index e54ca902c..b5e4a60e4 100644
--- a/services/espnet-tts-fr/src/app.py
+++ b/services/espnet-tts-fr/src/app.py
@@ -172,6 +172,22 @@ def segment_tts():
         empty_cache()
 
 
+@app.route("/warmup", methods=["GET"])
+def warmup():
+    """
+    Run a short dummy synthesis to reduce first-request latency.
+    """
+    try:
+        logger.info("[WARMUP] Warmup endpoint triggered.")
+        _ = tts("warmup")
+        return jsonify({"status": "warmed"}), 200
+    except Exception as e:
+        logger.exception("[WARMUP] Warmup failed.")
+        return jsonify({"status": "error", "message": str(e)}), 500
+    finally:
+        empty_cache()  # mirrors the empty_cache() call ending segment_tts
+
+
 @app.route("/health", methods=["GET"])
 def health():
     """
diff --git a/services/espnet-tts/src/app.py b/services/espnet-tts/src/app.py
index a825776d5..347922560 100644
--- a/services/espnet-tts/src/app.py
+++ b/services/espnet-tts/src/app.py
@@ -115,6 +115,22 @@ def segment_tts():
         empty_cache()
 
 
+@app.route("/warmup", methods=["GET"])
+def warmup():
+    """
+    Run a short dummy synthesis to reduce first-request latency.
+    """
+    try:
+        logger.info("[WARMUP] Warmup endpoint triggered.")
+        _ = tts("warmup")
+        return jsonify({"status": "warmed"}), 200
+    except Exception as e:
+        logger.exception("[WARMUP] Warmup failed.")
+        return jsonify({"status": "error", "message": str(e)}), 500
+    finally:
+        empty_cache()  # mirrors the empty_cache() call ending segment_tts
+
+
 @app.route("/health", methods=["GET"])
 def health():
     """