Add GET /warmup endpoints to the model-backed services, opt services in via
WARMUP_ENABLED=true in docker-compose.yml, and add scripts/warmup (invoked
from scripts/imageup) to call those endpoints once containers report healthy.

diff --git a/docker-compose.yml b/docker-compose.yml
index 02c2fdab7..231ce87ba 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -44,6 +44,7 @@ services:
     restart: "no"
     environment:
       - TORCH_DEVICE=cuda
+      - WARMUP_ENABLED=true
     labels:
       ca.mcgill.a11y.image.cacheTimeout: 3600
     deploy:
@@ -59,6 +60,7 @@ services:
     restart: "no"
     environment:
       - TORCH_DEVICE=cuda
+      - WARMUP_ENABLED=true
     labels:
       ca.mcgill.a11y.image.cacheTimeout: 3600
     deploy:
@@ -80,6 +82,8 @@ services:
           devices:
             - driver: nvidia
               capabilities: ["gpu", "compute", "utility"]
+    environment:
+      - WARMUP_ENABLED=true
 
   autour-preprocessor:
     profiles: [production, test, default]
@@ -110,6 +114,7 @@ services:
       ./config/ollama.env
     environment:
       - PII_LOGGING_ENABLED=${PII_LOGGING_ENABLED}
+      - WARMUP_ENABLED=true
 
   graphic-caption:
     profiles: [production, test, default]
@@ -125,6 +130,7 @@ services:
       ./config/ollama.env
     environment:
       - PII_LOGGING_ENABLED=${PII_LOGGING_ENABLED}
+      - WARMUP_ENABLED=true
 
   text-followup:
     profiles: [production, test, default]
@@ -134,6 +140,7 @@ services:
       - MAX_HISTORY_LENGTH=100
       - HISTORY_EXPIRY=3600
       - PII_LOGGING_ENABLED=${PII_LOGGING_ENABLED}
+      - WARMUP_ENABLED=true
     labels:
       ca.mcgill.a11y.image.preprocessor: 1
       ca.mcgill.a11y.image.port: 5000
@@ -179,6 +186,7 @@ services:
       - PII_LOGGING_ENABLED=${PII_LOGGING_ENABLED}
       - YOLO_MODEL_PATH=/usr/src/app/models/yolo11x.pt
       - CONF_THRESHOLD=0.75
+      - WARMUP_ENABLED=true
 
   multistage-diagram-segmentation:
     profiles: [production, test, default]
@@ -201,6 +209,7 @@ services:
       - SAM_MODEL_PATH=/usr/src/app/models/sam2.1_l.pt
       - GEMINI_MODEL=gemini-2.5-pro-preview-06-05
       - BASE_SCHEMA=/usr/src/app/base_schema.json
+      - WARMUP_ENABLED=true
     env_file:
       ./config/gemini.env
 
@@ -263,6 +272,7 @@ services:
       ca.mcgill.a11y.image.optional_dependencies: "content-categoriser,graphic-tagger"
     environment:
       - PII_LOGGING_ENABLED=${PII_LOGGING_ENABLED}
+      - WARMUP_ENABLED=true
 
   supercollider:
     profiles: [production, test, default]
@@ -376,6 +386,7 @@ services:
       ca.mcgill.a11y.image.optional_dependencies: ""
     environment:
       - PII_LOGGING_ENABLED=${PII_LOGGING_ENABLED}
+      - WARMUP_ENABLED=true
 
   svg-depth-map:
     profiles: [production, test, default]
diff --git a/preprocessors/content-categoriser/categoriser.py b/preprocessors/content-categoriser/categoriser.py
index b28237f7c..6ec1b9c7c 100644
--- a/preprocessors/content-categoriser/categoriser.py
+++ b/preprocessors/content-categoriser/categoriser.py
@@ -198,5 +198,44 @@ def health():
     }), 200
 
 
+@app.route("/warmup", methods=["GET"])
+def warmup():
+    """
+    Trigger a warmup call to load the Ollama LLM into memory.
+    This avoids first-request latency by sending a dummy request.
+    """
+    try:
+        # construct the target Ollama endpoint for generate
+        api_url = f"{os.environ['OLLAMA_URL']}/generate"
+
+        # authorization headers with API key
+        headers = {
+            "Authorization": f"Bearer {os.environ['OLLAMA_API_KEY']}",
+            "Content-Type": "application/json"
+        }
+
+        # prepare the warmup request data using the configured model
+        data = {
+            "model": os.environ["OLLAMA_MODEL"],
+            "prompt": "ping",
+            "stream": False,
+            "keep_alive": -1  # instruct Ollama to keep the model in memory
+        }
+
+        logging.info("[WARMUP] Warmup endpoint triggered.")
+        logging.pii(f"[WARMUP] Posting to {api_url} "
+                    f"with model {data['model']}")
+
+        # send warmup request (with timeout)
+        r = requests.post(api_url, headers=headers, json=data, timeout=60)
+        r.raise_for_status()
+
+        return jsonify({"status": "warmed"}), 200
+
+    except Exception as e:
+        logging.exception(f"[WARMUP] Exception details: {str(e)}")
+        return jsonify({"status": "error", "message": str(e)}), 500
+
+
 if __name__ == "__main__":
     app.run(host='0.0.0.0', port=5000, debug=True)
diff --git a/preprocessors/depth-map-gen/depth-map-generator.py b/preprocessors/depth-map-gen/depth-map-generator.py
index eff12c28d..5424523d1 100644
--- a/preprocessors/depth-map-gen/depth-map-generator.py
+++ b/preprocessors/depth-map-gen/depth-map-generator.py
@@ -36,7 +36,6 @@ configure_logging()
 
 
 app = Flask(__name__)
-logging.basicConfig(level=logging.DEBUG)
 
 
 def parse_args():
@@ -223,6 +222,31 @@ def health():
     }), 200
 
 
+@app.route("/warmup", methods=["GET"])
+def warmup():
+    """Load the depth model onto the GPU and run one dummy inference."""
+    try:
+        logging.pii("[WARMUP] Initializing RelDepthModel with resnext101 "
+                    "and loading weights from /app/res101.pth")
+        model = RelDepthModel(backbone='resnext101').eval().cuda()
+        model.load_state_dict(
+            strip_prefix_if_present(
+                torch.load("/app/res101.pth")['depth_model'], "module."),
+            strict=True
+        )
+
+        # simulating a single RGB image input to the model
+        # 1: one image; 3: RGB; 448 and 448: height and width
+        dummy = torch.ones((1, 3, 448, 448), dtype=torch.float32).cuda()
+        _ = model.inference(dummy)
+        return jsonify({"status": "warmed"}), 200
+
+    except Exception as e:
+        logging.error("Warmup failed")
+        logging.pii(f"Warmup error: {e}")
+        return jsonify({"status": "warmup failed"}), 500
+
+
 if __name__ == '__main__':
     app.run(host='0.0.0.0', port=5000, debug=True)
     depthgenerator()
diff --git a/preprocessors/graphic-caption/caption.py b/preprocessors/graphic-caption/caption.py
index 835b62445..64979dbe2 100644
--- a/preprocessors/graphic-caption/caption.py
+++ b/preprocessors/graphic-caption/caption.py
@@ -28,7 +28,6 @@ configure_logging()
 
 
 app = Flask(__name__)
-logging.basicConfig(level=logging.DEBUG)
 
 PROMPT = """Describe this image to a person who cannot see it.
     Use simple, descriptive, clear, and concise language.
@@ -173,5 +172,44 @@ def health():
     }), 200
 
 
+@app.route("/warmup", methods=["GET"])
+def warmup():
+    """
+    Trigger a warmup call to load the Ollama LLM into memory.
+    This avoids first-request latency by sending a dummy request.
+    """
+    try:
+        # construct the target Ollama endpoint for generate
+        api_url = f"{os.environ['OLLAMA_URL']}/generate"
+
+        # authorization headers with API key
+        headers = {
+            "Authorization": f"Bearer {os.environ['OLLAMA_API_KEY']}",
+            "Content-Type": "application/json"
+        }
+
+        # prepare the warmup request data using the configured model
+        data = {
+            "model": os.environ["OLLAMA_MODEL"],
+            "prompt": "ping",
+            "stream": False,
+            "keep_alive": -1  # instruct Ollama to keep the model in memory
+        }
+
+        logging.info("[WARMUP] Warmup endpoint triggered.")
+        logging.pii(f"[WARMUP] Posting to {api_url} "
+                    f"with model {data['model']}")
+
+        # send warmup request (with timeout)
+        r = requests.post(api_url, headers=headers, json=data, timeout=60)
+        r.raise_for_status()
+
+        return jsonify({"status": "warmed"}), 200
+
+    except Exception as e:
+        logging.exception(f"[WARMUP] Exception details: {str(e)}")
+        return jsonify({"status": "error", "message": str(e)}), 500
+
+
 if __name__ == "__main__":
     app.run(host='0.0.0.0', port=5000, debug=True)
diff --git a/preprocessors/mmsemseg/Dockerfile b/preprocessors/mmsemseg/Dockerfile
index 99d0a4c1d..ab4f536d3 100644
--- a/preprocessors/mmsemseg/Dockerfile
+++ b/preprocessors/mmsemseg/Dockerfile
@@ -52,7 +52,7 @@ EXPOSE 5000
 ENV FLASK_APP=segment.py
 USER python
 
-HEALTHCHECK --interval=60s --timeout=10s --start-period=120s --retries=5 CMD curl -f http://localhost:5000/health || exit 1
-HEALTHCHECK --interval=3600s --timeout=30s --start-period=120s --retries=3 CMD curl -f http://localhost:5000/health/gpu || exit 1
+HEALTHCHECK --interval=60s --timeout=10s --start-period=120s --retries=5 \
+    CMD curl -f http://localhost:5000/health && curl -f http://localhost:5000/health/gpu || exit 1
 
 CMD [ "gunicorn", "segment:app", "-b", "0.0.0.0:5000", "--capture-output", "--log-level=debug" ]
\ No newline at end of file
diff --git a/preprocessors/mmsemseg/segment.py b/preprocessors/mmsemseg/segment.py
index 1e5c86236..615a99458 100644
--- a/preprocessors/mmsemseg/segment.py
+++ b/preprocessors/mmsemseg/segment.py
@@ -284,10 +284,13 @@ def gpu_driver_health_check():
     try:
         # Get installed NVIDIA driver version from nvidia-smi
         nvidia_smi_version = subprocess.check_output(
-            ["nvidia-smi", "--query-gpu=driver_version",
-             "--format=csv,noheader"],
+            [
+                "nvidia-smi",
+                "--query-gpu=driver_version",
+                "--format=csv,noheader"
+            ],
             text=True
-        ).strip()
+        ).strip().split("\n")[0]
 
         # Get loaded driver version from /proc/driver/nvidia/version
         loaded_driver_version = subprocess.check_output(
@@ -320,5 +323,34 @@ def gpu_driver_health_check():
         }), 500
 
 
+@app.route("/warmup", methods=["GET"])
+def warmup():
+    """
+    Warms up the segmentation model by running a dummy inference.
+    """
+    try:
+        # dummy black image (512×512)
+        dummy_img = np.zeros((512, 512, 3), dtype=np.uint8)
+
+        # runs inference_segmentor(): model weight loading/memory allocation
+        model = init_segmentor(BEIT_CONFIG, BEIT_CHECKPOINT, device='cuda:0')
+        _ = inference_segmentor(model, dummy_img)
+
+        torch.cuda.empty_cache()
+
+        return jsonify({
+            "status": "warmup successful",
+            "timestamp": datetime.now().isoformat()
+        }), 200
+
+    except Exception as e:
+        logging.pii(f"[WARMUP] Warmup failed: {e}")
+        logging.exception("Warmup failed")
+        return jsonify({
+            "status": "warmup failed",
+            "message": str(e)
+        }), 500
+
+
 if __name__ == "__main__":
     app.run(host='0.0.0.0', port=5000, debug=True)
diff --git a/preprocessors/multistage-diagram-segmentation/multistage-diagram-segmentation.py b/preprocessors/multistage-diagram-segmentation/multistage-diagram-segmentation.py
index 8b1870363..9e94a6c4e 100644
--- a/preprocessors/multistage-diagram-segmentation/multistage-diagram-segmentation.py
+++ b/preprocessors/multistage-diagram-segmentation/multistage-diagram-segmentation.py
@@ -819,5 +819,36 @@ def health():
     }), 200
 
 
+@app.route("/warmup", methods=["GET"])
+def warmup():
+    """Exercise Gemini and SAM once with dummy inputs."""
+    try:
+        logging.info("Warming up Gemini and SAM...")
+
+        # Gemini: dummy image + prompt
+        dummy_img = Image.new("RGB", (512, 512), color="white")
+        response = client.models.generate_content(
+            model=GEMINI_MODEL,
+            contents=["{}", dummy_img],
+            config=types.GenerateContentConfig(
+                temperature=0.1,
+                safety_settings=safety_settings,
+                response_mime_type='application/json',
+                response_schema=BASE_SCHEMA_GEMINI,
+            )
+        )
+        _ = validate_gemini_response(response)
+
+        # SAM: dummy box
+        dummy_cv2 = np.zeros((512, 512, 3), dtype=np.uint8)
+        dummy_pil = Image.fromarray(dummy_cv2)
+        _ = sam_model(dummy_pil, bboxes=[[100, 100, 200, 200]])
+
+        return jsonify({"status": "ok"}), 200
+    except Exception as e:
+        logging.pii(f"Warmup failed: {str(e)}")
+        return jsonify({"status": "error", "message": str(e)}), 500
+
+
 if __name__ == "__main__":
     app.run(host='0.0.0.0', port=5000, debug=True)
diff --git a/preprocessors/text-followup/text-followup.py b/preprocessors/text-followup/text-followup.py
index 2e674557e..1aefd135e 100644
--- a/preprocessors/text-followup/text-followup.py
+++ b/preprocessors/text-followup/text-followup.py
@@ -401,5 +401,44 @@ def health():
     }), 200
 
 
+@app.route("/warmup", methods=["GET"])
+def warmup():
+    """
+    Trigger a warmup call to load the Ollama LLM into memory.
+    This avoids first-request latency by sending a dummy request.
+    """
+    try:
+        # construct the target Ollama endpoint for chat
+        api_url = f"{os.environ['OLLAMA_URL']}/chat"
+
+        # authorization headers with API key
+        headers = {
+            "Authorization": f"Bearer {os.environ['OLLAMA_API_KEY']}"
+        }
+
+        # prepare the warmup request data using the configured model
+        data = {
+            "model": os.environ["OLLAMA_MODEL"],
+            "messages": [{"role": "user", "content": "warmup"}],
+            "stream": False
+        }
+
+        logging.info("[WARMUP] Warmup endpoint triggered.")
+        logging.pii(
+            f"[WARMUP] Posting to {api_url} with model {data['model']}"
+        )
+
+        # send warmup request (with timeout)
+        r = requests.post(api_url, headers=headers, json=data, timeout=60)
+        r.raise_for_status()
+
+        return jsonify({"status": "warmed"}), 200
+
+    except Exception as e:
+        logging.pii(f"[WARMUP] Warmup failed: {str(e)}")
+        logging.exception("[WARMUP] Exception details:")
+        return jsonify({"status": "error", "message": str(e)}), 500
+
+
 if __name__ == "__main__":
     app.run(host='0.0.0.0', port=5000, debug=True)
diff --git a/preprocessors/yolo/detect.py b/preprocessors/yolo/detect.py
index 0c83b8470..83cfea757 100644
--- a/preprocessors/yolo/detect.py
+++ b/preprocessors/yolo/detect.py
@@ -239,5 +239,30 @@ def health():
     }), 200
 
 
+@app.route("/warmup", methods=["GET"])
+def warmup():
+    """Run one YOLO prediction on a tiny blank image."""
+    try:
+        # create a tiny blank dummy image (8x8)
+        dummy_image = Image.new("RGB", (8, 8), color=(0, 0, 0))
+
+        # Run YOLO inference with dummy image
+        with torch.no_grad():
+            _ = model.predict(
+                dummy_image,
+                device=device,
+                conf=CONF_THRESHOLD,
+                # imgsz=MAX_IMAGE_SIZE,
+                verbose=False
+            )
+
+        logging.info("YOLO warmup completed successfully with 8x8 image.")
+        return jsonify({"status": "ok"}), 200
+    except Exception as e:
+        logging.error(f"YOLO warmup failed: {str(e)}")
+        logging.pii(traceback.format_exc())
+        return jsonify({"status": "error", "error": str(e)}), 500
+
+
 if __name__ == "__main__":
     app.run(debug=True)
diff --git a/scripts/imageup b/scripts/imageup
index b2bf80156..dd5927ce0 100755
--- a/scripts/imageup
+++ b/scripts/imageup
@@ -104,4 +104,7 @@ docker network rm -f image || true
 # Bring all the containers back up
 docker compose --env-file "$COMPOSE_ENV_FILE" up -d --force-recreate
 
+echo "Running warmup for critical services..."
+"${SCRIPT_DIR}/warmup" || echo "Warmup did not complete cleanly; continuing."
+
 cd -
\ No newline at end of file
diff --git a/scripts/warmup b/scripts/warmup
new file mode 100755
index 000000000..909ab4f20
--- /dev/null
+++ b/scripts/warmup
@@ -0,0 +1,73 @@
+#!/bin/bash
+
+WARMUP_LOG_DIR="/var/docker/image/testing/warmup"
+# Upper bound (seconds) on how long to wait for one container to turn healthy
+HEALTH_TIMEOUT="${WARMUP_HEALTH_TIMEOUT:-300}"
+timestamp=$(date +"%Y%m%d_%H%M%S")
+logfile="${WARMUP_LOG_DIR}/warmup_${timestamp}.log"
+mkdir -p "$WARMUP_LOG_DIR"
+
+echo "[Warmup] $(date) Starting warmup..." | tee -a "$logfile"
+
+# Restrict to containers attached to the 'image' Docker network
+if docker network inspect image &> /dev/null; then
+    containers=$(docker network inspect image | jq -r '.[0].Containers | to_entries[] | .value.Name')
+else
+    echo "[Warmup] No Docker network named 'image' found. Aborting." | tee -a "$logfile"
+    exit 1
+fi
+
+for container in $containers; do
+    # Check if WARMUP_ENABLED=true is present in the environment
+    if ! docker inspect -f '{{range .Config.Env}}{{println .}}{{end}}' "$container" | grep -q "^WARMUP_ENABLED=true$"; then
+        continue
+    fi
+
+    # Get EXPOSED port (assume first one is the correct one)
+    exposed_port=$(docker inspect -f '{{range $p, $_ := .Config.ExposedPorts}}{{println $p}}{{end}}' "$container" | head -n1 | cut -d'/' -f1)
+    if [ -z "$exposed_port" ]; then
+        echo "[Warmup] $container has no EXPOSEd port. Skipping." | tee -a "$logfile"
+        continue
+    fi
+
+    endpoint="http://localhost:${exposed_port}/warmup"
+
+    echo "[Warmup] Waiting for $container to be healthy..." | tee -a "$logfile"
+    # Poll with a deadline so an unhealthy or healthcheck-less container
+    # cannot block the rest of the warmup run (and the calling imageup script).
+    waited=0
+    status=""
+    while (( waited < HEALTH_TIMEOUT )); do
+        status=$(docker inspect -f '{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}' "$container" 2> /dev/null)
+        if [[ "$status" == "healthy" || "$status" == "none" ]]; then
+            break
+        fi
+        sleep 2
+        waited=$((waited + 2))
+    done
+    if [[ "$status" != "healthy" ]]; then
+        echo "[Warmup] $container is not healthy (status: ${status:-unknown}) after ${waited}s. Skipping." | tee -a "$logfile"
+        continue
+    fi
+
+    # Wait briefly after the container is marked healthy so internal models are fully initialized before warmup.
+    # This prevents race conditions where the healthcheck passes but the model isn't ready.
+    # Random jitter staggers warmups (addresses potential resource spike if all hit at once).
+    # note: even if some warmups fail, most models will still be partially/fully loaded, so the first real request is likely to succeed or respond faster than a cold start.
+    jitter=$((RANDOM % 5))
+    delay=$((10 + jitter))
+    echo "[Warmup] $container marked healthy. Waiting ${delay}s before hitting warmup..." | tee -a "$logfile"
+    sleep "$delay"
+
+    echo "[Warmup] Hitting warmup endpoint at $endpoint..." | tee -a "$logfile"
+    resp=$(docker exec "$container" curl -s -w "%{http_code}" -o /tmp/warmup_resp.txt "$endpoint")
+
+    if [[ "$resp" == "200" ]]; then
+        echo "[Warmup] $container warmed successfully." | tee -a "$logfile"
+    else
+        echo "[Warmup] $container warmup failed with HTTP $resp. Response was:" | tee -a "$logfile"
+        docker exec "$container" cat /tmp/warmup_resp.txt | tee -a "$logfile" || echo "[Warmup] (no response body)" | tee -a "$logfile"
+    fi
+done
+
+echo "[Warmup] Completed at $(date)! " | tee -a "$logfile"
\ No newline at end of file
diff --git a/services/espnet-tts-fr/src/app.py b/services/espnet-tts-fr/src/app.py
index e54ca902c..b5e4a60e4 100644
--- a/services/espnet-tts-fr/src/app.py
+++ b/services/espnet-tts-fr/src/app.py
@@ -172,6 +172,22 @@ def segment_tts():
         empty_cache()
 
 
+@app.route("/warmup", methods=["GET"])
+def warmup():
+    """
+    Trigger a dummy call to warm up the model and pre-load it into GPU memory,
+    reducing first-request latency by avoiding a model load at request time.
+    """
+    try:
+        logger.info("[WARMUP] Warmup endpoint triggered.")
+        # Run inference on a short dummy input
+        _ = tts("warmup")
+        return jsonify({"status": "warmed"}), 200
+    except Exception as e:
+        logger.exception("[WARMUP] Warmup failed.")
+        return jsonify({"status": "error", "message": str(e)}), 500
+
+
 @app.route("/health", methods=["GET"])
 def health():
     """
diff --git a/services/espnet-tts/src/app.py b/services/espnet-tts/src/app.py
index a825776d5..347922560 100644
--- a/services/espnet-tts/src/app.py
+++ b/services/espnet-tts/src/app.py
@@ -115,6 +115,22 @@ def segment_tts():
         empty_cache()
 
 
+@app.route("/warmup", methods=["GET"])
+def warmup():
+    """
+    Trigger a dummy call to warm up the model and pre-load it into GPU memory,
+    reducing first-request latency by avoiding a model load at request time.
+    """
+    try:
+        logger.info("[WARMUP] Warmup endpoint triggered.")
+        # Run inference on a short dummy input
+        _ = tts("warmup")
+        return jsonify({"status": "warmed"}), 200
+    except Exception as e:
+        logger.exception("[WARMUP] Warmup failed.")
+        return jsonify({"status": "error", "message": str(e)}), 500
+
+
 @app.route("/health", methods=["GET"])
 def health():
     """