Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ services:
restart: "no"
environment:
- TORCH_DEVICE=cuda
- WARMUP_ENABLED=true
labels:
ca.mcgill.a11y.image.cacheTimeout: 3600
deploy:
Expand All @@ -59,6 +60,7 @@ services:
restart: "no"
environment:
- TORCH_DEVICE=cuda
- WARMUP_ENABLED=true
labels:
ca.mcgill.a11y.image.cacheTimeout: 3600
deploy:
Expand All @@ -80,6 +82,8 @@ services:
devices:
- driver: nvidia
capabilities: ["gpu", "compute", "utility"]
environment:
- WARMUP_ENABLED=true

autour-preprocessor:
profiles: [production, test, default]
Expand Down Expand Up @@ -110,6 +114,7 @@ services:
./config/ollama.env
environment:
- PII_LOGGING_ENABLED=${PII_LOGGING_ENABLED}
- WARMUP_ENABLED=true

graphic-caption:
profiles: [production, test, default]
Expand All @@ -125,6 +130,7 @@ services:
./config/ollama.env
environment:
- PII_LOGGING_ENABLED=${PII_LOGGING_ENABLED}
- WARMUP_ENABLED=true

text-followup:
profiles: [production, test, default]
Expand All @@ -134,6 +140,7 @@ services:
- MAX_HISTORY_LENGTH=100
- HISTORY_EXPIRY=3600
- PII_LOGGING_ENABLED=${PII_LOGGING_ENABLED}
- WARMUP_ENABLED=true
labels:
ca.mcgill.a11y.image.preprocessor: 1
ca.mcgill.a11y.image.port: 5000
Expand Down Expand Up @@ -179,6 +186,7 @@ services:
- PII_LOGGING_ENABLED=${PII_LOGGING_ENABLED}
- YOLO_MODEL_PATH=/usr/src/app/models/yolo11x.pt
- CONF_THRESHOLD=0.75
- WARMUP_ENABLED=true

multistage-diagram-segmentation:
profiles: [production, test, default]
Expand All @@ -201,6 +209,7 @@ services:
- SAM_MODEL_PATH=/usr/src/app/models/sam2.1_l.pt
- GEMINI_MODEL=gemini-2.5-pro-preview-06-05
- BASE_SCHEMA=/usr/src/app/base_schema.json
- WARMUP_ENABLED=true
env_file:
./config/gemini.env

Expand Down Expand Up @@ -263,6 +272,7 @@ services:
ca.mcgill.a11y.image.optional_dependencies: "content-categoriser,graphic-tagger"
environment:
- PII_LOGGING_ENABLED=${PII_LOGGING_ENABLED}
- WARMUP_ENABLED=true

supercollider:
profiles: [production, test, default]
Expand Down Expand Up @@ -376,6 +386,7 @@ services:
ca.mcgill.a11y.image.optional_dependencies: ""
environment:
- PII_LOGGING_ENABLED=${PII_LOGGING_ENABLED}
- WARMUP_ENABLED=true

svg-depth-map:
profiles: [production, test, default]
Expand Down
39 changes: 39 additions & 0 deletions preprocessors/content-categoriser/categoriser.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,5 +198,44 @@ def health():
}), 200


@app.route("/warmup", methods=["GET"])
def warmup():
"""
Trigger a warmup call to load the Ollama LLM into memory.
This avoids first-request latency by sending a dummy request.
"""
try:
# construct the target Ollama endpoint for generate
api_url = f"{os.environ['OLLAMA_URL']}/generate"

# authorization headers with API key
headers = {
"Authorization": f"Bearer {os.environ['OLLAMA_API_KEY']}",
"Content-Type": "application/json"
}

# prepare the warmup request data using the configured model
data = {
"model": os.environ["OLLAMA_MODEL"],
"prompt": "ping",
"stream": False,
"keep_alive": -1 # instruct Ollama to keep the model in memory
}

logging.info("[WARMUP] Warmup endpoint triggered.")
logging.pii(f"[WARMUP] Posting to {api_url} with model \
{data['model']}")

# send warmup request (with timeout)
r = requests.post(api_url, headers=headers, json=data, timeout=60)
r.raise_for_status()

return jsonify({"status": "warmed"}), 200

except Exception as e:
logging.exception(f"[WARMUP] Exception details: {str(e)}")
return jsonify({"status": "error", "message": str(e)}), 500


if __name__ == "__main__":
app.run(host='0.0.0.0', port=5000, debug=True)
25 changes: 24 additions & 1 deletion preprocessors/depth-map-gen/depth-map-generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@
configure_logging()

app = Flask(__name__)
logging.basicConfig(level=logging.DEBUG)


def parse_args():
Expand Down Expand Up @@ -223,6 +222,30 @@ def health():
}), 200


@app.route("/warmup", methods=["GET"])
def warmup():
try:
logging.pii("[WARMUP] Initializing RelDepthModel with resnext101 \
and loading weights from /app/res101.pth")
model = RelDepthModel(backbone='resnext101').eval().cuda()
model.load_state_dict(
strip_prefix_if_present(
torch.load("/app/res101.pth")['depth_model'], "module."),
strict=True
)

# simulating a single RGB image input to the model
# 1: one image; 3: RGB; 448 and 448: height and width
dummy = torch.ones((1, 3, 448, 448), dtype=torch.float32).cuda()
_ = model.inference(dummy)
return jsonify({"status": "warmed"}), 200

except Exception as e:
logging.error("Warmup failed")
logging.pii(f"Warmup error: {e}")
return jsonify({"status": "warmup failed"}), 500


if __name__ == '__main__':
    # NOTE(review): app.run() blocks until the server exits, so
    # depthgenerator() below is effectively unreachable while serving —
    # confirm whether it was meant to run before app.run() instead.
    app.run(host='0.0.0.0', port=5000, debug=True)
    depthgenerator()
40 changes: 39 additions & 1 deletion preprocessors/graphic-caption/caption.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
configure_logging()

app = Flask(__name__)
logging.basicConfig(level=logging.DEBUG)

PROMPT = """Describe this image to a person who cannot see it.
Use simple, descriptive, clear, and concise language.
Expand Down Expand Up @@ -173,5 +172,44 @@ def health():
}), 200


@app.route("/warmup", methods=["GET"])
def warmup():
"""
Trigger a warmup call to load the Ollama LLM into memory.
This avoids first-request latency by sending a dummy request.
"""
try:
# construct the target Ollama endpoint for generate
api_url = f"{os.environ['OLLAMA_URL']}/generate"

# authorization headers with API key
headers = {
"Authorization": f"Bearer {os.environ['OLLAMA_API_KEY']}",
"Content-Type": "application/json"
}

# prepare the warmup request data using the configured model
data = {
"model": os.environ["OLLAMA_MODEL"],
"prompt": "ping",
"stream": False,
"keep_alive": -1 # instruct Ollama to keep the model in memory
}

logging.info("[WARMUP] Warmup endpoint triggered.")
logging.pii(f"[WARMUP] Posting to {api_url} with model \
{data['model']}")

# send warmup request (with timeout)
r = requests.post(api_url, headers=headers, json=data, timeout=60)
r.raise_for_status()

return jsonify({"status": "warmed"}), 200

except Exception as e:
logging.exception(f"[WARMUP] Exception details: {str(e)}")
return jsonify({"status": "error", "message": str(e)}), 500


if __name__ == "__main__":
app.run(host='0.0.0.0', port=5000, debug=True)
4 changes: 2 additions & 2 deletions preprocessors/mmsemseg/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ EXPOSE 5000
ENV FLASK_APP=segment.py
USER python

HEALTHCHECK --interval=60s --timeout=10s --start-period=120s --retries=5 CMD curl -f http://localhost:5000/health || exit 1
HEALTHCHECK --interval=3600s --timeout=30s --start-period=120s --retries=3 CMD curl -f http://localhost:5000/health/gpu || exit 1
HEALTHCHECK --interval=60s --timeout=10s --start-period=120s --retries=5 \
CMD curl -f http://localhost:5000/health && curl -f http://localhost:5000/health/gpu || exit 1

CMD [ "gunicorn", "segment:app", "-b", "0.0.0.0:5000", "--capture-output", "--log-level=debug" ]
38 changes: 35 additions & 3 deletions preprocessors/mmsemseg/segment.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,10 +284,13 @@ def gpu_driver_health_check():
try:
# Get installed NVIDIA driver version from nvidia-smi
nvidia_smi_version = subprocess.check_output(
["nvidia-smi", "--query-gpu=driver_version",
"--format=csv,noheader"],
[
"nvidia-smi",
"--query-gpu=driver_version",
"--format=csv,noheader"
],
text=True
).strip()
).strip().split("\n")[0]

# Get loaded driver version from /proc/driver/nvidia/version
loaded_driver_version = subprocess.check_output(
Expand Down Expand Up @@ -320,5 +323,34 @@ def gpu_driver_health_check():
}), 500


@app.route("/warmup", methods=["GET"])
def warmup():
"""
Warms up the segmentation model by running a dummy inference.
"""
try:
# dummy black image (512×512)
dummy_img = np.zeros((512, 512, 3), dtype=np.uint8)

# runs inference_segmentor(): model weight loading/memory allocation
model = init_segmentor(BEIT_CONFIG, BEIT_CHECKPOINT, device='cuda:0')
_ = inference_segmentor(model, dummy_img)

torch.cuda.empty_cache()

return jsonify({
"status": "warmup successful",
"timestamp": datetime.now().isoformat()
}), 200

except Exception as e:
logging.pii(f"[WARMUP] Warmup failed: {e}")
logging.exception("Warmup failed")
return jsonify({
"status": "warmup failed",
"message": str(e)
}), 500


if __name__ == "__main__":
app.run(host='0.0.0.0', port=5000, debug=True)
Original file line number Diff line number Diff line change
Expand Up @@ -819,5 +819,35 @@ def health():
}), 200


@app.route("/warmup", methods=["GET"])
def warmup():
try:
logging.info("Warming up Gemini and SAM...")

# Gemini: dummy image + prompt
dummy_img = Image.new("RGB", (512, 512), color="white")
response = client.models.generate_content(
model=GEMINI_MODEL,
contents=["{}", dummy_img],
config=types.GenerateContentConfig(
temperature=0.1,
safety_settings=safety_settings,
response_mime_type='application/json',
response_schema=BASE_SCHEMA_GEMINI,
)
)
_ = validate_gemini_response(response)

# SAM: dummy box
dummy_cv2 = np.zeros((512, 512, 3), dtype=np.uint8)
dummy_pil = Image.fromarray(dummy_cv2)
_ = sam_model(dummy_pil, bboxes=[[100, 100, 200, 200]])

return jsonify({"status": "ok"}), 200
except Exception as e:
logging.pii(f"Warmup failed: {str(e)}")
return jsonify({"status": "error", "message": str(e)}), 500


if __name__ == "__main__":
app.run(host='0.0.0.0', port=5000, debug=True)
39 changes: 39 additions & 0 deletions preprocessors/text-followup/text-followup.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,5 +401,44 @@ def health():
}), 200


@app.route("/warmup", methods=["GET"])
def warmup():
"""
Trigger a warmup call to load the Ollama LLM into memory.
This avoids first-request latency by sending a dummy request.
"""
try:
# construct the target Ollama endpoint for chat
api_url = f"{os.environ['OLLAMA_URL']}/chat"

# authorization headers with API key
headers = {
"Authorization": f"Bearer {os.environ['OLLAMA_API_KEY']}"
}

# prepare the warmup request data using the configured model
data = {
"model": os.environ["OLLAMA_MODEL"],
"messages": [{"role": "user", "content": "warmup"}],
"stream": False
}

logging.info("[WARMUP] Warmup endpoint triggered.")
logging.pii(
f"[WARMUP] Posting to {api_url} with model {data['model']}"
)

# send warmup request (with timeout)
r = requests.post(api_url, headers=headers, json=data, timeout=60)
r.raise_for_status()

return jsonify({"status": "warmed"}), 200

except Exception as e:
logging.pii(f"[WARMUP] Warmup failed: {str(e)}")
logging.exception("[WARMUP] Exception details:")
return jsonify({"status": "error", "message": str(e)}), 500


if __name__ == "__main__":
app.run(host='0.0.0.0', port=5000, debug=True)
24 changes: 24 additions & 0 deletions preprocessors/yolo/detect.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,5 +239,29 @@ def health():
}), 200


@app.route("/warmup", methods=["GET"])
def warmup():
try:
# create a blank dummy image (640x640)
dummy_image = Image.new("RGB", (8, 8), color=(0, 0, 0))

# Run YOLO inference with dummy image
with torch.no_grad():
_ = model.predict(
dummy_image,
device=device,
conf=CONF_THRESHOLD,
# imgsz=MAX_IMAGE_SIZE,
verbose=False
)

logging.info("YOLO warmup completed successfully with 8x8 image.")
return jsonify({"status": "ok"}), 200
except Exception as e:
logging.error(f"YOLO warmup failed: {str(e)}")
logging.pii(traceback.format_exc())
return jsonify({"status": "error", "error": str(e)}), 500


if __name__ == "__main__":
app.run(debug=True)
Loading
Loading