Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ services:
restart: "no"
environment:
- TORCH_DEVICE=cuda
- WARMUP_ENABLED=true
labels:
ca.mcgill.a11y.image.cacheTimeout: 3600
deploy:
Expand All @@ -59,6 +60,7 @@ services:
restart: "no"
environment:
- TORCH_DEVICE=cuda
- WARMUP_ENABLED=true
labels:
ca.mcgill.a11y.image.cacheTimeout: 3600
deploy:
Expand All @@ -80,6 +82,8 @@ services:
devices:
- driver: nvidia
capabilities: ["gpu", "compute", "utility"]
environment:
- WARMUP_ENABLED=true

autour-preprocessor:
profiles: [production, test, default]
Expand Down Expand Up @@ -110,6 +114,7 @@ services:
./config/ollama.env
environment:
- PII_LOGGING_ENABLED=${PII_LOGGING_ENABLED}
- WARMUP_ENABLED=true

graphic-caption:
profiles: [production, test, default]
Expand All @@ -125,6 +130,7 @@ services:
./config/ollama.env
environment:
- PII_LOGGING_ENABLED=${PII_LOGGING_ENABLED}
- WARMUP_ENABLED=true

text-followup:
profiles: [production, test, default]
Expand All @@ -134,6 +140,7 @@ services:
- MAX_HISTORY_LENGTH=100
- HISTORY_EXPIRY=3600
- PII_LOGGING_ENABLED=${PII_LOGGING_ENABLED}
- WARMUP_ENABLED=true
labels:
ca.mcgill.a11y.image.preprocessor: 1
ca.mcgill.a11y.image.port: 5000
Expand Down Expand Up @@ -179,6 +186,7 @@ services:
- PII_LOGGING_ENABLED=${PII_LOGGING_ENABLED}
- YOLO_MODEL_PATH=/usr/src/app/models/yolo11x.pt
- CONF_THRESHOLD=0.75
- WARMUP_ENABLED=true

multistage-diagram-segmentation:
profiles: [production, test, default]
Expand All @@ -201,6 +209,7 @@ services:
- SAM_MODEL_PATH=/usr/src/app/models/sam2.1_l.pt
- GEMINI_MODEL=gemini-2.5-pro-preview-06-05
- BASE_SCHEMA=/usr/src/app/base_schema.json
- WARMUP_ENABLED=true
env_file:
./config/gemini.env

Expand Down Expand Up @@ -263,6 +272,7 @@ services:
ca.mcgill.a11y.image.optional_dependencies: "content-categoriser,graphic-tagger"
environment:
- PII_LOGGING_ENABLED=${PII_LOGGING_ENABLED}
- WARMUP_ENABLED=true

supercollider:
profiles: [production, test, default]
Expand Down Expand Up @@ -376,6 +386,7 @@ services:
ca.mcgill.a11y.image.optional_dependencies: ""
environment:
- PII_LOGGING_ENABLED=${PII_LOGGING_ENABLED}
- WARMUP_ENABLED=true

svg-depth-map:
profiles: [production, test, default]
Expand Down
39 changes: 39 additions & 0 deletions preprocessors/content-categoriser/categoriser.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,5 +198,44 @@ def health():
}), 200


@app.route("/warmup", methods=["GET"])
def warmup():
"""
Trigger a warmup call to load the Ollama LLM into memory.
This avoids first-request latency by sending a dummy request.
"""
try:
# construct the target Ollama endpoint for generate
api_url = f"{os.environ['OLLAMA_URL']}/generate"

# authorization headers with API key
headers = {
"Authorization": f"Bearer {os.environ['OLLAMA_API_KEY']}",
"Content-Type": "application/json"
}

# prepare the warmup request data using the configured model
data = {
"model": os.environ["OLLAMA_MODEL"],
"prompt": "ping",
"stream": False,
"keep_alive": -1 # instruct Ollama to keep the model in memory
}

logging.info("[WARMUP] Warmup endpoint triggered.")
logging.pii(f"[WARMUP] Posting to {api_url} with model \
{data['model']}")

# send warmup request (with timeout)
r = requests.post(api_url, headers=headers, json=data, timeout=60)
r.raise_for_status()

return jsonify({"status": "warmed"}), 200

except Exception as e:
logging.exception(f"[WARMUP] Exception details: {str(e)}")
return jsonify({"status": "error", "message": str(e)}), 500


if __name__ == "__main__":
app.run(host='0.0.0.0', port=5000, debug=True)
25 changes: 24 additions & 1 deletion preprocessors/depth-map-gen/depth-map-generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@
configure_logging()

app = Flask(__name__)
logging.basicConfig(level=logging.DEBUG)


def parse_args():
Expand Down Expand Up @@ -223,6 +222,30 @@ def health():
}), 200


@app.route("/warmup", methods=["GET"])
def warmup():
try:
logging.pii("[WARMUP] Initializing RelDepthModel with resnext101 \
and loading weights from /app/res101.pth")
model = RelDepthModel(backbone='resnext101').eval().cuda()
model.load_state_dict(
strip_prefix_if_present(
torch.load("/app/res101.pth")['depth_model'], "module."),
strict=True
)

# simulating a single RGB image input to the model
# 1: one image; 3: RGB; 448 and 448: height and width
dummy = torch.ones((1, 3, 448, 448), dtype=torch.float32).cuda()
_ = model.inference(dummy)
return jsonify({"status": "warmed"}), 200

except Exception as e:
logging.error("Warmup failed")
logging.pii(f"Warmup error: {e}")
return jsonify({"status": "warmup failed"}), 500


if __name__ == '__main__':
    # NOTE(review): app.run() blocks until the server exits, so
    # depthgenerator() below is effectively unreachable while serving —
    # confirm whether it was meant to run before app.run() instead.
    app.run(host='0.0.0.0', port=5000, debug=True)
    depthgenerator()
40 changes: 39 additions & 1 deletion preprocessors/graphic-caption/caption.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
configure_logging()

app = Flask(__name__)
logging.basicConfig(level=logging.DEBUG)

PROMPT = """Describe this image to a person who cannot see it.
Use simple, descriptive, clear, and concise language.
Expand Down Expand Up @@ -173,5 +172,44 @@ def health():
}), 200


@app.route("/warmup", methods=["GET"])
def warmup():
"""
Trigger a warmup call to load the Ollama LLM into memory.
This avoids first-request latency by sending a dummy request.
"""
try:
# construct the target Ollama endpoint for generate
api_url = f"{os.environ['OLLAMA_URL']}/generate"

# authorization headers with API key
headers = {
"Authorization": f"Bearer {os.environ['OLLAMA_API_KEY']}",
"Content-Type": "application/json"
}

# prepare the warmup request data using the configured model
data = {
"model": os.environ["OLLAMA_MODEL"],
"prompt": "ping",
"stream": False,
"keep_alive": -1 # instruct Ollama to keep the model in memory
}

logging.info("[WARMUP] Warmup endpoint triggered.")
logging.pii(f"[WARMUP] Posting to {api_url} with model \
{data['model']}")

# send warmup request (with timeout)
r = requests.post(api_url, headers=headers, json=data, timeout=60)
r.raise_for_status()

return jsonify({"status": "warmed"}), 200

except Exception as e:
logging.exception(f"[WARMUP] Exception details: {str(e)}")
return jsonify({"status": "error", "message": str(e)}), 500


if __name__ == "__main__":
app.run(host='0.0.0.0', port=5000, debug=True)
4 changes: 2 additions & 2 deletions preprocessors/mmsemseg/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ EXPOSE 5000
ENV FLASK_APP=segment.py
USER python

HEALTHCHECK --interval=60s --timeout=10s --start-period=120s --retries=5 CMD curl -f http://localhost:5000/health || exit 1
HEALTHCHECK --interval=3600s --timeout=30s --start-period=120s --retries=3 CMD curl -f http://localhost:5000/health/gpu || exit 1
HEALTHCHECK --interval=60s --timeout=10s --start-period=120s --retries=5 \
CMD curl -f http://localhost:5000/health && curl -f http://localhost:5000/health/gpu || exit 1

CMD [ "gunicorn", "segment:app", "-b", "0.0.0.0:5000", "--capture-output", "--log-level=debug" ]
38 changes: 35 additions & 3 deletions preprocessors/mmsemseg/segment.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,10 +284,13 @@ def gpu_driver_health_check():
try:
# Get installed NVIDIA driver version from nvidia-smi
nvidia_smi_version = subprocess.check_output(
["nvidia-smi", "--query-gpu=driver_version",
"--format=csv,noheader"],
[
"nvidia-smi",
"--query-gpu=driver_version",
"--format=csv,noheader"
],
text=True
).strip()
).strip().split("\n")[0]

# Get loaded driver version from /proc/driver/nvidia/version
loaded_driver_version = subprocess.check_output(
Expand Down Expand Up @@ -320,5 +323,34 @@ def gpu_driver_health_check():
}), 500


@app.route("/warmup", methods=["GET"])
def warmup():
"""
Warms up the segmentation model by running a dummy inference.
"""
try:
# dummy black image (512×512)
dummy_img = np.zeros((512, 512, 3), dtype=np.uint8)

# runs inference_segmentor(): model weight loading/memory allocation
model = init_segmentor(BEIT_CONFIG, BEIT_CHECKPOINT, device='cuda:0')
_ = inference_segmentor(model, dummy_img)

torch.cuda.empty_cache()

return jsonify({
"status": "warmup successful",
"timestamp": datetime.now().isoformat()
}), 200

except Exception as e:
logging.pii(f"[WARMUP] Warmup failed: {e}")
logging.exception("Warmup failed")
return jsonify({
"status": "warmup failed",
"message": str(e)
}), 500


if __name__ == "__main__":
app.run(host='0.0.0.0', port=5000, debug=True)
Original file line number Diff line number Diff line change
Expand Up @@ -819,5 +819,35 @@ def health():
}), 200


@app.route("/warmup", methods=["GET"])
def warmup():
try:
logging.info("Warming up Gemini and SAM...")

# Gemini: dummy image + prompt
dummy_img = Image.new("RGB", (512, 512), color="white")
response = client.models.generate_content(
model=GEMINI_MODEL,
contents=["{}", dummy_img],
config=types.GenerateContentConfig(
temperature=0.1,
safety_settings=safety_settings,
response_mime_type='application/json',
response_schema=BASE_SCHEMA_GEMINI,
)
)
_ = validate_gemini_response(response)

# SAM: dummy box
dummy_cv2 = np.zeros((512, 512, 3), dtype=np.uint8)
dummy_pil = Image.fromarray(dummy_cv2)
_ = sam_model(dummy_pil, bboxes=[[100, 100, 200, 200]])

return jsonify({"status": "ok"}), 200
except Exception as e:
logging.pii(f"Warmup failed: {str(e)}")
return jsonify({"status": "error", "message": str(e)}), 500


if __name__ == "__main__":
app.run(host='0.0.0.0', port=5000, debug=True)
39 changes: 39 additions & 0 deletions preprocessors/text-followup/text-followup.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,5 +401,44 @@ def health():
}), 200


@app.route("/warmup", methods=["GET"])
def warmup():
"""
Trigger a warmup call to load the Ollama LLM into memory.
This avoids first-request latency by sending a dummy request.
"""
try:
# construct the target Ollama endpoint for chat
api_url = f"{os.environ['OLLAMA_URL']}/chat"

# authorization headers with API key
headers = {
"Authorization": f"Bearer {os.environ['OLLAMA_API_KEY']}"
}

# prepare the warmup request data using the configured model
data = {
"model": os.environ["OLLAMA_MODEL"],
"messages": [{"role": "user", "content": "warmup"}],
"stream": False
}

logging.info("[WARMUP] Warmup endpoint triggered.")
logging.pii(
f"[WARMUP] Posting to {api_url} with model {data['model']}"
)

# send warmup request (with timeout)
r = requests.post(api_url, headers=headers, json=data, timeout=60)
r.raise_for_status()

return jsonify({"status": "warmed"}), 200

except Exception as e:
logging.pii(f"[WARMUP] Warmup failed: {str(e)}")
logging.exception("[WARMUP] Exception details:")
return jsonify({"status": "error", "message": str(e)}), 500


if __name__ == "__main__":
app.run(host='0.0.0.0', port=5000, debug=True)
24 changes: 24 additions & 0 deletions preprocessors/yolo/detect.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,5 +239,29 @@ def health():
}), 200


@app.route("/warmup", methods=["GET"])
def warmup():
try:
# create a blank dummy image (640x640)
dummy_image = Image.new("RGB", (8, 8), color=(0, 0, 0))

# Run YOLO inference with dummy image
with torch.no_grad():
_ = model.predict(
dummy_image,
device=device,
conf=CONF_THRESHOLD,
# imgsz=MAX_IMAGE_SIZE,
verbose=False
)

logging.info("YOLO warmup completed successfully with 8x8 image.")
return jsonify({"status": "ok"}), 200
except Exception as e:
logging.error(f"YOLO warmup failed: {str(e)}")
logging.pii(traceback.format_exc())
return jsonify({"status": "error", "error": str(e)}), 500


if __name__ == "__main__":
app.run(debug=True)
Loading
Loading