feat(workflows): use built images in GitHub workflows (#11284)

CarterFendley · web-flow · commit 1550b363aed3 · 2024-10-11T17:40:11.000Z
* Patch deployments and include debugging info

Signed-off-by: carter.fendley <carter.fendley@gmail.com>

* Build and use driver / launcher too!

Signed-off-by: carter.fendley <carter.fendley@gmail.com>

* Modify waiting status message

Signed-off-by: carter.fendley <carter.fendley@gmail.com>

* Fix typo

Signed-off-by: carter.fendley <carter.fendley@gmail.com>

---------

Signed-off-by: carter.fendley <carter.fendley@gmail.com>
diff --git a/scripts/deploy/github/build-images.sh b/scripts/deploy/github/build-images.sh
@@ -46,6 +46,19 @@ then
   exit $EXIT_CODE
 fi
 
+docker build -q -t "${REGISTRY}/driver:${TAG}" -f backend/Dockerfile.driver . && docker push "${REGISTRY}/driver:${TAG}" || EXIT_CODE=$?
+if [[ $EXIT_CODE -ne 0 ]]
+then
+  echo "Failed to build driver image."
+  exit $EXIT_CODE
+fi
+
+docker build -q -t "${REGISTRY}/launcher:${TAG}" -f backend/Dockerfile.launcher . && docker push "${REGISTRY}/launcher:${TAG}" || EXIT_CODE=$?
+if [[ $EXIT_CODE -ne 0 ]]
+then
+  echo "Failed to build launcher image."
+  exit $EXIT_CODE
+fi
 
 # clean up intermediate build caches to free up disk space
 docker system prune -a -f
diff --git a/scripts/deploy/github/deploy-kfp.sh b/scripts/deploy/github/deploy-kfp.sh
@@ -41,6 +41,18 @@ then
   exit 1
 fi
 
+echo "Patching deployments to use built docker images..."
+# Patch API server
+kubectl patch deployment ml-pipeline -p '{"spec": {"template": {"spec": {"containers": [{"name": "ml-pipeline-api-server", "image": "kind-registry:5000/apiserver"}]}}}}' -n kubeflow
+# Patch persistence agent
+kubectl patch deployment.apps/ml-pipeline-persistenceagent -p '{"spec": {"template": {"spec": {"containers": [{"name": "ml-pipeline-persistenceagent", "image": "kind-registry:5000/persistenceagent"}]}}}}' -n kubeflow
+# Patch scheduled workflow
+kubectl patch deployment.apps/ml-pipeline-scheduledworkflow -p '{"spec": {"template": {"spec": {"containers": [{"name": "ml-pipeline-scheduledworkflow", "image": "kind-registry:5000/scheduledworkflow"}]}}}}' -n kubeflow
+
+# Update environment variables to override driver / launcher
+kubectl set env deployments/ml-pipeline V2_DRIVER_IMAGE=kind-registry:5000/driver -n kubeflow
+kubectl set env deployments/ml-pipeline V2_LAUNCHER_IMAGE=kind-registry:5000/launcher -n kubeflow
+
 # Check if all pods are running - (10 minutes)
 wait_for_pods || EXIT_CODE=$?
 if [[ $EXIT_CODE -ne 0 ]]
diff --git a/scripts/deploy/github/kfp-readiness/wait_for_pods.py b/scripts/deploy/github/kfp-readiness/wait_for_pods.py
@@ -21,15 +21,25 @@ def get_pod_statuses():
         pod_name = pod.metadata.name
         pod_status = pod.status.phase
         container_statuses = pod.status.container_statuses or []
-        ready_containers = sum(1 for status in container_statuses if status.ready)
-        total_containers = len(container_statuses)
-        statuses[pod_name] = (pod_status, ready_containers, total_containers)
+        ready = 0
+        total = 0
+        waiting_messages = []
+        for status in container_statuses:
+            total += 1
+            if status.ready:
+                ready += 1
+            if status.state.waiting is not None:
+                if status.state.waiting.message is not None:
+                    waiting_messages.append(f'Waiting on Container: {status.name} - {status.state.waiting.reason}: {status.state.waiting.message}')
+                else:
+                    waiting_messages.append(f'Waiting on Container: {status.name} - {status.state.waiting.reason}')
+        statuses[pod_name] = (pod_status, ready, total, waiting_messages)
     return statuses
 
 
 def all_pods_ready(statuses):
     return all(pod_status == 'Running' and ready == total
-               for pod_status, ready, total in statuses.values())
+               for pod_status, ready, total, _ in statuses.values())
 
 
 def check_pods(calm_time=10, timeout=600, retries_after_ready=5):
@@ -41,8 +51,10 @@ def check_pods(calm_time=10, timeout=600, retries_after_ready=5):
         current_statuses = get_pod_statuses()
 
         logging.info("Checking pod statuses...")
-        for pod_name, (pod_status, ready, total) in current_statuses.items():
+        for pod_name, (pod_status, ready, total, waiting_messages) in current_statuses.items():
             logging.info(f"Pod {pod_name} - Status: {pod_status}, Ready: {ready}/{total}")
+            for waiting_msg  in waiting_messages:
+                logging.info(waiting_msg)
 
         if current_statuses == previous_statuses:
             if all_pods_ready(current_statuses):
@@ -65,7 +77,7 @@ def check_pods(calm_time=10, timeout=600, retries_after_ready=5):
         raise Exception("Pods did not stabilize within the timeout period.")
 
     logging.info("Final pod statuses:")
-    for pod_name, (pod_status, ready, total) in previous_statuses.items():
+    for pod_name, (pod_status, ready, total, _) in previous_statuses.items():
         if pod_status == 'Running' and ready == total:
             logging.info(f"Pod {pod_name} is fully ready ({ready}/{total})")
         else: