From 8c99f108f80f2a6fcbf83dbab8e273e225cb6073 Mon Sep 17 00:00:00 2001
From: vsoch
Date: Thu, 4 Apr 2024 19:40:15 -0600
Subject: [PATCH] test: only allow scheduling first pod

Problem: we currently allow any pod in the group to make the request.
Solution: Making a BIG assumption that might be wrong, I am adding logic
that only allows scheduling (meaning going through PreFilter with AskFlux)
when the pod being considered is the first pod in the listing. In practice
this is the first index (index 0), which I think works given our sorting
strategy (timestamp, then name), but I am not 100% sure of that. The reason
we want this is so the nodes are chosen for the first pod, and then the rest
of the group can quickly follow and actually be assigned. Before this change
I kept seeing huge delays waiting for the queue to move (e.g., 5/6 pods
Running and the last one waiting, then kicking in much later like an old
car), and I think this tweak fixes that. But that is my subjective
evaluation.

I am also adding the hack script for deploying to GKE, which requires a
push instead of a kind load.

Signed-off-by: vsoch
---
 README.md                                  |  1 -
 hack/quick-build-gke.sh                    | 33 ++++++++++++++++
 hack/quick-build.sh                        |  2 +-
 .../pkg/fluence/core/core.go               | 39 ++++++++++++++++---
 sig-scheduler-plugins/pkg/logger/logger.go |  3 +-
 5 files changed, 68 insertions(+), 10 deletions(-)
 create mode 100755 hack/quick-build-gke.sh

diff --git a/README.md b/README.md
index e3e1214..ff3327e 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,6 @@ Fluence enables HPC-grade pod scheduling in Kubernetes via the [Kubernetes Sched
 
 ## TODO
 
-- On init, need to load in resource graph that accounts for running stuff
 - Need to allow for restart / crashes and looking up existing jobid, updating maps in PodGroup
 - Since AskFlux is done on level of pod group, refactor function to account for specific resources of all pods (not just one pod)
 - Figure out if EventsToRegister replaces old informer
diff --git a/hack/quick-build-gke.sh b/hack/quick-build-gke.sh
new file mode 100755
index 0000000..875360a
--- /dev/null
+++ b/hack/quick-build-gke.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+# Before running this, you should:
+# 1. create the GKE cluster (needs more than one node; fluence does not schedule to the control plane)
+# 2. Install cert-manager
+# 3. Customize the script to point to your registry if you intend to push
+
+REGISTRY="${1:-ghcr.io/vsoch}"
+HERE=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+ROOT=$(dirname ${HERE})
+
+# Go to the script directory
+cd ${ROOT}
+
+# These build each of the images. The sidecar is separate from the other two in src/
+make REGISTRY=${REGISTRY} SCHEDULER_IMAGE=fluence SIDECAR_IMAGE=fluence-sidecar CONTROLLER_IMAGE=fluence-controller
+
+# This is what it might look like to push
+# docker push ghcr.io/vsoch/fluence-sidecar && docker push ghcr.io/vsoch/fluence-controller && docker push ghcr.io/vsoch/fluence:latest
+
+# Unlike the kind script, we push to the registry so the GKE cluster can pull the images
+docker push ${REGISTRY}/fluence-sidecar:latest
+docker push ${REGISTRY}/fluence-controller:latest
+docker push ${REGISTRY}/fluence:latest
+
+# And then install using the charts. The images will be pulled from the registry
+cd ${ROOT}/upstream/manifests/install/charts
+helm uninstall fluence || true
+helm install \
+    --set scheduler.image=${REGISTRY}/fluence:latest \
+    --set controller.image=${REGISTRY}/fluence-controller:latest \
+    --set scheduler.sidecarimage=${REGISTRY}/fluence-sidecar:latest \
+    fluence as-a-second-scheduler/
diff --git a/hack/quick-build.sh b/hack/quick-build.sh
index b3ccefe..23a5c87 100755
--- a/hack/quick-build.sh
+++ b/hack/quick-build.sh
@@ -33,4 +33,4 @@ helm install \
     --set controller.pullPolicy=Never \
     --set controller.image=${REGISTRY}/fluence-controller:latest \
     --set scheduler.sidecarimage=${REGISTRY}/fluence-sidecar:latest \
-    fluence as-a-second-scheduler/
\ No newline at end of file
+    fluence as-a-second-scheduler/
diff --git a/sig-scheduler-plugins/pkg/fluence/core/core.go b/sig-scheduler-plugins/pkg/fluence/core/core.go
index 8b08468..1e75814 100644
--- a/sig-scheduler-plugins/pkg/fluence/core/core.go
+++ b/sig-scheduler-plugins/pkg/fluence/core/core.go
@@ -71,7 +71,7 @@ type PodGroupManager struct {
 	scheduleTimeout *time.Duration
 	// permittedPG stores the podgroup name which has passed the pre resource check.
 	permittedPG *gochache.Cache
-	// backedOffPG stores the podgorup name which failed scheudling recently.
+	// backedOffPG stores the podgroup name which failed scheduling recently.
 	backedOffPG *gochache.Cache
 	// podLister is pod lister
 	podLister listerv1.PodLister
@@ -111,12 +111,25 @@ func NewPodGroupManager(
 }
 
 // GetStatuses string (of all pods) to show for debugging purposes
-func (pgMgr *PodGroupManager) GetStatuses(pods []*corev1.Pod) string {
+// Since we loop here, we also determine the index of the pod we are
+// considering within the group listing
+func (pgMgr *PodGroupManager) GetStatusesAndIndex(
+	pods []*corev1.Pod,
+	pod *corev1.Pod,
+) (string, bool, int) {
 	statuses := ""
-	for _, pod := range pods {
-		statuses += " " + fmt.Sprintf("%s", pod.Status.Phase)
+
+	// We need to distinguish index 0 from the default value when the pod is not found
+	foundIndex := false
+	index := 0
+	for i, p := range pods {
+		if p.Name == pod.Name {
+			foundIndex = true
+			index = i
+		}
+		statuses += " " + fmt.Sprintf("%s", p.Status.Phase)
 	}
-	return statuses
+	return statuses, foundIndex, index
 }
 
 // GetPodNode is a quick lookup to see if we have a node
@@ -153,8 +166,10 @@ func (pgMgr *PodGroupManager) PreFilter(
 		return fmt.Errorf("podLister list pods failed: %w", err)
 	}
 
+	// Only allow scheduling the first pod in the group so the others can follow it
+
 	// Get statuses to show for debugging
-	statuses := pgMgr.GetStatuses(pods)
+	statuses, found, idx := pgMgr.GetStatusesAndIndex(pods, pod)
 	// This shows us the number of pods we have in the set and their states
 	pgMgr.log.Info("[PodGroup PreFilter] group: %s pods: %s MinMember: %d Size: %d",
 		pgFullName, statuses, pg.Spec.MinMember, len(pods))
@@ -163,6 +178,18 @@
 			"current pods number: %v, minMember of group: %v", pod.Name, len(pods), pg.Spec.MinMember)
 	}
 
+	if !found {
+		return fmt.Errorf("pod %s was not found in group - this should not happen", pod.Name)
+	}
+
+	// We will only AskFlux for the first pod in the group.
+	// This assumes that the listing order matches the queue order; I'm not
+	// sure that is true in practice. This is the one case where we rely on
+	// retry. This design probably needs more thought and work.
+ if idx != 0 { + return fmt.Errorf("pod %s is not first in the list, will wait to schedule", pod.Name) + } + // TODO we likely can take advantage of these resources or other custom // attributes we add. For now ignore and calculate based on pod needs (above) // if pg.Spec.MinResources == nil { diff --git a/sig-scheduler-plugins/pkg/logger/logger.go b/sig-scheduler-plugins/pkg/logger/logger.go index 522be61..053021a 100644 --- a/sig-scheduler-plugins/pkg/logger/logger.go +++ b/sig-scheduler-plugins/pkg/logger/logger.go @@ -19,7 +19,6 @@ const ( LevelDebug ) -// TODO try saving state here when we can close type DebugLogger struct { level int Filename string @@ -28,7 +27,7 @@ type DebugLogger struct { func NewDebugLogger(level int, filename string) *DebugLogger { return &DebugLogger{ - level: LevelNone, + level: level, Filename: filename, } }
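
Note (not part of the patch): the idx != 0 gate above assumes the group listing is ordered so that
index 0 is the "first" pod, per the sorting strategy (timestamp, then name) mentioned in the commit
message. The sketch below only illustrates that ordering assumption; the helper name
sortPodsByCreationThenName and the example pods are hypothetical, and the real ordering comes from
the lister and queue sort plugin, which may differ.

package main

import (
	"fmt"
	"sort"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// sortPodsByCreationThenName is a hypothetical helper that orders pods the way
// the gate expects: by creation timestamp, breaking ties by name, so that
// pods[0] is the one pod whose PreFilter call is allowed to AskFlux.
func sortPodsByCreationThenName(pods []*corev1.Pod) {
	sort.SliceStable(pods, func(i, j int) bool {
		ti, tj := pods[i].CreationTimestamp, pods[j].CreationTimestamp
		if !ti.Equal(&tj) {
			return ti.Before(&tj)
		}
		return pods[i].Name < pods[j].Name
	})
}

func main() {
	created := metav1.Now()
	pods := []*corev1.Pod{
		{ObjectMeta: metav1.ObjectMeta{Name: "job-1", CreationTimestamp: created}},
		{ObjectMeta: metav1.ObjectMeta{Name: "job-0", CreationTimestamp: created}},
	}
	sortPodsByCreationThenName(pods)
	// Only pods[0] ("job-0" here) would pass the idx != 0 check in PreFilter.
	fmt.Println("first pod in the group listing:", pods[0].Name)
}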