From 8c99f108f80f2a6fcbf83dbab8e273e225cb6073 Mon Sep 17 00:00:00 2001
From: vsoch
Date: Thu, 4 Apr 2024 19:40:15 -0600
Subject: [PATCH] test: only allow scheduling first pod

Problem: we currently allow any pod in the group to make the request.
Solution: Making a BIG assumption that might be wrong, I am adding logic
that only allows scheduling (meaning going through PreFilter with AskFlux)
when the pod being considered is the first pod in the listing. In practice
this is the first index (index 0), which I think works given our sorting
strategy (timestamp, then name), but I am not 100% sure of that. The reason
we want this is so the nodes are chosen for the first pod, and then the rest
of the group can quickly follow and actually be assigned. Before this change
I kept seeing huge delays waiting for the queue to move (e.g., 5/6 pods
Running and the last one waiting, then kicking in much later like an old
car), and I think this tweak fixes that. But that is my subjective
evaluation.

I am also adding the hack script for deploying to GKE, which requires a
push instead of a kind load.

Signed-off-by: vsoch
---
 README.md                                  |  1 -
 hack/quick-build-gke.sh                    | 33 ++++++++++++++++
 hack/quick-build.sh                        |  2 +-
 .../pkg/fluence/core/core.go               | 39 ++++++++++++++++---
 sig-scheduler-plugins/pkg/logger/logger.go |  3 +-
 5 files changed, 68 insertions(+), 10 deletions(-)
 create mode 100755 hack/quick-build-gke.sh

diff --git a/README.md b/README.md
index e3e1214..ff3327e 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,6 @@ Fluence enables HPC-grade pod scheduling in Kubernetes via the [Kubernetes Sched
 
 ## TODO
 
-- On init, need to load in resource graph that accounts for running stuff
 - Need to allow for restart / crashes and looking up existing jobid, updating maps in PodGroup
 - Since AskFlux is done on level of pod group, refactor function to account for specific resources of all pods (not just one pod)
 - Figure out if EventsToRegister replaces old informer
diff --git a/hack/quick-build-gke.sh b/hack/quick-build-gke.sh
new file mode 100755
index 0000000..875360a
--- /dev/null
+++ b/hack/quick-build-gke.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+# Before running this, you should:
+# 1. create the GKE cluster (needs more than one node; fluence does not schedule to the control plane)
+# 2. Install cert-manager
+# 3. Customize the script to point to your registry if you intend to push
+
+REGISTRY="${1:-ghcr.io/vsoch}"
+HERE=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+ROOT=$(dirname ${HERE})
+
+# Go to the script directory
+cd ${ROOT}
+
+# These build each of the images. The sidecar is separate from the other two in src/
+make REGISTRY=${REGISTRY} SCHEDULER_IMAGE=fluence SIDECAR_IMAGE=fluence-sidecar CONTROLLER_IMAGE=fluence-controller
+
+# This is what it might look like to push
+# docker push ghcr.io/vsoch/fluence-sidecar && docker push ghcr.io/vsoch/fluence-controller && docker push ghcr.io/vsoch/fluence:latest
+
+# Unlike the kind script, we push to the registry so the GKE cluster can pull the images
+docker push ${REGISTRY}/fluence-sidecar:latest
+docker push ${REGISTRY}/fluence-controller:latest
+docker push ${REGISTRY}/fluence:latest
+
+# And then install using the charts. The images will be pulled from the registry
+cd ${ROOT}/upstream/manifests/install/charts
+helm uninstall fluence || true
+helm install \
+    --set scheduler.image=${REGISTRY}/fluence:latest \
+    --set controller.image=${REGISTRY}/fluence-controller:latest \
+    --set scheduler.sidecarimage=${REGISTRY}/fluence-sidecar:latest \
+    fluence as-a-second-scheduler/
diff --git a/hack/quick-build.sh b/hack/quick-build.sh
index b3ccefe..23a5c87 100755
--- a/hack/quick-build.sh
+++ b/hack/quick-build.sh
@@ -33,4 +33,4 @@ helm install \
     --set controller.pullPolicy=Never \
     --set controller.image=${REGISTRY}/fluence-controller:latest \
     --set scheduler.sidecarimage=${REGISTRY}/fluence-sidecar:latest \
-    fluence as-a-second-scheduler/
\ No newline at end of file
+    fluence as-a-second-scheduler/
diff --git a/sig-scheduler-plugins/pkg/fluence/core/core.go b/sig-scheduler-plugins/pkg/fluence/core/core.go
index 8b08468..1e75814 100644
--- a/sig-scheduler-plugins/pkg/fluence/core/core.go
+++ b/sig-scheduler-plugins/pkg/fluence/core/core.go
@@ -71,7 +71,7 @@ type PodGroupManager struct {
 	scheduleTimeout *time.Duration
 	// permittedPG stores the podgroup name which has passed the pre resource check.
 	permittedPG *gochache.Cache
-	// backedOffPG stores the podgorup name which failed scheudling recently.
+	// backedOffPG stores the podgroup name which failed scheduling recently.
 	backedOffPG *gochache.Cache
 	// podLister is pod lister
 	podLister listerv1.PodLister
@@ -111,12 +111,25 @@ func NewPodGroupManager(
 }
 
 // GetStatuses string (of all pods) to show for debugging purposes
-func (pgMgr *PodGroupManager) GetStatuses(pods []*corev1.Pod) string {
+// Since we loop here, we also determine the index of the pod we are
+// considering within the group listing
+func (pgMgr *PodGroupManager) GetStatusesAndIndex(
+	pods []*corev1.Pod,
+	pod *corev1.Pod,
+) (string, bool, int) {
 	statuses := ""
-	for _, pod := range pods {
-		statuses += " " + fmt.Sprintf("%s", pod.Status.Phase)
+
+	// We need to distinguish index 0 from the default value when the pod is not found
+	foundIndex := false
+	index := 0
+	for i, p := range pods {
+		if p.Name == pod.Name {
+			foundIndex = true
+			index = i
+		}
+		statuses += " " + fmt.Sprintf("%s", p.Status.Phase)
 	}
-	return statuses
+	return statuses, foundIndex, index
 }
 
 // GetPodNode is a quick lookup to see if we have a node
@@ -153,8 +166,10 @@ func (pgMgr *PodGroupManager) PreFilter(
 		return fmt.Errorf("podLister list pods failed: %w", err)
 	}
 
+	// Only allow scheduling the first pod in the group so the others can follow it
+
 	// Get statuses to show for debugging
-	statuses := pgMgr.GetStatuses(pods)
+	statuses, found, idx := pgMgr.GetStatusesAndIndex(pods, pod)
 	// This shows us the number of pods we have in the set and their states
 	pgMgr.log.Info("[PodGroup PreFilter] group: %s pods: %s MinMember: %d Size: %d",
 		pgFullName, statuses, pg.Spec.MinMember, len(pods))
@@ -163,6 +178,18 @@
 			"current pods number: %v, minMember of group: %v", pod.Name, len(pods), pg.Spec.MinMember)
 	}
 
+	if !found {
+		return fmt.Errorf("pod %s was not found in group - this should not happen", pod.Name)
+	}
+
+	// We will only AskFlux for the first pod in the group.
+	// This assumes that the listing order matches the queue order; I'm not
+	// sure that is true in practice. This is the one case where we rely on
+	// retry. This design probably needs more thought and work.
+ if idx != 0 { + return fmt.Errorf("pod %s is not first in the list, will wait to schedule", pod.Name) + } + // TODO we likely can take advantage of these resources or other custom // attributes we add. For now ignore and calculate based on pod needs (above) // if pg.Spec.MinResources == nil { diff --git a/sig-scheduler-plugins/pkg/logger/logger.go b/sig-scheduler-plugins/pkg/logger/logger.go index 522be61..053021a 100644 --- a/sig-scheduler-plugins/pkg/logger/logger.go +++ b/sig-scheduler-plugins/pkg/logger/logger.go @@ -19,7 +19,6 @@ const ( LevelDebug ) -// TODO try saving state here when we can close type DebugLogger struct { level int Filename string @@ -28,7 +27,7 @@ type DebugLogger struct { func NewDebugLogger(level int, filename string) *DebugLogger { return &DebugLogger{ - level: LevelNone, + level: level, Filename: filename, } }
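
Note (not part of the patch): the idx != 0 gate above assumes the group listing is ordered so that
index 0 is the "first" pod, per the sorting strategy (timestamp, then name) mentioned in the commit
message. The sketch below only illustrates that ordering assumption; the helper name
sortPodsByCreationThenName and the example pods are hypothetical, and the real ordering comes from
the lister and queue sort plugin, which may differ.

package main

import (
	"fmt"
	"sort"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// sortPodsByCreationThenName is a hypothetical helper that orders pods the way
// the gate expects: by creation timestamp, breaking ties by name, so that
// pods[0] is the one pod whose PreFilter call is allowed to AskFlux.
func sortPodsByCreationThenName(pods []*corev1.Pod) {
	sort.SliceStable(pods, func(i, j int) bool {
		ti, tj := pods[i].CreationTimestamp, pods[j].CreationTimestamp
		if !ti.Equal(&tj) {
			return ti.Before(&tj)
		}
		return pods[i].Name < pods[j].Name
	})
}

func main() {
	created := metav1.Now()
	pods := []*corev1.Pod{
		{ObjectMeta: metav1.ObjectMeta{Name: "job-1", CreationTimestamp: created}},
		{ObjectMeta: metav1.ObjectMeta{Name: "job-0", CreationTimestamp: created}},
	}
	sortPodsByCreationThenName(pods)
	// Only pods[0] ("job-0" here) would pass the idx != 0 check in PreFilter.
	fmt.Println("first pod in the group listing:", pods[0].Name)
}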