From 4d826e4c32a0e86dcea71a5a067df96db72ba282 Mon Sep 17 00:00:00 2001 From: vsoch Date: Sun, 7 Jan 2024 21:37:58 -0700 Subject: [PATCH 01/28] docs: add section to README for developer Also tweak some docstrings for fluence. Signed-off-by: vsoch --- README.md | 10 ++++++++++ sig-scheduler-plugins/pkg/fluence/fluence.go | 10 +++++----- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 968c2dc..0e2b4c9 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,7 @@ For background on the Flux framework and the Fluxion scheduler, you can take a l - To deploy our pre-built images, go to [Deploy](#deploy) - To build your own images, go to [Setup](#setup) + - To learn about repository organization, see [Developer](#developer) ### Deploy @@ -431,6 +432,15 @@ For the above, I found [this page](https://kubernetes.io/docs/tasks/extend-kuber Finally, note that we also have a more appropriate example with jobs under [examples/test_example](examples/test_example). It's slightly more sane because it uses Job, and jobs are expected to complete (whereas pods are not and will get into crash loop backoffs, etc). For example of how to programmatically interact with the job pods and check states, events, see the [test.sh](.github/test.sh) script. +### Developer + +You can see [deploy](#deploy) for instructions on how to do a custom deployment. If you are looking to develop: + + - [src](src): includes source code for fluence + - [sig-scheduler-plugins](sig-scheduler-plugins): includes assets (manifests and Go files) that are intended to be added to the kubernetes-sigs/scheduler-plugins upstream repository before build + - *upstream*: the default name this upstream is cloned to when you do a make build command. + +Note that the clone of the repository and copying of files to the correct locations is all automated through the [Makefile](Makefile). This section exists to alert you to where to look for the different assets defined above. 
## Papers diff --git a/sig-scheduler-plugins/pkg/fluence/fluence.go b/sig-scheduler-plugins/pkg/fluence/fluence.go index fec0a35..a705e2c 100644 --- a/sig-scheduler-plugins/pkg/fluence/fluence.go +++ b/sig-scheduler-plugins/pkg/fluence/fluence.go @@ -62,7 +62,7 @@ func (f *Fluence) Name() string { return Name } -// initialize and return a new Flux Plugin +// Initialize and return a new Fluence Custom Scheduler Plugin // Note from vsoch: seems analogous to: // https://github.com/kubernetes-sigs/scheduler-plugins/blob/master/pkg/coscheduling/coscheduling.go#L63 func New(_ runtime.Object, handle framework.Handle) (framework.Plugin, error) { @@ -82,9 +82,9 @@ func New(_ runtime.Object, handle framework.Handle) (framework.Plugin, error) { klog.Info("Create generic pod informer") scheme := runtime.NewScheme() - _ = clientscheme.AddToScheme(scheme) - _ = v1.AddToScheme(scheme) - _ = v1alpha1.AddToScheme(scheme) + clientscheme.AddToScheme(scheme) + v1.AddToScheme(scheme) + v1alpha1.AddToScheme(scheme) client, err := client.New(handle.KubeConfig(), client.Options{Scheme: scheme}) if err != nil { return nil, err @@ -121,7 +121,7 @@ func New(_ runtime.Object, handle framework.Handle) (framework.Plugin, error) { return nil, err } - klog.Info("Fluence starts") + klog.Info("Fluence start") return f, nil } From 1026549c7fb96295b0cb205d63b236f5d0493691 Mon Sep 17 00:00:00 2001 From: vsoch Date: Sun, 7 Jan 2024 21:42:44 -0700 Subject: [PATCH 02/28] build: add commands to make for clone and update Problem: the local upstream might get out of date Solution: provide an easy make update to pull from it. Signed-off-by: vsoch --- Makefile | 11 ++++++++--- README.md | 12 +++++++++++- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index dc87d50..e20f706 100644 --- a/Makefile +++ b/Makefile @@ -10,15 +10,20 @@ SIDECAR_IMAGE ?= fluence-sidecar:latest CONTROLLER_IMAGE ?= fluence-controller SCHEDULER_IMAGE ?= fluence -.PHONY: all build build-sidecar prepare push push-sidecar push-controller +.PHONY: all build build-sidecar clone prepare push push-sidecar push-controller -all: build-sidecar prepare build +all: build-sidecar prepare build update clone update build-sidecar: make -C ./src LOCAL_REGISTRY=${REGISTRY} LOCAL_IMAGE=${SIDECAR_IMAGE} -prepare: +clone: if [ -d "$(CLONE_UPSTREAM)" ]; then echo "Upstream is cloned"; else git clone $(UPSTREAM) ./$(CLONE_UPSTREAM); fi + +update: clone + git -C $(CLONE_UPSTREAM) pull origin master + +prepare: clone # These are entirely new directory structures cp -R sig-scheduler-plugins/pkg/fluence $(CLONE_UPSTREAM)/pkg/fluence cp -R sig-scheduler-plugins/manifests/fluence $(CLONE_UPSTREAM)/manifests/fluence diff --git a/README.md b/README.md index 0e2b4c9..3aa66e6 100644 --- a/README.md +++ b/README.md @@ -440,7 +440,17 @@ You can see [deploy](#deploy) for instructions on how to do a custom deployment. - [sig-scheduler-plugins](sig-scheduler-plugins): includes assets (manifests and Go files) that are intended to be added to the kubernetes-sigs/scheduler-plugins upstream repository before build - *upstream*: the default name this upstream is cloned to when you do a make build command. -Note that the clone of the repository and copying of files to the correct locations is all automated through the [Makefile](Makefile). This section exists to alert you to where to look for the different assets defined above. +Note that the clone of the repository and copying of files to the correct locations is all automated through the [Makefile](Makefile). 
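Looping back to the `New()` constructor touched in the first patch above: the client it builds follows the usual controller-runtime pattern of assembling a `runtime.Scheme` from the API groups the plugin reads and writes, then creating a client from the config exposed by the scheduler framework handle. Below is a minimal, self-contained sketch of that pattern; the `buildFluenceClient` name, the `main` driver, and the explicit error checks are our own illustration (the patch calls `AddToScheme` without checking the returned errors) and not code taken verbatim from the tree.

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/runtime"
	clientscheme "k8s.io/client-go/kubernetes/scheme"
	"k8s.io/client-go/rest"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	sched "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1"
)

// buildFluenceClient sketches the client setup in fluence's New():
// register the built-in types plus the scheduling v1alpha1 group
// (which provides PodGroup) on a scheme, then create a
// controller-runtime client from a rest.Config.
func buildFluenceClient(cfg *rest.Config) (client.Client, error) {
	scheme := runtime.NewScheme()
	if err := clientscheme.AddToScheme(scheme); err != nil {
		return nil, fmt.Errorf("registering client-go types: %w", err)
	}
	if err := v1.AddToScheme(scheme); err != nil {
		return nil, fmt.Errorf("registering core/v1 types: %w", err)
	}
	if err := sched.AddToScheme(scheme); err != nil {
		return nil, fmt.Errorf("registering scheduling v1alpha1 types: %w", err)
	}
	return client.New(cfg, client.Options{Scheme: scheme})
}

func main() {
	// In the plugin this config comes from handle.KubeConfig();
	// here we load it the standard controller-runtime way.
	cfg, err := ctrl.GetConfig()
	if err != nil {
		panic(err)
	}
	if _, err := buildFluenceClient(cfg); err != nil {
		panic(err)
	}
	fmt.Println("client created")
}
```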
Additional commands provided include the following: + +```bash +# Only clone the repository into ./upstream +make clone + +# Update the cloned upstream with a git pull origin master +make update +``` + +It's recommend to update once in a while if you have an older clone locally and there might be changes you are not accounting for. ## Papers From 242d169f9785dfdf78306e1fa286896a5103c436 Mon Sep 17 00:00:00 2001 From: vsoch Date: Mon, 8 Jan 2024 18:18:53 -0700 Subject: [PATCH 03/28] fix: restore pod group Signed-off-by: vsoch There are too many edge cases / too much complexity and behavior that I do not understand to pursue having the pod group information cached with fluence. For now I am nuking it and testing the intial design as a sanity check. --- .gitignore | 3 + Makefile | 10 +- README.md | 111 ++- examples/indexed-jobs/job1.yaml | 21 + examples/indexed-jobs/job2.yaml | 21 + .../pkg/fluence/core/core.go | 16 - sig-scheduler-plugins/pkg/fluence/fluence.go | 138 +-- .../pkg/fluence/fluxcli-grpc/fluxcli.pb.go | 838 ------------------ .../pkg/fluence/fluxcli-grpc/fluxcli.proto | 76 -- .../fluence/fluxcli-grpc/fluxcli_grpc.pb.go | 139 --- .../pkg/fluence/utils/utils.go | 93 +- src/Makefile | 15 +- src/build/scheduler/Dockerfile | 1 + src/fluence/fluxcli-grpc/fluxcli.pb.go | 4 +- src/fluence/fluxcli-grpc/fluxcli.proto | 2 +- src/fluence/fluxcli-grpc/fluxcli_grpc.pb.go | 4 + src/fluence/fluxion/fluxion.go | 56 +- src/fluence/go.mod | 29 +- src/fluence/jgf/jgf.go | 221 ++--- src/fluence/jgf/types.go | 62 ++ src/fluence/jobspec/jobspec.go | 238 +++-- src/fluence/jobspec/types.go | 1 + src/fluence/utils/utils.go | 52 +- 23 files changed, 695 insertions(+), 1456 deletions(-) create mode 100644 examples/indexed-jobs/job1.yaml create mode 100644 examples/indexed-jobs/job2.yaml delete mode 100644 sig-scheduler-plugins/pkg/fluence/fluxcli-grpc/fluxcli.pb.go delete mode 100644 sig-scheduler-plugins/pkg/fluence/fluxcli-grpc/fluxcli.proto delete mode 100644 sig-scheduler-plugins/pkg/fluence/fluxcli-grpc/fluxcli_grpc.pb.go create mode 100644 src/fluence/jgf/types.go diff --git a/.gitignore b/.gitignore index fa1845c..51462a8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ plugins upstream scheduler-plugins +sig-scheduler-plugins/pkg/fluence/bin/ +src/bin +src/fluence/vendor \ No newline at end of file diff --git a/Makefile b/Makefile index e20f706..907f96e 100644 --- a/Makefile +++ b/Makefile @@ -10,9 +10,9 @@ SIDECAR_IMAGE ?= fluence-sidecar:latest CONTROLLER_IMAGE ?= fluence-controller SCHEDULER_IMAGE ?= fluence -.PHONY: all build build-sidecar clone prepare push push-sidecar push-controller +.PHONY: all build build-sidecar clone update push push-sidecar push-controller -all: build-sidecar prepare build update clone update +all: prepare build-sidecar build build-sidecar: make -C ./src LOCAL_REGISTRY=${REGISTRY} LOCAL_IMAGE=${SIDECAR_IMAGE} @@ -25,14 +25,18 @@ update: clone prepare: clone # These are entirely new directory structures + rm -rf $(CLONE_UPSTREAM)/pkg/fluence + rm -rf $(CLONE_UPSTREAM)/manifests/fluence cp -R sig-scheduler-plugins/pkg/fluence $(CLONE_UPSTREAM)/pkg/fluence cp -R sig-scheduler-plugins/manifests/fluence $(CLONE_UPSTREAM)/manifests/fluence + # This is the one exception not from sig-scheduler-plugins because it is needed in both spots + cp -R src/fluence/fluxcli-grpc $(CLONE_UPSTREAM)/pkg/fluence/fluxcli-grpc # These are files with subtle changes to add fluence cp sig-scheduler-plugins/cmd/scheduler/main.go ./upstream/cmd/scheduler/main.go cp 
sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/deployment.yaml $(CLONE_UPSTREAM)/manifests/install/charts/as-a-second-scheduler/templates/deployment.yaml cp sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/values.yaml $(CLONE_UPSTREAM)/manifests/install/charts/as-a-second-scheduler/values.yaml -build: +build: prepare REGISTRY=${REGISTRY} IMAGE=${SCHEDULER_IMAGE} CONTROLLER_IMAGE=${CONTROLLER_IMAGE} $(BASH) $(CLONE_UPSTREAM)/hack/build-images.sh push-sidecar: diff --git a/README.md b/README.md index 3aa66e6..4431050 100644 --- a/README.md +++ b/README.md @@ -9,10 +9,48 @@ Fluence enables HPC-grade pod scheduling in Kubernetes via the [Kubernetes Sched For instructions on how to start Fluence on a K8s cluster, see [examples](examples/). Documentation and instructions for reproducing our CANOPIE2022 paper (citation below) can be found in the [canopie22-artifacts branch](https://github.com/flux-framework/flux-k8s/tree/canopie22-artifacts). For background on the Flux framework and the Fluxion scheduler, you can take a look at our award-winning R&D100 submission: https://ipo.llnl.gov/sites/default/files/2022-02/Flux_RD100_Final.pdf. For next steps: + - To understand how it works, see [Design](#design) - To deploy our pre-built images, go to [Deploy](#deploy) - To build your own images, go to [Setup](#setup) - To learn about repository organization, see [Developer](#developer) +### Design + +Fluence is a custom scheduler plugin that you can specify to use with two directive in your pod spec - + +- Asking for `fluence` as the scheduler name +- Defining a named group of pods with the `fluence.flux-framework.org/pod-group` label. +- Defining the group size with the `fluence.flux-framework.org/group-size` label. + +If you are using Fluence, these values are required. +An example is shown below for an indexed job, which will create multiple pods. + +```yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: fluence-job + annotations: + fluence.flux-framework.org/pod-group: my-pods + fluence.flux-framework.org/group-size: 10 +spec: + completions: 10 + parallelism: 10 + completionMode: Indexed + template: + spec: + schedulerName: fluence + containers: + - name: fluence-job + image: busybox + command: [echo, potato] + restartPolicy: Never + backoffLimit: 4 +``` + +The group size might be different than, for example, your higher level abstraction (e.g., the IndexedJob) as there is no reason +pods with different names cannot be part of the same group that needs to be scheduled together. + ### Deploy We provide a set of pre-build containers [alongside the repository](https://github.com/orgs/flux-framework/packages?repo_name=flux-k8s) @@ -434,10 +472,17 @@ Finally, note that we also have a more appropriate example with jobs under [exam ### Developer -You can see [deploy](#deploy) for instructions on how to do a custom deployment. If you are looking to develop: +You can see [deploy](#deploy) for instructions on how to do a custom deployment. + +#### Organization + +If you are looking to develop: - - [src](src): includes source code for fluence - - [sig-scheduler-plugins](sig-scheduler-plugins): includes assets (manifests and Go files) that are intended to be added to the kubernetes-sigs/scheduler-plugins upstream repository before build + - [src](src): includes source code for fluence. You'll find logs for this code in the `sidecar` container of the fluence pod. 
+ - [sig-scheduler-plugins](sig-scheduler-plugins): includes assets (manifests and Go files) that are intended to be added to the kubernetes-sigs/scheduler-plugins upstream repository before build. You'll find logs for this container in the `scheduler-plugins-scheduler` container of the pod. + - [manifests](sig-scheduler-plugins/manifests): manifests for helm and Kubernetes + - [pkg](sig-scheduler-plugins/pkg): the main fluence module to add to upstream + - [cmd](sig-scheduler-plugins/cmd): the main.go to replace in upstream - *upstream*: the default name this upstream is cloned to when you do a make build command. Note that the clone of the repository and copying of files to the correct locations is all automated through the [Makefile](Makefile). Additional commands provided include the following: @@ -452,6 +497,66 @@ make update It's recommend to update once in a while if you have an older clone locally and there might be changes you are not accounting for. +#### GRPC + +The fluence module uses GRPC to communicate with Flux, and these assets are stored in [src/fluence/fluxcli-grpc](src/fluence/fluxcli-grpc). +You should *only* update the [sig-scheduler-plugins/pkg/fluence/fluxcli-grpc/fluxcli.proto](src/fluence/fluxcli-grpc/fluxcli.proto) file, +and then from the root run `make proto` to re-generate the other files: + +```bash +cd src + +# Install protoc tools to local bin +# make protoc +make proto +``` + +#### Workflow + +The easiest thing to do is to build the containers in some container namespace that you control (meaning you can push to a registry), e.g.,: + +```bash +make build REGISTRY=ghcr.io/vsoch +``` + +And then install with your custom images: + +``` +cd ./upstream/manifests/install/charts +helm install \ + --set scheduler.image=ghcr.io/vsoch/fluence:latest \ + --set scheduler.sidecarimage=ghcr.io/vsoch/fluence-sidecar:latest \ + schedscheduler-plugins as-a-second-scheduler/ +``` + +And then apply what you need to test, and look at logs! +And then keep doing that until you get what you want :) Note that I haven't found a good way for the VSCode developer tools to work because we develop fluence outside of the tree it's supposed to be in. + +#### Components + + - [FluxStateData](sig-scheduler-plugins/pkg/fluence/core/core.go): is given to the [framework.CycleState](https://github.com/kubernetes/kubernetes/blob/242b41b36a20032f99e8a059ca0a5d764105217b/pkg/scheduler/framework/cycle_state.go#L48) and serves as a vehicle to store a cache of node name assignment. + + +#### Helm + +The install commands are shown above, but often you want to uninstall! + +> What is the name of the installed plugin again? 
+ +```bash + helm list +NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION +schedscheduler-plugins default 1 2024-01-08 12:04:58.558612156 -0700 MST deployed scheduler-plugins-0.27.80.27.8 +``` + +And then uninstall: + +```bash +$ helm uninstall schedscheduler-plugins +release "schedscheduler-plugins" uninstalled +``` + + ## Papers You can find details of Fluence architecture, implementation, experiments, and improvements to the Kubeflow MPI operator in our collaboration's papers: diff --git a/examples/indexed-jobs/job1.yaml b/examples/indexed-jobs/job1.yaml new file mode 100644 index 0000000..5778bc3 --- /dev/null +++ b/examples/indexed-jobs/job1.yaml @@ -0,0 +1,21 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: job-1 +spec: + completions: 10 + parallelism: 10 + completionMode: Indexed + template: + metadata: + labels: + fluence.pod-group: job-1 + fluence.group-size: "5" + spec: + schedulerName: fluence + containers: + - name: fluence-job + image: busybox + command: [sleep, "10"] + restartPolicy: Never + backoffLimit: 4 diff --git a/examples/indexed-jobs/job2.yaml b/examples/indexed-jobs/job2.yaml new file mode 100644 index 0000000..3d77660 --- /dev/null +++ b/examples/indexed-jobs/job2.yaml @@ -0,0 +1,21 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: job-2 +spec: + completions: 5 + parallelism: 5 + completionMode: Indexed + template: + metadata: + labels: + fluence.pod-group: job-2 + fluence.group-size: "5" + spec: + schedulerName: fluence + containers: + - name: fluence-job + image: busybox + command: [sleep, "10"] + restartPolicy: Never + backoffLimit: 4 diff --git a/sig-scheduler-plugins/pkg/fluence/core/core.go b/sig-scheduler-plugins/pkg/fluence/core/core.go index 11c90ef..5914441 100644 --- a/sig-scheduler-plugins/pkg/fluence/core/core.go +++ b/sig-scheduler-plugins/pkg/fluence/core/core.go @@ -1,19 +1,3 @@ -/* -Copyright 2022 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - package core import ( diff --git a/sig-scheduler-plugins/pkg/fluence/fluence.go b/sig-scheduler-plugins/pkg/fluence/fluence.go index a705e2c..32fd513 100644 --- a/sig-scheduler-plugins/pkg/fluence/fluence.go +++ b/sig-scheduler-plugins/pkg/fluence/fluence.go @@ -36,8 +36,9 @@ import ( "k8s.io/kubernetes/pkg/scheduler/framework" "k8s.io/kubernetes/pkg/scheduler/metrics" + corelisters "k8s.io/client-go/listers/core/v1" "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1" + sched "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1" coschedulingcore "sigs.k8s.io/scheduler-plugins/pkg/coscheduling/core" fcore "sigs.k8s.io/scheduler-plugins/pkg/fluence/core" pb "sigs.k8s.io/scheduler-plugins/pkg/fluence/fluxcli-grpc" @@ -47,27 +48,38 @@ import ( type Fluence struct { mutex sync.Mutex handle framework.Handle + client client.Client podNameToJobId map[string]uint64 pgMgr coschedulingcore.Manager -} -var _ framework.QueueSortPlugin = &Fluence{} -var _ framework.PreFilterPlugin = &Fluence{} -var _ framework.FilterPlugin = &Fluence{} + // The pod group manager has a lister, but it's private + podLister corelisters.PodLister +} // Name is the name of the plugin used in the Registry and configurations. -const Name = "Fluence" +// Note that this would do better as an annotation (fluence.flux-framework.org/pod-group) +// But we cannot use them as selectors then! +const ( + Name = "Fluence" +) + +var ( + _ framework.QueueSortPlugin = &Fluence{} + _ framework.PreFilterPlugin = &Fluence{} + _ framework.FilterPlugin = &Fluence{} +) func (f *Fluence) Name() string { return Name } // Initialize and return a new Fluence Custom Scheduler Plugin -// Note from vsoch: seems analogous to: +// This class and functions are analogous to: // https://github.com/kubernetes-sigs/scheduler-plugins/blob/master/pkg/coscheduling/coscheduling.go#L63 func New(_ runtime.Object, handle framework.Handle) (framework.Plugin, error) { f := &Fluence{handle: handle, podNameToJobId: make(map[string]uint64)} + klog.Info("Create plugin") ctx := context.TODO() fcore.Init() @@ -84,32 +96,38 @@ func New(_ runtime.Object, handle framework.Handle) (framework.Plugin, error) { scheme := runtime.NewScheme() clientscheme.AddToScheme(scheme) v1.AddToScheme(scheme) - v1alpha1.AddToScheme(scheme) - client, err := client.New(handle.KubeConfig(), client.Options{Scheme: scheme}) + sched.AddToScheme(scheme) + k8scli, err := client.New(handle.KubeConfig(), client.Options{Scheme: scheme}) if err != nil { return nil, err } + // Save the kubernetes client for fluence to interact with cluster objects + f.client = k8scli + fieldSelector, err := fields.ParseSelector(",status.phase!=" + string(v1.PodSucceeded) + ",status.phase!=" + string(v1.PodFailed)) if err != nil { klog.ErrorS(err, "ParseSelector failed") os.Exit(1) } + informerFactory := informers.NewSharedInformerFactoryWithOptions(handle.ClientSet(), 0, informers.WithTweakListOptions(func(opt *metav1.ListOptions) { opt.FieldSelector = fieldSelector.String() })) podInformer := informerFactory.Core().V1().Pods() - scheduleTimeDuration := time.Duration(500) * time.Second pgMgr := coschedulingcore.NewPodGroupManager( - client, + k8scli, handle.SnapshotSharedLister(), &scheduleTimeDuration, podInformer, ) f.pgMgr = pgMgr + // Save the podLister to fluence to easily query for the group + f.podLister = podInformer.Lister() + // stopCh := make(chan struct{}) // defer close(stopCh) // informerFactory.Start(stopCh) @@ -127,8 +145,9 @@ func New(_ 
runtime.Object, handle framework.Handle) (framework.Plugin, error) { // Less is used to sort pods in the scheduling queue in the following order. // 1. Compare the priorities of Pods. -// 2. Compare the initialization timestamps of PodGroups or Pods. -// 3. Compare the keys of PodGroups/Pods: /. +// 2. Compare the initialization timestamps of fluence pod groups +// 3. Fall back, sort by namespace/name +// See https://kubernetes.io/docs/concepts/scheduling-eviction/scheduling-framework/ func (f *Fluence) Less(podInfo1, podInfo2 *framework.QueuedPodInfo) bool { klog.Infof("ordering pods from Coscheduling") prio1 := corev1helpers.PodPriority(podInfo1.Pod) @@ -144,59 +163,72 @@ func (f *Fluence) Less(podInfo1, podInfo2 *framework.QueuedPodInfo) bool { return creationTime1.Before(creationTime2) } -func (f *Fluence) PreFilter(ctx context.Context, state *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) { +// getPodGroup gets the group information from the pod group manager +// to determine if a pod is in a group. We return the group +func (f *Fluence) getPodGroup(ctx context.Context, pod *v1.Pod) (string, *sched.PodGroup) { + pgName, pg := f.pgMgr.GetPodGroup(ctx, pod) + if pg == nil { + klog.InfoS("Not in group", "pod", klog.KObj(pod)) + } + return pgName, pg +} + +// PreFilter checks info about the Pod / checks conditions that the cluster or the Pod must meet. +// This still comes after sort +func (f *Fluence) PreFilter( + ctx context.Context, + state *framework.CycleState, + pod *v1.Pod, +) (*framework.PreFilterResult, *framework.Status) { + + var ( + err error + nodename string + ) klog.Infof("Examining the pod") - var err error - var nodename string - if pgname, ok := f.isGroup(ctx, pod); ok { - if !fcore.HaveList(pgname) { + + // Get the pod group name and group + groupName, pg := f.getPodGroup(ctx, pod) + klog.Infof("group name is %s", groupName) + + // Case 1: We have a pod group + if pg != nil { + + // We have not yet derived a node list + if !fcore.HaveList(groupName) { klog.Infof("Getting a pod group") - groupSize, _ := f.groupPreFilter(ctx, pod) - if _, err = f.AskFlux(ctx, pod, groupSize); err != nil { + if _, err = f.AskFlux(ctx, pod, int(pg.Spec.MinMember)); err != nil { return nil, framework.NewStatus(framework.Unschedulable, err.Error()) } } - nodename, err = fcore.GetNextNode(pgname) - klog.Infof("Node Selected %s (%s:%s)", nodename, pod.Name, pgname) + nodename, err = fcore.GetNextNode(groupName) + klog.Infof("Node Selected %s (%s:%s)", nodename, pod.Name, groupName) if err != nil { return nil, framework.NewStatus(framework.Unschedulable, err.Error()) } + } else { + + // Case 2: no group, a faux group of a lonely 1 :( nodename, err = f.AskFlux(ctx, pod, 1) if err != nil { return nil, framework.NewStatus(framework.Unschedulable, err.Error()) } } + // Create a fluxState (CycleState) with things that might be useful/ klog.Info("Node Selected: ", nodename) state.Write(framework.StateKey(pod.Name), &fcore.FluxStateData{NodeName: nodename}) return nil, framework.NewStatus(framework.Success, "") - -} - -func (f *Fluence) isGroup(ctx context.Context, pod *v1.Pod) (string, bool) { - pgFullName, pg := f.pgMgr.GetPodGroup(ctx, pod) - if pg == nil { - klog.InfoS("Not in group", "pod", klog.KObj(pod)) - return "", false - } - return pgFullName, true } -func (f *Fluence) groupPreFilter(ctx context.Context, pod *v1.Pod) (int, error) { - // klog.InfoS("Flux Pre-Filter", "pod", klog.KObj(pod)) - klog.InfoS("Flux Pre-Filter", "pod labels", pod.Labels) - _, pg := 
f.pgMgr.GetPodGroup(ctx, pod) - if pg == nil { - klog.InfoS("Not in group", "pod", klog.KObj(pod)) - return 0, nil - } - - klog.Info("pod group members ", pg.Spec.MinMember) - return int(pg.Spec.MinMember), nil -} +func (f *Fluence) Filter( + ctx context.Context, + cycleState *framework.CycleState, + pod *v1.Pod, + nodeInfo *framework.NodeInfo, +) *framework.Status { -func (f *Fluence) Filter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status { klog.Info("Filtering input node ", nodeInfo.Node().Name) if v, e := cycleState.Read(framework.StateKey(pod.Name)); e == nil { if value, ok := v.(*fcore.FluxStateData); ok && value.NodeName != nodeInfo.Node().Name { @@ -205,14 +237,16 @@ func (f *Fluence) Filter(ctx context.Context, cycleState *framework.CycleState, klog.Info("Filter: node selected by Flux ", value.NodeName) } } - return framework.NewStatus(framework.Success) } +// PreFilterExtensions allow for callbacks on filtered states +// https://github.com/kubernetes/kubernetes/blob/master/pkg/scheduler/framework/interface.go#L383 func (f *Fluence) PreFilterExtensions() framework.PreFilterExtensions { return nil } +// AskFlux will ask flux for an allocation for nodes for the pod group. func (f *Fluence) AskFlux(ctx context.Context, pod *v1.Pod, count int) (string, error) { // clean up previous match if a pod has already allocated previously f.mutex.Lock() @@ -252,8 +286,9 @@ func (f *Fluence) AskFlux(ctx context.Context, pod *v1.Pod, count int) (string, klog.Infof("[FluxClient] response podID %s", r.GetPodID()) - _, ok := f.isGroup(ctx, pod) - if count > 1 || ok { + _, pg := f.getPodGroup(ctx, pod) + + if count > 1 || pg != nil { pgFullName, _ := f.pgMgr.GetPodGroup(ctx, pod) nodelist := fcore.CreateNodePodsList(r.GetNodelist(), pgFullName) klog.Infof("[FluxClient] response nodeID %s", r.GetNodelist()) @@ -279,6 +314,7 @@ func (f *Fluence) AskFlux(ctx context.Context, pod *v1.Pod, count int) (string, return "", nil } +// cancelFluxJobForPod cancels the flux job for a pod. func (f *Fluence) cancelFluxJobForPod(podName string) error { jobid := f.podNameToJobId[podName] @@ -325,11 +361,13 @@ func (f *Fluence) cancelFluxJobForPod(podName string) error { return nil } -// EventHandlers +// EventHandlers updatePod handles cleaning up resources func (f *Fluence) updatePod(oldObj, newObj interface{}) { // klog.Info("Update Pod event handler") newPod := newObj.(*v1.Pod) - klog.Infof("Processing event for pod %s", newPod) + + klog.Infof("Processing event for pod %s", newPod.Name) + switch newPod.Status.Phase { case v1.PodPending: // in this state we don't know if a pod is going to be running, thus we don't need to update job map diff --git a/sig-scheduler-plugins/pkg/fluence/fluxcli-grpc/fluxcli.pb.go b/sig-scheduler-plugins/pkg/fluence/fluxcli-grpc/fluxcli.pb.go deleted file mode 100644 index e317af2..0000000 --- a/sig-scheduler-plugins/pkg/fluence/fluxcli-grpc/fluxcli.pb.go +++ /dev/null @@ -1,838 +0,0 @@ -// Code generated by protoc-gen-go. DO NOT EDIT. -// versions: -// protoc-gen-go v1.26.0 -// protoc v3.15.8 -// source: fluence/fluxcli-grpc/fluxcli.proto - -package fluxcli - -import ( - protoreflect "google.golang.org/protobuf/reflect/protoreflect" - protoimpl "google.golang.org/protobuf/runtime/protoimpl" - reflect "reflect" - sync "sync" -) - -const ( - // Verify that this generated code is sufficiently up-to-date. - _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) - // Verify that runtime/protoimpl is sufficiently up-to-date. 
- _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) -) - -type PodSpec struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"` - Container string `protobuf:"bytes,2,opt,name=container,proto3" json:"container,omitempty"` - Cpu int32 `protobuf:"varint,3,opt,name=cpu,proto3" json:"cpu,omitempty"` - Memory int64 `protobuf:"varint,4,opt,name=memory,proto3" json:"memory,omitempty"` - Gpu int64 `protobuf:"varint,5,opt,name=gpu,proto3" json:"gpu,omitempty"` - Storage int64 `protobuf:"varint,6,opt,name=storage,proto3" json:"storage,omitempty"` - Labels []string `protobuf:"bytes,7,rep,name=labels,proto3" json:"labels,omitempty"` -} - -func (x *PodSpec) Reset() { - *x = PodSpec{} - if protoimpl.UnsafeEnabled { - mi := &file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[0] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *PodSpec) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*PodSpec) ProtoMessage() {} - -func (x *PodSpec) ProtoReflect() protoreflect.Message { - mi := &file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[0] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use PodSpec.ProtoReflect.Descriptor instead. -func (*PodSpec) Descriptor() ([]byte, []int) { - return file_fluence_fluxcli_grpc_fluxcli_proto_rawDescGZIP(), []int{0} -} - -func (x *PodSpec) GetId() string { - if x != nil { - return x.Id - } - return "" -} - -func (x *PodSpec) GetContainer() string { - if x != nil { - return x.Container - } - return "" -} - -func (x *PodSpec) GetCpu() int32 { - if x != nil { - return x.Cpu - } - return 0 -} - -func (x *PodSpec) GetMemory() int64 { - if x != nil { - return x.Memory - } - return 0 -} - -func (x *PodSpec) GetGpu() int64 { - if x != nil { - return x.Gpu - } - return 0 -} - -func (x *PodSpec) GetStorage() int64 { - if x != nil { - return x.Storage - } - return 0 -} - -func (x *PodSpec) GetLabels() []string { - if x != nil { - return x.Labels - } - return nil -} - -// The Match request message (allocate, allocate_orelse_reserve) -type MatchRequest struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - Ps *PodSpec `protobuf:"bytes,1,opt,name=ps,proto3" json:"ps,omitempty"` - Request string `protobuf:"bytes,2,opt,name=request,proto3" json:"request,omitempty"` - Count int32 `protobuf:"varint,3,opt,name=count,proto3" json:"count,omitempty"` -} - -func (x *MatchRequest) Reset() { - *x = MatchRequest{} - if protoimpl.UnsafeEnabled { - mi := &file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[1] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *MatchRequest) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*MatchRequest) ProtoMessage() {} - -func (x *MatchRequest) ProtoReflect() protoreflect.Message { - mi := &file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[1] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use MatchRequest.ProtoReflect.Descriptor instead. 
-func (*MatchRequest) Descriptor() ([]byte, []int) { - return file_fluence_fluxcli_grpc_fluxcli_proto_rawDescGZIP(), []int{1} -} - -func (x *MatchRequest) GetPs() *PodSpec { - if x != nil { - return x.Ps - } - return nil -} - -func (x *MatchRequest) GetRequest() string { - if x != nil { - return x.Request - } - return "" -} - -func (x *MatchRequest) GetCount() int32 { - if x != nil { - return x.Count - } - return 0 -} - -// The Nodes/Cluster Update Status -type NodeAlloc struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - NodeID string `protobuf:"bytes,1,opt,name=nodeID,proto3" json:"nodeID,omitempty"` - Tasks int32 `protobuf:"varint,2,opt,name=tasks,proto3" json:"tasks,omitempty"` -} - -func (x *NodeAlloc) Reset() { - *x = NodeAlloc{} - if protoimpl.UnsafeEnabled { - mi := &file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[2] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *NodeAlloc) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*NodeAlloc) ProtoMessage() {} - -func (x *NodeAlloc) ProtoReflect() protoreflect.Message { - mi := &file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[2] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use NodeAlloc.ProtoReflect.Descriptor instead. -func (*NodeAlloc) Descriptor() ([]byte, []int) { - return file_fluence_fluxcli_grpc_fluxcli_proto_rawDescGZIP(), []int{2} -} - -func (x *NodeAlloc) GetNodeID() string { - if x != nil { - return x.NodeID - } - return "" -} - -func (x *NodeAlloc) GetTasks() int32 { - if x != nil { - return x.Tasks - } - return 0 -} - -// The Match response message -type MatchResponse struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - PodID string `protobuf:"bytes,1,opt,name=podID,proto3" json:"podID,omitempty"` - Nodelist []*NodeAlloc `protobuf:"bytes,2,rep,name=nodelist,proto3" json:"nodelist,omitempty"` - JobID int64 `protobuf:"varint,3,opt,name=jobID,proto3" json:"jobID,omitempty"` -} - -func (x *MatchResponse) Reset() { - *x = MatchResponse{} - if protoimpl.UnsafeEnabled { - mi := &file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[3] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *MatchResponse) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*MatchResponse) ProtoMessage() {} - -func (x *MatchResponse) ProtoReflect() protoreflect.Message { - mi := &file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[3] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use MatchResponse.ProtoReflect.Descriptor instead. 
-func (*MatchResponse) Descriptor() ([]byte, []int) { - return file_fluence_fluxcli_grpc_fluxcli_proto_rawDescGZIP(), []int{3} -} - -func (x *MatchResponse) GetPodID() string { - if x != nil { - return x.PodID - } - return "" -} - -func (x *MatchResponse) GetNodelist() []*NodeAlloc { - if x != nil { - return x.Nodelist - } - return nil -} - -func (x *MatchResponse) GetJobID() int64 { - if x != nil { - return x.JobID - } - return 0 -} - -type CancelRequest struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - JobID int64 `protobuf:"varint,2,opt,name=jobID,proto3" json:"jobID,omitempty"` -} - -func (x *CancelRequest) Reset() { - *x = CancelRequest{} - if protoimpl.UnsafeEnabled { - mi := &file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[4] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *CancelRequest) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*CancelRequest) ProtoMessage() {} - -func (x *CancelRequest) ProtoReflect() protoreflect.Message { - mi := &file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[4] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use CancelRequest.ProtoReflect.Descriptor instead. -func (*CancelRequest) Descriptor() ([]byte, []int) { - return file_fluence_fluxcli_grpc_fluxcli_proto_rawDescGZIP(), []int{4} -} - -func (x *CancelRequest) GetJobID() int64 { - if x != nil { - return x.JobID - } - return 0 -} - -// The Match response message -type CancelResponse struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - JobID int64 `protobuf:"varint,1,opt,name=jobID,proto3" json:"jobID,omitempty"` - Error int32 `protobuf:"varint,2,opt,name=error,proto3" json:"error,omitempty"` -} - -func (x *CancelResponse) Reset() { - *x = CancelResponse{} - if protoimpl.UnsafeEnabled { - mi := &file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[5] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *CancelResponse) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*CancelResponse) ProtoMessage() {} - -func (x *CancelResponse) ProtoReflect() protoreflect.Message { - mi := &file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[5] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use CancelResponse.ProtoReflect.Descriptor instead. 
-func (*CancelResponse) Descriptor() ([]byte, []int) { - return file_fluence_fluxcli_grpc_fluxcli_proto_rawDescGZIP(), []int{5} -} - -func (x *CancelResponse) GetJobID() int64 { - if x != nil { - return x.JobID - } - return 0 -} - -func (x *CancelResponse) GetError() int32 { - if x != nil { - return x.Error - } - return 0 -} - -// The Nodes/Cluster Update Status -type NodeStatus struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - CpuAvail int32 `protobuf:"varint,1,opt,name=cpuAvail,proto3" json:"cpuAvail,omitempty"` - GpuAvail int32 `protobuf:"varint,2,opt,name=gpuAvail,proto3" json:"gpuAvail,omitempty"` - StorageAvail int64 `protobuf:"varint,3,opt,name=storageAvail,proto3" json:"storageAvail,omitempty"` - MemoryAvail int64 `protobuf:"varint,4,opt,name=memoryAvail,proto3" json:"memoryAvail,omitempty"` - AllowedPods int64 `protobuf:"varint,5,opt,name=allowedPods,proto3" json:"allowedPods,omitempty"` - NodeIP string `protobuf:"bytes,6,opt,name=nodeIP,proto3" json:"nodeIP,omitempty"` - Replication int32 `protobuf:"varint,7,opt,name=replication,proto3" json:"replication,omitempty"` -} - -func (x *NodeStatus) Reset() { - *x = NodeStatus{} - if protoimpl.UnsafeEnabled { - mi := &file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[6] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *NodeStatus) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*NodeStatus) ProtoMessage() {} - -func (x *NodeStatus) ProtoReflect() protoreflect.Message { - mi := &file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[6] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use NodeStatus.ProtoReflect.Descriptor instead. 
-func (*NodeStatus) Descriptor() ([]byte, []int) { - return file_fluence_fluxcli_grpc_fluxcli_proto_rawDescGZIP(), []int{6} -} - -func (x *NodeStatus) GetCpuAvail() int32 { - if x != nil { - return x.CpuAvail - } - return 0 -} - -func (x *NodeStatus) GetGpuAvail() int32 { - if x != nil { - return x.GpuAvail - } - return 0 -} - -func (x *NodeStatus) GetStorageAvail() int64 { - if x != nil { - return x.StorageAvail - } - return 0 -} - -func (x *NodeStatus) GetMemoryAvail() int64 { - if x != nil { - return x.MemoryAvail - } - return 0 -} - -func (x *NodeStatus) GetAllowedPods() int64 { - if x != nil { - return x.AllowedPods - } - return 0 -} - -func (x *NodeStatus) GetNodeIP() string { - if x != nil { - return x.NodeIP - } - return "" -} - -func (x *NodeStatus) GetReplication() int32 { - if x != nil { - return x.Replication - } - return 0 -} - -// The JGF response message -type JGFRequest struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - Jgf string `protobuf:"bytes,1,opt,name=jgf,proto3" json:"jgf,omitempty"` -} - -func (x *JGFRequest) Reset() { - *x = JGFRequest{} - if protoimpl.UnsafeEnabled { - mi := &file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[7] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *JGFRequest) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*JGFRequest) ProtoMessage() {} - -func (x *JGFRequest) ProtoReflect() protoreflect.Message { - mi := &file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[7] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use JGFRequest.ProtoReflect.Descriptor instead. -func (*JGFRequest) Descriptor() ([]byte, []int) { - return file_fluence_fluxcli_grpc_fluxcli_proto_rawDescGZIP(), []int{7} -} - -func (x *JGFRequest) GetJgf() string { - if x != nil { - return x.Jgf - } - return "" -} - -// The JGF response message -type JGFResponse struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - Jgf string `protobuf:"bytes,1,opt,name=jgf,proto3" json:"jgf,omitempty"` -} - -func (x *JGFResponse) Reset() { - *x = JGFResponse{} - if protoimpl.UnsafeEnabled { - mi := &file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[8] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *JGFResponse) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*JGFResponse) ProtoMessage() {} - -func (x *JGFResponse) ProtoReflect() protoreflect.Message { - mi := &file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[8] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use JGFResponse.ProtoReflect.Descriptor instead. 
-func (*JGFResponse) Descriptor() ([]byte, []int) { - return file_fluence_fluxcli_grpc_fluxcli_proto_rawDescGZIP(), []int{8} -} - -func (x *JGFResponse) GetJgf() string { - if x != nil { - return x.Jgf - } - return "" -} - -var File_fluence_fluxcli_grpc_fluxcli_proto protoreflect.FileDescriptor - -var file_fluence_fluxcli_grpc_fluxcli_proto_rawDesc = []byte{ - 0x0a, 0x22, 0x66, 0x6c, 0x75, 0x65, 0x6e, 0x63, 0x65, 0x2f, 0x66, 0x6c, 0x75, 0x78, 0x63, 0x6c, - 0x69, 0x2d, 0x67, 0x72, 0x70, 0x63, 0x2f, 0x66, 0x6c, 0x75, 0x78, 0x63, 0x6c, 0x69, 0x2e, 0x70, - 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x07, 0x66, 0x6c, 0x75, 0x78, 0x63, 0x6c, 0x69, 0x22, 0xa5, 0x01, - 0x0a, 0x07, 0x50, 0x6f, 0x64, 0x53, 0x70, 0x65, 0x63, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, - 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x1c, 0x0a, 0x09, 0x63, 0x6f, 0x6e, - 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x09, 0x63, 0x6f, - 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x12, 0x10, 0x0a, 0x03, 0x63, 0x70, 0x75, 0x18, 0x03, - 0x20, 0x01, 0x28, 0x05, 0x52, 0x03, 0x63, 0x70, 0x75, 0x12, 0x16, 0x0a, 0x06, 0x6d, 0x65, 0x6d, - 0x6f, 0x72, 0x79, 0x18, 0x04, 0x20, 0x01, 0x28, 0x03, 0x52, 0x06, 0x6d, 0x65, 0x6d, 0x6f, 0x72, - 0x79, 0x12, 0x10, 0x0a, 0x03, 0x67, 0x70, 0x75, 0x18, 0x05, 0x20, 0x01, 0x28, 0x03, 0x52, 0x03, - 0x67, 0x70, 0x75, 0x12, 0x18, 0x0a, 0x07, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x18, 0x06, - 0x20, 0x01, 0x28, 0x03, 0x52, 0x07, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x12, 0x16, 0x0a, - 0x06, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x73, 0x18, 0x07, 0x20, 0x03, 0x28, 0x09, 0x52, 0x06, 0x6c, - 0x61, 0x62, 0x65, 0x6c, 0x73, 0x22, 0x60, 0x0a, 0x0c, 0x4d, 0x61, 0x74, 0x63, 0x68, 0x52, 0x65, - 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x20, 0x0a, 0x02, 0x70, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, - 0x0b, 0x32, 0x10, 0x2e, 0x66, 0x6c, 0x75, 0x78, 0x63, 0x6c, 0x69, 0x2e, 0x50, 0x6f, 0x64, 0x53, - 0x70, 0x65, 0x63, 0x52, 0x02, 0x70, 0x73, 0x12, 0x18, 0x0a, 0x07, 0x72, 0x65, 0x71, 0x75, 0x65, - 0x73, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x07, 0x72, 0x65, 0x71, 0x75, 0x65, 0x73, - 0x74, 0x12, 0x14, 0x0a, 0x05, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x18, 0x03, 0x20, 0x01, 0x28, 0x05, - 0x52, 0x05, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x22, 0x39, 0x0a, 0x09, 0x4e, 0x6f, 0x64, 0x65, 0x41, - 0x6c, 0x6c, 0x6f, 0x63, 0x12, 0x16, 0x0a, 0x06, 0x6e, 0x6f, 0x64, 0x65, 0x49, 0x44, 0x18, 0x01, - 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x6e, 0x6f, 0x64, 0x65, 0x49, 0x44, 0x12, 0x14, 0x0a, 0x05, - 0x74, 0x61, 0x73, 0x6b, 0x73, 0x18, 0x02, 0x20, 0x01, 0x28, 0x05, 0x52, 0x05, 0x74, 0x61, 0x73, - 0x6b, 0x73, 0x22, 0x6b, 0x0a, 0x0d, 0x4d, 0x61, 0x74, 0x63, 0x68, 0x52, 0x65, 0x73, 0x70, 0x6f, - 0x6e, 0x73, 0x65, 0x12, 0x14, 0x0a, 0x05, 0x70, 0x6f, 0x64, 0x49, 0x44, 0x18, 0x01, 0x20, 0x01, - 0x28, 0x09, 0x52, 0x05, 0x70, 0x6f, 0x64, 0x49, 0x44, 0x12, 0x2e, 0x0a, 0x08, 0x6e, 0x6f, 0x64, - 0x65, 0x6c, 0x69, 0x73, 0x74, 0x18, 0x02, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x66, 0x6c, - 0x75, 0x78, 0x63, 0x6c, 0x69, 0x2e, 0x4e, 0x6f, 0x64, 0x65, 0x41, 0x6c, 0x6c, 0x6f, 0x63, 0x52, - 0x08, 0x6e, 0x6f, 0x64, 0x65, 0x6c, 0x69, 0x73, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x6a, 0x6f, 0x62, - 0x49, 0x44, 0x18, 0x03, 0x20, 0x01, 0x28, 0x03, 0x52, 0x05, 0x6a, 0x6f, 0x62, 0x49, 0x44, 0x22, - 0x25, 0x0a, 0x0d, 0x43, 0x61, 0x6e, 0x63, 0x65, 0x6c, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, - 0x12, 0x14, 0x0a, 0x05, 0x6a, 0x6f, 0x62, 0x49, 0x44, 0x18, 0x02, 0x20, 0x01, 0x28, 0x03, 0x52, - 0x05, 0x6a, 0x6f, 0x62, 0x49, 0x44, 0x22, 0x3c, 0x0a, 
0x0e, 0x43, 0x61, 0x6e, 0x63, 0x65, 0x6c, - 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x14, 0x0a, 0x05, 0x6a, 0x6f, 0x62, 0x49, - 0x44, 0x18, 0x01, 0x20, 0x01, 0x28, 0x03, 0x52, 0x05, 0x6a, 0x6f, 0x62, 0x49, 0x44, 0x12, 0x14, - 0x0a, 0x05, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x18, 0x02, 0x20, 0x01, 0x28, 0x05, 0x52, 0x05, 0x65, - 0x72, 0x72, 0x6f, 0x72, 0x22, 0xe6, 0x01, 0x0a, 0x0a, 0x4e, 0x6f, 0x64, 0x65, 0x53, 0x74, 0x61, - 0x74, 0x75, 0x73, 0x12, 0x1a, 0x0a, 0x08, 0x63, 0x70, 0x75, 0x41, 0x76, 0x61, 0x69, 0x6c, 0x18, - 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x08, 0x63, 0x70, 0x75, 0x41, 0x76, 0x61, 0x69, 0x6c, 0x12, - 0x1a, 0x0a, 0x08, 0x67, 0x70, 0x75, 0x41, 0x76, 0x61, 0x69, 0x6c, 0x18, 0x02, 0x20, 0x01, 0x28, - 0x05, 0x52, 0x08, 0x67, 0x70, 0x75, 0x41, 0x76, 0x61, 0x69, 0x6c, 0x12, 0x22, 0x0a, 0x0c, 0x73, - 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x41, 0x76, 0x61, 0x69, 0x6c, 0x18, 0x03, 0x20, 0x01, 0x28, - 0x03, 0x52, 0x0c, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x41, 0x76, 0x61, 0x69, 0x6c, 0x12, - 0x20, 0x0a, 0x0b, 0x6d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x41, 0x76, 0x61, 0x69, 0x6c, 0x18, 0x04, - 0x20, 0x01, 0x28, 0x03, 0x52, 0x0b, 0x6d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x41, 0x76, 0x61, 0x69, - 0x6c, 0x12, 0x20, 0x0a, 0x0b, 0x61, 0x6c, 0x6c, 0x6f, 0x77, 0x65, 0x64, 0x50, 0x6f, 0x64, 0x73, - 0x18, 0x05, 0x20, 0x01, 0x28, 0x03, 0x52, 0x0b, 0x61, 0x6c, 0x6c, 0x6f, 0x77, 0x65, 0x64, 0x50, - 0x6f, 0x64, 0x73, 0x12, 0x16, 0x0a, 0x06, 0x6e, 0x6f, 0x64, 0x65, 0x49, 0x50, 0x18, 0x06, 0x20, - 0x01, 0x28, 0x09, 0x52, 0x06, 0x6e, 0x6f, 0x64, 0x65, 0x49, 0x50, 0x12, 0x20, 0x0a, 0x0b, 0x72, - 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x18, 0x07, 0x20, 0x01, 0x28, 0x05, - 0x52, 0x0b, 0x72, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x1e, 0x0a, - 0x0a, 0x4a, 0x47, 0x46, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x10, 0x0a, 0x03, 0x6a, - 0x67, 0x66, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x6a, 0x67, 0x66, 0x22, 0x1f, 0x0a, - 0x0b, 0x4a, 0x47, 0x46, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x10, 0x0a, 0x03, - 0x6a, 0x67, 0x66, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x6a, 0x67, 0x66, 0x32, 0x87, - 0x01, 0x0a, 0x0e, 0x46, 0x6c, 0x75, 0x78, 0x63, 0x6c, 0x69, 0x53, 0x65, 0x72, 0x76, 0x69, 0x63, - 0x65, 0x12, 0x38, 0x0a, 0x05, 0x4d, 0x61, 0x74, 0x63, 0x68, 0x12, 0x15, 0x2e, 0x66, 0x6c, 0x75, - 0x78, 0x63, 0x6c, 0x69, 0x2e, 0x4d, 0x61, 0x74, 0x63, 0x68, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, - 0x74, 0x1a, 0x16, 0x2e, 0x66, 0x6c, 0x75, 0x78, 0x63, 0x6c, 0x69, 0x2e, 0x4d, 0x61, 0x74, 0x63, - 0x68, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x12, 0x3b, 0x0a, 0x06, 0x43, - 0x61, 0x6e, 0x63, 0x65, 0x6c, 0x12, 0x16, 0x2e, 0x66, 0x6c, 0x75, 0x78, 0x63, 0x6c, 0x69, 0x2e, - 0x43, 0x61, 0x6e, 0x63, 0x65, 0x6c, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x17, 0x2e, - 0x66, 0x6c, 0x75, 0x78, 0x63, 0x6c, 0x69, 0x2e, 0x43, 0x61, 0x6e, 0x63, 0x65, 0x6c, 0x52, 0x65, - 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x42, 0x0e, 0x5a, 0x0c, 0x67, 0x72, 0x70, 0x63, - 0x2f, 0x66, 0x6c, 0x75, 0x78, 0x63, 0x6c, 0x69, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, -} - -var ( - file_fluence_fluxcli_grpc_fluxcli_proto_rawDescOnce sync.Once - file_fluence_fluxcli_grpc_fluxcli_proto_rawDescData = file_fluence_fluxcli_grpc_fluxcli_proto_rawDesc -) - -func file_fluence_fluxcli_grpc_fluxcli_proto_rawDescGZIP() []byte { - file_fluence_fluxcli_grpc_fluxcli_proto_rawDescOnce.Do(func() { - file_fluence_fluxcli_grpc_fluxcli_proto_rawDescData = 
protoimpl.X.CompressGZIP(file_fluence_fluxcli_grpc_fluxcli_proto_rawDescData) - }) - return file_fluence_fluxcli_grpc_fluxcli_proto_rawDescData -} - -var file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes = make([]protoimpl.MessageInfo, 9) -var file_fluence_fluxcli_grpc_fluxcli_proto_goTypes = []interface{}{ - (*PodSpec)(nil), // 0: fluxcli.PodSpec - (*MatchRequest)(nil), // 1: fluxcli.MatchRequest - (*NodeAlloc)(nil), // 2: fluxcli.NodeAlloc - (*MatchResponse)(nil), // 3: fluxcli.MatchResponse - (*CancelRequest)(nil), // 4: fluxcli.CancelRequest - (*CancelResponse)(nil), // 5: fluxcli.CancelResponse - (*NodeStatus)(nil), // 6: fluxcli.NodeStatus - (*JGFRequest)(nil), // 7: fluxcli.JGFRequest - (*JGFResponse)(nil), // 8: fluxcli.JGFResponse -} -var file_fluence_fluxcli_grpc_fluxcli_proto_depIdxs = []int32{ - 0, // 0: fluxcli.MatchRequest.ps:type_name -> fluxcli.PodSpec - 2, // 1: fluxcli.MatchResponse.nodelist:type_name -> fluxcli.NodeAlloc - 1, // 2: fluxcli.FluxcliService.Match:input_type -> fluxcli.MatchRequest - 4, // 3: fluxcli.FluxcliService.Cancel:input_type -> fluxcli.CancelRequest - 3, // 4: fluxcli.FluxcliService.Match:output_type -> fluxcli.MatchResponse - 5, // 5: fluxcli.FluxcliService.Cancel:output_type -> fluxcli.CancelResponse - 4, // [4:6] is the sub-list for method output_type - 2, // [2:4] is the sub-list for method input_type - 2, // [2:2] is the sub-list for extension type_name - 2, // [2:2] is the sub-list for extension extendee - 0, // [0:2] is the sub-list for field type_name -} - -func init() { file_fluence_fluxcli_grpc_fluxcli_proto_init() } -func file_fluence_fluxcli_grpc_fluxcli_proto_init() { - if File_fluence_fluxcli_grpc_fluxcli_proto != nil { - return - } - if !protoimpl.UnsafeEnabled { - file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { - switch v := v.(*PodSpec); i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} { - switch v := v.(*MatchRequest); i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[2].Exporter = func(v interface{}, i int) interface{} { - switch v := v.(*NodeAlloc); i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[3].Exporter = func(v interface{}, i int) interface{} { - switch v := v.(*MatchResponse); i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[4].Exporter = func(v interface{}, i int) interface{} { - switch v := v.(*CancelRequest); i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[5].Exporter = func(v interface{}, i int) interface{} { - switch v := v.(*CancelResponse); i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[6].Exporter = func(v interface{}, i int) interface{} { - switch v := v.(*NodeStatus); i { - case 0: - return &v.state - case 1: - 
return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[7].Exporter = func(v interface{}, i int) interface{} { - switch v := v.(*JGFRequest); i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[8].Exporter = func(v interface{}, i int) interface{} { - switch v := v.(*JGFResponse); i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - } - type x struct{} - out := protoimpl.TypeBuilder{ - File: protoimpl.DescBuilder{ - GoPackagePath: reflect.TypeOf(x{}).PkgPath(), - RawDescriptor: file_fluence_fluxcli_grpc_fluxcli_proto_rawDesc, - NumEnums: 0, - NumMessages: 9, - NumExtensions: 0, - NumServices: 1, - }, - GoTypes: file_fluence_fluxcli_grpc_fluxcli_proto_goTypes, - DependencyIndexes: file_fluence_fluxcli_grpc_fluxcli_proto_depIdxs, - MessageInfos: file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes, - }.Build() - File_fluence_fluxcli_grpc_fluxcli_proto = out.File - file_fluence_fluxcli_grpc_fluxcli_proto_rawDesc = nil - file_fluence_fluxcli_grpc_fluxcli_proto_goTypes = nil - file_fluence_fluxcli_grpc_fluxcli_proto_depIdxs = nil -} diff --git a/sig-scheduler-plugins/pkg/fluence/fluxcli-grpc/fluxcli.proto b/sig-scheduler-plugins/pkg/fluence/fluxcli-grpc/fluxcli.proto deleted file mode 100644 index f47d35b..0000000 --- a/sig-scheduler-plugins/pkg/fluence/fluxcli-grpc/fluxcli.proto +++ /dev/null @@ -1,76 +0,0 @@ -syntax = "proto3"; -option go_package = "grpc/fluxcli"; - -package fluxcli; - - -// Service definition -service FluxcliService { - // Sends a Match command - rpc Match(MatchRequest) returns (MatchResponse) {} - rpc Cancel(CancelRequest) returns (CancelResponse) {} -} - -message PodSpec { - string id = 1; - string container = 2; - int32 cpu = 3; - int64 memory = 4; - int64 gpu = 5; - int64 storage = 6; - repeated string labels = 7; -} - -// The Match request message (allocate, allocate_orelse_reserve) -message MatchRequest { - PodSpec ps = 1; - string request = 2; - int32 count = 3; -} - -// The Nodes/Cluster Update Status -message NodeAlloc { - string nodeID = 1; - int32 tasks = 2; -} - -// The Match response message -message MatchResponse { - string podID = 1; - repeated NodeAlloc nodelist = 2; - int64 jobID = 3; -} - -message CancelRequest { - int64 jobID = 2; -} - -// The Match response message -message CancelResponse { - int64 jobID = 1; - int32 error = 2; -} - - - -// The Nodes/Cluster Update Status -message NodeStatus { - int32 cpuAvail = 1; - int32 gpuAvail = 2; - int64 storageAvail = 3; - int64 memoryAvail = 4; - int64 allowedPods = 5; - string nodeIP = 6; - int32 replication = 7; -} - -// The JGF response message -message JGFRequest { - string jgf = 1; -} - - -// The JGF response message -message JGFResponse { - string jgf = 1; -} diff --git a/sig-scheduler-plugins/pkg/fluence/fluxcli-grpc/fluxcli_grpc.pb.go b/sig-scheduler-plugins/pkg/fluence/fluxcli-grpc/fluxcli_grpc.pb.go deleted file mode 100644 index 7bd905a..0000000 --- a/sig-scheduler-plugins/pkg/fluence/fluxcli-grpc/fluxcli_grpc.pb.go +++ /dev/null @@ -1,139 +0,0 @@ -// Code generated by protoc-gen-go-grpc. DO NOT EDIT. 
- -package fluxcli - -import ( - context "context" - grpc "google.golang.org/grpc" - codes "google.golang.org/grpc/codes" - status "google.golang.org/grpc/status" -) - -// This is a compile-time assertion to ensure that this generated file -// is compatible with the grpc package it is being compiled against. -// Requires gRPC-Go v1.32.0 or later. -const _ = grpc.SupportPackageIsVersion7 - -// FluxcliServiceClient is the client API for FluxcliService service. -// -// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream. -type FluxcliServiceClient interface { - // Sends a Match command - Match(ctx context.Context, in *MatchRequest, opts ...grpc.CallOption) (*MatchResponse, error) - Cancel(ctx context.Context, in *CancelRequest, opts ...grpc.CallOption) (*CancelResponse, error) -} - -type fluxcliServiceClient struct { - cc grpc.ClientConnInterface -} - -func NewFluxcliServiceClient(cc grpc.ClientConnInterface) FluxcliServiceClient { - return &fluxcliServiceClient{cc} -} - -func (c *fluxcliServiceClient) Match(ctx context.Context, in *MatchRequest, opts ...grpc.CallOption) (*MatchResponse, error) { - out := new(MatchResponse) - err := c.cc.Invoke(ctx, "/fluxcli.FluxcliService/Match", in, out, opts...) - if err != nil { - return nil, err - } - return out, nil -} - -func (c *fluxcliServiceClient) Cancel(ctx context.Context, in *CancelRequest, opts ...grpc.CallOption) (*CancelResponse, error) { - out := new(CancelResponse) - err := c.cc.Invoke(ctx, "/fluxcli.FluxcliService/Cancel", in, out, opts...) - if err != nil { - return nil, err - } - return out, nil -} - -// FluxcliServiceServer is the server API for FluxcliService service. -// All implementations must embed UnimplementedFluxcliServiceServer -// for forward compatibility -type FluxcliServiceServer interface { - // Sends a Match command - Match(context.Context, *MatchRequest) (*MatchResponse, error) - Cancel(context.Context, *CancelRequest) (*CancelResponse, error) - mustEmbedUnimplementedFluxcliServiceServer() -} - -// UnimplementedFluxcliServiceServer must be embedded to have forward compatible implementations. -type UnimplementedFluxcliServiceServer struct { -} - -func (UnimplementedFluxcliServiceServer) Match(context.Context, *MatchRequest) (*MatchResponse, error) { - return nil, status.Errorf(codes.Unimplemented, "method Match not implemented") -} -func (UnimplementedFluxcliServiceServer) Cancel(context.Context, *CancelRequest) (*CancelResponse, error) { - return nil, status.Errorf(codes.Unimplemented, "method Cancel not implemented") -} -func (UnimplementedFluxcliServiceServer) mustEmbedUnimplementedFluxcliServiceServer() {} - -// UnsafeFluxcliServiceServer may be embedded to opt out of forward compatibility for this service. -// Use of this interface is not recommended, as added methods to FluxcliServiceServer will -// result in compilation errors. 
-type UnsafeFluxcliServiceServer interface { - mustEmbedUnimplementedFluxcliServiceServer() -} - -func RegisterFluxcliServiceServer(s grpc.ServiceRegistrar, srv FluxcliServiceServer) { - s.RegisterService(&FluxcliService_ServiceDesc, srv) -} - -func _FluxcliService_Match_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { - in := new(MatchRequest) - if err := dec(in); err != nil { - return nil, err - } - if interceptor == nil { - return srv.(FluxcliServiceServer).Match(ctx, in) - } - info := &grpc.UnaryServerInfo{ - Server: srv, - FullMethod: "/fluxcli.FluxcliService/Match", - } - handler := func(ctx context.Context, req interface{}) (interface{}, error) { - return srv.(FluxcliServiceServer).Match(ctx, req.(*MatchRequest)) - } - return interceptor(ctx, in, info, handler) -} - -func _FluxcliService_Cancel_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { - in := new(CancelRequest) - if err := dec(in); err != nil { - return nil, err - } - if interceptor == nil { - return srv.(FluxcliServiceServer).Cancel(ctx, in) - } - info := &grpc.UnaryServerInfo{ - Server: srv, - FullMethod: "/fluxcli.FluxcliService/Cancel", - } - handler := func(ctx context.Context, req interface{}) (interface{}, error) { - return srv.(FluxcliServiceServer).Cancel(ctx, req.(*CancelRequest)) - } - return interceptor(ctx, in, info, handler) -} - -// FluxcliService_ServiceDesc is the grpc.ServiceDesc for FluxcliService service. -// It's only intended for direct use with grpc.RegisterService, -// and not to be introspected or modified (even as a copy) -var FluxcliService_ServiceDesc = grpc.ServiceDesc{ - ServiceName: "fluxcli.FluxcliService", - HandlerType: (*FluxcliServiceServer)(nil), - Methods: []grpc.MethodDesc{ - { - MethodName: "Match", - Handler: _FluxcliService_Match_Handler, - }, - { - MethodName: "Cancel", - Handler: _FluxcliService_Cancel_Handler, - }, - }, - Streams: []grpc.StreamDesc{}, - Metadata: "fluence/fluxcli-grpc/fluxcli.proto", -} diff --git a/sig-scheduler-plugins/pkg/fluence/utils/utils.go b/sig-scheduler-plugins/pkg/fluence/utils/utils.go index cfb857d..53e9c4a 100644 --- a/sig-scheduler-plugins/pkg/fluence/utils/utils.go +++ b/sig-scheduler-plugins/pkg/fluence/utils/utils.go @@ -17,60 +17,83 @@ limitations under the License. package utils import ( + "fmt" "strings" v1 "k8s.io/api/core/v1" "k8s.io/klog/v2" - "k8s.io/kubernetes/pkg/scheduler/framework" pb "sigs.k8s.io/scheduler-plugins/pkg/fluence/fluxcli-grpc" ) -type NoopStateData struct{} +// TODO this package should be renamed something related to a PodSpec Info -func NewNoopStateData() framework.StateData { - return &NoopStateData{} -} - -func (d *NoopStateData) Clone() framework.StateData { - return d +// getPodJobpsecLabels looks across labels and returns those relevant +// to a jobspec +func getPodJobspecLabels(pod *v1.Pod) []string { + labels := []string{} + for label, value := range pod.Labels { + if strings.Contains(label, "jobspec") { + labels = append(labels, value) + } + } + return labels } // InspectPodInfo takes a pod object and returns the pod.spec +// Note from vsoch - I updated this to calculate containers across the pod +// if that's wrong we can change it back. 
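+// As a worked example (illustrative, not taken from the code): a pod with two
+// containers requesting 1 and 2 CPUs, 1Gi and 2Gi of memory, and a single
+// nvidia.com/gpu limit would yield Cpu=3, Memory=3Gi (expressed in bytes), Gpu=1.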
func InspectPodInfo(pod *v1.Pod) *pb.PodSpec { ps := new(pb.PodSpec) ps.Id = pod.Name - cont := pod.Spec.Containers[0] - - //This will need to be done here AND at client level - if len(pod.Labels) > 0 { - r := make([]string, 0) - for key, val := range pod.Labels { - if strings.Contains(key, "jobspec") { - r = append(r, val) - } - } - if len(r) > 0 { - ps.Labels = r - } - } - specRequests := cont.Resources.Requests - specLimits := cont.Resources.Limits + // Note from vsoch - there was an if check here to see if we had labels, + // I don't think there is risk to adding an empty list but we can add + // the check back if there is + ps.Labels = getPodJobspecLabels(pod) + + // Note that Container gets use for the JobSpec, so we provide + // the pod name (to be associated with tasks) for it. We likely + // should change this identifier eventually. + ps.Container = fmt.Sprintf("%s-%s", pod.Namespace, pod.Name) + + // Create accumulated requests for cpu and limits + // CPU and memory are summed across containers + // GPU cannot be shared across containers, but we + // take a count for the pod for the PodSpec + var cpus int32 = 0 + var memory int64 = 0 + var gpus int64 = 0 + + // I think we are OK to sum this too + // https://github.com/kubernetes/kubectl/blob/master/pkg/describe/describe.go#L4211-L4213 + var storage int64 = 0 + + for _, container := range pod.Spec.Containers { + + // Add on Cpu, Memory, GPU from container requests + // This is a limited set of resources owned by the pod + specRequests := container.Resources.Requests + cpus += int32(specRequests.Cpu().Value()) + memory += specRequests.Memory().Value() + storage += specRequests.StorageEphemeral().Value() + + specLimits := container.Resources.Limits + gpuSpec := specLimits["nvidia.com/gpu"] + gpus += gpuSpec.Value() - if specRequests.Cpu().Value() == 0 { - ps.Cpu = 1 - } else { - ps.Cpu = int32(specRequests.Cpu().Value()) } - if specRequests.Memory().Value() > 0 { - ps.Memory = specRequests.Memory().Value() + // If we have zero cpus, assume 1 + // We could use math.Max here, but it is expecting float64 + if cpus == 0 { + cpus = 1 } - gpu := specLimits["nvidia.com/gpu"] - ps.Gpu = gpu.Value() - ps.Storage = specRequests.StorageEphemeral().Value() - - klog.Infof("[Jobspec] Pod spec: CPU %v/%v-milli, memory %v, GPU %v, storage %v", ps.Cpu, specRequests.Cpu().MilliValue(), specRequests.Memory().Value(), ps.Gpu, ps.Storage) + ps.Cpu = cpus + ps.Gpu = gpus + ps.Memory = memory + ps.Storage = storage + // I removed specRequests.Cpu().MilliValue() but we can add back some derivative if desired + klog.Infof("[Jobspec] Pod spec: CPU %v, memory %v, GPU %v, storage %v", ps.Cpu, ps.Memory, ps.Gpu, ps.Storage) return ps } diff --git a/src/Makefile b/src/Makefile index a32efce..344bde1 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,6 +1,7 @@ FLUX_SCHED_ROOT ?= /home/flux-sched INSTALL_PREFIX ?= /usr +LOCALBIN ?= $(shell pwd)/bin COMMONENVVAR=GOOS=$(shell uname -s | tr A-Z a-z) # This is what worked @@ -14,6 +15,10 @@ RELEASE_VERSION?=v$(shell date +%Y%m%d)-$(shell git describe --tags --match "v*" .PHONY: all all: fluxcli +.PHONY: $(LOCALBIN) +$(LOCALBIN): + mkdir -p $(LOCALBIN) + .PHONY: fluxcli fluxcli: docker build -f build/scheduler/Dockerfile --build-arg ARCH="amd64" --build-arg RELEASE_VERSION="$(RELEASE_VERSION)" -t $(LOCAL_REGISTRY)/$(LOCAL_IMAGE) . 
@@ -22,6 +27,12 @@ fluxcli: server: $(COMMONENVVAR) $(BUILDENVVAR) go build -ldflags '-w' -o bin/server cmd/main.go +.PHONY: protoc +protoc: $(LOCALBIN) + GOBIN=$(LOCALBIN) go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.28 + GOBIN=$(LOCALBIN) go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@v1.2 + +# You can use make protoc to download proto .PHONY: proto -proto: - protoc --go_out=. --go_opt=paths=source_relative --go-grpc_out=. --go-grpc_opt=paths=source_relative fluence/fluxcli-grpc/fluxcli.proto +proto: protoc + PATH=$(LOCALBIN):${PATH} protoc --go_out=. --go_opt=paths=source_relative --go-grpc_out=. --go-grpc_opt=paths=source_relative fluence/fluxcli-grpc/fluxcli.proto diff --git a/src/build/scheduler/Dockerfile b/src/build/scheduler/Dockerfile index 18c4bd7..15a9678 100644 --- a/src/build/scheduler/Dockerfile +++ b/src/build/scheduler/Dockerfile @@ -132,6 +132,7 @@ WORKDIR /go/src/fluence/ # This is the 0.31.0 tag of flux-sched (same as we install above) RUN go get -u github.com/flux-framework/flux-sched/resource/reapi/bindings/go/src/fluxcli@250eac78a6753253fc8353a3504d7e843d1b6b24 && \ go mod tidy && \ + go mod vendor && \ make server FLUX_SCHED_ROOT=/home/flux-sched INSTALL_PREFIX=${INSTALL_PREFIX} && \ mkdir -p /home/data/jobspecs /home/data/jgf && \ chmod -R ugo+rwx /home/data \ No newline at end of file diff --git a/src/fluence/fluxcli-grpc/fluxcli.pb.go b/src/fluence/fluxcli-grpc/fluxcli.pb.go index e317af2..6bd47d4 100644 --- a/src/fluence/fluxcli-grpc/fluxcli.pb.go +++ b/src/fluence/fluxcli-grpc/fluxcli.pb.go @@ -1,7 +1,7 @@ // Code generated by protoc-gen-go. DO NOT EDIT. // versions: -// protoc-gen-go v1.26.0 -// protoc v3.15.8 +// protoc-gen-go v1.28.1 +// protoc v3.20.3 // source: fluence/fluxcli-grpc/fluxcli.proto package fluxcli diff --git a/src/fluence/fluxcli-grpc/fluxcli.proto b/src/fluence/fluxcli-grpc/fluxcli.proto index f47d35b..f85b558 100644 --- a/src/fluence/fluxcli-grpc/fluxcli.proto +++ b/src/fluence/fluxcli-grpc/fluxcli.proto @@ -73,4 +73,4 @@ message JGFRequest { // The JGF response message message JGFResponse { string jgf = 1; -} +} \ No newline at end of file diff --git a/src/fluence/fluxcli-grpc/fluxcli_grpc.pb.go b/src/fluence/fluxcli-grpc/fluxcli_grpc.pb.go index 7bd905a..f984b04 100644 --- a/src/fluence/fluxcli-grpc/fluxcli_grpc.pb.go +++ b/src/fluence/fluxcli-grpc/fluxcli_grpc.pb.go @@ -1,4 +1,8 @@ // Code generated by protoc-gen-go-grpc. DO NOT EDIT. 
+// versions: +// - protoc-gen-go-grpc v1.2.0 +// - protoc v3.20.3 +// source: fluence/fluxcli-grpc/fluxcli.proto package fluxcli diff --git a/src/fluence/fluxion/fluxion.go b/src/fluence/fluxion/fluxion.go index 7ef532a..6478602 100644 --- a/src/fluence/fluxion/fluxion.go +++ b/src/fluence/fluxion/fluxion.go @@ -18,6 +18,7 @@ type Fluxion struct { pb.UnimplementedFluxcliServiceServer } +// InitFluxion creates a new client to interaction with the fluxion API (via go bindings) func (f *Fluxion) InitFluxion(policy *string, label *string) { f.cli = fluxcli.NewReapiClient() @@ -44,7 +45,9 @@ func (f *Fluxion) InitFluxion(policy *string, label *string) { f.cli.InitContext(string(jgf), p) } +// Cancel wraps the Cancel function of the fluxion go bindings func (s *Fluxion) Cancel(ctx context.Context, in *pb.CancelRequest) (*pb.CancelResponse, error) { + fmt.Printf("[GRPCServer] Received Cancel request %v\n", in) err := s.cli.Cancel(int64(in.JobID), true) if err != nil { @@ -55,7 +58,6 @@ func (s *Fluxion) Cancel(ctx context.Context, in *pb.CancelRequest) (*pb.CancelR // This (I think) should be an error code for the specific job dr := &pb.CancelResponse{JobID: in.JobID} fmt.Printf("[GRPCServer] Sending Cancel response %v\n", dr) - fmt.Printf("[CancelRPC] Errors so far: %s\n", s.cli.GetErrMsg()) reserved, at, overhead, mode, fluxerr := s.cli.Info(int64(in.JobID)) @@ -66,30 +68,62 @@ func (s *Fluxion) Cancel(ctx context.Context, in *pb.CancelRequest) (*pb.CancelR return dr, nil } -func (s *Fluxion) Match(ctx context.Context, in *pb.MatchRequest) (*pb.MatchResponse, error) { - filename := "/home/data/jobspecs/jobspec.yaml" - jobspec.CreateJobSpecYaml(in.Ps, in.Count, filename) +// generateJobSpec generates a jobspec for a match request and returns the string +func (s *Fluxion) generateJobspec(in *pb.MatchRequest) ([]byte, error) { - spec, err := os.ReadFile(filename) + spec := []byte{} + + // Create a temporary file to write and read the jobspec + // The first parameter here as the empty string creates in /tmp + file, err := os.CreateTemp("", "jobspec.*.yaml") if err != nil { - return nil, errors.New("Error reading jobspec") + return spec, err } + defer os.Remove(file.Name()) + jobspec.CreateJobSpecYaml(in.Ps, in.Count, file.Name()) + spec, err = os.ReadFile(file.Name()) + if err != nil { + return spec, errors.New("Error reading jobspec") + } + return spec, err +} + +// Match wraps the MatchAllocate function of the fluxion go bindings +// If a match is not possible, we return the error and an empty response +func (s *Fluxion) Match(ctx context.Context, in *pb.MatchRequest) (*pb.MatchResponse, error) { + + emptyResponse := &pb.MatchResponse{} + + // Prepare an empty match response (that can still be serialized) fmt.Printf("[GRPCServer] Received Match request %v\n", in) + + // Generate the jobspec, written to temporary file and read as string + spec, err := s.generateJobspec(in) + if err != nil { + return emptyResponse, err + } + + // Ask flux to match allocate! 
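+	// MatchAllocate returns (reserved, allocated, at, overhead, jobid, err),
+	// where "allocated" is the match result (a JSON graph) serialized as a string.
+	// An empty "allocated" string is treated below as "no allocation possible".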
reserved, allocated, at, overhead, jobid, fluxerr := s.cli.MatchAllocate(false, string(spec)) utils.PrintOutput(reserved, allocated, at, overhead, jobid, fluxerr) - fmt.Printf("[MatchRPC] Errors so far: %s\n", s.cli.GetErrMsg()) if fluxerr != nil { - return nil, errors.New("Error in ReapiCliMatchAllocate") + fmt.Printf("[GRPCServer] Flux err is %w\n", fluxerr) + return emptyResponse, errors.New("Error in ReapiCliMatchAllocate") } + // This usually means we cannot allocate + // We need to return an error here otherwise we try to pass an empty string + // to other RPC endpoints and get back an error. if allocated == "" { - return nil, nil + fmt.Println("[GRPCServer] Allocated is empty") + return emptyResponse, errors.New("allocation was not possible") } - nodetasks := utils.ParseAllocResult(allocated) - + // Pass the spec name in so we can include it in the allocation result + // This will allow us to inspect the ordering later. + nodetasks := utils.ParseAllocResult(allocated, in.Ps.Container) nodetaskslist := make([]*pb.NodeAlloc, len(nodetasks)) for i, result := range nodetasks { nodetaskslist[i] = &pb.NodeAlloc{ diff --git a/src/fluence/go.mod b/src/fluence/go.mod index 5a14548..5409a2a 100644 --- a/src/fluence/go.mod +++ b/src/fluence/go.mod @@ -1,6 +1,6 @@ module github.com/flux-framework/flux-k8s/flux-plugin/fluence -go 1.16 +go 1.19 require ( github.com/flux-framework/flux-sched/resource/reapi/bindings/go v0.0.0-20231213021445-250eac78a675 @@ -13,6 +13,33 @@ require ( k8s.io/kubectl v0.0.0 ) +require ( + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/go-logr/logr v0.4.0 // indirect + github.com/gogo/protobuf v1.3.2 // indirect + github.com/golang/protobuf v1.5.2 // indirect + github.com/google/go-cmp v0.5.5 // indirect + github.com/google/gofuzz v1.1.0 // indirect + github.com/googleapis/gnostic v0.5.5 // indirect + github.com/json-iterator/go v1.1.11 // indirect + github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect + github.com/modern-go/reflect2 v1.0.1 // indirect + golang.org/x/net v0.0.0-20210520170846-37e1c6afe023 // indirect + golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d // indirect + golang.org/x/sys v0.0.0-20210616094352-59db8d763f22 // indirect + golang.org/x/term v0.0.0-20210220032956-6a3ed077a48d // indirect + golang.org/x/text v0.3.6 // indirect + golang.org/x/time v0.0.0-20210723032227-1f47c861a9ac // indirect + google.golang.org/appengine v1.6.5 // indirect + google.golang.org/genproto v0.0.0-20210602131652-f16073e35f0c // indirect + gopkg.in/inf.v0 v0.9.1 // indirect + gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b // indirect + k8s.io/klog/v2 v2.9.0 // indirect + k8s.io/utils v0.0.0-20210819203725-bdf08cb9a70a // indirect + sigs.k8s.io/structured-merge-diff/v4 v4.1.2 // indirect + sigs.k8s.io/yaml v1.2.0 // indirect +) + replace ( k8s.io/api => k8s.io/api v0.22.3 k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.22.3 diff --git a/src/fluence/jgf/jgf.go b/src/fluence/jgf/jgf.go index d12148b..1f45235 100644 --- a/src/fluence/jgf/jgf.go +++ b/src/fluence/jgf/jgf.go @@ -5,7 +5,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -23,51 +23,22 @@ import ( "strings" ) -type node struct { - Id string `json:"id"` - Label string `json:"label,omitempty"` - Metadata nodeMetadata `json:"metadata,omitempty"` -} +var ( + // Defaults for nodes + defaultExclusive = false + defaultRank = -1 + defaultSize = 1 + defaultUnit = "" -type edge struct { - Source string `json:"source"` - Relation string `json:"relation,omitempty"` - Target string `json:"target"` - Directed bool `json:"directed,omitempty"` - Metadata edgeMetadata `json:"metadata"` -} + // Relations + containsRelation = "contains" + inRelation = "in" -type edgeMetadata struct { - Name map[string]string `json:"name,omitempty"` -} - -type nodeMetadata struct { - Type string `json:"type"` - Basename string `json:"basename"` - Name string `json:"name"` - Id int `json:"id"` - Uniq_id int `json:"uniq_id"` - Rank int `json:"rank,omitempty"` - Exclusive bool `json:"exclusive"` - Unit string `json:"unit"` - Size int `json:"size"` - Paths map[string]string `json:"paths,omitempty"` - Properties map[string]string `json:"properties,omitempty"` -} - -type graph struct { - Nodes []node `json:"nodes"` - Edges []edge `json:"edges"` - // Metadata metadata `json:"metadata,omitempty"` - Directed bool `json:"directed,omitempty"` -} - -type Fluxjgf struct { - Graph graph `json:"graph"` - Elements int `json:"-"` - NodeMap map[string]node `json:"-"` -} + // Paths + containmentKey = "containment" +) +// InitJGF initializes the Flux Json Graph Format object func InitJGF() (fluxgraph Fluxjgf) { var g graph fluxgraph = Fluxjgf{ @@ -77,42 +48,49 @@ func InitJGF() (fluxgraph Fluxjgf) { } return } + +// getDefaultPaths returns a new map with empty containment +// this cannot be a global shared variable or we get an error +// about inserting an edge to itself. 
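+// In other words, each node needs its own fresh map so that MakeEdge can
+// update one node's containment path without also mutating every other node's.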
+func getDefaultPaths() map[string]string { + return map[string]string{containmentKey: ""} +} + +// addNode adds a node to the JGF func (g *Fluxjgf) addNode(toadd node) { g.Graph.Nodes = append(g.Graph.Nodes, toadd) g.NodeMap[toadd.Id] = toadd g.Elements = g.Elements + 1 } +// MakeEdge creates an edge for the JGF func (g *Fluxjgf) MakeEdge(source string, target string, contains string) { newedge := edge{ Source: source, Target: target, Metadata: edgeMetadata{ - Name: map[string]string{ - "containment": contains, - }, + Name: map[string]string{containmentKey: contains}, }, } g.Graph.Edges = append(g.Graph.Edges, newedge) - if contains == "contains" { + if contains == containsRelation { tnode := g.NodeMap[target] - tnode.Metadata.Paths["containment"] = g.NodeMap[source].Metadata.Paths["containment"] + "/" + tnode.Metadata.Name + tnode.Metadata.Paths[containmentKey] = g.NodeMap[source].Metadata.Paths[containmentKey] + "/" + tnode.Metadata.Name } - } +// processLabels selects a subset based on a string filter func processLabels(labels *map[string]string, filter string) (filtered map[string]string) { - filtered = make(map[string]string, 0) + filtered = map[string]string{} for key, v := range *labels { if strings.Contains(key, filter) { - filtered[key] = v } } return } - +// MakeSubnet creates a subnet for the graph func (g *Fluxjgf) MakeSubnet(index int, ip string) string { newnode := node{ Id: strconv.Itoa(g.Elements), @@ -122,19 +100,18 @@ func (g *Fluxjgf) MakeSubnet(index int, ip string) string { Name: ip + strconv.Itoa(g.Elements), Id: index, Uniq_id: g.Elements, - Rank: -1, - Exclusive: false, - Unit: "", - Size: 1, - Paths: map[string]string{ - "containment": "", - }, + Rank: defaultRank, + Exclusive: defaultExclusive, + Unit: defaultUnit, + Size: defaultSize, + Paths: getDefaultPaths(), }, } g.addNode(newnode) return newnode.Id } +// MakeNode creates a new node for the graph func (g *Fluxjgf) MakeNode(index int, exclusive bool, subnet string) string { newnode := node{ Id: strconv.Itoa(g.Elements), @@ -144,19 +121,18 @@ func (g *Fluxjgf) MakeNode(index int, exclusive bool, subnet string) string { Name: subnet + strconv.Itoa(g.Elements), Id: g.Elements, Uniq_id: g.Elements, - Rank: -1, + Rank: defaultRank, Exclusive: exclusive, - Unit: "", - Size: 1, - Paths: map[string]string{ - "containment": "", - }, + Unit: defaultUnit, + Size: defaultSize, + Paths: getDefaultPaths(), }, } g.addNode(newnode) return newnode.Id } +// MakeSocket creates a socket for the graph func (g *Fluxjgf) MakeSocket(index int, name string) string { newnode := node{ Id: strconv.Itoa(g.Elements), @@ -166,19 +142,18 @@ func (g *Fluxjgf) MakeSocket(index int, name string) string { Name: name + strconv.Itoa(index), Id: index, Uniq_id: g.Elements, - Rank: -1, - Exclusive: false, - Unit: "", - Size: 1, - Paths: map[string]string{ - "containment": "", - }, + Rank: defaultRank, + Exclusive: defaultExclusive, + Unit: defaultUnit, + Size: defaultSize, + Paths: getDefaultPaths(), }, } g.addNode(newnode) return newnode.Id } +// MakeCore creates a core for the graph func (g *Fluxjgf) MakeCore(index int, name string) string { newnode := node{ Id: strconv.Itoa(g.Elements), @@ -188,19 +163,18 @@ func (g *Fluxjgf) MakeCore(index int, name string) string { Name: name + strconv.Itoa(index), Id: index, Uniq_id: g.Elements, - Rank: -1, - Exclusive: false, - Unit: "", - Size: 1, - Paths: map[string]string{ - "containment": "", - }, + Rank: defaultRank, + Exclusive: defaultExclusive, + Unit: defaultUnit, + Size: defaultSize, + Paths: 
getDefaultPaths(), }, } g.addNode(newnode) return newnode.Id } +// MakeVCore makes a vcore (I think 2 vcpu == 1 cpu) for the graph func (g *Fluxjgf) MakeVCore(coreid string, index int, name string) string { newnode := node{ Id: strconv.Itoa(g.Elements), @@ -210,21 +184,20 @@ func (g *Fluxjgf) MakeVCore(coreid string, index int, name string) string { Name: name + strconv.Itoa(index), Id: index, Uniq_id: g.Elements, - Rank: -1, - Exclusive: false, - Unit: "", - Size: 1, - Paths: map[string]string{ - "containment": "", - }, + Rank: defaultRank, + Exclusive: defaultExclusive, + Unit: defaultUnit, + Size: defaultSize, + Paths: getDefaultPaths(), }, } g.addNode(newnode) - g.MakeEdge(coreid, newnode.Id, "contains") - g.MakeEdge(newnode.Id, coreid, "in") + g.MakeEdge(coreid, newnode.Id, containsRelation) + g.MakeEdge(newnode.Id, coreid, inRelation) return newnode.Id } +// MakeNFProperties makes the node feature discovery properties for the graph func (g *Fluxjgf) MakeNFDProperties(coreid string, index int, filter string, labels *map[string]string) { for key, _ := range *labels { if strings.Contains(key, filter) { @@ -241,17 +214,15 @@ func (g *Fluxjgf) MakeNFDProperties(coreid string, index int, filter string, lab Name: name + strconv.Itoa(index), Id: index, Uniq_id: g.Elements, - Rank: -1, - Exclusive: false, - Unit: "", - Size: 1, - Paths: map[string]string{ - "containment": "", - }, + Rank: defaultRank, + Exclusive: defaultExclusive, + Unit: defaultUnit, + Size: defaultSize, + Paths: getDefaultPaths(), }, } g.addNode(newnode) - g.MakeEdge(coreid, newnode.Id, "contains") + g.MakeEdge(coreid, newnode.Id, containsRelation) } } } @@ -269,21 +240,20 @@ func (g *Fluxjgf) MakeNFDPropertiesByValue(coreid string, index int, filter stri Name: name + strconv.Itoa(index), Id: index, Uniq_id: g.Elements, - Rank: -1, - Exclusive: false, - Unit: "", - Size: 1, - Paths: map[string]string{ - "containment": "", - }, + Rank: defaultRank, + Exclusive: defaultExclusive, + Unit: defaultUnit, + Size: defaultSize, + Paths: getDefaultPaths(), }, } g.addNode(newnode) - g.MakeEdge(coreid, newnode.Id, "contains") + g.MakeEdge(coreid, newnode.Id, containsRelation) } } } +// MakeMemory creates memory for the graph func (g *Fluxjgf) MakeMemory(index int, name string, unit string, size int) string { newnode := node{ Id: strconv.Itoa(g.Elements), @@ -293,19 +263,18 @@ func (g *Fluxjgf) MakeMemory(index int, name string, unit string, size int) stri Name: name + strconv.Itoa(index), Id: index, Uniq_id: g.Elements, - Rank: -1, - Exclusive: false, + Rank: defaultRank, + Exclusive: defaultExclusive, Unit: unit, Size: size, - Paths: map[string]string{ - "containment": "", - }, + Paths: getDefaultPaths(), }, } g.addNode(newnode) return newnode.Id } +// MakeGPU makes a gpu for the graph func (g *Fluxjgf) MakeGPU(index int, name string, size int) string { newnode := node{ Id: strconv.Itoa(g.Elements), @@ -315,19 +284,18 @@ func (g *Fluxjgf) MakeGPU(index int, name string, size int) string { Name: name + strconv.Itoa(index), Id: index, Uniq_id: g.Elements, - Rank: -1, - Exclusive: false, - Unit: "", + Rank: defaultRank, + Exclusive: defaultExclusive, + Unit: defaultUnit, Size: size, - Paths: map[string]string{ - "containment": "", - }, + Paths: getDefaultPaths(), }, } g.addNode(newnode) return newnode.Id } +// MakeCluster creates the cluster func (g *Fluxjgf) MakeCluster(clustername string) string { g.Elements = 0 newnode := node{ @@ -338,12 +306,12 @@ func (g *Fluxjgf) MakeCluster(clustername string) string { Name: clustername + "0", 
Id: g.Elements, Uniq_id: 0, - Rank: -1, - Exclusive: false, - Unit: "", - Size: 1, + Rank: defaultRank, + Exclusive: defaultExclusive, + Unit: defaultUnit, + Size: defaultSize, Paths: map[string]string{ - "containment": "/" + clustername + "0", + containmentKey: "/" + clustername + "0", }, }, } @@ -351,6 +319,7 @@ func (g *Fluxjgf) MakeCluster(clustername string) string { return newnode.Id } +// MakeRack makes the rack func (g *Fluxjgf) MakeRack(id int) string { newnode := node{ Id: strconv.Itoa(g.Elements), @@ -360,13 +329,11 @@ func (g *Fluxjgf) MakeRack(id int) string { Name: "rack" + strconv.Itoa(id), Id: id, Uniq_id: g.Elements, - Rank: -1, - Exclusive: false, - Unit: "", - Size: 1, - Paths: map[string]string{ - "containment": "", - }, + Rank: defaultRank, + Exclusive: defaultExclusive, + Unit: defaultUnit, + Size: defaultSize, + Paths: getDefaultPaths(), }, } g.addNode(newnode) diff --git a/src/fluence/jgf/types.go b/src/fluence/jgf/types.go new file mode 100644 index 0000000..b2b743f --- /dev/null +++ b/src/fluence/jgf/types.go @@ -0,0 +1,62 @@ +/* +Copyright © 2021 IBM Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package jgf + +type node struct { + Id string `json:"id"` + Label string `json:"label,omitempty"` + Metadata nodeMetadata `json:"metadata,omitempty"` +} + +type edge struct { + Source string `json:"source"` + Relation string `json:"relation,omitempty"` + Target string `json:"target"` + Directed bool `json:"directed,omitempty"` + Metadata edgeMetadata `json:"metadata"` +} + +type edgeMetadata struct { + Name map[string]string `json:"name,omitempty"` +} + +type nodeMetadata struct { + Type string `json:"type"` + Basename string `json:"basename"` + Name string `json:"name"` + Id int `json:"id"` + Uniq_id int `json:"uniq_id"` + Rank int `json:"rank,omitempty"` + Exclusive bool `json:"exclusive"` + Unit string `json:"unit"` + Size int `json:"size"` + Paths map[string]string `json:"paths,omitempty"` + Properties map[string]string `json:"properties,omitempty"` +} + +type graph struct { + Nodes []node `json:"nodes"` + Edges []edge `json:"edges"` + // Metadata metadata `json:"metadata,omitempty"` + Directed bool `json:"directed,omitempty"` +} + +type Fluxjgf struct { + Graph graph `json:"graph"` + Elements int `json:"-"` + NodeMap map[string]node `json:"-"` +} diff --git a/src/fluence/jobspec/jobspec.go b/src/fluence/jobspec/jobspec.go index 8ef90ae..683f586 100644 --- a/src/fluence/jobspec/jobspec.go +++ b/src/fluence/jobspec/jobspec.go @@ -5,7 +5,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -25,9 +25,9 @@ import ( "gopkg.in/yaml.v2" ) - - /* + +Structure of the PodSpec that needs to be generated, for reference Ps: &pb.PodSpec{ Id: pod_jobspec.ID, Container: pod_jobspec.Containers[0].Image, @@ -38,149 +38,123 @@ Ps: &pb.PodSpec{ }, */ -func CreateJobSpecYaml(pr *pb.PodSpec, count int32, filename string) error { - socket_resources := make([]Resource, 1) - command := []string{pr.Container} - socket_resources[0] = Resource{Type: "core", Count: int64(pr.Cpu)} - if pr.Memory > 0 { - toMB := pr.Memory >> 20 - socket_resources = append(socket_resources, Resource{Type: "memory", Count: toMB}) - } +// CreateJobSpecYaml writes the protobuf jobspec into a yaml file +func CreateJobSpecYaml(spec *pb.PodSpec, count int32, filename string) error { - if pr.Gpu > 0 { - socket_resources = append(socket_resources, Resource{Type: "gpu", Count: pr.Gpu}) - } + command := []string{spec.Container} + fmt.Println("Labels ", spec.Labels, " ", len(spec.Labels)) - fmt.Println("Labels ", pr.Labels, " ", len(pr.Labels)) + js := JobSpec{ + Version: Version{Version: 9999}, + Attributes: Attribute{System{Duration: 3600}}, - js := JobSpec{ - Version: Version{ - Version: 9999, + // The name of the task likely needs to correspond with the pod + // Since we can't easily change the proto file, for now it is + // storing the pod namespaced name. + Tasks: []Task{ + { + Command: command, + Slot: "default", + Counts: Count{PerSlot: 1}, }, - Attributes: Attribute{ - System{ - Duration: 3600, - }, - }, - Tasks: []Task{ - { - // Command: "[\""+command+"\"]", - Command: command, - Slot: "default", - Counts: Count{ - PerSlot: 1, - }, - }, - }, - } + }, + } + + // Assemble resources! 
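+	// The assembled jobspec ends up with version 9999, a one hour duration,
+	// one task bound to the "default" slot (its command carries the pod
+	// identifier stored in PodSpec.Container), and a resource tree of
+	// slot (count = the requested count, i.e. the group size) -> core/memory/gpu,
+	// optionally nested under subnet -> node when a "zone" label is present.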
+ socketResources := createSocketResources(spec) + js.Version.Resources = createResources(spec, socketResources, count) + + // Write bytes to file + yamlbytes, err := yaml.Marshal(&js) + if err != nil { + log.Fatalf("[JobSpec] yaml.Marshal failed with '%s'\n", err) + return err + } + return writeBytes(yamlbytes, filename) +} + +// WriteBytes writes a byte string to file +func writeBytes(bytelist []byte, filename string) error { + fmt.Printf("[JobSpec] Preparing to write:\n%s\n", string(bytelist)) + f, err := os.Create(filename) + if err != nil { + log.Fatalf("[JobSpec] Couldn't create file!!\n") + return err + } + defer f.Close() + + _, err = f.Write(bytelist) + if err != nil { + log.Fatalf("[JobSpec] Couldn't write file!!\n") + return err + } + + // Not sure why this is here, but will keep for now + _, err = f.WriteString("\n") + if err != nil { + log.Fatalf("[JobSpec] Couldn't append newline to file!!\n") + } + return err +} + +func toGB(bytes int64) int64 { + res := float64(bytes) / math.Pow(10, 9) + return int64(res) +} + +// createSocketResources creates the socket resources for the JobSpec +func createSocketResources(spec *pb.PodSpec) []Resource { - slot_resource := make([]Resource, 1) - slot_resource[0] = Resource{ - Type: "slot", + socketResources := []Resource{ + { + Type: "core", Count: int64(spec.Cpu), + }, + } + + // TODO double check what we are converting from -> to + if spec.Memory > 0 { + toMB := spec.Memory >> 20 + socketResources = append(socketResources, Resource{Type: "memory", Count: toMB}) + } + + if spec.Gpu > 0 { + socketResources = append(socketResources, Resource{Type: "gpu", Count: spec.Gpu}) + } + return socketResources +} + +// createResources assembles the list of JobSpec resources +func createResources(spec *pb.PodSpec, socketResources []Resource, count int32) []Resource { + + slotResource := []Resource{ + { + Type: "slot", Count: int64(count), Label: "default", - With: socket_resources, - } - - if len(pr.Labels) > 0 { - for _, label := range pr.Labels { - if label == "zone" { - node_resource := make([]Resource, 1) - node_resource[0] = Resource{ - Type: "subnet", + With: socketResources, + }, + } + + // Presence of the zone label means we need to add a subnet + if len(spec.Labels) > 0 { + for _, label := range spec.Labels { + if label == "zone" { + nodeResource := []Resource{ + { + Type: "subnet", Count: 1, With: []Resource{ { - Type: "node", + Type: "node", Count: 1, - With: slot_resource, /*[]Resource{ - { - Type: "socket", - Count: 1, - With: slot_resource, - }, - },*/ + With: slotResource, }, }, - } - js.Version.Resources = node_resource + }, } - + return nodeResource } - - } else { - fmt.Println("No labels, going with plain JobSpec") - js.Version.Resources = slot_resource - } - - // js := JobSpec{ - // Version: Version{ - // Version: 9999, - // Resources: []Resource{ - // { - // Type: "node", - // Count: 1, - // With: []Resource{ - // { - // Type: "socket", - // Count: 1, - // With: []Resource{ - // { - // Type: "slot", - // Count: int64(count), - // Label: "default", - // With: socket_resources, - // }, - // }, - // }, - // }, - // }, - // }, - // }, - // Attributes: Attribute{ - // System{ - // Duration: 3600, - // }, - // }, - // Tasks: []Task{ - // { - // // Command: "[\""+command+"\"]", - // Command: command, - // Slot: "default", - // Counts: Count{ - // PerSlot: 1, - // }, - // }, - // }, - // } - yamlbytes, err := yaml.Marshal(&js) - if err != nil { - log.Fatalf("[JobSpec] yaml.Marshal failed with '%s'\n", err) - return err - } - 
fmt.Printf("[JobSpec] JobSpec in YAML:\n%s\n", string(yamlbytes)) - f, err := os.Create(filename) - if err != nil { - log.Fatalf("[JobSpec] Couldn't create yaml file!!\n") - return err } - defer f.Close() - - _, err = f.Write(yamlbytes) - if err != nil { - log.Fatalf("[JobSpec] Couldn't write yaml file!!\n") - return err - } - - _, err = f.WriteString("\n") - if err != nil { - log.Fatalf("[JobSpec] Couldn't write yaml file!!\n") - return err - } - return nil -} - -func toGB(bytes int64) int64 { - res := float64(bytes) / math.Pow(10, 9) - return int64(res) + } + return slotResource } diff --git a/src/fluence/jobspec/types.go b/src/fluence/jobspec/types.go index 9f4e4fc..8d6d06f 100644 --- a/src/fluence/jobspec/types.go +++ b/src/fluence/jobspec/types.go @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + package jobspec type Version struct { diff --git a/src/fluence/utils/utils.go b/src/fluence/utils/utils.go index 2d6d932..aadcb41 100644 --- a/src/fluence/utils/utils.go +++ b/src/fluence/utils/utils.go @@ -4,7 +4,6 @@ import ( "context" "fmt" - // "strings" "encoding/json" "github.com/flux-framework/flux-k8s/flux-plugin/fluence/jgf" @@ -17,7 +16,8 @@ import ( resourcehelper "k8s.io/kubectl/pkg/util/resource" ) -func CreateJGF(filename string, label *string) error { +// CreateJGF creates the Json Graph Format +func CreateJGF(filename string, skipLabel *string) error { ctx := context.Background() config, err := rest.InClusterConfig() if err != nil { @@ -34,6 +34,9 @@ func CreateJGF(filename string, label *string) error { var fluxgraph jgf.Fluxjgf fluxgraph = jgf.InitJGF() + + // TODO it looks like we can add more to the graph here - + // let's remember to consider what else we can. // subnets := make(map[string]string) cluster := fluxgraph.MakeCluster("k8scluster") @@ -49,10 +52,14 @@ func CreateJGF(filename string, label *string) error { var totalAllocCpu, totalmem int64 totalAllocCpu = 0 sdnCount := 0 + for node_index, node := range nodes.Items { + + // Question from V: what was this for (what is a worker)? // _, worker := node.Labels["node-role.kubernetes.io/worker"] - if *label != "" { - _, fluxnode := node.Labels[*label] + + if *skipLabel != "" { + _, fluxnode := node.Labels[*skipLabel] if !fluxnode { fmt.Println("Skipping node ", node.GetName()) continue @@ -71,6 +78,7 @@ func CreateJGF(filename string, label *string) error { if err != nil { return err } + // fmt.Println("Node ", node.GetName(), " has pods ", pods) // Check if subnet already exists // Here we build subnets according to topology.kubernetes.io/zone label @@ -128,6 +136,9 @@ func CreateJGF(filename string, label *string) error { core := fluxgraph.MakeCore(index, "core") fluxgraph.MakeEdge(workernode, core, "contains") // workernode was socket fluxgraph.MakeEdge(core, workernode, "in") + + // Question from Vanessa: + // How can we get here and have vcores ever not equal to zero? 
if vcores == 0 { fluxgraph.MakeNFDProperties(core, index, "cpu-", &node.Labels) // fluxgraph.MakeNFDProperties(core, index, "netmark-", &node.Labels) @@ -190,30 +201,33 @@ type allocation struct { CoreCount int } -func ParseAllocResult(allocated string) []allocation { +// ParseAllocResult takes an allocated (string) and parses into a list of allocation +// We include the pod namespace/name for debugging later +func ParseAllocResult(allocated, podName string) []allocation { var dat map[string]interface{} - result := make([]allocation, 0) + result := []allocation{} + + // Keep track of total core count across allocated corecount := 0 + + // This should not happen - the string we get back should parse. if err := json.Unmarshal([]byte(allocated), &dat); err != nil { panic(err) } - // fmt.Println("PRINTING DATA:\n", dat) - // graph := dat["graph"] - // fmt.Println("GET GRAPH:\n ", graph) + + // Parse graph and nodes into interfaces + // TODO look at github.com/mitchellh/mapstructure + // that might make this easier nodes := dat["graph"].(interface{}) str1 := nodes.(map[string]interface{}) - // fmt.Println("GET NODES:\n", str1["nodes"]) str2 := str1["nodes"].([]interface{}) - // fmt.Println("NODES:\n", len(str2)) + for _, item := range str2 { - // fmt.Println("ITEM: ", item) str1 = item.(map[string]interface{}) metadata := str1["metadata"].(map[string]interface{}) - // fmt.Println("TYPE: ", metadata["type"]) if metadata["type"].(string) == "core" { corecount = corecount + 1 } - // fmt.Println("BASENAME: ", metadata["basename"]) if metadata["type"].(string) == "node" { result = append(result, allocation{ Type: metadata["type"].(string), @@ -221,18 +235,16 @@ func ParseAllocResult(allocated string) []allocation { Basename: metadata["basename"].(string), CoreCount: corecount, }) + + // Reset the corecount once we've added to a node corecount = 0 - // result.Type = metadata["type"].(string) - // result.Name = metadata["name"].(string) - // result.Basename = metadata["basename"].(string) - // return result } } - fmt.Println("FINAL NODE RESULT:\n", result) + fmt.Printf("Final node result for %s: %s\n", podName, result) return result } -// //// Utility functions +// Utility functions func PrintOutput(reserved bool, allocated string, at int64, overhead float64, jobid uint64, fluxerr error) { fmt.Println("\n\t----Match Allocate output---") fmt.Printf("jobid: %d\nreserved: %t\nallocated: %s\nat: %d\noverhead: %f\nerror: %w\n", jobid, reserved, allocated, at, overhead, fluxerr) From 7b9c47083cfd73b51a4c6140b1a8247bd311a11d Mon Sep 17 00:00:00 2001 From: vsoch Date: Mon, 15 Jan 2024 07:31:38 -0700 Subject: [PATCH 04/28] fix: use podgroup millisecond precision timestamp Problem: the podgroups with second precision have interleaving Solution: try to create an internal representation with better precision. This looks promising with early testing, but I need to consider the edge cases and how to clean up the groups, otherwise a pod group might be re-created later and still in the cache. 
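
A rough sketch of the idea (illustrative only; it assumes metav1 is
k8s.io/apimachinery/pkg/apis/meta/v1, which the patch uses below): each group
records metav1.NowMicro() when it is first seen, and sorting compares those
microsecond timestamps before falling back to namespace/name.

    created1 := metav1.NowMicro()
    created2 := metav1.NowMicro()
    if created1.Equal(&created2) {
        // same microsecond: fall back to namespace/name ordering
    } else if created1.Before(&created2) {
        // the first group sorts (and therefore schedules) first
    }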
Signed-off-by: vsoch --- examples/indexed-jobs/job1.yaml | 4 +- .../pkg/fluence/core/core.go | 173 ++++++++++++++---- sig-scheduler-plugins/pkg/fluence/fluence.go | 167 ++++++++--------- sig-scheduler-plugins/pkg/fluence/group.go | 101 ++++++++++ 4 files changed, 324 insertions(+), 121 deletions(-) create mode 100644 sig-scheduler-plugins/pkg/fluence/group.go diff --git a/examples/indexed-jobs/job1.yaml b/examples/indexed-jobs/job1.yaml index 5778bc3..609e843 100644 --- a/examples/indexed-jobs/job1.yaml +++ b/examples/indexed-jobs/job1.yaml @@ -3,8 +3,8 @@ kind: Job metadata: name: job-1 spec: - completions: 10 - parallelism: 10 + completions: 5 + parallelism: 5 completionMode: Indexed template: metadata: diff --git a/sig-scheduler-plugins/pkg/fluence/core/core.go b/sig-scheduler-plugins/pkg/fluence/core/core.go index 5914441..8e209ea 100644 --- a/sig-scheduler-plugins/pkg/fluence/core/core.go +++ b/sig-scheduler-plugins/pkg/fluence/core/core.go @@ -3,81 +3,190 @@ package core import ( "fmt" - "k8s.io/klog/v2" + v1 "k8s.io/api/core/v1" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/kubernetes/pkg/scheduler/framework" pb "sigs.k8s.io/scheduler-plugins/pkg/fluence/fluxcli-grpc" ) +// FluxStateData is a CycleState +// It holds the PodCache for a pod, which has node assignment, group, and group size +// We also save the group name and size, and time created, in case we want to (somehow) resume scheduling +// In practice I'm not sure how CycleState objects are dumped and loaded. Kueue has a dumper :P +// https://github.com/kubernetes/enhancements/blob/master/keps/sig-scheduling/624-scheduling-framework/README.md#cyclestate type FluxStateData struct { - NodeName string + NodeCache NodeCache } +// Clone is required for CycleState plugins func (s *FluxStateData) Clone() framework.StateData { - clone := &FluxStateData{ - NodeName: s.NodeName, + return &FluxStateData{NodeCache: s.NodeCache} +} + +// NewFluxState creates an entry for the CycleState with the minimum that we might need +func NewFluxState(nodeName string, groupName string, size int32) *FluxStateData { + cache := NodeCache{ + NodeName: nodeName, + GroupName: groupName, + MinGroupSize: size, } - return clone + return &FluxStateData{NodeCache: cache} } -type NodePodsCount struct { +// NodeCache holds the node name and tasks for the node +// For the PodGroupCache, these are organized by group name, +// and there is a list of them +type NodeCache struct { NodeName string - Count int + + // This is derived from tasks, where + // task is an allocation to some node + // High level it is most often akin to the + // number of pods on the node. I'm not sure that I understand this + // https://github.com/flux-framework/flux-k8s/blob/9f24f36752e3cced1b1112d93bfa366fb58b3c84/src/fluence/fluxion/fluxion.go#L94-L97 + // How does that relate to a single pod? It is called "Count" in other places + Tasks int + + // These fields are primarily for the FluxStateData + // Without a PodGroup CRD we keep min size here + MinGroupSize int32 + GroupName string } -var podgroupMap map[string][]NodePodsCount +// A pod group cache holds a list of nodes for an allocation, where each has some number of tasks +// along with the expected group size. This is intended to replace PodGroup +// given the group name, size (derived from annotations) and timestamp +type PodGroupCache struct { + + // This is a cache of nodes for pods + Nodes []NodeCache + Size int32 + Name string + // Keep track of when the group was initially created! 
+ // This is like, the main thing we need. + TimeCreated metav1.MicroTime +} + +// Memory cache of pod group name to pod group cache, above +var podGroupCache map[string]*PodGroupCache + +// Init populates the podGroupCache func Init() { - podgroupMap = make(map[string][]NodePodsCount, 0) + podGroupCache = map[string]*PodGroupCache{} } -func (n *NodePodsCount) Clone() framework.StateData { - return &NodePodsCount{ - NodeName: n.NodeName, - Count: n.Count, +// RegisterPodGroup ensures that the PodGroup exists in the cache +// This is an experimental replacement for an actual PodGroup +// We take a timestampo, which if called from Less (during sorting) is tiem.Time +// if called later (an individual pod) we go for its creation timestamp +func RegisterPodGroup(pod *v1.Pod, groupName string, groupSize int32) error { + entry, ok := podGroupCache[groupName] + + if !ok { + + // Assume we create the group with the timestamp + // of the first pod seen. There might be imperfections + // by the second, but as long as we sort them via millisecond + // this should prevent interleaving + nodes := []NodeCache{} + + // Create the new entry for the pod group + entry = &PodGroupCache{ + Name: groupName, + Size: groupSize, + Nodes: nodes, + TimeCreated: metav1.NowMicro(), + } + + // Tell the user when it was created + fmt.Printf("Pod group %s was created at %s\n", entry.Name, entry.TimeCreated) + } + + // If the size has changed, we currently do not allow updating it. + // We issue a warning. In the future this could be supported with a grow command. + if entry.Size != groupSize { + fmt.Printf("Pod group %s request to change size from %s to %s is not yet supported", groupName, entry.Size, groupSize) + // entry.GroupSize = groupSize } + podGroupCache[groupName] = entry + return nil +} + +// GetPodGroup gets a pod group in the cache by name +func GetPodGroup(groupName string) *PodGroupCache { + entry, _ := podGroupCache[groupName] + return entry +} + +// DeletePodGroup deletes a pod from the group cache +func DeletePodGroup(groupName string) { + delete(podGroupCache, groupName) } -func CreateNodePodsList(nodelist []*pb.NodeAlloc, pgname string) (nodepods []NodePodsCount) { - nodepods = make([]NodePodsCount, len(nodelist)) +// CreateNodePodsList creates a list of node pod caches +func CreateNodePodsList(nodelist []*pb.NodeAlloc, groupName string) (nodepods []NodeCache) { + + // Create a pod cache for each node + nodepods = make([]NodeCache, len(nodelist)) + for i, v := range nodelist { - nodepods[i] = NodePodsCount{ + nodepods[i] = NodeCache{ NodeName: v.GetNodeID(), - Count: int(v.GetTasks()), + Tasks: int(v.GetTasks()), } } - podgroupMap[pgname] = nodepods - klog.Info("MAP ", podgroupMap) - return + // Update the pods in the PodGraphCache + updatePodGroupNodes(groupName, nodepods) + fmt.Printf("Pod Group Cache ", podGroupCache) + return nodepods +} + +// updatePodGroupList updates the PodGroupCache with a listing of nodes +func updatePodGroupNodes(groupName string, nodes []NodeCache) { + group := podGroupCache[groupName] + group.Nodes = nodes + podGroupCache[groupName] = group +} + +// HavePodNodes returns true if the listing of pods is not empty +// This should be all pods that are needed - the allocation will not +// be successful otherwise, so we just check > 0 +func (p *PodGroupCache) HavePodNodes() bool { + return len(p.Nodes) > 0 } -func HaveList(pgname string) bool { - _, exists := podgroupMap[pgname] - return exists +// CancelAllocation resets the node cache and allocation status +func (p 
*PodGroupCache) CancelAllocation() { + p.Nodes = []NodeCache{} } -func GetNextNode(pgname string) (string, error) { - entry, ok := podgroupMap[pgname] +// GetNextNode gets the next available node we can allocate for a group +func GetNextNode(groupName string) (string, error) { + entry, ok := podGroupCache[groupName] if !ok { err := fmt.Errorf("Map is empty") return "", err } - if len(entry) == 0 { + if len(entry.Nodes) == 0 { err := fmt.Errorf("Error while getting a node") return "", err } - nodename := entry[0].NodeName + nodename := entry.Nodes[0].NodeName - if entry[0].Count == 1 { - slice := entry[1:] + if entry.Nodes[0].Tasks == 1 { + slice := entry.Nodes[1:] if len(slice) == 0 { - delete(podgroupMap, pgname) + delete(podGroupCache, groupName) return nodename, nil } - podgroupMap[pgname] = slice + updatePodGroupNodes(groupName, slice) return nodename, nil } - entry[0].Count = entry[0].Count - 1 + entry.Nodes[0].Tasks = entry.Nodes[0].Tasks - 1 return nodename, nil } diff --git a/sig-scheduler-plugins/pkg/fluence/fluence.go b/sig-scheduler-plugins/pkg/fluence/fluence.go index 32fd513..a23f4a0 100644 --- a/sig-scheduler-plugins/pkg/fluence/fluence.go +++ b/sig-scheduler-plugins/pkg/fluence/fluence.go @@ -125,9 +125,6 @@ func New(_ runtime.Object, handle framework.Handle) (framework.Plugin, error) { ) f.pgMgr = pgMgr - // Save the podLister to fluence to easily query for the group - f.podLister = podInformer.Lister() - // stopCh := make(chan struct{}) // defer close(stopCh) // informerFactory.Start(stopCh) @@ -148,29 +145,37 @@ func New(_ runtime.Object, handle framework.Handle) (framework.Plugin, error) { // 2. Compare the initialization timestamps of fluence pod groups // 3. Fall back, sort by namespace/name // See https://kubernetes.io/docs/concepts/scheduling-eviction/scheduling-framework/ +// Less is part of Sort, which is the earliest we can see a pod unless we use gate +// IMPORTANT: Less sometimes is not called for smaller sizes, not sure why. +// To get around this we call it during PreFilter too. func (f *Fluence) Less(podInfo1, podInfo2 *framework.QueuedPodInfo) bool { klog.Infof("ordering pods from Coscheduling") + + // ensure we have a PodGroup no matter what + klog.Infof("ensuring fluence groups") + podGroup1 := f.ensureFluenceGroup(podInfo1.Pod) + podGroup2 := f.ensureFluenceGroup(podInfo2.Pod) + + // First preference to priority, but only if they are different prio1 := corev1helpers.PodPriority(podInfo1.Pod) prio2 := corev1helpers.PodPriority(podInfo2.Pod) + + // ...and only allow this to sort if they aren't the same + // The assumption here is that pods with priority are ignored by fluence if prio1 != prio2 { return prio1 > prio2 } - creationTime1 := f.pgMgr.GetCreationTimestamp(podInfo1.Pod, *podInfo1.InitialAttemptTimestamp) - creationTime2 := f.pgMgr.GetCreationTimestamp(podInfo2.Pod, *podInfo2.InitialAttemptTimestamp) - if creationTime1.Equal(creationTime2) { - return coschedulingcore.GetNamespacedName(podInfo1.Pod) < coschedulingcore.GetNamespacedName(podInfo2.Pod) - } - return creationTime1.Before(creationTime2) -} -// getPodGroup gets the group information from the pod group manager -// to determine if a pod is in a group. We return the group -func (f *Fluence) getPodGroup(ctx context.Context, pod *v1.Pod) (string, *sched.PodGroup) { - pgName, pg := f.pgMgr.GetPodGroup(ctx, pod) - if pg == nil { - klog.InfoS("Not in group", "pod", klog.KObj(pod)) + // Fluence can only compare if we have two known groups. 
+ // This tries for that first, and falls back to the initial attempt timestamp + creationTime1 := f.getCreationTimestamp(podGroup1, podInfo1) + creationTime2 := f.getCreationTimestamp(podGroup2, podInfo2) + + // If they are the same, fall back to sorting by name. + if creationTime1.Equal(&creationTime2) { + return coschedulingcore.GetNamespacedName(podInfo1.Pod) < coschedulingcore.GetNamespacedName(podInfo2.Pod) } - return pgName, pg + return creationTime1.Before(&creationTime2) } // PreFilter checks info about the Pod / checks conditions that the cluster or the Pod must meet. @@ -181,47 +186,38 @@ func (f *Fluence) PreFilter( pod *v1.Pod, ) (*framework.PreFilterResult, *framework.Status) { - var ( - err error - nodename string - ) klog.Infof("Examining the pod") - // Get the pod group name and group - groupName, pg := f.getPodGroup(ctx, pod) - klog.Infof("group name is %s", groupName) + // groupName will be named according to the single pod namespace / pod if there wasn't + // a user defined group. This is a size 1 group we handle equivalently. + pg := f.getPodsGroup(pod) - // Case 1: We have a pod group - if pg != nil { + klog.Infof("The group size %d", pg.Size) + klog.Infof("group name is %s", pg.Name) - // We have not yet derived a node list - if !fcore.HaveList(groupName) { - klog.Infof("Getting a pod group") - if _, err = f.AskFlux(ctx, pod, int(pg.Spec.MinMember)); err != nil { - return nil, framework.NewStatus(framework.Unschedulable, err.Error()) - } - } - nodename, err = fcore.GetNextNode(groupName) - klog.Infof("Node Selected %s (%s:%s)", nodename, pod.Name, groupName) - if err != nil { - return nil, framework.NewStatus(framework.Unschedulable, err.Error()) - } - - } else { - - // Case 2: no group, a faux group of a lonely 1 :( - nodename, err = f.AskFlux(ctx, pod, 1) + // Note that it is always the case we have a group + // We have not yet derived a node list + if !pg.HavePodNodes() { + klog.Infof("Getting a pod group") + err := f.AskFlux(ctx, pod, int(pg.Size)) if err != nil { return nil, framework.NewStatus(framework.Unschedulable, err.Error()) } } + nodename, err := fcore.GetNextNode(pg.Name) + klog.Infof("Node Selected %s (%s:%s)", nodename, pod.Name, pg.Name) + if err != nil { + return nil, framework.NewStatus(framework.Unschedulable, err.Error()) + } // Create a fluxState (CycleState) with things that might be useful/ klog.Info("Node Selected: ", nodename) - state.Write(framework.StateKey(pod.Name), &fcore.FluxStateData{NodeName: nodename}) + cache := fcore.NodeCache{NodeName: nodename} + state.Write(framework.StateKey(pod.Name), &fcore.FluxStateData{NodeCache: cache}) return nil, framework.NewStatus(framework.Success, "") } +// TODO we need to account for affinity here func (f *Fluence) Filter( ctx context.Context, cycleState *framework.CycleState, @@ -231,10 +227,10 @@ func (f *Fluence) Filter( klog.Info("Filtering input node ", nodeInfo.Node().Name) if v, e := cycleState.Read(framework.StateKey(pod.Name)); e == nil { - if value, ok := v.(*fcore.FluxStateData); ok && value.NodeName != nodeInfo.Node().Name { + if value, ok := v.(*fcore.FluxStateData); ok && value.NodeCache.NodeName != nodeInfo.Node().Name { return framework.NewStatus(framework.Unschedulable, "pod is not permitted") } else { - klog.Info("Filter: node selected by Flux ", value.NodeName) + klog.Infof("Filter: node %s selected for %s\n", value.NodeCache.NodeName, pod.Name) } } return framework.NewStatus(framework.Success) @@ -247,7 +243,7 @@ func (f *Fluence) PreFilterExtensions() 
framework.PreFilterExtensions { } // AskFlux will ask flux for an allocation for nodes for the pod group. -func (f *Fluence) AskFlux(ctx context.Context, pod *v1.Pod, count int) (string, error) { +func (f *Fluence) AskFlux(ctx context.Context, pod *v1.Pod, count int) error { // clean up previous match if a pod has already allocated previously f.mutex.Lock() _, isPodAllocated := f.podNameToJobId[pod.Name] @@ -256,7 +252,7 @@ func (f *Fluence) AskFlux(ctx context.Context, pod *v1.Pod, count int) (string, if isPodAllocated { klog.Info("Clean up previous allocation") f.mutex.Lock() - f.cancelFluxJobForPod(pod.Name) + f.cancelFluxJobForPod(pod) f.mutex.Unlock() } @@ -265,7 +261,7 @@ func (f *Fluence) AskFlux(ctx context.Context, pod *v1.Pod, count int) (string, if err != nil { klog.Errorf("[FluxClient] Error connecting to server: %v", err) - return "", err + return err } defer conn.Close() @@ -278,47 +274,38 @@ func (f *Fluence) AskFlux(ctx context.Context, pod *v1.Pod, count int) (string, Request: "allocate", Count: int32(count)} - r, err2 := grpcclient.Match(context.Background(), request) - if err2 != nil { - klog.Errorf("[FluxClient] did not receive any match response: %v", err2) - return "", err + // Question from vsoch; Why return err instead of err2 here? + // err would return a nil value, but we need to return non nil, + // otherwise it's going to try to use the allocation (but there is none) + r, err := grpcclient.Match(context.Background(), request) + if err != nil { + klog.Errorf("[FluxClient] did not receive any match response: %v", err) + return err } klog.Infof("[FluxClient] response podID %s", r.GetPodID()) - _, pg := f.getPodGroup(ctx, pod) + // Presence of a podGroup is indicated by a groupName + // Flag that the group is allocated (yes we also have the job id, testing for now) + pg := f.getPodsGroup(pod) - if count > 1 || pg != nil { - pgFullName, _ := f.pgMgr.GetPodGroup(ctx, pod) - nodelist := fcore.CreateNodePodsList(r.GetNodelist(), pgFullName) - klog.Infof("[FluxClient] response nodeID %s", r.GetNodelist()) - klog.Info("[FluxClient] Parsed Nodelist ", nodelist) - jobid := uint64(r.GetJobID()) + nodelist := fcore.CreateNodePodsList(r.GetNodelist(), pg.Name) + klog.Infof("[FluxClient] response nodeID %s", r.GetNodelist()) + klog.Info("[FluxClient] Parsed Nodelist ", nodelist) + jobid := uint64(r.GetJobID()) - f.mutex.Lock() - f.podNameToJobId[pod.Name] = jobid - klog.Info("Check job set: ", f.podNameToJobId) - f.mutex.Unlock() - } else { - nodename := r.GetNodelist()[0].GetNodeID() - jobid := uint64(r.GetJobID()) - - f.mutex.Lock() - f.podNameToJobId[pod.Name] = jobid - klog.Info("Check job set: ", f.podNameToJobId) - f.mutex.Unlock() - - return nodename, nil - } - - return "", nil + f.mutex.Lock() + f.podNameToJobId[pod.Name] = jobid + klog.Info("Check job set: ", f.podNameToJobId) + f.mutex.Unlock() + return nil } // cancelFluxJobForPod cancels the flux job for a pod. 
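+// It is invoked from AskFlux when a pod already holds an allocation, and from
+// the update/delete pod handlers below once a pod has succeeded, failed, or
+// been removed.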
-func (f *Fluence) cancelFluxJobForPod(podName string) error { - jobid := f.podNameToJobId[podName] +func (f *Fluence) cancelFluxJobForPod(pod *v1.Pod) error { + jobid := f.podNameToJobId[pod.Name] - klog.Infof("Cancel flux job: %v for pod %s", jobid, podName) + klog.Infof("Cancel flux job: %v for pod %s", jobid, pod.Name) start := time.Now() @@ -345,15 +332,19 @@ func (f *Fluence) cancelFluxJobForPod(podName string) error { } if res.Error == 0 { - delete(f.podNameToJobId, podName) + delete(f.podNameToJobId, pod.Name) } else { - klog.Warningf("Failed to delete pod %s from the podname-jobid map.", podName) + klog.Warningf("Failed to delete pod %s from the podname-jobid map.", pod.Name) } + // If we are successful, clear the group allocated nodes + pg := f.getPodsGroup(pod) + pg.CancelAllocation() + elapsed := metrics.SinceInSeconds(start) klog.Info("Time elapsed (Cancel Job) :", elapsed) - klog.Infof("Job cancellation for pod %s result: %d", podName, err) + klog.Infof("Job cancellation for pod %s result: %d", pod.Name, err) if klog.V(2).Enabled() { klog.Info("Check job set: after delete") klog.Info(f.podNameToJobId) @@ -380,7 +371,7 @@ func (f *Fluence) updatePod(oldObj, newObj interface{}) { defer f.mutex.Unlock() if _, ok := f.podNameToJobId[newPod.Name]; ok { - f.cancelFluxJobForPod(newPod.Name) + f.cancelFluxJobForPod(newPod) } else { klog.Infof("Succeeded pod %s/%s doesn't have flux jobid", newPod.Namespace, newPod.Name) } @@ -392,7 +383,7 @@ func (f *Fluence) updatePod(oldObj, newObj interface{}) { defer f.mutex.Unlock() if _, ok := f.podNameToJobId[newPod.Name]; ok { - f.cancelFluxJobForPod(newPod.Name) + f.cancelFluxJobForPod(newPod) } else { klog.Errorf("Failed pod %s/%s doesn't have flux jobid", newPod.Namespace, newPod.Name) } @@ -403,6 +394,8 @@ func (f *Fluence) updatePod(oldObj, newObj interface{}) { } } +// deletePod handles the delete event handler +// TODO when should we clear group from the cache? func (f *Fluence) deletePod(podObj interface{}) { klog.Info("Delete Pod event handler") @@ -417,7 +410,7 @@ func (f *Fluence) deletePod(podObj interface{}) { defer f.mutex.Unlock() if _, ok := f.podNameToJobId[pod.Name]; ok { - f.cancelFluxJobForPod(pod.Name) + f.cancelFluxJobForPod(pod) } else { klog.Infof("Terminating pod %s/%s doesn't have flux jobid", pod.Namespace, pod.Name) } @@ -426,7 +419,7 @@ func (f *Fluence) deletePod(podObj interface{}) { defer f.mutex.Unlock() if _, ok := f.podNameToJobId[pod.Name]; ok { - f.cancelFluxJobForPod(pod.Name) + f.cancelFluxJobForPod(pod) } else { klog.Infof("Deleted pod %s/%s doesn't have flux jobid", pod.Namespace, pod.Name) } diff --git a/sig-scheduler-plugins/pkg/fluence/group.go b/sig-scheduler-plugins/pkg/fluence/group.go new file mode 100644 index 0000000..a2597eb --- /dev/null +++ b/sig-scheduler-plugins/pkg/fluence/group.go @@ -0,0 +1,101 @@ +package fluence + +import ( + "fmt" + "strconv" + + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/klog/v2" + "k8s.io/kubernetes/pkg/scheduler/framework" + + fcore "sigs.k8s.io/scheduler-plugins/pkg/fluence/core" +) + +const ( + PodGroupNameLabel = "fluence.pod-group" + PodGroupSizeLabel = "fluence.group-size" +) + +// getDefaultGroupName returns a group name based on the pod namespace and name +// We could do this for pods that are not labeled, and treat them as a size 1 group +func (f *Fluence) getDefaultGroupName(pod *v1.Pod) string { + return fmt.Sprintf("%s-%s", pod.Namespace, pod.Name) +} + +// getPodsGroup gets the pods group, if it exists. 
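+// A group is identified by the fluence.pod-group label (with an optional
+// fluence.group-size) defined above. For example (illustrative), a pod labeled
+// fluence.pod-group=my-group and fluence.group-size="4" lands in a size-4
+// group named "my-group", while an unlabeled pod falls back to a size-1 group
+// named <namespace>-<name>.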
+func (f *Fluence) getPodsGroup(pod *v1.Pod) *fcore.PodGroupCache { + groupName := f.ensureFluenceGroup(pod) + return fcore.GetPodGroup(groupName) +} + +// ensureFluenceGroup ensure that a podGroup is created for the named fluence group +// Preference goes to the traditional PodGroup (created by the user) +// and falls back to having one created by fluence. If there is no PodGroup +// created and no fluence annotation, we do not create the group. +// Likely for fluence we'd want a cleanup function somehow too, +// for now assume groups are unique by name. +func (f *Fluence) ensureFluenceGroup(pod *v1.Pod) string { + + // Get the group name and size from the fluence labels + groupName := f.getFluenceGroupName(pod) + groupSize := f.getFluenceGroupSize(pod) + + // If there isn't a group, make a single node sized group + // This is so we can always treat the cases equally + if groupName == "" { + groupName = f.getDefaultGroupName(pod) + } + klog.Infof("group name for %s is %s", pod.Name, groupName) + klog.Infof("group size for %s is %d", pod.Name, groupSize) + + // Register the pod group (with the pod) in our cache + fcore.RegisterPodGroup(pod, groupName, groupSize) + return groupName +} + +// deleteFluenceGroup ensures the pod group is deleted, if it exists +func (f *Fluence) deleteFluenceGroup(pod *v1.Pod) { + + // Get the group name and size from the fluence labels + pg := f.getPodsGroup(pod) + fcore.DeletePodGroup(pg.Name) +} + +// getFluenceGroupName looks for the group to indicate a fluence group, and returns it +func (f *Fluence) getFluenceGroupName(pod *v1.Pod) string { + groupName, _ := pod.Labels[PodGroupNameLabel] + return groupName +} + +// getFluenceGroupSize gets the size of the fluence group +func (f *Fluence) getFluenceGroupSize(pod *v1.Pod) int32 { + size, _ := pod.Labels[PodGroupSizeLabel] + + // Default size of 1 if the label is not set (but name is) + if size == "" { + return 1 + } + + // We don't want the scheduler to fail if someone puts a value for size + // that doesn't convert nicely. They can find this in the logs. + intSize, err := strconv.ParseUint(size, 10, 32) + if err != nil { + klog.Error("Parsing integer size for pod group") + } + return int32(intSize) +} + +// getCreationTimestamp first tries the fluence group, then falls back to the initial attempt timestamp +func (f *Fluence) getCreationTimestamp(groupName string, podInfo *framework.QueuedPodInfo) metav1.MicroTime { + pg := fcore.GetPodGroup(groupName) + + // IsZero is an indicator if this was actually set + // If the group label was present and we have a group, this will be true + if !pg.TimeCreated.IsZero() { + klog.Infof("pod group %s was created at %s\n", groupName, pg.TimeCreated) + return pg.TimeCreated + } + // We should actually never get here. 
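To make the label contract above concrete, here is a minimal sketch (not part of the patch) of a plain Pod that opts into a fluence group purely through these labels; the group name, size, and image are illustrative placeholders, and `schedulerName: fluence` follows the pattern used in the lammps examples later in this series.

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: demo-worker-0                # illustrative name
  labels:
    fluence.pod-group: demo-group    # PodGroupNameLabel
    fluence.group-size: "4"          # PodGroupSizeLabel; a string, defaults to 1 when only the group name is set
spec:
  schedulerName: fluence
  containers:
    - name: app
      image: busybox                 # placeholder image
      command: ["sleep", "3600"]
```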
+ return metav1.NewMicroTime(*podInfo.InitialAttemptTimestamp) +} From d6949a0703bd05769e1e9aac1f783bd774cfa844 Mon Sep 17 00:00:00 2001 From: vsoch Date: Mon, 15 Jan 2024 19:33:29 -0700 Subject: [PATCH 05/28] logs: more for various steps to see what is going on Signed-off-by: vsoch --- .../templates/deployment.yaml | 2 +- .../charts/as-a-second-scheduler/values.yaml | 1 + .../pkg/fluence/core/core.go | 15 ++- sig-scheduler-plugins/pkg/fluence/fluence.go | 99 +++++++++---------- sig-scheduler-plugins/pkg/fluence/group.go | 10 +- src/fluence/fluxion/fluxion.go | 19 ++-- src/fluence/utils/utils.go | 17 +++- 7 files changed, 89 insertions(+), 74 deletions(-) diff --git a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/deployment.yaml b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/deployment.yaml index 8a73245..ffc3ce7 100644 --- a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/deployment.yaml +++ b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/deployment.yaml @@ -49,7 +49,7 @@ spec: - command: - /bin/kube-scheduler - --config=/etc/kubernetes/scheduler-config.yaml - - -v=9 + - -v={{ .Values.scheduler.loggingLevel }} image: {{ .Values.scheduler.image }} imagePullPolicy: {{ .Values.scheduler.pullPolicy }} livenessProbe: diff --git a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/values.yaml b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/values.yaml index 1ae99f9..38da251 100644 --- a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/values.yaml +++ b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/values.yaml @@ -11,6 +11,7 @@ scheduler: policy: lonode pullPolicy: Always sidecarPullPolicy: Always + loggingLevel: "9" controller: name: scheduler-plugins-controller diff --git a/sig-scheduler-plugins/pkg/fluence/core/core.go b/sig-scheduler-plugins/pkg/fluence/core/core.go index 8e209ea..ddf8e4c 100644 --- a/sig-scheduler-plugins/pkg/fluence/core/core.go +++ b/sig-scheduler-plugins/pkg/fluence/core/core.go @@ -102,13 +102,13 @@ func RegisterPodGroup(pod *v1.Pod, groupName string, groupSize int32) error { } // Tell the user when it was created - fmt.Printf("Pod group %s was created at %s\n", entry.Name, entry.TimeCreated) + fmt.Printf("[Fluence] Pod group %s was created at %s\n", entry.Name, entry.TimeCreated) } // If the size has changed, we currently do not allow updating it. // We issue a warning. In the future this could be supported with a grow command. 
if entry.Size != groupSize { - fmt.Printf("Pod group %s request to change size from %s to %s is not yet supported", groupName, entry.Size, groupSize) + fmt.Printf("[Fluence] Pod group %s request to change size from %s to %s is not yet supported\n", groupName, entry.Size, groupSize) // entry.GroupSize = groupSize } podGroupCache[groupName] = entry @@ -141,7 +141,7 @@ func CreateNodePodsList(nodelist []*pb.NodeAlloc, groupName string) (nodepods [] // Update the pods in the PodGraphCache updatePodGroupNodes(groupName, nodepods) - fmt.Printf("Pod Group Cache ", podGroupCache) + fmt.Printf("[Fluence] Pod group cache updated with nodes\n", podGroupCache) return nodepods } @@ -168,25 +168,30 @@ func (p *PodGroupCache) CancelAllocation() { func GetNextNode(groupName string) (string, error) { entry, ok := podGroupCache[groupName] if !ok { - err := fmt.Errorf("Map is empty") + err := fmt.Errorf("[Fluence] Map is empty\n") return "", err } if len(entry.Nodes) == 0 { - err := fmt.Errorf("Error while getting a node") + err := fmt.Errorf("[Fluence] Error while getting a node\n") return "", err } nodename := entry.Nodes[0].NodeName + fmt.Printf("[Fluence] Next node for group %s is %s", groupName, nodename) if entry.Nodes[0].Tasks == 1 { + fmt.Println("[Fluence] First node has one task") slice := entry.Nodes[1:] if len(slice) == 0 { + fmt.Printf("[Fluence] After this node, the slice is empty, deleting group %s from cache\n", groupName) delete(podGroupCache, groupName) return nodename, nil } + fmt.Println("[Fluence] After this node, the slide still has nodes") updatePodGroupNodes(groupName, slice) return nodename, nil } + fmt.Println("[Fluence] Subtracting one task from first node") entry.Nodes[0].Tasks = entry.Nodes[0].Tasks - 1 return nodename, nil } diff --git a/sig-scheduler-plugins/pkg/fluence/fluence.go b/sig-scheduler-plugins/pkg/fluence/fluence.go index a23f4a0..bbe6cee 100644 --- a/sig-scheduler-plugins/pkg/fluence/fluence.go +++ b/sig-scheduler-plugins/pkg/fluence/fluence.go @@ -34,9 +34,7 @@ import ( corev1helpers "k8s.io/component-helpers/scheduling/corev1" "k8s.io/klog/v2" "k8s.io/kubernetes/pkg/scheduler/framework" - "k8s.io/kubernetes/pkg/scheduler/metrics" - corelisters "k8s.io/client-go/listers/core/v1" "sigs.k8s.io/controller-runtime/pkg/client" sched "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1" coschedulingcore "sigs.k8s.io/scheduler-plugins/pkg/coscheduling/core" @@ -51,9 +49,6 @@ type Fluence struct { client client.Client podNameToJobId map[string]uint64 pgMgr coschedulingcore.Manager - - // The pod group manager has a lister, but it's private - podLister corelisters.PodLister } // Name is the name of the plugin used in the Registry and configurations. 
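As an aside, the contract implied by `CreateNodePodsList` and `GetNextNode` can be sketched as below. This is illustration only, not patch code: it assumes it is compiled inside the scheduler-plugins tree where these packages resolve, the group and pod names are made up, and the empty node list simply exercises the error path rather than a real Fluxion allocation.

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	fcore "sigs.k8s.io/scheduler-plugins/pkg/fluence/core"
	pb "sigs.k8s.io/scheduler-plugins/pkg/fluence/fluxcli-grpc"
)

func main() {
	// The plugin initializes the group cache once in New(); do the same here.
	fcore.Init()

	// In the plugin the group is registered from the pod labels (ensureFluenceGroup);
	// here we register a toy group of size 2 directly.
	groupName := "example-group"
	pod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "example-0", Namespace: "default"}}
	fcore.RegisterPodGroup(pod, groupName, 2)

	// Record the node/task pairs for the group (the plugin does this in AskFlux with
	// the nodelist from the Match response); an empty list just exercises the error path.
	fcore.CreateNodePodsList([]*pb.NodeAlloc{}, groupName)

	// Each pod in the group then draws one task slot from the cache: GetNextNode pops
	// a task from the first node, drops the node when its tasks are exhausted, and
	// deletes the group entry when nothing is left.
	for i := 0; i < 2; i++ {
		node, err := fcore.GetNextNode(groupName)
		if err != nil {
			fmt.Println("no cached allocation left:", err)
			return
		}
		fmt.Printf("pod %d of group %s -> node %s\n", i+1, groupName, node)
	}
}
```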
@@ -80,7 +75,6 @@ func New(_ runtime.Object, handle framework.Handle) (framework.Plugin, error) { f := &Fluence{handle: handle, podNameToJobId: make(map[string]uint64)} - klog.Info("Create plugin") ctx := context.TODO() fcore.Init() @@ -91,7 +85,6 @@ func New(_ runtime.Object, handle framework.Handle) (framework.Plugin, error) { }) go fluxPodsInformer.Run(ctx.Done()) - klog.Info("Create generic pod informer") scheme := runtime.NewScheme() clientscheme.AddToScheme(scheme) @@ -136,7 +129,7 @@ func New(_ runtime.Object, handle framework.Handle) (framework.Plugin, error) { return nil, err } - klog.Info("Fluence start") + klog.Info("Fluence scheduler plugin started") return f, nil } @@ -149,10 +142,10 @@ func New(_ runtime.Object, handle framework.Handle) (framework.Plugin, error) { // IMPORTANT: Less sometimes is not called for smaller sizes, not sure why. // To get around this we call it during PreFilter too. func (f *Fluence) Less(podInfo1, podInfo2 *framework.QueuedPodInfo) bool { - klog.Infof("ordering pods from Coscheduling") + klog.Infof("[Fluence] Ordering pods in Less") // ensure we have a PodGroup no matter what - klog.Infof("ensuring fluence groups") + klog.Infof("[Fluence] Comparing %s and %s", podInfo1.Pod.Name, podInfo2.Pod.Name) podGroup1 := f.ensureFluenceGroup(podInfo1.Pod) podGroup2 := f.ensureFluenceGroup(podInfo2.Pod) @@ -186,21 +179,22 @@ func (f *Fluence) PreFilter( pod *v1.Pod, ) (*framework.PreFilterResult, *framework.Status) { - klog.Infof("Examining the pod") + klog.Infof("[Fluence] Examining pod %s", pod.Name) // groupName will be named according to the single pod namespace / pod if there wasn't // a user defined group. This is a size 1 group we handle equivalently. pg := f.getPodsGroup(pod) - klog.Infof("The group size %d", pg.Size) - klog.Infof("group name is %s", pg.Name) + klog.Infof("[Fluence] Pod %s group size %d", pod.Name, pg.Size) + klog.Infof("[Fluence] Pod %s group name is %s", pod.Name, pg.Name) // Note that it is always the case we have a group // We have not yet derived a node list if !pg.HavePodNodes() { - klog.Infof("Getting a pod group") + klog.Infof("[Fluence] Does not have nodes yet, asking Fluxion") err := f.AskFlux(ctx, pod, int(pg.Size)) if err != nil { + klog.Infof("[Fluence] Fluxion returned an error %s, not schedulable", err.Error()) return nil, framework.NewStatus(framework.Unschedulable, err.Error()) } } @@ -250,17 +244,18 @@ func (f *Fluence) AskFlux(ctx context.Context, pod *v1.Pod, count int) error { f.mutex.Unlock() if isPodAllocated { - klog.Info("Clean up previous allocation") + klog.Info("[Fluence] Pod %s is allocated, cleaning up previous allocation", pod.Name) f.mutex.Lock() f.cancelFluxJobForPod(pod) f.mutex.Unlock() } jobspec := utils.InspectPodInfo(pod) + klog.Infof("[Fluence] Inspect pod info, jobspec: %s", jobspec) conn, err := grpc.Dial("127.0.0.1:4242", grpc.WithInsecure()) if err != nil { - klog.Errorf("[FluxClient] Error connecting to server: %v", err) + klog.Errorf("[Fluence] Error connecting to server: %v", err) return err } defer conn.Close() @@ -279,24 +274,27 @@ func (f *Fluence) AskFlux(ctx context.Context, pod *v1.Pod, count int) error { // otherwise it's going to try to use the allocation (but there is none) r, err := grpcclient.Match(context.Background(), request) if err != nil { - klog.Errorf("[FluxClient] did not receive any match response: %v", err) + klog.Errorf("[Fluence] did not receive any match response: %v", err) return err } - klog.Infof("[FluxClient] response podID %s", r.GetPodID()) + 
klog.Infof("[Fluence] response podID %s", r.GetPodID()) // Presence of a podGroup is indicated by a groupName // Flag that the group is allocated (yes we also have the job id, testing for now) pg := f.getPodsGroup(pod) - nodelist := fcore.CreateNodePodsList(r.GetNodelist(), pg.Name) - klog.Infof("[FluxClient] response nodeID %s", r.GetNodelist()) - klog.Info("[FluxClient] Parsed Nodelist ", nodelist) + // Get the nodelist and inspect + nodes := r.GetNodelist() + klog.Infof("[Fluence] Nodelist returned from Fluxion: %s", nodes) + + nodelist := fcore.CreateNodePodsList(nodes, pg.Name) + klog.Infof("[Fluence] parsed node pods list %s", nodelist) jobid := uint64(r.GetJobID()) f.mutex.Lock() f.podNameToJobId[pod.Name] = jobid - klog.Info("Check job set: ", f.podNameToJobId) + klog.Info("[Fluence] Check job assignment: ", f.podNameToJobId) f.mutex.Unlock() return nil } @@ -305,14 +303,12 @@ func (f *Fluence) AskFlux(ctx context.Context, pod *v1.Pod, count int) error { func (f *Fluence) cancelFluxJobForPod(pod *v1.Pod) error { jobid := f.podNameToJobId[pod.Name] - klog.Infof("Cancel flux job: %v for pod %s", jobid, pod.Name) - - start := time.Now() + klog.Infof("[Fluence] Cancel flux job: %v for pod %s", jobid, pod.Name) conn, err := grpc.Dial("127.0.0.1:4242", grpc.WithInsecure()) if err != nil { - klog.Errorf("[FluxClient] Error connecting to server: %v", err) + klog.Errorf("[Fluence] Error connecting to server: %v", err) return err } defer conn.Close() @@ -321,43 +317,36 @@ func (f *Fluence) cancelFluxJobForPod(pod *v1.Pod) error { _, cancel := context.WithTimeout(context.Background(), 200*time.Second) defer cancel() - request := &pb.CancelRequest{ - JobID: int64(jobid), - } - + // I think this error reflects the success or failure of the cancel request + request := &pb.CancelRequest{JobID: int64(jobid)} res, err := grpcclient.Cancel(context.Background(), request) if err != nil { - klog.Errorf("[FluxClient] did not receive any cancel response: %v", err) + klog.Errorf("[Fluence] did not receive any cancel response: %v", err) return err } + klog.Infof("[Fluence] Job cancellation for pod %s result: %d", pod.Name, res.Error) + // And this error is if the cancel was successful or not if res.Error == 0 { + klog.Infof("[Fluence] Successful cancel of flux job: %v for pod %s", jobid, pod.Name) delete(f.podNameToJobId, pod.Name) - } else { - klog.Warningf("Failed to delete pod %s from the podname-jobid map.", pod.Name) - } - - // If we are successful, clear the group allocated nodes - pg := f.getPodsGroup(pod) - pg.CancelAllocation() - elapsed := metrics.SinceInSeconds(start) - klog.Info("Time elapsed (Cancel Job) :", elapsed) - - klog.Infof("Job cancellation for pod %s result: %d", pod.Name, err) - if klog.V(2).Enabled() { - klog.Info("Check job set: after delete") - klog.Info(f.podNameToJobId) + // If we are successful, clear the group allocated nodes + pg := f.getPodsGroup(pod) + pg.CancelAllocation() + } else { + klog.Warningf("[Fluence] Failed to cancel flux job %v for pod %s", jobid, pod.Name) } return nil } // EventHandlers updatePod handles cleaning up resources func (f *Fluence) updatePod(oldObj, newObj interface{}) { - // klog.Info("Update Pod event handler") + + oldPod := oldObj.(*v1.Pod) newPod := newObj.(*v1.Pod) - klog.Infof("Processing event for pod %s", newPod.Name) + klog.Infof("[Fluence] Processing event for pod %s from %s to %s", newPod.Name, newPod.Status.Phase, oldPod.Status.Phase) switch newPod.Status.Phase { case v1.PodPending: @@ -365,7 +354,7 @@ func (f *Fluence) 
updatePod(oldObj, newObj interface{}) { case v1.PodRunning: // if a pod is start running, we can add it state to the delta graph if it is scheduled by other scheduler case v1.PodSucceeded: - klog.Infof("Pod %s succeeded, Fluence needs to free the resources", newPod.Name) + klog.Infof("[Fluence] Pod %s succeeded, Fluence needs to free the resources", newPod.Name) f.mutex.Lock() defer f.mutex.Unlock() @@ -373,11 +362,11 @@ func (f *Fluence) updatePod(oldObj, newObj interface{}) { if _, ok := f.podNameToJobId[newPod.Name]; ok { f.cancelFluxJobForPod(newPod) } else { - klog.Infof("Succeeded pod %s/%s doesn't have flux jobid", newPod.Namespace, newPod.Name) + klog.Infof("[Fluence] Succeeded pod %s/%s doesn't have flux jobid", newPod.Namespace, newPod.Name) } case v1.PodFailed: // a corner case need to be tested, the pod exit code is not 0, can be created with segmentation fault pi test - klog.Warningf("Pod %s failed, Fluence needs to free the resources", newPod.Name) + klog.Warningf("[Fluence] Pod %s failed, Fluence needs to free the resources", newPod.Name) f.mutex.Lock() defer f.mutex.Unlock() @@ -385,7 +374,7 @@ func (f *Fluence) updatePod(oldObj, newObj interface{}) { if _, ok := f.podNameToJobId[newPod.Name]; ok { f.cancelFluxJobForPod(newPod) } else { - klog.Errorf("Failed pod %s/%s doesn't have flux jobid", newPod.Namespace, newPod.Name) + klog.Errorf("[Fluence] Failed pod %s/%s doesn't have flux jobid", newPod.Namespace, newPod.Name) } case v1.PodUnknown: // don't know how to deal with it as it's unknown phase @@ -400,11 +389,11 @@ func (f *Fluence) deletePod(podObj interface{}) { klog.Info("Delete Pod event handler") pod := podObj.(*v1.Pod) - klog.Info("Pod status: ", pod.Status.Phase) + klog.Infof("[Fluence] Delete pod has status %s", pod.Status.Phase) switch pod.Status.Phase { case v1.PodSucceeded: case v1.PodPending: - klog.Infof("Pod %s completed and is Pending termination, Fluence needs to free the resources", pod.Name) + klog.Infof("[Fluence] Pod %s completed and is Pending termination, Fluence needs to free the resources", pod.Name) f.mutex.Lock() defer f.mutex.Unlock() @@ -412,7 +401,7 @@ func (f *Fluence) deletePod(podObj interface{}) { if _, ok := f.podNameToJobId[pod.Name]; ok { f.cancelFluxJobForPod(pod) } else { - klog.Infof("Terminating pod %s/%s doesn't have flux jobid", pod.Namespace, pod.Name) + klog.Infof("[Fluence] Terminating pod %s/%s doesn't have flux jobid", pod.Namespace, pod.Name) } case v1.PodRunning: f.mutex.Lock() @@ -421,7 +410,7 @@ func (f *Fluence) deletePod(podObj interface{}) { if _, ok := f.podNameToJobId[pod.Name]; ok { f.cancelFluxJobForPod(pod) } else { - klog.Infof("Deleted pod %s/%s doesn't have flux jobid", pod.Namespace, pod.Name) + klog.Infof("[Fluence] Deleted pod %s/%s doesn't have flux jobid", pod.Namespace, pod.Name) } } } diff --git a/sig-scheduler-plugins/pkg/fluence/group.go b/sig-scheduler-plugins/pkg/fluence/group.go index a2597eb..6f55a8b 100644 --- a/sig-scheduler-plugins/pkg/fluence/group.go +++ b/sig-scheduler-plugins/pkg/fluence/group.go @@ -44,10 +44,11 @@ func (f *Fluence) ensureFluenceGroup(pod *v1.Pod) string { // If there isn't a group, make a single node sized group // This is so we can always treat the cases equally if groupName == "" { + klog.Infof(" [Fluence] Group annotation missing for pod %s", pod.Name) groupName = f.getDefaultGroupName(pod) } - klog.Infof("group name for %s is %s", pod.Name, groupName) - klog.Infof("group size for %s is %d", pod.Name, groupSize) + klog.Infof(" [Fluence] Group name for %s is %s", 
pod.Name, groupName) + klog.Infof(" [Fluence] Group size for %s is %d", pod.Name, groupSize) // Register the pod group (with the pod) in our cache fcore.RegisterPodGroup(pod, groupName, groupSize) @@ -81,7 +82,7 @@ func (f *Fluence) getFluenceGroupSize(pod *v1.Pod) int32 { // that doesn't convert nicely. They can find this in the logs. intSize, err := strconv.ParseUint(size, 10, 32) if err != nil { - klog.Error("Parsing integer size for pod group") + klog.Error(" [Fluence] Parsing integer size for pod group") } return int32(intSize) } @@ -93,9 +94,10 @@ func (f *Fluence) getCreationTimestamp(groupName string, podInfo *framework.Queu // IsZero is an indicator if this was actually set // If the group label was present and we have a group, this will be true if !pg.TimeCreated.IsZero() { - klog.Infof("pod group %s was created at %s\n", groupName, pg.TimeCreated) + klog.Infof(" [Fluence] Pod group %s was created at %s\n", groupName, pg.TimeCreated) return pg.TimeCreated } // We should actually never get here. + klog.Errorf(" [Fluence] Pod group %s time IsZero, we should not have reached here", groupName) return metav1.NewMicroTime(*podInfo.InitialAttemptTimestamp) } diff --git a/src/fluence/fluxion/fluxion.go b/src/fluence/fluxion/fluxion.go index 6478602..2a8fd7f 100644 --- a/src/fluence/fluxion/fluxion.go +++ b/src/fluence/fluxion/fluxion.go @@ -96,7 +96,7 @@ func (s *Fluxion) Match(ctx context.Context, in *pb.MatchRequest) (*pb.MatchResp emptyResponse := &pb.MatchResponse{} // Prepare an empty match response (that can still be serialized) - fmt.Printf("[GRPCServer] Received Match request %v\n", in) + fmt.Printf("[Fluence][MatchRPC] Received Match request %v\n", in) // Generate the jobspec, written to temporary file and read as string spec, err := s.generateJobspec(in) @@ -107,18 +107,25 @@ func (s *Fluxion) Match(ctx context.Context, in *pb.MatchRequest) (*pb.MatchResp // Ask flux to match allocate! reserved, allocated, at, overhead, jobid, fluxerr := s.cli.MatchAllocate(false, string(spec)) utils.PrintOutput(reserved, allocated, at, overhead, jobid, fluxerr) - fmt.Printf("[MatchRPC] Errors so far: %s\n", s.cli.GetErrMsg()) + + // Be explicit about errors (or not) + errorMessages := s.cli.GetErrMsg() + if errorMessages == "" { + fmt.Println("[Fluence][MatchRPC] There are no errors") + } else { + fmt.Printf("[Fluence][MatchRPC] Errors so far: %s\n", errorMessages) + } if fluxerr != nil { - fmt.Printf("[GRPCServer] Flux err is %w\n", fluxerr) - return emptyResponse, errors.New("Error in ReapiCliMatchAllocate") + fmt.Printf("[Fluence][MatchRPC] Flux err is %w\n", fluxerr) + return emptyResponse, errors.New("[Fluence] Error in ReapiCliMatchAllocate") } // This usually means we cannot allocate // We need to return an error here otherwise we try to pass an empty string // to other RPC endpoints and get back an error. 
if allocated == "" { - fmt.Println("[GRPCServer] Allocated is empty") - return emptyResponse, errors.New("allocation was not possible") + fmt.Println("[Fluence][MatchRPC] Allocated is empty") + return emptyResponse, errors.New("Allocation was not possible") } // Pass the spec name in so we can include it in the allocation result diff --git a/src/fluence/utils/utils.go b/src/fluence/utils/utils.go index aadcb41..961a77a 100644 --- a/src/fluence/utils/utils.go +++ b/src/fluence/utils/utils.go @@ -206,6 +206,7 @@ type allocation struct { func ParseAllocResult(allocated, podName string) []allocation { var dat map[string]interface{} result := []allocation{} + fmt.Printf("Raw allocated response: %s\n", allocated) // Keep track of total core count across allocated corecount := 0 @@ -214,7 +215,6 @@ func ParseAllocResult(allocated, podName string) []allocation { if err := json.Unmarshal([]byte(allocated), &dat); err != nil { panic(err) } - // Parse graph and nodes into interfaces // TODO look at github.com/mitchellh/mapstructure // that might make this easier @@ -240,12 +240,23 @@ func ParseAllocResult(allocated, podName string) []allocation { corecount = 0 } } - fmt.Printf("Final node result for %s: %s\n", podName, result) + fmt.Printf("Final node result for %s\n", podName) + for i, alloc := range result { + fmt.Printf("Node %d: %s\n", i, alloc.Name) + fmt.Printf(" Type: %s\n Name: %s\n Basename: %s\n CoreCount: %d\n", + alloc.Type, alloc.Name, alloc.Basename, alloc.CoreCount) + + } return result } // Utility functions func PrintOutput(reserved bool, allocated string, at int64, overhead float64, jobid uint64, fluxerr error) { fmt.Println("\n\t----Match Allocate output---") - fmt.Printf("jobid: %d\nreserved: %t\nallocated: %s\nat: %d\noverhead: %f\nerror: %w\n", jobid, reserved, allocated, at, overhead, fluxerr) + fmt.Printf("jobid: %d\nreserved: %t\nallocated: %s\nat: %d\noverhead: %f\n", jobid, reserved, allocated, at, overhead) + + // Only print error if we had one + if fluxerr != nil { + fmt.Printf("error: %w\n", fluxerr) + } } From f8ca47ee63996e7e73bf8bb47cf0fdb1b49e6c0c Mon Sep 17 00:00:00 2001 From: vsoch Date: Mon, 15 Jan 2024 22:29:10 -0700 Subject: [PATCH 06/28] add examples with lammps to reproduce error Signed-off-by: vsoch --- examples/pod-group/lammps/lammps2.yaml | 22 ++++++++++++++++++ examples/pod-group/lammps/lammps4-2.yaml | 22 ++++++++++++++++++ examples/pod-group/lammps/lammps4-3.yaml | 22 ++++++++++++++++++ examples/pod-group/lammps/lammps4.yaml | 23 +++++++++++++++++++ examples/pod-group/lammps/lammps5.yaml | 22 ++++++++++++++++++ examples/pod-group/lammps/lammps6.yaml | 22 ++++++++++++++++++ .../pkg/fluence/core/core.go | 7 ++++++ sig-scheduler-plugins/pkg/fluence/fluence.go | 22 +++++++++--------- sig-scheduler-plugins/pkg/fluence/group.go | 17 +++++++------- 9 files changed, 160 insertions(+), 19 deletions(-) create mode 100644 examples/pod-group/lammps/lammps2.yaml create mode 100644 examples/pod-group/lammps/lammps4-2.yaml create mode 100644 examples/pod-group/lammps/lammps4-3.yaml create mode 100644 examples/pod-group/lammps/lammps4.yaml create mode 100644 examples/pod-group/lammps/lammps5.yaml create mode 100644 examples/pod-group/lammps/lammps6.yaml diff --git a/examples/pod-group/lammps/lammps2.yaml b/examples/pod-group/lammps/lammps2.yaml new file mode 100644 index 0000000..acdd2d5 --- /dev/null +++ b/examples/pod-group/lammps/lammps2.yaml @@ -0,0 +1,22 @@ +apiVersion: flux-framework.org/v1alpha2 +kind: MiniCluster +metadata: + name: lammps2 +spec: + size: 2 + 
network: + headlessName: l2 + pod: + schedulerName: fluence + labels: + fluence.pod-group: lammps2 + fluence.group-size: "2" + containers: + - image: ghcr.io/converged-computing/metric-lammps:latest@sha256:e24a1ba8954f5a0a7a0bd854cfc5ca7f82ca12607dc6ace38d838591b8deb8ed + workingDir: /opt/lammps/examples/reaxff/HNS + command: lmp -v x 1 -v y 1 -v z 1 -in in.reaxc.hns -nocite + resources: + limits: + cpu: 2 + requests: + cpu: 2 \ No newline at end of file diff --git a/examples/pod-group/lammps/lammps4-2.yaml b/examples/pod-group/lammps/lammps4-2.yaml new file mode 100644 index 0000000..777e73c --- /dev/null +++ b/examples/pod-group/lammps/lammps4-2.yaml @@ -0,0 +1,22 @@ +apiVersion: flux-framework.org/v1alpha2 +kind: MiniCluster +metadata: + name: lammps4-2 +spec: + size: 4 + network: + headlessName: l42 + pod: + schedulerName: fluence + labels: + fluence.pod-group: lammps4-2 + fluence.group-size: "4" + containers: + - image: ghcr.io/converged-computing/metric-lammps:latest@sha256:e24a1ba8954f5a0a7a0bd854cfc5ca7f82ca12607dc6ace38d838591b8deb8ed + workingDir: /opt/lammps/examples/reaxff/HNS + command: lmp -v x 1 -v y 1 -v z 1 -in in.reaxc.hns -nocite + resources: + limits: + cpu: 2 + requests: + cpu: 2 \ No newline at end of file diff --git a/examples/pod-group/lammps/lammps4-3.yaml b/examples/pod-group/lammps/lammps4-3.yaml new file mode 100644 index 0000000..76c5ed0 --- /dev/null +++ b/examples/pod-group/lammps/lammps4-3.yaml @@ -0,0 +1,22 @@ +apiVersion: flux-framework.org/v1alpha2 +kind: MiniCluster +metadata: + name: lammps4-3 +spec: + size: 4 + network: + headlessName: l43 + pod: + schedulerName: fluence + labels: + fluence.pod-group: lammps4-3 + fluence.group-size: "4" + containers: + - image: ghcr.io/converged-computing/metric-lammps:latest@sha256:e24a1ba8954f5a0a7a0bd854cfc5ca7f82ca12607dc6ace38d838591b8deb8ed + workingDir: /opt/lammps/examples/reaxff/HNS + command: lmp -v x 1 -v y 1 -v z 1 -in in.reaxc.hns -nocite + resources: + limits: + cpu: 2 + requests: + cpu: 2 \ No newline at end of file diff --git a/examples/pod-group/lammps/lammps4.yaml b/examples/pod-group/lammps/lammps4.yaml new file mode 100644 index 0000000..38ae0a7 --- /dev/null +++ b/examples/pod-group/lammps/lammps4.yaml @@ -0,0 +1,23 @@ +apiVersion: flux-framework.org/v1alpha2 +kind: MiniCluster +metadata: + name: lammps4 +spec: + size: 4 + network: + headlessName: l4 + pod: + schedulerName: fluence + labels: + app: lammps + fluence.pod-group: lammps4 + fluence.group-size: "4" + containers: + - image: ghcr.io/converged-computing/metric-lammps:latest@sha256:e24a1ba8954f5a0a7a0bd854cfc5ca7f82ca12607dc6ace38d838591b8deb8ed + workingDir: /opt/lammps/examples/reaxff/HNS + command: lmp -v x 1 -v y 1 -v z 1 -in in.reaxc.hns -nocite + resources: + limits: + cpu: 2 + requests: + cpu: 2 \ No newline at end of file diff --git a/examples/pod-group/lammps/lammps5.yaml b/examples/pod-group/lammps/lammps5.yaml new file mode 100644 index 0000000..7546b48 --- /dev/null +++ b/examples/pod-group/lammps/lammps5.yaml @@ -0,0 +1,22 @@ +apiVersion: flux-framework.org/v1alpha2 +kind: MiniCluster +metadata: + name: lammps5 +spec: + size: 5 + network: + headlessName: l5 + pod: + schedulerName: fluence + labels: + fluence.pod-group: lammps5 + fluence.group-size: "5" + containers: + - image: ghcr.io/converged-computing/metric-lammps:latest@sha256:e24a1ba8954f5a0a7a0bd854cfc5ca7f82ca12607dc6ace38d838591b8deb8ed + workingDir: /opt/lammps/examples/reaxff/HNS + command: lmp -v x 1 -v y 1 -v z 1 -in in.reaxc.hns -nocite + resources: + limits: + 
cpu: 2 + requests: + cpu: 2 \ No newline at end of file diff --git a/examples/pod-group/lammps/lammps6.yaml b/examples/pod-group/lammps/lammps6.yaml new file mode 100644 index 0000000..2030192 --- /dev/null +++ b/examples/pod-group/lammps/lammps6.yaml @@ -0,0 +1,22 @@ +apiVersion: flux-framework.org/v1alpha2 +kind: MiniCluster +metadata: + name: lammps6 +spec: + size: 6 + network: + headlessName: l6 + pod: + schedulerName: fluence + labels: + fluence.pod-group: lammps6 + fluence.group-size: "6" + containers: + - image: ghcr.io/converged-computing/metric-lammps:latest@sha256:e24a1ba8954f5a0a7a0bd854cfc5ca7f82ca12607dc6ace38d838591b8deb8ed + workingDir: /opt/lammps/examples/reaxff/HNS + command: lmp -v x 1 -v y 1 -v z 1 -in in.reaxc.hns -nocite + resources: + limits: + cpu: 2 + requests: + cpu: 2 \ No newline at end of file diff --git a/sig-scheduler-plugins/pkg/fluence/core/core.go b/sig-scheduler-plugins/pkg/fluence/core/core.go index ddf8e4c..135659f 100644 --- a/sig-scheduler-plugins/pkg/fluence/core/core.go +++ b/sig-scheduler-plugins/pkg/fluence/core/core.go @@ -126,6 +126,13 @@ func DeletePodGroup(groupName string) { delete(podGroupCache, groupName) } +// ListGroups lists groups, primarily for debugging +func ListGroups() { + for name, pg := range podGroupCache { + fmt.Printf(" %s: size %s, created at %s\n", name, pg.Size, &pg.TimeCreated) + } +} + // CreateNodePodsList creates a list of node pod caches func CreateNodePodsList(nodelist []*pb.NodeAlloc, groupName string) (nodepods []NodeCache) { diff --git a/sig-scheduler-plugins/pkg/fluence/fluence.go b/sig-scheduler-plugins/pkg/fluence/fluence.go index bbe6cee..6752764 100644 --- a/sig-scheduler-plugins/pkg/fluence/fluence.go +++ b/sig-scheduler-plugins/pkg/fluence/fluence.go @@ -244,18 +244,18 @@ func (f *Fluence) AskFlux(ctx context.Context, pod *v1.Pod, count int) error { f.mutex.Unlock() if isPodAllocated { - klog.Info("[Fluence] Pod %s is allocated, cleaning up previous allocation", pod.Name) + klog.Infof("[Fluence] Pod %s is allocated, cleaning up previous allocation\n", pod.Name) f.mutex.Lock() f.cancelFluxJobForPod(pod) f.mutex.Unlock() } jobspec := utils.InspectPodInfo(pod) - klog.Infof("[Fluence] Inspect pod info, jobspec: %s", jobspec) + klog.Infof("[Fluence] Inspect pod info, jobspec: %s\n", jobspec) conn, err := grpc.Dial("127.0.0.1:4242", grpc.WithInsecure()) if err != nil { - klog.Errorf("[Fluence] Error connecting to server: %v", err) + klog.Errorf("[Fluence] Error connecting to server: %v\n", err) return err } defer conn.Close() @@ -274,11 +274,11 @@ func (f *Fluence) AskFlux(ctx context.Context, pod *v1.Pod, count int) error { // otherwise it's going to try to use the allocation (but there is none) r, err := grpcclient.Match(context.Background(), request) if err != nil { - klog.Errorf("[Fluence] did not receive any match response: %v", err) + klog.Errorf("[Fluence] did not receive any match response: %v\n", err) return err } - klog.Infof("[Fluence] response podID %s", r.GetPodID()) + klog.Infof("[Fluence] response podID %s\n", r.GetPodID()) // Presence of a podGroup is indicated by a groupName // Flag that the group is allocated (yes we also have the job id, testing for now) @@ -286,20 +286,21 @@ func (f *Fluence) AskFlux(ctx context.Context, pod *v1.Pod, count int) error { // Get the nodelist and inspect nodes := r.GetNodelist() - klog.Infof("[Fluence] Nodelist returned from Fluxion: %s", nodes) + klog.Infof("[Fluence] Nodelist returned from Fluxion: %s\n", nodes) nodelist := fcore.CreateNodePodsList(nodes, 
pg.Name) - klog.Infof("[Fluence] parsed node pods list %s", nodelist) + klog.Infof("[Fluence] parsed node pods list %s\n", nodelist) jobid := uint64(r.GetJobID()) f.mutex.Lock() f.podNameToJobId[pod.Name] = jobid - klog.Info("[Fluence] Check job assignment: ", f.podNameToJobId) + klog.Infof("[Fluence] Check job assignment: %s\n", f.podNameToJobId) f.mutex.Unlock() return nil } // cancelFluxJobForPod cancels the flux job for a pod. +// We assume that the cancelled job also means deleting the pod group func (f *Fluence) cancelFluxJobForPod(pod *v1.Pod) error { jobid := f.podNameToJobId[pod.Name] @@ -332,8 +333,7 @@ func (f *Fluence) cancelFluxJobForPod(pod *v1.Pod) error { delete(f.podNameToJobId, pod.Name) // If we are successful, clear the group allocated nodes - pg := f.getPodsGroup(pod) - pg.CancelAllocation() + f.DeleteFluenceGroup(pod) } else { klog.Warningf("[Fluence] Failed to cancel flux job %v for pod %s", jobid, pod.Name) } @@ -386,7 +386,7 @@ func (f *Fluence) updatePod(oldObj, newObj interface{}) { // deletePod handles the delete event handler // TODO when should we clear group from the cache? func (f *Fluence) deletePod(podObj interface{}) { - klog.Info("Delete Pod event handler") + klog.Info("[Fluence] Delete Pod event handler") pod := podObj.(*v1.Pod) klog.Infof("[Fluence] Delete pod has status %s", pod.Status.Phase) diff --git a/sig-scheduler-plugins/pkg/fluence/group.go b/sig-scheduler-plugins/pkg/fluence/group.go index 6f55a8b..f2cdf21 100644 --- a/sig-scheduler-plugins/pkg/fluence/group.go +++ b/sig-scheduler-plugins/pkg/fluence/group.go @@ -44,11 +44,11 @@ func (f *Fluence) ensureFluenceGroup(pod *v1.Pod) string { // If there isn't a group, make a single node sized group // This is so we can always treat the cases equally if groupName == "" { - klog.Infof(" [Fluence] Group annotation missing for pod %s", pod.Name) + klog.Infof(" [Fluence] Group annotation missing for pod %s", pod.Name) groupName = f.getDefaultGroupName(pod) } - klog.Infof(" [Fluence] Group name for %s is %s", pod.Name, groupName) - klog.Infof(" [Fluence] Group size for %s is %d", pod.Name, groupSize) + klog.Infof(" [Fluence] Group name for %s is %s", pod.Name, groupName) + klog.Infof(" [Fluence] Group size for %s is %d", pod.Name, groupSize) // Register the pod group (with the pod) in our cache fcore.RegisterPodGroup(pod, groupName, groupSize) @@ -56,11 +56,12 @@ func (f *Fluence) ensureFluenceGroup(pod *v1.Pod) string { } // deleteFluenceGroup ensures the pod group is deleted, if it exists -func (f *Fluence) deleteFluenceGroup(pod *v1.Pod) { - +func (f *Fluence) DeleteFluenceGroup(pod *v1.Pod) { // Get the group name and size from the fluence labels pg := f.getPodsGroup(pod) fcore.DeletePodGroup(pg.Name) + klog.Infof(" [Fluence] known groups are:\n") + fcore.ListGroups() } // getFluenceGroupName looks for the group to indicate a fluence group, and returns it @@ -82,7 +83,7 @@ func (f *Fluence) getFluenceGroupSize(pod *v1.Pod) int32 { // that doesn't convert nicely. They can find this in the logs. 
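	// For example, fluence.group-size: "4" parses to 4 here, while an unparseable
	// value logs the error below and falls through to a size of 0.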
intSize, err := strconv.ParseUint(size, 10, 32) if err != nil { - klog.Error(" [Fluence] Parsing integer size for pod group") + klog.Error(" [Fluence] Parsing integer size for pod group") } return int32(intSize) } @@ -94,10 +95,10 @@ func (f *Fluence) getCreationTimestamp(groupName string, podInfo *framework.Queu // IsZero is an indicator if this was actually set // If the group label was present and we have a group, this will be true if !pg.TimeCreated.IsZero() { - klog.Infof(" [Fluence] Pod group %s was created at %s\n", groupName, pg.TimeCreated) + klog.Infof(" [Fluence] Pod group %s was created at %s\n", groupName, pg.TimeCreated) return pg.TimeCreated } // We should actually never get here. - klog.Errorf(" [Fluence] Pod group %s time IsZero, we should not have reached here", groupName) + klog.Errorf(" [Fluence] Pod group %s time IsZero, we should not have reached here", groupName) return metav1.NewMicroTime(*podInfo.InitialAttemptTimestamp) } From 275cd04414ebbda845fa26094171556c9e220fdc Mon Sep 17 00:00:00 2001 From: vsoch Date: Thu, 18 Jan 2024 16:16:32 -0700 Subject: [PATCH 07/28] clean up logging and unused files We install with the helm manifests, and the old fluence manifests might be confusing (they have changed). This commit will remove the old manifests, and also change some of the fmt.Print logging to use klog to be easier to parse. Signed-off-by: vsoch --- Makefile | 1 - .../manifests/fluence/configmap.yaml | 23 ------ .../manifests/fluence/deploy.yaml | 45 ---------- .../manifests/fluence/rbac.yaml | 82 ------------------- .../scheduling.sigs.k8s.io_podgroups.yaml | 1 - .../manifests/fluence/serviceaccount.yaml | 10 --- .../pkg/fluence/core/core.go | 23 +++--- sig-scheduler-plugins/pkg/fluence/fluence.go | 16 +++- sig-scheduler-plugins/pkg/fluence/group.go | 8 +- .../pkg/fluence/utils/utils.go | 5 +- src/fluence/fluxion/fluxion.go | 33 ++++---- src/fluence/utils/utils.go | 1 - 12 files changed, 47 insertions(+), 201 deletions(-) delete mode 100644 sig-scheduler-plugins/manifests/fluence/configmap.yaml delete mode 100644 sig-scheduler-plugins/manifests/fluence/deploy.yaml delete mode 100644 sig-scheduler-plugins/manifests/fluence/rbac.yaml delete mode 120000 sig-scheduler-plugins/manifests/fluence/scheduling.sigs.k8s.io_podgroups.yaml delete mode 100644 sig-scheduler-plugins/manifests/fluence/serviceaccount.yaml diff --git a/Makefile b/Makefile index 907f96e..1356160 100644 --- a/Makefile +++ b/Makefile @@ -28,7 +28,6 @@ prepare: clone rm -rf $(CLONE_UPSTREAM)/pkg/fluence rm -rf $(CLONE_UPSTREAM)/manifests/fluence cp -R sig-scheduler-plugins/pkg/fluence $(CLONE_UPSTREAM)/pkg/fluence - cp -R sig-scheduler-plugins/manifests/fluence $(CLONE_UPSTREAM)/manifests/fluence # This is the one exception not from sig-scheduler-plugins because it is needed in both spots cp -R src/fluence/fluxcli-grpc $(CLONE_UPSTREAM)/pkg/fluence/fluxcli-grpc # These are files with subtle changes to add fluence diff --git a/sig-scheduler-plugins/manifests/fluence/configmap.yaml b/sig-scheduler-plugins/manifests/fluence/configmap.yaml deleted file mode 100644 index 21ffacc..0000000 --- a/sig-scheduler-plugins/manifests/fluence/configmap.yaml +++ /dev/null @@ -1,23 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: scheduler-config - namespace: scheduler-plugins -data: - scheduler-config.yaml: | - apiVersion: kubescheduler.config.k8s.io/v1beta3 - kind: KubeSchedulerConfiguration - leaderElection: - leaderElect: false - profiles: - - schedulerName: fluence - plugins: - preFilter: - enabled: - - 
name: Fluence - filter: - enabled: - - name: Fluence - score: - disabled: - - name: '*' \ No newline at end of file diff --git a/sig-scheduler-plugins/manifests/fluence/deploy.yaml b/sig-scheduler-plugins/manifests/fluence/deploy.yaml deleted file mode 100644 index 92e39b0..0000000 --- a/sig-scheduler-plugins/manifests/fluence/deploy.yaml +++ /dev/null @@ -1,45 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: fluence - namespace: scheduler-plugins -spec: - replicas: 1 - selector: - matchLabels: - component: scheduler - template: - metadata: - labels: - component: scheduler - spec: - serviceAccountName: scheduler-plugins - containers: - - image: quay.io/cmisale1/fluence-sidecar:latest - imagePullPolicy: Always - command: - - /go/src/fluence/bin/server - - --policy=lonode - name: fluence-sidecar - - image: quay.io/cmisale1/fluence:dev - imagePullPolicy: Always - command: - - /bin/kube-scheduler - - --config=/etc/kubernetes/scheduler-config.yaml - - -v=9 - name: fluence - resources: - requests: - cpu: '0.1' - securityContext: - privileged: false - volumeMounts: - - mountPath: /etc/kubernetes - name: scheduler-config - hostNetwork: false - hostPID: false - volumes: - - name: scheduler-config - configMap: - name: scheduler-config - diff --git a/sig-scheduler-plugins/manifests/fluence/rbac.yaml b/sig-scheduler-plugins/manifests/fluence/rbac.yaml deleted file mode 100644 index 3416e18..0000000 --- a/sig-scheduler-plugins/manifests/fluence/rbac.yaml +++ /dev/null @@ -1,82 +0,0 @@ -kind: ClusterRole -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: scheduler-plugins -rules: -- apiGroups: [""] - resources: ["namespaces", "configmaps"] - verbs: ["get", "list", "watch"] -- apiGroups: ["", "events.k8s.io"] - resources: ["events"] - verbs: ["create", "patch", "update"] -- apiGroups: ["coordination.k8s.io"] - resources: ["leases"] - verbs: ["create"] -- apiGroups: ["coordination.k8s.io"] - resourceNames: ["kube-scheduler"] - resources: ["leases"] - verbs: ["get", "update"] -- apiGroups: [""] - resources: ["endpoints"] - verbs: ["create"] -- apiGroups: [""] - resourceNames: ["kube-scheduler"] - resources: ["endpoints"] - verbs: ["get", "update"] -- apiGroups: [""] - resources: ["nodes"] - verbs: ["get", "list", "watch", "patch"] -- apiGroups: [""] - resources: ["pods"] - verbs: ["delete", "get", "list", "watch", "update"] -- apiGroups: [""] - resources: ["bindings", "pods/binding"] - verbs: ["create"] -- apiGroups: [""] - resources: ["pods/status"] - verbs: ["patch", "update"] -- apiGroups: [""] - resources: ["replicationcontrollers", "services"] - verbs: ["get", "list", "watch"] -- apiGroups: ["apps", "extensions"] - resources: ["replicasets"] - verbs: ["get", "list", "watch"] -- apiGroups: ["apps"] - resources: ["statefulsets"] - verbs: ["get", "list", "watch"] -- apiGroups: ["policy"] - resources: ["poddisruptionbudgets"] - verbs: ["get", "list", "watch"] -- apiGroups: [""] - resources: ["persistentvolumeclaims", "persistentvolumes"] - verbs: ["get", "list", "watch", "patch", "update"] -- apiGroups: ["authentication.k8s.io"] - resources: ["tokenreviews"] - verbs: ["create"] -- apiGroups: ["authorization.k8s.io"] - resources: ["subjectaccessreviews"] - verbs: ["create"] -- apiGroups: ["storage.k8s.io"] - resources: ["csinodes", "storageclasses" , "csidrivers" , "csistoragecapacities"] - verbs: ["get", "list", "watch"] -- apiGroups: ["topology.node.k8s.io"] - resources: ["noderesourcetopologies"] - verbs: ["*"] -# resources need to be updated with the scheduler plugins used -- 
apiGroups: ["scheduling.sigs.k8s.io"] - resources: ["podgroups", "elasticquotas"] - verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] ---- -kind: ClusterRoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: scheduler-plugins - namespace: scheduler-plugins -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: scheduler-plugins -subjects: - - kind: ServiceAccount - name: scheduler-plugins - namespace: scheduler-plugins diff --git a/sig-scheduler-plugins/manifests/fluence/scheduling.sigs.k8s.io_podgroups.yaml b/sig-scheduler-plugins/manifests/fluence/scheduling.sigs.k8s.io_podgroups.yaml deleted file mode 120000 index 7f8408e..0000000 --- a/sig-scheduler-plugins/manifests/fluence/scheduling.sigs.k8s.io_podgroups.yaml +++ /dev/null @@ -1 +0,0 @@ -../coscheduling/crd.yaml \ No newline at end of file diff --git a/sig-scheduler-plugins/manifests/fluence/serviceaccount.yaml b/sig-scheduler-plugins/manifests/fluence/serviceaccount.yaml deleted file mode 100644 index fface49..0000000 --- a/sig-scheduler-plugins/manifests/fluence/serviceaccount.yaml +++ /dev/null @@ -1,10 +0,0 @@ -apiVersion: v1 -kind: Namespace -metadata: - name: scheduler-plugins ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: scheduler-plugins - namespace: scheduler-plugins \ No newline at end of file diff --git a/sig-scheduler-plugins/pkg/fluence/core/core.go b/sig-scheduler-plugins/pkg/fluence/core/core.go index 135659f..53a627e 100644 --- a/sig-scheduler-plugins/pkg/fluence/core/core.go +++ b/sig-scheduler-plugins/pkg/fluence/core/core.go @@ -6,6 +6,7 @@ import ( v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/klog/v2" "k8s.io/kubernetes/pkg/scheduler/framework" pb "sigs.k8s.io/scheduler-plugins/pkg/fluence/fluxcli-grpc" @@ -102,13 +103,13 @@ func RegisterPodGroup(pod *v1.Pod, groupName string, groupSize int32) error { } // Tell the user when it was created - fmt.Printf("[Fluence] Pod group %s was created at %s\n", entry.Name, entry.TimeCreated) + klog.Infof("[Fluence] Pod group %s was created at %s\n", entry.Name, entry.TimeCreated) } // If the size has changed, we currently do not allow updating it. // We issue a warning. In the future this could be supported with a grow command. 
if entry.Size != groupSize { - fmt.Printf("[Fluence] Pod group %s request to change size from %s to %s is not yet supported\n", groupName, entry.Size, groupSize) + klog.Infof("[Fluence] Pod group %s request to change size from %s to %s is not yet supported\n", groupName, entry.Size, groupSize) // entry.GroupSize = groupSize } podGroupCache[groupName] = entry @@ -148,7 +149,7 @@ func CreateNodePodsList(nodelist []*pb.NodeAlloc, groupName string) (nodepods [] // Update the pods in the PodGraphCache updatePodGroupNodes(groupName, nodepods) - fmt.Printf("[Fluence] Pod group cache updated with nodes\n", podGroupCache) + klog.Infof("[Fluence] Pod group cache updated with nodes\n", podGroupCache) return nodepods } @@ -175,30 +176,28 @@ func (p *PodGroupCache) CancelAllocation() { func GetNextNode(groupName string) (string, error) { entry, ok := podGroupCache[groupName] if !ok { - err := fmt.Errorf("[Fluence] Map is empty\n") - return "", err + return "", fmt.Errorf("[Fluence] Map is empty\n") } if len(entry.Nodes) == 0 { - err := fmt.Errorf("[Fluence] Error while getting a node\n") - return "", err + return "", fmt.Errorf("[Fluence] Error while getting a node\n") } nodename := entry.Nodes[0].NodeName - fmt.Printf("[Fluence] Next node for group %s is %s", groupName, nodename) + klog.Infof("[Fluence] Next node for group %s is %s", groupName, nodename) if entry.Nodes[0].Tasks == 1 { - fmt.Println("[Fluence] First node has one task") + klog.Infof("[Fluence] First node has one task") slice := entry.Nodes[1:] if len(slice) == 0 { - fmt.Printf("[Fluence] After this node, the slice is empty, deleting group %s from cache\n", groupName) + klog.Infof("[Fluence] After this node, the slice is empty, deleting group %s from cache\n", groupName) delete(podGroupCache, groupName) return nodename, nil } - fmt.Println("[Fluence] After this node, the slide still has nodes") + klog.Infof("[Fluence] After this node, the slide still has nodes") updatePodGroupNodes(groupName, slice) return nodename, nil } - fmt.Println("[Fluence] Subtracting one task from first node") + klog.Infof("[Fluence] Subtracting one task from first node") entry.Nodes[0].Tasks = entry.Nodes[0].Tasks - 1 return nodename, nil } diff --git a/sig-scheduler-plugins/pkg/fluence/fluence.go b/sig-scheduler-plugins/pkg/fluence/fluence.go index 6752764..145dfb0 100644 --- a/sig-scheduler-plugins/pkg/fluence/fluence.go +++ b/sig-scheduler-plugins/pkg/fluence/fluence.go @@ -44,9 +44,14 @@ import ( ) type Fluence struct { - mutex sync.Mutex - handle framework.Handle - client client.Client + mutex sync.Mutex + handle framework.Handle + client client.Client + + // Important: I tested moving this into the group, but it's a bad idea because + // we need to delete the group after the last allocation is given, and then we + // no longer have the ID. It might be a better approach to delete it elsewhere + // (but I'm not sure where that elsewhere could be) podNameToJobId map[string]uint64 pgMgr coschedulingcore.Manager } @@ -250,6 +255,7 @@ func (f *Fluence) AskFlux(ctx context.Context, pod *v1.Pod, count int) error { f.mutex.Unlock() } + // Does the task name here matter? 
We are naming the entire group for the pod jobspec := utils.InspectPodInfo(pod) klog.Infof("[Fluence] Inspect pod info, jobspec: %s\n", jobspec) conn, err := grpc.Dial("127.0.0.1:4242", grpc.WithInsecure()) @@ -413,4 +419,8 @@ func (f *Fluence) deletePod(podObj interface{}) { klog.Infof("[Fluence] Deleted pod %s/%s doesn't have flux jobid", pod.Namespace, pod.Name) } } + + // We assume that a request to delete one pod means all of them. + // We have to take an all or nothing approach for now + f.DeleteFluenceGroup(pod) } diff --git a/sig-scheduler-plugins/pkg/fluence/group.go b/sig-scheduler-plugins/pkg/fluence/group.go index f2cdf21..84641f1 100644 --- a/sig-scheduler-plugins/pkg/fluence/group.go +++ b/sig-scheduler-plugins/pkg/fluence/group.go @@ -44,11 +44,11 @@ func (f *Fluence) ensureFluenceGroup(pod *v1.Pod) string { // If there isn't a group, make a single node sized group // This is so we can always treat the cases equally if groupName == "" { - klog.Infof(" [Fluence] Group annotation missing for pod %s", pod.Name) + klog.Infof("[Fluence] Group annotation missing for pod %s", pod.Name) groupName = f.getDefaultGroupName(pod) } - klog.Infof(" [Fluence] Group name for %s is %s", pod.Name, groupName) - klog.Infof(" [Fluence] Group size for %s is %d", pod.Name, groupSize) + klog.Infof("[Fluence] Group name for %s is %s", pod.Name, groupName) + klog.Infof("[Fluence] Group size for %s is %d", pod.Name, groupSize) // Register the pod group (with the pod) in our cache fcore.RegisterPodGroup(pod, groupName, groupSize) @@ -60,7 +60,7 @@ func (f *Fluence) DeleteFluenceGroup(pod *v1.Pod) { // Get the group name and size from the fluence labels pg := f.getPodsGroup(pod) fcore.DeletePodGroup(pg.Name) - klog.Infof(" [Fluence] known groups are:\n") + klog.Infof("[Fluence] known groups are:\n") fcore.ListGroups() } diff --git a/sig-scheduler-plugins/pkg/fluence/utils/utils.go b/sig-scheduler-plugins/pkg/fluence/utils/utils.go index 53e9c4a..e384669 100644 --- a/sig-scheduler-plugins/pkg/fluence/utils/utils.go +++ b/sig-scheduler-plugins/pkg/fluence/utils/utils.go @@ -52,8 +52,9 @@ func InspectPodInfo(pod *v1.Pod) *pb.PodSpec { ps.Labels = getPodJobspecLabels(pod) // Note that Container gets use for the JobSpec, so we provide - // the pod name (to be associated with tasks) for it. We likely - // should change this identifier eventually. + // the pod name (to be associated with tasks) for it. We are making + // the assumption that this one container represents the group, + // which is OK for now, but might not always be true! 
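	// For example, a pod named lammps4-0 in namespace default yields the
	// container identifier default-lammps4-0 here.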
ps.Container = fmt.Sprintf("%s-%s", pod.Namespace, pod.Name) // Create accumulated requests for cpu and limits diff --git a/src/fluence/fluxion/fluxion.go b/src/fluence/fluxion/fluxion.go index 2a8fd7f..18d6735 100644 --- a/src/fluence/fluxion/fluxion.go +++ b/src/fluence/fluxion/fluxion.go @@ -7,10 +7,10 @@ import ( "github.com/flux-framework/flux-k8s/flux-plugin/fluence/jobspec" "github.com/flux-framework/flux-k8s/flux-plugin/fluence/utils" "github.com/flux-framework/flux-sched/resource/reapi/bindings/go/src/fluxcli" + "k8s.io/klog/v2" "context" "errors" - "fmt" ) type Fluxion struct { @@ -22,8 +22,7 @@ type Fluxion struct { func (f *Fluxion) InitFluxion(policy *string, label *string) { f.cli = fluxcli.NewReapiClient() - fmt.Println("Created flux resource client ", f.cli) - fmt.Printf("%+v\n", f.cli) + klog.Infof("[Fluence] Created flux resource client ", f.cli) filename := "/home/data/jgf/kubecluster.json" err := utils.CreateJGF(filename, label) if err != nil { @@ -32,14 +31,14 @@ func (f *Fluxion) InitFluxion(policy *string, label *string) { jgf, err := os.ReadFile(filename) if err != nil { - fmt.Println("Error reading JGF") + klog.Error("Error reading JGF") return } p := "{}" if *policy != "" { p = string("{\"matcher_policy\": \"" + *policy + "\"}") - fmt.Println("Match policy: ", p) + klog.Infof("[Fluence] match policy: ", p) } f.cli.InitContext(string(jgf), p) @@ -48,7 +47,7 @@ func (f *Fluxion) InitFluxion(policy *string, label *string) { // Cancel wraps the Cancel function of the fluxion go bindings func (s *Fluxion) Cancel(ctx context.Context, in *pb.CancelRequest) (*pb.CancelResponse, error) { - fmt.Printf("[GRPCServer] Received Cancel request %v\n", in) + klog.Infof("[Fluence] received cancel request %v\n", in) err := s.cli.Cancel(int64(in.JobID), true) if err != nil { return nil, errors.New("Error in Cancel") @@ -57,14 +56,14 @@ func (s *Fluxion) Cancel(ctx context.Context, in *pb.CancelRequest) (*pb.CancelR // Why would we have an error code here if we check above? 
// This (I think) should be an error code for the specific job dr := &pb.CancelResponse{JobID: in.JobID} - fmt.Printf("[GRPCServer] Sending Cancel response %v\n", dr) - fmt.Printf("[CancelRPC] Errors so far: %s\n", s.cli.GetErrMsg()) + klog.Infof("[Fluence] sending cancel response %v\n", dr) + klog.Infof("[Fluence] cancel errors so far: %s\n", s.cli.GetErrMsg()) reserved, at, overhead, mode, fluxerr := s.cli.Info(int64(in.JobID)) - fmt.Println("\n\t----Job Info output---") - fmt.Printf("jobid: %d\nreserved: %t\nat: %d\noverhead: %f\nmode: %s\nerror: %d\n", in.JobID, reserved, at, overhead, mode, fluxerr) + klog.Infof("\n\t----Job Info output---") + klog.Infof("jobid: %d\nreserved: %t\nat: %d\noverhead: %f\nmode: %s\nerror: %d\n", in.JobID, reserved, at, overhead, mode, fluxerr) - fmt.Printf("[GRPCServer] Sending Cancel response %v\n", dr) + klog.Infof("[GRPCServer] Sending Cancel response %v\n", dr) return dr, nil } @@ -96,7 +95,7 @@ func (s *Fluxion) Match(ctx context.Context, in *pb.MatchRequest) (*pb.MatchResp emptyResponse := &pb.MatchResponse{} // Prepare an empty match response (that can still be serialized) - fmt.Printf("[Fluence][MatchRPC] Received Match request %v\n", in) + klog.Infof("[Fluence] Received Match request %v\n", in) // Generate the jobspec, written to temporary file and read as string spec, err := s.generateJobspec(in) @@ -111,12 +110,12 @@ func (s *Fluxion) Match(ctx context.Context, in *pb.MatchRequest) (*pb.MatchResp // Be explicit about errors (or not) errorMessages := s.cli.GetErrMsg() if errorMessages == "" { - fmt.Println("[Fluence][MatchRPC] There are no errors") + klog.Infof("[Fluence] There are no errors") } else { - fmt.Printf("[Fluence][MatchRPC] Errors so far: %s\n", errorMessages) + klog.Infof("[Fluence] Match errors so far: %s\n", errorMessages) } if fluxerr != nil { - fmt.Printf("[Fluence][MatchRPC] Flux err is %w\n", fluxerr) + klog.Infof("[Fluence] Match Flux err is %w\n", fluxerr) return emptyResponse, errors.New("[Fluence] Error in ReapiCliMatchAllocate") } @@ -124,7 +123,7 @@ func (s *Fluxion) Match(ctx context.Context, in *pb.MatchRequest) (*pb.MatchResp // We need to return an error here otherwise we try to pass an empty string // to other RPC endpoints and get back an error. 
if allocated == "" { - fmt.Println("[Fluence][MatchRPC] Allocated is empty") + klog.Infof("[Fluence] Allocated is empty") return emptyResponse, errors.New("Allocation was not possible") } @@ -139,6 +138,6 @@ func (s *Fluxion) Match(ctx context.Context, in *pb.MatchRequest) (*pb.MatchResp } } mr := &pb.MatchResponse{PodID: in.Ps.Id, Nodelist: nodetaskslist, JobID: int64(jobid)} - fmt.Printf("[GRPCServer] Response %v \n", mr) + klog.Infof("[Fluence] Match response %v \n", mr) return mr, nil } diff --git a/src/fluence/utils/utils.go b/src/fluence/utils/utils.go index 961a77a..f30eeda 100644 --- a/src/fluence/utils/utils.go +++ b/src/fluence/utils/utils.go @@ -206,7 +206,6 @@ type allocation struct { func ParseAllocResult(allocated, podName string) []allocation { var dat map[string]interface{} result := []allocation{} - fmt.Printf("Raw allocated response: %s\n", allocated) // Keep track of total core count across allocated corecount := 0 From f243852e3cbfe400d49d89edeaf7d1d921a35cb1 Mon Sep 17 00:00:00 2001 From: vsoch Date: Thu, 18 Jan 2024 19:36:56 -0700 Subject: [PATCH 08/28] support for skeleton grpc server and service/ingress for external client This adds a prototype support for an extra helm flag that dually enables adding an extra grpc set of endpoints, and then the configs (ingress and service) necessary to expose them. I next need to figure out how to interact with grpc from a local client, likely built from the same codebase and grpc spec. This is super cool!! Signed-off-by: vsoch --- README.md | 39 +- .../templates/deployment.yaml | 36 ++ .../charts/as-a-second-scheduler/values.yaml | 6 + sig-scheduler-plugins/pkg/fluence/fluence.go | 17 +- .../pkg/fluence/{ => group}/group.go | 33 +- src/Makefile | 1 + src/fluence/cmd/main.go | 47 ++- src/fluence/cmd/main.go.bk | 15 - src/fluence/defaults/defaults.go | 5 + src/fluence/fluxcli-grpc/fluxcli.proto | 5 +- src/fluence/fluxion/fluxion.go | 10 +- src/fluence/service-grpc/service.pb.go | 351 ++++++++++++++++++ src/fluence/service-grpc/service.proto | 34 ++ src/fluence/service-grpc/service_grpc.pb.go | 181 +++++++++ src/fluence/service/service.go | 61 +++ 15 files changed, 785 insertions(+), 56 deletions(-) rename sig-scheduler-plugins/pkg/fluence/{ => group}/group.go (79%) delete mode 100644 src/fluence/cmd/main.go.bk create mode 100644 src/fluence/defaults/defaults.go create mode 100644 src/fluence/service-grpc/service.pb.go create mode 100644 src/fluence/service-grpc/service.proto create mode 100644 src/fluence/service-grpc/service_grpc.pb.go create mode 100644 src/fluence/service/service.go diff --git a/README.md b/README.md index 4431050..0433799 100644 --- a/README.md +++ b/README.md @@ -521,7 +521,7 @@ make build REGISTRY=ghcr.io/vsoch And then install with your custom images: -``` +```bash cd ./upstream/manifests/install/charts helm install \ --set scheduler.image=ghcr.io/vsoch/fluence:latest \ @@ -532,6 +532,43 @@ helm install \ And then apply what you need to test, and look at logs! And then keep doing that until you get what you want :) Note that I haven't found a good way for the VSCode developer tools to work because we develop fluence outside of the tree it's supposed to be in. 
+##### kubectl plugin + +Note that if you want to enable extra endpoints for the fluence kubectl plugin and expose the GRPC as a service, you can do: + +```bash +helm install \ + --set scheduler.image=ghcr.io/vsoch/fluence:latest \ + --set scheduler.enableExternalService=true \ + --set scheduler.sidecarimage=ghcr.io/vsoch/fluence-sidecar:latest \ + schedscheduler-plugins as-a-second-scheduler/ +``` + +For this setup if you are developing locally with kind, you will need to enable the ingress. Here is `kind-config.yaml` + +```yaml +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +nodes: +- role: control-plane + kubeadmConfigPatches: + - | + kind: InitConfiguration + nodeRegistration: + kubeletExtraArgs: + node-labels: "ingress-ready=true" + extraPortMappings: + - containerPort: 4242 + hostPort: 4242 + protocol: TCP +``` + +And to create: + +```bash +kind create cluster --config ./kind-config.yaml +``` + #### Components - [FluxStateData](sig-scheduler-plugins/pkg/fluence/core/core.go): is given to the [framework.CycleState](https://github.com/kubernetes/kubernetes/blob/242b41b36a20032f99e8a059ca0a5d764105217b/pkg/scheduler/framework/cycle_state.go#L48) and serves as a vehicle to store a cache of node name assignment. diff --git a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/deployment.yaml b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/deployment.yaml index ffc3ce7..83ecccc 100644 --- a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/deployment.yaml +++ b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/deployment.yaml @@ -37,6 +37,7 @@ spec: metadata: labels: component: scheduler + app: fluence-scheduler spec: serviceAccountName: {{ .Values.scheduler.name }} containers: @@ -45,7 +46,13 @@ spec: command: - /go/src/fluence/bin/server - --policy={{ .Values.scheduler.policy }} + - --port={{ .Values.scheduler.port }} + {{ if .Values.scheduler.enableExternalService }}- --external-service{{ end }} name: sidecar + # These are exposed for the kubectl plugin + {{ if .Values.scheduler.enableExternalService }}ports: + - containerPort: {{ .Values.scheduler.port }} + hostPort: {{ .Values.scheduler.port }}{{ end }} - command: - /bin/kube-scheduler - --config=/etc/kubernetes/scheduler-config.yaml @@ -79,3 +86,32 @@ spec: - name: scheduler-config configMap: name: scheduler-config +{{ if .Values.scheduler.enableExternalService }}--- +apiVersion: v1 +kind: Service +metadata: + name: fluence-service +spec: + type: NodePort + selector: + app: fluence-scheduler + ports: + - port: {{ .Values.scheduler.port }} + targetPort: {{ .Values.scheduler.port }} +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: fluence-ingress +spec: + rules: + - host: localhost + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: fluence-service + port: + number: {{ .Values.scheduler.port }}{{ end }} \ No newline at end of file diff --git a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/values.yaml b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/values.yaml index 38da251..2a35a3a 100644 --- a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/values.yaml +++ b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/values.yaml @@ -13,6 +13,12 @@ scheduler: sidecarPullPolicy: Always loggingLevel: "9" + # Port is for GRPC, and enabling the external service will also + # create the service 
and ingress to it, along with adding + # additional API endpoints for our TBA kubectl plugin + enableExternalService: false + port: 4242 + controller: name: scheduler-plugins-controller image: registry.k8s.io/scheduler-plugins/controller:v0.27.8 diff --git a/sig-scheduler-plugins/pkg/fluence/fluence.go b/sig-scheduler-plugins/pkg/fluence/fluence.go index 145dfb0..26282e5 100644 --- a/sig-scheduler-plugins/pkg/fluence/fluence.go +++ b/sig-scheduler-plugins/pkg/fluence/fluence.go @@ -40,6 +40,7 @@ import ( coschedulingcore "sigs.k8s.io/scheduler-plugins/pkg/coscheduling/core" fcore "sigs.k8s.io/scheduler-plugins/pkg/fluence/core" pb "sigs.k8s.io/scheduler-plugins/pkg/fluence/fluxcli-grpc" + fgroup "sigs.k8s.io/scheduler-plugins/pkg/fluence/group" "sigs.k8s.io/scheduler-plugins/pkg/fluence/utils" ) @@ -151,8 +152,8 @@ func (f *Fluence) Less(podInfo1, podInfo2 *framework.QueuedPodInfo) bool { // ensure we have a PodGroup no matter what klog.Infof("[Fluence] Comparing %s and %s", podInfo1.Pod.Name, podInfo2.Pod.Name) - podGroup1 := f.ensureFluenceGroup(podInfo1.Pod) - podGroup2 := f.ensureFluenceGroup(podInfo2.Pod) + podGroup1 := fgroup.EnsureFluenceGroup(podInfo1.Pod) + podGroup2 := fgroup.EnsureFluenceGroup(podInfo2.Pod) // First preference to priority, but only if they are different prio1 := corev1helpers.PodPriority(podInfo1.Pod) @@ -166,8 +167,8 @@ func (f *Fluence) Less(podInfo1, podInfo2 *framework.QueuedPodInfo) bool { // Fluence can only compare if we have two known groups. // This tries for that first, and falls back to the initial attempt timestamp - creationTime1 := f.getCreationTimestamp(podGroup1, podInfo1) - creationTime2 := f.getCreationTimestamp(podGroup2, podInfo2) + creationTime1 := fgroup.GetCreationTimestamp(podGroup1, podInfo1) + creationTime2 := fgroup.GetCreationTimestamp(podGroup2, podInfo2) // If they are the same, fall back to sorting by name. if creationTime1.Equal(&creationTime2) { @@ -188,7 +189,7 @@ func (f *Fluence) PreFilter( // groupName will be named according to the single pod namespace / pod if there wasn't // a user defined group. This is a size 1 group we handle equivalently. - pg := f.getPodsGroup(pod) + pg := fgroup.GetPodsGroup(pod) klog.Infof("[Fluence] Pod %s group size %d", pod.Name, pg.Size) klog.Infof("[Fluence] Pod %s group name is %s", pod.Name, pg.Name) @@ -288,7 +289,7 @@ func (f *Fluence) AskFlux(ctx context.Context, pod *v1.Pod, count int) error { // Presence of a podGroup is indicated by a groupName // Flag that the group is allocated (yes we also have the job id, testing for now) - pg := f.getPodsGroup(pod) + pg := fgroup.GetPodsGroup(pod) // Get the nodelist and inspect nodes := r.GetNodelist() @@ -339,7 +340,7 @@ func (f *Fluence) cancelFluxJobForPod(pod *v1.Pod) error { delete(f.podNameToJobId, pod.Name) // If we are successful, clear the group allocated nodes - f.DeleteFluenceGroup(pod) + fgroup.DeleteFluenceGroup(pod) } else { klog.Warningf("[Fluence] Failed to cancel flux job %v for pod %s", jobid, pod.Name) } @@ -422,5 +423,5 @@ func (f *Fluence) deletePod(podObj interface{}) { // We assume that a request to delete one pod means all of them. 
// We have to take an all or nothing approach for now - f.DeleteFluenceGroup(pod) + fgroup.DeleteFluenceGroup(pod) } diff --git a/sig-scheduler-plugins/pkg/fluence/group.go b/sig-scheduler-plugins/pkg/fluence/group/group.go similarity index 79% rename from sig-scheduler-plugins/pkg/fluence/group.go rename to sig-scheduler-plugins/pkg/fluence/group/group.go index 84641f1..b681504 100644 --- a/sig-scheduler-plugins/pkg/fluence/group.go +++ b/sig-scheduler-plugins/pkg/fluence/group/group.go @@ -1,4 +1,4 @@ -package fluence +package group import ( "fmt" @@ -19,13 +19,18 @@ const ( // getDefaultGroupName returns a group name based on the pod namespace and name // We could do this for pods that are not labeled, and treat them as a size 1 group -func (f *Fluence) getDefaultGroupName(pod *v1.Pod) string { +func getDefaultGroupName(pod *v1.Pod) string { return fmt.Sprintf("%s-%s", pod.Namespace, pod.Name) } // getPodsGroup gets the pods group, if it exists. -func (f *Fluence) getPodsGroup(pod *v1.Pod) *fcore.PodGroupCache { - groupName := f.ensureFluenceGroup(pod) +func GetPodsGroup(pod *v1.Pod) *fcore.PodGroupCache { + groupName := EnsureFluenceGroup(pod) + return fcore.GetPodGroup(groupName) +} + +// GetGroup is a courtesy wrapper around fcore.GetPodGroup +func GetGroup(groupName string) *fcore.PodGroupCache { return fcore.GetPodGroup(groupName) } @@ -35,17 +40,17 @@ func (f *Fluence) getPodsGroup(pod *v1.Pod) *fcore.PodGroupCache { // created and no fluence annotation, we do not create the group. // Likely for fluence we'd want a cleanup function somehow too, // for now assume groups are unique by name. -func (f *Fluence) ensureFluenceGroup(pod *v1.Pod) string { +func EnsureFluenceGroup(pod *v1.Pod) string { // Get the group name and size from the fluence labels - groupName := f.getFluenceGroupName(pod) - groupSize := f.getFluenceGroupSize(pod) + groupName := getFluenceGroupName(pod) + groupSize := getFluenceGroupSize(pod) // If there isn't a group, make a single node sized group // This is so we can always treat the cases equally if groupName == "" { klog.Infof("[Fluence] Group annotation missing for pod %s", pod.Name) - groupName = f.getDefaultGroupName(pod) + groupName = getDefaultGroupName(pod) } klog.Infof("[Fluence] Group name for %s is %s", pod.Name, groupName) klog.Infof("[Fluence] Group size for %s is %d", pod.Name, groupSize) @@ -56,22 +61,22 @@ func (f *Fluence) ensureFluenceGroup(pod *v1.Pod) string { } // deleteFluenceGroup ensures the pod group is deleted, if it exists -func (f *Fluence) DeleteFluenceGroup(pod *v1.Pod) { +func DeleteFluenceGroup(pod *v1.Pod) { // Get the group name and size from the fluence labels - pg := f.getPodsGroup(pod) + pg := GetPodsGroup(pod) fcore.DeletePodGroup(pg.Name) klog.Infof("[Fluence] known groups are:\n") fcore.ListGroups() } // getFluenceGroupName looks for the group to indicate a fluence group, and returns it -func (f *Fluence) getFluenceGroupName(pod *v1.Pod) string { +func getFluenceGroupName(pod *v1.Pod) string { groupName, _ := pod.Labels[PodGroupNameLabel] return groupName } // getFluenceGroupSize gets the size of the fluence group -func (f *Fluence) getFluenceGroupSize(pod *v1.Pod) int32 { +func getFluenceGroupSize(pod *v1.Pod) int32 { size, _ := pod.Labels[PodGroupSizeLabel] // Default size of 1 if the label is not set (but name is) @@ -88,8 +93,8 @@ func (f *Fluence) getFluenceGroupSize(pod *v1.Pod) int32 { return int32(intSize) } -// getCreationTimestamp first tries the fluence group, then falls back to the initial attempt timestamp 
-func (f *Fluence) getCreationTimestamp(groupName string, podInfo *framework.QueuedPodInfo) metav1.MicroTime { +// GetCreationTimestamp first tries the fluence group, then falls back to the initial attempt timestamp +func GetCreationTimestamp(groupName string, podInfo *framework.QueuedPodInfo) metav1.MicroTime { pg := fcore.GetPodGroup(groupName) // IsZero is an indicator if this was actually set diff --git a/src/Makefile b/src/Makefile index 344bde1..3392add 100644 --- a/src/Makefile +++ b/src/Makefile @@ -36,3 +36,4 @@ protoc: $(LOCALBIN) .PHONY: proto proto: protoc PATH=$(LOCALBIN):${PATH} protoc --go_out=. --go_opt=paths=source_relative --go-grpc_out=. --go-grpc_opt=paths=source_relative fluence/fluxcli-grpc/fluxcli.proto + PATH=$(LOCALBIN):${PATH} protoc --go_out=. --go_opt=paths=source_relative --go-grpc_out=. --go-grpc_opt=paths=source_relative fluence/service-grpc/service.proto \ No newline at end of file diff --git a/src/fluence/cmd/main.go b/src/fluence/cmd/main.go index c064ce8..3fb6a06 100644 --- a/src/fluence/cmd/main.go +++ b/src/fluence/cmd/main.go @@ -1,30 +1,44 @@ package main import ( - "fmt" "flag" + "fmt" "net" - "google.golang.org/grpc/keepalive" - "google.golang.org/grpc" + "strings" "time" + "google.golang.org/grpc" + "google.golang.org/grpc/keepalive" + pb "github.com/flux-framework/flux-k8s/flux-plugin/fluence/fluxcli-grpc" "github.com/flux-framework/flux-k8s/flux-plugin/fluence/fluxion" + "github.com/flux-framework/flux-k8s/flux-plugin/fluence/service" + svcPb "github.com/flux-framework/flux-k8s/flux-plugin/fluence/service-grpc" ) - const ( - port = ":4242" + defaultPort = ":4242" + enableExternalService = false ) var responsechan chan string -func main () { +func main() { fmt.Println("This is the fluxion grpc server") policy := flag.String("policy", "", "Match policy") label := flag.String("label", "", "Label name for fluence dedicated nodes") + grpcPort := flag.String("port", defaultPort, "Port for grpc service") + enableServicePlugin := flag.Bool("external-service", enableExternalService, "Flag to enable the external service (defaults to false)") flag.Parse() + + // Ensure our port starts with : + port := *grpcPort + if !strings.HasPrefix(":", port) { + port = fmt.Sprintf(":%s", port) + } + + // Fluxion GRPC flux := fluxion.Fluxion{} flux.InitFluxion(policy, label) @@ -36,14 +50,27 @@ func main () { responsechan = make(chan string) s := grpc.NewServer( grpc.KeepaliveParams(keepalive.ServerParameters{ - MaxConnectionIdle: 5 * time.Minute, + MaxConnectionIdle: 5 * time.Minute, }), ) - pb.RegisterFluxcliServiceServer(s, &flux /*&server{flux: flux}*/) + pb.RegisterFluxcliServiceServer(s, &flux) + + // External plugin (Kubectl) GRPC + // This will eventually be an external GRPC module that can + // be shared by fluence (flux-k8s) and fluence-kubectl + // We give it a handle to Flux to get the state of groups + // and job Ids. 
The direct interaction with Fluxion + // happens through the other service handle + if *enableServicePlugin { + plugin := service.ExternalService{} + plugin.Init() + svcPb.RegisterExternalPluginServiceServer(s, &plugin) + } + fmt.Printf("[GRPCServer] gRPC Listening on %s\n", lis.Addr().String()) if err := s.Serve(lis); err != nil { fmt.Printf("[GRPCServer] failed to serve: %v\n", err) } - + fmt.Printf("[GRPCServer] Exiting\n") -} \ No newline at end of file +} diff --git a/src/fluence/cmd/main.go.bk b/src/fluence/cmd/main.go.bk deleted file mode 100644 index 5e66d14..0000000 --- a/src/fluence/cmd/main.go.bk +++ /dev/null @@ -1,15 +0,0 @@ -package main - -import ( - "fmt" - "flag" - "github.com/flux-framework/flux-k8s/flux-plugin/kubeflux/fluxion" -) - -func main () { - policy := flag.String("policy", "", "Match policy") - flag.Parse() - fmt.Println("Policy ", policy) - fc := fluxion.Fluxion{Policy: *policy} - fc.InitFluxion() -} \ No newline at end of file diff --git a/src/fluence/defaults/defaults.go b/src/fluence/defaults/defaults.go new file mode 100644 index 0000000..f4fc8f2 --- /dev/null +++ b/src/fluence/defaults/defaults.go @@ -0,0 +1,5 @@ +package defaults + +var ( + KubernetesJsonGraphFormat = "/home/data/jgf/kubecluster.json" +) diff --git a/src/fluence/fluxcli-grpc/fluxcli.proto b/src/fluence/fluxcli-grpc/fluxcli.proto index f85b558..1446041 100644 --- a/src/fluence/fluxcli-grpc/fluxcli.proto +++ b/src/fluence/fluxcli-grpc/fluxcli.proto @@ -3,8 +3,7 @@ option go_package = "grpc/fluxcli"; package fluxcli; - -// Service definition +// Service definition for Fluxclient service FluxcliService { // Sends a Match command rpc Match(MatchRequest) returns (MatchResponse) {} @@ -73,4 +72,4 @@ message JGFRequest { // The JGF response message message JGFResponse { string jgf = 1; -} \ No newline at end of file +} diff --git a/src/fluence/fluxion/fluxion.go b/src/fluence/fluxion/fluxion.go index 18d6735..f29ac62 100644 --- a/src/fluence/fluxion/fluxion.go +++ b/src/fluence/fluxion/fluxion.go @@ -3,6 +3,7 @@ package fluxion import ( "os" + "github.com/flux-framework/flux-k8s/flux-plugin/fluence/defaults" pb "github.com/flux-framework/flux-k8s/flux-plugin/fluence/fluxcli-grpc" "github.com/flux-framework/flux-k8s/flux-plugin/fluence/jobspec" "github.com/flux-framework/flux-k8s/flux-plugin/fluence/utils" @@ -22,14 +23,13 @@ type Fluxion struct { func (f *Fluxion) InitFluxion(policy *string, label *string) { f.cli = fluxcli.NewReapiClient() - klog.Infof("[Fluence] Created flux resource client ", f.cli) - filename := "/home/data/jgf/kubecluster.json" - err := utils.CreateJGF(filename, label) + klog.Infof("[Fluence] Created flux resource client %s", f.cli) + err := utils.CreateJGF(defaults.KubernetesJsonGraphFormat, label) if err != nil { return } - jgf, err := os.ReadFile(filename) + jgf, err := os.ReadFile(defaults.KubernetesJsonGraphFormat) if err != nil { klog.Error("Error reading JGF") return @@ -38,7 +38,7 @@ func (f *Fluxion) InitFluxion(policy *string, label *string) { p := "{}" if *policy != "" { p = string("{\"matcher_policy\": \"" + *policy + "\"}") - klog.Infof("[Fluence] match policy: ", p) + klog.Infof("[Fluence] match policy: %s", p) } f.cli.InitContext(string(jgf), p) diff --git a/src/fluence/service-grpc/service.pb.go b/src/fluence/service-grpc/service.pb.go new file mode 100644 index 0000000..eca0e69 --- /dev/null +++ b/src/fluence/service-grpc/service.pb.go @@ -0,0 +1,351 @@ +// Code generated by protoc-gen-go. DO NOT EDIT. 
+// versions: +// protoc-gen-go v1.28.1 +// protoc v3.20.3 +// source: fluence/service-grpc/service.proto + +package service + +import ( + protoreflect "google.golang.org/protobuf/reflect/protoreflect" + protoimpl "google.golang.org/protobuf/runtime/protoimpl" + reflect "reflect" + sync "sync" +) + +const ( + // Verify that this generated code is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) + // Verify that runtime/protoimpl is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) +) + +// GroupRequest for a group +type GroupRequest struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Group string `protobuf:"bytes,1,opt,name=group,proto3" json:"group,omitempty"` +} + +func (x *GroupRequest) Reset() { + *x = GroupRequest{} + if protoimpl.UnsafeEnabled { + mi := &file_fluence_service_grpc_service_proto_msgTypes[0] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *GroupRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GroupRequest) ProtoMessage() {} + +func (x *GroupRequest) ProtoReflect() protoreflect.Message { + mi := &file_fluence_service_grpc_service_proto_msgTypes[0] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GroupRequest.ProtoReflect.Descriptor instead. +func (*GroupRequest) Descriptor() ([]byte, []int) { + return file_fluence_service_grpc_service_proto_rawDescGZIP(), []int{0} +} + +func (x *GroupRequest) GetGroup() string { + if x != nil { + return x.Group + } + return "" +} + +// GroupResponse +type GroupResponse struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"` + Size int64 `protobuf:"varint,2,opt,name=size,proto3" json:"size,omitempty"` +} + +func (x *GroupResponse) Reset() { + *x = GroupResponse{} + if protoimpl.UnsafeEnabled { + mi := &file_fluence_service_grpc_service_proto_msgTypes[1] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *GroupResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GroupResponse) ProtoMessage() {} + +func (x *GroupResponse) ProtoReflect() protoreflect.Message { + mi := &file_fluence_service_grpc_service_proto_msgTypes[1] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GroupResponse.ProtoReflect.Descriptor instead. 
+func (*GroupResponse) Descriptor() ([]byte, []int) { + return file_fluence_service_grpc_service_proto_rawDescGZIP(), []int{1} +} + +func (x *GroupResponse) GetName() string { + if x != nil { + return x.Name + } + return "" +} + +func (x *GroupResponse) GetSize() int64 { + if x != nil { + return x.Size + } + return 0 +} + +type ResourceRequest struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields +} + +func (x *ResourceRequest) Reset() { + *x = ResourceRequest{} + if protoimpl.UnsafeEnabled { + mi := &file_fluence_service_grpc_service_proto_msgTypes[2] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ResourceRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ResourceRequest) ProtoMessage() {} + +func (x *ResourceRequest) ProtoReflect() protoreflect.Message { + mi := &file_fluence_service_grpc_service_proto_msgTypes[2] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ResourceRequest.ProtoReflect.Descriptor instead. +func (*ResourceRequest) Descriptor() ([]byte, []int) { + return file_fluence_service_grpc_service_proto_rawDescGZIP(), []int{2} +} + +type ResourceResponse struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Graph string `protobuf:"bytes,1,opt,name=graph,proto3" json:"graph,omitempty"` +} + +func (x *ResourceResponse) Reset() { + *x = ResourceResponse{} + if protoimpl.UnsafeEnabled { + mi := &file_fluence_service_grpc_service_proto_msgTypes[3] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ResourceResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ResourceResponse) ProtoMessage() {} + +func (x *ResourceResponse) ProtoReflect() protoreflect.Message { + mi := &file_fluence_service_grpc_service_proto_msgTypes[3] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ResourceResponse.ProtoReflect.Descriptor instead. 
+func (*ResourceResponse) Descriptor() ([]byte, []int) { + return file_fluence_service_grpc_service_proto_rawDescGZIP(), []int{3} +} + +func (x *ResourceResponse) GetGraph() string { + if x != nil { + return x.Graph + } + return "" +} + +var File_fluence_service_grpc_service_proto protoreflect.FileDescriptor + +var file_fluence_service_grpc_service_proto_rawDesc = []byte{ + 0x0a, 0x22, 0x66, 0x6c, 0x75, 0x65, 0x6e, 0x63, 0x65, 0x2f, 0x73, 0x65, 0x72, 0x76, 0x69, 0x63, + 0x65, 0x2d, 0x67, 0x72, 0x70, 0x63, 0x2f, 0x73, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x2e, 0x70, + 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x07, 0x73, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x22, 0x24, 0x0a, + 0x0c, 0x47, 0x72, 0x6f, 0x75, 0x70, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x14, 0x0a, + 0x05, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x67, 0x72, + 0x6f, 0x75, 0x70, 0x22, 0x37, 0x0a, 0x0d, 0x47, 0x72, 0x6f, 0x75, 0x70, 0x52, 0x65, 0x73, 0x70, + 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x12, 0x0a, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x01, 0x20, 0x01, + 0x28, 0x09, 0x52, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x12, 0x12, 0x0a, 0x04, 0x73, 0x69, 0x7a, 0x65, + 0x18, 0x02, 0x20, 0x01, 0x28, 0x03, 0x52, 0x04, 0x73, 0x69, 0x7a, 0x65, 0x22, 0x11, 0x0a, 0x0f, + 0x52, 0x65, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x22, + 0x28, 0x0a, 0x10, 0x52, 0x65, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x52, 0x65, 0x73, 0x70, 0x6f, + 0x6e, 0x73, 0x65, 0x12, 0x14, 0x0a, 0x05, 0x67, 0x72, 0x61, 0x70, 0x68, 0x18, 0x01, 0x20, 0x01, + 0x28, 0x09, 0x52, 0x05, 0x67, 0x72, 0x61, 0x70, 0x68, 0x32, 0xda, 0x01, 0x0a, 0x15, 0x45, 0x78, + 0x74, 0x65, 0x72, 0x6e, 0x61, 0x6c, 0x50, 0x6c, 0x75, 0x67, 0x69, 0x6e, 0x53, 0x65, 0x72, 0x76, + 0x69, 0x63, 0x65, 0x12, 0x45, 0x0a, 0x0c, 0x47, 0x65, 0x74, 0x52, 0x65, 0x73, 0x6f, 0x75, 0x72, + 0x63, 0x65, 0x73, 0x12, 0x18, 0x2e, 0x73, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x2e, 0x52, 0x65, + 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x19, 0x2e, + 0x73, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x2e, 0x52, 0x65, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, + 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x12, 0x3d, 0x0a, 0x0a, 0x4c, 0x69, + 0x73, 0x74, 0x47, 0x72, 0x6f, 0x75, 0x70, 0x73, 0x12, 0x15, 0x2e, 0x73, 0x65, 0x72, 0x76, 0x69, + 0x63, 0x65, 0x2e, 0x47, 0x72, 0x6f, 0x75, 0x70, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, + 0x16, 0x2e, 0x73, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x2e, 0x47, 0x72, 0x6f, 0x75, 0x70, 0x52, + 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x12, 0x3b, 0x0a, 0x08, 0x47, 0x65, 0x74, + 0x47, 0x72, 0x6f, 0x75, 0x70, 0x12, 0x15, 0x2e, 0x73, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x2e, + 0x47, 0x72, 0x6f, 0x75, 0x70, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x16, 0x2e, 0x73, + 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x2e, 0x47, 0x72, 0x6f, 0x75, 0x70, 0x52, 0x65, 0x73, 0x70, + 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x42, 0x0e, 0x5a, 0x0c, 0x67, 0x72, 0x70, 0x63, 0x2f, 0x73, + 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, +} + +var ( + file_fluence_service_grpc_service_proto_rawDescOnce sync.Once + file_fluence_service_grpc_service_proto_rawDescData = file_fluence_service_grpc_service_proto_rawDesc +) + +func file_fluence_service_grpc_service_proto_rawDescGZIP() []byte { + file_fluence_service_grpc_service_proto_rawDescOnce.Do(func() { + file_fluence_service_grpc_service_proto_rawDescData = 
protoimpl.X.CompressGZIP(file_fluence_service_grpc_service_proto_rawDescData) + }) + return file_fluence_service_grpc_service_proto_rawDescData +} + +var file_fluence_service_grpc_service_proto_msgTypes = make([]protoimpl.MessageInfo, 4) +var file_fluence_service_grpc_service_proto_goTypes = []interface{}{ + (*GroupRequest)(nil), // 0: service.GroupRequest + (*GroupResponse)(nil), // 1: service.GroupResponse + (*ResourceRequest)(nil), // 2: service.ResourceRequest + (*ResourceResponse)(nil), // 3: service.ResourceResponse +} +var file_fluence_service_grpc_service_proto_depIdxs = []int32{ + 2, // 0: service.ExternalPluginService.GetResources:input_type -> service.ResourceRequest + 0, // 1: service.ExternalPluginService.ListGroups:input_type -> service.GroupRequest + 0, // 2: service.ExternalPluginService.GetGroup:input_type -> service.GroupRequest + 3, // 3: service.ExternalPluginService.GetResources:output_type -> service.ResourceResponse + 1, // 4: service.ExternalPluginService.ListGroups:output_type -> service.GroupResponse + 1, // 5: service.ExternalPluginService.GetGroup:output_type -> service.GroupResponse + 3, // [3:6] is the sub-list for method output_type + 0, // [0:3] is the sub-list for method input_type + 0, // [0:0] is the sub-list for extension type_name + 0, // [0:0] is the sub-list for extension extendee + 0, // [0:0] is the sub-list for field type_name +} + +func init() { file_fluence_service_grpc_service_proto_init() } +func file_fluence_service_grpc_service_proto_init() { + if File_fluence_service_grpc_service_proto != nil { + return + } + if !protoimpl.UnsafeEnabled { + file_fluence_service_grpc_service_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*GroupRequest); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_fluence_service_grpc_service_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*GroupResponse); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_fluence_service_grpc_service_proto_msgTypes[2].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*ResourceRequest); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_fluence_service_grpc_service_proto_msgTypes[3].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*ResourceResponse); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + } + type x struct{} + out := protoimpl.TypeBuilder{ + File: protoimpl.DescBuilder{ + GoPackagePath: reflect.TypeOf(x{}).PkgPath(), + RawDescriptor: file_fluence_service_grpc_service_proto_rawDesc, + NumEnums: 0, + NumMessages: 4, + NumExtensions: 0, + NumServices: 1, + }, + GoTypes: file_fluence_service_grpc_service_proto_goTypes, + DependencyIndexes: file_fluence_service_grpc_service_proto_depIdxs, + MessageInfos: file_fluence_service_grpc_service_proto_msgTypes, + }.Build() + File_fluence_service_grpc_service_proto = out.File + file_fluence_service_grpc_service_proto_rawDesc = nil + file_fluence_service_grpc_service_proto_goTypes = nil + file_fluence_service_grpc_service_proto_depIdxs = nil +} diff --git a/src/fluence/service-grpc/service.proto b/src/fluence/service-grpc/service.proto new file mode 100644 index 0000000..6240314 --- 
/dev/null +++ b/src/fluence/service-grpc/service.proto @@ -0,0 +1,34 @@ +syntax = "proto3"; +option go_package = "grpc/service"; + +package service; + + +// Service definition for an external plugin like kubectl +service ExternalPluginService { + + // This is supported via a shared file in the container + rpc GetResources(ResourceRequest) returns (ResourceResponse) {} + + // Note we currently cannot support getting group metadata, need to add handle to get info, etc. + rpc ListGroups(GroupRequest) returns (GroupResponse) {} + rpc GetGroup(GroupRequest) returns (GroupResponse) {} +} + +// GroupRequest for a group +message GroupRequest { + string group = 1; +} + +// GroupResponse +message GroupResponse { + string name = 1; + int64 size = 2; +} + +message ResourceRequest {} +message ResourceResponse { + string graph = 1; +} + + diff --git a/src/fluence/service-grpc/service_grpc.pb.go b/src/fluence/service-grpc/service_grpc.pb.go new file mode 100644 index 0000000..c15f8f3 --- /dev/null +++ b/src/fluence/service-grpc/service_grpc.pb.go @@ -0,0 +1,181 @@ +// Code generated by protoc-gen-go-grpc. DO NOT EDIT. +// versions: +// - protoc-gen-go-grpc v1.2.0 +// - protoc v3.20.3 +// source: fluence/service-grpc/service.proto + +package service + +import ( + context "context" + grpc "google.golang.org/grpc" + codes "google.golang.org/grpc/codes" + status "google.golang.org/grpc/status" +) + +// This is a compile-time assertion to ensure that this generated file +// is compatible with the grpc package it is being compiled against. +// Requires gRPC-Go v1.32.0 or later. +const _ = grpc.SupportPackageIsVersion7 + +// ExternalPluginServiceClient is the client API for ExternalPluginService service. +// +// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream. +type ExternalPluginServiceClient interface { + // This is supported via a shared file in the container + GetResources(ctx context.Context, in *ResourceRequest, opts ...grpc.CallOption) (*ResourceResponse, error) + // Note we currently cannot support getting group metadata, need to add handle to get info, etc. + ListGroups(ctx context.Context, in *GroupRequest, opts ...grpc.CallOption) (*GroupResponse, error) + GetGroup(ctx context.Context, in *GroupRequest, opts ...grpc.CallOption) (*GroupResponse, error) +} + +type externalPluginServiceClient struct { + cc grpc.ClientConnInterface +} + +func NewExternalPluginServiceClient(cc grpc.ClientConnInterface) ExternalPluginServiceClient { + return &externalPluginServiceClient{cc} +} + +func (c *externalPluginServiceClient) GetResources(ctx context.Context, in *ResourceRequest, opts ...grpc.CallOption) (*ResourceResponse, error) { + out := new(ResourceResponse) + err := c.cc.Invoke(ctx, "/service.ExternalPluginService/GetResources", in, out, opts...) + if err != nil { + return nil, err + } + return out, nil +} + +func (c *externalPluginServiceClient) ListGroups(ctx context.Context, in *GroupRequest, opts ...grpc.CallOption) (*GroupResponse, error) { + out := new(GroupResponse) + err := c.cc.Invoke(ctx, "/service.ExternalPluginService/ListGroups", in, out, opts...) + if err != nil { + return nil, err + } + return out, nil +} + +func (c *externalPluginServiceClient) GetGroup(ctx context.Context, in *GroupRequest, opts ...grpc.CallOption) (*GroupResponse, error) { + out := new(GroupResponse) + err := c.cc.Invoke(ctx, "/service.ExternalPluginService/GetGroup", in, out, opts...) 
+ if err != nil { + return nil, err + } + return out, nil +} + +// ExternalPluginServiceServer is the server API for ExternalPluginService service. +// All implementations must embed UnimplementedExternalPluginServiceServer +// for forward compatibility +type ExternalPluginServiceServer interface { + // This is supported via a shared file in the container + GetResources(context.Context, *ResourceRequest) (*ResourceResponse, error) + // Note we currently cannot support getting group metadata, need to add handle to get info, etc. + ListGroups(context.Context, *GroupRequest) (*GroupResponse, error) + GetGroup(context.Context, *GroupRequest) (*GroupResponse, error) + mustEmbedUnimplementedExternalPluginServiceServer() +} + +// UnimplementedExternalPluginServiceServer must be embedded to have forward compatible implementations. +type UnimplementedExternalPluginServiceServer struct { +} + +func (UnimplementedExternalPluginServiceServer) GetResources(context.Context, *ResourceRequest) (*ResourceResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method GetResources not implemented") +} +func (UnimplementedExternalPluginServiceServer) ListGroups(context.Context, *GroupRequest) (*GroupResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method ListGroups not implemented") +} +func (UnimplementedExternalPluginServiceServer) GetGroup(context.Context, *GroupRequest) (*GroupResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method GetGroup not implemented") +} +func (UnimplementedExternalPluginServiceServer) mustEmbedUnimplementedExternalPluginServiceServer() {} + +// UnsafeExternalPluginServiceServer may be embedded to opt out of forward compatibility for this service. +// Use of this interface is not recommended, as added methods to ExternalPluginServiceServer will +// result in compilation errors. 
+type UnsafeExternalPluginServiceServer interface { + mustEmbedUnimplementedExternalPluginServiceServer() +} + +func RegisterExternalPluginServiceServer(s grpc.ServiceRegistrar, srv ExternalPluginServiceServer) { + s.RegisterService(&ExternalPluginService_ServiceDesc, srv) +} + +func _ExternalPluginService_GetResources_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(ResourceRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(ExternalPluginServiceServer).GetResources(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: "/service.ExternalPluginService/GetResources", + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(ExternalPluginServiceServer).GetResources(ctx, req.(*ResourceRequest)) + } + return interceptor(ctx, in, info, handler) +} + +func _ExternalPluginService_ListGroups_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(GroupRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(ExternalPluginServiceServer).ListGroups(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: "/service.ExternalPluginService/ListGroups", + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(ExternalPluginServiceServer).ListGroups(ctx, req.(*GroupRequest)) + } + return interceptor(ctx, in, info, handler) +} + +func _ExternalPluginService_GetGroup_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(GroupRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(ExternalPluginServiceServer).GetGroup(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: "/service.ExternalPluginService/GetGroup", + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(ExternalPluginServiceServer).GetGroup(ctx, req.(*GroupRequest)) + } + return interceptor(ctx, in, info, handler) +} + +// ExternalPluginService_ServiceDesc is the grpc.ServiceDesc for ExternalPluginService service. 
+// It's only intended for direct use with grpc.RegisterService, +// and not to be introspected or modified (even as a copy) +var ExternalPluginService_ServiceDesc = grpc.ServiceDesc{ + ServiceName: "service.ExternalPluginService", + HandlerType: (*ExternalPluginServiceServer)(nil), + Methods: []grpc.MethodDesc{ + { + MethodName: "GetResources", + Handler: _ExternalPluginService_GetResources_Handler, + }, + { + MethodName: "ListGroups", + Handler: _ExternalPluginService_ListGroups_Handler, + }, + { + MethodName: "GetGroup", + Handler: _ExternalPluginService_GetGroup_Handler, + }, + }, + Streams: []grpc.StreamDesc{}, + Metadata: "fluence/service-grpc/service.proto", +} diff --git a/src/fluence/service/service.go b/src/fluence/service/service.go new file mode 100644 index 0000000..ad61c1a --- /dev/null +++ b/src/fluence/service/service.go @@ -0,0 +1,61 @@ +package service + +import ( + "os" + + "github.com/flux-framework/flux-k8s/flux-plugin/fluence/defaults" + pb "github.com/flux-framework/flux-k8s/flux-plugin/fluence/service-grpc" + + "k8s.io/klog/v2" + + "context" +) + +type ExternalService struct { + pb.UnimplementedExternalPluginServiceServer +} + +// Init is a helper function for any startup stuff, for which now we have none :) +func (f *ExternalService) Init() { + klog.Infof("[Fluence] Created external service.") +} + +// GetGroup gets and returns the group info +// TODO no good way to look up group - we would need to ask Fluxion directly OR put the grpc +// service alongside the scheduler plugin, which seems like a bad design +func (s *ExternalService) GetGroup(ctx context.Context, in *pb.GroupRequest) (*pb.GroupResponse, error) { + klog.Infof("[Fluence] Calling get group endpoint! %v\n", in) + + // Prepare an empty match response (that can still be serialized) + emptyResponse := &pb.GroupResponse{} + return emptyResponse, nil +} + +// List group returns existing groups +func (s *ExternalService) ListGroups(ctx context.Context, in *pb.GroupRequest) (*pb.GroupResponse, error) { + + emptyResponse := &pb.GroupResponse{} + + // Prepare an empty match response (that can still be serialized) + klog.Infof("[Fluence] Calling list groups endpoint! %v\n", in) + + return emptyResponse, nil +} + +// GetResources gets the current Kubernetes Json Graph Format JGF +// This should be created on init of the scheduler +func (s *ExternalService) GetResources(ctx context.Context, in *pb.ResourceRequest) (*pb.ResourceResponse, error) { + + emptyResponse := &pb.ResourceResponse{} + + // Prepare an empty match response (that can still be serialized) + klog.Infof("[Fluence] Calling get resources endpoint! %v\n", in) + + jgf, err := os.ReadFile(defaults.KubernetesJsonGraphFormat) + if err != nil { + klog.Error("Error reading JGF") + return emptyResponse, err + } + emptyResponse.Graph = string(jgf) + return emptyResponse, nil +} From 673e34d002e37fcd97603574b02be15f1b4a8a46 Mon Sep 17 00:00:00 2001 From: vsoch Date: Sat, 17 Feb 2024 00:34:02 -0700 Subject: [PATCH 09/28] feat: add controller base image to build from here Problem: we want to be able to persist PodGroup if upstream removes it Solution: build our own controller image, also allowing us to tweak it to enhance fluence. 
This commit also renames the helm install to be "fluence" so it is easier for the developer workflow Signed-off-by: vsoch --- .github/test.sh | 0 Makefile | 2 + README.md | 173 +++++--------- examples/kind-config.yaml | 26 +++ examples/kube_setup/taint_workers.sh | 0 examples/pi/clean_pods.sh | 0 examples/pi/demo_failed_pod_cancellation.sh | 0 examples/pi/init_kind_cluster.sh | 0 .../run_experiments/process_job_template.py | 0 examples/run_experiments/run_experiments.py | 0 .../charts/as-a-second-scheduler/values.yaml | 1 + .../pkg/controllers/podgroup_controller.go | 220 ++++++++++++++++++ 12 files changed, 306 insertions(+), 116 deletions(-) mode change 100755 => 100644 .github/test.sh create mode 100644 examples/kind-config.yaml mode change 100755 => 100644 examples/kube_setup/taint_workers.sh mode change 100755 => 100644 examples/pi/clean_pods.sh mode change 100755 => 100644 examples/pi/demo_failed_pod_cancellation.sh mode change 100755 => 100644 examples/pi/init_kind_cluster.sh mode change 100755 => 100644 examples/run_experiments/process_job_template.py mode change 100755 => 100644 examples/run_experiments/run_experiments.py create mode 100644 sig-scheduler-plugins/pkg/controllers/podgroup_controller.go diff --git a/.github/test.sh b/.github/test.sh old mode 100755 new mode 100644 diff --git a/Makefile b/Makefile index 1356160..97efa75 100644 --- a/Makefile +++ b/Makefile @@ -26,8 +26,10 @@ update: clone prepare: clone # These are entirely new directory structures rm -rf $(CLONE_UPSTREAM)/pkg/fluence + rm -rf $(CLONE_UPSTREAM)/pkg/controllers/podgroup_controller.go rm -rf $(CLONE_UPSTREAM)/manifests/fluence cp -R sig-scheduler-plugins/pkg/fluence $(CLONE_UPSTREAM)/pkg/fluence + cp -R sig-scheduler-plugins/pkg/controllers/* $(CLONE_UPSTREAM)/pkg/controllers/ # This is the one exception not from sig-scheduler-plugins because it is needed in both spots cp -R src/fluence/fluxcli-grpc $(CLONE_UPSTREAM)/pkg/fluence/fluxcli-grpc # These are files with subtle changes to add fluence diff --git a/README.md b/README.md index 0433799..f0d67cd 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,9 @@ ![docs/images/fluence.png](docs/images/fluence.png) -Fluence enables HPC-grade pod scheduling in Kubernetes via the [Kubernetes Scheduling Framework](https://kubernetes.io/docs/concepts/scheduling-eviction/scheduling-framework/). Fluence uses the directed-graph based [Fluxion scheduler](https://github.com/flux-framework/flux-sched) to map pods or [podgroups](https://github.com/kubernetes-sigs/scheduler-plugins/tree/master/pkg/coscheduling) to nodes. Fluence supports all the Fluxion scheduling algorithms (e.g., `hi`, `low`, `hinode`, etc.). Note that Fluence does not currently support use in conjunction with the kube-scheduler. Pods must all be scheduled by Fluence. +Fluence enables HPC-grade pod scheduling in Kubernetes via the [Kubernetes Scheduling Framework](https://kubernetes.io/docs/concepts/scheduling-eviction/scheduling-framework/). Fluence uses the directed-graph based [Fluxion scheduler](https://github.com/flux-framework/flux-sched) to map pods or [podgroups](https://github.com/kubernetes-sigs/scheduler-plugins/tree/master/pkg/coscheduling) to nodes. Fluence supports all the Fluxion scheduling algorithms (e.g., `hi`, `low`, `hinode`, etc.). + +**Important** Fluence does not currently support use in conjunction with the kube-scheduler. Pods must all be scheduled by Fluence, and *you should not use both schedulers in the same cluster*. 
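In practice, "scheduled by Fluence" means every pod (or the pod template of a Job or Deployment) points at the fluence scheduler and, optionally, declares its group through labels; pods without a group label are treated as a size-1 group. A minimal sketch in Go is below; the scheduler name and the literal label keys are assumptions for illustration (the authoritative constants are `PodGroupNameLabel` and `PodGroupSizeLabel` in `sig-scheduler-plugins/pkg/fluence/group/group.go`):

```go
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"sigs.k8s.io/yaml"
)

func main() {
	pod := &corev1.Pod{
		TypeMeta: metav1.TypeMeta{APIVersion: "v1", Kind: "Pod"},
		ObjectMeta: metav1.ObjectMeta{
			Name: "fluence-demo",
			Labels: map[string]string{
				// Illustrative keys: see PodGroupNameLabel / PodGroupSizeLabel
				// in sig-scheduler-plugins/pkg/fluence/group/group.go for the real ones
				"fluence.group-name": "fluence-demo",
				"fluence.group-size": "4",
			},
		},
		Spec: corev1.PodSpec{
			// Assumed scheduler name; check scheduler.name in the chart values
			SchedulerName: "fluence",
			Containers: []corev1.Container{
				{Name: "app", Image: "busybox", Command: []string{"sleep", "60"}},
			},
		},
	}
	// Print the manifest so it can be piped to kubectl apply -f -
	out, _ := yaml.Marshal(pod)
	fmt.Println(string(out))
}
```
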
## Getting started @@ -66,7 +68,8 @@ cd upstream/manifests/install/charts helm install \ --set scheduler.image=ghcr.io/flux-framework/fluence:latest \ --set scheduler.sidecarimage=ghcr.io/flux-framework/fluence-sidecar \ - schedscheduler-plugins as-a-second-scheduler/ + --set controller.image=ghcr.io/flux-framework/fluence-controller \ + fluence as-a-second-scheduler/ ``` And that's it! See the [testing install](#testing-install) section for a basic example @@ -85,17 +88,18 @@ To build and test Fluence, you will need: There are two images we will be building: - the scheduler sidecar: built from the repository here - - the scheduler: built from [this branch of scheduler-plugins](https://github.com/openshift-psap/scheduler-plugins/blob/fluence/build/scheduler/Dockerfile) + - the scheduler: built (and modified) from [this branch of scheduler-plugins](https://github.com/openshift-psap/scheduler-plugins/blob/fluence/build/scheduler/Dockerfile) + - the controller: same as the scheduler -#### All at once (Sidecar + Scheduler) +#### Build All -**recommended** +**This builds the scheduler, sidecar to the scheduler, and controller** This will run the full builds for all containers in one step, which includes: 1. Building the fluence sidecar from source code in [src](src) 2. Cloning the upstream kubernetes-sigs/plugin-schedulers respository to ./upstream -3. Building the scheduler container +3. Building the scheduler and controller containers From the root here: @@ -106,115 +110,18 @@ make or customize the naming of your registry or local images: ```bash -make REGISTRY=vanessa SCHEDULER_IMAGE=fluence SIDECAR_IMAGE=fluence-sidecar -``` - -As an alternative, you can do each of the steps separately or manually (detailed below). - -
- - Manual Build Instructions - -#### Build Sidecar - -To build the plugin containers, we will basically be running `make` from the [src](src) directory. We have wrapped that for you -in the Makefile: - -```bash -make build-sidecar -``` - -To build for a custom registry (e.g., "vanessa' on Docker Hub): - -```bash -make build-sidecar REGISTRY=vanessa -``` - -And specify the sidecar image name too: - -```bash -make build-sidecar REGISTRY=vanessa SIDECAR_IMAGE=another-sidecar -``` - -The equivalent manual command is: - -```bash -cd src -make -``` - -Using either of the approaches above, this will create the scheduler plugin main container, which can be tagged and pushed to the preferred registry. As an example, -here we push to the result of the build above: - -```bash -docker push docker.io/vanessa/fluence-sidecar:latest -``` - -#### Build Scheduler - -Note that you can run this entire process like: - -```bash -make prepare -make build -``` - -Or customize the name of the scheduler image: - -```bash -make prepare -make build REGISTRY=vanessa -``` - -For a custom scheduler or controller image (we just need the scheduler): - -```bash -make build REGISTRY=vanessa CONTROLLER_IMAGE=fluence-controller SCHEDULER_IMAGE=fluence -``` - -To walk through it manually, first, clone the upstream scheduler-plugins repository: - -```bash -git clone https://github.com/kubernetes-sigs/scheduler-plugins ./upstream -``` - -We need to add our fluence package to the scheduler plugins to build. You can do that manully as follows: - -```bash -# These are entirely new directory structures -cp -R sig-scheduler-plugins/pkg/fluence ./upstream/pkg/fluence -cp -R sig-scheduler-plugins/manifests/fluence ./upstream/manifests/fluence - -# These are files with subtle changes to add fluence -cp sig-scheduler-plugins/cmd/scheduler/main.go ./upstream/cmd/scheduler/main.go -cp sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/deployment.yaml ./upstream/manifests/install/charts/as-a-second-scheduler/templates/deployment.yaml -cp sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/values.yaml ./upstream/manifests/install/charts/as-a-second-scheduler/values.yaml -``` - -Then change directory to the scheduler plugins repository. - -```bash -cd ./upstream +make REGISTRY=vanessa SCHEDULER_IMAGE=fluence SIDECAR_IMAGE=fluence-sidecar CONTROLLER_IMAGE=fluence-controller ``` -And build! You'll most likely want to set a custom registry and image name again: +As an alternative, you can look at the Makefile to do each of the steps separately. -```bash -# This will build to localhost -make local-image - -# this will build to docker.io/vanessa/fluence -make local-image REGISTRY=vanessa CONTROLLER_IMAGE=fluence -``` - -
- -**Important** the make command above produces _two images_ and you want to use the first that is mentioned in the output (not the second, which is a controller). Whatever build approach you use, you'll want to push to your registry for later discovery! ```bash docker push docker.io/vanessa/fluence +docker push docker.io/vanessa/fluence-sidecar +docker push docker.io/vanessa/fluence-controller ``` ### Prepare Cluster @@ -268,7 +175,7 @@ scheduler: controller: name: scheduler-plugins-controller - image: registry.k8s.io/scheduler-plugins/controller:v0.27.8 + image: ghcr.io/flux-framework/fluence-controller:latest replicaCount: 1 pullPolicy: IfNotPresent @@ -303,7 +210,8 @@ cd upstream/manifests/install/charts helm install \ --set scheduler.image=vanessa/fluence:latest \ --set scheduler.sidecarimage=vanessa/fluence-sidecar \ - schedscheduler-plugins as-a-second-scheduler/ + --set controller.image=vanessa/fluence-controller \ + fluence as-a-second-scheduler/ ``` If you load your images into your testing environment and don't need to pull, you can change the pull policy too: @@ -312,14 +220,15 @@ If you load your images into your testing environment and don't need to pull, yo helm install \ --set scheduler.image=vanessa/fluence:latest \ --set scheduler.sidecarimage=vanessa/fluence-sidecar \ + --set controller.image=vanessa/fluence-controller \ --set scheduler.sidecarPullPolicy=IfNotPresent \ - schedscheduler-plugins as-a-second-scheduler/ + fluence as-a-second-scheduler/ ``` If you need to uninstall (e.g., to redo something): ```bash -helm uninstall schedscheduler-plugins +helm uninstall fluence ``` Next you can move down to testing the install. @@ -519,14 +428,21 @@ The easiest thing to do is to build the containers in some container namespace t make build REGISTRY=ghcr.io/vsoch ``` +If needed, create a "multi node" kind cluster: + +```bash +kind create cluster --config ./examples/kind-config.yaml +``` + And then install with your custom images: ```bash cd ./upstream/manifests/install/charts helm install \ --set scheduler.image=ghcr.io/vsoch/fluence:latest \ + --set controller.image=ghcr.io/vsoch/fluence-controller:latest \ --set scheduler.sidecarimage=ghcr.io/vsoch/fluence-sidecar:latest \ - schedscheduler-plugins as-a-second-scheduler/ + fluence as-a-second-scheduler/ ``` And then apply what you need to test, and look at logs! @@ -540,8 +456,9 @@ Note that if you want to enable extra endpoints for the fluence kubectl plugin a helm install \ --set scheduler.image=ghcr.io/vsoch/fluence:latest \ --set scheduler.enableExternalService=true \ + --set controller.image=vanessa/fluence-controller \ --set scheduler.sidecarimage=ghcr.io/vsoch/fluence-sidecar:latest \ - schedscheduler-plugins as-a-second-scheduler/ + fluence as-a-second-scheduler/ ``` For this setup if you are developing locally with kind, you will need to enable the ingress. Here is `kind-config.yaml` @@ -569,6 +486,30 @@ And to create: kind create cluster --config ./kind-config.yaml ``` +#### Vanessa Thinking + +> Updated February 15, 2024 + +What I think might be happening (and not always, sometimes) + +- New pod group, no node list +- Fluence assigns nodes +- Nodes get assigned to pods 1:1 +- POD group is deleted +- Some pod is sent back to queue (kubelet rejects, etc) +- POD group does not exist and is recreated, no node list +- Fluence asks again, but still has the first job. Not enough resources, asks forever. 
+ +The above would not happen with the persistent pod group (if it wasn't cleaned up until the deletion of the job) and wouldn't happen if there are just enough resources to account for the overlap. + +- Does Fluence allocate resources for itself? +- It would be nice to be able to inspect the state of Fluence. +- At some point we want to be using the TBA fluxion-go instead of the one off branch we currently have (but we don't need to be blocked for that) +- We should (I think) restore pod group (it's in the controller here) and have our own container built. That way we have total control over the custom resource, and we don't risk it going away. + - As a part of that, we can add add a mutating webhook that emulates what we are doing in fluence now to find the label, but instead we will create the CRD to hold state instead of trying to hold in the operator. +- It could then also be investigated that we can more flexibly change the size of the group, within some min/max size (also determined by labels?) to help with scheduling. +- Note that kueue has added a Pod Group object, so probably addresses the static case here. + #### Components - [FluxStateData](sig-scheduler-plugins/pkg/fluence/core/core.go): is given to the [framework.CycleState](https://github.com/kubernetes/kubernetes/blob/242b41b36a20032f99e8a059ca0a5d764105217b/pkg/scheduler/framework/cycle_state.go#L48) and serves as a vehicle to store a cache of node name assignment. @@ -583,14 +524,14 @@ The install commands are shown above, but often you want to uninstall! ```bash helm list NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION -schedscheduler-plugins default 1 2024-01-08 12:04:58.558612156 -0700 MST deployed scheduler-plugins-0.27.80.27.8 +fluence default 1 2024-01-08 12:04:58.558612156 -0700 MST deployed scheduler-plugins-0.27.80.27.8 ``` And then uninstall: ```bash -$ helm uninstall schedscheduler-plugins -release "schedscheduler-plugins" uninstalled +$ helm uninstall fluence +release "fluence" uninstalled ``` diff --git a/examples/kind-config.yaml b/examples/kind-config.yaml new file mode 100644 index 0000000..2971483 --- /dev/null +++ b/examples/kind-config.yaml @@ -0,0 +1,26 @@ +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +nodes: +- role: control-plane + kubeadmConfigPatches: + - | + kind: InitConfiguration + nodeRegistration: + kubeletExtraArgs: + node-labels: "ingress-ready=true" + extraPortMappings: + - containerPort: 8080 + hostPort: 8080 + protocol: TCP + - containerPort: 4242 + hostPort: 4242 + protocol: TCP + - containerPort: 4243 + hostPort: 4243 + protocol: TCP +- role: worker +- role: worker +- role: worker +- role: worker +- role: worker +- role: worker \ No newline at end of file diff --git a/examples/kube_setup/taint_workers.sh b/examples/kube_setup/taint_workers.sh old mode 100755 new mode 100644 diff --git a/examples/pi/clean_pods.sh b/examples/pi/clean_pods.sh old mode 100755 new mode 100644 diff --git a/examples/pi/demo_failed_pod_cancellation.sh b/examples/pi/demo_failed_pod_cancellation.sh old mode 100755 new mode 100644 diff --git a/examples/pi/init_kind_cluster.sh b/examples/pi/init_kind_cluster.sh old mode 100755 new mode 100644 diff --git a/examples/run_experiments/process_job_template.py b/examples/run_experiments/process_job_template.py old mode 100755 new mode 100644 diff --git a/examples/run_experiments/run_experiments.py b/examples/run_experiments/run_experiments.py old mode 100755 new mode 100644 diff --git 
a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/values.yaml b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/values.yaml index 2a35a3a..be1e797 100644 --- a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/values.yaml +++ b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/values.yaml @@ -22,6 +22,7 @@ scheduler: controller: name: scheduler-plugins-controller image: registry.k8s.io/scheduler-plugins/controller:v0.27.8 + image: ghcr.io/flux-framework/fluence-controller:latest replicaCount: 1 pullPolicy: IfNotPresent diff --git a/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go b/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go new file mode 100644 index 0000000..02eb4e4 --- /dev/null +++ b/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go @@ -0,0 +1,220 @@ +/* +Copyright 2023 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controllers + +import ( + "context" + "fmt" + "sort" + "strings" + "time" + + "github.com/go-logr/logr" + v1 "k8s.io/api/core/v1" + apierrs "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/tools/record" + + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/log" + schedv1alpha1 "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1" + "sigs.k8s.io/scheduler-plugins/pkg/util" +) + +// PodGroupReconciler reconciles a PodGroup object +type PodGroupReconciler struct { + log logr.Logger + recorder record.EventRecorder + + client.Client + Scheme *runtime.Scheme + Workers int +} + +// +kubebuilder:rbac:groups=scheduling.x-k8s.io,resources=podgroups,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=scheduling.x-k8s.io,resources=podgroups/status,verbs=get;update;patch +// +kubebuilder:rbac:groups=scheduling.x-k8s.io,resources=podgroups/finalizers,verbs=update + +// Reconcile is part of the main kubernetes reconciliation loop which aims to +// move the current state of the cluster closer to the desired state. +// TODO(user): Modify the Reconcile function to compare the state specified by +// the PodGroup object against the actual cluster state, and then +// perform operations to make the cluster state reflect the state specified by +// the user. 
+// +// For more details, check Reconcile and its Result here: +// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.11.0/pkg/reconcile +func (r *PodGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + log := log.FromContext(ctx) + log.Info("reconciling flux-framework/fluence-controller") + pg := &schedv1alpha1.PodGroup{} + if err := r.Get(ctx, req.NamespacedName, pg); err != nil { + if apierrs.IsNotFound(err) { + log.V(5).Info("Pod group has been deleted") + return ctrl.Result{}, nil + } + log.V(3).Error(err, "Unable to retrieve pod group") + return ctrl.Result{}, err + } + + if pg.Status.Phase == schedv1alpha1.PodGroupFinished || + pg.Status.Phase == schedv1alpha1.PodGroupFailed { + return ctrl.Result{}, nil + } + // If startScheduleTime - createTime > 2days, + // do not reconcile again because pod may have been GCed + if (pg.Status.Phase == schedv1alpha1.PodGroupScheduling || pg.Status.Phase == schedv1alpha1.PodGroupPending) && pg.Status.Running == 0 && + pg.Status.ScheduleStartTime.Sub(pg.CreationTimestamp.Time) > 48*time.Hour { + r.recorder.Event(pg, v1.EventTypeWarning, + "Timeout", "schedule time longer than 48 hours") + return ctrl.Result{}, nil + } + + podList := &v1.PodList{} + if err := r.List(ctx, podList, + client.MatchingLabelsSelector{ + Selector: labels.Set(map[string]string{ + schedv1alpha1.PodGroupLabel: pg.Name}).AsSelector(), + }); err != nil { + log.Error(err, "List pods for group failed") + return ctrl.Result{}, err + } + pods := podList.Items + + pgCopy := pg.DeepCopy() + switch pgCopy.Status.Phase { + case "": + pgCopy.Status.Phase = schedv1alpha1.PodGroupPending + case schedv1alpha1.PodGroupPending: + if len(pods) >= int(pg.Spec.MinMember) { + pgCopy.Status.Phase = schedv1alpha1.PodGroupScheduling + fillOccupiedObj(pgCopy, &pods[0]) + } + default: + pgCopy.Status.Running, pgCopy.Status.Succeeded, pgCopy.Status.Failed = getCurrentPodStats(pods) + if len(pods) < int(pg.Spec.MinMember) { + pgCopy.Status.Phase = schedv1alpha1.PodGroupPending + break + } + + if pgCopy.Status.Succeeded+pgCopy.Status.Running < pg.Spec.MinMember { + pgCopy.Status.Phase = schedv1alpha1.PodGroupScheduling + } + + if pgCopy.Status.Succeeded+pgCopy.Status.Running >= pg.Spec.MinMember { + pgCopy.Status.Phase = schedv1alpha1.PodGroupRunning + } + // Final state of pod group + if pgCopy.Status.Failed != 0 && + pgCopy.Status.Failed+pgCopy.Status.Running+pgCopy.Status.Succeeded >= pg.Spec.MinMember { + pgCopy.Status.Phase = schedv1alpha1.PodGroupFailed + } + if pgCopy.Status.Succeeded >= pg.Spec.MinMember { + pgCopy.Status.Phase = schedv1alpha1.PodGroupFinished + } + } + + return r.patchPodGroup(ctx, pg, pgCopy) +} + +func (r *PodGroupReconciler) patchPodGroup(ctx context.Context, old, new *schedv1alpha1.PodGroup) (ctrl.Result, error) { + patch := client.MergeFrom(old) + if err := r.Status().Patch(ctx, new, patch); err != nil { + return ctrl.Result{}, err + } + err := r.Patch(ctx, new, patch) + return ctrl.Result{}, err +} + +func getCurrentPodStats(pods []v1.Pod) (int32, int32, int32) { + if len(pods) == 0 { + return 0, 0, 0 + } + + var ( + running int32 = 0 + succeeded int32 = 0 + failed int32 = 0 + ) + for _, pod := range pods { + switch pod.Status.Phase { + case v1.PodRunning: + running++ + case v1.PodSucceeded: + succeeded++ + case v1.PodFailed: + failed++ + } + } + return running, succeeded, failed +} + +func fillOccupiedObj(pg *schedv1alpha1.PodGroup, pod *v1.Pod) { + if len(pod.OwnerReferences) == 0 { + return + } + + var refs []string + for _, 
ownerRef := range pod.OwnerReferences { + refs = append(refs, fmt.Sprintf("%s/%s", pod.Namespace, ownerRef.Name)) + } + if len(refs) != 0 { + sort.Strings(refs) + pg.Status.OccupiedBy = strings.Join(refs, ",") + } +} + +// SetupWithManager sets up the controller with the Manager. +func (r *PodGroupReconciler) SetupWithManager(mgr ctrl.Manager) error { + r.recorder = mgr.GetEventRecorderFor("PodGroupController") + r.log = mgr.GetLogger() + r.log.Info("setup with manager flux-framework/fluence-controller") + + return ctrl.NewControllerManagedBy(mgr). + Watches(&v1.Pod{}, handler.EnqueueRequestsFromMapFunc(r.podToPodGroup)). + For(&schedv1alpha1.PodGroup{}). + WithOptions(controller.Options{MaxConcurrentReconciles: r.Workers}). + Complete(r) +} + +// podToPodGroup is a watcher that looks for pods and associated pod group +func (r *PodGroupReconciler) podToPodGroup(ctx context.Context, obj client.Object) []ctrl.Request { + + pod, ok := obj.(*v1.Pod) + if !ok { + return nil + } + r.log.Info("podToPodGroup flux-framework/fluence-controller") + r.log.V(5).Info("Running podToPodGroup", "pod", pod.Name, "namespace", pod.Namespace) + pgName := util.GetPodGroupLabel(pod) + if len(pgName) == 0 { + return nil + } + + r.log.V(5).Info("Add pod group when pod gets added", "podGroup", pgName, "pod", pod.Name, "namespace", pod.Namespace) + + return []ctrl.Request{{ + NamespacedName: types.NamespacedName{ + Namespace: pod.Namespace, + Name: pgName, + }}} +} From 41b2ad284e92ebe98048fd6acf15babb5dc2f053 Mon Sep 17 00:00:00 2001 From: vsoch Date: Sat, 17 Feb 2024 01:09:31 -0700 Subject: [PATCH 10/28] docker: simplify fluence build to use fluxion-go Signed-off-by: vsoch --- src/Makefile | 8 +- src/build/scheduler/Dockerfile | 129 +++------------------------------ src/fluence/fluxion/fluxion.go | 2 +- src/fluence/go.mod | 4 +- src/fluence/go.sum | 4 +- 5 files changed, 21 insertions(+), 126 deletions(-) diff --git a/src/Makefile b/src/Makefile index 3392add..af5fcb3 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,12 +1,12 @@ -FLUX_SCHED_ROOT ?= /home/flux-sched +FLUX_SCHED_ROOT ?= /opt/flux-sched INSTALL_PREFIX ?= /usr +LIB_PREFIX ?= /usr/lib LOCALBIN ?= $(shell pwd)/bin COMMONENVVAR=GOOS=$(shell uname -s | tr A-Z a-z) +#BUILDENVVAR=CGO_CFLAGS="-I${FLUX_SCHED_ROOT}/resource/reapi/bindings/c" CGO_LDFLAGS="-L${INSTALL_PREFIX}/lib -L${FLUX_SCHED_ROOT}/resource -lresource -L${FLUX_SCHED_ROOT}/resource/libjobspec -ljobspec_conv -L/${FLUX_SCHED_ROOT}/resource/reapi/bindings -lreapi_cli -lflux-idset -lstdc++ -lczmq -ljansson -lhwloc -lboost_system -lflux-hostlist -lboost_graph -lyaml-cpp" +BUILDENVVAR=CGO_CFLAGS="-I${FLUX_SCHED_ROOT} -I${FLUX_SCHED_ROOT}/resource/reapi/bindings/c" CGO_LDFLAGS="-L${LIB_PREFIX} -L${LIB_PREFIX}/flux -L${FLUX_SCHED_ROOT}/resource/reapi/bindings -lreapi_cli -lflux-idset -lstdc++ -lczmq -ljansson -lhwloc -lboost_system -lflux-hostlist -lboost_graph -lyaml-cpp" -# This is what worked -# GOOS=linux CGO_CFLAGS="-I/home/flux-sched/resource/reapi/bindings/c" CGO_LDFLAGS="-L/usr/lib -L/home/flux-sched/resource -lresource -L/home/flux-sched/resource/libjobspec -ljobspec_conv -L/home/flux-sched/resource/reapi/bindings -lreapi_cli -lflux-idset -lstdc++ -lczmq -ljansson -lhwloc -lboost_system -lflux-hostlist -lboost_graph -lyaml-cpp" go build -ldflags '-w' -o bin/server cmd/main.go -BUILDENVVAR=CGO_CFLAGS="-I${FLUX_SCHED_ROOT}/resource/reapi/bindings/c" CGO_LDFLAGS="-L${INSTALL_PREFIX}/lib -L${FLUX_SCHED_ROOT}/resource -lresource -L${FLUX_SCHED_ROOT}/resource/libjobspec -ljobspec_conv 
-L/${FLUX_SCHED_ROOT}/resource/reapi/bindings -lreapi_cli -lflux-idset -lstdc++ -lczmq -ljansson -lhwloc -lboost_system -lflux-hostlist -lboost_graph -lyaml-cpp" LOCAL_REGISTRY=localhost:5000 LOCAL_IMAGE=fluence-sidecar:latest diff --git a/src/build/scheduler/Dockerfile b/src/build/scheduler/Dockerfile index 15a9678..67bd5ce 100644 --- a/src/build/scheduler/Dockerfile +++ b/src/build/scheduler/Dockerfile @@ -1,138 +1,33 @@ -FROM ubuntu:latest as base +FROM fluxrm/flux-sched:jammy -RUN apt -y update && apt -y upgrade && apt -y clean && apt -y autoremove +USER root ENV DEBIAN_FRONTEND=noninteractive ENV GO_VERSION=1.19.10 -ENV INSTALL_PREFIX=/usr -RUN apt install -y --no-install-recommends tzdata && \ - apt -y --no-install-recommends install \ - aspell \ - aspell-en \ - autoconf \ - automake \ - curl \ - git \ - libc6-dev \ - libczmq-dev \ - libmpich-dev \ - libncurses5-dev \ - libelf-dev \ - libssl-dev \ - libtool \ - libsodium-dev \ - libzmq3-dev \ - libjansson-dev \ - liblz4-dev \ - libhwloc-dev \ - libsqlite3-dev \ - lua5.1 \ - liblua5.1-dev \ - lua-posix \ - make \ - openssh-client \ - python3-dev \ - python3-cffi \ - python3-six \ - python3-yaml \ - python3-jsonschema \ - python3-sphinx \ - python3-pip \ - python3-setuptools \ - systemd \ - wget \ - uuid-dev && \ - apt -y clean && apt -y autoremove - -RUN echo 'alias python="/usr/bin/python3.8"' >> /root/.bashrc && \ - echo 'alias pip="/usr/bin/pip3"' >> /root/.bashrc && \ - . /root/.bashrc - -RUN echo 'set number' >> /root/.vimrc - -# Install cmake for new build system -RUN curl -s -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-$(uname -m).sh > cmake.sh ;\ - bash cmake.sh --prefix=/usr/local --skip-license ;\ - rm cmake.sh - -# Remove Python 2 -RUN apt purge -y python2.7-minimal - -# Python 3 should be linked to python -RUN ln -s /usr/bin/python3 /usr/bin/python -RUN apt install -y python3-pip \ - && apt -y --no-install-recommends install \ - libhwloc-dev \ - libboost-dev \ - libboost-system-dev \ - libboost-filesystem-dev \ - libboost-graph-dev \ - libboost-regex-dev \ - libxml2-dev \ - libyaml-cpp-dev \ - python3-yaml \ - libedit-dev \ - libarchive-dev \ - pkg-config && apt -y clean && apt -y autoremove - -RUN git clone https://github.com/flux-framework/flux-core.git /home/flux-core && \ - cd /home/flux-core/ && \ - ./autogen.sh && \ - PYTHON_VERSION=3 ./configure --prefix=${INSTALL_PREFIX} && \ - make && make install && \ - cd ../ && \ - rm -rf flux-core +RUN apt-get update && apt-get clean -y && apt -y autoremove # Install go 19.10 RUN wget https://go.dev/dl/go${GO_VERSION}.linux-amd64.tar.gz && tar -xvf go${GO_VERSION}.linux-amd64.tar.gz && \ mv go /usr/local && rm go${GO_VERSION}.linux-amd64.tar.gz -ENV GOROOT=/usr/local/go -ENV GOPATH=/go -ENV PATH="$GOROOT/bin:$PATH" -RUN mkdir -p /go/src +# ENV GOROOT=/usr/local/go +# ENV GOPATH=/go +ENV PATH=/usr/local/go/bin:$PATH RUN flux keygen +RUN git clone https://github.com/flux-framework/flux-sched.git /opt/flux-sched -ENV WITH_GO=yes -RUN git clone https://github.com/flux-framework/flux-sched.git /home/flux-sched && \ - cd /home/flux-sched/ && \ - # Ensure we pin to variant that has STATIC - will update when fix is in - git fetch && git checkout v0.31.0 && \ - # These need to be shared libraries - # https://github.com/flux-framework/flux-sched/pull/1094 - sed -i 's/add_library(resource STATIC/add_library(resource SHARED/g' resource/CMakeLists.txt && \ - sed -i 's/add_library ( reapi_module STATIC/add_library ( reapi_module SHARED/g' 
resource/reapi/bindings/CMakeLists.txt && \ - sed -i 's/add_library ( reapi_cli STATIC/add_library ( reapi_cli SHARED/g' resource/reapi/bindings/CMakeLists.txt && \ - sed -i 's/add_library ( jobspec_conv STATIC/add_library ( jobspec_conv SHARED/g' resource/libjobspec/CMakeLists.txt && \ - PYTHON_VERSION=3 ./configure --prefix=${INSTALL_PREFIX} && \ - make && make install - -RUN apt purge -y \ - python3-dev \ - python3-cffi \ - python3-six \ - python3-yaml \ - python3-jsonschema \ - python3-sphinx \ - python3-pip \ - python3-setuptools \ - && apt -y clean && apt -y autoremove - -ENV PATH=/usr/local/go/bin:$PATH +# Go dependencies for protobuf RUN apt -y update && apt -y upgrade && apt install --no-install-recommends -y protobuf-compiler curl && \ go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.26 && \ go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@v1.1 # These need to be on the LD_LIBRARY_PATH for the server to find at runtime -# This mimcs what we use to build server -ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/lib:/home/flux-sched/resource:/home/flux-sched/resource/libjobspec:/home/flux-sched/resource/reapi/bindings" -COPY fluence Makefile /go/src/fluence/ +ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/lib:/usr/lib/flux WORKDIR /go/src/fluence/ +COPY fluence Makefile /go/src/fluence/ -# This is the 0.31.0 tag of flux-sched (same as we install above) -RUN go get -u github.com/flux-framework/flux-sched/resource/reapi/bindings/go/src/fluxcli@250eac78a6753253fc8353a3504d7e843d1b6b24 && \ - go mod tidy && \ +RUN go mod tidy && \ go mod vendor && \ - make server FLUX_SCHED_ROOT=/home/flux-sched INSTALL_PREFIX=${INSTALL_PREFIX} && \ + make server FLUX_SCHED_ROOT=/opt/flux-sched && \ mkdir -p /home/data/jobspecs /home/data/jgf && \ chmod -R ugo+rwx /home/data \ No newline at end of file diff --git a/src/fluence/fluxion/fluxion.go b/src/fluence/fluxion/fluxion.go index f29ac62..5775199 100644 --- a/src/fluence/fluxion/fluxion.go +++ b/src/fluence/fluxion/fluxion.go @@ -7,7 +7,7 @@ import ( pb "github.com/flux-framework/flux-k8s/flux-plugin/fluence/fluxcli-grpc" "github.com/flux-framework/flux-k8s/flux-plugin/fluence/jobspec" "github.com/flux-framework/flux-k8s/flux-plugin/fluence/utils" - "github.com/flux-framework/flux-sched/resource/reapi/bindings/go/src/fluxcli" + "github.com/flux-framework/fluxion-go/pkg/fluxcli" "k8s.io/klog/v2" "context" diff --git a/src/fluence/go.mod b/src/fluence/go.mod index 5409a2a..5c57652 100644 --- a/src/fluence/go.mod +++ b/src/fluence/go.mod @@ -3,13 +3,14 @@ module github.com/flux-framework/flux-k8s/flux-plugin/fluence go 1.19 require ( - github.com/flux-framework/flux-sched/resource/reapi/bindings/go v0.0.0-20231213021445-250eac78a675 + github.com/flux-framework/fluxion-go v0.32.0 google.golang.org/grpc v1.38.0 google.golang.org/protobuf v1.26.0 gopkg.in/yaml.v2 v2.4.0 k8s.io/api v0.22.3 k8s.io/apimachinery v0.22.3 k8s.io/client-go v0.22.3 + k8s.io/klog/v2 v2.9.0 k8s.io/kubectl v0.0.0 ) @@ -34,7 +35,6 @@ require ( google.golang.org/genproto v0.0.0-20210602131652-f16073e35f0c // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b // indirect - k8s.io/klog/v2 v2.9.0 // indirect k8s.io/utils v0.0.0-20210819203725-bdf08cb9a70a // indirect sigs.k8s.io/structured-merge-diff/v4 v4.1.2 // indirect sigs.k8s.io/yaml v1.2.0 // indirect diff --git a/src/fluence/go.sum b/src/fluence/go.sum index 19e571c..5700215 100644 --- a/src/fluence/go.sum +++ b/src/fluence/go.sum @@ -98,8 +98,8 @@ 
github.com/exponent-io/jsonpath v0.0.0-20151013193312-d6023ce2651d/go.mod h1:ZZM github.com/fatih/camelcase v1.0.0/go.mod h1:yN2Sb0lFhZJUdVvtELVWefmrXpuZESvPmqwoZc+/fpc= github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= github.com/felixge/httpsnoop v1.0.1/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= -github.com/flux-framework/flux-sched/resource/reapi/bindings/go v0.0.0-20231213021445-250eac78a675 h1:FgEA3pnL/kDoLaVOUDa401yainApQJaow9jeBPg4dek= -github.com/flux-framework/flux-sched/resource/reapi/bindings/go v0.0.0-20231213021445-250eac78a675/go.mod h1:yhmzNyn45YhoxEohh1Sl3h3izLMqL7qpcvmYTRpv7eY= +github.com/flux-framework/fluxion-go v0.32.0 h1:NY6Y1mlTTTZhHD+CmAsDsdNTxUsAFDQoORpMZj8NFLI= +github.com/flux-framework/fluxion-go v0.32.0/go.mod h1:ZI3QxSvUfgJE2Snur/SntJmVfpMjr6D4ICVmdqJ9fkQ= github.com/form3tech-oss/jwt-go v3.2.2+incompatible/go.mod h1:pbq4aXjuKjdthFRnoDwaVPLA+WlJuPGy+QneDUgJi2k= github.com/form3tech-oss/jwt-go v3.2.3+incompatible/go.mod h1:pbq4aXjuKjdthFRnoDwaVPLA+WlJuPGy+QneDUgJi2k= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= From 1c0e5a32983d22f8f4081457022bd73a4fc351ff Mon Sep 17 00:00:00 2001 From: vsoch Date: Sat, 17 Feb 2024 16:06:40 -0700 Subject: [PATCH 11/28] ci: add support to build and deploy fluence-controller Signed-off-by: vsoch --- .github/test.sh | 2 ++ .github/workflows/build-deploy.yaml | 43 +++++++++++++++++++++++++-- .github/workflows/test.yaml | 45 ++++++++++++++++++++++------- 3 files changed, 77 insertions(+), 13 deletions(-) diff --git a/.github/test.sh b/.github/test.sh index 44314ad..2b8b1e6 100644 --- a/.github/test.sh +++ b/.github/test.sh @@ -18,6 +18,8 @@ cd upstream/manifests/install/charts helm install \ --set scheduler.image=ghcr.io/flux-framework/fluence:latest \ --set scheduler.sidecarimage=ghcr.io/flux-framework/fluence-sidecar:latest \ + --set controller.image=ghcr.io/flux-framework/fluence-controller:latest \ + --set controller.pullPolicy=Never \ --set scheduler.pullPolicy=Never \ --set scheduler.sidecarPullPolicy=Never \ schedscheduler-plugins as-a-second-scheduler/ diff --git a/.github/workflows/build-deploy.yaml b/.github/workflows/build-deploy.yaml index c993aa9..575d2db 100644 --- a/.github/workflows/build-deploy.yaml +++ b/.github/workflows/build-deploy.yaml @@ -18,7 +18,7 @@ jobs: name: build fluence steps: - uses: actions/checkout@v4 - - uses: actions/setup-go@v3 + - uses: actions/setup-go@v4 with: go-version: ^1.19 @@ -45,7 +45,44 @@ jobs: - name: Deploy Container if: (github.event_name != 'pull_request') run: docker push ${{ env.container }} --all-tags - + + build-controller: + permissions: + packages: write + env: + container: ghcr.io/flux-framework/fluence-controller + runs-on: ubuntu-latest + name: build fluence-controller + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-go@v4 + with: + go-version: ^1.19 + + - name: Build Containers + run: | + make prepare + make build REGISTRY=ghcr.io/flux-framework CONTROLLER_IMAGE=fluence-controller + + - name: Tag Release Image + if: (github.event_name == 'release') + run: | + tag=${GITHUB_REF#refs/tags/} + echo "Tagging and releasing ${{ env.container}}:${tag}" + docker tag ${{ env.container }}:latest ${{ env.container }}:${tag} + + - name: GHCR Login + if: (github.event_name != 'pull_request') + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Deploy Container + if: (github.event_name != 
'pull_request') + run: docker push ${{ env.container }} --all-tags + build-sidecar: permissions: packages: write @@ -55,7 +92,7 @@ jobs: name: build sidecar steps: - uses: actions/checkout@v4 - - uses: actions/setup-go@v3 + - uses: actions/setup-go@v4 with: go-version: ^1.19 diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 3e24a33..98e2de2 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -11,30 +11,42 @@ on: jobs: build-fluence: + + # The scheduler and controller are built together with the hack script + # in the upstream scheduler-plugins env: container: ghcr.io/flux-framework/fluence + controller: ghcr.io/flux-framework/fluence-controller runs-on: ubuntu-latest name: build fluence steps: - uses: actions/checkout@v4 - - uses: actions/setup-go@v3 + - uses: actions/setup-go@v4 with: go-version: ^1.19 - name: Build Containers run: | make prepare - make build REGISTRY=ghcr.io/flux-framework SCHEDULER_IMAGE=fluence + make build REGISTRY=ghcr.io/flux-framework SCHEDULER_IMAGE=fluence CONTROLLER_IMAGE=fluence-controller - - name: Save Container - run: docker save ${{ env.container }} | gzip > fluence_latest.tar.gz + - name: Save Containers + run: | + docker save ${{ env.container }} | gzip > fluence_latest.tar.gz + docker save ${{ env.controller }} | gzip > fluence_controller_latest.tar.gz - name: Upload container artifact uses: actions/upload-artifact@v4 with: name: fluence path: fluence_latest.tar.gz - + + - name: Upload container artifact + uses: actions/upload-artifact@v4 + with: + name: fluence_controller + path: fluence_controller_latest.tar.gz + build-sidecar: env: container: ghcr.io/flux-framework/fluence-sidecar @@ -42,7 +54,7 @@ jobs: name: build sidecar steps: - uses: actions/checkout@v4 - - uses: actions/setup-go@v3 + - uses: actions/setup-go@v4 with: go-version: ^1.19 @@ -59,7 +71,7 @@ jobs: with: name: fluence_sidecar path: fluence_sidecar_latest.tar.gz - + test-fluence: needs: [build-fluence, build-sidecar] permissions: @@ -67,14 +79,15 @@ jobs: env: fluence_container: ghcr.io/flux-framework/fluence sidecar_container: ghcr.io/flux-framework/fluence-sidecar + controller_container: ghcr.io/flux-framework/fluence-controller runs-on: ubuntu-latest - name: build fluence + name: test fluence steps: - uses: actions/checkout@v4 - - uses: actions/setup-go@v3 + - uses: actions/setup-go@v4 with: - go-version: ^1.20 + go-version: ^1.19 - name: Download fluence artifact uses: actions/download-artifact@v4 @@ -88,11 +101,18 @@ jobs: name: fluence_sidecar path: /tmp + - name: Download fluence_controller artifact + uses: actions/download-artifact@v4 + with: + name: fluence_controller + path: /tmp + - name: Load Docker images run: | ls /tmp/*.tar.gz docker load --input /tmp/fluence_sidecar_latest.tar.gz docker load --input /tmp/fluence_latest.tar.gz + docker load --input /tmp/fluence_controller_latest.tar.gz docker image ls -a | grep fluence - name: Create Kind Cluster @@ -106,10 +126,12 @@ jobs: env: fluence: ${{ env.fluence_container }} sidecar: ${{ env.sidecar_container }} + controller: ${{ env.controller_container }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | kind load docker-image ${fluence} kind load docker-image ${sidecar} + kind load docker-image ${controller} - name: Test Fluence run: /bin/bash ./.github/test.sh @@ -122,6 +144,8 @@ jobs: docker tag ${{ env.fluence_container }}:latest ${{ env.fluence_container }}:${tag} echo "Tagging and releasing ${{ env.sidecar_container}}:${tag}" docker tag ${{ env.sidecar_container }}:latest 
${{ env.sidecar_container }}:${tag}
+          echo "Tagging and releasing ${{ env.controller_container}}:${tag}"
+          docker tag ${{ env.controller_container }}:latest ${{ env.controller_container }}:${tag}
 
       # If we get here, tests pass, and we can deploy
       - name: GHCR Login
@@ -137,3 +161,4 @@
         run: |
           docker push ${{ env.fluence_container }} --all-tags
           docker push ${{ env.sidecar_container }} --all-tags
+          docker push ${{ env.controller_container }} --all-tags
\ No newline at end of file

From 8add1e0f4df2f8adb2febc5265a8f3a01ffb0787 Mon Sep 17 00:00:00 2001
From: vsoch
Date: Sat, 17 Feb 2024 13:44:36 -0700
Subject: [PATCH 12/28] feat: add start of webhook

Problem: we want to try a design where a mutating admission webhook
can handle receiving and creating PodGroup from labels. We are choosing
mutating with the expectation that, at some point, we might be able to
change the size (min/max/desired) either for the PodGroup or some other
watcher to jobs. Note that this is an empty skeleton - the webhook is
added and running but basically doing nothing. I am also fixing a bug
that I noticed while running kind, where fluence was assigning work to
the control plane. I think there may have been logic (a commented out
worker label) that was anticipating doing a check for a control plane,
but it looks like on production clusters we do not always have access
and it was never finished. Note that this addition does not guarantee
this design will work, but it is just one step. Since the helm charts
are manually generated for the scheduler-plugin (as far as I can tell)
this took me almost 6 hours to figure out and get working. I am really
starting to think there is no skill behind software engineering beyond
absolute patience.

Signed-off-by: vsoch
---
 .github/test-kind-config.yaml | 5 +
 .github/workflows/test.yaml | 15 ++
 Makefile | 8 +-
 README.md | 26 ++-
 .../simple_example/fluence-scheduler-pod.yaml | 2 +-
 .../scheduling/v1alpha1/podgroup_webhook.go | 190 ++++++++++++++++++
 .../cmd/controller/app/server.go | 101 ++++++++++
 .../crds/scheduling.x-k8s.io_podgroups.yaml | 108 ++++++++++
 .../templates/deployment.yaml | 13 ++
 .../mutating-webhook-configuration.yaml | 40 ++++
 .../templates/selfsigned-issuer.yaml | 10 +
 .../templates/serving-cert.yaml | 17 ++
 .../templates/webhook-service.yaml | 15 ++
 .../charts/as-a-second-scheduler/values.yaml | 10 +-
 src/fluence/utils/utils.go | 18 +-
 15 files changed, 565 insertions(+), 13 deletions(-)
 create mode 100644 .github/test-kind-config.yaml
 create mode 100644 sig-scheduler-plugins/apis/scheduling/v1alpha1/podgroup_webhook.go
 create mode 100644 sig-scheduler-plugins/cmd/controller/app/server.go
 create mode 100644 sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/crds/scheduling.x-k8s.io_podgroups.yaml
 create mode 100644 sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/mutating-webhook-configuration.yaml
 create mode 100644 sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/selfsigned-issuer.yaml
 create mode 100644 sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/serving-cert.yaml
 create mode 100644 sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/webhook-service.yaml

diff --git a/.github/test-kind-config.yaml b/.github/test-kind-config.yaml
new file mode 100644
index 0000000..0fe29e7
--- /dev/null
+++ b/.github/test-kind-config.yaml
@@ -0,0 +1,5 @@
+kind: Cluster
+apiVersion: kind.x-k8s.io/v1alpha4
+nodes:
+- role: control-plane
+- 
role: worker \ No newline at end of file diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 98e2de2..ed45891 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -107,12 +107,21 @@ jobs: name: fluence_controller path: /tmp + - name: Make Space For Build + run: | + sudo rm -rf /usr/share/dotnet + sudo rm -rf /usr/local/lib/android + sudo rm -rf /opt/ghc + - name: Load Docker images run: | ls /tmp/*.tar.gz docker load --input /tmp/fluence_sidecar_latest.tar.gz + rm /tmp/fluence_sidecar_latest.tar.gz docker load --input /tmp/fluence_latest.tar.gz + rm /tmp/fluence_latest.tar.gz docker load --input /tmp/fluence_controller_latest.tar.gz + rm /tmp/fluence_controller_latest.tar.gz docker image ls -a | grep fluence - name: Create Kind Cluster @@ -121,6 +130,7 @@ jobs: cluster_name: kind kubectl_version: v1.28.2 version: v0.20.0 + config: ./.github/test-kind-config.yaml - name: Load Docker Containers into Kind env: @@ -133,6 +143,11 @@ jobs: kind load docker-image ${sidecar} kind load docker-image ${controller} + - name: Install Cert Manager + run: | + kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.13.1/cert-manager.yaml + sleep 10 + - name: Test Fluence run: /bin/bash ./.github/test.sh diff --git a/Makefile b/Makefile index 97efa75..8976cb4 100644 --- a/Makefile +++ b/Makefile @@ -27,15 +27,19 @@ prepare: clone # These are entirely new directory structures rm -rf $(CLONE_UPSTREAM)/pkg/fluence rm -rf $(CLONE_UPSTREAM)/pkg/controllers/podgroup_controller.go - rm -rf $(CLONE_UPSTREAM)/manifests/fluence + rm -rf $(CLONE_UPSTREAM)/apis/scheduling/v1alpha1/podgroup_webhook.go + rm -rf $(CLONE_UPSTREAM)/cmd/controller/app/server.go cp -R sig-scheduler-plugins/pkg/fluence $(CLONE_UPSTREAM)/pkg/fluence cp -R sig-scheduler-plugins/pkg/controllers/* $(CLONE_UPSTREAM)/pkg/controllers/ # This is the one exception not from sig-scheduler-plugins because it is needed in both spots cp -R src/fluence/fluxcli-grpc $(CLONE_UPSTREAM)/pkg/fluence/fluxcli-grpc # These are files with subtle changes to add fluence cp sig-scheduler-plugins/cmd/scheduler/main.go ./upstream/cmd/scheduler/main.go - cp sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/deployment.yaml $(CLONE_UPSTREAM)/manifests/install/charts/as-a-second-scheduler/templates/deployment.yaml + cp sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/*.yaml $(CLONE_UPSTREAM)/manifests/install/charts/as-a-second-scheduler/templates/ + cp sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/crds/*.yaml $(CLONE_UPSTREAM)/manifests/install/charts/as-a-second-scheduler/crds/ cp sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/values.yaml $(CLONE_UPSTREAM)/manifests/install/charts/as-a-second-scheduler/values.yaml + cp sig-scheduler-plugins/apis/scheduling/v1alpha1/podgroup_webhook.go $(CLONE_UPSTREAM)/apis/scheduling/v1alpha1/podgroup_webhook.go + cp sig-scheduler-plugins/cmd/controller/app/server.go $(CLONE_UPSTREAM)/cmd/controller/app/server.go build: prepare REGISTRY=${REGISTRY} IMAGE=${SCHEDULER_IMAGE} CONTROLLER_IMAGE=${CONTROLLER_IMAGE} $(BASH) $(CLONE_UPSTREAM)/hack/build-images.sh diff --git a/README.md b/README.md index f0d67cd..3821ad8 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ Fluence enables HPC-grade pod scheduling in Kubernetes via the [Kubernetes Sched ## Getting started -For instructions on how to start Fluence on a K8s cluster, see [examples](examples/). 
Documentation and instructions for reproducing our CANOPIE2022 paper (citation below) can be found in the [canopie22-artifacts branch](https://github.com/flux-framework/flux-k8s/tree/canopie22-artifacts). +For instructions on how to start Fluence on a K8s cluster, see [examples](examples/). Documentation and instructions for reproducing our CANOPIE-2022 paper (citation below) can be found in the [canopie22-artifacts branch](https://github.com/flux-framework/flux-k8s/tree/canopie22-artifacts). For background on the Flux framework and the Fluxion scheduler, you can take a look at our award-winning R&D100 submission: https://ipo.llnl.gov/sites/default/files/2022-02/Flux_RD100_Final.pdf. For next steps: - To understand how it works, see [Design](#design) @@ -56,8 +56,13 @@ pods with different names cannot be part of the same group that needs to be sche ### Deploy We provide a set of pre-build containers [alongside the repository](https://github.com/orgs/flux-framework/packages?repo_name=flux-k8s) -that you can easily use to deploy Fluence right away! You'll simply need to clone the proper helm charts, and then install to your cluster. -We provide helper commands to do that. +that you can easily use to deploy Fluence right away! You'll first need to install the certificate manager: + +```bash +kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.13.1/cert-manager.yaml +``` + +And then clone the proper helm charts, and then install to your cluster. We provide helper commands to do that. ```bash # This clones the upstream scheduler plugins code, we will add fluence to it! @@ -131,7 +136,13 @@ docker push docker.io/vanessa/fluence-controller These steps will require a Kubernetes cluster to install to, and having pushed the plugin container to a registry. If you aren't using a cloud provider, you can create a local one with `kind`: ```bash -kind create cluster +kind create cluster --config ./examples/kind-config.yaml +``` + +And install the certificate manager: + +```bash +kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.13.1/cert-manager.yaml ``` **Important** if you are developing or testing fluence, note that custom scheduler plugins don't seem to work out of the box with MiniKube (but everything works with kind). Likely there are extensions or similar that need to be configured with MiniKube (that we have not looked into). @@ -456,7 +467,7 @@ Note that if you want to enable extra endpoints for the fluence kubectl plugin a helm install \ --set scheduler.image=ghcr.io/vsoch/fluence:latest \ --set scheduler.enableExternalService=true \ - --set controller.image=vanessa/fluence-controller \ + --set controller.image=ghcr.io/vsoch/fluence-controller \ --set scheduler.sidecarimage=ghcr.io/vsoch/fluence-sidecar:latest \ fluence as-a-second-scheduler/ ``` @@ -486,6 +497,11 @@ And to create: kind create cluster --config ./kind-config.yaml ``` +#### TODO + + - Try what [kueue does](https://github.com/kubernetes-sigs/kueue/blob/6d57813a52066dab412735deeeb60ebb0cdb8e8e/cmd/kueue/main.go#L146-L155) to not require cert-manager. + - Possible bug with using kind (with custom config we are scheduling things to the control plane) - need to verify this didn't start happening with mutating webhook addition. 
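Until the cert-manager item above is addressed, the webhook serving certificates come from cert-manager, so it has to be healthy before the chart is installed. One way to check (a suggested readiness check, not something the charts enforce) is:

```bash
# Wait for the cert-manager deployments to become Available before installing fluence
kubectl wait --for=condition=Available --timeout=120s deployment --all -n cert-manager
```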
+ #### Vanessa Thinking > Updated February 15, 2024 diff --git a/examples/simple_example/fluence-scheduler-pod.yaml b/examples/simple_example/fluence-scheduler-pod.yaml index a7cc126..71a8463 100644 --- a/examples/simple_example/fluence-scheduler-pod.yaml +++ b/examples/simple_example/fluence-scheduler-pod.yaml @@ -1,7 +1,7 @@ apiVersion: v1 kind: Pod metadata: - name: fluence-scheduled-pod + name: fluence-scheduled-pod-1 labels: name: scheduler-example spec: diff --git a/sig-scheduler-plugins/apis/scheduling/v1alpha1/podgroup_webhook.go b/sig-scheduler-plugins/apis/scheduling/v1alpha1/podgroup_webhook.go new file mode 100644 index 0000000..55c4d45 --- /dev/null +++ b/sig-scheduler-plugins/apis/scheduling/v1alpha1/podgroup_webhook.go @@ -0,0 +1,190 @@ +/* +Copyright 2023 Lawrence Livermore National Security, LLC + +(c.f. AUTHORS, NOTICE.LLNS, COPYING) +SPDX-License-Identifier: MIT +*/ + +// This file is not used, but maintained as the original addition of an OrasCache webhook + +package v1alpha1 + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + runtime "k8s.io/apimachinery/pkg/runtime" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/controller-runtime/pkg/webhook/admission" +) + +var ( + logger = ctrl.Log.WithName("setup") +) + +// IMPORTANT: if you use the controller-runtime builder, it will derive this name automatically from the gvk (kind, version, etc. so find the actual created path in the logs) +// kubectl describe mutatingwebhookconfigurations.admissionregistration.k8s.io +// It will also only allow you to describe one object type with For() +// This is disabled so we manually manage it - multiple types to a list did not work: config/webhook/manifests.yaml +////kubebuilder:webhook:path=/mutate-v1-sidecar,mutating=true,failurePolicy=fail,sideEffects=None,groups=core;batch,resources=pods;jobs,verbs=create,versions=v1,name=morascache.kb.io,admissionReviewVersions=v1 + +// NewMutatingWebhook allows us to keep the sidecarInjector private +// If it's public it's exported and kubebuilder tries to add to zz_generated_deepcopy +// and you get all kinds of terrible errors about admission.Decoder missing DeepCopyInto +func NewMutatingWebhook(mgr manager.Manager) *fluenceWatcher { + return &fluenceWatcher{decoder: admission.NewDecoder(mgr.GetScheme())} +} + +// mutate-v1-fluence + +type fluenceWatcher struct { + decoder *admission.Decoder +} + +func (a *fluenceWatcher) Handle(ctx context.Context, req admission.Request) admission.Response { + + logger.Info("Running webhook handle") + // First try for job + job := &batchv1.Job{} + err := a.decoder.Decode(req, job) + if err != nil { + + // Try for a pod next + pod := &corev1.Pod{} + err := a.decoder.Decode(req, pod) + if err != nil { + logger.Error(err, "Admission error.") + return admission.Errored(http.StatusBadRequest, err) + } + + // If we get here, we decoded a pod + /*err = a.InjectPod(pod) + if err != nil { + logger.Error("Inject pod error.", err) + return admission.Errored(http.StatusBadRequest, err) + }*/ + + // Mutate the fields in pod + marshalledPod, err := json.Marshal(pod) + if err != nil { + logger.Error(err, "Marshalling pod error.") + return admission.Errored(http.StatusInternalServerError, err) + } + logger.Info("Admission pod success.") + return admission.PatchResponseFromRaw(req.Object.Raw, marshalledPod) + } + /* + // If we get here, we found a job + err = a.InjectJob(job) + + if err != nil { + 
logger.Error("Inject job error.", err) + return admission.Errored(http.StatusBadRequest, err) + }*/ + + marshalledJob, err := json.Marshal(job) + + if err != nil { + logger.Error(err, "Marshalling job error.") + return admission.Errored(http.StatusInternalServerError, err) + } + + logger.Info("Admission job success.") + return admission.PatchResponseFromRaw(req.Object.Raw, marshalledJob) +} + +// Default is the expected entrypoint for a webhook +func (a *fluenceWatcher) Default(ctx context.Context, obj runtime.Object) error { + pod, ok := obj.(*corev1.Pod) + if !ok { + job, ok := obj.(*batchv1.Job) + if !ok { + return fmt.Errorf("expected a Pod or Job but got a %T", obj) + } + logger.Info(fmt.Sprintf("Job %s is marked for fluence.", job.Name)) + return nil + // return a.InjectJob(job) + } + logger.Info(fmt.Sprintf("Pod %s is marked for fluence.", pod.Name)) + return nil + //return a.InjectPod(pod) +} + +// InjectPod adds the sidecar container to a pod +func (a *fluenceWatcher) InjectPod(pod *corev1.Pod) error { + + /* + // Cut out early if we have no labels + if pod.Annotations == nil { + logger.Info(fmt.Sprintf("Pod %s is not marked for oras storage.", pod.Name)) + return nil + } + + // Parse oras known labels into settings + settings := orasSettings.NewOrasCacheSettings(pod.Annotations) + + // Cut out early if no oras identifiers! + if !settings.MarkedForOras { + logger.Warnf("Pod %s is not marked for oras storage.", pod.Name) + return nil + } + + // Validate, return error if no good here. + if !settings.Validate() { + logger.Warnf("Pod %s oras storage did not validate.", pod.Name) + return fmt.Errorf("oras storage was requested but is not valid") + } + + // The selector for the namespaced registry is the namespace + if pod.Labels == nil { + pod.Labels = map[string]string{} + } + + // Even pods without say, the launcher, that are marked should have the network added + pod.Labels[defaults.OrasSelectorKey] = pod.ObjectMeta.Namespace + oras.AddSidecar(&pod.Spec, pod.ObjectMeta.Namespace, settings) + logger.Info(fmt.Sprintf("Pod %s is marked for oras storage.", pod.Name))*/ + return nil +} + +// InjectJob adds the sidecar container to the PodTemplateSpec of the Job +func (a *fluenceWatcher) InjectJob(job *batchv1.Job) error { + + /* + // Cut out early if we have no labels + if job.Annotations == nil { + logger.Info(fmt.Sprintf("Job %s is not marked for oras storage.", job.Name)) + return nil + } + + // Parse oras known labels into settings + settings := orasSettings.NewOrasCacheSettings(job.Annotations) + + // Cut out early if no oras identifiers! + if !settings.MarkedForOras { + logger.Warnf("Job %s is not marked for oras storage.", job.Name) + return nil + } + + // Validate, return error if no good here. 
+ if !settings.Validate() { + logger.Warnf("Job %s oras storage did not validate.", job.Name) + return fmt.Errorf("oras storage was requested but is not valid") + } + + // Add the sidecar to the podspec of the job + if job.Spec.Template.Labels == nil { + job.Spec.Template.Labels = map[string]string{} + } + + // Add network to spec template so all pods are targeted + job.Spec.Template.Labels[defaults.OrasSelectorKey] = job.ObjectMeta.Namespace + oras.AddSidecar(&job.Spec.Template.Spec, job.ObjectMeta.Namespace, settings) + logger.Info(fmt.Sprintf("Job %s is marked for oras storage.", job.Name))*/ + return nil +} diff --git a/sig-scheduler-plugins/cmd/controller/app/server.go b/sig-scheduler-plugins/cmd/controller/app/server.go new file mode 100644 index 0000000..5927bec --- /dev/null +++ b/sig-scheduler-plugins/cmd/controller/app/server.go @@ -0,0 +1,101 @@ +/* +Copyright 2020 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package app + +import ( + "k8s.io/apimachinery/pkg/runtime" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "k8s.io/klog/v2/klogr" + + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/healthz" + + "sigs.k8s.io/controller-runtime/pkg/webhook" + api "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1" + "sigs.k8s.io/scheduler-plugins/pkg/controllers" +) + +var ( + scheme = runtime.NewScheme() + setupLog = ctrl.Log.WithName("setup") +) + +func init() { + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + + utilruntime.Must(api.AddToScheme(scheme)) +} + +func Run(s *ServerRunOptions) error { + config := ctrl.GetConfigOrDie() + config.QPS = float32(s.ApiServerQPS) + config.Burst = s.ApiServerBurst + + // Controller Runtime Controllers + ctrl.SetLogger(klogr.New()) + mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ + Scheme: scheme, + MetricsBindAddress: s.MetricsAddr, + Port: 9443, + HealthProbeBindAddress: s.ProbeAddr, + LeaderElection: s.EnableLeaderElection, + LeaderElectionID: "sched-plugins-controllers", + LeaderElectionNamespace: "kube-system", + }) + if err != nil { + setupLog.Error(err, "unable to start manager") + return err + } + + if err = (&controllers.PodGroupReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Workers: s.Workers, + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "PodGroup") + return err + } + + mgr.GetWebhookServer().Register("/mutate-v1-fluence", &webhook.Admission{ + Handler: api.NewMutatingWebhook(mgr), + }) + + if err = (&controllers.ElasticQuotaReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Workers: s.Workers, + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "ElasticQuota") + return err + } + + if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { + setupLog.Error(err, "unable to set up health check") + return err + } + if err := mgr.AddReadyzCheck("readyz", 
healthz.Ping); err != nil { + setupLog.Error(err, "unable to set up ready check") + return err + } + + if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { + setupLog.Error(err, "unable to start manager") + return err + } + return nil +} diff --git a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/crds/scheduling.x-k8s.io_podgroups.yaml b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/crds/scheduling.x-k8s.io_podgroups.yaml new file mode 100644 index 0000000..d633b7d --- /dev/null +++ b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/crds/scheduling.x-k8s.io_podgroups.yaml @@ -0,0 +1,108 @@ +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + api-approved.kubernetes.io: https://github.com/kubernetes-sigs/scheduler-plugins/pull/50 + controller-gen.kubebuilder.io/version: v0.11.1 + # TODO this needs if .Vaues.enableCertManager added back + cert-manager.io/inject-ca-from: '{{ .Release.Namespace }}/{{ include "scheduler-plugins-as-a-second-scheduler.fullname" . }}-serving-cert' + creationTimestamp: null + name: podgroups.scheduling.x-k8s.io +spec: + conversion: + strategy: Webhook + webhook: + clientConfig: + service: + name: webhook-service + namespace: system + path: /convert + conversionReviewVersions: + - v1 + group: scheduling.x-k8s.io + names: + kind: PodGroup + listKind: PodGroupList + plural: podgroups + shortNames: + - pg + - pgs + singular: podgroup + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: PodGroup is a collection of Pod; used for batch workload. + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation + of an object. Servers should convert recognized schemas to the latest + internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this + object represents. Servers may infer this from the endpoint the client + submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + metadata: + type: object + spec: + description: Specification of the desired behavior of the pod group. + properties: + minMember: + description: MinMember defines the minimal number of members/tasks + to run the pod group; if there's not enough resources to start all + tasks, the scheduler will not start anyone. + format: int32 + type: integer + minResources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: MinResources defines the minimal resource of members/tasks + to run the pod group; if there's not enough resources to start all + tasks, the scheduler will not start anyone. + type: object + scheduleTimeoutSeconds: + description: ScheduleTimeoutSeconds defines the maximal time of members/tasks + to wait before run the pod group; + format: int32 + type: integer + type: object + status: + description: Status represents the current information about a pod group. + This data may not be up to date. + properties: + failed: + description: The number of pods which reached phase Failed. 
+ format: int32 + type: integer + occupiedBy: + description: OccupiedBy marks the workload (e.g., deployment, statefulset) + UID that occupy the podgroup. It is empty if not initialized. + type: string + phase: + description: Current phase of PodGroup. + type: string + running: + description: The number of actively running pods. + format: int32 + type: integer + scheduleStartTime: + description: ScheduleStartTime of the group + format: date-time + type: string + succeeded: + description: The number of pods which reached phase Succeeded. + format: int32 + type: integer + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/deployment.yaml b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/deployment.yaml index 83ecccc..289a0e5 100644 --- a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/deployment.yaml +++ b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/deployment.yaml @@ -20,6 +20,19 @@ spec: - name: scheduler-plugins-controller image: {{ .Values.controller.image }} imagePullPolicy: {{ .Values.controller.pullPolicy }} + ports: + - containerPort: 9443 + name: webhook-server + protocol: TCP + volumeMounts: + - mountPath: /tmp/k8s-webhook-server/serving-certs + name: cert + readOnly: true + volumes: + - name: cert + secret: + defaultMode: 420 + secretName: webhook-server-cert --- apiVersion: apps/v1 kind: Deployment diff --git a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/mutating-webhook-configuration.yaml b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/mutating-webhook-configuration.yaml new file mode 100644 index 0000000..d6e7330 --- /dev/null +++ b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/mutating-webhook-configuration.yaml @@ -0,0 +1,40 @@ +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: {{ include "scheduler-plugins-as-a-second-scheduler.name" . }}-mutating-webhook-configuration + {{- if .Values.enableCertManager }} + annotations: + cert-manager.io/inject-ca-from: {{ .Release.Namespace }}/{{ include "scheduler-plugins-as-a-second-scheduler.fullname" . }}-serving-cert + {{- end}} + labels: + {{- include "scheduler-plugins-as-a-second-scheduler.labels" . | nindent 4 }} +webhooks: +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: '{{ include "scheduler-plugins-as-a-second-scheduler.fullname" . 
}}-webhook-service' + namespace: '{{ .Release.Namespace }}' + path: /mutate-v1-fluence + {{- with (index .Values.webhookService.ports 0) }} + port: {{ .port }} + {{- end }} + + failurePolicy: Fail + name: morascache.kb.io + rules: + - apiGroups: + - "" + - core + - batch + - scheduling.x-k8s.io + apiVersions: + - v1 + - v1alpha1 + operations: + - CREATE + resources: + - pods + - jobs + - podgroups + sideEffects: None diff --git a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/selfsigned-issuer.yaml b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/selfsigned-issuer.yaml new file mode 100644 index 0000000..aa4d0a1 --- /dev/null +++ b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/selfsigned-issuer.yaml @@ -0,0 +1,10 @@ +{{- if .Values.enableCertManager }} +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: {{ include "scheduler-plugins-as-a-second-scheduler.fullname" . }}-selfsigned-issuer + labels: + {{- include "scheduler-plugins-as-a-second-scheduler.labels" . | nindent 4 }} +spec: + selfSigned: {} +{{- end}} diff --git a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/serving-cert.yaml b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/serving-cert.yaml new file mode 100644 index 0000000..0edefe2 --- /dev/null +++ b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/serving-cert.yaml @@ -0,0 +1,17 @@ +{{- if .Values.enableCertManager }} +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: {{ include "scheduler-plugins-as-a-second-scheduler.fullname" . }}-serving-cert + labels: + {{- include "scheduler-plugins-as-a-second-scheduler.labels" . | nindent 4 }} +spec: + dnsNames: + - '{{ include "scheduler-plugins-as-a-second-scheduler.fullname" . }}-webhook-service.{{ .Release.Namespace }}.svc' + - '{{ include "scheduler-plugins-as-a-second-scheduler.fullname" . }}-webhook-service.{{ .Release.Namespace }}.svc.{{ + .Values.kubernetesClusterDomain }}' + issuerRef: + kind: Issuer + name: '{{ include "scheduler-plugins-as-a-second-scheduler.fullname" . }}-selfsigned-issuer' + secretName: webhook-server-cert +{{- end}} diff --git a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/webhook-service.yaml b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/webhook-service.yaml new file mode 100644 index 0000000..bedfb95 --- /dev/null +++ b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/webhook-service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "scheduler-plugins-as-a-second-scheduler.fullname" . }}-webhook-service + labels: + app.kubernetes.io/component: webhook + app.kubernetes.io/created-by: scheduler-plugins-controller + app.kubernetes.io/part-of: scheduler-plugins-controller + {{- include "scheduler-plugins-as-a-second-scheduler.labels" . 
| nindent 4 }} +spec: + type: {{ .Values.webhookService.type }} + selector: + app: scheduler-plugins-controller + ports: + {{- .Values.webhookService.ports | toYaml | nindent 2 -}} diff --git a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/values.yaml b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/values.yaml index be1e797..a5a7870 100644 --- a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/values.yaml +++ b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/values.yaml @@ -21,7 +21,6 @@ scheduler: controller: name: scheduler-plugins-controller - image: registry.k8s.io/scheduler-plugins/controller:v0.27.8 image: ghcr.io/flux-framework/fluence-controller:latest replicaCount: 1 pullPolicy: IfNotPresent @@ -45,3 +44,12 @@ pluginConfig: # args: # scoringStrategy: # type: MostAllocated # default is LeastAllocated + +enableCertManager: true +kubernetesClusterDomain: cluster.local +webhookService: + ports: + - port: 9443 + protocol: TCP + targetPort: 9443 + type: ClusterIP \ No newline at end of file diff --git a/src/fluence/utils/utils.go b/src/fluence/utils/utils.go index f30eeda..f81f81c 100644 --- a/src/fluence/utils/utils.go +++ b/src/fluence/utils/utils.go @@ -16,6 +16,10 @@ import ( resourcehelper "k8s.io/kubectl/pkg/util/resource" ) +var ( + controlPlaneLabel = "node-role.kubernetes.io/control-plane" +) + // CreateJGF creates the Json Graph Format func CreateJGF(filename string, skipLabel *string) error { ctx := context.Background() @@ -55,12 +59,18 @@ func CreateJGF(filename string, skipLabel *string) error { for node_index, node := range nodes.Items { - // Question from V: what was this for (what is a worker)? - // _, worker := node.Labels["node-role.kubernetes.io/worker"] + // We should not be scheduling to the control plane + _, ok := node.Labels[controlPlaneLabel] + if ok { + fmt.Println("Skipping control plane node ", node.GetName()) + continue + } + // Anything labeled with "skipLabel" meaning it is present, + // should be skipped if *skipLabel != "" { - _, fluxnode := node.Labels[*skipLabel] - if !fluxnode { + _, ok := node.Labels[*skipLabel] + if ok { fmt.Println("Skipping node ", node.GetName()) continue } From 10d624d4e25ee7c26365ecbf0f283de267d8b109 Mon Sep 17 00:00:00 2001 From: vsoch Date: Sat, 17 Feb 2024 19:57:09 -0700 Subject: [PATCH 13/28] webhook: adding support for adding pod group labels Problem: we need every pod object coming into the cluster to be part of a group. Solution: This change adds logic to the mutating webhook to add the labels that indicate the group name and size. We can eventually add flexibility here. I also realize that we can easily watch for job objects first, and add the group size/name to the pod template. This will be much more efficient to then not have to add to the individual pods that are part of a larger job. With this approach I was able to create a fluence scheduled pod, and then see my labels added! It does not do anything beyond that. I am also adding a nice script that makes it easy to build, load, and install fluence freshly, otherwise you will use all your internet data for the month in like, two days. 
Do not do that :P Signed-off-by: vsoch --- README.md | 81 +++------ hack/quick-build.sh | 36 ++++ .../scheduling/v1alpha1/podgroup_webhook.go | 167 +++++++++--------- .../charts/as-a-second-scheduler/values.yaml | 2 +- .../pkg/controllers/podgroup_controller.go | 29 ++- .../pkg/fluence/group/group.go | 10 +- .../pkg/fluence/labels/labels.go | 8 + 7 files changed, 187 insertions(+), 146 deletions(-) create mode 100755 hack/quick-build.sh create mode 100644 sig-scheduler-plugins/pkg/fluence/labels/labels.go diff --git a/README.md b/README.md index 3821ad8..4f33cd9 100644 --- a/README.md +++ b/README.md @@ -21,18 +21,26 @@ For background on the Flux framework and the Fluxion scheduler, you can take a l Fluence is a custom scheduler plugin that you can specify to use with two directive in your pod spec - - Asking for `fluence` as the scheduler name -- Defining a named group of pods with the `fluence.flux-framework.org/pod-group` label. -- Defining the group size with the `fluence.flux-framework.org/group-size` label. +- On either a job or a single or group of pods: + - Defining a named group of pods with the `fluence.flux-framework.org/pod-group` label. + - Defining the group size with the `fluence.flux-framework.org/group-size` label. -If you are using Fluence, these values are required. -An example is shown below for an indexed job, which will create multiple pods. +The way it works: + +1. We have a mutating admission webhook that looks for jobs and pods, and ensures there are fluence labels. +2. A PodGroup reconciler is watching for these same objects. When they are created (this is not done yet): + a. We find the labels and create the pod group object. + b. The pod group object has a timestamp for creation. +3. When the pod is then given to fluence for scheduling, it already has the PodGroup created with name/size and can properly sort. + +Another strategy I'm considering (if the above runs into trouble) is to watch a [channel](https://book-v1.book.kubebuilder.io/beyond_basics/controller_watches). An example is shown below for an indexed job, which will create multiple pods. ```yaml apiVersion: batch/v1 kind: Job metadata: name: fluence-job - annotations: + labels: fluence.flux-framework.org/pod-group: my-pods fluence.flux-framework.org/group-size: 10 spec: @@ -225,17 +233,6 @@ helm install \ fluence as-a-second-scheduler/ ``` -If you load your images into your testing environment and don't need to pull, you can change the pull policy too: - -```bash -helm install \ - --set scheduler.image=vanessa/fluence:latest \ - --set scheduler.sidecarimage=vanessa/fluence-sidecar \ - --set controller.image=vanessa/fluence-controller \ - --set scheduler.sidecarPullPolicy=IfNotPresent \ - fluence as-a-second-scheduler/ -``` - If you need to uninstall (e.g., to redo something): ```bash @@ -433,31 +430,27 @@ make proto #### Workflow -The easiest thing to do is to build the containers in some container namespace that you control (meaning you can push to a registry), e.g.,: +You should first do these on your own: -```bash -make build REGISTRY=ghcr.io/vsoch -``` +1. Create the kind cluster (`kubectl apply -f ./examples/kind-cluster.yaml`) +2. Install the certificate manager. 
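For reference, assuming the kind config shipped in this repository and the cert-manager version used elsewhere in this README, those two steps look something like:

```bash
# Create a multi-node kind cluster (fluence does not schedule to the control plane)
kind create cluster --config ./examples/kind-config.yaml

# Install cert-manager, which provides the webhook serving certificates
kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.13.1/cert-manager.yaml
```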
-If needed, create a "multi node" kind cluster: +I was having trouble developing this easily because it's a lot of steps to build and load containers and change directories and uninstall/install the charts, so I put together a small script that does the following: -```bash -kind create cluster --config ./examples/kind-config.yaml -``` +1. Takes a registry of interest (probably doesn't matter since we are working locally, defaults to `ghcr.io/vsoch` +2. builds all three images, the controller, sidecar, and fluence +3. loads them all into kind +4. changes directory to the charts +5. uninstalls the fluence helm instance (if installed) +6. installs it, targeted the images just built, and setting pullPolicy to never -And then install with your custom images: +The last step ensures we use the images we loaded! You can basically just do: ```bash -cd ./upstream/manifests/install/charts -helm install \ - --set scheduler.image=ghcr.io/vsoch/fluence:latest \ - --set controller.image=ghcr.io/vsoch/fluence-controller:latest \ - --set scheduler.sidecarimage=ghcr.io/vsoch/fluence-sidecar:latest \ - fluence as-a-second-scheduler/ +./hack/quick-build.sh ``` -And then apply what you need to test, and look at logs! -And then keep doing that until you get what you want :) Note that I haven't found a good way for the VSCode developer tools to work because we develop fluence outside of the tree it's supposed to be in. +This sped up my development time immensely. If you want to manually do the steps, see that script for instructions. ##### kubectl plugin @@ -472,26 +465,7 @@ helm install \ fluence as-a-second-scheduler/ ``` -For this setup if you are developing locally with kind, you will need to enable the ingress. Here is `kind-config.yaml` - -```yaml -kind: Cluster -apiVersion: kind.x-k8s.io/v1alpha4 -nodes: -- role: control-plane - kubeadmConfigPatches: - - | - kind: InitConfiguration - nodeRegistration: - kubeletExtraArgs: - node-labels: "ingress-ready=true" - extraPortMappings: - - containerPort: 4242 - hostPort: 4242 - protocol: TCP -``` - -And to create: +For this setup if you are developing locally with kind, you will need to enable the ingress, as is done in [examples/kind-config.yaml](examples/kind-config.yaml). ```bash kind create cluster --config ./kind-config.yaml @@ -500,7 +474,6 @@ kind create cluster --config ./kind-config.yaml #### TODO - Try what [kueue does](https://github.com/kubernetes-sigs/kueue/blob/6d57813a52066dab412735deeeb60ebb0cdb8e8e/cmd/kueue/main.go#L146-L155) to not require cert-manager. - - Possible bug with using kind (with custom config we are scheduling things to the control plane) - need to verify this didn't start happening with mutating webhook addition. #### Vanessa Thinking diff --git a/hack/quick-build.sh b/hack/quick-build.sh new file mode 100755 index 0000000..23a5c87 --- /dev/null +++ b/hack/quick-build.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# Before running this, you should: +# 1. create the kind cluster (needs more than one node, fluence does not scheduler to the control plane) +# 2. Install cert-manager +# 3. Customize the script to point to your registry if you intend to push + +REGISTRY="${1:-ghcr.io/vsoch}" +HERE=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +ROOT=$(dirname ${HERE}) + +# Go to the script directory +cd ${ROOT} + +# These build each of the images. 
The sidecar is separate from the other two in src/ +make REGISTRY=${REGISTRY} SCHEDULER_IMAGE=fluence SIDECAR_IMAGE=fluence-sidecar CONTROLLER_IMAGE=fluence-controller + +# This is what it might look like to push +# docker push ghcr.io/vsoch/fluence-sidecar && docker push ghcr.io/vsoch/fluence-controller && docker push ghcr.io/vsoch/fluence:latest + +# We load into kind so we don't need to push/pull and use up internet data ;) +kind load docker-image ${REGISTRY}/fluence-sidecar:latest +kind load docker-image ${REGISTRY}/fluence-controller:latest +kind load docker-image ${REGISTRY}/fluence:latest + +# And then install using the charts. The pull policy ensures we use the loaded ones +cd ${ROOT}/upstream/manifests/install/charts +helm uninstall fluence || true +helm install \ + --set scheduler.image=${REGISTRY}/fluence:latest \ + --set scheduler.sidecarPullPolicy=Never \ + --set scheduler.pullPolicy=Never \ + --set controller.pullPolicy=Never \ + --set controller.image=${REGISTRY}/fluence-controller:latest \ + --set scheduler.sidecarimage=${REGISTRY}/fluence-sidecar:latest \ + fluence as-a-second-scheduler/ diff --git a/sig-scheduler-plugins/apis/scheduling/v1alpha1/podgroup_webhook.go b/sig-scheduler-plugins/apis/scheduling/v1alpha1/podgroup_webhook.go index 55c4d45..13d327c 100644 --- a/sig-scheduler-plugins/apis/scheduling/v1alpha1/podgroup_webhook.go +++ b/sig-scheduler-plugins/apis/scheduling/v1alpha1/podgroup_webhook.go @@ -21,6 +21,7 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/webhook/admission" + "sigs.k8s.io/scheduler-plugins/pkg/fluence/labels" ) var ( @@ -41,20 +42,23 @@ func NewMutatingWebhook(mgr manager.Manager) *fluenceWatcher { } // mutate-v1-fluence - type fluenceWatcher struct { decoder *admission.Decoder } +// Handle is the main handler for the webhook, which is looking for jobs and pods (in that order) +// If a job comes in (with a pod template) first, we add the labels there first (and they will +// not be added again). func (a *fluenceWatcher) Handle(ctx context.Context, req admission.Request) admission.Response { logger.Info("Running webhook handle") - // First try for job + + // Try for a job first, which would be created before pods job := &batchv1.Job{} err := a.decoder.Decode(req, job) if err != nil { - // Try for a pod next + // Assume we operate on the level of pods for now pod := &corev1.Pod{} err := a.decoder.Decode(req, pod) if err != nil { @@ -63,32 +67,33 @@ func (a *fluenceWatcher) Handle(ctx context.Context, req admission.Request) admi } // If we get here, we decoded a pod - /*err = a.InjectPod(pod) + err = a.EnsureGroup(pod) if err != nil { - logger.Error("Inject pod error.", err) + logger.Error(err, "Issue adding PodGroup to pod.") return admission.Errored(http.StatusBadRequest, err) - }*/ + } + + logger.Info("Admission pod success.") - // Mutate the fields in pod marshalledPod, err := json.Marshal(pod) if err != nil { logger.Error(err, "Marshalling pod error.") return admission.Errored(http.StatusInternalServerError, err) } - logger.Info("Admission pod success.") + + logger.Info("Admission job success.") return admission.PatchResponseFromRaw(req.Object.Raw, marshalledPod) } - /* - // If we get here, we found a job - err = a.InjectJob(job) - if err != nil { - logger.Error("Inject job error.", err) - return admission.Errored(http.StatusBadRequest, err) - }*/ + // If we get here, err was nil and we have a Job! 
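+	// EnsureGroupOnJob writes the group name and size labels onto the job's pod
+	// template, so pods created later by the job controller already carry them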
+ err = a.EnsureGroupOnJob(job) + if err != nil { + logger.Error(err, "Issue adding PodGroup to job.") + return admission.Errored(http.StatusBadRequest, err) + } + logger.Info("Admission job success.") marshalledJob, err := json.Marshal(job) - if err != nil { logger.Error(err, "Marshalling job error.") return admission.Errored(http.StatusInternalServerError, err) @@ -98,93 +103,89 @@ func (a *fluenceWatcher) Handle(ctx context.Context, req admission.Request) admi return admission.PatchResponseFromRaw(req.Object.Raw, marshalledJob) } -// Default is the expected entrypoint for a webhook +// Default is the expected entrypoint for a webhook... +// I don't remember if this is even called... func (a *fluenceWatcher) Default(ctx context.Context, obj runtime.Object) error { pod, ok := obj.(*corev1.Pod) if !ok { - job, ok := obj.(*batchv1.Job) - if !ok { - return fmt.Errorf("expected a Pod or Job but got a %T", obj) - } - logger.Info(fmt.Sprintf("Job %s is marked for fluence.", job.Name)) - return nil - // return a.InjectJob(job) + return fmt.Errorf("expected a Pod or Job but got a %T", obj) } logger.Info(fmt.Sprintf("Pod %s is marked for fluence.", pod.Name)) - return nil - //return a.InjectPod(pod) + return a.EnsureGroup(pod) } -// InjectPod adds the sidecar container to a pod -func (a *fluenceWatcher) InjectPod(pod *corev1.Pod) error { +// EnsureGroup adds pod group label and size if not present +// This ensures that every pod passing through is part of a group. +// Note that we need to do similar for Job. +// A pod without a job wrapper, and without metadata is a group +// of size 1. +func (a *fluenceWatcher) EnsureGroup(pod *corev1.Pod) error { - /* - // Cut out early if we have no labels - if pod.Annotations == nil { - logger.Info(fmt.Sprintf("Pod %s is not marked for oras storage.", pod.Name)) - return nil - } - - // Parse oras known labels into settings - settings := orasSettings.NewOrasCacheSettings(pod.Annotations) - - // Cut out early if no oras identifiers! - if !settings.MarkedForOras { - logger.Warnf("Pod %s is not marked for oras storage.", pod.Name) - return nil - } + // Add labels if we don't have anything. Everything is a group! + if pod.Labels == nil { + pod.Labels = map[string]string{} + } - // Validate, return error if no good here. - if !settings.Validate() { - logger.Warnf("Pod %s oras storage did not validate.", pod.Name) - return fmt.Errorf("oras storage was requested but is not valid") - } + // Do we have a group name? + groupName, ok := pod.Labels[labels.PodGroupNameLabel] - // The selector for the namespaced registry is the namespace - if pod.Labels == nil { - pod.Labels = map[string]string{} - } + // If we don't have a fluence group, create one under fluence namespace + if !ok { + groupName = fmt.Sprintf("fluence-group-%s-%s", pod.Namespace, pod.Name) + pod.Labels[labels.PodGroupNameLabel] = groupName + } - // Even pods without say, the launcher, that are marked should have the network added - pod.Labels[defaults.OrasSelectorKey] = pod.ObjectMeta.Namespace - oras.AddSidecar(&pod.Spec, pod.ObjectMeta.Namespace, settings) - logger.Info(fmt.Sprintf("Pod %s is marked for oras storage.", pod.Name))*/ + // Do we have a group size? 
This will be parsed as a string, likely + groupSize, ok := pod.Labels[labels.PodGroupSizeLabel] + if !ok { + groupSize = "1" + pod.Labels[labels.PodGroupSizeLabel] = groupSize + } return nil } -// InjectJob adds the sidecar container to the PodTemplateSpec of the Job -func (a *fluenceWatcher) InjectJob(job *batchv1.Job) error { +// getJobLabel takes a label name and default and returns the value +// We look on both the job and underlying pod spec template +func getJobLabel(job *batchv1.Job, labelName, defaultLabel string) string { - /* - // Cut out early if we have no labels - if job.Annotations == nil { - logger.Info(fmt.Sprintf("Job %s is not marked for oras storage.", job.Name)) - return nil + value, ok := job.Labels[labelName] + if !ok { + value, ok = job.Spec.Template.ObjectMeta.Labels[labelName] + if !ok { + value = defaultLabel } + } + return value +} - // Parse oras known labels into settings - settings := orasSettings.NewOrasCacheSettings(job.Annotations) +// EnsureGroupOnJob looks for fluence labels (size and name) on both the job +// and the pod template. We ultimately put on the pod, the lowest level unit. +// Since we have the size of the job (paramllism) we can use that for the size +func (a *fluenceWatcher) EnsureGroupOnJob(job *batchv1.Job) error { - // Cut out early if no oras identifiers! - if !settings.MarkedForOras { - logger.Warnf("Job %s is not marked for oras storage.", job.Name) - return nil - } + // Be forgiving - allow the person to specify it on the job directly or on the Podtemplate + // We will ultimately put the metadata on the Pod. + if job.Spec.Template.ObjectMeta.Labels == nil { + job.Spec.Template.ObjectMeta.Labels = map[string]string{} + } + if job.Labels == nil { + job.Labels = map[string]string{} + } - // Validate, return error if no good here. 
- if !settings.Validate() { - logger.Warnf("Job %s oras storage did not validate.", job.Name) - return fmt.Errorf("oras storage was requested but is not valid") - } + /// First get the name for the pod group (also setting on the pod template) + defaultName := fmt.Sprintf("fluence-group-%s-%s", job.Namespace, job.Name) + groupName := getJobLabel(job, labels.PodGroupNameLabel, defaultName) - // Add the sidecar to the podspec of the job - if job.Spec.Template.Labels == nil { - job.Spec.Template.Labels = map[string]string{} - } + // Wherever we find it, make sure the pod group name is on the pod spec template + job.Spec.Template.ObjectMeta.Labels[labels.PodGroupNameLabel] = groupName - // Add network to spec template so all pods are targeted - job.Spec.Template.Labels[defaults.OrasSelectorKey] = job.ObjectMeta.Namespace - oras.AddSidecar(&job.Spec.Template.Spec, job.ObjectMeta.Namespace, settings) - logger.Info(fmt.Sprintf("Job %s is marked for oras storage.", job.Name))*/ + // Now do the same for the size, but the size is the size of the job + jobSize := *job.Spec.Parallelism + if jobSize == int32(0) { + jobSize = int32(1) + } + labelSize := fmt.Sprintf("%d", jobSize) + groupSize := getJobLabel(job, labels.PodGroupSizeLabel, labelSize) + job.Spec.Template.ObjectMeta.Labels[labels.PodGroupSizeLabel] = groupSize return nil } diff --git a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/values.yaml b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/values.yaml index a5a7870..e48aa98 100644 --- a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/values.yaml +++ b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/values.yaml @@ -23,7 +23,7 @@ controller: name: scheduler-plugins-controller image: ghcr.io/flux-framework/fluence-controller:latest replicaCount: 1 - pullPolicy: IfNotPresent + pullPolicy: Always # LoadVariationRiskBalancing and TargetLoadPacking are not enabled by default # as they need extra RBAC privileges on metrics.k8s.io. diff --git a/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go b/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go index 02eb4e4..fc8e8d4 100644 --- a/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go +++ b/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go @@ -199,11 +199,12 @@ func (r *PodGroupReconciler) SetupWithManager(mgr ctrl.Manager) error { // podToPodGroup is a watcher that looks for pods and associated pod group func (r *PodGroupReconciler) podToPodGroup(ctx context.Context, obj client.Object) []ctrl.Request { + r.log.Info("PANCAKES pre get pod in podToPodGroup flux-framework/fluence-controller") pod, ok := obj.(*v1.Pod) if !ok { return nil } - r.log.Info("podToPodGroup flux-framework/fluence-controller") + r.log.Info("PANCAKES post get pod in podToPodGroup flux-framework/fluence-controller") r.log.V(5).Info("Running podToPodGroup", "pod", pod.Name, "namespace", pod.Namespace) pgName := util.GetPodGroupLabel(pod) if len(pgName) == 0 { @@ -212,6 +213,32 @@ func (r *PodGroupReconciler) podToPodGroup(ctx context.Context, obj client.Objec r.log.V(5).Info("Add pod group when pod gets added", "podGroup", pgName, "pod", pod.Name, "namespace", pod.Namespace) + // TODO we need an ability to trigger a create here. Likely we will just add + // the create function to watches. I'm wondering if we want to set the owner + // to the pod or the job that triggers? 
+ // newPodGroup ensures we have a pod group + /*func newPodGroup(name, namespace string, size int32, pod *v1.Pod) { + + // Create an owner reference to the pod + // https://github.com/kubernetes/apimachinery/blob/master/pkg/apis/meta/v1/types.go#L295 + ownerRef := metav1.OwnerReferences{ + Kind: pod.ObjectMeta.Kind, + Name: pod.Name, + APIVersion: pod.ObjectMeta.APIVersion, + UID: pod.ObjectMeta.UID, + } + pg := PodGroup{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + OwnerReferences: []metav1.OwnerReferences{ownerRef}, + }, + Spec: PodGroupSpec{ + MinMember: size, + }, + } + }*/ + return []ctrl.Request{{ NamespacedName: types.NamespacedName{ Namespace: pod.Namespace, diff --git a/sig-scheduler-plugins/pkg/fluence/group/group.go b/sig-scheduler-plugins/pkg/fluence/group/group.go index b681504..291ad17 100644 --- a/sig-scheduler-plugins/pkg/fluence/group/group.go +++ b/sig-scheduler-plugins/pkg/fluence/group/group.go @@ -10,11 +10,7 @@ import ( "k8s.io/kubernetes/pkg/scheduler/framework" fcore "sigs.k8s.io/scheduler-plugins/pkg/fluence/core" -) - -const ( - PodGroupNameLabel = "fluence.pod-group" - PodGroupSizeLabel = "fluence.group-size" + "sigs.k8s.io/scheduler-plugins/pkg/fluence/labels" ) // getDefaultGroupName returns a group name based on the pod namespace and name @@ -71,13 +67,13 @@ func DeleteFluenceGroup(pod *v1.Pod) { // getFluenceGroupName looks for the group to indicate a fluence group, and returns it func getFluenceGroupName(pod *v1.Pod) string { - groupName, _ := pod.Labels[PodGroupNameLabel] + groupName, _ := pod.Labels[labels.PodGroupNameLabel] return groupName } // getFluenceGroupSize gets the size of the fluence group func getFluenceGroupSize(pod *v1.Pod) int32 { - size, _ := pod.Labels[PodGroupSizeLabel] + size, _ := pod.Labels[labels.PodGroupSizeLabel] // Default size of 1 if the label is not set (but name is) if size == "" { diff --git a/sig-scheduler-plugins/pkg/fluence/labels/labels.go b/sig-scheduler-plugins/pkg/fluence/labels/labels.go new file mode 100644 index 0000000..e409ddc --- /dev/null +++ b/sig-scheduler-plugins/pkg/fluence/labels/labels.go @@ -0,0 +1,8 @@ +package labels + +// Labels to be shared between different components + +const ( + PodGroupNameLabel = "fluence.pod-group" + PodGroupSizeLabel = "fluence.group-size" +) From 000baac47dd77e36f9cbee455b7509bfa5dfcb02 Mon Sep 17 00:00:00 2001 From: vsoch Date: Sun, 18 Feb 2024 11:40:52 -0700 Subject: [PATCH 14/28] pod-group: labels for name and size now lead to creation Problem: we want the labels (size and name) that are explicitly set to lead to the creation of the pod group so the user does not need to. This is done by way of a watcher on pod, which will trigger after the webhook that ensures that every pod (in a job or single pod) has the proper label. Likely we want to do this for other abstractions that hold pods as well, because it ensures that no matter how the pods go into pending, we have the correct size and name. The only case that a pod can come in without the label means that it was not scheduled by fluence. The user is directed to not do this, but it is not impossible (e.g., fluence sees itself show up here actually). So after this addition we have the full steps to add the labels and create the pod group, and next steps are (finally) to integrate this into fluence (and remove the old abstraction to store it in memory). 
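
For illustration, a pod that reaches the webhook with the labels
scheduling.x-k8s.io/pod-group=my-pods and fluence.group-size=4 should result
in a PodGroup roughly like the sketch below (the name, namespace, and size are
arbitrary examples, and the group version is assumed to be the upstream
scheduling.x-k8s.io/v1alpha1). The reconciler also stamps a creation MicroTime
into status.scheduleStartTime so fluence can later sort groups by age.

    apiVersion: scheduling.x-k8s.io/v1alpha1
    kind: PodGroup
    metadata:
      name: my-pods
      namespace: default
    spec:
      minMember: 4
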
Signed-off-by: vsoch --- Makefile | 3 +- examples/pod-group/lammps/lammps2.yaml | 3 - .../scheduling/v1alpha1/podgroup_webhook.go | 38 ++- .../apis/scheduling/v1alpha1/types.go | 188 ++++++++++++++ .../cmd/controller/app/server.go | 5 + .../mutating-webhook-configuration.yaml | 3 +- .../pkg/controllers/podgroup_controller.go | 229 +++++++++++++----- .../pkg/fluence/group/group.go | 2 +- .../pkg/fluence/labels/labels.go | 6 +- 9 files changed, 399 insertions(+), 78 deletions(-) create mode 100644 sig-scheduler-plugins/apis/scheduling/v1alpha1/types.go diff --git a/Makefile b/Makefile index 8976cb4..6ab44fe 100644 --- a/Makefile +++ b/Makefile @@ -27,7 +27,6 @@ prepare: clone # These are entirely new directory structures rm -rf $(CLONE_UPSTREAM)/pkg/fluence rm -rf $(CLONE_UPSTREAM)/pkg/controllers/podgroup_controller.go - rm -rf $(CLONE_UPSTREAM)/apis/scheduling/v1alpha1/podgroup_webhook.go rm -rf $(CLONE_UPSTREAM)/cmd/controller/app/server.go cp -R sig-scheduler-plugins/pkg/fluence $(CLONE_UPSTREAM)/pkg/fluence cp -R sig-scheduler-plugins/pkg/controllers/* $(CLONE_UPSTREAM)/pkg/controllers/ @@ -38,7 +37,7 @@ prepare: clone cp sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/*.yaml $(CLONE_UPSTREAM)/manifests/install/charts/as-a-second-scheduler/templates/ cp sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/crds/*.yaml $(CLONE_UPSTREAM)/manifests/install/charts/as-a-second-scheduler/crds/ cp sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/values.yaml $(CLONE_UPSTREAM)/manifests/install/charts/as-a-second-scheduler/values.yaml - cp sig-scheduler-plugins/apis/scheduling/v1alpha1/podgroup_webhook.go $(CLONE_UPSTREAM)/apis/scheduling/v1alpha1/podgroup_webhook.go + cp sig-scheduler-plugins/apis/scheduling/v1alpha1/*.go $(CLONE_UPSTREAM)/apis/scheduling/v1alpha1/ cp sig-scheduler-plugins/cmd/controller/app/server.go $(CLONE_UPSTREAM)/cmd/controller/app/server.go build: prepare diff --git a/examples/pod-group/lammps/lammps2.yaml b/examples/pod-group/lammps/lammps2.yaml index acdd2d5..5cc7535 100644 --- a/examples/pod-group/lammps/lammps2.yaml +++ b/examples/pod-group/lammps/lammps2.yaml @@ -8,9 +8,6 @@ spec: headlessName: l2 pod: schedulerName: fluence - labels: - fluence.pod-group: lammps2 - fluence.group-size: "2" containers: - image: ghcr.io/converged-computing/metric-lammps:latest@sha256:e24a1ba8954f5a0a7a0bd854cfc5ca7f82ca12607dc6ace38d838591b8deb8ed workingDir: /opt/lammps/examples/reaxff/HNS diff --git a/sig-scheduler-plugins/apis/scheduling/v1alpha1/podgroup_webhook.go b/sig-scheduler-plugins/apis/scheduling/v1alpha1/podgroup_webhook.go index 13d327c..bc99fe4 100644 --- a/sig-scheduler-plugins/apis/scheduling/v1alpha1/podgroup_webhook.go +++ b/sig-scheduler-plugins/apis/scheduling/v1alpha1/podgroup_webhook.go @@ -61,9 +61,12 @@ func (a *fluenceWatcher) Handle(ctx context.Context, req admission.Request) admi // Assume we operate on the level of pods for now pod := &corev1.Pod{} err := a.decoder.Decode(req, pod) + + // Assume it's a pod group or something else. + // We aren't in charge of validating people's pods. 
+ // I don't think we should ever hit this case, actually if err != nil { - logger.Error(err, "Admission error.") - return admission.Errored(http.StatusBadRequest, err) + return admission.Allowed("Found non-pod, non-job, this webhook does not validate beyond those.") } // If we get here, we decoded a pod @@ -73,6 +76,8 @@ func (a *fluenceWatcher) Handle(ctx context.Context, req admission.Request) admi return admission.Errored(http.StatusBadRequest, err) } + // Send the updated pod to the events channel + //*a.events <- event.GenericEvent{Object: pod} logger.Info("Admission pod success.") marshalledPod, err := json.Marshal(pod) @@ -92,7 +97,10 @@ func (a *fluenceWatcher) Handle(ctx context.Context, req admission.Request) admi return admission.Errored(http.StatusBadRequest, err) } + // Send the updated job to the events channel + //*a.events <- event.GenericEvent{Object: job} logger.Info("Admission job success.") + marshalledJob, err := json.Marshal(job) if err != nil { logger.Error(err, "Marshalling job error.") @@ -106,12 +114,20 @@ func (a *fluenceWatcher) Handle(ctx context.Context, req admission.Request) admi // Default is the expected entrypoint for a webhook... // I don't remember if this is even called... func (a *fluenceWatcher) Default(ctx context.Context, obj runtime.Object) error { - pod, ok := obj.(*corev1.Pod) + job, ok := obj.(*batchv1.Job) if !ok { - return fmt.Errorf("expected a Pod or Job but got a %T", obj) + pod, ok := obj.(*corev1.Pod) + + // This is adkin to an admission success - it's not a pod or job, so we don't care + // I don't think we should ever hit this case, actually + if !ok { + return nil + } + logger.Info(fmt.Sprintf("Pod %s is marked for fluence.", pod.Name)) + return a.EnsureGroup(pod) } - logger.Info(fmt.Sprintf("Pod %s is marked for fluence.", pod.Name)) - return a.EnsureGroup(pod) + logger.Info(fmt.Sprintf("Job %s is marked for fluence.", job.Name)) + return a.EnsureGroupOnJob(job) } // EnsureGroup adds pod group label and size if not present @@ -127,12 +143,12 @@ func (a *fluenceWatcher) EnsureGroup(pod *corev1.Pod) error { } // Do we have a group name? - groupName, ok := pod.Labels[labels.PodGroupNameLabel] + groupName, ok := pod.Labels[labels.PodGroupLabel] // If we don't have a fluence group, create one under fluence namespace if !ok { - groupName = fmt.Sprintf("fluence-group-%s-%s", pod.Namespace, pod.Name) - pod.Labels[labels.PodGroupNameLabel] = groupName + groupName = fmt.Sprintf("fluence-group-%s", pod.Name) + pod.Labels[labels.PodGroupLabel] = groupName } // Do we have a group size? 
This will be parsed as a string, likely @@ -174,10 +190,10 @@ func (a *fluenceWatcher) EnsureGroupOnJob(job *batchv1.Job) error { /// First get the name for the pod group (also setting on the pod template) defaultName := fmt.Sprintf("fluence-group-%s-%s", job.Namespace, job.Name) - groupName := getJobLabel(job, labels.PodGroupNameLabel, defaultName) + groupName := getJobLabel(job, labels.PodGroupLabel, defaultName) // Wherever we find it, make sure the pod group name is on the pod spec template - job.Spec.Template.ObjectMeta.Labels[labels.PodGroupNameLabel] = groupName + job.Spec.Template.ObjectMeta.Labels[labels.PodGroupLabel] = groupName // Now do the same for the size, but the size is the size of the job jobSize := *job.Spec.Parallelism diff --git a/sig-scheduler-plugins/apis/scheduling/v1alpha1/types.go b/sig-scheduler-plugins/apis/scheduling/v1alpha1/types.go new file mode 100644 index 0000000..77f10f3 --- /dev/null +++ b/sig-scheduler-plugins/apis/scheduling/v1alpha1/types.go @@ -0,0 +1,188 @@ +/* +Copyright 2020 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha1 + +import ( + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/scheduler-plugins/apis/scheduling" +) + +// ElasticQuota sets elastic quota restrictions per namespace +// +genclient +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object +// +kubebuilder:object:root=true +// +kubebuilder:resource:shortName={eq,eqs} +// +kubebuilder:subresource:status +// +kubebuilder:metadata:annotations="api-approved.kubernetes.io=https://github.com/kubernetes-sigs/scheduler-plugins/pull/52" +type ElasticQuota struct { + metav1.TypeMeta `json:",inline"` + + // Standard object's metadata. + // +optional + metav1.ObjectMeta `json:"metadata,omitempty" protobuf:"bytes,1,opt,name=metadata"` + + // ElasticQuotaSpec defines the Min and Max for Quota. + // +optional + Spec ElasticQuotaSpec `json:"spec,omitempty" protobuf:"bytes,2,opt,name=spec"` + + // ElasticQuotaStatus defines the observed use. + // +optional + Status ElasticQuotaStatus `json:"status,omitempty" protobuf:"bytes,3,opt,name=status"` +} + +// ElasticQuotaSpec defines the Min and Max for Quota. +type ElasticQuotaSpec struct { + // Min is the set of desired guaranteed limits for each named resource. + // +optional + Min v1.ResourceList `json:"min,omitempty" protobuf:"bytes,1,rep,name=min, casttype=ResourceList,castkey=ResourceName"` + + // Max is the set of desired max limits for each named resource. The usage of max is based on the resource configurations of + // successfully scheduled pods. + // +optional + Max v1.ResourceList `json:"max,omitempty" protobuf:"bytes,2,rep,name=max, casttype=ResourceList,castkey=ResourceName"` +} + +// ElasticQuotaStatus defines the observed use. +type ElasticQuotaStatus struct { + // Used is the current observed total usage of the resource in the namespace. 
+ // +optional + Used v1.ResourceList `json:"used,omitempty" protobuf:"bytes,1,rep,name=used,casttype=ResourceList,castkey=ResourceName"` +} + +// +kubebuilder:object:root=true +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object + +// ElasticQuotaList is a list of ElasticQuota items. +type ElasticQuotaList struct { + metav1.TypeMeta `json:",inline"` + + // Standard list metadata. + // +optional + metav1.ListMeta `json:"metadata,omitempty" protobuf:"bytes,1,opt,name=metadata"` + + // Items is a list of ElasticQuota objects. + Items []ElasticQuota `json:"items" protobuf:"bytes,2,rep,name=items"` +} + +// PodGroupPhase is the phase of a pod group at the current time. +type PodGroupPhase string + +// These are the valid phase of podGroups. +const ( + // PodGroupPending means the pod group has been accepted by the system, but scheduler can not allocate + // enough resources to it. + PodGroupPending PodGroupPhase = "Pending" + + // PodGroupRunning means the `spec.minMember` pods of the pod group are in running phase. + PodGroupRunning PodGroupPhase = "Running" + + // PodGroupScheduling means the number of pods scheduled is bigger than `spec.minMember` + // but the number of running pods has not reached the `spec.minMember` pods of PodGroups. + PodGroupScheduling PodGroupPhase = "Scheduling" + + // PodGroupUnknown means a part of `spec.minMember` pods of the pod group have been scheduled but the others can not + // be scheduled due to, e.g. not enough resource; scheduler will wait for related controllers to recover them. + PodGroupUnknown PodGroupPhase = "Unknown" + + // PodGroupFinished means the `spec.minMember` pods of the pod group are successfully finished. + PodGroupFinished PodGroupPhase = "Finished" + + // PodGroupFailed means at least one of `spec.minMember` pods have failed. + PodGroupFailed PodGroupPhase = "Failed" + + // PodGroupLabel is the default label of coscheduling + PodGroupLabel = scheduling.GroupName + "/pod-group" +) + +// PodGroup is a collection of Pod; used for batch workload. +// +genclient +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object +// +kubebuilder:object:root=true +// +kubebuilder:resource:shortName={pg,pgs} +// +kubebuilder:subresource:status +// +kubebuilder:metadata:annotations="api-approved.kubernetes.io=https://github.com/kubernetes-sigs/scheduler-plugins/pull/50" +type PodGroup struct { + metav1.TypeMeta `json:",inline"` + // Standard object's metadata. + // +optional + metav1.ObjectMeta `json:"metadata,omitempty"` + + // Specification of the desired behavior of the pod group. + // +optional + Spec PodGroupSpec `json:"spec,omitempty"` + + // Status represents the current information about a pod group. + // This data may not be up to date. + // +optional + Status PodGroupStatus `json:"status,omitempty"` +} + +// PodGroupSpec represents the template of a pod group. +type PodGroupSpec struct { + // MinMember defines the minimal number of members/tasks to run the pod group; + // if there's not enough resources to start all tasks, the scheduler + // will not start anyone. + MinMember int32 `json:"minMember,omitempty"` + + // MinResources defines the minimal resource of members/tasks to run the pod group; + // if there's not enough resources to start all tasks, the scheduler + // will not start anyone. 
+ MinResources v1.ResourceList `json:"minResources,omitempty"` + + // ScheduleTimeoutSeconds defines the maximal time of members/tasks to wait before run the pod group; + ScheduleTimeoutSeconds *int32 `json:"scheduleTimeoutSeconds,omitempty"` +} + +// PodGroupStatus represents the current state of a pod group. +type PodGroupStatus struct { + // Current phase of PodGroup. + Phase PodGroupPhase `json:"phase,omitempty"` + + // OccupiedBy marks the workload (e.g., deployment, statefulset) UID that occupy the podgroup. + // It is empty if not initialized. + OccupiedBy string `json:"occupiedBy,omitempty"` + + // The number of actively running pods. + // +optional + Running int32 `json:"running,omitempty"` + + // The number of pods which reached phase Succeeded. + // +optional + Succeeded int32 `json:"succeeded,omitempty"` + + // The number of pods which reached phase Failed. + // +optional + Failed int32 `json:"failed,omitempty"` + + // ScheduleStartTime of the group (note that we changed this to a micro time) + // +optional + ScheduleStartTime metav1.MicroTime `json:"scheduleStartTime,omitempty"` +} + +// +kubebuilder:object:root=true + +// PodGroupList is a collection of pod groups. +type PodGroupList struct { + metav1.TypeMeta `json:",inline"` + // Standard list metadata + // +optional + metav1.ListMeta `json:"metadata,omitempty"` + + // Items is the list of PodGroup + Items []PodGroup `json:"items"` +} diff --git a/sig-scheduler-plugins/cmd/controller/app/server.go b/sig-scheduler-plugins/cmd/controller/app/server.go index 5927bec..d42c0f4 100644 --- a/sig-scheduler-plugins/cmd/controller/app/server.go +++ b/sig-scheduler-plugins/cmd/controller/app/server.go @@ -26,6 +26,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/healthz" "sigs.k8s.io/controller-runtime/pkg/webhook" + api "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1" "sigs.k8s.io/scheduler-plugins/pkg/controllers" ) @@ -62,6 +63,10 @@ func Run(s *ServerRunOptions) error { return err } + // Create a channel for the mutating webhook to communicate back to the reconciler + // This way we create the PodGroup before scheduling + //c := make(chan event.GenericEvent) + if err = (&controllers.PodGroupReconciler{ Client: mgr.GetClient(), Scheme: mgr.GetScheme(), diff --git a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/mutating-webhook-configuration.yaml b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/mutating-webhook-configuration.yaml index d6e7330..c639127 100644 --- a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/mutating-webhook-configuration.yaml +++ b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/mutating-webhook-configuration.yaml @@ -36,5 +36,6 @@ webhooks: resources: - pods - jobs - - podgroups +# Can uncomment this if we want to mutate the pod groups after creation +# - podgroups sideEffects: None diff --git a/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go b/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go index fc8e8d4..72bda77 100644 --- a/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go +++ b/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go @@ -20,23 +20,25 @@ import ( "context" "fmt" "sort" + "strconv" "strings" "time" "github.com/go-logr/logr" v1 "k8s.io/api/core/v1" apierrs "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime" 
"k8s.io/apimachinery/pkg/types" "k8s.io/client-go/tools/record" - ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/log" schedv1alpha1 "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1" + fluenceLabels "sigs.k8s.io/scheduler-plugins/pkg/fluence/labels" "sigs.k8s.io/scheduler-plugins/pkg/util" ) @@ -65,96 +67,197 @@ type PodGroupReconciler struct { // - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.11.0/pkg/reconcile func (r *PodGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { log := log.FromContext(ctx) - log.Info("reconciling flux-framework/fluence-controller") + log.Info("reconciling flux-framework/fluence-controller for request") pg := &schedv1alpha1.PodGroup{} + if err := r.Get(ctx, req.NamespacedName, pg); err != nil { + + // Case 1: if we get here and it's not found, assume not created if apierrs.IsNotFound(err) { - log.V(5).Info("Pod group has been deleted") + log.Info("PodGroup", "Status", fmt.Sprintf("Pod group %s is not found, deleted.", req.NamespacedName)) return ctrl.Result{}, nil } - log.V(3).Error(err, "Unable to retrieve pod group") + log.Error(err, fmt.Sprintf("Unable to retrieve pod group %s", req.NamespacedName)) return ctrl.Result{}, err } - if pg.Status.Phase == schedv1alpha1.PodGroupFinished || - pg.Status.Phase == schedv1alpha1.PodGroupFailed { + // Grab all statuses (and groups of them) we are interested in + schedulingOrPending := (pg.Status.Phase == schedv1alpha1.PodGroupScheduling || pg.Status.Phase == schedv1alpha1.PodGroupPending) + twoDaysOld := pg.Status.ScheduleStartTime.Sub(pg.CreationTimestamp.Time) > 48*time.Hour + finishedOrFailed := pg.Status.Phase == schedv1alpha1.PodGroupFinished || pg.Status.Phase == schedv1alpha1.PodGroupFailed + + // Finished or failed - clean up the group + if finishedOrFailed { + log.Info("PodGroup", "Status", fmt.Sprintf("Pod group %s is finished or failed.", req.NamespacedName)) return ctrl.Result{}, nil } + // If startScheduleTime - createTime > 2days, // do not reconcile again because pod may have been GCed - if (pg.Status.Phase == schedv1alpha1.PodGroupScheduling || pg.Status.Phase == schedv1alpha1.PodGroupPending) && pg.Status.Running == 0 && - pg.Status.ScheduleStartTime.Sub(pg.CreationTimestamp.Time) > 48*time.Hour { - r.recorder.Event(pg, v1.EventTypeWarning, - "Timeout", "schedule time longer than 48 hours") + if schedulingOrPending && pg.Status.Running == 0 && twoDaysOld { + r.recorder.Event(pg, v1.EventTypeWarning, "Timeout", "schedule time longer than 48 hours") return ctrl.Result{}, nil } + // We can get the podList and check for sizes here podList := &v1.PodList{} - if err := r.List(ctx, podList, - client.MatchingLabelsSelector{ - Selector: labels.Set(map[string]string{ - schedv1alpha1.PodGroupLabel: pg.Name}).AsSelector(), - }); err != nil { + + // Select based on the group name + groupNameSelector := labels.Set(map[string]string{schedv1alpha1.PodGroupLabel: pg.Name}).AsSelector() + err := r.List(ctx, podList, client.MatchingLabelsSelector{Selector: groupNameSelector}) + if err != nil { log.Error(err, "List pods for group failed") return ctrl.Result{}, err } - pods := podList.Items + // Inspect the size, set on the group if not done yet + size := len(podList.Items) + log.Info("PodGroup", "Name", pg.Name, "Size", size) + + // When first created, size should be unset (MinMember) + if int(pg.Spec.MinMember) 
== 0 { + log.Info("PodGroup", "Status", fmt.Sprintf("Pod group %s updating size to %d", pg.Name, size)) + return r.updatePodGroupSize(ctx, pg, int32(size)) + + } else if int(pg.Spec.MinMember) != size { + // TODO: Not clear what to do here. Arguably, we also want to check the label size + // because (in the future) we can accept smaller sizes. But then we also need + // to account for if the labels are different, do we take the smallest? + log.Info("PodGroup", "Status", fmt.Sprintf("WARNING: Pod group current MinMember %s does not match %d", pg.Spec.MinMember, size)) + } + + // If we get here, we have a PodGroup with a set size and can inspect / update phase + pods := podList.Items pgCopy := pg.DeepCopy() + switch pgCopy.Status.Phase { case "": pgCopy.Status.Phase = schedv1alpha1.PodGroupPending case schedv1alpha1.PodGroupPending: if len(pods) >= int(pg.Spec.MinMember) { pgCopy.Status.Phase = schedv1alpha1.PodGroupScheduling - fillOccupiedObj(pgCopy, &pods[0]) + + // Always update owner references to be the first pod + // E.g., if a job owns it, ensure the group is deleted with it + updateOwnerReferences(pgCopy, &pods[0]) } default: + + // Get updated counts of running, succeeded, and failed pods pgCopy.Status.Running, pgCopy.Status.Succeeded, pgCopy.Status.Failed = getCurrentPodStats(pods) + + // If for some reason we weren't pending and now have fewer than min required, flip back to pending if len(pods) < int(pg.Spec.MinMember) { pgCopy.Status.Phase = schedv1alpha1.PodGroupPending break } + // A pod with succeeded + running STILL less than the minimum required is scheduling if pgCopy.Status.Succeeded+pgCopy.Status.Running < pg.Spec.MinMember { pgCopy.Status.Phase = schedv1alpha1.PodGroupScheduling } + // A pod with succeeded + running >= the minimum required is running! if pgCopy.Status.Succeeded+pgCopy.Status.Running >= pg.Spec.MinMember { pgCopy.Status.Phase = schedv1alpha1.PodGroupRunning } - // Final state of pod group + + // We have non zero failed, and the total of failed, running amd succeeded > min member + // Final state of pod group is FAILED womp womp if pgCopy.Status.Failed != 0 && pgCopy.Status.Failed+pgCopy.Status.Running+pgCopy.Status.Succeeded >= pg.Spec.MinMember { pgCopy.Status.Phase = schedv1alpha1.PodGroupFailed } + + // Finished! This is where we want to get :) + // TODO: ideally the owning higher level object deletion will delete here, + // but that won't always work for one of pods - need a new strategy if pgCopy.Status.Succeeded >= pg.Spec.MinMember { pgCopy.Status.Phase = schedv1alpha1.PodGroupFinished } } + // TODO need better handling here of cleanup, etc. This mostly handles status changes return r.patchPodGroup(ctx, pg, pgCopy) } +// newPodGroup creates a new podGroup object, capturing the creation time +// This should be followed by a request to reconsile it +func (r *PodGroupReconciler) newPodGroup( + ctx context.Context, + name, namespace string, + groupSize int32, +) (*schedv1alpha1.PodGroup, error) { + + pg := &schedv1alpha1.PodGroup{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + }, + // Note that we don't know the size yet + // The most important thing here is the MicroTime! + Spec: schedv1alpha1.PodGroupSpec{ + MinMember: groupSize, + }, + Status: schedv1alpha1.PodGroupStatus{ + ScheduleStartTime: metav1.NewMicroTime(time.Now()), + }, + } + // TODO need to set a controller reference? 
+ // ctrl.SetControllerReference(cluster, job, r.Scheme) + err := r.Create(ctx, pg) + if err != nil { + r.log.Error(err, "Failed to create new PodGroup", "Namespace:", pg.Namespace, "Name:", pg.Name) + return pg, err + } + // Successful - return and requeue + return pg, nil + +} + +// patchPodGroup is a halper function to run a patch and then return the correct result / error for the reconciler func (r *PodGroupReconciler) patchPodGroup(ctx context.Context, old, new *schedv1alpha1.PodGroup) (ctrl.Result, error) { patch := client.MergeFrom(old) if err := r.Status().Patch(ctx, new, patch); err != nil { + r.log.Error(err, "Issue patching PodGroup", "Namespace:", old.Namespace, "Name:", old.Name) return ctrl.Result{}, err } err := r.Patch(ctx, new, patch) + if err != nil { + r.log.Error(err, "Issue patching PodGroup", "Namespace:", old.Namespace, "Name:", old.Name) + } return ctrl.Result{}, err } +// updatePodGroup does an update with reconcile instead of a patch request +func (r *PodGroupReconciler) updatePodGroupSize( + ctx context.Context, + old *schedv1alpha1.PodGroup, + size int32, +) (ctrl.Result, error) { + + patch := client.MergeFrom(old.DeepCopy()) + old.Spec.MinMember = size + + // Apply the patch to update the size + r.Status().Update(ctx, old) + err := r.Patch(ctx, old, patch) + return ctrl.Result{Requeue: true}, err +} + +// getCurrentPodStats gets the number of running, succeeded, and failed +// We use these to populate the PodGroup func getCurrentPodStats(pods []v1.Pod) (int32, int32, int32) { if len(pods) == 0 { return 0, 0, 0 } - var ( running int32 = 0 succeeded int32 = 0 failed int32 = 0 ) + + // Loop and count things. for _, pod := range pods { switch pod.Status.Phase { case v1.PodRunning: @@ -168,7 +271,11 @@ func getCurrentPodStats(pods []v1.Pod) (int32, int32, int32) { return running, succeeded, failed } -func fillOccupiedObj(pg *schedv1alpha1.PodGroup, pod *v1.Pod) { +// updateOwnerReferences ensures the group is always owned by the same entity that owns the pod +// This ensures that, for example, a job that is wrapping pods is the owner. +func updateOwnerReferences(pg *schedv1alpha1.PodGroup, pod *v1.Pod) { + + // Case 1: The pod itself doesn't have owner references. YOLO if len(pod.OwnerReferences) == 0 { return } @@ -184,64 +291,68 @@ func fillOccupiedObj(pg *schedv1alpha1.PodGroup, pod *v1.Pod) { } // SetupWithManager sets up the controller with the Manager. +// We watch the events channel, which is going to trigger from the mutating webhook +// to send over when a pod group is created (hopefully preceeding schedule). func (r *PodGroupReconciler) SetupWithManager(mgr ctrl.Manager) error { r.recorder = mgr.GetEventRecorderFor("PodGroupController") r.log = mgr.GetLogger() r.log.Info("setup with manager flux-framework/fluence-controller") return ctrl.NewControllerManagedBy(mgr). - Watches(&v1.Pod{}, handler.EnqueueRequestsFromMapFunc(r.podToPodGroup)). + Watches(&v1.Pod{}, handler.EnqueueRequestsFromMapFunc(r.ensurePodGroup)). For(&schedv1alpha1.PodGroup{}). WithOptions(controller.Options{MaxConcurrentReconciles: r.Workers}). 
Complete(r) } -// podToPodGroup is a watcher that looks for pods and associated pod group -func (r *PodGroupReconciler) podToPodGroup(ctx context.Context, obj client.Object) []ctrl.Request { - - r.log.Info("PANCAKES pre get pod in podToPodGroup flux-framework/fluence-controller") +func (r *PodGroupReconciler) ensurePodGroup(ctx context.Context, obj client.Object) []ctrl.Request { pod, ok := obj.(*v1.Pod) if !ok { return nil } - r.log.Info("PANCAKES post get pod in podToPodGroup flux-framework/fluence-controller") - r.log.V(5).Info("Running podToPodGroup", "pod", pod.Name, "namespace", pod.Namespace) - pgName := util.GetPodGroupLabel(pod) - if len(pgName) == 0 { + groupName := util.GetPodGroupLabel(pod) + + // This case only happens when something is not scheduled by fluence + if len(groupName) == 0 { + r.log.Info("Pod: ", "Name", pod.Name, "Status", pod.Status.Phase, "Action", "Not fluence owned") return nil } - r.log.V(5).Info("Add pod group when pod gets added", "podGroup", pgName, "pod", pod.Name, "namespace", pod.Namespace) + // If we are watching the Pod and it's beyond pending, we hopefully already made a group + // and that group should be in the reconcile process. + if pod.Status.Phase != v1.PodPending { + r.log.Info("Pod: ", "Name", pod.Name, "Status", pod.Status.Phase, "Action", "Skipping reconcile") + return nil + } - // TODO we need an ability to trigger a create here. Likely we will just add - // the create function to watches. I'm wondering if we want to set the owner - // to the pod or the job that triggers? - // newPodGroup ensures we have a pod group - /*func newPodGroup(name, namespace string, size int32, pod *v1.Pod) { + // At this point we should have a group size (string) set by the webhook + rawSize := pod.Labels[fluenceLabels.PodGroupSizeLabel] + groupSize, err := strconv.ParseInt(rawSize, 10, 32) + if err != nil { + r.log.Error(err, "Parsing PodGroup size.") + return nil + } - // Create an owner reference to the pod - // https://github.com/kubernetes/apimachinery/blob/master/pkg/apis/meta/v1/types.go#L295 - ownerRef := metav1.OwnerReferences{ - Kind: pod.ObjectMeta.Kind, - Name: pod.Name, - APIVersion: pod.ObjectMeta.APIVersion, - UID: pod.ObjectMeta.UID, - } - pg := PodGroup{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - Namespace: namespace, - OwnerReferences: []metav1.OwnerReferences{ownerRef}, - }, - Spec: PodGroupSpec{ - MinMember: size, - }, - } - }*/ + namespacedName := types.NamespacedName{ + Namespace: pod.Namespace, + Name: groupName, + } - return []ctrl.Request{{ - NamespacedName: types.NamespacedName{ - Namespace: pod.Namespace, - Name: pgName, - }}} + // Create the pod group if the pod is pending + pg := &schedv1alpha1.PodGroup{} + if err := r.Get(ctx, namespacedName, pg); err != nil { + + // Case 1: if we get here and it's not found, assume not created + if apierrs.IsNotFound(err) { + r.log.Info("Pod: ", "Status", pod.Status.Phase, "Name", pod.Name, "Group", groupName, "Namespace", pod.Namespace, "Action", "Creating PodGroup") + + // TODO should an owner be set here? Setting to a specific pod seems risky/wrong in case deleted. 
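+			// Create the group now, with the parsed size and its creation time,
+			// so it already exists by the time fluence tries to sort and schedule the pod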
+ err, _ := r.newPodGroup(ctx, groupName, pod.Namespace, int32(groupSize)) + if err != nil { + return []ctrl.Request{{NamespacedName: namespacedName}} + } + r.log.Info("Pod: ", "Status", pod.Status.Phase, "Name", pod.Name, "Group", groupName, "Namespace", pod.Namespace, "Action", "Issue Creating PodGroup") + } + } + return nil } diff --git a/sig-scheduler-plugins/pkg/fluence/group/group.go b/sig-scheduler-plugins/pkg/fluence/group/group.go index 291ad17..4af84e2 100644 --- a/sig-scheduler-plugins/pkg/fluence/group/group.go +++ b/sig-scheduler-plugins/pkg/fluence/group/group.go @@ -67,7 +67,7 @@ func DeleteFluenceGroup(pod *v1.Pod) { // getFluenceGroupName looks for the group to indicate a fluence group, and returns it func getFluenceGroupName(pod *v1.Pod) string { - groupName, _ := pod.Labels[labels.PodGroupNameLabel] + groupName, _ := pod.Labels[labels.PodGroupLabel] return groupName } diff --git a/sig-scheduler-plugins/pkg/fluence/labels/labels.go b/sig-scheduler-plugins/pkg/fluence/labels/labels.go index e409ddc..e0040ea 100644 --- a/sig-scheduler-plugins/pkg/fluence/labels/labels.go +++ b/sig-scheduler-plugins/pkg/fluence/labels/labels.go @@ -3,6 +3,10 @@ package labels // Labels to be shared between different components const ( - PodGroupNameLabel = "fluence.pod-group" + // https://github.com/kubernetes-sigs/scheduler-plugins/blob/master/apis/scheduling/v1alpha1/types.go#L109 + PodGroupLabel = "scheduling.x-k8s.io/pod-group" + + // TODO add more labels here, to be discovered used later + //PodGroupNameLabel = "fluence.pod-group" PodGroupSizeLabel = "fluence.group-size" ) From 7874d571601839ff8b7bc257b61e8211d22f60e2 Mon Sep 17 00:00:00 2001 From: vsoch Date: Sun, 18 Feb 2024 16:09:01 -0700 Subject: [PATCH 15/28] fluence: refactor to use new PodGroup Problem: fluence should only be storing state of jobid and presence of a group name in a map to indicate node assignment. Soluion: update the code here. Note that this is not working yet, and I am pushing / opening the PR to not use the work (and will update accordingly, and using this PR to test). Signed-off-by: vsoch --- README.md | 198 ++++++------ docs/README.md | 25 ++ .../simple_example/fluence-scheduler-pod.yaml | 4 +- .../pkg/fluence/core/core.go | 202 +++++------- sig-scheduler-plugins/pkg/fluence/events.go | 150 +++++++++ sig-scheduler-plugins/pkg/fluence/fluence.go | 295 ++++++------------ .../pkg/fluence/group/group.go | 96 +----- .../pkg/fluence/utils/utils.go | 14 +- src/fluence/fluxion/fluxion.go | 2 +- 9 files changed, 466 insertions(+), 520 deletions(-) create mode 100644 docs/README.md create mode 100644 sig-scheduler-plugins/pkg/fluence/events.go diff --git a/README.md b/README.md index 4f33cd9..8556dd1 100644 --- a/README.md +++ b/README.md @@ -9,11 +9,11 @@ Fluence enables HPC-grade pod scheduling in Kubernetes via the [Kubernetes Sched ## Getting started For instructions on how to start Fluence on a K8s cluster, see [examples](examples/). Documentation and instructions for reproducing our CANOPIE-2022 paper (citation below) can be found in the [canopie22-artifacts branch](https://github.com/flux-framework/flux-k8s/tree/canopie22-artifacts). -For background on the Flux framework and the Fluxion scheduler, you can take a look at our award-winning R&D100 submission: https://ipo.llnl.gov/sites/default/files/2022-02/Flux_RD100_Final.pdf. 
For next steps: +For background on the Flux framework and the Fluxion scheduler, you can take a look at our award-winning [R&D100 submission](https://ipo.llnl.gov/sites/default/files/2022-02/Flux_RD100_Final.pdf). For next steps: - To understand how it works, see [Design](#design) - To deploy our pre-built images, go to [Deploy](#deploy) - - To build your own images, go to [Setup](#setup) + - To build your own images, go to [Build](#build) - To learn about repository organization, see [Developer](#developer) ### Design @@ -21,19 +21,47 @@ For background on the Flux framework and the Fluxion scheduler, you can take a l Fluence is a custom scheduler plugin that you can specify to use with two directive in your pod spec - - Asking for `fluence` as the scheduler name -- On either a job or a single or group of pods: - - Defining a named group of pods with the `fluence.flux-framework.org/pod-group` label. - - Defining the group size with the `fluence.flux-framework.org/group-size` label. + +Note that any abstraction with pods (or a single pod) marked for fluence will automatically have the group name +and nodes derived. However, if you want to customize this metadata (for example, define the size of the pod group explicitly you can use +the following labels): + + - A named group of pods with the `scheduling.x-k8s.io/pod-group` label. + - Defining the group size with the `fluence.group-size` label. + +We expect to define more labels to customize the scheduling logic. The way it works: -1. We have a mutating admission webhook that looks for jobs and pods, and ensures there are fluence labels. -2. A PodGroup reconciler is watching for these same objects. When they are created (this is not done yet): +1. We have a mutating admission webhook that looks for jobs and pods, and ensures there are fluence labels (likely we will add more abstractions). +2. A PodGroup reconciler is watching for these same objects. When they are created: a. We find the labels and create the pod group object. - b. The pod group object has a timestamp for creation. + b. The pod group object has a timestamp for creation in milliseconds. 3. When the pod is then given to fluence for scheduling, it already has the PodGroup created with name/size and can properly sort. -Another strategy I'm considering (if the above runs into trouble) is to watch a [channel](https://book-v1.book.kubebuilder.io/beyond_basics/controller_watches). An example is shown below for an indexed job, which will create multiple pods. 
+Here is an example of a Job intended for Fluence: + +```yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: fluence-job +spec: + completions: 10 + parallelism: 10 + completionMode: Indexed + template: + spec: + schedulerName: fluence + containers: + - name: fluence-job + image: busybox + command: [echo, potato] + restartPolicy: Never + backoffLimit: 4 +``` + +And you can imagine if you want to group pods from different abstractions together, or declare a different size than what is represented in the Job: ```yaml apiVersion: batch/v1 @@ -41,8 +69,8 @@ kind: Job metadata: name: fluence-job labels: - fluence.flux-framework.org/pod-group: my-pods - fluence.flux-framework.org/group-size: 10 + scheduling.x-k8s.io/pod-group: min-size-group + fluence.group-size: 5 spec: completions: 10 parallelism: 10 @@ -58,8 +86,7 @@ spec: backoffLimit: 4 ``` -The group size might be different than, for example, your higher level abstraction (e.g., the IndexedJob) as there is no reason -pods with different names cannot be part of the same group that needs to be scheduled together. +There is no reason pods with different names or under different abstractions cannot be part of the same group that needs to be scheduled together. ### Deploy @@ -88,7 +115,7 @@ helm install \ And that's it! See the [testing install](#testing-install) section for a basic example to schedule pods using Fluence. -### Setup +### Build To build and test Fluence, you will need: @@ -96,9 +123,7 @@ To build and test Fluence, you will need: - [helm](https://helm.sh/docs/intro/install/) to install charts for scheduler plugins. - A Kubernetes cluster for testing, e.g., you can deploy one with [kind](https://kind.sigs.k8s.io/docs/user/quick-start/) -### Building Fluence - -There are two images we will be building: +There are three images we will be building: - the scheduler sidecar: built from the repository here - the scheduler: built (and modified) from [this branch of scheduler-plugins](https://github.com/openshift-psap/scheduler-plugins/blob/fluence/build/scheduler/Dockerfile) @@ -111,7 +136,7 @@ There are two images we will be building: This will run the full builds for all containers in one step, which includes: 1. Building the fluence sidecar from source code in [src](src) -2. Cloning the upstream kubernetes-sigs/plugin-schedulers respository to ./upstream +2. Cloning the upstream kubernetes-sigs/plugin-schedulers repository to ./upstream 3. Building the scheduler and controller containers From the root here: @@ -128,26 +153,18 @@ make REGISTRY=vanessa SCHEDULER_IMAGE=fluence SIDECAR_IMAGE=fluence-sidecar CONT As an alternative, you can look at the Makefile to do each of the steps separately. - -Whatever build approach you use, you'll want to push to your registry for later discovery! - -```bash -docker push docker.io/vanessa/fluence -docker push docker.io/vanessa/fluence-sidecar -docker push docker.io/vanessa/fluence-controller -``` - -### Prepare Cluster +#### Prepare Cluster > Prepare a cluster and install the Kubernetes scheduling plugins framework -These steps will require a Kubernetes cluster to install to, and having pushed the plugin container to a registry. If you aren't using a cloud provider, you can create a local one with `kind`: +These steps will require a Kubernetes cluster to install to, and having pushed the plugin container to a registry OR loading +them into the local cluster and setting the image pull policy to `Never`. 
If you aren't using a cloud provider, you can create a local one with `kind`: ```bash kind create cluster --config ./examples/kind-config.yaml ``` -And install the certificate manager: +And again install the certificate manager: ```bash kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.13.1/cert-manager.yaml @@ -155,7 +172,7 @@ kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/ **Important** if you are developing or testing fluence, note that custom scheduler plugins don't seem to work out of the box with MiniKube (but everything works with kind). Likely there are extensions or similar that need to be configured with MiniKube (that we have not looked into). -### Install Fluence +#### Install Fluence For some background, the [Scheduling Framework](https://kubernetes.io/docs/concepts/scheduling-eviction/scheduling-framework/) provided by Kubernetes means that our container is going to provide specific endpoints to allow for custom scheduling. At this point you can follow the instructions @@ -184,19 +201,26 @@ helm show values as-a-second-scheduler/ scheduler: name: fluence - image: registry.k8s.io/scheduler-plugins/kube-scheduler:v0.27.8 + image: ghcr.io/flux-framework/fluence:latest replicaCount: 1 leaderElect: false sidecarimage: ghcr.io/flux-framework/fluence-sidecar:latest policy: lonode pullPolicy: Always sidecarPullPolicy: Always + loggingLevel: "9" + + # Port is for GRPC, and enabling the external service will also + # create the service and ingress to it, along with adding + # additional API endpoints for our TBA kubectl plugin + enableExternalService: false + port: 4242 controller: name: scheduler-plugins-controller image: ghcr.io/flux-framework/fluence-controller:latest replicaCount: 1 - pullPolicy: IfNotPresent + pullPolicy: Always # LoadVariationRiskBalancing and TargetLoadPacking are not enabled by default # as they need extra RBAC privileges on metrics.k8s.io. @@ -217,6 +241,15 @@ pluginConfig: # args: # scoringStrategy: # type: MostAllocated # default is LeastAllocated + +enableCertManager: true +kubernetesClusterDomain: cluster.local +webhookService: + ports: + - port: 9443 + protocol: TCP + targetPort: 9443 + type: ClusterIP ``` @@ -239,9 +272,15 @@ If you need to uninstall (e.g., to redo something): helm uninstall fluence ``` +Or see the name you used: + +```bash +helm list +``` + Next you can move down to testing the install. -### Testing Install +#### Testing Install The installation process will run one scheduler and one controller pod for the Scheduler Plugin Framework in the default namespace. You can double check that everything is running as follows: @@ -284,35 +323,40 @@ kubectl logs fluence-6bbcbc6bbf-xjfx6 -c scheduler-plugins-scheduler If you haven't done anything, you'll likely just see health checks. -### Deploy Pods +#### Testing Pods and Jobs -Let's now run a simple example! Change directory into this directory: +You can test deploying pods and jobs. ```bash -# This is from the root of flux-k8s -cd examples/simple_example +kubectl apply -f examples/simple_example/fluence-scheduler-pod.yaml ``` +or a job: -And then we want to deploy two pods, one assigned to the `default-scheduler` and the other -`fluence`. 
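If you want to double check that fluence (and not the default scheduler) placed the pods, a quick sanity check along these lines should work (the job name assumes the sized-job example in this repository; adjust to whatever you applied):

```bash
# Show which node each Job pod landed on (Kubernetes adds the job-name label to Job pods)
kubectl get pods -l job-name=fluence-sized-job -o wide

# The Scheduled events should reference fluence rather than default-scheduler
kubectl get events -o wide | grep Scheduled
```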
For FYI, we do this via setting `schedulerName` in the spec: +```bash +# size 3 +kubectl apply -f examples/test_example/fluence-sized-job.yaml + +# size 1 +kubectl apply -f examples/test_example/fluence-job.yaml +``` + +Note that all of these have (in their spec) a designation of the fluence scheduler. ```yaml spec: schedulerName: fluence ``` -Here is how to create the pods: +Once it was created, aside from checking that it ran OK, you can verify by looking at the scheduler logs again: ```bash -kubectl apply -f default-scheduler-pod.yaml -kubectl apply -f fluence-scheduler-pod.yaml +kubectl logs fluence-6bbcbc6bbf-xjfx6 ``` -Once it was created, aside from checking that it ran OK, I could verify by looking at the scheduler logs again: +
+ +Scheduler Logs -```bash -kubectl logs fluence-6bbcbc6bbf-xjfx6 -``` ```bash Defaulted container "sidecar" out of: sidecar, scheduler-plugins-scheduler This is the fluxion grpc server @@ -361,6 +405,8 @@ FINAL NODE RESULT: [GRPCServer] Response podID:"fluence-scheduled-pod" nodelist:{nodeID:"kind-control-plane" tasks:1} jobID:1 ``` +
+ I was trying to look for a way to see the assignment, and maybe we can see it here (this is the best I could come up with!) ```bash @@ -385,7 +431,6 @@ pod/fluence-scheduled-pod spec.containers{fluence-scheduled-container} kubelet For the above, I found [this page](https://kubernetes.io/docs/tasks/extend-kubernetes/configure-multiple-schedulers/#enable-leader-election) very helpful. -Finally, note that we also have a more appropriate example with jobs under [examples/test_example](examples/test_example). It's slightly more sane because it uses Job, and jobs are expected to complete (whereas pods are not and will get into crash loop backoffs, etc). For example of how to programmatically interact with the job pods and check states, events, see the [test.sh](.github/test.sh) script. ### Developer @@ -397,9 +442,10 @@ If you are looking to develop: - [src](src): includes source code for fluence. You'll find logs for this code in the `sidecar` container of the fluence pod. - [sig-scheduler-plugins](sig-scheduler-plugins): includes assets (manifests and Go files) that are intended to be added to the kubernetes-sigs/scheduler-plugins upstream repository before build. You'll find logs for this container in the `scheduler-plugins-scheduler` container of the pod. + - [apis](sig-scheduler-plugins/apis): customized PodGroup to define the status scheduled time in micro seconds - [manifests](sig-scheduler-plugins/manifests): manifests for helm and Kubernetes - [pkg](sig-scheduler-plugins/pkg): the main fluence module to add to upstream - - [cmd](sig-scheduler-plugins/cmd): the main.go to replace in upstream + - [cmd](sig-scheduler-plugins/cmd): the main.go to replace in upstream - *upstream*: the default name this upstream is cloned to when you do a make build command. Note that the clone of the repository and copying of files to the correct locations is all automated through the [Makefile](Makefile). Additional commands provided include the following: @@ -447,7 +493,7 @@ I was having trouble developing this easily because it's a lot of steps to build The last step ensures we use the images we loaded! You can basically just do: ```bash -./hack/quick-build.sh +/bin/bash ./hack/quick-build.sh ``` This sped up my development time immensely. If you want to manually do the steps, see that script for instructions. @@ -474,60 +520,18 @@ kind create cluster --config ./kind-config.yaml #### TODO - Try what [kueue does](https://github.com/kubernetes-sigs/kueue/blob/6d57813a52066dab412735deeeb60ebb0cdb8e8e/cmd/kueue/main.go#L146-L155) to not require cert-manager. - -#### Vanessa Thinking - -> Updated February 15, 2024 - -What I think might be happening (and not always, sometimes) - -- New pod group, no node list -- Fluence assigns nodes -- Nodes get assigned to pods 1:1 -- POD group is deleted -- Some pod is sent back to queue (kubelet rejects, etc) -- POD group does not exist and is recreated, no node list -- Fluence asks again, but still has the first job. Not enough resources, asks forever. - -The above would not happen with the persistent pod group (if it wasn't cleaned up until the deletion of the job) and wouldn't happen if there are just enough resources to account for the overlap. - -- Does Fluence allocate resources for itself? -- It would be nice to be able to inspect the state of Fluence. 
-- At some point we want to be using the TBA fluxion-go instead of the one off branch we currently have (but we don't need to be blocked for that) -- We should (I think) restore pod group (it's in the controller here) and have our own container built. That way we have total control over the custom resource, and we don't risk it going away. - - As a part of that, we can add add a mutating webhook that emulates what we are doing in fluence now to find the label, but instead we will create the CRD to hold state instead of trying to hold in the operator. -- It could then also be investigated that we can more flexibly change the size of the group, within some min/max size (also determined by labels?) to help with scheduling. -- Note that kueue has added a Pod Group object, so probably addresses the static case here. + - Add other abstraction types to be intercepted (and labeled with sizes) #### Components - [FluxStateData](sig-scheduler-plugins/pkg/fluence/core/core.go): is given to the [framework.CycleState](https://github.com/kubernetes/kubernetes/blob/242b41b36a20032f99e8a059ca0a5d764105217b/pkg/scheduler/framework/cycle_state.go#L48) and serves as a vehicle to store a cache of node name assignment. -#### Helm - -The install commands are shown above, but often you want to uninstall! - -> What is the name of the installed plugin again? - -```bash - helm list -NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION -fluence default 1 2024-01-08 12:04:58.558612156 -0700 MST deployed scheduler-plugins-0.27.80.27.8 -``` - -And then uninstall: - -```bash -$ helm uninstall fluence -release "fluence" uninstalled -``` - - ## Papers You can find details of Fluence architecture, implementation, experiments, and improvements to the Kubeflow MPI operator in our collaboration's papers: -``` + +```bibtex @INPROCEEDINGS{10029991, author={Milroy, Daniel J. and Misale, Claudia and Georgakoudis, Giorgis and Elengikal, Tonia and Sarkar, Abhik and Drocco, Maurizio and Patki, Tapasya and Yeom, Jae-Seung and Gutierrez, Carlos Eduardo Arango and Ahn, Dong H. and Park, Yoonho}, booktitle={2022 IEEE/ACM 4th International Workshop on Containers and New Orchestration Paradigms for Isolated Environments in HPC (CANOPIE-HPC)}, @@ -539,7 +543,7 @@ You can find details of Fluence architecture, implementation, experiments, and i doi={10.1109/CANOPIE-HPC56864.2022.00011} } ``` -``` +```bibtex @INPROCEEDINGS{9652595, author={Misale, Claudia and Drocco, Maurizio and Milroy, Daniel J. and Gutierrez, Carlos Eduardo Arango and Herbein, Stephen and Ahn, Dong H. and Park, Yoonho}, booktitle={2021 3rd International Workshop on Containers and New Orchestration Paradigms for Isolated Environments in HPC (CANOPIE-HPC)}, @@ -551,7 +555,7 @@ You can find details of Fluence architecture, implementation, experiments, and i doi={10.1109/CANOPIEHPC54579.2021.00006} } ``` -``` +```bibtex @inproceedings{10.1007/978-3-030-96498-6_18, address = {Cham}, author = {Misale, Claudia and Milroy, Daniel J. and Gutierrez, Carlos Eduardo Arango and Drocco, Maurizio and Herbein, Stephen and Ahn, Dong H. 
and Kaiser, Zvonko and Park, Yoonho}, diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..155ffc8 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,25 @@ +# Development Notes + +## Thinking + +> Updated February 15, 2024 + +What I think might be happening (and not always, sometimes) + +- New pod group, no node list +- Fluence assigns nodes +- Nodes get assigned to pods 1:1 +- POD group is deleted +- Some pod is sent back to queue (kubelet rejects, etc) +- POD group does not exist and is recreated, no node list +- Fluence asks again, but still has the first job. Not enough resources, asks forever. + +The above would not happen with the persistent pod group (if it wasn't cleaned up until the deletion of the job) and wouldn't happen if there are just enough resources to account for the overlap. + +- Does Fluence allocate resources for itself? +- It would be nice to be able to inspect the state of Fluence. +- At some point we want to be using the TBA fluxion-go instead of the one off branch we currently have (but we don't need to be blocked for that) +- We should (I think) restore pod group (it's in the controller here) and have our own container built. That way we have total control over the custom resource, and we don't risk it going away. + - As a part of that, we can add add a mutating webhook that emulates what we are doing in fluence now to find the label, but instead we will create the CRD to hold state instead of trying to hold in the operator. +- It could then also be investigated that we can more flexibly change the size of the group, within some min/max size (also determined by labels?) to help with scheduling. +- Note that kueue has added a Pod Group object, so probably addresses the static case here. diff --git a/examples/simple_example/fluence-scheduler-pod.yaml b/examples/simple_example/fluence-scheduler-pod.yaml index 71a8463..b09c714 100644 --- a/examples/simple_example/fluence-scheduler-pod.yaml +++ b/examples/simple_example/fluence-scheduler-pod.yaml @@ -1,11 +1,11 @@ apiVersion: v1 kind: Pod metadata: - name: fluence-scheduled-pod-1 + name: fluence-scheduled-pod labels: name: scheduler-example spec: schedulerName: fluence containers: - name: fluence-scheduled-container - image: registry.k8s.io/pause:2.0 \ No newline at end of file + image: registry.k8s.io/pause:2.0 diff --git a/sig-scheduler-plugins/pkg/fluence/core/core.go b/sig-scheduler-plugins/pkg/fluence/core/core.go index 53a627e..a3f4531 100644 --- a/sig-scheduler-plugins/pkg/fluence/core/core.go +++ b/sig-scheduler-plugins/pkg/fluence/core/core.go @@ -3,10 +3,7 @@ package core import ( "fmt" - v1 "k8s.io/api/core/v1" - - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/klog/v2" + klog "k8s.io/klog/v2" "k8s.io/kubernetes/pkg/scheduler/framework" pb "sigs.k8s.io/scheduler-plugins/pkg/fluence/fluxcli-grpc" @@ -26,13 +23,9 @@ func (s *FluxStateData) Clone() framework.StateData { return &FluxStateData{NodeCache: s.NodeCache} } -// NewFluxState creates an entry for the CycleState with the minimum that we might need -func NewFluxState(nodeName string, groupName string, size int32) *FluxStateData { - cache := NodeCache{ - NodeName: nodeName, - GroupName: groupName, - MinGroupSize: size, - } +// NewFluxState creates an entry for the CycleState with the node and group name +func NewFluxState(nodeName string, groupName string) *FluxStateData { + cache := NodeCache{NodeName: nodeName} return &FluxStateData{NodeCache: cache} } @@ -42,162 +35,127 @@ func NewFluxState(nodeName string, groupName 
string, size int32) *FluxStateData type NodeCache struct { NodeName string - // This is derived from tasks, where - // task is an allocation to some node - // High level it is most often akin to the - // number of pods on the node. I'm not sure that I understand this - // https://github.com/flux-framework/flux-k8s/blob/9f24f36752e3cced1b1112d93bfa366fb58b3c84/src/fluence/fluxion/fluxion.go#L94-L97 - // How does that relate to a single pod? It is called "Count" in other places - Tasks int + // Tie assignment back to PodGroup, which can be used to get size and time created + GroupName string - // These fields are primarily for the FluxStateData - // Without a PodGroup CRD we keep min size here - MinGroupSize int32 - GroupName string + // Assigned tasks (often pods) to nodes + // https://github.com/flux-framework/flux-k8s/blob/9f24f36752e3cced1b1112d93bfa366fb58b3c84/src/fluence/fluxion/fluxion.go#L94-L97 + AssignedTasks int } // A pod group cache holds a list of nodes for an allocation, where each has some number of tasks // along with the expected group size. This is intended to replace PodGroup // given the group name, size (derived from annotations) and timestamp type PodGroupCache struct { + GroupName string // This is a cache of nodes for pods Nodes []NodeCache - Size int32 - Name string - - // Keep track of when the group was initially created! - // This is like, the main thing we need. - TimeCreated metav1.MicroTime } -// Memory cache of pod group name to pod group cache, above -var podGroupCache map[string]*PodGroupCache +// PodGroups seen by fluence +var groupsSeen map[string]*PodGroupCache -// Init populates the podGroupCache +// Init populates the groupsSeen cache func Init() { - podGroupCache = map[string]*PodGroupCache{} -} - -// RegisterPodGroup ensures that the PodGroup exists in the cache -// This is an experimental replacement for an actual PodGroup -// We take a timestampo, which if called from Less (during sorting) is tiem.Time -// if called later (an individual pod) we go for its creation timestamp -func RegisterPodGroup(pod *v1.Pod, groupName string, groupSize int32) error { - entry, ok := podGroupCache[groupName] - - if !ok { - - // Assume we create the group with the timestamp - // of the first pod seen. There might be imperfections - // by the second, but as long as we sort them via millisecond - // this should prevent interleaving - nodes := []NodeCache{} - - // Create the new entry for the pod group - entry = &PodGroupCache{ - Name: groupName, - Size: groupSize, - Nodes: nodes, - TimeCreated: metav1.NowMicro(), - } - - // Tell the user when it was created - klog.Infof("[Fluence] Pod group %s was created at %s\n", entry.Name, entry.TimeCreated) - } - - // If the size has changed, we currently do not allow updating it. - // We issue a warning. In the future this could be supported with a grow command. - if entry.Size != groupSize { - klog.Infof("[Fluence] Pod group %s request to change size from %s to %s is not yet supported\n", groupName, entry.Size, groupSize) - // entry.GroupSize = groupSize - } - podGroupCache[groupName] = entry - return nil + groupsSeen = map[string]*PodGroupCache{} } -// GetPodGroup gets a pod group in the cache by name -func GetPodGroup(groupName string) *PodGroupCache { - entry, _ := podGroupCache[groupName] +// GetFluenceCache determines if a group has been seen. 
+// Yes -> we return the PodGroupCache entry +// No -> the entry is nil / does not exist +func GetFluenceCache(groupName string) *PodGroupCache { + entry, _ := groupsSeen[groupName] return entry } // DeletePodGroup deletes a pod from the group cache func DeletePodGroup(groupName string) { - delete(podGroupCache, groupName) -} - -// ListGroups lists groups, primarily for debugging -func ListGroups() { - for name, pg := range podGroupCache { - fmt.Printf(" %s: size %s, created at %s\n", name, pg.Size, &pg.TimeCreated) - } + delete(groupsSeen, groupName) } // CreateNodePodsList creates a list of node pod caches -func CreateNodePodsList(nodelist []*pb.NodeAlloc, groupName string) (nodepods []NodeCache) { +func CreateNodeList(nodelist []*pb.NodeAlloc, groupName string) (nodepods []NodeCache) { // Create a pod cache for each node nodepods = make([]NodeCache, len(nodelist)) + // TODO: should we be integrating topology information here? Could it be the + // case that some nodes (pods) in the group should be closer? for i, v := range nodelist { nodepods[i] = NodeCache{ - NodeName: v.GetNodeID(), - Tasks: int(v.GetTasks()), + NodeName: v.GetNodeID(), + AssignedTasks: int(v.GetTasks()), + GroupName: groupName, } } - // Update the pods in the PodGraphCache - updatePodGroupNodes(groupName, nodepods) - klog.Infof("[Fluence] Pod group cache updated with nodes\n", podGroupCache) + // Update the pods in the PodGroupCache (groupsSeen) + updatePodGroupCache(groupName, nodepods) return nodepods } // updatePodGroupList updates the PodGroupCache with a listing of nodes -func updatePodGroupNodes(groupName string, nodes []NodeCache) { - group := podGroupCache[groupName] - group.Nodes = nodes - podGroupCache[groupName] = group +func updatePodGroupCache(groupName string, nodes []NodeCache) { + cache := PodGroupCache{ + Nodes: nodes, + GroupName: groupName, + } + groupsSeen[groupName] = &cache } -// HavePodNodes returns true if the listing of pods is not empty -// This should be all pods that are needed - the allocation will not -// be successful otherwise, so we just check > 0 -func (p *PodGroupCache) HavePodNodes() bool { - return len(p.Nodes) > 0 -} +// GetNextNode gets the next node in the PodGroupCache +func (p *PodGroupCache) GetNextNode() (string, error) { -// CancelAllocation resets the node cache and allocation status -func (p *PodGroupCache) CancelAllocation() { - p.Nodes = []NodeCache{} -} + nextnode := "" -// GetNextNode gets the next available node we can allocate for a group -func GetNextNode(groupName string) (string, error) { - entry, ok := podGroupCache[groupName] - if !ok { - return "", fmt.Errorf("[Fluence] Map is empty\n") - } - if len(entry.Nodes) == 0 { - return "", fmt.Errorf("[Fluence] Error while getting a node\n") + // Quick failure state - we ran out of nodes + if len(p.Nodes) == 0 { + return nextnode, fmt.Errorf("[Fluence] PodGroup %s ran out of nodes.", p.GroupName) } - nodename := entry.Nodes[0].NodeName - klog.Infof("[Fluence] Next node for group %s is %s", groupName, nodename) + // The next is the 0th in the list + nextnode = p.Nodes[0].NodeName + klog.Infof("[Fluence] Next node for group %s is %s", p.GroupName, nextnode) - if entry.Nodes[0].Tasks == 1 { - klog.Infof("[Fluence] First node has one task") - slice := entry.Nodes[1:] + // If there is only one task left, we are going to use it (and remove the node) + if p.Nodes[0].AssignedTasks == 1 { + klog.Infof("[Fluence] First node has one remaining task slot") + slice := p.Nodes[1:] + + // If after we remove the node there are no 
nodes left... + // Note that I'm not deleting the node from the cache because that is the + // only way fluence knows it has already assigned work (presence of the key) if len(slice) == 0 { - klog.Infof("[Fluence] After this node, the slice is empty, deleting group %s from cache\n", groupName) - delete(podGroupCache, groupName) - return nodename, nil + klog.Infof("[Fluence] Assigning node %s. There are NO remaining nodes for group %s\n", nextnode, p.GroupName) + // delete(podGroupCache, groupName) + return nextnode, nil } - klog.Infof("[Fluence] After this node, the slide still has nodes") - updatePodGroupNodes(groupName, slice) - return nodename, nil + + klog.Infof("[Fluence] Assigning node %s. There are nodes left for group %s", nextnode, p.GroupName) + updatePodGroupCache(p.GroupName, slice) + return nextnode, nil + } + + // If we get here the first node had >1 assigned tasks + klog.Infof("[Fluence] Assigning node %s for group %s. There are still task assignments available for this node.", nextnode, p.GroupName) + p.Nodes[0].AssignedTasks = p.Nodes[0].AssignedTasks - 1 + return nextnode, nil +} + +// GetNextNode gets the next available node we can allocate for a group +// TODO this should be able to take and pass forward a number of tasks. +// It is implicitly 1 now, but doesn't have to be. +func GetNextNode(groupName string) (string, error) { + + // Get our entry from the groupsSeen cache + klog.Infof("[Fluence] groups seen %s", groupsSeen) + entry, ok := groupsSeen[groupName] + + // This case should not happen + if !ok { + return "", fmt.Errorf("[Fluence] Map is empty") } - klog.Infof("[Fluence] Subtracting one task from first node") - entry.Nodes[0].Tasks = entry.Nodes[0].Tasks - 1 - return nodename, nil + // Get the next node from the PodGroupCache + return entry.GetNextNode() } diff --git a/sig-scheduler-plugins/pkg/fluence/events.go b/sig-scheduler-plugins/pkg/fluence/events.go new file mode 100644 index 0000000..bc265f7 --- /dev/null +++ b/sig-scheduler-plugins/pkg/fluence/events.go @@ -0,0 +1,150 @@ +package fluence + +import ( + "context" + "time" + + "google.golang.org/grpc" + v1 "k8s.io/api/core/v1" + klog "k8s.io/klog/v2" + + pb "sigs.k8s.io/scheduler-plugins/pkg/fluence/fluxcli-grpc" +) + +// Events are associated with informers, typically on pods, e.g., +// delete: deletion of a pod +// update: update of a pod! +// For both of the above, there are cases to cancel the flux job +// associated with the group id + +// cancelFluxJob cancels the flux job for a pod group.
+// We assume that the cancelled job also means deleting the pod group +func (f *Fluence) cancelFluxJob(groupName string) error { + + jobid, ok := f.groupToJobId[groupName] + + // The job was already cancelled by another pod + if !ok { + klog.Infof("[Fluence] Request for cancel of group %s is already complete.", groupName) + return nil + } + klog.Infof("[Fluence] Cancel flux job: %v for group %s", jobid, groupName) + + // This first error is about connecting to the server + conn, err := grpc.Dial("127.0.0.1:4242", grpc.WithInsecure()) + if err != nil { + klog.Errorf("[Fluence] Error connecting to server: %v", err) + return err + } + defer conn.Close() + + grpcclient := pb.NewFluxcliServiceClient(conn) + _, cancel := context.WithTimeout(context.Background(), 200*time.Second) + defer cancel() + + // This error reflects the success or failure of the cancel request + request := &pb.CancelRequest{JobID: int64(jobid)} + res, err := grpcclient.Cancel(context.Background(), request) + if err != nil { + klog.Errorf("[Fluence] did not receive any cancel response: %v", err) + return err + } + klog.Infof("[Fluence] Job cancellation for group %s result: %d", groupName, res.Error) + + // And this error is if the cancel was successful or not + if res.Error == 0 { + klog.Infof("[Fluence] Successful cancel of flux job: %d for group %s", jobid, groupName) + delete(f.groupToJobId, groupName) + } else { + klog.Warningf("[Fluence] Failed to cancel flux job %d for group %s", jobid, groupName) + } + return nil +} + +// updatePod is called on an update, and the old and new object are presented +func (f *Fluence) updatePod(oldObj, newObj interface{}) { + + oldPod := oldObj.(*v1.Pod) + newPod := newObj.(*v1.Pod) + + // a pod is updated, get the group + // TODO should we be checking group / size for old vs new? + groupName, _ := f.pgMgr.GetPodGroup(context.TODO(), oldPod) + + klog.Infof("[Fluence] Processing event for pod %s in group %s from %s to %s", newPod.Name, groupName, newPod.Status.Phase, oldPod.Status.Phase) + + switch newPod.Status.Phase { + case v1.PodPending: + // in this state we don't know if a pod is going to be running, thus we don't need to update job map + case v1.PodRunning: + // if a pod is start running, we can add it state to the delta graph if it is scheduled by other scheduler + case v1.PodSucceeded: + klog.Infof("[Fluence] Pod %s succeeded, Fluence needs to free the resources", newPod.Name) + + f.mutex.Lock() + defer f.mutex.Unlock() + + // Do we have the group id in our cache? 
If yes, we haven't deleted the jobid yet + // I am worried here that if some pods are succeeded and others pending, this could + // be a mistake - fluence would schedule it again + _, ok := f.groupToJobId[groupName] + if ok { + f.cancelFluxJob(groupName) + } else { + klog.Infof("[Fluence] Succeeded pod %s/%s in group %s doesn't have flux jobid", newPod.Namespace, newPod.Name, groupName) + } + + case v1.PodFailed: + + // a corner case need to be tested, the pod exit code is not 0, can be created with segmentation fault pi test + klog.Warningf("[Fluence] Pod %s in group %s failed, Fluence needs to free the resources", newPod.Name, groupName) + + f.mutex.Lock() + defer f.mutex.Unlock() + + _, ok := f.groupToJobId[groupName] + if ok { + f.cancelFluxJob(groupName) + } else { + klog.Errorf("[Fluence] Failed pod %s/%s in group %s doesn't have flux jobid", newPod.Namespace, newPod.Name, groupName) + } + case v1.PodUnknown: + // don't know how to deal with it as it's unknown phase + default: + // shouldn't enter this branch + } +} + +// deletePod handles the delete event handler +func (f *Fluence) deletePod(podObj interface{}) { + klog.Info("[Fluence] Delete Pod event handler") + pod := podObj.(*v1.Pod) + groupName, _ := f.pgMgr.GetPodGroup(context.TODO(), pod) + + klog.Infof("[Fluence] Delete pod %s in group %s has status %s", pod.Status.Phase, pod.Name, groupName) + switch pod.Status.Phase { + case v1.PodSucceeded: + case v1.PodPending: + klog.Infof("[Fluence] Pod %s completed and is Pending termination, Fluence needs to free the resources", pod.Name) + + f.mutex.Lock() + defer f.mutex.Unlock() + + _, ok := f.groupToJobId[groupName] + if ok { + f.cancelFluxJob(groupName) + } else { + klog.Infof("[Fluence] Terminating pod %s/%s in group %s doesn't have flux jobid", pod.Namespace, pod.Name, groupName) + } + case v1.PodRunning: + f.mutex.Lock() + defer f.mutex.Unlock() + + _, ok := f.groupToJobId[groupName] + if ok { + f.cancelFluxJob(groupName) + } else { + klog.Infof("[Fluence] Deleted pod %s/%s in group %s doesn't have flux jobid", pod.Namespace, pod.Name, groupName) + } + } +} diff --git a/sig-scheduler-plugins/pkg/fluence/fluence.go b/sig-scheduler-plugins/pkg/fluence/fluence.go index 26282e5..0e8ec21 100644 --- a/sig-scheduler-plugins/pkg/fluence/fluence.go +++ b/sig-scheduler-plugins/pkg/fluence/fluence.go @@ -1,19 +1,3 @@ -/* -Copyright 2022 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - package fluence import ( @@ -32,7 +16,7 @@ import ( clientscheme "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/tools/cache" corev1helpers "k8s.io/component-helpers/scheduling/corev1" - "k8s.io/klog/v2" + klog "k8s.io/klog/v2" "k8s.io/kubernetes/pkg/scheduler/framework" "sigs.k8s.io/controller-runtime/pkg/client" @@ -49,12 +33,9 @@ type Fluence struct { handle framework.Handle client client.Client - // Important: I tested moving this into the group, but it's a bad idea because - // we need to delete the group after the last allocation is given, and then we - // no longer have the ID. 
It might be a better approach to delete it elsewhere - // (but I'm not sure where that elsewhere could be) - podNameToJobId map[string]uint64 - pgMgr coschedulingcore.Manager + // Store jobid on the level of a group (which can be a single pod) + groupToJobId map[string]uint64 + pgMgr coschedulingcore.Manager } // Name is the name of the plugin used in the Registry and configurations. @@ -79,7 +60,7 @@ func (f *Fluence) Name() string { // https://github.com/kubernetes-sigs/scheduler-plugins/blob/master/pkg/coscheduling/coscheduling.go#L63 func New(_ runtime.Object, handle framework.Handle) (framework.Plugin, error) { - f := &Fluence{handle: handle, podNameToJobId: make(map[string]uint64)} + f := &Fluence{handle: handle, groupToJobId: make(map[string]uint64)} ctx := context.TODO() fcore.Init() @@ -106,7 +87,7 @@ func New(_ runtime.Object, handle framework.Handle) (framework.Plugin, error) { fieldSelector, err := fields.ParseSelector(",status.phase!=" + string(v1.PodSucceeded) + ",status.phase!=" + string(v1.PodFailed)) if err != nil { - klog.ErrorS(err, "ParseSelector failed") + klog.Errorf("ParseSelector failed %s", err) os.Exit(1) } @@ -116,6 +97,7 @@ func New(_ runtime.Object, handle framework.Handle) (framework.Plugin, error) { podInformer := informerFactory.Core().V1().Pods() scheduleTimeDuration := time.Duration(500) * time.Second + // https://github.com/kubernetes-sigs/scheduler-plugins/blob/master/pkg/coscheduling/core/core.go#L84 pgMgr := coschedulingcore.NewPodGroupManager( k8scli, handle.SnapshotSharedLister(), @@ -141,34 +123,27 @@ func New(_ runtime.Object, handle framework.Handle) (framework.Plugin, error) { // Less is used to sort pods in the scheduling queue in the following order. // 1. Compare the priorities of Pods. -// 2. Compare the initialization timestamps of fluence pod groups -// 3. Fall back, sort by namespace/name -// See https://kubernetes.io/docs/concepts/scheduling-eviction/scheduling-framework/ -// Less is part of Sort, which is the earliest we can see a pod unless we use gate -// IMPORTANT: Less sometimes is not called for smaller sizes, not sure why. -// To get around this we call it during PreFilter too. +// 2. Compare the initialization timestamps of PodGroups or Pods. +// 3. Compare the keys of PodGroups/Pods: /. func (f *Fluence) Less(podInfo1, podInfo2 *framework.QueuedPodInfo) bool { - klog.Infof("[Fluence] Ordering pods in Less") - - // ensure we have a PodGroup no matter what - klog.Infof("[Fluence] Comparing %s and %s", podInfo1.Pod.Name, podInfo2.Pod.Name) - podGroup1 := fgroup.EnsureFluenceGroup(podInfo1.Pod) - podGroup2 := fgroup.EnsureFluenceGroup(podInfo2.Pod) - - // First preference to priority, but only if they are different + klog.Infof("ordering pods in fluence scheduler plugin") prio1 := corev1helpers.PodPriority(podInfo1.Pod) prio2 := corev1helpers.PodPriority(podInfo2.Pod) - - // ...and only allow this to sort if they aren't the same - // The assumption here is that pods with priority are ignored by fluence if prio1 != prio2 { return prio1 > prio2 } + // Important: this GetPodGroup returns the first name as the Namespaced one, + // which is what fluence needs to distinguish between namespaces. Just the + // name could be replicated between different namespaces + ctx := context.TODO() + name1, podGroup1 := f.pgMgr.GetPodGroup(ctx, podInfo1.Pod) + name2, podGroup2 := f.pgMgr.GetPodGroup(ctx, podInfo2.Pod) + // Fluence can only compare if we have two known groups. 
// This tries for that first, and falls back to the initial attempt timestamp - creationTime1 := fgroup.GetCreationTimestamp(podGroup1, podInfo1) - creationTime2 := fgroup.GetCreationTimestamp(podGroup2, podInfo2) + creationTime1 := fgroup.GetCreationTimestamp(name1, podGroup1, podInfo1) + creationTime2 := fgroup.GetCreationTimestamp(name2, podGroup2, podInfo2) // If they are the same, fall back to sorting by name. if creationTime1.Equal(&creationTime2) { @@ -178,7 +153,7 @@ func (f *Fluence) Less(podInfo1, podInfo2 *framework.QueuedPodInfo) bool { } // PreFilter checks info about the Pod / checks conditions that the cluster or the Pod must meet. -// This still comes after sort +// This comes after sort func (f *Fluence) PreFilter( ctx context.Context, state *framework.CycleState, @@ -189,31 +164,46 @@ func (f *Fluence) PreFilter( // groupName will be named according to the single pod namespace / pod if there wasn't // a user defined group. This is a size 1 group we handle equivalently. - pg := fgroup.GetPodsGroup(pod) + groupName, pg := f.pgMgr.GetPodGroup(ctx, pod) + klog.Infof("[Fluence] Pod %s is in group %s with minimum members %d", pod.Name, groupName, pg.Spec.MinMember) - klog.Infof("[Fluence] Pod %s group size %d", pod.Name, pg.Size) - klog.Infof("[Fluence] Pod %s group name is %s", pod.Name, pg.Name) + // Has this podgroup been seen by fluence yet? If yes, we will have it in the cache + cache := fcore.GetFluenceCache(groupName) + klog.Infof("[Fluence] cache %s", cache) - // Note that it is always the case we have a group - // We have not yet derived a node list - if !pg.HavePodNodes() { - klog.Infof("[Fluence] Does not have nodes yet, asking Fluxion") - err := f.AskFlux(ctx, pod, int(pg.Size)) + // Fluence has never seen this before, we need to schedule an allocation + // It also could have been seen, but was not able to get one. + if cache == nil { + klog.Infof("[Fluence] Does not have nodes for %s yet, asking Fluxion", groupName) + + // groupName is the namespaced name / + err := f.AskFlux(ctx, pod, pg, groupName) if err != nil { klog.Infof("[Fluence] Fluxion returned an error %s, not schedulable", err.Error()) return nil, framework.NewStatus(framework.Unschedulable, err.Error()) } } - nodename, err := fcore.GetNextNode(pg.Name) - klog.Infof("Node Selected %s (%s:%s)", nodename, pod.Name, pg.Name) + + // We can only get here if an allocation is done (and there is no error above) + // The cache would only originally be nil if we didn't do that yet. It should + // always be defined (not nil) when we get here + cache = fcore.GetFluenceCache(groupName) + + // This is the next node in the list + nodename, err := fcore.GetNextNode(groupName) if err != nil { return nil, framework.NewStatus(framework.Unschedulable, err.Error()) } - - // Create a fluxState (CycleState) with things that might be useful/ - klog.Info("Node Selected: ", nodename) - cache := fcore.NodeCache{NodeName: nodename} - state.Write(framework.StateKey(pod.Name), &fcore.FluxStateData{NodeCache: cache}) + klog.Infof("Node Selected %s (pod %s:group %s)", nodename, pod.Name, groupName) + + // Create a fluxState (CycleState) with things that might be useful + // This isn't a PodGroupCache, but a single node cache, which also + // has group information, but just is for one node. 
Note that assigned + // tasks is hard coded to 1 but this isn't necessarily the case - we should + // eventually be able to GetNextNode for a number of tasks, for example + // (unless task == pod in which case it is always 1) + nodeCache := fcore.NodeCache{NodeName: nodename, GroupName: groupName, AssignedTasks: 1} + state.Write(framework.StateKey(pod.Name), &fcore.FluxStateData{NodeCache: nodeCache}) return nil, framework.NewStatus(framework.Success, "") } @@ -226,8 +216,16 @@ func (f *Fluence) Filter( ) *framework.Status { klog.Info("Filtering input node ", nodeInfo.Node().Name) - if v, e := cycleState.Read(framework.StateKey(pod.Name)); e == nil { - if value, ok := v.(*fcore.FluxStateData); ok && value.NodeCache.NodeName != nodeInfo.Node().Name { + state, err := cycleState.Read(framework.StateKey(pod.Name)) + + // No error means we retrieved the state + if err == nil { + + // Try to convert the state to FluxStateDate + value, ok := state.(*fcore.FluxStateData) + + // If we have state data that isn't equal to the current assignment, no go + if ok && value.NodeCache.NodeName != nodeInfo.Node().Name { return framework.NewStatus(framework.Unschedulable, "pod is not permitted") } else { klog.Infof("Filter: node %s selected for %s\n", value.NodeCache.NodeName, pod.Name) @@ -243,24 +241,33 @@ func (f *Fluence) PreFilterExtensions() framework.PreFilterExtensions { } // AskFlux will ask flux for an allocation for nodes for the pod group. -func (f *Fluence) AskFlux(ctx context.Context, pod *v1.Pod, count int) error { +func (f *Fluence) AskFlux( + ctx context.Context, + pod *v1.Pod, + pg *sched.PodGroup, + groupName string, +) error { + // clean up previous match if a pod has already allocated previously f.mutex.Lock() - _, isPodAllocated := f.podNameToJobId[pod.Name] + _, isAllocated := f.groupToJobId[groupName] f.mutex.Unlock() - if isPodAllocated { - klog.Infof("[Fluence] Pod %s is allocated, cleaning up previous allocation\n", pod.Name) - f.mutex.Lock() - f.cancelFluxJobForPod(pod) - f.mutex.Unlock() + // Not allowing cancel for now - not sure how or why we could do this, need to better + // understand the case. This function should ONLY be successful on a new match allocate, + // otherwise the calling logic does not make sense. + if isAllocated { + return fmt.Errorf("[Fluence] Pod %s in group %s is allocated and calling AskFlux, should we be here?\n", pod.Name, groupName) } - // Does the task name here matter? We are naming the entire group for the pod - jobspec := utils.InspectPodInfo(pod) + // IMPORTANT: this is a JobSpec for *one* pod, assuming they are all the same. + // This obviously may not be true if we have a hetereogenous PodGroup. + // We name it based on the group, since it will represent the group + jobspec := utils.PreparePodJobSpec(pod, groupName) klog.Infof("[Fluence] Inspect pod info, jobspec: %s\n", jobspec) conn, err := grpc.Dial("127.0.0.1:4242", grpc.WithInsecure()) + // TODO change this to just return fmt.Errorf if err != nil { klog.Errorf("[Fluence] Error connecting to server: %v\n", err) return err @@ -274,154 +281,34 @@ func (f *Fluence) AskFlux(ctx context.Context, pod *v1.Pod, count int) error { request := &pb.MatchRequest{ Ps: jobspec, Request: "allocate", - Count: int32(count)} + Count: pg.Spec.MinMember, + } - // Question from vsoch; Why return err instead of err2 here? 
- // err would return a nil value, but we need to return non nil, - // otherwise it's going to try to use the allocation (but there is none) + // An error here is an error with making the request r, err := grpcclient.Match(context.Background(), request) if err != nil { klog.Errorf("[Fluence] did not receive any match response: %v\n", err) return err } - klog.Infof("[Fluence] response podID %s\n", r.GetPodID()) - - // Presence of a podGroup is indicated by a groupName - // Flag that the group is allocated (yes we also have the job id, testing for now) - pg := fgroup.GetPodsGroup(pod) + // TODO GetPodID should be renamed, because it will reflect the group + klog.Infof("[Fluence] Match response ID %s\n", r.GetPodID()) // Get the nodelist and inspect nodes := r.GetNodelist() klog.Infof("[Fluence] Nodelist returned from Fluxion: %s\n", nodes) - nodelist := fcore.CreateNodePodsList(nodes, pg.Name) - klog.Infof("[Fluence] parsed node pods list %s\n", nodelist) + // Assign the nodelist - this sets the group name in the groupSeen cache + // at this point, we can retrieve the cache and get nodes + nodelist := fcore.CreateNodeList(nodes, groupName) + jobid := uint64(r.GetJobID()) + klog.Infof("[Fluence] parsed node pods list %s for job id %d\n", nodelist, jobid) + // TODO would be nice to actually be able to ask flux jobs -a to fluence + // That way we can verify assignments, etc. f.mutex.Lock() - f.podNameToJobId[pod.Name] = jobid - klog.Infof("[Fluence] Check job assignment: %s\n", f.podNameToJobId) + f.groupToJobId[groupName] = jobid f.mutex.Unlock() return nil } - -// cancelFluxJobForPod cancels the flux job for a pod. -// We assume that the cancelled job also means deleting the pod group -func (f *Fluence) cancelFluxJobForPod(pod *v1.Pod) error { - jobid := f.podNameToJobId[pod.Name] - - klog.Infof("[Fluence] Cancel flux job: %v for pod %s", jobid, pod.Name) - - conn, err := grpc.Dial("127.0.0.1:4242", grpc.WithInsecure()) - - if err != nil { - klog.Errorf("[Fluence] Error connecting to server: %v", err) - return err - } - defer conn.Close() - - grpcclient := pb.NewFluxcliServiceClient(conn) - _, cancel := context.WithTimeout(context.Background(), 200*time.Second) - defer cancel() - - // I think this error reflects the success or failure of the cancel request - request := &pb.CancelRequest{JobID: int64(jobid)} - res, err := grpcclient.Cancel(context.Background(), request) - if err != nil { - klog.Errorf("[Fluence] did not receive any cancel response: %v", err) - return err - } - klog.Infof("[Fluence] Job cancellation for pod %s result: %d", pod.Name, res.Error) - - // And this error is if the cancel was successful or not - if res.Error == 0 { - klog.Infof("[Fluence] Successful cancel of flux job: %v for pod %s", jobid, pod.Name) - delete(f.podNameToJobId, pod.Name) - - // If we are successful, clear the group allocated nodes - fgroup.DeleteFluenceGroup(pod) - } else { - klog.Warningf("[Fluence] Failed to cancel flux job %v for pod %s", jobid, pod.Name) - } - return nil -} - -// EventHandlers updatePod handles cleaning up resources -func (f *Fluence) updatePod(oldObj, newObj interface{}) { - - oldPod := oldObj.(*v1.Pod) - newPod := newObj.(*v1.Pod) - - klog.Infof("[Fluence] Processing event for pod %s from %s to %s", newPod.Name, newPod.Status.Phase, oldPod.Status.Phase) - - switch newPod.Status.Phase { - case v1.PodPending: - // in this state we don't know if a pod is going to be running, thus we don't need to update job map - case v1.PodRunning: - // if a pod is start running, we can add it 
state to the delta graph if it is scheduled by other scheduler - case v1.PodSucceeded: - klog.Infof("[Fluence] Pod %s succeeded, Fluence needs to free the resources", newPod.Name) - - f.mutex.Lock() - defer f.mutex.Unlock() - - if _, ok := f.podNameToJobId[newPod.Name]; ok { - f.cancelFluxJobForPod(newPod) - } else { - klog.Infof("[Fluence] Succeeded pod %s/%s doesn't have flux jobid", newPod.Namespace, newPod.Name) - } - case v1.PodFailed: - // a corner case need to be tested, the pod exit code is not 0, can be created with segmentation fault pi test - klog.Warningf("[Fluence] Pod %s failed, Fluence needs to free the resources", newPod.Name) - - f.mutex.Lock() - defer f.mutex.Unlock() - - if _, ok := f.podNameToJobId[newPod.Name]; ok { - f.cancelFluxJobForPod(newPod) - } else { - klog.Errorf("[Fluence] Failed pod %s/%s doesn't have flux jobid", newPod.Namespace, newPod.Name) - } - case v1.PodUnknown: - // don't know how to deal with it as it's unknown phase - default: - // shouldn't enter this branch - } -} - -// deletePod handles the delete event handler -// TODO when should we clear group from the cache? -func (f *Fluence) deletePod(podObj interface{}) { - klog.Info("[Fluence] Delete Pod event handler") - - pod := podObj.(*v1.Pod) - klog.Infof("[Fluence] Delete pod has status %s", pod.Status.Phase) - switch pod.Status.Phase { - case v1.PodSucceeded: - case v1.PodPending: - klog.Infof("[Fluence] Pod %s completed and is Pending termination, Fluence needs to free the resources", pod.Name) - - f.mutex.Lock() - defer f.mutex.Unlock() - - if _, ok := f.podNameToJobId[pod.Name]; ok { - f.cancelFluxJobForPod(pod) - } else { - klog.Infof("[Fluence] Terminating pod %s/%s doesn't have flux jobid", pod.Namespace, pod.Name) - } - case v1.PodRunning: - f.mutex.Lock() - defer f.mutex.Unlock() - - if _, ok := f.podNameToJobId[pod.Name]; ok { - f.cancelFluxJobForPod(pod) - } else { - klog.Infof("[Fluence] Deleted pod %s/%s doesn't have flux jobid", pod.Namespace, pod.Name) - } - } - - // We assume that a request to delete one pod means all of them. - // We have to take an all or nothing approach for now - fgroup.DeleteFluenceGroup(pod) -} diff --git a/sig-scheduler-plugins/pkg/fluence/group/group.go b/sig-scheduler-plugins/pkg/fluence/group/group.go index 4af84e2..455b9e5 100644 --- a/sig-scheduler-plugins/pkg/fluence/group/group.go +++ b/sig-scheduler-plugins/pkg/fluence/group/group.go @@ -1,103 +1,23 @@ package group import ( - "fmt" - "strconv" - - v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/klog/v2" + klog "k8s.io/klog/v2" "k8s.io/kubernetes/pkg/scheduler/framework" - fcore "sigs.k8s.io/scheduler-plugins/pkg/fluence/core" - "sigs.k8s.io/scheduler-plugins/pkg/fluence/labels" + sched "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1" ) -// getDefaultGroupName returns a group name based on the pod namespace and name -// We could do this for pods that are not labeled, and treat them as a size 1 group -func getDefaultGroupName(pod *v1.Pod) string { - return fmt.Sprintf("%s-%s", pod.Namespace, pod.Name) -} - -// getPodsGroup gets the pods group, if it exists. 
-func GetPodsGroup(pod *v1.Pod) *fcore.PodGroupCache { - groupName := EnsureFluenceGroup(pod) - return fcore.GetPodGroup(groupName) -} - -// GetGroup is a courtesy wrapper around fcore.GetPodGroup -func GetGroup(groupName string) *fcore.PodGroupCache { - return fcore.GetPodGroup(groupName) -} - -// ensureFluenceGroup ensure that a podGroup is created for the named fluence group -// Preference goes to the traditional PodGroup (created by the user) -// and falls back to having one created by fluence. If there is no PodGroup -// created and no fluence annotation, we do not create the group. -// Likely for fluence we'd want a cleanup function somehow too, -// for now assume groups are unique by name. -func EnsureFluenceGroup(pod *v1.Pod) string { - - // Get the group name and size from the fluence labels - groupName := getFluenceGroupName(pod) - groupSize := getFluenceGroupSize(pod) - - // If there isn't a group, make a single node sized group - // This is so we can always treat the cases equally - if groupName == "" { - klog.Infof("[Fluence] Group annotation missing for pod %s", pod.Name) - groupName = getDefaultGroupName(pod) - } - klog.Infof("[Fluence] Group name for %s is %s", pod.Name, groupName) - klog.Infof("[Fluence] Group size for %s is %d", pod.Name, groupSize) - - // Register the pod group (with the pod) in our cache - fcore.RegisterPodGroup(pod, groupName, groupSize) - return groupName -} - -// deleteFluenceGroup ensures the pod group is deleted, if it exists -func DeleteFluenceGroup(pod *v1.Pod) { - // Get the group name and size from the fluence labels - pg := GetPodsGroup(pod) - fcore.DeletePodGroup(pg.Name) - klog.Infof("[Fluence] known groups are:\n") - fcore.ListGroups() -} - -// getFluenceGroupName looks for the group to indicate a fluence group, and returns it -func getFluenceGroupName(pod *v1.Pod) string { - groupName, _ := pod.Labels[labels.PodGroupLabel] - return groupName -} - -// getFluenceGroupSize gets the size of the fluence group -func getFluenceGroupSize(pod *v1.Pod) int32 { - size, _ := pod.Labels[labels.PodGroupSizeLabel] - - // Default size of 1 if the label is not set (but name is) - if size == "" { - return 1 - } - - // We don't want the scheduler to fail if someone puts a value for size - // that doesn't convert nicely. They can find this in the logs. - intSize, err := strconv.ParseUint(size, 10, 32) - if err != nil { - klog.Error(" [Fluence] Parsing integer size for pod group") - } - return int32(intSize) -} - // GetCreationTimestamp first tries the fluence group, then falls back to the initial attempt timestamp -func GetCreationTimestamp(groupName string, podInfo *framework.QueuedPodInfo) metav1.MicroTime { - pg := fcore.GetPodGroup(groupName) +// This is the only update we have made to the upstream PodGroupManager, because we are expecting +// a MicroTime and not a time.Time. +func GetCreationTimestamp(groupName string, pg *sched.PodGroup, podInfo *framework.QueuedPodInfo) metav1.MicroTime { // IsZero is an indicator if this was actually set // If the group label was present and we have a group, this will be true - if !pg.TimeCreated.IsZero() { - klog.Infof(" [Fluence] Pod group %s was created at %s\n", groupName, pg.TimeCreated) - return pg.TimeCreated + if !pg.Status.ScheduleStartTime.IsZero() { + klog.Infof(" [Fluence] Pod group %s was created at %s\n", groupName, pg.Status.ScheduleStartTime) + return pg.Status.ScheduleStartTime } // We should actually never get here. 
klog.Errorf(" [Fluence] Pod group %s time IsZero, we should not have reached here", groupName) diff --git a/sig-scheduler-plugins/pkg/fluence/utils/utils.go b/sig-scheduler-plugins/pkg/fluence/utils/utils.go index e384669..f2969d2 100644 --- a/sig-scheduler-plugins/pkg/fluence/utils/utils.go +++ b/sig-scheduler-plugins/pkg/fluence/utils/utils.go @@ -21,7 +21,7 @@ import ( "strings" v1 "k8s.io/api/core/v1" - "k8s.io/klog/v2" + klog "k8s.io/klog/v2" pb "sigs.k8s.io/scheduler-plugins/pkg/fluence/fluxcli-grpc" ) @@ -39,12 +39,14 @@ func getPodJobspecLabels(pod *v1.Pod) []string { return labels } -// InspectPodInfo takes a pod object and returns the pod.spec -// Note from vsoch - I updated this to calculate containers across the pod -// if that's wrong we can change it back. -func InspectPodInfo(pod *v1.Pod) *pb.PodSpec { +// PreparePodJobSpec takes a pod object and returns the jobspec +// The jobspec is based on the pod, and assumes it will be duplicated +// for a MatchAllocate request (representing all pods). We name the +// jobspec based on the group and not the individual ID. +// This calculates across containers in the od +func PreparePodJobSpec(pod *v1.Pod, groupName string) *pb.PodSpec { ps := new(pb.PodSpec) - ps.Id = pod.Name + ps.Id = groupName // Note from vsoch - there was an if check here to see if we had labels, // I don't think there is risk to adding an empty list but we can add diff --git a/src/fluence/fluxion/fluxion.go b/src/fluence/fluxion/fluxion.go index 5775199..05e94fa 100644 --- a/src/fluence/fluxion/fluxion.go +++ b/src/fluence/fluxion/fluxion.go @@ -8,7 +8,7 @@ import ( "github.com/flux-framework/flux-k8s/flux-plugin/fluence/jobspec" "github.com/flux-framework/flux-k8s/flux-plugin/fluence/utils" "github.com/flux-framework/fluxion-go/pkg/fluxcli" - "k8s.io/klog/v2" + klog "k8s.io/klog/v2" "context" "errors" From 8e0b4613f574521a0a4d75b0c49878656e04ef7a Mon Sep 17 00:00:00 2001 From: vsoch Date: Sun, 18 Feb 2024 21:40:05 -0700 Subject: [PATCH 16/28] feat: podgroup deletion when finished/failed Problem: Since the PodGroup controller creates the PodGroup, it should delete it as well. Solution: Ideally I wanted to attach an owner reference, meaning that the top level job (that also owns the pod) would be owner to the PodGroup. But that does not seem to take - either because the controller is the owner or the field is read only for k8s. For the time being, I decided to delete the PodGroup when the group is determined to be Finished/Failed, which happens when that number of pods equals or exceeds the MinimumSize. I think granted that MinimumSize == size this should be OK with fluence, and we might need to consider other approaches if/when the min size is smaller than the total size (because fluence might still see a pod in the queue and try to schedule again. I think what we might do in that case is just update the MinSize for the group, so if fluence schedules again it will be for the smaller size. But not sure about that either! TBA. The important thing now is that the pod group cleans itself up! 
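One way to watch this cleanup in practice, assuming the PodGroup CRD is registered under the upstream `scheduling.x-k8s.io` API group (as the labels used earlier suggest), is to follow the PodGroup objects while a fluence job runs:

```bash
# Watch PodGroups appear, move through phases, and get deleted once Finished or Failed
kubectl get podgroups.scheduling.x-k8s.io --all-namespaces --watch

# Dump full status (phase, MinMember, running/succeeded/failed counts) for the groups that exist
kubectl get podgroups.scheduling.x-k8s.io -o yaml
```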
Signed-off-by: vsoch --- examples/test_example/fluence-sized-job.yaml | 16 ++++ .../pkg/controllers/podgroup_controller.go | 89 +++++++++++++------ 2 files changed, 80 insertions(+), 25 deletions(-) create mode 100644 examples/test_example/fluence-sized-job.yaml diff --git a/examples/test_example/fluence-sized-job.yaml b/examples/test_example/fluence-sized-job.yaml new file mode 100644 index 0000000..a195d87 --- /dev/null +++ b/examples/test_example/fluence-sized-job.yaml @@ -0,0 +1,16 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: fluence-sized-job +spec: + parallelism: 3 + completions: 3 + template: + spec: + schedulerName: fluence + containers: + - name: fluence-job + image: busybox + command: [echo, potato] + restartPolicy: Never + backoffLimit: 4 diff --git a/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go b/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go index 72bda77..fa4593c 100644 --- a/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go +++ b/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go @@ -125,60 +125,81 @@ func (r *PodGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c // to account for if the labels are different, do we take the smallest? log.Info("PodGroup", "Status", fmt.Sprintf("WARNING: Pod group current MinMember %s does not match %d", pg.Spec.MinMember, size)) } + return r.updateStatus(ctx, pg, podList.Items) - // If we get here, we have a PodGroup with a set size and can inspect / update phase - pods := podList.Items - pgCopy := pg.DeepCopy() +} +func (r *PodGroupReconciler) updateStatus( + ctx context.Context, + pg *schedv1alpha1.PodGroup, + pods []v1.Pod, +) (ctrl.Result, error) { - switch pgCopy.Status.Phase { + patch := client.MergeFrom(pg.DeepCopy()) + + switch pg.Status.Phase { case "": - pgCopy.Status.Phase = schedv1alpha1.PodGroupPending + pg.Status.Phase = schedv1alpha1.PodGroupPending + result, err := r.updateOwnerReferences(ctx, pg, &pods[0]) + if result.Requeue || err != nil { + return result, err + } + case schedv1alpha1.PodGroupPending: if len(pods) >= int(pg.Spec.MinMember) { - pgCopy.Status.Phase = schedv1alpha1.PodGroupScheduling - - // Always update owner references to be the first pod - // E.g., if a job owns it, ensure the group is deleted with it - updateOwnerReferences(pgCopy, &pods[0]) + pg.Status.Phase = schedv1alpha1.PodGroupScheduling + result, err := r.updateOwnerReferences(ctx, pg, &pods[0]) + if result.Requeue || err != nil { + return result, err + } } default: // Get updated counts of running, succeeded, and failed pods - pgCopy.Status.Running, pgCopy.Status.Succeeded, pgCopy.Status.Failed = getCurrentPodStats(pods) + running, succeeded, failed := getCurrentPodStats(pods) // If for some reason we weren't pending and now have fewer than min required, flip back to pending if len(pods) < int(pg.Spec.MinMember) { - pgCopy.Status.Phase = schedv1alpha1.PodGroupPending + pg.Status.Phase = schedv1alpha1.PodGroupPending break } // A pod with succeeded + running STILL less than the minimum required is scheduling - if pgCopy.Status.Succeeded+pgCopy.Status.Running < pg.Spec.MinMember { - pgCopy.Status.Phase = schedv1alpha1.PodGroupScheduling + if succeeded+running < pg.Spec.MinMember { + pg.Status.Phase = schedv1alpha1.PodGroupScheduling } // A pod with succeeded + running >= the minimum required is running! 
- if pgCopy.Status.Succeeded+pgCopy.Status.Running >= pg.Spec.MinMember { - pgCopy.Status.Phase = schedv1alpha1.PodGroupRunning + if succeeded+running >= pg.Spec.MinMember { + pg.Status.Phase = schedv1alpha1.PodGroupRunning } // We have non zero failed, and the total of failed, running amd succeeded > min member // Final state of pod group is FAILED womp womp - if pgCopy.Status.Failed != 0 && - pgCopy.Status.Failed+pgCopy.Status.Running+pgCopy.Status.Succeeded >= pg.Spec.MinMember { - pgCopy.Status.Phase = schedv1alpha1.PodGroupFailed + if failed != 0 && failed+running+succeeded >= pg.Spec.MinMember { + pg.Status.Phase = schedv1alpha1.PodGroupFailed } // Finished! This is where we want to get :) // TODO: ideally the owning higher level object deletion will delete here, // but that won't always work for one of pods - need a new strategy - if pgCopy.Status.Succeeded >= pg.Spec.MinMember { - pgCopy.Status.Phase = schedv1alpha1.PodGroupFinished + if succeeded >= pg.Spec.MinMember { + pg.Status.Phase = schedv1alpha1.PodGroupFinished } + pg.Status.Running = running + pg.Status.Failed = failed + pg.Status.Succeeded = succeeded } - // TODO need better handling here of cleanup, etc. This mostly handles status changes - return r.patchPodGroup(ctx, pg, pgCopy) + // Apply the patch to update, or delete if finished + // TODO would be better if owner references took here, so delete on owner deletion + var err error + if pg.Status.Phase == schedv1alpha1.PodGroupFinished || pg.Status.Phase == schedv1alpha1.PodGroupFailed { + err = r.Delete(ctx, pg) + } else { + r.Status().Update(ctx, pg) + err = r.Patch(ctx, pg, patch) + } + return ctrl.Result{Requeue: true}, err } // newPodGroup creates a new podGroup object, capturing the creation time @@ -273,21 +294,37 @@ func getCurrentPodStats(pods []v1.Pod) (int32, int32, int32) { // updateOwnerReferences ensures the group is always owned by the same entity that owns the pod // This ensures that, for example, a job that is wrapping pods is the owner. -func updateOwnerReferences(pg *schedv1alpha1.PodGroup, pod *v1.Pod) { +func (r *PodGroupReconciler) updateOwnerReferences( + ctx context.Context, + pg *schedv1alpha1.PodGroup, + pod *v1.Pod, +) (ctrl.Result, error) { // Case 1: The pod itself doesn't have owner references. YOLO if len(pod.OwnerReferences) == 0 { - return + return ctrl.Result{}, nil } + // Collect owner references for pod group + owners := []metav1.OwnerReference{} var refs []string for _, ownerRef := range pod.OwnerReferences { refs = append(refs, fmt.Sprintf("%s/%s", pod.Namespace, ownerRef.Name)) + owners = append(owners, ownerRef) } + patch := client.MergeFrom(pg.DeepCopy()) if len(refs) != 0 { sort.Strings(refs) pg.Status.OccupiedBy = strings.Join(refs, ",") } + if len(owners) > 0 { + pg.ObjectMeta.OwnerReferences = owners + } + // Apply the patch to update the size + r.Status().Update(ctx, pg) + err := r.Patch(ctx, pg, patch) + return ctrl.Result{Requeue: true}, err + } // SetupWithManager sets up the controller with the Manager. @@ -346,6 +383,8 @@ func (r *PodGroupReconciler) ensurePodGroup(ctx context.Context, obj client.Obje if apierrs.IsNotFound(err) { r.log.Info("Pod: ", "Status", pod.Status.Phase, "Name", pod.Name, "Group", groupName, "Namespace", pod.Namespace, "Action", "Creating PodGroup") + //owner := r.getOwnerMetadata(pod) + // TODO should an owner be set here? Setting to a specific pod seems risky/wrong in case deleted. 
err, _ := r.newPodGroup(ctx, groupName, pod.Namespace, int32(groupSize)) if err != nil { From 68815a5a3059b1f2f6e0dc4190093b012af92d1e Mon Sep 17 00:00:00 2001 From: vsoch Date: Sun, 18 Feb 2024 23:56:36 -0700 Subject: [PATCH 17/28] feat: add support for other abstractions Problem: we need to be able to run deployments, stateful/replica sets and have them handled by fluence. Solution: allow the webhook to create pod groups for them. In the case they are not targeted for fluence (any abstraction) and get into the PreFilter, allow creation of a FauxPodGroup that will simply schedule one job for the pod. We do this twice - in PreFilter and in the events for update/delete. Signed-off-by: vsoch --- README.md | 8 +- .../simple_example/fluence-deployment.yaml | 19 ++ .../simple_example/fluence-replicaset.yaml | 21 ++ .../simple_example/fluence-statefulset.yaml | 21 ++ .../scheduling/v1alpha1/podgroup_webhook.go | 224 ++++++++++++++---- .../mutating-webhook-configuration.yaml | 5 + .../pkg/controllers/podgroup_controller.go | 4 +- sig-scheduler-plugins/pkg/fluence/events.go | 17 +- sig-scheduler-plugins/pkg/fluence/fluence.go | 41 ++-- .../pkg/fluence/group/group.go | 21 ++ 10 files changed, 312 insertions(+), 69 deletions(-) create mode 100644 examples/simple_example/fluence-deployment.yaml create mode 100644 examples/simple_example/fluence-replicaset.yaml create mode 100644 examples/simple_example/fluence-statefulset.yaml diff --git a/README.md b/README.md index 8556dd1..8922078 100644 --- a/README.md +++ b/README.md @@ -86,7 +86,10 @@ spec: backoffLimit: 4 ``` -There is no reason pods with different names or under different abstractions cannot be part of the same group that needs to be scheduled together. +There is no reason pods with different names or under different abstractions cannot be part of the same group that needs to be scheduled together. Also note that: + +- We currently do not allow scheduling to a control plane +- Deployments, StatefulSets, and ReplicaSets can be scheduled and have pod groups created, however the pod groups are not cleaned up as these abstractions are not meant to complete. ### Deploy @@ -520,7 +523,8 @@ kind create cluster --config ./kind-config.yaml #### TODO - Try what [kueue does](https://github.com/kubernetes-sigs/kueue/blob/6d57813a52066dab412735deeeb60ebb0cdb8e8e/cmd/kueue/main.go#L146-L155) to not require cert-manager. 
- - Add other abstraction types to be intercepted (and labeled with sizes) + - Try other strategies for setting owner references (so cleans up when owner deleted) + - When that is done, add tests for deletion of pod group (the current method is not perfect and needs improvement) #### Components diff --git a/examples/simple_example/fluence-deployment.yaml b/examples/simple_example/fluence-deployment.yaml new file mode 100644 index 0000000..9eb6cef --- /dev/null +++ b/examples/simple_example/fluence-deployment.yaml @@ -0,0 +1,19 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: example-deployment +spec: + selector: + matchLabels: + app: example-deployment + replicas: 3 + template: + metadata: + labels: + app: example-deployment + spec: + schedulerName: fluence + containers: + - name: example + image: rockylinux:9 + command: ["sleep", "infinity"] \ No newline at end of file diff --git a/examples/simple_example/fluence-replicaset.yaml b/examples/simple_example/fluence-replicaset.yaml new file mode 100644 index 0000000..f00e826 --- /dev/null +++ b/examples/simple_example/fluence-replicaset.yaml @@ -0,0 +1,21 @@ +apiVersion: apps/v1 +kind: ReplicaSet +metadata: + name: example-replicaset + labels: + app: example-replicaset +spec: + replicas: 3 + selector: + matchLabels: + app: example-replicaset + template: + metadata: + labels: + app: example-replicaset + spec: + schedulerName: fluence + containers: + - name: example + image: rockylinux:9 + command: ["sleep", "infinity"] \ No newline at end of file diff --git a/examples/simple_example/fluence-statefulset.yaml b/examples/simple_example/fluence-statefulset.yaml new file mode 100644 index 0000000..80da82a --- /dev/null +++ b/examples/simple_example/fluence-statefulset.yaml @@ -0,0 +1,21 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: example-statefulset + labels: + app: example-statefulset +spec: + replicas: 3 + selector: + matchLabels: + app: example-statefulset + template: + metadata: + labels: + app: example-statefulset + spec: + schedulerName: fluence + containers: + - name: example + image: rockylinux:9 + command: ["sleep", "infinity"] \ No newline at end of file diff --git a/sig-scheduler-plugins/apis/scheduling/v1alpha1/podgroup_webhook.go b/sig-scheduler-plugins/apis/scheduling/v1alpha1/podgroup_webhook.go index bc99fe4..c2582f9 100644 --- a/sig-scheduler-plugins/apis/scheduling/v1alpha1/podgroup_webhook.go +++ b/sig-scheduler-plugins/apis/scheduling/v1alpha1/podgroup_webhook.go @@ -15,6 +15,7 @@ import ( "fmt" "net/http" + appsv1 "k8s.io/api/apps/v1" batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" runtime "k8s.io/apimachinery/pkg/runtime" @@ -51,83 +52,127 @@ type fluenceWatcher struct { // not be added again). func (a *fluenceWatcher) Handle(ctx context.Context, req admission.Request) admission.Response { - logger.Info("Running webhook handle") + logger.Info("Running webhook handle, determining pod wrapper abstraction...") - // Try for a job first, which would be created before pods job := &batchv1.Job{} err := a.decoder.Decode(req, job) - if err != nil { - - // Assume we operate on the level of pods for now - pod := &corev1.Pod{} - err := a.decoder.Decode(req, pod) - - // Assume it's a pod group or something else. - // We aren't in charge of validating people's pods. 
- // I don't think we should ever hit this case, actually + if err == nil { + err = a.EnsureGroupOnJob(job) + if err != nil { + logger.Error(err, "Issue adding PodGroup to Job") + return admission.Errored(http.StatusBadRequest, err) + } + marshalledJob, err := json.Marshal(job) if err != nil { - return admission.Allowed("Found non-pod, non-job, this webhook does not validate beyond those.") + logger.Error(err, "Marshalling job error.") + return admission.Errored(http.StatusInternalServerError, err) } + logger.Info("Admission job success.") + return admission.PatchResponseFromRaw(req.Object.Raw, marshalledJob) + } - // If we get here, we decoded a pod + pod := &corev1.Pod{} + err = a.decoder.Decode(req, pod) + if err == nil { err = a.EnsureGroup(pod) if err != nil { - logger.Error(err, "Issue adding PodGroup to pod.") + logger.Error(err, "Issue adding PodGroup to Pod") return admission.Errored(http.StatusBadRequest, err) } - - // Send the updated pod to the events channel - //*a.events <- event.GenericEvent{Object: pod} - logger.Info("Admission pod success.") - marshalledPod, err := json.Marshal(pod) if err != nil { - logger.Error(err, "Marshalling pod error.") + logger.Error(err, "Marshalling pod error") return admission.Errored(http.StatusInternalServerError, err) } - - logger.Info("Admission job success.") + logger.Info("Admission pod success") return admission.PatchResponseFromRaw(req.Object.Raw, marshalledPod) } - // If we get here, err was nil and we have a Job! - err = a.EnsureGroupOnJob(job) - if err != nil { - logger.Error(err, "Issue adding PodGroup to job.") - return admission.Errored(http.StatusBadRequest, err) + set := &appsv1.StatefulSet{} + err = a.decoder.Decode(req, set) + if err == nil { + err = a.EnsureGroupStatefulSet(set) + if err != nil { + logger.Error(err, "Issue adding PodGroup to StatefulSet") + return admission.Errored(http.StatusBadRequest, err) + } + marshalledSet, err := json.Marshal(set) + if err != nil { + logger.Error(err, "Marshalling StatefulSet error") + return admission.Errored(http.StatusInternalServerError, err) + } + logger.Info("Admission StatefulSet success") + return admission.PatchResponseFromRaw(req.Object.Raw, marshalledSet) } - // Send the updated job to the events channel - //*a.events <- event.GenericEvent{Object: job} - logger.Info("Admission job success.") + d := &appsv1.Deployment{} + err = a.decoder.Decode(req, d) + if err == nil { + err = a.EnsureGroupDeployment(d) + if err != nil { + logger.Error(err, "Issue adding PodGroup to Deployment") + return admission.Errored(http.StatusBadRequest, err) + } + marshalledD, err := json.Marshal(d) + if err != nil { + logger.Error(err, "Marshalling Deployment error") + return admission.Errored(http.StatusInternalServerError, err) + } + logger.Info("Admission Deployment success") + return admission.PatchResponseFromRaw(req.Object.Raw, marshalledD) + } - marshalledJob, err := json.Marshal(job) - if err != nil { - logger.Error(err, "Marshalling job error.") - return admission.Errored(http.StatusInternalServerError, err) + rset := &appsv1.ReplicaSet{} + err = a.decoder.Decode(req, rset) + if err == nil { + err = a.EnsureGroupReplicaSet(rset) + if err != nil { + logger.Error(err, "Issue adding PodGroup to ReplicaSet") + return admission.Errored(http.StatusBadRequest, err) + } + marshalledSet, err := json.Marshal(rset) + if err != nil { + logger.Error(err, "Marshalling StatefulSet error") + return admission.Errored(http.StatusInternalServerError, err) + } + logger.Info("Admission StatefulSet success") + 
return admission.PatchResponseFromRaw(req.Object.Raw, marshalledSet) } - logger.Info("Admission job success.") - return admission.PatchResponseFromRaw(req.Object.Raw, marshalledJob) + // We should not get down here + return admission.Allowed("Object not known, this webhook does not validate beyond those.") + } // Default is the expected entrypoint for a webhook... // I don't remember if this is even called... func (a *fluenceWatcher) Default(ctx context.Context, obj runtime.Object) error { - job, ok := obj.(*batchv1.Job) - if !ok { - pod, ok := obj.(*corev1.Pod) - // This is adkin to an admission success - it's not a pod or job, so we don't care - // I don't think we should ever hit this case, actually - if !ok { - return nil - } - logger.Info(fmt.Sprintf("Pod %s is marked for fluence.", pod.Name)) + switch obj.(type) { + case *batchv1.Job: + job := obj.(*batchv1.Job) + return a.EnsureGroupOnJob(job) + + case *corev1.Pod: + pod := obj.(*corev1.Pod) return a.EnsureGroup(pod) + + case *appsv1.StatefulSet: + set := obj.(*appsv1.StatefulSet) + return a.EnsureGroupStatefulSet(set) + + case *appsv1.Deployment: + d := obj.(*appsv1.Deployment) + return a.EnsureGroupDeployment(d) + + case *appsv1.ReplicaSet: + set := obj.(*appsv1.ReplicaSet) + return a.EnsureGroupReplicaSet(set) + + default: + // no match } - logger.Info(fmt.Sprintf("Job %s is marked for fluence.", job.Name)) - return a.EnsureGroupOnJob(job) + return nil } // EnsureGroup adds pod group label and size if not present @@ -205,3 +250,88 @@ func (a *fluenceWatcher) EnsureGroupOnJob(job *batchv1.Job) error { job.Spec.Template.ObjectMeta.Labels[labels.PodGroupSizeLabel] = groupSize return nil } + +// EnsureGroupStatefulSet creates a PodGroup for a StatefulSet +func (a *fluenceWatcher) EnsureGroupStatefulSet(set *appsv1.StatefulSet) error { + + // StatefulSet requires on top level explicitly + if set.Labels == nil { + set.Labels = map[string]string{} + } + defaultName := fmt.Sprintf("fluence-group-%s-%s", set.Namespace, set.Name) + groupName, ok := set.Labels[labels.PodGroupLabel] + if !ok { + groupName = defaultName + } + set.Spec.Template.ObjectMeta.Labels[labels.PodGroupLabel] = groupName + + // Now do the same for the size, but the size is the size of the job + size := *set.Spec.Replicas + if size == int32(0) { + size = int32(1) + } + labelSize := fmt.Sprintf("%d", size) + groupSize, ok := set.Labels[labels.PodGroupSizeLabel] + if !ok { + groupSize = labelSize + } + set.Spec.Template.ObjectMeta.Labels[labels.PodGroupSizeLabel] = groupSize + return nil +} + +// EnsureGroupStatefulSet creates a PodGroup for a StatefulSet +func (a *fluenceWatcher) EnsureGroupReplicaSet(set *appsv1.ReplicaSet) error { + + // StatefulSet requires on top level explicitly + if set.Labels == nil { + set.Labels = map[string]string{} + } + defaultName := fmt.Sprintf("fluence-group-%s-%s", set.Namespace, set.Name) + groupName, ok := set.Labels[labels.PodGroupLabel] + if !ok { + groupName = defaultName + } + set.Spec.Template.ObjectMeta.Labels[labels.PodGroupLabel] = groupName + + // Now do the same for the size, but the size is the size of the job + size := *set.Spec.Replicas + if size == int32(0) { + size = int32(1) + } + labelSize := fmt.Sprintf("%d", size) + groupSize, ok := set.Labels[labels.PodGroupSizeLabel] + if !ok { + groupSize = labelSize + } + set.Spec.Template.ObjectMeta.Labels[labels.PodGroupSizeLabel] = groupSize + return nil +} + +// EnsureGroupDeployment creates a PodGroup for a Deployment +// This is redundant, can refactor later +func (a 
*fluenceWatcher) EnsureGroupDeployment(d *appsv1.Deployment) error { + + // StatefulSet requires on top level explicitly + if d.Labels == nil { + d.Labels = map[string]string{} + } + defaultName := fmt.Sprintf("fluence-group-%s-%s", d.Namespace, d.Name) + groupName, ok := d.Labels[labels.PodGroupLabel] + if !ok { + groupName = defaultName + } + d.Spec.Template.ObjectMeta.Labels[labels.PodGroupLabel] = groupName + + // Now do the same for the size, but the size is the size of the job + size := *d.Spec.Replicas + if size == int32(0) { + size = int32(1) + } + labelSize := fmt.Sprintf("%d", size) + groupSize, ok := d.Labels[labels.PodGroupSizeLabel] + if !ok { + groupSize = labelSize + } + d.Spec.Template.ObjectMeta.Labels[labels.PodGroupSizeLabel] = groupSize + return nil +} diff --git a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/mutating-webhook-configuration.yaml b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/mutating-webhook-configuration.yaml index c639127..edbe7f0 100644 --- a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/mutating-webhook-configuration.yaml +++ b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/mutating-webhook-configuration.yaml @@ -26,6 +26,7 @@ webhooks: - apiGroups: - "" - core + - apps - batch - scheduling.x-k8s.io apiVersions: @@ -36,6 +37,10 @@ webhooks: resources: - pods - jobs + - statefulsets + - deployments + - replicasets + # Can uncomment this if we want to mutate the pod groups after creation # - podgroups sideEffects: None diff --git a/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go b/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go index fa4593c..ee267bd 100644 --- a/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go +++ b/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go @@ -123,7 +123,7 @@ func (r *PodGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c // TODO: Not clear what to do here. Arguably, we also want to check the label size // because (in the future) we can accept smaller sizes. But then we also need // to account for if the labels are different, do we take the smallest? - log.Info("PodGroup", "Status", fmt.Sprintf("WARNING: Pod group current MinMember %s does not match %d", pg.Spec.MinMember, size)) + log.Info("PodGroup", "Status", fmt.Sprintf("WARNING: Pod group current MinMember %d does not match %d", pg.Spec.MinMember, size)) } return r.updateStatus(ctx, pg, podList.Items) @@ -192,6 +192,8 @@ func (r *PodGroupReconciler) updateStatus( // Apply the patch to update, or delete if finished // TODO would be better if owner references took here, so delete on owner deletion + // TODO deletion is not currently handled for Deployment, ReplicaSet, StatefulSet + // as they are expected to persist. 
You can delete / lose and bring up again var err error if pg.Status.Phase == schedv1alpha1.PodGroupFinished || pg.Status.Phase == schedv1alpha1.PodGroupFailed { err = r.Delete(ctx, pg) diff --git a/sig-scheduler-plugins/pkg/fluence/events.go b/sig-scheduler-plugins/pkg/fluence/events.go index bc265f7..395517a 100644 --- a/sig-scheduler-plugins/pkg/fluence/events.go +++ b/sig-scheduler-plugins/pkg/fluence/events.go @@ -9,6 +9,7 @@ import ( klog "k8s.io/klog/v2" pb "sigs.k8s.io/scheduler-plugins/pkg/fluence/fluxcli-grpc" + fgroup "sigs.k8s.io/scheduler-plugins/pkg/fluence/group" ) // Events are associated with inforers, typically on pods, e.g., @@ -69,7 +70,13 @@ func (f *Fluence) updatePod(oldObj, newObj interface{}) { // a pod is updated, get the group // TODO should we be checking group / size for old vs new? - groupName, _ := f.pgMgr.GetPodGroup(context.TODO(), oldPod) + groupName, pg := f.pgMgr.GetPodGroup(context.TODO(), oldPod) + + // If PodGroup is nil, still try to look up a faux name + if pg == nil { + pg = fgroup.CreateFakeGroup(oldPod) + groupName = pg.Name + } klog.Infof("[Fluence] Processing event for pod %s in group %s from %s to %s", newPod.Name, groupName, newPod.Status.Phase, oldPod.Status.Phase) @@ -119,7 +126,13 @@ func (f *Fluence) updatePod(oldObj, newObj interface{}) { func (f *Fluence) deletePod(podObj interface{}) { klog.Info("[Fluence] Delete Pod event handler") pod := podObj.(*v1.Pod) - groupName, _ := f.pgMgr.GetPodGroup(context.TODO(), pod) + groupName, pg := f.pgMgr.GetPodGroup(context.TODO(), pod) + + // If PodGroup is nil, still try to look up a faux name + if pg == nil { + pg = fgroup.CreateFakeGroup(pod) + groupName = pg.Name + } klog.Infof("[Fluence] Delete pod %s in group %s has status %s", pod.Status.Phase, pod.Name, groupName) switch pod.Status.Phase { diff --git a/sig-scheduler-plugins/pkg/fluence/fluence.go b/sig-scheduler-plugins/pkg/fluence/fluence.go index 0e8ec21..8cdc066 100644 --- a/sig-scheduler-plugins/pkg/fluence/fluence.go +++ b/sig-scheduler-plugins/pkg/fluence/fluence.go @@ -165,30 +165,37 @@ func (f *Fluence) PreFilter( // groupName will be named according to the single pod namespace / pod if there wasn't // a user defined group. This is a size 1 group we handle equivalently. groupName, pg := f.pgMgr.GetPodGroup(ctx, pod) - klog.Infof("[Fluence] Pod %s is in group %s with minimum members %d", pod.Name, groupName, pg.Spec.MinMember) - // Has this podgroup been seen by fluence yet? If yes, we will have it in the cache - cache := fcore.GetFluenceCache(groupName) - klog.Infof("[Fluence] cache %s", cache) - - // Fluence has never seen this before, we need to schedule an allocation - // It also could have been seen, but was not able to get one. - if cache == nil { - klog.Infof("[Fluence] Does not have nodes for %s yet, asking Fluxion", groupName) - - // groupName is the namespaced name / - err := f.AskFlux(ctx, pod, pg, groupName) + // Not scheduled by fluence - we have no idea about groups or sizes, just ask for one + if pg == nil { + klog.Infof("[Fluence] Unknown request to schedule %s yet, asking Fluxion for one node", pod.Name) + pg = fgroup.CreateFakeGroup(pod) + err := f.AskFlux(ctx, pod, pg, pg.Name) if err != nil { klog.Infof("[Fluence] Fluxion returned an error %s, not schedulable", err.Error()) return nil, framework.NewStatus(framework.Unschedulable, err.Error()) } + } else { + klog.Infof("[Fluence] Pod %s is in group %s with minimum members %d", pod.Name, groupName, pg.Spec.MinMember) + + // Has this podgroup been seen by fluence yet? 
If yes, we will have it in the cache + cache := fcore.GetFluenceCache(groupName) + klog.Infof("[Fluence] cache %s", cache) + + // Fluence has never seen this before, we need to schedule an allocation + // It also could have been seen, but was not able to get one. + if cache == nil { + klog.Infof("[Fluence] Does not have nodes for %s yet, asking Fluxion", groupName) + + // groupName is the namespaced name / + err := f.AskFlux(ctx, pod, pg, groupName) + if err != nil { + klog.Infof("[Fluence] Fluxion returned an error %s, not schedulable", err.Error()) + return nil, framework.NewStatus(framework.Unschedulable, err.Error()) + } + } } - // We can only get here if an allocation is done (and there is no error above) - // The cache would only originally be nil if we didn't do that yet. It should - // always be defined (not nil) when we get here - cache = fcore.GetFluenceCache(groupName) - // This is the next node in the list nodename, err := fcore.GetNextNode(groupName) if err != nil { diff --git a/sig-scheduler-plugins/pkg/fluence/group/group.go b/sig-scheduler-plugins/pkg/fluence/group/group.go index 455b9e5..0ee0831 100644 --- a/sig-scheduler-plugins/pkg/fluence/group/group.go +++ b/sig-scheduler-plugins/pkg/fluence/group/group.go @@ -1,6 +1,9 @@ package group import ( + "fmt" + + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" klog "k8s.io/klog/v2" "k8s.io/kubernetes/pkg/scheduler/framework" @@ -8,11 +11,29 @@ import ( sched "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1" ) +// CreateFakeGroup wraps an arbitrary pod in a fake group for fluence to schedule +// This happens only in PreFilter so we already sorted +func CreateFakeGroup(pod *corev1.Pod) *sched.PodGroup { + groupName := fmt.Sprintf("fluence-solo-%s-%s", pod.Namespace, pod.Name) + return &sched.PodGroup{ + ObjectMeta: metav1.ObjectMeta{ + Name: groupName, + Namespace: pod.Namespace, + }, + Spec: sched.PodGroupSpec{MinMember: int32(1)}, + } +} + // GetCreationTimestamp first tries the fluence group, then falls back to the initial attempt timestamp // This is the only update we have made to the upstream PodGroupManager, because we are expecting // a MicroTime and not a time.Time. func GetCreationTimestamp(groupName string, pg *sched.PodGroup, podInfo *framework.QueuedPodInfo) metav1.MicroTime { + // Don't try to get a time for a pod group that does not exist + if pg == nil { + return metav1.NewMicroTime(*podInfo.InitialAttemptTimestamp) + } + // IsZero is an indicator if this was actually set // If the group label was present and we have a group, this will be true if !pg.Status.ScheduleStartTime.IsZero() { From 0e472595f428876123fecc1e0d4c807f2b3edca6 Mon Sep 17 00:00:00 2001 From: vsoch Date: Mon, 19 Feb 2024 03:41:29 -0700 Subject: [PATCH 18/28] bug: the metav1.MicroTime was not being set Problem: I noticed in testing that the time only had granularity down to the second. Solution: It appears that when we do a create of the PodGroup from the reconciler watch, the metadata (beyond name and namespace) does not stick. I am not sure why, but the labels are still retrievable from the pods (via the mutating webhook) after. So instead, we need to get the size and creation timestamp at the first hit in reconcile, which (given how that works) should still somewhat honor the order. I did try adding the timestamp to a label but it got hairy really quickly (kept me up about 3 hours longer than I intended to!) 
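
To make the intent concrete, here is a minimal sketch of the idea (not the actual patch): it assumes this fork's PodGroup stores Status.ScheduleStartTime as a metav1.MicroTime, and the helper name stampScheduleStartTime is hypothetical, only meant to illustrate capturing the timestamp on the first reconcile.

```go
// Sketch only: stamp the group with a microsecond-granularity start time on the
// first reconcile. Field and type names are assumptions based on this fork's
// PodGroup (Status.ScheduleStartTime as metav1.MicroTime), not upstream's.
package controllers

import (
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	schedv1alpha1 "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1"
)

// stampScheduleStartTime is a hypothetical helper: it records when a PodGroup
// is first reconciled, keeping microseconds so later sorting stays stable,
// and does nothing if a time was already recorded.
func stampScheduleStartTime(pg *schedv1alpha1.PodGroup) {
	if !pg.Status.ScheduleStartTime.IsZero() {
		return
	}
	pg.Status.ScheduleStartTime = metav1.NewMicroTime(time.Now())
}
```

The real change wires this capture into the reconciler's first hit for the group, as described above.
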
The good news now is that I see the microseconds in the Schedule Start Time, so we should be almost ready to test this on a GCP cluster. I also had lots of time waiting for the containers to rebuild so I made a diagram of how it is currently working. I have some concerns about the internal state of fluxion (my kind cluster stopped working after some hours and I do not know why) but we can address them later. We mostly need to see if there are jobs that are being forgotten, etc. Signed-off-by: vsoch --- README.md | 1 + docs/README.md | 24 ++++++ docs/images/fluence-design.png | Bin 0 -> 87714 bytes hack/quick-build.sh | 2 +- .../pkg/controllers/podgroup_controller.go | 75 +++++++++++++++--- .../pkg/fluence/labels/labels.go | 24 ++++++ 6 files changed, 112 insertions(+), 14 deletions(-) create mode 100644 docs/images/fluence-design.png diff --git a/README.md b/README.md index 8922078..ae420fd 100644 --- a/README.md +++ b/README.md @@ -525,6 +525,7 @@ kind create cluster --config ./kind-config.yaml - Try what [kueue does](https://github.com/kubernetes-sigs/kueue/blob/6d57813a52066dab412735deeeb60ebb0cdb8e8e/cmd/kueue/main.go#L146-L155) to not require cert-manager. - Try other strategies for setting owner references (so cleans up when owner deleted) - When that is done, add tests for deletion of pod group (the current method is not perfect and needs improvement) +- We really need to see the state of fluxion - I had this running for about 6 hours in kind, and at some point it just stopped working. I deleted and re-created the cluster and it was restored. It could be a development hiccup but would be good to know! #### Components diff --git a/docs/README.md b/docs/README.md index 155ffc8..c4718d6 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,5 +1,29 @@ # Development Notes +## Design + +![images/fluence-design.png](images/fluence-design.png) + +The picture above shows the fluence custom scheduler, which uses the Flux Framework component "fluxion" Go bindings in a custom Kubernetes scheduler. In the above, we see two pods running in a Kubernetes cluster that are intended for scheduling. The fluence pod (beige) has the fluence-sidecar and the fluence-scheduler, 2 containers. The controller pod has the fluence controller (1 container). Generally speaking, the containers are responsible for the following: + +- **fluence-controller**: watches for incoming pods and abstractions with pods (e.g., job) to create corresponding pod groups with names, sizes, and timestamps +- **fluence-scheduler**: provides the expected scheduling plugin with functions to sort, pre-filter, etc. the queue of pods is essentially moving through here +- **fluence-sidecar**: the fluxion GRPC service that is queried by the fluence-scheduler to request an allocation for a pod group + +Both the controller and scheduler logic are bootstrapped from the same underlying kubernetes-sigs project, the scheduler-plugins, despite being in different pods (green). For steps, scheduling works as follows. Note that it is [much more complicated than this](https://kubernetes.io/docs/concepts/scheduling-eviction/scheduling-framework/), but we explain the high level details. + +1. A user submits a job to their cluster with kubectl after installing fluence with helm charts. +2. The mutating webhook provided by the fluence-controller intercepts the job and adds labels +3. The controller for PodGroup (an abstraction that holds a name, size, and time created to describe one or more pods) is watching for pod events +4. 
When a pod is created (it shows up as Pending or another phase in the cluster, and does not have to be scheduled yet) the controller starts to reconcile it
+5. The reconcile ensures that the PodGroup is created and updated with the correct metadata and statuses (and cleaned up when the time comes)
+6. As soon as the Pod is pending and the group exists, it starts going through the scheduling queue and hits the fluence-scheduler endpoints
+7. The fluence-scheduler uses the PodGroup name to associate each individual pod with a group and start time, allowing them to be sorted together
+8. They are sorted together, down to the microsecond, and created to run on the cluster
+9. When the top level abstraction cleans up and the PodGroup size is equal to the number of pods finished or failed, the PodGroup cleans up
+
+The result is (hopefully) a smooth and efficient scheduling experience. We are still working on it.
+
 ## Thinking

 > Updated February 15, 2024
diff --git a/docs/images/fluence-design.png b/docs/images/fluence-design.png
new file mode 100644
index 0000000000000000000000000000000000000000..c35d9fed86830e92f21fe4d3f4cee782958f18b4
GIT binary patch
literal 87714
zz!isLLfsE4ho^x+*nofw-m&l@qMo9`TZaiw+??&rvH_0SDEmu2Qyw0kur*;&53sxh zGNffVP))!OT+Z^jzjgWPkM!|1p6OCsU=)>B0nXJkVKp_iovkfS9{ZI~{x;ozetya{ zS%L?v60LrabZ&d=YmLv1`Jcctnx9RCEz92*1>EcZ_ltYGyDt=&9@4VGR35xL-9$kz z-t558r?jTbsnnE5?bt%L^;%0PFDrl!1E2FQoo7X4pQ zqSI=>Ty+GBa%=D$5b%kv-mW8=8?P`S4u{nVBk^iq;CKU8vI-!o*(@~SFkghMu6>*$ zlvk2?{!fGrS9=+C_#KVI4_~++y<)*w^UWSEK&CvQui5LG>xqER9qAqKOL}@Nhwm{s);ZbPq@<+j zf?o7mH{;HZ5O;GiU0Mv9=;1^VW=r!!f_YA7NtRdk0YbUTF4M>XA!g zTig7d1=8@n`NmrCh?lE3^`EnX6#n;34G@Q|t*wF4zXqZdJCmh>eRnd$AA1a7_g?oa zQ#kCHCX8ZPv#Baq+wc>cm?I*8U&Yi4bG)dCaY^C==l9kLAhX^9-IAyXATJORVg2qI z%1LR#OfWC5$iL{iiHqS|Qc}Xj#`a!KjfI{bm-(7BjmPUQA;ZUE&O`9|E9`JAXE41N{u>R_av-RXt@KV7rm<5!So{q`U~1eECe_tT6sW!{zc z6nl?-u;LVlP0KmK&pE`HKr>1BC*;z2AN&SZR|FYGFtDq(Q#${k*9daaivL zGhQO0h(7%uH(QkO{(p_D7;P9V5~$y_)epSjN?|%pVEns8M+~@=Y6g9WyvT#n9A2lJ z!&QZ$OoXZ;f`J@WJO;G{le>{U+Pad>ABLjh%-7uQ)OBQYqE^&Ff_rOpu9CZN@(+pm zaDdp$)bw<*PBSksth!yQ_fZL-D*wx}i;;(skdWTLe-D7kHlzVsVl$qN+f3#6B3Yfo zz!ewKFR->4PBRTUTSeZgl4a1bhCHyI405b_>|ion(fz^O~NJqrF< z@7a0dQX52o>U%Kb1Cn(B~kLzK2(k230A3R$W*M{8b9 zU!4a8hV#q7cz0hAG7tA1J+JPb-NJnmc;!$ao?%3>fw)QA5h=@rJI@3+{Bs|r@%{`$ zIg#7&+x3!_D5Tk|n1{pDUI2fjC{!d$(5*k7Q9ATcy}tV~opZ|RNQ$~*6fFN48a?=` z(IorL9~T}ewYO$E+=oV5U*W)%T9QhPu%%g4S6oOahGcV{#??oN4)2&hqzFS|&)hu3 z+6hMbd?P5|xECzaRv-{z>7$Y?Vyybg(4~kyxR(IKq%+N#k{#Px7n62b2P^36^M(K- z$^`C0gLBg7&z}#KENPM$Q&G6TxW8XwefA*t6x(gi>6^}G)O!ir2y_to28BMmz`F+~ zIXpRe09X+q{{XZ(-6iQKfdK)JwuStq>6!HZd{~i>KssvP z-G1Y}VyO3RJ>QdhvU*{rD@K`HYV2FmX@_Ku7Y&;5&+MLf674ch47^<66{OVvH0S;bA&Rh0 zOQx7b2#>?64Rsr}(h#BLzN8S*CW2HD1ZdmKRKC@|KIt~fLr0@Kqs^$ro`cpq#@~?P z(C}OsOytXP#gAc5eDz8iywb%ZPAy$RHhoDS$9on34^K#<{lp5*$Fl>H;t0M$@G83w{(978XItn+M?} z3ThSlf~tVY(ig*TG`&!(C5IciQhm+6;y8NR+0&DL1x+|>HWhHD%x3_IdJqj}T+HwM zELG#g%tm{s$@S+ig5<2*DjaCRx(-(S*IhLH)LpKRoroNuLMNoYKe#^dWn)2ZY_4;2 z%EM%ZtV}CO`XalL&nrd~BXLq<`g$jLlv^YW9#)>k?zHTvwWDn3;~?l*?3*@{xyStk+ z-S2tNx5xLJHO68+b3S)o;W=F!>%vO?XWhr5_}w!x`FM4ol?jG`d?3up9c!5cdHUG> z$@Nf{UA><=nO#3b=>nX$R<%8ehK}Bzf<5VSwJy*q_=EOMKa7}vW3Q}sYVs>kn+F9^ zcJVaCLwWm)=~g;aW&*P|h-s0KlQ=ltt9#?)Y2!AxiaR1mAMd<^U;?AeucP@ z7jX_l7QU-)`ABppCB^(cC-8fL>?-PsAP(J^Gl++?6S6(s+|H?Wk$m?@i`{nD!9RJB zS|A@^gHk}-$OWB%iE|Ov!BAEe#)K;&edfa+(TT{Z*1=X`M{j^xCgaupAv1rbp|cRG z{vRu7<1f*-mC(A@x0&RjE$uhfvt3^!8+#V;>7>(ikUjo*+^yaUjwD%teIu*y@sV9SyX*X=Q(i*5xzpvce)P?2oB{Xv7dnJ-Orx3C)4Gu}jtE>h z+QpX$V~9P`%h}r_a>mX_NRbsGJl5|4B+k$MDSxZU%ikSQDt{m)h`x9C4$yvcrWfR{ zS~=DAG$LaQ~ z><4pJIp!jR@*3ct97;Jq3V1|;BF3T#anZ(+wf9dT5pL082@OwDhk7G2OHG==Lbx^# zn13lzuZui#oc=+9KqBNee;T>=H$3J!CV|^)r~6+Z{;~gr|7p`j%XYheq_$&h?VRE& zOIBO-9^6hiHr(H?Zu^;-xL(u zA5VCJGzzd~LP+=u^9%~#BuP3_R_jEVO5^Y1-1t<{_7e8gMM%GJ4#xE3$XcC>(Q=QE zM?MjdMI(+u*mdL!mBBq=xed^ZV4#y@XVxz^{Lv*9McFIWwqo1}O>2W~p%|A}`$lq6 zAjjg*b_JLFG8xehG2cZll)KiSg`oeN_Lel06%*6@2qGK=@GoaDU!INR@L5nGz*m*S zoU)K(oh5Ub^vQZBTKniPUbZoGh<}N|7r^47xn%zizi*1_2NSxUil$fFqlG}i1ML!% zL$zDC-b8j>ahrNtIV4nx*}asN9r~6SQ)m~`vgnK+cETFlyI?CV%D5pTx0Q%FTEw@S zL8~IF1Y*{t$bANA-e5_vx(IyXCUunNmm!J@S@fskltD&5sS4IU6#Q9R6ax z;1|rPjnIpr0z$O6)Xpq?@Uza#68;bc41{Ypx-aUZSp->Ulj@QhghvW7yh*e;P|Q}P za?X^lqTZLG|NF)J<-UM)pe|iJ=mDrJc3_gi8>7Un_&#^1IpVVjNy$DiPi(ncOD4{~ z%P}#1j{Opy&g@L_=t)CK>2`yDwCZvi5CM6`vgt*pEi6m$JB(1^&4h>RxG5n- zvKzNbp=U@_Dn11TBQ_2W^%GZ0ZpD~9eo=eB^Lg+8jIRDwI9Eo0g<>S7rJv6GNdHwe zv9XjIH{`;v{I2f3$ZQB>z`0A<^^$-F0Tag~i6*@F%ULAT1_cDMFv2?5w&%2E$nDof zR*aT2V+4|EQTb0?g%B2|H8>Mxfrqb+eBBw)FlY#e29FfZ)(BmZnGjcP7M|8T5ZYg$ zpJOP9IDf={h87NZu!Gd6f`K5ACpl;)3Z|6XI!;C3=%d!fC{sqy9FR8N+sAA7Y8Fi0 zE8&Fp#$<_mG$vheK;-d}P7vviyn|@99x?q=Sd9Dztsi#qXjzIFHGyFXOh?D;<|H=a z1Zq89t>0uaICGy33Hvs{Xfv{V+{r-g2j!rEt11NL842;;3v?opUPnTBON-Mxj*;a> 
zzvZCV-bPW7XOOMgeufr`iJITyKYe-3`0)F*?UD5KbN3`ADaur!5BnC@sr;W&a!)}@ zp~G7msNl7fu@>HIcS!GW9*HF^?|yg}1dybKx~%bzJkJzO;P5%YEOT$J61owYmF?$< zPm3Vjm_ev<;Zl)8A1IV@Im^Qd7Dz8ROFoc`FheAeS2yutq24pYB0khCdiJ-!J%Zbx z%iJ8>0!e%p5#nlGvs?LV5EGJKnNBa1qtu(9*7mQ*xK(m^^}IOD^X;M3m^FnAR841QP^3VGUt%CMp8u7}C^EuIuF2;d z$c5WE1C%(QG#(H7ak4MsY*3j4w@8o{14_H@c$&?a%*yARPF}BZcaK`=nLhHbB<&Co zX`JGwwe1tFC z-UXp4Xi0YY>h>>nLuZ{a>!@9dBI^Ab7L~XCCzD%%kwU*50%P2usZaBSU~+d(h6fGC zz@2V3m$_!aFlY?}s`?Gn`W_eJu*`zE7wfZ7Nu*m*vQTVA2st}%!d30PAid$d*3FkhVBkkzf$BdLSxsHgtpsa;6y}wbT7W-lCkSi7v#3 zN8B$jm%EqdGEgtd_~Nn%hkYb_^!W;jA9IS4{5MUF9%9Ui>wJsQbF07k`Ofcim(u-ciBlCYP|BWgIKai%e2zkBwTouKmx_P<97fTX_@LXHzIQ}+zB zy9yb`VZMYAaiDw3ZF~G7xxg^DDaR9<+}`~9eCt^tyP&ZIdw*B=$5!Pnxs(1 z4Fg*X7@mDGWUE~Q@vEuBfp)@e&HU%5XlE3VWeF|yV>nIzr+zwHL3@_u-da{BEjlxl zWKtI&CRQ=tVA~Gjjo;cbK|Z1qXV%)kgl@@i5I0ls9f7MidoerS8tK@lORI zK)%zxQ@ZNYh6@_kR(tKvVT0Sj-MYLNNb@g^FO1SFCVC$o?=ms`u66L78gc!5#U4_3 zvq1SOyof*n&%01YqnuXFk%0y}_45U&u}ccqEls3vfEe9X$XdwvQf^RT}d zAn#63eIp2wwUbN| zzeCYy5tujJZVR0iMGT4@+eFC0^VdxOdKxM;;I zh0Z$9)G>(J@;-JN$1TjTl+3R{5H}NAE*BE3iybPBbl$EBH!X2dz0!I1N`g94&~wUs z?K^9Z118F?584<+EoKY7KVg;eiz^IoaQ z|0@6;_3j=9I-V@vHPW*8eK-!3aZTc*fk{*AaX`A))dvm^OJp$-;!S@f*PA`d_Qe36 zoK?L4J3Rx9u3@DLP~ZL2NIDLN=qM}7($b;URKa`pO`9v1-@n}_5<-uum!$F97ru&H zV~&5Z<~#gLraTIPPpAfcdEP|=VKML_6pXD@wCXsMUPq^>!XS;U3Z(R@0p|(+LK!lv)9#1HP3-fQg0~tOotSu>yslXK}`5C4zDDk;ZS3-S=-i&Ei z2YHy5FdA2}*t@vgZIj~Q#3v=GYH6WH;0chUaL>$`?Om`Xm$30!7RnvxM*V-O2_<{n zn6WYof_6_;Em&x=m+;N7We6vCBhr{O4BVO06AJHuKili-VHT7X>)Ql$L5?lb72?l3 z&_^e$C+$YnxCrVHy7FNVs-(T0X+H8X8X+iacWWaMg*&>WNKc6OH;zW+8L74Ysc%*& z>G6{wG94$_dzJ+ZX9KVT9n&@fts}=_-Qitw7fZNm7Jii1+d-w`5=84Fc&B&leL&U` z7ZT~ewUPB6bVOn1`UJa@H+{P zvQI>d7XLFwr#ciGfg1B78Z?F7lcqu@lvjwu1#tbNT^&QexG%nr;)Gwgpzk1C??b*P zTPuLtij3%5;lN`;|3ECgGW^EpFnH~G%7<+$QuK_gU%kdGr+H1u|LJL&kpJ%ftR#!E zarB)Ai_2lZyvaCdQ{h-Q<8%uSXFaa#c$vIM^Yb* zV5x3sWO#Uv&DNhE{c^+|Z-ZJh&r`j<&1(ob+d?GLUPa|b>^+w%fnpNFCUtx3(G*!nv= zvqT+hiFs!RfAGeAU8s?3)Xp=^UJL`yciO%GJJ`g}EG{m-zPkFXiV#P)CnKVng3KkB zvl#sU`;c(G)oMK0v3Q6xeH)92jjd^?KSv}cSPlT|)EAcjy;KhnC_qp5Xgnuy4;h8u z{&c15o~+UqpJ2J}zduHW1MwW+HwJc3+1<86>a-F?n}Jd3;}#SD{ku1y064LdsHv$* zuP@wU*ipmx-M`*aojo`HNM9IkP%l2|Kt7;#Pksg0AQRDiHV8bZjSEpn->TO z!Lm&g#HUkKN{#z6NzvTg-%KrvPPH3^o{i}#mQ!|u2Fb;KfbpK--8%zTy54PlTWSCBJ`%)3`(Ymy;s>pe3FC7LJhTh_L}wOh*`Bk!hR>|oFah&oN5MBe^xq@003xgvXm^*zQJ^6YDWW1v{QN!M)R zLz=kvOSGq2ug-RLNO^TJebvcZO_WAFGbcLpFt;P^Ipsb*o=FDAr=X!oP3-b+2@!$Y((x5XUFTqj-aMZ}{cM4zy zzZM$)M+zJVEFc_S)dw4fcj9pj;Cj08=Hd6u&%dv(30S8Tl*CEN4e6O4$}#YJHkGU) z*6J>@K(R?KDc;Wfot9A)?a}EU^Ls>#U12D7(e$AEFxGEfwoI}JNc?>fq%l{>O%8y3 zTwB}P>H&^GRdqfDm-$WvOFzZK!y_gp!e#X=*A~ZeJ0l&6Xbcehx>yHD$~oPipCo>G62zvYJZ{{F zTpEoQT`9!JL`};paH<*;krMXx8zK{$W|rj(xNf~ivO}_xD+h|CMsgV|t#(cm-9bER zY7~KJ8SEZS)4_3;Ol2;OjpbS`TNT`s`2gUn`)5KlB!Y-1)p{RutZt2n<8|s9x+BwU zY8TimhT$Ml2yr%RE9)z-=)c9Os=DL?s>n$$sVz`9rG&!?+ke>2Tp-8rOG3v3MO+!2 zUgnnuhPi)rNtc!NH@O}E4w|IJ%=D_XULeN%WU=_~ZNkKArI-zH4qdFbA|xa%$jj>y z>nkiO9F&w8k{EhVcSXSdW2^W2FmMkoe_#Mok^cJI)Z};?pH1|3)wVb)DG9eZm!@nu zHHAn*++j{(ZgJl8ZSKI0c*G|fc^NGwS&6KK{H6zMXg@E*kt)^Zk;Nfide%$b?XKhl z)6OxI_46$GjQ@8l#M;YrmY56;cWix9RPT>cKRJ4m6YeFccZ>{gYYePwwfA}|o@>SM z&Q5Tqh29NpI46jrx~jL%h3d1s4*VVV-ZM^POusN&$|vSaN{fR@VjikK(UkMjikIN8 zy#uc+74^Hy+NLI+j~^?uv$ON_d%KoJL{$y?>W-r@FbIDCww-_x>Y|YmQ>x8iv93rB z4JDGVeXBn>roZh2s;Ve^NI$cyqt;m{w1MAvW`r<68Jf75++LB+adO5_2JYv{PDb}sK~`-{lB++ z9QazI(p}kgo4tMo*+u#v;xBf{6P8zpr20vx`IQ+`ej8nR{gzpM^%fQD93+nr$iV^5>Ng!-QgMT3I@}fJMSFEYJkVhjsw*Na^wJ zKjOQIBeL9gz_GS-5WU&_X809@mm(picRSyPdzkMg%&c3@0b(qvG1Z7Kpe@SHpf@W< zXf#cl_+h?byXv4IqgayViMd;vf6b_3NZl<&JfoI|Qkmq?@Z_nqs4FABk8AkB)M5xE 
zpN|-0rv3MqIpSeT2YGmeScDijWkqGO-R5K1aC-;i+s0d0c0+1nV!~QlGNJ|zogZTS zdo7umL}C5Q?}v5~(D40TX+Sh@W&TMun#5{!nl3W+?g|$ey6%CX*iWwmO|{CXJF_(C zsWkGQ;hj!$+~@%P))sCRBI0}_thvx`abs#Kl&lid8QU(nf~=Jv+mm^rgdWIc&1*NQJ57O{%kKh`EZLGtc z68XXAKcvRu`me)O?f|e9>y!A;PX3ptVrPR(THNsZN~`eE&x?lGf#KcUUF&k(N|F`* zwlXrq_uecjc3kQoo-;5P1DHB`Wb8p@wz4b|SdEzm9Kqh9iaKTrlJN49^G-0!l zDFtu8ZH-g^=n#cb&SBv5i(+P*EkK4GTIja91B^s3QkYL}QQlrW2QboY)Be*C-NneD zKON~qBsAKX`bgEijzofPbf!fL$VkZ|41JKu9zi{=-8)3TzCc=152#qy9yaE_E$r>5 zGAt?A){E0Liqkf#w5v*R`*{vYaA2fgHvqAx24ftHR|wwv&XshPLRqzdR0ySSdsNJe z4_CwWzC<$O3T8LMs#+xJx2MbHZhmK>m^e^u$sahImjE6WRqoOTvxyy`OoO$RK7hj? zAUe6Ug-R&b)XTGJM6<;7Awsia8L-|>g;FAEi0s-aHBI;fYxk_6aw(^=7IUtxROYBF zf%;WKTx65zlU{1qre1ZG>b*G}ug4xLbJa+FKB}5}%WJE|PnQ}|kYm(T9UM4=L&U>~ zXqoBzaulPpdWN?C5F@hPL%KdYnJ;Q-Dc$V+C0s`m_E-i7-wwoM*|DDgd2R=vCVVK{ zu?;S-40@KAOoZsmn+*~Q59F+7z2pkzyJ^MjHCt(Nh_K8Jq|&H4k6?3%V6lrxw5I&5 zTMAE_*{*$*Ti0XN;dttK$Zu@6#9sFRN0nP~q2h_F;Kkwe?(nIid*JFb9&3ie<;y2$ z3xK{UjK_I}5nESKN@K@k>op(c-nu1MOrL*R>K|o8@T`zM3Yz|Wt`A-_5(Z=&9e&Xd!1!1`iK98Y@Ku^;A zui9Av$#WVCu^bEnbye{oUdtIWM=t4-zTu9I)hd}U=gE=|wOzxlRHAPjy;W`?O?Pvk zc%#4AJgk4Q#m!EP4Vn}#eS2aK?nDcd7D7|1E6>Mc4b$E@B3VRMyD1XVDp_3Cz)s#4 zlGR~`){$T%8^MO6tVEEVrfpqco?L8gtNi?cjRqlbrn)&QJ0?X z^UZx*+tfimFcyurb zMU^D>!zEJE3a)}@Eg4QD^lL-+{1OH7gS%FG+8=UXv7ZkmMPxU$x#?2>5Rs;iQEm}@ zFSqAfyo(?8dJnEh3zxkjN4$j>np_`5;0Otskx{sBkM65E9jt&RZ%J0RlHBN4x-V4O zDu<|~q^w%-Pvzv;w|J(5pRn+`T?jNKD(i!Tlgm!L$>Au(!jeAdUR=>m8p>suaOuq~ znXsZ1NTU zLktW!Oo>*erd2kZ0f6eGD9|8xBEsaZAb+g+FH3_%2k#%}JV;GvN2%g@Fl@VT(cSW= zF5YFua4#+PoXQ9yB_*X%)(`qqZl2=shAj}PxTdLgE&4^CigU6!!nA~}?zKHNNJ-P# zYiUMlx`yGq`*#{p)8tX5MnpKEg)HH|Z5n?(s=UOQvWxQ6ir**2KE}k{#~|6pwpkam z{EfaU;{VCgRO3SQw7>9KIE%1l^n?pPIx6am|Ha_ORbp<4xNnye>`x)9ynBs8n}rq`DSKodvX>(k7zfD}7yOZ;YMI_Bx2Q z8a0imrK|XqC%gU>hx^I=gzY-h%f>R9%+$z$YTh|*di8|1cH^#mXJl+~MIr0I4#-T@ zAFeCF^oG>--Zkmk`bnk5r4p6D7Nro9EOgA| zuq}(NWUxoA16&vZuffPuPrO;3WsTj!=9XJO>l`iF#Pt!?ZH0b$zkNoc`Kn63Hqm6n z?LJ}OMd>Rg7D~8*bDzG8SYES#J_%Zz{wb-=*z?s%4#{ENiDrX-PvW`4)O78Z31asW zhZRIO-cQJ67_%Q@sj>wT$sb&T33% zEgLU|P<=Ki`HqUYm%fONq!A9MehPn`ZmA(U=}H+^F&x<+ZlPO|T_(NC*+AaC#CR$) zXq+zVOu+nk&_XvV*_wP7BKrJ57&%9b_st~+BR=i1h2c_KBXvLMtcS_SfH1$Z^Hu%G znf(0Swf!2dK@!&OgQwYn&`d1g$GvYD%sBH@O0NC;+LE`JRO-3w2>&y*`trFA@&q!c z^&K-`qiXxPuEs@AQlPA(_Ri9L$xH59;|=5#tLOiHYt+3+tzboAbMl6kKQTS_+CFWl z2-pyvI+vkQW|tk=+z=L8WtIy%Pn}`3Ko~>GxeOyB%&)XQAmee_Ka|`LDOthM)$;Cm z&GB|~zaL?w9Ju_{BxhbHN7O}=zD+!O6Ta_qnYZ^hTe8NQB-CMvBp!rLA=o;yHo zgG}Z&dN#Fbv=QN&v8Z?2<~n(zDn<ussN;7S#FGX}*~^(~L@dgLE0%v_UMjR%OLE$;?DpIGZA5M8ek z3(aj85>Eep7Ehm)=n{~E)HxFeG)~&{Li_SSdnpUWyNks-=90R#-h9w`+uRe5b(5CM zRP;4DgQFx%R+$xENKt5|z#w#OcuDQu8%kdOK)blOys8%%)q;>DzwvRjQ5a~f*Vfex zwC+eQQqsp{Td_9nI0Kob!@h_QQ5r4`5iw9#d_NNEUmWGie<9tLW2Jg9(y)JIUzdC^ z(D5*(A!UPN`#rCyu3W!jNUWJPy93>adarGbCc{I7zG-_>l{O@?yuR5xn#gN9C=FCz zy|bgEy(A=FsQy0wn^uY7n-pnD1`iS}Eck(u2A5u0<8{7Bdfx~Yl-Of*|Dw*SO8Q}A z(y_V~DmS?dSJ`?DgaL^V65H)gk2Csnq-Ia0M%RQR37STgW#5+9&Mhg6Mx_&}iHVTw z4|6`L8JgU@>Dp^C+4M%8o?aA3*)HmlniYp`e!joFB*y8r&ygX&n*!3(Gz|>$&OA(J zT>;dTQBIpt3>46-_>fK5VGzsuxHy&3TrfhbOosWGgy<|<@vOnYbIP{AhS8dKlpMvx zvbP2wL^Hj#Ps}$xv>I78YMisP+Hx?KquBzoKVro2N>h&BPid6 z`q6N)0(Fb!`qj&pWt-P=1(d$g4+L!HHK}}>>9Ps&ITDgs`jSA1cyelnkWFP(=WcDC z;yMs@lK6wXs7&07l&}~MWzJMVME*+0xWcZACGVd7LpGYuRdXbAbo%PYkE8ms(44b0 z2mAPH_dOpwuJ?$B&O!|C`dA`hX-eiD>?cb6n;yCziV8d580hC3Z0m1hzjXx!bU9U3 zRnc0xcF!s{Zi%9H?~6Qx-0iLHIPgic!$JC+`^7Q+z?=;AIqVFs-nuTP-vr5Sw98z&fKFlxtnCUvU_!#3X9nK)`u(4K%X*?(nlswKbJf! 
zyUfI-+MqsWR;xbKgQOFl&+oQ5qs|UxZhy^0&&{&|WWK`n1Qmfb<@ncrxW z94(@szU7VKN{Jmq%I9^tMRo6Li92d2!C%t(PE*ri@EU!8KYW{<$r;Da)z^2s-sg1g z-b(`f(pd}5=Et*mzY*T!Wk>GRImhlE`yMq1V*Zs0G8QmcZlN_bF`*sMYJD_5L`$)3 z8J^kRnwUaCpq(<(yQ(-S~84KOu|9RVbN=y`7J0HaRz<*(W#D_mA)yN0+t(r}+P1$&2- z)#kOGHVzt>RXmSVMEdz7pm+Dugv$pDq99m^`KtxG#Ubqa&Z z1>(KnwT94*Kossv{T(sCm)QY&6#8p&6-!!VMuGwA;`VZ_&joJ^_ndQDtL7c)<;<%I z2>qAECG-=O`-2M>~6{g7R0ujAw6kMeT$Ke!HeWQpi+=|g5-2ZYY)yG^n0{W)KA ze)QgZ7SMg<8zDYJ z`7}g~yvx$HKw^6I3_1gA7f#D12uV0&&!@X{Fq8H!SdU)Enuvs^Ov}hXb65-PR<8QM z@sElGC@sK(Mdk*SQk1Bbl{bfoKQ_4iQ!an|p67-R4ARW}^psE?C<}PwElXI>?2VQW z?R3Z2c25%5Mn^HTdRMCZ8cQ0_L)i3rnOV1uVRU48Q;gLvQ&!RB=H!fim9Z?M3gYuk zFuG?9nhs||Tz2Jnn%t-ktl4bytiXq_GS?jKmrosN9Vgv2Kb2qom;ZA#c0p(o4pu8G z%}!2(u|X_VEZ-(3F6bnPWe5 z>YLq6`3rcNPaKV=@MEkPcNjx8c7pu6@lijt4NUM=QHV7+w~L*4ABhC94)LRG*~o04 zA#tmx)eQ-lQ$Coj?-d2)t_P+67q zFxt{aXFAb+EqQ%fo)8(?k>T#~N?q&Hi4C|XsmODMB_%M%JKBcN-&y5^CgjyrR8%xH zkZYD-PM{V4yH`{+K>nd1%v8|yju7wG6t#YfpgG^yru^rdpq;Ji^w_py4KV6>K{4#u zW*bnvwt9K4m@>(KHp$Y75bwc%^!s$*|M-YlpSF%)AFMhXtOB|3a?Ze%dMGA*qhIjg zf5wx<^QDYSDVnAwW#IV`KVP;0W*@(0c$>{ZFBXmG!1jrB=#ZSmormK=)g*_&^ABhU z4c{WjIa#hQw-`tX#(92g@n;u{NU7+Y(Jhzq_Wz4&AVUny{RqPHOY+O6Kn^uzK<8~n0w z3CEGVl44x+LyT;}-y~LEZtkSeah=Y22neOG5}$>?Lt7dhz3Fxb=|6Kr-Wp%#~huR?CMp89cHx652Vxdaw#weC~Q6O#4g3R(DTR zJwHan8{zSeojEz`GV`YmT^MbBUE9A|kHq8CPhWZnN6@oFKuW2%A{o5QOpyz0V|u;D z%Hff@Y0K|1LzR=LxxuWqzpS%g{A0p@XGQEU0w!x+57&dg2U)5n6kgMFi;A*-=joZ& z+RQ%?t3d}!4ghlkDY_e_3DRsqw;RD?W!2-k%IC385kSx^vn-b(P^+@t5`rBRN$5nD76kU?f9W`F& zH*Hu(VzP;{pS7>PrJB_pMLwF`p4BXu6&NzqjE$3)jWuV)BI~r2Bqt9Rj#KOs;4{^0 z)LAYsg#F}DDXn5jHe$2gdZValKp>+erX;56)N$o61=)$Krgr;PiI`8N-N{OnN7J)6 z6d#xXcZ?mwWlFFF>;k7&FJNf{wuYV<`nFuDS6BAbG53J_6rF&a(J`iN=2PY*ik{EWXrr?}h~ zs{&KY8#aC{1J8|M3&Wn^oeXGYKg^oNMMYsD?9Zcnl9Kj)d?1q-cm@g7ekU&kx4K0T z^SXAy5zb;+l4oT}NJwa)*5~GKouoCR(UZPBp6cl^3sqiRR4px~f&{|Dvmlqop_+wD z{6S5|X>2J^3AG?Q;?7?>!$>R#`i1VVYR$y8`;TV=$I6y^=*?9(1Nq1-m!av8X;9g{ zGHD_O^zYx(GFDVLp0%%eGTGcyt3PAtkq0{S%FFbOS%;^km1bmU>+9>Af~lPBNEjj2 zw!t z>%EcE_wSS%sQ9~N?(GYi5Xm7SA?s1C&gY-bg2C=$4PNB+aLUvgG7+|Lmd6?1Osk*XwFeI9$Z%8)zo^6;vLPpN}yX3rQ;PH8TW zt7EnO_7T3X=OV!luH2e;GLdTsDPhy0-u0%+F>M#Uf7Nrlk18C2Hh}>D<@_V)M3SYz z&iUoVb74UZtfXUTF5zHN1Lxe*7fz36NYZS6UEChTSepl2y3x zWR@3i&oS4K!OL_Z5Yl7qz|UcLgr~~w($h8=gCYYCoAd@DiA@hRjJhDVq|WwHH{zH5 zDi>;_O+9C$T`)R---W>DJjr@d#}Z=e<8D6K88Z>Xun%C zxxow1Rg{3Vd%{l7DdA2FDTHKHo4D3gi2KM6nAe}%=|-7aMijxMrYeQAWQQy0{0)nX|FdP$byJ4Gqw*EzugCsV)P=RAkDy8TyM5qgiC>3*ZbLd(y|S@Um0 z%knE^dj-Q5i( zB}aZ~HQb88<@4>^4rchQq4V3@Q{er{XW4*ly$T~m{=7F%(Q)$c@FP;rJ_7NnW{I)` zvB_9s-GvD_Z=j*kysozJtYGGgktupxjM!UB$dIY;J^AO;%rar9IA@SDf)!3RV(?t5 zJJ@vVu=Vi@+#6l$@trxyuWTJVa9^^dfNk;Hx0oF$u}4@5SJf`hsW!I;BY~qoA;ax1 zRYpE48!o~_s?+2a5Qh(-Yk2873BBQl6FRPtvgvYkHL<$+)#i?Y>QLu=0jZ`d;{i^S zX1;rWsx4Fgz;w)HKLK*yC;b>ogj7Qgo{NKzpkEu->daBj^KIW{)o|uqVvZ5T$y;_) z88m<E)~KY?VAmJ!>G9Z4f0I<(e=wJ%a+n71daDp0AH%89GQmjczciClFM$Y~Nlp{L zjsBL?eCe%fa!L5h8nLXWCAkE3F~^P4bYxUq8jDGGM1ql_*h8&Z_aNp!wrYv+pKXx! z8MwLU`;9G6S0o=E{KP`>^2^G=?RrN~&uYEqJ(O^WyNzx+KVuJCL$lekue;HZnE)mP zkTPCt9X=ZyKeFfn2W||Fgzv7Td12=#1mg*HBVvQyS$a)82Lu~?^*qY##7Tx47FdTd zlojbbsSmz?uMb{Bh94VA&SsG@qZ__RbB+E-zAfhE{dl%g$Eysf3C$UsZ!7eYmTHKY z*cv_j6eAN1Hc1D>wN&X7eijcTahFhhrPaHEH?eSZ3i;I<`FF*WI$e9$$c`p+B{}#g z*teL8JPNwb)ztioix3xyoAYu4M%i0y%l$x6RY^>2<5Qqem`r!N?>>G4D+`ttu4d_a zsY^m^{Mjv}7YO5ZKDxK%1u<4sO@ZPL+Dd4Lf{YHKGQsdiWpo2)Zz3iZH+1)1n_A8g zua(?~|8uq@f8gNYOpObnnSl~QD^N=U>vPaQ16&z3H8lVZQEF3QxgQ@@!xT6xsH-a} zHGyci(u(ayUtBilh>qtvSkh?A*65ksqQ@Sx6982mG2q=yU|A^OGe=K!QU0bPq8`$H%DI6x+4_Mw1r*&Ie+d2Pje! 
zLO#%cgtBLC3Q&Ph{`z&8j^W2PPSpkC{A`HIt#dI~0=7n_t>9!-J$Ky#-0fCz5Smod z01?xDFga_49WATfLDmCRcIQO#l7^mw5_~V(ipMv%>Od)HX}UnHw4SPSfxhy~O-%7M zLoiPq(j0|&yjeYP;o-<{dRH8PJ2+epx3Xck!g=v$_3@dVv+?n>mbCyW_i^TH?DI>! zr4gA21LcpI;jGHo%d>-eXH6r$BSbSlnLA%Fd#tq7ih9igPpmO^R_SeI)<>^XSUC`EF(+IYr-4 z|3pAo%8yf0-BVn{4~A7Gsi{7l^4fx4I4KUtYF}ga?=3uP1_*Wb8XD%JD8xI^g3dSl zs3T}dpply?MijCzBuI$S{!(Ixn-ax+ z-ofkrGX9h}6E3tJmacc~uF<7eZp}@rVR>I$A|r4LK`<8I)*IKhaII`tun+&-#ncle zJaC5pmXPAj7;`WV5H1sshypMXdBZ!BtLpyy zDLIsK%ui6Rk?Pw}XKAkNJ_IlqK>H@=^V>|lA3uDfzcJGmwQp^kDCA40bJ;+Ji5*TB zL=+Uz0NDn&Tg#Zv*tebI-ChT#t1SbA!+89r4Mb^e){V78Ne#5G!!cRo-yPtI&%@4w z^v#vW!6dyvp@ zcMlH&prLm~8;Cn9oLe6G7w3Rx7|=2Vq8M}oI{Gv{lv=}^xo4abKGN-ld!+GPjvRJl z3r9(|+bl_F$eL=ylB4T8Z>=rg8oG#z6L4%g%RHG6`o!`8rgE+F4-aFxSR|$?#hm${bgcY`BuIArK?R(+>&F~@6obs{x?RN zOp6g{2)^yL8_rQ-OEYRYHI#|jB9nEBqn)y(N8fg-jDNR(#Y@S?Yt*0nW1ur9r}i1U zW$|ytIOqG))!{z|adEdPtM9tm-RjKxcHCbdYFDPlkTYVp-O!Gl=VaVG!=b|5KLBob z5+8R!XQqjnSx>h#8z(2McCLKCr>7?_aw9(@*M8O9wPi!V$nMmnYv1(Tpx*Hlmd8TGSt z)y(+Q3YziXzP`Y}hV=AEp`plJ+4W_SK*|OPW>8;yFB|oN6bvL5Jl6lf77f+f?|%bg zgP@53-Vlz{qF)T#4m-Wl{qA!b>wX&+A1?jqx?74IR{S~TJ3?Z$ zw4L5^RZf9R_me4Hy(#OtKad6&P4lJoMJHxYxGOF2(6D5lhsmf_`{v~8_Ac4mh7$>O2PVUtDDH$HZlmbSd~`S{$w9XO1P;p{%C%ySP{+E3zZY!@XsCS}{rbA(|@WbU7xy zed7`eRqNtM><#ODr@`>hkeRtTOqZpyCd?Ec7;9c1&gL1mJzfa;#9zBh`#2(|tyWg_ z`9VF;WD4EtjIbXg3}K?!2UPq?(2uo%acgt8Jw1xHZOpX4AEQdgH-INZwO=~Zm3lrc zAVdvfYsxB;+gZr(rJx}wj*QUxJaROe>6XNb;U-aJ+Nf679IVUB&YYIsx%6J*L~07We>A7IZwJa zg1zE0z(%-8N;0IAd1^Tt?kgcZzSzTgfVv+5#vyyGOiWxZwt}w0dB$GeP<>Ixgx*#z z)4=Dkf8!^-d(IVtThQFtSfwKE<>fV}Ohf32^Y76OC$V;G5OoLD1ijWnEYV7|I#$Z? zaSs$J6pT1HpAX_9Al8)Z>LFhBAs}9<`+=LyJzIkd^UyqA-is4xSh`RHu3a{J^2hmn zh07oDmKsfmW2HNTWZ zLNm8{$EUlm&>|wD*kMUY^f~!dmv8TpnXC@7pIQd}y4f7FZ!s(}TAP~-e>?sOp<=t# z&ddwv{fM8ckimn4hkc+&K(t5OiP%{vZLV`+R4c=d`5UJ3gPY&0I~!YM2V1URv(@l$ z#=_y+42`7K@nqT!zP&C}h=LUAeR)I#XPDUa4TmfQF_{#|XQnZWv0EzA4PDn|5SN~o|;;4 zdU{#*JFCfjnKa|uU8p5!pd$vqbq8dQz=briu!vjo0r$-dDprWQgr6cA|7>K}b978p zKGT5z{g}3de5M;mgc@qPKvQ!~lj(T!^`&w-^O8!X(Q7!bky&R-!l@PA%B=nR@Orqq z2g<(-^;7WW4mvgZ6FChP)ztJf=tDX#GKI_F(MVdX*Z(v>5L+!bVUu8xVB*BbrR*N; zh#8I1>ccW6#3Uw;ol_B8!K%s1rgv?g2ZV)upNNnGJ z^Tz(B$s!r9SD)^$nTGz(Lb9+{PLlb|-p8EV<3*%4)bv{>11qdUGIyY2CVtVf_0iL_ z2w&<68p)Q)?Ys^(1 zPV>kP-_}dqe))ckKb?2?(9?{o>aafwo>6>e4^ zXkMP^OzQk0EeEkI2m|G@JCf$4`f@n_j``+f1$fgyuO6-gauWt&m&}mqOql=fZfh8 z2wJik0w_=@RPrm!xICUG?kiBl7=$-}XFC&U#`@KL%GnU@%Xhv?EnnMRJ88@1pJRhi z?IQRF8)%{zX1|N*%(Plgq(Qo+yFsM8Ti~XS( zkMFnl`|Q1btbH8Ye|jMIb)VNfXN)=Kn3k57em>622j#4+tRekA^PHv$DqPJT<0;5* zXj{8NUx@Sv@t~aA+g-byUR{T$Vc@5CVNy+`97@$rPRW#9$kA(BnVPCmSov&x4Q>3c zEh~5-*jUe}SHmd+l$^ef@KV@>4cIZFCd=Ru(TDaK-m58}R}_+W6Pb;sWrGe88>K=N z(L0DhcpoxI_Owy=OS0&#H=YyN!M^76fKfjKGVdoZhQCHE!b~oCHG|XeUfZfTNl`4i zf|VGP+)Zaxf~>c3XCA7*zG3cF$A`tAD*fj+fo$5L*@`_~Q@d4w<|A*0eD3=pAb$r+ z3Fzdw3VOAQ-LWiqczAzD77bZ!Ha6~fp@NuhX40JXfLh}#y;m}*7K`WZ&5n~q^y=7ngoghy1C3&A$^eked_~E}MG_V)Dj^j5degS&mZ^y7LW2!VHJF!;XBIxi4e3qVE?>4g9$Sl8?{2vJ+J*X;y9 z2}FwLPf!gf(ewGV?SryvvcB&VKMoGgXE}^Xx)&i9c6Q_79_Del!2|6UX4dIBR#s>~ z7b5n!?!0(vL|F=vqFgf*OWWNwb92M~ZCa(`@pqv^+j0s+dWvmOj?%^DdnE0NuGm-fYZ=Uqg4F|5sNED;~2q-_C=wRKa=B_+UOMX7=m$+^Sz|%v_ z&5P94)Y>dXMI{(?T8#A}m6nF7y6?I4t2i=-)=EoI3D_8TZPX6SriUCSDaq+hqC=`{ z9F3n@{g@7HMhfFF-^Zh-J)JAp_5{i{0+@U1X-M({7VqogK2g5DyFV~rha=(R`*pPP z-O2a%%tT8|xajbR-~1~Aqu;0Fqa&Z45j?CtMhq--&9PvlI#Wq>TSYqG!Re?{a;59x z0B)^p8J>!`Fq>)aq8AhCinI3C>L72vga{gW@KTKr23J|m$)g6tqUz}R4z8{$(2Z{a zfTdNi*_oZJIOEL8@g$pk{$##;Rb(7XX zMp_NrpNr{Y<>MMJrphQ7)cxt=c#sAh9GJ7ba^loCG1=bRGgn;S9b4u14^C+SVu0C- z&@dh+`qv;V-g@rijb{&8)+8T48S7ru%2r$%^491Ni=>!ERpBoG 
zGmegwWHX}GNom%rdo!ZZ=r~a$r@U)|prS(UB(8B^7BMp;uIBQAftHf8wYvHgglruz zk=L($9UUFT#Kgdv5}u^7S$6qn@w!j$H(2B0?H?SJkc__(cWZ8Lc4#Kl(_00JlZ<-8 z+r>wM{h7g5A&`?_1PT5WO``FnEd6bQuTV-VO%7E~P%ppWSz#)vG~gmCHGjF%2_)$s z9-_V1CH4_@5D=Z_z-8S(eM*IT0EHnj5tbD{zs%~_7p~f_yl&jh&CO^+u1R-S=5@2R zN{;M>|V6HyR=0Lz_o<#;`t4I+x{a#ND08yQlQ-p_sbELF%;38ss`b&^|zG z4~Z$m>Sm+wJRo3SS{kZ}gp^bW71c$Sf`yg!VlrQ5Qq=^GGZ)@4#DaA#d2;AYTnHA` zcui&|w)y5h&o(R33up>*jTPI_Axpo<*z8k*)yGJU8TFM4BW0QnB>kDy$I~}T63$yW zQO=L7RO31?|KqmWIMy^s62Jf%79tm$^8|?D3sw|?!e^J4*5>9-Zl@br*Ho?$%(rjN zF?ajI%}!74!@PuU>R^?3%wDL>SRn7*+@ubP{VX&o3rS7KypX`}6dWbfRQZ=-HLzLEUcoc7`q4 zZ-^l1PpCmcw~rrJf;R~l)Xf>S9(TvYm0iK>|K;i%jL15y1opMt_W86amQWjFhK26F zbgQk zI2l0DE<{^7WffesrQ~+PyukyxlX$aH(6U7E2WboD1h0O+O2gM!)P8w!@gl0G$}x{+ zPb;Y49C0pCUVh_6LYcKygYD{%>FKA!O@JVD!a8J@L9b)$fICLMuzH~^FHh9ln@{8W z=;+(Kw*1E1=`K=$!U=G5<6{YcsL9M+1B5i+y|V-1^J{q1DnW63n`NR3>^^oD^k381 z^PN&L(6m3nnkWfaS^bkmcwfZMorq>WRJHSmDzzsXIlcPV5>vz~Vi3yqff261ac@uK z>YrZoZw*nNhrnC@-&R5#u-)c%N=8AE)(u%$SYTy)YLvmJ?wsEO#Y#xwQ>|@*=Yh%t z=l`SAx3Qq2pnj^W4m$jd?G$xZWdW~<^=^U zxJ)Q0D0a0l+oF1GWq~q?op+_`l|6-d{+)7OtuvF84mT$TIUQTvg10~+FgpXNX);7f zCkugi`F5K}S|kUG{@2J%KnWtbJE!-J%8a5x4PKS(r0Xy9oD5N(Hki7USKmgeh~Vk( zFOUBOZTnuB_`G2^%~|ZEl0M0utV*ry7hHnJ*hXw&IC(R(TnIhnD9GivU%2_j$L;J?U>qDM7aLQA=T{V%7L}SR6UQI*%(*XJt;6#b-9I z>g>#_sp(o?J{*kgzkba$x%5f!_wU~z9e7&)^DU9~3YM0t~%SL^7VKzw>`cD83# z$PRrCGim)dxA{0E>v3u~SZ@%SK%n=goG7u*Xm8eN4eB!kfx@a-MC*+NF?YhBuPiq^ z^;87?s=mJ;UY_KdDn2J2_f{zvO{YA3q)dG&Q1e@&&i0%#s$xF%Pk5Hw9ntj<_y3Ni zf}k$QFD{nKRZ@PB)9ssm9YI10#bc^CXgKDX79}nQ* zJV37tr(J$~MDtX8z_Sb!Kk8($zV!8LkyKgkNbywWw-tH<5;)=LSXjVki}h6HH^x3k z@T({)BH>Q=1sNYm6#WICI^SnoLw4CA9~DX2*&D#C7g3k|+O8DgOWs(^p zi3NTvEZ}*0JltI>z9u3 z=~Kw#6PYK}{=($XIj)lb$J{9=G8s~*(h1Su9+-2kyr1O%JK=+es$j92e@Bye0CGo= z%L2<(F2+I)PK=F3{X=hQV{>FIymvHpRdu;uM{V!*)?_v21$E!H>D6}??hM%^v+*au zB1k-FZPYN><#2ASTTmf*!05aQ87E#oKStw&?pog0fCS;CM;k&04mwu1KabvW0}3CS z?AA{=CHS`M0fbM*$T&4Om#Zjwch>@{N%nW+X3TgC3z~eMJQKBOQ0=CpX&?aw_MgS& z7=&hiqW^9Oh?b+xuv;%;VPY!l=?OUQ$fB(t9$IO<0+h=XCL{eN@Djwt#d}sMt8Rg> z1W^OoSui2HJ0Ktc_$;7g{0MRzAg~3XW}r+2wVwITKp4nSBRu7v*|C5#2qxt(_gr`0 z_(45bfP$VKMb259C%vM`(GzuMrgT5;kyu&8cd|dqW|5zFF6H>~k~kLe&o57RYqQ*9 zo)CwMVHm@`bG^sAM*fepqC6^&S$`g+o`1Rd{RcDh_3hDWCjxK$V{5&X;ySoHc`$9a zzP@Ba**QyFQJ%--{}ougNY=tZD5JxXk&?0iu$)X1-MSs;S`;)K=kZGGHHh{kQp7iq z*AtdVApUsu23__C!HMs%JD$x3e5E)|P_&dJU%rXCp@vH>em0Ir!xo1EoxrPd=L;qaNm z=EF;bR|DHM)L3Vgok2kXFx1p90BjGY79TG5;eX|*Kli%=^ety+*xd5cQh!Tf1SjCu zoSaxcyxmX-ToO2e&w_vswXnDd`mdjHDzzIcz}YQG?0qu<5+l%=n;svFo#S9`wY64Ks^Ip_} z?g;lopZIg5Da2A#m76KuXa4OfDlj7sBNoM3DjVXxEK6Qi0)PxE!V;=5BmRA=cA$p9uR_@VB0{A53 zy1KgY6mn33?I3~!?U5IL{66A41{Ov6vdcSW0mv9soReqx-+Wj(L|W-3j|(KZc_$?=3Gdb)H#DN`8&H@XS5$tFQKqLiyGS#&=^+FGjvT485^!BGIK)7)PineO z#lQd;3_?BP%a^XMt}hV~K1~H60uB{G@t}86(QCm{KR}53^cEKfM;ZqI;FF-pCSZI$ zPrbUnhVMm1Mh+-&Ia+>^(9jS9irGET)6Dd~y4d^#^`KM>91U~g{rZ(` zjCS1yu|WKz6J2l_EJ%0ua|F}z%>A$p-s!yX(&qm2o>X!dO{al`ufJa$Qs@ugN_&RA zK!}qloHIq0`C7!c^tOEe<`k1sLD$@(84!ZPamPe!C!D`rZaDkxNp;Wp&61AkwQ(+w zzruP2$;)9LH~M;iUynY8TVk=VRE$Nd>_4;#4OBn{Tn}!R6Y|(#segt|&f*j#GHQRQ z=Wa9=nK+UQ^yI+$wW87FEizHpbo%j|7hL@I9=q3K&+;I(<7MP%B8=B+YzKBC_CVBY zF6T29nc;kZ7H{^Qo8{;<5~Q5Ed<~}bX>SSvb||V4ruJSfM{y7!S(=)u4|iNlsS#lR zLMESQ1ALb9Aqx3Rbc~ggubbX>%FkV4=PLEU@EZdS&1Ve71JDb`IG^&v1nA^zCwD+x z1A!W(MW^K@C7&6g?Q3CveNq+ij&aa{Ee9g;*jQL)8r4v3coi%vpVGi!BQ-sJpo?0i z?#lf94Hv)A4F>F;(awCnM|OkwWW+UL^Qjut8;S?u?a#!I#>jwc9?L?!h``G!6h_Z$ z?c$KQ8xA1R@wXe0h}Ht&;E*8`s1Fmx>yLM>tPEh0KA_T)lN;|=@X5)ojTe5; zzibq!p3DAZ-g2pAxxMoZ{f&50Q4#XVD<(?MzJ$Q$Ae<|bpM>lz<`HuvO(9&$WK+Ac z6;wmZ&Dn0(1^KNU41r^O>0z$6Fn$b+wbkZh1vxtDVZ-XCJo70`d#nt43r4bo7t=X0 
zdoa*%F#Y26=k3N4-wqSt|;T?{<-|ugS7P+-l@Kc2;G_|$w z?(S%csVouLLTq&Qtxb@twyvO1I*?H5Z?oVO5qxCb-Z8D7%bYHVz{czYj2~_4ygpQK zaB^{xX1@3-`3UiR zDq7m$fFMu~))p7*}Cr;ms)GrATQJZkAI9KP@N=Lf%Z84pdOfuGBF*Zl1k0>D#wN#I@GQ)UD_C zl+awSh}55n$q7@{yi)5&=JuD(r?D`gs8n|S%~Rp5H@|viR5>uBK0=?AJH_C2l=hP!DW3t!x20x4wUXw)T?=cvbte zgfD?pWKiNo#Z+1gJTIJ4%#nji3ciOxiY$u7HVemo9`9u5X&K=vG$(&FhXdl`EtwX zAlwU?ywm(p@jHa6c$fq5oLL*4AxwB5#42**MJ~Rc4kuwBFV~-+toO>v$pQauVPn$) z(taSZz^K>k@OXcFcPCP+RvwatAuv6qyxJuwqi(x5XdjQeMgKtw4@Y2Wa5Cxn)d;Rz z^l#@6DqjS}`UE9vzc^|Nyx~oEKW}MjA%$G>qsb%PD>8- zh$@Ka^o=jm@N;N^R+K3fDl|Ii^p`cPt>tJkD#|D>P{Kk| z>m}!=!-NYhKbGL)?yRp(qmYx(U{!B2ral2hC>y55GYm ztE~$y%$Ags!;Y+k#Bh4~{T_rQ^6I$^KdQ zakQ}sp@~|Snz`d|D^)zEY9+<_C>c__)TQ&Bt-%Tz=?QAAq!!r%#{CU~+k0jf8~QX;>N>j;vRu5=K?md)qk)-!Jz) zz3Qdk02F_2L_H$Xt%%(4V8v$}`||w}e6Y0isJy zSPke<6qTfL;-{tE1@$*XLGwz$1@uu+n>wj_v>ZEK#*T zP=c}mn?XY>w%x#1$l2TAa{V5~vw(5L<<(yQhC?b{QbqMbOsrqLso97|o7ZGnojrQ> zk#0wlOO)0f+<|a51z~zeMQC+)adB*Qb#`VZ10h%PfRQ677#%f5{jTeWu8d>c|CVw8 zn4j>7a8RY@)7V0tMQ^V}v|?5#2B?O`dnjNe#Konvj0?-O8&N9+MMVQH!d%z*d$J5R z=O)xl-u*3$K9kgpfV{ClF1=qFD?m1)*3~7ONf z3s$1ahak5;`oZrsOPVUG>MG8!csYH~v%j2WP}6dZebX|~H_Xm`JM#S-m!fuhayr2~ zo8=i=bx>2HW3y#emZ0;{vbB1t|JZ5ye9$WzR`L>8?J#%~Z=JX#yIjWTox3y*T#^f9 zLTL(jM3Poonm!^m5ue>XXtgrFtskZp<$Qma`|J0GnS!kFjB|*OP3F;$TnTuwpn=gZ zNpT!3mSP&qvN8kDtzJhDZ6hYEAAQnORb9BdVR`isae)A`^EVAq?4abDq`b8O)_eu_ z-8jiWl9Ll2M>zg%hdZDU2FOy_w~w{8Nn_tX92~6ot(^$w&Nw3zec*c_+bEzL(XMk+ z1`YR-`c8U=l@7Pd*}l?7dg_(TW>e2^-wyCJOUdHjO9aT)xe^punwA%p31?4b!=g=% z&wqu5wM3%1##IeJY_2-6>zL0bbysy>*k)sYfG;j80gKfL1?+)+_sk7kpKT8VHuA|C zeno-NJ4#Ej?j3(vZF~gJ@0B)G^OG7)`PoGWXy)D*5n}R3y@H;_PAxft5KCiIj`3bv z5*n|?&tES#wwB+I-+jF>QO4aF7#;KL(j+&*8XMJVuhO>`{wSpAB>X4)hcKTAcl^*c zt$PF^hkjLIOUq(VB!L!jeOOk6XouI639_=zfG45yjyooK_YUoQ#_+a_3?%!(SNVp> zS{3g9@`J+wC8Z-F5tLFj(p7Tz+3pSu{MjEFJ;lNNJh*^K6Es*Ws`H!u!*g4&1sMgd zV}(S-gOOk3Z5?dS*;X~G3UIuU;mOYI$f;qD9D+>lNtYZ%RUDBry{N5`t{Ha4R-40B zox@cNh0~Yhjzba*EF}nMpNqP6#!!H;eqTM-|J7Zt-GEBWf7lb~2p|D=CbotH0V?lp z(`svuwna7LNwVM|SaWvUsg7INpJat24zCZ3OGsgWH1WKbDvBJmGA03K%)VdmfLjF0 zIZD)P%-0<3WIj$B5pzu;LT~PW>F`CC6HT)^r&zInfpmlh`WoNoY+M@k8cnkXSk6|U z%l)BSI#v+!K*ZXMMt6t31&Ekb))r4lc=+(QZzW)gAW5({<9gNiaO&T;7H|r@y}ec0 z>Tnn;2TN_}mK`{2t#<1IxpMvA&=G!tDftM-zJ#GU3jDH}n?JBe4{SK#S>Z$q6)G01 z;75*bgVFT)3*tQjG=5SGAqO9e??EeM*Vwejmd-fNyqyiZK|Liw8zn8b?)h*y>{^Rj zkB^U}LfsVoL70>g#LP}1f?{k=XAxgeJP{iR8L%Rk-SQ^_H)6*(a6-_DoZShoM`-EN zIp6eoeh8`ek1uKAj_(V|&0%^)#}ExhZOSMDl>Li}EP<=k{2)d{B%0N~O^u(f?eROX ze{4*%=BhDg)ES`5)2yuONo8IqC*8xzfKmhdbR$;*zz9I7JcSx%cH1Az^nO8Coew`< z|7}xs4Gn;zrlKRkSf58jOzte>-96j?c@#dmY=NED(&S=9>=8YlSN!hU@x02#iM_U( zrMiZ#*8bDBkAF~80 zg)8(%-&Zba)h!+_SKm@Jsh@9d@`^_~IuThJniLx8=M@y^msM2c6;#knPfoJ1Hq_bU zbGqHkL#l>YFebw$Wg2$W31QNM1NukRX2vyI>h>P}1?*5;D`9~B#M)d6VmhgVOLqL{ zl+jLkVJup9lyN_K>4^`4CYl6fXZ*s4pUFXQMzAe)*@u_D46Ro!>(p;gtk*PQqo!pa zqo>Qt&|x+jI?IdLmXc8$`KHUp=)2OhjU7H`HMUe$Xnvl(K@+;;~?bsn1NFbDJz~#ExvA{(p_cBxtF;p(%OD3<0fz9GKn$tZqr3dw3p@>zIyxU6# zU9J{$CwRBcB4OR_?2#IsV%pNU^SQV5y6o0UHXqJ5=9?{;949F#88chnQBvn}a{0<> zNjZP-Z@n(o7skDI@4PDfH z#9MLNq{XY!kWoDuE~WL$=s?I}BD@3p;pj>gWv8L9!A!Rcj!%l)o#L{vNC#(*Np)1G z&9BkHm_KocCEWI?*eXO=yJ&QnT7T{$eoP08zGZkngJx=Gw$kqVGT6@REyxNZ=D8MK zEODZeS8(^(EN9zx=X1^YpwJ5l;_{3<^qXOG_CMr1Y{H9g+l+@mp>D zB~8=aSaaHm(ff}Hu0=`-Iru(w8$AA|Cu?oO!di<{gPTxQqc_3 zZJz6+96{fIKKh=Usjl^0&{ZAM+am&+OFT#T=bHZ{^qM=f<>@;+1qJM=3qM_V!|yU% zNyh_zOM7H26-x5vh%9G$bL#Or_m`H3R1;Q;JGJf0_|&R-Fp6@xnHIyACkb%YrUii!A{ckH$RW+WHEwMqGEyYb5R2sWNc zHCyev9FOjrZ(BK$j%UNB+Z7qHI%x^@+)R$DJsXKJ`zOtL>Sl>seGZI^ENBY1C9ulzO=WHsNg|Jd$XyZ-&=CdpI4-+2Ds?mH`aA^Rakro}66_f) 
ztgb9i%uM@ys+RM8V@W9@BLOV~4~^vM?Z|^bm$G=pQR~QFORRsiX1Kofai4XtrL(ns zSTR_rHyb^9-Dt(zpZ9^xtH9AY98Ef_IG8jDRvhGAXi`eq@M&L&i)C zgcRwchK?GQ{9JUX$-Ue5ZeA5RaXErwkemtts#la3=G&H;eKaxrWxmfj?gI%PuA0a+ z@+HFLV&i2p`M6kfJ50kORNPnj-1l;C{U;@n7~iaFj{QvKj3#f3hs*;2hdv6IZ*4Vm z*sj|2S{!zuf~4jB!S|#1_@7Gz8RO%oSsmVxhbW#tucN>2v)OJPaCed(y0T#jTyZf( zVmOFLXjbd>HBG#vl_c+ZDpWlTu`qI7rn{ zvDX2BWYG<2g^s_Ju2~>i;O!99RFzYf7#PPYTQu#8dEVQP{1S*x3b1)=ew1KoSw~?40$E+SGr?*GMnw>R3?Tt(KO?Vc zgObucY$b2|UXUS>iu2D%mLcfUTa*Y$S!$ zyMb8H@O@5D5_jFQz3Xvo{Y)u0jv>FnKex#Q5QGW<1Gb>t?3h_}by7)e=@V#d!!nEM zwn%7ph!D%aD z2yU z-iMB&d;}Rg7SPYdlr$79uwd1?W=k72wkL<8KZv1tTtQ_}P?A#y((4_oimOaYFJ8z< z*L-uuQ6rZAP$Mg+?Rs!gJZGyb@%LKJYbv+bD6H}mAB}Hsn>=+3>OP7M=(Oq4xRhU? zoTP9uR&Uab6e1$BekNf?Lbw2gmsD8_DIwz#GGEmgOh(s3og>IFQRxL0WCASjl`m|Z zTdg3)gE^a>Adx*iv0ztMl9F5M8u96hVIqx>tKEo8F0%YM(5KMhvx$#*p$-MQLW5}~ ziN=VxBK;2JZ{NlZBJ!D!#fYUM@$&MuwY67aYPtN4v9%v(XwddtxaqXqR$A|8?eTMZ zDx&>x6lzXAVl^=8A4*Je=#?rtq%vi*V}`J(Vtm`B>pml+j_QYsvv2yIE_#4R54!LXOir&^WHu3<{>&1{O;&fwu zt?Hc0&*E5uI2f47^OKX4%S+8VW1oTg6a_%wgAp=L1@juAuq}-wLB3RIV?<&3DR#Sa zHVllKYUqfXUM3h4f*9tpiFMnlyr)~Q&&ZimDGq{EH7K z>Qv*UP;w;u)i}3xgCTgV6zRfdAq8ESK$;_tDkusX2>v17MR_|BVwHIh_Cs%_5T2^# zkWNHg;|))!h7=2!vuYU|Zvp%)zY_f{@H5mysN>#sd$v-y6CBFpO?Os1n7agVC@Bf2 z2?3C16*2Jex{aAEobW%O6|GjsW+Kiz4hWA>B0t$mjjW3cyMK4g*~)8CUY-jA&w^S` zZn%QGf}}+X)0Q0^4~iYa*IA-u*V*L6ZgXv`#hRR;q| zG?wh+U%vzc2Cz?nq9z3&Uxs|4jHT_^>BmRZYQ9=GFA*e56h0$wTF>+673}_S+9-A` z&UU8M#TTk zll-6OiFC&VBu1pa-yLuXDiJSAW4|MkGe;U&Sr^*q=U7^l+87s@8duovTJ5n8PFR)| zhrSd_ktdM@6)GeSbLT5 zSmq^U`#es@WKY$UY(uO=Ga+SWHVPhlI*pC za0eXV@7jV9G0VRpk(!!%e@2s2GAHyv5-K|?H8;E$X9)mkdWkW=2hQeMh8!nqR12VmmmCsjd{^d<-KY4`(cRgu2SgV}Pvv zZ(iC6ko6TL&aZFWMT^#_AXwniL?PTbA}%r2-#Pdtv1u170lh7mm$iWJeHZu*CCW$( znId$@r$EshnO<22n@Hr1Hl-MfHIjtc;!pMVn4v9Q4D%#6gniK`$? zh3P{)BX1Cto9n@0)q>=kvFW4v_u&Xg&bLCz<-c!lYg2@DzW(<)E;Kt*O61kH2NfZu zmD!k=80eew&4<%YH@KYVF>8`fAWwJ$H9+q=tKUuH@k9fI)i61@m|?p7k_C=8aO_cO z6gW`U-rVoEy0Jwu7Z>J+r&Ti9E_nK{qYbiJ212a`mwis*`HaZKVGQWInS!}~r7 zg=xXzd_Q8v+1*5_19Pj07jw}6GQy|e7#SJq#jmI3>4_=~v7pO*=py~*LD4^pnkJ3| zH`^erBf@4)ud|ztd`)#rddlnhk~xDktwf%6!+&wivo5su_eqQ-bnEkn;Aq1tcTvl= zrgoa~lU2q4*Y(mAgaGPOaaq~@>f};wNG4gr8)au%dK4?7dbx7svfruKj%<#v^ndbC zQ@GJs->~+d?BnE;Vb27fVYNU7(oC~d;csPITiE_&AF=8}Lo(yQ_kYW7Il<~Oxwz!0 zSd0Ze5b@t80R~zMLBBoCthw>oX=1W6iLxf4HKwD8B|K)ZP>>6>?A^--B6hW54lSQO zEbLVahHQI^DCjy_I%hfh3;ME|NZHHs!^Aq|(To*$d@XWj%?n{1jL>dW;uT^d zm+WMcNs}Jn(>S?*YG@Si;}K|aeZ)(R-$R~g)!5qhbiNPLfDCcM-IYSY;-6Ch>L$+L z*a(0pFs6E4AQ-Ndgp1M@0}m7x$5Xxm-6I`}XvP7h_*x=bGnV(6h1mw?HrodVhihc4 z^(rjPbL@@mRW8$DBZQ;dMj`z6Q2Wxu2pNnApfz$vuKYhNgBeUtO-xPw*`o_H zJNuECAI@k`Hi-mTxUt4P9i(kfPk#uDi#e5cExx$YBTa7xx7_FQ4Iy@-^aIrucro1X z-mN_`yq`QntP+?ydqFZjDFz-L;S5y}0@`$*3J`2bhqYdkafck8LG*YBG_`UPd=fcc zz4?lkra!59BnNH<6Xxv#4x89jZ_u6=E`sSU8ND}KNWjCm16Kfk*QB$}YKOI{>1;X_ zy}d~+Pp6vcMAx4BcBM%gd5|oba^Jy-{yY8xK4>gN_nv?kU|(zEe}dT=+>h%@8U``3>%*P%zGZ+{ zHt4O{3IenXMyN$nTYCviz0lFqd#XZOp$NMQ$9FhFVAHq(EiRqIR$7nZUr#KC;MdU5 zAnq5`K4lKJ;jlK5OeVj0@}A;9syr?~?vsJ1f^jDx8GlL44`@C)V5FM_gwLJglfS-Qa3HqO-`sZWd zp2Gh>KRt=^pN|HINbvvd7dwr;15+uWc`TXy$N?zNSEr}$ryGG6VGJjL$OZO4b@u@G zhfA+T-hSupXB6}%nc-Ig5GYDbPcO{Nla$tP2Gl~GM2^GNPG3MPn9S#s1_K?fS5bc- z3_Q&>GN5_VGB6}DJlF#IO0;dqEu4ymhZwG;v~*ZV$frOcN#F+!FJ0-Qx0bR#m@l(B z3gpTSnIPm3x4pF`nN0T230&ZV=|O){ z2%xZZfJX*HUc~%fuaZLRtGqV*QG_9&Z+~`v&SAYcJv%F220|4*HT82Ae$T72JRyys zk0eh4rP*;e3S=Q*Ktxb)Ae?bAa6T$bWmq4XNnC@ws z6E(M`CG92_Z4baD=V0sVGGckwl%JB&Z4Ay7MwQ1GLKHZIDRhJYOzmP|Z zYKB0bm2}U1>eX#^p`S>O{%a^ll$4$aLgql}03<2!_`RAM8m@-oDM9`KsOz8& z!HswUh7z%`u*k^BC=@c0>Cr@(1!+@dcfgXr-rnBq7_?=uNmgD)#uB`AaEEMcY#8;B zE`WTV&)t3@=v0Wj(4Y<&i=kWrdJU_{Hu$m 
z3BfM(v%mk!%F3-*M`EI?l~ozoWZ>jUgv7bNxB#?$ z-LL@>py0bZUv0EDN)jSk517Ty&dwUoh8P$bDG@1UQ^uE zvKAgTwip&149q9(UZ_KFpukpja;UF06|MPg= zDT9pxrK6|ENJ%-jxOn&8%Vpe0P7VcuHGPyqyu2%V?cx_l00wIC`Fq|Tf}`gQADVz2 z@N)_o(wj5cTiJPoNd^6N@KSsqe(Hs0S~O*Xxdkxz_>Ygd-Fv!N5D&1_wpT&m0Y*(a zp-mO?=i%5_hP}F`2CCD^-u`(XE3T2g{s}=+){@iUYRHcK@Rj}}ELue*zVnQB&lehf=sk1zb- zSSa9#p@9!bXSc@Zb~0B{ndV{w#A%6<#ev#v)#$!Nv%|N&SR0P0D&Mqe4umMVZ zk}z5|*I?0-o4b2A)cg~ZIYbx}GahKqT)jjGP}AqcD_{yg+C4>I2!W<=M+zWtfvA@x@Q6-hf)Rc;mX^?=Uzkl06F%Vq4}*>a&@Q-1YP_E3x;O86 zk2aF^d;rmqUxOlRj50GH+8d@c2sTFFa0!I1xS=g@u|Ate;j!#&FTvouh(>g{UF^j+ zH*=AeK@#<|AZJxba zLfxS@7XP_g#-MSTX|OdI3pnC|f#6I(qxmzx=5+%@Maso0NJvP;#981{*@=Q_LoguY z&kSZe4Cxp-Ihtxde}w80=1N1 zt|-iiP>4I_g@y2$))p4G@5>TNAa>6j92~y(x-j-Z{o2@o3bcC$7>SBd1IQBcWp6z~ z);aY!?r{J)7S@bXmC8ND4g?-#CORA>n^z$i^SO#t0ZY%o``gg$uc={fqKF$6x_CRF zsICs5{xc+%$tWl`c0%U#RUAMMFtFF4Fd`2Z>%*I3Pp+?%FY5mWYcG&qcFGDukPs2` z5QQLrZZ=;Q3%>yVjHKiEG!xg%%xp+G;`q6S!*8jJzp>jG(5D#Vj}V`T@j#mg2xR4{ z-{S$btlwk4kB%p6Li^Y{`rxlX_%1{C&hMY7N#+j{gw(c66DklCl6iZmzyKpG(_T(1 zon{RTFX#YT0=Lf1!}A+N=1JQBe9o)?pA&wo@V5OHiW literal 0 HcmV?d00001 diff --git a/hack/quick-build.sh b/hack/quick-build.sh index 23a5c87..b3ccefe 100755 --- a/hack/quick-build.sh +++ b/hack/quick-build.sh @@ -33,4 +33,4 @@ helm install \ --set controller.pullPolicy=Never \ --set controller.image=${REGISTRY}/fluence-controller:latest \ --set scheduler.sidecarimage=${REGISTRY}/fluence-sidecar:latest \ - fluence as-a-second-scheduler/ + fluence as-a-second-scheduler/ \ No newline at end of file diff --git a/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go b/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go index ee267bd..5061ac1 100644 --- a/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go +++ b/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go @@ -70,6 +70,8 @@ func (r *PodGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c log.Info("reconciling flux-framework/fluence-controller for request") pg := &schedv1alpha1.PodGroup{} + // Get the timestamp as soon as reconcile happens as a fallback below + timestamp := metav1.NewMicroTime(time.Now()) if err := r.Get(ctx, req.NamespacedName, pg); err != nil { // Case 1: if we get here and it's not found, assume not created @@ -110,14 +112,20 @@ func (r *PodGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c return ctrl.Result{}, err } + // If the scheduler time created is Zero (not set) we set it here + if pg.Status.ScheduleStartTime.IsZero() { + return r.setTimeCreated(ctx, pg, podList.Items, timestamp) + } + // Inspect the size, set on the group if not done yet size := len(podList.Items) log.Info("PodGroup", "Name", pg.Name, "Size", size) // When first created, size should be unset (MinMember) + // Get size label from the first pod if int(pg.Spec.MinMember) == 0 { log.Info("PodGroup", "Status", fmt.Sprintf("Pod group %s updating size to %d", pg.Name, size)) - return r.updatePodGroupSize(ctx, pg, int32(size)) + return r.updatePodGroupSize(ctx, pg, int32(size), podList.Items) } else if int(pg.Spec.MinMember) != size { // TODO: Not clear what to do here. 
 		// Arguably, we also want to check the label size
@@ -128,6 +136,39 @@ func (r *PodGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c
 	return r.updateStatus(ctx, pg, podList.Items)
 }
 
+func (r *PodGroupReconciler) setTimeCreated(
+	ctx context.Context,
+	pg *schedv1alpha1.PodGroup,
+	pods []v1.Pod,
+	timestamp metav1.MicroTime,
+) (ctrl.Result, error) {
+
+	// First priority goes to the label, if set
+	if len(pods) > 0 {
+
+		strTime, ok := pods[0].Labels[fluenceLabels.PodGroupTimeCreated]
+		if ok {
+			mt := metav1.MicroTime{}
+			b := []byte(strTime)
+			err := mt.UnmarshalJSON(b)
+			if err == nil {
+				timestamp = mt
+			}
+		}
+	}
+
+	// Now patch to update it
+	patch := client.MergeFrom(pg.DeepCopy())
+	pg.Status.ScheduleStartTime = timestamp
+
+	// Apply the patch to update the schedule start time
+	r.Status().Update(ctx, pg)
+	err := r.Patch(ctx, pg, patch)
+	return ctrl.Result{Requeue: true}, err
+
+}
+
 func (r *PodGroupReconciler) updateStatus(
 	ctx context.Context,
 	pg *schedv1alpha1.PodGroup,
@@ -206,6 +247,8 @@ func (r *PodGroupReconciler) updateStatus(
 
 // newPodGroup creates a new podGroup object, capturing the creation time
 // This should be followed by a request to reconcile it
+// I'm not sure this actually takes, because the metadata (spec)
+// does not stick
 func (r *PodGroupReconciler) newPodGroup(
 	ctx context.Context,
 	name, namespace string,
@@ -217,8 +260,7 @@ func (r *PodGroupReconciler) newPodGroup(
 			Name:      name,
 			Namespace: namespace,
 		},
-		// Note that we don't know the size yet
-		// The most important thing here is the MicroTime!
+		// Note that these don't really stick
 		Spec: schedv1alpha1.PodGroupSpec{
 			MinMember: groupSize,
 		},
@@ -226,15 +268,12 @@ func (r *PodGroupReconciler) newPodGroup(
 			ScheduleStartTime: metav1.NewMicroTime(time.Now()),
 		},
 	}
-	// TODO need to set a controller reference?
-	// ctrl.SetControllerReference(cluster, job, r.Scheme)
+
 	err := r.Create(ctx, pg)
 	if err != nil {
 		r.log.Error(err, "Failed to create new PodGroup", "Namespace:", pg.Namespace, "Name:", pg.Name)
-		return pg, err
 	}
-	// Successful - return and requeue
-	return pg, nil
+	return pg, err
 
 }
 
@@ -257,8 +296,19 @@ func (r *PodGroupReconciler) updatePodGroupSize(
 	ctx context.Context,
 	old *schedv1alpha1.PodGroup,
 	size int32,
+	pods []v1.Pod,
 ) (ctrl.Result, error) {
 
+	// First priority goes to the label, if set
+	if len(pods) > 0 {
+		rawSize := pods[0].Labels[fluenceLabels.PodGroupSizeLabel]
+		groupSize, err := strconv.ParseInt(rawSize, 10, 32)
+		if err == nil {
+			size = int32(groupSize)
+		}
+	}
+
+	// Now patch to update it
 	patch := client.MergeFrom(old.DeepCopy())
 	old.Spec.MinMember = size
 
@@ -385,11 +435,10 @@ func (r *PodGroupReconciler) ensurePodGroup(ctx context.Context, obj client.Obje
 	if apierrs.IsNotFound(err) {
 		r.log.Info("Pod: ", "Status", pod.Status.Phase, "Name", pod.Name, "Group", groupName, "Namespace", pod.Namespace, "Action", "Creating PodGroup")
 
-		//owner := r.getOwnerMetadata(pod)
-
-		// TODO should an owner be set here? Setting to a specific pod seems risky/wrong in case deleted.
-		err, _ := r.newPodGroup(ctx, groupName, pod.Namespace, int32(groupSize))
-		if err != nil {
+		// Note that most of this does not stick - we have to get metadata later from pods
+		// Or just use a heuristic (e.g., take the first pod or use reconciler first hit time)
+		_, err := r.newPodGroup(ctx, groupName, pod.Namespace, int32(groupSize))
+		if err == nil {
 			return []ctrl.Request{{NamespacedName: namespacedName}}
 		}
 		r.log.Info("Pod: ", "Status", pod.Status.Phase, "Name", pod.Name, "Group", groupName, "Namespace", pod.Namespace, "Action", "Issue Creating PodGroup")
diff --git a/sig-scheduler-plugins/pkg/fluence/labels/labels.go b/sig-scheduler-plugins/pkg/fluence/labels/labels.go
index e0040ea..e377d97 100644
--- a/sig-scheduler-plugins/pkg/fluence/labels/labels.go
+++ b/sig-scheduler-plugins/pkg/fluence/labels/labels.go
@@ -1,5 +1,11 @@
 package labels
 
+import (
+	"time"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
 // Labels to be shared between different components
 const (
 
@@ -9,4 +15,22 @@
 	// TODO add more labels here, to be discovered used later
 	//PodGroupNameLabel = "fluence.pod-group"
 	PodGroupSizeLabel = "fluence.group-size"
+
+	// Internal use
+	PodGroupTimeCreated = "flunce.created-at"
 )
+
+// GetTimeCreated returns the timestamp when we saw the object
+func GetTimeCreated() string {
+
+	// Set the time created for a label
+	createdAt := metav1.NewMicroTime(time.Now())
+
+	// If we get an error here, the reconciler will set the time
+	var timestamp string
+	timeCreated, err := createdAt.MarshalJSON()
+	if err == nil {
+		timestamp = string(timeCreated)
+	}
+	return timestamp
+}
From 956123a289416e7ece58699c91437d4731afbdb7 Mon Sep 17 00:00:00 2001
From: vsoch
Date: Mon, 19 Feb 2024 08:36:52 -0700
Subject: [PATCH 19/28] docs: update to design description

Problem: the design description did not correspond with the numbers
Solution: fix them up, and also fix some bugs in the controller and
fluence that assume we have pods / a pod group (we do not always)

Signed-off-by: vsoch
---
 docs/README.md                               | 13 +++---
 .../pkg/controllers/podgroup_controller.go   | 17 ++++++--
 sig-scheduler-plugins/pkg/fluence/fluence.go | 41 ++++++++-----------
 .../pkg/fluence/utils/utils.go               |  8 +---
 4 files changed, 41 insertions(+), 38 deletions(-)

diff --git a/docs/README.md b/docs/README.md
index c4718d6..5884850 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -16,11 +16,14 @@ Both the controller and scheduler logic are bootstrapped from the same underlyin
 2. The mutating webhook provided by the fluence-controller intercepts the job and adds labels
 3. The controller for PodGroup (an abstraction that holds a name, size, and time created to describe one or more pods) is watching for pod events
 4. When a pod is created (it shows up as Pending or other in the cluster, and doesn't have to be scheduled yet) it starts to reconcile
-5. The reconcile ensures that the PodGroup is created and updated with the correct metadata and statuses (and cleaned up when the time comes)
-6. As soon as the Pod is pending and the group exists, it starts going through the scheduling queue and hits the fluence-scheduler endpoints
-7. The fluence-scheduler uses the PodGroup name to associate each individual pod with a group and start time, allowing to sort them together
-8. They are sorted together, down to the MicroSecond, and Created to run on the cluster
-9. When the top level abstraction cleans up and the PodGroup size is equal to the number of pods finished or failed, the PodGroup cleans up
+   - The reconcile ensures that the PodGroup is created and updated with the correct metadata and statuses (and cleaned up when the time comes)
+5. As soon as the Pod is pending and the group exists, it starts going through the scheduling [queue and process](https://kubernetes.io/docs/concepts/scheduling-eviction/scheduling-framework/) and hits the fluence-scheduler endpoints
+   - The fluence-scheduler uses the PodGroup name to associate each individual pod with a group and start time, allowing them to be sorted together
+   - Starting times are based on microseconds so that group creation times stay distinct, even when groups are created en masse
+   - Pods that don't yet have a group (if there is a delay in the reconciler making one) are pushed off from scheduling until they do.
+6. Fluxion is queried via a GRPC endpoint, asking for a match for the job specification and an allocation -- "MatchAllocate"
+7. The pods are then scheduled together, and the abstraction (e.g., Job) created in the Kubernetes cluster
+   - When the top level abstraction cleans up and the PodGroup size is equal to the number of pods finished or failed, the PodGroup cleans up
 
 The result is (hopefully) a smooth and efficient scheduling experience. We are still working on it.
 
diff --git a/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go b/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go
index 5061ac1..73b7d2d 100644
--- a/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go
+++ b/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go
@@ -180,7 +180,7 @@ func (r *PodGroupReconciler) updateStatus(
 	switch pg.Status.Phase {
 	case "":
 		pg.Status.Phase = schedv1alpha1.PodGroupPending
-		result, err := r.updateOwnerReferences(ctx, pg, &pods[0])
+		result, err := r.updateOwnerReferences(ctx, pg, pods)
 		if result.Requeue || err != nil {
 			return result, err
 		}
@@ -188,7 +188,7 @@ func (r *PodGroupReconciler) updateStatus(
 	case schedv1alpha1.PodGroupPending:
 		if len(pods) >= int(pg.Spec.MinMember) {
 			pg.Status.Phase = schedv1alpha1.PodGroupScheduling
-			result, err := r.updateOwnerReferences(ctx, pg, &pods[0])
+			result, err := r.updateOwnerReferences(ctx, pg, pods)
 			if result.Requeue || err != nil {
 				return result, err
 			}
@@ -349,12 +349,21 @@ func getCurrentPodStats(pods []v1.Pod) (int32, int32, int32) {
 func (r *PodGroupReconciler) updateOwnerReferences(
 	ctx context.Context,
 	pg *schedv1alpha1.PodGroup,
-	pod *v1.Pod,
+	pods []v1.Pod,
 ) (ctrl.Result, error) {
 
+	// We will want to re-queue in most cases
+	result := ctrl.Result{Requeue: true}
+
+	// No pods, just ignore
+	if len(pods) == 0 {
+		return result, nil
+	}
+	pod := pods[0]
+
 	// Case 1: The pod itself doesn't have owner references. YOLO
 	if len(pod.OwnerReferences) == 0 {
-		return ctrl.Result{}, nil
+		return result, nil
 	}
 
 	// Collect owner references for pod group
diff --git a/sig-scheduler-plugins/pkg/fluence/fluence.go b/sig-scheduler-plugins/pkg/fluence/fluence.go
index 8cdc066..f126db6 100644
--- a/sig-scheduler-plugins/pkg/fluence/fluence.go
+++ b/sig-scheduler-plugins/pkg/fluence/fluence.go
@@ -166,34 +166,29 @@ func (f *Fluence) PreFilter(
 	// a user defined group. This is a size 1 group we handle equivalently.
 	groupName, pg := f.pgMgr.GetPodGroup(ctx, pod)
 
-	// Not scheduled by fluence - we have no idea about groups or sizes, just ask for one
+	// If we don't have a pod group and it's here, it was asked to be scheduled by fluence
+	// but the group isn't ready. Unschedulable for now.
 	if pg == nil {
-		klog.Infof("[Fluence] Unknown request to schedule %s yet, asking Fluxion for one node", pod.Name)
-		pg = fgroup.CreateFakeGroup(pod)
-		err := f.AskFlux(ctx, pod, pg, pg.Name)
+		klog.Infof("[Fluence] Group %s/%s does not have a pod group, not schedulable yet.", pod.Namespace, pod.Name)
+		return nil, framework.NewStatus(framework.Unschedulable, "Missing podgroup")
+	}
+	klog.Infof("[Fluence] Pod %s is in group %s with minimum members %d", pod.Name, groupName, pg.Spec.MinMember)
+
+	// Has this podgroup been seen by fluence yet? If yes, we will have it in the cache
+	cache := fcore.GetFluenceCache(groupName)
+	klog.Infof("[Fluence] cache %s", cache)
+
+	// Fluence has never seen this before, we need to schedule an allocation
+	// It also could have been seen, but was not able to get one.
+	if cache == nil {
+		klog.Infof("[Fluence] Does not have nodes for %s yet, asking Fluxion", groupName)
+
+		// groupName is the namespaced name (namespace/name)
+		err := f.AskFlux(ctx, pod, pg, groupName)
 		if err != nil {
 			klog.Infof("[Fluence] Fluxion returned an error %s, not schedulable", err.Error())
 			return nil, framework.NewStatus(framework.Unschedulable, err.Error())
 		}
-	} else {
-		klog.Infof("[Fluence] Pod %s is in group %s with minimum members %d", pod.Name, groupName, pg.Spec.MinMember)
-
-		// Has this podgroup been seen by fluence yet? If yes, we will have it in the cache
-		cache := fcore.GetFluenceCache(groupName)
-		klog.Infof("[Fluence] cache %s", cache)
-
-		// Fluence has never seen this before, we need to schedule an allocation
-		// It also could have been seen, but was not able to get one.
-		if cache == nil {
-			klog.Infof("[Fluence] Does not have nodes for %s yet, asking Fluxion", groupName)
-
-			// groupName is the namespaced name (namespace/name)
-			err := f.AskFlux(ctx, pod, pg, groupName)
-			if err != nil {
-				klog.Infof("[Fluence] Fluxion returned an error %s, not schedulable", err.Error())
-				return nil, framework.NewStatus(framework.Unschedulable, err.Error())
-			}
-		}
 	}
 
 	// This is the next node in the list
diff --git a/sig-scheduler-plugins/pkg/fluence/utils/utils.go b/sig-scheduler-plugins/pkg/fluence/utils/utils.go
index f2969d2..f24f6d4 100644
--- a/sig-scheduler-plugins/pkg/fluence/utils/utils.go
+++ b/sig-scheduler-plugins/pkg/fluence/utils/utils.go
@@ -17,7 +17,6 @@ limitations under the License.
 package utils
 
 import (
-	"fmt"
 	"strings"
 
 	v1 "k8s.io/api/core/v1"
@@ -53,11 +52,8 @@ func PreparePodJobSpec(pod *v1.Pod, groupName string) *pb.PodSpec {
 	// the check back if there is
 	ps.Labels = getPodJobspecLabels(pod)
 
-	// Note that Container gets use for the JobSpec, so we provide
-	// the pod name (to be associated with tasks) for it. We are making
-	// the assumption that this one container represents the group,
-	// which is OK for now, but might not always be true!
-	ps.Container = fmt.Sprintf("%s-%s", pod.Namespace, pod.Name)
+	// the jobname should be the group name
+	ps.Container = groupName
 
 	// Create accumulated requests for cpu and limits
 	// CPU and memory are summed across containers
From 10379356d614aa9d55271403eb586f6d3177a971 Mon Sep 17 00:00:00 2001
From: vsoch
Date: Mon, 19 Feb 2024 14:49:49 -0700
Subject: [PATCH 20/28] testing: gke then eks

I am making small changes as I test on GKE and EKS.

My first tests on GKE had me creating and deleting jobs, and I think the
state of fluence (fluxion) got out of sync with the jobs, meaning that
fluxion thought jobs were running that were not, and then was unable to
allocate new ones. To adjust for that we can add back in the cancel
response, but this will only work given that fluence has not lost memory
of the job id. We likely need an approach that can either save the jobids
to the state data (so they could be reloaded), or a way to inspect jobs
explicitly and purge, OR (better) a way to look up a job not based on the
id, but based on the group id (the command in the jobspec). That way,
regardless of a jobid, we could lose all of our state and still find the
old (stale) job to delete.

With a fresh state and a larger cluster I am able to run jobs on GKE, but
they are enormously slow - lammps size 2 2 2 is taking over 20 minutes.
This is not the fault of fluence - GKE networking sucks. To keep debugging
I likely need to move over to AWS with EFA, and of course that introduces
more things to figure out (EFA setup, etc.).

Signed-off-by: vsoch
---
 sig-scheduler-plugins/pkg/fluence/events.go  |  3 +++
 sig-scheduler-plugins/pkg/fluence/fluence.go | 15 +++++++++++----
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/sig-scheduler-plugins/pkg/fluence/events.go b/sig-scheduler-plugins/pkg/fluence/events.go
index 395517a..b891713 100644
--- a/sig-scheduler-plugins/pkg/fluence/events.go
+++ b/sig-scheduler-plugins/pkg/fluence/events.go
@@ -22,6 +22,9 @@ import (
 // We assume that the cancelled job also means deleting the pod group
 func (f *Fluence) cancelFluxJob(groupName string) error {
 
+	// TODO: it's a bit risky to store state here, because if the scheduler
+	// restarts we cannot look up the jobid, and then cannot cancel it.
+	// There is no way to request cancelling the job for a specific group
 	jobid, ok := f.groupToJobId[groupName]
 
 	// The job was already cancelled by another pod
diff --git a/sig-scheduler-plugins/pkg/fluence/fluence.go b/sig-scheduler-plugins/pkg/fluence/fluence.go
index f126db6..33976ae 100644
--- a/sig-scheduler-plugins/pkg/fluence/fluence.go
+++ b/sig-scheduler-plugins/pkg/fluence/fluence.go
@@ -255,11 +255,18 @@ func (f *Fluence) AskFlux(
 	_, isAllocated := f.groupToJobId[groupName]
 	f.mutex.Unlock()
 
-	// Not allowing cancel for now - not sure how or why we could do this, need to better
-	// understand the case. This function should ONLY be successful on a new match allocate,
-	// otherwise the calling logic does not make sense.
+	// This case happens when an initial job's pods were partially allocated,
+	// but then the job restarted, and new pods are present but fluence had assigned nodes to
+	// the old ones (and there aren't enough). The job would have had to complete in some way,
+	// and the PodGroup would have to then recreate, and have the same job id (the group name).
+	// This happened when I cancelled a bunch of jobs and they didn't have the chance to
+	// cancel in fluence. What we can do here is assume the previous pods are no longer running
+	// and cancel the flux job to create again.
 	if isAllocated {
-		return fmt.Errorf("[Fluence] Pod %s in group %s is allocated and calling AskFlux, should we be here?\n", pod.Name, groupName)
+		klog.Infof("Warning - group %s was previously allocated and is requesting again, so must have completed.", groupName)
+		f.mutex.Lock()
+		f.cancelFluxJob(groupName)
+		f.mutex.Unlock()
 	}
 
 	// IMPORTANT: this is a JobSpec for *one* pod, assuming they are all the same.
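The commit message above floats two ways to survive losing the in-memory group-to-jobid mapping on a scheduler restart: save the jobids to reloadable state, or look a job up by its group id instead. The following is a minimal, illustrative sketch of the first option only; it is not part of the patch series, and the map type (map[string]uint64), package name, and state file path are assumptions for the example rather than actual fluence definitions.

```go
// Hypothetical helpers for persisting fluence's group -> jobid map so that a
// scheduler restart can reload it and still cancel stale Fluxion jobs.
package state

import (
	"encoding/json"
	"os"
)

// stateFile is an assumed location; fluence does not define one today.
const stateFile = "/var/lib/fluence/jobids.json"

// SaveJobIds writes the current group -> jobid mapping to disk.
func SaveJobIds(groupToJobId map[string]uint64) error {
	data, err := json.Marshal(groupToJobId)
	if err != nil {
		return err
	}
	return os.WriteFile(stateFile, data, 0o600)
}

// LoadJobIds reads the mapping back after a restart; a missing file simply
// means there is no previous state to recover.
func LoadJobIds() (map[string]uint64, error) {
	groupToJobId := map[string]uint64{}
	data, err := os.ReadFile(stateFile)
	if os.IsNotExist(err) {
		return groupToJobId, nil
	}
	if err != nil {
		return nil, err
	}
	err = json.Unmarshal(data, &groupToJobId)
	return groupToJobId, err
}
```

The second option in the message (looking up a job by its group id on the Fluxion side) would avoid this bookkeeping entirely, which is why it is called out there as the better approach.
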
From f52e2092a8d15fb8ee32a8d270130897d88c0769 Mon Sep 17 00:00:00 2001
From: vsoch
Date: Thu, 7 Mar 2024 01:33:45 -0700
Subject: [PATCH 21/28] refactor: testing idea to wrap coscheduling

This is the "skeleton" of a new idea to wrap coscheduling, adding in the
logic for fluence only where it is needed, likely in the PodGroup (in the
new fluence/core/core that wraps the same in coscheduling). This is just
a skeleton because we are deploying the sidecar with the wrapped
scheduling and absolutely no logic ported over to AskFlux. I think I have
a sense of where to put this, but wanted to save this vanilla/skeleton
state in case we need to go back to it.

Note that it did not work to have fluence inherit the functions from
coscheduler, so I opted for a strategy of adding it as a helper field,
and then just using it when necessary.

Signed-off-by: vsoch
---
 README.md                                    |   6 +
 examples/pod-group/lammps/lammps2.yaml       |   4 +-
 examples/pod-group/lammps/lammps4-2.yaml     |   4 +-
 examples/pod-group/lammps/lammps4-3.yaml     |   4 +-
 examples/pod-group/lammps/lammps4.yaml       |   4 +-
 examples/pod-group/lammps/lammps5.yaml       |   4 +-
 examples/pod-group/lammps/lammps6.yaml       |   4 +-
 examples/test_example/fluence-sized-job.yaml |   2 +-
 sig-scheduler-plugins/cmd/scheduler/main.go  |   5 +-
 .../pkg/controllers/podgroup_controller.go   |  46 +-
 sig-scheduler-plugins/pkg/fluence/README.md  |  29 --
 .../pkg/fluence/core/core.go                 | 392 ++++++++++++-----
 sig-scheduler-plugins/pkg/fluence/events.go  | 166 -------
 sig-scheduler-plugins/pkg/fluence/fluence.go | 414 ++++++++----------
 .../pkg/fluence/labels/labels.go             |  40 +-
 15 files changed, 556 insertions(+), 568 deletions(-)
 delete mode 100644 sig-scheduler-plugins/pkg/fluence/README.md
 delete mode 100644 sig-scheduler-plugins/pkg/fluence/events.go

diff --git a/README.md b/README.md
index ae420fd..89f2a18 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,12 @@ Fluence enables HPC-grade pod scheduling in Kubernetes via the [Kubernetes Sched
 
 **Important** Fluence does not currently support use in conjunction with the kube-scheduler. Pods must all be scheduled by Fluence, and *you should not use both schedulers in the same cluster*.
 
+## TODO
+
+- Need to list pods, get their state, and if the group is completed, cancel the job id.
+- Keep track of the state of all pods in a group; when all of the pods are completed, then issue the cancel.
+- Calculate on the fly: on the update event we want to loop through the pods, and if ALL are completed, then delete the podid for fluence.
+
 ## Getting started
 
 For instructions on how to start Fluence on a K8s cluster, see [examples](examples/). Documentation and instructions for reproducing our CANOPIE-2022 paper (citation below) can be found in the [canopie22-artifacts branch](https://github.com/flux-framework/flux-k8s/tree/canopie22-artifacts).
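The TODO list added to the README above describes issuing a cancel to fluxion once every pod in a group has completed. Below is a minimal sketch of that check, outside of the patch itself, assuming the caller already has the group's pods in hand and a cancel helper along the lines of cancelFluxJob from the previous patch; the function names and the Fluence receiver are illustrative, not the final fluence API.

```go
// Illustrative only: decide whether a pod group is finished and, if so,
// cancel the corresponding Fluxion job for the group.
package fluence

import (
	v1 "k8s.io/api/core/v1"
)

// groupIsComplete returns true when every pod in the group has reached a
// terminal phase (Succeeded or Failed).
func groupIsComplete(pods []v1.Pod) bool {
	if len(pods) == 0 {
		return false
	}
	for _, pod := range pods {
		if pod.Status.Phase != v1.PodSucceeded && pod.Status.Phase != v1.PodFailed {
			return false
		}
	}
	return true
}

// cancelIfComplete would run on a pod update event: loop through the group's
// pods and only issue the cancel (deleting the saved jobid) when all are done.
// cancelFluxJob here refers to the helper shown in the previous patch.
func (f *Fluence) cancelIfComplete(groupName string, pods []v1.Pod) error {
	if !groupIsComplete(pods) {
		return nil
	}
	return f.cancelFluxJob(groupName)
}
```
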
diff --git a/examples/pod-group/lammps/lammps2.yaml b/examples/pod-group/lammps/lammps2.yaml index 5cc7535..5a83c97 100644 --- a/examples/pod-group/lammps/lammps2.yaml +++ b/examples/pod-group/lammps/lammps2.yaml @@ -14,6 +14,6 @@ spec: command: lmp -v x 1 -v y 1 -v z 1 -in in.reaxc.hns -nocite resources: limits: - cpu: 2 + cpu: 10 requests: - cpu: 2 \ No newline at end of file + cpu: 10 diff --git a/examples/pod-group/lammps/lammps4-2.yaml b/examples/pod-group/lammps/lammps4-2.yaml index 777e73c..6b647bc 100644 --- a/examples/pod-group/lammps/lammps4-2.yaml +++ b/examples/pod-group/lammps/lammps4-2.yaml @@ -17,6 +17,6 @@ spec: command: lmp -v x 1 -v y 1 -v z 1 -in in.reaxc.hns -nocite resources: limits: - cpu: 2 + cpu: 10 requests: - cpu: 2 \ No newline at end of file + cpu: 10 diff --git a/examples/pod-group/lammps/lammps4-3.yaml b/examples/pod-group/lammps/lammps4-3.yaml index 76c5ed0..b182751 100644 --- a/examples/pod-group/lammps/lammps4-3.yaml +++ b/examples/pod-group/lammps/lammps4-3.yaml @@ -17,6 +17,6 @@ spec: command: lmp -v x 1 -v y 1 -v z 1 -in in.reaxc.hns -nocite resources: limits: - cpu: 2 + cpu: 10 requests: - cpu: 2 \ No newline at end of file + cpu: 10 diff --git a/examples/pod-group/lammps/lammps4.yaml b/examples/pod-group/lammps/lammps4.yaml index 38ae0a7..9420902 100644 --- a/examples/pod-group/lammps/lammps4.yaml +++ b/examples/pod-group/lammps/lammps4.yaml @@ -18,6 +18,6 @@ spec: command: lmp -v x 1 -v y 1 -v z 1 -in in.reaxc.hns -nocite resources: limits: - cpu: 2 + cpu: 10 requests: - cpu: 2 \ No newline at end of file + cpu: 10 diff --git a/examples/pod-group/lammps/lammps5.yaml b/examples/pod-group/lammps/lammps5.yaml index 7546b48..e85299f 100644 --- a/examples/pod-group/lammps/lammps5.yaml +++ b/examples/pod-group/lammps/lammps5.yaml @@ -17,6 +17,6 @@ spec: command: lmp -v x 1 -v y 1 -v z 1 -in in.reaxc.hns -nocite resources: limits: - cpu: 2 + cpu: 10 requests: - cpu: 2 \ No newline at end of file + cpu: 10 diff --git a/examples/pod-group/lammps/lammps6.yaml b/examples/pod-group/lammps/lammps6.yaml index 2030192..14ebae3 100644 --- a/examples/pod-group/lammps/lammps6.yaml +++ b/examples/pod-group/lammps/lammps6.yaml @@ -17,6 +17,6 @@ spec: command: lmp -v x 1 -v y 1 -v z 1 -in in.reaxc.hns -nocite resources: limits: - cpu: 2 + cpu: 10 requests: - cpu: 2 \ No newline at end of file + cpu: 10 diff --git a/examples/test_example/fluence-sized-job.yaml b/examples/test_example/fluence-sized-job.yaml index a195d87..d1e7556 100644 --- a/examples/test_example/fluence-sized-job.yaml +++ b/examples/test_example/fluence-sized-job.yaml @@ -11,6 +11,6 @@ spec: containers: - name: fluence-job image: busybox - command: [echo, potato] + command: [sleep, "20"] restartPolicy: Never backoffLimit: 4 diff --git a/sig-scheduler-plugins/cmd/scheduler/main.go b/sig-scheduler-plugins/cmd/scheduler/main.go index d9a580a..2b21d28 100644 --- a/sig-scheduler-plugins/cmd/scheduler/main.go +++ b/sig-scheduler-plugins/cmd/scheduler/main.go @@ -26,6 +26,7 @@ import ( "sigs.k8s.io/scheduler-plugins/pkg/capacityscheduling" "sigs.k8s.io/scheduler-plugins/pkg/coscheduling" + "sigs.k8s.io/scheduler-plugins/pkg/fluence" "sigs.k8s.io/scheduler-plugins/pkg/networkaware/networkoverhead" "sigs.k8s.io/scheduler-plugins/pkg/networkaware/topologicalsort" "sigs.k8s.io/scheduler-plugins/pkg/noderesources" @@ -36,7 +37,7 @@ import ( "sigs.k8s.io/scheduler-plugins/pkg/trimaran/loadvariationriskbalancing" "sigs.k8s.io/scheduler-plugins/pkg/trimaran/lowriskovercommitment" 
"sigs.k8s.io/scheduler-plugins/pkg/trimaran/targetloadpacking" - "sigs.k8s.io/scheduler-plugins/pkg/fluence" + // Ensure scheme package is initialized. _ "sigs.k8s.io/scheduler-plugins/apis/config/scheme" ) @@ -56,8 +57,6 @@ func main() { app.WithPlugin(preemptiontoleration.Name, preemptiontoleration.New), app.WithPlugin(targetloadpacking.Name, targetloadpacking.New), app.WithPlugin(lowriskovercommitment.Name, lowriskovercommitment.New), - // Sample plugins below. - // app.WithPlugin(crossnodepreemption.Name, crossnodepreemption.New), app.WithPlugin(podstate.Name, podstate.New), app.WithPlugin(qos.Name, qos.New), app.WithPlugin(fluence.Name, fluence.New), diff --git a/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go b/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go index 73b7d2d..27c31cb 100644 --- a/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go +++ b/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go @@ -58,10 +58,8 @@ type PodGroupReconciler struct { // Reconcile is part of the main kubernetes reconciliation loop which aims to // move the current state of the cluster closer to the desired state. -// TODO(user): Modify the Reconcile function to compare the state specified by -// the PodGroup object against the actual cluster state, and then -// perform operations to make the cluster state reflect the state specified by -// the user. +// Note that we currently don't do deletion based on owner references, but that +// would be ideal (I could not get it to work) // // For more details, check Reconcile and its Result here: // - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.11.0/pkg/reconcile @@ -82,6 +80,7 @@ func (r *PodGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c log.Error(err, fmt.Sprintf("Unable to retrieve pod group %s", req.NamespacedName)) return ctrl.Result{}, err } + log.Info("REFERENCES", "Reconciler", pg.ObjectMeta.OwnerReferences) // Grab all statuses (and groups of them) we are interested in schedulingOrPending := (pg.Status.Phase == schedv1alpha1.PodGroupScheduling || pg.Status.Phase == schedv1alpha1.PodGroupPending) @@ -175,35 +174,32 @@ func (r *PodGroupReconciler) updateStatus( pods []v1.Pod, ) (ctrl.Result, error) { + log := log.FromContext(ctx) patch := client.MergeFrom(pg.DeepCopy()) + log.Info("PodGroup", "Phase", pg.Status.Phase) switch pg.Status.Phase { case "": pg.Status.Phase = schedv1alpha1.PodGroupPending - result, err := r.updateOwnerReferences(ctx, pg, pods) - if result.Requeue || err != nil { - return result, err - } case schedv1alpha1.PodGroupPending: if len(pods) >= int(pg.Spec.MinMember) { + log.Info("PodGroup", "Phase", "Scheduling") pg.Status.Phase = schedv1alpha1.PodGroupScheduling - result, err := r.updateOwnerReferences(ctx, pg, pods) - if result.Requeue || err != nil { - return result, err - } } default: - // Get updated counts of running, succeeded, and failed pods - running, succeeded, failed := getCurrentPodStats(pods) - // If for some reason we weren't pending and now have fewer than min required, flip back to pending if len(pods) < int(pg.Spec.MinMember) { + log.Info("PodGroup", "Phase", "Length of pods less than min member, pending") pg.Status.Phase = schedv1alpha1.PodGroupPending break } + // Get updated counts of running, succeeded, and failed pods + running, succeeded, failed := getCurrentPodStats(pods) + log.Info("PodGroup", "Running", running, "Succeeded", succeeded, "Failed", failed) + // A pod with succeeded + running STILL less than the minimum required is 
scheduling if succeeded+running < pg.Spec.MinMember { pg.Status.Phase = schedv1alpha1.PodGroupScheduling @@ -232,16 +228,18 @@ func (r *PodGroupReconciler) updateStatus( } // Apply the patch to update, or delete if finished - // TODO would be better if owner references took here, so delete on owner deletion - // TODO deletion is not currently handled for Deployment, ReplicaSet, StatefulSet - // as they are expected to persist. You can delete / lose and bring up again var err error if pg.Status.Phase == schedv1alpha1.PodGroupFinished || pg.Status.Phase == schedv1alpha1.PodGroupFailed { + log.Info("PodGroup", "Status", "Finished", "Owners", pg.OwnerReferences) + + // Delete the group if it is finished or failed err = r.Delete(ctx, pg) - } else { - r.Status().Update(ctx, pg) - err = r.Patch(ctx, pg, patch) + // Update but don't requeue + // _, err := r.updateOwnerReferences(ctx, pg, pods) + return ctrl.Result{}, err } + r.Status().Update(ctx, pg) + err = r.Patch(ctx, pg, patch) return ctrl.Result{Requeue: true}, err } @@ -366,21 +364,25 @@ func (r *PodGroupReconciler) updateOwnerReferences( return result, nil } - // Collect owner references for pod group + // Collect current owner references for pod group, + // We want to ensure we add unique ones across the pod owners := []metav1.OwnerReference{} var refs []string for _, ownerRef := range pod.OwnerReferences { refs = append(refs, fmt.Sprintf("%s/%s", pod.Namespace, ownerRef.Name)) owners = append(owners, ownerRef) } + patch := client.MergeFrom(pg.DeepCopy()) if len(refs) != 0 { sort.Strings(refs) pg.Status.OccupiedBy = strings.Join(refs, ",") } + // If we have owners, collapose into list if len(owners) > 0 { pg.ObjectMeta.OwnerReferences = owners } + // Apply the patch to update the size r.Status().Update(ctx, pg) err := r.Patch(ctx, pg, patch) diff --git a/sig-scheduler-plugins/pkg/fluence/README.md b/sig-scheduler-plugins/pkg/fluence/README.md deleted file mode 100644 index 61f4923..0000000 --- a/sig-scheduler-plugins/pkg/fluence/README.md +++ /dev/null @@ -1,29 +0,0 @@ -# Overview - -Project to manage Flux tasks needed to standardize kubernetes HPC scheduling interfaces - -## Installing the chart - -More detail will be added here about installing the chart. You will -be using the [install-as-a-second-scheduler](https://github.com/kubernetes-sigs/scheduler-plugins/tree/master/manifests/install/charts/as-a-second-scheduler) -charts. Fluence-specific values are detailed below. - -### Fluence specific values - -In `values.yaml` it is possible to customize the container image, already defaulted to the latest release, and the allocation policy -used by the scheduler. -Most common options are: - -- `lonode`: choose the nodes with lower ID first. Can be compared to packing -- `low`: choose cores with lowest IDs from multiple nodes. Can be compared to spread process-to-resource placement - -## Maturity Level - - - -- [x] Sample (for demonstrating and inspiring purpose) -- [ ] Alpha (used in companies for pilot projects) -- [ ] Beta (used in companies and developed actively) -- [ ] Stable (used in companies for production workloads) - - diff --git a/sig-scheduler-plugins/pkg/fluence/core/core.go b/sig-scheduler-plugins/pkg/fluence/core/core.go index a3f4531..efa1127 100644 --- a/sig-scheduler-plugins/pkg/fluence/core/core.go +++ b/sig-scheduler-plugins/pkg/fluence/core/core.go @@ -1,161 +1,329 @@ +/* +Copyright 2020 The Kubernetes Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package core import ( + "context" "fmt" + "sync" + "time" - klog "k8s.io/klog/v2" - + gochache "github.com/patrickmn/go-cache" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/types" + informerv1 "k8s.io/client-go/informers/core/v1" + listerv1 "k8s.io/client-go/listers/core/v1" + "k8s.io/klog/v2" "k8s.io/kubernetes/pkg/scheduler/framework" - pb "sigs.k8s.io/scheduler-plugins/pkg/fluence/fluxcli-grpc" + "sigs.k8s.io/controller-runtime/pkg/client" + + "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1" + "sigs.k8s.io/scheduler-plugins/pkg/util" ) -// FluxStateData is a CycleState -// It holds the PodCache for a pod, which has node assignment, group, and group size -// We also save the group name and size, and time created, in case we want to (somehow) resume scheduling -// In practice I'm not sure how CycleState objects are dumped and loaded. Kueue has a dumper :P -// https://github.com/kubernetes/enhancements/blob/master/keps/sig-scheduling/624-scheduling-framework/README.md#cyclestate -type FluxStateData struct { - NodeCache NodeCache +type Status string + +const ( + // PodGroupNotSpecified denotes no PodGroup is specified in the Pod spec. + PodGroupNotSpecified Status = "PodGroup not specified" + // PodGroupNotFound denotes the specified PodGroup in the Pod spec is + // not found in API server. + PodGroupNotFound Status = "PodGroup not found" + Success Status = "Success" + Wait Status = "Wait" +) + +// Manager defines the interfaces for PodGroup management. +type Manager interface { + PreFilter(context.Context, *corev1.Pod) error + Permit(context.Context, *corev1.Pod) Status + GetPodGroup(context.Context, *corev1.Pod) (string, *v1alpha1.PodGroup) + GetCreationTimestamp(*corev1.Pod, time.Time) time.Time + DeletePermittedPodGroup(string) + CalculateAssignedPods(string, string) int + ActivateSiblings(pod *corev1.Pod, state *framework.CycleState) + BackoffPodGroup(string, time.Duration) } -// Clone is required for CycleState plugins -func (s *FluxStateData) Clone() framework.StateData { - return &FluxStateData{NodeCache: s.NodeCache} +// PodGroupManager defines the scheduling operation called +type PodGroupManager struct { + // client is a generic controller-runtime client to manipulate both core resources and PodGroups. + client client.Client + // snapshotSharedLister is pod shared list + snapshotSharedLister framework.SharedLister + // scheduleTimeout is the default timeout for podgroup scheduling. + // If podgroup's scheduleTimeoutSeconds is set, it will be used. + scheduleTimeout *time.Duration + // permittedPG stores the podgroup name which has passed the pre resource check. + permittedPG *gochache.Cache + // backedOffPG stores the podgorup name which failed scheudling recently. 
+ backedOffPG *gochache.Cache + // podLister is pod lister + podLister listerv1.PodLister + sync.RWMutex } -// NewFluxState creates an entry for the CycleState with the node and group name -func NewFluxState(nodeName string, groupName string) *FluxStateData { - cache := NodeCache{NodeName: nodeName} - return &FluxStateData{NodeCache: cache} +// NewPodGroupManager creates a new operation object. +func NewPodGroupManager(client client.Client, snapshotSharedLister framework.SharedLister, scheduleTimeout *time.Duration, podInformer informerv1.PodInformer) *PodGroupManager { + pgMgr := &PodGroupManager{ + client: client, + snapshotSharedLister: snapshotSharedLister, + scheduleTimeout: scheduleTimeout, + podLister: podInformer.Lister(), + permittedPG: gochache.New(3*time.Second, 3*time.Second), + backedOffPG: gochache.New(10*time.Second, 10*time.Second), + } + return pgMgr } -// NodeCache holds the node name and tasks for the node -// For the PodGroupCache, these are organized by group name, -// and there is a list of them -type NodeCache struct { - NodeName string +func (pgMgr *PodGroupManager) BackoffPodGroup(pgName string, backoff time.Duration) { + if backoff == time.Duration(0) { + return + } + pgMgr.backedOffPG.Add(pgName, nil, backoff) +} - // Tie assignment back to PodGroup, which can be used to get size and time created - GroupName string +// ActivateSiblings stashes the pods belonging to the same PodGroup of the given pod +// in the given state, with a reserved key "kubernetes.io/pods-to-activate". +func (pgMgr *PodGroupManager) ActivateSiblings(pod *corev1.Pod, state *framework.CycleState) { + pgName := util.GetPodGroupLabel(pod) + if pgName == "" { + return + } - // Assigned tasks (often pods) to nodes - // https://github.com/flux-framework/flux-k8s/blob/9f24f36752e3cced1b1112d93bfa366fb58b3c84/src/fluence/fluxion/fluxion.go#L94-L97 - AssignedTasks int -} + pods, err := pgMgr.podLister.Pods(pod.Namespace).List( + labels.SelectorFromSet(labels.Set{v1alpha1.PodGroupLabel: pgName}), + ) + if err != nil { + klog.ErrorS(err, "Failed to obtain pods belong to a PodGroup", "podGroup", pgName) + return + } -// A pod group cache holds a list of nodes for an allocation, where each has some number of tasks -// along with the expected group size. This is intended to replace PodGroup -// given the group name, size (derived from annotations) and timestamp -type PodGroupCache struct { - GroupName string + for i := range pods { + if pods[i].UID == pod.UID { + pods = append(pods[:i], pods[i+1:]...) + break + } + } - // This is a cache of nodes for pods - Nodes []NodeCache + if len(pods) != 0 { + if c, err := state.Read(framework.PodsToActivateKey); err == nil { + if s, ok := c.(*framework.PodsToActivate); ok { + s.Lock() + for _, pod := range pods { + namespacedName := GetNamespacedName(pod) + s.Map[namespacedName] = pod + } + s.Unlock() + } + } + } } -// PodGroups seen by fluence -var groupsSeen map[string]*PodGroupCache +// PreFilter filters out a pod if +// 1. it belongs to a podgroup that was recently denied or +// 2. the total number of pods in the podgroup is less than the minimum number of pods +// that is required to be scheduled. 
+func (pgMgr *PodGroupManager) PreFilter(ctx context.Context, pod *corev1.Pod) error { + klog.V(5).InfoS("Pre-filter", "pod", klog.KObj(pod)) + pgFullName, pg := pgMgr.GetPodGroup(ctx, pod) + if pg == nil { + return nil + } -// Init populates the groupsSeen cache -func Init() { - groupsSeen = map[string]*PodGroupCache{} -} + if _, exist := pgMgr.backedOffPG.Get(pgFullName); exist { + return fmt.Errorf("podGroup %v failed recently", pgFullName) + } -// GetFluenceCache determines if a group has been seen. -// Yes -> we return the PodGroupCache entry -// No -> the entry is nil / does not exist -func GetFluenceCache(groupName string) *PodGroupCache { - entry, _ := groupsSeen[groupName] - return entry -} + pods, err := pgMgr.podLister.Pods(pod.Namespace).List( + labels.SelectorFromSet(labels.Set{v1alpha1.PodGroupLabel: util.GetPodGroupLabel(pod)}), + ) + if err != nil { + return fmt.Errorf("podLister list pods failed: %w", err) + } -// DeletePodGroup deletes a pod from the group cache -func DeletePodGroup(groupName string) { - delete(groupsSeen, groupName) -} + if len(pods) < int(pg.Spec.MinMember) { + return fmt.Errorf("pre-filter pod %v cannot find enough sibling pods, "+ + "current pods number: %v, minMember of group: %v", pod.Name, len(pods), pg.Spec.MinMember) + } -// CreateNodePodsList creates a list of node pod caches -func CreateNodeList(nodelist []*pb.NodeAlloc, groupName string) (nodepods []NodeCache) { + if pg.Spec.MinResources == nil { + return nil + } - // Create a pod cache for each node - nodepods = make([]NodeCache, len(nodelist)) + // TODO(cwdsuzhou): This resource check may not always pre-catch unschedulable pod group. + // It only tries to PreFilter resource constraints so even if a PodGroup passed here, + // it may not necessarily pass Filter due to other constraints such as affinity/taints. + if _, ok := pgMgr.permittedPG.Get(pgFullName); ok { + return nil + } - // TODO: should we be integrating topology information here? Could it be the - // case that some nodes (pods) in the group should be closer? - for i, v := range nodelist { - nodepods[i] = NodeCache{ - NodeName: v.GetNodeID(), - AssignedTasks: int(v.GetTasks()), - GroupName: groupName, - } + nodes, err := pgMgr.snapshotSharedLister.NodeInfos().List() + if err != nil { + return err } - // Update the pods in the PodGroupCache (groupsSeen) - updatePodGroupCache(groupName, nodepods) - return nodepods + minResources := pg.Spec.MinResources.DeepCopy() + podQuantity := resource.NewQuantity(int64(pg.Spec.MinMember), resource.DecimalSI) + minResources[corev1.ResourcePods] = *podQuantity + err = CheckClusterResource(nodes, minResources, pgFullName) + if err != nil { + klog.ErrorS(err, "Failed to PreFilter", "podGroup", klog.KObj(pg)) + return err + } + pgMgr.permittedPG.Add(pgFullName, pgFullName, *pgMgr.scheduleTimeout) + return nil } -// updatePodGroupList updates the PodGroupCache with a listing of nodes -func updatePodGroupCache(groupName string, nodes []NodeCache) { - cache := PodGroupCache{ - Nodes: nodes, - GroupName: groupName, +// Permit permits a pod to run, if the minMember match, it would send a signal to chan. +func (pgMgr *PodGroupManager) Permit(ctx context.Context, pod *corev1.Pod) Status { + pgFullName, pg := pgMgr.GetPodGroup(ctx, pod) + if pgFullName == "" { + return PodGroupNotSpecified + } + if pg == nil { + // A Pod with a podGroup name but without a PodGroup found is denied. 
+ return PodGroupNotFound + } + + assigned := pgMgr.CalculateAssignedPods(pg.Name, pg.Namespace) + // The number of pods that have been assigned nodes is calculated from the snapshot. + // The current pod in not included in the snapshot during the current scheduling cycle. + if int32(assigned)+1 >= pg.Spec.MinMember { + return Success } - groupsSeen[groupName] = &cache + return Wait } -// GetNextNode gets the next node in the PodGroupCache -func (p *PodGroupCache) GetNextNode() (string, error) { +// GetCreationTimestamp returns the creation time of a podGroup or a pod. +func (pgMgr *PodGroupManager) GetCreationTimestamp(pod *corev1.Pod, ts time.Time) time.Time { + pgName := util.GetPodGroupLabel(pod) + if len(pgName) == 0 { + return ts + } + var pg v1alpha1.PodGroup + if err := pgMgr.client.Get(context.TODO(), types.NamespacedName{Namespace: pod.Namespace, Name: pgName}, &pg); err != nil { + return ts + } + return pg.CreationTimestamp.Time +} - nextnode := "" +// DeletePermittedPodGroup deletes a podGroup that passes Pre-Filter but reaches PostFilter. +func (pgMgr *PodGroupManager) DeletePermittedPodGroup(pgFullName string) { + pgMgr.permittedPG.Delete(pgFullName) +} - // Quick failure state - we ran out of nodes - if len(p.Nodes) == 0 { - return nextnode, fmt.Errorf("[Fluence] PodGroup %s ran out of nodes.", p.GroupName) +// GetPodGroup returns the PodGroup that a Pod belongs to in cache. +func (pgMgr *PodGroupManager) GetPodGroup(ctx context.Context, pod *corev1.Pod) (string, *v1alpha1.PodGroup) { + pgName := util.GetPodGroupLabel(pod) + if len(pgName) == 0 { + return "", nil + } + var pg v1alpha1.PodGroup + if err := pgMgr.client.Get(ctx, types.NamespacedName{Namespace: pod.Namespace, Name: pgName}, &pg); err != nil { + return fmt.Sprintf("%v/%v", pod.Namespace, pgName), nil } + return fmt.Sprintf("%v/%v", pod.Namespace, pgName), &pg +} - // The next is the 0th in the list - nextnode = p.Nodes[0].NodeName - klog.Infof("[Fluence] Next node for group %s is %s", p.GroupName, nextnode) +// CalculateAssignedPods returns the number of pods that has been assigned nodes: assumed or bound. +func (pgMgr *PodGroupManager) CalculateAssignedPods(podGroupName, namespace string) int { + nodeInfos, err := pgMgr.snapshotSharedLister.NodeInfos().List() + klog.Info(nodeInfos) + if err != nil { + klog.ErrorS(err, "Cannot get nodeInfos from frameworkHandle") + return 0 + } + var count int + for _, nodeInfo := range nodeInfos { + for _, podInfo := range nodeInfo.Pods { + pod := podInfo.Pod + if util.GetPodGroupLabel(pod) == podGroupName && pod.Namespace == namespace && pod.Spec.NodeName != "" { + count++ + } + } + } - // If there is only one task left, we are going to use it (and remove the node) - if p.Nodes[0].AssignedTasks == 1 { - klog.Infof("[Fluence] First node has one remaining task slot") - slice := p.Nodes[1:] + return count +} - // If after we remove the node there are no nodes left... - // Note that I'm not deleting the node from the cache because that is the - // only way fluence knows it has already assigned work (presence of the key) - if len(slice) == 0 { - klog.Infof("[Fluence] Assigning node %s. There are NO reamining nodes for group %s\n", nextnode, p.GroupName) - // delete(podGroupCache, groupName) - return nextnode, nil +// CheckClusterResource checks if resource capacity of the cluster can satisfy . +// It returns an error detailing the resource gap if not satisfied; otherwise returns nil. 
+func CheckClusterResource(nodeList []*framework.NodeInfo, resourceRequest corev1.ResourceList, desiredPodGroupName string) error { + for _, info := range nodeList { + if info == nil || info.Node() == nil { + continue } - klog.Infof("[Fluence] Assigning node %s. There are nodes left for group", nextnode, p.GroupName) - updatePodGroupCache(p.GroupName, slice) - return nextnode, nil + nodeResource := util.ResourceList(getNodeResource(info, desiredPodGroupName)) + for name, quant := range resourceRequest { + quant.Sub(nodeResource[name]) + if quant.Sign() <= 0 { + delete(resourceRequest, name) + continue + } + resourceRequest[name] = quant + } + if len(resourceRequest) == 0 { + return nil + } } + return fmt.Errorf("resource gap: %v", resourceRequest) +} - // If we get here the first node had >1 assigned tasks - klog.Infof("[Fluence] Assigning node %s for group %s. There are still task assignments available for this node.", nextnode, p.GroupName) - p.Nodes[0].AssignedTasks = p.Nodes[0].AssignedTasks - 1 - return nextnode, nil +// GetNamespacedName returns the namespaced name. +func GetNamespacedName(obj metav1.Object) string { + return fmt.Sprintf("%v/%v", obj.GetNamespace(), obj.GetName()) } -// GetNextNode gets the next available node we can allocate for a group -// TODO this should be able to take and pass forward a number of tasks. -// It is implicity 1 now, but doesn't have to be. -func GetNextNode(groupName string) (string, error) { +func getNodeResource(info *framework.NodeInfo, desiredPodGroupName string) *framework.Resource { + nodeClone := info.Clone() + for _, podInfo := range info.Pods { + if podInfo == nil || podInfo.Pod == nil { + continue + } + if util.GetPodGroupFullName(podInfo.Pod) != desiredPodGroupName { + continue + } + nodeClone.RemovePod(podInfo.Pod) + } - // Get our entry from the groupsSeen cache - klog.Infof("[Fluence] groups seen %s", groupsSeen) - entry, ok := groupsSeen[groupName] + leftResource := framework.Resource{ + ScalarResources: make(map[corev1.ResourceName]int64), + } + allocatable := nodeClone.Allocatable + requested := nodeClone.Requested + + leftResource.AllowedPodNumber = allocatable.AllowedPodNumber - len(nodeClone.Pods) + leftResource.MilliCPU = allocatable.MilliCPU - requested.MilliCPU + leftResource.Memory = allocatable.Memory - requested.Memory + leftResource.EphemeralStorage = allocatable.EphemeralStorage - requested.EphemeralStorage - // This case should not happen - if !ok { - return "", fmt.Errorf("[Fluence] Map is empty") + for k, allocatableEx := range allocatable.ScalarResources { + requestEx, ok := requested.ScalarResources[k] + if !ok { + leftResource.ScalarResources[k] = allocatableEx + } else { + leftResource.ScalarResources[k] = allocatableEx - requestEx + } } - // Get the next node from the PodGroupCache - return entry.GetNextNode() + klog.V(4).InfoS("Node left resource", "node", klog.KObj(info.Node()), "resource", leftResource) + return &leftResource } diff --git a/sig-scheduler-plugins/pkg/fluence/events.go b/sig-scheduler-plugins/pkg/fluence/events.go deleted file mode 100644 index b891713..0000000 --- a/sig-scheduler-plugins/pkg/fluence/events.go +++ /dev/null @@ -1,166 +0,0 @@ -package fluence - -import ( - "context" - "time" - - "google.golang.org/grpc" - v1 "k8s.io/api/core/v1" - klog "k8s.io/klog/v2" - - pb "sigs.k8s.io/scheduler-plugins/pkg/fluence/fluxcli-grpc" - fgroup "sigs.k8s.io/scheduler-plugins/pkg/fluence/group" -) - -// Events are associated with inforers, typically on pods, e.g., -// delete: deletion of a pod -// 
update: update of a pod! -// For both of the above, there are cases to cancel the flux job -// associated with the group id - -// cancelFluxJobForPod cancels the flux job for a pod. -// We assume that the cancelled job also means deleting the pod group -func (f *Fluence) cancelFluxJob(groupName string) error { - - // TODO: it's a bit risky to store state here, because if the scheduler - // restarts we cannot look up the jobid, and then cannot cancel it. - // There is no way to request cancelling the job for a specific group - jobid, ok := f.groupToJobId[groupName] - - // The job was already cancelled by another pod - if !ok { - klog.Infof("[Fluence] Request for cancel of group %s is already complete.", groupName) - return nil - } - klog.Infof("[Fluence] Cancel flux job: %v for group %s", jobid, groupName) - - // This first error is about connecting to the server - conn, err := grpc.Dial("127.0.0.1:4242", grpc.WithInsecure()) - if err != nil { - klog.Errorf("[Fluence] Error connecting to server: %v", err) - return err - } - defer conn.Close() - - grpcclient := pb.NewFluxcliServiceClient(conn) - _, cancel := context.WithTimeout(context.Background(), 200*time.Second) - defer cancel() - - // This error reflects the success or failure of the cancel request - request := &pb.CancelRequest{JobID: int64(jobid)} - res, err := grpcclient.Cancel(context.Background(), request) - if err != nil { - klog.Errorf("[Fluence] did not receive any cancel response: %v", err) - return err - } - klog.Infof("[Fluence] Job cancellation for group %s result: %d", groupName, res.Error) - - // And this error is if the cancel was successful or not - if res.Error == 0 { - klog.Infof("[Fluence] Successful cancel of flux job: %d for group %s", jobid, groupName) - delete(f.groupToJobId, groupName) - } else { - klog.Warningf("[Fluence] Failed to cancel flux job %d for group %s", jobid, groupName) - } - return nil -} - -// updatePod is called on an update, and the old and new object are presented -func (f *Fluence) updatePod(oldObj, newObj interface{}) { - - oldPod := oldObj.(*v1.Pod) - newPod := newObj.(*v1.Pod) - - // a pod is updated, get the group - // TODO should we be checking group / size for old vs new? - groupName, pg := f.pgMgr.GetPodGroup(context.TODO(), oldPod) - - // If PodGroup is nil, still try to look up a faux name - if pg == nil { - pg = fgroup.CreateFakeGroup(oldPod) - groupName = pg.Name - } - - klog.Infof("[Fluence] Processing event for pod %s in group %s from %s to %s", newPod.Name, groupName, newPod.Status.Phase, oldPod.Status.Phase) - - switch newPod.Status.Phase { - case v1.PodPending: - // in this state we don't know if a pod is going to be running, thus we don't need to update job map - case v1.PodRunning: - // if a pod is start running, we can add it state to the delta graph if it is scheduled by other scheduler - case v1.PodSucceeded: - klog.Infof("[Fluence] Pod %s succeeded, Fluence needs to free the resources", newPod.Name) - - f.mutex.Lock() - defer f.mutex.Unlock() - - // Do we have the group id in our cache? 
If yes, we haven't deleted the jobid yet - // I am worried here that if some pods are succeeded and others pending, this could - // be a mistake - fluence would schedule it again - _, ok := f.groupToJobId[groupName] - if ok { - f.cancelFluxJob(groupName) - } else { - klog.Infof("[Fluence] Succeeded pod %s/%s in group %s doesn't have flux jobid", newPod.Namespace, newPod.Name, groupName) - } - - case v1.PodFailed: - - // a corner case need to be tested, the pod exit code is not 0, can be created with segmentation fault pi test - klog.Warningf("[Fluence] Pod %s in group %s failed, Fluence needs to free the resources", newPod.Name, groupName) - - f.mutex.Lock() - defer f.mutex.Unlock() - - _, ok := f.groupToJobId[groupName] - if ok { - f.cancelFluxJob(groupName) - } else { - klog.Errorf("[Fluence] Failed pod %s/%s in group %s doesn't have flux jobid", newPod.Namespace, newPod.Name, groupName) - } - case v1.PodUnknown: - // don't know how to deal with it as it's unknown phase - default: - // shouldn't enter this branch - } -} - -// deletePod handles the delete event handler -func (f *Fluence) deletePod(podObj interface{}) { - klog.Info("[Fluence] Delete Pod event handler") - pod := podObj.(*v1.Pod) - groupName, pg := f.pgMgr.GetPodGroup(context.TODO(), pod) - - // If PodGroup is nil, still try to look up a faux name - if pg == nil { - pg = fgroup.CreateFakeGroup(pod) - groupName = pg.Name - } - - klog.Infof("[Fluence] Delete pod %s in group %s has status %s", pod.Status.Phase, pod.Name, groupName) - switch pod.Status.Phase { - case v1.PodSucceeded: - case v1.PodPending: - klog.Infof("[Fluence] Pod %s completed and is Pending termination, Fluence needs to free the resources", pod.Name) - - f.mutex.Lock() - defer f.mutex.Unlock() - - _, ok := f.groupToJobId[groupName] - if ok { - f.cancelFluxJob(groupName) - } else { - klog.Infof("[Fluence] Terminating pod %s/%s in group %s doesn't have flux jobid", pod.Namespace, pod.Name, groupName) - } - case v1.PodRunning: - f.mutex.Lock() - defer f.mutex.Unlock() - - _, ok := f.groupToJobId[groupName] - if ok { - f.cancelFluxJob(groupName) - } else { - klog.Infof("[Fluence] Deleted pod %s/%s in group %s doesn't have flux jobid", pod.Namespace, pod.Name, groupName) - } - } -} diff --git a/sig-scheduler-plugins/pkg/fluence/fluence.go b/sig-scheduler-plugins/pkg/fluence/fluence.go index 33976ae..1ad1fd3 100644 --- a/sig-scheduler-plugins/pkg/fluence/fluence.go +++ b/sig-scheduler-plugins/pkg/fluence/fluence.go @@ -1,124 +1,140 @@ +/* +Copyright 2020 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + package fluence import ( "context" "fmt" - "os" "sync" "time" - "google.golang.org/grpc" v1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/fields" + "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime" - "k8s.io/client-go/informers" clientscheme "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/tools/cache" + + fgroup "sigs.k8s.io/scheduler-plugins/pkg/fluence/group" + label "sigs.k8s.io/scheduler-plugins/pkg/fluence/labels" + corev1helpers "k8s.io/component-helpers/scheduling/corev1" - klog "k8s.io/klog/v2" + "k8s.io/klog/v2" "k8s.io/kubernetes/pkg/scheduler/framework" - "sigs.k8s.io/controller-runtime/pkg/client" - sched "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1" - coschedulingcore "sigs.k8s.io/scheduler-plugins/pkg/coscheduling/core" + "sigs.k8s.io/scheduler-plugins/pkg/util" + + "sigs.k8s.io/scheduler-plugins/apis/config" + "sigs.k8s.io/scheduler-plugins/apis/scheduling" + "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1" fcore "sigs.k8s.io/scheduler-plugins/pkg/fluence/core" - pb "sigs.k8s.io/scheduler-plugins/pkg/fluence/fluxcli-grpc" - fgroup "sigs.k8s.io/scheduler-plugins/pkg/fluence/group" - "sigs.k8s.io/scheduler-plugins/pkg/fluence/utils" ) +// Fluence schedules pods in a group using Fluxion as a backend +// We inherit cosched.Coscheduling to use some of the primary functions type Fluence struct { mutex sync.Mutex - handle framework.Handle client client.Client // Store jobid on the level of a group (which can be a single pod) groupToJobId map[string]uint64 - pgMgr coschedulingcore.Manager -} -// Name is the name of the plugin used in the Registry and configurations. -// Note that this would do better as an annotation (fluence.flux-framework.org/pod-group) -// But we cannot use them as selectors then! -const ( - Name = "Fluence" -) + frameworkHandler framework.Handle + pgMgr fcore.Manager + scheduleTimeout *time.Duration + pgBackoff *time.Duration +} var ( - _ framework.QueueSortPlugin = &Fluence{} - _ framework.PreFilterPlugin = &Fluence{} - _ framework.FilterPlugin = &Fluence{} + _ framework.QueueSortPlugin = &Fluence{} + _ framework.PreFilterPlugin = &Fluence{} + _ framework.PostFilterPlugin = &Fluence{} // Here down are from coscheduling + _ framework.PermitPlugin = &Fluence{} + _ framework.ReservePlugin = &Fluence{} + _ framework.EnqueueExtensions = &Fluence{} ) -func (f *Fluence) Name() string { - return Name -} +const ( + // Name is the name of the plugin used in Registry and configurations. 
+ Name = "Fluence" +) // Initialize and return a new Fluence Custom Scheduler Plugin -// This class and functions are analogous to: -// https://github.com/kubernetes-sigs/scheduler-plugins/blob/master/pkg/coscheduling/coscheduling.go#L63 -func New(_ runtime.Object, handle framework.Handle) (framework.Plugin, error) { +func New(obj runtime.Object, handle framework.Handle) (framework.Plugin, error) { - f := &Fluence{handle: handle, groupToJobId: make(map[string]uint64)} - - ctx := context.TODO() - fcore.Init() - - fluxPodsInformer := handle.SharedInformerFactory().Core().V1().Pods().Informer() - fluxPodsInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ - UpdateFunc: f.updatePod, - DeleteFunc: f.deletePod, - }) - - go fluxPodsInformer.Run(ctx.Done()) + // Keep these empty for now, use defaults + args := config.CoschedulingArgs{} scheme := runtime.NewScheme() - clientscheme.AddToScheme(scheme) - v1.AddToScheme(scheme) - sched.AddToScheme(scheme) - k8scli, err := client.New(handle.KubeConfig(), client.Options{Scheme: scheme}) - if err != nil { - return nil, err - } - - // Save the kubernetes client for fluence to interact with cluster objects - f.client = k8scli + _ = clientscheme.AddToScheme(scheme) + _ = v1.AddToScheme(scheme) + _ = v1alpha1.AddToScheme(scheme) - fieldSelector, err := fields.ParseSelector(",status.phase!=" + string(v1.PodSucceeded) + ",status.phase!=" + string(v1.PodFailed)) + client, err := client.New(handle.KubeConfig(), client.Options{Scheme: scheme}) if err != nil { - klog.Errorf("ParseSelector failed %s", err) - os.Exit(1) + return nil, err } - informerFactory := informers.NewSharedInformerFactoryWithOptions(handle.ClientSet(), 0, informers.WithTweakListOptions(func(opt *metav1.ListOptions) { - opt.FieldSelector = fieldSelector.String() - })) - podInformer := informerFactory.Core().V1().Pods() - scheduleTimeDuration := time.Duration(500) * time.Second + // Performance improvement when retrieving list of objects by namespace or we'll log 'index not exist' warning. + handle.SharedInformerFactory().Core().V1().Pods().Informer().AddIndexers(cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}) - // https://github.com/kubernetes-sigs/scheduler-plugins/blob/master/pkg/coscheduling/core/core.go#L84 - pgMgr := coschedulingcore.NewPodGroupManager( - k8scli, + // PermitWaitingTimeSeconds is the waiting timeout in seconds. + scheduleTimeDuration := time.Duration(args.PermitWaitingTimeSeconds) * time.Second + pgMgr := fcore.NewPodGroupManager( + client, handle.SnapshotSharedLister(), &scheduleTimeDuration, - podInformer, + // Keep the podInformer (from frameworkHandle) as the single source of Pods. + handle.SharedInformerFactory().Core().V1().Pods(), ) - f.pgMgr = pgMgr - // stopCh := make(chan struct{}) - // defer close(stopCh) - // informerFactory.Start(stopCh) - informerFactory.Start(ctx.Done()) + // The main difference here is adding the groupToJobId lookup + plugin := &Fluence{ + frameworkHandler: handle, + pgMgr: pgMgr, + scheduleTimeout: &scheduleTimeDuration, + groupToJobId: make(map[string]uint64), + } - if !cache.WaitForCacheSync(ctx.Done(), podInformer.Informer().HasSynced) { - err := fmt.Errorf("WaitForCacheSync failed") - klog.ErrorS(err, "Cannot sync caches") + // PodGroupBackoffSeconds: backoff time in seconds before a pod group can be scheduled again. 
+ if args.PodGroupBackoffSeconds < 0 { + err := fmt.Errorf("parse arguments failed") + klog.ErrorS(err, "PodGroupBackoffSeconds cannot be negative") return nil, err + } else if args.PodGroupBackoffSeconds > 0 { + pgBackoff := time.Duration(args.PodGroupBackoffSeconds) * time.Second + plugin.pgBackoff = &pgBackoff } + return plugin, nil +} + +func (f *Fluence) Name() string { + return Name +} - klog.Info("Fluence scheduler plugin started") - return f, nil +// Fluence has added delete, although I wonder if update includes that signal +// and it's redundant? +func (f *Fluence) EventsToRegister() []framework.ClusterEventWithHint { + // To register a custom event, follow the naming convention at: + // https://git.k8s.io/kubernetes/pkg/scheduler/eventhandlers.go#L403-L410 + pgGVK := fmt.Sprintf("podgroups.v1alpha1.%v", scheduling.GroupName) + return []framework.ClusterEventWithHint{ + {Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.Add | framework.Delete}}, + {Event: framework.ClusterEvent{Resource: framework.GVK(pgGVK), ActionType: framework.Add | framework.Update | framework.Delete}}, + } } // Less is used to sort pods in the scheduling queue in the following order. @@ -147,177 +163,131 @@ func (f *Fluence) Less(podInfo1, podInfo2 *framework.QueuedPodInfo) bool { // If they are the same, fall back to sorting by name. if creationTime1.Equal(&creationTime2) { - return coschedulingcore.GetNamespacedName(podInfo1.Pod) < coschedulingcore.GetNamespacedName(podInfo2.Pod) + return fcore.GetNamespacedName(podInfo1.Pod) < fcore.GetNamespacedName(podInfo2.Pod) } return creationTime1.Before(&creationTime2) -} - -// PreFilter checks info about the Pod / checks conditions that the cluster or the Pod must meet. -// This comes after sort -func (f *Fluence) PreFilter( - ctx context.Context, - state *framework.CycleState, - pod *v1.Pod, -) (*framework.PreFilterResult, *framework.Status) { - klog.Infof("[Fluence] Examining pod %s", pod.Name) - - // groupName will be named according to the single pod namespace / pod if there wasn't - // a user defined group. This is a size 1 group we handle equivalently. - groupName, pg := f.pgMgr.GetPodGroup(ctx, pod) +} - // If we don't have a pod group and it's here, it was asked to be scheduled by fluence - // but the group isn't ready. Unshedulable for now. - if pg == nil { - klog.Infof("[Fluence] Group %s/%s does not have a pod group, not schedulable yet.", pod.Namespace, pod.Name) - return nil, framework.NewStatus(framework.Unschedulable, "Missing podgroup") - } - klog.Infof("[Fluence] Pod %s is in group %s with minimum members %d", pod.Name, groupName, pg.Spec.MinMember) - - // Has this podgroup been seen by fluence yet? If yes, we will have it in the cache - cache := fcore.GetFluenceCache(groupName) - klog.Infof("[Fluence] cache %s", cache) - - // Fluence has never seen this before, we need to schedule an allocation - // It also could have been seen, but was not able to get one. - if cache == nil { - klog.Infof("[Fluence] Does not have nodes for %s yet, asking Fluxion", groupName) - - // groupName is the namespaced name / - err := f.AskFlux(ctx, pod, pg, groupName) - if err != nil { - klog.Infof("[Fluence] Fluxion returned an error %s, not schedulable", err.Error()) - return nil, framework.NewStatus(framework.Unschedulable, err.Error()) - } +// PreFilter performs the following validations. +// 1. Whether the PodGroup that the Pod belongs to is on the deny list. +// 2. 
Whether the total number of pods in a PodGroup is less than its `minMember`. +func (f *Fluence) PreFilter(ctx context.Context, state *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) { + // If PreFilter fails, return framework.UnschedulableAndUnresolvable to avoid + // any preemption attempts. + if err := f.pgMgr.PreFilter(ctx, pod); err != nil { + klog.ErrorS(err, "PreFilter failed", "pod", klog.KObj(pod)) + return nil, framework.NewStatus(framework.UnschedulableAndUnresolvable, err.Error()) } - - // This is the next node in the list - nodename, err := fcore.GetNextNode(groupName) - if err != nil { - return nil, framework.NewStatus(framework.Unschedulable, err.Error()) - } - klog.Infof("Node Selected %s (pod %s:group %s)", nodename, pod.Name, groupName) - - // Create a fluxState (CycleState) with things that might be useful - // This isn't a PodGroupCache, but a single node cache, which also - // has group information, but just is for one node. Note that assigned - // tasks is hard coded to 1 but this isn't necessarily the case - we should - // eventually be able to GetNextNode for a number of tasks, for example - // (unless task == pod in which case it is always 1) - nodeCache := fcore.NodeCache{NodeName: nodename, GroupName: groupName, AssignedTasks: 1} - state.Write(framework.StateKey(pod.Name), &fcore.FluxStateData{NodeCache: nodeCache}) return nil, framework.NewStatus(framework.Success, "") } -// TODO we need to account for affinity here -func (f *Fluence) Filter( - ctx context.Context, - cycleState *framework.CycleState, - pod *v1.Pod, - nodeInfo *framework.NodeInfo, -) *framework.Status { +// PostFilter is used to reject a group of pods if a pod does not pass PreFilter or Filter. +func (f *Fluence) PostFilter(ctx context.Context, state *framework.CycleState, pod *v1.Pod, + filteredNodeStatusMap framework.NodeToStatusMap) (*framework.PostFilterResult, *framework.Status) { + pgName, pg := f.pgMgr.GetPodGroup(ctx, pod) + if pg == nil { + klog.V(4).InfoS("Pod does not belong to any group", "pod", klog.KObj(pod)) + return &framework.PostFilterResult{}, framework.NewStatus(framework.Unschedulable, "can not find pod group") + } - klog.Info("Filtering input node ", nodeInfo.Node().Name) - state, err := cycleState.Read(framework.StateKey(pod.Name)) + // This indicates there are already enough Pods satisfying the PodGroup, + // so don't bother to reject the whole PodGroup. + assigned := f.pgMgr.CalculateAssignedPods(pg.Name, pod.Namespace) + if assigned >= int(pg.Spec.MinMember) { + klog.V(4).InfoS("Assigned pods", "podGroup", klog.KObj(pg), "assigned", assigned) + return &framework.PostFilterResult{}, framework.NewStatus(framework.Unschedulable) + } - // No error means we retrieved the state - if err == nil { + // If the gap is less than/equal 10%, we may want to try subsequent Pods + // to see they can satisfy the PodGroup + notAssignedPercentage := float32(int(pg.Spec.MinMember)-assigned) / float32(pg.Spec.MinMember) + if notAssignedPercentage <= 0.1 { + klog.V(4).InfoS("A small gap of pods to reach the quorum", "podGroup", klog.KObj(pg), "percentage", notAssignedPercentage) + return &framework.PostFilterResult{}, framework.NewStatus(framework.Unschedulable) + } - // Try to convert the state to FluxStateDate - value, ok := state.(*fcore.FluxStateData) + // It's based on an implicit assumption: if the nth Pod failed, + // it's inferrable other Pods belonging to the same PodGroup would be very likely to fail. 
+ f.frameworkHandler.IterateOverWaitingPods(func(waitingPod framework.WaitingPod) { + if waitingPod.GetPod().Namespace == pod.Namespace && label.GetPodGroupLabel(waitingPod.GetPod()) == pg.Name { + klog.V(3).InfoS("PostFilter rejects the pod", "podGroup", klog.KObj(pg), "pod", klog.KObj(waitingPod.GetPod())) + waitingPod.Reject(f.Name(), "optimistic rejection in PostFilter") + } + }) - // If we have state data that isn't equal to the current assignment, no go - if ok && value.NodeCache.NodeName != nodeInfo.Node().Name { - return framework.NewStatus(framework.Unschedulable, "pod is not permitted") - } else { - klog.Infof("Filter: node %s selected for %s\n", value.NodeCache.NodeName, pod.Name) + if f.pgBackoff != nil { + pods, err := f.frameworkHandler.SharedInformerFactory().Core().V1().Pods().Lister().Pods(pod.Namespace).List( + labels.SelectorFromSet(labels.Set{v1alpha1.PodGroupLabel: label.GetPodGroupLabel(pod)}), + ) + if err == nil && len(pods) >= int(pg.Spec.MinMember) { + f.pgMgr.BackoffPodGroup(pgName, *f.pgBackoff) } } - return framework.NewStatus(framework.Success) + + f.pgMgr.DeletePermittedPodGroup(pgName) + return &framework.PostFilterResult{}, framework.NewStatus(framework.Unschedulable, + fmt.Sprintf("PodGroup %v gets rejected due to Pod %v is unschedulable even after PostFilter", pgName, pod.Name)) } -// PreFilterExtensions allow for callbacks on filtered states -// https://github.com/kubernetes/kubernetes/blob/master/pkg/scheduler/framework/interface.go#L383 +// PreFilterExtensions returns a PreFilterExtensions interface if the plugin implements one. func (f *Fluence) PreFilterExtensions() framework.PreFilterExtensions { return nil } -// AskFlux will ask flux for an allocation for nodes for the pod group. -func (f *Fluence) AskFlux( - ctx context.Context, - pod *v1.Pod, - pg *sched.PodGroup, - groupName string, -) error { - - // clean up previous match if a pod has already allocated previously - f.mutex.Lock() - _, isAllocated := f.groupToJobId[groupName] - f.mutex.Unlock() - - // This case happens when there is some reason that an initial job pods partially allocated, - // but then the job restarted, and new pods are present but fluence had assigned nodes to - // the old ones (and there aren't enough). The job would have had to complete in some way, - // and the PodGroup would have to then recreate, and have the same job id (the group name). - // This happened when I cancalled a bunch of jobs and they didn't have the chance to - // cancel in fluence. What we can do here is assume the previous pods are no longer running - // and cancel the flux job to create again. - if isAllocated { - klog.Info("Warning - group %s was previously allocated and is requesting again, so must have completed.", groupName) - f.mutex.Lock() - f.cancelFluxJob(groupName) - f.mutex.Unlock() - } - - // IMPORTANT: this is a JobSpec for *one* pod, assuming they are all the same. - // This obviously may not be true if we have a hetereogenous PodGroup. - // We name it based on the group, since it will represent the group - jobspec := utils.PreparePodJobSpec(pod, groupName) - klog.Infof("[Fluence] Inspect pod info, jobspec: %s\n", jobspec) - conn, err := grpc.Dial("127.0.0.1:4242", grpc.WithInsecure()) - - // TODO change this to just return fmt.Errorf - if err != nil { - klog.Errorf("[Fluence] Error connecting to server: %v\n", err) - return err +// Permit is the functions invoked by the framework at "Permit" extension point. 
+func (f *Fluence) Permit(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (*framework.Status, time.Duration) { + waitTime := *f.scheduleTimeout + s := f.pgMgr.Permit(ctx, pod) + var retStatus *framework.Status + switch s { + case fcore.PodGroupNotSpecified: + return framework.NewStatus(framework.Success, ""), 0 + case fcore.PodGroupNotFound: + return framework.NewStatus(framework.Unschedulable, "PodGroup not found"), 0 + case fcore.Wait: + klog.InfoS("Pod is waiting to be scheduled to node", "pod", klog.KObj(pod), "nodeName", nodeName) + _, pg := f.pgMgr.GetPodGroup(ctx, pod) + + // Note this is in seconds, defaults to 60 seconds + if wait := util.GetWaitTimeDuration(pg, f.scheduleTimeout); wait != 0 { + waitTime = wait + } + retStatus = framework.NewStatus(framework.Wait) + // We will also request to move the sibling pods back to activeQ. + f.pgMgr.ActivateSiblings(pod, state) + case fcore.Success: + pgFullName := label.GetPodGroupFullName(pod) + f.frameworkHandler.IterateOverWaitingPods(func(waitingPod framework.WaitingPod) { + if label.GetPodGroupFullName(waitingPod.GetPod()) == pgFullName { + klog.V(3).InfoS("Permit allows", "pod", klog.KObj(waitingPod.GetPod())) + waitingPod.Allow(f.Name()) + } + }) + klog.V(3).InfoS("Permit allows", "pod", klog.KObj(pod)) + retStatus = framework.NewStatus(framework.Success) + waitTime = 0 } - defer conn.Close() - grpcclient := pb.NewFluxcliServiceClient(conn) - _, cancel := context.WithTimeout(context.Background(), 200*time.Second) - defer cancel() + return retStatus, waitTime +} - request := &pb.MatchRequest{ - Ps: jobspec, - Request: "allocate", - Count: pg.Spec.MinMember, - } +// Reserve is the functions invoked by the framework at "reserve" extension point. +func (f *Fluence) Reserve(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) *framework.Status { + return nil +} - // An error here is an error with making the request - r, err := grpcclient.Match(context.Background(), request) - if err != nil { - klog.Errorf("[Fluence] did not receive any match response: %v\n", err) - return err +// Unreserve rejects all other Pods in the PodGroup when one of the pods in the group times out. +func (f *Fluence) Unreserve(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) { + pgName, pg := f.pgMgr.GetPodGroup(ctx, pod) + if pg == nil { + return } - - // TODO GetPodID should be renamed, because it will reflect the group - klog.Infof("[Fluence] Match response ID %s\n", r.GetPodID()) - - // Get the nodelist and inspect - nodes := r.GetNodelist() - klog.Infof("[Fluence] Nodelist returned from Fluxion: %s\n", nodes) - - // Assign the nodelist - this sets the group name in the groupSeen cache - // at this point, we can retrieve the cache and get nodes - nodelist := fcore.CreateNodeList(nodes, groupName) - - jobid := uint64(r.GetJobID()) - klog.Infof("[Fluence] parsed node pods list %s for job id %d\n", nodelist, jobid) - - // TODO would be nice to actually be able to ask flux jobs -a to fluence - // That way we can verify assignments, etc. 
- f.mutex.Lock() - f.groupToJobId[groupName] = jobid - f.mutex.Unlock() - return nil + f.frameworkHandler.IterateOverWaitingPods(func(waitingPod framework.WaitingPod) { + if waitingPod.GetPod().Namespace == pod.Namespace && label.GetPodGroupLabel(waitingPod.GetPod()) == pg.Name { + klog.V(3).InfoS("Unreserve rejects", "pod", klog.KObj(waitingPod.GetPod()), "podGroup", klog.KObj(pg)) + waitingPod.Reject(f.Name(), "rejection in Unreserve") + } + }) + f.pgMgr.DeletePermittedPodGroup(pgName) } diff --git a/sig-scheduler-plugins/pkg/fluence/labels/labels.go b/sig-scheduler-plugins/pkg/fluence/labels/labels.go index e377d97..f955d67 100644 --- a/sig-scheduler-plugins/pkg/fluence/labels/labels.go +++ b/sig-scheduler-plugins/pkg/fluence/labels/labels.go @@ -1,14 +1,33 @@ +/* +Copyright 2020 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package labels import ( + "fmt" "time" + v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) // Labels to be shared between different components const ( + // We use the same label to be consistent // https://github.com/kubernetes-sigs/scheduler-plugins/blob/master/apis/scheduling/v1alpha1/types.go#L109 PodGroupLabel = "scheduling.x-k8s.io/pod-group" @@ -16,10 +35,29 @@ const ( //PodGroupNameLabel = "fluence.pod-group" PodGroupSizeLabel = "fluence.group-size" - // Internal use + // Internal use (not used yet) PodGroupTimeCreated = "flunce.created-at" ) +// GetPodGroupLabel get pod group name from pod labels +func GetPodGroupLabel(pod *v1.Pod) string { + return pod.Labels[PodGroupLabel] +} + +// GetPodGroupFullName get namespaced group name from pod labels +func GetPodGroupFullName(pod *v1.Pod) string { + pgName := GetPodGroupLabel(pod) + if len(pgName) == 0 { + return "" + } + return fmt.Sprintf("%v/%v", pod.Namespace, pgName) +} + +// GetPodGroupSize gets the pod group size from the label +func GetPodGroupSize(pod *v1.Pod) string { + return pod.Labels[PodGroupSizeLabel] +} + // getTimeCreated returns the timestamp when we saw the object func GetTimeCreated() string { From 726149c421d981a55f63be224997d37fcbacea8d Mon Sep 17 00:00:00 2001 From: vsoch Date: Thu, 14 Mar 2024 02:14:45 -0600 Subject: [PATCH 22/28] update: adding back in fluence logic Problem: fluence is missing! Solution: add back fluence. This is a different design in that we do the asking from the perspective of the pod group, meaning that we get back a full set of nodes, and save them (assigned exactly) to specific pods. This could also be more lenient - e.g., creating a cache of the list and then taking off the cache, but I like the finer granularity of 1:1 mapping for future issues that might arise (where one pod needs a new node). This design also introduces a nice feature that we can ask for the resources (meaning creating a jobspec) for exactly what we need across pods for the group because we are listing all pods for the group before we generate the jobspec. 
I left it as it currently was before (using one representative pod) to not incur too many changes but this definitely can be tried. There is likely more work to be done to test edge cases and account for resources when fluence starts (and be able to load a state if it restarts) but this is pretty great for a first shot! The local lammps experiment ran without clogging and I am testing on GKE as a next step. Finally, I think there is a lot of poetential error in allowing a ton of other PreFilter plugins to exist, each of which could return their own set of nodes to consider that might mismatch what fluence has decided on. For this reason I have done aggressive pruning and we can add things back as we see fit. Signed-off-by: vsoch --- Makefile | 2 + README.md | 15 +- sig-scheduler-plugins/cmd/scheduler/main.go | 13 +- .../templates/configmap.yaml | 64 +++++ .../charts/as-a-second-scheduler/values.yaml | 17 +- .../pkg/fluence/core/core.go | 210 ++++++-------- .../pkg/fluence/core/flux.go | 259 ++++++++++++++++++ sig-scheduler-plugins/pkg/fluence/fluence.go | 222 +++++---------- 8 files changed, 502 insertions(+), 300 deletions(-) create mode 100644 sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/configmap.yaml create mode 100644 sig-scheduler-plugins/pkg/fluence/core/flux.go diff --git a/Makefile b/Makefile index 6ab44fe..91789d8 100644 --- a/Makefile +++ b/Makefile @@ -26,12 +26,14 @@ update: clone prepare: clone # These are entirely new directory structures rm -rf $(CLONE_UPSTREAM)/pkg/fluence + # rm -rf $(CLONE_UPSTREAM)/cmd/app rm -rf $(CLONE_UPSTREAM)/pkg/controllers/podgroup_controller.go rm -rf $(CLONE_UPSTREAM)/cmd/controller/app/server.go cp -R sig-scheduler-plugins/pkg/fluence $(CLONE_UPSTREAM)/pkg/fluence cp -R sig-scheduler-plugins/pkg/controllers/* $(CLONE_UPSTREAM)/pkg/controllers/ # This is the one exception not from sig-scheduler-plugins because it is needed in both spots cp -R src/fluence/fluxcli-grpc $(CLONE_UPSTREAM)/pkg/fluence/fluxcli-grpc + # cp -R sig-scheduler-plugins/cmd/app ./upstream/cmd/app # These are files with subtle changes to add fluence cp sig-scheduler-plugins/cmd/scheduler/main.go ./upstream/cmd/scheduler/main.go cp sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/*.yaml $(CLONE_UPSTREAM)/manifests/install/charts/as-a-second-scheduler/templates/ diff --git a/README.md b/README.md index 89f2a18..8c51de7 100644 --- a/README.md +++ b/README.md @@ -8,9 +8,11 @@ Fluence enables HPC-grade pod scheduling in Kubernetes via the [Kubernetes Sched ## TODO -- Need to list pods, get state, and if is completed, cancel the job id. -- Keep track of state of all pods in group, when all of pods are completed, then issue cancel. -- Calculate on the fly - on the update event we want to loop through pods, if ALL completed, then delete the podid for fluence. 
+- On init, need to load in resource graph that accounts for running stuff +- Need to allow for restart / crashes and looking up existing jobid, updating maps in PodGroup +- Since AskFlux is done on level of pod group, refactor function to account for specific resources of all pods (not just one pod) +- Figure out if EventsToRegister replaces old informer +- Would be nice to see the state of fluxion (retest the kubectl-fluence plugin) ## Getting started @@ -526,13 +528,6 @@ For this setup if you are developing locally with kind, you will need to enable kind create cluster --config ./kind-config.yaml ``` -#### TODO - - - Try what [kueue does](https://github.com/kubernetes-sigs/kueue/blob/6d57813a52066dab412735deeeb60ebb0cdb8e8e/cmd/kueue/main.go#L146-L155) to not require cert-manager. - - Try other strategies for setting owner references (so cleans up when owner deleted) - - When that is done, add tests for deletion of pod group (the current method is not perfect and needs improvement) -- We really need to see the state of fluxion - I had this running for about 6 hours in kind, and at some point it just stopped working. I deleted and re-created the cluster and it was restored. It could be a development hiccup but would be good to know! - #### Components - [FluxStateData](sig-scheduler-plugins/pkg/fluence/core/core.go): is given to the [framework.CycleState](https://github.com/kubernetes/kubernetes/blob/242b41b36a20032f99e8a059ca0a5d764105217b/pkg/scheduler/framework/cycle_state.go#L48) and serves as a vehicle to store a cache of node name assignment. diff --git a/sig-scheduler-plugins/cmd/scheduler/main.go b/sig-scheduler-plugins/cmd/scheduler/main.go index 2b21d28..4d98d52 100644 --- a/sig-scheduler-plugins/cmd/scheduler/main.go +++ b/sig-scheduler-plugins/cmd/scheduler/main.go @@ -22,6 +22,10 @@ import ( "k8s.io/component-base/cli" _ "k8s.io/component-base/metrics/prometheus/clientgo" // for rest client metric registration _ "k8s.io/component-base/metrics/prometheus/version" // for version metric registration + + // Uncomment here for a local one here we use to debug + // This was a clone from kubernetes/kubernetes -> cmd/app + //"sigs.k8s.io/scheduler-plugins/cmd/app" "k8s.io/kubernetes/cmd/kube-scheduler/app" "sigs.k8s.io/scheduler-plugins/pkg/capacityscheduling" @@ -29,15 +33,14 @@ import ( "sigs.k8s.io/scheduler-plugins/pkg/fluence" "sigs.k8s.io/scheduler-plugins/pkg/networkaware/networkoverhead" "sigs.k8s.io/scheduler-plugins/pkg/networkaware/topologicalsort" - "sigs.k8s.io/scheduler-plugins/pkg/noderesources" "sigs.k8s.io/scheduler-plugins/pkg/noderesourcetopology" - "sigs.k8s.io/scheduler-plugins/pkg/podstate" "sigs.k8s.io/scheduler-plugins/pkg/preemptiontoleration" - "sigs.k8s.io/scheduler-plugins/pkg/qos" "sigs.k8s.io/scheduler-plugins/pkg/trimaran/loadvariationriskbalancing" - "sigs.k8s.io/scheduler-plugins/pkg/trimaran/lowriskovercommitment" "sigs.k8s.io/scheduler-plugins/pkg/trimaran/targetloadpacking" + "sigs.k8s.io/scheduler-plugins/pkg/podstate" + "sigs.k8s.io/scheduler-plugins/pkg/qos" + // Ensure scheme package is initialized. 
_ "sigs.k8s.io/scheduler-plugins/apis/config/scheme" ) @@ -52,11 +55,9 @@ func main() { app.WithPlugin(loadvariationriskbalancing.Name, loadvariationriskbalancing.New), app.WithPlugin(networkoverhead.Name, networkoverhead.New), app.WithPlugin(topologicalsort.Name, topologicalsort.New), - app.WithPlugin(noderesources.AllocatableName, noderesources.NewAllocatable), app.WithPlugin(noderesourcetopology.Name, noderesourcetopology.New), app.WithPlugin(preemptiontoleration.Name, preemptiontoleration.New), app.WithPlugin(targetloadpacking.Name, targetloadpacking.New), - app.WithPlugin(lowriskovercommitment.Name, lowriskovercommitment.New), app.WithPlugin(podstate.Name, podstate.New), app.WithPlugin(qos.Name, qos.New), app.WithPlugin(fluence.Name, fluence.New), diff --git a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/configmap.yaml b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/configmap.yaml new file mode 100644 index 0000000..9f3d8bf --- /dev/null +++ b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/configmap.yaml @@ -0,0 +1,64 @@ +{{- if .Values.plugins.enabled }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: scheduler-config + namespace: {{ .Release.Namespace }} +data: + scheduler-config.yaml: | + apiVersion: kubescheduler.config.k8s.io/v1 + kind: KubeSchedulerConfiguration + leaderElection: + leaderElect: {{ .Values.scheduler.leaderElect }} + profiles: + # Compose all plugins in one profile + - schedulerName: {{ .Values.scheduler.name }} + plugins: + preBind: + disabled: + - name: {{ .Values.scheduler.name }} + filter: + disabled: + {{- range $.Values.plugins.disabledAll }} + - name: {{ title . }} + {{- end }} + reserve: + disabled: + {{- range $.Values.plugins.disabledAll }} + - name: {{ title . }} + {{- end }} + score: + disabled: + {{- range $.Values.plugins.disabledAll }} + - name: {{ title . }} + {{- end }} + preScore: + disabled: + {{- range $.Values.plugins.disabledAll }} + - name: {{ title . }} + {{- end }} + postFilter: + disabled: + {{- range $.Values.plugins.disabledAll }} + - name: {{ title . }} + {{- end }} + preFilter: + disabled: + {{- range $.Values.plugins.disabledAll }} + - name: {{ title . }} + {{- end }} + multiPoint: + enabled: + {{- range $.Values.plugins.enabled }} + - name: {{ title . }} + {{- end }} + disabled: + {{- range $.Values.plugins.disabled }} + - name: {{ title . }} + {{- end }} + {{- if $.Values.pluginConfig }} + pluginConfig: {{ toYaml $.Values.pluginConfig | nindent 6 }} + {{- end }} + + {{- /* TODO: wire CRD installation with enabled plugins. 
*/}} +{{- end }} diff --git a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/values.yaml b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/values.yaml index e48aa98..4113209 100644 --- a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/values.yaml +++ b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/values.yaml @@ -31,14 +31,23 @@ controller: plugins: enabled: ["Fluence"] disabled: ["CapacityScheduling","NodeResourceTopologyMatch","NodeResourcesAllocatable","PrioritySort","Coscheduling"] # only in-tree plugins need to be defined here + # Disable EVERYTHING except for fluence + # VolumeBinding is required for PreBind, NodeResourcesFit is required or you'll get mismatches + # Yes - some of these are irrelevant for the use case here, but I'd rather be super + # conservative and be absolutely sure only fluence is running PreFilter to select nodes + disabledAll: ["NodePorts", "VolumeRestrictions", "EBSLimits", + "GCEPDLimits", "NodeVolumeLimits", "AzureDiskLimits", "VolumeZone", + "PodTopologySpread", "InterPodAffinity", "NodeAffinity", + "NodeUnschedulable", "NodeName", "TaintToleration", "DefaultPreemtion", + "NodeResourcesBalancedAllocation", "ImageLocality"] # Customize the enabled plugins' config. # Refer to the "pluginConfig" section of manifests//scheduler-config.yaml. # For example, for Coscheduling plugin, you want to customize the permit waiting timeout to 10 seconds: -pluginConfig: -- name: Coscheduling - args: - permitWaitingTimeSeconds: 10 # default is 60 +# pluginConfig: +# - name: Coscheduling +# args: +# permitWaitingTimeSeconds: 10 # default is 60 # Or, customize the other plugins # - name: NodeResourceTopologyMatch # args: diff --git a/sig-scheduler-plugins/pkg/fluence/core/core.go b/sig-scheduler-plugins/pkg/fluence/core/core.go index efa1127..eed9536 100644 --- a/sig-scheduler-plugins/pkg/fluence/core/core.go +++ b/sig-scheduler-plugins/pkg/fluence/core/core.go @@ -24,13 +24,13 @@ import ( gochache "github.com/patrickmn/go-cache" corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/types" informerv1 "k8s.io/client-go/informers/core/v1" listerv1 "k8s.io/client-go/listers/core/v1" - "k8s.io/klog/v2" + klog "k8s.io/klog/v2" + "k8s.io/kubernetes/pkg/scheduler/framework" "sigs.k8s.io/controller-runtime/pkg/client" @@ -38,28 +38,25 @@ import ( "sigs.k8s.io/scheduler-plugins/pkg/util" ) -type Status string +// TODO should eventually store group name here to reassociate on reload +type FluxStateData struct { + NodeName string +} -const ( - // PodGroupNotSpecified denotes no PodGroup is specified in the Pod spec. - PodGroupNotSpecified Status = "PodGroup not specified" - // PodGroupNotFound denotes the specified PodGroup in the Pod spec is - // not found in API server. - PodGroupNotFound Status = "PodGroup not found" - Success Status = "Success" - Wait Status = "Wait" -) +func (s *FluxStateData) Clone() framework.StateData { + clone := &FluxStateData{ + NodeName: s.NodeName, + } + return clone +} // Manager defines the interfaces for PodGroup management. 
type Manager interface { - PreFilter(context.Context, *corev1.Pod) error - Permit(context.Context, *corev1.Pod) Status + PreFilter(context.Context, *corev1.Pod, *framework.CycleState) error + GetPodNode(*corev1.Pod) string GetPodGroup(context.Context, *corev1.Pod) (string, *v1alpha1.PodGroup) GetCreationTimestamp(*corev1.Pod, time.Time) time.Time DeletePermittedPodGroup(string) - CalculateAssignedPods(string, string) int - ActivateSiblings(pod *corev1.Pod, state *framework.CycleState) - BackoffPodGroup(string, time.Duration) } // PodGroupManager defines the scheduling operation called @@ -77,7 +74,16 @@ type PodGroupManager struct { backedOffPG *gochache.Cache // podLister is pod lister podLister listerv1.PodLister + + // This isn't great to save state, but we can improve upon it + // we should have a way to load jobids into this if fluence is recreated + // If we can annotate them in fluxion and query for that, we can! + groupToJobId map[string]uint64 + podToNode map[string]string + + // Probably should just choose one... oh well sync.RWMutex + mutex sync.Mutex } // NewPodGroupManager creates a new operation object. @@ -89,59 +95,37 @@ func NewPodGroupManager(client client.Client, snapshotSharedLister framework.Sha podLister: podInformer.Lister(), permittedPG: gochache.New(3*time.Second, 3*time.Second), backedOffPG: gochache.New(10*time.Second, 10*time.Second), + groupToJobId: map[string]uint64{}, + podToNode: map[string]string{}, } return pgMgr } -func (pgMgr *PodGroupManager) BackoffPodGroup(pgName string, backoff time.Duration) { - if backoff == time.Duration(0) { - return +// GetStatuses string (of all pods) to show for debugging purposes +func (pgMgr *PodGroupManager) GetStatuses(pods []*corev1.Pod) string { + statuses := "" + for _, pod := range pods { + statuses += " " + fmt.Sprintf("%s", pod.Status.Phase) } - pgMgr.backedOffPG.Add(pgName, nil, backoff) + return statuses } -// ActivateSiblings stashes the pods belonging to the same PodGroup of the given pod -// in the given state, with a reserved key "kubernetes.io/pods-to-activate". -func (pgMgr *PodGroupManager) ActivateSiblings(pod *corev1.Pod, state *framework.CycleState) { - pgName := util.GetPodGroupLabel(pod) - if pgName == "" { - return - } - - pods, err := pgMgr.podLister.Pods(pod.Namespace).List( - labels.SelectorFromSet(labels.Set{v1alpha1.PodGroupLabel: pgName}), - ) - if err != nil { - klog.ErrorS(err, "Failed to obtain pods belong to a PodGroup", "podGroup", pgName) - return - } - - for i := range pods { - if pods[i].UID == pod.UID { - pods = append(pods[:i], pods[i+1:]...) - break - } - } - - if len(pods) != 0 { - if c, err := state.Read(framework.PodsToActivateKey); err == nil { - if s, ok := c.(*framework.PodsToActivate); ok { - s.Lock() - for _, pod := range pods { - namespacedName := GetNamespacedName(pod) - s.Map[namespacedName] = pod - } - s.Unlock() - } - } - } +// GetPodNode is a quick lookup to see if we have a node +func (pgMgr *PodGroupManager) GetPodNode(pod *corev1.Pod) string { + node, _ := pgMgr.podToNode[pod.Name] + return node } // PreFilter filters out a pod if // 1. it belongs to a podgroup that was recently denied or // 2. the total number of pods in the podgroup is less than the minimum number of pods // that is required to be scheduled. 
-func (pgMgr *PodGroupManager) PreFilter(ctx context.Context, pod *corev1.Pod) error { +func (pgMgr *PodGroupManager) PreFilter( + ctx context.Context, + pod *corev1.Pod, + state *framework.CycleState, +) error { + klog.V(5).InfoS("Pre-filter", "pod", klog.KObj(pod)) pgFullName, pg := pgMgr.GetPodGroup(ctx, pod) if pg == nil { @@ -159,15 +143,24 @@ func (pgMgr *PodGroupManager) PreFilter(ctx context.Context, pod *corev1.Pod) er return fmt.Errorf("podLister list pods failed: %w", err) } + // Get statuses to show for debugging + statuses := pgMgr.GetStatuses(pods) + + // This shows us the number of pods we have in the set and their states + klog.Infof("Fluence Pre-filter", "group", pgFullName, "pods", statuses, "MinMember", pg.Spec.MinMember, "Size", len(pods)) if len(pods) < int(pg.Spec.MinMember) { return fmt.Errorf("pre-filter pod %v cannot find enough sibling pods, "+ "current pods number: %v, minMember of group: %v", pod.Name, len(pods), pg.Spec.MinMember) } - if pg.Spec.MinResources == nil { - return nil - } + // TODO we likely can take advantage of these resources or other custom + // attributes we add. For now ignore and calculate based on pod needs (above) + // if pg.Spec.MinResources == nil { + // fmt.Printf("Fluence Min resources are null, skipping PreFilter") + // return nil + // } + // This is from coscheduling. // TODO(cwdsuzhou): This resource check may not always pre-catch unschedulable pod group. // It only tries to PreFilter resource constraints so even if a PodGroup passed here, // it may not necessarily pass Filter due to other constraints such as affinity/taints. @@ -175,43 +168,39 @@ func (pgMgr *PodGroupManager) PreFilter(ctx context.Context, pod *corev1.Pod) er return nil } - nodes, err := pgMgr.snapshotSharedLister.NodeInfos().List() + // TODO: right now we ask Fluxion for a podspec based on ONE pod, but + // we have the whole group! We can handle different pod needs now :) + repPod := pods[0] + nodes, err := pgMgr.AskFlux(ctx, *repPod, pg, pgFullName) if err != nil { + klog.Infof("[Fluence] Fluxion returned an error %s, not schedulable", err.Error()) return err } + klog.Infof("Node Selected %s (pod group %s)", nodes, pgFullName) + + // Some reason fluxion gave us the wrong size? + if len(nodes) != len(pods) { + klog.Info("Warning - group %s needs %d nodes but Fluxion returned the wrong number nodes %d.", pgFullName, len(pods), len(nodes)) + pgMgr.mutex.Lock() + pgMgr.cancelFluxJob(pgFullName, repPod) + pgMgr.mutex.Unlock() + } - minResources := pg.Spec.MinResources.DeepCopy() - podQuantity := resource.NewQuantity(int64(pg.Spec.MinMember), resource.DecimalSI) - minResources[corev1.ResourcePods] = *podQuantity - err = CheckClusterResource(nodes, minResources, pgFullName) - if err != nil { - klog.ErrorS(err, "Failed to PreFilter", "podGroup", klog.KObj(pg)) - return err + // Create a fluxState (CycleState) with all nodes - this is used to retrieve + // the specific node assigned to the pod in Filter, which returns a node + // Note that this probably is not useful beyond the pod we are in the context + // of, but why not do it. + for i, node := range nodes { + pod := pods[i] + stateData := FluxStateData{NodeName: node} + state.Write(framework.StateKey(pod.Name), &stateData) + // Also save to the podToNode lookup + pgMgr.podToNode[pod.Name] = node } pgMgr.permittedPG.Add(pgFullName, pgFullName, *pgMgr.scheduleTimeout) return nil } -// Permit permits a pod to run, if the minMember match, it would send a signal to chan. 
-func (pgMgr *PodGroupManager) Permit(ctx context.Context, pod *corev1.Pod) Status { - pgFullName, pg := pgMgr.GetPodGroup(ctx, pod) - if pgFullName == "" { - return PodGroupNotSpecified - } - if pg == nil { - // A Pod with a podGroup name but without a PodGroup found is denied. - return PodGroupNotFound - } - - assigned := pgMgr.CalculateAssignedPods(pg.Name, pg.Namespace) - // The number of pods that have been assigned nodes is calculated from the snapshot. - // The current pod in not included in the snapshot during the current scheduling cycle. - if int32(assigned)+1 >= pg.Spec.MinMember { - return Success - } - return Wait -} - // GetCreationTimestamp returns the creation time of a podGroup or a pod. func (pgMgr *PodGroupManager) GetCreationTimestamp(pod *corev1.Pod, ts time.Time) time.Time { pgName := util.GetPodGroupLabel(pod) @@ -243,51 +232,6 @@ func (pgMgr *PodGroupManager) GetPodGroup(ctx context.Context, pod *corev1.Pod) return fmt.Sprintf("%v/%v", pod.Namespace, pgName), &pg } -// CalculateAssignedPods returns the number of pods that has been assigned nodes: assumed or bound. -func (pgMgr *PodGroupManager) CalculateAssignedPods(podGroupName, namespace string) int { - nodeInfos, err := pgMgr.snapshotSharedLister.NodeInfos().List() - klog.Info(nodeInfos) - if err != nil { - klog.ErrorS(err, "Cannot get nodeInfos from frameworkHandle") - return 0 - } - var count int - for _, nodeInfo := range nodeInfos { - for _, podInfo := range nodeInfo.Pods { - pod := podInfo.Pod - if util.GetPodGroupLabel(pod) == podGroupName && pod.Namespace == namespace && pod.Spec.NodeName != "" { - count++ - } - } - } - - return count -} - -// CheckClusterResource checks if resource capacity of the cluster can satisfy . -// It returns an error detailing the resource gap if not satisfied; otherwise returns nil. -func CheckClusterResource(nodeList []*framework.NodeInfo, resourceRequest corev1.ResourceList, desiredPodGroupName string) error { - for _, info := range nodeList { - if info == nil || info.Node() == nil { - continue - } - - nodeResource := util.ResourceList(getNodeResource(info, desiredPodGroupName)) - for name, quant := range resourceRequest { - quant.Sub(nodeResource[name]) - if quant.Sign() <= 0 { - delete(resourceRequest, name) - continue - } - resourceRequest[name] = quant - } - if len(resourceRequest) == 0 { - return nil - } - } - return fmt.Errorf("resource gap: %v", resourceRequest) -} - // GetNamespacedName returns the namespaced name. func GetNamespacedName(obj metav1.Object) string { return fmt.Sprintf("%v/%v", obj.GetNamespace(), obj.GetName()) diff --git a/sig-scheduler-plugins/pkg/fluence/core/flux.go b/sig-scheduler-plugins/pkg/fluence/core/flux.go new file mode 100644 index 0000000..def239f --- /dev/null +++ b/sig-scheduler-plugins/pkg/fluence/core/flux.go @@ -0,0 +1,259 @@ +package core + +import ( + "context" + "time" + + "google.golang.org/grpc" + "k8s.io/apimachinery/pkg/labels" + klog "k8s.io/klog/v2" + pb "sigs.k8s.io/scheduler-plugins/pkg/fluence/fluxcli-grpc" + fgroup "sigs.k8s.io/scheduler-plugins/pkg/fluence/group" + + "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1" + "sigs.k8s.io/scheduler-plugins/pkg/fluence/utils" + + corev1 "k8s.io/api/core/v1" +) + +// AskFlux will ask flux for an allocation for nodes for the pod group. +// We return the list of nodes, and assign to the entire group! 
+func (pgMgr *PodGroupManager) AskFlux(
+	ctx context.Context,
+	pod corev1.Pod,
+	pg *v1alpha1.PodGroup,
+	groupName string,
+) ([]string, error) {
+
+	// clean up a previous match if this group was already allocated
+	pgMgr.mutex.Lock()
+	_, isAllocated := pgMgr.groupToJobId[groupName]
+	pgMgr.mutex.Unlock()
+
+	// This case happens when, for some reason, an initial job's pods were partially allocated,
+	// but then the job restarted, and new pods are present but fluence had assigned nodes to
+	// the old ones (and there aren't enough). The job would have had to complete in some way,
+	// and the PodGroup would have to then recreate, and have the same job id (the group name).
+	// This happened when I cancelled a bunch of jobs and they didn't have the chance to
+	// cancel in fluence. What we can do here is assume the previous pods are no longer running
+	// and cancel the flux job to create again.
+	if isAllocated {
+		klog.Infof("Warning - group %s was previously allocated and is requesting again, so must have completed.", groupName)
+		pgMgr.mutex.Lock()
+		pgMgr.cancelFluxJob(groupName, &pod)
+		pgMgr.mutex.Unlock()
+	}
+	nodes := []string{}
+
+	// IMPORTANT: this is a JobSpec for *one* pod, assuming they are all the same.
+	// This obviously may not be true if we have a heterogeneous PodGroup.
+	// We name it based on the group, since it will represent the group
+	jobspec := utils.PreparePodJobSpec(&pod, groupName)
+	klog.Infof("[Fluence] Inspect pod info, jobspec: %s\n", jobspec)
+	conn, err := grpc.Dial("127.0.0.1:4242", grpc.WithInsecure())
+
+	// TODO change this to just return fmt.Errorf
+	if err != nil {
+		klog.Errorf("[Fluence] Error connecting to server: %v\n", err)
+		return nodes, err
+	}
+	defer conn.Close()
+
+	grpcclient := pb.NewFluxcliServiceClient(conn)
+	_, cancel := context.WithTimeout(context.Background(), 200*time.Second)
+	defer cancel()
+
+	request := &pb.MatchRequest{
+		Ps:      jobspec,
+		Request: "allocate",
+		Count:   pg.Spec.MinMember,
+	}
+
+	// An error here is an error with making the request
+	r, err := grpcclient.Match(context.Background(), request)
+	if err != nil {
+		klog.Errorf("[Fluence] did not receive any match response: %v\n", err)
+		return nodes, err
+	}
+
+	// TODO GetPodID should be renamed, because it will reflect the group
+	klog.Infof("[Fluence] Match response ID %s\n", r.GetPodID())
+
+	// Get the nodelist and inspect
+	nodelist := r.GetNodelist()
+	for _, node := range nodelist {
+		nodes = append(nodes, node.NodeID)
+	}
+	jobid := uint64(r.GetJobID())
+	klog.Infof("[Fluence] parsed node pods list %s for job id %d\n", nodes, jobid)
+
+	// TODO would be nice to actually be able to ask flux jobs -a to fluence
+	// That way we can verify assignments, etc.
+	pgMgr.mutex.Lock()
+	pgMgr.groupToJobId[groupName] = jobid
+	pgMgr.mutex.Unlock()
+	return nodes, nil
+}
+
+// cancelFluxJob cancels the flux job for a pod group.
+// We assume that the cancelled job also means deleting the pod group +func (pgMgr *PodGroupManager) cancelFluxJob(groupName string, pod *corev1.Pod) error { + + jobid, ok := pgMgr.groupToJobId[groupName] + + // The job was already cancelled by another pod + if !ok { + klog.Infof("[Fluence] Request for cancel of group %s is already complete.", groupName) + return nil + } + klog.Infof("[Fluence] Cancel flux job: %v for group %s", jobid, groupName) + + // This first error is about connecting to the server + conn, err := grpc.Dial("127.0.0.1:4242", grpc.WithInsecure()) + if err != nil { + klog.Errorf("[Fluence] Error connecting to server: %v", err) + return err + } + defer conn.Close() + + grpcclient := pb.NewFluxcliServiceClient(conn) + _, cancel := context.WithTimeout(context.Background(), 200*time.Second) + defer cancel() + + // This error reflects the success or failure of the cancel request + request := &pb.CancelRequest{JobID: int64(jobid)} + res, err := grpcclient.Cancel(context.Background(), request) + if err != nil { + klog.Errorf("[Fluence] did not receive any cancel response: %v", err) + return err + } + klog.Infof("[Fluence] Job cancellation for group %s result: %d", groupName, res.Error) + + // And this error is if the cancel was successful or not + if res.Error == 0 { + klog.Infof("[Fluence] Successful cancel of flux job: %d for group %s", jobid, groupName) + pgMgr.cleanup(pod, groupName) + } else { + klog.Warningf("[Fluence] Failed to cancel flux job %d for group %s", jobid, groupName) + } + return nil +} + +// cleanup deletes the group name from groupToJobId, and pods names from the node lookup +func (pgMgr *PodGroupManager) cleanup(pod *corev1.Pod, groupName string) { + + delete(pgMgr.groupToJobId, groupName) + + // Clean up previous pod->node assignments + pods, err := pgMgr.podLister.Pods(pod.Namespace).List( + labels.SelectorFromSet(labels.Set{v1alpha1.PodGroupLabel: groupName}), + ) + // TODO need to handle this / understand why it's the case + if err != nil { + return + } + for _, pod := range pods { + delete(pgMgr.podToNode, pod.Name) + } +} + +// UpdatePod is called on an update, and the old and new object are presented +func (pgMgr *PodGroupManager) UpdatePod(oldObj, newObj interface{}) { + + oldPod := oldObj.(*corev1.Pod) + newPod := newObj.(*corev1.Pod) + + // a pod is updated, get the group + // TODO should we be checking group / size for old vs new? + groupName, pg := pgMgr.GetPodGroup(context.TODO(), oldPod) + + // If PodGroup is nil, still try to look up a faux name + // TODO need to check if this might be problematic + if pg == nil { + pg = fgroup.CreateFakeGroup(oldPod) + groupName = pg.Name + } + + klog.Infof("[Fluence] Processing event for pod %s in group %s from %s to %s", newPod.Name, groupName, oldPod.Status.Phase, newPod.Status.Phase) + + switch newPod.Status.Phase { + case corev1.PodPending: + // in this state we don't know if a pod is going to be running, thus we don't need to update job map + case corev1.PodRunning: + // if a pod is start running, we can add it state to the delta graph if it is scheduled by other scheduler + case corev1.PodSucceeded: + klog.Infof("[Fluence] Pod %s succeeded, Fluence needs to free the resources", newPod.Name) + + pgMgr.mutex.Lock() + defer pgMgr.mutex.Unlock() + + // Do we have the group id in our cache? 
If yes, we haven't deleted the jobid yet + // I am worried here that if some pods are succeeded and others pending, this could + // be a mistake - fluence would schedule it again + _, ok := pgMgr.groupToJobId[groupName] + if ok { + pgMgr.cancelFluxJob(groupName, oldPod) + } else { + klog.Infof("[Fluence] Succeeded pod %s/%s in group %s doesn't have flux jobid", newPod.Namespace, newPod.Name, groupName) + } + + case corev1.PodFailed: + + // a corner case need to be tested, the pod exit code is not 0, can be created with segmentation fault pi test + klog.Warningf("[Fluence] Pod %s in group %s failed, Fluence needs to free the resources", newPod.Name, groupName) + + pgMgr.mutex.Lock() + defer pgMgr.mutex.Unlock() + + _, ok := pgMgr.groupToJobId[groupName] + if ok { + pgMgr.cancelFluxJob(groupName, oldPod) + } else { + klog.Errorf("[Fluence] Failed pod %s/%s in group %s doesn't have flux jobid", newPod.Namespace, newPod.Name, groupName) + } + case corev1.PodUnknown: + // don't know how to deal with it as it's unknown phase + default: + // shouldn't enter this branch + } +} + +// DeletePod handles the delete event handler +func (pgMgr *PodGroupManager) DeletePod(podObj interface{}) { + klog.Info("[Fluence] Delete Pod event handler") + pod := podObj.(*corev1.Pod) + groupName, pg := pgMgr.GetPodGroup(context.TODO(), pod) + + // If PodGroup is nil, still try to look up a faux name + if pg == nil { + pg = fgroup.CreateFakeGroup(pod) + groupName = pg.Name + } + + klog.Infof("[Fluence] Delete pod %s in group %s has status %s", pod.Status.Phase, pod.Name, groupName) + switch pod.Status.Phase { + case corev1.PodSucceeded: + case corev1.PodPending: + klog.Infof("[Fluence] Pod %s completed and is Pending termination, Fluence needs to free the resources", pod.Name) + + pgMgr.mutex.Lock() + defer pgMgr.mutex.Unlock() + + _, ok := pgMgr.groupToJobId[groupName] + if ok { + pgMgr.cancelFluxJob(groupName, pod) + } else { + klog.Infof("[Fluence] Terminating pod %s/%s in group %s doesn't have flux jobid", pod.Namespace, pod.Name, groupName) + } + case corev1.PodRunning: + pgMgr.mutex.Lock() + defer pgMgr.mutex.Unlock() + + _, ok := pgMgr.groupToJobId[groupName] + if ok { + pgMgr.cancelFluxJob(groupName, pod) + } else { + klog.Infof("[Fluence] Deleted pod %s/%s in group %s doesn't have flux jobid", pod.Namespace, pod.Name, groupName) + } + } +} diff --git a/sig-scheduler-plugins/pkg/fluence/fluence.go b/sig-scheduler-plugins/pkg/fluence/fluence.go index 1ad1fd3..5f9f635 100644 --- a/sig-scheduler-plugins/pkg/fluence/fluence.go +++ b/sig-scheduler-plugins/pkg/fluence/fluence.go @@ -22,20 +22,19 @@ import ( "sync" "time" - v1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/util/sets" + klog "k8s.io/klog/v2" + + corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/runtime" clientscheme "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/tools/cache" fgroup "sigs.k8s.io/scheduler-plugins/pkg/fluence/group" - label "sigs.k8s.io/scheduler-plugins/pkg/fluence/labels" corev1helpers "k8s.io/component-helpers/scheduling/corev1" - "k8s.io/klog/v2" "k8s.io/kubernetes/pkg/scheduler/framework" "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/scheduler-plugins/pkg/util" "sigs.k8s.io/scheduler-plugins/apis/config" "sigs.k8s.io/scheduler-plugins/apis/scheduling" @@ -46,25 +45,17 @@ import ( // Fluence schedules pods in a group using Fluxion as a backend // We inherit cosched.Coscheduling to use some of the primary functions type Fluence struct { - mutex sync.Mutex - client 
client.Client - - // Store jobid on the level of a group (which can be a single pod) - groupToJobId map[string]uint64 - + mutex sync.Mutex + client client.Client frameworkHandler framework.Handle pgMgr fcore.Manager scheduleTimeout *time.Duration - pgBackoff *time.Duration } var ( - _ framework.QueueSortPlugin = &Fluence{} - _ framework.PreFilterPlugin = &Fluence{} - _ framework.PostFilterPlugin = &Fluence{} // Here down are from coscheduling - _ framework.PermitPlugin = &Fluence{} - _ framework.ReservePlugin = &Fluence{} - _ framework.EnqueueExtensions = &Fluence{} + _ framework.QueueSortPlugin = &Fluence{} + _ framework.PreFilterPlugin = &Fluence{} + _ framework.FilterPlugin = &Fluence{} ) const ( @@ -77,10 +68,11 @@ func New(obj runtime.Object, handle framework.Handle) (framework.Plugin, error) // Keep these empty for now, use defaults args := config.CoschedulingArgs{} + ctx := context.TODO() scheme := runtime.NewScheme() _ = clientscheme.AddToScheme(scheme) - _ = v1.AddToScheme(scheme) + _ = corev1.AddToScheme(scheme) _ = v1alpha1.AddToScheme(scheme) client, err := client.New(handle.KubeConfig(), client.Options{Scheme: scheme}) @@ -89,7 +81,8 @@ func New(obj runtime.Object, handle framework.Handle) (framework.Plugin, error) } // Performance improvement when retrieving list of objects by namespace or we'll log 'index not exist' warning. - handle.SharedInformerFactory().Core().V1().Pods().Informer().AddIndexers(cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}) + fluxPodsInformer := handle.SharedInformerFactory().Core().V1().Pods().Informer() + fluxPodsInformer.AddIndexers(cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}) // PermitWaitingTimeSeconds is the waiting timeout in seconds. scheduleTimeDuration := time.Duration(args.PermitWaitingTimeSeconds) * time.Second @@ -101,22 +94,17 @@ func New(obj runtime.Object, handle framework.Handle) (framework.Plugin, error) handle.SharedInformerFactory().Core().V1().Pods(), ) - // The main difference here is adding the groupToJobId lookup + // Event handlers to call on pgMgr + fluxPodsInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ + UpdateFunc: pgMgr.UpdatePod, + DeleteFunc: pgMgr.DeletePod, + }) + go fluxPodsInformer.Run(ctx.Done()) + plugin := &Fluence{ frameworkHandler: handle, pgMgr: pgMgr, scheduleTimeout: &scheduleTimeDuration, - groupToJobId: make(map[string]uint64), - } - - // PodGroupBackoffSeconds: backoff time in seconds before a pod group can be scheduled again. - if args.PodGroupBackoffSeconds < 0 { - err := fmt.Errorf("parse arguments failed") - klog.ErrorS(err, "PodGroupBackoffSeconds cannot be negative") - return nil, err - } else if args.PodGroupBackoffSeconds > 0 { - pgBackoff := time.Duration(args.PodGroupBackoffSeconds) * time.Second - plugin.pgBackoff = &pgBackoff } return plugin, nil } @@ -128,6 +116,7 @@ func (f *Fluence) Name() string { // Fluence has added delete, although I wonder if update includes that signal // and it's redundant? 
func (f *Fluence) EventsToRegister() []framework.ClusterEventWithHint { + // TODO I have not redone this yet, not sure what it does (it might replace our informer above) // To register a custom event, follow the naming convention at: // https://git.k8s.io/kubernetes/pkg/scheduler/eventhandlers.go#L403-L410 pgGVK := fmt.Sprintf("podgroups.v1alpha1.%v", scheduling.GroupName) @@ -137,6 +126,33 @@ func (f *Fluence) EventsToRegister() []framework.ClusterEventWithHint { } } +// TODO we need to account for affinity here +func (f *Fluence) Filter( + ctx context.Context, + cycleState *framework.CycleState, + pod *corev1.Pod, + nodeInfo *framework.NodeInfo, +) *framework.Status { + + klog.Info("Filtering input node ", nodeInfo.Node().Name) + state, err := cycleState.Read(framework.StateKey(pod.Name)) + + // No error means we retrieved the state + if err == nil { + + // Try to convert the state to FluxStateDate + value, ok := state.(*fcore.FluxStateData) + + // If we have state data that isn't equal to the current assignment, no go + if ok && value.NodeName != nodeInfo.Node().Name { + return framework.NewStatus(framework.Unschedulable, "pod is not permitted") + } else { + klog.Infof("Filter: node %s selected for %s\n", value.NodeName, pod.Name) + } + } + return framework.NewStatus(framework.Success) +} + // Less is used to sort pods in the scheduling queue in the following order. // 1. Compare the priorities of Pods. // 2. Compare the initialization timestamps of PodGroups or Pods. @@ -169,125 +185,37 @@ func (f *Fluence) Less(podInfo1, podInfo2 *framework.QueuedPodInfo) bool { } -// PreFilter performs the following validations. -// 1. Whether the PodGroup that the Pod belongs to is on the deny list. -// 2. Whether the total number of pods in a PodGroup is less than its `minMember`. -func (f *Fluence) PreFilter(ctx context.Context, state *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) { - // If PreFilter fails, return framework.UnschedulableAndUnresolvable to avoid - // any preemption attempts. - if err := f.pgMgr.PreFilter(ctx, pod); err != nil { - klog.ErrorS(err, "PreFilter failed", "pod", klog.KObj(pod)) - return nil, framework.NewStatus(framework.UnschedulableAndUnresolvable, err.Error()) - } - return nil, framework.NewStatus(framework.Success, "") -} - -// PostFilter is used to reject a group of pods if a pod does not pass PreFilter or Filter. -func (f *Fluence) PostFilter(ctx context.Context, state *framework.CycleState, pod *v1.Pod, - filteredNodeStatusMap framework.NodeToStatusMap) (*framework.PostFilterResult, *framework.Status) { - pgName, pg := f.pgMgr.GetPodGroup(ctx, pod) - if pg == nil { - klog.V(4).InfoS("Pod does not belong to any group", "pod", klog.KObj(pod)) - return &framework.PostFilterResult{}, framework.NewStatus(framework.Unschedulable, "can not find pod group") - } - - // This indicates there are already enough Pods satisfying the PodGroup, - // so don't bother to reject the whole PodGroup. 
- assigned := f.pgMgr.CalculateAssignedPods(pg.Name, pod.Namespace) - if assigned >= int(pg.Spec.MinMember) { - klog.V(4).InfoS("Assigned pods", "podGroup", klog.KObj(pg), "assigned", assigned) - return &framework.PostFilterResult{}, framework.NewStatus(framework.Unschedulable) - } - - // If the gap is less than/equal 10%, we may want to try subsequent Pods - // to see they can satisfy the PodGroup - notAssignedPercentage := float32(int(pg.Spec.MinMember)-assigned) / float32(pg.Spec.MinMember) - if notAssignedPercentage <= 0.1 { - klog.V(4).InfoS("A small gap of pods to reach the quorum", "podGroup", klog.KObj(pg), "percentage", notAssignedPercentage) - return &framework.PostFilterResult{}, framework.NewStatus(framework.Unschedulable) - } - - // It's based on an implicit assumption: if the nth Pod failed, - // it's inferrable other Pods belonging to the same PodGroup would be very likely to fail. - f.frameworkHandler.IterateOverWaitingPods(func(waitingPod framework.WaitingPod) { - if waitingPod.GetPod().Namespace == pod.Namespace && label.GetPodGroupLabel(waitingPod.GetPod()) == pg.Name { - klog.V(3).InfoS("PostFilter rejects the pod", "podGroup", klog.KObj(pg), "pod", klog.KObj(waitingPod.GetPod())) - waitingPod.Reject(f.Name(), "optimistic rejection in PostFilter") - } - }) - - if f.pgBackoff != nil { - pods, err := f.frameworkHandler.SharedInformerFactory().Core().V1().Pods().Lister().Pods(pod.Namespace).List( - labels.SelectorFromSet(labels.Set{v1alpha1.PodGroupLabel: label.GetPodGroupLabel(pod)}), - ) - if err == nil && len(pods) >= int(pg.Spec.MinMember) { - f.pgMgr.BackoffPodGroup(pgName, *f.pgBackoff) - } - } - - f.pgMgr.DeletePermittedPodGroup(pgName) - return &framework.PostFilterResult{}, framework.NewStatus(framework.Unschedulable, - fmt.Sprintf("PodGroup %v gets rejected due to Pod %v is unschedulable even after PostFilter", pgName, pod.Name)) -} - -// PreFilterExtensions returns a PreFilterExtensions interface if the plugin implements one. +// PreFilterExtensions allow for callbacks on filtered states +// This is required to be defined for a PreFilter plugin +// https://github.com/kubernetes/kubernetes/blob/master/pkg/scheduler/framework/interface.go#L383 func (f *Fluence) PreFilterExtensions() framework.PreFilterExtensions { return nil } -// Permit is the functions invoked by the framework at "Permit" extension point. -func (f *Fluence) Permit(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (*framework.Status, time.Duration) { - waitTime := *f.scheduleTimeout - s := f.pgMgr.Permit(ctx, pod) - var retStatus *framework.Status - switch s { - case fcore.PodGroupNotSpecified: - return framework.NewStatus(framework.Success, ""), 0 - case fcore.PodGroupNotFound: - return framework.NewStatus(framework.Unschedulable, "PodGroup not found"), 0 - case fcore.Wait: - klog.InfoS("Pod is waiting to be scheduled to node", "pod", klog.KObj(pod), "nodeName", nodeName) - _, pg := f.pgMgr.GetPodGroup(ctx, pod) - - // Note this is in seconds, defaults to 60 seconds - if wait := util.GetWaitTimeDuration(pg, f.scheduleTimeout); wait != 0 { - waitTime = wait - } - retStatus = framework.NewStatus(framework.Wait) - // We will also request to move the sibling pods back to activeQ. 
- f.pgMgr.ActivateSiblings(pod, state) - case fcore.Success: - pgFullName := label.GetPodGroupFullName(pod) - f.frameworkHandler.IterateOverWaitingPods(func(waitingPod framework.WaitingPod) { - if label.GetPodGroupFullName(waitingPod.GetPod()) == pgFullName { - klog.V(3).InfoS("Permit allows", "pod", klog.KObj(waitingPod.GetPod())) - waitingPod.Allow(f.Name()) - } - }) - klog.V(3).InfoS("Permit allows", "pod", klog.KObj(pod)) - retStatus = framework.NewStatus(framework.Success) - waitTime = 0 +// PreFilter performs the following validations. +// 1. Whether the PodGroup that the Pod belongs to is on the deny list. +// 2. Whether the total number of pods in a PodGroup is less than its `minMember`. +func (f *Fluence) PreFilter( + ctx context.Context, + state *framework.CycleState, + pod *corev1.Pod, +) (*framework.PreFilterResult, *framework.Status) { + + // Quick check if the pod is already scheduled + f.mutex.Lock() + node := f.pgMgr.GetPodNode(pod) + f.mutex.Unlock() + if node != "" { + result := framework.PreFilterResult{NodeNames: sets.New(node)} + return &result, framework.NewStatus(framework.Success, "") } - - return retStatus, waitTime -} - -// Reserve is the functions invoked by the framework at "reserve" extension point. -func (f *Fluence) Reserve(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) *framework.Status { - return nil -} - -// Unreserve rejects all other Pods in the PodGroup when one of the pods in the group times out. -func (f *Fluence) Unreserve(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) { - pgName, pg := f.pgMgr.GetPodGroup(ctx, pod) - if pg == nil { - return + // This will populate the node name into the pod group manager + err := f.pgMgr.PreFilter(ctx, pod, state) + if err != nil { + klog.ErrorS(err, "PreFilter failed", "pod", klog.KObj(pod)) + return nil, framework.NewStatus(framework.UnschedulableAndUnresolvable, err.Error()) } - f.frameworkHandler.IterateOverWaitingPods(func(waitingPod framework.WaitingPod) { - if waitingPod.GetPod().Namespace == pod.Namespace && label.GetPodGroupLabel(waitingPod.GetPod()) == pg.Name { - klog.V(3).InfoS("Unreserve rejects", "pod", klog.KObj(waitingPod.GetPod()), "podGroup", klog.KObj(pg)) - waitingPod.Reject(f.Name(), "rejection in Unreserve") - } - }) - f.pgMgr.DeletePermittedPodGroup(pgName) + node = f.pgMgr.GetPodNode(pod) + result := framework.PreFilterResult{NodeNames: sets.New(node)} + return &result, framework.NewStatus(framework.Success, "") } From 5a86a239362b1b3777f5194b7565969b27ac6565 Mon Sep 17 00:00:00 2001 From: vsoch Date: Thu, 14 Mar 2024 20:43:53 -0600 Subject: [PATCH 23/28] feat: add small logger just for fluence Problem: it is really hard using klog and parses through messy multi-threaded logs Solution: make a little (likely temporary) filesystem logger for a single place of truth! 
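As a sketch of the idea only (the actual sig-scheduler-plugins/pkg/logger added below has levels and a NewDebugLogger constructor; the names here are simplified), the single-file logger boils down to appending formatted lines to one file on every call:

```go
package main

import (
	"fmt"
	"os"
	"time"
)

// DebugLogger appends formatted messages to a single file so output from
// concurrent scheduler goroutines ends up in one place. Simplified sketch.
type DebugLogger struct {
	Filename string
}

// Info opens the file in append mode, writes one timestamped line, and closes
// it again: slow, but simple enough for debugging.
func (l *DebugLogger) Info(format string, args ...interface{}) error {
	f, err := os.OpenFile(l.Filename, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
	if err != nil {
		return err
	}
	defer f.Close()
	msg := fmt.Sprintf(format, args...)
	_, err = fmt.Fprintf(f, "%s INFO %s\n", time.Now().Format(time.RFC3339), msg)
	return err
}

func main() {
	log := DebugLogger{Filename: "/tmp/fluence.log"}
	_ = log.Info("Node selected %s for group %s", "node-a", "lammps-group")
}
```

Opening and appending on each call is deliberately simple; it is not fast, but it keeps all of fluence's output in one /tmp/fluence.log that can be read out of the scheduler container while debugging.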
Signed-off-by: vsoch --- Makefile | 2 + README.md | 8 + .../pkg/controllers/podgroup_controller.go | 14 + .../pkg/fluence/core/core.go | 62 ++--- .../pkg/fluence/core/flux.go | 46 ++-- sig-scheduler-plugins/pkg/fluence/fluence.go | 23 +- sig-scheduler-plugins/pkg/fluence/register.go | 55 ++++ sig-scheduler-plugins/pkg/logger/logger.go | 88 ++++++ src/fluence/utils/utils.go | 256 +++++++++++------- 9 files changed, 388 insertions(+), 166 deletions(-) create mode 100644 sig-scheduler-plugins/pkg/fluence/register.go create mode 100644 sig-scheduler-plugins/pkg/logger/logger.go diff --git a/Makefile b/Makefile index 91789d8..d051a0e 100644 --- a/Makefile +++ b/Makefile @@ -26,9 +26,11 @@ update: clone prepare: clone # These are entirely new directory structures rm -rf $(CLONE_UPSTREAM)/pkg/fluence + rm -rf $(CLONE_UPSTREAM)/pkg/logger # rm -rf $(CLONE_UPSTREAM)/cmd/app rm -rf $(CLONE_UPSTREAM)/pkg/controllers/podgroup_controller.go rm -rf $(CLONE_UPSTREAM)/cmd/controller/app/server.go + cp -R sig-scheduler-plugins/pkg/logger $(CLONE_UPSTREAM)/pkg/logger cp -R sig-scheduler-plugins/pkg/fluence $(CLONE_UPSTREAM)/pkg/fluence cp -R sig-scheduler-plugins/pkg/controllers/* $(CLONE_UPSTREAM)/pkg/controllers/ # This is the one exception not from sig-scheduler-plugins because it is needed in both spots diff --git a/README.md b/README.md index 8c51de7..e3e1214 100644 --- a/README.md +++ b/README.md @@ -509,6 +509,14 @@ The last step ensures we use the images we loaded! You can basically just do: This sped up my development time immensely. If you want to manually do the steps, see that script for instructions. +#### Logging + +For easier viewing of what fluence is doing (in the sig-scheduler-plugins) we have a file logger that can be seen in the container: + +```bash +$ kubectl exec -it fluence-68c4c586c6-nktdl -c scheduler-plugins-scheduler -- cat /tmp/fluence.log +``` + ##### kubectl plugin Note that if you want to enable extra endpoints for the fluence kubectl plugin and expose the GRPC as a service, you can do: diff --git a/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go b/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go index 27c31cb..a2fd4a6 100644 --- a/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go +++ b/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go @@ -405,6 +405,8 @@ func (r *PodGroupReconciler) SetupWithManager(mgr ctrl.Manager) error { Complete(r) } +// ensurePodGroup ensures we create the pod group (or delete) when pod is deleted +// for delete, this would be better done as an owner reference., but I haven't gotten it working func (r *PodGroupReconciler) ensurePodGroup(ctx context.Context, obj client.Object) []ctrl.Request { pod, ok := obj.(*v1.Pod) if !ok { @@ -418,6 +420,18 @@ func (r *PodGroupReconciler) ensurePodGroup(ctx context.Context, obj client.Obje return nil } + // If we deleted the pod... assume we delete the group too + if !pod.ObjectMeta.DeletionTimestamp.IsZero() { + r.log.Info("Pod: ", "Name", pod.Name, "Status", pod.Status.Phase, "Action", "Deleted") + + pg := &schedv1alpha1.PodGroup{} + err := r.Get(ctx, types.NamespacedName{Name: groupName, Namespace: pod.Namespace}, pg) + if err != nil { + r.Delete(ctx, pg) + } + return nil + } + // If we are watching the Pod and it's beyond pending, we hopefully already made a group // and that group should be in the reconcile process. 
if pod.Status.Phase != v1.PodPending { diff --git a/sig-scheduler-plugins/pkg/fluence/core/core.go b/sig-scheduler-plugins/pkg/fluence/core/core.go index eed9536..8b08468 100644 --- a/sig-scheduler-plugins/pkg/fluence/core/core.go +++ b/sig-scheduler-plugins/pkg/fluence/core/core.go @@ -35,6 +35,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1" + "sigs.k8s.io/scheduler-plugins/pkg/logger" "sigs.k8s.io/scheduler-plugins/pkg/util" ) @@ -84,10 +85,17 @@ type PodGroupManager struct { // Probably should just choose one... oh well sync.RWMutex mutex sync.Mutex + log *logger.DebugLogger } // NewPodGroupManager creates a new operation object. -func NewPodGroupManager(client client.Client, snapshotSharedLister framework.SharedLister, scheduleTimeout *time.Duration, podInformer informerv1.PodInformer) *PodGroupManager { +func NewPodGroupManager( + client client.Client, + snapshotSharedLister framework.SharedLister, + scheduleTimeout *time.Duration, + podInformer informerv1.PodInformer, + log *logger.DebugLogger, +) *PodGroupManager { pgMgr := &PodGroupManager{ client: client, snapshotSharedLister: snapshotSharedLister, @@ -97,6 +105,7 @@ func NewPodGroupManager(client client.Client, snapshotSharedLister framework.Sha backedOffPG: gochache.New(10*time.Second, 10*time.Second), groupToJobId: map[string]uint64{}, podToNode: map[string]string{}, + log: log, } return pgMgr } @@ -126,13 +135,14 @@ func (pgMgr *PodGroupManager) PreFilter( state *framework.CycleState, ) error { - klog.V(5).InfoS("Pre-filter", "pod", klog.KObj(pod)) + pgMgr.log.Info("[PodGroup PreFilter] pod %s", klog.KObj(pod)) pgFullName, pg := pgMgr.GetPodGroup(ctx, pod) if pg == nil { return nil } - if _, exist := pgMgr.backedOffPG.Get(pgFullName); exist { + _, exist := pgMgr.backedOffPG.Get(pgFullName) + if exist { return fmt.Errorf("podGroup %v failed recently", pgFullName) } @@ -147,7 +157,7 @@ func (pgMgr *PodGroupManager) PreFilter( statuses := pgMgr.GetStatuses(pods) // This shows us the number of pods we have in the set and their states - klog.Infof("Fluence Pre-filter", "group", pgFullName, "pods", statuses, "MinMember", pg.Spec.MinMember, "Size", len(pods)) + pgMgr.log.Info("[PodGroup PreFilter] group: %s pods: %s MinMember: %d Size: %d", pgFullName, statuses, pg.Spec.MinMember, len(pods)) if len(pods) < int(pg.Spec.MinMember) { return fmt.Errorf("pre-filter pod %v cannot find enough sibling pods, "+ "current pods number: %v, minMember of group: %v", pod.Name, len(pods), pg.Spec.MinMember) @@ -164,7 +174,8 @@ func (pgMgr *PodGroupManager) PreFilter( // TODO(cwdsuzhou): This resource check may not always pre-catch unschedulable pod group. // It only tries to PreFilter resource constraints so even if a PodGroup passed here, // it may not necessarily pass Filter due to other constraints such as affinity/taints. - if _, ok := pgMgr.permittedPG.Get(pgFullName); ok { + _, ok := pgMgr.permittedPG.Get(pgFullName) + if ok { return nil } @@ -173,14 +184,14 @@ func (pgMgr *PodGroupManager) PreFilter( repPod := pods[0] nodes, err := pgMgr.AskFlux(ctx, *repPod, pg, pgFullName) if err != nil { - klog.Infof("[Fluence] Fluxion returned an error %s, not schedulable", err.Error()) + pgMgr.log.Info("[PodGroup PreFilter] Fluxion returned an error %s, not schedulable", err.Error()) return err } - klog.Infof("Node Selected %s (pod group %s)", nodes, pgFullName) + pgMgr.log.Info("Node Selected %s (pod group %s)", nodes, pgFullName) // Some reason fluxion gave us the wrong size? 
if len(nodes) != len(pods) { - klog.Info("Warning - group %s needs %d nodes but Fluxion returned the wrong number nodes %d.", pgFullName, len(pods), len(nodes)) + pgMgr.log.Warning("[PodGroup PreFilter] group %s needs %d nodes but Fluxion returned the wrong number nodes %d.", pgFullName, len(pods), len(nodes)) pgMgr.mutex.Lock() pgMgr.cancelFluxJob(pgFullName, repPod) pgMgr.mutex.Unlock() @@ -236,38 +247,3 @@ func (pgMgr *PodGroupManager) GetPodGroup(ctx context.Context, pod *corev1.Pod) func GetNamespacedName(obj metav1.Object) string { return fmt.Sprintf("%v/%v", obj.GetNamespace(), obj.GetName()) } - -func getNodeResource(info *framework.NodeInfo, desiredPodGroupName string) *framework.Resource { - nodeClone := info.Clone() - for _, podInfo := range info.Pods { - if podInfo == nil || podInfo.Pod == nil { - continue - } - if util.GetPodGroupFullName(podInfo.Pod) != desiredPodGroupName { - continue - } - nodeClone.RemovePod(podInfo.Pod) - } - - leftResource := framework.Resource{ - ScalarResources: make(map[corev1.ResourceName]int64), - } - allocatable := nodeClone.Allocatable - requested := nodeClone.Requested - - leftResource.AllowedPodNumber = allocatable.AllowedPodNumber - len(nodeClone.Pods) - leftResource.MilliCPU = allocatable.MilliCPU - requested.MilliCPU - leftResource.Memory = allocatable.Memory - requested.Memory - leftResource.EphemeralStorage = allocatable.EphemeralStorage - requested.EphemeralStorage - - for k, allocatableEx := range allocatable.ScalarResources { - requestEx, ok := requested.ScalarResources[k] - if !ok { - leftResource.ScalarResources[k] = allocatableEx - } else { - leftResource.ScalarResources[k] = allocatableEx - requestEx - } - } - klog.V(4).InfoS("Node left resource", "node", klog.KObj(info.Node()), "resource", leftResource) - return &leftResource -} diff --git a/sig-scheduler-plugins/pkg/fluence/core/flux.go b/sig-scheduler-plugins/pkg/fluence/core/flux.go index def239f..48e1500 100644 --- a/sig-scheduler-plugins/pkg/fluence/core/flux.go +++ b/sig-scheduler-plugins/pkg/fluence/core/flux.go @@ -6,7 +6,6 @@ import ( "google.golang.org/grpc" "k8s.io/apimachinery/pkg/labels" - klog "k8s.io/klog/v2" pb "sigs.k8s.io/scheduler-plugins/pkg/fluence/fluxcli-grpc" fgroup "sigs.k8s.io/scheduler-plugins/pkg/fluence/group" @@ -38,7 +37,7 @@ func (pgMgr *PodGroupManager) AskFlux( // cancel in fluence. What we can do here is assume the previous pods are no longer running // and cancel the flux job to create again. if isAllocated { - klog.Info("Warning - group %s was previously allocated and is requesting again, so must have completed.", groupName) + pgMgr.log.Warning("[PodGroup AskFlux] group %s was previously allocated and is requesting again, so must have completed.", groupName) pgMgr.mutex.Lock() pgMgr.cancelFluxJob(groupName, &pod) pgMgr.mutex.Unlock() @@ -49,12 +48,12 @@ func (pgMgr *PodGroupManager) AskFlux( // This obviously may not be true if we have a hetereogenous PodGroup. 
// We name it based on the group, since it will represent the group jobspec := utils.PreparePodJobSpec(&pod, groupName) - klog.Infof("[Fluence] Inspect pod info, jobspec: %s\n", jobspec) + pgMgr.log.Info("[PodGroup AskFlux] Inspect pod info, jobspec: %s\n", jobspec) conn, err := grpc.Dial("127.0.0.1:4242", grpc.WithInsecure()) // TODO change this to just return fmt.Errorf if err != nil { - klog.Errorf("[Fluence] Error connecting to server: %v\n", err) + pgMgr.log.Error("[PodGroup AskFlux] Error connecting to server: %v\n", err) return nodes, err } defer conn.Close() @@ -72,12 +71,12 @@ func (pgMgr *PodGroupManager) AskFlux( // An error here is an error with making the request r, err := grpcclient.Match(context.Background(), request) if err != nil { - klog.Errorf("[Fluence] did not receive any match response: %v\n", err) + pgMgr.log.Warning("[PodGroup AskFlux] did not receive any match response: %v\n", err) return nodes, err } // TODO GetPodID should be renamed, because it will reflect the group - klog.Infof("[Fluence] Match response ID %s\n", r.GetPodID()) + pgMgr.log.Info("[PodGroup AskFlux] Match response ID %s\n", r.GetPodID()) // Get the nodelist and inspect nodelist := r.GetNodelist() @@ -85,7 +84,7 @@ func (pgMgr *PodGroupManager) AskFlux( nodes = append(nodes, node.NodeID) } jobid := uint64(r.GetJobID()) - klog.Infof("[Fluence] parsed node pods list %s for job id %d\n", nodes, jobid) + pgMgr.log.Info("[PodGroup AskFlux] parsed node pods list %s for job id %d\n", nodes, jobid) // TODO would be nice to actually be able to ask flux jobs -a to fluence // That way we can verify assignments, etc. @@ -103,15 +102,15 @@ func (pgMgr *PodGroupManager) cancelFluxJob(groupName string, pod *corev1.Pod) e // The job was already cancelled by another pod if !ok { - klog.Infof("[Fluence] Request for cancel of group %s is already complete.", groupName) + pgMgr.log.Info("[PodGroup cancelFluxJob] Request for cancel of group %s is already complete.", groupName) return nil } - klog.Infof("[Fluence] Cancel flux job: %v for group %s", jobid, groupName) + pgMgr.log.Info("[PodGroup cancelFluxJob] Cancel flux job: %v for group %s", jobid, groupName) // This first error is about connecting to the server conn, err := grpc.Dial("127.0.0.1:4242", grpc.WithInsecure()) if err != nil { - klog.Errorf("[Fluence] Error connecting to server: %v", err) + pgMgr.log.Error("[PodGroup cancelFluxJob] Error connecting to server: %v", err) return err } defer conn.Close() @@ -124,17 +123,17 @@ func (pgMgr *PodGroupManager) cancelFluxJob(groupName string, pod *corev1.Pod) e request := &pb.CancelRequest{JobID: int64(jobid)} res, err := grpcclient.Cancel(context.Background(), request) if err != nil { - klog.Errorf("[Fluence] did not receive any cancel response: %v", err) + pgMgr.log.Error("[PodGroup cancelFluxJob] did not receive any cancel response: %v", err) return err } - klog.Infof("[Fluence] Job cancellation for group %s result: %d", groupName, res.Error) + pgMgr.log.Info("[PodGroup cancelFluxJob] Job cancellation for group %s result: %d", groupName, res.Error) // And this error is if the cancel was successful or not if res.Error == 0 { - klog.Infof("[Fluence] Successful cancel of flux job: %d for group %s", jobid, groupName) + pgMgr.log.Info("[PodGroup cancelFluxJob] Successful cancel of flux job: %d for group %s", jobid, groupName) pgMgr.cleanup(pod, groupName) } else { - klog.Warningf("[Fluence] Failed to cancel flux job %d for group %s", jobid, groupName) + pgMgr.log.Warning("[PodGroup cancelFluxJob] Failed to cancel flux 
job %d for group %s", jobid, groupName) } return nil } @@ -174,7 +173,7 @@ func (pgMgr *PodGroupManager) UpdatePod(oldObj, newObj interface{}) { groupName = pg.Name } - klog.Infof("[Fluence] Processing event for pod %s in group %s from %s to %s", newPod.Name, groupName, oldPod.Status.Phase, newPod.Status.Phase) + pgMgr.log.Verbose("[PodGroup UpdatePod] Processing event for pod %s in group %s from %s to %s", newPod.Name, groupName, oldPod.Status.Phase, newPod.Status.Phase) switch newPod.Status.Phase { case corev1.PodPending: @@ -182,7 +181,7 @@ func (pgMgr *PodGroupManager) UpdatePod(oldObj, newObj interface{}) { case corev1.PodRunning: // if a pod is start running, we can add it state to the delta graph if it is scheduled by other scheduler case corev1.PodSucceeded: - klog.Infof("[Fluence] Pod %s succeeded, Fluence needs to free the resources", newPod.Name) + pgMgr.log.Info("[PodGroup UpdatePod] Pod %s succeeded, Fluence needs to free the resources", newPod.Name) pgMgr.mutex.Lock() defer pgMgr.mutex.Unlock() @@ -194,13 +193,13 @@ func (pgMgr *PodGroupManager) UpdatePod(oldObj, newObj interface{}) { if ok { pgMgr.cancelFluxJob(groupName, oldPod) } else { - klog.Infof("[Fluence] Succeeded pod %s/%s in group %s doesn't have flux jobid", newPod.Namespace, newPod.Name, groupName) + pgMgr.log.Verbose("[PodGroup UpdatePod] Succeeded pod %s/%s in group %s doesn't have flux jobid", newPod.Namespace, newPod.Name, groupName) } case corev1.PodFailed: // a corner case need to be tested, the pod exit code is not 0, can be created with segmentation fault pi test - klog.Warningf("[Fluence] Pod %s in group %s failed, Fluence needs to free the resources", newPod.Name, groupName) + pgMgr.log.Warning("[PodGroup UpdatePod] Pod %s in group %s failed, Fluence needs to free the resources", newPod.Name, groupName) pgMgr.mutex.Lock() defer pgMgr.mutex.Unlock() @@ -209,7 +208,7 @@ func (pgMgr *PodGroupManager) UpdatePod(oldObj, newObj interface{}) { if ok { pgMgr.cancelFluxJob(groupName, oldPod) } else { - klog.Errorf("[Fluence] Failed pod %s/%s in group %s doesn't have flux jobid", newPod.Namespace, newPod.Name, groupName) + pgMgr.log.Error("[PodGroup UpdatePod] Failed pod %s/%s in group %s doesn't have flux jobid", newPod.Namespace, newPod.Name, groupName) } case corev1.PodUnknown: // don't know how to deal with it as it's unknown phase @@ -220,7 +219,6 @@ func (pgMgr *PodGroupManager) UpdatePod(oldObj, newObj interface{}) { // DeletePod handles the delete event handler func (pgMgr *PodGroupManager) DeletePod(podObj interface{}) { - klog.Info("[Fluence] Delete Pod event handler") pod := podObj.(*corev1.Pod) groupName, pg := pgMgr.GetPodGroup(context.TODO(), pod) @@ -230,11 +228,11 @@ func (pgMgr *PodGroupManager) DeletePod(podObj interface{}) { groupName = pg.Name } - klog.Infof("[Fluence] Delete pod %s in group %s has status %s", pod.Status.Phase, pod.Name, groupName) + pgMgr.log.Verbose("[PodGroup DeletePod] Delete pod %s in group %s has status %s", pod.Status.Phase, pod.Name, groupName) switch pod.Status.Phase { case corev1.PodSucceeded: case corev1.PodPending: - klog.Infof("[Fluence] Pod %s completed and is Pending termination, Fluence needs to free the resources", pod.Name) + pgMgr.log.Verbose("[PodGroup DeletePod] Pod %s completed and is Pending termination, Fluence needs to free the resources", pod.Name) pgMgr.mutex.Lock() defer pgMgr.mutex.Unlock() @@ -243,7 +241,7 @@ func (pgMgr *PodGroupManager) DeletePod(podObj interface{}) { if ok { pgMgr.cancelFluxJob(groupName, pod) } else { - klog.Infof("[Fluence] 
Terminating pod %s/%s in group %s doesn't have flux jobid", pod.Namespace, pod.Name, groupName) + pgMgr.log.Info("[PodGroup DeletePod] Terminating pod %s/%s in group %s doesn't have flux jobid", pod.Namespace, pod.Name, groupName) } case corev1.PodRunning: pgMgr.mutex.Lock() @@ -253,7 +251,7 @@ func (pgMgr *PodGroupManager) DeletePod(podObj interface{}) { if ok { pgMgr.cancelFluxJob(groupName, pod) } else { - klog.Infof("[Fluence] Deleted pod %s/%s in group %s doesn't have flux jobid", pod.Namespace, pod.Name, groupName) + pgMgr.log.Info("[PodGroup DeletePod] Deleted pod %s/%s in group %s doesn't have flux jobid", pod.Namespace, pod.Name, groupName) } } } diff --git a/sig-scheduler-plugins/pkg/fluence/fluence.go b/sig-scheduler-plugins/pkg/fluence/fluence.go index 5f9f635..84f3e95 100644 --- a/sig-scheduler-plugins/pkg/fluence/fluence.go +++ b/sig-scheduler-plugins/pkg/fluence/fluence.go @@ -25,6 +25,8 @@ import ( "k8s.io/apimachinery/pkg/util/sets" klog "k8s.io/klog/v2" + "sigs.k8s.io/scheduler-plugins/pkg/logger" + corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/runtime" clientscheme "k8s.io/client-go/kubernetes/scheme" @@ -50,6 +52,7 @@ type Fluence struct { frameworkHandler framework.Handle pgMgr fcore.Manager scheduleTimeout *time.Duration + log *logger.DebugLogger } var ( @@ -70,6 +73,11 @@ func New(obj runtime.Object, handle framework.Handle) (framework.Plugin, error) args := config.CoschedulingArgs{} ctx := context.TODO() + // Make fluence his own little logger! + // This can eventually be a flag, but just going to set for now + // It shall be a very chonky file. Oh lawd he comin! + l := logger.NewDebugLogger(logger.LevelError, "/tmp/fluence.log") + scheme := runtime.NewScheme() _ = clientscheme.AddToScheme(scheme) _ = corev1.AddToScheme(scheme) @@ -92,6 +100,7 @@ func New(obj runtime.Object, handle framework.Handle) (framework.Plugin, error) &scheduleTimeDuration, // Keep the podInformer (from frameworkHandle) as the single source of Pods. handle.SharedInformerFactory().Core().V1().Pods(), + l, ) // Event handlers to call on pgMgr @@ -105,8 +114,13 @@ func New(obj runtime.Object, handle framework.Handle) (framework.Plugin, error) frameworkHandler: handle, pgMgr: pgMgr, scheduleTimeout: &scheduleTimeDuration, + log: l, } - return plugin, nil + + // TODO this is not supported yet + // Account for resources in running cluster + err = plugin.RegisterExisting(ctx) + return plugin, err } func (f *Fluence) Name() string { @@ -134,7 +148,7 @@ func (f *Fluence) Filter( nodeInfo *framework.NodeInfo, ) *framework.Status { - klog.Info("Filtering input node ", nodeInfo.Node().Name) + f.log.Verbose("[Fluence Filter] Filtering input node %s", nodeInfo.Node().Name) state, err := cycleState.Read(framework.StateKey(pod.Name)) // No error means we retrieved the state @@ -147,7 +161,7 @@ func (f *Fluence) Filter( if ok && value.NodeName != nodeInfo.Node().Name { return framework.NewStatus(framework.Unschedulable, "pod is not permitted") } else { - klog.Infof("Filter: node %s selected for %s\n", value.NodeName, pod.Name) + f.log.Info("[Fluence Filter] node %s selected for %s\n", value.NodeName, pod.Name) } } return framework.NewStatus(framework.Success) @@ -158,7 +172,6 @@ func (f *Fluence) Filter( // 2. Compare the initialization timestamps of PodGroups or Pods. // 3. Compare the keys of PodGroups/Pods: /. 
func (f *Fluence) Less(podInfo1, podInfo2 *framework.QueuedPodInfo) bool { - klog.Infof("ordering pods in fluence scheduler plugin") prio1 := corev1helpers.PodPriority(podInfo1.Pod) prio2 := corev1helpers.PodPriority(podInfo2.Pod) if prio1 != prio2 { @@ -212,7 +225,7 @@ func (f *Fluence) PreFilter( // This will populate the node name into the pod group manager err := f.pgMgr.PreFilter(ctx, pod, state) if err != nil { - klog.ErrorS(err, "PreFilter failed", "pod", klog.KObj(pod)) + f.log.Error("[Fluence PreFilter] failed pod %s: %s", klog.KObj(pod), err.Error()) return nil, framework.NewStatus(framework.UnschedulableAndUnresolvable, err.Error()) } node = f.pgMgr.GetPodNode(pod) diff --git a/sig-scheduler-plugins/pkg/fluence/register.go b/sig-scheduler-plugins/pkg/fluence/register.go new file mode 100644 index 0000000..8f39f09 --- /dev/null +++ b/sig-scheduler-plugins/pkg/fluence/register.go @@ -0,0 +1,55 @@ +/* +Copyright 2020 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package fluence + +import ( + "context" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" +) + +// RegisterExisting uses the in cluster API to ensure existing pods +// are known to fluence, This is a one-time, static approach, so if a resource +// here goes away we cannot remove it from being known. But it's better than +// not having it, and having fluxion assume more resources than the +// cluster has available. This is a TODO as fluxion does not support it +func (f *Fluence) RegisterExisting(ctx context.Context) error { + + // creates an in-cluster config and client + config, err := rest.InClusterConfig() + if err != nil { + f.log.Error("[Fluence RegisterExisting] Error creating in-cluster config: %s\n", err) + return err + } + // creates the clientset + clientset, err := kubernetes.NewForConfig(config) + if err != nil { + f.log.Error("[Fluence RegisterExisting] Error creating client for config: %s\n", err) + return err + } + // get pods in all the namespaces by omitting namespace + // Or specify namespace to get pods in particular namespace + pods, err := clientset.CoreV1().Pods("").List(ctx, metav1.ListOptions{}) + if err != nil { + f.log.Info("[Fluence RegisterExisting] Error listing pods: %s\n", err) + return err + } + f.log.Info("[Fluence RegisterExisting] Found %d existing pods in the cluster\n", len(pods.Items)) + return nil +} diff --git a/sig-scheduler-plugins/pkg/logger/logger.go b/sig-scheduler-plugins/pkg/logger/logger.go new file mode 100644 index 0000000..522be61 --- /dev/null +++ b/sig-scheduler-plugins/pkg/logger/logger.go @@ -0,0 +1,88 @@ +package logger + +// A small debug logger to write to file instead of klog +// I don't know where to close, so I'm opening and appending each time +// It's a bad design, but will work for debugging. 
+ +import ( + "fmt" + "log" + "os" +) + +const ( + LevelNone = iota + LevelInfo + LevelWarning + LevelError + LevelVerbose + LevelDebug +) + +// TODO try saving state here when we can close +type DebugLogger struct { + level int + Filename string + handle *os.File +} + +func NewDebugLogger(level int, filename string) *DebugLogger { + return &DebugLogger{ + level: LevelNone, + Filename: filename, + } +} + +func (l *DebugLogger) Start() (*log.Logger, error) { + f, err := os.OpenFile(l.Filename, os.O_WRONLY|os.O_CREATE|os.O_APPEND, os.ModePerm) + if err != nil { + return nil, err + } + logger := log.New(f, "", 0) + l.handle = f + return logger, nil +} +func (l *DebugLogger) Stop() error { + if l.handle != nil { + return l.handle.Close() + } + return nil +} + +// Logging functions you should use! +func (l *DebugLogger) Info(message ...any) error { + return l.log(LevelInfo, " INFO: ", message...) +} +func (l *DebugLogger) Error(message ...any) error { + return l.log(LevelError, " ERROR: ", message...) +} +func (l *DebugLogger) Debug(message ...any) error { + return l.log(LevelDebug, " DEBUG: ", message...) +} +func (l *DebugLogger) Verbose(message ...any) error { + return l.log(LevelVerbose, "VERBOSE: ", message...) +} +func (l *DebugLogger) Warning(message ...any) error { + return l.log(LevelWarning, "WARNING: ", message...) +} + +// log is the shared class function for actually printing to the log +func (l *DebugLogger) log(level int, prefix string, message ...any) error { + logger, err := l.Start() + if err != nil { + return err + } + // Assume the prolog (to be formatted) is at index 0 + prolog := message[0].(string) + if prefix != "" { + prolog = prefix + " " + prolog + } + rest := message[1:] + + // msg := fmt.Sprintf(message...) + fmt.Printf("Compariing level %d >= %d\n", level, l.level) + if level >= l.level { + logger.Printf(prolog, rest...) + } + return l.Stop() +} diff --git a/src/fluence/utils/utils.go b/src/fluence/utils/utils.go index f81f81c..e429056 100644 --- a/src/fluence/utils/utils.go +++ b/src/fluence/utils/utils.go @@ -4,6 +4,8 @@ import ( "context" "fmt" + klog "k8s.io/klog/v2" + "encoding/json" "github.com/flux-framework/flux-k8s/flux-plugin/fluence/jgf" @@ -20,7 +22,56 @@ var ( controlPlaneLabel = "node-role.kubernetes.io/control-plane" ) +// RegisterExisting uses the in cluster API to get existing pods +// This is actually the same as computeTotalRequests but I wanted to compare the two +// It is currently not being used. The main difference is that below, we are essentially +// rounding the cpu to the smaller unit (logically for the graph) but losing some +// granularity, if we think "milli" values have feet. 
+func RegisterExisting(clientset *kubernetes.Clientset, ctx context.Context) (map[string]PodSpec, error) { + + // We are using PodSpec as a holder for a *summary* of cpu/memory being used + // by the node, it is a summation across pods we find on each one + nodes := map[string]PodSpec{} + + // get pods in all the namespaces by omitting namespace + // Or specify namespace to get pods in particular namespace + pods, err := clientset.CoreV1().Pods("").List(ctx, metav1.ListOptions{}) + if err != nil { + klog.Infof("Error listing pods: %s\n", err) + return nodes, err + } + klog.Infof("Found %d existing pods in the cluster\n", len(pods.Items)) + + // Create a new PodSpec for each + for _, pod := range pods.Items { + + // Add the node to our lookup if we don't have it yet + _, ok := nodes[pod.Spec.NodeName] + if !ok { + nodes[pod.Spec.NodeName] = PodSpec{} + } + ps := nodes[pod.Spec.NodeName] + + for _, container := range pod.Spec.Containers { + specRequests := container.Resources.Requests + ps.Cpu += int32(specRequests.Cpu().Value()) + ps.Memory += specRequests.Memory().Value() + ps.Storage += specRequests.StorageEphemeral().Value() + + specLimits := container.Resources.Limits + gpuSpec := specLimits["nvidia.com/gpu"] + ps.Gpu += gpuSpec.Value() + } + nodes[pod.Spec.NodeName] = ps + } + return nodes, nil +} + // CreateJGF creates the Json Graph Format +// We currently don't have support in fluxion to allocate jobs for existing pods, +// so instead we create the graph with fewer resources. When that support is +// added (see sig-scheduler-plugins/pkg/fluence/register.go) we can +// remove the adjustment here, which is more of a hack func CreateJGF(filename string, skipLabel *string) error { ctx := context.Background() config, err := rest.InClusterConfig() @@ -28,16 +79,19 @@ func CreateJGF(filename string, skipLabel *string) error { fmt.Println("Error getting InClusterConfig") return err } - // creates the clientset clientset, err := kubernetes.NewForConfig(config) if err != nil { - fmt.Println("Error getting ClientSet") + fmt.Printf("Error getting ClientSet: %s", err) return err } nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + if err != nil { + fmt.Printf("Error listing nodes: %s", err) + return err + } - var fluxgraph jgf.Fluxjgf - fluxgraph = jgf.InitJGF() + // Create a Flux Json Graph Format (JGF) with all cluster nodes + fluxgraph := jgf.InitJGF() // TODO it looks like we can add more to the graph here - // let's remember to consider what else we can. 
@@ -53,11 +107,11 @@ func CreateJGF(filename string, skipLabel *string) error { vcores := 0 fmt.Println("Number nodes ", len(nodes.Items)) - var totalAllocCpu, totalmem int64 + var totalAllocCpu int64 totalAllocCpu = 0 sdnCount := 0 - for node_index, node := range nodes.Items { + for nodeIndex, node := range nodes.Items { // We should not be scheduling to the control plane _, ok := node.Labels[controlPlaneLabel] @@ -71,107 +125,121 @@ func CreateJGF(filename string, skipLabel *string) error { if *skipLabel != "" { _, ok := node.Labels[*skipLabel] if ok { - fmt.Println("Skipping node ", node.GetName()) + fmt.Printf("Skipping node %s\n", node.GetName()) continue } } - fmt.Println("node in flux group ", node.GetName()) - if !node.Spec.Unschedulable { - fieldselector, err := fields.ParseSelector("spec.nodeName=" + node.GetName() + ",status.phase!=" + string(corev1.PodSucceeded) + ",status.phase!=" + string(corev1.PodFailed)) - if err != nil { - return err - } - pods, err := clientset.CoreV1().Pods("").List(ctx, metav1.ListOptions{ - FieldSelector: fieldselector.String(), - }) - if err != nil { - return err - } + if node.Spec.Unschedulable { + fmt.Printf("Skipping node %s, unschedulable\n", node.GetName()) + continue + } - // fmt.Println("Node ", node.GetName(), " has pods ", pods) - // Check if subnet already exists - // Here we build subnets according to topology.kubernetes.io/zone label - subnetName := node.Labels["topology.kubernetes.io/zone"] - subnet := fluxgraph.MakeSubnet(sdnCount, subnetName) - sdnCount = sdnCount + 1 - fluxgraph.MakeEdge(cluster, subnet, "contains") - fluxgraph.MakeEdge(subnet, cluster, "in") - - reqs := computeTotalRequests(pods) - cpuReqs := reqs[corev1.ResourceCPU] - memReqs := reqs[corev1.ResourceMemory] - - avail := node.Status.Allocatable.Cpu().MilliValue() - totalcpu := int64((avail - cpuReqs.MilliValue()) / 1000) //- 1 - fmt.Println("Node ", node.GetName(), " flux cpu ", totalcpu) - totalAllocCpu = totalAllocCpu + totalcpu - totalmem = node.Status.Allocatable.Memory().Value() - memReqs.Value() - fmt.Println("Node ", node.GetName(), " total mem ", totalmem) - gpuAllocatable, hasGpuAllocatable := node.Status.Allocatable["nvidia.com/gpu"] - - // reslist := node.Status.Allocatable - // resources := make([]corev1.ResourceName, 0, len(reslist)) - // for resource := range reslist { - // fmt.Println("resource ", resource) - // resources = append(resources, resource) - // } - // for _, resource := range resources { - // value := reslist[resource] - - // fmt.Printf(" %s:\t%s\n", resource, value.String()) - // } - - workernode := fluxgraph.MakeNode(node_index, false, node.Name) - fluxgraph.MakeEdge(subnet, workernode, "contains") // this is rack otherwise - fluxgraph.MakeEdge(workernode, subnet, "in") // this is rack otherwise - - // socket := fluxgraph.MakeSocket(0, "socket") - // fluxgraph.MakeEdge(workernode, socket, "contains") - // fluxgraph.MakeEdge(socket, workernode, "in") - - if hasGpuAllocatable { - fmt.Println("GPU Resource quantity ", gpuAllocatable.Value()) - //MakeGPU(index int, name string, size int) string { - for index := 0; index < int(gpuAllocatable.Value()); index++ { - gpu := fluxgraph.MakeGPU(index, "nvidiagpu", 1) - fluxgraph.MakeEdge(workernode, gpu, "contains") // workernode was socket - fluxgraph.MakeEdge(gpu, workernode, "in") - } + fieldselector, err := fields.ParseSelector("spec.nodeName=" + node.GetName() + ",status.phase!=" + string(corev1.PodSucceeded) + ",status.phase!=" + string(corev1.PodFailed)) + if err != nil { + return err + } + 
pods, err := clientset.CoreV1().Pods("").List(ctx, metav1.ListOptions{ + FieldSelector: fieldselector.String(), + }) + if err != nil { + return err + } + // Check if subnet already exists + // Here we build subnets according to topology.kubernetes.io/zone label + subnetName := node.Labels["topology.kubernetes.io/zone"] + subnet := fluxgraph.MakeSubnet(sdnCount, subnetName) + sdnCount = sdnCount + 1 + fluxgraph.MakeEdge(cluster, subnet, "contains") + fluxgraph.MakeEdge(subnet, cluster, "in") + + // These are requests for existing pods, for cpu and memory + reqs := computeTotalRequests(pods) + cpuReqs := reqs[corev1.ResourceCPU] + memReqs := reqs[corev1.ResourceMemory] + + // Actual values that we have available (minus requests) + totalCpu := node.Status.Allocatable.Cpu().MilliValue() + totalMem := node.Status.Allocatable.Memory().Value() + + // Values accounting for requests + availCpu := int64((totalCpu - cpuReqs.MilliValue()) / 1000) + availMem := totalMem - memReqs.Value() + + // Show existing to compare to + fmt.Printf("\n📦️ %s\n", node.GetName()) + fmt.Printf(" allocated cpu: %d\n", cpuReqs.Value()) + fmt.Printf(" allocated mem: %d\n", memReqs.Value()) + fmt.Printf(" available cpu: %d\n", availCpu) + fmt.Printf(" running pods: %d\n", len(pods.Items)) + + // keep track of overall total + totalAllocCpu += availCpu + fmt.Printf(" available mem: %d\n", availMem) + gpuAllocatable, hasGpuAllocatable := node.Status.Allocatable["nvidia.com/gpu"] + + // reslist := node.Status.Allocatable + // resources := make([]corev1.ResourceName, 0, len(reslist)) + // for resource := range reslist { + // fmt.Println("resource ", resource) + // resources = append(resources, resource) + // } + // for _, resource := range resources { + // value := reslist[resource] + + // fmt.Printf(" %s:\t%s\n", resource, value.String()) + // } + + workernode := fluxgraph.MakeNode(nodeIndex, false, node.Name) + fluxgraph.MakeEdge(subnet, workernode, "contains") // this is rack otherwise + fluxgraph.MakeEdge(workernode, subnet, "in") // this is rack otherwise + + // socket := fluxgraph.MakeSocket(0, "socket") + // fluxgraph.MakeEdge(workernode, socket, "contains") + // fluxgraph.MakeEdge(socket, workernode, "in") + + if hasGpuAllocatable { + fmt.Println("GPU Resource quantity ", gpuAllocatable.Value()) + //MakeGPU(index int, name string, size int) string { + for index := 0; index < int(gpuAllocatable.Value()); index++ { + gpu := fluxgraph.MakeGPU(index, "nvidiagpu", 1) + fluxgraph.MakeEdge(workernode, gpu, "contains") // workernode was socket + fluxgraph.MakeEdge(gpu, workernode, "in") } - for index := 0; index < int(totalcpu); index++ { - // MakeCore(index int, name string) - core := fluxgraph.MakeCore(index, "core") - fluxgraph.MakeEdge(workernode, core, "contains") // workernode was socket - fluxgraph.MakeEdge(core, workernode, "in") - - // Question from Vanessa: - // How can we get here and have vcores ever not equal to zero? 
- if vcores == 0 { - fluxgraph.MakeNFDProperties(core, index, "cpu-", &node.Labels) - // fluxgraph.MakeNFDProperties(core, index, "netmark-", &node.Labels) - } else { - for vc := 0; vc < vcores; vc++ { - vcore := fluxgraph.MakeVCore(core, vc, "vcore") - fluxgraph.MakeNFDProperties(vcore, index, "cpu-", &node.Labels) - } + } + + for index := 0; index < int(availCpu); index++ { + // MakeCore(index int, name string) + core := fluxgraph.MakeCore(index, "core") + fluxgraph.MakeEdge(workernode, core, "contains") // workernode was socket + fluxgraph.MakeEdge(core, workernode, "in") + + // Question from Vanessa: + // How can we get here and have vcores ever not equal to zero? + if vcores == 0 { + fluxgraph.MakeNFDProperties(core, index, "cpu-", &node.Labels) + // fluxgraph.MakeNFDProperties(core, index, "netmark-", &node.Labels) + } else { + for vc := 0; vc < vcores; vc++ { + vcore := fluxgraph.MakeVCore(core, vc, "vcore") + fluxgraph.MakeNFDProperties(vcore, index, "cpu-", &node.Labels) } } + } - // MakeMemory(index int, name string, unit string, size int) - fractionmem := totalmem >> 30 - // fractionmem := (totalmem/totalcpu) >> 20 - // fmt.Println("Creating ", fractionmem, " vertices with ", 1<<10, " MB of mem") - for i := 0; i < /*int(totalcpu)*/ int(fractionmem); i++ { - mem := fluxgraph.MakeMemory(i, "memory", "MB", int(1<<10)) - fluxgraph.MakeEdge(workernode, mem, "contains") - fluxgraph.MakeEdge(mem, workernode, "in") - } + // MakeMemory(index int, name string, unit string, size int) + fractionMem := availMem >> 30 + // fractionmem := (totalmem/totalcpu) >> 20 + // fmt.Println("Creating ", fractionmem, " vertices with ", 1<<10, " MB of mem") + for i := 0; i < /*int(totalcpu)*/ int(fractionMem); i++ { + mem := fluxgraph.MakeMemory(i, "memory", "MB", int(1<<10)) + fluxgraph.MakeEdge(workernode, mem, "contains") + fluxgraph.MakeEdge(mem, workernode, "in") } } - fmt.Println("Can request at most ", totalAllocCpu, " exclusive cpu") + fmt.Printf("\nCan request at most %d exclusive cpu", totalAllocCpu) err = fluxgraph.WriteJGF(filename) if err != nil { return err From 8c99f108f80f2a6fcbf83dbab8e273e225cb6073 Mon Sep 17 00:00:00 2001 From: vsoch Date: Thu, 4 Apr 2024 19:40:15 -0600 Subject: [PATCH 24/28] test: only allow scheduling first pod Problem: we currently allow any pod in the group to make the request Solution: Making a BIG assumption that might be wrong, I am adding logic that only allows scheduling (meaning going through PreFilter with AskFlux) given that we see the first pod in the listing. In practice this is the first index (e.g., index 0) which based on our sorting strategy (timestamp then name) I think might work. But I am not 100% on that. The reason we want to do that is so the nodes are chosen for the first pod, and then the group can quickly follow and be actually assigned. Before I did this I kept seeing huge delays in waiting for the queue to move (e.g., 5/6 pods Running and the last one waiting, and then kicking in much later like an old car) and I think with this tweak that is fixed. But this is my subjective evaluation. I am also adding in the hack script for deploying to gke, which requires a push instead of a kind load. 
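The gating logic described here boils down to: list the group's pods, sort them the same way the queue sorts them (creation timestamp, then name), and only let the pod at index 0 proceed to AskFlux. Below is a minimal, dependency-free sketch of that check under those assumptions; podInfo and firstInGroup are illustrative names, not the plugin's actual types, and the real code works on corev1.Pod objects from the pod lister.

```go
package main

import (
	"fmt"
	"sort"
	"time"
)

type podInfo struct {
	Name    string
	Created time.Time
}

// firstInGroup sorts the group listing by (creation timestamp, name) and
// reports whether the named pod sits at index 0, the only pod this patch
// allows to reach AskFlux.
func firstInGroup(pods []podInfo, name string) (bool, int) {
	sort.Slice(pods, func(i, j int) bool {
		if !pods[i].Created.Equal(pods[j].Created) {
			return pods[i].Created.Before(pods[j].Created)
		}
		return pods[i].Name < pods[j].Name
	})
	for i, p := range pods {
		if p.Name == name {
			return i == 0, i
		}
	}
	return false, -1
}

func main() {
	now := time.Now()
	pods := []podInfo{
		{Name: "job-0-1", Created: now},
		{Name: "job-0-0", Created: now},
		{Name: "job-0-2", Created: now.Add(time.Second)},
	}
	first, index := firstInGroup(pods, "job-0-0")
	fmt.Printf("first=%v index=%d\n", first, index) // first=true index=0
}
```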
Signed-off-by: vsoch --- README.md | 1 - hack/quick-build-gke.sh | 33 ++++++++++++++++ hack/quick-build.sh | 2 +- .../pkg/fluence/core/core.go | 39 ++++++++++++++++--- sig-scheduler-plugins/pkg/logger/logger.go | 3 +- 5 files changed, 68 insertions(+), 10 deletions(-) create mode 100755 hack/quick-build-gke.sh diff --git a/README.md b/README.md index e3e1214..ff3327e 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,6 @@ Fluence enables HPC-grade pod scheduling in Kubernetes via the [Kubernetes Sched ## TODO -- On init, need to load in resource graph that accounts for running stuff - Need to allow for restart / crashes and looking up existing jobid, updating maps in PodGroup - Since AskFlux is done on level of pod group, refactor function to account for specific resources of all pods (not just one pod) - Figure out if EventsToRegister replaces old informer diff --git a/hack/quick-build-gke.sh b/hack/quick-build-gke.sh new file mode 100755 index 0000000..875360a --- /dev/null +++ b/hack/quick-build-gke.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +# Before running this, you should: +# 1. create the kind cluster (needs more than one node, fluence does not scheduler to the control plane) +# 2. Install cert-manager +# 3. Customize the script to point to your registry if you intend to push + +REGISTRY="${1:-ghcr.io/vsoch}" +HERE=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +ROOT=$(dirname ${HERE}) + +# Go to the script directory +cd ${ROOT} + +# These build each of the images. The sidecar is separate from the other two in src/ +make REGISTRY=${REGISTRY} SCHEDULER_IMAGE=fluence SIDECAR_IMAGE=fluence-sidecar CONTROLLER_IMAGE=fluence-controller + +# This is what it might look like to push +# docker push ghcr.io/vsoch/fluence-sidecar && docker push ghcr.io/vsoch/fluence-controller && docker push ghcr.io/vsoch/fluence:latest + +# We load into kind so we don't need to push/pull and use up internet data ;) +docker push ${REGISTRY}/fluence-sidecar:latest +docker push ${REGISTRY}/fluence-controller:latest +docker push ${REGISTRY}/fluence:latest + +# And then install using the charts. The pull policy ensures we use the loaded ones +cd ${ROOT}/upstream/manifests/install/charts +helm uninstall fluence || true +helm install \ + --set scheduler.image=${REGISTRY}/fluence:latest \ + --set controller.image=${REGISTRY}/fluence-controller:latest \ + --set scheduler.sidecarimage=${REGISTRY}/fluence-sidecar:latest \ + fluence as-a-second-scheduler/ diff --git a/hack/quick-build.sh b/hack/quick-build.sh index b3ccefe..23a5c87 100755 --- a/hack/quick-build.sh +++ b/hack/quick-build.sh @@ -33,4 +33,4 @@ helm install \ --set controller.pullPolicy=Never \ --set controller.image=${REGISTRY}/fluence-controller:latest \ --set scheduler.sidecarimage=${REGISTRY}/fluence-sidecar:latest \ - fluence as-a-second-scheduler/ \ No newline at end of file + fluence as-a-second-scheduler/ diff --git a/sig-scheduler-plugins/pkg/fluence/core/core.go b/sig-scheduler-plugins/pkg/fluence/core/core.go index 8b08468..1e75814 100644 --- a/sig-scheduler-plugins/pkg/fluence/core/core.go +++ b/sig-scheduler-plugins/pkg/fluence/core/core.go @@ -71,7 +71,7 @@ type PodGroupManager struct { scheduleTimeout *time.Duration // permittedPG stores the podgroup name which has passed the pre resource check. permittedPG *gochache.Cache - // backedOffPG stores the podgorup name which failed scheudling recently. + // backedOffPG stores the podgorup name which failed scheduling recently. 
backedOffPG *gochache.Cache // podLister is pod lister podLister listerv1.PodLister @@ -111,12 +111,25 @@ func NewPodGroupManager( } // GetStatuses string (of all pods) to show for debugging purposes -func (pgMgr *PodGroupManager) GetStatuses(pods []*corev1.Pod) string { +// Since we loop here, we also determine if the first pod is the one +// we are considering +func (pgMgr *PodGroupManager) GetStatusesAndIndex( + pods []*corev1.Pod, + pod *corev1.Pod, +) (string, bool, int) { statuses := "" - for _, pod := range pods { - statuses += " " + fmt.Sprintf("%s", pod.Status.Phase) + + // We need to distinguish 0 from the default and not finding anything + foundIndex := false + index := 0 + for i, p := range pods { + if p.Name == pod.Name { + foundIndex = true + index = i + } + statuses += " " + fmt.Sprintf("%s", p.Status.Phase) } - return statuses + return statuses, foundIndex, index } // GetPodNode is a quick lookup to see if we have a node @@ -153,8 +166,10 @@ func (pgMgr *PodGroupManager) PreFilter( return fmt.Errorf("podLister list pods failed: %w", err) } + // Only allow scheduling the first in the group so the others come after + // Get statuses to show for debugging - statuses := pgMgr.GetStatuses(pods) + statuses, found, idx := pgMgr.GetStatusesAndIndex(pods, pod) // This shows us the number of pods we have in the set and their states pgMgr.log.Info("[PodGroup PreFilter] group: %s pods: %s MinMember: %d Size: %d", pgFullName, statuses, pg.Spec.MinMember, len(pods)) @@ -163,6 +178,18 @@ func (pgMgr *PodGroupManager) PreFilter( "current pods number: %v, minMember of group: %v", pod.Name, len(pods), pg.Spec.MinMember) } + if !found { + return fmt.Errorf("pod %s was not found in group - this should not happen", pod.Name) + } + + // We only will AskFlux for the first pod + // This makes an assumption that the order listed is the order in the queue, I'm not + // sure that is true in practice. This is the one case with retry. This design + // probably needs thinking and work. + if idx != 0 { + return fmt.Errorf("pod %s is not first in the list, will wait to schedule", pod.Name) + } + // TODO we likely can take advantage of these resources or other custom // attributes we add. For now ignore and calculate based on pod needs (above) // if pg.Spec.MinResources == nil { diff --git a/sig-scheduler-plugins/pkg/logger/logger.go b/sig-scheduler-plugins/pkg/logger/logger.go index 522be61..053021a 100644 --- a/sig-scheduler-plugins/pkg/logger/logger.go +++ b/sig-scheduler-plugins/pkg/logger/logger.go @@ -19,7 +19,6 @@ const ( LevelDebug ) -// TODO try saving state here when we can close type DebugLogger struct { level int Filename string @@ -28,7 +27,7 @@ type DebugLogger struct { func NewDebugLogger(level int, filename string) *DebugLogger { return &DebugLogger{ - level: LevelNone, + level: level, Filename: filename, } } From d8e67fa6520d3b4a107e6bd36f7a2ebc34457f05 Mon Sep 17 00:00:00 2001 From: vsoch Date: Wed, 17 Apr 2024 00:54:02 -0600 Subject: [PATCH 25/28] test: adding permit to allow for sibling pod scheduling Problem: the submit of the first index works for more controlled lengths (e.g., lammps takes a while) but was having issues with really quick jobs. Solution: try restoring the queue that allows for enabling siblings pods so any group can be scheduled. 
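The restored queue behavior hinges on a simple quorum rule in Permit: count the group's pods that already have nodes and, because the pod being scheduled is not yet in the snapshot, compare assigned+1 against the group's MinMember; below the threshold the pod waits, at or above it the waiting siblings are released. A stripped-down sketch of just that decision follows; permit and Decision are illustrative names, not the plugin API.

```go
package main

import "fmt"

type Decision string

const (
	Allow Decision = "allow" // quorum reached: release the waiting siblings
	Wait  Decision = "wait"  // hold this pod until enough siblings are assigned
)

// permit mirrors the rule in the diff: the pod currently being scheduled is
// not counted in the snapshot, so assigned+1 is compared against MinMember.
func permit(assigned, minMember int) Decision {
	if assigned+1 >= minMember {
		return Allow
	}
	return Wait
}

func main() {
	fmt.Println(permit(0, 4)) // wait: the first pod of the group holds at Permit
	fmt.Println(permit(3, 4)) // allow: quorum reached, the group is released
}
```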
Signed-off-by: vsoch --- examples/pod-group-jobs/job1.yaml | 59 +++++++ examples/pod-group-jobs/job2.yaml | 59 +++++++ .../pkg/fluence/core/core.go | 161 +++++++++++++++--- sig-scheduler-plugins/pkg/fluence/fluence.go | 139 ++++++++++++++- .../pkg/fluence/group/group.go | 18 ++ sig-scheduler-plugins/pkg/logger/logger.go | 4 +- 6 files changed, 406 insertions(+), 34 deletions(-) create mode 100644 examples/pod-group-jobs/job1.yaml create mode 100644 examples/pod-group-jobs/job2.yaml diff --git a/examples/pod-group-jobs/job1.yaml b/examples/pod-group-jobs/job1.yaml new file mode 100644 index 0000000..e0ebba0 --- /dev/null +++ b/examples/pod-group-jobs/job1.yaml @@ -0,0 +1,59 @@ +apiVersion: v1 +kind: Service +metadata: + name: s0 +spec: + clusterIP: None + selector: + job-name: job-0 +--- +apiVersion: batch/v1 +kind: Job +metadata: + # name will be derived based on iteration + name: job-0 +spec: + completions: 4 + parallelism: 4 + completionMode: Indexed + template: + metadata: + labels: + app: job-0 + spec: + subdomain: s0 + schedulerName: fluence + restartPolicy: Never + containers: + - name: example-workload + image: bash:latest + resources: + limits: + cpu: "3" + requests: + cpu: "3" + command: + - bash + - -c + - | + if [ $JOB_COMPLETION_INDEX -ne "0" ] + then + sleep infinity + fi + echo "START: $(date +%s)" + for i in 0 1 2 3 + do + gotStatus="-1" + wantStatus="0" + while [ $gotStatus -ne $wantStatus ] + do + ping -c 1 job-0-${i}.s0 > /dev/null 2>&1 + gotStatus=$? + if [ $gotStatus -ne $wantStatus ]; then + echo "Failed to ping pod job-0-${i}.s0, retrying in 1 second..." + sleep 1 + fi + done + echo "Successfully pinged pod: job-0-${i}.s0" + done + echo "DONE: $(date +%s)" \ No newline at end of file diff --git a/examples/pod-group-jobs/job2.yaml b/examples/pod-group-jobs/job2.yaml new file mode 100644 index 0000000..c39820b --- /dev/null +++ b/examples/pod-group-jobs/job2.yaml @@ -0,0 +1,59 @@ +apiVersion: v1 +kind: Service +metadata: + name: s1 +spec: + clusterIP: None + selector: + job-name: job-1 +--- +apiVersion: batch/v1 +kind: Job +metadata: + # name will be derived based on iteration + name: job-1 +spec: + completions: 4 + parallelism: 4 + completionMode: Indexed + template: + metadata: + labels: + app: job-1 + spec: + subdomain: s1 + schedulerName: fluence + restartPolicy: Never + containers: + - name: example-workload + image: bash:latest + resources: + limits: + cpu: "3" + requests: + cpu: "3" + command: + - bash + - -c + - | + if [ $JOB_COMPLETION_INDEX -ne "0" ] + then + sleep infinity + fi + echo "START: $(date +%s)" + for i in 0 1 2 3 + do + gotStatus="-1" + wantStatus="0" + while [ $gotStatus -ne $wantStatus ] + do + ping -c 1 job-0-${i}.s1 > /dev/null 2>&1 + gotStatus=$? + if [ $gotStatus -ne $wantStatus ]; then + echo "Failed to ping pod job-0-${i}.s1, retrying in 1 second..." + sleep 1 + fi + done + echo "Successfully pinged pod: job-0-${i}.s1" + done + echo "DONE: $(date +%s)" \ No newline at end of file diff --git a/sig-scheduler-plugins/pkg/fluence/core/core.go b/sig-scheduler-plugins/pkg/fluence/core/core.go index 1e75814..ea300ce 100644 --- a/sig-scheduler-plugins/pkg/fluence/core/core.go +++ b/sig-scheduler-plugins/pkg/fluence/core/core.go @@ -39,11 +39,33 @@ import ( "sigs.k8s.io/scheduler-plugins/pkg/util" ) +type Status string + +const ( + // PodGroupNotSpecified denotes no PodGroup is specified in the Pod spec. 
+ PodGroupNotSpecified Status = "PodGroup not specified" + // PodGroupNotFound denotes the specified PodGroup in the Pod spec is + // not found in API server. + PodGroupNotFound Status = "PodGroup not found" + Success Status = "Success" + Wait Status = "Wait" + + permitStateKey = "PermitFluence" +) + // TODO should eventually store group name here to reassociate on reload type FluxStateData struct { NodeName string } +type PermitState struct { + Activate bool +} + +func (s *PermitState) Clone() framework.StateData { + return &PermitState{Activate: s.Activate} +} + func (s *FluxStateData) Clone() framework.StateData { clone := &FluxStateData{ NodeName: s.NodeName, @@ -58,6 +80,10 @@ type Manager interface { GetPodGroup(context.Context, *corev1.Pod) (string, *v1alpha1.PodGroup) GetCreationTimestamp(*corev1.Pod, time.Time) time.Time DeletePermittedPodGroup(string) + Permit(context.Context, *framework.CycleState, *corev1.Pod) Status + CalculateAssignedPods(string, string) int + ActivateSiblings(pod *corev1.Pod, state *framework.CycleState) + BackoffPodGroup(string, time.Duration) } // PodGroupManager defines the scheduling operation called @@ -110,26 +136,69 @@ func NewPodGroupManager( return pgMgr } +func (pgMgr *PodGroupManager) BackoffPodGroup(pgName string, backoff time.Duration) { + if backoff == time.Duration(0) { + return + } + pgMgr.backedOffPG.Add(pgName, nil, backoff) +} + +// ActivateSiblings stashes the pods belonging to the same PodGroup of the given pod +// in the given state, with a reserved key "kubernetes.io/pods-to-activate". +func (pgMgr *PodGroupManager) ActivateSiblings(pod *corev1.Pod, state *framework.CycleState) { + pgName := util.GetPodGroupLabel(pod) + if pgName == "" { + return + } + + // Only proceed if it's explicitly requested to activate sibling pods. + if c, err := state.Read(permitStateKey); err != nil { + return + } else if s, ok := c.(*PermitState); !ok || !s.Activate { + return + } + + pods, err := pgMgr.podLister.Pods(pod.Namespace).List( + labels.SelectorFromSet(labels.Set{v1alpha1.PodGroupLabel: pgName}), + ) + if err != nil { + klog.ErrorS(err, "Failed to obtain pods belong to a PodGroup", "podGroup", pgName) + return + } + + for i := range pods { + if pods[i].UID == pod.UID { + pods = append(pods[:i], pods[i+1:]...) + break + } + } + + if len(pods) != 0 { + if c, err := state.Read(framework.PodsToActivateKey); err == nil { + if s, ok := c.(*framework.PodsToActivate); ok { + s.Lock() + for _, pod := range pods { + namespacedName := GetNamespacedName(pod) + s.Map[namespacedName] = pod + } + s.Unlock() + } + } + } +} + // GetStatuses string (of all pods) to show for debugging purposes -// Since we loop here, we also determine if the first pod is the one -// we are considering -func (pgMgr *PodGroupManager) GetStatusesAndIndex( +func (pgMgr *PodGroupManager) GetStatuses( pods []*corev1.Pod, pod *corev1.Pod, -) (string, bool, int) { +) string { statuses := "" // We need to distinguish 0 from the default and not finding anything - foundIndex := false - index := 0 - for i, p := range pods { - if p.Name == pod.Name { - foundIndex = true - index = i - } + for _, p := range pods { statuses += " " + fmt.Sprintf("%s", p.Status.Phase) } - return statuses, foundIndex, index + return statuses } // GetPodNode is a quick lookup to see if we have a node @@ -138,6 +207,39 @@ func (pgMgr *PodGroupManager) GetPodNode(pod *corev1.Pod) string { return node } +// Permit permits a pod to run, if the minMember match, it would send a signal to chan. 
+func (pgMgr *PodGroupManager) Permit(ctx context.Context, state *framework.CycleState, pod *corev1.Pod) Status { + pgFullName, pg := pgMgr.GetPodGroup(ctx, pod) + if pgFullName == "" { + return PodGroupNotSpecified + } + if pg == nil { + // A Pod with a podGroup name but without a PodGroup found is denied. + return PodGroupNotFound + } + + assigned := pgMgr.CalculateAssignedPods(pg.Name, pg.Namespace) + // The number of pods that have been assigned nodes is calculated from the snapshot. + // The current pod in not included in the snapshot during the current scheduling cycle. + if int32(assigned)+1 >= pg.Spec.MinMember { + return Success + } + + if assigned == 0 { + // Given we've reached Permit(), it's mean all PreFilter checks (minMember & minResource) + // already pass through, so if assigned == 0, it could be due to: + // - minResource get satisfied + // - new pods added + // In either case, we should and only should use this 0-th pod to trigger activating + // its siblings. + // It'd be in-efficient if we trigger activating siblings unconditionally. + // See https://github.com/kubernetes-sigs/scheduler-plugins/issues/682 + state.Write(permitStateKey, &PermitState{Activate: true}) + } + + return Wait +} + // PreFilter filters out a pod if // 1. it belongs to a podgroup that was recently denied or // 2. the total number of pods in the podgroup is less than the minimum number of pods @@ -169,7 +271,7 @@ func (pgMgr *PodGroupManager) PreFilter( // Only allow scheduling the first in the group so the others come after // Get statuses to show for debugging - statuses, found, idx := pgMgr.GetStatusesAndIndex(pods, pod) + statuses := pgMgr.GetStatuses(pods, pod) // This shows us the number of pods we have in the set and their states pgMgr.log.Info("[PodGroup PreFilter] group: %s pods: %s MinMember: %d Size: %d", pgFullName, statuses, pg.Spec.MinMember, len(pods)) @@ -178,18 +280,6 @@ func (pgMgr *PodGroupManager) PreFilter( "current pods number: %v, minMember of group: %v", pod.Name, len(pods), pg.Spec.MinMember) } - if !found { - return fmt.Errorf("pod %s was not found in group - this should not happen", pod.Name) - } - - // We only will AskFlux for the first pod - // This makes an assumption that the order listed is the order in the queue, I'm not - // sure that is true in practice. This is the one case with retry. This design - // probably needs thinking and work. - if idx != 0 { - return fmt.Errorf("pod %s is not first in the list, will wait to schedule", pod.Name) - } - // TODO we likely can take advantage of these resources or other custom // attributes we add. For now ignore and calculate based on pod needs (above) // if pg.Spec.MinResources == nil { @@ -233,7 +323,9 @@ func (pgMgr *PodGroupManager) PreFilter( stateData := FluxStateData{NodeName: node} state.Write(framework.StateKey(pod.Name), &stateData) // Also save to the podToNode lookup + pgMgr.mutex.Lock() pgMgr.podToNode[pod.Name] = node + pgMgr.mutex.Unlock() } pgMgr.permittedPG.Add(pgFullName, pgFullName, *pgMgr.scheduleTimeout) return nil @@ -252,6 +344,25 @@ func (pgMgr *PodGroupManager) GetCreationTimestamp(pod *corev1.Pod, ts time.Time return pg.CreationTimestamp.Time } +// CalculateAssignedPods returns the number of pods that has been assigned nodes: assumed or bound. 
+func (pgMgr *PodGroupManager) CalculateAssignedPods(podGroupName, namespace string) int { + nodeInfos, err := pgMgr.snapshotSharedLister.NodeInfos().List() + if err != nil { + pgMgr.log.Error("Cannot get nodeInfos from frameworkHandle: %s", err) + return 0 + } + var count int + for _, nodeInfo := range nodeInfos { + for _, podInfo := range nodeInfo.Pods { + pod := podInfo.Pod + if util.GetPodGroupLabel(pod) == podGroupName && pod.Namespace == namespace && pod.Spec.NodeName != "" { + count++ + } + } + } + return count +} + // DeletePermittedPodGroup deletes a podGroup that passes Pre-Filter but reaches PostFilter. func (pgMgr *PodGroupManager) DeletePermittedPodGroup(pgFullName string) { pgMgr.permittedPG.Delete(pgFullName) diff --git a/sig-scheduler-plugins/pkg/fluence/fluence.go b/sig-scheduler-plugins/pkg/fluence/fluence.go index 84f3e95..099d2f3 100644 --- a/sig-scheduler-plugins/pkg/fluence/fluence.go +++ b/sig-scheduler-plugins/pkg/fluence/fluence.go @@ -22,8 +22,8 @@ import ( "sync" "time" + "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/util/sets" - klog "k8s.io/klog/v2" "sigs.k8s.io/scheduler-plugins/pkg/logger" @@ -33,12 +33,12 @@ import ( "k8s.io/client-go/tools/cache" fgroup "sigs.k8s.io/scheduler-plugins/pkg/fluence/group" + flabel "sigs.k8s.io/scheduler-plugins/pkg/fluence/labels" corev1helpers "k8s.io/component-helpers/scheduling/corev1" "k8s.io/kubernetes/pkg/scheduler/framework" "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/scheduler-plugins/apis/config" "sigs.k8s.io/scheduler-plugins/apis/scheduling" "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1" fcore "sigs.k8s.io/scheduler-plugins/pkg/fluence/core" @@ -52,6 +52,7 @@ type Fluence struct { frameworkHandler framework.Handle pgMgr fcore.Manager scheduleTimeout *time.Duration + pgBackoff *time.Duration log *logger.DebugLogger } @@ -59,6 +60,15 @@ var ( _ framework.QueueSortPlugin = &Fluence{} _ framework.PreFilterPlugin = &Fluence{} _ framework.FilterPlugin = &Fluence{} + + _ framework.PostFilterPlugin = &Fluence{} + _ framework.PermitPlugin = &Fluence{} + _ framework.ReservePlugin = &Fluence{} + + _ framework.EnqueueExtensions = &Fluence{} + + permitWaitingTimeSeconds int64 = 60 + podGroupBackoffSeconds int64 = 0 ) const ( @@ -69,14 +79,12 @@ const ( // Initialize and return a new Fluence Custom Scheduler Plugin func New(obj runtime.Object, handle framework.Handle) (framework.Plugin, error) { - // Keep these empty for now, use defaults - args := config.CoschedulingArgs{} ctx := context.TODO() // Make fluence his own little logger! // This can eventually be a flag, but just going to set for now // It shall be a very chonky file. Oh lawd he comin! - l := logger.NewDebugLogger(logger.LevelError, "/tmp/fluence.log") + l := logger.NewDebugLogger(logger.LevelDebug, "/tmp/fluence.log") scheme := runtime.NewScheme() _ = clientscheme.AddToScheme(scheme) @@ -93,7 +101,7 @@ func New(obj runtime.Object, handle framework.Handle) (framework.Plugin, error) fluxPodsInformer.AddIndexers(cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}) // PermitWaitingTimeSeconds is the waiting timeout in seconds. 
- scheduleTimeDuration := time.Duration(args.PermitWaitingTimeSeconds) * time.Second + scheduleTimeDuration := time.Duration(permitWaitingTimeSeconds) * time.Second pgMgr := fcore.NewPodGroupManager( client, handle.SnapshotSharedLister(), @@ -110,11 +118,13 @@ func New(obj runtime.Object, handle framework.Handle) (framework.Plugin, error) }) go fluxPodsInformer.Run(ctx.Done()) + backoffSeconds := time.Duration(podGroupBackoffSeconds) * time.Second plugin := &Fluence{ frameworkHandler: handle, pgMgr: pgMgr, scheduleTimeout: &scheduleTimeDuration, log: l, + pgBackoff: &backoffSeconds, } // TODO this is not supported yet @@ -219,16 +229,131 @@ func (f *Fluence) PreFilter( node := f.pgMgr.GetPodNode(pod) f.mutex.Unlock() if node != "" { + f.log.Info("[Fluence PreFilter] assigned pod %s to node %s\n", pod.Name, node) result := framework.PreFilterResult{NodeNames: sets.New(node)} return &result, framework.NewStatus(framework.Success, "") } + f.log.Info("[Fluence PreFilter] pod %s does not have a node assigned\n", pod.Name) + // This will populate the node name into the pod group manager err := f.pgMgr.PreFilter(ctx, pod, state) if err != nil { - f.log.Error("[Fluence PreFilter] failed pod %s: %s", klog.KObj(pod), err.Error()) + f.log.Error("[Fluence PreFilter] failed pod %s: %s", pod.Name, err.Error()) return nil, framework.NewStatus(framework.UnschedulableAndUnresolvable, err.Error()) } node = f.pgMgr.GetPodNode(pod) result := framework.PreFilterResult{NodeNames: sets.New(node)} return &result, framework.NewStatus(framework.Success, "") } + +// PostFilter is used to reject a group of pods if a pod does not pass PreFilter or Filter. +func (f *Fluence) PostFilter( + ctx context.Context, + state *framework.CycleState, + pod *corev1.Pod, + filteredNodeStatusMap framework.NodeToStatusMap, +) (*framework.PostFilterResult, *framework.Status) { + + pgName, pg := f.pgMgr.GetPodGroup(ctx, pod) + if pg == nil { + f.log.Info("Pod does not belong to any group, pod %s", pod.Name) + return &framework.PostFilterResult{}, framework.NewStatus(framework.Unschedulable, "can not find pod group") + } + + // This explicitly checks nodes, and we can skip scheduling another pod if we already + // have the minimum. For fluence since we expect an exact size this likely is not needed + assigned := f.pgMgr.CalculateAssignedPods(pg.Name, pod.Namespace) + if assigned >= int(pg.Spec.MinMember) { + f.log.Info("Assigned pods podGroup %s is assigned %s", pgName, assigned) + return &framework.PostFilterResult{}, framework.NewStatus(framework.Unschedulable) + } + + // Took out percentage chcek here, doesn't make sense to me. + + // It's based on an implicit assumption: if the nth Pod failed, + // it's inferrable other Pods belonging to the same PodGroup would be very likely to fail. 
+ f.frameworkHandler.IterateOverWaitingPods(func(waitingPod framework.WaitingPod) { + if waitingPod.GetPod().Namespace == pod.Namespace && flabel.GetPodGroupLabel(waitingPod.GetPod()) == pg.Name { + f.log.Info("PostFilter rejects the pod for podGroup %s and pod %s", pgName, waitingPod.GetPod().Name) + waitingPod.Reject(f.Name(), "optimistic rejection in PostFilter") + } + }) + + if f.pgBackoff != nil { + pods, err := f.frameworkHandler.SharedInformerFactory().Core().V1().Pods().Lister().Pods(pod.Namespace).List( + labels.SelectorFromSet(labels.Set{v1alpha1.PodGroupLabel: flabel.GetPodGroupLabel(pod)}), + ) + if err == nil && len(pods) >= int(pg.Spec.MinMember) { + f.pgMgr.BackoffPodGroup(pgName, *f.pgBackoff) + } + } + + f.pgMgr.DeletePermittedPodGroup(pgName) + return &framework.PostFilterResult{}, framework.NewStatus(framework.Unschedulable, + fmt.Sprintf("PodGroup %v gets rejected due to Pod %v is unschedulable even after PostFilter", pgName, pod.Name)) +} + +// Permit is the functions invoked by the framework at "Permit" extension point. +func (f *Fluence) Permit( + ctx context.Context, + state *framework.CycleState, + pod *corev1.Pod, + nodeName string, +) (*framework.Status, time.Duration) { + + f.log.Info("Checking permit for pod %s to node %s", pod.Name, nodeName) + waitTime := *f.scheduleTimeout + s := f.pgMgr.Permit(ctx, state, pod) + var retStatus *framework.Status + switch s { + case fcore.PodGroupNotSpecified: + f.log.Info("Checking permit for pod %s to node %s: PodGroupNotSpecified", pod.Name, nodeName) + return framework.NewStatus(framework.Success, ""), 0 + case fcore.PodGroupNotFound: + f.log.Info("Checking permit for pod %s to node %s: PodGroupNotFound", pod.Name, nodeName) + return framework.NewStatus(framework.Unschedulable, "PodGroup not found"), 0 + case fcore.Wait: + f.log.Info("Pod %s is waiting to be scheduled to node %s", pod.Name, nodeName) + _, pg := f.pgMgr.GetPodGroup(ctx, pod) + if wait := fgroup.GetWaitTimeDuration(pg, f.scheduleTimeout); wait != 0 { + waitTime = wait + } + retStatus = framework.NewStatus(framework.Wait) + + // We will also request to move the sibling pods back to activeQ. + f.pgMgr.ActivateSiblings(pod, state) + case fcore.Success: + pgFullName := flabel.GetPodGroupFullName(pod) + f.frameworkHandler.IterateOverWaitingPods(func(waitingPod framework.WaitingPod) { + if flabel.GetPodGroupFullName(waitingPod.GetPod()) == pgFullName { + f.log.Info("Permit allows pod %s", waitingPod.GetPod().Name) + waitingPod.Allow(f.Name()) + } + }) + f.log.Info("Permit allows pod %s", pod.Name) + retStatus = framework.NewStatus(framework.Success) + waitTime = 0 + } + + return retStatus, waitTime +} + +// Reserve is the functions invoked by the framework at "reserve" extension point. +func (f *Fluence) Reserve(ctx context.Context, state *framework.CycleState, pod *corev1.Pod, nodeName string) *framework.Status { + return nil +} + +// Unreserve rejects all other Pods in the PodGroup when one of the pods in the group times out. 
+func (f *Fluence) Unreserve(ctx context.Context, state *framework.CycleState, pod *corev1.Pod, nodeName string) { + pgName, pg := f.pgMgr.GetPodGroup(ctx, pod) + if pg == nil { + return + } + f.frameworkHandler.IterateOverWaitingPods(func(waitingPod framework.WaitingPod) { + if waitingPod.GetPod().Namespace == pod.Namespace && flabel.GetPodGroupLabel(waitingPod.GetPod()) == pg.Name { + f.log.Info("Unreserve rejects pod %s in group %s", waitingPod.GetPod().Name, pgName) + waitingPod.Reject(f.Name(), "rejection in Unreserve") + } + }) + f.pgMgr.DeletePermittedPodGroup(pgName) +} diff --git a/sig-scheduler-plugins/pkg/fluence/group/group.go b/sig-scheduler-plugins/pkg/fluence/group/group.go index 0ee0831..dd039e3 100644 --- a/sig-scheduler-plugins/pkg/fluence/group/group.go +++ b/sig-scheduler-plugins/pkg/fluence/group/group.go @@ -2,6 +2,7 @@ package group import ( "fmt" + "time" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -11,6 +12,9 @@ import ( sched "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1" ) +// DefaultWaitTime is 60s if ScheduleTimeoutSeconds is not specified. +const DefaultWaitTime = 60 * time.Second + // CreateFakeGroup wraps an arbitrary pod in a fake group for fluence to schedule // This happens only in PreFilter so we already sorted func CreateFakeGroup(pod *corev1.Pod) *sched.PodGroup { @@ -44,3 +48,17 @@ func GetCreationTimestamp(groupName string, pg *sched.PodGroup, podInfo *framewo klog.Errorf(" [Fluence] Pod group %s time IsZero, we should not have reached here", groupName) return metav1.NewMicroTime(*podInfo.InitialAttemptTimestamp) } + +// GetWaitTimeDuration returns a wait timeout based on the following precedences: +// 1. spec.scheduleTimeoutSeconds of the given pg, if specified +// 2. given scheduleTimeout, if not nil +// 3. fall back to DefaultWaitTime +func GetWaitTimeDuration(pg *sched.PodGroup, scheduleTimeout *time.Duration) time.Duration { + if pg != nil && pg.Spec.ScheduleTimeoutSeconds != nil { + return time.Duration(*pg.Spec.ScheduleTimeoutSeconds) * time.Second + } + if scheduleTimeout != nil && *scheduleTimeout != 0 { + return *scheduleTimeout + } + return DefaultWaitTime +} diff --git a/sig-scheduler-plugins/pkg/logger/logger.go b/sig-scheduler-plugins/pkg/logger/logger.go index 053021a..d1e238e 100644 --- a/sig-scheduler-plugins/pkg/logger/logger.go +++ b/sig-scheduler-plugins/pkg/logger/logger.go @@ -79,8 +79,8 @@ func (l *DebugLogger) log(level int, prefix string, message ...any) error { rest := message[1:] // msg := fmt.Sprintf(message...) - fmt.Printf("Compariing level %d >= %d\n", level, l.level) - if level >= l.level { + fmt.Printf("Compariing level %d <= %d\n", level, l.level) + if level <= l.level { logger.Printf(prolog, rest...) } return l.Stop() From ef0ed50b1bcfefc30285024cff1c538f66ad62e2 Mon Sep 17 00:00:00 2001 From: vsoch Date: Fri, 19 Apr 2024 23:53:08 -0600 Subject: [PATCH 26/28] go: update to 1.21 Problem: we need to update to a newer go to keep up with the sig-scheduler upstream, and also the rainbow scheduler integration. Solution: upgrade to 1.21. This also required some refactor of the main.go and fluence due to changes in function signatures. This is a test to see if tests are passing - the fluxion-go bindings used here are from a branch (not merged yet) that can be used for the PR this one is going into, and before merging that final one we should merge and release the bindings more properly. 
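The "refactor of the main.go and fluence due to changes in function signatures" mostly means the plugin factory now receives a context.Context as its first argument. Below is a dependency-free sketch of that shape under stated assumptions: Handle, Plugin, and fluence are stand-ins for the real framework types, not the upstream API, and the body only hints at what the real constructor wires up.

```go
package main

import (
	"context"
	"fmt"
)

// Stand-ins for framework.Handle and framework.Plugin; the real interfaces
// live in k8s.io/kubernetes/pkg/scheduler/framework.
type Handle interface{}
type Plugin interface{ Name() string }

type fluence struct{}

func (f *fluence) Name() string { return "Fluence" }

// Old factory shape: New(obj runtime.Object, handle framework.Handle)
// New factory shape: the context now comes first, as below.
func New(ctx context.Context, obj interface{}, handle Handle) (Plugin, error) {
	_ = ctx // the real plugin threads this into informers and the pod group manager
	_ = obj
	_ = handle
	return &fluence{}, nil
}

func main() {
	plugin, err := New(context.TODO(), nil, nil)
	if err != nil {
		panic(err)
	}
	fmt.Println(plugin.Name())
}
```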
Signed-off-by: vsoch --- sig-scheduler-plugins/cmd/controller/app/server.go | 8 +++++--- sig-scheduler-plugins/pkg/fluence/core/core.go | 1 + sig-scheduler-plugins/pkg/fluence/fluence.go | 5 +++-- src/build/scheduler/Dockerfile | 4 ++-- src/fluence/go.mod | 4 ++-- src/fluence/go.sum | 2 ++ 6 files changed, 15 insertions(+), 9 deletions(-) diff --git a/sig-scheduler-plugins/cmd/controller/app/server.go b/sig-scheduler-plugins/cmd/controller/app/server.go index d42c0f4..aae8625 100644 --- a/sig-scheduler-plugins/cmd/controller/app/server.go +++ b/sig-scheduler-plugins/cmd/controller/app/server.go @@ -27,6 +27,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/webhook" + metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" api "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1" "sigs.k8s.io/scheduler-plugins/pkg/controllers" ) @@ -50,9 +51,10 @@ func Run(s *ServerRunOptions) error { // Controller Runtime Controllers ctrl.SetLogger(klogr.New()) mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ - Scheme: scheme, - MetricsBindAddress: s.MetricsAddr, - Port: 9443, + Scheme: scheme, + Metrics: metricsserver.Options{ + BindAddress: s.MetricsAddr, + }, HealthProbeBindAddress: s.ProbeAddr, LeaderElection: s.EnableLeaderElection, LeaderElectionID: "sched-plugins-controllers", diff --git a/sig-scheduler-plugins/pkg/fluence/core/core.go b/sig-scheduler-plugins/pkg/fluence/core/core.go index ea300ce..7f1e052 100644 --- a/sig-scheduler-plugins/pkg/fluence/core/core.go +++ b/sig-scheduler-plugins/pkg/fluence/core/core.go @@ -293,6 +293,7 @@ func (pgMgr *PodGroupManager) PreFilter( // it may not necessarily pass Filter due to other constraints such as affinity/taints. _, ok := pgMgr.permittedPG.Get(pgFullName) if ok { + pgMgr.log.Info("[PodGroup PreFilter] Pod Group %s is already admitted", pgFullName) return nil } diff --git a/sig-scheduler-plugins/pkg/fluence/fluence.go b/sig-scheduler-plugins/pkg/fluence/fluence.go index 099d2f3..fe113d6 100644 --- a/sig-scheduler-plugins/pkg/fluence/fluence.go +++ b/sig-scheduler-plugins/pkg/fluence/fluence.go @@ -67,7 +67,8 @@ var ( _ framework.EnqueueExtensions = &Fluence{} - permitWaitingTimeSeconds int64 = 60 + // Set to be the same as coscheduling + permitWaitingTimeSeconds int64 = 300 podGroupBackoffSeconds int64 = 0 ) @@ -77,7 +78,7 @@ const ( ) // Initialize and return a new Fluence Custom Scheduler Plugin -func New(obj runtime.Object, handle framework.Handle) (framework.Plugin, error) { +func New(_ context.Context, obj runtime.Object, handle framework.Handle) (framework.Plugin, error) { ctx := context.TODO() diff --git a/src/build/scheduler/Dockerfile b/src/build/scheduler/Dockerfile index 67bd5ce..2a8892c 100644 --- a/src/build/scheduler/Dockerfile +++ b/src/build/scheduler/Dockerfile @@ -2,11 +2,11 @@ FROM fluxrm/flux-sched:jammy USER root ENV DEBIAN_FRONTEND=noninteractive -ENV GO_VERSION=1.19.10 +ENV GO_VERSION=1.21.9 RUN apt-get update && apt-get clean -y && apt -y autoremove -# Install go 19.10 +# Install go RUN wget https://go.dev/dl/go${GO_VERSION}.linux-amd64.tar.gz && tar -xvf go${GO_VERSION}.linux-amd64.tar.gz && \ mv go /usr/local && rm go${GO_VERSION}.linux-amd64.tar.gz diff --git a/src/fluence/go.mod b/src/fluence/go.mod index 5c57652..01fc126 100644 --- a/src/fluence/go.mod +++ b/src/fluence/go.mod @@ -1,9 +1,9 @@ module github.com/flux-framework/flux-k8s/flux-plugin/fluence -go 1.19 +go 1.21 require ( - github.com/flux-framework/fluxion-go v0.32.0 + github.com/flux-framework/fluxion-go 
v0.32.1-0.20240420052153-909523c84ca2 google.golang.org/grpc v1.38.0 google.golang.org/protobuf v1.26.0 gopkg.in/yaml.v2 v2.4.0 diff --git a/src/fluence/go.sum b/src/fluence/go.sum index 5700215..534497d 100644 --- a/src/fluence/go.sum +++ b/src/fluence/go.sum @@ -100,6 +100,8 @@ github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5Kwzbycv github.com/felixge/httpsnoop v1.0.1/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/flux-framework/fluxion-go v0.32.0 h1:NY6Y1mlTTTZhHD+CmAsDsdNTxUsAFDQoORpMZj8NFLI= github.com/flux-framework/fluxion-go v0.32.0/go.mod h1:ZI3QxSvUfgJE2Snur/SntJmVfpMjr6D4ICVmdqJ9fkQ= +github.com/flux-framework/fluxion-go v0.32.1-0.20240420052153-909523c84ca2 h1:Yz/vVX0XfB2q51ZLh2p8YI5vphvv0rZF4PqtKPscvsY= +github.com/flux-framework/fluxion-go v0.32.1-0.20240420052153-909523c84ca2/go.mod h1:jA5+kOSLxchFzixzYEvMAGjkXB5yszO/HxUwdhX/5/U= github.com/form3tech-oss/jwt-go v3.2.2+incompatible/go.mod h1:pbq4aXjuKjdthFRnoDwaVPLA+WlJuPGy+QneDUgJi2k= github.com/form3tech-oss/jwt-go v3.2.3+incompatible/go.mod h1:pbq4aXjuKjdthFRnoDwaVPLA+WlJuPGy+QneDUgJi2k= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= From 3bd9cb508bb515ba4bbadde2a9b528550c110209 Mon Sep 17 00:00:00 2001 From: vsoch Date: Thu, 2 May 2024 18:42:36 -0600 Subject: [PATCH 27/28] naming: expand short named variables Problem: a lot of the variables with pg are hard to understand Solution: write out podGroup or groupName explicitly. Signed-off-by: vsoch --- README.md | 2 +- hack/quick-build-kind.sh | 36 +++++ hack/quick-build.sh | 22 +-- .../pkg/fluence/core/core.go | 143 +++++++++--------- .../pkg/fluence/core/flux.go | 134 ++++++++-------- sig-scheduler-plugins/pkg/fluence/fluence.go | 76 +++++----- .../pkg/fluence/group/group.go | 18 +-- .../pkg/fluence/labels/labels.go | 6 +- 8 files changed, 226 insertions(+), 211 deletions(-) create mode 100755 hack/quick-build-kind.sh diff --git a/README.md b/README.md index ff3327e..515d313 100644 --- a/README.md +++ b/README.md @@ -503,7 +503,7 @@ I was having trouble developing this easily because it's a lot of steps to build The last step ensures we use the images we loaded! You can basically just do: ```bash -/bin/bash ./hack/quick-build.sh +/bin/bash ./hack/quick-build-kind.sh ``` This sped up my development time immensely. If you want to manually do the steps, see that script for instructions. diff --git a/hack/quick-build-kind.sh b/hack/quick-build-kind.sh new file mode 100755 index 0000000..23a5c87 --- /dev/null +++ b/hack/quick-build-kind.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# Before running this, you should: +# 1. create the kind cluster (needs more than one node, fluence does not scheduler to the control plane) +# 2. Install cert-manager +# 3. Customize the script to point to your registry if you intend to push + +REGISTRY="${1:-ghcr.io/vsoch}" +HERE=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +ROOT=$(dirname ${HERE}) + +# Go to the script directory +cd ${ROOT} + +# These build each of the images. 
The sidecar is separate from the other two in src/ +make REGISTRY=${REGISTRY} SCHEDULER_IMAGE=fluence SIDECAR_IMAGE=fluence-sidecar CONTROLLER_IMAGE=fluence-controller + +# This is what it might look like to push +# docker push ghcr.io/vsoch/fluence-sidecar && docker push ghcr.io/vsoch/fluence-controller && docker push ghcr.io/vsoch/fluence:latest + +# We load into kind so we don't need to push/pull and use up internet data ;) +kind load docker-image ${REGISTRY}/fluence-sidecar:latest +kind load docker-image ${REGISTRY}/fluence-controller:latest +kind load docker-image ${REGISTRY}/fluence:latest + +# And then install using the charts. The pull policy ensures we use the loaded ones +cd ${ROOT}/upstream/manifests/install/charts +helm uninstall fluence || true +helm install \ + --set scheduler.image=${REGISTRY}/fluence:latest \ + --set scheduler.sidecarPullPolicy=Never \ + --set scheduler.pullPolicy=Never \ + --set controller.pullPolicy=Never \ + --set controller.image=${REGISTRY}/fluence-controller:latest \ + --set scheduler.sidecarimage=${REGISTRY}/fluence-sidecar:latest \ + fluence as-a-second-scheduler/ diff --git a/hack/quick-build.sh b/hack/quick-build.sh index 23a5c87..c9b8eff 100755 --- a/hack/quick-build.sh +++ b/hack/quick-build.sh @@ -13,24 +13,4 @@ ROOT=$(dirname ${HERE}) cd ${ROOT} # These build each of the images. The sidecar is separate from the other two in src/ -make REGISTRY=${REGISTRY} SCHEDULER_IMAGE=fluence SIDECAR_IMAGE=fluence-sidecar CONTROLLER_IMAGE=fluence-controller - -# This is what it might look like to push -# docker push ghcr.io/vsoch/fluence-sidecar && docker push ghcr.io/vsoch/fluence-controller && docker push ghcr.io/vsoch/fluence:latest - -# We load into kind so we don't need to push/pull and use up internet data ;) -kind load docker-image ${REGISTRY}/fluence-sidecar:latest -kind load docker-image ${REGISTRY}/fluence-controller:latest -kind load docker-image ${REGISTRY}/fluence:latest - -# And then install using the charts. The pull policy ensures we use the loaded ones -cd ${ROOT}/upstream/manifests/install/charts -helm uninstall fluence || true -helm install \ - --set scheduler.image=${REGISTRY}/fluence:latest \ - --set scheduler.sidecarPullPolicy=Never \ - --set scheduler.pullPolicy=Never \ - --set controller.pullPolicy=Never \ - --set controller.image=${REGISTRY}/fluence-controller:latest \ - --set scheduler.sidecarimage=${REGISTRY}/fluence-sidecar:latest \ - fluence as-a-second-scheduler/ +make REGISTRY=${REGISTRY} SCHEDULER_IMAGE=fluence SIDECAR_IMAGE=fluence-sidecar CONTROLLER_IMAGE=fluence-controller \ No newline at end of file diff --git a/sig-scheduler-plugins/pkg/fluence/core/core.go b/sig-scheduler-plugins/pkg/fluence/core/core.go index 7f1e052..a74e749 100644 --- a/sig-scheduler-plugins/pkg/fluence/core/core.go +++ b/sig-scheduler-plugins/pkg/fluence/core/core.go @@ -95,10 +95,10 @@ type PodGroupManager struct { // scheduleTimeout is the default timeout for podgroup scheduling. // If podgroup's scheduleTimeoutSeconds is set, it will be used. scheduleTimeout *time.Duration - // permittedPG stores the podgroup name which has passed the pre resource check. - permittedPG *gochache.Cache - // backedOffPG stores the podgorup name which failed scheduling recently. - backedOffPG *gochache.Cache + // permittedpodGroup stores the podgroup name which has passed the pre resource check. + permittedpodGroup *gochache.Cache + // backedOffpodGroup stores the podgorup name which failed scheduling recently. 
+ backedOffpodGroup *gochache.Cache // podLister is pod lister podLister listerv1.PodLister @@ -122,32 +122,32 @@ func NewPodGroupManager( podInformer informerv1.PodInformer, log *logger.DebugLogger, ) *PodGroupManager { - pgMgr := &PodGroupManager{ + podGroupManager := &PodGroupManager{ client: client, snapshotSharedLister: snapshotSharedLister, scheduleTimeout: scheduleTimeout, podLister: podInformer.Lister(), - permittedPG: gochache.New(3*time.Second, 3*time.Second), - backedOffPG: gochache.New(10*time.Second, 10*time.Second), + permittedpodGroup: gochache.New(3*time.Second, 3*time.Second), + backedOffpodGroup: gochache.New(10*time.Second, 10*time.Second), groupToJobId: map[string]uint64{}, podToNode: map[string]string{}, log: log, } - return pgMgr + return podGroupManager } -func (pgMgr *PodGroupManager) BackoffPodGroup(pgName string, backoff time.Duration) { +func (podGroupManager *PodGroupManager) BackoffPodGroup(groupName string, backoff time.Duration) { if backoff == time.Duration(0) { return } - pgMgr.backedOffPG.Add(pgName, nil, backoff) + podGroupManager.backedOffpodGroup.Add(groupName, nil, backoff) } // ActivateSiblings stashes the pods belonging to the same PodGroup of the given pod // in the given state, with a reserved key "kubernetes.io/pods-to-activate". -func (pgMgr *PodGroupManager) ActivateSiblings(pod *corev1.Pod, state *framework.CycleState) { - pgName := util.GetPodGroupLabel(pod) - if pgName == "" { +func (podGroupManager *PodGroupManager) ActivateSiblings(pod *corev1.Pod, state *framework.CycleState) { + groupName := util.GetPodGroupLabel(pod) + if groupName == "" { return } @@ -158,11 +158,11 @@ func (pgMgr *PodGroupManager) ActivateSiblings(pod *corev1.Pod, state *framework return } - pods, err := pgMgr.podLister.Pods(pod.Namespace).List( - labels.SelectorFromSet(labels.Set{v1alpha1.PodGroupLabel: pgName}), + pods, err := podGroupManager.podLister.Pods(pod.Namespace).List( + labels.SelectorFromSet(labels.Set{v1alpha1.PodGroupLabel: groupName}), ) if err != nil { - klog.ErrorS(err, "Failed to obtain pods belong to a PodGroup", "podGroup", pgName) + klog.ErrorS(err, "Failed to obtain pods belong to a PodGroup", "podGroup", groupName) return } @@ -188,40 +188,39 @@ func (pgMgr *PodGroupManager) ActivateSiblings(pod *corev1.Pod, state *framework } // GetStatuses string (of all pods) to show for debugging purposes -func (pgMgr *PodGroupManager) GetStatuses( +func (podGroupManager *PodGroupManager) GetStatuses( pods []*corev1.Pod, - pod *corev1.Pod, ) string { statuses := "" // We need to distinguish 0 from the default and not finding anything - for _, p := range pods { - statuses += " " + fmt.Sprintf("%s", p.Status.Phase) + for _, pod := range pods { + statuses += " " + fmt.Sprintf("%s", pod.Status.Phase) } return statuses } // GetPodNode is a quick lookup to see if we have a node -func (pgMgr *PodGroupManager) GetPodNode(pod *corev1.Pod) string { - node, _ := pgMgr.podToNode[pod.Name] +func (podGroupManager *PodGroupManager) GetPodNode(pod *corev1.Pod) string { + node, _ := podGroupManager.podToNode[pod.Name] return node } // Permit permits a pod to run, if the minMember match, it would send a signal to chan. 
-func (pgMgr *PodGroupManager) Permit(ctx context.Context, state *framework.CycleState, pod *corev1.Pod) Status { - pgFullName, pg := pgMgr.GetPodGroup(ctx, pod) - if pgFullName == "" { +func (podGroupManager *PodGroupManager) Permit(ctx context.Context, state *framework.CycleState, pod *corev1.Pod) Status { + groupName, podGroup := podGroupManager.GetPodGroup(ctx, pod) + if groupName == "" { return PodGroupNotSpecified } - if pg == nil { + if podGroup == nil { // A Pod with a podGroup name but without a PodGroup found is denied. return PodGroupNotFound } - assigned := pgMgr.CalculateAssignedPods(pg.Name, pg.Namespace) + assigned := podGroupManager.CalculateAssignedPods(podGroup.Name, podGroup.Namespace) // The number of pods that have been assigned nodes is calculated from the snapshot. // The current pod in not included in the snapshot during the current scheduling cycle. - if int32(assigned)+1 >= pg.Spec.MinMember { + if int32(assigned)+1 >= podGroup.Spec.MinMember { return Success } @@ -244,24 +243,24 @@ func (pgMgr *PodGroupManager) Permit(ctx context.Context, state *framework.Cycle // 1. it belongs to a podgroup that was recently denied or // 2. the total number of pods in the podgroup is less than the minimum number of pods // that is required to be scheduled. -func (pgMgr *PodGroupManager) PreFilter( +func (podGroupManager *PodGroupManager) PreFilter( ctx context.Context, pod *corev1.Pod, state *framework.CycleState, ) error { - pgMgr.log.Info("[PodGroup PreFilter] pod %s", klog.KObj(pod)) - pgFullName, pg := pgMgr.GetPodGroup(ctx, pod) - if pg == nil { + podGroupManager.log.Info("[PodGroup PreFilter] pod %s", klog.KObj(pod)) + groupName, podGroup := podGroupManager.GetPodGroup(ctx, pod) + if podGroup == nil { return nil } - _, exist := pgMgr.backedOffPG.Get(pgFullName) + _, exist := podGroupManager.backedOffpodGroup.Get(groupName) if exist { - return fmt.Errorf("podGroup %v failed recently", pgFullName) + return fmt.Errorf("podGroup %v failed recently", groupName) } - pods, err := pgMgr.podLister.Pods(pod.Namespace).List( + pods, err := podGroupManager.podLister.Pods(pod.Namespace).List( labels.SelectorFromSet(labels.Set{v1alpha1.PodGroupLabel: util.GetPodGroupLabel(pod)}), ) if err != nil { @@ -271,18 +270,18 @@ func (pgMgr *PodGroupManager) PreFilter( // Only allow scheduling the first in the group so the others come after // Get statuses to show for debugging - statuses := pgMgr.GetStatuses(pods, pod) + statuses := podGroupManager.GetStatuses(pods) // This shows us the number of pods we have in the set and their states - pgMgr.log.Info("[PodGroup PreFilter] group: %s pods: %s MinMember: %d Size: %d", pgFullName, statuses, pg.Spec.MinMember, len(pods)) - if len(pods) < int(pg.Spec.MinMember) { + podGroupManager.log.Info("[PodGroup PreFilter] group: %s pods: %s MinMember: %d Size: %d", groupName, statuses, podGroup.Spec.MinMember, len(pods)) + if len(pods) < int(podGroup.Spec.MinMember) { return fmt.Errorf("pre-filter pod %v cannot find enough sibling pods, "+ - "current pods number: %v, minMember of group: %v", pod.Name, len(pods), pg.Spec.MinMember) + "current pods number: %v, minMember of group: %v", pod.Name, len(pods), podGroup.Spec.MinMember) } // TODO we likely can take advantage of these resources or other custom // attributes we add. 
For now ignore and calculate based on pod needs (above) - // if pg.Spec.MinResources == nil { + // if podGroup.Spec.MinResources == nil { // fmt.Printf("Fluence Min resources are null, skipping PreFilter") // return nil // } @@ -291,28 +290,28 @@ func (pgMgr *PodGroupManager) PreFilter( // TODO(cwdsuzhou): This resource check may not always pre-catch unschedulable pod group. // It only tries to PreFilter resource constraints so even if a PodGroup passed here, // it may not necessarily pass Filter due to other constraints such as affinity/taints. - _, ok := pgMgr.permittedPG.Get(pgFullName) + _, ok := podGroupManager.permittedpodGroup.Get(groupName) if ok { - pgMgr.log.Info("[PodGroup PreFilter] Pod Group %s is already admitted", pgFullName) + podGroupManager.log.Info("[PodGroup PreFilter] Pod Group %s is already admitted", groupName) return nil } - // TODO: right now we ask Fluxion for a podspec based on ONE pod, but + // TODO: right now we ask Fluxion for a podspec based on ONE representative pod, but // we have the whole group! We can handle different pod needs now :) repPod := pods[0] - nodes, err := pgMgr.AskFlux(ctx, *repPod, pg, pgFullName) + nodes, err := podGroupManager.AskFlux(ctx, *repPod, podGroup, groupName) if err != nil { - pgMgr.log.Info("[PodGroup PreFilter] Fluxion returned an error %s, not schedulable", err.Error()) + podGroupManager.log.Info("[PodGroup PreFilter] Fluxion returned an error %s, not schedulable", err.Error()) return err } - pgMgr.log.Info("Node Selected %s (pod group %s)", nodes, pgFullName) + podGroupManager.log.Info("Node Selected %s (pod group %s)", nodes, groupName) // Some reason fluxion gave us the wrong size? if len(nodes) != len(pods) { - pgMgr.log.Warning("[PodGroup PreFilter] group %s needs %d nodes but Fluxion returned the wrong number nodes %d.", pgFullName, len(pods), len(nodes)) - pgMgr.mutex.Lock() - pgMgr.cancelFluxJob(pgFullName, repPod) - pgMgr.mutex.Unlock() + podGroupManager.log.Warning("[PodGroup PreFilter] group %s needs %d nodes but Fluxion returned the wrong number nodes %d.", groupName, len(pods), len(nodes)) + podGroupManager.mutex.Lock() + podGroupManager.cancelFluxJob(groupName, repPod) + podGroupManager.mutex.Unlock() } // Create a fluxState (CycleState) with all nodes - this is used to retrieve @@ -324,32 +323,32 @@ func (pgMgr *PodGroupManager) PreFilter( stateData := FluxStateData{NodeName: node} state.Write(framework.StateKey(pod.Name), &stateData) // Also save to the podToNode lookup - pgMgr.mutex.Lock() - pgMgr.podToNode[pod.Name] = node - pgMgr.mutex.Unlock() + podGroupManager.mutex.Lock() + podGroupManager.podToNode[pod.Name] = node + podGroupManager.mutex.Unlock() } - pgMgr.permittedPG.Add(pgFullName, pgFullName, *pgMgr.scheduleTimeout) + podGroupManager.permittedpodGroup.Add(groupName, groupName, *podGroupManager.scheduleTimeout) return nil } // GetCreationTimestamp returns the creation time of a podGroup or a pod. 
-func (pgMgr *PodGroupManager) GetCreationTimestamp(pod *corev1.Pod, ts time.Time) time.Time { - pgName := util.GetPodGroupLabel(pod) - if len(pgName) == 0 { +func (podGroupManager *PodGroupManager) GetCreationTimestamp(pod *corev1.Pod, ts time.Time) time.Time { + groupName := util.GetPodGroupLabel(pod) + if len(groupName) == 0 { return ts } - var pg v1alpha1.PodGroup - if err := pgMgr.client.Get(context.TODO(), types.NamespacedName{Namespace: pod.Namespace, Name: pgName}, &pg); err != nil { + var podGroup v1alpha1.PodGroup + if err := podGroupManager.client.Get(context.TODO(), types.NamespacedName{Namespace: pod.Namespace, Name: groupName}, &podGroup); err != nil { return ts } - return pg.CreationTimestamp.Time + return podGroup.CreationTimestamp.Time } // CalculateAssignedPods returns the number of pods that has been assigned nodes: assumed or bound. -func (pgMgr *PodGroupManager) CalculateAssignedPods(podGroupName, namespace string) int { - nodeInfos, err := pgMgr.snapshotSharedLister.NodeInfos().List() +func (podGroupManager *PodGroupManager) CalculateAssignedPods(podGroupName, namespace string) int { + nodeInfos, err := podGroupManager.snapshotSharedLister.NodeInfos().List() if err != nil { - pgMgr.log.Error("Cannot get nodeInfos from frameworkHandle: %s", err) + podGroupManager.log.Error("Cannot get nodeInfos from frameworkHandle: %s", err) return 0 } var count int @@ -365,21 +364,21 @@ func (pgMgr *PodGroupManager) CalculateAssignedPods(podGroupName, namespace stri } // DeletePermittedPodGroup deletes a podGroup that passes Pre-Filter but reaches PostFilter. -func (pgMgr *PodGroupManager) DeletePermittedPodGroup(pgFullName string) { - pgMgr.permittedPG.Delete(pgFullName) +func (podGroupManager *PodGroupManager) DeletePermittedPodGroup(groupName string) { + podGroupManager.permittedpodGroup.Delete(groupName) } // GetPodGroup returns the PodGroup that a Pod belongs to in cache. -func (pgMgr *PodGroupManager) GetPodGroup(ctx context.Context, pod *corev1.Pod) (string, *v1alpha1.PodGroup) { - pgName := util.GetPodGroupLabel(pod) - if len(pgName) == 0 { +func (podGroupManager *PodGroupManager) GetPodGroup(ctx context.Context, pod *corev1.Pod) (string, *v1alpha1.PodGroup) { + groupName := util.GetPodGroupLabel(pod) + if len(groupName) == 0 { return "", nil } - var pg v1alpha1.PodGroup - if err := pgMgr.client.Get(ctx, types.NamespacedName{Namespace: pod.Namespace, Name: pgName}, &pg); err != nil { - return fmt.Sprintf("%v/%v", pod.Namespace, pgName), nil + var podGroup v1alpha1.PodGroup + if err := podGroupManager.client.Get(ctx, types.NamespacedName{Namespace: pod.Namespace, Name: groupName}, &podGroup); err != nil { + return fmt.Sprintf("%v/%v", pod.Namespace, groupName), nil } - return fmt.Sprintf("%v/%v", pod.Namespace, pgName), &pg + return fmt.Sprintf("%v/%v", pod.Namespace, groupName), &podGroup } // GetNamespacedName returns the namespaced name. diff --git a/sig-scheduler-plugins/pkg/fluence/core/flux.go b/sig-scheduler-plugins/pkg/fluence/core/flux.go index 48e1500..50c8ff1 100644 --- a/sig-scheduler-plugins/pkg/fluence/core/flux.go +++ b/sig-scheduler-plugins/pkg/fluence/core/flux.go @@ -17,17 +17,17 @@ import ( // AskFlux will ask flux for an allocation for nodes for the pod group. // We return the list of nodes, and assign to the entire group! 
-func (pgMgr *PodGroupManager) AskFlux( +func (podGroupManager *PodGroupManager) AskFlux( ctx context.Context, pod corev1.Pod, - pg *v1alpha1.PodGroup, + podGroup *v1alpha1.PodGroup, groupName string, ) ([]string, error) { // clean up previous match if a pod has already allocated previously - pgMgr.mutex.Lock() - _, isAllocated := pgMgr.groupToJobId[groupName] - pgMgr.mutex.Unlock() + podGroupManager.mutex.Lock() + _, isAllocated := podGroupManager.groupToJobId[groupName] + podGroupManager.mutex.Unlock() // This case happens when there is some reason that an initial job pods partially allocated, // but then the job restarted, and new pods are present but fluence had assigned nodes to @@ -37,10 +37,10 @@ func (pgMgr *PodGroupManager) AskFlux( // cancel in fluence. What we can do here is assume the previous pods are no longer running // and cancel the flux job to create again. if isAllocated { - pgMgr.log.Warning("[PodGroup AskFlux] group %s was previously allocated and is requesting again, so must have completed.", groupName) - pgMgr.mutex.Lock() - pgMgr.cancelFluxJob(groupName, &pod) - pgMgr.mutex.Unlock() + podGroupManager.log.Warning("[PodGroup AskFlux] group %s was previously allocated and is requesting again, so must have completed.", groupName) + podGroupManager.mutex.Lock() + podGroupManager.cancelFluxJob(groupName, &pod) + podGroupManager.mutex.Unlock() } nodes := []string{} @@ -48,12 +48,12 @@ func (pgMgr *PodGroupManager) AskFlux( // This obviously may not be true if we have a hetereogenous PodGroup. // We name it based on the group, since it will represent the group jobspec := utils.PreparePodJobSpec(&pod, groupName) - pgMgr.log.Info("[PodGroup AskFlux] Inspect pod info, jobspec: %s\n", jobspec) + podGroupManager.log.Info("[PodGroup AskFlux] Inspect pod info, jobspec: %s\n", jobspec) conn, err := grpc.Dial("127.0.0.1:4242", grpc.WithInsecure()) // TODO change this to just return fmt.Errorf if err != nil { - pgMgr.log.Error("[PodGroup AskFlux] Error connecting to server: %v\n", err) + podGroupManager.log.Error("[PodGroup AskFlux] Error connecting to server: %v\n", err) return nodes, err } defer conn.Close() @@ -65,18 +65,18 @@ func (pgMgr *PodGroupManager) AskFlux( request := &pb.MatchRequest{ Ps: jobspec, Request: "allocate", - Count: pg.Spec.MinMember, + Count: podGroup.Spec.MinMember, } // An error here is an error with making the request r, err := grpcclient.Match(context.Background(), request) if err != nil { - pgMgr.log.Warning("[PodGroup AskFlux] did not receive any match response: %v\n", err) + podGroupManager.log.Warning("[PodGroup AskFlux] did not receive any match response: %v\n", err) return nodes, err } // TODO GetPodID should be renamed, because it will reflect the group - pgMgr.log.Info("[PodGroup AskFlux] Match response ID %s\n", r.GetPodID()) + podGroupManager.log.Info("[PodGroup AskFlux] Match response ID %s\n", r.GetPodID()) // Get the nodelist and inspect nodelist := r.GetNodelist() @@ -84,33 +84,33 @@ func (pgMgr *PodGroupManager) AskFlux( nodes = append(nodes, node.NodeID) } jobid := uint64(r.GetJobID()) - pgMgr.log.Info("[PodGroup AskFlux] parsed node pods list %s for job id %d\n", nodes, jobid) + podGroupManager.log.Info("[PodGroup AskFlux] parsed node pods list %s for job id %d\n", nodes, jobid) // TODO would be nice to actually be able to ask flux jobs -a to fluence // That way we can verify assignments, etc. 
- pgMgr.mutex.Lock() - pgMgr.groupToJobId[groupName] = jobid - pgMgr.mutex.Unlock() + podGroupManager.mutex.Lock() + podGroupManager.groupToJobId[groupName] = jobid + podGroupManager.mutex.Unlock() return nodes, nil } // cancelFluxJobForPod cancels the flux job for a pod. // We assume that the cancelled job also means deleting the pod group -func (pgMgr *PodGroupManager) cancelFluxJob(groupName string, pod *corev1.Pod) error { +func (podGroupManager *PodGroupManager) cancelFluxJob(groupName string, pod *corev1.Pod) error { - jobid, ok := pgMgr.groupToJobId[groupName] + jobid, ok := podGroupManager.groupToJobId[groupName] // The job was already cancelled by another pod if !ok { - pgMgr.log.Info("[PodGroup cancelFluxJob] Request for cancel of group %s is already complete.", groupName) + podGroupManager.log.Info("[PodGroup cancelFluxJob] Request for cancel of group %s is already complete.", groupName) return nil } - pgMgr.log.Info("[PodGroup cancelFluxJob] Cancel flux job: %v for group %s", jobid, groupName) + podGroupManager.log.Info("[PodGroup cancelFluxJob] Cancel flux job: %v for group %s", jobid, groupName) // This first error is about connecting to the server conn, err := grpc.Dial("127.0.0.1:4242", grpc.WithInsecure()) if err != nil { - pgMgr.log.Error("[PodGroup cancelFluxJob] Error connecting to server: %v", err) + podGroupManager.log.Error("[PodGroup cancelFluxJob] Error connecting to server: %v", err) return err } defer conn.Close() @@ -123,28 +123,28 @@ func (pgMgr *PodGroupManager) cancelFluxJob(groupName string, pod *corev1.Pod) e request := &pb.CancelRequest{JobID: int64(jobid)} res, err := grpcclient.Cancel(context.Background(), request) if err != nil { - pgMgr.log.Error("[PodGroup cancelFluxJob] did not receive any cancel response: %v", err) + podGroupManager.log.Error("[PodGroup cancelFluxJob] did not receive any cancel response: %v", err) return err } - pgMgr.log.Info("[PodGroup cancelFluxJob] Job cancellation for group %s result: %d", groupName, res.Error) + podGroupManager.log.Info("[PodGroup cancelFluxJob] Job cancellation for group %s result: %d", groupName, res.Error) // And this error is if the cancel was successful or not if res.Error == 0 { - pgMgr.log.Info("[PodGroup cancelFluxJob] Successful cancel of flux job: %d for group %s", jobid, groupName) - pgMgr.cleanup(pod, groupName) + podGroupManager.log.Info("[PodGroup cancelFluxJob] Successful cancel of flux job: %d for group %s", jobid, groupName) + podGroupManager.cleanup(pod, groupName) } else { - pgMgr.log.Warning("[PodGroup cancelFluxJob] Failed to cancel flux job %d for group %s", jobid, groupName) + podGroupManager.log.Warning("[PodGroup cancelFluxJob] Failed to cancel flux job %d for group %s", jobid, groupName) } return nil } // cleanup deletes the group name from groupToJobId, and pods names from the node lookup -func (pgMgr *PodGroupManager) cleanup(pod *corev1.Pod, groupName string) { +func (podGroupManager *PodGroupManager) cleanup(pod *corev1.Pod, groupName string) { - delete(pgMgr.groupToJobId, groupName) + delete(podGroupManager.groupToJobId, groupName) // Clean up previous pod->node assignments - pods, err := pgMgr.podLister.Pods(pod.Namespace).List( + pods, err := podGroupManager.podLister.Pods(pod.Namespace).List( labels.SelectorFromSet(labels.Set{v1alpha1.PodGroupLabel: groupName}), ) // TODO need to handle this / understand why it's the case @@ -152,28 +152,28 @@ func (pgMgr *PodGroupManager) cleanup(pod *corev1.Pod, groupName string) { return } for _, pod := range pods { - 
delete(pgMgr.podToNode, pod.Name) + delete(podGroupManager.podToNode, pod.Name) } } // UpdatePod is called on an update, and the old and new object are presented -func (pgMgr *PodGroupManager) UpdatePod(oldObj, newObj interface{}) { +func (podGroupManager *PodGroupManager) UpdatePod(oldObj, newObj interface{}) { oldPod := oldObj.(*corev1.Pod) newPod := newObj.(*corev1.Pod) // a pod is updated, get the group // TODO should we be checking group / size for old vs new? - groupName, pg := pgMgr.GetPodGroup(context.TODO(), oldPod) + groupName, podGroup := podGroupManager.GetPodGroup(context.TODO(), oldPod) // If PodGroup is nil, still try to look up a faux name // TODO need to check if this might be problematic - if pg == nil { - pg = fgroup.CreateFakeGroup(oldPod) - groupName = pg.Name + if podGroup == nil { + podGroup = fgroup.CreateFakeGroup(oldPod) + groupName = podGroup.Name } - pgMgr.log.Verbose("[PodGroup UpdatePod] Processing event for pod %s in group %s from %s to %s", newPod.Name, groupName, oldPod.Status.Phase, newPod.Status.Phase) + podGroupManager.log.Verbose("[PodGroup UpdatePod] Processing event for pod %s in group %s from %s to %s", newPod.Name, groupName, oldPod.Status.Phase, newPod.Status.Phase) switch newPod.Status.Phase { case corev1.PodPending: @@ -181,34 +181,34 @@ func (pgMgr *PodGroupManager) UpdatePod(oldObj, newObj interface{}) { case corev1.PodRunning: // if a pod is start running, we can add it state to the delta graph if it is scheduled by other scheduler case corev1.PodSucceeded: - pgMgr.log.Info("[PodGroup UpdatePod] Pod %s succeeded, Fluence needs to free the resources", newPod.Name) + podGroupManager.log.Info("[PodGroup UpdatePod] Pod %s succeeded, Fluence needs to free the resources", newPod.Name) - pgMgr.mutex.Lock() - defer pgMgr.mutex.Unlock() + podGroupManager.mutex.Lock() + defer podGroupManager.mutex.Unlock() // Do we have the group id in our cache? 
If yes, we haven't deleted the jobid yet // I am worried here that if some pods are succeeded and others pending, this could // be a mistake - fluence would schedule it again - _, ok := pgMgr.groupToJobId[groupName] + _, ok := podGroupManager.groupToJobId[groupName] if ok { - pgMgr.cancelFluxJob(groupName, oldPod) + podGroupManager.cancelFluxJob(groupName, oldPod) } else { - pgMgr.log.Verbose("[PodGroup UpdatePod] Succeeded pod %s/%s in group %s doesn't have flux jobid", newPod.Namespace, newPod.Name, groupName) + podGroupManager.log.Verbose("[PodGroup UpdatePod] Succeeded pod %s/%s in group %s doesn't have flux jobid", newPod.Namespace, newPod.Name, groupName) } case corev1.PodFailed: // a corner case need to be tested, the pod exit code is not 0, can be created with segmentation fault pi test - pgMgr.log.Warning("[PodGroup UpdatePod] Pod %s in group %s failed, Fluence needs to free the resources", newPod.Name, groupName) + podGroupManager.log.Warning("[PodGroup UpdatePod] Pod %s in group %s failed, Fluence needs to free the resources", newPod.Name, groupName) - pgMgr.mutex.Lock() - defer pgMgr.mutex.Unlock() + podGroupManager.mutex.Lock() + defer podGroupManager.mutex.Unlock() - _, ok := pgMgr.groupToJobId[groupName] + _, ok := podGroupManager.groupToJobId[groupName] if ok { - pgMgr.cancelFluxJob(groupName, oldPod) + podGroupManager.cancelFluxJob(groupName, oldPod) } else { - pgMgr.log.Error("[PodGroup UpdatePod] Failed pod %s/%s in group %s doesn't have flux jobid", newPod.Namespace, newPod.Name, groupName) + podGroupManager.log.Error("[PodGroup UpdatePod] Failed pod %s/%s in group %s doesn't have flux jobid", newPod.Namespace, newPod.Name, groupName) } case corev1.PodUnknown: // don't know how to deal with it as it's unknown phase @@ -218,40 +218,40 @@ func (pgMgr *PodGroupManager) UpdatePod(oldObj, newObj interface{}) { } // DeletePod handles the delete event handler -func (pgMgr *PodGroupManager) DeletePod(podObj interface{}) { +func (podGroupManager *PodGroupManager) DeletePod(podObj interface{}) { pod := podObj.(*corev1.Pod) - groupName, pg := pgMgr.GetPodGroup(context.TODO(), pod) + groupName, podGroup := podGroupManager.GetPodGroup(context.TODO(), pod) // If PodGroup is nil, still try to look up a faux name - if pg == nil { - pg = fgroup.CreateFakeGroup(pod) - groupName = pg.Name + if podGroup == nil { + podGroup = fgroup.CreateFakeGroup(pod) + groupName = podGroup.Name } - pgMgr.log.Verbose("[PodGroup DeletePod] Delete pod %s in group %s has status %s", pod.Status.Phase, pod.Name, groupName) + podGroupManager.log.Verbose("[PodGroup DeletePod] Delete pod %s in group %s has status %s", pod.Status.Phase, pod.Name, groupName) switch pod.Status.Phase { case corev1.PodSucceeded: case corev1.PodPending: - pgMgr.log.Verbose("[PodGroup DeletePod] Pod %s completed and is Pending termination, Fluence needs to free the resources", pod.Name) + podGroupManager.log.Verbose("[PodGroup DeletePod] Pod %s completed and is Pending termination, Fluence needs to free the resources", pod.Name) - pgMgr.mutex.Lock() - defer pgMgr.mutex.Unlock() + podGroupManager.mutex.Lock() + defer podGroupManager.mutex.Unlock() - _, ok := pgMgr.groupToJobId[groupName] + _, ok := podGroupManager.groupToJobId[groupName] if ok { - pgMgr.cancelFluxJob(groupName, pod) + podGroupManager.cancelFluxJob(groupName, pod) } else { - pgMgr.log.Info("[PodGroup DeletePod] Terminating pod %s/%s in group %s doesn't have flux jobid", pod.Namespace, pod.Name, groupName) + podGroupManager.log.Info("[PodGroup DeletePod] Terminating pod 
%s/%s in group %s doesn't have flux jobid", pod.Namespace, pod.Name, groupName) } case corev1.PodRunning: - pgMgr.mutex.Lock() - defer pgMgr.mutex.Unlock() + podGroupManager.mutex.Lock() + defer podGroupManager.mutex.Unlock() - _, ok := pgMgr.groupToJobId[groupName] + _, ok := podGroupManager.groupToJobId[groupName] if ok { - pgMgr.cancelFluxJob(groupName, pod) + podGroupManager.cancelFluxJob(groupName, pod) } else { - pgMgr.log.Info("[PodGroup DeletePod] Deleted pod %s/%s in group %s doesn't have flux jobid", pod.Namespace, pod.Name, groupName) + podGroupManager.log.Info("[PodGroup DeletePod] Deleted pod %s/%s in group %s doesn't have flux jobid", pod.Namespace, pod.Name, groupName) } } } diff --git a/sig-scheduler-plugins/pkg/fluence/fluence.go b/sig-scheduler-plugins/pkg/fluence/fluence.go index fe113d6..fddd3f0 100644 --- a/sig-scheduler-plugins/pkg/fluence/fluence.go +++ b/sig-scheduler-plugins/pkg/fluence/fluence.go @@ -50,9 +50,9 @@ type Fluence struct { mutex sync.Mutex client client.Client frameworkHandler framework.Handle - pgMgr fcore.Manager + podGroupManager fcore.Manager scheduleTimeout *time.Duration - pgBackoff *time.Duration + podGroupBackoff *time.Duration log *logger.DebugLogger } @@ -103,7 +103,7 @@ func New(_ context.Context, obj runtime.Object, handle framework.Handle) (framew // PermitWaitingTimeSeconds is the waiting timeout in seconds. scheduleTimeDuration := time.Duration(permitWaitingTimeSeconds) * time.Second - pgMgr := fcore.NewPodGroupManager( + podGroupManager := fcore.NewPodGroupManager( client, handle.SnapshotSharedLister(), &scheduleTimeDuration, @@ -112,20 +112,20 @@ func New(_ context.Context, obj runtime.Object, handle framework.Handle) (framew l, ) - // Event handlers to call on pgMgr + // Event handlers to call on podGroupManager fluxPodsInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ - UpdateFunc: pgMgr.UpdatePod, - DeleteFunc: pgMgr.DeletePod, + UpdateFunc: podGroupManager.UpdatePod, + DeleteFunc: podGroupManager.DeletePod, }) go fluxPodsInformer.Run(ctx.Done()) backoffSeconds := time.Duration(podGroupBackoffSeconds) * time.Second plugin := &Fluence{ frameworkHandler: handle, - pgMgr: pgMgr, + podGroupManager: podGroupManager, scheduleTimeout: &scheduleTimeDuration, log: l, - pgBackoff: &backoffSeconds, + podGroupBackoff: &backoffSeconds, } // TODO this is not supported yet @@ -144,10 +144,10 @@ func (f *Fluence) EventsToRegister() []framework.ClusterEventWithHint { // TODO I have not redone this yet, not sure what it does (it might replace our informer above) // To register a custom event, follow the naming convention at: // https://git.k8s.io/kubernetes/pkg/scheduler/eventhandlers.go#L403-L410 - pgGVK := fmt.Sprintf("podgroups.v1alpha1.%v", scheduling.GroupName) + podGroupGVK := fmt.Sprintf("podgroups.v1alpha1.%v", scheduling.GroupName) return []framework.ClusterEventWithHint{ {Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.Add | framework.Delete}}, - {Event: framework.ClusterEvent{Resource: framework.GVK(pgGVK), ActionType: framework.Add | framework.Update | framework.Delete}}, + {Event: framework.ClusterEvent{Resource: framework.GVK(podGroupGVK), ActionType: framework.Add | framework.Update | framework.Delete}}, } } @@ -193,8 +193,8 @@ func (f *Fluence) Less(podInfo1, podInfo2 *framework.QueuedPodInfo) bool { // which is what fluence needs to distinguish between namespaces. 
Just the // name could be replicated between different namespaces ctx := context.TODO() - name1, podGroup1 := f.pgMgr.GetPodGroup(ctx, podInfo1.Pod) - name2, podGroup2 := f.pgMgr.GetPodGroup(ctx, podInfo2.Pod) + name1, podGroup1 := f.podGroupManager.GetPodGroup(ctx, podInfo1.Pod) + name2, podGroup2 := f.podGroupManager.GetPodGroup(ctx, podInfo2.Pod) // Fluence can only compare if we have two known groups. // This tries for that first, and falls back to the initial attempt timestamp @@ -227,7 +227,7 @@ func (f *Fluence) PreFilter( // Quick check if the pod is already scheduled f.mutex.Lock() - node := f.pgMgr.GetPodNode(pod) + node := f.podGroupManager.GetPodNode(pod) f.mutex.Unlock() if node != "" { f.log.Info("[Fluence PreFilter] assigned pod %s to node %s\n", pod.Name, node) @@ -237,12 +237,12 @@ func (f *Fluence) PreFilter( f.log.Info("[Fluence PreFilter] pod %s does not have a node assigned\n", pod.Name) // This will populate the node name into the pod group manager - err := f.pgMgr.PreFilter(ctx, pod, state) + err := f.podGroupManager.PreFilter(ctx, pod, state) if err != nil { f.log.Error("[Fluence PreFilter] failed pod %s: %s", pod.Name, err.Error()) return nil, framework.NewStatus(framework.UnschedulableAndUnresolvable, err.Error()) } - node = f.pgMgr.GetPodNode(pod) + node = f.podGroupManager.GetPodNode(pod) result := framework.PreFilterResult{NodeNames: sets.New(node)} return &result, framework.NewStatus(framework.Success, "") } @@ -255,17 +255,17 @@ func (f *Fluence) PostFilter( filteredNodeStatusMap framework.NodeToStatusMap, ) (*framework.PostFilterResult, *framework.Status) { - pgName, pg := f.pgMgr.GetPodGroup(ctx, pod) - if pg == nil { + groupName, podGroup := f.podGroupManager.GetPodGroup(ctx, pod) + if podGroup == nil { f.log.Info("Pod does not belong to any group, pod %s", pod.Name) return &framework.PostFilterResult{}, framework.NewStatus(framework.Unschedulable, "can not find pod group") } // This explicitly checks nodes, and we can skip scheduling another pod if we already // have the minimum. For fluence since we expect an exact size this likely is not needed - assigned := f.pgMgr.CalculateAssignedPods(pg.Name, pod.Namespace) - if assigned >= int(pg.Spec.MinMember) { - f.log.Info("Assigned pods podGroup %s is assigned %s", pgName, assigned) + assigned := f.podGroupManager.CalculateAssignedPods(podGroup.Name, pod.Namespace) + if assigned >= int(podGroup.Spec.MinMember) { + f.log.Info("Assigned pods podGroup %s is assigned %s", groupName, assigned) return &framework.PostFilterResult{}, framework.NewStatus(framework.Unschedulable) } @@ -274,24 +274,24 @@ func (f *Fluence) PostFilter( // It's based on an implicit assumption: if the nth Pod failed, // it's inferrable other Pods belonging to the same PodGroup would be very likely to fail. 
f.frameworkHandler.IterateOverWaitingPods(func(waitingPod framework.WaitingPod) { - if waitingPod.GetPod().Namespace == pod.Namespace && flabel.GetPodGroupLabel(waitingPod.GetPod()) == pg.Name { - f.log.Info("PostFilter rejects the pod for podGroup %s and pod %s", pgName, waitingPod.GetPod().Name) + if waitingPod.GetPod().Namespace == pod.Namespace && flabel.GetPodGroupLabel(waitingPod.GetPod()) == podGroup.Name { + f.log.Info("PostFilter rejects the pod for podGroup %s and pod %s", groupName, waitingPod.GetPod().Name) waitingPod.Reject(f.Name(), "optimistic rejection in PostFilter") } }) - if f.pgBackoff != nil { + if f.podGroupBackoff != nil { pods, err := f.frameworkHandler.SharedInformerFactory().Core().V1().Pods().Lister().Pods(pod.Namespace).List( labels.SelectorFromSet(labels.Set{v1alpha1.PodGroupLabel: flabel.GetPodGroupLabel(pod)}), ) - if err == nil && len(pods) >= int(pg.Spec.MinMember) { - f.pgMgr.BackoffPodGroup(pgName, *f.pgBackoff) + if err == nil && len(pods) >= int(podGroup.Spec.MinMember) { + f.podGroupManager.BackoffPodGroup(groupName, *f.podGroupBackoff) } } - f.pgMgr.DeletePermittedPodGroup(pgName) + f.podGroupManager.DeletePermittedPodGroup(groupName) return &framework.PostFilterResult{}, framework.NewStatus(framework.Unschedulable, - fmt.Sprintf("PodGroup %v gets rejected due to Pod %v is unschedulable even after PostFilter", pgName, pod.Name)) + fmt.Sprintf("PodGroup %v gets rejected due to Pod %v is unschedulable even after PostFilter", groupName, pod.Name)) } // Permit is the functions invoked by the framework at "Permit" extension point. @@ -304,7 +304,7 @@ func (f *Fluence) Permit( f.log.Info("Checking permit for pod %s to node %s", pod.Name, nodeName) waitTime := *f.scheduleTimeout - s := f.pgMgr.Permit(ctx, state, pod) + s := f.podGroupManager.Permit(ctx, state, pod) var retStatus *framework.Status switch s { case fcore.PodGroupNotSpecified: @@ -315,18 +315,18 @@ func (f *Fluence) Permit( return framework.NewStatus(framework.Unschedulable, "PodGroup not found"), 0 case fcore.Wait: f.log.Info("Pod %s is waiting to be scheduled to node %s", pod.Name, nodeName) - _, pg := f.pgMgr.GetPodGroup(ctx, pod) - if wait := fgroup.GetWaitTimeDuration(pg, f.scheduleTimeout); wait != 0 { + _, podGroup := f.podGroupManager.GetPodGroup(ctx, pod) + if wait := fgroup.GetWaitTimeDuration(podGroup, f.scheduleTimeout); wait != 0 { waitTime = wait } retStatus = framework.NewStatus(framework.Wait) // We will also request to move the sibling pods back to activeQ. - f.pgMgr.ActivateSiblings(pod, state) + f.podGroupManager.ActivateSiblings(pod, state) case fcore.Success: - pgFullName := flabel.GetPodGroupFullName(pod) + podGroupFullName := flabel.GetPodGroupFullName(pod) f.frameworkHandler.IterateOverWaitingPods(func(waitingPod framework.WaitingPod) { - if flabel.GetPodGroupFullName(waitingPod.GetPod()) == pgFullName { + if flabel.GetPodGroupFullName(waitingPod.GetPod()) == podGroupFullName { f.log.Info("Permit allows pod %s", waitingPod.GetPod().Name) waitingPod.Allow(f.Name()) } @@ -346,15 +346,15 @@ func (f *Fluence) Reserve(ctx context.Context, state *framework.CycleState, pod // Unreserve rejects all other Pods in the PodGroup when one of the pods in the group times out. 
func (f *Fluence) Unreserve(ctx context.Context, state *framework.CycleState, pod *corev1.Pod, nodeName string) { - pgName, pg := f.pgMgr.GetPodGroup(ctx, pod) - if pg == nil { + groupName, podGroup := f.podGroupManager.GetPodGroup(ctx, pod) + if podGroup == nil { return } f.frameworkHandler.IterateOverWaitingPods(func(waitingPod framework.WaitingPod) { - if waitingPod.GetPod().Namespace == pod.Namespace && flabel.GetPodGroupLabel(waitingPod.GetPod()) == pg.Name { - f.log.Info("Unreserve rejects pod %s in group %s", waitingPod.GetPod().Name, pgName) + if waitingPod.GetPod().Namespace == pod.Namespace && flabel.GetPodGroupLabel(waitingPod.GetPod()) == podGroup.Name { + f.log.Info("Unreserve rejects pod %s in group %s", waitingPod.GetPod().Name, groupName) waitingPod.Reject(f.Name(), "rejection in Unreserve") } }) - f.pgMgr.DeletePermittedPodGroup(pgName) + f.podGroupManager.DeletePermittedPodGroup(groupName) } diff --git a/sig-scheduler-plugins/pkg/fluence/group/group.go b/sig-scheduler-plugins/pkg/fluence/group/group.go index dd039e3..2c3a3c1 100644 --- a/sig-scheduler-plugins/pkg/fluence/group/group.go +++ b/sig-scheduler-plugins/pkg/fluence/group/group.go @@ -31,18 +31,18 @@ func CreateFakeGroup(pod *corev1.Pod) *sched.PodGroup { // GetCreationTimestamp first tries the fluence group, then falls back to the initial attempt timestamp // This is the only update we have made to the upstream PodGroupManager, because we are expecting // a MicroTime and not a time.Time. -func GetCreationTimestamp(groupName string, pg *sched.PodGroup, podInfo *framework.QueuedPodInfo) metav1.MicroTime { +func GetCreationTimestamp(groupName string, podGroup *sched.PodGroup, podInfo *framework.QueuedPodInfo) metav1.MicroTime { // Don't try to get a time for a pod group that does not exist - if pg == nil { + if podGroup == nil { return metav1.NewMicroTime(*podInfo.InitialAttemptTimestamp) } // IsZero is an indicator if this was actually set // If the group label was present and we have a group, this will be true - if !pg.Status.ScheduleStartTime.IsZero() { - klog.Infof(" [Fluence] Pod group %s was created at %s\n", groupName, pg.Status.ScheduleStartTime) - return pg.Status.ScheduleStartTime + if !podGroup.Status.ScheduleStartTime.IsZero() { + klog.Infof(" [Fluence] Pod group %s was created at %s\n", groupName, podGroup.Status.ScheduleStartTime) + return podGroup.Status.ScheduleStartTime } // We should actually never get here. klog.Errorf(" [Fluence] Pod group %s time IsZero, we should not have reached here", groupName) @@ -50,12 +50,12 @@ func GetCreationTimestamp(groupName string, pg *sched.PodGroup, podInfo *framewo } // GetWaitTimeDuration returns a wait timeout based on the following precedences: -// 1. spec.scheduleTimeoutSeconds of the given pg, if specified +// 1. spec.scheduleTimeoutSeconds of the given podGroup, if specified // 2. given scheduleTimeout, if not nil // 3. 
fall back to DefaultWaitTime -func GetWaitTimeDuration(pg *sched.PodGroup, scheduleTimeout *time.Duration) time.Duration { - if pg != nil && pg.Spec.ScheduleTimeoutSeconds != nil { - return time.Duration(*pg.Spec.ScheduleTimeoutSeconds) * time.Second +func GetWaitTimeDuration(podGroup *sched.PodGroup, scheduleTimeout *time.Duration) time.Duration { + if podGroup != nil && podGroup.Spec.ScheduleTimeoutSeconds != nil { + return time.Duration(*podGroup.Spec.ScheduleTimeoutSeconds) * time.Second } if scheduleTimeout != nil && *scheduleTimeout != 0 { return *scheduleTimeout diff --git a/sig-scheduler-plugins/pkg/fluence/labels/labels.go b/sig-scheduler-plugins/pkg/fluence/labels/labels.go index f955d67..eb96c72 100644 --- a/sig-scheduler-plugins/pkg/fluence/labels/labels.go +++ b/sig-scheduler-plugins/pkg/fluence/labels/labels.go @@ -46,11 +46,11 @@ func GetPodGroupLabel(pod *v1.Pod) string { // GetPodGroupFullName get namespaced group name from pod labels func GetPodGroupFullName(pod *v1.Pod) string { - pgName := GetPodGroupLabel(pod) - if len(pgName) == 0 { + groupName := GetPodGroupLabel(pod) + if len(groupName) == 0 { return "" } - return fmt.Sprintf("%v/%v", pod.Namespace, pgName) + return fmt.Sprintf("%v/%v", pod.Namespace, groupName) } // GetPodGroupSize gets the pod group size from the label From cbeffceb04502a22396da984f620e8f9cd9ff99a Mon Sep 17 00:00:00 2001 From: vsoch Date: Tue, 14 May 2024 00:27:30 -0600 Subject: [PATCH 28/28] fix: response to review comments This set of changes includes the following: 1. Renaming short variable names to be longer and more understandable. 2. Not using the Status.ScheduleStartTime for the pod start time, but instead adding a new field. This previous field was there for a different purpose. 3. Creating named identifiers for resource types that can be shared in the jgf module along with others that use the same relations / vertex types. 4. Removing comments that are not necessary. 5. Changing JGF types that warrant it from int to int64. 6. Fixing spelling mistakes, etc. 7. Removing the need to write the jobspec to a temporary file (we just need the string). The JGF and utils modules need some additional review - specifically, I am worried that the paths->containment is not set, and sometimes the name reflects the index of the overall graph (global) and other times the index of the resource type. I think we likely want the latter for the inner name, but I am not sure in practice that fluxion is using it (internally). I am pushing these changes to assess testing, etc., and will update the PR as needed. There could also have been changes to upstream since the PR was opened that warrant additional fixes. 
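(Editorial aside on item 2, not part of the patch itself: the point of a microsecond-granularity creation stamp is that pod groups created within the same second still need a deterministic queue order. Below is a minimal Go sketch of that comparison, assuming only the metav1.MicroTime type from apimachinery; the helper name earlierGroup and its arguments are hypothetical and only illustrate the ordering idea.)

```go
package sketch

import (
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// earlierGroup reports whether group a should be ordered before group b.
// MicroTime keeps groups created within the same second in a stable order;
// the name comparison breaks exact timestamp ties deterministically.
func earlierGroup(aName, bName string, aCreated, bCreated metav1.MicroTime) bool {
	if aCreated.Equal(&bCreated) {
		return aName < bName
	}
	return aCreated.Before(&bCreated)
}
```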
Signed-off-by: vsoch --- README.md | 4 +- .../scheduling/v1alpha1/podgroup_webhook.go | 49 ++++--- .../apis/scheduling/v1alpha1/types.go | 12 +- .../cmd/controller/app/server.go | 4 - .../templates/webhook-service.yaml | 2 +- .../pkg/controllers/podgroup_controller.go | 10 +- .../pkg/fluence/core/core.go | 30 +++-- .../pkg/fluence/core/flux.go | 34 ++--- sig-scheduler-plugins/pkg/fluence/fluence.go | 103 ++++++++------- sig-scheduler-plugins/pkg/fluence/register.go | 10 +- .../pkg/fluence/utils/utils.go | 22 ++-- src/Makefile | 4 +- src/fluence/cmd/main.go | 8 +- src/fluence/fluxion/fluxion.go | 50 ++----- src/fluence/jgf/jgf.go | 123 +++++++++--------- src/fluence/jgf/types.go | 10 +- src/fluence/jobspec/jobspec.go | 37 +----- src/fluence/utils/utils.go | 95 +++++--------- 18 files changed, 269 insertions(+), 338 deletions(-) diff --git a/README.md b/README.md index 515d313..300eb1d 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ The way it works: 1. We have a mutating admission webhook that looks for jobs and pods, and ensures there are fluence labels (likely we will add more abstractions). 2. A PodGroup reconciler is watching for these same objects. When they are created: a. We find the labels and create the pod group object. - b. The pod group object has a timestamp for creation in milliseconds. + b. The pod group object has a timestamp for creation in microseconds. 3. When the pod is then given to fluence for scheduling, it already has the PodGroup created with name/size and can properly sort. Here is an example of a Job intended for Fluence: @@ -452,7 +452,7 @@ If you are looking to develop: - [src](src): includes source code for fluence. You'll find logs for this code in the `sidecar` container of the fluence pod. - [sig-scheduler-plugins](sig-scheduler-plugins): includes assets (manifests and Go files) that are intended to be added to the kubernetes-sigs/scheduler-plugins upstream repository before build. You'll find logs for this container in the `scheduler-plugins-scheduler` container of the pod. - - [apis](sig-scheduler-plugins/apis): customized PodGroup to define the status scheduled time in micro seconds + - [apis](sig-scheduler-plugins/apis): customized PodGroup to define the status scheduled time in microseconds - [manifests](sig-scheduler-plugins/manifests): manifests for helm and Kubernetes - [pkg](sig-scheduler-plugins/pkg): the main fluence module to add to upstream - [cmd](sig-scheduler-plugins/cmd): the main.go to replace in upstream diff --git a/sig-scheduler-plugins/apis/scheduling/v1alpha1/podgroup_webhook.go b/sig-scheduler-plugins/apis/scheduling/v1alpha1/podgroup_webhook.go index c2582f9..7266d85 100644 --- a/sig-scheduler-plugins/apis/scheduling/v1alpha1/podgroup_webhook.go +++ b/sig-scheduler-plugins/apis/scheduling/v1alpha1/podgroup_webhook.go @@ -1,5 +1,5 @@ /* -Copyright 2023 Lawrence Livermore National Security, LLC +Copyright 2024 Lawrence Livermore National Security, LLC (c.f. AUTHORS, NOTICE.LLNS, COPYING) SPDX-License-Identifier: MIT @@ -50,14 +50,14 @@ type fluenceWatcher struct { // Handle is the main handler for the webhook, which is looking for jobs and pods (in that order) // If a job comes in (with a pod template) first, we add the labels there first (and they will // not be added again). 
-func (a *fluenceWatcher) Handle(ctx context.Context, req admission.Request) admission.Response { +func (hook *fluenceWatcher) Handle(ctx context.Context, req admission.Request) admission.Response { logger.Info("Running webhook handle, determining pod wrapper abstraction...") job := &batchv1.Job{} - err := a.decoder.Decode(req, job) + err := hook.decoder.Decode(req, job) if err == nil { - err = a.EnsureGroupOnJob(job) + err = hook.EnsureGroupOnJob(job) if err != nil { logger.Error(err, "Issue adding PodGroup to Job") return admission.Errored(http.StatusBadRequest, err) @@ -72,9 +72,9 @@ func (a *fluenceWatcher) Handle(ctx context.Context, req admission.Request) admi } pod := &corev1.Pod{} - err = a.decoder.Decode(req, pod) + err = hook.decoder.Decode(req, pod) if err == nil { - err = a.EnsureGroup(pod) + err = hook.EnsureGroup(pod) if err != nil { logger.Error(err, "Issue adding PodGroup to Pod") return admission.Errored(http.StatusBadRequest, err) @@ -89,9 +89,9 @@ func (a *fluenceWatcher) Handle(ctx context.Context, req admission.Request) admi } set := &appsv1.StatefulSet{} - err = a.decoder.Decode(req, set) + err = hook.decoder.Decode(req, set) if err == nil { - err = a.EnsureGroupStatefulSet(set) + err = hook.EnsureGroupStatefulSet(set) if err != nil { logger.Error(err, "Issue adding PodGroup to StatefulSet") return admission.Errored(http.StatusBadRequest, err) @@ -105,15 +105,15 @@ func (a *fluenceWatcher) Handle(ctx context.Context, req admission.Request) admi return admission.PatchResponseFromRaw(req.Object.Raw, marshalledSet) } - d := &appsv1.Deployment{} - err = a.decoder.Decode(req, d) + deployment := &appsv1.Deployment{} + err = hook.decoder.Decode(req, deployment) if err == nil { - err = a.EnsureGroupDeployment(d) + err = hook.EnsureGroupDeployment(deployment) if err != nil { logger.Error(err, "Issue adding PodGroup to Deployment") return admission.Errored(http.StatusBadRequest, err) } - marshalledD, err := json.Marshal(d) + marshalledD, err := json.Marshal(deployment) if err != nil { logger.Error(err, "Marshalling Deployment error") return admission.Errored(http.StatusInternalServerError, err) @@ -123,9 +123,9 @@ func (a *fluenceWatcher) Handle(ctx context.Context, req admission.Request) admi } rset := &appsv1.ReplicaSet{} - err = a.decoder.Decode(req, rset) + err = hook.decoder.Decode(req, rset) if err == nil { - err = a.EnsureGroupReplicaSet(rset) + err = hook.EnsureGroupReplicaSet(rset) if err != nil { logger.Error(err, "Issue adding PodGroup to ReplicaSet") return admission.Errored(http.StatusBadRequest, err) @@ -145,29 +145,28 @@ func (a *fluenceWatcher) Handle(ctx context.Context, req admission.Request) admi } // Default is the expected entrypoint for a webhook... -// I don't remember if this is even called... 
-func (a *fluenceWatcher) Default(ctx context.Context, obj runtime.Object) error { +func (hook *fluenceWatcher) Default(ctx context.Context, obj runtime.Object) error { switch obj.(type) { case *batchv1.Job: job := obj.(*batchv1.Job) - return a.EnsureGroupOnJob(job) + return hook.EnsureGroupOnJob(job) case *corev1.Pod: pod := obj.(*corev1.Pod) - return a.EnsureGroup(pod) + return hook.EnsureGroup(pod) case *appsv1.StatefulSet: set := obj.(*appsv1.StatefulSet) - return a.EnsureGroupStatefulSet(set) + return hook.EnsureGroupStatefulSet(set) case *appsv1.Deployment: - d := obj.(*appsv1.Deployment) - return a.EnsureGroupDeployment(d) + deployment := obj.(*appsv1.Deployment) + return hook.EnsureGroupDeployment(deployment) case *appsv1.ReplicaSet: set := obj.(*appsv1.ReplicaSet) - return a.EnsureGroupReplicaSet(set) + return hook.EnsureGroupReplicaSet(set) default: // no match @@ -180,7 +179,7 @@ func (a *fluenceWatcher) Default(ctx context.Context, obj runtime.Object) error // Note that we need to do similar for Job. // A pod without a job wrapper, and without metadata is a group // of size 1. -func (a *fluenceWatcher) EnsureGroup(pod *corev1.Pod) error { +func (hook *fluenceWatcher) EnsureGroup(pod *corev1.Pod) error { // Add labels if we don't have anything. Everything is a group! if pod.Labels == nil { @@ -221,7 +220,7 @@ func getJobLabel(job *batchv1.Job, labelName, defaultLabel string) string { // EnsureGroupOnJob looks for fluence labels (size and name) on both the job // and the pod template. We ultimately put on the pod, the lowest level unit. -// Since we have the size of the job (paramllism) we can use that for the size +// Since we have the size of the job (parallelism) we can use that for the size func (a *fluenceWatcher) EnsureGroupOnJob(job *batchv1.Job) error { // Be forgiving - allow the person to specify it on the job directly or on the Podtemplate @@ -252,7 +251,7 @@ func (a *fluenceWatcher) EnsureGroupOnJob(job *batchv1.Job) error { } // EnsureGroupStatefulSet creates a PodGroup for a StatefulSet -func (a *fluenceWatcher) EnsureGroupStatefulSet(set *appsv1.StatefulSet) error { +func (hook *fluenceWatcher) EnsureGroupStatefulSet(set *appsv1.StatefulSet) error { // StatefulSet requires on top level explicitly if set.Labels == nil { diff --git a/sig-scheduler-plugins/apis/scheduling/v1alpha1/types.go b/sig-scheduler-plugins/apis/scheduling/v1alpha1/types.go index 77f10f3..fca7854 100644 --- a/sig-scheduler-plugins/apis/scheduling/v1alpha1/types.go +++ b/sig-scheduler-plugins/apis/scheduling/v1alpha1/types.go @@ -136,12 +136,12 @@ type PodGroup struct { type PodGroupSpec struct { // MinMember defines the minimal number of members/tasks to run the pod group; // if there's not enough resources to start all tasks, the scheduler - // will not start anyone. + // will not start any. MinMember int32 `json:"minMember,omitempty"` // MinResources defines the minimal resource of members/tasks to run the pod group; // if there's not enough resources to start all tasks, the scheduler - // will not start anyone. + // will not start any. MinResources v1.ResourceList `json:"minResources,omitempty"` // ScheduleTimeoutSeconds defines the maximal time of members/tasks to wait before run the pod group; @@ -169,7 +169,13 @@ type PodGroupStatus struct { // +optional Failed int32 `json:"failed,omitempty"` - // ScheduleStartTime of the group (note that we changed this to a micro time) + // CreationTime is intended to mock the object CreationTime, + // but set by us to be MicroTime instead of Time. 
+ // +optional + CreationTime metav1.MicroTime `json:"creationTime,omitempty"` + + // ScheduleStartTime of the group is when we want to start counting + // "at time N plus 48 hours, this is when we deem time waited is too long" // +optional ScheduleStartTime metav1.MicroTime `json:"scheduleStartTime,omitempty"` } diff --git a/sig-scheduler-plugins/cmd/controller/app/server.go b/sig-scheduler-plugins/cmd/controller/app/server.go index aae8625..c10968e 100644 --- a/sig-scheduler-plugins/cmd/controller/app/server.go +++ b/sig-scheduler-plugins/cmd/controller/app/server.go @@ -65,10 +65,6 @@ func Run(s *ServerRunOptions) error { return err } - // Create a channel for the mutating webhook to communicate back to the reconciler - // This way we create the PodGroup before scheduling - //c := make(chan event.GenericEvent) - if err = (&controllers.PodGroupReconciler{ Client: mgr.GetClient(), Scheme: mgr.GetScheme(), diff --git a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/webhook-service.yaml b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/webhook-service.yaml index bedfb95..e5339a1 100644 --- a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/webhook-service.yaml +++ b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/webhook-service.yaml @@ -12,4 +12,4 @@ spec: selector: app: scheduler-plugins-controller ports: - {{- .Values.webhookService.ports | toYaml | nindent 2 -}} + {{- .Values.webhookService.ports | toYaml | nindent 2 -}} diff --git a/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go b/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go index a2fd4a6..7afb815 100644 --- a/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go +++ b/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go @@ -83,6 +83,7 @@ func (r *PodGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c log.Info("REFERENCES", "Reconciler", pg.ObjectMeta.OwnerReferences) // Grab all statuses (and groups of them) we are interested in + // Note that 48 hours seems arbitrary, and if it is, we might make it a variable schedulingOrPending := (pg.Status.Phase == schedv1alpha1.PodGroupScheduling || pg.Status.Phase == schedv1alpha1.PodGroupPending) twoDaysOld := pg.Status.ScheduleStartTime.Sub(pg.CreationTimestamp.Time) > 48*time.Hour finishedOrFailed := pg.Status.Phase == schedv1alpha1.PodGroupFinished || pg.Status.Phase == schedv1alpha1.PodGroupFailed @@ -111,8 +112,11 @@ func (r *PodGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c return ctrl.Result{}, err } - // If the scheduler time created is Zero (not set) we set it here - if pg.Status.ScheduleStartTime.IsZero() { + // If the pod group creation time created is Zero (not set) we set it here + // This only happens on the first reconcile, which should also be when the + // pod group is created. We set it here and don't use the underlying object + // CreationTime because we need to change the granularity to ms. 
+ if pg.Status.CreationTime.IsZero() { return r.setTimeCreated(ctx, pg, podList.Items, timestamp) } @@ -159,7 +163,7 @@ func (r *PodGroupReconciler) setTimeCreated( // Now patch to update it patch := client.MergeFrom(pg.DeepCopy()) - pg.Status.ScheduleStartTime = timestamp + pg.Status.CreationTime = timestamp // Apply the patch to update the size r.Status().Update(ctx, pg) diff --git a/sig-scheduler-plugins/pkg/fluence/core/core.go b/sig-scheduler-plugins/pkg/fluence/core/core.go index a74e749..9de5a26 100644 --- a/sig-scheduler-plugins/pkg/fluence/core/core.go +++ b/sig-scheduler-plugins/pkg/fluence/core/core.go @@ -78,7 +78,7 @@ type Manager interface { PreFilter(context.Context, *corev1.Pod, *framework.CycleState) error GetPodNode(*corev1.Pod) string GetPodGroup(context.Context, *corev1.Pod) (string, *v1alpha1.PodGroup) - GetCreationTimestamp(*corev1.Pod, time.Time) time.Time + GetCreationTimestamp(*corev1.Pod, time.Time) metav1.MicroTime DeletePermittedPodGroup(string) Permit(context.Context, *framework.CycleState, *corev1.Pod) Status CalculateAssignedPods(string, string) int @@ -255,8 +255,8 @@ func (podGroupManager *PodGroupManager) PreFilter( return nil } - _, exist := podGroupManager.backedOffpodGroup.Get(groupName) - if exist { + _, exists := podGroupManager.backedOffpodGroup.Get(groupName) + if exists { return fmt.Errorf("podGroup %v failed recently", groupName) } @@ -290,8 +290,8 @@ func (podGroupManager *PodGroupManager) PreFilter( // TODO(cwdsuzhou): This resource check may not always pre-catch unschedulable pod group. // It only tries to PreFilter resource constraints so even if a PodGroup passed here, // it may not necessarily pass Filter due to other constraints such as affinity/taints. - _, ok := podGroupManager.permittedpodGroup.Get(groupName) - if ok { + _, exists = podGroupManager.permittedpodGroup.Get(groupName) + if exists { podGroupManager.log.Info("[PodGroup PreFilter] Pod Group %s is already admitted", groupName) return nil } @@ -331,17 +331,27 @@ func (podGroupManager *PodGroupManager) PreFilter( return nil } -// GetCreationTimestamp returns the creation time of a podGroup or a pod. -func (podGroupManager *PodGroupManager) GetCreationTimestamp(pod *corev1.Pod, ts time.Time) time.Time { +// GetCreationTimestamp returns the creation time of a podGroup or a pod in microseconds (metav1.MicroTime) +// The Status.CreationTime is set by the PodGroup reconciler, which has to happen before we have +// a PodGroup. I don't see cases when this wouldn't happen, but in case we fall back to +// converting the pg.CreationTime to a MicroTime +func (podGroupManager *PodGroupManager) GetCreationTimestamp(pod *corev1.Pod, ts time.Time) metav1.MicroTime { groupName := util.GetPodGroupLabel(pod) if len(groupName) == 0 { - return ts + return metav1.NewMicroTime(ts) } var podGroup v1alpha1.PodGroup if err := podGroupManager.client.Get(context.TODO(), types.NamespacedName{Namespace: pod.Namespace, Name: groupName}, &podGroup); err != nil { - return ts + return metav1.NewMicroTime(ts) } - return podGroup.CreationTimestamp.Time + // First preference goes to microseconds. This should be set, as it is set by the first + // reconcile, and we wouldn't have a pod group if it didn't pass through that. 
+ if !podGroup.Status.CreationTime.IsZero() { + return podGroup.Status.CreationTime + } + // Fall back to CreationTime from Kubernetes, in seconds + // In practice this should not happen + return metav1.NewMicroTime(podGroup.CreationTimestamp.Time) } // CalculateAssignedPods returns the number of pods that has been assigned nodes: assumed or bound. diff --git a/sig-scheduler-plugins/pkg/fluence/core/flux.go b/sig-scheduler-plugins/pkg/fluence/core/flux.go index 50c8ff1..24c9212 100644 --- a/sig-scheduler-plugins/pkg/fluence/core/flux.go +++ b/sig-scheduler-plugins/pkg/fluence/core/flux.go @@ -69,21 +69,21 @@ func (podGroupManager *PodGroupManager) AskFlux( } // An error here is an error with making the request - r, err := grpcclient.Match(context.Background(), request) + response, err := grpcclient.Match(context.Background(), request) if err != nil { podGroupManager.log.Warning("[PodGroup AskFlux] did not receive any match response: %v\n", err) return nodes, err } // TODO GetPodID should be renamed, because it will reflect the group - podGroupManager.log.Info("[PodGroup AskFlux] Match response ID %s\n", r.GetPodID()) + podGroupManager.log.Info("[PodGroup AskFlux] Match response ID %s\n", response.GetPodID()) // Get the nodelist and inspect - nodelist := r.GetNodelist() + nodelist := response.GetNodelist() for _, node := range nodelist { nodes = append(nodes, node.NodeID) } - jobid := uint64(r.GetJobID()) + jobid := uint64(response.GetJobID()) podGroupManager.log.Info("[PodGroup AskFlux] parsed node pods list %s for job id %d\n", nodes, jobid) // TODO would be nice to actually be able to ask flux jobs -a to fluence @@ -98,10 +98,10 @@ func (podGroupManager *PodGroupManager) AskFlux( // We assume that the cancelled job also means deleting the pod group func (podGroupManager *PodGroupManager) cancelFluxJob(groupName string, pod *corev1.Pod) error { - jobid, ok := podGroupManager.groupToJobId[groupName] + jobid, exists := podGroupManager.groupToJobId[groupName] // The job was already cancelled by another pod - if !ok { + if !exists { podGroupManager.log.Info("[PodGroup cancelFluxJob] Request for cancel of group %s is already complete.", groupName) return nil } @@ -121,15 +121,15 @@ func (podGroupManager *PodGroupManager) cancelFluxJob(groupName string, pod *cor // This error reflects the success or failure of the cancel request request := &pb.CancelRequest{JobID: int64(jobid)} - res, err := grpcclient.Cancel(context.Background(), request) + response, err := grpcclient.Cancel(context.Background(), request) if err != nil { podGroupManager.log.Error("[PodGroup cancelFluxJob] did not receive any cancel response: %v", err) return err } - podGroupManager.log.Info("[PodGroup cancelFluxJob] Job cancellation for group %s result: %d", groupName, res.Error) + podGroupManager.log.Info("[PodGroup cancelFluxJob] Job cancellation for group %s result: %d", groupName, response.Error) // And this error is if the cancel was successful or not - if res.Error == 0 { + if response.Error == 0 { podGroupManager.log.Info("[PodGroup cancelFluxJob] Successful cancel of flux job: %d for group %s", jobid, groupName) podGroupManager.cleanup(pod, groupName) } else { @@ -189,8 +189,8 @@ func (podGroupManager *PodGroupManager) UpdatePod(oldObj, newObj interface{}) { // Do we have the group id in our cache? 
If yes, we haven't deleted the jobid yet // I am worried here that if some pods are succeeded and others pending, this could // be a mistake - fluence would schedule it again - _, ok := podGroupManager.groupToJobId[groupName] - if ok { + _, exists := podGroupManager.groupToJobId[groupName] + if exists { podGroupManager.cancelFluxJob(groupName, oldPod) } else { podGroupManager.log.Verbose("[PodGroup UpdatePod] Succeeded pod %s/%s in group %s doesn't have flux jobid", newPod.Namespace, newPod.Name, groupName) @@ -204,8 +204,8 @@ func (podGroupManager *PodGroupManager) UpdatePod(oldObj, newObj interface{}) { podGroupManager.mutex.Lock() defer podGroupManager.mutex.Unlock() - _, ok := podGroupManager.groupToJobId[groupName] - if ok { + _, exists := podGroupManager.groupToJobId[groupName] + if exists { podGroupManager.cancelFluxJob(groupName, oldPod) } else { podGroupManager.log.Error("[PodGroup UpdatePod] Failed pod %s/%s in group %s doesn't have flux jobid", newPod.Namespace, newPod.Name, groupName) @@ -237,8 +237,8 @@ func (podGroupManager *PodGroupManager) DeletePod(podObj interface{}) { podGroupManager.mutex.Lock() defer podGroupManager.mutex.Unlock() - _, ok := podGroupManager.groupToJobId[groupName] - if ok { + _, exists := podGroupManager.groupToJobId[groupName] + if exists { podGroupManager.cancelFluxJob(groupName, pod) } else { podGroupManager.log.Info("[PodGroup DeletePod] Terminating pod %s/%s in group %s doesn't have flux jobid", pod.Namespace, pod.Name, groupName) @@ -247,8 +247,8 @@ func (podGroupManager *PodGroupManager) DeletePod(podObj interface{}) { podGroupManager.mutex.Lock() defer podGroupManager.mutex.Unlock() - _, ok := podGroupManager.groupToJobId[groupName] - if ok { + _, exists := podGroupManager.groupToJobId[groupName] + if exists { podGroupManager.cancelFluxJob(groupName, pod) } else { podGroupManager.log.Info("[PodGroup DeletePod] Deleted pod %s/%s in group %s doesn't have flux jobid", pod.Namespace, pod.Name, groupName) diff --git a/sig-scheduler-plugins/pkg/fluence/fluence.go b/sig-scheduler-plugins/pkg/fluence/fluence.go index fddd3f0..44f0349 100644 --- a/sig-scheduler-plugins/pkg/fluence/fluence.go +++ b/sig-scheduler-plugins/pkg/fluence/fluence.go @@ -134,14 +134,13 @@ func New(_ context.Context, obj runtime.Object, handle framework.Handle) (framew return plugin, err } -func (f *Fluence) Name() string { +func (fluence *Fluence) Name() string { return Name } // Fluence has added delete, although I wonder if update includes that signal // and it's redundant? 
-func (f *Fluence) EventsToRegister() []framework.ClusterEventWithHint { - // TODO I have not redone this yet, not sure what it does (it might replace our informer above) +func (fluence *Fluence) EventsToRegister() []framework.ClusterEventWithHint { // To register a custom event, follow the naming convention at: // https://git.k8s.io/kubernetes/pkg/scheduler/eventhandlers.go#L403-L410 podGroupGVK := fmt.Sprintf("podgroups.v1alpha1.%v", scheduling.GroupName) @@ -152,14 +151,14 @@ func (f *Fluence) EventsToRegister() []framework.ClusterEventWithHint { } // TODO we need to account for affinity here -func (f *Fluence) Filter( +func (fluence *Fluence) Filter( ctx context.Context, cycleState *framework.CycleState, pod *corev1.Pod, nodeInfo *framework.NodeInfo, ) *framework.Status { - f.log.Verbose("[Fluence Filter] Filtering input node %s", nodeInfo.Node().Name) + fluence.log.Verbose("[Fluence Filter] Filtering input node %s", nodeInfo.Node().Name) state, err := cycleState.Read(framework.StateKey(pod.Name)) // No error means we retrieved the state @@ -172,7 +171,7 @@ func (f *Fluence) Filter( if ok && value.NodeName != nodeInfo.Node().Name { return framework.NewStatus(framework.Unschedulable, "pod is not permitted") } else { - f.log.Info("[Fluence Filter] node %s selected for %s\n", value.NodeName, pod.Name) + fluence.log.Info("[Fluence Filter] node %s selected for %s\n", value.NodeName, pod.Name) } } return framework.NewStatus(framework.Success) @@ -182,7 +181,7 @@ func (f *Fluence) Filter( // 1. Compare the priorities of Pods. // 2. Compare the initialization timestamps of PodGroups or Pods. // 3. Compare the keys of PodGroups/Pods: /. -func (f *Fluence) Less(podInfo1, podInfo2 *framework.QueuedPodInfo) bool { +func (fluence *Fluence) Less(podInfo1, podInfo2 *framework.QueuedPodInfo) bool { prio1 := corev1helpers.PodPriority(podInfo1.Pod) prio2 := corev1helpers.PodPriority(podInfo2.Pod) if prio1 != prio2 { @@ -193,8 +192,8 @@ func (f *Fluence) Less(podInfo1, podInfo2 *framework.QueuedPodInfo) bool { // which is what fluence needs to distinguish between namespaces. Just the // name could be replicated between different namespaces ctx := context.TODO() - name1, podGroup1 := f.podGroupManager.GetPodGroup(ctx, podInfo1.Pod) - name2, podGroup2 := f.podGroupManager.GetPodGroup(ctx, podInfo2.Pod) + name1, podGroup1 := fluence.podGroupManager.GetPodGroup(ctx, podInfo1.Pod) + name2, podGroup2 := fluence.podGroupManager.GetPodGroup(ctx, podInfo2.Pod) // Fluence can only compare if we have two known groups. // This tries for that first, and falls back to the initial attempt timestamp @@ -212,60 +211,60 @@ func (f *Fluence) Less(podInfo1, podInfo2 *framework.QueuedPodInfo) bool { // PreFilterExtensions allow for callbacks on filtered states // This is required to be defined for a PreFilter plugin // https://github.com/kubernetes/kubernetes/blob/master/pkg/scheduler/framework/interface.go#L383 -func (f *Fluence) PreFilterExtensions() framework.PreFilterExtensions { +func (fluence *Fluence) PreFilterExtensions() framework.PreFilterExtensions { return nil } // PreFilter performs the following validations. // 1. Whether the PodGroup that the Pod belongs to is on the deny list. // 2. Whether the total number of pods in a PodGroup is less than its `minMember`. 
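The ordering that Less documents earlier in this file's diff (priority first, then the PodGroup or Pod creation time, then the namespace/name key) reduces to a three step comparator. A toy version for reference; queuedPod and less are illustrative stand-ins, not the plugin's types:

```go
package main

import (
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// queuedPod is a stand-in for framework.QueuedPodInfo, holding only the three
// fields the documented ordering needs.
type queuedPod struct {
	priority int32
	created  metav1.MicroTime
	key      string // "<namespace>/<name>" of the pod group (or pod)
}

// less mirrors the documented rule: higher priority first, then the earlier
// creation time, then the lexically smaller key as the final tie breaker.
func less(a, b queuedPod) bool {
	if a.priority != b.priority {
		return a.priority > b.priority
	}
	if !a.created.Equal(&b.created) {
		return a.created.Before(&b.created)
	}
	return a.key < b.key
}

func main() {
	t0 := metav1.NowMicro()
	fmt.Println(less(
		queuedPod{priority: 10, created: t0, key: "default/group-a"},
		queuedPod{priority: 5, created: t0, key: "default/group-b"},
	)) // true: priority wins before timestamps are even consulted
}
```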
-func (f *Fluence) PreFilter( +func (fluence *Fluence) PreFilter( ctx context.Context, state *framework.CycleState, pod *corev1.Pod, ) (*framework.PreFilterResult, *framework.Status) { // Quick check if the pod is already scheduled - f.mutex.Lock() - node := f.podGroupManager.GetPodNode(pod) - f.mutex.Unlock() + fluence.mutex.Lock() + node := fluence.podGroupManager.GetPodNode(pod) + fluence.mutex.Unlock() if node != "" { - f.log.Info("[Fluence PreFilter] assigned pod %s to node %s\n", pod.Name, node) + fluence.log.Info("[Fluence PreFilter] assigned pod %s to node %s\n", pod.Name, node) result := framework.PreFilterResult{NodeNames: sets.New(node)} return &result, framework.NewStatus(framework.Success, "") } - f.log.Info("[Fluence PreFilter] pod %s does not have a node assigned\n", pod.Name) + fluence.log.Info("[Fluence PreFilter] pod %s does not have a node assigned\n", pod.Name) // This will populate the node name into the pod group manager - err := f.podGroupManager.PreFilter(ctx, pod, state) + err := fluence.podGroupManager.PreFilter(ctx, pod, state) if err != nil { - f.log.Error("[Fluence PreFilter] failed pod %s: %s", pod.Name, err.Error()) + fluence.log.Error("[Fluence PreFilter] failed pod %s: %s", pod.Name, err.Error()) return nil, framework.NewStatus(framework.UnschedulableAndUnresolvable, err.Error()) } - node = f.podGroupManager.GetPodNode(pod) + node = fluence.podGroupManager.GetPodNode(pod) result := framework.PreFilterResult{NodeNames: sets.New(node)} return &result, framework.NewStatus(framework.Success, "") } // PostFilter is used to reject a group of pods if a pod does not pass PreFilter or Filter. -func (f *Fluence) PostFilter( +func (fluence *Fluence) PostFilter( ctx context.Context, state *framework.CycleState, pod *corev1.Pod, filteredNodeStatusMap framework.NodeToStatusMap, ) (*framework.PostFilterResult, *framework.Status) { - groupName, podGroup := f.podGroupManager.GetPodGroup(ctx, pod) + groupName, podGroup := fluence.podGroupManager.GetPodGroup(ctx, pod) if podGroup == nil { - f.log.Info("Pod does not belong to any group, pod %s", pod.Name) + fluence.log.Info("Pod does not belong to any group, pod %s", pod.Name) return &framework.PostFilterResult{}, framework.NewStatus(framework.Unschedulable, "can not find pod group") } // This explicitly checks nodes, and we can skip scheduling another pod if we already // have the minimum. For fluence since we expect an exact size this likely is not needed - assigned := f.podGroupManager.CalculateAssignedPods(podGroup.Name, pod.Namespace) + assigned := fluence.podGroupManager.CalculateAssignedPods(podGroup.Name, pod.Namespace) if assigned >= int(podGroup.Spec.MinMember) { - f.log.Info("Assigned pods podGroup %s is assigned %s", groupName, assigned) + fluence.log.Info("Assigned pods podGroup %s is assigned %s", groupName, assigned) return &framework.PostFilterResult{}, framework.NewStatus(framework.Unschedulable) } @@ -273,65 +272,65 @@ func (f *Fluence) PostFilter( // It's based on an implicit assumption: if the nth Pod failed, // it's inferrable other Pods belonging to the same PodGroup would be very likely to fail. 
- f.frameworkHandler.IterateOverWaitingPods(func(waitingPod framework.WaitingPod) { + fluence.frameworkHandler.IterateOverWaitingPods(func(waitingPod framework.WaitingPod) { if waitingPod.GetPod().Namespace == pod.Namespace && flabel.GetPodGroupLabel(waitingPod.GetPod()) == podGroup.Name { - f.log.Info("PostFilter rejects the pod for podGroup %s and pod %s", groupName, waitingPod.GetPod().Name) - waitingPod.Reject(f.Name(), "optimistic rejection in PostFilter") + fluence.log.Info("PostFilter rejects the pod for podGroup %s and pod %s", groupName, waitingPod.GetPod().Name) + waitingPod.Reject(fluence.Name(), "optimistic rejection in PostFilter") } }) - if f.podGroupBackoff != nil { - pods, err := f.frameworkHandler.SharedInformerFactory().Core().V1().Pods().Lister().Pods(pod.Namespace).List( + if fluence.podGroupBackoff != nil { + pods, err := fluence.frameworkHandler.SharedInformerFactory().Core().V1().Pods().Lister().Pods(pod.Namespace).List( labels.SelectorFromSet(labels.Set{v1alpha1.PodGroupLabel: flabel.GetPodGroupLabel(pod)}), ) if err == nil && len(pods) >= int(podGroup.Spec.MinMember) { - f.podGroupManager.BackoffPodGroup(groupName, *f.podGroupBackoff) + fluence.podGroupManager.BackoffPodGroup(groupName, *fluence.podGroupBackoff) } } - f.podGroupManager.DeletePermittedPodGroup(groupName) + fluence.podGroupManager.DeletePermittedPodGroup(groupName) return &framework.PostFilterResult{}, framework.NewStatus(framework.Unschedulable, fmt.Sprintf("PodGroup %v gets rejected due to Pod %v is unschedulable even after PostFilter", groupName, pod.Name)) } // Permit is the functions invoked by the framework at "Permit" extension point. -func (f *Fluence) Permit( +func (fluence *Fluence) Permit( ctx context.Context, state *framework.CycleState, pod *corev1.Pod, nodeName string, ) (*framework.Status, time.Duration) { - f.log.Info("Checking permit for pod %s to node %s", pod.Name, nodeName) - waitTime := *f.scheduleTimeout - s := f.podGroupManager.Permit(ctx, state, pod) + fluence.log.Info("Checking permit for pod %s to node %s", pod.Name, nodeName) + waitTime := *fluence.scheduleTimeout + s := fluence.podGroupManager.Permit(ctx, state, pod) var retStatus *framework.Status switch s { case fcore.PodGroupNotSpecified: - f.log.Info("Checking permit for pod %s to node %s: PodGroupNotSpecified", pod.Name, nodeName) + fluence.log.Info("Checking permit for pod %s to node %s: PodGroupNotSpecified", pod.Name, nodeName) return framework.NewStatus(framework.Success, ""), 0 case fcore.PodGroupNotFound: - f.log.Info("Checking permit for pod %s to node %s: PodGroupNotFound", pod.Name, nodeName) + fluence.log.Info("Checking permit for pod %s to node %s: PodGroupNotFound", pod.Name, nodeName) return framework.NewStatus(framework.Unschedulable, "PodGroup not found"), 0 case fcore.Wait: - f.log.Info("Pod %s is waiting to be scheduled to node %s", pod.Name, nodeName) - _, podGroup := f.podGroupManager.GetPodGroup(ctx, pod) - if wait := fgroup.GetWaitTimeDuration(podGroup, f.scheduleTimeout); wait != 0 { + fluence.log.Info("Pod %s is waiting to be scheduled to node %s", pod.Name, nodeName) + _, podGroup := fluence.podGroupManager.GetPodGroup(ctx, pod) + if wait := fgroup.GetWaitTimeDuration(podGroup, fluence.scheduleTimeout); wait != 0 { waitTime = wait } retStatus = framework.NewStatus(framework.Wait) // We will also request to move the sibling pods back to activeQ. 
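The Wait branch above is the gang scheduling handshake: early pods in a group park at Permit with a timeout, and the pod that completes the group releases its waiting siblings (the fcore.Success branch below). A toy model of that flow, not the framework API; the group name is a placeholder:

```go
package main

import "fmt"

// waitingRoom is a toy stand-in for the framework's registry of waiting pods.
type waitingRoom struct {
	waiting map[string][]string // group name -> pods parked at Permit
}

// permit models the flow: a pod whose group is not yet complete is parked
// (the real plugin returns framework.Wait with a timeout); the pod that
// completes the group releases every parked sibling (Allow).
func (r *waitingRoom) permit(pod, group string, groupComplete bool) []string {
	if !groupComplete {
		r.waiting[group] = append(r.waiting[group], pod)
		return nil
	}
	released := append(r.waiting[group], pod)
	delete(r.waiting, group)
	return released
}

func main() {
	room := &waitingRoom{waiting: map[string][]string{}}
	room.permit("pod-0", "my-group", false)
	room.permit("pod-1", "my-group", false)
	fmt.Println(room.permit("pod-2", "my-group", true)) // [pod-0 pod-1 pod-2]
}
```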
- f.podGroupManager.ActivateSiblings(pod, state) + fluence.podGroupManager.ActivateSiblings(pod, state) case fcore.Success: podGroupFullName := flabel.GetPodGroupFullName(pod) - f.frameworkHandler.IterateOverWaitingPods(func(waitingPod framework.WaitingPod) { + fluence.frameworkHandler.IterateOverWaitingPods(func(waitingPod framework.WaitingPod) { if flabel.GetPodGroupFullName(waitingPod.GetPod()) == podGroupFullName { - f.log.Info("Permit allows pod %s", waitingPod.GetPod().Name) - waitingPod.Allow(f.Name()) + fluence.log.Info("Permit allows pod %s", waitingPod.GetPod().Name) + waitingPod.Allow(fluence.Name()) } }) - f.log.Info("Permit allows pod %s", pod.Name) + fluence.log.Info("Permit allows pod %s", pod.Name) retStatus = framework.NewStatus(framework.Success) waitTime = 0 } @@ -340,21 +339,21 @@ func (f *Fluence) Permit( } // Reserve is the functions invoked by the framework at "reserve" extension point. -func (f *Fluence) Reserve(ctx context.Context, state *framework.CycleState, pod *corev1.Pod, nodeName string) *framework.Status { +func (fluence *Fluence) Reserve(ctx context.Context, state *framework.CycleState, pod *corev1.Pod, nodeName string) *framework.Status { return nil } // Unreserve rejects all other Pods in the PodGroup when one of the pods in the group times out. -func (f *Fluence) Unreserve(ctx context.Context, state *framework.CycleState, pod *corev1.Pod, nodeName string) { - groupName, podGroup := f.podGroupManager.GetPodGroup(ctx, pod) +func (fluence *Fluence) Unreserve(ctx context.Context, state *framework.CycleState, pod *corev1.Pod, nodeName string) { + groupName, podGroup := fluence.podGroupManager.GetPodGroup(ctx, pod) if podGroup == nil { return } - f.frameworkHandler.IterateOverWaitingPods(func(waitingPod framework.WaitingPod) { + fluence.frameworkHandler.IterateOverWaitingPods(func(waitingPod framework.WaitingPod) { if waitingPod.GetPod().Namespace == pod.Namespace && flabel.GetPodGroupLabel(waitingPod.GetPod()) == podGroup.Name { - f.log.Info("Unreserve rejects pod %s in group %s", waitingPod.GetPod().Name, groupName) - waitingPod.Reject(f.Name(), "rejection in Unreserve") + fluence.log.Info("Unreserve rejects pod %s in group %s", waitingPod.GetPod().Name, groupName) + waitingPod.Reject(fluence.Name(), "rejection in Unreserve") } }) - f.podGroupManager.DeletePermittedPodGroup(groupName) + fluence.podGroupManager.DeletePermittedPodGroup(groupName) } diff --git a/sig-scheduler-plugins/pkg/fluence/register.go b/sig-scheduler-plugins/pkg/fluence/register.go index 8f39f09..1505633 100644 --- a/sig-scheduler-plugins/pkg/fluence/register.go +++ b/sig-scheduler-plugins/pkg/fluence/register.go @@ -29,27 +29,27 @@ import ( // here goes away we cannot remove it from being known. But it's better than // not having it, and having fluxion assume more resources than the // cluster has available. 
This is a TODO as fluxion does not support it -func (f *Fluence) RegisterExisting(ctx context.Context) error { +func (fluence *Fluence) RegisterExisting(ctx context.Context) error { // creates an in-cluster config and client config, err := rest.InClusterConfig() if err != nil { - f.log.Error("[Fluence RegisterExisting] Error creating in-cluster config: %s\n", err) + fluence.log.Error("[Fluence RegisterExisting] Error creating in-cluster config: %s\n", err) return err } // creates the clientset clientset, err := kubernetes.NewForConfig(config) if err != nil { - f.log.Error("[Fluence RegisterExisting] Error creating client for config: %s\n", err) + fluence.log.Error("[Fluence RegisterExisting] Error creating client for config: %s\n", err) return err } // get pods in all the namespaces by omitting namespace // Or specify namespace to get pods in particular namespace pods, err := clientset.CoreV1().Pods("").List(ctx, metav1.ListOptions{}) if err != nil { - f.log.Info("[Fluence RegisterExisting] Error listing pods: %s\n", err) + fluence.log.Info("[Fluence RegisterExisting] Error listing pods: %s\n", err) return err } - f.log.Info("[Fluence RegisterExisting] Found %d existing pods in the cluster\n", len(pods.Items)) + fluence.log.Info("[Fluence RegisterExisting] Found %d existing pods in the cluster\n", len(pods.Items)) return nil } diff --git a/sig-scheduler-plugins/pkg/fluence/utils/utils.go b/sig-scheduler-plugins/pkg/fluence/utils/utils.go index f24f6d4..da9053b 100644 --- a/sig-scheduler-plugins/pkg/fluence/utils/utils.go +++ b/sig-scheduler-plugins/pkg/fluence/utils/utils.go @@ -44,16 +44,16 @@ func getPodJobspecLabels(pod *v1.Pod) []string { // jobspec based on the group and not the individual ID. // This calculates across containers in the od func PreparePodJobSpec(pod *v1.Pod, groupName string) *pb.PodSpec { - ps := new(pb.PodSpec) - ps.Id = groupName + podSpec := new(pb.PodSpec) + podSpec.Id = groupName - // Note from vsoch - there was an if check here to see if we had labels, + // There was an if check here to see if we had labels, // I don't think there is risk to adding an empty list but we can add // the check back if there is - ps.Labels = getPodJobspecLabels(pod) + podSpec.Labels = getPodJobspecLabels(pod) // the jobname should be the group name - ps.Container = groupName + podSpec.Container = groupName // Create accumulated requests for cpu and limits // CPU and memory are summed across containers @@ -87,12 +87,12 @@ func PreparePodJobSpec(pod *v1.Pod, groupName string) *pb.PodSpec { if cpus == 0 { cpus = 1 } - ps.Cpu = cpus - ps.Gpu = gpus - ps.Memory = memory - ps.Storage = storage + podSpec.Cpu = cpus + podSpec.Gpu = gpus + podSpec.Memory = memory + podSpec.Storage = storage // I removed specRequests.Cpu().MilliValue() but we can add back some derivative if desired - klog.Infof("[Jobspec] Pod spec: CPU %v, memory %v, GPU %v, storage %v", ps.Cpu, ps.Memory, ps.Gpu, ps.Storage) - return ps + klog.Infof("[Jobspec] Pod spec: CPU %v, memory %v, GPU %v, storage %v", podSpec.Cpu, podSpec.Memory, podSpec.Gpu, podSpec.Storage) + return podSpec } diff --git a/src/Makefile b/src/Makefile index af5fcb3..e31c8ec 100644 --- a/src/Makefile +++ b/src/Makefile @@ -4,8 +4,8 @@ INSTALL_PREFIX ?= /usr LIB_PREFIX ?= /usr/lib LOCALBIN ?= $(shell pwd)/bin COMMONENVVAR=GOOS=$(shell uname -s | tr A-Z a-z) -#BUILDENVVAR=CGO_CFLAGS="-I${FLUX_SCHED_ROOT}/resource/reapi/bindings/c" CGO_LDFLAGS="-L${INSTALL_PREFIX}/lib -L${FLUX_SCHED_ROOT}/resource -lresource -L${FLUX_SCHED_ROOT}/resource/libjobspec 
-ljobspec_conv -L/${FLUX_SCHED_ROOT}/resource/reapi/bindings -lreapi_cli -lflux-idset -lstdc++ -lczmq -ljansson -lhwloc -lboost_system -lflux-hostlist -lboost_graph -lyaml-cpp" -BUILDENVVAR=CGO_CFLAGS="-I${FLUX_SCHED_ROOT} -I${FLUX_SCHED_ROOT}/resource/reapi/bindings/c" CGO_LDFLAGS="-L${LIB_PREFIX} -L${LIB_PREFIX}/flux -L${FLUX_SCHED_ROOT}/resource/reapi/bindings -lreapi_cli -lflux-idset -lstdc++ -lczmq -ljansson -lhwloc -lboost_system -lflux-hostlist -lboost_graph -lyaml-cpp" +#BUILDENVVAR=CGO_CFLAGS="-I${FLUX_SCHED_ROOT}/resource/reapi/bindings/c" CGO_LDFLAGS="-L${INSTALL_PREFIX}/lib -L${FLUX_SCHED_ROOT}/resource -lresource -L${FLUX_SCHED_ROOT}/resource/libjobspec -ljobspec_conv -L/${FLUX_SCHED_ROOT}/resource/reapi/bindings -lreapi_cli -lflux-idset -lstdc++ -ljansson -lhwloc -lboost_system -lflux-hostlist -lboost_graph -lyaml-cpp" +BUILDENVVAR=CGO_CFLAGS="-I${FLUX_SCHED_ROOT} -I${FLUX_SCHED_ROOT}/resource/reapi/bindings/c" CGO_LDFLAGS="-L${LIB_PREFIX} -L${LIB_PREFIX}/flux -L${FLUX_SCHED_ROOT}/resource/reapi/bindings -lreapi_cli -lflux-idset -lstdc++ -ljansson -lhwloc -lboost_system -lflux-hostlist -lboost_graph -lyaml-cpp" LOCAL_REGISTRY=localhost:5000 diff --git a/src/fluence/cmd/main.go b/src/fluence/cmd/main.go index 3fb6a06..e8ef87d 100644 --- a/src/fluence/cmd/main.go +++ b/src/fluence/cmd/main.go @@ -48,12 +48,12 @@ func main() { } responsechan = make(chan string) - s := grpc.NewServer( + server := grpc.NewServer( grpc.KeepaliveParams(keepalive.ServerParameters{ MaxConnectionIdle: 5 * time.Minute, }), ) - pb.RegisterFluxcliServiceServer(s, &flux) + pb.RegisterFluxcliServiceServer(server, &flux) // External plugin (Kubectl) GRPC // This will eventually be an external GRPC module that can @@ -64,11 +64,11 @@ func main() { if *enableServicePlugin { plugin := service.ExternalService{} plugin.Init() - svcPb.RegisterExternalPluginServiceServer(s, &plugin) + svcPb.RegisterExternalPluginServiceServer(server, &plugin) } fmt.Printf("[GRPCServer] gRPC Listening on %s\n", lis.Addr().String()) - if err := s.Serve(lis); err != nil { + if err := server.Serve(lis); err != nil { fmt.Printf("[GRPCServer] failed to serve: %v\n", err) } diff --git a/src/fluence/fluxion/fluxion.go b/src/fluence/fluxion/fluxion.go index 05e94fa..f288cdf 100644 --- a/src/fluence/fluxion/fluxion.go +++ b/src/fluence/fluxion/fluxion.go @@ -20,10 +20,10 @@ type Fluxion struct { } // InitFluxion creates a new client to interaction with the fluxion API (via go bindings) -func (f *Fluxion) InitFluxion(policy *string, label *string) { - f.cli = fluxcli.NewReapiClient() +func (fluxion *Fluxion) InitFluxion(policy *string, label *string) { + fluxion.cli = fluxcli.NewReapiClient() - klog.Infof("[Fluence] Created flux resource client %s", f.cli) + klog.Infof("[Fluence] Created flux resource client %s", fluxion.cli) err := utils.CreateJGF(defaults.KubernetesJsonGraphFormat, label) if err != nil { return @@ -40,26 +40,25 @@ func (f *Fluxion) InitFluxion(policy *string, label *string) { p = string("{\"matcher_policy\": \"" + *policy + "\"}") klog.Infof("[Fluence] match policy: %s", p) } - - f.cli.InitContext(string(jgf), p) + fluxion.cli.InitContext(string(jgf), p) } // Cancel wraps the Cancel function of the fluxion go bindings -func (s *Fluxion) Cancel(ctx context.Context, in *pb.CancelRequest) (*pb.CancelResponse, error) { +func (fluxion *Fluxion) Cancel(ctx context.Context, in *pb.CancelRequest) (*pb.CancelResponse, error) { klog.Infof("[Fluence] received cancel request %v\n", in) - err := s.cli.Cancel(int64(in.JobID), true) + err 
:= fluxion.cli.Cancel(int64(in.JobID), true) if err != nil { - return nil, errors.New("Error in Cancel") + return nil, err } // Why would we have an error code here if we check above? // This (I think) should be an error code for the specific job dr := &pb.CancelResponse{JobID: in.JobID} klog.Infof("[Fluence] sending cancel response %v\n", dr) - klog.Infof("[Fluence] cancel errors so far: %s\n", s.cli.GetErrMsg()) + klog.Infof("[Fluence] cancel errors so far: %s\n", fluxion.cli.GetErrMsg()) - reserved, at, overhead, mode, fluxerr := s.cli.Info(int64(in.JobID)) + reserved, at, overhead, mode, fluxerr := fluxion.cli.Info(int64(in.JobID)) klog.Infof("\n\t----Job Info output---") klog.Infof("jobid: %d\nreserved: %t\nat: %d\noverhead: %f\nmode: %s\nerror: %d\n", in.JobID, reserved, at, overhead, mode, fluxerr) @@ -67,48 +66,27 @@ func (s *Fluxion) Cancel(ctx context.Context, in *pb.CancelRequest) (*pb.CancelR return dr, nil } -// generateJobSpec generates a jobspec for a match request and returns the string -func (s *Fluxion) generateJobspec(in *pb.MatchRequest) ([]byte, error) { - - spec := []byte{} - - // Create a temporary file to write and read the jobspec - // The first parameter here as the empty string creates in /tmp - file, err := os.CreateTemp("", "jobspec.*.yaml") - if err != nil { - return spec, err - } - defer os.Remove(file.Name()) - jobspec.CreateJobSpecYaml(in.Ps, in.Count, file.Name()) - - spec, err = os.ReadFile(file.Name()) - if err != nil { - return spec, errors.New("Error reading jobspec") - } - return spec, err -} - // Match wraps the MatchAllocate function of the fluxion go bindings // If a match is not possible, we return the error and an empty response -func (s *Fluxion) Match(ctx context.Context, in *pb.MatchRequest) (*pb.MatchResponse, error) { +func (fluxion *Fluxion) Match(ctx context.Context, in *pb.MatchRequest) (*pb.MatchResponse, error) { emptyResponse := &pb.MatchResponse{} // Prepare an empty match response (that can still be serialized) klog.Infof("[Fluence] Received Match request %v\n", in) - // Generate the jobspec, written to temporary file and read as string - spec, err := s.generateJobspec(in) + // Generate the jobspec, array of bytes converted to string + spec, err := jobspec.CreateJobSpecYaml(in.Ps, in.Count) if err != nil { return emptyResponse, err } // Ask flux to match allocate! 
- reserved, allocated, at, overhead, jobid, fluxerr := s.cli.MatchAllocate(false, string(spec)) + reserved, allocated, at, overhead, jobid, fluxerr := fluxion.cli.MatchAllocate(false, string(spec)) utils.PrintOutput(reserved, allocated, at, overhead, jobid, fluxerr) // Be explicit about errors (or not) - errorMessages := s.cli.GetErrMsg() + errorMessages := fluxion.cli.GetErrMsg() if errorMessages == "" { klog.Infof("[Fluence] There are no errors") } else { diff --git a/src/fluence/jgf/jgf.go b/src/fluence/jgf/jgf.go index 1f45235..8a047f9 100644 --- a/src/fluence/jgf/jgf.go +++ b/src/fluence/jgf/jgf.go @@ -17,6 +17,7 @@ package jgf import ( "encoding/json" + "fmt" "log" "os" "strconv" @@ -26,13 +27,26 @@ import ( var ( // Defaults for nodes defaultExclusive = false - defaultRank = -1 - defaultSize = 1 + defaultRank = int64(-1) + defaultSize = int64(1) defaultUnit = "" // Relations - containsRelation = "contains" - inRelation = "in" + ContainsRelation = "contains" + InRelation = "in" + + // Vertex (node) types + // These are public to be used in the utils package + ClusterType = "cluster" + NodeType = "node" + CoreType = "core" + VirtualCoreType = "vcore" + RackType = "rack" + SocketType = "socket" + SubnetType = "subnet" + MemoryType = "memory" + NvidiaGPU = "nvidiagpu" + GPUType = "gpu" // Paths containmentKey = "containment" @@ -73,31 +87,20 @@ func (g *Fluxjgf) MakeEdge(source string, target string, contains string) { }, } g.Graph.Edges = append(g.Graph.Edges, newedge) - if contains == containsRelation { + if contains == ContainsRelation { tnode := g.NodeMap[target] tnode.Metadata.Paths[containmentKey] = g.NodeMap[source].Metadata.Paths[containmentKey] + "/" + tnode.Metadata.Name } } -// processLabels selects a subset based on a string filter -func processLabels(labels *map[string]string, filter string) (filtered map[string]string) { - filtered = map[string]string{} - for key, v := range *labels { - if strings.Contains(key, filter) { - filtered[key] = v - } - } - return -} - // MakeSubnet creates a subnet for the graph -func (g *Fluxjgf) MakeSubnet(index int, ip string) string { +func (g *Fluxjgf) MakeSubnet(index int64, ip string) string { newnode := node{ - Id: strconv.Itoa(g.Elements), + Id: fmt.Sprintf("%d", g.Elements), Metadata: nodeMetadata{ - Type: "subnet", + Type: SubnetType, Basename: ip, - Name: ip + strconv.Itoa(g.Elements), + Name: ip + fmt.Sprintf("%d", g.Elements), Id: index, Uniq_id: g.Elements, Rank: defaultRank, @@ -114,11 +117,11 @@ func (g *Fluxjgf) MakeSubnet(index int, ip string) string { // MakeNode creates a new node for the graph func (g *Fluxjgf) MakeNode(index int, exclusive bool, subnet string) string { newnode := node{ - Id: strconv.Itoa(g.Elements), + Id: fmt.Sprintf("%d", g.Elements), Metadata: nodeMetadata{ - Type: "node", + Type: NodeType, Basename: subnet, - Name: subnet + strconv.Itoa(g.Elements), + Name: subnet + fmt.Sprintf("%d", g.Elements), Id: g.Elements, Uniq_id: g.Elements, Rank: defaultRank, @@ -133,13 +136,13 @@ func (g *Fluxjgf) MakeNode(index int, exclusive bool, subnet string) string { } // MakeSocket creates a socket for the graph -func (g *Fluxjgf) MakeSocket(index int, name string) string { +func (g *Fluxjgf) MakeSocket(index int64, name string) string { newnode := node{ - Id: strconv.Itoa(g.Elements), + Id: fmt.Sprintf("%d", g.Elements), Metadata: nodeMetadata{ - Type: "socket", + Type: SocketType, Basename: name, - Name: name + strconv.Itoa(index), + Name: name + fmt.Sprintf("%d", index), Id: index, Uniq_id: g.Elements, Rank: 
defaultRank, @@ -154,13 +157,13 @@ func (g *Fluxjgf) MakeSocket(index int, name string) string { } // MakeCore creates a core for the graph -func (g *Fluxjgf) MakeCore(index int, name string) string { +func (g *Fluxjgf) MakeCore(index int64, name string) string { newnode := node{ - Id: strconv.Itoa(g.Elements), + Id: fmt.Sprintf("%d", g.Elements), Metadata: nodeMetadata{ - Type: "core", + Type: CoreType, Basename: name, - Name: name + strconv.Itoa(index), + Name: name + fmt.Sprintf("%d", index), Id: index, Uniq_id: g.Elements, Rank: defaultRank, @@ -175,13 +178,13 @@ func (g *Fluxjgf) MakeCore(index int, name string) string { } // MakeVCore makes a vcore (I think 2 vcpu == 1 cpu) for the graph -func (g *Fluxjgf) MakeVCore(coreid string, index int, name string) string { +func (g *Fluxjgf) MakeVCore(coreid string, index int64, name string) string { newnode := node{ - Id: strconv.Itoa(g.Elements), + Id: fmt.Sprintf("%d", g.Elements), Metadata: nodeMetadata{ - Type: "vcore", + Type: VirtualCoreType, Basename: name, - Name: name + strconv.Itoa(index), + Name: name + fmt.Sprintf("%d", index), Id: index, Uniq_id: g.Elements, Rank: defaultRank, @@ -192,13 +195,13 @@ func (g *Fluxjgf) MakeVCore(coreid string, index int, name string) string { }, } g.addNode(newnode) - g.MakeEdge(coreid, newnode.Id, containsRelation) - g.MakeEdge(newnode.Id, coreid, inRelation) + g.MakeEdge(coreid, newnode.Id, ContainsRelation) + g.MakeEdge(newnode.Id, coreid, InRelation) return newnode.Id } // MakeNFProperties makes the node feature discovery properties for the graph -func (g *Fluxjgf) MakeNFDProperties(coreid string, index int, filter string, labels *map[string]string) { +func (g *Fluxjgf) MakeNFDProperties(coreid string, index int64, filter string, labels *map[string]string) { for key, _ := range *labels { if strings.Contains(key, filter) { name := strings.Split(key, "/")[1] @@ -207,11 +210,11 @@ func (g *Fluxjgf) MakeNFDProperties(coreid string, index int, filter string, lab } newnode := node{ - Id: strconv.Itoa(g.Elements), + Id: fmt.Sprintf("%d", g.Elements), Metadata: nodeMetadata{ Type: name, Basename: name, - Name: name + strconv.Itoa(index), + Name: name + fmt.Sprintf("%d", index), Id: index, Uniq_id: g.Elements, Rank: defaultRank, @@ -222,22 +225,22 @@ func (g *Fluxjgf) MakeNFDProperties(coreid string, index int, filter string, lab }, } g.addNode(newnode) - g.MakeEdge(coreid, newnode.Id, containsRelation) + g.MakeEdge(coreid, newnode.Id, ContainsRelation) } } } -func (g *Fluxjgf) MakeNFDPropertiesByValue(coreid string, index int, filter string, labels *map[string]string) { +func (g *Fluxjgf) MakeNFDPropertiesByValue(coreid string, index int64, filter string, labels *map[string]string) { for key, val := range *labels { if strings.Contains(key, filter) { name := val newnode := node{ - Id: strconv.Itoa(g.Elements), + Id: fmt.Sprintf("%d", g.Elements), Metadata: nodeMetadata{ Type: name, Basename: name, - Name: name + strconv.Itoa(index), + Name: name + fmt.Sprintf("%d", index), Id: index, Uniq_id: g.Elements, Rank: defaultRank, @@ -248,19 +251,19 @@ func (g *Fluxjgf) MakeNFDPropertiesByValue(coreid string, index int, filter stri }, } g.addNode(newnode) - g.MakeEdge(coreid, newnode.Id, containsRelation) + g.MakeEdge(coreid, newnode.Id, ContainsRelation) } } } // MakeMemory creates memory for the graph -func (g *Fluxjgf) MakeMemory(index int, name string, unit string, size int) string { +func (g *Fluxjgf) MakeMemory(index int64, name string, unit string, size int64) string { newnode := node{ - Id: 
strconv.Itoa(g.Elements), + Id: fmt.Sprintf("%d", g.Elements), Metadata: nodeMetadata{ - Type: "memory", + Type: MemoryType, Basename: name, - Name: name + strconv.Itoa(index), + Name: name + fmt.Sprintf("%d", index), Id: index, Uniq_id: g.Elements, Rank: defaultRank, @@ -275,13 +278,13 @@ func (g *Fluxjgf) MakeMemory(index int, name string, unit string, size int) stri } // MakeGPU makes a gpu for the graph -func (g *Fluxjgf) MakeGPU(index int, name string, size int) string { +func (g *Fluxjgf) MakeGPU(index int64, name string, size int64) string { newnode := node{ - Id: strconv.Itoa(g.Elements), + Id: fmt.Sprintf("%d", g.Elements), Metadata: nodeMetadata{ - Type: "gpu", + Type: GPUType, Basename: name, - Name: name + strconv.Itoa(index), + Name: name + fmt.Sprintf("%d", index), Id: index, Uniq_id: g.Elements, Rank: defaultRank, @@ -301,7 +304,7 @@ func (g *Fluxjgf) MakeCluster(clustername string) string { newnode := node{ Id: strconv.Itoa(0), Metadata: nodeMetadata{ - Type: "cluster", + Type: ClusterType, Basename: clustername, Name: clustername + "0", Id: g.Elements, @@ -320,14 +323,14 @@ func (g *Fluxjgf) MakeCluster(clustername string) string { } // MakeRack makes the rack -func (g *Fluxjgf) MakeRack(id int) string { +func (g *Fluxjgf) MakeRack(index int64) string { newnode := node{ - Id: strconv.Itoa(g.Elements), + Id: fmt.Sprintf("%d", g.Elements), Metadata: nodeMetadata{ - Type: "rack", - Basename: "rack", - Name: "rack" + strconv.Itoa(id), - Id: id, + Type: RackType, + Basename: RackType, + Name: RackType + fmt.Sprintf("%d", index), + Id: index, Uniq_id: g.Elements, Rank: defaultRank, Exclusive: defaultExclusive, diff --git a/src/fluence/jgf/types.go b/src/fluence/jgf/types.go index b2b743f..21ccd00 100644 --- a/src/fluence/jgf/types.go +++ b/src/fluence/jgf/types.go @@ -38,12 +38,12 @@ type nodeMetadata struct { Type string `json:"type"` Basename string `json:"basename"` Name string `json:"name"` - Id int `json:"id"` - Uniq_id int `json:"uniq_id"` - Rank int `json:"rank,omitempty"` + Id int64 `json:"id"` + Uniq_id int64 `json:"uniq_id"` + Rank int64 `json:"rank,omitempty"` Exclusive bool `json:"exclusive"` Unit string `json:"unit"` - Size int `json:"size"` + Size int64 `json:"size"` Paths map[string]string `json:"paths,omitempty"` Properties map[string]string `json:"properties,omitempty"` } @@ -57,6 +57,6 @@ type graph struct { type Fluxjgf struct { Graph graph `json:"graph"` - Elements int `json:"-"` + Elements int64 `json:"-"` NodeMap map[string]node `json:"-"` } diff --git a/src/fluence/jobspec/jobspec.go b/src/fluence/jobspec/jobspec.go index 683f586..96ed0fe 100644 --- a/src/fluence/jobspec/jobspec.go +++ b/src/fluence/jobspec/jobspec.go @@ -18,8 +18,6 @@ package jobspec import ( "fmt" "log" - "math" - "os" pb "github.com/flux-framework/flux-k8s/flux-plugin/fluence/fluxcli-grpc" "gopkg.in/yaml.v2" @@ -39,7 +37,7 @@ Ps: &pb.PodSpec{ */ // CreateJobSpecYaml writes the protobuf jobspec into a yaml file -func CreateJobSpecYaml(spec *pb.PodSpec, count int32, filename string) error { +func CreateJobSpecYaml(spec *pb.PodSpec, count int32) ([]byte, error) { command := []string{spec.Container} fmt.Println("Labels ", spec.Labels, " ", len(spec.Labels)) @@ -68,38 +66,9 @@ func CreateJobSpecYaml(spec *pb.PodSpec, count int32, filename string) error { yamlbytes, err := yaml.Marshal(&js) if err != nil { log.Fatalf("[JobSpec] yaml.Marshal failed with '%s'\n", err) - return err + return yamlbytes, err } - return writeBytes(yamlbytes, filename) -} - -// WriteBytes writes a byte string to file 
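With writeBytes removed, CreateJobSpecYaml now hands the YAML back as bytes and the caller (Match, above) passes it straight to fluxion. A minimal sketch of the new call shape; the jobspec import path is inferred from the pb import in this file and the PodSpec values are placeholders:

```go
package main

import (
	"fmt"

	pb "github.com/flux-framework/flux-k8s/flux-plugin/fluence/fluxcli-grpc"
	"github.com/flux-framework/flux-k8s/flux-plugin/fluence/jobspec"
)

func main() {
	// A throwaway PodSpec shaped like the one PreparePodJobSpec builds.
	spec := &pb.PodSpec{
		Id:        "my-group",
		Container: "my-group",
		Cpu:       4,
		Memory:    16 << 30,
	}

	// The jobspec comes back as bytes instead of being written to a temporary
	// file, so it can be handed directly to the fluxion match call.
	yamlBytes, err := jobspec.CreateJobSpecYaml(spec, 2)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(yamlBytes))
}
```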
-func writeBytes(bytelist []byte, filename string) error { - fmt.Printf("[JobSpec] Preparing to write:\n%s\n", string(bytelist)) - f, err := os.Create(filename) - if err != nil { - log.Fatalf("[JobSpec] Couldn't create file!!\n") - return err - } - defer f.Close() - - _, err = f.Write(bytelist) - if err != nil { - log.Fatalf("[JobSpec] Couldn't write file!!\n") - return err - } - - // Not sure why this is here, but will keep for now - _, err = f.WriteString("\n") - if err != nil { - log.Fatalf("[JobSpec] Couldn't append newline to file!!\n") - } - return err -} - -func toGB(bytes int64) int64 { - res := float64(bytes) / math.Pow(10, 9) - return int64(res) + return yamlbytes, nil } // createSocketResources creates the socket resources for the JobSpec diff --git a/src/fluence/utils/utils.go b/src/fluence/utils/utils.go index e429056..490a0e0 100644 --- a/src/fluence/utils/utils.go +++ b/src/fluence/utils/utils.go @@ -93,23 +93,18 @@ func CreateJGF(filename string, skipLabel *string) error { // Create a Flux Json Graph Format (JGF) with all cluster nodes fluxgraph := jgf.InitJGF() - // TODO it looks like we can add more to the graph here - - // let's remember to consider what else we can. - // subnets := make(map[string]string) - + // Top level of the graph is the cluster + // This assumes fluxion is only serving one cluster. + // previous comments indicate that we choose between the level + // of a rack and a subnet. A rack doesn't make sense (the nodes could + // be on multiple racks) so subnet is likely the right abstraction cluster := fluxgraph.MakeCluster("k8scluster") - // Rack needs to be disabled when using subnets - // rack := fluxgraph.MakeRack(0) - - // fluxgraph.MakeEdge(cluster, rack, "contains") - // fluxgraph.MakeEdge(rack, cluster, "in") - vcores := 0 fmt.Println("Number nodes ", len(nodes.Items)) var totalAllocCpu int64 totalAllocCpu = 0 - sdnCount := 0 + sdnCount := int64(0) for nodeIndex, node := range nodes.Items { @@ -146,13 +141,12 @@ func CreateJGF(filename string, skipLabel *string) error { return err } - // Check if subnet already exists - // Here we build subnets according to topology.kubernetes.io/zone label + // Here we build the subnet according to topology.kubernetes.io/zone label subnetName := node.Labels["topology.kubernetes.io/zone"] subnet := fluxgraph.MakeSubnet(sdnCount, subnetName) sdnCount = sdnCount + 1 - fluxgraph.MakeEdge(cluster, subnet, "contains") - fluxgraph.MakeEdge(subnet, cluster, "in") + fluxgraph.MakeEdge(cluster, subnet, jgf.ContainsRelation) + fluxgraph.MakeEdge(subnet, cluster, jgf.InRelation) // These are requests for existing pods, for cpu and memory reqs := computeTotalRequests(pods) @@ -179,64 +173,44 @@ func CreateJGF(filename string, skipLabel *string) error { fmt.Printf(" available mem: %d\n", availMem) gpuAllocatable, hasGpuAllocatable := node.Status.Allocatable["nvidia.com/gpu"] - // reslist := node.Status.Allocatable - // resources := make([]corev1.ResourceName, 0, len(reslist)) - // for resource := range reslist { - // fmt.Println("resource ", resource) - // resources = append(resources, resource) - // } - // for _, resource := range resources { - // value := reslist[resource] - - // fmt.Printf(" %s:\t%s\n", resource, value.String()) - // } + // TODO possibly look at pod resources vs. 
node.Status.Allocatable workernode := fluxgraph.MakeNode(nodeIndex, false, node.Name) - fluxgraph.MakeEdge(subnet, workernode, "contains") // this is rack otherwise - fluxgraph.MakeEdge(workernode, subnet, "in") // this is rack otherwise - - // socket := fluxgraph.MakeSocket(0, "socket") - // fluxgraph.MakeEdge(workernode, socket, "contains") - // fluxgraph.MakeEdge(socket, workernode, "in") + fluxgraph.MakeEdge(subnet, workernode, jgf.ContainsRelation) + fluxgraph.MakeEdge(workernode, subnet, jgf.InRelation) if hasGpuAllocatable { fmt.Println("GPU Resource quantity ", gpuAllocatable.Value()) - //MakeGPU(index int, name string, size int) string { for index := 0; index < int(gpuAllocatable.Value()); index++ { - gpu := fluxgraph.MakeGPU(index, "nvidiagpu", 1) - fluxgraph.MakeEdge(workernode, gpu, "contains") // workernode was socket - fluxgraph.MakeEdge(gpu, workernode, "in") + gpu := fluxgraph.MakeGPU(int64(index), jgf.NvidiaGPU, 1) + fluxgraph.MakeEdge(workernode, gpu, jgf.ContainsRelation) + fluxgraph.MakeEdge(gpu, workernode, jgf.InRelation) } } for index := 0; index < int(availCpu); index++ { - // MakeCore(index int, name string) - core := fluxgraph.MakeCore(index, "core") - fluxgraph.MakeEdge(workernode, core, "contains") // workernode was socket - fluxgraph.MakeEdge(core, workernode, "in") + core := fluxgraph.MakeCore(int64(index), jgf.CoreType) + fluxgraph.MakeEdge(workernode, core, jgf.ContainsRelation) + fluxgraph.MakeEdge(core, workernode, jgf.InRelation) // Question from Vanessa: // How can we get here and have vcores ever not equal to zero? if vcores == 0 { - fluxgraph.MakeNFDProperties(core, index, "cpu-", &node.Labels) - // fluxgraph.MakeNFDProperties(core, index, "netmark-", &node.Labels) + fluxgraph.MakeNFDProperties(core, int64(index), "cpu-", &node.Labels) } else { - for vc := 0; vc < vcores; vc++ { - vcore := fluxgraph.MakeVCore(core, vc, "vcore") - fluxgraph.MakeNFDProperties(vcore, index, "cpu-", &node.Labels) + for virtualCore := 0; virtualCore < vcores; virtualCore++ { + vcore := fluxgraph.MakeVCore(core, int64(virtualCore), jgf.VirtualCoreType) + fluxgraph.MakeNFDProperties(vcore, int64(index), "cpu-", &node.Labels) } } } - // MakeMemory(index int, name string, unit string, size int) fractionMem := availMem >> 30 - // fractionmem := (totalmem/totalcpu) >> 20 - // fmt.Println("Creating ", fractionmem, " vertices with ", 1<<10, " MB of mem") - for i := 0; i < /*int(totalcpu)*/ int(fractionMem); i++ { - mem := fluxgraph.MakeMemory(i, "memory", "MB", int(1<<10)) - fluxgraph.MakeEdge(workernode, mem, "contains") - fluxgraph.MakeEdge(mem, workernode, "in") + for i := 0; i < int(fractionMem); i++ { + mem := fluxgraph.MakeMemory(int64(i), jgf.MemoryType, "MB", 1<<10) + fluxgraph.MakeEdge(workernode, mem, jgf.ContainsRelation) + fluxgraph.MakeEdge(mem, workernode, jgf.InRelation) } } fmt.Printf("\nCan request at most %d exclusive cpu", totalAllocCpu) @@ -248,6 +222,7 @@ func CreateJGF(filename string, skipLabel *string) error { } +// computeTotalRequests sums up the pod requests for the list. We do not consider limits. 
func computeTotalRequests(podList *corev1.PodList) (total map[corev1.ResourceName]resource.Quantity) { total = map[corev1.ResourceName]resource.Quantity{} for _, pod := range podList.Items { @@ -260,14 +235,6 @@ func computeTotalRequests(podList *corev1.PodList) (total map[corev1.ResourceNam total[podReqName] = v } } - // for podLimitName, podLimitValue := range podLimits { - // if v, ok := total[podLimitName]; !ok { - // total[podLimitName] = podLimitValue - // } else { - // v.Add(podLimitValue) - // total[podLimitName] = v - // } - // } } return } @@ -295,17 +262,17 @@ func ParseAllocResult(allocated, podName string) []allocation { // Parse graph and nodes into interfaces // TODO look at github.com/mitchellh/mapstructure // that might make this easier - nodes := dat["graph"].(interface{}) + nodes := dat["graph"] str1 := nodes.(map[string]interface{}) str2 := str1["nodes"].([]interface{}) for _, item := range str2 { str1 = item.(map[string]interface{}) metadata := str1["metadata"].(map[string]interface{}) - if metadata["type"].(string) == "core" { + if metadata["type"].(string) == jgf.CoreType { corecount = corecount + 1 } - if metadata["type"].(string) == "node" { + if metadata["type"].(string) == jgf.NodeType { result = append(result, allocation{ Type: metadata["type"].(string), Name: metadata["name"].(string), @@ -334,6 +301,6 @@ func PrintOutput(reserved bool, allocated string, at int64, overhead float64, jo // Only print error if we had one if fluxerr != nil { - fmt.Printf("error: %w\n", fluxerr) + fmt.Printf("error: %s\n", fluxerr) } }
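Pulling the CreateJGF changes together: the graph is now built from the named jgf vertex types and relation constants instead of string literals. A condensed sketch of the containment chain it produces (cluster, subnet, node, core, memory), one vertex of each, with the paired contains/in edges; the jgf import path is inferred and the zone and node names are placeholders:

```go
package main

import (
	"github.com/flux-framework/flux-k8s/flux-plugin/fluence/jgf"
)

func main() {
	// Build the same containment chain CreateJGF builds, one vertex wide:
	// cluster -> subnet -> node -> {core, memory}, with paired contains/in edges.
	fluxgraph := jgf.InitJGF()

	cluster := fluxgraph.MakeCluster("k8scluster")

	subnet := fluxgraph.MakeSubnet(0, "zone-a")
	fluxgraph.MakeEdge(cluster, subnet, jgf.ContainsRelation)
	fluxgraph.MakeEdge(subnet, cluster, jgf.InRelation)

	workernode := fluxgraph.MakeNode(0, false, "node-0")
	fluxgraph.MakeEdge(subnet, workernode, jgf.ContainsRelation)
	fluxgraph.MakeEdge(workernode, subnet, jgf.InRelation)

	core := fluxgraph.MakeCore(0, jgf.CoreType)
	fluxgraph.MakeEdge(workernode, core, jgf.ContainsRelation)
	fluxgraph.MakeEdge(core, workernode, jgf.InRelation)

	mem := fluxgraph.MakeMemory(0, jgf.MemoryType, "MB", 1<<10)
	fluxgraph.MakeEdge(workernode, mem, jgf.ContainsRelation)
	fluxgraph.MakeEdge(mem, workernode, jgf.InRelation)
}
```

The same pattern repeats per node in CreateJGF, with MakeGPU added when the node advertises nvidia.com/gpu capacity.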