diff --git a/.github/test-kind-config.yaml b/.github/test-kind-config.yaml new file mode 100644 index 0000000..0fe29e7 --- /dev/null +++ b/.github/test-kind-config.yaml @@ -0,0 +1,5 @@ +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +nodes: +- role: control-plane +- role: worker \ No newline at end of file diff --git a/.github/test.sh b/.github/test.sh old mode 100755 new mode 100644 index 44314ad..2b8b1e6 --- a/.github/test.sh +++ b/.github/test.sh @@ -18,6 +18,8 @@ cd upstream/manifests/install/charts helm install \ --set scheduler.image=ghcr.io/flux-framework/fluence:latest \ --set scheduler.sidecarimage=ghcr.io/flux-framework/fluence-sidecar:latest \ + --set controller.image=ghcr.io/flux-framework/fluence-controller:latest \ + --set controller.pullPolicy=Never \ --set scheduler.pullPolicy=Never \ --set scheduler.sidecarPullPolicy=Never \ schedscheduler-plugins as-a-second-scheduler/ diff --git a/.github/workflows/build-deploy.yaml b/.github/workflows/build-deploy.yaml index c993aa9..575d2db 100644 --- a/.github/workflows/build-deploy.yaml +++ b/.github/workflows/build-deploy.yaml @@ -18,7 +18,7 @@ jobs: name: build fluence steps: - uses: actions/checkout@v4 - - uses: actions/setup-go@v3 + - uses: actions/setup-go@v4 with: go-version: ^1.19 @@ -45,7 +45,44 @@ jobs: - name: Deploy Container if: (github.event_name != 'pull_request') run: docker push ${{ env.container }} --all-tags - + + build-controller: + permissions: + packages: write + env: + container: ghcr.io/flux-framework/fluence-controller + runs-on: ubuntu-latest + name: build fluence-controller + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-go@v4 + with: + go-version: ^1.19 + + - name: Build Containers + run: | + make prepare + make build REGISTRY=ghcr.io/flux-framework CONTROLLER_IMAGE=fluence-controller + + - name: Tag Release Image + if: (github.event_name == 'release') + run: | + tag=${GITHUB_REF#refs/tags/} + echo "Tagging and releasing ${{ env.container}}:${tag}" + docker tag ${{ env.container }}:latest ${{ env.container }}:${tag} + + - name: GHCR Login + if: (github.event_name != 'pull_request') + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Deploy Container + if: (github.event_name != 'pull_request') + run: docker push ${{ env.container }} --all-tags + build-sidecar: permissions: packages: write @@ -55,7 +92,7 @@ jobs: name: build sidecar steps: - uses: actions/checkout@v4 - - uses: actions/setup-go@v3 + - uses: actions/setup-go@v4 with: go-version: ^1.19 diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 3e24a33..ed45891 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -11,30 +11,42 @@ on: jobs: build-fluence: + + # The scheduler and controller are built together with the hack script + # in the upstream scheduler-plugins env: container: ghcr.io/flux-framework/fluence + controller: ghcr.io/flux-framework/fluence-controller runs-on: ubuntu-latest name: build fluence steps: - uses: actions/checkout@v4 - - uses: actions/setup-go@v3 + - uses: actions/setup-go@v4 with: go-version: ^1.19 - name: Build Containers run: | make prepare - make build REGISTRY=ghcr.io/flux-framework SCHEDULER_IMAGE=fluence + make build REGISTRY=ghcr.io/flux-framework SCHEDULER_IMAGE=fluence CONTROLLER_IMAGE=fluence-controller - - name: Save Container - run: docker save ${{ env.container }} | gzip > fluence_latest.tar.gz + - name: Save Containers + run: | + docker save ${{ 
env.container }} | gzip > fluence_latest.tar.gz + docker save ${{ env.controller }} | gzip > fluence_controller_latest.tar.gz - name: Upload container artifact uses: actions/upload-artifact@v4 with: name: fluence path: fluence_latest.tar.gz - + + - name: Upload container artifact + uses: actions/upload-artifact@v4 + with: + name: fluence_controller + path: fluence_controller_latest.tar.gz + build-sidecar: env: container: ghcr.io/flux-framework/fluence-sidecar @@ -42,7 +54,7 @@ jobs: name: build sidecar steps: - uses: actions/checkout@v4 - - uses: actions/setup-go@v3 + - uses: actions/setup-go@v4 with: go-version: ^1.19 @@ -59,7 +71,7 @@ jobs: with: name: fluence_sidecar path: fluence_sidecar_latest.tar.gz - + test-fluence: needs: [build-fluence, build-sidecar] permissions: @@ -67,14 +79,15 @@ jobs: env: fluence_container: ghcr.io/flux-framework/fluence sidecar_container: ghcr.io/flux-framework/fluence-sidecar + controller_container: ghcr.io/flux-framework/fluence-controller runs-on: ubuntu-latest - name: build fluence + name: test fluence steps: - uses: actions/checkout@v4 - - uses: actions/setup-go@v3 + - uses: actions/setup-go@v4 with: - go-version: ^1.20 + go-version: ^1.19 - name: Download fluence artifact uses: actions/download-artifact@v4 @@ -88,11 +101,27 @@ jobs: name: fluence_sidecar path: /tmp + - name: Download fluence_controller artifact + uses: actions/download-artifact@v4 + with: + name: fluence_controller + path: /tmp + + - name: Make Space For Build + run: | + sudo rm -rf /usr/share/dotnet + sudo rm -rf /usr/local/lib/android + sudo rm -rf /opt/ghc + - name: Load Docker images run: | ls /tmp/*.tar.gz docker load --input /tmp/fluence_sidecar_latest.tar.gz + rm /tmp/fluence_sidecar_latest.tar.gz docker load --input /tmp/fluence_latest.tar.gz + rm /tmp/fluence_latest.tar.gz + docker load --input /tmp/fluence_controller_latest.tar.gz + rm /tmp/fluence_controller_latest.tar.gz docker image ls -a | grep fluence - name: Create Kind Cluster @@ -101,15 +130,23 @@ jobs: cluster_name: kind kubectl_version: v1.28.2 version: v0.20.0 + config: ./.github/test-kind-config.yaml - name: Load Docker Containers into Kind env: fluence: ${{ env.fluence_container }} sidecar: ${{ env.sidecar_container }} + controller: ${{ env.controller_container }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | kind load docker-image ${fluence} kind load docker-image ${sidecar} + kind load docker-image ${controller} + + - name: Install Cert Manager + run: | + kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.13.1/cert-manager.yaml + sleep 10 - name: Test Fluence run: /bin/bash ./.github/test.sh @@ -122,6 +159,8 @@ jobs: docker tag ${{ env.fluence_container }}:latest ${{ env.fluence_container }}:${tag} echo "Tagging and releasing ${{ env.sidecar_container}}:${tag}" docker tag ${{ env.sidecar_container }}:latest ${{ env.sidecar_container }}:${tag} + echo "Tagging and releasing ${{ env.controller_container}}:${tag}" + docker tag ${{ env.controller_container }}:latest ${{ env.controller_container }}:${tag} # If we get here, tests pass, and we can deploy - name: GHCR Login @@ -137,3 +176,4 @@ jobs: run: | docker push ${{ env.fluence_container }} --all-tags docker push ${{ env.sidecar_container }} --all-tags + docker push ${{ env.controller_container }} --all-tags \ No newline at end of file diff --git a/.gitignore b/.gitignore index fa1845c..51462a8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ plugins upstream scheduler-plugins +sig-scheduler-plugins/pkg/fluence/bin/ 
+src/bin +src/fluence/vendor \ No newline at end of file diff --git a/Makefile b/Makefile index dc87d50..d051a0e 100644 --- a/Makefile +++ b/Makefile @@ -10,24 +10,41 @@ SIDECAR_IMAGE ?= fluence-sidecar:latest CONTROLLER_IMAGE ?= fluence-controller SCHEDULER_IMAGE ?= fluence -.PHONY: all build build-sidecar prepare push push-sidecar push-controller +.PHONY: all build build-sidecar clone update push push-sidecar push-controller -all: build-sidecar prepare build +all: prepare build-sidecar build build-sidecar: make -C ./src LOCAL_REGISTRY=${REGISTRY} LOCAL_IMAGE=${SIDECAR_IMAGE} -prepare: +clone: if [ -d "$(CLONE_UPSTREAM)" ]; then echo "Upstream is cloned"; else git clone $(UPSTREAM) ./$(CLONE_UPSTREAM); fi + +update: clone + git -C $(CLONE_UPSTREAM) pull origin master + +prepare: clone # These are entirely new directory structures + rm -rf $(CLONE_UPSTREAM)/pkg/fluence + rm -rf $(CLONE_UPSTREAM)/pkg/logger + # rm -rf $(CLONE_UPSTREAM)/cmd/app + rm -rf $(CLONE_UPSTREAM)/pkg/controllers/podgroup_controller.go + rm -rf $(CLONE_UPSTREAM)/cmd/controller/app/server.go + cp -R sig-scheduler-plugins/pkg/logger $(CLONE_UPSTREAM)/pkg/logger cp -R sig-scheduler-plugins/pkg/fluence $(CLONE_UPSTREAM)/pkg/fluence - cp -R sig-scheduler-plugins/manifests/fluence $(CLONE_UPSTREAM)/manifests/fluence + cp -R sig-scheduler-plugins/pkg/controllers/* $(CLONE_UPSTREAM)/pkg/controllers/ + # This is the one exception not from sig-scheduler-plugins because it is needed in both spots + cp -R src/fluence/fluxcli-grpc $(CLONE_UPSTREAM)/pkg/fluence/fluxcli-grpc + # cp -R sig-scheduler-plugins/cmd/app ./upstream/cmd/app # These are files with subtle changes to add fluence cp sig-scheduler-plugins/cmd/scheduler/main.go ./upstream/cmd/scheduler/main.go - cp sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/deployment.yaml $(CLONE_UPSTREAM)/manifests/install/charts/as-a-second-scheduler/templates/deployment.yaml + cp sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/*.yaml $(CLONE_UPSTREAM)/manifests/install/charts/as-a-second-scheduler/templates/ + cp sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/crds/*.yaml $(CLONE_UPSTREAM)/manifests/install/charts/as-a-second-scheduler/crds/ cp sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/values.yaml $(CLONE_UPSTREAM)/manifests/install/charts/as-a-second-scheduler/values.yaml + cp sig-scheduler-plugins/apis/scheduling/v1alpha1/*.go $(CLONE_UPSTREAM)/apis/scheduling/v1alpha1/ + cp sig-scheduler-plugins/cmd/controller/app/server.go $(CLONE_UPSTREAM)/cmd/controller/app/server.go -build: +build: prepare REGISTRY=${REGISTRY} IMAGE=${SCHEDULER_IMAGE} CONTROLLER_IMAGE=${CONTROLLER_IMAGE} $(BASH) $(CLONE_UPSTREAM)/hack/build-images.sh push-sidecar: diff --git a/README.md b/README.md index 968c2dc..300eb1d 100644 --- a/README.md +++ b/README.md @@ -2,21 +2,112 @@ ![docs/images/fluence.png](docs/images/fluence.png) -Fluence enables HPC-grade pod scheduling in Kubernetes via the [Kubernetes Scheduling Framework](https://kubernetes.io/docs/concepts/scheduling-eviction/scheduling-framework/). Fluence uses the directed-graph based [Fluxion scheduler](https://github.com/flux-framework/flux-sched) to map pods or [podgroups](https://github.com/kubernetes-sigs/scheduler-plugins/tree/master/pkg/coscheduling) to nodes. Fluence supports all the Fluxion scheduling algorithms (e.g., `hi`, `low`, `hinode`, etc.). Note that Fluence does not currently support use in conjunction with the kube-scheduler. 
Pods must all be scheduled by Fluence. +Fluence enables HPC-grade pod scheduling in Kubernetes via the [Kubernetes Scheduling Framework](https://kubernetes.io/docs/concepts/scheduling-eviction/scheduling-framework/). Fluence uses the directed-graph based [Fluxion scheduler](https://github.com/flux-framework/flux-sched) to map pods or [podgroups](https://github.com/kubernetes-sigs/scheduler-plugins/tree/master/pkg/coscheduling) to nodes. Fluence supports all the Fluxion scheduling algorithms (e.g., `hi`, `low`, `hinode`, etc.). + +**Important** Fluence does not currently support use in conjunction with the kube-scheduler. Pods must all be scheduled by Fluence, and *you should not use both schedulers in the same cluster*. + +## TODO + +- Need to allow for restarts / crashes, looking up the existing jobid and updating maps in PodGroup +- Since AskFlux is done at the level of the pod group, refactor the function to account for the specific resources of all pods (not just one pod) +- Figure out if EventsToRegister replaces the old informer +- Would be nice to see the state of fluxion (retest the kubectl-fluence plugin) ## Getting started -For instructions on how to start Fluence on a K8s cluster, see [examples](examples/). Documentation and instructions for reproducing our CANOPIE2022 paper (citation below) can be found in the [canopie22-artifacts branch](https://github.com/flux-framework/flux-k8s/tree/canopie22-artifacts). -For background on the Flux framework and the Fluxion scheduler, you can take a look at our award-winning R&D100 submission: https://ipo.llnl.gov/sites/default/files/2022-02/Flux_RD100_Final.pdf. For next steps: +For instructions on how to start Fluence on a K8s cluster, see [examples](examples/). Documentation and instructions for reproducing our CANOPIE-2022 paper (citation below) can be found in the [canopie22-artifacts branch](https://github.com/flux-framework/flux-k8s/tree/canopie22-artifacts). +For background on the Flux framework and the Fluxion scheduler, you can take a look at our award-winning [R&D100 submission](https://ipo.llnl.gov/sites/default/files/2022-02/Flux_RD100_Final.pdf). For next steps: + - To understand how it works, see [Design](#design) - To deploy our pre-built images, go to [Deploy](#deploy) - - To build your own images, go to [Setup](#setup) + - To build your own images, go to [Build](#build) + - To learn about repository organization, see [Developer](#developer) + +### Design + +Fluence is a custom scheduler plugin that you enable with a directive in your pod spec: + +- Asking for `fluence` as the scheduler name + +Note that any abstraction with pods (or a single pod) marked for fluence will automatically have the group name +and nodes derived. However, if you want to customize this metadata (for example, to define the size of the pod group explicitly), you can use +the following labels: + + - A named group of pods with the `scheduling.x-k8s.io/pod-group` label. + - Defining the group size with the `fluence.group-size` label. + +We expect to define more labels to customize the scheduling logic. + +The way it works: + +1. We have a mutating admission webhook that looks for jobs and pods, and ensures there are fluence labels (likely we will add more abstractions). +2. A PodGroup reconciler is watching for these same objects. When they are created: + a. We find the labels and create the pod group object. + b. The pod group object has a timestamp for creation in microseconds. +3. 
When the pod is then given to fluence for scheduling, it already has the PodGroup created with a name and size, and the scheduler can sort it properly. + +Here is an example of a Job intended for Fluence: + +```yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: fluence-job +spec: + completions: 10 + parallelism: 10 + completionMode: Indexed + template: + spec: + schedulerName: fluence + containers: + - name: fluence-job + image: busybox + command: [echo, potato] + restartPolicy: Never + backoffLimit: 4 +``` + +And you can imagine that you might want to group pods from different abstractions together, or declare a different size than what is represented in the Job: + +```yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: fluence-job + labels: + scheduling.x-k8s.io/pod-group: min-size-group + fluence.group-size: "5" +spec: + completions: 10 + parallelism: 10 + completionMode: Indexed + template: + spec: + schedulerName: fluence + containers: + - name: fluence-job + image: busybox + command: [echo, potato] + restartPolicy: Never + backoffLimit: 4 +``` + +There is no reason pods with different names or under different abstractions cannot be part of the same group that needs to be scheduled together. Also note that: + +- We currently do not allow scheduling to the control plane +- Deployments, StatefulSets, and ReplicaSets can be scheduled and have pod groups created; however, the pod groups are not cleaned up, as these abstractions are not meant to complete. ### Deploy We provide a set of pre-build containers [alongside the repository](https://github.com/orgs/flux-framework/packages?repo_name=flux-k8s) -that you can easily use to deploy Fluence right away! You'll simply need to clone the proper helm charts, and then install to your cluster. -We provide helper commands to do that. +that you can easily use to deploy Fluence right away! You'll first need to install the certificate manager: + +```bash +kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.13.1/cert-manager.yaml +``` + +Then clone the proper helm charts and install them to your cluster. We provide helper commands to do that. ```bash # This clones the upstream scheduler plugins code, we will add fluence to it! @@ -27,13 +118,14 @@ cd upstream/manifests/install/charts helm install \ --set scheduler.image=ghcr.io/flux-framework/fluence:latest \ --set scheduler.sidecarimage=ghcr.io/flux-framework/fluence-sidecar \ - schedscheduler-plugins as-a-second-scheduler/ + --set controller.image=ghcr.io/flux-framework/fluence-controller \ + fluence as-a-second-scheduler/ ``` And that's it! See the [testing install](#testing-install) section for a basic example to schedule pods using Fluence. -### Setup +### Build To build and test Fluence, you will need: @@ -41,22 +133,21 @@ To build and test Fluence, you will need: - [Go](https://go.dev/doc/install), we have tested with version 1.19 - [helm](https://helm.sh/docs/intro/install/) to install charts for scheduler plugins. 
- A Kubernetes cluster for testing, e.g., you can deploy one with [kind](https://kind.sigs.k8s.io/docs/user/quick-start/) -### Building Fluence - -There are two images we will be building: +There are three images we will be building: - the scheduler sidecar: built from the repository here - - the scheduler: built from [this branch of scheduler-plugins](https://github.com/openshift-psap/scheduler-plugins/blob/fluence/build/scheduler/Dockerfile) + - the scheduler: built (and modified) from [this branch of scheduler-plugins](https://github.com/openshift-psap/scheduler-plugins/blob/fluence/build/scheduler/Dockerfile) + - the controller: same as the scheduler -#### All at once (Sidecar + Scheduler) +#### Build All -**recommended** +**This builds the scheduler, sidecar to the scheduler, and controller** This will run the full builds for all containers in one step, which includes: 1. Building the fluence sidecar from source code in [src](src) -2. Cloning the upstream kubernetes-sigs/plugin-schedulers respository to ./upstream -3. Building the scheduler container +2. Cloning the upstream kubernetes-sigs/plugin-schedulers repository to ./upstream +3. Building the scheduler and controller containers From the root here: @@ -67,130 +158,31 @@ make or customize the naming of your registry or local images: ```bash -make REGISTRY=vanessa SCHEDULER_IMAGE=fluence SIDECAR_IMAGE=fluence-sidecar -``` - -As an alternative, you can do each of the steps separately or manually (detailed below). - -
- - Manual Build Instructions - -#### Build Sidecar - -To build the plugin containers, we will basically be running `make` from the [src](src) directory. We have wrapped that for you -in the Makefile: - -```bash -make build-sidecar -``` - -To build for a custom registry (e.g., "vanessa' on Docker Hub): - -```bash -make build-sidecar REGISTRY=vanessa -``` - -And specify the sidecar image name too: - -```bash -make build-sidecar REGISTRY=vanessa SIDECAR_IMAGE=another-sidecar -``` - -The equivalent manual command is: - -```bash -cd src -make -``` - -Using either of the approaches above, this will create the scheduler plugin main container, which can be tagged and pushed to the preferred registry. As an example, -here we push to the result of the build above: - -```bash -docker push docker.io/vanessa/fluence-sidecar:latest -``` - -#### Build Scheduler - -Note that you can run this entire process like: - -```bash -make prepare -make build -``` - -Or customize the name of the scheduler image: - -```bash -make prepare -make build REGISTRY=vanessa -``` - -For a custom scheduler or controller image (we just need the scheduler): - -```bash -make build REGISTRY=vanessa CONTROLLER_IMAGE=fluence-controller SCHEDULER_IMAGE=fluence -``` - -To walk through it manually, first, clone the upstream scheduler-plugins repository: - -```bash -git clone https://github.com/kubernetes-sigs/scheduler-plugins ./upstream -``` - -We need to add our fluence package to the scheduler plugins to build. You can do that manully as follows: - -```bash -# These are entirely new directory structures -cp -R sig-scheduler-plugins/pkg/fluence ./upstream/pkg/fluence -cp -R sig-scheduler-plugins/manifests/fluence ./upstream/manifests/fluence - -# These are files with subtle changes to add fluence -cp sig-scheduler-plugins/cmd/scheduler/main.go ./upstream/cmd/scheduler/main.go -cp sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/deployment.yaml ./upstream/manifests/install/charts/as-a-second-scheduler/templates/deployment.yaml -cp sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/values.yaml ./upstream/manifests/install/charts/as-a-second-scheduler/values.yaml +make REGISTRY=vanessa SCHEDULER_IMAGE=fluence SIDECAR_IMAGE=fluence-sidecar CONTROLLER_IMAGE=fluence-controller ``` -Then change directory to the scheduler plugins repository. - -```bash -cd ./upstream -``` +As an alternative, you can look at the Makefile to do each of the steps separately. -And build! You'll most likely want to set a custom registry and image name again: +#### Prepare Cluster -```bash -# This will build to localhost -make local-image - -# this will build to docker.io/vanessa/fluence -make local-image REGISTRY=vanessa CONTROLLER_IMAGE=fluence -``` - -
- -**Important** the make command above produces _two images_ and you want to use the first that is mentioned in the output (not the second, which is a controller). +> Prepare a cluster and install the Kubernetes scheduling plugins framework -Whatever build approach you use, you'll want to push to your registry for later discovery! +These steps will require a Kubernetes cluster to install to, and having pushed the plugin container to a registry OR loading +them into the local cluster and setting the image pull policy to `Never`. If you aren't using a cloud provider, you can create a local one with `kind`: ```bash -docker push docker.io/vanessa/fluence +kind create cluster --config ./examples/kind-config.yaml ``` -### Prepare Cluster - -> Prepare a cluster and install the Kubernetes scheduling plugins framework - -These steps will require a Kubernetes cluster to install to, and having pushed the plugin container to a registry. If you aren't using a cloud provider, you can create a local one with `kind`: +And again install the certificate manager: ```bash -kind create cluster +kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.13.1/cert-manager.yaml ``` **Important** if you are developing or testing fluence, note that custom scheduler plugins don't seem to work out of the box with MiniKube (but everything works with kind). Likely there are extensions or similar that need to be configured with MiniKube (that we have not looked into). -### Install Fluence +#### Install Fluence For some background, the [Scheduling Framework](https://kubernetes.io/docs/concepts/scheduling-eviction/scheduling-framework/) provided by Kubernetes means that our container is going to provide specific endpoints to allow for custom scheduling. At this point you can follow the instructions @@ -219,19 +211,26 @@ helm show values as-a-second-scheduler/ scheduler: name: fluence - image: registry.k8s.io/scheduler-plugins/kube-scheduler:v0.27.8 + image: ghcr.io/flux-framework/fluence:latest replicaCount: 1 leaderElect: false sidecarimage: ghcr.io/flux-framework/fluence-sidecar:latest policy: lonode pullPolicy: Always sidecarPullPolicy: Always + loggingLevel: "9" + + # Port is for GRPC, and enabling the external service will also + # create the service and ingress to it, along with adding + # additional API endpoints for our TBA kubectl plugin + enableExternalService: false + port: 4242 controller: name: scheduler-plugins-controller - image: registry.k8s.io/scheduler-plugins/controller:v0.27.8 + image: ghcr.io/flux-framework/fluence-controller:latest replicaCount: 1 - pullPolicy: IfNotPresent + pullPolicy: Always # LoadVariationRiskBalancing and TargetLoadPacking are not enabled by default # as they need extra RBAC privileges on metrics.k8s.io. 
@@ -252,6 +251,15 @@ pluginConfig: # args: # scoringStrategy: # type: MostAllocated # default is LeastAllocated + +enableCertManager: true +kubernetesClusterDomain: cluster.local +webhookService: + ports: + - port: 9443 + protocol: TCP + targetPort: 9443 + type: ClusterIP ``` @@ -264,28 +272,25 @@ cd upstream/manifests/install/charts helm install \ --set scheduler.image=vanessa/fluence:latest \ --set scheduler.sidecarimage=vanessa/fluence-sidecar \ - schedscheduler-plugins as-a-second-scheduler/ + --set controller.image=vanessa/fluence-controller \ + fluence as-a-second-scheduler/ ``` -If you load your images into your testing environment and don't need to pull, you can change the pull policy too: +If you need to uninstall (e.g., to redo something): ```bash -helm install \ - --set scheduler.image=vanessa/fluence:latest \ - --set scheduler.sidecarimage=vanessa/fluence-sidecar \ - --set scheduler.sidecarPullPolicy=IfNotPresent \ - schedscheduler-plugins as-a-second-scheduler/ +helm uninstall fluence ``` -If you need to uninstall (e.g., to redo something): +Or see the name you used: ```bash -helm uninstall schedscheduler-plugins +helm list ``` Next you can move down to testing the install. -### Testing Install +#### Testing Install The installation process will run one scheduler and one controller pod for the Scheduler Plugin Framework in the default namespace. You can double check that everything is running as follows: @@ -328,35 +333,40 @@ kubectl logs fluence-6bbcbc6bbf-xjfx6 -c scheduler-plugins-scheduler If you haven't done anything, you'll likely just see health checks. -### Deploy Pods +#### Testing Pods and Jobs -Let's now run a simple example! Change directory into this directory: +You can test deploying pods and jobs. + +```bash +kubectl apply -f examples/simple_example/fluence-scheduler-pod.yaml +``` +or a job: ```bash -# This is from the root of flux-k8s -cd examples/simple_example +# size 3 +kubectl apply -f examples/test_example/fluence-sized-job.yaml + +# size 1 +kubectl apply -f examples/test_example/fluence-job.yaml ``` -And then we want to deploy two pods, one assigned to the `default-scheduler` and the other -`fluence`. For FYI, we do this via setting `schedulerName` in the spec: +Note that all of these have (in their spec) a designation of the fluence scheduler. ```yaml spec: schedulerName: fluence ``` -Here is how to create the pods: +Once it was created, aside from checking that it ran OK, you can verify by looking at the scheduler logs again: ```bash -kubectl apply -f default-scheduler-pod.yaml -kubectl apply -f fluence-scheduler-pod.yaml +kubectl logs fluence-6bbcbc6bbf-xjfx6 ``` -Once it was created, aside from checking that it ran OK, I could verify by looking at the scheduler logs again: +
+ +Scheduler Logs -```bash -kubectl logs fluence-6bbcbc6bbf-xjfx6 -``` ```bash Defaulted container "sidecar" out of: sidecar, scheduler-plugins-scheduler This is the fluxion grpc server @@ -405,6 +415,8 @@ FINAL NODE RESULT: [GRPCServer] Response podID:"fluence-scheduled-pod" nodelist:{nodeID:"kind-control-plane" tasks:1} jobID:1 ``` +
+ + I was trying to look for a way to see the assignment, and maybe we can see it here (this is the best I could come up with!) ```bash kubectl get events -o wide ``` @@ -429,13 +441,110 @@ pod/fluence-scheduled-pod spec.containers{fluence-scheduled-container} kubelet For the above, I found [this page](https://kubernetes.io/docs/tasks/extend-kubernetes/configure-multiple-schedulers/#enable-leader-election) very helpful. -Finally, note that we also have a more appropriate example with jobs under [examples/test_example](examples/test_example). It's slightly more sane because it uses Job, and jobs are expected to complete (whereas pods are not and will get into crash loop backoffs, etc). For example of how to programmatically interact with the job pods and check states, events, see the [test.sh](.github/test.sh) script. + +### Developer + +You can see [deploy](#deploy) for instructions on how to do a custom deployment. + +#### Organization + +If you are looking to develop: + + - [src](src): includes source code for fluence. You'll find logs for this code in the `sidecar` container of the fluence pod. + - [sig-scheduler-plugins](sig-scheduler-plugins): includes assets (manifests and Go files) that are intended to be added to the kubernetes-sigs/scheduler-plugins upstream repository before build. You'll find logs for this code in the `scheduler-plugins-scheduler` container of the pod. + - [apis](sig-scheduler-plugins/apis): customized PodGroup to define the status scheduled time in microseconds + - [manifests](sig-scheduler-plugins/manifests): manifests for helm and Kubernetes + - [pkg](sig-scheduler-plugins/pkg): the main fluence module to add to upstream + - [cmd](sig-scheduler-plugins/cmd): the main.go to replace in upstream + - *upstream*: the default directory the upstream repository is cloned to when you run a make build command. + +Note that cloning the repository and copying files to the correct locations are all automated through the [Makefile](Makefile). Additional commands provided include the following: + +```bash +# Only clone the repository into ./upstream +make clone + +# Update the cloned upstream with a git pull origin master +make update +``` + +It's recommended to update once in a while if you have an older clone locally, as there might be changes you are not accounting for. + +#### GRPC + +The fluence module uses GRPC to communicate with Flux, and these assets are stored in [src/fluence/fluxcli-grpc](src/fluence/fluxcli-grpc). +You should *only* update the [src/fluence/fluxcli-grpc/fluxcli.proto](src/fluence/fluxcli-grpc/fluxcli.proto) file, +and then run `make proto` from [src](src) to re-generate the other files: + +```bash +cd src + +# Install protoc tools to local bin +# make protoc +make proto +``` + +#### Workflow + +You should first do these on your own: + +1. Create the kind cluster (`kind create cluster --config ./examples/kind-config.yaml`) +2. Install the certificate manager. + +I was having trouble developing this easily because it's a lot of steps to build and load containers and change directories and uninstall/install the charts, so I put together a small script that does the following: + +1. Takes a registry of interest (probably doesn't matter since we are working locally, defaults to `ghcr.io/vsoch`) +2. builds all three images: the controller, sidecar, and fluence +3. loads them all into kind +4. changes directory to the charts +5. uninstalls the fluence helm instance (if installed) +6. 
installs it, targeted the images just built, and setting pullPolicy to never + +The last step ensures we use the images we loaded! You can basically just do: + +```bash +/bin/bash ./hack/quick-build-kind.sh +``` + +This sped up my development time immensely. If you want to manually do the steps, see that script for instructions. + +#### Logging + +For easier viewing of what fluence is doing (in the sig-scheduler-plugins) we have a file logger that can be seen in the container: + +```bash +$ kubectl exec -it fluence-68c4c586c6-nktdl -c scheduler-plugins-scheduler -- cat /tmp/fluence.log +``` + +##### kubectl plugin + +Note that if you want to enable extra endpoints for the fluence kubectl plugin and expose the GRPC as a service, you can do: + +```bash +helm install \ + --set scheduler.image=ghcr.io/vsoch/fluence:latest \ + --set scheduler.enableExternalService=true \ + --set controller.image=ghcr.io/vsoch/fluence-controller \ + --set scheduler.sidecarimage=ghcr.io/vsoch/fluence-sidecar:latest \ + fluence as-a-second-scheduler/ +``` + +For this setup if you are developing locally with kind, you will need to enable the ingress, as is done in [examples/kind-config.yaml](examples/kind-config.yaml). + +```bash +kind create cluster --config ./kind-config.yaml +``` + +#### Components + + - [FluxStateData](sig-scheduler-plugins/pkg/fluence/core/core.go): is given to the [framework.CycleState](https://github.com/kubernetes/kubernetes/blob/242b41b36a20032f99e8a059ca0a5d764105217b/pkg/scheduler/framework/cycle_state.go#L48) and serves as a vehicle to store a cache of node name assignment. ## Papers You can find details of Fluence architecture, implementation, experiments, and improvements to the Kubeflow MPI operator in our collaboration's papers: -``` + +```bibtex @INPROCEEDINGS{10029991, author={Milroy, Daniel J. and Misale, Claudia and Georgakoudis, Giorgis and Elengikal, Tonia and Sarkar, Abhik and Drocco, Maurizio and Patki, Tapasya and Yeom, Jae-Seung and Gutierrez, Carlos Eduardo Arango and Ahn, Dong H. and Park, Yoonho}, booktitle={2022 IEEE/ACM 4th International Workshop on Containers and New Orchestration Paradigms for Isolated Environments in HPC (CANOPIE-HPC)}, @@ -447,7 +556,7 @@ You can find details of Fluence architecture, implementation, experiments, and i doi={10.1109/CANOPIE-HPC56864.2022.00011} } ``` -``` +```bibtex @INPROCEEDINGS{9652595, author={Misale, Claudia and Drocco, Maurizio and Milroy, Daniel J. and Gutierrez, Carlos Eduardo Arango and Herbein, Stephen and Ahn, Dong H. and Park, Yoonho}, booktitle={2021 3rd International Workshop on Containers and New Orchestration Paradigms for Isolated Environments in HPC (CANOPIE-HPC)}, @@ -459,7 +568,7 @@ You can find details of Fluence architecture, implementation, experiments, and i doi={10.1109/CANOPIEHPC54579.2021.00006} } ``` -``` +```bibtex @inproceedings{10.1007/978-3-030-96498-6_18, address = {Cham}, author = {Misale, Claudia and Milroy, Daniel J. and Gutierrez, Carlos Eduardo Arango and Drocco, Maurizio and Herbein, Stephen and Ahn, Dong H. and Kaiser, Zvonko and Park, Yoonho}, diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..5884850 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,52 @@ +# Development Notes + +## Design + +![images/fluence-design.png](images/fluence-design.png) + +The picture above shows the fluence custom scheduler, which uses the Flux Framework component "fluxion" Go bindings in a custom Kubernetes scheduler. 
In the above, we see two pods running in a Kubernetes cluster that are intended for scheduling. The fluence pod (beige) has two containers, the fluence-sidecar and the fluence-scheduler. The controller pod has one container, the fluence-controller. Generally speaking, the containers are responsible for the following: + +- **fluence-controller**: watches for incoming pods and abstractions with pods (e.g., job) to create corresponding pod groups with names, sizes, and timestamps +- **fluence-scheduler**: provides the expected scheduling plugin functions (sort, pre-filter, etc.); the queue of pods essentially moves through here +- **fluence-sidecar**: the fluxion GRPC service that is queried by the fluence-scheduler to request an allocation for a pod group + +Both the controller and scheduler logic are bootstrapped from the same underlying kubernetes-sigs project, the scheduler-plugins, despite being in different pods (green). Scheduling works as follows. Note that it is [much more complicated than this](https://kubernetes.io/docs/concepts/scheduling-eviction/scheduling-framework/), but we explain the high level details. + +1. A user submits a job to their cluster with kubectl after installing fluence with helm charts. +2. The mutating webhook provided by the fluence-controller intercepts the job and adds labels. +3. The controller for PodGroup (an abstraction that holds a name, size, and time created to describe one or more pods) is watching for pod events. +4. When a pod is created (it shows up as Pending or another state in the cluster, and doesn't have to be scheduled yet), reconciliation starts. + - The reconciler ensures that the PodGroup is created and updated with the correct metadata and statuses (and cleaned up when the time comes) +5. As soon as the Pod is pending and the group exists, it starts going through the scheduling [queue and process](https://kubernetes.io/docs/concepts/scheduling-eviction/scheduling-framework/) and hits the fluence-scheduler endpoints + - The fluence-scheduler uses the PodGroup name to associate each individual pod with a group and start time, allowing them to be sorted together + - Starting times are based on microseconds to provide distinctness to group creation times, even when groups are created en masse + - Pods that don't yet have a group (if there is a delay in the reconciler making one) are pushed off from scheduling until they do. +6. Fluxion is queried via a GRPC endpoint, asking for a match for the job specification and an allocation -- "MatchAllocate" +7. The pods are then scheduled together, and the abstraction (e.g., Job) is created in the Kubernetes cluster + - When the top level abstraction cleans up and the PodGroup size is equal to the number of pods finished or failed, the PodGroup cleans up + +The result is (hopefully) a smooth and efficient scheduling experience. We are still working on it. + +## Thinking + +> Updated February 15, 2024 + +What I think might be happening (not always, but sometimes): + +- New pod group, no node list +- Fluence assigns nodes +- Nodes get assigned to pods 1:1 +- Pod group is deleted +- Some pod is sent back to the queue (kubelet rejects it, etc.) +- Pod group does not exist and is recreated, no node list +- Fluence asks again, but still has the first job. Not enough resources, so it asks forever. + +The above would not happen with a persistent pod group (one that isn't cleaned up until the deletion of the job), and wouldn't happen if there are just enough resources to account for the overlap. 
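+
+For reference, the ordering that step 5 of the design above relies on (and that this scenario interacts with) is the PodGroup creation time, kept at microsecond precision. The sketch below only illustrates that comparison with made-up types and field names; it is not the actual fluence plugin code:
+
+```go
+// Minimal sketch: order queued pods by the creation time of their pod group,
+// recorded in microseconds, with name-based tie-breaks for stability.
+// The groupInfo and queuedPod types are invented for this illustration.
+package main
+
+import (
+	"fmt"
+	"sort"
+	"time"
+)
+
+type groupInfo struct {
+	Name         string
+	CreatedMicro int64 // creation time in microseconds, as recorded by the reconciler
+}
+
+type queuedPod struct {
+	Name  string
+	Group groupInfo
+}
+
+// less returns true if pod a should come before pod b: earlier group first,
+// then group name, then pod name.
+func less(a, b queuedPod) bool {
+	if a.Group.CreatedMicro != b.Group.CreatedMicro {
+		return a.Group.CreatedMicro < b.Group.CreatedMicro
+	}
+	if a.Group.Name != b.Group.Name {
+		return a.Group.Name < b.Group.Name
+	}
+	return a.Name < b.Name
+}
+
+func main() {
+	now := time.Now().UnixMicro()
+	queue := []queuedPod{
+		{Name: "job-1-0", Group: groupInfo{Name: "job-1", CreatedMicro: now + 10}},
+		{Name: "job-0-1", Group: groupInfo{Name: "job-0", CreatedMicro: now}},
+		{Name: "job-0-0", Group: groupInfo{Name: "job-0", CreatedMicro: now}},
+	}
+	sort.Slice(queue, func(i, j int) bool { return less(queue[i], queue[j]) })
+	for _, p := range queue {
+		fmt.Println(p.Group.Name, p.Name)
+	}
+}
+```
+
+Note that a pod group that is deleted and recreated, as in the scenario above, gets a fresh creation timestamp. Some further open notes: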
+ +- Does Fluence allocate resources for itself? +- It would be nice to be able to inspect the state of Fluence. +- At some point we want to be using the TBA fluxion-go instead of the one off branch we currently have (but we don't need to be blocked for that) +- We should (I think) restore pod group (it's in the controller here) and have our own container built. That way we have total control over the custom resource, and we don't risk it going away. + - As a part of that, we can add add a mutating webhook that emulates what we are doing in fluence now to find the label, but instead we will create the CRD to hold state instead of trying to hold in the operator. +- It could then also be investigated that we can more flexibly change the size of the group, within some min/max size (also determined by labels?) to help with scheduling. +- Note that kueue has added a Pod Group object, so probably addresses the static case here. diff --git a/docs/images/fluence-design.png b/docs/images/fluence-design.png new file mode 100644 index 0000000..c35d9fe Binary files /dev/null and b/docs/images/fluence-design.png differ diff --git a/examples/indexed-jobs/job1.yaml b/examples/indexed-jobs/job1.yaml new file mode 100644 index 0000000..609e843 --- /dev/null +++ b/examples/indexed-jobs/job1.yaml @@ -0,0 +1,21 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: job-1 +spec: + completions: 5 + parallelism: 5 + completionMode: Indexed + template: + metadata: + labels: + fluence.pod-group: job-1 + fluence.group-size: "5" + spec: + schedulerName: fluence + containers: + - name: fluence-job + image: busybox + command: [sleep, "10"] + restartPolicy: Never + backoffLimit: 4 diff --git a/examples/indexed-jobs/job2.yaml b/examples/indexed-jobs/job2.yaml new file mode 100644 index 0000000..3d77660 --- /dev/null +++ b/examples/indexed-jobs/job2.yaml @@ -0,0 +1,21 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: job-2 +spec: + completions: 5 + parallelism: 5 + completionMode: Indexed + template: + metadata: + labels: + fluence.pod-group: job-2 + fluence.group-size: "5" + spec: + schedulerName: fluence + containers: + - name: fluence-job + image: busybox + command: [sleep, "10"] + restartPolicy: Never + backoffLimit: 4 diff --git a/examples/kind-config.yaml b/examples/kind-config.yaml new file mode 100644 index 0000000..2971483 --- /dev/null +++ b/examples/kind-config.yaml @@ -0,0 +1,26 @@ +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +nodes: +- role: control-plane + kubeadmConfigPatches: + - | + kind: InitConfiguration + nodeRegistration: + kubeletExtraArgs: + node-labels: "ingress-ready=true" + extraPortMappings: + - containerPort: 8080 + hostPort: 8080 + protocol: TCP + - containerPort: 4242 + hostPort: 4242 + protocol: TCP + - containerPort: 4243 + hostPort: 4243 + protocol: TCP +- role: worker +- role: worker +- role: worker +- role: worker +- role: worker +- role: worker \ No newline at end of file diff --git a/examples/kube_setup/taint_workers.sh b/examples/kube_setup/taint_workers.sh old mode 100755 new mode 100644 diff --git a/examples/pi/clean_pods.sh b/examples/pi/clean_pods.sh old mode 100755 new mode 100644 diff --git a/examples/pi/demo_failed_pod_cancellation.sh b/examples/pi/demo_failed_pod_cancellation.sh old mode 100755 new mode 100644 diff --git a/examples/pi/init_kind_cluster.sh b/examples/pi/init_kind_cluster.sh old mode 100755 new mode 100644 diff --git a/examples/pod-group-jobs/job1.yaml b/examples/pod-group-jobs/job1.yaml new file mode 100644 index 0000000..e0ebba0 --- /dev/null +++ 
b/examples/pod-group-jobs/job1.yaml @@ -0,0 +1,59 @@ +apiVersion: v1 +kind: Service +metadata: + name: s0 +spec: + clusterIP: None + selector: + job-name: job-0 +--- +apiVersion: batch/v1 +kind: Job +metadata: + # name will be derived based on iteration + name: job-0 +spec: + completions: 4 + parallelism: 4 + completionMode: Indexed + template: + metadata: + labels: + app: job-0 + spec: + subdomain: s0 + schedulerName: fluence + restartPolicy: Never + containers: + - name: example-workload + image: bash:latest + resources: + limits: + cpu: "3" + requests: + cpu: "3" + command: + - bash + - -c + - | + if [ $JOB_COMPLETION_INDEX -ne "0" ] + then + sleep infinity + fi + echo "START: $(date +%s)" + for i in 0 1 2 3 + do + gotStatus="-1" + wantStatus="0" + while [ $gotStatus -ne $wantStatus ] + do + ping -c 1 job-0-${i}.s0 > /dev/null 2>&1 + gotStatus=$? + if [ $gotStatus -ne $wantStatus ]; then + echo "Failed to ping pod job-0-${i}.s0, retrying in 1 second..." + sleep 1 + fi + done + echo "Successfully pinged pod: job-0-${i}.s0" + done + echo "DONE: $(date +%s)" \ No newline at end of file diff --git a/examples/pod-group-jobs/job2.yaml b/examples/pod-group-jobs/job2.yaml new file mode 100644 index 0000000..c39820b --- /dev/null +++ b/examples/pod-group-jobs/job2.yaml @@ -0,0 +1,59 @@ +apiVersion: v1 +kind: Service +metadata: + name: s1 +spec: + clusterIP: None + selector: + job-name: job-1 +--- +apiVersion: batch/v1 +kind: Job +metadata: + # name will be derived based on iteration + name: job-1 +spec: + completions: 4 + parallelism: 4 + completionMode: Indexed + template: + metadata: + labels: + app: job-1 + spec: + subdomain: s1 + schedulerName: fluence + restartPolicy: Never + containers: + - name: example-workload + image: bash:latest + resources: + limits: + cpu: "3" + requests: + cpu: "3" + command: + - bash + - -c + - | + if [ $JOB_COMPLETION_INDEX -ne "0" ] + then + sleep infinity + fi + echo "START: $(date +%s)" + for i in 0 1 2 3 + do + gotStatus="-1" + wantStatus="0" + while [ $gotStatus -ne $wantStatus ] + do + ping -c 1 job-0-${i}.s1 > /dev/null 2>&1 + gotStatus=$? + if [ $gotStatus -ne $wantStatus ]; then + echo "Failed to ping pod job-0-${i}.s1, retrying in 1 second..." 
+ sleep 1 + fi + done + echo "Successfully pinged pod: job-0-${i}.s1" + done + echo "DONE: $(date +%s)" \ No newline at end of file diff --git a/examples/pod-group/lammps/lammps2.yaml b/examples/pod-group/lammps/lammps2.yaml new file mode 100644 index 0000000..5a83c97 --- /dev/null +++ b/examples/pod-group/lammps/lammps2.yaml @@ -0,0 +1,19 @@ +apiVersion: flux-framework.org/v1alpha2 +kind: MiniCluster +metadata: + name: lammps2 +spec: + size: 2 + network: + headlessName: l2 + pod: + schedulerName: fluence + containers: + - image: ghcr.io/converged-computing/metric-lammps:latest@sha256:e24a1ba8954f5a0a7a0bd854cfc5ca7f82ca12607dc6ace38d838591b8deb8ed + workingDir: /opt/lammps/examples/reaxff/HNS + command: lmp -v x 1 -v y 1 -v z 1 -in in.reaxc.hns -nocite + resources: + limits: + cpu: 10 + requests: + cpu: 10 diff --git a/examples/pod-group/lammps/lammps4-2.yaml b/examples/pod-group/lammps/lammps4-2.yaml new file mode 100644 index 0000000..6b647bc --- /dev/null +++ b/examples/pod-group/lammps/lammps4-2.yaml @@ -0,0 +1,22 @@ +apiVersion: flux-framework.org/v1alpha2 +kind: MiniCluster +metadata: + name: lammps4-2 +spec: + size: 4 + network: + headlessName: l42 + pod: + schedulerName: fluence + labels: + fluence.pod-group: lammps4-2 + fluence.group-size: "4" + containers: + - image: ghcr.io/converged-computing/metric-lammps:latest@sha256:e24a1ba8954f5a0a7a0bd854cfc5ca7f82ca12607dc6ace38d838591b8deb8ed + workingDir: /opt/lammps/examples/reaxff/HNS + command: lmp -v x 1 -v y 1 -v z 1 -in in.reaxc.hns -nocite + resources: + limits: + cpu: 10 + requests: + cpu: 10 diff --git a/examples/pod-group/lammps/lammps4-3.yaml b/examples/pod-group/lammps/lammps4-3.yaml new file mode 100644 index 0000000..b182751 --- /dev/null +++ b/examples/pod-group/lammps/lammps4-3.yaml @@ -0,0 +1,22 @@ +apiVersion: flux-framework.org/v1alpha2 +kind: MiniCluster +metadata: + name: lammps4-3 +spec: + size: 4 + network: + headlessName: l43 + pod: + schedulerName: fluence + labels: + fluence.pod-group: lammps4-3 + fluence.group-size: "4" + containers: + - image: ghcr.io/converged-computing/metric-lammps:latest@sha256:e24a1ba8954f5a0a7a0bd854cfc5ca7f82ca12607dc6ace38d838591b8deb8ed + workingDir: /opt/lammps/examples/reaxff/HNS + command: lmp -v x 1 -v y 1 -v z 1 -in in.reaxc.hns -nocite + resources: + limits: + cpu: 10 + requests: + cpu: 10 diff --git a/examples/pod-group/lammps/lammps4.yaml b/examples/pod-group/lammps/lammps4.yaml new file mode 100644 index 0000000..9420902 --- /dev/null +++ b/examples/pod-group/lammps/lammps4.yaml @@ -0,0 +1,23 @@ +apiVersion: flux-framework.org/v1alpha2 +kind: MiniCluster +metadata: + name: lammps4 +spec: + size: 4 + network: + headlessName: l4 + pod: + schedulerName: fluence + labels: + app: lammps + fluence.pod-group: lammps4 + fluence.group-size: "4" + containers: + - image: ghcr.io/converged-computing/metric-lammps:latest@sha256:e24a1ba8954f5a0a7a0bd854cfc5ca7f82ca12607dc6ace38d838591b8deb8ed + workingDir: /opt/lammps/examples/reaxff/HNS + command: lmp -v x 1 -v y 1 -v z 1 -in in.reaxc.hns -nocite + resources: + limits: + cpu: 10 + requests: + cpu: 10 diff --git a/examples/pod-group/lammps/lammps5.yaml b/examples/pod-group/lammps/lammps5.yaml new file mode 100644 index 0000000..e85299f --- /dev/null +++ b/examples/pod-group/lammps/lammps5.yaml @@ -0,0 +1,22 @@ +apiVersion: flux-framework.org/v1alpha2 +kind: MiniCluster +metadata: + name: lammps5 +spec: + size: 5 + network: + headlessName: l5 + pod: + schedulerName: fluence + labels: + fluence.pod-group: lammps5 + fluence.group-size: "5" 
+ containers: + - image: ghcr.io/converged-computing/metric-lammps:latest@sha256:e24a1ba8954f5a0a7a0bd854cfc5ca7f82ca12607dc6ace38d838591b8deb8ed + workingDir: /opt/lammps/examples/reaxff/HNS + command: lmp -v x 1 -v y 1 -v z 1 -in in.reaxc.hns -nocite + resources: + limits: + cpu: 10 + requests: + cpu: 10 diff --git a/examples/pod-group/lammps/lammps6.yaml b/examples/pod-group/lammps/lammps6.yaml new file mode 100644 index 0000000..14ebae3 --- /dev/null +++ b/examples/pod-group/lammps/lammps6.yaml @@ -0,0 +1,22 @@ +apiVersion: flux-framework.org/v1alpha2 +kind: MiniCluster +metadata: + name: lammps6 +spec: + size: 6 + network: + headlessName: l6 + pod: + schedulerName: fluence + labels: + fluence.pod-group: lammps6 + fluence.group-size: "6" + containers: + - image: ghcr.io/converged-computing/metric-lammps:latest@sha256:e24a1ba8954f5a0a7a0bd854cfc5ca7f82ca12607dc6ace38d838591b8deb8ed + workingDir: /opt/lammps/examples/reaxff/HNS + command: lmp -v x 1 -v y 1 -v z 1 -in in.reaxc.hns -nocite + resources: + limits: + cpu: 10 + requests: + cpu: 10 diff --git a/examples/run_experiments/process_job_template.py b/examples/run_experiments/process_job_template.py old mode 100755 new mode 100644 diff --git a/examples/run_experiments/run_experiments.py b/examples/run_experiments/run_experiments.py old mode 100755 new mode 100644 diff --git a/examples/simple_example/fluence-deployment.yaml b/examples/simple_example/fluence-deployment.yaml new file mode 100644 index 0000000..9eb6cef --- /dev/null +++ b/examples/simple_example/fluence-deployment.yaml @@ -0,0 +1,19 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: example-deployment +spec: + selector: + matchLabels: + app: example-deployment + replicas: 3 + template: + metadata: + labels: + app: example-deployment + spec: + schedulerName: fluence + containers: + - name: example + image: rockylinux:9 + command: ["sleep", "infinity"] \ No newline at end of file diff --git a/examples/simple_example/fluence-replicaset.yaml b/examples/simple_example/fluence-replicaset.yaml new file mode 100644 index 0000000..f00e826 --- /dev/null +++ b/examples/simple_example/fluence-replicaset.yaml @@ -0,0 +1,21 @@ +apiVersion: apps/v1 +kind: ReplicaSet +metadata: + name: example-replicaset + labels: + app: example-replicaset +spec: + replicas: 3 + selector: + matchLabels: + app: example-replicaset + template: + metadata: + labels: + app: example-replicaset + spec: + schedulerName: fluence + containers: + - name: example + image: rockylinux:9 + command: ["sleep", "infinity"] \ No newline at end of file diff --git a/examples/simple_example/fluence-scheduler-pod.yaml b/examples/simple_example/fluence-scheduler-pod.yaml index a7cc126..b09c714 100644 --- a/examples/simple_example/fluence-scheduler-pod.yaml +++ b/examples/simple_example/fluence-scheduler-pod.yaml @@ -8,4 +8,4 @@ spec: schedulerName: fluence containers: - name: fluence-scheduled-container - image: registry.k8s.io/pause:2.0 \ No newline at end of file + image: registry.k8s.io/pause:2.0 diff --git a/examples/simple_example/fluence-statefulset.yaml b/examples/simple_example/fluence-statefulset.yaml new file mode 100644 index 0000000..80da82a --- /dev/null +++ b/examples/simple_example/fluence-statefulset.yaml @@ -0,0 +1,21 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: example-statefulset + labels: + app: example-statefulset +spec: + replicas: 3 + selector: + matchLabels: + app: example-statefulset + template: + metadata: + labels: + app: example-statefulset + spec: + schedulerName: fluence 
+ containers: + - name: example + image: rockylinux:9 + command: ["sleep", "infinity"] \ No newline at end of file diff --git a/examples/test_example/fluence-sized-job.yaml b/examples/test_example/fluence-sized-job.yaml new file mode 100644 index 0000000..d1e7556 --- /dev/null +++ b/examples/test_example/fluence-sized-job.yaml @@ -0,0 +1,16 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: fluence-sized-job +spec: + parallelism: 3 + completions: 3 + template: + spec: + schedulerName: fluence + containers: + - name: fluence-job + image: busybox + command: [sleep, "20"] + restartPolicy: Never + backoffLimit: 4 diff --git a/hack/quick-build-gke.sh b/hack/quick-build-gke.sh new file mode 100755 index 0000000..875360a --- /dev/null +++ b/hack/quick-build-gke.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +# Before running this, you should: +# 1. create the kind cluster (needs more than one node, fluence does not scheduler to the control plane) +# 2. Install cert-manager +# 3. Customize the script to point to your registry if you intend to push + +REGISTRY="${1:-ghcr.io/vsoch}" +HERE=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +ROOT=$(dirname ${HERE}) + +# Go to the script directory +cd ${ROOT} + +# These build each of the images. The sidecar is separate from the other two in src/ +make REGISTRY=${REGISTRY} SCHEDULER_IMAGE=fluence SIDECAR_IMAGE=fluence-sidecar CONTROLLER_IMAGE=fluence-controller + +# This is what it might look like to push +# docker push ghcr.io/vsoch/fluence-sidecar && docker push ghcr.io/vsoch/fluence-controller && docker push ghcr.io/vsoch/fluence:latest + +# We load into kind so we don't need to push/pull and use up internet data ;) +docker push ${REGISTRY}/fluence-sidecar:latest +docker push ${REGISTRY}/fluence-controller:latest +docker push ${REGISTRY}/fluence:latest + +# And then install using the charts. The pull policy ensures we use the loaded ones +cd ${ROOT}/upstream/manifests/install/charts +helm uninstall fluence || true +helm install \ + --set scheduler.image=${REGISTRY}/fluence:latest \ + --set controller.image=${REGISTRY}/fluence-controller:latest \ + --set scheduler.sidecarimage=${REGISTRY}/fluence-sidecar:latest \ + fluence as-a-second-scheduler/ diff --git a/hack/quick-build-kind.sh b/hack/quick-build-kind.sh new file mode 100755 index 0000000..23a5c87 --- /dev/null +++ b/hack/quick-build-kind.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# Before running this, you should: +# 1. create the kind cluster (needs more than one node, fluence does not scheduler to the control plane) +# 2. Install cert-manager +# 3. Customize the script to point to your registry if you intend to push + +REGISTRY="${1:-ghcr.io/vsoch}" +HERE=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +ROOT=$(dirname ${HERE}) + +# Go to the script directory +cd ${ROOT} + +# These build each of the images. The sidecar is separate from the other two in src/ +make REGISTRY=${REGISTRY} SCHEDULER_IMAGE=fluence SIDECAR_IMAGE=fluence-sidecar CONTROLLER_IMAGE=fluence-controller + +# This is what it might look like to push +# docker push ghcr.io/vsoch/fluence-sidecar && docker push ghcr.io/vsoch/fluence-controller && docker push ghcr.io/vsoch/fluence:latest + +# We load into kind so we don't need to push/pull and use up internet data ;) +kind load docker-image ${REGISTRY}/fluence-sidecar:latest +kind load docker-image ${REGISTRY}/fluence-controller:latest +kind load docker-image ${REGISTRY}/fluence:latest + +# And then install using the charts. 
The pull policy ensures we use the loaded ones +cd ${ROOT}/upstream/manifests/install/charts +helm uninstall fluence || true +helm install \ + --set scheduler.image=${REGISTRY}/fluence:latest \ + --set scheduler.sidecarPullPolicy=Never \ + --set scheduler.pullPolicy=Never \ + --set controller.pullPolicy=Never \ + --set controller.image=${REGISTRY}/fluence-controller:latest \ + --set scheduler.sidecarimage=${REGISTRY}/fluence-sidecar:latest \ + fluence as-a-second-scheduler/ diff --git a/hack/quick-build.sh b/hack/quick-build.sh new file mode 100755 index 0000000..c9b8eff --- /dev/null +++ b/hack/quick-build.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +# Before running this, you should: +# 1. create the kind cluster (needs more than one node, fluence does not scheduler to the control plane) +# 2. Install cert-manager +# 3. Customize the script to point to your registry if you intend to push + +REGISTRY="${1:-ghcr.io/vsoch}" +HERE=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +ROOT=$(dirname ${HERE}) + +# Go to the script directory +cd ${ROOT} + +# These build each of the images. The sidecar is separate from the other two in src/ +make REGISTRY=${REGISTRY} SCHEDULER_IMAGE=fluence SIDECAR_IMAGE=fluence-sidecar CONTROLLER_IMAGE=fluence-controller \ No newline at end of file diff --git a/sig-scheduler-plugins/apis/scheduling/v1alpha1/podgroup_webhook.go b/sig-scheduler-plugins/apis/scheduling/v1alpha1/podgroup_webhook.go new file mode 100644 index 0000000..7266d85 --- /dev/null +++ b/sig-scheduler-plugins/apis/scheduling/v1alpha1/podgroup_webhook.go @@ -0,0 +1,336 @@ +/* +Copyright 2024 Lawrence Livermore National Security, LLC + +(c.f. AUTHORS, NOTICE.LLNS, COPYING) +SPDX-License-Identifier: MIT +*/ + +// This file is not used, but maintained as the original addition of an OrasCache webhook + +package v1alpha1 + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + + appsv1 "k8s.io/api/apps/v1" + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + runtime "k8s.io/apimachinery/pkg/runtime" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/controller-runtime/pkg/webhook/admission" + "sigs.k8s.io/scheduler-plugins/pkg/fluence/labels" +) + +var ( + logger = ctrl.Log.WithName("setup") +) + +// IMPORTANT: if you use the controller-runtime builder, it will derive this name automatically from the gvk (kind, version, etc. 
so find the actual created path in the logs) +// kubectl describe mutatingwebhookconfigurations.admissionregistration.k8s.io +// It will also only allow you to describe one object type with For() +// This is disabled so we manually manage it - multiple types to a list did not work: config/webhook/manifests.yaml +////kubebuilder:webhook:path=/mutate-v1-sidecar,mutating=true,failurePolicy=fail,sideEffects=None,groups=core;batch,resources=pods;jobs,verbs=create,versions=v1,name=morascache.kb.io,admissionReviewVersions=v1 + +// NewMutatingWebhook allows us to keep the sidecarInjector private +// If it's public it's exported and kubebuilder tries to add to zz_generated_deepcopy +// and you get all kinds of terrible errors about admission.Decoder missing DeepCopyInto +func NewMutatingWebhook(mgr manager.Manager) *fluenceWatcher { + return &fluenceWatcher{decoder: admission.NewDecoder(mgr.GetScheme())} +} + +// mutate-v1-fluence +type fluenceWatcher struct { + decoder *admission.Decoder +} + +// Handle is the main handler for the webhook, which is looking for jobs and pods (in that order) +// If a job comes in (with a pod template) first, we add the labels there first (and they will +// not be added again). +func (hook *fluenceWatcher) Handle(ctx context.Context, req admission.Request) admission.Response { + + logger.Info("Running webhook handle, determining pod wrapper abstraction...") + + job := &batchv1.Job{} + err := hook.decoder.Decode(req, job) + if err == nil { + err = hook.EnsureGroupOnJob(job) + if err != nil { + logger.Error(err, "Issue adding PodGroup to Job") + return admission.Errored(http.StatusBadRequest, err) + } + marshalledJob, err := json.Marshal(job) + if err != nil { + logger.Error(err, "Marshalling job error.") + return admission.Errored(http.StatusInternalServerError, err) + } + logger.Info("Admission job success.") + return admission.PatchResponseFromRaw(req.Object.Raw, marshalledJob) + } + + pod := &corev1.Pod{} + err = hook.decoder.Decode(req, pod) + if err == nil { + err = hook.EnsureGroup(pod) + if err != nil { + logger.Error(err, "Issue adding PodGroup to Pod") + return admission.Errored(http.StatusBadRequest, err) + } + marshalledPod, err := json.Marshal(pod) + if err != nil { + logger.Error(err, "Marshalling pod error") + return admission.Errored(http.StatusInternalServerError, err) + } + logger.Info("Admission pod success") + return admission.PatchResponseFromRaw(req.Object.Raw, marshalledPod) + } + + set := &appsv1.StatefulSet{} + err = hook.decoder.Decode(req, set) + if err == nil { + err = hook.EnsureGroupStatefulSet(set) + if err != nil { + logger.Error(err, "Issue adding PodGroup to StatefulSet") + return admission.Errored(http.StatusBadRequest, err) + } + marshalledSet, err := json.Marshal(set) + if err != nil { + logger.Error(err, "Marshalling StatefulSet error") + return admission.Errored(http.StatusInternalServerError, err) + } + logger.Info("Admission StatefulSet success") + return admission.PatchResponseFromRaw(req.Object.Raw, marshalledSet) + } + + deployment := &appsv1.Deployment{} + err = hook.decoder.Decode(req, deployment) + if err == nil { + err = hook.EnsureGroupDeployment(deployment) + if err != nil { + logger.Error(err, "Issue adding PodGroup to Deployment") + return admission.Errored(http.StatusBadRequest, err) + } + marshalledD, err := json.Marshal(deployment) + if err != nil { + logger.Error(err, "Marshalling Deployment error") + return admission.Errored(http.StatusInternalServerError, err) + } + logger.Info("Admission Deployment success") + 
return admission.PatchResponseFromRaw(req.Object.Raw, marshalledD) + } + + rset := &appsv1.ReplicaSet{} + err = hook.decoder.Decode(req, rset) + if err == nil { + err = hook.EnsureGroupReplicaSet(rset) + if err != nil { + logger.Error(err, "Issue adding PodGroup to ReplicaSet") + return admission.Errored(http.StatusBadRequest, err) + } + marshalledSet, err := json.Marshal(rset) + if err != nil { + logger.Error(err, "Marshalling StatefulSet error") + return admission.Errored(http.StatusInternalServerError, err) + } + logger.Info("Admission StatefulSet success") + return admission.PatchResponseFromRaw(req.Object.Raw, marshalledSet) + } + + // We should not get down here + return admission.Allowed("Object not known, this webhook does not validate beyond those.") + +} + +// Default is the expected entrypoint for a webhook... +func (hook *fluenceWatcher) Default(ctx context.Context, obj runtime.Object) error { + + switch obj.(type) { + case *batchv1.Job: + job := obj.(*batchv1.Job) + return hook.EnsureGroupOnJob(job) + + case *corev1.Pod: + pod := obj.(*corev1.Pod) + return hook.EnsureGroup(pod) + + case *appsv1.StatefulSet: + set := obj.(*appsv1.StatefulSet) + return hook.EnsureGroupStatefulSet(set) + + case *appsv1.Deployment: + deployment := obj.(*appsv1.Deployment) + return hook.EnsureGroupDeployment(deployment) + + case *appsv1.ReplicaSet: + set := obj.(*appsv1.ReplicaSet) + return hook.EnsureGroupReplicaSet(set) + + default: + // no match + } + return nil +} + +// EnsureGroup adds pod group label and size if not present +// This ensures that every pod passing through is part of a group. +// Note that we need to do similar for Job. +// A pod without a job wrapper, and without metadata is a group +// of size 1. +func (hook *fluenceWatcher) EnsureGroup(pod *corev1.Pod) error { + + // Add labels if we don't have anything. Everything is a group! + if pod.Labels == nil { + pod.Labels = map[string]string{} + } + + // Do we have a group name? + groupName, ok := pod.Labels[labels.PodGroupLabel] + + // If we don't have a fluence group, create one under fluence namespace + if !ok { + groupName = fmt.Sprintf("fluence-group-%s", pod.Name) + pod.Labels[labels.PodGroupLabel] = groupName + } + + // Do we have a group size? This will be parsed as a string, likely + groupSize, ok := pod.Labels[labels.PodGroupSizeLabel] + if !ok { + groupSize = "1" + pod.Labels[labels.PodGroupSizeLabel] = groupSize + } + return nil +} + +// getJobLabel takes a label name and default and returns the value +// We look on both the job and underlying pod spec template +func getJobLabel(job *batchv1.Job, labelName, defaultLabel string) string { + + value, ok := job.Labels[labelName] + if !ok { + value, ok = job.Spec.Template.ObjectMeta.Labels[labelName] + if !ok { + value = defaultLabel + } + } + return value +} + +// EnsureGroupOnJob looks for fluence labels (size and name) on both the job +// and the pod template. We ultimately put on the pod, the lowest level unit. +// Since we have the size of the job (parallelism) we can use that for the size +func (a *fluenceWatcher) EnsureGroupOnJob(job *batchv1.Job) error { + + // Be forgiving - allow the person to specify it on the job directly or on the Podtemplate + // We will ultimately put the metadata on the Pod. 
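+	// A rough sketch of the lookup order (label keys are the constants from the
+	// fluence labels package, not literal strings): an explicit group name in
+	// job.Labels[labels.PodGroupLabel] or on the pod template wins; otherwise the
+	// default "fluence-group-<namespace>-<name>" computed below is used. The same
+	// pattern applies to the size label, falling back to the Job's Parallelism.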
+ if job.Spec.Template.ObjectMeta.Labels == nil { + job.Spec.Template.ObjectMeta.Labels = map[string]string{} + } + if job.Labels == nil { + job.Labels = map[string]string{} + } + + /// First get the name for the pod group (also setting on the pod template) + defaultName := fmt.Sprintf("fluence-group-%s-%s", job.Namespace, job.Name) + groupName := getJobLabel(job, labels.PodGroupLabel, defaultName) + + // Wherever we find it, make sure the pod group name is on the pod spec template + job.Spec.Template.ObjectMeta.Labels[labels.PodGroupLabel] = groupName + + // Now do the same for the size, but the size is the size of the job + jobSize := *job.Spec.Parallelism + if jobSize == int32(0) { + jobSize = int32(1) + } + labelSize := fmt.Sprintf("%d", jobSize) + groupSize := getJobLabel(job, labels.PodGroupSizeLabel, labelSize) + job.Spec.Template.ObjectMeta.Labels[labels.PodGroupSizeLabel] = groupSize + return nil +} + +// EnsureGroupStatefulSet creates a PodGroup for a StatefulSet +func (hook *fluenceWatcher) EnsureGroupStatefulSet(set *appsv1.StatefulSet) error { + + // StatefulSet requires on top level explicitly + if set.Labels == nil { + set.Labels = map[string]string{} + } + defaultName := fmt.Sprintf("fluence-group-%s-%s", set.Namespace, set.Name) + groupName, ok := set.Labels[labels.PodGroupLabel] + if !ok { + groupName = defaultName + } + set.Spec.Template.ObjectMeta.Labels[labels.PodGroupLabel] = groupName + + // Now do the same for the size, but the size is the size of the job + size := *set.Spec.Replicas + if size == int32(0) { + size = int32(1) + } + labelSize := fmt.Sprintf("%d", size) + groupSize, ok := set.Labels[labels.PodGroupSizeLabel] + if !ok { + groupSize = labelSize + } + set.Spec.Template.ObjectMeta.Labels[labels.PodGroupSizeLabel] = groupSize + return nil +} + +// EnsureGroupStatefulSet creates a PodGroup for a StatefulSet +func (a *fluenceWatcher) EnsureGroupReplicaSet(set *appsv1.ReplicaSet) error { + + // StatefulSet requires on top level explicitly + if set.Labels == nil { + set.Labels = map[string]string{} + } + defaultName := fmt.Sprintf("fluence-group-%s-%s", set.Namespace, set.Name) + groupName, ok := set.Labels[labels.PodGroupLabel] + if !ok { + groupName = defaultName + } + set.Spec.Template.ObjectMeta.Labels[labels.PodGroupLabel] = groupName + + // Now do the same for the size, but the size is the size of the job + size := *set.Spec.Replicas + if size == int32(0) { + size = int32(1) + } + labelSize := fmt.Sprintf("%d", size) + groupSize, ok := set.Labels[labels.PodGroupSizeLabel] + if !ok { + groupSize = labelSize + } + set.Spec.Template.ObjectMeta.Labels[labels.PodGroupSizeLabel] = groupSize + return nil +} + +// EnsureGroupDeployment creates a PodGroup for a Deployment +// This is redundant, can refactor later +func (a *fluenceWatcher) EnsureGroupDeployment(d *appsv1.Deployment) error { + + // StatefulSet requires on top level explicitly + if d.Labels == nil { + d.Labels = map[string]string{} + } + defaultName := fmt.Sprintf("fluence-group-%s-%s", d.Namespace, d.Name) + groupName, ok := d.Labels[labels.PodGroupLabel] + if !ok { + groupName = defaultName + } + d.Spec.Template.ObjectMeta.Labels[labels.PodGroupLabel] = groupName + + // Now do the same for the size, but the size is the size of the job + size := *d.Spec.Replicas + if size == int32(0) { + size = int32(1) + } + labelSize := fmt.Sprintf("%d", size) + groupSize, ok := d.Labels[labels.PodGroupSizeLabel] + if !ok { + groupSize = labelSize + } + d.Spec.Template.ObjectMeta.Labels[labels.PodGroupSizeLabel] = 
groupSize + return nil +} diff --git a/sig-scheduler-plugins/apis/scheduling/v1alpha1/types.go b/sig-scheduler-plugins/apis/scheduling/v1alpha1/types.go new file mode 100644 index 0000000..fca7854 --- /dev/null +++ b/sig-scheduler-plugins/apis/scheduling/v1alpha1/types.go @@ -0,0 +1,194 @@ +/* +Copyright 2020 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha1 + +import ( + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/scheduler-plugins/apis/scheduling" +) + +// ElasticQuota sets elastic quota restrictions per namespace +// +genclient +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object +// +kubebuilder:object:root=true +// +kubebuilder:resource:shortName={eq,eqs} +// +kubebuilder:subresource:status +// +kubebuilder:metadata:annotations="api-approved.kubernetes.io=https://github.com/kubernetes-sigs/scheduler-plugins/pull/52" +type ElasticQuota struct { + metav1.TypeMeta `json:",inline"` + + // Standard object's metadata. + // +optional + metav1.ObjectMeta `json:"metadata,omitempty" protobuf:"bytes,1,opt,name=metadata"` + + // ElasticQuotaSpec defines the Min and Max for Quota. + // +optional + Spec ElasticQuotaSpec `json:"spec,omitempty" protobuf:"bytes,2,opt,name=spec"` + + // ElasticQuotaStatus defines the observed use. + // +optional + Status ElasticQuotaStatus `json:"status,omitempty" protobuf:"bytes,3,opt,name=status"` +} + +// ElasticQuotaSpec defines the Min and Max for Quota. +type ElasticQuotaSpec struct { + // Min is the set of desired guaranteed limits for each named resource. + // +optional + Min v1.ResourceList `json:"min,omitempty" protobuf:"bytes,1,rep,name=min, casttype=ResourceList,castkey=ResourceName"` + + // Max is the set of desired max limits for each named resource. The usage of max is based on the resource configurations of + // successfully scheduled pods. + // +optional + Max v1.ResourceList `json:"max,omitempty" protobuf:"bytes,2,rep,name=max, casttype=ResourceList,castkey=ResourceName"` +} + +// ElasticQuotaStatus defines the observed use. +type ElasticQuotaStatus struct { + // Used is the current observed total usage of the resource in the namespace. + // +optional + Used v1.ResourceList `json:"used,omitempty" protobuf:"bytes,1,rep,name=used,casttype=ResourceList,castkey=ResourceName"` +} + +// +kubebuilder:object:root=true +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object + +// ElasticQuotaList is a list of ElasticQuota items. +type ElasticQuotaList struct { + metav1.TypeMeta `json:",inline"` + + // Standard list metadata. + // +optional + metav1.ListMeta `json:"metadata,omitempty" protobuf:"bytes,1,opt,name=metadata"` + + // Items is a list of ElasticQuota objects. + Items []ElasticQuota `json:"items" protobuf:"bytes,2,rep,name=items"` +} + +// PodGroupPhase is the phase of a pod group at the current time. +type PodGroupPhase string + +// These are the valid phase of podGroups. 
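+// As implemented by the fluence PodGroup controller, a group normally moves
+// Pending -> Scheduling -> Running and terminates in Finished or Failed, based
+// on the counts of running/succeeded/failed pods relative to spec.minMember.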
+const ( + // PodGroupPending means the pod group has been accepted by the system, but scheduler can not allocate + // enough resources to it. + PodGroupPending PodGroupPhase = "Pending" + + // PodGroupRunning means the `spec.minMember` pods of the pod group are in running phase. + PodGroupRunning PodGroupPhase = "Running" + + // PodGroupScheduling means the number of pods scheduled is bigger than `spec.minMember` + // but the number of running pods has not reached the `spec.minMember` pods of PodGroups. + PodGroupScheduling PodGroupPhase = "Scheduling" + + // PodGroupUnknown means a part of `spec.minMember` pods of the pod group have been scheduled but the others can not + // be scheduled due to, e.g. not enough resource; scheduler will wait for related controllers to recover them. + PodGroupUnknown PodGroupPhase = "Unknown" + + // PodGroupFinished means the `spec.minMember` pods of the pod group are successfully finished. + PodGroupFinished PodGroupPhase = "Finished" + + // PodGroupFailed means at least one of `spec.minMember` pods have failed. + PodGroupFailed PodGroupPhase = "Failed" + + // PodGroupLabel is the default label of coscheduling + PodGroupLabel = scheduling.GroupName + "/pod-group" +) + +// PodGroup is a collection of Pod; used for batch workload. +// +genclient +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object +// +kubebuilder:object:root=true +// +kubebuilder:resource:shortName={pg,pgs} +// +kubebuilder:subresource:status +// +kubebuilder:metadata:annotations="api-approved.kubernetes.io=https://github.com/kubernetes-sigs/scheduler-plugins/pull/50" +type PodGroup struct { + metav1.TypeMeta `json:",inline"` + // Standard object's metadata. + // +optional + metav1.ObjectMeta `json:"metadata,omitempty"` + + // Specification of the desired behavior of the pod group. + // +optional + Spec PodGroupSpec `json:"spec,omitempty"` + + // Status represents the current information about a pod group. + // This data may not be up to date. + // +optional + Status PodGroupStatus `json:"status,omitempty"` +} + +// PodGroupSpec represents the template of a pod group. +type PodGroupSpec struct { + // MinMember defines the minimal number of members/tasks to run the pod group; + // if there's not enough resources to start all tasks, the scheduler + // will not start any. + MinMember int32 `json:"minMember,omitempty"` + + // MinResources defines the minimal resource of members/tasks to run the pod group; + // if there's not enough resources to start all tasks, the scheduler + // will not start any. + MinResources v1.ResourceList `json:"minResources,omitempty"` + + // ScheduleTimeoutSeconds defines the maximal time of members/tasks to wait before run the pod group; + ScheduleTimeoutSeconds *int32 `json:"scheduleTimeoutSeconds,omitempty"` +} + +// PodGroupStatus represents the current state of a pod group. +type PodGroupStatus struct { + // Current phase of PodGroup. + Phase PodGroupPhase `json:"phase,omitempty"` + + // OccupiedBy marks the workload (e.g., deployment, statefulset) UID that occupy the podgroup. + // It is empty if not initialized. + OccupiedBy string `json:"occupiedBy,omitempty"` + + // The number of actively running pods. + // +optional + Running int32 `json:"running,omitempty"` + + // The number of pods which reached phase Succeeded. + // +optional + Succeeded int32 `json:"succeeded,omitempty"` + + // The number of pods which reached phase Failed. 
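+	// (The controller marks the whole group Failed once failed > 0 and
+	// failed+running+succeeded >= minMember; see updateStatus in podgroup_controller.go.)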
+ // +optional + Failed int32 `json:"failed,omitempty"` + + // CreationTime is intended to mock the object CreationTime, + // but set by us to be MicroTime instead of Time. + // +optional + CreationTime metav1.MicroTime `json:"creationTime,omitempty"` + + // ScheduleStartTime of the group is when we want to start counting + // "at time N plus 48 hours, this is when we deem time waited is too long" + // +optional + ScheduleStartTime metav1.MicroTime `json:"scheduleStartTime,omitempty"` +} + +// +kubebuilder:object:root=true + +// PodGroupList is a collection of pod groups. +type PodGroupList struct { + metav1.TypeMeta `json:",inline"` + // Standard list metadata + // +optional + metav1.ListMeta `json:"metadata,omitempty"` + + // Items is the list of PodGroup + Items []PodGroup `json:"items"` +} diff --git a/sig-scheduler-plugins/cmd/controller/app/server.go b/sig-scheduler-plugins/cmd/controller/app/server.go new file mode 100644 index 0000000..c10968e --- /dev/null +++ b/sig-scheduler-plugins/cmd/controller/app/server.go @@ -0,0 +1,104 @@ +/* +Copyright 2020 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package app + +import ( + "k8s.io/apimachinery/pkg/runtime" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "k8s.io/klog/v2/klogr" + + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/healthz" + + "sigs.k8s.io/controller-runtime/pkg/webhook" + + metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" + api "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1" + "sigs.k8s.io/scheduler-plugins/pkg/controllers" +) + +var ( + scheme = runtime.NewScheme() + setupLog = ctrl.Log.WithName("setup") +) + +func init() { + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + + utilruntime.Must(api.AddToScheme(scheme)) +} + +func Run(s *ServerRunOptions) error { + config := ctrl.GetConfigOrDie() + config.QPS = float32(s.ApiServerQPS) + config.Burst = s.ApiServerBurst + + // Controller Runtime Controllers + ctrl.SetLogger(klogr.New()) + mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ + Scheme: scheme, + Metrics: metricsserver.Options{ + BindAddress: s.MetricsAddr, + }, + HealthProbeBindAddress: s.ProbeAddr, + LeaderElection: s.EnableLeaderElection, + LeaderElectionID: "sched-plugins-controllers", + LeaderElectionNamespace: "kube-system", + }) + if err != nil { + setupLog.Error(err, "unable to start manager") + return err + } + + if err = (&controllers.PodGroupReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Workers: s.Workers, + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "PodGroup") + return err + } + + mgr.GetWebhookServer().Register("/mutate-v1-fluence", &webhook.Admission{ + Handler: api.NewMutatingWebhook(mgr), + }) + + if err = (&controllers.ElasticQuotaReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Workers: s.Workers, + }).SetupWithManager(mgr); err != nil { + 
setupLog.Error(err, "unable to create controller", "controller", "ElasticQuota") + return err + } + + if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { + setupLog.Error(err, "unable to set up health check") + return err + } + if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil { + setupLog.Error(err, "unable to set up ready check") + return err + } + + if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { + setupLog.Error(err, "unable to start manager") + return err + } + return nil +} diff --git a/sig-scheduler-plugins/cmd/scheduler/main.go b/sig-scheduler-plugins/cmd/scheduler/main.go index d9a580a..4d98d52 100644 --- a/sig-scheduler-plugins/cmd/scheduler/main.go +++ b/sig-scheduler-plugins/cmd/scheduler/main.go @@ -22,21 +22,25 @@ import ( "k8s.io/component-base/cli" _ "k8s.io/component-base/metrics/prometheus/clientgo" // for rest client metric registration _ "k8s.io/component-base/metrics/prometheus/version" // for version metric registration + + // Uncomment here for a local one here we use to debug + // This was a clone from kubernetes/kubernetes -> cmd/app + //"sigs.k8s.io/scheduler-plugins/cmd/app" "k8s.io/kubernetes/cmd/kube-scheduler/app" "sigs.k8s.io/scheduler-plugins/pkg/capacityscheduling" "sigs.k8s.io/scheduler-plugins/pkg/coscheduling" + "sigs.k8s.io/scheduler-plugins/pkg/fluence" "sigs.k8s.io/scheduler-plugins/pkg/networkaware/networkoverhead" "sigs.k8s.io/scheduler-plugins/pkg/networkaware/topologicalsort" - "sigs.k8s.io/scheduler-plugins/pkg/noderesources" "sigs.k8s.io/scheduler-plugins/pkg/noderesourcetopology" - "sigs.k8s.io/scheduler-plugins/pkg/podstate" "sigs.k8s.io/scheduler-plugins/pkg/preemptiontoleration" - "sigs.k8s.io/scheduler-plugins/pkg/qos" "sigs.k8s.io/scheduler-plugins/pkg/trimaran/loadvariationriskbalancing" - "sigs.k8s.io/scheduler-plugins/pkg/trimaran/lowriskovercommitment" "sigs.k8s.io/scheduler-plugins/pkg/trimaran/targetloadpacking" - "sigs.k8s.io/scheduler-plugins/pkg/fluence" + + "sigs.k8s.io/scheduler-plugins/pkg/podstate" + "sigs.k8s.io/scheduler-plugins/pkg/qos" + // Ensure scheme package is initialized. _ "sigs.k8s.io/scheduler-plugins/apis/config/scheme" ) @@ -51,13 +55,9 @@ func main() { app.WithPlugin(loadvariationriskbalancing.Name, loadvariationriskbalancing.New), app.WithPlugin(networkoverhead.Name, networkoverhead.New), app.WithPlugin(topologicalsort.Name, topologicalsort.New), - app.WithPlugin(noderesources.AllocatableName, noderesources.NewAllocatable), app.WithPlugin(noderesourcetopology.Name, noderesourcetopology.New), app.WithPlugin(preemptiontoleration.Name, preemptiontoleration.New), app.WithPlugin(targetloadpacking.Name, targetloadpacking.New), - app.WithPlugin(lowriskovercommitment.Name, lowriskovercommitment.New), - // Sample plugins below. 
- // app.WithPlugin(crossnodepreemption.Name, crossnodepreemption.New), app.WithPlugin(podstate.Name, podstate.New), app.WithPlugin(qos.Name, qos.New), app.WithPlugin(fluence.Name, fluence.New), diff --git a/sig-scheduler-plugins/manifests/fluence/configmap.yaml b/sig-scheduler-plugins/manifests/fluence/configmap.yaml deleted file mode 100644 index 21ffacc..0000000 --- a/sig-scheduler-plugins/manifests/fluence/configmap.yaml +++ /dev/null @@ -1,23 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: scheduler-config - namespace: scheduler-plugins -data: - scheduler-config.yaml: | - apiVersion: kubescheduler.config.k8s.io/v1beta3 - kind: KubeSchedulerConfiguration - leaderElection: - leaderElect: false - profiles: - - schedulerName: fluence - plugins: - preFilter: - enabled: - - name: Fluence - filter: - enabled: - - name: Fluence - score: - disabled: - - name: '*' \ No newline at end of file diff --git a/sig-scheduler-plugins/manifests/fluence/deploy.yaml b/sig-scheduler-plugins/manifests/fluence/deploy.yaml deleted file mode 100644 index 92e39b0..0000000 --- a/sig-scheduler-plugins/manifests/fluence/deploy.yaml +++ /dev/null @@ -1,45 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: fluence - namespace: scheduler-plugins -spec: - replicas: 1 - selector: - matchLabels: - component: scheduler - template: - metadata: - labels: - component: scheduler - spec: - serviceAccountName: scheduler-plugins - containers: - - image: quay.io/cmisale1/fluence-sidecar:latest - imagePullPolicy: Always - command: - - /go/src/fluence/bin/server - - --policy=lonode - name: fluence-sidecar - - image: quay.io/cmisale1/fluence:dev - imagePullPolicy: Always - command: - - /bin/kube-scheduler - - --config=/etc/kubernetes/scheduler-config.yaml - - -v=9 - name: fluence - resources: - requests: - cpu: '0.1' - securityContext: - privileged: false - volumeMounts: - - mountPath: /etc/kubernetes - name: scheduler-config - hostNetwork: false - hostPID: false - volumes: - - name: scheduler-config - configMap: - name: scheduler-config - diff --git a/sig-scheduler-plugins/manifests/fluence/rbac.yaml b/sig-scheduler-plugins/manifests/fluence/rbac.yaml deleted file mode 100644 index 3416e18..0000000 --- a/sig-scheduler-plugins/manifests/fluence/rbac.yaml +++ /dev/null @@ -1,82 +0,0 @@ -kind: ClusterRole -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: scheduler-plugins -rules: -- apiGroups: [""] - resources: ["namespaces", "configmaps"] - verbs: ["get", "list", "watch"] -- apiGroups: ["", "events.k8s.io"] - resources: ["events"] - verbs: ["create", "patch", "update"] -- apiGroups: ["coordination.k8s.io"] - resources: ["leases"] - verbs: ["create"] -- apiGroups: ["coordination.k8s.io"] - resourceNames: ["kube-scheduler"] - resources: ["leases"] - verbs: ["get", "update"] -- apiGroups: [""] - resources: ["endpoints"] - verbs: ["create"] -- apiGroups: [""] - resourceNames: ["kube-scheduler"] - resources: ["endpoints"] - verbs: ["get", "update"] -- apiGroups: [""] - resources: ["nodes"] - verbs: ["get", "list", "watch", "patch"] -- apiGroups: [""] - resources: ["pods"] - verbs: ["delete", "get", "list", "watch", "update"] -- apiGroups: [""] - resources: ["bindings", "pods/binding"] - verbs: ["create"] -- apiGroups: [""] - resources: ["pods/status"] - verbs: ["patch", "update"] -- apiGroups: [""] - resources: ["replicationcontrollers", "services"] - verbs: ["get", "list", "watch"] -- apiGroups: ["apps", "extensions"] - resources: ["replicasets"] - verbs: ["get", "list", "watch"] -- apiGroups: 
["apps"] - resources: ["statefulsets"] - verbs: ["get", "list", "watch"] -- apiGroups: ["policy"] - resources: ["poddisruptionbudgets"] - verbs: ["get", "list", "watch"] -- apiGroups: [""] - resources: ["persistentvolumeclaims", "persistentvolumes"] - verbs: ["get", "list", "watch", "patch", "update"] -- apiGroups: ["authentication.k8s.io"] - resources: ["tokenreviews"] - verbs: ["create"] -- apiGroups: ["authorization.k8s.io"] - resources: ["subjectaccessreviews"] - verbs: ["create"] -- apiGroups: ["storage.k8s.io"] - resources: ["csinodes", "storageclasses" , "csidrivers" , "csistoragecapacities"] - verbs: ["get", "list", "watch"] -- apiGroups: ["topology.node.k8s.io"] - resources: ["noderesourcetopologies"] - verbs: ["*"] -# resources need to be updated with the scheduler plugins used -- apiGroups: ["scheduling.sigs.k8s.io"] - resources: ["podgroups", "elasticquotas"] - verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] ---- -kind: ClusterRoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: scheduler-plugins - namespace: scheduler-plugins -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: scheduler-plugins -subjects: - - kind: ServiceAccount - name: scheduler-plugins - namespace: scheduler-plugins diff --git a/sig-scheduler-plugins/manifests/fluence/scheduling.sigs.k8s.io_podgroups.yaml b/sig-scheduler-plugins/manifests/fluence/scheduling.sigs.k8s.io_podgroups.yaml deleted file mode 120000 index 7f8408e..0000000 --- a/sig-scheduler-plugins/manifests/fluence/scheduling.sigs.k8s.io_podgroups.yaml +++ /dev/null @@ -1 +0,0 @@ -../coscheduling/crd.yaml \ No newline at end of file diff --git a/sig-scheduler-plugins/manifests/fluence/serviceaccount.yaml b/sig-scheduler-plugins/manifests/fluence/serviceaccount.yaml deleted file mode 100644 index fface49..0000000 --- a/sig-scheduler-plugins/manifests/fluence/serviceaccount.yaml +++ /dev/null @@ -1,10 +0,0 @@ -apiVersion: v1 -kind: Namespace -metadata: - name: scheduler-plugins ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: scheduler-plugins - namespace: scheduler-plugins \ No newline at end of file diff --git a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/crds/scheduling.x-k8s.io_podgroups.yaml b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/crds/scheduling.x-k8s.io_podgroups.yaml new file mode 100644 index 0000000..d633b7d --- /dev/null +++ b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/crds/scheduling.x-k8s.io_podgroups.yaml @@ -0,0 +1,108 @@ +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + api-approved.kubernetes.io: https://github.com/kubernetes-sigs/scheduler-plugins/pull/50 + controller-gen.kubebuilder.io/version: v0.11.1 + # TODO this needs if .Vaues.enableCertManager added back + cert-manager.io/inject-ca-from: '{{ .Release.Namespace }}/{{ include "scheduler-plugins-as-a-second-scheduler.fullname" . 
}}-serving-cert' + creationTimestamp: null + name: podgroups.scheduling.x-k8s.io +spec: + conversion: + strategy: Webhook + webhook: + clientConfig: + service: + name: webhook-service + namespace: system + path: /convert + conversionReviewVersions: + - v1 + group: scheduling.x-k8s.io + names: + kind: PodGroup + listKind: PodGroupList + plural: podgroups + shortNames: + - pg + - pgs + singular: podgroup + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: PodGroup is a collection of Pod; used for batch workload. + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation + of an object. Servers should convert recognized schemas to the latest + internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this + object represents. Servers may infer this from the endpoint the client + submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + metadata: + type: object + spec: + description: Specification of the desired behavior of the pod group. + properties: + minMember: + description: MinMember defines the minimal number of members/tasks + to run the pod group; if there's not enough resources to start all + tasks, the scheduler will not start anyone. + format: int32 + type: integer + minResources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: MinResources defines the minimal resource of members/tasks + to run the pod group; if there's not enough resources to start all + tasks, the scheduler will not start anyone. + type: object + scheduleTimeoutSeconds: + description: ScheduleTimeoutSeconds defines the maximal time of members/tasks + to wait before run the pod group; + format: int32 + type: integer + type: object + status: + description: Status represents the current information about a pod group. + This data may not be up to date. + properties: + failed: + description: The number of pods which reached phase Failed. + format: int32 + type: integer + occupiedBy: + description: OccupiedBy marks the workload (e.g., deployment, statefulset) + UID that occupy the podgroup. It is empty if not initialized. + type: string + phase: + description: Current phase of PodGroup. + type: string + running: + description: The number of actively running pods. + format: int32 + type: integer + scheduleStartTime: + description: ScheduleStartTime of the group + format: date-time + type: string + succeeded: + description: The number of pods which reached phase Succeeded. 
+ format: int32 + type: integer + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/configmap.yaml b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/configmap.yaml new file mode 100644 index 0000000..9f3d8bf --- /dev/null +++ b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/configmap.yaml @@ -0,0 +1,64 @@ +{{- if .Values.plugins.enabled }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: scheduler-config + namespace: {{ .Release.Namespace }} +data: + scheduler-config.yaml: | + apiVersion: kubescheduler.config.k8s.io/v1 + kind: KubeSchedulerConfiguration + leaderElection: + leaderElect: {{ .Values.scheduler.leaderElect }} + profiles: + # Compose all plugins in one profile + - schedulerName: {{ .Values.scheduler.name }} + plugins: + preBind: + disabled: + - name: {{ .Values.scheduler.name }} + filter: + disabled: + {{- range $.Values.plugins.disabledAll }} + - name: {{ title . }} + {{- end }} + reserve: + disabled: + {{- range $.Values.plugins.disabledAll }} + - name: {{ title . }} + {{- end }} + score: + disabled: + {{- range $.Values.plugins.disabledAll }} + - name: {{ title . }} + {{- end }} + preScore: + disabled: + {{- range $.Values.plugins.disabledAll }} + - name: {{ title . }} + {{- end }} + postFilter: + disabled: + {{- range $.Values.plugins.disabledAll }} + - name: {{ title . }} + {{- end }} + preFilter: + disabled: + {{- range $.Values.plugins.disabledAll }} + - name: {{ title . }} + {{- end }} + multiPoint: + enabled: + {{- range $.Values.plugins.enabled }} + - name: {{ title . }} + {{- end }} + disabled: + {{- range $.Values.plugins.disabled }} + - name: {{ title . }} + {{- end }} + {{- if $.Values.pluginConfig }} + pluginConfig: {{ toYaml $.Values.pluginConfig | nindent 6 }} + {{- end }} + + {{- /* TODO: wire CRD installation with enabled plugins. 
*/}} +{{- end }} diff --git a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/deployment.yaml b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/deployment.yaml index 8a73245..289a0e5 100644 --- a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/deployment.yaml +++ b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/deployment.yaml @@ -20,6 +20,19 @@ spec: - name: scheduler-plugins-controller image: {{ .Values.controller.image }} imagePullPolicy: {{ .Values.controller.pullPolicy }} + ports: + - containerPort: 9443 + name: webhook-server + protocol: TCP + volumeMounts: + - mountPath: /tmp/k8s-webhook-server/serving-certs + name: cert + readOnly: true + volumes: + - name: cert + secret: + defaultMode: 420 + secretName: webhook-server-cert --- apiVersion: apps/v1 kind: Deployment @@ -37,6 +50,7 @@ spec: metadata: labels: component: scheduler + app: fluence-scheduler spec: serviceAccountName: {{ .Values.scheduler.name }} containers: @@ -45,11 +59,17 @@ spec: command: - /go/src/fluence/bin/server - --policy={{ .Values.scheduler.policy }} + - --port={{ .Values.scheduler.port }} + {{ if .Values.scheduler.enableExternalService }}- --external-service{{ end }} name: sidecar + # These are exposed for the kubectl plugin + {{ if .Values.scheduler.enableExternalService }}ports: + - containerPort: {{ .Values.scheduler.port }} + hostPort: {{ .Values.scheduler.port }}{{ end }} - command: - /bin/kube-scheduler - --config=/etc/kubernetes/scheduler-config.yaml - - -v=9 + - -v={{ .Values.scheduler.loggingLevel }} image: {{ .Values.scheduler.image }} imagePullPolicy: {{ .Values.scheduler.pullPolicy }} livenessProbe: @@ -79,3 +99,32 @@ spec: - name: scheduler-config configMap: name: scheduler-config +{{ if .Values.scheduler.enableExternalService }}--- +apiVersion: v1 +kind: Service +metadata: + name: fluence-service +spec: + type: NodePort + selector: + app: fluence-scheduler + ports: + - port: {{ .Values.scheduler.port }} + targetPort: {{ .Values.scheduler.port }} +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: fluence-ingress +spec: + rules: + - host: localhost + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: fluence-service + port: + number: {{ .Values.scheduler.port }}{{ end }} \ No newline at end of file diff --git a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/mutating-webhook-configuration.yaml b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/mutating-webhook-configuration.yaml new file mode 100644 index 0000000..edbe7f0 --- /dev/null +++ b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/mutating-webhook-configuration.yaml @@ -0,0 +1,46 @@ +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: {{ include "scheduler-plugins-as-a-second-scheduler.name" . }}-mutating-webhook-configuration + {{- if .Values.enableCertManager }} + annotations: + cert-manager.io/inject-ca-from: {{ .Release.Namespace }}/{{ include "scheduler-plugins-as-a-second-scheduler.fullname" . }}-serving-cert + {{- end}} + labels: + {{- include "scheduler-plugins-as-a-second-scheduler.labels" . | nindent 4 }} +webhooks: +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: '{{ include "scheduler-plugins-as-a-second-scheduler.fullname" . 
}}-webhook-service' + namespace: '{{ .Release.Namespace }}' + path: /mutate-v1-fluence + {{- with (index .Values.webhookService.ports 0) }} + port: {{ .port }} + {{- end }} + + failurePolicy: Fail + name: morascache.kb.io + rules: + - apiGroups: + - "" + - core + - apps + - batch + - scheduling.x-k8s.io + apiVersions: + - v1 + - v1alpha1 + operations: + - CREATE + resources: + - pods + - jobs + - statefulsets + - deployments + - replicasets + +# Can uncomment this if we want to mutate the pod groups after creation +# - podgroups + sideEffects: None diff --git a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/selfsigned-issuer.yaml b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/selfsigned-issuer.yaml new file mode 100644 index 0000000..aa4d0a1 --- /dev/null +++ b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/selfsigned-issuer.yaml @@ -0,0 +1,10 @@ +{{- if .Values.enableCertManager }} +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: {{ include "scheduler-plugins-as-a-second-scheduler.fullname" . }}-selfsigned-issuer + labels: + {{- include "scheduler-plugins-as-a-second-scheduler.labels" . | nindent 4 }} +spec: + selfSigned: {} +{{- end}} diff --git a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/serving-cert.yaml b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/serving-cert.yaml new file mode 100644 index 0000000..0edefe2 --- /dev/null +++ b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/serving-cert.yaml @@ -0,0 +1,17 @@ +{{- if .Values.enableCertManager }} +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: {{ include "scheduler-plugins-as-a-second-scheduler.fullname" . }}-serving-cert + labels: + {{- include "scheduler-plugins-as-a-second-scheduler.labels" . | nindent 4 }} +spec: + dnsNames: + - '{{ include "scheduler-plugins-as-a-second-scheduler.fullname" . }}-webhook-service.{{ .Release.Namespace }}.svc' + - '{{ include "scheduler-plugins-as-a-second-scheduler.fullname" . }}-webhook-service.{{ .Release.Namespace }}.svc.{{ + .Values.kubernetesClusterDomain }}' + issuerRef: + kind: Issuer + name: '{{ include "scheduler-plugins-as-a-second-scheduler.fullname" . }}-selfsigned-issuer' + secretName: webhook-server-cert +{{- end}} diff --git a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/webhook-service.yaml b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/webhook-service.yaml new file mode 100644 index 0000000..e5339a1 --- /dev/null +++ b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/webhook-service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "scheduler-plugins-as-a-second-scheduler.fullname" . }}-webhook-service + labels: + app.kubernetes.io/component: webhook + app.kubernetes.io/created-by: scheduler-plugins-controller + app.kubernetes.io/part-of: scheduler-plugins-controller + {{- include "scheduler-plugins-as-a-second-scheduler.labels" . 
| nindent 4 }} +spec: + type: {{ .Values.webhookService.type }} + selector: + app: scheduler-plugins-controller + ports: + {{- .Values.webhookService.ports | toYaml | nindent 2 -}} diff --git a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/values.yaml b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/values.yaml index 1ae99f9..4113209 100644 --- a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/values.yaml +++ b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/values.yaml @@ -11,12 +11,19 @@ scheduler: policy: lonode pullPolicy: Always sidecarPullPolicy: Always + loggingLevel: "9" + + # Port is for GRPC, and enabling the external service will also + # create the service and ingress to it, along with adding + # additional API endpoints for our TBA kubectl plugin + enableExternalService: false + port: 4242 controller: name: scheduler-plugins-controller - image: registry.k8s.io/scheduler-plugins/controller:v0.27.8 + image: ghcr.io/flux-framework/fluence-controller:latest replicaCount: 1 - pullPolicy: IfNotPresent + pullPolicy: Always # LoadVariationRiskBalancing and TargetLoadPacking are not enabled by default # as they need extra RBAC privileges on metrics.k8s.io. @@ -24,16 +31,34 @@ controller: plugins: enabled: ["Fluence"] disabled: ["CapacityScheduling","NodeResourceTopologyMatch","NodeResourcesAllocatable","PrioritySort","Coscheduling"] # only in-tree plugins need to be defined here + # Disable EVERYTHING except for fluence + # VolumeBinding is required for PreBind, NodeResourcesFit is required or you'll get mismatches + # Yes - some of these are irrelevant for the use case here, but I'd rather be super + # conservative and be absolutely sure only fluence is running PreFilter to select nodes + disabledAll: ["NodePorts", "VolumeRestrictions", "EBSLimits", + "GCEPDLimits", "NodeVolumeLimits", "AzureDiskLimits", "VolumeZone", + "PodTopologySpread", "InterPodAffinity", "NodeAffinity", + "NodeUnschedulable", "NodeName", "TaintToleration", "DefaultPreemtion", + "NodeResourcesBalancedAllocation", "ImageLocality"] # Customize the enabled plugins' config. # Refer to the "pluginConfig" section of manifests//scheduler-config.yaml. # For example, for Coscheduling plugin, you want to customize the permit waiting timeout to 10 seconds: -pluginConfig: -- name: Coscheduling - args: - permitWaitingTimeSeconds: 10 # default is 60 +# pluginConfig: +# - name: Coscheduling +# args: +# permitWaitingTimeSeconds: 10 # default is 60 # Or, customize the other plugins # - name: NodeResourceTopologyMatch # args: # scoringStrategy: # type: MostAllocated # default is LeastAllocated + +enableCertManager: true +kubernetesClusterDomain: cluster.local +webhookService: + ports: + - port: 9443 + protocol: TCP + targetPort: 9443 + type: ClusterIP \ No newline at end of file diff --git a/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go b/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go new file mode 100644 index 0000000..7afb815 --- /dev/null +++ b/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go @@ -0,0 +1,477 @@ +/* +Copyright 2023 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controllers + +import ( + "context" + "fmt" + "sort" + "strconv" + "strings" + "time" + + "github.com/go-logr/logr" + v1 "k8s.io/api/core/v1" + apierrs "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/tools/record" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/log" + schedv1alpha1 "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1" + fluenceLabels "sigs.k8s.io/scheduler-plugins/pkg/fluence/labels" + "sigs.k8s.io/scheduler-plugins/pkg/util" +) + +// PodGroupReconciler reconciles a PodGroup object +type PodGroupReconciler struct { + log logr.Logger + recorder record.EventRecorder + + client.Client + Scheme *runtime.Scheme + Workers int +} + +// +kubebuilder:rbac:groups=scheduling.x-k8s.io,resources=podgroups,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=scheduling.x-k8s.io,resources=podgroups/status,verbs=get;update;patch +// +kubebuilder:rbac:groups=scheduling.x-k8s.io,resources=podgroups/finalizers,verbs=update + +// Reconcile is part of the main kubernetes reconciliation loop which aims to +// move the current state of the cluster closer to the desired state. 
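+// Reconciles are triggered both by PodGroup events (For) and by Pod events that
+// SetupWithManager maps to a group via ensurePodGroup, so a group is re-examined
+// whenever one of its member pods changes.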
+// Note that we currently don't do deletion based on owner references, but that +// would be ideal (I could not get it to work) +// +// For more details, check Reconcile and its Result here: +// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.11.0/pkg/reconcile +func (r *PodGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + log := log.FromContext(ctx) + log.Info("reconciling flux-framework/fluence-controller for request") + pg := &schedv1alpha1.PodGroup{} + + // Get the timestamp as soon as reconcile happens as a fallback below + timestamp := metav1.NewMicroTime(time.Now()) + if err := r.Get(ctx, req.NamespacedName, pg); err != nil { + + // Case 1: if we get here and it's not found, assume not created + if apierrs.IsNotFound(err) { + log.Info("PodGroup", "Status", fmt.Sprintf("Pod group %s is not found, deleted.", req.NamespacedName)) + return ctrl.Result{}, nil + } + log.Error(err, fmt.Sprintf("Unable to retrieve pod group %s", req.NamespacedName)) + return ctrl.Result{}, err + } + log.Info("REFERENCES", "Reconciler", pg.ObjectMeta.OwnerReferences) + + // Grab all statuses (and groups of them) we are interested in + // Note that 48 hours seems arbitrary, and if it is, we might make it a variable + schedulingOrPending := (pg.Status.Phase == schedv1alpha1.PodGroupScheduling || pg.Status.Phase == schedv1alpha1.PodGroupPending) + twoDaysOld := pg.Status.ScheduleStartTime.Sub(pg.CreationTimestamp.Time) > 48*time.Hour + finishedOrFailed := pg.Status.Phase == schedv1alpha1.PodGroupFinished || pg.Status.Phase == schedv1alpha1.PodGroupFailed + + // Finished or failed - clean up the group + if finishedOrFailed { + log.Info("PodGroup", "Status", fmt.Sprintf("Pod group %s is finished or failed.", req.NamespacedName)) + return ctrl.Result{}, nil + } + + // If startScheduleTime - createTime > 2days, + // do not reconcile again because pod may have been GCed + if schedulingOrPending && pg.Status.Running == 0 && twoDaysOld { + r.recorder.Event(pg, v1.EventTypeWarning, "Timeout", "schedule time longer than 48 hours") + return ctrl.Result{}, nil + } + + // We can get the podList and check for sizes here + podList := &v1.PodList{} + + // Select based on the group name + groupNameSelector := labels.Set(map[string]string{schedv1alpha1.PodGroupLabel: pg.Name}).AsSelector() + err := r.List(ctx, podList, client.MatchingLabelsSelector{Selector: groupNameSelector}) + if err != nil { + log.Error(err, "List pods for group failed") + return ctrl.Result{}, err + } + + // If the pod group creation time created is Zero (not set) we set it here + // This only happens on the first reconcile, which should also be when the + // pod group is created. We set it here and don't use the underlying object + // CreationTime because we need to change the granularity to ms. + if pg.Status.CreationTime.IsZero() { + return r.setTimeCreated(ctx, pg, podList.Items, timestamp) + } + + // Inspect the size, set on the group if not done yet + size := len(podList.Items) + log.Info("PodGroup", "Name", pg.Name, "Size", size) + + // When first created, size should be unset (MinMember) + // Get size label from the first pod + if int(pg.Spec.MinMember) == 0 { + log.Info("PodGroup", "Status", fmt.Sprintf("Pod group %s updating size to %d", pg.Name, size)) + return r.updatePodGroupSize(ctx, pg, int32(size), podList.Items) + + } else if int(pg.Spec.MinMember) != size { + // TODO: Not clear what to do here. 
Arguably, we also want to check the label size + // because (in the future) we can accept smaller sizes. But then we also need + // to account for if the labels are different, do we take the smallest? + log.Info("PodGroup", "Status", fmt.Sprintf("WARNING: Pod group current MinMember %d does not match %d", pg.Spec.MinMember, size)) + } + return r.updateStatus(ctx, pg, podList.Items) + +} + +func (r *PodGroupReconciler) setTimeCreated( + ctx context.Context, + pg *schedv1alpha1.PodGroup, + pods []v1.Pod, + timestamp metav1.MicroTime, +) (ctrl.Result, error) { + + // First priority goes to annotation, if set + if len(pods) > 0 { + + strTime, ok := pods[0].Labels[fluenceLabels.PodGroupTimeCreated] + if ok { + mt := metav1.MicroTime{} + b := []byte(strTime) + err := mt.UnmarshalJSON(b) + if err == nil { + timestamp = mt + } + } + } + + // Now patch to update it + patch := client.MergeFrom(pg.DeepCopy()) + pg.Status.CreationTime = timestamp + + // Apply the patch to update the size + r.Status().Update(ctx, pg) + err := r.Patch(ctx, pg, patch) + return ctrl.Result{Requeue: true}, err + +} + +func (r *PodGroupReconciler) updateStatus( + ctx context.Context, + pg *schedv1alpha1.PodGroup, + pods []v1.Pod, +) (ctrl.Result, error) { + + log := log.FromContext(ctx) + patch := client.MergeFrom(pg.DeepCopy()) + log.Info("PodGroup", "Phase", pg.Status.Phase) + + switch pg.Status.Phase { + case "": + pg.Status.Phase = schedv1alpha1.PodGroupPending + + case schedv1alpha1.PodGroupPending: + if len(pods) >= int(pg.Spec.MinMember) { + log.Info("PodGroup", "Phase", "Scheduling") + pg.Status.Phase = schedv1alpha1.PodGroupScheduling + } + default: + + // If for some reason we weren't pending and now have fewer than min required, flip back to pending + if len(pods) < int(pg.Spec.MinMember) { + log.Info("PodGroup", "Phase", "Length of pods less than min member, pending") + pg.Status.Phase = schedv1alpha1.PodGroupPending + break + } + + // Get updated counts of running, succeeded, and failed pods + running, succeeded, failed := getCurrentPodStats(pods) + log.Info("PodGroup", "Running", running, "Succeeded", succeeded, "Failed", failed) + + // A pod with succeeded + running STILL less than the minimum required is scheduling + if succeeded+running < pg.Spec.MinMember { + pg.Status.Phase = schedv1alpha1.PodGroupScheduling + } + + // A pod with succeeded + running >= the minimum required is running! + if succeeded+running >= pg.Spec.MinMember { + pg.Status.Phase = schedv1alpha1.PodGroupRunning + } + + // We have non zero failed, and the total of failed, running amd succeeded > min member + // Final state of pod group is FAILED womp womp + if failed != 0 && failed+running+succeeded >= pg.Spec.MinMember { + pg.Status.Phase = schedv1alpha1.PodGroupFailed + } + + // Finished! 
This is where we want to get :) + // TODO: ideally the owning higher level object deletion will delete here, + // but that won't always work for one of pods - need a new strategy + if succeeded >= pg.Spec.MinMember { + pg.Status.Phase = schedv1alpha1.PodGroupFinished + } + pg.Status.Running = running + pg.Status.Failed = failed + pg.Status.Succeeded = succeeded + } + + // Apply the patch to update, or delete if finished + var err error + if pg.Status.Phase == schedv1alpha1.PodGroupFinished || pg.Status.Phase == schedv1alpha1.PodGroupFailed { + log.Info("PodGroup", "Status", "Finished", "Owners", pg.OwnerReferences) + + // Delete the group if it is finished or failed + err = r.Delete(ctx, pg) + // Update but don't requeue + // _, err := r.updateOwnerReferences(ctx, pg, pods) + return ctrl.Result{}, err + } + r.Status().Update(ctx, pg) + err = r.Patch(ctx, pg, patch) + return ctrl.Result{Requeue: true}, err +} + +// newPodGroup creates a new podGroup object, capturing the creation time +// This should be followed by a request to reconsile it +// I'm not sure this actually takes, because the metadata (spec) +// does not stick +func (r *PodGroupReconciler) newPodGroup( + ctx context.Context, + name, namespace string, + groupSize int32, +) (*schedv1alpha1.PodGroup, error) { + + pg := &schedv1alpha1.PodGroup{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + }, + // Note that these don't really stick + Spec: schedv1alpha1.PodGroupSpec{ + MinMember: groupSize, + }, + Status: schedv1alpha1.PodGroupStatus{ + ScheduleStartTime: metav1.NewMicroTime(time.Now()), + }, + } + + err := r.Create(ctx, pg) + if err != nil { + r.log.Error(err, "Failed to create new PodGroup", "Namespace:", pg.Namespace, "Name:", pg.Name) + } + return pg, err + +} + +// patchPodGroup is a halper function to run a patch and then return the correct result / error for the reconciler +func (r *PodGroupReconciler) patchPodGroup(ctx context.Context, old, new *schedv1alpha1.PodGroup) (ctrl.Result, error) { + patch := client.MergeFrom(old) + if err := r.Status().Patch(ctx, new, patch); err != nil { + r.log.Error(err, "Issue patching PodGroup", "Namespace:", old.Namespace, "Name:", old.Name) + return ctrl.Result{}, err + } + err := r.Patch(ctx, new, patch) + if err != nil { + r.log.Error(err, "Issue patching PodGroup", "Namespace:", old.Namespace, "Name:", old.Name) + } + return ctrl.Result{}, err +} + +// updatePodGroup does an update with reconcile instead of a patch request +func (r *PodGroupReconciler) updatePodGroupSize( + ctx context.Context, + old *schedv1alpha1.PodGroup, + size int32, + pods []v1.Pod, +) (ctrl.Result, error) { + + // First priority goes to annotation, if set + if len(pods) > 0 { + rawSize := pods[0].Labels[fluenceLabels.PodGroupSizeLabel] + groupSize, err := strconv.ParseInt(rawSize, 10, 32) + if err == nil { + size = int32(groupSize) + } + } + + // Now patch to update it + patch := client.MergeFrom(old.DeepCopy()) + old.Spec.MinMember = size + + // Apply the patch to update the size + r.Status().Update(ctx, old) + err := r.Patch(ctx, old, patch) + return ctrl.Result{Requeue: true}, err +} + +// getCurrentPodStats gets the number of running, succeeded, and failed +// We use these to populate the PodGroup +func getCurrentPodStats(pods []v1.Pod) (int32, int32, int32) { + if len(pods) == 0 { + return 0, 0, 0 + } + var ( + running int32 = 0 + succeeded int32 = 0 + failed int32 = 0 + ) + + // Loop and count things. 
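+	// Pods in any other phase (Pending, Unknown) are not counted; they
+	// contribute to none of the three totals.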
+ for _, pod := range pods { + switch pod.Status.Phase { + case v1.PodRunning: + running++ + case v1.PodSucceeded: + succeeded++ + case v1.PodFailed: + failed++ + } + } + return running, succeeded, failed +} + +// updateOwnerReferences ensures the group is always owned by the same entity that owns the pod +// This ensures that, for example, a job that is wrapping pods is the owner. +func (r *PodGroupReconciler) updateOwnerReferences( + ctx context.Context, + pg *schedv1alpha1.PodGroup, + pods []v1.Pod, +) (ctrl.Result, error) { + + // We will want to re-queue in most cases + result := ctrl.Result{Requeue: true} + + // No pods, just ignore + if len(pods) == 0 { + return result, nil + } + pod := pods[0] + + // Case 1: The pod itself doesn't have owner references. YOLO + if len(pod.OwnerReferences) == 0 { + return result, nil + } + + // Collect current owner references for pod group, + // We want to ensure we add unique ones across the pod + owners := []metav1.OwnerReference{} + var refs []string + for _, ownerRef := range pod.OwnerReferences { + refs = append(refs, fmt.Sprintf("%s/%s", pod.Namespace, ownerRef.Name)) + owners = append(owners, ownerRef) + } + + patch := client.MergeFrom(pg.DeepCopy()) + if len(refs) != 0 { + sort.Strings(refs) + pg.Status.OccupiedBy = strings.Join(refs, ",") + } + // If we have owners, collapose into list + if len(owners) > 0 { + pg.ObjectMeta.OwnerReferences = owners + } + + // Apply the patch to update the size + r.Status().Update(ctx, pg) + err := r.Patch(ctx, pg, patch) + return ctrl.Result{Requeue: true}, err + +} + +// SetupWithManager sets up the controller with the Manager. +// We watch the events channel, which is going to trigger from the mutating webhook +// to send over when a pod group is created (hopefully preceeding schedule). +func (r *PodGroupReconciler) SetupWithManager(mgr ctrl.Manager) error { + r.recorder = mgr.GetEventRecorderFor("PodGroupController") + r.log = mgr.GetLogger() + r.log.Info("setup with manager flux-framework/fluence-controller") + + return ctrl.NewControllerManagedBy(mgr). + Watches(&v1.Pod{}, handler.EnqueueRequestsFromMapFunc(r.ensurePodGroup)). + For(&schedv1alpha1.PodGroup{}). + WithOptions(controller.Options{MaxConcurrentReconciles: r.Workers}). + Complete(r) +} + +// ensurePodGroup ensures we create the pod group (or delete) when pod is deleted +// for delete, this would be better done as an owner reference., but I haven't gotten it working +func (r *PodGroupReconciler) ensurePodGroup(ctx context.Context, obj client.Object) []ctrl.Request { + pod, ok := obj.(*v1.Pod) + if !ok { + return nil + } + groupName := util.GetPodGroupLabel(pod) + + // This case only happens when something is not scheduled by fluence + if len(groupName) == 0 { + r.log.Info("Pod: ", "Name", pod.Name, "Status", pod.Status.Phase, "Action", "Not fluence owned") + return nil + } + + // If we deleted the pod... assume we delete the group too + if !pod.ObjectMeta.DeletionTimestamp.IsZero() { + r.log.Info("Pod: ", "Name", pod.Name, "Status", pod.Status.Phase, "Action", "Deleted") + + pg := &schedv1alpha1.PodGroup{} + err := r.Get(ctx, types.NamespacedName{Name: groupName, Namespace: pod.Namespace}, pg) + if err != nil { + r.Delete(ctx, pg) + } + return nil + } + + // If we are watching the Pod and it's beyond pending, we hopefully already made a group + // and that group should be in the reconcile process. 
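+	// Put differently: only Pending pods, freshly admitted by the mutating webhook
+	// (which stamps the group name and size labels), can cause a PodGroup to be
+	// created here.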
+ if pod.Status.Phase != v1.PodPending { + r.log.Info("Pod: ", "Name", pod.Name, "Status", pod.Status.Phase, "Action", "Skipping reconcile") + return nil + } + + // At this point we should have a group size (string) set by the webhook + rawSize := pod.Labels[fluenceLabels.PodGroupSizeLabel] + groupSize, err := strconv.ParseInt(rawSize, 10, 32) + if err != nil { + r.log.Error(err, "Parsing PodGroup size.") + return nil + } + + namespacedName := types.NamespacedName{ + Namespace: pod.Namespace, + Name: groupName, + } + + // Create the pod group if the pod is pending + pg := &schedv1alpha1.PodGroup{} + if err := r.Get(ctx, namespacedName, pg); err != nil { + + // Case 1: if we get here and it's not found, assume not created + if apierrs.IsNotFound(err) { + r.log.Info("Pod: ", "Status", pod.Status.Phase, "Name", pod.Name, "Group", groupName, "Namespace", pod.Namespace, "Action", "Creating PodGroup") + + // Note that most of this does not stick - we have to get metadata later from pods + // Or just use a hiuristic (e.g., take the first pod or use reconciler first hit time) + _, err := r.newPodGroup(ctx, groupName, pod.Namespace, int32(groupSize)) + if err == nil { + return []ctrl.Request{{NamespacedName: namespacedName}} + } + r.log.Info("Pod: ", "Status", pod.Status.Phase, "Name", pod.Name, "Group", groupName, "Namespace", pod.Namespace, "Action", "Issue Creating PodGroup") + } + } + return nil +} diff --git a/sig-scheduler-plugins/pkg/fluence/README.md b/sig-scheduler-plugins/pkg/fluence/README.md deleted file mode 100644 index 61f4923..0000000 --- a/sig-scheduler-plugins/pkg/fluence/README.md +++ /dev/null @@ -1,29 +0,0 @@ -# Overview - -Project to manage Flux tasks needed to standardize kubernetes HPC scheduling interfaces - -## Installing the chart - -More detail will be added here about installing the chart. You will -be using the [install-as-a-second-scheduler](https://github.com/kubernetes-sigs/scheduler-plugins/tree/master/manifests/install/charts/as-a-second-scheduler) -charts. Fluence-specific values are detailed below. - -### Fluence specific values - -In `values.yaml` it is possible to customize the container image, already defaulted to the latest release, and the allocation policy -used by the scheduler. -Most common options are: - -- `lonode`: choose the nodes with lower ID first. Can be compared to packing -- `low`: choose cores with lowest IDs from multiple nodes. Can be compared to spread process-to-resource placement - -## Maturity Level - - - -- [x] Sample (for demonstrating and inspiring purpose) -- [ ] Alpha (used in companies for pilot projects) -- [ ] Beta (used in companies and developed actively) -- [ ] Stable (used in companies for production workloads) - - diff --git a/sig-scheduler-plugins/pkg/fluence/core/core.go b/sig-scheduler-plugins/pkg/fluence/core/core.go index 11c90ef..9de5a26 100644 --- a/sig-scheduler-plugins/pkg/fluence/core/core.go +++ b/sig-scheduler-plugins/pkg/fluence/core/core.go @@ -1,5 +1,5 @@ /* -Copyright 2022 The Kubernetes Authors. +Copyright 2020 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -17,17 +17,55 @@ limitations under the License. 
package core import ( + "context" "fmt" + "sync" + "time" + + gochache "github.com/patrickmn/go-cache" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/types" + informerv1 "k8s.io/client-go/informers/core/v1" + listerv1 "k8s.io/client-go/listers/core/v1" + klog "k8s.io/klog/v2" - "k8s.io/klog/v2" "k8s.io/kubernetes/pkg/scheduler/framework" - pb "sigs.k8s.io/scheduler-plugins/pkg/fluence/fluxcli-grpc" + "sigs.k8s.io/controller-runtime/pkg/client" + + "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1" + "sigs.k8s.io/scheduler-plugins/pkg/logger" + "sigs.k8s.io/scheduler-plugins/pkg/util" ) +type Status string + +const ( + // PodGroupNotSpecified denotes no PodGroup is specified in the Pod spec. + PodGroupNotSpecified Status = "PodGroup not specified" + // PodGroupNotFound denotes the specified PodGroup in the Pod spec is + // not found in API server. + PodGroupNotFound Status = "PodGroup not found" + Success Status = "Success" + Wait Status = "Wait" + + permitStateKey = "PermitFluence" +) + +// TODO should eventually store group name here to reassociate on reload type FluxStateData struct { NodeName string } +type PermitState struct { + Activate bool +} + +func (s *PermitState) Clone() framework.StateData { + return &PermitState{Activate: s.Activate} +} + func (s *FluxStateData) Clone() framework.StateData { clone := &FluxStateData{ NodeName: s.NodeName, @@ -35,65 +73,325 @@ func (s *FluxStateData) Clone() framework.StateData { return clone } -type NodePodsCount struct { - NodeName string - Count int +// Manager defines the interfaces for PodGroup management. +type Manager interface { + PreFilter(context.Context, *corev1.Pod, *framework.CycleState) error + GetPodNode(*corev1.Pod) string + GetPodGroup(context.Context, *corev1.Pod) (string, *v1alpha1.PodGroup) + GetCreationTimestamp(*corev1.Pod, time.Time) metav1.MicroTime + DeletePermittedPodGroup(string) + Permit(context.Context, *framework.CycleState, *corev1.Pod) Status + CalculateAssignedPods(string, string) int + ActivateSiblings(pod *corev1.Pod, state *framework.CycleState) + BackoffPodGroup(string, time.Duration) } -var podgroupMap map[string][]NodePodsCount +// PodGroupManager defines the scheduling operation called +type PodGroupManager struct { + // client is a generic controller-runtime client to manipulate both core resources and PodGroups. + client client.Client + // snapshotSharedLister is pod shared list + snapshotSharedLister framework.SharedLister + // scheduleTimeout is the default timeout for podgroup scheduling. + // If podgroup's scheduleTimeoutSeconds is set, it will be used. + scheduleTimeout *time.Duration + // permittedpodGroup stores the podgroup name which has passed the pre resource check. + permittedpodGroup *gochache.Cache + // backedOffpodGroup stores the podgorup name which failed scheduling recently. + backedOffpodGroup *gochache.Cache + // podLister is pod lister + podLister listerv1.PodLister + + // This isn't great to save state, but we can improve upon it + // we should have a way to load jobids into this if fluence is recreated + // If we can annotate them in fluxion and query for that, we can! + groupToJobId map[string]uint64 + podToNode map[string]string + + // Probably should just choose one... oh well + sync.RWMutex + mutex sync.Mutex + log *logger.DebugLogger +} -func Init() { - podgroupMap = make(map[string][]NodePodsCount, 0) +// NewPodGroupManager creates a new operation object. 
+func NewPodGroupManager( + client client.Client, + snapshotSharedLister framework.SharedLister, + scheduleTimeout *time.Duration, + podInformer informerv1.PodInformer, + log *logger.DebugLogger, +) *PodGroupManager { + podGroupManager := &PodGroupManager{ + client: client, + snapshotSharedLister: snapshotSharedLister, + scheduleTimeout: scheduleTimeout, + podLister: podInformer.Lister(), + permittedpodGroup: gochache.New(3*time.Second, 3*time.Second), + backedOffpodGroup: gochache.New(10*time.Second, 10*time.Second), + groupToJobId: map[string]uint64{}, + podToNode: map[string]string{}, + log: log, + } + return podGroupManager } -func (n *NodePodsCount) Clone() framework.StateData { - return &NodePodsCount{ - NodeName: n.NodeName, - Count: n.Count, +func (podGroupManager *PodGroupManager) BackoffPodGroup(groupName string, backoff time.Duration) { + if backoff == time.Duration(0) { + return } + podGroupManager.backedOffpodGroup.Add(groupName, nil, backoff) } -func CreateNodePodsList(nodelist []*pb.NodeAlloc, pgname string) (nodepods []NodePodsCount) { - nodepods = make([]NodePodsCount, len(nodelist)) - for i, v := range nodelist { - nodepods[i] = NodePodsCount{ - NodeName: v.GetNodeID(), - Count: int(v.GetTasks()), +// ActivateSiblings stashes the pods belonging to the same PodGroup of the given pod +// in the given state, with a reserved key "kubernetes.io/pods-to-activate". +func (podGroupManager *PodGroupManager) ActivateSiblings(pod *corev1.Pod, state *framework.CycleState) { + groupName := util.GetPodGroupLabel(pod) + if groupName == "" { + return + } + + // Only proceed if it's explicitly requested to activate sibling pods. + if c, err := state.Read(permitStateKey); err != nil { + return + } else if s, ok := c.(*PermitState); !ok || !s.Activate { + return + } + + pods, err := podGroupManager.podLister.Pods(pod.Namespace).List( + labels.SelectorFromSet(labels.Set{v1alpha1.PodGroupLabel: groupName}), + ) + if err != nil { + klog.ErrorS(err, "Failed to obtain pods belong to a PodGroup", "podGroup", groupName) + return + } + + for i := range pods { + if pods[i].UID == pod.UID { + pods = append(pods[:i], pods[i+1:]...) + break } } - podgroupMap[pgname] = nodepods - klog.Info("MAP ", podgroupMap) - return + if len(pods) != 0 { + if c, err := state.Read(framework.PodsToActivateKey); err == nil { + if s, ok := c.(*framework.PodsToActivate); ok { + s.Lock() + for _, pod := range pods { + namespacedName := GetNamespacedName(pod) + s.Map[namespacedName] = pod + } + s.Unlock() + } + } + } } -func HaveList(pgname string) bool { - _, exists := podgroupMap[pgname] - return exists +// GetStatuses string (of all pods) to show for debugging purposes +func (podGroupManager *PodGroupManager) GetStatuses( + pods []*corev1.Pod, +) string { + statuses := "" + + // We need to distinguish 0 from the default and not finding anything + for _, pod := range pods { + statuses += " " + fmt.Sprintf("%s", pod.Status.Phase) + } + return statuses } -func GetNextNode(pgname string) (string, error) { - entry, ok := podgroupMap[pgname] - if !ok { - err := fmt.Errorf("Map is empty") - return "", err +// GetPodNode is a quick lookup to see if we have a node +func (podGroupManager *PodGroupManager) GetPodNode(pod *corev1.Pod) string { + node, _ := podGroupManager.podToNode[pod.Name] + return node +} + +// Permit permits a pod to run, if the minMember match, it would send a signal to chan. 
+func (podGroupManager *PodGroupManager) Permit(ctx context.Context, state *framework.CycleState, pod *corev1.Pod) Status { + groupName, podGroup := podGroupManager.GetPodGroup(ctx, pod) + if groupName == "" { + return PodGroupNotSpecified + } + if podGroup == nil { + // A Pod with a podGroup name but without a PodGroup found is denied. + return PodGroupNotFound + } + + assigned := podGroupManager.CalculateAssignedPods(podGroup.Name, podGroup.Namespace) + // The number of pods that have been assigned nodes is calculated from the snapshot. + // The current pod in not included in the snapshot during the current scheduling cycle. + if int32(assigned)+1 >= podGroup.Spec.MinMember { + return Success } - if len(entry) == 0 { - err := fmt.Errorf("Error while getting a node") - return "", err + + if assigned == 0 { + // Given we've reached Permit(), it's mean all PreFilter checks (minMember & minResource) + // already pass through, so if assigned == 0, it could be due to: + // - minResource get satisfied + // - new pods added + // In either case, we should and only should use this 0-th pod to trigger activating + // its siblings. + // It'd be in-efficient if we trigger activating siblings unconditionally. + // See https://github.com/kubernetes-sigs/scheduler-plugins/issues/682 + state.Write(permitStateKey, &PermitState{Activate: true}) } - nodename := entry[0].NodeName + return Wait +} + +// PreFilter filters out a pod if +// 1. it belongs to a podgroup that was recently denied or +// 2. the total number of pods in the podgroup is less than the minimum number of pods +// that is required to be scheduled. +func (podGroupManager *PodGroupManager) PreFilter( + ctx context.Context, + pod *corev1.Pod, + state *framework.CycleState, +) error { - if entry[0].Count == 1 { - slice := entry[1:] - if len(slice) == 0 { - delete(podgroupMap, pgname) - return nodename, nil + podGroupManager.log.Info("[PodGroup PreFilter] pod %s", klog.KObj(pod)) + groupName, podGroup := podGroupManager.GetPodGroup(ctx, pod) + if podGroup == nil { + return nil + } + + _, exists := podGroupManager.backedOffpodGroup.Get(groupName) + if exists { + return fmt.Errorf("podGroup %v failed recently", groupName) + } + + pods, err := podGroupManager.podLister.Pods(pod.Namespace).List( + labels.SelectorFromSet(labels.Set{v1alpha1.PodGroupLabel: util.GetPodGroupLabel(pod)}), + ) + if err != nil { + return fmt.Errorf("podLister list pods failed: %w", err) + } + + // Only allow scheduling the first in the group so the others come after + + // Get statuses to show for debugging + statuses := podGroupManager.GetStatuses(pods) + + // This shows us the number of pods we have in the set and their states + podGroupManager.log.Info("[PodGroup PreFilter] group: %s pods: %s MinMember: %d Size: %d", groupName, statuses, podGroup.Spec.MinMember, len(pods)) + if len(pods) < int(podGroup.Spec.MinMember) { + return fmt.Errorf("pre-filter pod %v cannot find enough sibling pods, "+ + "current pods number: %v, minMember of group: %v", pod.Name, len(pods), podGroup.Spec.MinMember) + } + + // TODO we likely can take advantage of these resources or other custom + // attributes we add. For now ignore and calculate based on pod needs (above) + // if podGroup.Spec.MinResources == nil { + // fmt.Printf("Fluence Min resources are null, skipping PreFilter") + // return nil + // } + + // This is from coscheduling. + // TODO(cwdsuzhou): This resource check may not always pre-catch unschedulable pod group. 
+	// It only tries to PreFilter resource constraints so even if a PodGroup passed here,
+	// it may not necessarily pass Filter due to other constraints such as affinity/taints.
+	_, exists = podGroupManager.permittedpodGroup.Get(groupName)
+	if exists {
+		podGroupManager.log.Info("[PodGroup PreFilter] Pod Group %s is already admitted", groupName)
+		return nil
+	}
+
+	// TODO: right now we ask Fluxion for a podspec based on ONE representative pod, but
+	// we have the whole group! We can handle different pod needs now :)
+	repPod := pods[0]
+	nodes, err := podGroupManager.AskFlux(ctx, *repPod, podGroup, groupName)
+	if err != nil {
+		podGroupManager.log.Info("[PodGroup PreFilter] Fluxion returned an error %s, not schedulable", err.Error())
+		return err
+	}
+	podGroupManager.log.Info("Node Selected %s (pod group %s)", nodes, groupName)
+
+	// For some reason Fluxion gave us the wrong size
+	if len(nodes) != len(pods) {
+		podGroupManager.log.Warning("[PodGroup PreFilter] group %s needs %d nodes but Fluxion returned the wrong number of nodes (%d).", groupName, len(pods), len(nodes))
+		podGroupManager.mutex.Lock()
+		podGroupManager.cancelFluxJob(groupName, repPod)
+		podGroupManager.mutex.Unlock()
+	}
+
+	// Create a fluxState (CycleState) with all nodes - this is used to retrieve
+	// the specific node assigned to the pod in Filter, which returns a node.
+	// Note that this probably is not useful beyond the pod we are in the context
+	// of, but why not do it.
+	for i, node := range nodes {
+		pod := pods[i]
+		stateData := FluxStateData{NodeName: node}
+		state.Write(framework.StateKey(pod.Name), &stateData)
+		// Also save to the podToNode lookup
+		podGroupManager.mutex.Lock()
+		podGroupManager.podToNode[pod.Name] = node
+		podGroupManager.mutex.Unlock()
+	}
+	podGroupManager.permittedpodGroup.Add(groupName, groupName, *podGroupManager.scheduleTimeout)
+	return nil
+}
+
+// GetCreationTimestamp returns the creation time of a podGroup or a pod (as a metav1.MicroTime).
+// The Status.CreationTime is set by the PodGroup reconciler, which has to happen before we have
+// a PodGroup. I don't see cases when this wouldn't happen, but just in case we fall back to
+// converting the pg.CreationTime to a MicroTime
+func (podGroupManager *PodGroupManager) GetCreationTimestamp(pod *corev1.Pod, ts time.Time) metav1.MicroTime {
+	groupName := util.GetPodGroupLabel(pod)
+	if len(groupName) == 0 {
+		return metav1.NewMicroTime(ts)
+	}
+	var podGroup v1alpha1.PodGroup
+	if err := podGroupManager.client.Get(context.TODO(), types.NamespacedName{Namespace: pod.Namespace, Name: groupName}, &podGroup); err != nil {
+		return metav1.NewMicroTime(ts)
+	}
+	// First preference goes to microseconds. This should be set, as it is set by the first
+	// reconcile, and we wouldn't have a pod group if it didn't pass through that.
+	if !podGroup.Status.CreationTime.IsZero() {
+		return podGroup.Status.CreationTime
+	}
+	// Fall back to CreationTime from Kubernetes, in seconds
+	// In practice this should not happen
+	return metav1.NewMicroTime(podGroup.CreationTimestamp.Time)
+}
+
+// CalculateAssignedPods returns the number of pods that have been assigned nodes: assumed or bound.
+func (podGroupManager *PodGroupManager) CalculateAssignedPods(podGroupName, namespace string) int { + nodeInfos, err := podGroupManager.snapshotSharedLister.NodeInfos().List() + if err != nil { + podGroupManager.log.Error("Cannot get nodeInfos from frameworkHandle: %s", err) + return 0 + } + var count int + for _, nodeInfo := range nodeInfos { + for _, podInfo := range nodeInfo.Pods { + pod := podInfo.Pod + if util.GetPodGroupLabel(pod) == podGroupName && pod.Namespace == namespace && pod.Spec.NodeName != "" { + count++ + } } - podgroupMap[pgname] = slice - return nodename, nil } - entry[0].Count = entry[0].Count - 1 - return nodename, nil + return count +} + +// DeletePermittedPodGroup deletes a podGroup that passes Pre-Filter but reaches PostFilter. +func (podGroupManager *PodGroupManager) DeletePermittedPodGroup(groupName string) { + podGroupManager.permittedpodGroup.Delete(groupName) +} + +// GetPodGroup returns the PodGroup that a Pod belongs to in cache. +func (podGroupManager *PodGroupManager) GetPodGroup(ctx context.Context, pod *corev1.Pod) (string, *v1alpha1.PodGroup) { + groupName := util.GetPodGroupLabel(pod) + if len(groupName) == 0 { + return "", nil + } + var podGroup v1alpha1.PodGroup + if err := podGroupManager.client.Get(ctx, types.NamespacedName{Namespace: pod.Namespace, Name: groupName}, &podGroup); err != nil { + return fmt.Sprintf("%v/%v", pod.Namespace, groupName), nil + } + return fmt.Sprintf("%v/%v", pod.Namespace, groupName), &podGroup +} + +// GetNamespacedName returns the namespaced name. +func GetNamespacedName(obj metav1.Object) string { + return fmt.Sprintf("%v/%v", obj.GetNamespace(), obj.GetName()) } diff --git a/sig-scheduler-plugins/pkg/fluence/core/flux.go b/sig-scheduler-plugins/pkg/fluence/core/flux.go new file mode 100644 index 0000000..24c9212 --- /dev/null +++ b/sig-scheduler-plugins/pkg/fluence/core/flux.go @@ -0,0 +1,257 @@ +package core + +import ( + "context" + "time" + + "google.golang.org/grpc" + "k8s.io/apimachinery/pkg/labels" + pb "sigs.k8s.io/scheduler-plugins/pkg/fluence/fluxcli-grpc" + fgroup "sigs.k8s.io/scheduler-plugins/pkg/fluence/group" + + "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1" + "sigs.k8s.io/scheduler-plugins/pkg/fluence/utils" + + corev1 "k8s.io/api/core/v1" +) + +// AskFlux will ask flux for an allocation for nodes for the pod group. +// We return the list of nodes, and assign to the entire group! +func (podGroupManager *PodGroupManager) AskFlux( + ctx context.Context, + pod corev1.Pod, + podGroup *v1alpha1.PodGroup, + groupName string, +) ([]string, error) { + + // clean up previous match if a pod has already allocated previously + podGroupManager.mutex.Lock() + _, isAllocated := podGroupManager.groupToJobId[groupName] + podGroupManager.mutex.Unlock() + + // This case happens when there is some reason that an initial job pods partially allocated, + // but then the job restarted, and new pods are present but fluence had assigned nodes to + // the old ones (and there aren't enough). The job would have had to complete in some way, + // and the PodGroup would have to then recreate, and have the same job id (the group name). + // This happened when I cancalled a bunch of jobs and they didn't have the chance to + // cancel in fluence. What we can do here is assume the previous pods are no longer running + // and cancel the flux job to create again. 
+	if isAllocated {
+		podGroupManager.log.Warning("[PodGroup AskFlux] group %s was previously allocated and is requesting again, so must have completed.", groupName)
+		podGroupManager.mutex.Lock()
+		podGroupManager.cancelFluxJob(groupName, &pod)
+		podGroupManager.mutex.Unlock()
+	}
+	nodes := []string{}
+
+	// IMPORTANT: this is a JobSpec for *one* pod, assuming they are all the same.
+	// This obviously may not be true if we have a heterogeneous PodGroup.
+	// We name it based on the group, since it will represent the group
+	jobspec := utils.PreparePodJobSpec(&pod, groupName)
+	podGroupManager.log.Info("[PodGroup AskFlux] Inspect pod info, jobspec: %s\n", jobspec)
+	conn, err := grpc.Dial("127.0.0.1:4242", grpc.WithInsecure())
+
+	// TODO change this to just return fmt.Errorf
+	if err != nil {
+		podGroupManager.log.Error("[PodGroup AskFlux] Error connecting to server: %v\n", err)
+		return nodes, err
+	}
+	defer conn.Close()
+
+	grpcclient := pb.NewFluxcliServiceClient(conn)
+	_, cancel := context.WithTimeout(context.Background(), 200*time.Second)
+	defer cancel()
+
+	request := &pb.MatchRequest{
+		Ps:      jobspec,
+		Request: "allocate",
+		Count:   podGroup.Spec.MinMember,
+	}
+
+	// An error here is an error with making the request
+	response, err := grpcclient.Match(context.Background(), request)
+	if err != nil {
+		podGroupManager.log.Warning("[PodGroup AskFlux] did not receive any match response: %v\n", err)
+		return nodes, err
+	}
+
+	// TODO GetPodID should be renamed, because it will reflect the group
+	podGroupManager.log.Info("[PodGroup AskFlux] Match response ID %s\n", response.GetPodID())
+
+	// Get the nodelist and inspect
+	nodelist := response.GetNodelist()
+	for _, node := range nodelist {
+		nodes = append(nodes, node.NodeID)
+	}
+	jobid := uint64(response.GetJobID())
+	podGroupManager.log.Info("[PodGroup AskFlux] parsed node pods list %s for job id %d\n", nodes, jobid)
+
+	// TODO would be nice to actually be able to ask flux jobs -a to fluence
+	// That way we can verify assignments, etc.
+	podGroupManager.mutex.Lock()
+	podGroupManager.groupToJobId[groupName] = jobid
+	podGroupManager.mutex.Unlock()
+	return nodes, nil
+}
+
+// cancelFluxJob cancels the flux job for a pod.
+// We assume that the cancelled job also means deleting the pod group +func (podGroupManager *PodGroupManager) cancelFluxJob(groupName string, pod *corev1.Pod) error { + + jobid, exists := podGroupManager.groupToJobId[groupName] + + // The job was already cancelled by another pod + if !exists { + podGroupManager.log.Info("[PodGroup cancelFluxJob] Request for cancel of group %s is already complete.", groupName) + return nil + } + podGroupManager.log.Info("[PodGroup cancelFluxJob] Cancel flux job: %v for group %s", jobid, groupName) + + // This first error is about connecting to the server + conn, err := grpc.Dial("127.0.0.1:4242", grpc.WithInsecure()) + if err != nil { + podGroupManager.log.Error("[PodGroup cancelFluxJob] Error connecting to server: %v", err) + return err + } + defer conn.Close() + + grpcclient := pb.NewFluxcliServiceClient(conn) + _, cancel := context.WithTimeout(context.Background(), 200*time.Second) + defer cancel() + + // This error reflects the success or failure of the cancel request + request := &pb.CancelRequest{JobID: int64(jobid)} + response, err := grpcclient.Cancel(context.Background(), request) + if err != nil { + podGroupManager.log.Error("[PodGroup cancelFluxJob] did not receive any cancel response: %v", err) + return err + } + podGroupManager.log.Info("[PodGroup cancelFluxJob] Job cancellation for group %s result: %d", groupName, response.Error) + + // And this error is if the cancel was successful or not + if response.Error == 0 { + podGroupManager.log.Info("[PodGroup cancelFluxJob] Successful cancel of flux job: %d for group %s", jobid, groupName) + podGroupManager.cleanup(pod, groupName) + } else { + podGroupManager.log.Warning("[PodGroup cancelFluxJob] Failed to cancel flux job %d for group %s", jobid, groupName) + } + return nil +} + +// cleanup deletes the group name from groupToJobId, and pods names from the node lookup +func (podGroupManager *PodGroupManager) cleanup(pod *corev1.Pod, groupName string) { + + delete(podGroupManager.groupToJobId, groupName) + + // Clean up previous pod->node assignments + pods, err := podGroupManager.podLister.Pods(pod.Namespace).List( + labels.SelectorFromSet(labels.Set{v1alpha1.PodGroupLabel: groupName}), + ) + // TODO need to handle this / understand why it's the case + if err != nil { + return + } + for _, pod := range pods { + delete(podGroupManager.podToNode, pod.Name) + } +} + +// UpdatePod is called on an update, and the old and new object are presented +func (podGroupManager *PodGroupManager) UpdatePod(oldObj, newObj interface{}) { + + oldPod := oldObj.(*corev1.Pod) + newPod := newObj.(*corev1.Pod) + + // a pod is updated, get the group + // TODO should we be checking group / size for old vs new? 
+	groupName, podGroup := podGroupManager.GetPodGroup(context.TODO(), oldPod)
+
+	// If PodGroup is nil, still try to look up a faux name
+	// TODO need to check if this might be problematic
+	if podGroup == nil {
+		podGroup = fgroup.CreateFakeGroup(oldPod)
+		groupName = podGroup.Name
+	}
+
+	podGroupManager.log.Verbose("[PodGroup UpdatePod] Processing event for pod %s in group %s from %s to %s", newPod.Name, groupName, oldPod.Status.Phase, newPod.Status.Phase)
+
+	switch newPod.Status.Phase {
+	case corev1.PodPending:
+		// in this state we don't know if a pod is going to be running, thus we don't need to update the job map
+	case corev1.PodRunning:
+		// if a pod starts running, we can add its state to the delta graph if it is scheduled by another scheduler
+	case corev1.PodSucceeded:
+		podGroupManager.log.Info("[PodGroup UpdatePod] Pod %s succeeded, Fluence needs to free the resources", newPod.Name)
+
+		podGroupManager.mutex.Lock()
+		defer podGroupManager.mutex.Unlock()
+
+		// Do we have the group id in our cache? If yes, we haven't deleted the jobid yet.
+		// I am worried here that if some pods are succeeded and others pending, this could
+		// be a mistake - fluence would schedule it again
+		_, exists := podGroupManager.groupToJobId[groupName]
+		if exists {
+			podGroupManager.cancelFluxJob(groupName, oldPod)
+		} else {
+			podGroupManager.log.Verbose("[PodGroup UpdatePod] Succeeded pod %s/%s in group %s doesn't have flux jobid", newPod.Namespace, newPod.Name, groupName)
+		}
+
+	case corev1.PodFailed:
+
+		// A corner case that needs to be tested: the pod exit code is not 0; this can be created with a segmentation fault in the pi test
+		podGroupManager.log.Warning("[PodGroup UpdatePod] Pod %s in group %s failed, Fluence needs to free the resources", newPod.Name, groupName)
+
+		podGroupManager.mutex.Lock()
+		defer podGroupManager.mutex.Unlock()
+
+		_, exists := podGroupManager.groupToJobId[groupName]
+		if exists {
+			podGroupManager.cancelFluxJob(groupName, oldPod)
+		} else {
+			podGroupManager.log.Error("[PodGroup UpdatePod] Failed pod %s/%s in group %s doesn't have flux jobid", newPod.Namespace, newPod.Name, groupName)
+		}
+	case corev1.PodUnknown:
+		// don't know how to deal with it as it's an unknown phase
+	default:
+		// shouldn't enter this branch
+	}
+}
+
+// DeletePod handles the delete event
+func (podGroupManager *PodGroupManager) DeletePod(podObj interface{}) {
+	pod := podObj.(*corev1.Pod)
+	groupName, podGroup := podGroupManager.GetPodGroup(context.TODO(), pod)
+
+	// If PodGroup is nil, still try to look up a faux name
+	if podGroup == nil {
+		podGroup = fgroup.CreateFakeGroup(pod)
+		groupName = podGroup.Name
+	}
+
+	podGroupManager.log.Verbose("[PodGroup DeletePod] Deleted pod %s in group %s has status %s", pod.Name, groupName, pod.Status.Phase)
+	switch pod.Status.Phase {
+	case corev1.PodSucceeded:
+	case corev1.PodPending:
+		podGroupManager.log.Verbose("[PodGroup DeletePod] Pod %s completed and is Pending termination, Fluence needs to free the resources", pod.Name)
+
+		podGroupManager.mutex.Lock()
+		defer podGroupManager.mutex.Unlock()
+
+		_, exists := podGroupManager.groupToJobId[groupName]
+		if exists {
+			
podGroupManager.cancelFluxJob(groupName, pod) + } else { + podGroupManager.log.Info("[PodGroup DeletePod] Deleted pod %s/%s in group %s doesn't have flux jobid", pod.Namespace, pod.Name, groupName) + } + } +} diff --git a/sig-scheduler-plugins/pkg/fluence/fluence.go b/sig-scheduler-plugins/pkg/fluence/fluence.go index fec0a35..44f0349 100644 --- a/sig-scheduler-plugins/pkg/fluence/fluence.go +++ b/sig-scheduler-plugins/pkg/fluence/fluence.go @@ -1,5 +1,5 @@ /* -Copyright 2022 The Kubernetes Authors. +Copyright 2020 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -19,378 +19,341 @@ package fluence import ( "context" "fmt" - "os" "sync" "time" - "google.golang.org/grpc" - v1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/fields" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/util/sets" + + "sigs.k8s.io/scheduler-plugins/pkg/logger" + + corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/runtime" - "k8s.io/client-go/informers" clientscheme "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/tools/cache" + + fgroup "sigs.k8s.io/scheduler-plugins/pkg/fluence/group" + flabel "sigs.k8s.io/scheduler-plugins/pkg/fluence/labels" + corev1helpers "k8s.io/component-helpers/scheduling/corev1" - "k8s.io/klog/v2" "k8s.io/kubernetes/pkg/scheduler/framework" - "k8s.io/kubernetes/pkg/scheduler/metrics" - "sigs.k8s.io/controller-runtime/pkg/client" + + "sigs.k8s.io/scheduler-plugins/apis/scheduling" "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1" - coschedulingcore "sigs.k8s.io/scheduler-plugins/pkg/coscheduling/core" fcore "sigs.k8s.io/scheduler-plugins/pkg/fluence/core" - pb "sigs.k8s.io/scheduler-plugins/pkg/fluence/fluxcli-grpc" - "sigs.k8s.io/scheduler-plugins/pkg/fluence/utils" ) +// Fluence schedules pods in a group using Fluxion as a backend +// We inherit cosched.Coscheduling to use some of the primary functions type Fluence struct { - mutex sync.Mutex - handle framework.Handle - podNameToJobId map[string]uint64 - pgMgr coschedulingcore.Manager + mutex sync.Mutex + client client.Client + frameworkHandler framework.Handle + podGroupManager fcore.Manager + scheduleTimeout *time.Duration + podGroupBackoff *time.Duration + log *logger.DebugLogger } -var _ framework.QueueSortPlugin = &Fluence{} -var _ framework.PreFilterPlugin = &Fluence{} -var _ framework.FilterPlugin = &Fluence{} +var ( + _ framework.QueueSortPlugin = &Fluence{} + _ framework.PreFilterPlugin = &Fluence{} + _ framework.FilterPlugin = &Fluence{} -// Name is the name of the plugin used in the Registry and configurations. 
-const Name = "Fluence" + _ framework.PostFilterPlugin = &Fluence{} + _ framework.PermitPlugin = &Fluence{} + _ framework.ReservePlugin = &Fluence{} -func (f *Fluence) Name() string { - return Name -} + _ framework.EnqueueExtensions = &Fluence{} -// initialize and return a new Flux Plugin -// Note from vsoch: seems analogous to: -// https://github.com/kubernetes-sigs/scheduler-plugins/blob/master/pkg/coscheduling/coscheduling.go#L63 -func New(_ runtime.Object, handle framework.Handle) (framework.Plugin, error) { + // Set to be the same as coscheduling + permitWaitingTimeSeconds int64 = 300 + podGroupBackoffSeconds int64 = 0 +) - f := &Fluence{handle: handle, podNameToJobId: make(map[string]uint64)} - klog.Info("Create plugin") - ctx := context.TODO() - fcore.Init() +const ( + // Name is the name of the plugin used in Registry and configurations. + Name = "Fluence" +) - fluxPodsInformer := handle.SharedInformerFactory().Core().V1().Pods().Informer() - fluxPodsInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ - UpdateFunc: f.updatePod, - DeleteFunc: f.deletePod, - }) +// Initialize and return a new Fluence Custom Scheduler Plugin +func New(_ context.Context, obj runtime.Object, handle framework.Handle) (framework.Plugin, error) { - go fluxPodsInformer.Run(ctx.Done()) - klog.Info("Create generic pod informer") + ctx := context.TODO() + + // Make fluence his own little logger! + // This can eventually be a flag, but just going to set for now + // It shall be a very chonky file. Oh lawd he comin! + l := logger.NewDebugLogger(logger.LevelDebug, "/tmp/fluence.log") scheme := runtime.NewScheme() _ = clientscheme.AddToScheme(scheme) - _ = v1.AddToScheme(scheme) + _ = corev1.AddToScheme(scheme) _ = v1alpha1.AddToScheme(scheme) + client, err := client.New(handle.KubeConfig(), client.Options{Scheme: scheme}) if err != nil { return nil, err } - fieldSelector, err := fields.ParseSelector(",status.phase!=" + string(v1.PodSucceeded) + ",status.phase!=" + string(v1.PodFailed)) - if err != nil { - klog.ErrorS(err, "ParseSelector failed") - os.Exit(1) - } - informerFactory := informers.NewSharedInformerFactoryWithOptions(handle.ClientSet(), 0, informers.WithTweakListOptions(func(opt *metav1.ListOptions) { - opt.FieldSelector = fieldSelector.String() - })) - podInformer := informerFactory.Core().V1().Pods() - - scheduleTimeDuration := time.Duration(500) * time.Second + // Performance improvement when retrieving list of objects by namespace or we'll log 'index not exist' warning. + fluxPodsInformer := handle.SharedInformerFactory().Core().V1().Pods().Informer() + fluxPodsInformer.AddIndexers(cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}) - pgMgr := coschedulingcore.NewPodGroupManager( + // PermitWaitingTimeSeconds is the waiting timeout in seconds. + scheduleTimeDuration := time.Duration(permitWaitingTimeSeconds) * time.Second + podGroupManager := fcore.NewPodGroupManager( client, handle.SnapshotSharedLister(), &scheduleTimeDuration, - podInformer, + // Keep the podInformer (from frameworkHandle) as the single source of Pods. 
+ handle.SharedInformerFactory().Core().V1().Pods(), + l, ) - f.pgMgr = pgMgr - // stopCh := make(chan struct{}) - // defer close(stopCh) - // informerFactory.Start(stopCh) - informerFactory.Start(ctx.Done()) + // Event handlers to call on podGroupManager + fluxPodsInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ + UpdateFunc: podGroupManager.UpdatePod, + DeleteFunc: podGroupManager.DeletePod, + }) + go fluxPodsInformer.Run(ctx.Done()) - if !cache.WaitForCacheSync(ctx.Done(), podInformer.Informer().HasSynced) { - err := fmt.Errorf("WaitForCacheSync failed") - klog.ErrorS(err, "Cannot sync caches") - return nil, err + backoffSeconds := time.Duration(podGroupBackoffSeconds) * time.Second + plugin := &Fluence{ + frameworkHandler: handle, + podGroupManager: podGroupManager, + scheduleTimeout: &scheduleTimeDuration, + log: l, + podGroupBackoff: &backoffSeconds, } - klog.Info("Fluence starts") - return f, nil + // TODO this is not supported yet + // Account for resources in running cluster + err = plugin.RegisterExisting(ctx) + return plugin, err } -// Less is used to sort pods in the scheduling queue in the following order. -// 1. Compare the priorities of Pods. -// 2. Compare the initialization timestamps of PodGroups or Pods. -// 3. Compare the keys of PodGroups/Pods: /. -func (f *Fluence) Less(podInfo1, podInfo2 *framework.QueuedPodInfo) bool { - klog.Infof("ordering pods from Coscheduling") - prio1 := corev1helpers.PodPriority(podInfo1.Pod) - prio2 := corev1helpers.PodPriority(podInfo2.Pod) - if prio1 != prio2 { - return prio1 > prio2 - } - creationTime1 := f.pgMgr.GetCreationTimestamp(podInfo1.Pod, *podInfo1.InitialAttemptTimestamp) - creationTime2 := f.pgMgr.GetCreationTimestamp(podInfo2.Pod, *podInfo2.InitialAttemptTimestamp) - if creationTime1.Equal(creationTime2) { - return coschedulingcore.GetNamespacedName(podInfo1.Pod) < coschedulingcore.GetNamespacedName(podInfo2.Pod) - } - return creationTime1.Before(creationTime2) +func (fluence *Fluence) Name() string { + return Name } -func (f *Fluence) PreFilter(ctx context.Context, state *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) { - klog.Infof("Examining the pod") - var err error - var nodename string - if pgname, ok := f.isGroup(ctx, pod); ok { - if !fcore.HaveList(pgname) { - klog.Infof("Getting a pod group") - groupSize, _ := f.groupPreFilter(ctx, pod) - if _, err = f.AskFlux(ctx, pod, groupSize); err != nil { - return nil, framework.NewStatus(framework.Unschedulable, err.Error()) - } - } - nodename, err = fcore.GetNextNode(pgname) - klog.Infof("Node Selected %s (%s:%s)", nodename, pod.Name, pgname) - if err != nil { - return nil, framework.NewStatus(framework.Unschedulable, err.Error()) - } - } else { - nodename, err = f.AskFlux(ctx, pod, 1) - if err != nil { - return nil, framework.NewStatus(framework.Unschedulable, err.Error()) - } +// Fluence has added delete, although I wonder if update includes that signal +// and it's redundant? 
+func (fluence *Fluence) EventsToRegister() []framework.ClusterEventWithHint { + // To register a custom event, follow the naming convention at: + // https://git.k8s.io/kubernetes/pkg/scheduler/eventhandlers.go#L403-L410 + podGroupGVK := fmt.Sprintf("podgroups.v1alpha1.%v", scheduling.GroupName) + return []framework.ClusterEventWithHint{ + {Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.Add | framework.Delete}}, + {Event: framework.ClusterEvent{Resource: framework.GVK(podGroupGVK), ActionType: framework.Add | framework.Update | framework.Delete}}, } - - klog.Info("Node Selected: ", nodename) - state.Write(framework.StateKey(pod.Name), &fcore.FluxStateData{NodeName: nodename}) - return nil, framework.NewStatus(framework.Success, "") - } -func (f *Fluence) isGroup(ctx context.Context, pod *v1.Pod) (string, bool) { - pgFullName, pg := f.pgMgr.GetPodGroup(ctx, pod) - if pg == nil { - klog.InfoS("Not in group", "pod", klog.KObj(pod)) - return "", false - } - return pgFullName, true -} +// TODO we need to account for affinity here +func (fluence *Fluence) Filter( + ctx context.Context, + cycleState *framework.CycleState, + pod *corev1.Pod, + nodeInfo *framework.NodeInfo, +) *framework.Status { -func (f *Fluence) groupPreFilter(ctx context.Context, pod *v1.Pod) (int, error) { - // klog.InfoS("Flux Pre-Filter", "pod", klog.KObj(pod)) - klog.InfoS("Flux Pre-Filter", "pod labels", pod.Labels) - _, pg := f.pgMgr.GetPodGroup(ctx, pod) - if pg == nil { - klog.InfoS("Not in group", "pod", klog.KObj(pod)) - return 0, nil - } + fluence.log.Verbose("[Fluence Filter] Filtering input node %s", nodeInfo.Node().Name) + state, err := cycleState.Read(framework.StateKey(pod.Name)) - klog.Info("pod group members ", pg.Spec.MinMember) - return int(pg.Spec.MinMember), nil -} + // No error means we retrieved the state + if err == nil { -func (f *Fluence) Filter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status { - klog.Info("Filtering input node ", nodeInfo.Node().Name) - if v, e := cycleState.Read(framework.StateKey(pod.Name)); e == nil { - if value, ok := v.(*fcore.FluxStateData); ok && value.NodeName != nodeInfo.Node().Name { + // Try to convert the state to FluxStateDate + value, ok := state.(*fcore.FluxStateData) + + // If we have state data that isn't equal to the current assignment, no go + if ok && value.NodeName != nodeInfo.Node().Name { return framework.NewStatus(framework.Unschedulable, "pod is not permitted") } else { - klog.Info("Filter: node selected by Flux ", value.NodeName) + fluence.log.Info("[Fluence Filter] node %s selected for %s\n", value.NodeName, pod.Name) } } - return framework.NewStatus(framework.Success) } -func (f *Fluence) PreFilterExtensions() framework.PreFilterExtensions { - return nil -} - -func (f *Fluence) AskFlux(ctx context.Context, pod *v1.Pod, count int) (string, error) { - // clean up previous match if a pod has already allocated previously - f.mutex.Lock() - _, isPodAllocated := f.podNameToJobId[pod.Name] - f.mutex.Unlock() - - if isPodAllocated { - klog.Info("Clean up previous allocation") - f.mutex.Lock() - f.cancelFluxJobForPod(pod.Name) - f.mutex.Unlock() - } - - jobspec := utils.InspectPodInfo(pod) - conn, err := grpc.Dial("127.0.0.1:4242", grpc.WithInsecure()) - - if err != nil { - klog.Errorf("[FluxClient] Error connecting to server: %v", err) - return "", err +// Less is used to sort pods in the scheduling queue in the following order. +// 1. Compare the priorities of Pods. 
+// 2. Compare the initialization timestamps of PodGroups or Pods. +// 3. Compare the keys of PodGroups/Pods: /. +func (fluence *Fluence) Less(podInfo1, podInfo2 *framework.QueuedPodInfo) bool { + prio1 := corev1helpers.PodPriority(podInfo1.Pod) + prio2 := corev1helpers.PodPriority(podInfo2.Pod) + if prio1 != prio2 { + return prio1 > prio2 } - defer conn.Close() - - grpcclient := pb.NewFluxcliServiceClient(conn) - _, cancel := context.WithTimeout(context.Background(), 200*time.Second) - defer cancel() - request := &pb.MatchRequest{ - Ps: jobspec, - Request: "allocate", - Count: int32(count)} + // Important: this GetPodGroup returns the first name as the Namespaced one, + // which is what fluence needs to distinguish between namespaces. Just the + // name could be replicated between different namespaces + ctx := context.TODO() + name1, podGroup1 := fluence.podGroupManager.GetPodGroup(ctx, podInfo1.Pod) + name2, podGroup2 := fluence.podGroupManager.GetPodGroup(ctx, podInfo2.Pod) - r, err2 := grpcclient.Match(context.Background(), request) - if err2 != nil { - klog.Errorf("[FluxClient] did not receive any match response: %v", err2) - return "", err - } + // Fluence can only compare if we have two known groups. + // This tries for that first, and falls back to the initial attempt timestamp + creationTime1 := fgroup.GetCreationTimestamp(name1, podGroup1, podInfo1) + creationTime2 := fgroup.GetCreationTimestamp(name2, podGroup2, podInfo2) - klog.Infof("[FluxClient] response podID %s", r.GetPodID()) - - _, ok := f.isGroup(ctx, pod) - if count > 1 || ok { - pgFullName, _ := f.pgMgr.GetPodGroup(ctx, pod) - nodelist := fcore.CreateNodePodsList(r.GetNodelist(), pgFullName) - klog.Infof("[FluxClient] response nodeID %s", r.GetNodelist()) - klog.Info("[FluxClient] Parsed Nodelist ", nodelist) - jobid := uint64(r.GetJobID()) - - f.mutex.Lock() - f.podNameToJobId[pod.Name] = jobid - klog.Info("Check job set: ", f.podNameToJobId) - f.mutex.Unlock() - } else { - nodename := r.GetNodelist()[0].GetNodeID() - jobid := uint64(r.GetJobID()) - - f.mutex.Lock() - f.podNameToJobId[pod.Name] = jobid - klog.Info("Check job set: ", f.podNameToJobId) - f.mutex.Unlock() - - return nodename, nil + // If they are the same, fall back to sorting by name. + if creationTime1.Equal(&creationTime2) { + return fcore.GetNamespacedName(podInfo1.Pod) < fcore.GetNamespacedName(podInfo2.Pod) } + return creationTime1.Before(&creationTime2) - return "", nil } -func (f *Fluence) cancelFluxJobForPod(podName string) error { - jobid := f.podNameToJobId[podName] - - klog.Infof("Cancel flux job: %v for pod %s", jobid, podName) - - start := time.Now() - - conn, err := grpc.Dial("127.0.0.1:4242", grpc.WithInsecure()) - - if err != nil { - klog.Errorf("[FluxClient] Error connecting to server: %v", err) - return err - } - defer conn.Close() - - grpcclient := pb.NewFluxcliServiceClient(conn) - _, cancel := context.WithTimeout(context.Background(), 200*time.Second) - defer cancel() +// PreFilterExtensions allow for callbacks on filtered states +// This is required to be defined for a PreFilter plugin +// https://github.com/kubernetes/kubernetes/blob/master/pkg/scheduler/framework/interface.go#L383 +func (fluence *Fluence) PreFilterExtensions() framework.PreFilterExtensions { + return nil +} - request := &pb.CancelRequest{ - JobID: int64(jobid), +// PreFilter performs the following validations. +// 1. Whether the PodGroup that the Pod belongs to is on the deny list. +// 2. 
Whether the total number of pods in a PodGroup is less than its `minMember`. +func (fluence *Fluence) PreFilter( + ctx context.Context, + state *framework.CycleState, + pod *corev1.Pod, +) (*framework.PreFilterResult, *framework.Status) { + + // Quick check if the pod is already scheduled + fluence.mutex.Lock() + node := fluence.podGroupManager.GetPodNode(pod) + fluence.mutex.Unlock() + if node != "" { + fluence.log.Info("[Fluence PreFilter] assigned pod %s to node %s\n", pod.Name, node) + result := framework.PreFilterResult{NodeNames: sets.New(node)} + return &result, framework.NewStatus(framework.Success, "") } + fluence.log.Info("[Fluence PreFilter] pod %s does not have a node assigned\n", pod.Name) - res, err := grpcclient.Cancel(context.Background(), request) + // This will populate the node name into the pod group manager + err := fluence.podGroupManager.PreFilter(ctx, pod, state) if err != nil { - klog.Errorf("[FluxClient] did not receive any cancel response: %v", err) - return err + fluence.log.Error("[Fluence PreFilter] failed pod %s: %s", pod.Name, err.Error()) + return nil, framework.NewStatus(framework.UnschedulableAndUnresolvable, err.Error()) } + node = fluence.podGroupManager.GetPodNode(pod) + result := framework.PreFilterResult{NodeNames: sets.New(node)} + return &result, framework.NewStatus(framework.Success, "") +} - if res.Error == 0 { - delete(f.podNameToJobId, podName) - } else { - klog.Warningf("Failed to delete pod %s from the podname-jobid map.", podName) +// PostFilter is used to reject a group of pods if a pod does not pass PreFilter or Filter. +func (fluence *Fluence) PostFilter( + ctx context.Context, + state *framework.CycleState, + pod *corev1.Pod, + filteredNodeStatusMap framework.NodeToStatusMap, +) (*framework.PostFilterResult, *framework.Status) { + + groupName, podGroup := fluence.podGroupManager.GetPodGroup(ctx, pod) + if podGroup == nil { + fluence.log.Info("Pod does not belong to any group, pod %s", pod.Name) + return &framework.PostFilterResult{}, framework.NewStatus(framework.Unschedulable, "can not find pod group") } - elapsed := metrics.SinceInSeconds(start) - klog.Info("Time elapsed (Cancel Job) :", elapsed) - - klog.Infof("Job cancellation for pod %s result: %d", podName, err) - if klog.V(2).Enabled() { - klog.Info("Check job set: after delete") - klog.Info(f.podNameToJobId) + // This explicitly checks nodes, and we can skip scheduling another pod if we already + // have the minimum. 
For fluence since we expect an exact size this likely is not needed + assigned := fluence.podGroupManager.CalculateAssignedPods(podGroup.Name, pod.Namespace) + if assigned >= int(podGroup.Spec.MinMember) { + fluence.log.Info("Assigned pods podGroup %s is assigned %s", groupName, assigned) + return &framework.PostFilterResult{}, framework.NewStatus(framework.Unschedulable) } - return nil -} -// EventHandlers -func (f *Fluence) updatePod(oldObj, newObj interface{}) { - // klog.Info("Update Pod event handler") - newPod := newObj.(*v1.Pod) - klog.Infof("Processing event for pod %s", newPod) - switch newPod.Status.Phase { - case v1.PodPending: - // in this state we don't know if a pod is going to be running, thus we don't need to update job map - case v1.PodRunning: - // if a pod is start running, we can add it state to the delta graph if it is scheduled by other scheduler - case v1.PodSucceeded: - klog.Infof("Pod %s succeeded, Fluence needs to free the resources", newPod.Name) - - f.mutex.Lock() - defer f.mutex.Unlock() - - if _, ok := f.podNameToJobId[newPod.Name]; ok { - f.cancelFluxJobForPod(newPod.Name) - } else { - klog.Infof("Succeeded pod %s/%s doesn't have flux jobid", newPod.Namespace, newPod.Name) - } - case v1.PodFailed: - // a corner case need to be tested, the pod exit code is not 0, can be created with segmentation fault pi test - klog.Warningf("Pod %s failed, Fluence needs to free the resources", newPod.Name) + // Took out percentage chcek here, doesn't make sense to me. - f.mutex.Lock() - defer f.mutex.Unlock() + // It's based on an implicit assumption: if the nth Pod failed, + // it's inferrable other Pods belonging to the same PodGroup would be very likely to fail. + fluence.frameworkHandler.IterateOverWaitingPods(func(waitingPod framework.WaitingPod) { + if waitingPod.GetPod().Namespace == pod.Namespace && flabel.GetPodGroupLabel(waitingPod.GetPod()) == podGroup.Name { + fluence.log.Info("PostFilter rejects the pod for podGroup %s and pod %s", groupName, waitingPod.GetPod().Name) + waitingPod.Reject(fluence.Name(), "optimistic rejection in PostFilter") + } + }) - if _, ok := f.podNameToJobId[newPod.Name]; ok { - f.cancelFluxJobForPod(newPod.Name) - } else { - klog.Errorf("Failed pod %s/%s doesn't have flux jobid", newPod.Namespace, newPod.Name) + if fluence.podGroupBackoff != nil { + pods, err := fluence.frameworkHandler.SharedInformerFactory().Core().V1().Pods().Lister().Pods(pod.Namespace).List( + labels.SelectorFromSet(labels.Set{v1alpha1.PodGroupLabel: flabel.GetPodGroupLabel(pod)}), + ) + if err == nil && len(pods) >= int(podGroup.Spec.MinMember) { + fluence.podGroupManager.BackoffPodGroup(groupName, *fluence.podGroupBackoff) } - case v1.PodUnknown: - // don't know how to deal with it as it's unknown phase - default: - // shouldn't enter this branch } -} -func (f *Fluence) deletePod(podObj interface{}) { - klog.Info("Delete Pod event handler") + fluence.podGroupManager.DeletePermittedPodGroup(groupName) + return &framework.PostFilterResult{}, framework.NewStatus(framework.Unschedulable, + fmt.Sprintf("PodGroup %v gets rejected due to Pod %v is unschedulable even after PostFilter", groupName, pod.Name)) +} - pod := podObj.(*v1.Pod) - klog.Info("Pod status: ", pod.Status.Phase) - switch pod.Status.Phase { - case v1.PodSucceeded: - case v1.PodPending: - klog.Infof("Pod %s completed and is Pending termination, Fluence needs to free the resources", pod.Name) +// Permit is the functions invoked by the framework at "Permit" extension point. 
+func (fluence *Fluence) Permit( + ctx context.Context, + state *framework.CycleState, + pod *corev1.Pod, + nodeName string, +) (*framework.Status, time.Duration) { + + fluence.log.Info("Checking permit for pod %s to node %s", pod.Name, nodeName) + waitTime := *fluence.scheduleTimeout + s := fluence.podGroupManager.Permit(ctx, state, pod) + var retStatus *framework.Status + switch s { + case fcore.PodGroupNotSpecified: + fluence.log.Info("Checking permit for pod %s to node %s: PodGroupNotSpecified", pod.Name, nodeName) + return framework.NewStatus(framework.Success, ""), 0 + case fcore.PodGroupNotFound: + fluence.log.Info("Checking permit for pod %s to node %s: PodGroupNotFound", pod.Name, nodeName) + return framework.NewStatus(framework.Unschedulable, "PodGroup not found"), 0 + case fcore.Wait: + fluence.log.Info("Pod %s is waiting to be scheduled to node %s", pod.Name, nodeName) + _, podGroup := fluence.podGroupManager.GetPodGroup(ctx, pod) + if wait := fgroup.GetWaitTimeDuration(podGroup, fluence.scheduleTimeout); wait != 0 { + waitTime = wait + } + retStatus = framework.NewStatus(framework.Wait) + + // We will also request to move the sibling pods back to activeQ. + fluence.podGroupManager.ActivateSiblings(pod, state) + case fcore.Success: + podGroupFullName := flabel.GetPodGroupFullName(pod) + fluence.frameworkHandler.IterateOverWaitingPods(func(waitingPod framework.WaitingPod) { + if flabel.GetPodGroupFullName(waitingPod.GetPod()) == podGroupFullName { + fluence.log.Info("Permit allows pod %s", waitingPod.GetPod().Name) + waitingPod.Allow(fluence.Name()) + } + }) + fluence.log.Info("Permit allows pod %s", pod.Name) + retStatus = framework.NewStatus(framework.Success) + waitTime = 0 + } - f.mutex.Lock() - defer f.mutex.Unlock() + return retStatus, waitTime +} - if _, ok := f.podNameToJobId[pod.Name]; ok { - f.cancelFluxJobForPod(pod.Name) - } else { - klog.Infof("Terminating pod %s/%s doesn't have flux jobid", pod.Namespace, pod.Name) - } - case v1.PodRunning: - f.mutex.Lock() - defer f.mutex.Unlock() +// Reserve is the functions invoked by the framework at "reserve" extension point. +func (fluence *Fluence) Reserve(ctx context.Context, state *framework.CycleState, pod *corev1.Pod, nodeName string) *framework.Status { + return nil +} - if _, ok := f.podNameToJobId[pod.Name]; ok { - f.cancelFluxJobForPod(pod.Name) - } else { - klog.Infof("Deleted pod %s/%s doesn't have flux jobid", pod.Namespace, pod.Name) - } +// Unreserve rejects all other Pods in the PodGroup when one of the pods in the group times out. +func (fluence *Fluence) Unreserve(ctx context.Context, state *framework.CycleState, pod *corev1.Pod, nodeName string) { + groupName, podGroup := fluence.podGroupManager.GetPodGroup(ctx, pod) + if podGroup == nil { + return } + fluence.frameworkHandler.IterateOverWaitingPods(func(waitingPod framework.WaitingPod) { + if waitingPod.GetPod().Namespace == pod.Namespace && flabel.GetPodGroupLabel(waitingPod.GetPod()) == podGroup.Name { + fluence.log.Info("Unreserve rejects pod %s in group %s", waitingPod.GetPod().Name, groupName) + waitingPod.Reject(fluence.Name(), "rejection in Unreserve") + } + }) + fluence.podGroupManager.DeletePermittedPodGroup(groupName) } diff --git a/sig-scheduler-plugins/pkg/fluence/fluxcli-grpc/fluxcli.pb.go b/sig-scheduler-plugins/pkg/fluence/fluxcli-grpc/fluxcli.pb.go deleted file mode 100644 index e317af2..0000000 --- a/sig-scheduler-plugins/pkg/fluence/fluxcli-grpc/fluxcli.pb.go +++ /dev/null @@ -1,838 +0,0 @@ -// Code generated by protoc-gen-go. DO NOT EDIT. 
-// versions: -// protoc-gen-go v1.26.0 -// protoc v3.15.8 -// source: fluence/fluxcli-grpc/fluxcli.proto - -package fluxcli - -import ( - protoreflect "google.golang.org/protobuf/reflect/protoreflect" - protoimpl "google.golang.org/protobuf/runtime/protoimpl" - reflect "reflect" - sync "sync" -) - -const ( - // Verify that this generated code is sufficiently up-to-date. - _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) - // Verify that runtime/protoimpl is sufficiently up-to-date. - _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) -) - -type PodSpec struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"` - Container string `protobuf:"bytes,2,opt,name=container,proto3" json:"container,omitempty"` - Cpu int32 `protobuf:"varint,3,opt,name=cpu,proto3" json:"cpu,omitempty"` - Memory int64 `protobuf:"varint,4,opt,name=memory,proto3" json:"memory,omitempty"` - Gpu int64 `protobuf:"varint,5,opt,name=gpu,proto3" json:"gpu,omitempty"` - Storage int64 `protobuf:"varint,6,opt,name=storage,proto3" json:"storage,omitempty"` - Labels []string `protobuf:"bytes,7,rep,name=labels,proto3" json:"labels,omitempty"` -} - -func (x *PodSpec) Reset() { - *x = PodSpec{} - if protoimpl.UnsafeEnabled { - mi := &file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[0] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *PodSpec) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*PodSpec) ProtoMessage() {} - -func (x *PodSpec) ProtoReflect() protoreflect.Message { - mi := &file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[0] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use PodSpec.ProtoReflect.Descriptor instead. 
-func (*PodSpec) Descriptor() ([]byte, []int) { - return file_fluence_fluxcli_grpc_fluxcli_proto_rawDescGZIP(), []int{0} -} - -func (x *PodSpec) GetId() string { - if x != nil { - return x.Id - } - return "" -} - -func (x *PodSpec) GetContainer() string { - if x != nil { - return x.Container - } - return "" -} - -func (x *PodSpec) GetCpu() int32 { - if x != nil { - return x.Cpu - } - return 0 -} - -func (x *PodSpec) GetMemory() int64 { - if x != nil { - return x.Memory - } - return 0 -} - -func (x *PodSpec) GetGpu() int64 { - if x != nil { - return x.Gpu - } - return 0 -} - -func (x *PodSpec) GetStorage() int64 { - if x != nil { - return x.Storage - } - return 0 -} - -func (x *PodSpec) GetLabels() []string { - if x != nil { - return x.Labels - } - return nil -} - -// The Match request message (allocate, allocate_orelse_reserve) -type MatchRequest struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - Ps *PodSpec `protobuf:"bytes,1,opt,name=ps,proto3" json:"ps,omitempty"` - Request string `protobuf:"bytes,2,opt,name=request,proto3" json:"request,omitempty"` - Count int32 `protobuf:"varint,3,opt,name=count,proto3" json:"count,omitempty"` -} - -func (x *MatchRequest) Reset() { - *x = MatchRequest{} - if protoimpl.UnsafeEnabled { - mi := &file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[1] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *MatchRequest) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*MatchRequest) ProtoMessage() {} - -func (x *MatchRequest) ProtoReflect() protoreflect.Message { - mi := &file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[1] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use MatchRequest.ProtoReflect.Descriptor instead. -func (*MatchRequest) Descriptor() ([]byte, []int) { - return file_fluence_fluxcli_grpc_fluxcli_proto_rawDescGZIP(), []int{1} -} - -func (x *MatchRequest) GetPs() *PodSpec { - if x != nil { - return x.Ps - } - return nil -} - -func (x *MatchRequest) GetRequest() string { - if x != nil { - return x.Request - } - return "" -} - -func (x *MatchRequest) GetCount() int32 { - if x != nil { - return x.Count - } - return 0 -} - -// The Nodes/Cluster Update Status -type NodeAlloc struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - NodeID string `protobuf:"bytes,1,opt,name=nodeID,proto3" json:"nodeID,omitempty"` - Tasks int32 `protobuf:"varint,2,opt,name=tasks,proto3" json:"tasks,omitempty"` -} - -func (x *NodeAlloc) Reset() { - *x = NodeAlloc{} - if protoimpl.UnsafeEnabled { - mi := &file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[2] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *NodeAlloc) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*NodeAlloc) ProtoMessage() {} - -func (x *NodeAlloc) ProtoReflect() protoreflect.Message { - mi := &file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[2] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use NodeAlloc.ProtoReflect.Descriptor instead. 
-func (*NodeAlloc) Descriptor() ([]byte, []int) { - return file_fluence_fluxcli_grpc_fluxcli_proto_rawDescGZIP(), []int{2} -} - -func (x *NodeAlloc) GetNodeID() string { - if x != nil { - return x.NodeID - } - return "" -} - -func (x *NodeAlloc) GetTasks() int32 { - if x != nil { - return x.Tasks - } - return 0 -} - -// The Match response message -type MatchResponse struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - PodID string `protobuf:"bytes,1,opt,name=podID,proto3" json:"podID,omitempty"` - Nodelist []*NodeAlloc `protobuf:"bytes,2,rep,name=nodelist,proto3" json:"nodelist,omitempty"` - JobID int64 `protobuf:"varint,3,opt,name=jobID,proto3" json:"jobID,omitempty"` -} - -func (x *MatchResponse) Reset() { - *x = MatchResponse{} - if protoimpl.UnsafeEnabled { - mi := &file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[3] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *MatchResponse) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*MatchResponse) ProtoMessage() {} - -func (x *MatchResponse) ProtoReflect() protoreflect.Message { - mi := &file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[3] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use MatchResponse.ProtoReflect.Descriptor instead. -func (*MatchResponse) Descriptor() ([]byte, []int) { - return file_fluence_fluxcli_grpc_fluxcli_proto_rawDescGZIP(), []int{3} -} - -func (x *MatchResponse) GetPodID() string { - if x != nil { - return x.PodID - } - return "" -} - -func (x *MatchResponse) GetNodelist() []*NodeAlloc { - if x != nil { - return x.Nodelist - } - return nil -} - -func (x *MatchResponse) GetJobID() int64 { - if x != nil { - return x.JobID - } - return 0 -} - -type CancelRequest struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - JobID int64 `protobuf:"varint,2,opt,name=jobID,proto3" json:"jobID,omitempty"` -} - -func (x *CancelRequest) Reset() { - *x = CancelRequest{} - if protoimpl.UnsafeEnabled { - mi := &file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[4] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *CancelRequest) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*CancelRequest) ProtoMessage() {} - -func (x *CancelRequest) ProtoReflect() protoreflect.Message { - mi := &file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[4] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use CancelRequest.ProtoReflect.Descriptor instead. 
-func (*CancelRequest) Descriptor() ([]byte, []int) { - return file_fluence_fluxcli_grpc_fluxcli_proto_rawDescGZIP(), []int{4} -} - -func (x *CancelRequest) GetJobID() int64 { - if x != nil { - return x.JobID - } - return 0 -} - -// The Match response message -type CancelResponse struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - JobID int64 `protobuf:"varint,1,opt,name=jobID,proto3" json:"jobID,omitempty"` - Error int32 `protobuf:"varint,2,opt,name=error,proto3" json:"error,omitempty"` -} - -func (x *CancelResponse) Reset() { - *x = CancelResponse{} - if protoimpl.UnsafeEnabled { - mi := &file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[5] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *CancelResponse) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*CancelResponse) ProtoMessage() {} - -func (x *CancelResponse) ProtoReflect() protoreflect.Message { - mi := &file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[5] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use CancelResponse.ProtoReflect.Descriptor instead. -func (*CancelResponse) Descriptor() ([]byte, []int) { - return file_fluence_fluxcli_grpc_fluxcli_proto_rawDescGZIP(), []int{5} -} - -func (x *CancelResponse) GetJobID() int64 { - if x != nil { - return x.JobID - } - return 0 -} - -func (x *CancelResponse) GetError() int32 { - if x != nil { - return x.Error - } - return 0 -} - -// The Nodes/Cluster Update Status -type NodeStatus struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - CpuAvail int32 `protobuf:"varint,1,opt,name=cpuAvail,proto3" json:"cpuAvail,omitempty"` - GpuAvail int32 `protobuf:"varint,2,opt,name=gpuAvail,proto3" json:"gpuAvail,omitempty"` - StorageAvail int64 `protobuf:"varint,3,opt,name=storageAvail,proto3" json:"storageAvail,omitempty"` - MemoryAvail int64 `protobuf:"varint,4,opt,name=memoryAvail,proto3" json:"memoryAvail,omitempty"` - AllowedPods int64 `protobuf:"varint,5,opt,name=allowedPods,proto3" json:"allowedPods,omitempty"` - NodeIP string `protobuf:"bytes,6,opt,name=nodeIP,proto3" json:"nodeIP,omitempty"` - Replication int32 `protobuf:"varint,7,opt,name=replication,proto3" json:"replication,omitempty"` -} - -func (x *NodeStatus) Reset() { - *x = NodeStatus{} - if protoimpl.UnsafeEnabled { - mi := &file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[6] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *NodeStatus) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*NodeStatus) ProtoMessage() {} - -func (x *NodeStatus) ProtoReflect() protoreflect.Message { - mi := &file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[6] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use NodeStatus.ProtoReflect.Descriptor instead. 
-func (*NodeStatus) Descriptor() ([]byte, []int) { - return file_fluence_fluxcli_grpc_fluxcli_proto_rawDescGZIP(), []int{6} -} - -func (x *NodeStatus) GetCpuAvail() int32 { - if x != nil { - return x.CpuAvail - } - return 0 -} - -func (x *NodeStatus) GetGpuAvail() int32 { - if x != nil { - return x.GpuAvail - } - return 0 -} - -func (x *NodeStatus) GetStorageAvail() int64 { - if x != nil { - return x.StorageAvail - } - return 0 -} - -func (x *NodeStatus) GetMemoryAvail() int64 { - if x != nil { - return x.MemoryAvail - } - return 0 -} - -func (x *NodeStatus) GetAllowedPods() int64 { - if x != nil { - return x.AllowedPods - } - return 0 -} - -func (x *NodeStatus) GetNodeIP() string { - if x != nil { - return x.NodeIP - } - return "" -} - -func (x *NodeStatus) GetReplication() int32 { - if x != nil { - return x.Replication - } - return 0 -} - -// The JGF response message -type JGFRequest struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - Jgf string `protobuf:"bytes,1,opt,name=jgf,proto3" json:"jgf,omitempty"` -} - -func (x *JGFRequest) Reset() { - *x = JGFRequest{} - if protoimpl.UnsafeEnabled { - mi := &file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[7] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *JGFRequest) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*JGFRequest) ProtoMessage() {} - -func (x *JGFRequest) ProtoReflect() protoreflect.Message { - mi := &file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[7] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use JGFRequest.ProtoReflect.Descriptor instead. -func (*JGFRequest) Descriptor() ([]byte, []int) { - return file_fluence_fluxcli_grpc_fluxcli_proto_rawDescGZIP(), []int{7} -} - -func (x *JGFRequest) GetJgf() string { - if x != nil { - return x.Jgf - } - return "" -} - -// The JGF response message -type JGFResponse struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - Jgf string `protobuf:"bytes,1,opt,name=jgf,proto3" json:"jgf,omitempty"` -} - -func (x *JGFResponse) Reset() { - *x = JGFResponse{} - if protoimpl.UnsafeEnabled { - mi := &file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[8] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *JGFResponse) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*JGFResponse) ProtoMessage() {} - -func (x *JGFResponse) ProtoReflect() protoreflect.Message { - mi := &file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[8] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use JGFResponse.ProtoReflect.Descriptor instead. 
-func (*JGFResponse) Descriptor() ([]byte, []int) { - return file_fluence_fluxcli_grpc_fluxcli_proto_rawDescGZIP(), []int{8} -} - -func (x *JGFResponse) GetJgf() string { - if x != nil { - return x.Jgf - } - return "" -} - -var File_fluence_fluxcli_grpc_fluxcli_proto protoreflect.FileDescriptor - -var file_fluence_fluxcli_grpc_fluxcli_proto_rawDesc = []byte{ - 0x0a, 0x22, 0x66, 0x6c, 0x75, 0x65, 0x6e, 0x63, 0x65, 0x2f, 0x66, 0x6c, 0x75, 0x78, 0x63, 0x6c, - 0x69, 0x2d, 0x67, 0x72, 0x70, 0x63, 0x2f, 0x66, 0x6c, 0x75, 0x78, 0x63, 0x6c, 0x69, 0x2e, 0x70, - 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x07, 0x66, 0x6c, 0x75, 0x78, 0x63, 0x6c, 0x69, 0x22, 0xa5, 0x01, - 0x0a, 0x07, 0x50, 0x6f, 0x64, 0x53, 0x70, 0x65, 0x63, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, - 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x1c, 0x0a, 0x09, 0x63, 0x6f, 0x6e, - 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x09, 0x63, 0x6f, - 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x12, 0x10, 0x0a, 0x03, 0x63, 0x70, 0x75, 0x18, 0x03, - 0x20, 0x01, 0x28, 0x05, 0x52, 0x03, 0x63, 0x70, 0x75, 0x12, 0x16, 0x0a, 0x06, 0x6d, 0x65, 0x6d, - 0x6f, 0x72, 0x79, 0x18, 0x04, 0x20, 0x01, 0x28, 0x03, 0x52, 0x06, 0x6d, 0x65, 0x6d, 0x6f, 0x72, - 0x79, 0x12, 0x10, 0x0a, 0x03, 0x67, 0x70, 0x75, 0x18, 0x05, 0x20, 0x01, 0x28, 0x03, 0x52, 0x03, - 0x67, 0x70, 0x75, 0x12, 0x18, 0x0a, 0x07, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x18, 0x06, - 0x20, 0x01, 0x28, 0x03, 0x52, 0x07, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x12, 0x16, 0x0a, - 0x06, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x73, 0x18, 0x07, 0x20, 0x03, 0x28, 0x09, 0x52, 0x06, 0x6c, - 0x61, 0x62, 0x65, 0x6c, 0x73, 0x22, 0x60, 0x0a, 0x0c, 0x4d, 0x61, 0x74, 0x63, 0x68, 0x52, 0x65, - 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x20, 0x0a, 0x02, 0x70, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, - 0x0b, 0x32, 0x10, 0x2e, 0x66, 0x6c, 0x75, 0x78, 0x63, 0x6c, 0x69, 0x2e, 0x50, 0x6f, 0x64, 0x53, - 0x70, 0x65, 0x63, 0x52, 0x02, 0x70, 0x73, 0x12, 0x18, 0x0a, 0x07, 0x72, 0x65, 0x71, 0x75, 0x65, - 0x73, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x07, 0x72, 0x65, 0x71, 0x75, 0x65, 0x73, - 0x74, 0x12, 0x14, 0x0a, 0x05, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x18, 0x03, 0x20, 0x01, 0x28, 0x05, - 0x52, 0x05, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x22, 0x39, 0x0a, 0x09, 0x4e, 0x6f, 0x64, 0x65, 0x41, - 0x6c, 0x6c, 0x6f, 0x63, 0x12, 0x16, 0x0a, 0x06, 0x6e, 0x6f, 0x64, 0x65, 0x49, 0x44, 0x18, 0x01, - 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x6e, 0x6f, 0x64, 0x65, 0x49, 0x44, 0x12, 0x14, 0x0a, 0x05, - 0x74, 0x61, 0x73, 0x6b, 0x73, 0x18, 0x02, 0x20, 0x01, 0x28, 0x05, 0x52, 0x05, 0x74, 0x61, 0x73, - 0x6b, 0x73, 0x22, 0x6b, 0x0a, 0x0d, 0x4d, 0x61, 0x74, 0x63, 0x68, 0x52, 0x65, 0x73, 0x70, 0x6f, - 0x6e, 0x73, 0x65, 0x12, 0x14, 0x0a, 0x05, 0x70, 0x6f, 0x64, 0x49, 0x44, 0x18, 0x01, 0x20, 0x01, - 0x28, 0x09, 0x52, 0x05, 0x70, 0x6f, 0x64, 0x49, 0x44, 0x12, 0x2e, 0x0a, 0x08, 0x6e, 0x6f, 0x64, - 0x65, 0x6c, 0x69, 0x73, 0x74, 0x18, 0x02, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x66, 0x6c, - 0x75, 0x78, 0x63, 0x6c, 0x69, 0x2e, 0x4e, 0x6f, 0x64, 0x65, 0x41, 0x6c, 0x6c, 0x6f, 0x63, 0x52, - 0x08, 0x6e, 0x6f, 0x64, 0x65, 0x6c, 0x69, 0x73, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x6a, 0x6f, 0x62, - 0x49, 0x44, 0x18, 0x03, 0x20, 0x01, 0x28, 0x03, 0x52, 0x05, 0x6a, 0x6f, 0x62, 0x49, 0x44, 0x22, - 0x25, 0x0a, 0x0d, 0x43, 0x61, 0x6e, 0x63, 0x65, 0x6c, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, - 0x12, 0x14, 0x0a, 0x05, 0x6a, 0x6f, 0x62, 0x49, 0x44, 0x18, 0x02, 0x20, 0x01, 0x28, 0x03, 0x52, - 0x05, 0x6a, 0x6f, 0x62, 0x49, 0x44, 0x22, 0x3c, 0x0a, 
0x0e, 0x43, 0x61, 0x6e, 0x63, 0x65, 0x6c, - 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x14, 0x0a, 0x05, 0x6a, 0x6f, 0x62, 0x49, - 0x44, 0x18, 0x01, 0x20, 0x01, 0x28, 0x03, 0x52, 0x05, 0x6a, 0x6f, 0x62, 0x49, 0x44, 0x12, 0x14, - 0x0a, 0x05, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x18, 0x02, 0x20, 0x01, 0x28, 0x05, 0x52, 0x05, 0x65, - 0x72, 0x72, 0x6f, 0x72, 0x22, 0xe6, 0x01, 0x0a, 0x0a, 0x4e, 0x6f, 0x64, 0x65, 0x53, 0x74, 0x61, - 0x74, 0x75, 0x73, 0x12, 0x1a, 0x0a, 0x08, 0x63, 0x70, 0x75, 0x41, 0x76, 0x61, 0x69, 0x6c, 0x18, - 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x08, 0x63, 0x70, 0x75, 0x41, 0x76, 0x61, 0x69, 0x6c, 0x12, - 0x1a, 0x0a, 0x08, 0x67, 0x70, 0x75, 0x41, 0x76, 0x61, 0x69, 0x6c, 0x18, 0x02, 0x20, 0x01, 0x28, - 0x05, 0x52, 0x08, 0x67, 0x70, 0x75, 0x41, 0x76, 0x61, 0x69, 0x6c, 0x12, 0x22, 0x0a, 0x0c, 0x73, - 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x41, 0x76, 0x61, 0x69, 0x6c, 0x18, 0x03, 0x20, 0x01, 0x28, - 0x03, 0x52, 0x0c, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x41, 0x76, 0x61, 0x69, 0x6c, 0x12, - 0x20, 0x0a, 0x0b, 0x6d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x41, 0x76, 0x61, 0x69, 0x6c, 0x18, 0x04, - 0x20, 0x01, 0x28, 0x03, 0x52, 0x0b, 0x6d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x41, 0x76, 0x61, 0x69, - 0x6c, 0x12, 0x20, 0x0a, 0x0b, 0x61, 0x6c, 0x6c, 0x6f, 0x77, 0x65, 0x64, 0x50, 0x6f, 0x64, 0x73, - 0x18, 0x05, 0x20, 0x01, 0x28, 0x03, 0x52, 0x0b, 0x61, 0x6c, 0x6c, 0x6f, 0x77, 0x65, 0x64, 0x50, - 0x6f, 0x64, 0x73, 0x12, 0x16, 0x0a, 0x06, 0x6e, 0x6f, 0x64, 0x65, 0x49, 0x50, 0x18, 0x06, 0x20, - 0x01, 0x28, 0x09, 0x52, 0x06, 0x6e, 0x6f, 0x64, 0x65, 0x49, 0x50, 0x12, 0x20, 0x0a, 0x0b, 0x72, - 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x18, 0x07, 0x20, 0x01, 0x28, 0x05, - 0x52, 0x0b, 0x72, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x1e, 0x0a, - 0x0a, 0x4a, 0x47, 0x46, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x10, 0x0a, 0x03, 0x6a, - 0x67, 0x66, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x6a, 0x67, 0x66, 0x22, 0x1f, 0x0a, - 0x0b, 0x4a, 0x47, 0x46, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x10, 0x0a, 0x03, - 0x6a, 0x67, 0x66, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x6a, 0x67, 0x66, 0x32, 0x87, - 0x01, 0x0a, 0x0e, 0x46, 0x6c, 0x75, 0x78, 0x63, 0x6c, 0x69, 0x53, 0x65, 0x72, 0x76, 0x69, 0x63, - 0x65, 0x12, 0x38, 0x0a, 0x05, 0x4d, 0x61, 0x74, 0x63, 0x68, 0x12, 0x15, 0x2e, 0x66, 0x6c, 0x75, - 0x78, 0x63, 0x6c, 0x69, 0x2e, 0x4d, 0x61, 0x74, 0x63, 0x68, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, - 0x74, 0x1a, 0x16, 0x2e, 0x66, 0x6c, 0x75, 0x78, 0x63, 0x6c, 0x69, 0x2e, 0x4d, 0x61, 0x74, 0x63, - 0x68, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x12, 0x3b, 0x0a, 0x06, 0x43, - 0x61, 0x6e, 0x63, 0x65, 0x6c, 0x12, 0x16, 0x2e, 0x66, 0x6c, 0x75, 0x78, 0x63, 0x6c, 0x69, 0x2e, - 0x43, 0x61, 0x6e, 0x63, 0x65, 0x6c, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x17, 0x2e, - 0x66, 0x6c, 0x75, 0x78, 0x63, 0x6c, 0x69, 0x2e, 0x43, 0x61, 0x6e, 0x63, 0x65, 0x6c, 0x52, 0x65, - 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x42, 0x0e, 0x5a, 0x0c, 0x67, 0x72, 0x70, 0x63, - 0x2f, 0x66, 0x6c, 0x75, 0x78, 0x63, 0x6c, 0x69, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, -} - -var ( - file_fluence_fluxcli_grpc_fluxcli_proto_rawDescOnce sync.Once - file_fluence_fluxcli_grpc_fluxcli_proto_rawDescData = file_fluence_fluxcli_grpc_fluxcli_proto_rawDesc -) - -func file_fluence_fluxcli_grpc_fluxcli_proto_rawDescGZIP() []byte { - file_fluence_fluxcli_grpc_fluxcli_proto_rawDescOnce.Do(func() { - file_fluence_fluxcli_grpc_fluxcli_proto_rawDescData = 
protoimpl.X.CompressGZIP(file_fluence_fluxcli_grpc_fluxcli_proto_rawDescData) - }) - return file_fluence_fluxcli_grpc_fluxcli_proto_rawDescData -} - -var file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes = make([]protoimpl.MessageInfo, 9) -var file_fluence_fluxcli_grpc_fluxcli_proto_goTypes = []interface{}{ - (*PodSpec)(nil), // 0: fluxcli.PodSpec - (*MatchRequest)(nil), // 1: fluxcli.MatchRequest - (*NodeAlloc)(nil), // 2: fluxcli.NodeAlloc - (*MatchResponse)(nil), // 3: fluxcli.MatchResponse - (*CancelRequest)(nil), // 4: fluxcli.CancelRequest - (*CancelResponse)(nil), // 5: fluxcli.CancelResponse - (*NodeStatus)(nil), // 6: fluxcli.NodeStatus - (*JGFRequest)(nil), // 7: fluxcli.JGFRequest - (*JGFResponse)(nil), // 8: fluxcli.JGFResponse -} -var file_fluence_fluxcli_grpc_fluxcli_proto_depIdxs = []int32{ - 0, // 0: fluxcli.MatchRequest.ps:type_name -> fluxcli.PodSpec - 2, // 1: fluxcli.MatchResponse.nodelist:type_name -> fluxcli.NodeAlloc - 1, // 2: fluxcli.FluxcliService.Match:input_type -> fluxcli.MatchRequest - 4, // 3: fluxcli.FluxcliService.Cancel:input_type -> fluxcli.CancelRequest - 3, // 4: fluxcli.FluxcliService.Match:output_type -> fluxcli.MatchResponse - 5, // 5: fluxcli.FluxcliService.Cancel:output_type -> fluxcli.CancelResponse - 4, // [4:6] is the sub-list for method output_type - 2, // [2:4] is the sub-list for method input_type - 2, // [2:2] is the sub-list for extension type_name - 2, // [2:2] is the sub-list for extension extendee - 0, // [0:2] is the sub-list for field type_name -} - -func init() { file_fluence_fluxcli_grpc_fluxcli_proto_init() } -func file_fluence_fluxcli_grpc_fluxcli_proto_init() { - if File_fluence_fluxcli_grpc_fluxcli_proto != nil { - return - } - if !protoimpl.UnsafeEnabled { - file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { - switch v := v.(*PodSpec); i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} { - switch v := v.(*MatchRequest); i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[2].Exporter = func(v interface{}, i int) interface{} { - switch v := v.(*NodeAlloc); i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[3].Exporter = func(v interface{}, i int) interface{} { - switch v := v.(*MatchResponse); i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[4].Exporter = func(v interface{}, i int) interface{} { - switch v := v.(*CancelRequest); i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[5].Exporter = func(v interface{}, i int) interface{} { - switch v := v.(*CancelResponse); i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[6].Exporter = func(v interface{}, i int) interface{} { - switch v := v.(*NodeStatus); i { - case 0: - return &v.state - case 1: - 
return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[7].Exporter = func(v interface{}, i int) interface{} { - switch v := v.(*JGFRequest); i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes[8].Exporter = func(v interface{}, i int) interface{} { - switch v := v.(*JGFResponse); i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - } - type x struct{} - out := protoimpl.TypeBuilder{ - File: protoimpl.DescBuilder{ - GoPackagePath: reflect.TypeOf(x{}).PkgPath(), - RawDescriptor: file_fluence_fluxcli_grpc_fluxcli_proto_rawDesc, - NumEnums: 0, - NumMessages: 9, - NumExtensions: 0, - NumServices: 1, - }, - GoTypes: file_fluence_fluxcli_grpc_fluxcli_proto_goTypes, - DependencyIndexes: file_fluence_fluxcli_grpc_fluxcli_proto_depIdxs, - MessageInfos: file_fluence_fluxcli_grpc_fluxcli_proto_msgTypes, - }.Build() - File_fluence_fluxcli_grpc_fluxcli_proto = out.File - file_fluence_fluxcli_grpc_fluxcli_proto_rawDesc = nil - file_fluence_fluxcli_grpc_fluxcli_proto_goTypes = nil - file_fluence_fluxcli_grpc_fluxcli_proto_depIdxs = nil -} diff --git a/sig-scheduler-plugins/pkg/fluence/fluxcli-grpc/fluxcli.proto b/sig-scheduler-plugins/pkg/fluence/fluxcli-grpc/fluxcli.proto deleted file mode 100644 index f47d35b..0000000 --- a/sig-scheduler-plugins/pkg/fluence/fluxcli-grpc/fluxcli.proto +++ /dev/null @@ -1,76 +0,0 @@ -syntax = "proto3"; -option go_package = "grpc/fluxcli"; - -package fluxcli; - - -// Service definition -service FluxcliService { - // Sends a Match command - rpc Match(MatchRequest) returns (MatchResponse) {} - rpc Cancel(CancelRequest) returns (CancelResponse) {} -} - -message PodSpec { - string id = 1; - string container = 2; - int32 cpu = 3; - int64 memory = 4; - int64 gpu = 5; - int64 storage = 6; - repeated string labels = 7; -} - -// The Match request message (allocate, allocate_orelse_reserve) -message MatchRequest { - PodSpec ps = 1; - string request = 2; - int32 count = 3; -} - -// The Nodes/Cluster Update Status -message NodeAlloc { - string nodeID = 1; - int32 tasks = 2; -} - -// The Match response message -message MatchResponse { - string podID = 1; - repeated NodeAlloc nodelist = 2; - int64 jobID = 3; -} - -message CancelRequest { - int64 jobID = 2; -} - -// The Match response message -message CancelResponse { - int64 jobID = 1; - int32 error = 2; -} - - - -// The Nodes/Cluster Update Status -message NodeStatus { - int32 cpuAvail = 1; - int32 gpuAvail = 2; - int64 storageAvail = 3; - int64 memoryAvail = 4; - int64 allowedPods = 5; - string nodeIP = 6; - int32 replication = 7; -} - -// The JGF response message -message JGFRequest { - string jgf = 1; -} - - -// The JGF response message -message JGFResponse { - string jgf = 1; -} diff --git a/sig-scheduler-plugins/pkg/fluence/fluxcli-grpc/fluxcli_grpc.pb.go b/sig-scheduler-plugins/pkg/fluence/fluxcli-grpc/fluxcli_grpc.pb.go deleted file mode 100644 index 7bd905a..0000000 --- a/sig-scheduler-plugins/pkg/fluence/fluxcli-grpc/fluxcli_grpc.pb.go +++ /dev/null @@ -1,139 +0,0 @@ -// Code generated by protoc-gen-go-grpc. DO NOT EDIT. 
- -package fluxcli - -import ( - context "context" - grpc "google.golang.org/grpc" - codes "google.golang.org/grpc/codes" - status "google.golang.org/grpc/status" -) - -// This is a compile-time assertion to ensure that this generated file -// is compatible with the grpc package it is being compiled against. -// Requires gRPC-Go v1.32.0 or later. -const _ = grpc.SupportPackageIsVersion7 - -// FluxcliServiceClient is the client API for FluxcliService service. -// -// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream. -type FluxcliServiceClient interface { - // Sends a Match command - Match(ctx context.Context, in *MatchRequest, opts ...grpc.CallOption) (*MatchResponse, error) - Cancel(ctx context.Context, in *CancelRequest, opts ...grpc.CallOption) (*CancelResponse, error) -} - -type fluxcliServiceClient struct { - cc grpc.ClientConnInterface -} - -func NewFluxcliServiceClient(cc grpc.ClientConnInterface) FluxcliServiceClient { - return &fluxcliServiceClient{cc} -} - -func (c *fluxcliServiceClient) Match(ctx context.Context, in *MatchRequest, opts ...grpc.CallOption) (*MatchResponse, error) { - out := new(MatchResponse) - err := c.cc.Invoke(ctx, "/fluxcli.FluxcliService/Match", in, out, opts...) - if err != nil { - return nil, err - } - return out, nil -} - -func (c *fluxcliServiceClient) Cancel(ctx context.Context, in *CancelRequest, opts ...grpc.CallOption) (*CancelResponse, error) { - out := new(CancelResponse) - err := c.cc.Invoke(ctx, "/fluxcli.FluxcliService/Cancel", in, out, opts...) - if err != nil { - return nil, err - } - return out, nil -} - -// FluxcliServiceServer is the server API for FluxcliService service. -// All implementations must embed UnimplementedFluxcliServiceServer -// for forward compatibility -type FluxcliServiceServer interface { - // Sends a Match command - Match(context.Context, *MatchRequest) (*MatchResponse, error) - Cancel(context.Context, *CancelRequest) (*CancelResponse, error) - mustEmbedUnimplementedFluxcliServiceServer() -} - -// UnimplementedFluxcliServiceServer must be embedded to have forward compatible implementations. -type UnimplementedFluxcliServiceServer struct { -} - -func (UnimplementedFluxcliServiceServer) Match(context.Context, *MatchRequest) (*MatchResponse, error) { - return nil, status.Errorf(codes.Unimplemented, "method Match not implemented") -} -func (UnimplementedFluxcliServiceServer) Cancel(context.Context, *CancelRequest) (*CancelResponse, error) { - return nil, status.Errorf(codes.Unimplemented, "method Cancel not implemented") -} -func (UnimplementedFluxcliServiceServer) mustEmbedUnimplementedFluxcliServiceServer() {} - -// UnsafeFluxcliServiceServer may be embedded to opt out of forward compatibility for this service. -// Use of this interface is not recommended, as added methods to FluxcliServiceServer will -// result in compilation errors. 
-type UnsafeFluxcliServiceServer interface { - mustEmbedUnimplementedFluxcliServiceServer() -} - -func RegisterFluxcliServiceServer(s grpc.ServiceRegistrar, srv FluxcliServiceServer) { - s.RegisterService(&FluxcliService_ServiceDesc, srv) -} - -func _FluxcliService_Match_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { - in := new(MatchRequest) - if err := dec(in); err != nil { - return nil, err - } - if interceptor == nil { - return srv.(FluxcliServiceServer).Match(ctx, in) - } - info := &grpc.UnaryServerInfo{ - Server: srv, - FullMethod: "/fluxcli.FluxcliService/Match", - } - handler := func(ctx context.Context, req interface{}) (interface{}, error) { - return srv.(FluxcliServiceServer).Match(ctx, req.(*MatchRequest)) - } - return interceptor(ctx, in, info, handler) -} - -func _FluxcliService_Cancel_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { - in := new(CancelRequest) - if err := dec(in); err != nil { - return nil, err - } - if interceptor == nil { - return srv.(FluxcliServiceServer).Cancel(ctx, in) - } - info := &grpc.UnaryServerInfo{ - Server: srv, - FullMethod: "/fluxcli.FluxcliService/Cancel", - } - handler := func(ctx context.Context, req interface{}) (interface{}, error) { - return srv.(FluxcliServiceServer).Cancel(ctx, req.(*CancelRequest)) - } - return interceptor(ctx, in, info, handler) -} - -// FluxcliService_ServiceDesc is the grpc.ServiceDesc for FluxcliService service. -// It's only intended for direct use with grpc.RegisterService, -// and not to be introspected or modified (even as a copy) -var FluxcliService_ServiceDesc = grpc.ServiceDesc{ - ServiceName: "fluxcli.FluxcliService", - HandlerType: (*FluxcliServiceServer)(nil), - Methods: []grpc.MethodDesc{ - { - MethodName: "Match", - Handler: _FluxcliService_Match_Handler, - }, - { - MethodName: "Cancel", - Handler: _FluxcliService_Cancel_Handler, - }, - }, - Streams: []grpc.StreamDesc{}, - Metadata: "fluence/fluxcli-grpc/fluxcli.proto", -} diff --git a/sig-scheduler-plugins/pkg/fluence/group/group.go b/sig-scheduler-plugins/pkg/fluence/group/group.go new file mode 100644 index 0000000..2c3a3c1 --- /dev/null +++ b/sig-scheduler-plugins/pkg/fluence/group/group.go @@ -0,0 +1,64 @@ +package group + +import ( + "fmt" + "time" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + klog "k8s.io/klog/v2" + "k8s.io/kubernetes/pkg/scheduler/framework" + + sched "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1" +) + +// DefaultWaitTime is 60s if ScheduleTimeoutSeconds is not specified. +const DefaultWaitTime = 60 * time.Second + +// CreateFakeGroup wraps an arbitrary pod in a fake group for fluence to schedule +// This happens only in PreFilter so we already sorted +func CreateFakeGroup(pod *corev1.Pod) *sched.PodGroup { + groupName := fmt.Sprintf("fluence-solo-%s-%s", pod.Namespace, pod.Name) + return &sched.PodGroup{ + ObjectMeta: metav1.ObjectMeta{ + Name: groupName, + Namespace: pod.Namespace, + }, + Spec: sched.PodGroupSpec{MinMember: int32(1)}, + } +} + +// GetCreationTimestamp first tries the fluence group, then falls back to the initial attempt timestamp +// This is the only update we have made to the upstream PodGroupManager, because we are expecting +// a MicroTime and not a time.Time. 
+func GetCreationTimestamp(groupName string, podGroup *sched.PodGroup, podInfo *framework.QueuedPodInfo) metav1.MicroTime { + + // Don't try to get a time for a pod group that does not exist + if podGroup == nil { + return metav1.NewMicroTime(*podInfo.InitialAttemptTimestamp) + } + + // IsZero is an indicator if this was actually set + // If the group label was present and we have a group, this will be true + if !podGroup.Status.ScheduleStartTime.IsZero() { + klog.Infof(" [Fluence] Pod group %s was created at %s\n", groupName, podGroup.Status.ScheduleStartTime) + return podGroup.Status.ScheduleStartTime + } + // We should actually never get here. + klog.Errorf(" [Fluence] Pod group %s time IsZero, we should not have reached here", groupName) + return metav1.NewMicroTime(*podInfo.InitialAttemptTimestamp) +} + +// GetWaitTimeDuration returns a wait timeout based on the following precedences: +// 1. spec.scheduleTimeoutSeconds of the given podGroup, if specified +// 2. given scheduleTimeout, if not nil +// 3. fall back to DefaultWaitTime +func GetWaitTimeDuration(podGroup *sched.PodGroup, scheduleTimeout *time.Duration) time.Duration { + if podGroup != nil && podGroup.Spec.ScheduleTimeoutSeconds != nil { + return time.Duration(*podGroup.Spec.ScheduleTimeoutSeconds) * time.Second + } + if scheduleTimeout != nil && *scheduleTimeout != 0 { + return *scheduleTimeout + } + return DefaultWaitTime +} diff --git a/sig-scheduler-plugins/pkg/fluence/labels/labels.go b/sig-scheduler-plugins/pkg/fluence/labels/labels.go new file mode 100644 index 0000000..eb96c72 --- /dev/null +++ b/sig-scheduler-plugins/pkg/fluence/labels/labels.go @@ -0,0 +1,74 @@ +/* +Copyright 2020 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package labels + +import ( + "fmt" + "time" + + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// Labels to be shared between different components + +const ( + // We use the same label to be consistent + // https://github.com/kubernetes-sigs/scheduler-plugins/blob/master/apis/scheduling/v1alpha1/types.go#L109 + PodGroupLabel = "scheduling.x-k8s.io/pod-group" + + // TODO add more labels here, to be discovered used later + //PodGroupNameLabel = "fluence.pod-group" + PodGroupSizeLabel = "fluence.group-size" + + // Internal use (not used yet) + PodGroupTimeCreated = "flunce.created-at" +) + +// GetPodGroupLabel get pod group name from pod labels +func GetPodGroupLabel(pod *v1.Pod) string { + return pod.Labels[PodGroupLabel] +} + +// GetPodGroupFullName get namespaced group name from pod labels +func GetPodGroupFullName(pod *v1.Pod) string { + groupName := GetPodGroupLabel(pod) + if len(groupName) == 0 { + return "" + } + return fmt.Sprintf("%v/%v", pod.Namespace, groupName) +} + +// GetPodGroupSize gets the pod group size from the label +func GetPodGroupSize(pod *v1.Pod) string { + return pod.Labels[PodGroupSizeLabel] +} + +// getTimeCreated returns the timestamp when we saw the object +func GetTimeCreated() string { + + // Set the time created for a label + createdAt := metav1.NewMicroTime(time.Now()) + + // If we get an error here, the reconciler will set the time + var timestamp string + timeCreated, err := createdAt.MarshalJSON() + if err == nil { + timestamp = string(timeCreated) + } + return timestamp +} diff --git a/sig-scheduler-plugins/pkg/fluence/register.go b/sig-scheduler-plugins/pkg/fluence/register.go new file mode 100644 index 0000000..1505633 --- /dev/null +++ b/sig-scheduler-plugins/pkg/fluence/register.go @@ -0,0 +1,55 @@ +/* +Copyright 2020 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package fluence + +import ( + "context" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" +) + +// RegisterExisting uses the in cluster API to ensure existing pods +// are known to fluence, This is a one-time, static approach, so if a resource +// here goes away we cannot remove it from being known. But it's better than +// not having it, and having fluxion assume more resources than the +// cluster has available. 
This is a TODO as fluxion does not support it +func (fluence *Fluence) RegisterExisting(ctx context.Context) error { + + // creates an in-cluster config and client + config, err := rest.InClusterConfig() + if err != nil { + fluence.log.Error("[Fluence RegisterExisting] Error creating in-cluster config: %s\n", err) + return err + } + // creates the clientset + clientset, err := kubernetes.NewForConfig(config) + if err != nil { + fluence.log.Error("[Fluence RegisterExisting] Error creating client for config: %s\n", err) + return err + } + // get pods in all the namespaces by omitting namespace + // Or specify namespace to get pods in particular namespace + pods, err := clientset.CoreV1().Pods("").List(ctx, metav1.ListOptions{}) + if err != nil { + fluence.log.Info("[Fluence RegisterExisting] Error listing pods: %s\n", err) + return err + } + fluence.log.Info("[Fluence RegisterExisting] Found %d existing pods in the cluster\n", len(pods.Items)) + return nil +} diff --git a/sig-scheduler-plugins/pkg/fluence/utils/utils.go b/sig-scheduler-plugins/pkg/fluence/utils/utils.go index cfb857d..da9053b 100644 --- a/sig-scheduler-plugins/pkg/fluence/utils/utils.go +++ b/sig-scheduler-plugins/pkg/fluence/utils/utils.go @@ -20,57 +20,79 @@ import ( "strings" v1 "k8s.io/api/core/v1" - "k8s.io/klog/v2" - "k8s.io/kubernetes/pkg/scheduler/framework" + klog "k8s.io/klog/v2" pb "sigs.k8s.io/scheduler-plugins/pkg/fluence/fluxcli-grpc" ) -type NoopStateData struct{} +// TODO this package should be renamed something related to a PodSpec Info -func NewNoopStateData() framework.StateData { - return &NoopStateData{} -} - -func (d *NoopStateData) Clone() framework.StateData { - return d -} - -// InspectPodInfo takes a pod object and returns the pod.spec -func InspectPodInfo(pod *v1.Pod) *pb.PodSpec { - ps := new(pb.PodSpec) - ps.Id = pod.Name - cont := pod.Spec.Containers[0] - - //This will need to be done here AND at client level - if len(pod.Labels) > 0 { - r := make([]string, 0) - for key, val := range pod.Labels { - if strings.Contains(key, "jobspec") { - r = append(r, val) - } - } - if len(r) > 0 { - ps.Labels = r +// getPodJobpsecLabels looks across labels and returns those relevant +// to a jobspec +func getPodJobspecLabels(pod *v1.Pod) []string { + labels := []string{} + for label, value := range pod.Labels { + if strings.Contains(label, "jobspec") { + labels = append(labels, value) } } + return labels +} - specRequests := cont.Resources.Requests - specLimits := cont.Resources.Limits +// PreparePodJobSpec takes a pod object and returns the jobspec +// The jobspec is based on the pod, and assumes it will be duplicated +// for a MatchAllocate request (representing all pods). We name the +// jobspec based on the group and not the individual ID. 
+// This calculates across containers in the od +func PreparePodJobSpec(pod *v1.Pod, groupName string) *pb.PodSpec { + podSpec := new(pb.PodSpec) + podSpec.Id = groupName + + // There was an if check here to see if we had labels, + // I don't think there is risk to adding an empty list but we can add + // the check back if there is + podSpec.Labels = getPodJobspecLabels(pod) + + // the jobname should be the group name + podSpec.Container = groupName + + // Create accumulated requests for cpu and limits + // CPU and memory are summed across containers + // GPU cannot be shared across containers, but we + // take a count for the pod for the PodSpec + var cpus int32 = 0 + var memory int64 = 0 + var gpus int64 = 0 + + // I think we are OK to sum this too + // https://github.com/kubernetes/kubectl/blob/master/pkg/describe/describe.go#L4211-L4213 + var storage int64 = 0 + + for _, container := range pod.Spec.Containers { + + // Add on Cpu, Memory, GPU from container requests + // This is a limited set of resources owned by the pod + specRequests := container.Resources.Requests + cpus += int32(specRequests.Cpu().Value()) + memory += specRequests.Memory().Value() + storage += specRequests.StorageEphemeral().Value() + + specLimits := container.Resources.Limits + gpuSpec := specLimits["nvidia.com/gpu"] + gpus += gpuSpec.Value() - if specRequests.Cpu().Value() == 0 { - ps.Cpu = 1 - } else { - ps.Cpu = int32(specRequests.Cpu().Value()) } - if specRequests.Memory().Value() > 0 { - ps.Memory = specRequests.Memory().Value() + // If we have zero cpus, assume 1 + // We could use math.Max here, but it is expecting float64 + if cpus == 0 { + cpus = 1 } - gpu := specLimits["nvidia.com/gpu"] - ps.Gpu = gpu.Value() - ps.Storage = specRequests.StorageEphemeral().Value() - - klog.Infof("[Jobspec] Pod spec: CPU %v/%v-milli, memory %v, GPU %v, storage %v", ps.Cpu, specRequests.Cpu().MilliValue(), specRequests.Memory().Value(), ps.Gpu, ps.Storage) - - return ps + podSpec.Cpu = cpus + podSpec.Gpu = gpus + podSpec.Memory = memory + podSpec.Storage = storage + + // I removed specRequests.Cpu().MilliValue() but we can add back some derivative if desired + klog.Infof("[Jobspec] Pod spec: CPU %v, memory %v, GPU %v, storage %v", podSpec.Cpu, podSpec.Memory, podSpec.Gpu, podSpec.Storage) + return podSpec } diff --git a/sig-scheduler-plugins/pkg/logger/logger.go b/sig-scheduler-plugins/pkg/logger/logger.go new file mode 100644 index 0000000..d1e238e --- /dev/null +++ b/sig-scheduler-plugins/pkg/logger/logger.go @@ -0,0 +1,87 @@ +package logger + +// A small debug logger to write to file instead of klog +// I don't know where to close, so I'm opening and appending each time +// It's a bad design, but will work for debugging. + +import ( + "fmt" + "log" + "os" +) + +const ( + LevelNone = iota + LevelInfo + LevelWarning + LevelError + LevelVerbose + LevelDebug +) + +type DebugLogger struct { + level int + Filename string + handle *os.File +} + +func NewDebugLogger(level int, filename string) *DebugLogger { + return &DebugLogger{ + level: level, + Filename: filename, + } +} + +func (l *DebugLogger) Start() (*log.Logger, error) { + f, err := os.OpenFile(l.Filename, os.O_WRONLY|os.O_CREATE|os.O_APPEND, os.ModePerm) + if err != nil { + return nil, err + } + logger := log.New(f, "", 0) + l.handle = f + return logger, nil +} +func (l *DebugLogger) Stop() error { + if l.handle != nil { + return l.handle.Close() + } + return nil +} + +// Logging functions you should use! 
+func (l *DebugLogger) Info(message ...any) error { + return l.log(LevelInfo, " INFO: ", message...) +} +func (l *DebugLogger) Error(message ...any) error { + return l.log(LevelError, " ERROR: ", message...) +} +func (l *DebugLogger) Debug(message ...any) error { + return l.log(LevelDebug, " DEBUG: ", message...) +} +func (l *DebugLogger) Verbose(message ...any) error { + return l.log(LevelVerbose, "VERBOSE: ", message...) +} +func (l *DebugLogger) Warning(message ...any) error { + return l.log(LevelWarning, "WARNING: ", message...) +} + +// log is the shared class function for actually printing to the log +func (l *DebugLogger) log(level int, prefix string, message ...any) error { + logger, err := l.Start() + if err != nil { + return err + } + // Assume the prolog (to be formatted) is at index 0 + prolog := message[0].(string) + if prefix != "" { + prolog = prefix + " " + prolog + } + rest := message[1:] + + // msg := fmt.Sprintf(message...) + fmt.Printf("Compariing level %d <= %d\n", level, l.level) + if level <= l.level { + logger.Printf(prolog, rest...) + } + return l.Stop() +} diff --git a/src/Makefile b/src/Makefile index a32efce..e31c8ec 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,11 +1,12 @@ -FLUX_SCHED_ROOT ?= /home/flux-sched +FLUX_SCHED_ROOT ?= /opt/flux-sched INSTALL_PREFIX ?= /usr +LIB_PREFIX ?= /usr/lib +LOCALBIN ?= $(shell pwd)/bin COMMONENVVAR=GOOS=$(shell uname -s | tr A-Z a-z) +#BUILDENVVAR=CGO_CFLAGS="-I${FLUX_SCHED_ROOT}/resource/reapi/bindings/c" CGO_LDFLAGS="-L${INSTALL_PREFIX}/lib -L${FLUX_SCHED_ROOT}/resource -lresource -L${FLUX_SCHED_ROOT}/resource/libjobspec -ljobspec_conv -L/${FLUX_SCHED_ROOT}/resource/reapi/bindings -lreapi_cli -lflux-idset -lstdc++ -ljansson -lhwloc -lboost_system -lflux-hostlist -lboost_graph -lyaml-cpp" +BUILDENVVAR=CGO_CFLAGS="-I${FLUX_SCHED_ROOT} -I${FLUX_SCHED_ROOT}/resource/reapi/bindings/c" CGO_LDFLAGS="-L${LIB_PREFIX} -L${LIB_PREFIX}/flux -L${FLUX_SCHED_ROOT}/resource/reapi/bindings -lreapi_cli -lflux-idset -lstdc++ -ljansson -lhwloc -lboost_system -lflux-hostlist -lboost_graph -lyaml-cpp" -# This is what worked -# GOOS=linux CGO_CFLAGS="-I/home/flux-sched/resource/reapi/bindings/c" CGO_LDFLAGS="-L/usr/lib -L/home/flux-sched/resource -lresource -L/home/flux-sched/resource/libjobspec -ljobspec_conv -L/home/flux-sched/resource/reapi/bindings -lreapi_cli -lflux-idset -lstdc++ -lczmq -ljansson -lhwloc -lboost_system -lflux-hostlist -lboost_graph -lyaml-cpp" go build -ldflags '-w' -o bin/server cmd/main.go -BUILDENVVAR=CGO_CFLAGS="-I${FLUX_SCHED_ROOT}/resource/reapi/bindings/c" CGO_LDFLAGS="-L${INSTALL_PREFIX}/lib -L${FLUX_SCHED_ROOT}/resource -lresource -L${FLUX_SCHED_ROOT}/resource/libjobspec -ljobspec_conv -L/${FLUX_SCHED_ROOT}/resource/reapi/bindings -lreapi_cli -lflux-idset -lstdc++ -lczmq -ljansson -lhwloc -lboost_system -lflux-hostlist -lboost_graph -lyaml-cpp" LOCAL_REGISTRY=localhost:5000 LOCAL_IMAGE=fluence-sidecar:latest @@ -14,6 +15,10 @@ RELEASE_VERSION?=v$(shell date +%Y%m%d)-$(shell git describe --tags --match "v*" .PHONY: all all: fluxcli +.PHONY: $(LOCALBIN) +$(LOCALBIN): + mkdir -p $(LOCALBIN) + .PHONY: fluxcli fluxcli: docker build -f build/scheduler/Dockerfile --build-arg ARCH="amd64" --build-arg RELEASE_VERSION="$(RELEASE_VERSION)" -t $(LOCAL_REGISTRY)/$(LOCAL_IMAGE) . 
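
The DebugLogger added above in sig-scheduler-plugins/pkg/logger/logger.go behaves like a leveled printf logger that opens, appends to, and closes its file on every call. A minimal usage sketch, assuming the package resolves under the upstream scheduler-plugins module path and using an illustrative log file path (both assumptions, not paths taken from this change):

package main

import "sigs.k8s.io/scheduler-plugins/pkg/logger"

func main() {
	// LevelDebug enables every message level; the file path here is an
	// illustrative assumption, not a path the plugin itself uses.
	debugLog := logger.NewDebugLogger(logger.LevelDebug, "/tmp/fluence-debug.log")

	// The first argument is treated as a printf-style format string and the
	// remaining arguments fill it in; each call opens, writes, and closes the file.
	debugLog.Info("[Fluence] scheduling group %s with %d pods", "example-group", 4)
	debugLog.Warning("[Fluence] group %s is missing a size label", "example-group")
	debugLog.Error("[Fluence] match failed: %s", "allocation was not possible")
}
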
@@ -22,6 +27,13 @@ fluxcli: server: $(COMMONENVVAR) $(BUILDENVVAR) go build -ldflags '-w' -o bin/server cmd/main.go +.PHONY: protoc +protoc: $(LOCALBIN) + GOBIN=$(LOCALBIN) go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.28 + GOBIN=$(LOCALBIN) go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@v1.2 + +# You can use make protoc to download proto .PHONY: proto -proto: - protoc --go_out=. --go_opt=paths=source_relative --go-grpc_out=. --go-grpc_opt=paths=source_relative fluence/fluxcli-grpc/fluxcli.proto +proto: protoc + PATH=$(LOCALBIN):${PATH} protoc --go_out=. --go_opt=paths=source_relative --go-grpc_out=. --go-grpc_opt=paths=source_relative fluence/fluxcli-grpc/fluxcli.proto + PATH=$(LOCALBIN):${PATH} protoc --go_out=. --go_opt=paths=source_relative --go-grpc_out=. --go-grpc_opt=paths=source_relative fluence/service-grpc/service.proto \ No newline at end of file diff --git a/src/build/scheduler/Dockerfile b/src/build/scheduler/Dockerfile index 18c4bd7..2a8892c 100644 --- a/src/build/scheduler/Dockerfile +++ b/src/build/scheduler/Dockerfile @@ -1,137 +1,33 @@ -FROM ubuntu:latest as base +FROM fluxrm/flux-sched:jammy -RUN apt -y update && apt -y upgrade && apt -y clean && apt -y autoremove +USER root ENV DEBIAN_FRONTEND=noninteractive -ENV GO_VERSION=1.19.10 -ENV INSTALL_PREFIX=/usr +ENV GO_VERSION=1.21.9 -RUN apt install -y --no-install-recommends tzdata && \ - apt -y --no-install-recommends install \ - aspell \ - aspell-en \ - autoconf \ - automake \ - curl \ - git \ - libc6-dev \ - libczmq-dev \ - libmpich-dev \ - libncurses5-dev \ - libelf-dev \ - libssl-dev \ - libtool \ - libsodium-dev \ - libzmq3-dev \ - libjansson-dev \ - liblz4-dev \ - libhwloc-dev \ - libsqlite3-dev \ - lua5.1 \ - liblua5.1-dev \ - lua-posix \ - make \ - openssh-client \ - python3-dev \ - python3-cffi \ - python3-six \ - python3-yaml \ - python3-jsonschema \ - python3-sphinx \ - python3-pip \ - python3-setuptools \ - systemd \ - wget \ - uuid-dev && \ - apt -y clean && apt -y autoremove +RUN apt-get update && apt-get clean -y && apt -y autoremove -RUN echo 'alias python="/usr/bin/python3.8"' >> /root/.bashrc && \ - echo 'alias pip="/usr/bin/pip3"' >> /root/.bashrc && \ - . 
/root/.bashrc - -RUN echo 'set number' >> /root/.vimrc - -# Install cmake for new build system -RUN curl -s -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-$(uname -m).sh > cmake.sh ;\ - bash cmake.sh --prefix=/usr/local --skip-license ;\ - rm cmake.sh - -# Remove Python 2 -RUN apt purge -y python2.7-minimal - -# Python 3 should be linked to python -RUN ln -s /usr/bin/python3 /usr/bin/python -RUN apt install -y python3-pip \ - && apt -y --no-install-recommends install \ - libhwloc-dev \ - libboost-dev \ - libboost-system-dev \ - libboost-filesystem-dev \ - libboost-graph-dev \ - libboost-regex-dev \ - libxml2-dev \ - libyaml-cpp-dev \ - python3-yaml \ - libedit-dev \ - libarchive-dev \ - pkg-config && apt -y clean && apt -y autoremove - -RUN git clone https://github.com/flux-framework/flux-core.git /home/flux-core && \ - cd /home/flux-core/ && \ - ./autogen.sh && \ - PYTHON_VERSION=3 ./configure --prefix=${INSTALL_PREFIX} && \ - make && make install && \ - cd ../ && \ - rm -rf flux-core - -# Install go 19.10 +# Install go RUN wget https://go.dev/dl/go${GO_VERSION}.linux-amd64.tar.gz && tar -xvf go${GO_VERSION}.linux-amd64.tar.gz && \ mv go /usr/local && rm go${GO_VERSION}.linux-amd64.tar.gz -ENV GOROOT=/usr/local/go -ENV GOPATH=/go -ENV PATH="$GOROOT/bin:$PATH" -RUN mkdir -p /go/src +# ENV GOROOT=/usr/local/go +# ENV GOPATH=/go +ENV PATH=/usr/local/go/bin:$PATH RUN flux keygen +RUN git clone https://github.com/flux-framework/flux-sched.git /opt/flux-sched -ENV WITH_GO=yes -RUN git clone https://github.com/flux-framework/flux-sched.git /home/flux-sched && \ - cd /home/flux-sched/ && \ - # Ensure we pin to variant that has STATIC - will update when fix is in - git fetch && git checkout v0.31.0 && \ - # These need to be shared libraries - # https://github.com/flux-framework/flux-sched/pull/1094 - sed -i 's/add_library(resource STATIC/add_library(resource SHARED/g' resource/CMakeLists.txt && \ - sed -i 's/add_library ( reapi_module STATIC/add_library ( reapi_module SHARED/g' resource/reapi/bindings/CMakeLists.txt && \ - sed -i 's/add_library ( reapi_cli STATIC/add_library ( reapi_cli SHARED/g' resource/reapi/bindings/CMakeLists.txt && \ - sed -i 's/add_library ( jobspec_conv STATIC/add_library ( jobspec_conv SHARED/g' resource/libjobspec/CMakeLists.txt && \ - PYTHON_VERSION=3 ./configure --prefix=${INSTALL_PREFIX} && \ - make && make install - -RUN apt purge -y \ - python3-dev \ - python3-cffi \ - python3-six \ - python3-yaml \ - python3-jsonschema \ - python3-sphinx \ - python3-pip \ - python3-setuptools \ - && apt -y clean && apt -y autoremove - -ENV PATH=/usr/local/go/bin:$PATH +# Go dependencies for protobuf RUN apt -y update && apt -y upgrade && apt install --no-install-recommends -y protobuf-compiler curl && \ go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.26 && \ go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@v1.1 # These need to be on the LD_LIBRARY_PATH for the server to find at runtime -# This mimcs what we use to build server -ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/lib:/home/flux-sched/resource:/home/flux-sched/resource/libjobspec:/home/flux-sched/resource/reapi/bindings" -COPY fluence Makefile /go/src/fluence/ +ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/lib:/usr/lib/flux WORKDIR /go/src/fluence/ +COPY fluence Makefile /go/src/fluence/ -# This is the 0.31.0 tag of flux-sched (same as we install above) -RUN go get -u 
github.com/flux-framework/flux-sched/resource/reapi/bindings/go/src/fluxcli@250eac78a6753253fc8353a3504d7e843d1b6b24 && \ - go mod tidy && \ - make server FLUX_SCHED_ROOT=/home/flux-sched INSTALL_PREFIX=${INSTALL_PREFIX} && \ +RUN go mod tidy && \ + go mod vendor && \ + make server FLUX_SCHED_ROOT=/opt/flux-sched && \ mkdir -p /home/data/jobspecs /home/data/jgf && \ chmod -R ugo+rwx /home/data \ No newline at end of file diff --git a/src/fluence/cmd/main.go b/src/fluence/cmd/main.go index c064ce8..e8ef87d 100644 --- a/src/fluence/cmd/main.go +++ b/src/fluence/cmd/main.go @@ -1,30 +1,44 @@ package main import ( - "fmt" "flag" + "fmt" "net" - "google.golang.org/grpc/keepalive" - "google.golang.org/grpc" + "strings" "time" + "google.golang.org/grpc" + "google.golang.org/grpc/keepalive" + pb "github.com/flux-framework/flux-k8s/flux-plugin/fluence/fluxcli-grpc" "github.com/flux-framework/flux-k8s/flux-plugin/fluence/fluxion" + "github.com/flux-framework/flux-k8s/flux-plugin/fluence/service" + svcPb "github.com/flux-framework/flux-k8s/flux-plugin/fluence/service-grpc" ) - const ( - port = ":4242" + defaultPort = ":4242" + enableExternalService = false ) var responsechan chan string -func main () { +func main() { fmt.Println("This is the fluxion grpc server") policy := flag.String("policy", "", "Match policy") label := flag.String("label", "", "Label name for fluence dedicated nodes") + grpcPort := flag.String("port", defaultPort, "Port for grpc service") + enableServicePlugin := flag.Bool("external-service", enableExternalService, "Flag to enable the external service (defaults to false)") flag.Parse() + + // Ensure our port starts with : + port := *grpcPort + if !strings.HasPrefix(":", port) { + port = fmt.Sprintf(":%s", port) + } + + // Fluxion GRPC flux := fluxion.Fluxion{} flux.InitFluxion(policy, label) @@ -34,16 +48,29 @@ func main () { } responsechan = make(chan string) - s := grpc.NewServer( + server := grpc.NewServer( grpc.KeepaliveParams(keepalive.ServerParameters{ - MaxConnectionIdle: 5 * time.Minute, + MaxConnectionIdle: 5 * time.Minute, }), ) - pb.RegisterFluxcliServiceServer(s, &flux /*&server{flux: flux}*/) + pb.RegisterFluxcliServiceServer(server, &flux) + + // External plugin (Kubectl) GRPC + // This will eventually be an external GRPC module that can + // be shared by fluence (flux-k8s) and fluence-kubectl + // We give it a handle to Flux to get the state of groups + // and job Ids. 
The direct interaction with Fluxion + // happens through the other service handle + if *enableServicePlugin { + plugin := service.ExternalService{} + plugin.Init() + svcPb.RegisterExternalPluginServiceServer(server, &plugin) + } + fmt.Printf("[GRPCServer] gRPC Listening on %s\n", lis.Addr().String()) - if err := s.Serve(lis); err != nil { + if err := server.Serve(lis); err != nil { fmt.Printf("[GRPCServer] failed to serve: %v\n", err) } - + fmt.Printf("[GRPCServer] Exiting\n") -} \ No newline at end of file +} diff --git a/src/fluence/cmd/main.go.bk b/src/fluence/cmd/main.go.bk deleted file mode 100644 index 5e66d14..0000000 --- a/src/fluence/cmd/main.go.bk +++ /dev/null @@ -1,15 +0,0 @@ -package main - -import ( - "fmt" - "flag" - "github.com/flux-framework/flux-k8s/flux-plugin/kubeflux/fluxion" -) - -func main () { - policy := flag.String("policy", "", "Match policy") - flag.Parse() - fmt.Println("Policy ", policy) - fc := fluxion.Fluxion{Policy: *policy} - fc.InitFluxion() -} \ No newline at end of file diff --git a/src/fluence/defaults/defaults.go b/src/fluence/defaults/defaults.go new file mode 100644 index 0000000..f4fc8f2 --- /dev/null +++ b/src/fluence/defaults/defaults.go @@ -0,0 +1,5 @@ +package defaults + +var ( + KubernetesJsonGraphFormat = "/home/data/jgf/kubecluster.json" +) diff --git a/src/fluence/fluxcli-grpc/fluxcli.pb.go b/src/fluence/fluxcli-grpc/fluxcli.pb.go index e317af2..6bd47d4 100644 --- a/src/fluence/fluxcli-grpc/fluxcli.pb.go +++ b/src/fluence/fluxcli-grpc/fluxcli.pb.go @@ -1,7 +1,7 @@ // Code generated by protoc-gen-go. DO NOT EDIT. // versions: -// protoc-gen-go v1.26.0 -// protoc v3.15.8 +// protoc-gen-go v1.28.1 +// protoc v3.20.3 // source: fluence/fluxcli-grpc/fluxcli.proto package fluxcli diff --git a/src/fluence/fluxcli-grpc/fluxcli.proto b/src/fluence/fluxcli-grpc/fluxcli.proto index f47d35b..1446041 100644 --- a/src/fluence/fluxcli-grpc/fluxcli.proto +++ b/src/fluence/fluxcli-grpc/fluxcli.proto @@ -3,8 +3,7 @@ option go_package = "grpc/fluxcli"; package fluxcli; - -// Service definition +// Service definition for Fluxclient service FluxcliService { // Sends a Match command rpc Match(MatchRequest) returns (MatchResponse) {} diff --git a/src/fluence/fluxcli-grpc/fluxcli_grpc.pb.go b/src/fluence/fluxcli-grpc/fluxcli_grpc.pb.go index 7bd905a..f984b04 100644 --- a/src/fluence/fluxcli-grpc/fluxcli_grpc.pb.go +++ b/src/fluence/fluxcli-grpc/fluxcli_grpc.pb.go @@ -1,4 +1,8 @@ // Code generated by protoc-gen-go-grpc. DO NOT EDIT. 
+// versions: +// - protoc-gen-go-grpc v1.2.0 +// - protoc v3.20.3 +// source: fluence/fluxcli-grpc/fluxcli.proto package fluxcli diff --git a/src/fluence/fluxion/fluxion.go b/src/fluence/fluxion/fluxion.go index 7ef532a..f288cdf 100644 --- a/src/fluence/fluxion/fluxion.go +++ b/src/fluence/fluxion/fluxion.go @@ -3,14 +3,15 @@ package fluxion import ( "os" + "github.com/flux-framework/flux-k8s/flux-plugin/fluence/defaults" pb "github.com/flux-framework/flux-k8s/flux-plugin/fluence/fluxcli-grpc" "github.com/flux-framework/flux-k8s/flux-plugin/fluence/jobspec" "github.com/flux-framework/flux-k8s/flux-plugin/fluence/utils" - "github.com/flux-framework/flux-sched/resource/reapi/bindings/go/src/fluxcli" + "github.com/flux-framework/fluxion-go/pkg/fluxcli" + klog "k8s.io/klog/v2" "context" "errors" - "fmt" ) type Fluxion struct { @@ -18,78 +19,95 @@ type Fluxion struct { pb.UnimplementedFluxcliServiceServer } -func (f *Fluxion) InitFluxion(policy *string, label *string) { - f.cli = fluxcli.NewReapiClient() +// InitFluxion creates a new client to interaction with the fluxion API (via go bindings) +func (fluxion *Fluxion) InitFluxion(policy *string, label *string) { + fluxion.cli = fluxcli.NewReapiClient() - fmt.Println("Created flux resource client ", f.cli) - fmt.Printf("%+v\n", f.cli) - filename := "/home/data/jgf/kubecluster.json" - err := utils.CreateJGF(filename, label) + klog.Infof("[Fluence] Created flux resource client %s", fluxion.cli) + err := utils.CreateJGF(defaults.KubernetesJsonGraphFormat, label) if err != nil { return } - jgf, err := os.ReadFile(filename) + jgf, err := os.ReadFile(defaults.KubernetesJsonGraphFormat) if err != nil { - fmt.Println("Error reading JGF") + klog.Error("Error reading JGF") return } p := "{}" if *policy != "" { p = string("{\"matcher_policy\": \"" + *policy + "\"}") - fmt.Println("Match policy: ", p) + klog.Infof("[Fluence] match policy: %s", p) } - - f.cli.InitContext(string(jgf), p) + fluxion.cli.InitContext(string(jgf), p) } -func (s *Fluxion) Cancel(ctx context.Context, in *pb.CancelRequest) (*pb.CancelResponse, error) { - fmt.Printf("[GRPCServer] Received Cancel request %v\n", in) - err := s.cli.Cancel(int64(in.JobID), true) +// Cancel wraps the Cancel function of the fluxion go bindings +func (fluxion *Fluxion) Cancel(ctx context.Context, in *pb.CancelRequest) (*pb.CancelResponse, error) { + + klog.Infof("[Fluence] received cancel request %v\n", in) + err := fluxion.cli.Cancel(int64(in.JobID), true) if err != nil { - return nil, errors.New("Error in Cancel") + return nil, err } // Why would we have an error code here if we check above? 
// This (I think) should be an error code for the specific job dr := &pb.CancelResponse{JobID: in.JobID} - fmt.Printf("[GRPCServer] Sending Cancel response %v\n", dr) + klog.Infof("[Fluence] sending cancel response %v\n", dr) + klog.Infof("[Fluence] cancel errors so far: %s\n", fluxion.cli.GetErrMsg()) - fmt.Printf("[CancelRPC] Errors so far: %s\n", s.cli.GetErrMsg()) + reserved, at, overhead, mode, fluxerr := fluxion.cli.Info(int64(in.JobID)) + klog.Infof("\n\t----Job Info output---") + klog.Infof("jobid: %d\nreserved: %t\nat: %d\noverhead: %f\nmode: %s\nerror: %d\n", in.JobID, reserved, at, overhead, mode, fluxerr) - reserved, at, overhead, mode, fluxerr := s.cli.Info(int64(in.JobID)) - fmt.Println("\n\t----Job Info output---") - fmt.Printf("jobid: %d\nreserved: %t\nat: %d\noverhead: %f\nmode: %s\nerror: %d\n", in.JobID, reserved, at, overhead, mode, fluxerr) - - fmt.Printf("[GRPCServer] Sending Cancel response %v\n", dr) + klog.Infof("[GRPCServer] Sending Cancel response %v\n", dr) return dr, nil } -func (s *Fluxion) Match(ctx context.Context, in *pb.MatchRequest) (*pb.MatchResponse, error) { - filename := "/home/data/jobspecs/jobspec.yaml" - jobspec.CreateJobSpecYaml(in.Ps, in.Count, filename) +// Match wraps the MatchAllocate function of the fluxion go bindings +// If a match is not possible, we return the error and an empty response +func (fluxion *Fluxion) Match(ctx context.Context, in *pb.MatchRequest) (*pb.MatchResponse, error) { + + emptyResponse := &pb.MatchResponse{} - spec, err := os.ReadFile(filename) + // Prepare an empty match response (that can still be serialized) + klog.Infof("[Fluence] Received Match request %v\n", in) + + // Generate the jobspec, array of bytes converted to string + spec, err := jobspec.CreateJobSpecYaml(in.Ps, in.Count) if err != nil { - return nil, errors.New("Error reading jobspec") + return emptyResponse, err } - fmt.Printf("[GRPCServer] Received Match request %v\n", in) - reserved, allocated, at, overhead, jobid, fluxerr := s.cli.MatchAllocate(false, string(spec)) + // Ask flux to match allocate! + reserved, allocated, at, overhead, jobid, fluxerr := fluxion.cli.MatchAllocate(false, string(spec)) utils.PrintOutput(reserved, allocated, at, overhead, jobid, fluxerr) - fmt.Printf("[MatchRPC] Errors so far: %s\n", s.cli.GetErrMsg()) + // Be explicit about errors (or not) + errorMessages := fluxion.cli.GetErrMsg() + if errorMessages == "" { + klog.Infof("[Fluence] There are no errors") + } else { + klog.Infof("[Fluence] Match errors so far: %s\n", errorMessages) + } if fluxerr != nil { - return nil, errors.New("Error in ReapiCliMatchAllocate") + klog.Infof("[Fluence] Match Flux err is %w\n", fluxerr) + return emptyResponse, errors.New("[Fluence] Error in ReapiCliMatchAllocate") } + // This usually means we cannot allocate + // We need to return an error here otherwise we try to pass an empty string + // to other RPC endpoints and get back an error. if allocated == "" { - return nil, nil + klog.Infof("[Fluence] Allocated is empty") + return emptyResponse, errors.New("Allocation was not possible") } - nodetasks := utils.ParseAllocResult(allocated) - + // Pass the spec name in so we can include it in the allocation result + // This will allow us to inspect the ordering later. 
+ nodetasks := utils.ParseAllocResult(allocated, in.Ps.Container) nodetaskslist := make([]*pb.NodeAlloc, len(nodetasks)) for i, result := range nodetasks { nodetaskslist[i] = &pb.NodeAlloc{ @@ -98,6 +116,6 @@ func (s *Fluxion) Match(ctx context.Context, in *pb.MatchRequest) (*pb.MatchResp } } mr := &pb.MatchResponse{PodID: in.Ps.Id, Nodelist: nodetaskslist, JobID: int64(jobid)} - fmt.Printf("[GRPCServer] Response %v \n", mr) + klog.Infof("[Fluence] Match response %v \n", mr) return mr, nil } diff --git a/src/fluence/go.mod b/src/fluence/go.mod index 5a14548..01fc126 100644 --- a/src/fluence/go.mod +++ b/src/fluence/go.mod @@ -1,18 +1,45 @@ module github.com/flux-framework/flux-k8s/flux-plugin/fluence -go 1.16 +go 1.21 require ( - github.com/flux-framework/flux-sched/resource/reapi/bindings/go v0.0.0-20231213021445-250eac78a675 + github.com/flux-framework/fluxion-go v0.32.1-0.20240420052153-909523c84ca2 google.golang.org/grpc v1.38.0 google.golang.org/protobuf v1.26.0 gopkg.in/yaml.v2 v2.4.0 k8s.io/api v0.22.3 k8s.io/apimachinery v0.22.3 k8s.io/client-go v0.22.3 + k8s.io/klog/v2 v2.9.0 k8s.io/kubectl v0.0.0 ) +require ( + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/go-logr/logr v0.4.0 // indirect + github.com/gogo/protobuf v1.3.2 // indirect + github.com/golang/protobuf v1.5.2 // indirect + github.com/google/go-cmp v0.5.5 // indirect + github.com/google/gofuzz v1.1.0 // indirect + github.com/googleapis/gnostic v0.5.5 // indirect + github.com/json-iterator/go v1.1.11 // indirect + github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect + github.com/modern-go/reflect2 v1.0.1 // indirect + golang.org/x/net v0.0.0-20210520170846-37e1c6afe023 // indirect + golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d // indirect + golang.org/x/sys v0.0.0-20210616094352-59db8d763f22 // indirect + golang.org/x/term v0.0.0-20210220032956-6a3ed077a48d // indirect + golang.org/x/text v0.3.6 // indirect + golang.org/x/time v0.0.0-20210723032227-1f47c861a9ac // indirect + google.golang.org/appengine v1.6.5 // indirect + google.golang.org/genproto v0.0.0-20210602131652-f16073e35f0c // indirect + gopkg.in/inf.v0 v0.9.1 // indirect + gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b // indirect + k8s.io/utils v0.0.0-20210819203725-bdf08cb9a70a // indirect + sigs.k8s.io/structured-merge-diff/v4 v4.1.2 // indirect + sigs.k8s.io/yaml v1.2.0 // indirect +) + replace ( k8s.io/api => k8s.io/api v0.22.3 k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.22.3 diff --git a/src/fluence/go.sum b/src/fluence/go.sum index 19e571c..534497d 100644 --- a/src/fluence/go.sum +++ b/src/fluence/go.sum @@ -98,8 +98,10 @@ github.com/exponent-io/jsonpath v0.0.0-20151013193312-d6023ce2651d/go.mod h1:ZZM github.com/fatih/camelcase v1.0.0/go.mod h1:yN2Sb0lFhZJUdVvtELVWefmrXpuZESvPmqwoZc+/fpc= github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= github.com/felixge/httpsnoop v1.0.1/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= -github.com/flux-framework/flux-sched/resource/reapi/bindings/go v0.0.0-20231213021445-250eac78a675 h1:FgEA3pnL/kDoLaVOUDa401yainApQJaow9jeBPg4dek= -github.com/flux-framework/flux-sched/resource/reapi/bindings/go v0.0.0-20231213021445-250eac78a675/go.mod h1:yhmzNyn45YhoxEohh1Sl3h3izLMqL7qpcvmYTRpv7eY= +github.com/flux-framework/fluxion-go v0.32.0 h1:NY6Y1mlTTTZhHD+CmAsDsdNTxUsAFDQoORpMZj8NFLI= +github.com/flux-framework/fluxion-go v0.32.0/go.mod h1:ZI3QxSvUfgJE2Snur/SntJmVfpMjr6D4ICVmdqJ9fkQ= 
+github.com/flux-framework/fluxion-go v0.32.1-0.20240420052153-909523c84ca2 h1:Yz/vVX0XfB2q51ZLh2p8YI5vphvv0rZF4PqtKPscvsY= +github.com/flux-framework/fluxion-go v0.32.1-0.20240420052153-909523c84ca2/go.mod h1:jA5+kOSLxchFzixzYEvMAGjkXB5yszO/HxUwdhX/5/U= github.com/form3tech-oss/jwt-go v3.2.2+incompatible/go.mod h1:pbq4aXjuKjdthFRnoDwaVPLA+WlJuPGy+QneDUgJi2k= github.com/form3tech-oss/jwt-go v3.2.3+incompatible/go.mod h1:pbq4aXjuKjdthFRnoDwaVPLA+WlJuPGy+QneDUgJi2k= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= diff --git a/src/fluence/jgf/jgf.go b/src/fluence/jgf/jgf.go index d12148b..8a047f9 100644 --- a/src/fluence/jgf/jgf.go +++ b/src/fluence/jgf/jgf.go @@ -5,7 +5,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -17,57 +17,42 @@ package jgf import ( "encoding/json" + "fmt" "log" "os" "strconv" "strings" ) -type node struct { - Id string `json:"id"` - Label string `json:"label,omitempty"` - Metadata nodeMetadata `json:"metadata,omitempty"` -} - -type edge struct { - Source string `json:"source"` - Relation string `json:"relation,omitempty"` - Target string `json:"target"` - Directed bool `json:"directed,omitempty"` - Metadata edgeMetadata `json:"metadata"` -} - -type edgeMetadata struct { - Name map[string]string `json:"name,omitempty"` -} - -type nodeMetadata struct { - Type string `json:"type"` - Basename string `json:"basename"` - Name string `json:"name"` - Id int `json:"id"` - Uniq_id int `json:"uniq_id"` - Rank int `json:"rank,omitempty"` - Exclusive bool `json:"exclusive"` - Unit string `json:"unit"` - Size int `json:"size"` - Paths map[string]string `json:"paths,omitempty"` - Properties map[string]string `json:"properties,omitempty"` -} - -type graph struct { - Nodes []node `json:"nodes"` - Edges []edge `json:"edges"` - // Metadata metadata `json:"metadata,omitempty"` - Directed bool `json:"directed,omitempty"` -} - -type Fluxjgf struct { - Graph graph `json:"graph"` - Elements int `json:"-"` - NodeMap map[string]node `json:"-"` -} +var ( + // Defaults for nodes + defaultExclusive = false + defaultRank = int64(-1) + defaultSize = int64(1) + defaultUnit = "" + + // Relations + ContainsRelation = "contains" + InRelation = "in" + + // Vertex (node) types + // These are public to be used in the utils package + ClusterType = "cluster" + NodeType = "node" + CoreType = "core" + VirtualCoreType = "vcore" + RackType = "rack" + SocketType = "socket" + SubnetType = "subnet" + MemoryType = "memory" + NvidiaGPU = "nvidiagpu" + GPUType = "gpu" + + // Paths + containmentKey = "containment" +) +// InitJGF initializes the Flux Json Graph Format object func InitJGF() (fluxgraph Fluxjgf) { var g graph fluxgraph = Fluxjgf{ @@ -77,155 +62,146 @@ func InitJGF() (fluxgraph Fluxjgf) { } return } + +// getDefaultPaths returns a new map with empty containment +// this cannot be a global shared variable or we get an error +// about inserting an edge to itself. 
+func getDefaultPaths() map[string]string { + return map[string]string{containmentKey: ""} +} + +// addNode adds a node to the JGF func (g *Fluxjgf) addNode(toadd node) { g.Graph.Nodes = append(g.Graph.Nodes, toadd) g.NodeMap[toadd.Id] = toadd g.Elements = g.Elements + 1 } +// MakeEdge creates an edge for the JGF func (g *Fluxjgf) MakeEdge(source string, target string, contains string) { newedge := edge{ Source: source, Target: target, Metadata: edgeMetadata{ - Name: map[string]string{ - "containment": contains, - }, + Name: map[string]string{containmentKey: contains}, }, } g.Graph.Edges = append(g.Graph.Edges, newedge) - if contains == "contains" { + if contains == ContainsRelation { tnode := g.NodeMap[target] - tnode.Metadata.Paths["containment"] = g.NodeMap[source].Metadata.Paths["containment"] + "/" + tnode.Metadata.Name - } - -} - -func processLabels(labels *map[string]string, filter string) (filtered map[string]string) { - filtered = make(map[string]string, 0) - for key, v := range *labels { - if strings.Contains(key, filter) { - - filtered[key] = v - } + tnode.Metadata.Paths[containmentKey] = g.NodeMap[source].Metadata.Paths[containmentKey] + "/" + tnode.Metadata.Name } - return } - -func (g *Fluxjgf) MakeSubnet(index int, ip string) string { +// MakeSubnet creates a subnet for the graph +func (g *Fluxjgf) MakeSubnet(index int64, ip string) string { newnode := node{ - Id: strconv.Itoa(g.Elements), + Id: fmt.Sprintf("%d", g.Elements), Metadata: nodeMetadata{ - Type: "subnet", + Type: SubnetType, Basename: ip, - Name: ip + strconv.Itoa(g.Elements), + Name: ip + fmt.Sprintf("%d", g.Elements), Id: index, Uniq_id: g.Elements, - Rank: -1, - Exclusive: false, - Unit: "", - Size: 1, - Paths: map[string]string{ - "containment": "", - }, + Rank: defaultRank, + Exclusive: defaultExclusive, + Unit: defaultUnit, + Size: defaultSize, + Paths: getDefaultPaths(), }, } g.addNode(newnode) return newnode.Id } +// MakeNode creates a new node for the graph func (g *Fluxjgf) MakeNode(index int, exclusive bool, subnet string) string { newnode := node{ - Id: strconv.Itoa(g.Elements), + Id: fmt.Sprintf("%d", g.Elements), Metadata: nodeMetadata{ - Type: "node", + Type: NodeType, Basename: subnet, - Name: subnet + strconv.Itoa(g.Elements), + Name: subnet + fmt.Sprintf("%d", g.Elements), Id: g.Elements, Uniq_id: g.Elements, - Rank: -1, + Rank: defaultRank, Exclusive: exclusive, - Unit: "", - Size: 1, - Paths: map[string]string{ - "containment": "", - }, + Unit: defaultUnit, + Size: defaultSize, + Paths: getDefaultPaths(), }, } g.addNode(newnode) return newnode.Id } -func (g *Fluxjgf) MakeSocket(index int, name string) string { +// MakeSocket creates a socket for the graph +func (g *Fluxjgf) MakeSocket(index int64, name string) string { newnode := node{ - Id: strconv.Itoa(g.Elements), + Id: fmt.Sprintf("%d", g.Elements), Metadata: nodeMetadata{ - Type: "socket", + Type: SocketType, Basename: name, - Name: name + strconv.Itoa(index), + Name: name + fmt.Sprintf("%d", index), Id: index, Uniq_id: g.Elements, - Rank: -1, - Exclusive: false, - Unit: "", - Size: 1, - Paths: map[string]string{ - "containment": "", - }, + Rank: defaultRank, + Exclusive: defaultExclusive, + Unit: defaultUnit, + Size: defaultSize, + Paths: getDefaultPaths(), }, } g.addNode(newnode) return newnode.Id } -func (g *Fluxjgf) MakeCore(index int, name string) string { +// MakeCore creates a core for the graph +func (g *Fluxjgf) MakeCore(index int64, name string) string { newnode := node{ - Id: strconv.Itoa(g.Elements), + Id: fmt.Sprintf("%d", 
g.Elements), Metadata: nodeMetadata{ - Type: "core", + Type: CoreType, Basename: name, - Name: name + strconv.Itoa(index), + Name: name + fmt.Sprintf("%d", index), Id: index, Uniq_id: g.Elements, - Rank: -1, - Exclusive: false, - Unit: "", - Size: 1, - Paths: map[string]string{ - "containment": "", - }, + Rank: defaultRank, + Exclusive: defaultExclusive, + Unit: defaultUnit, + Size: defaultSize, + Paths: getDefaultPaths(), }, } g.addNode(newnode) return newnode.Id } -func (g *Fluxjgf) MakeVCore(coreid string, index int, name string) string { +// MakeVCore makes a vcore (I think 2 vcpu == 1 cpu) for the graph +func (g *Fluxjgf) MakeVCore(coreid string, index int64, name string) string { newnode := node{ - Id: strconv.Itoa(g.Elements), + Id: fmt.Sprintf("%d", g.Elements), Metadata: nodeMetadata{ - Type: "vcore", + Type: VirtualCoreType, Basename: name, - Name: name + strconv.Itoa(index), + Name: name + fmt.Sprintf("%d", index), Id: index, Uniq_id: g.Elements, - Rank: -1, - Exclusive: false, - Unit: "", - Size: 1, - Paths: map[string]string{ - "containment": "", - }, + Rank: defaultRank, + Exclusive: defaultExclusive, + Unit: defaultUnit, + Size: defaultSize, + Paths: getDefaultPaths(), }, } g.addNode(newnode) - g.MakeEdge(coreid, newnode.Id, "contains") - g.MakeEdge(newnode.Id, coreid, "in") + g.MakeEdge(coreid, newnode.Id, ContainsRelation) + g.MakeEdge(newnode.Id, coreid, InRelation) return newnode.Id } -func (g *Fluxjgf) MakeNFDProperties(coreid string, index int, filter string, labels *map[string]string) { +// MakeNFProperties makes the node feature discovery properties for the graph +func (g *Fluxjgf) MakeNFDProperties(coreid string, index int64, filter string, labels *map[string]string) { for key, _ := range *labels { if strings.Contains(key, filter) { name := strings.Split(key, "/")[1] @@ -234,116 +210,111 @@ func (g *Fluxjgf) MakeNFDProperties(coreid string, index int, filter string, lab } newnode := node{ - Id: strconv.Itoa(g.Elements), + Id: fmt.Sprintf("%d", g.Elements), Metadata: nodeMetadata{ Type: name, Basename: name, - Name: name + strconv.Itoa(index), + Name: name + fmt.Sprintf("%d", index), Id: index, Uniq_id: g.Elements, - Rank: -1, - Exclusive: false, - Unit: "", - Size: 1, - Paths: map[string]string{ - "containment": "", - }, + Rank: defaultRank, + Exclusive: defaultExclusive, + Unit: defaultUnit, + Size: defaultSize, + Paths: getDefaultPaths(), }, } g.addNode(newnode) - g.MakeEdge(coreid, newnode.Id, "contains") + g.MakeEdge(coreid, newnode.Id, ContainsRelation) } } } -func (g *Fluxjgf) MakeNFDPropertiesByValue(coreid string, index int, filter string, labels *map[string]string) { +func (g *Fluxjgf) MakeNFDPropertiesByValue(coreid string, index int64, filter string, labels *map[string]string) { for key, val := range *labels { if strings.Contains(key, filter) { name := val newnode := node{ - Id: strconv.Itoa(g.Elements), + Id: fmt.Sprintf("%d", g.Elements), Metadata: nodeMetadata{ Type: name, Basename: name, - Name: name + strconv.Itoa(index), + Name: name + fmt.Sprintf("%d", index), Id: index, Uniq_id: g.Elements, - Rank: -1, - Exclusive: false, - Unit: "", - Size: 1, - Paths: map[string]string{ - "containment": "", - }, + Rank: defaultRank, + Exclusive: defaultExclusive, + Unit: defaultUnit, + Size: defaultSize, + Paths: getDefaultPaths(), }, } g.addNode(newnode) - g.MakeEdge(coreid, newnode.Id, "contains") + g.MakeEdge(coreid, newnode.Id, ContainsRelation) } } } -func (g *Fluxjgf) MakeMemory(index int, name string, unit string, size int) string { +// MakeMemory creates 
memory for the graph +func (g *Fluxjgf) MakeMemory(index int64, name string, unit string, size int64) string { newnode := node{ - Id: strconv.Itoa(g.Elements), + Id: fmt.Sprintf("%d", g.Elements), Metadata: nodeMetadata{ - Type: "memory", + Type: MemoryType, Basename: name, - Name: name + strconv.Itoa(index), + Name: name + fmt.Sprintf("%d", index), Id: index, Uniq_id: g.Elements, - Rank: -1, - Exclusive: false, + Rank: defaultRank, + Exclusive: defaultExclusive, Unit: unit, Size: size, - Paths: map[string]string{ - "containment": "", - }, + Paths: getDefaultPaths(), }, } g.addNode(newnode) return newnode.Id } -func (g *Fluxjgf) MakeGPU(index int, name string, size int) string { +// MakeGPU makes a gpu for the graph +func (g *Fluxjgf) MakeGPU(index int64, name string, size int64) string { newnode := node{ - Id: strconv.Itoa(g.Elements), + Id: fmt.Sprintf("%d", g.Elements), Metadata: nodeMetadata{ - Type: "gpu", + Type: GPUType, Basename: name, - Name: name + strconv.Itoa(index), + Name: name + fmt.Sprintf("%d", index), Id: index, Uniq_id: g.Elements, - Rank: -1, - Exclusive: false, - Unit: "", + Rank: defaultRank, + Exclusive: defaultExclusive, + Unit: defaultUnit, Size: size, - Paths: map[string]string{ - "containment": "", - }, + Paths: getDefaultPaths(), }, } g.addNode(newnode) return newnode.Id } +// MakeCluster creates the cluster func (g *Fluxjgf) MakeCluster(clustername string) string { g.Elements = 0 newnode := node{ Id: strconv.Itoa(0), Metadata: nodeMetadata{ - Type: "cluster", + Type: ClusterType, Basename: clustername, Name: clustername + "0", Id: g.Elements, Uniq_id: 0, - Rank: -1, - Exclusive: false, - Unit: "", - Size: 1, + Rank: defaultRank, + Exclusive: defaultExclusive, + Unit: defaultUnit, + Size: defaultSize, Paths: map[string]string{ - "containment": "/" + clustername + "0", + containmentKey: "/" + clustername + "0", }, }, } @@ -351,22 +322,21 @@ func (g *Fluxjgf) MakeCluster(clustername string) string { return newnode.Id } -func (g *Fluxjgf) MakeRack(id int) string { +// MakeRack makes the rack +func (g *Fluxjgf) MakeRack(index int64) string { newnode := node{ - Id: strconv.Itoa(g.Elements), + Id: fmt.Sprintf("%d", g.Elements), Metadata: nodeMetadata{ - Type: "rack", - Basename: "rack", - Name: "rack" + strconv.Itoa(id), - Id: id, + Type: RackType, + Basename: RackType, + Name: RackType + fmt.Sprintf("%d", index), + Id: index, Uniq_id: g.Elements, - Rank: -1, - Exclusive: false, - Unit: "", - Size: 1, - Paths: map[string]string{ - "containment": "", - }, + Rank: defaultRank, + Exclusive: defaultExclusive, + Unit: defaultUnit, + Size: defaultSize, + Paths: getDefaultPaths(), }, } g.addNode(newnode) diff --git a/src/fluence/jgf/types.go b/src/fluence/jgf/types.go new file mode 100644 index 0000000..21ccd00 --- /dev/null +++ b/src/fluence/jgf/types.go @@ -0,0 +1,62 @@ +/* +Copyright © 2021 IBM Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package jgf + +type node struct { + Id string `json:"id"` + Label string `json:"label,omitempty"` + Metadata nodeMetadata `json:"metadata,omitempty"` +} + +type edge struct { + Source string `json:"source"` + Relation string `json:"relation,omitempty"` + Target string `json:"target"` + Directed bool `json:"directed,omitempty"` + Metadata edgeMetadata `json:"metadata"` +} + +type edgeMetadata struct { + Name map[string]string `json:"name,omitempty"` +} + +type nodeMetadata struct { + Type string `json:"type"` + Basename string `json:"basename"` + Name string `json:"name"` + Id int64 `json:"id"` + Uniq_id int64 `json:"uniq_id"` + Rank int64 `json:"rank,omitempty"` + Exclusive bool `json:"exclusive"` + Unit string `json:"unit"` + Size int64 `json:"size"` + Paths map[string]string `json:"paths,omitempty"` + Properties map[string]string `json:"properties,omitempty"` +} + +type graph struct { + Nodes []node `json:"nodes"` + Edges []edge `json:"edges"` + // Metadata metadata `json:"metadata,omitempty"` + Directed bool `json:"directed,omitempty"` +} + +type Fluxjgf struct { + Graph graph `json:"graph"` + Elements int64 `json:"-"` + NodeMap map[string]node `json:"-"` +} diff --git a/src/fluence/jobspec/jobspec.go b/src/fluence/jobspec/jobspec.go index 8ef90ae..96ed0fe 100644 --- a/src/fluence/jobspec/jobspec.go +++ b/src/fluence/jobspec/jobspec.go @@ -5,7 +5,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -18,16 +18,14 @@ package jobspec import ( "fmt" "log" - "math" - "os" pb "github.com/flux-framework/flux-k8s/flux-plugin/fluence/fluxcli-grpc" "gopkg.in/yaml.v2" ) - - /* + +Structure of the PodSpec that needs to be generated, for reference Ps: &pb.PodSpec{ Id: pod_jobspec.ID, Container: pod_jobspec.Containers[0].Image, @@ -38,149 +36,94 @@ Ps: &pb.PodSpec{ }, */ -func CreateJobSpecYaml(pr *pb.PodSpec, count int32, filename string) error { - socket_resources := make([]Resource, 1) - command := []string{pr.Container} - socket_resources[0] = Resource{Type: "core", Count: int64(pr.Cpu)} - if pr.Memory > 0 { - toMB := pr.Memory >> 20 - socket_resources = append(socket_resources, Resource{Type: "memory", Count: toMB}) - } +// CreateJobSpecYaml writes the protobuf jobspec into a yaml file +func CreateJobSpecYaml(spec *pb.PodSpec, count int32) ([]byte, error) { - if pr.Gpu > 0 { - socket_resources = append(socket_resources, Resource{Type: "gpu", Count: pr.Gpu}) - } + command := []string{spec.Container} + fmt.Println("Labels ", spec.Labels, " ", len(spec.Labels)) - fmt.Println("Labels ", pr.Labels, " ", len(pr.Labels)) + js := JobSpec{ + Version: Version{Version: 9999}, + Attributes: Attribute{System{Duration: 3600}}, - js := JobSpec{ - Version: Version{ - Version: 9999, - }, - Attributes: Attribute{ - System{ - Duration: 3600, - }, - }, - Tasks: []Task{ - { - // Command: "[\""+command+"\"]", - Command: command, - Slot: "default", - Counts: Count{ - PerSlot: 1, - }, - }, + // The name of the task likely needs to correspond with the pod + // Since we can't easily change the proto file, for now it is + // storing the pod namespaced name. 
+ Tasks: []Task{ + { + Command: command, + Slot: "default", + Counts: Count{PerSlot: 1}, }, - } + }, + } + + // Assemble resources! + socketResources := createSocketResources(spec) + js.Version.Resources = createResources(spec, socketResources, count) + + // Write bytes to file + yamlbytes, err := yaml.Marshal(&js) + if err != nil { + log.Fatalf("[JobSpec] yaml.Marshal failed with '%s'\n", err) + return yamlbytes, err + } + return yamlbytes, nil +} + +// createSocketResources creates the socket resources for the JobSpec +func createSocketResources(spec *pb.PodSpec) []Resource { + + socketResources := []Resource{ + { + Type: "core", Count: int64(spec.Cpu), + }, + } + + // TODO double check what we are converting from -> to + if spec.Memory > 0 { + toMB := spec.Memory >> 20 + socketResources = append(socketResources, Resource{Type: "memory", Count: toMB}) + } + + if spec.Gpu > 0 { + socketResources = append(socketResources, Resource{Type: "gpu", Count: spec.Gpu}) + } + return socketResources +} + +// createResources assembles the list of JobSpec resources +func createResources(spec *pb.PodSpec, socketResources []Resource, count int32) []Resource { - slot_resource := make([]Resource, 1) - slot_resource[0] = Resource{ - Type: "slot", + slotResource := []Resource{ + { + Type: "slot", Count: int64(count), Label: "default", - With: socket_resources, - } - - if len(pr.Labels) > 0 { - for _, label := range pr.Labels { - if label == "zone" { - node_resource := make([]Resource, 1) - node_resource[0] = Resource{ - Type: "subnet", + With: socketResources, + }, + } + + // Presence of the zone label means we need to add a subnet + if len(spec.Labels) > 0 { + for _, label := range spec.Labels { + if label == "zone" { + nodeResource := []Resource{ + { + Type: "subnet", Count: 1, With: []Resource{ { - Type: "node", + Type: "node", Count: 1, - With: slot_resource, /*[]Resource{ - { - Type: "socket", - Count: 1, - With: slot_resource, - }, - },*/ + With: slotResource, }, }, - } - js.Version.Resources = node_resource + }, } - + return nodeResource } - - } else { - fmt.Println("No labels, going with plain JobSpec") - js.Version.Resources = slot_resource - } - - // js := JobSpec{ - // Version: Version{ - // Version: 9999, - // Resources: []Resource{ - // { - // Type: "node", - // Count: 1, - // With: []Resource{ - // { - // Type: "socket", - // Count: 1, - // With: []Resource{ - // { - // Type: "slot", - // Count: int64(count), - // Label: "default", - // With: socket_resources, - // }, - // }, - // }, - // }, - // }, - // }, - // }, - // Attributes: Attribute{ - // System{ - // Duration: 3600, - // }, - // }, - // Tasks: []Task{ - // { - // // Command: "[\""+command+"\"]", - // Command: command, - // Slot: "default", - // Counts: Count{ - // PerSlot: 1, - // }, - // }, - // }, - // } - yamlbytes, err := yaml.Marshal(&js) - if err != nil { - log.Fatalf("[JobSpec] yaml.Marshal failed with '%s'\n", err) - return err } - fmt.Printf("[JobSpec] JobSpec in YAML:\n%s\n", string(yamlbytes)) - f, err := os.Create(filename) - if err != nil { - log.Fatalf("[JobSpec] Couldn't create yaml file!!\n") - return err - } - defer f.Close() - - _, err = f.Write(yamlbytes) - if err != nil { - log.Fatalf("[JobSpec] Couldn't write yaml file!!\n") - return err - } - - _, err = f.WriteString("\n") - if err != nil { - log.Fatalf("[JobSpec] Couldn't write yaml file!!\n") - return err - } - return nil -} - -func toGB(bytes int64) int64 { - res := float64(bytes) / math.Pow(10, 9) - return int64(res) + } + return slotResource } diff --git 
a/src/fluence/jobspec/types.go b/src/fluence/jobspec/types.go index 9f4e4fc..8d6d06f 100644 --- a/src/fluence/jobspec/types.go +++ b/src/fluence/jobspec/types.go @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + package jobspec type Version struct { diff --git a/src/fluence/service-grpc/service.pb.go b/src/fluence/service-grpc/service.pb.go new file mode 100644 index 0000000..eca0e69 --- /dev/null +++ b/src/fluence/service-grpc/service.pb.go @@ -0,0 +1,351 @@ +// Code generated by protoc-gen-go. DO NOT EDIT. +// versions: +// protoc-gen-go v1.28.1 +// protoc v3.20.3 +// source: fluence/service-grpc/service.proto + +package service + +import ( + protoreflect "google.golang.org/protobuf/reflect/protoreflect" + protoimpl "google.golang.org/protobuf/runtime/protoimpl" + reflect "reflect" + sync "sync" +) + +const ( + // Verify that this generated code is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) + // Verify that runtime/protoimpl is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) +) + +// GroupRequest for a group +type GroupRequest struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Group string `protobuf:"bytes,1,opt,name=group,proto3" json:"group,omitempty"` +} + +func (x *GroupRequest) Reset() { + *x = GroupRequest{} + if protoimpl.UnsafeEnabled { + mi := &file_fluence_service_grpc_service_proto_msgTypes[0] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *GroupRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GroupRequest) ProtoMessage() {} + +func (x *GroupRequest) ProtoReflect() protoreflect.Message { + mi := &file_fluence_service_grpc_service_proto_msgTypes[0] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GroupRequest.ProtoReflect.Descriptor instead. +func (*GroupRequest) Descriptor() ([]byte, []int) { + return file_fluence_service_grpc_service_proto_rawDescGZIP(), []int{0} +} + +func (x *GroupRequest) GetGroup() string { + if x != nil { + return x.Group + } + return "" +} + +// GroupResponse +type GroupResponse struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"` + Size int64 `protobuf:"varint,2,opt,name=size,proto3" json:"size,omitempty"` +} + +func (x *GroupResponse) Reset() { + *x = GroupResponse{} + if protoimpl.UnsafeEnabled { + mi := &file_fluence_service_grpc_service_proto_msgTypes[1] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *GroupResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GroupResponse) ProtoMessage() {} + +func (x *GroupResponse) ProtoReflect() protoreflect.Message { + mi := &file_fluence_service_grpc_service_proto_msgTypes[1] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GroupResponse.ProtoReflect.Descriptor instead. 
+func (*GroupResponse) Descriptor() ([]byte, []int) { + return file_fluence_service_grpc_service_proto_rawDescGZIP(), []int{1} +} + +func (x *GroupResponse) GetName() string { + if x != nil { + return x.Name + } + return "" +} + +func (x *GroupResponse) GetSize() int64 { + if x != nil { + return x.Size + } + return 0 +} + +type ResourceRequest struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields +} + +func (x *ResourceRequest) Reset() { + *x = ResourceRequest{} + if protoimpl.UnsafeEnabled { + mi := &file_fluence_service_grpc_service_proto_msgTypes[2] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ResourceRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ResourceRequest) ProtoMessage() {} + +func (x *ResourceRequest) ProtoReflect() protoreflect.Message { + mi := &file_fluence_service_grpc_service_proto_msgTypes[2] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ResourceRequest.ProtoReflect.Descriptor instead. +func (*ResourceRequest) Descriptor() ([]byte, []int) { + return file_fluence_service_grpc_service_proto_rawDescGZIP(), []int{2} +} + +type ResourceResponse struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Graph string `protobuf:"bytes,1,opt,name=graph,proto3" json:"graph,omitempty"` +} + +func (x *ResourceResponse) Reset() { + *x = ResourceResponse{} + if protoimpl.UnsafeEnabled { + mi := &file_fluence_service_grpc_service_proto_msgTypes[3] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ResourceResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ResourceResponse) ProtoMessage() {} + +func (x *ResourceResponse) ProtoReflect() protoreflect.Message { + mi := &file_fluence_service_grpc_service_proto_msgTypes[3] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ResourceResponse.ProtoReflect.Descriptor instead. 
+func (*ResourceResponse) Descriptor() ([]byte, []int) { + return file_fluence_service_grpc_service_proto_rawDescGZIP(), []int{3} +} + +func (x *ResourceResponse) GetGraph() string { + if x != nil { + return x.Graph + } + return "" +} + +var File_fluence_service_grpc_service_proto protoreflect.FileDescriptor + +var file_fluence_service_grpc_service_proto_rawDesc = []byte{ + 0x0a, 0x22, 0x66, 0x6c, 0x75, 0x65, 0x6e, 0x63, 0x65, 0x2f, 0x73, 0x65, 0x72, 0x76, 0x69, 0x63, + 0x65, 0x2d, 0x67, 0x72, 0x70, 0x63, 0x2f, 0x73, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x2e, 0x70, + 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x07, 0x73, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x22, 0x24, 0x0a, + 0x0c, 0x47, 0x72, 0x6f, 0x75, 0x70, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x14, 0x0a, + 0x05, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x67, 0x72, + 0x6f, 0x75, 0x70, 0x22, 0x37, 0x0a, 0x0d, 0x47, 0x72, 0x6f, 0x75, 0x70, 0x52, 0x65, 0x73, 0x70, + 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x12, 0x0a, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x01, 0x20, 0x01, + 0x28, 0x09, 0x52, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x12, 0x12, 0x0a, 0x04, 0x73, 0x69, 0x7a, 0x65, + 0x18, 0x02, 0x20, 0x01, 0x28, 0x03, 0x52, 0x04, 0x73, 0x69, 0x7a, 0x65, 0x22, 0x11, 0x0a, 0x0f, + 0x52, 0x65, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x22, + 0x28, 0x0a, 0x10, 0x52, 0x65, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x52, 0x65, 0x73, 0x70, 0x6f, + 0x6e, 0x73, 0x65, 0x12, 0x14, 0x0a, 0x05, 0x67, 0x72, 0x61, 0x70, 0x68, 0x18, 0x01, 0x20, 0x01, + 0x28, 0x09, 0x52, 0x05, 0x67, 0x72, 0x61, 0x70, 0x68, 0x32, 0xda, 0x01, 0x0a, 0x15, 0x45, 0x78, + 0x74, 0x65, 0x72, 0x6e, 0x61, 0x6c, 0x50, 0x6c, 0x75, 0x67, 0x69, 0x6e, 0x53, 0x65, 0x72, 0x76, + 0x69, 0x63, 0x65, 0x12, 0x45, 0x0a, 0x0c, 0x47, 0x65, 0x74, 0x52, 0x65, 0x73, 0x6f, 0x75, 0x72, + 0x63, 0x65, 0x73, 0x12, 0x18, 0x2e, 0x73, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x2e, 0x52, 0x65, + 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x19, 0x2e, + 0x73, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x2e, 0x52, 0x65, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, + 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x12, 0x3d, 0x0a, 0x0a, 0x4c, 0x69, + 0x73, 0x74, 0x47, 0x72, 0x6f, 0x75, 0x70, 0x73, 0x12, 0x15, 0x2e, 0x73, 0x65, 0x72, 0x76, 0x69, + 0x63, 0x65, 0x2e, 0x47, 0x72, 0x6f, 0x75, 0x70, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, + 0x16, 0x2e, 0x73, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x2e, 0x47, 0x72, 0x6f, 0x75, 0x70, 0x52, + 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x12, 0x3b, 0x0a, 0x08, 0x47, 0x65, 0x74, + 0x47, 0x72, 0x6f, 0x75, 0x70, 0x12, 0x15, 0x2e, 0x73, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x2e, + 0x47, 0x72, 0x6f, 0x75, 0x70, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x16, 0x2e, 0x73, + 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x2e, 0x47, 0x72, 0x6f, 0x75, 0x70, 0x52, 0x65, 0x73, 0x70, + 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x42, 0x0e, 0x5a, 0x0c, 0x67, 0x72, 0x70, 0x63, 0x2f, 0x73, + 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, +} + +var ( + file_fluence_service_grpc_service_proto_rawDescOnce sync.Once + file_fluence_service_grpc_service_proto_rawDescData = file_fluence_service_grpc_service_proto_rawDesc +) + +func file_fluence_service_grpc_service_proto_rawDescGZIP() []byte { + file_fluence_service_grpc_service_proto_rawDescOnce.Do(func() { + file_fluence_service_grpc_service_proto_rawDescData = 
protoimpl.X.CompressGZIP(file_fluence_service_grpc_service_proto_rawDescData) + }) + return file_fluence_service_grpc_service_proto_rawDescData +} + +var file_fluence_service_grpc_service_proto_msgTypes = make([]protoimpl.MessageInfo, 4) +var file_fluence_service_grpc_service_proto_goTypes = []interface{}{ + (*GroupRequest)(nil), // 0: service.GroupRequest + (*GroupResponse)(nil), // 1: service.GroupResponse + (*ResourceRequest)(nil), // 2: service.ResourceRequest + (*ResourceResponse)(nil), // 3: service.ResourceResponse +} +var file_fluence_service_grpc_service_proto_depIdxs = []int32{ + 2, // 0: service.ExternalPluginService.GetResources:input_type -> service.ResourceRequest + 0, // 1: service.ExternalPluginService.ListGroups:input_type -> service.GroupRequest + 0, // 2: service.ExternalPluginService.GetGroup:input_type -> service.GroupRequest + 3, // 3: service.ExternalPluginService.GetResources:output_type -> service.ResourceResponse + 1, // 4: service.ExternalPluginService.ListGroups:output_type -> service.GroupResponse + 1, // 5: service.ExternalPluginService.GetGroup:output_type -> service.GroupResponse + 3, // [3:6] is the sub-list for method output_type + 0, // [0:3] is the sub-list for method input_type + 0, // [0:0] is the sub-list for extension type_name + 0, // [0:0] is the sub-list for extension extendee + 0, // [0:0] is the sub-list for field type_name +} + +func init() { file_fluence_service_grpc_service_proto_init() } +func file_fluence_service_grpc_service_proto_init() { + if File_fluence_service_grpc_service_proto != nil { + return + } + if !protoimpl.UnsafeEnabled { + file_fluence_service_grpc_service_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*GroupRequest); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_fluence_service_grpc_service_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*GroupResponse); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_fluence_service_grpc_service_proto_msgTypes[2].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*ResourceRequest); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_fluence_service_grpc_service_proto_msgTypes[3].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*ResourceResponse); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + } + type x struct{} + out := protoimpl.TypeBuilder{ + File: protoimpl.DescBuilder{ + GoPackagePath: reflect.TypeOf(x{}).PkgPath(), + RawDescriptor: file_fluence_service_grpc_service_proto_rawDesc, + NumEnums: 0, + NumMessages: 4, + NumExtensions: 0, + NumServices: 1, + }, + GoTypes: file_fluence_service_grpc_service_proto_goTypes, + DependencyIndexes: file_fluence_service_grpc_service_proto_depIdxs, + MessageInfos: file_fluence_service_grpc_service_proto_msgTypes, + }.Build() + File_fluence_service_grpc_service_proto = out.File + file_fluence_service_grpc_service_proto_rawDesc = nil + file_fluence_service_grpc_service_proto_goTypes = nil + file_fluence_service_grpc_service_proto_depIdxs = nil +} diff --git a/src/fluence/service-grpc/service.proto b/src/fluence/service-grpc/service.proto new file mode 100644 index 0000000..6240314 --- 
/dev/null +++ b/src/fluence/service-grpc/service.proto @@ -0,0 +1,34 @@ +syntax = "proto3"; +option go_package = "grpc/service"; + +package service; + + +// Service definition for an external plugin like kubectl +service ExternalPluginService { + + // This is supported via a shared file in the container + rpc GetResources(ResourceRequest) returns (ResourceResponse) {} + + // Note we currently cannot support getting group metadata, need to add handle to get info, etc. + rpc ListGroups(GroupRequest) returns (GroupResponse) {} + rpc GetGroup(GroupRequest) returns (GroupResponse) {} +} + +// GroupRequest for a group +message GroupRequest { + string group = 1; +} + +// GroupResponse +message GroupResponse { + string name = 1; + int64 size = 2; +} + +message ResourceRequest {} +message ResourceResponse { + string graph = 1; +} + + diff --git a/src/fluence/service-grpc/service_grpc.pb.go b/src/fluence/service-grpc/service_grpc.pb.go new file mode 100644 index 0000000..c15f8f3 --- /dev/null +++ b/src/fluence/service-grpc/service_grpc.pb.go @@ -0,0 +1,181 @@ +// Code generated by protoc-gen-go-grpc. DO NOT EDIT. +// versions: +// - protoc-gen-go-grpc v1.2.0 +// - protoc v3.20.3 +// source: fluence/service-grpc/service.proto + +package service + +import ( + context "context" + grpc "google.golang.org/grpc" + codes "google.golang.org/grpc/codes" + status "google.golang.org/grpc/status" +) + +// This is a compile-time assertion to ensure that this generated file +// is compatible with the grpc package it is being compiled against. +// Requires gRPC-Go v1.32.0 or later. +const _ = grpc.SupportPackageIsVersion7 + +// ExternalPluginServiceClient is the client API for ExternalPluginService service. +// +// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream. +type ExternalPluginServiceClient interface { + // This is supported via a shared file in the container + GetResources(ctx context.Context, in *ResourceRequest, opts ...grpc.CallOption) (*ResourceResponse, error) + // Note we currently cannot support getting group metadata, need to add handle to get info, etc. + ListGroups(ctx context.Context, in *GroupRequest, opts ...grpc.CallOption) (*GroupResponse, error) + GetGroup(ctx context.Context, in *GroupRequest, opts ...grpc.CallOption) (*GroupResponse, error) +} + +type externalPluginServiceClient struct { + cc grpc.ClientConnInterface +} + +func NewExternalPluginServiceClient(cc grpc.ClientConnInterface) ExternalPluginServiceClient { + return &externalPluginServiceClient{cc} +} + +func (c *externalPluginServiceClient) GetResources(ctx context.Context, in *ResourceRequest, opts ...grpc.CallOption) (*ResourceResponse, error) { + out := new(ResourceResponse) + err := c.cc.Invoke(ctx, "/service.ExternalPluginService/GetResources", in, out, opts...) + if err != nil { + return nil, err + } + return out, nil +} + +func (c *externalPluginServiceClient) ListGroups(ctx context.Context, in *GroupRequest, opts ...grpc.CallOption) (*GroupResponse, error) { + out := new(GroupResponse) + err := c.cc.Invoke(ctx, "/service.ExternalPluginService/ListGroups", in, out, opts...) + if err != nil { + return nil, err + } + return out, nil +} + +func (c *externalPluginServiceClient) GetGroup(ctx context.Context, in *GroupRequest, opts ...grpc.CallOption) (*GroupResponse, error) { + out := new(GroupResponse) + err := c.cc.Invoke(ctx, "/service.ExternalPluginService/GetGroup", in, out, opts...) 
+ if err != nil { + return nil, err + } + return out, nil +} + +// ExternalPluginServiceServer is the server API for ExternalPluginService service. +// All implementations must embed UnimplementedExternalPluginServiceServer +// for forward compatibility +type ExternalPluginServiceServer interface { + // This is supported via a shared file in the container + GetResources(context.Context, *ResourceRequest) (*ResourceResponse, error) + // Note we currently cannot support getting group metadata, need to add handle to get info, etc. + ListGroups(context.Context, *GroupRequest) (*GroupResponse, error) + GetGroup(context.Context, *GroupRequest) (*GroupResponse, error) + mustEmbedUnimplementedExternalPluginServiceServer() +} + +// UnimplementedExternalPluginServiceServer must be embedded to have forward compatible implementations. +type UnimplementedExternalPluginServiceServer struct { +} + +func (UnimplementedExternalPluginServiceServer) GetResources(context.Context, *ResourceRequest) (*ResourceResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method GetResources not implemented") +} +func (UnimplementedExternalPluginServiceServer) ListGroups(context.Context, *GroupRequest) (*GroupResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method ListGroups not implemented") +} +func (UnimplementedExternalPluginServiceServer) GetGroup(context.Context, *GroupRequest) (*GroupResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method GetGroup not implemented") +} +func (UnimplementedExternalPluginServiceServer) mustEmbedUnimplementedExternalPluginServiceServer() {} + +// UnsafeExternalPluginServiceServer may be embedded to opt out of forward compatibility for this service. +// Use of this interface is not recommended, as added methods to ExternalPluginServiceServer will +// result in compilation errors. 
+type UnsafeExternalPluginServiceServer interface { + mustEmbedUnimplementedExternalPluginServiceServer() +} + +func RegisterExternalPluginServiceServer(s grpc.ServiceRegistrar, srv ExternalPluginServiceServer) { + s.RegisterService(&ExternalPluginService_ServiceDesc, srv) +} + +func _ExternalPluginService_GetResources_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(ResourceRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(ExternalPluginServiceServer).GetResources(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: "/service.ExternalPluginService/GetResources", + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(ExternalPluginServiceServer).GetResources(ctx, req.(*ResourceRequest)) + } + return interceptor(ctx, in, info, handler) +} + +func _ExternalPluginService_ListGroups_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(GroupRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(ExternalPluginServiceServer).ListGroups(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: "/service.ExternalPluginService/ListGroups", + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(ExternalPluginServiceServer).ListGroups(ctx, req.(*GroupRequest)) + } + return interceptor(ctx, in, info, handler) +} + +func _ExternalPluginService_GetGroup_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(GroupRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(ExternalPluginServiceServer).GetGroup(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: "/service.ExternalPluginService/GetGroup", + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(ExternalPluginServiceServer).GetGroup(ctx, req.(*GroupRequest)) + } + return interceptor(ctx, in, info, handler) +} + +// ExternalPluginService_ServiceDesc is the grpc.ServiceDesc for ExternalPluginService service. 
+// It's only intended for direct use with grpc.RegisterService, +// and not to be introspected or modified (even as a copy) +var ExternalPluginService_ServiceDesc = grpc.ServiceDesc{ + ServiceName: "service.ExternalPluginService", + HandlerType: (*ExternalPluginServiceServer)(nil), + Methods: []grpc.MethodDesc{ + { + MethodName: "GetResources", + Handler: _ExternalPluginService_GetResources_Handler, + }, + { + MethodName: "ListGroups", + Handler: _ExternalPluginService_ListGroups_Handler, + }, + { + MethodName: "GetGroup", + Handler: _ExternalPluginService_GetGroup_Handler, + }, + }, + Streams: []grpc.StreamDesc{}, + Metadata: "fluence/service-grpc/service.proto", +} diff --git a/src/fluence/service/service.go b/src/fluence/service/service.go new file mode 100644 index 0000000..ad61c1a --- /dev/null +++ b/src/fluence/service/service.go @@ -0,0 +1,61 @@ +package service + +import ( + "os" + + "github.com/flux-framework/flux-k8s/flux-plugin/fluence/defaults" + pb "github.com/flux-framework/flux-k8s/flux-plugin/fluence/service-grpc" + + "k8s.io/klog/v2" + + "context" +) + +type ExternalService struct { + pb.UnimplementedExternalPluginServiceServer +} + +// Init is a helper function for any startup stuff, for which now we have none :) +func (f *ExternalService) Init() { + klog.Infof("[Fluence] Created external service.") +} + +// GetGroup gets and returns the group info +// TODO no good way to look up group - we would need to ask Fluxion directly OR put the grpc +// service alongside the scheduler plugin, which seems like a bad design +func (s *ExternalService) GetGroup(ctx context.Context, in *pb.GroupRequest) (*pb.GroupResponse, error) { + klog.Infof("[Fluence] Calling get group endpoint! %v\n", in) + + // Prepare an empty match response (that can still be serialized) + emptyResponse := &pb.GroupResponse{} + return emptyResponse, nil +} + +// List group returns existing groups +func (s *ExternalService) ListGroups(ctx context.Context, in *pb.GroupRequest) (*pb.GroupResponse, error) { + + emptyResponse := &pb.GroupResponse{} + + // Prepare an empty match response (that can still be serialized) + klog.Infof("[Fluence] Calling list groups endpoint! %v\n", in) + + return emptyResponse, nil +} + +// GetResources gets the current Kubernetes Json Graph Format JGF +// This should be created on init of the scheduler +func (s *ExternalService) GetResources(ctx context.Context, in *pb.ResourceRequest) (*pb.ResourceResponse, error) { + + emptyResponse := &pb.ResourceResponse{} + + // Prepare an empty match response (that can still be serialized) + klog.Infof("[Fluence] Calling get resources endpoint! 
%v\n", in) + + jgf, err := os.ReadFile(defaults.KubernetesJsonGraphFormat) + if err != nil { + klog.Error("Error reading JGF") + return emptyResponse, err + } + emptyResponse.Graph = string(jgf) + return emptyResponse, nil +} diff --git a/src/fluence/utils/utils.go b/src/fluence/utils/utils.go index 2d6d932..490a0e0 100644 --- a/src/fluence/utils/utils.go +++ b/src/fluence/utils/utils.go @@ -4,7 +4,8 @@ import ( "context" "fmt" - // "strings" + klog "k8s.io/klog/v2" + "encoding/json" "github.com/flux-framework/flux-k8s/flux-plugin/fluence/jgf" @@ -17,140 +18,202 @@ import ( resourcehelper "k8s.io/kubectl/pkg/util/resource" ) -func CreateJGF(filename string, label *string) error { +var ( + controlPlaneLabel = "node-role.kubernetes.io/control-plane" +) + +// RegisterExisting uses the in cluster API to get existing pods +// This is actually the same as computeTotalRequests but I wanted to compare the two +// It is currently not being used. The main difference is that below, we are essentially +// rounding the cpu to the smaller unit (logically for the graph) but losing some +// granularity, if we think "milli" values have feet. +func RegisterExisting(clientset *kubernetes.Clientset, ctx context.Context) (map[string]PodSpec, error) { + + // We are using PodSpec as a holder for a *summary* of cpu/memory being used + // by the node, it is a summation across pods we find on each one + nodes := map[string]PodSpec{} + + // get pods in all the namespaces by omitting namespace + // Or specify namespace to get pods in particular namespace + pods, err := clientset.CoreV1().Pods("").List(ctx, metav1.ListOptions{}) + if err != nil { + klog.Infof("Error listing pods: %s\n", err) + return nodes, err + } + klog.Infof("Found %d existing pods in the cluster\n", len(pods.Items)) + + // Create a new PodSpec for each + for _, pod := range pods.Items { + + // Add the node to our lookup if we don't have it yet + _, ok := nodes[pod.Spec.NodeName] + if !ok { + nodes[pod.Spec.NodeName] = PodSpec{} + } + ps := nodes[pod.Spec.NodeName] + + for _, container := range pod.Spec.Containers { + specRequests := container.Resources.Requests + ps.Cpu += int32(specRequests.Cpu().Value()) + ps.Memory += specRequests.Memory().Value() + ps.Storage += specRequests.StorageEphemeral().Value() + + specLimits := container.Resources.Limits + gpuSpec := specLimits["nvidia.com/gpu"] + ps.Gpu += gpuSpec.Value() + } + nodes[pod.Spec.NodeName] = ps + } + return nodes, nil +} + +// CreateJGF creates the Json Graph Format +// We currently don't have support in fluxion to allocate jobs for existing pods, +// so instead we create the graph with fewer resources. 
When that support is
+// added (see sig-scheduler-plugins/pkg/fluence/register.go) we can
+// remove the adjustment here, which is more of a hack
+func CreateJGF(filename string, skipLabel *string) error {
 	ctx := context.Background()
 	config, err := rest.InClusterConfig()
 	if err != nil {
 		fmt.Println("Error getting InClusterConfig")
 		return err
 	}
-	// creates the clientset
 	clientset, err := kubernetes.NewForConfig(config)
 	if err != nil {
-		fmt.Println("Error getting ClientSet")
+		fmt.Printf("Error getting ClientSet: %s\n", err)
 		return err
 	}
 	nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
+	if err != nil {
+		fmt.Printf("Error listing nodes: %s\n", err)
+		return err
+	}
 
-	var fluxgraph jgf.Fluxjgf
-	fluxgraph = jgf.InitJGF()
-	// subnets := make(map[string]string)
+	// Create a Flux Json Graph Format (JGF) with all cluster nodes
+	fluxgraph := jgf.InitJGF()
+	// Top level of the graph is the cluster
+	// This assumes fluxion is only serving one cluster.
+	// Previous comments indicate that we choose between the level
+	// of a rack and a subnet. A rack doesn't make sense (the nodes could
+	// be on multiple racks) so subnet is likely the right abstraction
 	cluster := fluxgraph.MakeCluster("k8scluster")
-	// Rack needs to be disabled when using subnets
-	// rack := fluxgraph.MakeRack(0)
-
-	// fluxgraph.MakeEdge(cluster, rack, "contains")
-	// fluxgraph.MakeEdge(rack, cluster, "in")
-
 	vcores := 0
 	fmt.Println("Number nodes ", len(nodes.Items))
-	var totalAllocCpu, totalmem int64
+	var totalAllocCpu int64
 	totalAllocCpu = 0
-	sdnCount := 0
-	for node_index, node := range nodes.Items {
-		// _, worker := node.Labels["node-role.kubernetes.io/worker"]
-		if *label != "" {
-			_, fluxnode := node.Labels[*label]
-			if !fluxnode {
-				fmt.Println("Skipping node ", node.GetName())
+	sdnCount := int64(0)
+
+	for nodeIndex, node := range nodes.Items {
+
+		// We should not be scheduling to the control plane
+		_, ok := node.Labels[controlPlaneLabel]
+		if ok {
+			fmt.Println("Skipping control plane node ", node.GetName())
+			continue
+		}
+
+		// Anything labeled with the skipLabel (meaning the label is present)
+		// should be skipped
+		if *skipLabel != "" {
+			_, ok := node.Labels[*skipLabel]
+			if ok {
+				fmt.Printf("Skipping node %s\n", node.GetName())
 				continue
 			}
 		}
-		fmt.Println("node in flux group ", node.GetName())
-		if !node.Spec.Unschedulable {
-			fieldselector, err := fields.ParseSelector("spec.nodeName=" + node.GetName() + ",status.phase!=" + string(corev1.PodSucceeded) + ",status.phase!=" + string(corev1.PodFailed))
-			if err != nil {
-				return err
-			}
-			pods, err := clientset.CoreV1().Pods("").List(ctx, metav1.ListOptions{
-				FieldSelector: fieldselector.String(),
-			})
-			if err != nil {
-				return err
-			}
-			// fmt.Println("Node ", node.GetName(), " has pods ", pods)
-			// Check if subnet already exists
-			// Here we build subnets according to topology.kubernetes.io/zone label
-			subnetName := node.Labels["topology.kubernetes.io/zone"]
-			subnet := fluxgraph.MakeSubnet(sdnCount, subnetName)
-			sdnCount = sdnCount + 1
-			fluxgraph.MakeEdge(cluster, subnet, "contains")
-			fluxgraph.MakeEdge(subnet, cluster, "in")
-
-			reqs := computeTotalRequests(pods)
-			cpuReqs := reqs[corev1.ResourceCPU]
-			memReqs := reqs[corev1.ResourceMemory]
-
-			avail := node.Status.Allocatable.Cpu().MilliValue()
-			totalcpu := int64((avail - cpuReqs.MilliValue()) / 1000) //- 1
-			fmt.Println("Node ", node.GetName(), " flux cpu ", totalcpu)
-			totalAllocCpu = totalAllocCpu + totalcpu
-			totalmem = node.Status.Allocatable.Memory().Value() - memReqs.Value()
-			fmt.Println("Node ", node.GetName(), " total mem ", totalmem)
-			gpuAllocatable, hasGpuAllocatable := node.Status.Allocatable["nvidia.com/gpu"]
-
-			// reslist := node.Status.Allocatable
-			// resources := make([]corev1.ResourceName, 0, len(reslist))
-			// for resource := range reslist {
-			// 	fmt.Println("resource ", resource)
-			// 	resources = append(resources, resource)
-			// }
-			// for _, resource := range resources {
-			// 	value := reslist[resource]
-
-			// 	fmt.Printf(" %s:\t%s\n", resource, value.String())
-			// }
-
-			workernode := fluxgraph.MakeNode(node_index, false, node.Name)
-			fluxgraph.MakeEdge(subnet, workernode, "contains") // this is rack otherwise
-			fluxgraph.MakeEdge(workernode, subnet, "in") // this is rack otherwise
-
-			// socket := fluxgraph.MakeSocket(0, "socket")
-			// fluxgraph.MakeEdge(workernode, socket, "contains")
-			// fluxgraph.MakeEdge(socket, workernode, "in")
-
-			if hasGpuAllocatable {
-				fmt.Println("GPU Resource quantity ", gpuAllocatable.Value())
-				//MakeGPU(index int, name string, size int) string {
-				for index := 0; index < int(gpuAllocatable.Value()); index++ {
-					gpu := fluxgraph.MakeGPU(index, "nvidiagpu", 1)
-					fluxgraph.MakeEdge(workernode, gpu, "contains") // workernode was socket
-					fluxgraph.MakeEdge(gpu, workernode, "in")
-				}
+		if node.Spec.Unschedulable {
+			fmt.Printf("Skipping node %s, unschedulable\n", node.GetName())
+			continue
+		}
+
+		fieldselector, err := fields.ParseSelector("spec.nodeName=" + node.GetName() + ",status.phase!=" + string(corev1.PodSucceeded) + ",status.phase!=" + string(corev1.PodFailed))
+		if err != nil {
+			return err
+		}
+		pods, err := clientset.CoreV1().Pods("").List(ctx, metav1.ListOptions{
+			FieldSelector: fieldselector.String(),
+		})
+		if err != nil {
+			return err
+		}
+
+		// Here we build the subnet according to topology.kubernetes.io/zone label
+		subnetName := node.Labels["topology.kubernetes.io/zone"]
+		subnet := fluxgraph.MakeSubnet(sdnCount, subnetName)
+		sdnCount = sdnCount + 1
+		fluxgraph.MakeEdge(cluster, subnet, jgf.ContainsRelation)
+		fluxgraph.MakeEdge(subnet, cluster, jgf.InRelation)
+
+		// These are requests for existing pods, for cpu and memory
+		reqs := computeTotalRequests(pods)
+		cpuReqs := reqs[corev1.ResourceCPU]
+		memReqs := reqs[corev1.ResourceMemory]
+		// Actual values that we have available (minus requests)
+		totalCpu := node.Status.Allocatable.Cpu().MilliValue()
+		totalMem := node.Status.Allocatable.Memory().Value()
+
+		// Values accounting for requests
+		availCpu := int64((totalCpu - cpuReqs.MilliValue()) / 1000)
+		availMem := totalMem - memReqs.Value()
+
+		// Show existing usage to compare against
+		fmt.Printf("\n📦️ %s\n", node.GetName())
+		fmt.Printf("  allocated cpu: %d\n", cpuReqs.Value())
+		fmt.Printf("  allocated mem: %d\n", memReqs.Value())
+		fmt.Printf("  available cpu: %d\n", availCpu)
+		fmt.Printf("  running pods: %d\n", len(pods.Items))
+
+		// Keep track of the overall total
+		totalAllocCpu += availCpu
+		fmt.Printf("  available mem: %d\n", availMem)
+		gpuAllocatable, hasGpuAllocatable := node.Status.Allocatable["nvidia.com/gpu"]
+
+		// TODO possibly look at pod resources vs. node.Status.Allocatable
+
+		workernode := fluxgraph.MakeNode(nodeIndex, false, node.Name)
+		fluxgraph.MakeEdge(subnet, workernode, jgf.ContainsRelation)
+		fluxgraph.MakeEdge(workernode, subnet, jgf.InRelation)
+
+		if hasGpuAllocatable {
+			fmt.Println("GPU Resource quantity ", gpuAllocatable.Value())
+			for index := 0; index < int(gpuAllocatable.Value()); index++ {
+				gpu := fluxgraph.MakeGPU(int64(index), jgf.NvidiaGPU, 1)
+				fluxgraph.MakeEdge(workernode, gpu, jgf.ContainsRelation)
+				fluxgraph.MakeEdge(gpu, workernode, jgf.InRelation)
 			}
-			for index := 0; index < int(totalcpu); index++ {
-				// MakeCore(index int, name string)
-				core := fluxgraph.MakeCore(index, "core")
-				fluxgraph.MakeEdge(workernode, core, "contains") // workernode was socket
-				fluxgraph.MakeEdge(core, workernode, "in")
-				if vcores == 0 {
-					fluxgraph.MakeNFDProperties(core, index, "cpu-", &node.Labels)
-					// fluxgraph.MakeNFDProperties(core, index, "netmark-", &node.Labels)
-				} else {
-					for vc := 0; vc < vcores; vc++ {
-						vcore := fluxgraph.MakeVCore(core, vc, "vcore")
-						fluxgraph.MakeNFDProperties(vcore, index, "cpu-", &node.Labels)
-					}
+		}
+
+		for index := 0; index < int(availCpu); index++ {
+			core := fluxgraph.MakeCore(int64(index), jgf.CoreType)
+			fluxgraph.MakeEdge(workernode, core, jgf.ContainsRelation)
+			fluxgraph.MakeEdge(core, workernode, jgf.InRelation)
+
+			// Question from Vanessa:
+			// How can we get here and have vcores ever not equal to zero?
+			if vcores == 0 {
+				fluxgraph.MakeNFDProperties(core, int64(index), "cpu-", &node.Labels)
+			} else {
+				for virtualCore := 0; virtualCore < vcores; virtualCore++ {
+					vcore := fluxgraph.MakeVCore(core, int64(virtualCore), jgf.VirtualCoreType)
+					fluxgraph.MakeNFDProperties(vcore, int64(index), "cpu-", &node.Labels)
 				}
 			}
+		}
 
-			// MakeMemory(index int, name string, unit string, size int)
-			fractionmem := totalmem >> 30
-			// fractionmem := (totalmem/totalcpu) >> 20
-			// fmt.Println("Creating ", fractionmem, " vertices with ", 1<<10, " MB of mem")
-			for i := 0; i < /*int(totalcpu)*/ int(fractionmem); i++ {
-				mem := fluxgraph.MakeMemory(i, "memory", "MB", int(1<<10))
-				fluxgraph.MakeEdge(workernode, mem, "contains")
-				fluxgraph.MakeEdge(mem, workernode, "in")
-			}
+		fractionMem := availMem >> 30
+		for i := 0; i < int(fractionMem); i++ {
+			mem := fluxgraph.MakeMemory(int64(i), jgf.MemoryType, "MB", 1<<10)
+			fluxgraph.MakeEdge(workernode, mem, jgf.ContainsRelation)
+			fluxgraph.MakeEdge(mem, workernode, jgf.InRelation)
 		}
 	}
 
-	fmt.Println("Can request at most ", totalAllocCpu, " exclusive cpu")
+	fmt.Printf("\nCan request at most %d exclusive cpu\n", totalAllocCpu)
 	err = fluxgraph.WriteJGF(filename)
 	if err != nil {
 		return err
@@ -159,6 +222,7 @@ func CreateJGF(filename string, label *string) error {
 
 }
 
+// computeTotalRequests sums up the pod requests for the list. We do not consider limits.
 func computeTotalRequests(podList *corev1.PodList) (total map[corev1.ResourceName]resource.Quantity) {
 	total = map[corev1.ResourceName]resource.Quantity{}
 	for _, pod := range podList.Items {
@@ -171,14 +235,6 @@ func computeTotalRequests(podList *corev1.PodList) (total map[corev1.ResourceNam
 				total[podReqName] = v
 			}
 		}
-		// for podLimitName, podLimitValue := range podLimits {
-		// 	if v, ok := total[podLimitName]; !ok {
-		// 		total[podLimitName] = podLimitValue
-		// 	} else {
-		// 		v.Add(podLimitValue)
-		// 		total[podLimitName] = v
-		// 	}
-		// }
 	}
 	return
 }
@@ -190,50 +246,61 @@ type allocation struct {
 	CoreCount int
 }
 
-func ParseAllocResult(allocated string) []allocation {
+// ParseAllocResult takes an allocated (string) and parses it into a list of allocations.
+// We include the pod namespace/name for debugging later
+func ParseAllocResult(allocated, podName string) []allocation {
 	var dat map[string]interface{}
-	result := make([]allocation, 0)
+	result := []allocation{}
+
+	// Keep track of total core count across allocated
 	corecount := 0
+
+	// This should not happen - the string we get back should parse.
	if err := json.Unmarshal([]byte(allocated), &dat); err != nil {
 		panic(err)
 	}
-	// fmt.Println("PRINTING DATA:\n", dat)
-	// graph := dat["graph"]
-	// fmt.Println("GET GRAPH:\n ", graph)
-	nodes := dat["graph"].(interface{})
+
+	// Parse graph and nodes into interfaces
+	// TODO look at github.com/mitchellh/mapstructure
+	// that might make this easier
+	nodes := dat["graph"]
 	str1 := nodes.(map[string]interface{})
-	// fmt.Println("GET NODES:\n", str1["nodes"])
 	str2 := str1["nodes"].([]interface{})
-	// fmt.Println("NODES:\n", len(str2))
+
 	for _, item := range str2 {
-		// fmt.Println("ITEM: ", item)
 		str1 = item.(map[string]interface{})
 		metadata := str1["metadata"].(map[string]interface{})
-		// fmt.Println("TYPE: ", metadata["type"])
-		if metadata["type"].(string) == "core" {
+		if metadata["type"].(string) == jgf.CoreType {
 			corecount = corecount + 1
 		}
-		// fmt.Println("BASENAME: ", metadata["basename"])
-		if metadata["type"].(string) == "node" {
+		if metadata["type"].(string) == jgf.NodeType {
 			result = append(result, allocation{
 				Type:      metadata["type"].(string),
 				Name:      metadata["name"].(string),
 				Basename:  metadata["basename"].(string),
 				CoreCount: corecount,
 			})
+
+			// Reset the corecount once we've added to a node
 			corecount = 0
-			// result.Type = metadata["type"].(string)
-			// result.Name = metadata["name"].(string)
-			// result.Basename = metadata["basename"].(string)
-			// return result
 		}
 	}
-	fmt.Println("FINAL NODE RESULT:\n", result)
+	fmt.Printf("Final node result for %s\n", podName)
+	for i, alloc := range result {
+		fmt.Printf("Node %d: %s\n", i, alloc.Name)
+		fmt.Printf("  Type: %s\n  Name: %s\n  Basename: %s\n  CoreCount: %d\n",
+			alloc.Type, alloc.Name, alloc.Basename, alloc.CoreCount)
+
+	}
 	return result
 }
 
-// //// Utility functions
+// Utility functions
 func PrintOutput(reserved bool, allocated string, at int64, overhead float64, jobid uint64, fluxerr error) {
 	fmt.Println("\n\t----Match Allocate output---")
-	fmt.Printf("jobid: %d\nreserved: %t\nallocated: %s\nat: %d\noverhead: %f\nerror: %w\n", jobid, reserved, allocated, at, overhead, fluxerr)
+	fmt.Printf("jobid: %d\nreserved: %t\nallocated: %s\nat: %d\noverhead: %f\n", jobid, reserved, allocated, at, overhead)
+
+	// Only print error if we had one
+	if fluxerr != nil {
+		fmt.Printf("error: %s\n", fluxerr)
+	}
 }