From 7f5cfac95de9d4fcc270d42bfd9d366dc6b2a92a Mon Sep 17 00:00:00 2001 From: Adem Baccara <71262172+Adembc@users.noreply.github.com> Date: Mon, 9 Sep 2024 10:38:17 +0100 Subject: [PATCH 01/13] feat(ws): implement culling controller Signed-off-by: Adem Baccara <71262172+Adembc@users.noreply.github.com> --- workspaces/controller/Makefile | 5 +- .../controller/api/v1beta1/workspace_types.go | 6 + .../api/v1beta1/workspacekind_types.go | 7 + .../api/v1beta1/zz_generated.deepcopy.go | 10 + workspaces/controller/cmd/main.go | 8 + .../bases/kubeflow.org_workspacekinds.yaml | 8 + .../crd/bases/kubeflow.org_workspaces.yaml | 6 + .../samples/jupyterlab_v1beta1_workspace.yaml | 2 + .../jupyterlab_v1beta1_workspacekind.yaml | 5 + .../internal/controller/culling_controller.go | 238 ++++++++++++++++++ .../controller/culling_controller_test.go | 1 + 11 files changed, 295 insertions(+), 1 deletion(-) create mode 100644 workspaces/controller/internal/controller/culling_controller.go create mode 100644 workspaces/controller/internal/controller/culling_controller_test.go diff --git a/workspaces/controller/Makefile b/workspaces/controller/Makefile index 6032c8c91..85f49ed73 100644 --- a/workspaces/controller/Makefile +++ b/workspaces/controller/Makefile @@ -1,5 +1,5 @@ # Image URL to use all building/pushing image targets -IMG ?= controller:latest +IMG ?= ghcr.io/kubeflow/notebooks/workspace-controller # ENVTEST_K8S_VERSION refers to the version of kubebuilder assets to be downloaded by envtest binary. ENVTEST_K8S_VERSION = 1.31.0 @@ -97,6 +97,9 @@ build: manifests generate fmt vet ## Build manager binary. run: manifests generate fmt vet ## Run a controller from your host. go run ./cmd/main.go +kind-load: + kind load docker-image ${IMG} -n kind + # If you wish to build the manager image targeting other platforms you can use the --platform flag. # (i.e. docker build --platform linux/arm64). However, you must enable docker buildKit for it. # More info: https://docs.docker.com/develop/develop-images/build_enhancements/ diff --git a/workspaces/controller/api/v1beta1/workspace_types.go b/workspaces/controller/api/v1beta1/workspace_types.go index 03e8a66a1..d6574e2ea 100644 --- a/workspaces/controller/api/v1beta1/workspace_types.go +++ b/workspaces/controller/api/v1beta1/workspace_types.go @@ -36,6 +36,12 @@ type WorkspaceSpec struct { // +kubebuilder:default=false Paused *bool `json:"paused,omitempty"` + // DisableCulling controls whether automatic culling is disabled for the workspace. + // If true, the workspace will not be culled + //+kubebuilder:validation:Optional + //+kubebuilder:default=false + DisableCulling *bool `json:"disableCulling,omitempty"` + // if true, pending updates are NOT applied when the Workspace is paused // if false, pending updates are applied when the Workspace is paused // +kubebuilder:validation:Optional diff --git a/workspaces/controller/api/v1beta1/workspacekind_types.go b/workspaces/controller/api/v1beta1/workspacekind_types.go index 2d846237d..403b22e85 100644 --- a/workspaces/controller/api/v1beta1/workspacekind_types.go +++ b/workspaces/controller/api/v1beta1/workspacekind_types.go @@ -184,6 +184,11 @@ type WorkspaceKindCullingConfig struct { // +kubebuilder:default=86400 MaxInactiveSeconds *int32 `json:"maxInactiveSeconds,omitempty"` + //+kubebuilder:validation:Optional + //+kubebuilder:validation:Minimum:=60 + //+kubebuilder:default=300 + MinimumProbeIntervalSeconds *int32 `json:"minimumProbeInterval,omitempty"` + // the probe used to determine if the Workspace is active ActivityProbe ActivityProbe `json:"activityProbe"` } @@ -216,6 +221,8 @@ type ActivityProbeJupyter struct { // if the Jupyter-specific probe is enabled // +kubebuilder:example=true LastActivity bool `json:"lastActivity"` + + PortId string `json:"portId"` } type WorkspaceKindProbes struct { diff --git a/workspaces/controller/api/v1beta1/zz_generated.deepcopy.go b/workspaces/controller/api/v1beta1/zz_generated.deepcopy.go index 1beab4fde..02ec6c27a 100644 --- a/workspaces/controller/api/v1beta1/zz_generated.deepcopy.go +++ b/workspaces/controller/api/v1beta1/zz_generated.deepcopy.go @@ -565,6 +565,11 @@ func (in *WorkspaceKindCullingConfig) DeepCopyInto(out *WorkspaceKindCullingConf *out = new(int32) **out = **in } + if in.MinimumProbeIntervalSeconds != nil { + in, out := &in.MinimumProbeIntervalSeconds, &out.MinimumProbeIntervalSeconds + *out = new(int32) + **out = **in + } in.ActivityProbe.DeepCopyInto(&out.ActivityProbe) } @@ -1060,6 +1065,11 @@ func (in *WorkspaceSpec) DeepCopyInto(out *WorkspaceSpec) { *out = new(bool) **out = **in } + if in.DisableCulling != nil { + in, out := &in.DisableCulling, &out.DisableCulling + *out = new(bool) + **out = **in + } if in.DeferUpdates != nil { in, out := &in.DeferUpdates, &out.DeferUpdates *out = new(bool) diff --git a/workspaces/controller/cmd/main.go b/workspaces/controller/cmd/main.go index ae09c2ed3..9720e8ac6 100644 --- a/workspaces/controller/cmd/main.go +++ b/workspaces/controller/cmd/main.go @@ -148,6 +148,14 @@ func main() { os.Exit(1) } // +kubebuilder:scaffold:builder + if err = (&controllerInternal.CullingReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "Culler") + os.Exit(1) + } + //+kubebuilder:scaffold:builder if os.Getenv("ENABLE_WEBHOOKS") != "false" { if err = (&webhookInternal.WorkspaceValidator{ diff --git a/workspaces/controller/config/crd/bases/kubeflow.org_workspacekinds.yaml b/workspaces/controller/config/crd/bases/kubeflow.org_workspacekinds.yaml index 9fea44636..a5ff53206 100644 --- a/workspaces/controller/config/crd/bases/kubeflow.org_workspacekinds.yaml +++ b/workspaces/controller/config/crd/bases/kubeflow.org_workspacekinds.yaml @@ -285,8 +285,11 @@ spec: description: if the Jupyter-specific probe is enabled example: true type: boolean + portId: + type: string required: - lastActivity + - portId type: object x-kubernetes-validations: - message: '''lastActivity'' must be true' @@ -307,6 +310,11 @@ spec: format: int32 minimum: 60 type: integer + minimumProbeInterval: + default: 300 + format: int32 + minimum: 60 + type: integer required: - activityProbe type: object diff --git a/workspaces/controller/config/crd/bases/kubeflow.org_workspaces.yaml b/workspaces/controller/config/crd/bases/kubeflow.org_workspaces.yaml index c66f269dd..7d76d0e4c 100644 --- a/workspaces/controller/config/crd/bases/kubeflow.org_workspaces.yaml +++ b/workspaces/controller/config/crd/bases/kubeflow.org_workspaces.yaml @@ -50,6 +50,12 @@ spec: if true, pending updates are NOT applied when the Workspace is paused if false, pending updates are applied when the Workspace is paused type: boolean + disableCulling: + default: false + description: |- + DisableCulling controls whether automatic culling is disabled for the workspace. + If true, the workspace will not be culled + type: boolean kind: description: the WorkspaceKind to use example: jupyterlab diff --git a/workspaces/controller/config/samples/jupyterlab_v1beta1_workspace.yaml b/workspaces/controller/config/samples/jupyterlab_v1beta1_workspace.yaml index 1c8e076d4..f75d3fb7c 100644 --- a/workspaces/controller/config/samples/jupyterlab_v1beta1_workspace.yaml +++ b/workspaces/controller/config/samples/jupyterlab_v1beta1_workspace.yaml @@ -10,6 +10,8 @@ spec: ## if false, pending updates are applied when the Workspace is paused deferUpdates: false + disableCulling: false + ## the WorkspaceKind to use kind: "jupyterlab" diff --git a/workspaces/controller/config/samples/jupyterlab_v1beta1_workspacekind.yaml b/workspaces/controller/config/samples/jupyterlab_v1beta1_workspacekind.yaml index 71ed533b4..a8084c2bf 100644 --- a/workspaces/controller/config/samples/jupyterlab_v1beta1_workspacekind.yaml +++ b/workspaces/controller/config/samples/jupyterlab_v1beta1_workspacekind.yaml @@ -83,6 +83,10 @@ spec: ## maxInactiveSeconds: 86400 + ## the minimum number of seconds between probes + ## + minimumProbeInterval: 60 + ## the probe used to determine if the Workspace is active ## activityProbe: @@ -105,6 +109,7 @@ spec: ## jupyter: lastActivity: true + portId: jupyterlab ## standard probes to determine Container health (MUTABLE) ## - spec for Probe: diff --git a/workspaces/controller/internal/controller/culling_controller.go b/workspaces/controller/internal/controller/culling_controller.go new file mode 100644 index 000000000..f56dad424 --- /dev/null +++ b/workspaces/controller/internal/controller/culling_controller.go @@ -0,0 +1,238 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controller + +import ( + "context" + "encoding/json" + "errors" + "fmt" + kubefloworgv1beta1 "github.com/kubeflow/notebooks/workspaces/controller/api/v1beta1" + "github.com/kubeflow/notebooks/workspaces/controller/internal/helper" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/fields" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/utils/ptr" + "net/http" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" + "strings" + "time" +) + +const ( + defaultClusterDomain = "cluster.local" + cullingBufferSeconds = 5 +) + +// CullingReconciler reconciles a Workspace object +type CullingReconciler struct { + client.Client + Scheme *runtime.Scheme +} + +func (r *CullingReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { // nolint:gocyclo + log := log.FromContext(ctx) + log.V(2).Info("reconciling Workspace for culling") + + // fetch the Workspace + workspace := &kubefloworgv1beta1.Workspace{} + if err := r.Get(ctx, req.NamespacedName, workspace); err != nil { + if client.IgnoreNotFound(err) == nil { + // Request object not found, could have been deleted after reconcile request. + // Owned objects are automatically garbage collected. + // For additional cleanup logic use finalizers. + // Return and don't requeue. + return ctrl.Result{}, nil + } + log.Error(err, "unable to fetch Workspace") + return ctrl.Result{}, err + } + if !workspace.GetDeletionTimestamp().IsZero() { + log.V(2).Info("Workspace is being deleted, skipping culling") + return ctrl.Result{}, nil + } + + if !*workspace.Spec.DisableCulling { + log.Info("Culling is disabled for this workspace") + return ctrl.Result{}, nil + } + + // check if the workspace is running + if workspace.Status.State != kubefloworgv1beta1.WorkspaceStateRunning { + log.V(2).Info("Workspace is not running, skipping culling") + return ctrl.Result{}, nil + } + + workspaceKindName := workspace.Spec.Kind + log = log.WithValues("workspaceKind", workspaceKindName) + workspaceKind := &kubefloworgv1beta1.WorkspaceKind{} + if err := r.Get(ctx, client.ObjectKey{Name: workspaceKindName}, workspaceKind); err != nil { + if apierrors.IsNotFound(err) { + log.V(0).Info("Workspace references unknown WorkspaceKind") + return ctrl.Result{}, err + } + log.Error(err, "unable to fetch WorkspaceKind for Workspace") + return ctrl.Result{}, err + } + + if !*workspaceKind.Spec.PodTemplate.Culling.Enabled { + log.Info("culling is disabled for this workspace kind") + return ctrl.Result{}, nil + } + + // Convert last activity and update times from Unix to time.Time + lastActivityTime := time.Unix(workspace.Status.Activity.LastActivity, 0) + lastUpdateTime := time.Unix(workspace.Status.Activity.LastUpdate, 0) + + // Fetch the culling configuration from the WorkspaceKind spec + maxInactiveSeconds := *workspaceKind.Spec.PodTemplate.Culling.MaxInactiveSeconds + minProbeIntervalSeconds := *workspaceKind.Spec.PodTemplate.Culling.MinimumProbeIntervalSeconds + + // Set requeue duration based on the minimum probe interval + requeueDuration := time.Duration(minProbeIntervalSeconds) * time.Second + + // Calculate time since the last activity and the last update + timeSinceLastActivity := time.Since(lastActivityTime).Seconds() + timeSinceLastUpdate := time.Since(lastUpdateTime).Seconds() + + // If the workspace has been active recently, requeue for the next probe + if timeSinceLastActivity < float64(maxInactiveSeconds) { + log.V(2).Info("Workspace activity is within the allowed period, requeueing for the next probe.", + "MaxInactiveSeconds", maxInactiveSeconds, + "TimeSinceLastActivity", timeSinceLastActivity) + return ctrl.Result{RequeueAfter: requeueDuration}, nil + } + // If the workspace was updated recently, requeue for the next probe + if timeSinceLastUpdate < float64(minProbeIntervalSeconds) { + log.V(2).Info("Workspace has been updated recently, requeueing for the next probe.", + "MinProbeIntervalSeconds", minProbeIntervalSeconds, + "TimeSinceLastUpdate", timeSinceLastUpdate) + return ctrl.Result{RequeueAfter: requeueDuration}, nil + } + + // Check if JupyterLab API probing is enabled + if workspaceKind.Spec.PodTemplate.Culling.ActivityProbe.Jupyter != nil { + // This is hardcoded for now, but should be fetched from the workspace's service + serviceName, err := r.getServiceName(ctx, workspace) + if err != nil { + log.Error(err, "Error fetching service name for workspace") + return ctrl.Result{}, err + } + port := "8888" + jupyterAPIEndpoint := fmt.Sprintf("http://%s.%s.svc.%s:%s/workspace/%s/%s/jupyterlab/api/status", serviceName, workspace.Namespace, defaultClusterDomain, port, workspace.Namespace, workspace.Name) + probeStartTime := time.Now() + + lastActivity, err := fetchLastActivityFromJupyterAPI(jupyterAPIEndpoint) + if err != nil { + log.Error(err, "Error fetching last activity from JupyterLab API") + return ctrl.Result{}, err + } + + workspace.Status.Activity.LastUpdate = probeStartTime.Unix() + workspace.Status.Activity.LastActivity = lastActivity.Unix() + if err := r.Status().Update(ctx, workspace); err != nil { + log.Error(err, "Failed to update workspace status after probe", "Workspace", workspace.Name) + return ctrl.Result{}, err + } + // If the workspace has been inactive for too long, initiate culling + if time.Since(lastActivity).Seconds() > float64(maxInactiveSeconds+cullingBufferSeconds) { + log.Info("Culling the workspace due to inactivity", "TimeSinceLastActivity", time.Since(lastActivity).Seconds()) + workspace.Spec.Paused = ptr.To(true) + err := r.Update(ctx, workspace) + if err != nil { + log.Error(err, "Error updating workspace during culling") + return ctrl.Result{}, err + } + } + log.V(2).Info("requeueing for next probe") + return ctrl.Result{RequeueAfter: requeueDuration}, nil + } + //TODO: Implement Bash Probe + + log.Info("culling controller finished") + return ctrl.Result{RequeueAfter: requeueDuration}, nil +} + +// SetupWithManager sets up the controller with the Manager. +func (r *CullingReconciler) SetupWithManager(mgr ctrl.Manager) error { + + return ctrl.NewControllerManagedBy(mgr). + For(&kubefloworgv1beta1.Workspace{}). + Complete(r) +} + +// fetchLastActivityFromJupyterAPI queries the JupyterLab API for the last activity time. +func fetchLastActivityFromJupyterAPI(apiEndpoint string) (time.Time, error) { + resp, err := http.Get(apiEndpoint) + if err != nil { + return time.Time{}, fmt.Errorf("failed to reach JupyterLab API: %w", err) + } + defer resp.Body.Close() + + // Check if the API returned a 200-OK status + if resp.StatusCode != http.StatusOK { + return time.Time{}, fmt.Errorf("JupyterLab API returned non-200 status: %d", resp.StatusCode) + } + + // Decode the API response to extract the last activity time + var status struct { + LastActivity string `json:"last_activity"` + } + if err := json.NewDecoder(resp.Body).Decode(&status); err != nil { + return time.Time{}, fmt.Errorf("failed to parse JupyterLab API response: %w", err) + } + + // Parse the last activity time from the response + lastActivity, err := time.Parse(time.RFC3339, status.LastActivity) + if err != nil { + return time.Time{}, fmt.Errorf("failed to parse last activity time: %w", err) + } + + return lastActivity, nil +} + +func (r *CullingReconciler) getServiceName(ctx context.Context, workspace *kubefloworgv1beta1.Workspace) (string, error) { + ownedServices := &corev1.ServiceList{} + listOpts := &client.ListOptions{ + FieldSelector: fields.OneTermEqualSelector(helper.IndexWorkspaceOwnerField, workspace.Name), + Namespace: workspace.Namespace, + } + + // List services owned by the workspace + if err := r.List(ctx, ownedServices, listOpts); err != nil { + return "", err + } + + // Check the number of owned services + if len(ownedServices.Items) > 1 { + serviceList := make([]string, len(ownedServices.Items)) + for i, svc := range ownedServices.Items { + serviceList[i] = svc.Name + } + serviceListString := strings.Join(serviceList, ", ") + return "", fmt.Errorf("workspace owns multiple Services: %s", serviceListString) + + } else if len(ownedServices.Items) == 0 { + return "", errors.New("workspace does not own any Service") + } + + // Return the single found service name + return ownedServices.Items[0].Name, nil +} diff --git a/workspaces/controller/internal/controller/culling_controller_test.go b/workspaces/controller/internal/controller/culling_controller_test.go new file mode 100644 index 000000000..b0b429f89 --- /dev/null +++ b/workspaces/controller/internal/controller/culling_controller_test.go @@ -0,0 +1 @@ +package controller From a96e96ea56ed1cef6d1edae8dc1b2f4367ce86e3 Mon Sep 17 00:00:00 2001 From: Adem Baccara <71262172+Adembc@users.noreply.github.com> Date: Tue, 10 Sep 2024 21:37:08 +0100 Subject: [PATCH 02/13] update workspace kind crd Signed-off-by: Adem Baccara <71262172+Adembc@users.noreply.github.com> --- .../api/v1beta1/workspacekind_types.go | 43 +++++++++++++- .../api/v1beta1/zz_generated.deepcopy.go | 45 ++++++++++++++- .../bases/kubeflow.org_workspacekinds.yaml | 57 ++++++++++++++++++- .../jupyterlab_v1beta1_workspacekind.yaml | 6 +- .../internal/controller/culling_controller.go | 8 +-- 5 files changed, 150 insertions(+), 9 deletions(-) diff --git a/workspaces/controller/api/v1beta1/workspacekind_types.go b/workspaces/controller/api/v1beta1/workspacekind_types.go index 403b22e85..3ef033091 100644 --- a/workspaces/controller/api/v1beta1/workspacekind_types.go +++ b/workspaces/controller/api/v1beta1/workspacekind_types.go @@ -184,10 +184,17 @@ type WorkspaceKindCullingConfig struct { // +kubebuilder:default=86400 MaxInactiveSeconds *int32 `json:"maxInactiveSeconds,omitempty"` + // the maximum number of seconds between probes //+kubebuilder:validation:Optional //+kubebuilder:validation:Minimum:=60 //+kubebuilder:default=300 - MinimumProbeIntervalSeconds *int32 `json:"minimumProbeInterval,omitempty"` + MaxProbeIntervalSeconds *int32 `json:"maxProbeIntervalSeconds,omitempty"` + + // the minimum number of seconds between probes to avoid spamming in case on failure + //+kubebuilder:validation:Optional + //+kubebuilder:validation:Minimum:=10 + //+kubebuilder:default=20 + MinProbeIntervalSeconds *int32 `json:"minProbeIntervalSeconds,omitempty"` // the probe used to determine if the Workspace is active ActivityProbe ActivityProbe `json:"activityProbe"` @@ -222,6 +229,7 @@ type ActivityProbeJupyter struct { // +kubebuilder:example=true LastActivity bool `json:"lastActivity"` + // The ID of the port used for probing Jupyter via HTTP requests. PortId string `json:"portId"` } @@ -517,6 +525,9 @@ type WorkspaceKindStatus struct { // metrics for podTemplate options PodTemplateOptions PodTemplateOptionsMetrics `json:"podTemplateOptions"` + + // Information about the last activity probe + Activity *WorkspaceActivityStatus `json:"activity,omitempty"` } type PodTemplateOptionsMetrics struct { @@ -543,6 +554,36 @@ type OptionMetric struct { Workspaces int32 `json:"workspaces"` } +type WorkspaceActivityStatus struct { + + // Information about the last activity probe + LastProbe ProbeStatus `json:"lastProbe"` +} + +type ProbeStatus struct { + + // the time the probe was started (UNIX epoch in milliseconds) + //+kubebuilder:validation:Minimum=0 + //+kubebuilder:example=1710435303000 + StartTimeMs int64 `json:"startTimeMs"` + + // the time the probe was completed (UNIX epoch in milliseconds) + //+kubebuilder:validation:Minimum=0 + //+kubebuilder:example=1710435305000 + EndTimeMs int64 `json:"endTimeMs"` + + // the result of the probe + // ENUM: "Success" | "Failure" | "Timeout" + //+kubebuilder:validation:Enum=Success;Failure;Timeout + Result string `json:"result"` + + // a human-readable message about the probe result + // WARNING: this field is NOT FOR MACHINE USE, subject to change without notice + //+kubebuilder:default="" + //+kubebuilder:example="Jupyter probe succeeded" + Message string `json:"message"` +} + /* =============================================================================== WorkspaceKind diff --git a/workspaces/controller/api/v1beta1/zz_generated.deepcopy.go b/workspaces/controller/api/v1beta1/zz_generated.deepcopy.go index 02ec6c27a..f132f1bd7 100644 --- a/workspaces/controller/api/v1beta1/zz_generated.deepcopy.go +++ b/workspaces/controller/api/v1beta1/zz_generated.deepcopy.go @@ -453,6 +453,21 @@ func (in *PodVolumeMount) DeepCopy() *PodVolumeMount { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ProbeStatus) DeepCopyInto(out *ProbeStatus) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ProbeStatus. +func (in *ProbeStatus) DeepCopy() *ProbeStatus { + if in == nil { + return nil + } + out := new(ProbeStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *RedirectMessage) DeepCopyInto(out *RedirectMessage) { *out = *in @@ -510,6 +525,22 @@ func (in *WorkspaceActivity) DeepCopy() *WorkspaceActivity { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *WorkspaceActivityStatus) DeepCopyInto(out *WorkspaceActivityStatus) { + *out = *in + out.LastProbe = in.LastProbe +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WorkspaceActivityStatus. +func (in *WorkspaceActivityStatus) DeepCopy() *WorkspaceActivityStatus { + if in == nil { + return nil + } + out := new(WorkspaceActivityStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *WorkspaceKind) DeepCopyInto(out *WorkspaceKind) { *out = *in @@ -565,8 +596,13 @@ func (in *WorkspaceKindCullingConfig) DeepCopyInto(out *WorkspaceKindCullingConf *out = new(int32) **out = **in } - if in.MinimumProbeIntervalSeconds != nil { - in, out := &in.MinimumProbeIntervalSeconds, &out.MinimumProbeIntervalSeconds + if in.MaxProbeIntervalSeconds != nil { + in, out := &in.MaxProbeIntervalSeconds, &out.MaxProbeIntervalSeconds + *out = new(int32) + **out = **in + } + if in.MinProbeIntervalSeconds != nil { + in, out := &in.MinProbeIntervalSeconds, &out.MinProbeIntervalSeconds *out = new(int32) **out = **in } @@ -853,6 +889,11 @@ func (in *WorkspaceKindSpec) DeepCopy() *WorkspaceKindSpec { func (in *WorkspaceKindStatus) DeepCopyInto(out *WorkspaceKindStatus) { *out = *in in.PodTemplateOptions.DeepCopyInto(&out.PodTemplateOptions) + if in.Activity != nil { + in, out := &in.Activity, &out.Activity + *out = new(WorkspaceActivityStatus) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WorkspaceKindStatus. diff --git a/workspaces/controller/config/crd/bases/kubeflow.org_workspacekinds.yaml b/workspaces/controller/config/crd/bases/kubeflow.org_workspacekinds.yaml index a5ff53206..41455582f 100644 --- a/workspaces/controller/config/crd/bases/kubeflow.org_workspacekinds.yaml +++ b/workspaces/controller/config/crd/bases/kubeflow.org_workspacekinds.yaml @@ -286,6 +286,8 @@ spec: example: true type: boolean portId: + description: The ID of the port used for probing Jupyter + via HTTP requests. type: string required: - lastActivity @@ -310,11 +312,19 @@ spec: format: int32 minimum: 60 type: integer - minimumProbeInterval: + maxProbeIntervalSeconds: default: 300 + description: the maximum number of seconds between probes format: int32 minimum: 60 type: integer + minProbeIntervalSeconds: + default: 20 + description: the minimum number of seconds between probes + to avoid spamming in case on failure + format: int32 + minimum: 10 + type: integer required: - activityProbe type: object @@ -4541,6 +4551,51 @@ spec: status: description: WorkspaceKindStatus defines the observed state of WorkspaceKind properties: + activity: + description: Information about the last activity probe + properties: + lastProbe: + description: Information about the last activity probe + properties: + endTimeMs: + description: the time the probe was completed (UNIX epoch + in milliseconds) + example: 1710435305000 + format: int64 + minimum: 0 + type: integer + message: + default: "" + description: |- + a human-readable message about the probe result + WARNING: this field is NOT FOR MACHINE USE, subject to change without notice + example: Jupyter probe succeeded + type: string + result: + description: |- + the result of the probe + ENUM: "Success" | "Failure" | "Timeout" + enum: + - Success + - Failure + - Timeout + type: string + startTimeMs: + description: the time the probe was started (UNIX epoch in + milliseconds) + example: 1710435303000 + format: int64 + minimum: 0 + type: integer + required: + - endTimeMs + - message + - result + - startTimeMs + type: object + required: + - lastProbe + type: object podTemplateOptions: description: metrics for podTemplate options properties: diff --git a/workspaces/controller/config/samples/jupyterlab_v1beta1_workspacekind.yaml b/workspaces/controller/config/samples/jupyterlab_v1beta1_workspacekind.yaml index a8084c2bf..1450ac2f4 100644 --- a/workspaces/controller/config/samples/jupyterlab_v1beta1_workspacekind.yaml +++ b/workspaces/controller/config/samples/jupyterlab_v1beta1_workspacekind.yaml @@ -83,9 +83,13 @@ spec: ## maxInactiveSeconds: 86400 + ## the maximum number of seconds between probes + ## + maxProbeIntervalSeconds: 60 + ## the minimum number of seconds between probes ## - minimumProbeInterval: 60 + minProbeIntervalSeconds: 20 ## the probe used to determine if the Workspace is active ## diff --git a/workspaces/controller/internal/controller/culling_controller.go b/workspaces/controller/internal/controller/culling_controller.go index f56dad424..52acc7a83 100644 --- a/workspaces/controller/internal/controller/culling_controller.go +++ b/workspaces/controller/internal/controller/culling_controller.go @@ -103,10 +103,10 @@ func (r *CullingReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct // Fetch the culling configuration from the WorkspaceKind spec maxInactiveSeconds := *workspaceKind.Spec.PodTemplate.Culling.MaxInactiveSeconds - minProbeIntervalSeconds := *workspaceKind.Spec.PodTemplate.Culling.MinimumProbeIntervalSeconds + maxProbeIntervalSeconds := *workspaceKind.Spec.PodTemplate.Culling.MaxProbeIntervalSeconds // Set requeue duration based on the minimum probe interval - requeueDuration := time.Duration(minProbeIntervalSeconds) * time.Second + requeueDuration := time.Duration(maxProbeIntervalSeconds) * time.Second // Calculate time since the last activity and the last update timeSinceLastActivity := time.Since(lastActivityTime).Seconds() @@ -120,9 +120,9 @@ func (r *CullingReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct return ctrl.Result{RequeueAfter: requeueDuration}, nil } // If the workspace was updated recently, requeue for the next probe - if timeSinceLastUpdate < float64(minProbeIntervalSeconds) { + if timeSinceLastUpdate < float64(maxProbeIntervalSeconds) { log.V(2).Info("Workspace has been updated recently, requeueing for the next probe.", - "MinProbeIntervalSeconds", minProbeIntervalSeconds, + "MinProbeIntervalSeconds", maxProbeIntervalSeconds, "TimeSinceLastUpdate", timeSinceLastUpdate) return ctrl.Result{RequeueAfter: requeueDuration}, nil } From 31ef8471399a470d001050d91043222b5cd5490c Mon Sep 17 00:00:00 2001 From: Adem Baccara <71262172+Adembc@users.noreply.github.com> Date: Mon, 16 Sep 2024 12:46:09 +0100 Subject: [PATCH 03/13] wip Signed-off-by: Adem Baccara <71262172+Adembc@users.noreply.github.com> --- .../controller/api/v1beta1/workspace_types.go | 37 ++++ .../api/v1beta1/workspacekind_types.go | 33 --- .../api/v1beta1/zz_generated.deepcopy.go | 22 +- .../bases/kubeflow.org_workspacekinds.yaml | 45 ----- .../crd/bases/kubeflow.org_workspaces.yaml | 42 ++++ workspaces/controller/config/rbac/role.yaml | 6 + .../internal/controller/culling_controller.go | 188 +++++++++++++++--- .../controller/workspace_controller.go | 1 + 8 files changed, 251 insertions(+), 123 deletions(-) diff --git a/workspaces/controller/api/v1beta1/workspace_types.go b/workspaces/controller/api/v1beta1/workspace_types.go index d6574e2ea..cf4618dd4 100644 --- a/workspaces/controller/api/v1beta1/workspace_types.go +++ b/workspaces/controller/api/v1beta1/workspace_types.go @@ -193,6 +193,9 @@ type WorkspaceActivity struct { // +kubebuilder:default=0 // +kubebuilder:example=1704067200 LastUpdate int64 `json:"lastUpdate"` + + // Information about the last activity probe + LastProbe ProbeStatus `json:"lastProbe"` } type WorkspacePodOptionsStatus struct { @@ -227,6 +230,30 @@ type WorkspacePodOptionRedirectStep struct { Target string `json:"target"` } +type ProbeStatus struct { + + // the time the probe was started (UNIX epoch in milliseconds) + //+kubebuilder:validation:Minimum=0 + //+kubebuilder:example=1710435303000 + StartTimeMs int64 `json:"startTimeMs"` + + // the time the probe was completed (UNIX epoch in milliseconds) + //+kubebuilder:validation:Minimum=0 + //+kubebuilder:example=1710435305000 + EndTimeMs int64 `json:"endTimeMs"` + + // the result of the probe + // ENUM: "Success" | "Failure" | "Timeout" + //+kubebuilder:default="Unknown" + Result ProbeResult `json:"result"` + + // a human-readable message about the probe result + // WARNING: this field is NOT FOR MACHINE USE, subject to change without notice + //+kubebuilder:default="" + //+kubebuilder:example="Jupyter probe succeeded" + Message string `json:"message"` +} + // +kubebuilder:validation:Enum:={"Running","Terminating","Paused","Pending","Error","Unknown"} type WorkspaceState string @@ -239,6 +266,16 @@ const ( WorkspaceStateUnknown WorkspaceState = "Unknown" ) +// +kubebuilder:validation:Enum={"Success","Failure","Timeout","Unknown"} +type ProbeResult string + +const ( + ProbeResultSuccess ProbeResult = "Success" + ProbeResultFailure ProbeResult = "Failure" + ProbeResultTimeout ProbeResult = "Timeout" + ProbeResultUnknown ProbeResult = "Unknown" +) + /* =============================================================================== Workspace diff --git a/workspaces/controller/api/v1beta1/workspacekind_types.go b/workspaces/controller/api/v1beta1/workspacekind_types.go index 3ef033091..449be5cf0 100644 --- a/workspaces/controller/api/v1beta1/workspacekind_types.go +++ b/workspaces/controller/api/v1beta1/workspacekind_types.go @@ -525,9 +525,6 @@ type WorkspaceKindStatus struct { // metrics for podTemplate options PodTemplateOptions PodTemplateOptionsMetrics `json:"podTemplateOptions"` - - // Information about the last activity probe - Activity *WorkspaceActivityStatus `json:"activity,omitempty"` } type PodTemplateOptionsMetrics struct { @@ -554,36 +551,6 @@ type OptionMetric struct { Workspaces int32 `json:"workspaces"` } -type WorkspaceActivityStatus struct { - - // Information about the last activity probe - LastProbe ProbeStatus `json:"lastProbe"` -} - -type ProbeStatus struct { - - // the time the probe was started (UNIX epoch in milliseconds) - //+kubebuilder:validation:Minimum=0 - //+kubebuilder:example=1710435303000 - StartTimeMs int64 `json:"startTimeMs"` - - // the time the probe was completed (UNIX epoch in milliseconds) - //+kubebuilder:validation:Minimum=0 - //+kubebuilder:example=1710435305000 - EndTimeMs int64 `json:"endTimeMs"` - - // the result of the probe - // ENUM: "Success" | "Failure" | "Timeout" - //+kubebuilder:validation:Enum=Success;Failure;Timeout - Result string `json:"result"` - - // a human-readable message about the probe result - // WARNING: this field is NOT FOR MACHINE USE, subject to change without notice - //+kubebuilder:default="" - //+kubebuilder:example="Jupyter probe succeeded" - Message string `json:"message"` -} - /* =============================================================================== WorkspaceKind diff --git a/workspaces/controller/api/v1beta1/zz_generated.deepcopy.go b/workspaces/controller/api/v1beta1/zz_generated.deepcopy.go index f132f1bd7..b3f95472d 100644 --- a/workspaces/controller/api/v1beta1/zz_generated.deepcopy.go +++ b/workspaces/controller/api/v1beta1/zz_generated.deepcopy.go @@ -513,6 +513,7 @@ func (in *Workspace) DeepCopyObject() runtime.Object { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *WorkspaceActivity) DeepCopyInto(out *WorkspaceActivity) { *out = *in + out.LastProbe = in.LastProbe } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WorkspaceActivity. @@ -525,22 +526,6 @@ func (in *WorkspaceActivity) DeepCopy() *WorkspaceActivity { return out } -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *WorkspaceActivityStatus) DeepCopyInto(out *WorkspaceActivityStatus) { - *out = *in - out.LastProbe = in.LastProbe -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WorkspaceActivityStatus. -func (in *WorkspaceActivityStatus) DeepCopy() *WorkspaceActivityStatus { - if in == nil { - return nil - } - out := new(WorkspaceActivityStatus) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *WorkspaceKind) DeepCopyInto(out *WorkspaceKind) { *out = *in @@ -889,11 +874,6 @@ func (in *WorkspaceKindSpec) DeepCopy() *WorkspaceKindSpec { func (in *WorkspaceKindStatus) DeepCopyInto(out *WorkspaceKindStatus) { *out = *in in.PodTemplateOptions.DeepCopyInto(&out.PodTemplateOptions) - if in.Activity != nil { - in, out := &in.Activity, &out.Activity - *out = new(WorkspaceActivityStatus) - **out = **in - } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WorkspaceKindStatus. diff --git a/workspaces/controller/config/crd/bases/kubeflow.org_workspacekinds.yaml b/workspaces/controller/config/crd/bases/kubeflow.org_workspacekinds.yaml index 41455582f..a5b44bc28 100644 --- a/workspaces/controller/config/crd/bases/kubeflow.org_workspacekinds.yaml +++ b/workspaces/controller/config/crd/bases/kubeflow.org_workspacekinds.yaml @@ -4551,51 +4551,6 @@ spec: status: description: WorkspaceKindStatus defines the observed state of WorkspaceKind properties: - activity: - description: Information about the last activity probe - properties: - lastProbe: - description: Information about the last activity probe - properties: - endTimeMs: - description: the time the probe was completed (UNIX epoch - in milliseconds) - example: 1710435305000 - format: int64 - minimum: 0 - type: integer - message: - default: "" - description: |- - a human-readable message about the probe result - WARNING: this field is NOT FOR MACHINE USE, subject to change without notice - example: Jupyter probe succeeded - type: string - result: - description: |- - the result of the probe - ENUM: "Success" | "Failure" | "Timeout" - enum: - - Success - - Failure - - Timeout - type: string - startTimeMs: - description: the time the probe was started (UNIX epoch in - milliseconds) - example: 1710435303000 - format: int64 - minimum: 0 - type: integer - required: - - endTimeMs - - message - - result - - startTimeMs - type: object - required: - - lastProbe - type: object podTemplateOptions: description: metrics for podTemplate options properties: diff --git a/workspaces/controller/config/crd/bases/kubeflow.org_workspaces.yaml b/workspaces/controller/config/crd/bases/kubeflow.org_workspaces.yaml index 7d76d0e4c..e86311df8 100644 --- a/workspaces/controller/config/crd/bases/kubeflow.org_workspaces.yaml +++ b/workspaces/controller/config/crd/bases/kubeflow.org_workspaces.yaml @@ -185,6 +185,47 @@ spec: example: 1704067200 format: int64 type: integer + lastProbe: + description: Information about the last activity probe + properties: + endTimeMs: + description: the time the probe was completed (UNIX epoch + in milliseconds) + example: 1710435305000 + format: int64 + minimum: 0 + type: integer + message: + default: "" + description: |- + a human-readable message about the probe result + WARNING: this field is NOT FOR MACHINE USE, subject to change without notice + example: Jupyter probe succeeded + type: string + result: + default: Unknown + description: |- + the result of the probe + ENUM: "Success" | "Failure" | "Timeout" + enum: + - Success + - Failure + - Timeout + - Unknown + type: string + startTimeMs: + description: the time the probe was started (UNIX epoch in + milliseconds) + example: 1710435303000 + format: int64 + minimum: 0 + type: integer + required: + - endTimeMs + - message + - result + - startTimeMs + type: object lastUpdate: default: 0 description: the last time we checked for activity on the Workspace @@ -194,6 +235,7 @@ spec: type: integer required: - lastActivity + - lastProbe - lastUpdate type: object pauseTime: diff --git a/workspaces/controller/config/rbac/role.yaml b/workspaces/controller/config/rbac/role.yaml index cedd310e1..fc1e85523 100644 --- a/workspaces/controller/config/rbac/role.yaml +++ b/workspaces/controller/config/rbac/role.yaml @@ -14,6 +14,12 @@ rules: - get - list - watch +- apiGroups: + - "" + resources: + - pods/exec + verbs: + - create - apiGroups: - "" resources: diff --git a/workspaces/controller/internal/controller/culling_controller.go b/workspaces/controller/internal/controller/culling_controller.go index 52acc7a83..c26a98de9 100644 --- a/workspaces/controller/internal/controller/culling_controller.go +++ b/workspaces/controller/internal/controller/culling_controller.go @@ -17,6 +17,7 @@ limitations under the License. package controller import ( + "bytes" "context" "encoding/json" "errors" @@ -27,8 +28,16 @@ import ( apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/fields" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" + "k8s.io/client-go/tools/remotecommand" "k8s.io/utils/ptr" + "net" "net/http" + "os" + "path/filepath" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" @@ -70,11 +79,15 @@ func (r *CullingReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct } if !*workspace.Spec.DisableCulling { - log.Info("Culling is disabled for this workspace") + log.V(2).Info("Culling is disabled for this workspace") + return ctrl.Result{}, nil + } + + if *workspace.Spec.Paused { + log.V(2).Info("Workspace is paused, skipping culling") return ctrl.Result{}, nil } - // check if the workspace is running if workspace.Status.State != kubefloworgv1beta1.WorkspaceStateRunning { log.V(2).Info("Workspace is not running, skipping culling") return ctrl.Result{}, nil @@ -100,57 +113,93 @@ func (r *CullingReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct // Convert last activity and update times from Unix to time.Time lastActivityTime := time.Unix(workspace.Status.Activity.LastActivity, 0) lastUpdateTime := time.Unix(workspace.Status.Activity.LastUpdate, 0) + lastProbeTime := time.Unix(workspace.Status.Activity.LastProbe.EndTimeMs/1000, 0) // Fetch the culling configuration from the WorkspaceKind spec maxInactiveSeconds := *workspaceKind.Spec.PodTemplate.Culling.MaxInactiveSeconds maxProbeIntervalSeconds := *workspaceKind.Spec.PodTemplate.Culling.MaxProbeIntervalSeconds + minProbeIntervalSeconds := *workspaceKind.Spec.PodTemplate.Culling.MinProbeIntervalSeconds - // Set requeue duration based on the minimum probe interval - requeueDuration := time.Duration(maxProbeIntervalSeconds) * time.Second - - // Calculate time since the last activity and the last update + // Calculate time since the last activity, the last update and the last probe timeSinceLastActivity := time.Since(lastActivityTime).Seconds() timeSinceLastUpdate := time.Since(lastUpdateTime).Seconds() + timeSinceLastProbe := time.Since(lastProbeTime).Seconds() + + // Calculate the requeue time for the next probe + requeueAfter := max(time.Duration(float64(maxProbeIntervalSeconds)-timeSinceLastProbe)*time.Second, 0) + minRequeueAfter := time.Duration(minProbeIntervalSeconds+cullingBufferSeconds) * time.Second + + // if the workspace has been probed recently, requeue for the next probe + if timeSinceLastProbe < float64(minProbeIntervalSeconds) { + log.V(2).Info("Workspace has been probed recently, requeueing for the next probe.", + "MinProbeIntervalSeconds", minProbeIntervalSeconds, + "TimeSinceLastProbe", timeSinceLastProbe) + return ctrl.Result{RequeueAfter: requeueAfter}, nil + } // If the workspace has been active recently, requeue for the next probe if timeSinceLastActivity < float64(maxInactiveSeconds) { log.V(2).Info("Workspace activity is within the allowed period, requeueing for the next probe.", "MaxInactiveSeconds", maxInactiveSeconds, "TimeSinceLastActivity", timeSinceLastActivity) - return ctrl.Result{RequeueAfter: requeueDuration}, nil + return ctrl.Result{RequeueAfter: requeueAfter}, nil } // If the workspace was updated recently, requeue for the next probe if timeSinceLastUpdate < float64(maxProbeIntervalSeconds) { log.V(2).Info("Workspace has been updated recently, requeueing for the next probe.", "MinProbeIntervalSeconds", maxProbeIntervalSeconds, "TimeSinceLastUpdate", timeSinceLastUpdate) - return ctrl.Result{RequeueAfter: requeueDuration}, nil + return ctrl.Result{RequeueAfter: requeueAfter}, nil } // Check if JupyterLab API probing is enabled if workspaceKind.Spec.PodTemplate.Culling.ActivityProbe.Jupyter != nil { + probeStartTime := time.Now() // This is hardcoded for now, but should be fetched from the workspace's service serviceName, err := r.getServiceName(ctx, workspace) if err != nil { log.Error(err, "Error fetching service name for workspace") - return ctrl.Result{}, err + workspace.Status.Activity.LastProbe = kubefloworgv1beta1.ProbeStatus{ + StartTimeMs: probeStartTime.UnixMilli(), + EndTimeMs: time.Now().UnixMilli(), + Result: kubefloworgv1beta1.ProbeResultFailure, + Message: "Failed to fetch service name for workspace", + } + if err := r.Status().Update(ctx, workspace); err != nil { + if apierrors.IsConflict(err) { + log.V(2).Info("update conflict while updating Workspace status, will requeue") + return ctrl.Result{Requeue: true}, nil + } + log.Error(err, "unable to update Workspace status") + } + return ctrl.Result{RequeueAfter: minRequeueAfter}, nil } port := "8888" jupyterAPIEndpoint := fmt.Sprintf("http://%s.%s.svc.%s:%s/workspace/%s/%s/jupyterlab/api/status", serviceName, workspace.Namespace, defaultClusterDomain, port, workspace.Namespace, workspace.Name) - probeStartTime := time.Now() - lastActivity, err := fetchLastActivityFromJupyterAPI(jupyterAPIEndpoint) + lastActivity, err, probeMessage, probeResult := fetchLastActivityFromJupyterAPI(jupyterAPIEndpoint) if err != nil { log.Error(err, "Error fetching last activity from JupyterLab API") - return ctrl.Result{}, err + workspace.Status.Activity.LastProbe = kubefloworgv1beta1.ProbeStatus{ + StartTimeMs: probeStartTime.UnixMilli(), + EndTimeMs: time.Now().UnixMilli(), + Result: probeResult, + Message: probeMessage, + } + + if err := r.Status().Update(ctx, workspace); err != nil { + if apierrors.IsConflict(err) { + log.V(2).Info("update conflict while updating Workspace status, will requeue") + return ctrl.Result{Requeue: true}, nil + } + log.Error(err, "unable to update Workspace status") + } + + return ctrl.Result{RequeueAfter: minRequeueAfter}, nil } workspace.Status.Activity.LastUpdate = probeStartTime.Unix() workspace.Status.Activity.LastActivity = lastActivity.Unix() - if err := r.Status().Update(ctx, workspace); err != nil { - log.Error(err, "Failed to update workspace status after probe", "Workspace", workspace.Name) - return ctrl.Result{}, err - } // If the workspace has been inactive for too long, initiate culling if time.Since(lastActivity).Seconds() > float64(maxInactiveSeconds+cullingBufferSeconds) { log.Info("Culling the workspace due to inactivity", "TimeSinceLastActivity", time.Since(lastActivity).Seconds()) @@ -161,13 +210,37 @@ func (r *CullingReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct return ctrl.Result{}, err } } + workspace.Status.Activity.LastProbe = kubefloworgv1beta1.ProbeStatus{ + StartTimeMs: probeStartTime.UnixMilli(), + EndTimeMs: time.Now().UnixMilli(), + Result: probeResult, + Message: probeMessage, + } + if err := r.Status().Update(ctx, workspace); err != nil { + if apierrors.IsConflict(err) { + log.V(2).Info("update conflict while updating Workspace status, will requeue") + return ctrl.Result{Requeue: true}, nil + } + log.Error(err, "unable to update Workspace status") + } log.V(2).Info("requeueing for next probe") - return ctrl.Result{RequeueAfter: requeueDuration}, nil + return ctrl.Result{RequeueAfter: requeueAfter}, nil } //TODO: Implement Bash Probe + if workspaceKind.Spec.PodTemplate.Culling.ActivityProbe.Exec != nil { + exitCode, err := r.execCommand("podName", workspace.Namespace, workspaceKind.Spec.PodTemplate.Culling.ActivityProbe.Exec.Command) + if err != nil { + log.Error(err, "Error executing command probe") + return ctrl.Result{}, err + } + if exitCode != 0 { + + } + + } log.Info("culling controller finished") - return ctrl.Result{RequeueAfter: requeueDuration}, nil + return ctrl.Result{RequeueAfter: requeueAfter}, nil } // SetupWithManager sets up the controller with the Manager. @@ -179,16 +252,24 @@ func (r *CullingReconciler) SetupWithManager(mgr ctrl.Manager) error { } // fetchLastActivityFromJupyterAPI queries the JupyterLab API for the last activity time. -func fetchLastActivityFromJupyterAPI(apiEndpoint string) (time.Time, error) { +func fetchLastActivityFromJupyterAPI(apiEndpoint string) (time.Time, error, string, kubefloworgv1beta1.ProbeResult) { resp, err := http.Get(apiEndpoint) + var netErr net.Error if err != nil { - return time.Time{}, fmt.Errorf("failed to reach JupyterLab API: %w", err) + if errors.As(err, &netErr) && netErr.Timeout() { + return time.Time{}, fmt.Errorf("JupyterLab API request timed out: %w", err), + "JupyterLab API request timeout", kubefloworgv1beta1.ProbeResultTimeout + } else { + return time.Time{}, fmt.Errorf("JupyterLab API request failed: %w", err), + "Jupyter probe failed", kubefloworgv1beta1.ProbeResultFailure + } } defer resp.Body.Close() // Check if the API returned a 200-OK status if resp.StatusCode != http.StatusOK { - return time.Time{}, fmt.Errorf("JupyterLab API returned non-200 status: %d", resp.StatusCode) + return time.Time{}, fmt.Errorf("JupyterLab API returned non-200 status: %d", resp.StatusCode), + fmt.Sprintf("Jupyter probe failed: HTTP %d", resp.StatusCode), kubefloworgv1beta1.ProbeResultFailure } // Decode the API response to extract the last activity time @@ -196,16 +277,18 @@ func fetchLastActivityFromJupyterAPI(apiEndpoint string) (time.Time, error) { LastActivity string `json:"last_activity"` } if err := json.NewDecoder(resp.Body).Decode(&status); err != nil { - return time.Time{}, fmt.Errorf("failed to parse JupyterLab API response: %w", err) + return time.Time{}, fmt.Errorf("failed to parse JupyterLab API response: %w", err), + "Jupyter probe failed: invalid response body", kubefloworgv1beta1.ProbeResultFailure } // Parse the last activity time from the response lastActivity, err := time.Parse(time.RFC3339, status.LastActivity) if err != nil { - return time.Time{}, fmt.Errorf("failed to parse last activity time: %w", err) + return time.Time{}, fmt.Errorf("failed to parse last activity time: %w", err), + "Jupyter probe failed: invalid last activity time", kubefloworgv1beta1.ProbeResultFailure } - return lastActivity, nil + return lastActivity, nil, "Jupyter probe succeeded", kubefloworgv1beta1.ProbeResultSuccess } func (r *CullingReconciler) getServiceName(ctx context.Context, workspace *kubefloworgv1beta1.Workspace) (string, error) { @@ -236,3 +319,60 @@ func (r *CullingReconciler) getServiceName(ctx context.Context, workspace *kubef // Return the single found service name return ownedServices.Items[0].Name, nil } + +func (r *CullingReconciler) execCommand(podName, podNamespace string, command []string) (int32, error) { + config, err := rest.InClusterConfig() + + if err != nil { + if errors.Is(err, rest.ErrNotInCluster) { + // If the in-cluster configuration is not available, try to get the configuration from the kube config file + kubeConfig := filepath.Join(os.Getenv("HOME"), ".kube", "config") + config, err = clientcmd.BuildConfigFromFlags("", kubeConfig) + if err != nil { + return -1, err + } + } else { + return -1, err + } + + } + + clientset, err := kubernetes.NewForConfig(config) + if err != nil { + return -1, err + } + req := clientset.CoreV1().RESTClient(). + Post(). + Resource("pods"). + Name(podName). + Namespace(podNamespace). + SubResource("exec"). + VersionedParams(&corev1.PodExecOptions{ + Container: "main", + Command: command, + Stdin: true, + Stdout: true, + Stderr: true, + }, scheme.ParameterCodec) + executor, err := remotecommand.NewSPDYExecutor(config, "POST", req.URL()) + if err != nil { + return -1, err + } + var stdout, stderr bytes.Buffer + + err = executor.StreamWithContext(context.Background(), remotecommand.StreamOptions{ + Stdin: os.Stdin, + Stdout: &stdout, + Stderr: &stderr, + }) + if err != nil { + var exitError *apierrors.StatusError + if errors.As(err, &exitError) { + return exitError.Status().Code, nil + } + } else { + // extract the exit code from the stdout / stderr + } + + return 0, nil +} diff --git a/workspaces/controller/internal/controller/workspace_controller.go b/workspaces/controller/internal/controller/workspace_controller.go index 1af67d3c5..85574b2cc 100644 --- a/workspaces/controller/internal/controller/workspace_controller.go +++ b/workspaces/controller/internal/controller/workspace_controller.go @@ -93,6 +93,7 @@ type WorkspaceReconciler struct { // +kubebuilder:rbac:groups=core,resources=events,verbs=get;list;watch // +kubebuilder:rbac:groups=core,resources=configmaps,verbs=get;list;watch // +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch +// +kubebuilder:rbac:groups="core",resources=pods/exec,verbs=create // +kubebuilder:rbac:groups=core,resources=services,verbs=create;delete;get;list;patch;update;watch // +kubebuilder:rbac:groups=networking.istio.io,resources=virtualservices,verbs=create;delete;get;list;patch;update;watch From ab125aeb2858da9c49b2acc8cc17214cd7bb9629 Mon Sep 17 00:00:00 2001 From: Adem Baccara <71262172+Adembc@users.noreply.github.com> Date: Mon, 16 Sep 2024 14:17:53 +0100 Subject: [PATCH 04/13] fetch podName Signed-off-by: Adem Baccara <71262172+Adembc@users.noreply.github.com> --- .../internal/controller/culling_controller.go | 35 ++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/workspaces/controller/internal/controller/culling_controller.go b/workspaces/controller/internal/controller/culling_controller.go index c26a98de9..06ece9d37 100644 --- a/workspaces/controller/internal/controller/culling_controller.go +++ b/workspaces/controller/internal/controller/culling_controller.go @@ -24,6 +24,7 @@ import ( "fmt" kubefloworgv1beta1 "github.com/kubeflow/notebooks/workspaces/controller/api/v1beta1" "github.com/kubeflow/notebooks/workspaces/controller/internal/helper" + appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/fields" @@ -228,7 +229,12 @@ func (r *CullingReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct } //TODO: Implement Bash Probe if workspaceKind.Spec.PodTemplate.Culling.ActivityProbe.Exec != nil { - exitCode, err := r.execCommand("podName", workspace.Namespace, workspaceKind.Spec.PodTemplate.Culling.ActivityProbe.Exec.Command) + podName, err := r.getPodName(ctx, workspace) + if err != nil { + log.Error(err, "Error fetching pod name for workspace") + return ctrl.Result{}, err + } + exitCode, err := r.execCommand(podName, workspace.Namespace, workspaceKind.Spec.PodTemplate.Culling.ActivityProbe.Exec.Command) if err != nil { log.Error(err, "Error executing command probe") return ctrl.Result{}, err @@ -319,6 +325,33 @@ func (r *CullingReconciler) getServiceName(ctx context.Context, workspace *kubef // Return the single found service name return ownedServices.Items[0].Name, nil } +func (r *CullingReconciler) getPodName(ctx context.Context, workspace *kubefloworgv1beta1.Workspace) (string, error) { + var statefulSetName string + ownedStatefulSets := &appsv1.StatefulSetList{} + listOpts := &client.ListOptions{ + FieldSelector: fields.OneTermEqualSelector(helper.IndexWorkspaceOwnerField, workspace.Name), + Namespace: workspace.Namespace, + } + if err := r.List(ctx, ownedStatefulSets, listOpts); err != nil { + return "", err + } + + // reconcile StatefulSet + if len(ownedStatefulSets.Items) > 1 { + statefulSetList := make([]string, len(ownedStatefulSets.Items)) + for i, sts := range ownedStatefulSets.Items { + statefulSetList[i] = sts.Name + } + statefulSetListString := strings.Join(statefulSetList, ", ") + return "", fmt.Errorf("workspace owns multiple StatefulSets: %s", statefulSetListString) + } else if len(ownedStatefulSets.Items) == 0 { + return "", errors.New("workspace does not own any StatefulSet") + } + + statefulSetName = ownedStatefulSets.Items[0].Name + podName := fmt.Sprintf("%s-0", statefulSetName) + return podName, nil +} func (r *CullingReconciler) execCommand(podName, podNamespace string, command []string) (int32, error) { config, err := rest.InClusterConfig() From 39cebf5a0812dea62ade8370ba3b5c020cc4775b Mon Sep 17 00:00:00 2001 From: Adem Baccara <71262172+Adembc@users.noreply.github.com> Date: Sun, 29 Sep 2024 20:48:20 +0100 Subject: [PATCH 05/13] implment bash probe Signed-off-by: Adem Baccara <71262172+Adembc@users.noreply.github.com> --- .../api/v1beta1/workspacekind_types.go | 18 + .../api/v1beta1/zz_generated.deepcopy.go | 7 +- workspaces/controller/cmd/main.go | 12 +- .../bases/kubeflow.org_workspacekinds.yaml | 37 +- .../controller/config/manager/manager.yaml | 3 + .../jupyterlab_v1beta1_workspacekind.yaml | 27 +- workspaces/controller/go.mod | 6 + workspaces/controller/go.sum | 94 +++++ .../internal/controller/culling_controller.go | 353 ++++++++++++------ 9 files changed, 415 insertions(+), 142 deletions(-) diff --git a/workspaces/controller/api/v1beta1/workspacekind_types.go b/workspaces/controller/api/v1beta1/workspacekind_types.go index 449be5cf0..dfb76719b 100644 --- a/workspaces/controller/api/v1beta1/workspacekind_types.go +++ b/workspaces/controller/api/v1beta1/workspacekind_types.go @@ -217,6 +217,24 @@ type ActivityProbe struct { } type ActivityProbeExec struct { + // the script should write a JSON file at this path. + // any existing file in this path will be REMOVED before the script is run + //+kubebuilder:example="/tmp/activity_probe.json" + OutputPath string `json:"outputPath"` + + // the number of seconds to wait for the script to complete + //+kubebuilder:validation:Minimum:=1 + //+kubebuilder:validation:Maximum:=600 + TimeoutSeconds int32 `json:"timeoutSeconds"` + + // the script to run to determine if the Workspace is active + // - the script must exit with a 0 status code unless there is an error + // - workspaces with failing activity probes will NOT be culled + // - the script must have a shebang (e.g. `#!/usr/bin/env bash` or `#!/usr/bin/env python`) + // - the script should be idempotent and without side effects, it may be run multiple times + // - typically, it will be more efficient to write a probe which checks for a specific + // activity indicator agreed with your users, rather than checking the entire filesystem + Script string `json:"script"` // the command to run // +kubebuilder:validation:MinItems:=1 // +kubebuilder:example={"bash", "-c", "exit 0"} diff --git a/workspaces/controller/api/v1beta1/zz_generated.deepcopy.go b/workspaces/controller/api/v1beta1/zz_generated.deepcopy.go index b3f95472d..3b4fbb22a 100644 --- a/workspaces/controller/api/v1beta1/zz_generated.deepcopy.go +++ b/workspaces/controller/api/v1beta1/zz_generated.deepcopy.go @@ -31,7 +31,7 @@ func (in *ActivityProbe) DeepCopyInto(out *ActivityProbe) { if in.Exec != nil { in, out := &in.Exec, &out.Exec *out = new(ActivityProbeExec) - (*in).DeepCopyInto(*out) + **out = **in } if in.Jupyter != nil { in, out := &in.Jupyter, &out.Jupyter @@ -53,11 +53,6 @@ func (in *ActivityProbe) DeepCopy() *ActivityProbe { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ActivityProbeExec) DeepCopyInto(out *ActivityProbeExec) { *out = *in - if in.Command != nil { - in, out := &in.Command, &out.Command - *out = make([]string, len(*in)) - copy(*out, *in) - } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ActivityProbeExec. diff --git a/workspaces/controller/cmd/main.go b/workspaces/controller/cmd/main.go index 9720e8ac6..b27fe6685 100644 --- a/workspaces/controller/cmd/main.go +++ b/workspaces/controller/cmd/main.go @@ -19,6 +19,7 @@ package main import ( "crypto/tls" "flag" + "k8s.io/client-go/kubernetes" "os" // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) @@ -147,10 +148,17 @@ func main() { setupLog.Error(err, "unable to create controller", "controller", "WorkspaceKind") os.Exit(1) } + clientset, err := kubernetes.NewForConfig(mgr.GetConfig()) + if err != nil { + setupLog.Error(err, "unable to create clientset") + os.Exit(1) + } // +kubebuilder:scaffold:builder if err = (&controllerInternal.CullingReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Config: mgr.GetConfig(), + ClientSet: clientset, }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "Culler") os.Exit(1) diff --git a/workspaces/controller/config/crd/bases/kubeflow.org_workspacekinds.yaml b/workspaces/controller/config/crd/bases/kubeflow.org_workspacekinds.yaml index a5b44bc28..6f16f80e1 100644 --- a/workspaces/controller/config/crd/bases/kubeflow.org_workspacekinds.yaml +++ b/workspaces/controller/config/crd/bases/kubeflow.org_workspacekinds.yaml @@ -261,18 +261,33 @@ spec: - if the Workspace had activity in the last 60 seconds this command should return status 0, otherwise it should return status 1 properties: - command: - description: the command to run - example: - - bash - - -c - - exit 0 - items: - type: string - minItems: 1 - type: array + outputPath: + description: "\t the script should write a JSON file + at this path.\n\t any existing file in this path + will be REMOVED before the script is run" + example: /tmp/activity_probe.json + type: string + script: + description: |- + the script to run to determine if the Workspace is active + - the script must exit with a 0 status code unless there is an error + - workspaces with failing activity probes will NOT be culled + - the script must have a shebang (e.g. `#!/usr/bin/env bash` or `#!/usr/bin/env python`) + - the script should be idempotent and without side effects, it may be run multiple times + - typically, it will be more efficient to write a probe which checks for a specific + activity indicator agreed with your users, rather than checking the entire filesystem + type: string + timeoutSeconds: + description: the number of seconds to wait for the + script to complete + format: int32 + maximum: 600 + minimum: 1 + type: integer required: - - command + - outputPath + - script + - timeoutSeconds type: object jupyter: description: |- diff --git a/workspaces/controller/config/manager/manager.yaml b/workspaces/controller/config/manager/manager.yaml index 1e6d66099..15a21e4fa 100644 --- a/workspaces/controller/config/manager/manager.yaml +++ b/workspaces/controller/config/manager/manager.yaml @@ -67,6 +67,9 @@ spec: image: controller:latest imagePullPolicy: IfNotPresent name: manager + env: + - name: HTTP_TIMEOUT_SECONDS + value: "5" securityContext: allowPrivilegeEscalation: false capabilities: diff --git a/workspaces/controller/config/samples/jupyterlab_v1beta1_workspacekind.yaml b/workspaces/controller/config/samples/jupyterlab_v1beta1_workspacekind.yaml index 1450ac2f4..d12e1b4af 100644 --- a/workspaces/controller/config/samples/jupyterlab_v1beta1_workspacekind.yaml +++ b/workspaces/controller/config/samples/jupyterlab_v1beta1_workspacekind.yaml @@ -100,11 +100,28 @@ spec: ## should return status 0, otherwise it should return status 1 ## #exec: - # command: - # - "bash" - # - "-c" - # - "exit 0" - + # outputPath: "/tmp/activity_probe.json" + # timeoutSeconds: 60 + # script: |- + # #!/usr/bin/env bash + # + # set -euo pipefail + # + # # Define the output path + # output_path="/tmp/activity_probe.json" + # + # # Find the most recent modification time in the $HOME directory + # last_activity_epoch=$(find "$HOME" -type f -printf '%T@\n' 2>/dev/null | awk 'max < $1 { max = $1 } END { print max }') + # + # # Write the last activity time to the output path + # if [ -n "$last_activity_epoch" ]; then + # # Convert epoch time to ISO 8601 format + # last_activity=$(date -d "@$last_activity_epoch" -Iseconds) + # echo "{\"last_activity\": \"$last_activity\"}" > "$output_path" + # else + # # Handle the case where no files are found + # echo "{\"last_activity\": null}" > "$output_path" + # fi ## OPTION 2: a Jupyter-specific probe ## - will poll the `/api/status` endpoint of the Jupyter API, and use the `last_activity` field ## https://github.com/jupyter-server/jupyter_server/blob/v2.13.0/jupyter_server/services/api/handlers.py#L62-L67 diff --git a/workspaces/controller/go.mod b/workspaces/controller/go.mod index 8e1b5b5b2..6c937da1c 100644 --- a/workspaces/controller/go.mod +++ b/workspaces/controller/go.mod @@ -34,13 +34,19 @@ require ( github.com/google/gofuzz v1.2.0 // indirect github.com/google/pprof v0.0.0-20240525223248-4bfdf5a9a2af // indirect github.com/google/uuid v1.6.0 // indirect + github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 // indirect + github.com/google/uuid v1.3.0 // indirect + github.com/gorilla/websocket v1.5.0 // indirect github.com/imdario/mergo v0.3.6 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/mailru/easyjson v0.7.7 // indirect + github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0 // indirect + github.com/moby/spdystream v0.2.0 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect github.com/pkg/errors v0.9.1 // indirect github.com/prometheus/client_golang v1.19.1 // indirect github.com/prometheus/client_model v0.6.1 // indirect diff --git a/workspaces/controller/go.sum b/workspaces/controller/go.sum index 8496f957c..1f083e6df 100644 --- a/workspaces/controller/go.sum +++ b/workspaces/controller/go.sum @@ -186,6 +186,100 @@ k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 h1:pUdcCO1Lk/tbT5ztQWOBi5HBgbBP1 k8s.io/utils v0.0.0-20240711033017-18e509b52bc8/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= sigs.k8s.io/controller-runtime v0.19.1 h1:Son+Q40+Be3QWb+niBXAg2vFiYWolDjjRfO8hn/cxOk= sigs.k8s.io/controller-runtime v0.19.1/go.mod h1:iRmWllt8IlaLjvTTDLhRBXIEtkCK6hwVBJJsYS9Ajf4= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/evanphx/json-patch v0.5.2 h1:xVCHIVMUu1wtM/VkR9jVZ45N3FhZfYMMYGorLCR8P3k= +github.com/evanphx/json-patch v0.5.2/go.mod h1:ZWS5hhDbVDyob71nXKNL0+PWn6ToqBHMikGIFbs31qQ= +github.com/evanphx/json-patch/v5 v5.9.0 h1:kcBlZQbplgElYIlo/n1hJbls2z/1awpXxpRi0/FOJfg= +github.com/evanphx/json-patch/v5 v5.9.0/go.mod h1:VNkHZ/282BpEyt/tObQO8s5CMPmYYq14uClGH4abBuQ= +github.com/fxamacker/cbor/v2 v2.7.0 h1:iM5WgngdRBanHcxugY4JySA0nk1wZorNOpTgCMedv5E= +github.com/fxamacker/cbor/v2 v2.7.0/go.mod h1:pxXPTn3joSm21Gbwsv0w9OSA2y1HFR9qXEeXQVeNoDQ= +github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= +github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-openapi/swag v0.22.4 h1:QLMzNJnMGPRNDCbySlcj1x01tzU8/9LTTL9hZZZogBU= +github.com/go-openapi/swag v0.22.4/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= +github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= +github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 h1:K6RDEckDVWvDI9JAJYCmNdQXq6neHJOYx3V6jnqNEec= +github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= +github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= +github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= +github.com/gorilla/websocket v1.5.0 h1:PPwGk2jz7EePpoHN/+ClbZu8SPxiqlu12wZP/3sWmnc= +github.com/gorilla/websocket v1.5.0/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= +github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= +github.com/google/pprof v0.0.0-20240525223248-4bfdf5a9a2af h1:kmjWCqn2qkEml422C2Rrd27c3VGxi6a/6HNq8QmHRKM= +github.com/google/pprof v0.0.0-20240525223248-4bfdf5a9a2af/go.mod h1:K1liHPHnj73Fdn/EKuT8nrFqBihUSKXoLYU0BuatOYo= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0 h1:jWpvCLoY8Z/e3VKvlsiIGKtc+UG6U5vzxaoagmhXfyg= +github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0/go.mod h1:QUyp042oQthUoa9bqDv0ER0wrtXnBruoNd7aNjkbP+k= +github.com/moby/spdystream v0.2.0 h1:cjW1zVyyoiM0T7b6UoySUFqzXMoqRckQtXwGPiBhOM8= +github.com/moby/spdystream v0.2.0/go.mod h1:f7i0iNDQJ059oMTcWxx8MA/zKFIuD/lY+0GqbN2Wy8c= +github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus= +github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= +github.com/onsi/ginkgo/v2 v2.14.0 h1:vSmGj2Z5YPb9JwCWT6z6ihcUvDhuXLc3sJiqd3jMKAY= +github.com/onsi/ginkgo/v2 v2.14.0/go.mod h1:JkUdW7JkN0V6rFvsHcJ478egV3XH9NxpD27Hal/PhZw= +github.com/onsi/gomega v1.30.0 h1:hvMK7xYz4D3HapigLTeGdId/NcfQx1VHMJc60ew99+8= +github.com/onsi/gomega v1.30.0/go.mod h1:9sxs+SwGrKI0+PWe4Fxa9tFQQBG5xSsSbMXOI8PPpoQ= +github.com/onsi/ginkgo/v2 v2.19.0 h1:9Cnnf7UHo57Hy3k6/m5k3dRfGTMXGvxhHFvkDTCTpvA= +github.com/onsi/ginkgo/v2 v2.19.0/go.mod h1:rlwLi9PilAFJ8jCg9UE1QP6VBpd6/xj3SRC0d6TU0To= +github.com/onsi/gomega v1.33.1 h1:dsYjIxxSR755MDmKVsaFQTE22ChNBcuuTWgkUDSubOk= +github.com/onsi/gomega v1.33.1/go.mod h1:U4R44UsT+9eLIaYRB2a5qajjtQYn0hauxvRm16AVYg0= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_golang v1.19.1 h1:wZWJDwK+NameRJuPGDhlnFgx8e8HN3XHQeLaYJFJBOE= +github.com/prometheus/client_golang v1.19.1/go.mod h1:mP78NwGzrVks5S2H6ab8+ZZGJLZUq1hoULYBAYBw1Ho= +github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= +github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= +github.com/prometheus/common v0.55.0 h1:KEi6DK7lXW/m7Ig5i47x0vRzuBsHuvJdi5ee6Y3G1dc= +github.com/prometheus/common v0.55.0/go.mod h1:2SECS4xJG1kd8XF9IcM1gMX6510RAEL65zxzNImwdc8= +github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= +github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= +github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= +github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= +github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= +golang.org/x/exp v0.0.0-20230515195305-f3d0a9c9a5cc h1:mCRnTeVUjcrhlRmO0VK8a6k6Rrf6TF9htwo2pJVSjIU= +golang.org/x/exp v0.0.0-20230515195305-f3d0a9c9a5cc/go.mod h1:V1LtkGg67GoY2N1AnLN78QLrzxkLyJw7RJb1gzOOz9w= +golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ= +golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE= +golang.org/x/oauth2 v0.21.0 h1:tsimM75w1tF/uws5rbeHzIWxEqElMehnc+iW793zsZs= +golang.org/x/oauth2 v0.21.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= +golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws= +golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.21.0 h1:WVXCp+/EBEHOj53Rvu+7KiT/iElMrO8ACK16SMZ3jaA= +golang.org/x/term v0.21.0/go.mod h1:ooXLefLobQVslOqselCNF4SxFAaoS6KujMbsGzSDmX0= +golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4= +golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= +golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d h1:vU5i/LfpvrRCpgM/VPfJLg5KjxD3E+hfT1SH+d9zLwg= +golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= +google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg= +google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw= +gopkg.in/evanphx/json-patch.v4 v4.12.0 h1:n6jtcsulIzXPJaxegRbvFNNrZDjbij7ny3gmSPG+6V4= +gopkg.in/evanphx/json-patch.v4 v4.12.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= +k8s.io/api v0.31.0 h1:b9LiSjR2ym/SzTOlfMHm1tr7/21aD7fSkqgD/CVJBCo= +k8s.io/api v0.31.0/go.mod h1:0YiFF+JfFxMM6+1hQei8FY8M7s1Mth+z/q7eF1aJkTE= +k8s.io/apiextensions-apiserver v0.31.0 h1:fZgCVhGwsclj3qCw1buVXCV6khjRzKC5eCFt24kyLSk= +k8s.io/apiextensions-apiserver v0.31.0/go.mod h1:b9aMDEYaEe5sdK+1T0KU78ApR/5ZVp4i56VacZYEHxk= +k8s.io/apimachinery v0.31.0 h1:m9jOiSr3FoSSL5WO9bjm1n6B9KROYYgNZOb4tyZ1lBc= +k8s.io/apimachinery v0.31.0/go.mod h1:rsPdaZJfTfLsNJSQzNHQvYoTmxhoOEofxtOsF3rtsMo= +k8s.io/client-go v0.31.0 h1:QqEJzNjbN2Yv1H79SsS+SWnXkBgVu4Pj3CJQgbx0gI8= +k8s.io/client-go v0.31.0/go.mod h1:Y9wvC76g4fLjmU0BA+rV+h2cncoadjvjjkkIGoTLcGU= +k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= +k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= +k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 h1:BZqlfIlq5YbRMFko6/PM7FjZpUb45WallggurYhKGag= +k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340/go.mod h1:yD4MZYeKMBwQKVht279WycxKyM84kkAx2DPrTXaeb98= +k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 h1:pUdcCO1Lk/tbT5ztQWOBi5HBgbBP1J8+AsQnQCKsi8A= +k8s.io/utils v0.0.0-20240711033017-18e509b52bc8/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +sigs.k8s.io/controller-runtime v0.19.1 h1:Son+Q40+Be3QWb+niBXAg2vFiYWolDjjRfO8hn/cxOk= +sigs.k8s.io/controller-runtime v0.19.1/go.mod h1:iRmWllt8IlaLjvTTDLhRBXIEtkCK6hwVBJJsYS9Ajf4= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= sigs.k8s.io/structured-merge-diff/v4 v4.4.1 h1:150L+0vs/8DA78h1u02ooW1/fFq/Lwr+sGiqlzvrtq4= diff --git a/workspaces/controller/internal/controller/culling_controller.go b/workspaces/controller/internal/controller/culling_controller.go index 06ece9d37..f7a17c49b 100644 --- a/workspaces/controller/internal/controller/culling_controller.go +++ b/workspaces/controller/internal/controller/culling_controller.go @@ -22,6 +22,7 @@ import ( "encoding/json" "errors" "fmt" + "github.com/go-logr/logr" kubefloworgv1beta1 "github.com/kubeflow/notebooks/workspaces/controller/api/v1beta1" "github.com/kubeflow/notebooks/workspaces/controller/internal/helper" appsv1 "k8s.io/api/apps/v1" @@ -29,38 +30,46 @@ import ( apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/fields" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/util/httpstream" "k8s.io/client-go/kubernetes" "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/rest" - "k8s.io/client-go/tools/clientcmd" "k8s.io/client-go/tools/remotecommand" "k8s.io/utils/ptr" "net" "net/http" + "net/url" "os" - "path/filepath" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" + "strconv" "strings" "time" ) const ( - defaultClusterDomain = "cluster.local" - cullingBufferSeconds = 5 + defaultClusterDomain = "cluster.local" + inactivityToleranceBufferSeconds = 5 + defaultHTTPTimeout = 5 * time.Second ) // CullingReconciler reconciles a Workspace object type CullingReconciler struct { client.Client - Scheme *runtime.Scheme + Scheme *runtime.Scheme + ClientSet *kubernetes.Clientset + Config *rest.Config +} + +type ActivityProbe struct { + HasActivity *bool `json:"has_activity,omitempty"` + LastActivity *string `json:"last_activity,omitempty"` } func (r *CullingReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { // nolint:gocyclo log := log.FromContext(ctx) log.V(2).Info("reconciling Workspace for culling") - // fetch the Workspace workspace := &kubefloworgv1beta1.Workspace{} if err := r.Get(ctx, req.NamespacedName, workspace); err != nil { @@ -111,7 +120,7 @@ func (r *CullingReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct return ctrl.Result{}, nil } - // Convert last activity and update times from Unix to time.Time + // Fetch the last activity, update and probe times from the Workspace status lastActivityTime := time.Unix(workspace.Status.Activity.LastActivity, 0) lastUpdateTime := time.Unix(workspace.Status.Activity.LastUpdate, 0) lastProbeTime := time.Unix(workspace.Status.Activity.LastProbe.EndTimeMs/1000, 0) @@ -128,7 +137,7 @@ func (r *CullingReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct // Calculate the requeue time for the next probe requeueAfter := max(time.Duration(float64(maxProbeIntervalSeconds)-timeSinceLastProbe)*time.Second, 0) - minRequeueAfter := time.Duration(minProbeIntervalSeconds+cullingBufferSeconds) * time.Second + minRequeueAfter := time.Duration(minProbeIntervalSeconds) * time.Second // if the workspace has been probed recently, requeue for the next probe if timeSinceLastProbe < float64(minProbeIntervalSeconds) { @@ -156,24 +165,15 @@ func (r *CullingReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct // Check if JupyterLab API probing is enabled if workspaceKind.Spec.PodTemplate.Culling.ActivityProbe.Jupyter != nil { probeStartTime := time.Now() - // This is hardcoded for now, but should be fetched from the workspace's service serviceName, err := r.getServiceName(ctx, workspace) if err != nil { log.Error(err, "Error fetching service name for workspace") - workspace.Status.Activity.LastProbe = kubefloworgv1beta1.ProbeStatus{ + return r.updateWorkspaceActivityStatus(ctx, log, workspace, &minRequeueAfter, &kubefloworgv1beta1.ProbeStatus{ StartTimeMs: probeStartTime.UnixMilli(), EndTimeMs: time.Now().UnixMilli(), Result: kubefloworgv1beta1.ProbeResultFailure, Message: "Failed to fetch service name for workspace", - } - if err := r.Status().Update(ctx, workspace); err != nil { - if apierrors.IsConflict(err) { - log.V(2).Info("update conflict while updating Workspace status, will requeue") - return ctrl.Result{Requeue: true}, nil - } - log.Error(err, "unable to update Workspace status") - } - return ctrl.Result{RequeueAfter: minRequeueAfter}, nil + }, nil, nil) } port := "8888" jupyterAPIEndpoint := fmt.Sprintf("http://%s.%s.svc.%s:%s/workspace/%s/%s/jupyterlab/api/status", serviceName, workspace.Namespace, defaultClusterDomain, port, workspace.Namespace, workspace.Name) @@ -181,68 +181,123 @@ func (r *CullingReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct lastActivity, err, probeMessage, probeResult := fetchLastActivityFromJupyterAPI(jupyterAPIEndpoint) if err != nil { log.Error(err, "Error fetching last activity from JupyterLab API") - workspace.Status.Activity.LastProbe = kubefloworgv1beta1.ProbeStatus{ + return r.updateWorkspaceActivityStatus(ctx, log, workspace, &minRequeueAfter, &kubefloworgv1beta1.ProbeStatus{ StartTimeMs: probeStartTime.UnixMilli(), EndTimeMs: time.Now().UnixMilli(), Result: probeResult, Message: probeMessage, - } - - if err := r.Status().Update(ctx, workspace); err != nil { - if apierrors.IsConflict(err) { - log.V(2).Info("update conflict while updating Workspace status, will requeue") - return ctrl.Result{Requeue: true}, nil - } - log.Error(err, "unable to update Workspace status") - } - - return ctrl.Result{RequeueAfter: minRequeueAfter}, nil + }, nil, nil) } - workspace.Status.Activity.LastUpdate = probeStartTime.Unix() - workspace.Status.Activity.LastActivity = lastActivity.Unix() // If the workspace has been inactive for too long, initiate culling - if time.Since(lastActivity).Seconds() > float64(maxInactiveSeconds+cullingBufferSeconds) { - log.Info("Culling the workspace due to inactivity", "TimeSinceLastActivity", time.Since(lastActivity).Seconds()) + if time.Since(*lastActivity).Seconds() > float64(maxInactiveSeconds+inactivityToleranceBufferSeconds) { + log.V(2).Info("Culling the workspace due to inactivity", "TimeSinceLastActivity", time.Since(*lastActivity).Seconds()) workspace.Spec.Paused = ptr.To(true) err := r.Update(ctx, workspace) if err != nil { log.Error(err, "Error updating workspace during culling") - return ctrl.Result{}, err + return r.updateWorkspaceActivityStatus(ctx, log, workspace, &requeueAfter, &kubefloworgv1beta1.ProbeStatus{ + StartTimeMs: probeStartTime.UnixMilli(), + EndTimeMs: time.Now().UnixMilli(), + Result: kubefloworgv1beta1.ProbeResultFailure, + Message: "Failed to pause workspace", + }, nil, nil) } } - workspace.Status.Activity.LastProbe = kubefloworgv1beta1.ProbeStatus{ + return r.updateWorkspaceActivityStatus(ctx, log, workspace, &requeueAfter, &kubefloworgv1beta1.ProbeStatus{ StartTimeMs: probeStartTime.UnixMilli(), EndTimeMs: time.Now().UnixMilli(), Result: probeResult, Message: probeMessage, - } - if err := r.Status().Update(ctx, workspace); err != nil { - if apierrors.IsConflict(err) { - log.V(2).Info("update conflict while updating Workspace status, will requeue") - return ctrl.Result{Requeue: true}, nil - } - log.Error(err, "unable to update Workspace status") - } - log.V(2).Info("requeueing for next probe") - return ctrl.Result{RequeueAfter: requeueAfter}, nil + }, ptr.To(probeStartTime.Unix()), ptr.To(lastActivity.Unix())) } - //TODO: Implement Bash Probe + if workspaceKind.Spec.PodTemplate.Culling.ActivityProbe.Exec != nil { + probeStartTime := time.Now() podName, err := r.getPodName(ctx, workspace) if err != nil { log.Error(err, "Error fetching pod name for workspace") - return ctrl.Result{}, err + return r.updateWorkspaceActivityStatus(ctx, log, workspace, &minRequeueAfter, &kubefloworgv1beta1.ProbeStatus{ + StartTimeMs: probeStartTime.UnixMilli(), + EndTimeMs: time.Now().UnixMilli(), + Result: kubefloworgv1beta1.ProbeResultFailure, + Message: "Failed to fetch pod name for workspace", + }, nil, nil) } - exitCode, err := r.execCommand(podName, workspace.Namespace, workspaceKind.Spec.PodTemplate.Culling.ActivityProbe.Exec.Command) + stdout, stderr, err := r.execCommand(ctx, podName, workspace.Namespace, workspaceKind.Spec.PodTemplate.Culling.ActivityProbe.Exec) if err != nil { - log.Error(err, "Error executing command probe") - return ctrl.Result{}, err + log.Error(err, "Error executing command probe", "stderr", stderr) + return r.updateWorkspaceActivityStatus(ctx, log, workspace, &requeueAfter, &kubefloworgv1beta1.ProbeStatus{ + StartTimeMs: probeStartTime.UnixMilli(), + EndTimeMs: time.Now().UnixMilli(), + Result: kubefloworgv1beta1.ProbeResultFailure, + Message: "Failed to execute command probe", + }, nil, nil) + } - if exitCode != 0 { + // handle the probe result + activityProbe, err := parseActivityProbeJson(stdout) + if err != nil { + log.Error(err, "Error parsing activity probe JSON") + return r.updateWorkspaceActivityStatus(ctx, log, workspace, &requeueAfter, &kubefloworgv1beta1.ProbeStatus{ + StartTimeMs: probeStartTime.UnixMilli(), + EndTimeMs: time.Now().UnixMilli(), + Result: kubefloworgv1beta1.ProbeResultFailure, + Message: "Failed to parse activity probe JSON", + }, nil, nil) } + lastActivity := time.Now().Unix() + if activityProbe.HasActivity != nil && !*activityProbe.HasActivity { + log.V(2).Info("Culling the workspace due to inactivity") + //TODO: figure out how to set the last activity time + lastActivity = time.Now().Unix() + workspace.Spec.Paused = ptr.To(true) + err := r.Update(ctx, workspace) + if err != nil { + log.Error(err, "Error updating workspace during culling") + return r.updateWorkspaceActivityStatus(ctx, log, workspace, &requeueAfter, &kubefloworgv1beta1.ProbeStatus{ + StartTimeMs: probeStartTime.UnixMilli(), + EndTimeMs: time.Now().UnixMilli(), + Result: kubefloworgv1beta1.ProbeResultFailure, + Message: "Failed to update workspace during culling", + }, nil, nil) + } + } + if activityProbe.HasActivity == nil && activityProbe.LastActivity != nil { + lastActivityTime, err = time.Parse(time.RFC3339, *activityProbe.LastActivity) + if err != nil { + log.Error(err, "Error parsing last activity time") + return r.updateWorkspaceActivityStatus(ctx, log, workspace, &requeueAfter, &kubefloworgv1beta1.ProbeStatus{ + StartTimeMs: probeStartTime.UnixMilli(), + EndTimeMs: time.Now().UnixMilli(), + Result: kubefloworgv1beta1.ProbeResultFailure, + Message: "Failed to parse last activity time", + }, nil, nil) + } + lastActivity = lastActivityTime.Unix() + if time.Since(lastActivityTime).Seconds() > float64(maxInactiveSeconds+inactivityToleranceBufferSeconds) { + log.V(2).Info("Culling the workspace due to inactivity", "TimeSinceLastActivity", time.Since(lastActivityTime).Seconds()) + workspace.Spec.Paused = ptr.To(true) + err := r.Update(ctx, workspace) + if err != nil { + log.Error(err, "Error updating workspace during culling") + return r.updateWorkspaceActivityStatus(ctx, log, workspace, &requeueAfter, &kubefloworgv1beta1.ProbeStatus{ + StartTimeMs: probeStartTime.UnixMilli(), + EndTimeMs: time.Now().UnixMilli(), + Result: kubefloworgv1beta1.ProbeResultFailure, + Message: "Failed to update workspace during culling", + }, nil, nil) + } + } + } + return r.updateWorkspaceActivityStatus(ctx, log, workspace, &requeueAfter, &kubefloworgv1beta1.ProbeStatus{ + StartTimeMs: probeStartTime.UnixMilli(), + EndTimeMs: time.Now().UnixMilli(), + Result: kubefloworgv1beta1.ProbeResultSuccess, + Message: "Bash probe succeeded", + }, ptr.To(probeStartTime.Unix()), ptr.To(lastActivity)) } log.Info("culling controller finished") @@ -257,44 +312,33 @@ func (r *CullingReconciler) SetupWithManager(mgr ctrl.Manager) error { Complete(r) } -// fetchLastActivityFromJupyterAPI queries the JupyterLab API for the last activity time. -func fetchLastActivityFromJupyterAPI(apiEndpoint string) (time.Time, error, string, kubefloworgv1beta1.ProbeResult) { - resp, err := http.Get(apiEndpoint) - var netErr net.Error - if err != nil { - if errors.As(err, &netErr) && netErr.Timeout() { - return time.Time{}, fmt.Errorf("JupyterLab API request timed out: %w", err), - "JupyterLab API request timeout", kubefloworgv1beta1.ProbeResultTimeout - } else { - return time.Time{}, fmt.Errorf("JupyterLab API request failed: %w", err), - "Jupyter probe failed", kubefloworgv1beta1.ProbeResultFailure - } +// updateWorkspaceActivityStatus attempts to immediately update the Workspace activity status with the provided status. +func (r *CullingReconciler) updateWorkspaceActivityStatus(ctx context.Context, log logr.Logger, workspace *kubefloworgv1beta1.Workspace, requeueAfter *time.Duration, probeStatus *kubefloworgv1beta1.ProbeStatus, lastUpdate, lastActivity *int64) (ctrl.Result, error) { // nolint:unparam + if workspace == nil { + return ctrl.Result{}, fmt.Errorf("provided Workspace was nil") } - defer resp.Body.Close() - - // Check if the API returned a 200-OK status - if resp.StatusCode != http.StatusOK { - return time.Time{}, fmt.Errorf("JupyterLab API returned non-200 status: %d", resp.StatusCode), - fmt.Sprintf("Jupyter probe failed: HTTP %d", resp.StatusCode), kubefloworgv1beta1.ProbeResultFailure + if lastUpdate != nil { + workspace.Status.Activity.LastUpdate = *lastUpdate } - - // Decode the API response to extract the last activity time - var status struct { - LastActivity string `json:"last_activity"` + if lastActivity != nil { + workspace.Status.Activity.LastActivity = *lastActivity } - if err := json.NewDecoder(resp.Body).Decode(&status); err != nil { - return time.Time{}, fmt.Errorf("failed to parse JupyterLab API response: %w", err), - "Jupyter probe failed: invalid response body", kubefloworgv1beta1.ProbeResultFailure + if probeStatus != nil { + workspace.Status.Activity.LastProbe = *probeStatus } - - // Parse the last activity time from the response - lastActivity, err := time.Parse(time.RFC3339, status.LastActivity) - if err != nil { - return time.Time{}, fmt.Errorf("failed to parse last activity time: %w", err), - "Jupyter probe failed: invalid last activity time", kubefloworgv1beta1.ProbeResultFailure + if err := r.Status().Update(ctx, workspace); err != nil { + if apierrors.IsConflict(err) { + log.V(2).Info("update conflict while updating Workspace status, will requeue") + return ctrl.Result{Requeue: true}, nil + } + log.Error(err, "unable to update Workspace status") + return ctrl.Result{}, err + } + if requeueAfter != nil { + return ctrl.Result{RequeueAfter: *requeueAfter}, nil } - return lastActivity, nil, "Jupyter probe succeeded", kubefloworgv1beta1.ProbeResultSuccess + return ctrl.Result{}, nil } func (r *CullingReconciler) getServiceName(ctx context.Context, workspace *kubefloworgv1beta1.Workspace) (string, error) { @@ -325,6 +369,7 @@ func (r *CullingReconciler) getServiceName(ctx context.Context, workspace *kubef // Return the single found service name return ownedServices.Items[0].Name, nil } + func (r *CullingReconciler) getPodName(ctx context.Context, workspace *kubefloworgv1beta1.Workspace) (string, error) { var statefulSetName string ownedStatefulSets := &appsv1.StatefulSetList{} @@ -353,28 +398,17 @@ func (r *CullingReconciler) getPodName(ctx context.Context, workspace *kubeflowo return podName, nil } -func (r *CullingReconciler) execCommand(podName, podNamespace string, command []string) (int32, error) { - config, err := rest.InClusterConfig() +func (r *CullingReconciler) execCommand(ctx context.Context, podName, podNamespace string, exec *kubefloworgv1beta1.ActivityProbeExec) (string, string, error) { + timeoutCtx, cancel := context.WithTimeout(ctx, time.Duration(exec.TimeoutSeconds)*time.Second) + defer cancel() - if err != nil { - if errors.Is(err, rest.ErrNotInCluster) { - // If the in-cluster configuration is not available, try to get the configuration from the kube config file - kubeConfig := filepath.Join(os.Getenv("HOME"), ".kube", "config") - config, err = clientcmd.BuildConfigFromFlags("", kubeConfig) - if err != nil { - return -1, err - } - } else { - return -1, err - } - - } + command := fmt.Sprintf(` + rm -f %s + %s + cat %s + `, exec.OutputPath, exec.Script, exec.OutputPath) - clientset, err := kubernetes.NewForConfig(config) - if err != nil { - return -1, err - } - req := clientset.CoreV1().RESTClient(). + req := r.ClientSet.CoreV1().RESTClient(). Post(). Resource("pods"). Name(podName). @@ -382,30 +416,113 @@ func (r *CullingReconciler) execCommand(podName, podNamespace string, command [] SubResource("exec"). VersionedParams(&corev1.PodExecOptions{ Container: "main", - Command: command, - Stdin: true, + Command: []string{"bash", "-c", command}, + Stdin: false, Stdout: true, Stderr: true, + TTY: false, }, scheme.ParameterCodec) - executor, err := remotecommand.NewSPDYExecutor(config, "POST", req.URL()) + + executor, err := createExecutor(req.URL(), r.Config) if err != nil { - return -1, err + return "", "", fmt.Errorf("error creating executor: %v", err) } - var stdout, stderr bytes.Buffer - err = executor.StreamWithContext(context.Background(), remotecommand.StreamOptions{ - Stdin: os.Stdin, + var stdout, stderr bytes.Buffer + err = executor.StreamWithContext(timeoutCtx, remotecommand.StreamOptions{ + Stdin: nil, Stdout: &stdout, Stderr: &stderr, + Tty: false, }) + + return stdout.String(), stderr.String(), err +} + +// fetchLastActivityFromJupyterAPI queries the JupyterLab API for the last activity time. +func fetchLastActivityFromJupyterAPI(apiEndpoint string) (*time.Time, error, string, kubefloworgv1beta1.ProbeResult) { + httpTimeoutSeconds := defaultHTTPTimeout + if timeout, err := strconv.Atoi(os.Getenv("HTTP_TIMEOUT_SECONDS")); err == nil && timeout > 0 { + httpTimeoutSeconds = time.Duration(timeout) * time.Second + } + httpClient := &http.Client{Timeout: httpTimeoutSeconds} + resp, err := httpClient.Get(apiEndpoint) + var netErr net.Error if err != nil { - var exitError *apierrors.StatusError - if errors.As(err, &exitError) { - return exitError.Status().Code, nil + if errors.As(err, &netErr) && netErr.Timeout() { + return nil, fmt.Errorf("JupyterLab API request timed out: %w", err), + "JupyterLab API request timeout", kubefloworgv1beta1.ProbeResultTimeout + } else { + return nil, fmt.Errorf("JupyterLab API request failed: %w", err), + "Jupyter probe failed", kubefloworgv1beta1.ProbeResultFailure } - } else { - // extract the exit code from the stdout / stderr } + // Check if the API returned a 200-OK status + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("JupyterLab API returned non-200 status: %d", resp.StatusCode), + fmt.Sprintf("Jupyter probe failed: HTTP %d", resp.StatusCode), kubefloworgv1beta1.ProbeResultFailure + } + + // Decode the API response to extract the last activity time + var status struct { + LastActivity string `json:"last_activity"` + } + + defer resp.Body.Close() + if err := json.NewDecoder(resp.Body).Decode(&status); err != nil { + return nil, fmt.Errorf("failed to parse JupyterLab API response: %w", err), + "Jupyter probe failed: invalid response body", kubefloworgv1beta1.ProbeResultFailure + } + + // Parse the last activity time from the response + lastActivity, err := time.Parse(time.RFC3339, status.LastActivity) + if err != nil { + return nil, fmt.Errorf("failed to parse last activity time: %w", err), + "Jupyter probe failed: invalid last activity time", kubefloworgv1beta1.ProbeResultFailure + } + + return &lastActivity, nil, "Jupyter probe succeeded", kubefloworgv1beta1.ProbeResultSuccess +} + +// createExecutor creates a new Executor for the given URL and REST config. +func createExecutor(url *url.URL, config *rest.Config) (remotecommand.Executor, error) { + exec, err := remotecommand.NewSPDYExecutor(config, "POST", url) + if err != nil { + return nil, err + } + // WebSocketExecutor must be "GET" method as described in RFC 6455 Sec. 4.1 (page 17). + websocketExec, err := remotecommand.NewWebSocketExecutor(config, "GET", url.String()) + if err != nil { + return nil, err + } + exec, err = remotecommand.NewFallbackExecutor(websocketExec, exec, func(err error) bool { + return httpstream.IsUpgradeFailure(err) || isHTTPSProxyError(err) + }) + if err != nil { + return nil, err + } + + return exec, nil +} + +// isHTTPSProxyError checks if the given error is due to an unknown scheme in the proxy. +func isHTTPSProxyError(err error) bool { + if err == nil { + return false + } + return strings.Contains(err.Error(), "proxy: unknown scheme: https") +} + +// parseActivityProbeJson parses the JSON string into an ActivityProbe struct and ensures +// that at least has_activity or last_activity fields are present. +func parseActivityProbeJson(jsonString string) (*ActivityProbe, error) { + activityProbe := &ActivityProbe{} + if err := json.Unmarshal([]byte(jsonString), activityProbe); err != nil { + return nil, err + } + if activityProbe.HasActivity == nil && activityProbe.LastActivity == nil { + return nil, errors.New("has_activity and last_activity fields are missing in the activity probe JSON") + } + return activityProbe, nil - return 0, nil } From 29f04f655b72be982ee4507d4c7518d317289db1 Mon Sep 17 00:00:00 2001 From: Adem Baccara <71262172+Adembc@users.noreply.github.com> Date: Mon, 7 Oct 2024 16:22:07 +0100 Subject: [PATCH 06/13] implment bash probe Signed-off-by: Adem Baccara <71262172+Adembc@users.noreply.github.com> --- .../controller/api/v1beta1/workspace_types.go | 13 +++-- .../api/v1beta1/workspacekind_types.go | 14 ++--- .../bases/kubeflow.org_workspacekinds.yaml | 11 ++-- .../crd/bases/kubeflow.org_workspaces.yaml | 8 ++- .../config/manager/kustomization.yaml | 1 - .../jupyterlab_v1beta1_workspacekind.yaml | 56 +++++++++---------- .../internal/controller/culling_controller.go | 13 ++--- 7 files changed, 58 insertions(+), 58 deletions(-) diff --git a/workspaces/controller/api/v1beta1/workspace_types.go b/workspaces/controller/api/v1beta1/workspace_types.go index cf4618dd4..ccd6560d1 100644 --- a/workspaces/controller/api/v1beta1/workspace_types.go +++ b/workspaces/controller/api/v1beta1/workspace_types.go @@ -243,8 +243,8 @@ type ProbeStatus struct { EndTimeMs int64 `json:"endTimeMs"` // the result of the probe - // ENUM: "Success" | "Failure" | "Timeout" - //+kubebuilder:default="Unknown" + // ENUM: "Success" | "Failure" | "Timeout" | "" + //+kubebuilder:default="" Result ProbeResult `json:"result"` // a human-readable message about the probe result @@ -266,7 +266,7 @@ const ( WorkspaceStateUnknown WorkspaceState = "Unknown" ) -// +kubebuilder:validation:Enum={"Success","Failure","Timeout","Unknown"} +// +kubebuilder:validation:Enum={"Success","Failure","Timeout",""} type ProbeResult string const ( @@ -282,9 +282,10 @@ const ( =============================================================================== */ -// +kubebuilder:object:root=true -// +kubebuilder:printcolumn:name="State",type="string",JSONPath=".status.state",description="The current state of the Workspace" -// +kubebuilder:subresource:status +//+kubebuilder:object:root=true +//+kubebuilder:printcolumn:name="State",type="string",JSONPath=".status.state",description="The current state of the Workspace" +//+kubebuilder:subresource:status +//+kubebuilder:resource:shortName=ws // Workspace is the Schema for the Workspaces API type Workspace struct { diff --git a/workspaces/controller/api/v1beta1/workspacekind_types.go b/workspaces/controller/api/v1beta1/workspacekind_types.go index dfb76719b..affc77829 100644 --- a/workspaces/controller/api/v1beta1/workspacekind_types.go +++ b/workspaces/controller/api/v1beta1/workspacekind_types.go @@ -217,14 +217,14 @@ type ActivityProbe struct { } type ActivityProbeExec struct { - // the script should write a JSON file at this path. - // any existing file in this path will be REMOVED before the script is run + // the script should write a JSON file at this path. + // any existing file in this path will be REMOVED before the script is run //+kubebuilder:example="/tmp/activity_probe.json" OutputPath string `json:"outputPath"` // the number of seconds to wait for the script to complete - //+kubebuilder:validation:Minimum:=1 - //+kubebuilder:validation:Maximum:=600 + // +kubebuilder:validation:Minimum=1 + // +kubebuilder:validation:Maximum=600 TimeoutSeconds int32 `json:"timeoutSeconds"` // the script to run to determine if the Workspace is active @@ -235,10 +235,6 @@ type ActivityProbeExec struct { // - typically, it will be more efficient to write a probe which checks for a specific // activity indicator agreed with your users, rather than checking the entire filesystem Script string `json:"script"` - // the command to run - // +kubebuilder:validation:MinItems:=1 - // +kubebuilder:example={"bash", "-c", "exit 0"} - Command []string `json:"command"` } // +kubebuilder:validation:XValidation:message="'lastActivity' must be true",rule="has(self.lastActivity) && self.lastActivity" @@ -580,7 +576,7 @@ type OptionMetric struct { // +kubebuilder:printcolumn:name="Deprecated",type="boolean",JSONPath=".spec.spawner.deprecated",description="If this WorkspaceKind is deprecated" // +kubebuilder:printcolumn:name="Hidden",type="boolean",JSONPath=".spec.spawner.hidden",description="If this WorkspaceKind is hidden from the spawner UI" // +kubebuilder:subresource:status -// +kubebuilder:resource:scope=Cluster +// +kubebuilder:resource:scope=Cluster,shortName=wsk // WorkspaceKind is the Schema for the WorkspaceKinds API type WorkspaceKind struct { diff --git a/workspaces/controller/config/crd/bases/kubeflow.org_workspacekinds.yaml b/workspaces/controller/config/crd/bases/kubeflow.org_workspacekinds.yaml index 6f16f80e1..4a83ade3c 100644 --- a/workspaces/controller/config/crd/bases/kubeflow.org_workspacekinds.yaml +++ b/workspaces/controller/config/crd/bases/kubeflow.org_workspacekinds.yaml @@ -11,6 +11,8 @@ spec: kind: WorkspaceKind listKind: WorkspaceKindList plural: workspacekinds + shortNames: + - wsk singular: workspacekind scope: Cluster versions: @@ -262,9 +264,9 @@ spec: should return status 0, otherwise it should return status 1 properties: outputPath: - description: "\t the script should write a JSON file - at this path.\n\t any existing file in this path - will be REMOVED before the script is run" + description: |- + the script should write a JSON file at this path. + any existing file in this path will be REMOVED before the script is run example: /tmp/activity_probe.json type: string script: @@ -278,10 +280,11 @@ spec: activity indicator agreed with your users, rather than checking the entire filesystem type: string timeoutSeconds: + default: 10 description: the number of seconds to wait for the script to complete format: int32 - maximum: 600 + maximum: 300 minimum: 1 type: integer required: diff --git a/workspaces/controller/config/crd/bases/kubeflow.org_workspaces.yaml b/workspaces/controller/config/crd/bases/kubeflow.org_workspaces.yaml index e86311df8..8f869fa38 100644 --- a/workspaces/controller/config/crd/bases/kubeflow.org_workspaces.yaml +++ b/workspaces/controller/config/crd/bases/kubeflow.org_workspaces.yaml @@ -11,6 +11,8 @@ spec: kind: Workspace listKind: WorkspaceList plural: workspaces + shortNames: + - ws singular: workspace scope: Namespaced versions: @@ -203,15 +205,15 @@ spec: example: Jupyter probe succeeded type: string result: - default: Unknown + default: "" description: |- the result of the probe - ENUM: "Success" | "Failure" | "Timeout" + ENUM: "Success" | "Failure" | "Timeout" | "" enum: - Success - Failure - Timeout - - Unknown + - "" type: string startTimeMs: description: the time the probe was started (UNIX epoch in diff --git a/workspaces/controller/config/manager/kustomization.yaml b/workspaces/controller/config/manager/kustomization.yaml index 2c5fb1ff7..ad2284487 100644 --- a/workspaces/controller/config/manager/kustomization.yaml +++ b/workspaces/controller/config/manager/kustomization.yaml @@ -5,4 +5,3 @@ resources: images: - name: controller newName: ghcr.io/kubeflow/notebooks/workspace-controller - newTag: latest diff --git a/workspaces/controller/config/samples/jupyterlab_v1beta1_workspacekind.yaml b/workspaces/controller/config/samples/jupyterlab_v1beta1_workspacekind.yaml index d12e1b4af..7f3f6656a 100644 --- a/workspaces/controller/config/samples/jupyterlab_v1beta1_workspacekind.yaml +++ b/workspaces/controller/config/samples/jupyterlab_v1beta1_workspacekind.yaml @@ -81,7 +81,7 @@ spec: ## the maximum number of seconds a Workspace can be inactive ## - maxInactiveSeconds: 86400 + maxInactiveSeconds: 100 ## the maximum number of seconds between probes ## @@ -89,7 +89,7 @@ spec: ## the minimum number of seconds between probes ## - minProbeIntervalSeconds: 20 + minProbeIntervalSeconds: 10 ## the probe used to determine if the Workspace is active ## @@ -99,38 +99,38 @@ spec: ## - if the Workspace had activity in the last 60 seconds this command ## should return status 0, otherwise it should return status 1 ## - #exec: - # outputPath: "/tmp/activity_probe.json" - # timeoutSeconds: 60 - # script: |- - # #!/usr/bin/env bash - # - # set -euo pipefail - # - # # Define the output path - # output_path="/tmp/activity_probe.json" - # - # # Find the most recent modification time in the $HOME directory - # last_activity_epoch=$(find "$HOME" -type f -printf '%T@\n' 2>/dev/null | awk 'max < $1 { max = $1 } END { print max }') - # - # # Write the last activity time to the output path - # if [ -n "$last_activity_epoch" ]; then - # # Convert epoch time to ISO 8601 format - # last_activity=$(date -d "@$last_activity_epoch" -Iseconds) - # echo "{\"last_activity\": \"$last_activity\"}" > "$output_path" - # else - # # Handle the case where no files are found - # echo "{\"last_activity\": null}" > "$output_path" - # fi + exec: + outputPath: "/tmp/activity_probe.json" + timeoutSeconds: 60 + script: |- + #!/usr/bin/env bash + + set -euo pipefail + + # Define the output path + output_path="/tmp/activity_probe.json" + + # Find the most recent modification time in the $HOME directory + last_activity_epoch=$(find "$HOME" -type f -printf '%T@\n' 2>/dev/null | awk 'max < $1 { max = $1 } END { print max }') + + # Write the last activity time to the output path + if [ -n "$last_activity_epoch" ]; then + # Convert epoch time to ISO 8601 format + last_activity=$(date -d "@$last_activity_epoch" -Iseconds) + echo "{\"last_activity\": \"$last_activity\"}" > "$output_path" + else + # Handle the case where no files are found + echo "{\"last_activity\": null}" > "$output_path" + fi ## OPTION 2: a Jupyter-specific probe ## - will poll the `/api/status` endpoint of the Jupyter API, and use the `last_activity` field ## https://github.com/jupyter-server/jupyter_server/blob/v2.13.0/jupyter_server/services/api/handlers.py#L62-L67 ## - note, users need to be careful that their other probes don't trigger a "last_activity" update ## e.g. they should only check the health of Jupyter using the `/api/status` endpoint ## - jupyter: - lastActivity: true - portId: jupyterlab +# jupyter: +# lastActivity: true +# portId: jupyterlab ## standard probes to determine Container health (MUTABLE) ## - spec for Probe: diff --git a/workspaces/controller/internal/controller/culling_controller.go b/workspaces/controller/internal/controller/culling_controller.go index f7a17c49b..30db063b6 100644 --- a/workspaces/controller/internal/controller/culling_controller.go +++ b/workspaces/controller/internal/controller/culling_controller.go @@ -51,7 +51,7 @@ import ( const ( defaultClusterDomain = "cluster.local" inactivityToleranceBufferSeconds = 5 - defaultHTTPTimeout = 5 * time.Second + defaultHTTPTimeout = 15 * time.Second ) // CullingReconciler reconciles a Workspace object @@ -87,9 +87,8 @@ func (r *CullingReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct log.V(2).Info("Workspace is being deleted, skipping culling") return ctrl.Result{}, nil } - - if !*workspace.Spec.DisableCulling { - log.V(2).Info("Culling is disabled for this workspace") + if workspace.Spec.DisableCulling != nil && *workspace.Spec.DisableCulling { + log.V(2).Info("Culling is disabled for this workspace", "DisableCulling", *workspace.Spec.DisableCulling) return ctrl.Result{}, nil } @@ -136,9 +135,9 @@ func (r *CullingReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct timeSinceLastProbe := time.Since(lastProbeTime).Seconds() // Calculate the requeue time for the next probe - requeueAfter := max(time.Duration(float64(maxProbeIntervalSeconds)-timeSinceLastProbe)*time.Second, 0) minRequeueAfter := time.Duration(minProbeIntervalSeconds) * time.Second - + requeueAfter := max(time.Duration(float64(maxProbeIntervalSeconds)-timeSinceLastProbe)*time.Second, minRequeueAfter) + log.Info("requesting requeue", "requeueAfter", requeueAfter, "minRequeueAfter", minRequeueAfter) // if the workspace has been probed recently, requeue for the next probe if timeSinceLastProbe < float64(minProbeIntervalSeconds) { log.V(2).Info("Workspace has been probed recently, requeueing for the next probe.", @@ -151,7 +150,7 @@ func (r *CullingReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct if timeSinceLastActivity < float64(maxInactiveSeconds) { log.V(2).Info("Workspace activity is within the allowed period, requeueing for the next probe.", "MaxInactiveSeconds", maxInactiveSeconds, - "TimeSinceLastActivity", timeSinceLastActivity) + "TimeSinceLastActivity", timeSinceLastActivity, "requeueAfter", requeueAfter) return ctrl.Result{RequeueAfter: requeueAfter}, nil } // If the workspace was updated recently, requeue for the next probe From 78a465d78ccf52415510052f227ad858fdd04abe Mon Sep 17 00:00:00 2001 From: Adem Baccara <71262172+Adembc@users.noreply.github.com> Date: Mon, 7 Oct 2024 16:32:11 +0100 Subject: [PATCH 07/13] implment bash probe Signed-off-by: Adem Baccara <71262172+Adembc@users.noreply.github.com> --- workspaces/controller/api/v1beta1/workspacekind_types.go | 5 +++-- .../controller/internal/controller/culling_controller.go | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/workspaces/controller/api/v1beta1/workspacekind_types.go b/workspaces/controller/api/v1beta1/workspacekind_types.go index affc77829..bdc061dd7 100644 --- a/workspaces/controller/api/v1beta1/workspacekind_types.go +++ b/workspaces/controller/api/v1beta1/workspacekind_types.go @@ -223,8 +223,9 @@ type ActivityProbeExec struct { OutputPath string `json:"outputPath"` // the number of seconds to wait for the script to complete - // +kubebuilder:validation:Minimum=1 - // +kubebuilder:validation:Maximum=600 + // +kubebuilder:validation:Minimum:=1 + // +kubebuilder:validation:Maximum:=300 + // +kubebuilder:default=10 TimeoutSeconds int32 `json:"timeoutSeconds"` // the script to run to determine if the Workspace is active diff --git a/workspaces/controller/internal/controller/culling_controller.go b/workspaces/controller/internal/controller/culling_controller.go index 30db063b6..775465662 100644 --- a/workspaces/controller/internal/controller/culling_controller.go +++ b/workspaces/controller/internal/controller/culling_controller.go @@ -51,7 +51,7 @@ import ( const ( defaultClusterDomain = "cluster.local" inactivityToleranceBufferSeconds = 5 - defaultHTTPTimeout = 15 * time.Second + defaultHTTPTimeout = 5 * time.Second ) // CullingReconciler reconciles a Workspace object From b1739f8b86cc3edfa1e059ebba61c2e3e0ea3260 Mon Sep 17 00:00:00 2001 From: Adem Baccara <71262172+Adembc@users.noreply.github.com> Date: Mon, 7 Oct 2024 19:18:19 +0100 Subject: [PATCH 08/13] fix tests Signed-off-by: Adem Baccara <71262172+Adembc@users.noreply.github.com> --- .../internal/controller/workspacekind_controller_test.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/workspaces/controller/internal/controller/workspacekind_controller_test.go b/workspaces/controller/internal/controller/workspacekind_controller_test.go index 393307610..ffa8ad163 100644 --- a/workspaces/controller/internal/controller/workspacekind_controller_test.go +++ b/workspaces/controller/internal/controller/workspacekind_controller_test.go @@ -129,7 +129,9 @@ var _ = Describe("WorkspaceKind Controller", func() { newWorkspaceKind = workspaceKind.DeepCopy() newWorkspaceKind.Spec.PodTemplate.Culling.ActivityProbe = kubefloworgv1beta1.ActivityProbe{ Exec: &kubefloworgv1beta1.ActivityProbeExec{ - Command: []string{"bash", "-c", "exit 0"}, + OutputPath: "/path/to/output", + TimeoutSeconds: 9, + Script: "echo 'hello, world!'", }, Jupyter: &kubefloworgv1beta1.ActivityProbeJupyter{ LastActivity: true, From da19de16d4f43aadbdf8343d7b7a70f361dc5bc5 Mon Sep 17 00:00:00 2001 From: Adem Baccara <71262172+Adembc@users.noreply.github.com> Date: Thu, 10 Oct 2024 21:52:55 +0100 Subject: [PATCH 09/13] get ws port Signed-off-by: Adem Baccara <71262172+Adembc@users.noreply.github.com> --- .../jupyterlab_v1beta1_workspacekind.yaml | 54 +++++++++---------- .../internal/controller/culling_controller.go | 27 +++++++++- 2 files changed, 52 insertions(+), 29 deletions(-) diff --git a/workspaces/controller/config/samples/jupyterlab_v1beta1_workspacekind.yaml b/workspaces/controller/config/samples/jupyterlab_v1beta1_workspacekind.yaml index 7f3f6656a..6cf3763c2 100644 --- a/workspaces/controller/config/samples/jupyterlab_v1beta1_workspacekind.yaml +++ b/workspaces/controller/config/samples/jupyterlab_v1beta1_workspacekind.yaml @@ -99,38 +99,38 @@ spec: ## - if the Workspace had activity in the last 60 seconds this command ## should return status 0, otherwise it should return status 1 ## - exec: - outputPath: "/tmp/activity_probe.json" - timeoutSeconds: 60 - script: |- - #!/usr/bin/env bash - - set -euo pipefail - - # Define the output path - output_path="/tmp/activity_probe.json" - - # Find the most recent modification time in the $HOME directory - last_activity_epoch=$(find "$HOME" -type f -printf '%T@\n' 2>/dev/null | awk 'max < $1 { max = $1 } END { print max }') - - # Write the last activity time to the output path - if [ -n "$last_activity_epoch" ]; then - # Convert epoch time to ISO 8601 format - last_activity=$(date -d "@$last_activity_epoch" -Iseconds) - echo "{\"last_activity\": \"$last_activity\"}" > "$output_path" - else - # Handle the case where no files are found - echo "{\"last_activity\": null}" > "$output_path" - fi +# exec: +# outputPath: "/tmp/activity_probe.json" +# timeoutSeconds: 60 +# script: |- +# #!/usr/bin/env bash +# +# set -euo pipefail +# +# # Define the output path +# output_path="/tmp/activity_probe.json" +# +# # Find the most recent modification time in the $HOME directory +# last_activity_epoch=$(find "$HOME" -type f -printf '%T@\n' 2>/dev/null | awk 'max < $1 { max = $1 } END { print max }') +# +# # Write the last activity time to the output path +# if [ -n "$last_activity_epoch" ]; then +# # Convert epoch time to ISO 8601 format +# last_activity=$(date -d "@$last_activity_epoch" -Iseconds) +# echo "{\"last_activity\": \"$last_activity\"}" > "$output_path" +# else +# # Handle the case where no files are found +# echo "{\"last_activity\": null}" > "$output_path" +# fi ## OPTION 2: a Jupyter-specific probe ## - will poll the `/api/status` endpoint of the Jupyter API, and use the `last_activity` field ## https://github.com/jupyter-server/jupyter_server/blob/v2.13.0/jupyter_server/services/api/handlers.py#L62-L67 ## - note, users need to be careful that their other probes don't trigger a "last_activity" update ## e.g. they should only check the health of Jupyter using the `/api/status` endpoint - ## -# jupyter: -# lastActivity: true -# portId: jupyterlab + # + jupyter: + lastActivity: true + portId: jupyterlab ## standard probes to determine Container health (MUTABLE) ## - spec for Probe: diff --git a/workspaces/controller/internal/controller/culling_controller.go b/workspaces/controller/internal/controller/culling_controller.go index 775465662..46f9b03df 100644 --- a/workspaces/controller/internal/controller/culling_controller.go +++ b/workspaces/controller/internal/controller/culling_controller.go @@ -174,8 +174,17 @@ func (r *CullingReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct Message: "Failed to fetch service name for workspace", }, nil, nil) } - port := "8888" - jupyterAPIEndpoint := fmt.Sprintf("http://%s.%s.svc.%s:%s/workspace/%s/%s/jupyterlab/api/status", serviceName, workspace.Namespace, defaultClusterDomain, port, workspace.Namespace, workspace.Name) + port, err := r.getWorkspacePort(ctx, workspace, workspaceKind) + if err != nil { + log.Error(err, "Error fetching port for workspace") + return r.updateWorkspaceActivityStatus(ctx, log, workspace, &minRequeueAfter, &kubefloworgv1beta1.ProbeStatus{ + StartTimeMs: probeStartTime.UnixMilli(), + EndTimeMs: time.Now().UnixMilli(), + Result: kubefloworgv1beta1.ProbeResultFailure, + Message: "Failed to fetch port for workspace", + }, nil, nil) + } + jupyterAPIEndpoint := fmt.Sprintf("http://%s.%s.svc.%s:%d/workspace/%s/%s/jupyterlab/api/status", serviceName, workspace.Namespace, defaultClusterDomain, port, workspace.Namespace, workspace.Name) lastActivity, err, probeMessage, probeResult := fetchLastActivityFromJupyterAPI(jupyterAPIEndpoint) if err != nil { @@ -211,6 +220,7 @@ func (r *CullingReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct }, ptr.To(probeStartTime.Unix()), ptr.To(lastActivity.Unix())) } + // Check if Bash probing is enabled if workspaceKind.Spec.PodTemplate.Culling.ActivityProbe.Exec != nil { probeStartTime := time.Now() podName, err := r.getPodName(ctx, workspace) @@ -369,6 +379,19 @@ func (r *CullingReconciler) getServiceName(ctx context.Context, workspace *kubef return ownedServices.Items[0].Name, nil } +func (r *CullingReconciler) getWorkspacePort(ctx context.Context, workspace *kubefloworgv1beta1.Workspace, workspaceKind *kubefloworgv1beta1.WorkspaceKind) (int32, error) { + for _, imageConfigValue := range workspaceKind.Spec.PodTemplate.Options.ImageConfig.Values { + if imageConfigValue.Id == workspace.Spec.PodTemplate.Options.ImageConfig { + for _, port := range imageConfigValue.Spec.Ports { + if port.Id == workspaceKind.Spec.PodTemplate.Culling.ActivityProbe.Jupyter.PortId { + return port.Port, nil + } + } + } + } + return 0, errors.New("port not found") +} + func (r *CullingReconciler) getPodName(ctx context.Context, workspace *kubefloworgv1beta1.Workspace) (string, error) { var statefulSetName string ownedStatefulSets := &appsv1.StatefulSetList{} From d28d7cde6b061b903790ed05d0a4fabd3468dfdc Mon Sep 17 00:00:00 2001 From: Adem Baccara <71262172+Adembc@users.noreply.github.com> Date: Sat, 9 Nov 2024 16:35:07 +0100 Subject: [PATCH 10/13] update rbac Signed-off-by: Adem Baccara <71262172+Adembc@users.noreply.github.com> --- workspaces/controller/internal/controller/culling_controller.go | 2 ++ .../controller/internal/controller/workspace_controller.go | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/workspaces/controller/internal/controller/culling_controller.go b/workspaces/controller/internal/controller/culling_controller.go index 46f9b03df..b4fe20a9a 100644 --- a/workspaces/controller/internal/controller/culling_controller.go +++ b/workspaces/controller/internal/controller/culling_controller.go @@ -67,6 +67,8 @@ type ActivityProbe struct { LastActivity *string `json:"last_activity,omitempty"` } +// +kubebuilder:rbac:groups="core",resources=pods/exec,verbs=create + func (r *CullingReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { // nolint:gocyclo log := log.FromContext(ctx) log.V(2).Info("reconciling Workspace for culling") diff --git a/workspaces/controller/internal/controller/workspace_controller.go b/workspaces/controller/internal/controller/workspace_controller.go index 85574b2cc..1af67d3c5 100644 --- a/workspaces/controller/internal/controller/workspace_controller.go +++ b/workspaces/controller/internal/controller/workspace_controller.go @@ -93,7 +93,6 @@ type WorkspaceReconciler struct { // +kubebuilder:rbac:groups=core,resources=events,verbs=get;list;watch // +kubebuilder:rbac:groups=core,resources=configmaps,verbs=get;list;watch // +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch -// +kubebuilder:rbac:groups="core",resources=pods/exec,verbs=create // +kubebuilder:rbac:groups=core,resources=services,verbs=create;delete;get;list;patch;update;watch // +kubebuilder:rbac:groups=networking.istio.io,resources=virtualservices,verbs=create;delete;get;list;patch;update;watch From 37bcc1914d50cdfe1fccc898edcc969e9e803f8d Mon Sep 17 00:00:00 2001 From: Adem Baccara <71262172+Adembc@users.noreply.github.com> Date: Fri, 7 Feb 2025 20:30:13 +0100 Subject: [PATCH 11/13] rebase Signed-off-by: Adem Baccara <71262172+Adembc@users.noreply.github.com> --- workspaces/controller/go.mod | 4 +- workspaces/controller/go.sum | 113 ++++++----------------------------- 2 files changed, 20 insertions(+), 97 deletions(-) diff --git a/workspaces/controller/go.mod b/workspaces/controller/go.mod index 6c937da1c..a464fd98b 100644 --- a/workspaces/controller/go.mod +++ b/workspaces/controller/go.mod @@ -34,15 +34,13 @@ require ( github.com/google/gofuzz v1.2.0 // indirect github.com/google/pprof v0.0.0-20240525223248-4bfdf5a9a2af // indirect github.com/google/uuid v1.6.0 // indirect - github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 // indirect - github.com/google/uuid v1.3.0 // indirect github.com/gorilla/websocket v1.5.0 // indirect github.com/imdario/mergo v0.3.6 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/mailru/easyjson v0.7.7 // indirect github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0 // indirect - github.com/moby/spdystream v0.2.0 // indirect + github.com/moby/spdystream v0.4.0 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect diff --git a/workspaces/controller/go.sum b/workspaces/controller/go.sum index 1f083e6df..70acc4f64 100644 --- a/workspaces/controller/go.sum +++ b/workspaces/controller/go.sum @@ -44,10 +44,18 @@ github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeN github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 h1:K6RDEckDVWvDI9JAJYCmNdQXq6neHJOYx3V6jnqNEec= +github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= github.com/google/pprof v0.0.0-20240525223248-4bfdf5a9a2af h1:kmjWCqn2qkEml422C2Rrd27c3VGxi6a/6HNq8QmHRKM= github.com/google/pprof v0.0.0-20240525223248-4bfdf5a9a2af/go.mod h1:K1liHPHnj73Fdn/EKuT8nrFqBihUSKXoLYU0BuatOYo= +github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= +github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= +github.com/gorilla/websocket v1.5.0 h1:PPwGk2jz7EePpoHN/+ClbZu8SPxiqlu12wZP/3sWmnc= +github.com/gorilla/websocket v1.5.0/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= +github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= github.com/imdario/mergo v0.3.6 h1:xTNEAn+kxVO7dTZGu0CegyqKZmoWFI0rF8UxjlB2d28= github.com/imdario/mergo v0.3.6/go.mod h1:2EnlNZ0deacrJVfApfmtdGgDfMuh/nq6Ok1EcJh5FfA= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= @@ -65,6 +73,11 @@ github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= +github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0 h1:jWpvCLoY8Z/e3VKvlsiIGKtc+UG6U5vzxaoagmhXfyg= +github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0/go.mod h1:QUyp042oQthUoa9bqDv0ER0wrtXnBruoNd7aNjkbP+k= +github.com/moby/spdystream v0.2.0 h1:cjW1zVyyoiM0T7b6UoySUFqzXMoqRckQtXwGPiBhOM8= +github.com/moby/spdystream v0.2.0/go.mod h1:f7i0iNDQJ059oMTcWxx8MA/zKFIuD/lY+0GqbN2Wy8c= +github.com/moby/spdystream v0.4.0/go.mod h1:xBAYlnt/ay+11ShkdFKNAG7LsyK/tmNBVvVOwrfMgdI= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= @@ -72,8 +85,14 @@ github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9G github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus= +github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= +github.com/onsi/ginkgo/v2 v2.14.0 h1:vSmGj2Z5YPb9JwCWT6z6ihcUvDhuXLc3sJiqd3jMKAY= +github.com/onsi/ginkgo/v2 v2.14.0/go.mod h1:JkUdW7JkN0V6rFvsHcJ478egV3XH9NxpD27Hal/PhZw= github.com/onsi/ginkgo/v2 v2.19.0 h1:9Cnnf7UHo57Hy3k6/m5k3dRfGTMXGvxhHFvkDTCTpvA= github.com/onsi/ginkgo/v2 v2.19.0/go.mod h1:rlwLi9PilAFJ8jCg9UE1QP6VBpd6/xj3SRC0d6TU0To= +github.com/onsi/gomega v1.30.0 h1:hvMK7xYz4D3HapigLTeGdId/NcfQx1VHMJc60ew99+8= +github.com/onsi/gomega v1.30.0/go.mod h1:9sxs+SwGrKI0+PWe4Fxa9tFQQBG5xSsSbMXOI8PPpoQ= github.com/onsi/gomega v1.33.1 h1:dsYjIxxSR755MDmKVsaFQTE22ChNBcuuTWgkUDSubOk= github.com/onsi/gomega v1.33.1/go.mod h1:U4R44UsT+9eLIaYRB2a5qajjtQYn0hauxvRm16AVYg0= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= @@ -186,100 +205,6 @@ k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 h1:pUdcCO1Lk/tbT5ztQWOBi5HBgbBP1 k8s.io/utils v0.0.0-20240711033017-18e509b52bc8/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= sigs.k8s.io/controller-runtime v0.19.1 h1:Son+Q40+Be3QWb+niBXAg2vFiYWolDjjRfO8hn/cxOk= sigs.k8s.io/controller-runtime v0.19.1/go.mod h1:iRmWllt8IlaLjvTTDLhRBXIEtkCK6hwVBJJsYS9Ajf4= -github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= -github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= -github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/evanphx/json-patch v0.5.2 h1:xVCHIVMUu1wtM/VkR9jVZ45N3FhZfYMMYGorLCR8P3k= -github.com/evanphx/json-patch v0.5.2/go.mod h1:ZWS5hhDbVDyob71nXKNL0+PWn6ToqBHMikGIFbs31qQ= -github.com/evanphx/json-patch/v5 v5.9.0 h1:kcBlZQbplgElYIlo/n1hJbls2z/1awpXxpRi0/FOJfg= -github.com/evanphx/json-patch/v5 v5.9.0/go.mod h1:VNkHZ/282BpEyt/tObQO8s5CMPmYYq14uClGH4abBuQ= -github.com/fxamacker/cbor/v2 v2.7.0 h1:iM5WgngdRBanHcxugY4JySA0nk1wZorNOpTgCMedv5E= -github.com/fxamacker/cbor/v2 v2.7.0/go.mod h1:pxXPTn3joSm21Gbwsv0w9OSA2y1HFR9qXEeXQVeNoDQ= -github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= -github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= -github.com/go-openapi/swag v0.22.4 h1:QLMzNJnMGPRNDCbySlcj1x01tzU8/9LTTL9hZZZogBU= -github.com/go-openapi/swag v0.22.4/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= -github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= -github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= -github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= -github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= -github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 h1:K6RDEckDVWvDI9JAJYCmNdQXq6neHJOYx3V6jnqNEec= -github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= -github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= -github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= -github.com/gorilla/websocket v1.5.0 h1:PPwGk2jz7EePpoHN/+ClbZu8SPxiqlu12wZP/3sWmnc= -github.com/gorilla/websocket v1.5.0/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= -github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= -github.com/google/pprof v0.0.0-20240525223248-4bfdf5a9a2af h1:kmjWCqn2qkEml422C2Rrd27c3VGxi6a/6HNq8QmHRKM= -github.com/google/pprof v0.0.0-20240525223248-4bfdf5a9a2af/go.mod h1:K1liHPHnj73Fdn/EKuT8nrFqBihUSKXoLYU0BuatOYo= -github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= -github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0 h1:jWpvCLoY8Z/e3VKvlsiIGKtc+UG6U5vzxaoagmhXfyg= -github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0/go.mod h1:QUyp042oQthUoa9bqDv0ER0wrtXnBruoNd7aNjkbP+k= -github.com/moby/spdystream v0.2.0 h1:cjW1zVyyoiM0T7b6UoySUFqzXMoqRckQtXwGPiBhOM8= -github.com/moby/spdystream v0.2.0/go.mod h1:f7i0iNDQJ059oMTcWxx8MA/zKFIuD/lY+0GqbN2Wy8c= -github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus= -github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= -github.com/onsi/ginkgo/v2 v2.14.0 h1:vSmGj2Z5YPb9JwCWT6z6ihcUvDhuXLc3sJiqd3jMKAY= -github.com/onsi/ginkgo/v2 v2.14.0/go.mod h1:JkUdW7JkN0V6rFvsHcJ478egV3XH9NxpD27Hal/PhZw= -github.com/onsi/gomega v1.30.0 h1:hvMK7xYz4D3HapigLTeGdId/NcfQx1VHMJc60ew99+8= -github.com/onsi/gomega v1.30.0/go.mod h1:9sxs+SwGrKI0+PWe4Fxa9tFQQBG5xSsSbMXOI8PPpoQ= -github.com/onsi/ginkgo/v2 v2.19.0 h1:9Cnnf7UHo57Hy3k6/m5k3dRfGTMXGvxhHFvkDTCTpvA= -github.com/onsi/ginkgo/v2 v2.19.0/go.mod h1:rlwLi9PilAFJ8jCg9UE1QP6VBpd6/xj3SRC0d6TU0To= -github.com/onsi/gomega v1.33.1 h1:dsYjIxxSR755MDmKVsaFQTE22ChNBcuuTWgkUDSubOk= -github.com/onsi/gomega v1.33.1/go.mod h1:U4R44UsT+9eLIaYRB2a5qajjtQYn0hauxvRm16AVYg0= -github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= -github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/prometheus/client_golang v1.19.1 h1:wZWJDwK+NameRJuPGDhlnFgx8e8HN3XHQeLaYJFJBOE= -github.com/prometheus/client_golang v1.19.1/go.mod h1:mP78NwGzrVks5S2H6ab8+ZZGJLZUq1hoULYBAYBw1Ho= -github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= -github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= -github.com/prometheus/common v0.55.0 h1:KEi6DK7lXW/m7Ig5i47x0vRzuBsHuvJdi5ee6Y3G1dc= -github.com/prometheus/common v0.55.0/go.mod h1:2SECS4xJG1kd8XF9IcM1gMX6510RAEL65zxzNImwdc8= -github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= -github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= -github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= -github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= -github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= -github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= -github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= -github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= -golang.org/x/exp v0.0.0-20230515195305-f3d0a9c9a5cc h1:mCRnTeVUjcrhlRmO0VK8a6k6Rrf6TF9htwo2pJVSjIU= -golang.org/x/exp v0.0.0-20230515195305-f3d0a9c9a5cc/go.mod h1:V1LtkGg67GoY2N1AnLN78QLrzxkLyJw7RJb1gzOOz9w= -golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ= -golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE= -golang.org/x/oauth2 v0.21.0 h1:tsimM75w1tF/uws5rbeHzIWxEqElMehnc+iW793zsZs= -golang.org/x/oauth2 v0.21.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= -golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws= -golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.21.0 h1:WVXCp+/EBEHOj53Rvu+7KiT/iElMrO8ACK16SMZ3jaA= -golang.org/x/term v0.21.0/go.mod h1:ooXLefLobQVslOqselCNF4SxFAaoS6KujMbsGzSDmX0= -golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4= -golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= -golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d h1:vU5i/LfpvrRCpgM/VPfJLg5KjxD3E+hfT1SH+d9zLwg= -golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= -google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg= -google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw= -gopkg.in/evanphx/json-patch.v4 v4.12.0 h1:n6jtcsulIzXPJaxegRbvFNNrZDjbij7ny3gmSPG+6V4= -gopkg.in/evanphx/json-patch.v4 v4.12.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= -k8s.io/api v0.31.0 h1:b9LiSjR2ym/SzTOlfMHm1tr7/21aD7fSkqgD/CVJBCo= -k8s.io/api v0.31.0/go.mod h1:0YiFF+JfFxMM6+1hQei8FY8M7s1Mth+z/q7eF1aJkTE= -k8s.io/apiextensions-apiserver v0.31.0 h1:fZgCVhGwsclj3qCw1buVXCV6khjRzKC5eCFt24kyLSk= -k8s.io/apiextensions-apiserver v0.31.0/go.mod h1:b9aMDEYaEe5sdK+1T0KU78ApR/5ZVp4i56VacZYEHxk= -k8s.io/apimachinery v0.31.0 h1:m9jOiSr3FoSSL5WO9bjm1n6B9KROYYgNZOb4tyZ1lBc= -k8s.io/apimachinery v0.31.0/go.mod h1:rsPdaZJfTfLsNJSQzNHQvYoTmxhoOEofxtOsF3rtsMo= -k8s.io/client-go v0.31.0 h1:QqEJzNjbN2Yv1H79SsS+SWnXkBgVu4Pj3CJQgbx0gI8= -k8s.io/client-go v0.31.0/go.mod h1:Y9wvC76g4fLjmU0BA+rV+h2cncoadjvjjkkIGoTLcGU= -k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= -k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= -k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 h1:BZqlfIlq5YbRMFko6/PM7FjZpUb45WallggurYhKGag= -k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340/go.mod h1:yD4MZYeKMBwQKVht279WycxKyM84kkAx2DPrTXaeb98= -k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 h1:pUdcCO1Lk/tbT5ztQWOBi5HBgbBP1J8+AsQnQCKsi8A= -k8s.io/utils v0.0.0-20240711033017-18e509b52bc8/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= -sigs.k8s.io/controller-runtime v0.19.1 h1:Son+Q40+Be3QWb+niBXAg2vFiYWolDjjRfO8hn/cxOk= -sigs.k8s.io/controller-runtime v0.19.1/go.mod h1:iRmWllt8IlaLjvTTDLhRBXIEtkCK6hwVBJJsYS9Ajf4= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= sigs.k8s.io/structured-merge-diff/v4 v4.4.1 h1:150L+0vs/8DA78h1u02ooW1/fFq/Lwr+sGiqlzvrtq4= From 838baa5348a1b2fc03a51670ed1150b02f897c3d Mon Sep 17 00:00:00 2001 From: Adem Baccara <71262172+Adembc@users.noreply.github.com> Date: Mon, 10 Feb 2025 22:38:35 +0100 Subject: [PATCH 12/13] fix lint problems Signed-off-by: Adem Baccara <71262172+Adembc@users.noreply.github.com> --- .../controller/api/v1beta1/workspace_types.go | 26 ++++++------ .../api/v1beta1/workspacekind_types.go | 14 +++---- workspaces/controller/cmd/main.go | 5 ++- workspaces/controller/go.mod | 1 - workspaces/controller/go.sum | 17 ++------ .../internal/controller/culling_controller.go | 40 ++++++++++--------- .../controller/culling_controller_test.go | 16 ++++++++ 7 files changed, 63 insertions(+), 56 deletions(-) diff --git a/workspaces/controller/api/v1beta1/workspace_types.go b/workspaces/controller/api/v1beta1/workspace_types.go index ccd6560d1..5d8a0a74c 100644 --- a/workspaces/controller/api/v1beta1/workspace_types.go +++ b/workspaces/controller/api/v1beta1/workspace_types.go @@ -38,8 +38,8 @@ type WorkspaceSpec struct { // DisableCulling controls whether automatic culling is disabled for the workspace. // If true, the workspace will not be culled - //+kubebuilder:validation:Optional - //+kubebuilder:default=false + // +kubebuilder:validation:Optional + // +kubebuilder:default=false DisableCulling *bool `json:"disableCulling,omitempty"` // if true, pending updates are NOT applied when the Workspace is paused @@ -233,24 +233,24 @@ type WorkspacePodOptionRedirectStep struct { type ProbeStatus struct { // the time the probe was started (UNIX epoch in milliseconds) - //+kubebuilder:validation:Minimum=0 - //+kubebuilder:example=1710435303000 + // +kubebuilder:validation:Minimum=0 + // +kubebuilder:example=1710435303000 StartTimeMs int64 `json:"startTimeMs"` // the time the probe was completed (UNIX epoch in milliseconds) - //+kubebuilder:validation:Minimum=0 - //+kubebuilder:example=1710435305000 + // +kubebuilder:validation:Minimum=0 + // +kubebuilder:example=1710435305000 EndTimeMs int64 `json:"endTimeMs"` // the result of the probe // ENUM: "Success" | "Failure" | "Timeout" | "" - //+kubebuilder:default="" + // +kubebuilder:default="" Result ProbeResult `json:"result"` // a human-readable message about the probe result // WARNING: this field is NOT FOR MACHINE USE, subject to change without notice - //+kubebuilder:default="" - //+kubebuilder:example="Jupyter probe succeeded" + // +kubebuilder:default="" + // +kubebuilder:example="Jupyter probe succeeded" Message string `json:"message"` } @@ -282,10 +282,10 @@ const ( =============================================================================== */ -//+kubebuilder:object:root=true -//+kubebuilder:printcolumn:name="State",type="string",JSONPath=".status.state",description="The current state of the Workspace" -//+kubebuilder:subresource:status -//+kubebuilder:resource:shortName=ws +// +kubebuilder:object:root=true +// +kubebuilder:printcolumn:name="State",type="string",JSONPath=".status.state",description="The current state of the Workspace" +// +kubebuilder:subresource:status +// +kubebuilder:resource:shortName=ws // Workspace is the Schema for the Workspaces API type Workspace struct { diff --git a/workspaces/controller/api/v1beta1/workspacekind_types.go b/workspaces/controller/api/v1beta1/workspacekind_types.go index bdc061dd7..c90904bba 100644 --- a/workspaces/controller/api/v1beta1/workspacekind_types.go +++ b/workspaces/controller/api/v1beta1/workspacekind_types.go @@ -185,15 +185,15 @@ type WorkspaceKindCullingConfig struct { MaxInactiveSeconds *int32 `json:"maxInactiveSeconds,omitempty"` // the maximum number of seconds between probes - //+kubebuilder:validation:Optional - //+kubebuilder:validation:Minimum:=60 - //+kubebuilder:default=300 + // +kubebuilder:validation:Optional + // +kubebuilder:validation:Minimum:=60 + // +kubebuilder:default=300 MaxProbeIntervalSeconds *int32 `json:"maxProbeIntervalSeconds,omitempty"` // the minimum number of seconds between probes to avoid spamming in case on failure - //+kubebuilder:validation:Optional - //+kubebuilder:validation:Minimum:=10 - //+kubebuilder:default=20 + // +kubebuilder:validation:Optional + // +kubebuilder:validation:Minimum:=10 + // +kubebuilder:default=20 MinProbeIntervalSeconds *int32 `json:"minProbeIntervalSeconds,omitempty"` // the probe used to determine if the Workspace is active @@ -219,7 +219,7 @@ type ActivityProbe struct { type ActivityProbeExec struct { // the script should write a JSON file at this path. // any existing file in this path will be REMOVED before the script is run - //+kubebuilder:example="/tmp/activity_probe.json" + // +kubebuilder:example="/tmp/activity_probe.json" OutputPath string `json:"outputPath"` // the number of seconds to wait for the script to complete diff --git a/workspaces/controller/cmd/main.go b/workspaces/controller/cmd/main.go index b27fe6685..1ea48eaad 100644 --- a/workspaces/controller/cmd/main.go +++ b/workspaces/controller/cmd/main.go @@ -19,9 +19,10 @@ package main import ( "crypto/tls" "flag" - "k8s.io/client-go/kubernetes" "os" + "k8s.io/client-go/kubernetes" + // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) // to ensure that exec-entrypoint and run can make use of them. _ "k8s.io/client-go/plugin/pkg/client/auth" @@ -163,7 +164,7 @@ func main() { setupLog.Error(err, "unable to create controller", "controller", "Culler") os.Exit(1) } - //+kubebuilder:scaffold:builder + // +kubebuilder:scaffold:builder if os.Getenv("ENABLE_WEBHOOKS") != "false" { if err = (&webhookInternal.WorkspaceValidator{ diff --git a/workspaces/controller/go.mod b/workspaces/controller/go.mod index a464fd98b..8f2b6c3f5 100644 --- a/workspaces/controller/go.mod +++ b/workspaces/controller/go.mod @@ -39,7 +39,6 @@ require ( github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/mailru/easyjson v0.7.7 // indirect - github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0 // indirect github.com/moby/spdystream v0.4.0 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect diff --git a/workspaces/controller/go.sum b/workspaces/controller/go.sum index 70acc4f64..6d9b94f87 100644 --- a/workspaces/controller/go.sum +++ b/workspaces/controller/go.sum @@ -1,3 +1,5 @@ +github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= +github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= @@ -44,18 +46,12 @@ github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeN github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 h1:K6RDEckDVWvDI9JAJYCmNdQXq6neHJOYx3V6jnqNEec= -github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= github.com/google/pprof v0.0.0-20240525223248-4bfdf5a9a2af h1:kmjWCqn2qkEml422C2Rrd27c3VGxi6a/6HNq8QmHRKM= github.com/google/pprof v0.0.0-20240525223248-4bfdf5a9a2af/go.mod h1:K1liHPHnj73Fdn/EKuT8nrFqBihUSKXoLYU0BuatOYo= -github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= -github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= github.com/gorilla/websocket v1.5.0 h1:PPwGk2jz7EePpoHN/+ClbZu8SPxiqlu12wZP/3sWmnc= github.com/gorilla/websocket v1.5.0/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= -github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= github.com/imdario/mergo v0.3.6 h1:xTNEAn+kxVO7dTZGu0CegyqKZmoWFI0rF8UxjlB2d28= github.com/imdario/mergo v0.3.6/go.mod h1:2EnlNZ0deacrJVfApfmtdGgDfMuh/nq6Ok1EcJh5FfA= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= @@ -73,10 +69,7 @@ github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= -github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0 h1:jWpvCLoY8Z/e3VKvlsiIGKtc+UG6U5vzxaoagmhXfyg= -github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0/go.mod h1:QUyp042oQthUoa9bqDv0ER0wrtXnBruoNd7aNjkbP+k= -github.com/moby/spdystream v0.2.0 h1:cjW1zVyyoiM0T7b6UoySUFqzXMoqRckQtXwGPiBhOM8= -github.com/moby/spdystream v0.2.0/go.mod h1:f7i0iNDQJ059oMTcWxx8MA/zKFIuD/lY+0GqbN2Wy8c= +github.com/moby/spdystream v0.4.0 h1:Vy79D6mHeJJjiPdFEL2yku1kl0chZpJfZcPpb16BRl8= github.com/moby/spdystream v0.4.0/go.mod h1:xBAYlnt/ay+11ShkdFKNAG7LsyK/tmNBVvVOwrfMgdI= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= @@ -87,12 +80,8 @@ github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= -github.com/onsi/ginkgo/v2 v2.14.0 h1:vSmGj2Z5YPb9JwCWT6z6ihcUvDhuXLc3sJiqd3jMKAY= -github.com/onsi/ginkgo/v2 v2.14.0/go.mod h1:JkUdW7JkN0V6rFvsHcJ478egV3XH9NxpD27Hal/PhZw= github.com/onsi/ginkgo/v2 v2.19.0 h1:9Cnnf7UHo57Hy3k6/m5k3dRfGTMXGvxhHFvkDTCTpvA= github.com/onsi/ginkgo/v2 v2.19.0/go.mod h1:rlwLi9PilAFJ8jCg9UE1QP6VBpd6/xj3SRC0d6TU0To= -github.com/onsi/gomega v1.30.0 h1:hvMK7xYz4D3HapigLTeGdId/NcfQx1VHMJc60ew99+8= -github.com/onsi/gomega v1.30.0/go.mod h1:9sxs+SwGrKI0+PWe4Fxa9tFQQBG5xSsSbMXOI8PPpoQ= github.com/onsi/gomega v1.33.1 h1:dsYjIxxSR755MDmKVsaFQTE22ChNBcuuTWgkUDSubOk= github.com/onsi/gomega v1.33.1/go.mod h1:U4R44UsT+9eLIaYRB2a5qajjtQYn0hauxvRm16AVYg0= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= diff --git a/workspaces/controller/internal/controller/culling_controller.go b/workspaces/controller/internal/controller/culling_controller.go index b4fe20a9a..bc56501e1 100644 --- a/workspaces/controller/internal/controller/culling_controller.go +++ b/workspaces/controller/internal/controller/culling_controller.go @@ -22,9 +22,15 @@ import ( "encoding/json" "errors" "fmt" + "net" + "net/http" + "net/url" + "os" + "strconv" + "strings" + "time" + "github.com/go-logr/logr" - kubefloworgv1beta1 "github.com/kubeflow/notebooks/workspaces/controller/api/v1beta1" - "github.com/kubeflow/notebooks/workspaces/controller/internal/helper" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" @@ -36,16 +42,12 @@ import ( "k8s.io/client-go/rest" "k8s.io/client-go/tools/remotecommand" "k8s.io/utils/ptr" - "net" - "net/http" - "net/url" - "os" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" - "strconv" - "strings" - "time" + + kubefloworgv1beta1 "github.com/kubeflow/notebooks/workspaces/controller/api/v1beta1" + "github.com/kubeflow/notebooks/workspaces/controller/internal/helper" ) const ( @@ -69,7 +71,7 @@ type ActivityProbe struct { // +kubebuilder:rbac:groups="core",resources=pods/exec,verbs=create -func (r *CullingReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { // nolint:gocyclo +func (r *CullingReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { //nolint:gocyclo log := log.FromContext(ctx) log.V(2).Info("reconciling Workspace for culling") // fetch the Workspace @@ -176,7 +178,7 @@ func (r *CullingReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct Message: "Failed to fetch service name for workspace", }, nil, nil) } - port, err := r.getWorkspacePort(ctx, workspace, workspaceKind) + port, err := r.getWorkspacePort(workspace, workspaceKind) if err != nil { log.Error(err, "Error fetching port for workspace") return r.updateWorkspaceActivityStatus(ctx, log, workspace, &minRequeueAfter, &kubefloworgv1beta1.ProbeStatus{ @@ -261,7 +263,7 @@ func (r *CullingReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct lastActivity := time.Now().Unix() if activityProbe.HasActivity != nil && !*activityProbe.HasActivity { log.V(2).Info("Culling the workspace due to inactivity") - //TODO: figure out how to set the last activity time + // TODO: figure out how to set the last activity time lastActivity = time.Now().Unix() workspace.Spec.Paused = ptr.To(true) err := r.Update(ctx, workspace) @@ -324,7 +326,7 @@ func (r *CullingReconciler) SetupWithManager(mgr ctrl.Manager) error { } // updateWorkspaceActivityStatus attempts to immediately update the Workspace activity status with the provided status. -func (r *CullingReconciler) updateWorkspaceActivityStatus(ctx context.Context, log logr.Logger, workspace *kubefloworgv1beta1.Workspace, requeueAfter *time.Duration, probeStatus *kubefloworgv1beta1.ProbeStatus, lastUpdate, lastActivity *int64) (ctrl.Result, error) { // nolint:unparam +func (r *CullingReconciler) updateWorkspaceActivityStatus(ctx context.Context, log logr.Logger, workspace *kubefloworgv1beta1.Workspace, requeueAfter *time.Duration, probeStatus *kubefloworgv1beta1.ProbeStatus, lastUpdate, lastActivity *int64) (ctrl.Result, error) { if workspace == nil { return ctrl.Result{}, fmt.Errorf("provided Workspace was nil") } @@ -381,7 +383,7 @@ func (r *CullingReconciler) getServiceName(ctx context.Context, workspace *kubef return ownedServices.Items[0].Name, nil } -func (r *CullingReconciler) getWorkspacePort(ctx context.Context, workspace *kubefloworgv1beta1.Workspace, workspaceKind *kubefloworgv1beta1.WorkspaceKind) (int32, error) { +func (r *CullingReconciler) getWorkspacePort(workspace *kubefloworgv1beta1.Workspace, workspaceKind *kubefloworgv1beta1.WorkspaceKind) (int32, error) { for _, imageConfigValue := range workspaceKind.Spec.PodTemplate.Options.ImageConfig.Values { if imageConfigValue.Id == workspace.Spec.PodTemplate.Options.ImageConfig { for _, port := range imageConfigValue.Spec.Ports { @@ -449,7 +451,7 @@ func (r *CullingReconciler) execCommand(ctx context.Context, podName, podNamespa executor, err := createExecutor(req.URL(), r.Config) if err != nil { - return "", "", fmt.Errorf("error creating executor: %v", err) + return "", "", fmt.Errorf("error creating executor: %w", err) } var stdout, stderr bytes.Buffer @@ -492,7 +494,7 @@ func fetchLastActivityFromJupyterAPI(apiEndpoint string) (*time.Time, error, str LastActivity string `json:"last_activity"` } - defer resp.Body.Close() + defer resp.Body.Close() //nolint:errcheck if err := json.NewDecoder(resp.Body).Decode(&status); err != nil { return nil, fmt.Errorf("failed to parse JupyterLab API response: %w", err), "Jupyter probe failed: invalid response body", kubefloworgv1beta1.ProbeResultFailure @@ -509,13 +511,13 @@ func fetchLastActivityFromJupyterAPI(apiEndpoint string) (*time.Time, error, str } // createExecutor creates a new Executor for the given URL and REST config. -func createExecutor(url *url.URL, config *rest.Config) (remotecommand.Executor, error) { - exec, err := remotecommand.NewSPDYExecutor(config, "POST", url) +func createExecutor(requestUrl *url.URL, config *rest.Config) (remotecommand.Executor, error) { + exec, err := remotecommand.NewSPDYExecutor(config, "POST", requestUrl) if err != nil { return nil, err } // WebSocketExecutor must be "GET" method as described in RFC 6455 Sec. 4.1 (page 17). - websocketExec, err := remotecommand.NewWebSocketExecutor(config, "GET", url.String()) + websocketExec, err := remotecommand.NewWebSocketExecutor(config, "GET", requestUrl.String()) if err != nil { return nil, err } diff --git a/workspaces/controller/internal/controller/culling_controller_test.go b/workspaces/controller/internal/controller/culling_controller_test.go index b0b429f89..2f16a99ef 100644 --- a/workspaces/controller/internal/controller/culling_controller_test.go +++ b/workspaces/controller/internal/controller/culling_controller_test.go @@ -1 +1,17 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package controller From ce8ed927cd42bfa3677eba1a662ab02683e04ad9 Mon Sep 17 00:00:00 2001 From: Adem Baccara <71262172+Adembc@users.noreply.github.com> Date: Tue, 11 Feb 2025 19:17:20 +0100 Subject: [PATCH 13/13] fix tests Signed-off-by: Adem Baccara <71262172+Adembc@users.noreply.github.com> --- workspaces/controller/cmd/main.go | 1 - workspaces/controller/config/manager/kustomization.yaml | 1 + workspaces/controller/internal/controller/culling_controller.go | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/workspaces/controller/cmd/main.go b/workspaces/controller/cmd/main.go index 1ea48eaad..9a14355ec 100644 --- a/workspaces/controller/cmd/main.go +++ b/workspaces/controller/cmd/main.go @@ -154,7 +154,6 @@ func main() { setupLog.Error(err, "unable to create clientset") os.Exit(1) } - // +kubebuilder:scaffold:builder if err = (&controllerInternal.CullingReconciler{ Client: mgr.GetClient(), Scheme: mgr.GetScheme(), diff --git a/workspaces/controller/config/manager/kustomization.yaml b/workspaces/controller/config/manager/kustomization.yaml index ad2284487..2c5fb1ff7 100644 --- a/workspaces/controller/config/manager/kustomization.yaml +++ b/workspaces/controller/config/manager/kustomization.yaml @@ -5,3 +5,4 @@ resources: images: - name: controller newName: ghcr.io/kubeflow/notebooks/workspace-controller + newTag: latest diff --git a/workspaces/controller/internal/controller/culling_controller.go b/workspaces/controller/internal/controller/culling_controller.go index bc56501e1..f30303d26 100644 --- a/workspaces/controller/internal/controller/culling_controller.go +++ b/workspaces/controller/internal/controller/culling_controller.go @@ -321,7 +321,7 @@ func (r *CullingReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct func (r *CullingReconciler) SetupWithManager(mgr ctrl.Manager) error { return ctrl.NewControllerManagedBy(mgr). - For(&kubefloworgv1beta1.Workspace{}). + For(&kubefloworgv1beta1.Workspace{}).Named("culling_controller"). Complete(r) }