diff --git a/.gitignore b/.gitignore
index ada68ff..35608c9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,3 +25,5 @@ go.work
 *.swp
 *.swo
 *~
+
+.DS_Store
diff --git a/PROJECT b/PROJECT
index a2fddab..25e53e4 100644
--- a/PROJECT
+++ b/PROJECT
@@ -63,4 +63,12 @@ resources:
   kind: GPUNodeClass
   path: github.com/NexusGPU/tensor-fusion-operator/api/v1
   version: v1
+- api:
+    crdVersion: v1
+    namespaced: true
+  controller: true
+  domain: tensor-fusion.ai
+  kind: SchedulingConfigTemplate
+  path: github.com/NexusGPU/tensor-fusion-operator/api/v1
+  version: v1
 version: "3"
diff --git a/README.md b/README.md
index f2daa2f..d2d55c7 100644
--- a/README.md
+++ b/README.md
@@ -12,6 +12,12 @@
 - kubectl version v1.11.3+.
 - Access to a Kubernetes v1.11.3+ cluster.
 
+### Add new API
+
+```bash
+kubebuilder create api --group "" --version v1 --kind SchedulingConfigTemplate --namespaced false
+```
+
 ### To Deploy on the cluster
 **Build and push your image to the location specified by `IMG`:**
 
@@ -112,3 +118,4 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 
+
diff --git a/api/v1/base_types.go b/api/v1/base_types.go
new file mode 100644
index 0000000..b7cf1d6
--- /dev/null
+++ b/api/v1/base_types.go
@@ -0,0 +1,6 @@
+package v1
+
+type NameNamespace struct {
+    Name      string `json:"name,omitempty"`
+    Namespace string `json:"namespace,omitempty"`
+}
diff --git a/api/v1/gpunode_types.go b/api/v1/gpunode_types.go
index 1220b08..8312e03 100644
--- a/api/v1/gpunode_types.go
+++ b/api/v1/gpunode_types.go
@@ -25,21 +25,65 @@ import (
 
 // GPUNodeSpec defines the desired state of GPUNode.
 type GPUNodeSpec struct {
-    // INSERT ADDITIONAL SPEC FIELDS - desired state of cluster
-    // Important: Run "make" to regenerate code after modifying this file
+    ManageMode GPUNodeManageMode `json:"manageMode,omitempty"`
 
-    // Foo is an example field of GPUNode. Edit gpunode_types.go to remove/update
-    Foo string `json:"foo,omitempty"`
+    // if not all GPU cards should be used, specify the GPU card indices, default to empty,
+    // onboard all GPU cards to the pool
+    GPUCardIndices []int `json:"gpuCardIndices,omitempty"`
 }
 
+type GPUNodeManageMode string
+
+const (
+    GPUNodeManageModeManual      GPUNodeManageMode = "manual"
+    GPUNodeManageModeSelected    GPUNodeManageMode = "selected"
+    GPUNodeManageModeProvisioned GPUNodeManageMode = "provisioned"
+)
+
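For orientation, a minimal sketch of how the new `GPUNodeSpec` fields might be used from Go; the object name, card indices, and the "selected" mode constant follow the definitions above and are purely illustrative:

```go
package main

import (
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
)

func main() {
	// A GPUNode that onboards only the first two GPU cards of an existing
	// Kubernetes node; the name and indices are hypothetical.
	node := tfv1.GPUNode{
		ObjectMeta: metav1.ObjectMeta{Name: "gpu-node-a"},
		Spec: tfv1.GPUNodeSpec{
			ManageMode:     tfv1.GPUNodeManageModeSelected,
			GPUCardIndices: []int{0, 1},
		},
	}
	fmt.Printf("%s manages GPUs %v in %q mode\n", node.Name, node.Spec.GPUCardIndices, node.Spec.ManageMode)
}
```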
 // GPUNodeStatus defines the observed state of GPUNode.
 type GPUNodeStatus struct {
-    // INSERT ADDITIONAL STATUS FIELD - define observed state of cluster
-    // Important: Run "make" to regenerate code after modifying this file
+    Phase TensorFusionClusterPhase `json:"phase,omitempty"`
+
+    Conditions []metav1.Condition `json:"conditions,omitempty"`
+
+    TotalTFlops int32  `json:"totalTFlops,omitempty"`
+    TotalVRAM   string `json:"totalVRAM,omitempty"`
+
+    AvailableTFlops int32  `json:"availableTFlops,omitempty"`
+    AvailableVRAM   string `json:"availableVRAM,omitempty"`
+
+    HypervisorStatus NodeHypervisorStatus `json:"hypervisorStatus,omitempty"`
+
+    NodeInfo GPUNodeInfo `json:"nodeInfo,omitempty"`
+
+    LoadedModels []string `json:"loadedModels,omitempty"`
+
+    TotalGPUs             int32    `json:"totalGPUs,omitempty"`
+    ManagedGPUs           int32    `json:"managedGPUs,omitempty"`
+    ManagedGPUResourceIDs []string `json:"managedGPUResourceIDs,omitempty"`
+}
+
+type GPUNodeInfo struct {
+    Hostname         string `json:"hostname,omitempty"`
+    IP               string `json:"ip,omitempty"`
+    KernelVersion    string `json:"kernelVersion,omitempty"`
+    OSImage          string `json:"osImage,omitempty"`
+    GPUDriverVersion string `json:"gpuDriverVersion,omitempty"`
+    GPUModel         string `json:"gpuModel,omitempty"`
+    GPUCount         int32  `json:"gpuCount,omitempty"`
+    OperatingSystem  string `json:"operatingSystem,omitempty"`
+    Architecture     string `json:"architecture,omitempty"`
+}
+
+type NodeHypervisorStatus struct {
+    HypervisorState   string      `json:"hypervisorState,omitempty"`
+    HypervisorVersion string      `json:"hypervisorVersion,omitempty"`
+    LastHeartbeatTime metav1.Time `json:"lastHeartbeatTime,omitempty"`
 }
 
 // +kubebuilder:object:root=true
 // +kubebuilder:subresource:status
+// +kubebuilder:resource:scope=Cluster
 
 // GPUNode is the Schema for the gpunodes API.
 type GPUNode struct {
diff --git a/api/v1/gpunodeclass_types.go b/api/v1/gpunodeclass_types.go
index ea42334..2aa80bd 100644
--- a/api/v1/gpunodeclass_types.go
+++ b/api/v1/gpunodeclass_types.go
@@ -25,21 +25,60 @@ import (
 
 // GPUNodeClassSpec defines the desired state of GPUNodeClass.
 type GPUNodeClassSpec struct {
-    // INSERT ADDITIONAL SPEC FIELDS - desired state of cluster
-    // Important: Run "make" to regenerate code after modifying this file
+    OSImageFamily string `json:"osImageFamily,omitempty"` // The AMI family to use
 
-    // Foo is an example field of GPUNodeClass.
Edit gpunodeclass_types.go to remove/update - Foo string `json:"foo,omitempty"` + OSImageSelectorTerms []NodeClassOSImageSelectorTerms `json:"osImageSelectorTerms,omitempty"` + + BlockDeviceMappings []NodeClassBlockDeviceMappings `json:"blockDeviceMappings,omitempty"` // Block device mappings for the instance + + InstanceProfile string `json:"instanceProfile,omitempty"` // The instance profile to use + + MetadataOptions NodeClassMetadataOptions `json:"metadataOptions,omitempty"` + + SecurityGroupSelectorTerms []NodeClassItemIDSelectorTerms `json:"securityGroupSelectorTerms,omitempty"` + + SubnetSelectorTerms []NodeClassItemIDSelectorTerms `json:"subnetSelectorTerms,omitempty"` // Terms to select subnets + + Tags map[string]string `json:"tags,omitempty"` // Tags associated with the resource + + UserData string `json:"userData,omitempty"` // User data script for the instance +} + +type NodeClassItemIDSelectorTerms struct { + ID string `json:"id,omitempty"` // The ID of the security group +} + +type NodeClassMetadataOptions struct { + HttpEndpoint string `json:"httpEndpoint,omitempty"` // Whether the HTTP metadata endpoint is enabled + HttpProtocolIPv6 string `json:"httpProtocolIPv6,omitempty"` // Whether IPv6 is enabled for the HTTP metadata endpoint + HttpPutResponseHopLimit int `json:"httpPutResponseHopLimit,omitempty"` // The hop limit for HTTP PUT responses + HttpTokens string `json:"httpTokens,omitempty"` // The HTTP tokens required for metadata access +} + +type NodeClassOSImageSelectorTerms struct { + Name string `json:"name,omitempty"` + Owner string `json:"owner,omitempty"` +} + +type NodeClassBlockDeviceMappings struct { + DeviceName string `json:"deviceName,omitempty"` // The device name for the block device + Ebs NodeClassEbsSettings `json:"ebs,omitempty"` +} + +type NodeClassEbsSettings struct { + DeleteOnTermination bool `json:"deleteOnTermination,omitempty"` // Whether to delete the EBS volume on termination + Encrypted bool `json:"encrypted,omitempty"` // Whether the EBS volume is encrypted + VolumeSize string `json:"volumeSize,omitempty"` // The size of the EBS volume + VolumeType string `json:"volumeType,omitempty"` // The type of the EBS volume } // GPUNodeClassStatus defines the observed state of GPUNodeClass. type GPUNodeClassStatus struct { - // INSERT ADDITIONAL STATUS FIELD - define observed state of cluster - // Important: Run "make" to regenerate code after modifying this file } // +kubebuilder:object:root=true // +kubebuilder:subresource:status +// +kubebuilder:resource:scope=Cluster // GPUNodeClass is the Schema for the gpunodeclasses API. type GPUNodeClass struct { diff --git a/api/v1/gpupool_types.go b/api/v1/gpupool_types.go index c83b542..bdf7229 100644 --- a/api/v1/gpupool_types.go +++ b/api/v1/gpupool_types.go @@ -18,6 +18,7 @@ package v1 import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" ) // EDIT THIS FILE! THIS IS SCAFFOLDING FOR YOU TO OWN! @@ -25,21 +26,213 @@ import ( // GPUPoolSpec defines the desired state of GPUPool. 
 type GPUPoolSpec struct {
-    // INSERT ADDITIONAL SPEC FIELDS - desired state of cluster
-    // Important: Run "make" to regenerate code after modifying this file
+    CapacityConfig           CapacityConfig               `json:"capacityConfig,omitempty"`
+    NodeManagerConfig        NodeManagerConfig            `json:"nodeManagerConfig,omitempty"`
+    ObservabilityConfig      ObservabilityConfig          `json:"observabilityConfig,omitempty"`
+    QosConfig                QosConfig                    `json:"qosConfig,omitempty"`
+    ComponentConfig          ComponentConfig              `json:"componentConfig,omitempty"`
+    SchedulingConfig         SchedulingConfigTemplateSpec `json:"schedulingConfig,omitempty"`
+    SchedulingConfigTemplate string                       `json:"schedulingConfigTemplate,omitempty"`
+}
+
+type CapacityConfig struct {
+    MinResources     GPUResourceUnit  `json:"minResources,omitempty"`
+    MaxResources     GPUResourceUnit  `json:"maxResources,omitempty"`
+    WarmResources    GPUResourceUnit  `json:"warmResources,omitempty"`
+    Oversubscription Oversubscription `json:"oversubscription,omitempty"`
+}
+
+type Oversubscription struct {
+    // the percentage of host RAM appended to GPU VRAM, defaults to 50%
+    VramExpandToHostMem string `json:"vramExpandToHostMem,omitempty"`
+
+    // the percentage of host disk appended to GPU VRAM, defaults to 70%
+    VramExpandToHostDisk string `json:"vramExpandToHostDisk,omitempty"`
+
+    // The multiplier of TFlops to oversell, defaults to 1 for production, 20 for development
+    TflopsOversellRatio string `json:"tflopsOversellRatio,omitempty"`
+}
+
+type NodeManagerConfig struct {
+    // Karpenter-like mode: the hypervisor manages GPU nodes and workers
+    NodeProvisioner             NodeProvisioner         `json:"nodeProvisioner,omitempty"`
+    NodeSelector                NodeSelector            `json:"nodeSelector,omitempty"`
+    NodeCompaction              NodeCompaction          `json:"nodeCompaction,omitempty"`
+    NodePoolRollingUpdatePolicy NodeRollingUpdatePolicy `json:"nodePoolRollingUpdatePolicy,omitempty"`
+}
+
+// NodeProvisioner and NodeSelector are mutually exclusive.
+// NodeSelector is for existing GPUs, NodeProvisioner is for Karpenter-like auto management.
+type NodeProvisioner struct {
+    NodeClass    string        `json:"nodeClass,omitempty"`
+    Requirements []Requirement `json:"requirements,omitempty"`
+    Taints       []Taint       `json:"taints,omitempty"`
+}
+
+type Requirement struct {
+    Key      string   `json:"key,omitempty"`
+    Operator string   `json:"operator,omitempty"`
+    Values   []string `json:"values,omitempty"`
+}
+
+type Taint struct {
+    Effect string `json:"effect,omitempty"`
+    Key    string `json:"key,omitempty"`
+    Value  string `json:"value,omitempty"`
+}
+
+// Use existing Kubernetes GPU nodes.
+type NodeSelector []NodeSelectorItem
+
+type NodeSelectorItem struct {
+    MatchAny map[string]string `json:"matchAny,omitempty"`
+    MatchAll map[string]string `json:"matchAll,omitempty"`
+}
+
+type NodeCompaction struct {
+    Period string `json:"period,omitempty"`
+}
+
+type NodeRollingUpdatePolicy struct {
+    // If set to false, updates will be pending in status, and the user needs to manually approve updates.
+    // Updates will occur immediately or during the next maintenance window.
+    AutoUpdate      *bool  `json:"autoUpdate,omitempty"`
+    BatchPercentage string `json:"batchPercentage,omitempty"`
+    BatchInterval   string `json:"batchInterval,omitempty"`
+    Duration        string `json:"duration,omitempty"`
+
+    MaintenanceWindow MaintenanceWindow `json:"maintenanceWindow,omitempty"`
+}
+
+type MaintenanceWindow struct {
+    // crontab syntax.
+ Includes []string `json:"includes,omitempty"` +} + +type ObservabilityConfig struct { + Monitor MonitorConfig `json:"monitor,omitempty"` + Alert AlertConfig `json:"alert,omitempty"` +} + +type MonitorConfig struct { + Interval string `json:"interval,omitempty"` +} + +type AlertConfig struct { + Expression runtime.RawExtension `json:"expression,omitempty"` +} + +// Define different QoS and their price. +type QosConfig struct { + Definitions []QosDefinition `json:"definitions,omitempty"` + DefaultQoS string `json:"defaultQoS,omitempty"` + BillingPeriod string `json:"billingPeriod,omitempty"` // "second" or "minute", default to "second" + Pricing []QosPricing `json:"pricing,omitempty"` +} + +type QosDefinition struct { + Name string `json:"name,omitempty"` + Description string `json:"description,omitempty"` + Priority int `json:"priority,omitempty"` // Range from 1-100, reflects the scheduling priority when GPU is full and tasks are in the queue. +} + +type GPUResourceUnit struct { + // Tera floating point operations per second + TFlops string `json:"tflops,omitempty"` + + // VRAM is short for Video memory, namely GPU RAM + VRAM string `json:"vram,omitempty"` +} + +type QosPricing struct { + Qos string `json:"qos,omitempty"` + Requests GPUResourceUnit `json:"requests,omitempty"` + LimitsOverRequests GPUResourceUnit `json:"limitsOverRequests,omitempty"` +} + +// Customize system components for seamless onboarding. +type ComponentConfig struct { + Worker WorkerConfig `json:"worker,omitempty"` + Hypervisor HypervisorConfig `json:"hypervisor,omitempty"` + Client ClientConfig `json:"client,omitempty"` +} + +type WorkerConfig struct { + Image string `json:"image,omitempty"` // "stable" | "latest" | "nightly" + Port int `json:"port,omitempty"` + HostNetwork *bool `json:"hostNetwork,omitempty"` + WorkerPodTemplate runtime.RawExtension `json:"workerPodTemplate,omitempty"` // Mixin extra spec. +} + +type HypervisorConfig struct { + Image string `json:"image,omitempty"` + HypervisorDaemonSetTemplate runtime.RawExtension `json:"hypervisorDaemonSetTemplate,omitempty"` // Mixin extra spec. +} + +// TODO: client mutation webhook need TLS cert, need check using cert-manager or other ways +type ClientConfig struct { + Image string `json:"image,omitempty"` + Protocol string `json:"protocol,omitempty"` + Port int `json:"port,omitempty"` - // Foo is an example field of GPUPool. Edit gpupool_types.go to remove/update - Foo string `json:"foo,omitempty"` + // +optional + // define how to inject the client pod + PodTemplateMergePatch runtime.RawExtension `json:"podTemplateMergePatch,omitempty"` // Add other things to the original pod. } // GPUPoolStatus defines the observed state of GPUPool. type GPUPoolStatus struct { - // INSERT ADDITIONAL STATUS FIELD - define observed state of cluster - // Important: Run "make" to regenerate code after modifying this file + Cluster string `json:"cluster,omitempty"` + + Phase TensorFusionClusterPhase `json:"phase,omitempty"` + + Conditions []metav1.Condition `json:"conditions,omitempty"` + + TotalNodes int32 `json:"totalNodes,omitempty"` + TotalGPUs int32 `json:"totalGPUs,omitempty"` + ReadyNodes int32 `json:"readyNodes,omitempty"` + NotReadyNodes int32 `json:"notReadyNodes,omitempty"` + + TotalTFlops int32 `json:"totalTFlops,omitempty"` + TotalVRAM string `json:"totalVRAM,omitempty"` + + AvailableTFlops int32 `json:"availableTFlops,omitempty"` + AvailableVRAM string `json:"availableVRAM,omitempty"` + + // If using provisioner, GPU nodes could be outside of the K8S cluster. 
+    // The GPUNodes custom resource will be created and deleted automatically.
+    // ProvisioningStatus tracks the status of those external GPU nodes.
+    ProvisioningStatus PoolProvisioningStatus `json:"provisioningStatus,omitempty"`
+
+    // When updating any component version or config, the pool controller performs a rolling update.
+    // The status is refreshed periodically (default every 5s); progress ranges from 0 to 100.
+    // When the progress reaches 100, the component version or config is fully updated.
+    ComponentStatus PoolComponentStatus `json:"componentStatus,omitempty"`
+}
+
+type PoolProvisioningStatus struct {
+    InitializingNodes int32 `json:"initializingNodes,omitempty"`
+    TerminatingNodes  int32 `json:"terminatingNodes,omitempty"`
+    AvailableNodes    int32 `json:"availableNodes,omitempty"`
+}
+
+type PoolComponentStatus struct {
+    WorkerVersion        string `json:"worker,omitempty"`
+    WorkerConfigSynced   bool   `json:"workerConfigSynced,omitempty"`
+    WorkerUpdateProgress int32  `json:"workerUpdateProgress,omitempty"`
+
+    HypervisorVersion        string `json:"hypervisor,omitempty"`
+    HypervisorConfigSynced   bool   `json:"hypervisorConfigSynced,omitempty"`
+    HypervisorUpdateProgress int32  `json:"hypervisorUpdateProgress,omitempty"`
+
+    ClientVersion        string `json:"client,omitempty"`
+    ClientConfigSynced   bool   `json:"clientConfigSynced,omitempty"`
+    ClientUpdateProgress int32  `json:"clientUpdateProgress,omitempty"`
 }
 
 // +kubebuilder:object:root=true
 // +kubebuilder:subresource:status
+// +kubebuilder:resource:scope=Cluster
 
 // GPUPool is the Schema for the gpupools API.
 type GPUPool struct {
diff --git a/api/v1/schedulingconfigtemplate_types.go b/api/v1/schedulingconfigtemplate_types.go
new file mode 100644
index 0000000..77b6d8c
--- /dev/null
+++ b/api/v1/schedulingconfigtemplate_types.go
@@ -0,0 +1,204 @@
+/*
+Copyright 2024.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package v1
+
+import (
+    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+    "k8s.io/apimachinery/pkg/runtime"
+)
+
+// Place the workload on the right nodes and scale smartly.
+type SchedulingConfigTemplateSpec struct {
+
+    // place the client or worker on the best-matched nodes
+    Placement PlacementConfig `json:"placement,omitempty"`
+
+    // scale the workload based on the usage and traffic
+    AutoScaling AutoScalingConfig `json:"autoScaling,omitempty"`
+
+    // avoid hot GPU devices and continuously balance the workload,
+    // implemented by triggering a scheduling simulation and advising better GPU nodes to the scheduler
+    ReBalancer ReBalancerConfig `json:"reBalancer,omitempty"`
+
+    // single GPU device multi-process queuing and fair scheduling with QoS constraint
+    Hypervisor HypervisorScheduling `json:"hypervisor,omitempty"`
+}
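To make the shape of this new API concrete, here is a minimal, hypothetical sketch that wires a `SchedulingConfigTemplate` together from the types in this file (some of them are defined just below); the object name and every knob value are illustrative, not operator defaults:

```go
package main

import (
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
)

func boolPtr(b bool) *bool { return &b }

func main() {
	// Compact-first placement plus layer-1 vertical auto-scaling that
	// re-evaluates every 5 minutes; all values here are made up for the example.
	tpl := tfv1.SchedulingConfigTemplate{
		ObjectMeta: metav1.ObjectMeta{Name: "default-scheduling"},
		Spec: tfv1.SchedulingConfigTemplateSpec{
			Placement: tfv1.PlacementConfig{
				Mode:               tfv1.PlacementModeCompactFirst,
				AllowUsingLocalGPU: boolPtr(true),
			},
			AutoScaling: tfv1.AutoScalingConfig{
				AutoSetLimits: tfv1.AutoSetLimits{
					EvaluationPeriod:       "5m",
					ExtraTFlopsBufferRatio: "10%",
					IgnoredDeltaRange:      "5%",
					ScaleUpStep:            "1.5",
				},
			},
		},
	}
	fmt.Printf("%s places workloads in %q mode\n", tpl.Name, tpl.Spec.Placement.Mode)
}
```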
+
+type PlacementConfig struct {
+    Mode               PlacementMode `json:"mode,omitempty"`               // "compactFirst" | "lowLoadFirst"
+    AllowUsingLocalGPU *bool         `json:"allowUsingLocalGPU,omitempty"` // If false, workloads will not be scheduled directly to GPU nodes with 'localGPU: true'.
+    GPUFilters         []GPUFilter   `json:"gpuFilters,omitempty"`
+}
+
+type PlacementMode string
+
+const (
+    // default to compactFirst for cost saving and energy saving
+    PlacementModeCompactFirst PlacementMode = "compactFirst"
+
+    // in some cases, use lowLoadFirst for balance and fairness
+    PlacementModeLowLoadFirst PlacementMode = "lowLoadFirst"
+)
+
+// GPUFilter is to select eligible GPUs for scheduling.
+//
+// example:
+// ```yaml
+// - type: avoidTooMuchConnectionsOnSameGPU
+//   params:
+//     connectionNum: 150
+// - type: avoidDifferentZone
+//   params:
+//     # by default, GPU worker will be scheduled into the same zone as CPU Client Pod to align AZ and improve performance
+//     topologyKey: topology.kubernetes.io/zone
+// ```
+type GPUFilter struct {
+    Type   string               `json:"type,omitempty"`
+    Params runtime.RawExtension `json:"params,omitempty"`
+}
+
+type AutoScalingConfig struct {
+    // layer 1 vertical auto-scaling, turbo burst to existing GPU cards quickly
+    AutoSetLimits AutoSetLimits `json:"autoSetLimits,omitempty"`
+
+    // layer 2 horizontal auto-scaling, scale up to more GPU cards if the max limits threshold is hit
+    AutoSetReplicas AutoSetReplicas `json:"autoSetReplicas,omitempty"`
+
+    // layer 3 adjusting, to match the actual usage in the long run
+    AutoSetRequests AutoSetRequests `json:"autoSetRequests,omitempty"`
+
+    // additional layer to save VRAM, auto-freeze memory and cool down to RAM and Disk
+    ScaleToZero ScaleToZero `json:"scaleToZero,omitempty"`
+}
+
+// A typical autoLimits algorithm could check every 5m, look back at 1 day of data,
+// select the 99th percentile of actual usage as preferredLimits,
+// and calculate finalPreferredLimits = preferredLimits*(1+extraBufferRatio).
+// If they are equal within an error range (e.g. 5%), do nothing.
+// If finalPreferredLimits is less than the current limits and exceeds the error range,
+// set the current limits to finalPreferredLimits.
+// If finalPreferredLimits > current limits and exceeds the error range,
+// set the current limits to max(finalPreferredLimits, current limits * scaleUpStep).
+// If AI prediction is enabled, it helps detect historical patterns and set a more reasonable, explainable limit value;
+// the final limits should be max(finalPreferredLimits, last(predict_value * (1 + extraTFlopsBufferRatio)))
+type AutoSetLimits struct {
+    EvaluationPeriod       string `json:"evaluationPeriod,omitempty"`
+    ExtraTFlopsBufferRatio string `json:"extraTFlopsBufferRatio,omitempty"`
+    IgnoredDeltaRange      string `json:"ignoredDeltaRange,omitempty"`
+    ScaleUpStep            string `json:"scaleUpStep,omitempty"`
+
+    // the multiplier of requests, to avoid limit set too high, like 5.0
+    MaxRatioToRequests string `json:"maxRatioToRequests,omitempty"`
+
+    Prediction SmartSchedulerModelInput `json:"prediction,omitempty"`
+}
+
+// To handle burst traffic, scale up in a short time (this feature requires GPU context migration & replication, not available yet)
+type AutoSetReplicas struct {
+    Enable                *bool  `json:"enable,omitempty"`
+    TargetTFlopsOfLimits  string `json:"targetTFlopsOfLimits,omitempty"`
+    EvaluationPeriod      string `json:"evaluationPeriod,omitempty"`
+    ScaleUpStep           string `json:"scaleUpStep,omitempty"`
+    ScaleDownStep         string `json:"scaleDownStep,omitempty"`
+    ScaleDownUpDownTime   string `json:"scaleDownUpDownTime,omitempty"`
+    ScaleDownCoolDownTime string `json:"scaleDownCoolDownTime,omitempty"`
+}
+
+type AutoSetRequests struct {
+    PercentileForAutoRequests string `json:"percentileForAutoRequests,omitempty"`
+
+    // the request buffer ratio, for example actual usage is 1.0,
10% buffer will be 1.1 as final preferred requests + ExtraBufferRatio string `json:"extraBufferRatio,omitempty"` + + EvaluationPeriod string `json:"evaluationPeriod,omitempty"` + AggregationPeriod string `json:"aggregationPeriod,omitempty"` + Prediction SmartSchedulerModelInput `json:"prediction,omitempty"` +} + +type ScaleToZero struct { + AutoFreeze []AutoFreeze `json:"autoFreeze,omitempty"` + IntelligenceWarmup SmartSchedulerModelInput `json:"intelligenceWarmup,omitempty"` +} + +type AutoFreeze struct { + Qos string `json:"qos,omitempty"` + FreezeToMemTTL string `json:"freezeToMemTTL,omitempty"` + FreezeToDiskTTL string `json:"freezeToDiskTTL,omitempty"` + Enable *bool `json:"enable,omitempty"` +} + +type SmartSchedulerModelInput struct { + Enable *bool `json:"enable,omitempty"` + Model string `json:"model,omitempty"` + HistoryDataPeriod string `json:"historyDataPeriod,omitempty"` + PredictionPeriod string `json:"predictionPeriod,omitempty"` +} + +type ReBalancerConfig struct { + Internal string `json:"internal,omitempty"` + ReBalanceCoolDownTime string `json:"reBalanceCoolDownTime,omitempty"` + Threshold ReBalanceThreshold `json:"threshold,omitempty"` +} + +type ReBalanceThreshold struct { + MatchAny runtime.RawExtension `json:"matchAny,omitempty"` +} + +type HypervisorScheduling struct { + MultiProcessQueuing MultiProcessQueuing `json:"multiProcessQueuing,omitempty"` +} + +type MultiProcessQueuing struct { + Enable *bool `json:"enable,omitempty"` + Interval string `json:"interval,omitempty"` + QueueLevelTimeSlices []string `json:"queueLevelTimeSlices,omitempty"` +} + +// SchedulingConfigTemplateStatus defines the observed state of SchedulingConfigTemplate. +type SchedulingConfigTemplateStatus struct { + // INSERT ADDITIONAL STATUS FIELD - define observed state of cluster + // Important: Run "make" to regenerate code after modifying this file +} + +// +kubebuilder:object:root=true +// +kubebuilder:subresource:status + +// SchedulingConfigTemplate is the Schema for the schedulingconfigtemplates API. +type SchedulingConfigTemplate struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + Spec SchedulingConfigTemplateSpec `json:"spec,omitempty"` + Status SchedulingConfigTemplateStatus `json:"status,omitempty"` +} + +// +kubebuilder:object:root=true + +// SchedulingConfigTemplateList contains a list of SchedulingConfigTemplate. +type SchedulingConfigTemplateList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []SchedulingConfigTemplate `json:"items"` +} + +func init() { + SchemeBuilder.Register(&SchedulingConfigTemplate{}, &SchedulingConfigTemplateList{}) +} diff --git a/api/v1/tensorfusioncluster_types.go b/api/v1/tensorfusioncluster_types.go index a5b9b8d..eef9a6a 100644 --- a/api/v1/tensorfusioncluster_types.go +++ b/api/v1/tensorfusioncluster_types.go @@ -18,28 +18,170 @@ package v1 import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" ) -// EDIT THIS FILE! THIS IS SCAFFOLDING FOR YOU TO OWN! -// NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized. - // TensorFusionClusterSpec defines the desired state of TensorFusionCluster. 
 type TensorFusionClusterSpec struct {
-    // INSERT ADDITIONAL SPEC FIELDS - desired state of cluster
-    // Important: Run "make" to regenerate code after modifying this file
+    Enroll EnrollConfig `json:"enroll,omitempty"`
+
+    GPUPools []GPUPoolDefinition `json:"gpuPools,omitempty"`
+
+    // +optional
+    ComputingVendor ComputingVendorConfig `json:"computingVendor,omitempty"`
+
+    // +optional
+    StorageVendor StorageVendorConfig `json:"storageVendor,omitempty"`
 
-    // Foo is an example field of TensorFusionCluster. Edit tensorfusioncluster_types.go to remove/update
-    Foo string `json:"foo,omitempty"`
+    // +optional
+    DataPipelines DataPipelinesConfig `json:"dataPipelines,omitempty"`
 }
 
 // TensorFusionClusterStatus defines the observed state of TensorFusionCluster.
 type TensorFusionClusterStatus struct {
-    // INSERT ADDITIONAL STATUS FIELD - define observed state of cluster
-    // Important: Run "make" to regenerate code after modifying this file
+
+    // +kubebuilder:default:=Initializing
+    Phase TensorFusionClusterPhase `json:"phase,omitempty"`
+
+    Conditions []metav1.Condition `json:"conditions,omitempty"`
+
+    TotalPools int32 `json:"totalPools,omitempty"`
+    TotalNodes int32 `json:"totalNodes,omitempty"`
+    TotalGPUs  int32 `json:"totalGPUs,omitempty"`
+
+    TotalTFlops int32  `json:"totalTFlops,omitempty"`
+    TotalVRAM   string `json:"totalVRAM,omitempty"`
+
+    AvailableTFlops int32  `json:"availableTFlops,omitempty"`
+    AvailableVRAM   string `json:"availableVRAM,omitempty"`
+
+    ReadyGPUPools    []string `json:"readyGPUPools,omitempty"`
+    NotReadyGPUPools []string `json:"notReadyGPUPools,omitempty"`
+
+    AvailableLicenses  int32       `json:"availableLicenses,omitempty"`
+    TotalLicenses      int32       `json:"totalLicenses,omitempty"`
+    LicenseRenewalTime metav1.Time `json:"licenseRenewalTime,omitempty"`
+
+    CloudConnectionStatus ClusterCloudConnectionStatus `json:"cloudConnectionStatus,omitempty"`
+    StorageStatus         ClusterStorageStatus         `json:"storageStatus,omitempty"`
+    ComputingVendorStatus ClusterComputingVendorStatus `json:"computingVendorStatus,omitempty"`
+}
+
+type ClusterCloudConnectionStatus struct {
+    ClusterID         string      `json:"clusterId,omitempty"`
+    ConnectionState   string      `json:"connectionState,omitempty"`
+    LastHeartbeatTime metav1.Time `json:"lastHeartbeatTime,omitempty"`
+}
+
+type ClusterStorageStatus struct {
+    ConnectionState string `json:"connectionState,omitempty"`
+}
+
+type ClusterComputingVendorStatus struct {
+    ConnectionState string `json:"connectionState,omitempty"`
+}
+
+// TensorFusionClusterPhase represents the phase of the TensorFusionCluster resource.
+type TensorFusionClusterPhase string
+
+const (
+    TensorFusionClusterInitializing = TensorFusionClusterPhase("Initializing")
+    TensorFusionClusterRunning      = TensorFusionClusterPhase("Running")
+    TensorFusionClusterUpdating     = TensorFusionClusterPhase("Updating")
+    TensorFusionClusterDestroying   = TensorFusionClusterPhase("Destroying")
+)
+
+// Enroll to TensorFusion cloud with an enrollment key
+type EnrollConfig struct {
+    APIEndpoint string              `json:"apiEndpoint,omitempty"` // API endpoint for enrollment.
+    EnrollKey   EnrollmentKeyConfig `json:"enrollKey,omitempty"`
+}
+
+type EnrollmentKeyConfig struct {
+    Data      string        `json:"data,omitempty"` // Enrollment key data.
+    SecretRef NameNamespace `json:"secretRef,omitempty"`
+}
+
+// GPUPoolDefinition defines how to create a GPU pool, either by URL or inline
+type GPUPoolDefinition struct {
+    Name string `json:"name,omitempty"` // Name of the GPU pool.
+ + // +optional + Spec GPUPoolSpec `json:"spec,omitempty"` + + // +optional + SpecTemplateURL string `json:"specTemplateUrl,omitempty"` +} + +// ComputingVendorConfig defines the Cloud vendor connection such as AWS, GCP, Azure etc. +type ComputingVendorConfig struct { + Name string `json:"name,omitempty"` // Name of the computing vendor. + Type string `json:"type,omitempty"` // Type of the computing vendor (e.g., aws, lambdalabs, gcp, azure, together.ai). + AuthType string `json:"authType,omitempty"` // Authentication type (e.g., accessKey, serviceAccount). + + // +optional + Enable *bool `json:"enable,omitempty"` // Enable or disable the computing vendor. + + GPUNodeControllerType string `json:"gpuNodeControllerType,omitempty"` // Type of GPU node controller (e.g., asg, karpenter, native). + Params ComputingVendorParams `json:"params,omitempty"` +} + +type ComputingVendorParams struct { + Region string `json:"region,omitempty"` // Region for the computing vendor. + AccessKey string `json:"accessKey,omitempty"` // Access key for the computing vendor. + SecretKey string `json:"secretKey,omitempty"` // Secret key for the computing vendor. + IAMRole string `json:"iamRole,omitempty"` // IAM role for the computing vendor like AWS +} + +// StorageVendorConfig defines Postgres database with extensions for timeseries storage and other resource aggregation results, system events and diagnostics reports etc. +type StorageVendorConfig struct { + Mode string `json:"mode,omitempty"` // Mode of the storage vendor (e.g., cloudnative-pg, timescale-db, RDS for PG). + Image string `json:"image,omitempty"` // Image for the storage vendor (default to timescale). + + // +optional + InstallCloudNativePGOperator *bool `json:"installCloudNativePGOperator,omitempty"` // Whether to install CloudNative-PG operator. + + StorageClass string `json:"storageClass,omitempty"` // Storage class for the storage vendor. + PGExtensions []string `json:"pgExtensions,omitempty"` // List of PostgreSQL extensions to install. + PGClusterTemplate runtime.RawExtension `json:"pgClusterTemplate,omitempty"` // Extra spec for the PostgreSQL cluster template. +} + +// DataPipelinesConfig defines the aggregation jobs that can make statistics on the data and then report to cloud if configured. +type DataPipelinesConfig struct { + Resources DataPipeline4ResourcesConfig `json:"resources,omitempty"` + + Timeseries DataPipeline4TimeSeriesConfig `json:"timeseries,omitempty"` +} + +type DataPipeline4ResourcesConfig struct { + // +optional + SyncToCloud *bool `json:"syncToCloud,omitempty"` // Whether to sync resources to the cloud. + + // +optional human readable time like 1h, 1d, default to 1h + SyncPeriod string `json:"syncPeriod,omitempty"` // Period for syncing resources. +} + +type DataPipeline4TimeSeriesConfig struct { + AggregationPeriods []string `json:"aggregationPeriods,omitempty"` // List of aggregation periods. + RawDataRetention string `json:"rawDataRetention,omitempty"` // Retention period for raw data. + AggregationDataRetention string `json:"aggregationDataRetention,omitempty"` // Retention period for aggregated data. + RemoteWrite RemoteWriteConfig `json:"remoteWrite,omitempty"` // Configuration for remote write. +} + +// RemoteWriteConfig represents the configuration for remote write. +type RemoteWriteConfig struct { + Connection DataPipelineResultRemoteWriteConfig `json:"connection,omitempty"` + Metrics []string `json:"metrics,omitempty"` // List of metrics to remote write. 
+} + +type DataPipelineResultRemoteWriteConfig struct { + Type string `json:"type,omitempty"` // Type of the connection (e.g., datadog). + URL string `json:"url,omitempty"` // URL of the connection. } // +kubebuilder:object:root=true // +kubebuilder:subresource:status +// +kubebuilder:resource:scope=Cluster // TensorFusionCluster is the Schema for the tensorfusionclusters API. type TensorFusionCluster struct { diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go index 459c0ba..c1c3fd9 100644 --- a/api/v1/zz_generated.deepcopy.go +++ b/api/v1/zz_generated.deepcopy.go @@ -21,266 +21,1462 @@ limitations under the License. package v1 import ( - runtime "k8s.io/apimachinery/pkg/runtime" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" ) +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AlertConfig) DeepCopyInto(out *AlertConfig) { + *out = *in + in.Expression.DeepCopyInto(&out.Expression) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AlertConfig. +func (in *AlertConfig) DeepCopy() *AlertConfig { + if in == nil { + return nil + } + out := new(AlertConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AutoFreeze) DeepCopyInto(out *AutoFreeze) { + *out = *in + if in.Enable != nil { + in, out := &in.Enable, &out.Enable + *out = new(bool) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AutoFreeze. +func (in *AutoFreeze) DeepCopy() *AutoFreeze { + if in == nil { + return nil + } + out := new(AutoFreeze) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AutoScalingConfig) DeepCopyInto(out *AutoScalingConfig) { + *out = *in + in.AutoSetLimits.DeepCopyInto(&out.AutoSetLimits) + in.AutoSetReplicas.DeepCopyInto(&out.AutoSetReplicas) + in.AutoSetRequests.DeepCopyInto(&out.AutoSetRequests) + in.ScaleToZero.DeepCopyInto(&out.ScaleToZero) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AutoScalingConfig. +func (in *AutoScalingConfig) DeepCopy() *AutoScalingConfig { + if in == nil { + return nil + } + out := new(AutoScalingConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AutoSetLimits) DeepCopyInto(out *AutoSetLimits) { + *out = *in + in.Prediction.DeepCopyInto(&out.Prediction) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AutoSetLimits. +func (in *AutoSetLimits) DeepCopy() *AutoSetLimits { + if in == nil { + return nil + } + out := new(AutoSetLimits) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AutoSetReplicas) DeepCopyInto(out *AutoSetReplicas) { + *out = *in + if in.Enable != nil { + in, out := &in.Enable, &out.Enable + *out = new(bool) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AutoSetReplicas. 
+func (in *AutoSetReplicas) DeepCopy() *AutoSetReplicas { + if in == nil { + return nil + } + out := new(AutoSetReplicas) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AutoSetRequests) DeepCopyInto(out *AutoSetRequests) { + *out = *in + in.Prediction.DeepCopyInto(&out.Prediction) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AutoSetRequests. +func (in *AutoSetRequests) DeepCopy() *AutoSetRequests { + if in == nil { + return nil + } + out := new(AutoSetRequests) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CapacityConfig) DeepCopyInto(out *CapacityConfig) { + *out = *in + out.MinResources = in.MinResources + out.MaxResources = in.MaxResources + out.WarmResources = in.WarmResources + out.Oversubscription = in.Oversubscription +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CapacityConfig. +func (in *CapacityConfig) DeepCopy() *CapacityConfig { + if in == nil { + return nil + } + out := new(CapacityConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ClientConfig) DeepCopyInto(out *ClientConfig) { + *out = *in + in.PodTemplateMergePatch.DeepCopyInto(&out.PodTemplateMergePatch) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ClientConfig. +func (in *ClientConfig) DeepCopy() *ClientConfig { + if in == nil { + return nil + } + out := new(ClientConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ClusterCloudConnectionStatus) DeepCopyInto(out *ClusterCloudConnectionStatus) { + *out = *in + in.LastHeartbeatTime.DeepCopyInto(&out.LastHeartbeatTime) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ClusterCloudConnectionStatus. +func (in *ClusterCloudConnectionStatus) DeepCopy() *ClusterCloudConnectionStatus { + if in == nil { + return nil + } + out := new(ClusterCloudConnectionStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ClusterComputingVendorStatus) DeepCopyInto(out *ClusterComputingVendorStatus) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ClusterComputingVendorStatus. +func (in *ClusterComputingVendorStatus) DeepCopy() *ClusterComputingVendorStatus { + if in == nil { + return nil + } + out := new(ClusterComputingVendorStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ClusterStorageStatus) DeepCopyInto(out *ClusterStorageStatus) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ClusterStorageStatus. 
+func (in *ClusterStorageStatus) DeepCopy() *ClusterStorageStatus { + if in == nil { + return nil + } + out := new(ClusterStorageStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ComponentConfig) DeepCopyInto(out *ComponentConfig) { + *out = *in + in.Worker.DeepCopyInto(&out.Worker) + in.Hypervisor.DeepCopyInto(&out.Hypervisor) + in.Client.DeepCopyInto(&out.Client) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ComponentConfig. +func (in *ComponentConfig) DeepCopy() *ComponentConfig { + if in == nil { + return nil + } + out := new(ComponentConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ComputingVendorConfig) DeepCopyInto(out *ComputingVendorConfig) { + *out = *in + if in.Enable != nil { + in, out := &in.Enable, &out.Enable + *out = new(bool) + **out = **in + } + out.Params = in.Params +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ComputingVendorConfig. +func (in *ComputingVendorConfig) DeepCopy() *ComputingVendorConfig { + if in == nil { + return nil + } + out := new(ComputingVendorConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ComputingVendorParams) DeepCopyInto(out *ComputingVendorParams) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ComputingVendorParams. +func (in *ComputingVendorParams) DeepCopy() *ComputingVendorParams { + if in == nil { + return nil + } + out := new(ComputingVendorParams) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DataPipeline4ResourcesConfig) DeepCopyInto(out *DataPipeline4ResourcesConfig) { + *out = *in + if in.SyncToCloud != nil { + in, out := &in.SyncToCloud, &out.SyncToCloud + *out = new(bool) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DataPipeline4ResourcesConfig. +func (in *DataPipeline4ResourcesConfig) DeepCopy() *DataPipeline4ResourcesConfig { + if in == nil { + return nil + } + out := new(DataPipeline4ResourcesConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DataPipeline4TimeSeriesConfig) DeepCopyInto(out *DataPipeline4TimeSeriesConfig) { + *out = *in + if in.AggregationPeriods != nil { + in, out := &in.AggregationPeriods, &out.AggregationPeriods + *out = make([]string, len(*in)) + copy(*out, *in) + } + in.RemoteWrite.DeepCopyInto(&out.RemoteWrite) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DataPipeline4TimeSeriesConfig. +func (in *DataPipeline4TimeSeriesConfig) DeepCopy() *DataPipeline4TimeSeriesConfig { + if in == nil { + return nil + } + out := new(DataPipeline4TimeSeriesConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *DataPipelineResultRemoteWriteConfig) DeepCopyInto(out *DataPipelineResultRemoteWriteConfig) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DataPipelineResultRemoteWriteConfig. +func (in *DataPipelineResultRemoteWriteConfig) DeepCopy() *DataPipelineResultRemoteWriteConfig { + if in == nil { + return nil + } + out := new(DataPipelineResultRemoteWriteConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DataPipelinesConfig) DeepCopyInto(out *DataPipelinesConfig) { + *out = *in + in.Resources.DeepCopyInto(&out.Resources) + in.Timeseries.DeepCopyInto(&out.Timeseries) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DataPipelinesConfig. +func (in *DataPipelinesConfig) DeepCopy() *DataPipelinesConfig { + if in == nil { + return nil + } + out := new(DataPipelinesConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *EnrollConfig) DeepCopyInto(out *EnrollConfig) { + *out = *in + out.EnrollKey = in.EnrollKey +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EnrollConfig. +func (in *EnrollConfig) DeepCopy() *EnrollConfig { + if in == nil { + return nil + } + out := new(EnrollConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *EnrollmentKeyConfig) DeepCopyInto(out *EnrollmentKeyConfig) { + *out = *in + out.SecretRef = in.SecretRef +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EnrollmentKeyConfig. +func (in *EnrollmentKeyConfig) DeepCopy() *EnrollmentKeyConfig { + if in == nil { + return nil + } + out := new(EnrollmentKeyConfig) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *GPU) DeepCopyInto(out *GPU) { *out = *in - out.TypeMeta = in.TypeMeta - in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) - in.Status.DeepCopyInto(&out.Status) + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPU. +func (in *GPU) DeepCopy() *GPU { + if in == nil { + return nil + } + out := new(GPU) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *GPU) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *GPUFilter) DeepCopyInto(out *GPUFilter) { + *out = *in + in.Params.DeepCopyInto(&out.Params) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUFilter. +func (in *GPUFilter) DeepCopy() *GPUFilter { + if in == nil { + return nil + } + out := new(GPUFilter) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *GPUList) DeepCopyInto(out *GPUList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]GPU, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUList. +func (in *GPUList) DeepCopy() *GPUList { + if in == nil { + return nil + } + out := new(GPUList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *GPUList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *GPUNode) DeepCopyInto(out *GPUNode) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNode. +func (in *GPUNode) DeepCopy() *GPUNode { + if in == nil { + return nil + } + out := new(GPUNode) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *GPUNode) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *GPUNodeClass) DeepCopyInto(out *GPUNodeClass) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + out.Status = in.Status +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeClass. +func (in *GPUNodeClass) DeepCopy() *GPUNodeClass { + if in == nil { + return nil + } + out := new(GPUNodeClass) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *GPUNodeClass) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *GPUNodeClassList) DeepCopyInto(out *GPUNodeClassList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]GPUNodeClass, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeClassList. +func (in *GPUNodeClassList) DeepCopy() *GPUNodeClassList { + if in == nil { + return nil + } + out := new(GPUNodeClassList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *GPUNodeClassList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *GPUNodeClassSpec) DeepCopyInto(out *GPUNodeClassSpec) { + *out = *in + if in.OSImageSelectorTerms != nil { + in, out := &in.OSImageSelectorTerms, &out.OSImageSelectorTerms + *out = make([]NodeClassOSImageSelectorTerms, len(*in)) + copy(*out, *in) + } + if in.BlockDeviceMappings != nil { + in, out := &in.BlockDeviceMappings, &out.BlockDeviceMappings + *out = make([]NodeClassBlockDeviceMappings, len(*in)) + copy(*out, *in) + } + out.MetadataOptions = in.MetadataOptions + if in.SecurityGroupSelectorTerms != nil { + in, out := &in.SecurityGroupSelectorTerms, &out.SecurityGroupSelectorTerms + *out = make([]NodeClassItemIDSelectorTerms, len(*in)) + copy(*out, *in) + } + if in.SubnetSelectorTerms != nil { + in, out := &in.SubnetSelectorTerms, &out.SubnetSelectorTerms + *out = make([]NodeClassItemIDSelectorTerms, len(*in)) + copy(*out, *in) + } + if in.Tags != nil { + in, out := &in.Tags, &out.Tags + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeClassSpec. +func (in *GPUNodeClassSpec) DeepCopy() *GPUNodeClassSpec { + if in == nil { + return nil + } + out := new(GPUNodeClassSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *GPUNodeClassStatus) DeepCopyInto(out *GPUNodeClassStatus) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeClassStatus. +func (in *GPUNodeClassStatus) DeepCopy() *GPUNodeClassStatus { + if in == nil { + return nil + } + out := new(GPUNodeClassStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *GPUNodeInfo) DeepCopyInto(out *GPUNodeInfo) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeInfo. +func (in *GPUNodeInfo) DeepCopy() *GPUNodeInfo { + if in == nil { + return nil + } + out := new(GPUNodeInfo) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *GPUNodeList) DeepCopyInto(out *GPUNodeList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]GPUNode, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeList. +func (in *GPUNodeList) DeepCopy() *GPUNodeList { + if in == nil { + return nil + } + out := new(GPUNodeList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *GPUNodeList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *GPUNodeSpec) DeepCopyInto(out *GPUNodeSpec) { + *out = *in + if in.GPUCardIndices != nil { + in, out := &in.GPUCardIndices, &out.GPUCardIndices + *out = make([]int, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeSpec. +func (in *GPUNodeSpec) DeepCopy() *GPUNodeSpec { + if in == nil { + return nil + } + out := new(GPUNodeSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *GPUNodeStatus) DeepCopyInto(out *GPUNodeStatus) { + *out = *in + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]metav1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + in.HypervisorStatus.DeepCopyInto(&out.HypervisorStatus) + out.NodeInfo = in.NodeInfo + if in.LoadedModels != nil { + in, out := &in.LoadedModels, &out.LoadedModels + *out = make([]string, len(*in)) + copy(*out, *in) + } + if in.ManagedGPUResourceIDs != nil { + in, out := &in.ManagedGPUResourceIDs, &out.ManagedGPUResourceIDs + *out = make([]string, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeStatus. +func (in *GPUNodeStatus) DeepCopy() *GPUNodeStatus { + if in == nil { + return nil + } + out := new(GPUNodeStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *GPUPool) DeepCopyInto(out *GPUPool) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUPool. +func (in *GPUPool) DeepCopy() *GPUPool { + if in == nil { + return nil + } + out := new(GPUPool) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *GPUPool) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *GPUPoolDefinition) DeepCopyInto(out *GPUPoolDefinition) { + *out = *in + in.Spec.DeepCopyInto(&out.Spec) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUPoolDefinition. +func (in *GPUPoolDefinition) DeepCopy() *GPUPoolDefinition { + if in == nil { + return nil + } + out := new(GPUPoolDefinition) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *GPUPoolList) DeepCopyInto(out *GPUPoolList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]GPUPool, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUPoolList. 
+func (in *GPUPoolList) DeepCopy() *GPUPoolList { + if in == nil { + return nil + } + out := new(GPUPoolList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *GPUPoolList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *GPUPoolSpec) DeepCopyInto(out *GPUPoolSpec) { + *out = *in + out.CapacityConfig = in.CapacityConfig + in.NodeManagerConfig.DeepCopyInto(&out.NodeManagerConfig) + in.ObservabilityConfig.DeepCopyInto(&out.ObservabilityConfig) + in.QosConfig.DeepCopyInto(&out.QosConfig) + in.ComponentConfig.DeepCopyInto(&out.ComponentConfig) + in.SchedulingConfig.DeepCopyInto(&out.SchedulingConfig) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUPoolSpec. +func (in *GPUPoolSpec) DeepCopy() *GPUPoolSpec { + if in == nil { + return nil + } + out := new(GPUPoolSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *GPUPoolStatus) DeepCopyInto(out *GPUPoolStatus) { + *out = *in + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]metav1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + out.ProvisioningStatus = in.ProvisioningStatus + out.ComponentStatus = in.ComponentStatus +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUPoolStatus. +func (in *GPUPoolStatus) DeepCopy() *GPUPoolStatus { + if in == nil { + return nil + } + out := new(GPUPoolStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *GPUResourceUnit) DeepCopyInto(out *GPUResourceUnit) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUResourceUnit. +func (in *GPUResourceUnit) DeepCopy() *GPUResourceUnit { + if in == nil { + return nil + } + out := new(GPUResourceUnit) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *GPUStatus) DeepCopyInto(out *GPUStatus) { + *out = *in + in.Capacity.DeepCopyInto(&out.Capacity) + in.Available.DeepCopyInto(&out.Available) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUStatus. +func (in *GPUStatus) DeepCopy() *GPUStatus { + if in == nil { + return nil + } + out := new(GPUStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *HypervisorConfig) DeepCopyInto(out *HypervisorConfig) { + *out = *in + in.HypervisorDaemonSetTemplate.DeepCopyInto(&out.HypervisorDaemonSetTemplate) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new HypervisorConfig. +func (in *HypervisorConfig) DeepCopy() *HypervisorConfig { + if in == nil { + return nil + } + out := new(HypervisorConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *HypervisorScheduling) DeepCopyInto(out *HypervisorScheduling) { + *out = *in + in.MultiProcessQueuing.DeepCopyInto(&out.MultiProcessQueuing) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new HypervisorScheduling. +func (in *HypervisorScheduling) DeepCopy() *HypervisorScheduling { + if in == nil { + return nil + } + out := new(HypervisorScheduling) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *MaintenanceWindow) DeepCopyInto(out *MaintenanceWindow) { + *out = *in + if in.Includes != nil { + in, out := &in.Includes, &out.Includes + *out = make([]string, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MaintenanceWindow. +func (in *MaintenanceWindow) DeepCopy() *MaintenanceWindow { + if in == nil { + return nil + } + out := new(MaintenanceWindow) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *MonitorConfig) DeepCopyInto(out *MonitorConfig) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MonitorConfig. +func (in *MonitorConfig) DeepCopy() *MonitorConfig { + if in == nil { + return nil + } + out := new(MonitorConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *MultiProcessQueuing) DeepCopyInto(out *MultiProcessQueuing) { + *out = *in + if in.Enable != nil { + in, out := &in.Enable, &out.Enable + *out = new(bool) + **out = **in + } + if in.QueueLevelTimeSlices != nil { + in, out := &in.QueueLevelTimeSlices, &out.QueueLevelTimeSlices + *out = make([]string, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MultiProcessQueuing. +func (in *MultiProcessQueuing) DeepCopy() *MultiProcessQueuing { + if in == nil { + return nil + } + out := new(MultiProcessQueuing) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *NameNamespace) DeepCopyInto(out *NameNamespace) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NameNamespace. +func (in *NameNamespace) DeepCopy() *NameNamespace { + if in == nil { + return nil + } + out := new(NameNamespace) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *NodeClassBlockDeviceMappings) DeepCopyInto(out *NodeClassBlockDeviceMappings) { + *out = *in + out.Ebs = in.Ebs +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeClassBlockDeviceMappings. +func (in *NodeClassBlockDeviceMappings) DeepCopy() *NodeClassBlockDeviceMappings { + if in == nil { + return nil + } + out := new(NodeClassBlockDeviceMappings) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *NodeClassEbsSettings) DeepCopyInto(out *NodeClassEbsSettings) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeClassEbsSettings. +func (in *NodeClassEbsSettings) DeepCopy() *NodeClassEbsSettings { + if in == nil { + return nil + } + out := new(NodeClassEbsSettings) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *NodeClassItemIDSelectorTerms) DeepCopyInto(out *NodeClassItemIDSelectorTerms) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeClassItemIDSelectorTerms. +func (in *NodeClassItemIDSelectorTerms) DeepCopy() *NodeClassItemIDSelectorTerms { + if in == nil { + return nil + } + out := new(NodeClassItemIDSelectorTerms) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *NodeClassMetadataOptions) DeepCopyInto(out *NodeClassMetadataOptions) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeClassMetadataOptions. +func (in *NodeClassMetadataOptions) DeepCopy() *NodeClassMetadataOptions { + if in == nil { + return nil + } + out := new(NodeClassMetadataOptions) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *NodeClassOSImageSelectorTerms) DeepCopyInto(out *NodeClassOSImageSelectorTerms) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeClassOSImageSelectorTerms. +func (in *NodeClassOSImageSelectorTerms) DeepCopy() *NodeClassOSImageSelectorTerms { + if in == nil { + return nil + } + out := new(NodeClassOSImageSelectorTerms) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *NodeCompaction) DeepCopyInto(out *NodeCompaction) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeCompaction. +func (in *NodeCompaction) DeepCopy() *NodeCompaction { + if in == nil { + return nil + } + out := new(NodeCompaction) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *NodeHypervisorStatus) DeepCopyInto(out *NodeHypervisorStatus) { + *out = *in + in.LastHeartbeatTime.DeepCopyInto(&out.LastHeartbeatTime) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeHypervisorStatus. +func (in *NodeHypervisorStatus) DeepCopy() *NodeHypervisorStatus { + if in == nil { + return nil + } + out := new(NodeHypervisorStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *NodeManagerConfig) DeepCopyInto(out *NodeManagerConfig) { + *out = *in + in.NodeProvisioner.DeepCopyInto(&out.NodeProvisioner) + if in.NodeSelector != nil { + in, out := &in.NodeSelector, &out.NodeSelector + *out = make(NodeSelector, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + out.NodeCompaction = in.NodeCompaction + in.NodePoolRollingUpdatePolicy.DeepCopyInto(&out.NodePoolRollingUpdatePolicy) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeManagerConfig. +func (in *NodeManagerConfig) DeepCopy() *NodeManagerConfig { + if in == nil { + return nil + } + out := new(NodeManagerConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *NodeProvisioner) DeepCopyInto(out *NodeProvisioner) { + *out = *in + if in.Requirements != nil { + in, out := &in.Requirements, &out.Requirements + *out = make([]Requirement, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.Taints != nil { + in, out := &in.Taints, &out.Taints + *out = make([]Taint, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeProvisioner. +func (in *NodeProvisioner) DeepCopy() *NodeProvisioner { + if in == nil { + return nil + } + out := new(NodeProvisioner) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *NodeRollingUpdatePolicy) DeepCopyInto(out *NodeRollingUpdatePolicy) { + *out = *in + if in.AutoUpdate != nil { + in, out := &in.AutoUpdate, &out.AutoUpdate + *out = new(bool) + **out = **in + } + in.MaintenanceWindow.DeepCopyInto(&out.MaintenanceWindow) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeRollingUpdatePolicy. +func (in *NodeRollingUpdatePolicy) DeepCopy() *NodeRollingUpdatePolicy { + if in == nil { + return nil + } + out := new(NodeRollingUpdatePolicy) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in NodeSelector) DeepCopyInto(out *NodeSelector) { + { + in := &in + *out = make(NodeSelector, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeSelector. +func (in NodeSelector) DeepCopy() NodeSelector { + if in == nil { + return nil + } + out := new(NodeSelector) + in.DeepCopyInto(out) + return *out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *NodeSelectorItem) DeepCopyInto(out *NodeSelectorItem) { + *out = *in + if in.MatchAny != nil { + in, out := &in.MatchAny, &out.MatchAny + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } + if in.MatchAll != nil { + in, out := &in.MatchAll, &out.MatchAll + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPU. -func (in *GPU) DeepCopy() *GPU { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeSelectorItem. 
+func (in *NodeSelectorItem) DeepCopy() *NodeSelectorItem { if in == nil { return nil } - out := new(GPU) + out := new(NodeSelectorItem) in.DeepCopyInto(out) return out } -// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. -func (in *GPU) DeepCopyObject() runtime.Object { - if c := in.DeepCopy(); c != nil { - return c +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ObservabilityConfig) DeepCopyInto(out *ObservabilityConfig) { + *out = *in + out.Monitor = in.Monitor + in.Alert.DeepCopyInto(&out.Alert) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ObservabilityConfig. +func (in *ObservabilityConfig) DeepCopy() *ObservabilityConfig { + if in == nil { + return nil } - return nil + out := new(ObservabilityConfig) + in.DeepCopyInto(out) + return out } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *GPUList) DeepCopyInto(out *GPUList) { +func (in *Oversubscription) DeepCopyInto(out *Oversubscription) { *out = *in - out.TypeMeta = in.TypeMeta - in.ListMeta.DeepCopyInto(&out.ListMeta) - if in.Items != nil { - in, out := &in.Items, &out.Items - *out = make([]GPU, len(*in)) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Oversubscription. +func (in *Oversubscription) DeepCopy() *Oversubscription { + if in == nil { + return nil + } + out := new(Oversubscription) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PlacementConfig) DeepCopyInto(out *PlacementConfig) { + *out = *in + if in.AllowUsingLocalGPU != nil { + in, out := &in.AllowUsingLocalGPU, &out.AllowUsingLocalGPU + *out = new(bool) + **out = **in + } + if in.GPUFilters != nil { + in, out := &in.GPUFilters, &out.GPUFilters + *out = make([]GPUFilter, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } } } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUList. -func (in *GPUList) DeepCopy() *GPUList { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PlacementConfig. +func (in *PlacementConfig) DeepCopy() *PlacementConfig { if in == nil { return nil } - out := new(GPUList) + out := new(PlacementConfig) in.DeepCopyInto(out) return out } -// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. -func (in *GPUList) DeepCopyObject() runtime.Object { - if c := in.DeepCopy(); c != nil { - return c - } - return nil -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *GPUNode) DeepCopyInto(out *GPUNode) { +func (in *PoolComponentStatus) DeepCopyInto(out *PoolComponentStatus) { *out = *in - out.TypeMeta = in.TypeMeta - in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) - out.Spec = in.Spec - out.Status = in.Status } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNode. -func (in *GPUNode) DeepCopy() *GPUNode { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PoolComponentStatus. 
+func (in *PoolComponentStatus) DeepCopy() *PoolComponentStatus { if in == nil { return nil } - out := new(GPUNode) + out := new(PoolComponentStatus) in.DeepCopyInto(out) return out } -// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. -func (in *GPUNode) DeepCopyObject() runtime.Object { - if c := in.DeepCopy(); c != nil { - return c +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PoolProvisioningStatus) DeepCopyInto(out *PoolProvisioningStatus) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PoolProvisioningStatus. +func (in *PoolProvisioningStatus) DeepCopy() *PoolProvisioningStatus { + if in == nil { + return nil } - return nil + out := new(PoolProvisioningStatus) + in.DeepCopyInto(out) + return out } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *GPUNodeClass) DeepCopyInto(out *GPUNodeClass) { +func (in *QosConfig) DeepCopyInto(out *QosConfig) { *out = *in - out.TypeMeta = in.TypeMeta - in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) - out.Spec = in.Spec - out.Status = in.Status + if in.Definitions != nil { + in, out := &in.Definitions, &out.Definitions + *out = make([]QosDefinition, len(*in)) + copy(*out, *in) + } + if in.Pricing != nil { + in, out := &in.Pricing, &out.Pricing + *out = make([]QosPricing, len(*in)) + copy(*out, *in) + } } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeClass. -func (in *GPUNodeClass) DeepCopy() *GPUNodeClass { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new QosConfig. +func (in *QosConfig) DeepCopy() *QosConfig { if in == nil { return nil } - out := new(GPUNodeClass) + out := new(QosConfig) in.DeepCopyInto(out) return out } -// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. -func (in *GPUNodeClass) DeepCopyObject() runtime.Object { - if c := in.DeepCopy(); c != nil { - return c +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *QosDefinition) DeepCopyInto(out *QosDefinition) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new QosDefinition. +func (in *QosDefinition) DeepCopy() *QosDefinition { + if in == nil { + return nil } - return nil + out := new(QosDefinition) + in.DeepCopyInto(out) + return out } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *GPUNodeClassList) DeepCopyInto(out *GPUNodeClassList) { +func (in *QosPricing) DeepCopyInto(out *QosPricing) { *out = *in - out.TypeMeta = in.TypeMeta - in.ListMeta.DeepCopyInto(&out.ListMeta) - if in.Items != nil { - in, out := &in.Items, &out.Items - *out = make([]GPUNodeClass, len(*in)) - for i := range *in { - (*in)[i].DeepCopyInto(&(*out)[i]) - } - } + out.Requests = in.Requests + out.LimitsOverRequests = in.LimitsOverRequests } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeClassList. -func (in *GPUNodeClassList) DeepCopy() *GPUNodeClassList { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new QosPricing. 
+func (in *QosPricing) DeepCopy() *QosPricing { if in == nil { return nil } - out := new(GPUNodeClassList) + out := new(QosPricing) in.DeepCopyInto(out) return out } -// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. -func (in *GPUNodeClassList) DeepCopyObject() runtime.Object { - if c := in.DeepCopy(); c != nil { - return c +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ReBalanceThreshold) DeepCopyInto(out *ReBalanceThreshold) { + *out = *in + in.MatchAny.DeepCopyInto(&out.MatchAny) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ReBalanceThreshold. +func (in *ReBalanceThreshold) DeepCopy() *ReBalanceThreshold { + if in == nil { + return nil } - return nil + out := new(ReBalanceThreshold) + in.DeepCopyInto(out) + return out } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *GPUNodeClassSpec) DeepCopyInto(out *GPUNodeClassSpec) { +func (in *ReBalancerConfig) DeepCopyInto(out *ReBalancerConfig) { *out = *in + in.Threshold.DeepCopyInto(&out.Threshold) } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeClassSpec. -func (in *GPUNodeClassSpec) DeepCopy() *GPUNodeClassSpec { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ReBalancerConfig. +func (in *ReBalancerConfig) DeepCopy() *ReBalancerConfig { if in == nil { return nil } - out := new(GPUNodeClassSpec) + out := new(ReBalancerConfig) in.DeepCopyInto(out) return out } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *GPUNodeClassStatus) DeepCopyInto(out *GPUNodeClassStatus) { +func (in *RemoteWriteConfig) DeepCopyInto(out *RemoteWriteConfig) { *out = *in + out.Connection = in.Connection + if in.Metrics != nil { + in, out := &in.Metrics, &out.Metrics + *out = make([]string, len(*in)) + copy(*out, *in) + } } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeClassStatus. -func (in *GPUNodeClassStatus) DeepCopy() *GPUNodeClassStatus { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RemoteWriteConfig. +func (in *RemoteWriteConfig) DeepCopy() *RemoteWriteConfig { if in == nil { return nil } - out := new(GPUNodeClassStatus) + out := new(RemoteWriteConfig) in.DeepCopyInto(out) return out } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *GPUNodeList) DeepCopyInto(out *GPUNodeList) { +func (in *Requirement) DeepCopyInto(out *Requirement) { *out = *in - out.TypeMeta = in.TypeMeta - in.ListMeta.DeepCopyInto(&out.ListMeta) - if in.Items != nil { - in, out := &in.Items, &out.Items - *out = make([]GPUNode, len(*in)) - for i := range *in { - (*in)[i].DeepCopyInto(&(*out)[i]) - } + if in.Values != nil { + in, out := &in.Values, &out.Values + *out = make([]string, len(*in)) + copy(*out, *in) } } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeList. -func (in *GPUNodeList) DeepCopy() *GPUNodeList { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Requirement. 
+func (in *Requirement) DeepCopy() *Requirement { if in == nil { return nil } - out := new(GPUNodeList) + out := new(Requirement) in.DeepCopyInto(out) return out } -// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. -func (in *GPUNodeList) DeepCopyObject() runtime.Object { - if c := in.DeepCopy(); c != nil { - return c +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *Resource) DeepCopyInto(out *Resource) { + *out = *in + out.Tflops = in.Tflops.DeepCopy() + out.Vram = in.Vram.DeepCopy() +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Resource. +func (in *Resource) DeepCopy() *Resource { + if in == nil { + return nil } - return nil + out := new(Resource) + in.DeepCopyInto(out) + return out } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *GPUNodeSpec) DeepCopyInto(out *GPUNodeSpec) { +func (in *Resources) DeepCopyInto(out *Resources) { *out = *in + in.Requests.DeepCopyInto(&out.Requests) + in.Limits.DeepCopyInto(&out.Limits) } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeSpec. -func (in *GPUNodeSpec) DeepCopy() *GPUNodeSpec { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Resources. +func (in *Resources) DeepCopy() *Resources { if in == nil { return nil } - out := new(GPUNodeSpec) + out := new(Resources) in.DeepCopyInto(out) return out } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *GPUNodeStatus) DeepCopyInto(out *GPUNodeStatus) { +func (in *ScaleToZero) DeepCopyInto(out *ScaleToZero) { *out = *in + if in.AutoFreeze != nil { + in, out := &in.AutoFreeze, &out.AutoFreeze + *out = make([]AutoFreeze, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + in.IntelligenceWarmup.DeepCopyInto(&out.IntelligenceWarmup) } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeStatus. -func (in *GPUNodeStatus) DeepCopy() *GPUNodeStatus { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ScaleToZero. +func (in *ScaleToZero) DeepCopy() *ScaleToZero { if in == nil { return nil } - out := new(GPUNodeStatus) + out := new(ScaleToZero) in.DeepCopyInto(out) return out } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *GPUPool) DeepCopyInto(out *GPUPool) { +func (in *SchedulingConfigTemplate) DeepCopyInto(out *SchedulingConfigTemplate) { *out = *in out.TypeMeta = in.TypeMeta in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) - out.Spec = in.Spec + in.Spec.DeepCopyInto(&out.Spec) out.Status = in.Status } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUPool. -func (in *GPUPool) DeepCopy() *GPUPool { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SchedulingConfigTemplate. +func (in *SchedulingConfigTemplate) DeepCopy() *SchedulingConfigTemplate { if in == nil { return nil } - out := new(GPUPool) + out := new(SchedulingConfigTemplate) in.DeepCopyInto(out) return out } // DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. 
-func (in *GPUPool) DeepCopyObject() runtime.Object { +func (in *SchedulingConfigTemplate) DeepCopyObject() runtime.Object { if c := in.DeepCopy(); c != nil { return c } @@ -288,31 +1484,31 @@ func (in *GPUPool) DeepCopyObject() runtime.Object { } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *GPUPoolList) DeepCopyInto(out *GPUPoolList) { +func (in *SchedulingConfigTemplateList) DeepCopyInto(out *SchedulingConfigTemplateList) { *out = *in out.TypeMeta = in.TypeMeta in.ListMeta.DeepCopyInto(&out.ListMeta) if in.Items != nil { in, out := &in.Items, &out.Items - *out = make([]GPUPool, len(*in)) + *out = make([]SchedulingConfigTemplate, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } } } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUPoolList. -func (in *GPUPoolList) DeepCopy() *GPUPoolList { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SchedulingConfigTemplateList. +func (in *SchedulingConfigTemplateList) DeepCopy() *SchedulingConfigTemplateList { if in == nil { return nil } - out := new(GPUPoolList) + out := new(SchedulingConfigTemplateList) in.DeepCopyInto(out) return out } // DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. -func (in *GPUPoolList) DeepCopyObject() runtime.Object { +func (in *SchedulingConfigTemplateList) DeepCopyObject() runtime.Object { if c := in.DeepCopy(); c != nil { return c } @@ -320,82 +1516,96 @@ func (in *GPUPoolList) DeepCopyObject() runtime.Object { } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *GPUPoolSpec) DeepCopyInto(out *GPUPoolSpec) { +func (in *SchedulingConfigTemplateSpec) DeepCopyInto(out *SchedulingConfigTemplateSpec) { *out = *in + in.Placement.DeepCopyInto(&out.Placement) + in.AutoScaling.DeepCopyInto(&out.AutoScaling) + in.ReBalancer.DeepCopyInto(&out.ReBalancer) + in.Hypervisor.DeepCopyInto(&out.Hypervisor) } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUPoolSpec. -func (in *GPUPoolSpec) DeepCopy() *GPUPoolSpec { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SchedulingConfigTemplateSpec. +func (in *SchedulingConfigTemplateSpec) DeepCopy() *SchedulingConfigTemplateSpec { if in == nil { return nil } - out := new(GPUPoolSpec) + out := new(SchedulingConfigTemplateSpec) in.DeepCopyInto(out) return out } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *GPUPoolStatus) DeepCopyInto(out *GPUPoolStatus) { +func (in *SchedulingConfigTemplateStatus) DeepCopyInto(out *SchedulingConfigTemplateStatus) { *out = *in } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUPoolStatus. -func (in *GPUPoolStatus) DeepCopy() *GPUPoolStatus { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SchedulingConfigTemplateStatus. +func (in *SchedulingConfigTemplateStatus) DeepCopy() *SchedulingConfigTemplateStatus { if in == nil { return nil } - out := new(GPUPoolStatus) + out := new(SchedulingConfigTemplateStatus) in.DeepCopyInto(out) return out } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
-func (in *GPUStatus) DeepCopyInto(out *GPUStatus) { +func (in *SmartSchedulerModelInput) DeepCopyInto(out *SmartSchedulerModelInput) { *out = *in - in.Capacity.DeepCopyInto(&out.Capacity) - in.Available.DeepCopyInto(&out.Available) + if in.Enable != nil { + in, out := &in.Enable, &out.Enable + *out = new(bool) + **out = **in + } } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUStatus. -func (in *GPUStatus) DeepCopy() *GPUStatus { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SmartSchedulerModelInput. +func (in *SmartSchedulerModelInput) DeepCopy() *SmartSchedulerModelInput { if in == nil { return nil } - out := new(GPUStatus) + out := new(SmartSchedulerModelInput) in.DeepCopyInto(out) return out } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *Resource) DeepCopyInto(out *Resource) { +func (in *StorageVendorConfig) DeepCopyInto(out *StorageVendorConfig) { *out = *in - out.Tflops = in.Tflops.DeepCopy() - out.Vram = in.Vram.DeepCopy() + if in.InstallCloudNativePGOperator != nil { + in, out := &in.InstallCloudNativePGOperator, &out.InstallCloudNativePGOperator + *out = new(bool) + **out = **in + } + if in.PGExtensions != nil { + in, out := &in.PGExtensions, &out.PGExtensions + *out = make([]string, len(*in)) + copy(*out, *in) + } + in.PGClusterTemplate.DeepCopyInto(&out.PGClusterTemplate) } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Resource. -func (in *Resource) DeepCopy() *Resource { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new StorageVendorConfig. +func (in *StorageVendorConfig) DeepCopy() *StorageVendorConfig { if in == nil { return nil } - out := new(Resource) + out := new(StorageVendorConfig) in.DeepCopyInto(out) return out } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *Resources) DeepCopyInto(out *Resources) { +func (in *Taint) DeepCopyInto(out *Taint) { *out = *in - in.Requests.DeepCopyInto(&out.Requests) - in.Limits.DeepCopyInto(&out.Limits) } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Resources. -func (in *Resources) DeepCopy() *Resources { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Taint. +func (in *Taint) DeepCopy() *Taint { if in == nil { return nil } - out := new(Resources) + out := new(Taint) in.DeepCopyInto(out) return out } @@ -405,8 +1615,8 @@ func (in *TensorFusionCluster) DeepCopyInto(out *TensorFusionCluster) { *out = *in out.TypeMeta = in.TypeMeta in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) - out.Spec = in.Spec - out.Status = in.Status + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TensorFusionCluster. @@ -462,6 +1672,17 @@ func (in *TensorFusionClusterList) DeepCopyObject() runtime.Object { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *TensorFusionClusterSpec) DeepCopyInto(out *TensorFusionClusterSpec) { *out = *in + out.Enroll = in.Enroll + if in.GPUPools != nil { + in, out := &in.GPUPools, &out.GPUPools + *out = make([]GPUPoolDefinition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + in.ComputingVendor.DeepCopyInto(&out.ComputingVendor) + in.StorageVendor.DeepCopyInto(&out.StorageVendor) + in.DataPipelines.DeepCopyInto(&out.DataPipelines) } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TensorFusionClusterSpec. @@ -477,6 +1698,27 @@ func (in *TensorFusionClusterSpec) DeepCopy() *TensorFusionClusterSpec { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *TensorFusionClusterStatus) DeepCopyInto(out *TensorFusionClusterStatus) { *out = *in + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]metav1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.ReadyGPUPools != nil { + in, out := &in.ReadyGPUPools, &out.ReadyGPUPools + *out = make([]string, len(*in)) + copy(*out, *in) + } + if in.NotReadyGPUPools != nil { + in, out := &in.NotReadyGPUPools, &out.NotReadyGPUPools + *out = make([]string, len(*in)) + copy(*out, *in) + } + in.LicenseRenewalTime.DeepCopyInto(&out.LicenseRenewalTime) + in.CloudConnectionStatus.DeepCopyInto(&out.CloudConnectionStatus) + out.StorageStatus = in.StorageStatus + out.ComputingVendorStatus = in.ComputingVendorStatus } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TensorFusionClusterStatus. @@ -578,3 +1820,24 @@ func (in *TensorFusionConnectionStatus) DeepCopy() *TensorFusionConnectionStatus in.DeepCopyInto(out) return out } + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *WorkerConfig) DeepCopyInto(out *WorkerConfig) { + *out = *in + if in.HostNetwork != nil { + in, out := &in.HostNetwork, &out.HostNetwork + *out = new(bool) + **out = **in + } + in.WorkerPodTemplate.DeepCopyInto(&out.WorkerPodTemplate) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WorkerConfig. 
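The generated DeepCopy helpers in this hunk are what controller-runtime relies on to hand out safe copies of cached objects. Below is a minimal sketch of the usual pattern, assuming the repo's `api/v1` import path; the helper name and the controller shape are illustrative, not taken from this PR.

```go
package controller

import (
	"context"

	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"

	tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
)

// updateNodeCapacity is a hypothetical helper showing why the generated
// DeepCopy matters: objects returned by the cached client are shared, so we
// snapshot them before mutating status and send a merge patch instead.
func updateNodeCapacity(ctx context.Context, c client.Client, req ctrl.Request, totalTFlops, availableTFlops int32) error {
	node := &tfv1.GPUNode{}
	if err := c.Get(ctx, req.NamespacedName, node); err != nil {
		return client.IgnoreNotFound(err)
	}

	base := node.DeepCopy() // generated deepcopy from this diff
	node.Status.TotalTFlops = totalTFlops
	node.Status.AvailableTFlops = availableTFlops

	// GPUNode exposes a status subresource, so patch status only,
	// diffing against the untouched copy.
	return c.Status().Patch(ctx, node, client.MergeFrom(base))
}
```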
+func (in *WorkerConfig) DeepCopy() *WorkerConfig { + if in == nil { + return nil + } + out := new(WorkerConfig) + in.DeepCopyInto(out) + return out +} diff --git a/cmd/main.go b/cmd/main.go index ab10624..5c7de61 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -219,6 +219,13 @@ func main() { setupLog.Error(err, "unable to create controller", "controller", "GPUNodeClass") os.Exit(1) } + if err = (&controller.SchedulingConfigTemplateReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "SchedulingConfigTemplate") + os.Exit(1) + } if err = (&controller.PodReconciler{ Client: mgr.GetClient(), Scheme: mgr.GetScheme(), diff --git a/config/crd/bases/tensor-fusion.ai_gpunodeclasses.yaml b/config/crd/bases/tensor-fusion.ai_gpunodeclasses.yaml new file mode 100644 index 0000000..9f0c412 --- /dev/null +++ b/config/crd/bases/tensor-fusion.ai_gpunodeclasses.yaml @@ -0,0 +1,112 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.4 + name: gpunodeclasses.tensor-fusion.ai +spec: + group: tensor-fusion.ai + names: + kind: GPUNodeClass + listKind: GPUNodeClassList + plural: gpunodeclasses + singular: gpunodeclass + scope: Cluster + versions: + - name: v1 + schema: + openAPIV3Schema: + description: GPUNodeClass is the Schema for the gpunodeclasses API. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: GPUNodeClassSpec defines the desired state of GPUNodeClass. + properties: + blockDeviceMappings: + items: + properties: + deviceName: + type: string + ebs: + properties: + deleteOnTermination: + type: boolean + encrypted: + type: boolean + volumeSize: + type: string + volumeType: + type: string + type: object + type: object + type: array + instanceProfile: + type: string + metadataOptions: + properties: + httpEndpoint: + type: string + httpProtocolIPv6: + type: string + httpPutResponseHopLimit: + type: integer + httpTokens: + type: string + type: object + osImageFamily: + type: string + osImageSelectorTerms: + items: + properties: + name: + type: string + owner: + type: string + type: object + type: array + securityGroupSelectorTerms: + items: + properties: + id: + type: string + type: object + type: array + subnetSelectorTerms: + items: + properties: + id: + type: string + type: object + type: array + tags: + additionalProperties: + type: string + type: object + userData: + type: string + type: object + status: + description: GPUNodeClassStatus defines the observed state of GPUNodeClass. 
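The `cmd/main.go` hunk above wires a `SchedulingConfigTemplateReconciler` into the manager, but the controller itself is not part of this section. The following is only a sketch of the minimal shape that registration implies; the `Client`/`Scheme` fields mirror the initializer in `main.go`, and the reconcile body is a placeholder.

```go
package controller

import (
	"context"

	"k8s.io/apimachinery/pkg/runtime"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"

	tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
)

// SchedulingConfigTemplateReconciler reconciles SchedulingConfigTemplate objects.
type SchedulingConfigTemplateReconciler struct {
	client.Client
	Scheme *runtime.Scheme
}

func (r *SchedulingConfigTemplateReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	var tpl tfv1.SchedulingConfigTemplate
	if err := r.Get(ctx, req.NamespacedName, &tpl); err != nil {
		return ctrl.Result{}, client.IgnoreNotFound(err)
	}
	// Placeholder: validate the template and fan its settings out to the
	// GPU pools that reference it via spec.schedulingConfigTemplate.
	return ctrl.Result{}, nil
}

func (r *SchedulingConfigTemplateReconciler) SetupWithManager(mgr ctrl.Manager) error {
	return ctrl.NewControllerManagedBy(mgr).
		For(&tfv1.SchedulingConfigTemplate{}).
		Complete(r)
}
```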
+ type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/config/crd/bases/tensor-fusion.ai_gpunodes.yaml b/config/crd/bases/tensor-fusion.ai_gpunodes.yaml new file mode 100644 index 0000000..6212697 --- /dev/null +++ b/config/crd/bases/tensor-fusion.ai_gpunodes.yaml @@ -0,0 +1,176 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.4 + name: gpunodes.tensor-fusion.ai +spec: + group: tensor-fusion.ai + names: + kind: GPUNode + listKind: GPUNodeList + plural: gpunodes + singular: gpunode + scope: Cluster + versions: + - name: v1 + schema: + openAPIV3Schema: + description: GPUNode is the Schema for the gpunodes API. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: GPUNodeSpec defines the desired state of GPUNode. + properties: + gpuCardIndices: + description: |- + if not all GPU cards should be used, specify the GPU card indices, default to empty, + onboard all GPU cards to the pool + items: + type: integer + type: array + manageMode: + type: string + type: object + status: + description: GPUNodeStatus defines the observed state of GPUNode. + properties: + availableTFlops: + format: int32 + type: integer + availableVRAM: + type: string + conditions: + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. 
+ enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + hypervisorStatus: + properties: + hypervisorState: + type: string + hypervisorVersion: + type: string + lastHeartbeatTime: + format: date-time + type: string + type: object + loadedModels: + items: + type: string + type: array + managedGPUResourceIDs: + items: + type: string + type: array + managedGPUs: + format: int32 + type: integer + nodeInfo: + properties: + architecture: + type: string + gpuCount: + format: int32 + type: integer + gpuDriverVersion: + type: string + gpuModel: + type: string + hostname: + type: string + ip: + type: string + kernalVersion: + type: string + operatingSystem: + type: string + osImage: + type: string + type: object + phase: + description: TensorFusionClusterPhase represents the phase of the + TensorFusionCluster resource. + type: string + totalGPUs: + format: int32 + type: integer + totalTFlops: + format: int32 + type: integer + totalVRAM: + type: string + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/config/crd/bases/tensor-fusion.ai_gpupools.yaml b/config/crd/bases/tensor-fusion.ai_gpupools.yaml new file mode 100644 index 0000000..440bd60 --- /dev/null +++ b/config/crd/bases/tensor-fusion.ai_gpupools.yaml @@ -0,0 +1,564 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.4 + name: gpupools.tensor-fusion.ai +spec: + group: tensor-fusion.ai + names: + kind: GPUPool + listKind: GPUPoolList + plural: gpupools + singular: gpupool + scope: Cluster + versions: + - name: v1 + schema: + openAPIV3Schema: + description: GPUPool is the Schema for the gpupools API. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: GPUPoolSpec defines the desired state of GPUPool. 
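The GPUNode CRD exposes per-node capacity in status (`totalTFlops`, `availableTFlops`, `totalGPUs`), which is the raw material for the pool-level totals tracked by GPUPool. A rough sketch of that aggregation follows, assuming the repo's `api/v1` package and ignoring the node filtering a real pool controller would apply.

```go
package controller

import (
	"context"

	"sigs.k8s.io/controller-runtime/pkg/client"

	tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
)

// poolCapacity is an illustrative summary of GPUNode status fields.
type poolCapacity struct {
	TotalTFlops     int32
	AvailableTFlops int32
	TotalGPUs       int32
	TotalNodes      int32
}

// aggregateNodes sums node-level capacity the way a pool controller might
// when filling in GPUPoolStatus (totalTFlops, availableTFlops, totalNodes, ...).
func aggregateNodes(ctx context.Context, c client.Client) (poolCapacity, error) {
	var nodes tfv1.GPUNodeList
	if err := c.List(ctx, &nodes); err != nil {
		return poolCapacity{}, err
	}
	var sum poolCapacity
	for _, n := range nodes.Items {
		sum.TotalTFlops += n.Status.TotalTFlops
		sum.AvailableTFlops += n.Status.AvailableTFlops
		sum.TotalGPUs += n.Status.TotalGPUs
		sum.TotalNodes++
	}
	return sum, nil
}
```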
+ properties: + capacityConfig: + properties: + maxResources: + properties: + tflops: + description: Tera floating point operations per second + type: string + vram: + description: VRAM is short for Video memory, namely GPU RAM + type: string + type: object + minResources: + properties: + tflops: + description: Tera floating point operations per second + type: string + vram: + description: VRAM is short for Video memory, namely GPU RAM + type: string + type: object + oversubscription: + properties: + tflopsOversellRatio: + description: The multipler of TFlops to oversell, default + to 1 for production, 20 for development + type: string + vramExpandToHostDisk: + description: the percentage of Host Disk appending to GPU + VRAM, default to 70% + type: string + vramExpandToHostMem: + description: the percentage of Host RAM appending to GPU VRAM, + default to 50% + type: string + type: object + warmResources: + properties: + tflops: + description: Tera floating point operations per second + type: string + vram: + description: VRAM is short for Video memory, namely GPU RAM + type: string + type: object + type: object + componentConfig: + description: Customize system components for seamless onboarding. + properties: + client: + properties: + image: + type: string + podTemplateMergePatch: + description: define how to inject the client pod + type: object + x-kubernetes-preserve-unknown-fields: true + port: + type: integer + protocol: + type: string + type: object + hypervisor: + properties: + hypervisorDaemonSetTemplate: + type: object + x-kubernetes-preserve-unknown-fields: true + image: + type: string + type: object + worker: + properties: + hostNetwork: + type: boolean + image: + type: string + port: + type: integer + workerPodTemplate: + type: object + x-kubernetes-preserve-unknown-fields: true + type: object + type: object + nodeManagerConfig: + properties: + nodeCompaction: + properties: + period: + type: string + type: object + nodePoolRollingUpdatePolicy: + properties: + autoUpdate: + description: |- + If set to false, updates will be pending in status, and user needs to manually approve updates. + Updates will occur immediately or during the next maintenance window. + type: boolean + batchInterval: + type: string + batchPercentage: + type: string + duration: + type: string + maintenanceWindow: + properties: + includes: + description: crontab syntax. + items: + type: string + type: array + type: object + type: object + nodeProvisioner: + description: karpenter mode Hypervisor manage GPU nodes and Workers + properties: + nodeClass: + type: string + requirements: + items: + properties: + key: + type: string + operator: + type: string + values: + items: + type: string + type: array + type: object + type: array + taints: + items: + properties: + effect: + type: string + key: + type: string + value: + type: string + type: object + type: array + type: object + nodeSelector: + description: Use existing Kubernetes GPU nodes. + items: + properties: + matchAll: + additionalProperties: + type: string + type: object + matchAny: + additionalProperties: + type: string + type: object + type: object + type: array + type: object + observabilityConfig: + properties: + alert: + properties: + expression: + type: object + x-kubernetes-preserve-unknown-fields: true + type: object + monitor: + properties: + interval: + type: string + type: object + type: object + qosConfig: + description: Define different QoS and their price. 
+ properties: + billingPeriod: + type: string + defaultQoS: + type: string + definitions: + items: + properties: + description: + type: string + name: + type: string + priority: + type: integer + type: object + type: array + pricing: + items: + properties: + limitsOverRequests: + properties: + tflops: + description: Tera floating point operations per second + type: string + vram: + description: VRAM is short for Video memory, namely + GPU RAM + type: string + type: object + qos: + type: string + requests: + properties: + tflops: + description: Tera floating point operations per second + type: string + vram: + description: VRAM is short for Video memory, namely + GPU RAM + type: string + type: object + type: object + type: array + type: object + schedulingConfig: + description: Place the workload to right nodes and scale smart. + properties: + autoScaling: + description: scale the workload based on the usage and traffic + properties: + autoSetLimits: + description: layer 1 vertical auto-scaling, turbo burst to + existing GPU cards fastly + properties: + evaluationPeriod: + type: string + extraTFlopsBufferRatio: + type: string + ignoredDeltaRange: + type: string + maxRatioToRequests: + description: the multiplier of requests, to avoid limit + set too high, like 5.0 + type: string + prediction: + properties: + enable: + type: boolean + historyDataPeriod: + type: string + model: + type: string + predictionPeriod: + type: string + type: object + scaleUpStep: + type: string + type: object + autoSetReplicas: + description: layer 2 horizontal auto-scaling, scale up to + more GPU cards if max limits threshod hit + properties: + enable: + type: boolean + evaluationPeriod: + type: string + scaleDownCoolDownTime: + type: string + scaleDownStep: + type: string + scaleDownUpDownTime: + type: string + scaleUpStep: + type: string + targetTFlopsOfLimits: + type: string + type: object + autoSetRequests: + description: layer 3 adjusting, to match the actual usage + in the long run + properties: + aggregationPeriod: + type: string + evaluationPeriod: + type: string + extraBufferRatio: + description: the request buffer ratio, for example actual + usage is 1.0, 10% buffer will be 1.1 as final preferred + requests + type: string + percentileForAutoRequests: + type: string + prediction: + properties: + enable: + type: boolean + historyDataPeriod: + type: string + model: + type: string + predictionPeriod: + type: string + type: object + type: object + scaleToZero: + description: additional layer to save VRAM, auto-freeze memory + and cool down to RAM and Disk + properties: + autoFreeze: + items: + properties: + enable: + type: boolean + freezeToDiskTTL: + type: string + freezeToMemTTL: + type: string + qos: + type: string + type: object + type: array + intelligenceWarmup: + properties: + enable: + type: boolean + historyDataPeriod: + type: string + model: + type: string + predictionPeriod: + type: string + type: object + type: object + type: object + hypervisor: + description: single GPU device multi-process queuing and fair + scheduling with QoS constraint + properties: + multiProcessQueuing: + properties: + enable: + type: boolean + interval: + type: string + queueLevelTimeSlices: + items: + type: string + type: array + type: object + type: object + placement: + description: place the client or worker to best matched nodes + properties: + allowUsingLocalGPU: + type: boolean + gpuFilters: + items: + description: "GPUFilter is to select eligible GPUs for scheduling.\n\nexample:\n```yaml\n- + type: 
avoidTooMuchConnectionsOnSameGPU\nparams:\n\n\tconnectionNum: + 150\n\n- type: avoidDifferentZone\nparams:\n\n\t# by default, + GPU worker will be scheduled into the same zone as CPU + Client Pod to align AZ and improve performance\n\ttopologyKey: + topology.kubernetes.io/zone\n\n```" + properties: + params: + type: object + x-kubernetes-preserve-unknown-fields: true + type: + type: string + type: object + type: array + mode: + type: string + type: object + reBalancer: + description: |- + avoid hot GPU devices and continuously balance the workload + implemented by trigger a simulation scheduling and advise better GPU nodes for scheduler + properties: + internal: + type: string + reBalanceCoolDownTime: + type: string + threshold: + properties: + matchAny: + type: object + x-kubernetes-preserve-unknown-fields: true + type: object + type: object + type: object + schedulingConfigTemplate: + type: string + type: object + status: + description: GPUPoolStatus defines the observed state of GPUPool. + properties: + availableTFlops: + format: int32 + type: integer + availableVRAM: + type: string + cluster: + type: string + componentStatus: + description: |- + when updating any component version or config, poolcontroller will perform rolling update. + the status will be updated periodically, default to 5s, progress will be 0-100. + when the progress is 100, the component version or config is fully updated. + properties: + client: + type: string + clientConfigSynced: + type: boolean + clientUpdateProgress: + format: int32 + type: integer + hypervisor: + type: string + hypervisorConfigSynced: + type: boolean + hypervisorUpdateProgress: + format: int32 + type: integer + worker: + type: string + workerConfigSynced: + type: boolean + workerUpdateProgress: + format: int32 + type: integer + type: object + conditions: + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. 
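The `capacityConfig.oversubscription` block in the GPUPool schema is expressed in ratios: `tflopsOversellRatio` multiplies the pool's raw TFlops, while `vramExpandToHostMem` and `vramExpandToHostDisk` add a share of host RAM and host disk on top of physical VRAM. The arithmetic sketch below shows one plausible reading of those descriptions; it uses plain numbers, since the CRD stores these knobs as strings and their parsing is not shown in this diff.

```go
package controller

// effectiveCapacity illustrates the oversubscription arithmetic described in
// the GPUPool CRD: TFlops are oversold by a ratio, and VRAM is logically
// expanded by a fraction of host memory and host disk.
func effectiveCapacity(
	physTFlops, tflopsOversellRatio float64, // e.g. 1.0 for production, 20.0 for development
	physVRAMGiB, hostMemGiB, hostDiskGiB float64,
	vramExpandToHostMem, vramExpandToHostDisk float64, // e.g. 0.5 and 0.7
) (tflops, vramGiB float64) {
	tflops = physTFlops * tflopsOversellRatio
	vramGiB = physVRAMGiB + hostMemGiB*vramExpandToHostMem + hostDiskGiB*vramExpandToHostDisk
	return tflops, vramGiB
}
```

For example, 100 physical TFlops with a development oversell ratio of 20 would be advertised as 2000 schedulable TFlops under this reading.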
+ maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + notReadyNodes: + format: int32 + type: integer + phase: + description: TensorFusionClusterPhase represents the phase of the + TensorFusionCluster resource. + type: string + provisioningStatus: + description: |- + If using provisioner, GPU nodes could be outside of the K8S cluster. + The GPUNodes custom resource will be created and deleted automatically. + ProvisioningStatus is to track the status of those outside GPU nodes. + properties: + availableNodes: + format: int32 + type: integer + initializingNodes: + format: int32 + type: integer + terminatingNodes: + format: int32 + type: integer + type: object + readyNodes: + format: int32 + type: integer + totalGPUs: + format: int32 + type: integer + totalNodes: + format: int32 + type: integer + totalTFlops: + format: int32 + type: integer + totalVRAM: + type: string + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml b/config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml new file mode 100644 index 0000000..05f1b16 --- /dev/null +++ b/config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml @@ -0,0 +1,214 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.4 + name: schedulingconfigtemplates.tensor-fusion.ai +spec: + group: tensor-fusion.ai + names: + kind: SchedulingConfigTemplate + listKind: SchedulingConfigTemplateList + plural: schedulingconfigtemplates + singular: schedulingconfigtemplate + scope: Namespaced + versions: + - name: v1 + schema: + openAPIV3Schema: + description: SchedulingConfigTemplate is the Schema for the schedulingconfigtemplates + API. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: Place the workload to right nodes and scale smart. 
+ properties: + autoScaling: + description: scale the workload based on the usage and traffic + properties: + autoSetLimits: + description: layer 1 vertical auto-scaling, turbo burst to existing + GPU cards fastly + properties: + evaluationPeriod: + type: string + extraTFlopsBufferRatio: + type: string + ignoredDeltaRange: + type: string + maxRatioToRequests: + description: the multiplier of requests, to avoid limit set + too high, like 5.0 + type: string + prediction: + properties: + enable: + type: boolean + historyDataPeriod: + type: string + model: + type: string + predictionPeriod: + type: string + type: object + scaleUpStep: + type: string + type: object + autoSetReplicas: + description: layer 2 horizontal auto-scaling, scale up to more + GPU cards if max limits threshod hit + properties: + enable: + type: boolean + evaluationPeriod: + type: string + scaleDownCoolDownTime: + type: string + scaleDownStep: + type: string + scaleDownUpDownTime: + type: string + scaleUpStep: + type: string + targetTFlopsOfLimits: + type: string + type: object + autoSetRequests: + description: layer 3 adjusting, to match the actual usage in the + long run + properties: + aggregationPeriod: + type: string + evaluationPeriod: + type: string + extraBufferRatio: + description: the request buffer ratio, for example actual + usage is 1.0, 10% buffer will be 1.1 as final preferred + requests + type: string + percentileForAutoRequests: + type: string + prediction: + properties: + enable: + type: boolean + historyDataPeriod: + type: string + model: + type: string + predictionPeriod: + type: string + type: object + type: object + scaleToZero: + description: additional layer to save VRAM, auto-freeze memory + and cool down to RAM and Disk + properties: + autoFreeze: + items: + properties: + enable: + type: boolean + freezeToDiskTTL: + type: string + freezeToMemTTL: + type: string + qos: + type: string + type: object + type: array + intelligenceWarmup: + properties: + enable: + type: boolean + historyDataPeriod: + type: string + model: + type: string + predictionPeriod: + type: string + type: object + type: object + type: object + hypervisor: + description: single GPU device multi-process queuing and fair scheduling + with QoS constraint + properties: + multiProcessQueuing: + properties: + enable: + type: boolean + interval: + type: string + queueLevelTimeSlices: + items: + type: string + type: array + type: object + type: object + placement: + description: place the client or worker to best matched nodes + properties: + allowUsingLocalGPU: + type: boolean + gpuFilters: + items: + description: "GPUFilter is to select eligible GPUs for scheduling.\n\nexample:\n```yaml\n- + type: avoidTooMuchConnectionsOnSameGPU\nparams:\n\n\tconnectionNum: + 150\n\n- type: avoidDifferentZone\nparams:\n\n\t# by default, + GPU worker will be scheduled into the same zone as CPU Client + Pod to align AZ and improve performance\n\ttopologyKey: topology.kubernetes.io/zone\n\n```" + properties: + params: + type: object + x-kubernetes-preserve-unknown-fields: true + type: + type: string + type: object + type: array + mode: + type: string + type: object + reBalancer: + description: |- + avoid hot GPU devices and continuously balance the workload + implemented by trigger a simulation scheduling and advise better GPU nodes for scheduler + properties: + internal: + type: string + reBalanceCoolDownTime: + type: string + threshold: + properties: + matchAny: + type: object + x-kubernetes-preserve-unknown-fields: true + type: object + type: object + 
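Because `SchedulingConfigTemplate` is namespaced while the pools that reference it are cluster-scoped, a consumer needs both a name (the pool's `spec.schedulingConfigTemplate`) and a namespace to look it up; the namespace choice is not defined in this diff, so it is a parameter in the sketch below. The `Placement`, `AutoScaling`, `ReBalancer`, and `Hypervisor` field names follow the generated deepcopy code earlier in this diff.

```go
package controller

import (
	"context"

	"k8s.io/apimachinery/pkg/types"
	"sigs.k8s.io/controller-runtime/pkg/client"

	tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
)

// fetchSchedulingTemplate looks up a template by the name a GPUPool carries in
// spec.schedulingConfigTemplate. The namespace to search is an assumption here
// (for example, the operator's own namespace).
func fetchSchedulingTemplate(ctx context.Context, c client.Client, namespace, name string) (*tfv1.SchedulingConfigTemplate, error) {
	var tpl tfv1.SchedulingConfigTemplate
	if err := c.Get(ctx, types.NamespacedName{Namespace: namespace, Name: name}, &tpl); err != nil {
		return nil, err
	}
	// The four top-level knobs mirror the schema above.
	_ = tpl.Spec.Placement   // node/GPU placement filters
	_ = tpl.Spec.AutoScaling // autoSetLimits / autoSetReplicas / autoSetRequests / scaleToZero
	_ = tpl.Spec.ReBalancer  // hot-GPU rebalancing
	_ = tpl.Spec.Hypervisor  // per-GPU multi-process queuing
	return &tpl, nil
}
```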
type: object + status: + description: SchedulingConfigTemplateStatus defines the observed state + of SchedulingConfigTemplate. + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/config/crd/bases/tensor-fusion.ai_tensorfusionclusters.yaml b/config/crd/bases/tensor-fusion.ai_tensorfusionclusters.yaml new file mode 100644 index 0000000..6b2bd90 --- /dev/null +++ b/config/crd/bases/tensor-fusion.ai_tensorfusionclusters.yaml @@ -0,0 +1,686 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.4 + name: tensorfusionclusters.tensor-fusion.ai +spec: + group: tensor-fusion.ai + names: + kind: TensorFusionCluster + listKind: TensorFusionClusterList + plural: tensorfusionclusters + singular: tensorfusioncluster + scope: Cluster + versions: + - name: v1 + schema: + openAPIV3Schema: + description: TensorFusionCluster is the Schema for the tensorfusionclusters + API. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: TensorFusionClusterSpec defines the desired state of TensorFusionCluster. + properties: + computingVendor: + description: ComputingVendorConfig defines the Cloud vendor connection + such as AWS, GCP, Azure etc. + properties: + authType: + type: string + enable: + type: boolean + gpuNodeControllerType: + type: string + name: + type: string + params: + properties: + accessKey: + type: string + iamRole: + type: string + region: + type: string + secretKey: + type: string + type: object + type: + type: string + type: object + dataPipelines: + description: DataPipelinesConfig defines the aggregation jobs that + can make statistics on the data and then report to cloud if configured. + properties: + resources: + properties: + syncPeriod: + type: string + syncToCloud: + type: boolean + type: object + timeseries: + properties: + aggregationDataRetention: + type: string + aggregationPeriods: + items: + type: string + type: array + rawDataRetention: + type: string + remoteWrite: + description: RemoteWriteConfig represents the configuration + for remote write. + properties: + connection: + properties: + type: + type: string + url: + type: string + type: object + metrics: + items: + type: string + type: array + type: object + type: object + type: object + enroll: + description: Enroll to TensorFusion cloud with a enrollment key + properties: + apiEndpoint: + type: string + enrollKey: + properties: + data: + type: string + secretRef: + properties: + name: + type: string + namespace: + type: string + type: object + type: object + type: object + gpuPools: + items: + description: GPUPool defines how to create a GPU pool, could be + URL or inline + properties: + name: + type: string + spec: + description: GPUPoolSpec defines the desired state of GPUPool. 
+ properties: + capacityConfig: + properties: + maxResources: + properties: + tflops: + description: Tera floating point operations per + second + type: string + vram: + description: VRAM is short for Video memory, namely + GPU RAM + type: string + type: object + minResources: + properties: + tflops: + description: Tera floating point operations per + second + type: string + vram: + description: VRAM is short for Video memory, namely + GPU RAM + type: string + type: object + oversubscription: + properties: + tflopsOversellRatio: + description: The multipler of TFlops to oversell, + default to 1 for production, 20 for development + type: string + vramExpandToHostDisk: + description: the percentage of Host Disk appending + to GPU VRAM, default to 70% + type: string + vramExpandToHostMem: + description: the percentage of Host RAM appending + to GPU VRAM, default to 50% + type: string + type: object + warmResources: + properties: + tflops: + description: Tera floating point operations per + second + type: string + vram: + description: VRAM is short for Video memory, namely + GPU RAM + type: string + type: object + type: object + componentConfig: + description: Customize system components for seamless onboarding. + properties: + client: + properties: + image: + type: string + podTemplateMergePatch: + description: define how to inject the client pod + type: object + x-kubernetes-preserve-unknown-fields: true + port: + type: integer + protocol: + type: string + type: object + hypervisor: + properties: + hypervisorDaemonSetTemplate: + type: object + x-kubernetes-preserve-unknown-fields: true + image: + type: string + type: object + worker: + properties: + hostNetwork: + type: boolean + image: + type: string + port: + type: integer + workerPodTemplate: + type: object + x-kubernetes-preserve-unknown-fields: true + type: object + type: object + nodeManagerConfig: + properties: + nodeCompaction: + properties: + period: + type: string + type: object + nodePoolRollingUpdatePolicy: + properties: + autoUpdate: + description: |- + If set to false, updates will be pending in status, and user needs to manually approve updates. + Updates will occur immediately or during the next maintenance window. + type: boolean + batchInterval: + type: string + batchPercentage: + type: string + duration: + type: string + maintenanceWindow: + properties: + includes: + description: crontab syntax. + items: + type: string + type: array + type: object + type: object + nodeProvisioner: + description: karpenter mode Hypervisor manage GPU nodes + and Workers + properties: + nodeClass: + type: string + requirements: + items: + properties: + key: + type: string + operator: + type: string + values: + items: + type: string + type: array + type: object + type: array + taints: + items: + properties: + effect: + type: string + key: + type: string + value: + type: string + type: object + type: array + type: object + nodeSelector: + description: Use existing Kubernetes GPU nodes. + items: + properties: + matchAll: + additionalProperties: + type: string + type: object + matchAny: + additionalProperties: + type: string + type: object + type: object + type: array + type: object + observabilityConfig: + properties: + alert: + properties: + expression: + type: object + x-kubernetes-preserve-unknown-fields: true + type: object + monitor: + properties: + interval: + type: string + type: object + type: object + qosConfig: + description: Define different QoS and their price. 
+ properties: + billingPeriod: + type: string + defaultQoS: + type: string + definitions: + items: + properties: + description: + type: string + name: + type: string + priority: + type: integer + type: object + type: array + pricing: + items: + properties: + limitsOverRequests: + properties: + tflops: + description: Tera floating point operations + per second + type: string + vram: + description: VRAM is short for Video memory, + namely GPU RAM + type: string + type: object + qos: + type: string + requests: + properties: + tflops: + description: Tera floating point operations + per second + type: string + vram: + description: VRAM is short for Video memory, + namely GPU RAM + type: string + type: object + type: object + type: array + type: object + schedulingConfig: + description: Place the workload to right nodes and scale + smart. + properties: + autoScaling: + description: scale the workload based on the usage and + traffic + properties: + autoSetLimits: + description: layer 1 vertical auto-scaling, turbo + burst to existing GPU cards fastly + properties: + evaluationPeriod: + type: string + extraTFlopsBufferRatio: + type: string + ignoredDeltaRange: + type: string + maxRatioToRequests: + description: the multiplier of requests, to + avoid limit set too high, like 5.0 + type: string + prediction: + properties: + enable: + type: boolean + historyDataPeriod: + type: string + model: + type: string + predictionPeriod: + type: string + type: object + scaleUpStep: + type: string + type: object + autoSetReplicas: + description: layer 2 horizontal auto-scaling, scale + up to more GPU cards if max limits threshod hit + properties: + enable: + type: boolean + evaluationPeriod: + type: string + scaleDownCoolDownTime: + type: string + scaleDownStep: + type: string + scaleDownUpDownTime: + type: string + scaleUpStep: + type: string + targetTFlopsOfLimits: + type: string + type: object + autoSetRequests: + description: layer 3 adjusting, to match the actual + usage in the long run + properties: + aggregationPeriod: + type: string + evaluationPeriod: + type: string + extraBufferRatio: + description: the request buffer ratio, for example + actual usage is 1.0, 10% buffer will be 1.1 + as final preferred requests + type: string + percentileForAutoRequests: + type: string + prediction: + properties: + enable: + type: boolean + historyDataPeriod: + type: string + model: + type: string + predictionPeriod: + type: string + type: object + type: object + scaleToZero: + description: additional layer to save VRAM, auto-freeze + memory and cool down to RAM and Disk + properties: + autoFreeze: + items: + properties: + enable: + type: boolean + freezeToDiskTTL: + type: string + freezeToMemTTL: + type: string + qos: + type: string + type: object + type: array + intelligenceWarmup: + properties: + enable: + type: boolean + historyDataPeriod: + type: string + model: + type: string + predictionPeriod: + type: string + type: object + type: object + type: object + hypervisor: + description: single GPU device multi-process queuing + and fair scheduling with QoS constraint + properties: + multiProcessQueuing: + properties: + enable: + type: boolean + interval: + type: string + queueLevelTimeSlices: + items: + type: string + type: array + type: object + type: object + placement: + description: place the client or worker to best matched + nodes + properties: + allowUsingLocalGPU: + type: boolean + gpuFilters: + items: + description: "GPUFilter is to select eligible + GPUs for scheduling.\n\nexample:\n```yaml\n- + type: 
avoidTooMuchConnectionsOnSameGPU\nparams:\n\n\tconnectionNum: + 150\n\n- type: avoidDifferentZone\nparams:\n\n\t# + by default, GPU worker will be scheduled into + the same zone as CPU Client Pod to align AZ + and improve performance\n\ttopologyKey: topology.kubernetes.io/zone\n\n```" + properties: + params: + type: object + x-kubernetes-preserve-unknown-fields: true + type: + type: string + type: object + type: array + mode: + type: string + type: object + reBalancer: + description: |- + avoid hot GPU devices and continuously balance the workload + implemented by trigger a simulation scheduling and advise better GPU nodes for scheduler + properties: + internal: + type: string + reBalanceCoolDownTime: + type: string + threshold: + properties: + matchAny: + type: object + x-kubernetes-preserve-unknown-fields: true + type: object + type: object + type: object + schedulingConfigTemplate: + type: string + type: object + specTemplateUrl: + type: string + type: object + type: array + storageVendor: + description: StorageVendorConfig defines Postgres database with extensions + for timeseries storage and other resource aggregation results, system + events and diagnostics reports etc. + properties: + image: + type: string + installCloudNativePGOperator: + type: boolean + mode: + type: string + pgClusterTemplate: + type: object + x-kubernetes-preserve-unknown-fields: true + pgExtensions: + items: + type: string + type: array + storageClass: + type: string + type: object + type: object + status: + description: TensorFusionClusterStatus defines the observed state of TensorFusionCluster. + properties: + availableLicenses: + format: int32 + type: integer + availableTFlops: + format: int32 + type: integer + availableVRAM: + type: string + cloudConnectionStatus: + properties: + clusterId: + type: string + connectionState: + type: string + lastHeartbeatTime: + format: date-time + type: string + type: object + computingVendorStatus: + properties: + connectionState: + type: string + type: object + conditions: + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. 
+ enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + licenseRenewalTime: + format: date-time + type: string + notReadyGPUPools: + items: + type: string + type: array + phase: + default: Initializing + description: TensorFusionClusterPhase represents the phase of the + TensorFusionCluster resource. + type: string + readyGPUPools: + items: + type: string + type: array + storageStatus: + properties: + connectionState: + type: string + type: object + totalGPUs: + format: int32 + type: integer + totalLicenses: + format: int32 + type: integer + totalNodes: + format: int32 + type: integer + totalPools: + format: int32 + type: integer + totalTFlops: + format: int32 + type: integer + totalVRAM: + type: string + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/config/crd/kustomization.yaml b/config/crd/kustomization.yaml index e3806e5..eb8fcff 100644 --- a/config/crd/kustomization.yaml +++ b/config/crd/kustomization.yaml @@ -8,6 +8,7 @@ resources: - bases/tensor-fusion.ai_gpupools.yaml - bases/tensor-fusion.ai_gpunodes.yaml - bases/tensor-fusion.ai_gpunodeclasses.yaml +- bases/tensor-fusion.ai_schedulingconfigtemplates.yaml # +kubebuilder:scaffold:crdkustomizeresource patches: diff --git a/config/rbac/kustomization.yaml b/config/rbac/kustomization.yaml index 40b884a..88329fc 100644 --- a/config/rbac/kustomization.yaml +++ b/config/rbac/kustomization.yaml @@ -22,6 +22,8 @@ resources: # default, aiding admins in cluster management. Those roles are # not used by the Project itself. You can comment the following lines # if you do not want those helpers be installed with your Project. +- schedulingconfigtemplate_editor_role.yaml +- schedulingconfigtemplate_viewer_role.yaml - gpunodeclass_editor_role.yaml - gpunodeclass_viewer_role.yaml - gpunode_editor_role.yaml diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index a489148..9e9fdd7 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -19,7 +19,12 @@ rules: - apiGroups: - tensor-fusion.ai resources: + - gpunodeclasses + - gpunodes + - gpupools - gpus + - schedulingconfigtemplates + - tensorfusionclusters - tensorfusionconnections verbs: - create @@ -32,14 +37,24 @@ rules: - apiGroups: - tensor-fusion.ai resources: + - gpunodeclasses/finalizers + - gpunodes/finalizers + - gpupools/finalizers - gpus/finalizers + - schedulingconfigtemplates/finalizers + - tensorfusionclusters/finalizers - tensorfusionconnections/finalizers verbs: - update - apiGroups: - tensor-fusion.ai resources: + - gpunodeclasses/status + - gpunodes/status + - gpupools/status - gpus/status + - schedulingconfigtemplates/status + - tensorfusionclusters/status - tensorfusionconnections/status verbs: - get diff --git a/config/rbac/schedulingconfigtemplate_editor_role.yaml b/config/rbac/schedulingconfigtemplate_editor_role.yaml new file mode 100644 index 0000000..019e5e2 --- /dev/null +++ b/config/rbac/schedulingconfigtemplate_editor_role.yaml @@ -0,0 +1,27 @@ +# permissions for end users to edit schedulingconfigtemplates. 
+apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/name: tensor-fusion-operator + app.kubernetes.io/managed-by: kustomize + name: schedulingconfigtemplate-editor-role +rules: +- apiGroups: + - tensor-fusion.ai + resources: + - schedulingconfigtemplates + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - tensor-fusion.ai + resources: + - schedulingconfigtemplates/status + verbs: + - get diff --git a/config/rbac/schedulingconfigtemplate_viewer_role.yaml b/config/rbac/schedulingconfigtemplate_viewer_role.yaml new file mode 100644 index 0000000..33fea0b --- /dev/null +++ b/config/rbac/schedulingconfigtemplate_viewer_role.yaml @@ -0,0 +1,23 @@ +# permissions for end users to view schedulingconfigtemplates. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/name: tensor-fusion-operator + app.kubernetes.io/managed-by: kustomize + name: schedulingconfigtemplate-viewer-role +rules: +- apiGroups: + - tensor-fusion.ai + resources: + - schedulingconfigtemplates + verbs: + - get + - list + - watch +- apiGroups: + - tensor-fusion.ai + resources: + - schedulingconfigtemplates/status + verbs: + - get diff --git a/config/samples/kustomization.yaml b/config/samples/kustomization.yaml index 5a372a2..a08fb10 100644 --- a/config/samples/kustomization.yaml +++ b/config/samples/kustomization.yaml @@ -6,4 +6,5 @@ resources: - v1_gpupool.yaml - v1_gpunode.yaml - v1_gpunodeclass.yaml +- v1_schedulingconfigtemplate.yaml # +kubebuilder:scaffold:manifestskustomizesamples diff --git a/config/samples/v1_schedulingconfigtemplate.yaml b/config/samples/v1_schedulingconfigtemplate.yaml new file mode 100644 index 0000000..07c6beb --- /dev/null +++ b/config/samples/v1_schedulingconfigtemplate.yaml @@ -0,0 +1,9 @@ +apiVersion: tensor-fusion.ai/v1 +kind: SchedulingConfigTemplate +metadata: + labels: + app.kubernetes.io/name: tensor-fusion-operator + app.kubernetes.io/managed-by: kustomize + name: schedulingconfigtemplate-sample +spec: + # TODO(user): Add fields here diff --git a/internal/controller/helper_funcs.go b/internal/controller/helper_funcs.go new file mode 100644 index 0000000..8db1759 --- /dev/null +++ b/internal/controller/helper_funcs.go @@ -0,0 +1,21 @@ +package controller + +// Helper functions to handle finalizers +func containsString(slice []string, s string) bool { + for _, item := range slice { + if item == s { + return true + } + } + return false +} + +func removeString(slice []string, s string) []string { + result := []string{} + for _, item := range slice { + if item != s { + result = append(result, item) + } + } + return result +} diff --git a/internal/controller/schedulingconfigtemplate_controller.go b/internal/controller/schedulingconfigtemplate_controller.go new file mode 100644 index 0000000..85b3911 --- /dev/null +++ b/internal/controller/schedulingconfigtemplate_controller.go @@ -0,0 +1,63 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package controller + +import ( + "context" + + "k8s.io/apimachinery/pkg/runtime" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" + + tensorfusionaiv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1" +) + +// SchedulingConfigTemplateReconciler reconciles a SchedulingConfigTemplate object +type SchedulingConfigTemplateReconciler struct { + client.Client + Scheme *runtime.Scheme +} + +// +kubebuilder:rbac:groups=tensor-fusion.ai,resources=schedulingconfigtemplates,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=tensor-fusion.ai,resources=schedulingconfigtemplates/status,verbs=get;update;patch +// +kubebuilder:rbac:groups=tensor-fusion.ai,resources=schedulingconfigtemplates/finalizers,verbs=update + +// Reconcile is part of the main kubernetes reconciliation loop which aims to +// move the current state of the cluster closer to the desired state. +// TODO(user): Modify the Reconcile function to compare the state specified by +// the SchedulingConfigTemplate object against the actual cluster state, and then +// perform operations to make the cluster state reflect the state specified by +// the user. +// +// For more details, check Reconcile and its Result here: +// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.19.1/pkg/reconcile +func (r *SchedulingConfigTemplateReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + _ = log.FromContext(ctx) + + // TODO(user): your logic here + + return ctrl.Result{}, nil +} + +// SetupWithManager sets up the controller with the Manager. +func (r *SchedulingConfigTemplateReconciler) SetupWithManager(mgr ctrl.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + For(&tensorfusionaiv1.SchedulingConfigTemplate{}). + Named("schedulingconfigtemplate"). + Complete(r) +} diff --git a/internal/controller/schedulingconfigtemplate_controller_test.go b/internal/controller/schedulingconfigtemplate_controller_test.go new file mode 100644 index 0000000..238ca6a --- /dev/null +++ b/internal/controller/schedulingconfigtemplate_controller_test.go @@ -0,0 +1,84 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controller + +import ( + "context" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + tensorfusionaiv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1" +) + +var _ = Describe("SchedulingConfigTemplate Controller", func() { + Context("When reconciling a resource", func() { + const resourceName = "test-resource" + + ctx := context.Background() + + typeNamespacedName := types.NamespacedName{ + Name: resourceName, + Namespace: "default", // TODO(user):Modify as needed + } + schedulingconfigtemplate := &tensorfusionaiv1.SchedulingConfigTemplate{} + + BeforeEach(func() { + By("creating the custom resource for the Kind SchedulingConfigTemplate") + err := k8sClient.Get(ctx, typeNamespacedName, schedulingconfigtemplate) + if err != nil && errors.IsNotFound(err) { + resource := &tensorfusionaiv1.SchedulingConfigTemplate{ + ObjectMeta: metav1.ObjectMeta{ + Name: resourceName, + Namespace: "default", + }, + // TODO(user): Specify other spec details if needed. + } + Expect(k8sClient.Create(ctx, resource)).To(Succeed()) + } + }) + + AfterEach(func() { + // TODO(user): Cleanup logic after each test, like removing the resource instance. + resource := &tensorfusionaiv1.SchedulingConfigTemplate{} + err := k8sClient.Get(ctx, typeNamespacedName, resource) + Expect(err).NotTo(HaveOccurred()) + + By("Cleanup the specific resource instance SchedulingConfigTemplate") + Expect(k8sClient.Delete(ctx, resource)).To(Succeed()) + }) + It("should successfully reconcile the resource", func() { + By("Reconciling the created resource") + controllerReconciler := &SchedulingConfigTemplateReconciler{ + Client: k8sClient, + Scheme: k8sClient.Scheme(), + } + + _, err := controllerReconciler.Reconcile(ctx, reconcile.Request{ + NamespacedName: typeNamespacedName, + }) + Expect(err).NotTo(HaveOccurred()) + // TODO(user): Add more specific assertions depending on your controller's reconciliation logic. + // Example: If you expect a certain status condition after reconciliation, verify it here. + }) + }) +}) diff --git a/internal/controller/tensorfusioncluster_controller.go b/internal/controller/tensorfusioncluster_controller.go index a516fdb..bc7a584 100644 --- a/internal/controller/tensorfusioncluster_controller.go +++ b/internal/controller/tensorfusioncluster_controller.go @@ -25,6 +25,11 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log" tensorfusionaiv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1" + "github.com/NexusGPU/tensor-fusion-operator/internal/constants" +) + +var ( + tensorFusionClusterFinalizer = constants.TensorFusionFinalizer ) // TensorFusionClusterReconciler reconciles a TensorFusionCluster object @@ -37,19 +42,62 @@ type TensorFusionClusterReconciler struct { // +kubebuilder:rbac:groups=tensor-fusion.ai,resources=tensorfusionclusters/status,verbs=get;update;patch // +kubebuilder:rbac:groups=tensor-fusion.ai,resources=tensorfusionclusters/finalizers,verbs=update -// Reconcile is part of the main kubernetes reconciliation loop which aims to -// move the current state of the cluster closer to the desired state. -// TODO(user): Modify the Reconcile function to compare the state specified by -// the TensorFusionCluster object against the actual cluster state, and then -// perform operations to make the cluster state reflect the state specified by -// the user. 
-// -// For more details, check Reconcile and its Result here: -// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.19.1/pkg/reconcile func (r *TensorFusionClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { _ = log.FromContext(ctx) // TODO(user): your logic here + tfc := &tensorfusionaiv1.TensorFusionCluster{} + err := r.Get(ctx, req.NamespacedName, tfc) + if err != nil { + log.FromContext(ctx).Error(err, "unable to fetch TensorFusionCluster") + return ctrl.Result{}, client.IgnoreNotFound(err) + } + + // add a finalizer to the object + if !containsString(tfc.Finalizers, tensorFusionClusterFinalizer) { + tfc.Finalizers = append(tfc.Finalizers, tensorFusionClusterFinalizer) + err = r.Update(ctx, tfc) + if err != nil { + log.FromContext(ctx).Error(err, "unable to update TensorFusionCluster") + return ctrl.Result{}, err + } + } + + // examine DeletionTimestamp to determine if object is under deletion + if tfc.ObjectMeta.DeletionTimestamp.IsZero() { + // The object is not being deleted, so if it does not have our finalizer, + // then we should add the finalizer and update the object. Finally we + // return and requeue the object so that we can pick it up again after + // updating it. + if !containsString(tfc.Finalizers, tensorFusionClusterFinalizer) { + tfc.Finalizers = append(tfc.Finalizers, tensorFusionClusterFinalizer) + if err := r.Update(ctx, tfc); err != nil { + log.FromContext(ctx).Error(err, "unable to update TensorFusionCluster") + return ctrl.Result{}, err + } + // we return and requeue the object so that we can pick it up again after updating it + return ctrl.Result{}, nil + } + } else { + // The object is being deleted + if containsString(tfc.Finalizers, tensorFusionClusterFinalizer) { + // our finalizer is present, so lets handle any external dependency + if err := r.Delete(ctx, tfc); err != nil { + // if fail to delete the external dependency here, return with error + // so that it can be retried + return ctrl.Result{}, err + } + + // remove our finalizer from the list and update it. + tfc.Finalizers = removeString(tfc.Finalizers, tensorFusionClusterFinalizer) + if err := r.Update(ctx, tfc); err != nil { + log.FromContext(ctx).Error(err, "unable to remove finalizer from TensorFusionCluster") + return ctrl.Result{}, err + } + } + // Stop reconciliation as the item is being deleted + return ctrl.Result{}, nil + } return ctrl.Result{}, nil } diff --git a/internal/controller/tensorfusionconnection_controller.go b/internal/controller/tensorfusionconnection_controller.go index e5329f3..c590ff5 100644 --- a/internal/controller/tensorfusionconnection_controller.go +++ b/internal/controller/tensorfusionconnection_controller.go @@ -184,26 +184,6 @@ func (r *TensorFusionConnectionReconciler) handleDeletion(ctx context.Context, c return r.mustUpdateStatus(ctx, connection, gpu) } -// Helper functions to handle finalizers -func containsString(slice []string, s string) bool { - for _, item := range slice { - if item == s { - return true - } - } - return false -} - -func removeString(slice []string, s string) []string { - result := []string{} - for _, item := range slice { - if item != s { - result = append(result, item) - } - } - return result -} - func (r *TensorFusionConnectionReconciler) mustUpdateStatus(ctx context.Context, connection *tfv1.TensorFusionConnection, gpu *tfv1.GPU) error { return retry.RetryOnConflict(retry.DefaultBackoff, func() error { // Get the latest version of the connection
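
One note on the TensorFusionCluster reconcile changes above: the finalizer is appended once before the `DeletionTimestamp` check and then again inside the `IsZero()` branch, and the deletion branch calls `r.Delete(ctx, tfc)` under a comment about cleaning up external dependencies, which deletes the object being finalized rather than anything external. Below is a minimal, hedged sketch of how the same flow could be consolidated with controller-runtime's `controllerutil` finalizer helpers (already available via the module's existing controller-runtime dependency). `reconcileWithFinalizer` and `cleanupExternalResources` are hypothetical names used only for illustration; they are not part of this PR.

```go
package controller

import (
	"context"

	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
	"sigs.k8s.io/controller-runtime/pkg/log"

	tensorfusionaiv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
)

// reconcileWithFinalizer sketches the add/remove-finalizer flow so the
// finalizer is managed in exactly one place and external cleanup stays
// separate from deleting the object itself.
func (r *TensorFusionClusterReconciler) reconcileWithFinalizer(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	logger := log.FromContext(ctx)

	tfc := &tensorfusionaiv1.TensorFusionCluster{}
	if err := r.Get(ctx, req.NamespacedName, tfc); err != nil {
		return ctrl.Result{}, client.IgnoreNotFound(err)
	}

	if tfc.DeletionTimestamp.IsZero() {
		// Not being deleted: ensure our finalizer is present exactly once.
		// AddFinalizer returns true only when it actually modified the object,
		// so Update is called only when needed.
		if controllerutil.AddFinalizer(tfc, tensorFusionClusterFinalizer) {
			if err := r.Update(ctx, tfc); err != nil {
				logger.Error(err, "unable to add finalizer to TensorFusionCluster")
				return ctrl.Result{}, err
			}
		}
		// ... normal reconciliation (pools, nodes, status) would go here ...
		return ctrl.Result{}, nil
	}

	// Being deleted: tear down whatever the cluster owns, then release the finalizer.
	if controllerutil.ContainsFinalizer(tfc, tensorFusionClusterFinalizer) {
		// Hypothetical hook for external cleanup; the cluster object itself is
		// already terminating, so it does not need another Delete call.
		if err := r.cleanupExternalResources(ctx, tfc); err != nil {
			return ctrl.Result{}, err
		}
		controllerutil.RemoveFinalizer(tfc, tensorFusionClusterFinalizer)
		if err := r.Update(ctx, tfc); err != nil {
			logger.Error(err, "unable to remove finalizer from TensorFusionCluster")
			return ctrl.Result{}, err
		}
	}
	return ctrl.Result{}, nil
}

// cleanupExternalResources is a placeholder; real logic would release GPU pool
// state, cloud vendor resources, etc. associated with the cluster.
func (r *TensorFusionClusterReconciler) cleanupExternalResources(ctx context.Context, tfc *tensorfusionaiv1.TensorFusionCluster) error {
	return nil
}
```

If something along these lines were adopted, the `containsString`/`removeString` helpers moved into `internal/controller/helper_funcs.go` in this PR could eventually be dropped once their remaining callers migrate to `controllerutil`.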