Skip to content

Commit

Permalink
feat: init cluster manager and node manager types (#6)
Browse files Browse the repository at this point in the history
* feat: init cluster manager and node manager types

* fix: nested type and raw object issue

* fix: typing enhancement

* feat: fix webhook bug, create tfconn in pod controller (#7)

* chore(deps): bump golang.org/x/crypto from 0.24.0 to 0.31.0 (#9)

Bumps [golang.org/x/crypto](https://github.com/golang/crypto) from 0.24.0 to 0.31.0.
- [Commits](golang/crypto@v0.24.0...v0.31.0)

---
updated-dependencies:
- dependency-name: golang.org/x/crypto
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>

* fix: typing enhancement

* fix: go lint issues

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: 0x5457 <0x5457@protonmail.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
  • Loading branch information
3 people authored Dec 17, 2024
1 parent 6edbf18 commit b79b3da
Show file tree
Hide file tree
Showing 28 changed files with 4,143 additions and 202 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,5 @@ go.work
*.swp
*.swo
*~

.DS_Store
8 changes: 8 additions & 0 deletions PROJECT
Original file line number Diff line number Diff line change
Expand Up @@ -63,4 +63,12 @@ resources:
kind: GPUNodeClass
path: github.com/NexusGPU/tensor-fusion-operator/api/v1
version: v1
- api:
crdVersion: v1
namespaced: true
controller: true
domain: tensor-fusion.ai
kind: SchedulingConfigTemplate
path: github.com/NexusGPU/tensor-fusion-operator/api/v1
version: v1
version: "3"
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@
- kubectl version v1.11.3+.
- Access to a Kubernetes v1.11.3+ cluster.

### Add new API

```bash
kubebuilder create api --group "" --version v1 --kind SchedulingConfigTemplate --namespaced=false
```

### To Deploy on the cluster
**Build and push your image to the location specified by `IMG`:**

Expand Down Expand Up @@ -112,3 +118,4 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.


6 changes: 6 additions & 0 deletions api/v1/base_types.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
package v1

// NameNamespace pairs a name with a namespace.
// Both fields are dropped from the JSON form when empty (omitempty).
// Presumably used to reference a namespaced object; confirm against callers.
type NameNamespace struct {
// Name is serialized under the JSON key "name".
Name string `json:"name,omitempty"`
// Namespace is serialized under the JSON key "namespace".
Namespace string `json:"namespace,omitempty"`
}
56 changes: 50 additions & 6 deletions api/v1/gpunode_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,21 +25,65 @@ import (

// GPUNodeSpec defines the desired state of GPUNode.
type GPUNodeSpec struct {
// INSERT ADDITIONAL SPEC FIELDS - desired state of cluster
// Important: Run "make" to regenerate code after modifying this file
//
// ManageMode holds the node's GPUNodeManageMode value; see the
// GPUNodeManageMode constants in this package for the known values.
ManageMode GPUNodeManageMode `json:"manageMode,omitempty"`

// Foo is an example field of GPUNode. Edit gpunode_types.go to remove/update
// NOTE(review): this reads like the kubebuilder scaffolding placeholder;
// confirm whether it is still meant to be part of the spec.
Foo string `json:"foo,omitempty"`
// GPUCardIndices lists the indices of the GPU cards to use when not all GPU
// cards should be used. It defaults to empty, which onboards all GPU cards
// to the pool.
GPUCardIndices []int `json:"gpuCardIndices,omitempty"`
}

// GPUNodeManageMode is the string-backed type of GPUNodeSpec.ManageMode.
type GPUNodeManageMode string

// Known GPUNodeManageMode values.
//
// NOTE(review): the identifiers do not line up with the strings they carry
// (None -> "manual", Auto -> "selected", Manual -> "provisioned"). The values
// are kept exactly as declared because both the names and the serialized
// strings are part of the API surface; confirm the intended mapping before
// renaming or re-valuing any of them.
const (
	// GPUNodeManageModeNone is serialized as "manual".
	GPUNodeManageModeNone = GPUNodeManageMode("manual")
	// GPUNodeManageModeAuto is serialized as "selected".
	GPUNodeManageModeAuto = GPUNodeManageMode("selected")
	// GPUNodeManageModeManual is serialized as "provisioned".
	GPUNodeManageModeManual = GPUNodeManageMode("provisioned")
)

// GPUNodeStatus defines the observed state of GPUNode.
type GPUNodeStatus struct {
// INSERT ADDITIONAL STATUS FIELD - define observed state of cluster
// Important: Run "make" to regenerate code after modifying this file
//
// Phase is the node's phase.
// NOTE(review): this reuses TensorFusionClusterPhase, which is declared
// outside this view; confirm that sharing the cluster phase type is intended.
Phase TensorFusionClusterPhase `json:"phase,omitempty"`

// Conditions holds the standard metav1.Condition entries for this node.
Conditions []metav1.Condition `json:"conditions,omitempty"`

// TotalTFlops and TotalVRAM describe the node's total capacity.
// NOTE(review): the string format and units of TotalVRAM are not shown
// here; confirm against the code that populates it.
TotalTFlops int32 `json:"totalTFlops,omitempty"`
TotalVRAM string `json:"totalVRAM,omitempty"`

// AvailableTFlops and AvailableVRAM are the counterparts of the totals
// above for capacity that is still available; same format caveat applies.
AvailableTFlops int32 `json:"availableTFlops,omitempty"`
AvailableVRAM string `json:"availableVRAM,omitempty"`

// HypervisorStatus is the state reported for the node's hypervisor.
// NOTE(review): omitempty does not omit struct-typed values in
// encoding/json, so this key is likely always emitted; confirm that is
// acceptable.
HypervisorStatus NodeHypervisorStatus `json:"hypervisorStatus,omitempty"`

// NodeInfo carries descriptive host details; see GPUNodeInfo.
NodeInfo GPUNodeInfo `json:"nodeInfo,omitempty"`

// LoadedModels lists loaded models by string identifier; the identifier
// format is not shown here.
LoadedModels []string `json:"loadedModels,omitempty"`

// TotalGPUs and ManagedGPUs are GPU counts; ManagedGPUResourceIDs lists
// string IDs for the managed GPUs. Presumably ManagedGPUs equals
// len(ManagedGPUResourceIDs); verify against the controller.
TotalGPUs int32 `json:"totalGPUs,omitempty"`
ManagedGPUs int32 `json:"managedGPUs,omitempty"`
ManagedGPUResourceIDs []string `json:"managedGPUResourceIDs,omitempty"`
}

// GPUNodeInfo is the set of descriptive host attributes embedded in
// GPUNodeStatus.NodeInfo. Every field is optional in the JSON form.
type GPUNodeInfo struct {
Hostname string `json:"hostname,omitempty"`
IP string `json:"ip,omitempty"`
// NOTE(review): "Kernal" looks like a misspelling of "Kernel". Both the Go
// name and the JSON key are part of the API surface, so renaming needs a
// deliberate migration.
KernalVersion string `json:"kernalVersion,omitempty"`
OSImage string `json:"osImage,omitempty"`
GPUDriverVersion string `json:"gpuDriverVersion,omitempty"`
GPUModel string `json:"gpuModel,omitempty"`
// GPUCount is a GPU count for the host; how it relates to
// GPUNodeStatus.TotalGPUs is not visible here.
GPUCount int32 `json:"gpuCount,omitempty"`
OperatingSystem string `json:"operatingSystem,omitempty"`
Architecture string `json:"architecture,omitempty"`
}

// NodeHypervisorStatus is the hypervisor state embedded in
// GPUNodeStatus.HypervisorStatus.
type NodeHypervisorStatus struct {
// HypervisorState is a free-form string here; the set of valid states is
// not declared in this view.
HypervisorState string `json:"hypervisorState,omitempty"`
HypervisorVersion string `json:"hypervisorVersion,omitempty"`
// LastHeartbeatTime is the timestamp of the most recent heartbeat.
// NOTE(review): this is a non-pointer metav1.Time, so omitempty will not
// drop it when unset; confirm whether *metav1.Time was intended.
LastHeartbeatTime metav1.Time `json:"lastHeartbeatTime,omitempty"`
}

// +kubebuilder:object:root=true
// +kubebuilder:subresource:status
// +kubebuilder:resource:scope=Cluster

// GPUNode is the Schema for the gpunodes API.
type GPUNode struct {
Expand Down
51 changes: 45 additions & 6 deletions api/v1/gpunodeclass_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,21 +25,60 @@ import (

// GPUNodeClassSpec defines the desired state of GPUNodeClass.
type GPUNodeClassSpec struct {
// INSERT ADDITIONAL SPEC FIELDS - desired state of cluster
// Important: Run "make" to regenerate code after modifying this file
OSImageFamily string `json:"osImageFamily,omitempty"` // The AMI family to use

// Foo is an example field of GPUNodeClass. Edit gpunodeclass_types.go to remove/update
// NOTE(review): this reads like the kubebuilder scaffolding placeholder;
// confirm whether it is still meant to be part of the spec.
Foo string `json:"foo,omitempty"`
// OSImageSelectorTerms are the terms (name/owner pairs) used to select OS images.
OSImageSelectorTerms []NodeClassOSImageSelectorTerms `json:"osImageSelectorTerms,omitempty"`

BlockDeviceMappings []NodeClassBlockDeviceMappings `json:"blockDeviceMappings,omitempty"` // Block device mappings for the instance

InstanceProfile string `json:"instanceProfile,omitempty"` // The instance profile to use

// MetadataOptions configures the instance's HTTP metadata endpoint; see
// NodeClassMetadataOptions.
MetadataOptions NodeClassMetadataOptions `json:"metadataOptions,omitempty"`

// SecurityGroupSelectorTerms are the ID-based terms used to select security groups.
SecurityGroupSelectorTerms []NodeClassItemIDSelectorTerms `json:"securityGroupSelectorTerms,omitempty"`

SubnetSelectorTerms []NodeClassItemIDSelectorTerms `json:"subnetSelectorTerms,omitempty"` // Terms to select subnets

Tags map[string]string `json:"tags,omitempty"` // Tags associated with the resource

UserData string `json:"userData,omitempty"` // User data script for the instance
}

// NodeClassItemIDSelectorTerms selects a single item by its ID. GPUNodeClassSpec
// uses it for both SecurityGroupSelectorTerms and SubnetSelectorTerms.
type NodeClassItemIDSelectorTerms struct {
ID string `json:"id,omitempty"` // The ID of the selected item (a security group or a subnet, depending on the field using it)
}

// NodeClassMetadataOptions groups the settings for an instance's HTTP metadata
// endpoint, as referenced by GPUNodeClassSpec.MetadataOptions.
// NOTE(review): the accepted string values for each option are not declared
// in this view; confirm them against the provider integration.
type NodeClassMetadataOptions struct {
HttpEndpoint string `json:"httpEndpoint,omitempty"` // Whether the HTTP metadata endpoint is enabled
HttpProtocolIPv6 string `json:"httpProtocolIPv6,omitempty"` // Whether IPv6 is enabled for the HTTP metadata endpoint
HttpPutResponseHopLimit int `json:"httpPutResponseHopLimit,omitempty"` // The hop limit for HTTP PUT responses
HttpTokens string `json:"httpTokens,omitempty"` // The HTTP tokens required for metadata access
}

// NodeClassOSImageSelectorTerms is one entry of
// GPUNodeClassSpec.OSImageSelectorTerms, selecting an OS image by name and owner.
// NOTE(review): whether Name supports wildcards and how Name and Owner combine
// is not visible here.
type NodeClassOSImageSelectorTerms struct {
Name string `json:"name,omitempty"`
Owner string `json:"owner,omitempty"`
}

// NodeClassBlockDeviceMappings is one entry of
// GPUNodeClassSpec.BlockDeviceMappings: a device name plus its EBS settings.
type NodeClassBlockDeviceMappings struct {
DeviceName string `json:"deviceName,omitempty"` // The device name for the block device
// Ebs holds the EBS volume settings for this device; see NodeClassEbsSettings.
Ebs NodeClassEbsSettings `json:"ebs,omitempty"`
}

// NodeClassEbsSettings describes the EBS volume attached through a
// NodeClassBlockDeviceMappings entry.
// NOTE(review): because the bool fields use omitempty, an explicit false is
// not serialized and cannot be told apart from "unset"; confirm that is
// acceptable for DeleteOnTermination and Encrypted.
type NodeClassEbsSettings struct {
DeleteOnTermination bool `json:"deleteOnTermination,omitempty"` // Whether to delete the EBS volume on termination
Encrypted bool `json:"encrypted,omitempty"` // Whether the EBS volume is encrypted
VolumeSize string `json:"volumeSize,omitempty"` // The size of the EBS volume (string; units/format not shown here)
VolumeType string `json:"volumeType,omitempty"` // The type of the EBS volume
}

// GPUNodeClassStatus defines the observed state of GPUNodeClass.
// It currently declares no fields.
type GPUNodeClassStatus struct {
// INSERT ADDITIONAL STATUS FIELD - define observed state of cluster
// Important: Run "make" to regenerate code after modifying this file
}

// +kubebuilder:object:root=true
// +kubebuilder:subresource:status
// +kubebuilder:resource:scope=Cluster

// GPUNodeClass is the Schema for the gpunodeclasses API.
type GPUNodeClass struct {
Expand Down
Loading

0 comments on commit b79b3da

Please sign in to comment.