Skip to content

Maintenance Update #638

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Jun 3, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 20 additions & 3 deletions .github/workflows/run-tests.yml
Original file line number Diff line number Diff line change
@@ -50,6 +50,23 @@ jobs:
sleep 1
done

- name: Run e2e tests
run: go test -timeout 30m ./test/e2e/...

- name: Run e2e tests (VMs)
run: go test -timeout 90m ./test/e2e/v2/gpu_groups ./test/e2e/v2/gpu_leases ./test/e2e/v2/vms

- name: Run e2e tests (Deployments)
run: go test -timeout 90m ./test/e2e/v2/deployments

- name: Run e2e tests (SMs)
run: go test -timeout 90m ./test/e2e/v2/sms

- name: Run e2e tests (Jobs)
run: go test -timeout 90m ./test/e2e/v2/jobs

- name: Run e2e tests (Resource Migrations)
run: go test -timeout 90m ./test/e2e/v2/resource_migrations

- name: Run e2e tests (Users)
run: go test -timeout 90m ./test/e2e/v2/users ./test/e2e/v2/teams ./test/e2e/v2/notifications

- name: Run e2e tests (System)
run: go test -timeout 90m ./test/e2e/v2/zones
4 changes: 2 additions & 2 deletions dto/v2/body/vm_port.go
Original file line number Diff line number Diff line change
@@ -30,8 +30,8 @@ type CustomDomainRead struct {
}

type HttpProxyRead struct {
Name string `json:"name" bson:"name,omitempty" binding:"required,rfc1035,min=3,max=30"`
URL *string `json:"url,omitempty,omitempty"`
Name string `json:"name" bson:"name"`
URL *string `json:"url,omitempty" bson:"url,omitempty"`
CustomDomain *CustomDomainRead `json:"customDomain,omitempty" bson:"customDomain,omitempty"`
}

11 changes: 1 addition & 10 deletions models/config/config.go
Original file line number Diff line number Diff line change
@@ -67,6 +67,7 @@ type ConfigType struct {
PrivilegedGPUs []string `yaml:"privilegedGpus"`
ExcludedHosts []string `yaml:"excludedHosts"`
ExcludedGPUs []string `yaml:"excludedGpus"`
AddMock bool `yaml:"addMock"`
} `yaml:"gpu"`

Registry struct {
@@ -101,16 +102,6 @@ type ConfigType struct {
Password string `yaml:"password"`
}

SysApi struct {
URL string `yaml:"url"`
User string `yaml:"user"`
Password string `yaml:"password"`
ClientID string `yaml:"clientId"`
// UseMock is a flag that indicates whether the sys-api should be mocked
// This is useful for testing purposes, since the sys-api cannot be run locally
UseMock bool `yaml:"useMock"`
} `yaml:"sys-api"`

Harbor struct {
URL string `yaml:"url"`
User string `yaml:"user"`
18 changes: 0 additions & 18 deletions models/model/deployment.go
Original file line number Diff line number Diff line change
@@ -76,24 +76,6 @@ func (deployment *Deployment) GetURL(externalPort *int) *string {
return nil
}

// GetCustomDomainURL builds the https URL for the custom domain of the deployment's main app.
// It returns nil when the deployment has no main app, when the main app has no custom domain,
// or when the custom domain's Domain field is empty.
// Only the Domain field is read: the custom domain's status is not inspected and
// no ingress lookup is performed.
func (deployment *Deployment) GetCustomDomainURL() *string {
	mainApp := deployment.GetMainApp()
	if mainApp == nil {
		return nil
	}

	customDomain := mainApp.CustomDomain
	if customDomain == nil || len(customDomain.Domain) == 0 {
		return nil
	}

	result := fmt.Sprintf("https://%s", customDomain.Domain)
	return &result
}

// Ready returns true if the deployment is not being created or deleted.
func (deployment *Deployment) Ready() bool {
return !deployment.DoingActivity(ActivityBeingCreated) && !deployment.DoingActivity(ActivityBeingDeleted)
6 changes: 5 additions & 1 deletion models/model/deployment_convert.go
Original file line number Diff line number Diff line change
@@ -87,9 +87,13 @@ func (deployment *Deployment) ToDTO(smURL *string, externalPort *int, teams []st

var customDomain *body.CustomDomainRead
if app.CustomDomain != nil {
extPortStr := ""
if externalPort != nil && *externalPort != 443 {
extPortStr = fmt.Sprintf(":%d", *externalPort)
}
customDomain = &body.CustomDomainRead{
Domain: app.CustomDomain.Domain,
URL: fmt.Sprintf("https://%s", app.CustomDomain.Domain),
URL: fmt.Sprintf("https://%s%s", app.CustomDomain.Domain, extPortStr),
Status: app.CustomDomain.Status,
Secret: app.CustomDomain.Secret,
}
22 changes: 22 additions & 0 deletions models/model/vm.go
Original file line number Diff line number Diff line change
@@ -69,6 +69,28 @@ func (vm *VM) BeingDeleted() bool {
return vm.DoingActivity(ActivityBeingDeleted)
}

// GetHttpProxyURL builds the https URL for the VM's HTTP proxy called name.
// The proxy's K8s ingress is looked up as "<vm name>-<name>". Nil is returned when that
// ingress is missing, is not created, or when its first host is absent or empty.
// If externalPort is non-nil and differs from 443, it is appended to the URL as ":<port>".
func (vm *VM) GetHttpProxyURL(name string, externalPort *int) *string {
	ingressName := fmt.Sprintf("%s-%s", vm.Name, name)
	ingress := vm.Subsystems.K8s.GetIngress(ingressName)
	if ingress == nil || !ingress.Created() {
		return nil
	}

	if len(ingress.Hosts) == 0 || len(ingress.Hosts[0]) == 0 {
		return nil
	}

	proxyURL := fmt.Sprintf("https://%s", ingress.Hosts[0])
	if externalPort != nil && *externalPort != 443 {
		// A port other than 443 has to be spelled out in the URL
		proxyURL = fmt.Sprintf("%s:%d", proxyURL, *externalPort)
	}

	return &proxyURL
}

func (vm *VM) GetExternalPort(privatePort int, protocol string) *int {
pfrName := fmt.Sprintf("priv-%d-prot-%s", privatePort, protocol)
service := vm.Subsystems.K8s.GetService(fmt.Sprintf("%s-%s", vm.Name, pfrName))
15 changes: 12 additions & 3 deletions models/model/vm_convert.go
Original file line number Diff line number Diff line change
@@ -9,7 +9,7 @@ import (
)

// ToDTOv2 converts a VM to a body.VmRead.
func (vm *VM) ToDTOv2(gpuLease *GpuLease, teams []string, sshConnectionString *string) body.VmRead {
func (vm *VM) ToDTOv2(gpuLease *GpuLease, teams []string, externalPort *int, sshConnectionString *string) body.VmRead {
var host *string
if vm.Host != nil {
host = &vm.Host.Name
@@ -42,17 +42,26 @@ func (vm *VM) ToDTOv2(gpuLease *GpuLease, teams []string, sshConnectionString *s

var httpProxy *body.HttpProxyRead
if port.HttpProxy != nil {
extPortStr := ""
if externalPort != nil && *externalPort != 443 {
extPortStr = fmt.Sprintf(":%d", *externalPort)
}

var customDomain *body.CustomDomainRead
if port.HttpProxy.CustomDomain != nil {
customDomain = &body.CustomDomainRead{
Domain: port.HttpProxy.CustomDomain.Domain,
URL: fmt.Sprintf("https://%s", port.HttpProxy.CustomDomain.Domain),
URL: fmt.Sprintf("https://%s%s", port.HttpProxy.CustomDomain.Domain, extPortStr),
Secret: port.HttpProxy.CustomDomain.Secret,
Status: port.HttpProxy.CustomDomain.Status,
}
}

httpProxy = &body.HttpProxyRead{Name: port.HttpProxy.Name, CustomDomain: customDomain}
httpProxy = &body.HttpProxyRead{
Name: port.HttpProxy.Name,
URL: vm.GetHttpProxyURL(port.HttpProxy.Name, externalPort),
CustomDomain: customDomain,
}
}

ports = append(ports, body.PortRead{
2 changes: 2 additions & 0 deletions pkg/app/status_codes/code.go
Original file line number Diff line number Diff line change
@@ -21,6 +21,7 @@ const (
ResourceMountFailed = 10040
ResourceImagePullFailed = 10041
ResourceDisabled = 10042
ResourceUnschedulable = 10043

JobPending = 10140
JobFinished = 10141
@@ -56,6 +57,7 @@ var MsgFlags = map[int]string{
ResourceMountFailed: "resourceMountFailed",
ResourceImagePullFailed: "resourceImagePullFailed",
ResourceDisabled: "resourceDisabled",
ResourceUnschedulable: "resourceUnschedulable",

JobPending: "pending",
JobRunning: "running",
2 changes: 1 addition & 1 deletion pkg/db/key_value/client.go
Original file line number Diff line number Diff line change
@@ -107,7 +107,7 @@ func (client *Client) SetUpExpirationListener(ctx context.Context, pattern strin
err := handler(msg.Payload)
if err != nil {
utils.PrettyPrintError(fmt.Errorf("failed to handle expired key event for key %s. details: %w", msg.Payload, err))
return
continue
}
}
}
1 change: 1 addition & 0 deletions pkg/services/logger/pod_event_listener.go
Original file line number Diff line number Diff line change
@@ -44,6 +44,7 @@ func PodEventListener(ctx context.Context) error {

if !exists {
// Clean up the keys
log.Printf("Pod %s not longer exists. Cleaning up keys", podName)
_ = kvc.Del(LogKey(podName))
_ = kvc.Del(LastLogKey(podName))
_ = kvc.Del(OwnerLogKey(podName))
2 changes: 2 additions & 0 deletions pkg/services/status_update/vm_status_listener.go
Original file line number Diff line number Diff line change
@@ -86,6 +86,8 @@ func parseVmStatus(status *model.VmStatus) string {
statusCode = status_codes.ResourceStopping
case "Terminating":
statusCode = status_codes.ResourceDeleting
case "ErrorUnschedulable":
statusCode = status_codes.ResourceUnschedulable
case "CrashLoopBackOff", "Unknown", "Unschedulable", "ErrImagePull", "ImagePullBackOff", "PvcNotFound", "DataVolumeError":
statusCode = status_codes.ResourceError
default:
47 changes: 45 additions & 2 deletions pkg/services/synchronize/fetch_gpu.go
Original file line number Diff line number Diff line change
@@ -157,11 +157,54 @@ func listLatestGPUs() (*body.SystemGpuInfo, error) {
return nil, makeError(err)
}

var result *body.SystemGpuInfo
if len(systemGpuInfo) > 0 {
return &systemGpuInfo[0].GpuInfo, nil
result = &systemGpuInfo[0].GpuInfo
}

return nil, nil
if config.Config.GPU.AddMock {
// Add two mock hosts, each with mock GPUs, in every enabled zone
for _, zone := range config.Config.EnabledZones() {
if result == nil {
result = &body.SystemGpuInfo{}
}

result.HostGpuInfo = append(result.HostGpuInfo, body.HostGpuInfo{
HostBase: body.HostBase{
Name: "Mock Host 1",
DisplayName: "Mock Host 1",
Zone: zone.Name,
},
GPUs: []host_api.GpuInfo{{
Name: "Mock GPU 1",
Vendor: "NVIDIA",
VendorID: "10de",
DeviceID: "1eb0",
}, {
Name: "Mock GPU 2",
Vendor: "NVIDIA",
VendorID: "10de",
DeviceID: "2230",
}},
})

result.HostGpuInfo = append(result.HostGpuInfo, body.HostGpuInfo{
HostBase: body.HostBase{
Name: "Mock Host 2",
DisplayName: "Mock Host 2",
Zone: zone.Name,
},
GPUs: []host_api.GpuInfo{{
Name: "Mock GPU 1",
Vendor: "NVIDIA",
VendorID: "10de",
DeviceID: "1eb0",
}},
})
}
}

return result, nil
}

func createGpuGroupName(gpu *host_api.GpuInfo) *string {
16 changes: 12 additions & 4 deletions pkg/subsystems/k8s/status.go
Original file line number Diff line number Diff line change
@@ -90,7 +90,9 @@ func (client *Client) deploymentStatusWatcher(ctx context.Context, handler func(
}
}
case <-recreateInterval:
watcher.Stop()
if watcher != nil {
watcher.Stop()
}
watcher, err = setupDeploymentWatcher(client.Namespace)
if err != nil {
log.Println("Failed to restart Deployment status watcher, sleeping for 10 seconds before retrying")
@@ -141,7 +143,9 @@ func (client *Client) vmStatusWatcher(ctx context.Context, handler func(string,
handler(vmStatus.Name, vmStatus)
}
case <-recreateInterval:
watcher.Stop()
if watcher != nil {
watcher.Stop()
}
watcher, err = setupVmWatcher(client.Namespace)
if err != nil {
log.Println("Failed to restart VM status watcher, sleeping for 10 seconds before retrying")
@@ -192,7 +196,9 @@ func (client *Client) vmiStatusWatcher(ctx context.Context, handler func(string,
handler(vmiStatus.Name, vmiStatus)
}
case <-recreateInterval:
watcher.Stop()
if watcher != nil {
watcher.Stop()
}
watcher, err = setupVmWatcher(client.Namespace)
if err != nil {
log.Println("Failed to restart VM instance status watcher, sleeping for 10 seconds before retrying")
@@ -340,7 +346,9 @@ func (client *Client) eventWatcher(ctx context.Context, handler func(string, int
}(e)
}
case <-recreateInterval:
watcher.Stop()
if watcher != nil {
watcher.Stop()
}
watcher, err = setupEventWatcher(client.Namespace)
if err != nil {
log.Println("Failed to restart Event status watcher, sleeping for 10 seconds before retrying")
26 changes: 24 additions & 2 deletions routers/api/v2/vm.go
Original file line number Diff line number Diff line change
@@ -10,12 +10,15 @@ import (
configModels "go-deploy/models/config"
"go-deploy/models/model"
"go-deploy/models/version"
"go-deploy/pkg/config"
"go-deploy/pkg/sys"
"go-deploy/service"
sErrors "go-deploy/service/errors"
teamOpts "go-deploy/service/v2/teams/opts"
v2Utils "go-deploy/service/v2/utils"
"go-deploy/service/v2/vms/opts"
"strconv"
"strings"
)

// GetVM
@@ -75,7 +78,7 @@ func GetVM(c *gin.Context) {
sshConnectionString, _ := deployV2.VMs().SshConnectionString(vm.ID)

lease, _ := deployV2.VMs().GpuLeases().GetByVmID(vm.ID)
context.Ok(vm.ToDTOv2(lease, teamIDs, sshConnectionString))
context.Ok(vm.ToDTOv2(lease, teamIDs, getVmAppExternalPort(vm.Zone), sshConnectionString))
}

// ListVMs
@@ -139,7 +142,7 @@ func ListVMs(c *gin.Context) {
teamIDs, _ := deployV2.Teams().ListIDs(teamOpts.ListOpts{ResourceID: vm.ID})
sshConnectionString, _ := deployV2.VMs().SshConnectionString(vm.ID)
lease, _ := deployV2.VMs().GpuLeases().GetByVmID(vm.ID)
dtoVMs[i] = vm.ToDTOv2(lease, teamIDs, sshConnectionString)
dtoVMs[i] = vm.ToDTOv2(lease, teamIDs, getVmAppExternalPort(vm.Zone), sshConnectionString)
}

context.Ok(dtoVMs)
@@ -408,3 +411,22 @@ func UpdateVM(c *gin.Context) {
JobID: &jobID,
})
}

// getVmAppExternalPort extracts the port from the ParentVmApp domain of the zone called zoneName.
// The domain is split on ":" and the second segment is parsed as an integer.
// Nil is returned when the zone is not found, when the domain contains no ":",
// or when the second segment is not a valid integer.
func getVmAppExternalPort(zoneName string) *int {
	zone := config.Config.GetZone(zoneName)
	if zone == nil {
		return nil
	}

	parts := strings.Split(zone.Domains.ParentVmApp, ":")
	if len(parts) < 2 {
		return nil
	}

	port, err := strconv.Atoi(parts[1])
	if err != nil {
		return nil
	}

	return &port
}
8 changes: 1 addition & 7 deletions scripts/local/config.yml.tmpl
Original file line number Diff line number Diff line change
@@ -70,6 +70,7 @@ gpu:
privilegedGpus:
excludedHosts:
excludedGpus:
addMock: true

deployment:
defaultZone: local
@@ -165,13 +166,6 @@ redis:
url: $redis_url
password: $redis_password

sys-api:
url:
user:
password:
clientId:
useMock: true

harbor:
url: $harbor_url
user: $harbor_user
Loading
Loading