Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Maintenance Update #638

Merged
merged 14 commits into from
Jun 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 20 additions & 3 deletions .github/workflows/run-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,23 @@ jobs:
sleep 1
done

- name: Run e2e tests
run: go test -timeout 30m ./test/e2e/...

- name: Run e2e tests (VMs)
run: go test -timeout 90m ./test/e2e/v2/gpu_groups ./test/e2e/v2/gpu_leases ./test/e2e/v2/vms

- name: Run e2e tests (Deployments)
run: go test -timeout 90m ./test/e2e/v2/deployments

- name: Run e2e tests (SMs)
run: go test -timeout 90m ./test/e2e/v2/sms

- name: Run e2e tests (Jobs)
run: go test -timeout 90m ./test/e2e/v2/jobs

- name: Run e2e tests (Resource Migrations)
run: go test -timeout 90m ./test/e2e/v2/resource_migrations

- name: Run e2e tests (Users)
run: go test -timeout 90m ./test/e2e/v2/users ./test/e2e/v2/teams ./test/e2e/v2/notifications

- name: Run e2e tests (System)
run: go test -timeout 90m ./test/e2e/v2/zones
4 changes: 2 additions & 2 deletions dto/v2/body/vm_port.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ type CustomDomainRead struct {
}

type HttpProxyRead struct {
Name string `json:"name" bson:"name,omitempty" binding:"required,rfc1035,min=3,max=30"`
URL *string `json:"url,omitempty,omitempty"`
Name string `json:"name" bson:"name"`
URL *string `json:"url,omitempty" bson:"url,omitempty"`
CustomDomain *CustomDomainRead `json:"customDomain,omitempty" bson:"customDomain,omitempty"`
}

Expand Down
11 changes: 1 addition & 10 deletions models/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ type ConfigType struct {
PrivilegedGPUs []string `yaml:"privilegedGpus"`
ExcludedHosts []string `yaml:"excludedHosts"`
ExcludedGPUs []string `yaml:"excludedGpus"`
AddMock bool `yaml:"addMock"`
} `yaml:"gpu"`

Registry struct {
Expand Down Expand Up @@ -101,16 +102,6 @@ type ConfigType struct {
Password string `yaml:"password"`
}

SysApi struct {
URL string `yaml:"url"`
User string `yaml:"user"`
Password string `yaml:"password"`
ClientID string `yaml:"clientId"`
// UseMock is a flag that indicates whether the sys-api should be mocked
// This is useful for testing purposes, since the sys-api cannot be run locally
UseMock bool `yaml:"useMock"`
} `yaml:"sys-api"`

Harbor struct {
URL string `yaml:"url"`
User string `yaml:"user"`
Expand Down
18 changes: 0 additions & 18 deletions models/model/deployment.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,24 +76,6 @@ func (deployment *Deployment) GetURL(externalPort *int) *string {
return nil
}

// GetCustomDomainURL builds the "https://<domain>" URL for the custom domain of
// the deployment's main app.
// It returns nil when the deployment has no main app, when the main app has no
// custom domain, or when the custom domain's Domain is empty.
// Neither the custom domain's status nor the existence of an ingress is checked.
func (deployment *Deployment) GetCustomDomainURL() *string {
	mainApp := deployment.GetMainApp()
	if mainApp == nil {
		return nil
	}

	customDomain := mainApp.CustomDomain
	if customDomain == nil || len(customDomain.Domain) == 0 {
		return nil
	}

	result := fmt.Sprintf("https://%s", customDomain.Domain)
	return &result
}

// Ready returns true if the deployment is not being created or deleted.
func (deployment *Deployment) Ready() bool {
return !deployment.DoingActivity(ActivityBeingCreated) && !deployment.DoingActivity(ActivityBeingDeleted)
Expand Down
6 changes: 5 additions & 1 deletion models/model/deployment_convert.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,13 @@ func (deployment *Deployment) ToDTO(smURL *string, externalPort *int, teams []st

var customDomain *body.CustomDomainRead
if app.CustomDomain != nil {
extPortStr := ""
if externalPort != nil && *externalPort != 443 {
extPortStr = fmt.Sprintf(":%d", *externalPort)
}
customDomain = &body.CustomDomainRead{
Domain: app.CustomDomain.Domain,
URL: fmt.Sprintf("https://%s", app.CustomDomain.Domain),
URL: fmt.Sprintf("https://%s%s", app.CustomDomain.Domain, extPortStr),
Status: app.CustomDomain.Status,
Secret: app.CustomDomain.Secret,
}
Expand Down
22 changes: 22 additions & 0 deletions models/model/vm.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,28 @@ func (vm *VM) BeingDeleted() bool {
return vm.DoingActivity(ActivityBeingDeleted)
}

// GetHttpProxyURL builds the HTTPS URL of the VM's HTTP proxy called name.
// The K8s ingress is looked up as "<vm name>-<name>". The result is nil when that
// ingress is missing or not created, or when its first host is absent or empty.
// When externalPort is non-nil and not 443, ":<port>" is appended to the URL.
func (vm *VM) GetHttpProxyURL(name string, externalPort *int) *string {
	ingressName := fmt.Sprintf("%s-%s", vm.Name, name)
	ingress := vm.Subsystems.K8s.GetIngress(ingressName)
	if ingress == nil || !ingress.Created() {
		return nil
	}

	if len(ingress.Hosts) == 0 || len(ingress.Hosts[0]) == 0 {
		return nil
	}

	proxyURL := fmt.Sprintf("https://%s", ingress.Hosts[0])
	if externalPort != nil && *externalPort != 443 {
		// Only a port other than 443 is spelled out in the URL.
		proxyURL = fmt.Sprintf("%s:%d", proxyURL, *externalPort)
	}

	return &proxyURL
}

func (vm *VM) GetExternalPort(privatePort int, protocol string) *int {
pfrName := fmt.Sprintf("priv-%d-prot-%s", privatePort, protocol)
service := vm.Subsystems.K8s.GetService(fmt.Sprintf("%s-%s", vm.Name, pfrName))
Expand Down
15 changes: 12 additions & 3 deletions models/model/vm_convert.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import (
)

// ToDTOv2 converts a VM to a body.VmRead.
func (vm *VM) ToDTOv2(gpuLease *GpuLease, teams []string, sshConnectionString *string) body.VmRead {
func (vm *VM) ToDTOv2(gpuLease *GpuLease, teams []string, externalPort *int, sshConnectionString *string) body.VmRead {
var host *string
if vm.Host != nil {
host = &vm.Host.Name
Expand Down Expand Up @@ -42,17 +42,26 @@ func (vm *VM) ToDTOv2(gpuLease *GpuLease, teams []string, sshConnectionString *s

var httpProxy *body.HttpProxyRead
if port.HttpProxy != nil {
extPortStr := ""
if externalPort != nil && *externalPort != 443 {
extPortStr = fmt.Sprintf(":%d", *externalPort)
}

var customDomain *body.CustomDomainRead
if port.HttpProxy.CustomDomain != nil {
customDomain = &body.CustomDomainRead{
Domain: port.HttpProxy.CustomDomain.Domain,
URL: fmt.Sprintf("https://%s", port.HttpProxy.CustomDomain.Domain),
URL: fmt.Sprintf("https://%s%s", port.HttpProxy.CustomDomain.Domain, extPortStr),
Secret: port.HttpProxy.CustomDomain.Secret,
Status: port.HttpProxy.CustomDomain.Status,
}
}

httpProxy = &body.HttpProxyRead{Name: port.HttpProxy.Name, CustomDomain: customDomain}
httpProxy = &body.HttpProxyRead{
Name: port.HttpProxy.Name,
URL: vm.GetHttpProxyURL(port.HttpProxy.Name, externalPort),
CustomDomain: customDomain,
}
}

ports = append(ports, body.PortRead{
Expand Down
2 changes: 2 additions & 0 deletions pkg/app/status_codes/code.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ const (
ResourceMountFailed = 10040
ResourceImagePullFailed = 10041
ResourceDisabled = 10042
ResourceUnschedulable = 10043

JobPending = 10140
JobFinished = 10141
Expand Down Expand Up @@ -56,6 +57,7 @@ var MsgFlags = map[int]string{
ResourceMountFailed: "resourceMountFailed",
ResourceImagePullFailed: "resourceImagePullFailed",
ResourceDisabled: "resourceDisabled",
ResourceUnschedulable: "resourceUnschedulable",

JobPending: "pending",
JobRunning: "running",
Expand Down
2 changes: 1 addition & 1 deletion pkg/db/key_value/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ func (client *Client) SetUpExpirationListener(ctx context.Context, pattern strin
err := handler(msg.Payload)
if err != nil {
utils.PrettyPrintError(fmt.Errorf("failed to handle expired key event for key %s. details: %w", msg.Payload, err))
return
continue
}
}
}
Expand Down
1 change: 1 addition & 0 deletions pkg/services/logger/pod_event_listener.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ func PodEventListener(ctx context.Context) error {

if !exists {
// Clean up the keys
log.Printf("Pod %s not longer exists. Cleaning up keys", podName)
_ = kvc.Del(LogKey(podName))
_ = kvc.Del(LastLogKey(podName))
_ = kvc.Del(OwnerLogKey(podName))
Expand Down
2 changes: 2 additions & 0 deletions pkg/services/status_update/vm_status_listener.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,8 @@ func parseVmStatus(status *model.VmStatus) string {
statusCode = status_codes.ResourceStopping
case "Terminating":
statusCode = status_codes.ResourceDeleting
case "ErrorUnschedulable":
statusCode = status_codes.ResourceUnschedulable
case "CrashLoopBackOff", "Unknown", "Unschedulable", "ErrImagePull", "ImagePullBackOff", "PvcNotFound", "DataVolumeError":
statusCode = status_codes.ResourceError
default:
Expand Down
47 changes: 45 additions & 2 deletions pkg/services/synchronize/fetch_gpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -157,11 +157,54 @@ func listLatestGPUs() (*body.SystemGpuInfo, error) {
return nil, makeError(err)
}

var result *body.SystemGpuInfo
if len(systemGpuInfo) > 0 {
return &systemGpuInfo[0].GpuInfo, nil
result = &systemGpuInfo[0].GpuInfo
}

return nil, nil
if config.Config.GPU.AddMock {
// Add two mock hosts (three mock GPUs in total) to each enabled zone
for _, zone := range config.Config.EnabledZones() {
if result == nil {
result = &body.SystemGpuInfo{}
}

result.HostGpuInfo = append(result.HostGpuInfo, body.HostGpuInfo{
HostBase: body.HostBase{
Name: "Mock Host 1",
DisplayName: "Mock Host 1",
Zone: zone.Name,
},
GPUs: []host_api.GpuInfo{{
Name: "Mock GPU 1",
Vendor: "NVIDIA",
VendorID: "10de",
DeviceID: "1eb0",
}, {
Name: "Mock GPU 2",
Vendor: "NVIDIA",
VendorID: "10de",
DeviceID: "2230",
}},
})

result.HostGpuInfo = append(result.HostGpuInfo, body.HostGpuInfo{
HostBase: body.HostBase{
Name: "Mock Host 2",
DisplayName: "Mock Host 2",
Zone: zone.Name,
},
GPUs: []host_api.GpuInfo{{
Name: "Mock GPU 1",
Vendor: "NVIDIA",
VendorID: "10de",
DeviceID: "1eb0",
}},
})
}
}

return result, nil
}

func createGpuGroupName(gpu *host_api.GpuInfo) *string {
Expand Down
16 changes: 12 additions & 4 deletions pkg/subsystems/k8s/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,9 @@ func (client *Client) deploymentStatusWatcher(ctx context.Context, handler func(
}
}
case <-recreateInterval:
watcher.Stop()
if watcher != nil {
watcher.Stop()
}
watcher, err = setupDeploymentWatcher(client.Namespace)
if err != nil {
log.Println("Failed to restart Deployment status watcher, sleeping for 10 seconds before retrying")
Expand Down Expand Up @@ -141,7 +143,9 @@ func (client *Client) vmStatusWatcher(ctx context.Context, handler func(string,
handler(vmStatus.Name, vmStatus)
}
case <-recreateInterval:
watcher.Stop()
if watcher != nil {
watcher.Stop()
}
watcher, err = setupVmWatcher(client.Namespace)
if err != nil {
log.Println("Failed to restart VM status watcher, sleeping for 10 seconds before retrying")
Expand Down Expand Up @@ -192,7 +196,9 @@ func (client *Client) vmiStatusWatcher(ctx context.Context, handler func(string,
handler(vmiStatus.Name, vmiStatus)
}
case <-recreateInterval:
watcher.Stop()
if watcher != nil {
watcher.Stop()
}
watcher, err = setupVmWatcher(client.Namespace)
if err != nil {
log.Println("Failed to restart VM instance status watcher, sleeping for 10 seconds before retrying")
Expand Down Expand Up @@ -340,7 +346,9 @@ func (client *Client) eventWatcher(ctx context.Context, handler func(string, int
}(e)
}
case <-recreateInterval:
watcher.Stop()
if watcher != nil {
watcher.Stop()
}
watcher, err = setupEventWatcher(client.Namespace)
if err != nil {
log.Println("Failed to restart Event status watcher, sleeping for 10 seconds before retrying")
Expand Down
26 changes: 24 additions & 2 deletions routers/api/v2/vm.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,15 @@ import (
configModels "go-deploy/models/config"
"go-deploy/models/model"
"go-deploy/models/version"
"go-deploy/pkg/config"
"go-deploy/pkg/sys"
"go-deploy/service"
sErrors "go-deploy/service/errors"
teamOpts "go-deploy/service/v2/teams/opts"
v2Utils "go-deploy/service/v2/utils"
"go-deploy/service/v2/vms/opts"
"strconv"
"strings"
)

// GetVM
Expand Down Expand Up @@ -75,7 +78,7 @@ func GetVM(c *gin.Context) {
sshConnectionString, _ := deployV2.VMs().SshConnectionString(vm.ID)

lease, _ := deployV2.VMs().GpuLeases().GetByVmID(vm.ID)
context.Ok(vm.ToDTOv2(lease, teamIDs, sshConnectionString))
context.Ok(vm.ToDTOv2(lease, teamIDs, getVmAppExternalPort(vm.Zone), sshConnectionString))
}

// ListVMs
Expand Down Expand Up @@ -139,7 +142,7 @@ func ListVMs(c *gin.Context) {
teamIDs, _ := deployV2.Teams().ListIDs(teamOpts.ListOpts{ResourceID: vm.ID})
sshConnectionString, _ := deployV2.VMs().SshConnectionString(vm.ID)
lease, _ := deployV2.VMs().GpuLeases().GetByVmID(vm.ID)
dtoVMs[i] = vm.ToDTOv2(lease, teamIDs, sshConnectionString)
dtoVMs[i] = vm.ToDTOv2(lease, teamIDs, getVmAppExternalPort(vm.Zone), sshConnectionString)
}

context.Ok(dtoVMs)
Expand Down Expand Up @@ -408,3 +411,22 @@ func UpdateVM(c *gin.Context) {
JobID: &jobID,
})
}

// getVmAppExternalPort returns the port written after the first ':' in the
// ParentVmApp domain of the zone called zoneName.
// It returns nil when the zone is not found, when the domain contains no ':',
// or when the text after the first ':' is not a valid integer.
func getVmAppExternalPort(zoneName string) *int {
	zone := config.Config.GetZone(zoneName)
	if zone == nil {
		return nil
	}

	parts := strings.Split(zone.Domains.ParentVmApp, ":")
	if len(parts) < 2 {
		// No port suffix on the domain.
		return nil
	}

	port, err := strconv.Atoi(parts[1])
	if err != nil {
		return nil
	}

	return &port
}
8 changes: 1 addition & 7 deletions scripts/local/config.yml.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ gpu:
privilegedGpus:
excludedHosts:
excludedGpus:
addMock: true

deployment:
defaultZone: local
Expand Down Expand Up @@ -165,13 +166,6 @@ redis:
url: $redis_url
password: $redis_password

sys-api:
url:
user:
password:
clientId:
useMock: true

harbor:
url: $harbor_url
user: $harbor_user
Expand Down
Loading
Loading