Skip to content

Commit

Permalink
refactor: pod and gpu
Browse files Browse the repository at this point in the history
  • Loading branch information
xzchaoo committed Sep 29, 2023
1 parent 46194c8 commit 01a660f
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 3 deletions.
4 changes: 4 additions & 0 deletions pkg/cri/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@ type (
// Returns an error if not found.
GetPod(namespace, podName string) (*Pod, error)

// GetPodBySandboxId queries one local pod by its sandbox id.
// Returns an error if not found.
GetPodBySandboxId(sandboxId string) (*Pod, error)

// GetContainerByCid queries one container by cid
GetContainerByCid(cid string) (*Container, bool)

Expand Down
17 changes: 14 additions & 3 deletions pkg/cri/impl/metastore.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
package impl

import (
"fmt"
"github.com/jpillora/backoff"
"github.com/traas-stack/holoinsight-agent/pkg/cri"
"github.com/traas-stack/holoinsight-agent/pkg/logger"
Expand All @@ -31,8 +32,9 @@ type (
containerMap map[string]*cachedContainer
shortCidContainerMap map[string]*cachedContainer
// podByKey pod map by key("${ns}/${pod}")
podByKey map[string]*cri.Pod
podByUID map[types.UID]*cri.Pod
podByKey map[string]*cri.Pod
podByUID map[types.UID]*cri.Pod
podBySandboxId map[string]*cri.Pod
}

cachedContainer struct {
Expand Down Expand Up @@ -84,6 +86,14 @@ func (e *defaultMetaStore) GetAllPods() []*cri.Pod {
return e.state.pods
}

// GetPodBySandboxId looks up a pod in the current state by the id of its sandbox.
// It returns a non-nil error when no pod is indexed under sandboxId.
func (e *defaultMetaStore) GetPodBySandboxId(sandboxId string) (*cri.Pod, error) {
	if found, exists := e.state.podBySandboxId[sandboxId]; exists {
		return found, nil
	}
	// Nothing is indexed under this sandbox id.
	return nil, fmt.Errorf("no pod sandboxId=[%s]", sandboxId)
}

func (e *defaultMetaStore) GetContainerByCid(cid string) (*cri.Container, bool) {
state := e.state
// docker short container id length = 12
Expand Down Expand Up @@ -155,6 +165,7 @@ func newInternalState() *internalState {
shortCidContainerMap: make(map[string]*cachedContainer),
podByKey: make(map[string]*cri.Pod),
podByUID: make(map[types.UID]*cri.Pod),
podBySandboxId: make(map[string]*cri.Pod),
}
}

Expand All @@ -168,11 +179,11 @@ func (s *internalState) build() {
}
s.podByKey[pod.Namespace+"/"+pod.Name] = pod
s.podByUID[pod.UID] = pod

for _, container := range pod.All {
cri.SortMountPointsByLongSourceFirst(container.Mounts)
}
if pod.Sandbox != nil {
s.podBySandboxId[pod.Sandbox.Id] = pod
for _, container := range pod.All {
if pod.Sandbox != container {
container.NetworkMode = pod.Sandbox.NetworkMode
Expand Down
59 changes: 59 additions & 0 deletions pkg/plugin/input/nvidia_smi/util.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/*
* Copyright 2022 Holoinsight Project Authors. Licensed under Apache-2.0.
*/

package nvidia_smi

import (
"encoding/xml"
"github.com/traas-stack/holoinsight-agent/pkg/appconfig"
"github.com/traas-stack/holoinsight-agent/pkg/core"
"os/exec"
"sync"
)

// NvidiaSmiLog is the root of the XML document decoded from the output of
// `nvidia-smi -q -x`.
type NvidiaSmiLog struct {
	// GPU holds one entry per <gpu> element found in the document.
	GPU []NvidiaSmiLog_Gpu `xml:"gpu"`
}

// NvidiaSmiLog_Gpu carries the fields of a single <gpu> element that are decoded.
// All values are kept as the raw strings found in the XML.
type NvidiaSmiLog_Gpu struct {
	// ProductName is the content of <product_name>.
	ProductName string `xml:"product_name"`
	// Uuid is the content of <uuid>.
	Uuid string `xml:"uuid"`
	// MinorNumber is the content of <minor_number>.
	MinorNumber string `xml:"minor_number"`
}

var (
	// nvidiaSmiLog is the package-wide decoded nvidia-smi report. It starts out
	// as an empty, non-nil value and is filled in by initNvidiaSmiLog.
	nvidiaSmiLog = new(NvidiaSmiLog)
	// initNvidiaSmiLogOnce guards the one-time execution of initNvidiaSmiLog.
	initNvidiaSmiLogOnce sync.Once
)

// GetNvidiaSmiLog returns the package-wide nvidia-smi report, triggering its
// one-time initialization on the first call.
//
// The returned pointer is never nil. Its GPU list stays empty when
// initNvidiaSmiLog returns early (nvidia disabled, unsupported agent mode, or
// the nvidia-smi command failing).
func GetNvidiaSmiLog() *NvidiaSmiLog {
	// sync.Once makes sure the command is executed at most once per process.
	initNvidiaSmiLogOnce.Do(initNvidiaSmiLog)
	return nvidiaSmiLog
}

// initNvidiaSmiLog runs `nvidia-smi -q -x` and decodes its XML report into the
// package-level nvidiaSmiLog.
//
// It is best effort: when nvidia support is disabled, the agent mode is neither
// daemonset nor sidecar, the command fails, or the output cannot be decoded,
// the function returns silently and callers see whatever nvidiaSmiLog holds at
// that point. It is meant to be invoked through initNvidiaSmiLogOnce.
func initNvidiaSmiLog() {
	if !IsNvidiaEnabled() {
		return
	}

	var cmd *exec.Cmd
	switch appconfig.StdAgentConfig.Mode {
	case core.AgentModeDaemonset:
		// In daemonset mode the command is executed after chroot-ing into the
		// directory returned by core.GetHostfs().
		cmd = exec.Command("chroot", core.GetHostfs(), "nvidia-smi", "-q", "-x")
	case core.AgentModeSidecar:
		cmd = exec.Command("nvidia-smi", "-q", "-x")
	default:
		// Other agent modes do not collect the report.
		return
	}

	// Read stdout only. CombinedOutput would interleave anything written to
	// stderr (warnings from chroot or nvidia-smi) with the XML document and
	// could make it undecodable.
	out, err := cmd.Output()
	if err != nil {
		// Best effort: the command is missing or exited with an error.
		return
	}

	// Best effort: a decode error is deliberately not propagated, there is no
	// caller that could act on it.
	_ = xml.Unmarshal(out, nvidiaSmiLog)
}

0 comments on commit 01a660f

Please sign in to comment.