From 5c733cb0a9046ab9d9c6868245797ec9d7fc86e8 Mon Sep 17 00:00:00 2001 From: zhaotao1 Date: Mon, 23 Sep 2024 17:35:11 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E5=BD=93=E9=98=9F?= =?UTF-8?q?=E5=88=97=E4=B8=AD=E6=B2=A1=E6=9C=89=E8=8A=82=E7=82=B9=E7=9A=84?= =?UTF-8?q?=E6=8A=A5=E9=94=99=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- services/config/config.go | 59 ++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 26 deletions(-) diff --git a/services/config/config.go b/services/config/config.go index ccc0820..718a9a0 100644 --- a/services/config/config.go +++ b/services/config/config.go @@ -257,20 +257,23 @@ func (s *ServerConfig) GetClusterConfig(ctx context.Context, in *pb.GetClusterCo nodeMem, _ := strconv.Atoi(memOutput) totalMemInt = nodeMem * totalNodeNumInt } else { - getMemCmd := fmt.Sprintf("scontrol show node=%s | grep RealMemory=| awk '{print $1}' | awk -F'=' '{print $2}'", nodeArray[0]) - memOutput, err := utils.RunCommand(getMemCmd) - if err != nil || utils.CheckSlurmStatus(memOutput) { - errInfo := &errdetails.ErrorInfo{ - Reason: "COMMAND_EXEC_FAILED", + // 如果nodeArray[0]是(null) 则跳过 + if nodeArray[0] != "(null)" { + getMemCmd := fmt.Sprintf("scontrol show node=%s | grep RealMemory=| awk '{print $1}' | awk -F'=' '{print $2}'", nodeArray[0]) + memOutput, err := utils.RunCommand(getMemCmd) + if err != nil || utils.CheckSlurmStatus(memOutput) { + errInfo := &errdetails.ErrorInfo{ + Reason: "COMMAND_EXEC_FAILED", + } + st := status.New(codes.Internal, "Exec command failed or slurmctld down.") + st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetClusterConfig failed: %v", st.Err()) + return nil, st.Err() } - st := status.New(codes.Internal, "Exec command failed or slurmctld down.") - st, _ = st.WithDetails(errInfo) - caller.Logger.Errorf("GetClusterConfig failed: %v", st.Err()) - return nil, st.Err() - } - nodeMem, _ := strconv.Atoi(memOutput) - totalMemInt = nodeMem * totalNodeNumInt + nodeMem, _ := strconv.Atoi(memOutput) + totalMemInt = nodeMem * totalNodeNumInt + } } } else { errInfo := &errdetails.ErrorInfo{ @@ -329,22 +332,26 @@ func (s *ServerConfig) GetClusterConfig(ctx context.Context, in *pb.GetClusterCo totalGpus = uint32(perNodeGpuNum) * uint32(totalNodeNumInt) } } else { - getGpusCmd := fmt.Sprintf("scontrol show node=%s| grep ' Gres=' | awk -F':' '{print $NF}'", nodeArray[0]) - gpusOutput, err := utils.RunCommand(getGpusCmd) - if err != nil || utils.CheckSlurmStatus(gpusOutput) { - errInfo := &errdetails.ErrorInfo{ - Reason: "COMMAND_EXEC_FAILED", - } - st := status.New(codes.Internal, "Exec command failed or slurmctld down.") - st, _ = st.WithDetails(errInfo) - caller.Logger.Errorf("GetClusterConfig failed: %v", st.Err()) - return nil, st.Err() - } - if gpusOutput == "Gres=(null)" { + if nodeArray[0] == "(null)" { totalGpus = 0 } else { - perNodeGpuNum, _ := strconv.Atoi(gpusOutput) - totalGpus = uint32(perNodeGpuNum) * uint32(totalNodeNumInt) + getGpusCmd := fmt.Sprintf("scontrol show node=%s| grep ' Gres=' | awk -F':' '{print $NF}'", nodeArray[0]) + gpusOutput, err := utils.RunCommand(getGpusCmd) + if err != nil || utils.CheckSlurmStatus(gpusOutput) { + errInfo := &errdetails.ErrorInfo{ + Reason: "COMMAND_EXEC_FAILED", + } + st := status.New(codes.Internal, "Exec command failed or slurmctld down.") + st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetClusterConfig failed: %v", st.Err()) + return nil, st.Err() + } + if gpusOutput == "Gres=(null)" { + totalGpus = 0 + } else { + perNodeGpuNum, _ := strconv.Atoi(gpusOutput) + totalGpus = uint32(perNodeGpuNum) * uint32(totalNodeNumInt) + } } } getPartitionQosCmd := fmt.Sprintf("scontrol show partition=%s | grep -i ' QoS=' | awk '{print $3}'", partition)