Skip to content

Commit

Permalink
Merge pull request #22 from TYzzt/master
Browse files: browse the repository at this point in the history
fix: 修复当队列中没有节点的报错问题
  • Loading branch information
283713406 authored Oct 17, 2024
2 parents 8be28a0 + 5c733cb commit bb7b0ea
Showing 1 changed file with 33 additions and 26 deletions.
59 changes: 33 additions & 26 deletions services/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -257,20 +257,23 @@ func (s *ServerConfig) GetClusterConfig(ctx context.Context, in *pb.GetClusterCo
nodeMem, _ := strconv.Atoi(memOutput)
totalMemInt = nodeMem * totalNodeNumInt
} else {
getMemCmd := fmt.Sprintf("scontrol show node=%s | grep RealMemory=| awk '{print $1}' | awk -F'=' '{print $2}'", nodeArray[0])
memOutput, err := utils.RunCommand(getMemCmd)
if err != nil || utils.CheckSlurmStatus(memOutput) {
errInfo := &errdetails.ErrorInfo{
Reason: "COMMAND_EXEC_FAILED",
// Skip the lookup when nodeArray[0] is (null), i.e. the partition has no nodes.
if nodeArray[0] != "(null)" {
getMemCmd := fmt.Sprintf("scontrol show node=%s | grep RealMemory=| awk '{print $1}' | awk -F'=' '{print $2}'", nodeArray[0])
memOutput, err := utils.RunCommand(getMemCmd)
if err != nil || utils.CheckSlurmStatus(memOutput) {
errInfo := &errdetails.ErrorInfo{
Reason: "COMMAND_EXEC_FAILED",
}
st := status.New(codes.Internal, "Exec command failed or slurmctld down.")
st, _ = st.WithDetails(errInfo)
caller.Logger.Errorf("GetClusterConfig failed: %v", st.Err())
return nil, st.Err()
}
st := status.New(codes.Internal, "Exec command failed or slurmctld down.")
st, _ = st.WithDetails(errInfo)
caller.Logger.Errorf("GetClusterConfig failed: %v", st.Err())
return nil, st.Err()
}

nodeMem, _ := strconv.Atoi(memOutput)
totalMemInt = nodeMem * totalNodeNumInt
nodeMem, _ := strconv.Atoi(memOutput)
totalMemInt = nodeMem * totalNodeNumInt
}
}
} else {
errInfo := &errdetails.ErrorInfo{
Expand Down Expand Up @@ -329,22 +332,26 @@ func (s *ServerConfig) GetClusterConfig(ctx context.Context, in *pb.GetClusterCo
totalGpus = uint32(perNodeGpuNum) * uint32(totalNodeNumInt)
}
} else {
getGpusCmd := fmt.Sprintf("scontrol show node=%s| grep ' Gres=' | awk -F':' '{print $NF}'", nodeArray[0])
gpusOutput, err := utils.RunCommand(getGpusCmd)
if err != nil || utils.CheckSlurmStatus(gpusOutput) {
errInfo := &errdetails.ErrorInfo{
Reason: "COMMAND_EXEC_FAILED",
}
st := status.New(codes.Internal, "Exec command failed or slurmctld down.")
st, _ = st.WithDetails(errInfo)
caller.Logger.Errorf("GetClusterConfig failed: %v", st.Err())
return nil, st.Err()
}
if gpusOutput == "Gres=(null)" {
if nodeArray[0] == "(null)" {
totalGpus = 0
} else {
perNodeGpuNum, _ := strconv.Atoi(gpusOutput)
totalGpus = uint32(perNodeGpuNum) * uint32(totalNodeNumInt)
getGpusCmd := fmt.Sprintf("scontrol show node=%s| grep ' Gres=' | awk -F':' '{print $NF}'", nodeArray[0])
gpusOutput, err := utils.RunCommand(getGpusCmd)
if err != nil || utils.CheckSlurmStatus(gpusOutput) {
errInfo := &errdetails.ErrorInfo{
Reason: "COMMAND_EXEC_FAILED",
}
st := status.New(codes.Internal, "Exec command failed or slurmctld down.")
st, _ = st.WithDetails(errInfo)
caller.Logger.Errorf("GetClusterConfig failed: %v", st.Err())
return nil, st.Err()
}
if gpusOutput == "Gres=(null)" {
totalGpus = 0
} else {
perNodeGpuNum, _ := strconv.Atoi(gpusOutput)
totalGpus = uint32(perNodeGpuNum) * uint32(totalNodeNumInt)
}
}
}
getPartitionQosCmd := fmt.Sprintf("scontrol show partition=%s | grep -i ' QoS=' | awk '{print $3}'", partition)
Expand Down

0 comments on commit bb7b0ea

Please sign in to comment.