1
1
package config
2
2
3
3
import (
4
+ "bufio"
4
5
"context"
5
6
"fmt"
6
- "scow-slurm-adapter/caller"
7
- pb "scow-slurm-adapter/gen/go"
8
- "scow-slurm-adapter/utils"
9
7
"strconv"
10
8
"strings"
11
9
"sync"
@@ -14,6 +12,9 @@ import (
14
12
"google.golang.org/genproto/googleapis/rpc/errdetails"
15
13
"google.golang.org/grpc/codes"
16
14
"google.golang.org/grpc/status"
15
+ "scow-slurm-adapter/caller"
16
+ pb "scow-slurm-adapter/gen/go"
17
+ "scow-slurm-adapter/utils"
17
18
)
18
19
19
20
type ServerConfig struct {
@@ -731,7 +732,7 @@ func (s *ServerConfig) GetAvailablePartitions(ctx context.Context, in *pb.GetAva
731
732
return & pb.GetAvailablePartitionsResponse {Partitions : parts }, nil
732
733
}
733
734
734
- func extractNodeInfo (info string ) ( * pb.NodeInfo , error ) {
735
+ func extractNodeInfo (info string ) * pb.NodeInfo {
735
736
var (
736
737
partitionList []string
737
738
totalGpusInt int
@@ -799,7 +800,7 @@ func extractNodeInfo(info string) (*pb.NodeInfo, error) {
799
800
GpuCount : uint32 (totalGpusInt ),
800
801
AllocGpuCount : uint32 (allocGpusInt ),
801
802
IdleGpuCount : uint32 (totalGpusInt ) - uint32 (allocGpusInt ),
802
- }, nil
803
+ }
803
804
}
804
805
805
806
func getNodeInfo (node string , wg * sync.WaitGroup , nodeChan chan <- * pb.NodeInfo , errChan chan <- error ) {
@@ -817,28 +818,23 @@ func getNodeInfo(node string, wg *sync.WaitGroup, nodeChan chan<- *pb.NodeInfo,
817
818
return
818
819
}
819
820
820
- nodeInfo , err := extractNodeInfo (info )
821
- if err != nil {
822
- errChan <- err
823
- return
824
- }
821
+ nodeInfo := extractNodeInfo (info )
825
822
826
823
nodeChan <- nodeInfo
827
824
}
828
825
829
826
func (s * ServerConfig ) GetClusterNodesInfo (ctx context.Context , in * pb.GetClusterNodesInfoRequest ) (* pb.GetClusterNodesInfoResponse , error ) {
830
827
var (
831
- wg sync.WaitGroup
832
- nodesInfo []* pb.NodeInfo
833
- nodesInfoList []string
828
+ wg sync.WaitGroup
829
+ nodesInfo []* pb.NodeInfo
834
830
)
835
831
caller .Logger .Infof ("Received request GetClusterNodesInfo: %v" , in )
836
832
nodeChan := make (chan * pb.NodeInfo , len (in .NodeNames ))
837
833
errChan := make (chan error , len (in .NodeNames ))
838
834
839
835
if len (in .NodeNames ) == 0 {
840
836
// 获取集群中全部节点的信息
841
- getNodesInfoCmd := "scontrol show nodes --oneliner | grep Partitions | awk '{print $1}' | awk -F= '{print $2}' | tr ' \n ' ';' " // 获取全部计算节点主机名
837
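+ getNodesInfoCmd := "scontrol show nodes --oneliner | grep Partitions" // Fetch the full one-line record of every node that reports a Partitions field (not just hostnames); each line is parsed below.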
842
838
output , err := utils .RunCommand (getNodesInfoCmd )
843
839
if err != nil {
844
840
errInfo := & errdetails.ErrorInfo {
@@ -848,17 +844,22 @@ func (s *ServerConfig) GetClusterNodesInfo(ctx context.Context, in *pb.GetCluste
848
844
st , _ = st .WithDetails (errInfo )
849
845
return nil , st .Err ()
850
846
}
851
- nodesInfoList = strings .Split (output , ";" )
852
- nodesInfoList = nodesInfoList [:len (nodesInfoList )- 1 ]
853
- } else {
854
- nodesInfoList = in .NodeNames
847
+ // 按行分割输出
848
+ scanner := bufio .NewScanner (strings .NewReader (output ))
849
+ for scanner .Scan () {
850
+ line := scanner .Text ()
851
+ nodeInfo := extractNodeInfo (line )
852
+ nodesInfo = append (nodesInfo , nodeInfo )
853
+ }
854
+ caller .Logger .Infof ("GetClusterNodesInfoResponse: %v" , nodesInfo )
855
+ return & pb.GetClusterNodesInfoResponse {Nodes : nodesInfo }, nil
855
856
}
856
857
857
- for _ , node := range nodesInfoList {
858
- node1 := node
858
+ for _ , node := range in . NodeNames {
859
+ nodeName := node
859
860
wg .Add (1 )
860
861
go func () {
861
- getNodeInfo (node1 , & wg , chan <- * pb.NodeInfo (nodeChan ), chan <- error (errChan ))
862
+ getNodeInfo (nodeName , & wg , chan <- * pb.NodeInfo (nodeChan ), chan <- error (errChan ))
862
863
}()
863
864
}
864
865
@@ -879,6 +880,7 @@ func (s *ServerConfig) GetClusterNodesInfo(ctx context.Context, in *pb.GetCluste
879
880
}
880
881
default :
881
882
}
883
+ caller .Logger .Infof ("GetClusterNodesInfoResponse: %v" , nodesInfo )
882
884
return & pb.GetClusterNodesInfoResponse {Nodes : nodesInfo }, nil
883
885
}
884
886
0 commit comments