-
Notifications
You must be signed in to change notification settings - Fork 6
/
jobs_info.py
131 lines (106 loc) · 4.53 KB
/
jobs_info.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/env python
"""Retrieve user runs on JZ and print metrics in human or machine readable format"""
# Original script
import collections
import re
import shlex
import subprocess
import sys
from doctest import FAIL_FAST
from enum import Enum, auto
from functools import lru_cache
class GpuType(Enum):
    """GPU models that the per-job GPU counts are broken down by."""

    # Values are assigned automatically in declaration order.
    V100_32GB = auto()
    V100_16GB = auto()
    A100_40GB = auto()
    A100_80GB = auto()
class Node:
    """Describe one kind of compute node: its GPU model and its GPU count."""

    def __init__(self, gpu_type: "GpuType", num_gpus: int) -> None:
        # gpu_type: GPU model fitted to the node (a GpuType member at the
        #           call sites in this file).
        # num_gpus: number of GPUs available on one such node.
        self.num_gpus = num_gpus
        self.gpu_type = gpu_type
# Each entry pairs a node description (GPU model, GPUs per node) with the
# partition name that is later handed to `sinfo -p` to list that partition's
# nodes.
type2partitions = [
    (Node(gpu_model, gpus_per_node), partition_name)
    for gpu_model, gpus_per_node, partition_name in (
        (GpuType.V100_32GB, 4, 'gpu_p1'),
        (GpuType.V100_32GB, 8, 'gpu_p2'),
        (GpuType.V100_16GB, 4, 'gpu_p3'),
        (GpuType.A100_40GB, 8, 'gpu_p4'),
        (GpuType.A100_80GB, 8, 'gpu_p5'),
    )
]
# Map every hostname reported by sinfo to the Node description of the
# partition it was listed under. This runs once, when the script starts.
node2type = {}
for node_type, partition in type2partitions:
    sinfo_cmd = f'/gpfslocalsys/slurm/current/bin/sinfo -N -p {partition} --Format=nodehost -h'
    sinfo_result = subprocess.run(
        sinfo_cmd,
        shell=True,
        encoding='utf8',
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    # One hostname per output line; a host listed under several partitions
    # keeps the description of the last partition processed.
    node2type.update(
        (hostname.strip(), node_type)
        for hostname in sinfo_result.stdout.splitlines()
    )
def find_node_type(node):
    """Return the Node description recorded for hostname *node*.

    Args:
        node: hostname to look up in the module-level ``node2type`` mapping.

    Returns:
        The mapped value, or ``None`` when the hostname is unknown.
    """
    # dict.get expresses "missing key -> None" directly, without a bare
    # ``except`` that would also hide unrelated errors (KeyboardInterrupt,
    # NameError, ...).
    return node2type.get(node)
@lru_cache(maxsize=None)
def get_nodes(nodelist):
    """Expand a node list expression into individual hostnames.

    Runs ``scontrol show hostnames`` on *nodelist* through the shell and
    returns the command's standard output split into lines. Standard error is
    captured and ignored.

    Results are memoised per *nodelist* string, so repeated calls with the
    same argument return the very same list object; callers should not mutate
    it.
    """
    command = f'/gpfslocalsys/slurm/current/bin/scontrol show hostnames {nodelist}'
    completed = subprocess.run(
        command,
        shell=True,
        encoding='utf8',
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    hostnames = completed.stdout.splitlines()
    return hostnames
def find_num_gpus_per_types(nodelist, alloctres):
    """Count the GPUs attributed to a job, broken down by GPU model.

    Args:
        nodelist: node list expression of the job; expanded to hostnames
            with ``get_nodes``.
        alloctres: allocated-resources string of the job; searched for a
            ``gpu=<count>`` entry.

    Returns:
        A ``collections.defaultdict(int)`` mapping GPU type to GPU count.
        Types the job did not use read as 0.

    For a job on exactly one node the count is taken from the ``gpu=<n>``
    entry of *alloctres* (0 when absent). For a job spanning several nodes,
    every node contributes the full GPU count of its node type. Hostnames
    unknown to ``find_node_type`` contribute nothing.
    """
    nodes = get_nodes(nodelist)
    is_single_node = (len(nodes) == 1)

    if is_single_node:
        # Raw string: '\d' in a plain literal is an invalid escape sequence
        # and triggers a warning on recent Python versions.
        match = re.search(r'gpu=(\d+)', alloctres)
        single_node_gpus = int(match.group(1)) if match else 0
    else:
        # Not used for multi-node jobs; bound explicitly so the name always
        # exists.
        single_node_gpus = None

    num_gpus_per_types = collections.defaultdict(int)
    for node in nodes:
        node_type = find_node_type(node)
        if node_type:
            if is_single_node:
                num_gpus_per_types[node_type.gpu_type] += single_node_gpus
            else:
                num_gpus_per_types[node_type.gpu_type] += node_type.num_gpus
    return num_gpus_per_types
# Modifications to show more info for each job
def split_alloctres(alloctres):
    """Parse a comma-separated ``key=value`` string into a dict.

    Items without an ``=`` are skipped. When an item contains more than one
    ``=``, only the text between the first and second ``=`` is kept as the
    value. If a key appears several times, the last occurrence wins.
    """
    split_items = (item.split('=') for item in alloctres.split(','))
    return {
        parts[0]: parts[1]
        for parts in split_items
        if len(parts) >= 2
    }
# Main script: query sacct with the user's own command-line arguments plus a
# fixed column list, then print one row per job whose account starts with
# "six" or "ajs".

# -n / --noheader selects the machine-readable ('|'-separated) output.
show_headers = not ('-n' in sys.argv[1:] or '--noheader' in sys.argv[1:])

args = ['sacct'] + sys.argv[1:] + ['--format=jobid,elapsed,nodelist,alloctres,partition,qos,start,end,group,jobname,workdir,account', '-P', '-X', '-n']
# The arguments were already split by the invoking shell. Quote each one so
# that re-joining them for `shell=True` does not re-split values containing
# spaces or re-interpret shell metacharacters.
p = subprocess.run(' '.join(shlex.quote(arg) for arg in args),
                   shell=True, encoding='utf8',
                   stdout=subprocess.PIPE, stderr=subprocess.PIPE)

if show_headers:
    # Print human-readable output
    fmt_string = '{0:<9} {1:<9} {2:<9} {3:<9} {4:<9} {5:<9} {6:<9} {7:<9} {8:<9} {9:<9} {10:<9} {11:<10} {12:<10} {13:<19} {14:<19} {15:<19} {16:<19}'
    print(fmt_string.format('JobID', 'V100 32GB', 'V100 16GB', 'A100 40GB', 'A100 80GB', 'CPUs', 'RAM', 'Energy', 'Partition', 'Group', 'Elapsed', 'QoS', 'JobName', 'Start', 'End', 'Workdir', 'Account'))
    # NOTE(review): this separator has 16 segments for 17 header columns (the
    # last one is 40 wide); kept as-is to leave the printed output unchanged.
    print(('-' * 9 + ' ') * 11 + ('-' * 10 + ' ') * 2 + ('-' * 19 + ' ') * 2 + ('-' * 40 + ' '))
else:
    # Machine readable output
    fmt_string = '{0}|{1}|{2}|{3}|{4}|{5}|{6}|{7}|{8}|{9}|{10}|{11}|{12}|{13}|{14}|{15}|{16}'

for j in p.stdout.splitlines():
    # Field order matches the --format list passed to sacct above.
    # NOTE(review): a line that does not split into exactly 12 fields (e.g. a
    # job name containing '|') raises ValueError here.
    (job_id, elapsed, nodelist, alloctres, partition, qos,
     start, end, group, jobname, workdir, account) = j.split('|')

    # Filtering only bigscience projects
    if not account.startswith(('six', 'ajs')):
        continue

    num_gpus_per_types = find_num_gpus_per_types(nodelist, alloctres)
    alloc = split_alloctres(alloctres)
    # Resources missing from the allocation string are shown as N/A.
    for field in ['cpu', 'mem', 'energy']:
        alloc.setdefault(field, 'N/A')

    print(
        fmt_string.format(job_id,
                          num_gpus_per_types[GpuType.V100_32GB],
                          num_gpus_per_types[GpuType.V100_16GB],
                          num_gpus_per_types[GpuType.A100_40GB],
                          num_gpus_per_types[GpuType.A100_80GB],
                          alloc['cpu'],
                          alloc['mem'],
                          alloc['energy'],
                          partition,
                          group,
                          elapsed,
                          qos,
                          jobname,
                          start,
                          end,
                          workdir,
                          account))