-
Notifications
You must be signed in to change notification settings - Fork 0
/
cluster_run.py
248 lines (223 loc) · 12.1 KB
/
cluster_run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
import os
import time
import subprocess
import matplotlib as mpl
mpl.use('Agg')
import parse_output
from plot_result import plot_weak, plot_fix, plot_strong
# Name of the current dataset as printed by check(); starts out as the
# literal string 'None' (not the None object).
current_dataset = 'None'

# The selected dataset dictionary, or None while nothing is selected.
dataset = None

# Registry of every known dataset, keyed by dataset name.
datasets = dict()
def get_time():
    """Return the current local time formatted as ``YYYY-MM-DD-HH-MM-SS``."""
    now = time.localtime()
    return time.strftime("%Y-%m-%d-%H-%M-%S", now)
def model_log(model):
    """Build a log file path inside ``log_dir`` for *model*.

    The file name is ``<label>_<nodes>_<ranks>_<threads>_<timestamp>.log``,
    where the timestamp is taken from ``get_time()`` at call time, so every
    call produces a path stamped with the current time.
    """
    file_name = '{}_{}_{}_{}_{}.log'.format(
        model['label'],
        model['nodes'],
        model['ranks'],
        model['threads'],
        get_time(),
    )
    return os.path.join(log_dir, file_name)
def get_model(label):
    """Return the first model of the current dataset whose label is *label*.

    Returns None when no model in ``dataset['models']`` carries that label.
    """
    matching = (
        candidate
        for candidate in dataset['models']
        if candidate['label'] == label
    )
    return next(matching, None)
def generate_global_conf(model):
    """Render the ``key = value`` configuration text for *model*.

    Each emitted setting is also echoed to stdout as ``[-] key=value``.
    As a side effect, ``model['inputdir']`` and ``model['outputdir']`` are
    updated in place so that they end with a trailing ``/``.

    Returns the configuration text, one ``key = value`` line per setting.
    """
    rendered = []

    def emit(key):
        # Append one config line and echo the same setting to the console.
        value = model[key]
        rendered.append('{} = {}\n'.format(key, value))
        print('[-] {}={}'.format(key, value))

    emit('JOB')
    emit('basename')

    # Directory settings are normalised (in the model itself) before use.
    for dir_key in ('inputdir', 'outputdir'):
        if not model[dir_key].endswith('/'):
            model[dir_key] += '/'

    for key in ('inputdir', 'outputdir', 'lowfreq', 'upfreq', 'pOrder'):
        emit(key)

    return ''.join(rendered)
def pre_check():
    """Make sure ``log_dir`` exists, creating it (with a notice) if missing."""
    if os.path.exists(log_dir):
        return
    print('Log directory {} does not exist, created'.format(log_dir))
    os.makedirs(log_dir)
def check():
    """Print the current dataset and the run status of each of its models.

    Every model's ``status`` is first reset to ``'NOT STARTED'``. The
    ``.log`` files in ``log_dir`` are then visited in sorted name order; the
    text before the first ``_`` in a file name is taken as the model label.
    A log containing ``'save the results'`` marks the model ``'DONE'``,
    any other log marks it ``'UNFINISHED'``; a later log overrides an
    earlier one for the same label.

    Does nothing beyond printing the dataset name when no dataset is
    selected.
    """
    print('Current dataset: {}'.format(str(current_dataset)))
    if dataset is None:
        return

    log_names = sorted(os.listdir(log_dir))
    models = dataset['models']

    for model in models:
        model['status'] = 'NOT STARTED'

    for log_name in log_names:
        if not log_name.endswith('.log'):
            continue
        label = log_name.split('_')[0]
        with open(os.path.join(log_dir, log_name), 'r') as log_file:
            finished = 'save the results' in log_file.read()
        status = 'DONE' if finished else 'UNFINISHED'
        for model in models:
            if model['label'] == label:
                model['status'] = status

    print('Current status:')
    for model in models:
        summary = '- {}: {}'.format(model['label'], model['status'])
        if 'running_on' in model:
            summary += ' (Last run on {})'.format(model['running_on'])
        print(summary)
def parse(label, print_result = True):
    """Parse the log of the model named *label* and attach the result to it.

    On success the value returned by ``parse_output.parse_output`` is stored
    in ``model['json-log']``.

    Args:
        label: Label of a model in the current dataset.
        print_result: Forwarded to ``parse_output.parse_output``.

    Returns:
        True when parsing succeeded; False when no dataset is selected, the
        label is unknown, or parsing raised an exception.
    """
    if dataset is None:
        print('No dataset selected')
        return False
    model = get_model(label)
    if model is None:
        print("invalid model name")
        return False
    # NOTE(review): model_log() embeds the *current* time in the file name,
    # so this path is freshly generated rather than looked up among the
    # existing files in log_dir - confirm parse_output copes with that.
    log_path = model_log(model)
    try:
        out = parse_output.parse_output(log_path, print_result, label, model['nodes'], model['ranks'])
    except Exception as err:
        # Only ordinary errors are reported; KeyboardInterrupt and SystemExit
        # are deliberately allowed to propagate.
        print('Error while parsing {}: {!r}'.format(label, err))
        return False
    model['json-log'] = out
    return True
def switch_dataset(name):
    """Select the dataset registered under *name* as the current one.

    When *name* is not a key of ``datasets`` an error is printed and the
    current selection (both ``current_dataset`` and ``dataset``) is left
    untouched, so the reported name always matches the active dataset.
    """
    global current_dataset, dataset
    if name not in datasets:
        print('Invalid dataset')
        return
    current_dataset = name
    dataset = datasets[name]
    print('Dataset switched to {}'.format(name))
def get_valid_result():
    """Return the models of the current dataset that finished and parse.

    Refreshes statuses through ``check()``, then keeps each model whose
    status is ``'DONE'`` and for which ``parse(label, False)`` is truthy.
    The labels of the kept models are printed before returning the list.
    """
    check()
    usable = [
        model
        for model in dataset['models']
        if model['status'] == 'DONE' and parse(model['label'], False)
    ]
    print("Available models: {}".format([entry['label'] for entry in usable]))
    return usable
def plot(show=False):
    """Run every plot function registered for the current dataset.

    Each callable in ``dataset['plot']`` is invoked as ``func(done, show)``,
    where ``done`` is the list returned by ``get_valid_result()``. The
    ``plot`` directory is created first if it does not exist.

    A dataset without a ``'plot'`` entry (or with an empty one) is reported
    and skipped instead of raising ``KeyError``.

    Args:
        show: Forwarded unchanged to every plot function.
    """
    if dataset is None:
        print('No dataset selected')
        return
    plot_funcs = dataset.get('plot')
    if not plot_funcs:
        print('No plots defined for the current dataset')
        return
    if not os.path.exists("plot"):
        os.mkdir("plot")
    done = get_valid_result()
    for func in plot_funcs:
        func(done, show)
def run(label):
    """Interactively launch the model named *label* with mpirun.

    Steps, in order:
      1. Ask for a comma-separated node list; it must contain exactly
         ``model['nodes']`` names, each present in ``available_nodes``.
      2. Write the model configuration to the file ``global_conf``.
      3. Show the mpirun command and ask for confirmation (``y``).
      4. Write ``cluster_generated_run.sh``, create the model's output
         directory if needed, and run the script, piping its output through
         ``tee`` into the path given by ``model_log(model)``.
      5. Terminate the interpreter once the command has returned.

    Returns early (without running anything) when no dataset is selected,
    the label is unknown, the node list is invalid, or the user declines.

    Raises:
        SystemExit: always, after the launched command has finished.
    """
    if dataset is None:
        print('No dataset selected')
        return
    model = get_model(label)
    if model is None:
        print("invalid label name")
        return

    # Strip each entry so that input such as "e1, e2" is accepted; the
    # normalised, space-free list is what gets passed to mpirun below.
    raw_nodes = input('> Input list of nodes: ')
    node_names = [name.strip() for name in raw_nodes.split(',')]
    if len(node_names) != model['nodes']:
        print('Number of nodes does not match')
        return
    for node_name in node_names:
        if node_name not in available_nodes:
            print('Invalid node {}'.format(node_name))
            return
    hosts = ','.join(node_names)

    # The configuration file is written before the confirmation prompt.
    global_conf = generate_global_conf(model)
    with open('global_conf', 'w') as f:
        f.write(global_conf)

    run_cmd = 'I_MPI_PIN=yes I_MPI_PIN_PROCESSOR_LIST=allcores:map=bunch mpirun -n {} -hosts {} bash -c "ulimit -s unlimited && {}"'.format(
        model['nodes'] * model['ranks'], hosts, bin_path)
    bash = ''.join([
        '#!/bin/bash\n',
        'export OMP_NUM_THREADS={}\n'.format(model['threads']),
        env_cmd,
        run_cmd,
    ])

    print('Command will be: {}'.format(run_cmd))
    choice = input('> Confirm? (y/n) ')
    if choice != 'y':
        print('Canceled')
        return

    with open('cluster_generated_run.sh', 'w') as f:
        f.write(bash)
    if not os.path.exists(model['outputdir']):
        print('Output directory {} does not exist, created'.format(model['outputdir']))
        os.makedirs(model['outputdir'])
    os.system('bash cluster_generated_run.sh | tee {}'.format(model_log(model)))
    model['running_on'] = hosts
    # Equivalent to the interactive exit() helper, but does not depend on
    # the site module having installed that builtin.
    raise SystemExit
# Directory holding run logs (created by pre_check(), scanned by check()).
log_dir = 'logs/'

# Executable launched under mpirun by run().
bin_path = '../bin/plmvcg_istar.out'

# Host names accepted by run() when the user enters a node list.
available_nodes = ['e1', 'e2', 'e3', 'e4']

# Shell preamble written into the generated run script ahead of the mpirun
# command line.
env_cmd = (
    'source /etc/profile.d/modules.sh\n'
    'source /opt/spack/share/spack/setup-env.sh\n'
    'source $HOME/intel_env.sh\n'
    'cd $HOME/SC19/NormalModes/demos\n'
    'ulimit -s unlimited\n'
)
# 'final' dataset: eight models in three groups. Only the fields listed in
# the table differ between models; every model shares JOB=2, lowfreq=0.5,
# upfreq=1.0, pOrder=1, ranks=56 and threads=1. This dataset registers no
# 'plot' entry.
datasets['final'] = {
    'models': [
        {
            'group': group,
            'label': label,
            'JOB': 2,
            'basename': basename,
            'inputdir': inputdir,
            'outputdir': outputdir,
            'lowfreq': 0.5,
            'upfreq': 1.0,
            'pOrder': 1,
            'nodes': nodes,
            'ranks': 56,
            'threads': 1,
        }
        for group, label, basename, inputdir, outputdir, nodes in [
            (0, 'f-w-M1-1', 'RTMDWAK_3L_1M.1', 'final/input/M1', 'final/output/M1', 1),
            (0, 'f-w-M2-2', 'RTMDWAK_3L_2M.1', 'final/input/M2', 'final/output/M2', 2),
            (0, 'f-w-M3-4', 'RTMDWAK_3L_16M.1', 'final/input/M3', 'final/output/M3', 4),
            (1, 'f-w-M4-1', 'RTMDWAK_3L_32M.1', 'final/input/M4', 'final/output/M4', 1),
            (1, 'f-w-M5-2', 'RTMDWAK_3L_64M.1', 'final/input/M5', 'final/output/M5', 2),
            (1, 'f-w-M6-4', 'RTMDWAK_3L_128M.1', 'final/input/M6', 'final/output/M6', 4),
            (2, 'f-s-M2-1', 'RTMDWAK_3L_2M.1', 'final/input/M2', 'final/output/M2-1', 1),
            (2, 'f-s-M2-4', 'RTMDWAK_3L_2M.1', 'final/input/M2', 'final/output/M2-4', 4),
        ]
    ]
}
# 'weak' dataset: six models in two groups, plotted with plot_weak and
# plot_fix. Only the fields listed in the table differ between models; every
# model shares JOB=2, lowfreq=0.5, upfreq=1.0, pOrder=1, ranks=56, threads=1.
datasets['weak'] = {
    'models': [
        {
            'group': group,
            'label': label,
            'JOB': 2,
            'basename': basename,
            'inputdir': inputdir,
            'outputdir': outputdir,
            'lowfreq': 0.5,
            'upfreq': 1.0,
            'pOrder': 1,
            'nodes': nodes,
            'ranks': 56,
            'threads': 1,
        }
        for group, label, basename, inputdir, outputdir, nodes in [
            (0, 'f-w-M1-1', 'RTMDWAK_3L_1M.1', 'final/input/M1', 'final/output/M1', 1),
            (0, 'f-w-M2-2', 'RTMDWAK_3L_2M.1', 'final/input/M2', 'final/output/M2', 2),
            (0, 'f-w-M3-4', 'RTMDWAK_3L_16M.1', 'final/input/M3', 'final/output/M3', 4),
            (1, 'f-w-M4-1', 'RTMDWAK_3L_32M.1', 'final/input/M4', 'final/output/M4', 1),
            (1, 'f-w-M5-2', 'RTMDWAK_3L_64M.1', 'final/input/M5', 'final/output/M5', 2),
            (1, 'f-w-M6-4', 'RTMDWAK_3L_128M.1', 'final/input/M6', 'final/output/M6', 4),
        ]
    ],
    "plot": [plot_weak, plot_fix]
}
# 'strong' dataset: six models in two groups, plotted with plot_strong.
# Only the fields listed in the table differ between models; every model
# shares JOB=2, lowfreq=0.5, upfreq=1.0, pOrder=1, ranks=56 and threads=1.
datasets['strong'] = {
    'models': [
        {
            'group': group,
            'label': label,
            'JOB': 2,
            'basename': basename,
            'inputdir': inputdir,
            'outputdir': outputdir,
            'lowfreq': 0.5,
            'upfreq': 1.0,
            'pOrder': 1,
            'nodes': nodes,
            'ranks': 56,
            'threads': 1,
        }
        for group, label, basename, inputdir, outputdir, nodes in [
            (0, 'f-s-M2-1', 'RTMDWAK_3L_2M.1', 'final/input/M2', 'final/output/M2-1', 1),
            (0, 'f-w-M2-2', 'RTMDWAK_3L_2M.1', 'final/input/M2', 'final/output/M2', 2),
            (0, 'f-s-M2-4', 'RTMDWAK_3L_2M.1', 'final/input/M2', 'final/output/M2-4', 4),
            (1, 'f-s-M3-1', 'RTMDWAK_3L_16M.1', 'final/input/M3', 'final/output/M3-1', 1),
            (1, 'f-s-M3-2', 'RTMDWAK_3L_16M.1', 'final/input/M3', 'final/output/M3-2', 2),
            (1, 'f-w-M3-4', 'RTMDWAK_3L_16M.1', 'final/input/M3', 'final/output/M3-4', 4),
        ]
    ],
    "plot": [plot_strong]
}
if __name__ == "__main__":
    # Interactive command loop. Reads one command per line until `exit`
    # (or end of input) and dispatches to the functions defined above.
    print('SCC 19, Tsinghua University, Reproduciblity Command Line')
    print('Commands: switch <experiment>, run <label>, parse <label>, check, plot, download, exit')
    print('Notes: this script must be run in demos dir, e.g.: python3 tools/cluster_run.py in demos dir')
    pre_check()
    switch_dataset('final')
    while True:
        try:
            line = input('> ').split()
        except EOFError:
            # End of input (Ctrl-D or an exhausted pipe) is treated like
            # `exit` instead of ending the session with a traceback.
            print()
            break
        # split() never yields empty tokens, so a blank line is simply an
        # empty list and no separate empty-command branch is needed.
        if not line:
            continue
        command = line[0]
        if command == 'exit':
            break
        elif command == 'switch':
            if len(line) < 2:
                print('Invalid args')
                continue
            switch_dataset(line[1])
        elif command == 'run':
            if len(line) < 2:
                print('Invalid args')
                continue
            run(line[1])
        elif command == 'parse':
            if len(line) < 2:
                print('Invalid args')
                continue
            parse(line[1])
        elif command == 'check':
            check()
        elif command == 'plot':
            # Any extra argument after `plot` turns on show mode.
            plot(show=(len(line) > 1))
        elif command == 'download':
            # Move any existing local logs aside under trash/, then copy the
            # logs directory from the remote host with scp.
            if not os.path.exists("trash"):
                os.mkdir("trash")
            if os.path.exists("logs"):
                cmd = "mv logs trash/logs-{}".format(get_time())
                print(cmd)
                os.system(cmd)
            cmd = "scp -r i1:~/SC19/NormalModes/demos/logs ."
            print(cmd)
            os.system(cmd)
        else:
            print('Invalid command')