# slurm-validation.yml
# Forked from NVIDIA/deepops.
---
# Playbook designed to run a NCCL test across nodes in a cluster of DGXs
# Example to run on two nodes with no UCX, no HCOLL, and enp set for
# out-of-band NCCL init:
# ansible-playbook -l slurm-cluster playbooks/slurm-cluster/slurm-validation.yml \
# -e '{num_nodes: 2}' \
# -e '{srun_exports: "NCCL_DEBUG=INFO,OMPI_MCA_pml=^ucx,OMPI_MCA_coll=^hcoll,NCCL_SOCKET_IFNAME=enp"}' \
# -e '{cleanup: True}'
- hosts: slurm-master[0]
  vars:
    # String; Container for nccl performance/validation tests. Either docker
    # repo or can be path to sqsh file.
    base_container: "nvcr.io/nvidia/tensorflow:21.09-tf2-py3"
    # String; Container to be created or one that might exist with nccl tests.
    # If `compile_nccl_tests` is true, it must be a sqsh file.
    nccl_tests_container: "${HOME}/enroot_images/nccl_tests_slurm_val.sqsh"
    # Bool; Compile and add NCCL tests to the base_container outputting to
    # nccl_tests_container (will delete/overwrite if one already exists). If
    # false assumes nccl_tests_container already has the NCCL tests and uses
    # the nccl_tests_container.
    compile_nccl_tests: true
    # String; NCCL allreduce test command.
    allreduce_command: "all_reduce_perf -b 1M -e 4G -f 2 -g 1"
    # Int; Number of GPUs per node. DGX-1 and DGX A100 Server have 8 GPUs.
    # DGX-2 has 16 GPUs.
    num_gpus: 8
    # String; Slurm partition to use
    partition: batch
    # Time string; Time limit for the Slurm job.
    timelimit: "10:00"
    # String; Exports for srun command.
    srun_exports: NCCL_DEBUG=INFO
    # String; Custom srun options. Empty by default.
    srun_options: ""
    # Int or empty; Number of nodes. If empty uses all idle nodes on the partition.
    num_nodes: null
    # Bool; Delete the `nccl_tests_container` after running the playbook, only
    # if `compile_nccl_tests` is true as well.
    cleanup: false

  tasks:
    # ---- Input validation -------------------------------------------------
    - name: Check that NCCL sqsh base container exists when sqsh is specified.
      file:
        path: "{{ base_container }}"
        state: file
      when:
        - compile_nccl_tests | bool
        - base_container | splitext | last == ".sqsh"

    - name: Check nccl_tests_container is sqsh path when compiling.
      fail:
        msg: >
          When compiling the `nccl_tests_container` must be a path to enroot
          ".sqsh" file. Currently it is set to:
          "{{ nccl_tests_container }}"
      when:
        - compile_nccl_tests | bool
        - nccl_tests_container | splitext | last != ".sqsh"

    # ---- Build the NCCL tests container (only when compiling) --------------
    # The script is executed with `bash -c` by the "Compiling NCCL tests"
    # task. `set -eo pipefail` aborts on the first failing step (including a
    # failed download feeding tar), so a broken or partial build is never
    # copied into /usr/local/bin.
    # The script must not contain single quotes: it is wrapped in single
    # quotes when passed to `bash -c`.
    - name: Set nccl tests compilation command
      set_fact:
        ncclmake: |
          set -eo pipefail
          mkdir -p /opt/nccl_tests
          cd /opt/nccl_tests
          NCCL_TESTS_COMMITISH=f773748b46
          wget -q -O - https://github.com/NVIDIA/nccl-tests/archive/${NCCL_TESTS_COMMITISH}.tar.gz | \
            tar --strip-components=1 -xzf -
          CC=mpicc CXX=mpicxx make MPI=1
          cp -R /opt/nccl_tests/build/* /usr/local/bin/
      when: compile_nccl_tests | bool

    - name: Print nccl make command
      debug:
        msg:
          - |
            Will compile container:
            {{ nccl_tests_container }}
            {% if not cleanup|bool %}Delete container manually afterwards.{% endif %}
          - "NCCL compilation command:\n{{ ncclmake }}"
      when: compile_nccl_tests | bool

    - name: Check or create enroot images directory for nccl_tests_container.
      file:
        path: "{{ nccl_tests_container | dirname }}"
        state: directory
      when: compile_nccl_tests | bool

    # Any existing image is removed first so the compile task below always
    # produces a fresh one.
    - name: Remove NCCL container if re-compiling.
      file:
        path: "{{ nccl_tests_container }}"
        state: absent
      when: compile_nccl_tests | bool

    # Runs the compilation script inside `base_container` on one node and
    # saves the resulting container to `nccl_tests_container`.
    - name: Compiling NCCL tests
      shell:
        cmd: |
          srun -p {{ partition }} -N 1 \
            --ntasks-per-node=1 \
            --cpus-per-task=10 \
            --container-image="{{ base_container }}" \
            --container-save="{{ nccl_tests_container }}" \
            --container-remap-root \
            {{ srun_options }} \
            bash -c '{{ ncclmake }}'
        creates: "{{ nccl_tests_container }}"
      when: compile_nccl_tests | bool

    # ---- Pre-built container checks (only when not compiling) --------------
    - name: Check that nccl_tests_container exists when sqsh is specified.
      file:
        path: "{{ nccl_tests_container }}"
        state: file
      when:
        - not (compile_nccl_tests | bool)
        - nccl_tests_container | splitext | last == ".sqsh"

    - name: Print nccl_tests_container setting if not sqsh file.
      debug:
        msg: >
          The nccl_tests_container is not set to sqsh file. Assuming it is a
          valid docker tag. Currently it is set to:
          "{{ nccl_tests_container }}"
          WARNING: No validation is performed.
      when:
        - not (compile_nccl_tests | bool)
        - nccl_tests_container | splitext | last != ".sqsh"

    # ---- Work out how many nodes to use -------------------------------------
    # Sums the 4th column of the `sinfo` output for idle nodes. `sum + 0`
    # makes awk print 0 (instead of an empty string) when nothing is idle.
    # Only runs when `num_nodes` was left empty or is not a positive number.
    - name: Get node count from sinfo
      shell: >
        sinfo -p {{ partition }} -t idle |
        tail -n +2 |
        awk '{sum += $4} END {print sum + 0}'
      register: node_out
      changed_when: false
      when: (not num_nodes) or (num_nodes|int <= 0)

    # When the sinfo task was skipped, `node_out.stdout` is undefined and the
    # user-supplied `num_nodes` is used instead.
    - name: Set num_nodes variable
      set_fact:
        num_nodes_: "{{ node_out.stdout | default(num_nodes) }}"

    # Stop with a clear message rather than handing srun an unusable -N value.
    - name: Fail when there are no nodes to run the validation on
      fail:
        msg: >-
          No idle nodes were found on partition "{{ partition }}" and no
          positive `num_nodes` was given. Nothing to run the NCCL test on.
      when: num_nodes_ | int < 1

    # ---- Run the NCCL allreduce test ----------------------------------------
    - name: Set nccl run command
      set_fact:
        ncclcmd: |
          srun --export={{ srun_exports }} \
            -p {{ partition }} \
            --time {{ timelimit }} \
            -N {{ num_nodes_ }} \
            --ntasks-per-node={{ num_gpus }} \
            --gpus-per-task=1 \
            --exclusive \
            --mpi=pmi2 \
            --no-container-remap-root \
            --container-image="{{ nccl_tests_container }}" \
            {{ srun_options }} \
            {{ allreduce_command }}

    - name: Print node/gpu counts and nccl tests run command
      debug:
        msg:
          - "Detected {{ num_nodes_ }} nodes with {{ num_gpus }} gpus each."
          - "Proceeding to run validation test, this may take several
            minutes:\n{{ ncclcmd }}"

    # On failure the rescue section prints the registered result, saves
    # stdout/stderr on the control machine, and then fails the play.
    - name: Run the NCCL allreduce test and capture diagnostics on failure
      block:
        - name: Execute NCCL allreduce test
          shell:
            cmd: "{{ ncclcmd }}"
          register: result
          no_log: true
      rescue:
        - name: NCCL allreduce running error
          debug:
            msg: "{{ result }}"

        # default('') keeps this task from raising a second error if the
        # failed result carries no stdout/stderr.
        - name: Save error results to /tmp/nccl_tests.err
          copy:
            content: "STDOUT:\n{{ result.stdout | default('') }}\n\nSTDERR:\n{{ result.stderr | default('') }}"
            dest: /tmp/nccl_tests.err
          delegate_to: localhost

        - name: Fail Slurm validation playbook
          fail:
            msg: See debug of stderr above. Also refer to "/tmp/nccl_tests.err"

    # ---- Report results ------------------------------------------------------
    - name: Save results to /tmp/nccl_tests.out
      copy:
        content: "STDERR:\n{{ result.stderr }}\n\nSTDOUT:\n{{ result.stdout }}"
        dest: /tmp/nccl_tests.out
      delegate_to: localhost

    # The job output is passed to grep on stdin instead of being pasted into
    # a shell command line, so quotes, `$` or backticks in the output cannot
    # break or alter the command. grep exits non-zero (failing this task)
    # when neither summary line is present.
    - name: Extract "Out of bounds values" and "Avg bus bandwidth".
      command:
        argv:
          - grep
          - "-E"
          - "Out of bounds values|Avg bus bandwidth"
        stdin: "{{ result.stdout }}"
      register: result_vals
      changed_when: false
      when: result.stdout != ""

    # `result_vals.stdout` is undefined when the extract task was skipped
    # (empty job output), hence the default.
    - name: Print and analyze results
      debug:
        msg:
          - "{{ result_vals.stdout | default('No NCCL summary lines found: the test produced no standard output.') }}"
          - Out of bounds values should be 0 otherwise FAIL.
          - Bandwidth will vary based on GPU topology and network interconnect types.
          - Refer to /tmp/nccl_tests.out for detailed NCCL allreduce output.

    # ---- Cleanup --------------------------------------------------------------
    # Only removes the container when this run compiled it and cleanup was
    # requested.
    - name: Cleanup NCCL container.
      file:
        path: "{{ nccl_tests_container }}"
        state: absent
      when:
        - cleanup | bool
        - compile_nccl_tests | bool