Skip to content

Commit

Permalink
Replace nvidia-persistenced with parallelcluster_nvidia service
Browse files Browse the repository at this point in the history
The `parallelcluster_nvidia` service ensures the creation of the block device `/dev/nvidia0`,
which is needed by the `slurmd` service.

`parallelcluster_nvidia` starts `nvidia-persistenced` or runs `nvidia-smi`
to avoid a race condition with other services, and it avoids conflicts when using a DLAMI with a GPU instance.

### Tests
* Modified ChefSpec to verify new changes.

### References
Backport of: aws#2341

Signed-off-by: Enrico Usai <usai@amazon.com>
  • Loading branch information
francesco-giordano authored and enrico-usai committed Jul 7, 2023
1 parent cadadea commit 1eb92b7
Show file tree
Hide file tree
Showing 8 changed files with 101 additions and 25 deletions.
10 changes: 10 additions & 0 deletions cookbooks/aws-parallelcluster-platform/libraries/nvidia.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,13 @@ def nvidia_enabled?
# Report whether `lspci` lists at least one entry matching "NVIDIA" (case-insensitive).
#
# @return [Boolean] true when the lspci/grep pipeline prints any non-blank output
def graphic_instance?
  lspci = Mixlib::ShellOut.new("lspci | grep -i -o 'NVIDIA'")
  matches = lspci.run_command.stdout.strip
  !matches.empty?
end

#
# Check if a process is running
#
# Scans the output of `ps aux` for a line that contains +process_name+.
# The name is matched literally in Ruby instead of being interpolated into a
# shell pipeline, so quotes or regex metacharacters in it can neither alter the
# command that is executed nor widen the match.
#
# @param process_name [String] text to look for in the `ps aux` listing
# @return [Boolean] true when at least one matching line is found
def is_process_running(process_name)
  ps = Mixlib::ShellOut.new('ps aux')
  ps.run_command

  # Lines that belong to a `grep <process_name>` invocation are not counted as the process itself
  grep_invocation = /grep .*#{Regexp.escape(process_name)}/
  ps.stdout.each_line.any? do |line|
    line.include?(process_name) && !line.match?(grep_invocation)
  end
end
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,23 @@
mode '0644'
end

# Install nvidia_persistenced. See https://download.nvidia.com/XFree86/Linux-x86_64/396.51/README/nvidia-persistenced.html
bash 'Install nvidia_persistenced' do
# Install ParallelCluster nvidia service.
# The service ensures the creation of the block devices /dev/nvidia0 after reboot and it is needed by the slurmd service
# cookbooks/aws-parallelcluster-slurm/templates/default/slurm/compute/slurmd_nvidia_persistenced.conf.erb
#
# The service starts nvidia-persistenced or runs nvidia-smi to avoid a race condition with other services
template '/etc/systemd/system/parallelcluster_nvidia.service' do
only_if { graphic_instance? && nvidia_installed? }
cwd '/usr/share/doc/NVIDIA_GLX-1.0/samples'
user 'root'
source 'nvidia/parallelcluster_nvidia_service.erb'
owner 'root'
group 'root'
code <<-NVIDIA
tar -xf nvidia-persistenced-init.tar.bz2
./nvidia-persistenced-init/install.sh
NVIDIA
mode '0644'
action :create
variables(is_nvidia_persistenced_running: is_process_running('nvidia-persistenced'))
end

# Enable and start the parallelcluster_nvidia unit, but only when both
# graphic_instance? and nvidia_installed? hold.
service 'parallelcluster_nvidia' do
  action [:enable, :start]
  supports(restart: false)
  only_if { graphic_instance? && nvidia_installed? }
end
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,50 @@
)
end

it 'installs nvidia_persistenced' do
is_expected.to run_bash('Install nvidia_persistenced')
.with(
cwd: '/usr/share/doc/NVIDIA_GLX-1.0/samples',
user: 'root',
group: 'root'
)
.with_code(/tar -xf nvidia-persistenced-init.tar.bz2/)
.with_code(%r{./nvidia-persistenced-init/install.sh})
it 'creates the parallelcluster nvidia template with the correct attributes' do
  expectation = is_expected
  # The recipe fills the template variable from the same helper call, so compute it identically here
  expected_attributes = {
    source: 'nvidia/parallelcluster_nvidia_service.erb',
    owner: 'root',
    group: 'root',
    mode: '0644',
    variables: { is_nvidia_persistenced_running: is_process_running('nvidia-persistenced') },
  }

  expectation.to create_template('/etc/systemd/system/parallelcluster_nvidia.service').with(expected_attributes)
end

it 'starts parallelcluster_nvidia service' do
  # The resource must declare exactly these two actions, in this order
  declared_actions = [:enable, :start]
  is_expected.to enable_service('parallelcluster_nvidia').with(action: declared_actions)
end

context 'when nvidia-persistenced process is running' do
  before do
    # Stub the helper under its real name (it has no trailing "?"); a stub on a
    # different name leaves the recipe running the live process check.
    allow_any_instance_of(Object).to receive(:is_process_running).and_return(true)
  end

  it 'creates the parallelcluster nvidia template with the correct attributes' do
    is_expected.to create_template('/etc/systemd/system/parallelcluster_nvidia.service').with(
      source: 'nvidia/parallelcluster_nvidia_service.erb',
      owner: 'root',
      group: 'root',
      mode: '0644',
      # Pin the stubbed value so this context actually exercises the "running" branch
      variables: { is_nvidia_persistenced_running: true }
    )
  end
end

context 'when nvidia-persistenced process is not running' do
  before do
    # Stub the helper under its real name (it has no trailing "?"); a stub on a
    # different name leaves the recipe running the live process check.
    allow_any_instance_of(Object).to receive(:is_process_running).and_return(false)
  end

  it 'creates the parallelcluster nvidia template with the correct attributes' do
    is_expected.to create_template('/etc/systemd/system/parallelcluster_nvidia.service').with(
      source: 'nvidia/parallelcluster_nvidia_service.erb',
      owner: 'root',
      group: 'root',
      mode: '0644',
      # Pin the stubbed value so this context actually exercises the "not running" branch
      variables: { is_nvidia_persistenced_running: false }
    )
  end
end
end

Expand All @@ -50,7 +85,8 @@
it 'does not configure uvm' do
is_expected.not_to load_kernel_module('nvidia-uvm')
is_expected.not_to create_cookbook_file('nvidia.conf')
is_expected.not_to run_bash('Install nvidia_persistenced')
is_expected.not_to create_template('/etc/systemd/system/parallelcluster_nvidia.service')
is_expected.not_to enable_service('parallelcluster_nvidia')
end
end

Expand All @@ -63,7 +99,8 @@
it 'does not configure uvm' do
is_expected.not_to load_kernel_module('nvidia-uvm')
is_expected.not_to create_cookbook_file('nvidia.conf')
is_expected.not_to run_bash('Install nvidia_persistenced')
is_expected.not_to create_template('/etc/systemd/system/parallelcluster_nvidia.service')
is_expected.not_to enable_service('parallelcluster_nvidia')
end
end
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# This systemd service file is designed to trigger the creation of the device block file /dev/nvidia0.
# The service starts nvidia-persistenced if it is not already running; otherwise it executes nvidia-smi.

[Unit]
Description=ParallelCluster NVIDIA Daemon
Wants=syslog.target

[Service]
<% if @is_nvidia_persistenced_running -%>
# nvidia-smi runs to completion: with Type=oneshot, units ordered After= this one are
# started only once it has exited. RemainAfterExit keeps the unit active afterwards.
Type=oneshot
ExecStart=/usr/bin/nvidia-smi
RemainAfterExit=yes
<% else %>
Type=forking
ExecStart=/usr/bin/nvidia-persistenced --user root
ExecStopPost=/bin/rm -rf /var/run/nvidia-persistenced
<% end %>

[Install]
WantedBy=multi-user.target
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@
its('content') { should include("uvm") }
end

describe service('nvidia-persistenced') do
describe service('parallelcluster_nvidia') do
it { should be_enabled }
it { should be_running }
end
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
action :create
end

# Add systemd dependency between slurmd and nvidia-persistenced for NVIDIA GPU nodes
# Add systemd dependency between slurmd and parallelcluster_nvidia for NVIDIA GPU nodes
if graphic_instance? && nvidia_installed?
directory '/etc/systemd/system/slurmd.service.d' do
user 'root'
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
[Unit]
After=nvidia-persistenced.service
Wants=nvidia-persistenced.service
After=parallelcluster_nvidia.service
Wants=parallelcluster_nvidia.service
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,12 @@

describe 'Check slurmd systemd "after" dependencies'
describe command('systemctl list-dependencies --after --plain slurmd.service') do
its('stdout') { should include "nvidia-persistenced.service" }
its('stdout') { should include "parallelcluster_nvidia.service" }
end

describe 'Check slurmd systemd requirement dependencies'
describe command('systemctl list-dependencies --plain slurmd.service') do
its('stdout') { should include "nvidia-persistenced.service" }
its('stdout') { should include "parallelcluster_nvidia.service" }
end

describe 'Check that slurmd systemd drop-in configuration exists'
Expand Down

0 comments on commit 1eb92b7

Please sign in to comment.