diff --git a/cookbooks/aws-parallelcluster-config/recipes/nvidia.rb b/cookbooks/aws-parallelcluster-config/recipes/nvidia.rb
index b5add9a5c7..60a4b6d5a5 100644
--- a/cookbooks/aws-parallelcluster-config/recipes/nvidia.rb
+++ b/cookbooks/aws-parallelcluster-config/recipes/nvidia.rb
@@ -35,14 +35,18 @@
     group 'root'
     mode '0644'
   end
-  # Install nvidia_persistenced. See https://download.nvidia.com/XFree86/Linux-x86_64/396.51/README/nvidia-persistenced.html
-  bash 'Install nvidia_persistenced' do
-    cwd '/usr/share/doc/NVIDIA_GLX-1.0/samples'
-    user 'root'
+
+  # Install the ParallelCluster NVIDIA systemd service.
+  # The service ensures that the device file /dev/nvidia0 is created after a reboot; the slurmd service depends on it, see
+  # cookbooks/aws-parallelcluster-slurm/templates/default/slurm/compute/slurmd_nvidia_persistenced.conf.erb
+  #
+  # The service starts nvidia-persistenced or runs nvidia-smi to avoid a race condition with other services
+  template '/etc/systemd/system/parallelcluster_nvidia.service' do
+    source 'nvidia/parallelcluster_nvidia_service.erb'
+    owner 'root'
     group 'root'
-    code <<-NVIDIA
-    tar -xf nvidia-persistenced-init.tar.bz2
-    ./nvidia-persistenced-init/install.sh
-    NVIDIA
+    mode '0644'
+    action :create
+    variables(is_nvidia_persistenced_running: is_process_running('/usr/bin/nvidia-persistenced'))
   end
 end
diff --git a/cookbooks/aws-parallelcluster-config/templates/default/nvidia/parallelcluster_nvidia_service.erb b/cookbooks/aws-parallelcluster-config/templates/default/nvidia/parallelcluster_nvidia_service.erb
new file mode 100644
index 0000000000..1b6f9f8b59
--- /dev/null
+++ b/cookbooks/aws-parallelcluster-config/templates/default/nvidia/parallelcluster_nvidia_service.erb
@@ -0,0 +1,20 @@
+# This systemd service file is designed to trigger the creation of the device file /dev/nvidia0.
+# The service starts nvidia-persistenced if it is not already running; otherwise it executes nvidia-smi.
+
+[Unit]
+Description=ParallelCluster NVIDIA Daemon
+Wants=syslog.target
+
+[Service]
+<% if @is_nvidia_persistenced_running -%>
+Type=oneshot
+ExecStart=/usr/bin/nvidia-smi
+RemainAfterExit=yes
+<% else %>
+Type=forking
+ExecStart=/usr/bin/nvidia-persistenced --user root
+ExecStopPost=/bin/rm -rf /var/run/nvidia-persistenced
+<% end %>
+
+[Install]
+WantedBy=multi-user.target
\ No newline at end of file
diff --git a/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/compute/slurmd_nvidia_persistenced.conf.erb b/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/compute/slurmd_nvidia_persistenced.conf.erb
index 1ee8c068af..c75067d1d5 100644
--- a/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/compute/slurmd_nvidia_persistenced.conf.erb
+++ b/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/compute/slurmd_nvidia_persistenced.conf.erb
@@ -1,3 +1,3 @@
 [Unit]
-After=nvidia-persistenced.service
-Wants=nvidia-persistenced.service
+After=parallelcluster_nvidia.service
+Wants=parallelcluster_nvidia.service
diff --git a/libraries/helpers.rb b/libraries/helpers.rb
index f1e622d13b..9048dc319d 100644
--- a/libraries/helpers.rb
+++ b/libraries/helpers.rb
@@ -129,6 +129,24 @@ def ignore_failure(lookup)
   end
 end
 
+#
+# Check if a process is running.
+#
+# Runs `pgrep -f` without a shell, so process_name is never interpolated into
+# a shell command line and the check cannot match its own helper processes.
+#
+# @param process_name [String] pattern matched by pgrep against the full command line
+# @return [Boolean] true if at least one running process matches
+#
+def is_process_running(process_name)
+  # Passing the arguments separately makes Mixlib::ShellOut exec pgrep directly.
+  pgrep = Mixlib::ShellOut.new('pgrep', '-f', '--', process_name)
+  pgrep.run_command
+
+  # pgrep exits with 0 only when at least one process matched.
+  pgrep.exitstatus.zero?
+end
+
 #
 # Check if the instance has a GPU
 #