diff --git a/cli/src/pcluster/config/cluster_config.py b/cli/src/pcluster/config/cluster_config.py index b96cdbb8db..eb631adf2f 100644 --- a/cli/src/pcluster/config/cluster_config.py +++ b/cli/src/pcluster/config/cluster_config.py @@ -1639,7 +1639,6 @@ def _register_validators(self, context: ValidatorContext = None): # noqa: D102 imds_support=self.imds.imds_support, ) if self.head_node.dcv: - self._register_validator(FeatureRegionValidator, feature=Feature.DCV, region=self.region) self._register_validator( DcvValidator, instance_type=self.head_node.instance_type, @@ -2973,7 +2972,6 @@ def login_nodes_subnet_ids(self): def _register_login_node_validators(self): """Register all login node validators to ensure that the resource parameters are valid.""" - has_dcv_configured = False # Check if all subnets(head node, Login nodes, compute nodes) are in the same VPC and support DNS. self._register_validator( SubnetsValidator, @@ -3013,10 +3011,6 @@ def _register_login_node_validators(self): os=self.image.os, architecture=pool.architecture, ) - has_dcv_configured = True - - if has_dcv_configured: - self._register_validator(FeatureRegionValidator, feature=Feature.DCV, region=self.region) def _register_validators(self, context: ValidatorContext = None): # noqa: C901 super()._register_validators(context) diff --git a/cli/src/pcluster/constants.py b/cli/src/pcluster/constants.py index d1860b83cf..f1b5fe5364 100644 --- a/cli/src/pcluster/constants.py +++ b/cli/src/pcluster/constants.py @@ -260,7 +260,6 @@ class Feature(Enum): UNSUPPORTED_FEATURES_MAP = { Feature.BATCH: ["ap-northeast-3", "ap-southeast-5", "ap-southeast-7", "us-iso"], - Feature.DCV: ["us-iso"], Feature.FSX_LUSTRE: ["us-isob"], Feature.FILE_CACHE: ["us-iso"], Feature.FSX_ONTAP: ["us-iso"], diff --git a/cli/src/pcluster/resources/imagebuilder/parallelcluster_test.yaml b/cli/src/pcluster/resources/imagebuilder/parallelcluster_test.yaml index 00c6f0861f..bd2680fa80 100644 --- a/cli/src/pcluster/resources/imagebuilder/parallelcluster_test.yaml +++ b/cli/src/pcluster/resources/imagebuilder/parallelcluster_test.yaml @@ -58,6 +58,290 @@ phases: echo ${OS} + # Get input base AMI Architecture + - name: OperatingSystemArchitecture + action: ExecuteBash + inputs: + commands: + - | + set -v + ARCH=$(uname -m) + case ${ARCH} in + 'x86_64') + echo 'x86_64' + ;; + 'aarch64') + echo 'arm64' + ;; + *) + echo "The '${ARCH}' architecture is not supported. Failing build." && exit 1 + ;; + esac + + # Get platform name + - name: PlatformName + action: ExecuteBash + inputs: + commands: + - | + set -v + OS='{{ test.OperatingSystemName.outputs.stdout }}' + + if [ `echo "${OS}" | grep -E '^(alinux|centos|rhel|rocky)'` ]; then + PLATFORM='RHEL' + elif [ `echo "${OS}" | grep -E '^ubuntu'` ]; then + PLATFORM='DEBIAN' + fi + + echo ${PLATFORM} + + ### conditions ### + - name: IntelMPISupported + action: ExecuteBash + inputs: + commands: + - | + set -v + [[ {{ test.OperatingSystemArchitecture.outputs.stdout }} != 'arm64' ]] && echo "true" || echo "false" + + - name: FabricManagerSupported + action: ExecuteBash + inputs: + commands: + - | + set -v + [[ {{ test.OperatingSystemArchitecture.outputs.stdout }} == 'arm64' ]] && echo "false" || echo "true" + + - name: LustreSupported + action: ExecuteBash + inputs: + commands: + - | + set -v + ARCHITECTURE='{{ test.OperatingSystemArchitecture.outputs.stdout }}' + OS='{{ test.OperatingSystemName.outputs.stdout }}' + if [ ${ARCHITECTURE} == 'arm64' ] && [[ ${OS} =~ ^(ubuntu(20|22)04|alinux(2|2023)|rhel8|rocky8|rhel9|rocky9)$ ]] || [ ${ARCHITECTURE} == 'x86_64' ]; then + echo "true" + else + echo "false" + fi + + ### versions ### + - name: MungeVersion + action: ExecuteBash + inputs: + commands: + - | + set -v + PATTERN=$(jq '.default.cluster.munge.munge_version' {{ CookbookDefaultFile }}) + VERSION=$(echo ${PATTERN} | tr -d '\n' | cut -d = -f 2 | xargs) + echo ${VERSION} + + - name: NvidiaDriverVersion + action: ExecuteBash + inputs: + commands: + - | + set -v + PATTERN=$(jq '.default.cluster.nvidia.driver_version' {{ CookbookDefaultFile }}) + VERSION=$(echo ${PATTERN} | tr -d '\n' | cut -d = -f 2 | xargs) + echo ${VERSION} + + - name: CudaVersion + action: ExecuteBash + inputs: + commands: + - | + set -v + PATTERN=$(jq '.default.cluster.nvidia.cuda.version' {{ CookbookDefaultFile }}) + VERSION=$(echo ${PATTERN} | tr -d '\n' | cut -d = -f 2 | xargs) + echo ${VERSION} + + - name: CudaSamplesDir + action: ExecuteBash + inputs: + commands: + - | + set -v + cuda_ver="{{ test.CudaVersion.outputs.stdout }}" + if [ ${cuda_ver} \> '11.4' ]; then + PATTERN=$(jq '.default.cluster.nvidia.cuda_samples_version' {{ CookbookDefaultFile }}) + VERSION=$(echo ${PATTERN} | tr -d '\n' | cut -d = -f 2 | xargs) + echo cuda-samples-${VERSION} + else + echo cuda-${cuda_ver} + fi + + ### utils ### + - name: PatchInSpecProfiles + action: ExecuteBash + inputs: + commands: + - | + set -v + sed -Ei "s#path: cookbooks/aws-parallelcluster#path: /etc/chef/cookbooks/aws-parallelcluster#g" /etc/chef/cookbooks/aws-parallelcluster-*/test/inspec.yml + echo "InSpec profiles patched" + + - name: NvidiaEnabled + action: ExecuteBash + inputs: + commands: + - | + set -v + NVIDIA_ENABLED=$(cat /etc/parallelcluster/image_dna.json | jq -r '.cluster.nvidia.enabled') + echo "${NVIDIA_ENABLED}" + + - name: HasGPU + action: ExecuteBash + inputs: + commands: + - | + set -v + HAS_GPU=$(lspci | grep -o "NVIDIA") || HAS_GPU="false" + echo "${HAS_GPU}" + + - name: Munge + action: ExecuteBash + inputs: + commands: + - | + set -vx + echo "check munge installed" + munge --version | grep {{ test.MungeVersion.outputs.stdout }} + [[ $? -ne 0 ]] && echo "Check munge version failed" && exit 1 + echo "Munge test passed" + + - name: EFAIntelMPI + action: ExecuteBash + inputs: + commands: + - | + set -vx + PLATFORM='{{ test.PlatformName.outputs.stdout }}' + + if [ {{ test.IntelMPISupported.outputs.stdout }} == true ]; then + echo "Checking efa packages installed..." + if [ ${PLATFORM} == RHEL ]; then + rpm -qa | grep libfabric && rpm -qa | grep efa- + [[ $? -ne 0 ]] && echo "Check efa rpm failed" && exit 1 + + echo "Checking Intel MPI 20xx installed and module available..." + unset MODULEPATH + source /etc/profile.d/modules.sh + (module avail intelmpi)2>&1 | grep "/opt/intel/mpi/20.*/modulefiles" + [[ $? -ne 0 ]] && echo "Check Intel MPI failed" && exit 1 + else + dpkg -l | grep libfabric && modinfo efa | grep efa && [ -d /opt/amazon/efa ] + [[ $? -ne 0 ]] && echo "Check efa deb failed" && exit 1 + fi + fi + echo "EFA test passed" + + - name: NvidiaCudaFabricManager + action: ExecuteBash + inputs: + commands: + - | + set -vx + PLATFORM='{{ test.PlatformName.outputs.stdout }}' + + if [[ {{ test.NvidiaEnabled.outputs.stdout }} == 'no' ]]; then + echo "Nvidia recipe not enabled, skipping." && exit 0 + fi + if [ {{ test.HasGPU.outputs.stdout }} == "false" ]; then + echo "No GPU detected, skipping." && exit 0 + fi + + driver_ver="{{ test.NvidiaDriverVersion.outputs.stdout }}" + export PATH="/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin:/opt/aws/bin" + + echo "Testing Nvidia driver version" + driver_output=$(nvidia-smi | grep -E -o "Driver Version: [0-9.]+") + [[ "${driver_output}" != "Driver Version: ${driver_ver}" ]] && "ERROR Installed version ${driver_output} but expected ${driver_ver}" && exit 1 + echo "Correctly installed Nvidia ${driver_output}" + + if [ {{ test.FabricManagerSupported.outputs.stdout }} == "true" ]; then + echo "Testing Nvidia Fabric Manager version" + nvidia_driver_version=$(modinfo -F version nvidia) + if [ "${PLATFORM}" == "RHEL" ]; then + yum list installed | grep "nvidia-fabric.*manager" | grep "${nvidia_driver_version}" || exit 1 + yum versionlock list | grep "nvidia-fabric.*manager" || exit 1 + else + apt list --installed | grep "nvidia-fabric.*manager" | grep "${nvidia_driver_version}" || exit 1 + apt-mark showhold | grep "nvidia-fabric.*manager" || exit 1 + fi + echo "Fabric Manager match Nvidia driver and version is locked" + fi + + echo "Testing CUDA installation with nvcc" + cuda_ver="{{ test.CudaVersion.outputs.stdout }}" + export PATH=/usr/local/cuda-${cuda_ver}/bin:${PATH} + export LD_LIBRARY_PATH=/usr/local/cuda-${cuda_ver}/lib64:${LD_LIBRARY_PATH} + cuda_output=$(nvcc -V | grep -E -o "release [0-9]+.[0-9]+") + [[ "${cuda_output}" != "release ${cuda_ver}" ]] && echo "ERROR Installed version ${cuda_output} but expected ${cuda_ver}" && exit 1 + echo "Correctly installed CUDA ${cuda_output}" + + echo "Testing CUDA with deviceQuery..." + if [ {{ test.OperatingSystemArchitecture.outputs.stdout }} != 'arm64' ]; then + /usr/local/cuda-${cuda_ver}/extras/demo_suite/deviceQuery | grep -o "Result = PASS" + [[ $? -ne 0 ]] && echo "CUDA deviceQuery test failed" && exit 1 + else + cd /usr/local/{{ test.CudaSamplesDir.outputs.stdout }}//Samples/1_Utilities/deviceQuery + if [ {{ test.OperatingSystemName.outputs.stdout }} == 'alinux2' ]; then + make + /usr/local/{{ test.CudaSamplesDir.outputs.stdout }}/bin/sbsa/linux/release/deviceQuery | grep -o "Result = PASS" + else + mkdir build && cd build + cmake .. \ + -DCMAKE_CUDA_ARCHITECTURES="75;80;86" \ + -DCMAKE_CUDA_COMPILER=/usr/local/cuda-${cuda_ver}/bin/nvcc \ + -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-${cuda_ver} \ + -DCMAKE_PREFIX_PATH=/usr/local/cuda-cuda-${cuda_ver} \ + ${CMAKE_ARGS} + make + ./deviceQuery | grep -o "Result = PASS" + fi + [[ $? -ne 0 ]] && echo "CUDA deviceQuery test failed" && exit 1 + fi + echo "CUDA deviceQuery test passed" + + - name: FSxLustre + action: ExecuteBash + inputs: + commands: + - | + set -vx + OS='{{ test.OperatingSystemName.outputs.stdout }}' + + [[ $? -ne 0 ]] && echo "Check for Lustre client failed" && exit 1 + echo "FSx Lustre test passed" + + - name: Python + action: ExecuteBash + inputs: + commands: + - | + set -vx + echo "Checking python3 installed..." + which python3 + [[ $? -ne 0 ]] && echo "Python3 is not installed" && exit 1 + echo "Python test passed" + + - name: DPKG + action: ExecuteBash + inputs: + commands: + - | + set -vx + PLATFORM='{{ test.PlatformName.outputs.stdout }}' + if [ ${PLATFORM} != DEBIAN ]; then + echo "Checking dpkg is not installed on non-debian OS..." + if command -v dpkg &> /dev/null; then + echo "ERROR: dpkg found on non-Debian system" && exit 1 + fi + echo "dpkg test passed" + fi + ### versions ### - name: PythonVersion action: ExecuteBash @@ -121,7 +405,7 @@ phases: set -vx echo "Performing InSpec tests for AwsBatch on the AMI..." cd /etc/chef/cookbooks/aws-parallelcluster-awsbatch - inspec exec test --profiles-path . --controls /tag:testami/ --no-distinct-exit + inspec exec test --profiles-path . --controls /tag:install/ /tag:testami/ --no-distinct-exit [[ $? -ne 0 ]] && echo "InSpec tests for AwsBatch failed" && exit 1 echo "InSpec tests for AwsBatch passed" @@ -133,7 +417,7 @@ phases: set -vx echo "Performing InSpec tests for platform on the AMI..." cd /etc/chef/cookbooks/aws-parallelcluster-platform - inspec exec test --profiles-path . --controls /tag:testami/ --no-distinct-exit + inspec exec test --profiles-path . --controls /tag:install/ /tag:testami/ --no-distinct-exit [[ $? -ne 0 ]] && echo "InSpec tests for platform failed" && exit 1 echo "InSpec tests for platform passed" @@ -145,7 +429,7 @@ phases: set -vx echo "Performing InSpec tests for environment on the AMI..." cd /etc/chef/cookbooks/aws-parallelcluster-environment - inspec exec test --profiles-path . --controls /tag:testami/ --no-distinct-exit + inspec exec test --profiles-path . --controls /tag:install/ /tag:testami/ --no-distinct-exit [[ $? -ne 0 ]] && echo "InSpec tests for environment failed" && exit 1 echo "InSpec tests for environment passed" @@ -157,7 +441,7 @@ phases: set -vx echo "Performing InSpec tests for compute fleet on the AMI..." cd /etc/chef/cookbooks/aws-parallelcluster-computefleet - inspec exec test --profiles-path . --controls /tag:testami/ --no-distinct-exit + inspec exec test --profiles-path . --controls /tag:install/ /tag:testami/ --no-distinct-exit [[ $? -ne 0 ]] && echo "InSpec tests for compute fleet failed" && exit 1 echo "InSpec tests for compute fleet passed" @@ -169,7 +453,7 @@ phases: set -vx echo "Performing InSpec tests for shared cookbook on the AMI..." cd /etc/chef/cookbooks/aws-parallelcluster-shared - inspec exec test --profiles-path . --controls /tag:testami/ --no-distinct-exit + inspec exec test --profiles-path . --controls /tag:install/ /tag:testami/ --no-distinct-exit [[ $? -ne 0 ]] && echo "InSpec tests for shared cookbook failed" && exit 1 echo "InSpec tests for shared cookbook passed" @@ -181,6 +465,6 @@ phases: set -vx echo "Performing InSpec tests for slurm on the AMI..." cd /etc/chef/cookbooks/aws-parallelcluster-slurm - inspec exec test --profiles-path . --controls /tag:testami/ --no-distinct-exit + inspec exec test --profiles-path . --controls /tag:install/ /tag:testami/ --no-distinct-exit [[ $? -ne 0 ]] && echo "InSpec tests for slurm failed" && exit 1 echo "InSpec tests for slurm passed" diff --git a/cli/src/pcluster/resources/imagebuilder/parallelcluster_validate.yaml b/cli/src/pcluster/resources/imagebuilder/parallelcluster_validate.yaml index 425e71e5e9..e69de29bb2 100644 --- a/cli/src/pcluster/resources/imagebuilder/parallelcluster_validate.yaml +++ b/cli/src/pcluster/resources/imagebuilder/parallelcluster_validate.yaml @@ -1,443 +0,0 @@ -name: ParallelClusterValidate -description: Validate ParallelCluster AMI -schemaVersion: 1.0 - -constants: - - CookbookDefaultFile: - type: string - value: /etc/chef/node_attributes.json - -phases: - - name: validate - steps: - ### basic ### - - name: OperatingSystemRelease - action: ExecuteBash - inputs: - commands: - - | - set -v - FILE=/etc/os-release - if [ -e ${FILE} ]; then - . ${FILE} - echo "${ID}${VERSION_ID:+.${VERSION_ID}}" - else - echo "The file '${FILE}' does not exist. Failing build." && exit 1 - fi - - # Get uniformed OS name - - name: OperatingSystemName - action: ExecuteBash - inputs: - commands: - - | - set -v - RELEASE='{{ validate.OperatingSystemRelease.outputs.stdout }}' - - if [ `echo "${RELEASE}" | grep -w '^amzn\.2'` ]; then - OS='alinux2' - elif [ `echo "${RELEASE}" | grep -w '^amzn\.2023'` ]; then - OS='alinux2023' - elif [ `echo "${RELEASE}" | grep '^ubuntu\.20'` ]; then - OS='ubuntu2004' - elif [ `echo "${RELEASE}" | grep '^ubuntu\.22'` ]; then - OS='ubuntu2204' - elif [ `echo "${RELEASE}" | grep '^ubuntu\.24'` ]; then - OS='ubuntu2404' - elif [ `echo "${RELEASE}" | grep '^rhel\.8'` ]; then - OS='rhel8' - elif [ `echo "${RELEASE}" | grep '^rocky\.8'` ]; then - OS='rocky8' - elif [ `echo "${RELEASE}" | grep '^rhel\.9'` ]; then - OS='rhel9' - elif [ `echo "${RELEASE}" | grep '^rocky\.9'` ]; then - OS='rocky9' - else - echo "Operating System '${RELEASE}' is not supported. Failing build." && exit 1 - fi - - echo ${OS} - - # Get input base AMI Architecture - - name: OperatingSystemArchitecture - action: ExecuteBash - inputs: - commands: - - | - set -v - ARCH=$(uname -m) - case ${ARCH} in - 'x86_64') - echo 'x86_64' - ;; - 'aarch64') - echo 'arm64' - ;; - *) - echo "The '${ARCH}' architecture is not supported. Failing build." && exit 1 - ;; - esac - - # Get platform name - - name: PlatformName - action: ExecuteBash - inputs: - commands: - - | - set -v - OS='{{ validate.OperatingSystemName.outputs.stdout }}' - - if [ `echo "${OS}" | grep -E '^(alinux|centos|rhel|rocky)'` ]; then - PLATFORM='RHEL' - elif [ `echo "${OS}" | grep -E '^ubuntu'` ]; then - PLATFORM='DEBIAN' - fi - - echo ${PLATFORM} - - # Get AWS region - - name: AWSRegion - action: ExecuteBash - inputs: - commands: - - | - set -v - IMDS_TOKEN=$(curl --retry 3 --retry-delay 0 -s --fail -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 300") - AVAIL_ZONE=$(curl --retry 3 --retry-delay 0 -s --fail -H "X-aws-ec2-metadata-token: ${IMDS_TOKEN}" http://169.254.169.254/latest/meta-data/placement/availability-zone) - AWS_REGION=${AVAIL_ZONE::-1} - echo ${AWS_REGION} - - ### conditions ### - - name: IntelMPISupported - action: ExecuteBash - inputs: - commands: - - | - set -v - [[ {{ validate.OperatingSystemArchitecture.outputs.stdout }} != 'arm64' ]] && echo "true" || echo "false" - - - name: FabricManagerSupported - action: ExecuteBash - inputs: - commands: - - | - set -v - [[ {{ validate.OperatingSystemArchitecture.outputs.stdout }} == 'arm64' ]] && echo "false" || echo "true" - - - name: LustreSupported - action: ExecuteBash - inputs: - commands: - - | - set -v - ARCHITECTURE='{{ validate.OperatingSystemArchitecture.outputs.stdout }}' - OS='{{ validate.OperatingSystemName.outputs.stdout }}' - if [ ${ARCHITECTURE} == 'arm64' ] && [[ ${OS} =~ ^(ubuntu(20|22)04|alinux(2|2023)|rhel8|rocky8|rhel9|rocky9)$ ]] || [ ${ARCHITECTURE} == 'x86_64' ]; then - echo "true" - else - echo "false" - fi - - ### versions ### - - name: MungeVersion - action: ExecuteBash - inputs: - commands: - - | - set -v - PATTERN=$(jq '.default.cluster.munge.munge_version' {{ CookbookDefaultFile }}) - VERSION=$(echo ${PATTERN} | tr -d '\n' | cut -d = -f 2 | xargs) - echo ${VERSION} - - - name: NvidiaDriverVersion - action: ExecuteBash - inputs: - commands: - - | - set -v - PATTERN=$(jq '.default.cluster.nvidia.driver_version' {{ CookbookDefaultFile }}) - VERSION=$(echo ${PATTERN} | tr -d '\n' | cut -d = -f 2 | xargs) - echo ${VERSION} - - - name: CudaVersion - action: ExecuteBash - inputs: - commands: - - | - set -v - PATTERN=$(jq '.default.cluster.nvidia.cuda.version' {{ CookbookDefaultFile }}) - VERSION=$(echo ${PATTERN} | tr -d '\n' | cut -d = -f 2 | xargs) - echo ${VERSION} - - - name: CudaSamplesDir - action: ExecuteBash - inputs: - commands: - - | - set -v - cuda_ver="{{ validate.CudaVersion.outputs.stdout }}" - if [ ${cuda_ver} \> '11.4' ]; then - PATTERN=$(jq '.default.cluster.nvidia.cuda_samples_version' {{ CookbookDefaultFile }}) - VERSION=$(echo ${PATTERN} | tr -d '\n' | cut -d = -f 2 | xargs) - echo cuda-samples-${VERSION} - else - echo cuda-${cuda_ver} - fi - - ### utils ### - - name: PatchInSpecProfiles - action: ExecuteBash - inputs: - commands: - - | - set -v - sed -Ei "s#path: cookbooks/aws-parallelcluster#path: /etc/chef/cookbooks/aws-parallelcluster#g" /etc/chef/cookbooks/aws-parallelcluster-*/test/inspec.yml - echo "InSpec profiles patched" - - - name: NvidiaEnabled - action: ExecuteBash - inputs: - commands: - - | - set -v - NVIDIA_ENABLED=$(cat /etc/parallelcluster/image_dna.json | jq -r '.cluster.nvidia.enabled') - echo "${NVIDIA_ENABLED}" - - - name: HasGPU - action: ExecuteBash - inputs: - commands: - - | - set -v - HAS_GPU=$(lspci | grep -o "NVIDIA") || HAS_GPU="false" - echo "${HAS_GPU}" - - - name: Munge - action: ExecuteBash - inputs: - commands: - - | - set -vx - echo "check munge installed" - munge --version | grep {{ validate.MungeVersion.outputs.stdout }} - [[ $? -ne 0 ]] && echo "Check munge version failed" && exit 1 - echo "Munge test passed" - - - name: EFAIntelMPI - action: ExecuteBash - inputs: - commands: - - | - set -vx - PLATFORM='{{ validate.PlatformName.outputs.stdout }}' - - if [ {{ validate.IntelMPISupported.outputs.stdout }} == true ]; then - echo "Checking efa packages installed..." - if [ ${PLATFORM} == RHEL ]; then - rpm -qa | grep libfabric && rpm -qa | grep efa- - [[ $? -ne 0 ]] && echo "Check efa rpm failed" && exit 1 - - echo "Checking Intel MPI 20xx installed and module available..." - unset MODULEPATH - source /etc/profile.d/modules.sh - (module avail intelmpi)2>&1 | grep "/opt/intel/mpi/20.*/modulefiles" - [[ $? -ne 0 ]] && echo "Check Intel MPI failed" && exit 1 - else - dpkg -l | grep libfabric && modinfo efa | grep efa && [ -d /opt/amazon/efa ] - [[ $? -ne 0 ]] && echo "Check efa deb failed" && exit 1 - fi - fi - echo "EFA test passed" - - - name: NvidiaCudaFabricManager - action: ExecuteBash - inputs: - commands: - - | - set -vx - PLATFORM='{{ validate.PlatformName.outputs.stdout }}' - - if [[ {{ validate.NvidiaEnabled.outputs.stdout }} == 'no' ]]; then - echo "Nvidia recipe not enabled, skipping." && exit 0 - fi - if [ {{ validate.HasGPU.outputs.stdout }} == "false" ]; then - echo "No GPU detected, skipping." && exit 0 - fi - - driver_ver="{{ validate.NvidiaDriverVersion.outputs.stdout }}" - export PATH="/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin:/opt/aws/bin" - - echo "Testing Nvidia driver version" - driver_output=$(nvidia-smi | grep -E -o "Driver Version: [0-9.]+") - [[ "${driver_output}" != "Driver Version: ${driver_ver}" ]] && "ERROR Installed version ${driver_output} but expected ${driver_ver}" && exit 1 - echo "Correctly installed Nvidia ${driver_output}" - - if [ {{ validate.FabricManagerSupported.outputs.stdout }} == "true" ]; then - echo "Testing Nvidia Fabric Manager version" - nvidia_driver_version=$(modinfo -F version nvidia) - if [ "${PLATFORM}" == "RHEL" ]; then - yum list installed | grep "nvidia-fabric.*manager" | grep "${nvidia_driver_version}" || exit 1 - yum versionlock list | grep "nvidia-fabric.*manager" || exit 1 - else - apt list --installed | grep "nvidia-fabric.*manager" | grep "${nvidia_driver_version}" || exit 1 - apt-mark showhold | grep "nvidia-fabric.*manager" || exit 1 - fi - echo "Fabric Manager match Nvidia driver and version is locked" - fi - - echo "Testing CUDA installation with nvcc" - cuda_ver="{{ validate.CudaVersion.outputs.stdout }}" - export PATH=/usr/local/cuda-${cuda_ver}/bin:${PATH} - export LD_LIBRARY_PATH=/usr/local/cuda-${cuda_ver}/lib64:${LD_LIBRARY_PATH} - cuda_output=$(nvcc -V | grep -E -o "release [0-9]+.[0-9]+") - [[ "${cuda_output}" != "release ${cuda_ver}" ]] && echo "ERROR Installed version ${cuda_output} but expected ${cuda_ver}" && exit 1 - echo "Correctly installed CUDA ${cuda_output}" - - echo "Testing CUDA with deviceQuery..." - if [ {{ validate.OperatingSystemArchitecture.outputs.stdout }} != 'arm64' ]; then - /usr/local/cuda-${cuda_ver}/extras/demo_suite/deviceQuery | grep -o "Result = PASS" - [[ $? -ne 0 ]] && echo "CUDA deviceQuery test failed" && exit 1 - else - cd /usr/local/{{ validate.CudaSamplesDir.outputs.stdout }}//Samples/1_Utilities/deviceQuery - if [ {{ validate.OperatingSystemName.outputs.stdout }} == 'alinux2' ]; then - make - /usr/local/{{ validate.CudaSamplesDir.outputs.stdout }}/bin/sbsa/linux/release/deviceQuery | grep -o "Result = PASS" - else - if [ {{ validate.OperatingSystemName.outputs.stdout }} == 'ubuntu2004' ]; then - MINI_CMAKE_VER_REQ=$(sed -n 's/cmake_minimum_required(\(VERSION \)\?\([0-9.]*\)).*/\2/p' CMakeLists.txt) - COOKBOOK_ENV=$(jq '.default.cluster.cookbook_virtualenv_path' {{ CookbookDefaultFile }}) - COOKBOOK_ENV_PATH=$(echo ${COOKBOOK_ENV} | tr -d '\n' | cut -d = -f 2 | xargs) - echo "Installing Cmake >= ${MINI_CMAKE_VER_REQ} in $COOKBOOK_ENV_PATH/bin" - . $COOKBOOK_ENV_PATH/bin/activate - $COOKBOOK_ENV_PATH/bin/pip3 install cmake>=$MINI_CMAKE_VER_REQ - CMAKE_ARGS="" - if [ -e $COOKBOOK_ENV_PATH/bin/cmake ]; then - CMAKE_ARGS="-DCMAKE_INSTALL_PREFIX=$COOKBOOK_ENV_PATH/bin/cmake ${CMAKE_ARGS}" - fi - fi - mkdir build && cd build - cmake .. \ - -DCMAKE_CUDA_ARCHITECTURES="75;80;86" \ - -DCMAKE_CUDA_COMPILER=/usr/local/cuda-${cuda_ver}/bin/nvcc \ - -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-${cuda_ver} \ - -DCMAKE_PREFIX_PATH=/usr/local/cuda-cuda-${cuda_ver} \ - ${CMAKE_ARGS} - make - ./deviceQuery | grep -o "Result = PASS" - if [ "${OS}" == 'ubuntu2004' ]; then - $COOKBOOK_ENV_PATH/bin/pip3 uninstall cmake -y - deactivate - fi - fi - [[ $? -ne 0 ]] && echo "CUDA deviceQuery test failed" && exit 1 - fi - echo "CUDA deviceQuery test passed" - - - name: FSxLustre - action: ExecuteBash - inputs: - commands: - - | - set -vx - OS='{{ validate.OperatingSystemName.outputs.stdout }}' - - [[ $? -ne 0 ]] && echo "Check for Lustre client failed" && exit 1 - echo "FSx Lustre test passed" - - - name: Python - action: ExecuteBash - inputs: - commands: - - | - set -vx - echo "Checking python3 installed..." - which python3 - [[ $? -ne 0 ]] && echo "Python3 is not installed" && exit 1 - echo "Python test passed" - - - name: DPKG - action: ExecuteBash - inputs: - commands: - - | - set -vx - PLATFORM='{{ validate.PlatformName.outputs.stdout }}' - if [ ${PLATFORM} != DEBIAN ]; then - echo "Checking dpkg is not installed on non-debian OS..." - if command -v dpkg &> /dev/null; then - echo "ERROR: dpkg found on non-Debian system" && exit 1 - fi - echo "dpkg test passed" - fi - - - name: InSpecValidationsForAwsBatch - action: ExecuteBash - inputs: - commands: - - | - set -vx - echo "Performing InSpec validation for AwsBatch on the AMI..." - cd /etc/chef/cookbooks/aws-parallelcluster-awsbatch - inspec exec test --profiles-path . --controls /tag:install/ --no-distinct-exit - [[ $? -ne 0 ]] && echo "InSpec validation for AwsBatch failed" && exit 1 - echo "InSpec validation for AwsBatch passed" - - - name: InSpecValidationsForPlatform - action: ExecuteBash - inputs: - commands: - - | - set -vx - echo "Performing InSpec validation for platform on the AMI..." - cd /etc/chef/cookbooks/aws-parallelcluster-platform - inspec exec test --profiles-path . --controls /tag:install/ --no-distinct-exit - [[ $? -ne 0 ]] && echo "InSpec validation for platform failed" && exit 1 - echo "InSpec validation for platform passed" - - - name: InSpecValidationsForEnvironment - action: ExecuteBash - inputs: - commands: - - | - set -vx - echo "Performing InSpec validation for environment on the AMI..." - cd /etc/chef/cookbooks/aws-parallelcluster-environment - inspec exec test --profiles-path . --controls /tag:install/ --no-distinct-exit - [[ $? -ne 0 ]] && echo "InSpec validation for environment failed" && exit 1 - echo "InSpec validation for environment passed" - - - name: InSpecValidationsForComputeFleet - action: ExecuteBash - inputs: - commands: - - | - set -vx - echo "Performing InSpec validation for compute fleet on the AMI..." - cd /etc/chef/cookbooks/aws-parallelcluster-computefleet - inspec exec test --profiles-path . --controls /tag:install/ --no-distinct-exit - [[ $? -ne 0 ]] && echo "InSpec validation for compute fleet failed" && exit 1 - echo "InSpec validation for compute fleet passed" - - - name: InSpecValidationsForShared - action: ExecuteBash - inputs: - commands: - - | - set -vx - echo "Performing InSpec validation for shared cookbook on the AMI..." - cd /etc/chef/cookbooks/aws-parallelcluster-shared - inspec exec test --profiles-path . --controls /tag:install/ --no-distinct-exit - [[ $? -ne 0 ]] && echo "InSpec validation for shared cookbook failed" && exit 1 - echo "InSpec validation for shared cookbook passed" - - - name: InSpecValidationsForSlurm - action: ExecuteBash - inputs: - commands: - - | - set -vx - echo "Performing InSpec validation for Slurm on the AMI..." - cd /etc/chef/cookbooks/aws-parallelcluster-slurm - inspec exec test --profiles-path . --controls /tag:install/ --no-distinct-exit - [[ $? -ne 0 ]] && echo "InSpec validation for Slurm failed" && exit 1 - echo "InSpec validation for slurm passed" diff --git a/cli/src/pcluster/templates/imagebuilder_stack.py b/cli/src/pcluster/templates/imagebuilder_stack.py index 50b1a5b8c6..57ebdcbc17 100644 --- a/cli/src/pcluster/templates/imagebuilder_stack.py +++ b/cli/src/pcluster/templates/imagebuilder_stack.py @@ -532,38 +532,6 @@ def _add_imagebuilder_components(self, build_tags, lambda_cleanup_policy_stateme else True ) if not disable_pcluster_component and not disable_validate_and_test_component: - validate_component_resource = imagebuilder.CfnComponent( - self, - id="ParallelClusterValidateComponent", - name=self._build_resource_name(IMAGEBUILDER_RESOURCE_NAME_PREFIX + "-Validate"), - version=utils.get_installed_version(base_version_only=True), - tags=build_tags, - description="Validate ParallelCluster AMI", - platform="Linux", - data=_load_yaml(imagebuilder_resources_dir, "parallelcluster_validate.yaml"), - ) - components.append( - imagebuilder.CfnImageRecipe.ComponentConfigurationProperty( - component_arn=Fn.ref("ParallelClusterValidateComponent") - ) - ) - components_resources.append(validate_component_resource) - if not self.custom_cleanup_lambda_role: - self._add_resource_delete_policy( - lambda_cleanup_policy_statements, - ["imagebuilder:DeleteComponent"], - [ - self.format_arn( - service="imagebuilder", - resource="component", - resource_name="{0}/*".format( - self._build_resource_name( - IMAGEBUILDER_RESOURCE_NAME_PREFIX + "-Validate", to_lower=True - ) - ), - ) - ], - ) test_component_resource = imagebuilder.CfnComponent( self, diff --git a/cli/tests/pcluster/templates/test_imagebuilder_stack.py b/cli/tests/pcluster/templates/test_imagebuilder_stack.py index 11533c2f50..ca2917686c 100644 --- a/cli/tests/pcluster/templates/test_imagebuilder_stack.py +++ b/cli/tests/pcluster/templates/test_imagebuilder_stack.py @@ -70,7 +70,6 @@ "UpdateOSComponent": {}, "ParallelClusterComponent": {}, "ParallelClusterTagComponent": {}, - "ParallelClusterValidateComponent": {}, "ParallelClusterTestComponent": {}, "ImageRecipe": {}, "ParallelClusterImage": {}, @@ -222,7 +221,6 @@ "ImageRecipe": {}, "ParallelClusterImage": {}, "ParallelClusterTagComponent": {}, - "ParallelClusterValidateComponent": {}, "ParallelClusterTestComponent": {}, "BuildNotificationTopic": {}, "BuildNotificationSubscription": {}, @@ -272,7 +270,6 @@ "ImageRecipe": {}, "ParallelClusterImage": {}, "ParallelClusterTagComponent": {}, - "ParallelClusterValidateComponent": {}, "ParallelClusterTestComponent": {}, "BuildNotificationTopic": {}, "BuildNotificationSubscription": {}, @@ -324,7 +321,6 @@ "ImageRecipe": {}, "ParallelClusterImage": {}, "ParallelClusterTagComponent": {}, - "ParallelClusterValidateComponent": {}, "ParallelClusterTestComponent": {}, "BuildNotificationTopic": {}, "BuildNotificationSubscription": {}, @@ -370,7 +366,6 @@ "InstanceProfile": {}, "InfrastructureConfiguration": {}, "ParallelClusterTagComponent": {}, - "ParallelClusterValidateComponent": {}, "ParallelClusterTestComponent": {}, "ImageRecipe": {}, "ParallelClusterImage": {}, @@ -424,7 +419,6 @@ "ImageRecipe": {}, "ParallelClusterImage": {}, "ParallelClusterComponent": {}, - "ParallelClusterValidateComponent": {}, "ParallelClusterTestComponent": {}, "BuildNotificationTopic": {}, "BuildNotificationSubscription": {}, @@ -474,7 +468,6 @@ "ImageRecipe": {}, "ParallelClusterImage": {}, "ParallelClusterComponent": {}, - "ParallelClusterValidateComponent": {}, "ParallelClusterTestComponent": {}, "BuildNotificationTopic": {}, "BuildNotificationSubscription": {}, @@ -532,7 +525,6 @@ "ImageRecipe": {}, "ParallelClusterImage": {}, "ParallelClusterComponent": {}, - "ParallelClusterValidateComponent": {}, "ParallelClusterTestComponent": {}, "BuildNotificationTopic": {}, "BuildNotificationSubscription": {}, @@ -581,7 +573,6 @@ "InfrastructureConfiguration": {}, "ParallelClusterComponent": {}, "ParallelClusterTagComponent": {}, - "ParallelClusterValidateComponent": {}, "ParallelClusterTestComponent": {}, "ImageRecipe": {}, "ParallelClusterImage": {}, @@ -631,7 +622,6 @@ "InfrastructureConfiguration": {}, "ParallelClusterComponent": {}, "ParallelClusterTagComponent": {}, - "ParallelClusterValidateComponent": {}, "ParallelClusterTestComponent": {}, "ImageRecipe": {}, "ParallelClusterImage": {}, @@ -681,7 +671,6 @@ "InfrastructureConfiguration": {}, "ParallelClusterComponent": {}, "ParallelClusterTagComponent": {}, - "ParallelClusterValidateComponent": {}, "ParallelClusterTestComponent": {}, "ImageRecipe": {}, "ParallelClusterImage": {}, @@ -1702,26 +1691,6 @@ def test_imagebuilder_instance_role( ] }, }, - { - "Action": "imagebuilder:DeleteComponent", - "Effect": "Allow", - "Resource": { - "Fn::Join": [ - "", - [ - "arn:", - {"Ref": "AWS::Partition"}, - ":imagebuilder:", - {"Ref": "AWS::Region"}, - ":", - {"Ref": "AWS::AccountId"}, - ":component/parallelclusterimage-validate-", - {"Fn::Select": [2, {"Fn::Split": ["/", {"Ref": "AWS::StackId"}]}]}, - "/*", - ], - ] - }, - }, { "Action": "imagebuilder:DeleteComponent", "Effect": "Allow", @@ -2231,7 +2200,6 @@ def test_imagebuilder_lambda_execution_role( {"ComponentArn": {"Ref": "ParallelClusterTagComponent"}}, {"ComponentArn": "arn:aws:imagebuilder:us-east-1:aws:component/apache-tomcat-9-linux/1.0.0"}, {"ComponentArn": "arn:aws:imagebuilder:us-east-1:aws:component/amazon-cloudwatch-agent-linux/1.0.0"}, - {"ComponentArn": {"Ref": "ParallelClusterValidateComponent"}}, {"ComponentArn": {"Ref": "ParallelClusterTestComponent"}}, ], ), diff --git a/cli/tests/pcluster/test_utils.py b/cli/tests/pcluster/test_utils.py index 13a0ced1a5..75c3872eb7 100644 --- a/cli/tests/pcluster/test_utils.py +++ b/cli/tests/pcluster/test_utils.py @@ -579,10 +579,6 @@ async def async_method(self, param): (Feature.BATCH, "us-iso-west-1", False), (Feature.BATCH, "us-isob-east-1", False), (Feature.BATCH, "us-isoWHATEVER", False), - (Feature.DCV, "us-iso-east-1", False), - (Feature.DCV, "us-iso-west-1", False), - (Feature.DCV, "us-isob-east-1", False), - (Feature.DCV, "us-isoWHATEVER", False), (Feature.FSX_LUSTRE, "us-isob-east-1", False), (Feature.FSX_LUSTRE, "us-isobWHATEVER", False), (Feature.FSX_ONTAP, "us-iso-east-1", False), @@ -596,7 +592,6 @@ async def async_method(self, param): (Feature.SLURM_DATABASE, "us-isoWHATEVER", True), (Feature.CLUSTER_HEALTH_METRICS, "us-isoWHATEVER", False), (Feature.BATCH, "WHATEVER-ELSE", True), - (Feature.DCV, "WHATEVER-ELSE", True), (Feature.FSX_LUSTRE, "WHATEVER-ELSE", True), (Feature.FSX_ONTAP, "WHATEVER-ELSE", True), (Feature.FSX_OPENZFS, "WHATEVER-ELSE", True), diff --git a/cli/tests/pcluster/validators/test_feature_validators.py b/cli/tests/pcluster/validators/test_feature_validators.py index e96ed11a6a..34cd4c0aa5 100644 --- a/cli/tests/pcluster/validators/test_feature_validators.py +++ b/cli/tests/pcluster/validators/test_feature_validators.py @@ -23,8 +23,6 @@ [ (Feature.BATCH, True, None), (Feature.BATCH, False, "AWS Batch scheduler is not supported in region 'WHATEVER-REGION'"), - (Feature.DCV, True, None), - (Feature.DCV, False, "Amazon DCV is not supported in region 'WHATEVER-REGION'"), (Feature.FSX_LUSTRE, True, None), (Feature.FSX_LUSTRE, False, "FSx Lustre is not supported in region 'WHATEVER-REGION'"), (Feature.FSX_ONTAP, True, None), diff --git a/tests/integration-tests/configs/isolated_regions.yaml b/tests/integration-tests/configs/isolated_regions.yaml index 68769a4ce5..da3aefc6f6 100644 --- a/tests/integration-tests/configs/isolated_regions.yaml +++ b/tests/integration-tests/configs/isolated_regions.yaml @@ -140,6 +140,20 @@ test-suites: instances: {{ INSTANCES }} oss: {{ OSS }} schedulers: {{ SCHEDULERS }} + dcv: + test_dcv.py::test_dcv_configuration: + dimensions: + # DCV on GPU enabled instance + - regions: {{ REGIONS }} + instances: ["g4dn.2xlarge"] + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_dcv.py::test_dcv_with_remote_access: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} dns: test_dns.py::test_hit_no_cluster_dns_mpi: dimensions: @@ -417,6 +431,12 @@ test-suites: instances: {{ INSTANCES }} oss: {{ OSS }} schedulers: {{ SCHEDULERS }} + test_efs.py::test_efs_access_point: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} test_raid.py::test_raid_fault_tolerance_mode: dimensions: - regions: {{ REGIONS }} @@ -581,6 +601,13 @@ test-suites: instances: {{ INSTANCES }} oss: {{ OSS }} schedulers: {{ SCHEDULERS }} + pyxis: + test_pyxis.py::test_pyxis: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} # These tests cannot be executed in US isolated regions # because the feature Custom Resource is not supported in these regions. # custom_resource: