From 27fcdc6f2fe54074f1eb60ec84d086daeaa0c25d Mon Sep 17 00:00:00 2001 From: hanwenli Date: Thu, 15 May 2025 11:17:31 -0700 Subject: [PATCH 1/3] [integ-tests] Add tests from develop.yaml to isolated_regions.yaml --- .../configs/isolated_regions.yaml | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tests/integration-tests/configs/isolated_regions.yaml b/tests/integration-tests/configs/isolated_regions.yaml index 68769a4ce5..da3aefc6f6 100644 --- a/tests/integration-tests/configs/isolated_regions.yaml +++ b/tests/integration-tests/configs/isolated_regions.yaml @@ -140,6 +140,20 @@ test-suites: instances: {{ INSTANCES }} oss: {{ OSS }} schedulers: {{ SCHEDULERS }} + dcv: + test_dcv.py::test_dcv_configuration: + dimensions: + # DCV on GPU enabled instance + - regions: {{ REGIONS }} + instances: ["g4dn.2xlarge"] + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_dcv.py::test_dcv_with_remote_access: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} dns: test_dns.py::test_hit_no_cluster_dns_mpi: dimensions: @@ -417,6 +431,12 @@ test-suites: instances: {{ INSTANCES }} oss: {{ OSS }} schedulers: {{ SCHEDULERS }} + test_efs.py::test_efs_access_point: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} test_raid.py::test_raid_fault_tolerance_mode: dimensions: - regions: {{ REGIONS }} @@ -581,6 +601,13 @@ test-suites: instances: {{ INSTANCES }} oss: {{ OSS }} schedulers: {{ SCHEDULERS }} + pyxis: + test_pyxis.py::test_pyxis: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} # These tests cannot be executed in US isolated regions # because the feature Custom Resource is not supported in these regions. 
# custom_resource: From c53ecd9f58d798f3503716db1c07a44fd1450120 Mon Sep 17 00:00:00 2001 From: Hanwen Date: Mon, 6 Jan 2025 08:39:37 -0800 Subject: [PATCH 2/3] Enable DCV for us-iso regions Signed-off-by: Hanwen --- cli/src/pcluster/config/cluster_config.py | 6 ------ cli/src/pcluster/constants.py | 1 - cli/tests/pcluster/test_utils.py | 5 ----- cli/tests/pcluster/validators/test_feature_validators.py | 2 -- 4 files changed, 14 deletions(-) diff --git a/cli/src/pcluster/config/cluster_config.py b/cli/src/pcluster/config/cluster_config.py index b96cdbb8db..eb631adf2f 100644 --- a/cli/src/pcluster/config/cluster_config.py +++ b/cli/src/pcluster/config/cluster_config.py @@ -1639,7 +1639,6 @@ def _register_validators(self, context: ValidatorContext = None): # noqa: D102 imds_support=self.imds.imds_support, ) if self.head_node.dcv: - self._register_validator(FeatureRegionValidator, feature=Feature.DCV, region=self.region) self._register_validator( DcvValidator, instance_type=self.head_node.instance_type, @@ -2973,7 +2972,6 @@ def login_nodes_subnet_ids(self): def _register_login_node_validators(self): """Register all login node validators to ensure that the resource parameters are valid.""" - has_dcv_configured = False # Check if all subnets(head node, Login nodes, compute nodes) are in the same VPC and support DNS. 
self._register_validator( SubnetsValidator, @@ -3013,10 +3011,6 @@ def _register_login_node_validators(self): os=self.image.os, architecture=pool.architecture, ) - has_dcv_configured = True - - if has_dcv_configured: - self._register_validator(FeatureRegionValidator, feature=Feature.DCV, region=self.region) def _register_validators(self, context: ValidatorContext = None): # noqa: C901 super()._register_validators(context) diff --git a/cli/src/pcluster/constants.py b/cli/src/pcluster/constants.py index d1860b83cf..f1b5fe5364 100644 --- a/cli/src/pcluster/constants.py +++ b/cli/src/pcluster/constants.py @@ -260,7 +260,6 @@ class Feature(Enum): UNSUPPORTED_FEATURES_MAP = { Feature.BATCH: ["ap-northeast-3", "ap-southeast-5", "ap-southeast-7", "us-iso"], - Feature.DCV: ["us-iso"], Feature.FSX_LUSTRE: ["us-isob"], Feature.FILE_CACHE: ["us-iso"], Feature.FSX_ONTAP: ["us-iso"], diff --git a/cli/tests/pcluster/test_utils.py b/cli/tests/pcluster/test_utils.py index 13a0ced1a5..75c3872eb7 100644 --- a/cli/tests/pcluster/test_utils.py +++ b/cli/tests/pcluster/test_utils.py @@ -579,10 +579,6 @@ async def async_method(self, param): (Feature.BATCH, "us-iso-west-1", False), (Feature.BATCH, "us-isob-east-1", False), (Feature.BATCH, "us-isoWHATEVER", False), - (Feature.DCV, "us-iso-east-1", False), - (Feature.DCV, "us-iso-west-1", False), - (Feature.DCV, "us-isob-east-1", False), - (Feature.DCV, "us-isoWHATEVER", False), (Feature.FSX_LUSTRE, "us-isob-east-1", False), (Feature.FSX_LUSTRE, "us-isobWHATEVER", False), (Feature.FSX_ONTAP, "us-iso-east-1", False), @@ -596,7 +592,6 @@ async def async_method(self, param): (Feature.SLURM_DATABASE, "us-isoWHATEVER", True), (Feature.CLUSTER_HEALTH_METRICS, "us-isoWHATEVER", False), (Feature.BATCH, "WHATEVER-ELSE", True), - (Feature.DCV, "WHATEVER-ELSE", True), (Feature.FSX_LUSTRE, "WHATEVER-ELSE", True), (Feature.FSX_ONTAP, "WHATEVER-ELSE", True), (Feature.FSX_OPENZFS, "WHATEVER-ELSE", True), diff --git 
a/cli/tests/pcluster/validators/test_feature_validators.py b/cli/tests/pcluster/validators/test_feature_validators.py index e96ed11a6a..34cd4c0aa5 100644 --- a/cli/tests/pcluster/validators/test_feature_validators.py +++ b/cli/tests/pcluster/validators/test_feature_validators.py @@ -23,8 +23,6 @@ [ (Feature.BATCH, True, None), (Feature.BATCH, False, "AWS Batch scheduler is not supported in region 'WHATEVER-REGION'"), - (Feature.DCV, True, None), - (Feature.DCV, False, "Amazon DCV is not supported in region 'WHATEVER-REGION'"), (Feature.FSX_LUSTRE, True, None), (Feature.FSX_LUSTRE, False, "FSx Lustre is not supported in region 'WHATEVER-REGION'"), (Feature.FSX_ONTAP, True, None), From 0e5b5659d419c1ae499a8def99c887d181b75436 Mon Sep 17 00:00:00 2001 From: hanwenli Date: Fri, 9 May 2025 11:17:07 -0700 Subject: [PATCH 3/3] [build-image] Move logic from validation to test phase Validation phase is executed at the end of the image build on the same build instance. Test phase is executed after the image build and uses a separate test instance. Prior to this commit, both phases could be switched on/off with the DevSetting parameter `DisableValidateAndTest`. After this commit, validation phase is no longer used. 
Test phase can be switched on/off with the same DevSetting parameter --- .../imagebuilder/parallelcluster_test.yaml | 296 +++++++++++- .../parallelcluster_validate.yaml | 443 ------------------ .../pcluster/templates/imagebuilder_stack.py | 32 -- .../templates/test_imagebuilder_stack.py | 32 -- 4 files changed, 290 insertions(+), 513 deletions(-) diff --git a/cli/src/pcluster/resources/imagebuilder/parallelcluster_test.yaml b/cli/src/pcluster/resources/imagebuilder/parallelcluster_test.yaml index 00c6f0861f..bd2680fa80 100644 --- a/cli/src/pcluster/resources/imagebuilder/parallelcluster_test.yaml +++ b/cli/src/pcluster/resources/imagebuilder/parallelcluster_test.yaml @@ -58,6 +58,290 @@ phases: echo ${OS} + # Get input base AMI Architecture + - name: OperatingSystemArchitecture + action: ExecuteBash + inputs: + commands: + - | + set -v + ARCH=$(uname -m) + case ${ARCH} in + 'x86_64') + echo 'x86_64' + ;; + 'aarch64') + echo 'arm64' + ;; + *) + echo "The '${ARCH}' architecture is not supported. Failing build." 
&& exit 1 + ;; + esac + + # Get platform name + - name: PlatformName + action: ExecuteBash + inputs: + commands: + - | + set -v + OS='{{ test.OperatingSystemName.outputs.stdout }}' + + if [ `echo "${OS}" | grep -E '^(alinux|centos|rhel|rocky)'` ]; then + PLATFORM='RHEL' + elif [ `echo "${OS}" | grep -E '^ubuntu'` ]; then + PLATFORM='DEBIAN' + fi + + echo ${PLATFORM} + + ### conditions ### + - name: IntelMPISupported + action: ExecuteBash + inputs: + commands: + - | + set -v + [[ {{ test.OperatingSystemArchitecture.outputs.stdout }} != 'arm64' ]] && echo "true" || echo "false" + + - name: FabricManagerSupported + action: ExecuteBash + inputs: + commands: + - | + set -v + [[ {{ test.OperatingSystemArchitecture.outputs.stdout }} == 'arm64' ]] && echo "false" || echo "true" + + - name: LustreSupported + action: ExecuteBash + inputs: + commands: + - | + set -v + ARCHITECTURE='{{ test.OperatingSystemArchitecture.outputs.stdout }}' + OS='{{ test.OperatingSystemName.outputs.stdout }}' + if [ ${ARCHITECTURE} == 'arm64' ] && [[ ${OS} =~ ^(ubuntu(20|22)04|alinux(2|2023)|rhel8|rocky8|rhel9|rocky9)$ ]] || [ ${ARCHITECTURE} == 'x86_64' ]; then + echo "true" + else + echo "false" + fi + + ### versions ### + - name: MungeVersion + action: ExecuteBash + inputs: + commands: + - | + set -v + PATTERN=$(jq '.default.cluster.munge.munge_version' {{ CookbookDefaultFile }}) + VERSION=$(echo ${PATTERN} | tr -d '\n' | cut -d = -f 2 | xargs) + echo ${VERSION} + + - name: NvidiaDriverVersion + action: ExecuteBash + inputs: + commands: + - | + set -v + PATTERN=$(jq '.default.cluster.nvidia.driver_version' {{ CookbookDefaultFile }}) + VERSION=$(echo ${PATTERN} | tr -d '\n' | cut -d = -f 2 | xargs) + echo ${VERSION} + + - name: CudaVersion + action: ExecuteBash + inputs: + commands: + - | + set -v + PATTERN=$(jq '.default.cluster.nvidia.cuda.version' {{ CookbookDefaultFile }}) + VERSION=$(echo ${PATTERN} | tr -d '\n' | cut -d = -f 2 | xargs) + echo ${VERSION} + + - name: CudaSamplesDir + action: 
ExecuteBash + inputs: + commands: + - | + set -v + cuda_ver="{{ test.CudaVersion.outputs.stdout }}" + if [ ${cuda_ver} \> '11.4' ]; then + PATTERN=$(jq '.default.cluster.nvidia.cuda_samples_version' {{ CookbookDefaultFile }}) + VERSION=$(echo ${PATTERN} | tr -d '\n' | cut -d = -f 2 | xargs) + echo cuda-samples-${VERSION} + else + echo cuda-${cuda_ver} + fi + + ### utils ### + - name: PatchInSpecProfiles + action: ExecuteBash + inputs: + commands: + - | + set -v + sed -Ei "s#path: cookbooks/aws-parallelcluster#path: /etc/chef/cookbooks/aws-parallelcluster#g" /etc/chef/cookbooks/aws-parallelcluster-*/test/inspec.yml + echo "InSpec profiles patched" + + - name: NvidiaEnabled + action: ExecuteBash + inputs: + commands: + - | + set -v + NVIDIA_ENABLED=$(cat /etc/parallelcluster/image_dna.json | jq -r '.cluster.nvidia.enabled') + echo "${NVIDIA_ENABLED}" + + - name: HasGPU + action: ExecuteBash + inputs: + commands: + - | + set -v + HAS_GPU=$(lspci | grep -o "NVIDIA") || HAS_GPU="false" + echo "${HAS_GPU}" + + - name: Munge + action: ExecuteBash + inputs: + commands: + - | + set -vx + echo "check munge installed" + munge --version | grep {{ test.MungeVersion.outputs.stdout }} + [[ $? -ne 0 ]] && echo "Check munge version failed" && exit 1 + echo "Munge test passed" + + - name: EFAIntelMPI + action: ExecuteBash + inputs: + commands: + - | + set -vx + PLATFORM='{{ test.PlatformName.outputs.stdout }}' + + if [ {{ test.IntelMPISupported.outputs.stdout }} == true ]; then + echo "Checking efa packages installed..." + if [ ${PLATFORM} == RHEL ]; then + rpm -qa | grep libfabric && rpm -qa | grep efa- + [[ $? -ne 0 ]] && echo "Check efa rpm failed" && exit 1 + + echo "Checking Intel MPI 20xx installed and module available..." + unset MODULEPATH + source /etc/profile.d/modules.sh + (module avail intelmpi)2>&1 | grep "/opt/intel/mpi/20.*/modulefiles" + [[ $? 
-ne 0 ]] && echo "Check Intel MPI failed" && exit 1 + else + dpkg -l | grep libfabric && modinfo efa | grep efa && [ -d /opt/amazon/efa ] + [[ $? -ne 0 ]] && echo "Check efa deb failed" && exit 1 + fi + fi + echo "EFA test passed" + + - name: NvidiaCudaFabricManager + action: ExecuteBash + inputs: + commands: + - | + set -vx + PLATFORM='{{ test.PlatformName.outputs.stdout }}' + + if [[ {{ test.NvidiaEnabled.outputs.stdout }} == 'no' ]]; then + echo "Nvidia recipe not enabled, skipping." && exit 0 + fi + if [ {{ test.HasGPU.outputs.stdout }} == "false" ]; then + echo "No GPU detected, skipping." && exit 0 + fi + + driver_ver="{{ test.NvidiaDriverVersion.outputs.stdout }}" + export PATH="/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin:/opt/aws/bin" + + echo "Testing Nvidia driver version" + driver_output=$(nvidia-smi | grep -E -o "Driver Version: [0-9.]+") + [[ "${driver_output}" != "Driver Version: ${driver_ver}" ]] && echo "ERROR Installed version ${driver_output} but expected ${driver_ver}" && exit 1 + echo "Correctly installed Nvidia ${driver_output}" + + if [ {{ test.FabricManagerSupported.outputs.stdout }} == "true" ]; then + echo "Testing Nvidia Fabric Manager version" + nvidia_driver_version=$(modinfo -F version nvidia) + if [ "${PLATFORM}" == "RHEL" ]; then + yum list installed | grep "nvidia-fabric.*manager" | grep "${nvidia_driver_version}" || exit 1 + yum versionlock list | grep "nvidia-fabric.*manager" || exit 1 + else + apt list --installed | grep "nvidia-fabric.*manager" | grep "${nvidia_driver_version}" || exit 1 + apt-mark showhold | grep "nvidia-fabric.*manager" || exit 1 + fi + echo "Fabric Manager match Nvidia driver and version is locked" + fi + + echo "Testing CUDA installation with nvcc" + cuda_ver="{{ test.CudaVersion.outputs.stdout }}" + export PATH=/usr/local/cuda-${cuda_ver}/bin:${PATH} + export LD_LIBRARY_PATH=/usr/local/cuda-${cuda_ver}/lib64:${LD_LIBRARY_PATH} + cuda_output=$(nvcc -V | grep -E -o "release [0-9]+.[0-9]+") + [[ 
"${cuda_output}" != "release ${cuda_ver}" ]] && echo "ERROR Installed version ${cuda_output} but expected ${cuda_ver}" && exit 1 + echo "Correctly installed CUDA ${cuda_output}" + + echo "Testing CUDA with deviceQuery..." + if [ {{ test.OperatingSystemArchitecture.outputs.stdout }} != 'arm64' ]; then + /usr/local/cuda-${cuda_ver}/extras/demo_suite/deviceQuery | grep -o "Result = PASS" + [[ $? -ne 0 ]] && echo "CUDA deviceQuery test failed" && exit 1 + else + cd /usr/local/{{ test.CudaSamplesDir.outputs.stdout }}//Samples/1_Utilities/deviceQuery + if [ {{ test.OperatingSystemName.outputs.stdout }} == 'alinux2' ]; then + make + /usr/local/{{ test.CudaSamplesDir.outputs.stdout }}/bin/sbsa/linux/release/deviceQuery | grep -o "Result = PASS" + else + mkdir build && cd build + cmake .. \ + -DCMAKE_CUDA_ARCHITECTURES="75;80;86" \ + -DCMAKE_CUDA_COMPILER=/usr/local/cuda-${cuda_ver}/bin/nvcc \ + -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-${cuda_ver} \ + -DCMAKE_PREFIX_PATH=/usr/local/cuda-${cuda_ver} \ + ${CMAKE_ARGS} + make + ./deviceQuery | grep -o "Result = PASS" + fi + [[ $? -ne 0 ]] && echo "CUDA deviceQuery test failed" && exit 1 + fi + echo "CUDA deviceQuery test passed" + + - name: FSxLustre + action: ExecuteBash + inputs: + commands: + - | + set -vx + OS='{{ test.OperatingSystemName.outputs.stdout }}' + + [[ $? -ne 0 ]] && echo "Check for Lustre client failed" && exit 1 + echo "FSx Lustre test passed" + + - name: Python + action: ExecuteBash + inputs: + commands: + - | + set -vx + echo "Checking python3 installed..." + which python3 + [[ $? -ne 0 ]] && echo "Python3 is not installed" && exit 1 + echo "Python test passed" + + - name: DPKG + action: ExecuteBash + inputs: + commands: + - | + set -vx + PLATFORM='{{ test.PlatformName.outputs.stdout }}' + if [ ${PLATFORM} != DEBIAN ]; then + echo "Checking dpkg is not installed on non-debian OS..." 
+ if command -v dpkg &> /dev/null; then + echo "ERROR: dpkg found on non-Debian system" && exit 1 + fi + echo "dpkg test passed" + fi + ### versions ### - name: PythonVersion action: ExecuteBash @@ -121,7 +405,7 @@ phases: set -vx echo "Performing InSpec tests for AwsBatch on the AMI..." cd /etc/chef/cookbooks/aws-parallelcluster-awsbatch - inspec exec test --profiles-path . --controls /tag:testami/ --no-distinct-exit + inspec exec test --profiles-path . --controls /tag:install/ /tag:testami/ --no-distinct-exit [[ $? -ne 0 ]] && echo "InSpec tests for AwsBatch failed" && exit 1 echo "InSpec tests for AwsBatch passed" @@ -133,7 +417,7 @@ phases: set -vx echo "Performing InSpec tests for platform on the AMI..." cd /etc/chef/cookbooks/aws-parallelcluster-platform - inspec exec test --profiles-path . --controls /tag:testami/ --no-distinct-exit + inspec exec test --profiles-path . --controls /tag:install/ /tag:testami/ --no-distinct-exit [[ $? -ne 0 ]] && echo "InSpec tests for platform failed" && exit 1 echo "InSpec tests for platform passed" @@ -145,7 +429,7 @@ phases: set -vx echo "Performing InSpec tests for environment on the AMI..." cd /etc/chef/cookbooks/aws-parallelcluster-environment - inspec exec test --profiles-path . --controls /tag:testami/ --no-distinct-exit + inspec exec test --profiles-path . --controls /tag:install/ /tag:testami/ --no-distinct-exit [[ $? -ne 0 ]] && echo "InSpec tests for environment failed" && exit 1 echo "InSpec tests for environment passed" @@ -157,7 +441,7 @@ phases: set -vx echo "Performing InSpec tests for compute fleet on the AMI..." cd /etc/chef/cookbooks/aws-parallelcluster-computefleet - inspec exec test --profiles-path . --controls /tag:testami/ --no-distinct-exit + inspec exec test --profiles-path . --controls /tag:install/ /tag:testami/ --no-distinct-exit [[ $? 
-ne 0 ]] && echo "InSpec tests for compute fleet failed" && exit 1 echo "InSpec tests for compute fleet passed" @@ -169,7 +453,7 @@ phases: set -vx echo "Performing InSpec tests for shared cookbook on the AMI..." cd /etc/chef/cookbooks/aws-parallelcluster-shared - inspec exec test --profiles-path . --controls /tag:testami/ --no-distinct-exit + inspec exec test --profiles-path . --controls /tag:install/ /tag:testami/ --no-distinct-exit [[ $? -ne 0 ]] && echo "InSpec tests for shared cookbook failed" && exit 1 echo "InSpec tests for shared cookbook passed" @@ -181,6 +465,6 @@ phases: set -vx echo "Performing InSpec tests for slurm on the AMI..." cd /etc/chef/cookbooks/aws-parallelcluster-slurm - inspec exec test --profiles-path . --controls /tag:testami/ --no-distinct-exit + inspec exec test --profiles-path . --controls /tag:install/ /tag:testami/ --no-distinct-exit [[ $? -ne 0 ]] && echo "InSpec tests for slurm failed" && exit 1 echo "InSpec tests for slurm passed" diff --git a/cli/src/pcluster/resources/imagebuilder/parallelcluster_validate.yaml b/cli/src/pcluster/resources/imagebuilder/parallelcluster_validate.yaml index 425e71e5e9..e69de29bb2 100644 --- a/cli/src/pcluster/resources/imagebuilder/parallelcluster_validate.yaml +++ b/cli/src/pcluster/resources/imagebuilder/parallelcluster_validate.yaml @@ -1,443 +0,0 @@ -name: ParallelClusterValidate -description: Validate ParallelCluster AMI -schemaVersion: 1.0 - -constants: - - CookbookDefaultFile: - type: string - value: /etc/chef/node_attributes.json - -phases: - - name: validate - steps: - ### basic ### - - name: OperatingSystemRelease - action: ExecuteBash - inputs: - commands: - - | - set -v - FILE=/etc/os-release - if [ -e ${FILE} ]; then - . ${FILE} - echo "${ID}${VERSION_ID:+.${VERSION_ID}}" - else - echo "The file '${FILE}' does not exist. Failing build." 
&& exit 1 - fi - - # Get uniformed OS name - - name: OperatingSystemName - action: ExecuteBash - inputs: - commands: - - | - set -v - RELEASE='{{ validate.OperatingSystemRelease.outputs.stdout }}' - - if [ `echo "${RELEASE}" | grep -w '^amzn\.2'` ]; then - OS='alinux2' - elif [ `echo "${RELEASE}" | grep -w '^amzn\.2023'` ]; then - OS='alinux2023' - elif [ `echo "${RELEASE}" | grep '^ubuntu\.20'` ]; then - OS='ubuntu2004' - elif [ `echo "${RELEASE}" | grep '^ubuntu\.22'` ]; then - OS='ubuntu2204' - elif [ `echo "${RELEASE}" | grep '^ubuntu\.24'` ]; then - OS='ubuntu2404' - elif [ `echo "${RELEASE}" | grep '^rhel\.8'` ]; then - OS='rhel8' - elif [ `echo "${RELEASE}" | grep '^rocky\.8'` ]; then - OS='rocky8' - elif [ `echo "${RELEASE}" | grep '^rhel\.9'` ]; then - OS='rhel9' - elif [ `echo "${RELEASE}" | grep '^rocky\.9'` ]; then - OS='rocky9' - else - echo "Operating System '${RELEASE}' is not supported. Failing build." && exit 1 - fi - - echo ${OS} - - # Get input base AMI Architecture - - name: OperatingSystemArchitecture - action: ExecuteBash - inputs: - commands: - - | - set -v - ARCH=$(uname -m) - case ${ARCH} in - 'x86_64') - echo 'x86_64' - ;; - 'aarch64') - echo 'arm64' - ;; - *) - echo "The '${ARCH}' architecture is not supported. Failing build." 
&& exit 1 - ;; - esac - - # Get platform name - - name: PlatformName - action: ExecuteBash - inputs: - commands: - - | - set -v - OS='{{ validate.OperatingSystemName.outputs.stdout }}' - - if [ `echo "${OS}" | grep -E '^(alinux|centos|rhel|rocky)'` ]; then - PLATFORM='RHEL' - elif [ `echo "${OS}" | grep -E '^ubuntu'` ]; then - PLATFORM='DEBIAN' - fi - - echo ${PLATFORM} - - # Get AWS region - - name: AWSRegion - action: ExecuteBash - inputs: - commands: - - | - set -v - IMDS_TOKEN=$(curl --retry 3 --retry-delay 0 -s --fail -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 300") - AVAIL_ZONE=$(curl --retry 3 --retry-delay 0 -s --fail -H "X-aws-ec2-metadata-token: ${IMDS_TOKEN}" http://169.254.169.254/latest/meta-data/placement/availability-zone) - AWS_REGION=${AVAIL_ZONE::-1} - echo ${AWS_REGION} - - ### conditions ### - - name: IntelMPISupported - action: ExecuteBash - inputs: - commands: - - | - set -v - [[ {{ validate.OperatingSystemArchitecture.outputs.stdout }} != 'arm64' ]] && echo "true" || echo "false" - - - name: FabricManagerSupported - action: ExecuteBash - inputs: - commands: - - | - set -v - [[ {{ validate.OperatingSystemArchitecture.outputs.stdout }} == 'arm64' ]] && echo "false" || echo "true" - - - name: LustreSupported - action: ExecuteBash - inputs: - commands: - - | - set -v - ARCHITECTURE='{{ validate.OperatingSystemArchitecture.outputs.stdout }}' - OS='{{ validate.OperatingSystemName.outputs.stdout }}' - if [ ${ARCHITECTURE} == 'arm64' ] && [[ ${OS} =~ ^(ubuntu(20|22)04|alinux(2|2023)|rhel8|rocky8|rhel9|rocky9)$ ]] || [ ${ARCHITECTURE} == 'x86_64' ]; then - echo "true" - else - echo "false" - fi - - ### versions ### - - name: MungeVersion - action: ExecuteBash - inputs: - commands: - - | - set -v - PATTERN=$(jq '.default.cluster.munge.munge_version' {{ CookbookDefaultFile }}) - VERSION=$(echo ${PATTERN} | tr -d '\n' | cut -d = -f 2 | xargs) - echo ${VERSION} - - - name: NvidiaDriverVersion - action: 
ExecuteBash - inputs: - commands: - - | - set -v - PATTERN=$(jq '.default.cluster.nvidia.driver_version' {{ CookbookDefaultFile }}) - VERSION=$(echo ${PATTERN} | tr -d '\n' | cut -d = -f 2 | xargs) - echo ${VERSION} - - - name: CudaVersion - action: ExecuteBash - inputs: - commands: - - | - set -v - PATTERN=$(jq '.default.cluster.nvidia.cuda.version' {{ CookbookDefaultFile }}) - VERSION=$(echo ${PATTERN} | tr -d '\n' | cut -d = -f 2 | xargs) - echo ${VERSION} - - - name: CudaSamplesDir - action: ExecuteBash - inputs: - commands: - - | - set -v - cuda_ver="{{ validate.CudaVersion.outputs.stdout }}" - if [ ${cuda_ver} \> '11.4' ]; then - PATTERN=$(jq '.default.cluster.nvidia.cuda_samples_version' {{ CookbookDefaultFile }}) - VERSION=$(echo ${PATTERN} | tr -d '\n' | cut -d = -f 2 | xargs) - echo cuda-samples-${VERSION} - else - echo cuda-${cuda_ver} - fi - - ### utils ### - - name: PatchInSpecProfiles - action: ExecuteBash - inputs: - commands: - - | - set -v - sed -Ei "s#path: cookbooks/aws-parallelcluster#path: /etc/chef/cookbooks/aws-parallelcluster#g" /etc/chef/cookbooks/aws-parallelcluster-*/test/inspec.yml - echo "InSpec profiles patched" - - - name: NvidiaEnabled - action: ExecuteBash - inputs: - commands: - - | - set -v - NVIDIA_ENABLED=$(cat /etc/parallelcluster/image_dna.json | jq -r '.cluster.nvidia.enabled') - echo "${NVIDIA_ENABLED}" - - - name: HasGPU - action: ExecuteBash - inputs: - commands: - - | - set -v - HAS_GPU=$(lspci | grep -o "NVIDIA") || HAS_GPU="false" - echo "${HAS_GPU}" - - - name: Munge - action: ExecuteBash - inputs: - commands: - - | - set -vx - echo "check munge installed" - munge --version | grep {{ validate.MungeVersion.outputs.stdout }} - [[ $? 
-ne 0 ]] && echo "Check munge version failed" && exit 1 - echo "Munge test passed" - - - name: EFAIntelMPI - action: ExecuteBash - inputs: - commands: - - | - set -vx - PLATFORM='{{ validate.PlatformName.outputs.stdout }}' - - if [ {{ validate.IntelMPISupported.outputs.stdout }} == true ]; then - echo "Checking efa packages installed..." - if [ ${PLATFORM} == RHEL ]; then - rpm -qa | grep libfabric && rpm -qa | grep efa- - [[ $? -ne 0 ]] && echo "Check efa rpm failed" && exit 1 - - echo "Checking Intel MPI 20xx installed and module available..." - unset MODULEPATH - source /etc/profile.d/modules.sh - (module avail intelmpi)2>&1 | grep "/opt/intel/mpi/20.*/modulefiles" - [[ $? -ne 0 ]] && echo "Check Intel MPI failed" && exit 1 - else - dpkg -l | grep libfabric && modinfo efa | grep efa && [ -d /opt/amazon/efa ] - [[ $? -ne 0 ]] && echo "Check efa deb failed" && exit 1 - fi - fi - echo "EFA test passed" - - - name: NvidiaCudaFabricManager - action: ExecuteBash - inputs: - commands: - - | - set -vx - PLATFORM='{{ validate.PlatformName.outputs.stdout }}' - - if [[ {{ validate.NvidiaEnabled.outputs.stdout }} == 'no' ]]; then - echo "Nvidia recipe not enabled, skipping." && exit 0 - fi - if [ {{ validate.HasGPU.outputs.stdout }} == "false" ]; then - echo "No GPU detected, skipping." 
&& exit 0 - fi - - driver_ver="{{ validate.NvidiaDriverVersion.outputs.stdout }}" - export PATH="/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin:/opt/aws/bin" - - echo "Testing Nvidia driver version" - driver_output=$(nvidia-smi | grep -E -o "Driver Version: [0-9.]+") - [[ "${driver_output}" != "Driver Version: ${driver_ver}" ]] && "ERROR Installed version ${driver_output} but expected ${driver_ver}" && exit 1 - echo "Correctly installed Nvidia ${driver_output}" - - if [ {{ validate.FabricManagerSupported.outputs.stdout }} == "true" ]; then - echo "Testing Nvidia Fabric Manager version" - nvidia_driver_version=$(modinfo -F version nvidia) - if [ "${PLATFORM}" == "RHEL" ]; then - yum list installed | grep "nvidia-fabric.*manager" | grep "${nvidia_driver_version}" || exit 1 - yum versionlock list | grep "nvidia-fabric.*manager" || exit 1 - else - apt list --installed | grep "nvidia-fabric.*manager" | grep "${nvidia_driver_version}" || exit 1 - apt-mark showhold | grep "nvidia-fabric.*manager" || exit 1 - fi - echo "Fabric Manager match Nvidia driver and version is locked" - fi - - echo "Testing CUDA installation with nvcc" - cuda_ver="{{ validate.CudaVersion.outputs.stdout }}" - export PATH=/usr/local/cuda-${cuda_ver}/bin:${PATH} - export LD_LIBRARY_PATH=/usr/local/cuda-${cuda_ver}/lib64:${LD_LIBRARY_PATH} - cuda_output=$(nvcc -V | grep -E -o "release [0-9]+.[0-9]+") - [[ "${cuda_output}" != "release ${cuda_ver}" ]] && echo "ERROR Installed version ${cuda_output} but expected ${cuda_ver}" && exit 1 - echo "Correctly installed CUDA ${cuda_output}" - - echo "Testing CUDA with deviceQuery..." - if [ {{ validate.OperatingSystemArchitecture.outputs.stdout }} != 'arm64' ]; then - /usr/local/cuda-${cuda_ver}/extras/demo_suite/deviceQuery | grep -o "Result = PASS" - [[ $? 
-ne 0 ]] && echo "CUDA deviceQuery test failed" && exit 1 - else - cd /usr/local/{{ validate.CudaSamplesDir.outputs.stdout }}//Samples/1_Utilities/deviceQuery - if [ {{ validate.OperatingSystemName.outputs.stdout }} == 'alinux2' ]; then - make - /usr/local/{{ validate.CudaSamplesDir.outputs.stdout }}/bin/sbsa/linux/release/deviceQuery | grep -o "Result = PASS" - else - if [ {{ validate.OperatingSystemName.outputs.stdout }} == 'ubuntu2004' ]; then - MINI_CMAKE_VER_REQ=$(sed -n 's/cmake_minimum_required(\(VERSION \)\?\([0-9.]*\)).*/\2/p' CMakeLists.txt) - COOKBOOK_ENV=$(jq '.default.cluster.cookbook_virtualenv_path' {{ CookbookDefaultFile }}) - COOKBOOK_ENV_PATH=$(echo ${COOKBOOK_ENV} | tr -d '\n' | cut -d = -f 2 | xargs) - echo "Installing Cmake >= ${MINI_CMAKE_VER_REQ} in $COOKBOOK_ENV_PATH/bin" - . $COOKBOOK_ENV_PATH/bin/activate - $COOKBOOK_ENV_PATH/bin/pip3 install cmake>=$MINI_CMAKE_VER_REQ - CMAKE_ARGS="" - if [ -e $COOKBOOK_ENV_PATH/bin/cmake ]; then - CMAKE_ARGS="-DCMAKE_INSTALL_PREFIX=$COOKBOOK_ENV_PATH/bin/cmake ${CMAKE_ARGS}" - fi - fi - mkdir build && cd build - cmake .. \ - -DCMAKE_CUDA_ARCHITECTURES="75;80;86" \ - -DCMAKE_CUDA_COMPILER=/usr/local/cuda-${cuda_ver}/bin/nvcc \ - -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-${cuda_ver} \ - -DCMAKE_PREFIX_PATH=/usr/local/cuda-cuda-${cuda_ver} \ - ${CMAKE_ARGS} - make - ./deviceQuery | grep -o "Result = PASS" - if [ "${OS}" == 'ubuntu2004' ]; then - $COOKBOOK_ENV_PATH/bin/pip3 uninstall cmake -y - deactivate - fi - fi - [[ $? -ne 0 ]] && echo "CUDA deviceQuery test failed" && exit 1 - fi - echo "CUDA deviceQuery test passed" - - - name: FSxLustre - action: ExecuteBash - inputs: - commands: - - | - set -vx - OS='{{ validate.OperatingSystemName.outputs.stdout }}' - - [[ $? -ne 0 ]] && echo "Check for Lustre client failed" && exit 1 - echo "FSx Lustre test passed" - - - name: Python - action: ExecuteBash - inputs: - commands: - - | - set -vx - echo "Checking python3 installed..." - which python3 - [[ $? 
-ne 0 ]] && echo "Python3 is not installed" && exit 1 - echo "Python test passed" - - - name: DPKG - action: ExecuteBash - inputs: - commands: - - | - set -vx - PLATFORM='{{ validate.PlatformName.outputs.stdout }}' - if [ ${PLATFORM} != DEBIAN ]; then - echo "Checking dpkg is not installed on non-debian OS..." - if command -v dpkg &> /dev/null; then - echo "ERROR: dpkg found on non-Debian system" && exit 1 - fi - echo "dpkg test passed" - fi - - - name: InSpecValidationsForAwsBatch - action: ExecuteBash - inputs: - commands: - - | - set -vx - echo "Performing InSpec validation for AwsBatch on the AMI..." - cd /etc/chef/cookbooks/aws-parallelcluster-awsbatch - inspec exec test --profiles-path . --controls /tag:install/ --no-distinct-exit - [[ $? -ne 0 ]] && echo "InSpec validation for AwsBatch failed" && exit 1 - echo "InSpec validation for AwsBatch passed" - - - name: InSpecValidationsForPlatform - action: ExecuteBash - inputs: - commands: - - | - set -vx - echo "Performing InSpec validation for platform on the AMI..." - cd /etc/chef/cookbooks/aws-parallelcluster-platform - inspec exec test --profiles-path . --controls /tag:install/ --no-distinct-exit - [[ $? -ne 0 ]] && echo "InSpec validation for platform failed" && exit 1 - echo "InSpec validation for platform passed" - - - name: InSpecValidationsForEnvironment - action: ExecuteBash - inputs: - commands: - - | - set -vx - echo "Performing InSpec validation for environment on the AMI..." - cd /etc/chef/cookbooks/aws-parallelcluster-environment - inspec exec test --profiles-path . --controls /tag:install/ --no-distinct-exit - [[ $? -ne 0 ]] && echo "InSpec validation for environment failed" && exit 1 - echo "InSpec validation for environment passed" - - - name: InSpecValidationsForComputeFleet - action: ExecuteBash - inputs: - commands: - - | - set -vx - echo "Performing InSpec validation for compute fleet on the AMI..." - cd /etc/chef/cookbooks/aws-parallelcluster-computefleet - inspec exec test --profiles-path . 
--controls /tag:install/ --no-distinct-exit - [[ $? -ne 0 ]] && echo "InSpec validation for compute fleet failed" && exit 1 - echo "InSpec validation for compute fleet passed" - - - name: InSpecValidationsForShared - action: ExecuteBash - inputs: - commands: - - | - set -vx - echo "Performing InSpec validation for shared cookbook on the AMI..." - cd /etc/chef/cookbooks/aws-parallelcluster-shared - inspec exec test --profiles-path . --controls /tag:install/ --no-distinct-exit - [[ $? -ne 0 ]] && echo "InSpec validation for shared cookbook failed" && exit 1 - echo "InSpec validation for shared cookbook passed" - - - name: InSpecValidationsForSlurm - action: ExecuteBash - inputs: - commands: - - | - set -vx - echo "Performing InSpec validation for Slurm on the AMI..." - cd /etc/chef/cookbooks/aws-parallelcluster-slurm - inspec exec test --profiles-path . --controls /tag:install/ --no-distinct-exit - [[ $? -ne 0 ]] && echo "InSpec validation for Slurm failed" && exit 1 - echo "InSpec validation for slurm passed" diff --git a/cli/src/pcluster/templates/imagebuilder_stack.py b/cli/src/pcluster/templates/imagebuilder_stack.py index 50b1a5b8c6..57ebdcbc17 100644 --- a/cli/src/pcluster/templates/imagebuilder_stack.py +++ b/cli/src/pcluster/templates/imagebuilder_stack.py @@ -532,38 +532,6 @@ def _add_imagebuilder_components(self, build_tags, lambda_cleanup_policy_stateme else True ) if not disable_pcluster_component and not disable_validate_and_test_component: - validate_component_resource = imagebuilder.CfnComponent( - self, - id="ParallelClusterValidateComponent", - name=self._build_resource_name(IMAGEBUILDER_RESOURCE_NAME_PREFIX + "-Validate"), - version=utils.get_installed_version(base_version_only=True), - tags=build_tags, - description="Validate ParallelCluster AMI", - platform="Linux", - data=_load_yaml(imagebuilder_resources_dir, "parallelcluster_validate.yaml"), - ) - components.append( - imagebuilder.CfnImageRecipe.ComponentConfigurationProperty( - 
component_arn=Fn.ref("ParallelClusterValidateComponent") - ) - ) - components_resources.append(validate_component_resource) - if not self.custom_cleanup_lambda_role: - self._add_resource_delete_policy( - lambda_cleanup_policy_statements, - ["imagebuilder:DeleteComponent"], - [ - self.format_arn( - service="imagebuilder", - resource="component", - resource_name="{0}/*".format( - self._build_resource_name( - IMAGEBUILDER_RESOURCE_NAME_PREFIX + "-Validate", to_lower=True - ) - ), - ) - ], - ) test_component_resource = imagebuilder.CfnComponent( self, diff --git a/cli/tests/pcluster/templates/test_imagebuilder_stack.py b/cli/tests/pcluster/templates/test_imagebuilder_stack.py index 11533c2f50..ca2917686c 100644 --- a/cli/tests/pcluster/templates/test_imagebuilder_stack.py +++ b/cli/tests/pcluster/templates/test_imagebuilder_stack.py @@ -70,7 +70,6 @@ "UpdateOSComponent": {}, "ParallelClusterComponent": {}, "ParallelClusterTagComponent": {}, - "ParallelClusterValidateComponent": {}, "ParallelClusterTestComponent": {}, "ImageRecipe": {}, "ParallelClusterImage": {}, @@ -222,7 +221,6 @@ "ImageRecipe": {}, "ParallelClusterImage": {}, "ParallelClusterTagComponent": {}, - "ParallelClusterValidateComponent": {}, "ParallelClusterTestComponent": {}, "BuildNotificationTopic": {}, "BuildNotificationSubscription": {}, @@ -272,7 +270,6 @@ "ImageRecipe": {}, "ParallelClusterImage": {}, "ParallelClusterTagComponent": {}, - "ParallelClusterValidateComponent": {}, "ParallelClusterTestComponent": {}, "BuildNotificationTopic": {}, "BuildNotificationSubscription": {}, @@ -324,7 +321,6 @@ "ImageRecipe": {}, "ParallelClusterImage": {}, "ParallelClusterTagComponent": {}, - "ParallelClusterValidateComponent": {}, "ParallelClusterTestComponent": {}, "BuildNotificationTopic": {}, "BuildNotificationSubscription": {}, @@ -370,7 +366,6 @@ "InstanceProfile": {}, "InfrastructureConfiguration": {}, "ParallelClusterTagComponent": {}, - "ParallelClusterValidateComponent": {}, 
"ParallelClusterTestComponent": {}, "ImageRecipe": {}, "ParallelClusterImage": {}, @@ -424,7 +419,6 @@ "ImageRecipe": {}, "ParallelClusterImage": {}, "ParallelClusterComponent": {}, - "ParallelClusterValidateComponent": {}, "ParallelClusterTestComponent": {}, "BuildNotificationTopic": {}, "BuildNotificationSubscription": {}, @@ -474,7 +468,6 @@ "ImageRecipe": {}, "ParallelClusterImage": {}, "ParallelClusterComponent": {}, - "ParallelClusterValidateComponent": {}, "ParallelClusterTestComponent": {}, "BuildNotificationTopic": {}, "BuildNotificationSubscription": {}, @@ -532,7 +525,6 @@ "ImageRecipe": {}, "ParallelClusterImage": {}, "ParallelClusterComponent": {}, - "ParallelClusterValidateComponent": {}, "ParallelClusterTestComponent": {}, "BuildNotificationTopic": {}, "BuildNotificationSubscription": {}, @@ -581,7 +573,6 @@ "InfrastructureConfiguration": {}, "ParallelClusterComponent": {}, "ParallelClusterTagComponent": {}, - "ParallelClusterValidateComponent": {}, "ParallelClusterTestComponent": {}, "ImageRecipe": {}, "ParallelClusterImage": {}, @@ -631,7 +622,6 @@ "InfrastructureConfiguration": {}, "ParallelClusterComponent": {}, "ParallelClusterTagComponent": {}, - "ParallelClusterValidateComponent": {}, "ParallelClusterTestComponent": {}, "ImageRecipe": {}, "ParallelClusterImage": {}, @@ -681,7 +671,6 @@ "InfrastructureConfiguration": {}, "ParallelClusterComponent": {}, "ParallelClusterTagComponent": {}, - "ParallelClusterValidateComponent": {}, "ParallelClusterTestComponent": {}, "ImageRecipe": {}, "ParallelClusterImage": {}, @@ -1702,26 +1691,6 @@ def test_imagebuilder_instance_role( ] }, }, - { - "Action": "imagebuilder:DeleteComponent", - "Effect": "Allow", - "Resource": { - "Fn::Join": [ - "", - [ - "arn:", - {"Ref": "AWS::Partition"}, - ":imagebuilder:", - {"Ref": "AWS::Region"}, - ":", - {"Ref": "AWS::AccountId"}, - ":component/parallelclusterimage-validate-", - {"Fn::Select": [2, {"Fn::Split": ["/", {"Ref": "AWS::StackId"}]}]}, - "/*", - ], - ] - }, - }, 
{ "Action": "imagebuilder:DeleteComponent", "Effect": "Allow", @@ -2231,7 +2200,6 @@ def test_imagebuilder_lambda_execution_role( {"ComponentArn": {"Ref": "ParallelClusterTagComponent"}}, {"ComponentArn": "arn:aws:imagebuilder:us-east-1:aws:component/apache-tomcat-9-linux/1.0.0"}, {"ComponentArn": "arn:aws:imagebuilder:us-east-1:aws:component/amazon-cloudwatch-agent-linux/1.0.0"}, - {"ComponentArn": {"Ref": "ParallelClusterValidateComponent"}}, {"ComponentArn": {"Ref": "ParallelClusterTestComponent"}}, ], ),