Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Decouple OSU benchmarks from EFA integration tests #5928

Merged
merged 5 commits into from
Dec 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 0 additions & 7 deletions tests/integration-tests/configs/ad_integration.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,3 @@ test-suites:
instances: {{ common.INSTANCES_DEFAULT_X86 }}
oss: ["alinux2", "ubuntu2004"]
schedulers: ["slurm"]
benchmarks:
- mpi_variants: ["openmpi"]
num_instances: [100]
osu_benchmarks:
# Available collective benchmarks "osu_allgather", "osu_allreduce", "osu_alltoall", "osu_barrier", "osu_bcast", "osu_gather", "osu_reduce", "osu_reduce_scatter", "osu_scatter"
collective: ["osu_allreduce", "osu_alltoall"]
pt2pt: []
24 changes: 0 additions & 24 deletions tests/integration-tests/configs/common/common.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,6 @@ ad_integration:
instances: {{ common.INSTANCES_DEFAULT_X86 }}
oss: ["alinux2", "rhel8"]
schedulers: ["slurm"]
benchmarks:
- mpi_variants: ["openmpi"]
num_instances: [5]
osu_benchmarks:
collective: ["osu_alltoall"]
- regions: ["eu-west-1"]
instances: {{ common.INSTANCES_DEFAULT_X86 }}
oss: ["ubuntu2204", "centos7"]
Expand Down Expand Up @@ -240,13 +235,6 @@ disable_hyperthreading:
instances: ["c5.xlarge"]
oss: ["ubuntu2204"]
schedulers: ["slurm"]
benchmarks:
- mpi_variants: ["openmpi", "intelmpi"]
num_instances: [5] # Change the head node instance type if you'd test more than 30 instances
slots_per_instance: 2
partition: "ht-disabled"
osu_benchmarks:
collective: ["osu_allreduce", "osu_alltoall"]
dns:
test_dns.py::test_hit_no_cluster_dns_mpi:
dimensions:
Expand Down Expand Up @@ -569,12 +557,6 @@ storage:
instances: {{ common.INSTANCES_DEFAULT_X86 }}
oss: ["rhel8"]
schedulers: ["slurm"]
benchmarks:
- mpi_variants: ["openmpi"]
num_instances: [5] # Change the head node instance type if you'd test more than 30 instances
slots_per_instance: 2
osu_benchmarks:
collective: ["osu_alltoall"]
- regions: ["eu-west-2"]
instances: {{ common.INSTANCES_DEFAULT_ARM }}
oss: ["ubuntu2004"]
Expand Down Expand Up @@ -630,12 +612,6 @@ storage:
instances: {{ common.INSTANCES_DEFAULT_ARM }}
oss: ["ubuntu2204"]
schedulers: [ "slurm" ]
benchmarks:
- mpi_variants: ["intelmpi"]
num_instances: [5] # Change the head node instance type if you'd test more than 30 instances
slots_per_instance: 2
osu_benchmarks:
collective: ["osu_alltoall"]
- regions: ["us-gov-east-1"]
instances: {{ common.INSTANCES_DEFAULT_X86 }}
oss: ["rhel8"]
Expand Down
18 changes: 0 additions & 18 deletions tests/integration-tests/configs/isolated_regions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,6 @@ test-suites:
instances: {{ INSTANCES }}
oss: {{ OSS }}
schedulers: {{ SCHEDULERS }}
benchmarks:
- mpi_variants: ["openmpi"]
num_instances: [5]
osu_benchmarks:
collective: ["osu_alltoall"]
# This test cannot be executed in US isolated regions because it relies on a CloudFormation stack using resources
# that are not supported by CloudFormation in ADC, i.e. CapacityReservation and ResourceGroup.
# capacity_reservations:
Expand Down Expand Up @@ -153,13 +148,6 @@ test-suites:
instances: {{ INSTANCES }}
oss: {{ OSS }}
schedulers: {{ SCHEDULERS }}
benchmarks:
- mpi_variants: ["openmpi", "intelmpi"]
num_instances: [20] # Change the head node instance type if you'd test more than 30 instances
slots_per_instance: 2
partition: "ht-disabled"
osu_benchmarks:
collective: ["osu_allreduce", "osu_alltoall"]
dns:
test_dns.py::test_hit_no_cluster_dns_mpi:
dimensions:
Expand Down Expand Up @@ -460,12 +448,6 @@ test-suites:
instances: {{ INSTANCES }}
oss: {{ OSS }}
schedulers: {{ SCHEDULERS }}
benchmarks:
- mpi_variants: ["openmpi", "intelmpi"]
num_instances: [20] # Change the head node instance type if you'd test more than 30 instances
slots_per_instance: 2
osu_benchmarks:
collective: ["osu_allreduce", "osu_alltoall"]
test_raid.py::test_raid_fault_tolerance_mode:
dimensions:
- regions: {{ REGIONS }}
Expand Down
24 changes: 0 additions & 24 deletions tests/integration-tests/configs/new_os.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,6 @@ test-suites:
instances: {{ common.INSTANCES_DEFAULT_X86 }}
oss: {{ NEW_OS }}
schedulers: ["slurm"]
benchmarks:
- mpi_variants: [ "openmpi", "intelmpi" ]
num_instances: [ 4 ]
slots_per_instance: 2
osu_benchmarks:
collective: [ "osu_alltoall" ]
arm_pl:
test_arm_pl.py::test_arm_pl:
dimensions:
Expand Down Expand Up @@ -100,12 +94,6 @@ test-suites:
instances: ["m4.xlarge"]
oss: {{ NEW_OS }}
schedulers: ["slurm"]
benchmarks:
- mpi_variants: [ "openmpi", "intelmpi" ]
num_instances: [ 4 ]
slots_per_instance: 2
osu_benchmarks:
collective: [ "osu_alltoall" ]
dns:
test_dns.py::test_hit_no_cluster_dns_mpi:
dimensions:
Expand Down Expand Up @@ -237,12 +225,6 @@ test-suites:
instances: {{ common.INSTANCES_DEFAULT_ARM }}
oss: {{ NEW_OS }}
schedulers: ["slurm"]
benchmarks:
- mpi_variants: [ "openmpi", "intelmpi" ]
num_instances: [ 4 ]
slots_per_instance: 2
osu_benchmarks:
collective: [ "osu_alltoall" ]
test_fsx_lustre.py::test_fsx_lustre_configuration_options:
dimensions:
- regions: ["us-east-2"]
Expand All @@ -261,12 +243,6 @@ test-suites:
instances: {{ common.INSTANCES_DEFAULT_ARM }}
oss: {{ NEW_OS }}
schedulers: [ "slurm" ]
benchmarks:
- mpi_variants: [ "openmpi", "intelmpi" ]
num_instances: [ 4 ]
slots_per_instance: 2
osu_benchmarks:
collective: [ "osu_alltoall" ]
test_raid.py::test_raid_performance_mode:
dimensions:
- regions: ["ap-south-1"]
Expand Down
4 changes: 2 additions & 2 deletions tests/integration-tests/configs/osu.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{%- import 'common.jinja2' as common with context -%}
test-suites:
efa:
test_efa.py::test_efa:
performance_tests:
test_osu.py::test_osu:
dimensions:
- regions: [ "euw1-az1" ] # do not move, unless capacity reservation is moved as well
instances: [ "c5n.18xlarge" ]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@
)

from tests.ad_integration.cluster_user import ClusterUser
from tests.common.osu_common import compile_osu
from tests.common.utils import get_sts_endpoint, retrieve_latest_ami, run_system_analyzer
from tests.storage.test_fsx_lustre import create_fsx_ontap, create_fsx_open_zfs

Expand Down Expand Up @@ -427,8 +426,7 @@ def _directory_factory(
_delete_certificate(certificate_arn=certificate_arn, region=region)


def _run_user_workloads(users, test_datadir, remote_command_executor, shared_storage_mount_dirs):
compile_osu("openmpi", remote_command_executor)
def _run_user_workloads(users, test_datadir, shared_storage_mount_dirs):
_check_whoami(users)
_check_files_permissions(users, shared_storage_mount_dirs)
job_submission_outputs = [
Expand Down Expand Up @@ -659,8 +657,6 @@ def test_ad_integration(
request,
store_secret_in_secret_manager,
clusters_factory,
run_benchmarks,
benchmarks,
):
"""
Verify AD integration works as expected.
Expand All @@ -670,18 +666,12 @@ def test_ad_integration(
3. SSH key for AD users is created when the property GenerateSshKeysForUsers is true;
4. AD users can submit workloads;
5. AD users filter out by LdapAccessFilter cannot access to the head node.

Optionally, it executes performance tests using OSU benchmarks.
"""
if not is_directory_supported(region, directory_type):
pytest.skip(f"Skipping the test because directory type {directory_type} is not supported in region {region}")

head_node_instance_type = "c5n.18xlarge" if request.config.getoption("benchmarks") else "c5.xlarge"
compute_instance_type_info = {"name": "c5.xlarge", "num_cores": 4}
fsx_supported = is_fsx_supported(region)
config_params = {
"compute_instance_type": compute_instance_type_info.get("name"),
"head_node_instance_type": head_node_instance_type,
"fsx_supported": fsx_supported,
}
directory_stack_name, nlb_stack_name = directory_factory(
Expand Down Expand Up @@ -711,7 +701,7 @@ def test_ad_integration(
)
if fsx_supported:
config_params.update(get_fsx_config_param_vals(fsx_factory, svm_factory))
cluster_config = pcluster_config_reader(benchmarks=benchmarks, **config_params)
cluster_config = pcluster_config_reader(**config_params)
cluster = clusters_factory(cluster_config)

certificate_secret_arn = nlb_stack_parameters.get("CertificateSecretArn")
Expand Down Expand Up @@ -758,14 +748,12 @@ def test_ad_integration(
shared_storage_mount_dirs = ["/shared", "/efs"]
if fsx_supported:
shared_storage_mount_dirs.extend(["/fsxlustre", "/fsxontap", "/fsxopenzfs"])
_run_user_workloads(users, test_datadir, remote_command_executor, shared_storage_mount_dirs)
_run_user_workloads(users, test_datadir, shared_storage_mount_dirs)
logging.info("Testing pcluster update and generate ssh keys for user")
_check_ssh_key_generation(users[0], remote_command_executor, scheduler_commands, False)

# Verify access control with ldap access provider.
updated_config_file = pcluster_config_reader(
config_file="pcluster.config.update.yaml", benchmarks=benchmarks, **config_params
)
updated_config_file = pcluster_config_reader(config_file="pcluster.config.update.yaml", **config_params)
cluster.update(str(updated_config_file), force_update="true")
# Reset stateful connection variables after the cluster update
remote_command_executor = RemoteCommandExecutor(cluster)
Expand All @@ -779,9 +767,7 @@ def test_ad_integration(

# Verify access control with simple access provider.
# With this test we also verify that AdditionalSssdConfigs is working properly.
updated_config_file = pcluster_config_reader(
config_file="pcluster.config.update2.yaml", benchmarks=benchmarks, **config_params
)
updated_config_file = pcluster_config_reader(config_file="pcluster.config.update2.yaml", **config_params)
cluster.update(str(updated_config_file), force_update="true")
# Reset stateful connection variables after the cluster update
remote_command_executor = RemoteCommandExecutor(cluster)
Expand All @@ -794,7 +780,6 @@ def test_ad_integration(
_check_ssh_auth(user=user, expect_success=user.alias != "PclusterUser0")

run_system_analyzer(cluster, scheduler_commands_factory, request)
run_benchmarks(users[0].remote_command_executor(), users[0].scheduler_commands(), diretory_type=directory_type)


def _check_ssh_auth(user, expect_success=True):
Expand Down Expand Up @@ -837,12 +822,6 @@ def test_ad_integration_on_login_nodes(
2. SSH key for AD users is created when the property GenerateSshKeysForUsers is true;
3. AD users can submit workloads;
"""
head_node_instance_type = "c5n.18xlarge" if request.config.getoption("benchmarks") else "c5.xlarge"
compute_instance_type_info = {"name": "c5.xlarge", "num_cores": 4}
config_params = {
"compute_instance_type": compute_instance_type_info.get("name"),
"head_node_instance_type": head_node_instance_type,
}
directory_stack_name, nlb_stack_name = directory_factory(
request.config.getoption("directory_stack_name"),
request.config.getoption("ldaps_nlb_stack_name"),
Expand All @@ -857,16 +836,14 @@ def test_ad_integration_on_login_nodes(
)
nlb_stack_parameters = get_infra_stack_parameters(nlb_stack_name)
ldap_tls_ca_cert = "/opt/parallelcluster/shared_login_nodes/directory_service/certificate.crt"
config_params.update(
get_ad_config_param_vals(
directory_stack_outputs,
nlb_stack_parameters,
password_secret_arn,
ldap_tls_ca_cert,
directory_type,
directory_protocol,
directory_certificate_verification,
)
config_params = get_ad_config_param_vals(
directory_stack_outputs,
nlb_stack_parameters,
password_secret_arn,
ldap_tls_ca_cert,
directory_type,
directory_protocol,
directory_certificate_verification,
)
cluster_config = pcluster_config_reader(**config_params)
cluster = clusters_factory(cluster_config)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Image:
Os: {{ os }}
HeadNode:
InstanceType: {{ head_node_instance_type }}
InstanceType: {{ instance }}
Networking:
SubnetId: {{ public_subnet_id }}
Ssh:
Expand All @@ -15,7 +15,7 @@ Scheduling:
ComputeResources:
- Name: cit
Instances:
- InstanceType: {{ compute_instance_type }}
- InstanceType: {{ instance }}
MinCount: 2
MaxCount: 150
Networking:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Image:
Os: {{ os }}
HeadNode:
InstanceType: {{ head_node_instance_type }}
InstanceType: {{ instance }}
Networking:
SubnetId: {{ public_subnet_id }}
Ssh:
Expand All @@ -15,7 +15,7 @@ Scheduling:
ComputeResources:
- Name: cit
Instances:
- InstanceType: {{ compute_instance_type }}
- InstanceType: {{ instance }}
MinCount: 2
MaxCount: 150
Networking:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Image:
Os: {{ os }}
HeadNode:
InstanceType: {{ head_node_instance_type }}
InstanceType: {{ instance }}
Networking:
SubnetId: {{ public_subnet_id }}
Ssh:
Expand All @@ -15,7 +15,7 @@ Scheduling:
ComputeResources:
- Name: cit
Instances:
- InstanceType: {{ compute_instance_type }}
- InstanceType: {{ instance }}
MinCount: 2
MaxCount: 150
Networking:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,3 @@ for fspath in shared efs; do
# srun has to be used for whoami because slurm_nss plugin only send user information through srun
date '+%Y%m%d%H%M%S' > "/$fspath/$(srun whoami)"
done

BENCHMARK_NAME=osu_barrier
OSU_BENCHMARK_VERSION=5.7.1

module load openmpi
# Run collective benchmark. The collective operations are close to what a real application looks like.
# NOTE: The test is sized for 4 compute nodes.
# -np total number of processes to run (all CPUs * 4 nodes)
mpirun \
> /shared/"$(date '+%Y%m%d%H%M%S')-$(srun whoami)-${BENCHMARK_NAME}".out \
/shared/openmpi/osu-micro-benchmarks-${OSU_BENCHMARK_VERSION}/mpi/collective/${BENCHMARK_NAME}
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ LoginNodes:
Ssh:
KeyName: {{ key_name }}
HeadNode:
InstanceType: {{ head_node_instance_type }}
InstanceType: {{ instance }}
Networking:
SubnetId: {{ public_subnet_id }}
Ssh:
Expand All @@ -25,7 +25,7 @@ Scheduling:
ComputeResources:
- Name: cit
Instances:
- InstanceType: {{ compute_instance_type }}
- InstanceType: {{ instance }}
MinCount: 2
MaxCount: 150
Networking:
Expand Down
Loading
Loading