From 64264a9377ad5e05256dfd115997420d23ac36c7 Mon Sep 17 00:00:00 2001 From: Sai Sunku Date: Thu, 17 Oct 2024 16:25:07 -0400 Subject: [PATCH] .ci/aws: Switch CI to persistent clusters with containers Signed-off-by: Sai Sunku (cherry picked from commit 8a0686f79a7c81cfbcc84d5ab0b3415164735843) --- .ci/aws/Jenkinsfile | 174 +++++++++++++++++++------------------------- 1 file changed, 75 insertions(+), 99 deletions(-) diff --git a/.ci/aws/Jenkinsfile b/.ci/aws/Jenkinsfile index 8175da33f..2a8cf38b9 100644 --- a/.ci/aws/Jenkinsfile +++ b/.ci/aws/Jenkinsfile @@ -5,10 +5,6 @@ def buildNumber = env.BUILD_NUMBER as int if (buildNumber > 1) milestone(buildNumber - 1) milestone(buildNumber) - -import groovy.transform.Field -@Field boolean build_ok = true - def get_portafiducia_download_path() { /* Stable Portafiducia tarball */ def AWS_ACCOUNT_ID = sh ( @@ -47,88 +43,48 @@ def install_porta_fiducia() { ''' } -def kill_all_clusters(instance_type, region) { - def instance_type_without_period = sh( - script: "echo ${instance_type} | tr -d '.\\n'", - returnStdout: true - ) - sh ". venv/bin/activate; ./PortaFiducia/scripts/delete_manual_cluster.py --cluster-name \'*${instance_type_without_period}*\' --region ${region} || true" -} - -def wait_for_odcr_capacity(region, instance_count, odcr) { - sh ". venv/bin/activate; ./PortaFiducia/scripts/wait_for_odcr_capacity.py --region ${region} --odcr-id ${odcr} --required-capacity ${instance_count}" +def get_persistent_cluster_name(build_tag, os, instance_type) { + def instance_type_prefix = instance_type.split("\\.")[0] + return "PluginPRCI_PersistentManualCluster_${instance_type_prefix}" } -def run_test_orchestrator_once(run_name, build_tag, os, instance_type, instance_count, region, odcr, addl_args) { - /* - * Run PortaFiducia/tests/test_orchestrator.py with given command line arguments - */ - +def run_test_orchestrator_once_persistent(run_name, build_tag, os, instance_type, instance_count, region, odcr, addl_args) { /* - * This is a temporary workaround to deal with clusters not getting cleaned up - * Attempt to cleanup all instances types in a region when you get the lock. - * This is required b/c milestones send multiple SIG_TERM, followed by a SIG_KILL after 20s. - * This stops us from being able to add additional capacity to the Jenkins service. + * Run PortaFiducia/tests/test_orchestrator.py with given command line arguments on an already existing persistent cluster */ - kill_all_clusters(instance_type, region) - wait_for_odcr_capacity(region, instance_count, odcr) - /* - * p3dn clusters are getting ICE'ed within an ODCR, when we try to launch them back to back. - * This is a non-deterministic work around to help us increase our chances of not getting ICE'ed. - * Worst case, this increases our time to publish results on PR's by 15 minutes. - */ - if (instance_type == "p3dn.24xlarge") { - sh "sleep 150" - } + def buildNumber = env.BUILD_NUMBER as int + def cluster_name = get_persistent_cluster_name(build_tag, os, instance_type) + def args = "--os ${os} --odcr ${odcr} --instance-type ${instance_type} --instance-count ${instance_count} --region ${region} --cluster-name ${cluster_name} ${addl_args} --junit-xml outputs/${cluster_name}-PR${buildNumber}.xml" - def cluster_name = get_cluster_name(build_tag, os, instance_type) - def args = "--os ${os} --odcr ${odcr} --instance-type ${instance_type} --instance-count ${instance_count} --region ${region} --cluster-name ${cluster_name} ${addl_args} --junit-xml outputs/${cluster_name}.xml" - def ret = sh ( - script: ". venv/bin/activate; ./PortaFiducia/tests/test_orchestrator.py ${args}", - returnStatus: true - ) - if (ret == 65) - unstable('Scripts exited with status 65') - else if (ret != 0) - build_ok = false - catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') { - sh "exit ${ret}" + try { + sh ". venv/bin/activate; ./PortaFiducia/tests/test_orchestrator.py ${args}" + } catch (Exception e) { + currentBuild.result = "FAILURE" + throw e } } -def get_random_string(len) { - def s = sh ( - script: "cat /dev/urandom | LC_ALL=C tr -dc A-Za-z0-9 | head -c ${len}", - returnStdout: true - ) - return s -} - -def get_cluster_name(build_tag, os, instance_type) { +def run_test_orchestrator_once_container(run_name, build_tag, os, container_os, instance_type, instance_count, region, odcr, addl_args) { /* - * Compose the cluster name. Pcluster requires a cluster name under 60 characters. - * cluster name cannot have ".". - * Jenkins does not allow groovy to use the replace() method - * of string. Therefore we used shell command sed to replace "." with "" + * Run PortaFiducia/tests/test_orchestrator.py with given command line arguments on an already existing persistent cluster on a container with specified OS */ - build_tag = sh( - script: "echo ${build_tag} | sed \"s/^jenkins-//g\" | sed \"s/ //g\"", - returnStdout: true - ) - def cluster_name = sh( - script: "echo '${build_tag.take(28)}-${os.take(10)}-${instance_type}-'${get_random_string(8)} | tr -d '.\\n'", - returnStdout: true - ) + def buildNumber = env.BUILD_NUMBER as int + def cluster_name = get_persistent_cluster_name(build_tag, os, instance_type) + def args = "--os ${os} --container-os ${container_os} --odcr ${odcr} --instance-type ${instance_type} --instance-count ${instance_count} --region ${region} --cluster-name ${cluster_name} ${addl_args} --junit-xml outputs/${cluster_name}-PR${buildNumber}-${os}-${container_os}.xml" - return cluster_name + try { + sh ". venv/bin/activate; ./PortaFiducia/tests/test_orchestrator.py ${args}" + } catch (Exception e) { + currentBuild.result = "FAILURE" + throw e + } } - -def get_test_stage_with_lock(stage_name, build_tag, os, instance_type, region, lock_label, lock_count, odcr, addl_args) { +def get_test_stage_with_lock_persistent(stage_name, build_tag, os, instance_type, region, lock_label, lock_count, odcr, addl_args) { /* - * Generate a single test stage that run test_orchestrator.py with the given parameters. + * Generate a single test stage that run test_orchestrator.py with the given parameters on an already existing persistent cluster. * The job will queue until it acquires the given number of locks. The locks will be released * after the job finishes. * param@ stage_name: the name of the stage @@ -145,12 +101,37 @@ def get_test_stage_with_lock(stage_name, build_tag, os, instance_type, region, l return { stage("${stage_name}") { lock(label: lock_label, quantity: lock_count) { - this.run_test_orchestrator_once(stage_name, build_tag, os, instance_type, lock_count, region, odcr, addl_args) + this.run_test_orchestrator_once_persistent(stage_name, build_tag, os, instance_type, lock_count, region, odcr, addl_args) } } } } +def get_test_stage_with_lock_container(stage_name, build_tag, os, container_os, instance_type, region, lock_label, lock_count, odcr, addl_args) { + /* + * Generate a single test stage that run test_orchestrator.py with the given parameters on an already existing persistent cluster on a container of specified OS. + * The job will queue until it acquires the given number of locks. The locks will be released + * after the job finishes. + * param@ stage_name: the name of the stage + * param@ build_tag: the BUILD_TAG env generated by Jenkins + * param@ os: the operating system on the instance for the test stage. + * param@ container_os: the operating system on the container for the test stage. + * param@ instance_type: the instance type for the test stage. + * param@ region: the (default) aws region where the tests are run. + * param@ lock_label: str, the label of the lockable resources. + * param@ lock_count: int, the quantity of the lockable resources. + * param@ odcr: The on demand capacity reservation ID to create instances in + * param@ addl_args: additional arguments passed to test_orchestrator.py + * return@: the test stage. + */ + return { + stage("${stage_name}") { + lock(label: lock_label, quantity: lock_count) { + this.run_test_orchestrator_once_container(stage_name, build_tag, os, container_os, instance_type, lock_count, region, odcr, addl_args) + } + } + } +} pipeline { agent { @@ -206,26 +187,33 @@ pipeline { def test_type = "--test-type pr" def build_type = "--aws-ofi-nccl-build-type debug" def pr_num = "--test-aws-ofi-nccl-pr $env.CHANGE_ID" - def test_list = "--test-list test_nccl_test test_ofi_nccl_functional" def nccl_test_iter = "--test-aws-ofi-nccl-nccltest-iterations 5" def efa_installer = "--use-prebuilt-ami-with-efa-installer true" - def base_args = "${efa_installer} ${nccl_version} ${timeout} ${cluster_type} ${test_target} ${test_type} ${build_type} ${pr_num} ${test_list} ${nccl_test_iter}" + + def persistent_manual_cluster_addl_args = " --keep-cluster --skip-fixture-setup --skip-health-checks --use-existing-installer --cleanup-pf-directory" + def container_addl_args = " --test-in-containers-on-ec2" + + def base_args = "${efa_installer} ${nccl_version} ${timeout} ${cluster_type} ${test_target} ${test_type} ${build_type} ${pr_num} ${nccl_test_iter} ${persistent_manual_cluster_addl_args}" def num_instances = 4 def p3dn_lock_label = "p3dn-1-4node" def p3dn_region = "ap-northeast-1" def p3dn_odcr = "cr-08ecd03c0644442e4" - def p3dn_addl_args = "${base_args} --odcr-placement-group-name p3dn-placement-group" + def p3dn_addl_args = "${base_args} --odcr-placement-group-name p3dn-placement-group ${container_addl_args} --test-list test_nccl_test test_ofi_nccl_functional" def p4d_lock_label = "p4d-1-4node" def p4d_region = "us-east-2" def p4d_odcr = "cr-0e5eebb3c896f6af0" + def p4_addl_args = "${base_args} ${container_addl_args} --test-list test_nccl_test test_ofi_nccl_functional" def p5_lock_label = "p5-1-4node" def p5_region = "ap-southeast-3" def p5_odcr = "cr-091dbf6e0516dbba1" + def p5_addl_args = "${base_args} ${container_addl_args} --test-list test_nccl_test" + def p3_p4_p5_base_os = "alinux2" + def g4dn_lock_label = "g4dn-1-4node" def g4dn_region = "us-west-2" def g4dn_odcr = "cr-0e2f9cac30bb5ad5f" - def g4dn_addl_args = "${base_args} --odcr-placement-group-name g4dn-placement-group" + def g4dn_addl_args = "${base_args} --odcr-placement-group-name g4dn-placement-group --test-list test_nccl_test test_ofi_nccl_functional" def trn1_lock_label = "trn1-1-4node" def trn1_region = "us-east-2" def trn1_odcr = "cr-0e9366fb7fa2772f1" @@ -236,45 +224,33 @@ pipeline { def trn1n_addl_args = "${base_args} --odcr-placement-group-name trn1n-placement-group --test-list test_nccom_test" // p3dn tests - stages["4_p3dn_al2"] = get_test_stage_with_lock("4_p3dn_al2", env.BUILD_TAG, "alinux2", "p3dn.24xlarge", p3dn_region, p3dn_lock_label, num_instances, p3dn_odcr, p3dn_addl_args) - stages["4_p3dn_ubuntu2004"] = get_test_stage_with_lock("4_p3dn_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "p3dn.24xlarge", p3dn_region, p3dn_lock_label, num_instances, p3dn_odcr, p3dn_addl_args) - stages["4_p3dn_ubuntu2204"] = get_test_stage_with_lock("4_p3dn_ubuntu2204", env.BUILD_TAG, "ubuntu2204", "p3dn.24xlarge", p3dn_region, p3dn_lock_label, num_instances, p3dn_odcr, p3dn_addl_args) + stages["4_p3dn_al2"] = get_test_stage_with_lock_container("4_p3dn_al2", env.BUILD_TAG, p3_p4_p5_base_os, "alinux2", "p3dn.24xlarge", p3dn_region, p3dn_lock_label, num_instances, p3dn_odcr, p3dn_addl_args) + stages["4_p3dn_ubuntu2004"] = get_test_stage_with_lock_container("4_p3dn_ubuntu2004", env.BUILD_TAG, p3_p4_p5_base_os, "ubuntu2004", "p3dn.24xlarge", p3dn_region, p3dn_lock_label, num_instances, p3dn_odcr, p3dn_addl_args) + stages["4_p3dn_ubuntu2204"] = get_test_stage_with_lock_container("4_p3dn_ubuntu2204", env.BUILD_TAG, p3_p4_p5_base_os, "ubuntu2204", "p3dn.24xlarge", p3dn_region, p3dn_lock_label, num_instances, p3dn_odcr, p3dn_addl_args) // p4d tests - stages["4_p4d_alinux2"] = get_test_stage_with_lock("4_p4d_alinux2", env.BUILD_TAG, "alinux2", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, p4d_odcr, base_args) - stages["4_p4d_ubuntu2004"] = get_test_stage_with_lock("4_p4d_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, p4d_odcr, base_args) - stages["4_p4d_ubuntu2204"] = get_test_stage_with_lock("4_p4d_ubuntu2204", env.BUILD_TAG, "ubuntu2204", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, p4d_odcr, base_args) + stages["4_p4d_alinux2"] = get_test_stage_with_lock_container("4_p4d_alinux2", env.BUILD_TAG, p3_p4_p5_base_os, "alinux2", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, p4d_odcr, p4_addl_args) + stages["4_p4d_ubuntu2004"] = get_test_stage_with_lock_container("4_p4d_ubuntu2004", env.BUILD_TAG, p3_p4_p5_base_os, "ubuntu2004", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, p4d_odcr, p4_addl_args) + stages["4_p4d_ubuntu2204"] = get_test_stage_with_lock_container("4_p4d_ubuntu2204", env.BUILD_TAG, p3_p4_p5_base_os, "ubuntu2204", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, p4d_odcr, p4_addl_args) // p5 tests - stages["4_p5_alinux2"] = get_test_stage_with_lock("4_p5_alinux2", env.BUILD_TAG, "alinux2", "p5.48xlarge", p5_region, p5_lock_label, num_instances, p5_odcr, base_args) - stages["4_p5_ubuntu2004"] = get_test_stage_with_lock("4_p5_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "p5.48xlarge", p5_region, p5_lock_label, num_instances, p5_odcr, base_args) - stages["4_p5_ubuntu2204"] = get_test_stage_with_lock("4_p5_ubuntu2204", env.BUILD_TAG, "ubuntu2204", "p5.48xlarge", p5_region, p5_lock_label, num_instances, p5_odcr, base_args) + stages["4_p5_alinux2"] = get_test_stage_with_lock_container("4_p5_alinux2", env.BUILD_TAG, p3_p4_p5_base_os, "alinux2", "p5.48xlarge", p5_region, p5_lock_label, num_instances, p5_odcr, p5_addl_args) + stages["4_p5_ubuntu2004"] = get_test_stage_with_lock_container("4_p5_ubuntu2004", env.BUILD_TAG, p3_p4_p5_base_os, "ubuntu2004", "p5.48xlarge", p5_region, p5_lock_label, num_instances, p5_odcr, p5_addl_args) + stages["4_p5_ubuntu2204"] = get_test_stage_with_lock_container("4_p5_ubuntu2204", env.BUILD_TAG, p3_p4_p5_base_os, "ubuntu2204", "p5.48xlarge", p5_region, p5_lock_label, num_instances, p5_odcr, p5_addl_args) // g4dn tests - stages["4_g4dn_ubuntu2204"] = get_test_stage_with_lock("4_g4dn_ubuntu2204", env.BUILD_TAG, "ubuntu2204", "g4dn.12xlarge", g4dn_region, g4dn_lock_label, num_instances, g4dn_odcr, g4dn_addl_args) + stages["4_g4dn_ubuntu2204"] = get_test_stage_with_lock_persistent("4_g4dn_ubuntu2204", env.BUILD_TAG, "ubuntu2204", "g4dn.12xlarge", g4dn_region, g4dn_lock_label, num_instances, g4dn_odcr, g4dn_addl_args) // trn1 tests - stages["4_trn1_ubuntu2004"] = get_test_stage_with_lock("4_trn1_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "trn1.32xlarge", trn1_region, trn1_lock_label, num_instances, trn1_odcr, trn1_addl_args) + stages["4_trn1_ubuntu2004"] = get_test_stage_with_lock_persistent("4_trn1_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "trn1.32xlarge", trn1_region, trn1_lock_label, num_instances, trn1_odcr, trn1_addl_args) // trn1n tests - stages["4_trn1n_ubuntu2004"] = get_test_stage_with_lock("4_trn1n_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "trn1n.32xlarge", trn1n_region, trn1n_lock_label, num_instances, trn1n_odcr, trn1n_addl_args) + stages["4_trn1n_ubuntu2004"] = get_test_stage_with_lock_persistent("4_trn1n_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "trn1n.32xlarge", trn1n_region, trn1n_lock_label, num_instances, trn1n_odcr, trn1n_addl_args) parallel stages } } } - stage('check build_ok') { - steps { - script { - if (build_ok) { - currentBuild.result = "SUCCESS" - } - else { - currentBuild.result = "FAILURE" - } - } - } - } } post { always {