Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[v1.12.x] .ci/aws: Switch CI to persistent clusters with containers #690

Merged
merged 1 commit into from
Nov 4, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
174 changes: 75 additions & 99 deletions .ci/aws/Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,6 @@ def buildNumber = env.BUILD_NUMBER as int
if (buildNumber > 1) milestone(buildNumber - 1)
milestone(buildNumber)


import groovy.transform.Field
@Field boolean build_ok = true

def get_portafiducia_download_path() {
/* Stable Portafiducia tarball */
def AWS_ACCOUNT_ID = sh (
Expand Down Expand Up @@ -47,88 +43,48 @@ def install_porta_fiducia() {
'''
}

def kill_all_clusters(instance_type, region) {
    /*
     * Invoke PortaFiducia's delete_manual_cluster.py with a wildcard cluster
     * name built from the instance type, scoped to the given region.
     * param@ instance_type: the instance type, e.g. "p3dn.24xlarge".
     * param@ region: the aws region passed to the delete script via --region.
     * The trailing "|| true" makes the step succeed even if the script fails.
     */

    // Strip every "." and the trailing newline from the instance type in the
    // shell, so the captured stdout can be embedded directly in the pattern.
    def cluster_pattern = sh(
        returnStdout: true,
        script: "echo ${instance_type} | tr -d '.\\n'"
    )

    def delete_cmd = ". venv/bin/activate; ./PortaFiducia/scripts/delete_manual_cluster.py --cluster-name \'*${cluster_pattern}*\' --region ${region} || true"
    sh delete_cmd
}

def wait_for_odcr_capacity(region, instance_count, odcr) {
sh ". venv/bin/activate; ./PortaFiducia/scripts/wait_for_odcr_capacity.py --region ${region} --odcr-id ${odcr} --required-capacity ${instance_count}"
def get_persistent_cluster_name(build_tag, os, instance_type) {
def instance_type_prefix = instance_type.split("\\.")[0]
return "PluginPRCI_PersistentManualCluster_${instance_type_prefix}"
}

def run_test_orchestrator_once(run_name, build_tag, os, instance_type, instance_count, region, odcr, addl_args) {
/*
* Run PortaFiducia/tests/test_orchestrator.py with given command line arguments
*/

def run_test_orchestrator_once_persistent(run_name, build_tag, os, instance_type, instance_count, region, odcr, addl_args) {
/*
* This is a temporary workaround to deal with clusters not getting cleaned up
* Attempt to cleanup all instances types in a region when you get the lock.
* This is required b/c milestones send multiple SIG_TERM, followed by a SIG_KILL after 20s.
* This stops us from being able to add additional capacity to the Jenkins service.
* Run PortaFiducia/tests/test_orchestrator.py with given command line arguments on an already existing persistent cluster
*/
kill_all_clusters(instance_type, region)
wait_for_odcr_capacity(region, instance_count, odcr)

/*
* p3dn clusters are getting ICE'ed within an ODCR, when we try to launch them back to back.
* This is a non-deterministic work around to help us increase our chances of not getting ICE'ed.
* Worst case, this increases our time to publish results on PR's by 15 minutes.
*/
if (instance_type == "p3dn.24xlarge") {
sh "sleep 150"
}
def buildNumber = env.BUILD_NUMBER as int
def cluster_name = get_persistent_cluster_name(build_tag, os, instance_type)
def args = "--os ${os} --odcr ${odcr} --instance-type ${instance_type} --instance-count ${instance_count} --region ${region} --cluster-name ${cluster_name} ${addl_args} --junit-xml outputs/${cluster_name}-PR${buildNumber}.xml"

def cluster_name = get_cluster_name(build_tag, os, instance_type)
def args = "--os ${os} --odcr ${odcr} --instance-type ${instance_type} --instance-count ${instance_count} --region ${region} --cluster-name ${cluster_name} ${addl_args} --junit-xml outputs/${cluster_name}.xml"
def ret = sh (
script: ". venv/bin/activate; ./PortaFiducia/tests/test_orchestrator.py ${args}",
returnStatus: true
)
if (ret == 65)
unstable('Scripts exited with status 65')
else if (ret != 0)
build_ok = false
catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') {
sh "exit ${ret}"
try {
sh ". venv/bin/activate; ./PortaFiducia/tests/test_orchestrator.py ${args}"
} catch (Exception e) {
currentBuild.result = "FAILURE"
throw e
}
}

def get_random_string(len) {
    /*
     * Return the stdout of a shell pipeline that reads /dev/urandom, drops
     * every byte outside A-Za-z0-9, and stops after `len` bytes.
     * param@ len: the number of characters to keep (passed to "head -c").
     * return@: the captured stdout of the pipeline.
     */
    return sh(
        returnStdout: true,
        script: "cat /dev/urandom | LC_ALL=C tr -dc A-Za-z0-9 | head -c ${len}"
    )
}

def get_cluster_name(build_tag, os, instance_type) {
def run_test_orchestrator_once_container(run_name, build_tag, os, container_os, instance_type, instance_count, region, odcr, addl_args) {
/*
* Compose the cluster name. Pcluster requires a cluster name under 60 characters.
* cluster name cannot have ".".
* Jenkins does not allow groovy to use the replace() method
* of string. Therefore we used shell command sed to replace "." with ""
* Run PortaFiducia/tests/test_orchestrator.py with given command line arguments on an already existing persistent cluster on a container with specified OS
*/
build_tag = sh(
script: "echo ${build_tag} | sed \"s/^jenkins-//g\" | sed \"s/ //g\"",
returnStdout: true
)

def cluster_name = sh(
script: "echo '${build_tag.take(28)}-${os.take(10)}-${instance_type}-'${get_random_string(8)} | tr -d '.\\n'",
returnStdout: true
)
def buildNumber = env.BUILD_NUMBER as int
def cluster_name = get_persistent_cluster_name(build_tag, os, instance_type)
def args = "--os ${os} --container-os ${container_os} --odcr ${odcr} --instance-type ${instance_type} --instance-count ${instance_count} --region ${region} --cluster-name ${cluster_name} ${addl_args} --junit-xml outputs/${cluster_name}-PR${buildNumber}-${os}-${container_os}.xml"

return cluster_name
try {
sh ". venv/bin/activate; ./PortaFiducia/tests/test_orchestrator.py ${args}"
} catch (Exception e) {
currentBuild.result = "FAILURE"
throw e
}
}


def get_test_stage_with_lock(stage_name, build_tag, os, instance_type, region, lock_label, lock_count, odcr, addl_args) {
def get_test_stage_with_lock_persistent(stage_name, build_tag, os, instance_type, region, lock_label, lock_count, odcr, addl_args) {
/*
* Generate a single test stage that run test_orchestrator.py with the given parameters.
* Generate a single test stage that runs test_orchestrator.py with the given parameters on an already existing persistent cluster.
* The job will queue until it acquires the given number of locks. The locks will be released
* after the job finishes.
* param@ stage_name: the name of the stage
Expand All @@ -145,12 +101,37 @@ def get_test_stage_with_lock(stage_name, build_tag, os, instance_type, region, l
return {
stage("${stage_name}") {
lock(label: lock_label, quantity: lock_count) {
this.run_test_orchestrator_once(stage_name, build_tag, os, instance_type, lock_count, region, odcr, addl_args)
this.run_test_orchestrator_once_persistent(stage_name, build_tag, os, instance_type, lock_count, region, odcr, addl_args)
}
}
}
}

def get_test_stage_with_lock_container(stage_name, build_tag, os, container_os, instance_type, region, lock_label, lock_count, odcr, addl_args) {
    /*
     * Build (without executing) a closure that defines one test stage. When the
     * closure is run, it opens a stage named after stage_name, acquires
     * lock_count lockable resources carrying lock_label, and then calls
     * run_test_orchestrator_once_container with the given parameters. The lock
     * is scoped to that call.
     * param@ stage_name: the name of the stage; also passed on as the run name.
     * param@ build_tag: the BUILD_TAG env generated by Jenkins.
     * param@ os: the operating system on the instance for the test stage.
     * param@ container_os: the operating system on the container for the test stage.
     * param@ instance_type: the instance type for the test stage.
     * param@ region: the (default) aws region where the tests are run.
     * param@ lock_label: str, the label of the lockable resources.
     * param@ lock_count: int, the quantity of the lockable resources; the same
     *                    value is forwarded as the instance count.
     * param@ odcr: the on demand capacity reservation ID to create instances in.
     * param@ addl_args: additional arguments passed to test_orchestrator.py.
     * return@: the closure that runs the test stage.
     */
    def locked_stage = {
        stage("${stage_name}") {
            lock(label: lock_label, quantity: lock_count) {
                this.run_test_orchestrator_once_container(stage_name, build_tag, os, container_os, instance_type, lock_count, region, odcr, addl_args)
            }
        }
    }
    return locked_stage
}

pipeline {
agent {
Expand Down Expand Up @@ -206,26 +187,33 @@ pipeline {
def test_type = "--test-type pr"
def build_type = "--aws-ofi-nccl-build-type debug"
def pr_num = "--test-aws-ofi-nccl-pr $env.CHANGE_ID"
def test_list = "--test-list test_nccl_test test_ofi_nccl_functional"
def nccl_test_iter = "--test-aws-ofi-nccl-nccltest-iterations 5"
def efa_installer = "--use-prebuilt-ami-with-efa-installer true"
def base_args = "${efa_installer} ${nccl_version} ${timeout} ${cluster_type} ${test_target} ${test_type} ${build_type} ${pr_num} ${test_list} ${nccl_test_iter}"

def persistent_manual_cluster_addl_args = " --keep-cluster --skip-fixture-setup --skip-health-checks --use-existing-installer --cleanup-pf-directory"
def container_addl_args = " --test-in-containers-on-ec2"

def base_args = "${efa_installer} ${nccl_version} ${timeout} ${cluster_type} ${test_target} ${test_type} ${build_type} ${pr_num} ${nccl_test_iter} ${persistent_manual_cluster_addl_args}"

def num_instances = 4
def p3dn_lock_label = "p3dn-1-4node"
def p3dn_region = "ap-northeast-1"
def p3dn_odcr = "cr-08ecd03c0644442e4"
def p3dn_addl_args = "${base_args} --odcr-placement-group-name p3dn-placement-group"
def p3dn_addl_args = "${base_args} --odcr-placement-group-name p3dn-placement-group ${container_addl_args} --test-list test_nccl_test test_ofi_nccl_functional"
def p4d_lock_label = "p4d-1-4node"
def p4d_region = "us-east-2"
def p4d_odcr = "cr-0e5eebb3c896f6af0"
def p4_addl_args = "${base_args} ${container_addl_args} --test-list test_nccl_test test_ofi_nccl_functional"
def p5_lock_label = "p5-1-4node"
def p5_region = "ap-southeast-3"
def p5_odcr = "cr-091dbf6e0516dbba1"
def p5_addl_args = "${base_args} ${container_addl_args} --test-list test_nccl_test"
def p3_p4_p5_base_os = "alinux2"

def g4dn_lock_label = "g4dn-1-4node"
def g4dn_region = "us-west-2"
def g4dn_odcr = "cr-0e2f9cac30bb5ad5f"
def g4dn_addl_args = "${base_args} --odcr-placement-group-name g4dn-placement-group"
def g4dn_addl_args = "${base_args} --odcr-placement-group-name g4dn-placement-group --test-list test_nccl_test test_ofi_nccl_functional"
def trn1_lock_label = "trn1-1-4node"
def trn1_region = "us-east-2"
def trn1_odcr = "cr-0e9366fb7fa2772f1"
Expand All @@ -236,45 +224,33 @@ pipeline {
def trn1n_addl_args = "${base_args} --odcr-placement-group-name trn1n-placement-group --test-list test_nccom_test"

// p3dn tests
stages["4_p3dn_al2"] = get_test_stage_with_lock("4_p3dn_al2", env.BUILD_TAG, "alinux2", "p3dn.24xlarge", p3dn_region, p3dn_lock_label, num_instances, p3dn_odcr, p3dn_addl_args)
stages["4_p3dn_ubuntu2004"] = get_test_stage_with_lock("4_p3dn_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "p3dn.24xlarge", p3dn_region, p3dn_lock_label, num_instances, p3dn_odcr, p3dn_addl_args)
stages["4_p3dn_ubuntu2204"] = get_test_stage_with_lock("4_p3dn_ubuntu2204", env.BUILD_TAG, "ubuntu2204", "p3dn.24xlarge", p3dn_region, p3dn_lock_label, num_instances, p3dn_odcr, p3dn_addl_args)
stages["4_p3dn_al2"] = get_test_stage_with_lock_container("4_p3dn_al2", env.BUILD_TAG, p3_p4_p5_base_os, "alinux2", "p3dn.24xlarge", p3dn_region, p3dn_lock_label, num_instances, p3dn_odcr, p3dn_addl_args)
stages["4_p3dn_ubuntu2004"] = get_test_stage_with_lock_container("4_p3dn_ubuntu2004", env.BUILD_TAG, p3_p4_p5_base_os, "ubuntu2004", "p3dn.24xlarge", p3dn_region, p3dn_lock_label, num_instances, p3dn_odcr, p3dn_addl_args)
stages["4_p3dn_ubuntu2204"] = get_test_stage_with_lock_container("4_p3dn_ubuntu2204", env.BUILD_TAG, p3_p4_p5_base_os, "ubuntu2204", "p3dn.24xlarge", p3dn_region, p3dn_lock_label, num_instances, p3dn_odcr, p3dn_addl_args)

// p4d tests
stages["4_p4d_alinux2"] = get_test_stage_with_lock("4_p4d_alinux2", env.BUILD_TAG, "alinux2", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, p4d_odcr, base_args)
stages["4_p4d_ubuntu2004"] = get_test_stage_with_lock("4_p4d_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, p4d_odcr, base_args)
stages["4_p4d_ubuntu2204"] = get_test_stage_with_lock("4_p4d_ubuntu2204", env.BUILD_TAG, "ubuntu2204", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, p4d_odcr, base_args)
stages["4_p4d_alinux2"] = get_test_stage_with_lock_container("4_p4d_alinux2", env.BUILD_TAG, p3_p4_p5_base_os, "alinux2", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, p4d_odcr, p4_addl_args)
stages["4_p4d_ubuntu2004"] = get_test_stage_with_lock_container("4_p4d_ubuntu2004", env.BUILD_TAG, p3_p4_p5_base_os, "ubuntu2004", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, p4d_odcr, p4_addl_args)
stages["4_p4d_ubuntu2204"] = get_test_stage_with_lock_container("4_p4d_ubuntu2204", env.BUILD_TAG, p3_p4_p5_base_os, "ubuntu2204", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, p4d_odcr, p4_addl_args)

// p5 tests
stages["4_p5_alinux2"] = get_test_stage_with_lock("4_p5_alinux2", env.BUILD_TAG, "alinux2", "p5.48xlarge", p5_region, p5_lock_label, num_instances, p5_odcr, base_args)
stages["4_p5_ubuntu2004"] = get_test_stage_with_lock("4_p5_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "p5.48xlarge", p5_region, p5_lock_label, num_instances, p5_odcr, base_args)
stages["4_p5_ubuntu2204"] = get_test_stage_with_lock("4_p5_ubuntu2204", env.BUILD_TAG, "ubuntu2204", "p5.48xlarge", p5_region, p5_lock_label, num_instances, p5_odcr, base_args)
stages["4_p5_alinux2"] = get_test_stage_with_lock_container("4_p5_alinux2", env.BUILD_TAG, p3_p4_p5_base_os, "alinux2", "p5.48xlarge", p5_region, p5_lock_label, num_instances, p5_odcr, p5_addl_args)
stages["4_p5_ubuntu2004"] = get_test_stage_with_lock_container("4_p5_ubuntu2004", env.BUILD_TAG, p3_p4_p5_base_os, "ubuntu2004", "p5.48xlarge", p5_region, p5_lock_label, num_instances, p5_odcr, p5_addl_args)
stages["4_p5_ubuntu2204"] = get_test_stage_with_lock_container("4_p5_ubuntu2204", env.BUILD_TAG, p3_p4_p5_base_os, "ubuntu2204", "p5.48xlarge", p5_region, p5_lock_label, num_instances, p5_odcr, p5_addl_args)

// g4dn tests
stages["4_g4dn_ubuntu2204"] = get_test_stage_with_lock("4_g4dn_ubuntu2204", env.BUILD_TAG, "ubuntu2204", "g4dn.12xlarge", g4dn_region, g4dn_lock_label, num_instances, g4dn_odcr, g4dn_addl_args)
stages["4_g4dn_ubuntu2204"] = get_test_stage_with_lock_persistent("4_g4dn_ubuntu2204", env.BUILD_TAG, "ubuntu2204", "g4dn.12xlarge", g4dn_region, g4dn_lock_label, num_instances, g4dn_odcr, g4dn_addl_args)

// trn1 tests
stages["4_trn1_ubuntu2004"] = get_test_stage_with_lock("4_trn1_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "trn1.32xlarge", trn1_region, trn1_lock_label, num_instances, trn1_odcr, trn1_addl_args)
stages["4_trn1_ubuntu2004"] = get_test_stage_with_lock_persistent("4_trn1_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "trn1.32xlarge", trn1_region, trn1_lock_label, num_instances, trn1_odcr, trn1_addl_args)

// trn1n tests
stages["4_trn1n_ubuntu2004"] = get_test_stage_with_lock("4_trn1n_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "trn1n.32xlarge", trn1n_region, trn1n_lock_label, num_instances, trn1n_odcr, trn1n_addl_args)
stages["4_trn1n_ubuntu2004"] = get_test_stage_with_lock_persistent("4_trn1n_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "trn1n.32xlarge", trn1n_region, trn1n_lock_label, num_instances, trn1n_odcr, trn1n_addl_args)

parallel stages
}
}
}
stage('check build_ok') {
steps {
script {
if (build_ok) {
currentBuild.result = "SUCCESS"
}
else {
currentBuild.result = "FAILURE"
}
}
}
}
}
post {
always {
Expand Down
Loading