From 023c5b002fb99636d7a5e88914a9bc64b0909744 Mon Sep 17 00:00:00 2001 From: Param Bole Date: Wed, 13 Nov 2024 10:56:30 -0800 Subject: [PATCH 01/26] Adding single and multi-node AxLearn A3Plus Tests (#471) * Adding AxLearn A3Plus Tests --- dags/imagegen_devx/project_bite_gpu_e2e.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dags/imagegen_devx/project_bite_gpu_e2e.py b/dags/imagegen_devx/project_bite_gpu_e2e.py index 29b69df1..7ce82778 100644 --- a/dags/imagegen_devx/project_bite_gpu_e2e.py +++ b/dags/imagegen_devx/project_bite_gpu_e2e.py @@ -38,6 +38,7 @@ axlearn_test_configs = { # accelerator: list of slices to test "a3": [1], + "a3plus": [1, 2], } for accelerator, slices in axlearn_test_configs.items(): From fcbe0dd633bc5c0c915afdb8cc1f74632364094a Mon Sep 17 00:00:00 2001 From: Ran Ran Date: Wed, 13 Nov 2024 14:54:16 -0800 Subject: [PATCH 02/26] Add new owner to the repo (#472) --- .github/CODEOWNERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index c2f3d1bf..97dae494 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,5 +1,5 @@ # Default owners for everything in the repo, unless a later match takes precedence. -* @RissyRan @allenwang28 +* @mbzomowski @RissyRan @allenwang28 dags/solutions_team/configs/tensorflow @chandrasekhard2 @ZhaoyueCheng dags/solutions_team/solutionsteam_tf* @chandrasekhard2 @ZhaoyueCheng From 4820439287683d708028ad6229a1b51a7e4c030f Mon Sep 17 00:00:00 2001 From: Ran Ran Date: Wed, 13 Nov 2024 15:41:29 -0800 Subject: [PATCH 03/26] Rename team name folder (#473) --- .github/CODEOWNERS | 6 +++--- dags/quarantined_tests.py | 10 +++++----- .../configs/__init__.py | 0 .../configs/common.py | 0 .../configs/gke_config.py | 0 .../configs/project_bite_config.py | 4 ++-- .../jax_stable_stack_gpu_e2e.py | 9 +++++++-- .../jax_stable_stack_tpu_e2e.py | 6 ++++-- .../maxdiffusion_e2e.py | 4 ++-- .../project_bite_gpu_e2e.py | 10 ++++++++-- .../project_bite_tpu_e2e.py | 4 ++-- dags/test_owner.py | 5 ++--- 12 files changed, 35 insertions(+), 23 deletions(-) rename dags/{imagegen_devx => sparsity_diffusion_devx}/configs/__init__.py (100%) rename dags/{imagegen_devx => sparsity_diffusion_devx}/configs/common.py (100%) rename dags/{imagegen_devx => sparsity_diffusion_devx}/configs/gke_config.py (100%) rename dags/{imagegen_devx => sparsity_diffusion_devx}/configs/project_bite_config.py (95%) rename dags/{imagegen_devx => sparsity_diffusion_devx}/jax_stable_stack_gpu_e2e.py (93%) rename dags/{imagegen_devx => sparsity_diffusion_devx}/jax_stable_stack_tpu_e2e.py (97%) rename dags/{imagegen_devx => sparsity_diffusion_devx}/maxdiffusion_e2e.py (96%) rename dags/{imagegen_devx => sparsity_diffusion_devx}/project_bite_gpu_e2e.py (91%) rename dags/{imagegen_devx => sparsity_diffusion_devx}/project_bite_tpu_e2e.py (92%) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 97dae494..fd1f8567 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -11,8 +11,8 @@ dags/multipod @jonb377 @tonyjohnchen @raymondzouu @gobbleturk @shralex @RissyRan dags/mlcompass @ortibazar @sganeshb @brajiang @wlzhg -dags/imagegen_devx @RissyRan @parambole @jiangjy1982 @aireenmei @michelle-yooh -dags/imagegen_devx/project_bite* @RissyRan @parambole @jiangjy1982 @aireenmei @michelle-yooh @jiya-zhang -dags/imagegen_devx/configs/project_bite* @RissyRan @parambole @jiangjy1982 @aireenmei @michelle-yooh @jiya-zhang +dags/sparsity_diffusion_devx @RissyRan @parambole @jiangjy1982 @aireenmei @michelle-yooh 
+dags/sparsity_diffusion_devx/project_bite* @RissyRan @parambole @jiangjy1982 @aireenmei @michelle-yooh @jiya-zhang +dags/sparsity_diffusion_devx/configs/project_bite* @RissyRan @parambole @jiangjy1982 @aireenmei @michelle-yooh @jiya-zhang dags/inference @yeandy @vipannalla @morgandu @mailvijayasingh @sixiang-google @joezijunzhou @singh-mitali diff --git a/dags/quarantined_tests.py b/dags/quarantined_tests.py index 08854752..f98c06b9 100644 --- a/dags/quarantined_tests.py +++ b/dags/quarantined_tests.py @@ -193,21 +193,21 @@ class QuarantineTests: "chained_tests_llama2-70b_nightly": TestInfo(team.LLM_DEVX, "2024-11-12"), # DAG: jax_stable_stack_gpu_e2e "maxtext-stable-stack-train-c4-data-h100-80gb-8": TestInfo( - team.SPARCITY_DIFFUSION_DEVX, "2024-11-12" + team.SPARSITY_DIFFUSION_DEVX, "2024-11-12" ), "maxtext-stable-stack-train-c4-data-h100-mega-80gb-8": TestInfo( - team.SPARCITY_DIFFUSION_DEVX, "2024-11-12" + team.SPARSITY_DIFFUSION_DEVX, "2024-11-12" ), # DAG: jax_stable_tpu_stack_e2e "axlearn-jax-stable-stack-v4-16-1x-v4-16": TestInfo( - team.SPARCITY_DIFFUSION_DEVX, "2024-11-12" + team.SPARSITY_DIFFUSION_DEVX, "2024-11-12" ), "axlearn-jax-stable-stack-v4-16-2x-2xv4-16": TestInfo( - team.SPARCITY_DIFFUSION_DEVX, "2024-11-12" + team.SPARSITY_DIFFUSION_DEVX, "2024-11-12" ), # DAG: maxdiffusion_e2e "maxd-sdxl-nan-v6e-256-2x-2xv6e-256": TestInfo( - team.SPARCITY_DIFFUSION_DEVX, "2024-11-12" + team.SPARSITY_DIFFUSION_DEVX, "2024-11-12" ), # DAG: maxtext_configs_aot "maxtext-aot-v5e-stable-v4-8": TestInfo(team.PERFORMANCE, "2024-11-12"), diff --git a/dags/imagegen_devx/configs/__init__.py b/dags/sparsity_diffusion_devx/configs/__init__.py similarity index 100% rename from dags/imagegen_devx/configs/__init__.py rename to dags/sparsity_diffusion_devx/configs/__init__.py diff --git a/dags/imagegen_devx/configs/common.py b/dags/sparsity_diffusion_devx/configs/common.py similarity index 100% rename from dags/imagegen_devx/configs/common.py rename to dags/sparsity_diffusion_devx/configs/common.py diff --git a/dags/imagegen_devx/configs/gke_config.py b/dags/sparsity_diffusion_devx/configs/gke_config.py similarity index 100% rename from dags/imagegen_devx/configs/gke_config.py rename to dags/sparsity_diffusion_devx/configs/gke_config.py diff --git a/dags/imagegen_devx/configs/project_bite_config.py b/dags/sparsity_diffusion_devx/configs/project_bite_config.py similarity index 95% rename from dags/imagegen_devx/configs/project_bite_config.py rename to dags/sparsity_diffusion_devx/configs/project_bite_config.py index 58173947..14d1068d 100644 --- a/dags/imagegen_devx/configs/project_bite_config.py +++ b/dags/sparsity_diffusion_devx/configs/project_bite_config.py @@ -19,12 +19,12 @@ from typing import Tuple, Optional from xlml.apis import gcp_config, metric_config, task, test_config from dags import gcs_bucket, test_owner -from dags.imagegen_devx.configs import common +from dags.sparsity_diffusion_devx.configs import common from dags.vm_resource import TpuVersion, Project from airflow.models.taskmixin import DAGNode -GCS_SUBFOLDER_PREFIX = test_owner.Team.IMAGEGEN_DEVX.value +GCS_SUBFOLDER_PREFIX = test_owner.Team.SPARSITY_DIFFUSION_DEVX.value def set_up_axlearn(pinned_version) -> Tuple[str]: diff --git a/dags/imagegen_devx/jax_stable_stack_gpu_e2e.py b/dags/sparsity_diffusion_devx/jax_stable_stack_gpu_e2e.py similarity index 93% rename from dags/imagegen_devx/jax_stable_stack_gpu_e2e.py rename to dags/sparsity_diffusion_devx/jax_stable_stack_gpu_e2e.py index d06c1165..466bf3b6 100644 --- 
a/dags/imagegen_devx/jax_stable_stack_gpu_e2e.py +++ b/dags/sparsity_diffusion_devx/jax_stable_stack_gpu_e2e.py @@ -20,7 +20,7 @@ from dags import composer_env, test_owner, gcs_bucket from dags.vm_resource import Project, TpuVersion, CpuVersion, Zone, DockerImage, GpuVersion, XpkClusters from airflow.utils.task_group import TaskGroup -from dags.imagegen_devx.configs import gke_config as config +from dags.sparsity_diffusion_devx.configs import gke_config as config from xlml.utils import name_format from dags.multipod.configs.common import SetupMode @@ -31,7 +31,12 @@ with models.DAG( dag_id="jax_stable_stack_gpu_e2e", schedule=SCHEDULED_TIME, - tags=["multipod_team", "maxtext", "jax-stable-stack"], + tags=[ + "sparsity_diffusion_devx", + "multipod_team", + "maxtext", + "jax-stable-stack", + ], start_date=datetime.datetime(2024, 6, 7), catchup=False, ) as dag: diff --git a/dags/imagegen_devx/jax_stable_stack_tpu_e2e.py b/dags/sparsity_diffusion_devx/jax_stable_stack_tpu_e2e.py similarity index 97% rename from dags/imagegen_devx/jax_stable_stack_tpu_e2e.py rename to dags/sparsity_diffusion_devx/jax_stable_stack_tpu_e2e.py index f7c6d1b5..a0cbac1a 100644 --- a/dags/imagegen_devx/jax_stable_stack_tpu_e2e.py +++ b/dags/sparsity_diffusion_devx/jax_stable_stack_tpu_e2e.py @@ -20,7 +20,7 @@ from airflow.utils.task_group import TaskGroup from dags import composer_env, test_owner, gcs_bucket from dags.vm_resource import Project, TpuVersion, CpuVersion, Zone, DockerImage, GpuVersion, XpkClusters -from dags.imagegen_devx.configs import gke_config as config +from dags.sparsity_diffusion_devx.configs import gke_config as config from xlml.utils import name_format # Run once a day at 3 am UTC (7 pm PST) @@ -31,11 +31,13 @@ dag_id="jax_stable_stack_tpu_e2e", schedule=SCHEDULED_TIME, tags=[ + "sparsity_diffusion_devx", "multipod_team", "maxtext", "maxdiffusion", "axlearn", - "tpu" "jax-stable-stack", + "tpu", + "jax-stable-stack", ], start_date=datetime.datetime(2024, 6, 7), catchup=False, diff --git a/dags/imagegen_devx/maxdiffusion_e2e.py b/dags/sparsity_diffusion_devx/maxdiffusion_e2e.py similarity index 96% rename from dags/imagegen_devx/maxdiffusion_e2e.py rename to dags/sparsity_diffusion_devx/maxdiffusion_e2e.py index 980c43ce..3c9c7311 100644 --- a/dags/imagegen_devx/maxdiffusion_e2e.py +++ b/dags/sparsity_diffusion_devx/maxdiffusion_e2e.py @@ -20,7 +20,7 @@ from airflow.utils.task_group import TaskGroup from dags import composer_env, test_owner, gcs_bucket from dags.vm_resource import Project, TpuVersion, CpuVersion, Zone, DockerImage, GpuVersion, XpkClusters -from dags.imagegen_devx.configs import gke_config as config +from dags.sparsity_diffusion_devx.configs import gke_config as config from xlml.utils import name_format # Run once a day at 4 am UTC (8 pm PST) @@ -30,7 +30,7 @@ with models.DAG( dag_id="maxdiffusion_e2e", schedule=SCHEDULED_TIME, - tags=["multipod_team", "maxdiffusion"], + tags=["sparsity_diffusion_devx", "multipod_team", "maxdiffusion"], start_date=datetime.datetime(2024, 9, 12), catchup=False, ) as dag: diff --git a/dags/imagegen_devx/project_bite_gpu_e2e.py b/dags/sparsity_diffusion_devx/project_bite_gpu_e2e.py similarity index 91% rename from dags/imagegen_devx/project_bite_gpu_e2e.py rename to dags/sparsity_diffusion_devx/project_bite_gpu_e2e.py index 7ce82778..fdb95032 100644 --- a/dags/imagegen_devx/project_bite_gpu_e2e.py +++ b/dags/sparsity_diffusion_devx/project_bite_gpu_e2e.py @@ -19,7 +19,7 @@ from airflow import models from dags import composer_env, test_owner, 
gcs_bucket from dags.vm_resource import DockerImage, XpkClusters -from dags.imagegen_devx.configs import gke_config as config +from dags.sparsity_diffusion_devx.configs import gke_config as config from xlml.utils import name_format # Run once a day at 3 am UTC (7 pm PST) @@ -29,7 +29,13 @@ with models.DAG( dag_id="project_bite_gpu_e2e", schedule=SCHEDULED_TIME, - tags=["multipod_team", "gcp_gpu", "axlearn", "bite"], + tags=[ + "sparsity_diffusion_devx", + "multipod_team", + "gcp_gpu", + "axlearn", + "bite", + ], start_date=datetime.datetime(2024, 11, 12), catchup=False, ) as dag: diff --git a/dags/imagegen_devx/project_bite_tpu_e2e.py b/dags/sparsity_diffusion_devx/project_bite_tpu_e2e.py similarity index 92% rename from dags/imagegen_devx/project_bite_tpu_e2e.py rename to dags/sparsity_diffusion_devx/project_bite_tpu_e2e.py index 30bbcfa6..df7370b5 100644 --- a/dags/imagegen_devx/project_bite_tpu_e2e.py +++ b/dags/sparsity_diffusion_devx/project_bite_tpu_e2e.py @@ -18,7 +18,7 @@ from airflow import models from dags import composer_env from dags.vm_resource import TpuVersion, Zone, RuntimeVersion -from dags.imagegen_devx.configs import project_bite_config as config +from dags.sparsity_diffusion_devx.configs import project_bite_config as config # Run once a day at 6 pm UTC (11 am PST) @@ -28,7 +28,7 @@ with models.DAG( dag_id="project_bite_tpu_e2e", schedule=SCHEDULED_TIME, - tags=["imagegen_devx", "jax", "nightly", "bite", "multipod_team"], + tags=["sparsity_diffusion_devx", "jax", "nightly", "bite", "multipod_team"], start_date=datetime.datetime(2024, 4, 4), catchup=False, ) as dag: diff --git a/dags/test_owner.py b/dags/test_owner.py index f17b4348..fbe748b5 100644 --- a/dags/test_owner.py +++ b/dags/test_owner.py @@ -22,11 +22,10 @@ class Team(enum.Enum): PYTORCH_XLA = "pytorch_xla" MULTIPOD = "multipod" MLCOMPASS = "mlcompass" - IMAGEGEN_DEVX = "imagegen_devx" INFERENCE = "inference" FRAMEWORK = "framework3p" LLM_DEVX = "llm_devx" - SPARCITY_DIFFUSION_DEVX = "sparcity_diffusion_devx" + SPARSITY_DIFFUSION_DEVX = "sparsity_diffusion_devx" PERFORMANCE = "performance" PRODUCTIVITY = "productivity" @@ -54,7 +53,7 @@ class Team(enum.Enum): # MLCompass ORTI_B = "Orti B." -# ImageGen DevX +# Sparsity & Diffusion DevX RAN_R = "Ran R." PARAM_B = "Param B." From 67c27acfc644f03525eb59406b6f5e8210cd6fb0 Mon Sep 17 00:00:00 2001 From: shralex Date: Wed, 13 Nov 2024 16:10:27 -0800 Subject: [PATCH 04/26] Adding a quarantine folder for remaining multipod DAGs. 
(#474) --- .../multipod/maxtext_trillium_configs_perf.py | 12 +- dags/multipod/maxtext_v5e_configs_perf.py | 23 +- dags/quarantined_tests.py | 393 ++++++++++++++++++ xlml/apis/task.py | 10 + 4 files changed, 430 insertions(+), 8 deletions(-) diff --git a/dags/multipod/maxtext_trillium_configs_perf.py b/dags/multipod/maxtext_trillium_configs_perf.py index c6b45f7d..7fc8a107 100644 --- a/dags/multipod/maxtext_trillium_configs_perf.py +++ b/dags/multipod/maxtext_trillium_configs_perf.py @@ -17,6 +17,7 @@ """ import datetime from airflow import models +from airflow.utils.task_group import TaskGroup from dags import composer_env, test_owner from dags.vm_resource import TpuVersion, Zone, Project, XpkClusters, DockerImage from dags.multipod.configs import maxtext_sweep_gke_config @@ -40,6 +41,9 @@ start_date=datetime.datetime(2024, 2, 19), catchup=False, ) as dag: + quarantine_task_group = TaskGroup( + group_id="Quarantine", dag=dag, prefix_group_id=False + ) for mode, image in DOCKER_IMAGES: for model in MODEL_CONFIGS: base_run_model_cmds = [ @@ -63,9 +67,13 @@ ) chain_num = 4 - prev = maxtext_sweep_gke_test[0].run_with_run_name_generation() + prev = maxtext_sweep_gke_test[0].run_with_name_gen_and_quarantine( + quarantine_task_group + ) for i in range(1, len(maxtext_sweep_gke_test)): - curr = maxtext_sweep_gke_test[i].run_with_run_name_generation() + curr = maxtext_sweep_gke_test[i].run_with_name_gen_and_quarantine( + quarantine_task_group + ) if i % chain_num != 0: prev >> curr prev = curr diff --git a/dags/multipod/maxtext_v5e_configs_perf.py b/dags/multipod/maxtext_v5e_configs_perf.py index f367b29d..22eabe17 100644 --- a/dags/multipod/maxtext_v5e_configs_perf.py +++ b/dags/multipod/maxtext_v5e_configs_perf.py @@ -17,6 +17,7 @@ """ import datetime from airflow import models +from airflow.utils.task_group import TaskGroup from dags import composer_env, test_owner from dags.vm_resource import TpuVersion, Zone, Project, XpkClusters, DockerImage from dags.multipod.configs import maxtext_sweep_gke_config @@ -49,6 +50,9 @@ start_date=datetime.datetime(2024, 2, 19), catchup=False, ) as dag: + quarantine_task_group = TaskGroup( + group_id="Quarantine", dag=dag, prefix_group_id=False + ) for mode, image in DOCKER_IMAGES: for model in MODEL_CONFIGS: base_run_model_cmds = [ @@ -72,9 +76,13 @@ ) chain_num = 4 - prev = maxtext_sweep_gke_test[0].run_with_run_name_generation() + prev = maxtext_sweep_gke_test[0].run_with_name_gen_and_quarantine( + quarantine_task_group + ) for i in range(1, len(maxtext_sweep_gke_test)): - curr = maxtext_sweep_gke_test[i].run_with_run_name_generation() + curr = maxtext_sweep_gke_test[i].run_with_name_gen_and_quarantine( + quarantine_task_group + ) if i % chain_num != 0: prev >> curr prev = curr @@ -90,6 +98,9 @@ start_date=datetime.datetime(2024, 2, 19), catchup=False, ) as dag: + quarantine_task_group = TaskGroup( + group_id="Quarantine", dag=dag, prefix_group_id=False + ) for mode, image in DOCKER_IMAGES: for model in MODEL_CONFIGS: base_run_model_cmds = [ @@ -113,12 +124,12 @@ ) chain_num = 4 - prev = maxtext_sweep_gke_test[0].run_with_run_name_generation( - use_pathways=True + prev = maxtext_sweep_gke_test[0].run_with_name_gen_and_quarantine( + quarantine_task_group, use_pathways=True ) for i in range(1, len(maxtext_sweep_gke_test)): - curr = maxtext_sweep_gke_test[i].run_with_run_name_generation( - use_pathways=True + curr = maxtext_sweep_gke_test[i].run_with_name_gen_and_quarantine( + quarantine_task_group, use_pathways=True ) if i % chain_num != 0: prev >> curr diff 
--git a/dags/quarantined_tests.py b/dags/quarantined_tests.py index f98c06b9..fe581b68 100644 --- a/dags/quarantined_tests.py +++ b/dags/quarantined_tests.py @@ -287,6 +287,399 @@ class QuarantineTests: "mxla-maxtext-nightly-gke-8xv5p-8": TestInfo( team.PERFORMANCE, "2024-11-12" ), + # DAG: maxtext_trillium_configs_perf + "maxtext-llama2_70b_4096-stable-3-2xv6e-256": TestInfo( + team.PERFORMANCE, "2024-11-12" + ), + "maxtext-llama2_70b_4096-nightly-3-2xv6e-256": TestInfo( + team.PERFORMANCE, "2024-11-12" + ), + # DAG: maxtext_v5e_configs_perf + "maxtext-16b-stable-0-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-16b-stable-1-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-16b-stable-2-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-16b-stable-3-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-32b-stable-0-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-32b-stable-1-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-32b-stable-2-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-32b-stable-3-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-64b-stable-0-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-64b-stable-1-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-64b-stable-2-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-64b-stable-3-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-128b-stable-0-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-128b-stable-1-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-128b-stable-2-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-128b-stable-3-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-gpt3_175b-stable-0-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-gpt3_175b-stable-1-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-gpt3_175b-stable-2-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-gpt3_175b-stable-3-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_7b-stable-0-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_7b-stable-1-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_7b-stable-2-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_7b-stable-3-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_13b-stable-0-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_13b-stable-1-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_13b-stable-2-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_13b-stable-3-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_70b-stable-0-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_70b-stable-1-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_70b-stable-2-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_70b-stable-3-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-16b-nightly-0-v5litepod-256": 
TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-16b-nightly-1-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-16b-nightly-2-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-16b-nightly-3-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-32b-nightly-0-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-32b-nightly-1-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-32b-nightly-2-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-32b-nightly-3-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-64b-nightly-0-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-64b-nightly-1-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-64b-nightly-2-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-64b-nightly-3-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-128b-nightly-0-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-128b-nightly-1-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-128b-nightly-2-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-128b-nightly-3-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-gpt3_175b-nightly-0-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-gpt3_175b-nightly-1-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-gpt3_175b-nightly-2-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-gpt3_175b-nightly-3-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_7b-nightly-0-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_7b-nightly-1-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_7b-nightly-2-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_7b-nightly-3-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_13b-nightly-0-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_13b-nightly-1-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_13b-nightly-2-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_13b-nightly-3-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_70b-nightly-0-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_70b-nightly-1-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_70b-nightly-2-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_70b-nightly-3-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + # DAG: pathways_maxtext_v5e_configs_perf + "p-maxtext-16b-stable-0-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-16b-stable-1-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-16b-stable-2-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-16b-stable-3-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-32b-stable-0-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-32b-stable-1-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + 
"p-maxtext-32b-stable-2-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-32b-stable-3-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-64b-stable-0-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-64b-stable-1-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-64b-stable-2-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-64b-stable-3-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-128b-stable-0-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-128b-stable-1-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-128b-stable-2-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-128b-stable-3-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-gpt3_175b-stable-0-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-gpt3_175b-stable-1-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-gpt3_175b-stable-2-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-gpt3_175b-stable-3-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_7b-stable-0-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_7b-stable-1-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_7b-stable-2-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_7b-stable-3-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_13b-stable-0-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_13b-stable-1-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_13b-stable-2-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_13b-stable-3-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_70b-stable-0-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_70b-stable-1-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_70b-stable-2-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_70b-stable-3-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-16b-nightly-0-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-16b-nightly-1-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-16b-nightly-2-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-16b-nightly-3-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-32b-nightly-0-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-32b-nightly-1-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-32b-nightly-2-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-32b-nightly-3-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-64b-nightly-0-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-64b-nightly-1-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-64b-nightly-2-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + 
"p-maxtext-64b-nightly-3-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-128b-nightly-0-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-128b-nightly-1-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-128b-nightly-2-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-128b-nightly-3-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-gpt3_175b-nightly-0-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-gpt3_175b-nightly-1-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-gpt3_175b-nightly-2-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-gpt3_175b-nightly-3-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_7b-nightly-0-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_7b-nightly-1-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_7b-nightly-2-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_7b-nightly-3-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_13b-nightly-0-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_13b-nightly-1-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_13b-nightly-2-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_13b-nightly-3-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_70b-nightly-0-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_70b-nightly-1-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_70b-nightly-2-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_70b-nightly-3-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), } @staticmethod diff --git a/xlml/apis/task.py b/xlml/apis/task.py index 6cb84ce9..69806466 100644 --- a/xlml/apis/task.py +++ b/xlml/apis/task.py @@ -191,6 +191,16 @@ def run( return group + def run_with_name_gen_and_quarantine( + self, quarantine_task_group, use_pathways: bool = False + ) -> DAGNode: + test_name = self.task_test_config.benchmark_id + if QuarantineTests.is_quarantined(test_name): + with quarantine_task_group: + return self.run_with_run_name_generation(use_pathways) + else: + return self.run_with_run_name_generation(use_pathways) + def run_with_run_name_generation(self, use_pathways: bool = False) -> DAGNode: """Generate a unique run name and tensorboard file location, then run a test job within a docker image. 
From affb3a3b8a2a0b4a7cfcd26f5c5a182d07d97dfa Mon Sep 17 00:00:00 2001 From: Yijia Date: Thu, 14 Nov 2024 10:27:57 -0800 Subject: [PATCH 05/26] Fix Project Image Name for GPU Inference DAGs (#475) * fix project name * format --- dags/inference/trt_llm_inference.py | 2 +- dags/inference/trt_llm_mlperf_v40_inference.py | 2 +- dags/inference/trt_llm_mlperf_v41_inference.py | 4 ++-- dags/vm_resource.py | 1 + 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/dags/inference/trt_llm_inference.py b/dags/inference/trt_llm_inference.py index ea30e9d5..7b0f5b97 100644 --- a/dags/inference/trt_llm_inference.py +++ b/dags/inference/trt_llm_inference.py @@ -36,7 +36,7 @@ # Running on H100 GPU trt_llm_inference_config.get_trt_llm_gpu_config( machine_type=MachineVersion.A3_HIGHGPU_8G, - image_project=ImageProject.DEEP_LEARNING_PLATFORM_RELEASE, + image_project=ImageProject.ML_IMAGES, image_family=ImageFamily.COMMON_CU121_DEBIAN_11, accelerator_type=GpuVersion.H100, count=8, diff --git a/dags/inference/trt_llm_mlperf_v40_inference.py b/dags/inference/trt_llm_mlperf_v40_inference.py index 2926b52b..4d716ea4 100644 --- a/dags/inference/trt_llm_mlperf_v40_inference.py +++ b/dags/inference/trt_llm_mlperf_v40_inference.py @@ -50,7 +50,7 @@ # Running on H100 GPU trt_llm_mlperf_v40_config.get_trt_llm_mlperf_v40_gpu_config( machine_type=MachineVersion.A3_HIGHGPU_8G, - image_project=ImageProject.DEEP_LEARNING_PLATFORM_RELEASE, + image_project=ImageProject.ML_IMAGES, image_family=ImageFamily.COMMON_CU121_DEBIAN_11, accelerator_type=GpuVersion.H100, count=8, diff --git a/dags/inference/trt_llm_mlperf_v41_inference.py b/dags/inference/trt_llm_mlperf_v41_inference.py index 9fdf8700..1cfe4498 100644 --- a/dags/inference/trt_llm_mlperf_v41_inference.py +++ b/dags/inference/trt_llm_mlperf_v41_inference.py @@ -104,7 +104,7 @@ # Running on A100 GPU trt_llm_mlperf_v41_config.get_trt_llm_mlperf_gpu_config( machine_type=MachineVersion.A2_ULTRAGPU_8G, - image_project=ImageProject.DEEP_LEARNING_PLATFORM_RELEASE, + image_project=ImageProject.ML_IMAGES, image_family=ImageFamily.COMMON_CU121_DEBIAN_11, accelerator_type=GpuVersion.A100_80G, count=8, @@ -123,7 +123,7 @@ # Running on L4 GPU trt_llm_mlperf_v41_config.get_trt_llm_mlperf_gpu_config( machine_type=MachineVersion.G2_STAND_96, - image_project=ImageProject.DEEP_LEARNING_PLATFORM_RELEASE, + image_project=ImageProject.ML_IMAGES, image_family=ImageFamily.COMMON_CU121_DEBIAN_11, accelerator_type=GpuVersion.L4, count=8, diff --git a/dags/vm_resource.py b/dags/vm_resource.py index 8b8b6de3..4459be63 100644 --- a/dags/vm_resource.py +++ b/dags/vm_resource.py @@ -67,6 +67,7 @@ class ImageProject(enum.Enum): """Common image projects for GPU.""" DEEP_LEARNING_PLATFORM_RELEASE = "deeplearning-platform-release" + ML_IMAGES = "ml-images" class ImageFamily(enum.Enum): From 5af6bd71320aa966aa3958b5f63275e2df662314 Mon Sep 17 00:00:00 2001 From: Ran Ran Date: Fri, 15 Nov 2024 10:40:59 -0800 Subject: [PATCH 06/26] Split MoE from end_to_end test (#477) --- dags/multipod/maxtext_end_to_end.py | 26 +-- dags/quarantined_tests.py | 8 +- .../jax_stable_stack_gpu_e2e.py | 1 + .../maxtext_moe_tpu_e2e.py | 148 ++++++++++++++++++ .../project_bite_gpu_e2e.py | 2 +- .../project_bite_tpu_e2e.py | 8 +- 6 files changed, 162 insertions(+), 31 deletions(-) create mode 100644 dags/sparsity_diffusion_devx/maxtext_moe_tpu_e2e.py diff --git a/dags/multipod/maxtext_end_to_end.py b/dags/multipod/maxtext_end_to_end.py index e1817679..2d482441 100644 --- a/dags/multipod/maxtext_end_to_end.py +++ 
b/dags/multipod/maxtext_end_to_end.py @@ -20,7 +20,7 @@ from airflow.utils.task_group import TaskGroup from dags import composer_env, test_owner from dags.quarantined_tests import QuarantineTests -from dags.vm_resource import XpkClusters, CpuVersion, DockerImage, GpuVersion, Project, TpuVersion, Zone +from dags.vm_resource import XpkClusters, DockerImage from dags.multipod.configs import gke_config from xlml.utils import name_format @@ -77,30 +77,6 @@ "time_out_in_min": 60, }, ], - "mixtral-8x7b": [ - { - "script_name": "tpu/mixtral/8x7b/1_test_mixtral", - "cluster": XpkClusters.CPU_M1_MEGAMEM_96_CLUSTER, - "time_out_in_min": 240, - }, - { - "script_name": "tpu/mixtral/8x7b/2_test_mixtral", - "cluster": XpkClusters.TPU_V4_128_CLUSTER, - "time_out_in_min": 60, - }, - ], - "mixtral-8x22b": [ - { - "script_name": "tpu/mixtral/8x22b/1_test_mixtral", - "cluster": XpkClusters.CPU_M1_MEGAMEM_96_CLUSTER, - "time_out_in_min": 360, - }, - { - "script_name": "tpu/mixtral/8x22b/2_test_mixtral", - "cluster": XpkClusters.TPU_V5E_256_CLUSTER, - "time_out_in_min": 60, - }, - ], "llama2-70b": [ { "script_name": "tpu/llama2/70b/1_test_llama2_70b", diff --git a/dags/quarantined_tests.py b/dags/quarantined_tests.py index fe581b68..a5ba9f6b 100644 --- a/dags/quarantined_tests.py +++ b/dags/quarantined_tests.py @@ -178,16 +178,16 @@ class QuarantineTests: "chained_tests_gemma-7b_stable": TestInfo(team.LLM_DEVX, "2024-11-12"), "chained_tests_gemma-7b_nightly": TestInfo(team.LLM_DEVX, "2024-11-12"), "chained_tests_mixtral-8x7b_stable": TestInfo( - team.LLM_DEVX, "2024-11-12" + team.SPARSITY_DIFFUSION_DEVX, "2024-11-12" ), "chained_tests_mixtral-8x7b_nightly": TestInfo( - team.LLM_DEVX, "2024-11-12" + team.SPARSITY_DIFFUSION_DEVX, "2024-11-12" ), "chained_tests_mixtral-8x22b_stable": TestInfo( - team.LLM_DEVX, "2024-11-12" + team.SPARSITY_DIFFUSION_DEVX, "2024-11-12" ), "chained_tests_mixtral-8x22b_nightly": TestInfo( - team.LLM_DEVX, "2024-11-12" + team.SPARSITY_DIFFUSION_DEVX, "2024-11-12" ), "chained_tests_llama2-70b_stable": TestInfo(team.LLM_DEVX, "2024-11-12"), "chained_tests_llama2-70b_nightly": TestInfo(team.LLM_DEVX, "2024-11-12"), diff --git a/dags/sparsity_diffusion_devx/jax_stable_stack_gpu_e2e.py b/dags/sparsity_diffusion_devx/jax_stable_stack_gpu_e2e.py index 466bf3b6..400f49ce 100644 --- a/dags/sparsity_diffusion_devx/jax_stable_stack_gpu_e2e.py +++ b/dags/sparsity_diffusion_devx/jax_stable_stack_gpu_e2e.py @@ -35,6 +35,7 @@ "sparsity_diffusion_devx", "multipod_team", "maxtext", + "gpu", "jax-stable-stack", ], start_date=datetime.datetime(2024, 6, 7), diff --git a/dags/sparsity_diffusion_devx/maxtext_moe_tpu_e2e.py b/dags/sparsity_diffusion_devx/maxtext_moe_tpu_e2e.py new file mode 100644 index 00000000..8abe4919 --- /dev/null +++ b/dags/sparsity_diffusion_devx/maxtext_moe_tpu_e2e.py @@ -0,0 +1,148 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""A DAG to run end-to-end MoE tests.""" + + +import datetime +from airflow import models +from airflow.utils.task_group import TaskGroup +from dags import composer_env, test_owner +from dags.quarantined_tests import QuarantineTests +from dags.vm_resource import XpkClusters, DockerImage +from dags.multipod.configs import gke_config +from xlml.utils import name_format + +# Run once a day at 5 am UTC (9 pm PST) +SCHEDULED_TIME = "0 5 * * *" if composer_env.is_prod_env() else None + + +with models.DAG( + dag_id="maxtext_moe_tpu_e2e", + schedule=SCHEDULED_TIME, + tags=[ + "sparsity_diffusion_devx", + "multipod_team", + "maxtext", + "tpu", + "stable", + "nightly", + ], + start_date=datetime.datetime(2024, 11, 14), + catchup=False, +) as dag: + test_name_prefix = "maxtext" + quarantine_task_group = TaskGroup( + group_id="Quarantine", dag=dag, prefix_group_id=False + ) + + multicluster_test_models = { + "mixtral-8x7b": [ + { + "script_name": "tpu/mixtral/8x7b/1_test_mixtral", + "cluster": XpkClusters.CPU_M1_MEGAMEM_96_CLUSTER, + "time_out_in_min": 240, + }, + { + "script_name": "tpu/mixtral/8x7b/2_test_mixtral", + "cluster": XpkClusters.TPU_V4_128_CLUSTER, + "time_out_in_min": 60, + }, + ], + "mixtral-8x22b": [ + { + "script_name": "tpu/mixtral/8x22b/1_test_mixtral", + "cluster": XpkClusters.CPU_M1_MEGAMEM_96_CLUSTER, + "time_out_in_min": 360, + }, + { + "script_name": "tpu/mixtral/8x22b/2_test_mixtral", + "cluster": XpkClusters.TPU_V5E_256_CLUSTER, + "time_out_in_min": 60, + }, + ], + } + + def convert_checkpoint_and_run_training( + test_group_id, + test_name_prefix, + type, + docker_image, + model, + test_scripts_details, + ): + with TaskGroup(group_id=test_group_id, prefix_group_id=False) as group: + test_name = f"{test_name_prefix}-{type}-{model}" + shared_gcs_location = name_format.generate_gcs_folder_location.override( + task_id=f"{test_group_id}_generate_gcs_folder_location" + )( + gcs_subfolder, + test_group_id, + ) + conversion_cpu = gke_config.get_maxtext_cpu_end_to_end_gke_config( + time_out_in_min=test_scripts_details[0]["time_out_in_min"], + test_name=test_name, + run_model_cmds=( + f"export BASE_OUTPUT_PATH=$GCS_OUTPUT; bash end_to_end/{test_scripts_details[0]['script_name']}.sh", + ), + docker_image=docker_image, + test_owner=test_owner.RAN_R, + cluster=test_scripts_details[0]["cluster"], + ).run(gcs_location=shared_gcs_location) + training_tpu = gke_config.get_gke_config( + time_out_in_min=test_scripts_details[1]["time_out_in_min"], + test_name=test_name, + run_model_cmds=( + f"export BASE_OUTPUT_PATH=$GCS_OUTPUT; bash end_to_end/{test_scripts_details[1]['script_name']}.sh", + ), + docker_image=docker_image, + test_owner=test_owner.RAN_R, + cluster=test_scripts_details[1]["cluster"], + ).run(gcs_location=shared_gcs_location) + return conversion_cpu, training_tpu + + docker_image = { + "stable": DockerImage.MAXTEXT_TPU_JAX_STABLE_STACK.value, + "nightly": DockerImage.MAXTEXT_TPU_JAX_NIGHTLY.value, + } + tests = [] + for model, test_scripts_details in multicluster_test_models.items(): + gcs_subfolder = f"{test_owner.Team.SPARSITY_DIFFUSION_DEVX.value}/maxtext" + for type in docker_image.keys(): + test_group_id = "chained_tests" + "_" + model + "_" + type + if QuarantineTests.is_quarantined(test_group_id): + with quarantine_task_group: + mode_cpu, mode_tpu = convert_checkpoint_and_run_training( + test_group_id, + test_name_prefix, + type, + docker_image[type], + model, + test_scripts_details, + ) + else: + mode_cpu, mode_tpu = convert_checkpoint_and_run_training( + test_group_id, 
+ test_name_prefix, + type, + docker_image[type], + model, + test_scripts_details, + ) + tests.append(mode_cpu) + tests.append(mode_tpu) + + # stable_cpu >> stable_tpu >> nightly_cpu >> nightly_tpu + for i in range(len(tests) - 1): + tests[i] << tests[i + 1] diff --git a/dags/sparsity_diffusion_devx/project_bite_gpu_e2e.py b/dags/sparsity_diffusion_devx/project_bite_gpu_e2e.py index fdb95032..2d47b5cd 100644 --- a/dags/sparsity_diffusion_devx/project_bite_gpu_e2e.py +++ b/dags/sparsity_diffusion_devx/project_bite_gpu_e2e.py @@ -32,7 +32,7 @@ tags=[ "sparsity_diffusion_devx", "multipod_team", - "gcp_gpu", + "gpu", "axlearn", "bite", ], diff --git a/dags/sparsity_diffusion_devx/project_bite_tpu_e2e.py b/dags/sparsity_diffusion_devx/project_bite_tpu_e2e.py index df7370b5..b188373d 100644 --- a/dags/sparsity_diffusion_devx/project_bite_tpu_e2e.py +++ b/dags/sparsity_diffusion_devx/project_bite_tpu_e2e.py @@ -28,7 +28,13 @@ with models.DAG( dag_id="project_bite_tpu_e2e", schedule=SCHEDULED_TIME, - tags=["sparsity_diffusion_devx", "jax", "nightly", "bite", "multipod_team"], + tags=[ + "sparsity_diffusion_devx", + "multipod_team", + "tpu", + "axlearn", + "bite", + ], start_date=datetime.datetime(2024, 4, 4), catchup=False, ) as dag: From f1e009e0f88b195eff42a4c799793faf01920f66 Mon Sep 17 00:00:00 2001 From: Gunjan Jalori <39437795+gunjanj007@users.noreply.github.com> Date: Fri, 15 Nov 2024 14:23:59 -0800 Subject: [PATCH 07/26] Add AOTC nightly test DAG (#461) * Add an empty DAG for reproducibility * Trillium support for JAX Stable Stack DAG (#444) * Updated M* image versions to reflect jax_stable_stack=0.4.35-rev1 (#445) * Add Gpt3 regression tests for Models benchmark reproducible artifacts. * Delete old files * reformat * gpus 256 * add all the func * add all the func * fix recipe name * resolve comments * reformat * gger checks * resolve conflict * resolve conflict1 * resolve conflict2 * move utils to team folder * change bucket name * fix format * remove getting pods initially * Add copyright doc * resolve conflicts * fix formatting * fi8x formatting * fix formatting * fix formatting. --------- Co-authored-by: Param Bole --- .../aotc_reproducibility.py | 110 ++++++++++++++++++ dags/map_reproducibility/nemo_gpt3.py | 97 +++++++++++++++ dags/test_owner.py | 4 +- 3 files changed, 210 insertions(+), 1 deletion(-) create mode 100644 dags/map_reproducibility/aotc_reproducibility.py create mode 100644 dags/map_reproducibility/nemo_gpt3.py diff --git a/dags/map_reproducibility/aotc_reproducibility.py b/dags/map_reproducibility/aotc_reproducibility.py new file mode 100644 index 00000000..eff156ab --- /dev/null +++ b/dags/map_reproducibility/aotc_reproducibility.py @@ -0,0 +1,110 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"Bash helper commands for AOTC artifacts" + +import os + + +def set_variables_cmds(): + set_variables = ( + "export PROJECT=supercomputer-testing", + "export CLUSTER=a3plus-benchmark", + "export CLUSTER_REGION=australia-southeast1", + "NOW=$(date +%s)", + "export BUCKET_NAME=regression-testing-xlml", + "export JOB_NAME=gpt3-xlml-$NOW-175b-nemo", + ) + return set_variables + + +def set_project_commands(): + set_project_command = ( + "gcloud config set project $PROJECT", + "sudo chown -R airflow:airflow /home/airflow/composer_kube_config", + "gcloud container clusters get-credentials " + "$CLUSTER --region $CLUSTER_REGION", + ) + return set_project_command + + +def install_helm_cmds(): + install_helm_cmd = ( + "curl -fsSL -o get_helm.sh " + "https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3", + "chmod 700 get_helm.sh", + "./get_helm.sh", + ) + return install_helm_cmd + + +# By default the composer environment overwrites the +# namespaces to airflow namespaces. +# In order to prevent that it is necessary explicitly +# change the namespace to default. +def namespace_cmds(): + namespace = ( + "kubectl config view | grep namespace", + "kubectl config set-context --current --namespace=default", + "kubectl config set-context heml --namespace=default", + ) + return namespace + + +def wait_for_jobs_cmds(): + wait_for_job = ( + "echo 'will wait for job to start running'", + "kubectl wait --for=condition=running job/$JOB_NAME" + " --namespace=default --timeout=10m", + "echo 'will wait for jobs to finish'", + "kubectl wait --for=condition=complete " + "job/$JOB_NAME --namespace=default --timeout=100m", + ) + return wait_for_job + + +def copy_bucket_cmds(): + copy_bucket_contents = ( + "COMPLETE_JOB_NAME=$(gcloud storage ls " + "gs://$BUCKET_NAME/nemo-experiments/ | grep $JOB_NAME)", + "echo 'copying from' ", + "echo $COMPLETE_JOB_NAME", + "cd $REPO_ROOT/src/utils/training_metrics", + "gcloud storage cp ${COMPLETE_JOB_NAME}" + "dllogger/rank-0/dllogger.json .", + ) + return copy_bucket_contents + + +def get_metrics_cmds(): + # TODO(gunjanj007): get these parameters from the recipe + get_metrics = ( + "python3 process_training_results.py --file" + " dllogger.json --batch_size 2048 " + "--num_accelerators 256 " + "--precision fp8 " + "--model_type gpt3-175b " + "--accelerator_type h100 ", + ) + return get_metrics + + +def cleanup_cmds(): + cleanup = ( + "kubectl get pods " + "--no-headers=true | awk '{print $1}' " + "| grep $JOB_NAME | xargs kubectl delete pods", + "helm uninstall $JOB_NAME", + ) + return cleanup diff --git a/dags/map_reproducibility/nemo_gpt3.py b/dags/map_reproducibility/nemo_gpt3.py new file mode 100644 index 00000000..ff03fdaf --- /dev/null +++ b/dags/map_reproducibility/nemo_gpt3.py @@ -0,0 +1,97 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""DAGs to run Aotc reproducibility benchmarks.""" + +import datetime +from airflow import models +from airflow.decorators import task +from airflow.hooks.subprocess import SubprocessHook +from dags import composer_env +from dags.map_reproducibility.aotc_reproducibility import get_metrics_cmds +from dags.map_reproducibility.aotc_reproducibility import set_variables_cmds +from dags.map_reproducibility.aotc_reproducibility import set_project_commands +from dags.map_reproducibility.aotc_reproducibility import install_helm_cmds +from dags.map_reproducibility.aotc_reproducibility import namespace_cmds +from dags.map_reproducibility.aotc_reproducibility import wait_for_jobs_cmds +from dags.map_reproducibility.aotc_reproducibility import copy_bucket_cmds +from dags.map_reproducibility.aotc_reproducibility import cleanup_cmds + +# Run once a day at 2 pm UTC (6 am PST) +SCHEDULED_TIME = "0 14 * * *" if composer_env.is_prod_env() else None + + +@task +def run_aotc_workload(): + gpu_recipe_cmd = ( + "git clone https://github.com/ai-hypercomputer/gpu-recipes.git", + "cd gpu-recipes", + "export REPO_ROOT=`git rev-parse --show-toplevel`", + "export RECIPE_ROOT=" + "$REPO_ROOT/training/a3mega/gpt3-175b/nemo-pretraining-gke", + "cd $RECIPE_ROOT", + ) + + helm_cmds = ( + "CONFIG_FILE=$REPO_ROOT/src/frameworks" + "/nemo-configs/gpt3-175b-256gpus-fp8.yaml", + " helm install -f values.yaml " + "--namespace default " + "--set namespace=default" + " --set-file nemo_config" + "=$CONFIG_FILE" + " --set workload.image" + "=us-central1-docker.pkg.dev/" + "supercomputer-testing/gunjanjalori/nemo_test/nemo_workload:24.07" + " --set workload.gcsBucketForDataCataPath=$BUCKET_NAME" + " $JOB_NAME $REPO_ROOT/src/helm-charts/nemo-training", + ) + + hook = SubprocessHook() + result = hook.run_command( + [ + "bash", + "-c", + ";".join( + set_variables_cmds() + + set_project_commands() + + gpu_recipe_cmd + + install_helm_cmds() + + namespace_cmds() + + helm_cmds + + wait_for_jobs_cmds() + + copy_bucket_cmds() + + get_metrics_cmds() + + cleanup_cmds() + ), + ], + ) + assert result.exit_code == 0, f"Command failed with code {result.exit_code}" + + +with models.DAG( + dag_id="reproducibility_nemo_gpt3_nighly_dag", + schedule=SCHEDULED_TIME, + tags=[ + "simple", + "aotc", + "nightly", + "reproducibility", + "experimental", + "xlml", + ], + start_date=datetime.datetime(2024, 11, 15), + catchup=False, +) as dag: + run_aotc_workload() diff --git a/dags/test_owner.py b/dags/test_owner.py index fbe748b5..e96a8c5e 100644 --- a/dags/test_owner.py +++ b/dags/test_owner.py @@ -69,6 +69,8 @@ class Team(enum.Enum): # FRAMEWORK QINY_Y = "Qinyi Y." - # JAX AKANKSHA_G = "Akanksha G." + +# MAP_REPRODUCIBILITY +GUNJAN_J = "Gunjan J." From 8ef98a3e8b3d79a41307ef9cb2bdf669532d1b53 Mon Sep 17 00:00:00 2001 From: Akanksha Date: Fri, 15 Nov 2024 15:26:52 -0800 Subject: [PATCH 08/26] Add GKE tests for jax.distributed.initialize() (#480) Add tests for jax.distributed.initialize() function using the GKE stack. This improves code coverage for this function. 
Test logs: http://shortn/_kychJoMx2Q --- dags/multipod/configs/jax_tests_gce_config.py | 2 +- dags/multipod/configs/jax_tests_gke_config.py | 41 +++++ dags/multipod/jax_functional_tests.py | 155 +++++++++--------- 3 files changed, 118 insertions(+), 80 deletions(-) create mode 100644 dags/multipod/configs/jax_tests_gke_config.py diff --git a/dags/multipod/configs/jax_tests_gce_config.py b/dags/multipod/configs/jax_tests_gce_config.py index 0f31af96..9d785462 100644 --- a/dags/multipod/configs/jax_tests_gce_config.py +++ b/dags/multipod/configs/jax_tests_gce_config.py @@ -15,7 +15,7 @@ """Utilities to construct configs for JAX tests for GCE.""" from xlml.apis import gcp_config, metric_config, task, test_config -from dags import test_owner, gcs_bucket +from dags import test_owner from dags.multipod.configs import common from dags.vm_resource import TpuVersion, Project, RuntimeVersion import datetime diff --git a/dags/multipod/configs/jax_tests_gke_config.py b/dags/multipod/configs/jax_tests_gke_config.py new file mode 100644 index 00000000..183c4d6e --- /dev/null +++ b/dags/multipod/configs/jax_tests_gke_config.py @@ -0,0 +1,41 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utilities to construct configs for JAX tests for GCE.""" + +from dags import test_owner +from dags.multipod.configs import gke_config +from dags.vm_resource import XpkClusterConfig + + +def get_jax_distributed_initialize_config( + cluster: XpkClusterConfig, + time_out_in_min: int, + test_name: str, + docker_image: str, + num_slices: int = 1, +): + run_model_cmds = [ + "bash end_to_end/test_jdi.sh", + ] + + return gke_config.get_gke_config( + cluster=cluster, + test_name=test_name, + run_model_cmds=run_model_cmds, + num_slices=num_slices, + docker_image=docker_image, + test_owner=test_owner.AKANKSHA_G, + time_out_in_min=time_out_in_min, + ) diff --git a/dags/multipod/jax_functional_tests.py b/dags/multipod/jax_functional_tests.py index 9538c688..cf76af8f 100644 --- a/dags/multipod/jax_functional_tests.py +++ b/dags/multipod/jax_functional_tests.py @@ -17,8 +17,8 @@ import datetime from airflow import models from dags import composer_env -from dags.vm_resource import TpuVersion, Zone, Project, V5_NETWORKS, V5P_SUBNETWORKS, RuntimeVersion -from dags.multipod.configs import jax_tests_gce_config +from dags.vm_resource import DockerImage, TpuVersion, Zone, Project, V5_NETWORKS, V5P_SUBNETWORKS, RuntimeVersion, XpkClusters +from dags.multipod.configs import jax_tests_gce_config, jax_tests_gke_config from dags.multipod.configs.common import SetupMode # Run once a day at 10 am UTC (2 am PST) @@ -32,87 +32,84 @@ catchup=False, ) as dag: default_test_name = "jax-distributed-initialize" - test_modes = [SetupMode.STABLE, SetupMode.NIGHTLY, SetupMode.JAX_STABLE_STACK] + v5p_project_name = Project.TPU_PROD_ENV_AUTOMATED.value + v5p_network = V5_NETWORKS + v5p_subnetwork = V5P_SUBNETWORKS + v5p_runtime_version = RuntimeVersion.V2_ALPHA_TPUV5.value + test_modes_with_docker_images = [ + (SetupMode.STABLE, 
None), + (SetupMode.JAX_STABLE_STACK, DockerImage.MAXTEXT_TPU_JAX_STABLE_STACK), + (SetupMode.NIGHTLY, DockerImage.MAXTEXT_TPU_JAX_NIGHTLY), + ] + v4_task_arr, v5p_task_arr = [], [] - for test_mode in test_modes: - # v4 - jax_nightly_1slice_v4_8 = ( - jax_tests_gce_config.get_jax_distributed_initialize_config( - tpu_version=TpuVersion.V4, - tpu_cores=8, - tpu_zone=Zone.US_CENTRAL2_B.value, - time_out_in_min=60, - is_tpu_reserved=False, - test_name=f"{default_test_name}-{test_mode.value}", - test_mode=test_mode, - ) - ) - if len(v4_task_arr) > 1: - # pylint: disable-next=pointless-statement - v4_task_arr[-1] >> jax_nightly_1slice_v4_8 - v4_task_arr.append(jax_nightly_1slice_v4_8) + for test_mode, gke_docker_image in test_modes_with_docker_images: + for num_slices in (1, 2): + # v4 GCE + jax_gce_v4_8 = jax_tests_gce_config.get_jax_distributed_initialize_config( + tpu_version=TpuVersion.V4, + tpu_cores=8, + tpu_zone=Zone.US_CENTRAL2_B.value, + time_out_in_min=60, + is_tpu_reserved=False, + num_slices=num_slices, + test_name=f"{default_test_name}-gce-{test_mode.value}", + test_mode=test_mode, + ) + if len(v4_task_arr) > 1: + # pylint: disable-next=pointless-statement + v4_task_arr[-1] >> jax_gce_v4_8 + v4_task_arr.append(jax_gce_v4_8) - jax_nightly_2slice_v4_8 = ( - jax_tests_gce_config.get_jax_distributed_initialize_config( - tpu_version=TpuVersion.V4, - tpu_cores=8, - tpu_zone=Zone.US_CENTRAL2_B.value, - time_out_in_min=60, - is_tpu_reserved=False, - num_slices=2, - test_name=f"{default_test_name}-{test_mode.value}", - test_mode=test_mode, + # v4 GKE + if gke_docker_image is not None: + jax_gke_v4_8 = ( + jax_tests_gke_config.get_jax_distributed_initialize_config( + cluster=XpkClusters.TPU_V4_8_MAXTEXT_CLUSTER, + time_out_in_min=60, + num_slices=num_slices, + test_name=f"{default_test_name}-gke-{test_mode.value}", + docker_image=gke_docker_image.value, + ).run() ) - ) - - # pylint: disable-next=pointless-statement - v4_task_arr[-1] >> jax_nightly_2slice_v4_8 - v4_task_arr.append(jax_nightly_2slice_v4_8) + # pylint: disable-next=pointless-statement + v4_task_arr[-1] >> jax_gke_v4_8 + v4_task_arr.append(jax_gke_v4_8) - # v5p - v5p_project_name = Project.TPU_PROD_ENV_AUTOMATED.value - v5p_network = V5_NETWORKS - v5p_subnetwork = V5P_SUBNETWORKS - v5p_runtime_version = RuntimeVersion.V2_ALPHA_TPUV5.value + # v5p GCE + jax_gce_v5p_8 = ( + jax_tests_gce_config.get_jax_distributed_initialize_config( + tpu_version=TpuVersion.V5P, + tpu_cores=8, + num_slices=num_slices, + tpu_zone=Zone.US_EAST5_A.value, + runtime_version=v5p_runtime_version, + project_name=v5p_project_name, + time_out_in_min=60, + is_tpu_reserved=True, + test_name=f"{default_test_name}-gce-{test_mode.value}", + test_mode=test_mode, + network=v5p_network, + subnetwork=v5p_subnetwork, + ) + ) + if len(v5p_task_arr) > 1: + # pylint: disable-next=pointless-statement + v5p_task_arr[-1] >> jax_gce_v5p_8 + v5p_task_arr.append(jax_gce_v5p_8) - jax_nightly_1slice_v5p_8 = ( - jax_tests_gce_config.get_jax_distributed_initialize_config( - tpu_version=TpuVersion.V5P, - tpu_cores=8, - tpu_zone=Zone.US_EAST5_A.value, - runtime_version=v5p_runtime_version, - project_name=v5p_project_name, - time_out_in_min=60, - is_tpu_reserved=True, - test_name=f"{default_test_name}-{test_mode.value}", - test_mode=test_mode, - network=v5p_network, - subnetwork=v5p_subnetwork, + # v5p GKE + if gke_docker_image is not None: + jax_gke_v5p_8 = ( + jax_tests_gke_config.get_jax_distributed_initialize_config( + cluster=XpkClusters.TPU_V5P_8_CLUSTER, + time_out_in_min=60, 
+ num_slices=num_slices, + test_name=f"{default_test_name}-gke-{test_mode.value}", + docker_image=gke_docker_image.value, + ).run() ) - ) - if len(v5p_task_arr) > 1: - # pylint: disable-next=pointless-statement - v5p_task_arr[-1] >> jax_nightly_1slice_v5p_8 - v5p_task_arr.append(jax_nightly_1slice_v5p_8) - - jax_nightly_2slice_v5p_8 = ( - jax_tests_gce_config.get_jax_distributed_initialize_config( - tpu_version=TpuVersion.V5P, - tpu_cores=8, - num_slices=2, - tpu_zone=Zone.US_EAST5_A.value, - runtime_version=v5p_runtime_version, - project_name=v5p_project_name, - time_out_in_min=60, - is_tpu_reserved=True, - test_name=f"{default_test_name}-{test_mode.value}", - test_mode=test_mode, - network=v5p_network, - subnetwork=v5p_subnetwork, - ) - ) - - # pylint: disable-next=pointless-statement - v5p_task_arr[-1] >> jax_nightly_2slice_v5p_8 - v5p_task_arr.append(jax_nightly_2slice_v5p_8) + # pylint: disable-next=pointless-statement + v5p_task_arr[-1] >> jax_gke_v5p_8 + v5p_task_arr.append(jax_gke_v5p_8) From 0d1c0b8a844196ff5fbc5e82cf031d80979c3fab Mon Sep 17 00:00:00 2001 From: Ran Ran Date: Fri, 15 Nov 2024 15:40:39 -0800 Subject: [PATCH 09/26] Fix test dependency issue (#479) --- dags/multipod/maxtext_end_to_end.py | 2 +- dags/sparsity_diffusion_devx/maxtext_moe_tpu_e2e.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dags/multipod/maxtext_end_to_end.py b/dags/multipod/maxtext_end_to_end.py index 2d482441..287257b6 100644 --- a/dags/multipod/maxtext_end_to_end.py +++ b/dags/multipod/maxtext_end_to_end.py @@ -162,4 +162,4 @@ def convert_checkpoint_and_run_training( # stable_cpu >> stable_tpu >> nightly_cpu >> nightly_tpu for i in range(len(tests) - 1): - tests[i] << tests[i + 1] + tests[i] >> tests[i + 1] diff --git a/dags/sparsity_diffusion_devx/maxtext_moe_tpu_e2e.py b/dags/sparsity_diffusion_devx/maxtext_moe_tpu_e2e.py index 8abe4919..3430f2a0 100644 --- a/dags/sparsity_diffusion_devx/maxtext_moe_tpu_e2e.py +++ b/dags/sparsity_diffusion_devx/maxtext_moe_tpu_e2e.py @@ -145,4 +145,4 @@ def convert_checkpoint_and_run_training( # stable_cpu >> stable_tpu >> nightly_cpu >> nightly_tpu for i in range(len(tests) - 1): - tests[i] << tests[i + 1] + tests[i] >> tests[i + 1] From d30999d45a27730d105ad9088e71d0b6dd0ccd45 Mon Sep 17 00:00:00 2001 From: Param Bole Date: Mon, 18 Nov 2024 08:29:30 -0800 Subject: [PATCH 10/26] Adding new tests with stable stack nightly jax images (#476) * Adding new tests with stable stack nightly jax images * Correcting Typo --- .../jax_stable_stack_gpu_e2e.py | 2 +- .../jax_stable_stack_tpu_e2e.py | 68 +++++++++++-------- dags/vm_resource.py | 8 ++- 3 files changed, 45 insertions(+), 33 deletions(-) diff --git a/dags/sparsity_diffusion_devx/jax_stable_stack_gpu_e2e.py b/dags/sparsity_diffusion_devx/jax_stable_stack_gpu_e2e.py index 400f49ce..6719b364 100644 --- a/dags/sparsity_diffusion_devx/jax_stable_stack_gpu_e2e.py +++ b/dags/sparsity_diffusion_devx/jax_stable_stack_gpu_e2e.py @@ -61,7 +61,7 @@ docker_images = [ (SetupMode.STABLE, DockerImage.MAXTEXT_GPU_JAX_STABLE_STACK), - (SetupMode.NIGHTLY, DockerImage.MAXTEXT_GPU_JAX_STABLE_STACK_NIGHTLY), + (SetupMode.NIGHTLY, DockerImage.MAXTEXT_GPU_STABLE_STACK_NIGHTLY_JAX), ] for model, (test_script, nnodes) in test_models_gpu.items(): diff --git a/dags/sparsity_diffusion_devx/jax_stable_stack_tpu_e2e.py b/dags/sparsity_diffusion_devx/jax_stable_stack_tpu_e2e.py index a0cbac1a..572a6c32 100644 --- a/dags/sparsity_diffusion_devx/jax_stable_stack_tpu_e2e.py +++ 
b/dags/sparsity_diffusion_devx/jax_stable_stack_tpu_e2e.py
@@ -21,6 +21,7 @@
 from dags import composer_env, test_owner, gcs_bucket
 from dags.vm_resource import Project, TpuVersion, CpuVersion, Zone, DockerImage, GpuVersion, XpkClusters
 from dags.sparsity_diffusion_devx.configs import gke_config as config
+from dags.multipod.configs.common import SetupMode
 from xlml.utils import name_format
 
 # Run once a day at 3 am UTC (7 pm PST)
@@ -62,45 +63,52 @@
       group_id="Quarantine", dag=dag, prefix_group_id=False
   )
 
+  docker_images = [
+      (SetupMode.STABLE, DockerImage.MAXTEXT_TPU_JAX_STABLE_STACK),
+      (SetupMode.NIGHTLY, DockerImage.MAXTEXT_TPU_STABLE_STACK_NIGHTLY_JAX),
+  ]
+
   for accelerator, slices in maxtext_test_configs.items():
     cores = accelerator.rsplit("-", maxsplit=1)[-1]
     cluster = config.clusters[accelerator]
     for slice_num in slices:
-      maxtext_jax_stable_stack_test = config.get_gke_config(
-          num_slices=slice_num,
-          cluster=cluster,
-          time_out_in_min=60,
-          run_model_cmds=(
-              f"JAX_PLATFORMS=tpu,cpu ENABLE_PJRT_COMPATIBILITY=true TPU_SLICE_BUILDER_DUMP_CHIP_FORCE=true TPU_SLICE_BUILDER_DUMP_ICI=true JAX_FORCE_TPU_INIT=true ENABLE_TPUNETD_CLIENT=true && "
-              f"python MaxText/train.py MaxText/configs/base.yml run_name={slice_num}slice-V{cluster.device_version}_{cores}-maxtext-jax-stable-stack-{current_datetime} "
-              "steps=30 per_device_batch_size=1 max_target_length=4096 model_name=llama2-7b "
-              "enable_checkpointing=false attention=dot_product remat_policy=minimal_flash use_iota_embed=true scan_layers=false "
-              "dataset_type=synthetic async_checkpointing=false "
-              f"base_output_directory={gcs_bucket.BASE_OUTPUT_DIR}/maxtext/jax-stable-stack/automated/{current_datetime}",
-          ),
-          test_name=f"maxtext-jax-stable-stack-{accelerator}-{slice_num}x",
-          docker_image=DockerImage.MAXTEXT_TPU_JAX_STABLE_STACK.value,
-          test_owner=test_owner.PARAM_B,
-      ).run_with_quarantine(quarantine_task_group)
+      for mode, image in docker_images:
+        maxtext_jax_stable_stack_test = config.get_gke_config(
+            num_slices=slice_num,
+            cluster=cluster,
+            time_out_in_min=60,
+            run_model_cmds=(
+                f"JAX_PLATFORMS=tpu,cpu ENABLE_PJRT_COMPATIBILITY=true TPU_SLICE_BUILDER_DUMP_CHIP_FORCE=true TPU_SLICE_BUILDER_DUMP_ICI=true JAX_FORCE_TPU_INIT=true ENABLE_TPUNETD_CLIENT=true && "
+                f"python MaxText/train.py MaxText/configs/base.yml run_name={slice_num}slice-V{cluster.device_version}_{cores}-maxtext-jax-stable-stack-{current_datetime} "
+                "steps=30 per_device_batch_size=1 max_target_length=4096 model_name=llama2-7b "
+                "enable_checkpointing=false attention=dot_product remat_policy=minimal_flash use_iota_embed=true scan_layers=false "
+                "dataset_type=synthetic async_checkpointing=false "
+                f"base_output_directory={gcs_bucket.BASE_OUTPUT_DIR}/maxtext/jax-stable-stack/automated/{current_datetime}",
+            ),
+            test_name=f"maxtext-jax-stable-stack-{mode.value}-{accelerator}-{slice_num}x",
+            docker_image=image.value,
+            test_owner=test_owner.PARAM_B,
+        ).run_with_quarantine(quarantine_task_group)
 
 for accelerator, slices in maxdiffusion_test_configs.items():
   cores = accelerator.rsplit("-", maxsplit=1)[-1]
   cluster = config.clusters[accelerator]
   for slice_num in slices:
-    maxdiffusion_jax_stable_stack_test = config.get_gke_config(
-        num_slices=slice_num,
-        cluster=cluster,
-        time_out_in_min=60,
-        run_model_cmds=(
-            f"JAX_PLATFORMS=tpu,cpu ENABLE_PJRT_COMPATIBILITY=true TPU_SLICE_BUILDER_DUMP_CHIP_FORCE=true TPU_SLICE_BUILDER_DUMP_ICI=true JAX_FORCE_TPU_INIT=true ENABLE_TPUNETD_CLIENT=true && "
-            f"pip install . && python src/maxdiffusion/train.py src/maxdiffusion/configs/base_2_base.yml "
-            f"run_name={slice_num}slice-V{cluster.device_version}_{cores}-maxdiffusion-jax-stable-stack-{current_datetime} "
-            f"output_dir={gcs_bucket.BASE_OUTPUT_DIR}/maxdiffusion/jax-stable-stack/automated/{current_datetime}",
-        ),
-        test_name=f"maxdiffusion-jax-stable-stack-{accelerator}-{slice_num}x",
-        docker_image=DockerImage.MAXDIFFUSION_TPU_JAX_STABLE_STACK.value,
-        test_owner=test_owner.PARAM_B,
-    ).run_with_quarantine(quarantine_task_group)
+    for mode, image in docker_images:
+      maxdiffusion_jax_stable_stack_test = config.get_gke_config(
+          num_slices=slice_num,
+          cluster=cluster,
+          time_out_in_min=60,
+          run_model_cmds=(
+              f"JAX_PLATFORMS=tpu,cpu ENABLE_PJRT_COMPATIBILITY=true TPU_SLICE_BUILDER_DUMP_CHIP_FORCE=true TPU_SLICE_BUILDER_DUMP_ICI=true JAX_FORCE_TPU_INIT=true ENABLE_TPUNETD_CLIENT=true && "
+              f"pip install . && python src/maxdiffusion/train.py src/maxdiffusion/configs/base_2_base.yml "
+              f"run_name={slice_num}slice-V{cluster.device_version}_{cores}-maxdiffusion-jax-stable-stack-{current_datetime} "
+              f"output_dir={gcs_bucket.BASE_OUTPUT_DIR}/maxdiffusion/jax-stable-stack/automated/{current_datetime}",
+          ),
+          test_name=f"maxdiffusion-jax-stable-stack-{mode.value}-{accelerator}-{slice_num}x",
+          docker_image=image.value,
+          test_owner=test_owner.PARAM_B,
+      ).run_with_quarantine(quarantine_task_group)
 
 for accelerator, slices in axlearn_test_configs.items():
   cores = accelerator.rsplit("-", maxsplit=1)[-1]
diff --git a/dags/vm_resource.py b/dags/vm_resource.py
index 4459be63..01f4638b 100644
--- a/dags/vm_resource.py
+++ b/dags/vm_resource.py
@@ -303,6 +303,10 @@ class DockerImage(enum.Enum):
       "gcr.io/tpu-prod-env-multipod/maxtext_jax_stable_stack_0.4.35:"
       f"{datetime.datetime.today().strftime('%Y-%m-%d')}"
   )
+  MAXTEXT_TPU_STABLE_STACK_NIGHTLY_JAX = (
+      "gcr.io/tpu-prod-env-multipod/maxtext_stable_stack_nightly_jax:"
+      f"{datetime.datetime.today().strftime('%Y-%m-%d')}"
+  )
   MAXDIFFUSION_TPU_JAX_STABLE_STACK = (
       "gcr.io/tpu-prod-env-multipod/maxdiffusion_jax_stable_stack_0.4.35:"
       f"{datetime.datetime.today().strftime('%Y-%m-%d')}"
@@ -323,8 +327,8 @@
       "gcr.io/tpu-prod-env-multipod/maxtext_gpu_jax_stable_stack_0.4.35:"
       f"{datetime.datetime.today().strftime('%Y-%m-%d')}"
   )
-  MAXTEXT_GPU_JAX_STABLE_STACK_NIGHTLY = (
-      "gcr.io/tpu-prod-env-multipod/maxtext_gpu_jax_stable_stack_nightly:"
+  MAXTEXT_GPU_STABLE_STACK_NIGHTLY_JAX = (
+      "gcr.io/tpu-prod-env-multipod/maxtext_gpu_stable_stack_nightly_jax:"
       f"{datetime.datetime.today().strftime('%Y-%m-%d')}"
   )
   MAXTEXT_GPU_JAX_NIGHTLY = (

From 12a6b809600261db1855dac9f4c3243be77636b1 Mon Sep 17 00:00:00 2001
From: Yijia
Date: Mon, 18 Nov 2024 10:22:21 -0800
Subject: [PATCH 11/26] add gemma (#481)

---
 dags/inference/configs/trt_llm_inference_config.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/dags/inference/configs/trt_llm_inference_config.py b/dags/inference/configs/trt_llm_inference_config.py
index b13f3ebf..913f29b6 100644
--- a/dags/inference/configs/trt_llm_inference_config.py
+++ b/dags/inference/configs/trt_llm_inference_config.py
@@ -57,6 +57,7 @@ def get_trt_llm_gpu_config(
       "gsutil -m cp -r gs://tohaowu/llama_3_8B_Instruct_HF_model .",
       "gsutil -m cp -r gs://tohaowu/llama_3.1_70B_Instruct_HF_model .",
       "gsutil -m cp -r gs://tohaowu/Mixtral-8x22B-Instruct-v0.1 .",
+      "gsutil -m cp -r gs://yijiaj/gemma/gemma-2-27b-it .",
      "sudo apt-get update",
      "sudo apt-get -y install git git-lfs",
      "git 
clone https://github.com/NVIDIA/TensorRT-LLM.git", @@ -103,10 +104,14 @@ def get_trt_llm_gpu_config( "trtllm-build --checkpoint_dir /scratch/tllm_checkpoint_8gpu_tp8 --output_dir /scratch/llama/70B/trt_engines/fp16/8-gpu/ --gemm_plugin auto", "python ../llama/convert_checkpoint.py --model_dir /scratch/Mixtral-8x22B-Instruct-v0.1 --output_dir /scratch/tllm_checkpoint_mixtral_8gpu --dtype float16 --tp_size 8 --moe_tp_size 2 --moe_ep_size 4", "trtllm-build --checkpoint_dir /scratch/tllm_checkpoint_mixtral_8gpu --output_dir /scratch/trt_engines/mixtral/tp2ep4", + "cd ../gemma", + "python3 convert_checkpoint.py --ckpt-type hf --model-dir /scratch/gemma-2-27b-it/ --dtype bfloat16 --world-size 1 --output-model-dir /scratch/checkpoints/tmp_27b_it_tensorrt_llm/bf16/tp1/", + "trtllm-build --checkpoint_dir /scratch/checkpoints/tmp_27b_it_tensorrt_llm/bf16/tp1/ --gemm_plugin auto --max_batch_size 8 --max_input_len 3000 --max_seq_len 3100 --output_dir /scratch/gemma2/27b/bf16/1-gpu/", "cd ../../benchmarks/python", "python benchmark.py -m dec --engine_dir /scratch/llama/8B/trt_engines/fp16/1-gpu/ --csv", "OMPI_ALLOW_RUN_AS_ROOT=1 OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 mpirun -n 8 python benchmark.py -m dec --engine_dir /scratch/llama/70B/trt_engines/fp16/8-gpu/ --csv", "OMPI_ALLOW_RUN_AS_ROOT=1 OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 mpirun -n 8 python benchmark.py -m dec --engine_dir /scratch/trt_engines/mixtral/tp2ep4 --csv", + "python benchmark.py -m dec --engine_dir /scratch/gemma2/27b/bf16/1-gpu/ --dtype bfloat16 --csv", make_jsonl_convert_cmd, f"python jsonl_converter.py {jsonl_output_path}", ) From 2a3a8e81525ccb086cb586a94f29677eac00019e Mon Sep 17 00:00:00 2001 From: wenxindongwork <161090399+wenxindongwork@users.noreply.github.com> Date: Mon, 25 Nov 2024 10:58:24 -0800 Subject: [PATCH 12/26] Use smaller embedding size as 32 leads to segmentation fault (#487) --- dags/solutions_team/solutionsteam_tf_nightly_supported.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dags/solutions_team/solutionsteam_tf_nightly_supported.py b/dags/solutions_team/solutionsteam_tf_nightly_supported.py index 13abe496..c2bebeed 100644 --- a/dags/solutions_team/solutionsteam_tf_nightly_supported.py +++ b/dags/solutions_team/solutionsteam_tf_nightly_supported.py @@ -112,7 +112,7 @@ runtime_version=RuntimeVersion.TPU_VM_TF_NIGHTLY_POD.value, ) - embedding_dim = 32 + embedding_dim = 16 tf_dlrm_v5p_8 = tf_config.get_tf_dlrm_config( project_name=Project.TPU_PROD_ENV_AUTOMATED.value, tpu_version=TpuVersion.V5P, From 32b056a7e5a9923e8173b6771b3778b0a7102ed5 Mon Sep 17 00:00:00 2001 From: Richard Liu <39319471+richardsliu@users.noreply.github.com> Date: Mon, 25 Nov 2024 11:18:23 -0800 Subject: [PATCH 13/26] Update CODEOWNERS - add richardsliu as owner (#488) --- .github/CODEOWNERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index fd1f8567..e68ed421 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,8 +1,8 @@ # Default owners for everything in the repo, unless a later match takes precedence. 
* @mbzomowski @RissyRan @allenwang28
 
-dags/solutions_team/configs/tensorflow @chandrasekhard2 @ZhaoyueCheng
-dags/solutions_team/solutionsteam_tf* @chandrasekhard2 @ZhaoyueCheng
+dags/solutions_team/configs/tensorflow @chandrasekhard2 @ZhaoyueCheng @richardsliu
+dags/solutions_team/solutionsteam_tf* @chandrasekhard2 @ZhaoyueCheng @richardsliu
 
 dags/pytorch_xla @JackCaoG @vanbasten23 @zpcore @ManfeiBai
 dags/legacy_test/tests/pytorch @JackCaoG @vanbasten23 @zpcore @ManfeiBai

From fbb1de9a78c7d9952479cb7d5ac035dabb44b6fb Mon Sep 17 00:00:00 2001
From: wenxindongwork <161090399+wenxindongwork@users.noreply.github.com>
Date: Mon, 25 Nov 2024 11:28:24 -0800
Subject: [PATCH 14/26] Oncall test fix (#486)

* change cluster

change cluster

* typo

* The -f flag makes rm not error if the file doesn't exist.

---------

Co-authored-by: chandrasekhard2 <98771505+chandrasekhard2@users.noreply.github.com>
---
 dags/solutions_team/configs/tensorflow/common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dags/solutions_team/configs/tensorflow/common.py b/dags/solutions_team/configs/tensorflow/common.py
index 3972e227..beee93d5 100644
--- a/dags/solutions_team/configs/tensorflow/common.py
+++ b/dags/solutions_team/configs/tensorflow/common.py
@@ -19,7 +19,7 @@
 
 CMD_PRINT_TF_VERSION = "python3 -c \"import tensorflow; print('Running using TensorFlow Version: ' + tensorflow.__version__)\""
 
-CMD_REMOVE_LIBTPU_LOCKFILE = "sudo rm /tmp/libtpu_lockfile"
+CMD_REMOVE_LIBTPU_LOCKFILE = "sudo rm -f /tmp/libtpu_lockfile"
 
 CMD_INSTALL_KERAS_NIGHTLY = (
     "pip install --upgrade --no-deps --force-reinstall tf-keras-nightly"
 )

From 10936b142fd67454e0d920c66bf9b920fd5b01bf Mon Sep 17 00:00:00 2001
From: Richard Liu <39319471+richardsliu@users.noreply.github.com>
Date: Mon, 25 Nov 2024 18:06:55 -0800
Subject: [PATCH 15/26] fix nightly (#489)

---
 dags/solutions_team/solutionsteam_tf_nightly_supported.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/dags/solutions_team/solutionsteam_tf_nightly_supported.py b/dags/solutions_team/solutionsteam_tf_nightly_supported.py
index c2bebeed..d79bfdf1 100644
--- a/dags/solutions_team/solutionsteam_tf_nightly_supported.py
+++ b/dags/solutions_team/solutionsteam_tf_nightly_supported.py
@@ -41,6 +41,7 @@
       tpu_zone=Zone.US_CENTRAL1_C.value,
       time_out_in_min=60,
       global_batch_size=1024,
+      runtime_version=RuntimeVersion.V2_ALPHA_TPUV5.value,
   )
 
   tf_resnet_v3_8 = tf_config.get_tf_resnet_config(
@@ -48,6 +49,7 @@
       tpu_cores=8,
       tpu_zone=Zone.US_EAST1_D.value,
       time_out_in_min=60,
+      runtime_version=RuntimeVersion.V2_ALPHA_TPUV5.value,
   )
 
   tf_resnet_v4_8 = tf_config.get_tf_resnet_config(
@@ -55,6 +57,7 @@
       tpu_cores=8,
       tpu_zone=Zone.US_CENTRAL2_B.value,
       time_out_in_min=60,
+      runtime_version=RuntimeVersion.V2_ALPHA_TPUV5.value,
   )
 
   tf_resnet_v4_32 = tf_config.get_tf_resnet_config(

From b99cc999d171e3a2f5a991adaa0c33901a32a081 Mon Sep 17 00:00:00 2001
From: Ran Ran
Date: Tue, 26 Nov 2024 11:05:01 -0800
Subject: [PATCH 16/26] Add ruamel dependency (#490)

---
 deployment/modules/composer_env/main.tf | 1 +
 1 file changed, 1 insertion(+)

diff --git a/deployment/modules/composer_env/main.tf b/deployment/modules/composer_env/main.tf
index f9718748..0288eca4 100644
--- a/deployment/modules/composer_env/main.tf
+++ b/deployment/modules/composer_env/main.tf
@@ -20,6 +20,7 @@ resource "google_composer_environment" "example_environment" {
         google-cloud-tpu = ">=1.16.0"
         jsonlines        = ""
         ray              = "[default]"
+        "ruamel.yaml"    = ""
 
         # These packages are already in the default composer environment.
# See https://cloud.google.com/composer/docs/concepts/versioning/composer-versions # google-cloud-bigquery = "" From cfbb0acc0cd959be1a7834bc8ba217c6e33f57cf Mon Sep 17 00:00:00 2001 From: Richard Liu <39319471+richardsliu@users.noreply.github.com> Date: Tue, 26 Nov 2024 16:10:43 -0800 Subject: [PATCH 17/26] Fix one test for nightly (#491) Updating TF runtime environment for v4-32 in nightly test --- dags/solutions_team/solutionsteam_tf_nightly_supported.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dags/solutions_team/solutionsteam_tf_nightly_supported.py b/dags/solutions_team/solutionsteam_tf_nightly_supported.py index d79bfdf1..fb67088a 100644 --- a/dags/solutions_team/solutionsteam_tf_nightly_supported.py +++ b/dags/solutions_team/solutionsteam_tf_nightly_supported.py @@ -66,7 +66,7 @@ tpu_zone=Zone.US_CENTRAL2_B.value, time_out_in_min=60, is_pod=True, - runtime_version=RuntimeVersion.TPU_VM_TF_NIGHTLY_POD.value, + runtime_version=RuntimeVersion.V2_ALPHA_TPUV5.value, ) tf_resnet_v5e_4 = tf_config.get_tf_resnet_config( From e27c58a587270e49a5b07357582669e471c8532e Mon Sep 17 00:00:00 2001 From: Gunjan Jalori <39437795+gunjanj007@users.noreply.github.com> Date: Wed, 27 Nov 2024 10:50:11 -0800 Subject: [PATCH 18/26] Add GoB repo cloning mechanism (#484) * add GoB cloning commands * reformat * format again * format again * format again * address comments * address comments * address comments --- .../aotc_reproducibility.py | 43 +++++++++++++++++-- dags/map_reproducibility/nemo_gpt3.py | 29 ++++++------- 2 files changed, 52 insertions(+), 20 deletions(-) diff --git a/dags/map_reproducibility/aotc_reproducibility.py b/dags/map_reproducibility/aotc_reproducibility.py index eff156ab..64478503 100644 --- a/dags/map_reproducibility/aotc_reproducibility.py +++ b/dags/map_reproducibility/aotc_reproducibility.py @@ -24,12 +24,11 @@ def set_variables_cmds(): "export CLUSTER_REGION=australia-southeast1", "NOW=$(date +%s)", "export BUCKET_NAME=regression-testing-xlml", - "export JOB_NAME=gpt3-xlml-$NOW-175b-nemo", ) return set_variables -def set_project_commands(): +def configure_project_and_cluster(): set_project_command = ( "gcloud config set project $PROJECT", "sudo chown -R airflow:airflow /home/airflow/composer_kube_config", @@ -39,6 +38,28 @@ def set_project_commands(): return set_project_command +# This is required to get auth to access +# internal GoB repo +def git_cookie_authdaemon(): + auth_cmds = ( + "git clone https://gerrit.googlesource.com/gcompute-tools", + "echo 'trying to run git-cookie-authdaemon'", + "./gcompute-tools/git-cookie-authdaemon", + ) + return auth_cmds + + +def clone_gob(): + gob_clone_cmds = ( + "echo 'trying to clone GoB repo from outside'", + "git clone https://ai-hypercomputer-benchmarks.googlesource.com/" + "reproducible-benchmark-recipes", + "cd reproducible-benchmark-recipes/projects", + "cd gpu-recipes", + ) + return gob_clone_cmds + + def install_helm_cmds(): install_helm_cmd = ( "curl -fsSL -o get_helm.sh " @@ -57,11 +78,27 @@ def namespace_cmds(): namespace = ( "kubectl config view | grep namespace", "kubectl config set-context --current --namespace=default", - "kubectl config set-context heml --namespace=default", + "kubectl config set-context helm --namespace=default", ) return namespace +def helm_install_cmds(): + helm_cmds = ( + " helm install -f values.yaml " + "--namespace default " + "--set namespace=default" + " --set-file nemo_config" + "=$CONFIG_FILE" + " --set workload.image" + "=us-central1-docker.pkg.dev/" + 
"supercomputer-testing/gunjanjalori/nemo_test/nemo_workload:24.07" + " --set workload.gcsBucketForDataCataPath=$BUCKET_NAME" + " $JOB_NAME $REPO_ROOT/src/helm-charts/nemo-training", + ) + return helm_cmds + + def wait_for_jobs_cmds(): wait_for_job = ( "echo 'will wait for job to start running'", diff --git a/dags/map_reproducibility/nemo_gpt3.py b/dags/map_reproducibility/nemo_gpt3.py index ff03fdaf..74d3cc0f 100644 --- a/dags/map_reproducibility/nemo_gpt3.py +++ b/dags/map_reproducibility/nemo_gpt3.py @@ -21,12 +21,15 @@ from dags import composer_env from dags.map_reproducibility.aotc_reproducibility import get_metrics_cmds from dags.map_reproducibility.aotc_reproducibility import set_variables_cmds -from dags.map_reproducibility.aotc_reproducibility import set_project_commands +from dags.map_reproducibility.aotc_reproducibility import configure_project_and_cluster from dags.map_reproducibility.aotc_reproducibility import install_helm_cmds from dags.map_reproducibility.aotc_reproducibility import namespace_cmds from dags.map_reproducibility.aotc_reproducibility import wait_for_jobs_cmds from dags.map_reproducibility.aotc_reproducibility import copy_bucket_cmds from dags.map_reproducibility.aotc_reproducibility import cleanup_cmds +from dags.map_reproducibility.aotc_reproducibility import git_cookie_authdaemon +from dags.map_reproducibility.aotc_reproducibility import clone_gob +from dags.map_reproducibility.aotc_reproducibility import helm_install_cmds # Run once a day at 2 pm UTC (6 am PST) SCHEDULED_TIME = "0 14 * * *" if composer_env.is_prod_env() else None @@ -35,27 +38,16 @@ @task def run_aotc_workload(): gpu_recipe_cmd = ( - "git clone https://github.com/ai-hypercomputer/gpu-recipes.git", - "cd gpu-recipes", - "export REPO_ROOT=`git rev-parse --show-toplevel`", + "export REPO_ROOT=`pwd`", "export RECIPE_ROOT=" "$REPO_ROOT/training/a3mega/gpt3-175b/nemo-pretraining-gke", "cd $RECIPE_ROOT", ) - helm_cmds = ( + workload_cmds = ( "CONFIG_FILE=$REPO_ROOT/src/frameworks" "/nemo-configs/gpt3-175b-256gpus-fp8.yaml", - " helm install -f values.yaml " - "--namespace default " - "--set namespace=default" - " --set-file nemo_config" - "=$CONFIG_FILE" - " --set workload.image" - "=us-central1-docker.pkg.dev/" - "supercomputer-testing/gunjanjalori/nemo_test/nemo_workload:24.07" - " --set workload.gcsBucketForDataCataPath=$BUCKET_NAME" - " $JOB_NAME $REPO_ROOT/src/helm-charts/nemo-training", + "export JOB_NAME=gpt3-xlml-$NOW-175b-nemo", ) hook = SubprocessHook() @@ -65,11 +57,14 @@ def run_aotc_workload(): "-c", ";".join( set_variables_cmds() - + set_project_commands() + + configure_project_and_cluster() + + git_cookie_authdaemon() + + clone_gob() + gpu_recipe_cmd + install_helm_cmds() + namespace_cmds() - + helm_cmds + + workload_cmds + + helm_install_cmds() + wait_for_jobs_cmds() + copy_bucket_cmds() + get_metrics_cmds() From 0ccefe31d9538361aad4556bdfc9cbb7861d0f50 Mon Sep 17 00:00:00 2001 From: Richard Liu <39319471+richardsliu@users.noreply.github.com> Date: Wed, 27 Nov 2024 14:30:32 -0800 Subject: [PATCH 19/26] fix benchmark tests (#492) --- .../configs/vllm/vllm_benchmark_config.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/dags/solutions_team/configs/vllm/vllm_benchmark_config.py b/dags/solutions_team/configs/vllm/vllm_benchmark_config.py index 5cb27ba1..b822e1b3 100644 --- a/dags/solutions_team/configs/vllm/vllm_benchmark_config.py +++ b/dags/solutions_team/configs/vllm/vllm_benchmark_config.py @@ -45,6 +45,7 @@ def get_vllm_gpu_setup_cmds(): # Download 
dataset "wget --no-verbose https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json", # Download benchmark + "pip install --upgrade google-cloud-storage", "rm -rf ai-on-gke && git clone https://github.com/GoogleCloudPlatform/ai-on-gke", ) return setup_cmds @@ -65,21 +66,13 @@ def get_vllm_tpu_setup_cmds(): "cd vllm", # From https://docs.vllm.ai/en/latest/getting_started/tpu-installation.html "pip uninstall torch torch-xla -y", - 'export DATE="20240828"', - 'export TORCH_VERSION="2.5.0"', - "pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-${TORCH_VERSION}.dev${DATE}-cp310-cp310-linux_x86_64.whl", - "pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-${TORCH_VERSION}.dev${DATE}-cp310-cp310-linux_x86_64.whl", - # Install JAX and Pallas. - "pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html", - "pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html", - # Install other build dependencies. - "pip install setuptools-scm>=8", "pip install -r requirements-tpu.txt", # Build vLLM 'VLLM_TARGET_DEVICE="tpu" python setup.py develop', # Download dataset "cd .. && wget --no-verbose https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json", # Download benchmark + "pip install --upgrade google-cloud-storage", "rm -rf ai-on-gke && git clone https://github.com/GoogleCloudPlatform/ai-on-gke", ) From aea739eb7db9455d84d81caac7117eb0e2dab764 Mon Sep 17 00:00:00 2001 From: Ran Ran Date: Wed, 27 Nov 2024 17:00:54 -0800 Subject: [PATCH 20/26] Update MoE test config (#493) --- dags/sparsity_diffusion_devx/maxtext_moe_tpu_e2e.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dags/sparsity_diffusion_devx/maxtext_moe_tpu_e2e.py b/dags/sparsity_diffusion_devx/maxtext_moe_tpu_e2e.py index 3430f2a0..0c408fa7 100644 --- a/dags/sparsity_diffusion_devx/maxtext_moe_tpu_e2e.py +++ b/dags/sparsity_diffusion_devx/maxtext_moe_tpu_e2e.py @@ -57,7 +57,7 @@ { "script_name": "tpu/mixtral/8x7b/2_test_mixtral", "cluster": XpkClusters.TPU_V4_128_CLUSTER, - "time_out_in_min": 60, + "time_out_in_min": 90, }, ], "mixtral-8x22b": [ @@ -68,7 +68,7 @@ }, { "script_name": "tpu/mixtral/8x22b/2_test_mixtral", - "cluster": XpkClusters.TPU_V5E_256_CLUSTER, + "cluster": XpkClusters.TPU_V4_128_CLUSTER, "time_out_in_min": 60, }, ], From d9a509cb43a753e518aa118a89aa41266b2f0bb4 Mon Sep 17 00:00:00 2001 From: Orti Bazar Date: Mon, 2 Dec 2024 11:16:00 -0800 Subject: [PATCH 21/26] Create mlcompass_maxtext_gke dag (#485) * Create mlcompass_maxtext_gke dag * Add dag description --- .github/requirements.txt | 1 + dags/mlcompass/maxtext_gke.py | 127 ++++++++++++++++++++++++++++++++++ xlml/apis/task.py | 4 +- 3 files changed, 131 insertions(+), 1 deletion(-) create mode 100644 dags/mlcompass/maxtext_gke.py diff --git a/.github/requirements.txt b/.github/requirements.txt index 5cc2d0ca..ebf875bf 100644 --- a/.github/requirements.txt +++ b/.github/requirements.txt @@ -9,3 +9,4 @@ jsonlines tensorflow-cpu kubernetes pyarrow +apache-airflow-providers-google diff --git a/dags/mlcompass/maxtext_gke.py b/dags/mlcompass/maxtext_gke.py new file mode 100644 index 00000000..660da3a7 --- /dev/null +++ b/dags/mlcompass/maxtext_gke.py @@ -0,0 +1,127 @@ +# Copyright 2024 Google LLC 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This Airflow DAG runs a maxtext machine learning benchmark on a GKE cluster + +Usage: +gcloud composer environments run ml-automation-solutions \ + --project=cloud-ml-auto-solutions \ + --location=us-central1 dags trigger \ + -- \ + mlcompass_maxtext_gke \ + --conf={\\\"uuid\\\":\\\"abc\\\"} 70 +""" + +import datetime +from airflow import models +from airflow.decorators import task +from airflow.providers.google.cloud.hooks.gcs import GCSHook +from xlml.apis.xpk_cluster_config import XpkClusterConfig +from dags import test_owner +from dags.vm_resource import Project, XpkClusters +from xlml.apis import gcp_config, metric_config, task as xlml_task, test_config +import json + + +def get_config_gke( + docker_image: str, + model_name: str, + base_output_directory: str, + task_owner: str = test_owner.ORTI_B, + cluster: XpkClusterConfig = XpkClusters.TPU_V4_8_MAXTEXT_CLUSTER, + time_out_in_min: int = 60, + num_slices: int = 1, + dataset_name: metric_config.DatasetOption = metric_config.DatasetOption.XLML_DATASET, + dataset_project: str = Project.CLOUD_ML_AUTO_SOLUTIONS.value, + composer_project: str = Project.CLOUD_ML_AUTO_SOLUTIONS.value, +) -> xlml_task.XpkTask: + job_gcp_config = gcp_config.GCPConfig( + project_name=cluster.project, + zone=cluster.zone, + dataset_name=dataset_name, + dataset_project=dataset_project, + composer_project=composer_project, + ) + job_test_config = test_config.TpuGkeTest( + test_config.Tpu( + version=cluster.device_version, + cores=cluster.core_count, + ), + test_name="maxtext", + run_model_cmds=[ + f"source benchmark_run.sh;run {model_name} {base_output_directory}", + ], + set_up_cmds=None, + timeout=datetime.timedelta(minutes=time_out_in_min), + task_owner=task_owner, + num_slices=num_slices, + cluster_name=cluster.name, + docker_image=docker_image, + ) + return xlml_task.XpkTask( + task_test_config=job_test_config, + task_gcp_config=job_gcp_config, + ) + + +with models.DAG( + dag_id="mlcompass_maxtext_gke", + schedule=None, + tags=["mlcompass", "maxtext"], + start_date=datetime.datetime(2024, 9, 1), + catchup=False, + params={ + "uuid": "", + }, + default_args={ + "retries": 0, + }, +) as dag: + + @task.python + def load_xlml_state(params: dict = None): + dag.log.info(params) + uuid = params["uuid"] + if not uuid: + raise RuntimeError("uuid is not set") + gcs_hook = GCSHook() + file_content = gcs_hook.download( + "mlcompass-jax-artifacts", f"xlml/{uuid}/xlml_state.json" + ) + return json.loads(file_content) + + @task.python + def get_docker_image_path(state: dict) -> str: + return state["docker_image_path"] + + @task.python + def get_model_name(state: dict) -> str: + return state["model_name"] + + @task.python + def get_base_output_directory(state: dict) -> str: + bucket = state["workdir_bucket"] + path = state["workdir_path"] + return f"gs://{bucket}/{path}" + + xlml_state = load_xlml_state() + docker_image_path = get_docker_image_path(xlml_state) + model_name_arg = get_model_name(xlml_state) + 
base_output_directory_arg = get_base_output_directory(xlml_state) + + default_benchmark = get_config_gke( + docker_image=docker_image_path, + model_name=model_name_arg, + base_output_directory=base_output_directory_arg, + ).run(skip_post_process=True) diff --git a/xlml/apis/task.py b/xlml/apis/task.py index 69806466..117bc3c8 100644 --- a/xlml/apis/task.py +++ b/xlml/apis/task.py @@ -171,6 +171,7 @@ def run( gcs_location: Optional[airflow.XComArg] = None, use_vertex_tensorboard: bool = False, use_pathways: bool = False, + skip_post_process: bool = False, ) -> DAGNode: """Run a test job within a docker image. @@ -187,7 +188,8 @@ def run( run_model, gcs_path = self.run_model( gcs_location, use_vertex_tensorboard, use_pathways ) - run_model >> self.post_process(gcs_path) + if not skip_post_process: + run_model >> self.post_process(gcs_path) return group From 9ccaedbcd7e9bf1f03a7ecea468ee8a191c5823e Mon Sep 17 00:00:00 2001 From: wenxindongwork <161090399+wenxindongwork@users.noreply.github.com> Date: Wed, 4 Dec 2024 14:48:28 -0800 Subject: [PATCH 22/26] Remove TF SE tests (#496) * Use smaller embedding size as 32 leads to segmentation fault * Remove TF SE Nightly tests --- .../solutionsteam_tf_se_nightly_supported.py | 158 ------------------ 1 file changed, 158 deletions(-) delete mode 100644 dags/solutions_team/solutionsteam_tf_se_nightly_supported.py diff --git a/dags/solutions_team/solutionsteam_tf_se_nightly_supported.py b/dags/solutions_team/solutionsteam_tf_se_nightly_supported.py deleted file mode 100644 index ecb51554..00000000 --- a/dags/solutions_team/solutionsteam_tf_se_nightly_supported.py +++ /dev/null @@ -1,158 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""A DAG to run all supported ML models with the nightly TensorFlow version.""" - -import datetime -from airflow import models -from dags import composer_env -from dags.vm_resource import TpuVersion, Project, Zone, RuntimeVersion, V5_NETWORKS, V5E_SUBNETWORKS, V5P_SUBNETWORKS -from dags.solutions_team.configs.tensorflow import solutionsteam_tf_nightly_supported_config as tf_config -from dags.solutions_team.configs.tensorflow import common - - -# Run once a day at 4 pm UTC (8 am PST) -SCHEDULED_TIME = "0 16 * * *" if composer_env.is_prod_env() else None - -with models.DAG( - dag_id="tf_se_nightly_supported", - schedule=SCHEDULED_TIME, - tags=["solutions_team", "tf", "se", "nightly", "supported", "xlml"], - start_date=datetime.datetime(2024, 1, 4), - catchup=False, -) as dag: - # ResNet - tf_resnet_v2_8 = tf_config.get_tf_resnet_config( - tpu_version=TpuVersion.V2, - tpu_cores=8, - tpu_zone=Zone.US_CENTRAL1_C.value, - time_out_in_min=60, - global_batch_size=1024, - is_pjrt=False, - runtime_version=RuntimeVersion.TPU_VM_TF_NIGHTLY.value, - ) - - tf_resnet_v2_32 = tf_config.get_tf_resnet_config( - tpu_version=TpuVersion.V2, - tpu_cores=32, - tpu_zone=Zone.US_CENTRAL1_A.value, - time_out_in_min=60, - global_batch_size=1024, - is_pod=True, - is_pjrt=False, - runtime_version=RuntimeVersion.TPU_VM_TF_NIGHTLY_POD.value, - ) - - tf_resnet_v3_8 = tf_config.get_tf_resnet_config( - tpu_version=TpuVersion.V3, - tpu_cores=8, - tpu_zone=Zone.US_EAST1_D.value, - time_out_in_min=60, - is_pjrt=False, - runtime_version=RuntimeVersion.TPU_VM_TF_NIGHTLY.value, - ) - - tf_resnet_v3_32 = tf_config.get_tf_resnet_config( - tpu_version=TpuVersion.V3, - tpu_cores=32, - tpu_zone=Zone.US_EAST1_D.value, - time_out_in_min=60, - is_pod=True, - is_pjrt=False, - runtime_version=RuntimeVersion.TPU_VM_TF_NIGHTLY_POD.value, - ) - - tf_resnet_v4_8 = tf_config.get_tf_resnet_config( - tpu_version=TpuVersion.V4, - tpu_cores=8, - tpu_zone=Zone.US_CENTRAL2_B.value, - time_out_in_min=60, - is_pjrt=False, - runtime_version=RuntimeVersion.TPU_VM_TF_NIGHTLY.value, - ) - tf_resnet_v4_32 = tf_config.get_tf_resnet_config( - tpu_version=TpuVersion.V4, - tpu_cores=32, - tpu_zone=Zone.US_CENTRAL2_B.value, - time_out_in_min=60, - is_pod=True, - is_pjrt=False, - runtime_version=RuntimeVersion.TPU_VM_TF_NIGHTLY_POD.value, - ) - - # DLRM - embedding_dim = 16 - tf_dlrm_v2_8 = tf_config.get_tf_dlrm_config( - tpu_version=TpuVersion.V2, - tpu_cores=8, - tpu_zone=Zone.US_CENTRAL1_C.value, - time_out_in_min=120, - bottom_mlp=[512, 256, embedding_dim], - embedding_dim=embedding_dim, - train_steps=10000, - extraFlags="--mode=train", - is_pjrt=False, - runtime_version=RuntimeVersion.TPU_VM_TF_NIGHTLY.value, - ) - - embedding_dim = 64 - tf_dlrm_v2_32 = tf_config.get_tf_dlrm_config( - tpu_version=TpuVersion.V2, - tpu_cores=32, - tpu_zone=Zone.US_CENTRAL1_A.value, - time_out_in_min=120, - bottom_mlp=[512, 256, embedding_dim], - embedding_dim=embedding_dim, - train_steps=256054, - extraFlags="--mode=train_and_eval", - is_pod=True, - is_pjrt=False, - runtime_version=RuntimeVersion.TPU_VM_TF_NIGHTLY_POD.value, - ) - - embedding_dim = 64 - tf_dlrm_v4_8 = tf_config.get_tf_dlrm_config( - tpu_version=TpuVersion.V4, - tpu_cores=8, - tpu_zone=Zone.US_CENTRAL2_B.value, - time_out_in_min=120, - bottom_mlp=[512, 256, embedding_dim], - embedding_dim=embedding_dim, - train_steps=10000, - extraFlags="--mode=train", - is_pjrt=False, - runtime_version=RuntimeVersion.TPU_VM_TF_NIGHTLY.value, - ) - - embedding_dim = 128 - tf_dlrm_v4_32 = tf_config.get_tf_dlrm_config( - 
tpu_version=TpuVersion.V4, - tpu_cores=32, - tpu_zone=Zone.US_CENTRAL2_B.value, - time_out_in_min=120, - bottom_mlp=[512, 256, embedding_dim], - embedding_dim=embedding_dim, - train_steps=128000, - extraFlags="--mode=train_and_eval", - is_pod=True, - is_pjrt=False, - runtime_version=RuntimeVersion.TPU_VM_TF_NIGHTLY_POD.value, - ) - - # Test dependencies - tf_resnet_v2_8 >> tf_resnet_v2_32 - tf_resnet_v3_8 >> tf_resnet_v3_32 - tf_resnet_v4_8 >> tf_resnet_v4_32 - tf_dlrm_v2_8 >> tf_dlrm_v2_32 - tf_dlrm_v4_8 >> tf_dlrm_v4_32 From c05df73ef85c493ba18e482be12c3497019e23c6 Mon Sep 17 00:00:00 2001 From: wenxindongwork <161090399+wenxindongwork@users.noreply.github.com> Date: Wed, 4 Dec 2024 15:00:25 -0800 Subject: [PATCH 23/26] Use smaller batch size for v5p test (#497) --- .../tensorflow/solutionsteam_tf_nightly_supported_config.py | 5 +++-- dags/solutions_team/solutionsteam_tf_nightly_supported.py | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/dags/solutions_team/configs/tensorflow/solutionsteam_tf_nightly_supported_config.py b/dags/solutions_team/configs/tensorflow/solutionsteam_tf_nightly_supported_config.py index 681c7711..3a4d78de 100644 --- a/dags/solutions_team/configs/tensorflow/solutionsteam_tf_nightly_supported_config.py +++ b/dags/solutions_team/configs/tensorflow/solutionsteam_tf_nightly_supported_config.py @@ -200,6 +200,7 @@ def get_tf_dlrm_config( criteo_dir: str = gcs_bucket.CRITEO_DIR, network: str = "default", subnetwork: str = "default", + global_batch_size=16384, ): job_gcp_config = gcp_config.GCPConfig( project_name=project_name, @@ -233,11 +234,11 @@ def get_tf_dlrm_config( "use_tf_record_reader": "true", "train_data": { "input_path": "gs://zyc_dlrm/dataset/tb_tf_record_train_val/train/day_*/*", - "global_batch_size": 16384, + "global_batch_size": global_batch_size, }, "validation_data": { "input_path": "gs://zyc_dlrm/dataset/tb_tf_record_train_val/eval/day_*/*", - "global_batch_size": 16384, + "global_batch_size": global_batch_size, }, "model": { "interaction": "multi_layer_dcn", diff --git a/dags/solutions_team/solutionsteam_tf_nightly_supported.py b/dags/solutions_team/solutionsteam_tf_nightly_supported.py index fb67088a..fb0aa3ad 100644 --- a/dags/solutions_team/solutionsteam_tf_nightly_supported.py +++ b/dags/solutions_team/solutionsteam_tf_nightly_supported.py @@ -131,6 +131,7 @@ network=V5_NETWORKS, subnetwork=V5P_SUBNETWORKS, runtime_version=RuntimeVersion.V2_ALPHA_TPUV5.value, + global_batch_size=8196, ) embedding_dim = 128 From 15da729ec6b520fb457fb9f79f374c26c13191eb Mon Sep 17 00:00:00 2001 From: wenxindongwork <161090399+wenxindongwork@users.noreply.github.com> Date: Thu, 5 Dec 2024 10:11:11 -0800 Subject: [PATCH 24/26] Update solutionsteam_tf_nightly_supported.py (#498) --- dags/solutions_team/solutionsteam_tf_nightly_supported.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dags/solutions_team/solutionsteam_tf_nightly_supported.py b/dags/solutions_team/solutionsteam_tf_nightly_supported.py index fb0aa3ad..5e8c72df 100644 --- a/dags/solutions_team/solutionsteam_tf_nightly_supported.py +++ b/dags/solutions_team/solutionsteam_tf_nightly_supported.py @@ -131,7 +131,7 @@ network=V5_NETWORKS, subnetwork=V5P_SUBNETWORKS, runtime_version=RuntimeVersion.V2_ALPHA_TPUV5.value, - global_batch_size=8196, + global_batch_size=8192, ) embedding_dim = 128 From 5843847a5b333d9f6e7f29d6364cfdbcb28d6b26 Mon Sep 17 00:00:00 2001 From: Yijia Date: Thu, 5 Dec 2024 11:24:11 -0800 Subject: [PATCH 25/26] Add More Configs to Automation (#495) 
* wrap up * update v41 models * more configs to auto --- .../configs/trt_llm_mlperf_v41_config.py | 28 +- .../inference/trt_llm_mlperf_v41_inference.py | 306 +++++++++++++++++- 2 files changed, 311 insertions(+), 23 deletions(-) diff --git a/dags/inference/configs/trt_llm_mlperf_v41_config.py b/dags/inference/configs/trt_llm_mlperf_v41_config.py index 01e2712d..00295581 100644 --- a/dags/inference/configs/trt_llm_mlperf_v41_config.py +++ b/dags/inference/configs/trt_llm_mlperf_v41_config.py @@ -36,7 +36,7 @@ def get_trt_llm_mlperf_gpu_config( project: Project, network: str, subnetwork: str, - general_configs: Dict = {}, + benchmark_configs: Dict = {}, model_parameters: Dict = {}, parameter_positions: Dict = {}, binary_search_steps: int = 1, @@ -59,10 +59,11 @@ def get_trt_llm_mlperf_gpu_config( "sudo chmod a+w /scratch", "cd /scratch", # Prepare data - f"gsutil -m cp -n -r gs://yijiaj/mlperf/v41/Google_GPU .", - f"gsutil -m cp -n -r {general_configs['models']} .", - f"gsutil -m cp -n -r {general_configs['preprocessed_data']} .", - f"gsutil -m cp -n -r {general_configs['docker_config']} .", + "gsutil -m cp -n -r gs://yijiaj/mlperf/v41/Google_GPU .", + "gsutil -m cp -n -r gs://tohaowu/mlpinf-v40/mlperf_inf_dlrmv2 .", + f"gsutil -m cp -n -r {benchmark_configs['models']} .", + f"gsutil -m cp -n -r {benchmark_configs['preprocessed_data']} .", + f"gsutil -m cp -n -r {benchmark_configs['docker_config']} .", "curl -sSL https://get.docker.com/ | sh", "sudo mkdir -p /home/cloud-ml-auto-solutions/.docker", "sudo touch ~/.docker/config.json", @@ -78,7 +79,7 @@ def get_trt_llm_mlperf_gpu_config( # Build and launch a docker container "PARTNER_DROP=1 make prebuild DOCKER_DETACH=1", "make docker_add_user", - f"make launch_docker DOCKER_NAME={docker_container_name} DOCKER_ARGS='-d'", + f"make launch_docker DOCKER_NAME={docker_container_name} DOCKER_ARGS='-v /scratch/mlperf_inf_dlrmv2:/home/mlperf_inf_dlrmv2 -d'", ) jsonl_output_path = "metric_report.jsonl" @@ -108,17 +109,18 @@ def get_trt_llm_mlperf_gpu_config( make_jsonl_converter_cmd = f'echo "{py_script}" > jsonl_converter.py' model_parameters_sweep_cmds = [] - for model_name in general_configs["model_name"].split(","): + for model_name in benchmark_configs["model_name"].split(","): + scenario = ",".join(model_parameters[model_name]) if accelerator_type == GpuVersion.L4: model_parameters_sweep_cmds.append( - f'CUDA_VISIBLE_DEVICES=0 make generate_engines RUN_ARGS=\'--benchmarks={model_name} --scenarios={general_configs["scenario"]}\'' + f"CUDA_VISIBLE_DEVICES=0 make generate_engines RUN_ARGS='--benchmarks={model_name} --scenarios={scenario}'" ) else: model_parameters_sweep_cmds.append( - f'make generate_engines RUN_ARGS=\'--benchmarks={model_name} --scenarios={general_configs["scenario"]}\'' + f"make generate_engines RUN_ARGS='--benchmarks={model_name} --scenarios={scenario}'" ) - for model_name in general_configs["model_name"].split(","): + for model_name in benchmark_configs["model_name"].split(","): for scenario in model_parameters[model_name]: for parameter in model_parameters[model_name][scenario]: steps = 2 ** (binary_search_steps - 1) + 1 @@ -153,6 +155,8 @@ def get_trt_llm_mlperf_gpu_config( docker_cmds = [ "make link_dirs", "make build BUILD_TRTLLM=1", + "pip install huggingface_hub==0.24.7", + "lscpu", ] if accelerator_type == GpuVersion.L4: docker_cmds.append( @@ -180,7 +184,9 @@ def get_trt_llm_mlperf_gpu_config( runtime_version=RUNTIME_IMAGE, network=network, subnetwork=subnetwork, - attach_local_ssd=True, + attach_local_ssd=True + if 
accelerator_type != GpuVersion.H100 + else False, disk_size_gb=1000, ), test_name=test_name, diff --git a/dags/inference/trt_llm_mlperf_v41_inference.py b/dags/inference/trt_llm_mlperf_v41_inference.py index 1cfe4498..5546324d 100644 --- a/dags/inference/trt_llm_mlperf_v41_inference.py +++ b/dags/inference/trt_llm_mlperf_v41_inference.py @@ -17,11 +17,11 @@ import datetime from airflow import models from dags import composer_env -from dags.vm_resource import A100_INFERENCE_SUBNETWORKS, GpuVersion, Zone, ImageFamily, ImageProject, MachineVersion, Project, INFERENCE_NETWORKS, L4_INFERENCE_SUBNETWORKS +from dags.vm_resource import A100_INFERENCE_SUBNETWORKS, H100_INFERENCE_SUBNETWORKS, GpuVersion, Zone, ImageFamily, ImageProject, MachineVersion, Project, INFERENCE_NETWORKS, L4_INFERENCE_SUBNETWORKS from dags.inference.configs import trt_llm_mlperf_v41_config -# Run once a day at 4 am UTC (8 pm PST) -SCHEDULED_TIME = "0 4 * * *" if composer_env.is_prod_env() else None +# Run once a day at 1 pm UTC (5 am PST) +SCHEDULED_TIME = "1 3 * * *" if composer_env.is_prod_env() else None with models.DAG( @@ -41,10 +41,8 @@ config_ver = "default,high_accuracy" test_mode = "PerformanceOnly" - scenario = "Offline,Server" g2_configs = { - "model_name": "bert", - "scenario": scenario, + "model_name": "bert,3d-unet", "config_ver": config_ver, "test_mode": test_mode, "docker_config": "gs://yijiaj/mlperf/config.json", @@ -60,6 +58,43 @@ "server_target_qps": (900, 1200), }, }, + "3d-unet": { + "Offline": { + "offline_expected_qps": (1.3, 2.6), + }, + }, + "dlrm-v2": { + "Offline": { + "offline_expected_qps": (3400, 3500), + }, + "Server": { + "server_target_qps": (3300, 3500), + }, + }, + "gptj": { + "Offline": { + "offline_expected_qps": (1.3, 1.6), + }, + "Server": { + "server_target_qps": (0.88, 1), + }, + }, + "resnet50": { + "Offline": { + "offline_expected_qps": (13000, 15000), + }, + "Server": { + "server_target_qps": (11532.8125, 11600), + }, + }, + "retinanet": { + "Offline": { + "offline_expected_qps": (220, 230), + }, + "Server": { + "server_target_qps": (200, 220), + }, + }, } g2_parameter_position = { "bert": { @@ -70,10 +105,46 @@ "server_target_qps": 278, }, }, + "3d-unet": { + "Offline": { + "offline_expected_qps": 55, + }, + }, + "dlrm-v2": { + "Offline": { + "offline_expected_qps": 233, + }, + "Server": { + "server_target_qps": 176, + }, + }, + "gptj": { + "Offline": { + "offline_expected_qps": 191, + }, + "Server": { + "server_target_qps": 158, + }, + }, + "resnet50": { + "Offline": { + "offline_expected_qps": 48, + }, + "Server": { + "server_target_qps": 52, + }, + }, + "retinanet": { + "Offline": { + "offline_expected_qps": 51, + }, + "Server": { + "server_target_qps": 57, + }, + }, } a2_configs = { - "model_name": "bert", - "scenario": scenario, + "model_name": "bert,3d-unet", "config_ver": config_ver, "test_mode": test_mode, "docker_config": "gs://yijiaj/mlperf/config.json", @@ -89,6 +160,27 @@ "server_target_qps": (25400, 25600), }, }, + "3d-unet": { + "Offline": { + "offline_expected_qps": (30, 40), + }, + }, + "resnet50": { + "Offline": { + "offline_expected_qps": (340000, 360000), + }, + "Server": { + "server_target_qps": (290000, 299000), + }, + }, + "retinanet": { + "Offline": { + "offline_expected_qps": (5840, 5980), + }, + "Server": { + "server_target_qps": (5600, 5800), + }, + }, } a2_parameter_position = { "bert": { @@ -99,6 +191,177 @@ "server_target_qps": 560, }, }, + "3d-unet": { + "Offline": { + "offline_expected_qps": 623, + }, + }, + "resnet50": { + "Offline": { + 
"offline_expected_qps": 456, + }, + "Server": { + "server_target_qps": 396, + }, + }, + "retinanet": { + "Offline": { + "offline_expected_qps": 269, + }, + "Server": { + "server_target_qps": 244, + }, + }, + } + a3_configs = { + "model_name": "resnet50,retinanet,stable-diffusion-xl,llama2-70b,mixtral-8x7b", + "config_ver": config_ver, + "test_mode": test_mode, + "docker_config": "gs://yijiaj/mlperf/config.json", + "models": "gs://yijiaj/mlperf/a3/models", + "preprocessed_data": "gs://yijiaj/mlperf/a3/preprocessed_data", + } + a3_model_parameters = { + "bert": { + "Offline": { + "offline_expected_qps": (75200, 76000), + }, + "Server": { + "server_target_qps": (56000, 60000), + }, + }, + "3d-unet": { + "Offline": { + "offline_expected_qps": (54.4, 64), + }, + }, + "dlrm-v2": { + "Offline": { + "offline_expected_qps": (616000, 620000), + }, + "Server": { + "server_target_qps": (458203.125, 510000), + }, + }, + "gptj": { + "Offline": { + "offline_expected_qps": (288, 300), + }, + "Server": { + "server_target_qps": (279.36, 285), + }, + }, + "resnet50": { + "Offline": { + "offline_expected_qps": (720000, 740000), + }, + "Server": { + "server_target_qps": (584000, 586000), + }, + }, + "retinanet": { + "Offline": { + "offline_expected_qps": (13600, 14000), + }, + "Server": { + "server_target_qps": (12880, 13000), + }, + }, + "stable-diffusion-xl": { + "Offline": { + "offline_expected_qps": (16, 18), + }, + "Server": { + "server_target_qps": (16.3, 18), + }, + }, + "llama2-70b": { + "Offline": { + "offline_expected_qps": (80, 86), + }, + "Server": { + "server_target_qps": (75, 80), + }, + }, + "mixtral-8x7b": { + "Offline": { + "offline_expected_qps": (368, 386), + }, + "Server": { + "server_target_qps": (345, 360), + }, + }, + } + a3_parameter_position = { + "bert": { + "Offline": { + "offline_expected_qps": 196, + }, + "Server": { + "server_target_qps": 238, + }, + }, + "3d-unet": { + "Offline": { + "offline_expected_qps": 160, + }, + }, + "dlrm-v2": { + "Offline": { + "offline_expected_qps": 65, + }, + "Server": { + "server_target_qps": 65, + }, + }, + "gptj": { + "Offline": { + "offline_expected_qps": 48, + }, + "Server": { + "server_target_qps": 91, + }, + }, + "resnet50": { + "Offline": { + "offline_expected_qps": 84, + }, + "Server": { + "server_target_qps": 132, + }, + }, + "retinanet": { + "Offline": { + "offline_expected_qps": 139, + }, + "Server": { + "server_target_qps": 127, + }, + }, + "stable-diffusion-xl": { + "Offline": { + "offline_expected_qps": 55, + }, + "Server": { + "server_target_qps": 59, + }, + }, + "llama2-70b": { + "Offline": { + "offline_expected_qps": 75, + }, + "Server": { + "server_target_qps": 74, + }, + }, + "mixtral-8x7b": { + "Offline": { + "offline_expected_qps": 74, + }, + "Server": { + "server_target_qps": 64, + }, + }, } # Running on A100 GPU @@ -108,13 +371,13 @@ image_family=ImageFamily.COMMON_CU121_DEBIAN_11, accelerator_type=GpuVersion.A100_80G, count=8, - gpu_zone=Zone.US_CENTRAL1_C, + gpu_zone=Zone.US_CENTRAL1_A, time_out_in_min=1600, test_name=f"{test_name_prefix}-nightly-test-a100-8", project=Project.CLOUD_TPU_INFERENCE_TEST, network=INFERENCE_NETWORKS, subnetwork=A100_INFERENCE_SUBNETWORKS, - general_configs=a2_configs, + benchmark_configs=a2_configs, model_parameters=a2_model_parameters, parameter_positions=a2_parameter_position, binary_search_steps=2, @@ -127,14 +390,33 @@ image_family=ImageFamily.COMMON_CU121_DEBIAN_11, accelerator_type=GpuVersion.L4, count=8, - gpu_zone=Zone.US_CENTRAL1_A, + gpu_zone=Zone.US_CENTRAL1_C, time_out_in_min=1600, 
test_name=f"{test_name_prefix}-nightly-test-l4-1", project=Project.CLOUD_TPU_INFERENCE_TEST, network=INFERENCE_NETWORKS, subnetwork=L4_INFERENCE_SUBNETWORKS, - general_configs=g2_configs, + benchmark_configs=g2_configs, model_parameters=g2_model_parameters, parameter_positions=g2_parameter_position, binary_search_steps=2, ).run() + + # Running on H100 GPU + trt_llm_mlperf_v41_config.get_trt_llm_mlperf_gpu_config( + machine_type=MachineVersion.A3_HIGHGPU_8G, + image_project=ImageProject.DEEP_LEARNING_PLATFORM_RELEASE, + image_family=ImageFamily.COMMON_CU121_DEBIAN_11, + accelerator_type=GpuVersion.H100, + count=8, + gpu_zone=Zone.US_CENTRAL1_A, + time_out_in_min=1600, + test_name=f"{test_name_prefix}-nightly-test-h100-8", + project=Project.CLOUD_TPU_INFERENCE_TEST, + network=INFERENCE_NETWORKS, + subnetwork=H100_INFERENCE_SUBNETWORKS, + benchmark_configs=a3_configs, + model_parameters=a3_model_parameters, + parameter_positions=a3_parameter_position, + binary_search_steps=2, + ).run() From 4afadb458f48a2e31bdedaa3f283c92b5dd340ef Mon Sep 17 00:00:00 2001 From: Yifei Teng Date: Thu, 5 Dec 2024 15:08:38 -0800 Subject: [PATCH 26/26] Update the tpu dependency installation command (#499) Because libtpu is switching Python package registries, we need to include both registries during the transition. --- dags/legacy_test/tests/pytorch/nightly/common.libsonnet | 3 ++- dags/pytorch_xla/configs/pytorchxla_torchbench_config.py | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/dags/legacy_test/tests/pytorch/nightly/common.libsonnet b/dags/legacy_test/tests/pytorch/nightly/common.libsonnet index 9a5f4c21..8bbe599b 100644 --- a/dags/legacy_test/tests/pytorch/nightly/common.libsonnet +++ b/dags/legacy_test/tests/pytorch/nightly/common.libsonnet @@ -105,7 +105,8 @@ local volumes = import 'templates/volumes.libsonnet'; pip3 install --user --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cpu pip install --user \ 'torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev-cp310-cp310-linux_x86_64.whl' \ - -f https://storage.googleapis.com/libtpu-releases/index.html + -f https://storage.googleapis.com/libtpu-releases/index.html \ + -f https://storage.googleapis.com/libtpu-wheels/index.html pip3 install pillow git clone --depth=1 https://github.com/pytorch/pytorch.git cd pytorch diff --git a/dags/pytorch_xla/configs/pytorchxla_torchbench_config.py b/dags/pytorch_xla/configs/pytorchxla_torchbench_config.py index bef03912..d490456a 100644 --- a/dags/pytorch_xla/configs/pytorchxla_torchbench_config.py +++ b/dags/pytorch_xla/configs/pytorchxla_torchbench_config.py @@ -168,7 +168,7 @@ def model_install_cmds(output_file=None) -> str: # "pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html", "pip3 uninstall -y libtpu-nightly jax jaxlib", "cd ~/xla/experimental/torch_xla2/", - "pip3 install --user -e .[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html", + "pip3 install --user -e .[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html -f https://storage.googleapis.com/libtpu-wheels/index.html", ) if use_xla2 else () @@ -185,7 +185,7 @@ def model_install_cmds(output_file=None) -> str: f"pip3 install --user --pre {version_mapping.TORCH.value} {version_mapping.TORCHVISION.value} {version_mapping.TORCHAUDIO.value} --index-url {version_mapping.TORCH_INDEX_CPU_URL.value}" ), ( - 
f"pip3 install --user 'torch_xla[tpu] @{version_mapping.TORCH_XLA_TPU_WHEEL.value}' -f https://storage.googleapis.com/libtpu-releases/index.html" + f"pip3 install --user 'torch_xla[tpu] @{version_mapping.TORCH_XLA_TPU_WHEEL.value}' -f https://storage.googleapis.com/libtpu-releases/index.html -f https://storage.googleapis.com/libtpu-wheels/index.html" ), "pip3 install --user psutil", "cd; git clone https://github.com/pytorch/benchmark.git", @@ -326,7 +326,7 @@ def get_nvidia_driver_install_cmd(driver_version: str) -> str: # TODO(piz): torch_xla2 only support nightly test at this time. "pip3 uninstall -y libtpu-nightly jax jaxlib", # in case libtpu is installed from torch_xla "cd /tmp/xla/experimental/torch_xla2/", - "pip3 install --user -e .[cuda] -f https://storage.googleapis.com/libtpu-releases/index.html", + "pip3 install --user -e .[cuda] -f https://storage.googleapis.com/libtpu-releases/index.html -f https://storage.googleapis.com/libtpu-wheels/index.html", ) if use_xla2 else ()
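A pattern that recurs across these patches (the task-array refactor in patch 08 and the dependency-direction fix in patch 09) is sequential chaining: each test task is appended to a list and chained behind the previous one with Airflow's >> operator. The sketch below is illustrative rather than code from the series; EmptyOperator stands in for the real TPU test tasks, and the dag_id is invented.

import datetime

from airflow import models
from airflow.operators.empty import EmptyOperator

with models.DAG(
    dag_id="chaining_sketch",  # illustrative name only, not from the repo
    schedule=None,
    start_date=datetime.datetime(2024, 11, 1),
    catchup=False,
) as dag:
  task_arr = []
  for mode in ("stable", "nightly"):
    for num_slices in (1, 2):
      test = EmptyOperator(task_id=f"test-{mode}-{num_slices}x")
      # previous >> test makes test wait for previous; writing
      # previous << test reverses the dependency, which is the bug
      # patch 09 fixes by flipping << to >>.
      if task_arr:
        task_arr[-1] >> test
      task_arr.append(test)

One subtlety the sketch smooths over: a guard written as "if len(task_arr) > 1:" only starts chaining once two tasks are already in the list, so the first two appended tasks run unordered; the plain truthiness check above chains from the first pair onward.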
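Patch 25 pairs each swept benchmark parameter with a (low, high) tuple and passes binary_search_steps=2 to the config builder, which derives "steps = 2 ** (binary_search_steps - 1) + 1" runs per parameter. As a rough sketch of how such bounds can expand into concrete values, the helper below produces an evenly spaced grid. The function and its name are hypothetical, not part of the repository; the real DAG rewrites config files with sed at each step and may choose midpoints adaptively instead.

def candidate_qps(low: float, high: float, binary_search_steps: int) -> list[float]:
  """Hypothetical helper: evenly spaced QPS candidates between swept bounds."""
  # Same step count as the expression in trt_llm_mlperf_v41_config.py.
  steps = 2 ** (binary_search_steps - 1) + 1
  return [low + (high - low) * i / (steps - 1) for i in range(steps)]

For example, with binary_search_steps=2 as both .run() calls pass, the bert Server bounds (900, 1200) from g2_model_parameters expand to three candidates: candidate_qps(900, 1200, 2) returns [900.0, 1050.0, 1200.0].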