From 155c05a40b81c71dd7e42bf2caeeff9e0bea29f9 Mon Sep 17 00:00:00 2001 From: gobbleturk Date: Fri, 10 Jan 2025 00:03:26 +0000 Subject: [PATCH 1/3] Only run gke mxla --- .../multipod/maxtext_trillium_configs_perf.py | 8 +- dags/multipod/mxla_maxtext_nightly.py | 145 ------------------ dags/multipod/mxla_maxtext_nightly_gke.py | 3 +- 3 files changed, 7 insertions(+), 149 deletions(-) delete mode 100644 dags/multipod/mxla_maxtext_nightly.py diff --git a/dags/multipod/maxtext_trillium_configs_perf.py b/dags/multipod/maxtext_trillium_configs_perf.py index affb1ba48..e11814530 100644 --- a/dags/multipod/maxtext_trillium_configs_perf.py +++ b/dags/multipod/maxtext_trillium_configs_perf.py @@ -27,10 +27,14 @@ # Run once a day at 4 am UTC (8 pm PST / 9 pm PDT) SCHEDULED_TIME = "0 4 * * *" if composer_env.is_prod_env() else None -MODEL_CONFIGS = ["gpt3_175b", "llama2_7b_4096", "mixtral_8x7b"] +#MODEL_CONFIGS = ["gpt3_175b", "llama2_7b_4096", "mixtral_8x7b"] +MODEL_CONFIGS = ["llama2_7b_4096"] +# DOCKER_IMAGES = [ +# (SetupMode.STABLE, DockerImage.MAXTEXT_TPU_JAX_STABLE_STACK), +# (SetupMode.NIGHTLY, DockerImage.MAXTEXT_TPU_JAX_NIGHTLY), +# ] DOCKER_IMAGES = [ (SetupMode.STABLE, DockerImage.MAXTEXT_TPU_JAX_STABLE_STACK), - (SetupMode.NIGHTLY, DockerImage.MAXTEXT_TPU_JAX_NIGHTLY), ] QUANTIZATION_SWEEP = {"M_QUANTIZATION": ["", "int8"]} BASE_OUTPUT_DIRECTORY = "gs://runner-maxtext-logs" diff --git a/dags/multipod/mxla_maxtext_nightly.py b/dags/multipod/mxla_maxtext_nightly.py deleted file mode 100644 index 4018ceb3a..000000000 --- a/dags/multipod/mxla_maxtext_nightly.py +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""A DAG to run MaxText tests with nightly version.""" - -import datetime -from airflow import models -from dags import composer_env -from dags.common.vm_resource import TpuVersion, Zone, Project, V5_NETWORKS, V5P_SUBNETWORKS, RuntimeVersion -from dags.multipod.configs import maxtext_gce_config -from dags.multipod.configs.common import SetupMode, Platform - -# Run once a day at 9 am UTC (1 am PST) -SCHEDULED_TIME = "0 9 * * *" if composer_env.is_prod_env() else None - - -with models.DAG( - dag_id="mxla_maxtext_nightly", - schedule=SCHEDULED_TIME, - tags=["multipod_team", "maxtext", "nightly", "mlscale_onduty"], - start_date=datetime.datetime(2024, 1, 10), - catchup=False, -) as dag: - default_test_name = "mxla-maxtext-nightly" - test_mode = SetupMode.NIGHTLY - - # v4 Maxtext test - maxtext_nightly_1slice_v4_8 = maxtext_gce_config.get_maxtext_nightly_config( - tpu_version=TpuVersion.V4, - tpu_cores=8, - tpu_zone=Zone.US_CENTRAL2_B.value, - time_out_in_min=60, - is_tpu_reserved=False, - test_name=default_test_name, - test_mode=test_mode, - ) - - maxtext_nightly_2slice_v4_8 = maxtext_gce_config.get_maxtext_nightly_config( - tpu_version=TpuVersion.V4, - tpu_cores=8, - tpu_zone=Zone.US_CENTRAL2_B.value, - time_out_in_min=60, - is_tpu_reserved=False, - num_slices=2, - test_name=default_test_name, - test_mode=test_mode, - ) - - maxtext_nightly_4slice_v4_8 = maxtext_gce_config.get_maxtext_nightly_config( - tpu_version=TpuVersion.V4, - tpu_cores=8, - tpu_zone=Zone.US_CENTRAL2_B.value, - time_out_in_min=60, - is_tpu_reserved=False, - num_slices=4, - test_name=default_test_name, - test_mode=test_mode, - ) - - maxtext_nightly_8slice_v4_8 = maxtext_gce_config.get_maxtext_nightly_config( - tpu_version=TpuVersion.V4, - tpu_cores=8, - tpu_zone=Zone.US_CENTRAL2_B.value, - time_out_in_min=60, - is_tpu_reserved=False, - num_slices=8, - test_name=default_test_name, - test_mode=test_mode, - ) - - # v5p Maxtext test - v5p_project_name = Project.TPU_PROD_ENV_AUTOMATED.value - v5p_network = V5_NETWORKS - v5p_subnetwork = V5P_SUBNETWORKS - v5p_runtime_version = RuntimeVersion.V2_ALPHA_TPUV5.value - - maxtext_nightly_1slice_v5p_8 = maxtext_gce_config.get_maxtext_nightly_config( - tpu_version=TpuVersion.V5P, - tpu_cores=8, - tpu_zone=Zone.US_EAST5_A.value, - runtime_version=v5p_runtime_version, - project_name=v5p_project_name, - time_out_in_min=60, - is_tpu_reserved=True, - test_name=default_test_name, - test_mode=test_mode, - network=v5p_network, - subnetwork=v5p_subnetwork, - ) - - maxtext_nightly_2slice_v5p_8 = maxtext_gce_config.get_maxtext_nightly_config( - tpu_version=TpuVersion.V5P, - tpu_cores=8, - num_slices=2, - tpu_zone=Zone.US_EAST5_A.value, - runtime_version=v5p_runtime_version, - project_name=v5p_project_name, - time_out_in_min=60, - is_tpu_reserved=True, - test_name=default_test_name, - test_mode=test_mode, - network=v5p_network, - subnetwork=v5p_subnetwork, - ) - - maxtext_nightly_4slice_v5p_8 = maxtext_gce_config.get_maxtext_nightly_config( - tpu_version=TpuVersion.V5P, - tpu_cores=8, - num_slices=4, - tpu_zone=Zone.US_EAST5_A.value, - runtime_version=v5p_runtime_version, - project_name=v5p_project_name, - time_out_in_min=60, - is_tpu_reserved=True, - test_name=default_test_name, - test_mode=test_mode, - network=v5p_network, - subnetwork=v5p_subnetwork, - ) - - maxtext_nightly_8slice_v5p_8 = maxtext_gce_config.get_maxtext_nightly_config( - tpu_version=TpuVersion.V5P, - tpu_cores=8, - num_slices=8, - tpu_zone=Zone.US_EAST5_A.value, - runtime_version=v5p_runtime_version, - project_name=v5p_project_name, - time_out_in_min=60, - is_tpu_reserved=True, - test_name=default_test_name, - test_mode=test_mode, - network=v5p_network, - subnetwork=v5p_subnetwork, - ) diff --git a/dags/multipod/mxla_maxtext_nightly_gke.py b/dags/multipod/mxla_maxtext_nightly_gke.py index 760a25d24..6e4ede926 100644 --- a/dags/multipod/mxla_maxtext_nightly_gke.py +++ b/dags/multipod/mxla_maxtext_nightly_gke.py @@ -24,8 +24,7 @@ from dags.multipod.configs import gke_config # Run once a day at 9 am UTC (1 am PST) -# Pause test on GKE -SCHEDULED_TIME = None +SCHEDULED_TIME = "0 9 * * *" if composer_env.is_prod_env() else None with models.DAG( dag_id="mxla_maxtext_nightly_gke", From c22982adeb76a765095105bdf409292b35799ee7 Mon Sep 17 00:00:00 2001 From: gobbleturk Date: Fri, 10 Jan 2025 00:36:17 +0000 Subject: [PATCH 2/3] Only run gke mxla --- dags/multipod/maxtext_trillium_configs_perf.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/dags/multipod/maxtext_trillium_configs_perf.py b/dags/multipod/maxtext_trillium_configs_perf.py index e11814530..4004eb541 100644 --- a/dags/multipod/maxtext_trillium_configs_perf.py +++ b/dags/multipod/maxtext_trillium_configs_perf.py @@ -27,14 +27,10 @@ # Run once a day at 4 am UTC (8 pm PST / 9 pm PDT) SCHEDULED_TIME = "0 4 * * *" if composer_env.is_prod_env() else None -#MODEL_CONFIGS = ["gpt3_175b", "llama2_7b_4096", "mixtral_8x7b"] -MODEL_CONFIGS = ["llama2_7b_4096"] -# DOCKER_IMAGES = [ -# (SetupMode.STABLE, DockerImage.MAXTEXT_TPU_JAX_STABLE_STACK), -# (SetupMode.NIGHTLY, DockerImage.MAXTEXT_TPU_JAX_NIGHTLY), -# ] +MODEL_CONFIGS = ["gpt3_175b", "llama2_7b_4096", "mixtral_8x7b"] DOCKER_IMAGES = [ - (SetupMode.STABLE, DockerImage.MAXTEXT_TPU_JAX_STABLE_STACK), + (SetupMode.STABLE, DockerImage.MAXTEXT_TPU_JAX_STABLE_STACK), + (SetupMode.NIGHTLY, DockerImage.MAXTEXT_TPU_JAX_NIGHTLY), ] QUANTIZATION_SWEEP = {"M_QUANTIZATION": ["", "int8"]} BASE_OUTPUT_DIRECTORY = "gs://runner-maxtext-logs" From 44488cd43cfb0931aa3bb1498691827e3e9887e1 Mon Sep 17 00:00:00 2001 From: gobbleturk Date: Fri, 10 Jan 2025 00:36:35 +0000 Subject: [PATCH 3/3] Only run gke mxla --- dags/multipod/maxtext_trillium_configs_perf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dags/multipod/maxtext_trillium_configs_perf.py b/dags/multipod/maxtext_trillium_configs_perf.py index 4004eb541..affb1ba48 100644 --- a/dags/multipod/maxtext_trillium_configs_perf.py +++ b/dags/multipod/maxtext_trillium_configs_perf.py @@ -29,8 +29,8 @@ SCHEDULED_TIME = "0 4 * * *" if composer_env.is_prod_env() else None MODEL_CONFIGS = ["gpt3_175b", "llama2_7b_4096", "mixtral_8x7b"] DOCKER_IMAGES = [ - (SetupMode.STABLE, DockerImage.MAXTEXT_TPU_JAX_STABLE_STACK), - (SetupMode.NIGHTLY, DockerImage.MAXTEXT_TPU_JAX_NIGHTLY), + (SetupMode.STABLE, DockerImage.MAXTEXT_TPU_JAX_STABLE_STACK), + (SetupMode.NIGHTLY, DockerImage.MAXTEXT_TPU_JAX_NIGHTLY), ] QUANTIZATION_SWEEP = {"M_QUANTIZATION": ["", "int8"]} BASE_OUTPUT_DIRECTORY = "gs://runner-maxtext-logs"