diff --git a/dags/multipod/maxtext_trillium_configs_perf.py b/dags/multipod/maxtext_trillium_configs_perf.py index affb1ba4..e1181453 100644 --- a/dags/multipod/maxtext_trillium_configs_perf.py +++ b/dags/multipod/maxtext_trillium_configs_perf.py @@ -27,10 +27,14 @@ # Run once a day at 4 am UTC (8 pm PST / 9 pm PDT) SCHEDULED_TIME = "0 4 * * *" if composer_env.is_prod_env() else None -MODEL_CONFIGS = ["gpt3_175b", "llama2_7b_4096", "mixtral_8x7b"] +#MODEL_CONFIGS = ["gpt3_175b", "llama2_7b_4096", "mixtral_8x7b"] +MODEL_CONFIGS = ["llama2_7b_4096"] +# DOCKER_IMAGES = [ +# (SetupMode.STABLE, DockerImage.MAXTEXT_TPU_JAX_STABLE_STACK), +# (SetupMode.NIGHTLY, DockerImage.MAXTEXT_TPU_JAX_NIGHTLY), +# ] DOCKER_IMAGES = [ (SetupMode.STABLE, DockerImage.MAXTEXT_TPU_JAX_STABLE_STACK), - (SetupMode.NIGHTLY, DockerImage.MAXTEXT_TPU_JAX_NIGHTLY), ] QUANTIZATION_SWEEP = {"M_QUANTIZATION": ["", "int8"]} BASE_OUTPUT_DIRECTORY = "gs://runner-maxtext-logs" diff --git a/dags/multipod/mxla_maxtext_nightly.py b/dags/multipod/mxla_maxtext_nightly.py deleted file mode 100644 index 4018ceb3..00000000 --- a/dags/multipod/mxla_maxtext_nightly.py +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""A DAG to run MaxText tests with nightly version.""" - -import datetime -from airflow import models -from dags import composer_env -from dags.common.vm_resource import TpuVersion, Zone, Project, V5_NETWORKS, V5P_SUBNETWORKS, RuntimeVersion -from dags.multipod.configs import maxtext_gce_config -from dags.multipod.configs.common import SetupMode, Platform - -# Run once a day at 9 am UTC (1 am PST) -SCHEDULED_TIME = "0 9 * * *" if composer_env.is_prod_env() else None - - -with models.DAG( - dag_id="mxla_maxtext_nightly", - schedule=SCHEDULED_TIME, - tags=["multipod_team", "maxtext", "nightly", "mlscale_onduty"], - start_date=datetime.datetime(2024, 1, 10), - catchup=False, -) as dag: - default_test_name = "mxla-maxtext-nightly" - test_mode = SetupMode.NIGHTLY - - # v4 Maxtext test - maxtext_nightly_1slice_v4_8 = maxtext_gce_config.get_maxtext_nightly_config( - tpu_version=TpuVersion.V4, - tpu_cores=8, - tpu_zone=Zone.US_CENTRAL2_B.value, - time_out_in_min=60, - is_tpu_reserved=False, - test_name=default_test_name, - test_mode=test_mode, - ) - - maxtext_nightly_2slice_v4_8 = maxtext_gce_config.get_maxtext_nightly_config( - tpu_version=TpuVersion.V4, - tpu_cores=8, - tpu_zone=Zone.US_CENTRAL2_B.value, - time_out_in_min=60, - is_tpu_reserved=False, - num_slices=2, - test_name=default_test_name, - test_mode=test_mode, - ) - - maxtext_nightly_4slice_v4_8 = maxtext_gce_config.get_maxtext_nightly_config( - tpu_version=TpuVersion.V4, - tpu_cores=8, - tpu_zone=Zone.US_CENTRAL2_B.value, - time_out_in_min=60, - is_tpu_reserved=False, - num_slices=4, - test_name=default_test_name, - test_mode=test_mode, - ) - - maxtext_nightly_8slice_v4_8 = maxtext_gce_config.get_maxtext_nightly_config( - tpu_version=TpuVersion.V4, - tpu_cores=8, - tpu_zone=Zone.US_CENTRAL2_B.value, - time_out_in_min=60, - is_tpu_reserved=False, - num_slices=8, - test_name=default_test_name, - test_mode=test_mode, - ) - - # v5p Maxtext test - v5p_project_name = Project.TPU_PROD_ENV_AUTOMATED.value - v5p_network = V5_NETWORKS - v5p_subnetwork = V5P_SUBNETWORKS - v5p_runtime_version = RuntimeVersion.V2_ALPHA_TPUV5.value - - maxtext_nightly_1slice_v5p_8 = maxtext_gce_config.get_maxtext_nightly_config( - tpu_version=TpuVersion.V5P, - tpu_cores=8, - tpu_zone=Zone.US_EAST5_A.value, - runtime_version=v5p_runtime_version, - project_name=v5p_project_name, - time_out_in_min=60, - is_tpu_reserved=True, - test_name=default_test_name, - test_mode=test_mode, - network=v5p_network, - subnetwork=v5p_subnetwork, - ) - - maxtext_nightly_2slice_v5p_8 = maxtext_gce_config.get_maxtext_nightly_config( - tpu_version=TpuVersion.V5P, - tpu_cores=8, - num_slices=2, - tpu_zone=Zone.US_EAST5_A.value, - runtime_version=v5p_runtime_version, - project_name=v5p_project_name, - time_out_in_min=60, - is_tpu_reserved=True, - test_name=default_test_name, - test_mode=test_mode, - network=v5p_network, - subnetwork=v5p_subnetwork, - ) - - maxtext_nightly_4slice_v5p_8 = maxtext_gce_config.get_maxtext_nightly_config( - tpu_version=TpuVersion.V5P, - tpu_cores=8, - num_slices=4, - tpu_zone=Zone.US_EAST5_A.value, - runtime_version=v5p_runtime_version, - project_name=v5p_project_name, - time_out_in_min=60, - is_tpu_reserved=True, - test_name=default_test_name, - test_mode=test_mode, - network=v5p_network, - subnetwork=v5p_subnetwork, - ) - - maxtext_nightly_8slice_v5p_8 = maxtext_gce_config.get_maxtext_nightly_config( - tpu_version=TpuVersion.V5P, - tpu_cores=8, - num_slices=8, - tpu_zone=Zone.US_EAST5_A.value, - runtime_version=v5p_runtime_version, - project_name=v5p_project_name, - time_out_in_min=60, - is_tpu_reserved=True, - test_name=default_test_name, - test_mode=test_mode, - network=v5p_network, - subnetwork=v5p_subnetwork, - ) diff --git a/dags/multipod/mxla_maxtext_nightly_gke.py b/dags/multipod/mxla_maxtext_nightly_gke.py index 760a25d2..6e4ede92 100644 --- a/dags/multipod/mxla_maxtext_nightly_gke.py +++ b/dags/multipod/mxla_maxtext_nightly_gke.py @@ -24,8 +24,7 @@ from dags.multipod.configs import gke_config # Run once a day at 9 am UTC (1 am PST) -# Pause test on GKE -SCHEDULED_TIME = None +SCHEDULED_TIME = "0 9 * * *" if composer_env.is_prod_env() else None with models.DAG( dag_id="mxla_maxtext_nightly_gke",