From 023c5b002fb99636d7a5e88914a9bc64b0909744 Mon Sep 17 00:00:00 2001 From: Param Bole Date: Wed, 13 Nov 2024 10:56:30 -0800 Subject: [PATCH 01/26] Adding single and multi-node AxLearn A3Plus Tests (#471) * Adding AxLearn A3Plus Tests --- dags/imagegen_devx/project_bite_gpu_e2e.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dags/imagegen_devx/project_bite_gpu_e2e.py b/dags/imagegen_devx/project_bite_gpu_e2e.py index 29b69df1..7ce82778 100644 --- a/dags/imagegen_devx/project_bite_gpu_e2e.py +++ b/dags/imagegen_devx/project_bite_gpu_e2e.py @@ -38,6 +38,7 @@ axlearn_test_configs = { # accelerator: list of slices to test "a3": [1], + "a3plus": [1, 2], } for accelerator, slices in axlearn_test_configs.items(): From fcbe0dd633bc5c0c915afdb8cc1f74632364094a Mon Sep 17 00:00:00 2001 From: Ran Ran Date: Wed, 13 Nov 2024 14:54:16 -0800 Subject: [PATCH 02/26] Add new owner to the repo (#472) --- .github/CODEOWNERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index c2f3d1bf..97dae494 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,5 +1,5 @@ # Default owners for everything in the repo, unless a later match takes precedence. -* @RissyRan @allenwang28 +* @mbzomowski @RissyRan @allenwang28 dags/solutions_team/configs/tensorflow @chandrasekhard2 @ZhaoyueCheng dags/solutions_team/solutionsteam_tf* @chandrasekhard2 @ZhaoyueCheng From 4820439287683d708028ad6229a1b51a7e4c030f Mon Sep 17 00:00:00 2001 From: Ran Ran Date: Wed, 13 Nov 2024 15:41:29 -0800 Subject: [PATCH 03/26] Rename team name folder (#473) --- .github/CODEOWNERS | 6 +++--- dags/quarantined_tests.py | 10 +++++----- .../configs/__init__.py | 0 .../configs/common.py | 0 .../configs/gke_config.py | 0 .../configs/project_bite_config.py | 4 ++-- .../jax_stable_stack_gpu_e2e.py | 9 +++++++-- .../jax_stable_stack_tpu_e2e.py | 6 ++++-- .../maxdiffusion_e2e.py | 4 ++-- .../project_bite_gpu_e2e.py | 10 ++++++++-- .../project_bite_tpu_e2e.py | 4 ++-- dags/test_owner.py | 5 ++--- 12 files changed, 35 insertions(+), 23 deletions(-) rename dags/{imagegen_devx => sparsity_diffusion_devx}/configs/__init__.py (100%) rename dags/{imagegen_devx => sparsity_diffusion_devx}/configs/common.py (100%) rename dags/{imagegen_devx => sparsity_diffusion_devx}/configs/gke_config.py (100%) rename dags/{imagegen_devx => sparsity_diffusion_devx}/configs/project_bite_config.py (95%) rename dags/{imagegen_devx => sparsity_diffusion_devx}/jax_stable_stack_gpu_e2e.py (93%) rename dags/{imagegen_devx => sparsity_diffusion_devx}/jax_stable_stack_tpu_e2e.py (97%) rename dags/{imagegen_devx => sparsity_diffusion_devx}/maxdiffusion_e2e.py (96%) rename dags/{imagegen_devx => sparsity_diffusion_devx}/project_bite_gpu_e2e.py (91%) rename dags/{imagegen_devx => sparsity_diffusion_devx}/project_bite_tpu_e2e.py (92%) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 97dae494..fd1f8567 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -11,8 +11,8 @@ dags/multipod @jonb377 @tonyjohnchen @raymondzouu @gobbleturk @shralex @RissyRan dags/mlcompass @ortibazar @sganeshb @brajiang @wlzhg -dags/imagegen_devx @RissyRan @parambole @jiangjy1982 @aireenmei @michelle-yooh -dags/imagegen_devx/project_bite* @RissyRan @parambole @jiangjy1982 @aireenmei @michelle-yooh @jiya-zhang -dags/imagegen_devx/configs/project_bite* @RissyRan @parambole @jiangjy1982 @aireenmei @michelle-yooh @jiya-zhang +dags/sparsity_diffusion_devx @RissyRan @parambole @jiangjy1982 @aireenmei @michelle-yooh 
+dags/sparsity_diffusion_devx/project_bite* @RissyRan @parambole @jiangjy1982 @aireenmei @michelle-yooh @jiya-zhang +dags/sparsity_diffusion_devx/configs/project_bite* @RissyRan @parambole @jiangjy1982 @aireenmei @michelle-yooh @jiya-zhang dags/inference @yeandy @vipannalla @morgandu @mailvijayasingh @sixiang-google @joezijunzhou @singh-mitali diff --git a/dags/quarantined_tests.py b/dags/quarantined_tests.py index 08854752..f98c06b9 100644 --- a/dags/quarantined_tests.py +++ b/dags/quarantined_tests.py @@ -193,21 +193,21 @@ class QuarantineTests: "chained_tests_llama2-70b_nightly": TestInfo(team.LLM_DEVX, "2024-11-12"), # DAG: jax_stable_stack_gpu_e2e "maxtext-stable-stack-train-c4-data-h100-80gb-8": TestInfo( - team.SPARCITY_DIFFUSION_DEVX, "2024-11-12" + team.SPARSITY_DIFFUSION_DEVX, "2024-11-12" ), "maxtext-stable-stack-train-c4-data-h100-mega-80gb-8": TestInfo( - team.SPARCITY_DIFFUSION_DEVX, "2024-11-12" + team.SPARSITY_DIFFUSION_DEVX, "2024-11-12" ), # DAG: jax_stable_tpu_stack_e2e "axlearn-jax-stable-stack-v4-16-1x-v4-16": TestInfo( - team.SPARCITY_DIFFUSION_DEVX, "2024-11-12" + team.SPARSITY_DIFFUSION_DEVX, "2024-11-12" ), "axlearn-jax-stable-stack-v4-16-2x-2xv4-16": TestInfo( - team.SPARCITY_DIFFUSION_DEVX, "2024-11-12" + team.SPARSITY_DIFFUSION_DEVX, "2024-11-12" ), # DAG: maxdiffusion_e2e "maxd-sdxl-nan-v6e-256-2x-2xv6e-256": TestInfo( - team.SPARCITY_DIFFUSION_DEVX, "2024-11-12" + team.SPARSITY_DIFFUSION_DEVX, "2024-11-12" ), # DAG: maxtext_configs_aot "maxtext-aot-v5e-stable-v4-8": TestInfo(team.PERFORMANCE, "2024-11-12"), diff --git a/dags/imagegen_devx/configs/__init__.py b/dags/sparsity_diffusion_devx/configs/__init__.py similarity index 100% rename from dags/imagegen_devx/configs/__init__.py rename to dags/sparsity_diffusion_devx/configs/__init__.py diff --git a/dags/imagegen_devx/configs/common.py b/dags/sparsity_diffusion_devx/configs/common.py similarity index 100% rename from dags/imagegen_devx/configs/common.py rename to dags/sparsity_diffusion_devx/configs/common.py diff --git a/dags/imagegen_devx/configs/gke_config.py b/dags/sparsity_diffusion_devx/configs/gke_config.py similarity index 100% rename from dags/imagegen_devx/configs/gke_config.py rename to dags/sparsity_diffusion_devx/configs/gke_config.py diff --git a/dags/imagegen_devx/configs/project_bite_config.py b/dags/sparsity_diffusion_devx/configs/project_bite_config.py similarity index 95% rename from dags/imagegen_devx/configs/project_bite_config.py rename to dags/sparsity_diffusion_devx/configs/project_bite_config.py index 58173947..14d1068d 100644 --- a/dags/imagegen_devx/configs/project_bite_config.py +++ b/dags/sparsity_diffusion_devx/configs/project_bite_config.py @@ -19,12 +19,12 @@ from typing import Tuple, Optional from xlml.apis import gcp_config, metric_config, task, test_config from dags import gcs_bucket, test_owner -from dags.imagegen_devx.configs import common +from dags.sparsity_diffusion_devx.configs import common from dags.vm_resource import TpuVersion, Project from airflow.models.taskmixin import DAGNode -GCS_SUBFOLDER_PREFIX = test_owner.Team.IMAGEGEN_DEVX.value +GCS_SUBFOLDER_PREFIX = test_owner.Team.SPARSITY_DIFFUSION_DEVX.value def set_up_axlearn(pinned_version) -> Tuple[str]: diff --git a/dags/imagegen_devx/jax_stable_stack_gpu_e2e.py b/dags/sparsity_diffusion_devx/jax_stable_stack_gpu_e2e.py similarity index 93% rename from dags/imagegen_devx/jax_stable_stack_gpu_e2e.py rename to dags/sparsity_diffusion_devx/jax_stable_stack_gpu_e2e.py index d06c1165..466bf3b6 100644 --- 
a/dags/imagegen_devx/jax_stable_stack_gpu_e2e.py +++ b/dags/sparsity_diffusion_devx/jax_stable_stack_gpu_e2e.py @@ -20,7 +20,7 @@ from dags import composer_env, test_owner, gcs_bucket from dags.vm_resource import Project, TpuVersion, CpuVersion, Zone, DockerImage, GpuVersion, XpkClusters from airflow.utils.task_group import TaskGroup -from dags.imagegen_devx.configs import gke_config as config +from dags.sparsity_diffusion_devx.configs import gke_config as config from xlml.utils import name_format from dags.multipod.configs.common import SetupMode @@ -31,7 +31,12 @@ with models.DAG( dag_id="jax_stable_stack_gpu_e2e", schedule=SCHEDULED_TIME, - tags=["multipod_team", "maxtext", "jax-stable-stack"], + tags=[ + "sparsity_diffusion_devx", + "multipod_team", + "maxtext", + "jax-stable-stack", + ], start_date=datetime.datetime(2024, 6, 7), catchup=False, ) as dag: diff --git a/dags/imagegen_devx/jax_stable_stack_tpu_e2e.py b/dags/sparsity_diffusion_devx/jax_stable_stack_tpu_e2e.py similarity index 97% rename from dags/imagegen_devx/jax_stable_stack_tpu_e2e.py rename to dags/sparsity_diffusion_devx/jax_stable_stack_tpu_e2e.py index f7c6d1b5..a0cbac1a 100644 --- a/dags/imagegen_devx/jax_stable_stack_tpu_e2e.py +++ b/dags/sparsity_diffusion_devx/jax_stable_stack_tpu_e2e.py @@ -20,7 +20,7 @@ from airflow.utils.task_group import TaskGroup from dags import composer_env, test_owner, gcs_bucket from dags.vm_resource import Project, TpuVersion, CpuVersion, Zone, DockerImage, GpuVersion, XpkClusters -from dags.imagegen_devx.configs import gke_config as config +from dags.sparsity_diffusion_devx.configs import gke_config as config from xlml.utils import name_format # Run once a day at 3 am UTC (7 pm PST) @@ -31,11 +31,13 @@ dag_id="jax_stable_stack_tpu_e2e", schedule=SCHEDULED_TIME, tags=[ + "sparsity_diffusion_devx", "multipod_team", "maxtext", "maxdiffusion", "axlearn", - "tpu" "jax-stable-stack", + "tpu", + "jax-stable-stack", ], start_date=datetime.datetime(2024, 6, 7), catchup=False, diff --git a/dags/imagegen_devx/maxdiffusion_e2e.py b/dags/sparsity_diffusion_devx/maxdiffusion_e2e.py similarity index 96% rename from dags/imagegen_devx/maxdiffusion_e2e.py rename to dags/sparsity_diffusion_devx/maxdiffusion_e2e.py index 980c43ce..3c9c7311 100644 --- a/dags/imagegen_devx/maxdiffusion_e2e.py +++ b/dags/sparsity_diffusion_devx/maxdiffusion_e2e.py @@ -20,7 +20,7 @@ from airflow.utils.task_group import TaskGroup from dags import composer_env, test_owner, gcs_bucket from dags.vm_resource import Project, TpuVersion, CpuVersion, Zone, DockerImage, GpuVersion, XpkClusters -from dags.imagegen_devx.configs import gke_config as config +from dags.sparsity_diffusion_devx.configs import gke_config as config from xlml.utils import name_format # Run once a day at 4 am UTC (8 pm PST) @@ -30,7 +30,7 @@ with models.DAG( dag_id="maxdiffusion_e2e", schedule=SCHEDULED_TIME, - tags=["multipod_team", "maxdiffusion"], + tags=["sparsity_diffusion_devx", "multipod_team", "maxdiffusion"], start_date=datetime.datetime(2024, 9, 12), catchup=False, ) as dag: diff --git a/dags/imagegen_devx/project_bite_gpu_e2e.py b/dags/sparsity_diffusion_devx/project_bite_gpu_e2e.py similarity index 91% rename from dags/imagegen_devx/project_bite_gpu_e2e.py rename to dags/sparsity_diffusion_devx/project_bite_gpu_e2e.py index 7ce82778..fdb95032 100644 --- a/dags/imagegen_devx/project_bite_gpu_e2e.py +++ b/dags/sparsity_diffusion_devx/project_bite_gpu_e2e.py @@ -19,7 +19,7 @@ from airflow import models from dags import composer_env, test_owner, 
gcs_bucket from dags.vm_resource import DockerImage, XpkClusters -from dags.imagegen_devx.configs import gke_config as config +from dags.sparsity_diffusion_devx.configs import gke_config as config from xlml.utils import name_format # Run once a day at 3 am UTC (7 pm PST) @@ -29,7 +29,13 @@ with models.DAG( dag_id="project_bite_gpu_e2e", schedule=SCHEDULED_TIME, - tags=["multipod_team", "gcp_gpu", "axlearn", "bite"], + tags=[ + "sparsity_diffusion_devx", + "multipod_team", + "gcp_gpu", + "axlearn", + "bite", + ], start_date=datetime.datetime(2024, 11, 12), catchup=False, ) as dag: diff --git a/dags/imagegen_devx/project_bite_tpu_e2e.py b/dags/sparsity_diffusion_devx/project_bite_tpu_e2e.py similarity index 92% rename from dags/imagegen_devx/project_bite_tpu_e2e.py rename to dags/sparsity_diffusion_devx/project_bite_tpu_e2e.py index 30bbcfa6..df7370b5 100644 --- a/dags/imagegen_devx/project_bite_tpu_e2e.py +++ b/dags/sparsity_diffusion_devx/project_bite_tpu_e2e.py @@ -18,7 +18,7 @@ from airflow import models from dags import composer_env from dags.vm_resource import TpuVersion, Zone, RuntimeVersion -from dags.imagegen_devx.configs import project_bite_config as config +from dags.sparsity_diffusion_devx.configs import project_bite_config as config # Run once a day at 6 pm UTC (11 am PST) @@ -28,7 +28,7 @@ with models.DAG( dag_id="project_bite_tpu_e2e", schedule=SCHEDULED_TIME, - tags=["imagegen_devx", "jax", "nightly", "bite", "multipod_team"], + tags=["sparsity_diffusion_devx", "jax", "nightly", "bite", "multipod_team"], start_date=datetime.datetime(2024, 4, 4), catchup=False, ) as dag: diff --git a/dags/test_owner.py b/dags/test_owner.py index f17b4348..fbe748b5 100644 --- a/dags/test_owner.py +++ b/dags/test_owner.py @@ -22,11 +22,10 @@ class Team(enum.Enum): PYTORCH_XLA = "pytorch_xla" MULTIPOD = "multipod" MLCOMPASS = "mlcompass" - IMAGEGEN_DEVX = "imagegen_devx" INFERENCE = "inference" FRAMEWORK = "framework3p" LLM_DEVX = "llm_devx" - SPARCITY_DIFFUSION_DEVX = "sparcity_diffusion_devx" + SPARSITY_DIFFUSION_DEVX = "sparsity_diffusion_devx" PERFORMANCE = "performance" PRODUCTIVITY = "productivity" @@ -54,7 +53,7 @@ class Team(enum.Enum): # MLCompass ORTI_B = "Orti B." -# ImageGen DevX +# Sparsity & Diffusion DevX RAN_R = "Ran R." PARAM_B = "Param B." From 67c27acfc644f03525eb59406b6f5e8210cd6fb0 Mon Sep 17 00:00:00 2001 From: shralex Date: Wed, 13 Nov 2024 16:10:27 -0800 Subject: [PATCH 04/26] Adding a quarantine folder for remaining multipod DAGs. 
(#474) --- .../multipod/maxtext_trillium_configs_perf.py | 12 +- dags/multipod/maxtext_v5e_configs_perf.py | 23 +- dags/quarantined_tests.py | 393 ++++++++++++++++++ xlml/apis/task.py | 10 + 4 files changed, 430 insertions(+), 8 deletions(-) diff --git a/dags/multipod/maxtext_trillium_configs_perf.py b/dags/multipod/maxtext_trillium_configs_perf.py index c6b45f7d..7fc8a107 100644 --- a/dags/multipod/maxtext_trillium_configs_perf.py +++ b/dags/multipod/maxtext_trillium_configs_perf.py @@ -17,6 +17,7 @@ """ import datetime from airflow import models +from airflow.utils.task_group import TaskGroup from dags import composer_env, test_owner from dags.vm_resource import TpuVersion, Zone, Project, XpkClusters, DockerImage from dags.multipod.configs import maxtext_sweep_gke_config @@ -40,6 +41,9 @@ start_date=datetime.datetime(2024, 2, 19), catchup=False, ) as dag: + quarantine_task_group = TaskGroup( + group_id="Quarantine", dag=dag, prefix_group_id=False + ) for mode, image in DOCKER_IMAGES: for model in MODEL_CONFIGS: base_run_model_cmds = [ @@ -63,9 +67,13 @@ ) chain_num = 4 - prev = maxtext_sweep_gke_test[0].run_with_run_name_generation() + prev = maxtext_sweep_gke_test[0].run_with_name_gen_and_quarantine( + quarantine_task_group + ) for i in range(1, len(maxtext_sweep_gke_test)): - curr = maxtext_sweep_gke_test[i].run_with_run_name_generation() + curr = maxtext_sweep_gke_test[i].run_with_name_gen_and_quarantine( + quarantine_task_group + ) if i % chain_num != 0: prev >> curr prev = curr diff --git a/dags/multipod/maxtext_v5e_configs_perf.py b/dags/multipod/maxtext_v5e_configs_perf.py index f367b29d..22eabe17 100644 --- a/dags/multipod/maxtext_v5e_configs_perf.py +++ b/dags/multipod/maxtext_v5e_configs_perf.py @@ -17,6 +17,7 @@ """ import datetime from airflow import models +from airflow.utils.task_group import TaskGroup from dags import composer_env, test_owner from dags.vm_resource import TpuVersion, Zone, Project, XpkClusters, DockerImage from dags.multipod.configs import maxtext_sweep_gke_config @@ -49,6 +50,9 @@ start_date=datetime.datetime(2024, 2, 19), catchup=False, ) as dag: + quarantine_task_group = TaskGroup( + group_id="Quarantine", dag=dag, prefix_group_id=False + ) for mode, image in DOCKER_IMAGES: for model in MODEL_CONFIGS: base_run_model_cmds = [ @@ -72,9 +76,13 @@ ) chain_num = 4 - prev = maxtext_sweep_gke_test[0].run_with_run_name_generation() + prev = maxtext_sweep_gke_test[0].run_with_name_gen_and_quarantine( + quarantine_task_group + ) for i in range(1, len(maxtext_sweep_gke_test)): - curr = maxtext_sweep_gke_test[i].run_with_run_name_generation() + curr = maxtext_sweep_gke_test[i].run_with_name_gen_and_quarantine( + quarantine_task_group + ) if i % chain_num != 0: prev >> curr prev = curr @@ -90,6 +98,9 @@ start_date=datetime.datetime(2024, 2, 19), catchup=False, ) as dag: + quarantine_task_group = TaskGroup( + group_id="Quarantine", dag=dag, prefix_group_id=False + ) for mode, image in DOCKER_IMAGES: for model in MODEL_CONFIGS: base_run_model_cmds = [ @@ -113,12 +124,12 @@ ) chain_num = 4 - prev = maxtext_sweep_gke_test[0].run_with_run_name_generation( - use_pathways=True + prev = maxtext_sweep_gke_test[0].run_with_name_gen_and_quarantine( + quarantine_task_group, use_pathways=True ) for i in range(1, len(maxtext_sweep_gke_test)): - curr = maxtext_sweep_gke_test[i].run_with_run_name_generation( - use_pathways=True + curr = maxtext_sweep_gke_test[i].run_with_name_gen_and_quarantine( + quarantine_task_group, use_pathways=True ) if i % chain_num != 0: prev >> curr diff 
--git a/dags/quarantined_tests.py b/dags/quarantined_tests.py index f98c06b9..fe581b68 100644 --- a/dags/quarantined_tests.py +++ b/dags/quarantined_tests.py @@ -287,6 +287,399 @@ class QuarantineTests: "mxla-maxtext-nightly-gke-8xv5p-8": TestInfo( team.PERFORMANCE, "2024-11-12" ), + # DAG: maxtext_trillium_configs_perf + "maxtext-llama2_70b_4096-stable-3-2xv6e-256": TestInfo( + team.PERFORMANCE, "2024-11-12" + ), + "maxtext-llama2_70b_4096-nightly-3-2xv6e-256": TestInfo( + team.PERFORMANCE, "2024-11-12" + ), + # DAG: maxtext_v5e_configs_perf + "maxtext-16b-stable-0-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-16b-stable-1-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-16b-stable-2-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-16b-stable-3-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-32b-stable-0-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-32b-stable-1-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-32b-stable-2-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-32b-stable-3-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-64b-stable-0-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-64b-stable-1-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-64b-stable-2-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-64b-stable-3-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-128b-stable-0-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-128b-stable-1-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-128b-stable-2-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-128b-stable-3-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-gpt3_175b-stable-0-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-gpt3_175b-stable-1-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-gpt3_175b-stable-2-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-gpt3_175b-stable-3-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_7b-stable-0-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_7b-stable-1-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_7b-stable-2-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_7b-stable-3-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_13b-stable-0-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_13b-stable-1-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_13b-stable-2-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_13b-stable-3-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_70b-stable-0-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_70b-stable-1-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_70b-stable-2-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_70b-stable-3-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-16b-nightly-0-v5litepod-256": 
TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-16b-nightly-1-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-16b-nightly-2-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-16b-nightly-3-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-32b-nightly-0-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-32b-nightly-1-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-32b-nightly-2-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-32b-nightly-3-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-64b-nightly-0-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-64b-nightly-1-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-64b-nightly-2-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-64b-nightly-3-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-128b-nightly-0-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-128b-nightly-1-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-128b-nightly-2-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-128b-nightly-3-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-gpt3_175b-nightly-0-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-gpt3_175b-nightly-1-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-gpt3_175b-nightly-2-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-gpt3_175b-nightly-3-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_7b-nightly-0-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_7b-nightly-1-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_7b-nightly-2-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_7b-nightly-3-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_13b-nightly-0-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_13b-nightly-1-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_13b-nightly-2-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_13b-nightly-3-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_70b-nightly-0-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_70b-nightly-1-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_70b-nightly-2-v5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + "maxtext-llama2_70b-nightly-3-2xv5litepod-256": TestInfo( + team.PERFORMANCE, "2024-11-13" + ), + # DAG: pathways_maxtext_v5e_configs_perf + "p-maxtext-16b-stable-0-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-16b-stable-1-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-16b-stable-2-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-16b-stable-3-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-32b-stable-0-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-32b-stable-1-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + 
"p-maxtext-32b-stable-2-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-32b-stable-3-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-64b-stable-0-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-64b-stable-1-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-64b-stable-2-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-64b-stable-3-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-128b-stable-0-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-128b-stable-1-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-128b-stable-2-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-128b-stable-3-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-gpt3_175b-stable-0-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-gpt3_175b-stable-1-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-gpt3_175b-stable-2-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-gpt3_175b-stable-3-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_7b-stable-0-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_7b-stable-1-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_7b-stable-2-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_7b-stable-3-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_13b-stable-0-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_13b-stable-1-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_13b-stable-2-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_13b-stable-3-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_70b-stable-0-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_70b-stable-1-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_70b-stable-2-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_70b-stable-3-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-16b-nightly-0-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-16b-nightly-1-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-16b-nightly-2-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-16b-nightly-3-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-32b-nightly-0-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-32b-nightly-1-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-32b-nightly-2-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-32b-nightly-3-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-64b-nightly-0-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-64b-nightly-1-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-64b-nightly-2-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + 
"p-maxtext-64b-nightly-3-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-128b-nightly-0-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-128b-nightly-1-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-128b-nightly-2-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-128b-nightly-3-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-gpt3_175b-nightly-0-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-gpt3_175b-nightly-1-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-gpt3_175b-nightly-2-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-gpt3_175b-nightly-3-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_7b-nightly-0-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_7b-nightly-1-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_7b-nightly-2-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_7b-nightly-3-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_13b-nightly-0-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_13b-nightly-1-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_13b-nightly-2-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_13b-nightly-3-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_70b-nightly-0-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_70b-nightly-1-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_70b-nightly-2-v5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), + "p-maxtext-llama2_70b-nightly-3-2xv5litepod-256": TestInfo( + team.PRODUCTIVITY, "2024-11-13" + ), } @staticmethod diff --git a/xlml/apis/task.py b/xlml/apis/task.py index 6cb84ce9..69806466 100644 --- a/xlml/apis/task.py +++ b/xlml/apis/task.py @@ -191,6 +191,16 @@ def run( return group + def run_with_name_gen_and_quarantine( + self, quarantine_task_group, use_pathways: bool = False + ) -> DAGNode: + test_name = self.task_test_config.benchmark_id + if QuarantineTests.is_quarantined(test_name): + with quarantine_task_group: + return self.run_with_run_name_generation(use_pathways) + else: + return self.run_with_run_name_generation(use_pathways) + def run_with_run_name_generation(self, use_pathways: bool = False) -> DAGNode: """Generate a unique run name and tensorboard file location, then run a test job within a docker image. 
From affb3a3b8a2a0b4a7cfcd26f5c5a182d07d97dfa Mon Sep 17 00:00:00 2001 From: Yijia Date: Thu, 14 Nov 2024 10:27:57 -0800 Subject: [PATCH 05/26] Fix Project Image Name for GPU Inference DAGs (#475) * fix project name * format --- dags/inference/trt_llm_inference.py | 2 +- dags/inference/trt_llm_mlperf_v40_inference.py | 2 +- dags/inference/trt_llm_mlperf_v41_inference.py | 4 ++-- dags/vm_resource.py | 1 + 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/dags/inference/trt_llm_inference.py b/dags/inference/trt_llm_inference.py index ea30e9d5..7b0f5b97 100644 --- a/dags/inference/trt_llm_inference.py +++ b/dags/inference/trt_llm_inference.py @@ -36,7 +36,7 @@ # Running on H100 GPU trt_llm_inference_config.get_trt_llm_gpu_config( machine_type=MachineVersion.A3_HIGHGPU_8G, - image_project=ImageProject.DEEP_LEARNING_PLATFORM_RELEASE, + image_project=ImageProject.ML_IMAGES, image_family=ImageFamily.COMMON_CU121_DEBIAN_11, accelerator_type=GpuVersion.H100, count=8, diff --git a/dags/inference/trt_llm_mlperf_v40_inference.py b/dags/inference/trt_llm_mlperf_v40_inference.py index 2926b52b..4d716ea4 100644 --- a/dags/inference/trt_llm_mlperf_v40_inference.py +++ b/dags/inference/trt_llm_mlperf_v40_inference.py @@ -50,7 +50,7 @@ # Running on H100 GPU trt_llm_mlperf_v40_config.get_trt_llm_mlperf_v40_gpu_config( machine_type=MachineVersion.A3_HIGHGPU_8G, - image_project=ImageProject.DEEP_LEARNING_PLATFORM_RELEASE, + image_project=ImageProject.ML_IMAGES, image_family=ImageFamily.COMMON_CU121_DEBIAN_11, accelerator_type=GpuVersion.H100, count=8, diff --git a/dags/inference/trt_llm_mlperf_v41_inference.py b/dags/inference/trt_llm_mlperf_v41_inference.py index 9fdf8700..1cfe4498 100644 --- a/dags/inference/trt_llm_mlperf_v41_inference.py +++ b/dags/inference/trt_llm_mlperf_v41_inference.py @@ -104,7 +104,7 @@ # Running on A100 GPU trt_llm_mlperf_v41_config.get_trt_llm_mlperf_gpu_config( machine_type=MachineVersion.A2_ULTRAGPU_8G, - image_project=ImageProject.DEEP_LEARNING_PLATFORM_RELEASE, + image_project=ImageProject.ML_IMAGES, image_family=ImageFamily.COMMON_CU121_DEBIAN_11, accelerator_type=GpuVersion.A100_80G, count=8, @@ -123,7 +123,7 @@ # Running on L4 GPU trt_llm_mlperf_v41_config.get_trt_llm_mlperf_gpu_config( machine_type=MachineVersion.G2_STAND_96, - image_project=ImageProject.DEEP_LEARNING_PLATFORM_RELEASE, + image_project=ImageProject.ML_IMAGES, image_family=ImageFamily.COMMON_CU121_DEBIAN_11, accelerator_type=GpuVersion.L4, count=8, diff --git a/dags/vm_resource.py b/dags/vm_resource.py index 8b8b6de3..4459be63 100644 --- a/dags/vm_resource.py +++ b/dags/vm_resource.py @@ -67,6 +67,7 @@ class ImageProject(enum.Enum): """Common image projects for GPU.""" DEEP_LEARNING_PLATFORM_RELEASE = "deeplearning-platform-release" + ML_IMAGES = "ml-images" class ImageFamily(enum.Enum): From 5af6bd71320aa966aa3958b5f63275e2df662314 Mon Sep 17 00:00:00 2001 From: Ran Ran Date: Fri, 15 Nov 2024 10:40:59 -0800 Subject: [PATCH 06/26] Split MoE from end_to_end test (#477) --- dags/multipod/maxtext_end_to_end.py | 26 +-- dags/quarantined_tests.py | 8 +- .../jax_stable_stack_gpu_e2e.py | 1 + .../maxtext_moe_tpu_e2e.py | 148 ++++++++++++++++++ .../project_bite_gpu_e2e.py | 2 +- .../project_bite_tpu_e2e.py | 8 +- 6 files changed, 162 insertions(+), 31 deletions(-) create mode 100644 dags/sparsity_diffusion_devx/maxtext_moe_tpu_e2e.py diff --git a/dags/multipod/maxtext_end_to_end.py b/dags/multipod/maxtext_end_to_end.py index e1817679..2d482441 100644 --- a/dags/multipod/maxtext_end_to_end.py +++ 
b/dags/multipod/maxtext_end_to_end.py @@ -20,7 +20,7 @@ from airflow.utils.task_group import TaskGroup from dags import composer_env, test_owner from dags.quarantined_tests import QuarantineTests -from dags.vm_resource import XpkClusters, CpuVersion, DockerImage, GpuVersion, Project, TpuVersion, Zone +from dags.vm_resource import XpkClusters, DockerImage from dags.multipod.configs import gke_config from xlml.utils import name_format @@ -77,30 +77,6 @@ "time_out_in_min": 60, }, ], - "mixtral-8x7b": [ - { - "script_name": "tpu/mixtral/8x7b/1_test_mixtral", - "cluster": XpkClusters.CPU_M1_MEGAMEM_96_CLUSTER, - "time_out_in_min": 240, - }, - { - "script_name": "tpu/mixtral/8x7b/2_test_mixtral", - "cluster": XpkClusters.TPU_V4_128_CLUSTER, - "time_out_in_min": 60, - }, - ], - "mixtral-8x22b": [ - { - "script_name": "tpu/mixtral/8x22b/1_test_mixtral", - "cluster": XpkClusters.CPU_M1_MEGAMEM_96_CLUSTER, - "time_out_in_min": 360, - }, - { - "script_name": "tpu/mixtral/8x22b/2_test_mixtral", - "cluster": XpkClusters.TPU_V5E_256_CLUSTER, - "time_out_in_min": 60, - }, - ], "llama2-70b": [ { "script_name": "tpu/llama2/70b/1_test_llama2_70b", diff --git a/dags/quarantined_tests.py b/dags/quarantined_tests.py index fe581b68..a5ba9f6b 100644 --- a/dags/quarantined_tests.py +++ b/dags/quarantined_tests.py @@ -178,16 +178,16 @@ class QuarantineTests: "chained_tests_gemma-7b_stable": TestInfo(team.LLM_DEVX, "2024-11-12"), "chained_tests_gemma-7b_nightly": TestInfo(team.LLM_DEVX, "2024-11-12"), "chained_tests_mixtral-8x7b_stable": TestInfo( - team.LLM_DEVX, "2024-11-12" + team.SPARSITY_DIFFUSION_DEVX, "2024-11-12" ), "chained_tests_mixtral-8x7b_nightly": TestInfo( - team.LLM_DEVX, "2024-11-12" + team.SPARSITY_DIFFUSION_DEVX, "2024-11-12" ), "chained_tests_mixtral-8x22b_stable": TestInfo( - team.LLM_DEVX, "2024-11-12" + team.SPARSITY_DIFFUSION_DEVX, "2024-11-12" ), "chained_tests_mixtral-8x22b_nightly": TestInfo( - team.LLM_DEVX, "2024-11-12" + team.SPARSITY_DIFFUSION_DEVX, "2024-11-12" ), "chained_tests_llama2-70b_stable": TestInfo(team.LLM_DEVX, "2024-11-12"), "chained_tests_llama2-70b_nightly": TestInfo(team.LLM_DEVX, "2024-11-12"), diff --git a/dags/sparsity_diffusion_devx/jax_stable_stack_gpu_e2e.py b/dags/sparsity_diffusion_devx/jax_stable_stack_gpu_e2e.py index 466bf3b6..400f49ce 100644 --- a/dags/sparsity_diffusion_devx/jax_stable_stack_gpu_e2e.py +++ b/dags/sparsity_diffusion_devx/jax_stable_stack_gpu_e2e.py @@ -35,6 +35,7 @@ "sparsity_diffusion_devx", "multipod_team", "maxtext", + "gpu", "jax-stable-stack", ], start_date=datetime.datetime(2024, 6, 7), diff --git a/dags/sparsity_diffusion_devx/maxtext_moe_tpu_e2e.py b/dags/sparsity_diffusion_devx/maxtext_moe_tpu_e2e.py new file mode 100644 index 00000000..8abe4919 --- /dev/null +++ b/dags/sparsity_diffusion_devx/maxtext_moe_tpu_e2e.py @@ -0,0 +1,148 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""A DAG to run end-to-end MoE tests.""" + + +import datetime +from airflow import models +from airflow.utils.task_group import TaskGroup +from dags import composer_env, test_owner +from dags.quarantined_tests import QuarantineTests +from dags.vm_resource import XpkClusters, DockerImage +from dags.multipod.configs import gke_config +from xlml.utils import name_format + +# Run once a day at 5 am UTC (9 pm PST) +SCHEDULED_TIME = "0 5 * * *" if composer_env.is_prod_env() else None + + +with models.DAG( + dag_id="maxtext_moe_tpu_e2e", + schedule=SCHEDULED_TIME, + tags=[ + "sparsity_diffusion_devx", + "multipod_team", + "maxtext", + "tpu", + "stable", + "nightly", + ], + start_date=datetime.datetime(2024, 11, 14), + catchup=False, +) as dag: + test_name_prefix = "maxtext" + quarantine_task_group = TaskGroup( + group_id="Quarantine", dag=dag, prefix_group_id=False + ) + + multicluster_test_models = { + "mixtral-8x7b": [ + { + "script_name": "tpu/mixtral/8x7b/1_test_mixtral", + "cluster": XpkClusters.CPU_M1_MEGAMEM_96_CLUSTER, + "time_out_in_min": 240, + }, + { + "script_name": "tpu/mixtral/8x7b/2_test_mixtral", + "cluster": XpkClusters.TPU_V4_128_CLUSTER, + "time_out_in_min": 60, + }, + ], + "mixtral-8x22b": [ + { + "script_name": "tpu/mixtral/8x22b/1_test_mixtral", + "cluster": XpkClusters.CPU_M1_MEGAMEM_96_CLUSTER, + "time_out_in_min": 360, + }, + { + "script_name": "tpu/mixtral/8x22b/2_test_mixtral", + "cluster": XpkClusters.TPU_V5E_256_CLUSTER, + "time_out_in_min": 60, + }, + ], + } + + def convert_checkpoint_and_run_training( + test_group_id, + test_name_prefix, + type, + docker_image, + model, + test_scripts_details, + ): + with TaskGroup(group_id=test_group_id, prefix_group_id=False) as group: + test_name = f"{test_name_prefix}-{type}-{model}" + shared_gcs_location = name_format.generate_gcs_folder_location.override( + task_id=f"{test_group_id}_generate_gcs_folder_location" + )( + gcs_subfolder, + test_group_id, + ) + conversion_cpu = gke_config.get_maxtext_cpu_end_to_end_gke_config( + time_out_in_min=test_scripts_details[0]["time_out_in_min"], + test_name=test_name, + run_model_cmds=( + f"export BASE_OUTPUT_PATH=$GCS_OUTPUT; bash end_to_end/{test_scripts_details[0]['script_name']}.sh", + ), + docker_image=docker_image, + test_owner=test_owner.RAN_R, + cluster=test_scripts_details[0]["cluster"], + ).run(gcs_location=shared_gcs_location) + training_tpu = gke_config.get_gke_config( + time_out_in_min=test_scripts_details[1]["time_out_in_min"], + test_name=test_name, + run_model_cmds=( + f"export BASE_OUTPUT_PATH=$GCS_OUTPUT; bash end_to_end/{test_scripts_details[1]['script_name']}.sh", + ), + docker_image=docker_image, + test_owner=test_owner.RAN_R, + cluster=test_scripts_details[1]["cluster"], + ).run(gcs_location=shared_gcs_location) + return conversion_cpu, training_tpu + + docker_image = { + "stable": DockerImage.MAXTEXT_TPU_JAX_STABLE_STACK.value, + "nightly": DockerImage.MAXTEXT_TPU_JAX_NIGHTLY.value, + } + tests = [] + for model, test_scripts_details in multicluster_test_models.items(): + gcs_subfolder = f"{test_owner.Team.SPARSITY_DIFFUSION_DEVX.value}/maxtext" + for type in docker_image.keys(): + test_group_id = "chained_tests" + "_" + model + "_" + type + if QuarantineTests.is_quarantined(test_group_id): + with quarantine_task_group: + mode_cpu, mode_tpu = convert_checkpoint_and_run_training( + test_group_id, + test_name_prefix, + type, + docker_image[type], + model, + test_scripts_details, + ) + else: + mode_cpu, mode_tpu = convert_checkpoint_and_run_training( + test_group_id, 
+ test_name_prefix, + type, + docker_image[type], + model, + test_scripts_details, + ) + tests.append(mode_cpu) + tests.append(mode_tpu) + + # stable_cpu >> stable_tpu >> nightly_cpu >> nightly_tpu + for i in range(len(tests) - 1): + tests[i] << tests[i + 1] diff --git a/dags/sparsity_diffusion_devx/project_bite_gpu_e2e.py b/dags/sparsity_diffusion_devx/project_bite_gpu_e2e.py index fdb95032..2d47b5cd 100644 --- a/dags/sparsity_diffusion_devx/project_bite_gpu_e2e.py +++ b/dags/sparsity_diffusion_devx/project_bite_gpu_e2e.py @@ -32,7 +32,7 @@ tags=[ "sparsity_diffusion_devx", "multipod_team", - "gcp_gpu", + "gpu", "axlearn", "bite", ], diff --git a/dags/sparsity_diffusion_devx/project_bite_tpu_e2e.py b/dags/sparsity_diffusion_devx/project_bite_tpu_e2e.py index df7370b5..b188373d 100644 --- a/dags/sparsity_diffusion_devx/project_bite_tpu_e2e.py +++ b/dags/sparsity_diffusion_devx/project_bite_tpu_e2e.py @@ -28,7 +28,13 @@ with models.DAG( dag_id="project_bite_tpu_e2e", schedule=SCHEDULED_TIME, - tags=["sparsity_diffusion_devx", "jax", "nightly", "bite", "multipod_team"], + tags=[ + "sparsity_diffusion_devx", + "multipod_team", + "tpu", + "axlearn", + "bite", + ], start_date=datetime.datetime(2024, 4, 4), catchup=False, ) as dag: From f1e009e0f88b195eff42a4c799793faf01920f66 Mon Sep 17 00:00:00 2001 From: Gunjan Jalori <39437795+gunjanj007@users.noreply.github.com> Date: Fri, 15 Nov 2024 14:23:59 -0800 Subject: [PATCH 07/26] Add AOTC nightly test DAG (#461) * Add an empty DAG for reproducibility * Trillium support for JAX Stable Stack DAG (#444) * Updated M* image versions to reflect jax_stable_stack=0.4.35-rev1 (#445) * Add Gpt3 regression tests for Models benchmark reproducible artifacts. * Delete old files * reformat * gpus 256 * add all the func * add all the func * fix recipe name * resolve comments * reformat * gger checks * resolve conflict * resolve conflict1 * resolve conflict2 * move utils to team folder * change bucket name * fix format * remove getting pods initially * Add copyright doc * resolve conflicts * fix formatting * fi8x formatting * fix formatting * fix formatting. --------- Co-authored-by: Param Bole --- .../aotc_reproducibility.py | 110 ++++++++++++++++++ dags/map_reproducibility/nemo_gpt3.py | 97 +++++++++++++++ dags/test_owner.py | 4 +- 3 files changed, 210 insertions(+), 1 deletion(-) create mode 100644 dags/map_reproducibility/aotc_reproducibility.py create mode 100644 dags/map_reproducibility/nemo_gpt3.py diff --git a/dags/map_reproducibility/aotc_reproducibility.py b/dags/map_reproducibility/aotc_reproducibility.py new file mode 100644 index 00000000..eff156ab --- /dev/null +++ b/dags/map_reproducibility/aotc_reproducibility.py @@ -0,0 +1,110 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"Bash helper commands for AOTC artifacts" + +import os + + +def set_variables_cmds(): + set_variables = ( + "export PROJECT=supercomputer-testing", + "export CLUSTER=a3plus-benchmark", + "export CLUSTER_REGION=australia-southeast1", + "NOW=$(date +%s)", + "export BUCKET_NAME=regression-testing-xlml", + "export JOB_NAME=gpt3-xlml-$NOW-175b-nemo", + ) + return set_variables + + +def set_project_commands(): + set_project_command = ( + "gcloud config set project $PROJECT", + "sudo chown -R airflow:airflow /home/airflow/composer_kube_config", + "gcloud container clusters get-credentials " + "$CLUSTER --region $CLUSTER_REGION", + ) + return set_project_command + + +def install_helm_cmds(): + install_helm_cmd = ( + "curl -fsSL -o get_helm.sh " + "https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3", + "chmod 700 get_helm.sh", + "./get_helm.sh", + ) + return install_helm_cmd + + +# By default the composer environment overwrites the +# namespaces to airflow namespaces. +# In order to prevent that it is necessary explicitly +# change the namespace to default. +def namespace_cmds(): + namespace = ( + "kubectl config view | grep namespace", + "kubectl config set-context --current --namespace=default", + "kubectl config set-context heml --namespace=default", + ) + return namespace + + +def wait_for_jobs_cmds(): + wait_for_job = ( + "echo 'will wait for job to start running'", + "kubectl wait --for=condition=running job/$JOB_NAME" + " --namespace=default --timeout=10m", + "echo 'will wait for jobs to finish'", + "kubectl wait --for=condition=complete " + "job/$JOB_NAME --namespace=default --timeout=100m", + ) + return wait_for_job + + +def copy_bucket_cmds(): + copy_bucket_contents = ( + "COMPLETE_JOB_NAME=$(gcloud storage ls " + "gs://$BUCKET_NAME/nemo-experiments/ | grep $JOB_NAME)", + "echo 'copying from' ", + "echo $COMPLETE_JOB_NAME", + "cd $REPO_ROOT/src/utils/training_metrics", + "gcloud storage cp ${COMPLETE_JOB_NAME}" + "dllogger/rank-0/dllogger.json .", + ) + return copy_bucket_contents + + +def get_metrics_cmds(): + # TODO(gunjanj007): get these parameters from the recipe + get_metrics = ( + "python3 process_training_results.py --file" + " dllogger.json --batch_size 2048 " + "--num_accelerators 256 " + "--precision fp8 " + "--model_type gpt3-175b " + "--accelerator_type h100 ", + ) + return get_metrics + + +def cleanup_cmds(): + cleanup = ( + "kubectl get pods " + "--no-headers=true | awk '{print $1}' " + "| grep $JOB_NAME | xargs kubectl delete pods", + "helm uninstall $JOB_NAME", + ) + return cleanup diff --git a/dags/map_reproducibility/nemo_gpt3.py b/dags/map_reproducibility/nemo_gpt3.py new file mode 100644 index 00000000..ff03fdaf --- /dev/null +++ b/dags/map_reproducibility/nemo_gpt3.py @@ -0,0 +1,97 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""DAGs to run Aotc reproducibility benchmarks.""" + +import datetime +from airflow import models +from airflow.decorators import task +from airflow.hooks.subprocess import SubprocessHook +from dags import composer_env +from dags.map_reproducibility.aotc_reproducibility import get_metrics_cmds +from dags.map_reproducibility.aotc_reproducibility import set_variables_cmds +from dags.map_reproducibility.aotc_reproducibility import set_project_commands +from dags.map_reproducibility.aotc_reproducibility import install_helm_cmds +from dags.map_reproducibility.aotc_reproducibility import namespace_cmds +from dags.map_reproducibility.aotc_reproducibility import wait_for_jobs_cmds +from dags.map_reproducibility.aotc_reproducibility import copy_bucket_cmds +from dags.map_reproducibility.aotc_reproducibility import cleanup_cmds + +# Run once a day at 2 pm UTC (6 am PST) +SCHEDULED_TIME = "0 14 * * *" if composer_env.is_prod_env() else None + + +@task +def run_aotc_workload(): + gpu_recipe_cmd = ( + "git clone https://github.com/ai-hypercomputer/gpu-recipes.git", + "cd gpu-recipes", + "export REPO_ROOT=`git rev-parse --show-toplevel`", + "export RECIPE_ROOT=" + "$REPO_ROOT/training/a3mega/gpt3-175b/nemo-pretraining-gke", + "cd $RECIPE_ROOT", + ) + + helm_cmds = ( + "CONFIG_FILE=$REPO_ROOT/src/frameworks" + "/nemo-configs/gpt3-175b-256gpus-fp8.yaml", + " helm install -f values.yaml " + "--namespace default " + "--set namespace=default" + " --set-file nemo_config" + "=$CONFIG_FILE" + " --set workload.image" + "=us-central1-docker.pkg.dev/" + "supercomputer-testing/gunjanjalori/nemo_test/nemo_workload:24.07" + " --set workload.gcsBucketForDataCataPath=$BUCKET_NAME" + " $JOB_NAME $REPO_ROOT/src/helm-charts/nemo-training", + ) + + hook = SubprocessHook() + result = hook.run_command( + [ + "bash", + "-c", + ";".join( + set_variables_cmds() + + set_project_commands() + + gpu_recipe_cmd + + install_helm_cmds() + + namespace_cmds() + + helm_cmds + + wait_for_jobs_cmds() + + copy_bucket_cmds() + + get_metrics_cmds() + + cleanup_cmds() + ), + ], + ) + assert result.exit_code == 0, f"Command failed with code {result.exit_code}" + + +with models.DAG( + dag_id="reproducibility_nemo_gpt3_nighly_dag", + schedule=SCHEDULED_TIME, + tags=[ + "simple", + "aotc", + "nightly", + "reproducibility", + "experimental", + "xlml", + ], + start_date=datetime.datetime(2024, 11, 15), + catchup=False, +) as dag: + run_aotc_workload() diff --git a/dags/test_owner.py b/dags/test_owner.py index fbe748b5..e96a8c5e 100644 --- a/dags/test_owner.py +++ b/dags/test_owner.py @@ -69,6 +69,8 @@ class Team(enum.Enum): # FRAMEWORK QINY_Y = "Qinyi Y." - # JAX AKANKSHA_G = "Akanksha G." + +# MAP_REPRODUCIBILITY +GUNJAN_J = "Gunjan J." From 8ef98a3e8b3d79a41307ef9cb2bdf669532d1b53 Mon Sep 17 00:00:00 2001 From: Akanksha Date: Fri, 15 Nov 2024 15:26:52 -0800 Subject: [PATCH 08/26] Add GKE tests for jax.distributed.initialize() (#480) Add tests for jax.distributed.initialize() function using the GKE stack. This improves code coverage for this function. 
Test logs: http://shortn/_kychJoMx2Q --- dags/multipod/configs/jax_tests_gce_config.py | 2 +- dags/multipod/configs/jax_tests_gke_config.py | 41 +++++ dags/multipod/jax_functional_tests.py | 155 +++++++++--------- 3 files changed, 118 insertions(+), 80 deletions(-) create mode 100644 dags/multipod/configs/jax_tests_gke_config.py diff --git a/dags/multipod/configs/jax_tests_gce_config.py b/dags/multipod/configs/jax_tests_gce_config.py index 0f31af96..9d785462 100644 --- a/dags/multipod/configs/jax_tests_gce_config.py +++ b/dags/multipod/configs/jax_tests_gce_config.py @@ -15,7 +15,7 @@ """Utilities to construct configs for JAX tests for GCE.""" from xlml.apis import gcp_config, metric_config, task, test_config -from dags import test_owner, gcs_bucket +from dags import test_owner from dags.multipod.configs import common from dags.vm_resource import TpuVersion, Project, RuntimeVersion import datetime diff --git a/dags/multipod/configs/jax_tests_gke_config.py b/dags/multipod/configs/jax_tests_gke_config.py new file mode 100644 index 00000000..183c4d6e --- /dev/null +++ b/dags/multipod/configs/jax_tests_gke_config.py @@ -0,0 +1,41 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utilities to construct configs for JAX tests for GCE.""" + +from dags import test_owner +from dags.multipod.configs import gke_config +from dags.vm_resource import XpkClusterConfig + + +def get_jax_distributed_initialize_config( + cluster: XpkClusterConfig, + time_out_in_min: int, + test_name: str, + docker_image: str, + num_slices: int = 1, +): + run_model_cmds = [ + "bash end_to_end/test_jdi.sh", + ] + + return gke_config.get_gke_config( + cluster=cluster, + test_name=test_name, + run_model_cmds=run_model_cmds, + num_slices=num_slices, + docker_image=docker_image, + test_owner=test_owner.AKANKSHA_G, + time_out_in_min=time_out_in_min, + ) diff --git a/dags/multipod/jax_functional_tests.py b/dags/multipod/jax_functional_tests.py index 9538c688..cf76af8f 100644 --- a/dags/multipod/jax_functional_tests.py +++ b/dags/multipod/jax_functional_tests.py @@ -17,8 +17,8 @@ import datetime from airflow import models from dags import composer_env -from dags.vm_resource import TpuVersion, Zone, Project, V5_NETWORKS, V5P_SUBNETWORKS, RuntimeVersion -from dags.multipod.configs import jax_tests_gce_config +from dags.vm_resource import DockerImage, TpuVersion, Zone, Project, V5_NETWORKS, V5P_SUBNETWORKS, RuntimeVersion, XpkClusters +from dags.multipod.configs import jax_tests_gce_config, jax_tests_gke_config from dags.multipod.configs.common import SetupMode # Run once a day at 10 am UTC (2 am PST) @@ -32,87 +32,84 @@ catchup=False, ) as dag: default_test_name = "jax-distributed-initialize" - test_modes = [SetupMode.STABLE, SetupMode.NIGHTLY, SetupMode.JAX_STABLE_STACK] + v5p_project_name = Project.TPU_PROD_ENV_AUTOMATED.value + v5p_network = V5_NETWORKS + v5p_subnetwork = V5P_SUBNETWORKS + v5p_runtime_version = RuntimeVersion.V2_ALPHA_TPUV5.value + test_modes_with_docker_images = [ + (SetupMode.STABLE, 
None), + (SetupMode.JAX_STABLE_STACK, DockerImage.MAXTEXT_TPU_JAX_STABLE_STACK), + (SetupMode.NIGHTLY, DockerImage.MAXTEXT_TPU_JAX_NIGHTLY), + ] + v4_task_arr, v5p_task_arr = [], [] - for test_mode in test_modes: - # v4 - jax_nightly_1slice_v4_8 = ( - jax_tests_gce_config.get_jax_distributed_initialize_config( - tpu_version=TpuVersion.V4, - tpu_cores=8, - tpu_zone=Zone.US_CENTRAL2_B.value, - time_out_in_min=60, - is_tpu_reserved=False, - test_name=f"{default_test_name}-{test_mode.value}", - test_mode=test_mode, - ) - ) - if len(v4_task_arr) > 1: - # pylint: disable-next=pointless-statement - v4_task_arr[-1] >> jax_nightly_1slice_v4_8 - v4_task_arr.append(jax_nightly_1slice_v4_8) + for test_mode, gke_docker_image in test_modes_with_docker_images: + for num_slices in (1, 2): + # v4 GCE + jax_gce_v4_8 = jax_tests_gce_config.get_jax_distributed_initialize_config( + tpu_version=TpuVersion.V4, + tpu_cores=8, + tpu_zone=Zone.US_CENTRAL2_B.value, + time_out_in_min=60, + is_tpu_reserved=False, + num_slices=num_slices, + test_name=f"{default_test_name}-gce-{test_mode.value}", + test_mode=test_mode, + ) + if len(v4_task_arr) > 1: + # pylint: disable-next=pointless-statement + v4_task_arr[-1] >> jax_gce_v4_8 + v4_task_arr.append(jax_gce_v4_8) - jax_nightly_2slice_v4_8 = ( - jax_tests_gce_config.get_jax_distributed_initialize_config( - tpu_version=TpuVersion.V4, - tpu_cores=8, - tpu_zone=Zone.US_CENTRAL2_B.value, - time_out_in_min=60, - is_tpu_reserved=False, - num_slices=2, - test_name=f"{default_test_name}-{test_mode.value}", - test_mode=test_mode, + # v4 GKE + if gke_docker_image is not None: + jax_gke_v4_8 = ( + jax_tests_gke_config.get_jax_distributed_initialize_config( + cluster=XpkClusters.TPU_V4_8_MAXTEXT_CLUSTER, + time_out_in_min=60, + num_slices=num_slices, + test_name=f"{default_test_name}-gke-{test_mode.value}", + docker_image=gke_docker_image.value, + ).run() ) - ) - - # pylint: disable-next=pointless-statement - v4_task_arr[-1] >> jax_nightly_2slice_v4_8 - v4_task_arr.append(jax_nightly_2slice_v4_8) + # pylint: disable-next=pointless-statement + v4_task_arr[-1] >> jax_gke_v4_8 + v4_task_arr.append(jax_gke_v4_8) - # v5p - v5p_project_name = Project.TPU_PROD_ENV_AUTOMATED.value - v5p_network = V5_NETWORKS - v5p_subnetwork = V5P_SUBNETWORKS - v5p_runtime_version = RuntimeVersion.V2_ALPHA_TPUV5.value + # v5p GCE + jax_gce_v5p_8 = ( + jax_tests_gce_config.get_jax_distributed_initialize_config( + tpu_version=TpuVersion.V5P, + tpu_cores=8, + num_slices=num_slices, + tpu_zone=Zone.US_EAST5_A.value, + runtime_version=v5p_runtime_version, + project_name=v5p_project_name, + time_out_in_min=60, + is_tpu_reserved=True, + test_name=f"{default_test_name}-gce-{test_mode.value}", + test_mode=test_mode, + network=v5p_network, + subnetwork=v5p_subnetwork, + ) + ) + if len(v5p_task_arr) > 1: + # pylint: disable-next=pointless-statement + v5p_task_arr[-1] >> jax_gce_v5p_8 + v5p_task_arr.append(jax_gce_v5p_8) - jax_nightly_1slice_v5p_8 = ( - jax_tests_gce_config.get_jax_distributed_initialize_config( - tpu_version=TpuVersion.V5P, - tpu_cores=8, - tpu_zone=Zone.US_EAST5_A.value, - runtime_version=v5p_runtime_version, - project_name=v5p_project_name, - time_out_in_min=60, - is_tpu_reserved=True, - test_name=f"{default_test_name}-{test_mode.value}", - test_mode=test_mode, - network=v5p_network, - subnetwork=v5p_subnetwork, + # v5p GKE + if gke_docker_image is not None: + jax_gke_v5p_8 = ( + jax_tests_gke_config.get_jax_distributed_initialize_config( + cluster=XpkClusters.TPU_V5P_8_CLUSTER, + time_out_in_min=60, 
+ num_slices=num_slices, + test_name=f"{default_test_name}-gke-{test_mode.value}", + docker_image=gke_docker_image.value, + ).run() ) - ) - if len(v5p_task_arr) > 1: - # pylint: disable-next=pointless-statement - v5p_task_arr[-1] >> jax_nightly_1slice_v5p_8 - v5p_task_arr.append(jax_nightly_1slice_v5p_8) - - jax_nightly_2slice_v5p_8 = ( - jax_tests_gce_config.get_jax_distributed_initialize_config( - tpu_version=TpuVersion.V5P, - tpu_cores=8, - num_slices=2, - tpu_zone=Zone.US_EAST5_A.value, - runtime_version=v5p_runtime_version, - project_name=v5p_project_name, - time_out_in_min=60, - is_tpu_reserved=True, - test_name=f"{default_test_name}-{test_mode.value}", - test_mode=test_mode, - network=v5p_network, - subnetwork=v5p_subnetwork, - ) - ) - - # pylint: disable-next=pointless-statement - v5p_task_arr[-1] >> jax_nightly_2slice_v5p_8 - v5p_task_arr.append(jax_nightly_2slice_v5p_8) + # pylint: disable-next=pointless-statement + v5p_task_arr[-1] >> jax_gke_v5p_8 + v5p_task_arr.append(jax_gke_v5p_8) From 0d1c0b8a844196ff5fbc5e82cf031d80979c3fab Mon Sep 17 00:00:00 2001 From: Ran Ran Date: Fri, 15 Nov 2024 15:40:39 -0800 Subject: [PATCH 09/26] Fix test dependency issue (#479) --- dags/multipod/maxtext_end_to_end.py | 2 +- dags/sparsity_diffusion_devx/maxtext_moe_tpu_e2e.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dags/multipod/maxtext_end_to_end.py b/dags/multipod/maxtext_end_to_end.py index 2d482441..287257b6 100644 --- a/dags/multipod/maxtext_end_to_end.py +++ b/dags/multipod/maxtext_end_to_end.py @@ -162,4 +162,4 @@ def convert_checkpoint_and_run_training( # stable_cpu >> stable_tpu >> nightly_cpu >> nightly_tpu for i in range(len(tests) - 1): - tests[i] << tests[i + 1] + tests[i] >> tests[i + 1] diff --git a/dags/sparsity_diffusion_devx/maxtext_moe_tpu_e2e.py b/dags/sparsity_diffusion_devx/maxtext_moe_tpu_e2e.py index 8abe4919..3430f2a0 100644 --- a/dags/sparsity_diffusion_devx/maxtext_moe_tpu_e2e.py +++ b/dags/sparsity_diffusion_devx/maxtext_moe_tpu_e2e.py @@ -145,4 +145,4 @@ def convert_checkpoint_and_run_training( # stable_cpu >> stable_tpu >> nightly_cpu >> nightly_tpu for i in range(len(tests) - 1): - tests[i] << tests[i + 1] + tests[i] >> tests[i + 1] From d30999d45a27730d105ad9088e71d0b6dd0ccd45 Mon Sep 17 00:00:00 2001 From: Param Bole Date: Mon, 18 Nov 2024 08:29:30 -0800 Subject: [PATCH 10/26] Adding new tests with stable stack nightly jax images (#476) * Adding new tests with stable stack nightly jax images * Correcting Typo --- .../jax_stable_stack_gpu_e2e.py | 2 +- .../jax_stable_stack_tpu_e2e.py | 68 +++++++++++-------- dags/vm_resource.py | 8 ++- 3 files changed, 45 insertions(+), 33 deletions(-) diff --git a/dags/sparsity_diffusion_devx/jax_stable_stack_gpu_e2e.py b/dags/sparsity_diffusion_devx/jax_stable_stack_gpu_e2e.py index 400f49ce..6719b364 100644 --- a/dags/sparsity_diffusion_devx/jax_stable_stack_gpu_e2e.py +++ b/dags/sparsity_diffusion_devx/jax_stable_stack_gpu_e2e.py @@ -61,7 +61,7 @@ docker_images = [ (SetupMode.STABLE, DockerImage.MAXTEXT_GPU_JAX_STABLE_STACK), - (SetupMode.NIGHTLY, DockerImage.MAXTEXT_GPU_JAX_STABLE_STACK_NIGHTLY), + (SetupMode.NIGHTLY, DockerImage.MAXTEXT_GPU_STABLE_STACK_NIGHTLY_JAX), ] for model, (test_script, nnodes) in test_models_gpu.items(): diff --git a/dags/sparsity_diffusion_devx/jax_stable_stack_tpu_e2e.py b/dags/sparsity_diffusion_devx/jax_stable_stack_tpu_e2e.py index a0cbac1a..572a6c32 100644 --- a/dags/sparsity_diffusion_devx/jax_stable_stack_tpu_e2e.py +++ 
b/dags/sparsity_diffusion_devx/jax_stable_stack_tpu_e2e.py
@@ -21,6 +21,7 @@
 from dags import composer_env, test_owner, gcs_bucket
 from dags.vm_resource import Project, TpuVersion, CpuVersion, Zone, DockerImage, GpuVersion, XpkClusters
 from dags.sparsity_diffusion_devx.configs import gke_config as config
+from dags.multipod.configs.common import SetupMode
 from xlml.utils import name_format
 
 # Run once a day at 3 am UTC (7 pm PST)
@@ -62,45 +63,52 @@
       group_id="Quarantine", dag=dag, prefix_group_id=False
   )
 
+  docker_images = [
+      (SetupMode.STABLE, DockerImage.MAXTEXT_TPU_JAX_STABLE_STACK),
+      (SetupMode.NIGHTLY, DockerImage.MAXTEXT_TPU_STABLE_STACK_NIGHTLY_JAX),
+  ]
+
   for accelerator, slices in maxtext_test_configs.items():
     cores = accelerator.rsplit("-", maxsplit=1)[-1]
     cluster = config.clusters[accelerator]
     for slice_num in slices:
-      maxtext_jax_stable_stack_test = config.get_gke_config(
-          num_slices=slice_num,
-          cluster=cluster,
-          time_out_in_min=60,
-          run_model_cmds=(
-              f"JAX_PLATFORMS=tpu,cpu ENABLE_PJRT_COMPATIBILITY=true TPU_SLICE_BUILDER_DUMP_CHIP_FORCE=true TPU_SLICE_BUILDER_DUMP_ICI=true JAX_FORCE_TPU_INIT=true ENABLE_TPUNETD_CLIENT=true && "
-              f"python MaxText/train.py MaxText/configs/base.yml run_name={slice_num}slice-V{cluster.device_version}_{cores}-maxtext-jax-stable-stack-{current_datetime} "
-              "steps=30 per_device_batch_size=1 max_target_length=4096 model_name=llama2-7b "
-              "enable_checkpointing=false attention=dot_product remat_policy=minimal_flash use_iota_embed=true scan_layers=false "
-              "dataset_type=synthetic async_checkpointing=false "
-              f"base_output_directory={gcs_bucket.BASE_OUTPUT_DIR}/maxtext/jax-stable-stack/automated/{current_datetime}",
-          ),
-          test_name=f"maxtext-jax-stable-stack-{accelerator}-{slice_num}x",
-          docker_image=DockerImage.MAXTEXT_TPU_JAX_STABLE_STACK.value,
-          test_owner=test_owner.PARAM_B,
-      ).run_with_quarantine(quarantine_task_group)
+      for mode, image in docker_images:
+        maxtext_jax_stable_stack_test = config.get_gke_config(
+            num_slices=slice_num,
+            cluster=cluster,
+            time_out_in_min=60,
+            run_model_cmds=(
+                f"JAX_PLATFORMS=tpu,cpu ENABLE_PJRT_COMPATIBILITY=true TPU_SLICE_BUILDER_DUMP_CHIP_FORCE=true TPU_SLICE_BUILDER_DUMP_ICI=true JAX_FORCE_TPU_INIT=true ENABLE_TPUNETD_CLIENT=true && "
+                f"python MaxText/train.py MaxText/configs/base.yml run_name={slice_num}slice-V{cluster.device_version}_{cores}-maxtext-jax-stable-stack-{current_datetime} "
+                "steps=30 per_device_batch_size=1 max_target_length=4096 model_name=llama2-7b "
+                "enable_checkpointing=false attention=dot_product remat_policy=minimal_flash use_iota_embed=true scan_layers=false "
+                "dataset_type=synthetic async_checkpointing=false "
+                f"base_output_directory={gcs_bucket.BASE_OUTPUT_DIR}/maxtext/jax-stable-stack/automated/{current_datetime}",
+            ),
+            test_name=f"maxtext-jax-stable-stack-{mode.value}-{accelerator}-{slice_num}x",
+            docker_image=image.value,
+            test_owner=test_owner.PARAM_B,
+        ).run_with_quarantine(quarantine_task_group)
 
 for accelerator, slices in maxdiffusion_test_configs.items():
   cores = accelerator.rsplit("-", maxsplit=1)[-1]
   cluster = config.clusters[accelerator]
   for slice_num in slices:
-    maxdiffusion_jax_stable_stack_test = config.get_gke_config(
-        num_slices=slice_num,
-        cluster=cluster,
-        time_out_in_min=60,
-        run_model_cmds=(
-            f"JAX_PLATFORMS=tpu,cpu ENABLE_PJRT_COMPATIBILITY=true TPU_SLICE_BUILDER_DUMP_CHIP_FORCE=true TPU_SLICE_BUILDER_DUMP_ICI=true JAX_FORCE_TPU_INIT=true ENABLE_TPUNETD_CLIENT=true && "
-            f"pip install . && python src/maxdiffusion/train.py src/maxdiffusion/configs/base_2_base.yml "
-            f"run_name={slice_num}slice-V{cluster.device_version}_{cores}-maxdiffusion-jax-stable-stack-{current_datetime} "
-            f"output_dir={gcs_bucket.BASE_OUTPUT_DIR}/maxdiffusion/jax-stable-stack/automated/{current_datetime}",
-        ),
-        test_name=f"maxdiffusion-jax-stable-stack-{accelerator}-{slice_num}x",
-        docker_image=DockerImage.MAXDIFFUSION_TPU_JAX_STABLE_STACK.value,
-        test_owner=test_owner.PARAM_B,
-    ).run_with_quarantine(quarantine_task_group)
+    for mode, image in docker_images:
+      maxdiffusion_jax_stable_stack_test = config.get_gke_config(
+          num_slices=slice_num,
+          cluster=cluster,
+          time_out_in_min=60,
+          run_model_cmds=(
+              f"JAX_PLATFORMS=tpu,cpu ENABLE_PJRT_COMPATIBILITY=true TPU_SLICE_BUILDER_DUMP_CHIP_FORCE=true TPU_SLICE_BUILDER_DUMP_ICI=true JAX_FORCE_TPU_INIT=true ENABLE_TPUNETD_CLIENT=true && "
+              f"pip install . && python src/maxdiffusion/train.py src/maxdiffusion/configs/base_2_base.yml "
+              f"run_name={slice_num}slice-V{cluster.device_version}_{cores}-maxdiffusion-jax-stable-stack-{current_datetime} "
+              f"output_dir={gcs_bucket.BASE_OUTPUT_DIR}/maxdiffusion/jax-stable-stack/automated/{current_datetime}",
+          ),
+          test_name=f"maxdiffusion-jax-stable-stack-{mode.value}-{accelerator}-{slice_num}x",
+          docker_image=image.value,
+          test_owner=test_owner.PARAM_B,
+      ).run_with_quarantine(quarantine_task_group)
 
 for accelerator, slices in axlearn_test_configs.items():
   cores = accelerator.rsplit("-", maxsplit=1)[-1]
diff --git a/dags/vm_resource.py b/dags/vm_resource.py
index 4459be63..01f4638b 100644
--- a/dags/vm_resource.py
+++ b/dags/vm_resource.py
@@ -303,6 +303,10 @@ class DockerImage(enum.Enum):
       "gcr.io/tpu-prod-env-multipod/maxtext_jax_stable_stack_0.4.35:"
       f"{datetime.datetime.today().strftime('%Y-%m-%d')}"
   )
+  MAXTEXT_TPU_STABLE_STACK_NIGHTLY_JAX = (
+      "gcr.io/tpu-prod-env-multipod/maxtext_stable_stack_nightly_jax:"
+      f"{datetime.datetime.today().strftime('%Y-%m-%d')}"
+  )
   MAXDIFFUSION_TPU_JAX_STABLE_STACK = (
       "gcr.io/tpu-prod-env-multipod/maxdiffusion_jax_stable_stack_0.4.35:"
       f"{datetime.datetime.today().strftime('%Y-%m-%d')}"
@@ -323,8 +327,8 @@
       "gcr.io/tpu-prod-env-multipod/maxtext_gpu_jax_stable_stack_0.4.35:"
       f"{datetime.datetime.today().strftime('%Y-%m-%d')}"
   )
-  MAXTEXT_GPU_JAX_STABLE_STACK_NIGHTLY = (
-      "gcr.io/tpu-prod-env-multipod/maxtext_gpu_jax_stable_stack_nightly:"
+  MAXTEXT_GPU_STABLE_STACK_NIGHTLY_JAX = (
+      "gcr.io/tpu-prod-env-multipod/maxtext_gpu_stable_stack_nightly_jax:"
       f"{datetime.datetime.today().strftime('%Y-%m-%d')}"
   )
   MAXTEXT_GPU_JAX_NIGHTLY = (

From 12a6b809600261db1855dac9f4c3243be77636b1 Mon Sep 17 00:00:00 2001
From: Yijia
Date: Mon, 18 Nov 2024 10:22:21 -0800
Subject: [PATCH 11/26] add gemma (#481)

---
 dags/inference/configs/trt_llm_inference_config.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/dags/inference/configs/trt_llm_inference_config.py b/dags/inference/configs/trt_llm_inference_config.py
index b13f3ebf..913f29b6 100644
--- a/dags/inference/configs/trt_llm_inference_config.py
+++ b/dags/inference/configs/trt_llm_inference_config.py
@@ -57,6 +57,7 @@ def get_trt_llm_gpu_config(
       "gsutil -m cp -r gs://tohaowu/llama_3_8B_Instruct_HF_model .",
       "gsutil -m cp -r gs://tohaowu/llama_3.1_70B_Instruct_HF_model .",
       "gsutil -m cp -r gs://tohaowu/Mixtral-8x22B-Instruct-v0.1 .",
+      "gsutil -m cp -r gs://yijiaj/gemma/gemma-2-27b-it .",
      "sudo apt-get update",
      "sudo apt-get -y install git git-lfs",
      "git 
clone https://github.com/NVIDIA/TensorRT-LLM.git", @@ -103,10 +104,14 @@ def get_trt_llm_gpu_config( "trtllm-build --checkpoint_dir /scratch/tllm_checkpoint_8gpu_tp8 --output_dir /scratch/llama/70B/trt_engines/fp16/8-gpu/ --gemm_plugin auto", "python ../llama/convert_checkpoint.py --model_dir /scratch/Mixtral-8x22B-Instruct-v0.1 --output_dir /scratch/tllm_checkpoint_mixtral_8gpu --dtype float16 --tp_size 8 --moe_tp_size 2 --moe_ep_size 4", "trtllm-build --checkpoint_dir /scratch/tllm_checkpoint_mixtral_8gpu --output_dir /scratch/trt_engines/mixtral/tp2ep4", + "cd ../gemma", + "python3 convert_checkpoint.py --ckpt-type hf --model-dir /scratch/gemma-2-27b-it/ --dtype bfloat16 --world-size 1 --output-model-dir /scratch/checkpoints/tmp_27b_it_tensorrt_llm/bf16/tp1/", + "trtllm-build --checkpoint_dir /scratch/checkpoints/tmp_27b_it_tensorrt_llm/bf16/tp1/ --gemm_plugin auto --max_batch_size 8 --max_input_len 3000 --max_seq_len 3100 --output_dir /scratch/gemma2/27b/bf16/1-gpu/", "cd ../../benchmarks/python", "python benchmark.py -m dec --engine_dir /scratch/llama/8B/trt_engines/fp16/1-gpu/ --csv", "OMPI_ALLOW_RUN_AS_ROOT=1 OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 mpirun -n 8 python benchmark.py -m dec --engine_dir /scratch/llama/70B/trt_engines/fp16/8-gpu/ --csv", "OMPI_ALLOW_RUN_AS_ROOT=1 OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 mpirun -n 8 python benchmark.py -m dec --engine_dir /scratch/trt_engines/mixtral/tp2ep4 --csv", + "python benchmark.py -m dec --engine_dir /scratch/gemma2/27b/bf16/1-gpu/ --dtype bfloat16 --csv", make_jsonl_convert_cmd, f"python jsonl_converter.py {jsonl_output_path}", ) From 2a3a8e81525ccb086cb586a94f29677eac00019e Mon Sep 17 00:00:00 2001 From: wenxindongwork <161090399+wenxindongwork@users.noreply.github.com> Date: Mon, 25 Nov 2024 10:58:24 -0800 Subject: [PATCH 12/26] Use smaller embedding size as 32 leads to segmentation fault (#487) --- dags/solutions_team/solutionsteam_tf_nightly_supported.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dags/solutions_team/solutionsteam_tf_nightly_supported.py b/dags/solutions_team/solutionsteam_tf_nightly_supported.py index 13abe496..c2bebeed 100644 --- a/dags/solutions_team/solutionsteam_tf_nightly_supported.py +++ b/dags/solutions_team/solutionsteam_tf_nightly_supported.py @@ -112,7 +112,7 @@ runtime_version=RuntimeVersion.TPU_VM_TF_NIGHTLY_POD.value, ) - embedding_dim = 32 + embedding_dim = 16 tf_dlrm_v5p_8 = tf_config.get_tf_dlrm_config( project_name=Project.TPU_PROD_ENV_AUTOMATED.value, tpu_version=TpuVersion.V5P, From 32b056a7e5a9923e8173b6771b3778b0a7102ed5 Mon Sep 17 00:00:00 2001 From: Richard Liu <39319471+richardsliu@users.noreply.github.com> Date: Mon, 25 Nov 2024 11:18:23 -0800 Subject: [PATCH 13/26] Update CODEOWNERS - add richardsliu as owner (#488) --- .github/CODEOWNERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index fd1f8567..e68ed421 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,8 +1,8 @@ # Default owners for everything in the repo, unless a later match takes precedence. 
* @mbzomowski @RissyRan @allenwang28
 
-dags/solutions_team/configs/tensorflow @chandrasekhard2 @ZhaoyueCheng
-dags/solutions_team/solutionsteam_tf* @chandrasekhard2 @ZhaoyueCheng
+dags/solutions_team/configs/tensorflow @chandrasekhard2 @ZhaoyueCheng @richardsliu
+dags/solutions_team/solutionsteam_tf* @chandrasekhard2 @ZhaoyueCheng @richardsliu
 
 dags/pytorch_xla @JackCaoG @vanbasten23 @zpcore @ManfeiBai
 dags/legacy_test/tests/pytorch @JackCaoG @vanbasten23 @zpcore @ManfeiBai

From fbb1de9a78c7d9952479cb7d5ac035dabb44b6fb Mon Sep 17 00:00:00 2001
From: wenxindongwork <161090399+wenxindongwork@users.noreply.github.com>
Date: Mon, 25 Nov 2024 11:28:24 -0800
Subject: [PATCH 14/26] Oncall test fix (#486)

* change cluster

change cluster

* typo

* The -f flag makes rm not error if the file doesn't exist.

---------

Co-authored-by: chandrasekhard2 <98771505+chandrasekhard2@users.noreply.github.com>
---
 dags/solutions_team/configs/tensorflow/common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dags/solutions_team/configs/tensorflow/common.py b/dags/solutions_team/configs/tensorflow/common.py
index 3972e227..beee93d5 100644
--- a/dags/solutions_team/configs/tensorflow/common.py
+++ b/dags/solutions_team/configs/tensorflow/common.py
@@ -19,7 +19,7 @@
 
 CMD_PRINT_TF_VERSION = "python3 -c \"import tensorflow; print('Running using TensorFlow Version: ' + tensorflow.__version__)\""
 
-CMD_REMOVE_LIBTPU_LOCKFILE = "sudo rm /tmp/libtpu_lockfile"
+CMD_REMOVE_LIBTPU_LOCKFILE = "sudo rm -f /tmp/libtpu_lockfile"
 
 CMD_INSTALL_KERAS_NIGHTLY = (
     "pip install --upgrade --no-deps --force-reinstall tf-keras-nightly"
 )

From 10936b142fd67454e0d920c66bf9b920fd5b01bf Mon Sep 17 00:00:00 2001
From: Richard Liu <39319471+richardsliu@users.noreply.github.com>
Date: Mon, 25 Nov 2024 18:06:55 -0800
Subject: [PATCH 15/26] fix nightly (#489)

---
 dags/solutions_team/solutionsteam_tf_nightly_supported.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/dags/solutions_team/solutionsteam_tf_nightly_supported.py b/dags/solutions_team/solutionsteam_tf_nightly_supported.py
index c2bebeed..d79bfdf1 100644
--- a/dags/solutions_team/solutionsteam_tf_nightly_supported.py
+++ b/dags/solutions_team/solutionsteam_tf_nightly_supported.py
@@ -41,6 +41,7 @@
       tpu_zone=Zone.US_CENTRAL1_C.value,
       time_out_in_min=60,
       global_batch_size=1024,
+      runtime_version=RuntimeVersion.V2_ALPHA_TPUV5.value,
   )
 
   tf_resnet_v3_8 = tf_config.get_tf_resnet_config(
@@ -48,6 +49,7 @@
       tpu_cores=8,
       tpu_zone=Zone.US_EAST1_D.value,
       time_out_in_min=60,
+      runtime_version=RuntimeVersion.V2_ALPHA_TPUV5.value,
   )
 
   tf_resnet_v4_8 = tf_config.get_tf_resnet_config(
@@ -55,6 +57,7 @@
       tpu_cores=8,
       tpu_zone=Zone.US_CENTRAL2_B.value,
       time_out_in_min=60,
+      runtime_version=RuntimeVersion.V2_ALPHA_TPUV5.value,
   )
 
   tf_resnet_v4_32 = tf_config.get_tf_resnet_config(

From b99cc999d171e3a2f5a991adaa0c33901a32a081 Mon Sep 17 00:00:00 2001
From: Ran Ran
Date: Tue, 26 Nov 2024 11:05:01 -0800
Subject: [PATCH 16/26] Add ruamel dependency (#490)

---
 deployment/modules/composer_env/main.tf | 1 +
 1 file changed, 1 insertion(+)

diff --git a/deployment/modules/composer_env/main.tf b/deployment/modules/composer_env/main.tf
index f9718748..0288eca4 100644
--- a/deployment/modules/composer_env/main.tf
+++ b/deployment/modules/composer_env/main.tf
@@ -20,6 +20,7 @@ resource "google_composer_environment" "example_environment" {
         google-cloud-tpu = ">=1.16.0"
         jsonlines        = ""
         ray              = "[default]"
+        "ruamel.yaml"    = ""
 
         # These packages are already in the default composer environment.
# See https://cloud.google.com/composer/docs/concepts/versioning/composer-versions # google-cloud-bigquery = "" From cfbb0acc0cd959be1a7834bc8ba217c6e33f57cf Mon Sep 17 00:00:00 2001 From: Richard Liu <39319471+richardsliu@users.noreply.github.com> Date: Tue, 26 Nov 2024 16:10:43 -0800 Subject: [PATCH 17/26] Fix one test for nightly (#491) Updating TF runtime environment for v4-32 in nightly test --- dags/solutions_team/solutionsteam_tf_nightly_supported.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dags/solutions_team/solutionsteam_tf_nightly_supported.py b/dags/solutions_team/solutionsteam_tf_nightly_supported.py index d79bfdf1..fb67088a 100644 --- a/dags/solutions_team/solutionsteam_tf_nightly_supported.py +++ b/dags/solutions_team/solutionsteam_tf_nightly_supported.py @@ -66,7 +66,7 @@ tpu_zone=Zone.US_CENTRAL2_B.value, time_out_in_min=60, is_pod=True, - runtime_version=RuntimeVersion.TPU_VM_TF_NIGHTLY_POD.value, + runtime_version=RuntimeVersion.V2_ALPHA_TPUV5.value, ) tf_resnet_v5e_4 = tf_config.get_tf_resnet_config( From e27c58a587270e49a5b07357582669e471c8532e Mon Sep 17 00:00:00 2001 From: Gunjan Jalori <39437795+gunjanj007@users.noreply.github.com> Date: Wed, 27 Nov 2024 10:50:11 -0800 Subject: [PATCH 18/26] Add GoB repo cloning mechanism (#484) * add GoB cloning commands * reformat * format again * format again * format again * address comments * address comments * address comments --- .../aotc_reproducibility.py | 43 +++++++++++++++++-- dags/map_reproducibility/nemo_gpt3.py | 29 ++++++------- 2 files changed, 52 insertions(+), 20 deletions(-) diff --git a/dags/map_reproducibility/aotc_reproducibility.py b/dags/map_reproducibility/aotc_reproducibility.py index eff156ab..64478503 100644 --- a/dags/map_reproducibility/aotc_reproducibility.py +++ b/dags/map_reproducibility/aotc_reproducibility.py @@ -24,12 +24,11 @@ def set_variables_cmds(): "export CLUSTER_REGION=australia-southeast1", "NOW=$(date +%s)", "export BUCKET_NAME=regression-testing-xlml", - "export JOB_NAME=gpt3-xlml-$NOW-175b-nemo", ) return set_variables -def set_project_commands(): +def configure_project_and_cluster(): set_project_command = ( "gcloud config set project $PROJECT", "sudo chown -R airflow:airflow /home/airflow/composer_kube_config", @@ -39,6 +38,28 @@ def set_project_commands(): return set_project_command +# This is required to get auth to access +# internal GoB repo +def git_cookie_authdaemon(): + auth_cmds = ( + "git clone https://gerrit.googlesource.com/gcompute-tools", + "echo 'trying to run git-cookie-authdaemon'", + "./gcompute-tools/git-cookie-authdaemon", + ) + return auth_cmds + + +def clone_gob(): + gob_clone_cmds = ( + "echo 'trying to clone GoB repo from outside'", + "git clone https://ai-hypercomputer-benchmarks.googlesource.com/" + "reproducible-benchmark-recipes", + "cd reproducible-benchmark-recipes/projects", + "cd gpu-recipes", + ) + return gob_clone_cmds + + def install_helm_cmds(): install_helm_cmd = ( "curl -fsSL -o get_helm.sh " @@ -57,11 +78,27 @@ def namespace_cmds(): namespace = ( "kubectl config view | grep namespace", "kubectl config set-context --current --namespace=default", - "kubectl config set-context heml --namespace=default", + "kubectl config set-context helm --namespace=default", ) return namespace +def helm_install_cmds(): + helm_cmds = ( + " helm install -f values.yaml " + "--namespace default " + "--set namespace=default" + " --set-file nemo_config" + "=$CONFIG_FILE" + " --set workload.image" + "=us-central1-docker.pkg.dev/" + 
"supercomputer-testing/gunjanjalori/nemo_test/nemo_workload:24.07" + " --set workload.gcsBucketForDataCataPath=$BUCKET_NAME" + " $JOB_NAME $REPO_ROOT/src/helm-charts/nemo-training", + ) + return helm_cmds + + def wait_for_jobs_cmds(): wait_for_job = ( "echo 'will wait for job to start running'", diff --git a/dags/map_reproducibility/nemo_gpt3.py b/dags/map_reproducibility/nemo_gpt3.py index ff03fdaf..74d3cc0f 100644 --- a/dags/map_reproducibility/nemo_gpt3.py +++ b/dags/map_reproducibility/nemo_gpt3.py @@ -21,12 +21,15 @@ from dags import composer_env from dags.map_reproducibility.aotc_reproducibility import get_metrics_cmds from dags.map_reproducibility.aotc_reproducibility import set_variables_cmds -from dags.map_reproducibility.aotc_reproducibility import set_project_commands +from dags.map_reproducibility.aotc_reproducibility import configure_project_and_cluster from dags.map_reproducibility.aotc_reproducibility import install_helm_cmds from dags.map_reproducibility.aotc_reproducibility import namespace_cmds from dags.map_reproducibility.aotc_reproducibility import wait_for_jobs_cmds from dags.map_reproducibility.aotc_reproducibility import copy_bucket_cmds from dags.map_reproducibility.aotc_reproducibility import cleanup_cmds +from dags.map_reproducibility.aotc_reproducibility import git_cookie_authdaemon +from dags.map_reproducibility.aotc_reproducibility import clone_gob +from dags.map_reproducibility.aotc_reproducibility import helm_install_cmds # Run once a day at 2 pm UTC (6 am PST) SCHEDULED_TIME = "0 14 * * *" if composer_env.is_prod_env() else None @@ -35,27 +38,16 @@ @task def run_aotc_workload(): gpu_recipe_cmd = ( - "git clone https://github.com/ai-hypercomputer/gpu-recipes.git", - "cd gpu-recipes", - "export REPO_ROOT=`git rev-parse --show-toplevel`", + "export REPO_ROOT=`pwd`", "export RECIPE_ROOT=" "$REPO_ROOT/training/a3mega/gpt3-175b/nemo-pretraining-gke", "cd $RECIPE_ROOT", ) - helm_cmds = ( + workload_cmds = ( "CONFIG_FILE=$REPO_ROOT/src/frameworks" "/nemo-configs/gpt3-175b-256gpus-fp8.yaml", - " helm install -f values.yaml " - "--namespace default " - "--set namespace=default" - " --set-file nemo_config" - "=$CONFIG_FILE" - " --set workload.image" - "=us-central1-docker.pkg.dev/" - "supercomputer-testing/gunjanjalori/nemo_test/nemo_workload:24.07" - " --set workload.gcsBucketForDataCataPath=$BUCKET_NAME" - " $JOB_NAME $REPO_ROOT/src/helm-charts/nemo-training", + "export JOB_NAME=gpt3-xlml-$NOW-175b-nemo", ) hook = SubprocessHook() @@ -65,11 +57,14 @@ def run_aotc_workload(): "-c", ";".join( set_variables_cmds() - + set_project_commands() + + configure_project_and_cluster() + + git_cookie_authdaemon() + + clone_gob() + gpu_recipe_cmd + install_helm_cmds() + namespace_cmds() - + helm_cmds + + workload_cmds + + helm_install_cmds() + wait_for_jobs_cmds() + copy_bucket_cmds() + get_metrics_cmds() From 0ccefe31d9538361aad4556bdfc9cbb7861d0f50 Mon Sep 17 00:00:00 2001 From: Richard Liu <39319471+richardsliu@users.noreply.github.com> Date: Wed, 27 Nov 2024 14:30:32 -0800 Subject: [PATCH 19/26] fix benchmark tests (#492) --- .../configs/vllm/vllm_benchmark_config.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/dags/solutions_team/configs/vllm/vllm_benchmark_config.py b/dags/solutions_team/configs/vllm/vllm_benchmark_config.py index 5cb27ba1..b822e1b3 100644 --- a/dags/solutions_team/configs/vllm/vllm_benchmark_config.py +++ b/dags/solutions_team/configs/vllm/vllm_benchmark_config.py @@ -45,6 +45,7 @@ def get_vllm_gpu_setup_cmds(): # Download 
dataset "wget --no-verbose https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json", # Download benchmark + "pip install --upgrade google-cloud-storage", "rm -rf ai-on-gke && git clone https://github.com/GoogleCloudPlatform/ai-on-gke", ) return setup_cmds @@ -65,21 +66,13 @@ def get_vllm_tpu_setup_cmds(): "cd vllm", # From https://docs.vllm.ai/en/latest/getting_started/tpu-installation.html "pip uninstall torch torch-xla -y", - 'export DATE="20240828"', - 'export TORCH_VERSION="2.5.0"', - "pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-${TORCH_VERSION}.dev${DATE}-cp310-cp310-linux_x86_64.whl", - "pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-${TORCH_VERSION}.dev${DATE}-cp310-cp310-linux_x86_64.whl", - # Install JAX and Pallas. - "pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html", - "pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html", - # Install other build dependencies. - "pip install setuptools-scm>=8", "pip install -r requirements-tpu.txt", # Build vLLM 'VLLM_TARGET_DEVICE="tpu" python setup.py develop', # Download dataset "cd .. && wget --no-verbose https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json", # Download benchmark + "pip install --upgrade google-cloud-storage", "rm -rf ai-on-gke && git clone https://github.com/GoogleCloudPlatform/ai-on-gke", ) From aea739eb7db9455d84d81caac7117eb0e2dab764 Mon Sep 17 00:00:00 2001 From: Ran Ran Date: Wed, 27 Nov 2024 17:00:54 -0800 Subject: [PATCH 20/26] Update MoE test config (#493) --- dags/sparsity_diffusion_devx/maxtext_moe_tpu_e2e.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dags/sparsity_diffusion_devx/maxtext_moe_tpu_e2e.py b/dags/sparsity_diffusion_devx/maxtext_moe_tpu_e2e.py index 3430f2a0..0c408fa7 100644 --- a/dags/sparsity_diffusion_devx/maxtext_moe_tpu_e2e.py +++ b/dags/sparsity_diffusion_devx/maxtext_moe_tpu_e2e.py @@ -57,7 +57,7 @@ { "script_name": "tpu/mixtral/8x7b/2_test_mixtral", "cluster": XpkClusters.TPU_V4_128_CLUSTER, - "time_out_in_min": 60, + "time_out_in_min": 90, }, ], "mixtral-8x22b": [ @@ -68,7 +68,7 @@ }, { "script_name": "tpu/mixtral/8x22b/2_test_mixtral", - "cluster": XpkClusters.TPU_V5E_256_CLUSTER, + "cluster": XpkClusters.TPU_V4_128_CLUSTER, "time_out_in_min": 60, }, ], From d9a509cb43a753e518aa118a89aa41266b2f0bb4 Mon Sep 17 00:00:00 2001 From: Orti Bazar Date: Mon, 2 Dec 2024 11:16:00 -0800 Subject: [PATCH 21/26] Create mlcompass_maxtext_gke dag (#485) * Create mlcompass_maxtext_gke dag * Add dag description --- .github/requirements.txt | 1 + dags/mlcompass/maxtext_gke.py | 127 ++++++++++++++++++++++++++++++++++ xlml/apis/task.py | 4 +- 3 files changed, 131 insertions(+), 1 deletion(-) create mode 100644 dags/mlcompass/maxtext_gke.py diff --git a/.github/requirements.txt b/.github/requirements.txt index 5cc2d0ca..ebf875bf 100644 --- a/.github/requirements.txt +++ b/.github/requirements.txt @@ -9,3 +9,4 @@ jsonlines tensorflow-cpu kubernetes pyarrow +apache-airflow-providers-google diff --git a/dags/mlcompass/maxtext_gke.py b/dags/mlcompass/maxtext_gke.py new file mode 100644 index 00000000..660da3a7 --- /dev/null +++ b/dags/mlcompass/maxtext_gke.py @@ -0,0 +1,127 @@ +# Copyright 2024 Google LLC 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This Airflow DAG runs a maxtext machine learning benchmark on a GKE cluster + +Usage: +gcloud composer environments run ml-automation-solutions \ + --project=cloud-ml-auto-solutions \ + --location=us-central1 dags trigger \ + -- \ + mlcompass_maxtext_gke \ + --conf={\\\"uuid\\\":\\\"abc\\\"} 70 +""" + +import datetime +from airflow import models +from airflow.decorators import task +from airflow.providers.google.cloud.hooks.gcs import GCSHook +from xlml.apis.xpk_cluster_config import XpkClusterConfig +from dags import test_owner +from dags.vm_resource import Project, XpkClusters +from xlml.apis import gcp_config, metric_config, task as xlml_task, test_config +import json + + +def get_config_gke( + docker_image: str, + model_name: str, + base_output_directory: str, + task_owner: str = test_owner.ORTI_B, + cluster: XpkClusterConfig = XpkClusters.TPU_V4_8_MAXTEXT_CLUSTER, + time_out_in_min: int = 60, + num_slices: int = 1, + dataset_name: metric_config.DatasetOption = metric_config.DatasetOption.XLML_DATASET, + dataset_project: str = Project.CLOUD_ML_AUTO_SOLUTIONS.value, + composer_project: str = Project.CLOUD_ML_AUTO_SOLUTIONS.value, +) -> xlml_task.XpkTask: + job_gcp_config = gcp_config.GCPConfig( + project_name=cluster.project, + zone=cluster.zone, + dataset_name=dataset_name, + dataset_project=dataset_project, + composer_project=composer_project, + ) + job_test_config = test_config.TpuGkeTest( + test_config.Tpu( + version=cluster.device_version, + cores=cluster.core_count, + ), + test_name="maxtext", + run_model_cmds=[ + f"source benchmark_run.sh;run {model_name} {base_output_directory}", + ], + set_up_cmds=None, + timeout=datetime.timedelta(minutes=time_out_in_min), + task_owner=task_owner, + num_slices=num_slices, + cluster_name=cluster.name, + docker_image=docker_image, + ) + return xlml_task.XpkTask( + task_test_config=job_test_config, + task_gcp_config=job_gcp_config, + ) + + +with models.DAG( + dag_id="mlcompass_maxtext_gke", + schedule=None, + tags=["mlcompass", "maxtext"], + start_date=datetime.datetime(2024, 9, 1), + catchup=False, + params={ + "uuid": "", + }, + default_args={ + "retries": 0, + }, +) as dag: + + @task.python + def load_xlml_state(params: dict = None): + dag.log.info(params) + uuid = params["uuid"] + if not uuid: + raise RuntimeError("uuid is not set") + gcs_hook = GCSHook() + file_content = gcs_hook.download( + "mlcompass-jax-artifacts", f"xlml/{uuid}/xlml_state.json" + ) + return json.loads(file_content) + + @task.python + def get_docker_image_path(state: dict) -> str: + return state["docker_image_path"] + + @task.python + def get_model_name(state: dict) -> str: + return state["model_name"] + + @task.python + def get_base_output_directory(state: dict) -> str: + bucket = state["workdir_bucket"] + path = state["workdir_path"] + return f"gs://{bucket}/{path}" + + xlml_state = load_xlml_state() + docker_image_path = get_docker_image_path(xlml_state) + model_name_arg = get_model_name(xlml_state) + 
base_output_directory_arg = get_base_output_directory(xlml_state) + + default_benchmark = get_config_gke( + docker_image=docker_image_path, + model_name=model_name_arg, + base_output_directory=base_output_directory_arg, + ).run(skip_post_process=True) diff --git a/xlml/apis/task.py b/xlml/apis/task.py index 69806466..117bc3c8 100644 --- a/xlml/apis/task.py +++ b/xlml/apis/task.py @@ -171,6 +171,7 @@ def run( gcs_location: Optional[airflow.XComArg] = None, use_vertex_tensorboard: bool = False, use_pathways: bool = False, + skip_post_process: bool = False, ) -> DAGNode: """Run a test job within a docker image. @@ -187,7 +188,8 @@ def run( run_model, gcs_path = self.run_model( gcs_location, use_vertex_tensorboard, use_pathways ) - run_model >> self.post_process(gcs_path) + if not skip_post_process: + run_model >> self.post_process(gcs_path) return group From 9ccaedbcd7e9bf1f03a7ecea468ee8a191c5823e Mon Sep 17 00:00:00 2001 From: wenxindongwork <161090399+wenxindongwork@users.noreply.github.com> Date: Wed, 4 Dec 2024 14:48:28 -0800 Subject: [PATCH 22/26] Remove TF SE tests (#496) * Use smaller embedding size as 32 leads to segmentation fault * Remove TF SE Nightly tests --- .../solutionsteam_tf_se_nightly_supported.py | 158 ------------------ 1 file changed, 158 deletions(-) delete mode 100644 dags/solutions_team/solutionsteam_tf_se_nightly_supported.py diff --git a/dags/solutions_team/solutionsteam_tf_se_nightly_supported.py b/dags/solutions_team/solutionsteam_tf_se_nightly_supported.py deleted file mode 100644 index ecb51554..00000000 --- a/dags/solutions_team/solutionsteam_tf_se_nightly_supported.py +++ /dev/null @@ -1,158 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""A DAG to run all supported ML models with the nightly TensorFlow version.""" - -import datetime -from airflow import models -from dags import composer_env -from dags.vm_resource import TpuVersion, Project, Zone, RuntimeVersion, V5_NETWORKS, V5E_SUBNETWORKS, V5P_SUBNETWORKS -from dags.solutions_team.configs.tensorflow import solutionsteam_tf_nightly_supported_config as tf_config -from dags.solutions_team.configs.tensorflow import common - - -# Run once a day at 4 pm UTC (8 am PST) -SCHEDULED_TIME = "0 16 * * *" if composer_env.is_prod_env() else None - -with models.DAG( - dag_id="tf_se_nightly_supported", - schedule=SCHEDULED_TIME, - tags=["solutions_team", "tf", "se", "nightly", "supported", "xlml"], - start_date=datetime.datetime(2024, 1, 4), - catchup=False, -) as dag: - # ResNet - tf_resnet_v2_8 = tf_config.get_tf_resnet_config( - tpu_version=TpuVersion.V2, - tpu_cores=8, - tpu_zone=Zone.US_CENTRAL1_C.value, - time_out_in_min=60, - global_batch_size=1024, - is_pjrt=False, - runtime_version=RuntimeVersion.TPU_VM_TF_NIGHTLY.value, - ) - - tf_resnet_v2_32 = tf_config.get_tf_resnet_config( - tpu_version=TpuVersion.V2, - tpu_cores=32, - tpu_zone=Zone.US_CENTRAL1_A.value, - time_out_in_min=60, - global_batch_size=1024, - is_pod=True, - is_pjrt=False, - runtime_version=RuntimeVersion.TPU_VM_TF_NIGHTLY_POD.value, - ) - - tf_resnet_v3_8 = tf_config.get_tf_resnet_config( - tpu_version=TpuVersion.V3, - tpu_cores=8, - tpu_zone=Zone.US_EAST1_D.value, - time_out_in_min=60, - is_pjrt=False, - runtime_version=RuntimeVersion.TPU_VM_TF_NIGHTLY.value, - ) - - tf_resnet_v3_32 = tf_config.get_tf_resnet_config( - tpu_version=TpuVersion.V3, - tpu_cores=32, - tpu_zone=Zone.US_EAST1_D.value, - time_out_in_min=60, - is_pod=True, - is_pjrt=False, - runtime_version=RuntimeVersion.TPU_VM_TF_NIGHTLY_POD.value, - ) - - tf_resnet_v4_8 = tf_config.get_tf_resnet_config( - tpu_version=TpuVersion.V4, - tpu_cores=8, - tpu_zone=Zone.US_CENTRAL2_B.value, - time_out_in_min=60, - is_pjrt=False, - runtime_version=RuntimeVersion.TPU_VM_TF_NIGHTLY.value, - ) - tf_resnet_v4_32 = tf_config.get_tf_resnet_config( - tpu_version=TpuVersion.V4, - tpu_cores=32, - tpu_zone=Zone.US_CENTRAL2_B.value, - time_out_in_min=60, - is_pod=True, - is_pjrt=False, - runtime_version=RuntimeVersion.TPU_VM_TF_NIGHTLY_POD.value, - ) - - # DLRM - embedding_dim = 16 - tf_dlrm_v2_8 = tf_config.get_tf_dlrm_config( - tpu_version=TpuVersion.V2, - tpu_cores=8, - tpu_zone=Zone.US_CENTRAL1_C.value, - time_out_in_min=120, - bottom_mlp=[512, 256, embedding_dim], - embedding_dim=embedding_dim, - train_steps=10000, - extraFlags="--mode=train", - is_pjrt=False, - runtime_version=RuntimeVersion.TPU_VM_TF_NIGHTLY.value, - ) - - embedding_dim = 64 - tf_dlrm_v2_32 = tf_config.get_tf_dlrm_config( - tpu_version=TpuVersion.V2, - tpu_cores=32, - tpu_zone=Zone.US_CENTRAL1_A.value, - time_out_in_min=120, - bottom_mlp=[512, 256, embedding_dim], - embedding_dim=embedding_dim, - train_steps=256054, - extraFlags="--mode=train_and_eval", - is_pod=True, - is_pjrt=False, - runtime_version=RuntimeVersion.TPU_VM_TF_NIGHTLY_POD.value, - ) - - embedding_dim = 64 - tf_dlrm_v4_8 = tf_config.get_tf_dlrm_config( - tpu_version=TpuVersion.V4, - tpu_cores=8, - tpu_zone=Zone.US_CENTRAL2_B.value, - time_out_in_min=120, - bottom_mlp=[512, 256, embedding_dim], - embedding_dim=embedding_dim, - train_steps=10000, - extraFlags="--mode=train", - is_pjrt=False, - runtime_version=RuntimeVersion.TPU_VM_TF_NIGHTLY.value, - ) - - embedding_dim = 128 - tf_dlrm_v4_32 = tf_config.get_tf_dlrm_config( - 
tpu_version=TpuVersion.V4, - tpu_cores=32, - tpu_zone=Zone.US_CENTRAL2_B.value, - time_out_in_min=120, - bottom_mlp=[512, 256, embedding_dim], - embedding_dim=embedding_dim, - train_steps=128000, - extraFlags="--mode=train_and_eval", - is_pod=True, - is_pjrt=False, - runtime_version=RuntimeVersion.TPU_VM_TF_NIGHTLY_POD.value, - ) - - # Test dependencies - tf_resnet_v2_8 >> tf_resnet_v2_32 - tf_resnet_v3_8 >> tf_resnet_v3_32 - tf_resnet_v4_8 >> tf_resnet_v4_32 - tf_dlrm_v2_8 >> tf_dlrm_v2_32 - tf_dlrm_v4_8 >> tf_dlrm_v4_32 From c05df73ef85c493ba18e482be12c3497019e23c6 Mon Sep 17 00:00:00 2001 From: wenxindongwork <161090399+wenxindongwork@users.noreply.github.com> Date: Wed, 4 Dec 2024 15:00:25 -0800 Subject: [PATCH 23/26] Use smaller batch size for v5p test (#497) --- .../tensorflow/solutionsteam_tf_nightly_supported_config.py | 5 +++-- dags/solutions_team/solutionsteam_tf_nightly_supported.py | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/dags/solutions_team/configs/tensorflow/solutionsteam_tf_nightly_supported_config.py b/dags/solutions_team/configs/tensorflow/solutionsteam_tf_nightly_supported_config.py index 681c7711..3a4d78de 100644 --- a/dags/solutions_team/configs/tensorflow/solutionsteam_tf_nightly_supported_config.py +++ b/dags/solutions_team/configs/tensorflow/solutionsteam_tf_nightly_supported_config.py @@ -200,6 +200,7 @@ def get_tf_dlrm_config( criteo_dir: str = gcs_bucket.CRITEO_DIR, network: str = "default", subnetwork: str = "default", + global_batch_size=16384, ): job_gcp_config = gcp_config.GCPConfig( project_name=project_name, @@ -233,11 +234,11 @@ def get_tf_dlrm_config( "use_tf_record_reader": "true", "train_data": { "input_path": "gs://zyc_dlrm/dataset/tb_tf_record_train_val/train/day_*/*", - "global_batch_size": 16384, + "global_batch_size": global_batch_size, }, "validation_data": { "input_path": "gs://zyc_dlrm/dataset/tb_tf_record_train_val/eval/day_*/*", - "global_batch_size": 16384, + "global_batch_size": global_batch_size, }, "model": { "interaction": "multi_layer_dcn", diff --git a/dags/solutions_team/solutionsteam_tf_nightly_supported.py b/dags/solutions_team/solutionsteam_tf_nightly_supported.py index fb67088a..fb0aa3ad 100644 --- a/dags/solutions_team/solutionsteam_tf_nightly_supported.py +++ b/dags/solutions_team/solutionsteam_tf_nightly_supported.py @@ -131,6 +131,7 @@ network=V5_NETWORKS, subnetwork=V5P_SUBNETWORKS, runtime_version=RuntimeVersion.V2_ALPHA_TPUV5.value, + global_batch_size=8196, ) embedding_dim = 128 From 15da729ec6b520fb457fb9f79f374c26c13191eb Mon Sep 17 00:00:00 2001 From: wenxindongwork <161090399+wenxindongwork@users.noreply.github.com> Date: Thu, 5 Dec 2024 10:11:11 -0800 Subject: [PATCH 24/26] Update solutionsteam_tf_nightly_supported.py (#498) --- dags/solutions_team/solutionsteam_tf_nightly_supported.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dags/solutions_team/solutionsteam_tf_nightly_supported.py b/dags/solutions_team/solutionsteam_tf_nightly_supported.py index fb0aa3ad..5e8c72df 100644 --- a/dags/solutions_team/solutionsteam_tf_nightly_supported.py +++ b/dags/solutions_team/solutionsteam_tf_nightly_supported.py @@ -131,7 +131,7 @@ network=V5_NETWORKS, subnetwork=V5P_SUBNETWORKS, runtime_version=RuntimeVersion.V2_ALPHA_TPUV5.value, - global_batch_size=8196, + global_batch_size=8192, ) embedding_dim = 128 From 5843847a5b333d9f6e7f29d6364cfdbcb28d6b26 Mon Sep 17 00:00:00 2001 From: Yijia Date: Thu, 5 Dec 2024 11:24:11 -0800 Subject: [PATCH 25/26] Add More Configs to Automation (#495) 
* wrap up * update v41 models * more configs to auto --- .../configs/trt_llm_mlperf_v41_config.py | 28 +- .../inference/trt_llm_mlperf_v41_inference.py | 306 +++++++++++++++++- 2 files changed, 311 insertions(+), 23 deletions(-) diff --git a/dags/inference/configs/trt_llm_mlperf_v41_config.py b/dags/inference/configs/trt_llm_mlperf_v41_config.py index 01e2712d..00295581 100644 --- a/dags/inference/configs/trt_llm_mlperf_v41_config.py +++ b/dags/inference/configs/trt_llm_mlperf_v41_config.py @@ -36,7 +36,7 @@ def get_trt_llm_mlperf_gpu_config( project: Project, network: str, subnetwork: str, - general_configs: Dict = {}, + benchmark_configs: Dict = {}, model_parameters: Dict = {}, parameter_positions: Dict = {}, binary_search_steps: int = 1, @@ -59,10 +59,11 @@ def get_trt_llm_mlperf_gpu_config( "sudo chmod a+w /scratch", "cd /scratch", # Prepare data - f"gsutil -m cp -n -r gs://yijiaj/mlperf/v41/Google_GPU .", - f"gsutil -m cp -n -r {general_configs['models']} .", - f"gsutil -m cp -n -r {general_configs['preprocessed_data']} .", - f"gsutil -m cp -n -r {general_configs['docker_config']} .", + "gsutil -m cp -n -r gs://yijiaj/mlperf/v41/Google_GPU .", + "gsutil -m cp -n -r gs://tohaowu/mlpinf-v40/mlperf_inf_dlrmv2 .", + f"gsutil -m cp -n -r {benchmark_configs['models']} .", + f"gsutil -m cp -n -r {benchmark_configs['preprocessed_data']} .", + f"gsutil -m cp -n -r {benchmark_configs['docker_config']} .", "curl -sSL https://get.docker.com/ | sh", "sudo mkdir -p /home/cloud-ml-auto-solutions/.docker", "sudo touch ~/.docker/config.json", @@ -78,7 +79,7 @@ def get_trt_llm_mlperf_gpu_config( # Build and launch a docker container "PARTNER_DROP=1 make prebuild DOCKER_DETACH=1", "make docker_add_user", - f"make launch_docker DOCKER_NAME={docker_container_name} DOCKER_ARGS='-d'", + f"make launch_docker DOCKER_NAME={docker_container_name} DOCKER_ARGS='-v /scratch/mlperf_inf_dlrmv2:/home/mlperf_inf_dlrmv2 -d'", ) jsonl_output_path = "metric_report.jsonl" @@ -108,17 +109,18 @@ def get_trt_llm_mlperf_gpu_config( make_jsonl_converter_cmd = f'echo "{py_script}" > jsonl_converter.py' model_parameters_sweep_cmds = [] - for model_name in general_configs["model_name"].split(","): + for model_name in benchmark_configs["model_name"].split(","): + scenario = ",".join(model_parameters[model_name]) if accelerator_type == GpuVersion.L4: model_parameters_sweep_cmds.append( - f'CUDA_VISIBLE_DEVICES=0 make generate_engines RUN_ARGS=\'--benchmarks={model_name} --scenarios={general_configs["scenario"]}\'' + f"CUDA_VISIBLE_DEVICES=0 make generate_engines RUN_ARGS='--benchmarks={model_name} --scenarios={scenario}'" ) else: model_parameters_sweep_cmds.append( - f'make generate_engines RUN_ARGS=\'--benchmarks={model_name} --scenarios={general_configs["scenario"]}\'' + f"make generate_engines RUN_ARGS='--benchmarks={model_name} --scenarios={scenario}'" ) - for model_name in general_configs["model_name"].split(","): + for model_name in benchmark_configs["model_name"].split(","): for scenario in model_parameters[model_name]: for parameter in model_parameters[model_name][scenario]: steps = 2 ** (binary_search_steps - 1) + 1 @@ -153,6 +155,8 @@ def get_trt_llm_mlperf_gpu_config( docker_cmds = [ "make link_dirs", "make build BUILD_TRTLLM=1", + "pip install huggingface_hub==0.24.7", + "lscpu", ] if accelerator_type == GpuVersion.L4: docker_cmds.append( @@ -180,7 +184,9 @@ def get_trt_llm_mlperf_gpu_config( runtime_version=RUNTIME_IMAGE, network=network, subnetwork=subnetwork, - attach_local_ssd=True, + attach_local_ssd=True + if 
accelerator_type != GpuVersion.H100 + else False, disk_size_gb=1000, ), test_name=test_name, diff --git a/dags/inference/trt_llm_mlperf_v41_inference.py b/dags/inference/trt_llm_mlperf_v41_inference.py index 1cfe4498..5546324d 100644 --- a/dags/inference/trt_llm_mlperf_v41_inference.py +++ b/dags/inference/trt_llm_mlperf_v41_inference.py @@ -17,11 +17,11 @@ import datetime from airflow import models from dags import composer_env -from dags.vm_resource import A100_INFERENCE_SUBNETWORKS, GpuVersion, Zone, ImageFamily, ImageProject, MachineVersion, Project, INFERENCE_NETWORKS, L4_INFERENCE_SUBNETWORKS +from dags.vm_resource import A100_INFERENCE_SUBNETWORKS, H100_INFERENCE_SUBNETWORKS, GpuVersion, Zone, ImageFamily, ImageProject, MachineVersion, Project, INFERENCE_NETWORKS, L4_INFERENCE_SUBNETWORKS from dags.inference.configs import trt_llm_mlperf_v41_config -# Run once a day at 4 am UTC (8 pm PST) -SCHEDULED_TIME = "0 4 * * *" if composer_env.is_prod_env() else None +# Run once a day at 1 pm UTC (5 am PST) +SCHEDULED_TIME = "1 3 * * *" if composer_env.is_prod_env() else None with models.DAG( @@ -41,10 +41,8 @@ config_ver = "default,high_accuracy" test_mode = "PerformanceOnly" - scenario = "Offline,Server" g2_configs = { - "model_name": "bert", - "scenario": scenario, + "model_name": "bert,3d-unet", "config_ver": config_ver, "test_mode": test_mode, "docker_config": "gs://yijiaj/mlperf/config.json", @@ -60,6 +58,43 @@ "server_target_qps": (900, 1200), }, }, + "3d-unet": { + "Offline": { + "offline_expected_qps": (1.3, 2.6), + }, + }, + "dlrm-v2": { + "Offline": { + "offline_expected_qps": (3400, 3500), + }, + "Server": { + "server_target_qps": (3300, 3500), + }, + }, + "gptj": { + "Offline": { + "offline_expected_qps": (1.3, 1.6), + }, + "Server": { + "server_target_qps": (0.88, 1), + }, + }, + "resnet50": { + "Offline": { + "offline_expected_qps": (13000, 15000), + }, + "Server": { + "server_target_qps": (11532.8125, 11600), + }, + }, + "retinanet": { + "Offline": { + "offline_expected_qps": (220, 230), + }, + "Server": { + "server_target_qps": (200, 220), + }, + }, } g2_parameter_position = { "bert": { @@ -70,10 +105,46 @@ "server_target_qps": 278, }, }, + "3d-unet": { + "Offline": { + "offline_expected_qps": 55, + }, + }, + "dlrm-v2": { + "Offline": { + "offline_expected_qps": 233, + }, + "Server": { + "server_target_qps": 176, + }, + }, + "gptj": { + "Offline": { + "offline_expected_qps": 191, + }, + "Server": { + "server_target_qps": 158, + }, + }, + "resnet50": { + "Offline": { + "offline_expected_qps": 48, + }, + "Server": { + "server_target_qps": 52, + }, + }, + "retinanet": { + "Offline": { + "offline_expected_qps": 51, + }, + "Server": { + "server_target_qps": 57, + }, + }, } a2_configs = { - "model_name": "bert", - "scenario": scenario, + "model_name": "bert,3d-unet", "config_ver": config_ver, "test_mode": test_mode, "docker_config": "gs://yijiaj/mlperf/config.json", @@ -89,6 +160,27 @@ "server_target_qps": (25400, 25600), }, }, + "3d-unet": { + "Offline": { + "offline_expected_qps": (30, 40), + }, + }, + "resnet50": { + "Offline": { + "offline_expected_qps": (340000, 360000), + }, + "Server": { + "server_target_qps": (290000, 299000), + }, + }, + "retinanet": { + "Offline": { + "offline_expected_qps": (5840, 5980), + }, + "Server": { + "server_target_qps": (5600, 5800), + }, + }, } a2_parameter_position = { "bert": { @@ -99,6 +191,177 @@ "server_target_qps": 560, }, }, + "3d-unet": { + "Offline": { + "offline_expected_qps": 623, + }, + }, + "resnet50": { + "Offline": { + 
"offline_expected_qps": 456, + }, + "Server": { + "server_target_qps": 396, + }, + }, + "retinanet": { + "Offline": { + "offline_expected_qps": 269, + }, + "Server": { + "server_target_qps": 244, + }, + }, + } + a3_configs = { + "model_name": "resnet50,retinanet,stable-diffusion-xl,llama2-70b,mixtral-8x7b", + "config_ver": config_ver, + "test_mode": test_mode, + "docker_config": "gs://yijiaj/mlperf/config.json", + "models": "gs://yijiaj/mlperf/a3/models", + "preprocessed_data": "gs://yijiaj/mlperf/a3/preprocessed_data", + } + a3_model_parameters = { + "bert": { + "Offline": { + "offline_expected_qps": (75200, 76000), + }, + "Server": { + "server_target_qps": (56000, 60000), + }, + }, + "3d-unet": { + "Offline": { + "offline_expected_qps": (54.4, 64), + }, + }, + "dlrm-v2": { + "Offline": { + "offline_expected_qps": (616000, 620000), + }, + "Server": { + "server_target_qps": (458203.125, 510000), + }, + }, + "gptj": { + "Offline": { + "offline_expected_qps": (288, 300), + }, + "Server": { + "server_target_qps": (279.36, 285), + }, + }, + "resnet50": { + "Offline": { + "offline_expected_qps": (720000, 740000), + }, + "Server": { + "server_target_qps": (584000, 586000), + }, + }, + "retinanet": { + "Offline": { + "offline_expected_qps": (13600, 14000), + }, + "Server": { + "server_target_qps": (12880, 13000), + }, + }, + "stable-diffusion-xl": { + "Offline": { + "offline_expected_qps": (16, 18), + }, + "Server": { + "server_target_qps": (16.3, 18), + }, + }, + "llama2-70b": { + "Offline": { + "offline_expected_qps": (80, 86), + }, + "Server": { + "server_target_qps": (75, 80), + }, + }, + "mixtral-8x7b": { + "Offline": { + "offline_expected_qps": (368, 386), + }, + "Server": { + "server_target_qps": (345, 360), + }, + }, + } + a3_parameter_position = { + "bert": { + "Offline": { + "offline_expected_qps": 196, + }, + "Server": { + "server_target_qps": 238, + }, + }, + "3d-unet": { + "Offline": { + "offline_expected_qps": 160, + }, + }, + "dlrm-v2": { + "Offline": { + "offline_expected_qps": 65, + }, + "Server": { + "server_target_qps": 65, + }, + }, + "gptj": { + "Offline": { + "offline_expected_qps": 48, + }, + "Server": { + "server_target_qps": 91, + }, + }, + "resnet50": { + "Offline": { + "offline_expected_qps": 84, + }, + "Server": { + "server_target_qps": 132, + }, + }, + "retinanet": { + "Offline": { + "offline_expected_qps": 139, + }, + "Server": { + "server_target_qps": 127, + }, + }, + "stable-diffusion-xl": { + "Offline": { + "offline_expected_qps": 55, + }, + "Server": { + "server_target_qps": 59, + }, + }, + "llama2-70b": { + "Offline": { + "offline_expected_qps": 75, + }, + "Server": { + "server_target_qps": 74, + }, + }, + "mixtral-8x7b": { + "Offline": { + "offline_expected_qps": 74, + }, + "Server": { + "server_target_qps": 64, + }, + }, } # Running on A100 GPU @@ -108,13 +371,13 @@ image_family=ImageFamily.COMMON_CU121_DEBIAN_11, accelerator_type=GpuVersion.A100_80G, count=8, - gpu_zone=Zone.US_CENTRAL1_C, + gpu_zone=Zone.US_CENTRAL1_A, time_out_in_min=1600, test_name=f"{test_name_prefix}-nightly-test-a100-8", project=Project.CLOUD_TPU_INFERENCE_TEST, network=INFERENCE_NETWORKS, subnetwork=A100_INFERENCE_SUBNETWORKS, - general_configs=a2_configs, + benchmark_configs=a2_configs, model_parameters=a2_model_parameters, parameter_positions=a2_parameter_position, binary_search_steps=2, @@ -127,14 +390,33 @@ image_family=ImageFamily.COMMON_CU121_DEBIAN_11, accelerator_type=GpuVersion.L4, count=8, - gpu_zone=Zone.US_CENTRAL1_A, + gpu_zone=Zone.US_CENTRAL1_C, time_out_in_min=1600, 
test_name=f"{test_name_prefix}-nightly-test-l4-1", project=Project.CLOUD_TPU_INFERENCE_TEST, network=INFERENCE_NETWORKS, subnetwork=L4_INFERENCE_SUBNETWORKS, - general_configs=g2_configs, + benchmark_configs=g2_configs, model_parameters=g2_model_parameters, parameter_positions=g2_parameter_position, binary_search_steps=2, ).run() + + # Running on H100 GPU + trt_llm_mlperf_v41_config.get_trt_llm_mlperf_gpu_config( + machine_type=MachineVersion.A3_HIGHGPU_8G, + image_project=ImageProject.DEEP_LEARNING_PLATFORM_RELEASE, + image_family=ImageFamily.COMMON_CU121_DEBIAN_11, + accelerator_type=GpuVersion.H100, + count=8, + gpu_zone=Zone.US_CENTRAL1_A, + time_out_in_min=1600, + test_name=f"{test_name_prefix}-nightly-test-h100-8", + project=Project.CLOUD_TPU_INFERENCE_TEST, + network=INFERENCE_NETWORKS, + subnetwork=H100_INFERENCE_SUBNETWORKS, + benchmark_configs=a3_configs, + model_parameters=a3_model_parameters, + parameter_positions=a3_parameter_position, + binary_search_steps=2, + ).run() From 4afadb458f48a2e31bdedaa3f283c92b5dd340ef Mon Sep 17 00:00:00 2001 From: Yifei Teng Date: Thu, 5 Dec 2024 15:08:38 -0800 Subject: [PATCH 26/26] Update the tpu dependency installation command (#499) Because libtpu is switching Python package registries, we need to include both registries during the transition. --- dags/legacy_test/tests/pytorch/nightly/common.libsonnet | 3 ++- dags/pytorch_xla/configs/pytorchxla_torchbench_config.py | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/dags/legacy_test/tests/pytorch/nightly/common.libsonnet b/dags/legacy_test/tests/pytorch/nightly/common.libsonnet index 9a5f4c21..8bbe599b 100644 --- a/dags/legacy_test/tests/pytorch/nightly/common.libsonnet +++ b/dags/legacy_test/tests/pytorch/nightly/common.libsonnet @@ -105,7 +105,8 @@ local volumes = import 'templates/volumes.libsonnet'; pip3 install --user --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cpu pip install --user \ 'torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev-cp310-cp310-linux_x86_64.whl' \ - -f https://storage.googleapis.com/libtpu-releases/index.html + -f https://storage.googleapis.com/libtpu-releases/index.html \ + -f https://storage.googleapis.com/libtpu-wheels/index.html pip3 install pillow git clone --depth=1 https://github.com/pytorch/pytorch.git cd pytorch diff --git a/dags/pytorch_xla/configs/pytorchxla_torchbench_config.py b/dags/pytorch_xla/configs/pytorchxla_torchbench_config.py index bef03912..d490456a 100644 --- a/dags/pytorch_xla/configs/pytorchxla_torchbench_config.py +++ b/dags/pytorch_xla/configs/pytorchxla_torchbench_config.py @@ -168,7 +168,7 @@ def model_install_cmds(output_file=None) -> str: # "pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html", "pip3 uninstall -y libtpu-nightly jax jaxlib", "cd ~/xla/experimental/torch_xla2/", - "pip3 install --user -e .[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html", + "pip3 install --user -e .[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html -f https://storage.googleapis.com/libtpu-wheels/index.html", ) if use_xla2 else () @@ -185,7 +185,7 @@ def model_install_cmds(output_file=None) -> str: f"pip3 install --user --pre {version_mapping.TORCH.value} {version_mapping.TORCHVISION.value} {version_mapping.TORCHAUDIO.value} --index-url {version_mapping.TORCH_INDEX_CPU_URL.value}" ), ( - 
f"pip3 install --user 'torch_xla[tpu] @{version_mapping.TORCH_XLA_TPU_WHEEL.value}' -f https://storage.googleapis.com/libtpu-releases/index.html" + f"pip3 install --user 'torch_xla[tpu] @{version_mapping.TORCH_XLA_TPU_WHEEL.value}' -f https://storage.googleapis.com/libtpu-releases/index.html -f https://storage.googleapis.com/libtpu-wheels/index.html" ), "pip3 install --user psutil", "cd; git clone https://github.com/pytorch/benchmark.git", @@ -326,7 +326,7 @@ def get_nvidia_driver_install_cmd(driver_version: str) -> str: # TODO(piz): torch_xla2 only support nightly test at this time. "pip3 uninstall -y libtpu-nightly jax jaxlib", # in case libtpu is installed from torch_xla "cd /tmp/xla/experimental/torch_xla2/", - "pip3 install --user -e .[cuda] -f https://storage.googleapis.com/libtpu-releases/index.html", + "pip3 install --user -e .[cuda] -f https://storage.googleapis.com/libtpu-releases/index.html -f https://storage.googleapis.com/libtpu-wheels/index.html", ) if use_xla2 else ()
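A pattern that recurs across these patches (the task-array refactor in patch 08 and the dependency-direction fix in patch 09) is sequential chaining: each test task is appended to a list and chained behind the previous one with Airflow's >> operator. The sketch below is illustrative rather than code from the series; EmptyOperator stands in for the real TPU test tasks, and the dag_id is invented.

import datetime

from airflow import models
from airflow.operators.empty import EmptyOperator

with models.DAG(
    dag_id="chaining_sketch",  # illustrative name only, not from the repo
    schedule=None,
    start_date=datetime.datetime(2024, 11, 1),
    catchup=False,
) as dag:
  task_arr = []
  for mode in ("stable", "nightly"):
    for num_slices in (1, 2):
      test = EmptyOperator(task_id=f"test-{mode}-{num_slices}x")
      # previous >> test makes test wait for previous; writing
      # previous << test reverses the dependency, which is the bug
      # patch 09 fixes by flipping << to >>.
      if task_arr:
        task_arr[-1] >> test
      task_arr.append(test)

One subtlety the sketch smooths over: a guard written as "if len(task_arr) > 1:" only starts chaining once two tasks are already in the list, so the first two appended tasks run unordered; the plain truthiness check above chains from the first pair onward.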
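Patch 25 pairs each swept benchmark parameter with a (low, high) tuple and passes binary_search_steps=2 to the config builder, which derives "steps = 2 ** (binary_search_steps - 1) + 1" runs per parameter. As a rough sketch of how such bounds can expand into concrete values, the helper below produces an evenly spaced grid. The function and its name are hypothetical, not part of the repository; the real DAG rewrites config files with sed at each step and may choose midpoints adaptively instead.

def candidate_qps(low: float, high: float, binary_search_steps: int) -> list[float]:
  """Hypothetical helper: evenly spaced QPS candidates between swept bounds."""
  # Same step count as the expression in trt_llm_mlperf_v41_config.py.
  steps = 2 ** (binary_search_steps - 1) + 1
  return [low + (high - low) * i / (steps - 1) for i in range(steps)]

For example, with binary_search_steps=2 as both .run() calls pass, the bert Server bounds (900, 1200) from g2_model_parameters expand to three candidates: candidate_qps(900, 1200, 2) returns [900.0, 1050.0, 1200.0].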