Merge pull request #2431 from AI-Hypercomputer:chzheng/disruption_manager

Google-ML-Automation · Google-ML-Automation · commit be7c2de7e058 · 2025-11-07T09:07:25.000-08:00
PiperOrigin-RevId: 829461550
diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py
@@ -22,6 +22,13 @@
 import dataclasses
 import typing
 
+from enum import Enum
+
+
+class Framework(Enum):  # Framework names; members are looked up by string value via Framework(<name>).
+  PATHWAYS = "pathways"
+  MCJAX = "mcjax"
+
 
 def str2bool(v: str) -> bool:
   """Convert a string of truth to True or False.
diff --git a/benchmarks/disruption_management/disruption_handler.py b/benchmarks/disruption_management/disruption_handler.py
@@ -33,7 +33,7 @@
 MCJAX_STANDARD_TARGET_POD_REGEX_SUFFIX = ".*slice-job-0-0.*"
 MCJAX_STANDARD_STEP_POD_REGEX_SUFFIX = ".*slice-job-0-0.*"
 PATHWAYS_STANDARD_TARGET_POD_REGEX_SUFFIX = ".*worker-0-0.*"
-PATHWAYS_STANDARD_STEP_POD_REGEX_SUFFIX = ".*main-0-0.*"
+PATHWAYS_STANDARD_STEP_POD_REGEX_SUFFIX = ".*head-0-0.*"
 
 PATHWAYS_WORKER_CONTAINER_NAME = "pathways-worker"
 MCJAX_WORKER_CONTAINER_NAME = "jax-tpu"
diff --git a/benchmarks/disruption_management/disruption_manager.py b/benchmarks/disruption_management/disruption_manager.py
@@ -24,9 +24,18 @@
 from collections import defaultdict
 import threading
 
+from benchmarks.benchmark_utils import Framework
 from benchmarks.disruption_management.disruption_handler import create_disruption_handler
 from benchmarks.disruption_management.disruption_handler import DisruptionConfig
 from benchmarks.disruption_management.disruption_handler import DisruptionHandler
+from benchmarks.disruption_management.disruption_handler import DisruptionMethod
+from benchmarks.disruption_management.disruption_handler import MCJAX_STANDARD_TARGET_POD_REGEX_SUFFIX
+from benchmarks.disruption_management.disruption_handler import MCJAX_STANDARD_STEP_POD_REGEX_SUFFIX
+from benchmarks.disruption_management.disruption_handler import MCJAX_WORKER_CONTAINER_NAME
+from benchmarks.disruption_management.disruption_handler import PATHWAYS_STANDARD_TARGET_POD_REGEX_SUFFIX
+from benchmarks.disruption_management.disruption_handler import PATHWAYS_STANDARD_STEP_POD_REGEX_SUFFIX
+from benchmarks.disruption_management.disruption_handler import PATHWAYS_WORKER_CONTAINER_NAME
+from benchmarks.disruption_management.disruption_handler import TriggerType
 from benchmarks.disruption_management.monitor import create_monitor
 from benchmarks.disruption_management.monitor import Monitor
 from benchmarks.xpk_configs import XpkClusterConfig
@@ -131,3 +140,36 @@ def _monitor_and_disrupt_workload(
   def _monitor_recovery(self) -> None:
     """Monitors for recovery trigger and initiates recovery."""
     raise NotImplementedError("Recovery not implemented yet.")
+
+
+def construct_disruption_configs(
+    framework: str,
+    disruption_method: DisruptionMethod,
+    disruptions,
+) -> list[DisruptionConfig]:
+  """Builds one DisruptionConfig per trigger value listed under each trigger-type key of `disruptions`."""
+  # Framework(framework) raises ValueError for an unknown name, so the else branch below only ever sees MCJAX.
+  if Framework(framework) == Framework.PATHWAYS:
+    target_pod_regex = PATHWAYS_STANDARD_TARGET_POD_REGEX_SUFFIX
+    step_pod_regex = PATHWAYS_STANDARD_STEP_POD_REGEX_SUFFIX
+    worker_container_name = PATHWAYS_WORKER_CONTAINER_NAME
+  else:
+    target_pod_regex = MCJAX_STANDARD_TARGET_POD_REGEX_SUFFIX
+    step_pod_regex = MCJAX_STANDARD_STEP_POD_REGEX_SUFFIX
+    worker_container_name = MCJAX_WORKER_CONTAINER_NAME
+  # NOTE(review): every key other than "time_seconds" is mapped to TriggerType.STEP, including misspelled keys.
+  disruption_config_list = []
+  for trigger_type, trigger_values in disruptions.items():
+    for trigger_value in trigger_values:
+      disruption_config_list.append(
+          DisruptionConfig(
+              name="_".join([str(trigger_value), trigger_type]),
+              trigger_type=TriggerType.TIME_SECONDS if trigger_type == "time_seconds" else TriggerType.STEP,
+              trigger_value=trigger_value,
+              disruption_method=disruption_method,
+              target_pod_regex=target_pod_regex,
+              step_pod_regex=step_pod_regex,
+              worker_container_name=worker_container_name,
+          )
+      )
+  return disruption_config_list
diff --git a/benchmarks/recipes/pw_elastic_training_recipe.py b/benchmarks/recipes/pw_elastic_training_recipe.py
@@ -21,149 +21,49 @@
 """
 
 import os
+import sys
 
-import args_helper as helper
+parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+sys.path.append(parent_dir)  # NOTE(review): sys.path has no effect on the relative imports below; they need package execution.
+from . import args_helper as helper
+from . import user_configs
 
-from benchmarks.disruption_management.disruption_handler import DisruptionConfig
 from benchmarks.disruption_management.disruption_handler import DisruptionMethod
-from benchmarks.disruption_management.disruption_handler import MCJAX_STANDARD_TARGET_POD_REGEX_SUFFIX
-from benchmarks.disruption_management.disruption_handler import MCJAX_WORKER_CONTAINER_NAME
-from benchmarks.disruption_management.disruption_handler import PATHWAYS_STANDARD_TARGET_POD_REGEX_SUFFIX
-from benchmarks.disruption_management.disruption_handler import PATHWAYS_WORKER_CONTAINER_NAME
-from benchmarks.disruption_management.disruption_handler import TriggerType
-from benchmarks.maxtext_trillium_model_configs import MaxTextModel
-from benchmarks import maxtext_v5e_model_configs as v5e_model_configs
-from benchmarks import maxtext_xpk_runner as mxr
-from benchmarks.xpk_configs import XpkClusterConfig
+from .runner_utils import generate_and_run_workloads
 
-PROXY_IMAGE = "us-docker.pkg.dev/cloud-tpu-v2-images/pathways/proxy_server"
-SERVER_IMAGE = "us-docker.pkg.dev/cloud-tpu-v2-images/pathways/server"
-RUNNER = "us-docker.pkg.dev/path/to/maxtext_runner"
-
-# Cluster Params
-CLUSTER = "v6e-256-cluster"
-PROJECT = "tpu-prod-env-cluster"
-ZONE = "us-east5-b"
-COUNTRY = "us"
-DEVICE_TYPE = "v6e-256"
-
-# Other parameters (MUST BE SET BY USER)
-XPK_PATH = "../xpk"  # We're running this script from the maxtext directory
-USER = os.environ["USER"]
-BASE_OUTPUT_DIRECTORY = f"gs://{USER}-{PROJECT}-{COUNTRY}/disruption_management/"
-MAX_RESTARTS = 10
-NUM_SLICES = 2
-BENCHMARK_STEPS = 101
+user_configs.USER_CONFIG.max_restarts = 10  # NOTE(review): import-time mutation of the module-level USER_CONFIG object.
 COMPARE_WITH_MCJAX = True
 
-
-# Do 2 total disruptions, once after 2 minutes and once after 6 minutes.
-def construct_disruption_configs(
-    pathways_config: mxr.PathwaysConfig,
-) -> list[DisruptionConfig]:
-  """Constructs the disruption configs for the benchmark."""
-
-  if pathways_config:
-    target_pod_regex = PATHWAYS_STANDARD_TARGET_POD_REGEX_SUFFIX
-    worker_container_name = PATHWAYS_WORKER_CONTAINER_NAME
-  else:
-    target_pod_regex = MCJAX_STANDARD_TARGET_POD_REGEX_SUFFIX
-    worker_container_name = MCJAX_WORKER_CONTAINER_NAME
-
-  # Do 2 total disruptions, once after 2 minutes and once after 6 minutes.
-  return [
-      DisruptionConfig(
-          name="sigill_2min",
-          trigger_type=TriggerType.TIME_SECONDS,
-          trigger_value=2 * 60,  # 2 minutes
-          disruption_method=DisruptionMethod.SIGILL,
-          target_pod_regex=target_pod_regex,
-          worker_container_name=worker_container_name,
-      ),
-      DisruptionConfig(
-          name="sigill_6min",
-          trigger_type=TriggerType.TIME_SECONDS,
-          trigger_value=6 * 60,  # 6 minutes
-          disruption_method=DisruptionMethod.SIGILL,
-          target_pod_regex=target_pod_regex,
-          worker_container_name=worker_container_name,
-      ),
-  ]
-
-
-def construct_workload_config_with_disruptions(
-    cluster_config: XpkClusterConfig,
-    model: MaxTextModel,
-    pathways_config: mxr.PathwaysConfig = None,
-) -> list[mxr.WorkloadConfig]:
-  """Constructs the workload configs for the benchmark."""
-  return mxr.WorkloadConfig(
-      model=model,
-      num_slices=NUM_SLICES,
-      device_type=cluster_config.device_type,
-      base_output_directory=BASE_OUTPUT_DIRECTORY,
-      max_restarts=MAX_RESTARTS,
-      libtpu_type=None,
-      libtpu_nightly_version="",
-      base_docker_image=RUNNER,
-      pathways_config=pathways_config,
-      xpk_path=XPK_PATH,
-      num_steps=BENCHMARK_STEPS,
-      disruption_configs=construct_disruption_configs(pathways_config),
-  )
+DISRUPTION_METHOD = DisruptionMethod.SIGILL
+DISRUPTIONS = {  # Maps a trigger-type name to the list of trigger values to schedule.
+    "time_seconds": [120, 600],  # NOTE(review): the removed configs fired at 2 and 6 min (120, 360); 600 is 10 min - confirm.
+    # "step":[3]
+}
 
 
 def main() -> None:
   """Main function to run the elastic training disruption test."""
-
-  # Cluster Configuration
-  cluster_config = XpkClusterConfig(
-      cluster_name=CLUSTER,
-      project=PROJECT,
-      zone=ZONE,
-      device_type=DEVICE_TYPE,
+  user_configs.USER_CONFIG.headless = False
+  should_continue = helper.handle_cmd_args(
+      user_configs.USER_CONFIG.cluster_config, helper.DELETE, xpk_path=user_configs.USER_CONFIG.xpk_path
   )
 
-  # Handle command line arguments using args_helper
-  should_continue = helper.handle_cmd_args(cluster_config, helper.DELETE, xpk_path=XPK_PATH)
-
   if not should_continue:
-    return
-
-  # Model Configuration - Using a simple default model for testing
-  model = v5e_model_configs.llama3_1_8b_8192
-
-  pathways_config = mxr.PathwaysConfig(
-      server_image=SERVER_IMAGE,
-      proxy_server_image=PROXY_IMAGE,
-      runner_image=RUNNER,
-      # User can add additional flags here.
-      server_flags="--enable_metrics_collection=false",
-      proxy_flags="--enable_metrics_collection=false",
-      worker_flags="--enable_metrics_collection=false",
+    return 0  # NOTE(review): main() now returns an int, but its annotation still says -> None.
+
+  return_code = generate_and_run_workloads(
+      user_configs.USER_CONFIG,
+      user_configs.USER_CONFIG.num_slices_list,
+      user_configs.USER_CONFIG.benchmark_steps,
+      user_configs.USER_CONFIG.priority,
+      DISRUPTION_METHOD,
+      DISRUPTIONS,
   )
 
-  # Pathways Workload Configuration with Disruption
-  workload_configs = []
-  pathways_workload_config = construct_workload_config_with_disruptions(cluster_config, model, pathways_config)
-  workload_configs.append(pathways_workload_config)
-
-  if COMPARE_WITH_MCJAX:
-    # Add a workload config for MCJAX
-    mcjax_workload_config = construct_workload_config_with_disruptions(cluster_config, model, None)
-    workload_configs.append(mcjax_workload_config)
-
-  # Run the benchmark and use the returned disruption manager.
-  disruption_manager = mxr.xpk_benchmark_runner(
-      cluster_config=cluster_config,
-      workload_configs=workload_configs,
-  )
-
-  # Wait for disruptions to complete
-  disruption_manager.start_disruptions_and_wait_for_completion()
-
   print("Elastic Training disruptions completed. Please check logs for results.")
+
+  return return_code  # NOTE(review): the __main__ guard calls main() without sys.exit(), so this value is dropped.
+
 
 if __name__ == "__main__":
   main()
diff --git a/benchmarks/recipes/pw_suspend_resume.py b/benchmarks/recipes/pw_suspend_resume.py
diff --git a/benchmarks/recipes/runner_utils.py b/benchmarks/recipes/runner_utils.py