Add A3U GPU Daily Run DAG (#593)
* Add A3U auto test

* nit

* Network name, fix mlperf-v40

* format
jyj0w0 authored Feb 4, 2025
1 parent a5f32e2 commit 9cbe0c0
Showing 5 changed files with 264 additions and 3 deletions.
7 changes: 7 additions & 0 deletions dags/common/vm_resource.py
@@ -48,6 +48,9 @@
L4_INFERENCE_SUBNETWORKS = (
    "regions/us-central1/subnetworks/mas-test-us-central1"
)
H200_INFERENCE_SUBNETWORKS = (
    "regions/europe-west1/subnetworks/mas-test-europe-west1"
)


class Project(enum.Enum):
@@ -117,6 +120,8 @@ class Zone(enum.Enum):
  US_WEST1_C = "us-west1-c"
  # reserved a3+ cluster in supercomputer-testing
  AUSTRALIA_SOUTHEAST1_C = "australia-southeast1-c"
  # reserved H200 capacity in cloud-tpu-inference-test
  EUROPE_WEST1_B = "europe-west1-b"
  # reserved TRILLIUM capacity
  EUROPE_WEST4_A = "europe-west4-a"
  # reserved v5e capacity in tpu-prod-env-multipod
@@ -139,6 +144,7 @@ class MachineVersion(enum.Enum):
  A2_ULTRAGPU_4G = "a2-ultragpu-4g"
  A2_ULTRAGPU_8G = "a2-ultragpu-8g"
  A3_HIGHGPU_8G = "a3-highgpu-8g"
  A3_ULTRAGPU_8G = "a3-ultragpu-8g"
  G2_STAND_4 = "g2-standard-4"
  G2_STAND_16 = "g2-standard-16"  # 64GB memory
  G2_STAND_32 = "g2-standard-32"  # 128GB memory
@@ -170,6 +176,7 @@ class GpuVersion(enum.Enum):
  A100 = "nvidia-tesla-a100"
  A100_80G = "nvidia-a100-80gb"
  H100 = "nvidia-h100-80gb"
  H200 = "nvidia-h200-80gb"
  XPK_H100 = "h100-80gb-8"
  XPK_H100_MEGA = "h100-mega-80gb-8"
  V100 = "nvidia-tesla-v100"
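For orientation, a minimal sketch (not part of the commit) of how the enum members added above are consumed; the string values are copied verbatim from this diff:

    from dags.common.vm_resource import GpuVersion, MachineVersion, Zone

    # The new A3 Ultra machine shape pairs with H200 GPUs in the DAG below.
    assert MachineVersion.A3_ULTRAGPU_8G.value == "a3-ultragpu-8g"
    assert GpuVersion.H200.value == "nvidia-h200-80gb"
    assert Zone.EUROPE_WEST1_B.value == "europe-west1-b"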
6 changes: 3 additions & 3 deletions dags/inference/configs/trt_llm_mlperf_v40_config.py
@@ -102,9 +102,9 @@ def get_trt_llm_mlperf_v40_gpu_config(
  make_jsonl_converter_cmd = f'echo "{py_script}" > jsonl_converter.py'

  docker_cmds = (
-     "make link_dirs",
-     "make build BUILD_TRTLLM=1",
-     "pip install huggingface_hub==0.24.7",
+     # "make link_dirs",
+     # "make build BUILD_TRTLLM=1",
+     # "pip install huggingface_hub==0.24.7",
      f'make run RUN_ARGS="--benchmarks={model_configs["model_name"]} --scenarios={model_configs["scenario"]} --config_ver={model_configs["config_ver"]} --test_mode=PerformanceOnly"',
  )
  docker_cmd = " && ".join(docker_cmds)
126 changes: 126 additions & 0 deletions dags/inference/configs/trtllm_bench_inference_config.py
@@ -0,0 +1,126 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Utilities to construct configs for TensorRT-LLM inference DAG."""

import datetime
from dags.common import test_owner
from xlml.apis import gcp_config, metric_config, task, test_config
from dags.common import vm_resource
from dags.common.vm_resource import Project, RuntimeVersion

RUNTIME_IMAGE = RuntimeVersion.TPU_UBUNTU2204_BASE.value
GCS_SUBFOLDER_PREFIX = test_owner.Team.INFERENCE.value


def get_trtllm_bench_config(
    machine_type: vm_resource.MachineVersion,
    image_project: vm_resource.ImageProject,
    image_family: vm_resource.ImageFamily,
    accelerator_type: vm_resource.GpuVersion,
    count: int,
    gpu_zone: vm_resource.Zone,
    time_out_in_min: int,
    test_name: str,
    project: Project,
    network: str,
    subnetwork: str,
    existing_instance_name: str = None,
) -> task.GpuCreateResourceTask:
  set_up_cmds = (
      "pip install --upgrade pip",
      # Install Nvidia driver.
      "wget -c https://us.download.nvidia.com/tesla/550.54.15/NVIDIA-Linux-x86_64-550.54.15.run",
      "chmod u+x NVIDIA-Linux-x86_64-550.54.15.run",
      "sudo ./NVIDIA-Linux-x86_64-550.54.15.run -x-module-path=/usr/lib/xorg/modules --ui=none -x-library-path=/usr/lib -q",
      "sudo nvidia-smi -pm 1",
      # Format and mount multiple local SSDs as one RAID-0 scratch volume.
      "sudo apt update && sudo apt install mdadm --no-install-recommends",
      "find /dev/ | grep google-local-nvme-ssd",
      "sudo mdadm --create /dev/md0 --level=0 --raid-devices=$(find /dev/ -name 'google-local-nvme-ssd*' | wc -l) $(find /dev/ -name 'google-local-nvme-ssd*')",
      "sudo mdadm --detail --prefer=by-id /dev/md0",
      "sudo mkfs.ext4 -F /dev/md0",
      "sudo mkdir -p /scratch",
      "sudo mount /dev/md0 /scratch",
      "sudo chmod a+w /scratch",
      "cd /scratch",
      "pip install jsonlines",
      "wget https://raw.githubusercontent.com/GoogleCloudPlatform/ml-auto-solutions/refs/heads/master/dags/inference/utils/trtllm_bench_jsonl_converter.py",
      # Install TensorRT-LLM.
      "sudo apt-get update",
      "sudo apt-get -y install git git-lfs",
      "git clone https://github.com/NVIDIA/TensorRT-LLM.git",
      "cd TensorRT-LLM",
      "git submodule update --init --recursive",
      "git lfs install",
      "git lfs pull",
      "make -C docker release_build",
      "make -C docker release_run DOCKER_RUN_ARGS='--detach -v /scratch:/scratch' RUN_CMD='sleep infinity'",
  )
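  # Editorial aside, not part of this commit: the mdadm command above sizes
  # the RAID-0 array by counting the VM's local NVMe SSDs. The same count,
  # sketched in Python (the /dev/disk/by-id path is an assumption):
  #   import glob
  #   local_ssds = glob.glob("/dev/disk/by-id/google-local-nvme-ssd*")
  #   raid_devices = len(local_ssds)  # mirrors $(find ... | wc -l) above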

  jsonl_output_path = "metric_report.jsonl"
  docker_container_name = "tensorrt_llm-release-yijiaj"
  docker_cmds = (
      "cp /scratch/trtllm-bench-test.sh trtllm-bench.sh",
      "chmod +x trtllm-bench.sh",
      "./trtllm-bench.sh",
  )
  docker_cmd = " && ".join(docker_cmds)
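  # The join yields the single shell line run inside the container, i.e.:
  #   cp /scratch/trtllm-bench-test.sh trtllm-bench.sh && chmod +x trtllm-bench.sh && ./trtllm-bench.sh
  # (trtllm-bench-test.sh is assumed to be staged on the mounted /scratch volume.)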
  run_model_cmds = (
      "cd /scratch",
      f'docker exec -i {docker_container_name} /bin/bash -c "{docker_cmd}"',
      f"python3 trtllm_bench_jsonl_converter.py {jsonl_output_path}",
      f"cat {jsonl_output_path}",
      f"gsutil cp {jsonl_output_path} {metric_config.SshEnvVars.GCS_OUTPUT.value}",
  )

  job_test_config = test_config.GpuVmTest(
      test_config.Gpu(
          machine_type=machine_type.value,
          image_family=image_family.value,
          count=count,
          accelerator_type=accelerator_type.value,
          runtime_version=RUNTIME_IMAGE,
          network=network,
          subnetwork=subnetwork,
          disk_size_gb=1000,
      ),
      test_name=test_name,
      set_up_cmds=set_up_cmds,
      run_model_cmds=run_model_cmds,
      timeout=datetime.timedelta(minutes=time_out_in_min),
      task_owner=test_owner.YIJIA_J,
      gcs_subfolder=f"{GCS_SUBFOLDER_PREFIX}/trt_llm_bench",
      use_existing_instance=existing_instance_name is not None,
  )

  job_gcp_config = gcp_config.GCPConfig(
      project_name=project.value,
      zone=gpu_zone.value,
      dataset_name=metric_config.DatasetOption.BENCHMARK_DATASET,
  )

  job_metric_config = metric_config.MetricConfig(
      json_lines=metric_config.JSONLinesConfig("metric_report.jsonl"),
      use_runtime_generated_gcs_folder=True,
  )

  return task.GpuCreateResourceTask(
      image_project.value,
      image_family.value,
      job_test_config,
      job_gcp_config,
      job_metric_config,
      existing_instance_name=existing_instance_name,
  )
50 changes: 50 additions & 0 deletions dags/inference/trtllm_bench_inference.py
@@ -0,0 +1,50 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""A DAG to run TensorRT-LLM inference benchmarks with nightly version."""

import datetime
from airflow import models
from dags import composer_env
from dags.common.vm_resource import H200_INFERENCE_SUBNETWORKS, INFERENCE_NETWORKS, GpuVersion, Zone, ImageFamily, ImageProject, MachineVersion, Project
from dags.inference.configs import trtllm_bench_inference_config

# Run once a day at 4 am UTC (8 pm PST)
SCHEDULED_TIME = "0 4 * * *" if composer_env.is_prod_env() else None


with models.DAG(
    dag_id="trtllm_bench_inference",
    schedule=SCHEDULED_TIME,
    tags=["inference_team", "tensorrt_llm", "nightly", "benchmark"],
    start_date=datetime.datetime(2025, 1, 25),
    catchup=False,
) as dag:
  test_name_prefix = "trtllm_bench_inference"

  # Running on H200 GPUs.
  trtllm_bench_inference_config.get_trtllm_bench_config(
      machine_type=MachineVersion.A3_ULTRAGPU_8G,
      image_project=ImageProject.ML_IMAGES,
      image_family=ImageFamily.COMMON_CU124_DEBIAN_11,
      accelerator_type=GpuVersion.H200,
      count=8,
      gpu_zone=Zone.EUROPE_WEST1_B,
      time_out_in_min=1600,
      test_name=f"{test_name_prefix}-nightly-h200-8",
      project=Project.CLOUD_TPU_INFERENCE_TEST,
      network=INFERENCE_NETWORKS,
      subnetwork=H200_INFERENCE_SUBNETWORKS,
      existing_instance_name="yijiaj-a3u-test-h200x8",
  ).run()
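As a hedged aside, a standard way to check that this DAG parses before deploying it to Composer (the dag_folder path is an assumption):

    from airflow.models import DagBag

    dag_bag = DagBag(dag_folder="dags/inference", include_examples=False)
    # Any import/parse failure for the DAG file shows up in import_errors.
    assert "trtllm_bench_inference" in dag_bag.dags, dag_bag.import_errors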
78 changes: 78 additions & 0 deletions dags/inference/utils/trtllm_bench_jsonl_converter.py
@@ -0,0 +1,78 @@
import glob
import re
import sys

import jsonlines


def read_input_file(file_path):
  with open(file_path, "r") as file:
    return file.read()


# Regex patterns to capture the data
patterns = {
    "engine_details": {
        "Model": r"Model:\s*(.+)",
        "Engine Directory": r"Engine Directory:\s*(.+)",
        "TensorRT-LLM Version": r"TensorRT-LLM Version:\s*(.+)",
        "Dtype": r"Dtype:\s*(.+)",
        "KV Cache Dtype": r"KV Cache Dtype:\s*(.+)",
        "Quantization": r"Quantization:\s*(.+)",
        "Max Input Length": r"Max Input Length:\s*(\d+)",
        "Max Sequence Length": r"Max Sequence Length:\s*(\d+)",
    },
    "runtime_info": {
        "TP Size": r"TP Size:\s*(\d+)",
        "PP Size": r"PP Size:\s*(\d+)",
        "Max Runtime Batch Size": r"Max Runtime Batch Size:\s*(\d+)",
        "Max Runtime Tokens": r"Max Runtime Tokens:\s*(\d+)",
        "Scheduling Policy": r"Scheduling Policy:\s*(.+)",
        "KV Memory Percentage": r"KV Memory Percentage:\s*([\d.]+)",
        "Issue Rate (req/sec)": r"Issue Rate \(req/sec\):\s*(.+)",
    },
    "statistics": {
        "Number of requests": r"Number of requests:\s*(\d+)",
        "Average Input Length (tokens)": r"Average Input Length \(tokens\):\s*(\d+)",
        "Average Output Length (tokens)": r"Average Output Length \(tokens\):\s*(\d+)",
        "Token Throughput (tokens/sec)": r"Token Throughput \(tokens/sec\):\s*([\d.e+-]+)",
        "Request Throughput (req/sec)": r"Request Throughput \(req/sec\):\s*([\d.e+-]+)",
        "Total Latency (ms)": r"Total Latency \(ms\):\s*([\d.e+-]+)",
    },
}


# Function to extract data based on regex patterns
def extract_data(patterns, data):
  extracted = {}
  for section, section_patterns in patterns.items():
    extracted[section] = {}
    for field, pattern in section_patterns.items():
      match = re.search(pattern, data)
      if match:
        extracted[section][field] = match.group(1)
  return extracted
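# Illustrative aside, not part of this commit: applying extract_data to an
# invented report snippet (real trtllm-bench output is assumed to look similar):
#   extract_data(patterns, "TP Size: 8\nToken Throughput (tokens/sec): 1234.5\n")
# returns
#   {"engine_details": {}, "runtime_info": {"TP Size": "8"},
#    "statistics": {"Token Throughput (tokens/sec)": "1234.5"}}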


def convert_to_jsonl(input_path, jsonl_path):
  input_data = read_input_file(input_path)
  extracted_data = extract_data(patterns, input_data)
  data = dict()
  data["dimensions"] = dict()
  data["metrics"] = dict()
  for _, fields in extracted_data.items():
    for key, value in fields.items():
      try:
        # Numeric fields become metrics; everything else is a dimension.
        data["metrics"][key] = float(value)
      except ValueError:
        data["dimensions"][key] = str(value)
  if len(data["dimensions"]) == 0 or len(data["metrics"]) == 0:
    print(f"{input_path} contains incomplete results.")
  else:
    with jsonlines.open(jsonl_path, "a") as writer:
      writer.write(data)
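# A hedged sketch of the appended record's shape (values illustrative only):
#   {"dimensions": {"Model": "...", "Scheduling Policy": "..."},
#    "metrics": {"TP Size": 8.0, "Token Throughput (tokens/sec)": 1234.5}}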


if __name__ == "__main__":
  # Each *.txt report under /scratch is converted and appended to the JSONL
  # output path passed as the first CLI argument.
  file_paths = glob.glob("/scratch/*.txt")
  for file_path in file_paths:
    convert_to_jsonl(file_path, sys.argv[1])
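# Usage, as wired into run_model_cmds in trtllm_bench_inference_config above:
#   python3 trtllm_bench_jsonl_converter.py metric_report.jsonl
# Inputs are globbed from /scratch/*.txt; argv[1] is the JSONL output path.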
