Commit: Test userbenchmark

xuzhao9 committed Jan 27, 2025
1 parent ff2a8bc commit 551f9e9
Showing 3 changed files with 8 additions and 204 deletions.
3 changes: 0 additions & 3 deletions .github/workflows/userbenchmark-a100.yml
@@ -27,9 +27,6 @@ jobs:
       - name: Install Conda
         run: |
           bash ./.ci/torchbench/install-conda.sh
-      - name: Install TorchBench
-        run: |
-          bash ./.ci/torchbench/install.sh
       - name: Run user benchmark
         run: |
           set -x
147 changes: 2 additions & 145 deletions userbenchmark/release-test/run.py
@@ -1,155 +1,12 @@
-import argparse
-import itertools
 import os
-import shutil
 import subprocess
-import time
-from datetime import datetime
-from pathlib import Path
 from typing import List
 
-import yaml
-from git import Repo
-
-from ..utils import dump_output, get_output_dir, get_output_json
-from .result_analyzer import analyze
-
-# Expected WORK_DIR structure
-# WORK_DIR/
-#  |---examples/
-#  |---pytorch-<ver1>-cuda<ver1>/
-#      |---run.sh
-#      |---mnist/
-#      |---mnist-hogwild/
-#      |---<other-benchmarks>
-#  |---pytorch-<ver2>-cuda<ver2>/
-#  |---summary.csv
-from typing import List
 
-BM_NAME = "release-test"
-EXAMPLE_URL = "https://github.com/pytorch/examples.git"
 CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
-DEFAULT_CONFIG_PATH = os.path.join(
-    os.path.dirname(os.path.abspath(__file__)), "configs"
-)
-RUN_TEMPLATE = """
-# GENERATED BY userbenchmark/release-test/__init__.py. DO NOT EDIT!
-bash {RELEASE_TEST_ROOT}/setup_env.sh '{CUDA_VERSION}' '{MAGMA_VERSION}' '{PYTORCH_VERSION}' '{PYTORCH_CHANNEL}' '{WORK_DIR}'
-bash {RELEASE_TEST_ROOT}/run_release_test.sh '{CUDA_VERSION}' '{RESULT_DIR}'
-"""
-
-
-def get_timestamp():
-    return datetime.fromtimestamp(time.time()).strftime("%Y%m%d%H%M%S")
-
-
-def get_work_dir(output_dir):
-    work_dir = output_dir.joinpath(f"run-{get_timestamp()}")
-    work_dir.mkdir(exist_ok=True, parents=True)
-    return work_dir
-
-
-def generate_test_scripts(config, work_dir):
-    assert "cuda" in config and isinstance(
-        config["cuda"], list
-    ), f"Expected CUDA config list, but not found."
-    assert "pytorch" in config and isinstance(
-        config["pytorch"], list
-    ), f"Expected pytorch version list, but not found."
-    bm_matrix = [config["cuda"], config["pytorch"]]
-    run_scripts = {}
-    for cuda, pytorch in itertools.product(*bm_matrix):
-        run_key = f"pytorch-{pytorch['version']}-cuda-{cuda['version']}"
-        run_script = RUN_TEMPLATE.format(
-            RELEASE_TEST_ROOT=CURRENT_DIR,
-            CUDA_VERSION=cuda["version"],
-            MAGMA_VERSION=cuda["magma_version"],
-            PYTORCH_VERSION=pytorch["version"],
-            PYTORCH_CHANNEL=pytorch["conda_channel"],
-            WORK_DIR=work_dir,
-            RESULT_DIR=work_dir.joinpath(run_key),
-        )
-        run_scripts[run_key] = run_script
-    return run_scripts
-
-
-def dump_test_scripts(run_scripts, work_dir):
-    for run_key, run_script in run_scripts.items():
-        run_script_loc = work_dir.joinpath(run_key)
-        run_script_loc.mkdir(exist_ok=True)
-        with open(run_script_loc.joinpath("run.sh"), "w") as rs:
-            rs.write(run_script)
-
-
-def dump_result_to_json(metrics):
-    result = get_output_json(BM_NAME, metrics)
-    dump_output(BM_NAME, result)
-
-
-def run_benchmark(run_scripts, work_dir):
-    for run_key, _rscript in run_scripts.items():
-        run_script_path = work_dir.joinpath(run_key, "run.sh")
-        # run the benchmark
-        print(f"Running benchmark {run_key} ...")
-        subprocess.check_call(["bash", str(run_script_path)])
-
-
-def get_config(config_name: str):
-    if os.path.exists(os.path.join(DEFAULT_CONFIG_PATH, config_name)):
-        config_name = os.path.join(DEFAULT_CONFIG_PATH, config_name)
-    elif os.path.exists(os.path.join(DEFAULT_CONFIG_PATH, f"{config_name}.yaml")):
-        config_name = os.path.join(DEFAULT_CONFIG_PATH, f"{config_name}.yaml")
-    else:
-        raise ValueError(
-            f"Can't find config name {config_name} in config path {DEFAULT_CONFIG_PATH}."
-        )
-    with open(config_name, "r") as yfile:
-        config = yaml.safe_load(yfile)
-    return config
-
-
-def parse_args(args):
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--config", "-c", default="1.12.1", type=str, help="Config for release testing"
-    )
-    parser.add_argument(
-        "--dry-run",
-        action="store_true",
-        help="Only generate the test scripts. Do not run the benchmark.",
-    )
-    parser.add_argument(
-        "--analyze",
-        type=str,
-        help="Only analyze the result of the specified work directory.",
-    )
-    args = parser.parse_args(args)
-    return args
-
-
-def prepare_release_tests(args: argparse.Namespace, work_dir: Path):
-    config = get_config(args.config)
-    run_scripts = generate_test_scripts(config, work_dir)
-    dump_test_scripts(run_scripts, work_dir)
-    # clone the examples repo
-    Repo.clone_from(EXAMPLE_URL, work_dir.joinpath("examples"))
-    return run_scripts
-
-
-def cleanup_release_tests(work_dir):
-    examples_path = work_dir.joinpath("examples")
-    if examples_path.exists():
-        shutil.rmtree(examples_path)
 
 
 def run(args: List[str]):
-    args = parse_args(args)
-    if args.analyze:
-        analyze(args.analyze)
-        return
-    work_dir = get_work_dir(get_output_dir(BM_NAME))
-    run_scripts = prepare_release_tests(args=args, work_dir=work_dir)
-    if not args.dry_run:
-        run_benchmark(run_scripts, work_dir)
-        metrics = analyze(work_dir)
-        dump_result_to_json(metrics)
-    cleanup_release_tests(work_dir)
+    subprocess.check_call(["bash", f"{CURRENT_DIR}/run_release_test.sh"])
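
For context, the deleted driver boils down to a config-matrix pattern: load a YAML config that lists CUDA and PyTorch releases, expand their cross product with itertools.product, and render one run.sh per combination from RUN_TEMPLATE. The sketch below shows just that pattern in isolation; the config literal is hypothetical, and only its shape (version, magma_version, and conda_channel keys) is implied by the deleted get_config and generate_test_scripts code.

import itertools

# Trimmed stand-in for the deleted RUN_TEMPLATE (paths shortened).
RUN_TEMPLATE = """
bash setup_env.sh '{CUDA_VERSION}' '{MAGMA_VERSION}' '{PYTORCH_VERSION}' '{PYTORCH_CHANNEL}'
bash run_release_test.sh '{CUDA_VERSION}'
"""

# Hypothetical config; the real one is read from
# userbenchmark/release-test/configs/<name>.yaml.
config = {
    "cuda": [{"version": "11.6", "magma_version": "magma-cuda116"}],
    "pytorch": [{"version": "1.12.1", "conda_channel": "pytorch"}],
}

run_scripts = {}
for cuda, pytorch in itertools.product(config["cuda"], config["pytorch"]):
    run_key = f"pytorch-{pytorch['version']}-cuda-{cuda['version']}"
    run_scripts[run_key] = RUN_TEMPLATE.format(
        CUDA_VERSION=cuda["version"],
        MAGMA_VERSION=cuda["magma_version"],
        PYTORCH_VERSION=pytorch["version"],
        PYTORCH_CHANNEL=pytorch["conda_channel"],
    )

for run_key, script in run_scripts.items():
    print(run_key)  # one generated run.sh per (pytorch, cuda) pair
    print(script)

The deleted parse_args also exposed --dry-run (generate scripts without running them) and --analyze (post-process an existing work directory), which the simplified entry point no longer needs.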
62 changes: 6 additions & 56 deletions userbenchmark/release-test/run_release_test.sh
@@ -1,60 +1,10 @@
 #!/bin/bash
 
-set -xeuo pipefail
+set -euo pipefail
 
-CUDA_VERSION="$1"
-RESULT_DIR="$2"
-EXAMPLES_DIR="${RESULT_DIR}/../examples"
-# get the directory of the current script
-CURRENT_DIR=$(dirname -- "$0")
+python -c "import torch; import time; a = torch.randn([4096, 4096]).cuda(); time.sleep(60); print('done!')" > log.txt 2>&1 &
 
-PREFIX=""
-if [[ ${PLATFORM_NAME} == "aws_t4_metal" ]]; then
-    PREFIX="taskset -c 24-47";
-    export GOMP_CPU_AFFINITY="24-47"
-fi
-
-. switch-cuda.sh "${CUDA_VERSION}"
-
-
-nvcc --version
-sudo apt update
-sudo apt-get install bc
-sudo apt-get install --reinstall time
-which time
-# run mnist
-mkdir -p "${RESULT_DIR}/mnist"
-pushd "${EXAMPLES_DIR}/mnist"
-export LOG_FILE=${RESULT_DIR}/mnist/result.log
-export MEM_FILE=${RESULT_DIR}/mnist/result_mem.log
-${PREFIX} bash "${CURRENT_DIR}/monitor_proc.sh" python main.py --epochs 3
-# run mnist-hogwild
-mkdir -p ${RESULT_DIR}/mnist_hogwild
-pushd "${EXAMPLES_DIR}/mnist_hogwild"
-export LOG_FILE=${RESULT_DIR}/mnist_hogwild/result.log
-export MEM_FILE=${RESULT_DIR}/mnist_hogwild/result_mem.log
-${PREFIX} bash "${CURRENT_DIR}/monitor_proc.sh" python main.py --epochs 3
-# run CPU WLM LSTM
-mkdir -p ${RESULT_DIR}/wlm_cpu_lstm
-pushd "${EXAMPLES_DIR}/word_language_model"
-export LOG_FILE=${RESULT_DIR}/wlm_cpu_lstm/result.log
-export MEM_FILE=${RESULT_DIR}/wlm_cpu_lstm/result_mem.log
-${PREFIX} bash "${CURRENT_DIR}/monitor_proc.sh" python main.py --epochs 3 --model LSTM
-# run GPU WLM LSTM
-mkdir -p ${RESULT_DIR}/wlm_gpu_lstm
-pushd "${EXAMPLES_DIR}/word_language_model"
-export LOG_FILE=${RESULT_DIR}/wlm_gpu_lstm/result.log
-export MEM_FILE=${RESULT_DIR}/wlm_gpu_lstm/result_mem.log
-${PREFIX} bash "${CURRENT_DIR}/monitor_proc.sh" python main.py --epochs 3 --model LSTM --cuda
-# run CPU WLM Transformer
-mkdir -p ${RESULT_DIR}/wlm_cpu_trans
-pushd "${EXAMPLES_DIR}/word_language_model"
-export LOG_FILE=${RESULT_DIR}/wlm_cpu_trans/result.log
-export MEM_FILE=${RESULT_DIR}/wlm_cpu_trans/result_mem.log
-${PREFIX} bash "${CURRENT_DIR}/monitor_proc.sh" python main.py --epochs 3 --model Transformer
-# run GPU WLM Transformer
-mkdir -p ${RESULT_DIR}/wlm_gpu_trans
-pushd "${EXAMPLES_DIR}/word_language_model"
-export LOG_FILE=${RESULT_DIR}/wlm_gpu_trans/result.log
-export MEM_FILE=${RESULT_DIR}/wlm_gpu_trans/result_mem.log
-${PREFIX} bash "${CURRENT_DIR}/monitor_proc.sh" python main.py --epochs 3 --model Transformer --cuda
+for i in {1..120}; do
+    nvidia-smi pmon -s m -c 1 -o T
+    sleep 0.5
+done
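
The slimmed-down script pairs a background CUDA allocation with a polling loop over nvidia-smi pmon (-s m selects per-process memory counters, -c 1 takes a single sample per invocation, -o T prefixes each sample with a timestamp). A rough Python equivalent of the same pattern, offered as a sketch under the assumption that torch and nvidia-smi are available, not as part of the commit:

import subprocess
import time

# Background workload, mirroring the one-liner above: allocate a 4096x4096
# tensor on the GPU and hold it for 60 seconds.
worker = subprocess.Popen(
    ["python", "-c",
     "import torch, time; a = torch.randn([4096, 4096]).cuda(); time.sleep(60)"]
)

# Sample per-process GPU memory (-s m), one reading per call (-c 1), with
# timestamps (-o T), roughly every half second, as in the shell loop.
for _ in range(120):
    subprocess.run(["nvidia-smi", "pmon", "-s", "m", "-c", "1", "-o", "T"], check=False)
    time.sleep(0.5)

worker.wait()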
