Skip to content

Commit

Permalink
resolve comments
Browse files Browse the repository at this point in the history
  • Loading branch information
gunjanj007 committed Dec 17, 2024
1 parent 6b5724b commit 39c86ad
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 10 deletions.
19 changes: 16 additions & 3 deletions dags/map_reproducibility/nemo_gpt3.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,12 @@
# Run once a day at 2 pm UTC (6 am PST)
# Outside the prod environment the schedule is None.
SCHEDULED_TIME = "0 14 * * *" if composer_env.is_prod_env() else None

# Hard-coded workload parameters. run_aotc_workload passes them positionally
# to get_metrics_cmds(), which forwards them as command-line flags to
# process_training_results.py.
# NOTE(review): presumably these must match the recipe config
# (gpt3-175b-256gpus-fp8.yaml) -- confirm they are kept in sync.
MODEL_ID = "gpt3-175b"  # forwarded as --model_type
BATCH_SIZE = 2048  # forwarded as --batch_size
NUM_ACCELERATORS = 256  # forwarded as --num_accelerators
PRECISION = "fp8"  # forwarded as --precision
ACCELERATOR_TYPE = "h100"  # forwarded as --accelerator_type


@task
def run_aotc_workload():
Expand All @@ -62,6 +68,7 @@ def run_aotc_workload():

with tempfile.TemporaryDirectory() as tmpdir:
hook = SubprocessHook()
# TODO(gunjanjalori): clone recipe first and extract params
result = hook.run_command(
[
"bash",
Expand All @@ -78,7 +85,13 @@ def run_aotc_workload():
+ helm_apply_cmds()
+ wait_for_jobs_cmds()
+ copy_bucket_cmds()
+ get_metrics_cmds()
+ get_metrics_cmds(
BATCH_SIZE,
NUM_ACCELERATORS,
PRECISION,
MODEL_ID,
ACCELERATOR_TYPE,
)
+ cleanup_cmds()
+ get_aotc_repo()
),
Expand All @@ -93,7 +106,7 @@ def run_aotc_workload():
print(f"Base path in python: {python_base_path}")
print(f"python to bq: {python_path_to_bq_writer}")

yaml_file_path = "reproducible-benchmark-recipes/projects/gpu-recipes/training/a3mega/gpt3-175b/nemo-pretraining-gke/values.yaml"
value_yaml_path = "reproducible-benchmark-recipes/projects/gpu-recipes/training/a3mega/gpt3-175b/nemo-pretraining-gke/values.yaml"
config_yaml_path = "reproducible-benchmark-recipes/projects/gpu-recipes/src/frameworks/a3mega/nemo-configs/gpt3-175b-256gpus-fp8.yaml"

(
Expand All @@ -103,7 +116,7 @@ def run_aotc_workload():
precision,
seq_length,
max_steps,
) = extract_run_details(tmpdir, yaml_file_path, config_yaml_path)
) = extract_run_details(tmpdir, value_yaml_path, config_yaml_path)
print(
f"batch size: {global_batch_size}, number of nodes: {number_of_nodes}"
)
Expand Down
15 changes: 8 additions & 7 deletions dags/map_reproducibility/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,17 +128,18 @@ def copy_bucket_cmds():
return copy_bucket_contents


def get_metrics_cmds():
def get_metrics_cmds(
batch_size, num_accelerators, precision, model_id, accelertator_type
):
# TODO(gunjanj007): get these parameters from the recipe
cmds = (
# "METRICS_FILE=$COMPLETE_JOB_NAME/metrics.txt",
"METRICS_FILE=metrics.txt",
"python3 process_training_results.py --file"
" dllogger.json --batch_size 2048 "
"--num_accelerators 256 "
"--precision fp8 "
"--model_type gpt3-175b "
"--accelerator_type h100 | "
f" dllogger.json --batch_size {batch_size} "
f"--num_accelerators {num_accelerators} "
f"--precision {precision} "
f"--model_type {model_id} "
f"--accelerator_type {accelertator_type} | "
"gsutil cp - $METRICS_FILE",
'echo "METRICS_FILE=${METRICS_FILE}"',
)
Expand Down

0 comments on commit 39c86ad

Please sign in to comment.