Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/integration_test_8gpu_features.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -91,14 +91,14 @@ jobs:
exit 1
fi

python3 scripts/loss_compare.py . . --baseline-options="${baseline_options}" --job-dump-folder="${RUNNER_TEMP}/artifacts-to-be-uploaded/accuracy_comparison_outputs" --export-result="${RUNNER_TEMP}/artifacts-to-be-uploaded/accuracy_comparison_outputs/result.txt" --steps=100
python -m scripts.loss_compare . . --baseline-options="${baseline_options}" --job-dump-folder="${RUNNER_TEMP}/artifacts-to-be-uploaded/accuracy_comparison_outputs" --export-result="${RUNNER_TEMP}/artifacts-to-be-uploaded/accuracy_comparison_outputs/result.txt" --steps=100

echo "Checking FSDP8 the first step loss is the same as FSDP2HSDP4"
python3 scripts/loss_compare.py . . --baseline-options="${baseline_options}" --test-options="${test_options}" --job-dump-folder="${RUNNER_TEMP}/artifacts-to-be-uploaded/accuracy_comparison_outputs" --assert-equal --steps=1
python -m scripts.loss_compare . . --baseline-options="${baseline_options}" --test-options="${test_options}" --job-dump-folder="${RUNNER_TEMP}/artifacts-to-be-uploaded/accuracy_comparison_outputs" --assert-equal --steps=1
rm -rf $RUNNER_TEMP/artifacts-to-be-uploaded/*

echo "Checking FSDP8 loss from a new run v.s. FSDP8 loss from text file parity"
python3 scripts/loss_compare.py . . --baseline-options="${baseline_options}" --job-dump-folder="${RUNNER_TEMP}/artifacts-to-be-uploaded/accuracy_comparison_outputs" --import-result="${LOSS_FILE}" --assert-equal --steps=100
python -m scripts.loss_compare . . --baseline-options="${baseline_options}" --job-dump-folder="${RUNNER_TEMP}/artifacts-to-be-uploaded/accuracy_comparison_outputs" --import-result="${LOSS_FILE}" --assert-equal --steps=100
rm -rf $RUNNER_TEMP/artifacts-to-be-uploaded/*

python -m tests.integration_tests.run_tests --gpu_arch_type ${{ matrix.gpu-arch-type }} --test_suite features $RUNNER_TEMP/artifacts-to-be-uploaded --ngpu 8
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/integration_test_8gpu_h100.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -71,5 +71,5 @@ jobs:
sudo chown -R $(id -u):$(id -g) "$RUNNER_TEMP/artifacts-to-be-uploaded"
# Enable CPP stacktraces for debugging symmetric memory initialization errors.
TORCH_SHOW_CPP_STACKTRACES=1 python -m tests.integration_tests.run_tests --test_suite h100 $RUNNER_TEMP/artifacts-to-be-uploaded --ngpu 8
TORCH_SHOW_CPP_STACKTRACES=1 python -m tests.integration_tests.run_tests --test_suite h100 --gpu_arch_type ${{ matrix.gpu-arch-type }} $RUNNER_TEMP/artifacts-to-be-uploaded --ngpu 8
rm -rf $RUNNER_TEMP/artifacts-to-be-uploaded/*/checkpoint
10 changes: 10 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Commit messages

Don't commit unless the user explicitly asks you to.

When writing a commit message, don't make a bullet list of the individual
changes. Instead, if the PR is large, explain the order to review changes
(e.g., the logical progression), or if it's short just omit the bullet list
entirely.

Disclose that the PR was authored with Claude.
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

copied over from pytorch's claude md

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With this file, we can directly ask Claude code to create a PR for us?

5 changes: 5 additions & 0 deletions scripts/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
58 changes: 38 additions & 20 deletions scripts/loss_compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@
import unittest
from typing import Any

from torchtitan.tools.loss_utils import extract_losses_from_log

# =============================================================================
# GLOBAL CONFIGURATION
# =============================================================================
Expand Down Expand Up @@ -186,13 +188,18 @@ def validate_arguments(
assert_equal: bool,
export_result: str | None,
import_result: str | None,
run_to_run_determinism: bool = False,
) -> bool:
"""Validate command line arguments.

Returns:
True if baseline-only mode (all settings identical with import_result),
False otherwise.
"""
# Skip identical settings check for run-to-run determinism testing
if run_to_run_determinism:
return False # Not baseline-only mode

# Validate that we are comparing different settings
commits_differ = baseline_commit != test_commit
configs_differ = baseline_config != test_config
Expand Down Expand Up @@ -224,8 +231,8 @@ def validate_arguments(
"or options"
)
log_print(
" Or use --import-result with --assert-equal "
"or --export-result to run baseline-only mode"
" Or use --import-result with --assert-equal, "
"--export-result, or --run-to-run-determinism"
)
sys.exit(1)

Expand Down Expand Up @@ -511,24 +518,6 @@ def run_training(
# =============================================================================


def extract_losses_from_log(log_file: str) -> dict[int, float]:
    """Extract step and loss pairs from a log file."""
    strip_ansi = re.compile(r"\x1b\[[0-9;]*m").sub
    step_loss = re.compile(r"step:\s*(\d+)\s*loss:\s*(\d+\.\d+)")

    result: dict[int, float] = {}
    with open(log_file, "r") as f:
        for raw_line in f:
            # Remove terminal color codes before looking for the step/loss pair.
            m = step_loss.search(strip_ansi("", raw_line))
            if m is not None:
                step_str, loss_str = m.groups()
                result[int(step_str)] = float(loss_str)
    return result


def read_losses_from_file(loss_file: str) -> dict[int, float]:
"""Read losses from a processed loss file."""
losses = {}
Expand Down Expand Up @@ -1002,9 +991,37 @@ def parse_arguments() -> argparse.Namespace:
default=8,
help="Number of GPUs for test run (default: 8)",
)
parser.add_argument(
"--run-to-run-determinism",
action="store_true",
help=(
"Test run-to-run determinism by running the same configuration twice. "
"Implies --assert-equal. Only baseline options should be provided; "
"test-specific options (--test-config, --test-options, etc.) are not allowed."
),
)

args = parser.parse_args()

# Handle run-to-run determinism mode (must be before defaults are set)
if args.run_to_run_determinism:
# Validate that no test-specific options are provided
has_test_opts = (
args.test_config
or args.test_options
or args.test_train_file
or args.test_ngpus != args.baseline_ngpus
)
if has_test_opts:
raise ValueError(
"--run-to-run-determinism cannot be used with test-specific options "
"(--test-config, --test-options, --test-train-file, --test-ngpus)"
)

# Force assert_equal and copy baseline options to test
args.assert_equal = True
args.test_options = args.baseline_options

# Set default values if not provided
if not args.test_config:
args.test_config = args.baseline_config
Expand Down Expand Up @@ -1090,6 +1107,7 @@ def main() -> None:
args.assert_equal,
args.export_result,
args.import_result,
args.run_to_run_determinism,
)

# Setup environment
Expand Down
1 change: 1 addition & 0 deletions tests/integration_tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ class OverrideDefinitions:
ngpu: int = 4
disabled: bool = False
skip_rocm_test: bool = False
determinism_test: bool = False # Run twice and verify losses are identical
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The point is not only about being deterministic, but also not changing before vs. after

  • pytorch nightly updates
  • user commits

Is it correct that this PR doesn't address such issues?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This pr just makes sure that when you run the same command twice, it produces the same outputs. by adding this to PR time CI, you would run H100 CI twice on each PR, both against the same pytorch nightly.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this test only guards that determinism is set up and working correctly, right? I think if we make sure the loss doesn't change before vs. after (pytorch nightly, and user commits), it already covers the determinism check:

  • If it's not run-to-run deterministic, it's impossible to achieve identical loss before and after


def __repr__(self):
return self.test_descr
2 changes: 2 additions & 0 deletions tests/integration_tests/h100.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def build_h100_tests_list() -> list[OverrideDefinitions]:
],
"Float8 test",
"float8",
determinism_test=True,
),
# TODO: re-enable this test once the async TP issue is fixed
OverrideDefinitions(
Expand Down Expand Up @@ -77,6 +78,7 @@ def build_h100_tests_list() -> list[OverrideDefinitions]:
"HSDP+CP+torch.compile+Float8",
"hsdp+cp+compile+float8",
ngpu=8,
determinism_test=True,
),
]
return integration_tests_flavors
85 changes: 85 additions & 0 deletions tests/integration_tests/run_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@
import argparse
import os
import subprocess
import tempfile
import time

from torchtitan.tools.logging import logger
from torchtitan.tools.loss_utils import compare_losses, extract_losses_from_log

from tests.integration_tests import OverrideDefinitions

Expand Down Expand Up @@ -71,6 +73,82 @@ def run_single_test(test_flavor: OverrideDefinitions, full_path: str, output_dir
)


def run_determinism_test(
    test_flavor: OverrideDefinitions, full_path: str, output_dir: str
):
    """Run a test twice and verify losses are identical (run-to-run determinism).

    This runs the same configuration twice with deterministic settings enabled,
    then compares the losses from both runs to ensure they match exactly.

    Args:
        test_flavor: Test definition supplying the name, GPU count, and overrides.
        full_path: Path to the training config file (passed via CONFIG_FILE).
        output_dir: Directory where the job dump folder is created.

    Raises:
        Exception: If either training run fails, or the two runs' losses differ.
    """
    test_name = test_flavor.test_name
    all_ranks = ",".join(map(str, range(test_flavor.ngpu)))

    # Only the first override set is used here — presumably determinism needs a
    # single fixed configuration rather than the full override matrix.
    override_arg = test_flavor.override_args[0] if test_flavor.override_args else []
    override_str = " ".join(override_arg) if override_arg else ""

    base_cmd = (
        f"CONFIG_FILE={full_path} NGPU={test_flavor.ngpu} LOG_RANK={all_ranks} "
        f"./run_train.sh --job.dump_folder {output_dir}/{test_name}_determinism "
        f"--debug.deterministic --debug.seed=42 --training.steps=10"
    )
    if override_str:
        base_cmd += " " + override_str

    logger.info(
        f"===== {time.strftime('%Y-%m-%d %H:%M:%S')} Determinism test, flavor : {test_flavor.test_descr} ====="
    )

    log_paths: list[str] = []
    try:
        # Run the identical command twice, capturing each run's output to its
        # own temp log file. Creating the temp file inside the try ensures it
        # is cleaned up even if a later step fails.
        for run_idx in (1, 2):
            with tempfile.NamedTemporaryFile(
                mode="w", suffix=f"_run{run_idx}.log", delete=False
            ) as log_file:
                log_path = log_file.name
            log_paths.append(log_path)

            logger.info(f"Determinism test run {run_idx}: {base_cmd}")
            # tee both streams so the log is visible in CI *and* parseable.
            result = _run_cmd(f"{base_cmd} 2>&1 | tee {log_path}")
            if result.returncode != 0:
                raise Exception(
                    f"Determinism test run {run_idx} failed, flavor : {test_flavor.test_descr}"
                )

        # Extract and compare losses from both runs.
        losses1, losses2 = (extract_losses_from_log(p) for p in log_paths)
        success, message = compare_losses(losses1, losses2, "run1", "run2")
        if not success:
            raise Exception(
                f"Determinism test failed for {test_flavor.test_descr}: {message}"
            )

        logger.info(f"Determinism test passed for {test_flavor.test_descr}: {message}")

    finally:
        # Clean up temp log files regardless of outcome.
        for p in log_paths:
            if os.path.exists(p):
                os.remove(p)


def run_tests(args, test_list: list[OverrideDefinitions]):
"""Run all integration tests to test the core features of TorchTitan"""

Expand Down Expand Up @@ -106,6 +184,13 @@ def run_tests(args, test_list: list[OverrideDefinitions]):
run_single_test(test_flavor, args.config_path, args.output_dir)
ran_any_test = True

# Run determinism test if enabled (CUDA only)
if (
test_flavor.determinism_test
and getattr(args, "gpu_arch_type", "cuda") == "cuda"
):
run_determinism_test(test_flavor, args.config_path, args.output_dir)

if not ran_any_test:
available_tests = [t.test_name for t in test_list if not t.disabled]
if hasattr(args, "test_suite"):
Expand Down
90 changes: 90 additions & 0 deletions torchtitan/tools/loss_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""
Shared utilities for loss extraction and comparison.

This module provides common functionality used by both:
- scripts/loss_compare.py (CLI tool for comparing losses across commits)
- tests/integration_tests/run_tests.py (integration test runner)
"""

import re


def extract_losses_from_log(log_file: str) -> dict[int, float]:
    """Extract step and loss pairs from a training log file.

    Parses log lines matching the pattern: "step: N loss: X.XXXX"
    Handles ANSI escape codes that may be present in colored terminal output.

    Args:
        log_file: Path to the training log file

    Returns:
        Dictionary mapping step numbers to loss values
    """
    ansi_escape = re.compile(r"\x1b\[[0-9;]*m")
    step_loss = re.compile(r"step:\s*(\d+)\s*loss:\s*(\d+\.\d+)")

    losses: dict[int, float] = {}
    with open(log_file, "r") as f:
        for raw_line in f:
            # Colored terminal output wraps the numbers in ANSI codes; strip
            # them first so the pattern can match.
            if match := step_loss.search(ansi_escape.sub("", raw_line)):
                losses[int(match.group(1))] = float(match.group(2))
    return losses


def compare_losses(
    losses1: dict[int, float],
    losses2: dict[int, float],
    name1: str = "run1",
    name2: str = "run2",
) -> tuple[bool, str]:
    """Compare two loss dictionaries for equality.

    Args:
        losses1: First loss dictionary (step -> loss)
        losses2: Second loss dictionary (step -> loss)
        name1: Name for first run (for error messages)
        name2: Name for second run (for error messages)

    Returns:
        Tuple of (success: bool, message: str)
        - success is True if all losses match exactly
        - message contains details about the comparison or mismatch
    """
    # Either run producing no losses at all is an immediate failure.
    for losses, name in ((losses1, name1), (losses2, name2)):
        if not losses:
            return False, f"No losses found in {name}"

    if set(losses1) != set(losses2):
        return False, (
            f"Steps mismatch: {name1} has {len(losses1)} steps, "
            f"{name2} has {len(losses2)} steps"
        )

    # Exact float comparison is deliberate: determinism means bit-identical.
    mismatches = [
        f" step {step}: {name1}={losses1[step]}, {name2}={losses2[step]}"
        for step in sorted(losses1)
        if losses1[step] != losses2[step]
    ]
    if mismatches:
        return False, "Loss mismatches:\n" + "\n".join(mismatches)

    return True, f"All {len(losses1)} steps have identical losses"
Loading