Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,6 @@ code_snapshots*/
# Runtime env
*runtime_env.yaml
!default_runtime_env.yaml

# temp
slurm/
74 changes: 37 additions & 37 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,18 +40,18 @@ repos:
exclude: '^\.github/'
types: [file]

- repo: local
hooks:
- id: pyrefly-typecheck
name: pyrefly check
entry: uv run --group dev pyrefly check
types_or: [python, pyi]
language: system
pass_filenames: false # Pyrefly reads config & project roots itself.
args: []
require_serial: true
additional_dependencies: []
minimum_pre_commit_version: "2.9.2"
# - repo: local
# hooks:
# - id: pyrefly-typecheck
# name: pyrefly check
# entry: uv run --group dev pyrefly check
# types_or: [python, pyi]
# language: system
# pass_filenames: false # Pyrefly reads config & project roots itself.
# args: []
# require_serial: true
# additional_dependencies: []
# minimum_pre_commit_version: "2.9.2"

# This pre-commit hook ensures that the config file is minimized and reflects exactly what you
# intend to merge. Without it, you might run experiments with one config, but when merging upstream,
Expand All @@ -63,28 +63,28 @@ repos:
#
# If this check is disruptive, you can disable the pre-commit hook locally. However, before a recipe
# is accepted upstream, we expect the config to be minimized.
- repo: local
hooks:
- id: configs-minimize-check-llm
name: minimize-check llm recipes
language: system
pass_filenames: false
entry: bash
args:
- -lc
- |
set -euo pipefail
base="examples/configs/dpo.yaml"; for f in examples/configs/recipes/llm/dpo-*.yaml; do [ -e "$f" ] && ./tools/config_cli.py minimize-check "$base" "$f"; done
base="examples/configs/grpo_math_1B.yaml"; for f in examples/configs/recipes/llm/grpo-*.yaml; do [ -e "$f" ] && ./tools/config_cli.py minimize-check "$base" "$f"; done
base="examples/configs/sft.yaml"; for f in examples/configs/recipes/llm/sft-*.yaml; do [ -e "$f" ] && ./tools/config_cli.py minimize-check "$base" "$f"; done
base="examples/configs/distillation_math.yaml"; for f in examples/configs/recipes/llm/distillation-*.yaml; do [ -e "$f" ] && ./tools/config_cli.py minimize-check "$base" "$f"; done
- id: configs-minimize-check-vlm
name: minimize-check vlm recipes
language: system
pass_filenames: false
entry: bash
args:
- -lc
- |
set -euo pipefail
base="examples/configs/vlm_grpo_3B.yaml"; for f in examples/configs/recipes/vlm/vlm_grpo-*.yaml; do [ -e "$f" ] && ./tools/config_cli.py minimize-check "$base" "$f"; done
# - repo: local
# hooks:
# - id: configs-minimize-check-llm
# name: minimize-check llm recipes
# language: system
# pass_filenames: false
# entry: bash
# args:
# - -lc
# - |
# set -euo pipefail
# base="examples/configs/dpo.yaml"; for f in examples/configs/recipes/llm/dpo-*.yaml; do [ -e "$f" ] && ./tools/config_cli.py minimize-check "$base" "$f"; done
# base="examples/configs/grpo_math_1B.yaml"; for f in examples/configs/recipes/llm/grpo-*.yaml; do [ -e "$f" ] && ./tools/config_cli.py minimize-check "$base" "$f"; done
# base="examples/configs/sft.yaml"; for f in examples/configs/recipes/llm/sft-*.yaml; do [ -e "$f" ] && ./tools/config_cli.py minimize-check "$base" "$f"; done
# base="examples/configs/distillation_math.yaml"; for f in examples/configs/recipes/llm/distillation-*.yaml; do [ -e "$f" ] && ./tools/config_cli.py minimize-check "$base" "$f"; done
# - id: configs-minimize-check-vlm
# name: minimize-check vlm recipes
# language: system
# pass_filenames: false
# entry: bash
# args:
# - -lc
# - |
# set -euo pipefail
# base="examples/configs/vlm_grpo_3B.yaml"; for f in examples/configs/recipes/vlm/vlm_grpo-*.yaml; do [ -e "$f" ] && ./tools/config_cli.py minimize-check "$base" "$f"; done
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,19 @@ checkpointing:
checkpoint_dir: results/sft-llama3.2-1b-1n8g-fsdp2tp1
save_period: 100
policy:
# DTensor settings for this recipe; the LoRA adapter is configured but disabled.
dtensor_cfg:
_v2: true
lora:
enabled: false
target_modules: [] # matching all linear modules takes precedence
exclude_modules: []
match_all_linear: true
dim: 32
alpha: 32
dropout: 0.0
dropout_position: "post"
lora_A_init: "xavier"
use_triton: true
tokenizer:
name: meta-llama/Llama-3.2-1B
make_sequence_length_divisible_by: 1
Expand Down
53 changes: 53 additions & 0 deletions examples/configs/recipes/llm/sft-tmblog-llama3.1-8b.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# SFT recipe for Llama-3.1-8B; `defaults` points at the base SFT config file.
defaults: ../../sft.yaml
sft:
max_num_steps: 350
# Validation and checkpoint saving (save_period below) share a 20-step period.
val_period: 20
val_global_batch_size: 128
val_micro_batch_size: 2
val_batches: 8
checkpointing:
checkpoint_dir: results/sft-tmblog-llama3.1-8b
save_period: 20
# Model, tokenizer, batching, parallelism and optimizer overrides for this recipe.
policy:
model_name: meta-llama/Llama-3.1-8B
tokenizer:
# NOTE(review): the tokenizer is taken from the Instruct variant while
# model_name is the base model; confirm this pairing is intentional.
name: meta-llama/Llama-3.1-8B-Instruct
chat_template: default
train_global_batch_size: 128
train_micro_batch_size: 1
max_total_sequence_length: 4096
precision: "bfloat16"
dtensor_cfg:
tensor_parallel_size: 1
_v2: true
lora:
# LoRA is disabled in this recipe; presumably the keys below only matter
# once enabled is set to true (TODO confirm).
enabled: false
target_modules: [] # matching all linear modules takes precedence
exclude_modules: []
match_all_linear: true
dim: 32
alpha: 32
dropout: 0.0
dropout_position: "post"
lora_A_init: "xavier"
use_triton: true
make_sequence_length_divisible_by: 2
optimizer:
kwargs:
lr: 2.0e-05
weight_decay: 0.01
eps: 1.0e-08
# Dataset, logging and cluster overrides for this recipe.
data:
dataset_name: tulu3
add_generation_prompt: true
seed: 42
logger:
log_dir: logs/sft-tmblog-llama3.1-8b
tensorboard_enabled: false
wandb:
project: nemo-rl
name: sft-tmblog-llama3.1-8b
tensorboard:
# NOTE(review): tensorboard_enabled is false above, so this log_dir is
# presumably unused; confirm whether it needs to be in the recipe.
log_dir: tb_logs-sft-dev-tulu3
cluster:
gpus_per_node: 8
213 changes: 213 additions & 0 deletions examples/configs/sft_lora.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
# SFT algorithm configuration: training length, validation cadence and seed.
sft:
## The total number of training steps equals
## min(max_num_epochs * len(train_dataloader), max_num_steps).
max_num_epochs: 1
max_num_steps: 60

# Validation settings.
val_period: 10
val_batches: 8
val_global_batch_size: 32
val_micro_batch_size: 1
val_at_start: true
seed: 42

# Checkpoint settings. Checkpointing is turned off in this config (enabled: false).
checkpointing:
enabled: false
checkpoint_dir: "results/sft"
metric_name: "val:val_loss" # one of "val:" or "train:" followed by the metric name
# Lower values of the metric above are treated as better.
higher_is_better: false
keep_top_k: 3
save_period: 10
checkpoint_must_save_by: null

# Policy model, tokenizer and batch-size settings.
policy:
# NOTE(review): this is an absolute local path, presumably specific to one
# machine or container; confirm, or replace it with a model hub identifier.
model_name: "/models/Qwen3-0.6B"
tokenizer:
name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default
# chat_template can be a Jinja template string or a path to a .jinja file.
# The template below emits "Context: ..." for system messages,
# " Question: ... Answer:" for user messages, and the stripped content
# prefixed with a space for assistant messages.
chat_template: "{% for message in messages %}{%- if message['role'] == 'system' %}{{'Context: ' + message['content'].strip()}}{%- elif message['role'] == 'user' %}{{' Question: ' + message['content'].strip() + ' Answer:'}}{%- elif message['role'] == 'assistant' %}{{' ' + message['content'].strip()}}{%- endif %}{% endfor %}"
chat_template_kwargs: null # can be used to pass kwargs to the chat template, e.g., enable_thinking=true
train_global_batch_size: 32
train_micro_batch_size: 1
max_total_sequence_length: 1024
precision: "bfloat16"

# DTensor backend settings, including the LoRA adapter, which is enabled here.
dtensor_cfg:
enabled: true
_v2: true
env_vars: {}
cpu_offload: false
sequence_parallel: false
activation_checkpointing: false
tensor_parallel_size: 1
context_parallel_size: 1
custom_parallel_plan: null
lora:
enabled: true
target_modules: [] # matching all linear modules takes precedence
exclude_modules: []
match_all_linear: true
dim: 8
alpha: 32
dropout: 0.0
dropout_position: "post"
lora_A_init: "xavier"
# The adapter dtype follows policy.precision.
lora_dtype: ${policy.precision}
use_triton: true

# Dynamic batching and sequence packing are both disabled in this config.
# Each token budget is max_total_sequence_length * train_micro_batch_size.
dynamic_batching:
enabled: false
train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
sequence_length_round: 64

sequence_packing:
enabled: false
train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
algorithm: "modified_first_fit_decreasing"
sequence_length_round: 64

# Makes the training sequence length divisible by the tensor parallel size;
# this is useful for sequence parallel training.
make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size}
max_grad_norm: 1.0

# Optimizer used with the DTensor backend.
optimizer:
name: "torch.optim.AdamW"
kwargs:
lr: 5.0e-6
weight_decay: 0.1
betas: [0.9, 0.98]
# Written with a decimal point so YAML 1.1 loaders resolve it as a float;
# a bare `1e-5` is loaded as a string by such parsers.
eps: 1.0e-5
# When using DTensor, foreach and fused must both be set to false.
foreach: false
fused: false

## Ignored since enabled=false, but needed for testing purposes.
megatron_cfg:
enabled: false
env_vars: {}
empty_unused_memory_level: 1
activation_checkpointing: false
tensor_model_parallel_size: 1
expert_tensor_parallel_size: 1
expert_model_parallel_size: 1
pipeline_model_parallel_size: 1
context_parallel_size: 1
# The pipeline dtype follows policy.precision.
pipeline_dtype: ${policy.precision}
num_layers_in_first_pipeline_stage: null
num_layers_in_last_pipeline_stage: null
sequence_parallel: false
freeze_moe_router: false
moe_router_dtype: null
moe_router_load_balancing_type: "aux_loss"
# Written with a decimal point so YAML 1.1 loaders resolve it as a float.
moe_router_bias_update_rate: 1.0e-3
moe_permute_fusion: false
# gives ~20% training perf speedup with sequence packing
apply_rope_fusion: true
# gives ~25% training perf speedup with sequence packing and apply_rope_fusion
bias_activation_fusion: true
defer_fp32_logits: false

# Optimizer settings for the Megatron backend.
optimizer:
optimizer: "adam"
lr: 5.0e-6
min_lr: 4.9999e-6
weight_decay: 0.1
bf16: false
fp16: false
params_dtype: "float32"

# adam
adam_beta1: 0.9
adam_beta2: 0.98
# Written with a decimal point so YAML 1.1 loaders resolve it as a float.
adam_eps: 1.0e-5

# sgd
sgd_momentum: 0.9

# distributed optimizer
use_distributed_optimizer: true
use_precision_aware_optimizer: true

# Gradient clipping reuses policy.max_grad_norm.
clip_grad: ${policy.max_grad_norm}

# optimizer cpu offload
optimizer_cpu_offload: false
optimizer_offload_fraction: 0.0

# Scheduler settings for the Megatron backend.
scheduler:
# Start and end weight decay both resolve to the Megatron optimizer's
# weight_decay, with a "constant" increment style.
start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
weight_decay_incr_style: "constant"
lr_decay_style: "constant"
lr_decay_iters: 1000
lr_warmup_iters: 50
# Same value as the Megatron optimizer's min_lr (4.9999e-6).
lr_warmup_init: 4.9999e-6

# Distributed data parallel settings for the Megatron backend.
distributed_data_parallel_config:
grad_reduce_in_fp32: false
overlap_grad_reduce: true
overlap_param_gather: true
data_parallel_sharding_strategy: "optim_grads_params"
use_custom_fsdp: false

# Dataset and data-loading settings; this config trains on "squad".
data:
# The input sequence length limit follows policy.max_total_sequence_length.
max_input_seq_length: ${policy.max_total_sequence_length}
add_bos: true
add_eos: true
add_generation_prompt: false
shuffle: true
num_workers: 1

dataset_name: "squad"
# You can use custom response datasets for training and validation. For example:
# data:
# dataset_name: ResponseDataset
# train_data_path: <PathToTrainingDataset> # e.g., /path/to/local/dataset.jsonl or hf_org/hf_dataset_name (HuggingFace)
# val_data_path: <PathToValidationDataset>
# input_key: <QuestionKey>, default is "input"
# output_key: <AnswerKey>, default is "output"
# train_split: <TrainSplit>, default is None # used for HuggingFace datasets
# val_split: <ValSplit>, default is None # used for HuggingFace datasets
# See https://github.com/NVIDIA-NeMo/RL/blob/main/docs/guides/sft.md#datasets for more details.

## Unused with the squad dataset.
prompt_file: null
split: null
output_key: null
seed: null


## OpenAI-format-specific configs
# train_data_path: "/path/to/train.jsonl" # Path to training data
# val_data_path: "/path/to/val.jsonl" # Path to validation data
# chat_key: "messages" # Key for messages in the data
# system_key: null # Key for system message (optional)
# system_prompt: null # Default system prompt (optional)
# tool_key: "tools" # Key for tools in the data
# use_preserving_dataset: false # If true, uses PreservingDataset to preserve heterogeneous schemas (e.g., tool calls with varying argument structures)

# Logging settings; every logging backend is disabled here, while GPU monitoring is on.
logger:
log_dir: "logs" # Base directory for all logs
wandb_enabled: false # Make sure you run `wandb login <your API key>` before enabling
tensorboard_enabled: false
mlflow_enabled: false
swanlab_enabled: false # Disable SwanLab logging
monitor_gpus: true # If true, will monitor GPU usage and log to wandb and/or tensorboard
# Run and experiment names below interpolate data.dataset_name.
wandb:
project: "sft-dev"
name: "sft-dev-${data.dataset_name}"
tensorboard:
log_dir: "tb_logs-sft-dev-${data.dataset_name}"
mlflow:
experiment_name: "sft-dev"
run_name: "sft-dev-${data.dataset_name}"
gpu_monitoring:
collection_interval: 10 # How often to collect GPU usage metrics (in seconds)
flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds)

# Cluster shape: a single node with a single GPU.
cluster:
gpus_per_node: 1
num_nodes: 1
1 change: 1 addition & 0 deletions nemo_rl/algorithms/sft.py
Original file line number Diff line number Diff line change
Expand Up @@ -589,6 +589,7 @@ def sft_train(
f" • Training Model Floating Point Utilization: {100 * total_tflops / theoretical_tflops:.2f}%"
)
metrics["train_fp_utilization"] = total_tflops / theoretical_tflops
print(f" • Grad norm: {float(metrics['grad_norm']):.4f}")
print("\n⏱️ Timing:")
# Display total time first, separately
total_time = timing_metrics.get("total_step_time", 0)
Expand Down
Loading
Loading