Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
192 changes: 192 additions & 0 deletions examples/configs/grpo_helpsteer3.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
# Base GRPO Algorithm Configuration for HelpSteer3 dataset
grpo:
  num_prompts_per_step: 32
  num_generations_per_prompt: 16  # rollouts per prompt (the GRPO group)
  max_rollout_turns: 1 # for multi-turn rollouts. HelpSteer3 conversations can have multiple turns
  max_num_epochs: 1
  max_num_steps: 500
  normalize_rewards: true
  use_leave_one_out_baseline: true
  val_period: 10  # run validation every 10 training steps
  val_at_start: false
  overlong_filtering: false
  max_val_samples: 256
  val_batch_size: 256
  seed: 42
  use_dynamic_sampling: false
  batch_multiplier: 1
  dynamic_sampling_max_gen_batches: 10  # only relevant when use_dynamic_sampling is true
  # Overlong-response penalty shaping; disabled in this base config.
  reward_shaping:
    enabled: false
    overlong_buffer_length: 128
    overlong_buffer_penalty: 1
    max_response_length: ${policy.max_total_sequence_length}
  # Linear remap of rewards from [source_min, source_max] to [target_min, target_max];
  # disabled here (and the configured mapping is the identity anyway).
  reward_scaling:
    enabled: false
    source_min: 0.0
    source_max: 1.0
    target_min: 0.0
    target_max: 1.0

# Asynchronous GRPO training settings.
async_grpo:
  enabled: false # Set to true to enable async training mode
  # Max age (in training steps) for trajectories used in training
  max_trajectory_age_steps: 1

# GRPO loss configuration.
loss_fn:
  reference_policy_kl_penalty: 0.01  # KL coefficient against the reference policy
  # Clipping bounds for the policy importance ratio (presumably PPO-style
  # [1 - min, 1 + max] -- confirm exact formula in the trainer).
  ratio_clip_min: 0.2
  ratio_clip_max: 0.2
  ratio_clip_c: null
  # (default off) loss formulation improvements (docs/guides/grpo.md#loss)
  use_on_policy_kl_approximation: false
  use_importance_sampling_correction: false
  sequence_level_importance_ratios: false
  truncated_importance_sampling_ratio: null
  token_level_loss: true

# Checkpoint retention: keep the top-k checkpoints ranked by metric_name.
checkpointing:
  enabled: true
  checkpoint_dir: "results/grpo-helpsteer3"
  metric_name: "val_reward"  # metric used to rank checkpoints
  higher_is_better: true
  keep_top_k: 3
  save_period: 10  # save every 10 training steps
  checkpoint_must_save_by: null
  model_save_format: "safetensors"
  save_consolidated: false

# Policy model, parallelism, optimizer/scheduler, and generation settings.
policy:
  model_name: "meta-llama/Llama-3.2-1B-Instruct"
  tokenizer:
    name: ${policy.model_name}  # reuse the policy model's tokenizer
  max_total_sequence_length: 2048
  precision: "bfloat16"
  train_global_batch_size: 512
  train_micro_batch_size: 4
  logprob_batch_size: 4
  logprob_chunk_size: null

  dtensor_cfg:
    _v2: true
    enabled: true
    cpu_offload: false
    sequence_parallel: false
    activation_checkpointing: false
    tensor_parallel_size: 1
    context_parallel_size: 1
    custom_parallel_plan: null

  megatron_cfg:
    enabled: false

  # See docs/design-docs/sequence-packing-and-dynamic-batching.md
  # for more details on dynamic batching and sequence packing.
  dynamic_batching:
    enabled: true
    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
    sequence_length_round: 64

  sequence_packing:
    enabled: false
    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
    algorithm: "modified_first_fit_decreasing"
    sequence_length_round: 64

  make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size}
  max_grad_norm: 1.0

  optimizer:
    name: "torch.optim.AdamW"
    kwargs:
      lr: 5.0e-6
      weight_decay: 0.01
      betas: [0.9, 0.999]
      eps: 1e-8

  scheduler:
    - name: "torch.optim.lr_scheduler.LinearLR"
      kwargs:
        start_factor: 0.1
        end_factor: 1.0
        # The scheduler iteration is per GRPO step and is decoupled from the optimizer step (may be >=1 per GRPO step)
        total_iters: 50
    - name: "torch.optim.lr_scheduler.ConstantLR"
      kwargs:
        factor: 1.0
        total_iters: 10000000000
    # Switch from LinearLR warmup to ConstantLR at step 50 (matches LinearLR total_iters).
    - milestones: [50]

  generation:
    backend: "vllm"
    max_new_tokens: ${policy.max_total_sequence_length}
    temperature: 1.0
    top_p: 1.0
    top_k: null
    stop_token_ids: null
    stop_strings: null
    vllm_cfg:
      async_engine: false
      precision: ${policy.precision}
      tensor_parallel_size: 1
      pipeline_parallel_size: 1
      expert_parallel_size: 1
      gpu_memory_utilization: 0.6
      max_model_len: ${policy.max_total_sequence_length}
      # when enforce_eager is false, it is optional to set ++policy.generation.vllm_kwargs.compilation_config.use_inductor=False for better accuracy,
      # with the flag, vllm will use the custom CUDA kernels instead of the Triton kernels generated by torch.compile
      # for more details, see convergence issue https://github.com/NVIDIA-NeMo/RL/issues/998
      enforce_eager: false
      use_deep_gemm: false
      num_last_layers_in_bf16: 0
      num_first_layers_in_bf16: 0
      vllm_kwargs: {}
    colocated:
      # true: generation shares training GPUs
      # false: uses dedicated generation resources
      enabled: true
      # only relevant when enabled is false
      resources:
        gpus_per_node: null # Decides num gpus to be dedicated to generation when there is one node in the cluster i.e cluster.num_nodes == 1
        num_nodes: null # Decides number of nodes to be dedicated to generation

# Dataset configuration for HelpSteer3.
data:
  max_input_seq_length: ${policy.max_total_sequence_length} # upper bound, real truncation occurs at vllm.max_model_len
  prompt_file: null # HelpSteer3 contains its own prompts
  system_prompt_file: null
  shuffle: true
  num_workers: 1
  dataset_name: "HelpSteer3"
  # HelpSteer3 preference dataset will be converted to response format for GRPO
  # The preferred responses will be used as target responses for the environment

# Reward environment configuration.
env:
  helpsteer3:
    num_workers: 8
    # Environment configuration for HelpSteer3 preference-based rewards
    reward_model: "preference_based" # Use preference scores as rewards

# Logging backends: wandb enabled by default; tensorboard and mlflow off.
logger:
  log_dir: "logs" # Base directory for all logs
  wandb_enabled: true # Make sure you run `wandb login <your API key>` before running
  tensorboard_enabled: false
  mlflow_enabled: false
  monitor_gpus: false # If true, will monitor GPU usage and log to wandb and/or tensorboard
  wandb:
    project: "grpo-helpsteer3"
    name: "grpo-helpsteer3"
  tensorboard:
    log_dir: "tb_logs-grpo-helpsteer3"
  mlflow:
    experiment_name: "grpo-helpsteer3"
    run_name: "grpo-helpsteer3"
  gpu_monitoring:
    collection_interval: 10 # How often to collect GPU usage metrics (in seconds)
    flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds)

# Compute resources for the run (single node with 8 GPUs).
cluster:
  gpus_per_node: 8
  num_nodes: 1

Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Overrides of the base HelpSteer3 GRPO config for Llama-3.2-1B;
# everything not listed here is inherited from the file below.
defaults: ../../grpo_helpsteer3.yaml

# GRPO Algorithm Configuration for Llama-3.2-1B with HelpSteer3
grpo:
  max_num_epochs: 3

checkpointing:
  # NOTE(review): trailing "-5" looks like a leftover per-run suffix -- confirm it is intended.
  checkpoint_dir: "results/grpo-helpsteer3-llama-3.2-1b-5"

policy:
  generation:
    stop_token_ids:
      - 128009 # <|eot_id|> for Llama-3.2

logger:
  tensorboard_enabled: true
  monitor_gpus: true
  wandb:
    project: "grpo-helpsteer3-llama-3.2-1b"
    name: "grpo-helpsteer3-llama-3.2-1b-tp${policy.dtensor_cfg.tensor_parallel_size}"
  tensorboard:
    log_dir: "tb_logs-grpo-helpsteer3-llama-3.2-1b"
  mlflow:
    run_name: "grpo-helpsteer3-llama-3.2-1b"
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# Overrides of the base HelpSteer3 GRPO config for Llama-3.3-Nemotron-Super-49B-v1.5;
# everything not listed here is inherited from the file below.
defaults: ../../grpo_helpsteer3.yaml

# GRPO Algorithm Configuration for Llama-3.3-Nemotron-Super-49B-v1.5 with HelpSteer3
grpo:
  num_prompts_per_step: 64
  max_num_epochs: 1
  max_num_steps: 10  # NOTE(review): 10 steps looks like a smoke-test setting -- confirm before real training

checkpointing:
  # NOTE(review): trailing "-3" looks like a leftover per-run suffix -- confirm it is intended.
  checkpoint_dir: "results/grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5-3"

policy:
  # NOTE(review): user-specific /lustre path is not usable outside the author's cluster --
  # a shared example should point at a portable model id. The directory name also says
  # "v1_2" while this config is titled v1.5; confirm which checkpoint is intended.
  model_name: /lustre/fsw/portfolios/coreai/users/joyang/models/llama-3_3-nemotron-49b-instruct-128k-v1_2-hf
  max_total_sequence_length: 32768
  train_global_batch_size: 64
  train_micro_batch_size: 1
  logprob_batch_size: 1

  dtensor_cfg:
    activation_checkpointing: true
    context_parallel_size: 4
    cpu_offload: true
    sequence_parallel: false
    tensor_parallel_size: 8
    custom_parallel_plan: examples.custom_parallel.llama_nemotron_super_49b_custom_plan.custom_parallel_plan

  optimizer:
    kwargs:
      lr: 3.0e-7

  scheduler:
    - name: "torch.optim.lr_scheduler.LinearLR"
      kwargs:
        start_factor: 0.1
        end_factor: 1.0
        total_iters: 13  # warmup length; keep in sync with milestones below
    - name: "torch.optim.lr_scheduler.ConstantLR"
      kwargs:
        factor: 1.0
        total_iters: 10000000000
    - milestones: [13]

  generation:
    vllm_cfg:
      tensor_parallel_size: 4  # generation TP intentionally differs from training TP (8) -- confirm

logger:
  wandb:
    project: "grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5"
    name: "grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5-tp${policy.dtensor_cfg.tensor_parallel_size}"
  tensorboard:
    log_dir: "tb_logs-grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5"
  mlflow:
    run_name: "grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5"

cluster:
  num_nodes: 16

This file was deleted.

This file was deleted.

Loading
Loading