Skip to content

Commit 2254783

Browse files
committed
Add acemath recipe.
Signed-off-by: Felipe Vieira Frujeri <ffrujeri@nvidia.com>
1 parent 775fc34 commit 2254783

File tree

11 files changed

+573
-0
lines changed

11 files changed

+573
-0
lines changed
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
defaults:
2+
- ../../grpo_math_1B.yaml
3+
- grpo-acereason-math-7b-8K.yaml
4+
policy:
5+
max_total_sequence_length: 16384
6+
dtensor_cfg:
7+
activation_checkpointing: true
8+
context_parallel_size: 2
9+
dynamic_batching:
10+
logprob_mb_tokens: 32768
11+
train_mb_tokens: 16384
12+
sequence_packing:
13+
enabled: false
14+
logprob_mb_tokens: 32768
15+
train_mb_tokens: 16384
16+
generation:
17+
max_new_tokens: 16384
18+
vllm_cfg:
19+
max_model_len: 16384
20+
data:
21+
max_input_seq_length: 16384
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
defaults:
2+
- ../../grpo_math_1B.yaml
3+
- grpo-acereason-math-7b-16K.yaml
4+
loss_fn:
5+
ratio_clip_c: 3
6+
reference_policy_kl_penalty: 0.0001
7+
policy:
8+
max_total_sequence_length: 24576
9+
logprob_batch_size: 2
10+
dtensor_cfg:
11+
activation_checkpointing: true
12+
context_parallel_size: 2
13+
dynamic_batching:
14+
logprob_mb_tokens: 49152
15+
train_mb_tokens: 24576
16+
sequence_packing:
17+
enabled: false
18+
logprob_mb_tokens: 49152
19+
train_mb_tokens: 24576
20+
optimizer:
21+
kwargs:
22+
lr: 5.0e-07
23+
generation:
24+
max_new_tokens: 24576
25+
vllm_cfg:
26+
max_model_len: 24576
27+
gpu_memory_utilization: 0.8
28+
enforce_eager: true
29+
data:
30+
max_input_seq_length: 24576
31+
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
defaults:
2+
- ../../grpo_math_1B.yaml
3+
- grpo-acereason-math-7b-16K.yaml
4+
loss_fn:
5+
ratio_clip_c: 3
6+
reference_policy_kl_penalty: 0.0001
7+
policy:
8+
max_total_sequence_length: 32768
9+
logprob_batch_size: 2
10+
dtensor_cfg:
11+
activation_checkpointing: true
12+
context_parallel_size: 4
13+
dynamic_batching:
14+
logprob_mb_tokens: 65536
15+
train_mb_tokens: 32768
16+
sequence_packing:
17+
enabled: false
18+
logprob_mb_tokens: 65536
19+
train_mb_tokens: 32768
20+
optimizer:
21+
kwargs:
22+
lr: 5.0e-07
23+
generation:
24+
max_new_tokens: 32768
25+
vllm_cfg:
26+
max_model_len: 32768
27+
gpu_memory_utilization: 0.8
28+
enforce_eager: true
29+
tensor_parallel_size: 4
30+
data:
31+
max_input_seq_length: 32768
32+
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
defaults: ../../grpo_math_1B.yaml
2+
grpo:
3+
max_num_epochs: 30
4+
num_prompts_per_step: 128
5+
use_leave_one_out_baseline: false
6+
val_period: 0
7+
loss_fn:
8+
ratio_clip_c: 3
9+
reference_policy_kl_penalty: 0.0
10+
checkpointing:
11+
keep_top_k: 10
12+
model_save_format: null
13+
policy:
14+
activation_checkpointing_enabled: false
15+
dtensor_cfg:
16+
activation_checkpointing: true
17+
context_parallel_size: 2
18+
dynamic_batching:
19+
logprob_mb_tokens: 16384
20+
train_mb_tokens: 8192
21+
fsdp_offload_enabled: false
22+
generation:
23+
colocated:
24+
resources:
25+
gpus_per_node: 8
26+
max_new_tokens: 8192
27+
model_name: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
28+
pad_token_id: 151643
29+
stop_token_ids:
30+
- 151643
31+
vllm_cfg:
32+
enable_expert_parallel: false
33+
enforce_eager: true
34+
load_format: dummy
35+
max_model_len: 8192
36+
precision: float32
37+
skip_tokenizer_init: true
38+
tensor_parallel_size: 2
39+
logprob_batch_size: 2
40+
lr: 1.0e-06
41+
make_sequence_length_divisible_by: 4
42+
max_total_sequence_length: 8192
43+
min_lr: 1.0e-06
44+
model_name: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
45+
optimizer:
46+
kwargs:
47+
lr: 1.0e-06
48+
pipeline_model_parallel_size: 1
49+
precision: float32
50+
refit_buffer_size_gb: 4
51+
scheduler:
52+
- kwargs:
53+
end_factor: 1.0
54+
start_factor: 1.0
55+
total_iters: 1
56+
name: torch.optim.lr_scheduler.LinearLR
57+
- kwargs:
58+
T_max: 1000000
59+
eta_min: 1.0e-06
60+
name: torch.optim.lr_scheduler.CosineAnnealingLR
61+
- milestones:
62+
- 0
63+
sequence_packing:
64+
enabled: false
65+
logprob_mb_tokens: 16384
66+
train_mb_tokens: 8192
67+
tensor_model_parallel_size: 1
68+
tokenizer:
69+
name: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
70+
train_global_batch_size: 2048
71+
train_micro_batch_size: 1
72+
weight_decay: 0.01
73+
data:
74+
dataset_name: nvidia/AceReason-Math
75+
max_input_seq_length: 8192
76+
prompt_file: examples/prompts/acemath_qwen_cot.txt
77+
shuffle: false
78+
num_workers: 16
79+
env:
80+
math:
81+
env_cls: nemo_skills.training.nemo_rl.environments.math_environment.MathEnvironment
82+
num_workers: 16
83+
logger:
84+
monitor_gpus: false
85+
cluster:
86+
gpus_per_node: 8
87+
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Solve the following math problem. Make sure to put the answer (and only answer) inside \boxed{{}}.
2+
3+
{}

nemo_rl/data/datasets/response_datasets/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
# limitations under the License.
1414
from typing import Any
1515

16+
from nemo_rl.data.datasets.response_datasets.acereason_math import AceReasonMathDataset
1617
from nemo_rl.data.datasets.response_datasets.clevr import CLEVRCoGenTDataset
1718
from nemo_rl.data.datasets.response_datasets.dapo_math import DAPOMath17KDataset
1819
from nemo_rl.data.datasets.response_datasets.deepscaler import DeepScalerDataset
@@ -79,6 +80,9 @@ def load_response_dataset(data_config, seed: int = 42):
7980
"Loading BytedTsinghua-SIA/DAPO-Math-17k for training and AIME 2024 for validation"
8081
)
8182
base_dataset: Any = DAPOMath17KDataset(seed=seed)
83+
elif dataset_name == "nvidia/AceReason-Math":
84+
print("Loading nvidia/AceReason-Math for training and validation")
85+
base_dataset: Any = AceReasonMathDataset(seed=seed)
8286
# for vlm rl training
8387
elif dataset_name == "clevr-cogent":
8488
base_dataset: Any = CLEVRCoGenTDataset(
@@ -124,6 +128,7 @@ def load_response_dataset(data_config, seed: int = 42):
124128

125129

126130
__all__ = [
131+
"AceReasonMathDataset",
127132
"CLEVRCoGenTDataset",
128133
"DeepScalerDataset",
129134
"DAPOMath17KDataset",
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
from typing import Any
17+
18+
from datasets import Dataset, load_dataset
19+
20+
from nemo_rl.data.interfaces import TaskDataSpec
21+
22+
23+
def format_acereason_math(
24+
data: dict[str, str | float | int],
25+
) -> dict[str, list[Any] | str]:
26+
"""Format AceReason-Math data to the expected message format."""
27+
return {
28+
"messages": [
29+
{
30+
"role": "user",
31+
"content": data["problem"],
32+
},
33+
{
34+
"role": "assistant",
35+
"content": data["answer"],
36+
},
37+
],
38+
# For v0.1 release, nemo rl datasets require a task_name key such that user can map a task processor per unique task.
39+
"task_name": "math",
40+
}
41+
42+
43+
def extract_dataset(split_name: str, data_split: Any) -> Any:
    """Tag every sample of a dataset split with ``task_name = "math"``.

    Args:
        split_name: Name of the split. Not read by this function; kept in
            the signature so existing callers keep working.
        data_split: An object exposing ``map(fn)`` (presumably a Hugging
            Face ``Dataset`` - confirm against callers), or ``None``.

    Returns:
        ``None`` when ``data_split`` is ``None``; otherwise the result of
        ``data_split.map`` applied with a function that sets
        ``example["task_name"]`` to ``"math"`` on each sample.
    """
    if data_split is None:
        return None

    def add_task_name(example: dict) -> dict:
        # GRPO compatibility: each sample carries its task name.
        example["task_name"] = "math"
        return example

    return data_split.map(add_task_name)
54+
55+
56+
def prepare_acereason_math_dataset(seed: int = 42) -> dict[str, Dataset | None]:
    """Load and format the datasets used by the AceReason-Math GRPO recipe.

    Training data comes from ``nvidia/AceReason-Math``; validation data
    comes from ``HuggingFaceH4/aime_2024``. Both are requested with
    ``split="train"``.

    Args:
        seed: Seed passed to ``shuffle`` for the training split. The
            validation split is not shuffled.

    Returns:
        A dict with ``"train"`` and ``"validation"`` entries holding the
        formatted splits, as returned by ``prepare_math_dataset``.
    """
    train_ds = load_dataset("nvidia/AceReason-Math", split="train")

    # AIME 2024 serves as the validation set (following the pattern of the
    # other math datasets).
    val_ds = load_dataset("HuggingFaceH4/aime_2024", split="train")

    # Only the training split is shuffled, using the caller's seed.
    train_ds = train_ds.shuffle(seed=seed)

    # Reformat each example and drop every original column.
    train_formatted = train_ds.map(
        format_acereason_math, remove_columns=train_ds.column_names
    )
    val_formatted = val_ds.map(
        format_acereason_math, remove_columns=val_ds.column_names
    )

    formatted_ds_dict = {
        "train": extract_dataset("train", train_formatted),
        "validation": extract_dataset("validation", val_formatted),
    }

    return prepare_math_dataset(formatted_ds_dict)
81+
82+
83+
def prepare_math_dataset(formatted_ds_dict: dict[str, Any]) -> dict[str, Any]:
    """Return a shallow copy of the split-name -> dataset mapping.

    Every entry is carried over unchanged, including splits whose value is
    ``None``; key order is preserved.

    Args:
        formatted_ds_dict: Mapping from split name to a formatted dataset
            (or ``None`` for a missing split).

    Returns:
        A new dict with the same keys and the same value objects.
    """
    # The previous per-split loop stored the same value in both branches of
    # its None check, so a plain shallow copy is equivalent.
    return dict(formatted_ds_dict)
92+
93+
94+
class AceReasonMathDataset:
    """AceReason-Math training data paired with a validation split.

    Attributes set by ``__init__``:
        formatted_ds: The dict returned by ``prepare_acereason_math_dataset``.
        task_spec: A ``TaskDataSpec`` whose ``task_name`` is
            ``"AceReason-Math"``.
    """

    def __init__(self, seed: int = 42) -> None:
        """Load and format the dataset splits.

        Args:
            seed: Random seed forwarded to ``prepare_acereason_math_dataset``,
                which uses it to shuffle the training split.
        """
        self.formatted_ds = prepare_acereason_math_dataset(seed=seed)

        self.task_spec = TaskDataSpec(
            task_name="AceReason-Math",
        )
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
#!/bin/bash
# Functional test driver for the AceReason-Math GRPO recipe: runs training,
# checks training metrics, converts the final checkpoint to HF format, and
# evaluates it.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
# NOTE(review): common.env presumably defines PROJECT_ROOT, CONFIG_PATH,
# LOG_DIR, CKPT_DIR, EXP_NAME, RUN_LOG, JSON_METRICS and
# exit_if_max_steps_reached - confirm against that file.
source "$SCRIPT_DIR/common.env"

# ===== BEGIN CONFIG =====
NUM_NODES=1
STEPS_PER_RUN=100
MAX_STEPS=1000
NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
NUM_MINUTES=240
# ===== END CONFIG =====

exit_if_max_steps_reached

# Run the experiment
cd "$PROJECT_ROOT"
# "$@" is quoted so caller-supplied overrides containing spaces or glob
# characters reach the training script unchanged.
uv run examples/run_grpo_math.py \
    --config "$CONFIG_PATH" \
    grpo.max_num_steps=$MAX_STEPS \
    logger.log_dir="$LOG_DIR" \
    logger.wandb_enabled=True \
    logger.wandb.project=nemo-rl \
    logger.wandb.name="$EXP_NAME" \
    logger.monitor_gpus=True \
    logger.tensorboard_enabled=True \
    checkpointing.enabled=True \
    checkpointing.checkpoint_dir="$CKPT_DIR" \
    "$@" \
    2>&1 | tee "$RUN_LOG"

# Convert tensorboard logs to json
uv run tests/json_dump_tb_logs.py "$LOG_DIR" --output_path "$JSON_METRICS"

# Only run metrics if the target step is reached
if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' "$JSON_METRICS") -ge $MAX_STEPS ]]; then
    uv run tests/check_metrics.py "$JSON_METRICS" \
        'mean(data["train/token_mult_prob_error"]) < 1.05' \
        "data['train/token_mult_prob_error']['$MAX_STEPS'] < 1.05"
fi

# Convert 16k checkpoint
uv run examples/converters/convert_dcp_to_hf.py \
    --config="$CKPT_DIR/step_${MAX_STEPS}/config.yaml" \
    --dcp-ckpt-path="$CKPT_DIR/step_${MAX_STEPS}/policy/weights" \
    --hf-ckpt-path="$CKPT_DIR/grpo-acereason-math-7b-16k-${MAX_STEPS}-hf"

# Run eval on AceReason-Math dataset
uv run examples/run_eval.py \
    generation.model_name="$CKPT_DIR/grpo-acereason-math-7b-16k-${MAX_STEPS}-hf" \
    data.prompt_file=examples/prompts/acemath_qwen_cot.txt \
    generation.vllm_cfg.max_model_len=16384 \
    generation.vllm_cfg.enforce_eager=True \
    generation.temperature=1.0 \
    eval.num_tests_per_prompt=16 \
    2>&1 | tee "${RUN_LOG}.acereason-eval"

# Pull the reported score out of the eval log and wrap it as JSON.
grep "score=" "${RUN_LOG}.acereason-eval" | sed 's/.*score=\([^ ]*\).*/{"score": \1}/' > "${RUN_LOG}-16k-metric.json"

# Set baseline score for AceReason-Math evaluation (adjust based on expected performance)
uv run tests/check_metrics.py "${RUN_LOG}-16k-metric.json" \
    'data["score"] >= 0.30'  # Baseline score to be adjusted based on actual performance

# Performance tracking comments
# ========================================================
# deepseek-ai/DeepSeek-R1-Distill-Qwen-7B baseline performance
# ========================================================
# This section will be updated with baseline performance metrics
# after initial runs to establish proper thresholds

0 commit comments

Comments
 (0)