From 6de44a1cffcc9cead823d33a701cddb21f0294ba Mon Sep 17 00:00:00 2001 From: Minzheng_Wang <842474595@qq.com> Date: Wed, 29 Oct 2025 22:55:06 +0800 Subject: [PATCH] fix grpo default setting --- examples/train_oat/train_oat_grpo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/train_oat/train_oat_grpo.py b/examples/train_oat/train_oat_grpo.py index 59d4e5f..8e45630 100644 --- a/examples/train_oat/train_oat_grpo.py +++ b/examples/train_oat/train_oat_grpo.py @@ -111,7 +111,7 @@ class Args(PPOArgs): # Reward settings gamma: float = 1.0 # Discount factor for Monte Carlo returns - norm_return: bool = True + norm_return: bool = False # Should be False to avoid double normalization # online evaluation settings eval_envs: str = None # 'eval:AIME24|eval:MATH500'. See gem.envs @@ -126,7 +126,7 @@ class Args(PPOArgs): # Episode collection logic keep_generation_failed: bool = False # Keep episodes with generation failures - critic_type2: Literal["grpo", "drgrpo", "rloo", "ep_level", "none"] = "none" + critic_type2: Literal["grpo", "drgrpo", "rloo", "ep_level", "none"] = "grpo" """ +=======================================+ """