From 6de44a1cffcc9cead823d33a701cddb21f0294ba Mon Sep 17 00:00:00 2001
From: Minzheng_Wang <842474595@qq.com>
Date: Wed, 29 Oct 2025 22:55:06 +0800
Subject: [PATCH] Fix GRPO example defaults: norm_return=False, critic_type2="grpo"

---
 examples/train_oat/train_oat_grpo.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/train_oat/train_oat_grpo.py b/examples/train_oat/train_oat_grpo.py
index 59d4e5f..8e45630 100644
--- a/examples/train_oat/train_oat_grpo.py
+++ b/examples/train_oat/train_oat_grpo.py
@@ -111,7 +111,7 @@ class Args(PPOArgs):
 
     # Reward settings
     gamma: float = 1.0  # Discount factor for Monte Carlo returns
-    norm_return: bool = True
+    norm_return: bool = False  # Should be False to avoid double normalization
 
     # online evaluation settings
     eval_envs: str = None  # 'eval:AIME24|eval:MATH500'. See gem.envs
@@ -126,7 +126,7 @@ class Args(PPOArgs):
     # Episode collection logic
     keep_generation_failed: bool = False  # Keep episodes with generation failures
 
-    critic_type2: Literal["grpo", "drgrpo", "rloo", "ep_level", "none"] = "none"
+    critic_type2: Literal["grpo", "drgrpo", "rloo", "ep_level", "none"] = "grpo"
 
 
 """ +=======================================+ """