diff --git a/examples/train_oat/train_oat_grpo.py b/examples/train_oat/train_oat_grpo.py index 59d4e5f..8e45630 100644 --- a/examples/train_oat/train_oat_grpo.py +++ b/examples/train_oat/train_oat_grpo.py @@ -111,7 +111,7 @@ class Args(PPOArgs): # Reward settings gamma: float = 1.0 # Discount factor for Monte Carlo returns - norm_return: bool = True + norm_return: bool = False # Should be False to avoid double normalization # online evaluation settings eval_envs: str = None # 'eval:AIME24|eval:MATH500'. See gem.envs @@ -126,7 +126,7 @@ class Args(PPOArgs): # Episode collection logic keep_generation_failed: bool = False # Keep episodes with generation failures - critic_type2: Literal["grpo", "drgrpo", "rloo", "ep_level", "none"] = "none" + critic_type2: Literal["grpo", "drgrpo", "rloo", "ep_level", "none"] = "grpo" """ +=======================================+ """