@@ -206,14 +206,13 @@ def forward(self, x):
 
 def make_env(cfg, idx, capture_video, run_name, gamma):
     def thunk():
-
-        if "safety" in cfg.env_id.lower():
+        if "velocity" in cfg.env_id.lower() or "safety" not in cfg.env_id.lower():
+            env = gym.make(cfg.env_id)
+        else:
             if capture_video:
-                env = gym.make(cfg.env_id, render_mode="rgb_array", early_termination=cfg.early_termination, term_cost=cfg.term_cost, failure_penalty=cfg.failure_penalty, reward_goal=cfg.reward_goal, reward_distance=cfg.reward_distance)
+                env = gym.make(cfg.env_id, render_mode="rgb_array", early_termination=cfg.early_termination, term_cost=cfg.term_cost, failure_penalty=cfg.failure_penalty, reward_goal=cfg.reward_goal, reward_distance=cfg.reward_distance)
             else:
                 env = gym.make(cfg.env_id, early_termination=cfg.early_termination, term_cost=cfg.term_cost, failure_penalty=cfg.failure_penalty, reward_goal=cfg.reward_goal, reward_distance=cfg.reward_distance)
-        else:
-            env = gym.make(cfg.env_id)
         env = gym.wrappers.FlattenObservation(env)  # deal with dm_control's Dict observation space
         env = gym.wrappers.RecordEpisodeStatistics(env)
         if capture_video:
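For context, a minimal sketch of the environment-construction logic this hunk arrives at. The helper name and the Gymnasium import are assumptions for illustration; the branch condition and kwargs mirror the diff: velocity tasks and non-safety tasks are built with a plain gym.make, while the remaining safety tasks receive the custom termination and reward arguments.

# Sketch only, not part of the diff. Assumes `gym` refers to Gymnasium, as in the script.
import gymnasium as gym

def build_env(cfg, capture_video):
    env_id = cfg.env_id.lower()
    if "velocity" in env_id or "safety" not in env_id:
        # Velocity tasks and plain Gym tasks do not accept the custom safety kwargs.
        return gym.make(cfg.env_id)
    # Remaining safety tasks take the custom termination/reward arguments.
    kwargs = dict(
        early_termination=cfg.early_termination,
        term_cost=cfg.term_cost,
        failure_penalty=cfg.failure_penalty,
        reward_goal=cfg.reward_goal,
        reward_distance=cfg.reward_distance,
    )
    if capture_video:
        kwargs["render_mode"] = "rgb_array"
    return gym.make(cfg.env_id, **kwargs)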
@@ -733,11 +732,11 @@ def train(cfg):
                 if info is None:
                     continue
                 print(ep_risk_penalty)
-                ep_cost = info["cost_sum"] if "safe" in cfg.env_id.lower() else info["cost"]
+                ep_cost = info["cost"]
                 cum_cost += ep_cost
                 ep_len = info["episode"]["l"][0]
                 buffer_num += ep_len
-                goal_met_ep = info["cum_goal_met"] if "safe" in cfg.env_id.lower() else info["is_success"]
+                goal_met_ep = info["cum_goal_met"] if "safe" in cfg.env_id.lower() and "velocity" not in cfg.env_id.lower() else 0
                 goal_met += goal_met_ep
                 #print(f"global_step={global_step}, episodic_return={info['episode']['r']}, episode_cost={ep_cost}")
                 scores.append(info['episode']['r'])
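A short sketch of the per-episode bookkeeping this hunk ends up with, pulled into a hypothetical helper for readability. The function name is illustrative; the keys info["cost"], info["episode"]["l"] and info["cum_goal_met"] are the ones used in the diff.

# Sketch only, not part of the diff; `episode_metrics` is a hypothetical helper.
def episode_metrics(info, env_id):
    env_id = env_id.lower()
    ep_cost = info["cost"]            # episode cost is now read the same way for every task
    ep_len = info["episode"]["l"][0]  # episode length from RecordEpisodeStatistics
    # Only goal-based safety tasks expose a cumulative goal counter;
    # velocity and plain tasks contribute 0.
    goal_met_ep = info["cum_goal_met"] if "safe" in env_id and "velocity" not in env_id else 0
    return ep_cost, ep_len, goal_met_ep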
@@ -752,7 +751,7 @@ def train(cfg):
                 writer.add_scalar("Results/Avg_Return", avg_mean_score, global_step)
                 torch.save(agent.state_dict(), os.path.join(wandb.run.dir, "policy.pt"))
                 wandb.save("policy.pt")
-                print(f"cummulative_cost={cum_cost}, global_step={global_step}, episodic_return={avg_mean_score}, episode_cost={ep_cost}")
+                print(f"cummulative_cost={cum_cost}, global_step={global_step}, episodic_return={info['episode']['r']}, avg_episodic_return={avg_mean_score}, episode_cost={ep_cost}")
                 if cfg.use_risk:
                     ep_risk = torch.sum(all_risks.squeeze()[last_step:global_step, 0]).item()
                     cum_risk += ep_risk
@@ -769,7 +768,7 @@ def train(cfg):
                 step_log = 0
                 ep_risk_penalty = 0
                 # f_dist_to_fail = torch.Tensor(np.array(list(reversed(range(f_obs.size()[0]))))).to(device) if cost > 0 else torch.Tensor(np.array([f_obs.size()[0]]*f_obs.shape[0])).to(device)
-                e_risks = np.array(list(reversed(range(int(ep_len))))) if cum_cost > 0 else np.array([int(ep_len)]*int(ep_len))
+                e_risks = np.array(list(reversed(range(int(ep_len))))) if terminated else np.array([int(ep_len)]*int(ep_len))
                 # print(risks.size())
                 e_risks = torch.Tensor(e_risks)
                 if cfg.fine_tune_risk != "None" and cfg.use_risk:
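To make the changed condition concrete, a small sketch of the two shapes e_risks can take. The 5-step episode length is an illustrative value; variable names mirror the diff. When the episode terminated in failure, each step is labeled with its remaining distance to that failure; otherwise every step is assigned the full episode length.

# Sketch only, illustrative values, not part of the diff.
import numpy as np

ep_len, terminated = 5, True
e_risks = (np.array(list(reversed(range(int(ep_len)))))   # terminated: [4, 3, 2, 1, 0]
           if terminated
           else np.array([int(ep_len)] * int(ep_len)))    # otherwise:  [5, 5, 5, 5, 5]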
0 commit comments