robot_move.py · 592 lines (495 loc) · 30.5 KB
import pybullet as p
import pybullet_data
import numpy as np
import matplotlib.pyplot as plt
import time
import os
import shutil
import yaml
import argparse
import json
from robot_environment import TwoWheelRobot
from utils.metrics import TotalTrainingMetrics
from utils.recordVideoDirectMode import *
from utils.epsilonSchedules import epsilon_function
import torch
import torch.optim as optim
import tensorflow as tf
from utils.general import to_float
class MoveRobot:
""" Class for moving the Two-Wheel Robot (TWR); this is where all the other classes interact with each other.
"""
def __init__(self,
model_config,
hyperparameter_tuning_variable=False,
hyperparameter_value=False):
# model config path
self.model_config_path = model_config['config_path'] # config file path
self.run_name = model_config['run']['name'] # run name
self.mode = model_config['mode'] # training or testing mode
self.record_video = model_config['testing']['record_video'] # whether to record video
# model specific params
self.model_name = model_config['model']['name'] # model name
self.gamma = model_config['model']['gamma'] # discount factor
self.learning_rate_actor = model_config['model']['learning_rate_actor'] # learning rate of the actor's AdamW optimizer
self.learning_rate_critic = model_config['model']['learning_rate_critic'] # learning rate of the critic's AdamW optimizer
self.hidden_layer_size = model_config['model']['hidden_layer_size'] # number of hidden units per layer (list format [first layer num units, second layer num units, ...])
self.weight_decay = model_config['model']['weight_decay'] # weight decay (L2 regularisation) for the optimizer
self.dropout_rate = model_config['model']['dropout_rate'] # dropout rate for layers
self.batch_size = model_config['model']['batch_size'] # number of transitions from the replay buffer
self.tau = model_config['model']['tau'] # update rate of the target network
# tensorflow is already imported at module level; report the GPUs visible to it
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
# Epsilon Greedy Parameters
self.epsilon_initial = model_config['epsilon_greedy']['eps_init'] # starting value of epsilon
self.epsilon_end = model_config['epsilon_greedy']['eps_end'] # final value of epsilon
self.EPS_DECAY = model_config['epsilon_greedy']['eps_decay'] # rate of exponential decay of epsilon
self.epsilon_decay_type = model_config['epsilon_greedy']['epsilon_decay_type'] # type of epsilon schedule
self.epsilon_A = model_config['epsilon_greedy']['stretched_A']
self.epsilon_B = model_config['epsilon_greedy']['stretched_B']
self.epsilon_C = model_config['epsilon_greedy']['stretched_C']
# initialise training params if mode is training
if self.mode == 'train':
self.device_type = model_config['training']['device']
self.num_workers = model_config['training']['num_workers']
if torch.cuda.is_available():
print('cuda available')
else:
print('cuda not available')
if self.device_type == 'auto' or self.device_type == 'gpu': # automatically choose device type
os.environ['CUDA_VISIBLE_DEVICES'] = model_config['gpu_id']
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
else:
os.environ['CUDA_VISIBLE_DEVICES'] = ""
self.device = torch.device("cpu")
self.base_results_dir = model_config['training']['base_results_dir']
self.num_episodes = model_config['training']['num_train_episodes']
self.max_steps = model_config['training']['max_steps_per_episode']
self.n_steps = model_config['training']['n_step']
if self.n_steps == False:
self.n_steps = self.max_steps
self.save_model_weights = model_config['training']['save_model_weights']
# initialise testing params if mode is testing
elif self.mode == 'test':
self.device_type = model_config['testing']['device']
self.num_workers = model_config['testing']['num_workers']
if torch.cuda.is_available():
print('cuda available')
else:
print('cuda not available')
if self.device_type == 'auto' or self.device_type == 'gpu': # automatically choose device type
os.environ['CUDA_VISIBLE_DEVICES'] = model_config['gpu_id']
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
else:
self.device = torch.device("cpu")
self.base_results_dir = model_config['testing']['base_results_dir']
self.num_episodes = model_config['testing']['num_test_episodes']
self.max_steps = model_config['testing']['max_steps_per_episode']
self.load_model_weight_path = model_config['load_model_weights_path']
self.epsilon_decay_type = None
self.save_model_weights = False
self.record_video = model_config['testing']['record_video']
# load environment params
self.video_mode = model_config['environment']['video_mode']
self.render_mode = model_config['environment']['render_mode']
self.enable_keyboard = model_config['environment']['enable_keyboard']
self.environment_type = model_config['environment']['environment_type']
self.time_step_size = to_float(model_config['environment']['time_step_size'])
self.x_distance_to_goal = model_config['environment']['x_distance_to_goal']
self.distance_to_goal_penalty = model_config['environment']['distance_to_goal_penalty']
self.time_penalty = model_config['environment']['time_penalty']
self.target_velocity_change = model_config['environment']['target_velocity_change']
self.goal_type = model_config['environment']['goal_type']
self.goal_step = model_config['environment']['goal_step']
if self.goal_step > self.max_steps:
raise ValueError('goal step cannot be larger than max steps')
# if multi-agent or multi-action model is used
if self.model_name == 'MAA2C':
self.multi_action = True
# else single agent/single action model is used
else:
self.multi_action = False
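# NOTE: multi_action only changes how the chosen action is handed to env.step() further below;
# MAA2C is treated as a multi-action model (nominally one action per wheel, given num_wheels = 2 later),
# while the remaining models emit a single combined drive command.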
if self.mode == 'test' and self.record_video:
self.render_mode = 'GUI'
# create our environment object class
self.env = TwoWheelRobot(render_mode=self.render_mode,
enable_keyboard=self.enable_keyboard,
environment_type = self.environment_type,
time_step_size = self.time_step_size,
x_distance_to_goal = self.x_distance_to_goal,
robot_time_penalty = self.time_penalty,
target_velocity_change = self.target_velocity_change,
record_video = self.record_video,
multi_action = self.multi_action,
goal_type=self.goal_type
)
# get observation dimensions
self.observation_space_dimension = len(self.env.get_robot_state())
# get action space dimensions
self.action_space_dimension = self.env.get_action_space()
# results dir stores the results of each experiment run
self.results_dir = os.path.join(self.base_results_dir, self.run_name)
# if hyperparameter tuning, we create subdirectories to save the results of each run with a new hyperparameter value
if hyperparameter_tuning_variable:
self.results_dir = os.path.join(self.results_dir,f"{hyperparameter_tuning_variable}_{hyperparameter_value}")
# clears results directory if exist
if os.path.exists(self.results_dir) and os.path.isdir(self.results_dir):
shutil.rmtree(self.results_dir)
# creates results directory for current run
if not os.path.exists(self.results_dir):
os.makedirs(self.results_dir)
print(f'Saving results to {self.results_dir}')
if self.model_name == "DQN" or self.model_name == "DQNMA" or self.model_name == 'SAC':
self.save_model_weights_path = os.path.join(self.results_dir,'best_model.pt')
model_config['load_model_weights_path'] = self.save_model_weights_path
else:
model_config['load_model_weights_path'] = self.results_dir
# save config file to results dir
with open(os.path.join(self.results_dir, f"{self.run_name}.yaml"), 'w') as f:
yaml.dump(model_config,f)
# creates trajectory results directory for current run
self.results_dir_trajectory = os.path.join(self.results_dir,'robot_trajectories')
if not os.path.exists(self.results_dir_trajectory):
os.makedirs(self.results_dir_trajectory)
# creates plot results directory for current run
self.results_dir_plot = os.path.join(self.results_dir,'graphs')
if not os.path.exists(self.results_dir_plot):
os.makedirs(self.results_dir_plot)
# evaluation params
self.num_test_episodes = model_config['testing']['num_test_episodes']
# plotting params
self.plot_trajectories_episode_interval = model_config['plotting']['plot_trajectories_episode_interval']
self.record_trajectory_time_step_interval = model_config['plotting']['record_trajectory_time_step_interval']
# for testing, use zero epsilon, use full model exploitation
if self.mode == 'test':
self.epsilon_initial = 0.0
self.epsilon_end = 0.0
# initialise epsilon function class for varying epsilon during training
self.epsilon_function = epsilon_function(self.num_episodes,
eps_init=self.epsilon_initial,
eps_end=self.epsilon_end,
epsilon_decay_type=self.epsilon_decay_type,
A=self.epsilon_A, B=self.epsilon_B, C=self.epsilon_C)
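# The schedule itself lives in utils.epsilonSchedules. As a rough sketch (assuming a plain exponential
# decay type is selected), the per-episode value would look something like
#   epsilon(e) = eps_end + (eps_init - eps_end) * exp(-e / EPS_DECAY)
# while the 'stretched' type additionally uses the A, B, C parameters loaded above.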
# --------------------------------PURELY FOR MODEL INITIALISATION-------------------------------------------------
self.tf_model_list = ['A2C','MAA2C','Reinforce']
self.pt_model_list = ['DQN', 'DQNMA', 'SAC']
# load DQN based on config model name
if self.model_name == 'DQN' or self.model_name == 'DQNMA':
from robot_neural_network import ReplayMemoryDQN, DQN
if self.model_name == 'DQN':
self.policy_net = DQN(self.observation_space_dimension, self.action_space_dimension, self.hidden_layer_size).to(self.device)
self.target_net = DQN(self.observation_space_dimension, self.action_space_dimension, self.hidden_layer_size).to(self.device)
elif self.model_name == 'DQNMA':
from utils.general import Differential_Drive
self.differential_action_scheme = Differential_Drive(self.action_space_dimension,2)
self.policy_net = DQN(self.observation_space_dimension, len(self.differential_action_scheme.new_mapping), self.hidden_layer_size).to(self.device)
self.target_net = DQN(self.observation_space_dimension, len(self.differential_action_scheme.new_mapping), self.hidden_layer_size).to(self.device)
self.target_net.load_state_dict(self.policy_net.state_dict())
self.optimizer = optim.AdamW(self.policy_net.parameters(), lr=self.learning_rate_actor, amsgrad=True)
self.memory = ReplayMemoryDQN(10000)
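# Replay buffer holds up to 10000 transitions; in the usual DQN setup, update_weights() samples
# batch_size of them per step, which decorrelates consecutive PyBullet states before each gradient update.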
# load saved model for testing
if self.mode == 'test':
print(f"Loading model from {self.load_model_weight_path}")
# load saved model weights
self.policy_net.load_state_dict(torch.load(self.load_model_weight_path, map_location=self.device))
# Set model up for parallel computation and move it to the device
from torch.nn.parallel import DataParallel
if self.device_type != 'cpu':
self.policy_net = DataParallel(self.policy_net)
self.policy_net = self.policy_net.to(self.device)
self.target_net = DataParallel(self.target_net)
self.target_net = self.target_net.to(self.device)
else:
self.policy_net = DataParallel(self.policy_net)
self.policy_net = self.policy_net.module.to(self.device)
self.target_net = DataParallel(self.target_net)
self.target_net = self.target_net.module.to(self.device)
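# On CPU the networks are unwrapped again via .module so the forward pass skips the DataParallel
# scatter/gather overhead; on GPU the DataParallel wrapper is kept so multiple devices can be used.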
# Set the number of worker processes to use for computation
torch.set_num_threads(self.num_workers)
elif self.model_name == 'SAC':
from robot_neural_network import SAC
from robot_neural_network import ReplayMemorySAC
self.updates = 0
self.agent = SAC(num_inputs = self.observation_space_dimension, hidden_layer_size = self.hidden_layer_size[0], learning_rate = self.learning_rate_actor, update_interval = 1, device = self.device, epsilon = self.epsilon_initial, gamma = self.gamma, tau = self.tau)
self.memory = ReplayMemorySAC(10000)
# Set model up for parallel computation and move it to the device
from torch.nn.parallel import DataParallel
if self.device_type != 'cpu':
self.agent = DataParallel(self.agent)
self.agent = self.agent.to(self.device)
else:
self.agent = DataParallel(self.agent)
self.agent = self.agent.to(self.device)
if self.mode == 'test':
print(f"Loading model from {self.load_model_weight_path}")
# load saved model weights once and reuse the loaded state dict
load_weight = torch.load(self.load_model_weight_path, map_location=self.device)
self.agent.load_state_dict(load_weight)
elif self.model_name in self.tf_model_list:
self.num_wheels = 2
from robot_agent import Agent
if self.model_name == "MAA2C" or self.model_name == "Reinforce":
self.agent = Agent(model = self.model_name,
discount_rate = self.gamma,
learning_rate_actor = self.learning_rate_actor,
learning_rate_critic = self.learning_rate_critic,
hidden_layer_size=self.hidden_layer_size,
weight_decay=self.weight_decay,
dropout_rate=self.dropout_rate,
action_space_dimension = self.action_space_dimension,
observation_space_dimension=self.observation_space_dimension,
epsilon = self.epsilon_initial,
num_wheels = self.num_wheels)
elif self.model_name == "A2C":
self.agent = Agent(model = self.model_name,
discount_rate = self.gamma,
learning_rate_actor = self.learning_rate_actor,
learning_rate_critic = self.learning_rate_critic,
hidden_layer_size=self.hidden_layer_size,
weight_decay=self.weight_decay,
dropout_rate=self.dropout_rate,
action_space_dimension = self.action_space_dimension,
observation_space_dimension=self.observation_space_dimension,
epsilon = self.epsilon_initial,
num_wheels = self.num_wheels)
else:
raise ValueError("Invalid model/algorithm set in config")
else:
raise ValueError(f"model_name {self.model_name} not recognized")
# initialise training metrics logging function
self.MetricTotal=TotalTrainingMetrics()
# printing which device used for running
print(f"Using device: {self.device}")
def run_episode(self,episode):
"""Run a single episode and return the final transient reward, the success flag and the step at which the episode ended.
"""
steps_original_reward_list = []
steps_transient_reward_list = []
x_pos_list = []
y_pos_list = []
record_episode = 1
time_step = 0
done = 0
loss = 0
# get initial robot state
state = self.env.get_robot_state()
state = self.env.normalize_observations(state)
# if DQN, move state tensor to device
if (self.model_name == 'DQN' or self.model_name == "DQNMA") and not self.enable_keyboard and self.mode == 'train':
state = torch.tensor(state, dtype=torch.float32, device=self.device).unsqueeze(0)
# start logging metrics for episode
self.MetricTotal.start_episode(0,self.env.robot_base_position)
# set gravity to -9.8 m/s^2
p.setGravity(0,0,-9.8)
# run episode by stepping through each step
for time_step in range(self.max_steps):
if self.record_video and self.render_mode=='GUI' and (episode == record_episode):
record_gui_mode(p,time_step)
# capture image frames for DIRECT mode for specified episode
if self.video_mode and self.render_mode=='DIRECT' and (episode == record_episode): #and time_step % 100 == 0:
record_direct_mode(p,time_step)
# save trajectories of robot at each time step
if (time_step % self.record_trajectory_time_step_interval) == 0:
base_pos, _ = p.getBasePositionAndOrientation(self.env.robotid)
linear_velocity, angular_velocity = p.getBaseVelocity(self.env.robotid)
x_pos_list.append(base_pos[0])
y_pos_list.append(base_pos[1])
""" get action from model and take a step in the environment """
# enable keyboard manual control for debugging
if self.enable_keyboard:
keys = p.getKeyboardEvents()
if ord('q') in keys:
break
action = self.env.select_manual_action_from_keyboard(keys)
# model to choose action
else:
if (self.model_name == 'DQN' or self.model_name == "DQNMA"):
from robot_neural_network import select_action_DQN, update_weights
if (self.model_name == 'DQN' or self.model_name == "DQNMA") and not self.enable_keyboard and self.mode == 'test':
state = torch.tensor(state, dtype=torch.float32, device=self.device).unsqueeze(0)
action = select_action_DQN(obs = state, n_actions = self.action_space_dimension, policy_net = self.policy_net, device = self.device, epsilon = self.epsilon)
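# select_action_DQN is expected to act epsilon-greedily: with probability self.epsilon a random action
# index, otherwise the argmax over the policy network's Q-values (an assumption about the helper in
# robot_neural_network; during testing epsilon is forced to 0.0 above, so actions are fully greedy).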
elif self.model_name == 'SAC':
action = self.agent.module.select_action(state, self.epsilon,evaluate=self.mode) # Sample action from policy
elif self.model_name in self.tf_model_list:
action = self.agent.select_actions(state,mode="train")
else:
raise ValueError(f'{self.model_name} does not exist')
# take a step in the environment by applying the action on the robot
# for multi-action/ multi-agent model
if self.model_name == 'MAA2C':
step_action = action
elif self.model_name == "DQNMA":
step_action = self.differential_action_scheme.new_mapping[int(action)]
else:
step_action = action[0]
# take a step in environment and observe next state and rewards
next_state, reward, transient_reward, done, succeed, info = \
self.env.step(step_action, self.model_name, time_step=time_step, goal_step=self.goal_step)
# normalise observations
next_state = self.env.normalize_observations(next_state)
# add rewards to list
steps_original_reward_list.append(reward) # for measuring model performance
steps_transient_reward_list.append(transient_reward) # use transient reward to update models
# Check for the maximum displacement travelled by the agent
base_pos, base_ori = p.getBasePositionAndOrientation(self.env.robotid)
linear_velocity, angular_velocity = p.getBaseVelocity(self.env.robotid)
self.MetricTotal.intermediate_episode(base_pos,linear_velocity)
""" ===== update models during training ===== """
if self.mode == 'train':
# update weights of SAC model
if (self.model_name == 'SAC') :
if len(self.memory) > self.batch_size:
# Number of updates per step in environment
updates_per_step = 1
for i in range(updates_per_step):
# Update parameters of all the networks
policy_loss = self.agent.module.update_weights(self.memory, self.batch_size, self.updates)
self.updates += 1
loss += policy_loss
if time_step == self.max_steps - 1:
mask = 1
else:
mask = float(not done)
# Add transition to memory
self.memory.push(state, action, reward, next_state, mask)
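# mask is presumably used as (1 - done) in the SAC target: a true terminal transition gets mask 0 so the
# bootstrapped next-state value is dropped, while a time-limit cutoff keeps mask 1 and still bootstraps.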
# update weights of DQN model
elif (self.model_name == 'DQN' or self.model_name == 'DQNMA') and not self.enable_keyboard:
next_state = torch.tensor(next_state, dtype=torch.float32, device=self.device).unsqueeze(0)
transient_reward = torch.tensor(transient_reward, dtype=torch.float32, device=self.device).unsqueeze(0)
# Store the transition in memory
self.memory.push(state, action, next_state, transient_reward)
# Perform one step of the optimization (on the policy network)
pyt_loss = update_weights(self.policy_net, self.target_net, self.optimizer, self.memory, self.batch_size, self.gamma, self.device)
loss += pyt_loss.item()
# Soft update of the target network's weights: θ′ ← τ θ + (1 −τ )θ′
target_net_state_dict = self.target_net.state_dict()
policy_net_state_dict = self.policy_net.state_dict()
for key in policy_net_state_dict:
target_net_state_dict[key] = policy_net_state_dict[key]*self.tau + target_net_state_dict[key]*(1-self.tau)
self.target_net.load_state_dict(target_net_state_dict)
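# With a small tau the target network only slowly tracks the policy network, which keeps the
# bootstrapped Q-targets from chasing their own rapidly changing estimates.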
# update weights of tensorflow models
elif self.model_name in self.tf_model_list:
if self.agent.model == "MAA2C":
actor_observations=state # Local state observed by the acting mechanism (e.g. wheel)
actor_observations_prime = next_state
critic_observations = state
critic_observations_prime=next_state # Global State observed; >= actor observation
tf_loss = self.agent.update_weights(model_name=self.agent.model,
observations=critic_observations,
reward=reward,
observations_prime=critic_observations_prime,
is_done=done,
actor_observations=actor_observations)
actor_observations = actor_observations_prime
critic_observations = critic_observations_prime
# apply gradients for combined hybrid actor critic
elif self.agent.model == "A2C" or self.agent.model == "Reinforce":
tf_loss = self.agent.update_weights(model_name=self.agent.model,
observations=state,
reward = reward,
observations_prime=next_state,
is_done=done)
# for plotting of loss over steps
if isinstance(tf_loss, list):
loss += tf_loss[0]
elif isinstance(tf_loss, float):
loss += tf_loss
# assign previous state to current state
state = next_state
# if episode has ended, log the metrics
if done:
# calculate cumulative rewards
cumulative_reward = sum(steps_original_reward_list)
cumulative_transient_reward = sum(steps_transient_reward_list)
# get final base position for logging
base_pos, base_ori = p.getBasePositionAndOrientation(self.env.robotid)
# if DQN model, we move the transient reward to cpu for logging
if (self.model_name == "DQN" or self.model_name == 'DQNMA') and not self.enable_keyboard and self.mode =='train':
transient_reward = transient_reward.to('cpu').numpy()[0]
# add all logging metrics to the MetricTotal class at the end of the episode
self.MetricTotal.end_episode(transient_reward, cumulative_transient_reward, reward, cumulative_reward, time_step, base_pos, self.epsilon,loss)
# add robot final trajectory position (x,y) coordinates to lists
x_pos_list.append(base_pos[0])
y_pos_list.append(base_pos[1])
break
# plot the episode trajectory
plt.plot(y_pos_list, x_pos_list, color=self.robot_trajectory_color)
# stitch video for DIRECT mode for specified episode
if ((self.record_video and self.render_mode =='GUI') or \
(self.video_mode and self.render_mode =='DIRECT')) \
and episode == record_episode:
stitch_video_direct_mode(episode)
return steps_transient_reward_list[-1], succeed, time_step
def plot_trajectories(self,episode_index):
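"""Save the robot trajectory plot accumulated so far for the given episode index, then reset the matplotlib figure."""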
plt.xlabel('Y position (m)')
# plt.xlim(-3, 3)
plt.ylabel('X position (m)')
# plt.ylim(0, 12)
plt.title(f'robot path over episodes_{episode_index}')
# plt.show()
plt.savefig(os.path.join(self.results_dir_trajectory,f'robot_path_over_episodes_{episode_index}.png'))
# Clear the current axes.
plt.cla()
# Clear the current figure.
plt.clf()
# Closes all the figure windows.
plt.close('all')
import gc
gc.collect()
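# clearing axes/figures and forcing garbage collection keeps matplotlib memory from accumulating
# across the many trajectory plots saved during long training runs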
plt.figure()
def run(self):
""" Run the training/testing loop.
"""
print(f"goal type: {self.goal_type}")
mode_caption = 'TRAINING' if self.mode == 'train' else 'TESTING'
print(f"\n ===== START {mode_caption} ===== ")
total_success = 0
start_time = time.time()
self.robot_trajectory_color = np.random.rand(3,)
plt.figure()
# loop through episodes
for episode in range(self.num_episodes):
if episode != 0:
self.env.reset()
self.epsilon = self.epsilon_function.get_current_epsilon(episode)
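# epsilon is annealed once per episode (not per step), so every step within an episode explores
# with the same probability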
ep_reward, is_succeed, end_step = self.run_episode(episode)
total_success += is_succeed
print(f"\r\033[KEpisode: {episode+1}/{self.num_episodes} | Sucesses so far: {total_success}/{episode+1} | Sucess Rate: {round(total_success*100/(episode+1),2)}% | epsilon threshold: {round(self.epsilon,5):.5f} | transient rewards: {round(ep_reward,5):.5f} | survival_time: {round(end_step*self.time_step_size,2):.2f} | end_step: {end_step}", end='', flush=True)
# plot trajectories for TWR at an episode interval
if episode != 0 and episode % self.plot_trajectories_episode_interval == 0:
self.plot_trajectories(episode)
if episode+1 != self.num_episodes:
self.robot_trajectory_color = np.random.rand(3,)
# plot last episode trajectories
self.plot_trajectories(episode+1)
self.MetricTotal.end_training()
self.MetricTotal.plot_all(self.time_step_size, self.results_dir_plot)
plt.clf()
plt.figure().clear()
plt.close('all')
print(f"\r\033[KEpisode: {episode+1}/{self.num_episodes} | Sucesses so far: {total_success}/{episode+1} | Sucess Rate: {round(total_success*100/(episode+1),2)}% | epsilon threshold: {round(self.epsilon,5):.5f} | transient rewards: {round(ep_reward,5):.5f} | survival_time (secs): {round(end_step*self.time_step_size,2):.2f} | end_step: {end_step} ")
from datetime import timedelta
elapsed = time.time() - start_time
print(f"--- {mode_caption} took {timedelta(seconds=elapsed)} , HH : MM : SS ---")
print(f"===== {mode_caption} COMPLETED ===== \n\n")
# save DQN policy net weights
if self.save_model_weights and (self.model_name=="DQN" or self.model_name=='SAC' or self.model_name == "DQNMA"):
print(f"saved model weights to {self.save_model_weights_path}")
if self.model_name=="DQN" or self.model_name == "DQNMA":
model_class = self.policy_net
# SAC model
else:
model_class = self.agent.module.policy
if self.device_type !='cpu':
torch.save(model_class.module.state_dict(), self.save_model_weights_path)
else:
torch.save(model_class.state_dict(), self.save_model_weights_path)
elif self.save_model_weights:
self.agent.save_models(self.results_dir)
return
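# Example usage (a minimal sketch, not part of this module's entry point; the config file name and the
# way the YAML is loaded here are assumptions based on how model_config is read in __init__ above):
#
#   with open('configs/dqn_example.yaml') as f:          # hypothetical config file
#       model_config = yaml.safe_load(f)
#   model_config['config_path'] = 'configs/dqn_example.yaml'
#   mover = MoveRobot(model_config)
#   mover.run()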