example.py
import numpy as np
from lqr_mdp import MDP_LQR_Disc
# load the discrete-time LQR instance from file
mdp = MDP_LQR_Disc(fn='env/mdp_disc_laplacian.json')
print("The starting state is {}".format(mdp.state))
# Get a random linear policy K and run it for 10 steps
K = np.random.random((mdp.dim_u, mdp.dim_x))
for _ in range(10):
    # apply the random linear policy u = Kx and print the resulting state
    action = K.dot(mdp.state)
    next_state, reward, done = mdp.step(action)
    print("Iter {}: Next state: {}, cost: {}, done: {}".format(mdp.iter, next_state, reward, done))
    if done:
        # done means either the cost exceeded the maximum or the state became NaN
        print("Time to terminate")
        break
# Print the Value Matrix P for the current policy K where V(x) = x'Px
mdp.compute_cost_matrix(K)
print("The value matrix P: \n", mdp.P)
# Get the optimal policy and run it for 10 steps
K_optimal = mdp.get_optimal_policy()
mdp.reset()
print("The state is reset to {}".format(mdp.state))
for _ in range(10):
    # take the optimal action and print the resulting state
    action = K_optimal.dot(mdp.state)
    next_state, reward, done = mdp.step(action)
    print("Iter {}: Next state: {}, cost: {}, done: {}".format(mdp.iter, next_state, reward, done))
# Print the Optimal Value Matrix P_optimal where V*(x) = x'P_optimal x
print("The optimal value matrix P_optimal: \n", mdp.P_optimal)