monte_carlo.py
'''Monte Carlo methods for working with Markov Reward Processes and
Markov Decision Processes.
'''
from typing import Iterable, Iterator, TypeVar, Callable
from rl.distribution import Categorical
from rl.approximate_dynamic_programming import (ValueFunctionApprox,
QValueFunctionApprox,
NTStateDistribution)
from rl.iterate import last
from rl.markov_decision_process import MarkovDecisionProcess, Policy, \
TransitionStep, NonTerminal
from rl.policy import DeterministicPolicy, RandomPolicy, UniformPolicy
import rl.markov_process as mp
from rl.returns import returns
import itertools

S = TypeVar('S')
A = TypeVar('A')


def mc_prediction(
traces: Iterable[Iterable[mp.TransitionStep[S]]],
approx_0: ValueFunctionApprox[S],
γ: float,
episode_length_tolerance: float = 1e-6
) -> Iterator[ValueFunctionApprox[S]]:
    '''Evaluate an MRP using the Monte Carlo method, simulating episodes
    until the discount weight γᵏ falls below the given tolerance.

    Each value this function yields represents the approximated value
    function for the MRP after one additional episode.

    Arguments:
      traces -- an iterator of simulation traces from an MRP
      approx_0 -- initial approximation of the value function
      γ -- discount rate (0 < γ ≤ 1)
      episode_length_tolerance -- stop an episode once γᵏ ≤ tolerance

    Returns an iterator with updates to the approximated value
    function after each episode.
    '''
episodes: Iterator[Iterator[mp.ReturnStep[S]]] = \
(returns(trace, γ, episode_length_tolerance) for trace in traces)
f = approx_0
yield f
for episode in episodes:
f = last(f.iterate_updates(
[(step.state, step.return_)] for step in episode
))
yield f
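

# A minimal usage sketch for mc_prediction (illustrative only): `mrp` stands
# in for a rl.markov_process.MarkovRewardProcess and `start_states` for a
# distribution over its non-terminal starting states; Tabular is assumed to
# be the tabular approximation from rl.function_approx.
#
#     import itertools
#     from rl.function_approx import Tabular
#
#     traces = mrp.reward_traces(start_states)      # endless stream of episodes
#     value_functions = mc_prediction(traces, Tabular(), γ=0.9)
#     v = next(itertools.islice(value_functions, 1000, None))  # after 1000 episodes

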
def batch_mc_prediction(
traces: Iterable[Iterable[mp.TransitionStep[S]]],
approx: ValueFunctionApprox[S],
γ: float,
episode_length_tolerance: float = 1e-6,
convergence_tolerance: float = 1e-5
) -> ValueFunctionApprox[S]:
    '''Estimate the value function of an MRP from a finite collection of
    episodes, solving the approximation in a single batch over all observed
    (state, return) pairs rather than updating episode by episode.

    traces must be a finite iterable.
    '''
return_steps: Iterable[mp.ReturnStep[S]] = \
itertools.chain.from_iterable(
returns(trace, γ, episode_length_tolerance) for trace in traces
)
return approx.solve(
[(step.state, step.return_) for step in return_steps],
convergence_tolerance
)
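

# Sketch of batch_mc_prediction on a finite collection of episodes (`mrp`,
# `start_states` and Tabular as in the sketch above; 500 episodes is an
# arbitrary illustrative choice):
#
#     import itertools
#     from rl.function_approx import Tabular
#
#     episodes = list(itertools.islice(mrp.reward_traces(start_states), 500))
#     v = batch_mc_prediction(episodes, Tabular(), γ=0.9)
#
# Unlike mc_prediction, which yields a new approximation after every episode,
# this solves once over all (state, return) pairs from the 500 episodes.

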
def greedy_policy_from_qvf(
q: QValueFunctionApprox[S, A],
actions: Callable[[NonTerminal[S]], Iterable[A]]
) -> DeterministicPolicy[S, A]:
'''Return the policy that takes the optimal action at each state based
on the given approximation of the process's Q function.
'''
def optimal_action(s: S) -> A:
_, a = q.argmax((NonTerminal(s), a) for a in actions(NonTerminal(s)))
return a
return DeterministicPolicy(optimal_action)
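

# Example (illustrative; `q` is an existing QValueFunctionApprox and `mdp` an
# existing MarkovDecisionProcess): extract the greedy policy implied by q.
#
#     greedy = greedy_policy_from_qvf(q, mdp.actions)
#     # greedy.act(s) returns a distribution concentrated on the action that
#     # maximises q((s, a)) for the non-terminal state s.

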
def epsilon_greedy_policy(
q: QValueFunctionApprox[S, A],
mdp: MarkovDecisionProcess[S, A],
ε: float = 0.0
) -> Policy[S, A]:
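    '''Return an ε-greedy policy for the given MDP: with probability ε an
    action is chosen uniformly at random from the actions available in the
    state, and with probability 1 - ε the greedy action with respect to the
    Q-value function approximation q is chosen.
    '''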
def explore(s: S, mdp=mdp) -> Iterable[A]:
return mdp.actions(NonTerminal(s))
return RandomPolicy(Categorical(
{UniformPolicy(explore): ε,
greedy_policy_from_qvf(q, mdp.actions): 1 - ε}
))
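

# Example (illustrative; `q` and `mdp` are assumed to exist): a behaviour
# policy that explores 10% of the time and exploits otherwise.
#
#     behaviour = epsilon_greedy_policy(q, mdp, ε=0.1)
#     # At each state s, with probability 0.1 the action is drawn uniformly
#     # from mdp.actions(s); with probability 0.9 it is the greedy action
#     # argmax_a q((s, a)).

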
def glie_mc_control(
mdp: MarkovDecisionProcess[S, A],
states: NTStateDistribution[S],
approx_0: QValueFunctionApprox[S, A],
γ: float,
ϵ_as_func_of_episodes: Callable[[int], float],
episode_length_tolerance: float = 1e-6
) -> Iterator[QValueFunctionApprox[S, A]]:
    '''Find an approximately optimal policy for an MDP using GLIE
    (Greedy in the Limit with Infinite Exploration) Monte Carlo control,
    simulating episodes until the discount weight γᵏ falls below the
    given tolerance.

    Each value this function yields represents the approximated Q-value
    function for the MDP after one additional episode.

    Arguments:
      mdp -- the Markov Decision Process to solve
      states -- distribution of states to start episodes from
      approx_0 -- initial approximation of the Q-value function
      γ -- discount rate (0 ≤ γ ≤ 1)
      ϵ_as_func_of_episodes -- a function from the number of episodes
        completed to ε, the fraction of actions chosen by uniform
        exploration rather than greedily with respect to the current
        Q-value function approximation
      episode_length_tolerance -- stop an episode once γᵏ ≤ tolerance

    Returns an iterator with updates to the approximated Q-value function
    after each episode.
    '''
q: QValueFunctionApprox[S, A] = approx_0
p: Policy[S, A] = epsilon_greedy_policy(q, mdp, 1.0)
yield q
num_episodes: int = 0
while True:
trace: Iterable[TransitionStep[S, A]] = \
mdp.simulate_actions(states, p)
num_episodes += 1
for step in returns(trace, γ, episode_length_tolerance):
q = q.update([((step.state, step.action), step.return_)])
p = epsilon_greedy_policy(q, mdp, ϵ_as_func_of_episodes(num_episodes))
yield q
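

# A minimal usage sketch for glie_mc_control (every name except the functions
# defined in this module is an illustrative assumption):
#
#     import itertools
#     from rl.function_approx import Tabular
#
#     qs = glie_mc_control(
#         mdp=mdp,
#         states=start_states,
#         approx_0=Tabular(),
#         γ=0.9,
#         ϵ_as_func_of_episodes=lambda k: 1.0 / k,  # GLIE: ε decays to 0
#     )
#     q_final = next(itertools.islice(qs, 10000, None))  # after 10000 episodes
#     optimal_policy = greedy_policy_from_qvf(q_final, mdp.actions)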