FRIQ_episode.m
function [total_reward_friq, steps_friq] = FRIQ_episode(maxsteps, alpha, gamma, epsilon)
% FRIQ_episode: evaluate a single episode with the FRIQ-learning framework
%
% FRIQ-learning framework v0.70
% https://github.com/szaguldo-kamaz/
%
% Author: David Vincze <david.vincze@uni-miskolc.hu>
% Copyright (c) 2013-2022 by David Vincze
%
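% Example usage (a minimal sketch; the parameter values below are only
% illustrative assumptions, in practice they are supplied by the framework):
%   [total_reward, steps] = FRIQ_episode(1000, 0.5, 0.99, 0.001);
%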
% parameters and state shared with the rest of the FRIQ-learning framework
global FRIQ_param_drawfunc FRIQ_param_doactionfunc FRIQ_param_rewardfunc FRIQ_param_quantize_observationsfunc
global FRIQ_param_states FRIQ_param_statedivs FRIQ_param_states_default FRIQ_param_actions
global FRIQ_param_test_previous_rb
global FRIQ_param_drawsim
global stopappnow stepno reduction_state measure_rb_usage_state
global debug_on
state = FRIQ_param_states_default;
steps_friq = 0;
total_reward_friq = 0;
% skip rule-base updates when measuring rule-base usage, when testing a
% previously built rule base, or during the rule-base reduction phase
if (measure_rb_usage_state == 1) || (FRIQ_param_test_previous_rb == 1) || (reduction_state == 1)
    noRBupdate = 1;
else
    noRBupdate = 0;
end
% set initial state
state_quantized = state;
% choose an action using the epsilon-greedy selection strategy for the initial state
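% (with probability epsilon a random action is explored, otherwise the action
%  currently rated best by the rule base is exploited)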
action_index = FRIQ_epsilon_greedy_selection(state_quantized, epsilon, FRIQ_param_actions);
for stepno = 1:maxsteps
    % convert the index of the action to an action value
    action_value = FRIQ_param_actions(action_index);
    % perform the action, which takes the system to a new state
    state_p = FRIQ_param_doactionfunc(action_value, state);
    % calculate the reward for the state change
    [reward, isfinalstate] = FRIQ_param_rewardfunc(state_p);
    total_reward_friq = total_reward_friq + reward;
    %state_p_quantized=state_p;
    % quantize the raw observations into the predefined state partitions
    state_p_quantized = FRIQ_param_quantize_observationsfunc(state_p, FRIQ_param_states, FRIQ_param_statedivs);
    if debug_on == 1
        % print the step number, the reward, and the raw/quantized observations
        format long
        disp(['step: ' int2str(stepno) ' rew: ' num2str(reward, '%.16f')]);
        state_p
        state_p_quantized
    end
    % choose an action for the currently observed situation
    action_p = FRIQ_epsilon_greedy_selection(state_p_quantized, epsilon, FRIQ_param_actions);
    % convert the index of the action to an action value
    action_p_value = FRIQ_param_actions(action_p);
    % update the rule-base conclusions (Q-values) during the incremental construction phase
    if noRBupdate == 0
        FRIQ_update_RB(state_quantized, action_value, reward, state_p_quantized, action_p_value, alpha, gamma);
    end
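    % (FRIQ_update_RB presumably applies a temporal-difference update of the form
    %  Q(s,a) <- Q(s,a) + alpha * (reward + gamma * Q(s',a') - Q(s,a)),
    %  realized over the fuzzy rule base; this is an assumption based on the
    %  arguments passed here, not on the implementation of FRIQ_update_RB itself)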
    state_quantized = state_p_quantized;
    action_index = action_p;
    state = state_p;
    steps_friq = steps_friq + 1;
    % visualization
    if FRIQ_param_drawsim == 1
        FRIQ_param_drawfunc(state, action_value, steps_friq);
    end
    % stop when a terminal state is reached or the application was asked to stop
    if (isfinalstate == 1) || (stopappnow == 1)
        break
    end
end
% draw the final state as well
if FRIQ_param_drawsim == 1
    FRIQ_param_drawfunc(state, action_value, steps_friq);
end