-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathQLearningAgent.py
51 lines (41 loc) · 2.22 KB
/
QLearningAgent.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# QLearningAgent.py
import numpy as np
class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy action choice.

    The Q-table is a NumPy array whose leading axes index the state and
    whose last axis indexes the action.

    Attributes:
        qTable: Array of shape ``stateSize + (actionSize,)``, initialised to
            zeros.
        learningRate: Weight given to the new estimate in each update.
        discountFactor: Weight given to the best Q-value of the next state.
        explorationRate: Probability of picking a uniformly random action in
            ``selectAction``.
        actionSize: Number of possible actions.
        episodePath: List created empty here and cleared by
            ``resetEpisodePath``; this class never appends to it itself.
    """

    def __init__(self, stateSize, actionSize, learningRate, discountFactor, explorationRate):
        """Create the agent with an all-zero Q-table.

        Args:
            stateSize: Size of each state dimension. A tuple or list of ints,
                or a single int for a one-dimensional state space.
            actionSize: Number of possible actions.
            learningRate: Step size used by ``updateQTable``.
            discountFactor: Discount applied to the next state's best Q-value.
            explorationRate: Epsilon of the epsilon-greedy strategy.
        """
        # Normalise to a tuple so that an int or a list works as well as the
        # tuple the concatenation below requires.
        stateShape = self._asIndex(stateSize)
        self.qTable = np.zeros(stateShape + (actionSize,))
        self.learningRate = learningRate
        self.discountFactor = discountFactor
        self.explorationRate = explorationRate
        self.actionSize = actionSize
        self.episodePath = []

    @staticmethod
    def _asIndex(state):
        """Return ``state`` as a flat tuple suitable for basic NumPy indexing.

        Indexing an array with a list or ndarray triggers advanced indexing
        (``q[[1, 2]]`` selects rows 1 and 2), whereas a tuple addresses one
        cell (``q[(1, 2)]``). Converting every state to a tuple makes tuples,
        lists, arrays and bare ints all address a single cell.
        """
        return tuple(np.ravel(state).tolist())

    def selectAction(self, state):
        """Pick an action for ``state`` using the epsilon-greedy strategy.

        With probability ``explorationRate`` a uniformly random action index
        is returned; otherwise the index of the largest Q-value for the state
        (the first one when several are equal).
        """
        if np.random.rand() < self.explorationRate:
            return np.random.randint(self.actionSize)
        return np.argmax(self.qTable[self._asIndex(state)])

    def updateQTable(self, state, action, reward, nextState):
        """Apply the Q-learning update rule to one state-action pair.

        ``Q[s, a] <- (1 - lr) * Q[s, a] + lr * (reward + gamma * max Q[s'])``

        Args:
            state: State in which ``action`` was taken.
            action: Index of the action taken.
            reward: Reward received for the transition.
            nextState: State reached after the transition.
        """
        # Read the next state's best value before writing, so the result is
        # unaffected by the update when state and nextState coincide.
        bestNextValue = np.max(self.qTable[self._asIndex(nextState)])
        cell = self._asIndex(state) + (action,)
        self.qTable[cell] = (1 - self.learningRate) * self.qTable[cell] + \
            self.learningRate * (reward + self.discountFactor * bestNextValue)

    def resetEpisodePath(self):
        """Clear the recorded episode path."""
        self.episodePath = []

    def getOptimalPolicy(self, obstacles):
        """Derive the greedy policy from the current Q-values.

        Args:
            obstacles: Array-like indexed like the state space; truthy
                entries mark obstacle cells.

        Returns:
            Integer array shaped like the state space. Obstacle cells hold
            ``-1``; every other cell holds the index of its best action, with
            ties broken uniformly at random.
        """
        # asarray lets plain nested lists be indexed with a state tuple.
        obstacleMask = np.asarray(obstacles)
        optimalPolicy = np.zeros(self.qTable.shape[:-1], dtype=int)
        # ndindex walks every state cell in row-major order for any number of
        # state dimensions.
        for cell in np.ndindex(optimalPolicy.shape):
            if obstacleMask[cell]:
                optimalPolicy[cell] = -1  # Mark obstacles
                continue
            qValues = self.qTable[cell]
            candidates = np.flatnonzero(qValues == np.max(qValues))
            # Randomly choose among actions with equal Q-values
            optimalPolicy[cell] = np.random.choice(candidates)
        return optimalPolicy