Hi,
I modified the code template that was given to us for the part 2 SARSA, shown below, and did the same for part 3. The lines I added are marked with an "added" comment, and the line I removed is kept commented out with a "removed" note.
I changed the code this way because, as I understood the algorithm, when you are in (s', a') you use Q(s', a') to update Q(s, a) (one-step horizon), but you then also actually play the action a' that you chose for that update. Is that correct?
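Written out, I believe the update my code implements is the standard one-step SARSA rule (with eta the learning rate and gamma the discount, as in the code below):

    Q(s, a) ← Q(s, a) + eta * [ r + gamma * Q(s', a') - Q(s, a) ]

where a' is drawn from the same epsilon-greedy policy and then actually executed at the next step, which is what makes the method on-policy.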
# Maze, pick_action, cell_to_ind and calc_V come from the provided template
import numpy as np
import matplotlib.pyplot as plt

n = 5
env = Maze(n)
Q_table = np.zeros((env.n_states, env.n_actions))
state = env.current_state
eta = 0.1    # learning rate
gamma = 0.9  # discount rate
eps = 0.1    # exploration rate
num_episodes = 100
rewards = np.zeros(num_episodes)

for itr in range(num_episodes):
    # pick the first action of the episode before entering the loop
    action = pick_action(state, Q_table, epsilon=eps)  # added
    while not env.done:
        state_ind = cell_to_ind(state, n)
        # action = pick_action(state, Q_table, epsilon=eps)  # removed
        next_state, reward, done = env.step(action)
        next_state_ind = cell_to_ind(next_state, n)
        # a' is chosen with the same epsilon-greedy policy: it goes into
        # the update target and is actually played at the next step
        next_action = pick_action(next_state, Q_table, epsilon=eps)  # added
        ######## code here #############################
        # update Q-table using the iterative update rule
        Q_table[state_ind, action] += eta * (reward
                                             + gamma * Q_table[next_state_ind, next_action]
                                             - Q_table[state_ind, action])
        ######## code here ############################
        rewards[itr] += reward
        state = next_state
        action = next_action  # added
    env.reset()
    state = env.current_state

print(calc_V(Q_table, epsilon=eps))
plt.plot(rewards)
plt.show()
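For completeness, this is roughly how I picture the template's pick_action helper (a minimal epsilon-greedy sketch; the real template implementation may differ, and it reuses cell_to_ind and the global n like the snippet above):

def pick_action(state, Q_table, epsilon=0.1):
    # epsilon-greedy over the Q-table row of the current state:
    # with probability epsilon take a random action, otherwise the greedy one
    state_ind = cell_to_ind(state, n)
    if np.random.rand() < epsilon:
        return np.random.randint(Q_table.shape[1])  # explore
    return int(np.argmax(Q_table[state_ind]))       # exploit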
Best regards,
Müller Nathan