It has been three days and I have not been able to make much progress. This is how close I am to the expected result; however, it still does not print the output that I want.
For example, with this input: 15 12 8 6 p
I expect this result:
1 up
2 right
3 up
4 left
5 up
6 wall-square
7 up
8 forbid
9 up
10 up
11 up
12 goal
13 right
14 right
15 goal
16 up
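(For reference, the square numbering comes from self.episode in the code at the bottom, with square 2 as the start; in this example 15 and 12 are the goals, 8 is forbidden, and 6 is the wall:)

13 14 15 16
 9 10 11 12
 5  6  7  8
 1  2  3  4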
With this input: 15 12 8 6 q 11
I am expecting this:
up 100.0
right 100.0
down 0.89
left 0.89
Updated: I am able to print the output correctly with the input 15 12 8 6 q 11. However, it throws an error when I try other inputs, such as:
12 7 5 6 q 3
Traceback (most recent call last):
File "C:", line 200, in <module>
user_input()
File "C:", line 197, in user_input
environment.print_four_Q_value(int(input_list[5]))
File "C:", line 142, in print_four_Q_value
print("down" + ' ' + str(round(episode.qValues[3], 2)))
TypeError: type NoneType doesn't define __round__ method
It still prints the first two lines before failing:
up 100.0
right 0.89
The expected output is:
up 100.0
right 0.89
down 9.9
left 0.89
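As far as I can tell, the error message itself just means round() was handed a None instead of a number. A minimal reproduction, separate from my program:

q = None       # what episode.qValues[3] apparently holds for state 3
round(q, 2)    # TypeError: type NoneType doesn't define __round__ method

# a possible guard while debugging (my own workaround, not the intended fix):
print("down" + ' ' + (str(round(q, 2)) if q is not None else "n/a"))

Guarding like this stops the crash, but it obviously does not produce the expected down 9.9, so the real question is why that q-value is still None after 10,000 iterations.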
I wonder if you have any more advice?
import random
import numpy as np
import enum

EACH_STEP_REWARD = -0.1
GOAL_SQUARE_REWARD = 100
FORBIDDEN_SQUARE_REWARD = -100
DISCOUNT_RATE_GAMMA = 0.1  # Discount Rate
LEARNING_RATE_ALPHA = 0.3  # Learning Rate
GREEDY_PROBABILITY_EPSILON = 0.5  # Greedy Probability
ITERATION_MAX_NUM = 10000  # Will be 10,000
START_LABEL = 2
LEVEL = 4
HEIGHT = 4
WEIGHT = 4


class Direction(enum.Enum):
    up = 1
    right = 2
    down = 3
    left = 0


class Node:
    def __init__(self, title, next, Goal=False, Forbidden=False, Wall=False, qValues=None, actions=None):
        self.title = title
        self.next = next
        self.qValues = [qValues] * 5
        self.move = [actions] * 5
        self.goal = Goal
        self.forbidden = Forbidden
        self.wall = Wall

    def max_Q_value(self):
        if self.wall:
            return False
        max_q = []
        for q in self.qValues:
            if q is not None:
                max_q.append(q)
        return max(max_q)

    def find_best_move(self):
        max_q = self.max_Q_value()
        q_index = self.qValues.index(max_q)
        return Direction(q_index)


class create_env:
    def __init__(self, input_list, wall=None):
        self.wall = wall
        self.episode = [[13, 14, 15, 16], [9, 10, 11, 12], [5, 6, 7, 8], [1, 2, 3, 4]]
        S = 2
        Node_1 = Node(1, [self.wall, 5, S, self.wall])
        Node_Start = Node(S, [1, 6, 3, self.wall])
        Node_3 = Node(3, [S, 7, 4, self.wall])
        Node_4 = Node(4, [3, 8, self.wall, self.wall])
        Node_5 = Node(5, [self.wall, 9, 6, 1])
        Node_6 = Node(6, [5, 10, 7, S])
        Node_7 = Node(7, [6, 11, 8, 3])
        Node_8 = Node(8, [7, 12, self.wall, 4])
        Node_9 = Node(9, [self.wall, 13, 10, 5])
        Node_10 = Node(10, [9, 14, 11, 6])
        Node_11 = Node(11, [10, 15, 12, 7])
        Node_12 = Node(12, [11, 16, self.wall, 8])
        Node_13 = Node(13, [self.wall, self.wall, 14, 9])
        Node_14 = Node(14, [13, self.wall, 15, 10])
        Node_15 = Node(15, [14, self.wall, 16, 11])
        Node_16 = Node(16, [15, self.wall, self.wall, 12])
        self.episode[0][0] = Node_1
        self.episode[0][1] = Node_Start
        self.episode[0][S] = Node_3
        self.episode[0][3] = Node_4
        self.episode[1][0] = Node_5
        self.episode[1][1] = Node_6
        self.episode[1][S] = Node_7
        self.episode[1][3] = Node_8
        self.episode[S][0] = Node_9
        self.episode[S][1] = Node_10
        self.episode[S][S] = Node_11
        self.episode[S][3] = Node_12
        self.episode[3][0] = Node_13
        self.episode[3][1] = Node_14
        self.episode[3][S] = Node_15
        self.episode[3][3] = Node_16
        self.goal_labels = [int(input_list[0]), int(input_list[1])]
        self.forbidden_label = int(input_list[2])
        self.wall_label = int(input_list[3])
        x = 0
        while x < LEVEL:
            y = 0
            while y < LEVEL:
                current_episode = self.episode[x][y]
                if current_episode.title in self.goal_labels:
                    current_episode.goal = 1
                    current_episode.move.insert(4, 0)
                    current_episode.qValues.insert(4, 0)
                elif current_episode.title == self.forbidden_label:
                    current_episode.forbidden = 1
                    current_episode.move.insert(4, 0)
                    current_episode.qValues.insert(4, 0)
                elif current_episode.title == self.wall_label:
                    current_episode.wall = 1
                else:
                    position = 0
                    while position < LEVEL:
                        if current_episode.next[position] is not None:
                            current_episode.move.insert(position, Direction(position))
                            current_episode.qValues.insert(position, False)
                        position += 1
                y += 1
            x += 1

    def get_episode(self, name):
        for x in self.episode:
            for episode in x:
                if episode.title == name:
                    # print(episode)
                    return episode

    def print_best_actions(self):
        for row in self.episode:
            for episode in row:
                if episode.goal:
                    best_action_str = 'Direction.goal'
                elif episode.forbidden:
                    best_action_str = "Direction.forbid"
                elif episode.wall:
                    best_action_str = 'Direction.wall-square'
                else:
                    best_action_str = str(episode.find_best_move())
                print(str(episode.title) + " " + best_action_str[10:])

    def print_four_Q_value(self, index):
        episode = self.get_episode(index)
        print("up" + ' ' + str(round(episode.qValues[1], 2)))
        print("right" + ' ' + str(round(episode.qValues[2], 2)))
        print("down" + ' ' + str(round(episode.qValues[3], 2)))  # line 142 in the traceback
        print("left" + ' ' + str(round(episode.qValues[0], 2)))


def Q_learning(environment, print_best_actions, index):
    for iteration in range(ITERATION_MAX_NUM):
        current_episode = environment.get_episode(START_LABEL)
        total_episode_reward = 0
        for episode in range(100):
            if np.random.uniform(0, 1) < GREEDY_PROBABILITY_EPSILON:
                next_move = []
                for score in current_episode.move:
                    if score is not None:
                        next_move.append(score)
                next_move = random.choice(next_move)
            else:
                next_move = current_episode.find_best_move()
            next_episode = environment.get_episode(current_episode.next[next_move.value])
            if next_episode.goal:
                reward = GOAL_SQUARE_REWARD
            elif next_episode.forbidden:
                reward = FORBIDDEN_SQUARE_REWARD
            else:
                reward = EACH_STEP_REWARD
            total_episode_reward += reward
            old_q = current_episode.qValues[next_move.value]
            new_q = old_q + LEARNING_RATE_ALPHA * (reward + DISCOUNT_RATE_GAMMA * next_episode.max_Q_value() - old_q)
            current_episode.qValues[next_move.value] = new_q
            if next_episode.goal:
                break
            elif next_episode.forbidden:
                break
            else:
                if next_episode.wall:
                    break
                else:
                    current_episode = next_episode


def user_input():
    try:
        input_list = []
        input_str = input()
        input_list = input_str.split()
    except:
        print("The input should be like: 15 12 8 6 p")
    environment = create_env(input_list)
    if (len(input_list) == 5) and (input_list[-1] == 'p'):
        Q_learning(environment, 1, 0)
        environment.print_best_actions()
    elif (len(input_list) == 6) and (input_list[-2] == 'q'):
        Q_learning(environment, 0, int(input_list[5]))
        environment.print_four_Q_value(int(input_list[5]))


user_input()
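One thing I noticed while re-reading create_env (a guess, not verified): the setup loop only inserts a Direction and a starting q-value at positions where current_episode.next[position] is not None, and insert() grows the move and qValues lists instead of overwriting the None placeholders. For a border square like 3, the down neighbor is the outer wall, so Q_learning can never select down there and qValues[3] keeps its initial None, which is exactly the slot that print_four_Q_value tries to round. A defensive rewrite of the printer that I tried (same names, just a None guard):

def print_four_Q_value(self, index):
    episode = self.get_episode(index)
    # print in the same order as before, but tolerate never-updated entries
    for name, i in (("up", 1), ("right", 2), ("down", 3), ("left", 0)):
        q = episode.qValues[i]
        print(name + ' ' + (str(round(q, 2)) if q is not None else "n/a"))

That avoids the traceback, but since the expected output lists a learned value for down as well, I suspect moves into the border are supposed to be allowed (with the agent staying in place) rather than excluded entirely.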