(Apr-23-2022, 10:32 PM)deanhystad Wrote: I still think your building the board incorrectly. You should not be using insert() to set moves or values.
You should describe how your Q_learning function is supposed to work.
Can you elaborate on insert() to set move or values? Also why you think that the Q_learning function is not working? Thanks
Here is update on the code
import random import numpy as np import enum EACH_STEP_REWARD = -0.1 GOAL_SQUARE_REWARD = 100 FORBIDDEN_SQUARE_REWARD = -100 DISCOUNT_RATE_GAMMA = 0.1 # Discount Rate LEARNING_RATE_ALPHA = 0.3 # Learning Rate GREEDY_PROBABILITY_EPSILON = 0.5 # Greedy Probability ITERATION_MAX_NUM = 10000 # Will be 10,000 START_STATE = 2 LEVEL = 4 HEIGHT = 4 WEIGHT = 4 class Direction(enum.Enum): up = 0 right = 1 down = 2 left = 3 class Node: def __init__(self, title, next, Goal=False, Forbidden=False, Wall=False, qValues=None, actions=None): self.title = title self.next = next self.qValues = [qValues] * 4 self.move = [actions] * 4 self.goal = Goal self.forbidden = Forbidden self.wall = Wall def max_Q_value(self): if self.wall: return False max_q = [] for q in self.qValues: if q is not None: max_q.append(q) return max(max_q) def find_best_move(self): max_q = self.max_Q_value() q_index = self.qValues.index(max_q) return Direction(q_index) class create_env: def __init__(self, input_list, wall=None): self.wall = wall self.episode = [[13, 14, 15, 16], [9, 10, 11, 12], [5, 6, 7, 8], [1, 2, 3, 4]] S = 2 Node_1 = Node(1, [5, S, self.wall, self.wall]) Node_Start = Node(S, [6, 3, self.wall, 1]) Node_3 = Node(3, [S, 7, 4, self.wall]) Node_4 = Node(4, [8, self.wall, self.wall, 3]) Node_5 = Node(5, [9, 6, 1, self.wall]) Node_6 = Node(6, [10, 7, S, 5]) Node_7 = Node(7, [11, 8, 3, 6]) Node_8 = Node(8, [12, self.wall, 4, 7]) Node_9 = Node(9, [13, 10, 5, self.wall]) Node_10 = Node(10, [14, 11, 6, 9]) Node_11 = Node(11, [15, 12, 7, 10]) Node_12 = Node(12, [16, self.wall, 8, 11]) Node_13 = Node(13, [self.wall, 14, 9, self.wall]) Node_14 = Node(14, [self.wall, 15, 10, 13]) Node_15 = Node(15, [self.wall, 16, 11, 14]) Node_16 = Node(16, [self.wall, self.wall, 12, 15]) self.episode[0][0] = Node_1 self.episode[0][1] = Node_Start self.episode[0][S] = Node_3 self.episode[0][3] = Node_4 self.episode[1][0] = Node_5 self.episode[1][1] = Node_6 self.episode[1][S] = Node_7 self.episode[1][3] = Node_8 self.episode[S][0] = Node_9 self.episode[S][1] = Node_10 self.episode[S][S] = Node_11 self.episode[S][3] = Node_12 self.episode[3][0] = Node_13 self.episode[3][1] = Node_14 self.episode[3][S] = Node_15 self.episode[3][3] = Node_16 self.goal_labels = [int(input_list[0]), int(input_list[1])] self.forbidden_label = int(input_list[2]) self.wall_label = int(input_list[3]) x = 0 while x < LEVEL: y = 0 while y < LEVEL: current_episode = self.episode[x][y] if current_episode.title in self.goal_labels: current_episode.goal = 1 current_episode.move.append(4) current_episode.qValues.append(4) elif current_episode.title == self.forbidden_label: current_episode.forbidden = 1 current_episode.move.append(4) current_episode.qValues.append(4) elif current_episode.title == self.wall_label: current_episode.wall = 1 else: position = 0 while position < LEVEL: if current_episode.next[position] is not None: current_episode.move.append(Direction(position)), current_episode.qValues.insert( position, False) position += 1 y += 1 x += 1 def get_episode(self, name): for x in self.episode: for episode in x: if episode.title == name: # print(episode) return episode def print_best_actions(self): for row in self.episode: for episode in row: if episode.goal: best_action_str = 'Direction.goal' elif episode.forbidden: best_action_str = "Direction.forbid" elif episode.wall: best_action_str = 'Direction.wall-square' else: best_action_str = str(episode.find_best_move()) print(str(episode.title) + " " + best_action_str[10:]) def print_four_Q_value(self, index): episode = self.get_episode(index) print("up" + ' ' + str(round(episode.qValues[1], 2))) print("right" + ' ' + str(round(episode.qValues[2], 2))) print("down" + ' ' + str(round(episode.qValues[3], 2))) print("left" + ' ' + str(round(episode.qValues[0], 2))) def Q_learning(environment, print_best_actions, index): for iteration in range(ITERATION_MAX_NUM): current_episode = environment.get_episode(START_STATE) total_episode_reward = 0 for episode in range(ITERATION_MAX_NUM): if np.random.uniform(0, 1) < GREEDY_PROBABILITY_EPSILON: next_move = [] for score in current_episode.move: if score is not None: next_move.append(score) # print(score) next_move = random.choice(next_move) else: next_move = current_episode.find_best_move() next_episode = environment.get_episode(current_episode.next[next_move.value]) if next_episode.goal: reward = GOAL_SQUARE_REWARD elif next_episode.forbidden: reward = FORBIDDEN_SQUARE_REWARD else: reward = EACH_STEP_REWARD total_episode_reward += reward old_q = current_episode.qValues[next_move.value] new_q = old_q + LEARNING_RATE_ALPHA * (reward + DISCOUNT_RATE_GAMMA * next_episode.max_Q_value() - old_q) # print(new_q) current_episode.qValues[next_move.value] = new_q # print(current_episode.qValues[next_move.value]) if next_episode.goal: break elif next_episode.forbidden: break else: if next_episode.wall: break else: current_episode = next_episode def user_input(): try: input_list = [] input_str = input() input_list = input_str.split() except: print("The input should be like: 15 12 8 6 p") environment = create_env(input_list) if (len(input_list) == 5) and (input_list[-1] == 'p'): Q_learning(environment, 1, 0) environment.print_best_actions() elif (len(input_list) == 6) and (input_list[-2] == 'q'): Q_learning(environment, 0, int(input_list[5])) environment.print_four_Q_value(int(input_list[5])) user_input()