(Apr-23-2022, 10:32 PM)deanhystad Wrote: I still think you're building the board incorrectly. You should not be using insert() to set moves or values.
You should describe how your Q_learning function is supposed to work.
Can you elaborate on using insert() to set moves or values? Also, why do you think the Q_learning function is not working? Thanks
Here is an update to the code.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 |
import random import numpy as np import enum EACH_STEP_REWARD = - 0.1 GOAL_SQUARE_REWARD = 100 FORBIDDEN_SQUARE_REWARD = - 100 DISCOUNT_RATE_GAMMA = 0.1 # Discount Rate LEARNING_RATE_ALPHA = 0.3 # Learning Rate GREEDY_PROBABILITY_EPSILON = 0.5 # Greedy Probability ITERATION_MAX_NUM = 10000 # Will be 10,000 START_STATE = 2 LEVEL = 4 HEIGHT = 4 WEIGHT = 4 class Direction(enum.Enum): up = 0 right = 1 down = 2 left = 3 class Node: def __init__( self , title, next , Goal = False , Forbidden = False , Wall = False , qValues = None , actions = None ): self .title = title self . next = next self .qValues = [qValues] * 4 self .move = [actions] * 4 self .goal = Goal self .forbidden = Forbidden self .wall = Wall def max_Q_value( self ): if self .wall: return False max_q = [] for q in self .qValues: if q is not None : max_q.append(q) return max (max_q) def find_best_move( self ): max_q = self .max_Q_value() q_index = self .qValues.index(max_q) return Direction(q_index) class create_env: def __init__( self , input_list, wall = None ): self .wall = wall self .episode = [[ 13 , 14 , 15 , 16 ], [ 9 , 10 , 11 , 12 ], [ 5 , 6 , 7 , 8 ], [ 1 , 2 , 3 , 4 ]] S = 2 Node_1 = Node( 1 , [ 5 , S, self .wall, self .wall]) Node_Start = Node(S, [ 6 , 3 , self .wall, 1 ]) Node_3 = Node( 3 , [S, 7 , 4 , self .wall]) Node_4 = Node( 4 , [ 8 , self .wall, self .wall, 3 ]) Node_5 = Node( 5 , [ 9 , 6 , 1 , self .wall]) Node_6 = Node( 6 , [ 10 , 7 , S, 5 ]) Node_7 = Node( 7 , [ 11 , 8 , 3 , 6 ]) Node_8 = Node( 8 , [ 12 , self .wall, 4 , 7 ]) Node_9 = Node( 9 , [ 13 , 10 , 5 , self .wall]) Node_10 = Node( 10 , [ 14 , 11 , 6 , 9 ]) Node_11 = Node( 11 , [ 15 , 12 , 7 , 10 ]) Node_12 = Node( 12 , [ 16 , self .wall, 8 , 11 ]) Node_13 = Node( 13 , [ self .wall, 14 , 9 , self .wall]) Node_14 = Node( 14 , [ self .wall, 15 , 10 , 13 ]) Node_15 = Node( 15 , [ self .wall, 16 , 11 , 14 ]) Node_16 = Node( 16 , [ self .wall, self .wall, 12 , 15 ]) self .episode[ 0 ][ 0 ] = Node_1 self .episode[ 0 ][ 1 ] = 
Node_Start self .episode[ 0 ][S] = Node_3 self .episode[ 0 ][ 3 ] = Node_4 self .episode[ 1 ][ 0 ] = Node_5 self .episode[ 1 ][ 1 ] = Node_6 self .episode[ 1 ][S] = Node_7 self .episode[ 1 ][ 3 ] = Node_8 self .episode[S][ 0 ] = Node_9 self .episode[S][ 1 ] = Node_10 self .episode[S][S] = Node_11 self .episode[S][ 3 ] = Node_12 self .episode[ 3 ][ 0 ] = Node_13 self .episode[ 3 ][ 1 ] = Node_14 self .episode[ 3 ][S] = Node_15 self .episode[ 3 ][ 3 ] = Node_16 self .goal_labels = [ int (input_list[ 0 ]), int (input_list[ 1 ])] self .forbidden_label = int (input_list[ 2 ]) self .wall_label = int (input_list[ 3 ]) x = 0 while x < LEVEL: y = 0 while y < LEVEL: current_episode = self .episode[x][y] if current_episode.title in self .goal_labels: current_episode.goal = 1 current_episode.move.append( 4 ) current_episode.qValues.append( 4 ) elif current_episode.title = = self .forbidden_label: current_episode.forbidden = 1 current_episode.move.append( 4 ) current_episode.qValues.append( 4 ) elif current_episode.title = = self .wall_label: current_episode.wall = 1 else : position = 0 while position < LEVEL: if current_episode. 
next [position] is not None : current_episode.move.append(Direction(position)), current_episode.qValues.insert( position, False ) position + = 1 y + = 1 x + = 1 def get_episode( self , name): for x in self .episode: for episode in x: if episode.title = = name: # print(episode) return episode def print_best_actions( self ): for row in self .episode: for episode in row: if episode.goal: best_action_str = 'Direction.goal' elif episode.forbidden: best_action_str = "Direction.forbid" elif episode.wall: best_action_str = 'Direction.wall-square' else : best_action_str = str (episode.find_best_move()) print ( str (episode.title) + " " + best_action_str[ 10 :]) def print_four_Q_value( self , index): episode = self .get_episode(index) print ( "up" + ' ' + str ( round (episode.qValues[ 1 ], 2 ))) print ( "right" + ' ' + str ( round (episode.qValues[ 2 ], 2 ))) print ( "down" + ' ' + str ( round (episode.qValues[ 3 ], 2 ))) print ( "left" + ' ' + str ( round (episode.qValues[ 0 ], 2 ))) def Q_learning(environment, print_best_actions, index): for iteration in range (ITERATION_MAX_NUM): current_episode = environment.get_episode(START_STATE) total_episode_reward = 0 for episode in range (ITERATION_MAX_NUM): if np.random.uniform( 0 , 1 ) < GREEDY_PROBABILITY_EPSILON: next_move = [] for score in current_episode.move: if score is not None : next_move.append(score) # print(score) next_move = random.choice(next_move) else : next_move = current_episode.find_best_move() next_episode = environment.get_episode(current_episode. 
next [next_move.value]) if next_episode.goal: reward = GOAL_SQUARE_REWARD elif next_episode.forbidden: reward = FORBIDDEN_SQUARE_REWARD else : reward = EACH_STEP_REWARD total_episode_reward + = reward old_q = current_episode.qValues[next_move.value] new_q = old_q + LEARNING_RATE_ALPHA * (reward + DISCOUNT_RATE_GAMMA * next_episode.max_Q_value() - old_q) # print(new_q) current_episode.qValues[next_move.value] = new_q # print(current_episode.qValues[next_move.value]) if next_episode.goal: break elif next_episode.forbidden: break else : if next_episode.wall: break else : current_episode = next_episode def user_input(): try : input_list = [] input_str = input () input_list = input_str.split() except : print ( "The input should be like: 15 12 8 6 p" ) environment = create_env(input_list) if ( len (input_list) = = 5 ) and (input_list[ - 1 ] = = 'p' ): Q_learning(environment, 1 , 0 ) environment.print_best_actions() elif ( len (input_list) = = 6 ) and (input_list[ - 2 ] = = 'q' ): Q_learning(environment, 0 , int (input_list[ 5 ])) environment.print_four_Q_value( int (input_list[ 5 ])) user_input() |