diff --git a/main.py b/main.py
index 62eee2a..704e174 100644
--- a/main.py
+++ b/main.py
@@ -15,7 +15,8 @@ import matplotlib.pyplot as plt
 
 P = 0.1
 ALPHA = 0.90
-EPSILON = 1e-12  # Convergence criterium
+EPSILON = 1e-12
+# EPSILON = 1e-12  # Convergence criterium
 A2 = np.array([  # Action index to action mapping
     [-1, 0],  # Up
     [ 1, 0],  # Down
@@ -56,7 +57,7 @@ def init_global(maze_filename):
     # Basic maze structure initialization
     MAZE = np.genfromtxt(
         maze_filename,
-        dtype=str,
+        dtype='|S1',
     )
     state_mask = (MAZE != '1')
 
@@ -72,7 +73,7 @@ def init_global(maze_filename):
    maze_cost[MAZE == 'T'] = 50
    maze_cost[MAZE == 'G'] = -1
    G1_X = maze_cost.copy()[state_mask]
-    maze_cost[maze_cost < 1] += 1  # assert np.nan < whatever == False
+    maze_cost[(MAZE=='0') | (MAZE=='S') | (MAZE=='G')] += 1
     G2_X = maze_cost.copy()[state_mask]
 
     # Actual environment modelling
@@ -146,20 +147,23 @@ def policy_iteration(j, g):
     return policy, j
 
 
-def _terminate(j, j_old):
-    # TODO: DIS
-    return np.abs(j - j_old).max() < EPSILON
+def _terminate(j, j_old, policy, policy_old):
+    # eps = EPSILON
+    # return np.abs(j - j_old).max() < eps
+    return np.all(policy == policy_old)
 
 
 def dynamic_programming(optimizer_step, g, return_history=False):
     j = np.zeros(SN, dtype=np.float64)
+    policy = None
     history = []
     while True:
         j_old = j
+        policy_old = policy
         policy, j = optimizer_step(j, g)
         if return_history:
             history.append(j)
-        if _terminate(j, j_old):
+        if _terminate(j, j_old, policy, policy_old):
             break
     if not return_history:
         return j, policy
@@ -191,7 +195,9 @@ if __name__ == '__main__':
         name = ' / '.join([opt, cost])
         ALPHA = a
         j, policy = dynamic_programming(optimizers[opt], costs[cost])
-        print(name, j)
+        print(name)
+        print(j)
+        # print(name, j)
         plt.subplot(2, 2, i)
         plt.gca().set_title(name)
         plot_j_policy_on_maze(j, policy)
@@ -200,6 +206,7 @@ if __name__ == '__main__':
     # Error graphs
     for opt in ['Value Iteration', 'Policy Iteration']:
         plt.figure()
+        plt.subplots_adjust(wspace=0.45, hspace=0.45)
         plt.suptitle(opt)
         i = 1
         for cost in ['g1', 'g2']:
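
For reference, the rewritten _terminate switches the stopping rule from value-function convergence (max |j - j_old| < EPSILON) to policy stability: the dynamic-programming loop stops as soon as two consecutive sweeps return the same policy. A minimal standalone sketch of that criterion is below; the helper name policy_stable and the example arrays are illustrative only and are not part of main.py.

    import numpy as np

    def policy_stable(policy, policy_old):
        # First sweep: there is no previous policy yet, so keep iterating.
        if policy_old is None:
            return False
        # Stop once the chosen action is identical in every state.
        return bool(np.all(policy == policy_old))

    # Illustrative usage with small integer action arrays.
    print(policy_stable(np.array([0, 1, 2]), None))                  # False
    print(policy_stable(np.array([0, 1, 2]), np.array([0, 1, 2])))   # True

In the patched dynamic_programming, policy starts as None, so the first _terminate call compares an array against None elementwise; np.all over that result is False, which guarantees at least two sweeps before the loop can stop.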