removed unnecessary redundancy from optimizers

2018-12-17 21:49:14 +01:00
parent f390d67b1b
commit e214a3cc7d
1 changed files with 52 additions and 54 deletions
--- a/main.py
+++ b/main.py
@@ -8,14 +8,21 @@ from time import time
 import numpy as np

 import matplotlib as mpl
-mpl.use('TkAgg')
+mpl.use('TkAgg')  # fixes my macOS bug

 import matplotlib.pyplot as plt


 P = 0.1
 ALPHA = 0.90
-EPSILON = 1e-8  # Convergence criterium
+EPSILON = 1e-12  # Convergence criterion
+A2 = np.array([  # Action index to action mapping
+    [-1,  0],  # Up
+    [ 1,  0],  # Down
+    [ 0, -1],  # Left
+    [ 0,  1],  # Right
+    [ 0,  0],  # Idle
+])

 # Global state
 MAZE = None  # Map of the environment
@@ -25,15 +32,7 @@ U_OF_X = None  # The allowed action space matrix representation
 PW_OF_X_U = None  # The probability distribution of disturbance
 G1_X = None  # The cost function vector representation (depends only on state)
 G2_X = None  # The second cost function vector representation
-F_X_U_W = None  # The state function
-
-A2 = np.array([
-    [-1, 0],
-    [1, 0],
-    [0, -1],
-    [0, 1],
-    [0, 0]
-])
+F_X_U_W = None  # The System Equation


 def h_matrix(j, g):
@@ -50,29 +49,30 @@ def _valid_target(target):
    )


-def init_global(maze_file):
+def init_global(maze_filename):
    global MAZE, SN, S_TO_IJ
    global U_OF_X, PW_OF_X_U, F_X_U_W, G1_X, G2_X

    # Basic maze structure initialization
    MAZE = np.genfromtxt(
-        maze_file,
+        maze_filename,
        dtype=str,
    )
    state_mask = (MAZE != '1')
+
    S_TO_IJ = np.indices(MAZE.shape).transpose(1, 2, 0)[state_mask]
    SN = len(S_TO_IJ)
    ij_to_s = np.zeros(MAZE.shape, dtype=np.int32)
    ij_to_s[state_mask] = np.arange(SN)

    # One step cost functions initialization
-    maze_cost = np.zeros(MAZE.shape)
+    maze_cost = np.zeros(MAZE.shape, dtype=np.float64)
    maze_cost[MAZE == '1'] = np.nan
    maze_cost[(MAZE == '0') | (MAZE == 'S')] = 0
    maze_cost[MAZE == 'T'] = 50
    maze_cost[MAZE == 'G'] = -1
    G1_X = maze_cost.copy()[state_mask]
-    maze_cost[maze_cost < 1] += 1  # assert np.nan < whatever == True
+    maze_cost[maze_cost < 1] += 1  # NaN < 1 is False, so wall cells stay NaN
    G2_X = maze_cost.copy()[state_mask]

    # Actual environment modelling
@@ -84,28 +84,28 @@ def init_global(maze_file):
        for iu, u in enumerate(A2):
            if _valid_target(x + u):
                U_OF_X[ix, iu] = True
-                if iu in (0, 1):
-                    possible_iw = [2, 3]
-                elif iu in (2, 3):
-                    possible_iw = [0, 1]
+                if iu in (0, 1):  # (Up, Down)
+                    possible_iw = [2, 3]  # [Left, Right]
+                elif iu in (2, 3):  # (Left, Right)
+                    possible_iw = [0, 1]  # [Up, Down]
                for iw in possible_iw:
                    if _valid_target(x + u + A2[iw]):
                        PW_OF_X_U[ix, iu, iw] = P
                        F_X_U_W[ix, iu, iw] = ij_to_s[tuple(x + u + A2[iw])]
-                # IDLE w is always possible
+                # Idle w is always possible
                PW_OF_X_U[ix, iu, -1] = 1 - PW_OF_X_U[ix, iu].sum()
                F_X_U_W[ix, iu, -1] = ij_to_s[tuple(x + u)]


 def plot_j_policy_on_maze(j, policy):
    heatmap = np.full(MAZE.shape, np.nan)
-    heatmap[S_TO_IJ[:,0], S_TO_IJ[:,1]] = j
+    heatmap[S_TO_IJ[:, 0], S_TO_IJ[:, 1]] = j
    cmap = mpl.cm.get_cmap('coolwarm')
    cmap.set_bad(color='black')
    plt.imshow(heatmap, cmap=cmap)
    plt.colorbar()
-    plt.quiver(S_TO_IJ[:,1], S_TO_IJ[:,0],
-               A2[policy, 1], -A2[policy, 0])
+    # imshow rows grow downward, quiver's +v points up: negate the row step
+    plt.quiver(S_TO_IJ[:, 1], S_TO_IJ[:, 0], A2[policy, 1], -A2[policy, 0])
    plt.gca().get_xaxis().set_visible(False)
    plt.gca().get_yaxis().set_visible(False)

@@ -113,7 +113,7 @@ def plot_j_policy_on_maze(j, policy):
 def plot_cost_history(hist):
    error = np.sqrt(np.square(hist[:-1] - hist[-1]).mean(axis=1))
    plt.xlabel('Number of iterations')
-    plt.ylabel('Cost function error')
+    plt.ylabel('Cost function RMSE')
    plt.plot(error)


@@ -127,7 +127,8 @@ def _evaluate_policy(policy, g):
    targs = F_X_U_W[np.arange(SN), policy]  # all f(x, u(x), w(x, u(x)))
    G = (pw_pi * g[targs]).sum(axis=1)  # Expected one-step cost vector

-    M = np.zeros((SN, SN))  # Markov matrix for given deterministic policy
+    # Markov matrix for given deterministic policy
+    M = np.zeros((SN, SN), dtype=np.float64)
    x_from = [x_ff for x_f, nz in
              zip(np.arange(SN), np.count_nonzero(pw_pi, axis=1))
              for x_ff in [x_f] * nz]
@@ -135,35 +136,31 @@ def _evaluate_policy(policy, g):
    return np.linalg.solve(np.eye(SN) - ALPHA*M, G)


-def value_iteration(g, return_history=False):
-    j = np.zeros(SN)
-    history = [j]
-    while True:
-        # print(j)
-        policy, j_new = _policy_improvement(j, g)
-        j_old = j
-        j = j_new
-        if return_history:
-            history.append(j)
-        if np.abs(j - j_old).max() < EPSILON:
-            break
-    if not return_history:
-        return j, policy
-    else:
-        return np.array(history)
+def value_iteration(j, g):
+    return _policy_improvement(j, g)


-def policy_iteration(g, return_history=False):
-    j = None
-    policy = np.full(SN, len(A2) - 1)  # starting policy is IDLE
+def policy_iteration(j, g):
+    policy, _ = _policy_improvement(j, g)
+    j = _evaluate_policy(policy, g)
+    return policy, j
+
+
+def _terminate(j, j_old):
+    # TODO: DIS
+    return np.abs(j - j_old).max() < EPSILON
+
+
+def dynamic_programming(optimizer_step, g, return_history=False):
+    j = np.zeros(SN, dtype=np.float64)
    history = []
    while True:
        j_old = j
-        j = _evaluate_policy(policy, g)
+        policy, j = optimizer_step(j, g)
+        if return_history:
            history.append(j)
-        if j_old is not None and np.abs(j - j_old).max() < EPSILON:
+        if _terminate(j, j_old):
            break
-        policy, _ = _policy_improvement(j, g)
    if not return_history:
        return j, policy
    else:
@@ -190,10 +187,10 @@ if __name__ == '__main__':
        plt.suptitle('DISCOUNT = ' + str(a))
        i = 1
        for opt in ['Value Iteration', 'Policy Iteration']:
-            for g in ['g1', 'g2']:
-                name = ' / '.join([opt, g])
+            for cost in ['g1', 'g2']:
+                name = ' / '.join([opt, cost])
                ALPHA = a
-                j, policy = optimizers[opt](costs[g])
+                j, policy = dynamic_programming(optimizers[opt], costs[cost])
                print(name, j)
                plt.subplot(2, 2, i)
                plt.gca().set_title(name)
@@ -205,11 +202,12 @@ if __name__ == '__main__':
        plt.figure()
        plt.suptitle(opt)
        i = 1
-        for g in ['g1', 'g2']:
+        for cost in ['g1', 'g2']:
            for a in [0.9, 0.8, 0.7]:
-                name = 'Cost: {}, discount: {}'.format(g, a)
+                name = 'Cost: {}, discount: {}'.format(cost, a)
                ALPHA = a
-                history = optimizers[opt](costs[g], return_history=True)
+                history = dynamic_programming(optimizers[opt], costs[cost],
+                                              return_history=True)
                plt.subplot(2, 3, i)
                plt.gca().set_title(name)
                plot_cost_history(history)