From e214a3cc7df51ddf033dbf592e5de207c515116d Mon Sep 17 00:00:00 2001
From: Pavel Lutskov
Date: Mon, 17 Dec 2018 21:49:14 +0100
Subject: [PATCH] removed redundancy from optimizers

---
 main.py | 106 +++++++++++++++++++++++++++-----------------------------
 1 file changed, 52 insertions(+), 54 deletions(-)

diff --git a/main.py b/main.py
index f2b2ea6..62eee2a 100644
--- a/main.py
+++ b/main.py
@@ -8,14 +8,21 @@ from time import time
 
 import numpy as np
 import matplotlib as mpl
-mpl.use('TkAgg')
+mpl.use('TkAgg')  # fixes my macOS bug
 import matplotlib.pyplot as plt
 
 
 P = 0.1
 ALPHA = 0.90
 
-EPSILON = 1e-8  # Convergence criterion
+EPSILON = 1e-12  # Convergence criterion
+A2 = np.array([  # Action index to action mapping
+    [-1,  0],  # Up
+    [ 1,  0],  # Down
+    [ 0, -1],  # Left
+    [ 0,  1],  # Right
+    [ 0,  0],  # Idle
+])
 
 # Global state
 MAZE = None  # Map of the environment
@@ -25,15 +32,7 @@ U_OF_X = None  # The allowed action space matrix representation
 PW_OF_X_U = None  # The probability distribution of disturbance
 G1_X = None  # The cost function vector representation (depends only on state)
 G2_X = None  # The second cost function vector representation
-F_X_U_W = None  # The state function
-
-A2 = np.array([
-    [-1, 0],
-    [1, 0],
-    [0, -1],
-    [0, 1],
-    [0, 0]
-])
+F_X_U_W = None  # The System Equation
 
 
 def h_matrix(j, g):
@@ -50,29 +49,30 @@ def _valid_target(target):
     )
 
 
-def init_global(maze_file):
+def init_global(maze_filename):
     global MAZE, SN, S_TO_IJ
     global U_OF_X, PW_OF_X_U, F_X_U_W, G1_X, G2_X
 
     # Basic maze structure initialization
     MAZE = np.genfromtxt(
-        maze_file,
+        maze_filename,
         dtype=str,
     )
     state_mask = (MAZE != '1')
+
     S_TO_IJ = np.indices(MAZE.shape).transpose(1, 2, 0)[state_mask]
     SN = len(S_TO_IJ)
     ij_to_s = np.zeros(MAZE.shape, dtype=np.int32)
     ij_to_s[state_mask] = np.arange(SN)
 
     # One step cost functions initialization
-    maze_cost = np.zeros(MAZE.shape)
+    maze_cost = np.zeros(MAZE.shape, dtype=np.float64)
     maze_cost[MAZE == '1'] = np.nan
     maze_cost[(MAZE == '0') | (MAZE == 'S')] = 0
     maze_cost[MAZE == 'T'] = 50
     maze_cost[MAZE == 'G'] = -1
     G1_X = maze_cost.copy()[state_mask]
-    maze_cost[maze_cost < 1] += 1  # assert np.nan < whatever == True
+    maze_cost[maze_cost < 1] += 1  # assert np.nan < whatever == False
     G2_X = maze_cost.copy()[state_mask]
 
     # Actual environment modelling
@@ -84,28 +84,28 @@ def init_global(maze_file):
         for iu, u in enumerate(A2):
             if _valid_target(x + u):
                 U_OF_X[ix, iu] = True
-                if iu in (0, 1):
-                    possible_iw = [2, 3]
-                elif iu in (2, 3):
-                    possible_iw = [0, 1]
+                if iu in (0, 1):  # (Up, Down)
+                    possible_iw = [2, 3]  # [Left, Right]
+                elif iu in (2, 3):  # (Left, Right)
+                    possible_iw = [0, 1]  # [Up, Down]
                 for iw in possible_iw:
                     if _valid_target(x + u + A2[iw]):
                         PW_OF_X_U[ix, iu, iw] = P
                         F_X_U_W[ix, iu, iw] = ij_to_s[tuple(x + u + A2[iw])]
-                # IDLE w is always possible
+                # Idle w is always possible
                 PW_OF_X_U[ix, iu, -1] = 1 - PW_OF_X_U[ix, iu].sum()
                 F_X_U_W[ix, iu, -1] = ij_to_s[tuple(x + u)]
 
 
 def plot_j_policy_on_maze(j, policy):
     heatmap = np.full(MAZE.shape, np.nan)
-    heatmap[S_TO_IJ[:,0], S_TO_IJ[:,1]] = j
+    heatmap[S_TO_IJ[:, 0], S_TO_IJ[:, 1]] = j
     cmap = mpl.cm.get_cmap('coolwarm')
     cmap.set_bad(color='black')
     plt.imshow(heatmap, cmap=cmap)
     plt.colorbar()
-    plt.quiver(S_TO_IJ[:,1], S_TO_IJ[:,0],
-               A2[policy, 1], -A2[policy, 0])
+    # quiver has some weird behavior: the arrow y component must be flipped
+    plt.quiver(S_TO_IJ[:, 1], S_TO_IJ[:, 0], A2[policy, 1], -A2[policy, 0])
     plt.gca().get_xaxis().set_visible(False)
     plt.gca().get_yaxis().set_visible(False)
 
@@ -113,7 +113,7 @@ def plot_j_policy_on_maze(j, policy):
 def plot_cost_history(hist):
     error = np.sqrt(np.square(hist[:-1] - hist[-1]).mean(axis=1))
     plt.xlabel('Number of iterations')
-    plt.ylabel('Cost function error')
+    plt.ylabel('Cost function RMSE')
     plt.plot(error)
 
 
@@ -127,7 +127,8 @@ def _evaluate_policy(policy, g):
     targs = F_X_U_W[np.arange(SN), policy]  # all f(x, u(x), w(x, u(x)))
     G = (pw_pi * g[targs]).sum(axis=1)  # Expected one-step cost vector
 
-    M = np.zeros((SN, SN))  # Markov matrix for given deterministic policy
+    # Markov matrix for given deterministic policy
+    M = np.zeros((SN, SN), dtype=np.float64)
     x_from = [x_ff for x_f, nz
               in zip(np.arange(SN), np.count_nonzero(pw_pi, axis=1))
               for x_ff in [x_f] * nz]
@@ -135,35 +136,31 @@ def _evaluate_policy(policy, g):
     return np.linalg.solve(np.eye(SN) - ALPHA*M, G)
 
 
-def value_iteration(g, return_history=False):
-    j = np.zeros(SN)
-    history = [j]
-    while True:
-        # print(j)
-        policy, j_new = _policy_improvement(j, g)
-        j_old = j
-        j = j_new
-        if return_history:
-            history.append(j)
-        if np.abs(j - j_old).max() < EPSILON:
-            break
-    if not return_history:
-        return j, policy
-    else:
-        return np.array(history)
+def value_iteration(j, g):
+    return _policy_improvement(j, g)
 
 
-def policy_iteration(g, return_history=False):
-    j = None
-    policy = np.full(SN, len(A2) - 1)  # starting policy is IDLE
+def policy_iteration(j, g):
+    policy, _ = _policy_improvement(j, g)
+    j = _evaluate_policy(policy, g)
+    return policy, j
+
+
+def _terminate(j, j_old):
+    # TODO: DIS
+    return np.abs(j - j_old).max() < EPSILON
+
+
+def dynamic_programming(optimizer_step, g, return_history=False):
+    j = np.zeros(SN, dtype=np.float64)
     history = []
     while True:
         j_old = j
-        j = _evaluate_policy(policy, g)
-        history.append(j)
-        if j_old is not None and np.abs(j - j_old).max() < EPSILON:
+        policy, j = optimizer_step(j, g)
+        if return_history:
+            history.append(j)
+        if _terminate(j, j_old):
             break
-        policy, _ = _policy_improvement(j, g)
     if not return_history:
         return j, policy
     else:
@@ -190,10 +187,10 @@ if __name__ == '__main__':
         plt.suptitle('DISCOUNT = ' + str(a))
         i = 1
         for opt in ['Value Iteration', 'Policy Iteration']:
-            for g in ['g1', 'g2']:
-                name = ' / '.join([opt, g])
+            for cost in ['g1', 'g2']:
+                name = ' / '.join([opt, cost])
                 ALPHA = a
-                j, policy = optimizers[opt](costs[g])
+                j, policy = dynamic_programming(optimizers[opt], costs[cost])
                 print(name, j)
                 plt.subplot(2, 2, i)
                 plt.gca().set_title(name)
@@ -205,11 +202,12 @@ if __name__ == '__main__':
         plt.figure()
         plt.suptitle(opt)
         i = 1
-        for g in ['g1', 'g2']:
+        for cost in ['g1', 'g2']:
             for a in [0.9, 0.8, 0.7]:
-                name = 'Cost: {}, discount: {}'.format(g, a)
+                name = 'Cost: {}, discount: {}'.format(cost, a)
                 ALPHA = a
-                history = optimizers[opt](costs[g], return_history=True)
+                history = dynamic_programming(optimizers[opt], costs[cost],
+                                              return_history=True)
                 plt.subplot(2, 3, i)
                 plt.gca().set_title(name)
                 plot_cost_history(history)
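
Usage sketch for the refactored interface (illustrative only, not part of the patch; 'maze.txt' is a placeholder file name). After this change, value_iteration() and policy_iteration() are single-step updates with the signature step(j, g) -> (policy, j_new), and the shared convergence loop lives in dynamic_programming(), so a typical run from main.py looks like:

    init_global('maze.txt')  # placeholder maze file, sets MAZE, G1_X, G2_X, ...
    j_star, policy = dynamic_programming(value_iteration, G1_X)
    history = dynamic_programming(policy_iteration, G2_X, return_history=True)
    plot_cost_history(history)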