From a88fcfc1f282958924fb6ae7cae0725a3b86e8b9 Mon Sep 17 00:00:00 2001
From: Pavel Lutskov
Date: Sun, 27 Jan 2019 14:22:12 +0100
Subject: [PATCH] FINALLY CLEAR GRAPHS AHAHAHAHA

---
 main.py      | 121 ++++++++++++++++++++++++++++++---------------------
 report.latex |  88 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 158 insertions(+), 51 deletions(-)

diff --git a/main.py b/main.py
index b96d226..f8c2838 100644
--- a/main.py
+++ b/main.py
@@ -10,10 +10,11 @@
 import numpy as np
 import matplotlib as mpl
 mpl.use('TkAgg')  # fixes my macOS bug
 import matplotlib.pyplot as plt
+import matplotlib.colors as colors
 
 P = 0.1  # Slip probability
-ALPHA = 0.90  # Discount factor
+ALPHA = 0.8  # Discount factor
 
 A2 = np.array([  # Action index to action mapping
     [-1, 0],  # Up
@@ -34,12 +35,6 @@ G2_X = None  # The second cost function vector representation
 F_X_U_W = None  # The System Equation
 
 
-def h_matrix(j, g):
-    result = (PW_OF_X_U * (g[F_X_U_W] + ALPHA*j[F_X_U_W])).sum(axis=2)
-    result[~U_OF_X] = np.inf  # discard invalid policies
-    return result
-
-
 def _valid_target(target):
     return (
         0 <= target[0] < MAZE.shape[0] and
@@ -72,8 +67,9 @@ def init_global(maze_filename):
     maze_cost[MAZE == 'T'] = 50
     maze_cost[MAZE == 'G'] = -1
     G1_X = maze_cost.copy()[state_mask]
-    maze_cost[(MAZE=='0') | (MAZE=='S') | (MAZE=='G')] += 1
-    G2_X = maze_cost.copy()[state_mask]
+    # maze_cost[(MAZE=='0') | (MAZE=='S') | (MAZE=='G')] += 1
+    # G2_X = maze_cost.copy()[state_mask]
+    G2_X = G1_X + 1
 
     # Actual environment modelling
     U_OF_X = np.zeros((SN, len(A2)), dtype=np.bool)
@@ -97,26 +93,10 @@ def init_global(maze_filename):
             F_X_U_W[ix, iu, -1] = ij_to_s[tuple(x + u)]
 
 
-def plot_j_policy_on_maze(j, policy):
-    heatmap = np.full(MAZE.shape, np.nan)
-    heatmap[S_TO_IJ[:, 0], S_TO_IJ[:, 1]] = j
-    cmap = mpl.cm.get_cmap('coolwarm')
-    cmap.set_bad(color='black')
-    plt.imshow(heatmap, cmap=cmap)
-    # plt.colorbar()
-    # quiver has some weird behavior, the arrow y component must be flipped
-    plt.quiver(S_TO_IJ[:, 1], S_TO_IJ[:, 0], A2[policy, 1], -A2[policy, 0])
-    plt.gca().get_xaxis().set_visible(False)
-    plt.tick_params(axis='y', which='both', left=False, labelleft=False)
-
-
-def plot_cost_history(hist):
-    error = np.log10(
-        np.sqrt(np.square(hist[:-1] - hist[-1]).mean(axis=1))
-    )
-    plt.xticks(np.arange(0, len(error), len(error) // 5))
-    plt.yticks(np.linspace(error.min(), error.max(), 5))
-    plt.plot(error)
+def h_matrix(j, g):
+    h_x_u = (PW_OF_X_U * (g[F_X_U_W] + ALPHA*j[F_X_U_W])).sum(axis=2)
+    h_x_u[~U_OF_X] = np.inf  # discard invalid policies
+    return h_x_u
 
 
 def _policy_improvement(j, g):
@@ -159,7 +139,7 @@ def _terminate_vi(j, j_old, policy, policy_old):
 
 def dynamic_programming(optimizer_step, g, terminator, return_history=False):
     j = np.zeros(SN, dtype=np.float64)
-    policy = np.full(SN, -1, dtype=np.int32)  # idle policy
+    policy = np.full(SN, len(A2) - 1, dtype=np.int32)  # idle policy
     history = []
     while True:
         j_old = j
@@ -181,6 +161,43 @@ def dynamic_programming(optimizer_step, g, terminator, return_history=False):
     return history
 
 
+def plot_j_policy_on_maze(j, policy, normalize=True):
+
+    heatmap = np.full(MAZE.shape, np.nan, dtype=np.float64)
+    if normalize:
+        # Non-linear, but a discrete representation of different costs
+        norm = colors.BoundaryNorm(boundaries=np.sort(j)[1:-1], ncolors=256)
+        vmin = 0
+        vmax = 256
+    else:
+        norm = lambda x: x
+        vmin = None
+        vmax = None
+
+    heatmap[S_TO_IJ[:, 0], S_TO_IJ[:, 1]] = norm(j)
+
+    cmap = mpl.cm.get_cmap('coolwarm')
+    cmap.set_bad(color='black')
+
+    plt.imshow(
+        heatmap, vmin=vmin, vmax=vmax, cmap=cmap,
+    )
+
+    # quiver has some weird behavior, the arrow y component must be flipped
+    plt.quiver(S_TO_IJ[:, 1], S_TO_IJ[:, 0], A2[policy, 1], -A2[policy, 0])
+    plt.gca().get_xaxis().set_visible(False)
+    plt.tick_params(axis='y', which='both', left=False, labelleft=False)
+
+
+def plot_cost_history(hist):
+    error = np.log10(
+        np.sqrt(np.square(hist[:-1] - hist[-1]).mean(axis=1))
+    )
+    plt.xticks(np.arange(0, len(error), len(error) // 5))
+    plt.yticks(np.linspace(error.min(), error.max(), 5))
+    plt.plot(error)
+
+
 if __name__ == '__main__':
     # Argument Parsing
     ap = ArgumentParser()
@@ -197,27 +214,31 @@
                   'Policy Iteration': policy_iteration}
     terminators = {'Value Iteration': _terminate_vi,
                    'Policy Iteration': _terminate_pi}
+    # cost_transform = {'g1': _neg_log_neg, 'g2': _gamma}
 
-    for a in [0.9, 0.5, 0.01]:
-        plt.figure(figsize=(9, 7))
-        plt.subplots_adjust(top=0.9, bottom=0.05, left=0.1, right=0.95,
-                            wspace=0.1)
-        plt.suptitle('DISCOUNT = ' + str(a))
-        i = 1
-        for opt in ['Value Iteration', 'Policy Iteration']:
-            for cost in ['g1', 'g2']:
-                name = '{} / {}'.format(opt, cost)
-                ALPHA = a
-                j, policy = dynamic_programming(optimizers[opt], costs[cost],
-                                                terminators[opt])
-                plt.subplot(2, 2, i)
-                plot_j_policy_on_maze(j, policy)
-                if i <= 2:
-                    plt.gca().set_title('Cost: {}'.format(cost),
-                                        fontsize='x-large')
-                if (i - 1) % 2 == 0:
-                    plt.ylabel(opt, fontsize='x-large')
-                i += 1
+    for normalize in [False, True]:
+        for a in [0.9, 0.5, 0.01]:
+            plt.figure(figsize=(9, 7))
+            plt.subplots_adjust(top=0.9, bottom=0.05, left=0.1, right=0.95,
+                                wspace=0.1)
+            plt.suptitle('DISCOUNT: {}'.format(a)
+                         + ('\nNormalized view' if normalize else ''))
+            i = 1
+            for opt in ['Value Iteration', 'Policy Iteration']:
+                for cost in ['g1', 'g2']:
+                    name = '{} / {}'.format(opt, cost)
+                    ALPHA = a
+                    j, policy = dynamic_programming(optimizers[opt],
+                                                    costs[cost],
+                                                    terminators[opt])
+                    plt.subplot(2, 2, i)
+                    plot_j_policy_on_maze(j, policy, normalize=normalize)
+                    if i <= 2:
+                        plt.gca().set_title('Cost: {}'.format(cost),
+                                            fontsize='x-large')
+                    if (i - 1) % 2 == 0:
+                        plt.ylabel(opt, fontsize='x-large')
+                    i += 1
 
     # Error graphs
     for opt in ['Value Iteration', 'Policy Iteration']:
diff --git a/report.latex b/report.latex
index 4433095..6d5a507 100644
--- a/report.latex
+++ b/report.latex
@@ -16,5 +16,91 @@
 \section{Environment modeling}
 
-Blya ya zamodeliroval environment.
+In my code the behavior of the maze is represented with the system equation
+model. First, I assign a numeric index to each valid (non-wall) state of the
+maze in row-major order; the possible actions are indexed in the same way.
+The space of possible disturbances in my implementation coincides with the
+space of actions, i.e.\ $\{up, down, left, right, idle\}$. With this
+indexing, the system equation and the system stochasticity can be represented
+as two 3-D matrices, $F_{xuw}$ and $P_{xuw}$. The $(x,u,w)$-th element of $F$
+gives the index of the state that results from taking action $u$ in state $x$
+under disturbance $w$. If action $u$ is impossible in state $x$, or if $w$
+cannot occur for the given pair $(x,u)$, the $(x,u,w)$-th entry is treated as
+invalid. This is handled by a supporting matrix $U_{xu}$, whose $(x,u)$-th
+element is a Boolean value indicating whether action $u$ is allowed in state
+$x$. Furthermore, the $(x,u,w)$-th element of $P$ gives the probability of
+disturbance $w$ when action $u$ is taken in state $x$; it is zero if the
+combination $(x,u,w)$ is impossible. These matrices are initialized once,
+before the dynamic programming algorithm begins.
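+
+With this representation a dynamic programming update reduces to a single
+vectorized expression over these matrices. Writing $J$ for the current
+estimate of the value function and $G$ for a one-step cost vector, the
+expected cost of choosing action $u$ in state $x$, denoted here $H_{xu}$, is
+\begin{equation}
+H_{xu} = \sum_{w} P_{xuw} \left( G_{F_{xuw}} + \alpha J_{F_{xuw}} \right),
+\end{equation}
+which is what the function \texttt{h\_matrix} in \texttt{main.py} evaluates;
+pairs $(x,u)$ that are not admissible according to $U_{xu}$ are assigned an
+infinite cost.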
+
+The advantage of this formulation is that the computations can be accelerated
+with the \textit{NumPy} library for matrix operations. The alternative
+formulation would be the Markovian state transition probabilities, which has
+several drawbacks. If the transition probabilities $p_{ij}(u)$ were stored as
+a 3-D matrix $P_{iju}$, its size would grow quadratically with the size of
+the state space, while the size of the matrices implementing the system
+equation grows only linearly. Moreover, such a matrix would be very sparse,
+i.e.\ only a few of its entries would be non-zero. A more space-efficient
+representation of the transition probabilities would therefore be needed,
+which in turn would prevent the use of a matrix library such as
+\textit{NumPy} to accelerate the computations.
+
+The one-step costs in my implementation depend only on the target state,
+meaning $g(x, u, w) = g(f(x, u, w))$, so the cost functions are represented
+as vectors $G_x^1$ and $G_x^2$, in which the goal state has a lower cost than
+all other states and the trap state incurs a high penalty. This differs
+slightly from the formulation in the task, where for $g_2$ only the
+\textit{self-loop} in the final state is free. The difference does not affect
+the resulting policy; it only has a noticeable influence on the value
+function of the states directly adjacent to the goal state. If the cost
+depended on the action taken to reach the goal state (i.e.\ self-loop vs.\
+transition from an adjacent state), it could not be stored as a vector, and a
+2-D matrix would have been needed instead, which would add unnecessary
+complexity to the code.
+
+A policy is implemented as a vector $\Pi_x$, whose $x$-th element contains
+the index of the action to be taken in state $x$.
+
+The convergence criteria differ for Value Iteration and Policy Iteration. The
+most sensible criterion for Policy Iteration is that the policy stops
+changing between iterations of the algorithm, i.e.\ $\pi_{k+1} = \pi_k$. For
+Value Iteration I use the common criterion
+$\|J_{k+1} - J_k\|_{\infty} < \epsilon$. The value of $\epsilon$ depends on
+the discount factor $\alpha$; the relation $\epsilon = \alpha^{|S|}$, where
+$|S|$ is the number of possible states, was empirically found to give good
+results.
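+
+For reference, a minimal sketch of these two checks is given below; it is
+written in the spirit of the \texttt{\_terminate\_pi} and
+\texttt{\_terminate\_vi} functions in \texttt{main.py}, whose exact bodies
+may differ in detail (\texttt{np} stands for \textit{NumPy}, \texttt{ALPHA}
+for the discount factor and \texttt{SN} for the number of states):
+\begin{verbatim}
+def _terminate_pi(j, j_old, policy, policy_old):
+    # PI has converged once the policy stops changing.
+    return np.array_equal(policy, policy_old)
+
+def _terminate_vi(j, j_old, policy, policy_old):
+    # VI stops when the sup-norm change of J falls below
+    # epsilon = ALPHA ** SN.
+    return np.max(np.abs(j - j_old)) < ALPHA ** SN
+\end{verbatim}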
+
+For visualization I use a non-linear scale for the value function: each
+distinct value in the value vector is assigned its own color, so that the
+differences remain clearly visible even for small values of $\alpha$. The
+unnormalized representation is also provided for reference.
+
+\section{Algorithm inspection}
+
+If the termination criterion for Value Iteration is chosen correctly,
+i.e.\ the algorithm only terminates once it has converged to an optimal
+policy, then both PI and VI result in the same policy. The cost $g_2$ is a
+constant shift of $g_1$ by $1$, except for the trap state. For this reason
+$g_1$ and $g_2$ produce the same result for most values of $\alpha$; however,
+there are values of $\alpha$ for which the two costs yield different policies
+in the proximity of the trap. In general, the behavior under the two costs
+may differ depending on $\alpha$: for large $\alpha$ the algorithms may favor
+risking the trap over going around it, whereas for smaller $\alpha$ the
+resulting policy plays it safe.
+
+Furthermore, for very small $\alpha$, e.g.\ $0.01$, machine precision starts
+to play a role. A double-precision floating point variable can store numbers
+over a huge range of magnitudes, but its precision is limited by the 52-bit
+fractional part, i.e.\ roughly 16 significant decimal digits. This is not an
+issue for the cost $g_1$, because the negative cost of the goal state is
+propagated through the maze as a number of ever-decreasing magnitude, the
+one-step costs elsewhere in the maze being $0$. For the cost $g_2$, however,
+the value function is dominated by the one-step cost of $1$ for the non-goal
+states, so the cost-free final state is propagated as an ever-decreasing
+additive term, and the distance over which it propagates is limited by the
+precision of the floating point representation: with $\alpha = 0.01$ the
+correction after $k$ steps is on the order of $\alpha^k$, which falls below
+the relative machine precision after only about eight steps. Hence, the
+algorithms may fail to converge to the optimal policy when $g_2$ is used with
+small values of $\alpha$.
+
 \end{document}