FINALLY CLEAR GRAPHS AHAHAHAHA

2019-01-27 14:22:12 +01:00
parent 0be126e57b
commit a88fcfc1f2
2 changed files with 158 additions and 51 deletions

main.py

@@ -10,10 +10,11 @@ import numpy as np
import matplotlib as mpl
mpl.use('TkAgg') # fixes my macOS bug
import matplotlib.pyplot as plt
+import matplotlib.colors as colors
P = 0.1 # Slip probability
-ALPHA = 0.90 # Discount factor
+ALPHA = 0.8 # Discount factor
A2 = np.array([ # Action index to action mapping
    [-1, 0], # Up
@@ -34,12 +35,6 @@ G2_X = None # The second cost function vector representation
F_X_U_W = None # The System Equation
-def h_matrix(j, g):
-    result = (PW_OF_X_U * (g[F_X_U_W] + ALPHA*j[F_X_U_W])).sum(axis=2)
-    result[~U_OF_X] = np.inf # discard invalid policies
-    return result
def _valid_target(target):
    return (
        0 <= target[0] < MAZE.shape[0] and
@@ -72,8 +67,9 @@ def init_global(maze_filename):
    maze_cost[MAZE == 'T'] = 50
    maze_cost[MAZE == 'G'] = -1
    G1_X = maze_cost.copy()[state_mask]
-    maze_cost[(MAZE=='0') | (MAZE=='S') | (MAZE=='G')] += 1
-    G2_X = maze_cost.copy()[state_mask]
+    # maze_cost[(MAZE=='0') | (MAZE=='S') | (MAZE=='G')] += 1
+    # G2_X = maze_cost.copy()[state_mask]
+    G2_X = G1_X + 1
    # Actual environment modelling
    U_OF_X = np.zeros((SN, len(A2)), dtype=np.bool)
@@ -97,26 +93,10 @@ def init_global(maze_filename):
            F_X_U_W[ix, iu, -1] = ij_to_s[tuple(x + u)]
-def plot_j_policy_on_maze(j, policy):
-    heatmap = np.full(MAZE.shape, np.nan)
-    heatmap[S_TO_IJ[:, 0], S_TO_IJ[:, 1]] = j
-    cmap = mpl.cm.get_cmap('coolwarm')
-    cmap.set_bad(color='black')
-    plt.imshow(heatmap, cmap=cmap)
-    # plt.colorbar()
-    # quiver has some weird behavior, the arrow y component must be flipped
-    plt.quiver(S_TO_IJ[:, 1], S_TO_IJ[:, 0], A2[policy, 1], -A2[policy, 0])
-    plt.gca().get_xaxis().set_visible(False)
-    plt.tick_params(axis='y', which='both', left=False, labelleft=False)
-def plot_cost_history(hist):
-    error = np.log10(
-        np.sqrt(np.square(hist[:-1] - hist[-1]).mean(axis=1))
-    )
-    plt.xticks(np.arange(0, len(error), len(error) // 5))
-    plt.yticks(np.linspace(error.min(), error.max(), 5))
-    plt.plot(error)
+def h_matrix(j, g):
+    h_x_u = (PW_OF_X_U * (g[F_X_U_W] + ALPHA*j[F_X_U_W])).sum(axis=2)
+    h_x_u[~U_OF_X] = np.inf # discard invalid policies
+    return h_x_u
def _policy_improvement(j, g):
@@ -159,7 +139,7 @@ def _terminate_vi(j, j_old, policy, policy_old):
def dynamic_programming(optimizer_step, g, terminator, return_history=False):
    j = np.zeros(SN, dtype=np.float64)
-    policy = np.full(SN, -1, dtype=np.int32) # idle policy
+    policy = np.full(SN, len(A2) - 1, dtype=np.int32) # idle policy
    history = []
    while True:
        j_old = j
@@ -181,6 +161,43 @@ def dynamic_programming(optimizer_step, g, terminator, return_history=False):
    return history
+def plot_j_policy_on_maze(j, policy, normalize=True):
+    heatmap = np.full(MAZE.shape, np.nan, dtype=np.float64)
+    if normalize:
+        # Non-linear, but a discrete representation of different costs
+        norm = colors.BoundaryNorm(boundaries=np.sort(j)[1:-1], ncolors=256)
+        vmin = 0
+        vmax = 256
+    else:
+        norm = lambda x: x
+        vmin = None
+        vmax = None
+    heatmap[S_TO_IJ[:, 0], S_TO_IJ[:, 1]] = norm(j)
+    cmap = mpl.cm.get_cmap('coolwarm')
+    cmap.set_bad(color='black')
+    plt.imshow(
+        heatmap, vmin=vmin, vmax=vmax, cmap=cmap,
+    )
+    # quiver has some weird behavior, the arrow y component must be flipped
+    plt.quiver(S_TO_IJ[:, 1], S_TO_IJ[:, 0], A2[policy, 1], -A2[policy, 0])
+    plt.gca().get_xaxis().set_visible(False)
+    plt.tick_params(axis='y', which='both', left=False, labelleft=False)
+def plot_cost_history(hist):
+    error = np.log10(
+        np.sqrt(np.square(hist[:-1] - hist[-1]).mean(axis=1))
+    )
+    plt.xticks(np.arange(0, len(error), len(error) // 5))
+    plt.yticks(np.linspace(error.min(), error.max(), 5))
+    plt.plot(error)
if __name__ == '__main__':
    # Argument Parsing
    ap = ArgumentParser()
@@ -197,27 +214,31 @@ if __name__ == '__main__':
                  'Policy Iteration': policy_iteration}
    terminators = {'Value Iteration': _terminate_vi,
                   'Policy Iteration': _terminate_pi}
-    for a in [0.9, 0.5, 0.01]:
-        plt.figure(figsize=(9, 7))
-        plt.subplots_adjust(top=0.9, bottom=0.05, left=0.1, right=0.95,
-                            wspace=0.1)
-        plt.suptitle('DISCOUNT = ' + str(a))
-        i = 1
-        for opt in ['Value Iteration', 'Policy Iteration']:
-            for cost in ['g1', 'g2']:
-                name = '{} / {}'.format(opt, cost)
-                ALPHA = a
-                j, policy = dynamic_programming(optimizers[opt], costs[cost],
-                                                terminators[opt])
-                plt.subplot(2, 2, i)
-                plot_j_policy_on_maze(j, policy)
-                if i <= 2:
-                    plt.gca().set_title('Cost: {}'.format(cost),
-                                        fontsize='x-large')
-                if (i - 1) % 2 == 0:
-                    plt.ylabel(opt, fontsize='x-large')
-                i += 1
+    # cost_transform = {'g1': _neg_log_neg, 'g2': _gamma}
+    for normalize in [False, True]:
+        for a in [0.9, 0.5, 0.01]:
+            plt.figure(figsize=(9, 7))
+            plt.subplots_adjust(top=0.9, bottom=0.05, left=0.1, right=0.95,
+                                wspace=0.1)
+            plt.suptitle('DISCOUNT: {}'.format(a) +
+                         ('\nNormalized view' if normalize else ''))
+            i = 1
+            for opt in ['Value Iteration', 'Policy Iteration']:
+                for cost in ['g1', 'g2']:
+                    name = '{} / {}'.format(opt, cost)
+                    ALPHA = a
+                    j, policy = dynamic_programming(optimizers[opt],
+                                                    costs[cost],
+                                                    terminators[opt])
+                    plt.subplot(2, 2, i)
+                    plot_j_policy_on_maze(j, policy, normalize=normalize)
+                    if i <= 2:
+                        plt.gca().set_title('Cost: {}'.format(cost),
+                                            fontsize='x-large')
+                    if (i - 1) % 2 == 0:
+                        plt.ylabel(opt, fontsize='x-large')
+                    i += 1
    # Error graphs
    for opt in ['Value Iteration', 'Policy Iteration']:


@@ -16,5 +16,91 @@
\section{Environment modeling}
-Damn, I modeled the environment.

In my code the behavior of the maze is represented with the system-equation
model. First, each valid (non-wall) state of the maze is assigned a numeric
index in row-major order; the possible actions are indexed in the same way.
The space of possible disturbances in my implementation coincides with the
space of actions, i.e.\ $\{\mbox{up}, \mbox{down}, \mbox{left}, \mbox{right},
\mbox{idle}\}$. With this indexing, the system equation and the system
stochasticity can be represented as two 3-D matrices $F_{xuw}$ and $P_{xuw}$.
The $(x,u,w)$-th element of $F$ gives the index of the state that results from
taking action $u$ in state $x$ under disturbance $w$. If action $u$ is
impossible in state $x$, or if $w$ cannot occur for the given $(x,u)$, the
$(x,u,w)$-th entry is treated as invalid. This is handled by a supporting
matrix $U_{xu}$, whose $(x,u)$-th element is a Boolean value indicating
whether action $u$ is allowed in state $x$. The element $(x,u,w)$ of $P$ gives
the probability of disturbance $w$ occurring when action $u$ is taken in state
$x$, and is zero if such a configuration of $(x,u,w)$ is impossible. These
matrices are initialized once, before the dynamic programming algorithm is
executed.
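
To make the representation concrete, the following is a minimal,
self-contained sketch of the same machinery on an invented three-state chain
(the toy numbers and the fixed number of sweeps are illustrative only; the
names \texttt{F\_X\_U\_W}, \texttt{PW\_OF\_X\_U}, \texttt{U\_OF\_X} and
\texttt{h\_matrix} mirror the ones in \texttt{main.py}):
\begin{verbatim}
import numpy as np

ALPHA = 0.8
# Toy problem: 3 states, 2 actions (stay, right); disturbance space = actions.
F_X_U_W = np.array([[[0, 0], [0, 1]],    # target states from state 0
                    [[1, 1], [1, 2]],    # target states from state 1
                    [[2, 2], [2, 2]]])   # state 2 is absorbing
PW_OF_X_U = np.array([[[1.0, 0.0], [0.1, 0.9]],   # P(w | x, u)
                      [[1.0, 0.0], [0.1, 0.9]],
                      [[1.0, 0.0], [1.0, 0.0]]])
U_OF_X = np.array([[True, True],
                   [True, True],
                   [True, False]])       # 'right' is not allowed in state 2
g = np.array([1.0, 1.0, 0.0])            # target-state cost; state 2 is free

def h_matrix(j, g):
    # Expected cost per (state, action): sum over disturbances w of
    # P(w | x, u) * (g(f(x, u, w)) + ALPHA * J(f(x, u, w))).
    h_x_u = (PW_OF_X_U * (g[F_X_U_W] + ALPHA*j[F_X_U_W])).sum(axis=2)
    h_x_u[~U_OF_X] = np.inf  # discard invalid actions
    return h_x_u

j = np.zeros(3)
for _ in range(50):                      # a few value-iteration sweeps
    j = h_matrix(j, g).min(axis=1)
print(j, h_matrix(j, g).argmin(axis=1))  # cost-to-go and greedy policy
\end{verbatim}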

The advantage of this formulation is that the computations can be accelerated
by using the \textit{NumPy} library for matrix operations. The alternative
formulation uses the Markovian state transition probabilities, which has a
number of drawbacks. If the transition probabilities $p_{ij}(u)$ were stored
as a 3-D matrix $P_{iju}$, the size of the matrix would grow quadratically
with the size of the state space, whereas the size of the matrices
implementing the system equation grows only linearly. Furthermore, this matrix
would be very sparse, i.e.\ only a few of its entries would be non-zero, so a
more space-efficient representation of the transition probabilities would be
needed, which in turn would prevent the direct use of a matrix library such as
\textit{NumPy} to accelerate the computations.
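
For reference, the two formulations are related by summing the disturbance
probabilities over all disturbances that lead to the same target state,
\[
p_{ij}(u) \;=\; \sum_{w \,:\, F_{iuw} = j} P_{iuw},
\]
so the transition matrix can always be recovered from $F$ and $P$, but storing
it directly would require $|S|^2\,|U|$ entries instead of $2\,|S|\,|U|\,|W|$.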

The one-step costs in my implementation depend only on the target state,
i.e.\ $g(x, u, w) = \tilde{g}(f(x, u, w))$, so the cost functions are
represented as vectors $G_x^1$ and $G_x^2$, where the goal state has a lower
cost than the rest of the states and the trap state incurs a high penalty.
This formulation differs slightly from the one in the task, where for $g_2$
only the \textit{self-loop} in the final state is free. However, this
difference does not affect the resulting policy and only has a significant
influence on the value function of the states directly adjacent to the goal
state. If the cost did depend on the action taken to enter the goal state
(i.e.\ self-loop vs.\ transition from an adjacent state), the cost could not
have been stored as a vector; a 2-D matrix would have been needed instead,
which would have introduced unnecessary complexity into the code.
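
Concretely, with the values assigned in \texttt{init\_global} in this commit
(\texttt{G2\_X = G1\_X + 1}), the cost vectors are
\[
G_x^1 = \left\{
\begin{array}{ll}
-1 & \mbox{if $x$ is the goal state,}\\
50 & \mbox{if $x$ is the trap state,}\\
0  & \mbox{otherwise,}
\end{array}
\right.
\qquad
G_x^2 = G_x^1 + 1.
\]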

A policy is implemented as a vector $\Pi_x$, where the $x$-th element of the
vector contains the index of the action that will be taken in state $x$.
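
For example, given the matrix $H_{xu}$ returned by \texttt{h\_matrix},
extracting the greedy policy and the corresponding costs is a pair of
one-liners (a sketch; the actual \texttt{\_policy\_improvement} in
\texttt{main.py} may differ in its details):
\begin{verbatim}
h = h_matrix(j, g)         # shape (SN, len(A2)), np.inf marks invalid actions
policy = h.argmin(axis=1)  # index of the best action in every state
j_new = h.min(axis=1)      # the associated cost-to-go
\end{verbatim}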

The convergence criteria differ for Value Iteration and Policy Iteration. The
most sensible convergence criterion for Policy Iteration is that the policy
stops changing between iterations of the algorithm, i.e.\ $\pi_{k+1} = \pi_k$.
For Value Iteration I use the common criterion
$\|J_{k+1} - J_k\|_{\infty} < \epsilon$. The value of $\epsilon$ depends on
the discount factor $\alpha$; the relation $\epsilon = \alpha^{|S|}$, where
$|S|$ is the number of possible states, has been found empirically to give
good results.
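
A minimal sketch of these two stopping rules (the bodies of
\texttt{\_terminate\_vi} and \texttt{\_terminate\_pi} are not part of this
diff, so this is an assumption about their behavior, not a copy of them):
\begin{verbatim}
def terminate_pi(j, j_old, policy, policy_old):
    # Policy Iteration: stop once the policy is stable between sweeps.
    return np.array_equal(policy, policy_old)

def terminate_vi(j, j_old, policy, policy_old):
    # Value Iteration: stop once the sup-norm change drops below
    # epsilon = ALPHA ** SN, the empirically chosen threshold.
    return np.abs(j - j_old).max() < ALPHA ** SN
\end{verbatim}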

For the visualization I use a non-linear scale for the value function: each
distinct value in the value vector is assigned its own color, so that even
for small values of $\alpha$ the distinct values remain clearly visible. The
unnormalized representation is also provided as a reference.

\section{Algorithm inspection}
If the termination criterion for Value Iteration is chosen correctly, i.e.\
the algorithm terminates only once it has converged to an optimal policy, then
both PI and VI result in the same policy. The cost $g_2$ is a constant shift
of $g_1$ by $1$, except for the trap state. For this reason $g_1$ and $g_2$
produce the same result for most $\alpha$; however, there exist values of
$\alpha$ for which the two costs produce different policies in the vicinity of
the trap. In general, the behavior under the two costs may differ depending on
$\alpha$: for large $\alpha$ the algorithms may favor risking the trap over
going around it, while for smaller $\alpha$ the resulting policy plays it
safe.
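
The reason a uniform shift of the one-step cost cannot change the optimal
policy on its own is that it changes the discounted cost of every policy by
the same constant,
\[
J^{\pi}_{g+c}(x)
= \mathrm{E}\Big[\sum_{k=0}^{\infty} \alpha^k \big(g(x_k) + c\big)\Big]
= J^{\pi}_{g}(x) + \frac{c}{1-\alpha},
\]
so any difference between the policies obtained with $g_1$ and $g_2$ has to
come from the states where the shift is not uniform, or from the numerical
effects discussed below.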

Furthermore, for very small $\alpha$, e.g.\ $0.01$, machine precision starts
to play a role. A double-precision floating-point variable can store numbers
over a large range of magnitudes, but its precision is limited by the 52-bit
fractional part. This is not an issue for the cost $g_1$, because the negative
cost of the goal state is propagated through the maze as a number of
ever-decreasing magnitude, since the one-step costs inside the maze are $0$.
For the cost $g_2$, however, the dominating term of the value function is the
one-step cost of $1$ for the non-goal states, so the cost-free final state is
propagated as an ever-decreasing additive term, and the distance over which it
propagates is limited by the precision of the floating-point variable used to
store the value function. Hence the algorithms may not converge to the optimal
policy when $g_2$ is used together with small values of $\alpha$.
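
As a rough worked example of this effect for $\alpha = 0.01$: the value of
every non-goal state under $g_2$ is dominated by
$\sum_{k=0}^{\infty} \alpha^k = 1/(1-\alpha) \approx 1.01$, while the benefit
of a goal that is $d$ steps away enters as a term of order $\alpha^d$. This
term is lost to rounding as soon as it falls below the relative machine
precision of the stored value,
\[
\alpha^{d} < 2^{-52} \approx 2.2\cdot 10^{-16}
\quad\Longleftrightarrow\quad
d \ge 8 \quad \mbox{(for $\alpha = 0.01$)},
\]
so states more than about eight steps from the goal receive numerically
identical values, and the greedy policy there is essentially arbitrary.
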
\end{document}