diff --git a/figures/a001.png b/figures/a001.png new file mode 100644 index 0000000..ee8e48d Binary files /dev/null and b/figures/a001.png differ diff --git a/figures/a001_norm.png b/figures/a001_norm.png new file mode 100644 index 0000000..43f24ae Binary files /dev/null and b/figures/a001_norm.png differ diff --git a/figures/a05.png b/figures/a05.png new file mode 100644 index 0000000..d6fb7b9 Binary files /dev/null and b/figures/a05.png differ diff --git a/figures/a05_norm.png b/figures/a05_norm.png new file mode 100644 index 0000000..c7d44eb Binary files /dev/null and b/figures/a05_norm.png differ diff --git a/figures/a09.png b/figures/a09.png new file mode 100644 index 0000000..a6f0348 Binary files /dev/null and b/figures/a09.png differ diff --git a/figures/a09_norm.png b/figures/a09_norm.png new file mode 100644 index 0000000..f953c82 Binary files /dev/null and b/figures/a09_norm.png differ diff --git a/figures/pi1.png b/figures/pi1.png new file mode 100644 index 0000000..cc6de92 Binary files /dev/null and b/figures/pi1.png differ diff --git a/figures/vi1.png b/figures/vi1.png new file mode 100644 index 0000000..9512c80 Binary files /dev/null and b/figures/vi1.png differ diff --git a/main.py b/main.py index f8c2838..ae447b8 100644 --- a/main.py +++ b/main.py @@ -162,7 +162,6 @@ def dynamic_programming(optimizer_step, g, terminator, return_history=False): def plot_j_policy_on_maze(j, policy, normalize=True): - heatmap = np.full(MAZE.shape, np.nan, dtype=np.float64) if normalize: # Non-linear, but a discrete representation of different costs @@ -221,7 +220,7 @@ if __name__ == '__main__': plt.figure(figsize=(9, 7)) plt.subplots_adjust(top=0.9, bottom=0.05, left=0.1, right=0.95, wspace=0.1) - plt.suptitle('DISCOUNT: {}'.format(a) + + plt.suptitle('Discount: {}'.format(a) + ('\nNormalized view' if normalize else '')) i = 1 for opt in ['Value Iteration', 'Policy Iteration']: @@ -242,24 +241,22 @@ if __name__ == '__main__': # Error graphs for opt in ['Value Iteration', 'Policy Iteration']: - plt.figure(figsize=(7, 10)) + plt.figure(figsize=(6, 10)) plt.figtext(0.5, 0.04, 'Number of iterations', ha='center', fontsize='large') - plt.figtext(0.05, 0.5, 'Logarithm of cost RMSE', va='center', + plt.figtext(0.01, 0.5, 'Logarithm of cost RMSE', va='center', rotation='vertical', fontsize='large') - plt.subplots_adjust(wspace=0.38, hspace=0.35, left=0.205, right=0.92, + plt.subplots_adjust(wspace=0.38, hspace=0.35, left=0.205, right=0.98, top=0.9) plt.suptitle(opt) i = 1 for a in [0.99, 0.7, 0.1]: for cost in ['g1', 'g2']: - # name = 'Cost: {}, discount: {}'.format(cost, a) ALPHA = a history = dynamic_programming(optimizers[opt], costs[cost], terminators[opt], return_history=True) plt.subplot(3, 2, i) - # plt.gca().set_title(name) plot_cost_history(history) if i <= 2: plt.gca().set_title('Cost: {}'.format(cost)) @@ -268,6 +265,5 @@ if __name__ == '__main__': i += 1 - print('I ran in {} seconds'.format(time() - start)) plt.show() diff --git a/report.latex b/report.latex index 6d5a507..c0c4b44 100644 --- a/report.latex +++ b/report.latex @@ -4,6 +4,8 @@ \usepackage{fancyhdr} \pagestyle{fancy} \usepackage{lastpage} +\usepackage{graphicx} +% \graphicspath{{./figures}} \cfoot{Page \thepage\ of \pageref{LastPage}} \rhead{Pavel Lutskov, 03654990} \lhead{Programming Assignment} @@ -46,18 +48,19 @@ be non-zero. 
 Therefore, one would need a more space efficient representation of the
 transition probabilities, and therefore wouldn't be able to use a matrix
 library such as \textit{NumPy} for acceleration of computations.
-The one step costs in my implementation only depend on the target state,
-meaning $g(x, u, w) = g(f(x, u, w))$, therefore the cost functions are
+The one-step costs in my implementation only depend on the target state,
+meaning $g(x, u, w) = g(f(x, u, w))$, so the one-step cost functions are
 represented as vectors $G_x^1$ and $G_x^2$, where the goal state has a lower
 cost than the rest of the states, and the trap state incurs a high penalty.
 This formulation differs slightly from the formulation in the task, where for
 $g_2$ only the \textit{self-loop} in the final state is for free. However, this
 difference doesn't affect the resulting policy, and only has significant
-influence on the value function of the states directly adjacent to the goal
-state. If the cost did depend on the action taken to transit to the goal state
-(i.e.\ self-loop vs transition from the adjacent state), the cost couldn't have
-been stored as a vector, and instead a 2-D matrix would have been needed, which
-would have introduced unnecessary complexity to the code.
+influence on the cost function of the states directly adjacent to the goal
+state. If the one-step cost did depend on the action taken to enter the goal
+state (i.e.\ self-loop vs.\ transition from an adjacent state), the one-step
+cost could not have been stored as a vector; a 2-D matrix would have been
+needed instead, which would have introduced unnecessary complexity to the
+code.
 A policy is implemented as a vector $\Pi_x$, where the $x$-th element of the
 vector contains the index of the action, that will be taken in state $x$.
@@ -71,36 +74,76 @@ discount factor $\alpha$, and the relation $\epsilon = \alpha^{|S|}$, where
 $|S|$ is the number of possible states, has been empirically found to provide
 good results.
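+
+A minimal sketch of Value Iteration under this vector representation is shown
+below. It is a simplified illustration rather than a verbatim excerpt from
+\texttt{main.py}; in particular, the $|U| \times |S| \times |S|$ transition
+array \texttt{P} and the function name are only assumed for the example. It
+returns the cost vector together with a greedy policy in the $\Pi_x$ form
+described above, and terminates using the threshold $\epsilon = \alpha^{|S|}$.
+
+\begin{verbatim}
+import numpy as np
+
+def value_iteration(P, G, alpha):
+    # P[u, x, y]: probability of landing in y when taking action u in x
+    # G[y]: one-step cost of landing in state y
+    n_states = P.shape[-1]
+    eps = alpha ** n_states            # termination threshold from above
+    J = np.zeros(n_states)
+    while True:
+        # Q[u, x]: expected cost of taking action u in state x
+        Q = P @ (G + alpha * J)
+        J_new = Q.min(axis=0)
+        if np.max(np.abs(J_new - J)) < eps:
+            return J_new, Q.argmin(axis=0)   # cost vector, greedy policy
+        J = J_new
+\end{verbatim}
+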
-For visualization I used a non-linear scale for the value function. Each
-different value in the value vector was assigned a different color in order to
+\section{Algorithm inspection}
+
+For visualization I used a non-linear scale for the cost function. Each
+different value in the cost vector was assigned a different color in order to
 ensure, that for small values for $\alpha$ the distinct values could be
 clearly visible. The unnormalized representation is also provided as reference.
-\section{Algorithm inspection}
-
-If the termination criterion for Value Iteration is chosen correctly, i.e. the
+If the termination criterion for Value Iteration is chosen correctly, i.e.\ the
 algorithm only terminates when it converged to an optimal policy, then both PI
-and VI will result in the same policy. The cost $g_2$ is constantly shifted by
-$1$ relative to $g_1$, except for the trap state. For this reason $g_1$ and
-$g_2$ produce the same result for most $\alpha$, however the values of $\alpha$
-exist, for which the two costs produce different policies in the proximity of
-the trap. Generally, the behavior with both costs may differ, depending on the
-$\alpha$. For large $\alpha$ the algorithms may favor risking getting into the
-trap over going around it. For smaller $\alpha$ the resulting policy, on the
-contrary, is playing on the safe side.
+and VI will result in the same policy. The one-step cost $g_2$ is $g_1$
+shifted by a constant $1$, except for the trap state. For this reason $g_1$
+and $g_2$ produce the same result for most $\alpha$; however, values of
+$\alpha$ exist for which the two one-step costs produce different policies in
+the proximity of the trap. Generally, the behavior with both one-step costs
+may differ depending on $\alpha$. For large $\alpha$ the algorithms may favor
+risking getting into the trap over going around it. For smaller $\alpha$ the
+resulting policy, on the contrary, stays on the safe side.
 Furthermore, for very small $\alpha$, e.g.\ $0.01$, machine precision starts
 playing a role. The double precision floating point variable can store numbers
 of large range of magnitude, however the precision is limited by the 52-bit
-fractional part. The precision is not an issue for the cost $g_1$, because the
-negative cost of the goal state is propagated through the maze as a number of
-ever decreasing magnitude, since the one-step costs in the maze are $0$. For
-the cost $g_2$, however, the dominating term for the value function is the
-one-step cost of $1$ for the non-goal states, therefore the cost-free final
-state is propagated as an ever-decreasing additive term, and the distance of
-the propagation is restricted by the precision of the floating point variable
-used to store the value function. Hence, the algorithms may not converge to the
-optimal policy, when $g_2$ is used in conjunction with small values of
-$\alpha$.
+fractional part. The precision is not an issue for $g_1$, because the negative
+cost of the goal state is propagated through the maze as a number of
+ever-decreasing magnitude, since the one-step costs in the maze are $0$. For
+$g_2$, however, the dominating term of the cost function is the one-step cost
+of $1$ for the non-goal states; therefore, the cost-free final state is
+propagated as an ever-decreasing additive term, and the distance of the
+propagation is restricted by the precision of the floating point variable used
+to store the cost function. Hence, the algorithms may not converge to the
+optimal policy when $g_2$ is used in conjunction with small values of $\alpha$.
+
+For the comparison of Value Iteration and Policy Iteration I used a wide range
+of $\alpha$: $0.99$, $0.7$ and $0.1$. Using these values demonstrates the
+impact that $\alpha$ has on the optimization. With large $\alpha$ it can be
+seen that both algorithms stagnate for several iterations, after which they
+converge rapidly to the optimal policy and cost function. With decreasing
+$\alpha$ this effect becomes less pronounced, and the algorithms converge more
+steadily. From these graphs it is apparent that Policy Iteration converges in
+two to three times fewer iterations than Value Iteration. Surprisingly, the
+number of iterations doesn't seem to depend on the discount factor, which
+could mean that the given maze problem is small and simple enough that
+$\alpha$ does not have to be chosen particularly carefully. Furthermore, the
+one-step cost $g_2$ allows both algorithms to converge faster.
+
+It is natural that PI converges in fewer iterations than VI, since the policy
+is guaranteed to improve on each iteration. However, finding the exact cost
+function $J_{\pi_k}$ on each iteration can get expensive when the state space
+grows; the given maze is small, so using PI is affordable.
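+
+A correspondingly minimal sketch of Policy Iteration in the same vector
+representation is given below; again, this is an illustration rather than a
+verbatim excerpt from \texttt{main.py}, and the transition array \texttt{P} is
+assumed as in the previous sketch. The exact policy evaluation amounts to
+solving an $|S| \times |S|$ linear system, which is precisely the step that
+becomes expensive for larger state spaces.
+
+\begin{verbatim}
+import numpy as np
+
+def policy_iteration(P, G, alpha):
+    n_states = P.shape[-1]
+    policy = np.zeros(n_states, dtype=int)
+    while True:
+        # Policy evaluation: solve (I - alpha * P_pi) J = P_pi @ G exactly
+        P_pi = P[policy, np.arange(n_states)]        # |S| x |S| matrix
+        J = np.linalg.solve(np.eye(n_states) - alpha * P_pi, P_pi @ G)
+        # Policy improvement: act greedily with respect to J
+        new_policy = (P @ (G + alpha * J)).argmin(axis=0)
+        if np.array_equal(new_policy, policy):
+            return J, policy
+        policy = new_policy
+\end{verbatim}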
+ +\begin{figure} + \includegraphics[width=\linewidth]{figures/a09.png} + \includegraphics[width=\linewidth]{figures/a09_norm.png} +\end{figure} + +\begin{figure} + \includegraphics[width=\linewidth]{figures/a05.png} + \includegraphics[width=\linewidth]{figures/a05_norm.png} +\end{figure} + +\begin{figure} + \includegraphics[width=\linewidth]{figures/a001.png} + \includegraphics[width=\linewidth]{figures/a001_norm.png} +\end{figure} + +\begin{figure} + \includegraphics[width=\linewidth]{figures/vi1.png} +\end{figure} + +\begin{figure} + \includegraphics[width=\linewidth]{figures/pi1.png} +\end{figure} \end{document}