diff --git a/.gitignore b/.gitignore
index 9996ceb..f9a97fd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,6 @@
 
 # Inputs
 *.txt
+
+# Latex
+build/
diff --git a/main.py b/main.py
index 704e174..fdac15f 100644
--- a/main.py
+++ b/main.py
@@ -13,10 +13,9 @@ mpl.use('TkAgg')  # fixes my macOS bug
 import matplotlib.pyplot as plt
 
 
-P = 0.1
-ALPHA = 0.90
-EPSILON = 1e-12
-# EPSILON = 1e-12  # Convergence criterium
+P = 0.1  # Slip probability
+ALPHA = 0.90  # Discount factor
+
 A2 = np.array([  # Action index to action mapping
     [-1, 0],  # Up
     [ 1, 0],  # Down
@@ -31,7 +30,7 @@ S_TO_IJ = None  # Mapping of state vector to coordinates
 SN = None  # Number of states
 U_OF_X = None  # The allowed action space matrix representation
 PW_OF_X_U = None  # The probability distribution of disturbance
-G1_X = None  # The cost function vector representation (depends only on state)
+G1_X = None  # The cost function vector representation
 G2_X = None  # The second cost function vector representation
 F_X_U_W = None  # The System Equation
 
@@ -63,6 +62,7 @@ def init_global(maze_filename):
     S_TO_IJ = np.indices(MAZE.shape).transpose(1, 2, 0)[state_mask]
     SN = len(S_TO_IJ)
 
+
     ij_to_s = np.zeros(MAZE.shape, dtype=np.int32)
     ij_to_s[state_mask] = np.arange(SN)
 
@@ -99,6 +99,12 @@ def init_global(maze_filename):
 
 
 def plot_j_policy_on_maze(j, policy):
+    j_norm = (j - j.min()) / (j.max() - j.min()) + 1e-50
+    j_log = np.log10(j_norm)
+    print(j)
+    print(j_norm)
+    print(j_log)
+    print('-' * 50)
     heatmap = np.full(MAZE.shape, np.nan)
     heatmap[S_TO_IJ[:, 0], S_TO_IJ[:, 1]] = j
     cmap = mpl.cm.get_cmap('coolwarm')
@@ -147,13 +153,16 @@ def policy_iteration(j, g):
     return policy, j
 
 
-def _terminate(j, j_old, policy, policy_old):
-    # eps = EPSILON
-    # return np.abs(j - j_old).max() < eps
+def _terminate_pi(j, j_old, policy, policy_old):
     return np.all(policy == policy_old)
 
 
-def dynamic_programming(optimizer_step, g, return_history=False):
+def _terminate_vi(j, j_old, policy, policy_old):
+    eps = ALPHA**SN
+    return np.abs(j - j_old).max() < eps
+
+
+def dynamic_programming(optimizer_step, g, terminator, return_history=False):
     j = np.zeros(SN, dtype=np.float64)
     policy = None
     history = []
@@ -163,7 +172,7 @@
         policy, j = optimizer_step(j, g)
         if return_history:
             history.append(j)
-        if _terminate(j, j_old, policy, policy_old):
+        if terminator(j, j_old, policy, policy_old):
             break
     if not return_history:
         return j, policy
@@ -185,19 +194,20 @@ if __name__ == '__main__':
     costs = {'g1': G1_X, 'g2': G2_X}
     optimizers = {'Value Iteration': value_iteration,
                   'Policy Iteration': policy_iteration}
+    terminators = {'Value Iteration': _terminate_vi,
+                   'Policy Iteration': _terminate_pi}
 
     for a in [0.9, 0.5, 0.01]:
-        plt.figure()
+        plt.figure(figsize=(9, 6))
+        plt.subplots_adjust(top=0.9, bottom=0.05, left=0.05, right=0.95)
         plt.suptitle('DISCOUNT = ' + str(a))
         i = 1
         for opt in ['Value Iteration', 'Policy Iteration']:
             for cost in ['g1', 'g2']:
-                name = ' / '.join([opt, cost])
+                name = '{} / {}'.format(opt, cost)
                 ALPHA = a
-                j, policy = dynamic_programming(optimizers[opt], costs[cost])
-                print(name)
-                print(j)
-                # print(name, j)
+                j, policy = dynamic_programming(optimizers[opt], costs[cost],
+                                                terminators[opt])
                 plt.subplot(2, 2, i)
                 plt.gca().set_title(name)
                 plot_j_policy_on_maze(j, policy)
@@ -205,15 +215,16 @@
 
     # Error graphs
     for opt in ['Value Iteration', 'Policy Iteration']:
-        plt.figure()
-        plt.subplots_adjust(wspace=0.45, hspace=0.45)
+        plt.figure(figsize=(9, 6))
+        plt.subplots_adjust(wspace=0.4, hspace=0.4)
         plt.suptitle(opt)
         i = 1
         for cost in ['g1', 'g2']:
-            for a in [0.9, 0.8, 0.7]:
+            for a in [0.99, 0.7, 0.5]:
                 name = 'Cost: {}, discount: {}'.format(cost, a)
                 ALPHA = a
                 history = dynamic_programming(optimizers[opt], costs[cost],
+                                              terminators[opt],
                                               return_history=True)
                 plt.subplot(2, 3, i)
                 plt.gca().set_title(name)
diff --git a/report.latex b/report.latex
new file mode 100644
index 0000000..4433095
--- /dev/null
+++ b/report.latex
@@ -0,0 +1,20 @@
+\documentclass{article}
+\usepackage[a4paper, margin=1in]{geometry}
+\usepackage{amsmath}
+\usepackage{fancyhdr}
+\pagestyle{fancy}
+\usepackage{lastpage}
+\cfoot{Page \thepage\ of \pageref{LastPage}}
+\rhead{Pavel Lutskov, 03654990}
+\lhead{Programming Assignment}
+\title{\huge Approximate Dynamic Programming and Reinforcement Learning \\
+    \Large Programming Assignment}
+% \subtitle{Assignment 1}
+\author{Pavel Lutskov, 03654990}
+\begin{document}
+\maketitle
+
+\section{Environment modeling}
+
+I have modeled the environment.
+\end{document}