
Improve pseudocode

Martin Thoma 9 years ago
parent
commit
ddd08a2a45

BIN
source-code/Pseudocode/Policy-Iteration/Policy-Iteration.png


+ 3 - 2
source-code/Pseudocode/Policy-Iteration/Policy-Iteration.tex

@@ -22,8 +22,9 @@
         \Statex States $\mathcal{X} = \{1, \dots, n_x\}$
         \Statex Actions $\mathcal{A} = \{1, \dots, n_a\},\qquad A: \mathcal{X} \Rightarrow \mathcal{A}$
         \Statex Cost function $g: \mathcal{X} \times \mathcal{A} \rightarrow \mathbb{R}$
-        \Statex Transition probabilities $f$
-        \Procedure{PolicyIteration}{$\mathcal{X}$, $A$, $g$, $f$}
+        \Statex Transition probabilities $f$, $F$
+        \Statex $\alpha \in (0, 1)$
+        \Procedure{PolicyIteration}{$\mathcal{X}$, $A$, $g$, $f$, $F$, $\alpha$}
             \State Initialize $\pi$ arbitrarily
             \While{$\pi$ is not converged}
                 \State $J \gets$ solve system of linear equations $(I - \alpha \cdot F(\pi)) \cdot J = g(\pi)$
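For orientation, a minimal NumPy sketch of this policy-iteration scheme, assuming costs g[x, a], transitions f[x, a, y], and a discount alpha; the function name and array layout are illustrative, not part of the repository.

import numpy as np

def policy_iteration(g, f, alpha, max_iter=100):
    n_x, n_a, _ = f.shape
    pi = np.zeros(n_x, dtype=int)            # initialize pi arbitrarily
    for _ in range(max_iter):
        F_pi = f[np.arange(n_x), pi]         # transition matrix under pi
        g_pi = g[np.arange(n_x), pi]         # cost vector under pi
        # Policy evaluation: solve (I - alpha * F(pi)) J = g(pi)
        J = np.linalg.solve(np.eye(n_x) - alpha * F_pi, g_pi)
        # Policy improvement: greedy one-step lookahead on the costs
        pi_new = (g + alpha * f @ J).argmin(axis=1)
        if np.array_equal(pi_new, pi):       # pi has converged
            break
        pi = pi_new
    return pi, J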

BIN
source-code/Pseudocode/Value-Iteration/Value-Iteration.png


+ 1 - 1
source-code/Pseudocode/Value-Iteration/Value-Iteration.tex

@@ -22,7 +22,7 @@
         \Statex States $\mathcal{X} = \{1, \dots, n_x\}$
         \Statex Actions $\mathcal{A} = \{1, \dots, n_a\},\qquad A: \mathcal{X} \Rightarrow \mathcal{A}$
         \Statex Cost function $g: \mathcal{X} \times \mathcal{A} \rightarrow \mathbb{R}$
-        \Statex Transition probabilities $f$
+        \Statex Transition probabilities $f_{xy}(a) = \mathbb{P}(y | x, a)$
         \Statex Discounting factor $\alpha \in (0, 1)$, typically $\alpha = 0.9$
         \Procedure{ValueIteration}{$\mathcal{X}$, $A$, $g$, $f$, $\alpha$}
             \State Initialize $J, J': \mathcal{X} \rightarrow \mathbb{R}_0^+$ arbitrarily
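A rough Python sketch of the corresponding value-iteration loop, using the notation above (f[x, a, y] = P(y | x, a), costs g[x, a], discount alpha); the stopping threshold eps is an assumption added for illustration.

import numpy as np

def value_iteration(g, f, alpha=0.9, eps=1e-8):
    n_x, _, _ = f.shape
    J = np.zeros(n_x)                        # initialize J arbitrarily
    while True:
        Q = g + alpha * f @ J                # expected one-step lookahead cost
        J_new = Q.min(axis=1)                # Bellman update (costs are minimized)
        if np.max(np.abs(J_new - J)) < eps:  # J has converged
            return J_new, Q.argmin(axis=1)   # value function and greedy policy
        J = J_new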

BIN
source-code/Pseudocode/dynamic-programming/dynamic-programming.png


+ 3 - 2
source-code/Pseudocode/dynamic-programming/dynamic-programming.tex

@@ -22,7 +22,7 @@
         \Statex States $\mathcal{X} = \{1, \dots, n_x\}$
         \Statex Actions $\mathcal{A} = \{1, \dots, n_a\},\qquad A: \mathcal{X} \Rightarrow \mathcal{A}$
         \Statex Cost function $g: \mathcal{X} \times \mathcal{A} \rightarrow \mathbb{R}$
-        \Statex Horizon $N$
+        \Statex Horizon $N \in \mathbb{N}_{\geq 1}$
         \Statex Discounting factor $\alpha \in [0, 1]$
         \Procedure{DynamicProgramming}{$\mathcal{X}$, $A$, $g$, $N$, $\alpha$}
             \State $J_N(x) \gets g_N(x) \quad \forall x \in \mathcal{X}$
@@ -36,10 +36,11 @@
                     \State $\pi_k(x) \gets \arg \min_a (Q_k(x, a))$
                 \EndFor
             \EndFor
+            \Return $\pi_{0:N-1}$
         \EndProcedure
         \end{algorithmic}
     \caption{Dynamic Programming}
-    \label{alg:dynamic-programming}
+    \label{alg:dynamic-programming: Learn a strategy}
     \end{algorithm}
 \end{preview}
 \end{document}
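The finite-horizon backward recursion could be sketched in NumPy roughly as follows, assuming time-invariant stage costs g[x, a], terminal costs g_N[x], transitions f[x, a, y], and horizon N; it returns the policy sequence pi_{0:N-1} as in the pseudocode.

import numpy as np

def dynamic_programming(g, g_N, f, N, alpha=1.0):
    n_x, n_a, _ = f.shape
    J = g_N.copy()                           # J_N(x) = g_N(x)
    policy = np.zeros((N, n_x), dtype=int)
    for k in range(N - 1, -1, -1):           # backward in time: k = N-1, ..., 0
        Q = g + alpha * f @ J                # Q_k(x, a)
        policy[k] = Q.argmin(axis=1)         # pi_k(x)
        J = Q.min(axis=1)                    # J_k(x)
    return policy                            # pi_{0:N-1}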

BIN
source-code/Pseudocode/label-correction/label-correction.png


+ 3 - 4
source-code/Pseudocode/label-correction/label-correction.tex

@@ -43,14 +43,13 @@
                             \State $u \gets d_v + g_{vt}$
                         \EndIf
                     \EndIf
-                    \If{$d_c + m_c < u$}
-                        \State $u \gets d_c + m_c$
-                    \EndIf
+                    \State $u \gets \min (u, d_c + m_c)$
                 \EndFor
             \EndWhile
+            \Return $u, t$
         \EndProcedure
         \end{algorithmic}
-    \caption{Label correction algorithm}
+    \caption{Label correction algorithm: Find shortest path}
     \label{alg:label-correction-algorithm}
     \end{algorithm}
 \end{preview}
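A sketch of a label-correcting shortest-path search in the spirit of this pseudocode, assuming the graph is given as a nested dict of edge costs cost[v][w] and an optional admissible lower bound h (zero if omitted); the data layout and names are assumptions.

from collections import deque

def label_correction(cost, s, t, h=lambda v: 0.0):
    d = {s: 0.0}                             # labels d_v: best known cost to reach v
    u = float("inf")                         # upper bound on the best s-t path cost
    open_list = deque([s])
    while open_list:
        v = open_list.popleft()
        for w, c_vw in cost.get(v, {}).items():
            d_w = d[v] + c_vw
            # Keep w only if it improves its label and can still beat the bound u
            if d_w < d.get(w, float("inf")) and d_w + h(w) < u:
                d[w] = d_w
                if w == t:
                    u = d_w                  # tighten the upper bound at the target
                else:
                    open_list.append(w)
    return u, t                              # cost of the shortest s-t path (inf if none)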

BIN
source-code/Pseudocode/q-learning/q-learning.png


+ 1 - 1
source-code/Pseudocode/q-learning/q-learning.tex

@@ -33,7 +33,7 @@
                 \While{$s$ is not terminal}
                     \State Calculate $\pi$ according to Q and exploration strategy (e.g. $\pi(x) \gets \argmax_{a} Q(x, a)$)
                     \State $a \gets \pi(s)$
-                    \State $r \gets R(s, a)$
+                    \State $r \gets R(s, a)$ \Comment{Receive the reward}
                     \State $s' \gets T(s, a)$ \Comment{Receive the new state}
                     \State $Q(s, a) \gets (1 - \alpha) \cdot Q(s, a) + \alpha \cdot (r + \gamma \cdot \max_{a'} Q(s', a'))$
                     \State $s \gets s'$
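For illustration, a tabular Q-learning sketch with an epsilon-greedy exploration strategy, assuming the environment is given as reward and transition functions R(s, a) and T(s, a); the epsilon-greedy rule and all default parameters are assumptions standing in for the "exploration strategy" mentioned above.

import random
from collections import defaultdict

def q_learning(actions, R, T, is_terminal, start_state,
               alpha=0.1, gamma=0.9, epsilon=0.1, episodes=1000):
    Q = defaultdict(float)                   # Q(s, a), initialized to 0
    for _ in range(episodes):
        s = start_state
        while not is_terminal(s):
            # Epsilon-greedy policy derived from Q
            if random.random() < epsilon:
                a = random.choice(actions)
            else:
                a = max(actions, key=lambda act: Q[(s, act)])
            r = R(s, a)                      # receive the reward
            s_next = T(s, a)                 # receive the new state
            best_next = max(Q[(s_next, act)] for act in actions)
            # Update the visited pair (s, a)
            Q[(s, a)] = (1 - alpha) * Q[(s, a)] + alpha * (r + gamma * best_next)
            s = s_next
    return Q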