
%!TEX root = main.tex
\appendix
\onecolumn
\section*{Overview}
\begin{table}[H]
\centering
\hspace*{-1cm}\begin{tabular}{lllll}
\toprule
Name & Function $\varphi(x)$ & Range of Values & $\varphi'(x)$ & Used by \\\midrule %
Sign function$^\dagger$ & $\begin{cases}+1 &\text{if } x \geq 0\\-1 &\text{if } x < 0\end{cases}$ & $\Set{-1,1}$ & $0$ & \cite{971754} \\
\parbox[t]{2.6cm}{Heaviside\\step function$^\dagger$} & $\begin{cases}+1 &\text{if } x > 0\\0 &\text{if } x \leq 0\end{cases}$ & $\Set{0, 1}$ & $0$ & \cite{mcculloch1943logical}\\
Logistic function & $\frac{1}{1+e^{-x}}$ & $(0, 1)$ & $\frac{e^x}{(e^x +1)^2}$ & \cite{duch1999survey} \\
Tanh & $\frac{e^x - e^{-x}}{e^x + e^{-x}} = \tanh(x)$ & $(-1, 1)$ & $\sech^2(x)$ & \cite{LeNet-5,Thoma:2014}\\
\gls{ReLU}$^\dagger$ & $\max(0, x)$ & $[0, +\infty)$ & $\begin{cases}1 &\text{if } x > 0\\0 &\text{if } x < 0\end{cases}$ & \cite{AlexNet-2012}\\
\parbox[t]{2.6cm}{\gls{LReLU}$^\dagger$\footnotemark\\(\gls{PReLU})} & $\varphi(x) = \max(\alpha x, x)$ & $(-\infty, +\infty)$ & $\begin{cases}1 &\text{if } x > 0\\\alpha &\text{if } x < 0\end{cases}$ & \cite{maas2013rectifier,he2015delving} \\
Softplus & $\log(e^x + 1)$ & $(0, +\infty)$ & $\frac{e^x}{e^x + 1}$ & \cite{dugas2001incorporating,glorot2011deep} \\
\gls{ELU} & $\begin{cases}x &\text{if } x > 0\\\alpha (e^x - 1) &\text{if } x \leq 0\end{cases}$ & $(-\infty, +\infty)$ & $\begin{cases}1 &\text{if } x > 0\\\alpha e^x &\text{otherwise}\end{cases}$ & \cite{clevert2015fast} \\
Softmax$^\ddagger$ & $o(\mathbf{x})_j = \frac{e^{x_j}}{\sum_{k=1}^K e^{x_k}}$ & $(0, 1)^K$ & $o(\mathbf{x})_j \cdot \frac{\sum_{k=1}^K e^{x_k} - e^{x_j}}{\sum_{k=1}^K e^{x_k}}$ & \cite{AlexNet-2012,Thoma:2014}\\
Maxout$^\ddagger$ & $o(\mathbf{x}) = \max_{x \in \mathbf{x}} x$ & $(-\infty, +\infty)$ & $\begin{cases}1 &\text{if } x_i = \max \mathbf{x}\\0 &\text{otherwise}\end{cases}$ & \cite{goodfellow2013maxout} \\
\bottomrule
\end{tabular}
\caption[Activation functions]{Overview of activation functions. Functions
         marked with $\dagger$ are not differentiable at 0 and functions
         marked with $\ddagger$ operate on all elements of a layer
         simultaneously. The hyperparameter $\alpha \in (0, 1)$ of Leaky
         ReLU is typically $\alpha = 0.01$; for ELU, $\alpha = 1$ is a
         common choice. Other activation functions like randomized leaky
         ReLUs exist~\cite{xu2015empirical}, but are far less commonly
         used.\\
         Some functions are smoothed versions of others: the logistic
         function of the Heaviside step function, tanh of the sign
         function, and softplus of ReLU.\\
         Softmax is the standard activation function for the last layer of
         a classification network as it produces a probability
         distribution. See \Cref{fig:activation-functions-plot} for a plot
         of some of them.}
\label{table:activation-functions-overview}
\end{table}
\footnotetext{$\alpha$ is a hyperparameter in leaky ReLU, but a learnable parameter in the parametric ReLU function.}
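
As a reference for the definitions above, the following NumPy sketch
implements several of the tabulated functions. It illustrates the formulas
only (with the $\alpha$ values from the caption) and is not the
implementation used for the experiments in this appendix.
\begin{verbatim}
import numpy as np

def logistic(x):                  # 1 / (1 + e^{-x}), values in (0, 1)
    return 1.0 / (1.0 + np.exp(-x))

def relu(x):                      # max(0, x)
    return np.maximum(0.0, x)

def leaky_relu(x, alpha=0.01):    # max(alpha * x, x)
    return np.maximum(alpha * x, x)

def softplus(x):                  # log(e^x + 1)
    return np.log1p(np.exp(x))

def elu(x, alpha=1.0):            # x if x > 0, else alpha * (e^x - 1)
    return np.where(x > 0, x, alpha * (np.exp(x) - 1.0))

def softmax(x):                   # acts on a whole layer, sums to 1
    e = np.exp(x - np.max(x))     # shifting x does not change the result
    return e / e.sum()
\end{verbatim}
The shift by $\max(\mathbf{x})$ in the softmax leaves the output unchanged
but avoids overflow for large inputs.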
\section*{Evaluation Results}
\glsunset{LReLU}
\begin{table}[H]
\centering
\begin{tabular}{@{\extracolsep{4pt}}lcccccc@{}}
\toprule
\multirow{2}{*}{Function} & \multicolumn{4}{c}{Single model} & \multicolumn{2}{c}{Ensemble of 10} \\\cline{2-5}\cline{6-7}
& \multicolumn{2}{c}{Training set} &\multicolumn{2}{c}{Test set} & Training set & Test set \\\midrule
Identity & \SI{66.25}{\percent} & $\boldsymbol{\sigma=0.77}$ &\SI{56.74}{\percent} & $\boldsymbol{\sigma=0.51}$ & \SI{68.77}{\percent} & \SI{58.78}{\percent}\\
Logistic & \SI{51.87}{\percent} & $\sigma=3.64$ &\SI{46.54}{\percent} & $\sigma=3.22$ & \SI{61.19}{\percent} & \SI{54.58}{\percent}\\
Logistic$^-$ & \SI{66.49}{\percent} & $\sigma=1.99$ &\SI{57.84}{\percent} & $\sigma=1.15$ & \SI{69.04}{\percent} & \SI{60.10}{\percent}\\
Softmax & \SI{75.22}{\percent} & $\sigma=2.41$ &\SI{59.49}{\percent} & $\sigma=1.25$ & \SI{78.87}{\percent} & \SI{63.06}{\percent}\\
Tanh & \SI{67.27}{\percent} & $\sigma=2.38$ &\SI{55.70}{\percent} & $\sigma=1.44$ & \SI{70.21}{\percent} & \SI{58.10}{\percent}\\
Softsign & \SI{66.43}{\percent} & $\sigma=1.74$ &\SI{55.75}{\percent} & $\sigma=0.93$ & \SI{69.78}{\percent} & \SI{58.40}{\percent}\\
\gls{ReLU} & \SI{78.62}{\percent} & $\sigma=2.15$ &\SI{62.18}{\percent} & $\sigma=0.99$ & \SI{81.81}{\percent} & \SI{64.57}{\percent}\\
\gls{ReLU}$^-$ & \SI{76.01}{\percent} & $\sigma=2.31$ &\SI{62.87}{\percent} & $\sigma=1.08$ & \SI{78.18}{\percent} & \SI{64.81}{\percent}\\
Softplus & \SI{66.75}{\percent} & $\sigma=2.45$ &\SI{56.68}{\percent} & $\sigma=1.32$ & \SI{71.27}{\percent} & \SI{60.26}{\percent}\\
S2ReLU & \SI{63.32}{\percent} & $\sigma=1.69$ &\SI{56.99}{\percent} & $\sigma=1.14$ & \SI{65.80}{\percent} & \SI{59.20}{\percent}\\
\gls{LReLU} & \SI{74.92}{\percent} & $\sigma=2.49$ &\SI{61.86}{\percent} & $\sigma=1.23$ & \SI{77.67}{\percent} & \SI{64.01}{\percent}\\
\gls{PReLU} & \textbf{\SI{80.01}{\percent}} & $\sigma=2.03$ &\SI{62.16}{\percent} & $\sigma=0.73$ & \textbf{\SI{83.50}{\percent}} & \textbf{\SI{64.79}{\percent}}\\
\gls{ELU} & \SI{76.64}{\percent} & $\sigma=1.48$ &\textbf{\SI{63.38}{\percent}} & $\sigma=0.55$ & \SI{78.30}{\percent} & \SI{64.70}{\percent}\\
\bottomrule
\end{tabular}
\caption[Activation function evaluation results on CIFAR-100]{Training and
         test accuracy of adjusted baseline models trained with different
         activation functions on CIFAR-100. For \gls{LReLU}, $\alpha = 0.3$ was
         chosen.}
\label{table:CIFAR-100-accuracies-activation-functions}
\end{table}
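
The \textit{Ensemble of 10} columns presumably aggregate the predictions of
ten trained models. How the predictions were combined is not stated in this
appendix; one common choice, shown here purely as an assumed illustration,
is to average the predicted class probabilities of the individual models and
take the most probable class:
\begin{verbatim}
import numpy as np

def ensemble_predict(models, x):
    # `models` is a list of trained classifiers exposing a hypothetical
    # predict_proba(x) -> (n_samples, n_classes) method.
    probs = np.mean([m.predict_proba(x) for m in models], axis=0)
    return np.argmax(probs, axis=1)   # predicted class per sample
\end{verbatim}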
\begin{table}[H]
\centering
\setlength\tabcolsep{1.5pt}
\begin{tabular}{@{\extracolsep{4pt}}lcccccccr@{}}
\toprule
\multirow{2}{*}{Function} & \multicolumn{4}{c}{Single model} & \multicolumn{2}{c}{Ensemble of 10} & \multicolumn{2}{c}{Epochs}\\\cline{2-5}\cline{6-7}\cline{8-9}
& \multicolumn{2}{c}{Training set} &\multicolumn{2}{c}{Test set} & Train & Test & Range & \multicolumn{1}{c}{Mean} \\\midrule
Identity & \SI{87.92}{\percent} & $\sigma=0.40$ & \SI{84.69}{\percent} & $\sigma=0.08$ & \SI{88.59}{\percent} & \SI{85.43}{\percent} & \hphantom{0}92 -- 140 & 114.5\\
Logistic & \SI{81.46}{\percent} & $\sigma=5.08$ & \SI{79.67}{\percent} & $\sigma=4.85$ & \SI{86.38}{\percent} & \SI{84.60}{\percent} & \hphantom{0}\textbf{58} -- \hphantom{0}\textbf{91} & \textbf{77.3}\\
Softmax & \SI{88.19}{\percent} & $\sigma=0.31$ & \SI{84.70}{\percent} & $\sigma=0.15$ & \SI{88.69}{\percent} & \SI{85.43}{\percent} & 124 -- 171 & 145.8\\
Tanh & \SI{88.41}{\percent} & $\sigma=0.36$ & \SI{84.46}{\percent} & $\sigma=0.27$ & \SI{89.24}{\percent} & \SI{85.45}{\percent} & \hphantom{0}89 -- 123 & 108.7\\
Softsign & \SI{88.00}{\percent} & $\sigma=0.47$ & \SI{84.46}{\percent} & $\sigma=0.23$ & \SI{88.77}{\percent} & \SI{85.33}{\percent} & \hphantom{0}77 -- 119 & 104.1\\
\gls{ReLU} & \SI{88.93}{\percent} & $\sigma=0.46$ & \textbf{\SI{85.35}{\percent}} & $\sigma=0.21$ & \SI{89.35}{\percent} & \SI{85.95}{\percent} & \hphantom{0}96 -- 132 & 102.8\\
Softplus & \SI{88.42}{\percent} & $\boldsymbol{\sigma=0.29}$ & \SI{85.16}{\percent} & $\sigma=0.15$ & \SI{88.90}{\percent} & \SI{85.73}{\percent} & 108 -- 143 & 121.0\\
\gls{LReLU} & \SI{88.61}{\percent} & $\sigma=0.41$ & \SI{85.21}{\percent} & $\boldsymbol{\sigma=0.05}$ & \SI{89.07}{\percent} & \SI{85.83}{\percent} & \hphantom{0}87 -- 117 & 104.5\\
\gls{PReLU} & \textbf{\SI{89.62}{\percent}} & $\sigma=0.41$ & \textbf{\SI{85.35}{\percent}} & $\sigma=0.17$& \textbf{\SI{90.10}{\percent}} & \SI{86.01}{\percent} & \hphantom{0}85 -- 111 & 100.5\\
\gls{ELU} & \SI{89.49}{\percent} & $\sigma=0.42$ & \textbf{\SI{85.35}{\percent}} & $\sigma=0.10$ & \SI{89.94}{\percent} & \textbf{\SI{86.03}{\percent}} & \hphantom{0}73 -- 113 & 92.4\\
\bottomrule
\end{tabular}
\caption[Activation function evaluation results on HASYv2]{Training and
         test accuracy of adjusted baseline models trained with different
         activation functions on HASYv2. For \gls{LReLU}, $\alpha = 0.3$ was
         chosen.}
\label{table:HASYv2-accuracies-activation-functions}
\end{table}
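
The number of epochs differs between runs and between activation functions.
Such varying epoch counts are typical when training is stopped once a
monitored validation metric stops improving; the exact stopping rule used
for these experiments is not given in this appendix, so the following Keras
sketch is only an assumed illustration of that general idea:
\begin{verbatim}
import numpy as np
from tensorflow import keras

# Toy data standing in for the real training set (hypothetical).
x = np.random.rand(1000, 32).astype("float32")
y = np.random.randint(0, 10, size=1000)

model = keras.Sequential([
    keras.layers.Dense(64, activation="relu", input_shape=(32,)),
    keras.layers.Dense(10, activation="softmax"),
])
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")

# Stop when the validation loss has not improved for 10 epochs.
stop = keras.callbacks.EarlyStopping(monitor="val_loss", patience=10,
                                     restore_best_weights=True)
history = model.fit(x, y, validation_split=0.2, epochs=200,
                    callbacks=[stop], verbose=0)
print("epochs actually trained:", len(history.history["loss"]))
\end{verbatim}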
\begin{table}[H]
\centering
\setlength\tabcolsep{1.5pt}
\begin{tabular}{@{\extracolsep{4pt}}lcccccccr@{}}
\toprule
\multirow{2}{*}{Function} & \multicolumn{4}{c}{Single model} & \multicolumn{2}{c}{Ensemble of 10} & \multicolumn{2}{c}{Epochs}\\\cline{2-5}\cline{6-7}\cline{8-9}
& \multicolumn{2}{c}{Training set} &\multicolumn{2}{c}{Test set} & Train & Test & Range & \multicolumn{1}{c}{Mean} \\\midrule
Identity & \SI{87.49}{\percent} & $\sigma=2.50$ & \SI{69.86}{\percent} & $\sigma=1.41$ & \SI{89.78}{\percent} & \SI{71.90}{\percent} & \hphantom{0}51 -- \hphantom{0}65 & 53.4\\
Logistic & \SI{45.32}{\percent} & $\sigma=14.88$& \SI{40.85}{\percent} & $\sigma=12.56$ & \SI{51.06}{\percent} & \SI{45.49}{\percent} & \hphantom{0}38 -- \hphantom{0}93 & 74.6\\
Softmax & \SI{87.90}{\percent} & $\sigma=3.58$ & \SI{67.91}{\percent} & $\sigma=2.32$ & \SI{91.51}{\percent} & \SI{70.96}{\percent} & 108 -- 150 & 127.5\\
Tanh & \SI{85.38}{\percent} & $\sigma=4.04$ & \SI{67.65}{\percent} & $\sigma=2.01$ & \SI{90.47}{\percent} & \SI{71.29}{\percent} & \hphantom{0}48 -- \hphantom{0}92 & 65.2\\
Softsign & \SI{88.57}{\percent} & $\sigma=4.00$ & \SI{69.32}{\percent} & $\sigma=1.68$ & \SI{93.04}{\percent} & \SI{72.40}{\percent} & \hphantom{0}55 -- 117 & 83.2\\
\gls{ReLU} & \SI{94.35}{\percent} & $\sigma=3.38$ & \SI{71.01}{\percent} & $\sigma=1.63$ & \SI{98.20}{\percent} & \SI{74.85}{\percent} & \hphantom{0}52 -- \hphantom{0}98 & 75.5\\
Softplus & \SI{83.03}{\percent} & $\sigma=2.07$ & \SI{68.28}{\percent} & $\sigma=1.74$ & \SI{93.04}{\percent} & \SI{75.99}{\percent} & \hphantom{0}56 -- \hphantom{0}89 & 68.9\\
\gls{LReLU} & \SI{93.83}{\percent} & $\sigma=3.89$ & \SI{74.66}{\percent} & $\sigma=2.11$ & \SI{97.56}{\percent} & \SI{78.08}{\percent} & \hphantom{0}52 -- 120 & 80.1\\
\gls{PReLU} & \SI{95.53}{\percent} & $\sigma=1.92$ & \SI{71.69}{\percent} & $\sigma=1.37$ & \SI{98.17}{\percent} & \SI{74.69}{\percent} & \hphantom{0}59 -- 101 & 78.8\\
\gls{ELU} & \SI{95.42}{\percent} & $\sigma=3.57$ & \SI{75.09}{\percent} & $\sigma=2.39$ & \SI{98.54}{\percent} & \SI{78.66}{\percent} & \hphantom{0}66 -- \hphantom{0}72 & 67.2\\
\bottomrule
\end{tabular}
\caption[Activation function evaluation results on STL-10]{Training and
         test accuracy of adjusted baseline models trained with different
         activation functions on STL-10. For \gls{LReLU}, $\alpha = 0.3$ was
         chosen.}
\label{table:STL-10-accuracies-activation-functions}
\end{table}
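
Throughout these tables, the single-model columns report an accuracy
together with a $\sigma$ value. Presumably $\sigma$ is the standard
deviation of the accuracy over the ten individually trained models that
also make up the ensemble; this interpretation is an assumption, not stated
explicitly here. Computing such summary values from per-run accuracies is
straightforward:
\begin{verbatim}
import numpy as np

# Hypothetical per-run test accuracies (percent) of ten models;
# the real per-run values are not listed in this appendix.
acc = np.array([62.1, 63.0, 61.5, 62.4, 63.8,
                62.9, 61.9, 62.3, 63.1, 62.5])

print("mean accuracy: %.2f %%" % acc.mean())
print("sigma:         %.2f" % acc.std(ddof=1))  # sample standard deviation
\end{verbatim}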
\begin{figure}[ht]
\centering
\begin{tikzpicture}
\definecolor{color1}{HTML}{E66101}
\definecolor{color2}{HTML}{FDB863}
\definecolor{color3}{HTML}{B2ABD2}
\definecolor{color4}{HTML}{5E3C99}
\begin{axis}[
    legend pos=north west,
    legend cell align={left},
    axis x line=middle,
    axis y line=middle,
    x tick label style={/pgf/number format/fixed,
                        /pgf/number format/fixed zerofill,
                        /pgf/number format/precision=1},
    y tick label style={/pgf/number format/fixed,
                        /pgf/number format/fixed zerofill,
                        /pgf/number format/precision=1},
    grid = major,
    width=16cm,
    height=8cm,
    grid style={dashed, gray!30},
    xmin=-2, % start the diagram at this x-coordinate
    xmax= 2, % end the diagram at this x-coordinate
    ymin=-1, % start the diagram at this y-coordinate
    ymax= 2, % end the diagram at this y-coordinate
    xlabel=x,
    ylabel=y,
    tick align=outside,
    enlargelimits=false]
\addplot[domain=-2:2, color1, ultra thick,samples=500] {1/(1+exp(-x))};
\addplot[domain=-2:2, color2, ultra thick,samples=500] {tanh(x)};
\addplot[domain=-2:2, color4, ultra thick,samples=500] {max(0, x)};
\addplot[domain=-2:2, color4, ultra thick,samples=500, dashed] {ln(exp(x) + 1)};
\addplot[domain=-2:2, color3, ultra thick,samples=500, dotted] {ifthenelse(x>0, x, exp(x) - 1)};
\addlegendentry{$\varphi_1(x)=\frac{1}{1+e^{-x}}$}
\addlegendentry{$\varphi_2(x)=\tanh(x)$}
\addlegendentry{$\varphi_3(x)=\max(0, x)$}
\addlegendentry{$\varphi_4(x)=\log(e^x + 1)$}
\addlegendentry{$\varphi_5(x)=\mathrm{ELU}(x),\; \alpha = 1$}
\end{axis}
\end{tikzpicture}
\caption[Activation functions]{Activation functions plotted on the interval
         $[-2, +2]$. Of the plotted functions, only $\tanh$ and ELU can
         produce negative values. The image of ELU, ReLU and Softplus is
         unbounded above, whereas $\tanh$ and the logistic function always
         stay below~1.}
\label{fig:activation-functions-plot}
\end{figure}
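
The smoothing relations mentioned in the caption of
\Cref{table:activation-functions-overview} can be made precise: tanh is a
shifted and rescaled logistic function, the derivative of softplus is the
logistic function, and the logistic function converges pointwise (for
$x \neq 0$) to the Heaviside step function as its argument is scaled up:
\begin{align*}
\tanh(x) &= \frac{2}{1 + e^{-2x}} - 1, &
\frac{\mathrm{d}}{\mathrm{d}x} \log(e^x + 1) &= \frac{1}{1 + e^{-x}}, &
\lim_{\beta \to \infty} \frac{1}{1 + e^{-\beta x}} &=
  \begin{cases}1 &\text{if } x > 0\\0 &\text{if } x < 0\end{cases}
\end{align*}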
\glsreset{LReLU}
\twocolumn