%!TEX root = main.tex
\section{Introduction}
TODO\cite{Thoma:2014}

\section{Terminology}
TODO


\section{Activation Functions}
Nonlinear, differentiable activation functions are important for neural
networks, as they allow the networks to learn nonlinear decision boundaries.
One of the simplest and most widely used activation functions for \glspl{CNN}
is \gls{ReLU}~\cite{AlexNet-2012}, but others such as
\gls{ELU}~\cite{clevert2015fast}, \gls{PReLU}~\cite{he2015delving},
softplus~\cite{7280459} and softsign~\cite{bergstra2009quadratic} have been
proposed. The baseline uses \gls{ELU}.
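
As a quick reminder (the definitions of all eleven functions are given
in~\cref{table:activation-functions-overview}), the two functions that are most
relevant in the following are defined as
\[\text{ReLU}(x) = \max(0, x) \qquad
  \text{ELU}(x) = \begin{cases} x & \text{if } x > 0\\
                                \alpha (e^x - 1) & \text{if } x \le 0\end{cases}\]
where $\alpha > 0$ is a hyperparameter that is typically set to
$\alpha = 1$~\cite{clevert2015fast}.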

Activation functions differ in their range of values and in their derivatives.
The definitions of and further comparisons between eleven activation functions
are given in~\cref{table:activation-functions-overview}.

Theoretical explanations of why one activation function is preferable to
another in some scenarios include the following:
\begin{itemize}
    \item \textbf{Vanishing gradient}: Activation functions like tanh and the
          logistic function saturate outside of the interval $[-5, 5]$. This
          means that weight updates for preceding neurons are very small, which
          is especially a problem for very deep or recurrent networks, as
          described in~\cite{bengio1994learning}. Even if the neurons
          eventually learn, learning is slower~\cite{AlexNet-2012}. A worked
          example is given after this list.
    \item \textbf{Dying ReLU}: The dying \gls{ReLU} problem is similar to the
          vanishing gradient problem. The gradient of the \gls{ReLU} function
          is~0 for all non-positive values. Hence, if every element of the
          training set leads to a negative input for one neuron at any point
          in the training process, this neuron no longer receives any updates
          and thus no longer participates in training. This problem is
          addressed in~\cite{maas2013rectifier}.
    \item \textbf{Mean unit activation}: Some publications,
          e.g.~\cite{clevert2015fast,BatchNormalization-2015}, claim that mean
          unit activations close to~0 are desirable, because they reduce the
          bias shift effect and thereby speed up learning. The speedup of
          learning is supported by many experiments, so the possibility of
          negative activations is desirable.
\end{itemize}
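
To make the vanishing gradient argument concrete, consider the logistic
function $\sigma(x) = \frac{1}{1 + e^{-x}}$. Its derivative is
\[\sigma'(x) = \sigma(x)\bigl(1 - \sigma(x)\bigr),\]
which attains its maximum of $\sigma'(0) = \frac{1}{4}$ at $x = 0$ and decays
quickly, e.g.\ $\sigma'(5) \approx 0.0066$. This factor enters the chain rule
once per layer, so stacked, saturated logistic units shrink the gradients of
early layers dramatically.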

Those considerations are listed
in~\cref{table:properties-of-activation-functions} for 11~activation functions.
Besides the theoretical properties, empirical results are provided
in~\cref{table:CIFAR-100-accuracies-activation-functions,table:CIFAR-100-timing-activation-functions}.
For these experiments, the baseline network was adjusted so that every
activation function except that of the output layer was replaced by one of the
11~activation functions.
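
The adjustment described above can be illustrated with a short sketch. It uses
the Keras~2.0.4 \texttt{Sequential} API, the framework also used for the timing
measurements in~\cref{table:CIFAR-100-timing-activation-functions}; the helper
\texttt{build\_model} and its architecture are purely illustrative and not the
actual baseline:
\begin{verbatim}
from keras.models import Sequential
from keras.layers import Conv2D, Dense, Flatten, MaxPooling2D


def build_model(activation, input_shape=(32, 32, 3), num_classes=100):
    """Toy CNN in which every hidden layer uses `activation`.

    Illustrative only -- this is not the baseline architecture.
    """
    model = Sequential()
    model.add(Conv2D(32, (3, 3), activation=activation,
                     input_shape=input_shape))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dense(128, activation=activation))
    # The output layer always keeps softmax.
    model.add(Dense(num_classes, activation='softmax'))
    return model


# Built-in names such as 'elu', 'relu' or 'tanh' work directly;
# custom activation functions can be passed as callables.
model = build_model('elu')
\end{verbatim}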

As expected, \gls{PReLU} and \gls{ELU} performed best. Unexpectedly, the
logistic function, tanh and softplus performed worse than the identity, and it
is unclear why the pure-softmax network performed so much better than the
logistic function.

One hypothesis why the logistic function performs so badly is that it cannot
produce negative outputs. Hence the logistic$^-$ function was developed:
\[\text{logistic}^{-}(x) = \frac{1}{1+ e^{-x}} - 0.5\]
The logistic$^-$ function has the same derivative as the logistic function and
hence still suffers from the vanishing gradient problem. The network with the
logistic$^-$ function achieves an accuracy which is \SI{11.30}{\percent} better
than the network with the logistic function, but is still \SI{5.54}{\percent}
worse than the network with \gls{ELU}.
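
With the Keras backend, such a shifted logistic function can be written as a
one-line custom activation. The following is a minimal sketch, not the code
used for the experiments:
\begin{verbatim}
from keras import backend as K
from keras.layers import Dense


def logistic_minus(x):
    # logistic^-(x) = 1 / (1 + exp(-x)) - 0.5; the logistic function
    # shifted down so that negative outputs become possible.
    return K.sigmoid(x) - 0.5


# Custom activations are passed just like built-in ones:
layer = Dense(128, activation=logistic_minus)
\end{verbatim}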

Similarly, \gls{ReLU} was adjusted to allow negative outputs:
\[\text{ReLU}^{-}(x) = \max(-1, x) = \text{ReLU}(x+1) - 1\]
The results of \gls{ReLU}$^-$ are much worse on the training set, but similar
on the test set. This indicates that the possibility of a hard zero, and thus
of a sparse representation, is either not important or about as important as
the possibility of producing negative outputs. This
contradicts~\cite{glorot2011deep,srivastava2014understanding}.
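
The second equality in the definition of \gls{ReLU}$^-$ can be verified by a
short case distinction:
\[\text{ReLU}(x+1) - 1 =
  \begin{cases}
      (x + 1) - 1 = x  & \text{if } x \ge -1\\
      0 - 1       = -1 & \text{if } x < -1
  \end{cases}
  \;= \max(-1, x)\]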

A key difference between the logistic$^-$ function and \gls{ELU} is that
\gls{ELU} neither suffers from the vanishing gradient problem nor is its range
of values bounded. For this reason, the S2ReLU activation function was
developed:
\begin{align*}
    \StwoReLU(x) &= \ReLU \left (\frac{x}{2} + 1 \right ) - \ReLU \left (-\frac{x}{2} + 1 \right)\\
    &=
    \begin{cases}
        \frac{x}{2} - 1 &\text{if } x \le -2\\
        x               &\text{if } -2 \le x \le 2\\
        \frac{x}{2} + 1 &\text{if } x \ge 2
    \end{cases}
\end{align*}
This function is similar to SReLUs as introduced in~\cite{jin2016deep}. The
difference is that S2ReLU does not introduce learnable parameters. The S2ReLU
was designed to be symmetric, to be the identity close to zero and to have a
smaller absolute value than the identity farther away. It is easy to compute
and easy to implement.
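
Because S2ReLU is only a difference of two shifted \gls{ReLU} terms, it is
straightforward to express with the Keras backend. Again, this is an
illustrative sketch rather than the implementation used for the experiments:
\begin{verbatim}
from keras import backend as K


def s2relu(x):
    # S2ReLU(x) = ReLU(x/2 + 1) - ReLU(-x/2 + 1)
    return K.relu(x / 2.0 + 1.0) - K.relu(-x / 2.0 + 1.0)


# Usable wherever Keras accepts an activation, e.g.
# Dense(128, activation=s2relu).
\end{verbatim}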

Those results --- not only the absolute values, but also the relative
comparison --- might depend on the network architecture, the training
algorithm, the initialization and the dataset. Results for MNIST can be found
in~\cref{table:MNIST-accuracies-activation-functions} and for HASYv2
in~\cref{table:HASYv2-accuracies-activation-functions}. For both datasets, the
logistic function has a much shorter training time and a noticeably lower test
accuracy.

\begin{table}[H]
    \centering
    \begin{tabular}{lccc}
    \toprule
    \multirow{2}{*}{Function} & Vanishing & Negative activation & Bounded    \\
                              & gradient  & possible            & activation \\\midrule
    Identity     & \cellcolor{green!25}No                & \cellcolor{green!25}Yes & \cellcolor{green!25}No          \\
    Logistic     & \cellcolor{red!25}Yes                 & \cellcolor{red!25}No    & \cellcolor{red!25}Yes           \\
    Logistic$^-$ & \cellcolor{red!25}Yes                 & \cellcolor{green!25}Yes & \cellcolor{red!25}Yes           \\
    Softmax      & \cellcolor{red!25}Yes                 & \cellcolor{green!25}Yes & \cellcolor{red!25}Yes           \\
    tanh         & \cellcolor{red!25}Yes                 & \cellcolor{green!25}Yes & \cellcolor{red!25}Yes           \\
    Softsign     & \cellcolor{red!25}Yes                 & \cellcolor{green!25}Yes & \cellcolor{red!25}Yes           \\
    ReLU         & \cellcolor{yellow!25}Yes\footnotemark & \cellcolor{red!25}No    & \cellcolor{yellow!25}Half-sided \\
    Softplus     & \cellcolor{green!25}No                & \cellcolor{red!25}No    & \cellcolor{yellow!25}Half-sided \\
    S2ReLU       & \cellcolor{green!25}No                & \cellcolor{green!25}Yes & \cellcolor{green!25}No          \\
    LReLU/PReLU  & \cellcolor{green!25}No                & \cellcolor{green!25}Yes & \cellcolor{green!25}No          \\
    ELU          & \cellcolor{green!25}No                & \cellcolor{green!25}Yes & \cellcolor{green!25}No          \\
    \bottomrule
    \end{tabular}
    \caption[Activation function properties]{Properties of activation functions.}
    \label{table:properties-of-activation-functions}
\end{table}
\footnotetext{The dying ReLU problem is similar to the vanishing gradient problem.}

\glsunset{LReLU}

\begin{table}[H]
    \centering
    \begin{tabular}{lccccc}
    \toprule
    \multirow{2}{*}{Function} & \multicolumn{2}{c}{Inference per} & Training & \multirow{2}{*}{Epochs} & Mean total \\\cline{2-3}
                              & 1 Image & 128 Images & time & & training time \\\midrule
    Identity     & \SI{8}{\milli\second}          & \SI{42}{\milli\second}          & \SI{31}{\second\per\epoch}          & 108 -- \textbf{148} & \SI{3629}{\second} \\
    Logistic     & \SI{6}{\milli\second}          & \textbf{\SI{31}{\milli\second}} & \SI{24}{\second\per\epoch}          & \textbf{101} -- 167 & \textbf{\SI{2234}{\second}} \\
    Logistic$^-$ & \SI{6}{\milli\second}          & \textbf{\SI{31}{\milli\second}} & \textbf{\SI{22}{\second\per\epoch}} & 133 -- 255          & \SI{3421}{\second} \\
    Softmax      & \SI{7}{\milli\second}          & \SI{37}{\milli\second}          & \SI{33}{\second\per\epoch}          & 127 -- 248          & \SI{5250}{\second} \\
    Tanh         & \SI{6}{\milli\second}          & \textbf{\SI{31}{\milli\second}} & \SI{23}{\second\per\epoch}          & 125 -- 211          & \SI{3141}{\second} \\
    Softsign     & \SI{6}{\milli\second}          & \textbf{\SI{31}{\milli\second}} & \SI{23}{\second\per\epoch}          & 122 -- 205          & \SI{3505}{\second} \\
    \gls{ReLU}   & \SI{6}{\milli\second}          & \textbf{\SI{31}{\milli\second}} & \SI{23}{\second\per\epoch}          & 118 -- 192          & \SI{3449}{\second} \\
    Softplus     & \SI{6}{\milli\second}          & \textbf{\SI{31}{\milli\second}} & \SI{24}{\second\per\epoch}          & \textbf{101} -- 165 & \SI{2718}{\second} \\
    S2ReLU       & \textbf{\SI{5}{\milli\second}} & \SI{32}{\milli\second}          & \SI{26}{\second\per\epoch}          & 108 -- 209          & \SI{3231}{\second} \\
    \gls{LReLU}  & \SI{7}{\milli\second}          & \SI{34}{\milli\second}          & \SI{25}{\second\per\epoch}          & 109 -- 198          & \SI{3388}{\second} \\
    \gls{PReLU}  & \SI{7}{\milli\second}          & \SI{34}{\milli\second}          & \SI{28}{\second\per\epoch}          & 131 -- 215          & \SI{3970}{\second} \\
    \gls{ELU}    & \SI{6}{\milli\second}          & \textbf{\SI{31}{\milli\second}} & \SI{23}{\second\per\epoch}          & 146 -- 232          & \SI{3692}{\second} \\
    \bottomrule
    \end{tabular}
    \caption[Activation function timing results on CIFAR-100]{Training time and
             inference time of adjusted baseline models trained with different
             activation functions on GTX~970 \glspl{GPU} on CIFAR-100. The
             identity was expected to be the fastest function; that it is not
             is likely an implementation-specific issue of Keras~2.0.4 or
             TensorFlow~1.1.0.}
    \label{table:CIFAR-100-timing-activation-functions}
\end{table}

\begin{table}[H]
    \centering
    \begin{tabular}{lccccc}
    \toprule
    \multirow{2}{*}{Function} & \multicolumn{2}{c}{Single model} & Ensemble & \multicolumn{2}{c}{Epochs}\\\cline{2-3}\cline{5-6}
                              & Accuracy & std & Accuracy & Range & Mean \\\midrule
    Identity    & \SI{99.45}{\percent}          & $\sigma=0.09$              & \SI{99.63}{\percent}          & 55 -- \hphantom{0}77          & 62.2\\%TODO: Really?
    Logistic    & \SI{97.27}{\percent}          & $\sigma=2.10$              & \SI{99.48}{\percent}          & \textbf{37} -- \hphantom{0}76 & \textbf{54.5}\\
    Softmax     & \SI{99.60}{\percent}          & $\boldsymbol{\sigma=0.03}$ & \SI{99.63}{\percent}          & 44 -- \hphantom{0}73          & 55.6\\
    Tanh        & \SI{99.40}{\percent}          & $\sigma=0.09$              & \SI{99.57}{\percent}          & 56 -- \hphantom{0}80          & 67.6\\
    Softsign    & \SI{99.40}{\percent}          & $\sigma=0.08$              & \SI{99.57}{\percent}          & 72 -- 101                     & 84.0\\
    \gls{ReLU}  & \textbf{\SI{99.62}{\percent}} & $\sigma=0.04$              & \textbf{\SI{99.73}{\percent}} & 51 -- \hphantom{0}94          & 71.7\\
    Softplus    & \SI{99.52}{\percent}          & $\sigma=0.05$              & \SI{99.62}{\percent}          & 62 -- \hphantom{0}\textbf{70} & 68.9\\
    \gls{PReLU} & \SI{99.57}{\percent}          & $\sigma=0.07$              & \textbf{\SI{99.73}{\percent}} & 44 -- \hphantom{0}89          & 71.2\\
    \gls{ELU}   & \SI{99.53}{\percent}          & $\sigma=0.06$              & \SI{99.58}{\percent}          & 45 -- 111                     & 72.5\\
    \bottomrule
    \end{tabular}
    \caption[Activation function evaluation results on MNIST]{Test accuracy of
             adjusted baseline models trained with different activation
             functions on MNIST.}
    \label{table:MNIST-accuracies-activation-functions}
\end{table}
\glsreset{LReLU}