11 年之前 · 2a80dd2c0a
--- a/documents/write-math-ba-paper/Makefile
+++ b/documents/write-math-ba-paper/Makefile
@@ -1,12 +0,0 @@
 
				-DOKUMENT = write-math-ba-paper
			
 
				-make:
			
 
				-	pdflatex -shell-escape $(DOKUMENT).tex -interaction=batchmode -output-format=pdf # aux-files for makeindex / makeglossaries
			
 
				-	makeglossaries $(DOKUMENT)
			
 
				-	bibtex $(DOKUMENT)
			
 
				-	pdflatex -shell-escape $(DOKUMENT).tex -interaction=batchmode -output-format=pdf # include index
			
 
				-	pdflatex -shell-escape $(DOKUMENT).tex -interaction=batchmode -output-format=pdf # include symbol table
			
 
				-	pdflatex -shell-escape $(DOKUMENT).tex -interaction=batchmode -output-format=pdf # include symbol table
			
 
				-	make clean
			
 
				-
			
 
				-clean:
			
 
				-	rm -rf  $(TARGET) *.class *.html *.log *.aux *.out *.thm *.idx *.toc *.ind *.ilg figures/torus.tex *.glg *.glo *.gls *.ist *.xdy *.fdb_latexmk *.bak *.blg *.bbl *.glsdefs *.acn *.acr *.alg *.nls *.nlo *.bak *.pyg *.lot *.lof
			
--- a/documents/write-math-ba-paper/README.md
+++ b/documents/write-math-ba-paper/README.md
@@ -1,8 +1,2 @@
 
				-[Download compiled PDF](https://github.com/MartinThoma/LaTeX-examples/blob/master/documents/write-math-ba-paper/write-math-ba-paper.pdf)
			
 
				-
			
 
				-Paper for [ICDAR 2015](http://2015.icdar.org/).
			
 
				-
			
 
				-## Spell checking
			
 
				-* Spell checking `aspell --lang=en --mode=tex check write-math-ba-paper.tex`
			
 
				-* Spell checking with `http://www.reverso.net/spell-checker`
			
 
				-* https://github.com/devd/Academic-Writing-Check
			
 
				+This example is now in a private repository. If you want to get access, please
			
 
				+ask info@martin-thoma.de
			
--- a/documents/write-math-ba-paper/baseline-1.csv
+++ b/documents/write-math-ba-paper/baseline-1.csv
--- a/documents/write-math-ba-paper/baseline-2-pretraining.csv
+++ b/documents/write-math-ba-paper/baseline-2-pretraining.csv
--- a/documents/write-math-ba-paper/baseline-2.csv
+++ b/documents/write-math-ba-paper/baseline-2.csv
--- a/documents/write-math-ba-paper/figures/errors-by-epoch-pretraining/Makefile
+++ b/documents/write-math-ba-paper/figures/errors-by-epoch-pretraining/Makefile
@@ -1,35 +0,0 @@
 
				-SOURCE = errors-by-epoch-pretraining
			
 
				-DELAY = 80
			
 
				-DENSITY = 300
			
 
				-WIDTH = 512
			
 
				-
			
 
				-make:
			
 
				-	pdflatex $(SOURCE).tex -output-format=pdf
			
 
				-	make clean
			
 
				-
			
 
				-clean:
			
 
				-	rm -rf  $(TARGET) *.class *.html *.log *.aux *.data *.gnuplot
			
 
				-
			
 
				-gif:
			
 
				-	pdfcrop $(SOURCE).pdf
			
 
				-	convert -verbose -delay $(DELAY) -loop 0 -density $(DENSITY) $(SOURCE)-crop.pdf $(SOURCE).gif
			
 
				-	make clean
			
 
				-
			
 
				-png:
			
 
				-	make
			
 
				-	make svg
			
 
				-	inkscape $(SOURCE).svg -w $(WIDTH) --export-png=$(SOURCE).png
			
 
				-
			
 
				-transparentGif:
			
 
				-	convert $(SOURCE).pdf -transparent white result.gif
			
 
				-	make clean
			
 
				-
			
 
				-svg:
			
 
				-	make
			
 
				-	#inkscape $(SOURCE).pdf --export-plain-svg=$(SOURCE).svg
			
 
				-	pdf2svg $(SOURCE).pdf $(SOURCE).svg
			
 
				-	# Necessary, as pdf2svg does not always create valid svgs:
			
 
				-	inkscape $(SOURCE).svg --export-plain-svg=$(SOURCE).svg
			
 
				-	rsvg-convert -a -w $(WIDTH) -f svg $(SOURCE).svg -o $(SOURCE)2.svg
			
 
				-	inkscape $(SOURCE)2.svg --export-plain-svg=$(SOURCE).svg
			
 
				-	rm $(SOURCE)2.svg
			
--- a/documents/write-math-ba-paper/figures/errors-by-epoch-pretraining/baseline-1.csv
+++ b/documents/write-math-ba-paper/figures/errors-by-epoch-pretraining/baseline-1.csv
--- a/documents/write-math-ba-paper/figures/errors-by-epoch-pretraining/baseline-2-pretraining.csv
+++ b/documents/write-math-ba-paper/figures/errors-by-epoch-pretraining/baseline-2-pretraining.csv
--- a/documents/write-math-ba-paper/figures/errors-by-epoch-pretraining/baseline-2.csv
+++ b/documents/write-math-ba-paper/figures/errors-by-epoch-pretraining/baseline-2.csv
--- a/documents/write-math-ba-paper/figures/errors-by-epoch-pretraining/baseline-3-pretraining.csv
+++ b/documents/write-math-ba-paper/figures/errors-by-epoch-pretraining/baseline-3-pretraining.csv
--- a/documents/write-math-ba-paper/figures/errors-by-epoch-pretraining/baseline-4-pretraining.csv
+++ b/documents/write-math-ba-paper/figures/errors-by-epoch-pretraining/baseline-4-pretraining.csv
--- a/documents/write-math-ba-paper/figures/errors-by-epoch-pretraining/errors-by-epoch-pretraining.tex
+++ b/documents/write-math-ba-paper/figures/errors-by-epoch-pretraining/errors-by-epoch-pretraining.tex
@@ -1,31 +0,0 @@
 
				-\begin{tikzpicture}
			
 
				-    \begin{axis}[
			
 
				-            axis x line=middle,
			
 
				-            axis y line=middle,
			
 
				-            enlarge y limits=true,
			
 
				-            xmin=0,
			
 
				-            % xmax=1000,
			
 
				-            ymin=0.18, ymax=0.4,
			
 
				-            minor ytick={0, 0.01, ..., 1},
			
 
				-            % width=15cm, height=8cm,     % size of the image
			
 
				-            grid = both,
			
 
				-            minor grid style={dashed, gray!30},
			
 
				-            major grid style={gray!40},,
			
 
				-            %grid style={dashed, gray!30},
			
 
				-            ylabel=error,
			
 
				-            xlabel=epoch,
			
 
				-            legend cell align=left,
			
 
				-            legend style={
			
 
				-                at={(0.5,-0.1)},
			
 
				-                anchor=north,
			
 
				-                legend columns=2
			
 
				-            }
			
 
				-         ]
			
 
				-          \addplot[mark=x,green] table [each nth point=20,x=epoch, y=testerror, col sep=comma] {baseline-1.csv};
			
 
				-          \addplot[mark=x,orange] table [each nth point=20,x=epoch, y=testerror, col sep=comma] {baseline-2.csv};
			
 
				-          \addplot[mark=x,red] table [each nth point=20,x=epoch, y=testerror, col sep=comma] {baseline-2-pretraining.csv};
			
 
				-          \legend{{1 hidden layer},
			
 
				-                  {2 hidden layers},
			
 
				-                  {2 hidden layers with pretraining}}
			
 
				-    \end{axis}
			
 
				-\end{tikzpicture}
			
--- a/documents/write-math-ba-paper/glossary.tex
+++ b/documents/write-math-ba-paper/glossary.tex
@@ -1,73 +0,0 @@
 
				-%!TEX root = thesis.tex
			
 
				-%Term definitions
			
 
				-\newacronym{ANN}{ANN}{artificial neural network}
			
 
				-\newacronym{CSR}{CSR}{cursive script recognition}
			
 
				-\newacronym{DTW}{DTW}{dynamic time warping}
			
 
				-\newacronym{GTW}{GTW}{greedy time warping}
			
 
				-\newacronym{HMM}{HMM}{hidden Markov model}
			
 
				-\newacronym{HWR}{HWR}{handwriting recognition}
			
 
				-\newacronym{HWRT}{HWRT}{handwriting recognition toolkit}
			
 
				-\newacronym{MLP}{MLP}{multilayer perceptron}
			
 
				-\newacronym{MSE}{MSE}{mean squared error}
			
 
				-\newacronym{OOV}{OOV}{out of vocabulary}
			
 
				-\newacronym{TDNN}{TDNN}{time delay neural network}
			
 
				-\newacronym{PCA}{PCA}{principal component analysis}
			
 
				-\newacronym{LDA}{LDA}{linear discriminant analysis}
			
 
				-\newacronym{CROHME}{CROHME}{Competition on Recognition of Online Handwritten Mathematical Expressions}
			
 
				-\newacronym{GMM}{GMM}{Gaussian mixture model}
			
 
				-\newacronym{SVM}{SVM}{support vector machine}
			
 
				-\newacronym{PyPI}{PyPI}{Python Package Index}
			
 
				-\newacronym{CFM}{CFM}{classification figure of merit}
			
 
				-\newacronym{CE}{CE}{cross entropy}
			
 
				-\newacronym{GPU}{GPU}{graphics processing unit}
			
 
				-\newacronym{CUDA}{CUDA}{Compute Unified Device Architecture}
			
 
				-\newacronym{SLP}{SLP}{supervised layer-wise pretraining}
			
 
				-
			
 
				-% Term definitions
			
 
				-\newglossaryentry{Detexify}{name={Detexify}, description={A system used for
			
 
				-on-line handwritten symbol recognition which is described in \cite{Kirsch}}}
			
 
				-
			
 
				-\newglossaryentry{epoch}{name={epoch}, description={During iterative training of a neural network, an \textit{epoch} is a single pass through the entire training set, followed by testing of the verification set.\cite{Concise12}}}
			
 
				-
			
 
				-\newglossaryentry{hypothesis}{
			
 
				-    name={hypothesis},
			
 
				-    description={The recognition results which a classifier returns is called a hypothesis. In other words, it is the \enquote{guess} of a classifier},
			
 
				-    plural=hypotheses
			
 
				-}
			
 
				-
			
 
				-\newglossaryentry{reference}{
			
 
				-    name={reference},
			
 
				-    description={Labeled data is used to evaluate classifiers. Those labels are called references},
			
 
				-}
			
 
				-
			
 
				-\newglossaryentry{YAML}{name={YAML}, description={YAML is a human-readable data format that can be used for configuration files}}
			
 
				-\newglossaryentry{MER}{name={MER}, description={An error measure which combines symbols to equivalence classes. It was introduced on \cpageref{merged-error-introduction}}}
			
 
				-
			
 
				-\newglossaryentry{JSON}{name={JSON}, description={JSON, short for JavaScript Object Notation, is a language-independent data format that can be used to transmit data between a server and a client in web applications}}
			
 
				-
			
 
				-\newglossaryentry{hyperparamter}{name={hyperparamter}, description={A
			
 
				-\textit{hyperparamter} is a parameter of a neural net, that cannot be learned,
			
 
				-but has to be chosen}, symbol={\ensuremath{\theta}}}
			
 
				-
			
 
				-\newglossaryentry{learning rate}{name={learning rate}, description={A factor $0 \leq \eta \in \mdr$ that affects how fast new weights are learned. $\eta=0$ means that no new data is learned}, symbol={\ensuremath{\eta}}} % Andrew Ng: \alpha
			
 
				-
			
 
				-\newglossaryentry{learning rate decay}{name={learning rate decay}, description={The learning rate decay $0 < \alpha \leq 1$ is used to adjust the learning rate. After each epoch the learning rate $\eta$ is updated to $\eta \gets \eta \times \alpha$}, symbol={\ensuremath{\eta}}}
			
 
				-
			
 
				-\newglossaryentry{preactivation}{name={preactivation}, description={The preactivation of a neuron is the weighted sum of its input, before the activation function is applied}}
			
 
				-
			
 
				-\newglossaryentry{stroke}{name={stroke}, description={The path the pen took from
			
 
				-the point where the pen was put down to the point where the pen was lifted first}}
			
 
				-
			
 
				-\newglossaryentry{line}{name={line}, description={Geometric object that is infinitely long
			
 
				-and defined by two points.}}
			
 
				-
			
 
				-\newglossaryentry{line segment}{name={line segment}, description={Geometric object that has finite length
			
 
				-and defined by two points.}}
			
 
				-
			
 
				-\newglossaryentry{symbol}{name={symbol}, description={An atomic semantic entity. A more detailed description can be found in \cref{sec:what-is-a-symbol}}}
			
 
				-
			
 
				-\newglossaryentry{weight}{name={weight}, description={A
			
 
				-\textit{weight} is a parameter of a neural net, that can be learned}, symbol={\ensuremath{\weight}}}
			
 
				-
			
 
				-\newglossaryentry{control point}{name={control point}, description={A
			
 
				-\textit{control point} is a point recorded by the input device.}}
			
--- a/documents/write-math-ba-paper/variables.tex
+++ b/documents/write-math-ba-paper/variables.tex
@@ -1,12 +0,0 @@
 
				-\newcommand{\totalCollectedRecordings}{166898}  % ACTUALITY
			
 
				-\newcommand{\detexifyCollectedRecordings}{153423}
			
 
				-\newcommand{\trainingsetsize}{134804}
			
 
				-\newcommand{\validtionsetsize}{15161}
			
 
				-\newcommand{\testsetsize}{17012}
			
 
				-\newcommand{\totalClasses}{1111}
			
 
				-\newcommand{\totalClassesAnalyzed}{369}
			
 
				-\newcommand{\totalClassesAboveFifty}{680}
			
 
				-\newcommand{\totalClassesNotAnalyzedBelowFifty}{431}
			
 
				-\newcommand{\detexifyPercentage}{$\SI{91.93}{\percent}$}
			
 
				-\newcommand{\recordingsWithDots}{$\SI{2.77}{\percent}$}  % excluding i,j, ...
			
 
				-\newcommand{\recordingsWithDotsSizechange}{$\SI{0.85}{\percent}$}  % excluding i,j, ...
			
--- a/documents/write-math-ba-paper/waibel-feedback/page-1.jpg
+++ b/documents/write-math-ba-paper/waibel-feedback/page-1.jpg
--- a/documents/write-math-ba-paper/waibel-feedback/page-3.jpg
+++ b/documents/write-math-ba-paper/waibel-feedback/page-3.jpg
--- a/documents/write-math-ba-paper/waibel-feedback/page-4.jpg
+++ b/documents/write-math-ba-paper/waibel-feedback/page-4.jpg
--- a/documents/write-math-ba-paper/waibel-feedback/page-5.jpg
+++ b/documents/write-math-ba-paper/waibel-feedback/page-5.jpg
--- a/documents/write-math-ba-paper/waibel-feedback/page-6.jpg
+++ b/documents/write-math-ba-paper/waibel-feedback/page-6.jpg
--- a/documents/write-math-ba-paper/write-math-ba-paper.bib
+++ b/documents/write-math-ba-paper/write-math-ba-paper.bib
--- a/documents/write-math-ba-paper/write-math-ba-paper.pdf
+++ b/documents/write-math-ba-paper/write-math-ba-paper.pdf
--- a/documents/write-math-ba-paper/write-math-ba-paper.tex
+++ b/documents/write-math-ba-paper/write-math-ba-paper.tex
@@ -1,586 +0,0 @@
 
				-\documentclass[9pt,technote]{IEEEtran}
			
 
				-\usepackage{amssymb, amsmath} % needed for math
			
 
				-\usepackage{hyperref}   % links im text
			
 
				-\usepackage{parskip}
			
 
				-\usepackage[pdftex,final]{graphicx}
			
 
				-\usepackage{csquotes}
			
 
				-\usepackage{braket}
			
 
				-\usepackage{booktabs}
			
 
				-\usepackage{multirow}
			
 
				-\usepackage{pgfplots}
			
 
				-\usepackage{ wasysym }
			
 
				-\usepackage[noadjust]{cite}
			
 
				-\usepackage[nameinlink,noabbrev]{cleveref} % has to be after hyperref, ntheorem, amsthm
			
 
				-\usepackage[binary-units]{siunitx}
			
 
				-\sisetup{per-mode=fraction,binary-units=true}
			
 
				-\DeclareSIUnit\pixel{px}
			
 
				-\usepackage{glossaries}
			
 
				-\loadglsentries[main]{glossary}
			
 
				-\makeglossaries
			
 
				-
			
 
				-\title{On-line Recognition of Handwritten Mathematical Symbols}
			
 
				-\author{Martin Thoma, Kevin Kilgour, Sebastian St{\"u}ker and Alexander Waibel}
			
 
				-
			
 
				-\hypersetup{ 
			
 
				-  pdfauthor   = {Martin Thoma, Kevin Kilgour, Sebastian St{\"u}ker and Alexander Waibel},
			
 
				-  pdfkeywords = {Mathematics,Symbols,recognition},
			
 
				-  pdftitle    = {On-line Recognition of Handwritten Mathematical Symbols}
			
 
				-}
			
 
				-\include{variables}
			
 
				-
			
 
				-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
			
 
				-% Begin document                                                    %
			
 
				-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
			
 
				-\begin{document}
			
 
				-\maketitle
			
 
				-\begin{abstract}
			
 
				-
			
 
				-The automatic recognition of single handwritten symbols has three main
			
 
				-applications. The first application is to support users who know how a symbol
			
 
				-looks like, but not what its name is such as $\saturn$. The second application
			
 
				-is providing the necessary commands for professional publishing in books or on
			
 
				-websites, e.g. in form of \LaTeX{} commands, as MathML, or as code points. The
			
 
				-third application of single symbol classifiers is in form of a building block
			
 
				-for formula recognition.
			
 
				-
			
 
				-This paper presents a system
			
 
				-which uses the pen trajectory to classify handwritten symbols. Five
			
 
				-preprocessing steps, one data multiplication algorithm, five features and five
			
 
				-variants for multilayer Perceptron training were evaluated using $\num{166898}$
			
 
				-recordings which were collected with two crowdsourcing projects. The evaluation
			
 
				-results of these 21~experiments were used to create an optimized recognizer
			
 
				-which has a TOP-1 error of less than $\SI{17.5}{\percent}$ and a TOP-3 error of
			
 
				-$\SI{4.0}{\percent}$. This is a relative improvement of $\SI{18.5}{\percent}$
			
 
				-for the TOP-1 error and $\SI{29.7}{\percent}$ for the TOP-3 error compared to
			
 
				-the baseline system.
			
 
				-\end{abstract}
			
 
				-
			
 
				-\section{Introduction}
			
 
				-On-line recognition makes use of the pen trajectory. This means the data is
			
 
				-given as groups of sequences of tuples $(x, y, t) \in \mathbb{R}^3$, where each
			
 
				-group represents a stroke, $(x, y)$ is the position of the pen on a canvas and
			
 
				-$t$ is the time.
			
 
				-
			
 
				-On-line data was used to classify handwritten natural language text in many
			
 
				-different variants. For example, the NPen++ system classified cursive
			
 
				-handwriting into English words by using hidden Markov models and neural
			
 
				-networks\cite{Manke1995}.
			
 
				-
			
 
				-% One handwritten symbol in the described format is called a
			
 
				-% \textit{recording}. One approach to classify recordings into symbol classes
			
 
				-% assigns a probability to each class given the data. The classifier can be
			
 
				-% evaluated by using recordings which were classified by humans and were not used
			
 
				-% to train the classifier. The set of those recordings is called \textit{test
			
 
				-% set}. The TOP-$n$ error is defined as the fraction of the symbols where
			
 
				-% the correct class was not within the top $n$ classes of the highest
			
 
				-% probability.
			
 
				-
			
 
				-Several systems for mathematical symbol recognition with on-line data have been
			
 
				-described so far~\cite{Kosmala98,Mouchere2013}, but no standard test set
			
 
				-existed to compare the results of different classifiers. The used symbols
			
 
				-differed in all papers. This is unfortunate as the choice of symbols is crucial
			
 
				-for the TOP-$n$ error. For example, the symbols $o$, $O$, $\circ$ and $0$ are
			
 
				-very similar and systems which know all those classes will certainly have a
			
 
				-higher TOP-$n$ error than systems which only accept one of them. But not only
			
 
				-the classes differed, also the used data to train and test had to be collected
			
 
				-by each author again.
			
 
				-
			
 
				-Daniel Kirsch describes in~\cite{Kirsch} a system called Detexify which uses
			
 
				-time warping to classify on-line handwritten symbols and reports a TOP-3 error
			
 
				-of less than $\SI{10}{\percent}$ for a set of $\num{100}$~symbols. He did also
			
 
				-recently publish his data on \url{https://github.com/kirel/detexify-data},
			
 
				-which was collected by a crowdsourcing approach via
			
 
				-\url{http://detexify.kirelabs.org}. Those recordings as well as some recordings
			
 
				-which were collected by a similar approach via \url{http://write-math.com} were
			
 
				-used to train and evaluated different classifiers. A complete description of
			
 
				-all involved software, data and experiments is given in~\cite{Thoma:2014}.
			
 
				-
			
 
				-In this paper we present a baseline system for the classification of on-line
			
 
				-handwriting into $369$ classes of which some are very similar. An optimized
			
 
				-classifier which has a $\SI{29.7}{\percent}$ relative improvement of the TOP-3
			
 
				-error. This was achieved by using better features and layer-wise supervised
			
 
				-pretraining. The absolute improvements compared to the baseline of those
			
 
				-changes will also be shown.
			
 
				-
			
 
				-
			
 
				-\section{Steps in Handwriting Recognition}
			
 
				-The following steps are used for symbol classification:\nobreak
			
 
				-\begin{enumerate}
			
 
				-    \item \textbf{Preprocessing}: Recorded data is never perfect. Devices have
			
 
				-          errors and people make mistakes while using the devices. To tackle
			
 
				-          these problems there are preprocessing algorithms to clean the data.
			
 
				-          The preprocessing algorithms can also remove unnecessary variations
			
 
				-          of the data that do not help in the classification process, but hide
			
 
				-          what is important. Having slightly different sizes of the same symbol
			
 
				-          is an example of such a variation. Four preprocessing algorithms that
			
 
				-          clean or normalize recordings are explained in
			
 
				-          \cref{sec:preprocessing}.
			
 
				-    \item \textbf{Data multiplication}: Learning algorithms need lots of data
			
 
				-          to learn internal parameters. If there is not enough data available,
			
 
				-          domain knowledge can be considered to create new artificial data from
			
 
				-          the original data. In the domain of on-line handwriting recognition,
			
 
				-          data can be multiplied by adding rotated variants.
			
 
				-    \item \textbf{Segmentation}: The task of formula recognition can eventually
			
 
				-          be reduced to the task of symbol recognition combined with symbol
			
 
				-          placement. Before symbol recognition can be done, the formula has
			
 
				-          to be segmented. As this paper is only about single-symbol
			
 
				-          recognition, this step will not be further discussed.
			
 
				-    \item \textbf{Feature computation}: A feature is high-level information
			
 
				-          derived from the raw data after preprocessing. Some systems like
			
 
				-          Detexify take the result of the preprocessing step, but many compute
			
 
				-          new features. Those features could be designed by a human engineer or
			
 
				-          learned. Non-raw data features can have the advantage that less
			
 
				-          training data is needed since the developer can use knowledge about
			
 
				-          handwriting to compute highly discriminative features. Various
			
 
				-          features are explained in \cref{sec:features}.
			
 
				-    \item \textbf{Feature enhancement}: Applying PCA, LDA, or
			
 
				-          feature standardization might change the features in ways that could
			
 
				-          improve the performance of learning algorithms.
			
 
				-\end{enumerate}
			
 
				-
			
 
				-After these steps, we are faced with a classification learning task which
			
 
				-consists of two parts:
			
 
				-\begin{enumerate}
			
 
				-    \item \textbf{Learning} parameters for a given classifier.
			
 
				-    \item \textbf{Classifying} new recordings, sometimes called
			
 
				-          \textit{evaluation}. This should not be confused with the evaluation
			
 
				-          of the classification performance which is done for multiple
			
 
				-          topologies, preprocessing queues, and features in
			
 
				-          \Cref{ch:Evaluation}.
			
 
				-\end{enumerate}
			
 
				-
			
 
				-The classification learning task can be solved with \glspl{MLP} if the number
			
 
				-of input features is the same for every recording. There are many ways how to
			
 
				-adjust \glspl{MLP} and how to adjust their training. Some of them are
			
 
				-described in~\cref{sec:mlp-training}.
			
 
				-
			
 
				-
			
 
				-\section{Data and Implementation}
			
 
				-The combined data of Detexify and \href{http://write-math.com}{write-math.com}
			
 
				-can be downloaded via \href{http://write-math.com/data}{write-math.com/data} as
			
 
				-a compressed tar archive. It contains a list of $369$ symbols which are used in
			
 
				-mathematical context. Each symbol has at least $50$ labeled examples, but most
			
 
				-symbols have more than $200$ labeled examples and some have more than $2000$.
			
 
				-In total, more than $\num{160000}$ labeled recordings were collected.
			
 
				-
			
 
				-Preprocessing and feature computation algorithms were implemented and are
			
 
				-publicly available as open-source software in the Python package \texttt{hwrt}
			
 
				-and \gls{MLP} algorithms are available in the Python package
			
 
				-\texttt{nntoolkit}.
			
 
				-
			
 
				-
			
 
				-\section{Algorithms}
			
 
				-\subsection{Preprocessing}\label{sec:preprocessing}
			
 
				-Preprocessing in symbol recognition is done to improve the quality and
			
 
				-expressive power of the data. It should make follow-up tasks like segmentation
			
 
				-and feature extraction easier, more effective or faster. It does so by resolving
			
 
				-errors in the input data, reducing duplicate information and removing irrelevant
			
 
				-information.
			
 
				-
			
 
				-Preprocessing algorithms fall into two groups: Normalization and noise
			
 
				-reduction algorithms.
			
 
				-
			
 
				-A very important normalization algorithm in single-symbol recognition is
			
 
				-\textit{scale-and-shift}~\cite{Thoma:2014}. It scales the recording so that
			
 
				-its bounding box fits into a unit square. As the aspect ratio of a recording
			
 
				-is almost never 1:1, only one dimension will fit exactly in the unit square.
			
 
				-There are multiple ways how to shift the recording. For this paper, it was
			
 
				-chosen to shift the bigger dimension to fit into the $[0,1] \times [0,1]$ unit
			
 
				-square whereas the smaller dimension is centered in the $[-1,1] \times [-1,1]$
			
 
				-square.
			
 
				-
			
 
				-Another normalization preprocessing algorithm is resampling. As the data points
			
 
				-on the pen trajectory are generated asynchronously and with different
			
 
				-time-resolutions depending on the used hardware and software, it is desirable
			
 
				-to resample the recordings to have points spread equally in time for every
			
 
				-recording. This was done by linear interpolation of the $(x,t)$ and $(y,t)$
			
 
				-sequences and getting a fixed number of equally spaced points per stroke.
			
 
				-
			
 
				-\textit{Connect strokes} is a noise reduction algorithm. It happens sometimes
			
 
				-that the hardware detects that the user lifted the pen where the user certainly
			
 
				-didn't do so. This can be detected by measuring the Euclidean distance between
			
 
				-the end of one stroke and the beginning of the next stroke. If this distance is
			
 
				-below a threshold, then the strokes are connected.
			
 
				-
			
 
				-Due to a limited resolution of the recording device and due to erratic
			
 
				-handwriting, the pen trajectory might not be smooth. One way to smooth is
			
 
				-calculating a weighted average and replacing points by the weighted average of
			
 
				-their coordinate and their neighbors coordinates. Another way to do smoothing
			
 
				-would be to reduce the number of points with the Douglas-Peucker algorithm to
			
 
				-the most relevant ones and then interpolate the stroke between those points.
			
 
				-The Douglas-Peucker stroke simplification algorithm is usually used in
			
 
				-cartography to simplify the shape of roads. It works recursively to find a
			
 
				-subset of points of a stroke that is simpler and still similar to the original
			
 
				-shape. The algorithm adds the first and the last point $p_1$ and $p_n$ of a
			
 
				-stroke to the simplified set of points $S$. Then it searches the point $p_i$ in
			
 
				-between that has maximum distance from the line $p_1 p_n$. If this
			
 
				-distance is above a threshold $\varepsilon$, the point $p_i$ is added to $S$.
			
 
				-Then the algorithm gets applied to $p_1 p_i$ and $p_i p_n$ recursively. It is
			
 
				-described as \enquote{Algorithm 1} in~\cite{Visvalingam1990}.
			
 
				-
			
 
				-\subsection{Features}\label{sec:features}
			
 
				-Features can be \textit{global}, that means calculated for the complete
			
 
				-recording or complete strokes. Other features are calculated for single points
			
 
				-on the pen trajectory and are called \textit{local}.
			
 
				-
			
 
				-Global features are the \textit{number of strokes} in a recording, the
			
 
				-\textit{aspect ratio} of a recordings bounding box or the
			
 
				-\textit{ink} being used for a recording. The ink feature gets calculated by
			
 
				-measuring the length of all strokes combined. The re-curvature, which was
			
 
				-introduced in~\cite{Huang06}, is defined as
			
 
				-\[\text{re-curvature}(stroke) := \frac{\text{height}(stroke)}{\text{length}(stroke)}\]
			
 
				-and a stroke-global feature.
			
 
				-
			
 
				-The simplest local feature is the coordinate of the point itself. Speed,
			
 
				-curvature and a local small-resolution bitmap around the point, which was
			
 
				-introduced by Manke, Finke and Waibel in~\cite{Manke1995}, are other local
			
 
				-features.
			
 
				-
			
 
				-\subsection{Multilayer Perceptrons}\label{sec:mlp-training}
			
 
				-\Glspl{MLP} are explained in detail in~\cite{Mitchell97}. They can have
			
 
				-different numbers of hidden layers, the number of neurons per layer and the
			
 
				-activation functions can be varied. The learning algorithm is parameterized by
			
 
				-the learning rate $\eta \in (0, \infty)$, the momentum $\alpha \in [0, \infty)$
			
 
				-and the number of epochs.
			
 
				-
			
 
				-The topology of \glspl{MLP} will be denoted in the following by separating the
			
 
				-number of neurons per layer with colons. For example, the notation
			
 
				-$160{:}500{:}500{:}500{:}369$ means that the input layer gets 160~features,
			
 
				-there are three hidden layers with 500~neurons per layer and one output layer
			
 
				-with 369~neurons.
			
 
				-
			
 
				-\glspl{MLP} training can be executed in various different ways, for example
			
 
				-with \gls{SLP}. In case of a \gls{MLP} with the topology
			
 
				-$160{:}500{:}500{:}500{:}369$, \gls{SLP} works as follows: At first a \gls{MLP}
			
 
				-with one hidden layer ($160{:}500{:}369$) is trained. Then the output layer is
			
 
				-discarded, a new hidden layer and a new output layer is added and it is trained
			
 
				-again, resulting in a $160{:}500{:}500{:}369$ \gls{MLP}. The output layer is
			
 
				-discarded again, a new hidden layer is added and a new output layer is added
			
 
				-and the training is executed again.
			
 
				-
			
 
				-Denoising auto-encoders are another way of pretraining. An
			
 
				-\textit{auto-encoder} is a neural network that is trained to restore its input.
			
 
				-This means the number of input neurons is equal to the number of output
			
 
				-neurons. The weights define an \textit{encoding} of the input that allows
			
 
				-restoring the input. As the neural network finds the encoding by itself, it is
			
 
				-called auto-encoder. If the hidden layer is smaller than the input layer, it
			
 
				-can be used for dimensionality reduction~\cite{Hinton1989}. If only one hidden
			
 
				-layer with linear activation functions is used, then the hidden layer contains
			
 
				-the principal components after training~\cite{Duda2001}.
			
 
				-
			
 
				-Denoising auto-encoders are a variant introduced in~\cite{Vincent2008} that
			
 
				-is more robust to partial corruption of the input features. It is trained to
			
 
				-get robust by adding noise to the input features.
			
 
				-
			
 
				-There are multiple ways how noise can be added. Gaussian noise and
			
 
				-randomly masking elements with zero are two possibilities. \cite{Deeplearning-Denoising-AE}
			
 
				-describes how such a denoising auto-encoder with masking noise can be
			
 
				-implemented. The \texttt{corruption} is the probability of a feature being
			
 
				-masked.
			
 
				-
			
 
				-\section{Evaluation}\label{ch:Evaluation}
			
 
				-In order to evaluate the effect of different preprocessing algorithms, features
			
 
				-and adjustments in the \gls{MLP} training and topology, the following baseline
			
 
				-system was used:
			
 
				-
			
 
				-Scale the recording to fit into a unit square while keeping the aspect ratio,
			
 
				-shift it into $[-1,1] \times [-1,1]$ as described in \cref{sec:preprocessing},
			
 
				-resample it with linear interpolation to get 20~points per stroke, spaced
			
 
				-evenly in time. Take the first 4~strokes with 20~points per stroke and
			
 
				-2~coordinates per point as features, resulting in 160~features which is equal
			
 
				-to the number of input neurons. If a recording has less than 4~strokes, the
			
 
				-remaining features were filled with zeroes.
			
 
				-
			
 
				-All experiments were evaluated with four baseline systems $B_i$, $i \in \Set{1,
			
 
				-2, 3, 4}$, where $i$ is the number of hidden layers as different topologies
			
 
				-could have a severe influence on the effect of new features or preprocessing
			
 
				-steps. Each hidden layer in all evaluated systems has $500$ neurons.
			
 
				-
			
 
				-Each \gls{MLP} was trained with a learning rate of $\eta = 0.1$ and a momentum
			
 
				-of $\alpha = 0.1$. The activation function of every neuron in a hidden layer is
			
 
				-the sigmoid function $\text{sig}(x) := \frac{1}{1+e^{-x}}$. The neurons in the
			
 
				-output layer use the softmax function. For every experiment, exactly one part
			
 
				-of the baseline systems was changed.
			
 
				-
			
 
				-\subsection{Random Weight Initialization}
			
 
				-The neural networks in all experiments got initialized with a small random
			
 
				-weight
			
 
				-
			
 
				-\[w_{i,j} \sim U(-4 \cdot \sqrt{\frac{6}{n_l + n_{l+1}}}, 4 \cdot \sqrt{\frac{6}{n_l + n_{l+1}}})\]
			
 
				-
			
 
				-where $w_{i,j}$ is the weight between the neurons $i$ and $j$, $l$ is the layer
			
 
				-of neuron $i$, and $n_i$ is the number of neurons in layer $i$. This random
			
 
				-initialization was suggested on
			
 
				-\cite{deeplearningweights} and is done to break symmetry.
			
 
				-
			
 
				-This might lead to different error rates for the same systems just because the
			
 
				-initialization was different.
			
 
				-
			
 
				-In order to get an impression of the magnitude of the influence on the different
			
 
				-topologies and error rates the baseline models were trained 5 times with
			
 
				-random initializations.
			
 
				-\Cref{table:baseline-systems-random-initializations-summary}
			
 
				-shows a summary of the results. The more hidden layers are used, the more do
			
 
				-the results vary between different random weight initializations.
			
 
				-
			
 
				-\begin{table}[h]
			
 
				-    \centering
			
 
				-    \begin{tabular}{crrr|rrr} %chktex 44
			
 
				-    \toprule
			
 
				-    \multirow{3}{*}{System}  & \multicolumn{6}{c}{Classification error}\\
			
 
				-    \cmidrule(l){2-7}
			
 
				-          & \multicolumn{3}{c}{TOP-1}   & \multicolumn{3}{c}{TOP-3}\\
			
 
				-          & min                    & max                    & range                 & min                   & max                   & range\\\midrule
			
 
				-    $B_1$ & $\SI{23.08}{\percent}$ & $\SI{23.44}{\percent}$ & $\SI{0.36}{\percent}$ & $\SI{6.67}{\percent}$ & $\SI{6.80}{\percent}$ & $\SI{0.13}{\percent}$ \\
			
 
				-    $B_2$ & \underline{$\SI{21.45}{\percent}$} & \underline{$\SI{21.83}{\percent}$}& $\SI{0.38}{\percent}$ & $\SI{5.68}{\percent}$ & \underline{$\SI{5.75}{\percent}$} & $\SI{0.07}{\percent}$\\
			
 
				-    $B_3$ & $\SI{21.54}{\percent}$ & $\SI{22.28}{\percent}$ & $\SI{0.74}{\percent}$ & \underline{$\SI{5.50}{\percent}$} & $\SI{5.82}{\percent}$ & $\SI{0.32}{\percent}$\\
			
 
				-    $B_4$ & $\SI{23.19}{\percent}$ & $\SI{24.84}{\percent}$ & $\SI{1.65}{\percent}$ & $\SI{5.98}{\percent}$ & $\SI{6.44}{\percent}$ & $\SI{0.46}{\percent}$\\
			
 
				-    \bottomrule
			
 
				-    \end{tabular}
			
 
				-    \caption{The systems $B_1$ -- $B_4$ were randomly initialized, trained
			
 
				-             and evaluated 5~times to estimate the influence of random weight
			
 
				-             initialization.}
			
 
				-\label{table:baseline-systems-random-initializations-summary}
			
 
				-\end{table}
			
 
				-
			
 
				-\subsection{Connect strokes}
			
 
				-In order to solve the problem of interrupted strokes, pairs of strokes
			
 
				-can be connected with stroke connect algorithm. The idea is that for
			
 
				-a pair of consecutively drawn strokes $s_{i}, s_{i+1}$ the last point $s_i$ is
			
 
				-close to the first point of $s_{i+1}$ if a stroke was accidentally split
			
 
				-into two strokes.
			
 
				-
			
 
				-$\SI{59}{\percent}$ of all stroke pair distances in the collected data are
			
 
				-between $\SI{30}{\pixel}$ and $\SI{150}{\pixel}$. Hence the stroke connect
			
 
				-algorithm was tried with $\SI{5}{\pixel}$, $\SI{10}{\pixel}$ and
			
 
				-$\SI{20}{\pixel}$.
			
 
				-All models TOP-3 error improved with a threshold of $\theta = \SI{10}{\pixel}$
			
 
				-by at least $\SI{0.17}{\percent}$, except $B_4$ which improved only by
			
 
				-$\SI{0.01}{\percent}$ which could be a result of random weight initialization.
			
 
				-
			
 
				-\subsection{Douglas-Peucker Smoothing}
			
 
				-The Douglas-Peucker algorithm can be used to find
			
 
				-points that are more relevant for the overall shape of a recording. After that,
			
 
				-an interpolation can be done. If the interpolation is a cubic spline
			
 
				-interpolation, this makes the recording smooth.
			
 
				-
			
 
				-The Douglas-Peucker algorithm was applied with a threshold of $\varepsilon =
			
 
				-0.05$, $\varepsilon = 0.1$ and $\varepsilon = 0.2$ after scaling and shifting,
			
 
				-but before resampling. The interpolation in the resampling step was done
			
 
				-linearly and with cubic splines in two experiments. The recording was scaled
			
 
				-and shifted again after the interpolation because the bounding box might have
			
 
				-changed.
			
 
				-
			
 
				-The result of the application of the Douglas-Peucker smoothing with $\varepsilon
			
 
				-> 0.05$ was a high rise of the TOP-1 and TOP-3 error for all models $B_i$.
			
 
				-This means that the simplification process removes some relevant information and
			
 
				-does not --- as it was expected --- remove only noise. For $\varepsilon = 0.05$
			
 
				-with linear interpolation some models TOP-1 error improved, but the
			
 
				-changes were small. It could be an effect of random weight initialization.
			
 
				-However, cubic spline interpolation made all systems perform more than
			
 
				-$\SI{1.7}{\percent}$ worse for TOP-1 and TOP-3 error.
			
 
				-
			
 
				-The lower the value of $\varepsilon$, the less does the recording change after
			
 
				-this preprocessing step. As it was applied after scaling the recording such that
			
 
				-the biggest dimension of the recording (width or height) is $1$, a value of
			
 
				-$\varepsilon = 0.05$ means that a point has to move at least $\SI{5}{\percent}$
			
 
				-of the biggest dimension.
			
 
				-
			
 
				-\subsection{Global Features}
			
 
				-Single global features were added one at a time to the baseline systems. Those
			
 
				-features were re-curvature $\text{re-curvature}(stroke) = \frac{\text{height}(stroke)}{\text{length}(stroke)}$
			
 
				-as described in \cite{Huang06}, the ink feature which is the summed length
			
 
				-of all strokes, the stroke count, the aspect ratio and the stroke center points
			
 
				-for the first four strokes. The stroke center point feature improved the system
			
 
				-$B_1$ by $\SI{0.27}{\percent}$ for the TOP-3 error and system $B_3$ for the
			
 
				-TOP-1 error by $\SI{0.74}{\percent}$, but all other systems and error measures
			
 
				-either got worse or did not improve much.
			
 
				-
			
 
				-The other global features did improve the systems $B_1 -- B_3$, but not $B_4$.
			
 
				-The highest improvement was achieved with the re-curvature feature. It
			
 
				-improved the systems $B_1 -- B_4$ by more than $\SI{0.6}{\percent}$ TOP-1 error.
			
 
				-
			
 
				-
			
 
				-\subsection{Data Multiplication}
			
 
				-Data multiplication can be used to make the model invariant to transformations.
			
 
				-However, this idea seems not to work well in the domain of on-line handwritten
			
 
				-mathematical symbols. It was tried to triple the data by adding a rotated
			
 
				-version that is rotated 3 degrees to the left and another one that is rotated
			
 
				-3 degrees to the right around the center of mass. This data multiplication
			
 
				-made all classifiers for most error measures perform worse by more than
			
 
				-$\SI{2}{\percent}$ for the TOP-1 error.
			
 
				-
			
 
				-\subsection{Pretraining}\label{subsec:pretraining-evaluation}
			
 
				-Pretraining is a technique used to improve the training of \glspl{MLP} with
			
 
				-multiple hidden layers.
			
 
				-
			
 
				-\Cref{fig:training-and-test-error-for-different-topologies-pretraining} shows
			
 
				-the evolution of the TOP-1 error over 1000~epochs with supervised
			
 
				-layer-wise pretraining and without pretraining. It clearly shows that this
			
 
				-kind of pretraining improves the classification performance by $\SI{1.6}{\percent}$
			
 
				-for the TOP-1 error and $\SI{1.0}{\percent}$ for the TOP-3 error.
			
 
				-
			
 
				-\begin{figure}[htb]
			
 
				-    \centering
			
 
				-    \input{figures/errors-by-epoch-pretraining/errors-by-epoch-pretraining.tex}
			
 
				-    \caption{Training- and test error by number of trained epochs for different
			
 
				-             topologies with \gls{SLP}. The plot shows
			
 
				-             that all pretrained systems performed much better than the systems
			
 
				-             without pretraining. All plotted systems did not improve
			
 
				-             with more epochs of training.}
			
 
				-\label{fig:training-and-test-error-for-different-topologies-pretraining}
			
 
				-\end{figure}
			
 
				-
			
 
				-Pretraining with denoising auto-encoder lead to the much worse results listed in
			
 
				-\cref{table:pretraining-denoising-auto-encoder}. The first layer used a $\tanh$
			
 
				-activation function. Every layer was trained for $1000$ epochs and the 
			
 
				-\gls{MSE} loss function. A learning-rate of $\eta = 0.001$, a corruption of
			
 
				-$0.3$ and a $L_2$ regularization of $\lambda = 10^{-4}$ were chosen. This
			
 
				-pretraining setup made all systems with all error measures perform much worse.
			
 
				-
			
 
				-\begin{table}[tb]
			
 
				-    \centering
			
 
				-    \begin{tabular}{lrrrr}
			
 
				-    \toprule
			
 
				-    \multirow{2}{*}{System}  & \multicolumn{4}{c}{Classification error}\\
			
 
				-    \cmidrule(l){2-5}
			
 
				-              & TOP-1                  & change                 & TOP-3                 & change                 \\\midrule
			
 
				-    $B_{1,p}$ & $\SI{23.75}{\percent}$ & $\SI{+0.41}{\percent}$ & $\SI{7.19}{\percent}$ & $\SI{+0.39}{\percent}$\\
			
 
				-    $B_{2,p}$ & \underline{$\SI{22.76}{\percent}$} & $\SI{+1.25}{\percent}$ & $\SI{6.38}{\percent}$ & $\SI{+0.63}{\percent}$\\
			
 
				-    $B_{3,p}$ & $\SI{23.10}{\percent}$ & $\SI{+1.17}{\percent}$ & \underline{$\SI{6.14}{\percent}$} & $\SI{+0.40}{\percent}$\\
			
 
				-    $B_{4,p}$ & $\SI{25.59}{\percent}$ & $\SI{+1.71}{\percent}$ & $\SI{6.99}{\percent}$ & $\SI{+0.87}{\percent}$\\
			
 
				-    \bottomrule
			
 
				-    \end{tabular}
			
 
				-    \caption{Systems with denoising auto-encoder pretraining compared to pure
			
 
				-             gradient descent. The pretrained systems clearly performed worse.}
			
 
				-\label{table:pretraining-denoising-auto-encoder}
			
 
				-\end{table}
			
 
				-
			
 
				-\subsection{Optimized Recognizer}
			
 
				-All preprocessing steps and features that were useful were combined to
			
 
				-create a recognizer that should perform best.
			
 
				-
			
 
				-All models were much better than everything that was tried before. The results
			
 
				-of this experiment show that single-symbol recognition with
			
 
				-\totalClassesAnalyzed{} classes and usual touch devices and the mouse can be
			
 
				-done with a TOP1 error rate of $\SI{18.56}{\percent}$ and a TOP3 error of
			
 
				-$\SI{4.11}{\percent}$. This was
			
 
				-achieved by a \gls{MLP} with a $167{:}500{:}500{:}\totalClassesAnalyzed{}$ topology.
			
 
				-
			
 
				-It used an algorithm to connect strokes of which the ends were less than
			
 
				-$\SI{10}{\pixel}$ away, scaled each recording to a unit square and shifted this
			
 
				-unit square to $(0,0)$. After that, a linear resampling step was applied to the
			
 
				-first 4 strokes to resample them to 20 points each. All other strokes were
			
 
				-discarded.
			
 
				-
			
 
				-The 167 features were
			
 
				-
			
 
				-\begin{itemize}
			
 
				-     \item the first 4 strokes with 20 points per stroke resulting in 160
			
 
				-           features,
			
 
				-     \item the re-curvature for the first 4 strokes,
			
 
				-     \item the ink,
			
 
				-     \item the number of strokes and
			
 
				-     \item the aspect ratio
			
 
				-\end{itemize}
			
 
				-
			
 
				-\Gls{SLP} was applied with $\num{1000}$ epochs per layer, a
			
 
				-learning rate of $\eta=0.1$ and a momentum of $\alpha=0.1$. After that, the
			
 
				-complete model was trained again for $1000$ epochs with standard mini-batch
			
 
				-gradient descent.
			
 
				-
			
 
				-After the models $B_{1,c}$ -- $B_{4,c}$ were trained the first $1000$ epochs,
			
 
				-they were trained again for $1000$ epochs with a learning rate of $\eta = 0.05$.
			
 
				-\Cref{table:complex-recognizer-systems-evaluation} shows that
			
 
				-this improved the classifiers again.
			
 
				-
			
 
				-\begin{table}[htb]
			
 
				-    \centering
			
 
				-    \begin{tabular}{lrrrr}
			
 
				-    \toprule
			
 
				-    \multirow{2}{*}{System}  & \multicolumn{4}{c}{Classification error}\\
			
 
				-    \cmidrule(l){2-5}
			
 
				-              & TOP1                   & change                 & TOP3                  & change\\\midrule
			
 
				-    $B_{1,c}$ & $\SI{20.96}{\percent}$ & $\SI{-2.38}{\percent}$ & $\SI{5.24}{\percent}$ & $\SI{-1.56}{\percent}$\\
			
 
				-    $B_{2,c}$ & $\SI{18.26}{\percent}$ & $\SI{-3.25}{\percent}$ & $\SI{4.07}{\percent}$ & $\SI{-1.68}{\percent}$\\
			
 
				-    $B_{3,c}$ & \underline{$\SI{18.19}{\percent}$} & $\SI{-3.74}{\percent}$ & \underline{$\SI{4.06}{\percent}$} & $\SI{-1.68}{\percent}$\\
			
 
				-    $B_{4,c}$ & $\SI{18.57}{\percent}$ & $\SI{-5.31}{\percent}$ & $\SI{4.25}{\percent}$ & $\SI{-1.87}{\percent}$\\\midrule
			
 
				-    $B_{1,c}'$ & $\SI{19.33}{\percent}$ & $\SI{-1.63}{\percent}$ & $\SI{4.78}{\percent}$ & $\SI{-0.46}{\percent}$ \\
			
 
				-    $B_{2,c}'$ & \underline{$\SI{17.52}{\percent}$} & $\SI{-0.74}{\percent}$ & \underline{$\SI{4.04}{\percent}$} & $\SI{-0.03}{\percent}$\\
			
 
				-    $B_{3,c}'$ & $\SI{17.65}{\percent}$ & $\SI{-0.54}{\percent}$ & $\SI{4.07}{\percent}$ & $\SI{+0.01}{\percent}$\\
			
 
				-    $B_{4,c}'$ & $\SI{17.82}{\percent}$ & $\SI{-0.75}{\percent}$ & $\SI{4.26}{\percent}$ & $\SI{+0.01}{\percent}$\\
			
 
				-    \bottomrule
			
 
				-    \end{tabular}
			
 
				-    \caption{Error rates of the optimized recognizer systems. The systems
			
 
				-             $B_{i,c}'$ were trained another $1000$ epochs with a learning rate
			
 
				-             of $\eta=0.05$. The value of the column \enquote{change} of the
			
 
				-             systems $B_{i,c}'$ is relative to $B_{i,c}$.}
			
 
				-\label{table:complex-recognizer-systems-evaluation}
			
 
				-\end{table}
			
 
				-
			
 
				-
			
 
				-\section{Discussion}
			
 
				-Four baseline recognition systems were adjusted in many experiments and their
			
 
				-recognition capabilities were compared in order to build a recognition system
			
 
				-that can recognize 396 mathematical symbols with low error rates as well as to
			
 
				-evaluate which preprocessing steps and features help to improve the recognition
			
 
				-rate.
			
 
				-
			
 
				-All recognition systems were trained and evaluated with
			
 
				-$\num{\totalCollectedRecordings{}}$ recordings for \totalClassesAnalyzed{}
			
 
				-symbols. These recordings were collected by two crowdsourcing projects 
			
 
				-(\href{http://detexify.kirelabs.org/classify.html}{Detexify} and
			
 
				-\href{write-math.com}{write-math.com}) and created with various devices. While
			
 
				-some recordings were created with standard touch devices such as tablets and
			
 
				-smartphones, others were created with the mouse.
			
 
				-
			
 
				-\Glspl{MLP} were used for the classification task. Four baseline systems with
			
 
				-different numbers of hidden layers were used, as the number of hidden layer
			
 
				-influences the capabilities and problems of \glspl{MLP}.
			
 
				-
			
 
				-All baseline systems used the same preprocessing queue. The recordings were
			
 
				-scaled to fit into a unit square, shifted to $(0,0)$, resampled with linear
			
 
				-interpolation so that every stroke had exactly 20~points which are spread
			
 
				-equidistant in time. The 80~($x,y$) coordinates of the first 4~strokes were used
			
 
				-to get exactly $160$ input features for every recording. The baseline system
			
 
				-$B_2$ has a TOP-3 error of $\SI{5.75}{\percent}$.
			
 
				-
			
 
				-Adding two slightly rotated variants for each recording and hence tripling the
			
 
				-training set made the systems $B_3$ and $B_4$ perform much worse, but improved
			
 
				-the performance of the smaller systems.
			
 
				-
			
 
				-The global features re-curvature, ink, stoke count and aspect ratio improved the
			
 
				-systems $B_1$--$B_3$, whereas the stroke center point feature made $B_2$ perform
			
 
				-worse.
			
 
				-
			
 
				-Denoising auto-encoders were evaluated as one way
			
 
				-to use pretraining, but by this the error rate increased notably. However,
			
 
				-supervised layer-wise pretraining improved the performance decidedly.
			
 
				-
			
 
				-The stroke connect algorithm was added to the preprocessing steps of the
			
 
				-baseline system as well as the re-curvature feature, the ink feature, the number
			
 
				-of strokes and the aspect ratio. The training setup of the baseline system was
			
 
				-changed to supervised layer-wise pretraining and the resulting model was trained
			
 
				-with a lower learning rate again. This optimized recognizer $B_{2,c}'$ had a TOP-3
			
 
				-error of $\SI{4.04}{\percent}$. This means that the TOP-3 error dropped by over
			
 
				-$\SI{1.7}{\percent}$ in comparison to the baseline system $B_2$.
			
 
				-
			
 
				-A TOP-3 error of $\SI{4.04}{\percent}$ makes the system usable for symbol lookup.
			
 
				-It could also be used as a starting point for the development of a
			
 
				-multiple-symbol classifier.
			
 
				-
			
 
				-The aim of this work was to develop a symbol recognition system which is easy
			
 
				-to use, fast and has high recognition rates as well as evaluating ideas for
			
 
				-single symbol classifiers. Some of those goals were reached. The recognition
			
 
				-system $B_{2,c}'$ evaluates new recordings in a fraction of a second and has
			
 
				-acceptable recognition rates.
			
 
				-
			
 
				-% Many algorithms were evaluated.
			
 
				-% However, there are still many other algorithms which could be evaluated and, at
			
 
				-% the time of this work, the best classifier $B_{2,c}'$ is only available
			
 
				-% through the Python package \texttt{hwrt}. It is planned to add an web version
			
 
				-% of that classifier online.
			
 
				-
			
 
				-\bibliographystyle{IEEEtranSA}
			
 
				-\bibliography{write-math-ba-paper}
			
 
				-\end{document}