\subsection{Write Math}
\begin{frame}{write-math.com}
    \begin{itemize}
        \item a website where users can add labeled training data and
              unlabeled data which they want to classify.
              I call each such data item a \enquote{recording}.
        \begin{figure}[ht]
            \centering
            \subfloat{
                \includegraphics[height=0.1\textwidth]{../images/279952.pdf}
            }%
            \qquad
            \subfloat{
                \includegraphics[height=0.1\textwidth]{../images/281507.pdf}
            }%
            \qquad
            \subfloat{
                \includegraphics[height=0.1\textwidth]{../images/287612.pdf}
            }%
            \qquad
            \subfloat{
                \includegraphics[height=0.1\textwidth]{../images/292175.pdf}
            }%
            \caption*{4 recordings}
        \end{figure}
        \item works with desktop computers and touch devices
        \item symbol recognition can be done by multiple classifiers
        \item users can contribute formulas as recordings and as \LaTeX{}
              answers for recordings
        \item users can vote for \LaTeX{} answers:
              {\Large $\leq$, $\leqq$, $\leqslant$, \dots}
        \item the user who entered the recording can accept one answer
    \end{itemize}
\end{frame}

% \framedgraphic{Classify}{../images/classify.png}
% \framedgraphic{Workflow}{../images/workflow.png}
% \framedgraphic{User page}{../images/user-page.png}
% \framedgraphic{Information about recordings}{../images/view.png}
% \framedgraphic{Symbol page}{../images/symbol.png}
% \framedgraphic{Training}{../images/train.png}
% \framedgraphic{Ranking}{../images/ranking.png}

\begin{frame}[fragile]{Statistics}
    \begin{itemize}
        \item 127 users with at least 5 recordings
        \item $\num{1111}$ symbols, but only $\num{369}$ used for experiments
        \item $\num{235831}$ recordings (e.g.,
              $\num{3489}$ times \verb+\int+, but only 50 times \verb+X+)
    \end{itemize}
\end{frame}

% \begin{frame}{First classification worker}
%     \begin{itemize}
%         \item preprocessing: Scale to fit into unit square while keeping the aspect
%               ratio
%         \item applies greedy time warping
%         \item compares a new recording with every recording
%               in the database
%         \item[$\Rightarrow$] Classification time is in $\mathcal{O}(\text{recordings})$,
%               but we would rather have $\mathcal{O}(\text{symbols})$
%         \item the current server / workflow can only handle about 4000 recordings
%         \item[$\Rightarrow$] Another way to classify is necessary
%     \end{itemize}
% \end{frame}

\begin{frame}[fragile]{Handwriting Recognition Toolkit (hwrt)}
    \begin{itemize}
        \item Many preprocessing algorithms / features implemented
        \item Plug-in system for preprocessing algorithms / features
        \item Needs a neural network toolkit
        \item Hosted at \url{https://github.com/MartinThoma/hwrt}
        \item Installable via \verb+pip+ (Python package installer):\\
              \verb+$ pip install hwrt+
    \end{itemize}
\end{frame}

% NOTE(review): the nesting indentation of the YAML below was reconstructed
% from a whitespace-collapsed source; confirm against the real config file.
\begin{frame}[fragile]{hwrt preprocessing configuration file}
\begin{verbatim}
data-source: raw-datasets/2014-08-26-20-14-data-raw.pickle
queue:
  - RemoveDuplicateTime: null
  - ScaleAndShift:
    - max_width: 1.0
    - max_height: 1.0
    - center: true
  - SpaceEvenlyPerStroke:
    - kind: linear
    - number: 20
\end{verbatim}
\end{frame}

\subsection{Preprocessing algorithms}
\begin{frame}{Preprocessing algorithms}
    \begin{itemize}
        \item Normalizing
        \begin{itemize}
            \item Scaling
            \item Shifting
            \item Resampling
        \end{itemize}
        \item Noise reduction
        \begin{itemize}
            \item Smoothing (e.g.,
                  moving average)
            \item Dot reduction
            \item Filtering (by distance, speed or angle)
            \item Stroke connection
        \end{itemize}
    \end{itemize}
\end{frame}

% NOTE(review): the nesting indentation of the YAML below was reconstructed
% from a whitespace-collapsed source; confirm against the real config file.
\begin{frame}[fragile]{hwrt feature configuration file}
\begin{verbatim}
data-source: preprocessed/baseline
data-multiplication:
  - Multiply:
    - nr: 1
features:
  - ConstantPointCoordinates:
    - strokes: 4
    - points_per_stroke: 20
    - fill_empty_with: 0
    - pen_down: false
\end{verbatim}
\end{frame}

\subsection{Features}
\begin{frame}{Features}
    \begin{itemize}
        \item Local
        \begin{itemize}
            \item Coordinates
            \item Speed
            \item Binary pen pressure
            \item Direction
            \item Curvature
            \item Bitmap-environment
            \item Hat-Feature
        \end{itemize}
        \item Global
        \begin{itemize}
            \item \# of points
            \item \# of strokes
            \item Center point
            \item Bitmap
            \item Bounding box (width, height, time)
        \end{itemize}
    \end{itemize}
\end{frame}