# Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """String network description language to define network layouts.""" import re import time import decoder import errorcounter as ec import shapes import tensorflow as tf import vgsl_input import vgslspecs import tensorflow.contrib.slim as slim from tensorflow.core.framework import summary_pb2 from tensorflow.python.platform import tf_logging as logging # Parameters for rate decay. # We divide the learning_rate_halflife by DECAY_STEPS_FACTOR and use DECAY_RATE # as the decay factor for the learning rate, ie we use the DECAY_STEPS_FACTORth # root of 2 as the decay rate every halflife/DECAY_STEPS_FACTOR to achieve the # desired halflife. DECAY_STEPS_FACTOR = 16 DECAY_RATE = pow(0.5, 1.0 / DECAY_STEPS_FACTOR) def Train(train_dir, model_str, train_data, max_steps, master='', task=0, ps_tasks=0, initial_learning_rate=0.001, final_learning_rate=0.001, learning_rate_halflife=160000, optimizer_type='Adam', num_preprocess_threads=1, reader=None): """Testable trainer with no dependence on FLAGS. Args: train_dir: Directory to write checkpoints. model_str: Network specification string. train_data: Training data file pattern. max_steps: Number of training steps to run. master: Name of the TensorFlow master to use. task: Task id of this replica running the training. (0 will be master). ps_tasks: Number of tasks in ps job, or 0 if no ps job. initial_learning_rate: Learing rate at start of training. final_learning_rate: Asymptotic minimum learning rate. learning_rate_halflife: Number of steps over which to halve the difference between initial and final learning rate. optimizer_type: One of 'GradientDescent', 'AdaGrad', 'Momentum', 'Adam'. num_preprocess_threads: Number of input threads. reader: Function that returns an actual reader to read Examples from input files. If None, uses tf.TFRecordReader(). """ if master.startswith('local'): device = tf.ReplicaDeviceSetter(ps_tasks) else: device = '/cpu:0' with tf.Graph().as_default(): with tf.device(device): model = InitNetwork(train_data, model_str, 'train', initial_learning_rate, final_learning_rate, learning_rate_halflife, optimizer_type, num_preprocess_threads, reader) # Create a Supervisor. It will take care of initialization, summaries, # checkpoints, and recovery. # # When multiple replicas of this program are running, the first one, # identified by --task=0 is the 'chief' supervisor. It is the only one # that takes case of initialization, etc. sv = tf.train.Supervisor( logdir=train_dir, is_chief=(task == 0), saver=model.saver, save_summaries_secs=10, save_model_secs=30, recovery_wait_secs=5) step = 0 while step < max_steps: try: # Get an initialized, and possibly recovered session. Launch the # services: Checkpointing, Summaries, step counting. with sv.managed_session(master) as sess: while step < max_steps: _, step = model.TrainAStep(sess) if sv.coord.should_stop(): break except tf.errors.AbortedError as e: logging.error('Received error:%s', e) continue def Eval(train_dir, eval_dir, model_str, eval_data, decoder_file, num_steps, graph_def_file=None, eval_interval_secs=0, reader=None): """Restores a model from a checkpoint and evaluates it. Args: train_dir: Directory to find checkpoints. eval_dir: Directory to write summary events. model_str: Network specification string. eval_data: Evaluation data file pattern. decoder_file: File to read to decode the labels. num_steps: Number of eval steps to run. graph_def_file: File to write graph definition to for freezing. eval_interval_secs: How often to run evaluations, or once if 0. reader: Function that returns an actual reader to read Examples from input files. If None, uses tf.TFRecordReader(). Returns: (char error rate, word recall error rate, sequence error rate) as percent. Raises: ValueError: If unimplemented feature is used. """ decode = None if decoder_file: decode = decoder.Decoder(decoder_file) # Run eval. rates = ec.ErrorRates( label_error=None, word_recall_error=None, word_precision_error=None, sequence_error=None) with tf.Graph().as_default(): model = InitNetwork(eval_data, model_str, 'eval', reader=reader) sw = tf.summary.FileWriter(eval_dir) while True: sess = tf.Session('') if graph_def_file is not None: # Write the eval version of the graph to a file for freezing. if not tf.gfile.Exists(graph_def_file): with tf.gfile.FastGFile(graph_def_file, 'w') as f: f.write( sess.graph.as_graph_def(add_shapes=True).SerializeToString()) ckpt = tf.train.get_checkpoint_state(train_dir) if ckpt and ckpt.model_checkpoint_path: step = model.Restore(ckpt.model_checkpoint_path, sess) if decode: rates = decode.SoftmaxEval(sess, model, num_steps) _AddRateToSummary('Label error rate', rates.label_error, step, sw) _AddRateToSummary('Word recall error rate', rates.word_recall_error, step, sw) _AddRateToSummary('Word precision error rate', rates.word_precision_error, step, sw) _AddRateToSummary('Sequence error rate', rates.sequence_error, step, sw) sw.flush() print 'Error rates=', rates else: raise ValueError('Non-softmax decoder evaluation not implemented!') if eval_interval_secs: time.sleep(eval_interval_secs) else: break return rates def InitNetwork(input_pattern, model_spec, mode='eval', initial_learning_rate=0.00005, final_learning_rate=0.00005, halflife=1600000, optimizer_type='Adam', num_preprocess_threads=1, reader=None): """Constructs a python tensor flow model defined by model_spec. Args: input_pattern: File pattern of the data in tfrecords of Example. model_spec: Concatenation of input spec, model spec and output spec. See Build below for input/output spec. For model spec, see vgslspecs.py mode: One of 'train', 'eval' initial_learning_rate: Initial learning rate for the network. final_learning_rate: Final learning rate for the network. halflife: Number of steps over which to halve the difference between initial and final learning rate for the network. optimizer_type: One of 'GradientDescent', 'AdaGrad', 'Momentum', 'Adam'. num_preprocess_threads: Number of threads to use for image processing. reader: Function that returns an actual reader to read Examples from input files. If None, uses tf.TFRecordReader(). Eval tasks need only specify input_pattern and model_spec. Returns: A VGSLImageModel class. Raises: ValueError: if the model spec syntax is incorrect. """ model = VGSLImageModel(mode, model_spec, initial_learning_rate, final_learning_rate, halflife) left_bracket = model_spec.find('[') right_bracket = model_spec.rfind(']') if left_bracket < 0 or right_bracket < 0: raise ValueError('Failed to find [] in model spec! ', model_spec) input_spec = model_spec[:left_bracket] layer_spec = model_spec[left_bracket:right_bracket + 1] output_spec = model_spec[right_bracket + 1:] model.Build(input_pattern, input_spec, layer_spec, output_spec, optimizer_type, num_preprocess_threads, reader) return model class VGSLImageModel(object): """Class that builds a tensor flow model for training or evaluation. """ def __init__(self, mode, model_spec, initial_learning_rate, final_learning_rate, halflife): """Constructs a VGSLImageModel. Args: mode: One of "train", "eval" model_spec: Full model specification string, for reference only. initial_learning_rate: Initial learning rate for the network. final_learning_rate: Final learning rate for the network. halflife: Number of steps over which to halve the difference between initial and final learning rate for the network. """ # The string that was used to build this model. self.model_spec = model_spec # The layers between input and output. self.layers = None # The train/eval mode. self.mode = mode # The initial learning rate. self.initial_learning_rate = initial_learning_rate self.final_learning_rate = final_learning_rate self.decay_steps = halflife / DECAY_STEPS_FACTOR self.decay_rate = DECAY_RATE # Tensor for the labels. self.labels = None self.sparse_labels = None # Debug data containing the truth text. self.truths = None # Tensor for loss self.loss = None # Train operation self.train_op = None # Tensor for the global step counter self.global_step = None # Tensor for the output predictions (usually softmax) self.output = None # True if we are using CTC training mode. self.using_ctc = False # Saver object to load or restore the variables. self.saver = None def Build(self, input_pattern, input_spec, model_spec, output_spec, optimizer_type, num_preprocess_threads, reader): """Builds the model from the separate input/layers/output spec strings. Args: input_pattern: File pattern of the data in tfrecords of TF Example format. input_spec: Specification of the input layer: batchsize,height,width,depth (4 comma-separated integers) Training will run with batches of batchsize images, but runtime can use any batch size. height and/or width can be 0 or -1, indicating variable size, otherwise all images must be the given size. depth must be 1 or 3 to indicate greyscale or color. NOTE 1-d image input, treating the y image dimension as depth, can be achieved using S1(1x0)1,3 as the first op in the model_spec, but the y-size of the input must then be fixed. model_spec: Model definition. See vgslspecs.py output_spec: Output layer definition: O(2|1|0)(l|s|c)n output layer with n classes. 2 (heatmap) Output is a 2-d vector map of the input (possibly at different scale). 1 (sequence) Output is a 1-d sequence of vector values. 0 (value) Output is a 0-d single vector value. l uses a logistic non-linearity on the output, allowing multiple hot elements in any output vector value. s uses a softmax non-linearity, with one-hot output in each value. c uses a softmax with CTC. Can only be used with s (sequence). NOTE Only O1s and O1c are currently supported. optimizer_type: One of 'GradientDescent', 'AdaGrad', 'Momentum', 'Adam'. num_preprocess_threads: Number of threads to use for image processing. reader: Function that returns an actual reader to read Examples from input files. If None, uses tf.TFRecordReader(). """ self.global_step = tf.Variable(0, name='global_step', trainable=False) shape = _ParseInputSpec(input_spec) out_dims, out_func, num_classes = _ParseOutputSpec(output_spec) self.using_ctc = out_func == 'c' images, heights, widths, labels, sparse, _ = vgsl_input.ImageInput( input_pattern, num_preprocess_threads, shape, self.using_ctc, reader) self.labels = labels self.sparse_labels = sparse self.layers = vgslspecs.VGSLSpecs(widths, heights, self.mode == 'train') last_layer = self.layers.Build(images, model_spec) self._AddOutputs(last_layer, out_dims, out_func, num_classes) if self.mode == 'train': self._AddOptimizer(optimizer_type) # For saving the model across training and evaluation self.saver = tf.train.Saver() def TrainAStep(self, sess): """Runs a training step in the session. Args: sess: Session in which to train the model. Returns: loss, global_step. """ _, loss, step = sess.run([self.train_op, self.loss, self.global_step]) return loss, step def Restore(self, checkpoint_path, sess): """Restores the model from the given checkpoint path into the session. Args: checkpoint_path: File pathname of the checkpoint. sess: Session in which to restore the model. Returns: global_step of the model. """ self.saver.restore(sess, checkpoint_path) return tf.train.global_step(sess, self.global_step) def RunAStep(self, sess): """Runs a step for eval in the session. Args: sess: Session in which to run the model. Returns: output tensor result, labels tensor result. """ return sess.run([self.output, self.labels]) def _AddOutputs(self, prev_layer, out_dims, out_func, num_classes): """Adds the output layer and loss function. Args: prev_layer: Output of last layer of main network. out_dims: Number of output dimensions, 0, 1 or 2. out_func: Output non-linearity. 's' or 'c'=softmax, 'l'=logistic. num_classes: Number of outputs/size of last output dimension. """ height_in = shapes.tensor_dim(prev_layer, dim=1) logits, outputs = self._AddOutputLayer(prev_layer, out_dims, out_func, num_classes) if self.mode == 'train': # Setup loss for training. self.loss = self._AddLossFunction(logits, height_in, out_dims, out_func) tf.summary.scalar('loss', self.loss) elif out_dims == 0: # Be sure the labels match the output, even in eval mode. self.labels = tf.slice(self.labels, [0, 0], [-1, 1]) self.labels = tf.reshape(self.labels, [-1]) logging.info('Final output=%s', outputs) logging.info('Labels tensor=%s', self.labels) self.output = outputs def _AddOutputLayer(self, prev_layer, out_dims, out_func, num_classes): """Add the fully-connected logits and SoftMax/Logistic output Layer. Args: prev_layer: Output of last layer of main network. out_dims: Number of output dimensions, 0, 1 or 2. out_func: Output non-linearity. 's' or 'c'=softmax, 'l'=logistic. num_classes: Number of outputs/size of last output dimension. Returns: logits: Pre-softmax/logistic fully-connected output shaped to out_dims. outputs: Post-softmax/logistic shaped to out_dims. Raises: ValueError: if syntax is incorrect. """ # Reduce dimensionality appropriate to the output dimensions. batch_in = shapes.tensor_dim(prev_layer, dim=0) height_in = shapes.tensor_dim(prev_layer, dim=1) width_in = shapes.tensor_dim(prev_layer, dim=2) depth_in = shapes.tensor_dim(prev_layer, dim=3) if out_dims: # Combine any remaining height and width with batch and unpack after. shaped = tf.reshape(prev_layer, [-1, depth_in]) else: # Everything except batch goes to depth, and therefore has to be known. shaped = tf.reshape(prev_layer, [-1, height_in * width_in * depth_in]) logits = slim.fully_connected(shaped, num_classes, activation_fn=None) if out_func == 'l': raise ValueError('Logistic not yet supported!') else: output = tf.nn.softmax(logits) # Reshape to the dessired output. if out_dims == 2: output_shape = [batch_in, height_in, width_in, num_classes] elif out_dims == 1: output_shape = [batch_in, height_in * width_in, num_classes] else: output_shape = [batch_in, num_classes] output = tf.reshape(output, output_shape, name='Output') logits = tf.reshape(logits, output_shape) return logits, output def _AddLossFunction(self, logits, height_in, out_dims, out_func): """Add the appropriate loss function. Args: logits: Pre-softmax/logistic fully-connected output shaped to out_dims. height_in: Height of logits before going into the softmax layer. out_dims: Number of output dimensions, 0, 1 or 2. out_func: Output non-linearity. 's' or 'c'=softmax, 'l'=logistic. Returns: loss: That which is to be minimized. Raises: ValueError: if logistic is used. """ if out_func == 'c': # Transpose batch to the middle. ctc_input = tf.transpose(logits, [1, 0, 2]) # Compute the widths of each batch element from the input widths. widths = self.layers.GetLengths(dim=2, factor=height_in) cross_entropy = tf.nn.ctc_loss(ctc_input, self.sparse_labels, widths) elif out_func == 's': if out_dims == 2: self.labels = _PadLabels3d(logits, self.labels) elif out_dims == 1: self.labels = _PadLabels2d( shapes.tensor_dim( logits, dim=1), self.labels) else: self.labels = tf.slice(self.labels, [0, 0], [-1, 1]) self.labels = tf.reshape(self.labels, [-1]) cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=logits, labels=self.labels, name='xent') else: # TODO(rays) Labels need an extra dimension for logistic, so different # padding functions are needed, as well as a different loss function. raise ValueError('Logistic not yet supported!') return tf.reduce_sum(cross_entropy) def _AddOptimizer(self, optimizer_type): """Adds an optimizer with learning rate decay to minimize self.loss. Args: optimizer_type: One of 'GradientDescent', 'AdaGrad', 'Momentum', 'Adam'. Raises: ValueError: if the optimizer type is unrecognized. """ learn_rate_delta = self.initial_learning_rate - self.final_learning_rate learn_rate_dec = tf.add( tf.train.exponential_decay(learn_rate_delta, self.global_step, self.decay_steps, self.decay_rate), self.final_learning_rate) if optimizer_type == 'GradientDescent': opt = tf.train.GradientDescentOptimizer(learn_rate_dec) elif optimizer_type == 'AdaGrad': opt = tf.train.AdagradOptimizer(learn_rate_dec) elif optimizer_type == 'Momentum': opt = tf.train.MomentumOptimizer(learn_rate_dec, momentum=0.9) elif optimizer_type == 'Adam': opt = tf.train.AdamOptimizer(learning_rate=learn_rate_dec) else: raise ValueError('Invalid optimizer type: ' + optimizer_type) tf.summary.scalar('learn_rate', learn_rate_dec) self.train_op = opt.minimize( self.loss, global_step=self.global_step, name='train') def _PadLabels3d(logits, labels): """Pads or slices 3-d labels to match logits. Covers the case of 2-d softmax output, when labels is [batch, height, width] and logits is [batch, height, width, onehot] Args: logits: 4-d Pre-softmax fully-connected output. labels: 3-d, but not necessarily matching in size. Returns: labels: Resized by padding or clipping to match logits. """ logits_shape = shapes.tensor_shape(logits) labels_shape = shapes.tensor_shape(labels) labels = tf.reshape(labels, [-1, labels_shape[2]]) labels = _PadLabels2d(logits_shape[2], labels) labels = tf.reshape(labels, [labels_shape[0], -1]) labels = _PadLabels2d(logits_shape[1] * logits_shape[2], labels) return tf.reshape(labels, [labels_shape[0], logits_shape[1], logits_shape[2]]) def _PadLabels2d(logits_size, labels): """Pads or slices the 2nd dimension of 2-d labels to match logits_size. Covers the case of 1-d softmax output, when labels is [batch, seq] and logits is [batch, seq, onehot] Args: logits_size: Tensor returned from tf.shape giving the target size. labels: 2-d, but not necessarily matching in size. Returns: labels: Resized by padding or clipping the last dimension to logits_size. """ pad = logits_size - tf.shape(labels)[1] def _PadFn(): return tf.pad(labels, [[0, 0], [0, pad]]) def _SliceFn(): return tf.slice(labels, [0, 0], [-1, logits_size]) return tf.cond(tf.greater(pad, 0), _PadFn, _SliceFn) def _ParseInputSpec(input_spec): """Parses input_spec and returns the numbers obtained therefrom. Args: input_spec: Specification of the input layer. See Build. Returns: shape: ImageShape with the desired shape of the input. Raises: ValueError: if syntax is incorrect. """ pattern = re.compile(R'(\d+),(\d+),(\d+),(\d+)') m = pattern.match(input_spec) if m is None: raise ValueError('Failed to parse input spec:' + input_spec) batch_size = int(m.group(1)) y_size = int(m.group(2)) if int(m.group(2)) > 0 else None x_size = int(m.group(3)) if int(m.group(3)) > 0 else None depth = int(m.group(4)) if depth not in [1, 3]: raise ValueError('Depth must be 1 or 3, had:', depth) return vgsl_input.ImageShape(batch_size, y_size, x_size, depth) def _ParseOutputSpec(output_spec): """Parses the output spec. Args: output_spec: Output layer definition. See Build. Returns: out_dims: 2|1|0 for 2-d, 1-d, 0-d. out_func: l|s|c for logistic, softmax, softmax+CTC num_classes: Number of classes in output. Raises: ValueError: if syntax is incorrect. """ pattern = re.compile(R'(O)(0|1|2)(l|s|c)(\d+)') m = pattern.match(output_spec) if m is None: raise ValueError('Failed to parse output spec:' + output_spec) out_dims = int(m.group(2)) out_func = m.group(3) if out_func == 'c' and out_dims != 1: raise ValueError('CTC can only be used with a 1-D sequence!') num_classes = int(m.group(4)) return out_dims, out_func, num_classes def _AddRateToSummary(tag, rate, step, sw): """Adds the given rate to the summary with the given tag. Args: tag: Name for this value. rate: Value to add to the summary. Perhaps an error rate. step: Global step of the graph for the x-coordinate of the summary. sw: Summary writer to which to write the rate value. """ sw.add_summary( summary_pb2.Summary(value=[summary_pb2.Summary.Value( tag=tag, simple_value=rate)]), step)