# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
- """String network description language to define network layouts."""
- import re
- import time
- import decoder
- import errorcounter as ec
- import shapes
- import tensorflow as tf
- import vgsl_input
- import vgslspecs
- import tensorflow.contrib.slim as slim
- from tensorflow.core.framework import summary_pb2
- from tensorflow.python.platform import tf_logging as logging

# Parameters for learning-rate decay.
# The learning_rate_halflife is divided by DECAY_STEPS_FACTOR to give the
# decay period, and DECAY_RATE is used as the decay factor, i.e. the
# learning rate is scaled by the DECAY_STEPS_FACTORth root of 1/2 every
# halflife/DECAY_STEPS_FACTOR steps, which achieves the desired halflife.
DECAY_STEPS_FACTOR = 16
DECAY_RATE = pow(0.5, 1.0 / DECAY_STEPS_FACTOR)
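
# Worked example of the constants above (a sketch): with
# learning_rate_halflife = 160000,
#   decay_steps = 160000 / DECAY_STEPS_FACTOR = 10000
#   DECAY_RATE = 0.5 ** (1.0 / 16) ~= 0.9576
# so over one halflife (16 decay periods) the decayed part of the
# learning rate is scaled by DECAY_RATE ** 16 = 0.5, halving it.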


def Train(train_dir,
          model_str,
          train_data,
          max_steps,
          master='',
          task=0,
          ps_tasks=0,
          initial_learning_rate=0.001,
          final_learning_rate=0.001,
          learning_rate_halflife=160000,
          optimizer_type='Adam',
          num_preprocess_threads=1,
          reader=None):
- """Testable trainer with no dependence on FLAGS.
- Args:
- train_dir: Directory to write checkpoints.
- model_str: Network specification string.
- train_data: Training data file pattern.
- max_steps: Number of training steps to run.
- master: Name of the TensorFlow master to use.
- task: Task id of this replica running the training. (0 will be master).
- ps_tasks: Number of tasks in ps job, or 0 if no ps job.
- initial_learning_rate: Learing rate at start of training.
- final_learning_rate: Asymptotic minimum learning rate.
- learning_rate_halflife: Number of steps over which to halve the difference
- between initial and final learning rate.
- optimizer_type: One of 'GradientDescent', 'AdaGrad', 'Momentum', 'Adam'.
- num_preprocess_threads: Number of input threads.
- reader: Function that returns an actual reader to read Examples from input
- files. If None, uses tf.TFRecordReader().
- """
  if master.startswith('local'):
    device = tf.train.replica_device_setter(ps_tasks)
  else:
    device = '/cpu:0'
  with tf.Graph().as_default():
    with tf.device(device):
      model = InitNetwork(train_data, model_str, 'train',
                          initial_learning_rate, final_learning_rate,
                          learning_rate_halflife, optimizer_type,
                          num_preprocess_threads, reader)

      # Create a Supervisor. It will take care of initialization, summaries,
      # checkpoints, and recovery.
      #
      # When multiple replicas of this program are running, the first one,
      # identified by --task=0, is the 'chief' supervisor. It is the only one
      # that takes care of initialization, etc.
      sv = tf.train.Supervisor(
          logdir=train_dir,
          is_chief=(task == 0),
          saver=model.saver,
          save_summaries_secs=10,
          save_model_secs=30,
          recovery_wait_secs=5)

      step = 0
      while step < max_steps:
        try:
          # Get an initialized, and possibly recovered, session. Launch the
          # services: checkpointing, summaries, step counting.
          with sv.managed_session(master) as sess:
            while step < max_steps:
              _, step = model.TrainAStep(sess)
              if sv.coord.should_stop():
                break
        except tf.errors.AbortedError as e:
          logging.error('Received error: %s', e)
          continue
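

# Example usage of Train (a sketch; the paths and the VGSL spec string
# below are hypothetical):
#   Train('/tmp/mdir', '1,0,0,1[Ct5,5,16 Mp3,3 Lfx64]O1c105',
#         '/data/train-*.tfrecords', max_steps=10000)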


def Eval(train_dir,
         eval_dir,
         model_str,
         eval_data,
         decoder_file,
         num_steps,
         graph_def_file=None,
         eval_interval_secs=0,
         reader=None):
- """Restores a model from a checkpoint and evaluates it.
- Args:
- train_dir: Directory to find checkpoints.
- eval_dir: Directory to write summary events.
- model_str: Network specification string.
- eval_data: Evaluation data file pattern.
- decoder_file: File to read to decode the labels.
- num_steps: Number of eval steps to run.
- graph_def_file: File to write graph definition to for freezing.
- eval_interval_secs: How often to run evaluations, or once if 0.
- reader: Function that returns an actual reader to read Examples from input
- files. If None, uses tf.TFRecordReader().
- Returns:
- (char error rate, word recall error rate, sequence error rate) as percent.
- Raises:
- ValueError: If unimplemented feature is used.
- """
  decode = None
  if decoder_file:
    decode = decoder.Decoder(decoder_file)
  # Run eval.
  rates = ec.ErrorRates(
      label_error=None,
      word_recall_error=None,
      word_precision_error=None,
      sequence_error=None)
  with tf.Graph().as_default():
    model = InitNetwork(eval_data, model_str, 'eval', reader=reader)
    sw = tf.summary.FileWriter(eval_dir)

    while True:
      sess = tf.Session('')
      if graph_def_file is not None:
        # Write the eval version of the graph to a file for freezing.
        if not tf.gfile.Exists(graph_def_file):
          with tf.gfile.FastGFile(graph_def_file, 'w') as f:
            f.write(
                sess.graph.as_graph_def(add_shapes=True).SerializeToString())
      ckpt = tf.train.get_checkpoint_state(train_dir)
      if ckpt and ckpt.model_checkpoint_path:
        step = model.Restore(ckpt.model_checkpoint_path, sess)
        if decode:
          rates = decode.SoftmaxEval(sess, model, num_steps)
          _AddRateToSummary('Label error rate', rates.label_error, step, sw)
          _AddRateToSummary('Word recall error rate', rates.word_recall_error,
                            step, sw)
          _AddRateToSummary('Word precision error rate',
                            rates.word_precision_error, step, sw)
          _AddRateToSummary('Sequence error rate', rates.sequence_error, step,
                            sw)
          sw.flush()
          print 'Error rates=', rates
        else:
          raise ValueError('Non-softmax decoder evaluation not implemented!')
      if eval_interval_secs:
        time.sleep(eval_interval_secs)
      else:
        break
  return rates


def InitNetwork(input_pattern,
                model_spec,
                mode='eval',
                initial_learning_rate=0.00005,
                final_learning_rate=0.00005,
                halflife=1600000,
                optimizer_type='Adam',
                num_preprocess_threads=1,
                reader=None):
- """Constructs a python tensor flow model defined by model_spec.
- Args:
- input_pattern: File pattern of the data in tfrecords of Example.
- model_spec: Concatenation of input spec, model spec and output spec.
- See Build below for input/output spec. For model spec, see vgslspecs.py
- mode: One of 'train', 'eval'
- initial_learning_rate: Initial learning rate for the network.
- final_learning_rate: Final learning rate for the network.
- halflife: Number of steps over which to halve the difference between
- initial and final learning rate for the network.
- optimizer_type: One of 'GradientDescent', 'AdaGrad', 'Momentum', 'Adam'.
- num_preprocess_threads: Number of threads to use for image processing.
- reader: Function that returns an actual reader to read Examples from input
- files. If None, uses tf.TFRecordReader().
- Eval tasks need only specify input_pattern and model_spec.
- Returns:
- A VGSLImageModel class.
- Raises:
- ValueError: if the model spec syntax is incorrect.
- """
  model = VGSLImageModel(mode, model_spec, initial_learning_rate,
                         final_learning_rate, halflife)
  left_bracket = model_spec.find('[')
  right_bracket = model_spec.rfind(']')
  if left_bracket < 0 or right_bracket < 0:
    raise ValueError('Failed to find [] in model spec! ' + model_spec)
  input_spec = model_spec[:left_bracket]
  layer_spec = model_spec[left_bracket:right_bracket + 1]
  output_spec = model_spec[right_bracket + 1:]
  model.Build(input_pattern, input_spec, layer_spec, output_spec,
              optimizer_type, num_preprocess_threads, reader)
  return model
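

# Bracket-split example (a sketch; the spec string is hypothetical): for
#   model_spec = '8,0,0,1[Ct5,5,16 Mp3,3 Lfx64]O1c105'
# InitNetwork above yields input_spec '8,0,0,1', layer_spec
# '[Ct5,5,16 Mp3,3 Lfx64]' and output_spec 'O1c105'.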


class VGSLImageModel(object):
  """Class that builds a TensorFlow model for training or evaluation."""

  def __init__(self, mode, model_spec, initial_learning_rate,
               final_learning_rate, halflife):
    """Constructs a VGSLImageModel.

    Args:
      mode: One of "train", "eval".
      model_spec: Full model specification string, for reference only.
      initial_learning_rate: Initial learning rate for the network.
      final_learning_rate: Final learning rate for the network.
      halflife: Number of steps over which to halve the difference between
        initial and final learning rate for the network.
    """
    # The string that was used to build this model.
    self.model_spec = model_spec
    # The layers between input and output.
    self.layers = None
    # The train/eval mode.
    self.mode = mode
    # The learning-rate decay schedule.
    self.initial_learning_rate = initial_learning_rate
    self.final_learning_rate = final_learning_rate
    self.decay_steps = halflife / DECAY_STEPS_FACTOR
    self.decay_rate = DECAY_RATE
    # Tensors for the labels.
    self.labels = None
    self.sparse_labels = None
    # Debug data containing the truth text.
    self.truths = None
    # Tensor for the loss.
    self.loss = None
    # Train operation.
    self.train_op = None
    # Tensor for the global step counter.
    self.global_step = None
    # Tensor for the output predictions (usually softmax).
    self.output = None
    # True if we are using CTC training mode.
    self.using_ctc = False
    # Saver object to load or restore the variables.
    self.saver = None
  def Build(self, input_pattern, input_spec, model_spec, output_spec,
            optimizer_type, num_preprocess_threads, reader):
    """Builds the model from the separate input/layers/output spec strings.

    Args:
      input_pattern: File pattern of the data in tfrecords of TF Example
        format.
      input_spec: Specification of the input layer:
        batchsize,height,width,depth (4 comma-separated integers)
        Training will run with batches of batchsize images, but runtime can
        use any batch size.
        height and/or width can be 0 or -1, indicating variable size,
        otherwise all images must be the given size.
        depth must be 1 or 3 to indicate greyscale or color.
        NOTE: 1-d image input, treating the y image dimension as depth, can
        be achieved using S1(1x0)1,3 as the first op in the model_spec, but
        the y-size of the input must then be fixed.
      model_spec: Model definition. See vgslspecs.py.
      output_spec: Output layer definition:
        O(2|1|0)(l|s|c)n output layer with n classes.
          2 (heatmap) Output is a 2-d vector map of the input (possibly at
            a different scale).
          1 (sequence) Output is a 1-d sequence of vector values.
          0 (value) Output is a 0-d single vector value.
          l uses a logistic non-linearity on the output, allowing multiple
            hot elements in any output vector value.
          s uses a softmax non-linearity, with one-hot output in each value.
          c uses a softmax with CTC. Can only be used with 1 (sequence).
        NOTE: Only O1s and O1c are currently supported.
      optimizer_type: One of 'GradientDescent', 'AdaGrad', 'Momentum', 'Adam'.
      num_preprocess_threads: Number of threads to use for image processing.
      reader: Function that returns an actual reader to read Examples from
        input files. If None, uses tf.TFRecordReader().
    """
    self.global_step = tf.Variable(0, name='global_step', trainable=False)
    shape = _ParseInputSpec(input_spec)
    out_dims, out_func, num_classes = _ParseOutputSpec(output_spec)
    self.using_ctc = out_func == 'c'
    images, heights, widths, labels, sparse, _ = vgsl_input.ImageInput(
        input_pattern, num_preprocess_threads, shape, self.using_ctc, reader)
    self.labels = labels
    self.sparse_labels = sparse
    self.layers = vgslspecs.VGSLSpecs(widths, heights, self.mode == 'train')
    last_layer = self.layers.Build(images, model_spec)
    self._AddOutputs(last_layer, out_dims, out_func, num_classes)
    if self.mode == 'train':
      self._AddOptimizer(optimizer_type)

    # For saving the model across training and evaluation.
    self.saver = tf.train.Saver()
  def TrainAStep(self, sess):
    """Runs a training step in the session.

    Args:
      sess: Session in which to train the model.

    Returns:
      loss, global_step.
    """
    _, loss, step = sess.run([self.train_op, self.loss, self.global_step])
    return loss, step

  def Restore(self, checkpoint_path, sess):
    """Restores the model from the given checkpoint path into the session.

    Args:
      checkpoint_path: File pathname of the checkpoint.
      sess: Session in which to restore the model.

    Returns:
      global_step of the model.
    """
    self.saver.restore(sess, checkpoint_path)
    return tf.train.global_step(sess, self.global_step)

  def RunAStep(self, sess):
    """Runs a step for eval in the session.

    Args:
      sess: Session in which to run the model.

    Returns:
      output tensor result, labels tensor result.
    """
    return sess.run([self.output, self.labels])
  def _AddOutputs(self, prev_layer, out_dims, out_func, num_classes):
    """Adds the output layer and loss function.

    Args:
      prev_layer: Output of last layer of main network.
      out_dims: Number of output dimensions, 0, 1 or 2.
      out_func: Output non-linearity. 's' or 'c'=softmax, 'l'=logistic.
      num_classes: Number of outputs/size of last output dimension.
    """
    height_in = shapes.tensor_dim(prev_layer, dim=1)
    logits, outputs = self._AddOutputLayer(prev_layer, out_dims, out_func,
                                           num_classes)
    if self.mode == 'train':
      # Setup loss for training.
      self.loss = self._AddLossFunction(logits, height_in, out_dims, out_func)
      tf.summary.scalar('loss', self.loss)
    elif out_dims == 0:
      # Be sure the labels match the output, even in eval mode.
      self.labels = tf.slice(self.labels, [0, 0], [-1, 1])
      self.labels = tf.reshape(self.labels, [-1])
    logging.info('Final output=%s', outputs)
    logging.info('Labels tensor=%s', self.labels)
    self.output = outputs
  def _AddOutputLayer(self, prev_layer, out_dims, out_func, num_classes):
    """Adds the fully-connected logits and SoftMax/Logistic output layer.

    Args:
      prev_layer: Output of last layer of main network.
      out_dims: Number of output dimensions, 0, 1 or 2.
      out_func: Output non-linearity. 's' or 'c'=softmax, 'l'=logistic.
      num_classes: Number of outputs/size of last output dimension.

    Returns:
      logits: Pre-softmax/logistic fully-connected output shaped to out_dims.
      outputs: Post-softmax/logistic shaped to out_dims.

    Raises:
      ValueError: if syntax is incorrect.
    """
    # Reduce dimensionality appropriate to the output dimensions.
    batch_in = shapes.tensor_dim(prev_layer, dim=0)
    height_in = shapes.tensor_dim(prev_layer, dim=1)
    width_in = shapes.tensor_dim(prev_layer, dim=2)
    depth_in = shapes.tensor_dim(prev_layer, dim=3)
    if out_dims:
      # Combine any remaining height and width with batch and unpack after.
      shaped = tf.reshape(prev_layer, [-1, depth_in])
    else:
      # Everything except batch goes to depth, and therefore has to be known.
      shaped = tf.reshape(prev_layer, [-1, height_in * width_in * depth_in])
    logits = slim.fully_connected(shaped, num_classes, activation_fn=None)
    if out_func == 'l':
      raise ValueError('Logistic not yet supported!')
    else:
      output = tf.nn.softmax(logits)
    # Reshape to the desired output.
    if out_dims == 2:
      output_shape = [batch_in, height_in, width_in, num_classes]
    elif out_dims == 1:
      output_shape = [batch_in, height_in * width_in, num_classes]
    else:
      output_shape = [batch_in, num_classes]
    output = tf.reshape(output, output_shape, name='Output')
    logits = tf.reshape(logits, output_shape)
    return logits, output
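
  # Shape walk-through of _AddOutputLayer above (a sketch): for a 1-d
  # sequence output (out_dims=1) with prev_layer of shape [b, h, w, d],
  # the first reshape collapses it to [b*h*w, d], the fully-connected
  # layer maps d to num_classes, and the final reshapes restore
  # [b, h*w, num_classes] for both output and logits.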
  def _AddLossFunction(self, logits, height_in, out_dims, out_func):
    """Adds the appropriate loss function.

    Args:
      logits: Pre-softmax/logistic fully-connected output shaped to out_dims.
      height_in: Height of logits before going into the softmax layer.
      out_dims: Number of output dimensions, 0, 1 or 2.
      out_func: Output non-linearity. 's' or 'c'=softmax, 'l'=logistic.

    Returns:
      loss: That which is to be minimized.

    Raises:
      ValueError: if logistic is used.
    """
    if out_func == 'c':
      # Transpose batch to the middle.
      ctc_input = tf.transpose(logits, [1, 0, 2])
      # Compute the widths of each batch element from the input widths.
      widths = self.layers.GetLengths(dim=2, factor=height_in)
      cross_entropy = tf.nn.ctc_loss(ctc_input, self.sparse_labels, widths)
    elif out_func == 's':
      if out_dims == 2:
        self.labels = _PadLabels3d(logits, self.labels)
      elif out_dims == 1:
        self.labels = _PadLabels2d(
            shapes.tensor_dim(logits, dim=1), self.labels)
      else:
        self.labels = tf.slice(self.labels, [0, 0], [-1, 1])
        self.labels = tf.reshape(self.labels, [-1])
      cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
          logits=logits, labels=self.labels, name='xent')
    else:
      # TODO(rays) Labels need an extra dimension for logistic, so different
      # padding functions are needed, as well as a different loss function.
      raise ValueError('Logistic not yet supported!')
    return tf.reduce_sum(cross_entropy)
  def _AddOptimizer(self, optimizer_type):
    """Adds an optimizer with learning rate decay to minimize self.loss.

    Args:
      optimizer_type: One of 'GradientDescent', 'AdaGrad', 'Momentum', 'Adam'.

    Raises:
      ValueError: if the optimizer type is unrecognized.
    """
    learn_rate_delta = self.initial_learning_rate - self.final_learning_rate
    learn_rate_dec = tf.add(
        tf.train.exponential_decay(learn_rate_delta, self.global_step,
                                   self.decay_steps, self.decay_rate),
        self.final_learning_rate)
    if optimizer_type == 'GradientDescent':
      opt = tf.train.GradientDescentOptimizer(learn_rate_dec)
    elif optimizer_type == 'AdaGrad':
      opt = tf.train.AdagradOptimizer(learn_rate_dec)
    elif optimizer_type == 'Momentum':
      opt = tf.train.MomentumOptimizer(learn_rate_dec, momentum=0.9)
    elif optimizer_type == 'Adam':
      opt = tf.train.AdamOptimizer(learning_rate=learn_rate_dec)
    else:
      raise ValueError('Invalid optimizer type: ' + optimizer_type)
    tf.summary.scalar('learn_rate', learn_rate_dec)

    self.train_op = opt.minimize(
        self.loss, global_step=self.global_step, name='train')
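
  # The schedule built above, written out (a sketch of the math): with
  # delta = initial_learning_rate - final_learning_rate, the rate at
  # global step t is
  #   lr(t) = final_learning_rate + delta * DECAY_RATE**(t / decay_steps)
  # which decays smoothly toward the final rate, halving the remaining
  # delta every halflife steps.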


def _PadLabels3d(logits, labels):
  """Pads or slices 3-d labels to match logits.

  Covers the case of 2-d softmax output, when labels is [batch, height, width]
  and logits is [batch, height, width, onehot].

  Args:
    logits: 4-d Pre-softmax fully-connected output.
    labels: 3-d, but not necessarily matching in size.

  Returns:
    labels: Resized by padding or clipping to match logits.
  """
  logits_shape = shapes.tensor_shape(logits)
  labels_shape = shapes.tensor_shape(labels)
  labels = tf.reshape(labels, [-1, labels_shape[2]])
  labels = _PadLabels2d(logits_shape[2], labels)
  labels = tf.reshape(labels, [labels_shape[0], -1])
  labels = _PadLabels2d(logits_shape[1] * logits_shape[2], labels)
  return tf.reshape(labels,
                    [labels_shape[0], logits_shape[1], logits_shape[2]])


def _PadLabels2d(logits_size, labels):
  """Pads or slices the 2nd dimension of 2-d labels to match logits_size.

  Covers the case of 1-d softmax output, when labels is [batch, seq] and
  logits is [batch, seq, onehot].

  Args:
    logits_size: Tensor returned from tf.shape giving the target size.
    labels: 2-d, but not necessarily matching in size.

  Returns:
    labels: Resized by padding or clipping the last dimension to logits_size.
  """
  pad = logits_size - tf.shape(labels)[1]

  def _PadFn():
    return tf.pad(labels, [[0, 0], [0, pad]])

  def _SliceFn():
    return tf.slice(labels, [0, 0], [-1, logits_size])

  return tf.cond(tf.greater(pad, 0), _PadFn, _SliceFn)
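
# Pad/slice example for _PadLabels2d (a sketch): with logits_size=5 and
# labels of shape [2, 3], pad is 2 and _PadFn zero-pads to [2, 5]; with
# labels of shape [2, 7], pad is -2 and _SliceFn clips back to [2, 5].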


def _ParseInputSpec(input_spec):
  """Parses input_spec and returns the numbers obtained therefrom.

  Args:
    input_spec: Specification of the input layer. See Build.

  Returns:
    shape: ImageShape with the desired shape of the input.

  Raises:
    ValueError: if syntax is incorrect.
  """
  pattern = re.compile(r'(\d+),(-?\d+),(-?\d+),(\d+)')
  m = pattern.match(input_spec)
  if m is None:
    raise ValueError('Failed to parse input spec: ' + input_spec)
  batch_size = int(m.group(1))
  # Height and width of 0 or -1 mean variable size (see Build); they are
  # mapped to None.
  y_size = int(m.group(2)) if int(m.group(2)) > 0 else None
  x_size = int(m.group(3)) if int(m.group(3)) > 0 else None
  depth = int(m.group(4))
  if depth not in [1, 3]:
    raise ValueError('Depth must be 1 or 3, had: %d' % depth)
  return vgsl_input.ImageShape(batch_size, y_size, x_size, depth)
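
# Parse example: _ParseInputSpec('32,0,150,1') returns an ImageShape with
# batch_size 32, variable (None) height, width 150 and depth 1; sizes of
# 0 (or -1) are mapped to None to mark the dimension as variable.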


def _ParseOutputSpec(output_spec):
  """Parses the output spec.

  Args:
    output_spec: Output layer definition. See Build.

  Returns:
    out_dims: 2|1|0 for 2-d, 1-d, 0-d.
    out_func: l|s|c for logistic, softmax, softmax+CTC.
    num_classes: Number of classes in output.

  Raises:
    ValueError: if syntax is incorrect.
  """
  pattern = re.compile(r'(O)(0|1|2)(l|s|c)(\d+)')
  m = pattern.match(output_spec)
  if m is None:
    raise ValueError('Failed to parse output spec: ' + output_spec)
  out_dims = int(m.group(2))
  out_func = m.group(3)
  if out_func == 'c' and out_dims != 1:
    raise ValueError('CTC can only be used with a 1-D sequence!')
  num_classes = int(m.group(4))
  return out_dims, out_func, num_classes
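
# Parse example: _ParseOutputSpec('O1c105') returns (1, 'c', 105), i.e. a
# 1-d sequence output trained with CTC over 105 classes, while 'O2c105'
# raises ValueError because CTC requires a 1-d sequence.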


def _AddRateToSummary(tag, rate, step, sw):
  """Adds the given rate to the summary with the given tag.

  Args:
    tag: Name for this value.
    rate: Value to add to the summary. Perhaps an error rate.
    step: Global step of the graph for the x-coordinate of the summary.
    sw: Summary writer to which to write the rate value.
  """
  sw.add_summary(
      summary_pb2.Summary(value=[summary_pb2.Summary.Value(
          tag=tag, simple_value=rate)]), step)
|