# Copyright 2015 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""The Neural GPU Model."""

import time

import tensorflow as tf

import data_utils
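

# conv_linear applies a single kw x kh convolution with "SAME" padding to the
# depth-wise concatenation of its `args` tensors, so `nin` must equal the total
# number of feature maps across all of `args`; an optional bias variable plus
# the constant `bias_start` offset is then added.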
def conv_linear(args, kw, kh, nin, nout, do_bias, bias_start, prefix):
  """Convolutional linear map."""
  assert args is not None
  if not isinstance(args, (list, tuple)):
    args = [args]
  with tf.variable_scope(prefix):
    k = tf.get_variable("CvK", [kw, kh, nin, nout])
    if len(args) == 1:
      res = tf.nn.conv2d(args[0], k, [1, 1, 1, 1], "SAME")
    else:
      res = tf.nn.conv2d(tf.concat(3, args), k, [1, 1, 1, 1], "SAME")
    if not do_bias: return res
    bias_term = tf.get_variable("CvB", [nout],
                                initializer=tf.constant_initializer(0.0))
    return res + bias_term + bias_start
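

# With cutoff > 1 the gates built from sigmoid_cutoff can saturate at exactly
# 0 or 1 instead of only approaching them: e.g. sigmoid_cutoff(x, 1.2) computes
# min(1, max(0, 1.2 * sigmoid(x) - 0.1)).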
def sigmoid_cutoff(x, cutoff):
  """Sigmoid with cutoff, e.g., 1.2 * sigmoid(x) - 0.1 clipped to [0, 1]."""
  y = tf.sigmoid(x)
  if cutoff < 1.01: return y
  d = (cutoff - 1.0) / 2.0
  return tf.minimum(1.0, tf.maximum(0.0, cutoff * y - d))


def tanh_cutoff(x, cutoff):
  """Tanh with cutoff, e.g., 1.1 * tanh(x) clipped to [-1, 1]."""
  y = tf.tanh(x)
  if cutoff < 1.01: return y
  d = (cutoff - 1.0) / 2.0
  return tf.minimum(1.0, tf.maximum(-1.0, (1.0 + d) * y))
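

# Convolutional GRU: the usual GRU update with matrix products replaced by
# kw x kh convolutions over the [batch, length, height, nmaps] state:
#   r = sigmoid_cutoff(conv([x, m]))   # reset gate (constant +1.0 bias offset)
#   g = sigmoid_cutoff(conv([x, m]))   # update gate (constant +1.0 bias offset)
#   c = tanh(conv([x, r * m]))         # candidate state
#   m_new = g * m + (1 - g) * c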
def conv_gru(inpts, mem, kw, kh, nmaps, cutoff, prefix):
  """Convolutional GRU."""
  def conv_lin(args, suffix, bias_start):
    return conv_linear(args, kw, kh, len(args) * nmaps, nmaps, True, bias_start,
                       prefix + "/" + suffix)
  reset = sigmoid_cutoff(conv_lin(inpts + [mem], "r", 1.0), cutoff)
  # candidate = tanh_cutoff(conv_lin(inpts + [reset * mem], "c", 0.0), cutoff)
  candidate = tf.tanh(conv_lin(inpts + [reset * mem], "c", 0.0))
  gate = sigmoid_cutoff(conv_lin(inpts + [mem], "g", 1.0), cutoff)
  return gate * mem + (1 - gate) * candidate
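

# Straight-through gradient for quantization: "CustomIdG" passes incoming
# gradients through unchanged, and quantize() remaps the gradient of Floor to
# it, so quantization acts as the identity in the backward pass.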
@tf.RegisterGradient("CustomIdG")
def _custom_id_grad(_, grads):
  return grads


def quantize(t, quant_scale, max_value=1.0):
  """Quantize a tensor t with each element in [-max_value, max_value]."""
  t = tf.minimum(max_value, tf.maximum(t, -max_value))
  big = quant_scale * (t + max_value) + 0.5
  with tf.get_default_graph().gradient_override_map({"Floor": "CustomIdG"}):
    res = (tf.floor(big) / quant_scale) - max_value
  return res


def quantize_weights_op(quant_scale, max_value):
  """Return an op that quantizes all trainable variables in place."""
  ops = [v.assign(quantize(v, quant_scale, float(max_value)))
         for v in tf.trainable_variables()]
  return tf.group(*ops)
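

# Relaxed parameter sharing: the recurrent weights are duplicated across
# variable scopes RX0 .. RX{rx_step-1} (see the "RX%d" scopes in NeuralGPU).
# relaxed_average looks up every copy of one variable and returns the mean copy
# together with the summed squared distance of the copies from it;
# relaxed_distance accumulates these distances over all RX variables (this is
# the relaxation term of the loss, scaled by the pull) and returns an op that
# overwrites each copy with the shared average.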
def relaxed_average(var_name_suffix, rx_step):
  """Calculate the average of relaxed variables having var_name_suffix."""
  relaxed_vars = []
  for l in xrange(rx_step):
    with tf.variable_scope("RX%d" % l, reuse=True):
      try:
        relaxed_vars.append(tf.get_variable(var_name_suffix))
      except ValueError:
        pass
  dsum = tf.add_n(relaxed_vars)
  avg = dsum / len(relaxed_vars)
  diff = [v - avg for v in relaxed_vars]
  davg = tf.add_n([d * d for d in diff])
  return avg, tf.reduce_sum(davg)


def relaxed_distance(rx_step):
  """Distance between relaxed variables and their average."""
  res, ops, rx_done = [], [], {}
  for v in tf.trainable_variables():
    if v.name[0:2] == "RX":
      rx_name = v.op.name[v.name.find("/") + 1:]
      if rx_name not in rx_done:
        avg, dist_loss = relaxed_average(rx_name, rx_step)
        res.append(dist_loss)
        rx_done[rx_name] = avg
      ops.append(v.assign(rx_done[rx_name]))
  return tf.add_n(res), tf.group(*ops)
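

# make_dense scatters a batch of int targets into a dense 1-hot matrix by
# flattening to indices target[b] + noclass * b; e.g. with noclass=4 and
# targets [2, 0, 1] the result is [[0,0,1,0], [1,0,0,0], [0,1,0,0]].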
def make_dense(targets, noclass):
  """Move a batch of targets to a dense 1-hot representation."""
  with tf.device("/cpu:0"):
    shape = tf.shape(targets)
    batch_size = shape[0]
    indices = targets + noclass * tf.range(0, batch_size)
    length = tf.expand_dims(batch_size * noclass, 0)
    dense = tf.sparse_to_dense(indices, length, 1.0, 0.0)
  return tf.reshape(dense, [-1, noclass])


def check_for_zero(sparse):
  """Return 1.0 where the int id is 0 and 0.0 elsewhere, per batch element."""
  with tf.device("/cpu:0"):
    shape = tf.shape(sparse)
    batch_size = shape[0]
    sparse = tf.minimum(sparse, 1)
    indices = sparse + 2 * tf.range(0, batch_size)
    dense = tf.sparse_to_dense(indices, tf.expand_dims(2 * batch_size, 0),
                               1.0, 0.0)
    reshaped = tf.reshape(dense, [-1, 2])
  return tf.reshape(tf.slice(reshaped, [0, 0], [-1, 1]), [-1])
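

# The Neural GPU proper. The input sequence is embedded into the first "row"
# of a [batch, length, height, nmaps] state (the other height - 1 rows start
# as zeros), the state is updated by `length` applications of a stack of
# nconvs convolutional GRUs, and the output logits are read off the first row
# with a 1x1 convolution. A separate graph is built for every length bin, all
# bins sharing the same variables.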
class NeuralGPU(object):
  """Neural GPU Model."""

  def __init__(self, nmaps, vec_size, niclass, noclass, dropout, rx_step,
               max_grad_norm, cutoff, nconvs, kw, kh, height, mode,
               learning_rate, pull, pull_incr, min_length, act_noise=0.0):
    # Feeds for parameters and ops to update them.
    self.global_step = tf.Variable(0, trainable=False)
    self.cur_length = tf.Variable(min_length, trainable=False)
    self.cur_length_incr_op = self.cur_length.assign_add(1)
    self.lr = tf.Variable(float(learning_rate), trainable=False)
    self.lr_decay_op = self.lr.assign(self.lr * 0.98)
    self.pull = tf.Variable(float(pull), trainable=False)
    self.pull_incr_op = self.pull.assign(self.pull * pull_incr)
    self.do_training = tf.placeholder(tf.float32, name="do_training")
    self.noise_param = tf.placeholder(tf.float32, name="noise_param")
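    # Note: cur_length drives the length curriculum (cur_length_incr_op bumps
    # it), lr_decay_op shrinks the learning rate by 2%, and pull_incr_op
    # multiplies the relaxation pull by pull_incr. do_training (1.0 or 0.0)
    # and noise_param are fed at run time so that dropout, activation noise
    # and gradient noise are only active during training.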

    # Feeds for inputs, targets, outputs, losses, etc.
    self.input = []
    self.target = []
    for l in xrange(data_utils.forward_max + 1):
      self.input.append(tf.placeholder(tf.int32, name="inp{0}".format(l)))
      self.target.append(tf.placeholder(tf.int32, name="tgt{0}".format(l)))
    self.outputs = []
    self.losses = []
    self.grad_norms = []
    self.updates = []

    # Computation.
    inp0_shape = tf.shape(self.input[0])
    batch_size = inp0_shape[0]
    with tf.device("/cpu:0"):
      emb_weights = tf.get_variable(
          "embedding", [niclass, vec_size],
          initializer=tf.random_uniform_initializer(-1.7, 1.7))
      e0 = tf.scatter_update(emb_weights,
                             tf.constant(0, dtype=tf.int32, shape=[1]),
                             tf.zeros([1, vec_size]))

    adam = tf.train.AdamOptimizer(self.lr, epsilon=1e-4)
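
    # One forward graph is built per length bin (plus forward_max); bins after
    # the first reuse the variables of the first, and backward/update ops are
    # only added for the bins selected by the `mode` condition below.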
    # Main graph creation loop, for every bin in data_utils.
    self.steps = []
    for length in sorted(list(set(data_utils.bins + [data_utils.forward_max]))):
      data_utils.print_out("Creating model for bin of length %d." % length)
      start_time = time.time()
      if length > data_utils.bins[0]:
        tf.get_variable_scope().reuse_variables()

      # Embed inputs and calculate mask.
      with tf.device("/cpu:0"):
        with tf.control_dependencies([e0]):
          embedded = [tf.nn.embedding_lookup(emb_weights, self.input[l])
                      for l in xrange(length)]

        # Mask to 0-out padding space in each step.
        imask = [check_for_zero(self.input[l]) for l in xrange(length)]
        omask = [check_for_zero(self.target[l]) for l in xrange(length)]
        mask = [1.0 - (imask[i] * omask[i]) for i in xrange(length)]
        mask = [tf.reshape(m, [-1, 1]) for m in mask]
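        # scales[i] ends up 1.0 only at the last non-padding position of each
        # example (assuming padding only appears as a suffix), so the add_n
        # over outputs below reads the state after exactly n processing steps
        # for an example of true length n.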
        # Use a shifted mask for step scaling and concatenated for weights.
        shifted_mask = mask + [tf.zeros_like(mask[0])]
        scales = [shifted_mask[i] * (1.0 - shifted_mask[i+1])
                  for i in xrange(length)]
        scales = [tf.reshape(s, [-1, 1, 1, 1]) for s in scales]
        mask = tf.concat(1, mask[0:length])  # batch x length
        weights = mask
        # Add a height dimension to mask to use later for masking.
        mask = tf.reshape(mask, [-1, length, 1, 1])
        mask = tf.concat(2, [mask for _ in xrange(height)]) + tf.zeros(
            tf.pack([batch_size, length, height, nmaps]), dtype=tf.float32)

      # Start is a length-list of batch-by-nmaps tensors, reshape and concat.
      start = [tf.tanh(embedded[l]) for l in xrange(length)]
      start = [tf.reshape(start[l], [-1, 1, nmaps]) for l in xrange(length)]
      start = tf.reshape(tf.concat(1, start), [-1, length, 1, nmaps])

      # First image comes from start by applying one convolution and adding 0s.
      first = conv_linear(start, 1, 1, vec_size, nmaps, True, 0.0, "input")
      first = [first] + [tf.zeros(tf.pack([batch_size, length, 1, nmaps]),
                                  dtype=tf.float32) for _ in xrange(height - 1)]
      first = tf.concat(2, first)

      # Computation steps.
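      # The per-step dropout keep probability scales with the bin length
      # (dropout * 8 / length), so longer unrollings do not accumulate more
      # total dropout, and dropout is disabled at inference time (do_training
      # is fed 0.0). Step `it` uses the parameter copy in variable scope
      # RX{it % rx_step}; copies are reused once it >= rx_step.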
      keep_prob = 1.0 - self.do_training * (dropout * 8.0 / float(length))
      step = [tf.nn.dropout(first, keep_prob) * mask]
      act_noise_scale = act_noise * self.do_training * self.pull
      outputs = []
      for it in xrange(length):
        with tf.variable_scope("RX%d" % (it % rx_step)) as vs:
          if it >= rx_step:
            vs.reuse_variables()
          cur = step[it]
          # Do nconvs-many CGRU steps.
          for layer in xrange(nconvs):
            cur = conv_gru([], cur, kw, kh, nmaps, cutoff, "cgru_%d" % layer)
            cur *= mask
          outputs.append(tf.slice(cur, [0, 0, 0, 0], [-1, -1, 1, -1]))
          cur = tf.nn.dropout(cur, keep_prob)
          if act_noise > 0.00001:
            cur += tf.truncated_normal(tf.shape(cur)) * act_noise_scale
          step.append(cur * mask)

      self.steps.append([tf.reshape(s, [-1, length, height * nmaps])
                         for s in step])
      # Output is the n-th step output; n = current length, as in scales.
      output = tf.add_n([outputs[i] * scales[i] for i in xrange(length)])
      # Final convolution to get logits, list outputs.
      output = conv_linear(output, 1, 1, nmaps, noclass, True, 0.0, "output")
      output = tf.reshape(output, [-1, length, noclass])
      external_output = [tf.reshape(o, [-1, noclass])
                         for o in list(tf.split(1, length, output))]
      external_output = [tf.nn.softmax(o) for o in external_output]
      self.outputs.append(external_output)

      # Calculate cross-entropy loss and normalize it.
      targets = tf.concat(1, [make_dense(self.target[l], noclass)
                              for l in xrange(length)])
      targets = tf.reshape(targets, [-1, noclass])
      xent = tf.reshape(tf.nn.softmax_cross_entropy_with_logits(
          tf.reshape(output, [-1, noclass]), targets), [-1, length])
      perp_loss = tf.reduce_sum(xent * weights)
      perp_loss /= tf.cast(batch_size, dtype=tf.float32)
      perp_loss /= length

      # Final loss: cross-entropy + shared parameter relaxation part.
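      # self.pull grows during training via pull_incr_op, so the penalty for
      # the RX parameter copies drifting apart gets stronger over time; running
      # self.avg_op overwrites every copy with their shared average.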
      relax_dist, self.avg_op = relaxed_distance(rx_step)
      total_loss = perp_loss + relax_dist * self.pull
      self.losses.append(perp_loss)

      # Gradients and Adam update operation.
      if length == data_utils.bins[0] or (mode == 0 and
                                          length < data_utils.bins[-1] + 1):
        data_utils.print_out("Creating backward for bin of length %d." % length)
        params = tf.trainable_variables()
        grads = tf.gradients(total_loss, params)
        grads, norm = tf.clip_by_global_norm(grads, max_grad_norm)
        self.grad_norms.append(norm)
        # Add gradient noise scaled by the noise_param feed.
        for i, grad in enumerate(grads):
          if isinstance(grad, tf.Tensor):
            grads[i] = grad + (tf.truncated_normal(tf.shape(grad)) *
                               self.noise_param)
        update = adam.apply_gradients(zip(grads, params),
                                      global_step=self.global_step)
        self.updates.append(update)
      data_utils.print_out("Created model for bin of length %d in"
                           " %.2f s." % (length, time.time() - start_time))
    self.saver = tf.train.Saver(tf.all_variables())
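
  # step() feeds one batch: `inp` and `target` are length-lists of int arrays,
  # one per sequence position. It returns (loss, per-position softmax outputs,
  # gradient norm or None when not training, and the intermediate states when
  # get_steps is True).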
  def step(self, sess, inp, target, do_backward, noise_param=None,
           get_steps=False):
    """Run a step of the network."""
    assert len(inp) == len(target)
    length = len(target)
    feed_in = {}
    feed_in[self.noise_param.name] = noise_param if noise_param else 0.0
    feed_in[self.do_training.name] = 1.0 if do_backward else 0.0
    feed_out = []
    index = len(data_utils.bins)
    if length < data_utils.bins[-1] + 1:
      index = data_utils.bins.index(length)
    if do_backward:
      feed_out.append(self.updates[index])
      feed_out.append(self.grad_norms[index])
    feed_out.append(self.losses[index])
    for l in xrange(length):
      feed_in[self.input[l].name] = inp[l]
    for l in xrange(length):
      feed_in[self.target[l].name] = target[l]
      feed_out.append(self.outputs[index][l])
    if get_steps:
      for l in xrange(length + 1):
        feed_out.append(self.steps[index][l])
    res = sess.run(feed_out, feed_in)
    offset = 0
    norm = None
    if do_backward:
      offset = 2
      norm = res[1]
    outputs = res[offset + 1:offset + 1 + length]
    steps = res[offset + 1 + length:] if get_steps else None
    return res[offset], outputs, norm, steps