translate.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323
  1. # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ==============================================================================
  15. """Binary for training translation models and decoding from them.
  16. Running this program without --decode will download the WMT corpus into
  17. the directory specified as --data_dir and tokenize it in a very basic way,
  18. and then start training a model saving checkpoints to --train_dir.
  19. Running with --decode starts an interactive loop so you can see how
  20. the current checkpoint translates English sentences into French.
  21. See the following papers for more information on neural translation models.
  22. * http://arxiv.org/abs/1409.3215
  23. * http://arxiv.org/abs/1409.0473
  24. * http://arxiv.org/abs/1412.2007
  25. """
  26. from __future__ import absolute_import
  27. from __future__ import division
  28. from __future__ import print_function
  29. import math
  30. import os
  31. import random
  32. import sys
  33. import time
  34. import logging
  35. import numpy as np
  36. from six.moves import xrange # pylint: disable=redefined-builtin
  37. import tensorflow as tf
  38. import data_utils
  39. import seq2seq_model
  40. tf.app.flags.DEFINE_float("learning_rate", 0.5, "Learning rate.")
  41. tf.app.flags.DEFINE_float("learning_rate_decay_factor", 0.99,
  42. "Learning rate decays by this much.")
  43. tf.app.flags.DEFINE_float("max_gradient_norm", 5.0,
  44. "Clip gradients to this norm.")
  45. tf.app.flags.DEFINE_integer("batch_size", 64,
  46. "Batch size to use during training.")
  47. tf.app.flags.DEFINE_integer("size", 1024, "Size of each model layer.")
  48. tf.app.flags.DEFINE_integer("num_layers", 3, "Number of layers in the model.")
  49. tf.app.flags.DEFINE_integer("from_vocab_size", 40000, "English vocabulary size.")
  50. tf.app.flags.DEFINE_integer("to_vocab_size", 40000, "French vocabulary size.")
  51. tf.app.flags.DEFINE_string("data_dir", "/tmp", "Data directory")
  52. tf.app.flags.DEFINE_string("train_dir", "/tmp", "Training directory.")
  53. tf.app.flags.DEFINE_string("from_train_data", None, "Training data.")
  54. tf.app.flags.DEFINE_string("to_train_data", None, "Training data.")
  55. tf.app.flags.DEFINE_string("from_dev_data", None, "Training data.")
  56. tf.app.flags.DEFINE_string("to_dev_data", None, "Training data.")
  57. tf.app.flags.DEFINE_integer("max_train_data_size", 0,
  58. "Limit on the size of training data (0: no limit).")
  59. tf.app.flags.DEFINE_integer("steps_per_checkpoint", 200,
  60. "How many training steps to do per checkpoint.")
  61. tf.app.flags.DEFINE_boolean("decode", False,
  62. "Set to True for interactive decoding.")
  63. tf.app.flags.DEFINE_boolean("self_test", False,
  64. "Run a self-test if this is set to True.")
  65. tf.app.flags.DEFINE_boolean("use_fp16", False,
  66. "Train using fp16 instead of fp32.")
  67. FLAGS = tf.app.flags.FLAGS
  68. # We use a number of buckets and pad to the closest one for efficiency.
  69. # See seq2seq_model.Seq2SeqModel for details of how they work.
  70. _buckets = [(5, 10), (10, 15), (20, 25), (40, 50)]
  71. def read_data(source_path, target_path, max_size=None):
  72. """Read data from source and target files and put into buckets.
  73. Args:
  74. source_path: path to the files with token-ids for the source language.
  75. target_path: path to the file with token-ids for the target language;
  76. it must be aligned with the source file: n-th line contains the desired
  77. output for n-th line from the source_path.
  78. max_size: maximum number of lines to read, all other will be ignored;
  79. if 0 or None, data files will be read completely (no limit).
  80. Returns:
  81. data_set: a list of length len(_buckets); data_set[n] contains a list of
  82. (source, target) pairs read from the provided data files that fit
  83. into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and
  84. len(target) < _buckets[n][1]; source and target are lists of token-ids.
  85. """
  86. data_set = [[] for _ in _buckets]
  87. with tf.gfile.GFile(source_path, mode="r") as source_file:
  88. with tf.gfile.GFile(target_path, mode="r") as target_file:
  89. source, target = source_file.readline(), target_file.readline()
  90. counter = 0
  91. while source and target and (not max_size or counter < max_size):
  92. counter += 1
  93. if counter % 100000 == 0:
  94. print(" reading data line %d" % counter)
  95. sys.stdout.flush()
  96. source_ids = [int(x) for x in source.split()]
  97. target_ids = [int(x) for x in target.split()]
  98. target_ids.append(data_utils.EOS_ID)
  99. for bucket_id, (source_size, target_size) in enumerate(_buckets):
  100. if len(source_ids) < source_size and len(target_ids) < target_size:
  101. data_set[bucket_id].append([source_ids, target_ids])
  102. break
  103. source, target = source_file.readline(), target_file.readline()
  104. return data_set
  105. def create_model(session, forward_only):
  106. """Create translation model and initialize or load parameters in session."""
  107. dtype = tf.float16 if FLAGS.use_fp16 else tf.float32
  108. model = seq2seq_model.Seq2SeqModel(
  109. FLAGS.from_vocab_size,
  110. FLAGS.to_vocab_size,
  111. _buckets,
  112. FLAGS.size,
  113. FLAGS.num_layers,
  114. FLAGS.max_gradient_norm,
  115. FLAGS.batch_size,
  116. FLAGS.learning_rate,
  117. FLAGS.learning_rate_decay_factor,
  118. forward_only=forward_only,
  119. dtype=dtype)
  120. ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
  121. if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
  122. print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
  123. model.saver.restore(session, ckpt.model_checkpoint_path)
  124. else:
  125. print("Created model with fresh parameters.")
  126. session.run(tf.global_variables_initializer())
  127. return model
  128. def train():
  129. """Train a en->fr translation model using WMT data."""
  130. from_train = None
  131. to_train = None
  132. from_dev = None
  133. to_dev = None
  134. if FLAGS.from_train_data and FLAGS.to_train_data:
  135. from_train_data = FLAGS.from_train_data
  136. to_train_data = FLAGS.to_train_data
  137. from_dev_data = from_train_data
  138. to_dev_data = to_train_data
  139. if FLAGS.from_dev_data and FLAGS.to_dev_data:
  140. from_dev_data = FLAGS.from_dev_data
  141. to_dev_data = FLAGS.to_dev_data
  142. from_train, to_train, from_dev, to_dev, _, _ = data_utils.prepare_data(
  143. FLAGS.data_dir,
  144. from_train_data,
  145. to_train_data,
  146. from_dev_data,
  147. to_dev_data,
  148. FLAGS.from_vocab_size,
  149. FLAGS.to_vocab_size)
  150. else:
  151. # Prepare WMT data.
  152. print("Preparing WMT data in %s" % FLAGS.data_dir)
  153. from_train, to_train, from_dev, to_dev, _, _ = data_utils.prepare_wmt_data(
  154. FLAGS.data_dir, FLAGS.from_vocab_size, FLAGS.to_vocab_size)
  155. with tf.Session() as sess:
  156. # Create model.
  157. print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
  158. model = create_model(sess, False)
  159. # Read data into buckets and compute their sizes.
  160. print ("Reading development and training data (limit: %d)."
  161. % FLAGS.max_train_data_size)
  162. dev_set = read_data(from_dev, to_dev)
  163. train_set = read_data(from_train, to_train, FLAGS.max_train_data_size)
  164. train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
  165. train_total_size = float(sum(train_bucket_sizes))
  166. # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use
  167. # to select a bucket. Length of [scale[i], scale[i+1]] is proportional to
  168. # the size if i-th training bucket, as used later.
  169. train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
  170. for i in xrange(len(train_bucket_sizes))]
  171. # This is the training loop.
  172. step_time, loss = 0.0, 0.0
  173. current_step = 0
  174. previous_losses = []
  175. while True:
  176. # Choose a bucket according to data distribution. We pick a random number
  177. # in [0, 1] and use the corresponding interval in train_buckets_scale.
  178. random_number_01 = np.random.random_sample()
  179. bucket_id = min([i for i in xrange(len(train_buckets_scale))
  180. if train_buckets_scale[i] > random_number_01])
  181. # Get a batch and make a step.
  182. start_time = time.time()
  183. encoder_inputs, decoder_inputs, target_weights = model.get_batch(
  184. train_set, bucket_id)
  185. _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
  186. target_weights, bucket_id, False)
  187. step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
  188. loss += step_loss / FLAGS.steps_per_checkpoint
  189. current_step += 1
  190. # Once in a while, we save checkpoint, print statistics, and run evals.
  191. if current_step % FLAGS.steps_per_checkpoint == 0:
  192. # Print statistics for the previous epoch.
  193. perplexity = math.exp(float(loss)) if loss < 300 else float("inf")
  194. print ("global step %d learning rate %.4f step-time %.2f perplexity "
  195. "%.2f" % (model.global_step.eval(), model.learning_rate.eval(),
  196. step_time, perplexity))
  197. # Decrease learning rate if no improvement was seen over last 3 times.
  198. if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
  199. sess.run(model.learning_rate_decay_op)
  200. previous_losses.append(loss)
  201. # Save checkpoint and zero timer and loss.
  202. checkpoint_path = os.path.join(FLAGS.train_dir, "translate.ckpt")
  203. model.saver.save(sess, checkpoint_path, global_step=model.global_step)
  204. step_time, loss = 0.0, 0.0
  205. # Run evals on development set and print their perplexity.
  206. for bucket_id in xrange(len(_buckets)):
  207. if len(dev_set[bucket_id]) == 0:
  208. print(" eval: empty bucket %d" % (bucket_id))
  209. continue
  210. encoder_inputs, decoder_inputs, target_weights = model.get_batch(
  211. dev_set, bucket_id)
  212. _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
  213. target_weights, bucket_id, True)
  214. eval_ppx = math.exp(float(eval_loss)) if eval_loss < 300 else float(
  215. "inf")
  216. print(" eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx))
  217. sys.stdout.flush()
  218. def decode():
  219. with tf.Session() as sess:
  220. # Create model and load parameters.
  221. model = create_model(sess, True)
  222. model.batch_size = 1 # We decode one sentence at a time.
  223. # Load vocabularies.
  224. en_vocab_path = os.path.join(FLAGS.data_dir,
  225. "vocab%d.from" % FLAGS.from_vocab_size)
  226. fr_vocab_path = os.path.join(FLAGS.data_dir,
  227. "vocab%d.to" % FLAGS.to_vocab_size)
  228. en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
  229. _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)
  230. # Decode from standard input.
  231. sys.stdout.write("> ")
  232. sys.stdout.flush()
  233. sentence = sys.stdin.readline()
  234. while sentence:
  235. # Get token-ids for the input sentence.
  236. token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), en_vocab)
  237. # Which bucket does it belong to?
  238. bucket_id = len(_buckets) - 1
  239. for i, bucket in enumerate(_buckets):
  240. if bucket[0] >= len(token_ids):
  241. bucket_id = i
  242. break
  243. else:
  244. logging.warning("Sentence truncated: %s", sentence)
  245. # Get a 1-element batch to feed the sentence to the model.
  246. encoder_inputs, decoder_inputs, target_weights = model.get_batch(
  247. {bucket_id: [(token_ids, [])]}, bucket_id)
  248. # Get output logits for the sentence.
  249. _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
  250. target_weights, bucket_id, True)
  251. # This is a greedy decoder - outputs are just argmaxes of output_logits.
  252. outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
  253. # If there is an EOS symbol in outputs, cut them at that point.
  254. if data_utils.EOS_ID in outputs:
  255. outputs = outputs[:outputs.index(data_utils.EOS_ID)]
  256. # Print out French sentence corresponding to outputs.
  257. print(" ".join([tf.compat.as_str(rev_fr_vocab[output]) for output in outputs]))
  258. print("> ", end="")
  259. sys.stdout.flush()
  260. sentence = sys.stdin.readline()
  261. def self_test():
  262. """Test the translation model."""
  263. with tf.Session() as sess:
  264. print("Self-test for neural translation model.")
  265. # Create model with vocabularies of 10, 2 small buckets, 2 layers of 32.
  266. model = seq2seq_model.Seq2SeqModel(10, 10, [(3, 3), (6, 6)], 32, 2,
  267. 5.0, 32, 0.3, 0.99, num_samples=8)
  268. sess.run(tf.global_variables_initializer())
  269. # Fake data set for both the (3, 3) and (6, 6) bucket.
  270. data_set = ([([1, 1], [2, 2]), ([3, 3], [4]), ([5], [6])],
  271. [([1, 1, 1, 1, 1], [2, 2, 2, 2, 2]), ([3, 3, 3], [5, 6])])
  272. for _ in xrange(5): # Train the fake model for 5 steps.
  273. bucket_id = random.choice([0, 1])
  274. encoder_inputs, decoder_inputs, target_weights = model.get_batch(
  275. data_set, bucket_id)
  276. model.step(sess, encoder_inputs, decoder_inputs, target_weights,
  277. bucket_id, False)
  278. def main(_):
  279. if FLAGS.self_test:
  280. self_test()
  281. elif FLAGS.decode:
  282. decode()
  283. else:
  284. train()
  285. if __name__ == "__main__":
  286. tf.app.run()