# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Example / benchmark for building a PTB LSTM model.

Trains the model described in:
(Zaremba et al.) Recurrent Neural Network Regularization
http://arxiv.org/abs/1409.2329

There are 3 supported model configurations:
===========================================
| config | epochs | train | valid  | test
===========================================
| small  | 13     | 37.99 | 121.39 | 115.91
| medium | 39     | 48.45 |  86.16 |  82.07
| large  | 55     | 37.87 |  82.62 |  78.29
The exact results may vary depending on the random initialization.

The hyperparameters used in the model:
- init_scale - the initial scale of the weights
- learning_rate - the initial value of the learning rate
- max_grad_norm - the maximum permissible norm of the gradient
- num_layers - the number of LSTM layers
- num_steps - the number of unrolled steps of LSTM
- hidden_size - the number of LSTM units
- max_epoch - the number of epochs trained with the initial learning rate
- max_max_epoch - the total number of epochs for training
- keep_prob - the probability of keeping weights in the dropout layer
- lr_decay - the decay of the learning rate for each epoch after "max_epoch"
- batch_size - the batch size

The data required for this example is in the data/ dir of the
PTB dataset from Tomas Mikolov's webpage:

$ wget http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
$ tar xvf simple-examples.tgz

To run:

$ python ptb_word_lm.py --data_path=simple-examples/data/
"""
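
# Example invocations (the flags are defined below; --model selects one of the
# config classes in this file, and --save_path, shown here with an illustrative
# directory, enables checkpointing via the tf.train.Supervisor in main()):
#
#   $ python ptb_word_lm.py --data_path=simple-examples/data/ --model=medium
#   $ python ptb_word_lm.py --data_path=simple-examples/data/ \
#       --model=large --save_path=/tmp/ptb_large
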
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import time

import numpy as np
import tensorflow as tf

import reader


flags = tf.flags
logging = tf.logging

flags.DEFINE_string(
    "model", "small",
    "A type of model. Possible options are: small, medium, large.")
flags.DEFINE_string("data_path", None,
                    "Where the training/test data is stored.")
flags.DEFINE_string("save_path", None,
                    "Model output directory.")
flags.DEFINE_bool("use_fp16", False,
                  "Train using 16-bit floats instead of 32-bit floats")

FLAGS = flags.FLAGS


def data_type():
  return tf.float16 if FLAGS.use_fp16 else tf.float32


class PTBInput(object):
  """The input data."""

  def __init__(self, config, data, name=None):
    self.batch_size = batch_size = config.batch_size
    self.num_steps = num_steps = config.num_steps
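    # One epoch covers the data once: reader.ptb_producer() (see reader.py)
    # lays the word ids out as a [batch_size, len(data) // batch_size] matrix
    # and yields [batch_size, num_steps] slices of it, with targets shifted one
    # step to the right, hence the "- 1" and the division by num_steps below.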
    self.epoch_size = ((len(data) // batch_size) - 1) // num_steps
    self.input_data, self.targets = reader.ptb_producer(
        data, batch_size, num_steps, name=name)


class PTBModel(object):
  """The PTB model."""

  def __init__(self, is_training, config, input_):
    self._input = input_

    batch_size = input_.batch_size
    num_steps = input_.num_steps
    size = config.hidden_size
    vocab_size = config.vocab_size

    # Slightly better results can be obtained with forget gate biases
    # initialized to 1 but the hyperparameters of the model would need to be
    # different than reported in the paper.
    lstm_cell = tf.contrib.rnn.BasicLSTMCell(
        size, forget_bias=0.0, state_is_tuple=True)
    if is_training and config.keep_prob < 1:
      lstm_cell = tf.contrib.rnn.DropoutWrapper(
          lstm_cell, output_keep_prob=config.keep_prob)
    cell = tf.contrib.rnn.MultiRNNCell(
        [lstm_cell] * config.num_layers, state_is_tuple=True)

    self._initial_state = cell.zero_state(batch_size, data_type())

    with tf.device("/cpu:0"):
      embedding = tf.get_variable(
          "embedding", [vocab_size, size], dtype=data_type())
      inputs = tf.nn.embedding_lookup(embedding, input_.input_data)

    if is_training and config.keep_prob < 1:
      inputs = tf.nn.dropout(inputs, config.keep_prob)

    # Simplified version of models/tutorials/rnn/rnn.py's rnn().
    # This builds an unrolled LSTM for tutorial purposes only.
    # In general, use the rnn() or state_saving_rnn() from rnn.py.
    #
    # The alternative version of the code below is:
    #
    # inputs = tf.unstack(inputs, num=num_steps, axis=1)
    # outputs, state = tf.nn.rnn(cell, inputs,
    #                            initial_state=self._initial_state)
    outputs = []
    state = self._initial_state
    with tf.variable_scope("RNN"):
      for time_step in range(num_steps):
        if time_step > 0: tf.get_variable_scope().reuse_variables()
        (cell_output, state) = cell(inputs[:, time_step, :], state)
        outputs.append(cell_output)
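
    # outputs is a list of num_steps tensors of shape [batch_size, size];
    # concatenating along dimension 1 and reshaping gives a single
    # [batch_size * num_steps, size] matrix for the softmax layer below.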
    output = tf.reshape(tf.concat_v2(outputs, 1), [-1, size])
    softmax_w = tf.get_variable(
        "softmax_w", [size, vocab_size], dtype=data_type())
    softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=data_type())
    logits = tf.matmul(output, softmax_w) + softmax_b
    loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
        [logits],
        [tf.reshape(input_.targets, [-1])],
        [tf.ones([batch_size * num_steps], dtype=data_type())])
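    # loss has one cross-entropy term per (example, time step) pair (weighted
    # uniformly by the ones above), so cost below is the total cross-entropy
    # for the unrolled num_steps window, averaged over the batch.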
    self._cost = cost = tf.reduce_sum(loss) / batch_size
    self._final_state = state

    if not is_training:
      return

    self._lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                      config.max_grad_norm)
    optimizer = tf.train.GradientDescentOptimizer(self._lr)
    self._train_op = optimizer.apply_gradients(
        zip(grads, tvars),
        global_step=tf.contrib.framework.get_or_create_global_step())

    self._new_lr = tf.placeholder(
        tf.float32, shape=[], name="new_learning_rate")
    self._lr_update = tf.assign(self._lr, self._new_lr)
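
  # The learning rate lives in the graph as a non-trainable Variable;
  # assign_lr() feeds a new value through the _new_lr placeholder so main()
  # can anneal the rate between epochs without rebuilding the graph.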
  def assign_lr(self, session, lr_value):
    session.run(self._lr_update, feed_dict={self._new_lr: lr_value})

  @property
  def input(self):
    return self._input

  @property
  def initial_state(self):
    return self._initial_state

  @property
  def cost(self):
    return self._cost

  @property
  def final_state(self):
    return self._final_state

  @property
  def lr(self):
    return self._lr

  @property
  def train_op(self):
    return self._train_op


class SmallConfig(object):
  """Small config."""
  init_scale = 0.1
  learning_rate = 1.0
  max_grad_norm = 5
  num_layers = 2
  num_steps = 20
  hidden_size = 200
  max_epoch = 4
  max_max_epoch = 13
  keep_prob = 1.0
  lr_decay = 0.5
  batch_size = 20
  vocab_size = 10000


class MediumConfig(object):
  """Medium config."""
  init_scale = 0.05
  learning_rate = 1.0
  max_grad_norm = 5
  num_layers = 2
  num_steps = 35
  hidden_size = 650
  max_epoch = 6
  max_max_epoch = 39
  keep_prob = 0.5
  lr_decay = 0.8
  batch_size = 20
  vocab_size = 10000


class LargeConfig(object):
  """Large config."""
  init_scale = 0.04
  learning_rate = 1.0
  max_grad_norm = 10
  num_layers = 2
  num_steps = 35
  hidden_size = 1500
  max_epoch = 14
  max_max_epoch = 55
  keep_prob = 0.35
  lr_decay = 1 / 1.15
  batch_size = 20
  vocab_size = 10000


class TestConfig(object):
  """Tiny config, for testing."""
  init_scale = 0.1
  learning_rate = 1.0
  max_grad_norm = 1
  num_layers = 1
  num_steps = 2
  hidden_size = 2
  max_epoch = 1
  max_max_epoch = 1
  keep_prob = 1.0
  lr_decay = 0.5
  batch_size = 20
  vocab_size = 10000


def run_epoch(session, model, eval_op=None, verbose=False):
  """Runs the model on the given data."""
  start_time = time.time()
  costs = 0.0
  iters = 0
  state = session.run(model.initial_state)

  fetches = {
      "cost": model.cost,
      "final_state": model.final_state,
  }
  if eval_op is not None:
    fetches["eval_op"] = eval_op
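
  # Truncated backpropagation through time: each iteration runs the graph on
  # one [batch_size, num_steps] chunk and feeds the LSTM state it produced back
  # in as the initial state of the next chunk, so state is carried across the
  # whole epoch even though gradients only flow within a chunk.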
  for step in range(model.input.epoch_size):
    feed_dict = {}
    for i, (c, h) in enumerate(model.initial_state):
      feed_dict[c] = state[i].c
      feed_dict[h] = state[i].h

    vals = session.run(fetches, feed_dict)
    cost = vals["cost"]
    state = vals["final_state"]

    costs += cost
    iters += model.input.num_steps

    if verbose and step % (model.input.epoch_size // 10) == 10:
      print("%.3f perplexity: %.3f speed: %.0f wps" %
            (step * 1.0 / model.input.epoch_size, np.exp(costs / iters),
             iters * model.input.batch_size / (time.time() - start_time)))
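
  # costs / iters is the running average cross-entropy per word, so its exp()
  # is the word-level perplexity reported in the table at the top of the file.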
  return np.exp(costs / iters)


def get_config():
  if FLAGS.model == "small":
    return SmallConfig()
  elif FLAGS.model == "medium":
    return MediumConfig()
  elif FLAGS.model == "large":
    return LargeConfig()
  elif FLAGS.model == "test":
    return TestConfig()
  else:
    raise ValueError("Invalid model: %s" % FLAGS.model)


def main(_):
  if not FLAGS.data_path:
    raise ValueError("Must set --data_path to PTB data directory")

  raw_data = reader.ptb_raw_data(FLAGS.data_path)
  train_data, valid_data, test_data, _ = raw_data

  config = get_config()
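  # eval_config is used for the Test model below: with batch_size = 1 and
  # num_steps = 1 the test text is fed one word at a time, the LSTM state is
  # carried across the entire sequence, and no words are dropped at chunk
  # boundaries.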
  eval_config = get_config()
  eval_config.batch_size = 1
  eval_config.num_steps = 1

  with tf.Graph().as_default():
    initializer = tf.random_uniform_initializer(-config.init_scale,
                                                config.init_scale)

    with tf.name_scope("Train"):
      train_input = PTBInput(config=config, data=train_data, name="TrainInput")
      with tf.variable_scope("Model", reuse=None, initializer=initializer):
        m = PTBModel(is_training=True, config=config, input_=train_input)
      tf.contrib.deprecated.scalar_summary("Training Loss", m.cost)
      tf.contrib.deprecated.scalar_summary("Learning Rate", m.lr)

    with tf.name_scope("Valid"):
      valid_input = PTBInput(config=config, data=valid_data, name="ValidInput")
      with tf.variable_scope("Model", reuse=True, initializer=initializer):
        mvalid = PTBModel(is_training=False, config=config, input_=valid_input)
      tf.contrib.deprecated.scalar_summary("Validation Loss", mvalid.cost)

    with tf.name_scope("Test"):
      test_input = PTBInput(config=eval_config, data=test_data, name="TestInput")
      with tf.variable_scope("Model", reuse=True, initializer=initializer):
        mtest = PTBModel(is_training=False, config=eval_config,
                         input_=test_input)

    sv = tf.train.Supervisor(logdir=FLAGS.save_path)
    with sv.managed_session() as session:
      for i in range(config.max_max_epoch):
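        # Keep the initial learning rate for the first max_epoch epochs, then
        # decay it geometrically by a factor of lr_decay for every epoch after
        # that.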
        lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0)
        m.assign_lr(session, config.learning_rate * lr_decay)

        print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))
        train_perplexity = run_epoch(session, m, eval_op=m.train_op,
                                     verbose=True)
        print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity))
        valid_perplexity = run_epoch(session, mvalid)
        print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity))

      test_perplexity = run_epoch(session, mtest)
      print("Test Perplexity: %.3f" % test_perplexity)

      if FLAGS.save_path:
        print("Saving model to %s." % FLAGS.save_path)
        sv.saver.save(session, FLAGS.save_path, global_step=sv.global_step)


if __name__ == "__main__":
  tf.app.run()