Browse Source

Add pre-trained lm_1b model.

Xin Pan 9 years ago
parent
commit
2a5a559689
4 changed files with 804 additions and 0 deletions
  1. lm_1b/BUILD (+27, −0)
  2. lm_1b/README.md (+191, −0)
  3. lm_1b/data_utils.py (+279, −0)
  4. lm_1b/lm_1b_eval.py (+307, −0)

+ 27 - 0
lm_1b/BUILD

@@ -0,0 +1,27 @@
+package(default_visibility = [":internal"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+package_group(
+    name = "internal",
+    packages = [
+        "//lm_1b/...",
+    ],
+)
+
+py_library(
+    name = "data_utils",
+    srcs = ["data_utils.py"],
+)
+
+py_binary(
+    name = "lm_1b_eval",
+    srcs = [
+        "lm_1b_eval.py",
+    ],
+    deps = [
+        ":data_utils",
+    ],
+)

+ 191 - 0
lm_1b/README.md

@@ -0,0 +1,191 @@
+<font size=4><b>Language Model on One Billion Word Benchmark</b></font>
+
+<b>Authors:</b>
+
+Oriol Vinyals (vinyals@google.com, github: OriolVinyals),
+Xin Pan (xpan@google.com, github: panyx0718)
+
+<b>Paper Authors:</b>
+
+Rafal Jozefowicz, Oriol Vinyals, Mike Schuster, Noam Shazeer, Yonghui Wu
+
+<b>TL;DR</b>
+
+This is a pre-trained model on the One Billion Word Benchmark.
+If you use this model in your publication, please cite the original paper:
+
+@article{jozefowicz2016exploring,
+  title={Exploring the Limits of Language Modeling},
+  author={Jozefowicz, Rafal and Vinyals, Oriol and Schuster, Mike
+          and Shazeer, Noam and Wu, Yonghui},
+  journal={arXiv preprint arXiv:1602.02410},
+  year={2016}
+}
+
+<b>Introduction</b>
+
+In this release, we open source a model trained on the One Billion Word
+Benchmark (http://arxiv.org/abs/1312.3005), a large language corpus in English
+which was released in 2013. This dataset contains about one billion words, and
+has a vocabulary size of about 800K words. It contains mostly news data. Since
+sentences in the training set are shuffled, models can ignore the context and
+focus on sentence-level language modeling.
+
+In the original release and subsequent work, models trained on this dataset
+have been evaluated on the same test set, making it a standard benchmark for
+language modeling.
+Recently, we wrote an article (http://arxiv.org/abs/1602.02410) describing a
+hybrid model that combines a character-level CNN, a large and deep LSTM, and a
+specific Softmax architecture. This allowed us to train the best model on this
+dataset thus far, almost halving the best perplexity previously obtained by
+others.
+
+<b>Code Release</b>
+
+The open-sourced components include:
+
+* TensorFlow GraphDef proto buffer text file.
+* TensorFlow pre-trained checkpoint shards.
+* Code used to evaluate the pre-trained model.
+* Vocabulary file.
+* Test set from LM-1B evaluation.
+
+The code supports 4 evaluation modes:
+
+* Given a dataset, calculate the model's perplexity.
+* Given a prefix sentence, predict the next words.
+* Dump the softmax embedding and the character-level CNN word embeddings.
+* Given a sentence, dump the embedding from the LSTM state.
+
+<b>Results</b>
+
+Model | Test Perplexity | Number of Params [billions]
+------|-----------------|----------------------------
+Sigmoid-RNN-2048 [Blackout] | 68.3 | 4.1
+Interpolated KN 5-gram, 1.1B n-grams [chelba2013one] | 67.6 | 1.76
+Sparse Non-Negative Matrix LM [shazeer2015sparse] | 52.9 | 33
+RNN-1024 + MaxEnt 9-gram features [chelba2013one] | 51.3 | 20
+LSTM-512-512 | 54.1 | 0.82
+LSTM-1024-512 | 48.2 | 0.82
+LSTM-2048-512 | 43.7 | 0.83
+LSTM-8192-2048 (No Dropout) | 37.9 | 3.3
+LSTM-8192-2048 (50% Dropout) | 32.2 | 3.3
+2-Layer LSTM-8192-1024 (BIG LSTM) | 30.6 | 1.8
+(THIS RELEASE) BIG LSTM+CNN Inputs | <b>30.0</b> | <b>1.04</b>
+
+<b>How To Run</b>
+
+Prerequisites:
+
+* Install TensorFlow.
+* Install Bazel.
+* Download the data files:
+  * Model GraphDef file:
+  [link](download.tensorflow.org/models/LM_LSTM_CNN/graph-2016-09-10.pbtxt)
+  * Model Checkpoint sharded file:
+  [1](download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-base)
+  [2](download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-char-embedding)
+  [3](download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-lstm)
+  [4](download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-softmax0)
+  [5](download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-softmax1)
+  [6](download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-softmax2)
+  [7](download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-softmax3)
+  [8](download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-softmax4)
+  [9](download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-softmax5)
+  [10](download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-softmax6)
+  [11](download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-softmax7)
+  [12](download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-softmax8)
+  * Vocabulary file:
+  [link](download.tensorflow.org/models/LM_LSTM_CNN/vocab-2016-09-10.txt)
+  * Test dataset:
+  [link](download.tensorflow.org/models/LM_LSTM_CNN/test/news.en.heldout-00000-of-00050)
+* It is recommended to run this on a modern desktop PC rather than on a laptop.
+
+```shell
+# 1. Clone the code to your workspace.
+# 2. Download the data to your workspace.
+# 3. Create an empty WORKSPACE file in your workspace.
+# 4. Create an empty output directory in your workspace.
+# Example directory structure below:
+ls -R
+.:
+data  lm_1b  output  WORKSPACE
+
+./data:
+ckpt  eval_2_8k_1k_1_1_char.pbtxt  news.en.heldout-00000-of-00050  vocab.txt
+
+./lm_1b:
+BUILD  data_utils.py  data_utils.pyc  lm_1b_eval.py  README.md
+
+./output:
+
+# Build the code.
+bazel build -c opt lm_1b/...
+# Run sample mode:
+bazel-bin/lm_1b/lm_1b_eval --mode sample \
+                           --prefix "I love that I" \
+                           --pbtxt data/eval_2_8k_1k_1_1_char.pbtxt \
+                           --vocab_file data/vocab.txt  \
+                           --ckpt data/ckpt
+...(omitted some TensorFlow output)
+I love
+I love that
+I love that I
+I love that I find
+I love that I find that
+I love that I find that amazing
+...(omitted)
+
+# Run eval mode:
+bazel-bin/lm_1b/lm_1b_eval --mode eval \
+                           --pbtxt data/eval_2_8k_1k_1_1_char.pbtxt \
+                           --vocab_file data/vocab.txt  \
+                           --input_data data/news.en.heldout-00000-of-00050 \
+                           --ckpt data/ckpt
+...(omitted some TensorFlow output)
+Loaded step 14108582.
+# Perplexity is high initially because words without context are harder to
+# predict.
+Eval Step: 0, Average Perplexity: 2045.512297.
+Eval Step: 1, Average Perplexity: 229.478699.
+Eval Step: 2, Average Perplexity: 208.116787.
+Eval Step: 3, Average Perplexity: 338.870601.
+Eval Step: 4, Average Perplexity: 228.950107.
+Eval Step: 5, Average Perplexity: 197.685857.
+Eval Step: 6, Average Perplexity: 156.287063.
+Eval Step: 7, Average Perplexity: 124.866189.
+Eval Step: 8, Average Perplexity: 147.204975.
+Eval Step: 9, Average Perplexity: 90.124864.
+Eval Step: 10, Average Perplexity: 59.897914.
+Eval Step: 11, Average Perplexity: 42.591137.
+...(omitted)
+Eval Step: 4529, Average Perplexity: 29.243668.
+Eval Step: 4530, Average Perplexity: 29.302362.
+Eval Step: 4531, Average Perplexity: 29.285674.
+...(omitted. At convergence, it should be around 30.)
+
+# Run dump_emb mode:
+bazel-bin/lm_1b/lm_1b_eval --mode dump_emb \
+                           --pbtxt data/eval_2_8k_1k_1_1_char.pbtxt \
+                           --vocab_file data/vocab.txt  \
+                           --ckpt data/ckpt \
+                           --save_dir output
+...(omitted some TensorFlow output)
+Finished softmax weights
+Finished word embedding 0/793471
+Finished word embedding 1/793471
+Finished word embedding 2/793471
+...(omitted)
+ls output/
+embeddings_softmax.npy ...
+
+# Run dump_lstm_emb mode:
+bazel-bin/lm_1b/lm_1b_eval --mode dump_lstm_emb \
+                           --pbtxt data/eval_2_8k_1k_1_1_char.pbtxt \
+                           --vocab_file data/vocab.txt \
+                           --ckpt data/ckpt \
+                           --sentence "I love who I am ." \
+                           --save_dir output
+ls output/
+lstm_emb_step_0.npy  lstm_emb_step_2.npy  lstm_emb_step_4.npy
+lstm_emb_step_6.npy  lstm_emb_step_1.npy  lstm_emb_step_3.npy
+lstm_emb_step_5.npy
+```
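+
+As a minimal sketch of consuming the dumped embeddings (assuming the `dump_emb`
+run above completed and that NumPy is available), the arrays can be loaded back
+and indexed in vocabulary order; `'love'` here is just an example word:
+
+```python
+import numpy as np
+
+# Character-CNN word embeddings written by dump_emb mode.
+embs = np.load('output/embeddings_char_cnn.npy')
+
+# Map words to row indices using the vocabulary file. (Sketch only: for the
+# exact mapping, including special-token handling, reuse data_utils.Vocabulary.)
+with open('data/vocab.txt') as f:
+  word_to_id = {line.strip(): i for i, line in enumerate(f)}
+
+print(embs[word_to_id['love']].shape)  # (1024,) for this released model.
+```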

+ 279 - 0
lm_1b/data_utils.py

@@ -0,0 +1,279 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""A library for loading 1B word benchmark dataset."""
+
+import random
+
+import numpy as np
+import tensorflow as tf
+
+
+class Vocabulary(object):
+  """Class that holds a vocabulary for the dataset."""
+
+  def __init__(self, filename):
+    """Initialize vocabulary.
+
+    Args:
+      filename: Vocabulary file name.
+    """
+
+    self._id_to_word = []
+    self._word_to_id = {}
+    self._unk = -1
+    self._bos = -1
+    self._eos = -1
+
+    with tf.gfile.Open(filename) as f:
+      idx = 0
+      for line in f:
+        word_name = line.strip()
+        if word_name == '<S>':
+          self._bos = idx
+        elif word_name == '</S>':
+          self._eos = idx
+        elif word_name == '<UNK>':
+          self._unk = idx
+        if word_name == '!!!MAXTERMID':
+          continue
+
+        self._id_to_word.append(word_name)
+        self._word_to_id[word_name] = idx
+        idx += 1
+
+  @property
+  def bos(self):
+    return self._bos
+
+  @property
+  def eos(self):
+    return self._eos
+
+  @property
+  def unk(self):
+    return self._unk
+
+  @property
+  def size(self):
+    return len(self._id_to_word)
+
+  def word_to_id(self, word):
+    if word in self._word_to_id:
+      return self._word_to_id[word]
+    return self.unk
+
+  def id_to_word(self, cur_id):
+    if cur_id < self.size:
+      return self._id_to_word[cur_id]
+    return 'ERROR'
+
+  def decode(self, cur_ids):
+    """Convert a list of ids to a sentence, with space inserted."""
+    return ' '.join([self.id_to_word(cur_id) for cur_id in cur_ids])
+
+  def encode(self, sentence):
+    """Convert a sentence to a list of ids, with special tokens added."""
+    word_ids = [self.word_to_id(cur_word) for cur_word in sentence.split()]
+    return np.array([self.bos] + word_ids + [self.eos], dtype=np.int32)
+
+
+class CharsVocabulary(Vocabulary):
+  """Vocabulary containing character-level information."""
+
+  def __init__(self, filename, max_word_length):
+    super(CharsVocabulary, self).__init__(filename)
+    self._max_word_length = max_word_length
+    chars_set = set()
+
+    for word in self._id_to_word:
+      chars_set |= set(word)
+
+    free_ids = []
+    for i in range(256):
+      if chr(i) in chars_set:
+        continue
+      free_ids.append(chr(i))
+
+    if len(free_ids) < 5:
+      raise ValueError('Not enough free char ids: %d' % len(free_ids))
+
+    self.bos_char = free_ids[0]  # <begin sentence>
+    self.eos_char = free_ids[1]  # <end sentence>
+    self.bow_char = free_ids[2]  # <begin word>
+    self.eow_char = free_ids[3]  # <end word>
+    self.pad_char = free_ids[4]  # <padding>
+
+    chars_set |= {self.bos_char, self.eos_char, self.bow_char, self.eow_char,
+                  self.pad_char}
+
+    self._char_set = chars_set
+    num_words = len(self._id_to_word)
+
+    self._word_char_ids = np.zeros([num_words, max_word_length], dtype=np.int32)
+
+    self.bos_chars = self._convert_word_to_char_ids(self.bos_char)
+    self.eos_chars = self._convert_word_to_char_ids(self.eos_char)
+
+    for i, word in enumerate(self._id_to_word):
+      self._word_char_ids[i] = self._convert_word_to_char_ids(word)
+
+  @property
+  def word_char_ids(self):
+    return self._word_char_ids
+
+  @property
+  def max_word_length(self):
+    return self._max_word_length
+
+  def _convert_word_to_char_ids(self, word):
+    code = np.zeros([self.max_word_length], dtype=np.int32)
+    code[:] = ord(self.pad_char)
+
+    if len(word) > self.max_word_length - 2:
+      word = word[:self.max_word_length-2]
+    cur_word = self.bow_char + word + self.eow_char
+    for j in range(len(cur_word)):
+      code[j] = ord(cur_word[j])
+    return code
+
+  def word_to_char_ids(self, word):
+    if word in self._word_to_id:
+      return self._word_char_ids[self._word_to_id[word]]
+    else:
+      return self._convert_word_to_char_ids(word)
+
+  def encode_chars(self, sentence):
+    chars_ids = [self.word_to_char_ids(cur_word)
+                 for cur_word in sentence.split()]
+    return np.vstack([self.bos_chars] + chars_ids + [self.eos_chars])
+
+
+def get_batch(generator, batch_size, num_steps, max_word_length, pad=False):
+  """Read batches of input."""
+  cur_stream = [None] * batch_size
+
+  inputs = np.zeros([batch_size, num_steps], np.int32)
+  char_inputs = np.zeros([batch_size, num_steps, max_word_length], np.int32)
+  global_word_ids = np.zeros([batch_size, num_steps], np.int32)
+  targets = np.zeros([batch_size, num_steps], np.int32)
+  weights = np.ones([batch_size, num_steps], np.float32)
+
+  no_more_data = False
+  while True:
+    inputs[:] = 0
+    char_inputs[:] = 0
+    global_word_ids[:] = 0
+    targets[:] = 0
+    weights[:] = 0.0
+
+    for i in range(batch_size):
+      cur_pos = 0
+
+      while cur_pos < num_steps:
+        if cur_stream[i] is None or len(cur_stream[i][0]) <= 1:
+          try:
+            cur_stream[i] = list(next(generator))
+          except StopIteration:
+            # No more data, exhaust current streams and quit
+            no_more_data = True
+            break
+
+        how_many = min(len(cur_stream[i][0]) - 1, num_steps - cur_pos)
+        next_pos = cur_pos + how_many
+
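+        # Targets are the inputs shifted one position (next-word prediction).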
+        inputs[i, cur_pos:next_pos] = cur_stream[i][0][:how_many]
+        char_inputs[i, cur_pos:next_pos] = cur_stream[i][1][:how_many]
+        global_word_ids[i, cur_pos:next_pos] = cur_stream[i][2][:how_many]
+        targets[i, cur_pos:next_pos] = cur_stream[i][0][1:how_many+1]
+        weights[i, cur_pos:next_pos] = 1.0
+
+        cur_pos = next_pos
+        cur_stream[i][0] = cur_stream[i][0][how_many:]
+        cur_stream[i][1] = cur_stream[i][1][how_many:]
+        cur_stream[i][2] = cur_stream[i][2][how_many:]
+
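+        # When pad=True, pack at most one sentence per row; the remaining
+        # positions keep zero weight.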
+        if pad:
+          break
+
+    if no_more_data and np.sum(weights) == 0:
+      # There is no more data and this is an empty batch. Done!
+      break
+    yield inputs, char_inputs, global_word_ids, targets, weights
+
+
+class LM1BDataset(object):
+  """Utility class for 1B word benchmark dataset.
+
+  The current implementation reads the data from the tokenized text files.
+  """
+
+  def __init__(self, filepattern, vocab):
+    """Initialize LM1BDataset reader.
+
+    Args:
+      filepattern: Dataset file pattern.
+      vocab: Vocabulary.
+    """
+    self._vocab = vocab
+    self._all_shards = tf.gfile.Glob(filepattern)
+    tf.logging.info('Found %d shards at %s', len(self._all_shards), filepattern)
+
+  def _load_random_shard(self):
+    """Randomly select a file and read it."""
+    return self._load_shard(random.choice(self._all_shards))
+
+  def _load_shard(self, shard_name):
+    """Read one file and convert to ids.
+
+    Args:
+      shard_name: file path.
+
+    Returns:
+      list of (id, char_id, global_word_id) tuples.
+    """
+    tf.logging.info('Loading data from: %s', shard_name)
+    with tf.gfile.Open(shard_name) as f:
+      sentences = f.readlines()
+    chars_ids = [self.vocab.encode_chars(sentence) for sentence in sentences]
+    ids = [self.vocab.encode(sentence) for sentence in sentences]
+
+    global_word_ids = []
+    current_idx = 0
+    for word_ids in ids:
+      current_size = len(word_ids) - 1  # without <BOS> symbol
+      cur_ids = np.arange(current_idx, current_idx + current_size)
+      global_word_ids.append(cur_ids)
+      current_idx += current_size
+
+    tf.logging.info('Loaded %d words.', current_idx)
+    tf.logging.info('Finished loading')
+    return zip(ids, chars_ids, global_word_ids)
+
+  def _get_sentence(self, forever=True):
+    while True:
+      ids = self._load_random_shard()
+      for current_ids in ids:
+        yield current_ids
+      if not forever:
+        break
+
+  def get_batch(self, batch_size, num_steps, pad=False, forever=True):
+    return get_batch(self._get_sentence(forever), batch_size, num_steps,
+                     self.vocab.max_word_length, pad=pad)
+
+  @property
+  def vocab(self):
+    return self._vocab

+ 307 - 0
lm_1b/lm_1b_eval.py

@@ -0,0 +1,307 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Eval pre-trained 1 billion word language model.
+"""
+import os
+import sys
+
+import numpy as np
+import tensorflow as tf
+
+from google.protobuf import text_format
+import data_utils
+
+FLAGS = tf.flags.FLAGS
+# General flags.
+tf.flags.DEFINE_string('mode', 'eval',
+                       'One of [sample, eval, dump_emb, dump_lstm_emb]. '
+                       '"sample" mode samples future word predictions, using '
+                       'FLAGS.prefix as prefix (prefix could be left empty). '
+                       '"eval" mode calculates perplexity of the '
+                       'FLAGS.input_data. '
+                       '"dump_emb" mode dumps word and softmax embeddings to '
+                       'FLAGS.save_dir. embeddings are dumped in the same '
+                       'order as words in vocabulary. All words in vocabulary '
+                       'are dumped.'
+                       'dump_lstm_emb dumps lstm embeddings of FLAGS.sentence '
+                       'to FLAGS.save_dir.')
+tf.flags.DEFINE_string('pbtxt', '',
+                       'GraphDef proto text file used to construct model '
+                       'structure.')
+tf.flags.DEFINE_string('ckpt', '',
+                       'Checkpoint directory used to fill model values.')
+tf.flags.DEFINE_string('vocab_file', '', 'Vocabulary file.')
+tf.flags.DEFINE_string('save_dir', '',
+                       'Used for "dump_emb" mode to save word embeddings.')
+# sample mode flags.
+tf.flags.DEFINE_string('prefix', '',
+                       'Used for "sample" mode to predict next words.')
+tf.flags.DEFINE_integer('max_sample_words', 100,
+                        'Sampling stops either when </S> is met or this number '
+                        'of steps has passed.')
+tf.flags.DEFINE_integer('num_samples', 3,
+                        'Number of samples to generate for the prefix.')
+# dump_lstm_emb mode flags.
+tf.flags.DEFINE_string('sentence', '',
+                       'Used as input for "dump_lstm_emb" mode.')
+# eval mode flags.
+tf.flags.DEFINE_string('input_data', '',
+                       'Input data files for "eval" mode.')
+tf.flags.DEFINE_integer('max_eval_steps', 1000000,
+                        'Maximum number of steps to run "eval" mode.')
+
+
+# To conserve demo resources, use batch size 1 and a single timestep.
+BATCH_SIZE = 1
+NUM_TIMESTEPS = 1
+MAX_WORD_LEN = 50
+
+
+def _LoadModel(gd_file, ckpt_file):
+  """Load the model from GraphDef and Checkpoint.
+
+  Args:
+    gd_file: GraphDef proto text file.
+    ckpt_file: TensorFlow Checkpoint file.
+
+  Returns:
+    TensorFlow session and tensors dict.
+  """
+  with tf.Graph().as_default():
+    sys.stderr.write('Recovering graph.\n')
+    with tf.gfile.FastGFile(gd_file, 'r') as f:
+      s = f.read()
+      gd = tf.GraphDef()
+      text_format.Merge(s, gd)
+
+    tf.logging.info('Recovering Graph %s', gd_file)
+    t = {}
+    [t['states_init'], t['lstm/lstm_0/control_dependency'],
+     t['lstm/lstm_1/control_dependency'], t['softmax_out'], t['class_ids_out'],
+     t['class_weights_out'], t['log_perplexity_out'], t['inputs_in'],
+     t['targets_in'], t['target_weights_in'], t['char_inputs_in'],
+     t['all_embs'], t['softmax_weights'], t['global_step']
+    ] = tf.import_graph_def(gd, {}, ['states_init',
+                                     'lstm/lstm_0/control_dependency:0',
+                                     'lstm/lstm_1/control_dependency:0',
+                                     'softmax_out:0',
+                                     'class_ids_out:0',
+                                     'class_weights_out:0',
+                                     'log_perplexity_out:0',
+                                     'inputs_in:0',
+                                     'targets_in:0',
+                                     'target_weights_in:0',
+                                     'char_inputs_in:0',
+                                     'all_embs_out:0',
+                                     'Reshape_3:0',
+                                     'global_step:0'], name='')
+
+    sys.stderr.write('Recovering checkpoint %s\n' % ckpt_file)
+    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
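+    # Restore all variables by feeding the checkpoint path into the saver's
+    # filename constant and running its restore op.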
+    sess.run('save/restore_all', {'save/Const:0': ckpt_file})
+    sess.run(t['states_init'])
+
+  return sess, t
+
+
+def _EvalModel(dataset):
+  """Evaluate model perplexity using provided dataset.
+
+  Args:
+    dataset: LM1BDataset object.
+  """
+  sess, t = _LoadModel(FLAGS.pbtxt, FLAGS.ckpt)
+
+  current_step = t['global_step'].eval(session=sess)
+  sys.stderr.write('Loaded step %d.\n' % current_step)
+
+  data_gen = dataset.get_batch(BATCH_SIZE, NUM_TIMESTEPS, forever=False)
+  sum_num = 0.0
+  sum_den = 0.0
+  perplexity = 0.0
+  for i, (inputs, char_inputs, _, targets, weights) in enumerate(data_gen):
+    input_dict = {t['inputs_in']: inputs,
+                  t['targets_in']: targets,
+                  t['target_weights_in']: weights}
+    if 'char_inputs_in' in t:
+      input_dict[t['char_inputs_in']] = char_inputs
+    log_perp = sess.run(t['log_perplexity_out'], feed_dict=input_dict)
+
+    if np.isnan(log_perp):
+      sys.stderr.write('log_perplexity is NaN.\n')
+    else:
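+      # Accumulate a weighted mean of per-batch log perplexities; the running
+      # perplexity reported below is its exponential.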
+      sum_num += log_perp * weights.mean()
+      sum_den += weights.mean()
+    if sum_den > 0:
+      perplexity = np.exp(sum_num / sum_den)
+
+    sys.stderr.write('Eval Step: %d, Average Perplexity: %f.\n' %
+                     (i, perplexity))
+
+    if i > FLAGS.max_eval_steps:
+      break
+
+
+def _SampleSoftmax(softmax):
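+  # Inverse-CDF sampling: count how many cumulative probabilities fall below a
+  # uniform random draw, clamping to the last valid id.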
+  return min(np.sum(np.cumsum(softmax) < np.random.rand()), len(softmax) - 1)
+
+
+def _SampleModel(prefix_words, vocab):
+  """Predict next words using the given prefix words.
+
+  Args:
+    prefix_words: Prefix words.
+    vocab: Vocabulary. Contains the max word char id length and converts
+        between words and ids.
+  """
+  targets = np.zeros([BATCH_SIZE, NUM_TIMESTEPS], np.int32)
+  weights = np.ones([BATCH_SIZE, NUM_TIMESTEPS], np.float32)
+
+  sess, t = _LoadModel(FLAGS.pbtxt, FLAGS.ckpt)
+
+  if not prefix_words.startswith('<S>'):
+    prefix_words = '<S> ' + prefix_words
+
+  prefix = [vocab.word_to_id(w) for w in prefix_words.split()]
+  prefix_char_ids = [vocab.word_to_char_ids(w) for w in prefix_words.split()]
+  for _ in range(FLAGS.num_samples):
+    inputs = np.zeros([BATCH_SIZE, NUM_TIMESTEPS], np.int32)
+    char_ids_inputs = np.zeros(
+        [BATCH_SIZE, NUM_TIMESTEPS, vocab.max_word_length], np.int32)
+    samples = prefix[:]
+    char_ids_samples = prefix_char_ids[:]
+    sent = ''
+    while True:
+      inputs[0, 0] = samples[0]
+      char_ids_inputs[0, 0, :] = char_ids_samples[0]
+      samples = samples[1:]
+      char_ids_samples = char_ids_samples[1:]
+
+      softmax = sess.run(t['softmax_out'],
+                         feed_dict={t['char_inputs_in']: char_ids_inputs,
+                                    t['inputs_in']: inputs,
+                                    t['targets_in']: targets,
+                                    t['target_weights_in']: weights})
+
+      sample = _SampleSoftmax(softmax[0])
+      sample_char_ids = vocab.word_to_char_ids(vocab.id_to_word(sample))
+
+      if not samples:
+        samples = [sample]
+        char_ids_samples = [sample_char_ids]
+      sent += vocab.id_to_word(samples[0]) + ' '
+      sys.stderr.write('%s\n' % sent)
+
+      if (vocab.id_to_word(samples[0]) == '</S>' or
+          len(sent.split()) > FLAGS.max_sample_words):
+        break
+
+
+def _DumpEmb(vocab):
+  """Dump the softmax weights and word embeddings to files.
+
+  Args:
+    vocab: Vocabulary. Contains vocabulary size and converts word to ids.
+  """
+  assert FLAGS.save_dir, 'Must specify FLAGS.save_dir for dump_emb.'
+  inputs = np.zeros([BATCH_SIZE, NUM_TIMESTEPS], np.int32)
+  targets = np.zeros([BATCH_SIZE, NUM_TIMESTEPS], np.int32)
+  weights = np.ones([BATCH_SIZE, NUM_TIMESTEPS], np.float32)
+
+  sess, t = _LoadModel(FLAGS.pbtxt, FLAGS.ckpt)
+
+  softmax_weights = sess.run(t['softmax_weights'])
+  fname = FLAGS.save_dir + '/embeddings_softmax.npy'
+  with tf.gfile.Open(fname, mode='w') as f:
+    np.save(f, softmax_weights)
+  sys.stderr.write('Finished softmax weights\n')
+
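+  # 1024 matches the word-embedding dimensionality of this released model.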
+  all_embs = np.zeros([vocab.size, 1024])
+  for i in range(vocab.size):
+    input_dict = {t['inputs_in']: inputs,
+                  t['targets_in']: targets,
+                  t['target_weights_in']: weights}
+    if 'char_inputs_in' in t:
+      input_dict[t['char_inputs_in']] = (
+          vocab.word_char_ids[i].reshape([-1, 1, MAX_WORD_LEN]))
+    embs = sess.run(t['all_embs'], input_dict)
+    all_embs[i, :] = embs
+    sys.stderr.write('Finished word embedding %d/%d\n' % (i, vocab.size))
+
+  fname = FLAGS.save_dir + '/embeddings_char_cnn.npy'
+  with tf.gfile.Open(fname, mode='w') as f:
+    np.save(f, all_embs)
+  sys.stderr.write('Embedding file saved\n')
+
+
+def _DumpSentenceEmbedding(sentence, vocab):
+  """Predict next words using the given prefix words.
+
+  Args:
+    sentence: Sentence words.
+    vocab: Vocabulary. Contains max word chard id length and converts between
+        words and ids.
+  """
+  targets = np.zeros([BATCH_SIZE, NUM_TIMESTEPS], np.int32)
+  weights = np.ones([BATCH_SIZE, NUM_TIMESTEPS], np.float32)
+
+  sess, t = _LoadModel(FLAGS.pbtxt, FLAGS.ckpt)
+
+  if not sentence.startswith('<S>'):
+    sentence = '<S> ' + sentence
+
+  word_ids = [vocab.word_to_id(w) for w in sentence.split()]
+  char_ids = [vocab.word_to_char_ids(w) for w in sentence.split()]
+
+  inputs = np.zeros([BATCH_SIZE, NUM_TIMESTEPS], np.int32)
+  char_ids_inputs = np.zeros(
+      [BATCH_SIZE, NUM_TIMESTEPS, vocab.max_word_length], np.int32)
+  for i in range(len(word_ids)):
+    inputs[0, 0] = word_ids[i]
+    char_ids_inputs[0, 0, :] = char_ids[i]
+
+    # Add 'lstm/lstm_0/control_dependency' if you want to dump previous layer
+    # LSTM.
+    lstm_emb = sess.run(t['lstm/lstm_1/control_dependency'],
+                        feed_dict={t['char_inputs_in']: char_ids_inputs,
+                                   t['inputs_in']: inputs,
+                                   t['targets_in']: targets,
+                                   t['target_weights_in']: weights})
+
+    fname = os.path.join(FLAGS.save_dir, 'lstm_emb_step_%d.npy' % i)
+    with tf.gfile.Open(fname, mode='w') as f:
+      np.save(f, lstm_emb)
+    sys.stderr.write('LSTM embedding step %d file saved\n' % i)
+
+
+def main(unused_argv):
+  vocab = data_utils.CharsVocabulary(FLAGS.vocab_file, MAX_WORD_LEN)
+
+  if FLAGS.mode == 'eval':
+    dataset = data_utils.LM1BDataset(FLAGS.input_data, vocab)
+    _EvalModel(dataset)
+  elif FLAGS.mode == 'sample':
+    _SampleModel(FLAGS.prefix, vocab)
+  elif FLAGS.mode == 'dump_emb':
+    _DumpEmb(vocab)
+  elif FLAGS.mode == 'dump_lstm_emb':
+    _DumpSentenceEmbedding(FLAGS.sentence, vocab)
+  else:
+    raise Exception('Mode not supported.')
+
+
+if __name__ == '__main__':
+  tf.app.run()