word2vec.py

  1. """ Word2Vec.
  2. Implement Word2Vec algorithm to compute vector representations of words.
  3. This example is using a small chunk of Wikipedia articles to train from.
  4. References:
  5. - Mikolov, Tomas et al. "Efficient Estimation of Word Representations
  6. in Vector Space.", 2013.
  7. Links:
  8. - [Word2Vec] https://arxiv.org/pdf/1301.3781.pdf
  9. Author: Aymeric Damien
  10. Project: https://github.com/aymericdamien/TensorFlow-Examples/
  11. """
from __future__ import division, print_function, absolute_import

import collections
import os
import random
import urllib.request
import zipfile

import numpy as np
import tensorflow as tf

# Training Parameters
learning_rate = 0.1
batch_size = 128
num_steps = 3000000
display_step = 10000
eval_step = 200000

# Evaluation Parameters
eval_words = ['five', 'of', 'going', 'hardware', 'american', 'britain']

# Word2Vec Parameters
embedding_size = 200  # Dimension of the embedding vector
max_vocabulary_size = 50000  # Total number of different words in the vocabulary
min_occurrence = 10  # Remove all words that do not appear at least n times
skip_window = 3  # How many words to consider left and right
num_skips = 2  # How many times to reuse an input to generate a label
num_sampled = 64  # Number of negative examples to sample

# Download a small chunk of the Wikipedia articles collection
url = 'http://mattmahoney.net/dc/text8.zip'
data_path = 'text8.zip'
if not os.path.exists(data_path):
    print("Downloading the dataset... (It may take some time)")
    filename, _ = urllib.request.urlretrieve(url, data_path)
    print("Done!")

# Unzip the dataset file. Text has already been processed
with zipfile.ZipFile(data_path) as f:
    text_words = f.read(f.namelist()[0]).decode('utf-8').lower().split()

# Build the dictionary and replace rare words with UNK token
count = [('UNK', -1)]
# Retrieve the most common words
count.extend(collections.Counter(text_words).most_common(max_vocabulary_size - 1))
# Remove samples with fewer than 'min_occurrence' occurrences
for i in range(len(count) - 1, -1, -1):
    if count[i][1] < min_occurrence:
        count.pop(i)
    else:
        # The collection is ordered, so stop when 'min_occurrence' is reached
        break
# Compute the vocabulary size
vocabulary_size = len(count)
# Assign an id to each word
word2id = dict()
for i, (word, _) in enumerate(count):
    word2id[word] = i

data = list()
unk_count = 0
for word in text_words:
    # Retrieve a word id, or assign it index 0 ('UNK') if not in dictionary
    index = word2id.get(word, 0)
    if index == 0:
        unk_count += 1
    data.append(index)
count[0] = ('UNK', unk_count)
id2word = dict(zip(word2id.values(), word2id.keys()))

print("Words count:", len(text_words))
print("Unique words:", len(set(text_words)))
print("Vocabulary size:", vocabulary_size)
print("Most common words:", count[:10])
data_index = 0


# Generate training batch for the skip-gram model
def next_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    # get window size (words left and right + current one)
    span = 2 * skip_window + 1
    buffer = collections.deque(maxlen=span)
    if data_index + span > len(data):
        data_index = 0
    buffer.extend(data[data_index:data_index + span])
    data_index += span
    for i in range(batch_size // num_skips):
        context_words = [w for w in range(span) if w != skip_window]
        words_to_use = random.sample(context_words, num_skips)
        for j, context_word in enumerate(words_to_use):
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[context_word]
        if data_index == len(data):
            buffer.extend(data[0:span])
            data_index = span
        else:
            buffer.append(data[data_index])
            data_index += 1
    # Backtrack a little bit to avoid skipping words at the end of a batch
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels

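# Optional sanity check (illustrative, safe to remove): print a few
# (center word -> context word) pairs produced by next_batch.
demo_x, demo_y = next_batch(8, num_skips, skip_window)
for center, context in zip(demo_x, demo_y[:, 0]):
    print(id2word[center], '->', id2word[context])
data_index = 0  # rewind the corpus pointer before training starts
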
# Input data
X = tf.placeholder(tf.int32, shape=[None])
# Input label
Y = tf.placeholder(tf.int32, shape=[None, 1])

# Ensure the following ops & var are assigned on CPU
# (some ops are not compatible on GPU)
with tf.device('/cpu:0'):
    # Create the embedding variable (each row represents a word embedding vector)
    embedding = tf.Variable(tf.random_normal([vocabulary_size, embedding_size]))
    # Lookup the corresponding embedding vectors for each sample in X
    X_embed = tf.nn.embedding_lookup(embedding, X)

    # Construct the variables for the NCE loss
    nce_weights = tf.Variable(tf.random_normal([vocabulary_size, embedding_size]))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

# Compute the average NCE loss for the batch
loss_op = tf.reduce_mean(
    tf.nn.nce_loss(weights=nce_weights,
                   biases=nce_biases,
                   labels=Y,
                   inputs=X_embed,
                   num_sampled=num_sampled,
                   num_classes=vocabulary_size))
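# Note: tf.nn.nce_loss samples 'num_sampled' negative (noise) classes and
# learns to tell the true context word apart from them, which avoids
# computing a full softmax over all 'vocabulary_size' words at training time.
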
# Define the optimizer
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
train_op = optimizer.minimize(loss_op)

# Evaluation
# Compute the cosine similarity between the input data embeddings and every embedding vector
X_embed_norm = X_embed / tf.sqrt(tf.reduce_sum(tf.square(X_embed), 1, keepdims=True))
embedding_norm = embedding / tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keepdims=True))
cosine_sim_op = tf.matmul(X_embed_norm, embedding_norm, transpose_b=True)
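# cosine_sim_op has shape (number of query words fed in X, vocabulary_size);
# each row scores one query word against the whole vocabulary.
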
# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()

with tf.Session() as sess:

    # Run the initializer
    sess.run(init)

    # Testing data
    x_test = np.array([word2id[w] for w in eval_words])

    average_loss = 0
    for step in range(1, num_steps + 1):
        # Get a new batch of data
        batch_x, batch_y = next_batch(batch_size, num_skips, skip_window)
        # Run training op
        _, loss = sess.run([train_op, loss_op], feed_dict={X: batch_x, Y: batch_y})
        average_loss += loss

        if step % display_step == 0 or step == 1:
            if step > 1:
                average_loss /= display_step
            print("Step " + str(step) + ", Average Loss= " +
                  "{:.4f}".format(average_loss))
            average_loss = 0

        # Evaluation
        if step % eval_step == 0 or step == 1:
            print("Evaluation...")
            sim = sess.run(cosine_sim_op, feed_dict={X: x_test})
            for i in range(len(eval_words)):
                top_k = 8  # number of nearest neighbors
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = '"%s" nearest neighbors:' % eval_words[i]
                for k in range(top_k):
                    log_str = '%s %s,' % (log_str, id2word[nearest[k]])
                print(log_str)
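    # Optional (illustrative addition): keep the trained embeddings after the
    # session closes; the output filename is arbitrary.
    final_embeddings = sess.run(embedding)
    np.save('word2vec_embeddings.npy', final_embeddings)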