|
@@ -0,0 +1,724 @@
|
|
|
+{
|
|
|
+ "cells": [
|
|
|
+ {
|
|
|
+ "cell_type": "markdown",
|
|
|
+ "metadata": {},
|
|
|
+ "source": [
|
|
|
+ "# Word2Vec (Word Embedding)\n",
|
|
|
+ "\n",
|
|
|
+ "Implement Word2Vec algorithm to compute vector representations of words, with TensorFlow 2.0. This example is using a small chunk of Wikipedia articles to train from.\n",
|
|
|
+ "\n",
|
|
|
+ "More info: [Mikolov, Tomas et al. \"Efficient Estimation of Word Representations in Vector Space.\", 2013](https://arxiv.org/pdf/1301.3781.pdf)\n",
|
|
|
+ "\n",
|
|
|
+ "- Author: Aymeric Damien\n",
|
|
|
+ "- Project: https://github.com/aymericdamien/TensorFlow-Examples/"
|
|
|
+ ]
|
|
|
+ },
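+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "For intuition: with `skip_window = 1`, the toy sentence `the quick brown fox` yields skip-gram (center, context) pairs such as (quick, the), (quick, brown), (brown, quick) and (brown, fox); the model learns embeddings by predicting context words from the center word. (Toy sentence for illustration only; the corpus used below is text8.)"
+ ]
+ },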
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 1,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "from __future__ import division, print_function, absolute_import\n",
|
|
|
+ "\n",
|
|
|
+ "import collections\n",
|
|
|
+ "import os\n",
|
|
|
+ "import random\n",
|
|
|
+ "import urllib\n",
|
|
|
+ "import zipfile\n",
|
|
|
+ "\n",
|
|
|
+ "import numpy as np\n",
|
|
|
+ "import tensorflow as tf"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 2,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# Training Parameters.\n",
|
|
|
+ "learning_rate = 0.1\n",
|
|
|
+ "batch_size = 128\n",
|
|
|
+ "num_steps = 3000000\n",
|
|
|
+ "display_step = 10000\n",
|
|
|
+ "eval_step = 200000\n",
|
|
|
+ "\n",
|
|
|
+ "# Evaluation Parameters.\n",
|
|
|
+ "eval_words = ['five', 'of', 'going', 'hardware', 'american', 'britain']\n",
|
|
|
+ "\n",
|
|
|
+ "# Word2Vec Parameters.\n",
|
|
|
+ "embedding_size = 200 # Dimension of the embedding vector.\n",
|
|
|
+ "max_vocabulary_size = 50000 # Total number of different words in the vocabulary.\n",
|
|
|
+ "min_occurrence = 10 # Remove all words that does not appears at least n times.\n",
|
|
|
+ "skip_window = 3 # How many words to consider left and right.\n",
|
|
|
+ "num_skips = 2 # How many times to reuse an input to generate a label.\n",
|
|
|
+ "num_sampled = 64 # Number of negative examples to sample."
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 3,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# Download a small chunk of Wikipedia articles collection.\n",
|
|
|
+ "url = 'http://mattmahoney.net/dc/text8.zip'\n",
|
|
|
+ "data_path = 'text8.zip'\n",
|
|
|
+ "if not os.path.exists(data_path):\n",
|
|
|
+ " print(\"Downloading the dataset... (It may take some time)\")\n",
|
|
|
+ " filename, _ = urllib.urlretrieve(url, data_path)\n",
|
|
|
+ " print(\"Done!\")\n",
|
|
|
+ "# Unzip the dataset file. Text has already been processed.\n",
|
|
|
+ "with zipfile.ZipFile(data_path) as f:\n",
|
|
|
+ " text_words = f.read(f.namelist()[0]).lower().split()"
|
|
|
+ ]
|
|
|
+ },
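+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Illustrative peek (not in the original example): the corpus is a single\n",
+ "# long stream of lowercase, whitespace-separated tokens.\n",
+ "print(text_words[:8])"
+ ]
+ },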
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 4,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "name": "stdout",
|
|
|
+ "output_type": "stream",
|
|
|
+ "text": [
|
|
|
+ "Words count: 17005207\n",
|
|
|
+ "Unique words: 253854\n",
|
|
|
+ "Vocabulary size: 47135\n",
|
|
|
+ "Most common words: [('UNK', 444176), ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764), ('in', 372201), ('a', 325873), ('to', 316376), ('zero', 264975), ('nine', 250430)]\n"
|
|
|
+ ]
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "source": [
|
|
|
+ "# Build the dictionary and replace rare words with UNK token.\n",
|
|
|
+ "count = [('UNK', -1)]\n",
|
|
|
+ "# Retrieve the most common words.\n",
|
|
|
+ "count.extend(collections.Counter(text_words).most_common(max_vocabulary_size - 1))\n",
|
|
|
+ "# Remove samples with less than 'min_occurrence' occurrences.\n",
|
|
|
+ "for i in range(len(count) - 1, -1, -1):\n",
|
|
|
+ " if count[i][1] < min_occurrence:\n",
|
|
|
+ " count.pop(i)\n",
|
|
|
+ " else:\n",
|
|
|
+ " # The collection is ordered, so stop when 'min_occurrence' is reached.\n",
|
|
|
+ " break\n",
|
|
|
+ "# Compute the vocabulary size.\n",
|
|
|
+ "vocabulary_size = len(count)\n",
|
|
|
+ "# Assign an id to each word.\n",
|
|
|
+ "word2id = dict()\n",
|
|
|
+ "for i, (word, _)in enumerate(count):\n",
|
|
|
+ " word2id[word] = i\n",
|
|
|
+ "\n",
|
|
|
+ "data = list()\n",
|
|
|
+ "unk_count = 0\n",
|
|
|
+ "for word in text_words:\n",
|
|
|
+ " # Retrieve a word id, or assign it index 0 ('UNK') if not in dictionary.\n",
|
|
|
+ " index = word2id.get(word, 0)\n",
|
|
|
+ " if index == 0:\n",
|
|
|
+ " unk_count += 1\n",
|
|
|
+ " data.append(index)\n",
|
|
|
+ "count[0] = ('UNK', unk_count)\n",
|
|
|
+ "id2word = dict(zip(word2id.values(), word2id.keys()))\n",
|
|
|
+ "\n",
|
|
|
+ "print(\"Words count:\", len(text_words))\n",
|
|
|
+ "print(\"Unique words:\", len(set(text_words)))\n",
|
|
|
+ "print(\"Vocabulary size:\", vocabulary_size)\n",
|
|
|
+ "print(\"Most common words:\", count[:10])"
|
|
|
+ ]
|
|
|
+ },
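+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Illustrative sanity check (not in the original example): round-trip a few\n",
+ "# words through the vocabulary maps; out-of-vocabulary words map to id 0 ('UNK').\n",
+ "for w in ['the', 'of', 'hardware']:\n",
+ "    wid = word2id.get(w, 0)\n",
+ "    print(w, '->', wid, '->', id2word[wid])"
+ ]
+ },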
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 5,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "data_index = 0\n",
|
|
|
+ "# Generate training batch for the skip-gram model.\n",
|
|
|
+ "def next_batch(batch_size, num_skips, skip_window):\n",
|
|
|
+ " global data_index\n",
|
|
|
+ " assert batch_size % num_skips == 0\n",
|
|
|
+ " assert num_skips <= 2 * skip_window\n",
|
|
|
+ " batch = np.ndarray(shape=(batch_size), dtype=np.int32)\n",
|
|
|
+ " labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)\n",
|
|
|
+ " # get window size (words left and right + current one).\n",
|
|
|
+ " span = 2 * skip_window + 1\n",
|
|
|
+ " buffer = collections.deque(maxlen=span)\n",
|
|
|
+ " if data_index + span > len(data):\n",
|
|
|
+ " data_index = 0\n",
|
|
|
+ " buffer.extend(data[data_index:data_index + span])\n",
|
|
|
+ " data_index += span\n",
|
|
|
+ " for i in range(batch_size // num_skips):\n",
|
|
|
+ " context_words = [w for w in range(span) if w != skip_window]\n",
|
|
|
+ " words_to_use = random.sample(context_words, num_skips)\n",
|
|
|
+ " for j, context_word in enumerate(words_to_use):\n",
|
|
|
+ " batch[i * num_skips + j] = buffer[skip_window]\n",
|
|
|
+ " labels[i * num_skips + j, 0] = buffer[context_word]\n",
|
|
|
+ " if data_index == len(data):\n",
|
|
|
+ " buffer.extend(data[0:span])\n",
|
|
|
+ " data_index = span\n",
|
|
|
+ " else:\n",
|
|
|
+ " buffer.append(data[data_index])\n",
|
|
|
+ " data_index += 1\n",
|
|
|
+ " # Backtrack a little bit to avoid skipping words in the end of a batch.\n",
|
|
|
+ " data_index = (data_index + len(data) - span) % len(data)\n",
|
|
|
+ " return batch, labels"
|
|
|
+ ]
|
|
|
+ },
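+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Illustrative peek (not in the original example): draw one tiny batch and\n",
+ "# decode the (center, context) id pairs back to words. Note that this\n",
+ "# advances the global 'data_index' cursor.\n",
+ "demo_x, demo_y = next_batch(8, num_skips=2, skip_window=3)\n",
+ "print([(id2word[c], id2word[t]) for c, t in zip(demo_x, demo_y[:, 0])])"
+ ]
+ },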
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 6,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# Ensure the following ops & var are assigned on CPU\n",
|
|
|
+ "# (some ops are not compatible on GPU).\n",
|
|
|
+ "with tf.device('/cpu:0'):\n",
|
|
|
+ " # Create the embedding variable (each row represent a word embedding vector).\n",
|
|
|
+ " embedding = tf.Variable(tf.random.normal([vocabulary_size, embedding_size]))\n",
|
|
|
+ " # Construct the variables for the NCE loss.\n",
|
|
|
+ " nce_weights = tf.Variable(tf.random.normal([vocabulary_size, embedding_size]))\n",
|
|
|
+ " nce_biases = tf.Variable(tf.zeros([vocabulary_size]))\n",
|
|
|
+ "\n",
|
|
|
+ "def get_embedding(x):\n",
|
|
|
+ " with tf.device('/cpu:0'):\n",
|
|
|
+ " # Lookup the corresponding embedding vectors for each sample in X.\n",
|
|
|
+ " x_embed = tf.nn.embedding_lookup(embedding, x)\n",
|
|
|
+ " return x_embed\n",
|
|
|
+ "\n",
|
|
|
+ "def nce_loss(x_embed, y):\n",
|
|
|
+ " with tf.device('/cpu:0'):\n",
|
|
|
+ " # Compute the average NCE loss for the batch.\n",
|
|
|
+ " y = tf.cast(y, tf.int64)\n",
|
|
|
+ " loss = tf.reduce_mean(\n",
|
|
|
+ " tf.nn.nce_loss(weights=nce_weights,\n",
|
|
|
+ " biases=nce_biases,\n",
|
|
|
+ " labels=y,\n",
|
|
|
+ " inputs=x_embed,\n",
|
|
|
+ " num_sampled=num_sampled,\n",
|
|
|
+ " num_classes=vocabulary_size))\n",
|
|
|
+ " return loss\n",
|
|
|
+ "\n",
|
|
|
+ "# Evaluation.\n",
|
|
|
+ "def evaluate(x_embed):\n",
|
|
|
+ " with tf.device('/cpu:0'):\n",
|
|
|
+ " # Compute the cosine similarity between input data embedding and every embedding vectors\n",
|
|
|
+ " x_embed = tf.cast(x_embed, tf.float32)\n",
|
|
|
+ " x_embed_norm = x_embed / tf.sqrt(tf.reduce_sum(tf.square(x_embed)))\n",
|
|
|
+ " embedding_norm = embedding / tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keepdims=True), tf.float32)\n",
|
|
|
+ " cosine_sim_op = tf.matmul(x_embed_norm, embedding_norm, transpose_b=True)\n",
|
|
|
+ " return cosine_sim_op\n",
|
|
|
+ "\n",
|
|
|
+ "# Define the optimizer.\n",
|
|
|
+ "optimizer = tf.optimizers.SGD(learning_rate)"
|
|
|
+ ]
|
|
|
+ },
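+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Quick shape check (illustrative, not in the original example): looking up a\n",
+ "# batch of 3 word ids should return a (3, embedding_size) matrix.\n",
+ "print(get_embedding(np.array([word2id[w] for w in ['five', 'of', 'going']])).shape)"
+ ]
+ },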
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 7,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# Optimization process. \n",
|
|
|
+ "def run_optimization(x, y):\n",
|
|
|
+ " with tf.device('/cpu:0'):\n",
|
|
|
+ " # Wrap computation inside a GradientTape for automatic differentiation.\n",
|
|
|
+ " with tf.GradientTape() as g:\n",
|
|
|
+ " emb = get_embedding(x)\n",
|
|
|
+ " loss = nce_loss(emb, y)\n",
|
|
|
+ "\n",
|
|
|
+ " # Compute gradients.\n",
|
|
|
+ " gradients = g.gradient(loss, [embedding, nce_weights, nce_biases])\n",
|
|
|
+ "\n",
|
|
|
+ " # Update W and b following gradients.\n",
|
|
|
+ " optimizer.apply_gradients(zip(gradients, [embedding, nce_weights, nce_biases]))"
|
|
|
+ ]
|
|
|
+ },
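+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Smoke test (illustrative, not in the original example): a single optimization\n",
+ "# step should run end-to-end and return a finite loss.\n",
+ "smoke_x, smoke_y = next_batch(batch_size, num_skips, skip_window)\n",
+ "run_optimization(smoke_x, smoke_y)\n",
+ "print('loss after one step:', nce_loss(get_embedding(smoke_x), smoke_y).numpy())"
+ ]
+ },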
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 8,
|
|
|
+ "metadata": {
|
|
|
+ "scrolled": false
|
|
|
+ },
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "name": "stdout",
|
|
|
+ "output_type": "stream",
|
|
|
+ "text": [
|
|
|
+ "step: 1, loss: 504.444214\n",
|
|
|
+ "Evaluation...\n",
|
|
|
+ "\"five\" nearest neighbors: censure, stricken, anglicanism, stick, streetcars, shrines, horrified, sparkle,\n",
|
|
|
+ "\"of\" nearest neighbors: jolly, weary, clinicians, kerouac, economist, owls, safe, playoff,\n",
|
|
|
+ "\"going\" nearest neighbors: filament, platforms, moderately, micheal, despotic, krag, disclosed, your,\n",
|
|
|
+ "\"hardware\" nearest neighbors: occupants, paraffin, vera, reorganized, rename, declares, prima, condoned,\n",
|
|
|
+ "\"american\" nearest neighbors: portfolio, rhein, aalto, angle, lifeson, tucker, sexton, dench,\n",
|
|
|
+ "\"britain\" nearest neighbors: indivisible, disbelief, scripture, pepsi, scriptores, sighting, napalm, strike,\n",
|
|
|
+ "step: 10000, loss: 117.166962\n",
|
|
|
+ "step: 20000, loss: 65.478333\n",
|
|
|
+ "step: 30000, loss: 46.580460\n",
|
|
|
+ "step: 40000, loss: 25.563128\n",
|
|
|
+ "step: 50000, loss: 50.924446\n",
|
|
|
+ "step: 60000, loss: 51.696526\n",
|
|
|
+ "step: 70000, loss: 17.272142\n",
|
|
|
+ "step: 80000, loss: 32.579414\n",
|
|
|
+ "step: 90000, loss: 68.372032\n",
|
|
|
+ "step: 100000, loss: 36.026573\n",
|
|
|
+ "step: 110000, loss: 22.502020\n",
|
|
|
+ "step: 120000, loss: 15.788742\n",
|
|
|
+ "step: 130000, loss: 31.832420\n",
|
|
|
+ "step: 140000, loss: 25.096617\n",
|
|
|
+ "step: 150000, loss: 12.013027\n",
|
|
|
+ "step: 160000, loss: 20.574780\n",
|
|
|
+ "step: 170000, loss: 12.201975\n",
|
|
|
+ "step: 180000, loss: 20.983793\n",
|
|
|
+ "step: 190000, loss: 11.366720\n",
|
|
|
+ "step: 200000, loss: 19.431549\n",
|
|
|
+ "Evaluation...\n",
|
|
|
+ "\"five\" nearest neighbors: three, four, eight, six, two, seven, nine, zero,\n",
|
|
|
+ "\"of\" nearest neighbors: the, a, and, first, with, on, but, from,\n",
|
|
|
+ "\"going\" nearest neighbors: have, more, used, out, be, with, on, however,\n",
|
|
|
+ "\"hardware\" nearest neighbors: be, known, system, apollo, and, a, such, used,\n",
|
|
|
+ "\"american\" nearest neighbors: UNK, and, from, s, at, in, after, about,\n",
|
|
|
+ "\"britain\" nearest neighbors: of, and, many, the, as, used, but, such,\n",
|
|
|
+ "step: 210000, loss: 16.361233\n",
|
|
|
+ "step: 220000, loss: 17.529526\n",
|
|
|
+ "step: 230000, loss: 16.805817\n",
|
|
|
+ "step: 240000, loss: 6.365625\n",
|
|
|
+ "step: 250000, loss: 8.083097\n",
|
|
|
+ "step: 260000, loss: 11.262514\n",
|
|
|
+ "step: 270000, loss: 9.842708\n",
|
|
|
+ "step: 280000, loss: 6.363440\n",
|
|
|
+ "step: 290000, loss: 8.732617\n",
|
|
|
+ "step: 300000, loss: 10.484728\n",
|
|
|
+ "step: 310000, loss: 12.099487\n",
|
|
|
+ "step: 320000, loss: 11.496288\n",
|
|
|
+ "step: 330000, loss: 9.283813\n",
|
|
|
+ "step: 340000, loss: 10.777218\n",
|
|
|
+ "step: 350000, loss: 16.310440\n",
|
|
|
+ "step: 360000, loss: 7.495782\n",
|
|
|
+ "step: 370000, loss: 9.287696\n",
|
|
|
+ "step: 380000, loss: 6.982735\n",
|
|
|
+ "step: 390000, loss: 8.549622\n",
|
|
|
+ "step: 400000, loss: 8.388112\n",
|
|
|
+ "Evaluation...\n",
|
|
|
+ "\"five\" nearest neighbors: four, three, six, two, seven, eight, one, zero,\n",
|
|
|
+ "\"of\" nearest neighbors: the, a, with, also, for, and, which, by,\n",
|
|
|
+ "\"going\" nearest neighbors: have, are, both, called, being, a, of, had,\n",
|
|
|
+ "\"hardware\" nearest neighbors: may, de, some, have, so, which, other, also,\n",
|
|
|
+ "\"american\" nearest neighbors: s, british, UNK, from, in, including, first, see,\n",
|
|
|
+ "\"britain\" nearest neighbors: against, include, including, both, british, other, an, most,\n",
|
|
|
+ "step: 410000, loss: 8.757725\n",
|
|
|
+ "step: 420000, loss: 12.303110\n",
|
|
|
+ "step: 430000, loss: 12.325478\n",
|
|
|
+ "step: 440000, loss: 7.659882\n",
|
|
|
+ "step: 450000, loss: 6.028089\n",
|
|
|
+ "step: 460000, loss: 12.700299\n",
|
|
|
+ "step: 470000, loss: 7.063077\n",
|
|
|
+ "step: 480000, loss: 18.004183\n",
|
|
|
+ "step: 490000, loss: 7.510474\n",
|
|
|
+ "step: 500000, loss: 10.089376\n",
|
|
|
+ "step: 510000, loss: 11.404436\n",
|
|
|
+ "step: 520000, loss: 9.494527\n",
|
|
|
+ "step: 530000, loss: 7.797963\n",
|
|
|
+ "step: 540000, loss: 7.390718\n",
|
|
|
+ "step: 550000, loss: 13.911215\n",
|
|
|
+ "step: 560000, loss: 6.975731\n",
|
|
|
+ "step: 570000, loss: 6.179163\n",
|
|
|
+ "step: 580000, loss: 7.066525\n",
|
|
|
+ "step: 590000, loss: 6.487288\n",
|
|
|
+ "step: 600000, loss: 5.361528\n",
|
|
|
+ "Evaluation...\n",
|
|
|
+ "\"five\" nearest neighbors: four, six, three, seven, two, one, eight, zero,\n",
|
|
|
+ "\"of\" nearest neighbors: the, and, from, with, a, including, in, include,\n",
|
|
|
+ "\"going\" nearest neighbors: have, even, they, term, who, many, which, were,\n",
|
|
|
+ "\"hardware\" nearest neighbors: include, computer, an, which, other, each, than, may,\n",
|
|
|
+ "\"american\" nearest neighbors: english, french, s, german, from, in, film, see,\n",
|
|
|
+ "\"britain\" nearest neighbors: several, first, modern, part, government, german, was, were,\n",
|
|
|
+ "step: 610000, loss: 4.144980\n",
|
|
|
+ "step: 620000, loss: 5.865635\n",
|
|
|
+ "step: 630000, loss: 6.826498\n",
|
|
|
+ "step: 640000, loss: 8.376097\n",
|
|
|
+ "step: 650000, loss: 7.117930\n",
|
|
|
+ "step: 660000, loss: 7.639544\n",
|
|
|
+ "step: 670000, loss: 5.973255\n",
|
|
|
+ "step: 680000, loss: 4.908459\n",
|
|
|
+ "step: 690000, loss: 6.164993\n",
|
|
|
+ "step: 700000, loss: 7.360281\n",
|
|
|
+ "step: 710000, loss: 12.693079\n",
|
|
|
+ "step: 720000, loss: 6.410182\n",
|
|
|
+ "step: 730000, loss: 7.499201\n",
|
|
|
+ "step: 740000, loss: 6.509094\n",
|
|
|
+ "step: 750000, loss: 10.625893\n",
|
|
|
+ "step: 760000, loss: 7.177696\n",
|
|
|
+ "step: 770000, loss: 12.639092\n",
|
|
|
+ "step: 780000, loss: 8.441635\n",
|
|
|
+ "step: 790000, loss: 7.529139\n",
|
|
|
+ "step: 800000, loss: 6.579177\n",
|
|
|
+ "Evaluation...\n",
|
|
|
+ "\"five\" nearest neighbors: four, three, six, seven, eight, two, one, zero,\n",
|
|
|
+ "\"of\" nearest neighbors: and, with, in, the, its, from, by, including,\n",
|
|
|
+ "\"going\" nearest neighbors: have, they, how, include, people, however, also, their,\n",
|
|
|
+ "\"hardware\" nearest neighbors: computer, large, include, may, or, which, other, there,\n",
|
|
|
+ "\"american\" nearest neighbors: born, french, british, english, german, b, john, d,\n",
|
|
|
+ "\"britain\" nearest neighbors: country, including, include, general, part, various, several, by,\n",
|
|
|
+ "step: 810000, loss: 6.934138\n",
|
|
|
+ "step: 820000, loss: 5.686094\n",
|
|
|
+ "step: 830000, loss: 7.310243\n",
|
|
|
+ "step: 840000, loss: 5.028157\n",
|
|
|
+ "step: 850000, loss: 7.079705\n",
|
|
|
+ "step: 860000, loss: 6.768996\n",
|
|
|
+ "step: 870000, loss: 5.604030\n",
|
|
|
+ "step: 880000, loss: 8.208309\n",
|
|
|
+ "step: 890000, loss: 6.301597\n",
|
|
|
+ "step: 900000, loss: 5.733234\n",
|
|
|
+ "step: 910000, loss: 6.577081\n",
|
|
|
+ "step: 920000, loss: 6.774826\n",
|
|
|
+ "step: 930000, loss: 7.068932\n",
|
|
|
+ "step: 940000, loss: 6.694956\n",
|
|
|
+ "step: 950000, loss: 7.944673\n",
|
|
|
+ "step: 960000, loss: 5.988618\n",
|
|
|
+ "step: 970000, loss: 6.651366\n",
|
|
|
+ "step: 980000, loss: 4.595577\n",
|
|
|
+ "step: 990000, loss: 6.564834\n",
|
|
|
+ "step: 1000000, loss: 4.327858\n",
|
|
|
+ "Evaluation...\n",
|
|
|
+ "\"five\" nearest neighbors: four, three, seven, six, eight, two, nine, zero,\n",
|
|
|
+ "\"of\" nearest neighbors: the, first, and, became, from, under, at, with,\n",
|
|
|
+ "\"going\" nearest neighbors: others, has, then, have, how, become, had, also,\n",
|
|
|
+ "\"hardware\" nearest neighbors: computer, large, systems, these, different, either, include, using,\n",
|
|
|
+ "\"american\" nearest neighbors: b, born, d, UNK, nine, english, german, french,\n",
|
|
|
+ "\"britain\" nearest neighbors: government, island, local, country, by, including, control, within,\n",
|
|
|
+ "step: 1010000, loss: 5.841236\n",
|
|
|
+ "step: 1020000, loss: 5.805200\n",
|
|
|
+ "step: 1030000, loss: 9.962063\n",
|
|
|
+ "step: 1040000, loss: 6.281199\n",
|
|
|
+ "step: 1050000, loss: 7.147995\n",
|
|
|
+ "step: 1060000, loss: 5.721184\n",
|
|
|
+ "step: 1070000, loss: 7.080662\n",
|
|
|
+ "step: 1080000, loss: 6.638658\n",
|
|
|
+ "step: 1090000, loss: 5.814178\n",
|
|
|
+ "step: 1100000, loss: 5.195928\n",
|
|
|
+ "step: 1110000, loss: 6.724787\n",
|
|
|
+ "step: 1120000, loss: 6.503905\n",
|
|
|
+ "step: 1130000, loss: 5.762966\n",
|
|
|
+ "step: 1140000, loss: 5.790243\n",
|
|
|
+ "step: 1150000, loss: 5.958191\n",
|
|
|
+ "step: 1160000, loss: 5.997983\n",
|
|
|
+ "step: 1170000, loss: 7.065348\n",
|
|
|
+ "step: 1180000, loss: 6.073387\n",
|
|
|
+ "step: 1190000, loss: 6.644097\n",
|
|
|
+ "step: 1200000, loss: 5.934450\n",
|
|
|
+ "Evaluation...\n",
|
|
|
+ "\"five\" nearest neighbors: three, four, six, eight, seven, two, nine, zero,\n",
|
|
|
+ "\"of\" nearest neighbors: the, and, including, in, its, with, from, on,\n",
|
|
|
+ "\"going\" nearest neighbors: others, then, through, has, had, another, people, when,\n",
|
|
|
+ "\"hardware\" nearest neighbors: computer, control, systems, either, these, large, small, other,\n",
|
|
|
+ "\"american\" nearest neighbors: born, german, john, d, british, b, UNK, french,\n",
|
|
|
+ "\"britain\" nearest neighbors: local, against, british, island, country, general, including, within,\n",
|
|
|
+ "step: 1210000, loss: 5.832344\n",
|
|
|
+ "step: 1220000, loss: 6.453851\n",
|
|
|
+ "step: 1230000, loss: 6.583966\n",
|
|
|
+ "step: 1240000, loss: 5.571673\n",
|
|
|
+ "step: 1250000, loss: 5.720917\n",
|
|
|
+ "step: 1260000, loss: 7.663424\n",
|
|
|
+ "step: 1270000, loss: 6.583741\n",
|
|
|
+ "step: 1280000, loss: 8.503859\n",
|
|
|
+ "step: 1290000, loss: 5.540640\n",
|
|
|
+ "step: 1300000, loss: 6.703249\n",
|
|
|
+ "step: 1310000, loss: 5.274101\n",
|
|
|
+ "step: 1320000, loss: 5.846446\n",
|
|
|
+ "step: 1330000, loss: 5.438172\n",
|
|
|
+ "step: 1340000, loss: 6.367691\n",
|
|
|
+ "step: 1350000, loss: 6.558622\n",
|
|
|
+ "step: 1360000, loss: 9.822924\n",
|
|
|
+ "step: 1370000, loss: 4.982378\n",
|
|
|
+ "step: 1380000, loss: 6.159739\n",
|
|
|
+ "step: 1390000, loss: 5.819083\n",
|
|
|
+ "step: 1400000, loss: 7.775135\n",
|
|
|
+ "Evaluation...\n",
|
|
|
+ "\"five\" nearest neighbors: four, three, six, seven, two, eight, one, zero,\n",
|
|
|
+ "\"of\" nearest neighbors: and, the, in, with, its, within, for, including,\n",
|
|
|
+ "\"going\" nearest neighbors: others, through, while, has, to, how, particularly, their,\n",
|
|
|
+ "\"hardware\" nearest neighbors: computer, systems, large, control, research, using, information, either,\n",
|
|
|
+ "\"american\" nearest neighbors: english, french, german, born, film, british, s, former,\n",
|
|
|
+ "\"britain\" nearest neighbors: british, country, europe, local, military, island, against, western,\n"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "name": "stdout",
|
|
|
+ "output_type": "stream",
|
|
|
+ "text": [
|
|
|
+ "step: 1410000, loss: 8.214248\n",
|
|
|
+ "step: 1420000, loss: 4.696859\n",
|
|
|
+ "step: 1430000, loss: 5.873761\n",
|
|
|
+ "step: 1440000, loss: 5.971557\n",
|
|
|
+ "step: 1450000, loss: 4.992722\n",
|
|
|
+ "step: 1460000, loss: 5.197714\n",
|
|
|
+ "step: 1470000, loss: 6.916918\n",
|
|
|
+ "step: 1480000, loss: 6.441984\n",
|
|
|
+ "step: 1490000, loss: 5.443647\n",
|
|
|
+ "step: 1500000, loss: 5.178482\n",
|
|
|
+ "step: 1510000, loss: 6.060414\n",
|
|
|
+ "step: 1520000, loss: 6.373306\n",
|
|
|
+ "step: 1530000, loss: 5.098322\n",
|
|
|
+ "step: 1540000, loss: 6.674916\n",
|
|
|
+ "step: 1550000, loss: 6.712685\n",
|
|
|
+ "step: 1560000, loss: 5.280202\n",
|
|
|
+ "step: 1570000, loss: 6.454964\n",
|
|
|
+ "step: 1580000, loss: 4.896697\n",
|
|
|
+ "step: 1590000, loss: 6.239226\n",
|
|
|
+ "step: 1600000, loss: 5.709726\n",
|
|
|
+ "Evaluation...\n",
|
|
|
+ "\"five\" nearest neighbors: three, four, two, six, seven, eight, one, zero,\n",
|
|
|
+ "\"of\" nearest neighbors: the, and, including, in, with, within, its, following,\n",
|
|
|
+ "\"going\" nearest neighbors: others, people, who, they, that, far, were, have,\n",
|
|
|
+ "\"hardware\" nearest neighbors: computer, systems, include, high, research, some, information, large,\n",
|
|
|
+ "\"american\" nearest neighbors: born, english, french, british, german, d, john, b,\n",
|
|
|
+ "\"britain\" nearest neighbors: country, military, china, europe, against, local, central, british,\n",
|
|
|
+ "step: 1610000, loss: 6.334940\n",
|
|
|
+ "step: 1620000, loss: 5.093616\n",
|
|
|
+ "step: 1630000, loss: 6.119366\n",
|
|
|
+ "step: 1640000, loss: 4.975187\n",
|
|
|
+ "step: 1650000, loss: 6.490408\n",
|
|
|
+ "step: 1660000, loss: 7.464082\n",
|
|
|
+ "step: 1670000, loss: 4.977184\n",
|
|
|
+ "step: 1680000, loss: 5.658133\n",
|
|
|
+ "step: 1690000, loss: 5.352454\n",
|
|
|
+ "step: 1700000, loss: 6.810776\n",
|
|
|
+ "step: 1710000, loss: 5.687447\n",
|
|
|
+ "step: 1720000, loss: 5.992206\n",
|
|
|
+ "step: 1730000, loss: 5.513011\n",
|
|
|
+ "step: 1740000, loss: 5.548522\n",
|
|
|
+ "step: 1750000, loss: 6.200248\n",
|
|
|
+ "step: 1760000, loss: 13.070073\n",
|
|
|
+ "step: 1770000, loss: 4.621058\n",
|
|
|
+ "step: 1780000, loss: 5.301342\n",
|
|
|
+ "step: 1790000, loss: 4.777030\n",
|
|
|
+ "step: 1800000, loss: 6.912136\n",
|
|
|
+ "Evaluation...\n",
|
|
|
+ "\"five\" nearest neighbors: three, four, six, seven, eight, two, nine, zero,\n",
|
|
|
+ "\"of\" nearest neighbors: the, in, first, from, became, and, following, under,\n",
|
|
|
+ "\"going\" nearest neighbors: others, their, through, which, therefore, open, how, that,\n",
|
|
|
+ "\"hardware\" nearest neighbors: computer, systems, include, research, standard, different, system, small,\n",
|
|
|
+ "\"american\" nearest neighbors: b, d, born, actor, UNK, english, nine, german,\n",
|
|
|
+ "\"britain\" nearest neighbors: china, country, europe, against, canada, military, island, including,\n",
|
|
|
+ "step: 1810000, loss: 5.584600\n",
|
|
|
+ "step: 1820000, loss: 5.619820\n",
|
|
|
+ "step: 1830000, loss: 6.078709\n",
|
|
|
+ "step: 1840000, loss: 5.052518\n",
|
|
|
+ "step: 1850000, loss: 5.430106\n",
|
|
|
+ "step: 1860000, loss: 7.396770\n",
|
|
|
+ "step: 1870000, loss: 5.344787\n",
|
|
|
+ "step: 1880000, loss: 5.937998\n",
|
|
|
+ "step: 1890000, loss: 5.706491\n",
|
|
|
+ "step: 1900000, loss: 5.140662\n",
|
|
|
+ "step: 1910000, loss: 5.607048\n",
|
|
|
+ "step: 1920000, loss: 5.407231\n",
|
|
|
+ "step: 1930000, loss: 6.238531\n",
|
|
|
+ "step: 1940000, loss: 5.567973\n",
|
|
|
+ "step: 1950000, loss: 4.894245\n",
|
|
|
+ "step: 1960000, loss: 6.104193\n",
|
|
|
+ "step: 1970000, loss: 5.282631\n",
|
|
|
+ "step: 1980000, loss: 6.189069\n",
|
|
|
+ "step: 1990000, loss: 6.169409\n",
|
|
|
+ "step: 2000000, loss: 6.470152\n",
|
|
|
+ "Evaluation...\n",
|
|
|
+ "\"five\" nearest neighbors: four, three, six, seven, eight, two, nine, zero,\n",
|
|
|
+ "\"of\" nearest neighbors: the, its, in, with, and, including, within, against,\n",
|
|
|
+ "\"going\" nearest neighbors: others, only, therefore, will, how, a, far, though,\n",
|
|
|
+ "\"hardware\" nearest neighbors: computer, systems, for, network, software, program, research, system,\n",
|
|
|
+ "\"american\" nearest neighbors: born, actor, d, italian, german, john, robert, b,\n",
|
|
|
+ "\"britain\" nearest neighbors: china, country, europe, canada, british, former, island, france,\n",
|
|
|
+ "step: 2010000, loss: 5.298714\n",
|
|
|
+ "step: 2020000, loss: 5.494207\n",
|
|
|
+ "step: 2030000, loss: 5.410875\n",
|
|
|
+ "step: 2040000, loss: 6.228232\n",
|
|
|
+ "step: 2050000, loss: 5.044596\n",
|
|
|
+ "step: 2060000, loss: 4.624638\n",
|
|
|
+ "step: 2070000, loss: 4.919327\n",
|
|
|
+ "step: 2080000, loss: 4.639625\n",
|
|
|
+ "step: 2090000, loss: 4.865627\n",
|
|
|
+ "step: 2100000, loss: 4.951073\n",
|
|
|
+ "step: 2110000, loss: 5.973768\n",
|
|
|
+ "step: 2120000, loss: 7.366824\n",
|
|
|
+ "step: 2130000, loss: 5.149571\n",
|
|
|
+ "step: 2140000, loss: 7.846234\n",
|
|
|
+ "step: 2150000, loss: 5.449315\n",
|
|
|
+ "step: 2160000, loss: 5.359211\n",
|
|
|
+ "step: 2170000, loss: 5.171029\n",
|
|
|
+ "step: 2180000, loss: 6.106437\n",
|
|
|
+ "step: 2190000, loss: 6.043995\n",
|
|
|
+ "step: 2200000, loss: 5.642351\n",
|
|
|
+ "Evaluation...\n",
|
|
|
+ "\"five\" nearest neighbors: four, three, six, two, eight, seven, zero, one,\n",
|
|
|
+ "\"of\" nearest neighbors: the, and, its, see, for, in, with, including,\n",
|
|
|
+ "\"going\" nearest neighbors: others, therefore, how, even, them, your, have, although,\n",
|
|
|
+ "\"hardware\" nearest neighbors: computer, systems, system, network, program, research, software, include,\n",
|
|
|
+ "\"american\" nearest neighbors: english, french, german, canadian, british, film, author, italian,\n",
|
|
|
+ "\"britain\" nearest neighbors: europe, china, country, germany, british, england, france, throughout,\n",
|
|
|
+ "step: 2210000, loss: 4.427110\n",
|
|
|
+ "step: 2220000, loss: 6.240989\n",
|
|
|
+ "step: 2230000, loss: 5.184978\n",
|
|
|
+ "step: 2240000, loss: 8.035570\n",
|
|
|
+ "step: 2250000, loss: 5.793781\n",
|
|
|
+ "step: 2260000, loss: 4.908427\n",
|
|
|
+ "step: 2270000, loss: 8.807668\n",
|
|
|
+ "step: 2280000, loss: 6.083229\n",
|
|
|
+ "step: 2290000, loss: 5.773360\n",
|
|
|
+ "step: 2300000, loss: 5.613671\n",
|
|
|
+ "step: 2310000, loss: 6.080076\n",
|
|
|
+ "step: 2320000, loss: 5.288568\n",
|
|
|
+ "step: 2330000, loss: 5.949232\n",
|
|
|
+ "step: 2340000, loss: 5.479994\n",
|
|
|
+ "step: 2350000, loss: 7.717686\n",
|
|
|
+ "step: 2360000, loss: 5.163609\n",
|
|
|
+ "step: 2370000, loss: 5.989407\n",
|
|
|
+ "step: 2380000, loss: 5.785729\n",
|
|
|
+ "step: 2390000, loss: 5.345478\n",
|
|
|
+ "step: 2400000, loss: 6.627133\n",
|
|
|
+ "Evaluation...\n",
|
|
|
+ "\"five\" nearest neighbors: three, four, six, two, seven, eight, zero, nine,\n",
|
|
|
+ "\"of\" nearest neighbors: the, in, and, including, from, within, its, with,\n",
|
|
|
+ "\"going\" nearest neighbors: therefore, people, they, out, only, according, your, now,\n",
|
|
|
+ "\"hardware\" nearest neighbors: computer, systems, network, program, system, software, run, design,\n",
|
|
|
+ "\"american\" nearest neighbors: author, born, actor, english, canadian, british, italian, d,\n",
|
|
|
+ "\"britain\" nearest neighbors: china, europe, country, throughout, france, canada, england, western,\n",
|
|
|
+ "step: 2410000, loss: 5.666146\n",
|
|
|
+ "step: 2420000, loss: 5.316198\n",
|
|
|
+ "step: 2430000, loss: 5.129625\n",
|
|
|
+ "step: 2440000, loss: 5.247949\n",
|
|
|
+ "step: 2450000, loss: 5.741394\n",
|
|
|
+ "step: 2460000, loss: 5.833083\n",
|
|
|
+ "step: 2470000, loss: 7.704844\n",
|
|
|
+ "step: 2480000, loss: 5.398345\n",
|
|
|
+ "step: 2490000, loss: 5.089633\n",
|
|
|
+ "step: 2500000, loss: 5.620508\n",
|
|
|
+ "step: 2510000, loss: 4.976034\n",
|
|
|
+ "step: 2520000, loss: 5.884676\n",
|
|
|
+ "step: 2530000, loss: 6.649922\n",
|
|
|
+ "step: 2540000, loss: 5.002588\n",
|
|
|
+ "step: 2550000, loss: 5.072144\n",
|
|
|
+ "step: 2560000, loss: 5.165375\n",
|
|
|
+ "step: 2570000, loss: 5.310089\n",
|
|
|
+ "step: 2580000, loss: 5.481957\n",
|
|
|
+ "step: 2590000, loss: 6.104440\n",
|
|
|
+ "step: 2600000, loss: 5.339644\n",
|
|
|
+ "Evaluation...\n",
|
|
|
+ "\"five\" nearest neighbors: three, four, six, seven, eight, nine, two, zero,\n",
|
|
|
+ "\"of\" nearest neighbors: the, first, from, with, became, in, following, and,\n",
|
|
|
+ "\"going\" nearest neighbors: how, therefore, back, will, through, always, your, make,\n",
|
|
|
+ "\"hardware\" nearest neighbors: computer, systems, system, network, program, technology, design, software,\n",
|
|
|
+ "\"american\" nearest neighbors: actor, singer, born, b, author, d, english, writer,\n",
|
|
|
+ "\"britain\" nearest neighbors: europe, china, throughout, great, england, france, country, india,\n",
|
|
|
+ "step: 2610000, loss: 7.754117\n",
|
|
|
+ "step: 2620000, loss: 5.979313\n",
|
|
|
+ "step: 2630000, loss: 5.394362\n",
|
|
|
+ "step: 2640000, loss: 4.866740\n",
|
|
|
+ "step: 2650000, loss: 5.219806\n",
|
|
|
+ "step: 2660000, loss: 6.074809\n",
|
|
|
+ "step: 2670000, loss: 6.216953\n",
|
|
|
+ "step: 2680000, loss: 5.944881\n",
|
|
|
+ "step: 2690000, loss: 5.863350\n",
|
|
|
+ "step: 2700000, loss: 6.128705\n",
|
|
|
+ "step: 2710000, loss: 5.502523\n",
|
|
|
+ "step: 2720000, loss: 5.300839\n",
|
|
|
+ "step: 2730000, loss: 6.358493\n",
|
|
|
+ "step: 2740000, loss: 6.058306\n",
|
|
|
+ "step: 2750000, loss: 4.689510\n",
|
|
|
+ "step: 2760000, loss: 6.032880\n",
|
|
|
+ "step: 2770000, loss: 5.844904\n",
|
|
|
+ "step: 2780000, loss: 5.385874\n",
|
|
|
+ "step: 2790000, loss: 5.370956\n",
|
|
|
+ "step: 2800000, loss: 4.912577\n",
|
|
|
+ "Evaluation...\n",
|
|
|
+ "\"five\" nearest neighbors: four, six, three, eight, seven, two, nine, one,\n",
|
|
|
+ "\"of\" nearest neighbors: in, the, and, from, including, following, with, under,\n",
|
|
|
+ "\"going\" nearest neighbors: your, then, through, will, how, so, back, even,\n",
|
|
|
+ "\"hardware\" nearest neighbors: computer, systems, program, network, design, standard, physical, software,\n",
|
|
|
+ "\"american\" nearest neighbors: actor, singer, born, author, writer, canadian, italian, d,\n",
|
|
|
+ "\"britain\" nearest neighbors: europe, china, england, throughout, france, india, great, germany,\n",
|
|
|
+ "step: 2810000, loss: 5.897756\n",
|
|
|
+ "step: 2820000, loss: 7.194932\n",
|
|
|
+ "step: 2830000, loss: 7.430175\n",
|
|
|
+ "step: 2840000, loss: 7.258231\n",
|
|
|
+ "step: 2850000, loss: 5.837617\n",
|
|
|
+ "step: 2860000, loss: 5.496673\n",
|
|
|
+ "step: 2870000, loss: 6.173716\n",
|
|
|
+ "step: 2880000, loss: 6.095749\n",
|
|
|
+ "step: 2890000, loss: 6.064944\n",
|
|
|
+ "step: 2900000, loss: 5.560488\n"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "name": "stdout",
|
|
|
+ "output_type": "stream",
|
|
|
+ "text": [
|
|
|
+ "step: 2910000, loss: 4.966107\n",
|
|
|
+ "step: 2920000, loss: 5.789579\n",
|
|
|
+ "step: 2930000, loss: 4.525987\n",
|
|
|
+ "step: 2940000, loss: 6.704808\n",
|
|
|
+ "step: 2950000, loss: 4.506433\n",
|
|
|
+ "step: 2960000, loss: 6.251270\n",
|
|
|
+ "step: 2970000, loss: 5.588204\n",
|
|
|
+ "step: 2980000, loss: 5.423235\n",
|
|
|
+ "step: 2990000, loss: 5.613834\n",
|
|
|
+ "step: 3000000, loss: 5.137326\n",
|
|
|
+ "Evaluation...\n",
|
|
|
+ "\"five\" nearest neighbors: four, three, six, seven, eight, two, zero, one,\n",
|
|
|
+ "\"of\" nearest neighbors: the, including, and, with, in, its, includes, within,\n",
|
|
|
+ "\"going\" nearest neighbors: how, they, when, them, make, always, your, though,\n",
|
|
|
+ "\"hardware\" nearest neighbors: computer, systems, network, program, physical, design, technology, software,\n",
|
|
|
+ "\"american\" nearest neighbors: canadian, english, australian, british, german, film, italian, author,\n",
|
|
|
+ "\"britain\" nearest neighbors: europe, england, china, throughout, india, france, great, british,\n"
|
|
|
+ ]
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "source": [
|
|
|
+ "# Words for testing.\n",
|
|
|
+ "x_test = np.array([word2id[w] for w in eval_words])\n",
|
|
|
+ "\n",
|
|
|
+ "# Run training for the given number of steps.\n",
|
|
|
+ "for step in xrange(1, num_steps + 1):\n",
|
|
|
+ " batch_x, batch_y = next_batch(batch_size, num_skips, skip_window)\n",
|
|
|
+ " run_optimization(batch_x, batch_y)\n",
|
|
|
+ " \n",
|
|
|
+ " if step % display_step == 0 or step == 1:\n",
|
|
|
+ " loss = nce_loss(get_embedding(batch_x), batch_y)\n",
|
|
|
+ " print(\"step: %i, loss: %f\" % (step, loss))\n",
|
|
|
+ " \n",
|
|
|
+ " # Evaluation.\n",
|
|
|
+ " if step % eval_step == 0 or step == 1:\n",
|
|
|
+ " print(\"Evaluation...\")\n",
|
|
|
+ " sim = evaluate(get_embedding(x_test)).numpy()\n",
|
|
|
+ " for i in xrange(len(eval_words)):\n",
|
|
|
+ " top_k = 8 # number of nearest neighbors.\n",
|
|
|
+ " nearest = (-sim[i, :]).argsort()[1:top_k + 1]\n",
|
|
|
+ " log_str = '\"%s\" nearest neighbors:' % eval_words[i]\n",
|
|
|
+ " for k in xrange(top_k):\n",
|
|
|
+ " log_str = '%s %s,' % (log_str, id2word[nearest[k]])\n",
|
|
|
+ " print(log_str)"
|
|
|
+ ]
|
|
|
+ }
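+ ,
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Illustrative helper (not in the original example): after training, query the\n",
+ "# nearest neighbors of any in-vocabulary word, reusing the cosine-similarity\n",
+ "# evaluation above.\n",
+ "def print_neighbors(word, top_k=8):\n",
+ "    sim = evaluate(get_embedding(np.array([word2id[word]]))).numpy()\n",
+ "    nearest = (-sim[0, :]).argsort()[1:top_k + 1]\n",
+ "    print('\"%s\" nearest neighbors:' % word, ', '.join(id2word[i] for i in nearest))\n",
+ "\n",
+ "print_neighbors('computer')"
+ ]
+ }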
|
|
|
+ ],
|
|
|
+ "metadata": {
|
|
|
+ "kernelspec": {
|
|
|
+ "display_name": "Python 2",
|
|
|
+ "language": "python",
|
|
|
+ "name": "python2"
|
|
|
+ },
|
|
|
+ "language_info": {
|
|
|
+ "codemirror_mode": {
|
|
|
+ "name": "ipython",
|
|
|
+ "version": 2
|
|
|
+ },
|
|
|
+ "file_extension": ".py",
|
|
|
+ "mimetype": "text/x-python",
|
|
|
+ "name": "python",
|
|
|
+ "nbconvert_exporter": "python",
|
|
|
+ "pygments_lexer": "ipython2",
|
|
|
+ "version": "2.7.15"
|
|
|
+ }
|
|
|
+ },
|
|
|
+ "nbformat": 4,
|
|
|
+ "nbformat_minor": 2
|
|
|
+}
|