{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Word2Vec (Word Embedding)\n", "\n", "Implement Word2Vec algorithm to compute vector representations of words.\n", "This example is using a small chunk of Wikipedia articles to train from.\n", "\n", "More info: [Mikolov, Tomas et al. \"Efficient Estimation of Word Representations in Vector Space.\", 2013](https://arxiv.org/pdf/1301.3781.pdf)\n", "\n", "\n", "- Author: Aymeric Damien\n", "- Project: https://github.com/aymericdamien/TensorFlow-Examples/" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from __future__ import division, print_function, absolute_import\n", "\n", "import collections\n", "import os\n", "import random\n", "import urllib\n", "import zipfile\n", "\n", "import numpy as np\n", "import tensorflow as tf" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Training Parameters\n", "learning_rate = 0.1\n", "batch_size = 128\n", "num_steps = 3000000\n", "display_step = 10000\n", "eval_step = 200000\n", "\n", "# Evaluation Parameters\n", "eval_words = ['five', 'of', 'going', 'hardware', 'american', 'britain']\n", "\n", "# Word2Vec Parameters\n", "embedding_size = 200 # Dimension of the embedding vector\n", "max_vocabulary_size = 50000 # Total number of different words in the vocabulary\n", "min_occurrence = 10 # Remove all words that does not appears at least n times\n", "skip_window = 3 # How many words to consider left and right\n", "num_skips = 2 # How many times to reuse an input to generate a label\n", "num_sampled = 64 # Number of negative examples to sample" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Downloading the dataset... (It may take some time)\n", "Done!\n" ] } ], "source": [ "# Download a small chunk of Wikipedia articles collection\n", "url = 'http://mattmahoney.net/dc/text8.zip'\n", "data_path = 'text8.zip'\n", "if not os.path.exists(data_path):\n", " print(\"Downloading the dataset... (It may take some time)\")\n", " filename, _ = urllib.urlretrieve(url, data_path)\n", " print(\"Done!\")\n", "# Unzip the dataset file. 
{ "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Downloading the dataset... (It may take some time)\n", "Done!\n" ] } ], "source": [ "# Download a small chunk of the Wikipedia articles collection\n", "url = 'http://mattmahoney.net/dc/text8.zip'\n", "data_path = 'text8.zip'\n", "if not os.path.exists(data_path):\n", "    print(\"Downloading the dataset... (It may take some time)\")\n", "    filename, _ = urllib.urlretrieve(url, data_path)\n", "    print(\"Done!\")\n", "# Unzip the dataset file; the text has already been pre-processed\n", "with zipfile.ZipFile(data_path) as f:\n", "    text_words = f.read(f.namelist()[0]).lower().split()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Words count: 17005207\n", "Unique words: 253854\n", "Vocabulary size: 50000\n", "Most common words: [('UNK', 418391), ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764), ('in', 372201), ('a', 325873), ('to', 316376), ('zero', 264975), ('nine', 250430)]\n" ] } ], "source": [ "# Build the dictionary and replace rare words with the UNK token\n", "count = [('UNK', -1)]\n", "# Retrieve the most common words\n", "count.extend(collections.Counter(text_words).most_common(max_vocabulary_size - 1))\n", "# Remove words with fewer than 'min_occurrence' occurrences\n", "for i in range(len(count) - 1, -1, -1):\n", "    if count[i][1] < min_occurrence:\n", "        count.pop(i)\n", "    else:\n", "        # The collection is ordered by frequency, so stop once 'min_occurrence' is reached\n", "        break\n", "# Compute the vocabulary size\n", "vocabulary_size = len(count)\n", "# Assign an id to each word\n", "word2id = dict()\n", "for i, (word, _) in enumerate(count):\n", "    word2id[word] = i\n", "\n", "data = list()\n", "unk_count = 0\n", "for word in text_words:\n", "    # Retrieve a word id, or assign it index 0 ('UNK') if not in the dictionary\n", "    index = word2id.get(word, 0)\n", "    if index == 0:\n", "        unk_count += 1\n", "    data.append(index)\n", "count[0] = ('UNK', unk_count)\n", "id2word = dict(zip(word2id.values(), word2id.keys()))\n", "\n", "print(\"Words count:\", len(text_words))\n", "print(\"Unique words:\", len(set(text_words)))\n", "print(\"Vocabulary size:\", vocabulary_size)\n", "print(\"Most common words:\", count[:10])" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "data_index = 0\n", "# Generate a training batch for the skip-gram model\n", "def next_batch(batch_size, num_skips, skip_window):\n", "    global data_index\n", "    assert batch_size % num_skips == 0\n", "    assert num_skips <= 2 * skip_window\n", "    batch = np.ndarray(shape=(batch_size), dtype=np.int32)\n", "    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)\n", "    # Get the window size (words left and right + current one)\n", "    span = 2 * skip_window + 1\n", "    buffer = collections.deque(maxlen=span)\n", "    if data_index + span > len(data):\n", "        data_index = 0\n", "    buffer.extend(data[data_index:data_index + span])\n", "    data_index += span\n", "    for i in range(batch_size // num_skips):\n", "        context_words = [w for w in range(span) if w != skip_window]\n", "        words_to_use = random.sample(context_words, num_skips)\n", "        for j, context_word in enumerate(words_to_use):\n", "            batch[i * num_skips + j] = buffer[skip_window]\n", "            labels[i * num_skips + j, 0] = buffer[context_word]\n", "        if data_index == len(data):\n", "            buffer.extend(data[0:span])\n", "            data_index = span\n", "        else:\n", "            buffer.append(data[data_index])\n", "            data_index += 1\n", "    # Backtrack a little bit to avoid skipping words at the end of a batch\n", "    data_index = (data_index + len(data) - span) % len(data)\n", "    return batch, labels" ] },
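{ "cell_type": "markdown", "metadata": {}, "source": [ "A quick sanity check of the batch generator (a sketch added for illustration, not part of the original pipeline): draw one small batch, decode the ids back to words, then reset `data_index` so the training loop below starts from the beginning of the corpus." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Sketch: inspect a few (center -> context) pairs produced by next_batch\n", "demo_batch, demo_labels = next_batch(8, num_skips, skip_window)\n", "for center, context in zip(demo_batch, demo_labels[:, 0]):\n", "    print('%s -> %s' % (id2word[center], id2word[context]))\n", "# Reset the corpus pointer so training starts from the beginning\n", "data_index = 0" ] },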
{ "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Input data\n", "X = tf.placeholder(tf.int32, shape=[None])\n", "# Input label\n", "Y = tf.placeholder(tf.int32, shape=[None, 1])\n", "\n", "# Ensure the following ops & var are assigned on CPU\n", "# (some ops are not compatible with GPU)\n", "with tf.device('/cpu:0'):\n", "    # Create the embedding variable (each row represents a word embedding vector)\n", "    embedding = tf.Variable(tf.random_normal([vocabulary_size, embedding_size]))\n", "    # Look up the corresponding embedding vectors for each sample in X\n", "    X_embed = tf.nn.embedding_lookup(embedding, X)\n", "\n", "    # Construct the variables for the NCE loss\n", "    nce_weights = tf.Variable(tf.random_normal([vocabulary_size, embedding_size]))\n", "    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))\n", "\n", "# Compute the average NCE loss for the batch\n", "loss_op = tf.reduce_mean(\n", "    tf.nn.nce_loss(weights=nce_weights,\n", "                   biases=nce_biases,\n", "                   labels=Y,\n", "                   inputs=X_embed,\n", "                   num_sampled=num_sampled,\n", "                   num_classes=vocabulary_size))\n", "\n", "# Define the optimizer\n", "optimizer = tf.train.GradientDescentOptimizer(learning_rate)\n", "train_op = optimizer.minimize(loss_op)\n", "\n", "# Evaluation\n", "# Compute the cosine similarity between the input embedding and every embedding vector\n", "# (normalize each row to unit length so a dot product is a cosine similarity)\n", "X_embed_norm = X_embed / tf.sqrt(tf.reduce_sum(tf.square(X_embed), 1, keepdims=True))\n", "embedding_norm = embedding / tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keepdims=True))\n", "cosine_sim_op = tf.matmul(X_embed_norm, embedding_norm, transpose_b=True)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Step 1, Average Loss= 520.3188\n", "Evaluation...\n", "\"five\" nearest neighbors: brothers, swinging, dissemination, fruitful, trichloride, dll, timur, torre,\n", "\"of\" nearest neighbors: malting, vaginal, cecil, xiaoping, arrangers, hydras, exhibits, splits,\n", "\"going\" nearest neighbors: besht, xps, sdtv, mississippi, frequencies, tora, reciprocating, tursiops,\n", "\"hardware\" nearest neighbors: burgh, residences, mares, attested, whirlwind, isomerism, admiration, ties,\n", "\"american\" nearest neighbors: tensile, months, baffling, cricket, kodak, risky, nicomedia, jura,\n", "\"britain\" nearest neighbors: superstring, interpretations, genealogical, munition, boer, occasional, psychologists, turbofan,\n", "Step 10000, Average Loss= 202.2640\n", "Step 20000, Average Loss= 96.5149\n", "Step 30000, Average Loss= 67.2858\n", "Step 40000, Average Loss= 52.5055\n", "Step 50000, Average Loss= 42.6301\n", "Step 60000, Average Loss= 37.3644\n", "Step 70000, Average Loss= 33.1220\n", "Step 80000, Average Loss= 30.5835\n", "Step 90000, Average Loss= 28.2243\n", "Step 100000, Average Loss= 25.5532\n", "Step 110000, Average Loss= 24.0891\n", "Step 120000, Average Loss= 21.8576\n", "Step 130000, Average Loss= 21.2192\n", "Step 140000, Average Loss= 19.8834\n", "Step 150000, Average Loss= 19.3362\n", "Step 160000, Average Loss= 18.3129\n", "Step 170000, Average Loss= 17.4952\n", "Step 180000, Average Loss= 16.8531\n", "Step 190000, Average Loss= 15.9615\n", "Step 200000, Average Loss= 15.0718\n", "Evaluation...\n", "\"five\" nearest neighbors: three, four, eight, six, seven, two, nine, one,\n", "\"of\" nearest neighbors: the, is, a, was, with, in, and, on,\n", "\"going\" nearest neighbors: time, military, called, with, used, state, most, new,\n", "\"hardware\" nearest neighbors: deaths, system, three, at, zero, two, s, UNK,\n", "\"american\" nearest neighbors: UNK, and, s, about, in, when, from, after,\n", "\"britain\" nearest neighbors: years, were, from, both, of, these, is, many,\n", "Step 210000, Average Loss= 
14.9267\n", "Step 220000, Average Loss= 15.4700\n", "Step 230000, Average Loss= 14.0867\n", "Step 240000, Average Loss= 14.5337\n", "Step 250000, Average Loss= 13.2458\n", "Step 260000, Average Loss= 13.2944\n", "Step 270000, Average Loss= 13.0396\n", "Step 280000, Average Loss= 12.1902\n", "Step 290000, Average Loss= 11.7444\n", "Step 300000, Average Loss= 11.8473\n", "Step 310000, Average Loss= 11.1306\n", "Step 320000, Average Loss= 11.1699\n", "Step 330000, Average Loss= 10.8638\n", "Step 340000, Average Loss= 10.7910\n", "Step 350000, Average Loss= 11.0721\n", "Step 360000, Average Loss= 10.6309\n", "Step 370000, Average Loss= 10.4836\n", "Step 380000, Average Loss= 10.3482\n", "Step 390000, Average Loss= 10.0679\n", "Step 400000, Average Loss= 10.0070\n", "Evaluation...\n", "\"five\" nearest neighbors: four, three, six, seven, eight, two, one, zero,\n", "\"of\" nearest neighbors: and, in, the, a, for, by, is, while,\n", "\"going\" nearest neighbors: name, called, made, military, music, people, city, was,\n", "\"hardware\" nearest neighbors: power, a, john, the, has, see, and, system,\n", "\"american\" nearest neighbors: s, british, UNK, john, in, during, and, from,\n", "\"britain\" nearest neighbors: from, general, are, before, first, after, history, was,\n", "Step 410000, Average Loss= 10.1151\n", "Step 420000, Average Loss= 9.5719\n", "Step 430000, Average Loss= 9.8267\n", "Step 440000, Average Loss= 9.4704\n", "Step 450000, Average Loss= 9.5561\n", "Step 460000, Average Loss= 9.1479\n", "Step 470000, Average Loss= 8.8914\n", "Step 480000, Average Loss= 9.0281\n", "Step 490000, Average Loss= 9.3139\n", "Step 500000, Average Loss= 9.1559\n", "Step 510000, Average Loss= 8.8257\n", "Step 520000, Average Loss= 8.9081\n", "Step 530000, Average Loss= 8.8572\n", "Step 540000, Average Loss= 8.5835\n", "Step 550000, Average Loss= 8.4495\n", "Step 560000, Average Loss= 8.4193\n", "Step 570000, Average Loss= 8.3399\n", "Step 580000, Average Loss= 8.1633\n", "Step 590000, Average Loss= 8.2914\n", "Step 600000, Average Loss= 8.0268\n", "Evaluation...\n", "\"five\" nearest neighbors: three, four, six, two, seven, eight, one, zero,\n", "\"of\" nearest neighbors: and, the, in, including, with, for, on, or,\n", "\"going\" nearest neighbors: popular, king, his, music, and, time, name, being,\n", "\"hardware\" nearest neighbors: power, over, then, than, became, at, less, for,\n", "\"american\" nearest neighbors: english, s, german, in, french, since, john, between,\n", "\"britain\" nearest neighbors: however, were, state, first, group, general, from, second,\n", "Step 610000, Average Loss= 8.1733\n", "Step 620000, Average Loss= 8.2522\n", "Step 630000, Average Loss= 8.0434\n", "Step 640000, Average Loss= 8.0930\n", "Step 650000, Average Loss= 7.8770\n", "Step 660000, Average Loss= 7.9221\n", "Step 670000, Average Loss= 7.7645\n", "Step 680000, Average Loss= 7.9534\n", "Step 690000, Average Loss= 7.7507\n", "Step 700000, Average Loss= 7.7499\n", "Step 710000, Average Loss= 7.6629\n", "Step 720000, Average Loss= 7.6055\n", "Step 730000, Average Loss= 7.4779\n", "Step 740000, Average Loss= 7.3182\n", "Step 750000, Average Loss= 7.6399\n", "Step 760000, Average Loss= 7.4364\n", "Step 770000, Average Loss= 7.6509\n", "Step 780000, Average Loss= 7.3204\n", "Step 790000, Average Loss= 7.4101\n", "Step 800000, Average Loss= 7.4354\n", "Evaluation...\n", "\"five\" nearest neighbors: three, four, six, seven, eight, two, one, nine,\n", "\"of\" nearest neighbors: and, the, its, a, with, at, in, for,\n", 
"\"going\" nearest neighbors: were, man, music, now, great, support, popular, her,\n", "\"hardware\" nearest neighbors: power, system, then, military, high, against, since, international,\n", "\"american\" nearest neighbors: english, british, born, b, john, french, d, german,\n", "\"britain\" nearest neighbors: government, second, before, from, state, several, the, at,\n", "Step 810000, Average Loss= 7.2603\n", "Step 820000, Average Loss= 7.1646\n", "Step 830000, Average Loss= 7.3155\n", "Step 840000, Average Loss= 7.1274\n", "Step 850000, Average Loss= 7.1237\n", "Step 860000, Average Loss= 7.1528\n", "Step 870000, Average Loss= 7.0673\n", "Step 880000, Average Loss= 7.2167\n", "Step 890000, Average Loss= 7.1359\n", "Step 900000, Average Loss= 7.0940\n", "Step 910000, Average Loss= 7.1114\n", "Step 920000, Average Loss= 6.9328\n", "Step 930000, Average Loss= 7.0108\n", "Step 940000, Average Loss= 7.0630\n", "Step 950000, Average Loss= 6.8371\n", "Step 960000, Average Loss= 7.0466\n", "Step 970000, Average Loss= 6.8331\n", "Step 980000, Average Loss= 6.9670\n", "Step 990000, Average Loss= 6.7357\n", "Step 1000000, Average Loss= 6.6453\n", "Evaluation...\n", "\"five\" nearest neighbors: four, three, six, eight, seven, two, nine, zero,\n", "\"of\" nearest neighbors: the, became, including, first, second, from, following, and,\n", "\"going\" nearest neighbors: near, music, popular, made, while, his, works, most,\n", "\"hardware\" nearest neighbors: power, system, before, its, using, for, thus, an,\n", "\"american\" nearest neighbors: b, born, d, UNK, nine, john, english, seven,\n", "\"britain\" nearest neighbors: of, following, government, home, from, state, end, several,\n", "Step 1010000, Average Loss= 6.7193\n", "Step 1020000, Average Loss= 6.9297\n", "Step 1030000, Average Loss= 6.7905\n", "Step 1040000, Average Loss= 6.7709\n", "Step 1050000, Average Loss= 6.7337\n", "Step 1060000, Average Loss= 6.7617\n", "Step 1070000, Average Loss= 6.7489\n", "Step 1080000, Average Loss= 6.6259\n", "Step 1090000, Average Loss= 6.6415\n", "Step 1100000, Average Loss= 6.7209\n", "Step 1110000, Average Loss= 6.5471\n", "Step 1120000, Average Loss= 6.6508\n", "Step 1130000, Average Loss= 6.5184\n", "Step 1140000, Average Loss= 6.6202\n", "Step 1150000, Average Loss= 6.7205\n", "Step 1160000, Average Loss= 6.5821\n", "Step 1170000, Average Loss= 6.6200\n", "Step 1180000, Average Loss= 6.5089\n", "Step 1190000, Average Loss= 6.5587\n", "Step 1200000, Average Loss= 6.4930\n", "Evaluation...\n", "\"five\" nearest neighbors: three, four, six, seven, eight, two, nine, zero,\n", "\"of\" nearest neighbors: the, and, including, in, first, with, following, from,\n", "\"going\" nearest neighbors: near, popular, works, today, large, now, when, both,\n", "\"hardware\" nearest neighbors: power, system, computer, its, both, for, using, which,\n", "\"american\" nearest neighbors: born, d, john, german, b, UNK, english, s,\n", "\"britain\" nearest neighbors: state, following, government, home, became, people, were, the,\n", "Step 1210000, Average Loss= 6.5985\n", "Step 1220000, Average Loss= 6.4534\n", "Step 1230000, Average Loss= 6.5083\n", "Step 1240000, Average Loss= 6.4913\n", "Step 1250000, Average Loss= 6.4326\n", "Step 1260000, Average Loss= 6.3891\n", "Step 1270000, Average Loss= 6.1601\n", "Step 1280000, Average Loss= 6.4479\n", "Step 1290000, Average Loss= 6.3813\n", "Step 1300000, Average Loss= 6.5335\n", "Step 1310000, Average Loss= 6.2971\n", "Step 1320000, Average Loss= 6.3723\n", "Step 1330000, Average 
Loss= 6.4234\n", "Step 1340000, Average Loss= 6.3130\n", "Step 1350000, Average Loss= 6.2867\n", "Step 1360000, Average Loss= 6.3505\n", "Step 1370000, Average Loss= 6.2990\n", "Step 1380000, Average Loss= 6.3012\n", "Step 1390000, Average Loss= 6.3112\n", "Step 1400000, Average Loss= 6.2680\n", "Evaluation...\n", "\"five\" nearest neighbors: four, three, six, two, seven, eight, one, zero,\n", "\"of\" nearest neighbors: the, its, and, including, in, with, see, for,\n", "\"going\" nearest neighbors: near, great, like, today, began, called, an, another,\n", "\"hardware\" nearest neighbors: power, computer, system, for, program, high, control, small,\n", "\"american\" nearest neighbors: english, german, french, born, john, british, s, references,\n", "\"britain\" nearest neighbors: state, great, government, people, following, became, along, home,\n", "Step 1410000, Average Loss= 6.3157\n", "Step 1420000, Average Loss= 6.3466\n", "Step 1430000, Average Loss= 6.3090\n", "Step 1440000, Average Loss= 6.3330\n", "Step 1450000, Average Loss= 6.2072\n", "Step 1460000, Average Loss= 6.2363\n", "Step 1470000, Average Loss= 6.2736\n", "Step 1480000, Average Loss= 6.1793\n", "Step 1490000, Average Loss= 6.2977\n", "Step 1500000, Average Loss= 6.1899\n", "Step 1510000, Average Loss= 6.2381\n", "Step 1520000, Average Loss= 6.1027\n", "Step 1530000, Average Loss= 6.0046\n", "Step 1540000, Average Loss= 6.0747\n", "Step 1550000, Average Loss= 6.2524\n", "Step 1560000, Average Loss= 6.1247\n", "Step 1570000, Average Loss= 6.1937\n", "Step 1580000, Average Loss= 6.0450\n", "Step 1590000, Average Loss= 6.1556\n", "Step 1600000, Average Loss= 6.1765\n", "Evaluation...\n", "\"five\" nearest neighbors: three, four, six, two, seven, eight, one, zero,\n", "\"of\" nearest neighbors: the, and, its, for, from, modern, in, part,\n", "\"going\" nearest neighbors: great, today, once, now, while, her, like, by,\n", "\"hardware\" nearest neighbors: power, system, high, program, control, computer, typically, making,\n", "\"american\" nearest neighbors: born, english, british, german, john, french, b, d,\n", "\"britain\" nearest neighbors: country, state, home, government, first, following, during, from,\n", "Step 1610000, Average Loss= 6.1029\n", "Step 1620000, Average Loss= 6.0501\n", "Step 1630000, Average Loss= 6.1536\n", "Step 1640000, Average Loss= 6.0483\n", "Step 1650000, Average Loss= 6.1197\n", "Step 1660000, Average Loss= 6.0261\n", "Step 1670000, Average Loss= 6.1012\n", "Step 1680000, Average Loss= 6.1795\n", "Step 1690000, Average Loss= 6.1224\n", "Step 1700000, Average Loss= 6.0896\n", "Step 1710000, Average Loss= 6.0418\n", "Step 1720000, Average Loss= 6.0626\n", "Step 1730000, Average Loss= 6.0214\n", "Step 1740000, Average Loss= 6.1206\n", "Step 1750000, Average Loss= 5.9721\n", "Step 1760000, Average Loss= 6.0782\n", "Step 1770000, Average Loss= 6.0291\n", "Step 1780000, Average Loss= 6.0187\n", "Step 1790000, Average Loss= 5.9761\n", "Step 1800000, Average Loss= 5.7518\n", "Evaluation...\n", "\"five\" nearest neighbors: four, three, six, seven, eight, nine, two, zero,\n", "\"of\" nearest neighbors: the, from, in, became, and, second, first, including,\n", "\"going\" nearest neighbors: today, which, once, little, made, before, now, etc,\n", "\"hardware\" nearest neighbors: computer, power, program, system, high, typically, current, eventually,\n", "\"american\" nearest neighbors: b, d, born, actor, UNK, robert, william, english,\n", "\"britain\" nearest neighbors: government, state, country, from, world, 
great, of, in,\n", "Step 1810000, Average Loss= 5.9839\n", "Step 1820000, Average Loss= 5.9931\n", "Step 1830000, Average Loss= 6.0794\n", "Step 1840000, Average Loss= 5.9072\n", "Step 1850000, Average Loss= 5.9831\n", "Step 1860000, Average Loss= 6.0023\n", "Step 1870000, Average Loss= 5.9375\n", "Step 1880000, Average Loss= 5.9250\n", "Step 1890000, Average Loss= 5.9422\n", "Step 1900000, Average Loss= 5.9339\n", "Step 1910000, Average Loss= 5.9235\n", "Step 1920000, Average Loss= 5.9692\n", "Step 1930000, Average Loss= 5.9022\n", "Step 1940000, Average Loss= 5.9599\n", "Step 1950000, Average Loss= 6.0174\n", "Step 1960000, Average Loss= 5.9530\n", "Step 1970000, Average Loss= 5.9479\n", "Step 1980000, Average Loss= 5.8870\n", "Step 1990000, Average Loss= 5.9271\n", "Step 2000000, Average Loss= 5.8774\n", "Evaluation...\n", "\"five\" nearest neighbors: four, three, six, seven, eight, two, nine, zero,\n", "\"of\" nearest neighbors: and, the, from, in, within, first, including, with,\n", "\"going\" nearest neighbors: today, before, another, little, work, etc, now, him,\n", "\"hardware\" nearest neighbors: computer, program, system, both, making, designed, power, simple,\n", "\"american\" nearest neighbors: actor, born, d, robert, john, b, german, writer,\n", "\"britain\" nearest neighbors: government, state, following, great, england, became, country, from,\n", "Step 2010000, Average Loss= 5.9373\n", "Step 2020000, Average Loss= 5.9113\n", "Step 2030000, Average Loss= 5.9158\n", "Step 2040000, Average Loss= 5.9020\n", "Step 2050000, Average Loss= 5.8608\n", "Step 2060000, Average Loss= 5.7379\n", "Step 2070000, Average Loss= 5.7143\n", "Step 2080000, Average Loss= 5.9379\n", "Step 2090000, Average Loss= 5.8201\n", "Step 2100000, Average Loss= 5.9390\n", "Step 2110000, Average Loss= 5.7295\n", "Step 2120000, Average Loss= 5.8290\n", "Step 2130000, Average Loss= 5.9042\n", "Step 2140000, Average Loss= 5.8367\n", "Step 2150000, Average Loss= 5.7760\n", "Step 2160000, Average Loss= 5.8664\n", "Step 2170000, Average Loss= 5.7974\n", "Step 2180000, Average Loss= 5.8523\n", "Step 2190000, Average Loss= 5.8047\n", "Step 2200000, Average Loss= 5.8172\n", "Evaluation...\n", "\"five\" nearest neighbors: three, four, six, eight, two, seven, one, zero,\n", "\"of\" nearest neighbors: the, with, group, in, its, and, from, including,\n", "\"going\" nearest neighbors: produced, when, today, while, little, before, had, like,\n", "\"hardware\" nearest neighbors: computer, system, power, technology, program, simple, for, designed,\n", "\"american\" nearest neighbors: english, canadian, german, french, author, british, film, born,\n", "\"britain\" nearest neighbors: government, great, state, established, british, england, country, army,\n", "Step 2210000, Average Loss= 5.8847\n", "Step 2220000, Average Loss= 5.8622\n", "Step 2230000, Average Loss= 5.8295\n", "Step 2240000, Average Loss= 5.8484\n", "Step 2250000, Average Loss= 5.7917\n", "Step 2260000, Average Loss= 5.7846\n", "Step 2270000, Average Loss= 5.8307\n", "Step 2280000, Average Loss= 5.7341\n", "Step 2290000, Average Loss= 5.8519\n", "Step 2300000, Average Loss= 5.7792\n", "Step 2310000, Average Loss= 5.8277\n", "Step 2320000, Average Loss= 5.7196\n", "Step 2330000, Average Loss= 5.5469\n", "Step 2340000, Average Loss= 5.7177\n", "Step 2350000, Average Loss= 5.8139\n", "Step 2360000, Average Loss= 5.7849\n", "Step 2370000, Average Loss= 5.7022\n", "Step 2380000, Average Loss= 5.7447\n", "Step 2390000, Average Loss= 5.7667\n", "Step 2400000, Average 
Loss= 5.7625\n", "Evaluation...\n", "\"five\" nearest neighbors: three, four, six, seven, two, eight, zero, nine,\n", "\"of\" nearest neighbors: the, and, from, part, in, following, within, including,\n", "\"going\" nearest neighbors: where, once, little, now, again, while, off, produced,\n", "\"hardware\" nearest neighbors: system, computer, high, power, using, designed, systems, simple,\n", "\"american\" nearest neighbors: author, actor, english, born, writer, british, b, d,\n", "\"britain\" nearest neighbors: great, established, government, england, country, state, army, former,\n", "Step 2410000, Average Loss= 5.6953\n", "Step 2420000, Average Loss= 5.7413\n", "Step 2430000, Average Loss= 5.7242\n", "Step 2440000, Average Loss= 5.7397\n", "Step 2450000, Average Loss= 5.7755\n", "Step 2460000, Average Loss= 5.6881\n", "Step 2470000, Average Loss= 5.7471\n", "Step 2480000, Average Loss= 5.8159\n", "Step 2490000, Average Loss= 5.7452\n", "Step 2500000, Average Loss= 5.7547\n", "Step 2510000, Average Loss= 5.6945\n", "Step 2520000, Average Loss= 5.7318\n", "Step 2530000, Average Loss= 5.6682\n", "Step 2540000, Average Loss= 5.7660\n", "Step 2550000, Average Loss= 5.6956\n", "Step 2560000, Average Loss= 5.7307\n", "Step 2570000, Average Loss= 5.7015\n", "Step 2580000, Average Loss= 5.6932\n", "Step 2590000, Average Loss= 5.6386\n", "Step 2600000, Average Loss= 5.4734\n", "Evaluation...\n", "\"five\" nearest neighbors: four, three, six, seven, eight, nine, two, zero,\n", "\"of\" nearest neighbors: the, and, in, from, became, including, for, with,\n", "\"going\" nearest neighbors: little, again, just, a, now, where, to, for,\n", "\"hardware\" nearest neighbors: computer, program, system, software, designed, systems, technology, current,\n", "\"american\" nearest neighbors: actor, d, writer, b, born, singer, author, robert,\n", "\"britain\" nearest neighbors: great, established, government, england, country, in, from, state,\n", "Step 2610000, Average Loss= 5.7291\n", "Step 2620000, Average Loss= 5.6412\n", "Step 2630000, Average Loss= 5.7485\n", "Step 2640000, Average Loss= 5.5833\n", "Step 2650000, Average Loss= 5.6548\n", "Step 2660000, Average Loss= 5.7159\n", "Step 2670000, Average Loss= 5.6569\n", "Step 2680000, Average Loss= 5.6080\n", "Step 2690000, Average Loss= 5.7037\n", "Step 2700000, Average Loss= 5.6360\n", "Step 2710000, Average Loss= 5.6707\n", "Step 2720000, Average Loss= 5.6811\n", "Step 2730000, Average Loss= 5.6237\n", "Step 2740000, Average Loss= 5.7050\n", "Step 2750000, Average Loss= 5.6991\n", "Step 2760000, Average Loss= 5.6691\n", "Step 2770000, Average Loss= 5.7057\n", "Step 2780000, Average Loss= 5.6162\n", "Step 2790000, Average Loss= 5.6484\n", "Step 2800000, Average Loss= 5.6627\n", "Evaluation...\n", "\"five\" nearest neighbors: four, six, three, seven, eight, nine, two, one,\n", "\"of\" nearest neighbors: the, in, following, including, part, and, from, under,\n", "\"going\" nearest neighbors: again, before, little, away, once, when, eventually, then,\n", "\"hardware\" nearest neighbors: computer, system, software, program, systems, designed, for, design,\n", "\"american\" nearest neighbors: actor, writer, singer, author, born, robert, d, john,\n", "\"britain\" nearest neighbors: established, england, great, government, france, army, the, throughout,\n", "Step 2810000, Average Loss= 5.5900\n", "Step 2820000, Average Loss= 5.7053\n", "Step 2830000, Average Loss= 5.6064\n", "Step 2840000, Average Loss= 5.6891\n", "Step 2850000, Average Loss= 5.5571\n", "Step 
2860000, Average Loss= 5.4490\n", "Step 2870000, Average Loss= 5.5428\n", "Step 2880000, Average Loss= 5.6832\n", "Step 2890000, Average Loss= 5.5973\n", "Step 2900000, Average Loss= 5.5816\n", "Step 2910000, Average Loss= 5.5647\n", "Step 2920000, Average Loss= 5.6001\n", "Step 2930000, Average Loss= 5.6459\n", "Step 2940000, Average Loss= 5.5622\n", "Step 2950000, Average Loss= 5.5707\n", "Step 2960000, Average Loss= 5.6492\n", "Step 2970000, Average Loss= 5.5633\n", "Step 2980000, Average Loss= 5.6323\n", "Step 2990000, Average Loss= 5.5440\n", "Step 3000000, Average Loss= 5.6209\n", "Evaluation...\n", "\"five\" nearest neighbors: four, three, six, eight, seven, two, zero, one,\n", "\"of\" nearest neighbors: the, in, and, including, group, includes, part, from,\n", "\"going\" nearest neighbors: once, again, when, quickly, before, eventually, little, had,\n", "\"hardware\" nearest neighbors: computer, system, software, designed, program, simple, systems, sound,\n", "\"american\" nearest neighbors: canadian, english, author, german, french, british, irish, australian,\n", "\"britain\" nearest neighbors: established, england, great, government, throughout, france, british, northern,\n" ] } ], "source": [ "# Initialize the variables (i.e. assign their default value)\n", "init = tf.global_variables_initializer()\n", "\n", "with tf.Session() as sess:\n", "\n", "    # Run the initializer\n", "    sess.run(init)\n", "\n", "    # Testing data: encode the evaluation words as ids\n", "    x_test = np.array([word2id[w] for w in eval_words])\n", "\n", "    average_loss = 0\n", "    for step in xrange(1, num_steps + 1):\n", "        # Get a new batch of data\n", "        batch_x, batch_y = next_batch(batch_size, num_skips, skip_window)\n", "        # Run the training op\n", "        _, loss = sess.run([train_op, loss_op], feed_dict={X: batch_x, Y: batch_y})\n", "        average_loss += loss\n", "\n", "        if step % display_step == 0 or step == 1:\n", "            if step > 1:\n", "                average_loss /= display_step\n", "            print(\"Step \" + str(step) + \", Average Loss= \" + \\\n", "                  \"{:.4f}\".format(average_loss))\n", "            average_loss = 0\n", "\n", "        # Evaluation\n", "        if step % eval_step == 0 or step == 1:\n", "            print(\"Evaluation...\")\n", "            sim = sess.run(cosine_sim_op, feed_dict={X: x_test})\n", "            for i in xrange(len(eval_words)):\n", "                top_k = 8 # number of nearest neighbors\n", "                # argsort in descending order; skip index 0 (the query word itself)\n", "                nearest = (-sim[i, :]).argsort()[1:top_k + 1]\n", "                log_str = '\"%s\" nearest neighbors:' % eval_words[i]\n", "                for k in xrange(top_k):\n", "                    log_str = '%s %s,' % (log_str, id2word[nearest[k]])\n", "                print(log_str)\n" ] }
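, { "cell_type": "markdown", "metadata": {}, "source": [ "After training, the learned vectors can be reused outside the session. Below is a minimal sketch, assuming a capture line `final_embedding = sess.run(embedding_norm)` is appended as the last statement inside the `with tf.Session()` block above (that capture is an assumption, not part of the original loop). Since rows of `embedding_norm` are unit-length, a plain dot product gives the cosine similarity." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Sketch: nearest-neighbor lookup on the trained vectors.\n", "# Assumes `final_embedding = sess.run(embedding_norm)` was added as the last\n", "# line of the training session above (an assumption, see the note above).\n", "def nearest_words(word, top_k=8):\n", "    # Rows are L2-normalized, so a dot product is a cosine similarity\n", "    sim = np.dot(final_embedding, final_embedding[word2id[word]])\n", "    # argsort in descending order; position 0 is the query word itself, skip it\n", "    return [id2word[i] for i in (-sim).argsort()[1:top_k + 1]]\n", "\n", "print(nearest_words('hardware'))" ] } ], "metadata": { "kernelspec": { "display_name": "Python [default]", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.12" } }, "nbformat": 4, "nbformat_minor": 1 }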