{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Word2Vec (Word Embedding)\n", "\n", "Implement Word2Vec algorithm to compute vector representations of words, with TensorFlow 2.0. This example is using a small chunk of Wikipedia articles to train from.\n", "\n", "More info: [Mikolov, Tomas et al. \"Efficient Estimation of Word Representations in Vector Space.\", 2013](https://arxiv.org/pdf/1301.3781.pdf)\n", "\n", "- Author: Aymeric Damien\n", "- Project: https://github.com/aymericdamien/TensorFlow-Examples/" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from __future__ import division, print_function, absolute_import\n", "\n", "import collections\n", "import os\n", "import random\n", "import urllib\n", "import zipfile\n", "\n", "import numpy as np\n", "import tensorflow as tf" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Training Parameters.\n", "learning_rate = 0.1\n", "batch_size = 128\n", "num_steps = 3000000\n", "display_step = 10000\n", "eval_step = 200000\n", "\n", "# Evaluation Parameters.\n", "eval_words = ['five', 'of', 'going', 'hardware', 'american', 'britain']\n", "\n", "# Word2Vec Parameters.\n", "embedding_size = 200 # Dimension of the embedding vector.\n", "max_vocabulary_size = 50000 # Total number of different words in the vocabulary.\n", "min_occurrence = 10 # Remove all words that does not appears at least n times.\n", "skip_window = 3 # How many words to consider left and right.\n", "num_skips = 2 # How many times to reuse an input to generate a label.\n", "num_sampled = 64 # Number of negative examples to sample." ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Download a small chunk of Wikipedia articles collection.\n", "url = 'http://mattmahoney.net/dc/text8.zip'\n", "data_path = 'text8.zip'\n", "if not os.path.exists(data_path):\n", " print(\"Downloading the dataset... (It may take some time)\")\n", " filename, _ = urllib.urlretrieve(url, data_path)\n", " print(\"Done!\")\n", "# Unzip the dataset file. 
{ "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Download a small chunk of the Wikipedia articles collection.\n", "url = 'http://mattmahoney.net/dc/text8.zip'\n", "data_path = 'text8.zip'\n", "if not os.path.exists(data_path):\n", "    print(\"Downloading the dataset... (It may take some time)\")\n", "    filename, _ = urllib.request.urlretrieve(url, data_path)\n", "    print(\"Done!\")\n", "# Unzip the dataset file. Text has already been processed.\n", "with zipfile.ZipFile(data_path) as f:\n", "    text_words = f.read(f.namelist()[0]).decode('utf-8').lower().split()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Words count: 17005207\n", "Unique words: 253854\n", "Vocabulary size: 47135\n", "Most common words: [('UNK', 444176), ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764), ('in', 372201), ('a', 325873), ('to', 316376), ('zero', 264975), ('nine', 250430)]\n" ] } ], "source": [ "# Build the dictionary and replace rare words with the UNK token.\n", "count = [('UNK', -1)]\n", "# Retrieve the most common words.\n", "count.extend(collections.Counter(text_words).most_common(max_vocabulary_size - 1))\n", "# Remove words with fewer than 'min_occurrence' occurrences.\n", "for i in range(len(count) - 1, -1, -1):\n", "    if count[i][1] < min_occurrence:\n", "        count.pop(i)\n", "    else:\n", "        # The collection is ordered, so stop when 'min_occurrence' is reached.\n", "        break\n", "# Compute the vocabulary size.\n", "vocabulary_size = len(count)\n", "# Assign an id to each word.\n", "word2id = dict()\n", "for i, (word, _) in enumerate(count):\n", "    word2id[word] = i\n", "\n", "data = list()\n", "unk_count = 0\n", "for word in text_words:\n", "    # Retrieve a word id, or assign it index 0 ('UNK') if not in the dictionary.\n", "    index = word2id.get(word, 0)\n", "    if index == 0:\n", "        unk_count += 1\n", "    data.append(index)\n", "count[0] = ('UNK', unk_count)\n", "id2word = dict(zip(word2id.values(), word2id.keys()))\n", "\n", "print(\"Words count:\", len(text_words))\n", "print(\"Unique words:\", len(set(text_words)))\n", "print(\"Vocabulary size:\", vocabulary_size)\n", "print(\"Most common words:\", count[:10])" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "data_index = 0\n", "# Generate a training batch for the skip-gram model.\n", "def next_batch(batch_size, num_skips, skip_window):\n", "    global data_index\n", "    assert batch_size % num_skips == 0\n", "    assert num_skips <= 2 * skip_window\n", "    batch = np.ndarray(shape=(batch_size), dtype=np.int32)\n", "    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)\n", "    # Get the window size (words left and right + current one).\n", "    span = 2 * skip_window + 1\n", "    buffer = collections.deque(maxlen=span)\n", "    if data_index + span > len(data):\n", "        data_index = 0\n", "    buffer.extend(data[data_index:data_index + span])\n", "    data_index += span\n", "    for i in range(batch_size // num_skips):\n", "        context_words = [w for w in range(span) if w != skip_window]\n", "        words_to_use = random.sample(context_words, num_skips)\n", "        for j, context_word in enumerate(words_to_use):\n", "            batch[i * num_skips + j] = buffer[skip_window]\n", "            labels[i * num_skips + j, 0] = buffer[context_word]\n", "        if data_index == len(data):\n", "            buffer.extend(data[0:span])\n", "            data_index = span\n", "        else:\n", "            buffer.append(data[data_index])\n", "            data_index += 1\n", "    # Backtrack a little bit to avoid skipping words at the end of a batch.\n", "    data_index = (data_index + len(data) - span) % len(data)\n", "    return batch, labels" ] },
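{ "cell_type": "markdown", "metadata": {}, "source": [ "As a quick sanity check (a demo cell added for illustration, not in the original notebook), we can draw one small batch and confirm that `next_batch` pairs each center word with words from its surrounding window." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Draw one small batch and print the (center -> context) word pairs it contains.\n", "demo_x, demo_y = next_batch(8, num_skips, skip_window)\n", "for center_id, context_id in zip(demo_x, demo_y[:, 0]):\n", "    print(id2word[center_id], '->', id2word[context_id])\n", "# Reset the global cursor so training starts from the beginning of the corpus.\n", "data_index = 0" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The model below is a skip-gram Word2Vec trained with NCE (noise-contrastive estimation) loss: instead of a full softmax over the whole vocabulary, `tf.nn.nce_loss` scores the true context word against `num_sampled` randomly drawn negative words. Up to sampling-correction terms, the per-example loss is roughly\n", "\n", "$$ -\\log \\sigma(u_c^\\top v_w + b_c) - \\sum_{k=1}^{K} \\log \\sigma(-(u_{n_k}^\\top v_w + b_{n_k})) $$\n", "\n", "where $v_w$ is the embedding of the center word, $u_c$ and $b_c$ are the `nce_weights` row and `nce_biases` entry for the true context word, and $n_1, \\dots, n_K$ are the $K$ = `num_sampled` sampled negatives." ] },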
{ "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# Ensure the following ops & variables are assigned on CPU\n", "# (some ops are not compatible on GPU).\n", "with tf.device('/cpu:0'):\n", "    # Create the embedding variable (each row represents a word embedding vector).\n", "    embedding = tf.Variable(tf.random.normal([vocabulary_size, embedding_size]))\n", "    # Construct the variables for the NCE loss.\n", "    nce_weights = tf.Variable(tf.random.normal([vocabulary_size, embedding_size]))\n", "    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))\n", "\n", "def get_embedding(x):\n", "    with tf.device('/cpu:0'):\n", "        # Look up the corresponding embedding vector for each sample in x.\n", "        x_embed = tf.nn.embedding_lookup(embedding, x)\n", "        return x_embed\n", "\n", "def nce_loss(x_embed, y):\n", "    with tf.device('/cpu:0'):\n", "        # Compute the average NCE loss for the batch.\n", "        y = tf.cast(y, tf.int64)\n", "        loss = tf.reduce_mean(\n", "            tf.nn.nce_loss(weights=nce_weights,\n", "                           biases=nce_biases,\n", "                           labels=y,\n", "                           inputs=x_embed,\n", "                           num_sampled=num_sampled,\n", "                           num_classes=vocabulary_size))\n", "        return loss\n", "\n", "# Evaluation.\n", "def evaluate(x_embed):\n", "    with tf.device('/cpu:0'):\n", "        # Compute the cosine similarity between the input embeddings and every embedding vector.\n", "        x_embed = tf.cast(x_embed, tf.float32)\n", "        x_embed_norm = x_embed / tf.sqrt(tf.reduce_sum(tf.square(x_embed), 1, keepdims=True))\n", "        embedding_norm = embedding / tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keepdims=True))\n", "        cosine_sim_op = tf.matmul(x_embed_norm, embedding_norm, transpose_b=True)\n", "        return cosine_sim_op\n", "\n", "# Define the optimizer.\n", "optimizer = tf.optimizers.SGD(learning_rate)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# Optimization process.\n", "def run_optimization(x, y):\n", "    with tf.device('/cpu:0'):\n", "        # Wrap the computation inside a GradientTape for automatic differentiation.\n", "        with tf.GradientTape() as g:\n", "            emb = get_embedding(x)\n", "            loss = nce_loss(emb, y)\n", "\n", "        # Compute gradients.\n", "        gradients = g.gradient(loss, [embedding, nce_weights, nce_biases])\n", "        # Update the variables following the gradients.\n", "        optimizer.apply_gradients(zip(gradients, [embedding, nce_weights, nce_biases]))" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "step: 1, loss: 504.444214\n", "Evaluation...\n", "\"five\" nearest neighbors: censure, stricken, anglicanism, stick, streetcars, shrines, horrified, sparkle,\n", "\"of\" nearest neighbors: jolly, weary, clinicians, kerouac, economist, owls, safe, playoff,\n", "\"going\" nearest neighbors: filament, platforms, moderately, micheal, despotic, krag, disclosed, your,\n", "\"hardware\" nearest neighbors: occupants, paraffin, vera, reorganized, rename, declares, prima, condoned,\n", "\"american\" nearest neighbors: portfolio, rhein, aalto, angle, lifeson, tucker, sexton, dench,\n", "\"britain\" nearest neighbors: indivisible, disbelief, scripture, pepsi, scriptores, sighting, napalm, strike,\n", "step: 10000, loss: 117.166962\n", "step: 20000, loss: 65.478333\n", "step: 30000, loss: 46.580460\n", "step: 40000, loss: 25.563128\n", "step: 50000, loss: 50.924446\n", "step: 60000, loss: 51.696526\n", "step: 70000, loss: 17.272142\n", "step: 80000, loss: 32.579414\n", "step: 90000, loss: 68.372032\n", "step: 100000, loss: 36.026573\n", "step: 110000, loss: 22.502020\n", "step: 120000, loss: 15.788742\n", "step: 130000, loss: 31.832420\n", "step: 140000, loss: 25.096617\n", "step: 150000, loss: 12.013027\n", "step: 160000, loss: 20.574780\n", "step: 170000, loss: 12.201975\n", "step: 180000, loss: 20.983793\n", "step: 190000, loss: 11.366720\n",
"step: 200000, loss: 19.431549\n", "Evaluation...\n", "\"five\" nearest neighbors: three, four, eight, six, two, seven, nine, zero,\n", "\"of\" nearest neighbors: the, a, and, first, with, on, but, from,\n", "\"going\" nearest neighbors: have, more, used, out, be, with, on, however,\n", "\"hardware\" nearest neighbors: be, known, system, apollo, and, a, such, used,\n", "\"american\" nearest neighbors: UNK, and, from, s, at, in, after, about,\n", "\"britain\" nearest neighbors: of, and, many, the, as, used, but, such,\n", "step: 210000, loss: 16.361233\n", "step: 220000, loss: 17.529526\n", "step: 230000, loss: 16.805817\n", "step: 240000, loss: 6.365625\n", "step: 250000, loss: 8.083097\n", "step: 260000, loss: 11.262514\n", "step: 270000, loss: 9.842708\n", "step: 280000, loss: 6.363440\n", "step: 290000, loss: 8.732617\n", "step: 300000, loss: 10.484728\n", "step: 310000, loss: 12.099487\n", "step: 320000, loss: 11.496288\n", "step: 330000, loss: 9.283813\n", "step: 340000, loss: 10.777218\n", "step: 350000, loss: 16.310440\n", "step: 360000, loss: 7.495782\n", "step: 370000, loss: 9.287696\n", "step: 380000, loss: 6.982735\n", "step: 390000, loss: 8.549622\n", "step: 400000, loss: 8.388112\n", "Evaluation...\n", "\"five\" nearest neighbors: four, three, six, two, seven, eight, one, zero,\n", "\"of\" nearest neighbors: the, a, with, also, for, and, which, by,\n", "\"going\" nearest neighbors: have, are, both, called, being, a, of, had,\n", "\"hardware\" nearest neighbors: may, de, some, have, so, which, other, also,\n", "\"american\" nearest neighbors: s, british, UNK, from, in, including, first, see,\n", "\"britain\" nearest neighbors: against, include, including, both, british, other, an, most,\n", "step: 410000, loss: 8.757725\n", "step: 420000, loss: 12.303110\n", "step: 430000, loss: 12.325478\n", "step: 440000, loss: 7.659882\n", "step: 450000, loss: 6.028089\n", "step: 460000, loss: 12.700299\n", "step: 470000, loss: 7.063077\n", "step: 480000, loss: 18.004183\n", "step: 490000, loss: 7.510474\n", "step: 500000, loss: 10.089376\n", "step: 510000, loss: 11.404436\n", "step: 520000, loss: 9.494527\n", "step: 530000, loss: 7.797963\n", "step: 540000, loss: 7.390718\n", "step: 550000, loss: 13.911215\n", "step: 560000, loss: 6.975731\n", "step: 570000, loss: 6.179163\n", "step: 580000, loss: 7.066525\n", "step: 590000, loss: 6.487288\n", "step: 600000, loss: 5.361528\n", "Evaluation...\n", "\"five\" nearest neighbors: four, six, three, seven, two, one, eight, zero,\n", "\"of\" nearest neighbors: the, and, from, with, a, including, in, include,\n", "\"going\" nearest neighbors: have, even, they, term, who, many, which, were,\n", "\"hardware\" nearest neighbors: include, computer, an, which, other, each, than, may,\n", "\"american\" nearest neighbors: english, french, s, german, from, in, film, see,\n", "\"britain\" nearest neighbors: several, first, modern, part, government, german, was, were,\n", "step: 610000, loss: 4.144980\n", "step: 620000, loss: 5.865635\n", "step: 630000, loss: 6.826498\n", "step: 640000, loss: 8.376097\n", "step: 650000, loss: 7.117930\n", "step: 660000, loss: 7.639544\n", "step: 670000, loss: 5.973255\n", "step: 680000, loss: 4.908459\n", "step: 690000, loss: 6.164993\n", "step: 700000, loss: 7.360281\n", "step: 710000, loss: 12.693079\n", "step: 720000, loss: 6.410182\n", "step: 730000, loss: 7.499201\n", "step: 740000, loss: 6.509094\n", "step: 750000, loss: 10.625893\n", "step: 760000, loss: 7.177696\n", "step: 770000, loss: 12.639092\n", "step: 780000, 
loss: 8.441635\n", "step: 790000, loss: 7.529139\n", "step: 800000, loss: 6.579177\n", "Evaluation...\n", "\"five\" nearest neighbors: four, three, six, seven, eight, two, one, zero,\n", "\"of\" nearest neighbors: and, with, in, the, its, from, by, including,\n", "\"going\" nearest neighbors: have, they, how, include, people, however, also, their,\n", "\"hardware\" nearest neighbors: computer, large, include, may, or, which, other, there,\n", "\"american\" nearest neighbors: born, french, british, english, german, b, john, d,\n", "\"britain\" nearest neighbors: country, including, include, general, part, various, several, by,\n", "step: 810000, loss: 6.934138\n", "step: 820000, loss: 5.686094\n", "step: 830000, loss: 7.310243\n", "step: 840000, loss: 5.028157\n", "step: 850000, loss: 7.079705\n", "step: 860000, loss: 6.768996\n", "step: 870000, loss: 5.604030\n", "step: 880000, loss: 8.208309\n", "step: 890000, loss: 6.301597\n", "step: 900000, loss: 5.733234\n", "step: 910000, loss: 6.577081\n", "step: 920000, loss: 6.774826\n", "step: 930000, loss: 7.068932\n", "step: 940000, loss: 6.694956\n", "step: 950000, loss: 7.944673\n", "step: 960000, loss: 5.988618\n", "step: 970000, loss: 6.651366\n", "step: 980000, loss: 4.595577\n", "step: 990000, loss: 6.564834\n", "step: 1000000, loss: 4.327858\n", "Evaluation...\n", "\"five\" nearest neighbors: four, three, seven, six, eight, two, nine, zero,\n", "\"of\" nearest neighbors: the, first, and, became, from, under, at, with,\n", "\"going\" nearest neighbors: others, has, then, have, how, become, had, also,\n", "\"hardware\" nearest neighbors: computer, large, systems, these, different, either, include, using,\n", "\"american\" nearest neighbors: b, born, d, UNK, nine, english, german, french,\n", "\"britain\" nearest neighbors: government, island, local, country, by, including, control, within,\n", "step: 1010000, loss: 5.841236\n", "step: 1020000, loss: 5.805200\n", "step: 1030000, loss: 9.962063\n", "step: 1040000, loss: 6.281199\n", "step: 1050000, loss: 7.147995\n", "step: 1060000, loss: 5.721184\n", "step: 1070000, loss: 7.080662\n", "step: 1080000, loss: 6.638658\n", "step: 1090000, loss: 5.814178\n", "step: 1100000, loss: 5.195928\n", "step: 1110000, loss: 6.724787\n", "step: 1120000, loss: 6.503905\n", "step: 1130000, loss: 5.762966\n", "step: 1140000, loss: 5.790243\n", "step: 1150000, loss: 5.958191\n", "step: 1160000, loss: 5.997983\n", "step: 1170000, loss: 7.065348\n", "step: 1180000, loss: 6.073387\n", "step: 1190000, loss: 6.644097\n", "step: 1200000, loss: 5.934450\n", "Evaluation...\n", "\"five\" nearest neighbors: three, four, six, eight, seven, two, nine, zero,\n", "\"of\" nearest neighbors: the, and, including, in, its, with, from, on,\n", "\"going\" nearest neighbors: others, then, through, has, had, another, people, when,\n", "\"hardware\" nearest neighbors: computer, control, systems, either, these, large, small, other,\n", "\"american\" nearest neighbors: born, german, john, d, british, b, UNK, french,\n", "\"britain\" nearest neighbors: local, against, british, island, country, general, including, within,\n", "step: 1210000, loss: 5.832344\n", "step: 1220000, loss: 6.453851\n", "step: 1230000, loss: 6.583966\n", "step: 1240000, loss: 5.571673\n", "step: 1250000, loss: 5.720917\n", "step: 1260000, loss: 7.663424\n", "step: 1270000, loss: 6.583741\n", "step: 1280000, loss: 8.503859\n", "step: 1290000, loss: 5.540640\n", "step: 1300000, loss: 6.703249\n", "step: 1310000, loss: 5.274101\n", "step: 1320000, loss: 
5.846446\n", "step: 1330000, loss: 5.438172\n", "step: 1340000, loss: 6.367691\n", "step: 1350000, loss: 6.558622\n", "step: 1360000, loss: 9.822924\n", "step: 1370000, loss: 4.982378\n", "step: 1380000, loss: 6.159739\n", "step: 1390000, loss: 5.819083\n", "step: 1400000, loss: 7.775135\n", "Evaluation...\n", "\"five\" nearest neighbors: four, three, six, seven, two, eight, one, zero,\n", "\"of\" nearest neighbors: and, the, in, with, its, within, for, including,\n", "\"going\" nearest neighbors: others, through, while, has, to, how, particularly, their,\n", "\"hardware\" nearest neighbors: computer, systems, large, control, research, using, information, either,\n", "\"american\" nearest neighbors: english, french, german, born, film, british, s, former,\n", "\"britain\" nearest neighbors: british, country, europe, local, military, island, against, western,\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "step: 1410000, loss: 8.214248\n", "step: 1420000, loss: 4.696859\n", "step: 1430000, loss: 5.873761\n", "step: 1440000, loss: 5.971557\n", "step: 1450000, loss: 4.992722\n", "step: 1460000, loss: 5.197714\n", "step: 1470000, loss: 6.916918\n", "step: 1480000, loss: 6.441984\n", "step: 1490000, loss: 5.443647\n", "step: 1500000, loss: 5.178482\n", "step: 1510000, loss: 6.060414\n", "step: 1520000, loss: 6.373306\n", "step: 1530000, loss: 5.098322\n", "step: 1540000, loss: 6.674916\n", "step: 1550000, loss: 6.712685\n", "step: 1560000, loss: 5.280202\n", "step: 1570000, loss: 6.454964\n", "step: 1580000, loss: 4.896697\n", "step: 1590000, loss: 6.239226\n", "step: 1600000, loss: 5.709726\n", "Evaluation...\n", "\"five\" nearest neighbors: three, four, two, six, seven, eight, one, zero,\n", "\"of\" nearest neighbors: the, and, including, in, with, within, its, following,\n", "\"going\" nearest neighbors: others, people, who, they, that, far, were, have,\n", "\"hardware\" nearest neighbors: computer, systems, include, high, research, some, information, large,\n", "\"american\" nearest neighbors: born, english, french, british, german, d, john, b,\n", "\"britain\" nearest neighbors: country, military, china, europe, against, local, central, british,\n", "step: 1610000, loss: 6.334940\n", "step: 1620000, loss: 5.093616\n", "step: 1630000, loss: 6.119366\n", "step: 1640000, loss: 4.975187\n", "step: 1650000, loss: 6.490408\n", "step: 1660000, loss: 7.464082\n", "step: 1670000, loss: 4.977184\n", "step: 1680000, loss: 5.658133\n", "step: 1690000, loss: 5.352454\n", "step: 1700000, loss: 6.810776\n", "step: 1710000, loss: 5.687447\n", "step: 1720000, loss: 5.992206\n", "step: 1730000, loss: 5.513011\n", "step: 1740000, loss: 5.548522\n", "step: 1750000, loss: 6.200248\n", "step: 1760000, loss: 13.070073\n", "step: 1770000, loss: 4.621058\n", "step: 1780000, loss: 5.301342\n", "step: 1790000, loss: 4.777030\n", "step: 1800000, loss: 6.912136\n", "Evaluation...\n", "\"five\" nearest neighbors: three, four, six, seven, eight, two, nine, zero,\n", "\"of\" nearest neighbors: the, in, first, from, became, and, following, under,\n", "\"going\" nearest neighbors: others, their, through, which, therefore, open, how, that,\n", "\"hardware\" nearest neighbors: computer, systems, include, research, standard, different, system, small,\n", "\"american\" nearest neighbors: b, d, born, actor, UNK, english, nine, german,\n", "\"britain\" nearest neighbors: china, country, europe, against, canada, military, island, including,\n", "step: 1810000, loss: 5.584600\n", "step: 1820000, loss: 5.619820\n", 
"step: 1830000, loss: 6.078709\n", "step: 1840000, loss: 5.052518\n", "step: 1850000, loss: 5.430106\n", "step: 1860000, loss: 7.396770\n", "step: 1870000, loss: 5.344787\n", "step: 1880000, loss: 5.937998\n", "step: 1890000, loss: 5.706491\n", "step: 1900000, loss: 5.140662\n", "step: 1910000, loss: 5.607048\n", "step: 1920000, loss: 5.407231\n", "step: 1930000, loss: 6.238531\n", "step: 1940000, loss: 5.567973\n", "step: 1950000, loss: 4.894245\n", "step: 1960000, loss: 6.104193\n", "step: 1970000, loss: 5.282631\n", "step: 1980000, loss: 6.189069\n", "step: 1990000, loss: 6.169409\n", "step: 2000000, loss: 6.470152\n", "Evaluation...\n", "\"five\" nearest neighbors: four, three, six, seven, eight, two, nine, zero,\n", "\"of\" nearest neighbors: the, its, in, with, and, including, within, against,\n", "\"going\" nearest neighbors: others, only, therefore, will, how, a, far, though,\n", "\"hardware\" nearest neighbors: computer, systems, for, network, software, program, research, system,\n", "\"american\" nearest neighbors: born, actor, d, italian, german, john, robert, b,\n", "\"britain\" nearest neighbors: china, country, europe, canada, british, former, island, france,\n", "step: 2010000, loss: 5.298714\n", "step: 2020000, loss: 5.494207\n", "step: 2030000, loss: 5.410875\n", "step: 2040000, loss: 6.228232\n", "step: 2050000, loss: 5.044596\n", "step: 2060000, loss: 4.624638\n", "step: 2070000, loss: 4.919327\n", "step: 2080000, loss: 4.639625\n", "step: 2090000, loss: 4.865627\n", "step: 2100000, loss: 4.951073\n", "step: 2110000, loss: 5.973768\n", "step: 2120000, loss: 7.366824\n", "step: 2130000, loss: 5.149571\n", "step: 2140000, loss: 7.846234\n", "step: 2150000, loss: 5.449315\n", "step: 2160000, loss: 5.359211\n", "step: 2170000, loss: 5.171029\n", "step: 2180000, loss: 6.106437\n", "step: 2190000, loss: 6.043995\n", "step: 2200000, loss: 5.642351\n", "Evaluation...\n", "\"five\" nearest neighbors: four, three, six, two, eight, seven, zero, one,\n", "\"of\" nearest neighbors: the, and, its, see, for, in, with, including,\n", "\"going\" nearest neighbors: others, therefore, how, even, them, your, have, although,\n", "\"hardware\" nearest neighbors: computer, systems, system, network, program, research, software, include,\n", "\"american\" nearest neighbors: english, french, german, canadian, british, film, author, italian,\n", "\"britain\" nearest neighbors: europe, china, country, germany, british, england, france, throughout,\n", "step: 2210000, loss: 4.427110\n", "step: 2220000, loss: 6.240989\n", "step: 2230000, loss: 5.184978\n", "step: 2240000, loss: 8.035570\n", "step: 2250000, loss: 5.793781\n", "step: 2260000, loss: 4.908427\n", "step: 2270000, loss: 8.807668\n", "step: 2280000, loss: 6.083229\n", "step: 2290000, loss: 5.773360\n", "step: 2300000, loss: 5.613671\n", "step: 2310000, loss: 6.080076\n", "step: 2320000, loss: 5.288568\n", "step: 2330000, loss: 5.949232\n", "step: 2340000, loss: 5.479994\n", "step: 2350000, loss: 7.717686\n", "step: 2360000, loss: 5.163609\n", "step: 2370000, loss: 5.989407\n", "step: 2380000, loss: 5.785729\n", "step: 2390000, loss: 5.345478\n", "step: 2400000, loss: 6.627133\n", "Evaluation...\n", "\"five\" nearest neighbors: three, four, six, two, seven, eight, zero, nine,\n", "\"of\" nearest neighbors: the, in, and, including, from, within, its, with,\n", "\"going\" nearest neighbors: therefore, people, they, out, only, according, your, now,\n", "\"hardware\" nearest neighbors: computer, systems, network, program, system, software, run, 
design,\n", "\"american\" nearest neighbors: author, born, actor, english, canadian, british, italian, d,\n", "\"britain\" nearest neighbors: china, europe, country, throughout, france, canada, england, western,\n", "step: 2410000, loss: 5.666146\n", "step: 2420000, loss: 5.316198\n", "step: 2430000, loss: 5.129625\n", "step: 2440000, loss: 5.247949\n", "step: 2450000, loss: 5.741394\n", "step: 2460000, loss: 5.833083\n", "step: 2470000, loss: 7.704844\n", "step: 2480000, loss: 5.398345\n", "step: 2490000, loss: 5.089633\n", "step: 2500000, loss: 5.620508\n", "step: 2510000, loss: 4.976034\n", "step: 2520000, loss: 5.884676\n", "step: 2530000, loss: 6.649922\n", "step: 2540000, loss: 5.002588\n", "step: 2550000, loss: 5.072144\n", "step: 2560000, loss: 5.165375\n", "step: 2570000, loss: 5.310089\n", "step: 2580000, loss: 5.481957\n", "step: 2590000, loss: 6.104440\n", "step: 2600000, loss: 5.339644\n", "Evaluation...\n", "\"five\" nearest neighbors: three, four, six, seven, eight, nine, two, zero,\n", "\"of\" nearest neighbors: the, first, from, with, became, in, following, and,\n", "\"going\" nearest neighbors: how, therefore, back, will, through, always, your, make,\n", "\"hardware\" nearest neighbors: computer, systems, system, network, program, technology, design, software,\n", "\"american\" nearest neighbors: actor, singer, born, b, author, d, english, writer,\n", "\"britain\" nearest neighbors: europe, china, throughout, great, england, france, country, india,\n", "step: 2610000, loss: 7.754117\n", "step: 2620000, loss: 5.979313\n", "step: 2630000, loss: 5.394362\n", "step: 2640000, loss: 4.866740\n", "step: 2650000, loss: 5.219806\n", "step: 2660000, loss: 6.074809\n", "step: 2670000, loss: 6.216953\n", "step: 2680000, loss: 5.944881\n", "step: 2690000, loss: 5.863350\n", "step: 2700000, loss: 6.128705\n", "step: 2710000, loss: 5.502523\n", "step: 2720000, loss: 5.300839\n", "step: 2730000, loss: 6.358493\n", "step: 2740000, loss: 6.058306\n", "step: 2750000, loss: 4.689510\n", "step: 2760000, loss: 6.032880\n", "step: 2770000, loss: 5.844904\n", "step: 2780000, loss: 5.385874\n", "step: 2790000, loss: 5.370956\n", "step: 2800000, loss: 4.912577\n", "Evaluation...\n", "\"five\" nearest neighbors: four, six, three, eight, seven, two, nine, one,\n", "\"of\" nearest neighbors: in, the, and, from, including, following, with, under,\n", "\"going\" nearest neighbors: your, then, through, will, how, so, back, even,\n", "\"hardware\" nearest neighbors: computer, systems, program, network, design, standard, physical, software,\n", "\"american\" nearest neighbors: actor, singer, born, author, writer, canadian, italian, d,\n", "\"britain\" nearest neighbors: europe, china, england, throughout, france, india, great, germany,\n", "step: 2810000, loss: 5.897756\n", "step: 2820000, loss: 7.194932\n", "step: 2830000, loss: 7.430175\n", "step: 2840000, loss: 7.258231\n", "step: 2850000, loss: 5.837617\n", "step: 2860000, loss: 5.496673\n", "step: 2870000, loss: 6.173716\n", "step: 2880000, loss: 6.095749\n", "step: 2890000, loss: 6.064944\n", "step: 2900000, loss: 5.560488\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "step: 2910000, loss: 4.966107\n", "step: 2920000, loss: 5.789579\n", "step: 2930000, loss: 4.525987\n", "step: 2940000, loss: 6.704808\n", "step: 2950000, loss: 4.506433\n", "step: 2960000, loss: 6.251270\n", "step: 2970000, loss: 5.588204\n", "step: 2980000, loss: 5.423235\n", "step: 2990000, loss: 5.613834\n", "step: 3000000, loss: 5.137326\n", "Evaluation...\n", 
"\"five\" nearest neighbors: four, three, six, seven, eight, two, zero, one,\n", "\"of\" nearest neighbors: the, including, and, with, in, its, includes, within,\n", "\"going\" nearest neighbors: how, they, when, them, make, always, your, though,\n", "\"hardware\" nearest neighbors: computer, systems, network, program, physical, design, technology, software,\n", "\"american\" nearest neighbors: canadian, english, australian, british, german, film, italian, author,\n", "\"britain\" nearest neighbors: europe, england, china, throughout, india, france, great, british,\n" ] } ], "source": [ "# Words for testing.\n", "x_test = np.array([word2id[w] for w in eval_words])\n", "\n", "# Run training for the given number of steps.\n", "for step in xrange(1, num_steps + 1):\n", " batch_x, batch_y = next_batch(batch_size, num_skips, skip_window)\n", " run_optimization(batch_x, batch_y)\n", " \n", " if step % display_step == 0 or step == 1:\n", " loss = nce_loss(get_embedding(batch_x), batch_y)\n", " print(\"step: %i, loss: %f\" % (step, loss))\n", " \n", " # Evaluation.\n", " if step % eval_step == 0 or step == 1:\n", " print(\"Evaluation...\")\n", " sim = evaluate(get_embedding(x_test)).numpy()\n", " for i in xrange(len(eval_words)):\n", " top_k = 8 # number of nearest neighbors.\n", " nearest = (-sim[i, :]).argsort()[1:top_k + 1]\n", " log_str = '\"%s\" nearest neighbors:' % eval_words[i]\n", " for k in xrange(top_k):\n", " log_str = '%s %s,' % (log_str, id2word[nearest[k]])\n", " print(log_str)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.15" } }, "nbformat": 4, "nbformat_minor": 2 }