{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# K-Means Example\n",
    "\n",
    "Implement the K-Means algorithm with TensorFlow, and apply it to classify\n",
    "handwritten digit images. This example uses the MNIST database of\n",
    "handwritten digits as training samples (http://yann.lecun.com/exdb/mnist/).\n",
    "\n",
    "Note: This example requires TensorFlow v1.1.0 or later.\n",
    "\n",
    "- Author: Aymeric Damien\n",
    "- Project: https://github.com/aymericdamien/TensorFlow-Examples/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from __future__ import print_function\n",
    "\n",
    "import os\n",
    "\n",
    "# Hide all GPUs so that this example runs on the CPU only.\n",
    "# Set before importing TensorFlow so it is already in effect when it loads.\n",
    "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"\"\n",
    "\n",
    "import numpy as np\n",
    "import tensorflow as tf\n",
    "from tensorflow.contrib.factorization import KMeans\n",
    "from tensorflow.examples.tutorials.mnist import input_data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the MNIST data\n",
    "\n",
    "Labels are loaded one-hot encoded; the training images are the K-Means input."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Extracting /tmp/data/train-images-idx3-ubyte.gz\n",
      "Extracting /tmp/data/train-labels-idx1-ubyte.gz\n",
      "Extracting /tmp/data/t10k-images-idx3-ubyte.gz\n",
      "Extracting /tmp/data/t10k-labels-idx1-ubyte.gz\n"
     ]
    }
   ],
   "source": [
    "# Import MNIST data\n",
    "mnist = input_data.read_data_sets(\"/tmp/data/\", one_hot=True)\n",
    "full_data_x = mnist.train.images"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Parameters and K-Means model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Parameters\n",
    "num_steps = 50 # Total steps to train\n",
    "batch_size = 1024 # The number of samples per batch (unused: each step feeds all data)\n",
    "k = 25 # The number of clusters\n",
    "num_classes = 10 # The 10 digits\n",
    "num_features = 784 # Each image is 28x28 pixels\n",
    "\n",
    "# Input images\n",
    "X = tf.placeholder(tf.float32, shape=[None, num_features])\n",
    "# Labels (for assigning a label to a centroid and testing)\n",
    "Y = tf.placeholder(tf.float32, shape=[None, num_classes])\n",
    "\n",
    "# K-Means Parameters\n",
    "kmeans = KMeans(inputs=X, num_clusters=k, distance_metric='cosine',\n",
    "                use_mini_batch=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Build KMeans graph\n",
    "(all_scores, cluster_idx, scores, cluster_centers_initialized,\n",
    " cluster_centers_vars, init_op, train_op) = kmeans.training_graph()\n",
    "cluster_idx = cluster_idx[0] # fix for cluster_idx being a tuple\n",
    "avg_distance = tf.reduce_mean(scores)\n",
    "\n",
    "# Initialize the variables (i.e. assign their default value)\n",
    "init_vars = tf.global_variables_initializer()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Training\n",
    "\n",
    "Every step feeds the full training set and reports the average distance\n",
    "of the samples to their closest centroid."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Step 1, Avg Distance: 0.341471\n",
      "Step 10, Avg Distance: 0.221609\n",
      "Step 20, Avg Distance: 0.220328\n",
      "Step 30, Avg Distance: 0.219776\n",
      "Step 40, Avg Distance: 0.219419\n",
      "Step 50, Avg Distance: 0.219154\n"
     ]
    }
   ],
   "source": [
    "# Start TensorFlow session\n",
    "sess = tf.Session()\n",
    "\n",
    "# Run the initializer\n",
    "sess.run(init_vars, feed_dict={X: full_data_x})\n",
    "sess.run(init_op, feed_dict={X: full_data_x})\n",
    "\n",
    "# Training\n",
    "for i in range(1, num_steps + 1):\n",
    "    _, d, idx = sess.run([train_op, avg_distance, cluster_idx],\n",
    "                         feed_dict={X: full_data_x})\n",
    "    if i % 10 == 0 or i == 1:\n",
    "        print(\"Step %i, Avg Distance: %f\" % (i, d))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluation\n",
    "\n",
    "Each centroid is labelled with the most frequent digit among the training\n",
    "samples assigned to it; accuracy is then measured on the MNIST test set."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Test Accuracy: 0.7127\n"
     ]
    }
   ],
   "source": [
    "# Assign a label to each centroid\n",
    "# Count the labels seen per centroid: every training sample adds its one-hot\n",
    "# label to the row of its closest centroid (given by 'idx', the assignments\n",
    "# returned by the last training step)\n",
    "counts = np.zeros(shape=(k, num_classes))\n",
    "# Unbuffered scatter-add, so samples sharing a centroid all accumulate\n",
    "np.add.at(counts, idx, mnist.train.labels)\n",
    "# Assign the most frequent label to the centroid\n",
    "centroid_labels = [np.argmax(c) for c in counts]\n",
    "# int32 so the labels can be compared with the int32-cast targets below\n",
    "labels_map = tf.convert_to_tensor(centroid_labels, dtype=tf.int32)\n",
    "\n",
    "# Evaluation ops\n",
    "# Lookup: centroid_id -> label\n",
    "cluster_label = tf.nn.embedding_lookup(labels_map, cluster_idx)\n",
    "# Compute accuracy\n",
    "correct_prediction = tf.equal(cluster_label, tf.cast(tf.argmax(Y, 1), tf.int32))\n",
    "accuracy_op = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))\n",
    "\n",
    "# Test Model\n",
    "test_x, test_y = mnist.test.images, mnist.test.labels\n",
    "print(\"Test Accuracy:\", sess.run(accuracy_op, feed_dict={X: test_x, Y: test_y}))"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}