|
@@ -0,0 +1,500 @@
|
|
|
+{
|
|
|
+ "cells": [
|
|
|
+ {
|
|
|
+ "cell_type": "markdown",
|
|
|
+ "metadata": {
|
|
|
+ "slideshow": {
|
|
|
+ "slide_type": "skip"
|
|
|
+ }
|
|
|
+ },
|
|
|
+ "source": [
|
|
|
+ "<small><i>This notebook was put together by [Jake Vanderplas](http://www.vanderplas.com) for PyCon 2014. Source and license info is on [GitHub](https://github.com/jakevdp/sklearn_pycon2014/).</i></small>"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "markdown",
|
|
|
+ "metadata": {
|
|
|
+ "slideshow": {
|
|
|
+ "slide_type": "slide"
|
|
|
+ }
|
|
|
+ },
|
|
|
+ "source": [
|
|
|
+ "# Scipy Sparse Matrices"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "markdown",
|
|
|
+ "metadata": {},
|
|
|
+ "source": [
|
|
|
+ "**Sparse Matrices** are very nice in some situations. \n",
|
|
|
+ "\n",
|
|
|
+ "For example, in some machine learning tasks, especially those associated\n",
|
|
|
+ "with textual analysis, the data may be mostly zeros. \n",
|
|
|
+ "\n",
|
|
|
+ "Storing all these zeros is very inefficient. \n",
|
|
|
+ "\n",
|
|
|
+ "We can create and manipulate sparse matrices as follows:"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 2,
|
|
|
+ "metadata": {
|
|
|
+ "collapsed": true
|
|
|
+ },
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "import numpy as np"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 4,
|
|
|
+ "metadata": {
|
|
|
+ "collapsed": false,
|
|
|
+ "slideshow": {
|
|
|
+ "slide_type": "subslide"
|
|
|
+ }
|
|
|
+ },
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "name": "stdout",
|
|
|
+ "output_type": "stream",
|
|
|
+ "text": [
|
|
|
+ "[[ 0.92071168 0.66941621 0.30097014 0.8668366 0.94764952]\n",
|
|
|
+ " [ 0.16978456 0.59292571 0.78884569 0.76910071 0.56415941]\n",
|
|
|
+ " [ 0.096867 0.96869327 0.8643055 0.0297782 0.11921581]\n",
|
|
|
+ " [ 0.22387061 0.71015351 0.45882072 0.34433871 0.85566776]\n",
|
|
|
+ " [ 0.22217957 0.83387745 0.40605966 0.41212024 0.65548993]\n",
|
|
|
+ " [ 0.53416368 0.92406734 0.66444729 0.57218427 0.48198361]\n",
|
|
|
+ " [ 0.37469397 0.33167227 0.9107519 0.03360275 0.20205017]\n",
|
|
|
+ " [ 0.39939621 0.61025928 0.14715445 0.86871212 0.25921407]\n",
|
|
|
+ " [ 0.07210422 0.99690991 0.31477122 0.49698491 0.34563232]\n",
|
|
|
+ " [ 0.10310154 0.3806856 0.77690381 0.46116052 0.43330533]]\n"
|
|
|
+ ]
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "source": [
|
|
|
+ "# Create a random array with a lot of zeros\n",
|
|
|
+ "X = np.random.random((10, 5))\n",
|
|
|
+ "print(X)"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 5,
|
|
|
+ "metadata": {
|
|
|
+ "collapsed": false,
|
|
|
+ "slideshow": {
|
|
|
+ "slide_type": "subslide"
|
|
|
+ }
|
|
|
+ },
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "name": "stdout",
|
|
|
+ "output_type": "stream",
|
|
|
+ "text": [
|
|
|
+ "[[ 0.92071168 0. 0. 0.8668366 0.94764952]\n",
|
|
|
+ " [ 0. 0. 0.78884569 0.76910071 0. ]\n",
|
|
|
+ " [ 0. 0.96869327 0.8643055 0. 0. ]\n",
|
|
|
+ " [ 0. 0.71015351 0. 0. 0.85566776]\n",
|
|
|
+ " [ 0. 0.83387745 0. 0. 0. ]\n",
|
|
|
+ " [ 0. 0.92406734 0. 0. 0. ]\n",
|
|
|
+ " [ 0. 0. 0.9107519 0. 0. ]\n",
|
|
|
+ " [ 0. 0. 0. 0.86871212 0. ]\n",
|
|
|
+ " [ 0. 0.99690991 0. 0. 0. ]\n",
|
|
|
+ " [ 0. 0. 0.77690381 0. 0. ]]\n"
|
|
|
+ ]
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "source": [
|
|
|
+ "X[X < 0.7] = 0\n",
|
|
|
+ "print(X)"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 6,
|
|
|
+ "metadata": {
|
|
|
+ "collapsed": false,
|
|
|
+ "slideshow": {
|
|
|
+ "slide_type": "subslide"
|
|
|
+ }
|
|
|
+ },
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "name": "stdout",
|
|
|
+ "output_type": "stream",
|
|
|
+ "text": [
|
|
|
+ " (0, 0)\t0.920711681384\n",
|
|
|
+ " (0, 3)\t0.866836604396\n",
|
|
|
+ " (0, 4)\t0.947649515452\n",
|
|
|
+ " (1, 2)\t0.788845688727\n",
|
|
|
+ " (1, 3)\t0.769100712548\n",
|
|
|
+ " (2, 1)\t0.968693269052\n",
|
|
|
+ " (2, 2)\t0.864305496772\n",
|
|
|
+ " (3, 1)\t0.710153508323\n",
|
|
|
+ " (3, 4)\t0.855667757095\n",
|
|
|
+ " (4, 1)\t0.833877448584\n",
|
|
|
+ " (5, 1)\t0.924067342994\n",
|
|
|
+ " (6, 2)\t0.910751902907\n",
|
|
|
+ " (7, 3)\t0.868712121221\n",
|
|
|
+ " (8, 1)\t0.996909907387\n",
|
|
|
+ " (9, 2)\t0.776903807028\n"
|
|
|
+ ]
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "source": [
|
|
|
+ "from scipy import sparse\n",
|
|
|
+ "\n",
|
|
|
+ "# turn X into a csr (Compressed-Sparse-Row) matrix\n",
|
|
|
+ "X_csr = sparse.csr_matrix(X)\n",
|
|
|
+ "print(X_csr)"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 7,
|
|
|
+ "metadata": {
|
|
|
+ "collapsed": false,
|
|
|
+ "slideshow": {
|
|
|
+ "slide_type": "subslide"
|
|
|
+ }
|
|
|
+ },
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "name": "stdout",
|
|
|
+ "output_type": "stream",
|
|
|
+ "text": [
|
|
|
+ "[[ 0.92071168 0. 0. 0.8668366 0.94764952]\n",
|
|
|
+ " [ 0. 0. 0.78884569 0.76910071 0. ]\n",
|
|
|
+ " [ 0. 0.96869327 0.8643055 0. 0. ]\n",
|
|
|
+ " [ 0. 0.71015351 0. 0. 0.85566776]\n",
|
|
|
+ " [ 0. 0.83387745 0. 0. 0. ]\n",
|
|
|
+ " [ 0. 0.92406734 0. 0. 0. ]\n",
|
|
|
+ " [ 0. 0. 0.9107519 0. 0. ]\n",
|
|
|
+ " [ 0. 0. 0. 0.86871212 0. ]\n",
|
|
|
+ " [ 0. 0.99690991 0. 0. 0. ]\n",
|
|
|
+ " [ 0. 0. 0.77690381 0. 0. ]]\n"
|
|
|
+ ]
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "source": [
|
|
|
+ "# convert the sparse matrix to a dense array\n",
|
|
|
+ "print(X_csr.toarray())"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 8,
|
|
|
+ "metadata": {
|
|
|
+ "collapsed": false,
|
|
|
+ "slideshow": {
|
|
|
+ "slide_type": "subslide"
|
|
|
+ }
|
|
|
+ },
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "data": {
|
|
|
+ "text/plain": [
|
|
|
+ "True"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ "execution_count": 8,
|
|
|
+ "metadata": {},
|
|
|
+ "output_type": "execute_result"
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "source": [
|
|
|
+ "# Sparse matrices support linear algebra:\n",
|
|
|
+ "y = np.random.random(X_csr.shape[1])\n",
|
|
|
+ "z1 = X_csr.dot(y)\n",
|
|
|
+ "z2 = X.dot(y)\n",
|
|
|
+ "np.allclose(z1, z2)"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "markdown",
|
|
|
+ "metadata": {
|
|
|
+ "slideshow": {
|
|
|
+ "slide_type": "subslide"
|
|
|
+ }
|
|
|
+ },
|
|
|
+ "source": [
|
|
|
+ "* The CSR representation can be very efficient for computations, but it is not as good for adding elements. \n",
|
|
|
+ "\n",
|
|
|
+ "* For that, the **LIL** (List-In-List) representation is better:"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 9,
|
|
|
+ "metadata": {
|
|
|
+ "collapsed": false,
|
|
|
+ "slideshow": {
|
|
|
+ "slide_type": "fragment"
|
|
|
+ }
|
|
|
+ },
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "name": "stdout",
|
|
|
+ "output_type": "stream",
|
|
|
+ "text": [
|
|
|
+ " (0, 2)\t2.0\n",
|
|
|
+ " (1, 1)\t2.0\n",
|
|
|
+ " (1, 2)\t3.0\n",
|
|
|
+ " (1, 3)\t4.0\n",
|
|
|
+ " (2, 0)\t2.0\n",
|
|
|
+ " (2, 3)\t5.0\n",
|
|
|
+ " (2, 4)\t6.0\n",
|
|
|
+ " (3, 0)\t3.0\n",
|
|
|
+ " (3, 1)\t4.0\n",
|
|
|
+ " (3, 4)\t7.0\n",
|
|
|
+ " (4, 2)\t6.0\n",
|
|
|
+ " (4, 3)\t7.0\n",
|
|
|
+ "[[ 0. 0. 2. 0. 0.]\n",
|
|
|
+ " [ 0. 2. 3. 4. 0.]\n",
|
|
|
+ " [ 2. 0. 0. 5. 6.]\n",
|
|
|
+ " [ 3. 4. 0. 0. 7.]\n",
|
|
|
+ " [ 0. 0. 6. 7. 0.]]\n"
|
|
|
+ ]
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "source": [
|
|
|
+ "# Create an empty LIL matrix and add some items\n",
|
|
|
+ "X_lil = sparse.lil_matrix((5, 5))\n",
|
|
|
+ "\n",
|
|
|
+ "for i, j in np.random.randint(0, 5, (15, 2)):\n",
|
|
|
+ " X_lil[i, j] = i + j\n",
|
|
|
+ "\n",
|
|
|
+ "print(X_lil)\n",
|
|
|
+ "print(X_lil.toarray())"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "markdown",
|
|
|
+ "metadata": {
|
|
|
+ "slideshow": {
|
|
|
+ "slide_type": "subslide"
|
|
|
+ }
|
|
|
+ },
|
|
|
+ "source": [
|
|
|
+ "* Often, once an LIL matrix is created, it is useful to convert it to a CSR format \n",
|
|
|
+ " * **Note**: many scikit-learn algorithms require CSR or CSC format"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 10,
|
|
|
+ "metadata": {
|
|
|
+ "collapsed": false,
|
|
|
+ "slideshow": {
|
|
|
+ "slide_type": "fragment"
|
|
|
+ }
|
|
|
+ },
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "name": "stdout",
|
|
|
+ "output_type": "stream",
|
|
|
+ "text": [
|
|
|
+ " (0, 2)\t2.0\n",
|
|
|
+ " (1, 1)\t2.0\n",
|
|
|
+ " (1, 2)\t3.0\n",
|
|
|
+ " (1, 3)\t4.0\n",
|
|
|
+ " (2, 0)\t2.0\n",
|
|
|
+ " (2, 3)\t5.0\n",
|
|
|
+ " (2, 4)\t6.0\n",
|
|
|
+ " (3, 0)\t3.0\n",
|
|
|
+ " (3, 1)\t4.0\n",
|
|
|
+ " (3, 4)\t7.0\n",
|
|
|
+ " (4, 2)\t6.0\n",
|
|
|
+ " (4, 3)\t7.0\n"
|
|
|
+ ]
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "source": [
|
|
|
+ "X_csr = X_lil.tocsr()\n",
|
|
|
+ "print(X_csr)"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "markdown",
|
|
|
+ "metadata": {
|
|
|
+ "slideshow": {
|
|
|
+ "slide_type": "subslide"
|
|
|
+ }
|
|
|
+ },
|
|
|
+ "source": [
|
|
|
+ "There are several other sparse formats that can be useful for various problems:\n",
|
|
|
+ "\n",
|
|
|
+ "- `CSC` (compressed sparse column)\n",
|
|
|
+ "- `BSR` (block sparse row)\n",
|
|
|
+ "- `COO` (coordinate)\n",
|
|
|
+ "- `DIA` (diagonal)\n",
|
|
|
+ "- `DOK` (dictionary of keys)"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "markdown",
|
|
|
+ "metadata": {},
|
|
|
+ "source": [
|
|
|
+ "## CSC - Compressed Sparse Column\n",
|
|
|
+ "\n",
|
|
|
+ "**Advantages of the CSC format**\n",
|
|
|
+ "\n",
|
|
|
+ " * efficient arithmetic operations CSC + CSC, CSC * CSC, etc.\n",
|
|
|
+ " * efficient column slicing\n",
|
|
|
+ " * fast matrix vector products (CSR, BSR may be faster)\n",
|
|
|
+ "\n",
|
|
|
+ "**Disadvantages of the CSC format**\n",
|
|
|
+ "\n",
|
|
|
+ " * slow row slicing operations (consider CSR)\n",
|
|
|
+ " * changes to the sparsity structure are expensive (consider LIL or DOK)"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "markdown",
|
|
|
+ "metadata": {},
|
|
|
+ "source": [
|
|
|
+ "### BSR - Block Sparse Row\n",
|
|
|
+ "\n",
|
|
|
+ "The Block Compressed Row (`BSR`) format is very similar to the Compressed Sparse Row (`CSR`) format. \n",
|
|
|
+ "\n",
|
|
|
+ "BSR is appropriate for sparse matrices with *dense sub matrices* like the example below. \n",
|
|
|
+ "\n",
|
|
|
+ "Block matrices often arise in *vector-valued* finite element discretizations. \n",
|
|
|
+ "\n",
|
|
|
+ "In such cases, BSR is **considerably more efficient** than CSR and CSC for many sparse arithmetic operations."
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 12,
|
|
|
+ "metadata": {
|
|
|
+ "collapsed": false
|
|
|
+ },
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "data": {
|
|
|
+ "text/plain": [
|
|
|
+ "array([[1, 1, 0, 0, 2, 2],\n",
|
|
|
+ " [1, 1, 0, 0, 2, 2],\n",
|
|
|
+ " [0, 0, 0, 0, 3, 3],\n",
|
|
|
+ " [0, 0, 0, 0, 3, 3],\n",
|
|
|
+ " [4, 4, 5, 5, 6, 6],\n",
|
|
|
+ " [4, 4, 5, 5, 6, 6]])"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ "execution_count": 12,
|
|
|
+ "metadata": {},
|
|
|
+ "output_type": "execute_result"
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "source": [
|
|
|
+ "from scipy.sparse import bsr_matrix\n",
|
|
|
+ "\n",
|
|
|
+ "indptr = np.array([0, 2, 3, 6])\n",
|
|
|
+ "indices = np.array([0, 2, 2, 0, 1, 2])\n",
|
|
|
+ "data = np.array([1, 2, 3, 4, 5, 6]).repeat(4).reshape(6, 2, 2)\n",
|
|
|
+ "bsr_matrix((data,indices,indptr), shape=(6, 6)).toarray()"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "markdown",
|
|
|
+ "metadata": {},
|
|
|
+ "source": [
|
|
|
+ "## COO - Coordinate Sparse Matrix\n",
|
|
|
+ "\n",
|
|
|
+ "**Advantages of the CSC format**\n",
|
|
|
+ "\n",
|
|
|
+ " * facilitates fast conversion among sparse formats\n",
|
|
|
+ " * permits duplicate entries (see example)\n",
|
|
|
+ " * very fast conversion to and from CSR/CSC formats\n",
|
|
|
+ "\n",
|
|
|
+ "**Disadvantages of the CSC format**\n",
|
|
|
+ "\n",
|
|
|
+ " * does not directly support arithmetic operations and slicing\n",
|
|
|
+ " \n",
|
|
|
+ "** Intended Usage**\n",
|
|
|
+ "\n",
|
|
|
+ " * COO is a fast format for constructing sparse matrices\n",
|
|
|
+ " * Once a matrix has been constructed, convert to CSR or CSC format for fast arithmetic and matrix vector\n",
|
|
|
+ " operations\n",
|
|
|
+ " * By default when converting to CSR or CSC format, duplicate (i,j) entries will be summed together. \n",
|
|
|
+ " This facilitates efficient construction of finite element matrices and the like.\n"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "markdown",
|
|
|
+ "metadata": {},
|
|
|
+ "source": [
|
|
|
+ "## DOK - Dictionary of Keys\n",
|
|
|
+ "\n",
|
|
|
+ "Sparse matrices can be used in arithmetic operations: they support addition, subtraction, multiplication, division, and matrix power.\n",
|
|
|
+ "\n",
|
|
|
+ "Allows for efficient O(1) access of individual elements. Duplicates are not allowed. Can be efficiently converted to a coo_matrix once constructed."
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 15,
|
|
|
+ "metadata": {
|
|
|
+ "collapsed": false
|
|
|
+ },
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "data": {
|
|
|
+ "text/plain": [
|
|
|
+ "array([[ 0., 1., 2., 3., 4.],\n",
|
|
|
+ " [ 0., 2., 3., 4., 5.],\n",
|
|
|
+ " [ 0., 0., 4., 5., 6.],\n",
|
|
|
+ " [ 0., 0., 0., 6., 7.],\n",
|
|
|
+ " [ 0., 0., 0., 0., 8.]], dtype=float32)"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ "execution_count": 15,
|
|
|
+ "metadata": {},
|
|
|
+ "output_type": "execute_result"
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "source": [
|
|
|
+ "from scipy.sparse import dok_matrix\n",
|
|
|
+ "S = dok_matrix((5, 5), dtype=np.float32)\n",
|
|
|
+ "for i in range(5):\n",
|
|
|
+ " for j in range(i, 5):\n",
|
|
|
+ " S[i,j] = i+j\n",
|
|
|
+ " \n",
|
|
|
+ "S.toarray()"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "markdown",
|
|
|
+ "metadata": {},
|
|
|
+ "source": [
|
|
|
+ "The ``scipy.sparse`` submodule also has a lot of functions for sparse matrices\n",
|
|
|
+ "including linear algebra, sparse solvers, graph algorithms, and much more."
|
|
|
+ ]
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "metadata": {
|
|
|
+ "kernelspec": {
|
|
|
+ "display_name": "Python 3",
|
|
|
+ "language": "python",
|
|
|
+ "name": "python3"
|
|
|
+ },
|
|
|
+ "language_info": {
|
|
|
+ "codemirror_mode": {
|
|
|
+ "name": "ipython",
|
|
|
+ "version": 3
|
|
|
+ },
|
|
|
+ "file_extension": ".py",
|
|
|
+ "mimetype": "text/x-python",
|
|
|
+ "name": "python",
|
|
|
+ "nbconvert_exporter": "python",
|
|
|
+ "pygments_lexer": "ipython3",
|
|
|
+ "version": "3.4.3"
|
|
|
+ }
|
|
|
+ },
|
|
|
+ "nbformat": 4,
|
|
|
+ "nbformat_minor": 0
|
|
|
+}
|