{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "7fede23e", "metadata": {}, "outputs": [], "source": [ "import pandas as pd # DataFrames and plotting\n", "import numpy as np\n", "import matplotlib.pyplot as plt # plotting\n", "from matplotlib.colors import ListedColormap # custom color maps\n", "import matplotlib.ticker as mtick\n", "from matplotlib.patches import Rectangle\n", "import matplotlib as mpl\n", "from mpl_toolkits.axes_grid1 import make_axes_locatable\n", "from numpy.linalg import eig # Eigen values and Eigen vectors\n", "from sklearn.decomposition import PCA # PCA program from scikit learn (package for machine learning)\n", "from sklearn.preprocessing import StandardScaler # normalize synthetic data\n", "from ipywidgets import interactive # widgets and interactivity\n", "from ipywidgets import widgets \n", "from ipywidgets import Layout\n", "from ipywidgets import Label\n", "from ipywidgets import VBox, HBox\n", "import warnings\n", "warnings.filterwarnings('ignore')" ] }, { "cell_type": "code", "execution_count": null, "id": "dd001700", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "12effd17", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "70e4f054", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "80e526c4", "metadata": {}, "outputs": [], "source": [ "m = 4\n", "mean = np.zeros((m))\n", "#cov = np.zeros((m,m))\n", "cov = np.full((m,m),0.8)\n", "for i in range(0,m):\n", " cov[i,i] = 1.0\n", "cov[2,3] = cov[3,2] = 0.2; cov[1,3] = cov[3,1] = -0.2; cov[2,0] = cov[0,2] = 0.4; cov[1,0] = cov[0,1] = -0.5\n", "\n", "data = np.random.multivariate_normal(mean = mean, cov = cov, size = 1000)\n", "data = StandardScaler(copy=True, with_mean=True, with_std=True).fit(data).transform(data)\n", "\n", "df = pd.DataFrame(data, columns=np.array(['' + str(i) for i in range(0, m)]))\n", "\n", "plt.subplot(121) # plot correlation matrix with significance colormap\n", "sns.heatmap(cov,vmin = -1.0, vmax = 1.0,linewidths=.5, fmt= '.1f',cmap = plt.cm.Spectral_r)\n", "plt.title('Target Covariance Matrix')\n", "\n", "plt.subplot(122)\n", "sns.heatmap(df.iloc[:,:].corr(),vmin = -1.0, vmax = 1.0,linewidths=.5, fmt= '.1f',cmap = plt.cm.Spectral_r)\n", "plt.title('Actual Covariance Matrix')\n", "\n", "plt.subplots_adjust(left=0.0, bottom=0.0, right=2.0, top=1.1, wspace=0.2, hspace=0.2); plt.show()\n", "\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "26aa283f", "metadata": {}, "outputs": [], "source": [ "df.iloc[:,:].corr()" ] }, { "cell_type": "code", "execution_count": null, "id": "ea5b2515", "metadata": {}, "outputs": [], "source": [ "df.describe()" ] }, { "cell_type": "code", "execution_count": null, "id": "83b1e382", "metadata": {}, "outputs": [], "source": [ "nsample = 100\n", "dpalette = sns.color_palette(\"rocket_r\",n_colors = 3) # matrix scatter plot with points and density estimator\n", "palette = sns.color_palette(\"rocket\")\n", "matrixplot = sns.pairplot(df.sample(n=nsample),diag_kind = 'kde',palette = dpalette,diag_kws={'edgecolor':'black'},plot_kws=dict(s=50, edgecolor=\"black\", linewidth=0.5,alpha=0.2))\n", "matrixplot.map_lower(sns.kdeplot, levels=3, color=\"black\")\n", "plt.subplots_adjust(left=0.0, bottom=0.0, right=0.5, top=0.6, wspace=0.2, hspace=0.3); plt.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "58970b63", "metadata": {}, "outputs": [], "source": [ "cov_actual = np.cov(data,rowvar = False)\n", "eigen_values,eigen_vectors = eig(cov_actual)\n", "sorted_indices = np.argsort(-eigen_values)\n", "sorted_eigen_vectors = eigen_vectors[:, sorted_indices]\n", "sorted_eigen_values = np.sort(-eigen_values)*-1\n", "\n", "plt.subplot(121)\n", "plt.plot(np.arange(1,5,1),np.cumsum(sorted_eigen_values)/np.sum(sorted_eigen_values)*100,color='darkorange',alpha=0.8)\n", "plt.scatter(np.arange(1,5,1),np.cumsum(sorted_eigen_values)/np.sum(sorted_eigen_values)*100,color='darkorange',alpha=0.8,edgecolor='black')\n", "plt.plot([1,4],[95,95], color='black',linestyle='dashed')\n", "plt.xlabel('Principal Component'); plt.ylabel('Cumulative Variance Explained'); plt.title('Cumulative Variance Explained by Principal Component')\n", "fmt = '%.0f%%' # Format you want the ticks, e.g. '40%'\n", "yticks = mtick.FormatStrFormatter(fmt); \n", "plt.xlim(1,4); plt.ylim(0,100.0); plt.annotate('95% variance explained',[3.0,90])\n", "plt.gca().yaxis.set_major_formatter(yticks)\n", "\n", "plt.subplot(122)\n", "im = plt.imshow(sorted_eigen_vectors,cmap = plt.cm.Spectral_r)\n", "plt.title('Actual Covariance Matrix')\n", "cbar = plt.colorbar(\n", " im, orientation=\"vertical\", ticks=np.linspace(-1, 1, 10)\n", ")\n", "cbar.set_label('Component Loadings', rotation=270, labelpad=20)\n", "plt.xlim([-0.5,3.5]); plt.ylim([-0.5,3.5])\n", "plt.gca().set_xticks([0,1, 2, 3],[1,2,3,4]); plt.gca().set_yticks([0,1, 2, 3],[1,2,3,4])\n", "for x in np.arange(0.5,4.5,1.0):\n", " plt.plot([x,x],[-0.5,3.5],c='black',lw=3)\n", " plt.plot([-0.5,3.5],[x,x],c='black',lw=1,ls='--')\n", "\n", "plt.subplots_adjust(left=0.0, bottom=0.0, right=2.01, top=0.9, wspace=0.2, hspace=0.3); plt.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "5142ac6d", "metadata": {}, "outputs": [], "source": [ "fig = plt.figure(figsize=(6, 6))\n", "gs = fig.add_gridspec(2,2 ,width_ratios=(1.0, 1.0))\n", "\n", "plt_center = fig.add_subplot(gs[0, 1])\n", "plt_x = fig.add_subplot(gs[0, 0],sharey=plt_center) \n", "plt_y = fig.add_subplot(gs[1, 1],sharex=plt_center) \n", "plt_extra = fig.add_subplot(gs[1, 0]) \n", "\n", "# im = plt_center.imshow(sorted_eigen_vectors,cmap = plt.cm.Spectral_r)\n", "# plt_center.set_title('Actual Covariance Matrix')\n", "# cbar = plt.colorbar(\n", "# im, orientation=\"vertical\", ticks=np.linspace(-1, 1, 10)\n", "# )\n", "# cbar.set_label('Component Loadings', rotation=270, labelpad=20)\n", "\n", "for i in range(0,m):\n", " for j in range(0,m):\n", " color = (sorted_eigen_vectors[j,i] + 1.0)/(2.0)\n", " plt_center.add_patch(Rectangle((i-0.5,j-0.5), 1, 1,color = plt.cm.RdGy_r(color),fill=True))\n", "\n", "plt_center.set_xlim([-0.5,3.5]); plt_center.set_ylim([-0.5,3.5])\n", "plt_center.set_xticks([0,1, 2, 3],[1,2,3,4]); plt_center.set_yticks([0,1, 2, 3],[1,2,3,4])\n", "for x in np.arange(0.5,3.5,1.0):\n", " plt_center.plot([x,x],[-0.5,3.5],c='black',lw=3)\n", " plt_center.plot([-0.5,3.5],[x,x],c='black',lw=1,ls='--')\n", "plt_center.set_title('Eigen Vectors / Principal Component Loadings') \n", "\n", "plt_x.barh(y=np.array([0,1,2,3],dtype='float'),width=np.var(data,axis=0),color='darkorange',edgecolor='black')\n", "plt_x.set_xlim([1.2,0]); plt_x.set_yticks([0,1, 2, 3],[1,2,3,4])\n", "plt_x.set_ylabel('Feature'); plt_x.set_xlabel('Variance')\n", "plt_x.set_title('Original Feature Variance') \n", "\n", "plt_y.bar(x=np.array([0,1,2,3],dtype='float'),height=sorted_eigen_values,color='darkorange',edgecolor='black')\n", "plt_y.set_ylim([2.5,0]); plt_y.set_xticks([0,1, 2, 3],[1,2,3,4])\n", "plt_y.set_xlabel('Feature'); plt_y.set_ylabel('Variance')\n", "plt_y.set_title('Projected Feature Variance') \n", "\n", "for i in range(0,m):\n", " for j in range(0,m):\n", " color = (cov_actual[j,i] + 1.0)/(2.0)\n", " plt_extra.add_patch(Rectangle((i-0.5,j-0.5), 1, 1,color = plt.cm.bwr(color),fill=True))\n", "\n", "plt_extra.set_xlim([-0.5,3.5]); plt_extra.set_ylim([-0.5,3.5])\n", "plt_extra.set_xticks([0,1, 2, 3],[1,2,3,4]); plt_extra.set_yticks([0,1, 2, 3],[1,2,3,4])\n", "for x in np.arange(0.5,3.5,1.0):\n", " plt_extra.plot([x,x],[-0.5,3.5],c='black',lw=3)\n", " plt_extra.plot([-0.5,3.5],[x,x],c='black',lw=1,ls='--')\n", "plt_extra.set_title('Eigen Vectors / Principal Component Loadings') \n", "\n", "plt.subplots_adjust(left=0.0, bottom=0.0, right=1.51, top=1.50, wspace=0.2, hspace=0.2); plt.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "36698d45", "metadata": {}, "outputs": [], "source": [ "\n", "\n", "\n", "m = 4; cstr = 0.0\n", "\n", "mean = np.zeros((m)) # make inputs for multivariate dataset\n", "#cov = np.zeros((m,m))\n", "cov = np.full((m,m),0.8*cstr)\n", "for i in range(0,m):\n", " cov[i,i] = 1.0\n", "cov[2,3] = cov[3,2] = 0.2*cstr; cov[1,3] = cov[3,1] = -0.2*cstr; cov[2,0] = cov[0,2] = 0.4*cstr; \n", "cov[1,0] = cov[0,1] = -0.5*cstr\n", "\n", "data = np.random.multivariate_normal(mean = mean, cov = cov, size = 1000) # draw samples from MV Gaussian\n", "data = StandardScaler(copy=True, with_mean=True, with_std=True).fit(data).transform(data)\n", "\n", "cov_actual = np.cov(data,rowvar = False)\n", "\n", "eigen_values,eigen_vectors = eig(cov_actual) # Eigen values and vectors \n", "sorted_indices = np.argsort(-eigen_values)\n", "sorted_eigen_vectors = eigen_vectors[:, sorted_indices]\n", "sorted_eigen_values = np.sort(-eigen_values)*-1\n", "\n", "fig = plt.figure(figsize=(6, 6))\n", "gs = fig.add_gridspec(2,2 ,width_ratios=(1.0, 1.0))\n", "\n", "plt_center = fig.add_subplot(gs[1, 1])\n", "plt_x = fig.add_subplot(gs[1, 0],sharey=plt_center) \n", "plt_y = fig.add_subplot(gs[0, 1],sharex=plt_center) \n", "plt_extra = fig.add_subplot(gs[0, 0]) \n", "\n", "for i in range(0,m):\n", " for j in range(0,m):\n", " color = (sorted_eigen_vectors[j,i] + 1.0)/(2.0)\n", " plt_center.add_patch(Rectangle((i-0.5,j-0.5), 1, 1,color = plt.cm.RdGy_r(color),fill=True))\n", " plt_center.annotate(np.round(sorted_eigen_vectors[j,i],1),(i-0.1,j-0.05))\n", "\n", "plt_center.set_xlim([-0.5,3.5]); plt_center.set_ylim([-0.5,3.5])\n", "plt_center.set_xticks([0,1, 2, 3],[1,2,3,4]); plt_center.set_yticks([0,1, 2, 3],[1,2,3,4])\n", "for x in np.arange(0.5,3.5,1.0):\n", " plt_center.plot([x,x],[-0.5,3.5],c='black',lw=3)\n", " plt_center.plot([-0.5,3.5],[x,x],c='black',lw=1,ls='--')\n", "plt_center.set_title('Eigen Vectors / Principal Component Loadings') \n", "plt_center.set_xlabel('Eigen Vector'); plt_center.set_ylabel('Feature')\n", "\n", "plt_x.barh(y=np.array([0,1,2,3],dtype='float'),width=np.var(data,axis=0),color='darkorange',edgecolor='black')\n", "plt_x.set_xlim([1.2,0]); plt_x.set_yticks([0,1, 2, 3],[1,2,3,4])\n", "plt_x.set_ylabel('Feature'); plt_x.set_xlabel('Variance')\n", "plt_x.set_title('Original Feature Variance') \n", "\n", "plt_y.bar(x=np.array([0,1,2,3],dtype='float'),height=sorted_eigen_values,color='darkorange',edgecolor='black')\n", "plt_y.set_ylim([0,2.5]); plt_y.set_xticks([0,1, 2, 3],[1,2,3,4])\n", "plt_y.set_xlabel('Eigen Value'); plt_y.set_ylabel('Variance')\n", "plt_y.set_title('Sorted, Projected Feature Variance') \n", "\n", "for i in range(0,m):\n", " for j in range(0,m):\n", " color = (cov_actual[j,i] + 1.0)/(2.0)\n", " plt_extra.add_patch(Rectangle((i-0.5,j-0.5), 1, 1,color = plt.cm.BrBG(color),fill=True))\n", "\n", "plt_extra.set_xlim([-0.5,3.5]); plt_extra.set_ylim([3.5,-0.5])\n", "plt_extra.set_xticks([0,1, 2, 3],[1,2,3,4]); plt_extra.set_yticks([0,1, 2, 3],[1,2,3,4])\n", "for x in np.arange(0.5,3.5,1.0):\n", " plt_extra.plot([x,x],[-0.5,3.5],c='black',lw=2)\n", " plt_extra.plot([-0.5,3.5],[x,x],c='black',lw=2)\n", "plt_extra.set_title('Original Covariance Matrix') \n", " \n", "cplt_extra = make_axes_locatable(plt_extra).append_axes('left', size='5%', pad=0.3)\n", "fig.colorbar(mpl.cm.ScalarMappable(norm=mpl.colors.Normalize(vmin=-1.0, vmax=1.0), cmap=plt.cm.BrBG),\n", " cax=cplt_extra, orientation='vertical')\n", "cplt_extra.yaxis.set_ticks_position('left')\n", "\n", "plt.subplots_adjust(left=0.0, bottom=0.0, right=1.51, top=1.50, wspace=0.2, hspace=0.2); plt.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "26c81bfe", "metadata": {}, "outputs": [], "source": [ "cov" ] }, { "cell_type": "code", "execution_count": null, "id": "dfd37f62", "metadata": {}, "outputs": [], "source": [ "eigen_vectors_sorted[3][3]" ] }, { "cell_type": "code", "execution_count": null, "id": "8ffc30bb", "metadata": {}, "outputs": [], "source": [ "sorted_eigen_vectors" ] }, { "cell_type": "code", "execution_count": null, "id": "ed00b8a9", "metadata": {}, "outputs": [], "source": [ "n_components = 4\n", "pca = PCA(n_components=n_components,)\n", "pca.fit(data)\n", "print(np.round(pca.components_,3))" ] }, { "cell_type": "code", "execution_count": null, "id": "a6c58c46", "metadata": {}, "outputs": [], "source": [ "pca.explained_variance_" ] }, { "cell_type": "code", "execution_count": null, "id": "39565068", "metadata": {}, "outputs": [], "source": [ "eigen_values" ] }, { "cell_type": "code", "execution_count": null, "id": "4fdb342b", "metadata": {}, "outputs": [], "source": [ "sorted_indices = np.argsort(-eigen_values)\n", "sorted_eigen_vectors = eigen_vectors[:, sorted_indices]\n", "sorted_eigen_vectors" ] }, { "cell_type": "code", "execution_count": null, "id": "4bd12717", "metadata": {}, "outputs": [], "source": [ "eigen_vectors" ] }, { "cell_type": "code", "execution_count": null, "id": "df84d9b7", "metadata": {}, "outputs": [], "source": [ "sorted_indices" ] }, { "cell_type": "code", "execution_count": null, "id": "8b72f080", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.4" } }, "nbformat": 4, "nbformat_minor": 5 }