In [1]:
import warnings

import matplotlib as mpl
import matplotlib.pyplot as plt # plotting
import matplotlib.ticker as mtick
import numpy as np
import pandas as pd # DataFrames and plotting
import seaborn as sns # statistical plots (heatmap, pairplot, kdeplot)
from ipywidgets import interactive # widgets and interactivity
from ipywidgets import widgets 
from ipywidgets import Layout
from ipywidgets import Label
from ipywidgets import VBox, HBox
from matplotlib.colors import ListedColormap # custom color maps
from matplotlib.patches import Rectangle
from mpl_toolkits.axes_grid1 import make_axes_locatable
from numpy.linalg import eig # Eigen values and Eigen vectors
from sklearn.decomposition import PCA # PCA program from scikit learn (package for machine learning)
from sklearn.preprocessing import StandardScaler # normalize synthetic data
warnings.filterwarnings('ignore')

In [None]:
# Build a synthetic, standardized multivariate Gaussian dataset and compare the
# target covariance matrix with the correlation matrix realized by the sample.
m = 4 # number of features
seed = 73073 # fixed seed so a fresh "Restart & Run All" reproduces the same sample
mean = np.zeros(m)

# Target covariance: unit variances, 0.8 for every pair, then selected
# symmetric pairs are overridden.
cov = np.full((m, m), 0.8)
np.fill_diagonal(cov, 1.0)
cov[2, 3] = cov[3, 2] = 0.2
cov[1, 3] = cov[3, 1] = -0.2
cov[2, 0] = cov[0, 2] = 0.4
cov[1, 0] = cov[0, 1] = -0.5
# NOTE(review): this target is not positive semi-definite (the 3x3 minor for
# features 0, 1, 2 has determinant -0.37). NumPy only warns about that, and
# warnings are globally ignored in this notebook, so the realized correlations
# can differ visibly from the target shown in the left panel.

rng = np.random.default_rng(seed)
data = rng.multivariate_normal(mean=mean, cov=cov, size=1000) # draw samples from MV Gaussian
data = StandardScaler().fit_transform(data) # zero mean, unit variance per feature

df = pd.DataFrame(data, columns=[str(i) for i in range(m)])

# Side-by-side heat maps on a common [-1, 1] colour scale. `annot=True` is
# required for `fmt` to have any effect (it formats the cell annotations).
fig, (ax_target, ax_actual) = plt.subplots(1, 2)
heatmap_kws = dict(vmin=-1.0, vmax=1.0, linewidths=.5, annot=True, fmt='.1f', cmap=plt.cm.Spectral_r)

sns.heatmap(cov, ax=ax_target, **heatmap_kws)
ax_target.set_title('Target Covariance Matrix')

# df.corr() is the sample correlation matrix, so the panel is labelled as such.
sns.heatmap(df.corr(), ax=ax_actual, **heatmap_kws)
ax_actual.set_title('Actual Correlation Matrix')

plt.subplots_adjust(left=0.0, bottom=0.0, right=2.0, top=1.1, wspace=0.2, hspace=0.2); plt.show()

df.head()

In [None]:
# Pairwise correlation between all columns of the standardized DataFrame.
df.corr()

In [None]:
# Summary statistics (count, mean, std, quartiles, extremes) for every column of df.
df.describe()

In [None]:
# Matrix scatter plot of a random subsample of df: density estimates on the
# diagonal, points off the diagonal, and 3-level density contours overlaid on
# the lower triangle.
nsample = 100
dpalette = sns.color_palette("rocket_r", n_colors=3)
palette = sns.color_palette("rocket")

point_style = {'s': 50, 'edgecolor': "black", 'linewidth': 0.5, 'alpha': 0.2}
diagonal_style = {'edgecolor': 'black'}

matrixplot = sns.pairplot(
    df.sample(n=nsample),
    diag_kind='kde',
    palette=dpalette,
    diag_kws=diagonal_style,
    plot_kws=point_style,
)
matrixplot.map_lower(sns.kdeplot, levels=3, color="black")

plt.subplots_adjust(left=0.0, bottom=0.0, right=0.5, top=0.6, wspace=0.2, hspace=0.3)
plt.show()

In [None]:
# Eigen decomposition of the sample covariance matrix, sorted by descending
# eigen value, shown as (left) cumulative variance explained and (right) a heat
# map of the eigen vectors / component loadings.
cov_actual = np.cov(data, rowvar=False) # columns of data are the features
eigen_values, eigen_vectors = eig(cov_actual)
sorted_indices = np.argsort(-eigen_values) # largest eigen value first
sorted_eigen_vectors = eigen_vectors[:, sorted_indices] # reorder columns to match the sorted eigen values
sorted_eigen_values = eigen_values[sorted_indices]

# Derive the number of components from the data instead of hard-coding 4.
n_pc = sorted_eigen_values.size
pc_numbers = np.arange(1, n_pc + 1)
cumulative_variance = np.cumsum(sorted_eigen_values) / np.sum(sorted_eigen_values) * 100

plt.subplot(121)
plt.plot(pc_numbers, cumulative_variance, color='darkorange', alpha=0.8)
plt.scatter(pc_numbers, cumulative_variance, color='darkorange', alpha=0.8, edgecolor='black')
plt.plot([1, n_pc], [95, 95], color='black', linestyle='dashed') # 95% reference line
plt.xlabel('Principal Component'); plt.ylabel('Cumulative Variance Explained'); plt.title('Cumulative Variance Explained by Principal Component')
fmt = '%.0f%%' # Format you want the ticks, e.g. '40%'
yticks = mtick.FormatStrFormatter(fmt)
plt.xlim(1, n_pc); plt.ylim(0, 100.0); plt.annotate('95% variance explained', [n_pc - 1.0, 90])
plt.xticks(pc_numbers) # one tick per component, no fractional component numbers
plt.gca().yaxis.set_major_formatter(yticks)

plt.subplot(122)
# Pin the colour scale to [-1, 1] so the diverging colormap is centred on zero
# and matches the colour bar ticks.
im = plt.imshow(sorted_eigen_vectors, cmap=plt.cm.Spectral_r, vmin=-1.0, vmax=1.0)
plt.title('Eigen Vectors / Principal Component Loadings') # this panel shows loadings, not the covariance matrix
cbar = plt.colorbar(
    im, orientation="vertical", ticks=np.linspace(-1, 1, 11) # 11 points -> ticks every 0.2
)
cbar.set_label('Component Loadings', rotation=270, labelpad=20)
plt.xlabel('Eigen Vector'); plt.ylabel('Feature')
# Ascending y-limits flip imshow's default orientation so row 0 is at the bottom.
plt.xlim([-0.5, n_pc - 0.5]); plt.ylim([-0.5, n_pc - 0.5])
tick_positions = np.arange(n_pc)
plt.gca().set_xticks(tick_positions, tick_positions + 1); plt.gca().set_yticks(tick_positions, tick_positions + 1)
for x in np.arange(0.5, n_pc - 0.5, 1.0): # interior cell boundaries only
    plt.plot([x, x], [-0.5, n_pc - 0.5], c='black', lw=3) # heavy lines separate eigen vectors (columns)
    plt.plot([-0.5, n_pc - 0.5], [x, x], c='black', lw=1, ls='--') # light dashed lines separate features (rows)

plt.subplots_adjust(left=0.0, bottom=0.0, right=2.01, top=0.9, wspace=0.2, hspace=0.3); plt.show()

In [None]:
# 2x2 summary figure of the eigen decomposition computed earlier:
#   top-left     original per-feature variance
#   top-right    eigen vectors / principal component loadings
#   bottom-left  sample covariance matrix
#   bottom-right variance of each sorted principal component (eigen values)
fig = plt.figure(figsize=(6, 6))
gs = fig.add_gridspec(2, 2, width_ratios=(1.0, 1.0))

plt_center = fig.add_subplot(gs[0, 1])
plt_x = fig.add_subplot(gs[0, 0], sharey=plt_center)
plt_y = fig.add_subplot(gs[1, 1], sharex=plt_center)
plt_extra = fig.add_subplot(gs[1, 0])

# Positions, labels and limits follow m instead of being hard-coded to 4.
cell_ticks = np.arange(m)
cell_labels = cell_ticks + 1
cell_limits = [-0.5, m - 0.5]
inner_edges = np.arange(0.5, m - 0.5, 1.0) # interior cell boundaries

# Loadings: one unit square per (eigen vector i, feature j); [-1, 1] is mapped
# onto the colormap's [0, 1] range.
for i in range(m):
    for j in range(m):
        shade = (sorted_eigen_vectors[j, i] + 1.0) / 2.0
        plt_center.add_patch(Rectangle((i - 0.5, j - 0.5), 1, 1, color=plt.cm.RdGy_r(shade), fill=True))

plt_center.set_xlim(cell_limits); plt_center.set_ylim(cell_limits)
plt_center.set_xticks(cell_ticks, cell_labels); plt_center.set_yticks(cell_ticks, cell_labels)
for x in inner_edges:
    plt_center.plot([x, x], cell_limits, c='black', lw=3)
    plt_center.plot(cell_limits, [x, x], c='black', lw=1, ls='--')
plt_center.set_title('Eigen Vectors / Principal Component Loadings')

# Variance of each original feature; the x-axis is reversed so bars grow away
# from the loadings panel on the right.
plt_x.barh(y=cell_ticks.astype(float), width=np.var(data, axis=0), color='darkorange', edgecolor='black')
plt_x.set_xlim([1.2, 0]); plt_x.set_yticks(cell_ticks, cell_labels)
plt_x.set_ylabel('Feature'); plt_x.set_xlabel('Variance')
plt_x.set_title('Original Feature Variance')

# Variance along each sorted principal component (the sorted eigen values); the
# y-axis is reversed so bars hang down from the loadings panel above.
plt_y.bar(x=cell_ticks.astype(float), height=sorted_eigen_values, color='darkorange', edgecolor='black')
plt_y.set_ylim([2.5, 0]); plt_y.set_xticks(cell_ticks, cell_labels)
plt_y.set_xlabel('Eigen Value'); plt_y.set_ylabel('Variance') # bars are indexed by eigen value, not by feature
plt_y.set_title('Projected Feature Variance')

# Sample covariance matrix, drawn with the same unit-square scheme.
for i in range(m):
    for j in range(m):
        shade = (cov_actual[j, i] + 1.0) / 2.0
        plt_extra.add_patch(Rectangle((i - 0.5, j - 0.5), 1, 1, color=plt.cm.bwr(shade), fill=True))

plt_extra.set_xlim(cell_limits); plt_extra.set_ylim(cell_limits)
plt_extra.set_xticks(cell_ticks, cell_labels); plt_extra.set_yticks(cell_ticks, cell_labels)
for x in inner_edges:
    plt_extra.plot([x, x], cell_limits, c='black', lw=3)
    plt_extra.plot(cell_limits, [x, x], c='black', lw=1, ls='--')
plt_extra.set_title('Original Covariance Matrix') # this panel shows cov_actual, not the loadings

plt.subplots_adjust(left=0.0, bottom=0.0, right=1.51, top=1.50, wspace=0.2, hspace=0.2); plt.show()

In [None]:



# Regenerate the synthetic dataset with the off-diagonal covariances scaled by
# `cstr` (0.0 -> uncorrelated features), redo the eigen decomposition and draw
# the 2x2 summary figure with annotated loadings and a covariance colour bar.
# NOTE: this cell rebinds data, cov, cov_actual and the eigen_* variables used
# by later cells.
m = 4; cstr = 0.0
seed = 73073 # fixed seed so a fresh "Restart & Run All" reproduces the same sample

mean = np.zeros(m) # make inputs for multivariate dataset
cov = np.full((m, m), 0.8 * cstr)
np.fill_diagonal(cov, 1.0)
cov[2, 3] = cov[3, 2] = 0.2 * cstr
cov[1, 3] = cov[3, 1] = -0.2 * cstr
cov[2, 0] = cov[0, 2] = 0.4 * cstr
cov[1, 0] = cov[0, 1] = -0.5 * cstr
# NOTE(review): at cstr = 1.0 this target is not positive semi-definite (the
# 3x3 minor for features 0, 1, 2 has determinant -0.37); NumPy only warns.

rng = np.random.default_rng(seed)
data = rng.multivariate_normal(mean=mean, cov=cov, size=1000) # draw samples from MV Gaussian
data = StandardScaler().fit_transform(data) # zero mean, unit variance per feature

cov_actual = np.cov(data, rowvar=False)

eigen_values, eigen_vectors = eig(cov_actual) # Eigen values and vectors
sorted_indices = np.argsort(-eigen_values) # largest eigen value first
sorted_eigen_vectors = eigen_vectors[:, sorted_indices]
sorted_eigen_values = eigen_values[sorted_indices]

fig = plt.figure(figsize=(6, 6))
gs = fig.add_gridspec(2, 2, width_ratios=(1.0, 1.0))

plt_center = fig.add_subplot(gs[1, 1])
plt_x = fig.add_subplot(gs[1, 0], sharey=plt_center)
plt_y = fig.add_subplot(gs[0, 1], sharex=plt_center)
plt_extra = fig.add_subplot(gs[0, 0])

# Positions, labels and limits follow m, so changing m above no longer breaks
# the bar charts with a shape mismatch.
cell_ticks = np.arange(m)
cell_labels = cell_ticks + 1
cell_limits = [-0.5, m - 0.5]
inner_edges = np.arange(0.5, m - 0.5, 1.0) # interior cell boundaries

# Loadings: one annotated unit square per (eigen vector i, feature j); [-1, 1]
# is mapped onto the colormap's [0, 1] range.
for i in range(m):
    for j in range(m):
        loading = sorted_eigen_vectors[j, i]
        plt_center.add_patch(Rectangle((i - 0.5, j - 0.5), 1, 1, color=plt.cm.RdGy_r((loading + 1.0) / 2.0), fill=True))
        plt_center.annotate(np.round(loading, 1), (i - 0.1, j - 0.05))

plt_center.set_xlim(cell_limits); plt_center.set_ylim(cell_limits)
plt_center.set_xticks(cell_ticks, cell_labels); plt_center.set_yticks(cell_ticks, cell_labels)
for x in inner_edges:
    plt_center.plot([x, x], cell_limits, c='black', lw=3)
    plt_center.plot(cell_limits, [x, x], c='black', lw=1, ls='--')
plt_center.set_title('Eigen Vectors / Principal Component Loadings')
plt_center.set_xlabel('Eigen Vector'); plt_center.set_ylabel('Feature')

# Variance of each original feature; the x-axis is reversed so bars grow away
# from the loadings panel on the right.
plt_x.barh(y=cell_ticks.astype(float), width=np.var(data, axis=0), color='darkorange', edgecolor='black')
plt_x.set_xlim([1.2, 0]); plt_x.set_yticks(cell_ticks, cell_labels)
plt_x.set_ylabel('Feature'); plt_x.set_xlabel('Variance')
plt_x.set_title('Original Feature Variance')

# Variance along each sorted principal component (the sorted eigen values).
plt_y.bar(x=cell_ticks.astype(float), height=sorted_eigen_values, color='darkorange', edgecolor='black')
plt_y.set_ylim([0, 2.5]); plt_y.set_xticks(cell_ticks, cell_labels)
plt_y.set_xlabel('Eigen Value'); plt_y.set_ylabel('Variance')
plt_y.set_title('Sorted, Projected Feature Variance')

# Sample covariance matrix as coloured unit squares.
for i in range(m):
    for j in range(m):
        shade = (cov_actual[j, i] + 1.0) / 2.0
        plt_extra.add_patch(Rectangle((i - 0.5, j - 0.5), 1, 1, color=plt.cm.BrBG(shade), fill=True))

# Descending y-limits put row 1 at the top, in the usual matrix layout.
plt_extra.set_xlim(cell_limits); plt_extra.set_ylim(cell_limits[::-1])
plt_extra.set_xticks(cell_ticks, cell_labels); plt_extra.set_yticks(cell_ticks, cell_labels)
for x in inner_edges:
    plt_extra.plot([x, x], cell_limits, c='black', lw=2)
    plt_extra.plot(cell_limits, [x, x], c='black', lw=2)
plt_extra.set_title('Original Covariance Matrix')

# Colour bar on the left of the covariance panel, using the same [-1, 1] -> BrBG
# mapping as the squares above.
cplt_extra = make_axes_locatable(plt_extra).append_axes('left', size='5%', pad=0.3)
fig.colorbar(mpl.cm.ScalarMappable(norm=mpl.colors.Normalize(vmin=-1.0, vmax=1.0), cmap=plt.cm.BrBG),
             cax=cplt_extra, orientation='vertical')
cplt_extra.yaxis.set_ticks_position('left')

plt.subplots_adjust(left=0.0, bottom=0.0, right=1.51, top=1.50, wspace=0.2, hspace=0.2); plt.show()

In [None]:
# Show the target covariance matrix most recently used to draw the synthetic data.
cov

In [None]:
# Loading in the last row and last column of the sorted eigen vector matrix
# (row = feature, column = eigen vector ordered by descending eigen value).
# The array is named `sorted_eigen_vectors`; `eigen_vectors_sorted` was never defined.
sorted_eigen_vectors[3, 3]

In [None]:
# Eigen vectors with columns reordered by descending eigen value.
sorted_eigen_vectors

In [None]:
# Fit scikit-learn's PCA on the standardized data and print its components,
# rounded to 3 decimals, for comparison with the eigen vectors computed by hand.
n_components = 4
pca = PCA(n_components=n_components).fit(data)
print(np.round(pca.components_, 3))

In [None]:
# Variance explained by each component of the fitted PCA model
# (presumably comparable to the sorted eigen values computed above — verify).
pca.explained_variance_

In [None]:
# Eigen values in the order returned by eig (not sorted).
eigen_values

In [None]:
# Recompute the descending eigen value order and reorder the eigen vector
# columns accordingly, then display the result.
sorted_indices = (-eigen_values).argsort()
sorted_eigen_vectors = np.take(eigen_vectors, sorted_indices, axis=1)
sorted_eigen_vectors

In [None]:
# Eigen vectors in the order returned by eig (columns not yet sorted by eigen value).
eigen_vectors

In [None]:
# Column order that arranges the eigen values from largest to smallest.
sorted_indices