A brief overview of implementing principal component analysis from first principles in R

Photo by Edward Howell on Unsplash

Principal component analysis (PCA) reduces a large set of variables to a smaller set of variables that still contain most of the information in the original data set.

# Start from a clean slate. NOTE: this removes every object currently defined
# in the workspace, so anything created before running the script is lost.
rm(list = ls())

# Close the active graphics device, but only when at least one device is open
# (dev.list() yields NULL, i.e. length 0, when there is none).
if (length(dev.list()) > 0) {
  dev.off()
}
library(tidyverse)
library(matlib)
# Load the built-in iris data set and preview its first rows.
data(iris)
head(iris)

# Build the numeric data matrix used for the PCA. Only numeric columns are
# kept: data.matrix() replaces a factor such as Species with its integer level
# codes, and that arbitrary coding would otherwise enter the analysis as an
# extra variable.
numeric_cols <- vapply(iris, is.numeric, logical(1))
X <- data.matrix(iris[, numeric_cols, drop = FALSE])
X
# Sample moments, correlation matrix and its eigen-decomposition, all computed
# from first principles with matrix algebra on the data matrix X.
n <- nrow(X) # sample size: number of rows of the matrix being analysed
ones <- matrix(1, nrow = n, ncol = 1) # n x 1 column vector of ones
xbar <- (1 / n) * t(X) %*% ones # column vector of sample variable means

# Centering matrix I - (1/n) 1 1'; sandwiching it between t(X) and X gives the
# sums of squares and cross-products about the column means.
centering <- diag(n) - (1 / n) * ones %*% t(ones)
S <- (1 / (n - 1)) * t(X) %*% centering %*% X # sample covariance matrix

# D = diag(1 / standard deviation). The matrix sqrt(diag(diag(S))) is diagonal,
# so its inverse is simply the element-wise reciprocal of its diagonal; no
# general matrix inversion is needed. `nrow` keeps the single-variable case a
# 1 x 1 matrix instead of letting diag() build an identity matrix.
D <- diag(1 / sqrt(diag(S)), nrow = ncol(X))
R <- D %*% S %*% D # sample correlation matrix

# R is symmetric by construction; saying so guarantees real eigenvalues
# regardless of floating-point asymmetry in the product above.
ev <- eigen(R, symmetric = TRUE)
e_values <- ev$values
e_vectors <- ev$vectors
e_values
e_vectors
# Proportion of total variance explained by each principal component.
# The total is the sum of the eigenvalues (for a correlation matrix this equals
# the number of variables, i.e. length(e_values)).
proportion <- e_values / sum(e_values)

# Cumulative proportion of variance explained by the first 1, 2, ... components.
cumulative <- cumsum(proportion)

# Long-format table with one row per (component, measure) pair; `type` tells the
# two measures apart so they can be drawn as separate series.
proportion.df <- data.frame(PC = seq_along(proportion), variance = proportion)
proportion.df$type <- "Proportion"
cumulative.df <- data.frame(PC = seq_along(cumulative), variance = cumulative)
cumulative.df$type <- "Cumulative proportion"
variance <- rbind(proportion.df, cumulative.df)
variance
# Scree plot drawn as lines with points: one coloured series for the proportion
# of variance per component and one for the cumulative proportion.
ggplot(data = variance, mapping = aes(x = PC, y = variance, colour = type)) +
  geom_line(size = 1) +
  geom_point(size = 3) +
  scale_colour_manual(
    values = c(
      "Proportion" = "skyblue3",
      "Cumulative proportion" = "salmon3"
    )
  ) +
  labs(
    title = "Scree plot: PCA on scaled data",
    x = "Principal components",
    y = "Variance explained"
  )
# Scree plot drawn as bars: proportion of variance explained per component.
ggplot(data = proportion.df, mapping = aes(x = PC, y = variance)) +
  geom_col(fill = "skyblue3") +
  ggtitle("Scree plot: PCA on scaled data") +
  labs(x = "Principal components", y = "Variance explained")
# Standardise the data: centre every column on its sample mean and divide by its
# sample standard deviation. The divisor must be the standard deviation itself
# (sqrt of the diagonal of S), not the reciprocal stored on the diagonal of D;
# dividing by the reciprocal would multiply each column by its standard
# deviation instead of scaling it to unit variance.
V <- sqrt(diag(S)) # sample standard deviation of each variable
Z <- (X - ones %*% t(xbar)) / (ones %*% t(V))

# Principal component scores: project the standardised data onto the
# eigenvectors of the correlation matrix.
pcomps <- Z %*% e_vectors
pcomps

LEAVE A REPLY

Please enter your comment!
Please enter your name here