Principal component analysis

Iris dataset

https://www.datacamp.com/tutorial/machine-learning-in-r

The iris dataset contains four features: length and width of sepals and petals. It includes 50 samples of three species of Iris: Iris setosa, Iris virginica and Iris versicolor. Type ?iris to learn more.

data("iris")
head(iris)

Sepal.Length	Sepal.Width	Petal.Length	Petal.Width	Species
5.1	3.5	1.4	0.2	setosa
4.9	3.0	1.4	0.2	setosa
4.7	3.2	1.3	0.2	setosa
4.6	3.1	1.5	0.2	setosa
5.0	3.6	1.4	0.2	setosa
5.4	3.9	1.7	0.4	setosa

Let’s check the number of observation per Species as well as the average values for sepal and petal widths and lengths.

iris %>% group_by(Species) %>%
  summarize(n = n())

Species	n
setosa	50
versicolor	50
virginica	50

iris %>% group_by(Species) %>%
  summarize_all(mean)

Species	Sepal.Length	Sepal.Width	Petal.Length	Petal.Width
setosa	5.006	3.428	1.462	0.246
versicolor	5.936	2.770	4.260	1.326
virginica	6.588	2.974	5.552	2.026

Let’s check the scatter plot for each pair of variables.

library(GGally)
ggpairs(iris, columns = 1:4, aes(color = Species))

PCA on iris dataset

PCA can be performed using prcomp function.

X = iris[, 1:4]
PCA = prcomp(X, center = TRUE, scale = TRUE)

sdev is the standard deviations of the principal components (i.e., the square roots of the eigenvalues of the covariance matrix).

sdev = PCA$sdev
sdev

## [1] 1.7083611 0.9560494 0.3830886 0.1439265

rotation is the matrix of variable loadings (i.e., a matrix whose columns contain the eigenvectors).

rotation = PCA$rotation
data.frame(rotation)

	PC1	PC2	PC3	PC4
Sepal.Length	0.5210659	-0.3774176	0.7195664	0.2612863
Sepal.Width	-0.2693474	-0.9232957	-0.2443818	-0.1235096
Petal.Length	0.5804131	-0.0244916	-0.1421264	-0.8014492
Petal.Width	0.5648565	-0.0669420	-0.6342727	0.5235971

x the value of the rotated data (the centered and scaled data multiplied by the rotation matrix) is returned.

head(data.frame(PCA$x))

PC1	PC2	PC3	PC4
-2.257141	-0.4784238	0.1272796	0.0240875
-2.074013	0.6718827	0.2338255	0.1026628
-2.356335	0.3407664	-0.0440539	0.0282823
-2.291707	0.5953999	-0.0909853	-0.0657353
-2.381863	-0.6446757	-0.0156856	-0.0358029
-2.068701	-1.4842053	-0.0268782	0.0065861

Variance explained (VE) and the cumulative values of VE can be found using summary() function.

summ = summary(PCA)
data.frame(summ$importance)

	PC1	PC2	PC3	PC4
Standard deviation	1.708361	0.9560494	0.3830886	0.1439265
Proportion of Variance	0.729620	0.2285100	0.0366900	0.0051800
Cumulative Proportion	0.729620	0.9581300	0.9948200	1.0000000

Generate scree plot for variance explained.

ve = summ$importance[2,]
barplot(ve)+
  xlab("principal component")+
  ylab("variance explained")+
  ylim(0, 1)

Generate biplot with autoplot.

library(ggfortify)
autoplot(PCA, data = iris, scale = 0)

Add class labels and vectors representing each feature.

autoplot(PCA, data = iris, color = "Species", loadings = TRUE, loadings.label = TRUE, loadings.label.size = 3, scale = 0)

Handwritten digits data

Normalized handwritten digits, automatically scanned from envelopes by the U.S. Postal Service. The original scanned digits are binary and of different sizes and orientations; the images here have been deslanted and size normalized, resulting in 16 x 16 grayscale images (Le Cun et al., 1990). The data can be accessed at https://hastie.su.domains/ElemStatLearn/.

The data consists of the digit id (0-9) followed by the 256 grayscale values.

First look

We first check how many digits are in the training data.

zip = read.table("Data/zip.train")
dim(zip)

## [1] 7291  257

colnames(zip) = c("label", paste0("pixel", 1:256))
zip = zip %>% mutate(label = factor(label))
zip %>% group_by(label) %>%
  summarize(n = n())

label	n
0	1194
1	1005
2	731
3	658
4	652
5	556
6	664
7	645
8	542
9	644

Let’s take a look at the images of the digits.

digits = zip[,-1]
getdigit = function(gsvalues){
 matrix(unlist(gsvalues), 16, 16, byrow = T) 
}
heatmap(getdigit(digits[1,]))

heatmap(getdigit(digits[2,]))

heatmap(getdigit(digits[3,]))

As expected, they are labeled as

zip$label[1:3]

## [1] 6 5 4
## Levels: 0 1 2 3 4 5 6 7 8 9

PCA for “3” only

We will focus on threes only for now.

threes = digits[zip$label == 3,]
heatmap(getdigit(threes[1,]))

heatmap(getdigit(threes[2,]))

heatmap(getdigit(threes[3,]))

We compute the principal components. The first ~50 components explain 90% of the variance.

PCA = prcomp(threes)
ve = summary(PCA)$importance[3,]
barplot(ve[1:60])+
  xlab("principal component")+
  ylab("cumulative variance explained")+
  ylim(0, 1)+
  geom_hline(aes(yintercept = 0.9), size = 1, linetype = "dashed")

Lets check the biplot for the first two components.

autoplot(PCA, scale = 0, label = TRUE, label.size = 3, shape = F)

Extreme values along the PC1 axis.

Two negative

heatmap(getdigit(threes["5649",]))

heatmap(getdigit(threes["1528",]))

and two positive.

heatmap(getdigit(threes["2877",]))

heatmap(getdigit(threes["5051",]))

Extreme values along the PC2 axis.

Two negative

heatmap(getdigit(threes["4439",]))

heatmap(getdigit(threes["7167",]))

and two positive.

heatmap(getdigit(threes["321",]))

heatmap(getdigit(threes["5489",]))

Let’s take a look at the loadings. The first loading accounts for horizontal movement (stretching) and the second loading accounts for the line thickness.

heatmap(getdigit(PCA$rotation[,1]))

heatmap(getdigit(PCA$rotation[,2]))

PCA for all digits

This is the plot for the first two principal components. We see that the first component separates well “0” and “1”. Other digits are mixed up.

PCA = prcomp(digits)
autoplot(PCA, data = zip, shape = F, color = "label", label = TRUE, label.label = "label", scale = 0)+
  theme(legend.position = "none")

Let’s take a look at the loadings. The interpretation is more challenging here :)

heatmap(getdigit(PCA$rotation[,1]))

heatmap(getdigit(PCA$rotation[,2]))

Illustration of power iteration

Create \(S\) first.

U = matrix(c(1/sqrt(2), 1/sqrt(2), -1/sqrt(2), 1/sqrt(2)), 2, 2)
lambda = c(5, 1)
S = U %*% diag(lambda) %*% t(U)

Consider 20 different initialization for \(v\).

nrep = 20
v0 = matrix(rnorm(nrep * 2), nrep, 2)
v0 = v0/sqrt(rowSums(v0^2))
colnames(v0) = c("x1", "x2")

df = data.frame(v0, point = factor(1:nrep))
ggplot(df, aes(x1, x2, color = point))+
  geom_point()+
  xlim(-1, 1)+
  ylim(-1, 1)+
  theme(legend.position = "none")+
  geom_abline(aes(intercept = 0, slope = U[1,1]/U[2,1]), alpha = 0.2)+
  annotate("segment", x = 0, y = 0, xend = U[1,1], yend = U[2,1], size = 0.5)

Plot the results after one iteration of power iteration.

poweriter = function(v){
  u = S %*% v
  return(u / sqrt(sum(u^2)))
}

v1 = t(apply(v0, 1, poweriter))
colnames(v1) = c("x1", "x2")
df = data.frame(v1, point = factor(1:nrep))
ggplot(df, aes(x1, x2, color = point))+
  geom_point()+
  xlim(-1, 1)+
  ylim(-1, 1)+
  theme(legend.position = "none")+
  geom_abline(aes(intercept = 0, slope = U[1,1]/U[2,1]), alpha = 0.2)+
  annotate("segment", x = 0, y = 0, xend = U[1,1], yend = U[2,1], size = 0.5)

Compute results for 5 iterations.

library(plotly)
viter = data.frame(v0, point = factor(1:nrep), iter = 0)
lambda1iter = data.frame(lambda1 = diag(v0 %*% S %*% t(v0)), point = factor(1:nrep), iter = 0)
v = v0
for(iter in 1:5){
  v = t(apply(v, 1, poweriter))
  colnames(v) = c("x1", "x2")
  viter = rbind(viter, data.frame(v, point = factor(1:nrep), iter = iter))
  lambda1iter = rbind(lambda1iter, data.frame(lambda1 = diag(v %*% S %*% t(v)), point = factor(1:nrep), iter = iter)) 
}

Plot eigen vectors vs iteration for each initialization value.

plt = ggplot(viter, aes(x1, x2, color = point, frame = iter))+
  geom_point()+
  xlim(-1, 1)+
  ylim(-1, 1)+
  theme(legend.position = "none")+
  geom_abline(aes(intercept = 0, slope = U[1,1]/U[2,1]), alpha = 0.2)+
  annotate("segment", x = 0, y = 0, xend = U[1,1], yend = U[2,1], size = 0.5)+
  coord_fixed()
ggplotly(plt)

What happens to the eigenvalues?

ggplot(lambda1iter, aes(iter, lambda1, color = point))+
  geom_line()+
  theme(legend.position = "none")+
  xlab("iteration")+
  ylab("value of the top eigen value")