library(ggplot2)
library(dplyr)
barplot = function(values){
n = length(values)
df = data.frame(value = values, index = 1:n)
ggplot(df, aes(index, value, fill = value)) +
geom_bar(color = "black", stat = "identity")+
scale_fill_gradient2(low="#619CFF", mid="white", high="#F8766D")+
theme_bw()
}
heatmap = function(A){
n = nrow(A)
p = ncol(A)
df = data.frame(value = c(A), i = 1:n, j = rep(1:p, rep(n, p)))
ggplot(df, aes(j, i, fill = value)) +
geom_tile(color = "black")+
scale_fill_gradient2(low="#619CFF", mid="white", high="#F8766D")+
scale_y_reverse()+
theme_void()
}
Fnorm = function(A) return(sqrt(sum(A^2)))
The iris
dataset contains four features: length and
width of sepals and petals. It includes 50 samples of three species of
Iris: Iris setosa, Iris virginica and Iris versicolor. Type
?iris
to learn more.
data("iris")
head(iris)
Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
---|---|---|---|---|
5.1 | 3.5 | 1.4 | 0.2 | setosa |
4.9 | 3.0 | 1.4 | 0.2 | setosa |
4.7 | 3.2 | 1.3 | 0.2 | setosa |
4.6 | 3.1 | 1.5 | 0.2 | setosa |
5.0 | 3.6 | 1.4 | 0.2 | setosa |
5.4 | 3.9 | 1.7 | 0.4 | setosa |
Let’s check the number of observation per Species as well as the average values for sepal and petal widths and lengths.
iris %>% group_by(Species) %>%
summarize(n = n())
Species | n |
---|---|
setosa | 50 |
versicolor | 50 |
virginica | 50 |
iris %>% group_by(Species) %>%
summarize_all(mean)
Species | Sepal.Length | Sepal.Width | Petal.Length | Petal.Width |
---|---|---|---|---|
setosa | 5.006 | 3.428 | 1.462 | 0.246 |
versicolor | 5.936 | 2.770 | 4.260 | 1.326 |
virginica | 6.588 | 2.974 | 5.552 | 2.026 |
Let’s check the scatter plot for each pair of variables.
library(GGally)
ggpairs(iris, columns = 1:4, aes(color = Species))
PCA can be performed using prcomp
function.
X = iris[, 1:4]
PCA = prcomp(X, center = TRUE, scale = TRUE)
sdev
is the standard deviations of the principal
components (i.e., the square roots of the eigenvalues of the covariance
matrix).
sdev = PCA$sdev
sdev
## [1] 1.7083611 0.9560494 0.3830886 0.1439265
rotation
is the matrix of variable loadings (i.e., a
matrix whose columns contain the eigenvectors).
rotation = PCA$rotation
data.frame(rotation)
PC1 | PC2 | PC3 | PC4 | |
---|---|---|---|---|
Sepal.Length | 0.5210659 | -0.3774176 | 0.7195664 | 0.2612863 |
Sepal.Width | -0.2693474 | -0.9232957 | -0.2443818 | -0.1235096 |
Petal.Length | 0.5804131 | -0.0244916 | -0.1421264 | -0.8014492 |
Petal.Width | 0.5648565 | -0.0669420 | -0.6342727 | 0.5235971 |
x
the value of the rotated data (the centered and scaled
data multiplied by the rotation matrix) is returned.
head(data.frame(PCA$x))
PC1 | PC2 | PC3 | PC4 |
---|---|---|---|
-2.257141 | -0.4784238 | 0.1272796 | 0.0240875 |
-2.074013 | 0.6718827 | 0.2338255 | 0.1026628 |
-2.356335 | 0.3407664 | -0.0440539 | 0.0282823 |
-2.291707 | 0.5953999 | -0.0909853 | -0.0657353 |
-2.381863 | -0.6446757 | -0.0156856 | -0.0358029 |
-2.068701 | -1.4842053 | -0.0268782 | 0.0065861 |
Variance explained (VE) and the cumulative values of VE can be found
using summary()
function.
summ = summary(PCA)
data.frame(summ$importance)
PC1 | PC2 | PC3 | PC4 | |
---|---|---|---|---|
Standard deviation | 1.708361 | 0.9560494 | 0.3830886 | 0.1439265 |
Proportion of Variance | 0.729620 | 0.2285100 | 0.0366900 | 0.0051800 |
Cumulative Proportion | 0.729620 | 0.9581300 | 0.9948200 | 1.0000000 |
Generate scree plot for variance explained.
ve = summ$importance[2,]
barplot(ve)+
xlab("principal component")+
ylab("variance explained")+
ylim(0, 1)
Generate biplot with autoplot
.
library(ggfortify)
autoplot(PCA, data = iris, scale = 0)
Add class labels and vectors representing each feature.
autoplot(PCA, data = iris, color = "Species", loadings = TRUE, loadings.label = TRUE, loadings.label.size = 3, scale = 0)
Normalized handwritten digits, automatically scanned from envelopes by the U.S. Postal Service. The original scanned digits are binary and of different sizes and orientations; the images here have been deslanted and size normalized, resulting in 16 x 16 grayscale images (Le Cun et al., 1990). The data can be accessed at https://hastie.su.domains/ElemStatLearn/.
The data consists of the digit id (0-9) followed by the 256 grayscale values.
We first check how many digits are in the training data.
zip = read.table("Data/zip.train")
dim(zip)
## [1] 7291 257
colnames(zip) = c("label", paste0("pixel", 1:256))
zip = zip %>% mutate(label = factor(label))
zip %>% group_by(label) %>%
summarize(n = n())
label | n |
---|---|
0 | 1194 |
1 | 1005 |
2 | 731 |
3 | 658 |
4 | 652 |
5 | 556 |
6 | 664 |
7 | 645 |
8 | 542 |
9 | 644 |
Let’s take a look at the images of the digits.
digits = zip[,-1]
getdigit = function(gsvalues){
matrix(unlist(gsvalues), 16, 16, byrow = T)
}
heatmap(getdigit(digits[1,]))
heatmap(getdigit(digits[2,]))
heatmap(getdigit(digits[3,]))
As expected, they are labeled as
zip$label[1:3]
## [1] 6 5 4
## Levels: 0 1 2 3 4 5 6 7 8 9
We will focus on threes only for now.
threes = digits[zip$label == 3,]
heatmap(getdigit(threes[1,]))
heatmap(getdigit(threes[2,]))
heatmap(getdigit(threes[3,]))
We compute the principal components. The first ~50 components explain 90% of the variance.
PCA = prcomp(threes)
ve = summary(PCA)$importance[3,]
barplot(ve[1:60])+
xlab("principal component")+
ylab("cumulative variance explained")+
ylim(0, 1)+
geom_hline(aes(yintercept = 0.9), size = 1, linetype = "dashed")
Lets check the biplot for the first two components.
autoplot(PCA, scale = 0, label = TRUE, label.size = 3, shape = F)
Extreme values along the PC1 axis.
Two negative
heatmap(getdigit(threes["5649",]))
heatmap(getdigit(threes["1528",]))
and two positive.
heatmap(getdigit(threes["2877",]))
heatmap(getdigit(threes["5051",]))
Extreme values along the PC2 axis.
Two negative
heatmap(getdigit(threes["4439",]))
heatmap(getdigit(threes["7167",]))
and two positive.
heatmap(getdigit(threes["321",]))
heatmap(getdigit(threes["5489",]))
Let’s take a look at the loadings. The first loading accounts for horizontal movement (stretching) and the second loading accounts for the line thickness.
heatmap(getdigit(PCA$rotation[,1]))
heatmap(getdigit(PCA$rotation[,2]))
This is the plot for the first two principal components. We see that the first component separates well “0” and “1”. Other digits are mixed up.
PCA = prcomp(digits)
autoplot(PCA, data = zip, shape = F, color = "label", label = TRUE, label.label = "label", scale = 0)+
theme(legend.position = "none")
Let’s take a look at the loadings. The interpretation is more challenging here :)
heatmap(getdigit(PCA$rotation[,1]))
heatmap(getdigit(PCA$rotation[,2]))
Create \(S\) first.
U = matrix(c(1/sqrt(2), 1/sqrt(2), -1/sqrt(2), 1/sqrt(2)), 2, 2)
lambda = c(5, 1)
S = U %*% diag(lambda) %*% t(U)
Consider 20 different initialization for \(v\).
nrep = 20
v0 = matrix(rnorm(nrep * 2), nrep, 2)
v0 = v0/sqrt(rowSums(v0^2))
colnames(v0) = c("x1", "x2")
df = data.frame(v0, point = factor(1:nrep))
ggplot(df, aes(x1, x2, color = point))+
geom_point()+
xlim(-1, 1)+
ylim(-1, 1)+
theme(legend.position = "none")+
geom_abline(aes(intercept = 0, slope = U[1,1]/U[2,1]), alpha = 0.2)+
annotate("segment", x = 0, y = 0, xend = U[1,1], yend = U[2,1], size = 0.5)
Plot the results after one iteration of power iteration.
poweriter = function(v){
u = S %*% v
return(u / sqrt(sum(u^2)))
}
v1 = t(apply(v0, 1, poweriter))
colnames(v1) = c("x1", "x2")
df = data.frame(v1, point = factor(1:nrep))
ggplot(df, aes(x1, x2, color = point))+
geom_point()+
xlim(-1, 1)+
ylim(-1, 1)+
theme(legend.position = "none")+
geom_abline(aes(intercept = 0, slope = U[1,1]/U[2,1]), alpha = 0.2)+
annotate("segment", x = 0, y = 0, xend = U[1,1], yend = U[2,1], size = 0.5)
Compute results for 5 iterations.
library(plotly)
viter = data.frame(v0, point = factor(1:nrep), iter = 0)
lambda1iter = data.frame(lambda1 = diag(v0 %*% S %*% t(v0)), point = factor(1:nrep), iter = 0)
v = v0
for(iter in 1:5){
v = t(apply(v, 1, poweriter))
colnames(v) = c("x1", "x2")
viter = rbind(viter, data.frame(v, point = factor(1:nrep), iter = iter))
lambda1iter = rbind(lambda1iter, data.frame(lambda1 = diag(v %*% S %*% t(v)), point = factor(1:nrep), iter = iter))
}
Plot eigen vectors vs iteration for each initialization value.
plt = ggplot(viter, aes(x1, x2, color = point, frame = iter))+
geom_point()+
xlim(-1, 1)+
ylim(-1, 1)+
theme(legend.position = "none")+
geom_abline(aes(intercept = 0, slope = U[1,1]/U[2,1]), alpha = 0.2)+
annotate("segment", x = 0, y = 0, xend = U[1,1], yend = U[2,1], size = 0.5)+
coord_fixed()
ggplotly(plt)
What happens to the eigenvalues?
ggplot(lambda1iter, aes(iter, lambda1, color = point))+
geom_line()+
theme(legend.position = "none")+
xlab("iteration")+
ylab("value of the top eigen value")