library(ggplot2)
library(dplyr)
barplot = function(values){
n = length(values)
df = data.frame(value = values, index = 1:n)
ggplot(df, aes(index, value, fill = value)) +
geom_bar(color = "black", stat = "identity")+
scale_fill_gradient2(low="#619CFF", mid="white", high="#F8766D")+
theme_bw()
}
heatmap = function(A){
n = nrow(A)
p = ncol(A)
df = data.frame(value = c(A), i = 1:n, j = rep(1:p, rep(n, p)))
ggplot(df, aes(j, i, fill = value)) +
geom_tile(color = "black")+
scale_fill_gradient2(low="#619CFF", mid="white", high="#F8766D")+
scale_y_reverse()+
theme_void()
}
Fnorm = function(A) return(sqrt(sum(A^2)))
The iris dataset contains four features: length and
width of sepals and petals. It includes 50 samples of three species of
Iris: Iris setosa, Iris virginica and Iris versicolor. Type
?iris to learn more.
data("iris")
head(iris)
| Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
|---|---|---|---|---|
| 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 5.0 | 3.6 | 1.4 | 0.2 | setosa |
| 5.4 | 3.9 | 1.7 | 0.4 | setosa |
Let’s check the number of observation per Species as well as the average values for sepal and petal widths and lengths.
iris %>% group_by(Species) %>%
summarize(n = n())
| Species | n |
|---|---|
| setosa | 50 |
| versicolor | 50 |
| virginica | 50 |
iris %>% group_by(Species) %>%
summarize_all(mean)
| Species | Sepal.Length | Sepal.Width | Petal.Length | Petal.Width |
|---|---|---|---|---|
| setosa | 5.006 | 3.428 | 1.462 | 0.246 |
| versicolor | 5.936 | 2.770 | 4.260 | 1.326 |
| virginica | 6.588 | 2.974 | 5.552 | 2.026 |
Let’s check the scatter plot for each pair of variables.
library(GGally)
ggpairs(iris, columns = 1:4, aes(color = Species))
PCA can be performed using prcomp function.
X = iris[, 1:4]
PCA = prcomp(X, center = TRUE, scale = TRUE)
sdev is the standard deviations of the principal
components (i.e., the square roots of the eigenvalues of the covariance
matrix).
sdev = PCA$sdev
sdev
## [1] 1.7083611 0.9560494 0.3830886 0.1439265
rotation is the matrix of variable loadings (i.e., a
matrix whose columns contain the eigenvectors).
rotation = PCA$rotation
data.frame(rotation)
| PC1 | PC2 | PC3 | PC4 | |
|---|---|---|---|---|
| Sepal.Length | 0.5210659 | -0.3774176 | 0.7195664 | 0.2612863 |
| Sepal.Width | -0.2693474 | -0.9232957 | -0.2443818 | -0.1235096 |
| Petal.Length | 0.5804131 | -0.0244916 | -0.1421264 | -0.8014492 |
| Petal.Width | 0.5648565 | -0.0669420 | -0.6342727 | 0.5235971 |
x the value of the rotated data (the centered and scaled
data multiplied by the rotation matrix) is returned.
head(data.frame(PCA$x))
| PC1 | PC2 | PC3 | PC4 |
|---|---|---|---|
| -2.257141 | -0.4784238 | 0.1272796 | 0.0240875 |
| -2.074013 | 0.6718827 | 0.2338255 | 0.1026628 |
| -2.356335 | 0.3407664 | -0.0440539 | 0.0282823 |
| -2.291707 | 0.5953999 | -0.0909853 | -0.0657353 |
| -2.381863 | -0.6446757 | -0.0156856 | -0.0358029 |
| -2.068701 | -1.4842053 | -0.0268782 | 0.0065861 |
Variance explained (VE) and the cumulative values of VE can be found
using summary() function.
summ = summary(PCA)
data.frame(summ$importance)
| PC1 | PC2 | PC3 | PC4 | |
|---|---|---|---|---|
| Standard deviation | 1.708361 | 0.9560494 | 0.3830886 | 0.1439265 |
| Proportion of Variance | 0.729620 | 0.2285100 | 0.0366900 | 0.0051800 |
| Cumulative Proportion | 0.729620 | 0.9581300 | 0.9948200 | 1.0000000 |
Generate scree plot for variance explained.
ve = summ$importance[2,]
barplot(ve)+
xlab("principal component")+
ylab("variance explained")+
ylim(0, 1)
Generate biplot with autoplot.
library(ggfortify)
autoplot(PCA, data = iris, scale = 0)
Add class labels and vectors representing each feature.
autoplot(PCA, data = iris, color = "Species", loadings = TRUE, loadings.label = TRUE, loadings.label.size = 3, scale = 0)
Normalized handwritten digits, automatically scanned from envelopes by the U.S. Postal Service. The original scanned digits are binary and of different sizes and orientations; the images here have been deslanted and size normalized, resulting in 16 x 16 grayscale images (Le Cun et al., 1990). The data can be accessed at https://hastie.su.domains/ElemStatLearn/.
The data consists of the digit id (0-9) followed by the 256 grayscale values.
We first check how many digits are in the training data.
zip = read.table("Data/zip.train")
dim(zip)
## [1] 7291 257
colnames(zip) = c("label", paste0("pixel", 1:256))
zip = zip %>% mutate(label = factor(label))
zip %>% group_by(label) %>%
summarize(n = n())
| label | n |
|---|---|
| 0 | 1194 |
| 1 | 1005 |
| 2 | 731 |
| 3 | 658 |
| 4 | 652 |
| 5 | 556 |
| 6 | 664 |
| 7 | 645 |
| 8 | 542 |
| 9 | 644 |
Let’s take a look at the images of the digits.
digits = zip[,-1]
getdigit = function(gsvalues){
matrix(unlist(gsvalues), 16, 16, byrow = T)
}
heatmap(getdigit(digits[1,]))
heatmap(getdigit(digits[2,]))
heatmap(getdigit(digits[3,]))
As expected, they are labeled as
zip$label[1:3]
## [1] 6 5 4
## Levels: 0 1 2 3 4 5 6 7 8 9
We will focus on threes only for now.
threes = digits[zip$label == 3,]
heatmap(getdigit(threes[1,]))
heatmap(getdigit(threes[2,]))
heatmap(getdigit(threes[3,]))
We compute the principal components. The first ~50 components explain 90% of the variance.
PCA = prcomp(threes)
ve = summary(PCA)$importance[3,]
barplot(ve[1:60])+
xlab("principal component")+
ylab("cumulative variance explained")+
ylim(0, 1)+
geom_hline(aes(yintercept = 0.9), size = 1, linetype = "dashed")
Lets check the biplot for the first two components.
autoplot(PCA, scale = 0, label = TRUE, label.size = 3, shape = F)
Extreme values along the PC1 axis.
Two negative
heatmap(getdigit(threes["5649",]))
heatmap(getdigit(threes["1528",]))
and two positive.
heatmap(getdigit(threes["2877",]))
heatmap(getdigit(threes["5051",]))
Extreme values along the PC2 axis.
Two negative
heatmap(getdigit(threes["4439",]))
heatmap(getdigit(threes["7167",]))
and two positive.
heatmap(getdigit(threes["321",]))
heatmap(getdigit(threes["5489",]))
Let’s take a look at the loadings. The first loading accounts for horizontal movement (stretching) and the second loading accounts for the line thickness.
heatmap(getdigit(PCA$rotation[,1]))
heatmap(getdigit(PCA$rotation[,2]))
This is the plot for the first two principal components. We see that the first component separates well “0” and “1”. Other digits are mixed up.
PCA = prcomp(digits)
autoplot(PCA, data = zip, shape = F, color = "label", label = TRUE, label.label = "label", scale = 0)+
theme(legend.position = "none")
Let’s take a look at the loadings. The interpretation is more challenging here :)
heatmap(getdigit(PCA$rotation[,1]))
heatmap(getdigit(PCA$rotation[,2]))
Create \(S\) first.
U = matrix(c(1/sqrt(2), 1/sqrt(2), -1/sqrt(2), 1/sqrt(2)), 2, 2)
lambda = c(5, 1)
S = U %*% diag(lambda) %*% t(U)
Consider 20 different initialization for \(v\).
nrep = 20
v0 = matrix(rnorm(nrep * 2), nrep, 2)
v0 = v0/sqrt(rowSums(v0^2))
colnames(v0) = c("x1", "x2")
df = data.frame(v0, point = factor(1:nrep))
ggplot(df, aes(x1, x2, color = point))+
geom_point()+
xlim(-1, 1)+
ylim(-1, 1)+
theme(legend.position = "none")+
geom_abline(aes(intercept = 0, slope = U[1,1]/U[2,1]), alpha = 0.2)+
annotate("segment", x = 0, y = 0, xend = U[1,1], yend = U[2,1], size = 0.5)
Plot the results after one iteration of power iteration.
poweriter = function(v){
u = S %*% v
return(u / sqrt(sum(u^2)))
}
v1 = t(apply(v0, 1, poweriter))
colnames(v1) = c("x1", "x2")
df = data.frame(v1, point = factor(1:nrep))
ggplot(df, aes(x1, x2, color = point))+
geom_point()+
xlim(-1, 1)+
ylim(-1, 1)+
theme(legend.position = "none")+
geom_abline(aes(intercept = 0, slope = U[1,1]/U[2,1]), alpha = 0.2)+
annotate("segment", x = 0, y = 0, xend = U[1,1], yend = U[2,1], size = 0.5)
Compute results for 5 iterations.
library(plotly)
viter = data.frame(v0, point = factor(1:nrep), iter = 0)
lambda1iter = data.frame(lambda1 = diag(v0 %*% S %*% t(v0)), point = factor(1:nrep), iter = 0)
v = v0
for(iter in 1:5){
v = t(apply(v, 1, poweriter))
colnames(v) = c("x1", "x2")
viter = rbind(viter, data.frame(v, point = factor(1:nrep), iter = iter))
lambda1iter = rbind(lambda1iter, data.frame(lambda1 = diag(v %*% S %*% t(v)), point = factor(1:nrep), iter = iter))
}
Plot eigen vectors vs iteration for each initialization value.
plt = ggplot(viter, aes(x1, x2, color = point, frame = iter))+
geom_point()+
xlim(-1, 1)+
ylim(-1, 1)+
theme(legend.position = "none")+
geom_abline(aes(intercept = 0, slope = U[1,1]/U[2,1]), alpha = 0.2)+
annotate("segment", x = 0, y = 0, xend = U[1,1], yend = U[2,1], size = 0.5)+
coord_fixed()
ggplotly(plt)
What happens to the eigenvalues?
ggplot(lambda1iter, aes(iter, lambda1, color = point))+
geom_line()+
theme(legend.position = "none")+
xlab("iteration")+
ylab("value of the top eigen value")