library(ggplot2)    # plotting
library(dplyr)      # data manipulation
library(viridis)    # color scales
library(ggfortify)  # autoplot() for PCA objects
library(factoextra) # PCA/cluster visualization helpers
library(cluster)    # clustering algorithms
library(plotly)     # interactive 3D plots
library(ggdendro)   # dendrograms with ggplot2
# 2D scatter plot of columns x1 and x2 of X, optionally colored by cluster
scatterplot = function(X, cluster = NULL){
  if(is.null(cluster)) cluster = rep(1, nrow(X))
  if(length(unique(cluster)) == 1){
    ggplot()+
      geom_point(X, mapping = aes(x1, x2))
  } else {
    ggplot()+
      geom_point(data.frame(X, cluster = as.factor(cluster)), mapping = aes(x1, x2, color = cluster))+
      theme(legend.position = "none")
  }
}
# 3D scatter plot of the first three columns of X, colored by cluster
scatterplot3D = function(X, cluster = NULL){
  if(is.null(cluster)) cluster = rep(1, nrow(X))
  plot_ly(x = X[,1], y = X[,2], z = X[,3], color = as.factor(cluster)) %>%
    add_markers() %>%
    layout(scene = list(xaxis = list(title = 'first'),
                        yaxis = list(title = 'second'),
                        zaxis = list(title = 'third')), showlegend = FALSE)
}
# Bar plot of a numeric vector with a diverging fill around zero
# (masks base R's barplot())
barplot = function(values){
  n = length(values)
  df = data.frame(value = values, index = 1:n)
  ggplot(df, aes(index, value, fill = value)) +
    geom_bar(color = "black", stat = "identity")+
    scale_fill_gradient2(low="#619CFF", mid="white", high="#F8766D")+
    scale_x_continuous(breaks = 1:n)+
    theme_bw()+
    theme(legend.position = "none")
}
# Heat map of a matrix A, with rows drawn top to bottom
# (masks stats::heatmap())
heatmap = function(A){
  n = nrow(A)
  p = ncol(A)
  df = data.frame(value = c(A), i = 1:n, j = rep(1:p, rep(n, p)))
  ggplot(df, aes(j, i, fill = value)) +
    geom_tile()+
    scale_fill_viridis()+
    scale_y_reverse()+
    theme_void()
}
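A quick sanity check of the helpers on random data (illustrative only, not part of the analysis below):
Xdemo = data.frame(x1 = rnorm(100), x2 = rnorm(100))  # toy data with the column names scatterplot() expects
scatterplot(Xdemo, cluster = rep(1:2, 50))            # two-cluster coloring
barplot(rnorm(10))                                    # diverging bars around zero
heatmap(matrix(rnorm(30), 5, 6))                      # 5 x 6 random matrix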
The data were extracted from the TIMIT Acoustic-Phonetic Continuous Speech Corpus. A dataset was formed by selecting five phonemes for classification based on digitized speech from this database.
The phonemes are transcribed as follows: "sh" as in "she", "dcl" as in "dark", "iy" as the vowel in "she", "aa" as the vowel in "dark", and "ao" as the first vowel in "water".
From continuous speech of 50 male speakers, 4509 speech frames of 32 msec duration were selected, approximately 2 examples of each phoneme from each speaker. Each speech frame is represented by 512 samples at a 16 kHz sampling rate, and each frame represents one of the above five phonemes. From each speech frame, a log-periodogram of length 256 was computed; these log-periodograms are the feature vectors used below.
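As an aside, here is a minimal sketch of how a length-256 log-periodogram could be obtained from a single 512-sample frame (illustrative only; the names frame, per, and logper are made up, and this is not necessarily the exact preprocessing used to build the dataset):
frame = rnorm(512)             # stand-in for one 32 msec speech frame
per = Mod(fft(frame))^2 / 512  # periodogram at the Fourier frequencies
logper = log(per[1:256])       # keep the 256 frequencies below Nyquist
length(logper)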
Load the data and split it into train and test data.
data = read.csv("https://hastie.su.domains/ElemStatLearn/datasets/phoneme.data", header = T, sep = ",")
X = data %>% dplyr::select(-row.names, -g, -speaker)
Y = data %>% dplyr::select(g) %>% pull() %>% as.factor()
table(Y)
## Y
## aa ao dcl iy sh
## 695 1022 757 1163 872
dim(X)
## [1] 4509 256
n = nrow(X)
p = ncol(X)
train = sample(1:n, n*0.8)
Xtrain = as.matrix(X[train,])
Ytrain = Y[train]
Xtest = as.matrix(X[-train,])
Ytest = Y[-train]
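Note that the split is random and no seed was fixed, so the exact numbers reported below will vary between runs. To make the split reproducible, one could fix the seed before sampling, e.g. (the seed value 1 is an arbitrary choice):
set.seed(1)                        # arbitrary seed, for reproducibility only
train = sample(1:n, round(n*0.8))  # same 80% split as above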
PCA results: project the training data onto the first two principal components.
PCA = prcomp(Xtrain, scale = FALSE)
autoplot(PCA, data = data.frame(Xtrain, phoneme = Ytrain), colour = 'phoneme')
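Before moving on, it can be useful to check how much variance the leading components capture. A possible sketch (fviz_eig() comes from the factoextra package loaded above):
summary(PCA)$importance[, 1:5]  # standard deviation and (cumulative) proportion of variance
fviz_eig(PCA)                   # scree plot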
Fit the LDA model.
library(MASS)
LDA = lda(Xtrain, grouping = Ytrain)
LDA$prior
## aa ao dcl iy sh
## 0.1549764 0.2292764 0.1660660 0.2611589 0.1885223
head(LDA$scaling) # the transformation matrix
## LD1 LD2 LD3 LD4
## x.1 0.054817009 0.01463716 0.06713418 0.050214462
## x.2 -0.011014695 0.07510455 -0.04545557 -0.051805734
## x.3 -0.039125133 0.04164280 0.03327425 -0.055371316
## x.4 -0.009008501 -0.21237745 0.05404024 0.124108619
## x.5 -0.033386367 -0.19818523 0.10077913 0.009457531
## x.6 0.010336300 -0.14173295 0.04316948 0.031095589
dim(LDA$scaling)
## [1] 256 4
head(t(LDA$means))
## aa ao dcl iy sh
## x.1 11.77489 10.96335 9.606017 10.83331 10.81047
## x.2 12.97825 13.46851 9.901521 11.75975 10.65563
## x.3 16.61219 16.73301 13.275869 15.49476 10.89706
## x.4 17.92733 16.95343 14.203040 17.80487 11.21339
## x.5 16.52991 15.85652 13.454478 17.28067 11.43806
## x.6 16.05482 16.50254 13.278399 16.22198 11.79408
dim(t(LDA$means))
## [1] 256 5
Visualize the results.
Option 1: use the transformation matrix.
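In the discriminant coordinates the within-class covariance is spherical, so the LDA rule reduces to a nearest-centroid rule corrected by the priors,
\[\hat{k}(x) = \arg\min_k \left( \tfrac{1}{2}\lVert x - \mu_k \rVert^2 - \log \pi_k \right),\]
which is what the classify() function below implements.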
pi = LDA$prior
AX = Xtrain %*% LDA$scaling
AM = LDA$means %*% LDA$scaling
classify = function(X, M, pi){
  # Euclidean distances between each row of X and each class mean in M
  D = as.matrix(dist(rbind(M, X)))[nrow(M) + (1:nrow(X)), 1:nrow(M)]
  # discriminant: 1/2 * squared distance minus log prior (subtracted columnwise)
  C = scale(1/2 * D^2, center = log(pi), scale = F)
  # assign each point to the class minimizing the discriminant
  colnames(D)[apply(C, 1, which.min)]
}
labels = classify(AX, AM, pi)
df = data.frame(AX, phoneme = labels)
dfm = data.frame(AM)
ggplot(mapping = aes(LD1, LD2, color = phoneme))+
geom_point(data = df)+
geom_point(data = dfm, color = "black", shape = 4, size = 3)
Option 2: use the predict() function.
pred = predict(LDA)
PX = pred$x
df = data.frame(PX, phoneme = pred$class)
predm = predict(LDA, LDA$means)
PM = predm$x
dfm = data.frame(PM, phoneme = predm$class)
ggplot(mapping = aes(LD1, LD2, color = phoneme))+
geom_point(data = df)+
geom_point(data = dfm, color = "black", shape = 4, size = 3)
Options 1 and 2 return different projected data. What is the difference?
sum((AX - PX)^2)
## [1] 538672.7
sum((AM - PM)^2)
## [1] 746.7046
The difference is in centering. The predict() function centers the projections at the prior-weighted mean of the class means, \(\bar{\mu} = \sum_{k=1}^K \pi_k \mu_k\):
colSums(diag(pi) %*% PM)
## LD1 LD2 LD3 LD4
## 2.220446e-16 9.974660e-16 2.081668e-16 2.116363e-16
center = colSums(diag(pi) %*% AM)
AXc = scale(AX, center = center, scale = F)
AMc = scale(AM, center = center, scale = F)
sum((AXc - PX)^2)
## [1] 7.066536e-25
sum((AMc - PM)^2)
## [1] 4.745459e-28
After this recentering the two options agree up to numerical error. Check the other LD components.
ggplot(mapping = aes(LD1, LD2, color = phoneme))+
geom_point(data = df)+
geom_point(data = dfm, color = "black", shape = 4, size = 3)
ggplot(mapping = aes(LD2, LD3, color = phoneme))+
geom_point(data = df)+
geom_point(data = dfm, color = "black", shape = 4, size = 3)
ggplot(mapping = aes(LD3, LD4, color = phoneme))+
geom_point(data = df)+
geom_point(data = dfm, color = "black", shape = 4, size = 3)
Compare true labels with predicted ones.
df = data.frame(rbind(PX, PX),
                phoneme = c(as.character(pred$class), as.character(Ytrain)),
                type = rep(c("predicted", "true"), each = length(train)))
ggplot(df, aes(LD1, LD2, color = phoneme))+
geom_point()+
facet_wrap(~type)
Quantify the accuracy. It’s quite high!
table(Ytrain, pred$class)
##
## Ytrain aa ao dcl iy sh
## aa 451 108 0 0 0
## ao 103 724 0 0 0
## dcl 0 0 586 12 1
## iy 0 0 1 940 1
## sh 0 0 0 0 680
mean(Ytrain == pred$class)
## [1] 0.9373441
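A per-class view of the same confusion matrix makes the aa/ao overlap explicit:
round(prop.table(table(Ytrain, pred$class), margin = 1), 3) # row-normalized: per-class recall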
Plot the decision boundaries when using LD1 and LD2.
grid = expand.grid(LD1 = seq(min(PX[,1]), max(PX[,1]), length.out = 50),
                   LD2 = seq(min(PX[,2]), max(PX[,2]), length.out = 50))
LDAreg = lda(PX[,1:2], grouping = Ytrain)
LDAreg$prior
## aa ao dcl iy sh
## 0.1549764 0.2292764 0.1660660 0.2611589 0.1885223
pi
## aa ao dcl iy sh
## 0.1549764 0.2292764 0.1660660 0.2611589 0.1885223
LDAreg$means
## LD1 LD2
## aa -6.330314 0.3813777
## ao -6.627299 0.0517982
## dcl 4.167750 -2.6549509
## iy 3.706228 -3.2643678
## sh 4.458343 6.4842983
PM[,1:2] # identical to LDAreg$means: refitting on the projected data leaves the class means unchanged
## LD1 LD2
## aa -6.330314 0.3813777
## ao -6.627299 0.0517982
## dcl 4.167750 -2.6549509
## iy 3.706228 -3.2643678
## sh 4.458343 6.4842983
labelsgrid = predict(LDAreg, grid)$class
#labelsgrid = classify(grid, PM[,1:2], pi)
dfgrid = data.frame(grid, phoneme = labelsgrid)
plt = ggplot(mapping = aes(LD1, LD2, color = phoneme))+
  geom_point(data = dfgrid, alpha = 0.2, size = 3)+
  geom_point(data = data.frame(PM), color = "black", shape = 4, size = 3)
plt
Compare to the true labels. The accuracy drops, since we now classify using only the first two discriminant components.
plt +
  geom_point(data = df %>% filter(type == "true"))+
  geom_point(data = data.frame(PM), color = "black", shape = 4, size = 3)
labelsreg = predict(LDAreg, PX[,1:2])$class
mean(Ytrain == labelsreg)
## [1] 0.7280288
Compare the results if we use \(L=1,2,3,4\) discriminant components.
accs = c()
for(L in 1:4){
LDAreg = lda(PX[,1:L,drop = F], grouping = Ytrain)
labelsreg = predict(LDAreg)$class
#labelsreg = classify(PX[,1:L,drop = F], PM[,1:L, drop = F], pi)
accs = c(accs, mean(Ytrain == labelsreg))
}
data.frame(L = 1:4, accuracy = accs)
| L | accuracy  |
|---|-----------|
| 1 | 0.5422789 |
| 2 | 0.7280288 |
| 3 | 0.8735792 |
| 4 | 0.9373441 |
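The barplot() helper defined at the top could be used to visualize these training accuracies:
barplot(accs) # accuracy as a function of the number of components L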
This was the training accuracy. Now evaluate the classification performance on the test set. The results are consistent.
pred = predict(LDA, Xtest)
PXtest = pred$x
accstest = c()
for(L in 1:4){
LDAreg = lda(PX[,1:L,drop = F], grouping = Ytrain)
labelsreg = predict(LDAreg, PXtest[,1:L,drop = F])$class
#labelsreg = classify(PXtest[,1:L,drop = F], PM[,1:L, drop = F], pi)
accstest = c(accstest, mean(Ytest == labelsreg))
}
data.frame(L = 1:4, accuracy_train = accs, accuracy_test = accstest)
| L | accuracy_train | accuracy_test |
|---|----------------|---------------|
| 1 | 0.5422789      | 0.5365854     |
| 2 | 0.7280288      | 0.7272727     |
| 3 | 0.8735792      | 0.8747228     |
| 4 | 0.9373441      | 0.9334812     |
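One possible way to plot the two accuracy curves together (assuming the tidyr package is available for pivot_longer()):
library(tidyr)
data.frame(L = 1:4, train = accs, test = accstest) %>%
  pivot_longer(-L, names_to = "set", values_to = "accuracy") %>%
  ggplot(aes(L, accuracy, color = set)) +
  geom_line() +
  geom_point()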
Fit the QDA model.
QDA = qda(Xtrain, grouping = Ytrain)
QDA$prior # the same as for LDA
## aa ao dcl iy sh
## 0.1549764 0.2292764 0.1660660 0.2611589 0.1885223
head(t(QDA$means)) # the same as for LDA
## aa ao dcl iy sh
## x.1 11.77489 10.96335 9.606017 10.83331 10.81047
## x.2 12.97825 13.46851 9.901521 11.75975 10.65563
## x.3 16.61219 16.73301 13.275869 15.49476 10.89706
## x.4 17.92733 16.95343 14.203040 17.80487 11.21339
## x.5 16.52991 15.85652 13.454478 17.28067 11.43806
## x.6 16.05482 16.50254 13.278399 16.22198 11.79408
dim(QDA$scaling) # the sphering matrices, one per class
## [1] 256 256 5
Compare the performance. QDA overfits: it classifies the training data perfectly, yet its test accuracy falls well below LDA's.
labels = predict(QDA)$class
mean(Ytrain == labels)
## [1] 1
labelstest = predict(QDA, Xtest)$class
mean(Ytest == labelstest)
## [1] 0.8470067
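To see where QDA loses accuracy, one could inspect the test-set confusion matrix (output not shown; it depends on the random split):
table(Ytest, labelstest) # test-set confusion matrix for QDA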