Helper functions

library(ggplot2)
library(dplyr)
library(viridis)
library(ggfortify)
library(factoextra)
library(cluster)
library(plotly)
library(ggdendro)


# 2D scatterplot: X is a data frame with columns x1 and x2, optional cluster labels for coloring
scatterplot = function(X, cluster = NULL){
  if(is.null(cluster)) cluster = rep(1, nrow(X))
  if(length(unique(cluster)) == 1){
    ggplot()+
      geom_point(X, mapping = aes(x1, x2))
  } else {
    ggplot()+
      geom_point(data.frame(X, cluster = as.factor(cluster)), mapping = aes(x1, x2, color = cluster))+
      theme(legend.position = "none")
  } 
}

# interactive 3D scatterplot of the first three columns of X, optionally colored by cluster
scatterplot3D = function(X, cluster = NULL){
  if(is.null(cluster)) cluster = rep(1, nrow(X))
  plot_ly(x = X[,1], y = X[,2], z = X[,3], color = as.factor(cluster)) %>% add_markers() %>%
    layout(scene = list(xaxis = list(title = 'first'),
                     yaxis = list(title = 'second'),
                     zaxis = list(title = 'third')), showlegend = FALSE)
}

# diverging bar plot of a numeric vector, one bar per index
barplot = function(values){
  n = length(values)
  df = data.frame(value = values,  index = 1:n)
  ggplot(df, aes(index, value, fill = value)) + 
    geom_bar(color = "black", stat = "identity")+
    scale_fill_gradient2(low="#619CFF", mid="white", high="#F8766D")+
    scale_x_continuous(breaks = 1:n)+
    theme_bw()+
    theme(legend.position = "none")
}

# heatmap of a matrix A, drawn with row 1 at the top
heatmap = function(A){
  n = nrow(A)
  p = ncol(A)  
  df = data.frame(value = c(A),  i = 1:n, j = rep(1:p, rep(n, p)))
  ggplot(df, aes(j, i, fill = value)) + 
    geom_tile()+
    scale_fill_viridis()+
    scale_y_reverse()+
    theme_void()
}
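
A quick usage sketch of these helpers on simulated data (a hypothetical example, not part of the phoneme analysis below; note that scatterplot() expects columns named x1 and x2):

set.seed(1)
Xsim = data.frame(x1 = rnorm(100), x2 = rnorm(100))                        # simulated 2D data
scatterplot(Xsim, cluster = rep(1:2, each = 50))                           # colored 2D scatterplot
scatterplot3D(matrix(rnorm(300), ncol = 3), cluster = rep(1:4, each = 25)) # interactive 3D scatterplot
barplot(rnorm(10))                                                         # diverging bar plot
heatmap(matrix(rnorm(50), nrow = 5))                                       # heatmap of a 5 x 10 matrix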

Phoneme data

The data were extracted from the TIMIT Acoustic-Phonetic Continuous Speech Corpus. A dataset was formed by selecting five phonemes for classification based on digitized speech from this database.

The phonemes are transcribed as follows: "sh" as in "she", "dcl" as in "dark", "iy" as the vowel in "she", "aa" as the vowel in "dark", and "ao" as the first vowel in "water".

From continuous speech of 50 male speakers, 4509 speech frames of 32 msec duration were selected, approximately 2 examples of each phoneme from each speaker. Each speech frame is represented by 512 samples at a 16kHz sampling rate, and each frame represents one of the above five phonemes. From each speech frame, a log-periodogram of length 256 was computed; these log-periodograms are the feature vectors used below.

Load the data and split it into training and test sets.

data = read.csv("https://hastie.su.domains/ElemStatLearn/datasets/phoneme.data", header = T, sep = ",")
X = data %>% dplyr::select(-row.names, -g, -speaker)
Y = data %>% dplyr::select(g) %>% pull() %>% as.factor()
table(Y)
## Y
##   aa   ao  dcl   iy   sh 
##  695 1022  757 1163  872
dim(X)
## [1] 4509  256
n = nrow(X)
p = ncol(X)
train = sample(1:n, n*0.8)
Xtrain = as.matrix(X[train,])
Ytrain = Y[train]
Xtest = as.matrix(X[-train,])
Ytest = Y[-train]
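
Before modeling, it can help to look at a few raw log-periodograms. A minimal sketch, assuming Xtrain, Ytrain, and p from above, which draws one randomly chosen frame per phoneme:

idx = sapply(levels(Ytrain), function(ph) sample(which(Ytrain == ph), 1))  # one frame per phoneme
dflp = data.frame(logperiodogram = c(t(Xtrain[idx, ])),
                  frequency = rep(1:p, length(idx)),
                  phoneme = rep(levels(Ytrain), each = p))
ggplot(dflp, aes(frequency, logperiodogram, color = phoneme)) +
  geom_line()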

PCA results.

PCA = prcomp(Xtrain, scale = FALSE)
autoplot(PCA, data = data.frame(Xtrain, phoneme = Ytrain), colour = 'phoneme')             
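
To see how much variance the leading principal components explain, a short sketch using the barplot helper defined above (assumes the PCA object from the previous chunk):

barplot(PCA$sdev[1:20]^2 / sum(PCA$sdev^2))  # proportion of variance explained by the first 20 PCs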

LDA model

Fit the LDA model.

library(MASS)
LDA = lda(Xtrain, grouping = Ytrain)
LDA$prior
##        aa        ao       dcl        iy        sh 
## 0.1549764 0.2292764 0.1660660 0.2611589 0.1885223
head(LDA$scaling) # the transformation (discriminant) matrix
##              LD1         LD2         LD3          LD4
## x.1  0.054817009  0.01463716  0.06713418  0.050214462
## x.2 -0.011014695  0.07510455 -0.04545557 -0.051805734
## x.3 -0.039125133  0.04164280  0.03327425 -0.055371316
## x.4 -0.009008501 -0.21237745  0.05404024  0.124108619
## x.5 -0.033386367 -0.19818523  0.10077913  0.009457531
## x.6  0.010336300 -0.14173295  0.04316948  0.031095589
dim(LDA$scaling)
## [1] 256   4
head(t(LDA$means))
##           aa       ao       dcl       iy       sh
## x.1 11.77489 10.96335  9.606017 10.83331 10.81047
## x.2 12.97825 13.46851  9.901521 11.75975 10.65563
## x.3 16.61219 16.73301 13.275869 15.49476 10.89706
## x.4 17.92733 16.95343 14.203040 17.80487 11.21339
## x.5 16.52991 15.85652 13.454478 17.28067 11.43806
## x.6 16.05482 16.50254 13.278399 16.22198 11.79408
dim(t(LDA$means))
## [1] 256   5

Visualize the results.

Option 1: use the transformation matrix.

pi = LDA$prior
AX = Xtrain %*% LDA$scaling    # project the training data onto the discriminant directions
AM = LDA$means %*% LDA$scaling # project the class means
classify = function(X, M, pi){
  # Euclidean distances from every row of X to every class mean (row of M)
  D = as.matrix(dist(rbind(M, X)))[nrow(M) + (1:nrow(X)), 1:nrow(M)]
  # discriminant scores: 1/2 * squared distance minus log prior; pick the smallest per row
  C = scale(1/2 * D^2, center = log(pi), scale = F)
  colnames(D)[apply(C, 1, which.min)]
}
labels = classify(AX, AM, pi)
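
The rule implemented above is the standard LDA classifier in the discriminant space: because lda() scales the discriminant directions so that the within-class covariance becomes the identity, a projected point \(z\) is assigned to \(\arg\min_k \{\tfrac{1}{2}\|z - m_k\|^2 - \log\pi_k\}\), where \(m_k\) is the projected mean of class \(k\).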

df = data.frame(AX, phoneme = labels)
dfm = data.frame(AM)
ggplot(mapping = aes(LD1, LD2, color = phoneme))+
  geom_point(data = df)+
  geom_point(data = dfm, color = "black", shape = 4, size = 3)

Option 2: use the predict() function.

pred = predict(LDA)
PX = pred$x
df = data.frame(PX, phoneme = pred$class)
predm = predict(LDA, LDA$means)
PM = predm$x
dfm = data.frame(PM, phoneme = predm$class)
ggplot(mapping = aes(LD1, LD2, color = phoneme))+
  geom_point(data = df)+
  geom_point(data = dfm, color = "black", shape = 4, size = 3)

Options 1 and 2 return different projected data. What is the difference?

sum((AX - PX)^2)
## [1] 538672.7
sum((AM - PM)^2)
## [1] 746.7046

The difference is in centering: the predict() function centers the projections at (the projection of) the prior-weighted overall mean \(\mu = \sum_{k=1}^K\pi_k \mu_k\).

colSums(diag(pi) %*% PM)
##          LD1          LD2          LD3          LD4 
## 2.220446e-16 9.974660e-16 2.081668e-16 2.116363e-16
center = colSums(diag(pi) %*% AM)
AXc = scale(AX, center = center, scale = F)
AMc = scale(AM, center = center, scale = F)
sum((AXc - PX)^2)
## [1] 7.066536e-25
sum((AMc - PM)^2)
## [1] 4.745459e-28

Check other LD components.

ggplot(mapping = aes(LD1, LD2, color = phoneme))+
  geom_point(data = df)+
  geom_point(data = dfm, color = "black", shape = 4, size = 3)

ggplot(mapping = aes(LD2, LD3, color = phoneme))+
  geom_point(data = df)+
  geom_point(data = dfm, color = "black", shape = 4, size = 3)

ggplot(mapping = aes(LD3, LD4, color = phoneme))+
  geom_point(data = df)+
  geom_point(data = dfm, color = "black", shape = 4, size = 3)

Compare true labels with predicted ones.

df = data.frame(PX, phoneme = factor(c(as.character(pred$class), as.character(Ytrain))),
                type = rep(c("predicted", "true"), each = length(train)))
ggplot(df, aes(LD1, LD2, color = phoneme))+
  geom_point()+
  facet_wrap(~type)

Quantify the accuracy. It’s quite high!

table(Ytrain, pred$class)
##       
## Ytrain  aa  ao dcl  iy  sh
##    aa  451 108   0   0   0
##    ao  103 724   0   0   0
##    dcl   0   0 586  12   1
##    iy    0   0   1 940   1
##    sh    0   0   0   0 680
mean(Ytrain == pred$class)
## [1] 0.9373441
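
Most of the remaining errors are confusions between the two vowels aa and ao, as the confusion matrix shows. A short sketch of the per-class training accuracy (assumes the objects above):

diag(prop.table(table(Ytrain, pred$class), margin = 1))  # fraction of each true phoneme classified correctly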

Plot the decision boundaries when using LD1 and LD2.

grid = expand.grid(LD1 = seq(min(PX[,1]), max(PX[,1]), length.out = 50), LD2 = seq(min(PX[,2]), max(PX[,2]), length.out = 50))
LDAreg = lda(PX[,1:2], grouping = Ytrain)
LDAreg$prior
##        aa        ao       dcl        iy        sh 
## 0.1549764 0.2292764 0.1660660 0.2611589 0.1885223
pi
##        aa        ao       dcl        iy        sh 
## 0.1549764 0.2292764 0.1660660 0.2611589 0.1885223
LDAreg$means
##           LD1        LD2
## aa  -6.330314  0.3813777
## ao  -6.627299  0.0517982
## dcl  4.167750 -2.6549509
## iy   3.706228 -3.2643678
## sh   4.458343  6.4842983
PM[,1:2]
##           LD1        LD2
## aa  -6.330314  0.3813777
## ao  -6.627299  0.0517982
## dcl  4.167750 -2.6549509
## iy   3.706228 -3.2643678
## sh   4.458343  6.4842983
labelsgrid = predict(LDAreg, grid)$class
#labelsgrid = classify(grid, PM[,1:2], pi)
dfgrid =  data.frame(grid, phoneme = labelsgrid)
plt = ggplot(mapping = aes(LD1, LD2, color = phoneme))+
  geom_point(data = dfgrid, alpha = 0.2, size = 3)+
  geom_point(data = data.frame(PM), color = "black", shape = 4, size = 3)
plt

Compare to the true labels. With only LD1 and LD2 the accuracy drops!

plt + 
  geom_point(data = df %>% filter(type == "true"))+
  geom_point(data = data.frame(PM), color = "black", shape = 4, size = 3)

labelsreg = predict(LDAreg, PX[,1:2])$class
mean(Ytrain == labelsreg)
## [1] 0.7280288

Compare the results if we use \(L=1,2,3,4\) discriminant directions.

accs = c()
for(L in 1:4){
  LDAreg = lda(PX[,1:L,drop = F], grouping = Ytrain)
  labelsreg = predict(LDAreg)$class
  #labelsreg = classify(PX[,1:L,drop = F], PM[,1:L, drop = F], pi)
  accs = c(accs, mean(Ytrain == labelsreg))
}
data.frame(L = 1:4, accuracy = accs)
 L  accuracy
 1 0.5422789
 2 0.7280288
 3 0.8735792
 4 0.9373441

This was the training accuracy. Now evaluate the classification performance on the test set. The train and test results are consistent.

pred = predict(LDA, Xtest)
PXtest = pred$x
accstest = c()
for(L in 1:4){
  LDAreg = lda(PX[,1:L,drop = F], grouping = Ytrain)
  labelsreg = predict(LDAreg, PXtest[,1:L,drop = F])$class
  #labelsreg = classify(PXtest[,1:L,drop = F], PM[,1:L, drop = F], pi)
  accstest = c(accstest, mean(Ytest == labelsreg))
}
data.frame(L = 1:4, accuracy_train = accs, accuracy_test = accstest)
 L  accuracy_train  accuracy_test
 1       0.5422789      0.5365854
 2       0.7280288      0.7272727
 3       0.8735792      0.8747228
 4       0.9373441      0.9334812
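
A short sketch plotting these train and test accuracies against \(L\) (assumes accs and accstest computed above):

dfacc = data.frame(L = rep(1:4, 2),
                   accuracy = c(accs, accstest),
                   set = rep(c("train", "test"), each = 4))
ggplot(dfacc, aes(L, accuracy, color = set)) +
  geom_line() +
  geom_point()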

QDA model

Fit the QDA model.

QDA = qda(Xtrain, grouping = Ytrain)
QDA$prior # the same as for LDA
##        aa        ao       dcl        iy        sh 
## 0.1549764 0.2292764 0.1660660 0.2611589 0.1885223
head(t(QDA$means)) # the same as for LDA
##           aa       ao       dcl       iy       sh
## x.1 11.77489 10.96335  9.606017 10.83331 10.81047
## x.2 12.97825 13.46851  9.901521 11.75975 10.65563
## x.3 16.61219 16.73301 13.275869 15.49476 10.89706
## x.4 17.92733 16.95343 14.203040 17.80487 11.21339
## x.5 16.52991 15.85652 13.454478 17.28067 11.43806
## x.6 16.05482 16.50254 13.278399 16.22198 11.79408
dim(QDA$scaling) # the class-specific sphering matrices, one 256 x 256 matrix per class
## [1] 256 256   5

Compare the performance. QDA overfits: it classifies the training data perfectly, but its test accuracy drops below that of LDA.

labels = predict(QDA)$class
mean(Ytrain == labels)
## [1] 1
labelstest = predict(QDA, Xtest)$class
mean(Ytest == labelstest)
## [1] 0.8470067
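
To see where QDA errs on the test set, a short sketch of its confusion matrix (assumes Ytest and labelstest from above):

table(Ytest, labelstest)  # rows: true phonemes, columns: QDA predictions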