Helper functions

library(ggplot2)
library(dplyr)
library(viridis)
library(ggfortify)
library(factoextra)
library(cluster)
library(plotly)
library(ggdendro)


# 2D scatterplot: X is a data frame with columns x1 and x2, optional cluster labels for coloring
scatterplot = function(X, cluster = NULL){
  if(is.null(cluster)) cluster = rep(1, nrow(X))
  if(length(unique(cluster)) == 1){
    ggplot()+
      geom_point(X, mapping = aes(x1, x2))
  } else {
    ggplot()+
      geom_point(data.frame(X, cluster = as.factor(cluster)), mapping = aes(x1, x2, color = cluster))+
      theme(legend.position = "none")
  } 
}

# interactive 3D scatterplot of the first three columns of X, optionally colored by cluster
scatterplot3D = function(X, cluster = NULL){
  if(is.null(cluster)) cluster = rep(1, nrow(X))
  plot_ly(x = X[,1], y = X[,2], z = X[,3], color = as.factor(cluster)) %>% add_markers() %>%
    layout(scene = list(xaxis = list(title = 'first'),
                     yaxis = list(title = 'second'),
                     zaxis = list(title = 'third')), showlegend = FALSE)
}

# diverging bar plot of a numeric vector, one bar per index
barplot = function(values){
  n = length(values)
  df = data.frame(value = values,  index = 1:n)
  ggplot(df, aes(index, value, fill = value)) + 
    geom_bar(color = "black", stat = "identity")+
    scale_fill_gradient2(low="#619CFF", mid="white", high="#F8766D")+
    scale_x_continuous(breaks = 1:n)+
    theme_bw()+
    theme(legend.position = "none")
}

# heatmap of a matrix A, drawn with row 1 at the top
heatmap = function(A){
  n = nrow(A)
  p = ncol(A)  
  df = data.frame(value = c(A),  i = 1:n, j = rep(1:p, rep(n, p)))
  ggplot(df, aes(j, i, fill = value)) + 
    geom_tile()+
    scale_fill_viridis()+
    scale_y_reverse()+
    theme_void()
}
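
A quick usage sketch of these helpers on simulated data (a hypothetical example, not part of the phoneme analysis below; note that scatterplot() expects columns named x1 and x2):

set.seed(1)
Xsim = data.frame(x1 = rnorm(100), x2 = rnorm(100))                        # simulated 2D data
scatterplot(Xsim, cluster = rep(1:2, each = 50))                           # colored 2D scatterplot
scatterplot3D(matrix(rnorm(300), ncol = 3), cluster = rep(1:4, each = 25)) # interactive 3D scatterplot
barplot(rnorm(10))                                                         # diverging bar plot
heatmap(matrix(rnorm(50), nrow = 5))                                       # heatmap of a 5 x 10 matrix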

Phoneme data

The data were extracted from the TIMIT Acoustic-Phonetic Continuous Speech Corpus. A dataset was formed by selecting five phonemes for classification based on digitized speech from this database.

The phonemes are transcribed as follows: "sh" as in "she", "dcl" as in "dark", "iy" as the vowel in "she", "aa" as the vowel in "dark", and "ao" as the first vowel in "water".

From continuous speech of 50 male speakers, 4509 speech frames of 32 msec duration were selected, approximately 2 examples of each phoneme from each speaker. Each speech frame is represented by 512 samples at a 16kHz sampling rate, and each frame represents one of the above five phonemes. From each speech frame, a log-periodogram of length 256 was computed; these log-periodograms are the feature vectors used below.

Load the data and split it into training and test sets.

data = read.csv("https://hastie.su.domains/ElemStatLearn/datasets/phoneme.data", header = T, sep = ",")
X = data %>% dplyr::select(-row.names, -g, -speaker)
Y = data %>% dplyr::select(g) %>% pull() %>% as.factor()
table(Y)
## Y
##   aa   ao  dcl   iy   sh 
##  695 1022  757 1163  872
dim(X)
## [1] 4509  256
n = nrow(X)
p = ncol(X)
train = sample(1:n, n*0.8)
Xtrain = as.matrix(X[train,])
Ytrain = Y[train]
Xtest = as.matrix(X[-train,])
Ytest = Y[-train]
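
Before modeling, it can help to look at a few raw log-periodograms. A minimal sketch, assuming Xtrain, Ytrain, and p from above, which draws one randomly chosen frame per phoneme:

idx = sapply(levels(Ytrain), function(ph) sample(which(Ytrain == ph), 1))  # one frame per phoneme
dflp = data.frame(logperiodogram = c(t(Xtrain[idx, ])),
                  frequency = rep(1:p, length(idx)),
                  phoneme = rep(levels(Ytrain), each = p))
ggplot(dflp, aes(frequency, logperiodogram, color = phoneme)) +
  geom_line()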

PCA results.

PCA = prcomp(Xtrain, scale = FALSE)
autoplot(PCA, data = data.frame(Xtrain, phoneme = Ytrain), colour = 'phoneme')             
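
To see how much variance the leading principal components explain, a short sketch using the barplot helper defined above (assumes the PCA object from the previous chunk):

barplot(PCA$sdev[1:20]^2 / sum(PCA$sdev^2))  # proportion of variance explained by the first 20 PCs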

LDA model

Fit the LDA model.

library(MASS)
LDA = lda(Xtrain, grouping = Ytrain)
LDA$prior
##        aa        ao       dcl        iy        sh 
## 0.1549764 0.2292764 0.1660660 0.2611589 0.1885223
head(LDA$scaling) # the transformation (discriminant) matrix
##              LD1         LD2         LD3          LD4
## x.1  0.054817009  0.01463716  0.06713418  0.050214462
## x.2 -0.011014695  0.07510455 -0.04545557 -0.051805734
## x.3 -0.039125133  0.04164280  0.03327425 -0.055371316
## x.4 -0.009008501 -0.21237745  0.05404024  0.124108619
## x.5 -0.033386367 -0.19818523  0.10077913  0.009457531
## x.6  0.010336300 -0.14173295  0.04316948  0.031095589
dim(LDA$scaling)
## [1] 256   4
head(t(LDA$means))
##           aa       ao       dcl       iy       sh
## x.1 11.77489 10.96335  9.606017 10.83331 10.81047
## x.2 12.97825 13.46851  9.901521 11.75975 10.65563
## x.3 16.61219 16.73301 13.275869 15.49476 10.89706
## x.4 17.92733 16.95343 14.203040 17.80487 11.21339
## x.5 16.52991 15.85652 13.454478 17.28067 11.43806
## x.6 16.05482 16.50254 13.278399 16.22198 11.79408
dim(t(LDA$means))
## [1] 256   5

Visualize the results.

Option 1: use the transformation matrix.

pi = LDA$prior
AX = Xtrain %*% LDA$scaling    # project the training data onto the discriminant directions
AM = LDA$means %*% LDA$scaling # project the class means
classify = function(X, M, pi){
  # Euclidean distances from every row of X to every class mean (row of M)
  D = as.matrix(dist(rbind(M, X)))[nrow(M) + (1:nrow(X)), 1:nrow(M)]
  # discriminant scores: 1/2 * squared distance minus log prior; pick the smallest per row
  C = scale(1/2 * D^2, center = log(pi), scale = F)
  colnames(D)[apply(C, 1, which.min)]
}
labels = classify(AX, AM, pi)
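
The rule implemented above is the standard LDA classifier in the discriminant space: because lda() scales the discriminant directions so that the within-class covariance becomes the identity, a projected point \(z\) is assigned to \(\arg\min_k \{\tfrac{1}{2}\|z - m_k\|^2 - \log\pi_k\}\), where \(m_k\) is the projected mean of class \(k\).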

df = data.frame(AX, phoneme = labels)
dfm = data.frame(AM)
ggplot(mapping = aes(LD1, LD2, color = phoneme))+
  geom_point(data = df)+
  geom_point(data = dfm, color = "black", shape = 4, size = 3)

Option 2: use the predict() function.

pred = predict(LDA)
PX = pred$x
df = data.frame(PX, phoneme = pred$class)
predm = predict(LDA, LDA$means)
PM = predm$x
dfm = data.frame(PM, phoneme = predm$class)
ggplot(mapping = aes(LD1, LD2, color = phoneme))+
  geom_point(data = df)+
  geom_point(data = dfm, color = "black", shape = 4, size = 3)

Options 1 and 2 return different projected data. What is the difference?

sum((AX - PX)^2)
## [1] 538672.7
sum((AM - PM)^2)
## [1] 746.7046

The difference is in centering: the predict() function centers the projections at (the projection of) the prior-weighted overall mean \(\mu = \sum_{k=1}^K\pi_k \mu_k\).

colSums(diag(pi) %*% PM)
##          LD1          LD2          LD3          LD4 
## 2.220446e-16 9.974660e-16 2.081668e-16 2.116363e-16
center = colSums(diag(pi) %*% AM)
AXc = scale(AX, center = center, scale = F)
AMc = scale(AM, center = center, scale = F)
sum((AXc - PX)^2)
## [1] 7.066536e-25
sum((AMc - PM)^2)
## [1] 4.745459e-28

Check other LD components.

ggplot(mapping = aes(LD1, LD2, color = phoneme))+
  geom_point(data = df)+
  geom_point(data = dfm, color = "black", shape = 4, size = 3)

ggplot(mapping = aes(LD2, LD3, color = phoneme))+
  geom_point(data = df)+
  geom_point(data = dfm, color = "black", shape = 4, size = 3)

ggplot(mapping = aes(LD3, LD4, color = phoneme))+
  geom_point(data = df)+
  geom_point(data = dfm, color = "black", shape = 4, size = 3)

Compare true labels with predicted ones.

df = data.frame(PX, phoneme = factor(c(as.character(pred$class), as.character(Ytrain))),
                type = rep(c("predicted", "true"), each = length(train)))
ggplot(df, aes(LD1, LD2, color = phoneme))+
  geom_point()+
  facet_wrap(~type)

Quantify the accuracy. It’s quite high!

table(Ytrain, pred$class)
##       
## Ytrain  aa  ao dcl  iy  sh
##    aa  451 108   0   0   0
##    ao  103 724   0   0   0
##    dcl   0   0 586  12   1
##    iy    0   0   1 940   1
##    sh    0   0   0   0 680
mean(Ytrain == pred$class)
## [1] 0.9373441
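
Most of the remaining errors are confusions between the two vowels aa and ao, as the confusion matrix shows. A short sketch of the per-class training accuracy (assumes the objects above):

diag(prop.table(table(Ytrain, pred$class), margin = 1))  # fraction of each true phoneme classified correctly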

Plot the decision boundaries when using LD1 and LD2.

grid = expand.grid(LD1 = seq(min(PX[,1]), max(PX[,1]), length.out = 50), LD2 = seq(min(PX[,2]), max(PX[,2]), length.out = 50))
LDAreg = lda(PX[,1:2], grouping = Ytrain)
LDAreg$prior
##        aa        ao       dcl        iy        sh 
## 0.1549764 0.2292764 0.1660660 0.2611589 0.1885223
pi
##        aa        ao       dcl        iy        sh 
## 0.1549764 0.2292764 0.1660660 0.2611589 0.1885223
LDAreg$means
##           LD1        LD2
## aa  -6.330314  0.3813777
## ao  -6.627299  0.0517982
## dcl  4.167750 -2.6549509
## iy   3.706228 -3.2643678
## sh   4.458343  6.4842983
PM[,1:2]
##           LD1        LD2
## aa  -6.330314  0.3813777
## ao  -6.627299  0.0517982
## dcl  4.167750 -2.6549509
## iy   3.706228 -3.2643678
## sh   4.458343  6.4842983
labelsgrid = predict(LDAreg, grid)$class
#labelsgrid = classify(grid, PM[,1:2], pi)
dfgrid =  data.frame(grid, phoneme = labelsgrid)
plt = ggplot(mapping = aes(LD1, LD2, color = phoneme))+
  geom_point(data = dfgrid, alpha = 0.2, size = 3)+
  geom_point(data = data.frame(PM), color = "black", shape = 4, size = 3)
plt

Compare to the true labels. With only LD1 and LD2 the accuracy drops!

plt + 
  geom_point(data = df %>% filter(type == "true"))+
  geom_point(data = data.frame(PM), color = "black", shape = 4, size = 3)

labelsreg = predict(LDAreg, PX[,1:2])$class
mean(Ytrain == labelsreg)
## [1] 0.7280288

Compare the results if we use \(L=1,2,3,4\) discriminant directions.

accs = c()
for(L in 1:4){
  LDAreg = lda(PX[,1:L,drop = F], grouping = Ytrain)
  labelsreg = predict(LDAreg)$class
  #labelsreg = classify(PX[,1:L,drop = F], PM[,1:L, drop = F], pi)
  accs = c(accs, mean(Ytrain == labelsreg))
}
data.frame(L = 1:4, accuracy = accs)
 L  accuracy
 1 0.5422789
 2 0.7280288
 3 0.8735792
 4 0.9373441

This was the training accuracy. Now evaluate the classification performance on the test set. The train and test results are consistent.

pred = predict(LDA, Xtest)
PXtest = pred$x
accstest = c()
for(L in 1:4){
  LDAreg = lda(PX[,1:L,drop = F], grouping = Ytrain)
  labelsreg = predict(LDAreg, PXtest[,1:L,drop = F])$class
  #labelsreg = classify(PXtest[,1:L,drop = F], PM[,1:L, drop = F], pi)
  accstest = c(accstest, mean(Ytest == labelsreg))
}
data.frame(L = 1:4, accuracy_train = accs, accuracy_test = accstest)
 L  accuracy_train  accuracy_test
 1       0.5422789      0.5365854
 2       0.7280288      0.7272727
 3       0.8735792      0.8747228
 4       0.9373441      0.9334812
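
A short sketch plotting these train and test accuracies against \(L\) (assumes accs and accstest computed above):

dfacc = data.frame(L = rep(1:4, 2),
                   accuracy = c(accs, accstest),
                   set = rep(c("train", "test"), each = 4))
ggplot(dfacc, aes(L, accuracy, color = set)) +
  geom_line() +
  geom_point()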

QDA model

Fit the QDA model.

QDA = qda(Xtrain, grouping = Ytrain)
QDA$prior # the same as for LDA
##        aa        ao       dcl        iy        sh 
## 0.1549764 0.2292764 0.1660660 0.2611589 0.1885223
head(t(QDA$means)) # the same as for LDA
##           aa       ao       dcl       iy       sh
## x.1 11.77489 10.96335  9.606017 10.83331 10.81047
## x.2 12.97825 13.46851  9.901521 11.75975 10.65563
## x.3 16.61219 16.73301 13.275869 15.49476 10.89706
## x.4 17.92733 16.95343 14.203040 17.80487 11.21339
## x.5 16.52991 15.85652 13.454478 17.28067 11.43806
## x.6 16.05482 16.50254 13.278399 16.22198 11.79408
dim(QDA$scaling) # the class-specific sphering matrices, one 256 x 256 matrix per class
## [1] 256 256   5

Compare the performance. QDA overfits: it classifies the training data perfectly, but its test accuracy drops below that of LDA.

labels = predict(QDA)$class
mean(Ytrain == labels)
## [1] 1
labelstest = predict(QDA, Xtest)$class
mean(Ytest == labelstest)
## [1] 0.8470067
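
To see where QDA errs on the test set, a short sketch of its confusion matrix (assumes Ytest and labelstest from above):

table(Ytest, labelstest)  # rows: true phonemes, columns: QDA predictions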