Helper functions

library(ggplot2)
library(dplyr)
library(viridis)
library(ggfortify)
library(factoextra)
library(cluster)
library(plotly)
library(ggdendro)


scatterplot = function(X, M, cluster, label = F){
  if(length(unique(cluster)) == 1){
    plt = ggplot()+
      geom_point(X, mapping = aes(longtitude, latitude))+
      geom_point(M, mapping = aes(longtitude, latitude), shape = 4, size = 4)
    if(label) return(plt + geom_text(X, mapping = aes(longtitude, latitude, label = 1:nrow(X)), nudge_x = 0.1))
    else return(plt)
  }
  else{
    ggplot()+
      geom_point(data.frame(X, cluster = as.factor(cluster)), mapping = aes(longtitude, latitude, color = cluster))+
      geom_point(M, mapping = aes(longtitude, latitude), shape = 4, size = 4)+
      theme(legend.position = "none")
  } 
}

barplot = function(values){
  n = length(values)
  df = data.frame(value = values,  index = 1:n)
  ggplot(df, aes(index, value, fill = value)) + 
    geom_bar(color = "black", stat = "identity")+
    scale_fill_gradient2(low="#619CFF", mid="white", high="#F8766D")+
    scale_x_continuous(breaks = 1:n)+
    theme_bw()+
    theme(legend.position = "none")
}

heatmap = function(A){
  n = nrow(A)
  p = ncol(A)  
  df = data.frame(value = c(A),  i = 1:n, j = rep(1:p, rep(n, p)))
  ggplot(df, aes(j, i, fill = value)) + 
    geom_tile()+
    scale_fill_gradient(low="green", high = "red")+
    scale_y_reverse()+
    theme_void()
}

heatmap_bw = function(A){
  n = nrow(A)
  p = ncol(A)  
  df = data.frame(value = c(A),  i = 1:n, j = rep(1:p, rep(n, p)))
  ggplot(df, aes(j, i, fill = value)) + 
    geom_tile()+
    scale_fill_gradient(low="white", high = "black")+
    scale_y_reverse()+
    theme_void()
}


gmmhist = function(X, Pi, M, S, title = NULL){
  col = c("forestgreen", "orange", "steelblue", "purple", "yellow", "salmon")
  K = length(Pi)
  plt = ggplot() +  geom_histogram(X, mapping = aes(x1, y = ..density..), alpha = 0.5) + theme(legend.position = "none")
  xs = seq(min(X[,1]), max(X[,1]), length.out = 100)
  for(k in 1:K){
    mu = M[k]
    pro = Pi[k]
    center = data.frame(x1 = mu)
    if(length(S) > 1) Sigma = S[k]
    else Sigma = S
    plt = plt + 
      geom_point(data = center, mapping = aes(x1, 0), color = col[k], shape = 4, size = 4)+
      geom_line(data = data.frame(x1 = xs, density = pro * dnorm(xs, mu,  sqrt(Sigma))), mapping = aes(x1, density), color = col[k], size = 1)
  }
  plt + ggtitle(title)
}

gmmplot = function(X, Z, Pi, M, S){
  col = c("forestgreen", "orange", "steelblue", "purple", "yellow", "salmon")
  K = ncol(Z)
  c = qchisq(0.8, 2)
  ts = seq(0, 2 * pi, 0.01)
  circle = data.frame(longtitude = sqrt(c) * cos(ts), latitude = sqrt(c) * sin(ts))
  plt = ggplot() + theme(legend.position = "none") + coord_equal()
  for(k in 1:K){
    df = data.frame(X, alpha = Z[,k])
    mu = M[,k]
    pro = Pi[k]
    center = data.frame(longtitude = mu[1], latitude = mu[2])
    Sigma = S[,,k]
    ED = eigen(Sigma)
    ellipse = t(ED$vectors %*% diag(sqrt(ED$values)) %*% t(circle) + mu)
    colnames(ellipse) = c("longtitude", "latitude")
    plt = plt + geom_point(df, mapping = aes(longtitude, latitude, alpha = alpha), color = col[k]) +
      geom_point(data = center, mapping = aes(longtitude, latitude), color = col[k], shape = 4, size = 4)+
      geom_path(data = ellipse, mapping = aes(longtitude, latitude), color = col[k])
  }
  plt
}

Hierarchical clustering

Load microarray data.

X = read.csv("data/cancer_microarray_subsample.csv", row.names = 1) %>% as.matrix()
dim(X)
## [1]  54 198
X = scale(X, scale = T, center = T)
heatmap(X)

Let’s check the PCA plot. It seems there is one outlier.

PCA =  prcomp(X)
autoplot(PCA)+
  theme(legend.position = "none")

Compute pairwise distances first.

D = dist(X, method = 'euclidean')
heatmap_bw(as.matrix(D))

Try different hierarchical clustering approaches.

HCc = hclust(D, method = 'complete')
plot(HCc)

HCs = hclust(D, method = 'single')
plot(HCs)

HCa = hclust(D, method = 'average')
plot(HCa)

We will use complete linkage. We explore the output of the hclust() function first.

plt = ggdendrogram(HCc)
plt

HCc$height
##  [1]  0.000000  0.000000  2.549343  3.666582  3.744704  3.854209  3.971881
##  [8]  4.734353  5.013432  5.127819  5.193668  5.201293  5.365987  5.436321
## [15]  5.472059  5.710921  5.986432  6.485512  7.002768  7.104716  7.656299
## [22]  7.813109  7.921703  8.177158  8.772128  8.811652  9.464666  9.891181
## [29] 10.496558 10.521582 10.538403 11.225068 11.393834 12.030347 12.496697
## [36] 12.589057 13.066754 13.161896 14.119534 15.085123 15.432359 15.653248
## [43] 16.152225 16.713650 17.989188 19.407605 19.555250 21.088042 22.983250
## [50] 26.405329 31.528027 42.304428 43.378299
plt+
  geom_hline(aes(yintercept = HCc$height), color = "red", linetype = "dashed")

Permute genes according to the hierarchical clustering order.

heatmap(X[HCc$order,])

What will happen to the pairwise distances heatmap?

heatmap_bw(as.matrix(D)[HCc$order,HCc$order])

Let’s cut the dendrogram at \(K=2,\ldots,6\) clusters anc check the PCA result.

col = c("#E69F00", "#56B4E9", "#009E73",
          "#F0E442", "#CC79A7", "black")
for(k in 2:6){
  CT = cutree(HCc, k = k)
  print(autoplot(PCA, data = data.frame(X, cluster = as.factor(CT)), shape = F, color = "cluster")+
    theme(legend.position = "none")+
      scale_color_manual(values = col))
}

More fancy way to do this.

library(factoextra)
fviz_dend(HCc, k = 6,  color_labels_by_k = TRUE, rect = TRUE)

fviz_cluster(list(data = X, cluster = CT))+
  theme(legend.position = "none")

Gaussian Mixture Model

Let’s apply GMM to the data in \(p=2\).

set.seed(6)
n = 300
K = 3
M = matrix(rnorm(K), K, 1)
S = matrix(c(1, 0.7, 0.5), 3, 1)
colnames(M) =  c("x1")
rat = c(1/6, 1/3, 1/2)
Ms = M[rep(c(1, 2, 3), times = n*rat),]
Ss = S[rep(c(1, 2, 3), times = n*rat),]
X = matrix(rnorm(n, 0, 0.3), n, 1) * Ss + Ms
colnames(X) = c("x1")
plt = ggplot()+
  geom_histogram(X, mapping = aes(x1, y = ..density..), alpha = 0.5)
plt

Fit the GMM model.

library(mclust)
GMM = Mclust(X, G = 3)

Model “V” means that the distribution is variable variance.

GMM$modelName
## [1] "V"

Obtain the parameters of the fit.

GMM$parameters$pro
## [1] 0.3250472 0.1659476 0.5090052
GMM$parameters$mean
##          1          2          3 
## -0.6652228  0.2065095  0.8658256
GMM$parameters$variance$sigmasq
## [1] 0.03836507 0.07283484 0.01772777
head(GMM$z)
##              [,1]       [,2]         [,3]
## [1,] 5.581504e-13 0.01873396 9.812660e-01
## [2,] 2.623392e-05 0.99961413 3.596403e-04
## [3,] 2.128698e-06 0.99033374 9.664135e-03
## [4,] 1.094786e-01 0.89052141 1.192682e-11
## [5,] 1.036446e-07 0.82953260 1.704673e-01
## [6,] 2.265044e-05 0.99953307 4.442782e-04
gmmhist(X, Pi = GMM$parameters$pro, M = GMM$parameters$mean, S = GMM$parameters$variance$sigmasq)

Use modelNames to control the type of the variance used for fitting. Model “E” means that the distribution is equal variance.

GMM = Mclust(X, G = 3, modelNames = "E")
gmmhist(X, Pi = GMM$parameters$pro, M = GMM$parameters$mean, S = GMM$parameters$variance$sigmasq)

Vary G to control the number of clusters.

for(G in 2:6){
  GMM = Mclust(X, G = G, modelNames = "V", verbose = F)
  print(gmmhist(X, Pi = GMM$parameters$pro, M = GMM$parameters$mean, S = GMM$parameters$variance$sigmasq))
}

Let’s see the convergence process.

G = 3
for(iter in seq(1, 25, 8)){
  GMM = Mclust(X, G = G, modelNames = "V", control = emControl(itmax = iter),  verbose = F)
  print(gmmhist(X, Pi = GMM$parameters$pro, M = GMM$parameters$mean, S = GMM$parameters$variance$sigmasq, title = paste("iteration = ", iter)))
}

Now consider \(p=3\).

set.seed(6)
n = 180
K = 3
M = matrix(rnorm(K * 2), K, 2)
S = matrix(c(1, 0.7, 0.5, 1, 0.7, 0.5), 3, 2)
colnames(M) =  c("longtitude", "latitude")
rat = c(1/6, 1/3, 1/2)
Ms = M[rep(c(1, 2, 3), times = n*rat),]
Ss = S[rep(c(1, 2, 3), times = n*rat),]
X = matrix(rnorm(n * 2, 0, 0.5), n, 2) * Ss + Ms
colnames(X) = c("longtitude", "latitude")
plt = ggplot()+
  geom_point(X, mapping = aes(longtitude, latitude))
plt

Fit the GMM model.

GMM = Mclust(X, G = 3, control = emControl(itmax = 100),  verbose = F)

Model “VII” means that the distribution is spherical with unequal volume.

GMM$modelName
## [1] "VII"

Obtain the parameters of the fit.

GMM$parameters$pro
## [1] 0.1710492 0.5095774 0.3193734
GMM$parameters$mean
##                 [,1]      [,2]         [,3]
## longtitude 0.2723381 0.8339633 -0.682466292
## latitude   1.6415196 0.3519436 -0.007222801
GMM$parameters$variance$sigma
## , , 1
## 
##            longtitude  latitude
## longtitude  0.2887978 0.0000000
## latitude    0.0000000 0.2887978
## 
## , , 2
## 
##            longtitude   latitude
## longtitude 0.05473706 0.00000000
## latitude   0.00000000 0.05473706
## 
## , , 3
## 
##            longtitude  latitude
## longtitude  0.1134929 0.0000000
## latitude    0.0000000 0.1134929
head(GMM$z)
##           [,1]         [,2]         [,3]
## [1,] 0.9999770 3.926770e-12 2.297163e-05
## [2,] 0.9999999 1.188758e-07 2.169940e-09
## [3,] 0.9984440 1.431792e-03 1.242195e-04
## [4,] 0.9999521 4.671379e-10 4.792697e-05
## [5,] 1.0000000 6.020000e-14 1.301858e-15
## [6,] 0.9999883 7.932274e-12 1.165102e-05
gmmplot(X, Z = GMM$z, Pi = GMM$parameters$pro, M = GMM$parameters$mean, S = GMM$parameters$variance$sigma)

Use modelNames to control the type of the variance used for fitting. “EII” is spherical, equal volume

GMM = Mclust(X, G = 3, modelNames = "EII")
gmmplot(X, Z = GMM$z, Pi = GMM$parameters$pro, M = GMM$parameters$mean, S = GMM$parameters$variance$sigma)

“EEE” is ellipsoidal, equal volume, shape, and orientation.

GMM = Mclust(X, G = 3, modelNames = "EEE")
gmmplot(X, Z = GMM$z, Pi = GMM$parameters$pro, M = GMM$parameters$mean, S = GMM$parameters$variance$sigma)

“VVV” is ellipsoidal, varying volume, shape, and orientation.

GMM = Mclust(X, G = 3, modelNames = "VVV")
gmmplot(X, Z = GMM$z, Pi = GMM$parameters$pro, M = GMM$parameters$mean, S = GMM$parameters$variance$sigma)

Vary G to control the number of clusters.

for(G in 2:6){
  GMM = Mclust(X, G = G, modelNames = "VVV", verbose = F)
  print(gmmplot(X, Z = GMM$z, Pi = GMM$parameters$pro, M = GMM$parameters$mean, S = GMM$parameters$variance$sigma))
}