library(ggplot2)
library(dplyr)
library(viridis)
library(ggfortify)
library(factoextra)
library(cluster)
library(plotly)
library(ggdendro)
scatterplot = function(X, M, cluster, label = F){
if(length(unique(cluster)) == 1){
plt = ggplot()+
geom_point(X, mapping = aes(longtitude, latitude))+
geom_point(M, mapping = aes(longtitude, latitude), shape = 4, size = 4)
if(label) return(plt + geom_text(X, mapping = aes(longtitude, latitude, label = 1:nrow(X)), nudge_x = 0.1))
else return(plt)
}
else{
ggplot()+
geom_point(data.frame(X, cluster = as.factor(cluster)), mapping = aes(longtitude, latitude, color = cluster))+
geom_point(M, mapping = aes(longtitude, latitude), shape = 4, size = 4)+
theme(legend.position = "none")
}
}
barplot = function(values){
n = length(values)
df = data.frame(value = values, index = 1:n)
ggplot(df, aes(index, value, fill = value)) +
geom_bar(color = "black", stat = "identity")+
scale_fill_gradient2(low="#619CFF", mid="white", high="#F8766D")+
scale_x_continuous(breaks = 1:n)+
theme_bw()+
theme(legend.position = "none")
}
heatmap = function(A){
n = nrow(A)
p = ncol(A)
df = data.frame(value = c(A), i = 1:n, j = rep(1:p, rep(n, p)))
ggplot(df, aes(j, i, fill = value)) +
geom_tile()+
scale_fill_gradient(low="green", high = "red")+
scale_y_reverse()+
theme_void()
}
heatmap_bw = function(A){
n = nrow(A)
p = ncol(A)
df = data.frame(value = c(A), i = 1:n, j = rep(1:p, rep(n, p)))
ggplot(df, aes(j, i, fill = value)) +
geom_tile()+
scale_fill_gradient(low="white", high = "black")+
scale_y_reverse()+
theme_void()
}
gmmhist = function(X, Pi, M, S, title = NULL){
col = c("forestgreen", "orange", "steelblue", "purple", "yellow", "salmon")
K = length(Pi)
plt = ggplot() + geom_histogram(X, mapping = aes(x1, y = ..density..), alpha = 0.5) + theme(legend.position = "none")
xs = seq(min(X[,1]), max(X[,1]), length.out = 100)
for(k in 1:K){
mu = M[k]
pro = Pi[k]
center = data.frame(x1 = mu)
if(length(S) > 1) Sigma = S[k]
else Sigma = S
plt = plt +
geom_point(data = center, mapping = aes(x1, 0), color = col[k], shape = 4, size = 4)+
geom_line(data = data.frame(x1 = xs, density = pro * dnorm(xs, mu, sqrt(Sigma))), mapping = aes(x1, density), color = col[k], size = 1)
}
plt + ggtitle(title)
}
gmmplot = function(X, Z, Pi, M, S){
col = c("forestgreen", "orange", "steelblue", "purple", "yellow", "salmon")
K = ncol(Z)
c = qchisq(0.8, 2)
ts = seq(0, 2 * pi, 0.01)
circle = data.frame(longtitude = sqrt(c) * cos(ts), latitude = sqrt(c) * sin(ts))
plt = ggplot() + theme(legend.position = "none") + coord_equal()
for(k in 1:K){
df = data.frame(X, alpha = Z[,k])
mu = M[,k]
pro = Pi[k]
center = data.frame(longtitude = mu[1], latitude = mu[2])
Sigma = S[,,k]
ED = eigen(Sigma)
ellipse = t(ED$vectors %*% diag(sqrt(ED$values)) %*% t(circle) + mu)
colnames(ellipse) = c("longtitude", "latitude")
plt = plt + geom_point(df, mapping = aes(longtitude, latitude, alpha = alpha), color = col[k]) +
geom_point(data = center, mapping = aes(longtitude, latitude), color = col[k], shape = 4, size = 4)+
geom_path(data = ellipse, mapping = aes(longtitude, latitude), color = col[k])
}
plt
}
Load microarray data.
X = read.csv("data/cancer_microarray_subsample.csv", row.names = 1) %>% as.matrix()
dim(X)
## [1] 54 198
X = scale(X, scale = T, center = T)
heatmap(X)
Let’s check the PCA plot. It seems there is one outlier.
PCA = prcomp(X)
autoplot(PCA)+
theme(legend.position = "none")
Compute pairwise distances first.
D = dist(X, method = 'euclidean')
heatmap_bw(as.matrix(D))
Try different hierarchical clustering approaches.
HCc = hclust(D, method = 'complete')
plot(HCc)
HCs = hclust(D, method = 'single')
plot(HCs)
HCa = hclust(D, method = 'average')
plot(HCa)
We will use complete linkage. We explore the output of the
hclust()
function first.
plt = ggdendrogram(HCc)
plt
HCc$height
## [1] 0.000000 0.000000 2.549343 3.666582 3.744704 3.854209 3.971881
## [8] 4.734353 5.013432 5.127819 5.193668 5.201293 5.365987 5.436321
## [15] 5.472059 5.710921 5.986432 6.485512 7.002768 7.104716 7.656299
## [22] 7.813109 7.921703 8.177158 8.772128 8.811652 9.464666 9.891181
## [29] 10.496558 10.521582 10.538403 11.225068 11.393834 12.030347 12.496697
## [36] 12.589057 13.066754 13.161896 14.119534 15.085123 15.432359 15.653248
## [43] 16.152225 16.713650 17.989188 19.407605 19.555250 21.088042 22.983250
## [50] 26.405329 31.528027 42.304428 43.378299
plt+
geom_hline(aes(yintercept = HCc$height), color = "red", linetype = "dashed")
Permute genes according to the hierarchical clustering order.
heatmap(X[HCc$order,])
What will happen to the pairwise distances heatmap?
heatmap_bw(as.matrix(D)[HCc$order,HCc$order])
Let’s cut the dendrogram at \(K=2,\ldots,6\) clusters anc check the PCA result.
col = c("#E69F00", "#56B4E9", "#009E73",
"#F0E442", "#CC79A7", "black")
for(k in 2:6){
CT = cutree(HCc, k = k)
print(autoplot(PCA, data = data.frame(X, cluster = as.factor(CT)), shape = F, color = "cluster")+
theme(legend.position = "none")+
scale_color_manual(values = col))
}
More fancy way to do this.
library(factoextra)
fviz_dend(HCc, k = 6, color_labels_by_k = TRUE, rect = TRUE)
fviz_cluster(list(data = X, cluster = CT))+
theme(legend.position = "none")
Let’s apply GMM to the data in \(p=2\).
set.seed(6)
n = 300
K = 3
M = matrix(rnorm(K), K, 1)
S = matrix(c(1, 0.7, 0.5), 3, 1)
colnames(M) = c("x1")
rat = c(1/6, 1/3, 1/2)
Ms = M[rep(c(1, 2, 3), times = n*rat),]
Ss = S[rep(c(1, 2, 3), times = n*rat),]
X = matrix(rnorm(n, 0, 0.3), n, 1) * Ss + Ms
colnames(X) = c("x1")
plt = ggplot()+
geom_histogram(X, mapping = aes(x1, y = ..density..), alpha = 0.5)
plt
Fit the GMM model.
library(mclust)
GMM = Mclust(X, G = 3)
Model “V” means that the distribution is variable variance.
GMM$modelName
## [1] "V"
Obtain the parameters of the fit.
GMM$parameters$pro
## [1] 0.3250472 0.1659476 0.5090052
GMM$parameters$mean
## 1 2 3
## -0.6652228 0.2065095 0.8658256
GMM$parameters$variance$sigmasq
## [1] 0.03836507 0.07283484 0.01772777
head(GMM$z)
## [,1] [,2] [,3]
## [1,] 5.581504e-13 0.01873396 9.812660e-01
## [2,] 2.623392e-05 0.99961413 3.596403e-04
## [3,] 2.128698e-06 0.99033374 9.664135e-03
## [4,] 1.094786e-01 0.89052141 1.192682e-11
## [5,] 1.036446e-07 0.82953260 1.704673e-01
## [6,] 2.265044e-05 0.99953307 4.442782e-04
gmmhist(X, Pi = GMM$parameters$pro, M = GMM$parameters$mean, S = GMM$parameters$variance$sigmasq)
Use modelNames
to control the type of the variance used
for fitting. Model “E” means that the distribution is equal
variance.
GMM = Mclust(X, G = 3, modelNames = "E")
gmmhist(X, Pi = GMM$parameters$pro, M = GMM$parameters$mean, S = GMM$parameters$variance$sigmasq)
Vary G
to control the number of clusters.
for(G in 2:6){
GMM = Mclust(X, G = G, modelNames = "V", verbose = F)
print(gmmhist(X, Pi = GMM$parameters$pro, M = GMM$parameters$mean, S = GMM$parameters$variance$sigmasq))
}
Let’s see the convergence process.
G = 3
for(iter in seq(1, 25, 8)){
GMM = Mclust(X, G = G, modelNames = "V", control = emControl(itmax = iter), verbose = F)
print(gmmhist(X, Pi = GMM$parameters$pro, M = GMM$parameters$mean, S = GMM$parameters$variance$sigmasq, title = paste("iteration = ", iter)))
}
Now consider \(p=3\).
set.seed(6)
n = 180
K = 3
M = matrix(rnorm(K * 2), K, 2)
S = matrix(c(1, 0.7, 0.5, 1, 0.7, 0.5), 3, 2)
colnames(M) = c("longtitude", "latitude")
rat = c(1/6, 1/3, 1/2)
Ms = M[rep(c(1, 2, 3), times = n*rat),]
Ss = S[rep(c(1, 2, 3), times = n*rat),]
X = matrix(rnorm(n * 2, 0, 0.5), n, 2) * Ss + Ms
colnames(X) = c("longtitude", "latitude")
plt = ggplot()+
geom_point(X, mapping = aes(longtitude, latitude))
plt
Fit the GMM model.
GMM = Mclust(X, G = 3, control = emControl(itmax = 100), verbose = F)
Model “VII” means that the distribution is spherical with unequal volume.
GMM$modelName
## [1] "VII"
Obtain the parameters of the fit.
GMM$parameters$pro
## [1] 0.1710492 0.5095774 0.3193734
GMM$parameters$mean
## [,1] [,2] [,3]
## longtitude 0.2723381 0.8339633 -0.682466292
## latitude 1.6415196 0.3519436 -0.007222801
GMM$parameters$variance$sigma
## , , 1
##
## longtitude latitude
## longtitude 0.2887978 0.0000000
## latitude 0.0000000 0.2887978
##
## , , 2
##
## longtitude latitude
## longtitude 0.05473706 0.00000000
## latitude 0.00000000 0.05473706
##
## , , 3
##
## longtitude latitude
## longtitude 0.1134929 0.0000000
## latitude 0.0000000 0.1134929
head(GMM$z)
## [,1] [,2] [,3]
## [1,] 0.9999770 3.926770e-12 2.297163e-05
## [2,] 0.9999999 1.188758e-07 2.169940e-09
## [3,] 0.9984440 1.431792e-03 1.242195e-04
## [4,] 0.9999521 4.671379e-10 4.792697e-05
## [5,] 1.0000000 6.020000e-14 1.301858e-15
## [6,] 0.9999883 7.932274e-12 1.165102e-05
gmmplot(X, Z = GMM$z, Pi = GMM$parameters$pro, M = GMM$parameters$mean, S = GMM$parameters$variance$sigma)
Use modelNames
to control the type of the variance used
for fitting. “EII” is spherical, equal volume
GMM = Mclust(X, G = 3, modelNames = "EII")
gmmplot(X, Z = GMM$z, Pi = GMM$parameters$pro, M = GMM$parameters$mean, S = GMM$parameters$variance$sigma)
“EEE” is ellipsoidal, equal volume, shape, and orientation.
GMM = Mclust(X, G = 3, modelNames = "EEE")
gmmplot(X, Z = GMM$z, Pi = GMM$parameters$pro, M = GMM$parameters$mean, S = GMM$parameters$variance$sigma)
“VVV” is ellipsoidal, varying volume, shape, and orientation.
GMM = Mclust(X, G = 3, modelNames = "VVV")
gmmplot(X, Z = GMM$z, Pi = GMM$parameters$pro, M = GMM$parameters$mean, S = GMM$parameters$variance$sigma)
Vary G
to control the number of clusters.
for(G in 2:6){
GMM = Mclust(X, G = G, modelNames = "VVV", verbose = F)
print(gmmplot(X, Z = GMM$z, Pi = GMM$parameters$pro, M = GMM$parameters$mean, S = GMM$parameters$variance$sigma))
}