library(ggplot2)
library(dplyr)
library(viridis)
library(ggfortify)
library(factoextra)
library(cluster)
library(plotly)
library(ggdendro)
scatterplot = function(X, cluster = NULL){
if(is.null(cluster)) cluster = rep(1, nrow(X))
if(length(unique(cluster)) == 1){
ggplot()+
geom_point(X, mapping = aes(x1, x2))
} else {
ggplot()+
geom_point(data.frame(X, cluster = as.factor(cluster)), mapping = aes(x1, x2, color = cluster))+
theme(legend.position = "none")
}
}
scatterplot3D = function(X, cluster = NULL){
if(is.null(cluster)) cluster = rep(1, nrow(X))
plot_ly(x = X[,1], y = X[,2], z = X[,1], color = as.factor(cluster)) %>% add_markers() %>%
layout(scene = list(xaxis = list(title = 'first'),
yaxis = list(title = 'second'),
zaxis = list(title = 'third')), showlegend = FALSE)
}
barplot = function(values){
n = length(values)
df = data.frame(value = values, index = 1:n)
ggplot(df, aes(index, value, fill = value)) +
geom_bar(color = "black", stat = "identity")+
scale_fill_gradient2(low="#619CFF", mid="white", high="#F8766D")+
scale_x_continuous(breaks = 1:n)+
theme_bw()+
theme(legend.position = "none")
}
heatmap = function(A){
n = nrow(A)
p = ncol(A)
df = data.frame(value = c(A), i = 1:n, j = rep(1:p, rep(n, p)))
ggplot(df, aes(j, i, fill = value)) +
geom_tile()+
scale_fill_viridis()+
scale_y_reverse()+
theme_void()
}
Generate data.
set.seed(0)
n = 100
r1 = 0.3
r2 = 1
r3 = 1.8
X1 = matrix(rnorm(n * 2), n, 2)
X1 = X1/sqrt(rowSums(X1^2)) * r1
X2 = matrix(rnorm(n * 2), n, 2)
X2 = X2/sqrt(rowSums(X2^2)) * r2
X3 = matrix(rnorm(n * 2), n, 2)
X3 = X3/sqrt(rowSums(X3^2)) * r3
E = matrix(rnorm(3*n * 2, 0, 0.1), 3*n, 2)
X = rbind(X1, X2, X3) + E
X = scale (X, scale = F, center = T)
colnames(X) = c("x1", "x2")
labels = c(rep(1, n), rep(2, n), rep(3, n))
df = data.frame(X, label = as.factor(labels))
ggplot(X, aes(x1, x2))+
geom_point()
Create a similarity matrix \(S\).
S = exp(-1/2 * as.matrix(dist(X)))
heatmap(S)
Convert it to a weighted adjacency matrix \(W\), use 10-nearest neighbors.
knn = function(values, k){
index = order(values, decreasing = TRUE)[(1:k) + 1]
values[-index] = NA
return(values)
}
W = apply(S, 2, function(x) knn(x, k = 10))
W = (W + t(W)) / 2
W[is.na(W)] = 0
heatmap(W)
Create a graph representation.
library(igraph)
G = graph.adjacency(W, mode="undirected", weighted = TRUE)
plot(G, vertex.size=3, vertex.label=NA)
Compute Laplacian \(L\).
D = diag(colSums(W))
L = D - W
heatmap(L)
Find the last \(K\) eigenvectors of \(L\).
n = nrow(L)
K = 3
ED = eigen(L)
ED$values[(n-K+1):n]
## [1] 1.263349e-02 5.165349e-03 2.572727e-15
U = ED$vectors[, (n-K+1):n]
heatmap(U)
scatterplot3D(U)
Run \(K\)-means on the last \(K\) eigenvectors of \(L\).
KM = kmeans(U, centers = K, iter.max = 100, nstart = 10)
scatterplot3D(U, KM$cluster)
scatterplot(X, KM$cluster)
K = 4
U = ED$vectors[, (n-K+1):n]
heatmap(U)
KM = kmeans(U, centers = K, iter.max = 100, nstart = 10)
scatterplot(X, KM$cluster)
K = 5
U = ED$vectors[, (n-K+1):n]
heatmap(U)
KM = kmeans(U, centers = K, iter.max = 100, nstart = 10)
scatterplot(X, KM$cluster)
K = 6
U = ED$vectors[, (n-K+1):n]
heatmap(U)
KM = kmeans(U, centers = K, iter.max = 100, nstart = 10)
scatterplot(X, KM$cluster)
Twenty neighbors.
K = 3
W = apply(S, 2, function(x) knn(x, k = 20))
W = (W + t(W)) / 2
W[is.na(W)] = 0
heatmap(W)
G = graph.adjacency(W, mode="undirected", weighted = TRUE)
plot(G, vertex.size=3, vertex.label=NA)
D = diag(colSums(W))
L = D - W
ED = eigen(L)
ED$values[(n-K+1):n]
## [1] 3.111176e-01 4.396184e-02 -7.105427e-15
U = ED$vectors[, (n-K+1):n]
heatmap(U)
KM = kmeans(U, centers = K, iter.max = 100, nstart = 10)
scatterplot(X, KM$cluster)
This method is not sensitive to the values of distances.
eps = 0.8
W = (S > eps) * 1
diag(W) = 0
heatmap(W)
G = graph.adjacency(W, mode="undirected")
plot(G, vertex.size=3, vertex.label=NA)
D = diag(colSums(W))
L = D - W
ED = eigen(L)
ED$values[(n-K+1):n]
## [1] 7.295703e-03 1.513382e-14 -5.635310e-15
U = ED$vectors[, (n-K+1):n]
heatmap(U)
KM = kmeans(U, centers = K, iter.max = 100, nstart = 10)
scatterplot(X, KM$cluster)
Decreasing \(\epsilon\) will make \(W\) more dense.
eps = 0.5
W = (S > eps) * 1
diag(W) = 0
heatmap(W)
G = graph.adjacency(W, mode="undirected")
plot(G, vertex.size=3, vertex.label=NA)
D = diag(colSums(W))
L = D - W
ED = eigen(L)
ED$values[(n-K+1):n]
## [1] 2.673156e+01 2.459718e+01 4.956622e-13
U = ED$vectors[, (n-K+1):n]
heatmap(U)
KM = kmeans(U, centers = K, iter.max = 100, nstart = 10)
scatterplot(X, KM$cluster)
This method will also truncale more edges in the second and third clusters than in the first one.
lambda = 0.7
W = pmax(S - lambda, 0)
diag(W) = 0
heatmap(W)
G = graph.adjacency(W, mode="undirected", weighted = TRUE)
plot(G, vertex.size=3, vertex.label=NA)
D = diag(colSums(W))
L = D - W
ED = eigen(L)
ED$values[(n-K+1):n]
## [1] 4.926520e-02 4.355113e-02 -3.766508e-16
U = ED$vectors[, (n-K+1):n]
heatmap(U)
KM = kmeans(U, centers = K, iter.max = 100, nstart = 10)
scatterplot(X, KM$cluster)
K = 3
KM = kmeans(X, centers = K, iter.max = 100, nstart = 10)
scatterplot(X, KM$cluster)