Clustering: spectral

Spectral clustering

Generate data.

set.seed(0)
n = 100
r1 = 0.3
r2 = 1
r3 = 1.8
X1 = matrix(rnorm(n * 2), n, 2)
X1 = X1/sqrt(rowSums(X1^2)) * r1
X2 = matrix(rnorm(n * 2), n, 2)
X2 = X2/sqrt(rowSums(X2^2)) * r2
X3 = matrix(rnorm(n * 2), n, 2)
X3 = X3/sqrt(rowSums(X3^2)) * r3
E = matrix(rnorm(3*n * 2, 0, 0.1), 3*n, 2)
X = rbind(X1, X2, X3) + E
X = scale (X, scale = F, center = T)
colnames(X) = c("x1", "x2")
labels = c(rep(1, n), rep(2, n), rep(3, n))
df = data.frame(X, label = as.factor(labels))
ggplot(X, aes(x1, x2))+
  geom_point()

Create a similarity matrix \(S\).

S = exp(-1/2 * as.matrix(dist(X)))
heatmap(S)

Convert it to a weighted adjacency matrix \(W\), use 10-nearest neighbors.

knn = function(values, k){
  index = order(values, decreasing = TRUE)[(1:k) + 1]
  values[-index] = NA
  return(values)
}
W = apply(S, 2, function(x) knn(x, k = 10))
W = (W + t(W)) / 2
W[is.na(W)] = 0
heatmap(W)

Create a graph representation.

library(igraph)
G = graph.adjacency(W, mode="undirected", weighted = TRUE)
plot(G, vertex.size=3, vertex.label=NA)

Compute Laplacian \(L\).

D = diag(colSums(W))
L = D - W
heatmap(L)

Find the last \(K\) eigenvectors of \(L\).

n = nrow(L)
K = 3
ED = eigen(L)
ED$values[(n-K+1):n]

## [1] 1.263349e-02 5.165349e-03 2.572727e-15

U = ED$vectors[, (n-K+1):n]
heatmap(U)

scatterplot3D(U)

Run \(K\)-means on the last \(K\) eigenvectors of \(L\).

KM = kmeans(U, centers = K, iter.max = 100, nstart = 10)
scatterplot3D(U, KM$cluster)

scatterplot(X, KM$cluster)

What if we increase the number of clusters in K-means?

K = 4
U = ED$vectors[, (n-K+1):n]
heatmap(U)

KM = kmeans(U, centers = K, iter.max = 100, nstart = 10)
scatterplot(X, KM$cluster)

K = 5
U = ED$vectors[, (n-K+1):n]
heatmap(U)

KM = kmeans(U, centers = K, iter.max = 100, nstart = 10)
scatterplot(X, KM$cluster)

K = 6
U = ED$vectors[, (n-K+1):n]
heatmap(U)

KM = kmeans(U, centers = K, iter.max = 100, nstart = 10)
scatterplot(X, KM$cluster)

What if we increase the number of neighbors?

Twenty neighbors.

K = 3
W = apply(S, 2, function(x) knn(x, k = 20))
W = (W + t(W)) / 2
W[is.na(W)] = 0
heatmap(W)

G = graph.adjacency(W, mode="undirected", weighted = TRUE)
plot(G, vertex.size=3, vertex.label=NA)

D = diag(colSums(W))
L = D - W
ED = eigen(L)
ED$values[(n-K+1):n]

## [1]  3.111176e-01  4.396184e-02 -7.105427e-15

U = ED$vectors[, (n-K+1):n]
heatmap(U)

KM = kmeans(U, centers = K, iter.max = 100, nstart = 10)
scatterplot(X, KM$cluster)

What is we use the \(\epsilon\)-neighborhood adjacency matrix instead?

This method is not sensitive to the values of distances.

eps = 0.8
W = (S > eps) * 1
diag(W) = 0
heatmap(W)

G = graph.adjacency(W, mode="undirected")
plot(G, vertex.size=3, vertex.label=NA)

D = diag(colSums(W))
L = D - W
ED = eigen(L)
ED$values[(n-K+1):n]

## [1]  7.295703e-03  1.513382e-14 -5.635310e-15

U = ED$vectors[, (n-K+1):n]
heatmap(U)

KM = kmeans(U, centers = K, iter.max = 100, nstart = 10)
scatterplot(X, KM$cluster)

Decreasing \(\epsilon\) will make \(W\) more dense.

eps = 0.5
W = (S > eps) * 1
diag(W) = 0
heatmap(W)

G = graph.adjacency(W, mode="undirected")
plot(G, vertex.size=3, vertex.label=NA)

D = diag(colSums(W))
L = D - W
ED = eigen(L)
ED$values[(n-K+1):n]

## [1] 2.673156e+01 2.459718e+01 4.956622e-13

U = ED$vectors[, (n-K+1):n]
heatmap(U)

KM = kmeans(U, centers = K, iter.max = 100, nstart = 10)
scatterplot(X, KM$cluster)

What if we use soft-threshold adjacency matrix instead?

This method will also truncale more edges in the second and third clusters than in the first one.

lambda = 0.7
W = pmax(S - lambda, 0)
diag(W) = 0
heatmap(W)

G = graph.adjacency(W, mode="undirected", weighted = TRUE)
plot(G, vertex.size=3, vertex.label=NA)

D = diag(colSums(W))
L = D - W
ED = eigen(L)
ED$values[(n-K+1):n]

## [1]  4.926520e-02  4.355113e-02 -3.766508e-16

U = ED$vectors[, (n-K+1):n]
heatmap(U)

KM = kmeans(U, centers = K, iter.max = 100, nstart = 10)
scatterplot(X, KM$cluster)