Helper functions

library(ggplot2)
library(dplyr)
library(viridis)
library(ggfortify)
library(tictoc)

scatterplot = function(X, M, cluster, label = F){
  if(length(unique(cluster)) == 1){
    plt = ggplot()+
      geom_point(X, mapping = aes(longtitude, latitude))+
      geom_point(M, mapping = aes(longtitude, latitude), shape = 4, size = 4)
    if(label) return(plt + geom_text(X, mapping = aes(longtitude, latitude, label = 1:nrow(X)), nudge_x = 0.1))
    else return(plt)
  }
  else{
    ggplot()+
      geom_point(data.frame(X, cluster = as.factor(cluster)), mapping = aes(longtitude, latitude, color = cluster))+
      geom_point(M, mapping = aes(longtitude, latitude), shape = 4, size = 4)+
      theme(legend.position = "none")
  } 
}

barplot = function(values){
  n = length(values)
  df = data.frame(value = values,  index = 1:n)
  ggplot(df, aes(index, value, fill = value)) + 
    geom_bar(color = "black", stat = "identity")+
    scale_fill_gradient2(low="#619CFF", mid="white", high="#F8766D")+
    scale_x_continuous(breaks = 1:n)+
    theme_bw()+
    theme(legend.position = "none")
}

Urban planning

Imagine we have a location with \(n = 18\) houses and the city administration wants to build \(K = 3\) grocery shops.

set.seed(6)
n = 18
K = 3
M = matrix(rnorm(K * 2), K, 2)
colnames(M) =  c("longtitude", "latitude")
Ms = M[rep(c(1,2,3), times = c(n/6, n/3, n/2)),]
X = matrix(rnorm(n * 2, 0, 0.3), n, 2) + Ms
colnames(X) = c("longtitude", "latitude")
plt = ggplot()+
  geom_point(X, mapping = aes(longtitude, latitude))
plt

We will use K-means with \(K = 3\) to identify the locations for the grocery stores.

KM = kmeans(X, centers = K, algorithm = "Lloyd")
KM$cluster
##  [1] 2 2 2 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1
KM$centers
##   longtitude  latitude
## 1  0.9528791 0.3362779
## 2  0.2170350 1.6254519
## 3 -0.6706884 0.1156320
KM$size
## [1] 9 3 6
KM$iter
## [1] 4

Let’s investigate the impact of initialization on the results.

M0 = M - 0.6
scatterplot(X, M0, rep(1, n))

for(iter in 1:3){
  KM = kmeans(X, iter.max = iter, centers = M0, algorithm = "Lloyd")
  print(scatterplot(X, KM$centers, KM$cluster))
}

Now we try different initialization. The result is quite different.

M0 = M - 1
scatterplot(X, M0, rep(1, n))

for(iter in 1:3){
  KM = kmeans(X, iter.max = iter, centers = M0, algorithm = "Lloyd")
  print(scatterplot(X, KM$centers, KM$cluster))
}

K-means++

The first centroid \(m_1\) is chosen to be \(x_i\) at random.

set.seed(0)
M1 = X[sample(1:n, 1),,drop = F]
scatterplot(X, M1, rep(1, n), label = T)

The next centroid is chosen to be \(x_i\) with probability proportional to \(\|x_i-m_{k-1}\|^2\).

D = as.matrix(dist(rbind(M1, X)))[2:(n+1),1]
P = D^2/sum(D^2)
barplot(P)+
  ylab("probability")

pick = sample(1:n, 1, prob = P)
M2 = X[pick,,drop = F]
scatterplot(X, rbind(M1, M2), rep(1, n), label = T)

D = as.matrix(dist(rbind(M2, X)))[2:(n+1),1]
P = D^2/sum(D^2)
barplot(P)+
  ylab("probability")

pick = sample(1:n, 1, prob = P)
M3 = X[pick,,drop = F]
scatterplot(X, rbind(M1, M2, M3), rep(1, n), label = T)

Run K-means with the K-means++ initialization.

M0 = rbind(M1, M2, M3)
KM = kmeans(X, centers = M0, algorithm = "Lloyd")
print(scatterplot(X, KM$centers, KM$cluster))

Between- and within-cluster scatter

Check the pattern for \(B(K)\) and \(W(K)\).

Ks = 2:10
Bs = c()
Ws = c()
for(K in Ks){
  KM = kmeans(X, centers = K, iter.max = 10, nstart = 10)
  Bs = c(Bs, KM$betweenss)
  Ws = c(Ws, KM$tot.withinss)
}
df = data.frame(scatter = c(Bs, Ws, Bs + Ws), K = Ks, type = rep(c("between", "within", "total"), times = rep(length(Ks), 3)))
ggplot(df, aes(K, scatter, color = type))+
  geom_point()+
  geom_line()

Gap statistics

Compute the Gap statistics.

library(factoextra)
library(cluster)
gapstat = clusGap(X, FUN = kmeans, nstart = 50, K.max = 10, B = 50)
print(gapstat, method = "Tibs2001SEmax")
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = X, FUNcluster = kmeans, K.max = 10, B = 50, nstart = 50)
## B=50 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
##  --> Number of clusters (method 'Tibs2001SEmax', SE.factor=1): 1
##              logW      E.logW          gap     SE.sim
##  [1,]  1.73073261  1.65221990 -0.078512713 0.08206349
##  [2,]  1.31333413  1.23601719 -0.077316939 0.11057044
##  [3,]  0.91533878  0.92551080  0.010172019 0.12356291
##  [4,]  0.66582670  0.67139940  0.005572697 0.12418960
##  [5,]  0.47787073  0.43915446 -0.038716271 0.14208550
##  [6,]  0.33095868  0.22514662 -0.105812061 0.15041960
##  [7,]  0.14986913  0.01758715 -0.132281972 0.16519954
##  [8,] -0.02684498 -0.20254494 -0.175699965 0.18444538
##  [9,] -0.20442234 -0.42295155 -0.218529214 0.19449693
## [10,] -0.39694634 -0.64774195 -0.250795607 0.19662031
fviz_gap_stat(gapstat, maxSE = list(method = "Tibs2001SEmax", SE.factor = 1))

Make clusters more separated from each other.

set.seed(6)
n = 18
K = 3
M = matrix(rnorm(K * 2), K, 2)
colnames(M) =  c("longtitude", "latitude")
Ms = M[rep(c(1,2,3), times = c(n/6, n/3, n/2)),]
X = matrix(rnorm(n * 2, 0, 0.1), n, 2) + Ms
colnames(X) = c("longtitude", "latitude")
ggplot()+
  geom_point(X, mapping = aes(longtitude, latitude))

gapstat = clusGap(X, FUN = kmeans, nstart = 50, K.max = 10, B = 50)
fviz_gap_stat(gapstat, maxSE = list(method = "Tibs2001SEmax", SE.factor = 1))