Helper functions for visualization

library(ggplot2)
library(dplyr)

In high dimensions nobody can hear you scream

Generate uniform points in a segment in 1D.

# Draw n points uniformly on the unit segment. With n points, each one
# "owns" a sub-segment of length 1 / n, which is used as the grid step.
p <- 1
n <- 100
esize <- 1 / n
X <- matrix(runif(p * n), n, p)
colnames(X) <- c("x1")
# X stays a matrix for later numeric work; it is converted to a data frame
# only for plotting, so the layer does not depend on ggplot2 being able to
# fortify a raw matrix.
ggplot() +
  geom_point(
    data = as.data.frame(X),
    mapping = aes(x1, 0),
    color = "cornflowerblue"
  ) +
  scale_x_continuous(breaks = seq(0, 1, esize), minor_breaks = NULL) +
  # The y axis carries no information in 1D, so hide everything about it.
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank()
  ) +
  theme(axis.text.x = element_text(angle = 90, size = 6))

Generate uniform points in a square in 2D.

# Draw n points uniformly in the unit square. The grid step is the edge of a
# square cell whose area is 1 / n, i.e. (1 / n)^(1 / p).
p <- 2
n <- 100
esize <- (1 / n)^(1 / p)
X <- matrix(runif(p * n), n, p)
colnames(X) <- c("x1", "x2")
# Convert the matrix to a data frame for plotting so the layer does not
# depend on ggplot2 being able to fortify a raw matrix.
ggplot() +
  geom_point(
    data = as.data.frame(X),
    mapping = aes(x1, x2),
    color = "cornflowerblue"
  ) +
  scale_x_continuous(breaks = seq(0, 1, esize), minor_breaks = NULL) +
  scale_y_continuous(breaks = seq(0, 1, esize), minor_breaks = NULL)

Check how the edge size of the hypercube depends on the volume per observation.

# Every combination of sample size n and dimension p.
ns <- c(2, 4, 10, 100, 1000)
ps <- c(1, 2, 5, 10, 100)
df <- expand.grid(ns, ps)
names(df) <- c("n", "p")
# esize is computed while p is still numeric; p becomes a factor afterwards
# so that it maps to a discrete colour scale.
df <- df %>%
  mutate(
    volpart = 1 / n,
    esize = (1 / n)^(1 / p),
    p = as.factor(p)
  )
ggplot(df, aes(x = volpart, y = esize, color = p)) +
  geom_point() +
  geom_line() +
  labs(x = "volume per observation", y = "hypercube edge size")

Orange peel

Generate uniform points on a circle in 2D.

library(mvtnorm)
# Draw n standard Gaussian vectors and rescale each row to unit length, which
# places the points on the unit circle.
p <- 2
n <- 1000
X <- rmvnorm(n, mean = rep(0, p), sigma = diag(p))
X <- X / sqrt(rowSums(X^2))
colnames(X) <- c("x1", "x2")
# X is kept as a matrix (it is reused below); convert only for plotting so
# ggplot() does not depend on being able to fortify a raw matrix.
ggplot(as.data.frame(X)) +
  geom_point(aes(x1, x2), alpha = 0.2, color = "cornflowerblue")

Generate uniform points in a ball in 2D.

# Scale each unit-length row of X by a radius U^(1 / p), with U uniform on
# (0, 1), to move the points from the circle into the disc.
U <- runif(n)^(1 / p)
# U has one entry per row of X, so the product rescales row i by U[i].
Y <- X * U
colnames(Y) <- c("x1", "x2")
# Convert only for plotting so ggplot() does not depend on being able to
# fortify a raw matrix.
ggplot(as.data.frame(Y)) +
  geom_point(aes(x1, x2), alpha = 0.2, color = "cornflowerblue")

Check distance to the center.

# Histogram of the Euclidean distance of every point in Y to the origin.
dist <- sqrt(rowSums(Y^2))
ggplot(data.frame(dist), aes(dist)) +
  geom_histogram(fill = "orange") +
  xlab("distance to the origin") +
  # Chain xlim() onto the plot so the x axis is actually fixed to [0, 1];
  # as a separate statement it only printed a scale object.
  xlim(0, 1)

Generate uniform points on a sphere in 3D.

library(plotly)
# Same construction as in 2D: Gaussian draws rescaled to unit length, now in
# three dimensions. n is reused from the earlier chunk.
p <- 3
X <- rmvnorm(n, mean = rep(0, p), sigma = diag(p))
X <- X / sqrt(rowSums(X^2))
colnames(X) <- c("x1", "x2", "x3")
plot_ly(
  data = data.frame(X),
  x = ~x1,
  y = ~x2,
  z = ~x3,
  type = "scatter3d",
  mode = "markers",
  opacity = 0.2
)

Generate uniform points in a ball in 3D.

# Scale each unit-length row of X by a radius U^(1 / p), U uniform on (0, 1),
# to move the points from the sphere into the ball.
U <- runif(n)^(1 / p)
Y <- X * U
colnames(Y) <- c("x1", "x2", "x3")
plot_ly(
  data = data.frame(Y),
  x = ~x1,
  y = ~x2,
  z = ~x3,
  type = "scatter3d",
  mode = "markers",
  opacity = 0.2
)

Check distance to the center.

# Histogram of the Euclidean distance of every point in Y to the origin,
# with the x axis fixed to [0, 1].
dist <- sqrt(rowSums(Y^2))
ggplot(data.frame(dist), aes(x = dist)) +
  geom_histogram(fill = "orange") +
  labs(x = "distance to the origin") +
  xlim(0, 1)

What if we increase \(p\)?

# Sample points in the p-dimensional unit ball and return their distances to
# the origin. The number of points is read from the global variable `n`.
rdist <- function(p) {
  # Unit-length directions obtained from normalized Gaussian draws.
  directions <- rmvnorm(n, mean = rep(0, p), sigma = diag(p))
  directions <- directions / sqrt(rowSums(directions^2))
  # One radius per point, applied row-wise.
  radii <- runif(n)^(1 / p)
  pts <- directions * radii
  sqrt(rowSums(pts^2))
}

# Simulate distances to the origin for several dimensions and show one
# histogram per value of p.
ps <- c(2, 3, 5, 10, 25, 50, 75, 100, 200)
dist <- unlist(lapply(ps, rdist))
# rdist() returns n values per dimension, in the order of ps.
df <- data.frame(dist, p = rep(ps, each = n))
ggplot(df, aes(x = dist)) +
  geom_histogram(fill = "orange") +
  facet_wrap(~p, labeller = label_both) +
  labs(x = "distance to the origin")

Use interactive ggplotly for visualization.

# Same histograms, with p passed as the `frame` aesthetic before handing the
# plot to ggplotly().
plt <- ggplot(df, aes(x = dist, frame = p)) +
  geom_histogram(fill = "orange", position = "identity") +
  labs(x = "distance to the origin")
ggplotly(plt)

Plot the distance to the nearest neighbor from the origin vs \(p\).

# Smallest distance to the origin within each dimension p.
distmin <- summarise(group_by(df, p), distmin = min(dist))
ggplot(distmin, aes(x = p, y = distmin)) +
  geom_point() +
  geom_line() +
  labs(y = "distance to the nearest neighbor")

Repeat the simulation 30 times and check the median value of the minimum distance.

library(tidyr)
# Repeat the whole simulation nrep times; each replicate becomes one column.
nrep <- 30
dist <- replicate(nrep, unlist(lapply(ps, rdist)))
colnames(dist) <- paste0("rep", seq_len(nrep))
df <- data.frame(dist, p = rep(ps, each = n))
# Long format: one row per (p, replicate, distance).
dflong <- pivot_longer(
  df,
  !p,
  names_to = "replicate",
  values_to = "dist"
)
# Minimum distance per dimension within each replicate ...
distmin <- dflong %>%
  group_by(p, replicate) %>%
  summarise(distmin = min(dist))
# ... and its median across replicates.
distminmed <- distmin %>%
  group_by(p) %>%
  summarise(distminmed = median(distmin))
# One thin line per replicate, with the median overlaid in red.
ggplot() +
  geom_line(
    data = distmin,
    mapping = aes(p, distmin, group = replicate)
  ) +
  geom_line(
    data = distminmed,
    mapping = aes(p, distminmed),
    color = "red",
    size = 2
  ) +
  labs(y = "distance to the nearest neighbor")

Distribution of the Euclidean distance

Generate a pair of points from \(N_2(0, I)\).

# Draw one point x and one point y from a 2D standard Gaussian and plot them
# together with the segment joining them; the title shows their distance.
n <- 1
p <- 2
X <- rmvnorm(n, mean = rep(0, p), sigma = diag(p))
colnames(X) <- c("x1", "x2")
Y <- rmvnorm(n, mean = rep(0, p), sigma = diag(p))
colnames(Y) <- c("y1", "y2")
dist <- sqrt(rowSums((X - Y)^2))
df <- data.frame(X, Y)
ggplot(df) +
  # Point x and its label, nudged by 0.1 so the text does not cover it.
  geom_point(aes(x1, x2), color = "cornflowerblue") +
  geom_text(
    aes(x1 + 0.1, x2 + 0.1),
    label = "x",
    color = "cornflowerblue"
  ) +
  # Point y and its label.
  geom_point(aes(y1, y2), color = "orange") +
  geom_text(aes(y1 + 0.1, y2 + 0.1), label = "y", color = "orange") +
  geom_segment(
    aes(x = x1, y = x2, xend = y1, yend = y2),
    alpha = 0.3
  ) +
  ggtitle(paste("||x - y|| =", round(dist, 3)))

Generate 100 pairs.

# Same picture for n independent (x, y) pairs; p is reused from above.
n <- 100
X <- rmvnorm(n, mean = rep(0, p), sigma = diag(p))
colnames(X) <- c("x1", "x2")
Y <- rmvnorm(n, mean = rep(0, p), sigma = diag(p))
colnames(Y) <- c("y1", "y2")
df <- data.frame(X, Y)
ggplot(df) +
  geom_point(aes(x1, x2), color = "cornflowerblue") +
  geom_text(
    aes(x1 + 0.1, x2 + 0.1),
    label = "x",
    color = "cornflowerblue"
  ) +
  geom_point(aes(y1, y2), color = "orange") +
  geom_text(aes(y1 + 0.1, y2 + 0.1), label = "y", color = "orange") +
  # Each segment joins one x to its paired y.
  geom_segment(
    aes(x = x1, y = x2, xend = y1, yend = y2),
    alpha = 0.3
  )

Check the distribution of Euclidean distances between pairs.

# Histogram of the distances between paired rows of X and Y, with a dashed
# reference line at sqrt(2 * p) (p is the global dimension).
dist <- sqrt(rowSums((X - Y)^2))
ggplot(data.frame(dist), aes(x = dist)) +
  geom_histogram(fill = "orange") +
  geom_vline(
    aes(xintercept = sqrt(2 * p)),
    size = 1,
    linetype = "dashed"
  ) +
  labs(x = "distance between x and y")

Plot the results for different \(p\).

# Draw two independent samples from a p-dimensional standard Gaussian and
# return the distances between paired rows. The sample size is read from the
# global variable `n`.
rdist <- function(p) {
  first <- rmvnorm(n, mean = rep(0, p), sigma = diag(p))
  second <- rmvnorm(n, mean = rep(0, p), sigma = diag(p))
  sqrt(rowSums((first - second)^2))
}
# Pairwise distances for several dimensions, shown as histograms with p
# passed as the `frame` aesthetic and a dashed line at sqrt(2 * p).
ps <- c(2, 3, 5, 10, 25, 50, 75, 100, 200)
dist <- unlist(lapply(ps, rdist))
# rdist() returns n values per dimension, in the order of ps.
df <- data.frame(dist, p = rep(ps, each = n))
plt <- ggplot(df, aes(x = dist, frame = p)) +
  geom_histogram(fill = "orange", position = "identity") +
  geom_vline(
    aes(xintercept = sqrt(2 * p), frame = p),
    size = 1,
    linetype = "dashed"
  ) +
  labs(x = "distance between x and y")
ggplotly(plt)

Compute mean and standard deviation for the distances between two points.

# Mean and standard deviation of the pairwise distance for each dimension.
dfsumm <- df %>%
  group_by(p) %>%
  summarise(mean = mean(dist), sd = sd(dist))
# Orange line: mean; ribbon: mean +/- one sd; dashed line: sqrt(2 * p).
ggplot(dfsumm) +
  geom_line(aes(x = p, y = mean), color = "orange", size = 1) +
  geom_ribbon(
    aes(x = p, ymin = mean - sd, ymax = mean + sd),
    fill = "orange",
    alpha = 0.2
  ) +
  geom_line(aes(x = p, y = sqrt(2 * p)), linetype = "dashed", size = 1) +
  labs(y = "distance between x and y")

Distribution of angles

Generate a pair of points from \(N_2(0, I)\).

# Draw one point x and one point y from a 2D standard Gaussian.
n <- 1
p <- 2
X <- rmvnorm(n, mean = rep(0, p), sigma = diag(p))
colnames(X) <- c("x1", "x2")
Y <- rmvnorm(n, mean = rep(0, p), sigma = diag(p))
colnames(Y) <- c("y1", "y2")
#' Row-wise angle between two sets of vectors, in degrees.
#'
#' @param X Numeric matrix; each row is a vector.
#' @param Y Numeric matrix with the same dimensions as `X`.
#' @return Numeric vector with one angle per row, in degrees.
angles <- function(X, Y) {
  dotXY <- rowSums(X * Y)
  normX <- sqrt(rowSums(X^2))
  normY <- sqrt(rowSums(Y^2))
  cosine <- dotXY / (normX * normY)
  # Rounding can push the cosine of (anti)parallel rows slightly outside
  # [-1, 1], which would make acos() return NaN; clamp it first.
  cosine <- pmin(pmax(cosine, -1), 1)
  acos(cosine) / pi * 180
}
# Angle between the single pair, then a plot of both vectors as arrows from
# the origin; the title shows the angle.
angle <- angles(X, Y)

df <- data.frame(X, Y)
ggplot(df) +
  geom_point(aes(0, 0), color = "black") +
  # Points and labels, nudged by 0.1 so the text does not cover the point.
  geom_point(aes(x1, x2), color = "cornflowerblue") +
  geom_text(
    aes(x1 + 0.1, x2 + 0.1),
    label = "x",
    color = "cornflowerblue"
  ) +
  geom_point(aes(y1, y2), color = "orange") +
  geom_text(aes(y1 + 0.1, y2 + 0.1), label = "y", color = "orange") +
  # Arrows from the origin to x and to y.
  geom_segment(
    aes(x = 0, y = 0, xend = x1, yend = x2),
    color = "cornflowerblue",
    arrow = arrow(length = unit(0.3, "cm")),
    size = 1
  ) +
  geom_segment(
    aes(x = 0, y = 0, xend = y1, yend = y2),
    color = "orange",
    arrow = arrow(length = unit(0.3, "cm")),
    size = 1
  ) +
  # Faint segment joining x and y.
  geom_segment(
    aes(x = x1, y = x2, xend = y1, yend = y2),
    alpha = 0.3
  ) +
  ggtitle(paste("angle(x, y) =", round(angle, 1)))

Generate 100 pairs.

# Same picture for n independent (x, y) pairs; p is reused from above.
n <- 100
X <- rmvnorm(n, mean = rep(0, p), sigma = diag(p))
colnames(X) <- c("x1", "x2")
Y <- rmvnorm(n, mean = rep(0, p), sigma = diag(p))
colnames(Y) <- c("y1", "y2")
df <- data.frame(X, Y)
ggplot(df) +
  geom_point(aes(0, 0), color = "black") +
  geom_point(aes(x1, x2), color = "cornflowerblue") +
  geom_text(
    aes(x1 + 0.1, x2 + 0.1),
    label = "x",
    color = "cornflowerblue"
  ) +
  geom_point(aes(y1, y2), color = "orange") +
  geom_text(aes(y1 + 0.1, y2 + 0.1), label = "y", color = "orange") +
  # Arrows from the origin to every x and every y.
  geom_segment(
    aes(x = 0, y = 0, xend = x1, yend = x2),
    color = "cornflowerblue",
    arrow = arrow(length = unit(0.5, "cm")),
    size = 1
  ) +
  geom_segment(
    aes(x = 0, y = 0, xend = y1, yend = y2),
    color = "orange",
    arrow = arrow(length = unit(0.5, "cm")),
    size = 1
  ) +
  # Faint segment joining each x to its paired y.
  geom_segment(
    aes(x = x1, y = x2, xend = y1, yend = y2),
    alpha = 0.3
  )

Check the distribution of angles between pairs.

# Histogram of the angles between paired rows of X and Y, with a dashed
# reference line at 90 degrees.
angle <- angles(X, Y)
ggplot(data.frame(angle), aes(x = angle)) +
  geom_histogram(fill = "orange") +
  geom_vline(aes(xintercept = 90), size = 1, linetype = "dashed") +
  labs(x = "angle between x and y")

Plot the results for different \(p\).

# Draw two independent samples from a p-dimensional standard Gaussian and
# return the angles (via angles()) between paired rows. The sample size is
# read from the global variable `n`.
rangle <- function(p) {
  first <- rmvnorm(n, mean = rep(0, p), sigma = diag(p))
  second <- rmvnorm(n, mean = rep(0, p), sigma = diag(p))
  angles(first, second)
}
# Angles for several dimensions, shown as histograms with p passed as the
# `frame` aesthetic and a dashed line at 90 degrees.
ps <- c(2, 3, 5, 10, 25, 50, 75, 100, 200)
angle <- unlist(lapply(ps, rangle))
# rangle() returns n values per dimension, in the order of ps.
df <- data.frame(angle, p = rep(ps, each = n))
plt <- ggplot(df, aes(x = angle, frame = p)) +
  geom_histogram(fill = "orange", position = "identity") +
  geom_vline(
    aes(xintercept = 90, frame = p),
    size = 1,
    linetype = "dashed"
  ) +
  labs(x = "angle between x and y")
ggplotly(plt)

Compute mean and standard deviation for the angles between two vectors.

# Mean and standard deviation of the angle for each dimension.
dfsumm <- df %>%
  group_by(p) %>%
  summarise(mean = mean(angle), sd = sd(angle))
# Orange line: mean; ribbon: mean +/- one sd; dashed line: 90 degrees.
ggplot(dfsumm) +
  geom_line(aes(x = p, y = mean), color = "orange", size = 1) +
  geom_ribbon(
    aes(x = p, ymin = mean - sd, ymax = mean + sd),
    fill = "orange",
    alpha = 0.2
  ) +
  geom_line(aes(x = p, y = 90), linetype = "dashed", size = 1) +
  labs(y = "angle between x and y")