Review: multivariate normal distribution

Generating sample from MVN

Generate \(x\sim\mathcal{N}_2(0, I)\).

n = 1000
X = matrix(rnorm(2*n), n, 2)
colnames(X) = c("x1", "x2")
mu = c(0, 0)
axes = diag(2)

ggplot(data = data.frame(X), aes(x1, x2)) +
  geom_point(alpha = 0.1)+
  stat_ellipse(level = 0.8, color = "forestgreen", size = 1.5, type = "norm")+
  stat_ellipse(level = 0.95, color = "forestgreen", size = 1.5, type = "norm")+
  annotate("point", x = mu[1], y = mu[2], size = 3, color = "forestgreen")+
  annotate("segment", x = mu[1], y = mu[2], xend = mu[1] + axes[1,1], yend = mu[2] + axes[2,1],
                  arrow = arrow(length = unit(0.5, "cm")), color = "forestgreen", size = 1.5)+
  annotate("segment", x = mu[1], y = mu[2], xend = mu[1] + axes[1,2], yend = mu[2] + axes[2,2],
                  arrow = arrow(length = unit(0.5, "cm")), color = "forestgreen", size = 1.5)+
  coord_fixed()

Generate \(x\sim\mathcal{N}_2(\mu, \Sigma)\).

mu = c(1,2)
U = matrix(c(1/sqrt(2), 1/sqrt(2), -1/sqrt(2), 1/sqrt(2)), 2, 2)
lambdas = c(0.2, 1) 
Sigma = U %*% diag(lambdas) %*% t(U)

Sigmasqrt = U %*% diag(sqrt(lambdas)) %*% t(U)
X = t(Sigmasqrt %*% t(X) + mu)
colnames(X) = c("x1", "x2")
axes = U %*% diag(sqrt(lambdas))

plt = ggplot(data = data.frame(X), aes(x1, x2)) +
  geom_point(alpha = 0.1)

plt +
  stat_ellipse(level = 0.8, color = "forestgreen", size = 1.5, type = "norm")+
  stat_ellipse(level = 0.95, color = "forestgreen", size = 1.5, type = "norm")+
  annotate("point", x = mu[1], y = mu[2], size = 3, color = "forestgreen")+
  annotate("segment", x = mu[1], y = mu[2], xend = mu[1] + axes[1,1], yend = mu[2] + axes[2,1],
                  arrow = arrow(length = unit(0.5, "cm")), color = "forestgreen", size = 1.5)+
  annotate("segment", x = mu[1], y = mu[2], xend = mu[1] + axes[1,2], yend = mu[2] + axes[2,2],
                  arrow = arrow(length = unit(0.5, "cm")), color = "forestgreen", size = 1.5)+
  coord_fixed()

Alternative way: generate MVN data via rmvnorm.

library(mvtnorm)
X = rmvnorm(n, mean = mu, sigma = Sigma)
colnames(X) = c("x1", "x2")
plt+  
  geom_point(data = data.frame(X), aes(x1, x2), color = "red", alpha = 0.5)

Plot marginal distributions.

ggplot(X, aes(x = x1))+
  geom_histogram(aes(y = ..density..))+
  geom_density(color = "forestgreen", size = 1.5)+
  geom_vline(aes(xintercept = mu[1]), color = "forestgreen", size = 1.5)

ggplot(X, aes(x = x2))+
  geom_histogram(aes(y = ..density..))+
  geom_density(color = "forestgreen", size = 1.5)+
  geom_vline(aes(xintercept = mu[2]), color = "forestgreen", size = 1.5)

Add marginal distributions to the scatterplot plot.

library(ggExtra)
ggMarginal(plt, type="histogram")

Add conditional distributions to the scatterplot.

library(dplyr)
x1 = X[,1]; x2 = X[,2]
x1grid = seq(quantile(x1, 0.05), quantile(x1, 0.95), length.out = 5) 
x2grid = seq(min(x2), max(x2), length.out = 100) 
condMVN = function(x1, x2, mu, Sigma){
  muc = mu[2] + Sigma[2,1]/Sigma[1,1] * (x1 - mu[1])
  Sigmac = Sigma[2,2] - Sigma[1,2] * Sigma[2,1]/Sigma[1,1]
  dnorm(x2, muc, sqrt(Sigmac))
}
df = expand.grid(x1grid, x2grid) %>%
  rename(x1 = 1, x2 = 2) %>% 
  mutate(density = condMVN(x1, x2, mu, Sigma))

plt +
    geom_abline(aes(intercept = mu[2] - Sigma[2,1]/Sigma[1,1] * mu[1], 
                    slope = Sigma[2,1]/Sigma[1,1]), color = "orange", size = 1.5) + 
    geom_path(df, mapping = aes(x1 + density, x2, group = x1), color = "purple", size = 1) +
    geom_abline(aes(intercept = mu[2] - axes[2,1]/axes[1,1] * mu[1], 
                    slope = axes[2,1]/axes[1,1]), color = "forestgreen", size = 1, linetype = "dashed") + 
    geom_abline(aes(intercept = mu[2] - axes[2,2]/axes[1,2] * mu[1], 
                    slope = axes[2,2]/axes[1,2]), color = "forestgreen", size = 1, linetype = "dashed") +
  coord_fixed()

Wishart distribution withn = number of matrices, df \(= n\), Sigma \(=\Sigma\).

df = 5
nrep = 3
Slist = rWishart(n = nrep, df = df, Sigma = Sigma)
apply(Slist, 3, heatmap)

## [[1]]

## 
## [[2]]

## 
## [[3]]

Generate a sample using random covariance matrices. Compare the results.

Xlist = apply(Slist, 3, function(S) rmvnorm(500, mean = c(0,0), sigma = S/df), simplify = F)
Xs = do.call(rbind, Xlist)
colnames(Xs) = c("x1", "x2")
df = data.frame(Xs, replicate = as.factor(rep(1:nrep, rep(500, nrep))))
ggplot(data = df, aes(x1, x2, color = replicate, group = replicate)) +
  geom_point(alpha = 0.3) +
  stat_ellipse(level = 0.95, size = 1.5, type = "norm")

What if we increase df?

df = 100
Slist = rWishart(n = nrep, df = df, Sigma = Sigma)
apply(Slist, 3, heatmap)

## [[1]]

## 
## [[2]]

## 
## [[3]]

When increasing degrees of freedom, random Wishart matrices become more similar. As a result, the generated data look more similar as well.

Xlist = apply(Slist, 3, function(S) rmvnorm(500, mean = c(0,0), sigma = S/df), simplify = F)
Xs = do.call(rbind, Xlist)
colnames(Xs) = c("x1", "x2")
df = data.frame(Xs, replicate = as.factor(rep(1:nrep, rep(500, nrep))))
ggplot(data = df, aes(x1, x2, color = replicate, group = replicate)) +
  geom_point(alpha = 0.3) +
  stat_ellipse(level = 0.95, size = 1.5, type = "norm")

Review: multivariate normal distribution

Helper functions for visualization

Generating sample from MVN