Generating correlated random numbers in R from scratch : R'tichoke

Here’s a quick post on how to generate correlated random numbers in R inpired by this stack overflow post.

First step is to define a covariance matrix

# Covariance and correlation for standardised variables would be same
# Specifying correlations instead
(cor_mat <- matrix(c(1, 0.3, 0.3, 1), nrow = 2, byrow = T))
##      [,1] [,2]
## [1,]  1.0  0.3
## [2,]  0.3  1.0

Next decompose the matrix using Cholesky’s decomposition

(chol_mat <- chol(cor_mat))
##      [,1]      [,2]
## [1,]    1 0.3000000
## [2,]    0 0.9539392

Generate some random numbers

old_random <- matrix(rnorm(2000), ncol = 2)

Multiply this matrix with the upper triangular matrix from above

new_random <- old_random %*% chol_mat
cor(new_random)
##          [,1]     [,2]
## [1,] 1.000000 0.299686
## [2,] 0.299686 1.000000

cor(old_random)
##           [,1]      [,2]
## [1,] 1.0000000 0.0106811
## [2,] 0.0106811 1.0000000

cor(new_random)
##          [,1]     [,2]
## [1,] 1.000000 0.299686
## [2,] 0.299686 1.000000

Some notes and caveats

The original random variables need to be as uncorrelated as possible for this to work well.

corrs_high <- c()

for(i in 1:1000){
  
  x <- rnorm(1000)
  y <- 2 * x + rnorm(1000)
  
  old_random <- as.matrix(data.frame(x, y))
  chol_mat <- chol(matrix(c(1, 0.3, 0.3, 1), ncol = 2, byrow = T))
  new_random <- old_random %*% chol_mat
  corrs_high <- c(corrs_high, cor(new_random)[1,2])
}

# The specified correlation/covariance structure is not respected
hist(corrs_high)

corrs_low <- c()

for(i in 1:1000){
  
  x <- rnorm(1000)
  y <- 0.001 * x + rnorm(1000)
  
  old_random <- as.matrix(data.frame(x, y))
  chol_mat <- chol(matrix(c(1, 0.3, 0.3, 1), ncol = 2, byrow = T))
  new_random <- old_random %*% chol_mat
  corrs_low <- c(corrs_low, cor(new_random)[1,2])
}

# Now the correlation between the two variables is much closer to the specified value
hist(corrs_low)

Tends to not work results if the original samples (uncorrelated random variables) are from different distributions

x <- rchisq(1000, 2, 3)
y <- rnorm(1000)
  
old_random <- as.matrix(data.frame(x, y))
chol_mat <- chol(matrix(c(1, 0.3, 0.3, 1), ncol = 2, byrow = T))
new_random <- old_random %*% chol_mat

cor(new_random)
##           [,1]      [,2]
## [1,] 1.0000000 0.7868328
## [2,] 0.7868328 1.0000000

x <- rchisq(1000, 2, 3)
y <- rchisq(1000, 2, 3)
  
old_random <- as.matrix(data.frame(x, y))
chol_mat <- chol(matrix(c(1, 0.3, 0.3, 1), ncol = 2, byrow = T))
new_random <- old_random %*% chol_mat

cor(new_random)
##           [,1]      [,2]
## [1,] 1.0000000 0.2931943
## [2,] 0.2931943 1.0000000

There is no way to ensure that characteristics of the original distributions are maintained

x <- rchisq(1000, 2, 3)
y <- rchisq(1000, 2, 3)
  
old_random <- as.matrix(data.frame(x, y))
chol_mat <- chol(matrix(c(1, -0.3, -0.3, 1), ncol = 2, byrow = T))
new_random <- old_random %*% chol_mat

# While the correlation value seems fine
cor(new_random)
##            [,1]       [,2]
## [1,]  1.0000000 -0.3103062
## [2,] -0.3103062  1.0000000

# There are negative values! 
range(new_random)
## [1] -7.828389 28.882035

Or, just use mvtnorm::rmvnorm() 😄

sigma <- matrix(c(4,2,2,3), ncol=2)
cov2cor(sigma)  ## Expected correlation
##           [,1]      [,2]
## [1,] 1.0000000 0.5773503
## [2,] 0.5773503 1.0000000

x <- mvtnorm::rmvnorm(n = 500, mean = c(1,2), sigma = sigma)
cor(x)  ## Actual correlation
##           [,1]      [,2]
## [1,] 1.0000000 0.6057812
## [2,] 0.6057812 1.0000000

Thoughts? Comments? Helpful? Not helpful? Like to see anything else added in here? Let me know!