Simulating Sample Size Effects

Simulate and plot data in R to see the effects of sample size differences

Results: https://twitter.com/MoritzBuchi/status/1394967444209471488

library(truncnorm) 
# modified version of rnorm() to allow min and max specification

n <- 20 # base n
f <- 1:75 # sample size multiplication vector
N <- n * f # vector of 75 different sample sizes (20 to 1500)

# assume a 1 to 10 rating scale 
# (but underlying distribution assumed continuous)
scalemin <- 1
scalemax <- 10

# set distribution parameters
# (but these don't match the "empirical" means for some reason)
M <- 7 # this produces a simulated "true" mean of ~6.5
SD <- 2.5 # and this results in an SD of ~2.0
# probably because it just cuts off the normal distribution
# and if it's skewed, the mean will be off


# number of runs per sample size
runs <- 25 

# set up plot area: one grid plot (25 cells) for each run
par(mfrow=c(5,5))

# set up a matrix to save relevant results
# 5 columns/variables as output/results per run and sample size
# 108 rows (9 * 12)
out <- matrix( , nrow = runs * length(N), ncol = 5) 

#################
# 2 loops: one through the sample sizes 
# and within that one through the runs
# for the output matrix, the row number is calculated as i + (k - 1)*runs

for (k in f){
  for (i in 1:runs){
    y <- rtruncnorm(N[k], mean = M, sd = SD,
                    a = scalemin, b = scalemax)
    
    hist(y, main = "", xlab = "", ylab = "",
         xlim = c(1,10), xaxt = 'n', yaxt ='n',
         freq = FALSE)
    axis(side=1, at=1:10, labels=1:10)
    abline(v = mean(y), col = "blue")
    
    out[i + (k - 1)*runs , 1] <- N[k]
    out[i + (k - 1)*runs , 2] <- i
    out[i + (k - 1)*runs , 3] <- mean(y)
    out[i + (k - 1)*runs , 4] <- sd(y)
    out[i + (k - 1)*runs , 5] <- max(y) - min(y)
  }
}

###

par(mfrow=c(1,3))
###

out <- as.data.frame(out)
names(out) <- c("n", "run", "mean", "SD", "range")
# make n a factor var (with 75 ordered levels)
out$n <- ordered(out$n)

# look at results by sample size
a <- aggregate(x = out$mean, by = list(out$n), FUN = mean)
mean(a$x) # mean of means

b <- aggregate(x = out$SD, by = list(out$n), FUN = mean)
mean(b$x) # mean of SDs


plot(out$mean ~ out$n,
     xlab = "sample size", ylab = "mean", las=2)
abline(h = mean(a$x), col = "red")

plot(out$SD ~ out$n,
     xlab = "sample size", ylab = "standard deviation", las=2)
abline(h = mean(b$x), col = "red")

plot(out$range ~ out$n,
     xlab = "sample size", ylab = "range", las=2)
abline(h = scalemax - scalemin, col = "red")


#
par(mfrow=c(3,1))

hist(out$mean[out$n=="20"], xlim = c(5,8), 
     freq = T, breaks = 10, yaxt = 'n', ylab = '',
     xlab = "", main = "N = 20")

hist(out$mean[out$n=="200"], xlim = c(5,8), 
     freq = T, breaks = 10, yaxt = 'n', ylab = '',
     xlab="", main = "N = 200")

hist(out$mean[out$n=="1000"], xlim = c(5,8), 
     freq = T, breaks = 10, yaxt = 'n', ylab = '',
     xlab = "mean", main = "N = 1000")



################################################################

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Google photo

You are commenting using your Google account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s