Simulate and plot data in R to see the effects of sample size differences
Results: https://twitter.com/MoritzBuchi/status/1394967444209471488
library(truncnorm)
# modified version of rnorm() to allow min and max specification
n <- 20 # base n
f <- 1:75 # sample size multiplication vector
N <- n * f # vector of 75 different sample sizes (20 to 1500)
# assume a 1 to 10 rating scale
# (but underlying distribution assumed continuous)
scalemin <- 1
scalemax <- 10
# set distribution parameters
# (but these don't match the "empirical" means for some reason)
M <- 7 # this produces a simulated "true" mean of ~6.5
SD <- 2.5 # and this results in an SD of ~2.0
# probably because it just cuts off the normal distribution
# and if it's skewed, the mean will be off
# number of runs per sample size
runs <- 25
# set up plot area: one grid plot (25 cells) for each run
par(mfrow=c(5,5))
# set up a matrix to save relevant results
# 5 columns/variables as output/results per run and sample size
# 108 rows (9 * 12)
out <- matrix( , nrow = runs * length(N), ncol = 5)
#################
# 2 loops: one through the sample sizes
# and within that one through the runs
# for the output matrix, the row number is calculated as i + (k - 1)*runs
for (k in f){
for (i in 1:runs){
y <- rtruncnorm(N[k], mean = M, sd = SD,
a = scalemin, b = scalemax)
hist(y, main = "", xlab = "", ylab = "",
xlim = c(1,10), xaxt = 'n', yaxt ='n',
freq = FALSE)
axis(side=1, at=1:10, labels=1:10)
abline(v = mean(y), col = "blue")
out[i + (k - 1)*runs , 1] <- N[k]
out[i + (k - 1)*runs , 2] <- i
out[i + (k - 1)*runs , 3] <- mean(y)
out[i + (k - 1)*runs , 4] <- sd(y)
out[i + (k - 1)*runs , 5] <- max(y) - min(y)
}
}
###
par(mfrow=c(1,3))
###
out <- as.data.frame(out)
names(out) <- c("n", "run", "mean", "SD", "range")
# make n a factor var (with 75 ordered levels)
out$n <- ordered(out$n)
# look at results by sample size
a <- aggregate(x = out$mean, by = list(out$n), FUN = mean)
mean(a$x) # mean of means
b <- aggregate(x = out$SD, by = list(out$n), FUN = mean)
mean(b$x) # mean of SDs
plot(out$mean ~ out$n,
xlab = "sample size", ylab = "mean", las=2)
abline(h = mean(a$x), col = "red")
plot(out$SD ~ out$n,
xlab = "sample size", ylab = "standard deviation", las=2)
abline(h = mean(b$x), col = "red")
plot(out$range ~ out$n,
xlab = "sample size", ylab = "range", las=2)
abline(h = scalemax - scalemin, col = "red")
#
par(mfrow=c(3,1))
hist(out$mean[out$n=="20"], xlim = c(5,8),
freq = T, breaks = 10, yaxt = 'n', ylab = '',
xlab = "", main = "N = 20")
hist(out$mean[out$n=="200"], xlim = c(5,8),
freq = T, breaks = 10, yaxt = 'n', ylab = '',
xlab="", main = "N = 200")
hist(out$mean[out$n=="1000"], xlim = c(5,8),
freq = T, breaks = 10, yaxt = 'n', ylab = '',
xlab = "mean", main = "N = 1000")
################################################################