# Simulating Sample Size Effects

## Simulate and plot data in R to see the effects of sample size differences

```library(truncnorm)
# modified version of rnorm() to allow min and max specification

n <- 20 # base n
f <- 1:75 # sample size multiplication vector
N <- n * f # vector of 75 different sample sizes (20 to 1500)

# assume a 1 to 10 rating scale
# (but underlying distribution assumed continuous)
scalemin <- 1
scalemax <- 10

# set distribution parameters
# (but these don't match the "empirical" means for some reason)
M <- 7 # this produces a simulated "true" mean of ~6.5
SD <- 2.5 # and this results in an SD of ~2.0
# probably because it just cuts off the normal distribution
# and if it's skewed, the mean will be off

# number of runs per sample size
runs <- 25

# set up plot area: one grid plot (25 cells) for each run
par(mfrow=c(5,5))

# set up a matrix to save relevant results
# 5 columns/variables as output/results per run and sample size
# 108 rows (9 * 12)
out <- matrix( , nrow = runs * length(N), ncol = 5)

#################
# 2 loops: one through the sample sizes
# and within that one through the runs
# for the output matrix, the row number is calculated as i + (k - 1)*runs

for (k in f){
for (i in 1:runs){
y <- rtruncnorm(N[k], mean = M, sd = SD,
a = scalemin, b = scalemax)

hist(y, main = "", xlab = "", ylab = "",
xlim = c(1,10), xaxt = 'n', yaxt ='n',
freq = FALSE)
axis(side=1, at=1:10, labels=1:10)
abline(v = mean(y), col = "blue")

out[i + (k - 1)*runs , 1] <- N[k]
out[i + (k - 1)*runs , 2] <- i
out[i + (k - 1)*runs , 3] <- mean(y)
out[i + (k - 1)*runs , 4] <- sd(y)
out[i + (k - 1)*runs , 5] <- max(y) - min(y)
}
}

###

par(mfrow=c(1,3))
###

out <- as.data.frame(out)
names(out) <- c("n", "run", "mean", "SD", "range")
# make n a factor var (with 75 ordered levels)
out\$n <- ordered(out\$n)

# look at results by sample size
a <- aggregate(x = out\$mean, by = list(out\$n), FUN = mean)
mean(a\$x) # mean of means

b <- aggregate(x = out\$SD, by = list(out\$n), FUN = mean)
mean(b\$x) # mean of SDs

plot(out\$mean ~ out\$n,
xlab = "sample size", ylab = "mean", las=2)
abline(h = mean(a\$x), col = "red")

plot(out\$SD ~ out\$n,
xlab = "sample size", ylab = "standard deviation", las=2)
abline(h = mean(b\$x), col = "red")

plot(out\$range ~ out\$n,
xlab = "sample size", ylab = "range", las=2)
abline(h = scalemax - scalemin, col = "red")

#
par(mfrow=c(3,1))

hist(out\$mean[out\$n=="20"], xlim = c(5,8),
freq = T, breaks = 10, yaxt = 'n', ylab = '',
xlab = "", main = "N = 20")

hist(out\$mean[out\$n=="200"], xlim = c(5,8),
freq = T, breaks = 10, yaxt = 'n', ylab = '',
xlab="", main = "N = 200")

hist(out\$mean[out\$n=="1000"], xlim = c(5,8),
freq = T, breaks = 10, yaxt = 'n', ylab = '',
xlab = "mean", main = "N = 1000")

################################################################

```