Statistics for Laboratory Scientists ( 140.615 )

Analysis of Variance - Model Assumptions and Diagnostics - Advanced

Quantile-quantile plots

For the example, we take 100 random draws from a standard Normal distribution.

set.seed(1)
n <- 100
y <- rnorm(n)
plot(sort(y))

The expected normal quantiles. Note that other possibilities have been proposed as well, for example qnorm((1:n)/(n+1)).

p <- ((1:n)-0.5)/n
p
##   [1] 0.005 0.015 0.025 0.035 0.045 0.055 0.065 0.075 0.085 0.095 0.105 0.115 0.125 0.135 0.145 0.155 0.165
##  [18] 0.175 0.185 0.195 0.205 0.215 0.225 0.235 0.245 0.255 0.265 0.275 0.285 0.295 0.305 0.315 0.325 0.335
##  [35] 0.345 0.355 0.365 0.375 0.385 0.395 0.405 0.415 0.425 0.435 0.445 0.455 0.465 0.475 0.485 0.495 0.505
##  [52] 0.515 0.525 0.535 0.545 0.555 0.565 0.575 0.585 0.595 0.605 0.615 0.625 0.635 0.645 0.655 0.665 0.675
##  [69] 0.685 0.695 0.705 0.715 0.725 0.735 0.745 0.755 0.765 0.775 0.785 0.795 0.805 0.815 0.825 0.835 0.845
##  [86] 0.855 0.865 0.875 0.885 0.895 0.905 0.915 0.925 0.935 0.945 0.955 0.965 0.975 0.985 0.995
x <- qnorm(p) 
x
##   [1] -2.57582930 -2.17009038 -1.95996398 -1.81191067 -1.69539771 -1.59819314 -1.51410189 -1.43953147
##   [9] -1.37220381 -1.31057911 -1.25356544 -1.20035886 -1.15034938 -1.10306256 -1.05812162 -1.01522203
##  [17] -0.97411388 -0.93458929 -0.89647336 -0.85961736 -0.82389363 -0.78919165 -0.75541503 -0.72247905
##  [25] -0.69030882 -0.65883769 -0.62800601 -0.59776013 -0.56805150 -0.53883603 -0.51007346 -0.48172685
##  [33] -0.45376219 -0.42614801 -0.39885507 -0.37185609 -0.34512553 -0.31863936 -0.29237490 -0.26631061
##  [41] -0.24042603 -0.21470157 -0.18911843 -0.16365849 -0.13830421 -0.11303854 -0.08784484 -0.06270678
##  [49] -0.03760829 -0.01253347  0.01253347  0.03760829  0.06270678  0.08784484  0.11303854  0.13830421
##  [57]  0.16365849  0.18911843  0.21470157  0.24042603  0.26631061  0.29237490  0.31863936  0.34512553
##  [65]  0.37185609  0.39885507  0.42614801  0.45376219  0.48172685  0.51007346  0.53883603  0.56805150
##  [73]  0.59776013  0.62800601  0.65883769  0.69030882  0.72247905  0.75541503  0.78919165  0.82389363
##  [81]  0.85961736  0.89647336  0.93458929  0.97411388  1.01522203  1.05812162  1.10306256  1.15034938
##  [89]  1.20035886  1.25356544  1.31057911  1.37220381  1.43953147  1.51410189  1.59819314  1.69539771
##  [97]  1.81191067  1.95996398  2.17009038  2.57582930
plot(x, p)
abline(h=c(0,1))
axis(4, p, rep("",n))

The qq-plot ‘from scratch’, and the built-in function.

par(mfrow=c(1,2))
plot(x, sort(y), main="from scratch")
qqnorm(y)

Example from class: IL10 cytokines

Fancier plot. Re-order the factor levels (strain) for plotting.

library(SPH.140.615)
il10 <- cbind(il10, logIL10=log10(il10$IL10))
il10$Strain <- factor(il10$Strain, levels=rev(c("A","B6",as.character(c(1,2,4:8,10:15,17:19,24:26)))))

Calculate the strain means.

m <- tapply(il10$IL10, il10$Strain, mean)
logm <- tapply(il10$logIL10, il10$Strain, mean)

Plot the data. Note: ‘segments’ adds line segments at the strain means, ‘abline’ adds horizontal lines to indicate the groups, and ‘expression’ allows you to put fancy things in the axis labels.

par(las=1, mfrow=c(1,2))
stripchart(il10$IL10~il10$Strain, 
           method="jitter", pch=1, ylab="Strain", xlab="IL10 response", jitter=0.2)
segments(m, (1:21)-0.3, m, (1:21)+0.3, lwd=2, col="blue")
abline(h=1:21, lty=2, col="gray")
stripchart(il10$logIL10~il10$Strain, 
           method="jitter", pch=1, ylab="Strain", xlab=expression(paste(log[10]," IL10 response")), jitter=0.2)
segments(logm, (1:21)-0.3, logm, (1:21)+0.3, lwd=2, col="blue")
abline(h=1:21, lty=2, col="gray")

You can generate a bunch of diagnostic plots using the ‘plot’ function.

aov.il10 <- aov(IL10 ~ Strain, data=il10)
aov.logil10 <- aov(logIL10 ~ Strain, data=il10)
par(mfrow=c(2,2))
plot(aov.il10)

plot(aov.logil10)

Bartlett’s test for equality of variances

The brute-force method for IL10.

s <- tapply(il10$IL10/1000, il10$Strain, sd)
s
##        26        25        24        19        18        17        15        14        13        12        11 
## 0.1734159 0.3655768 0.8545685 1.2901688 0.6531433 0.5424332 1.0700615 0.1712304 0.4702388 1.1606007 0.5594971 
##        10         8         7         6         5         4         2         1        B6         A 
## 0.6338465 0.8701644 0.2195076 1.2141755 1.6383134 1.6405483 0.4498755 1.9180254 0.7141536 0.5934426
n <- table(il10$Strain)
n
## 
## 26 25 24 19 18 17 15 14 13 12 11 10  8  7  6  5  4  2  1 B6  A 
##  5  5 10  5  5  5  5  5 10 10  5  5 10  5  5  5 10 10  9  9  8
spsq <- sum((n-1)*s^2)/sum(n-1)
spsq
## [1] 0.9927934
xsq <- (sum(n)-length(n)) * log(spsq) - sum((n-1)*log(s^2))
xsq
## [1] 79.8956
K <- 1 + (sum(1/(n-1)) - 1/sum(n-1))/(3*(length(n)-1))
K
## [1] 1.067525
stat <- xsq/K
stat
## [1] 74.84187
pchisq(stat, length(n)-1, lower.tail=FALSE)
## [1] 2.894759e-08

Using the built-in function.

bartlett.test(il10$IL10 ~ il10$Strain) 
## 
##  Bartlett test of homogeneity of variances
## 
## data:  il10$IL10 by il10$Strain
## Bartlett's K-squared = 74.842, df = 20, p-value = 2.895e-08

The brute-force method for log10(IL10).

logs <- tapply(il10$logIL10, il10$Strain, sd)
logs
##         26         25         24         19         18         17         15         14         13         12 
## 0.11822716 0.29026204 0.27849516 0.25272549 0.16997770 0.19775197 0.35512698 0.05671307 0.20586828 0.34798329 
##         11         10          8          7          6          5          4          2          1         B6 
## 0.27747688 0.41181170 0.24434276 0.08611903 0.50270177 0.31693434 0.28082376 0.18508639 0.34149839 0.22880190 
##          A 
## 0.22178466
logspsq <- sum((n-1)*logs^2)/sum(n-1)
logspsq
## [1] 0.07429708
logxsq <- (sum(n)-length(n)) * log(logspsq) - sum((n-1)*log(logs^2))
logxsq
## [1] 33.99216
logstat <- logxsq/K
logstat
## [1] 31.84202
pchisq(logstat, length(n)-1, lower.tail=FALSE)
## [1] 0.04501105

Using the built-in function.

bartlett.test(il10$logIL10 ~ il10$Strain) 
## 
##  Bartlett test of homogeneity of variances
## 
## data:  il10$logIL10 by il10$Strain
## Bartlett's K-squared = 31.842, df = 20, p-value = 0.04501