xA <- c(27.0,54.6,33.5,27.6,46.0,22.2,44.2,17.3,15.9,32.8)
xB <- c(17.4,20.5,13.9,14.8,27.9,10.6,33.7,15.4,25.0,24.1)
stripchart(list(xA,xB),vertical=T,pch=21,xlim=c(0.5,2.5))
segments(0.9,mean(xA),1.1,mean(xA),lwd=2,col="red")
segments(1.9,mean(xB),2.1,mean(xB),lwd=2,col="blue")
abline(h=mean(c(xA,xB)),lty=2)
t.test(xA,xB)
##
## Welch Two Sample t-test
##
## data: xA and xB
## t = 2.5336, df = 14.224, p-value = 0.02364
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1.822368 21.737632
## sample estimates:
## mean of x mean of y
## 32.11 20.33
wilcox.test(xA,xB)
##
## Wilcoxon rank sum test
##
## data: xA and xB
## W = 78, p-value = 0.03546
## alternative hypothesis: true location shift is not equal to 0
Let’s see how one outlier influences the test statistics!
sort(xA)
## [1] 15.9 17.3 22.2 27.0 27.6 32.8 33.5 44.2 46.0 54.6
sort(xA,decreasing=T)
## [1] 54.6 46.0 44.2 33.5 32.8 27.6 27.0 22.2 17.3 15.9
order(xA)
## [1] 9 8 6 1 4 10 3 7 5 2
order(xA,decreasing=T)
## [1] 2 5 7 3 10 4 1 6 8 9
xAnew <- xA
xAnew[2] <- 99.9
xA
## [1] 27.0 54.6 33.5 27.6 46.0 22.2 44.2 17.3 15.9 32.8
xAnew
## [1] 27.0 99.9 33.5 27.6 46.0 22.2 44.2 17.3 15.9 32.8
stripchart(list(xAnew,xB),vertical=T,pch=21,xlim=c(0.5,2.5))
segments(0.9,mean(xAnew),1.1,mean(xAnew),lwd=2,col="red")
segments(1.9,mean(xB),2.1,mean(xB),lwd=2,col="blue")
abline(h=mean(c(xAnew,xB)),lty=2)
t.test(xA,xB)$p.value
## [1] 0.02364417
t.test(xAnew,xB)$p.value # why is the p-value now > 0.05?
## [1] 0.06869718
wilcox.test(xA,xB)$p.value
## [1] 0.03546299
wilcox.test(xAnew,xB)$p.value # why are the p-values the same?
## [1] 0.03546299
Let’s write a function first, to make things easier. Note that the input are two vectors, generically called xa and xb.
my.perm <- function(xa,xb){
# combine the two vectots
x <- c(xa,xb)
# calculate the total length of observations
n <- length(x)
# calculate the length of the first vector
na <- length(xa)
# shuffle the order of the observations across grups
z <- x[sample(1:n)]
# return the t test statistic from the shuffled data
return(t.test(z[1:na],z[-(1:na)],var.equal=T)$stat)
}
Example:
# Let's try one with our xA and xB
my.perm(xA,xB)
## t
## -0.06278934
# And onther one
my.perm(xA,xB)
## t
## 0.6157433
# And onther one
my.perm(xA,xB)
## t
## -0.06278934
# Let's do 1000 shuffles
nit <- 1000
# Create a vector of length nit (1000)
myres <- rep(NA,nit)
# Fill in the slots of that vector with the permutation test t statistics
for(j in 1:nit) myres[j] <- my.perm(xA,xB)
# Plot the data
hist(myres,breaks=21,col="lightgrey")
# Add the observed test statistic to the plot
tst <- t.test(xA,xB,var.equal=T)$stat
abline(v=tst,col="red")
# Calculate the permutation p-value
mean(abs(myres)>abs(tst))
## [1] 0.029