## Methods in Biostatistics 2 (140.652) ### Goodness of fit #### Example data ```{r} obs <- c(35, 43, 22) # Expected proportions (i.e., the null hypothesis) p0 <- c(0.25, 0.5, 0.25) # Expected counts (note: sum(obs) = total count) exp <- sum(obs) * p0 # LRT statistic lrt <- 2 * sum(obs * log(obs/exp)) lrt # Chi-square statistic chis <- sum( (obs-exp)^2 / exp) chis # P-value for LRT 1 - pchisq(lrt, 3-1) # P-value for chi-square test 1 - pchisq(chis, 3-1) ``` #### The built-in function ```{r} # You can use chisq.test() to do the chi-square test chisq.test(obs, p=c(1/4,1/2,1/4)) ``` #### Likelihood ratio test for goodness of fit I start here with two functions: - lrt / for calculating the LRT goodness-of fit statistic, - chisq / for calculating the chi-square goodness-of-fit statistic. Copy and paste these into R before doing the next bits. ```{r} lrt <- function(x=c(35,43,22), p=c(0.25,0.5,0.25)) 2 * (dmultinom(x, prob=x/sum(x),log=TRUE) - dmultinom(x, prob=p,log=TRUE)) # alternatively, this could have been the following, lrt2 <- function(x=c(35,43,22), p=c(0.25,0.5,0.25)){ ex <- (p*sum(x))[x > 0] x <- x[x>0] 2 * sum(x * log(x / ex) ) } chisq <- function(x=c(35,43,22), p=c(0.25,0.5,0.25)){ # check that x,p are appropriate p <- p/sum(p) # ensure that the probabilities sum to 1 if(any(p < 0)) stop("probabilities p must be non-negative.") if(any(x < 0)) stop("x's must all be non-negative.") if(length(x) != length(p)) stop("length(x) should be the same as length(p).") n <- sum(x) expected <- n*p sum( (x - expected)^2 / expected ) } ``` #### The Main Example ```{r} # The 35:43:22 table x <- c(35, 43, 22) # Note that you might also have started with a vector of 1's, 2's and 3's and used the function "table" instead, as follows: y <- rep(1:3, c(35, 43, 22)) x <- table(y) x # Calculate LRT test statistic to test the 1:2:1 proportions LRT <- lrt(x, p=c(0.25, 0.5, 0.25)) LRT # Calculate chi-square statistic to test the 1:2:1 proportions xsq <- chisq(x, p=c(0.25, 0.5, 0.25)) xsq # You could also use the built-in function chisq.test() chisq.test(x, p=c(0.25, 0.5, 0.25)) # P-value for LRT statistic, using the asymptotic approximation 1 - pchisq(LRT, 2) # P-value for chi-square statistic, using the asymptotic approx'n 1 - pchisq(xsq, 2) # Simulations to est the null dist'n of the LRT and chi-square stats set.seed(45) simdat <- rmultinom(1000, 100, c(0.25, 0.5, 0.25)) results <- cbind(apply(simdat, 2, lrt, p=c(0.25, 0.5, 0.25)), apply(simdat, 2, chisq, p=c(0.25, 0.5, 0.25))) # Estimated p-value for LRT stat mean(results[,1] >= LRT) # Estimated p-value for chi-square stat mean(results[,2] >= xsq) ``` #### Example 1 in the "composite hypotheses" section of the lecture I start with some additional functions: - sim.ex1, for simulating data under the null hypothesis (H0). - mle.ex1, for estimating the allele frequency under H0. - lrt.ex1, calculates LRT statistic for testing this H0. - chisq.ex1, calculates chi-square stat for testing this H0. ```{r} # Simulate data sim.ex1 <- function(n=100, f=0.2){ table(factor(sample(c("AA","AB","BB"), n, repl=TRUE, prob=c(f^2,2*f*(1-f),(1-f)^2)), levels=c("AA","AB","BB"))) } # MLE under H0 mle.ex1 <- function(x=c(5,20,75)) (x[1]+x[2]/2)/sum(x) # LRT statistic lrt.ex1 <- function(x=c(5,20,75)){ mle <- mle.ex1(x) p0 <- c(mle^2,2*mle*(1-mle),(1-mle)^2) 2*(dmultinom(x,prob=x/sum(x),log=TRUE) - dmultinom(x,prob=p0,log=TRUE)) } # Chi-square statistic chisq.ex1 <- function(x=c(5,20,75)){ mle <- mle.ex1(x) expected <- sum(x)*c(mle^2,2*mle*(1-mle),(1-mle)^2) sum((x-expected)^2/expected) } # The example data x <- c(5, 20, 75) # The estimated allele frequency mle <- mle.ex1(x) mle # LRT statistic LRT <- lrt.ex1(x) LRT # Chi-square stat xsq <- chisq.ex1(x) xsq # Asymptotic approximation for the p-value for the LRT statistic 1 - pchisq(LRT, 1) # Asymptotic approximation for the p-value for chi-square statistic 1 - pchisq(xsq, 1) # Simulations to estimate the null distribution of the LRT and chi-square statistics results <- matrix(ncol=2,nrow=1000) set.seed(33) for(i in 1:1000){ mytable <- sim.ex1(100, mle) results[i,1] <- lrt.ex1(mytable) results[i,2] <- chisq.ex1(mytable) } # Estimated p-value for the LRT statistic mean(results[,1] >= LRT) # Estimated p-value for chi-square statistic mean(results[,2] >= xsq) ``` #### Example 2 We can use the EM algorithm to find the MLEs of the allele frequencies. Some functions first. ```{r} estep <- function(n,p){ pAA <- p[2]^2/(p[2]^2+2*p[2]*p[1]) pBB <- p[3]^2/(p[3]^2+2*p[3]*p[1]) ex <- c(n[1], pAA*n[2], (1-pAA)*n[2], pBB*n[3], (1-pBB)*n[3], n[4]) } mstep <- function(ex){ n <- sum(ex) p <- c((ex[1]+(ex[3]+ex[5])/2)/n, (ex[2]+(ex[3]+ex[6])/2)/n, (ex[4]+(ex[5]+ex[6])/2)/n) p } em <- function(n, tol=1e-5, maxit=1000, verbose=TRUE){ curp <- c(n[1], n[2]+n[4]/2, n[3]+n[4]/2)/sum(n) curll <- llik(n,curp) if(verbose) cat(c(0,curp,curll),"\n") for(i in 1:maxit) { ex <- estep(n,curp) newp <- mstep(ex) if(max(abs(newp-curp))0)) return(-Inf) sum(n*log(pp)) } ``` Get te MLEs. ```{r} n <- c("O"=104, "A"=91, "B"=36, "AB"=19) mle <- em(n,verbose=FALSE) mle ``` Plot the data. ```{r} p <- seq(0,1,length=251) thellik <- matrix(ncol=length(p),nrow=length(p)) for(i in 1:length(p)) { for(j in which(p[i]+p<=1)) { thellik[i,j] <- llik(n,c(p[i],p[j],1-p[i]-p[j])) if(p[i]+p[j]<0 || p[i]+p[j]>1) cat(i,j,p[i],p[j],1-p[i]-p[j],"\n") } } par(pty="s",las=1,mar=c(4.1,4.1,0.6,0.6)) image(p,p,thellik,col=rev(rainbow(start=0,end=2/3,n=251)), xlab=expression(p[O]),ylab=expression(p[A]),main="") contour(p,p,thellik,add=TRUE,levels=seq(-400,-2000,by= -100)) contour(p,p,thellik,add=TRUE,levels=seq(-400,-250,by=25)[-1],col="blue") points(mle[1],mle[2],lwd=2,pch=4) ``` An alternative optimization algorithm. ```{r} dat <- c(104,91,36,19) simple <- function(dat){ fr <- dat/sum(dat) po <- sqrt(fr[1]) pa <- sqrt(po^2+fr[2])-po pb <- fr[4]/pa c(O=po,A=pa,B=pb) } loglik <- function(p,dat){ p <- c(p,1-p[1]-p[2]) ll <- -(dat[1]*log(p[1]^2)+dat[2]*log(p[2]^2+2*p[2]*p[1]) + dat[3]*log(p[3]^2+2*p[3]*p[1]) + dat[4]*log(2*p[3]*p[2])) } mles <- function(dat){ fs <- optim(p=simple(dat)[-3],loglik,dat=dat)$par fs <- c(fs,1-sum(fs)) names(fs) <- c("O","A","B") return(fs) } mles(dat) ``` #### Example 3 regarding fit of Poisson counts ```{r} # The raw data x <- c(2, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0, 0, 3, 0, 0, 0, 3, 0, 5, 0, 1, 0, 1, 2, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0) # Counts ob <- table(x) ob # Sample mean mu <- mean(x) mu # Expected counts; binning 5+ together ex <- c(dpois(0:4, mu), 1-ppois(4,mu)) * length(x) ex # Chi-square and LRT statistics chis <- sum((ob-ex)^2/ex) lrt <- sum(2*ob*log(ob/ex)) # P-values via asymptotic approximation 1-pchisq(chis, length(ob)-2) 1-pchisq(lrt, length(ob)-2) # P-value based on simulations n.sim <- 10000 set.seed(22) simd <- apply(matrix(rpois(n.sim*length(x), mu), ncol=n.sim),2, function(x) { x <-table(factor(x,levels=0:100)); x[6] <- sum(x[-(1:5)]); x[1:6] }) # Function to calculate the likelihood ratio test statistic lrtt <- function(a,b){ b <- b[a>0] a <- a[a>0] sum(2*a*log(a/b)) } # Calculate the statistics simlrt <- apply(simd, 2, lrtt, ex) simchisq <- apply(simd, 2, function(a,b) sum((a-b)^2/b), ex) # The p-values mean(simchisq >= sum((ob-ex)^2/ex)) mean(simlrt >= sum(2*ob*log(ob/ex))) # Plot the permutation null distributions hist(simchisq,col="lightgrey",breaks=51) abline(v=sum((ob-ex)^2/ex),col="red",lwd=2) hist(simlrt,col="lightgrey",breaks=51) abline(v=sum(2*ob*log(ob/ex)),col="red",lwd=2) ``` ### Contingency tables #### Example 1 ```{r} x <- rbind(c(18, 2), c(11, 9)) x # Marginal totals rs <- apply(x, 1, sum) # sum of each row rs cs <- apply(x, 2, sum) # sum of each column cs n <- sum(x) n # Expected counts expected <- outer(rs,cs,"*")/n expected # Chi-square statistic chi <- sum((x-expected)^2/expected) chi # P-value 1-pchisq(chi,1) # Lrt statistic lrt <- 2*sum(x * log(x/expected)) # P-value 1-pchisq(lrt,1) # Mind the correction factor! chisq.test(x) chisq.test(x,correct=FALSE) # same as above ``` #### Example 2 ```{r} x <- rbind(c(9,9), c(20, 62)) x # Marginal totals rs <- apply(x, 1, sum) # sum of each row rs cs <- apply(x, 2, sum) # sum of each column cs n <- sum(x) n # Expected counts expected <- outer(rs,cs,"*")/n expected # Chi-square statistic chi <- sum((x-expected)^2/expected) chi # P-value 1-pchisq(chi,1) # Lrt statistic lrt <- 2*sum(x * log(x/expected)) lrt # P-value 1-pchisq(lrt,1) ``` #### Blood groups by state ```{r} # The data x <- rbind(c(122, 117, 19, 244), c(1781, 1351, 288,3301), c(353, 269, 60, 713)) x # Chi-square test for independence chi <- chisq.test(x) chi # Expected counts ex <- chi$expected ex # A direct method to get the expected counts rs <- apply(x,1,sum) # row sums rs cs <- apply(x,2,sum) # col sums cs n <- sum(rs) # total sample size n ex <- outer(rs,cs,"*")/n # expected counts under independence ex # Lrt statistic lrt <- 2*sum(x * log(x/ex)) lrt # Degrees of freedom df <- prod(dim(x)-1) df # Another way to get the degrees of freedom df <- (ncol(x) - 1) * (nrow(x) - 1) df # P-value 1-pchisq(lrt, df) ``` ### End of code