## Methods in Biostatistics 2 (140.652)

### Goodness of fit

#### Example data

```{r}
obs <- c(35, 43, 22)

# Expected proportions (i.e., the null hypothesis)
p0 <- c(0.25, 0.5, 0.25)

# Expected counts (note: sum(obs) = total count)
exp <- sum(obs) * p0

# LRT statistic 
lrt <- 2 * sum(obs * log(obs/exp)) 
lrt

# Chi-square statistic 
chis <- sum( (obs-exp)^2 / exp)
chis

# P-value for LRT 
1 - pchisq(lrt, 3-1)

# P-value for chi-square test 
1 - pchisq(chis, 3-1)
```

#### The built-in function

```{r}
# You can use chisq.test() to do the chi-square test
chisq.test(obs, p=c(1/4,1/2,1/4))
```

#### Likelihood ratio test for goodness of fit

I start here with two functions:

- lrt / for calculating the LRT goodness-of fit statistic,
- chisq / for calculating the chi-square goodness-of-fit statistic.

Copy and paste these into R before doing the next bits.

```{r}
lrt <- function(x=c(35,43,22), p=c(0.25,0.5,0.25))
  2 * (dmultinom(x, prob=x/sum(x),log=TRUE) - dmultinom(x, prob=p,log=TRUE))

# alternatively, this could have been the following,

lrt2 <- function(x=c(35,43,22), p=c(0.25,0.5,0.25)){
  ex <- (p*sum(x))[x > 0]
  x <- x[x>0]
  2 * sum(x * log(x / ex) )
}

chisq <- function(x=c(35,43,22), p=c(0.25,0.5,0.25)){
  # check that x,p are appropriate
  p <- p/sum(p) # ensure that the probabilities sum to 1
  if(any(p < 0)) stop("probabilities p must be non-negative.")
  if(any(x < 0)) stop("x's must all be non-negative.")
  if(length(x) != length(p)) stop("length(x) should be the same as length(p).")
  n <- sum(x)
  expected <- n*p
  sum( (x - expected)^2 / expected )
}
```

#### The Main Example

```{r}
# The 35:43:22 table
x <- c(35, 43, 22)

# Note that you might also have started with a vector of 1's, 2's and 3's and used the function "table" instead, as follows:
y <- rep(1:3, c(35, 43, 22))
x <- table(y)
x

# Calculate LRT test statistic to test the 1:2:1 proportions
LRT <- lrt(x, p=c(0.25, 0.5, 0.25)) 
LRT

# Calculate chi-square statistic to test the 1:2:1 proportions
xsq <- chisq(x, p=c(0.25, 0.5, 0.25))
xsq

# You could also use the built-in function chisq.test()
chisq.test(x, p=c(0.25, 0.5, 0.25))

# P-value for LRT statistic, using the asymptotic approximation
1 - pchisq(LRT, 2)

# P-value for chi-square statistic, using the asymptotic approx'n
1 - pchisq(xsq, 2)

# Simulations to est the null dist'n of the LRT and chi-square stats
set.seed(45)
simdat <- rmultinom(1000, 100, c(0.25, 0.5, 0.25))
results <- cbind(apply(simdat, 2, lrt, p=c(0.25, 0.5, 0.25)),
                 apply(simdat, 2, chisq, p=c(0.25, 0.5, 0.25)))

# Estimated p-value for LRT stat
mean(results[,1] >= LRT)

# Estimated p-value for chi-square stat
mean(results[,2] >= xsq)
```

#### Example 1 in the "composite hypotheses" section of the lecture

I start with some additional functions:

- sim.ex1,   for simulating data under the null hypothesis (H0).
- mle.ex1,   for estimating the allele frequency under H0.
- lrt.ex1,   calculates LRT statistic for testing this H0.
- chisq.ex1, calculates chi-square stat for testing this H0.

```{r}
# Simulate data
sim.ex1 <- function(n=100, f=0.2){
  table(factor(sample(c("AA","AB","BB"), n, repl=TRUE, 
                      prob=c(f^2,2*f*(1-f),(1-f)^2)), 
               levels=c("AA","AB","BB")))
  }

# MLE under H0
mle.ex1 <- function(x=c(5,20,75)) (x[1]+x[2]/2)/sum(x)
  
# LRT statistic
lrt.ex1 <- function(x=c(5,20,75)){
  mle <- mle.ex1(x)
  p0 <- c(mle^2,2*mle*(1-mle),(1-mle)^2)
  2*(dmultinom(x,prob=x/sum(x),log=TRUE) - dmultinom(x,prob=p0,log=TRUE))
}

# Chi-square statistic
chisq.ex1 <- function(x=c(5,20,75)){
  mle <- mle.ex1(x)
  expected <- sum(x)*c(mle^2,2*mle*(1-mle),(1-mle)^2)
  sum((x-expected)^2/expected)
}

# The example data
x <- c(5, 20, 75)

# The estimated allele frequency
mle <- mle.ex1(x)
mle

# LRT statistic
LRT <- lrt.ex1(x)
LRT

# Chi-square stat
xsq <- chisq.ex1(x)
xsq

# Asymptotic approximation for the p-value for the LRT statistic
1 - pchisq(LRT, 1)

# Asymptotic approximation for the p-value for chi-square statistic
1 - pchisq(xsq, 1)

# Simulations to estimate the null distribution of the LRT and chi-square statistics
results <- matrix(ncol=2,nrow=1000)
set.seed(33)
for(i in 1:1000){
  mytable <- sim.ex1(100, mle)
  results[i,1] <- lrt.ex1(mytable)
  results[i,2] <- chisq.ex1(mytable)
}

# Estimated p-value for the LRT statistic
mean(results[,1] >= LRT)

# Estimated p-value for chi-square statistic
mean(results[,2] >= xsq)
```

#### Example 2 

We can use the EM algorithm to find the MLEs of the allele frequencies. Some functions first.

```{r}
estep <- function(n,p){
  pAA <- p[2]^2/(p[2]^2+2*p[2]*p[1])
  pBB <- p[3]^2/(p[3]^2+2*p[3]*p[1])
  ex <- c(n[1], pAA*n[2], (1-pAA)*n[2],
          pBB*n[3], (1-pBB)*n[3],
          n[4])
}

mstep <- function(ex){
  n <- sum(ex)
  p <- c((ex[1]+(ex[3]+ex[5])/2)/n,
         (ex[2]+(ex[3]+ex[6])/2)/n,
         (ex[4]+(ex[5]+ex[6])/2)/n)
  p
}

em <- function(n, tol=1e-5, maxit=1000, verbose=TRUE){
  curp <- c(n[1], n[2]+n[4]/2,
            n[3]+n[4]/2)/sum(n)
  curll <- llik(n,curp)
  if(verbose) cat(c(0,curp,curll),"\n")
  
  for(i in 1:maxit) {
    ex <- estep(n,curp)
    newp <- mstep(ex)
    if(max(abs(newp-curp))<tol) break
    newll <- llik(n,newp)
    if(verbose) cat(c(i,newp,newll,newll-curll),"\n")
    curp <- newp
    curll <- newll
  }
  newp
}

llik <- function(n, p,tol=1e-13){
  pp <- c(p[1]^2,
          p[2]^2+2*p[2]*p[1],
          p[3]^2+2*p[3]*p[1],
          2*p[2]*p[3])

  if(any(pp<tol & n>0)) return(-Inf)
  sum(n*log(pp))
}
```

Get te MLEs.

```{r}
n <- c("O"=104, "A"=91, "B"=36, "AB"=19)
mle <- em(n,verbose=FALSE)
mle
```

Plot the data.

```{r}
p <- seq(0,1,length=251)
thellik <- matrix(ncol=length(p),nrow=length(p))
for(i in 1:length(p)) {
  for(j in which(p[i]+p<=1)) {
    thellik[i,j] <- llik(n,c(p[i],p[j],1-p[i]-p[j]))
    if(p[i]+p[j]<0 || p[i]+p[j]>1)
      cat(i,j,p[i],p[j],1-p[i]-p[j],"\n")
  }
}

par(pty="s",las=1,mar=c(4.1,4.1,0.6,0.6))
image(p,p,thellik,col=rev(rainbow(start=0,end=2/3,n=251)),
      xlab=expression(p[O]),ylab=expression(p[A]),main="")
contour(p,p,thellik,add=TRUE,levels=seq(-400,-2000,by= -100))
contour(p,p,thellik,add=TRUE,levels=seq(-400,-250,by=25)[-1],col="blue")
points(mle[1],mle[2],lwd=2,pch=4)
```

An alternative optimization algorithm.

```{r}
dat <- c(104,91,36,19)

simple <- function(dat){
  fr <- dat/sum(dat)
  po <- sqrt(fr[1])
  pa <- sqrt(po^2+fr[2])-po
  pb <- fr[4]/pa
  c(O=po,A=pa,B=pb)
}

loglik <- function(p,dat){
  p <- c(p,1-p[1]-p[2]) 
  ll <- -(dat[1]*log(p[1]^2)+dat[2]*log(p[2]^2+2*p[2]*p[1]) + 
            dat[3]*log(p[3]^2+2*p[3]*p[1]) + 
            dat[4]*log(2*p[3]*p[2]))
}

mles <- function(dat){
  fs <- optim(p=simple(dat)[-3],loglik,dat=dat)$par
  fs <- c(fs,1-sum(fs))
  names(fs) <- c("O","A","B")
  return(fs)
}

mles(dat)
```

#### Example 3 regarding fit of Poisson counts

```{r}
# The raw data
x <- c(2, 2, 0, 0, 0,   0, 0, 1, 0, 0,
       0, 0, 4, 0, 0,   3, 0, 0, 0, 3,
       0, 5, 0, 1, 0,   1, 2, 0, 2, 1,
       0, 0, 0, 0, 0,   0, 0, 0)

# Counts
ob <- table(x)
ob

# Sample mean
mu <- mean(x)
mu

# Expected counts; binning 5+ together
ex <- c(dpois(0:4, mu), 1-ppois(4,mu)) * length(x)
ex

# Chi-square and LRT statistics
chis <- sum((ob-ex)^2/ex)
lrt <- sum(2*ob*log(ob/ex))

# P-values via asymptotic approximation
1-pchisq(chis, length(ob)-2)
1-pchisq(lrt, length(ob)-2)

# P-value based on simulations
n.sim <- 10000
set.seed(22)
simd <- apply(matrix(rpois(n.sim*length(x), mu), ncol=n.sim),2,
              function(x) { x <-table(factor(x,levels=0:100)); x[6] <- sum(x[-(1:5)]); x[1:6] })

# Function to calculate the likelihood ratio test statistic
lrtt <- function(a,b){
    b <- b[a>0]
    a <- a[a>0]
    sum(2*a*log(a/b))
  }

# Calculate the statistics
simlrt <- apply(simd, 2, lrtt, ex)
simchisq <- apply(simd, 2, function(a,b) sum((a-b)^2/b), ex)

# The p-values
mean(simchisq >= sum((ob-ex)^2/ex))
mean(simlrt >= sum(2*ob*log(ob/ex)))

# Plot the permutation null distributions
hist(simchisq,col="lightgrey",breaks=51)
abline(v=sum((ob-ex)^2/ex),col="red",lwd=2)
hist(simlrt,col="lightgrey",breaks=51)
abline(v=sum(2*ob*log(ob/ex)),col="red",lwd=2)
```


### Contingency tables

#### Example 1

```{r}
x <- rbind(c(18, 2), c(11, 9))
x

# Marginal totals
rs <- apply(x, 1, sum) # sum of each row
rs
cs <- apply(x, 2, sum) # sum of each column
cs
n  <- sum(x)
n

# Expected counts
expected <- outer(rs,cs,"*")/n
expected

# Chi-square statistic
chi <- sum((x-expected)^2/expected) 
chi
# P-value
1-pchisq(chi,1) 

# Lrt statistic
lrt <- 2*sum(x * log(x/expected)) 
# P-value
1-pchisq(lrt,1)

# Mind the correction factor!
chisq.test(x)
chisq.test(x,correct=FALSE) # same as above
```

#### Example 2

```{r}
x <- rbind(c(9,9), c(20, 62))
x

# Marginal totals
rs <- apply(x, 1, sum) # sum of each row
rs
cs <- apply(x, 2, sum) # sum of each column
cs
n  <- sum(x)
n

# Expected counts
expected <- outer(rs,cs,"*")/n
expected

# Chi-square statistic
chi <- sum((x-expected)^2/expected) 
chi
# P-value
1-pchisq(chi,1) 

# Lrt statistic
lrt <- 2*sum(x * log(x/expected)) 
lrt
# P-value
1-pchisq(lrt,1)
```

#### Blood groups by state

```{r}
# The data
x <- rbind(c(122, 117, 19, 244),
           c(1781, 1351, 288,3301),
           c(353, 269, 60, 713))
x

# Chi-square test for independence
chi <- chisq.test(x)
chi

# Expected counts
ex <- chi$expected
ex

# A direct method to get the expected counts
rs <- apply(x,1,sum) # row sums
rs
cs <- apply(x,2,sum) # col sums
cs
n <- sum(rs) # total sample size
n
ex <- outer(rs,cs,"*")/n # expected counts under independence
ex

# Lrt statistic
lrt <- 2*sum(x * log(x/ex)) 
lrt

# Degrees of freedom
df <- prod(dim(x)-1)  
df

# Another way to get the degrees of freedom
df <- (ncol(x) - 1) * (nrow(x) - 1)
df

# P-value
1-pchisq(lrt, df)
```

### End of code