############################################################
############################################################
#  DATA ANALYSIS OF THE SITKA TREES DATA
#  SPLUS FUNCTIONS WRITTEN BY RAFAEL IRIZARRY & FRANCESCA DOMINICI
#  - plotting data
#  - analysis of variance
#  - scatterplot matrix of residual
#  - autocorrelation function
#  - variogram
#  - ordinary least squares
#  - weighted least square
# - testing hypothesis   example pag 72 DLZ
###############################################
#########################################################
#read the data frame.  The as.is = T option prevents the days,
# chamber, ozone, year and tree variables from being converted into
# factors.  
sitka _ read.table("/home2/biostats/fdominic/teaching/LDA/datasets/sitka.data", header = T, as.is = T)
######################################################################
# For the trellis plot
postscript("trellisplot.ps")
compav _ function(x, y, ...)
{
        list(x = sort(unique(x)), y = unlist(lapply(split(y, x), mean)))
}


xyplot(logsize ~ days | chamber*year, data = sitka,
       panel = function(x,y,...){
	 plot(x,y,pch=1,...)
         lines(compav(x, y))})
dev.off()
######################################################################
# for the scatterplot of the residuals from a cursory regression, 1988
# data
# Fit an analysis of variance model
fit88 _ aov(logsize ~ as.factor(days) * as.factor(ozone),
            data = sitka[sitka$year == 88,])

residmatrix88 _ matrix(unlist(split(fit88$res, sitka$days[sitka$year ==
           88])), ncol = 5, byrow = T, dimnames = list(NULL,
           paste("Day", as.character(sitka$days[1:5]))))

pairs(residmatrix88)
cor(residmatrix88)
autocorr _ NULL
for (i in 1:4){ autocorr[i] _ cor(residmatrix88[1:((5-i)*79)],
                                  residmatrix88[-c(1:(i*79))])}
autocorr
######################################################################
# for the variogram
# compute the pairwise differences of the residuals within each tree;
# there are (4+3+2+1 = 10) pairs for each tree.

pairdiffs _ apply(residmatrix88, 1, function(x){
 xo _ outer(x, x, '-')
 xo[col(xo) > row(xo)]}
 )

# pairdiffs is a 10 by 79 matrix.

vs _ pairdiffs^2 / 2

# The corresponding us are the pairwise differences of the days
us _ outer(sitka$days[1:5], sitka$days[1:5], '-')
us _ us[col(us) > row(us)]
us _ -us # I'd rather have positive differences here
# plot the sample variogram values
plot(rep(us, 79), vs, xlab = "lag in days", ylab = "Variogram")
# add the average curve
vs.av _ apply(vs, 1, mean)
lines(sort(us), vs.av[order(us)], type = "b", pch = 1)
# one estimate of the process variance, assuming stationarity
abline(h = sum(fit88$resid^2)/fit88$df.residual, lty = 4)
#########################################
# Maximum Likelihood estimation
#
###############################################################
#assume saturated model, ignoring chamber:
fit88 _ aov(logsize ~ as.factor(ozone)*as.factor(days), data =
            sitka[sitka$year == 88,])

# extract the model matrix
X.88 _ model.matrix(fit88)

# v0 is a vector of length 14 of the [2:5,1], [2:5,2], [3:5,3],
# [4:5,4], [5,5] entries of the matrix V0. The [1,1] entry is
# constrained to be one.

makeV0 _ function(v0)
  {
#  take v0 to be the entries of the Cholesky factor for V0
    dim.V0 _ sqrt(8*length(v0) + 9)/2 - 1/2
#    V0.mat _ matrix(0, ncol = dim.V0, nrow = dim.V0)
#    V0.mat[row(V0.mat) >= col(V0.mat)] _ c(1.0, v0)
#    V0.mat _ V0.mat + t(V0.mat) - diag(diag(V0.mat))
#    V0.mat
    C.mat _ diag(dim.V0)
    C.mat[row(C.mat) >= col(C.mat)] _ c(1.0, v0)
    C.mat %*% t(C.mat)
  }

blockdiag.indices _ function(nsubjects, ntimes)
  {
    indices _ rep(0:(ntimes*nsubjects-1), rep(ntimes, ntimes*nsubjects))
    indices _ indices*ntimes*nsubjects
    d _ rep(1:nsubjects, rep(ntimes^2, nsubjects))
    e _ (d-1)*ntimes + (1:ntimes)
    indices + e
  }

betahat.RSS.sigma2 _ function(v0, X = X.88, y =
  sitka$logsize[sitka$year == 88]) 
  {
    V0 _ makeV0(v0)
    ntimes _ dim(V0)[1]
    V0inv _ solve(V0)
    nsubjects _ length(y)/ntimes  # assume balance here!!!
    Vinv _ diag(length(y))

    indices _ blockdiag.indices(nsubjects, ntimes)
    
    Vinv[indices] _ V0inv

    betahat _ solve(t(X) %*% Vinv %*% X) %*% t(X) %*% Vinv %*% y
    resids _ y - X %*% betahat
    RSS _ t(resids) %*% Vinv %*% resids
    list(betahat = betahat, RSS = RSS, sigma2 = RSS/(nsubjects*ntimes))
  }

Lr _ function(v0, X = X.88, y = sitka$logsize[sitka$year == 88])
  {
    RSS _ betahat.RSS.sigma2(v0, X, y)$RSS
    V0 _ makeV0(v0)
    ntimes _ dim(V0)[1]
    nsubjects _ length(y)/ntimes
    det _ prod(eigen(V0, symmetric = T)$values)
    -0.5*nsubjects*(ntimes*log(RSS) + log(det))
  }
#
# minimize the log likelihood Lr

nlmin(function(arg){-Lr(arg)}, x = rnorm(14,.2, .1), print.level=2,
      max.fcal = 50)

nlmin(function(arg){-Lr(arg)}, x =
      v0hat.88[c(2:5,7:10,13:15,19:20,25)],
      print.level=2,
      max.fcal = 50, max.iter = 50)

nlmin(function(arg){-Lr(arg)},
      x = initguess,
      print.level=2,
      max.fcal = 75, max.iter = 50)

######################################################################
# Try a simulation to see if this works:
sim.ml _ function(nsubjects = 10, ntimes = 3,
                  X = matrix(c(rep(1, nsubjects*ntimes),
                    rep(1:ntimes, nsubjects),
                    rep((1:ntimes)^2, nsubjects)), ncol = 3),
                    beta = c(20, 10, -5),
                  sigma2 = 1,
                  v0 = c(1/3, -1/6, 2/3, 1/6, 5/3))
  {
    V0 _ makeV0(v0)
    V _ diag(ntimes*nsubjects)
    indices _ blockdiag.indices(nsubjects, ntimes)
    V[indices] _ V0
    C _ chol(sigma2*V)
    errors _ C %*% rnorm(ntimes*nsubjects)
    y _ X %*% beta + errors
    data.frame(y = y, X = X, subject = rep(1:nsubjects, rep(ntimes,
                               nsubjects)))
  }

junk _ sim.ml()

resid _ lm(junk$y ~ -1 + as.matrix(junk[,2:4]))$resid
resid.covar _ var(matrix(resid, ncol = 3, byrow = T))
initguess _ resid.covar[c(2,3,5,6,9)]/resid.covar[1,1]

nlmin(function(arg){-Lr(arg, X = as.matrix(junk[,2:4]), y = junk$y)},
      x = initguess, print.level = 2)

nlmin(function(arg){-Lr(arg, X = as.matrix(junk[,2:4]), y = junk$y)},
      x = c(1/3, -1/6, 2/3, 1/6, 5/3), print.level = 2, max.iter = 50,
      max.fcal = 50)

######################################################################
lmefit.88 _ lme(fixed = logsize ~ as.factor(days)*as.factor(ozone),
                random = ~1,
                cluster = ~tree,
                method = "ml",
                data = sitka[sitka$year == 88,],
                control = list(verbose = T)
                )
## FIGURE 4.2 observed mean response in each of the four chambers
TREES <- matrix(scan("/home2/biostats/fdominic/teaching/LDA/datasets/trees.data"),byrow=T,ncol=13)
TIMES <- TREES[1,]
TREES <- TREES[-1,]
dimnames(TREES)[[2]] <- list(as.character(TIMES))
b <- c(1,55)
e <- c(54,79)
titles <- c("Ozone","Control")
avgs <- matrix(0,2,13)
sds <- matrix(0,2,13)
for(i in 1:2) {
  avgs[i,] <- t(apply(TREES[b[i]:e[i],],2,mean))
  sds[i,] <- t(apply(TREES[b[i]:e[i],],2,var))}
postscript("Fig4.2.ps")
plot(TIMES,avgs[1,],ylim=range(c(avgs-2*sds/sqrt(79),avgs+2*sds/sqrt(79))),ylab="Log-Size",type="n",main="Figure 16: Pooled Averages + 2SEs") 
for(i in 1:2){
apply(TREES[b[i]:e[i],],1,function(y) {
      lines(TIMES[1:5],avgs[i,1:5],pch=i+1,type="b")
      lines(TIMES[1:5],avgs[i,1:5]+sds[i,1:5]*2/sqrt(79),lty=i)
      lines(TIMES[1:5],avgs[i,1:5]-sds[i,1:5]*2/sqrt(79),lty=i)
      lines(TIMES[6:13],avgs[i,6:13],pch=i+1,type="b")
      lines(TIMES[6:13],avgs[i,6:13]+sds[i,6:13]*2/sqrt(79),lty=i) 
      lines(TIMES[6:13],avgs[i,6:13]-sds[i,6:13]*2/sqrt(79),lty=i)
     })}
legend(160,6.5,legend=c("Ozone","Control"),lty=c(1:2),marks=c(2,3))
dev.off()
###General linear models for correlated data
## Example 4.1 pag 73 DLZ
M <- dim(TREES)[1]
b <- c(1,28,55,67)
e <- c(27,54,66,79)
titles <- c("Ozone 1","Ozone 2","Control 1","Control 2")
avgs <- matrix(0,4,13)
sds <- matrix(0,4,13)
for(i in 1:4) {
  avgs[i,] <- t(apply(TREES[b[i]:e[i],],2,mean))}
###RESIDUAL PAIR PLOT
Residuals <- array(0,dim=dim(TREES))
for(i in 1:4) 
  Residuals[b[i]:e[i],] <- sweep(TREES[b[i]:e[i],],2,avgs[i,])

i <- 1

#ESTIMATE OF COVARIANCE MATRIX
##LIKE BOOK: 

###DO THE CHAMBERS HAVE EFFECT
V1 <- round(var(Residuals[,1:5])*(M-1)/(M-4),digit=3) #Division is to be like book
V2 <- round(var(Residuals[,6:13])*(M-1)/(M-4),digit=3) #Division is to be like book

cor(Residuals[,1:5])*(M-1)/(M-4)/V1
cor(Residuals[,6:13])*(M-1)/(M-4)/V2


V <- kronecker(diag(1,79),V1)
Y <- as.vector(t(TREES[,1:5]))
X <- matrix(0,M*5,7)
for(i in 1:5) X[seq(i,M*5,5),i] <- 1 #this one is the time effect
X[(e[2]*5+1):(e[4]*5),6] <- 1 #this is the difference for ozone
X[(e[2]*5+1):(e[4]*5),7] <- rep(TIMES[1:5]/100,length(b[3]:e[4]))
####OLS  
Beta1 <- solve(t(X)%*%X)%*%t(X)%*%Y
RW <- solve(t(X)%*%X)%*%t(X)%*%V%*%X%*%solve(t(X)%*%X)
SEs1 <- sqrt(diag(RW))
WrongSE <- sqrt(diag(solve(t(X)%*%X))*var(Y-X%*%Beta1))
print("Table 4.3")
print(rbind(signif(t(Beta1),4),signif(SEs1,2)))
###WLS 
Beta <- solve(t(X)%*%solve(V)%*%X)%*%t(X)%*%solve(V)%*%Y
RW <- solve(t(X)%*%solve(V)%*%X)
SEs <- sqrt(diag(RW))
rbind(signif(t(Beta),4),signif(SEs,2))

##TESTS:
Q <- matrix(c(0,0,0,0,0,0,0,0,0,0,1,0,0,1),nrow=2)
Chi <- t(Beta)%*%t(Q)%*%solve(Q%*%RW%*%t(Q))%*%Q%*%Beta

##NOW FOR THE SECOND YEAR
V <- kronecker(diag(1,79),V2)

Y <- as.vector(t(TREES[,6:13]))
X <- matrix(0,M*8,9)
for(i in 1:8) X[seq(i,M*8,8),i] <- 1 #this one is the time effect
X[(e[2]*8+1):(e[4]*8),9] <- 1 #this is the difference for ozone
  
Beta2 <- solve(t(X)%*%X)%*%t(X)%*%Y
RW <- solve(t(X)%*%X)%*%t(X)%*%V%*%X%*%solve(t(X)%*%X)
WrongSE <- sqrt(diag(solve(t(X)%*%X))*var(Y-X%*%Beta2))
SEs2 <- sqrt(diag(RW))
rbind(signif(t(Beta2),4),signif(SEs2,2))
###Plotting results
postscript("Fig4.4.ps")
par(mfrow=c(2,2))
plot(TIMES[1:5],Beta1[1:5],type="b",ylim=range(c(Beta1[1:5]+2*SEs1[1:5],Beta1[1:5]-2*SEs1[1:5])),ylab="Log Size",xlab="Time",main="First Season")
lines(TIMES[1:5],Beta1[1:5]+2*SEs1[1:5],lty=2)