############################################################ ############################################################ # DATA ANALYSIS OF THE SITKA TREES DATA # SPLUS FUNCTIONS WRITTEN BY RAFAEL IRIZARRY & FRANCESCA DOMINICI # - plotting data # - analysis of variance # - scatterplot matrix of residual # - autocorrelation function # - variogram # - ordinary least squares # - weighted least square # - testing hypothesis example pag 72 DLZ ############################################### ######################################################### #read the data frame. The as.is = T option prevents the days, # chamber, ozone, year and tree variables from being converted into # factors. sitka _ read.table("/home2/biostats/fdominic/teaching/LDA/datasets/sitka.data", header = T, as.is = T) ###################################################################### # For the trellis plot postscript("trellisplot.ps") compav _ function(x, y, ...) { list(x = sort(unique(x)), y = unlist(lapply(split(y, x), mean))) } xyplot(logsize ~ days | chamber*year, data = sitka, panel = function(x,y,...){ plot(x,y,pch=1,...) lines(compav(x, y))}) dev.off() ###################################################################### # for the scatterplot of the residuals from a cursory regression, 1988 # data # Fit an analysis of variance model fit88 _ aov(logsize ~ as.factor(days) * as.factor(ozone), data = sitka[sitka$year == 88,]) residmatrix88 _ matrix(unlist(split(fit88$res, sitka$days[sitka$year == 88])), ncol = 5, byrow = T, dimnames = list(NULL, paste("Day", as.character(sitka$days[1:5])))) pairs(residmatrix88) cor(residmatrix88) autocorr _ NULL for (i in 1:4){ autocorr[i] _ cor(residmatrix88[1:((5-i)*79)], residmatrix88[-c(1:(i*79))])} autocorr ###################################################################### # for the variogram # compute the pairwise differences of the residuals within each tree; # there are (4+3+2+1 = 10) pairs for each tree. pairdiffs _ apply(residmatrix88, 1, function(x){ xo _ outer(x, x, '-') xo[col(xo) > row(xo)]} ) # pairdiffs is a 10 by 79 matrix. vs _ pairdiffs^2 / 2 # The corresponding us are the pairwise differences of the days us _ outer(sitka$days[1:5], sitka$days[1:5], '-') us _ us[col(us) > row(us)] us _ -us # I'd rather have positive differences here # plot the sample variogram values plot(rep(us, 79), vs, xlab = "lag in days", ylab = "Variogram") # add the average curve vs.av _ apply(vs, 1, mean) lines(sort(us), vs.av[order(us)], type = "b", pch = 1) # one estimate of the process variance, assuming stationarity abline(h = sum(fit88$resid^2)/fit88$df.residual, lty = 4) ######################################### # Maximum Likelihood estimation # ############################################################### #assume saturated model, ignoring chamber: fit88 _ aov(logsize ~ as.factor(ozone)*as.factor(days), data = sitka[sitka$year == 88,]) # extract the model matrix X.88 _ model.matrix(fit88) # v0 is a vector of length 14 of the [2:5,1], [2:5,2], [3:5,3], # [4:5,4], [5,5] entries of the matrix V0. The [1,1] entry is # constrained to be one. makeV0 _ function(v0) { # take v0 to be the entries of the Cholesky factor for V0 dim.V0 _ sqrt(8*length(v0) + 9)/2 - 1/2 # V0.mat _ matrix(0, ncol = dim.V0, nrow = dim.V0) # V0.mat[row(V0.mat) >= col(V0.mat)] _ c(1.0, v0) # V0.mat _ V0.mat + t(V0.mat) - diag(diag(V0.mat)) # V0.mat C.mat _ diag(dim.V0) C.mat[row(C.mat) >= col(C.mat)] _ c(1.0, v0) C.mat %*% t(C.mat) } blockdiag.indices _ function(nsubjects, ntimes) { indices _ rep(0:(ntimes*nsubjects-1), rep(ntimes, ntimes*nsubjects)) indices _ indices*ntimes*nsubjects d _ rep(1:nsubjects, rep(ntimes^2, nsubjects)) e _ (d-1)*ntimes + (1:ntimes) indices + e } betahat.RSS.sigma2 _ function(v0, X = X.88, y = sitka$logsize[sitka$year == 88]) { V0 _ makeV0(v0) ntimes _ dim(V0)[1] V0inv _ solve(V0) nsubjects _ length(y)/ntimes # assume balance here!!! Vinv _ diag(length(y)) indices _ blockdiag.indices(nsubjects, ntimes) Vinv[indices] _ V0inv betahat _ solve(t(X) %*% Vinv %*% X) %*% t(X) %*% Vinv %*% y resids _ y - X %*% betahat RSS _ t(resids) %*% Vinv %*% resids list(betahat = betahat, RSS = RSS, sigma2 = RSS/(nsubjects*ntimes)) } Lr _ function(v0, X = X.88, y = sitka$logsize[sitka$year == 88]) { RSS _ betahat.RSS.sigma2(v0, X, y)$RSS V0 _ makeV0(v0) ntimes _ dim(V0)[1] nsubjects _ length(y)/ntimes det _ prod(eigen(V0, symmetric = T)$values) -0.5*nsubjects*(ntimes*log(RSS) + log(det)) } # # minimize the log likelihood Lr nlmin(function(arg){-Lr(arg)}, x = rnorm(14,.2, .1), print.level=2, max.fcal = 50) nlmin(function(arg){-Lr(arg)}, x = v0hat.88[c(2:5,7:10,13:15,19:20,25)], print.level=2, max.fcal = 50, max.iter = 50) nlmin(function(arg){-Lr(arg)}, x = initguess, print.level=2, max.fcal = 75, max.iter = 50) ###################################################################### # Try a simulation to see if this works: sim.ml _ function(nsubjects = 10, ntimes = 3, X = matrix(c(rep(1, nsubjects*ntimes), rep(1:ntimes, nsubjects), rep((1:ntimes)^2, nsubjects)), ncol = 3), beta = c(20, 10, -5), sigma2 = 1, v0 = c(1/3, -1/6, 2/3, 1/6, 5/3)) { V0 _ makeV0(v0) V _ diag(ntimes*nsubjects) indices _ blockdiag.indices(nsubjects, ntimes) V[indices] _ V0 C _ chol(sigma2*V) errors _ C %*% rnorm(ntimes*nsubjects) y _ X %*% beta + errors data.frame(y = y, X = X, subject = rep(1:nsubjects, rep(ntimes, nsubjects))) } junk _ sim.ml() resid _ lm(junk$y ~ -1 + as.matrix(junk[,2:4]))$resid resid.covar _ var(matrix(resid, ncol = 3, byrow = T)) initguess _ resid.covar[c(2,3,5,6,9)]/resid.covar[1,1] nlmin(function(arg){-Lr(arg, X = as.matrix(junk[,2:4]), y = junk$y)}, x = initguess, print.level = 2) nlmin(function(arg){-Lr(arg, X = as.matrix(junk[,2:4]), y = junk$y)}, x = c(1/3, -1/6, 2/3, 1/6, 5/3), print.level = 2, max.iter = 50, max.fcal = 50) ###################################################################### lmefit.88 _ lme(fixed = logsize ~ as.factor(days)*as.factor(ozone), random = ~1, cluster = ~tree, method = "ml", data = sitka[sitka$year == 88,], control = list(verbose = T) ) ## FIGURE 4.2 observed mean response in each of the four chambers TREES <- matrix(scan("/home2/biostats/fdominic/teaching/LDA/datasets/trees.data"),byrow=T,ncol=13) TIMES <- TREES[1,] TREES <- TREES[-1,] dimnames(TREES)[[2]] <- list(as.character(TIMES)) b <- c(1,55) e <- c(54,79) titles <- c("Ozone","Control") avgs <- matrix(0,2,13) sds <- matrix(0,2,13) for(i in 1:2) { avgs[i,] <- t(apply(TREES[b[i]:e[i],],2,mean)) sds[i,] <- t(apply(TREES[b[i]:e[i],],2,var))} postscript("Fig4.2.ps") plot(TIMES,avgs[1,],ylim=range(c(avgs-2*sds/sqrt(79),avgs+2*sds/sqrt(79))),ylab="Log-Size",type="n",main="Figure 16: Pooled Averages + 2SEs") for(i in 1:2){ apply(TREES[b[i]:e[i],],1,function(y) { lines(TIMES[1:5],avgs[i,1:5],pch=i+1,type="b") lines(TIMES[1:5],avgs[i,1:5]+sds[i,1:5]*2/sqrt(79),lty=i) lines(TIMES[1:5],avgs[i,1:5]-sds[i,1:5]*2/sqrt(79),lty=i) lines(TIMES[6:13],avgs[i,6:13],pch=i+1,type="b") lines(TIMES[6:13],avgs[i,6:13]+sds[i,6:13]*2/sqrt(79),lty=i) lines(TIMES[6:13],avgs[i,6:13]-sds[i,6:13]*2/sqrt(79),lty=i) })} legend(160,6.5,legend=c("Ozone","Control"),lty=c(1:2),marks=c(2,3)) dev.off() ###General linear models for correlated data ## Example 4.1 pag 73 DLZ M <- dim(TREES)[1] b <- c(1,28,55,67) e <- c(27,54,66,79) titles <- c("Ozone 1","Ozone 2","Control 1","Control 2") avgs <- matrix(0,4,13) sds <- matrix(0,4,13) for(i in 1:4) { avgs[i,] <- t(apply(TREES[b[i]:e[i],],2,mean))} ###RESIDUAL PAIR PLOT Residuals <- array(0,dim=dim(TREES)) for(i in 1:4) Residuals[b[i]:e[i],] <- sweep(TREES[b[i]:e[i],],2,avgs[i,]) i <- 1 #ESTIMATE OF COVARIANCE MATRIX ##LIKE BOOK: ###DO THE CHAMBERS HAVE EFFECT V1 <- round(var(Residuals[,1:5])*(M-1)/(M-4),digit=3) #Division is to be like book V2 <- round(var(Residuals[,6:13])*(M-1)/(M-4),digit=3) #Division is to be like book cor(Residuals[,1:5])*(M-1)/(M-4)/V1 cor(Residuals[,6:13])*(M-1)/(M-4)/V2 V <- kronecker(diag(1,79),V1) Y <- as.vector(t(TREES[,1:5])) X <- matrix(0,M*5,7) for(i in 1:5) X[seq(i,M*5,5),i] <- 1 #this one is the time effect X[(e[2]*5+1):(e[4]*5),6] <- 1 #this is the difference for ozone X[(e[2]*5+1):(e[4]*5),7] <- rep(TIMES[1:5]/100,length(b[3]:e[4])) ####OLS Beta1 <- solve(t(X)%*%X)%*%t(X)%*%Y RW <- solve(t(X)%*%X)%*%t(X)%*%V%*%X%*%solve(t(X)%*%X) SEs1 <- sqrt(diag(RW)) WrongSE <- sqrt(diag(solve(t(X)%*%X))*var(Y-X%*%Beta1)) print("Table 4.3") print(rbind(signif(t(Beta1),4),signif(SEs1,2))) ###WLS Beta <- solve(t(X)%*%solve(V)%*%X)%*%t(X)%*%solve(V)%*%Y RW <- solve(t(X)%*%solve(V)%*%X) SEs <- sqrt(diag(RW)) rbind(signif(t(Beta),4),signif(SEs,2)) ##TESTS: Q <- matrix(c(0,0,0,0,0,0,0,0,0,0,1,0,0,1),nrow=2) Chi <- t(Beta)%*%t(Q)%*%solve(Q%*%RW%*%t(Q))%*%Q%*%Beta ##NOW FOR THE SECOND YEAR V <- kronecker(diag(1,79),V2) Y <- as.vector(t(TREES[,6:13])) X <- matrix(0,M*8,9) for(i in 1:8) X[seq(i,M*8,8),i] <- 1 #this one is the time effect X[(e[2]*8+1):(e[4]*8),9] <- 1 #this is the difference for ozone Beta2 <- solve(t(X)%*%X)%*%t(X)%*%Y RW <- solve(t(X)%*%X)%*%t(X)%*%V%*%X%*%solve(t(X)%*%X) WrongSE <- sqrt(diag(solve(t(X)%*%X))*var(Y-X%*%Beta2)) SEs2 <- sqrt(diag(RW)) rbind(signif(t(Beta2),4),signif(SEs2,2)) ###Plotting results postscript("Fig4.4.ps") par(mfrow=c(2,2)) plot(TIMES[1:5],Beta1[1:5],type="b",ylim=range(c(Beta1[1:5]+2*SEs1[1:5],Beta1[1:5]-2*SEs1[1:5])),ylab="Log Size",xlab="Time",main="First Season") lines(TIMES[1:5],Beta1[1:5]+2*SEs1[1:5],lty=2)