############################################################################### ## Fit seasonal models ## Copyright (C) 2004, Roger D. Peng and Aidan McDermott ## ## This program is free software; you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by ## the Free Software Foundation; either version 2 of the License, or ## (at your option) any later version. ## ## This program is distributed in the hope that it will be useful, ## but WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ## GNU General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with this program; if not, write to the Free Software ## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ############################################################################### ## Fit a single city seasonal model using glm() and ns() library(splines) fitCitySeason1 <- function(data, pollutant = "l1pm10tmean", cause = "death", season = c("none", "smooth", "stepfun"), tempModel = c("default", "rm7", "tempInt"), dfyr.Time = 7, pdfyr.time = 0.15, df.Temp = 6, df.Dew = 3, df.Season = 1, extractors = NULL) { season <- match.arg(season) tempModel <- match.arg(tempModel) ## Specify temperature model temp.info <- setupTemp1(data, df.Temp, tempModel) data <- temp.info[["adj.data"]] temp.f <- temp.info[["temp.f"]] ## Need to modify degrees of freedom based on missingness of data sub <- data[, c("time", "agecat", "tmpd", "rmtmpd", "dptp", "rmdptp", temp.info[["addedVars"]], cause, pollutant)] subset <- complete.cases(sub) df.Time <- round( numdf(subset, dfyr.Time) ) df.time <- round( df.Time * pdfyr.time ) ## Don't setup smooth function of time where there are incomplete cases is.na(data[, "time"]) <- !subset ## Setup other formulas covariates.f <- paste(cause, "~ dow + agecat + Season") weather.f <- paste(c(paste("ns(dptp,", df.Dew, ")"), paste("ns(rmdptp,", df.Dew, ")")), collapse = " + ") ## Setup smooth functions of time time.f <- paste(c(paste("ns(time,", df.Time, ")"), paste("I(ns(time,", df.time, ")*Age2Ind)"), paste("I(ns(time,", df.time, ")*Age3Ind)")), collapse = "+") poll.f <- switch(season, none = paste(pollutant, collapse = " + "), smooth = { nfreq <- df.Season paste("harmonic(SeasonTime,", nfreq, ",365, intercept = TRUE):", pollutant, sep = "", collapse = "+") }, stepfun = { paste(paste("Season", pollutant, sep = ":"), collapse = "+") } ) form.str <- paste(covariates.f, time.f, temp.f, weather.f, poll.f, sep = " + ") modelFormula <- as.formula(form.str) ## Fit the model! fit <- glm(modelFormula, family = quasipoisson, data = data, control = glm.control(epsilon = 1e-10, maxit = 1000), na.action = na.omit) rval <- if(is.null(extractors)) fit else lapply(extractors, function(f) f(fit)) invisible(rval) } setupTemp1 <- function(dataframe, df.Temp, tempModel) { default.temp.f <- paste(c(paste("ns(tmpd,", df.Temp, ")"), paste("ns(rmtmpd,", df.Temp, ")")), collapse = " + ") orig.namelist <- names(dataframe) temp.f <- switch(tempModel, default = default.temp.f, rm7 = paste(default.temp.f, "ns(rm7tmpd, 3)", sep = "+"), tempInt = { paste("ns(tmpd,", df.Temp, "):", "ns(rmtmpd,", df.Temp, ")", sep = "") }) list(adj.data = dataframe, temp.f = temp.f, addedVars = setdiff(names(dataframe), orig.namelist)) } multiDFFit <- function(dfVec, city, ...) { if(!require(NMMAPSdata)) stop("Need to load package ", sQuote("NMMAPSdata")) results <- vector("list", length = length(dfVec)) dataframe <- readCity(city) results <- lapply(dfVec, function(d) { cat(d, "\n") try( fitCitySeason(dataframe, dfyr.Time = d[1], ...) ) }) cat("\n") invisible(results) } ## Fit a single city seasonal model using glm() and ns() ## Note: Just running `fitCitySeason(city)', where `city' is the data ## frame for a particular city, should fit the usual NMMAPS model, fitCitySeason <- function(data, pollutant = "l1pm10tmean", cause = "death", season = c("none", "periodic", "periodic2", "factor2"), tempModel = c("default", "rm7", "tempInt", "SeasonInt"), dfyr.Time = 7, pdfyr.time = 0.15, df.Temp = 6, df.Dew = 3, df.Season = 1, ## For "periodic" fits obsThreshold = 50, extractors = NULL) { season <- match.arg(season) tempModel <- match.arg(tempModel) if(inherits(pollutant, "formula")) pollutant <- attr(terms(pollutant), "term.labels") ## Create some seasonal variables data <- setupSeason(data) ## Some exclusions is.na(data[, cause]) <- as.logical(data[, paste("mark", cause, sep = "")]) ## Coerce to factor; very important! data <- transform(data, dow = as.factor(dow), agecat = as.factor(agecat)) nAges <- length(levels(data[, "agecat"])) ## Specify/setup temperature variables temp.info <- setupTemp(data, df.Temp, tempModel) data <- temp.info$adj.data temp.f <- temp.info$temp.f ## Need to modify degrees of freedom based on missingness of data subform <- paste("~", paste(c("time", "time2", "time3", "agecat", "tmpd", "rmtmpd", "dptp", "rmdptp", temp.info$addedVars, cause, paste(pollutant, collapse = "+")), collapse = "+")) sub <- model.frame(as.formula(subform), data = data, na.action = na.pass) subset <- complete.cases(sub) df.Time <- numdf(subset, dfyr.Time) df.time <- df.Time * pdfyr.time data$subset <- subset nobs <- sum(subset) / nAges if(nobs < obsThreshold) stop("Not enough observations: ", nobs, "\n") ## Set up smooth functions of time (with separate functions for ## the older two age categories smoothTime.info <- setupSmoothTime(data, df.Time, df.time) data <- smoothTime.info$adj.data time.f <- smoothTime.info$time.f ## Setup other formulas covariates.f <- paste(cause, "~ dow + agecat") weather.f <- paste(c(paste("ns(dptp,", df.Dew, ")"), paste("ns(rmdptp,", df.Dew, ")")), collapse = " + ") if(season == "none") { poll.f <- paste(pollutant, collapse = " + ") form.str <- paste(c(covariates.f, time.f, temp.f, weather.f, "Season", poll.f), collapse = " + ") } else { pollSeason.f <- switch(season, periodic = { nfreq <- df.Season paste("harmonic(SeasonTime,", nfreq, ",365, intercept = TRUE):", pollutant, sep = "") }, periodic2 = { nfreq <- df.Season paste("periodicBasis2(SeasonTime,", nfreq, ",365, intercept = TRUE):", pollutant, sep = "") }, factor2 = paste("Season + Season", pollutant, sep = ":"), ) form.str <- paste(covariates.f, time.f, temp.f, weather.f, pollSeason.f, sep = " + ") } form <- as.formula(form.str) ## Fit the model! fit <- glm(form, family = quasipoisson, subset = subset, data = data, control = glm.control(epsilon = 1e-10, maxit = 1000), na.action = na.exclude) ## Extract information from the fitted glm model object using the ## list of functions in `extractors'. If no extractors are ## specified, just return the entire fitted model object. if(is.null(extractors)) rval <- fit else rval <- lapply(extractors, function(f) f(fit)) invisible(rval) } periodicBasis2 <- function(x, nfreq, period, intercept = FALSE, ortho = TRUE) { pi <- base::pi ## Just in case someone has redefined pi! stopifnot(nfreq > 0) x <- as.numeric(x) nax <- is.na(x) N <- seq(0, nfreq - 1) k <- 2^N * 2 * pi / period M <- outer(x, k) sinM <- apply(M, 2, sin) cosM <- apply(M, 2, cos) if(!intercept) { basis <- cbind(sinM, cosM) if(ortho) basis <- qr.Q(qr(basis)) } else { basis <- cbind(1, sinM, cosM) if(ortho) { basis <- qr.Q(qr(basis)) basis[,1] <- rep.int(1, nrow(basis)) } } basis } setupTemp <- function(dataframe, df.Temp, tempModel) { default.temp.f <- paste(c(paste("ns(tmpd,", df.Temp, ")"), paste("ns(rmtmpd,", df.Temp, ")")), collapse = " + ") orig.namelist <- names(dataframe) if(tempModel == "default") temp.f <- default.temp.f else if(tempModel == "rm7") { library(ts) ## Create a running 7 day mean of tmpd tmpd <- dataframe[, "tmpd"] rm7tmpd <- filter(tmpd, filter = c(0, rep(1/7, 7)), sides = 1, circular = FALSE) dataframe[, "rm7tmpd"] <- as.numeric(rm7tmpd) ## Using 3 df here is arbitrary temp.f <- paste(default.temp.f, "ns(rm7tmpd, 3)", sep = " + ") } else if(tempModel == "tempInt") temp.f <- paste("ns(tmpd,", df.Temp, "):ns(rmtmpd,", df.Temp, ")") else if(tempModel == "SeasonInt") temp.f <- paste("(", default.temp.f, "):Season") else stop("Wrong temperature model specification") list(adj.data = dataframe, temp.f = temp.f, addedVars = setdiff(names(dataframe), orig.namelist)) } ## Return a modified data frame with two extra variables. `SeasonTime' ## is the day number within each year. `Season' is a factor ## indicating the season for each particular day. setupSeason <- function(dataframe) { n <- as.numeric(table(dataframe$agecat)[1]) year <- floor(dataframe[1:n, "date"] / 10000) nyeardays <- as.numeric(table(year)) seas.time <- unlist(lapply(paste(1, nyeardays, sep = ":"), function(x) eval(parse(text = x)))) SeasonTime <- rep(seas.time, 3) indlist <- genSeasonInd(dataframe) seas.ind <- with(indlist, factor(winter * 1 + spring * 2 + summer * 3 + fall * 4, labels = c("Winter", "Spring", "Summer", "Fall"))) Season <- rep(seas.ind, 3) data.frame(dataframe, SeasonTime = SeasonTime, Season = Season) } ## Re-create season indicators (taken from Aidan's `dates.R' file) genSeasonInd <- function(dataframe) { n <- as.numeric(table(dataframe$agecat)[1]) dates <- dataframe$date[1:n] year <- floor(dates/10000) mhyr <- dates %% 10000 month <- floor(mhyr/100) day <- mhyr %% 100 rm(mhyr) winter <- (( month == 12 & day > 21 ) | ( month == 1) | ( month == 2 ) | ( month == 3 & day <= 21)) spring <- (( month == 3 & day > 21 ) | ( month == 4) | ( month == 5 ) | ( month == 6 & day <= 21)) summer <- (( month == 6 & day > 21 ) | ( month == 7) | ( month == 8 ) | ( month == 9 & day <= 21)) fall <- (( month == 9 & day > 21 ) | ( month == 10) | ( month == 11 ) | ( month == 12 & day <= 21)) list(winter = winter, spring = spring, summer = summer, fall = fall, all = rep(TRUE, length(winter))) } ## This function modifies the original data frame so that the smooth ## functions of time for the older two age groups are setup correctly. ## The correct formula is also constructed. This code was extracted ## from Aidan's original fitmodel2() function. setupSmoothTime <- function(dataframe, df.Time, df.time) { library(splines) df.Time <- round(df.Time) df.time <- round(df.time) subset <- dataframe$subset if(is.null(subset)) stop(sQuote("dataframe"), " missing ", sQuote("subset"), " variable") Time <- dataframe[subset, "time"] if ( round(df.Time) == 1 | round(df.Time) == 0) { TIME <- Time Time <- matrix(NA, ncol = 1, nrow = nrow(dataframe)) Time[subset, ] <- TIME dimnames(Time) <- list(NULL, paste("Time.", 1, sep = "")) } else { knots <- quantile(Time, prob = (1:(round(df.Time)-1))/round(df.Time)) TIME <- ns(Time, knots = knots) Time <- matrix(NA, ncol = ncol(TIME), nrow = nrow(dataframe)) Time[subset, ] <- TIME dimnames(Time) <- list(NULL, paste("Time.", 1:ncol(TIME), sep = "")) } Age <- dataframe[, "agecat"] Time2 <- dataframe[subset & (Age == levels(Age)[2]), "time"] if ( round(df.time) == 1 | round(df.time) == 0) { TIME2 <- Time2 Time2 <- matrix(NA,ncol=1,nrow=nrow(dataframe)) Time2[subset & Age == levels(Age)[2],] <- TIME2 Time2[subset & Age == levels(Age)[1],] <- 0 Time2[subset & Age == levels(Age)[3],] <- 0 dimnames(Time2) <- list(NULL,paste("Time2.",1,sep="")) } else { knots2 <- quantile(Time2, prob= (1:(round(df.time)-1))/round(df.time) ) TIME2 <- ns(Time2, knots = knots2) Time2 <- matrix(NA,ncol = ncol(TIME2), nrow = nrow(dataframe)) Time2[subset & Age == levels(Age)[2],] <- TIME2 Time2[subset & Age == levels(Age)[1],] <- 0 Time2[subset & Age == levels(Age)[3],] <- 0 dimnames(Time2) <- list(NULL, paste("Time2.", 1:ncol(TIME2), sep = "")) } Time3 <- dataframe[subset & (Age == levels(Age)[3]), "time"] if ( round(df.time) == 1 | round(df.time) == 0 ) { TIME3 <- Time3 Time3 <- matrix(NA,ncol=1,nrow=nrow(dataframe)) Time3[subset & Age== levels(Age)[3],] <- TIME3 Time3[subset & Age== levels(Age)[1],] <- 0 Time3[subset & Age== levels(Age)[2],] <- 0 dimnames(Time3) <- list(NULL,paste("Time3.",1,sep="")) } else { knots3 <- quantile(Time3, prob= (1:(round(df.time)-1))/round(df.time) ) TIME3 <- ns(Time3,knots=knots3) Time3 <- matrix(NA,ncol=ncol(TIME3),nrow=nrow(dataframe)) Time3[subset & Age== levels(Age)[3],] <- TIME3 Time3[subset & Age== levels(Age)[1],] <- 0 Time3[subset & Age== levels(Age)[2],] <- 0 dimnames(Time3) <- list(NULL,paste("Time3.",1:ncol(TIME3),sep="")) } time.f <- paste(paste("Time.", 1:ncol(Time), sep="",collapse="+"), "+", paste("Time2.",1:ncol(Time2),sep="",collapse="+"), "+", paste("Time3.",1:ncol(Time3),sep="",collapse="+")) list(adj.data = cbind(dataframe, Time, Time2, Time3), time.f = time.f) } ############################################################################ ## ----------------------------------------------------------------------- ## Some support functions used in model fitting ## ----------------------------------------------------------------------- ##-------------------------------------------- numdf <- function(usedata,num=5){ n <- length(usedata) use <- usedata[1:(n/3)] ll <- round(length(use)/12) ## this is to eliminate the warning message the length of use is ## not a multiple of 12 usenew <- use[1:(ll*12)] mat <- matrix(usenew, ncol=12,byrow=T) m <- sum(ceiling(apply(mat,1,sum,na.rm=T)/12)) ##-365.25/12 df <- round(12*m/365.25*num) max(1,df) } ##-------------------------------------------- ## This function uses periodicBasis2 with 'ortho = TRUE' ## fitCitySeason2 <- function(data, pollutant = "l1pm10tmean", cause = "death", ## season = c("none", "periodic", "factor2"), ## tempModel = c("default", "rm7", "tempInt", "SeasonInt"), ## dfyr.Time = 7, pdfyr.time = 0.15, ## df.Temp = 6, df.Dew = 3, ## df.Season = 1, ## For "periodic" fits ## obsThreshold = 50, extractors = NULL) { ## season <- match.arg(season) ## tempModel <- match.arg(tempModel) ## ## if(inherits(pollutant, "formula")) ## pollutant <- attr(terms(pollutant), "term.labels") ## ## ## Create some seasonal variables ## data <- setupSeason(data) ## ## ## Some exclusions ## is.na(data[, cause]) <- as.logical(data[, paste("mark", cause, sep = "")]) ## ## ## Coerce to factor; very important! ## data <- transform(data, dow = as.factor(dow), agecat = as.factor(agecat)) ## nAges <- length(levels(data[, "agecat"])) ## ## ## Specify/setup temperature variables ## temp.info <- setupTemp(data, df.Temp, tempModel) ## data <- temp.info$adj.data ## temp.f <- temp.info$temp.f ## ## ## Need to modify degrees of freedom based on missingness of data ## subform <- paste("~", paste(c("time", "time2", "time3", "agecat", ## "tmpd", "rmtmpd", "dptp", "rmdptp", ## temp.info$addedVars, cause, ## paste(pollutant, collapse = "+")), ## collapse = "+")) ## sub <- model.frame(as.formula(subform), data = data, na.action = na.pass) ## subset <- complete.cases(sub) ## df.Time <- numdf(subset, dfyr.Time) ## df.time <- df.Time * pdfyr.time ## data$subset <- subset ## nobs <- sum(subset) / nAges ## if(nobs < obsThreshold) ## stop("Not enough observations: ", nobs, "\n") ## ## ## Set up smooth functions of time (with separate functions for ## ## the older two age categories ## smoothTime.info <- setupSmoothTime(data, df.Time, df.time) ## data <- smoothTime.info$adj.data ## time.f <- smoothTime.info$time.f ## ## ## Setup other formulas ## covariates.f <- paste(cause, "~ dow + agecat") ## weather.f <- paste(c(paste("ns(dptp,", df.Dew, ")"), ## paste("ns(rmdptp,", df.Dew, ")")), ## collapse = " + ") ## ## if(season == "none") { ## poll.f <- paste(pollutant, collapse = " + ") ## form.str <- paste(c(covariates.f, time.f, temp.f, ## weather.f, "Season", poll.f), collapse = " + ") ## } ## else { ## pollSeason.f <- ## switch(season, ## periodic = { ## nfreq <- df.Season ## paste("periodicBasis2(SeasonTime,", nfreq, ## ",365, intercept = TRUE):", pollutant) ## }, ## factor2 = paste("Season + Season", pollutant, sep = ":"), ## ) ## form.str <- paste(covariates.f, time.f, temp.f, ## weather.f, pollSeason.f, sep = " + ") ## } ## form <- as.formula(form.str) ## ## ## Fit the model! ## fit <- glm(form, family = quasipoisson, subset = subset, data = data, ## control = glm.control(epsilon = 1e-10, maxit = 1000), ## na.action = na.exclude) ## ## ## Extract information from the fitted glm model object using the ## ## list of functions in `extractors'. If no extractors are ## ## specified, just return the entire fitted model object. ## if(is.null(extractors)) ## rval <- fit ## else ## rval <- lapply(extractors, function(f) f(fit)) ## invisible(rval) ## } ##