######################
# ChIP-Seq functions
######################

########################################################################
# Generic Functions
####################

# shortcut for split: indexes of x grouped by the values of x
splitit <- function(x) split(seq(along=x), x)

#----------------------------------------------------------------------
# just converts an rle object to a data.frame
# with columns: value, length, startIndex, endIndex
rleFrame = function(r) {
  y <- data.frame(cbind(r[[2]], as.integer(r[[1]])), stringsAsFactors=FALSE)
  y[,2] <- as.integer(y[,2])
  y <- cbind(y, cumsum(y[,2]))
  y <- cbind(y, (y[,3] - y[,2] + 1))
  y = y[,c(1,2,4,3)]
  names(y) = c("x","len","start","end")
  return(y)
}

#----------------------------------------------------------------------
# from Rafa, makes clusters/read groups
# defined by the max gap between reads
clusterMaker <- function(chr, pos, order.it=TRUE, maxGap=300){
  nonaIndex=which(!is.na(chr) & !is.na(pos))
  Indexes=split(nonaIndex, chr[nonaIndex])
  clusterIDs=rep(NA, length(chr))
  LAST=0
  for(i in seq(along=Indexes)){
    Index=Indexes[[i]]
    x=pos[Index]
    if(order.it){ Index=Index[order(x)]; x=pos[Index] }
    y=as.numeric(diff(x) > maxGap)
    z=cumsum(c(1,y))
    clusterIDs[Index]=z+LAST
    LAST=max(z)+LAST
  }
  clusterIDs
}

#----------------------------------------------------------------------
# Regular smoothing (from Andrew Jaffe, JHU)
# %% possibly no longer needed, given dropping the NA tails
myfilter <- function(x, filter, ...){
  L=length(x)
  if(L > length(filter)){
    res=filter(x, filter, ...)
    ## now fill out the NAs
    M=length(filter)
    N=(M-1)/2
    for(i in 1:N){
      w=filter[(N-i+2):M]
      y=x[1:(M-N+i-1)]
      res[i] = sum(w*y)/sum(w)
      w=rev(w)
      ii=(L-(i-1))
      y=x[(ii-N):L]
      res[ii] <- sum(w*y)/sum(w)
    }
  } else {
    res=rep(mean(x), L)
  }
  return(res)
}
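#----------------------------------------------------------------------
# Example (a toy sketch, not from any real data set): clusterMaker()
# gives the same ID to consecutive positions separated by at most maxGap.
# Wrapped in if(FALSE) so sourcing this file has no side effects.
if(FALSE) {
  chr = rep("chr1", 6)
  pos = c(100, 150, 180, 900, 950, 5000)
  clusterMaker(chr, pos, maxGap = 300)
  # expected: 1 1 1 2 2 3 -- gaps larger than 300bp start a new read group
}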
#########################################################################
# initialize
####################

# loads the necessary packages, installing any that are
# missing from the user's library
initialize = function() {
  options(warn=-1)
  pckg = try(require(Biostrings))
  if(!pckg) {
    source("http://bioconductor.org/biocLite.R")
    biocLite("Biostrings")
    require("Biostrings")
  }
  pckg = try(require(GenomicRanges))
  if(!pckg) {
    source("http://bioconductor.org/biocLite.R")
    biocLite("GenomicRanges")
    require("GenomicRanges")
  }
  pckg = try(require(BSgenome.Hsapiens.UCSC.hg18)) # 844 MB!
  if(!pckg) {
    source("http://bioconductor.org/biocLite.R")
    biocLite("BSgenome.Hsapiens.UCSC.hg18")
    require("BSgenome.Hsapiens.UCSC.hg18")
  }
  pckg = try(require(splines))
  if(!pckg) {
    getPckg("splines")
    require(splines)
  }
  options(warn=1)
}

#----------------------------------------------------------------------
# install new packages from CRAN; Bioconductor packages are
# handled directly in initialize() above
getPckg = function(pckg) install.packages(pckg, repos = "http://cran.r-project.org")

#------------------------------------------------------------------------
# reads in the data from .bed files (or another user-defined type);
# treat and input are matched against the file names in data.path
readData = function(data.path = "./", treat = "Treatment", input = "Input",
                    type = "bed", saveIt = FALSE) {
  if(file.exists(paste(data.path, "chipseq_data.rda", sep="/"))) {
    cat("reading in the data from saved file.")
    load(paste(data.path, "chipseq_data.rda", sep="/")) # restores 'rawData'
    cat("\n")
  } else {
    cat("Reading in the data.")
    # dir() takes a single regular expression, not a vector, so find the
    # files whose names contain both the sample tag and the file type
    files = dir(data.path, full.names = T)
    trtFile = files[grepl(treat, files) & grepl(type, files)][1]
    ctrlFile = files[grepl(input, files) & grepl(type, files)][1]
    trt = read.table(trtFile, sep = "\t", as.is = T, header = F)
    cat(".")
    ctrl = read.table(ctrlFile, sep = "\t", as.is = T, header = F)
    names(trt) = names(ctrl) = c("chr","start","end","x","y","strand")
    cat("\n")
    rawData = vector("list", 2)
    rawData[[1]] = trt
    rawData[[2]] = ctrl
    names(rawData) = c("Treat","Control")
    if(saveIt) {
      cat(paste("saving the data as an rda to", data.path), "\n")
      save(rawData, file=paste(data.path, "chipseq_data.rda", sep = "/"), compress=T)
    }
  }
  return(rawData)
}
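#------------------------------------------------------------------------
# Example call (hypothetical file names, wrapped in if(FALSE)):
if(FALSE) {
  rawData = readData(data.path = "./data", treat = "Treatment",
                     input = "Input", type = "bed", saveIt = TRUE)
  str(rawData$Treat) # one row per tag: chr, start, end, x, y, strand
}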
########################################################################
# Processing Functions: aligning and smoothing
####################

#------------------------------------------------------------------------
# Aligns data from a BED file using the given shift
# Applies to the chromosomes listed in chr (default is "ALL")
# uniq=T only keeps unique tags for a given strand (removes duplicates)
#
# output is "aligned" data class: data frame with columns
# "chr" (chromosome), "pos" (position) and "count"
alignSeq = function(bed, shift, chr="ALL", uniq = TRUE) {
  cat("aligning\n")
  if(chr[1] != "ALL") bed = bed[which(bed$chr %in% chr),]
  chrIndexes = splitit(bed$chr)
  chrNames = names(chrIndexes)
  aligned = NULL
  for(i in seq(along=chrIndexes)) {
    chr = chrNames[i]
    cat(chr)
    dat = bed[chrIndexes[[i]],]
    count = alignShift(dat=dat, shift=shift, uniq=uniq) # <1 second each
    count$chr = chr
    aligned = rbind(aligned, count[,c("chr","pos","count")])
  }
  return(aligned)
}

#------------------------------------------------------------------------
# Auxiliary function for alignSeq: performs the actual shift on raw data
# uniq=T only keeps unique tags for a given strand (removes duplicates)
#
# output is data frame with columns "pos" (position) and "count"
alignShift = function(dat, shift, uniq = TRUE) {
  cat(".")
  plus = which(dat$strand == "+")
  plus.pos = dat$start[plus] + shift
  if(uniq) plus.pos = unique(plus.pos)
  plus.pos = data.frame(cbind(plus.pos, rep(1)))
  names(plus.pos) = c("pos","count")
  cat(".")
  minus = which(dat$strand == "-")
  minus.pos = dat$end[minus] - shift
  if(uniq) minus.pos = unique(minus.pos)
  minus.pos = data.frame(cbind(minus.pos, rep(1)))
  names(minus.pos) = c("pos","count")
  # combine strands: positions present on both strands get count 2
  out = merge(plus.pos, minus.pos, by = "pos", all = T)
  out$count = rowSums(out[,2:3], na.rm = T)
  out$count.x = out$count.y = NULL
  cat("\n")
  return(out)
}
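#------------------------------------------------------------------------
# Example (toy BED rows, wrapped in if(FALSE)): plus-strand tags map to
# start + shift, minus-strand tags to end - shift; duplicates are
# dropped when uniq = TRUE
if(FALSE) {
  bed = data.frame(chr = "chr1",
                   start = c(100, 100, 300), end = c(135, 135, 335),
                   x = NA, y = NA, strand = c("+", "+", "-"),
                   stringsAsFactors = FALSE)
  alignSeq(bed, shift = 10, chr = "chr1")
  # expected rows: pos 110 count 1 (the duplicated + tag), pos 325 count 1
}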
#------------------------------------------------------------------------
# this takes "aligned" data class, the max distance between
# reads in the same read group, an arbitrary (but unimodal)
# smoothing kernel, and the chromosome name
#
# Output is data frame with columns "chr" (chromosome), "pos" (position)
# and "height"
smoothSpeed = function(counts, chr, theFilter, readGroupSize, speed) {
  if(speed){
    cat("speed option is ON. scaling.")
    speedScale = 10
    counts$pos = round(counts$pos/speedScale)
    theFilter = approx(theFilter, n = (length(theFilter)-1)/speedScale + 1)$y
    readGroupSize = readGroupSize/speedScale
    cat("\n")
  } else cat("speed option is OFF","\n")
  cat("smoothing and peak hunting.")
  # Create groups by readGroupSize
  pns = clusterMaker(rep(chr, nrow(counts)), counts$pos, maxGap = readGroupSize)
  Indexes = splitit(pns)
  lens = sapply(Indexes, length)
  peak1 = which(lens == 1)
  peak2 = which(lens > 1)
  # peaks from singleton read groups: the peak sits at the read position,
  # with height = count * max of the filter
  Index1 = unlist(Indexes[peak1], use.names = FALSE)
  m = max(theFilter)
  thePeaks = data.frame(pos = counts$pos[Index1],
                        height = m * counts$count[Index1])
  # smooth and find maxima of groups with more counts
  peakList = vector("list", length(peak2))
  for(k in seq(along = peak2)) {
    if(k %% 50 == 0) cat(".")
    Index = Indexes[[peak2[k]]]
    count = counts[Index,] # the k'th group
    # interpolate with 0s but make sure the counts are in the right place
    xx = rep(0, 2*length(theFilter) + 1 +
               count$pos[length(count$pos)] - count$pos[1])
    xx[length(theFilter) + count$pos - count$pos[1] + 1] = count$count
    xx = filter(xx, theFilter)
    xx[is.na(xx)] = 0
    peaks = findMax(xx)
    peaks$pos = peaks$pos - length(theFilter) + count$pos[1] - 1
    peakList[[k]] = peaks
  }
  # merge the peaks
  for(k in seq(along = peakList)) thePeaks = rbind(thePeaks, peakList[[k]])
  # Remove peaks that are smaller than the filter height (artifact)
  # index = which(thePeaks$height < m)
  # thePeaks = thePeaks[-index,]
  # If speed, fix position
  if(speed){
    thePeaks$pos = thePeaks$pos * speedScale
  }
  # order them
  thePeaks = thePeaks[order(thePeaks$pos),]
  thePeaks$chr = chr
  thePeaks = thePeaks[,c("chr","pos","height")]
  cat("\n")
  return(thePeaks)
}

#------------------------------------------------------------------------
# finds all local maxima, i.e. inflection changes, in xx
# returns the position of each peak and its height
# *** Sometimes height is smaller than smallest peak
findMax = function(xx) {
  d = sign(diff(xx)) # find inflection points
  d = c(d[1], d)     # this keeps the indexes aligned with xx
  r = rle(as.numeric(d))
  y = rleFrame(r)
  # drop 0's first, because some patterns are 1, 0, 1
  y = y[y$x != 0,]
  inc = which(y$x == 1)                # see which runs are increasing
  Index = inc[which(y$x[inc+1] == -1)] # a 1 followed by a -1 is always a max
  ## loop over the maxima
  peaks = data.frame(matrix(nrow = length(Index), ncol = 2))
  colnames(peaks) = c("pos","height")
  for(j in seq(along = Index)) {
    # if(j %% 10000 == 0) cat(".")
    # look at the j'th peak
    tmp = y[Index[j],]
    # average from here to the next run, which includes dropped 0's
    nstart = y[(Index[j]+1),3] - 1
    # get the start and end of the peak more precisely
    peaks[j,1] = floor(mean(tmp$end:nstart)) # avg location, must be integer
    peaks[j,2] = max(xx[tmp$end:nstart])     # max height in the region
  }
  return(peaks)
}
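#------------------------------------------------------------------------
# Example (toy vector, wrapped in if(FALSE)):
if(FALSE) {
  xx = c(0, 1, 3, 2, 1, 4, 9, 4, 1, 0)
  findMax(xx)
  # expected: maxima at pos 3 (height 3) and pos 7 (height 9)
}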
########################################################################
# Preprocessing: Estimate shift and peak shape
####################

# Main: Estimate shift and peak shape
# Output is a list with slots:
# 1. tag distribution (plus & minus)
# 2. estimated shift
# 3. new filter (normalized) -- "kernel"
# 4. new filter (unnormalized) -- "filter"
# 5. new filter (untrimmed) -- "filter.untrimmed"
# 6-. the parameters used when calling the function
shiftPeakEst = function(rawData, chr = "chr1", firstShift = 100,
                        firstKernel = "Gaussian", kernelBand = 50,
                        readGroupSize = 1e4, speed = FALSE, topPeaks = 1000,
                        tagWinSize = 2001,
                        knots = c(1,501,801,901,951,976,1001,1026,1051,1101,1201,1501),
                        trimFilter = TRUE, trimSize = 801) {
  # align with initial shift
  aligned = list(Treat = NULL, Control = NULL)
  aligned$Treat = alignSeq(rawData[["Treat"]], chr = chr, shift = firstShift)
  aligned$Control = alignSeq(rawData[["Control"]], chr = chr, shift = firstShift)
  # Choose filter
  if(firstKernel == "Gaussian") {
    theFilter = dnorm((-(4*kernelBand):(4*kernelBand))/(kernelBand+1))
  }
  if(firstKernel == "Uniform") {
    unif = function(x) ifelse(abs(x) < 1, 0.5, 0)
    theFilter = unif(seq(-kernelBand, kernelBand)/(kernelBand+1))
  }
  # make sure it integrates to 1
  theFilter = theFilter/sum(theFilter)
  # Smoothing
  cat("Smoothing.\n")
  thePeakList = smoothSpeed(aligned[["Treat"]], chr = chr, theFilter = theFilter,
                            readGroupSize = readGroupSize, speed = speed) # 20 min. for chr 1
  # Pick top peaks and obtain tag distribution
  cat("Estimating tag distribution and shift.\n")
  theTopPeaks = thePeakList[order(thePeakList$height, decreasing = T)[1:topPeaks],]
  tags = tagDistrEst(rawData[["Treat"]], chr = chr, theTopPeaks,
                     tagWinSize = tagWinSize)
  # Get best shift and smoothing kernel
  newShiftInfo = estShift(tags, knots = knots, trimFilter = trimFilter,
                          trimSize = trimSize)
  # Output
  theOutput = list(tagDistr = tags)
  call = list(chr=chr, firstShift=firstShift, firstKernel=firstKernel,
              kernelBand=kernelBand, readGroupSize=readGroupSize, speed=speed,
              topPeaks=topPeaks, tagWinSize=tagWinSize, knots=knots,
              trimFilter=trimFilter, trimSize=trimSize)
  theOutput = c(theOutput, newShiftInfo, call)
  return(theOutput)
}

#------------------------------------------------------------------------
# takes the raw treatment data (bed) and estimates the tag distribution
# around the top peaks; chr is the chromosome name,
# tagWinSize determines how far to look in each direction
#
# Output is data frame with tagWinSize rows and columns "plus" and "minus"
tagDistrEst = function(bed, chr, theTopPeaks, tagWinSize) {
  # Extract relevant chromosome
  bed = bed[bed$chr == chr,]
  loc = theTopPeaks$pos
  # half the window size
  win.half = floor(tagWinSize/2)
  # get all indices for each peak +/- win.half
  strands = splitit(bed$strand)
  locIndex = vector("list", length(loc))
  for(j in seq(along=locIndex)) locIndex[[j]] = (loc[j] - win.half):(loc[j] + win.half)
  cat("- strand\n")
  negPos = bed$end[strands[["-"]]]
  minusCount = lapply(locIndex, function(x) which(x %in% negPos))
  minusWin = rep(0, tagWinSize)
  minusTab = table(unlist(minusCount))
  minusWin[as.integer(names(minusTab))] = minusTab
  cat("+ strand\n")
  posPos = bed$start[strands[["+"]]]
  plusCount = lapply(locIndex, function(x) which(x %in% posPos))
  plusWin = rep(0, tagWinSize)
  plusTab = table(unlist(plusCount))
  plusWin[as.integer(names(plusTab))] = plusTab
  countsOut = data.frame(cbind(plusWin, minusWin))
  names(countsOut) = c("plus","minus")
  return(countsOut)
}
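#------------------------------------------------------------------------
# Example call (assumes rawData from readData(); wrapped in if(FALSE)):
if(FALSE) {
  alignParams = shiftPeakEst(rawData, chr = "chr1", firstShift = 100,
                             firstKernel = "Gaussian", kernelBand = 50)
  alignParams$shift                    # estimated strand shift
  plot(alignParams$kernel, type = "l") # the data-driven smoothing kernel
}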
#------------------------------------------------------------------------
# takes the object returned by tagDistrEst and returns
# the optimal shift and the new unimodal filter
#
# Output is a list with names "shift" (a scalar), "kernel" (trimmed,
# normalized), "filter" (trimmed) and "filter.untrimmed"
estShift = function(tags, knots, trimFilter=TRUE, trimSize) {
  # Estimate the shift from the highest peaks: spline-smooth each strand's
  # tag distribution and take half the distance between the two modes
  n = nrow(tags)
  X = bs(1:n, knots=knots) # Setup for spline smoothing
  tags.smooth = tags
  tags.smooth$plus = lm(tags$plus ~ X - 1)$fitted.values
  tags.smooth$minus = lm(tags$minus ~ X - 1)$fitted.values
  shift = round(abs(which.max(tags.smooth$plus) -
                    which.max(tags.smooth$minus))/2)
  # Obtain filter: shift the strands onto each other, average,
  # symmetrize, and spline-smooth
  shifted = (c(rep(0,shift), tags$plus[1:(n-shift)]) +
             c(tags$minus[(1+shift):n], rep(0,shift))) / 2
  shifted = (shifted + shifted[seq(n,1,-1)])/2
  newFilter = lm(shifted ~ X - 1)$fitted.values
  # trim and multiply by quartic kernel shape
  if(trimFilter) {
    L = (trimSize-1)/2
    x = -L:L
    quartic = (1 - (x/L)^2)^2
    trimmed = newFilter[x + (n+1)/2] * quartic
  } else trimmed = newFilter
  out = vector("list", 4)
  out[[1]] = shift
  out[[2]] = trimmed/sum(trimmed)
  out[[3]] = trimmed
  out[[4]] = newFilter
  names(out) = c("shift","kernel","filter","filter.untrimmed")
  return(out)
}

########################################################################
# Preprocessing: global regression and MonteCarlo simulations
####################

#------------------------------------------------------------------------
# regression: takes an 'aligned' object and fits the linear regression
# of binned treatment counts on binned control counts
#
# Output is a list with slots:
# 1. binned treatment data
# 2. binned control data
# 3. binned treatment variance
# 4. binned control variance
# 5. regression coefficients
# 6. fitted values
# 7. fitted errors
# 8. fitted error variance
# 9. covariance matrix of coefficients
# 10. estimated background range
# 11-. the parameters used when calling the function
globalRegr = function(aligned, binSizes = 10^(3:4), quantile.rng = c(.5,1),
                      var = FALSE, binSkip = 1, chr = "ALL") {
  cat("Binning the data.\n")
  y.all = data.frame()
  X.all = data.frame()
  Vy.all = data.frame()
  VX.all = data.frame()
  if(chr[1] == "ALL") chrNames = unique(aligned[["Treat"]]$chr) else chrNames = chr
  for(i in seq(along = chrNames)) {
    # current chromosome
    chr = chrNames[i]
    cat(chr)
    # Fill aligned data with zeros up to the last observed position
    # (chrSeq = Hsapiens[[chr]]; L = length(chrSeq) would use the full
    # chromosome length instead)
    indTrt = which(aligned[["Treat"]]$chr == chr)
    indCtrl = which(aligned[["Control"]]$chr == chr)
    L = max(aligned[["Treat"]]$pos[indTrt], aligned[["Control"]]$pos[indCtrl])
    fullTrt = fullCtrl = rep(0, L)
    fullTrt[aligned[["Treat"]]$pos[indTrt]] = aligned[["Treat"]]$count[indTrt]
    fullCtrl[aligned[["Control"]]$pos[indCtrl]] = aligned[["Control"]]$count[indCtrl]
    cat(".")
    # Smallest bin: number of 1k bins that fit in whole 100k bins
    n = floor(L / binSizes[length(binSizes)]) * binSizes[length(binSizes)] / binSizes[1]
    binTrt = bin(fullTrt, binSizes[1], var=var)[1:n,]   # Response variable (38 sec.)
    binCtrl = bin(fullCtrl, binSizes[1], var=var)[1:n,] # Predictors
    # Response and predictors
    y = data.frame(chr = chr, y = binTrt[,1])
    X = data.frame(chr = chr, matrix(0, n, length(binSizes)))
    Vy = data.frame(chr = chr, var = binTrt[,2])
    VX = data.frame(chr = chr, var = binCtrl[,2])
    X[,2] = binCtrl[,1]
    # coarser control bins, repeated so all predictors have the same length
    if(length(binSizes) > 1) for(j in 2:length(binSizes)) {
      x = bin(X[,j], binSizes[j]/binSizes[1])[,1]
      X[,j+1] = rep(x, each = binSizes[j]/binSizes[1])
    }
    cat(".")
    # Accumulate results
    y.all = rbind(y.all, y)
    X.all = rbind(X.all, X)
    Vy.all = rbind(Vy.all, Vy)
    VX.all = rbind(VX.all, VX)
    cat(".")
  }
  # Skip bins
  index = seq(1, nrow(X.all), binSkip)
  y.all = y.all[index,]
  X.all = X.all[index,]
  Vy.all = Vy.all[index,]
  VX.all = VX.all[index,]
  # Linear multiple regression for the mean
  cat("Performing regression.")
  yy = y.all[,"y"]
  XX = as.matrix(X.all[,2:ncol(X.all)])
  lm1 = lm(yy ~ XX - 1) # least squares through the origin
  coefs = lm1$coefficients
  y.hat = lm1$fitted.values
  e = yy - y.hat
  s2 = sum(e^2) / (length(yy) - ncol(XX))
  coefs.cov = s2 * solve(t(XX) %*% XX)
  names(coefs) = colnames(XX)
  # Background range
  rng = quantile(y.hat, quantile.rng)
  # Output
  cat("\n")
  out = list(binTrt = y.all, binCtrl = X.all, binTrtVar = Vy.all,
             binCtrlVar = VX.all, coefs = coefs, y.hat = y.hat, error = e,
             s2 = s2, coefs.cov = coefs.cov, bkgr.rng = rng)
  call = list(binSizes=binSizes, var=var, binSkip=binSkip)
  out = c(out, call)
  return(out)
}

#----------------------------------------------------------------------
# Auxiliary function for globalRegr
# Takes a full vector of counts and returns mean and variance in bins
bin = function(x, binSize, var=FALSE) {
  L = floor(length(x) / binSize)
  y = matrix(x[1:(L*binSize)], binSize, L) # one column per bin
  y.mean = colMeans(y)
  y.var = NaN
  if(var==TRUE) y.var = (colMeans(y^2) - y.mean^2) * binSize / (binSize-1)
  return(data.frame(mean = y.mean, var = y.var))
}
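#----------------------------------------------------------------------
# Example (simulated counts, wrapped in if(FALSE)):
if(FALSE) {
  x = rpois(1e4, 0.01) # sparse base-pair counts
  b = bin(x, binSize = 1000, var = TRUE)
  head(b)              # per-bin mean and variance
}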
#------------------------------------------------------------------------
# Computes the distribution of heights of local maxima for various values
# of the Poisson rate parameter lambda via Monte Carlo simulation.
# Returns a look-up table of p-values as a function of threshold, each
# column corresponding to a different value of lambda.
#
# Output is a list with slots:
# 1. thresholds
# 2. lambda values
# 3. p-value look-up table (thresholds x lambda values)
# 4-. the parameters used when calling the function
lambdaSimul = function(kernel, lambda.rng = c(1e-5, 1e-1), lambda.res = 300,
                       counts = 100, min.simul = 1e5, thresh.res = 200,
                       lambda.df = 5, tol = 0.25, trunc = TRUE) {
  cat("Estimating height distributions.")
  # Get range of potential values of lambda
  lambda.rng = lambda.rng * c(1-tol, 1+tol)
  lambda = 10^seq(log10(lambda.rng[1]), log10(lambda.rng[2]), len=lambda.res)
  N = pmax(counts/lambda, min.simul)
  # Simulate Poisson sequences, smooth, get local maxima
  locmax.distr = vector('list', length(lambda))
  for(j in 1:length(lambda)) {
    if(j %% 20 == 0) cat(".")
    x = rpois(N[j], lambda[j])
    if(trunc) x = pmin(x, 2) # truncate counts, mimicking duplicate removal
    # myfilter is okay here because of the tails
    y = myfilter(x, kernel)
    if(any(y > 0)) {
      pks = findMax(y)
      locmax.distr[[j]] = ecdf(pks$height)
    } else locmax.distr[[j]] = ecdf(max(kernel))
  }
  # Smooth height distributions across lambda for better accuracy;
  # the threshold grid runs up to the largest simulated peak height
  # (pks is left over from the last, i.e. largest, lambda)
  u = 10^seq(log10(max(kernel)), log10(max(pks$height)), len=thresh.res)
  p.hat = p = matrix(0, length(u), length(lambda))
  for(k in 1:length(lambda)) p[,k] = 1 - locmax.distr[[k]](u)
  lambda.spline = bs(lambda, df=lambda.df)
  p.hat[1,] = p[1,]
  for(j in 2:length(u)) {
    coefs = nonneg.ls(p[j,], lambda.spline)
    p.hat[j,] = lambda.spline %*% coefs
  }
  p.hat = pmin(pmax(p.hat, 0), 1)
  cat("\n")
  out = vector("list", 3)
  out[[1]] = u
  out[[2]] = lambda
  out[[3]] = p.hat
  names(out) = c("thresholds","lambda","pvalTable")
  call = list(lambda.rng = lambda.rng, lambda.res = lambda.res, counts = counts,
              min.simul = min.simul, thresh.res = thresh.res, lambda.df = lambda.df)
  out = c(out, call)
  return(out)
}

#----------------------------------------------------------------------
# Auxiliary function for lambdaSimul
# Performs non-negative least squares; finds the coefficients.
# nlminb is much faster than optim
nonneg.ls = function(y, X) {
  fn <- function(par) sum((y - X %*% par)^2)
  # coefs = optim(rep(0,ncol(X)), fn, lower=rep(0,ncol(X)), method="L-BFGS-B")$par # 83 sec. for df=10
  coefs = nlminb(rep(0,ncol(X)), fn, lower=rep(0,ncol(X)))$par # 6 sec. for df=5
  return(coefs)
}
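#----------------------------------------------------------------------
# Example (simulated spline fit, wrapped in if(FALSE); needs the splines
# package for bs()):
if(FALSE) {
  X = bs(seq(0, 1, len = 100), df = 5)
  y = X %*% c(0, 2, 1, 0, 3) + rnorm(100, sd = 0.05)
  nonneg.ls(y, X) # coefficients constrained >= 0, near c(0, 2, 1, 0, 3)
}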
########################################################################
# Analysis Functions
####################

# Performs smoothing and finds all the peaks in the original data for
# all chromosomes
# readGroupSize can be the same as was used in preprocessing
peakFinder = function(trt, alignParams, chr = "ALL", speed = FALSE,
                      readGroupSize = NULL) {
  cat("Smoothing.\n")
  theFilter = alignParams[["kernel"]]
  if(is.null(readGroupSize)) readGroupSize = alignParams$readGroupSize
  if(chr[1] != "ALL") trt = trt[which(trt$chr %in% chr),]
  # One chromosome at a time
  trtChrIndexes = splitit(trt$chr)
  chrNames = names(trtChrIndexes)
  thePeakList = data.frame()
  cat("Current chromosome:\n")
  for(i in seq(along = trtChrIndexes)) {
    chr = chrNames[i]
    cat(paste(chr, "\n", sep=""))
    index = trtChrIndexes[[i]]
    peaks = smoothSpeed(trt[index,], chr = chr, theFilter = theFilter,
                        readGroupSize = readGroupSize, speed = speed) # 20 min. for chr 1
    thePeakList = rbind(thePeakList, peaks)
  }
  cat("\n")
  return(thePeakList)
}

#----------------------------------------------------------------------
# For each peak found by peakFinder(), computes local averages from Control
# Control is of "aligned" class
# Output is the set of predictors for estimating the background lambda
bkgrPrep = function(Peaks, Control, regParams, chr="ALL") {
  cat("Computing local averages from Control\n")
  if(chr[1] != "ALL") Peaks = Peaks[which(Peaks$chr %in% chr),]
  chrIndexes = splitit(Peaks$chr)
  chrNames = names(chrIndexes)
  X.all = NULL
  for(i in seq(along = chrNames)) {
    chr = chrNames[i]
    cat(chr)
    peaks = Peaks[chrIndexes[[i]],]
    chrIndexes.ctrl = which(Control$chr == chr)
    controlAlign = Control[chrIndexes.ctrl,]
    # Control ranges: one width-1 range per aligned control position
    controlRanges = GRanges(seqnames = Rle(rep(chrNames[i], nrow(controlAlign))),
                            ranges = IRanges(start = controlAlign$pos,
                                             end = controlAlign$pos),
                            score = controlAlign$count,
                            strand = Rle(strand(rep("+", nrow(controlAlign)))))
    # Set up predictors: binned control
    binSizes = regParams[["binSizes"]]
    X = matrix(0, nrow = nrow(peaks), ncol = length(binSizes))
    for(j in 1:length(binSizes)) {
      cat(".")
      # window of binSizes[j]+1 bases centered on each peak
      ranges = GRanges(seqnames = Rle(peaks$chr),
                       ranges = IRanges(start = peaks$pos - binSizes[j]/2,
                                        end = peaks$pos + binSizes[j]/2),
                       strand = Rle(strand(rep("+", nrow(peaks)))))
      test = findOverlaps(ranges, controlRanges)
      test = as.matrix(test)
      Indexes = splitit(test[,1])
      counts = lapply(Indexes, function(x) sum(controlAlign$count[test[x,2]]))
      counts = unlist(counts)/(binSizes[j]+1)
      X[as.integer(names(Indexes)), j] = counts
    }
    # Put predictors together
    X.all = rbind(X.all, X)
  }
  cat("\n")
  return(X.all)
}
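#----------------------------------------------------------------------
# Example calls (assume aligned data, alignParams from shiftPeakEst() and
# regParams from globalRegr(); wrapped in if(FALSE)):
if(FALSE) {
  allPeaks = peakFinder(aligned[["Treat"]], alignParams, chr = "chr1")
  head(allPeaks) # chr / pos / height for every local maximum
  predictors = bkgrPrep(allPeaks, aligned[["Control"]], regParams, chr = "chr1")
}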
signifSeq = function(peaks, predictors, regParams, distrParams, chr="ALL") {
  cat("Computing p-values.")
  thresholds = distrParams[["thresholds"]]
  lambdaTable = distrParams[["lambda"]]
  pvalTable = distrParams[["pvalTable"]] # thresholds x lambda
  # Estimate Poisson background rate for all peaks
  lambda.local = as.vector(predictors %*% regParams$coefs)
  peaks$lambda.local = lambda.local
  peaks$lambda = pmax(lambda.local, regParams$bkgr.rng[1])
  peaks$SNR = peaks$height / peaks$lambda
  # Get p-values by interpolation of the look-up table
  pval = rep(1, length(peaks$lambda))
  lambda = pmax(pmin(peaks$lambda, max(lambdaTable)), min(lambdaTable))
  index = which(peaks$height > min(thresholds))
  for(j in index) {
    if(j %% 100000 == 0) cat(".")
    pval[j] = lut2(thresholds, lambdaTable, pvalTable, peaks$height[j], lambda[j])
  }
  peaks$pvalue = pmax(pmin(pval, 1), 0)
  # Q-values and ordering
  peaks$qvalue = p.adjust(peaks$pvalue, method = "BH")
  cat("\n")
  return(peaks)
}

#----------------------------------------------------------------------
# Auxiliary function: bilinear interpolation in a look-up table
# x, y are vectors
# z is a table of size length(x) by length(y)
# xout, yout are the values to interpolate at
lut2 <- function(x, y, z, xout, yout) {
  # get the two closest x and y in the look-up table
  d = sort(abs(xout - x), index.return=T)
  i1 = d$ix[1]; i2 = d$ix[2]
  d = sort(abs(yout - y), index.return=T)
  j1 = d$ix[1]; j2 = d$ix[2]
  # bilinear interpolation
  rx = (xout - x[i1]) / (x[i2] - x[i1])
  ry = (yout - y[j1]) / (y[j2] - y[j1])
  zout = (1-rx)*(1-ry)*z[i1,j1] + (1-rx)*ry*z[i1,j2] +
         rx*(1-ry)*z[i2,j1] + rx*ry*z[i2,j2]
  return(zout)
}

########################################################################
# Wrapper for everything
####################

seqAnalysis = function(dataDir, treat = "Treatment_tags.bed",
                       input = "Input_tags.bed", saveIt = TRUE) {
  rawData = readData(data.path = dataDir, treat = treat, input = input) # 92 sec.
  # Estimate shift and peak shape
  alignParams = shiftPeakEst(rawData, chr = "chr1", trimSize = 401) # 21 min. for chr1 (speed OFF)
  if(saveIt) save(alignParams, file = "alignParams.rda", compress=T)
  # Perform alignment
  aligned = list(Treat = NULL, Control = NULL)
  aligned[["Treat"]] = alignSeq(rawData[["Treat"]], shift = alignParams$shift)
  aligned[["Control"]] = alignSeq(rawData[["Control"]], shift = alignParams$shift)
  if(saveIt) save(aligned, file = "aligned.rda", compress=T)
  # Find peaks
  allPeaks = peakFinder(aligned[["Treat"]], alignParams, chr = "ALL")
  if(saveIt) save(allPeaks, file = "allPeaks.rda", compress=T)
  # Global regression
  regParams = globalRegr(aligned, var=TRUE) # ~8 min.
  if(saveIt) save(regParams, file = "regParams.rda", compress=T)
  # MonteCarlo simulations
  distrParams = lambdaSimul(kernel = alignParams$kernel,
                            lambda.rng = regParams$bkgr.rng)
  # 8 min. for N=1e5, lambda.res=100; 40 min. for N=3e5, lambda.res=500
  if(saveIt) save(distrParams, file = "distrParams.rda", compress=T)
  # Prepare background
  predictors = bkgrPrep(allPeaks, aligned$Control, regParams) # ~60 min.
  if(saveIt) save(predictors, file = "predictors.rda", compress=T)
  # Multiple testing
  finalPeaks = signifSeq(allPeaks, predictors, regParams, distrParams) # ~30 min.
  if(saveIt) save(finalPeaks, file = "finalPeaks.rda", compress=T)
  # Ordering and significance
  topPeaks = finalPeaks[order(finalPeaks$SNR, decreasing = T),]
  topPeaks = topPeaks[order(topPeaks$pvalue),]
  signifPeaks = topPeaks[topPeaks$qvalue < 0.01,]
  if(saveIt) save(signifPeaks, file = "signifPeaks.rda", compress=T)
  # Motif analysis
  motifObs = motifDist(signifPeaks, dist = c(25, 100, 400))
  if(saveIt) save(motifObs, file = "motifObs.rda")
  return(signifPeaks)
}
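#----------------------------------------------------------------------
# Example top-level call (hypothetical directory, wrapped in if(FALSE)):
if(FALSE) {
  initialize()
  signifPeaks = seqAnalysis(dataDir = "./data",
                            treat = "Treatment_tags.bed",
                            input = "Input_tags.bed", saveIt = TRUE)
  head(signifPeaks) # peaks with q-value < 0.01, ordered by p-value
}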
########################################################################
# Evaluation Functions
####################

# Motif analysis for various distances from the binding site
motifDist = function(peaks, dist = 100, type = "FOXA1") {
  # motif counts from JASPAR - http://jaspar.genereg.net/cgi-bin/jaspar_db.pl
  # (A, C, G, T are local count vectors, one entry per motif position)
  if(type == "FOXA1") {
    A = c( 13, 108,   1,   5,   6, 476,   6, 291,  45, 378,  90)
    C = c(  2,   7,   7,   7,   3,  12, 807, 106, 368,   9, 125)
    G = c(  2, 770,   5,  24,  78, 368,   5,   5,  69,  24, 466)
    T = c(875,   8, 882, 860, 809,  40,  79, 495, 414, 482, 211)
  }
  if(type == "GABP") {
    A = c(100,   0,  75,    0,    0, 1800, 1800,    0, 1,    0,   0, 250)
    C = c(100, 600, 300,    0,    0,    0,    0,    0, 1,    0, 150,   0)
    G = c(100, 300,  50, 1800, 1800,    0,    0, 1800, 1, 1100,  50, 200)
    T = c(  0,   0,   1,    0,    0,    0,    0,    0, 1,  100,  50,  50)
  }
  # # Another possible GABP motif
  # if(type == "GABP") {
  #   A = c(75,  50, 250,    0,    0, 1800, 1800, 100,  1, 25,  1)
  #   C = c( 0, 350, 500,    0,    0,    0,    0,   0,  1,  1,  1)
  #   G = c(75, 100,   0, 1800, 1800,    0,    0, 800,  1, 75, 75)
  #   T = c( 0,   0,   0,    0,    0,    0,    0,   0, 75,  1,  1)
  # }
  motif = rbind(A, C, G, T)
  # probability of each base at each position
  motif = apply(motif, 2, function(x) x/sum(x))
  motifObs = data.frame(matrix(0, nrow=nrow(peaks), ncol=length(dist)))
  names(motifObs) = dist
  for(i in 1:length(dist)) {
    cat(dist[i], "\n")
    motifObs[,i] = checkMotif(motif, peaks, dist = dist[i])
    cat("\n")
  }
  return(motifObs)
}

#----------------------------------------------------------------------
# motif finding
# motif is a 4 x N PWM (rows A, C, G, T)
checkMotif = function(motif, peaks, dist = 35) {
  chrs = unique(peaks$chr)
  motifCount = rep(0, nrow(peaks))
  cat("finding motif locations.")
  for(i in 1:length(chrs)) {
    cat(".")
    chr = chrs[i]
    # subset data
    peaks.chr = peaks[peaks$chr %in% chr,]
    mm = motifCount[peaks$chr %in% chr]
    tmpI = IRanges(start=peaks.chr$pos, end=peaks.chr$pos)
    # look for all motifs on the current chromosome
    # theSeq = Hsapiens[[chrs[i]]]
    theSeq = Hsapiens[[toString(chr)]]
    hits = matchPWM(motif, theSeq)
    # count motif hits within dist of each peak
    test = findOverlaps(tmpI, hits, maxgap = dist)
    test = as.matrix(test)
    tab = table(test[,1])
    mm[as.integer(names(tab))] = tab
    # replace
    motifCount[peaks$chr %in% chr] = mm
  }
  cat("\n")
  return(motifCount)
}

#----------------------------------------------------------------------
# Find matches between two sets of results
# object1 and object2 are data frames whose first column is chr;
# col1/col2 give the position column in each (default 2)
pointMatch = function(object1, object2, col1=2, col2=2, verbose=TRUE){
  Indexes1=split(seq(along=as.character(object1$chr)), as.character(object1$chr))
  Indexes2=split(seq(along=as.character(object2$chr)), as.character(object2$chr))
  res=matrix(NA, nrow=nrow(object1), ncol=2)
  for(i in seq(along=Indexes1)){
    CHR=names(Indexes1)[i]
    if(verbose) cat(CHR, ",")
    Index1=Indexes1[[CHR]]
    Index2=Indexes2[[CHR]]
    if(length(Index1)>0 & length(Index2)>0){
      tmpmat1=object1[Index1, col1]
      tmpmat2=object2[Index2, col2]
      tmp=fuzzy.match(tmpmat1, tmpmat2)
      tmp[,2] = Index2[tmp[,2]]
      res[Index1,]=tmp
    }
  }
  cat("\n")
  colnames(res) <- c("dist","matchIndex")
  return(res)
}
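#----------------------------------------------------------------------
# Example (hypothetical peak lists with columns chr and pos; if(FALSE)):
if(FALSE) {
  res = pointMatch(peaks1, peaks2)
  summary(abs(res[,"dist"])) # distance from each peaks1 row to its match
  # fuzzy.match() (defined below) does the per-chromosome work:
  fuzzy.match(c(10, 25, 70), c(12, 30, 60))
  # expected rows (signed dist, index of nearest y): (2,1), (5,2), (-10,3);
  # negative distance means the nearest y lies below x
}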
#----------------------------------------------------------------------
# For each x, find the nearest element of y.
# Returns a 2-column matrix: signed distance (positive if the nearest y
# is above x, negative if below) and the index of that y.
fuzzy.match = function(x, y, x.sorted = FALSE, y.sorted = FALSE) {
  l.x = length(x)
  # space for results
  r = array(0, c(l.x, 2))
  if (! x.sorted) {
    # sort x
    perm = order(x)
    X = x[perm]
  }
  l.y = length(y)
  if (! y.sorted) {
    Y = array(0, 1+l.y)
    # sort y, with a sentinel +Inf at the end
    perm.y = order(y)
    Y[1:l.y] = y[perm.y]
    Y[1+l.y] = +Inf
  } else y[1+l.y] = +Inf
  # set up
  v = -Inf
  j = 1
  if (y.sorted) u = y[j] else u = Y[j]
  # run over X, or x if x.sorted
  for (i in 1:l.x) {
    if (x.sorted) z = x[i] else z = X[i]
    e = z - v
    d = u - z
    while (d < 0) {
      v = u
      e = z - v
      j = j + 1
      if (y.sorted) u = y[j] else u = Y[j]
      d = u - z
    }
    # result
    if (x.sorted) k = i else k = perm[i]
    if (y.sorted) r[k,2] = j else r[k,2] = perm.y[j]
    if (d == 0) r[k,1] = 0
    else if (e < d) {
      r[k,1] = -e
      if (y.sorted) r[k,2] = j-1 else r[k,2] = perm.y[j-1]
    }
    else # e >= d
      r[k,1] = d
  }
  r
}

# as fuzzy.match, but z is an Nx2 matrix whose rows are separate intervals
# which play the role of the y-elements; distance is 0 inside an interval
fuzzy.match2 = function(x, z, x.sorted = FALSE, z.sorted = FALSE) {
  if (z.sorted) y = t(z)
  else {
    perm = order(z[,1])
    y = t(z[perm,])
  }
  f = fuzzy.match(x, y, x.sorted=x.sorted, y.sorted=TRUE)
  for (i in 1:length(x)) {
    d = f[i,1] # distance from as.vector(y)
    j = f[i,2] # nearest index of as.vector(y)
    jmod2 = j %% 2
    if (z.sorted) f[i,2] = (j+jmod2)/2 else f[i,2] = perm[(j+jmod2)/2]
    if ((d>0 && jmod2==0) || (d<0 && jmod2==1)) f[i,1] = 0
  }
  f
}

########################################################################
# Graphics
####################

#----------------------------------------------------------------------
# Show a data segment of length L around position loc on chromosome chr
seqPlot <- function(peaks, aligned, kernel, chr, loc, L=10001, seg=NULL,
                    alpha = 0.05) {
  if(is.null(seg)) seg = c(loc-(L-1)/2, loc+(L-1)/2)
  segDat = data.frame(t = seg[1]:seg[2], ctrl = rep(0, L), trt = rep(0, L),
                      smooth = rep(0, L), height = rep(NaN, L),
                      lambda = rep(NaN, L), SNR = rep(NaN, L),
                      pvalue = rep(NaN, L), qvalue = rep(NaN, L),
                      sig = rep(NaN, L), nonsig = rep(NaN, L))
  # Control
  i = which((aligned$Control$chr == chr) & (aligned$Control$pos > seg[1]) &
            (aligned$Control$pos < seg[2]))
  segDat$ctrl[aligned$Control$pos[i] - loc + (L-1)/2] = aligned$Control$count[i]
  # Treat
  i = which((aligned$Treat$chr == chr) & (aligned$Treat$pos > seg[1]) &
            (aligned$Treat$pos < seg[2]))
  segDat$trt[aligned$Treat$pos[i] - loc + (L-1)/2] = aligned$Treat$count[i]
  segDat$smooth = myfilter(segDat$trt, kernel)
  # Peaks and background
  i = which((peaks$chr == chr) & (peaks$pos > seg[1]) & (peaks$pos < seg[2]))
  segDat[peaks$pos[i] - loc + (L-1)/2, 5:9] = cbind(peaks$height[i],
                                                    peaks$lambda[i],
                                                    peaks$SNR[i],
                                                    peaks$pvalue[i],
                                                    peaks$qvalue[i])
  segDat[peaks$pos[i] - loc + (L-1)/2, 10] = ifelse(peaks$qvalue[i] < alpha,
                                                    peaks$height[i], NaN)
  segDat[peaks$pos[i] - loc + (L-1)/2, 11] = ifelse(peaks$qvalue[i] > alpha,
                                                    peaks$height[i], NaN)
  # Plotting
  par(mfrow=c(6,1), cex.axis=1.5, cex.lab=1.5, cex.main=1.5)
  xlab = paste('location (', chr, ')', sep='')
  plot(segDat$t, segDat$ctrl, type='l', main='Control', xlab=xlab, ylab='',
       ylim=c(0,2))
  plot(segDat$t, segDat$trt, type='l', main='Treatment (IP)', xlab=xlab,
       ylab='', ylim=c(0,2))
  plot(segDat$t, 100*segDat$smooth, type='l', main='Smoothed IP', xlab=xlab,
       ylab='x 1e-2')
  points(segDat$t, 100*segDat$sig, col='red')
  points(segDat$t, 100*segDat$nonsig, col='blue')
  plot(segDat$t, 1000*segDat$lambda, main='Background rate lambda', xlab=xlab,
       ylab='x 1e-3')
  plot(segDat$t, log10(segDat$SNR), main='log10(SNR)', xlab=xlab, ylab='')
  plot(segDat$t, log10(segDat$pvalue+1e-10), main='log10(p-value)', xlab=xlab,
       ylab='')
}
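#----------------------------------------------------------------------
# Example (assumes the objects saved by seqAnalysis() are loaded;
# wrapped in if(FALSE)):
if(FALSE) {
  seqPlot(finalPeaks, aligned, alignParams$kernel,
          chr = "chr1", loc = 1e6, L = 10001)
}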
#----------------------------------------------------------------------
# Compare motif analysis results for two peak sets
# *** Fix function to handle only one, or more than two, sets of motifs
motifPlot2 = function(motifObs, motifObsCis, label1 = "STEM+Regr",
                      label2 = NULL, binSize = 500) {
  library(RColorBrewer)
  mypar <- function(a=1, b=1, brewer.n=8, brewer.name="Dark2", ...){
    par(mar=c(2.5,2.5,1.6,1.1), mgp=c(1.5,.5,0))
    par(mfrow=c(a,b), ...)
    palette(brewer.pal(brewer.n, brewer.name))
    par(cex.axis=1.5, cex.lab=1.5, cex.main=1.5)
  }
  mypar()
  bins = cut(1:nrow(motifObs), breaks=seq(1, nrow(motifObs), by=binSize),
             labels = F)
  binIndex = splitit(bins)
  plot(0~0, type = "n", ylim = c(0,1), xlim = c(1, max(bins, na.rm=T)),
       xlab = paste("Ranked by", binSize),
       ylab = "Proportion with Motif Nearby"
       #, main = paste("Motifs within", dist, "base pairs")
       )
  for(i in seq(along=motifObs)) {
    tmp = motifObs[,i]; tmp[tmp>1] = 1 # presence/absence
    tmpcis = motifObsCis[,i]; tmpcis[tmpcis>1] = 1
    yPoint = matrix(nrow = max(bins, na.rm=T), ncol = 2)
    colnames(yPoint) = c("no","yes")
    yPointCis = matrix(nrow = floor(length(tmpcis)/binSize), ncol = 2)
    colnames(yPointCis) = colnames(yPoint)
    for(j in seq(along=binIndex)) {
      Index = binIndex[[j]]
      tmp2 = table(tmp[Index]); tmp2 = tmp2/sum(tmp2)
      x = as.numeric(names(tmp2)); y = as.numeric(tmp2)
      yPoint[j,] = y
      points(y[2] ~ j, col = 1, pch = 19, cex = 1.3)
      if(max(Index) < length(tmpcis)) {
        tmp2 = table(tmpcis[Index]); tmp2 = tmp2/sum(tmp2)
        x = as.numeric(names(tmp2)); y = as.numeric(tmp2)
        yPointCis[j,] = y
        points(y[2] ~ j, col = 2, pch = 19, cex = 1.3)
      }
    }
    legend("topright", c(label1, label2), pch = 19, pt.cex = 2, col = 1:2,
           cex=1.5)
    lines(lowess(yPoint[is.finite(yPoint[,1]),2], f = 0.6), lwd = 5, col = 1)
    lines(lowess(yPointCis[is.finite(yPointCis[,1]),2], f = 0.6), lwd = 5, col = 2)
  }
}
#----------------------------------------------------------------------
# Compare motif analysis results for three peak sets
motifPlot3 = function(motifObs, motifObs2, motifObs3, label1 = "STEM+Regr",
                      label2 = NULL, label3, binSize = 500) {
  library(RColorBrewer)
  mypar <- function(a=1, b=1, brewer.n=8, brewer.name="Dark2", ...){
    par(mar=c(2.5,2.5,1.6,1.1), mgp=c(1.5,.5,0))
    par(mfrow=c(a,b), ...)
    palette(brewer.pal(brewer.n, brewer.name))
    par(cex.axis=1.5, cex.lab=1.5, cex.main=1.5)
  }
  mypar()
  bins = cut(1:nrow(motifObs), breaks=seq(1, nrow(motifObs), by=binSize),
             labels = F)
  binIndex = splitit(bins)
  plot(0~0, type = "n", ylim = c(0,1), xlim = c(1, max(bins, na.rm=T)),
       xlab = paste("Ranked by", binSize),
       ylab = "Proportion with Motif Nearby"
       #, main = paste("Motifs within", dist, "base pairs")
       )
  for(i in seq(along=motifObs)) {
    count1 = motifObs[,i];  count1[count1>1] = 1
    count2 = motifObs2[,i]; count2[count2>1] = 1
    count3 = motifObs3[,i]; count3[count3>1] = 1
    yPoint = matrix(nrow = max(bins, na.rm=T), ncol = 2)
    colnames(yPoint) = c("no","yes")
    yPoint2 = matrix(nrow = floor(length(count2)/binSize), ncol = 2)
    colnames(yPoint2) = colnames(yPoint)
    yPoint3 = matrix(nrow = floor(length(count3)/binSize), ncol = 2)
    colnames(yPoint3) = colnames(yPoint)
    for(j in seq(along=binIndex)) {
      Index = binIndex[[j]]
      tmp2 = table(count1[Index]); tmp2 = tmp2/sum(tmp2)
      x = as.numeric(names(tmp2)); y = as.numeric(tmp2)
      yPoint[j,] = y
      points(y[2] ~ j, col = 1, pch = 19, cex = 1.3)
      if(max(Index) < length(count2)) {
        tmp2 = table(count2[Index]); tmp2 = tmp2/sum(tmp2)
        x = as.numeric(names(tmp2)); y = as.numeric(tmp2)
        yPoint2[j,] = y
        points(y[2] ~ j, col = 2, pch = 19, cex = 1.3)
      }
      if(max(Index) < length(count3)) {
        tmp2 = table(count3[Index]); tmp2 = tmp2/sum(tmp2)
        x = as.numeric(names(tmp2)); y = as.numeric(tmp2)
        yPoint3[j,] = y
        points(y[2] ~ j, col = 3, pch = 19, cex = 1.3)
      }
    }
    legend("bottomleft", c(label1, label2, label3), pch = 19, pt.cex = 2,
           col = 1:3, cex=1.5)
    lines(lowess(yPoint[is.finite(yPoint[,1]),2],  f = 0.3), lwd = 5, col = 1)
    lines(lowess(yPoint2[is.finite(yPoint2[,1]),2], f = 0.3), lwd = 5, col = 2)
    lines(lowess(yPoint3[is.finite(yPoint3[,1]),2], f = 0.3), lwd = 5, col = 3)
  }
}
#----------------------------------------------------------------------
# Compare motif analysis results for four peak sets
motifPlot4 = function(motifObs, motifObs2, motifObs3, motifObs4,
                      label1 = "STEM+Regr", label2 = NULL, label3 = NULL,
                      label4 = NULL, binSize = 500) {
  library(RColorBrewer)
  mypar <- function(a=1, b=1, brewer.n=8, brewer.name="Dark2", ...){
    par(mar=c(2.5,2.5,1.6,1.1), mgp=c(1.5,.5,0))
    par(mfrow=c(a,b), ...)
    palette(brewer.pal(brewer.n, brewer.name))
    par(cex.axis=1.5, cex.lab=1.5, cex.main=1.5)
  }
  mypar()
  bins = cut(1:nrow(motifObs), breaks=seq(1, nrow(motifObs), by=binSize),
             labels = F)
  binIndex = splitit(bins)
  plot(0~0, type = "n", ylim = c(0,1), xlim = c(1, max(bins, na.rm=TRUE)),
       xlab = paste("Ranked by", binSize),
       ylab = "Proportion with Motif Nearby"
       #, main = paste("Motifs within", dist, "base pairs")
       )
  for(i in seq(along=motifObs)) {
    count1 = motifObs[,i];  count1[count1>1] = 1
    count2 = motifObs2[,i]; count2[count2>1] = 1
    count3 = motifObs3[,i]; count3[count3>1] = 1
    count4 = motifObs4[,i]; count4[count4>1] = 1
    yPoint = matrix(nrow = max(bins, na.rm=TRUE), ncol = 2)
    colnames(yPoint) = c("no","yes")
    yPoint2 = matrix(nrow = floor(length(count2)/binSize), ncol = 2)
    colnames(yPoint2) = colnames(yPoint)
    yPoint3 = matrix(nrow = floor(length(count3)/binSize), ncol = 2)
    colnames(yPoint3) = colnames(yPoint)
    yPoint4 = matrix(nrow = floor(length(count4)/binSize), ncol = 2)
    colnames(yPoint4) = colnames(yPoint)
    for(j in seq(along=binIndex)) {
      Index = binIndex[[j]]
      tmp2 = table(count1[Index]); tmp2 = tmp2/sum(tmp2)
      x = as.numeric(names(tmp2)); y = as.numeric(tmp2)
      yPoint[j,] = y
      points(y[2] ~ j, col = 1, pch = 19, cex = 1.3)
      if(max(Index) < length(count2)) {
        tmp2 = table(count2[Index]); tmp2 = tmp2/sum(tmp2)
        x = as.numeric(names(tmp2)); y = as.numeric(tmp2)
        yPoint2[j,] = y
        points(y[2] ~ j, col = 2, pch = 19, cex = 1.3)
      }
      if(max(Index) < length(count3)) {
        tmp2 = table(count3[Index]); tmp2 = tmp2/sum(tmp2)
        x = as.numeric(names(tmp2)); y = as.numeric(tmp2)
        yPoint3[j,] = y
        points(y[2] ~ j, col = 3, pch = 19, cex = 1.3)
      }
      if(max(Index) < length(count4)) {
        tmp2 = table(count4[Index]); tmp2 = tmp2/sum(tmp2)
        x = as.numeric(names(tmp2)); y = as.numeric(tmp2)
        yPoint4[j,] = y
        points(y[2] ~ j, col = 4, pch = 19, cex = 1.3)
      }
    }
    legend("bottomleft", c(label1, label2, label3, label4), pch = 19,
           pt.cex = 2, col = 1:4, cex=1.5)
    lines(lowess(yPoint[is.finite(yPoint[,1]),2],  f = 0.3), lwd = 5, col = 1)
    lines(lowess(yPoint2[is.finite(yPoint2[,1]),2], f = 0.3), lwd = 5, col = 2)
    lines(lowess(yPoint3[is.finite(yPoint3[,1]),2], f = 0.3), lwd = 5, col = 3)
    lines(lowess(yPoint4[is.finite(yPoint4[,1]),2], f = 0.3), lwd = 5, col = 4)
  }
}
#----------------------------------------------------------------------
# As motifPlot3, but with an option for cumulative bins
motifPlotCum = function(motifObs, motifObs2, motifObs3, label1 = "STEM+Regr",
                        label2 = NULL, label3, cum = FALSE, binSize = 500) {
  library(RColorBrewer)
  mypar <- function(a=1, b=1, brewer.n=8, brewer.name="Dark2", ...){
    par(mar=c(2.5,2.5,1.6,1.1), mgp=c(1.5,.5,0))
    par(mfrow=c(a,b), ...)
    palette(brewer.pal(brewer.n, brewer.name))
    par(cex.axis=1.5, cex.lab=1.5, cex.main=1.5)
  }
  mypar()
  bins = cut(1:nrow(motifObs), breaks=seq(1, nrow(motifObs), by=binSize),
             labels = F)
  binIndex = splitit(bins)
  plot(0~0, type = "n", ylim = c(0,1), xlim = c(1, max(bins, na.rm=T)),
       xlab = paste("Ranked by", binSize),
       ylab = "Proportion with Motif Nearby"
       #, main = paste("Motifs within", dist, "base pairs")
       )
  for(i in seq(along=motifObs)) {
    count1 = motifObs[,i];  count1[count1>1] = 1
    count2 = motifObs2[,i]; count2[count2>1] = 1
    count3 = motifObs3[,i]; count3[count3>1] = 1
    yPoint = matrix(nrow = max(bins, na.rm=T), ncol = 2)
    colnames(yPoint) = c("no","yes")
    yPoint2 = matrix(nrow = floor(length(count2)/binSize), ncol = 2)
    colnames(yPoint2) = colnames(yPoint)
    yPoint3 = matrix(nrow = floor(length(count3)/binSize), ncol = 2)
    colnames(yPoint3) = colnames(yPoint)
    for(j in seq(along=binIndex)) {
      # cumulative bins pool all peaks ranked above the current bin
      if(!cum) Index = binIndex[[j]]
      if(cum) Index = unlist(binIndex[1:j])
      tmp2 = table(count1[Index]); tmp2 = tmp2/sum(tmp2)
      x = as.numeric(names(tmp2)); y = as.numeric(tmp2)
      yPoint[j,] = y
      points(y[2] ~ j, col = 1, pch = 19, cex = 1.3)
      if(max(Index) < length(count2)) {
        tmp2 = table(count2[Index]); tmp2 = tmp2/sum(tmp2)
        x = as.numeric(names(tmp2)); y = as.numeric(tmp2)
        yPoint2[j,] = y
        points(y[2] ~ j, col = 2, pch = 19, cex = 1.3)
      }
      if(max(Index) < length(count3)) {
        tmp2 = table(count3[Index]); tmp2 = tmp2/sum(tmp2)
        x = as.numeric(names(tmp2)); y = as.numeric(tmp2)
        yPoint3[j,] = y
        points(y[2] ~ j, col = 3, pch = 19, cex = 1.3)
      }
    }
    legend("bottomleft", c(label1, label2, label3), pch = 19, pt.cex = 2,
           col = 1:3, cex=1.5)
    lines(lowess(yPoint[is.finite(yPoint[,1]),2],  f = 0.3), lwd = 5, col = 1)
    lines(lowess(yPoint2[is.finite(yPoint2[,1]),2], f = 0.3), lwd = 5, col = 2)
    lines(lowess(yPoint3[is.finite(yPoint3[,1]),2], f = 0.3), lwd = 5, col = 3)
  }
}
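#----------------------------------------------------------------------
# Example motif evaluation (assumes signifPeaks from seqAnalysis();
# wrapped in if(FALSE)):
if(FALSE) {
  motifObs = motifDist(signifPeaks, dist = c(25, 100, 400), type = "FOXA1")
  colMeans(motifObs > 0) # fraction of peaks with a motif within each distance
  # to compare against another method's peaks, e.g.:
  # motifPlot2(motifObs, motifObsOther, label1 = "STEM+Regr", label2 = "Other")
}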