######################
# ChIP-Seq functions
######################

########################################################################
# Generic Functions
####################

# shortcut for split: indexes of x grouped by the values of x
splitit <- function(x) split(seq(along=x), x)

#----------------------------------------------------------------------
# just converts an rle object to a data.frame
# with columns: value, length, startIndex, endIndex
rleFrame = function(r) {
  y <- data.frame(cbind(r[[2]], as.integer(r[[1]])), stringsAsFactors=FALSE)
  y[,2] <- as.integer(y[,2])
  y <- cbind(y, cumsum(y[,2]))
  y <- cbind(y, (y[,3] - y[,2] + 1))
  y = y[,c(1,2,4,3)]
  names(y) = c("x","len","start","end")
  return(y)
}

#----------------------------------------------------------------------
# from Rafa, makes clusters/read groups
# defined by the max gap between reads
clusterMaker <- function(chr, pos, order.it=TRUE, maxGap=300){
  nonaIndex=which(!is.na(chr) & !is.na(pos))
  Indexes=split(nonaIndex, chr[nonaIndex])
  clusterIDs=rep(NA, length(chr))
  LAST=0
  for(i in seq(along=Indexes)){
    Index=Indexes[[i]]
    x=pos[Index]
    if(order.it){ Index=Index[order(x)]; x=pos[Index] }
    y=as.numeric(diff(x) > maxGap)
    z=cumsum(c(1,y))
    clusterIDs[Index]=z+LAST
    LAST=max(z)+LAST
  }
  clusterIDs
}

#----------------------------------------------------------------------
# Regular smoothing (from Andrew Jaffe, JHU)
# %% possibly no longer needed, given dropping the NA tails
myfilter <- function(x, filter, ...){
  L=length(x)
  if(L > length(filter)){
    res=filter(x, filter, ...)
    ## now fill out the NAs
    M=length(filter)
    N=(M-1)/2
    for(i in 1:N){
      w=filter[(N-i+2):M]
      y=x[1:(M-N+i-1)]
      res[i] = sum(w*y)/sum(w)
      w=rev(w)
      ii=(L-(i-1))
      y=x[(ii-N):L]
      res[ii] <- sum(w*y)/sum(w)
    }
  } else {
    res=rep(mean(x), L)
  }
  return(res)
}
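#----------------------------------------------------------------------
# Example (a toy sketch, not from any real data set): clusterMaker()
# gives the same ID to consecutive positions separated by at most maxGap.
# Wrapped in if(FALSE) so sourcing this file has no side effects.
if(FALSE) {
  chr = rep("chr1", 6)
  pos = c(100, 150, 180, 900, 950, 5000)
  clusterMaker(chr, pos, maxGap = 300)
  # expected: 1 1 1 2 2 3 -- gaps larger than 300bp start a new read group
}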
#########################################################################
# initialize
####################

# loads the necessary packages, installing any that are
# missing from the user's library
initialize = function() {
  options(warn=-1)
  pckg = try(require(Biostrings))
  if(!pckg) {
    source("http://bioconductor.org/biocLite.R")
    biocLite("Biostrings")
    require("Biostrings")
  }
  pckg = try(require(GenomicRanges))
  if(!pckg) {
    source("http://bioconductor.org/biocLite.R")
    biocLite("GenomicRanges")
    require("GenomicRanges")
  }
  pckg = try(require(BSgenome.Hsapiens.UCSC.hg18)) # 844 MB!
  if(!pckg) {
    source("http://bioconductor.org/biocLite.R")
    biocLite("BSgenome.Hsapiens.UCSC.hg18")
    require("BSgenome.Hsapiens.UCSC.hg18")
  }
  pckg = try(require(splines))
  if(!pckg) {
    getPckg("splines")
    require(splines)
  }
  options(warn=1)
}

#----------------------------------------------------------------------
# install new packages from CRAN; Bioconductor packages are
# handled directly in initialize() above
getPckg = function(pckg) install.packages(pckg, repos = "http://cran.r-project.org")

#------------------------------------------------------------------------
# reads in the data from .bed files (or another user-defined type);
# treat and input are matched against the file names in data.path
readData = function(data.path = "./", treat = "Treatment", input = "Input",
                    type = "bed", saveIt = FALSE) {
  if(file.exists(paste(data.path, "chipseq_data.rda", sep="/"))) {
    cat("reading in the data from saved file.")
    load(paste(data.path, "chipseq_data.rda", sep="/")) # restores 'rawData'
    cat("\n")
  } else {
    cat("Reading in the data.")
    # dir() takes a single regular expression, not a vector, so find the
    # files whose names contain both the sample tag and the file type
    files = dir(data.path, full.names = T)
    trtFile = files[grepl(treat, files) & grepl(type, files)][1]
    ctrlFile = files[grepl(input, files) & grepl(type, files)][1]
    trt = read.table(trtFile, sep = "\t", as.is = T, header = F)
    cat(".")
    ctrl = read.table(ctrlFile, sep = "\t", as.is = T, header = F)
    names(trt) = names(ctrl) = c("chr","start","end","x","y","strand")
    cat("\n")
    rawData = vector("list", 2)
    rawData[[1]] = trt
    rawData[[2]] = ctrl
    names(rawData) = c("Treat","Control")
    if(saveIt) {
      cat(paste("saving the data as an rda to", data.path), "\n")
      save(rawData, file=paste(data.path, "chipseq_data.rda", sep = "/"), compress=T)
    }
  }
  return(rawData)
}
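#------------------------------------------------------------------------
# Example call (hypothetical file names, wrapped in if(FALSE)):
if(FALSE) {
  rawData = readData(data.path = "./data", treat = "Treatment",
                     input = "Input", type = "bed", saveIt = TRUE)
  str(rawData$Treat) # one row per tag: chr, start, end, x, y, strand
}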
########################################################################
# Processing Functions: aligning and smoothing
####################

#------------------------------------------------------------------------
# Aligns data from a BED file using the given shift
# Applies to the chromosomes listed in chr (default is "ALL")
# uniq=T only keeps unique tags for a given strand (removes duplicates)
#
# output is "aligned" data class: data frame with columns
# "chr" (chromosome), "pos" (position) and "count"
alignSeq = function(bed, shift, chr="ALL", uniq = TRUE) {
  cat("aligning\n")
  if(chr[1] != "ALL") bed = bed[which(bed$chr %in% chr),]
  chrIndexes = splitit(bed$chr)
  chrNames = names(chrIndexes)
  aligned = NULL
  for(i in seq(along=chrIndexes)) {
    chr = chrNames[i]
    cat(chr)
    dat = bed[chrIndexes[[i]],]
    count = alignShift(dat=dat, shift=shift, uniq=uniq) # <1 second each
    count$chr = chr
    aligned = rbind(aligned, count[,c("chr","pos","count")])
  }
  return(aligned)
}

#------------------------------------------------------------------------
# Auxiliary function for alignSeq: performs the actual shift on raw data
# uniq=T only keeps unique tags for a given strand (removes duplicates)
#
# output is data frame with columns "pos" (position) and "count"
alignShift = function(dat, shift, uniq = TRUE) {
  cat(".")
  plus = which(dat$strand == "+")
  plus.pos = dat$start[plus] + shift
  if(uniq) plus.pos = unique(plus.pos)
  plus.pos = data.frame(cbind(plus.pos, rep(1)))
  names(plus.pos) = c("pos","count")
  cat(".")
  minus = which(dat$strand == "-")
  minus.pos = dat$end[minus] - shift
  if(uniq) minus.pos = unique(minus.pos)
  minus.pos = data.frame(cbind(minus.pos, rep(1)))
  names(minus.pos) = c("pos","count")
  # combine strands: positions present on both strands get count 2
  out = merge(plus.pos, minus.pos, by = "pos", all = T)
  out$count = rowSums(out[,2:3], na.rm = T)
  out$count.x = out$count.y = NULL
  cat("\n")
  return(out)
}
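#------------------------------------------------------------------------
# Example (toy BED rows, wrapped in if(FALSE)): plus-strand tags map to
# start + shift, minus-strand tags to end - shift; duplicates are
# dropped when uniq = TRUE
if(FALSE) {
  bed = data.frame(chr = "chr1",
                   start = c(100, 100, 300), end = c(135, 135, 335),
                   x = NA, y = NA, strand = c("+", "+", "-"),
                   stringsAsFactors = FALSE)
  alignSeq(bed, shift = 10, chr = "chr1")
  # expected rows: pos 110 count 1 (the duplicated + tag), pos 325 count 1
}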
#------------------------------------------------------------------------
# this takes "aligned" data class, the max distance between
# reads in the same read group, an arbitrary (but unimodal)
# smoothing kernel, and the chromosome name
#
# Output is data frame with columns "chr" (chromosome), "pos" (position)
# and "height"
smoothSpeed = function(counts, chr, theFilter, readGroupSize, speed) {
  if(speed){
    cat("speed option is ON. scaling.")
    speedScale = 10
    counts$pos = round(counts$pos/speedScale)
    theFilter = approx(theFilter, n = (length(theFilter)-1)/speedScale + 1)$y
    readGroupSize = readGroupSize/speedScale
    cat("\n")
  } else cat("speed option is OFF","\n")
  cat("smoothing and peak hunting.")
  # Create groups by readGroupSize
  pns = clusterMaker(rep(chr, nrow(counts)), counts$pos, maxGap = readGroupSize)
  Indexes = splitit(pns)
  lens = sapply(Indexes, length)
  peak1 = which(lens == 1)
  peak2 = which(lens > 1)
  # peaks from singleton read groups: the peak sits at the read position,
  # with height = count * max of the filter
  Index1 = unlist(Indexes[peak1], use.names = FALSE)
  m = max(theFilter)
  thePeaks = data.frame(pos = counts$pos[Index1],
                        height = m * counts$count[Index1])
  # smooth and find maxima of groups with more counts
  peakList = vector("list", length(peak2))
  for(k in seq(along = peak2)) {
    if(k %% 50 == 0) cat(".")
    Index = Indexes[[peak2[k]]]
    count = counts[Index,] # the k'th group
    # interpolate with 0s but make sure the counts are in the right place
    xx = rep(0, 2*length(theFilter) + 1 +
               count$pos[length(count$pos)] - count$pos[1])
    xx[length(theFilter) + count$pos - count$pos[1] + 1] = count$count
    xx = filter(xx, theFilter)
    xx[is.na(xx)] = 0
    peaks = findMax(xx)
    peaks$pos = peaks$pos - length(theFilter) + count$pos[1] - 1
    peakList[[k]] = peaks
  }
  # merge the peaks
  for(k in seq(along = peakList)) thePeaks = rbind(thePeaks, peakList[[k]])
  # Remove peaks that are smaller than the filter height (artifact)
  # index = which(thePeaks$height < m)
  # thePeaks = thePeaks[-index,]
  # If speed, fix position
  if(speed){
    thePeaks$pos = thePeaks$pos * speedScale
  }
  # order them
  thePeaks = thePeaks[order(thePeaks$pos),]
  thePeaks$chr = chr
  thePeaks = thePeaks[,c("chr","pos","height")]
  cat("\n")
  return(thePeaks)
}

#------------------------------------------------------------------------
# finds all local maxima, i.e. inflection changes, in xx
# returns the position of each peak and its height
# *** Sometimes height is smaller than smallest peak
findMax = function(xx) {
  d = sign(diff(xx)) # find inflection points
  d = c(d[1], d)     # this keeps the indexes aligned with xx
  r = rle(as.numeric(d))
  y = rleFrame(r)
  # drop 0's first, because some patterns are 1, 0, 1
  y = y[y$x != 0,]
  inc = which(y$x == 1)                # see which runs are increasing
  Index = inc[which(y$x[inc+1] == -1)] # a 1 followed by a -1 is always a max
  ## loop over the maxima
  peaks = data.frame(matrix(nrow = length(Index), ncol = 2))
  colnames(peaks) = c("pos","height")
  for(j in seq(along = Index)) {
    # if(j %% 10000 == 0) cat(".")
    # look at the j'th peak
    tmp = y[Index[j],]
    # average from here to the next run, which includes dropped 0's
    nstart = y[(Index[j]+1),3] - 1
    # get the start and end of the peak more precisely
    peaks[j,1] = floor(mean(tmp$end:nstart)) # avg location, must be integer
    peaks[j,2] = max(xx[tmp$end:nstart])     # max height in the region
  }
  return(peaks)
}
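#------------------------------------------------------------------------
# Example (toy vector, wrapped in if(FALSE)):
if(FALSE) {
  xx = c(0, 1, 3, 2, 1, 4, 9, 4, 1, 0)
  findMax(xx)
  # expected: maxima at pos 3 (height 3) and pos 7 (height 9)
}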
########################################################################
# Preprocessing: Estimate shift and peak shape
####################

# Main: Estimate shift and peak shape
# Output is a list with slots:
# 1. tag distribution (plus & minus)
# 2. estimated shift
# 3. new filter (normalized) -- "kernel"
# 4. new filter (unnormalized) -- "filter"
# 5. new filter (untrimmed) -- "filter.untrimmed"
# 6-. the parameters used when calling the function
shiftPeakEst = function(rawData, chr = "chr1", firstShift = 100,
                        firstKernel = "Gaussian", kernelBand = 50,
                        readGroupSize = 1e4, speed = FALSE, topPeaks = 1000,
                        tagWinSize = 2001,
                        knots = c(1,501,801,901,951,976,1001,1026,1051,1101,1201,1501),
                        trimFilter = TRUE, trimSize = 801) {
  # align with initial shift
  aligned = list(Treat = NULL, Control = NULL)
  aligned$Treat = alignSeq(rawData[["Treat"]], chr = chr, shift = firstShift)
  aligned$Control = alignSeq(rawData[["Control"]], chr = chr, shift = firstShift)
  # Choose filter
  if(firstKernel == "Gaussian") {
    theFilter = dnorm((-(4*kernelBand):(4*kernelBand))/(kernelBand+1))
  }
  if(firstKernel == "Uniform") {
    unif = function(x) ifelse(abs(x) < 1, 0.5, 0)
    theFilter = unif(seq(-kernelBand, kernelBand)/(kernelBand+1))
  }
  # make sure it integrates to 1
  theFilter = theFilter/sum(theFilter)
  # Smoothing
  cat("Smoothing.\n")
  thePeakList = smoothSpeed(aligned[["Treat"]], chr = chr, theFilter = theFilter,
                            readGroupSize = readGroupSize, speed = speed) # 20 min. for chr 1
  # Pick top peaks and obtain tag distribution
  cat("Estimating tag distribution and shift.\n")
  theTopPeaks = thePeakList[order(thePeakList$height, decreasing = T)[1:topPeaks],]
  tags = tagDistrEst(rawData[["Treat"]], chr = chr, theTopPeaks,
                     tagWinSize = tagWinSize)
  # Get best shift and smoothing kernel
  newShiftInfo = estShift(tags, knots = knots, trimFilter = trimFilter,
                          trimSize = trimSize)
  # Output
  theOutput = list(tagDistr = tags)
  call = list(chr=chr, firstShift=firstShift, firstKernel=firstKernel,
              kernelBand=kernelBand, readGroupSize=readGroupSize, speed=speed,
              topPeaks=topPeaks, tagWinSize=tagWinSize, knots=knots,
              trimFilter=trimFilter, trimSize=trimSize)
  theOutput = c(theOutput, newShiftInfo, call)
  return(theOutput)
}

#------------------------------------------------------------------------
# takes the raw treatment data (bed) and estimates the tag distribution
# around the top peaks; chr is the chromosome name,
# tagWinSize determines how far to look in each direction
#
# Output is data frame with tagWinSize rows and columns "plus" and "minus"
tagDistrEst = function(bed, chr, theTopPeaks, tagWinSize) {
  # Extract relevant chromosome
  bed = bed[bed$chr == chr,]
  loc = theTopPeaks$pos
  # half the window size
  win.half = floor(tagWinSize/2)
  # get all indices for each peak +/- win.half
  strands = splitit(bed$strand)
  locIndex = vector("list", length(loc))
  for(j in seq(along=locIndex)) locIndex[[j]] = (loc[j] - win.half):(loc[j] + win.half)
  cat("- strand\n")
  negPos = bed$end[strands[["-"]]]
  minusCount = lapply(locIndex, function(x) which(x %in% negPos))
  minusWin = rep(0, tagWinSize)
  minusTab = table(unlist(minusCount))
  minusWin[as.integer(names(minusTab))] = minusTab
  cat("+ strand\n")
  posPos = bed$start[strands[["+"]]]
  plusCount = lapply(locIndex, function(x) which(x %in% posPos))
  plusWin = rep(0, tagWinSize)
  plusTab = table(unlist(plusCount))
  plusWin[as.integer(names(plusTab))] = plusTab
  countsOut = data.frame(cbind(plusWin, minusWin))
  names(countsOut) = c("plus","minus")
  return(countsOut)
}
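#------------------------------------------------------------------------
# Example call (assumes rawData from readData(); wrapped in if(FALSE)):
if(FALSE) {
  alignParams = shiftPeakEst(rawData, chr = "chr1", firstShift = 100,
                             firstKernel = "Gaussian", kernelBand = 50)
  alignParams$shift                    # estimated strand shift
  plot(alignParams$kernel, type = "l") # the data-driven smoothing kernel
}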
#------------------------------------------------------------------------
# takes the object returned by tagDistrEst and returns
# the optimal shift and the new unimodal filter
#
# Output is a list with names "shift" (a scalar), "kernel" (trimmed,
# normalized), "filter" (trimmed) and "filter.untrimmed"
estShift = function(tags, knots, trimFilter=TRUE, trimSize) {
  # Estimate the shift from the highest peaks: spline-smooth each strand's
  # tag distribution and take half the distance between the two modes
  n = nrow(tags)
  X = bs(1:n, knots=knots) # Setup for spline smoothing
  tags.smooth = tags
  tags.smooth$plus = lm(tags$plus ~ X - 1)$fitted.values
  tags.smooth$minus = lm(tags$minus ~ X - 1)$fitted.values
  shift = round(abs(which.max(tags.smooth$plus) -
                    which.max(tags.smooth$minus))/2)
  # Obtain filter: shift the strands onto each other, average,
  # symmetrize, and spline-smooth
  shifted = (c(rep(0,shift), tags$plus[1:(n-shift)]) +
             c(tags$minus[(1+shift):n], rep(0,shift))) / 2
  shifted = (shifted + shifted[seq(n,1,-1)])/2
  newFilter = lm(shifted ~ X - 1)$fitted.values
  # trim and multiply by quartic kernel shape
  if(trimFilter) {
    L = (trimSize-1)/2
    x = -L:L
    quartic = (1 - (x/L)^2)^2
    trimmed = newFilter[x + (n+1)/2] * quartic
  } else trimmed = newFilter
  out = vector("list", 4)
  out[[1]] = shift
  out[[2]] = trimmed/sum(trimmed)
  out[[3]] = trimmed
  out[[4]] = newFilter
  names(out) = c("shift","kernel","filter","filter.untrimmed")
  return(out)
}

########################################################################
# Preprocessing: global regression and MonteCarlo simulations
####################

#------------------------------------------------------------------------
# regression: takes an 'aligned' object and fits the linear regression
# of binned treatment counts on binned control counts
#
# Output is a list with slots:
# 1. binned treatment data
# 2. binned control data
# 3. binned treatment variance
# 4. binned control variance
# 5. regression coefficients
# 6. fitted values
# 7. fitted errors
# 8. fitted error variance
# 9. covariance matrix of coefficients
# 10. estimated background range
# 11-. the parameters used when calling the function
globalRegr = function(aligned, binSizes = 10^(3:4), quantile.rng = c(.5,1),
                      var = FALSE, binSkip = 1, chr = "ALL") {
  cat("Binning the data.\n")
  y.all = data.frame()
  X.all = data.frame()
  Vy.all = data.frame()
  VX.all = data.frame()
  if(chr[1] == "ALL") chrNames = unique(aligned[["Treat"]]$chr) else chrNames = chr
  for(i in seq(along = chrNames)) {
    # current chromosome
    chr = chrNames[i]
    cat(chr)
    # Fill aligned data with zeros up to the last observed position
    # (chrSeq = Hsapiens[[chr]]; L = length(chrSeq) would use the full
    # chromosome length instead)
    indTrt = which(aligned[["Treat"]]$chr == chr)
    indCtrl = which(aligned[["Control"]]$chr == chr)
    L = max(aligned[["Treat"]]$pos[indTrt], aligned[["Control"]]$pos[indCtrl])
    fullTrt = fullCtrl = rep(0, L)
    fullTrt[aligned[["Treat"]]$pos[indTrt]] = aligned[["Treat"]]$count[indTrt]
    fullCtrl[aligned[["Control"]]$pos[indCtrl]] = aligned[["Control"]]$count[indCtrl]
    cat(".")
    # Smallest bin: number of 1k bins that fit in whole 100k bins
    n = floor(L / binSizes[length(binSizes)]) * binSizes[length(binSizes)] / binSizes[1]
    binTrt = bin(fullTrt, binSizes[1], var=var)[1:n,]   # Response variable (38 sec.)
    binCtrl = bin(fullCtrl, binSizes[1], var=var)[1:n,] # Predictors
    # Response and predictors
    y = data.frame(chr = chr, y = binTrt[,1])
    X = data.frame(chr = chr, matrix(0, n, length(binSizes)))
    Vy = data.frame(chr = chr, var = binTrt[,2])
    VX = data.frame(chr = chr, var = binCtrl[,2])
    X[,2] = binCtrl[,1]
    # coarser control bins, repeated so all predictors have the same length
    if(length(binSizes) > 1) for(j in 2:length(binSizes)) {
      x = bin(X[,j], binSizes[j]/binSizes[1])[,1]
      X[,j+1] = rep(x, each = binSizes[j]/binSizes[1])
    }
    cat(".")
    # Accumulate results
    y.all = rbind(y.all, y)
    X.all = rbind(X.all, X)
    Vy.all = rbind(Vy.all, Vy)
    VX.all = rbind(VX.all, VX)
    cat(".")
  }
  # Skip bins
  index = seq(1, nrow(X.all), binSkip)
  y.all = y.all[index,]
  X.all = X.all[index,]
  Vy.all = Vy.all[index,]
  VX.all = VX.all[index,]
  # Linear multiple regression for the mean
  cat("Performing regression.")
  yy = y.all[,"y"]
  XX = as.matrix(X.all[,2:ncol(X.all)])
  lm1 = lm(yy ~ XX - 1) # least squares through the origin
  coefs = lm1$coefficients
  y.hat = lm1$fitted.values
  e = yy - y.hat
  s2 = sum(e^2) / (length(yy) - ncol(XX))
  coefs.cov = s2 * solve(t(XX) %*% XX)
  names(coefs) = colnames(XX)
  # Background range
  rng = quantile(y.hat, quantile.rng)
  # Output
  cat("\n")
  out = list(binTrt = y.all, binCtrl = X.all, binTrtVar = Vy.all,
             binCtrlVar = VX.all, coefs = coefs, y.hat = y.hat, error = e,
             s2 = s2, coefs.cov = coefs.cov, bkgr.rng = rng)
  call = list(binSizes=binSizes, var=var, binSkip=binSkip)
  out = c(out, call)
  return(out)
}

#----------------------------------------------------------------------
# Auxiliary function for globalRegr
# Takes a full vector of counts and returns mean and variance in bins
bin = function(x, binSize, var=FALSE) {
  L = floor(length(x) / binSize)
  y = matrix(x[1:(L*binSize)], binSize, L) # one column per bin
  y.mean = colMeans(y)
  y.var = NaN
  if(var==TRUE) y.var = (colMeans(y^2) - y.mean^2) * binSize / (binSize-1)
  return(data.frame(mean = y.mean, var = y.var))
}
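#----------------------------------------------------------------------
# Example (simulated counts, wrapped in if(FALSE)):
if(FALSE) {
  x = rpois(1e4, 0.01) # sparse base-pair counts
  b = bin(x, binSize = 1000, var = TRUE)
  head(b)              # per-bin mean and variance
}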
#------------------------------------------------------------------------
# Computes the distribution of heights of local maxima for various values
# of the Poisson rate parameter lambda via Monte Carlo simulation.
# Returns a look-up table of p-values as a function of threshold, each
# column corresponding to a different value of lambda.
#
# Output is a list with slots:
# 1. thresholds
# 2. lambda values
# 3. p-value look-up table (thresholds x lambda values)
# 4-. the parameters used when calling the function
lambdaSimul = function(kernel, lambda.rng = c(1e-5, 1e-1), lambda.res = 300,
                       counts = 100, min.simul = 1e5, thresh.res = 200,
                       lambda.df = 5, tol = 0.25, trunc = TRUE) {
  cat("Estimating height distributions.")
  # Get range of potential values of lambda
  lambda.rng = lambda.rng * c(1-tol, 1+tol)
  lambda = 10^seq(log10(lambda.rng[1]), log10(lambda.rng[2]), len=lambda.res)
  N = pmax(counts/lambda, min.simul)
  # Simulate Poisson sequences, smooth, get local maxima
  locmax.distr = vector('list', length(lambda))
  for(j in 1:length(lambda)) {
    if(j %% 20 == 0) cat(".")
    x = rpois(N[j], lambda[j])
    if(trunc) x = pmin(x, 2) # truncate counts, mimicking duplicate removal
    # myfilter is okay here because of the tails
    y = myfilter(x, kernel)
    if(any(y > 0)) {
      pks = findMax(y)
      locmax.distr[[j]] = ecdf(pks$height)
    } else locmax.distr[[j]] = ecdf(max(kernel))
  }
  # Smooth height distributions across lambda for better accuracy;
  # the threshold grid runs up to the largest simulated peak height
  # (pks is left over from the last, i.e. largest, lambda)
  u = 10^seq(log10(max(kernel)), log10(max(pks$height)), len=thresh.res)
  p.hat = p = matrix(0, length(u), length(lambda))
  for(k in 1:length(lambda)) p[,k] = 1 - locmax.distr[[k]](u)
  lambda.spline = bs(lambda, df=lambda.df)
  p.hat[1,] = p[1,]
  for(j in 2:length(u)) {
    coefs = nonneg.ls(p[j,], lambda.spline)
    p.hat[j,] = lambda.spline %*% coefs
  }
  p.hat = pmin(pmax(p.hat, 0), 1)
  cat("\n")
  out = vector("list", 3)
  out[[1]] = u
  out[[2]] = lambda
  out[[3]] = p.hat
  names(out) = c("thresholds","lambda","pvalTable")
  call = list(lambda.rng = lambda.rng, lambda.res = lambda.res, counts = counts,
              min.simul = min.simul, thresh.res = thresh.res, lambda.df = lambda.df)
  out = c(out, call)
  return(out)
}

#----------------------------------------------------------------------
# Auxiliary function for lambdaSimul
# Performs non-negative least squares; finds the coefficients.
# nlminb is much faster than optim
nonneg.ls = function(y, X) {
  fn <- function(par) sum((y - X %*% par)^2)
  # coefs = optim(rep(0,ncol(X)), fn, lower=rep(0,ncol(X)), method="L-BFGS-B")$par # 83 sec. for df=10
  coefs = nlminb(rep(0,ncol(X)), fn, lower=rep(0,ncol(X)))$par # 6 sec. for df=5
  return(coefs)
}
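#----------------------------------------------------------------------
# Example (simulated spline fit, wrapped in if(FALSE); needs the splines
# package for bs()):
if(FALSE) {
  X = bs(seq(0, 1, len = 100), df = 5)
  y = X %*% c(0, 2, 1, 0, 3) + rnorm(100, sd = 0.05)
  nonneg.ls(y, X) # coefficients constrained >= 0, near c(0, 2, 1, 0, 3)
}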
########################################################################
# Analysis Functions
####################

# Performs smoothing and finds all the peaks in the original data for
# all chromosomes
# readGroupSize can be the same as was used in preprocessing
peakFinder = function(trt, alignParams, chr = "ALL", speed = FALSE,
                      readGroupSize = NULL) {
  cat("Smoothing.\n")
  theFilter = alignParams[["kernel"]]
  if(is.null(readGroupSize)) readGroupSize = alignParams$readGroupSize
  if(chr[1] != "ALL") trt = trt[which(trt$chr %in% chr),]
  # One chromosome at a time
  trtChrIndexes = splitit(trt$chr)
  chrNames = names(trtChrIndexes)
  thePeakList = data.frame()
  cat("Current chromosome:\n")
  for(i in seq(along = trtChrIndexes)) {
    chr = chrNames[i]
    cat(paste(chr, "\n", sep=""))
    index = trtChrIndexes[[i]]
    peaks = smoothSpeed(trt[index,], chr = chr, theFilter = theFilter,
                        readGroupSize = readGroupSize, speed = speed) # 20 min. for chr 1
    thePeakList = rbind(thePeakList, peaks)
  }
  cat("\n")
  return(thePeakList)
}

#----------------------------------------------------------------------
# For each peak found by peakFinder(), computes local averages from Control
# Control is of "aligned" class
# Output is the set of predictors for estimating the background lambda
bkgrPrep = function(Peaks, Control, regParams, chr="ALL") {
  cat("Computing local averages from Control\n")
  if(chr[1] != "ALL") Peaks = Peaks[which(Peaks$chr %in% chr),]
  chrIndexes = splitit(Peaks$chr)
  chrNames = names(chrIndexes)
  X.all = NULL
  for(i in seq(along = chrNames)) {
    chr = chrNames[i]
    cat(chr)
    peaks = Peaks[chrIndexes[[i]],]
    chrIndexes.ctrl = which(Control$chr == chr)
    controlAlign = Control[chrIndexes.ctrl,]
    # Control ranges: one width-1 range per aligned control position
    controlRanges = GRanges(seqnames = Rle(rep(chrNames[i], nrow(controlAlign))),
                            ranges = IRanges(start = controlAlign$pos,
                                             end = controlAlign$pos),
                            score = controlAlign$count,
                            strand = Rle(strand(rep("+", nrow(controlAlign)))))
    # Set up predictors: binned control
    binSizes = regParams[["binSizes"]]
    X = matrix(0, nrow = nrow(peaks), ncol = length(binSizes))
    for(j in 1:length(binSizes)) {
      cat(".")
      # window of binSizes[j]+1 bases centered on each peak
      ranges = GRanges(seqnames = Rle(peaks$chr),
                       ranges = IRanges(start = peaks$pos - binSizes[j]/2,
                                        end = peaks$pos + binSizes[j]/2),
                       strand = Rle(strand(rep("+", nrow(peaks)))))
      test = findOverlaps(ranges, controlRanges)
      test = as.matrix(test)
      Indexes = splitit(test[,1])
      counts = lapply(Indexes, function(x) sum(controlAlign$count[test[x,2]]))
      counts = unlist(counts)/(binSizes[j]+1)
      X[as.integer(names(Indexes)), j] = counts
    }
    # Put predictors together
    X.all = rbind(X.all, X)
  }
  cat("\n")
  return(X.all)
}
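#----------------------------------------------------------------------
# Example calls (assume aligned data, alignParams from shiftPeakEst() and
# regParams from globalRegr(); wrapped in if(FALSE)):
if(FALSE) {
  allPeaks = peakFinder(aligned[["Treat"]], alignParams, chr = "chr1")
  head(allPeaks) # chr / pos / height for every local maximum
  predictors = bkgrPrep(allPeaks, aligned[["Control"]], regParams, chr = "chr1")
}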
signifSeq = function(peaks, predictors, regParams, distrParams, chr="ALL") {
  cat("Computing p-values.")
  thresholds = distrParams[["thresholds"]]
  lambdaTable = distrParams[["lambda"]]
  pvalTable = distrParams[["pvalTable"]] # thresholds x lambda
  # Estimate Poisson background rate for all peaks
  lambda.local = as.vector(predictors %*% regParams$coefs)
  peaks$lambda.local = lambda.local
  peaks$lambda = pmax(lambda.local, regParams$bkgr.rng[1])
  peaks$SNR = peaks$height / peaks$lambda
  # Get p-values by interpolation of the look-up table
  pval = rep(1, length(peaks$lambda))
  lambda = pmax(pmin(peaks$lambda, max(lambdaTable)), min(lambdaTable))
  index = which(peaks$height > min(thresholds))
  for(j in index) {
    if(j %% 100000 == 0) cat(".")
    pval[j] = lut2(thresholds, lambdaTable, pvalTable, peaks$height[j], lambda[j])
  }
  peaks$pvalue = pmax(pmin(pval, 1), 0)
  # Q-values and ordering
  peaks$qvalue = p.adjust(peaks$pvalue, method = "BH")
  cat("\n")
  return(peaks)
}

#----------------------------------------------------------------------
# Auxiliary function: bilinear interpolation in a look-up table
# x, y are vectors
# z is a table of size length(x) by length(y)
# xout, yout are the values to interpolate at
lut2 <- function(x, y, z, xout, yout) {
  # get the two closest x and y in the look-up table
  d = sort(abs(xout - x), index.return=T)
  i1 = d$ix[1]; i2 = d$ix[2]
  d = sort(abs(yout - y), index.return=T)
  j1 = d$ix[1]; j2 = d$ix[2]
  # bilinear interpolation
  rx = (xout - x[i1]) / (x[i2] - x[i1])
  ry = (yout - y[j1]) / (y[j2] - y[j1])
  zout = (1-rx)*(1-ry)*z[i1,j1] + (1-rx)*ry*z[i1,j2] +
         rx*(1-ry)*z[i2,j1] + rx*ry*z[i2,j2]
  return(zout)
}

########################################################################
# Wrapper for everything
####################

seqAnalysis = function(dataDir, treat = "Treatment_tags.bed",
                       input = "Input_tags.bed", saveIt = TRUE) {
  rawData = readData(data.path = dataDir, treat = treat, input = input) # 92 sec.
  # Estimate shift and peak shape
  alignParams = shiftPeakEst(rawData, chr = "chr1", trimSize = 401) # 21 min. for chr1 (speed OFF)
  if(saveIt) save(alignParams, file = "alignParams.rda", compress=T)
  # Perform alignment
  aligned = list(Treat = NULL, Control = NULL)
  aligned[["Treat"]] = alignSeq(rawData[["Treat"]], shift = alignParams$shift)
  aligned[["Control"]] = alignSeq(rawData[["Control"]], shift = alignParams$shift)
  if(saveIt) save(aligned, file = "aligned.rda", compress=T)
  # Find peaks
  allPeaks = peakFinder(aligned[["Treat"]], alignParams, chr = "ALL")
  if(saveIt) save(allPeaks, file = "allPeaks.rda", compress=T)
  # Global regression
  regParams = globalRegr(aligned, var=TRUE) # ~8 min.
  if(saveIt) save(regParams, file = "regParams.rda", compress=T)
  # MonteCarlo simulations
  distrParams = lambdaSimul(kernel = alignParams$kernel,
                            lambda.rng = regParams$bkgr.rng)
  # 8 min. for N=1e5, lambda.res=100; 40 min. for N=3e5, lambda.res=500
  if(saveIt) save(distrParams, file = "distrParams.rda", compress=T)
  # Prepare background
  predictors = bkgrPrep(allPeaks, aligned$Control, regParams) # ~60 min.
  if(saveIt) save(predictors, file = "predictors.rda", compress=T)
  # Multiple testing
  finalPeaks = signifSeq(allPeaks, predictors, regParams, distrParams) # ~30 min.
  if(saveIt) save(finalPeaks, file = "finalPeaks.rda", compress=T)
  # Ordering and significance
  topPeaks = finalPeaks[order(finalPeaks$SNR, decreasing = T),]
  topPeaks = topPeaks[order(topPeaks$pvalue),]
  signifPeaks = topPeaks[topPeaks$qvalue < 0.01,]
  if(saveIt) save(signifPeaks, file = "signifPeaks.rda", compress=T)
  # Motif analysis
  motifObs = motifDist(signifPeaks, dist = c(25, 100, 400))
  if(saveIt) save(motifObs, file = "motifObs.rda")
  return(signifPeaks)
}
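#----------------------------------------------------------------------
# Example top-level call (hypothetical directory, wrapped in if(FALSE)):
if(FALSE) {
  initialize()
  signifPeaks = seqAnalysis(dataDir = "./data",
                            treat = "Treatment_tags.bed",
                            input = "Input_tags.bed", saveIt = TRUE)
  head(signifPeaks) # peaks with q-value < 0.01, ordered by p-value
}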
########################################################################
# Evaluation Functions
####################

# Motif analysis for various distances from the binding site
motifDist = function(peaks, dist = 100, type = "FOXA1") {
  # motif counts from JASPAR - http://jaspar.genereg.net/cgi-bin/jaspar_db.pl
  # (A, C, G, T are local count vectors, one entry per motif position)
  if(type == "FOXA1") {
    A = c( 13, 108,   1,   5,   6, 476,   6, 291,  45, 378,  90)
    C = c(  2,   7,   7,   7,   3,  12, 807, 106, 368,   9, 125)
    G = c(  2, 770,   5,  24,  78, 368,   5,   5,  69,  24, 466)
    T = c(875,   8, 882, 860, 809,  40,  79, 495, 414, 482, 211)
  }
  if(type == "GABP") {
    A = c(100,   0,  75,    0,    0, 1800, 1800,    0, 1,    0,   0, 250)
    C = c(100, 600, 300,    0,    0,    0,    0,    0, 1,    0, 150,   0)
    G = c(100, 300,  50, 1800, 1800,    0,    0, 1800, 1, 1100,  50, 200)
    T = c(  0,   0,   1,    0,    0,    0,    0,    0, 1,  100,  50,  50)
  }
  # # Another possible GABP motif
  # if(type == "GABP") {
  #   A = c(75,  50, 250,    0,    0, 1800, 1800, 100,  1, 25,  1)
  #   C = c( 0, 350, 500,    0,    0,    0,    0,   0,  1,  1,  1)
  #   G = c(75, 100,   0, 1800, 1800,    0,    0, 800,  1, 75, 75)
  #   T = c( 0,   0,   0,    0,    0,    0,    0,   0, 75,  1,  1)
  # }
  motif = rbind(A, C, G, T)
  # probability of each base at each position
  motif = apply(motif, 2, function(x) x/sum(x))
  motifObs = data.frame(matrix(0, nrow=nrow(peaks), ncol=length(dist)))
  names(motifObs) = dist
  for(i in 1:length(dist)) {
    cat(dist[i], "\n")
    motifObs[,i] = checkMotif(motif, peaks, dist = dist[i])
    cat("\n")
  }
  return(motifObs)
}

#----------------------------------------------------------------------
# motif finding
# motif is a 4 x N PWM (rows A, C, G, T)
checkMotif = function(motif, peaks, dist = 35) {
  chrs = unique(peaks$chr)
  motifCount = rep(0, nrow(peaks))
  cat("finding motif locations.")
  for(i in 1:length(chrs)) {
    cat(".")
    chr = chrs[i]
    # subset data
    peaks.chr = peaks[peaks$chr %in% chr,]
    mm = motifCount[peaks$chr %in% chr]
    tmpI = IRanges(start=peaks.chr$pos, end=peaks.chr$pos)
    # look for all motifs on the current chromosome
    # theSeq = Hsapiens[[chrs[i]]]
    theSeq = Hsapiens[[toString(chr)]]
    hits = matchPWM(motif, theSeq)
    # count motif hits within dist of each peak
    test = findOverlaps(tmpI, hits, maxgap = dist)
    test = as.matrix(test)
    tab = table(test[,1])
    mm[as.integer(names(tab))] = tab
    # replace
    motifCount[peaks$chr %in% chr] = mm
  }
  cat("\n")
  return(motifCount)
}

#----------------------------------------------------------------------
# Find matches between two sets of results
# object1 and object2 are data frames whose first column is chr;
# col1/col2 give the position column in each (default 2)
pointMatch = function(object1, object2, col1=2, col2=2, verbose=TRUE){
  Indexes1=split(seq(along=as.character(object1$chr)), as.character(object1$chr))
  Indexes2=split(seq(along=as.character(object2$chr)), as.character(object2$chr))
  res=matrix(NA, nrow=nrow(object1), ncol=2)
  for(i in seq(along=Indexes1)){
    CHR=names(Indexes1)[i]
    if(verbose) cat(CHR, ",")
    Index1=Indexes1[[CHR]]
    Index2=Indexes2[[CHR]]
    if(length(Index1)>0 & length(Index2)>0){
      tmpmat1=object1[Index1, col1]
      tmpmat2=object2[Index2, col2]
      tmp=fuzzy.match(tmpmat1, tmpmat2)
      tmp[,2] = Index2[tmp[,2]]
      res[Index1,]=tmp
    }
  }
  cat("\n")
  colnames(res) <- c("dist","matchIndex")
  return(res)
}
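#----------------------------------------------------------------------
# Example (hypothetical peak lists with columns chr and pos; if(FALSE)):
if(FALSE) {
  res = pointMatch(peaks1, peaks2)
  summary(abs(res[,"dist"])) # distance from each peaks1 row to its match
  # fuzzy.match() (defined below) does the per-chromosome work:
  fuzzy.match(c(10, 25, 70), c(12, 30, 60))
  # expected rows (signed dist, index of nearest y): (2,1), (5,2), (-10,3);
  # negative distance means the nearest y lies below x
}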
#----------------------------------------------------------------------
# For each x, find the nearest element of y.
# Returns a 2-column matrix: signed distance (positive if the nearest y
# is above x, negative if below) and the index of that y.
fuzzy.match = function(x, y, x.sorted = FALSE, y.sorted = FALSE) {
  l.x = length(x)
  # space for results
  r = array(0, c(l.x, 2))
  if (! x.sorted) {
    # sort x
    perm = order(x)
    X = x[perm]
  }
  l.y = length(y)
  if (! y.sorted) {
    Y = array(0, 1+l.y)
    # sort y, with a sentinel +Inf at the end
    perm.y = order(y)
    Y[1:l.y] = y[perm.y]
    Y[1+l.y] = +Inf
  } else y[1+l.y] = +Inf
  # set up
  v = -Inf
  j = 1
  if (y.sorted) u = y[j] else u = Y[j]
  # run over X, or x if x.sorted
  for (i in 1:l.x) {
    if (x.sorted) z = x[i] else z = X[i]
    e = z - v
    d = u - z
    while (d < 0) {
      v = u
      e = z - v
      j = j + 1
      if (y.sorted) u = y[j] else u = Y[j]
      d = u - z
    }
    # result
    if (x.sorted) k = i else k = perm[i]
    if (y.sorted) r[k,2] = j else r[k,2] = perm.y[j]
    if (d == 0) r[k,1] = 0
    else if (e < d) {
      r[k,1] = -e
      if (y.sorted) r[k,2] = j-1 else r[k,2] = perm.y[j-1]
    }
    else # e >= d
      r[k,1] = d
  }
  r
}

# as fuzzy.match, but z is an Nx2 matrix whose rows are separate intervals
# which play the role of the y-elements; distance is 0 inside an interval
fuzzy.match2 = function(x, z, x.sorted = FALSE, z.sorted = FALSE) {
  if (z.sorted) y = t(z)
  else {
    perm = order(z[,1])
    y = t(z[perm,])
  }
  f = fuzzy.match(x, y, x.sorted=x.sorted, y.sorted=TRUE)
  for (i in 1:length(x)) {
    d = f[i,1] # distance from as.vector(y)
    j = f[i,2] # nearest index of as.vector(y)
    jmod2 = j %% 2
    if (z.sorted) f[i,2] = (j+jmod2)/2 else f[i,2] = perm[(j+jmod2)/2]
    if ((d>0 && jmod2==0) || (d<0 && jmod2==1)) f[i,1] = 0
  }
  f
}

########################################################################
# Graphics
####################

#----------------------------------------------------------------------
# Show a data segment of length L around position loc on chromosome chr
seqPlot <- function(peaks, aligned, kernel, chr, loc, L=10001, seg=NULL,
                    alpha = 0.05) {
  if(is.null(seg)) seg = c(loc-(L-1)/2, loc+(L-1)/2)
  segDat = data.frame(t = seg[1]:seg[2], ctrl = rep(0, L), trt = rep(0, L),
                      smooth = rep(0, L), height = rep(NaN, L),
                      lambda = rep(NaN, L), SNR = rep(NaN, L),
                      pvalue = rep(NaN, L), qvalue = rep(NaN, L),
                      sig = rep(NaN, L), nonsig = rep(NaN, L))
  # Control
  i = which((aligned$Control$chr == chr) & (aligned$Control$pos > seg[1]) &
            (aligned$Control$pos < seg[2]))
  segDat$ctrl[aligned$Control$pos[i] - loc + (L-1)/2] = aligned$Control$count[i]
  # Treat
  i = which((aligned$Treat$chr == chr) & (aligned$Treat$pos > seg[1]) &
            (aligned$Treat$pos < seg[2]))
  segDat$trt[aligned$Treat$pos[i] - loc + (L-1)/2] = aligned$Treat$count[i]
  segDat$smooth = myfilter(segDat$trt, kernel)
  # Peaks and background
  i = which((peaks$chr == chr) & (peaks$pos > seg[1]) & (peaks$pos < seg[2]))
  segDat[peaks$pos[i] - loc + (L-1)/2, 5:9] = cbind(peaks$height[i],
                                                    peaks$lambda[i],
                                                    peaks$SNR[i],
                                                    peaks$pvalue[i],
                                                    peaks$qvalue[i])
  segDat[peaks$pos[i] - loc + (L-1)/2, 10] = ifelse(peaks$qvalue[i] < alpha,
                                                    peaks$height[i], NaN)
  segDat[peaks$pos[i] - loc + (L-1)/2, 11] = ifelse(peaks$qvalue[i] > alpha,
                                                    peaks$height[i], NaN)
  # Plotting
  par(mfrow=c(6,1), cex.axis=1.5, cex.lab=1.5, cex.main=1.5)
  xlab = paste('location (', chr, ')', sep='')
  plot(segDat$t, segDat$ctrl, type='l', main='Control', xlab=xlab, ylab='',
       ylim=c(0,2))
  plot(segDat$t, segDat$trt, type='l', main='Treatment (IP)', xlab=xlab,
       ylab='', ylim=c(0,2))
  plot(segDat$t, 100*segDat$smooth, type='l', main='Smoothed IP', xlab=xlab,
       ylab='x 1e-2')
  points(segDat$t, 100*segDat$sig, col='red')
  points(segDat$t, 100*segDat$nonsig, col='blue')
  plot(segDat$t, 1000*segDat$lambda, main='Background rate lambda', xlab=xlab,
       ylab='x 1e-3')
  plot(segDat$t, log10(segDat$SNR), main='log10(SNR)', xlab=xlab, ylab='')
  plot(segDat$t, log10(segDat$pvalue+1e-10), main='log10(p-value)', xlab=xlab,
       ylab='')
}
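#----------------------------------------------------------------------
# Example (assumes the objects saved by seqAnalysis() are loaded;
# wrapped in if(FALSE)):
if(FALSE) {
  seqPlot(finalPeaks, aligned, alignParams$kernel,
          chr = "chr1", loc = 1e6, L = 10001)
}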
#----------------------------------------------------------------------
# Compare motif analysis results for two peak sets
# *** Fix function to handle only one, or more than two, sets of motifs
motifPlot2 = function(motifObs, motifObsCis, label1 = "STEM+Regr",
                      label2 = NULL, binSize = 500) {
  library(RColorBrewer)
  mypar <- function(a=1, b=1, brewer.n=8, brewer.name="Dark2", ...){
    par(mar=c(2.5,2.5,1.6,1.1), mgp=c(1.5,.5,0))
    par(mfrow=c(a,b), ...)
    palette(brewer.pal(brewer.n, brewer.name))
    par(cex.axis=1.5, cex.lab=1.5, cex.main=1.5)
  }
  mypar()
  bins = cut(1:nrow(motifObs), breaks=seq(1, nrow(motifObs), by=binSize),
             labels = F)
  binIndex = splitit(bins)
  plot(0~0, type = "n", ylim = c(0,1), xlim = c(1, max(bins, na.rm=T)),
       xlab = paste("Ranked by", binSize),
       ylab = "Proportion with Motif Nearby"
       #, main = paste("Motifs within", dist, "base pairs")
       )
  for(i in seq(along=motifObs)) {
    tmp = motifObs[,i]; tmp[tmp>1] = 1 # presence/absence
    tmpcis = motifObsCis[,i]; tmpcis[tmpcis>1] = 1
    yPoint = matrix(nrow = max(bins, na.rm=T), ncol = 2)
    colnames(yPoint) = c("no","yes")
    yPointCis = matrix(nrow = floor(length(tmpcis)/binSize), ncol = 2)
    colnames(yPointCis) = colnames(yPoint)
    for(j in seq(along=binIndex)) {
      Index = binIndex[[j]]
      tmp2 = table(tmp[Index]); tmp2 = tmp2/sum(tmp2)
      x = as.numeric(names(tmp2)); y = as.numeric(tmp2)
      yPoint[j,] = y
      points(y[2] ~ j, col = 1, pch = 19, cex = 1.3)
      if(max(Index) < length(tmpcis)) {
        tmp2 = table(tmpcis[Index]); tmp2 = tmp2/sum(tmp2)
        x = as.numeric(names(tmp2)); y = as.numeric(tmp2)
        yPointCis[j,] = y
        points(y[2] ~ j, col = 2, pch = 19, cex = 1.3)
      }
    }
    legend("topright", c(label1, label2), pch = 19, pt.cex = 2, col = 1:2,
           cex=1.5)
    lines(lowess(yPoint[is.finite(yPoint[,1]),2], f = 0.6), lwd = 5, col = 1)
    lines(lowess(yPointCis[is.finite(yPointCis[,1]),2], f = 0.6), lwd = 5, col = 2)
  }
}
#----------------------------------------------------------------------
# Compare motif analysis results for three peak sets
motifPlot3 = function(motifObs, motifObs2, motifObs3, label1 = "STEM+Regr",
                      label2 = NULL, label3, binSize = 500) {
  library(RColorBrewer)
  mypar <- function(a=1, b=1, brewer.n=8, brewer.name="Dark2", ...){
    par(mar=c(2.5,2.5,1.6,1.1), mgp=c(1.5,.5,0))
    par(mfrow=c(a,b), ...)
    palette(brewer.pal(brewer.n, brewer.name))
    par(cex.axis=1.5, cex.lab=1.5, cex.main=1.5)
  }
  mypar()
  bins = cut(1:nrow(motifObs), breaks=seq(1, nrow(motifObs), by=binSize),
             labels = F)
  binIndex = splitit(bins)
  plot(0~0, type = "n", ylim = c(0,1), xlim = c(1, max(bins, na.rm=T)),
       xlab = paste("Ranked by", binSize),
       ylab = "Proportion with Motif Nearby"
       #, main = paste("Motifs within", dist, "base pairs")
       )
  for(i in seq(along=motifObs)) {
    count1 = motifObs[,i];  count1[count1>1] = 1
    count2 = motifObs2[,i]; count2[count2>1] = 1
    count3 = motifObs3[,i]; count3[count3>1] = 1
    yPoint = matrix(nrow = max(bins, na.rm=T), ncol = 2)
    colnames(yPoint) = c("no","yes")
    yPoint2 = matrix(nrow = floor(length(count2)/binSize), ncol = 2)
    colnames(yPoint2) = colnames(yPoint)
    yPoint3 = matrix(nrow = floor(length(count3)/binSize), ncol = 2)
    colnames(yPoint3) = colnames(yPoint)
    for(j in seq(along=binIndex)) {
      Index = binIndex[[j]]
      tmp2 = table(count1[Index]); tmp2 = tmp2/sum(tmp2)
      x = as.numeric(names(tmp2)); y = as.numeric(tmp2)
      yPoint[j,] = y
      points(y[2] ~ j, col = 1, pch = 19, cex = 1.3)
      if(max(Index) < length(count2)) {
        tmp2 = table(count2[Index]); tmp2 = tmp2/sum(tmp2)
        x = as.numeric(names(tmp2)); y = as.numeric(tmp2)
        yPoint2[j,] = y
        points(y[2] ~ j, col = 2, pch = 19, cex = 1.3)
      }
      if(max(Index) < length(count3)) {
        tmp2 = table(count3[Index]); tmp2 = tmp2/sum(tmp2)
        x = as.numeric(names(tmp2)); y = as.numeric(tmp2)
        yPoint3[j,] = y
        points(y[2] ~ j, col = 3, pch = 19, cex = 1.3)
      }
    }
    legend("bottomleft", c(label1, label2, label3), pch = 19, pt.cex = 2,
           col = 1:3, cex=1.5)
    lines(lowess(yPoint[is.finite(yPoint[,1]),2],  f = 0.3), lwd = 5, col = 1)
    lines(lowess(yPoint2[is.finite(yPoint2[,1]),2], f = 0.3), lwd = 5, col = 2)
    lines(lowess(yPoint3[is.finite(yPoint3[,1]),2], f = 0.3), lwd = 5, col = 3)
  }
}
#----------------------------------------------------------------------
# Compare motif analysis results for four peak sets
motifPlot4 = function(motifObs, motifObs2, motifObs3, motifObs4,
                      label1 = "STEM+Regr", label2 = NULL, label3 = NULL,
                      label4 = NULL, binSize = 500) {
  library(RColorBrewer)
  mypar <- function(a=1, b=1, brewer.n=8, brewer.name="Dark2", ...){
    par(mar=c(2.5,2.5,1.6,1.1), mgp=c(1.5,.5,0))
    par(mfrow=c(a,b), ...)
    palette(brewer.pal(brewer.n, brewer.name))
    par(cex.axis=1.5, cex.lab=1.5, cex.main=1.5)
  }
  mypar()
  bins = cut(1:nrow(motifObs), breaks=seq(1, nrow(motifObs), by=binSize),
             labels = F)
  binIndex = splitit(bins)
  plot(0~0, type = "n", ylim = c(0,1), xlim = c(1, max(bins, na.rm=TRUE)),
       xlab = paste("Ranked by", binSize),
       ylab = "Proportion with Motif Nearby"
       #, main = paste("Motifs within", dist, "base pairs")
       )
  for(i in seq(along=motifObs)) {
    count1 = motifObs[,i];  count1[count1>1] = 1
    count2 = motifObs2[,i]; count2[count2>1] = 1
    count3 = motifObs3[,i]; count3[count3>1] = 1
    count4 = motifObs4[,i]; count4[count4>1] = 1
    yPoint = matrix(nrow = max(bins, na.rm=TRUE), ncol = 2)
    colnames(yPoint) = c("no","yes")
    yPoint2 = matrix(nrow = floor(length(count2)/binSize), ncol = 2)
    colnames(yPoint2) = colnames(yPoint)
    yPoint3 = matrix(nrow = floor(length(count3)/binSize), ncol = 2)
    colnames(yPoint3) = colnames(yPoint)
    yPoint4 = matrix(nrow = floor(length(count4)/binSize), ncol = 2)
    colnames(yPoint4) = colnames(yPoint)
    for(j in seq(along=binIndex)) {
      Index = binIndex[[j]]
      tmp2 = table(count1[Index]); tmp2 = tmp2/sum(tmp2)
      x = as.numeric(names(tmp2)); y = as.numeric(tmp2)
      yPoint[j,] = y
      points(y[2] ~ j, col = 1, pch = 19, cex = 1.3)
      if(max(Index) < length(count2)) {
        tmp2 = table(count2[Index]); tmp2 = tmp2/sum(tmp2)
        x = as.numeric(names(tmp2)); y = as.numeric(tmp2)
        yPoint2[j,] = y
        points(y[2] ~ j, col = 2, pch = 19, cex = 1.3)
      }
      if(max(Index) < length(count3)) {
        tmp2 = table(count3[Index]); tmp2 = tmp2/sum(tmp2)
        x = as.numeric(names(tmp2)); y = as.numeric(tmp2)
        yPoint3[j,] = y
        points(y[2] ~ j, col = 3, pch = 19, cex = 1.3)
      }
      if(max(Index) < length(count4)) {
        tmp2 = table(count4[Index]); tmp2 = tmp2/sum(tmp2)
        x = as.numeric(names(tmp2)); y = as.numeric(tmp2)
        yPoint4[j,] = y
        points(y[2] ~ j, col = 4, pch = 19, cex = 1.3)
      }
    }
    legend("bottomleft", c(label1, label2, label3, label4), pch = 19,
           pt.cex = 2, col = 1:4, cex=1.5)
    lines(lowess(yPoint[is.finite(yPoint[,1]),2],  f = 0.3), lwd = 5, col = 1)
    lines(lowess(yPoint2[is.finite(yPoint2[,1]),2], f = 0.3), lwd = 5, col = 2)
    lines(lowess(yPoint3[is.finite(yPoint3[,1]),2], f = 0.3), lwd = 5, col = 3)
    lines(lowess(yPoint4[is.finite(yPoint4[,1]),2], f = 0.3), lwd = 5, col = 4)
  }
}
#----------------------------------------------------------------------
# As motifPlot3, but with an option for cumulative bins
motifPlotCum = function(motifObs, motifObs2, motifObs3, label1 = "STEM+Regr",
                        label2 = NULL, label3, cum = FALSE, binSize = 500) {
  library(RColorBrewer)
  mypar <- function(a=1, b=1, brewer.n=8, brewer.name="Dark2", ...){
    par(mar=c(2.5,2.5,1.6,1.1), mgp=c(1.5,.5,0))
    par(mfrow=c(a,b), ...)
    palette(brewer.pal(brewer.n, brewer.name))
    par(cex.axis=1.5, cex.lab=1.5, cex.main=1.5)
  }
  mypar()
  bins = cut(1:nrow(motifObs), breaks=seq(1, nrow(motifObs), by=binSize),
             labels = F)
  binIndex = splitit(bins)
  plot(0~0, type = "n", ylim = c(0,1), xlim = c(1, max(bins, na.rm=T)),
       xlab = paste("Ranked by", binSize),
       ylab = "Proportion with Motif Nearby"
       #, main = paste("Motifs within", dist, "base pairs")
       )
  for(i in seq(along=motifObs)) {
    count1 = motifObs[,i];  count1[count1>1] = 1
    count2 = motifObs2[,i]; count2[count2>1] = 1
    count3 = motifObs3[,i]; count3[count3>1] = 1
    yPoint = matrix(nrow = max(bins, na.rm=T), ncol = 2)
    colnames(yPoint) = c("no","yes")
    yPoint2 = matrix(nrow = floor(length(count2)/binSize), ncol = 2)
    colnames(yPoint2) = colnames(yPoint)
    yPoint3 = matrix(nrow = floor(length(count3)/binSize), ncol = 2)
    colnames(yPoint3) = colnames(yPoint)
    for(j in seq(along=binIndex)) {
      # cumulative bins pool all peaks ranked above the current bin
      if(!cum) Index = binIndex[[j]]
      if(cum) Index = unlist(binIndex[1:j])
      tmp2 = table(count1[Index]); tmp2 = tmp2/sum(tmp2)
      x = as.numeric(names(tmp2)); y = as.numeric(tmp2)
      yPoint[j,] = y
      points(y[2] ~ j, col = 1, pch = 19, cex = 1.3)
      if(max(Index) < length(count2)) {
        tmp2 = table(count2[Index]); tmp2 = tmp2/sum(tmp2)
        x = as.numeric(names(tmp2)); y = as.numeric(tmp2)
        yPoint2[j,] = y
        points(y[2] ~ j, col = 2, pch = 19, cex = 1.3)
      }
      if(max(Index) < length(count3)) {
        tmp2 = table(count3[Index]); tmp2 = tmp2/sum(tmp2)
        x = as.numeric(names(tmp2)); y = as.numeric(tmp2)
        yPoint3[j,] = y
        points(y[2] ~ j, col = 3, pch = 19, cex = 1.3)
      }
    }
    legend("bottomleft", c(label1, label2, label3), pch = 19, pt.cex = 2,
           col = 1:3, cex=1.5)
    lines(lowess(yPoint[is.finite(yPoint[,1]),2],  f = 0.3), lwd = 5, col = 1)
    lines(lowess(yPoint2[is.finite(yPoint2[,1]),2], f = 0.3), lwd = 5, col = 2)
    lines(lowess(yPoint3[is.finite(yPoint3[,1]),2], f = 0.3), lwd = 5, col = 3)
  }
}
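#----------------------------------------------------------------------
# Example motif evaluation (assumes signifPeaks from seqAnalysis();
# wrapped in if(FALSE)):
if(FALSE) {
  motifObs = motifDist(signifPeaks, dist = c(25, 100, 400), type = "FOXA1")
  colMeans(motifObs > 0) # fraction of peaks with a motif within each distance
  # to compare against another method's peaks, e.g.:
  # motifPlot2(motifObs, motifObsOther, label1 = "STEM+Regr", label2 = "Other")
}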