######################################################################################## # Calculating dimension of underlying factor models using the asymptotic # conditional SVD (see http://www.biostat.jhsph.edu/~jleek/publications.html) # Copyright (C) 2010 Jeffrey T. Leek (http://www.biostat.jhsph.edu/~jleek/contact.html) # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details, see . # # Inputs: # dat - an m x n matrix of normalized data where n = # of samples, m = # of features # mod - an n x k model matrix (e.g., created with model.matrix) # plot - should the dimensionality plot be displayed # # Output: # est - the estimated dimension of the underlying factor model # ######################################################################################## dimfunc <- function(dat,mod,plot=FALSE){ dat <- as.matrix(dat) dims <- dim(dat) a <- seq(0,2,length=100) b <- sample(1:dims[1]) n <- floor(dims[1]/50) rhat <- matrix(0,nrow=100,ncol=50) P <- (diag(dims[2])-mod %*% solve(t(mod) %*% mod) %*% t(mod)) for(j in 1:50){ dats <- dat[b[1:(j*n)],] ee <- eigen(t(dats) %*% dats) sigbar <- abs(ee$values[dims[2]]/(j*n)) R <- (dats/sqrt(sigbar)) %*% P wm <- (1/(j*n))*t(R) %*% R - P ee <- eigen(wm) v <- c(rep(T, 100), rep(F, dims[2])) v <- v[order(c(a*(j*n)^(-1/3)*dims[2],ee$values), decreasing = T)] u <- 1:length(v) w <- 1:100 rhat[,j] <- rev((u[v==TRUE]-w)) } ss <- rowVars(rhat) peak <- which.max(ss) start <- which.max(c(rep(1e5,peak),ss[(peak+1):100]) < 0.5*(ss[peak] + ss[100])) finish <- which.max(ss*c(rep(0,start),rep(1,100-start)) > 0.75*(ss[peak] + ss[100])) if(finish==1){finish <- 100} est <- modefunc(rhat[start:finish,50]) if(plot){ par(mar=c(5,5,5,5)) plot(a*dims[2]*(j*50)^(-1/3),rhat[,50],type="l",lwd=3,col="blue",xlab="Threshold",ylab="",ylim=c(0,dims[2]),cex.axis=1.5,yaxt="n") lines(a*dims[2]*(j*50)^(-1/3),ss/max(ss)*dims[2],type="l",col="red",lwd=3) axis(4,at=seq(0,dims[2],length=5),labels=round(seq(0,max(ss),length=5),2),cex.axis=1.5,col.axis="red") axis(2,at=seq(0,dims[2],length=5),labels=round(seq(0,dims[2],length=5),2),cex.axis=1.5,col.axis="blue") lines(rep(a[start]*dims[2]*(j*50)^(-1/3),100),seq(0,dims[2],length=100),col="green",lwd=3,lty=2) lines(rep(a[finish]*dims[2]*(j*50)^(-1/3),100),seq(0,dims[2],length=100),col="green",lwd=3,lty=2) lines(a*dims[2]*(j*50)^(-1/3),rep(est,100),lty=2,lwd=2,col="black") mtext("Estimated # of Factors",side=2,line=3,col="blue",cex=1.5) mtext("Variance of Estimate",side=4,line=3,col="red",cex=1.5) legend(a[50]*dims[2]*(j*50)^(-1/3),dims[2],legend=c("2nd Stability Interval",paste("Estimated Value =",est)),col=c("green","black"),lwd=c(3,3),lty=c(2,2),bg="white") } return(est) } ## Borrowed from genefilter rowVars <- function(x, ...) { sqr = function(x) x * x n = rowSums(!is.na(x)) n[n <= 1] = NA return(rowSums(sqr(x - rowMeans(x, ...)), ...)/(n - 1)) } modefunc <- function(x){ return(as.numeric(names(sort(-table(x)))[1])) }