Dimension Reduction

Principal components

Example 1 - multivariate normal data in 10 dimensions

A function to simulate multivariate normal data.

myrmvn <- function(mu,sigma,hm=1,...){
  n=length(mu)
  if(sum((dim(sigma)-rep(n,2))^2)!=0) 
    stop("Check the dimensions of mu and sigma!")
  if(det(sigma)==0) stop("The covariance matrix is singular!")
  a=t(chol(sigma))
  z=matrix(rnorm(n*hm),nrow=n)
  y=t(a%*%z+mu)
  return(y)
}

Simulate data from a multivariate normal distribution.

mu <- rep(0,10)
sigma <- diag(0.2,10)+0.8
sigma[1:8,9:10] <- 0.2
sigma[9:10,1:8] <- 0.2
mu

##  [1] 0 0 0 0 0 0 0 0 0 0

sigma

##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##  [1,]  1.0  0.8  0.8  0.8  0.8  0.8  0.8  0.8  0.2   0.2
##  [2,]  0.8  1.0  0.8  0.8  0.8  0.8  0.8  0.8  0.2   0.2
##  [3,]  0.8  0.8  1.0  0.8  0.8  0.8  0.8  0.8  0.2   0.2
##  [4,]  0.8  0.8  0.8  1.0  0.8  0.8  0.8  0.8  0.2   0.2
##  [5,]  0.8  0.8  0.8  0.8  1.0  0.8  0.8  0.8  0.2   0.2
##  [6,]  0.8  0.8  0.8  0.8  0.8  1.0  0.8  0.8  0.2   0.2
##  [7,]  0.8  0.8  0.8  0.8  0.8  0.8  1.0  0.8  0.2   0.2
##  [8,]  0.8  0.8  0.8  0.8  0.8  0.8  0.8  1.0  0.2   0.2
##  [9,]  0.2  0.2  0.2  0.2  0.2  0.2  0.2  0.2  1.0   0.8
## [10,]  0.2  0.2  0.2  0.2  0.2  0.2  0.2  0.2  0.8   1.0

set.seed(1)
n <- 100
dat <- myrmvn(mu,sigma,hm=n)

pairs(dat)

PCA

dat.pca <- prcomp(dat,retx=TRUE,center=TRUE,scale.=TRUE)
summary(dat.pca)

## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5    PC6     PC7
## Standard deviation     2.5745 1.3361 0.5604 0.50644 0.46858 0.4336 0.42375
## Proportion of Variance 0.6628 0.1785 0.0314 0.02565 0.02196 0.0188 0.01796
## Cumulative Proportion  0.6628 0.8413 0.8727 0.89834 0.92030 0.9391 0.95705
##                            PC8     PC9    PC10
## Standard deviation     0.40214 0.38626 0.34434
## Proportion of Variance 0.01617 0.01492 0.01186
## Cumulative Proportion  0.97322 0.98814 1.00000

plot(dat.pca)

Example 2 - genetic background

A simple toy example from statistical genetics, to differentiate genetic background of subjects.

Let’s generate a function to simulate SNP data.

my.snp.pc.data = function(n,maf){
  hm=length(maf)
  z=matrix(ncol=hm,nrow=n)
  for(j in 1:hm){
    p=1-maf[j]
    mp=c(p^2,2*p*(1-p),(1-p)^2)
    z[,j]=apply(rmultinom(n,1,mp),2,order)[3,]-1
  }
  return(z)
}

Let’s pick two populations, 100 subjects each, with 50 SNPs.

n1 <- n2 <- 100
ns <- 50
set.seed(1)
maf1 <- runif(ns,0.1,0.5)
maf2 <- runif(ns,0.1,0.5)
z1 <- my.snp.pc.data(n1,maf1)
z2 <- my.snp.pc.data(n2,maf2)
z <- data.frame(rbind(z1,z2))
head(z)

##   X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20
## 1  1  0  0  2  0  0  0  1  2   0   0   0   0   0   0   1   2   1   0   1
## 2  0  0  0  0  1  0  2  0  1   0   0   0   2   0   1   0   2   1   0   0
## 3  1  0  1  0  1  2  1  1  1   1   0   0   0   1   1   0   1   1   1   1
## 4  0  2  1  0  2  1  0  0  1   0   0   1   1   1   1   0   1   1   0   2
## 5  0  0  0  0  1  1  2  0  0   0   1   1   0   1   0   0   1   0   0   0
## 6  0  0  1  1  0  1  0  1  1   0   0   0   2   0   1   0   2   0   1   1
##   X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 X32 X33 X34 X35 X36 X37 X38
## 1   1   0   0   0   1   0   0   1   2   1   0   1   0   0   2   0   0   0
## 2   2   1   1   0   0   0   0   0   1   0   0   0   0   0   1   0   1   0
## 3   1   0   1   2   0   1   1   1   2   1   0   2   0   1   1   1   0   0
## 4   1   0   2   0   0   0   0   1   0   0   0   2   0   0   1   1   2   0
## 5   2   1   0   0   1   0   1   0   0   1   1   0   0   0   1   0   1   0
## 6   0   0   0   0   0   1   0   0   1   0   1   2   1   1   1   1   1   0
##   X39 X40 X41 X42 X43 X44 X45 X46 X47 X48 X49 X50
## 1   0   1   0   0   2   0   1   2   1   0   0   1
## 2   0   0   1   1   0   0   1   1   0   1   0   1
## 3   2   0   0   1   0   0   0   0   1   0   0   1
## 4   2   0   1   0   0   0   0   0   0   0   1   1
## 5   1   0   2   1   1   0   1   0   0   1   1   0
## 6   0   1   0   0   1   1   1   1   1   0   1   0

“prcomp” is one function in R to do principal compenents.

snp.pca <- prcomp(z,retx=TRUE,center=TRUE,scale.=TRUE)
snp.p <- predict(snp.pca)

Let’s plot the first two principal compenents, and color them by population.

plot(snp.p[,1:2],pch=21,bg=rep(c("blue","red"),c(n1,n2)),xaxt="n",yaxt="n",
       xlab="Principal Component 1",ylab="Principal Component 2")

Let’s generate a new set of 50 samples from population 1, and see where they fall on the plot.

n3 <- 50
z3 <- data.frame(my.snp.pc.data(n3,maf1))
p3 <- predict(snp.pca,z3)
plot(snp.p[,1:2],pch=21,bg=rep(c("blue","red"),c(n1,n2)),xaxt="n",yaxt="n",
     xlab="Principal Component 1",ylab="Principal Component 2")
points(p3[,1:2],pch=21,bg="green3",cex=1.5)

Clustering

Let’s load the tissue gene expression data introduced in the lecture. You can download the data in binary format from the class web page and read them in (as I have done on my laptop), or read them directly over the network (uncomment the line with the url() command, and put a # in front of the first line). The latter will take longer.

load("/Users/ingo/kogo/web/robj/tissuesGeneExpression.rda")
# load(url("http://biostat.jhsph.edu/~iruczins/teaching/kogo/robj/tissuesGeneExpression.rda"))

library(Biobase)

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colnames,
##     do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff,
##     sort, table, tapply, union, unique, unsplit

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

rownames(tab) <- tab$filename
t <- ExpressionSet(e, AnnotatedDataFrame(tab))
t$Tissue <- factor(t$Tissue)
colnames(t) <- paste0(t$Tissue, seq_len(ncol(t)))
boxplot(exprs(t), range=0)

We will take the transpose of the vector, as the dist function will compute distances between the rows, and we want sample-sample distances: Note that the value in the 1,2 slot of the matrix of the object d is the same as the square root of the sum of squared differences, i.e., the definition of the Euclidean distance.

x <- t(exprs(t))
d <- dist(x)
class(d)

## [1] "dist"

as.matrix(d)[1,2]

## [1] 85.8546

sqrt(sum((x[1,] - x[2,])^2))

## [1] 85.8546

We can perform hierarchical clustering based on the distances defined above, using the hclust function. The plot method will make a plot of the tree that results from hclust.

hc <- hclust(d)
hc

## 
## Call:
## hclust(d = d)
## 
## Cluster method   : complete 
## Distance         : euclidean 
## Number of objects: 189

plot(hc)

library(rafalib)
mypar()
plot(hc, labels=t$Tissue)

myplclust(hc, labels=t$Tissue, lab.col=as.numeric(t$Tissue))
abline(h=120)

If we use the line above to cut the tree into clusters, we can examine how the clusters overlap with the actual tissues:

hclusters <- cutree(hc, h=120)
table(true=t$Tissue, cluster=hclusters)

##              cluster
## true           1  2  3  4  5  6  7  8  9 10 11 12 13 14
##   cerebellum   0  0  0  0 31  0  0  0  2  0  0  5  0  0
##   colon        0  0  0  0  0  0 34  0  0  0  0  0  0  0
##   endometrium  0  0  0  0  0  0  0  0  0  0 15  0  0  0
##   hippocampus  0  0 12 19  0  0  0  0  0  0  0  0  0  0
##   kidney       9 18  0  0  0 10  0  0  2  0  0  0  0  0
##   liver        0  0  0  0  0  0  0 24  0  2  0  0  0  0
##   placenta     0  0  0  0  0  0  0  0  0  0  0  0  2  4

We can also call the kmeans function to perform k-means clustering as was explained in the lecture. Let’s run k-means on the samples in the space of the first two genes, so that we can observe the results of the algorithm in the same space:

plot(x[,1], x[,2])

km <- kmeans(x[,1:2], centers=3)
names(km)

## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

plot(x[,1], x[,2], col=km$cluster, pch=16)

Note that if we perform k-means clustering using all of the genes, the results are not the same as before. Looking at the first two genes doesn’t explain why the clusters we see were generated. So we need to use a different method to see these inter-samples distances.

km <- kmeans(x, centers=3)
plot(x[,1], x[,2], col=km$cluster, pch=16)

Let’s try one last time with larger ‘k’:

km <- kmeans(x, centers=7)
table(true=t$Tissue, cluster=km$cluster)

##              cluster
## true           1  2  3  4  5  6  7
##   cerebellum   0  0 31  2  0  0  5
##   colon        0  0  0  0 34  0  0
##   endometrium  0  0  0  0  0 15  0
##   hippocampus  0  0  0  0  0  0 31
##   kidney      19  0  0  2  0 18  0
##   liver        0 24  0  2  0  0  0
##   placenta     0  0  0  0  6  0  0

We weren’t able to see why the k-means algorithm defined a certain set of clusters using only the first two genes.

Instead of the first two genes, let’s use the multi-dimensional scaling algorithm introduced in the lectures. This is a projection from the space of all genes to a two dimensional space, which mostly preserves the inter-sample distances. The cmdscale function in R takes a distance object and returns a matrix which has two dimensions (columns) for each sample.

mds <- cmdscale(dist(x))
plot(mds, col=km$cluster, pch=16)

We can also plot the names of the tissues with the color of the cluster.

plot(mds, type="n")
text(mds, colnames(t), col=km$cluster)

Heatmaps

Heatmaps are useful plots for visualizing the expression values for a subset of genes over all the samples. The dendrogram on top and on the side is a hierarchical clustering as we saw before. First we will use the heatmap available in base R. First define a color palette.

# install.packages("RColorBrewer")
library(RColorBrewer)
hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)

Now, pick the genes with the top variance over all samples:

library(genefilter)
rv <- rowVars(exprs(t))
idx <- order(-rv)[1:40]

Now we can plot a heatmap of these genes:

heatmap(exprs(t)[idx,], col=hmcol)

The heatmap.2 function in the gplots package on CRAN is a bit more customizable, and stretches to fill the window. Here we add colors to indicate the tissue on the top:

library(gplots)

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

cols <- palette(brewer.pal(8, "Dark2"))[t$Tissue]
cbind(colnames(t),cols)

##                         cols     
##   [1,] "kidney1"        "#66A61E"
##   [2,] "kidney2"        "#66A61E"
##   [3,] "kidney3"        "#66A61E"
##   [4,] "kidney4"        "#66A61E"
##   [5,] "kidney5"        "#66A61E"
##   [6,] "kidney6"        "#66A61E"
##   [7,] "kidney7"        "#66A61E"
##   [8,] "kidney8"        "#66A61E"
##   [9,] "kidney9"        "#66A61E"
##  [10,] "kidney10"       "#66A61E"
##  [11,] "kidney11"       "#66A61E"
##  [12,] "kidney12"       "#66A61E"
##  [13,] "kidney13"       "#66A61E"
##  [14,] "kidney14"       "#66A61E"
##  [15,] "kidney15"       "#66A61E"
##  [16,] "kidney16"       "#66A61E"
##  [17,] "hippocampus17"  "#E7298A"
##  [18,] "hippocampus18"  "#E7298A"
##  [19,] "hippocampus19"  "#E7298A"
##  [20,] "hippocampus20"  "#E7298A"
##  [21,] "hippocampus21"  "#E7298A"
##  [22,] "hippocampus22"  "#E7298A"
##  [23,] "hippocampus23"  "#E7298A"
##  [24,] "hippocampus24"  "#E7298A"
##  [25,] "hippocampus25"  "#E7298A"
##  [26,] "hippocampus26"  "#E7298A"
##  [27,] "hippocampus27"  "#E7298A"
##  [28,] "hippocampus28"  "#E7298A"
##  [29,] "hippocampus29"  "#E7298A"
##  [30,] "hippocampus30"  "#E7298A"
##  [31,] "hippocampus31"  "#E7298A"
##  [32,] "hippocampus32"  "#E7298A"
##  [33,] "hippocampus33"  "#E7298A"
##  [34,] "hippocampus34"  "#E7298A"
##  [35,] "hippocampus35"  "#E7298A"
##  [36,] "hippocampus36"  "#E7298A"
##  [37,] "hippocampus37"  "#E7298A"
##  [38,] "hippocampus38"  "#E7298A"
##  [39,] "hippocampus39"  "#E7298A"
##  [40,] "hippocampus40"  "#E7298A"
##  [41,] "hippocampus41"  "#E7298A"
##  [42,] "hippocampus42"  "#E7298A"
##  [43,] "hippocampus43"  "#E7298A"
##  [44,] "hippocampus44"  "#E7298A"
##  [45,] "hippocampus45"  "#E7298A"
##  [46,] "hippocampus46"  "#E7298A"
##  [47,] "cerebellum47"   "#1B9E77"
##  [48,] "cerebellum48"   "#1B9E77"
##  [49,] "cerebellum49"   "#1B9E77"
##  [50,] "cerebellum50"   "#1B9E77"
##  [51,] "cerebellum51"   "#1B9E77"
##  [52,] "cerebellum52"   "#1B9E77"
##  [53,] "cerebellum53"   "#1B9E77"
##  [54,] "cerebellum54"   "#1B9E77"
##  [55,] "cerebellum55"   "#1B9E77"
##  [56,] "cerebellum56"   "#1B9E77"
##  [57,] "cerebellum57"   "#1B9E77"
##  [58,] "cerebellum58"   "#1B9E77"
##  [59,] "cerebellum59"   "#1B9E77"
##  [60,] "cerebellum60"   "#1B9E77"
##  [61,] "cerebellum61"   "#1B9E77"
##  [62,] "cerebellum62"   "#1B9E77"
##  [63,] "cerebellum63"   "#1B9E77"
##  [64,] "cerebellum64"   "#1B9E77"
##  [65,] "cerebellum65"   "#1B9E77"
##  [66,] "cerebellum66"   "#1B9E77"
##  [67,] "cerebellum67"   "#1B9E77"
##  [68,] "cerebellum68"   "#1B9E77"
##  [69,] "cerebellum69"   "#1B9E77"
##  [70,] "cerebellum70"   "#1B9E77"
##  [71,] "cerebellum71"   "#1B9E77"
##  [72,] "cerebellum72"   "#1B9E77"
##  [73,] "cerebellum73"   "#1B9E77"
##  [74,] "kidney74"       "#66A61E"
##  [75,] "kidney75"       "#66A61E"
##  [76,] "kidney76"       "#66A61E"
##  [77,] "kidney77"       "#66A61E"
##  [78,] "kidney78"       "#66A61E"
##  [79,] "kidney79"       "#66A61E"
##  [80,] "kidney80"       "#66A61E"
##  [81,] "kidney81"       "#66A61E"
##  [82,] "kidney82"       "#66A61E"
##  [83,] "kidney83"       "#66A61E"
##  [84,] "kidney84"       "#66A61E"
##  [85,] "kidney85"       "#66A61E"
##  [86,] "kidney86"       "#66A61E"
##  [87,] "colon87"        "#D95F02"
##  [88,] "colon88"        "#D95F02"
##  [89,] "colon89"        "#D95F02"
##  [90,] "colon90"        "#D95F02"
##  [91,] "colon91"        "#D95F02"
##  [92,] "colon92"        "#D95F02"
##  [93,] "colon93"        "#D95F02"
##  [94,] "colon94"        "#D95F02"
##  [95,] "colon95"        "#D95F02"
##  [96,] "colon96"        "#D95F02"
##  [97,] "colon97"        "#D95F02"
##  [98,] "colon98"        "#D95F02"
##  [99,] "colon99"        "#D95F02"
## [100,] "colon100"       "#D95F02"
## [101,] "colon101"       "#D95F02"
## [102,] "colon102"       "#D95F02"
## [103,] "colon103"       "#D95F02"
## [104,] "colon104"       "#D95F02"
## [105,] "colon105"       "#D95F02"
## [106,] "colon106"       "#D95F02"
## [107,] "colon107"       "#D95F02"
## [108,] "colon108"       "#D95F02"
## [109,] "colon109"       "#D95F02"
## [110,] "colon110"       "#D95F02"
## [111,] "colon111"       "#D95F02"
## [112,] "colon112"       "#D95F02"
## [113,] "colon113"       "#D95F02"
## [114,] "colon114"       "#D95F02"
## [115,] "colon115"       "#D95F02"
## [116,] "colon116"       "#D95F02"
## [117,] "colon117"       "#D95F02"
## [118,] "colon118"       "#D95F02"
## [119,] "colon119"       "#D95F02"
## [120,] "colon120"       "#D95F02"
## [121,] "kidney121"      "#66A61E"
## [122,] "kidney122"      "#66A61E"
## [123,] "kidney123"      "#66A61E"
## [124,] "liver124"       "#E6AB02"
## [125,] "liver125"       "#E6AB02"
## [126,] "liver126"       "#E6AB02"
## [127,] "kidney127"      "#66A61E"
## [128,] "kidney128"      "#66A61E"
## [129,] "kidney129"      "#66A61E"
## [130,] "liver130"       "#E6AB02"
## [131,] "liver131"       "#E6AB02"
## [132,] "liver132"       "#E6AB02"
## [133,] "kidney133"      "#66A61E"
## [134,] "kidney134"      "#66A61E"
## [135,] "cerebellum135"  "#1B9E77"
## [136,] "hippocampus136" "#E7298A"
## [137,] "liver137"       "#E6AB02"
## [138,] "liver138"       "#E6AB02"
## [139,] "liver139"       "#E6AB02"
## [140,] "liver140"       "#E6AB02"
## [141,] "liver141"       "#E6AB02"
## [142,] "liver142"       "#E6AB02"
## [143,] "cerebellum143"  "#1B9E77"
## [144,] "cerebellum144"  "#1B9E77"
## [145,] "liver145"       "#E6AB02"
## [146,] "liver146"       "#E6AB02"
## [147,] "kidney147"      "#66A61E"
## [148,] "kidney148"      "#66A61E"
## [149,] "endometrium149" "#7570B3"
## [150,] "endometrium150" "#7570B3"
## [151,] "endometrium151" "#7570B3"
## [152,] "endometrium152" "#7570B3"
## [153,] "endometrium153" "#7570B3"
## [154,] "endometrium154" "#7570B3"
## [155,] "endometrium155" "#7570B3"
## [156,] "endometrium156" "#7570B3"
## [157,] "endometrium157" "#7570B3"
## [158,] "endometrium158" "#7570B3"
## [159,] "endometrium159" "#7570B3"
## [160,] "endometrium160" "#7570B3"
## [161,] "endometrium161" "#7570B3"
## [162,] "endometrium162" "#7570B3"
## [163,] "endometrium163" "#7570B3"
## [164,] "liver164"       "#E6AB02"
## [165,] "liver165"       "#E6AB02"
## [166,] "liver166"       "#E6AB02"
## [167,] "liver167"       "#E6AB02"
## [168,] "liver168"       "#E6AB02"
## [169,] "liver169"       "#E6AB02"
## [170,] "liver170"       "#E6AB02"
## [171,] "liver171"       "#E6AB02"
## [172,] "liver172"       "#E6AB02"
## [173,] "liver173"       "#E6AB02"
## [174,] "liver174"       "#E6AB02"
## [175,] "liver175"       "#E6AB02"
## [176,] "cerebellum176"  "#1B9E77"
## [177,] "cerebellum177"  "#1B9E77"
## [178,] "cerebellum178"  "#1B9E77"
## [179,] "cerebellum179"  "#1B9E77"
## [180,] "cerebellum180"  "#1B9E77"
## [181,] "cerebellum181"  "#1B9E77"
## [182,] "cerebellum182"  "#1B9E77"
## [183,] "cerebellum183"  "#1B9E77"
## [184,] "placenta184"    "#A6761D"
## [185,] "placenta185"    "#A6761D"
## [186,] "placenta186"    "#A6761D"
## [187,] "placenta187"    "#A6761D"
## [188,] "placenta188"    "#A6761D"
## [189,] "placenta189"    "#A6761D"

heatmap.2(exprs(t)[idx,], trace="none", ColSideColors=cols, col=hmcol)