Basic questions

Some activities related to cluster analysis and classification with RNA-seq data

You will form five selected groups of size 4-5 persons. You will take 5-7 minutes to sketch written plans of attack on the following questions and then submit your plans for discussion and demonstration.

Basic questions

Given 12.9K gene-level read counts from 129 individuals representing an unspecified number of populations

Should we modify the data?
Define an approach to counting the populations and assigning samples to them. How can approaches be compared?
Fact: There are approximately equal numbers of samples from each population.

Suppose you are given the population labels. State principles for extracting expression profiles that distinguish the populations. Suggest how to develop biological interpretation of the profiles.
Define a classification procedure that can infer population of origin on the basis of one or more transcript profiles. How good is the algorithm?

An attempt at solution

The basic data. Derivation described later.

load("montpick_eset.RData")

montpick.eset

## Error: object 'montpick.eset' not found

Remove pure zeroes.

pz = apply(exprs(montpick.eset),1,
  function(x) all(x==0))
mnz = montpick.eset[-which(pz),]

There is some additional scrubbing needed…

dim(mnz)

## Features  Samples 
##    12160      129

Simple clustering approach.

hc = hclust(dist(t(exprs(mnz))))

table(cutree(hc, k=2))

## 
##   1   2 
## 128   1

table(cutree(hc, k=3))

## 
##  1  2  3 
## 84 44  1

table(cutree(hc, k=4))

## 
##  1  2  3  4 
## 84 33 11  1

Try with simple logarithm after shift by one.

hc.l = hclust(dist(t(log(exprs(mnz)+1))))

table(cutree(hc.l, k=2))

## 
##   1   2 
## 122   7

table(cutree(hc.l, k=3))

## 
##  1  2  3 
## 30 92  7

table(cutree(hc.l, k=4))

## 
##  1  2  3  4 
## 30 92  6  1

Try PAM:

library(cluster)
pam.2 = pam(dist(t(log(exprs(mnz)+1))), k=2)
pam.3 = pam(dist(t(log(exprs(mnz)+1))), k=3)
pam.4 = pam(dist(t(log(exprs(mnz)+1))), k=4)
table(pam.4$clust)

## 
##  1  2  3  4 
## 45 20 49 15

Try K-means:

km.2 = kmeans(t(log(exprs(mnz)+1)), 2)
km.3 = kmeans(t(log(exprs(mnz)+1)), 3)
table(km.2$clust)

## 
##  1  2 
## 77 52

table(km.3$clust)

## 
##  1  2  3 
## 33 60 36

Try kmeans with the voom transformation of the counts.

library(limma)
vmnz = voom(exprs(mnz))
txd = vmnz$E
lkm2v = kmeans(t(txd),2)
lkm4v = kmeans(t(txd),4)
table(lkm2v$clust)

## 
##  1  2 
## 63 66

table(lkm4v$clust)

## 
##  1  2  3  4 
## 15 43 60 11

Consider 'clusterStability' measure.

Try a DESeq analysis.

se = SummarizedExperiment(assays=list(counts=exprs(mnz)))
colData(se) = DataFrame(pData(mnz))
de = DESeqDataSet(se, ~population)
derun = DESeq(de)

## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 271 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds))
## estimating dispersions
## fitting model and testing

res = results(derun)

summarizeResults(res)

## 
## out of 12160 with nonzero total read count
## 
## at adjusted p-value < 0.1:
## 
## LFC > 0 (up)   : 3143, 26% 
## LFC < 0 (down) : 2975, 24% 
## 
## filtered *     : 1216, 10% 
## flagged **     : 0, 0% 
## 
## * for low mean count, see independentFiltering argument of results()
## ** for high Cook's distance, see cooksCutoff argument of results()

res[which(res$padj < .005),]

## log2 fold change (MAP): population YRI vs CEU 
## Wald test p-value: population YRI vs CEU 
## DataFrame with 3804 rows and 6 columns
##                  baseMean log2FoldChange     lfcSE      stat    pvalue
##                 <numeric>      <numeric> <numeric> <numeric> <numeric>
## ENSG00000000419    41.785         0.5559   0.09824     5.658 1.528e-08
## ENSG00000000457    48.692         0.4202   0.09047     4.645 3.400e-06
## ENSG00000002330    65.562        -0.6426   0.08949    -7.180 6.966e-13
## ENSG00000002745     1.779         1.7037   0.40252     4.233 2.310e-05
## ENSG00000002834   155.210        -0.6368   0.10974    -5.803 6.516e-09
## ...                   ...            ...       ...       ...       ...
## ENSG00000253873     8.008        -0.6856    0.1749    -3.920 8.849e-05
## ENSG00000253954     9.913        -0.6413    0.1505    -4.262 2.023e-05
## ENSG00000254004     4.520         1.0417    0.1723     6.045 1.492e-09
## ENSG00000254128     9.615         1.0443    0.1772     5.894 3.774e-09
## ENSG00000254206     1.843         2.2559    0.3519     6.410 1.452e-10
##                      padj
##                 <numeric>
## ENSG00000000419 1.151e-07
## ENSG00000000457 1.732e-05
## ENSG00000002330 1.015e-11
## ENSG00000002745 1.007e-04
## ENSG00000002834 5.190e-08
## ...                   ...
## ENSG00000253873 3.470e-04
## ENSG00000253954 8.945e-05
## ENSG00000254004 1.331e-08
## ENSG00000254128 3.120e-08
## ENSG00000254206 1.511e-09

Is this sensitive to technical artifacts? Use SVA on log-transformed data and check.

num = num.sv(log(exprs(mnz)+1), 
  mod=model.matrix(~population,data=pData(mnz)), method="leek")
sv1 = sva(log(exprs(mnz)+1), 
   mod=model.matrix(~population,data=pData(mnz)), n.sv=num)

## Number of significant surrogate variables is:  1 
## Iteration (out of 5 ):1  2  3  4  5

print(num)

## [1] 1

summary(sv1$sv)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.1600 -0.0633 -0.0220  0.0000  0.0503  0.3120

secorr = se
secorr$sv = sv1$sv

decorr = DESeqDataSet(secorr, ~sv+population)
derun2 = DESeq(decorr)

## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing

corres = results(derun2)

summarizeResults(corres)

## 
## out of 12160 with nonzero total read count
## 
## at adjusted p-value < 0.1:
## 
## LFC > 0 (up)   : 3161, 26% 
## LFC < 0 (down) : 3006, 25% 
## 
## filtered *     : 1216, 10% 
## flagged **     : 0, 0% 
## 
## * for low mean count, see independentFiltering argument of results()
## ** for high Cook's distance, see cooksCutoff argument of results()

corres[which(corres$padj < .005),]

## log2 fold change (MAP): population YRI vs CEU 
## Wald test p-value: population YRI vs CEU 
## DataFrame with 3818 rows and 6 columns
##                  baseMean log2FoldChange     lfcSE      stat    pvalue
##                 <numeric>      <numeric> <numeric> <numeric> <numeric>
## ENSG00000000419    41.785         0.5780   0.09948     5.810 6.247e-09
## ENSG00000000457    48.692         0.3842   0.09019     4.260 2.040e-05
## ENSG00000002330    65.562        -0.6635   0.09199    -7.213 5.487e-13
## ENSG00000002745     1.779         1.7222   0.40362     4.267 1.982e-05
## ENSG00000002834   155.210        -0.7126   0.11082    -6.430 1.277e-10
## ...                   ...            ...       ...       ...       ...
## ENSG00000253873     8.008        -0.6432    0.1780    -3.612 3.034e-04
## ENSG00000253954     9.913        -0.6010    0.1526    -3.938 8.220e-05
## ENSG00000254004     4.520         1.0304    0.1752     5.880 4.095e-09
## ENSG00000254128     9.615         1.0214    0.1800     5.673 1.400e-08
## ENSG00000254206     1.843         2.3030    0.3464     6.647 2.983e-11
##                      padj
##                 <numeric>
## ENSG00000000419 4.947e-08
## ENSG00000000457 8.866e-05
## ENSG00000002330 7.980e-12
## ENSG00000002745 8.639e-05
## ENSG00000002834 1.302e-09
## ...                   ...
## ENSG00000253873 1.045e-03
## ENSG00000253954 3.181e-04
## ENSG00000254004 3.337e-08
## ENSG00000254128 1.048e-07
## ENSG00000254206 3.281e-10

Approaches to classification

library(randomGLM)

## Loading required package: MASS
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:genefilter':
## 
##     area
## 
## The following object is masked from 'package:AnnotationDbi':
## 
##     select
## 
## Loading required package: foreach
## foreach: simple, scalable parallel programming from Revolution Analytics
## Use Revolution R for scalability, fault tolerance and more.
## http://www.revolutionanalytics.com
## Loading required package: doParallel
## Loading required package: iterators

mysamp = sample(1:129, size=75)
train = mnz[,sort(mysamp)]
test = mnz[,-sort(mysamp)]
y = 1*(train$pop == "YRI")

## Warning: Name partially matched in data frame

rglm1 = randomGLM( t(exprs(train)), y, xtest=t(exprs(test)) )

table(rglm1$predictedTest, test$pop)

## Warning: Name partially matched in data frame

##    
##     CEU YRI
##   0  22   3
##   1   0  29

An alternate approach to ensemble learning

library(bigrf)

## Loading required package: bigmemory
## Loading required package: bigmemory.sri
## Loading required package: BH
## 
## bigmemory >= 4.0 is a major revision since 3.1.2; please see packages
## biganalytics and and bigtabulate and http://www.bigmemory.org for more information.

isyri = as.integer(1*(train$pop=="YRI")+1)

## Warning: Name partially matched in data frame

bb = bigrfc(data.frame(t(exprs(train))), isyri)

## OOB errors:
##  Tree  Overall error  Error by class
##                           1      2
##    10          12.00   5.26  18.92
##    20          10.67   7.89  13.51
##    30           6.67   5.26   8.11
##    40           6.67   2.63  10.81
##    50           5.33   2.63   8.11

bigpr = predict(bb, x=data.frame(t(exprs(test))))

## Processing tree number:
##    10
##    20
##    30
##    40
##    50

table(bigpr, test$pop)

## Warning: Name partially matched in data frame

##      
## bigpr CEU YRI
##     1  22   3
##     2   0  29