## Some activities related to cluster analysis and classification with RNA-seq data

You will form five selected groups of size 4-5 persons. You will take 5-7 minutes to sketch written plans of attack on the following questions and then submit your plans for discussion and demonstration.

### Basic questions

1. Given 12.9K gene-level read counts from 129 individuals representing an unspecified number of populations
• Should we modify the data?
• Define an approach to counting the populations and assigning samples to them. How can approaches be compared?
• Fact: There are approximately equal numbers of samples from each population.
2. Suppose you are given the population labels. State principles for extracting expression profiles that distinguish the populations. Suggest how to develop biological interpretation of the profiles.
3. Define a classification procedure that can infer population of origin on the basis of one or more transcript profiles. How good is the algorithm?

### An attempt at solution

#### The basic data. Derivation described later.

load("montpick_eset.RData")

montpick.eset

## Error: object 'montpick.eset' not found


Remove pure zeroes.

pz = apply(exprs(montpick.eset),1,
function(x) all(x==0))
mnz = montpick.eset[-which(pz),]


There is some additional scrubbing needed…

dim(mnz)

## Features  Samples
##    12160      129


Simple clustering approach.

hc = hclust(dist(t(exprs(mnz))))

table(cutree(hc, k=2))

##
##   1   2
## 128   1

table(cutree(hc, k=3))

##
##  1  2  3
## 84 44  1

table(cutree(hc, k=4))

##
##  1  2  3  4
## 84 33 11  1


Try with simple logarithm after shift by one.

hc.l = hclust(dist(t(log(exprs(mnz)+1))))

table(cutree(hc.l, k=2))

##
##   1   2
## 122   7

table(cutree(hc.l, k=3))

##
##  1  2  3
## 30 92  7

table(cutree(hc.l, k=4))

##
##  1  2  3  4
## 30 92  6  1


Try PAM:

library(cluster)
pam.2 = pam(dist(t(log(exprs(mnz)+1))), k=2)
pam.3 = pam(dist(t(log(exprs(mnz)+1))), k=3)
pam.4 = pam(dist(t(log(exprs(mnz)+1))), k=4)
table(pam.4$clust)  ## ## 1 2 3 4 ## 45 20 49 15  Try K-means: km.2 = kmeans(t(log(exprs(mnz)+1)), 2) km.3 = kmeans(t(log(exprs(mnz)+1)), 3) table(km.2$clust)

##
##  1  2
## 77 52

table(km.3$clust)  ## ## 1 2 3 ## 33 60 36  Try kmeans with the voom transformation of the counts. library(limma) vmnz = voom(exprs(mnz)) txd = vmnz$E
lkm2v = kmeans(t(txd),2)
lkm4v = kmeans(t(txd),4)
table(lkm2v$clust)  ## ## 1 2 ## 63 66  table(lkm4v$clust)

##
##  1  2  3  4
## 15 43 60 11


Consider 'clusterStability' measure.

Try a DESeq analysis.

se = SummarizedExperiment(assays=list(counts=exprs(mnz)))
colData(se) = DataFrame(pData(mnz))
de = DESeqDataSet(se, ~population)
derun = DESeq(de)

## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 271 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds))
## estimating dispersions
## fitting model and testing

res = results(derun)

summarizeResults(res)

##
## out of 12160 with nonzero total read count
##
## at adjusted p-value < 0.1:
##
## LFC > 0 (up)   : 3143, 26%
## LFC < 0 (down) : 2975, 24%
##
## filtered *     : 1216, 10%
## flagged **     : 0, 0%
##
## * for low mean count, see independentFiltering argument of results()
## ** for high Cook's distance, see cooksCutoff argument of results()

res[which(res$padj < .005),]  ## log2 fold change (MAP): population YRI vs CEU ## Wald test p-value: population YRI vs CEU ## DataFrame with 3804 rows and 6 columns ## baseMean log2FoldChange lfcSE stat pvalue ## <numeric> <numeric> <numeric> <numeric> <numeric> ## ENSG00000000419 41.785 0.5559 0.09824 5.658 1.528e-08 ## ENSG00000000457 48.692 0.4202 0.09047 4.645 3.400e-06 ## ENSG00000002330 65.562 -0.6426 0.08949 -7.180 6.966e-13 ## ENSG00000002745 1.779 1.7037 0.40252 4.233 2.310e-05 ## ENSG00000002834 155.210 -0.6368 0.10974 -5.803 6.516e-09 ## ... ... ... ... ... ... ## ENSG00000253873 8.008 -0.6856 0.1749 -3.920 8.849e-05 ## ENSG00000253954 9.913 -0.6413 0.1505 -4.262 2.023e-05 ## ENSG00000254004 4.520 1.0417 0.1723 6.045 1.492e-09 ## ENSG00000254128 9.615 1.0443 0.1772 5.894 3.774e-09 ## ENSG00000254206 1.843 2.2559 0.3519 6.410 1.452e-10 ## padj ## <numeric> ## ENSG00000000419 1.151e-07 ## ENSG00000000457 1.732e-05 ## ENSG00000002330 1.015e-11 ## ENSG00000002745 1.007e-04 ## ENSG00000002834 5.190e-08 ## ... ... ## ENSG00000253873 3.470e-04 ## ENSG00000253954 8.945e-05 ## ENSG00000254004 1.331e-08 ## ENSG00000254128 3.120e-08 ## ENSG00000254206 1.511e-09  Is this sensitive to technical artifacts? Use SVA on log-transformed data and check. num = num.sv(log(exprs(mnz)+1), mod=model.matrix(~population,data=pData(mnz)), method="leek") sv1 = sva(log(exprs(mnz)+1), mod=model.matrix(~population,data=pData(mnz)), n.sv=num)  ## Number of significant surrogate variables is: 1 ## Iteration (out of 5 ):1 2 3 4 5  print(num)  ## [1] 1  summary(sv1$sv)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
## -0.1600 -0.0633 -0.0220  0.0000  0.0503  0.3120

secorr = se
secorr$sv = sv1$sv

decorr = DESeqDataSet(secorr, ~sv+population)
derun2 = DESeq(decorr)

## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing

corres = results(derun2)

summarizeResults(corres)

##
## out of 12160 with nonzero total read count
##
## at adjusted p-value < 0.1:
##
## LFC > 0 (up)   : 3161, 26%
## LFC < 0 (down) : 3006, 25%
##
## filtered *     : 1216, 10%
## flagged **     : 0, 0%
##
## * for low mean count, see independentFiltering argument of results()
## ** for high Cook's distance, see cooksCutoff argument of results()

corres[which(corres$padj < .005),]  ## log2 fold change (MAP): population YRI vs CEU ## Wald test p-value: population YRI vs CEU ## DataFrame with 3818 rows and 6 columns ## baseMean log2FoldChange lfcSE stat pvalue ## <numeric> <numeric> <numeric> <numeric> <numeric> ## ENSG00000000419 41.785 0.5780 0.09948 5.810 6.247e-09 ## ENSG00000000457 48.692 0.3842 0.09019 4.260 2.040e-05 ## ENSG00000002330 65.562 -0.6635 0.09199 -7.213 5.487e-13 ## ENSG00000002745 1.779 1.7222 0.40362 4.267 1.982e-05 ## ENSG00000002834 155.210 -0.7126 0.11082 -6.430 1.277e-10 ## ... ... ... ... ... ... ## ENSG00000253873 8.008 -0.6432 0.1780 -3.612 3.034e-04 ## ENSG00000253954 9.913 -0.6010 0.1526 -3.938 8.220e-05 ## ENSG00000254004 4.520 1.0304 0.1752 5.880 4.095e-09 ## ENSG00000254128 9.615 1.0214 0.1800 5.673 1.400e-08 ## ENSG00000254206 1.843 2.3030 0.3464 6.647 2.983e-11 ## padj ## <numeric> ## ENSG00000000419 4.947e-08 ## ENSG00000000457 8.866e-05 ## ENSG00000002330 7.980e-12 ## ENSG00000002745 8.639e-05 ## ENSG00000002834 1.302e-09 ## ... ... ## ENSG00000253873 1.045e-03 ## ENSG00000253954 3.181e-04 ## ENSG00000254004 3.337e-08 ## ENSG00000254128 1.048e-07 ## ENSG00000254206 3.281e-10  Approaches to classification library(randomGLM)  ## Loading required package: MASS ## ## Attaching package: 'MASS' ## ## The following object is masked from 'package:genefilter': ## ## area ## ## The following object is masked from 'package:AnnotationDbi': ## ## select ## ## Loading required package: foreach ## foreach: simple, scalable parallel programming from Revolution Analytics ## Use Revolution R for scalability, fault tolerance and more. ## http://www.revolutionanalytics.com ## Loading required package: doParallel ## Loading required package: iterators  mysamp = sample(1:129, size=75) train = mnz[,sort(mysamp)] test = mnz[,-sort(mysamp)] y = 1*(train$pop == "YRI")

## Warning: Name partially matched in data frame

rglm1 = randomGLM( t(exprs(train)), y, xtest=t(exprs(test)) )

table(rglm1$predictedTest, test$pop)

## Warning: Name partially matched in data frame

##
##     CEU YRI
##   0  22   3
##   1   0  29


An alternate approach to ensemble learning

library(bigrf)

## Loading required package: bigmemory

isyri = as.integer(1*(train$pop=="YRI")+1)  ## Warning: Name partially matched in data frame  bb = bigrfc(data.frame(t(exprs(train))), isyri)  ## OOB errors: ## Tree Overall error Error by class ## 1 2 ## 10 12.00 5.26 18.92 ## 20 10.67 7.89 13.51 ## 30 6.67 5.26 8.11 ## 40 6.67 2.63 10.81 ## 50 5.33 2.63 8.11  bigpr = predict(bb, x=data.frame(t(exprs(test))))  ## Processing tree number: ## 10 ## 20 ## 30 ## 40 ## 50  table(bigpr, test$pop)

## Warning: Name partially matched in data frame

##