## ----setup, include=FALSE----------------------------------------------------- options(fig_caption=TRUE) library(knitr) opts_chunk$set(out.extra='style="display:block; margin: auto"', fig.align="center") ## ----lib, warning=FALSE, message=FALSE, results="hide"------------------------ library(BioQC) library(hgu133plus2.db) ## to simulate an microarray expression dataset library(lattice) library(latticeExtra) library(gridExtra) library(gplots) pdf.options(family="ArialMT", useDingbats=FALSE) set.seed(1887) ## list human genes humanGenes <- unique(na.omit(unlist(as.list(hgu133plus2SYMBOL)))) ## read tissue-specific gene signatures gmtFile <- system.file("extdata/exp.tissuemark.affy.roche.symbols.gmt", package="BioQC") gmt <- readGmt(gmtFile) tissueInds <- sapply(gmt, function(x) match(x$genes, humanGenes)) ## ----sensitivity_benchmark_fig, echo=FALSE, fig.width=8, fig.height=4.5, dev='png', fig.cap=sprintf("**Figure 1:** Sensitivity benchmark. Expression levels of genes in the ovary signature are dedicately sampled randomly from normal distributions with different mean values. Left panel: enrichment scores reported by *BioQC* for the ovary signature, plotted against the differences in mean expression values; Right panel: rank of ovary enrichment scores in all %s signatures plotted against the difference in mean expression values.", length(gmt))---- tissueInds <- sapply(gmt, function(x) match(x$genes, humanGenes)) randomMatrixButOneSignature <- function(rows=humanGenes, signatureGenes, amplitudes=seq(0, 3, by=0.5)) { nrow <- length(rows) ncol <- length(amplitudes) mat <- matrix(rnorm(nrow*ncol), nrow=nrow, byrow=FALSE) rownames(mat) <- rows sigInd <- na.omit(match(signatureGenes, humanGenes)) colClass <- factor(amplitudes) for(colInd in unique(colClass)) { isCurrCol <- colInd==colClass replaceMatrix <- matrix(rnorm(length(sigInd)*sum(isCurrCol), mean=amplitudes[isCurrCol][1]), nrow=length(sigInd), byrow=FALSE) mat[sigInd, isCurrCol] <- replaceMatrix } return(mat) } selGeneSet <- "Ovary_NGS_RNASEQATLAS_0.6_3" selSignature <- gmt[[selGeneSet]]$genes senseAmplitudes <- rep(c(seq(0, 1, by=0.25), seq(1.5, 3, 0.5)), each=10) senseMat <- randomMatrixButOneSignature(rows=humanGenes, signatureGenes=selSignature, amplitudes=senseAmplitudes) senseBioQC <- wmwTest(senseMat, tissueInds, valType="p.greater", simplify=TRUE) senseRank <- apply(senseBioQC, 2, function(x) rank(x)[selGeneSet]) mydot <- function(x,y,abline=1,...) {panel.abline(h=abline, col="darkgray");panel.dotplot(x,y,...)} senseBW <- bwplot(-log10(senseBioQC[selGeneSet,])~senseAmplitudes, horizontal=FALSE, pch="|", do.out=FALSE, par.settings=list(box.rectangle=list(col="black", fill="#ccddee")), scales=list(tck=c(1,0), alternating=1L, x=list(at=seq(along=unique(senseAmplitudes)), labels=unique(senseAmplitudes))), ylab="Enrichment score", xlab="Mean expression difference") senseDot <- dotplot(-log10(senseBioQC[selGeneSet,])~senseAmplitudes, horizontal=FALSE, cex=0.9, panel=mydot, abline=0, scales=list(tck=c(1,0), alternating=1L, x=list(at=seq(along=unique(senseAmplitudes)), labels=unique(senseAmplitudes)))) senseRankBW <- bwplot(senseRank~senseAmplitudes, horizontal=FALSE, pch="|", do.out=FALSE, col="black", ylim=c(155, -5), par.settings=list(box.rectangle=list(col="black", fill="#d9dddd")), scales=list(tck=c(1,0), alternating=1L, y=list(at=c(1,50,100,150)), x=list(at=seq(along=unique(senseAmplitudes)), labels=unique(senseAmplitudes))), ylab="Enrichment score rank", xlab="Mean expression difference") senseRankDot <- dotplot(senseRank~senseAmplitudes, horizontal=FALSE, panel=mydot, abline=1, cex=0.9, col="black",ylim=c(155, -5), scales=list(tck=c(1,0), alternating=1L, x=list(at=seq(along=unique(senseAmplitudes)), labels=unique(senseAmplitudes)))) sensePlot <- senseBW + senseDot senseRankPlot <- senseRankBW + senseRankDot grid.arrange(sensePlot, senseRankPlot, ncol=2) ## ----mixing_benchmark, echo=FALSE--------------------------------------------- dogfile <- system.file("extdata/GSE20113.rda", package = "BioQC") # if(!file.exists(dogfile)) { # rawdog <- getGEO("GSE20113")[[1]] # filterFeatures <- function(eset) { # rawGeneSymbol <- fData(eset)[, "Gene Symbol"] # eset <- eset[!rawGeneSymbol %in% "" & !is.na(rawGeneSymbol),] # gs <- fData(eset)[, "Gene Symbol"] # # gsSplit <- split(1:nrow(eset), gs) # rmeans <- rowMeans(exprs(eset), na.rm=TRUE) # maxMean <- sapply(gsSplit, function(x) x[which.max(rmeans[x])]) # maxMean <- unlist(maxMean) # res <- eset[maxMean,] # fData(res)$GeneSymbol <- fData(res)[, "Gene Symbol"] # return(res) # } # dog <- filterFeatures(rawdog) # dog$Label <- gsub("[0-9]$", "", as.character(dog$description)) # save(dog, file=dogfile) # } else { # } load(dogfile) dogInds <- sapply(gmt, function(x) match(x$genes, fData(dog)$GeneSymbol)) dogBioQC <- wmwTest(dog, gmt, valType="p.greater", simplify=TRUE) dogEnrich <- absLog10p(dogBioQC) dogEnrich.best <- apply(dogEnrich,2, which.max) dogEnrich.second <- apply(dogEnrich,2, function(x) which(rank(-x)==2)) shortLabel <- function(x) gsub("_NR_0\\.7_3|_NGS_RNASEQATLAS_0\\.6_3", "", x) dogEnrich.bestLabels <- shortLabel(rownames(dogEnrich)[dogEnrich.best]) dogEnrich.secondLabels <- shortLabel(rownames(dogEnrich)[dogEnrich.second]) dogTable <- data.frame(Label=dog$Label, BioQC.best=dogEnrich.bestLabels, BioQC.second=dogEnrich.secondLabels, row.names=sampleNames(dog)) ## ----dog_table, echo=FALSE---------------------------------------------------- kable(dogTable, caption="Quality control of the mixing benchmark input data with *BioQC*. Four columns (f.l.t.r.): sample index; tissue reported by the authors; the tissue signature with the highest enrichment score reported by *BioQC*; the tissue signature with the second- highest enrichment score.") ## ----hj_mix, echo=FALSE------------------------------------------------------- heart <- rowMeans(exprs(dog)[, dog$Label=="Heart"]) jejunum <- rowMeans(exprs(dog)[, dog$Label=="Jejunum"]) linearMix <- function(vec1, vec2, prop2=0) { stopifnot(prop2>=0 && prop2<=1) return(vec1*(1-prop2)+vec2*prop2) } mixProps <- seq(0, 1, 0.05) mixPropsLabels <- sprintf("%f%%", mixProps*100) hjMix <- sapply(mixProps, function(x) linearMix(jejunum, heart, x)) hjMixBioQC <- wmwTest(hjMix, dogInds, valType="p.greater", simplify=TRUE) hjMixSub <- hjMixBioQC[c("Intestine_small_NR_0.7_3","Muscle_cardiac_NR_0.7_3"),] hjMixSubEnrich <- absLog10p(hjMixSub) hjMixBioQCrank <- apply(hjMixBioQC, 2, function(x) rank(x)) hjMixSubRank <- hjMixBioQCrank[c("Intestine_small_NR_0.7_3","Muscle_cardiac_NR_0.7_3"),] colnames(hjMixSubEnrich) <- colnames(hjMixSubRank) <- mixPropsLabels ## ----hjMixVis, echo=FALSE, fig.width=8, fig.height=4, dev='png', fig.cap="**Figure 2:** Results of a mixing case study. Left panel: *BioQC* enrichment scores of small intestine and cardiac muscle varying upon different proportions of jejunum; Right panel: ranks of enrichment scores varying upon different proportions of jejunum."---- mixPropsShow <- seq(0, 1, 0.25) mixPropsShowLabels <- sprintf("%d%%", mixPropsShow*100) hjCols <- c("orange", "lightblue") hjTissues <- c("Small intenstine","Cardiac muscle") hjMixData <- data.frame(Tissue=rep(hjTissues, ncol(hjMixSubEnrich)), Prop=rep(mixProps, each=nrow(hjMixSubEnrich)), EnrichScore=as.vector(hjMixSubEnrich)) hjMixDataRank <- data.frame(Tissue=rep(hjTissues, ncol(hjMixSubEnrich)), Prop=rep(mixProps, each=nrow(hjMixSubEnrich)), EnrichScore=as.vector(hjMixSubRank)) hjMixXY <- xyplot(EnrichScore ~ Prop, group=Tissue, data=hjMixData, type="b", xlab="Proportion of heart", ylab="Enrichment score", par.settings=list(superpose.symbol=list(cex=1.25, pch=16, col=hjCols), superpose.line=list(col=hjCols)), auto.key=list(columns=1L), abline=list(h=3, col="lightgray", lty=2, lwd=2), scales=list(tck=c(1,0), alternating=1L, x=list(at=mixPropsShow, labels=mixPropsShowLabels))) hjMixRankXY <- xyplot(EnrichScore ~ Prop, group=Tissue, data=hjMixDataRank, type="b", xlab="Proportion of heart", ylab="Enrichment score rank", ylim=c(155, 0.8), auto.key=list(columns=1L), par.settings=list(superpose.symbol=list(cex=1.25, pch=16, col=hjCols), superpose.line=list(col=hjCols)), abline=list(h=log2(10), col="lightgray", lty=2, lwd=2), scales=list(tck=c(1,0), alternating=1L, x=list(at=mixPropsShow, labels=mixPropsShowLabels), y=list(log=2, at=c(1,2,3,4,6,10,25,50,100,150)))) grid.arrange(hjMixXY, hjMixRankXY, ncol=2) ## ----dog_mix, echo=FALSE------------------------------------------------------ dogFilter <- dog[,-c(1,22,24)] dogAvg <- tapply(1:ncol(dogFilter), dogFilter$Label, function(x) rowMeans(exprs(dogFilter)[,x])) dogAvgMat <- do.call(cbind, dogAvg) dogLabels <- c("Brain_Cortex_prefrontal_NR_0.7_3", "Muscle_cardiac_NR_0.7_3", "Intestine_small_NR_0.7_3", "Kidney_NR_0.7_3", "Liver_NR_0.7_3", "Lung_NR_0.7_3", "Lymphocyte_B_FOLL_NR_0.7_3", "Pancreas_Islets_NR_0.7_3", "Muscle_skeletal_NR_0.7_3", "Monocytes_NR_0.7_3") dogComb <- subset(expand.grid(1:ncol(dogAvgMat),1:ncol(dogAvgMat)), Var2>Var1) dogPairwise <- apply(dogComb, 1, function(x) { vec1 <- dogAvgMat[,x[1]] vec2 <- dogAvgMat[,x[2]] label1 <- dogLabels[x[1]] label2 <- dogLabels[x[2]] mix <- sapply(mixProps, function(x) linearMix(vec1, vec2, x)) bioqc <- wmwTest(mix, dogInds, valType="p.greater", simplify=TRUE) ranks <- apply(bioqc, 2, rank) enrich <- absLog10p(bioqc) colnames(enrich) <- colnames(ranks) <- mixPropsLabels res <- list(EnrichScore=enrich[c(label1, label2),], Rank=ranks[c(label1, label2),]) return(res) }) contamInd <- sapply(dogPairwise, function(x) { successScore <- x$EnrichScore[2,]>=3 successRank <- x$Rank[2,]<=10 if(all(successScore) & all(successRank)) { return(NA) } pmin(min(which(successScore)), min(which(successRank))) }) revContamInd <- sapply(dogPairwise, function(x) { successScore <- x$EnrichScore[1,]>=3 successRank <- x$Rank[1,]<=10 if(all(successScore) & all(successRank)) { return(NA) } pmax(max(which(successScore)), max(which(successRank))) }) fromMinProp <- mixProps[contamInd] toMaxProp <- 1-mixProps[revContamInd] comProp <- matrix(NA, nrow=ncol(dogAvgMat), ncol=ncol(dogAvgMat)) colnames(comProp) <- rownames(comProp) <- colnames(dogAvgMat) for(i in 1:nrow(dogComb)) { comProp[dogComb[i,1], dogComb[i,2]] <- fromMinProp[i] comProp[dogComb[i,2], dogComb[i,1]] <- toMaxProp[i] } ## ----dog_mix_vis, echo=FALSE, fig.width=8, fig.height=5, dev='png', fig.cap="**Figure 3:** Results of the pairwise mixing experiment. Each cell represents the minimal percentage of tissue of the column as contamination in the tissue of the row that can be detected by *BioQC*. No values are available for cells on the diagonal because self-mixing was excluded. Heart and skeletal muscle are very close to each other and therefore their detection limit is not considered."---- dogMixCol <- colorRampPalette(c("#67A9CF", "black", "#EF8A62"))(100) heatmap.2(x=comProp, Rowv=NULL,Colv=NULL, col = dogMixCol, scale="none", margins=c(9,9), # ("margin.Y", "margin.X") trace='none', symkey=FALSE, symbreaks=FALSE, breaks=seq(0,1,0.01), dendrogram='none', density.info='none', denscol="black", na.col="gray", offsetRow=0, offsetCol=0, keysize=1, key.title="Enrichment score", key.xlab="Detection limit", xlab="Contamination tissue", ylab="Originating tissue", #( "bottom.margin", "left.margin", "top.margin", "right.margin" ) key.par=list(mar=c(5,0,1,8)), # lmat -- added 2 lattice sections (5 and 6) for padding lmat=rbind(c(5, 4, 2), c(6, 1, 3)), lhei=c(1.6, 5), lwid=c(1, 4, 1)) ## ----dog_mix_vis_example, echo=FALSE------------------------------------------ cell12 <- comProp[1,2] cell21 <- comProp[2,1] cell12Per <- as.integer(cell12*100) cell21Per <- as.integer(cell21*100) ## ----table_detect_limit, echo=FALSE, tab.cap="Median lower detection limites of tissues as contamination sources."---- meanThr <- colMeans(comProp, na.rm=TRUE) meanThrShow <- data.frame(Tissue=names(meanThr), MedianDetectionLimit=sprintf("%1.2f%%", meanThr*100)) kable(meanThrShow, caption="Median lower detection limites of tissues as contamination sources.") ## ----brain_low_exp, echo=FALSE, fig.width=6, fig.height=3, dev='png', fig.cap="**Figure 4:** Average expression of tissue-preferential genes in respective tissues. For each tissue (*e.g.* brain), we calculate the median ratio of gene expression level of specific genes over the median expression level of background genes. The value reflects the specificity of tissue-specific genes in respective tissues. Likely due to the sampling of different brain regions, the score of brain ranks the lowest."---- dogAvgGs <- c("Brain_Cortex_prefrontal_NR_0.7_3", "Muscle_cardiac_NR_0.7_3", "Intestine_small_NR_0.7_3", "Kidney_NR_0.7_3", "Liver_NR_0.7_3", "Lung_NR_0.7_3", "Lymphocyte_B_FOLL_NR_0.7_3", "Pancreas_Islets_NR_0.7_3", "Muscle_skeletal_NR_0.7_3", "Monocytes_NR_0.7_3") dogAvgGsGenes <- sapply(gmt[dogAvgGs], function(x) x$genes) dogAvgGsGeneInds <- lapply(dogAvgGsGenes, function(x) { inds <- match(x, fData(dog)$GeneSymbol) inds[!is.na(inds)] }) ## boxplot of tissue-specific genes dogAvgGsRel <- sapply(seq(along=dogAvgGsGeneInds), function(i) { ind <- dogAvgGsGeneInds[[i]] bgInd <- setdiff(1:nrow(dog), ind) apply(dogAvgMat, 2, function(x) median(x[ind]/median(x[bgInd]))) }) colnames(dogAvgGsRel) <- colnames(dogAvgMat) ##heatmap.2(log2(dogAvgGsRel),zlim=c(-1,1), ## cellnote=round(dogAvgGsRel,1), ## Rowv=NA, Colv=NA, dendrogram="none", col=greenred, ## lwid=c(1,3), lhei=c(1,3), ## key.title="Enrichment score", key.xlab="Detection limit", ## xlab="Tissue signature", ylab="Tissue profiles", ## key.par=list(cex.lab=1.25, mar=c(4,2,1,0), mgp=c(1.5,0.5,0)), ## cexRow=1.25, margins=c(8,8), trace="none", offsetRow=0, offsetCol=0, ## density.info="none",na.col="gray", breaks=seq(-1,1,0.01), symbreaks=TRUE) op <- par(mar=c(8,4,1,1)) barplot(sort(diag(dogAvgGsRel)), las=3, ylab="Median ratio of expression (signature/background)") abline(h=1.25) ## op <- par(mar=c(8,4,1,1)) ## dogAvgES <- absLog10p(diag(wmwTest(dogAvgMat, dogAvgGsGeneInds, alternative="greater"))) ## names(dogAvgES) <- colnames(dogAvgMat) ## barplot(sort(dogAvgES, decreasing=TRUE), las=3, ylab="ES in respective average tissue profile") ## par(op) ## plot(meanThr, dogAvgES, type="n", xlab="Average lower detection limit", ylab="BioQC ES score") ## text(meanThr, dogAvgES, colnames(dogAvgMat)) ## ----pca, echo=FALSE, fig.width=6, fig.height=6, dev='png', fig.cap="Sample separation revealed by principal component analysis (PCA) of the dog transcriptome dataset."---- par(mar=c(4,4,2,2)) dogEXP <- exprs(dog); colnames(dogEXP) <- dog$Label dogPCA <- prcomp(t(dogEXP)) expVar <- function(pcaRes, n) {vars <- pcaRes$sdev^2; (vars/sum(vars))[n]} biplot(dogPCA, col=c("#335555dd", "transparent"), cex=1.15, xlab=sprintf("Principal component 1 (%1.2f%%)", expVar(dogPCA,1)*100), ylab=sprintf("Principal component 1 (%1.2f%%)", expVar(dogPCA,2)*100)) ## ----session_info------------------------------------------------------------- sessionInfo()