## ----setup, echo=FALSE, results="hide"---------------------------------------- knitr::opts_chunk$set(tidy = FALSE, cache = FALSE, dev = "png", message = FALSE, error = FALSE, warning = TRUE) ## ----quickStart, eval=FALSE--------------------------------------------------- # dds <- DESeqDataSetFromMatrix(countData = cts, # colData = coldata, # design= ~ batch + condition) # dds <- DESeq(dds) # resultsNames(dds) # lists the coefficients # res <- results(dds, name="condition_trt_vs_untrt") # # or to shrink log fold changes association with condition: # res <- lfcShrink(dds, coef="condition_trt_vs_untrt", type="apeglm") ## ----txiSetup----------------------------------------------------------------- library("tximport") library("readr") library("tximportData") dir <- system.file("extdata", package="tximportData") samples <- read.table(file.path(dir,"samples.txt"), header=TRUE) samples$condition <- factor(rep(c("A","B"),each=3)) rownames(samples) <- samples$run samples[,c("pop","center","run","condition")] ## ----txiFiles----------------------------------------------------------------- files <- file.path(dir,"salmon", samples$run, "quant.sf.gz") names(files) <- samples$run tx2gene <- read_csv(file.path(dir, "tx2gene.gencode.v27.csv")) ## ----tximport, results="hide"------------------------------------------------- txi <- tximport(files, type="salmon", tx2gene=tx2gene) ## ----txi2dds, results="hide"-------------------------------------------------- library("DESeq2") ddsTxi <- DESeqDataSetFromTximport(txi, colData = samples, design = ~ condition) ## ----------------------------------------------------------------------------- coldata <- samples coldata$files <- files coldata$names <- coldata$run ## ----echo=FALSE--------------------------------------------------------------- library("tximeta") se <- tximeta(coldata, skipMeta=TRUE) ddsTxi2 <- DESeqDataSet(se, design = ~condition) ## ----eval=FALSE--------------------------------------------------------------- # library("tximeta") # se <- tximeta(coldata) # ddsTxi <- DESeqDataSet(se, design = ~ condition) ## ----loadPasilla-------------------------------------------------------------- library("pasilla") pasCts <- system.file("extdata", "pasilla_gene_counts.tsv", package="pasilla", mustWork=TRUE) pasAnno <- system.file("extdata", "pasilla_sample_annotation.csv", package="pasilla", mustWork=TRUE) cts <- as.matrix(read.csv(pasCts,sep="\t",row.names="gene_id")) coldata <- read.csv(pasAnno, row.names=1) coldata <- coldata[,c("condition","type")] coldata$condition <- factor(coldata$condition) coldata$type <- factor(coldata$type) ## ----showPasilla-------------------------------------------------------------- head(cts,2) coldata ## ----reorderPasila------------------------------------------------------------ rownames(coldata) <- sub("fb", "", rownames(coldata)) all(rownames(coldata) %in% colnames(cts)) all(rownames(coldata) == colnames(cts)) cts <- cts[, rownames(coldata)] all(rownames(coldata) == colnames(cts)) ## ----matrixInput-------------------------------------------------------------- library("DESeq2") dds <- DESeqDataSetFromMatrix(countData = cts, colData = coldata, design = ~ condition) dds ## ----addFeatureData----------------------------------------------------------- featureData <- data.frame(gene=rownames(cts)) mcols(dds) <- DataFrame(mcols(dds), featureData) mcols(dds) ## ----htseqDirI, eval=FALSE---------------------------------------------------- # directory <- "/path/to/your/files/" ## ----htseqDirII--------------------------------------------------------------- directory <- system.file("extdata", package="pasilla", mustWork=TRUE) ## ----htseqInput--------------------------------------------------------------- sampleFiles <- grep("treated",list.files(directory),value=TRUE) sampleCondition <- sub("(.*treated).*","\\1",sampleFiles) sampleTable <- data.frame(sampleName = sampleFiles, fileName = sampleFiles, condition = sampleCondition) sampleTable$condition <- factor(sampleTable$condition) ## ----hsteqDds----------------------------------------------------------------- library("DESeq2") ddsHTSeq <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable, directory = directory, design= ~ condition) ddsHTSeq ## ----loadSumExp--------------------------------------------------------------- library("airway") data("airway") se <- airway ## ----sumExpInput-------------------------------------------------------------- library("DESeq2") ddsSE <- DESeqDataSet(se, design = ~ cell + dex) ddsSE ## ----prefilter---------------------------------------------------------------- smallestGroupSize <- 3 keep <- rowSums(counts(dds) >= 10) >= smallestGroupSize dds <- dds[keep,] ## ----factorlvl---------------------------------------------------------------- dds$condition <- factor(dds$condition, levels = c("untreated","treated")) ## ----relevel------------------------------------------------------------------ dds$condition <- relevel(dds$condition, ref = "untreated") ## ----droplevels--------------------------------------------------------------- dds$condition <- droplevels(dds$condition) ## ----deseq-------------------------------------------------------------------- dds <- DESeq(dds) res <- results(dds) res ## ----eval=FALSE--------------------------------------------------------------- # res <- results(dds, name="condition_treated_vs_untreated") # res <- results(dds, contrast=c("condition","treated","untreated")) ## ----lfcShrink---------------------------------------------------------------- resultsNames(dds) resLFC <- lfcShrink(dds, coef="condition_treated_vs_untreated", type="apeglm") resLFC ## ----parallel, eval=FALSE----------------------------------------------------- # library("BiocParallel") # register(MulticoreParam(4)) ## ----resOrder----------------------------------------------------------------- resOrdered <- res[order(res$pvalue),] ## ----sumRes------------------------------------------------------------------- summary(res) ## ----sumRes01----------------------------------------------------------------- sum(res$padj < 0.1, na.rm=TRUE) ## ----resAlpha05--------------------------------------------------------------- res05 <- results(dds, alpha=0.05) summary(res05) sum(res05$padj < 0.05, na.rm=TRUE) ## ----IHW, eval=FALSE---------------------------------------------------------- # # (unevaluated code chunk) # library("IHW") # resIHW <- results(dds, filterFun=ihw) # summary(resIHW) # sum(resIHW$padj < 0.1, na.rm=TRUE) # metadata(resIHW)$ihwResult ## ----MA----------------------------------------------------------------------- plotMA(res, ylim=c(-2,2)) ## ----shrunkMA----------------------------------------------------------------- plotMA(resLFC, ylim=c(-2,2)) ## ----MAidentify, eval=FALSE--------------------------------------------------- # idx <- identify(res$baseMean, res$log2FoldChange) # rownames(res)[idx] ## ----warning=FALSE------------------------------------------------------------ resultsNames(dds) # because we are interested in treated vs untreated, we set 'coef=2' resNorm <- lfcShrink(dds, coef=2, type="normal") resAsh <- lfcShrink(dds, coef=2, type="ashr") ## ----fig.width=8, fig.height=3------------------------------------------------ par(mfrow=c(1,3), mar=c(4,4,2,1)) xlim <- c(1,1e5); ylim <- c(-3,3) plotMA(resLFC, xlim=xlim, ylim=ylim, main="apeglm") plotMA(resNorm, xlim=xlim, ylim=ylim, main="normal") plotMA(resAsh, xlim=xlim, ylim=ylim, main="ashr") ## ----plotCounts--------------------------------------------------------------- plotCounts(dds, gene=which.min(res$padj), intgroup="condition") ## ----plotCountsAdv------------------------------------------------------------ d <- plotCounts(dds, gene=which.min(res$padj), intgroup="condition", returnData=TRUE) library("ggplot2") ggplot(d, aes(x=condition, y=count)) + geom_point(position=position_jitter(w=0.1,h=0)) + scale_y_log10(breaks=c(25,100,400)) ## ----metadata----------------------------------------------------------------- mcols(res)$description ## ----export, eval=FALSE------------------------------------------------------- # write.csv(as.data.frame(resOrdered), # file="condition_treated_results.csv") ## ----subset------------------------------------------------------------------- resSig <- subset(resOrdered, padj < 0.1) resSig ## ----multifactor-------------------------------------------------------------- colData(dds) ## ----copyMultifactor---------------------------------------------------------- ddsMF <- dds ## ----fixLevels---------------------------------------------------------------- levels(ddsMF$type) levels(ddsMF$type) <- sub("-.*", "", levels(ddsMF$type)) levels(ddsMF$type) ## ----replaceDesign------------------------------------------------------------ design(ddsMF) <- formula(~ type + condition) ddsMF <- DESeq(ddsMF) ## ----multiResults------------------------------------------------------------- resMF <- results(ddsMF) head(resMF) ## ----multiTypeResults--------------------------------------------------------- resMFType <- results(ddsMF, contrast=c("type", "single", "paired")) head(resMFType) ## ----rlogAndVST--------------------------------------------------------------- vsd <- vst(dds, blind=FALSE) rld <- rlog(dds, blind=FALSE) head(assay(vsd), 3) ## ----meansd------------------------------------------------------------------- # this gives log2(n + 1) ntd <- normTransform(dds) library("vsn") meanSdPlot(assay(ntd)) meanSdPlot(assay(vsd)) meanSdPlot(assay(rld)) ## ----heatmap------------------------------------------------------------------ library("pheatmap") select <- order(rowMeans(counts(dds,normalized=TRUE)), decreasing=TRUE)[1:20] df <- as.data.frame(colData(dds)[,c("condition","type")]) pheatmap(assay(ntd)[select,], cluster_rows=FALSE, show_rownames=FALSE, cluster_cols=FALSE, annotation_col=df) pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=FALSE, cluster_cols=FALSE, annotation_col=df) pheatmap(assay(rld)[select,], cluster_rows=FALSE, show_rownames=FALSE, cluster_cols=FALSE, annotation_col=df) ## ----sampleClust-------------------------------------------------------------- sampleDists <- dist(t(assay(vsd))) ## ----figHeatmapSamples, fig.height=4, fig.width=6----------------------------- library("RColorBrewer") sampleDistMatrix <- as.matrix(sampleDists) rownames(sampleDistMatrix) <- paste(vsd$condition, vsd$type, sep="-") colnames(sampleDistMatrix) <- NULL colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255) pheatmap(sampleDistMatrix, clustering_distance_rows=sampleDists, clustering_distance_cols=sampleDists, col=colors) ## ----figPCA------------------------------------------------------------------- plotPCA(vsd, intgroup=c("condition", "type")) ## ----figPCA2------------------------------------------------------------------ pcaData <- plotPCA(vsd, intgroup=c("condition", "type"), returnData=TRUE) percentVar <- round(100 * attr(pcaData, "percentVar")) ggplot(pcaData, aes(PC1, PC2, color=condition, shape=type)) + geom_point(size=3) + xlab(paste0("PC1: ",percentVar[1],"% variance")) + ylab(paste0("PC2: ",percentVar[2],"% variance")) + coord_fixed() ## ----WaldTest, eval=FALSE----------------------------------------------------- # dds <- estimateSizeFactors(dds) # dds <- estimateDispersions(dds) # dds <- nbinomWaldTest(dds) ## ----eval=FALSE--------------------------------------------------------------- # dds <- estimateSizeFactors(dds, controlGenes=ctrlGenes) # dds <- DESeq(dds) ## ----simpleContrast, eval=FALSE----------------------------------------------- # results(dds, contrast=c("condition","C","B")) ## ----combineFactors, eval=FALSE----------------------------------------------- # dds$group <- factor(paste0(dds$genotype, dds$condition)) # design(dds) <- ~ group # dds <- DESeq(dds) # resultsNames(dds) # results(dds, contrast=c("group", "IB", "IA")) ## ----interFig, echo=FALSE, results="hide", fig.height=3----------------------- npg <- 20 mu <- 2^c(8,10,9,11,10,12) cond <- rep(rep(c("A","B"),each=npg),3) geno <- rep(c("I","II","III"),each=2*npg) table(cond, geno) counts <- rnbinom(6*npg, mu=rep(mu,each=npg), size=1/.01) d <- data.frame(log2c=log2(counts+1), cond, geno) library("ggplot2") plotit <- function(d, title) { ggplot(d, aes(x=cond, y=log2c, group=geno)) + geom_jitter(size=1.5, position = position_jitter(width=.15)) + facet_wrap(~ geno) + stat_summary(fun=mean, geom="line", colour="red", linewidth=0.8) + xlab("condition") + ylab("log2(counts+1)") + ggtitle(title) } plotit(d, "Gene 1") + ylim(7,13) lm(log2c ~ cond + geno + geno:cond, data=d) ## ----interFig2, echo=FALSE, results="hide", fig.height=3---------------------- mu[4] <- 2^12 mu[6] <- 2^8 counts <- rnbinom(6*npg, mu=rep(mu,each=npg), size=1/.01) d2 <- data.frame(log2c=log2(counts + 1), cond, geno) plotit(d2, "Gene 2") + ylim(7,13) lm(log2c ~ cond + geno + geno:cond, data=d2) ## ----simpleLRT, eval=FALSE---------------------------------------------------- # dds <- DESeq(dds, test="LRT", reduced=~1) # res <- results(dds) ## ----simpleLRT2, eval=FALSE--------------------------------------------------- # dds <- DESeq(dds, test="LRT", reduced=~batch) # res <- results(dds) ## ----apeThresh---------------------------------------------------------------- resApeT <- lfcShrink(dds, coef=2, type="apeglm", lfcThreshold=1) plotMA(resApeT, ylim=c(-3,3), cex=.8) abline(h=c(-1,1), col="dodgerblue", lwd=2) ## ----ashThresh---------------------------------------------------------------- resAshT <- lfcShrink(dds, coef=2, type="ashr", lfcThreshold=1) plotMA(resAshT, ylim=c(-3,3), cex=.8) abline(h=c(-1,1), col="dodgerblue", lwd=2) ## ----------------------------------------------------------------------------- condition <- factor(rep(c("A","B","C"),each=2)) model.matrix(~ condition) # to compare C vs B, make B the reference level, # and select the last coefficient condition <- relevel(condition, "B") model.matrix(~ condition) ## ----------------------------------------------------------------------------- grp <- factor(rep(1:3,each=4)) cnd <- factor(rep(rep(c("A","B"),each=2),3)) model.matrix(~ grp + cnd + grp:cnd) # to compare condition effect in group 3 vs 2, # make group 2 the reference level, # and select the last coefficient grp <- relevel(grp, "2") model.matrix(~ grp + cnd + grp:cnd) ## ----------------------------------------------------------------------------- grp <- factor(rep(1:2,each=4)) ind <- factor(rep(rep(1:2,each=2),2)) cnd <- factor(rep(c("A","B"),4)) model.matrix(~grp + grp:ind + grp:cnd) # to compare condition effect across group, # add a main effect for 'cnd', # and select the last coefficient model.matrix(~grp + cnd + grp:ind + grp:cnd) ## ----boxplotCooks------------------------------------------------------------- par(mar=c(8,5,2,2)) boxplot(log10(assays(dds)[["cooks"]]), range=0, las=2) ## ----dispFit------------------------------------------------------------------ plotDispEsts(dds) ## ----dispFitCustom------------------------------------------------------------ ddsCustom <- dds useForMedian <- mcols(ddsCustom)$dispGeneEst > 1e-7 medianDisp <- median(mcols(ddsCustom)$dispGeneEst[useForMedian], na.rm=TRUE) dispersionFunction(ddsCustom) <- function(mu) medianDisp ddsCustom <- estimateDispersionsMAP(ddsCustom) ## ----filtByMean--------------------------------------------------------------- metadata(res)$alpha metadata(res)$filterThreshold plot(metadata(res)$filterNumRej, type="b", ylab="number of rejections", xlab="quantiles of filter") lines(metadata(res)$lo.fit, col="red") abline(v=metadata(res)$filterTheta) ## ----noFilt------------------------------------------------------------------- resNoFilt <- results(dds, independentFiltering=FALSE) addmargins(table(filtering=(res$padj < .1), noFiltering=(resNoFilt$padj < .1))) ## ----lfcThresh---------------------------------------------------------------- par(mfrow=c(2,2),mar=c(2,2,1,1)) ylim <- c(-2.5,2.5) resGA <- results(dds, lfcThreshold=.5, altHypothesis="greaterAbs") resLA <- results(dds, lfcThreshold=.5, altHypothesis="lessAbs") resG <- results(dds, lfcThreshold=.5, altHypothesis="greater") resL <- results(dds, lfcThreshold=.5, altHypothesis="less") drawLines <- function() abline(h=c(-.5,.5),col="dodgerblue",lwd=2) plotMA(resGA, ylim=ylim); drawLines() plotMA(resLA, ylim=ylim); drawLines() plotMA(resG, ylim=ylim); drawLines() plotMA(resL, ylim=ylim); drawLines() ## ----mcols-------------------------------------------------------------------- mcols(dds,use.names=TRUE)[1:4,1:4] substr(names(mcols(dds)),1,10) mcols(mcols(dds), use.names=TRUE)[1:4,] ## ----muAndCooks--------------------------------------------------------------- head(assays(dds)[["mu"]]) head(assays(dds)[["cooks"]]) ## ----dispersions-------------------------------------------------------------- head(dispersions(dds)) head(mcols(dds)$dispersion) ## ----sizefactors-------------------------------------------------------------- sizeFactors(dds) ## ----coef--------------------------------------------------------------------- head(coef(dds)) ## ----betaPriorVar------------------------------------------------------------- attr(dds, "betaPriorVar") ## ----priorInfo---------------------------------------------------------------- priorInfo(resLFC) priorInfo(resNorm) priorInfo(resAsh) ## ----dispPriorVar------------------------------------------------------------- dispersionFunction(dds) attr(dispersionFunction(dds), "dispPriorVar") ## ----versionNum--------------------------------------------------------------- metadata(dds)[["version"]] ## ----normFactors, eval=FALSE-------------------------------------------------- # normFactors <- normFactors / exp(rowMeans(log(normFactors))) # normalizationFactors(dds) <- normFactors ## ----offsetTransform, eval=FALSE---------------------------------------------- # cqnOffset <- cqnObject$glm.offset # cqnNormFactors <- exp(cqnOffset) # EDASeqNormFactors <- exp(-1 * EDASeqOffset) ## ----lineardep, echo=FALSE---------------------------------------------------- DataFrame(batch=factor(c(1,1,2,2)), condition=factor(c("A","A","B","B"))) ## ----lineardep2, echo=FALSE--------------------------------------------------- DataFrame(batch=factor(c(1,1,1,1,2,2)), condition=factor(c("A","A","B","B","C","C"))) ## ----lineardep3, echo=FALSE--------------------------------------------------- DataFrame(batch=factor(c(1,1,1,2,2,2)), condition=factor(c("A","B","C","A","B","C"))) ## ----groupeffect-------------------------------------------------------------- coldata <- DataFrame(grp=factor(rep(c("X","Y"),each=6)), ind=factor(rep(1:6,each=2)), cnd=factor(rep(c("A","B"),6))) coldata ## ----------------------------------------------------------------------------- as.data.frame(coldata) ## ----groupeffect2------------------------------------------------------------- coldata$ind.n <- factor(rep(rep(1:3,each=2),2)) as.data.frame(coldata) ## ----groupeffect3------------------------------------------------------------- model.matrix(~ grp + grp:ind.n + grp:cnd, coldata) ## ----groupeffect4, eval=FALSE------------------------------------------------- # results(dds, contrast=list("grpY.cndB","grpX.cndB")) ## ----missingcombo------------------------------------------------------------- group <- factor(rep(1:3,each=6)) condition <- factor(rep(rep(c("A","B","C"),each=2),3)) d <- DataFrame(group, condition)[-c(17,18),] as.data.frame(d) ## ----missingcombo2------------------------------------------------------------ m1 <- model.matrix(~ condition*group, d) colnames(m1) unname(m1) all.zero <- apply(m1, 2, function(x) all(x==0)) all.zero ## ----missingcombo3------------------------------------------------------------ idx <- which(all.zero) m1 <- m1[,-idx] unname(m1) ## ----cooksPlot---------------------------------------------------------------- W <- res$stat maxCooks <- apply(assays(dds)[["cooks"]],1,max) idx <- !is.na(W) plot(rank(W[idx]), maxCooks[idx], xlab="rank of Wald statistic", ylab="maximum Cook's distance per gene", ylim=c(0,5), cex=.4, col=rgb(0,0,0,.3)) m <- ncol(dds) p <- 3 abline(h=qf(.99, p, m - p)) ## ----indFilt------------------------------------------------------------------ plot(res$baseMean+1, -log10(res$pvalue), log="x", xlab="mean of normalized counts", ylab=expression(-log[10](pvalue)), ylim=c(0,30), cex=.4, col=rgb(0,0,0,.3)) ## ----histindepfilt------------------------------------------------------------ use <- res$baseMean > metadata(res)$filterThreshold h1 <- hist(res$pvalue[!use], breaks=0:50/50, plot=FALSE) h2 <- hist(res$pvalue[use], breaks=0:50/50, plot=FALSE) colori <- c(`do not pass`="khaki", `pass`="powderblue") ## ----fighistindepfilt--------------------------------------------------------- barplot(height = rbind(h1$counts, h2$counts), beside = FALSE, col = colori, space = 0, main = "", ylab="frequency") text(x = c(0, length(h1$counts)), y = 0, label = paste(c(0,1)), adj = c(0.5,1.7), xpd=NA) legend("topright", fill=rev(colori), legend=rev(names(colori))) ## ----vanillaDESeq, eval=FALSE------------------------------------------------- # dds <- DESeq(dds, minReplicatesForReplace=Inf) # res <- results(dds, cooksCutoff=FALSE, independentFiltering=FALSE) ## ----eval=FALSE--------------------------------------------------------------- # mat <- assay(vsd) # mm <- model.matrix(~condition, colData(vsd)) # mat <- limma::removeBatchEffect(mat, batch=vsd$batch, design=mm) # assay(vsd) <- mat # plotPCA(vsd) ## ----varGroup, echo=FALSE----------------------------------------------------- set.seed(3) dds1 <- makeExampleDESeqDataSet(n=1000,m=12,betaSD=.3,dispMeanRel=function(x) 0.01) dds2 <- makeExampleDESeqDataSet(n=1000,m=12, betaSD=.3, interceptMean=mcols(dds1)$trueIntercept, interceptSD=0, dispMeanRel=function(x) 0.2) dds2 <- dds2[,7:12] dds2$condition <- rep("C",6) mcols(dds2) <- NULL dds12 <- cbind(dds1, dds2) rld <- rlog(dds12, blind=FALSE, fitType="mean") plotPCA(rld) ## ----convertNA, eval=FALSE---------------------------------------------------- # res$padj <- ifelse(is.na(res$padj), 1, res$padj) ## ----sessionInfo-------------------------------------------------------------- sessionInfo()