## ----include = FALSE------------------------------------------------------------------------------ knitr::opts_chunk$set( collapse = TRUE, comment = "#>", width = 100 ) options(width=100) ## ----echo=FALSE, fig.width=10, fig.height=6, out.width="100%", out.height="100%"------------------ library(epialleleR) data.beta <- data.table::data.table( pattern=rep(letters[1:10], each=10), pos=rep(10*c(1:10), 10), base=rep(c('meth',rep('unmeth',10)), length.out=100) ) val.beta <- data.table::data.table( pos=rep(1:10, 2), var=c(rep("VEF", 10), rep("beta\nvalue", 10)), val=c(rep(0, 10), rep(0.1, 10)) ) data.vef <- data.table::data.table( pattern=rep(letters[1:10], each=10), pos=rep(10*c(1:10), 10), base=c(rep('unmeth',20), rep('meth',10), rep('unmeth',70)) ) val.vef <- data.table::data.table( pos=rep(1:10, 2), var=c(rep("VEF", 10), rep("beta\nvalue", 10)), val=c(rep(0.1, 10), rep(0.1, 10)) ) if (require("ggplot2", quietly=TRUE) & require("gridExtra", quietly=TRUE)) { plot.data.beta <- ggplot(data.beta, aes(x=pos, y=pattern, group=pattern, color=factor(base))) + geom_line(color="grey") + geom_point() + scale_colour_grey(start=0, end=0.8) + theme_light() + theme(axis.text.x=element_blank(), axis.text.y=element_blank(), legend.position="top") + labs(x="genomic position", y="epiallele", title="scattered CpG methylation", color="base") plot.val.beta <- ggplot(val.beta, aes(x=pos, y=var, label=val, fill=factor(val))) + geom_text(size=3) + theme_light() + theme(axis.text.x=element_blank()) + labs(x="", y="", title=NULL) plot.data.vef <- ggplot(data.vef, aes(x=pos, y=pattern, group=pattern, color=factor(base))) + geom_line(color="grey") + geom_point() + scale_colour_grey(start=0, end=0.8) + theme_light() + theme(axis.text.x=element_blank(), axis.text.y=element_blank(), legend.position="top") + labs(x="genomic position", y="epiallele", title="epimutation", color="base") plot.val.vef <- ggplot(val.vef, aes(x=pos, y=var, label=val, fill=factor(val))) + geom_text(size=3) + theme_light() + theme(axis.text.x=element_blank()) + labs(x="", y="", title=NULL) grobs <- lapply(list(plot.data.beta, plot.data.vef, plot.val.beta, plot.val.vef), ggplotGrob) widths <- do.call(grid::unit.pmax, lapply(grobs, `[[`, "widths")) for (i in 1:length(grobs)) grobs[[i]]$widths[2:5] <- widths[2:5] do.call("grid.arrange", c(grobs, list(ncol=2, heights=c(3, 1)))) } ## ------------------------------------------------------------------------------------------------- bam.file <- tempfile(pattern="simulated", fileext=".bam") simulateBam(output.bam.file=bam.file, XM=c("ZZzZZ", "zzZzz"), XG="CT") # one can view the resulting file using `samtools view -h ` # or, if desired, file can be converted to SAM using `samtools view`, # manually corrected and converted back to BAM ## ------------------------------------------------------------------------------------------------- library(epialleleR) capture.bam <- system.file("extdata", "capture.bam", package="epialleleR") bam.data <- preprocessBam(capture.bam) # Specifics of long-read alignment processing out.bam <- tempfile(pattern="out-", fileext=".bam") simulateBam( seq=c("ACGCCATYCGCGCCA"), Mm=c("C+m,0,2,0;G-m,0,0,0;"), Ml=list(as.integer(c(102,128,153,138,101,96))), output.bam.file=out.bam ) generateCytosineReport(out.bam, threshold.reads=FALSE, report.context="CX") ## ------------------------------------------------------------------------------------------------- # bwa-meth sample output input.bam <- system.file("extdata", "test", "bwameth-se-unsort-yd.bam", package="epialleleR") # resulting BAM with XG/XM tags output.bam <- tempfile(pattern="output-", fileext=".bam") # sample reference genome genome <- preprocessGenome(system.file("extdata", "test", "reference.fasta.gz", package="epialleleR")) # calls cytosine methylation and stores it in the output BAM # Input BAM has 100 records of which 73 are mapped to the genome callMethylation(input.bam, output.bam, genome) # process this data further # bam.data <- preprocessBam(output.bam) ## ------------------------------------------------------------------------------------------------- # data.table::data.table object for # CpG VEF report cg.vef.report <- generateCytosineReport(bam.data) head(cg.vef.report[order(meth+unmeth, decreasing=TRUE)]) # CpG cytosine report cg.report <- generateCytosineReport(bam.data, threshold.reads=FALSE) head(cg.report[order(meth+unmeth, decreasing=TRUE)]) # CX cytosine report cx.report <- generateCytosineReport(bam.data, threshold.reads=FALSE, report.context="CX") head(cx.report[order(meth+unmeth, decreasing=TRUE)]) ## ------------------------------------------------------------------------------------------------- # report for amplicon-based data # matching is done by exact start or end positions plus/minus tolerance amplicon.report <- generateAmpliconReport( bam=system.file("extdata", "amplicon010meth.bam", package="epialleleR"), bed=system.file("extdata", "amplicon.bed", package="epialleleR") ) amplicon.report # report for capture-based data # matching is done by overlap capture.report <- generateCaptureReport( bam=system.file("extdata", "capture.bam", package="epialleleR"), bed=system.file("extdata", "capture.bed", package="epialleleR") ) head(capture.report) # generateBedReport is a generic function for BED-guided reports bed.report <- generateBedReport( bam=system.file("extdata", "capture.bam", package="epialleleR"), bed=system.file("extdata", "capture.bed", package="epialleleR"), bed.type="capture" ) identical(capture.report, bed.report) ## ------------------------------------------------------------------------------------------------- # lMHL report can be generated using mhl.report <- generateMhlReport( bam=system.file("extdata", "capture.bam", package="epialleleR") ) ## ----fig.width=10, fig.height=6, out.width="100%", out.height="100%"------------------------------ # First, let's extract base methylation information for sequencing reads # of 1:9 mix of methylated and non-methylated control DNA patterns <- extractPatterns( bam=system.file("extdata", "amplicon010meth.bam", package="epialleleR"), bed=as("chr17:43125200-43125600","GRanges") ) # that many read pairs overlap genomic region of interest nrow(patterns) # these are positions of bases within relevant read pairs base.positions <- grep("^[0-9]+$", colnames(patterns), value=TRUE) # let's make a summary table with counts for every pattern patterns.summary <- patterns[, c(lapply(.SD, unique), .N), by=.(pattern, beta), .SDcols=base.positions] # that many unique methylation patterns were found nrow(patterns.summary) # let's melt and plot patterns plot.data <- data.table::melt.data.table(patterns.summary, measure.vars=base.positions, variable.name="pos", value.name="base") # categorical positions, all patterns sorted by beta, with counts on the right if (require("ggplot2", quietly=TRUE) & require("ggstance", quietly=TRUE)) { ggplot(na.omit(plot.data), aes(x=pos, y=reorder(pattern,beta), group=pattern, color=factor(base))) + geom_line(color="grey", position=position_dodgev(height=0.5)) + geom_point(position=position_dodgev(height=0.5)) + scale_colour_grey(start=0.8, end=0) + theme_light() + theme(axis.text.x=element_text(angle=60, hjust=1, vjust=1), axis.text.y=element_blank()) + labs(x="position", y="pattern", title="epialleles", color="base") + scale_x_discrete(expand=c(0.05,0)) + geom_text(inherit.aes=FALSE, data=patterns.summary, mapping=aes(x="count", y=pattern, label=N), size=3) } # continuous positions, nonunique patterns according to their counts if (require("ggplot2", quietly=TRUE) & require("ggstance", quietly=TRUE)) { ggplot(na.omit(plot.data)[N>1], aes(x=as.numeric(as.character(pos)), y=factor(N), group=pattern, color=factor(base))) + geom_line(color="grey", position=position_dodgev(height=0.5)) + geom_point(position=position_dodgev(height=0.5)) + scale_colour_grey(start=0.8, end=0) + theme_light() + labs(x="position", y="count", title="epialleles", color="base") } # upset-like plot of all patterns, continuous positions, sorted by counts if (require("ggplot2", quietly=TRUE) & require("gridExtra", quietly=TRUE)) { grid.arrange( ggplot(na.omit(plot.data), aes(x=as.numeric(as.character(pos)), y=reorder(pattern,N), color=factor(base))) + geom_line(color="grey") + geom_point() + scale_colour_grey(start=0.8, end=0) + theme_light() + theme(axis.text.y=element_blank(), legend.position="none") + labs(x="position", y=NULL, title="epialleles", color="base"), ggplot(unique(na.omit(plot.data)[, .(pattern, N, beta)]), aes(x=N+0.5, y=reorder(pattern,N), alpha=beta, label=N)) + geom_col() + geom_text(alpha=0.5, nudge_x=0.2, size=3) + scale_x_log10() + theme_minimal() + theme(axis.text.y=element_blank(), legend.position="none") + labs(x="count", y=NULL, title=""), ncol=2, widths=c(0.75, 0.25) ) } # now let's explore methylation patterns in RAD51C gene promoter using # methylation capture data capture.patterns <- extractPatterns( bam=system.file("extdata", "capture.bam", package="epialleleR"), bed=as("chr17:58691673-58693108", "GRanges"), verbose=FALSE ) capture.positions <- grep("^[0-9]+$", colnames(capture.patterns), value=TRUE) capture.summary <- capture.patterns[, c(lapply(.SD, unique), .N), by=.(pattern, beta), .SDcols=capture.positions] capture.data <- data.table::melt.data.table(capture.summary, measure.vars=capture.positions, variable.name="pos", value.name="base") if (require("ggplot2", quietly=TRUE) & require("ggstance", quietly=TRUE)) { ggplot(na.omit(capture.data), aes(x=as.numeric(as.character(pos)), y=factor(N), group=pattern, color=factor(base))) + geom_line(color="grey", position=position_dodgev(height=0.9)) + geom_point(position=position_dodgev(height=0.9)) + scale_colour_grey(start=0.8, end=0) + theme_light() + labs(x="position", y="count", title="RAD51C", color="base") } ## ----fig.width=10, fig.height=6, out.width="100%", out.height="100%"------------------------------ # VCF report vcf.report <- generateVcfReport( bam=system.file("extdata", "amplicon010meth.bam", package="epialleleR"), bed=system.file("extdata", "amplicon.bed", package="epialleleR"), vcf=system.file("extdata", "amplicon.vcf.gz", package="epialleleR"), # thresholds on alignment and base quality min.mapq=30, min.baseq=13, # when VCF seqlevels are different from BED and BAM it is possible # to convert them internally vcf.style="NCBI" ) # NA values are shown for the C->T variants on the "+" and G->A on the "-" # strands, because bisulfite conversion makes their counting impossible head(vcf.report) # let's sort the report by increasing Fisher's exact test's p-values. # the p-values are given separately for reads that map to the "+" head(vcf.report[order(`FEp-`, na.last=TRUE)]) # and to the "-" strand head(vcf.report[order(`FEp+`, na.last=TRUE)]) # and finally, let's plot methylation patterns overlapping one of the most # covered SNPs in the methylation capture test data set - rs573296191 # (chr17:61864584) in BRIP1 gene brip1.patterns <- extractPatterns( bam=system.file("extdata", "capture.bam", package="epialleleR"), bed=as("chr17:61864583-61864585", "GRanges"), highlight.positions=61864584, verbose=FALSE ) brip1.positions <- grep("^[0-9]+$", colnames(brip1.patterns), value=TRUE) brip1.summary <- brip1.patterns[, c(lapply(.SD, unique), .N), by=.(pattern, beta), .SDcols=brip1.positions] brip1.data <- data.table::melt.data.table(brip1.summary, measure.vars=brip1.positions, variable.name="pos", value.name="base") if (require("ggplot2", quietly=TRUE) & require("ggstance", quietly=TRUE)) { ggplot(na.omit(brip1.data), aes(x=as.numeric(as.character(pos)), y=factor(N), group=pattern, color=factor(base))) + geom_line(color="grey", position=position_dodgev(height=0.9)) + geom_point(position=position_dodgev(height=0.9)) + scale_colour_manual(values=c("blue",NA,"red","grey80","black")) + theme_light() + labs(x="position", y="count", title="BRIP1", color="base") } ## ----fig.width=10, fig.height=6, out.width="100%", out.height="100%"------------------------------ # First, let's visualise eCDFs for within- and out-of-context beta values # for all four amplicons and unmatched reads. Note that within-the-context eCDF # of 0.5 is very close to the expected 1-VEF value (0.1) for all amplicons # produced from this 1:9 mix of methylated and non-methylated control DNA # let's compute eCDF amplicon.ecdfs <- generateBedEcdf( bam=system.file("extdata", "amplicon010meth.bam", package="epialleleR"), bed=system.file("extdata", "amplicon.bed", package="epialleleR"), bed.rows=NULL ) # there are 5 items in amplicon.ecdfs, let's plot all of them par(mfrow=c(1,length(amplicon.ecdfs))) # cycle through items for (x in 1:length(amplicon.ecdfs)) { # four of them have names corresponding to genomic regions of amplicon.bed # fifth - NA for all the reads that don't match to any of those regions main <- if (is.na(names(amplicon.ecdfs[x]))) "unmatched" else names(amplicon.ecdfs[x]) # plotting eCDF for within-the-context per-read beta values (in red) plot(amplicon.ecdfs[[x]]$context, col="red", verticals=TRUE, do.points=FALSE, xlim=c(0,1), xlab="per-read beta value", ylab="cumulative density", main=main) # adding eCDF for out-of-context per-read beta values (in blue) plot(amplicon.ecdfs[[x]]$out.of.context, add=TRUE, col="blue", verticals=TRUE, do.points=FALSE) } # Second, let's compare eCDFs for within-the-context beta values for only one # amplicon but all three sequenced samples: pure non-methylated DNA, 1:9 mix of # methylated and non-methylated DNA, and fully methylated DNA # our files bam.files <- c("amplicon000meth.bam", "amplicon010meth.bam", "amplicon100meth.bam") # let's plot all of them par(mfrow=c(1,length(bam.files))) # cycle through items for (f in bam.files) { # let's compute eCDF amplicon.ecdfs <- generateBedEcdf( bam=system.file("extdata", f, package="epialleleR"), bed=system.file("extdata", "amplicon.bed", package="epialleleR"), # only the second amplicon bed.rows=2, verbose=FALSE ) # plotting eCDF for within-the-context per-read beta values (in red) plot(amplicon.ecdfs[[1]]$context, col="red", verticals=TRUE, do.points=FALSE, xlim=c(0,1), xlab="per-read beta value", ylab="cumulative density", main=f) # adding eCDF for out-of-context per-read beta values (in blue) plot(amplicon.ecdfs[[1]]$out.of.context, add=TRUE, col="blue", verticals=TRUE, do.points=FALSE) } ## ----session-------------------------------------------------------------------------------------- sessionInfo()