## ----rhdf5Intro---------------------------------------------------------- #We use the rhdf5 package library(rhdf5) filename <- tempfile() #using a temporary file h5createFile(filename) #creating the basic file layout ## ------------------------------------------------------------------------ h5ls(filename) ## ----h5write------------------------------------------------------------- h5write(obj = array(rnorm(20), dim=c(4,5)), file = filename, name = "/RandomArray") h5write(obj = rpois(100, 10), file = filename, name = "/PoissonVector") h5ls(filename) ## ----h5read-------------------------------------------------------------- h5read(file = filename, name = "/PoissonVector") h5read(file = filename, name = "/RandomArray", index = list(2:4, 3:5)) ## ------------------------------------------------------------------------ getwd() ## ----initialise, message=FALSE------------------------------------------- library(h5vc) library(Rsamtools) ## ------------------------------------------------------------------------ bamFiles <- list.files("ExampleData", "bam$") #list available data files bamFiles <- file.path( "ExampleData", bamFiles) # we use scanBamHeader to extract information about the contigs chromdim <- sapply( scanBamHeader(bamFiles), function(x) x$targets ) colnames(chromdim) <- bamFiles chromdim[,1] ## ----setLocation--------------------------------------------------------- chrom = "1" startpos = 115200000 endpos = 115300000 ## ----parallelTallying---------------------------------------------------- library(BiocParallel) #load the library to enable parallel execution maxWorkers <- 2 #Set this to 1 if you want serial execution tallyList <- list() #outputs go in this list timeList <- list() #time measurements go here for( nWorkers in 1:maxWorkers ){ # set the number of concurrent jobs (see ?register for details) register(MulticoreParam(workers = nWorkers)) timeList[[nWorkers]] <- system.time( tallyList[[nWorkers]] <- applyTallies(bamFiles, chrom, startpos, endpos ) ) print(paste("Tallied with", nWorkers, "parallel tasks!")) } timeList ## ----tallyBAMFiles------------------------------------------------------- # we load a reference genome and use a subset of it as a parameter to applyTallies suppressPackageStartupMessages(library(BSgenome.Hsapiens.UCSC.hg19)) tallies <- applyTallies( bamFiles, chrom = chrom, start = startpos, stop = endpos, ncycles = 10, reference = BSgenome.Hsapiens.UCSC.hg19[[paste0("chr", chrom)]][startpos:endpos] ) # the first and last 10 sequencing cycles are called unreliable str(tallies) ## ----prepareTallyFile---------------------------------------------------- chromlength <- chromdim[chrom,1] #grab the chromosome length from the first sample study <- "/NRAS" #This will be the name of the main folder in the HDF5 file tallyFile <- "NRAS.tally.hfs5" if( file.exists(tallyFile) ){ file.remove(tallyFile) } if( prepareTallyFile(tallyFile, study, chrom, chromlength, nsamples = length(bamFiles))){ h5ls(tallyFile) }else{ message( paste( "Preparation of:", tallyFile, "failed" ) ) } ## ----writingToTallyFile-------------------------------------------------- group <- paste(study, chrom, sep="/") h5write( tallies$Counts, tallyFile, paste( group, "Counts", sep = "/" ), index = list( NULL, NULL, NULL, startpos:endpos ) ) h5write( tallies$Coverages, tallyFile, paste( group, "Coverages", sep = "/" ), index = list( NULL, NULL, startpos:endpos ) ) h5write( tallies$Deletions, tallyFile, paste( group, "Deletions", sep = "/" ), index = list( NULL, NULL, startpos:endpos ) ) h5write( tallies$Reference, tallyFile, paste( group, "Reference", sep = "/" ), index = list( startpos:endpos ) ) h5ls(tallyFile) ## ----sampleDataSetUp----------------------------------------------------- sampleData <- data.frame( Sample = gsub( ".bam", "", gsub( pattern = "ExampleData/", "", bamFiles)), Column = seq_along(bamFiles), Type = "Control", Library = "WholeExome", stringsAsFactors = FALSE ) sampleData$SampleID <- sapply( strsplit( sampleData$Sample, "\\."), function(x) as.numeric(x[2]) ) sampleData$Type[sampleData$SampleID %% 2 == 0] <- "Case" sampleData$Patient <- paste0( "Patient", floor( (sampleData$SampleID + 1 ) / 2 ) ) sampleData$Library[sampleData$SampleID >= 37] <- "WholeGenome" setSampleData( tallyFile, group, sampleData, largeAttributes = TRUE ) getSampleData( tallyFile, group ) ## ----sampleDataModify---------------------------------------------------- sampleData <- getSampleData( tallyFile, group ) #read from file sampleData$ClinicalVariable <- rnorm(nrow(x = sampleData)) # add some data setSampleData( tallyFile, group, sampleData, largeAttributes = TRUE ) # write it back head(getSampleData( tallyFile, group )) #did it work? ## ------------------------------------------------------------------------ sampleData <- getSampleData( tallyFile, group ) #read from file sampleData$Sample <- "********" head(sampleData) setSampleData( tallyFile, group, sampleData ) # write it back without largeAttributes=TRUE sampleData <- getSampleData( tallyFile, group ) #did it work? head(sampleData) #apparently not ## ----spaceUsage---------------------------------------------------------- tallySize <- file.info(tallyFile)$size bamSize <- sum(sapply( list.files("ExampleData/", pattern= "*.bam$", full.names=TRUE), function(x) file.info(x)$size ) ) tallySize / bamSize ## ----h5readBlock--------------------------------------------------------- startpos <- 115247090 endpos <- 115259515 data <- h5readBlock( filename = tallyFile, group = group, range = c(startpos, endpos) ) str(data) data <- h5readBlock( filename = tallyFile, group = group, names = c("Coverages", "Reference"), range = c(startpos, endpos) ) str(data) data <- h5readBlock( filename = tallyFile, group = group, names = c("Coverages", "Reference"), samples = sampleData$Sample[3:7], range = c(startpos, endpos) ) str(data) ## ----binnedCoverageFull-------------------------------------------------- startpos = 115200000 endpos = 115300000 data <- h5dapply( # extracting coverage binned at 1000 bases filename = tallyFile, group = group, blocksize = 1000, FUN = binnedCoverage, sampledata = sampleData, gccount = TRUE, names = c( "Coverages", "Reference" ), range = c(startpos, endpos) ) data <- do.call(rbind, data) rownames(data) <- NULL head(data) ## ------------------------------------------------------------------------ binnedCoverage ## ----binnedCoverageWholeGenome------------------------------------------- # Grab only the whole genome samples selectedSamples <- sampleData$Sample[sampleData$Library == "WholeGenome"] sampleDataSubset <- subset( sampleData, Sample %in% selectedSamples) # The Column propertie specifies the position of the sample in the datasets, # since we subset, the indexes are off and need to be adjusted sampleDataSubset$Column <- rank(sampleDataSubset$Column) # Apply the binnedCoverage function data <- h5dapply( # extracting coverage binned at 1000 bases filename = tallyFile, group = group, blocksize = 1000, samples = selectedSamples, FUN = binnedCoverage, sampledata = sampleDataSubset, gccount = TRUE, names = c( "Coverages", "Reference" ), range = c(startpos, endpos) ) # h5dapply returns a list, we need to merge the results by row data <- do.call(rbind, data) rownames(data) <- NULL head(data) ## ----GCContentPlot, fig.width=10.5, fig.height=8.5, dpi=300-------------- library(ggplot2) library(reshape2) data <- melt(data, id.vars = c("Start", "End", "GCCount")) colnames(data)[4] <- "Sample" colnames(data)[5] <- "Coverage" p <- ggplot(data, aes(x=GCCount, y = Coverage, col = Sample)) + geom_point() + facet_wrap(~ Sample, ncol = 2) print(p) ## ----parallelh5dapply---------------------------------------------------- startpos = 115200000 endpos = 115300000 system.time( dataS <- h5dapply( # extracting coverage binned at 1000 bases filename = tallyFile, group = group, blocksize = 1000, FUN = binnedCoverage, sampledata = sampleData, gccount = TRUE, names = c( "Coverages", "Reference" ), range = c(startpos, endpos) ) ) dataS <- do.call(rbind, dataS) system.time( dataP <- h5dapply( # extracting coverage binned at 1000 bases filename = tallyFile, group = group, blocksize = 1000, FUN = binnedCoverage, sampledata = sampleData, gccount = TRUE, names = c( "Coverages", "Reference" ), range = c(startpos, endpos), BPPARAM = MulticoreParam(workers = 4) ) ) dataP <- do.call(rbind, dataP) # Are results of serial and parallel execution identical? identical(dataS, dataP) ## ----myVariantCaller----------------------------------------------------- myVariantCaller <- function( data, sampleNames, minAF = 0.2, minCov = 20 ){ #sanity checks for the data stopifnot("Counts" %in% names(data)) stopifnot("Coverages" %in% names(data)) #sum up coverages for each sample (forward strand + reverse strand coverage) coverages <- apply(data$Coverages, c(1,3), sum) # sum up all counts for each sample (i.e. #A + #C + #G + #T on both strands) counts <- apply(data$Counts, c(2,4), sum) # Now counts and coverage have a compatible shape # and we can calculate estimates for the allelic frequency afs <- counts / coverages # Here we do the calling, by filtering for a minimal allelic Frequency and # Coverage in each sample and position filter <- afs >= minAF & coverages >= minCov require(reshape2) #use melt to convert the array into a data.frame filter <- melt(filter) #rename the columns colnames(filter) <- c("Sample", "Position", "Called") filter$Sample <- sampleNames[filter$Sample] # fix the position to reflect genomic coordinates filter$Position <- filter$Position + data$h5dapplyInfo$Blockstart - 1 # We return only those positions that pass the filter return(subset(filter, Called == TRUE)) } ## ----callingVariantsNaively---------------------------------------------- selectedSamples <- sampleData$Sample[sampleData$Library == "WholeGenome"] sampleDataSubset <- subset( sampleData, Sample %in% selectedSamples) sampleDataSubset$Column <- rank(sampleDataSubset$Column) data <- h5dapply( # calling variants in bins of size 10kb filename = tallyFile, group = group, blocksize = 10000, samples = selectedSamples, FUN = myVariantCaller, sampleNames = selectedSamples, names = c( "Coverages", "Counts" ), range = c(startpos, endpos) ) data <- do.call(rbind, data) rownames(data) <- NULL head(data) ## ----mismatchPlotOne, fig.width=10.5, fig.height=10.5, dpi=300----------- varpos <- data$Position[1] windowsize <- 50 plotData <- h5readBlock( filename = tallyFile, group = group, samples = selectedSamples, range = c(varpos - windowsize, varpos + windowsize) ) p <- mismatchPlot( data = plotData, sampledata = sampleDataSubset, position = varpos, windowsize = windowsize) print(p) ## ----callingVariantsPaired----------------------------------------------- startpos = 115200000 endpos = 115300000 data <- h5dapply( # calling variants pairwise in bins of size 5kb filename = tallyFile, group = group, blocksize = 10000, FUN = callVariantsPaired, sampledata = sampleData, cl = vcConfParams( returnDataPoints = TRUE, annotateWithBackground = TRUE ), names = c( "Coverages", "Counts", "Reference" ), range = c(startpos, endpos), verbose = TRUE ) data <- do.call(rbind, data) rownames(data) <- NULL head(data) ## ------------------------------------------------------------------------ vcConfParams() ## ----mismatchPlotsSomatic, fig.width=10.5, fig.height=6.0, dpi=300------- windowsize = 50 plots = list() for(idx in seq(length=nrow(data))){ varpos <- data$Start[idx] sample <- data$Sample[idx] selectedSamples <- subset( sampleData, Patient == sampleData$Patient[sampleData$Sample == data$Sample[idx]] )$Sample sampleDataSubset <- subset( sampleData, Sample %in% selectedSamples) sampleDataSubset$Column <- rank(sampleDataSubset$Column) plotData <- h5readBlock( filename = tallyFile, group = group, samples = selectedSamples, range = c(varpos - windowsize, varpos + windowsize) ) p <- mismatchPlot( data = plotData, sampledata = sampleDataSubset, position = varpos, windowsize = windowsize) plots[[idx]] <- p print(p + theme(strip.text.y = element_text(family="serif", angle = 0, size = 12))) } ## ----variantsAF---------------------------------------------------------- data$Support <- data$caseCountFwd + data$caseCountRev data$Coverage <- data$caseCoverageFwd + data$caseCoverageRev data$AllelicFrequency <- data$Support / data$Coverage data ## ----reportingTools------------------------------------------------------ library(ReportingTools) library(hwriter) for(idx in seq_len(length(plots))){ png(paste("Var", idx, "png", sep="."), 1200, 800) print(plots[[idx]]) dev.off() } reportData <- data reportData$mismatchPlot <- paste0( "Link" ) # add a link to the variant plot files htmlRep <- HTMLReport(shortName = "VariantReport", baseDirectory = getwd()) publish( reportData, htmlRep ) # publish the data.frame to the HTML reports finish(htmlRep) ## ----geom_h5vc----------------------------------------------------------- tallyFile <- system.file( "extdata", "example.tally.hfs5", package = "h5vcData" ) sampleData <- getSampleData( tallyFile, "/ExampleStudy/16" ) position <- 29979629 windowsize <- 30 samples <- sampleData$Sample[sampleData$Patient == "Patient8"] data <- h5readBlock( filename = tallyFile, group = "/ExampleStudy/16", names = c("Coverages", "Counts"), range = c(position - windowsize, position + windowsize) ) # Summing up all mismatches irrespective of the alternative allele data$CountsAggregate = colSums(data$Counts) # Simple overview plot showing number of mismatches per position p <- ggplot() + geom_h5vc( data=data, sampledata=sampleData, windowsize = 35, position = 500, dataset = "Coverages", fill = "gray" ) + geom_h5vc( data=data, sampledata=sampleData, windowsize = 35, position = 500, dataset = "CountsAggregate", fill = "#A535C0" ) + facet_wrap( ~ Sample, ncol = 2 ) print(p)