## ----rhdf5Intro----------------------------------------------------------
#We use the rhdf5 package
library(rhdf5)
filename <- tempfile() #using a temporary file
h5createFile(filename) #creating the basic file layout


## ------------------------------------------------------------------------
h5ls(filename)


## ----h5write-------------------------------------------------------------
h5write(obj = array(rnorm(20), dim=c(4,5)), file = filename, name = "/RandomArray")
h5write(obj = rpois(100, 10), file = filename, name = "/PoissonVector")
h5ls(filename)


## ----h5read--------------------------------------------------------------
h5read(file = filename, name = "/PoissonVector")
h5read(file = filename, name = "/RandomArray", index = list(2:4, 3:5))


## ------------------------------------------------------------------------
getwd()


## ----initialise, message=FALSE-------------------------------------------
library(h5vc)
library(Rsamtools)


## ------------------------------------------------------------------------
bamFiles <- list.files("ExampleData", "bam$") #list available data files
bamFiles <- file.path( "ExampleData", bamFiles)
# we use scanBamHeader to extract information about the contigs
chromdim <- sapply( scanBamHeader(bamFiles), function(x) x$targets )
colnames(chromdim) <- bamFiles
chromdim[,1]


## ----setLocation---------------------------------------------------------
chrom = "1"
startpos = 115200000
endpos   = 115300000


## ----parallelTallying----------------------------------------------------
library(BiocParallel) #load the library to enable parallel execution
maxWorkers <- 2 #Set this to 1 if you want serial execution
tallyList <- list() #outputs go in this list
timeList <- list() #time measurements go here
for( nWorkers in 1:maxWorkers ){
  # set the number of concurrent jobs (see ?register for details)
  register(MulticoreParam(workers = nWorkers))
  timeList[[nWorkers]] <- system.time(
      tallyList[[nWorkers]] <- applyTallies(bamFiles, chrom, startpos, endpos )
    )
  print(paste("Tallied with", nWorkers, "parallel tasks!"))
}
timeList


## ----tallyBAMFiles-------------------------------------------------------
# we load a reference genome and use a subset of it as a parameter to applyTallies
suppressPackageStartupMessages(library(BSgenome.Hsapiens.UCSC.hg19))
tallies <- applyTallies(
  bamFiles, chrom = chrom, start = startpos, stop = endpos, ncycles = 10,
  reference = BSgenome.Hsapiens.UCSC.hg19[[paste0("chr", chrom)]][startpos:endpos] )
# the first and last 10 sequencing cycles are called unreliable
str(tallies)


## ----prepareTallyFile----------------------------------------------------
chromlength <- chromdim[chrom,1] #grab the chromosome length from the first sample
study <- "/NRAS" #This will be the name of the main folder in the HDF5 file
tallyFile <- "NRAS.tally.hfs5"
if( file.exists(tallyFile) ){
  file.remove(tallyFile)
}
if( prepareTallyFile(tallyFile, study, chrom, chromlength, nsamples = length(bamFiles))){
  h5ls(tallyFile)
}else{
  message( paste( "Preparation of:", tallyFile, "failed" ) )
}


## ----writingToTallyFile--------------------------------------------------
group <- paste(study, chrom, sep="/")
h5write( tallies$Counts, tallyFile, paste( group, "Counts", sep = "/" ),
         index = list( NULL, NULL, NULL, startpos:endpos ) )
h5write( tallies$Coverages, tallyFile, paste( group, "Coverages", sep = "/" ),
         index = list( NULL, NULL, startpos:endpos ) )
h5write( tallies$Deletions, tallyFile, paste( group, "Deletions", sep = "/" ),
         index = list( NULL, NULL, startpos:endpos ) )
h5write( tallies$Reference, tallyFile, paste( group, "Reference", sep = "/" ),
         index = list( startpos:endpos ) )
h5ls(tallyFile)


## ----sampleDataSetUp-----------------------------------------------------
sampleData <- data.frame(
  Sample = gsub( ".bam", "", gsub( pattern = "ExampleData/", "", bamFiles)),
  Column = seq_along(bamFiles),
  Type = "Control",
  Library = "WholeExome",
  stringsAsFactors = FALSE
  )
sampleData$SampleID <- sapply(
  strsplit( sampleData$Sample, "\\."),
  function(x) as.numeric(x[2])
  )
sampleData$Type[sampleData$SampleID %% 2 == 0] <- "Case"
sampleData$Patient <- paste0( "Patient", floor( (sampleData$SampleID + 1 ) / 2 ) )
sampleData$Library[sampleData$SampleID >= 37] <- "WholeGenome"
setSampleData( tallyFile, group, sampleData, largeAttributes = TRUE ) 
getSampleData( tallyFile, group )


## ----sampleDataModify----------------------------------------------------
sampleData <- getSampleData( tallyFile, group ) #read from file
sampleData$ClinicalVariable <- rnorm(nrow(x = sampleData)) # add some data
setSampleData( tallyFile, group, sampleData, largeAttributes = TRUE ) # write it back
head(getSampleData( tallyFile, group )) #did it work?


## ------------------------------------------------------------------------
sampleData <- getSampleData( tallyFile, group ) #read from file
sampleData$Sample <- "********"
head(sampleData)
setSampleData( tallyFile, group, sampleData ) # write it back without largeAttributes=TRUE
sampleData <- getSampleData( tallyFile, group ) #did it work?
head(sampleData) #apparently not


## ----spaceUsage----------------------------------------------------------
tallySize <- file.info(tallyFile)$size
bamSize <- sum(sapply(
  list.files("ExampleData/", pattern= "*.bam$", full.names=TRUE),
  function(x) file.info(x)$size )
  )
tallySize / bamSize


## ----h5readBlock---------------------------------------------------------
startpos <- 115247090
endpos <- 115259515
data <- h5readBlock(
    filename = tallyFile,
    group = group,
    range = c(startpos, endpos)
  )
str(data)
data <- h5readBlock(
    filename = tallyFile,
    group = group,
    names = c("Coverages", "Reference"),
    range = c(startpos, endpos)
  )
str(data)
data <- h5readBlock(
    filename = tallyFile,
    group = group,
    names = c("Coverages", "Reference"),
    samples = sampleData$Sample[3:7],
    range = c(startpos, endpos)
  )
str(data)


## ----binnedCoverageFull--------------------------------------------------
startpos = 115200000
endpos   = 115300000
data <- h5dapply( # extracting coverage binned at 1000 bases
    filename = tallyFile,
    group = group,
    blocksize = 1000,
    FUN = binnedCoverage,
    sampledata = sampleData,
    gccount = TRUE,
    names = c( "Coverages", "Reference" ),
    range = c(startpos, endpos)
  )
  data <- do.call(rbind, data)
  rownames(data) <- NULL
  head(data)


## ------------------------------------------------------------------------
binnedCoverage


## ----binnedCoverageWholeGenome-------------------------------------------
# Grab only the whole genome samples
selectedSamples <- sampleData$Sample[sampleData$Library == "WholeGenome"]
sampleDataSubset <- subset( sampleData, Sample %in% selectedSamples)
# The Column propertie specifies the position of the sample in the datasets,
# since we subset, the indexes are off and need to be adjusted
sampleDataSubset$Column <- rank(sampleDataSubset$Column)
# Apply the binnedCoverage function
data <- h5dapply( # extracting coverage binned at 1000 bases
    filename = tallyFile,
    group = group,
    blocksize = 1000,
    samples = selectedSamples,
    FUN = binnedCoverage,
    sampledata = sampleDataSubset,
    gccount = TRUE,
    names = c( "Coverages", "Reference" ),
    range = c(startpos, endpos)
  )
# h5dapply returns a list, we need to merge the results by row
data <- do.call(rbind, data)
rownames(data) <- NULL
head(data)


## ----GCContentPlot, fig.width=10.5, fig.height=8.5, dpi=300--------------
library(ggplot2)
library(reshape2)
data <- melt(data, id.vars = c("Start", "End", "GCCount"))
colnames(data)[4] <- "Sample"
colnames(data)[5] <- "Coverage"
p <- ggplot(data, aes(x=GCCount, y = Coverage, col = Sample)) + 
  geom_point() + facet_wrap(~ Sample, ncol = 2)
print(p)


## ----parallelh5dapply----------------------------------------------------
startpos = 115200000
endpos   = 115300000
system.time(
  dataS <- h5dapply( # extracting coverage binned at 1000 bases
    filename = tallyFile,
    group = group,
    blocksize = 1000,
    FUN = binnedCoverage,
    sampledata = sampleData,
    gccount = TRUE,
    names = c( "Coverages", "Reference" ),
    range = c(startpos, endpos)
  )
)
dataS <- do.call(rbind, dataS)
system.time(
  dataP <- h5dapply( # extracting coverage binned at 1000 bases
    filename = tallyFile,
    group = group,
    blocksize = 1000,
    FUN = binnedCoverage,
    sampledata = sampleData,
    gccount = TRUE,
    names = c( "Coverages", "Reference" ),
    range = c(startpos, endpos),
    BPPARAM = MulticoreParam(workers = 4)
  )
)
dataP <- do.call(rbind, dataP)
# Are results of serial and parallel execution identical?
identical(dataS, dataP)


## ----myVariantCaller-----------------------------------------------------
myVariantCaller <- function( data, sampleNames, minAF = 0.2, minCov = 20 ){
  #sanity checks for the data
  stopifnot("Counts" %in% names(data))
  stopifnot("Coverages" %in% names(data))
  #sum up coverages for each sample (forward strand + reverse strand coverage)
  coverages <- apply(data$Coverages, c(1,3), sum)
  # sum up all counts for each sample (i.e. #A + #C + #G + #T on both strands)
  counts <- apply(data$Counts, c(2,4), sum)
  # Now counts and coverage have a compatible shape
  # and we can calculate estimates for the allelic frequency
  afs <- counts / coverages
  # Here we do the calling, by filtering for a minimal allelic Frequency and
  # Coverage in each sample and position
  filter <- afs >= minAF & coverages >= minCov
  require(reshape2) #use melt to convert the array into a data.frame
  filter <- melt(filter)
  #rename the columns
  colnames(filter) <- c("Sample", "Position", "Called")
  filter$Sample <- sampleNames[filter$Sample]
  # fix the position to reflect genomic coordinates
  filter$Position <- filter$Position + data$h5dapplyInfo$Blockstart - 1
  # We return only those positions that pass the filter
  return(subset(filter, Called == TRUE))
}


## ----callingVariantsNaively----------------------------------------------
selectedSamples <- sampleData$Sample[sampleData$Library == "WholeGenome"]
sampleDataSubset <- subset( sampleData, Sample %in% selectedSamples)
sampleDataSubset$Column <- rank(sampleDataSubset$Column)
data <- h5dapply( # calling variants in bins of size 10kb
    filename = tallyFile,
    group = group,
    blocksize = 10000,
    samples = selectedSamples,
    FUN = myVariantCaller,
    sampleNames = selectedSamples,
    names = c( "Coverages", "Counts" ),
    range = c(startpos, endpos)
  )
data <- do.call(rbind, data)
rownames(data) <- NULL
head(data)


## ----mismatchPlotOne, fig.width=10.5, fig.height=10.5, dpi=300-----------
varpos <- data$Position[1]
windowsize <- 50
plotData <- h5readBlock(
    filename = tallyFile,
    group = group,
    samples = selectedSamples,
    range = c(varpos - windowsize, varpos + windowsize)
  )
p <- mismatchPlot(
  data = plotData,
  sampledata = sampleDataSubset,
  position = varpos, windowsize = windowsize)
print(p)


## ----callingVariantsPaired-----------------------------------------------
startpos = 115200000
endpos   = 115300000
data <- h5dapply( # calling variants pairwise in bins of size 5kb
    filename = tallyFile,
    group = group,
    blocksize = 10000,
    FUN = callVariantsPaired,
    sampledata = sampleData,
    cl = vcConfParams(
      returnDataPoints = TRUE,
      annotateWithBackground = TRUE
      ),
    names = c( "Coverages", "Counts", "Reference" ),
    range = c(startpos, endpos),
    verbose = TRUE
  )
data <- do.call(rbind, data)
rownames(data) <- NULL
head(data)


## ------------------------------------------------------------------------
vcConfParams()


## ----mismatchPlotsSomatic, fig.width=10.5, fig.height=6.0, dpi=300-------
windowsize = 50
plots = list()
for(idx in seq(length=nrow(data))){
  varpos <- data$Start[idx]
  sample <- data$Sample[idx]
  selectedSamples <- subset(
    sampleData,
    Patient == sampleData$Patient[sampleData$Sample == data$Sample[idx]]
    )$Sample
  sampleDataSubset <- subset( sampleData, Sample %in% selectedSamples)
  sampleDataSubset$Column <- rank(sampleDataSubset$Column)
  plotData <- h5readBlock(
    filename = tallyFile,
    group = group,
    samples = selectedSamples,
    range = c(varpos - windowsize, varpos + windowsize)
  )
  p <- mismatchPlot(
    data = plotData,
    sampledata = sampleDataSubset,
    position = varpos,
    windowsize = windowsize)
  plots[[idx]] <- p
  print(p + theme(strip.text.y = element_text(family="serif", angle = 0, size = 12)))
}


## ----variantsAF----------------------------------------------------------
data$Support <- data$caseCountFwd + data$caseCountRev
data$Coverage <- data$caseCoverageFwd + data$caseCoverageRev
data$AllelicFrequency <- data$Support / data$Coverage
data


## ----reportingTools------------------------------------------------------
library(ReportingTools)
library(hwriter)

for(idx in seq_len(length(plots))){
  png(paste("Var", idx, "png", sep="."), 1200, 800)
  print(plots[[idx]])
  dev.off()
}

reportData <- data
reportData$mismatchPlot <- paste0(
  "<a target=\"_blank\" href=\"Var.",
  seq(length=nrow(data)),
  ".png\">Link</a>" ) # add a link to the variant plot files
htmlRep <- HTMLReport(shortName = "VariantReport", baseDirectory = getwd())
publish( reportData, htmlRep ) # publish the data.frame to the HTML reports
finish(htmlRep)


## ----geom_h5vc-----------------------------------------------------------
tallyFile <- system.file( "extdata", "example.tally.hfs5", package = "h5vcData" )
sampleData <- getSampleData( tallyFile, "/ExampleStudy/16" )
position <- 29979629
windowsize <- 30
samples <- sampleData$Sample[sampleData$Patient == "Patient8"]
data <- h5readBlock(
  filename = tallyFile,
  group = "/ExampleStudy/16",
  names = c("Coverages", "Counts"),
  range = c(position - windowsize, position + windowsize)
)
# Summing up all mismatches irrespective of the alternative allele
data$CountsAggregate = colSums(data$Counts)
# Simple overview plot showing number of mismatches per position
p <- ggplot() + 
geom_h5vc( data=data, 
           sampledata=sampleData, 
           windowsize = 35, position = 500, 
           dataset = "Coverages", fill = "gray" ) + 
geom_h5vc( data=data, 
           sampledata=sampleData, 
           windowsize = 35, position = 500, 
           dataset = "CountsAggregate", fill = "#A535C0" ) + 
facet_wrap( ~ Sample, ncol = 2 )
print(p)