Advanced Analyses

Prepare the environment

Firstly we need to make sure the tallyFile has been downloaded correctly and we load the sample data. See the vignette h5vc for more in-depth explanations of this step.

library(h5vc)
library(rhdf5)
library(GenomicRanges)
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## 
## The following object is masked from 'package:stats':
## 
##     xtabs
## 
## The following objects are masked from 'package:base':
## 
##     Filter, Find, Map, Position, Reduce, anyDuplicated, append,
##     as.data.frame, as.vector, cbind, colnames, do.call,
##     duplicated, eval, evalq, get, intersect, is.unsorted, lapply,
##     mapply, match, mget, order, paste, pmax, pmax.int, pmin,
##     pmin.int, rank, rbind, rep.int, rownames, sapply, setdiff,
##     sort, table, tapply, union, unique, unlist
## 
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## 
## The following objects are masked from 'package:reshape':
## 
##     expand, rename
## 
## The following objects are masked from 'package:plyr':
## 
##     desc, rename
## 
## Loading required package: GenomeInfoDb
library(h5vcData)
tallyFile <- system.file("extdata", "example.tally.hfs5", package = "h5vcData")
sampleData <- getSampleData(tallyFile, "/ExampleStudy/16")

Calling Variants

We use the callVariantsPaired function to perform variant calling in a cohort paired tumour/normal samples. This function has the following signature:

callVariantsPaired <- function (data, sampledata, cl = vcConfParams())

Where vcConfParams() is a function used to configure the behaviour of the calling function, it returns a list of paramters.

vcConfParams()
## $minStrandCov
## [1] 5
## 
## $maxStrandCov
## [1] 200
## 
## $minStrandAltSupport
## [1] 2
## 
## $maxStrandAltSupportControl
## [1] 0
## 
## $minStrandDelSupport
## [1] 2
## 
## $maxStrandDelSupportControl
## [1] 0
## 
## $minStrandCovControl
## [1] 5
## 
## $maxStrandCovControl
## [1] 200
## 
## $bases
## [1] 5 6 7 8
## 
## $returnDataPoints
## [1] FALSE
## 
## $annotateWithBackground
## [1] FALSE
## 
## $mergeCalls
## [1] FALSE
## 
## $mergeAggregator
## function (x, ...) 
## UseMethod("mean")
## <bytecode: 0x62ccf38>
## <environment: namespace:base>
## 
## $pValueAggregator
## function (..., na.rm = FALSE)  .Primitive("max")
calls16 <- h5dapply(
  filename = tallyFile,
  group = "/ExampleStudy/16",
  blocksize = 10000,
  FUN = callVariantsPaired,
  sampledata = sampleData,
  cl = vcConfParams( returnDataPoints = TRUE, annotateWithBackground = TRUE ),
  names = c( "Coverages", "Counts", "Reference" ),
  range = c(29950000,30000000)
  )

calls22 <- h5dapply(
  filename = tallyFile,
  group = "/ExampleStudy/22",
  blocksize = 10000,
  FUN = callVariantsPaired,
  sampledata = sampleData,
  cl = vcConfParams( returnDataPoints = TRUE, annotateWithBackground = TRUE ),
  names = c( "Coverages", "Counts", "Reference" ),
  range = c(39350000,39400000)
  )

calls <- rbind(do.call(rbind,calls16), do.call(rbind,calls22))

Plotting Variants

We can use the mismatchPlot function to create plots of the suspected variant regions. First we extract a set of interesting positions:

calls$Patient = sampleData$Patient[match(calls$Sample, sampleData$Sample)]
calls <- subset(calls, pValueFwd < 0.05 & pValueRev < 0.05)
calls$varID = paste(calls$seqnames, calls$start, calls$altAllele, calls$Patient)
calls <- calls[!duplicated(calls$varID), ]
calls
##                      Chrom    Start      End           Sample altAllele
## 29960000:29969999       16 29968525 29968525    PT5PrimaryDNA         G
## 29970000:29979999       16 29971655 29971655    PT5PrimaryDNA         T
## 29980000:29989999.3     16 29983015 29983015 PT8EarlyStageDNA         G
## 39350000:39359999.1     22 39350222 39350222    PT5PrimaryDNA         A
## 39350000:39359999.7     22 39351466 39351466 PT8EarlyStageDNA         A
## 39360000:39369999.7     22 39366770 39366770    PT5PrimaryDNA         C
## 39360000:39369999.26    22 39364411 39364411 PT8EarlyStageDNA         T
## 39360000:39369999.29    22 39366770 39366770 PT8EarlyStageDNA         C
##                      refAllele caseCountFwd caseCountRev caseCoverageFwd
## 29960000:29969999            T            2            3              23
## 29970000:29979999            C            2            2              27
## 29980000:29989999.3          C            8           14              19
## 39350000:39359999.1          G           13           11              20
## 39350000:39359999.7          G            7           13              16
## 39360000:39369999.7          T            8            5              25
## 39360000:39369999.26         C            9            9              25
## 39360000:39369999.29         T           12            7              23
##                      caseCoverageRev controlCountFwd controlCountRev
## 29960000:29969999                 18               0               0
## 29970000:29979999                 33               0               0
## 29980000:29989999.3               30               0               0
## 39350000:39359999.1               44               0               0
## 39350000:39359999.7               22               0               0
## 39360000:39369999.7               15               0               0
## 39360000:39369999.26              25               0               0
## 39360000:39369999.29              14               0               0
##                      controlCoverageFwd controlCoverageRev
## 29960000:29969999                     7                 19
## 29970000:29979999                    19                 28
## 29980000:29989999.3                  13                 10
## 39350000:39359999.1                  20                 32
## 39350000:39359999.7                   7                  9
## 39360000:39369999.7                  32                 24
## 39360000:39369999.26                 10                  8
## 39360000:39369999.29                 10                  9
##                      backgroundFrequencyFwd backgroundFrequencyRev
## 29960000:29969999                         0                      0
## 29970000:29979999                         0                      0
## 29980000:29989999.3                       0                      0
## 39350000:39359999.1                       0                      0
## 39350000:39359999.7                       0                      0
## 39360000:39369999.7                       0                      0
## 39360000:39369999.26                      0                      0
## 39360000:39369999.29                      0                      0
##                      pValueFwd pValueRev  Patient        varID
## 29960000:29969999            0         0 Patient5   G Patient5
## 29970000:29979999            0         0 Patient5   T Patient5
## 29980000:29989999.3          0         0 Patient8   G Patient8
## 39350000:39359999.1          0         0 Patient5   A Patient5
## 39350000:39359999.7          0         0 Patient8   A Patient8
## 39360000:39369999.7          0         0 Patient5   C Patient5
## 39360000:39369999.26         0         0 Patient8   T Patient8
## 39360000:39369999.29         0         0 Patient8   C Patient8

The remaining two calls have good significance (binom.test against the estimated background error rate, i.e. the mean of the mismatch rates of all control samples in the cohort) and we filter out double entries that come from the same patient. You will see in the plots below, that patient 5 has the variant in both the primary and the relapse tumour. Now we can generate the mismatch plots.

windowsize <- 35
for (i in seq(nrow(calls))) {
    position <- calls$Start[i]
    data <- h5readBlock(filename = tallyFile, group = paste("/ExampleStudy", 
        calls$Chrom[i], sep = "/"), names = c("Coverages", "Counts", "Deletions"), 
        range = c(position - windowsize, position + windowsize))
    p <- mismatchPlot(data = data, sampledata = sampleData, samples = sampleData$Sample[sampleData$Patient == 
        calls$Patient[i]], windowsize = windowsize, position = position)
    print(p)
}

plot of chunk unnamed-chunk-6 plot of chunk unnamed-chunk-6 plot of chunk unnamed-chunk-6 plot of chunk unnamed-chunk-6 plot of chunk unnamed-chunk-6 plot of chunk unnamed-chunk-6 plot of chunk unnamed-chunk-6 plot of chunk unnamed-chunk-6

Plotting Arbitrary Datasets

Sometimes you want more flexibility that what mismatchPlot can provide, for this case the geom_h5vc function can be used to create an independed ggplot2 plot layer from a dataset that can be added to any ggplot object, e.g. an existing mismatchPlot or an empty plot. The example below demonstrates this.

library(h5vc)
library(ggplot2)
tallyFile <- system.file("extdata", "example.tally.hfs5", package = "h5vcData")
sampleData <- getSampleData(tallyFile, "/ExampleStudy/16")
position <- 29979629
windowsize <- 30
samples <- sampleData$Sample[sampleData$Patient == "Patient8"]
data <- h5readBlock(filename = tallyFile, group = "/ExampleStudy/16", names = c("Coverages", 
    "Counts", "Deletions"), range = c(position - windowsize, position + windowsize))
# Summing up all mismatches irrespective of the alternative allele
data$CountsAggregate = colSums(data$Counts)
# Simple overview plot showing number of mismatches per position
p <- ggplot() + geom_h5vc(data = data, sampledata = sampleData, windowsize = 35, 
    position = 500, dataset = "Coverages", fill = "gray") + geom_h5vc(data = data, 
    sampledata = sampleData, windowsize = 35, position = 500, dataset = "CountsAggregate", 
    fill = "#D50000") + facet_wrap(~Sample, ncol = 2)
print(p)

plot of chunk unnamed-chunk-7

Plotting the Reference Track

Plotting the Reference as a track alongside the mismatch plot can give additional insights into the context of a variant call (e.g. presence of homopolymers). The mismatchPlot function will automatically add a plot for the reference sequnce if the data object that is passed to it has a slot named Reference, this behaviour can be supressed by setting the parameter plotReference to FALSE.

windowsize <- 35
for (i in seq(nrow(calls))) {
    position <- calls$Start[i]
    data <- h5readBlock(filename = tallyFile, group = paste("/ExampleStudy", 
        calls$Chrom[i], sep = "/"), names = c("Coverages", "Counts", "Deletions", 
        "Reference"), range = c(position - windowsize, position + windowsize))
    p <- mismatchPlot(data = data, sampledata = sampleData, samples = sampleData$Sample[sampleData$Patient == 
        calls$Patient[i]], windowsize = windowsize, position = position)
    print(p)
}

plot of chunk plotWithReference plot of chunk plotWithReference plot of chunk plotWithReference plot of chunk plotWithReference plot of chunk plotWithReference plot of chunk plotWithReference plot of chunk plotWithReference plot of chunk plotWithReference

Mutation Spectrum Analysis

We can also easily perform mutation spectrum analysis by using the function mutationSpectrum which works on a set of variant calls in a data.frame form as it would be produced by a call to e.g. callVariantsPaired and a tallyFile parameter specifying hte location of a tally file as well as a context parameter. The context parameter specifies how much sequence context should be taken into account for the mutation spectrum. An example with context 1 (i.e. one base up- and one base downstream of the variant) is shown below.

We skip the calling of variants (since ittakes al ong time on larger regions) and load the corresponding example data instead, we would generate the calls like this:

# loading library and example data
library(h5vc)
tallyFile <- system.file("extdata", "example.tally.hfs5", package = "h5vcData")
sampleData <- getSampleData(tallyFile, "/ExampleStudy/16")
windowsize <- 1e+05
samples <- sampleData$Sample[sampleData$Patient == "Patient8"]
# Calling Variants
vars <- h5dapply(filename = tallyFile, group = "/ExampleStudy/16", blocksize = 10000, 
    FUN = callVariantsPaired, sampledata = sampleData, cl = vcConfParams(returnDataPoints = TRUE), 
    names = c("Coverages", "Counts", "Reference"), range = c(2.9e+07, 3e+07), 
    verbose = TRUE)

variantCalls <- do.call(rbind, vars)

vars <- h5dapply(filename = tallyFile, group = "/ExampleStudy/22", blocksize = 10000, 
    FUN = callVariantsPaired, sampledata = sampleData, cl = vcConfParams(returnDataPoints = TRUE), 
    names = c("Coverages", "Counts", "Reference"), range = c(3.8e+07, 4e+07), 
    verbose = TRUE)

variantCalls <- rbind(variantCalls, do.call(rbind, vars))
rownames(variantCalls) <- NULL

save(variantCalls, file = "example.variants.RDa")

The mutation spectrum can now be extracted from the tallyFile and variant calls like so:

data("example.variants", package = "h5vcData")
ms <- mutationSpectrum(variantCalls, tallyFile, "/ExampleStudy")
head(ms)
##   refAllele altAllele        Sample Prefix Suffix MutationType Context
## 1         T         T PT5PrimaryDNA      A      A          C>A     TGT
## 2         C         A PT5PrimaryDNA      A      A          C>A     ACA
## 3         C         G PT8PrimaryDNA      C      C          C>G     CCC
## 4         C         A PT8PrimaryDNA      G      C          T>G     GAC
## 5         C         C PT5PrimaryDNA      C      C          T>C     GAG
## 6         C         T PT5PrimaryDNA      G      C          C>T     GCC

An overview is given in the following plot:

plotMutationSpectrum(ms)

plot of chunk mutationSpectrumPlot

The y axis shows the number of this kind of mutation and the x-axis shows the sequence context (of size 1 in this case). At this point one could go on to perform a NMF-based analysis of the mutation spectra as is described in Alexandrov et. al. - Signatures of mutational processes in human cancer