fastreeR 1.10.0
The goal of fastreeR is to provide functions for calculating distance matrix, building phylogenetic tree or performing hierarchical clustering between samples, directly from a VCF or FASTA file.
To install fastreeR
package:
if (!requireNamespace("BiocManager", quietly=TRUE))
install.packages("BiocManager")
BiocManager::install("fastreeR")
You should allocate minimum 10kb per sample per variant of RAM for the JVM.
The more RAM you allocate, the faster the execution will be (less pauses
for garbage collection).
In order to allocate RAM, a special parameter needs to be passed while JVM
initializes. JVM parameters can be passed by setting java.parameters
option.
The -Xmx
parameter, followed (without space) by an integer value and a
letter, is used to tell JVM what is the maximum amount of heap RAM that it can
use. The letter in the parameter (uppercase or lowercase), indicates RAM units.
For example, parameters -Xmx1024m
or -Xmx1024M
or -Xmx1g
or -Xmx1G
, all
allocate 1 Gigabyte or 1024 Megabytes of maximum RAM for JVM.
options(java.parameters="-Xmx1G")
unloadNamespace("fastreeR")
library(fastreeR)
library(utils)
library(ape)
library(stats)
library(grid)
library(BiocFileCache)
We download, in a temporary location, a small vcf file
from 1K project, with around 150 samples and 100k variants (SNPs and INDELs).
We use BiocFileCache
for this retrieval process
so that it is not repeated needlessly.
If for any reason we cannot download, we use the small sample vcf from
fastreeR
package.
bfc <- BiocFileCache::BiocFileCache(ask = FALSE)
tempVcfUrl <-
paste0("https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/",
"1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/",
"supporting/related_samples/",
"ALL.chrX.shapeit2_integrated_snvindels_v2a_related_samples_27022019.",
"GRCh38.phased.vcf.gz")
tempVcf <- BiocFileCache::bfcquery(bfc,field = "rname", "tempVcf")$rpath[1]
if(is.na(tempVcf)) {
tryCatch(
{ tempVcf <- BiocFileCache::bfcadd(bfc,"tempVcf",fpath=tempVcfUrl)[[1]]
},
error=function(cond) {
tempVcf <- system.file("extdata", "samples.vcf.gz", package="fastreeR")
},
warning=function(cond) {
tempVcf <- system.file("extdata", "samples.vcf.gz", package="fastreeR")
}
)
}
if(file.size(tempVcf) == 0L) {
tempVcf <- system.file("extdata", "samples.vcf.gz", package="fastreeR")
}
We download, in temporary location, some small bacterial genomes.
We use BiocFileCache
for this retrieval process
so that it is not repeated needlessly.
If for any reason we cannot download, we use the small sample fasta from
fastreeR
package.
tempFastasUrls <- c(
#Mycobacterium liflandii
paste0("https://ftp.ncbi.nih.gov/genomes/refseq/bacteria/",
"Mycobacterium_liflandii/latest_assembly_versions/",
"GCF_000026445.2_ASM2644v2/GCF_000026445.2_ASM2644v2_genomic.fna.gz"),
#Pelobacter propionicus
paste0("https://ftp.ncbi.nih.gov/genomes/refseq/bacteria/",
"Pelobacter_propionicus/latest_assembly_versions/",
"GCF_000015045.1_ASM1504v1/GCF_000015045.1_ASM1504v1_genomic.fna.gz"),
#Rickettsia prowazekii
paste0("https://ftp.ncbi.nih.gov/genomes/refseq/bacteria/",
"Rickettsia_prowazekii/latest_assembly_versions/",
"GCF_000022785.1_ASM2278v1/GCF_000022785.1_ASM2278v1_genomic.fna.gz"),
#Salmonella enterica
paste0("https://ftp.ncbi.nih.gov/genomes/refseq/bacteria/",
"Salmonella_enterica/reference/",
"GCF_000006945.2_ASM694v2/GCF_000006945.2_ASM694v2_genomic.fna.gz"),
#Staphylococcus aureus
paste0("https://ftp.ncbi.nih.gov/genomes/refseq/bacteria/",
"Staphylococcus_aureus/reference/",
"GCF_000013425.1_ASM1342v1/GCF_000013425.1_ASM1342v1_genomic.fna.gz")
)
tempFastas <- list()
for (i in seq(1,5)) {
tempFastas[[i]] <- BiocFileCache::bfcquery(bfc,field = "rname",
paste0("temp_fasta",i))$rpath[1]
if(is.na(tempFastas[[i]])) {
tryCatch(
{ tempFastas[[i]] <-
BiocFileCache::bfcadd(bfc, paste0("temp_fasta",i),
fpath=tempFastasUrls[i])[[1]]
},
error=function(cond) {
tempFastas <- system.file("extdata", "samples.fasta.gz",
package="fastreeR")
break
},
warning=function(cond) {
tempFastas <- system.file("extdata", "samples.fasta.gz",
package="fastreeR")
break
}
)
}
if(!file.exists(tempFastas[[i]])) {
tempFastas <- system.file("extdata", "samples.fasta.gz",
package="fastreeR")
break
}
if(file.size(tempFastas[[i]]) == 0L) {
tempFastas <- system.file("extdata", "samples.fasta.gz",
package="fastreeR")
break
}
}
myVcfIstats <- fastreeR::vcf2istats(inputFile = tempVcf)
plot(myVcfIstats[,7:9])
The most time consuming process is calculating distances between samples. Assign more processors in order to speed up this operation.
myVcfDist <- fastreeR::vcf2dist(inputFile = tempVcf, threads = 2)
graphics::hist(myVcfDist, breaks = 100, main=NULL,
xlab = "Distance", xlim = c(0,max(myVcfDist)))
We note two distinct groups of distances. One around of distance value 0.05 and the second around distance value 0.065.
fastreeR::dist2tree
Notice that the generated tree is ultrametric.
myVcfTree <- fastreeR::dist2tree(inputDist = myVcfDist)
plot(ape::read.tree(text = myVcfTree), direction = "down", cex = 0.3)
ape::add.scale.bar()
ape::axisPhylo(side = 2)
Of course the same can be achieved directly from the vcf file, without calculating distances.
myVcfTree <- fastreeR::vcf2tree(inputFile = tempVcf, threads = 2)
plot(ape::read.tree(text = myVcfTree), direction = "down", cex = 0.3)
ape::add.scale.bar()
ape::axisPhylo(side = 2)