fastreeR 1.0.0
The goal of fastreeR is to provide functions for calculating distance matrix, building phylogenetic tree or performing hierarchical clustering between samples, directly from a VCF or FASTA file.
To install fastreeR package:
if (!requireNamespace("BiocManager", quietly=TRUE))
    install.packages("BiocManager")
BiocManager::install("fastreeR")You should allocate minimum 10kb per sample per variant of RAM for the JVM.
The more RAM you allocate, the faster the execution will be (less pauses
for garbage collection).
In order to allocate RAM, a special parameter needs to be passed while JVM
initializes. JVM parameters can be passed by setting java.parameters option.
The -Xmx parameter, followed (without space) by an integer value and a
letter, is used to tell JVM what is the maximum amount of heap RAM that it can
use. The letter in the parameter (uppercase or lowercase), indicates RAM units.
For example, parameters -Xmx1024m or -Xmx1024M or -Xmx1g or -Xmx1G, all
allocate 1 Gigabyte or 1024 Megabytes of maximum RAM for JVM.
options(java.parameters="-Xmx1G")
unloadNamespace("fastreeR")
library(fastreeR)
library(utils)
library(ape)
library(stats)
library(grid)
library(BiocFileCache)We download, in a temporary location, a small vcf file
from 1K project, with around 150 samples and 100k variants (SNPs and INDELs).
We use BiocFileCache for this retrieval process
so that it is not repeated needlessly.
If for any reason we cannot download, we use the small sample vcf from
fastreeR package.
bfc <- BiocFileCache::BiocFileCache(ask = FALSE)
tempVcfUrl <-
    paste0("https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/",
        "1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/",
        "supporting/related_samples/",
        "ALL.chrX.shapeit2_integrated_snvindels_v2a_related_samples_27022019.",
        "GRCh38.phased.vcf.gz")
tempVcf <- BiocFileCache::bfcquery(bfc,field = "rname", "tempVcf")$rpath[1]
if(is.na(tempVcf)) {
    tryCatch(
    { tempVcf <- BiocFileCache::bfcadd(bfc,"tempVcf",fpath=tempVcfUrl)[[1]]
    },error=function(cond) {
        tempVcf <- system.file("extdata", "samples.vcf.gz", package="fastreeR")
        }
    )
}We download, in temporary location, some small bacterial genomes.
We use BiocFileCache for this retrieval process
so that it is not repeated needlessly.
If for any reason we cannot download, we use the small sample fasta from
fastreeR package.
tempFastasUrls <- c(
    #Mycobacterium liflandii
    paste0("https://ftp.ncbi.nih.gov/genomes/refseq/bacteria/",
        "Mycobacterium_liflandii/latest_assembly_versions/",
        "GCF_000026445.2_ASM2644v2/GCF_000026445.2_ASM2644v2_genomic.fna.gz"),
    #Pelobacter propionicus
    paste0("https://ftp.ncbi.nih.gov/genomes/refseq/bacteria/",
        "Pelobacter_propionicus/latest_assembly_versions/",
        "GCF_000015045.1_ASM1504v1/GCF_000015045.1_ASM1504v1_genomic.fna.gz"),
    #Rickettsia prowazekii
    paste0("https://ftp.ncbi.nih.gov/genomes/refseq/bacteria/",
        "Rickettsia_prowazekii/latest_assembly_versions/",
        "GCF_000022785.1_ASM2278v1/GCF_000022785.1_ASM2278v1_genomic.fna.gz"),
    #Salmonella enterica
    paste0("https://ftp.ncbi.nih.gov/genomes/refseq/bacteria/",
        "Salmonella_enterica/reference/",
        "GCF_000006945.2_ASM694v2/GCF_000006945.2_ASM694v2_genomic.fna.gz"),
    #Staphylococcus aureus
    paste0("https://ftp.ncbi.nih.gov/genomes/refseq/bacteria/",
        "Staphylococcus_aureus/reference/",
        "GCF_000013425.1_ASM1342v1/GCF_000013425.1_ASM1342v1_genomic.fna.gz")
)
tempFastas <- list()
for (i in seq(1,5)) {
    tempFastas[[i]] <- BiocFileCache::bfcquery(bfc,field = "rname", 
                                                paste0("temp_fasta",i))$rpath[1]
    if(is.na(tempFastas[[i]])) {
        tryCatch(
        { tempFastas[[i]] <- 
            BiocFileCache::bfcadd(bfc, paste0("temp_fasta",i), 
                                                fpath=tempFastasUrls[i])[[1]]
        },error=function(cond) {
            tempFastas <- system.file("extdata", "samples.fasta.gz", 
                                                        package="fastreeR")
            break
            }
        )
    }
}myVcfIstats <- fastreeR::vcf2istats(inputFile = tempVcf)
plot(myVcfIstats[,7:9])
Figure 1: Sample statistics from vcf file
The most time consuming process is calculating distances between samples. Assign more processors in order to speed up this operation.
myVcfDist <- fastreeR::vcf2dist(inputFile = tempVcf, threads = 2)graphics::hist(myVcfDist, breaks = 100, main=NULL, 
                                xlab = "Distance", xlim = c(0,max(myVcfDist)))
Figure 2: Histogram of distances from vcf file
We note two distinct groups of distances. One around of distance value 0.05 and the second around distance value 0.065.
fastreeR::dist2treeNotice that the generated tree is ultrametric.
myVcfTree <- fastreeR::dist2tree(inputDist = myVcfDist)
plot(ape::read.tree(text = myVcfTree), direction = "down", cex = 0.3)
ape::add.scale.bar()
ape::axisPhylo(side = 2)
Figure 3: Tree from vcf with fastreeR
Of course the same can be achieved directly from the vcf file, without calculating distances.
myVcfTree <- fastreeR::vcf2tree(inputFile = tempVcf, threads = 2)
plot(ape::read.tree(text = myVcfTree), direction = "down", cex = 0.3)
ape::add.scale.bar()
ape::axisPhylo(side = 2)
Figure 4: Tree from vcf with fastreeR
As expected from the histogram of distances, two groups of samples also emerge in the tree. The two branches, one at height around 0.055 and the second around height 0.065, are clearly visible.
stats::hclustFor comparison, we generate a tree by using stats package and distances
calculated by fastreeR.
myVcfTreeStats <- stats::hclust(myVcfDist)
plot(myVcfTreeStats, ann = FALSE, cex = 0.3)
Figure 5: Tree from vcf with stats::hclust
Although it does not initially look very similar, because it is not ultrametric, it is indeed quite the same tree. We note again the two groups (two branches) of samples and the 4 samples, possibly clones, that they show very close distances between them.
We can identify the two groups of samples, apparent from the hierarchical tree,
by using dist2clusters
or vcf2clusters
or tree2clusters.
By playing a little with the cutHeight parameter, we find that a
value of cutHeight=0.067 cuts the tree into two branches.
The first group contains 106 samples and the second 44.
myVcfClust <- fastreeR::dist2clusters(inputDist = myVcfDist, cutHeight = 0.067)
#>  ..done.
tree <- myVcfClust[[1]]
clusters1 <- myVcfClust[[2]][[2]][1]
clusters2 <- myVcfClust[[2]][[2]][2]
clusters1
#> [1] "2 44 HG02478 HG02762 HG02869 HG02964 HG02965 HG03033 HG03034 HG03076 HG03249 HG03250 HG03306 HG03307 HG03309 HG03312 HG03339 HG03361 HG03373 HG03383 HG03408 HG03454 HG03493 HG03508 HG03566 HG03569 HG03574 HG03582 NA18487 NA19150 NA19240 NA19311 NA19313 NA19373 NA19381 NA19382 NA19396 NA19444 NA19453 NA19469 NA19470 NA19985 NA20313 NA20322 NA20341 NA20361"
clusters2
#> [1] NASimilar analysis we can perform when we have samples represented as sequences in a fasta file.
Use of the downloaded sample fasta file :
myFastaDist <- fastreeR::fasta2dist(tempFastas, kmer = 6)Or use the provided by fastreeR fasta file of 48 bacterial RefSeq :
myFastaDist <- fastreeR::fasta2dist(
    system.file("extdata", "samples.fasta.gz", package="fastreeR"), kmer = 6)graphics::hist(myFastaDist, breaks = 100, main=NULL, 
                                xlab="Distance", xlim = c(0,max(myFastaDist)))
Figure 6: Histogram of distances from fasta file
fastreeR::dist2treemyFastaTree <- fastreeR::dist2tree(inputDist = myFastaDist)
plot(ape::read.tree(text = myFastaTree), direction = "down", cex = 0.3)
ape::add.scale.bar()
ape::axisPhylo(side = 2)
Figure 7: Tree from fasta with fastreeR
stats::hclustmyFastaTreeStats <- stats::hclust(myFastaDist)
plot(myFastaTreeStats, ann = FALSE, cex = 0.3)
Figure 8: Tree from fasta with stats::hclust
utils::sessionInfo()
#> R version 4.2.0 RC (2022-04-19 r82224)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 20.04.4 LTS
#> 
#> Matrix products: default
#> BLAS:   /home/biocbuild/bbs-3.15-bioc/R/lib/libRblas.so
#> LAPACK: /home/biocbuild/bbs-3.15-bioc/R/lib/libRlapack.so
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8          LC_NUMERIC=C                 
#>  [3] LC_TIME=en_GB                 LC_COLLATE=C                 
#>  [5] LC_MONETARY=en_US.UTF-8       LC_MESSAGES=en_US.UTF-8      
#>  [7] LC_PAPER=en_US.UTF-8          LC_NAME=en_US.UTF-8          
#>  [9] LC_ADDRESS=en_US.UTF-8        LC_TELEPHONE=en_US.UTF-8     
#> [11] LC_MEASUREMENT=en_US.UTF-8    LC_IDENTIFICATION=en_US.UTF-8
#> 
#> attached base packages:
#> [1] grid      stats     graphics  grDevices utils     datasets  methods  
#> [8] base     
#> 
#> other attached packages:
#> [1] BiocFileCache_2.4.0 dbplyr_2.1.1        ape_5.6-2          
#> [4] fastreeR_1.0.0      BiocStyle_2.24.0   
#> 
#> loaded via a namespace (and not attached):
#>  [1] tidyselect_1.1.2      xfun_0.30             bslib_0.3.1          
#>  [4] purrr_0.3.4           rJava_1.0-6           lattice_0.20-45      
#>  [7] vctrs_0.4.1           generics_0.1.2        htmltools_0.5.2      
#> [10] yaml_2.3.5            utf8_1.2.2            blob_1.2.3           
#> [13] rlang_1.0.2           R.oo_1.24.0           jquerylib_0.1.4      
#> [16] pillar_1.7.0          R.utils_2.11.0        withr_2.5.0          
#> [19] glue_1.6.2            DBI_1.1.2             rappdirs_0.3.3       
#> [22] bit64_4.0.5           lifecycle_1.0.1       stringr_1.4.0        
#> [25] R.methodsS3_1.8.1     evaluate_0.15         memoise_2.0.1        
#> [28] knitr_1.38            fastmap_1.1.0         parallel_4.2.0       
#> [31] curl_4.3.2            fansi_1.0.3           highr_0.9            
#> [34] Rcpp_1.0.8.3          filelock_1.0.2        BiocManager_1.30.17  
#> [37] cachem_1.0.6          magick_2.7.3          jsonlite_1.8.0       
#> [40] bit_4.0.4             digest_0.6.29         stringi_1.7.6        
#> [43] bookdown_0.26         dplyr_1.0.8           cli_3.3.0            
#> [46] tools_4.2.0           magrittr_2.0.3        sass_0.4.1           
#> [49] tibble_3.1.6          RSQLite_2.2.12        dynamicTreeCut_1.63-1
#> [52] crayon_1.5.1          pkgconfig_2.0.3       ellipsis_0.3.2       
#> [55] assertthat_0.2.1      rmarkdown_2.14        httr_1.4.2           
#> [58] R6_2.5.1              nlme_3.1-157          compiler_4.2.0