metabinR 2.0.0
metabinR performs abundance- and composition-based binning on
metagenomic samples directly from FASTA or FASTQ files. Abundance-based
binning (AB) analyzes long k-mers (k > 8); composition-based binning
(CB) analyzes short k-mers (k < 8); hierarchical binning (ABxCB) chains
the two.
The heavy lifting is implemented in Java and called via
rJava. From v2.0.0 the R API
returns an S4 MetabinResult object and accepts Biostrings / ShortRead
inputs in addition to file paths.
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("metabinR")
A JDK (Java >= 17) must be available before installing metabinR.
JVM flags are passed through the java.parameters option. The -Xmx
flag controls the maximum heap: -Xmx1500M or -Xmx3G, etc. Set this
before loading the package.
options(java.parameters = "-Xmx1500M")
library(metabinR)
library(data.table)
library(dplyr)
library(ggplot2)
library(gridExtra)
library(cvms)
library(sabre)
library(Biostrings)
To customise additional JVM flags (on top of the -Xmx heap setting),
set options(metabinR.jvm.flags = c("-XX:+UseG1GC", ...)) before the
package is loaded. The defaults returned by metabinR_jvm_options()
already enable G1GC and string deduplication.
MetabinResult classEach binning function returns a MetabinResult S4 object:
assignments(res) — DataFrame of per-read cluster assignments.nClusters(res) — number of clusters produced.parameters(res) — list of parameters actually used.algorithm(res) — "AB", "CB", or "ABxCB".as.data.frame(res) — the v1.x tabular layout for back-compat.The toy simulated metagenome contains 26,664 Illumina reads (13,332 pairs of 2x150bp) sampled from 10 bacterial genomes across two abundance classes (high vs. low).
Abundance ground truth:
abundances <- read.table(
system.file("extdata", "distribution_0.txt", package = "metabinR"),
col.names = c("genome_id", "abundance", "AB_id"))
Read-level ground truth:
reads.mapping <- fread(
system.file("extdata", "reads_mapping.tsv.gz", package = "metabinR")) %>%
merge(abundances[, c("genome_id", "AB_id")], by = "genome_id") %>%
arrange(anonymous_read_id)
Run abundance-based binning with 10-mers into 2 clusters:
res.AB <- abundance_based_binning(
system.file("extdata", "reads.metagenome.fasta.gz", package = "metabinR"),
numOfClustersAB = 2,
kMerSizeAB = 10,
dryRun = FALSE,
outputAB = "vignette"
)
res.AB
#> MetabinResult (AB)
#> reads: 26,664
#> clusters: 2
#> inputs: 1 file(s)
#> - /tmp/RtmpJeILFe/Rinst1dcc1c24cfe6fb/metabinR/extdata/reads.metagenome.fasta.gz
res.AB is a MetabinResult. Extract the per-read table and pull the
cluster labels out of it:
assignments.AB <- as.data.frame(res.AB) %>% arrange(read_id)
abundance_based_binning() wrote one FASTA per cluster and a k-mer
count histogram:
histogram.AB <- read.table("vignette__AB.histogram.tsv", header = TRUE)
ggplot(histogram.AB, aes(x = counts, y = frequency)) +
geom_area() +
labs(title = "kmer counts histogram") +
theme_bw()
Evaluate against the abundance-class ground truth:
eval.AB.cvms <- cvms::evaluate(
data = data.frame(
prediction = as.character(assignments.AB$AB),
target = as.character(reads.mapping$AB_id),
stringsAsFactors = FALSE),
target_col = "target",
prediction_cols = "prediction",
type = "binomial"
)
eval.AB.sabre <- sabre::vmeasure(
as.character(assignments.AB$AB),
as.character(reads.mapping$AB_id))
p <- cvms::plot_confusion_matrix(eval.AB.cvms) +
labs(title = "Confusion Matrix",
x = "Target Abundance Class",
y = "Predicted Abundance Class")
tab <- as.data.frame(
c(
Accuracy = round(eval.AB.cvms$Accuracy, 4),
Specificity = round(eval.AB.cvms$Specificity, 4),
Sensitivity = round(eval.AB.cvms$Sensitivity, 4),
Fscore = round(eval.AB.cvms$F1, 4),
Kappa = round(eval.AB.cvms$Kappa, 4),
Vmeasure = round(eval.AB.sabre$v_measure, 4)
)
)
grid.arrange(p, ncol = 1)
knitr::kable(tab, caption = "AB binning evaluation", col.names = NULL)
| Accuracy | 0.8700 |
| Specificity | 0.9058 |
| Sensitivity | 0.7608 |
| Fscore | 0.7430 |
| Kappa | 0.6560 |
| Vmeasure | 0.3553 |
Read-level ground truth (bacterial genome of origin):
reads.mapping <- fread(
system.file("extdata", "reads_mapping.tsv.gz", package = "metabinR")) %>%
arrange(anonymous_read_id)
Run composition-based binning with 4-mers into 10 clusters:
res.CB <- composition_based_binning(
system.file("extdata", "reads.metagenome.fasta.gz", package = "metabinR"),
numOfClustersCB = 10,
kMerSizeCB = 4,
dryRun = TRUE,
outputCB = "vignette"
)
assignments.CB <- as.data.frame(res.CB) %>% arrange(read_id)
As a pure clustering task, evaluate with extrinsic measures:
eval.CB.sabre <- sabre::vmeasure(
as.character(assignments.CB$CB),
as.character(reads.mapping$genome_id))
tab <- as.data.frame(
c(
Vmeasure = round(eval.CB.sabre$v_measure, 4),
Homogeneity = round(eval.CB.sabre$homogeneity, 4),
Completeness = round(eval.CB.sabre$completeness, 4)
)
)
knitr::kable(tab, caption = "CB binning evaluation", col.names = NULL)
| Vmeasure | 0.2279 |
| Homogeneity | 0.1906 |
| Completeness | 0.2833 |
res.ABxCB <- hierarchical_binning(
system.file("extdata", "reads.metagenome.fasta.gz", package = "metabinR"),
numOfClustersAB = 2,
kMerSizeAB = 10,
kMerSizeCB = 4,
dryRun = TRUE,
outputC = "vignette"
)
assignments.ABxCB <- as.data.frame(res.ABxCB) %>% arrange(read_id)
eval.ABxCB.sabre <- sabre::vmeasure(
as.character(assignments.ABxCB$ABxCB),
as.character(reads.mapping$genome_id))
tab <- as.data.frame(
c(
Vmeasure = round(eval.ABxCB.sabre$v_measure, 4),
Homogeneity = round(eval.ABxCB.sabre$homogeneity, 4),
Completeness = round(eval.ABxCB.sabre$completeness, 4)
)
)
knitr::kable(tab, caption = "ABxCB binning evaluation", col.names = NULL)
| Vmeasure | 0.2830 |
| Homogeneity | 0.4722 |
| Completeness | 0.2021 |
Instead of a path, you can pass a DNAStringSet,
QualityScaledDNAStringSet, or ShortReadQ. The Java backend reads
from disk, so non-file inputs are staged to a tempfile transparently.
reads <- Biostrings::readDNAStringSet(
system.file("extdata", "reads.metagenome.fasta.gz", package = "metabinR"))
res.AB.mem <- abundance_based_binning(
reads,
numOfClustersAB = 2,
kMerSizeAB = 10,
dryRun = TRUE
)
identical(nrow(assignments(res.AB.mem)), nrow(assignments(res.AB)))
#> [1] TRUE
Clean up files written by the AB run:
unlink("vignette__*")
utils::sessionInfo()
#> R version 4.6.0 RC (2026-04-17 r89917)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.4 LTS
#>
#> Matrix products: default
#> BLAS: /home/biocbuild/bbs-3.23-bioc/R/lib/libRblas.so
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0 LAPACK version 3.12.0
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_GB LC_COLLATE=C
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=en_US.UTF-8
#> [9] LC_ADDRESS=en_US.UTF-8 LC_TELEPHONE=en_US.UTF-8
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=en_US.UTF-8
#>
#> time zone: America/New_York
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] stats4 stats graphics grDevices utils datasets methods
#> [8] base
#>
#> other attached packages:
#> [1] Biostrings_2.80.0 Seqinfo_1.2.0 XVector_0.52.0
#> [4] IRanges_2.46.0 S4Vectors_0.50.0 BiocGenerics_0.58.0
#> [7] generics_0.1.4 sabre_0.4.3 cvms_2.0.0
#> [10] gridExtra_2.3 ggplot2_4.0.3 dplyr_1.2.1
#> [13] data.table_1.18.2.1 metabinR_2.0.0 BiocStyle_2.40.0
#>
#> loaded via a namespace (and not attached):
#> [1] DBI_1.3.0 bitops_1.0-9
#> [3] deldir_2.0-4 pROC_1.19.0.1
#> [5] rlang_1.2.0 magrittr_2.0.5
#> [7] otel_0.2.0 matrixStats_1.5.0
#> [9] e1071_1.7-17 compiler_4.6.0
#> [11] systemfonts_1.3.2 png_0.1-9
#> [13] vctrs_0.7.3 pwalign_1.8.0
#> [15] pkgconfig_2.0.3 crayon_1.5.3
#> [17] fastmap_1.2.0 backports_1.5.1
#> [19] magick_2.9.1 labeling_0.4.3
#> [21] Rsamtools_2.28.0 rmarkdown_2.31
#> [23] tinytex_0.59 purrr_1.2.2
#> [25] xfun_0.57 cachem_1.1.0
#> [27] cigarillo_1.2.0 jsonlite_2.0.0
#> [29] DelayedArray_0.38.0 BiocParallel_1.46.0
#> [31] jpeg_0.1-11 terra_1.9-11
#> [33] parallel_4.6.0 R6_2.6.1
#> [35] bslib_0.10.0 RColorBrewer_1.1-3
#> [37] GenomicRanges_1.64.0 jquerylib_0.1.4
#> [39] Rcpp_1.1.1-1.1 bookdown_0.46
#> [41] SummarizedExperiment_1.42.0 knitr_1.51
#> [43] R.utils_2.13.0 Matrix_1.7-5
#> [45] tidyselect_1.2.1 dichromat_2.0-0.1
#> [47] abind_1.4-8 yaml_2.3.12
#> [49] codetools_0.2-20 hwriter_1.3.2.1
#> [51] lattice_0.22-9 tibble_3.3.1
#> [53] plyr_1.8.9 Biobase_2.72.0
#> [55] withr_3.0.2 ShortRead_1.70.0
#> [57] S7_0.2.2 evaluate_1.0.5
#> [59] ggimage_0.3.5 gridGraphics_0.5-1
#> [61] sf_1.1-0 rJava_1.0-18
#> [63] units_1.0-1 proxy_0.4-29
#> [65] pillar_1.11.1 BiocManager_1.30.27
#> [67] MatrixGenerics_1.24.0 KernSmooth_2.23-26
#> [69] checkmate_2.3.4 ggfun_0.2.0
#> [71] sp_2.2-1 scales_1.4.0
#> [73] class_7.3-23 glue_1.8.1
#> [75] gdtools_0.5.0 tools_4.6.0
#> [77] interp_1.1-6 ggnewscale_0.5.2
#> [79] GenomicAlignments_1.48.0 ggiraph_0.9.6
#> [81] fs_2.1.0 grid_4.6.0
#> [83] tidyr_1.3.2 latticeExtra_0.6-31
#> [85] raster_3.6-32 cli_3.6.6
#> [87] rappdirs_0.3.4 rsvg_2.7.0
#> [89] fontBitstreamVera_0.1.1 S4Arrays_1.12.0
#> [91] gtable_0.3.6 yulab.utils_0.2.4
#> [93] R.methodsS3_1.8.2 fontquiver_0.2.1
#> [95] sass_0.4.10 digest_0.6.39
#> [97] classInt_0.4-11 ggplotify_0.1.3
#> [99] SparseArray_1.12.0 htmlwidgets_1.6.4
#> [101] farver_2.1.2 entropy_1.3.2
#> [103] htmltools_0.5.9 R.oo_1.27.1
#> [105] lifecycle_1.0.5 fontLiberation_0.1.0
#> [107] MASS_7.3-65