Contents

1 About metabinR

metabinR performs abundance- and composition-based binning on metagenomic samples directly from FASTA or FASTQ files. Abundance-based binning (AB) analyzes long k-mers (k > 8); composition-based binning (CB) analyzes short k-mers (k < 8); hierarchical binning (ABxCB) chains the two.

The heavy lifting is implemented in Java and called via rJava. From v2.0.0 the R API returns an S4 MetabinResult object and accepts Biostrings / ShortRead inputs in addition to file paths.

2 Installation

if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
BiocManager::install("metabinR")

A JDK (Java >= 17) must be available before installing metabinR.

3 Preparation

3.1 JVM heap size

JVM flags are passed through the java.parameters option. The -Xmx flag controls the maximum heap: -Xmx1500M or -Xmx3G, etc. Set this before loading the package.

options(java.parameters = "-Xmx1500M")
library(metabinR)
library(data.table)
library(dplyr)
library(ggplot2)
library(gridExtra)
library(cvms)
library(sabre)
library(Biostrings)

To customise additional JVM flags (on top of the -Xmx heap setting), set options(metabinR.jvm.flags = c("-XX:+UseG1GC", ...)) before the package is loaded. The defaults returned by metabinR_jvm_options() already enable G1GC and string deduplication.

4 The MetabinResult class

Each binning function returns a MetabinResult S4 object:

5 Abundance based binning example

The toy simulated metagenome contains 26,664 Illumina reads (13,332 pairs of 2x150bp) sampled from 10 bacterial genomes across two abundance classes (high vs. low).

Abundance ground truth:

abundances <- read.table(
    system.file("extdata", "distribution_0.txt", package = "metabinR"),
    col.names = c("genome_id", "abundance", "AB_id"))

Read-level ground truth:

reads.mapping <- fread(
        system.file("extdata", "reads_mapping.tsv.gz", package = "metabinR")) %>%
    merge(abundances[, c("genome_id", "AB_id")], by = "genome_id") %>%
    arrange(anonymous_read_id)

Run abundance-based binning with 10-mers into 2 clusters:

res.AB <- abundance_based_binning(
    system.file("extdata", "reads.metagenome.fasta.gz", package = "metabinR"),
    numOfClustersAB = 2,
    kMerSizeAB = 10,
    dryRun = FALSE,
    outputAB = "vignette"
)
res.AB
#> MetabinResult (AB)
#>   reads:     26,664
#>   clusters:  2
#>   inputs:    1 file(s)
#>     - /tmp/RtmpJeILFe/Rinst1dcc1c24cfe6fb/metabinR/extdata/reads.metagenome.fasta.gz

res.AB is a MetabinResult. Extract the per-read table and pull the cluster labels out of it:

assignments.AB <- as.data.frame(res.AB) %>% arrange(read_id)

abundance_based_binning() wrote one FASTA per cluster and a k-mer count histogram:

histogram.AB <- read.table("vignette__AB.histogram.tsv", header = TRUE)
ggplot(histogram.AB, aes(x = counts, y = frequency)) +
    geom_area() +
    labs(title = "kmer counts histogram") +
    theme_bw()

Evaluate against the abundance-class ground truth:

eval.AB.cvms <- cvms::evaluate(
    data = data.frame(
        prediction = as.character(assignments.AB$AB),
        target = as.character(reads.mapping$AB_id),
        stringsAsFactors = FALSE),
    target_col = "target",
    prediction_cols = "prediction",
    type = "binomial"
)
eval.AB.sabre <- sabre::vmeasure(
    as.character(assignments.AB$AB),
    as.character(reads.mapping$AB_id))

p <- cvms::plot_confusion_matrix(eval.AB.cvms) +
    labs(title = "Confusion Matrix",
         x = "Target Abundance Class",
         y = "Predicted Abundance Class")
tab <- as.data.frame(
    c(
        Accuracy    = round(eval.AB.cvms$Accuracy, 4),
        Specificity = round(eval.AB.cvms$Specificity, 4),
        Sensitivity = round(eval.AB.cvms$Sensitivity, 4),
        Fscore      = round(eval.AB.cvms$F1, 4),
        Kappa       = round(eval.AB.cvms$Kappa, 4),
        Vmeasure    = round(eval.AB.sabre$v_measure, 4)
    )
)
grid.arrange(p, ncol = 1)

knitr::kable(tab, caption = "AB binning evaluation", col.names = NULL)

Table 1: AB binning evaluation
Accuracy 0.8700
Specificity 0.9058
Sensitivity 0.7608
Fscore 0.7430
Kappa 0.6560
Vmeasure 0.3553

6 Composition based binning example

Read-level ground truth (bacterial genome of origin):

reads.mapping <- fread(
        system.file("extdata", "reads_mapping.tsv.gz", package = "metabinR")) %>%
    arrange(anonymous_read_id)

Run composition-based binning with 4-mers into 10 clusters:

res.CB <- composition_based_binning(
    system.file("extdata", "reads.metagenome.fasta.gz", package = "metabinR"),
    numOfClustersCB = 10,
    kMerSizeCB = 4,
    dryRun = TRUE,
    outputCB = "vignette"
)
assignments.CB <- as.data.frame(res.CB) %>% arrange(read_id)

As a pure clustering task, evaluate with extrinsic measures:

eval.CB.sabre <- sabre::vmeasure(
    as.character(assignments.CB$CB),
    as.character(reads.mapping$genome_id))
tab <- as.data.frame(
    c(
        Vmeasure     = round(eval.CB.sabre$v_measure, 4),
        Homogeneity  = round(eval.CB.sabre$homogeneity, 4),
        Completeness = round(eval.CB.sabre$completeness, 4)
    )
)
knitr::kable(tab, caption = "CB binning evaluation", col.names = NULL)

Table 2: CB binning evaluation
Vmeasure 0.2279
Homogeneity 0.1906
Completeness 0.2833

7 Hierarchical (2-step ABxCB) binning example

res.ABxCB <- hierarchical_binning(
    system.file("extdata", "reads.metagenome.fasta.gz", package = "metabinR"),
    numOfClustersAB = 2,
    kMerSizeAB = 10,
    kMerSizeCB = 4,
    dryRun = TRUE,
    outputC = "vignette"
)
assignments.ABxCB <- as.data.frame(res.ABxCB) %>% arrange(read_id)

eval.ABxCB.sabre <- sabre::vmeasure(
    as.character(assignments.ABxCB$ABxCB),
    as.character(reads.mapping$genome_id))
tab <- as.data.frame(
    c(
        Vmeasure     = round(eval.ABxCB.sabre$v_measure, 4),
        Homogeneity  = round(eval.ABxCB.sabre$homogeneity, 4),
        Completeness = round(eval.ABxCB.sabre$completeness, 4)
    )
)
knitr::kable(tab, caption = "ABxCB binning evaluation", col.names = NULL)

Table 3: ABxCB binning evaluation
Vmeasure 0.2830
Homogeneity 0.4722
Completeness 0.2021

8 In-memory inputs (Biostrings / ShortRead)

Instead of a path, you can pass a DNAStringSet, QualityScaledDNAStringSet, or ShortReadQ. The Java backend reads from disk, so non-file inputs are staged to a tempfile transparently.

reads <- Biostrings::readDNAStringSet(
    system.file("extdata", "reads.metagenome.fasta.gz", package = "metabinR"))

res.AB.mem <- abundance_based_binning(
    reads,
    numOfClustersAB = 2,
    kMerSizeAB = 10,
    dryRun = TRUE
)
identical(nrow(assignments(res.AB.mem)), nrow(assignments(res.AB)))
#> [1] TRUE

Clean up files written by the AB run:

unlink("vignette__*")

9 Session Info

utils::sessionInfo()
#> R version 4.6.0 RC (2026-04-17 r89917)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.4 LTS
#> 
#> Matrix products: default
#> BLAS:   /home/biocbuild/bbs-3.23-bioc/R/lib/libRblas.so 
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0  LAPACK version 3.12.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8          LC_NUMERIC=C                 
#>  [3] LC_TIME=en_GB                 LC_COLLATE=C                 
#>  [5] LC_MONETARY=en_US.UTF-8       LC_MESSAGES=en_US.UTF-8      
#>  [7] LC_PAPER=en_US.UTF-8          LC_NAME=en_US.UTF-8          
#>  [9] LC_ADDRESS=en_US.UTF-8        LC_TELEPHONE=en_US.UTF-8     
#> [11] LC_MEASUREMENT=en_US.UTF-8    LC_IDENTIFICATION=en_US.UTF-8
#> 
#> time zone: America/New_York
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats4    stats     graphics  grDevices utils     datasets  methods  
#> [8] base     
#> 
#> other attached packages:
#>  [1] Biostrings_2.80.0   Seqinfo_1.2.0       XVector_0.52.0     
#>  [4] IRanges_2.46.0      S4Vectors_0.50.0    BiocGenerics_0.58.0
#>  [7] generics_0.1.4      sabre_0.4.3         cvms_2.0.0         
#> [10] gridExtra_2.3       ggplot2_4.0.3       dplyr_1.2.1        
#> [13] data.table_1.18.2.1 metabinR_2.0.0      BiocStyle_2.40.0   
#> 
#> loaded via a namespace (and not attached):
#>   [1] DBI_1.3.0                   bitops_1.0-9               
#>   [3] deldir_2.0-4                pROC_1.19.0.1              
#>   [5] rlang_1.2.0                 magrittr_2.0.5             
#>   [7] otel_0.2.0                  matrixStats_1.5.0          
#>   [9] e1071_1.7-17                compiler_4.6.0             
#>  [11] systemfonts_1.3.2           png_0.1-9                  
#>  [13] vctrs_0.7.3                 pwalign_1.8.0              
#>  [15] pkgconfig_2.0.3             crayon_1.5.3               
#>  [17] fastmap_1.2.0               backports_1.5.1            
#>  [19] magick_2.9.1                labeling_0.4.3             
#>  [21] Rsamtools_2.28.0            rmarkdown_2.31             
#>  [23] tinytex_0.59                purrr_1.2.2                
#>  [25] xfun_0.57                   cachem_1.1.0               
#>  [27] cigarillo_1.2.0             jsonlite_2.0.0             
#>  [29] DelayedArray_0.38.0         BiocParallel_1.46.0        
#>  [31] jpeg_0.1-11                 terra_1.9-11               
#>  [33] parallel_4.6.0              R6_2.6.1                   
#>  [35] bslib_0.10.0                RColorBrewer_1.1-3         
#>  [37] GenomicRanges_1.64.0        jquerylib_0.1.4            
#>  [39] Rcpp_1.1.1-1.1              bookdown_0.46              
#>  [41] SummarizedExperiment_1.42.0 knitr_1.51                 
#>  [43] R.utils_2.13.0              Matrix_1.7-5               
#>  [45] tidyselect_1.2.1            dichromat_2.0-0.1          
#>  [47] abind_1.4-8                 yaml_2.3.12                
#>  [49] codetools_0.2-20            hwriter_1.3.2.1            
#>  [51] lattice_0.22-9              tibble_3.3.1               
#>  [53] plyr_1.8.9                  Biobase_2.72.0             
#>  [55] withr_3.0.2                 ShortRead_1.70.0           
#>  [57] S7_0.2.2                    evaluate_1.0.5             
#>  [59] ggimage_0.3.5               gridGraphics_0.5-1         
#>  [61] sf_1.1-0                    rJava_1.0-18               
#>  [63] units_1.0-1                 proxy_0.4-29               
#>  [65] pillar_1.11.1               BiocManager_1.30.27        
#>  [67] MatrixGenerics_1.24.0       KernSmooth_2.23-26         
#>  [69] checkmate_2.3.4             ggfun_0.2.0                
#>  [71] sp_2.2-1                    scales_1.4.0               
#>  [73] class_7.3-23                glue_1.8.1                 
#>  [75] gdtools_0.5.0               tools_4.6.0                
#>  [77] interp_1.1-6                ggnewscale_0.5.2           
#>  [79] GenomicAlignments_1.48.0    ggiraph_0.9.6              
#>  [81] fs_2.1.0                    grid_4.6.0                 
#>  [83] tidyr_1.3.2                 latticeExtra_0.6-31        
#>  [85] raster_3.6-32               cli_3.6.6                  
#>  [87] rappdirs_0.3.4              rsvg_2.7.0                 
#>  [89] fontBitstreamVera_0.1.1     S4Arrays_1.12.0            
#>  [91] gtable_0.3.6                yulab.utils_0.2.4          
#>  [93] R.methodsS3_1.8.2           fontquiver_0.2.1           
#>  [95] sass_0.4.10                 digest_0.6.39              
#>  [97] classInt_0.4-11             ggplotify_0.1.3            
#>  [99] SparseArray_1.12.0          htmlwidgets_1.6.4          
#> [101] farver_2.1.2                entropy_1.3.2              
#> [103] htmltools_0.5.9             R.oo_1.27.1                
#> [105] lifecycle_1.0.5             fontLiberation_0.1.0       
#> [107] MASS_7.3-65