TileDBArray 1.8.0
TileDB implements a framework for local and remote storage of dense and sparse arrays.
We can use this as a DelayedArray backend to provide an array-level abstraction,
thus allowing the data to be used in many places where an ordinary array or matrix might be used.
The TileDBArray package implements the necessary wrappers around TileDB-R
to support read/write operations on TileDB arrays within the DelayedArray framework.
TileDBArray
Creating a TileDBArray is as easy as:
X <- matrix(rnorm(1000), ncol=10)
library(TileDBArray)
writeTileDBArray(X)
## <100 x 10> matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 1.10948655 0.67870374 0.63946919 . -0.64903038 0.30498071
## [2,] -1.14166356 0.18722590 -0.37684431 . -0.08778127 -0.34225088
## [3,] 0.08676118 0.37417664 -0.50525616 . -0.22008563 1.53662064
## [4,] -1.46287784 -0.23924618 1.87025298 . -0.18782777 -2.04921957
## [5,] 0.69105953 0.13391677 0.14442484 . 1.36264488 0.27929275
## ... . . . . . .
## [96,] -0.687462758 0.830348863 0.263463440 . 0.8609641 -0.2923603
## [97,] 1.409568802 -0.651175801 -0.823900909 . -0.9501900 0.3150091
## [98,] -0.001393451 0.821128318 -0.177940790 . 0.9296760 -0.2713504
## [99,] -0.658589764 0.753097348 -0.606092235 . -0.1691708 1.2085352
## [100,] 1.617004812 -0.574926341 0.728996524 . 0.2463510 -1.0353272
Alternatively, we can use coercion methods:
as(X, "TileDBArray")
## <100 x 10> matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 1.10948655 0.67870374 0.63946919 . -0.64903038 0.30498071
## [2,] -1.14166356 0.18722590 -0.37684431 . -0.08778127 -0.34225088
## [3,] 0.08676118 0.37417664 -0.50525616 . -0.22008563 1.53662064
## [4,] -1.46287784 -0.23924618 1.87025298 . -0.18782777 -2.04921957
## [5,] 0.69105953 0.13391677 0.14442484 . 1.36264488 0.27929275
## ... . . . . . .
## [96,] -0.687462758 0.830348863 0.263463440 . 0.8609641 -0.2923603
## [97,] 1.409568802 -0.651175801 -0.823900909 . -0.9501900 0.3150091
## [98,] -0.001393451 0.821128318 -0.177940790 . 0.9296760 -0.2713504
## [99,] -0.658589764 0.753097348 -0.606092235 . -0.1691708 1.2085352
## [100,] 1.617004812 -0.574926341 0.728996524 . 0.2463510 -1.0353272
This process also works for sparse matrices:
Y <- Matrix::rsparsematrix(1000, 1000, density=0.01)
writeTileDBArray(Y)
## <1000 x 1000> sparse matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,999] [,1000]
## [1,] 0 0 0 . 0 0
## [2,] 0 0 0 . 0 0
## [3,] 0 0 0 . 0 0
## [4,] 0 0 0 . 0 0
## [5,] 0 0 0 . 0 0
## ... . . . . . .
## [996,] 0.0 0.0 0.0 . 0 0
## [997,] 0.0 0.0 0.0 . 0 0
## [998,] 0.0 0.0 -1.6 . 0 0
## [999,] 0.0 0.0 0.0 . 0 0
## [1000,] 0.0 0.0 0.0 . 0 0
Logical and integer matrices are supported:
writeTileDBArray(Y > 0)
## <1000 x 1000> sparse matrix of class TileDBMatrix and type "logical":
## [,1] [,2] [,3] ... [,999] [,1000]
## [1,] FALSE FALSE FALSE . FALSE FALSE
## [2,] FALSE FALSE FALSE . FALSE FALSE
## [3,] FALSE FALSE FALSE . FALSE FALSE
## [4,] FALSE FALSE FALSE . FALSE FALSE
## [5,] FALSE FALSE FALSE . FALSE FALSE
## ... . . . . . .
## [996,] FALSE FALSE FALSE . FALSE FALSE
## [997,] FALSE FALSE FALSE . FALSE FALSE
## [998,] FALSE FALSE FALSE . FALSE FALSE
## [999,] FALSE FALSE FALSE . FALSE FALSE
## [1000,] FALSE FALSE FALSE . FALSE FALSE
As are matrices with dimension names:
rownames(X) <- sprintf("GENE_%i", seq_len(nrow(X)))
colnames(X) <- sprintf("SAMP_%i", seq_len(ncol(X)))
writeTileDBArray(X)
## <100 x 10> matrix of class TileDBMatrix and type "double":
## SAMP_1 SAMP_2 SAMP_3 ... SAMP_9 SAMP_10
## GENE_1 1.10948655 0.67870374 0.63946919 . -0.64903038 0.30498071
## GENE_2 -1.14166356 0.18722590 -0.37684431 . -0.08778127 -0.34225088
## GENE_3 0.08676118 0.37417664 -0.50525616 . -0.22008563 1.53662064
## GENE_4 -1.46287784 -0.23924618 1.87025298 . -0.18782777 -2.04921957
## GENE_5 0.69105953 0.13391677 0.14442484 . 1.36264488 0.27929275
## ... . . . . . .
## GENE_96 -0.687462758 0.830348863 0.263463440 . 0.8609641 -0.2923603
## GENE_97 1.409568802 -0.651175801 -0.823900909 . -0.9501900 0.3150091
## GENE_98 -0.001393451 0.821128318 -0.177940790 . 0.9296760 -0.2713504
## GENE_99 -0.658589764 0.753097348 -0.606092235 . -0.1691708 1.2085352
## GENE_100 1.617004812 -0.574926341 0.728996524 . 0.2463510 -1.0353272
TileDBArrays
TileDBArrays are simply DelayedArray objects and can be manipulated as such.
The usual conventions for extracting data from matrix-like objects work as expected:
out <- as(X, "TileDBArray")
dim(out)
## [1] 100 10
head(rownames(out))
## [1] "GENE_1" "GENE_2" "GENE_3" "GENE_4" "GENE_5" "GENE_6"
head(out[,1])
## GENE_1 GENE_2 GENE_3 GENE_4 GENE_5 GENE_6
## 1.10948655 -1.14166356 0.08676118 -1.46287784 0.69105953 1.00809007
We can also perform manipulations like subsetting and arithmetic.
Note that these operations do not affect the data in the TileDB backend;
rather, they are delayed until the values are explicitly required,
hence the creation of the DelayedMatrix object.
out[1:5,1:5]
## <5 x 5> matrix of class DelayedMatrix and type "double":
## SAMP_1 SAMP_2 SAMP_3 SAMP_4 SAMP_5
## GENE_1 1.10948655 0.67870374 0.63946919 -0.45147138 1.79851419
## GENE_2 -1.14166356 0.18722590 -0.37684431 0.78618913 -0.56076358
## GENE_3 0.08676118 0.37417664 -0.50525616 0.26990656 -0.26209638
## GENE_4 -1.46287784 -0.23924618 1.87025298 -0.87645851 2.14434563
## GENE_5 0.69105953 0.13391677 0.14442484 -0.99333248 -0.20718601
out * 2
## <100 x 10> matrix of class DelayedMatrix and type "double":
## SAMP_1 SAMP_2 SAMP_3 ... SAMP_9 SAMP_10
## GENE_1 2.2189731 1.3574075 1.2789384 . -1.2980608 0.6099614
## GENE_2 -2.2833271 0.3744518 -0.7536886 . -0.1755625 -0.6845018
## GENE_3 0.1735224 0.7483533 -1.0105123 . -0.4401713 3.0732413
## GENE_4 -2.9257557 -0.4784924 3.7405060 . -0.3756555 -4.0984391
## GENE_5 1.3821191 0.2678335 0.2888497 . 2.7252898 0.5585855
## ... . . . . . .
## GENE_96 -1.374925516 1.660697727 0.526926879 . 1.7219282 -0.5847206
## GENE_97 2.819137605 -1.302351602 -1.647801819 . -1.9003801 0.6300182
## GENE_98 -0.002786902 1.642256637 -0.355881580 . 1.8593521 -0.5427007
## GENE_99 -1.317179527 1.506194695 -1.212184470 . -0.3383416 2.4170704
## GENE_100 3.234009624 -1.149852682 1.457993047 . 0.4927020 -2.0706543
We can also do more complex matrix operations that are supported by DelayedArray:
colSums(out)
## SAMP_1 SAMP_2 SAMP_3 SAMP_4 SAMP_5 SAMP_6 SAMP_7
## 8.818284 11.279016 3.487372 -8.223192 -2.255284 11.363784 -11.167122
## SAMP_8 SAMP_9 SAMP_10
## -12.255966 20.200384 12.435115
out %*% runif(ncol(out))
## <100 x 1> matrix of class DelayedMatrix and type "double":
## y
## GENE_1 3.3409012
## GENE_2 -1.4795895
## GENE_3 -0.2156759
## GENE_4 -1.0363638
## GENE_5 1.1738708
## ... .
## GENE_96 3.0216254
## GENE_97 0.9886182
## GENE_98 1.0949151
## GENE_99 0.9793918
## GENE_100 -0.2829722
We can adjust some parameters for creating the backend with appropriate arguments to writeTileDBArray().
For example, the code below allows us to control the path to the backend
as well as the name of the attribute containing the data.
X <- matrix(rnorm(1000), ncol=10)
path <- tempfile()
writeTileDBArray(X, path=path, attr="WHEE")
## <100 x 10> matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] -0.1966568 -2.3094810 0.4332954 . 0.52544062 0.02948438
## [2,] 2.6771611 -1.1489553 -0.4059475 . -2.23327402 -0.70891487
## [3,] -1.1448585 -0.6226113 2.1156979 . -0.97221385 -0.98883238
## [4,] -0.7005082 0.5736088 0.5968611 . 0.70387703 -1.10467787
## [5,] 1.8419513 1.8527950 -1.1850064 . 0.68401880 -0.62052545
## ... . . . . . .
## [96,] -2.3161899 0.3861056 -0.9124989 . 1.0280584 2.4935248
## [97,] 0.1339696 -0.2151672 -1.4298700 . 1.1735952 -0.2784753
## [98,] 2.3259954 -0.4826189 2.2592829 . 0.8662012 -1.1194123
## [99,] -0.9034578 -0.7776481 -1.2646247 . 1.6995948 -0.6297408
## [100,] -0.8962552 -0.8983277 0.5184715 . -0.8221680 -0.7790969
As these arguments cannot be passed during coercion, we instead provide global variables that can be set or unset to affect the outcome.
path2 <- tempfile()
setTileDBPath(path2)
as(X, "TileDBArray") # uses path2 to store the backend.
## <100 x 10> matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] -0.1966568 -2.3094810 0.4332954 . 0.52544062 0.02948438
## [2,] 2.6771611 -1.1489553 -0.4059475 . -2.23327402 -0.70891487
## [3,] -1.1448585 -0.6226113 2.1156979 . -0.97221385 -0.98883238
## [4,] -0.7005082 0.5736088 0.5968611 . 0.70387703 -1.10467787
## [5,] 1.8419513 1.8527950 -1.1850064 . 0.68401880 -0.62052545
## ... . . . . . .
## [96,] -2.3161899 0.3861056 -0.9124989 . 1.0280584 2.4935248
## [97,] 0.1339696 -0.2151672 -1.4298700 . 1.1735952 -0.2784753
## [98,] 2.3259954 -0.4826189 2.2592829 . 0.8662012 -1.1194123
## [99,] -0.9034578 -0.7776481 -1.2646247 . 1.6995948 -0.6297408
## [100,] -0.8962552 -0.8983277 0.5184715 . -0.8221680 -0.7790969
sessionInfo()
## R version 4.2.1 Patched (2022-07-09 r82577)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
##
## locale:
## [1] C/en_US.UTF-8/en_US.UTF-8/C/en_GB/en_US.UTF-8
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] TileDBArray_1.8.0 DelayedArray_0.24.0 IRanges_2.32.0
## [4] S4Vectors_0.36.0 MatrixGenerics_1.10.0 matrixStats_0.62.0
## [7] BiocGenerics_0.44.0 Matrix_1.5-1 BiocStyle_2.26.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.9 bslib_0.4.0 compiler_4.2.1
## [4] BiocManager_1.30.19 jquerylib_0.1.4 tools_4.2.1
## [7] digest_0.6.30 bit_4.0.4 jsonlite_1.8.3
## [10] evaluate_0.17 lattice_0.20-45 nanotime_0.3.7
## [13] rlang_1.0.6 cli_3.4.1 RcppCCTZ_0.2.11
## [16] yaml_2.3.6 xfun_0.34 fastmap_1.1.0
## [19] stringr_1.4.1 knitr_1.40 sass_0.4.2
## [22] bit64_4.0.5 grid_4.2.1 data.table_1.14.4
## [25] R6_2.5.1 rmarkdown_2.17 bookdown_0.29
## [28] tiledb_0.16.0 magrittr_2.0.3 htmltools_0.5.3
## [31] stringi_1.7.8 cachem_1.0.6 zoo_1.8-11