TileDBArray 1.6.0
TileDB implements a framework for local and remote storage of dense and sparse arrays.
We can use this as a DelayedArray
backend to provide an array-level abstraction,
thus allowing the data to be used in many places where an ordinary array or matrix might be used.
The TileDBArray package implements the necessary wrappers around TileDB-R
to support read/write operations on TileDB arrays within the DelayedArray framework.
TileDBArray
Creating a TileDBArray
is as easy as:
X <- matrix(rnorm(1000), ncol=10)
library(TileDBArray)
writeTileDBArray(X)
## <100 x 10> matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 1.72602585 0.43955768 -0.71983643 . 0.68977380 0.25330644
## [2,] -1.92863745 0.21404920 -1.52165654 . -0.98688044 0.09532796
## [3,] 0.09414961 0.17309053 0.11152534 . 0.07290348 0.83499383
## [4,] -1.34465160 0.35555503 -0.52568276 . 0.01450333 0.58887487
## [5,] 0.40331292 1.32660963 0.49680861 . 1.00507620 0.58047471
## ... . . . . . .
## [96,] -1.12203601 -0.59814556 0.35822470 . -1.95005969 1.24758683
## [97,] 1.27248670 -0.96408570 -1.06377517 . -1.72356448 0.08408464
## [98,] -0.05879468 0.61644725 -0.72983461 . -0.09462854 -0.84746922
## [99,] 1.72711199 3.16381459 0.16048838 . -0.48311567 -0.09565278
## [100,] 0.26491121 -0.98859317 1.61561234 . -0.40570814 0.82899982
Alternatively, we can use coercion methods:
as(X, "TileDBArray")
## <100 x 10> matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 1.72602585 0.43955768 -0.71983643 . 0.68977380 0.25330644
## [2,] -1.92863745 0.21404920 -1.52165654 . -0.98688044 0.09532796
## [3,] 0.09414961 0.17309053 0.11152534 . 0.07290348 0.83499383
## [4,] -1.34465160 0.35555503 -0.52568276 . 0.01450333 0.58887487
## [5,] 0.40331292 1.32660963 0.49680861 . 1.00507620 0.58047471
## ... . . . . . .
## [96,] -1.12203601 -0.59814556 0.35822470 . -1.95005969 1.24758683
## [97,] 1.27248670 -0.96408570 -1.06377517 . -1.72356448 0.08408464
## [98,] -0.05879468 0.61644725 -0.72983461 . -0.09462854 -0.84746922
## [99,] 1.72711199 3.16381459 0.16048838 . -0.48311567 -0.09565278
## [100,] 0.26491121 -0.98859317 1.61561234 . -0.40570814 0.82899982
This process works also for sparse matrices:
Y <- Matrix::rsparsematrix(1000, 1000, density=0.01)
writeTileDBArray(Y)
## <1000 x 1000> sparse matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,999] [,1000]
## [1,] 0 0 0 . 0 0
## [2,] 0 0 0 . 0 0
## [3,] 0 0 0 . 0 0
## [4,] 0 0 0 . 0 0
## [5,] 0 0 0 . 0 0
## ... . . . . . .
## [996,] 0 0 0 . 0 0
## [997,] 0 0 0 . 0 0
## [998,] 0 0 0 . 0 0
## [999,] 0 0 0 . 0 0
## [1000,] 0 0 0 . 0 0
Logical and integer matrices are supported:
writeTileDBArray(Y > 0)
## <1000 x 1000> sparse matrix of class TileDBMatrix and type "logical":
## [,1] [,2] [,3] ... [,999] [,1000]
## [1,] FALSE FALSE FALSE . FALSE FALSE
## [2,] FALSE FALSE FALSE . FALSE FALSE
## [3,] FALSE FALSE FALSE . FALSE FALSE
## [4,] FALSE FALSE FALSE . FALSE FALSE
## [5,] FALSE FALSE FALSE . FALSE FALSE
## ... . . . . . .
## [996,] FALSE FALSE FALSE . FALSE FALSE
## [997,] FALSE FALSE FALSE . FALSE FALSE
## [998,] FALSE FALSE FALSE . FALSE FALSE
## [999,] FALSE FALSE FALSE . FALSE FALSE
## [1000,] FALSE FALSE FALSE . FALSE FALSE
As are matrices with dimension names:
rownames(X) <- sprintf("GENE_%i", seq_len(nrow(X)))
colnames(X) <- sprintf("SAMP_%i", seq_len(ncol(X)))
writeTileDBArray(X)
## <100 x 10> matrix of class TileDBMatrix and type "double":
## SAMP_1 SAMP_2 SAMP_3 ... SAMP_9 SAMP_10
## GENE_1 1.72602585 0.43955768 -0.71983643 . 0.68977380 0.25330644
## GENE_2 -1.92863745 0.21404920 -1.52165654 . -0.98688044 0.09532796
## GENE_3 0.09414961 0.17309053 0.11152534 . 0.07290348 0.83499383
## GENE_4 -1.34465160 0.35555503 -0.52568276 . 0.01450333 0.58887487
## GENE_5 0.40331292 1.32660963 0.49680861 . 1.00507620 0.58047471
## ... . . . . . .
## GENE_96 -1.12203601 -0.59814556 0.35822470 . -1.95005969 1.24758683
## GENE_97 1.27248670 -0.96408570 -1.06377517 . -1.72356448 0.08408464
## GENE_98 -0.05879468 0.61644725 -0.72983461 . -0.09462854 -0.84746922
## GENE_99 1.72711199 3.16381459 0.16048838 . -0.48311567 -0.09565278
## GENE_100 0.26491121 -0.98859317 1.61561234 . -0.40570814 0.82899982
TileDBArray
sTileDBArray
s are simply DelayedArray
objects and can be manipulated as such.
The usual conventions for extracting data from matrix-like objects work as expected:
out <- as(X, "TileDBArray")
dim(out)
## [1] 100 10
head(rownames(out))
## [1] "GENE_1" "GENE_2" "GENE_3" "GENE_4" "GENE_5" "GENE_6"
head(out[,1])
## GENE_1 GENE_2 GENE_3 GENE_4 GENE_5 GENE_6
## 1.72602585 -1.92863745 0.09414961 -1.34465160 0.40331292 -0.61534410
We can also perform manipulations like subsetting and arithmetic.
Note that these operations do not affect the data in the TileDB backend;
rather, they are delayed until the values are explicitly required,
hence the creation of the DelayedMatrix
object.
out[1:5,1:5]
## <5 x 5> matrix of class DelayedMatrix and type "double":
## SAMP_1 SAMP_2 SAMP_3 SAMP_4 SAMP_5
## GENE_1 1.72602585 0.43955768 -0.71983643 1.23386919 -0.34884502
## GENE_2 -1.92863745 0.21404920 -1.52165654 -1.05059326 0.39375312
## GENE_3 0.09414961 0.17309053 0.11152534 0.24705821 -1.92693771
## GENE_4 -1.34465160 0.35555503 -0.52568276 -0.37223353 0.11815716
## GENE_5 0.40331292 1.32660963 0.49680861 0.85734770 0.99038527
out * 2
## <100 x 10> matrix of class DelayedMatrix and type "double":
## SAMP_1 SAMP_2 SAMP_3 ... SAMP_9 SAMP_10
## GENE_1 3.4520517 0.8791154 -1.4396729 . 1.37954761 0.50661288
## GENE_2 -3.8572749 0.4280984 -3.0433131 . -1.97376089 0.19065591
## GENE_3 0.1882992 0.3461811 0.2230507 . 0.14580697 1.66998765
## GENE_4 -2.6893032 0.7111101 -1.0513655 . 0.02900666 1.17774973
## GENE_5 0.8066258 2.6532193 0.9936172 . 2.01015241 1.16094943
## ... . . . . . .
## GENE_96 -2.2440720 -1.1962911 0.7164494 . -3.9001194 2.4951737
## GENE_97 2.5449734 -1.9281714 -2.1275503 . -3.4471290 0.1681693
## GENE_98 -0.1175894 1.2328945 -1.4596692 . -0.1892571 -1.6949384
## GENE_99 3.4542240 6.3276292 0.3209768 . -0.9662313 -0.1913056
## GENE_100 0.5298224 -1.9771863 3.2312247 . -0.8114163 1.6579996
We can also do more complex matrix operations that are supported by DelayedArray:
colSums(out)
## SAMP_1 SAMP_2 SAMP_3 SAMP_4 SAMP_5 SAMP_6
## 21.68601027 -6.28260026 3.28892075 13.13047527 7.87728653 -7.77582661
## SAMP_7 SAMP_8 SAMP_9 SAMP_10
## -0.09776693 -11.60699942 -11.55760515 -0.71970792
out %*% runif(ncol(out))
## <100 x 1> matrix of class DelayedMatrix and type "double":
## y
## GENE_1 4.9745872
## GENE_2 -5.9373816
## GENE_3 -0.7386153
## GENE_4 -2.1790847
## GENE_5 2.3392248
## ... .
## GENE_96 -1.776945
## GENE_97 -1.253874
## GENE_98 -3.004146
## GENE_99 3.525255
## GENE_100 1.889253
We can adjust some parameters for creating the backend with appropriate arguments to writeTileDBArray()
.
For example, the example below allows us to control the path to the backend
as well as the name of the attribute containing the data.
X <- matrix(rnorm(1000), ncol=10)
path <- tempfile()
writeTileDBArray(X, path=path, attr="WHEE")
## <100 x 10> matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 0.45922798 0.28819376 -1.90518050 . -0.42471988 -1.78362023
## [2,] 0.77159438 -0.38122349 0.94989769 . 1.78588434 -0.28203242
## [3,] 0.34788023 0.85900328 -0.09973514 . 1.20697855 -0.02729627
## [4,] 0.52448237 -0.38593604 1.48056694 . -0.32099059 -0.42868487
## [5,] 1.91129884 0.17617489 2.00361279 . -0.74537090 0.09379967
## ... . . . . . .
## [96,] -1.41201283 -0.53214539 -0.03704517 . -0.51623062 0.71660635
## [97,] 0.80484616 -1.49971214 0.40791294 . 0.31714110 0.49464432
## [98,] 0.61221367 0.30285905 -0.28963755 . -0.23958691 0.93068270
## [99,] 1.23303391 0.87707182 0.37516770 . 0.08228600 -1.35507213
## [100,] -0.60133805 -0.47019577 0.19688402 . -0.02666644 0.77034298
As these arguments cannot be passed during coercion, we instead provide global variables that can be set or unset to affect the outcome.
path2 <- tempfile()
setTileDBPath(path2)
as(X, "TileDBArray") # uses path2 to store the backend.
## <100 x 10> matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 0.45922798 0.28819376 -1.90518050 . -0.42471988 -1.78362023
## [2,] 0.77159438 -0.38122349 0.94989769 . 1.78588434 -0.28203242
## [3,] 0.34788023 0.85900328 -0.09973514 . 1.20697855 -0.02729627
## [4,] 0.52448237 -0.38593604 1.48056694 . -0.32099059 -0.42868487
## [5,] 1.91129884 0.17617489 2.00361279 . -0.74537090 0.09379967
## ... . . . . . .
## [96,] -1.41201283 -0.53214539 -0.03704517 . -0.51623062 0.71660635
## [97,] 0.80484616 -1.49971214 0.40791294 . 0.31714110 0.49464432
## [98,] 0.61221367 0.30285905 -0.28963755 . -0.23958691 0.93068270
## [99,] 1.23303391 0.87707182 0.37516770 . 0.08228600 -1.35507213
## [100,] -0.60133805 -0.47019577 0.19688402 . -0.02666644 0.77034298
sessionInfo()
## R version 4.2.0 RC (2022-04-19 r82224 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows Server x64 (build 20348)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=C
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] TileDBArray_1.6.0 DelayedArray_0.22.0 IRanges_2.30.0
## [4] S4Vectors_0.34.0 MatrixGenerics_1.8.0 matrixStats_0.62.0
## [7] BiocGenerics_0.42.0 Matrix_1.4-1 BiocStyle_2.24.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.8.3 bslib_0.3.1 compiler_4.2.0
## [4] BiocManager_1.30.17 jquerylib_0.1.4 tools_4.2.0
## [7] digest_0.6.29 bit_4.0.4 jsonlite_1.8.0
## [10] evaluate_0.15 lattice_0.20-45 nanotime_0.3.6
## [13] rlang_1.0.2 cli_3.3.0 RcppCCTZ_0.2.10
## [16] yaml_2.3.5 xfun_0.30 fastmap_1.1.0
## [19] stringr_1.4.0 knitr_1.38 sass_0.4.1
## [22] bit64_4.0.5 grid_4.2.0 data.table_1.14.2
## [25] R6_2.5.1 rmarkdown_2.14 bookdown_0.26
## [28] tiledb_0.12.0 magrittr_2.0.3 htmltools_0.5.2
## [31] stringi_1.7.6 zoo_1.8-10