TileDBArray 1.10.0
TileDB implements a framework for local and remote storage of dense and sparse arrays.
We can use this as a DelayedArray
backend to provide an array-level abstraction,
thus allowing the data to be used in many places where an ordinary array or matrix might be used.
The TileDBArray package implements the necessary wrappers around TileDB-R
to support read/write operations on TileDB arrays within the DelayedArray framework.
TileDBArray
Creating a TileDBArray
is as easy as:
X <- matrix(rnorm(1000), ncol=10)
library(TileDBArray)
writeTileDBArray(X)
## <100 x 10> TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] -0.75871322 0.41298175 -0.01982588 . -1.55172626 -0.68799730
## [2,] -1.33915619 -1.77777393 -0.67945935 . -1.34934084 0.74345276
## [3,] 0.23629813 -0.87721386 0.65523937 . -0.18619768 -0.43505630
## [4,] 0.17660113 0.54293719 1.67654667 . 0.11455089 -0.01476603
## [5,] 1.48317674 -0.50868006 0.64664866 . -0.01870977 1.69279973
## ... . . . . . .
## [96,] 1.19636478 0.83938330 -1.13153439 . 0.6479819 -0.5210383
## [97,] 0.88386938 0.05392357 2.49034960 . -1.6625731 0.5033967
## [98,] 0.17621649 -0.04841620 0.78298475 . 0.2368585 0.2745226
## [99,] -0.72552024 1.07069886 -0.35489272 . 0.5503865 0.7026936
## [100,] 0.37901908 1.17328548 0.24524220 . -0.2037174 -1.1827543
Alternatively, we can use coercion methods:
as(X, "TileDBArray")
## <100 x 10> TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] -0.75871322 0.41298175 -0.01982588 . -1.55172626 -0.68799730
## [2,] -1.33915619 -1.77777393 -0.67945935 . -1.34934084 0.74345276
## [3,] 0.23629813 -0.87721386 0.65523937 . -0.18619768 -0.43505630
## [4,] 0.17660113 0.54293719 1.67654667 . 0.11455089 -0.01476603
## [5,] 1.48317674 -0.50868006 0.64664866 . -0.01870977 1.69279973
## ... . . . . . .
## [96,] 1.19636478 0.83938330 -1.13153439 . 0.6479819 -0.5210383
## [97,] 0.88386938 0.05392357 2.49034960 . -1.6625731 0.5033967
## [98,] 0.17621649 -0.04841620 0.78298475 . 0.2368585 0.2745226
## [99,] -0.72552024 1.07069886 -0.35489272 . 0.5503865 0.7026936
## [100,] 0.37901908 1.17328548 0.24524220 . -0.2037174 -1.1827543
This process works also for sparse matrices:
Y <- Matrix::rsparsematrix(1000, 1000, density=0.01)
writeTileDBArray(Y)
## <1000 x 1000> sparse TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,999] [,1000]
## [1,] 0 0 0 . 0 0
## [2,] 0 0 0 . 0 0
## [3,] 0 0 0 . 0 0
## [4,] 0 0 0 . 0 0
## [5,] 0 0 0 . 0 0
## ... . . . . . .
## [996,] 0 0 0 . 0 0
## [997,] 0 0 0 . 0 0
## [998,] 0 0 0 . 0 0
## [999,] 0 0 0 . 0 0
## [1000,] 0 0 0 . 0 0
Logical and integer matrices are supported:
writeTileDBArray(Y > 0)
## <1000 x 1000> sparse TileDBMatrix object of type "logical":
## [,1] [,2] [,3] ... [,999] [,1000]
## [1,] FALSE FALSE FALSE . FALSE FALSE
## [2,] FALSE FALSE FALSE . FALSE FALSE
## [3,] FALSE FALSE FALSE . FALSE FALSE
## [4,] FALSE FALSE FALSE . FALSE FALSE
## [5,] FALSE FALSE FALSE . FALSE FALSE
## ... . . . . . .
## [996,] FALSE FALSE FALSE . FALSE FALSE
## [997,] FALSE FALSE FALSE . FALSE FALSE
## [998,] FALSE FALSE FALSE . FALSE FALSE
## [999,] FALSE FALSE FALSE . FALSE FALSE
## [1000,] FALSE FALSE FALSE . FALSE FALSE
As are matrices with dimension names:
rownames(X) <- sprintf("GENE_%i", seq_len(nrow(X)))
colnames(X) <- sprintf("SAMP_%i", seq_len(ncol(X)))
writeTileDBArray(X)
## <100 x 10> TileDBMatrix object of type "double":
## SAMP_1 SAMP_2 SAMP_3 ... SAMP_9 SAMP_10
## GENE_1 -0.75871322 0.41298175 -0.01982588 . -1.55172626 -0.68799730
## GENE_2 -1.33915619 -1.77777393 -0.67945935 . -1.34934084 0.74345276
## GENE_3 0.23629813 -0.87721386 0.65523937 . -0.18619768 -0.43505630
## GENE_4 0.17660113 0.54293719 1.67654667 . 0.11455089 -0.01476603
## GENE_5 1.48317674 -0.50868006 0.64664866 . -0.01870977 1.69279973
## ... . . . . . .
## GENE_96 1.19636478 0.83938330 -1.13153439 . 0.6479819 -0.5210383
## GENE_97 0.88386938 0.05392357 2.49034960 . -1.6625731 0.5033967
## GENE_98 0.17621649 -0.04841620 0.78298475 . 0.2368585 0.2745226
## GENE_99 -0.72552024 1.07069886 -0.35489272 . 0.5503865 0.7026936
## GENE_100 0.37901908 1.17328548 0.24524220 . -0.2037174 -1.1827543
TileDBArray
sTileDBArray
s are simply DelayedArray
objects and can be manipulated as such.
The usual conventions for extracting data from matrix-like objects work as expected:
out <- as(X, "TileDBArray")
dim(out)
## [1] 100 10
head(rownames(out))
## [1] "GENE_1" "GENE_2" "GENE_3" "GENE_4" "GENE_5" "GENE_6"
head(out[,1])
## GENE_1 GENE_2 GENE_3 GENE_4 GENE_5 GENE_6
## -0.7587132 -1.3391562 0.2362981 0.1766011 1.4831767 -1.7503243
We can also perform manipulations like subsetting and arithmetic.
Note that these operations do not affect the data in the TileDB backend;
rather, they are delayed until the values are explicitly required,
hence the creation of the DelayedMatrix
object.
out[1:5,1:5]
## <5 x 5> DelayedMatrix object of type "double":
## SAMP_1 SAMP_2 SAMP_3 SAMP_4 SAMP_5
## GENE_1 -0.75871322 0.41298175 -0.01982588 -1.18089371 -0.22791701
## GENE_2 -1.33915619 -1.77777393 -0.67945935 -0.34479545 -0.33065733
## GENE_3 0.23629813 -0.87721386 0.65523937 -0.22675855 -1.86654122
## GENE_4 0.17660113 0.54293719 1.67654667 0.59987329 2.29846317
## GENE_5 1.48317674 -0.50868006 0.64664866 0.60346836 0.21578080
out * 2
## <100 x 10> DelayedMatrix object of type "double":
## SAMP_1 SAMP_2 SAMP_3 ... SAMP_9 SAMP_10
## GENE_1 -1.51742643 0.82596351 -0.03965176 . -3.10345251 -1.37599460
## GENE_2 -2.67831237 -3.55554786 -1.35891870 . -2.69868168 1.48690553
## GENE_3 0.47259625 -1.75442772 1.31047875 . -0.37239536 -0.87011261
## GENE_4 0.35320225 1.08587439 3.35309335 . 0.22910179 -0.02953206
## GENE_5 2.96635348 -1.01736012 1.29329732 . -0.03741955 3.38559946
## ... . . . . . .
## GENE_96 2.3927296 1.6787666 -2.2630688 . 1.2959637 -1.0420767
## GENE_97 1.7677388 0.1078471 4.9806992 . -3.3251462 1.0067934
## GENE_98 0.3524330 -0.0968324 1.5659695 . 0.4737170 0.5490452
## GENE_99 -1.4510405 2.1413977 -0.7097854 . 1.1007729 1.4053872
## GENE_100 0.7580382 2.3465710 0.4904844 . -0.4074348 -2.3655085
We can also do more complex matrix operations that are supported by DelayedArray:
colSums(out)
## SAMP_1 SAMP_2 SAMP_3 SAMP_4 SAMP_5 SAMP_6 SAMP_7
## -7.522580 -13.717892 8.895570 -1.041235 13.538297 1.441545 -4.783201
## SAMP_8 SAMP_9 SAMP_10
## -10.147279 -4.952959 -4.793807
out %*% runif(ncol(out))
## <100 x 1> DelayedMatrix object of type "double":
## y
## GENE_1 -2.7179909
## GENE_2 -3.1150108
## GENE_3 -0.6615898
## GENE_4 4.0556770
## GENE_5 4.5208846
## ... .
## GENE_96 1.0506217
## GENE_97 0.0158005
## GENE_98 1.5523678
## GENE_99 0.8067949
## GENE_100 -0.0965134
We can adjust some parameters for creating the backend with appropriate arguments to writeTileDBArray()
.
For example, the example below allows us to control the path to the backend
as well as the name of the attribute containing the data.
X <- matrix(rnorm(1000), ncol=10)
path <- tempfile()
writeTileDBArray(X, path=path, attr="WHEE")
## <100 x 10> TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 0.3599280 -0.4710653 -0.1803033 . -0.2032877 0.5827217
## [2,] 1.5820875 -0.5802424 -0.3140938 . -0.4040206 1.4070076
## [3,] 0.3286649 0.4902643 0.8484645 . -0.9551406 -1.3846514
## [4,] -0.5375075 1.8817169 -0.1574309 . 1.5933507 -1.3332408
## [5,] 0.4665883 1.4397156 0.6450615 . -0.3696888 -1.1330089
## ... . . . . . .
## [96,] 0.45966897 0.29577316 0.59593964 . 0.07465632 -0.01511324
## [97,] -0.58598624 1.41063727 -0.03990368 . 0.13778678 -0.69753626
## [98,] -1.64170434 0.26954219 -1.24965235 . -1.25207121 -0.86196953
## [99,] 2.49557165 1.16281828 -0.53044032 . -0.05763667 2.41891779
## [100,] -0.26216894 0.95278458 -1.44999944 . -1.82512800 -0.05030921
As these arguments cannot be passed during coercion, we instead provide global variables that can be set or unset to affect the outcome.
path2 <- tempfile()
setTileDBPath(path2)
as(X, "TileDBArray") # uses path2 to store the backend.
## <100 x 10> TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 0.3599280 -0.4710653 -0.1803033 . -0.2032877 0.5827217
## [2,] 1.5820875 -0.5802424 -0.3140938 . -0.4040206 1.4070076
## [3,] 0.3286649 0.4902643 0.8484645 . -0.9551406 -1.3846514
## [4,] -0.5375075 1.8817169 -0.1574309 . 1.5933507 -1.3332408
## [5,] 0.4665883 1.4397156 0.6450615 . -0.3696888 -1.1330089
## ... . . . . . .
## [96,] 0.45966897 0.29577316 0.59593964 . 0.07465632 -0.01511324
## [97,] -0.58598624 1.41063727 -0.03990368 . 0.13778678 -0.69753626
## [98,] -1.64170434 0.26954219 -1.24965235 . -1.25207121 -0.86196953
## [99,] 2.49557165 1.16281828 -0.53044032 . -0.05763667 2.41891779
## [100,] -0.26216894 0.95278458 -1.44999944 . -1.82512800 -0.05030921
sessionInfo()
## R version 4.3.0 RC (2023-04-13 r84257)
## Platform: x86_64-apple-darwin20 (64-bit)
## Running under: macOS Monterey 12.6.4
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] C/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] RcppSpdlog_0.0.12 TileDBArray_1.10.0 DelayedArray_0.26.2
## [4] S4Arrays_1.0.1 IRanges_2.34.0 S4Vectors_0.38.1
## [7] MatrixGenerics_1.12.0 matrixStats_0.63.0 BiocGenerics_0.46.0
## [10] Matrix_1.5-4 BiocStyle_2.28.0
##
## loaded via a namespace (and not attached):
## [1] crayon_1.5.2 cli_3.6.1 knitr_1.42
## [4] rlang_1.1.1 xfun_0.39 data.table_1.14.8
## [7] jsonlite_1.8.4 zoo_1.8-12 bit_4.0.5
## [10] htmltools_0.5.5 nanotime_0.3.7 sass_0.4.6
## [13] rmarkdown_2.21 grid_4.3.0 evaluate_0.21
## [16] jquerylib_0.1.4 fastmap_1.1.1 yaml_2.3.7
## [19] bookdown_0.34 BiocManager_1.30.20 compiler_4.3.0
## [22] Rcpp_1.0.10 RcppCCTZ_0.2.12 lattice_0.21-8
## [25] digest_0.6.31 R6_2.5.1 tiledb_0.19.1
## [28] bslib_0.4.2 bit64_4.0.5 tools_4.3.0
## [31] spdl_0.0.4 cachem_1.0.8