TileDBArray 1.14.0
TileDB implements a framework for local and remote storage of dense and sparse arrays.
We can use this as a DelayedArray
backend to provide an array-level abstraction,
thus allowing the data to be used in many places where an ordinary array or matrix might be used.
The TileDBArray package implements the necessary wrappers around TileDB-R
to support read/write operations on TileDB arrays within the DelayedArray framework.
TileDBArray
Creating a TileDBArray
is as easy as:
X <- matrix(rnorm(1000), ncol=10)
library(TileDBArray)
writeTileDBArray(X)
## <100 x 10> TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] -2.480316053 -2.211536948 -0.068249219 . 0.954866165 -0.608248352
## [2,] -0.706247091 -0.921172831 0.172562919 . 1.276362192 -0.008757421
## [3,] -1.152821910 0.151118600 0.058687869 . -1.741704837 1.700653786
## [4,] 2.273958047 0.037099535 2.014636883 . 0.917307053 1.457872673
## [5,] -0.520180533 -1.526644569 -0.007060158 . 0.030536254 0.663200222
## ... . . . . . .
## [96,] -1.3934603 0.9218087 0.9642769 . -0.7311150 -0.3182954
## [97,] -0.1146807 0.6451473 -1.5693491 . -0.3575308 -0.1528309
## [98,] -0.7629428 -1.1953285 -0.2467730 . -0.4778209 -0.3146100
## [99,] -2.2331846 -0.3214879 -1.5213762 . 0.9574313 -0.3435485
## [100,] -1.6298656 -0.1527579 -1.3034255 . 1.3568400 -0.7233477
Alternatively, we can use coercion methods:
as(X, "TileDBArray")
## <100 x 10> TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] -2.480316053 -2.211536948 -0.068249219 . 0.954866165 -0.608248352
## [2,] -0.706247091 -0.921172831 0.172562919 . 1.276362192 -0.008757421
## [3,] -1.152821910 0.151118600 0.058687869 . -1.741704837 1.700653786
## [4,] 2.273958047 0.037099535 2.014636883 . 0.917307053 1.457872673
## [5,] -0.520180533 -1.526644569 -0.007060158 . 0.030536254 0.663200222
## ... . . . . . .
## [96,] -1.3934603 0.9218087 0.9642769 . -0.7311150 -0.3182954
## [97,] -0.1146807 0.6451473 -1.5693491 . -0.3575308 -0.1528309
## [98,] -0.7629428 -1.1953285 -0.2467730 . -0.4778209 -0.3146100
## [99,] -2.2331846 -0.3214879 -1.5213762 . 0.9574313 -0.3435485
## [100,] -1.6298656 -0.1527579 -1.3034255 . 1.3568400 -0.7233477
This process works also for sparse matrices:
Y <- Matrix::rsparsematrix(1000, 1000, density=0.01)
writeTileDBArray(Y)
## <1000 x 1000> sparse TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,999] [,1000]
## [1,] 0 0 0 . 0 0
## [2,] 0 0 0 . 0 0
## [3,] 0 0 0 . 0 0
## [4,] 0 0 0 . 0 0
## [5,] 0 0 0 . 0 0
## ... . . . . . .
## [996,] 0 0 0 . 0 0
## [997,] 0 0 0 . 0 0
## [998,] 0 0 0 . 0 0
## [999,] 0 0 0 . 0 0
## [1000,] 0 0 0 . 0 0
Logical and integer matrices are supported:
writeTileDBArray(Y > 0)
## <1000 x 1000> sparse TileDBMatrix object of type "logical":
## [,1] [,2] [,3] ... [,999] [,1000]
## [1,] FALSE FALSE FALSE . FALSE FALSE
## [2,] FALSE FALSE FALSE . FALSE FALSE
## [3,] FALSE FALSE FALSE . FALSE FALSE
## [4,] FALSE FALSE FALSE . FALSE FALSE
## [5,] FALSE FALSE FALSE . FALSE FALSE
## ... . . . . . .
## [996,] FALSE FALSE FALSE . FALSE FALSE
## [997,] FALSE FALSE FALSE . FALSE FALSE
## [998,] FALSE FALSE FALSE . FALSE FALSE
## [999,] FALSE FALSE FALSE . FALSE FALSE
## [1000,] FALSE FALSE FALSE . FALSE FALSE
As are matrices with dimension names:
rownames(X) <- sprintf("GENE_%i", seq_len(nrow(X)))
colnames(X) <- sprintf("SAMP_%i", seq_len(ncol(X)))
writeTileDBArray(X)
## <100 x 10> TileDBMatrix object of type "double":
## SAMP_1 SAMP_2 SAMP_3 ... SAMP_9 SAMP_10
## GENE_1 -2.480316053 -2.211536948 -0.068249219 . 0.954866165 -0.608248352
## GENE_2 -0.706247091 -0.921172831 0.172562919 . 1.276362192 -0.008757421
## GENE_3 -1.152821910 0.151118600 0.058687869 . -1.741704837 1.700653786
## GENE_4 2.273958047 0.037099535 2.014636883 . 0.917307053 1.457872673
## GENE_5 -0.520180533 -1.526644569 -0.007060158 . 0.030536254 0.663200222
## ... . . . . . .
## GENE_96 -1.3934603 0.9218087 0.9642769 . -0.7311150 -0.3182954
## GENE_97 -0.1146807 0.6451473 -1.5693491 . -0.3575308 -0.1528309
## GENE_98 -0.7629428 -1.1953285 -0.2467730 . -0.4778209 -0.3146100
## GENE_99 -2.2331846 -0.3214879 -1.5213762 . 0.9574313 -0.3435485
## GENE_100 -1.6298656 -0.1527579 -1.3034255 . 1.3568400 -0.7233477
TileDBArray
sTileDBArray
s are simply DelayedArray
objects and can be manipulated as such.
The usual conventions for extracting data from matrix-like objects work as expected:
out <- as(X, "TileDBArray")
dim(out)
## [1] 100 10
head(rownames(out))
## [1] "GENE_1" "GENE_2" "GENE_3" "GENE_4" "GENE_5" "GENE_6"
head(out[,1])
## GENE_1 GENE_2 GENE_3 GENE_4 GENE_5 GENE_6
## -2.4803161 -0.7062471 -1.1528219 2.2739580 -0.5201805 0.8447000
We can also perform manipulations like subsetting and arithmetic.
Note that these operations do not affect the data in the TileDB backend;
rather, they are delayed until the values are explicitly required,
hence the creation of the DelayedMatrix
object.
out[1:5,1:5]
## <5 x 5> DelayedMatrix object of type "double":
## SAMP_1 SAMP_2 SAMP_3 SAMP_4 SAMP_5
## GENE_1 -2.480316053 -2.211536948 -0.068249219 -1.428149720 0.475859845
## GENE_2 -0.706247091 -0.921172831 0.172562919 0.073441218 -0.764209568
## GENE_3 -1.152821910 0.151118600 0.058687869 -1.245379673 2.245319196
## GENE_4 2.273958047 0.037099535 2.014636883 -1.235676193 0.206861346
## GENE_5 -0.520180533 -1.526644569 -0.007060158 0.840129767 0.245940829
out * 2
## <100 x 10> DelayedMatrix object of type "double":
## SAMP_1 SAMP_2 SAMP_3 ... SAMP_9 SAMP_10
## GENE_1 -4.96063211 -4.42307390 -0.13649844 . 1.90973233 -1.21649670
## GENE_2 -1.41249418 -1.84234566 0.34512584 . 2.55272438 -0.01751484
## GENE_3 -2.30564382 0.30223720 0.11737574 . -3.48340967 3.40130757
## GENE_4 4.54791609 0.07419907 4.02927377 . 1.83461411 2.91574535
## GENE_5 -1.04036107 -3.05328914 -0.01412032 . 0.06107251 1.32640044
## ... . . . . . .
## GENE_96 -2.7869205 1.8436175 1.9285538 . -1.4622300 -0.6365908
## GENE_97 -0.2293614 1.2902946 -3.1386982 . -0.7150617 -0.3056619
## GENE_98 -1.5258857 -2.3906570 -0.4935460 . -0.9556418 -0.6292199
## GENE_99 -4.4663691 -0.6429757 -3.0427524 . 1.9148627 -0.6870970
## GENE_100 -3.2597313 -0.3055158 -2.6068510 . 2.7136799 -1.4466955
We can also do more complex matrix operations that are supported by DelayedArray:
colSums(out)
## SAMP_1 SAMP_2 SAMP_3 SAMP_4 SAMP_5 SAMP_6 SAMP_7 SAMP_8
## -6.819848 7.166784 8.037860 2.611655 13.285335 2.232655 9.088340 -2.662934
## SAMP_9 SAMP_10
## -1.519763 10.629035
out %*% runif(ncol(out))
## [,1]
## GENE_1 -4.070990163
## GENE_2 2.022773418
## GENE_3 0.146956309
## GENE_4 2.558143103
## GENE_5 -1.715603754
## GENE_6 -1.483859620
## GENE_7 -2.672435036
## GENE_8 -0.094269975
## GENE_9 1.594711579
## GENE_10 1.723948654
## GENE_11 0.939141899
## GENE_12 -3.584390929
## GENE_13 -0.361595496
## GENE_14 2.090862960
## GENE_15 2.881210045
## GENE_16 2.720140999
## GENE_17 3.569585322
## GENE_18 0.502604179
## GENE_19 -2.924548040
## GENE_20 0.167447684
## GENE_21 -2.085034255
## GENE_22 3.877332186
## GENE_23 -0.166031441
## GENE_24 -0.550793327
## GENE_25 0.097770259
## GENE_26 3.297816607
## GENE_27 0.960829489
## GENE_28 0.273141157
## GENE_29 1.406518728
## GENE_30 -5.039352825
## GENE_31 1.632233999
## GENE_32 -0.820231672
## GENE_33 -1.811175352
## GENE_34 1.707006162
## GENE_35 4.048662422
## GENE_36 1.847785701
## GENE_37 -0.022137469
## GENE_38 -1.770163122
## GENE_39 3.220042535
## GENE_40 0.008270668
## GENE_41 3.389151031
## GENE_42 1.212606018
## GENE_43 -5.167159962
## GENE_44 0.708261870
## GENE_45 0.014473165
## GENE_46 -2.589045065
## GENE_47 0.359388653
## GENE_48 -0.347480083
## GENE_49 -0.074953941
## GENE_50 -4.692923084
## GENE_51 0.545002780
## GENE_52 0.416712113
## GENE_53 -1.421916326
## GENE_54 0.377555276
## GENE_55 2.499389249
## GENE_56 3.124515151
## GENE_57 4.364299268
## GENE_58 -1.359491031
## GENE_59 0.081034472
## GENE_60 2.583310004
## GENE_61 3.531564592
## GENE_62 -1.506398012
## GENE_63 -0.673481071
## GENE_64 -0.450211066
## GENE_65 0.238411679
## GENE_66 -1.548429084
## GENE_67 -2.414992452
## GENE_68 2.163685228
## GENE_69 0.436985187
## GENE_70 -1.328889777
## GENE_71 -0.705351136
## GENE_72 1.095011667
## GENE_73 -1.683285412
## GENE_74 -2.552022444
## GENE_75 1.170783754
## GENE_76 -0.586706164
## GENE_77 1.330135487
## GENE_78 3.859994420
## GENE_79 2.658812572
## GENE_80 -1.355837618
## GENE_81 0.054346834
## GENE_82 0.001941655
## GENE_83 -0.453025906
## GENE_84 1.092713130
## GENE_85 2.188631805
## GENE_86 -0.370523411
## GENE_87 2.711369396
## GENE_88 1.751246950
## GENE_89 0.395700142
## GENE_90 2.024599430
## GENE_91 -1.040366540
## GENE_92 -3.563476902
## GENE_93 -0.936976943
## GENE_94 -0.085106877
## GENE_95 3.046330048
## GENE_96 -0.085000437
## GENE_97 -0.226824360
## GENE_98 -1.109893747
## GENE_99 -0.815969294
## GENE_100 -0.655069079
We can adjust some parameters for creating the backend with appropriate arguments to writeTileDBArray()
.
For example, the example below allows us to control the path to the backend
as well as the name of the attribute containing the data.
X <- matrix(rnorm(1000), ncol=10)
path <- tempfile()
writeTileDBArray(X, path=path, attr="WHEE")
## <100 x 10> TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 0.40824005 1.49673555 2.50791028 . -0.7459822 1.0169056
## [2,] 0.18419247 0.70145576 0.42332408 . -0.5820055 1.5437987
## [3,] 1.16447007 0.09201967 -2.15232875 . 0.9929936 1.2915748
## [4,] -0.59861248 0.35543796 1.40515219 . 1.4839408 -0.2844463
## [5,] 0.18323619 0.28319451 -0.72727248 . 1.8005431 -0.4498523
## ... . . . . . .
## [96,] -0.681974302 1.384312115 -1.019595123 . -0.178502867 0.669751574
## [97,] 0.434843208 -0.928759688 -0.805250318 . 0.480070714 -0.422204241
## [98,] 0.238130539 -1.043045235 -0.009429849 . -1.290246469 0.612706332
## [99,] 1.684672389 1.423369722 0.475088184 . 1.272742993 -0.001385024
## [100,] -0.709473173 0.934373248 0.892812771 . -1.470345005 -1.291007622
As these arguments cannot be passed during coercion, we instead provide global variables that can be set or unset to affect the outcome.
path2 <- tempfile()
setTileDBPath(path2)
as(X, "TileDBArray") # uses path2 to store the backend.
## <100 x 10> TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 0.40824005 1.49673555 2.50791028 . -0.7459822 1.0169056
## [2,] 0.18419247 0.70145576 0.42332408 . -0.5820055 1.5437987
## [3,] 1.16447007 0.09201967 -2.15232875 . 0.9929936 1.2915748
## [4,] -0.59861248 0.35543796 1.40515219 . 1.4839408 -0.2844463
## [5,] 0.18323619 0.28319451 -0.72727248 . 1.8005431 -0.4498523
## ... . . . . . .
## [96,] -0.681974302 1.384312115 -1.019595123 . -0.178502867 0.669751574
## [97,] 0.434843208 -0.928759688 -0.805250318 . 0.480070714 -0.422204241
## [98,] 0.238130539 -1.043045235 -0.009429849 . -1.290246469 0.612706332
## [99,] 1.684672389 1.423369722 0.475088184 . 1.272742993 -0.001385024
## [100,] -0.709473173 0.934373248 0.892812771 . -1.470345005 -1.291007622
sessionInfo()
## R version 4.4.0 alpha (2024-03-27 r86216)
## Platform: aarch64-apple-darwin20
## Running under: macOS Ventura 13.6.5
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] RcppSpdlog_0.0.16 TileDBArray_1.14.0 DelayedArray_0.30.0
## [4] SparseArray_1.4.0 S4Arrays_1.4.0 abind_1.4-5
## [7] IRanges_2.38.0 S4Vectors_0.42.0 MatrixGenerics_1.16.0
## [10] matrixStats_1.2.0 BiocGenerics_0.50.0 Matrix_1.7-0
## [13] BiocStyle_2.32.0
##
## loaded via a namespace (and not attached):
## [1] bit_4.0.5 jsonlite_1.8.8 compiler_4.4.0
## [4] BiocManager_1.30.22 crayon_1.5.2 Rcpp_1.0.12
## [7] nanoarrow_0.4.0.1 jquerylib_0.1.4 yaml_2.3.8
## [10] fastmap_1.1.1 lattice_0.22-6 R6_2.5.1
## [13] RcppCCTZ_0.2.12 XVector_0.44.0 tiledb_0.25.0
## [16] knitr_1.45 bookdown_0.38 bslib_0.6.2
## [19] rlang_1.1.3 cachem_1.0.8 xfun_0.43
## [22] sass_0.4.9 bit64_4.0.5 cli_3.6.2
## [25] zlibbioc_1.50.0 spdl_0.0.5 digest_0.6.35
## [28] grid_4.4.0 lifecycle_1.0.4 data.table_1.15.4
## [31] evaluate_0.23 nanotime_0.3.7 zoo_1.8-12
## [34] rmarkdown_2.26 tools_4.4.0 htmltools_0.5.8