TileDBArray 1.16.0
TileDB implements a framework for local and remote storage of dense and sparse arrays.
We can use this as a DelayedArray backend to provide an array-level abstraction,
thus allowing the data to be used in many places where an ordinary array or matrix might be used.
The TileDBArray package implements the necessary wrappers around TileDB-R
to support read/write operations on TileDB arrays within the DelayedArray framework.
TileDBArray
Creating a TileDBArray is as easy as:
X <- matrix(rnorm(1000), ncol=10)
library(TileDBArray)
writeTileDBArray(X)
## <100 x 10> TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] -0.19077577 1.46606301 2.59241280 . 1.36852331 0.03878149
## [2,] -0.87107976 0.58155023 0.47985639 . -1.74381373 1.40519052
## [3,] 0.37308658 -0.31367882 0.53266067 . -1.16329702 0.29000904
## [4,] 0.40944235 1.73879329 -0.48694378 . 0.42047123 -1.28243723
## [5,] -0.63837720 -1.07512355 0.06965862 . -0.63075183 0.78089338
## ... . . . . . .
## [96,] -0.93338713 -0.01000261 0.35246689 . 1.1291893 -0.2606430
## [97,] 0.72330606 -0.77674736 -1.89155646 . 0.9800840 0.7554257
## [98,] 0.39010687 0.60769615 0.97577553 . -1.1187665 0.3029906
## [99,] 1.64737301 -0.15677170 -1.46686265 . 2.1304327 -0.3644709
## [100,] -0.65357448 -1.34213964 0.16270691 . 0.7716056 -1.0431349
Alternatively, we can use coercion methods:
as(X, "TileDBArray")
## <100 x 10> TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] -0.19077577 1.46606301 2.59241280 . 1.36852331 0.03878149
## [2,] -0.87107976 0.58155023 0.47985639 . -1.74381373 1.40519052
## [3,] 0.37308658 -0.31367882 0.53266067 . -1.16329702 0.29000904
## [4,] 0.40944235 1.73879329 -0.48694378 . 0.42047123 -1.28243723
## [5,] -0.63837720 -1.07512355 0.06965862 . -0.63075183 0.78089338
## ... . . . . . .
## [96,] -0.93338713 -0.01000261 0.35246689 . 1.1291893 -0.2606430
## [97,] 0.72330606 -0.77674736 -1.89155646 . 0.9800840 0.7554257
## [98,] 0.39010687 0.60769615 0.97577553 . -1.1187665 0.3029906
## [99,] 1.64737301 -0.15677170 -1.46686265 . 2.1304327 -0.3644709
## [100,] -0.65357448 -1.34213964 0.16270691 . 0.7716056 -1.0431349
This process also works for sparse matrices:
Y <- Matrix::rsparsematrix(1000, 1000, density=0.01)
writeTileDBArray(Y)
## <1000 x 1000> sparse TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,999] [,1000]
## [1,] 0 0 0 . 0 0
## [2,] 0 0 0 . 0 0
## [3,] 0 0 0 . 0 0
## [4,] 0 0 0 . 0 0
## [5,] 0 0 0 . 0 0
## ... . . . . . .
## [996,] 0 0 0 . 0 0
## [997,] 0 0 0 . 0 0
## [998,] 0 0 0 . 0 0
## [999,] 0 0 0 . 0 0
## [1000,] 0 0 0 . 0 0
Logical and integer matrices are supported:
writeTileDBArray(Y > 0)
## <1000 x 1000> sparse TileDBMatrix object of type "logical":
## [,1] [,2] [,3] ... [,999] [,1000]
## [1,] FALSE FALSE FALSE . FALSE FALSE
## [2,] FALSE FALSE FALSE . FALSE FALSE
## [3,] FALSE FALSE FALSE . FALSE FALSE
## [4,] FALSE FALSE FALSE . FALSE FALSE
## [5,] FALSE FALSE FALSE . FALSE FALSE
## ... . . . . . .
## [996,] FALSE FALSE FALSE . FALSE FALSE
## [997,] FALSE FALSE FALSE . FALSE FALSE
## [998,] FALSE FALSE FALSE . FALSE FALSE
## [999,] FALSE FALSE FALSE . FALSE FALSE
## [1000,] FALSE FALSE FALSE . FALSE FALSE
As are matrices with dimension names:
rownames(X) <- sprintf("GENE_%i", seq_len(nrow(X)))
colnames(X) <- sprintf("SAMP_%i", seq_len(ncol(X)))
writeTileDBArray(X)
## <100 x 10> TileDBMatrix object of type "double":
## SAMP_1 SAMP_2 SAMP_3 ... SAMP_9 SAMP_10
## GENE_1 -0.19077577 1.46606301 2.59241280 . 1.36852331 0.03878149
## GENE_2 -0.87107976 0.58155023 0.47985639 . -1.74381373 1.40519052
## GENE_3 0.37308658 -0.31367882 0.53266067 . -1.16329702 0.29000904
## GENE_4 0.40944235 1.73879329 -0.48694378 . 0.42047123 -1.28243723
## GENE_5 -0.63837720 -1.07512355 0.06965862 . -0.63075183 0.78089338
## ... . . . . . .
## GENE_96 -0.93338713 -0.01000261 0.35246689 . 1.1291893 -0.2606430
## GENE_97 0.72330606 -0.77674736 -1.89155646 . 0.9800840 0.7554257
## GENE_98 0.39010687 0.60769615 0.97577553 . -1.1187665 0.3029906
## GENE_99 1.64737301 -0.15677170 -1.46686265 . 2.1304327 -0.3644709
## GENE_100 -0.65357448 -1.34213964 0.16270691 . 0.7716056 -1.0431349
TileDBArrays
TileDBArrays are simply DelayedArray objects and can be manipulated as such.
The usual conventions for extracting data from matrix-like objects work as expected:
out <- as(X, "TileDBArray")
dim(out)
## [1] 100 10
head(rownames(out))
## [1] "GENE_1" "GENE_2" "GENE_3" "GENE_4" "GENE_5" "GENE_6"
head(out[,1])
## GENE_1 GENE_2 GENE_3 GENE_4 GENE_5 GENE_6
## -0.19077577 -0.87107976 0.37308658 0.40944235 -0.63837720 -0.03444201
We can also perform manipulations like subsetting and arithmetic.
Note that these operations do not affect the data in the TileDB backend;
rather, they are delayed until the values are explicitly required,
hence the creation of the DelayedMatrix object.
out[1:5,1:5]
## <5 x 5> DelayedMatrix object of type "double":
## SAMP_1 SAMP_2 SAMP_3 SAMP_4 SAMP_5
## GENE_1 -0.19077577 1.46606301 2.59241280 1.57449797 1.36291397
## GENE_2 -0.87107976 0.58155023 0.47985639 0.74970970 1.17285272
## GENE_3 0.37308658 -0.31367882 0.53266067 0.19527805 0.31921267
## GENE_4 0.40944235 1.73879329 -0.48694378 1.83131921 -0.04294374
## GENE_5 -0.63837720 -1.07512355 0.06965862 1.70289497 0.83556755
out * 2
## <100 x 10> DelayedMatrix object of type "double":
## SAMP_1 SAMP_2 SAMP_3 ... SAMP_9 SAMP_10
## GENE_1 -0.3815515 2.9321260 5.1848256 . 2.73704661 0.07756298
## GENE_2 -1.7421595 1.1631005 0.9597128 . -3.48762745 2.81038104
## GENE_3 0.7461732 -0.6273576 1.0653213 . -2.32659404 0.58001809
## GENE_4 0.8188847 3.4775866 -0.9738876 . 0.84094246 -2.56487446
## GENE_5 -1.2767544 -2.1502471 0.1393172 . -1.26150367 1.56178676
## ... . . . . . .
## GENE_96 -1.86677426 -0.02000521 0.70493379 . 2.2583786 -0.5212860
## GENE_97 1.44661211 -1.55349472 -3.78311292 . 1.9601679 1.5108514
## GENE_98 0.78021373 1.21539231 1.95155106 . -2.2375330 0.6059811
## GENE_99 3.29474602 -0.31354340 -2.93372529 . 4.2608654 -0.7289419
## GENE_100 -1.30714896 -2.68427929 0.32541381 . 1.5432112 -2.0862699
We can also do more complex matrix operations that are supported by DelayedArray:
colSums(out)
## SAMP_1 SAMP_2 SAMP_3 SAMP_4 SAMP_5 SAMP_6 SAMP_7
## -7.461344 23.772452 -17.855251 -3.856237 8.146078 16.622155 3.217246
## SAMP_8 SAMP_9 SAMP_10
## -10.810978 2.279920 2.183133
out %*% runif(ncol(out))
## [,1]
## GENE_1 4.702378696
## GENE_2 0.462426085
## GENE_3 0.989586023
## GENE_4 0.982307627
## GENE_5 1.222306214
## GENE_6 0.358733696
## GENE_7 2.293122212
## GENE_8 -0.831383292
## GENE_9 0.273553507
## GENE_10 -0.166022976
## GENE_11 2.386632387
## GENE_12 0.372929132
## GENE_13 0.604073926
## GENE_14 -1.360170256
## GENE_15 -0.024642556
## GENE_16 0.197048418
## GENE_17 -3.227516973
## GENE_18 -0.319117790
## GENE_19 0.125606587
## GENE_20 2.133127265
## GENE_21 -3.660379367
## GENE_22 -0.597664592
## GENE_23 -0.596321615
## GENE_24 0.283258389
## GENE_25 -0.894699013
## GENE_26 0.396553082
## GENE_27 0.623805473
## GENE_28 -1.026481698
## GENE_29 2.459317069
## GENE_30 2.017622654
## GENE_31 -0.384273102
## GENE_32 0.119023669
## GENE_33 -1.286822754
## GENE_34 1.044131511
## GENE_35 0.975727374
## GENE_36 -0.251123510
## GENE_37 1.369088099
## GENE_38 1.061020875
## GENE_39 0.079795822
## GENE_40 -0.693839607
## GENE_41 0.222542810
## GENE_42 0.702534719
## GENE_43 -1.941998173
## GENE_44 0.954517958
## GENE_45 1.838290446
## GENE_46 0.517526166
## GENE_47 0.001732914
## GENE_48 -1.550871694
## GENE_49 0.136230843
## GENE_50 -1.207419173
## GENE_51 -0.017990757
## GENE_52 0.917056797
## GENE_53 -1.409260964
## GENE_54 0.130501762
## GENE_55 -2.336919939
## GENE_56 -0.678384777
## GENE_57 0.705133091
## GENE_58 0.944606988
## GENE_59 3.175479734
## GENE_60 -2.060327914
## GENE_61 0.560821521
## GENE_62 -1.010482367
## GENE_63 -1.189126581
## GENE_64 -2.184209188
## GENE_65 1.442073882
## GENE_66 1.514133509
## GENE_67 -0.344980010
## GENE_68 -0.378509631
## GENE_69 1.510939525
## GENE_70 1.033870625
## GENE_71 -2.264441314
## GENE_72 -1.351705059
## GENE_73 1.001294825
## GENE_74 1.250752294
## GENE_75 -0.975183843
## GENE_76 -0.604711192
## GENE_77 0.931676472
## GENE_78 -0.497187918
## GENE_79 -3.317168712
## GENE_80 -0.452205133
## GENE_81 -3.132136771
## GENE_82 -0.277911404
## GENE_83 -0.160902409
## GENE_84 0.500201876
## GENE_85 0.290935465
## GENE_86 0.880743056
## GENE_87 0.473242696
## GENE_88 -2.365861887
## GENE_89 0.932715227
## GENE_90 0.042215349
## GENE_91 1.470602388
## GENE_92 0.455379913
## GENE_93 -1.796817790
## GENE_94 0.546130620
## GENE_95 -0.118955626
## GENE_96 1.863574954
## GENE_97 0.200227752
## GENE_98 1.354321611
## GENE_99 1.070466525
## GENE_100 -2.580804556
We can adjust some parameters for creating the backend with appropriate arguments to writeTileDBArray().
For example, the code below allows us to control the path to the backend
as well as the name of the attribute containing the data.
X <- matrix(rnorm(1000), ncol=10)
path <- tempfile()
writeTileDBArray(X, path=path, attr="WHEE")
## <100 x 10> TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] -0.8302901 -0.2629753 0.8447242 . -0.7543306 -0.8826342
## [2,] 0.4825298 -1.1302678 -0.6943423 . 1.2882732 0.2186397
## [3,] -0.8815314 0.2320300 -0.7955889 . -0.3979149 1.7807074
## [4,] 0.5328442 0.4975232 1.5406959 . 1.1577459 -1.1672533
## [5,] 0.3877280 1.0784284 -2.0704452 . 0.0954797 -0.1774471
## ... . . . . . .
## [96,] -1.29125369 0.80271001 -0.00170399 . 2.05892713 1.14795435
## [97,] -0.70704373 -0.12169196 0.31754693 . 0.98355706 1.04152200
## [98,] 0.21329196 -0.74523862 0.35949313 . 1.42443849 -0.16125825
## [99,] -2.48901818 1.03710793 -0.04927774 . 0.92297332 1.29494821
## [100,] -1.28123103 -1.34987873 0.25157767 . 1.81731027 0.04795617
As these arguments cannot be passed during coercion, we instead provide global variables that can be set or unset to affect the outcome.
path2 <- tempfile()
setTileDBPath(path2)
as(X, "TileDBArray") # uses path2 to store the backend.
## <100 x 10> TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] -0.8302901 -0.2629753 0.8447242 . -0.7543306 -0.8826342
## [2,] 0.4825298 -1.1302678 -0.6943423 . 1.2882732 0.2186397
## [3,] -0.8815314 0.2320300 -0.7955889 . -0.3979149 1.7807074
## [4,] 0.5328442 0.4975232 1.5406959 . 1.1577459 -1.1672533
## [5,] 0.3877280 1.0784284 -2.0704452 . 0.0954797 -0.1774471
## ... . . . . . .
## [96,] -1.29125369 0.80271001 -0.00170399 . 2.05892713 1.14795435
## [97,] -0.70704373 -0.12169196 0.31754693 . 0.98355706 1.04152200
## [98,] 0.21329196 -0.74523862 0.35949313 . 1.42443849 -0.16125825
## [99,] -2.48901818 1.03710793 -0.04927774 . 0.92297332 1.29494821
## [100,] -1.28123103 -1.34987873 0.25157767 . 1.81731027 0.04795617
sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.1 LTS
##
## Matrix products: default
## BLAS: /home/biocbuild/bbs-3.20-bioc/R/lib/libRblas.so
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB LC_COLLATE=C
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: America/New_York
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] RcppSpdlog_0.0.18 TileDBArray_1.16.0 DelayedArray_0.32.0
## [4] SparseArray_1.6.0 S4Arrays_1.6.0 IRanges_2.40.0
## [7] abind_1.4-8 S4Vectors_0.44.0 MatrixGenerics_1.18.0
## [10] matrixStats_1.4.1 BiocGenerics_0.52.0 Matrix_1.7-1
## [13] BiocStyle_2.34.0
##
## loaded via a namespace (and not attached):
## [1] bit_4.5.0 jsonlite_1.8.9 compiler_4.4.1
## [4] BiocManager_1.30.25 crayon_1.5.3 Rcpp_1.0.13
## [7] nanoarrow_0.6.0 jquerylib_0.1.4 yaml_2.3.10
## [10] fastmap_1.2.0 lattice_0.22-6 R6_2.5.1
## [13] RcppCCTZ_0.2.12 XVector_0.46.0 tiledb_0.30.2
## [16] knitr_1.48 bookdown_0.41 bslib_0.8.0
## [19] rlang_1.1.4 cachem_1.1.0 xfun_0.48
## [22] sass_0.4.9 bit64_4.5.2 cli_3.6.3
## [25] zlibbioc_1.52.0 spdl_0.0.5 digest_0.6.37
## [28] grid_4.4.1 lifecycle_1.0.4 data.table_1.16.2
## [31] evaluate_1.0.1 nanotime_0.3.10 zoo_1.8-12
## [34] rmarkdown_2.28 tools_4.4.1 htmltools_0.5.8.1