When a matrix contains a lot of zero, there is no need to store all its values. But only the non-zero values, for example the following matrix:
\[\begin{equation} M = \bordermatrix{ ~ & \text{sp. A} & \text{sp. B} & {sp. C} \cr \text{site 1} & 1 & 0 & 0 \cr \text{site 2} & 0 & 1 & 1 \cr \text{site 3} & 0 & 1 & 0 \cr} \end{equation}\]
can be stored using only a matrix with non zero-values \(S\), with row index \(i\) and column index \(j\):
\[\begin{equation} S = \begin{matrix} i & j & \text{value} \\ \text{site 1} & \text{sp. A} & 1 \\ \text{site 2} & \text{sp. B} & 1 \\ \text{site 3} & \text{sp. B} & 1 \\ \text{site 2} & \text{sp. C} & 1 \end{matrix} \end{equation}\]
If your site-species matrix is big, it may contain a lot zero because it is very unlikely that all species are present at all sites when looking at thousands of species and hundreds of sites. Thus, the sparse matrix notation would save a good amount of lines (look how \(S\) compared to \(M\)).
# Generate a matrix with 1000 species and 200 sites
= matrix(sample(c(0, 0, 0, 0, 0, 0, 1), replace = TRUE, size = 200000),
my_mat ncol = 1000, nrow = 200)
colnames(my_mat) = paste0("sp", seq_len(ncol(my_mat)))
rownames(my_mat) = paste0("site", seq_len(nrow(my_mat)))
1:5, 1:5] my_mat[
## sp1 sp2 sp3 sp4 sp5
## site1 0 0 1 0 0
## site2 0 1 1 0 1
## site3 0 0 0 0 0
## site4 0 0 0 0 0
## site5 0 0 0 1 0
funrar
lets you use sparse matrices directly using a
function implemented in the Matrix
package. When your
matrix is filled with 0s it can be quicker to use sparse matrices. To
know if you should use sparse matrices you can compute the filling of
your matrix, i.e. the percentage of non-zero cells in your matrix:
= 1 - sum(my_mat == 0)/(ncol(my_mat)*nrow(my_mat))
filling
filling
## [1] 0.142755
To convert from a normal matrix to a sparse matrix you can use
as(my_mat, "dgCMatrix")
:
library(Matrix)
## Warning: le package 'Matrix' a été compilé avec la version R 4.2.1
= as(my_mat, "dgCMatrix")
sparse_mat
is(my_mat, "dgCMatrix")
## [1] FALSE
is(sparse_mat, "dgCMatrix")
## [1] TRUE
it completely changes the structure as well as the memory it takes in the RAM:
# Regular Matrix
str(my_mat)
## num [1:200, 1:1000] 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:200] "site1" "site2" "site3" "site4" ...
## ..$ : chr [1:1000] "sp1" "sp2" "sp3" "sp4" ...
print(object.size(my_mat), units = "Kb")
## 1638 Kb
# Sparse Matrix from 'Matrix' package
str(sparse_mat)
## Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
## ..@ i : int [1:28551] 18 20 30 40 42 43 45 47 56 57 ...
## ..@ p : int [1:1001] 0 32 62 95 119 142 176 205 236 262 ...
## ..@ Dim : int [1:2] 200 1000
## ..@ Dimnames:List of 2
## .. ..$ : chr [1:200] "site1" "site2" "site3" "site4" ...
## .. ..$ : chr [1:1000] "sp1" "sp2" "sp3" "sp4" ...
## ..@ x : num [1:28551] 1 1 1 1 1 1 1 1 1 1 ...
## ..@ factors : list()
print(object.size(sparse_mat), units = "Kb")
## 415.1 Kb
sparse matrices reduce the amount of RAM necessary to store them. The more zeros there are in a given matrix the more RAM will be spared when turning it into a sparse matrix.
We can now compare the performances of the algorithms in rarity
indices computation between a sparse matrix and a regular one, using the
popular microbenchmark
R package:
library(funrar)
# Get a table of traits
= data.frame(trait = runif(ncol(my_mat), 0, 1))
trait_df rownames(trait_df) = paste0("sp", seq_len(ncol(my_mat)))
# Compute distance matrix
= compute_dist_matrix(trait_df)
dist_mat
if (requireNamespace("microbenchmark", quietly = TRUE)) {
::microbenchmark(
microbenchmarkregular = distinctiveness(my_mat, dist_mat),
sparse = distinctiveness(sparse_mat, dist_mat))
}
We generate matrices with different filling rate and compare the speed of regular matrix and sparse matrices computation.
= function(n_zero = 5, nrow = 200, ncol = 1000) {
generate_matrix matrix(sample(c(rep(0, n_zero), 1), replace = TRUE, size = nrow*ncol),
ncol = ncol, nrow = nrow)
}
= function(my_mat) {
mat_filling sum(my_mat != 0)/(ncol(my_mat)*nrow(my_mat))
}
= function(n_zero) {
sparse_and_mat = generate_matrix(n_zero)
my_mat colnames(my_mat) = paste0("sp", seq_len(ncol(my_mat)))
rownames(my_mat) = paste0("site", seq_len(nrow(my_mat)))
= as(my_mat, "dgCMatrix")
sparse_mat
return(list(mat = my_mat, sparse = sparse_mat))
}
= c(0, 1, 2, 49, 99)
n_zero_vector names(n_zero_vector) = n_zero_vector
= lapply(n_zero_vector, sparse_and_mat)
all_mats
mat_filling(all_mats$`0`$mat)
## [1] 1
mat_filling(all_mats$`99`$mat)
## [1] 0.009845
Now we can compare the speed of the algorithms:
if (requireNamespace("microbenchmark", quietly = TRUE)) {
= microbenchmark::microbenchmark(
mat_bench mat_0 = distinctiveness(all_mats$`0`$mat, dist_mat),
sparse_0 = distinctiveness(all_mats$`0`$sparse, dist_mat),
mat_1 = distinctiveness(all_mats$`1`$mat, dist_mat),
sparse_1 = distinctiveness(all_mats$`1`$sparse, dist_mat),
mat_2 = distinctiveness(all_mats$`2`$mat, dist_mat),
sparse_2 = distinctiveness(all_mats$`2`$sparse, dist_mat),
mat_49 = distinctiveness(all_mats$`49`$mat, dist_mat),
sparse_49 = distinctiveness(all_mats$`49`$sparse, dist_mat),
mat_99 = distinctiveness(all_mats$`99`$mat, dist_mat),
sparse_99 = distinctiveness(all_mats$`99`$sparse, dist_mat),
times = 5)
autoplot(mat_bench)
}