scGOclust
is a package that leverages Gene Ontology to
analyse the functional profile of cells with scRNA-seq data.
# load required libraries
library(Seurat)
#> Loading required package: SeuratObject
#> Loading required package: sp
#> 'SeuratObject' was built under R 4.3.1 but the current version is
#> 4.3.2; it is recomended that you reinstall 'SeuratObject' as the ABI
#> for R may have changed
#>
#> Attaching package: 'SeuratObject'
#> The following object is masked from 'package:base':
#>
#> intersect
library(pheatmap)
library(httr)
## if (!require("devtools")) install.packages("devtools")
## install latest from source
## for reproducibility we do not update dependencies
# devtools::install_github("Papatheodorou-Group/scGOclust", upgrade = FALSE)
library(scGOclust)
# get a gene to GO BP terms mapping table
# remove electronically inferred records
# sometimes ensembl complains about ssh certificate has expired, this is a known issue, run this code
httr::set_config(httr::config(ssl_verifypeer = FALSE))
#mmu_tbl = ensemblToGo(species = 'mmusculus', GO_linkage_type = c('experimental', 'phylogenetic', 'computational', 'author', 'curator' ))
#dme_tbl = ensemblToGo(species = 'dmelanogaster', GO_linkage_type = c('experimental', 'phylogenetic', 'computational', 'author', 'curator' ))
# here we load the example data for convenience
data(mmu_tbl)
data(dme_tbl)
## construct a Seurat object with GO BP as features
mmu_go_obj <- makeGOSeurat(ensembl_to_GO = mmu_tbl, feature_type = 'external_gene_name', seurat_obj = mmu_subset)
#> collect data
#> compute GO to cell matrix, might take a few secs
#> Warning: Data is of class matrix. Coercing to dgCMatrix.
#> time used: 0.66 secs
#> removing 0 all zero terms
#> returning GO Seurat object
dme_go_obj <- makeGOSeurat(ensembl_to_GO = dme_tbl, feature_type = 'external_gene_name', seurat_obj = dme_subset)
#> collect data
#> compute GO to cell matrix, might take a few secs
#> Warning: Data is of class matrix. Coercing to dgCMatrix.
#> time used: 0.2 secs
#> removing 0 all zero terms
#> returning GO Seurat object
# specify the column with cell type annotation in seurat_obj@meta.data
mmu_ct_go <- getCellTypeGO(go_seurat_obj = mmu_go_obj, cell_type_col = 'cell_type_annotation')
#> perform normalization and log1p for mmu_go_obj
#> Normalizing layer: counts
#> Centering and scaling data matrix
#> As of Seurat v5, we recommend using AggregateExpression to perform pseudo-bulk analysis.
#> Names of identity class contain underscores ('_'), replacing with dashes ('-')
#> This message is displayed once per session.
dme_ct_go <- getCellTypeGO(go_seurat_obj = dme_go_obj, cell_type_col = 'annotation')
#> perform normalization and log1p for dme_go_obj
#> Normalizing layer: counts
#> Centering and scaling data matrix
# calculate Pearson's correlation coefficient of cell type average BP profiles across species
corr = crossSpeciesCellTypeGOCorr(species_1 = 'mmusculus', species_2 = 'dmelanogaster', cell_type_go_sp1 = mmu_ct_go, cell_type_go_sp2 = dme_ct_go, corr_method = 'pearson')
# analyze the cell-by-GO BP profile as a count matrix
# Note that the example data has very few cells (for reducing data size), so I am using a small UMAP min_dist and small spread here. Default min_dist is 0.3 and spread is 1.
mmu_go_analyzed = analyzeGOSeurat(go_seurat_obj = mmu_go_obj, cell_type_col = 'cell_type_annotation', min.dist = 0.1, spread = 0.5)
#> perform normalization and log1p for mmu_go_obj
#> Normalizing layer: counts
#> Warning: The following arguments are not used: spread
#> Finding variable features for layer counts
#> Warning: The following arguments are not used: spread
#> Warning: The following arguments are not used: spread
#> Warning: The following arguments are not used: spread
#> Computing nearest neighbor graph
#> Computing SNN
#> Warning: The following arguments are not used: spread
#> Warning: The following arguments are not used: spread
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 219
#> Number of edges: 9790
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.4765
#> Number of communities: 4
#> Elapsed time: 0 seconds
#> Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
#> To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
#> This message will be shown once per session
#> 17:05:29 UMAP embedding parameters a = 5.069 b = 1.003
#> Found more than one class "dist" in cache; using the first, from namespace 'spam'
#> Also defined by 'BiocGenerics'
#> 17:05:29 Read 219 rows and found 50 numeric columns
#> 17:05:29 Using Annoy for neighbor search, n_neighbors = 30
#> Found more than one class "dist" in cache; using the first, from namespace 'spam'
#> Also defined by 'BiocGenerics'
#> 17:05:29 Building Annoy index with metric = cosine, n_trees = 50
#> 0% 10 20 30 40 50 60 70 80 90 100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 17:05:29 Writing NN index file to temp file /var/folders/37/wf962dk574750g0xnnlxjwvm0000gp/T//RtmpDbhzmO/file123c6576dc9b7
#> 17:05:29 Searching Annoy index using 1 thread, search_k = 3000
#> 17:05:29 Annoy recall = 100%
#> 17:05:29 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
#> 17:05:30 Initializing from normalized Laplacian + noise (using RSpectra)
#> 17:05:30 Commencing optimization for 500 epochs, with 7794 positive edges
#> 17:05:30 Optimization finished
# UMAP plot of the analyzed cell-by-GO BP profile
# labeled by previously specified cell annotation column in meta.data
DimPlot(mmu_go_analyzed, label = TRUE) + NoLegend()
dme_go_analyzed = analyzeGOSeurat(go_seurat_obj = dme_go_obj, cell_type_col = 'annotation', min_dist=0.1, spread=0.5)
#> perform normalization and log1p for dme_go_obj
#> Normalizing layer: counts
#> Warning: The following arguments are not used: min_dist, spread
#> Finding variable features for layer counts
#> Warning: The following arguments are not used: min_dist, spread
#> Warning: The following arguments are not used: min_dist, spread
#> Warning: The following arguments are not used: min_dist, spread
#> Computing nearest neighbor graph
#> Computing SNN
#> Warning: The following arguments are not used: min_dist, spread
#> Warning: The following arguments are not used: min_dist, spread
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 180
#> Number of edges: 6265
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.5638
#> Number of communities: 4
#> Elapsed time: 0 seconds
#> Warning: The following arguments are not used: min_dist
#> 17:05:31 UMAP embedding parameters a = 3.244 b = 1.447
#> Found more than one class "dist" in cache; using the first, from namespace 'spam'
#> Also defined by 'BiocGenerics'
#> 17:05:31 Read 180 rows and found 50 numeric columns
#> 17:05:31 Using Annoy for neighbor search, n_neighbors = 30
#> Found more than one class "dist" in cache; using the first, from namespace 'spam'
#> Also defined by 'BiocGenerics'
#> 17:05:31 Building Annoy index with metric = cosine, n_trees = 50
#> 0% 10 20 30 40 50 60 70 80 90 100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 17:05:31 Writing NN index file to temp file /var/folders/37/wf962dk574750g0xnnlxjwvm0000gp/T//RtmpDbhzmO/file123c67377bc6e
#> 17:05:31 Searching Annoy index using 1 thread, search_k = 3000
#> 17:05:31 Annoy recall = 100%
#> 17:05:31 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
#> 17:05:32 Initializing from normalized Laplacian + noise (using RSpectra)
#> 17:05:32 Commencing optimization for 500 epochs, with 6618 positive edges
#> 17:05:32 Optimization finished
## calculation takes a few minutes due to the Wilcox signed rank test
ct_shared_go = scGOclust::getCellTypeSharedGO(species_1 = 'mmusculus', species_2 = 'dmelanogaster', analyzed_go_seurat_sp1 = mmu_go_analyzed, analyzed_go_seurat_sp2 = dme_go_analyzed, cell_type_col_sp1 = 'cell_type_annotation', cell_type_col_sp2 = 'annotation', layer_use = 'data')
# query shared GO terms for specific cell type pairs
getCellTypeSharedTerms(shared_go = ct_shared_go,
cell_type_sp1 = 'intestine_Enteroendocrine cell',
cell_type_sp2 = 'enteroendocrine cell',
return_full = FALSE)
# viola
sessionInfo()
#> R version 4.3.2 (2023-10-31)
#> Platform: aarch64-apple-darwin20 (64-bit)
#> Running under: macOS Sonoma 14.2.1
#>
#> Matrix products: default
#> BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
#>
#> locale:
#> [1] C/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#>
#> time zone: Europe/London
#> tzcode source: internal
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] scGOclust_0.2.1 httr_1.4.7 pheatmap_1.0.12 Seurat_5.0.1
#> [5] SeuratObject_5.0.1 sp_2.1-1
#>
#> loaded via a namespace (and not attached):
#> [1] RcppAnnoy_0.0.21 splines_4.3.2 later_1.3.1
#> [4] bitops_1.0-7 filelock_1.0.2 tibble_3.2.1
#> [7] polyclip_1.10-6 XML_3.99-0.14 fastDummies_1.7.3
#> [10] lifecycle_1.0.4 globals_0.16.2 lattice_0.21-9
#> [13] MASS_7.3-60 magrittr_2.0.3 limma_3.56.2
#> [16] plotly_4.10.3 sass_0.4.7 rmarkdown_2.25
#> [19] jquerylib_0.1.4 yaml_2.3.7 httpuv_1.6.12
#> [22] sctransform_0.4.1 spam_2.10-0 spatstat.sparse_3.0-3
#> [25] reticulate_1.34.0 cowplot_1.1.1 pbapply_1.7-2
#> [28] DBI_1.1.3 RColorBrewer_1.1-3 abind_1.4-5
#> [31] zlibbioc_1.46.0 Rtsne_0.16 purrr_1.0.2
#> [34] BiocGenerics_0.46.0 RCurl_1.98-1.12 pracma_2.4.2
#> [37] rappdirs_0.3.3 GenomeInfoDbData_1.2.10 IRanges_2.34.1
#> [40] S4Vectors_0.38.2 ggrepel_0.9.4 irlba_2.3.5.1
#> [43] listenv_0.9.0 spatstat.utils_3.0-4 goftest_1.2-3
#> [46] RSpectra_0.16-1 spatstat.random_3.2-1 fitdistrplus_1.1-11
#> [49] parallelly_1.36.0 leiden_0.4.3.1 codetools_0.2-19
#> [52] xml2_1.3.5 tidyselect_1.2.0 farver_2.1.1
#> [55] matrixStats_1.1.0 stats4_4.3.2 BiocFileCache_2.8.0
#> [58] spatstat.explore_3.2-5 jsonlite_1.8.7 ellipsis_0.3.2
#> [61] progressr_0.14.0 ggridges_0.5.4 survival_3.5-7
#> [64] tools_4.3.2 progress_1.2.2 ica_1.0-3
#> [67] Rcpp_1.0.11 glue_1.6.2 gridExtra_2.3
#> [70] xfun_0.41 GenomeInfoDb_1.36.4 dplyr_1.1.4
#> [73] withr_2.5.2 fastmap_1.1.1 fansi_1.0.5
#> [76] digest_0.6.33 R6_2.5.1 mime_0.12
#> [79] colorspace_2.1-0 networkD3_0.4 scattermore_1.2
#> [82] tensor_1.5 spatstat.data_3.0-3 biomaRt_2.56.1
#> [85] RSQLite_2.3.1 utf8_1.2.4 tidyr_1.3.0
#> [88] generics_0.1.3 data.table_1.14.8 prettyunits_1.2.0
#> [91] htmlwidgets_1.6.2 slanter_0.2-0 uwot_0.1.16
#> [94] pkgconfig_2.0.3 gtable_0.3.4 blob_1.2.4
#> [97] lmtest_0.9-40 XVector_0.40.0 htmltools_0.5.7
#> [100] dotCall64_1.1-0 scales_1.2.1 Biobase_2.60.0
#> [103] png_0.1-8 knitr_1.45 rstudioapi_0.15.0
#> [106] reshape2_1.4.4 nlme_3.1-163 curl_5.1.0
#> [109] cachem_1.0.8 zoo_1.8-12 stringr_1.5.1
#> [112] KernSmooth_2.23-22 parallel_4.3.2 miniUI_0.1.1.1
#> [115] AnnotationDbi_1.62.2 pillar_1.9.0 grid_4.3.2
#> [118] vctrs_0.6.4 RANN_2.6.1 promises_1.2.1
#> [121] dbplyr_2.3.4 xtable_1.8-4 cluster_2.1.4
#> [124] evaluate_0.23 cli_3.6.1 compiler_4.3.2
#> [127] rlang_1.1.2 crayon_1.5.2 future.apply_1.11.0
#> [130] labeling_0.4.3 plyr_1.8.9 stringi_1.8.1
#> [133] viridisLite_0.4.2 deldir_1.0-9 munsell_0.5.0
#> [136] Biostrings_2.68.1 lazyeval_0.2.2 spatstat.geom_3.2-7
#> [139] Matrix_1.6-3 RcppHNSW_0.5.0 hms_1.1.3
#> [142] patchwork_1.1.3 bit64_4.0.5 future_1.33.0
#> [145] ggplot2_3.4.4 KEGGREST_1.40.1 shiny_1.7.5.1
#> [148] highr_0.10 ROCR_1.0-11 igraph_1.5.1
#> [151] memoise_2.0.1 bslib_0.5.1 bit_4.0.5