scGOclust_vignette

Load packages

scGOclust is a package that leverages Gene Ontology to analyse the functional profile of cells with scRNA-seq data.

# load required libraries

library(Seurat)
#> Loading required package: SeuratObject
#> Loading required package: sp
#> 'SeuratObject' was built under R 4.3.1 but the current version is
#> 4.3.2; it is recomended that you reinstall 'SeuratObject' as the ABI
#> for R may have changed
#> 
#> Attaching package: 'SeuratObject'
#> The following object is masked from 'package:base':
#> 
#>     intersect
library(pheatmap)
library(httr)

## if (!require("devtools")) install.packages("devtools")

## install latest from source
## for reproducibility we do not update dependencies
# devtools::install_github("Papatheodorou-Group/scGOclust", upgrade = FALSE)

library(scGOclust)

2. Load input data

# get a gene to GO BP terms mapping table
# remove electronically inferred records

# sometimes ensembl complains about ssh certificate has expired, this is a known issue, run this code
httr::set_config(httr::config(ssl_verifypeer = FALSE)) 

#mmu_tbl = ensemblToGo(species = 'mmusculus', GO_linkage_type = c('experimental', 'phylogenetic', 'computational', 'author', 'curator' ))
#dme_tbl = ensemblToGo(species = 'dmelanogaster', GO_linkage_type = c('experimental', 'phylogenetic', 'computational', 'author', 'curator' ))

# here we load the example data for convenience
data(mmu_tbl)
data(dme_tbl)
# load the gene expression raw count objects
data(mmu_subset)
data(dme_subset)
ls()
#> [1] "dme_subset" "dme_tbl"    "mmu_subset" "mmu_tbl"

3. Build GO BP profile

## construct a Seurat object with GO BP as features

mmu_go_obj <- makeGOSeurat(ensembl_to_GO = mmu_tbl, feature_type = 'external_gene_name', seurat_obj = mmu_subset)
#> collect data
#> compute GO to cell matrix, might take a few secs
#> Warning: Data is of class matrix. Coercing to dgCMatrix.
#> time used: 0.66 secs
#> removing 0 all zero terms
#> returning GO Seurat object

dme_go_obj <- makeGOSeurat(ensembl_to_GO = dme_tbl, feature_type = 'external_gene_name', seurat_obj = dme_subset)
#> collect data
#> compute GO to cell matrix, might take a few secs
#> Warning: Data is of class matrix. Coercing to dgCMatrix.
#> time used: 0.2 secs
#> removing 0 all zero terms
#> returning GO Seurat object

4. Calculate cell type average GO BP profile

# specify the column with cell type annotation in seurat_obj@meta.data

mmu_ct_go <- getCellTypeGO(go_seurat_obj = mmu_go_obj, cell_type_col = 'cell_type_annotation')
#> perform normalization and log1p for mmu_go_obj
#> Normalizing layer: counts
#> Centering and scaling data matrix
#> As of Seurat v5, we recommend using AggregateExpression to perform pseudo-bulk analysis.
#> Names of identity class contain underscores ('_'), replacing with dashes ('-')
#> This message is displayed once per session.
dme_ct_go <- getCellTypeGO(go_seurat_obj = dme_go_obj, cell_type_col = 'annotation')
#> perform normalization and log1p for dme_go_obj
#> Normalizing layer: counts
#> Centering and scaling data matrix

5. Calculate within-species cell type functional similariy

# heatmap of Pearson's correlation coefficient of cell type average BP profiles within species

mmu_corr = cellTypeGOCorr(cell_type_go = mmu_ct_go, corr_method = 'pearson')
pheatmap(mmu_corr)

dme_corr = cellTypeGOCorr(cell_type_go = dme_ct_go, corr_method = 'pearson')
pheatmap(dme_corr)

5. Calculate cross-species cell type functional similariy


# calculate Pearson's correlation coefficient of cell type average BP profiles across species

corr = crossSpeciesCellTypeGOCorr(species_1 = 'mmusculus', species_2 = 'dmelanogaster', cell_type_go_sp1 = mmu_ct_go, cell_type_go_sp2 = dme_ct_go, corr_method = 'pearson')

# tries to put cells with higher values on the diagonal
# helpful when cross-species cell type similarity signal is less clear

plotCellTypeCorrHeatmap(corr, width = 9, height = 10)


# scale per column or row to see the relative similarity
plotCellTypeCorrHeatmap(corr, scale = 'column', width = 9, height = 10)

6. Dimensional reduction and UMAP visualization of cells with GO profile


# analyze the cell-by-GO BP profile as a count matrix
# Note that the example data has very few cells (for reducing data size), so I am using a small UMAP min_dist and small spread here. Default min_dist is 0.3 and spread is 1.

mmu_go_analyzed = analyzeGOSeurat(go_seurat_obj = mmu_go_obj, cell_type_col = 'cell_type_annotation', min.dist = 0.1, spread = 0.5)
#> perform normalization and log1p for mmu_go_obj
#> Normalizing layer: counts
#> Warning: The following arguments are not used: spread
#> Finding variable features for layer counts
#> Warning: The following arguments are not used: spread

#> Warning: The following arguments are not used: spread

#> Warning: The following arguments are not used: spread
#> Computing nearest neighbor graph
#> Computing SNN
#> Warning: The following arguments are not used: spread

#> Warning: The following arguments are not used: spread
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#> 
#> Number of nodes: 219
#> Number of edges: 9790
#> 
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.4765
#> Number of communities: 4
#> Elapsed time: 0 seconds
#> Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
#> To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
#> This message will be shown once per session
#> 17:05:29 UMAP embedding parameters a = 5.069 b = 1.003
#> Found more than one class "dist" in cache; using the first, from namespace 'spam'
#> Also defined by 'BiocGenerics'
#> 17:05:29 Read 219 rows and found 50 numeric columns
#> 17:05:29 Using Annoy for neighbor search, n_neighbors = 30
#> Found more than one class "dist" in cache; using the first, from namespace 'spam'
#> Also defined by 'BiocGenerics'
#> 17:05:29 Building Annoy index with metric = cosine, n_trees = 50
#> 0%   10   20   30   40   50   60   70   80   90   100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 17:05:29 Writing NN index file to temp file /var/folders/37/wf962dk574750g0xnnlxjwvm0000gp/T//RtmpDbhzmO/file123c6576dc9b7
#> 17:05:29 Searching Annoy index using 1 thread, search_k = 3000
#> 17:05:29 Annoy recall = 100%
#> 17:05:29 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
#> 17:05:30 Initializing from normalized Laplacian + noise (using RSpectra)
#> 17:05:30 Commencing optimization for 500 epochs, with 7794 positive edges
#> 17:05:30 Optimization finished
# UMAP plot of the analyzed cell-by-GO BP profile
# labeled by previously specified cell annotation column in meta.data

DimPlot(mmu_go_analyzed, label = TRUE) + NoLegend()

dme_go_analyzed = analyzeGOSeurat(go_seurat_obj = dme_go_obj, cell_type_col = 'annotation', min_dist=0.1, spread=0.5)
#> perform normalization and log1p for dme_go_obj
#> Normalizing layer: counts
#> Warning: The following arguments are not used: min_dist, spread
#> Finding variable features for layer counts
#> Warning: The following arguments are not used: min_dist, spread

#> Warning: The following arguments are not used: min_dist, spread

#> Warning: The following arguments are not used: min_dist, spread
#> Computing nearest neighbor graph
#> Computing SNN
#> Warning: The following arguments are not used: min_dist, spread

#> Warning: The following arguments are not used: min_dist, spread
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#> 
#> Number of nodes: 180
#> Number of edges: 6265
#> 
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.5638
#> Number of communities: 4
#> Elapsed time: 0 seconds
#> Warning: The following arguments are not used: min_dist
#> 17:05:31 UMAP embedding parameters a = 3.244 b = 1.447
#> Found more than one class "dist" in cache; using the first, from namespace 'spam'
#> Also defined by 'BiocGenerics'
#> 17:05:31 Read 180 rows and found 50 numeric columns
#> 17:05:31 Using Annoy for neighbor search, n_neighbors = 30
#> Found more than one class "dist" in cache; using the first, from namespace 'spam'
#> Also defined by 'BiocGenerics'
#> 17:05:31 Building Annoy index with metric = cosine, n_trees = 50
#> 0%   10   20   30   40   50   60   70   80   90   100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 17:05:31 Writing NN index file to temp file /var/folders/37/wf962dk574750g0xnnlxjwvm0000gp/T//RtmpDbhzmO/file123c67377bc6e
#> 17:05:31 Searching Annoy index using 1 thread, search_k = 3000
#> 17:05:31 Annoy recall = 100%
#> 17:05:31 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
#> 17:05:32 Initializing from normalized Laplacian + noise (using RSpectra)
#> 17:05:32 Commencing optimization for 500 epochs, with 6618 positive edges
#> 17:05:32 Optimization finished
DimPlot(dme_go_analyzed, label = TRUE) + NoLegend()

7. Get co-up and co-down regulated terms between pairs of cell types


## calculation takes a few minutes due to the Wilcox signed rank test

ct_shared_go = scGOclust::getCellTypeSharedGO(species_1 = 'mmusculus', species_2 = 'dmelanogaster', analyzed_go_seurat_sp1 = mmu_go_analyzed, analyzed_go_seurat_sp2 = dme_go_analyzed, cell_type_col_sp1 = 'cell_type_annotation', cell_type_col_sp2 = 'annotation', layer_use = 'data')
head(ct_shared_go$shared_sig_markers)

# query shared GO terms for specific cell type pairs

getCellTypeSharedTerms(shared_go = ct_shared_go,
                       cell_type_sp1 = 'intestine_Enteroendocrine cell', 
                       cell_type_sp2 = 'enteroendocrine cell',
                       return_full = FALSE)

# viola
sessionInfo()
#> R version 4.3.2 (2023-10-31)
#> Platform: aarch64-apple-darwin20 (64-bit)
#> Running under: macOS Sonoma 14.2.1
#> 
#> Matrix products: default
#> BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
#> 
#> locale:
#> [1] C/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#> 
#> time zone: Europe/London
#> tzcode source: internal
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] scGOclust_0.2.1    httr_1.4.7         pheatmap_1.0.12    Seurat_5.0.1      
#> [5] SeuratObject_5.0.1 sp_2.1-1          
#> 
#> loaded via a namespace (and not attached):
#>   [1] RcppAnnoy_0.0.21        splines_4.3.2           later_1.3.1            
#>   [4] bitops_1.0-7            filelock_1.0.2          tibble_3.2.1           
#>   [7] polyclip_1.10-6         XML_3.99-0.14           fastDummies_1.7.3      
#>  [10] lifecycle_1.0.4         globals_0.16.2          lattice_0.21-9         
#>  [13] MASS_7.3-60             magrittr_2.0.3          limma_3.56.2           
#>  [16] plotly_4.10.3           sass_0.4.7              rmarkdown_2.25         
#>  [19] jquerylib_0.1.4         yaml_2.3.7              httpuv_1.6.12          
#>  [22] sctransform_0.4.1       spam_2.10-0             spatstat.sparse_3.0-3  
#>  [25] reticulate_1.34.0       cowplot_1.1.1           pbapply_1.7-2          
#>  [28] DBI_1.1.3               RColorBrewer_1.1-3      abind_1.4-5            
#>  [31] zlibbioc_1.46.0         Rtsne_0.16              purrr_1.0.2            
#>  [34] BiocGenerics_0.46.0     RCurl_1.98-1.12         pracma_2.4.2           
#>  [37] rappdirs_0.3.3          GenomeInfoDbData_1.2.10 IRanges_2.34.1         
#>  [40] S4Vectors_0.38.2        ggrepel_0.9.4           irlba_2.3.5.1          
#>  [43] listenv_0.9.0           spatstat.utils_3.0-4    goftest_1.2-3          
#>  [46] RSpectra_0.16-1         spatstat.random_3.2-1   fitdistrplus_1.1-11    
#>  [49] parallelly_1.36.0       leiden_0.4.3.1          codetools_0.2-19       
#>  [52] xml2_1.3.5              tidyselect_1.2.0        farver_2.1.1           
#>  [55] matrixStats_1.1.0       stats4_4.3.2            BiocFileCache_2.8.0    
#>  [58] spatstat.explore_3.2-5  jsonlite_1.8.7          ellipsis_0.3.2         
#>  [61] progressr_0.14.0        ggridges_0.5.4          survival_3.5-7         
#>  [64] tools_4.3.2             progress_1.2.2          ica_1.0-3              
#>  [67] Rcpp_1.0.11             glue_1.6.2              gridExtra_2.3          
#>  [70] xfun_0.41               GenomeInfoDb_1.36.4     dplyr_1.1.4            
#>  [73] withr_2.5.2             fastmap_1.1.1           fansi_1.0.5            
#>  [76] digest_0.6.33           R6_2.5.1                mime_0.12              
#>  [79] colorspace_2.1-0        networkD3_0.4           scattermore_1.2        
#>  [82] tensor_1.5              spatstat.data_3.0-3     biomaRt_2.56.1         
#>  [85] RSQLite_2.3.1           utf8_1.2.4              tidyr_1.3.0            
#>  [88] generics_0.1.3          data.table_1.14.8       prettyunits_1.2.0      
#>  [91] htmlwidgets_1.6.2       slanter_0.2-0           uwot_0.1.16            
#>  [94] pkgconfig_2.0.3         gtable_0.3.4            blob_1.2.4             
#>  [97] lmtest_0.9-40           XVector_0.40.0          htmltools_0.5.7        
#> [100] dotCall64_1.1-0         scales_1.2.1            Biobase_2.60.0         
#> [103] png_0.1-8               knitr_1.45              rstudioapi_0.15.0      
#> [106] reshape2_1.4.4          nlme_3.1-163            curl_5.1.0             
#> [109] cachem_1.0.8            zoo_1.8-12              stringr_1.5.1          
#> [112] KernSmooth_2.23-22      parallel_4.3.2          miniUI_0.1.1.1         
#> [115] AnnotationDbi_1.62.2    pillar_1.9.0            grid_4.3.2             
#> [118] vctrs_0.6.4             RANN_2.6.1              promises_1.2.1         
#> [121] dbplyr_2.3.4            xtable_1.8-4            cluster_2.1.4          
#> [124] evaluate_0.23           cli_3.6.1               compiler_4.3.2         
#> [127] rlang_1.1.2             crayon_1.5.2            future.apply_1.11.0    
#> [130] labeling_0.4.3          plyr_1.8.9              stringi_1.8.1          
#> [133] viridisLite_0.4.2       deldir_1.0-9            munsell_0.5.0          
#> [136] Biostrings_2.68.1       lazyeval_0.2.2          spatstat.geom_3.2-7    
#> [139] Matrix_1.6-3            RcppHNSW_0.5.0          hms_1.1.3              
#> [142] patchwork_1.1.3         bit64_4.0.5             future_1.33.0          
#> [145] ggplot2_3.4.4           KEGGREST_1.40.1         shiny_1.7.5.1          
#> [148] highr_0.10              ROCR_1.0-11             igraph_1.5.1           
#> [151] memoise_2.0.1           bslib_0.5.1             bit_4.0.5