pareg 1.0.0
library(tidyverse)
library(ComplexHeatmap)
library(circlize)
library(GGally)
library(pareg)
# Load the pre-computed pathway similarity measures shipped with pareg.
data(pathway_similarities, package = "pareg")
# Fix the RNG seed so that any randomized step below is reproducible.
set.seed(42)
Pathway similarities describe how similar two pathways are (you’re welcome). For example, when interpreting pathways as gene sets, one could count how many genes are shared between two sets. Many more sophisticated methods, such as the Jaccard index, exist (Gu and Huebschmann 2021).
`pareg` provides various pre-computed similarity measures (`jaccard`, `overlap_coefficient`, `semantic`) for selected pathway databases (`C2@CP:KEGG`, `C5@GO:BP`) in matrix form.
# Pick the Jaccard similarities of the KEGG pathways and convert them with
# as_dense_sim() so that they can be indexed like an ordinary matrix.
mat <- as_dense_sim(pathway_similarities[["C2@CP:KEGG"]][["jaccard"]])

# Show the top-left 3 x 3 corner as a quick sanity check.
mat[seq_len(3), seq_len(3)]
## hsa00970 hsa05340 hsa04621
## hsa00970 1 0.000000000 0.000000000
## hsa05340 0 1.000000000 0.008196721
## hsa04621 0 0.008196721 1.000000000
# Visualize the KEGG similarity matrix. Row and column labels are hidden;
# the color scale runs linearly from white (0) to black (1).
Heatmap(
  mat,
  col = colorRamp2(breaks = c(0, 1), colors = c("white", "black")),
  name = "similarity",
  show_column_names = FALSE,
  show_row_names = FALSE
)
On the Gene Ontology’s Biological Process subcategory, we can observe how much pathway similarity measures can differ from each other.
# Build a table with one row per pathway pair and one column per similarity
# measure for the GO:BP collection.
df_sim <- pathway_similarities[["C5@GO:BP"]] %>%
  map_dfr(
    function(sim) {
      # Entries that are NULL contribute no rows.
      if (is.null(sim)) {
        return(NULL)
      }
      # Long format: (rowname, name, value) for every pair of pathways.
      dense <- as.data.frame(as_dense_sim(sim))
      pivot_longer(rownames_to_column(dense), -rowname)
    },
    .id = "measure"
  ) %>%
  # Keep only pairs with a strictly positive similarity.
  filter(value > 0) %>%
  # One column per measure; a pair lacking a positive value for some measure
  # ends up as NA in that column (hence the "removed rows" warnings below).
  pivot_wider(names_from = measure, values_from = value) %>%
  select(-c(rowname, name))

# Pairwise comparison of the similarity measures.
ggpairs(data = df_sim) +
  theme_minimal()
## Warning: Removed 514552 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 514552 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 586334 rows containing missing values
## Warning: Removed 514552 rows containing missing values (geom_point).
## Warning: Removed 514552 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 586334 rows containing missing values
## Warning: Removed 586334 rows containing missing values (geom_point).
## Removed 586334 rows containing missing values (geom_point).
## Warning: Removed 71782 rows containing non-finite values (stat_density).
# Print the R version, platform and package versions of this session.
sessionInfo()
## R version 4.2.0 RC (2022-04-19 r82224)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.4 LTS
##
## Matrix products: default
## BLAS: /home/biocbuild/bbs-3.15-bioc/R/lib/libRblas.so
## LAPACK: /home/biocbuild/bbs-3.15-bioc/R/lib/libRlapack.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB LC_COLLATE=C
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] GGally_2.1.2 circlize_0.4.14 pareg_1.0.0
## [4] tfprobability_0.15.0 tensorflow_2.8.0 enrichplot_1.16.0
## [7] ComplexHeatmap_2.12.0 forcats_0.5.1 stringr_1.4.0
## [10] dplyr_1.0.8 purrr_0.3.4 readr_2.1.2
## [13] tidyr_1.2.0 tibble_3.1.6 tidyverse_1.3.1
## [16] ggraph_2.0.5 ggplot2_3.3.5 BiocStyle_2.24.0
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.2 reticulate_1.24 tidyselect_1.1.2
## [4] RSQLite_2.2.12 AnnotationDbi_1.58.0 BiocParallel_1.30.0
## [7] scatterpie_0.1.7 munsell_0.5.0 codetools_0.2-18
## [10] future_1.25.0 withr_2.5.0 keras_2.8.0
## [13] colorspace_2.0-3 GOSemSim_2.22.0 Biobase_2.56.0
## [16] highr_0.9 knitr_1.38 rstudioapi_0.13
## [19] stats4_4.2.0 DOSE_3.22.0 listenv_0.8.0
## [22] labeling_0.4.2 GenomeInfoDbData_1.2.8 matrixLaplacian_1.0
## [25] polyclip_1.10-0 bit64_4.0.5 farver_2.1.0
## [28] rprojroot_2.0.3 parallelly_1.31.1 vctrs_0.4.1
## [31] treeio_1.20.0 generics_0.1.2 xfun_0.30
## [34] R6_2.5.1 doParallel_1.0.17 GenomeInfoDb_1.32.0
## [37] clue_0.3-60 graphlayouts_0.8.0 reshape_0.8.9
## [40] bitops_1.0-7 cachem_1.0.6 fgsea_1.22.0
## [43] gridGraphics_0.5-1 assertthat_0.2.1 scales_1.2.0
## [46] gtable_0.3.0 Cairo_1.5-15 globals_0.14.0
## [49] tidygraph_1.2.1 rlang_1.0.2 zeallot_0.1.0
## [52] scatterplot3d_0.3-41 GlobalOptions_0.1.2 splines_4.2.0
## [55] lazyeval_0.2.2 broom_0.8.0 BiocManager_1.30.17
## [58] yaml_2.3.5 reshape2_1.4.4 modelr_0.1.8
## [61] backports_1.4.1 qvalue_2.28.0 tools_4.2.0
## [64] bookdown_0.26 ggplotify_0.1.0 ellipsis_0.3.2
## [67] jquerylib_0.1.4 RColorBrewer_1.1-3 proxy_0.4-26
## [70] BiocGenerics_0.42.0 Rcpp_1.0.8.3 plyr_1.8.7
## [73] base64enc_0.1-3 progress_1.2.2 zlibbioc_1.42.0
## [76] RCurl_1.98-1.6 prettyunits_1.1.1 GetoptLong_1.0.5
## [79] viridis_0.6.2 S4Vectors_0.34.0 haven_2.5.0
## [82] ggrepel_0.9.1 cluster_2.1.3 here_1.0.1
## [85] fs_1.5.2 furrr_0.2.3 magrittr_2.0.3
## [88] magick_2.7.3 data.table_1.14.2 DO.db_2.9
## [91] reprex_2.0.1 whisker_0.4 ggnewscale_0.4.7
## [94] matrixStats_0.62.0 hms_1.1.1 patchwork_1.1.1
## [97] evaluate_0.15 readxl_1.4.0 IRanges_2.30.0
## [100] gridExtra_2.3 shape_1.4.6 tfruns_1.5.0
## [103] compiler_4.2.0 crayon_1.5.1 shadowtext_0.1.2
## [106] htmltools_0.5.2 ggfun_0.0.6 tzdb_0.3.0
## [109] aplot_0.1.3 lubridate_1.8.0 DBI_1.1.2
## [112] tweenr_1.0.2 dbplyr_2.1.1 rappdirs_0.3.3
## [115] MASS_7.3-57 Matrix_1.4-1 cli_3.3.0
## [118] parallel_4.2.0 igraph_1.3.1 pkgconfig_2.0.3
## [121] xml2_1.3.3 foreach_1.5.2 ggtree_3.4.0
## [124] bslib_0.3.1 XVector_0.36.0 rvest_1.0.2
## [127] yulab.utils_0.0.4 digest_0.6.29 Biostrings_2.64.0
## [130] rmarkdown_2.14 cellranger_1.1.0 fastmatch_1.1-3
## [133] tidytree_0.3.9 rjson_0.2.21 nloptr_2.0.0
## [136] lifecycle_1.0.1 nlme_3.1-157 jsonlite_1.8.0
## [139] viridisLite_0.4.0 fansi_1.0.3 pillar_1.7.0
## [142] lattice_0.20-45 KEGGREST_1.36.0 fastmap_1.1.0
## [145] httr_1.4.2 GO.db_3.15.0 glue_1.6.2
## [148] png_0.1-7 iterators_1.0.14 bit_4.0.4
## [151] ggforce_0.3.3 stringi_1.7.6 sass_0.4.1
## [154] blob_1.2.3 memoise_2.0.1 ape_5.6-2
Gu, Zuguang, and Daniel Huebschmann. 2021. “SimplifyEnrichment: An R/Bioconductor Package for Clustering and Visualizing Functional Enrichment Results.” bioRxiv.