COTAN 1.0.0
library(COTAN)
library(data.table)
library(Matrix)
library(ggrepel)
#> Loading required package: ggplot2
library(factoextra)
#> Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(Rtsne)
library(utils)
library(plotly)
#>
#> Attaching package: 'plotly'
#> The following object is masked from 'package:ggplot2':
#>
#> last_plot
#> The following object is masked from 'package:stats':
#>
#> filter
#> The following object is masked from 'package:graphics':
#>
#> layout
library(tidyverse)
#> ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
#> ✔ tibble 3.1.6 ✔ dplyr 1.0.8
#> ✔ tidyr 1.2.0 ✔ stringr 1.4.0
#> ✔ readr 2.1.2 ✔ forcats 0.5.1
#> ✔ purrr 0.3.4
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::between() masks data.table::between()
#> ✖ tidyr::expand() masks Matrix::expand()
#> ✖ dplyr::filter() masks plotly::filter(), stats::filter()
#> ✖ dplyr::first() masks data.table::first()
#> ✖ dplyr::lag() masks stats::lag()
#> ✖ dplyr::last() masks data.table::last()
#> ✖ tidyr::pack() masks Matrix::pack()
#> ✖ purrr::transpose() masks data.table::transpose()
#> ✖ tidyr::unpack() masks Matrix::unpack()
library(htmlwidgets)
library(MASS)
#>
#> Attaching package: 'MASS'
#> The following object is masked from 'package:dplyr':
#>
#> select
#> The following object is masked from 'package:plotly':
#>
#> select
library(dendextend)
#>
#> ---------------------
#> Welcome to dendextend version 1.15.2
#> Type citation('dendextend') for how to cite the package.
#>
#> Type browseVignettes(package = 'dendextend') for the package vignette.
#> The github page is: https://github.com/talgalili/dendextend/
#>
#> Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
#> You may ask questions at stackoverflow, use the r and dendextend tags:
#> https://stackoverflow.com/questions/tagged/dendextend
#>
#> To suppress this message use: suppressPackageStartupMessages(library(dendextend))
#> ---------------------
#>
#> Attaching package: 'dendextend'
#> The following object is masked from 'package:data.table':
#>
#> set
#> The following object is masked from 'package:stats':
#>
#> cutree
mycolours <- c("A" = "#8491B4B2","B"="#E64B35FF")
my_theme = theme(axis.text.x = element_text(size = 14, angle = 0, hjust = .5, vjust = .5, face = "plain", colour ="#3C5488FF" ),
axis.text.y = element_text( size = 14, angle = 0, hjust = 0, vjust = .5, face = "plain", colour ="#3C5488FF"),
axis.title.x = element_text( size = 14, angle = 0, hjust = .5, vjust = 0, face = "plain", colour ="#3C5488FF"),
axis.title.y = element_text( size = 14, angle = 90, hjust = .5, vjust = .5, face = "plain", colour ="#3C5488FF"))
data_dir = tempdir()
This is just an example on a toy dataset. If the user want to try on a real dataset it can be used the previous command to download it and the next commented line to use it as input data. There are also available online many other examples at link.
data("ERCCraw", package = "COTAN")
ERCCraw = as.data.frame(ERCCraw)
rownames(ERCCraw) = ERCCraw$V1
ERCCraw = ERCCraw[,2:ncol(ERCCraw)]
ERCCraw[1:5,1:5]
#> AAACCGTGAAGCCT.1 AAACGCACTGCTGA.1 AAACTTGACCGAAT.1 AAAGACGAGGAAGC.1
#> ERCC-00002 1662 4036 3919 4124
#> ERCC-00003 95 283 298 290
#> ERCC-00004 25 75 70 75
#> ERCC-00009 41 57 78 98
#> ERCC-00012 0 0 0 0
#> AAAGACGATCCGAA.1
#> ERCC-00002 4220
#> ERCC-00003 312
#> ERCC-00004 87
#> ERCC-00009 87
#> ERCC-00012 0
Define a directory where the output will be stored.
out_dir <- paste0(tempdir(),"/")
Inizialise the COTAN object with the row count table and the metadata for the experiment.
obj = new("scCOTAN",raw = ERCCraw)
#obj = initRaw(obj,GEO="GSM2861514" ,sc.method="Drop_seq",cond = "mouse cortex E17.5")
obj = initRaw(obj,GEO="ERCC" ,sc.method="10X",cond = "negative ERCC dataset")
#> [1] "Initializing S4 object"
Now we can start the cleaning. Analysis requires and starts from a matrix of raw UMI counts after removing possible cell doublets or multiplets and low quality or dying cells (with too high mtRNA percentage, easily done with Seurat or other tools).
If we do not want to consider the mitochondrial genes we can remove them before starting the analysis.
genes_to_rem = get.genes(obj)[grep('^mt', get.genes(obj))]
cells_to_rem = names(get.cell.size(obj)[which(get.cell.size(obj) == 0)])
obj = drop.genes.cells(obj,genes_to_rem,cells_to_rem )
We want also to define a prefix to identify the sample.
#t = "E17.5_cortex"
t = "ERCC"
print(paste("Condition ",t,sep = ""))
#> [1] "Condition ERCC"
#--------------------------------------
n_cells = length(get.cell.size(object = obj))
print(paste("n cells", n_cells, sep = " "))
#> [1] "n cells 1015"
n_it = 1
First we create the directory to store all information regarding the data cleaning.
if(!file.exists(out_dir)){
dir.create(file.path(out_dir))
}
if(!file.exists(paste(out_dir,"cleaning", sep = ""))){
dir.create(file.path(out_dir, "cleaning"))
}
ttm = clean(obj)
#> [1] "Start estimation mu with linear method"
#> [1] 88 1015
#> rowname AACCTACTAGTGTC.1 AACGTTCTTGCCCT.1 AACTGTCTTGTCGA.1
#> 1 ERCC-00002 3868.72569 3666.17330 3583.62138
#> 50 ERCC-00096 2155.29007 2103.20280 2365.56635
#> 39 ERCC-00074 1193.60684 1170.86640 1098.13070
#> 65 ERCC-00130 738.75666 754.80039 639.59647
#> 59 ERCC-00113 565.81363 774.40559 834.76745
#> 88 ERCC-00171 452.85084 417.15519 493.80610
#> 25 ERCC-00046 397.86895 393.19327 399.74779
#> 2 ERCC-00003 256.91538 281.00794 249.25451
#> 73 ERCC-00145 156.94830 144.86068 101.11268
#> 23 ERCC-00043 126.95818 123.07712 122.27580
#> 57 ERCC-00111 74.97530 79.51000 61.13790
#> 4 ERCC-00009 55.98156 81.68835 84.65247
#> 3 ERCC-00004 56.98123 69.70739 54.08352
#> 22 ERCC-00042 70.97662 68.61822 56.43498
#> 40 ERCC-00076 58.98057 64.26150 47.02915
#> AAGAGATGGACGAG.1 AAGTAACTGAATAG.1 AATGTAACGAGATA.1 AGCAAGCTCCCACT.1
#> 1 3843.20177 3596.41754 3658.35653 3794.25176
#> 50 2149.36255 2231.89998 2293.19618 2171.19647
#> 39 972.96247 1259.40899 1138.91426 1177.10994
#> 65 782.79254 725.34382 713.74237 712.58352
#> 59 773.94742 595.61523 676.17698 552.78643
#> 88 411.29777 465.88663 448.22338 428.29336
#> 25 327.26920 438.42583 414.07303 432.93862
#> 2 393.60755 241.46563 275.76410 290.79354
#> 73 172.47971 155.29554 149.40779 194.17204
#> 23 110.56392 145.82630 122.94127 146.79035
#> 57 79.60602 66.28468 68.30071 67.82086
#> 4 48.64812 73.86008 74.27702 47.38169
#> 3 75.18346 64.39084 60.61688 79.89854
#> 22 48.64812 88.06394 58.90936 65.96275
#> 40 53.07068 57.76237 45.24922 75.25328
#> AGTATCCTTCTATC.1 AGTCTACTCATTCT.1 AGTTGTCTAGAACA.1 AGTTTGCTCATGCA.1
#> 1 3662.89484 3742.58314 3732.19890 3598.95146
#> 50 2126.18440 2230.92416 2160.79985 2151.40591
#> 39 1190.99693 1136.99996 1138.43168 1184.35545
#> 65 747.03776 730.28456 692.34409 780.91273
#> 59 601.52296 610.07311 626.74297 613.82165
#> 88 462.49608 433.76298 427.92113 444.99905
#> 25 425.42225 429.75593 446.08759 399.97969
#> 2 280.83430 254.44757 277.54318 304.74643
#> 73 160.34433 144.25374 157.44268 174.01714
#> 23 185.36917 125.22026 158.45193 170.55411
#> 57 76.92821 70.12335 73.67510 91.77023
#> 4 69.51344 75.13216 94.86931 83.11266
#> 3 54.68391 74.13039 62.57337 75.32085
#> 22 62.09867 77.13568 52.48089 58.87147
#> 40 69.51344 64.11277 68.62886 48.48239
#> ATACTCTGCCAAGT.1 ATAGATTGGTGTAC.1 ATCAGGTGCGGAGA.1 ATCCCGTGCCAACA.1
#> 1 3861.52507 3712.95257 3684.84776 3823.53240
#> 50 2036.19303 2217.69942 2203.66257 2158.99362
#> 39 1060.69470 1164.70423 1146.58652 1190.84351
#> 65 613.41380 693.14552 739.52699 709.59930
#> 59 609.15398 630.88146 613.78609 681.29082
#> 88 472.83980 444.08927 511.48841 420.85278
#> 25 472.83980 444.08927 425.17474 368.01028
#> 2 268.36854 248.14060 269.59702 279.31036
#> 73 236.41990 137.34720 136.39691 149.09134
#> 23 155.48336 165.73228 157.70893 143.42965
#> 57 63.89727 59.51712 67.13285 67.94036
#> 4 112.88518 75.99878 77.78886 79.26375
#> 3 83.06645 70.50489 83.11687 54.72973
#> 22 61.76736 64.09536 59.67365 56.61697
#> 40 57.50754 83.32397 56.47685 54.72973
#> CAAAGCACGCTTCC.1 CAAGGTTGGGAACG.1 CAGACTGAACACCA.1 CAGCGTCTTTCACT.1
#> 1 3655.08341 3657.20427 3715.69806 3711.63105
#> 50 2194.86149 2163.14214 2182.81283 2206.38856
#> 39 1251.90632 1259.28094 1188.74807 1247.32477
#> 65 697.40441 708.98352 672.54233 677.30494
#> 59 590.73072 649.59223 592.89916 593.86097
#> 88 474.99983 455.64255 448.36156 438.89360
#> 25 416.63120 405.53115 439.51231 434.55885
#> 2 258.63338 267.26080 272.35998 252.49928
#> 73 174.09951 152.19018 149.45385 146.29787
#> 23 139.88342 136.41437 163.21934 153.88368
#> 57 70.44489 79.80705 81.60967 82.36028
#> 4 71.45125 64.95922 76.69342 59.60283
#> 3 68.43218 101.15079 68.82743 92.11347
#> 22 61.38769 58.46330 65.87768 63.93759
#> 40 71.45125 63.10325 54.07870 62.85390
#> CATTACACACCCAA.1 CATTTGACGGTCTA.1 CCACGGGATTCGGA.1 CCCAACACATCTTC.1
#> 1 3622.15916 3742.63053 3765.13428 3790.41464
#> 50 2096.84700 2194.21820 2187.73734 2108.92219
#> 39 1400.94618 1085.22019 1158.64321 1230.87667
#> 65 697.72972 718.09048 729.85399 696.58950
#> 59 641.94792 611.56580 626.76212 666.34683
#> 88 428.87974 466.04548 455.24643 420.37311
#> 25 393.21597 443.21876 388.64725 398.19515
#> 2 269.76444 282.48063 231.72864 246.98180
#> 73 162.77312 154.08034 142.32153 177.42366
#> 23 135.33945 142.66698 121.33823 150.20526
#> 57 74.98537 80.84462 123.16286 73.59050
#> 4 64.01190 86.55130 62.03759 70.56623
#> 3 76.81428 80.84462 69.33613 85.68756
#> 22 68.58418 57.06679 68.42381 66.53387
#> 40 57.61071 55.16457 59.30064 49.39636
#> CCTATTGAAGCCAT.1 CGACTCTGAAGTGA.1 CGCACGGACTCTAT.1 CGCATAGAAGCTAC.1
#> 1 3671.13788 3677.86374 3841.61207 3681.38464
#> 50 2091.82163 2269.64334 2155.18209 2090.32164
#> 39 1344.87229 1150.01049 1215.52270 1321.56699
#> 65 696.97098 708.44986 724.14118 649.55449
#> 59 663.34917 699.77053 558.19216 665.10234
#> 88 452.53135 432.88131 455.82101 426.70202
#> 25 408.00517 403.58859 397.63110 428.42956
#> 2 264.43097 259.29482 280.17367 281.58878
#> 73 139.03072 141.03902 133.62129 138.20308
#> 23 143.57420 120.42563 168.10420 162.38862
#> 57 62.70013 90.04799 58.18992 84.64939
#> 4 73.60450 75.94409 60.34510 79.46677
#> 3 77.23929 67.26476 64.65546 67.37400
#> 22 59.06534 60.75527 62.50028 72.55662
#> 40 60.88273 68.34968 51.72437 60.46385
#> CTATGTACATGTGC.1 GACATTCTACTCTT.1 GATAGCACACCATG.1 GATTCTACCCATGA.1
#> 1 3751.26648 3703.12656 3685.11091 3464.21213
#> 50 2260.16572 2120.92139 2308.64506 2205.08496
#> 39 1176.65094 1186.92761 1132.93095 1327.92409
#> 65 669.47381 706.24383 706.74487 735.98383
#> 59 575.41551 629.59721 546.30803 703.01864
#> 88 446.31587 526.67175 458.27345 431.41409
#> 25 414.96311 399.65736 462.38721 404.18197
#> 2 275.71993 277.02277 267.39474 277.33763
#> 73 136.47675 140.15381 146.45004 165.54262
#> 23 165.06310 173.00236 155.50033 161.24281
#> 57 67.31624 88.69109 69.93401 73.09674
#> 4 79.30406 67.88700 65.82024 73.81337
#> 3 52.56199 84.31128 80.62980 54.46424
#> 22 75.61550 59.12739 59.23822 79.54645
#> 40 60.86126 54.74758 60.88373 53.74760
#> GCTTAACTTCTATC.1 GGACAACTACGTAC.1 GGCGCATGCACTAG.1 GGTTGAACGTCGAT.1
#> 1 3766.93854 3645.87095 3694.27792 3618.43371
#> 50 2399.38432 2164.73588 2267.94103 2201.27042
#> 39 1177.87958 1202.34333 1137.70683 1160.88586
#> 65 648.68730 743.15693 717.37179 762.04350
#> 59 572.81744 650.80177 670.66790 636.45058
#> 88 407.80050 455.73387 460.50038 431.08919
#> 25 367.96882 387.54641 391.37862 378.47595
#> 2 265.54451 295.19126 240.99209 260.52044
#> 73 170.70718 153.63755 139.17760 168.02295
#> 23 127.08202 171.76333 142.91391 185.84357
#> 57 75.86986 72.50312 83.13293 86.55728
#> 4 106.21780 79.40817 72.85807 84.01148
#> 3 70.17962 68.18745 75.66031 75.52547
#> 22 45.52192 69.05059 56.97875 78.91987
#> 40 49.31541 57.82987 59.78098 57.70485
#> GTCATACTAAACAG.1 GTGGAGGATAGAAG.1 GTTATAGAAAACGA.1 TAAGCTCTAAGTAG.1
#> 1 3739.83042 3795.44448 3637.07900 3751.27681
#> 50 2130.02706 2162.74744 2221.33295 2180.26749
#> 39 1114.62746 1162.03356 1210.59109 1160.36395
#> 65 716.75266 705.55203 710.96803 725.56296
#> 59 620.41494 629.32405 590.70478 634.30843
#> 88 492.28577 424.57214 464.25151 430.32772
#> 25 440.26340 426.34488 481.05300 453.58868
#> 2 274.56251 261.47971 275.01375 249.60797
#> 73 165.70088 130.29667 139.71760 155.66949
#> 23 143.54321 123.20569 162.70910 135.09249
#> 57 76.10680 78.88710 83.12313 68.88822
#> 4 60.69277 75.34161 56.59447 70.67753
#> 3 85.74057 67.36426 70.74309 70.67753
#> 22 97.30110 103.70551 65.43736 69.78287
#> 40 49.13224 52.29594 61.90020 47.41657
#> TACATCACGACTAC.1 TCCGGACTCTGCAA.1 TCGAATCTAAGGGC.1 TGATACCTCTTCCG.1
#> 1 3636.08433 3657.77388 3803.30743 3721.74044
#> 50 2249.19502 2097.72405 2210.40930 2216.40234
#> 39 1161.45551 1196.30307 1071.87308 1123.33019
#> 65 747.44591 716.89896 680.66572 742.26785
#> 59 646.30070 808.71853 607.86256 594.75984
#> 88 378.86593 466.16090 455.23906 423.61273
#> 25 420.00974 358.44948 454.36191 430.23168
#> 2 252.00584 298.41361 273.66972 239.22773
#> 73 204.86189 143.02664 145.60633 174.92937
#> 23 138.86036 190.70219 142.09774 148.45357
#> 57 108.00250 68.86468 102.62615 83.20964
#> 4 86.57344 67.09892 70.17172 62.40723
#> 3 64.28721 60.03587 66.66314 74.69957
#> 22 70.28734 67.09892 67.54028 70.91731
#> 40 58.28707 62.68452 52.62879 71.86287
#> TGCCACTGTGCTTT.1 TGGAACTGTGCTAG.1
#> 1 3622.12519 3810.36193
#> 50 2269.69815 2065.01775
#> 39 1207.37506 1168.46200
#> 65 726.30341 701.81208
#> 59 586.46913 797.34671
#> 88 478.98458 485.02196
#> 25 445.59132 426.23142
#> 2 264.01546 260.88303
#> 73 160.70506 124.92990
#> 23 126.26827 150.65076
#> 57 63.65590 47.76731
#> 4 79.30899 29.39527
#> 3 65.74298 36.74409
#> 22 62.61236 47.76731
#> 40 58.43821 66.13936
obj = ttm$object
ttm$pca.cell.2
Run this when B cells need to be removed.
pdf(file.path(out_dir,"cleaning",paste(t,"_",n_it,"_plots_before_cells_exlusion.pdf", sep = "")))
ttm$pca.cell.2
ggplot(ttm$D, aes(x=n,y=means)) + geom_point() +
geom_text_repel(data=subset(ttm$D, n > (max(ttm$D$n)- 15) ), aes(n,means,label=rownames(ttm$D[ttm$D$n > (max(ttm$D$n)- 15),])),
nudge_y = 0.05,
nudge_x = 0.05,
direction = "x",
angle = 90,
vjust = 0,
segment.size = 0.2)+
ggtitle("B cell group genes mean expression")+my_theme +
theme(plot.title = element_text(color = "#3C5488FF", size = 20, face = "italic",vjust = - 5,hjust = 0.02 ))
dev.off()
if (length(ttm$cl1) < length(ttm$cl2)) {
to_rem = ttm$cl1
}else{
to_rem = ttm$cl2
}
n_it = n_it+1
obj = drop.genes.cells(object = obj,genes = c(),cells = to_rem)
gc()
ttm = clean(obj)
#ttm = clean.sqrt(obj, cells)
obj = ttm$object
ttm$pca.cell.2
Run this only in the last iteration, instead the previous code, when B cells group has not to be removed
pdf(file.path(out_dir,"cleaning",paste(t,"_",n_it,"_plots_before_cells_exlusion.pdf", sep = "")))
ttm$pca.cell.2
ggplot(ttm$D, aes(x=n,y=means)) + geom_point() +
geom_text_repel(data=subset(ttm$D, n > (max(ttm$D$n)- 15) ), aes(n,means,label=rownames(ttm$D[ttm$D$n > (max(ttm$D$n)- 15),])),
nudge_y = 0.05,
nudge_x = 0.05,
direction = "x",
angle = 90,
vjust = 0,
segment.size = 0.2)+
ggtitle(label = "B cell group genes mean expression", subtitle = " - B group NOT removed -")+my_theme +
theme(plot.title = element_text(color = "#3C5488FF", size = 20, face = "italic",vjust = - 10,hjust = 0.02 ),
plot.subtitle = element_text(color = "darkred",vjust = - 15,hjust = 0.01 ))
dev.off()
#> png
#> 2
To color the pca based on nu_j (so the cells’ efficiency)
nu_est = round(get.nu(object = obj), digits = 7)
plot.nu <-ggplot(ttm$pca_cells,aes(x=PC1,y=PC2, colour = log(nu_est)))
plot.nu = plot.nu + geom_point(size = 1,alpha= 0.8)+
scale_color_gradient2(low = "#E64B35B2",mid = "#4DBBD5B2", high = "#3C5488B2" ,
midpoint = log(mean(nu_est)),name = "ln(nu)")+
ggtitle("Cells PCA coloured by cells efficiency") +
my_theme + theme(plot.title = element_text(color = "#3C5488FF", size = 20),
legend.title=element_text(color = "#3C5488FF", size = 14,face = "italic"),
legend.text = element_text(color = "#3C5488FF", size = 11),
legend.key.width = unit(2, "mm"),
legend.position="right")
pdf(file.path(out_dir,"cleaning",paste(t,"_plots_PCA_efficiency_colored.pdf", sep = "")))
plot.nu
dev.off()
#> png
#> 2
plot.nu
The next part is use to remove the cells with efficiency too low.
nu_df = data.frame("nu"= sort(get.nu(obj)), "n"=c(1:length(get.nu(obj))))
ggplot(nu_df, aes(x = n, y=nu)) +
geom_point(colour = "#8491B4B2", size=1)+
my_theme #+ ylim(0,1) + xlim(0,70)
We can zoom on the smallest values and, if we detect a clear elbow, we can decide to remove the cells.
yset = 0.4#threshold to remove low UDE cells
plot.ude <- ggplot(nu_df, aes(x = n, y=nu)) +
geom_point(colour = "#8491B4B2", size=1) +
my_theme + ylim(0,1) + xlim(0,400) +
geom_hline(yintercept=yset, linetype="dashed", color = "darkred") +
annotate(geom="text", x=200, y=0.25,
label=paste("to remove cells with nu < ",yset,sep = " "),
color="darkred", size=4.5)
pdf(file.path(out_dir,"cleaning",paste(t,"_plots_efficiency.pdf", sep = "")))
plot.ude
#> Warning: Removed 668 rows containing missing values (geom_point).
dev.off()
#> png
#> 2
plot.ude
#> Warning: Removed 668 rows containing missing values (geom_point).
We also save the defined threshold in the metadata and re run the estimation
obj = add.row.to.meta(obj,c("Threshold low UDE cells:",yset))
to_rem = rownames(nu_df[which(nu_df$nu < yset),])
obj = drop.genes.cells(object = obj, genes = c(),cells = to_rem)
Repeat the estimation after the cells are removed
ttm = clean(obj)
#> [1] "Start estimation mu with linear method"
#> [1] 88 1004
#> rowname ATACTCTGCCAAGT.1
#> 1 ERCC-00002 3891.24110
#> 50 ERCC-00096 2051.86238
#> 39 ERCC-00074 1068.85718
#> 65 ERCC-00130 618.13427
#> 59 ERCC-00113 613.84167
#> 25 ERCC-00046 476.47850
#> 88 ERCC-00171 476.47850
#> 2 ERCC-00003 270.43374
#> 73 ERCC-00145 238.23925
#> 23 ERCC-00043 156.67987
#> 4 ERCC-00009 113.75388
#> 3 ERCC-00004 83.70568
#> 11 ERCC-00022 70.82789
#> 57 ERCC-00111 64.38899
#> 22 ERCC-00042 62.24269
obj = ttm$object
ttm$pca.cell.2
In case we do not want to remove anything, we can run:
pdf(file.path(out_dir,"cleaning",paste(t,"_plots_efficiency.pdf", sep = "")))
ggplot(nu_df, aes(x = n, y=nu)) + geom_point(colour = "#8491B4B2", size=1) +my_theme + #xlim(0,100)+
annotate(geom="text", x=50, y=0.25, label="nothing to remove ", color="darkred")
dev.off()
#> png
#> 2
Just to check again, we plot the final efficiency colored PCA
nu_est = round(get.nu(object = obj), digits = 7)
plot.nu <-ggplot(ttm$pca_cells,aes(x=PC1,y=PC2, colour = log(nu_est)))
plot.nu = plot.nu + geom_point(size = 2,alpha= 0.8)+
scale_color_gradient2(low = "#E64B35B2",mid = "#4DBBD5B2", high = "#3C5488B2" ,
midpoint = log(mean(nu_est)),name = "ln(nu)")+
ggtitle("Cells PCA coloured by cells efficiency: last") +
my_theme + theme(plot.title = element_text(color = "#3C5488FF", size = 20),
legend.title=element_text(color = "#3C5488FF", size = 14,face = "italic"),
legend.text = element_text(color = "#3C5488FF", size = 11),
legend.key.width = unit(2, "mm"),
legend.position="right")
pdf(file.path(out_dir,"cleaning",paste(t,"_plots_PCA_efficiency_colored_FINAL.pdf", sep = "")))
plot.nu
dev.off()
#> png
#> 2
plot.nu
In this part all the contingency tables are computed and used to get the statistics (S) To storage efficiency of all the observed tables only the yes_yes is stored. Note that if will be necessary re-computing the yes-yes table, the yes-yes table should be cancelled before running cotan_analysis.
obj = cotan_analysis(obj,cores = 2)
#> [1] "cotan analysis"
#> [1] "mu estimator creation"
#> [1] "save effective constitutive genes"
#> [1] "start a minimization"
#> [1] "Final trance!"
#> [1] "a min: -0.147216796875 | a max 13.5546875 | negative a %: 26.984126984127"
COEX evaluation and storing
obj = get.coex(obj)
#> [1] "coex dataframe creation"
#> [1] "creation of observed yes/yes contingency table"
#> [1] "mu estimator creation"
#> [1] "expected contingency tables creation"
#> [1] "The distance between estimated n of zeros and observed number of zero is 0.0041370202609155 over 63"
#> [1] "Done"
#> [1] "coex estimation"
#> [1] "Cleaning RAM"
# saving the structure
saveRDS(obj,file = paste(out_dir,t,".cotan.RDS", sep = ""))
The next function can directly plot the GDI for the dataset with the 1.5 threshold (in red) and the two higher quantiles (in blue).
plot_GDI(obj, cond = "ERCC")
#> [1] "GDI plot "
#> [1] "function to generate GDI dataframe"
#> [1] "Using S"
#> [1] "function to generate S "
If a more complex plot is needed, or if we want to analyze more in detail the GDI data, we can get directly the GDI dataframe.
quant.p = get.GDI(obj)
#> [1] "function to generate GDI dataframe"
#> [1] "Using S"
#> [1] "function to generate S "
head(quant.p)
#> sum.raw.norm GDI exp.cells
#> ERCC-00012 3.175703 1.1614957 2.390438
#> ERCC-00013 5.995188 1.3512649 27.788845
#> ERCC-00014 6.077256 1.3473405 34.860558
#> ERCC-00016 3.569692 1.1518864 3.685259
#> ERCC-00017 1.609829 0.7957451 0.498008
#> ERCC-00019 7.157739 1.4853109 66.135458
In the third column of this dataframe is reported the percentage of cells expressing the gene.
Next we can define some gene sets (in this case three) that we want to specifically label in the GDI plot.
AA=c("ERCC-00012","ERCC-00013","ERCC-00014")
BB=c("ERCC-00016","ERCC-00017","ERCC-00019")
CC=c("ERCC-00022","ERCC-00024","ERCC-00028")
text.size = 12
quant.p$highlight = with(quant.p, ifelse(rownames(quant.p) %in% AA, "AA",
ifelse(rownames(quant.p) %in% CC,"Constitutive" ,
ifelse(rownames(quant.p) %in% BB,"BB" , "normal"))))
textdf <- quant.p[rownames(quant.p) %in% c(AA,CC,BB), ]
mycolours <- c("Con" = "#00A087FF","AA"="#E64B35FF","BB"="#F39B7FFF","normal" = "#8491B4B2")
f1 = ggplot(subset(quant.p,highlight == "normal" ), aes(x=sum.raw.norm, y=GDI)) + geom_point(alpha = 0.1, color = "#8491B4B2", size=2.5)
GDI_plot = f1 + geom_point(data = subset(quant.p,highlight != "normal" ), aes(x=sum.raw.norm, y=GDI, colour=highlight),size=2.5,alpha = 0.8) +
geom_hline(yintercept=quantile(quant.p$GDI)[4], linetype="dashed", color = "darkblue") +
geom_hline(yintercept=quantile(quant.p$GDI)[3], linetype="dashed", color = "darkblue") +
geom_hline(yintercept=1.5, linetype="dotted", color = "red", size= 0.5) +
scale_color_manual("Status", values = mycolours) +
scale_fill_manual("Status", values = mycolours) +
xlab("log normalized counts")+ylab("GDI")+
geom_label_repel(data = textdf , aes(x=sum.raw.norm, y=GDI, label = rownames(textdf),fill=highlight),
label.size = NA,
alpha = 0.5,
direction ="both",
na.rm=TRUE,
seed = 1234) +
geom_label_repel(data =textdf , aes(x=sum.raw.norm, y=GDI, label = rownames(textdf)),
label.size = NA,
segment.color = 'black',
segment.size = 0.5,
direction = "both",
alpha = 0.8,
na.rm=TRUE,
fill = NA,
seed = 1234) +
theme(axis.text.x = element_text(size = text.size, angle = 0, hjust = .5, vjust = .5, face = "plain", colour ="#3C5488FF" ),
axis.text.y = element_text( size = text.size, angle = 0, hjust = 0, vjust = .5, face = "plain", colour ="#3C5488FF"),
axis.title.x = element_text( size = text.size, angle = 0, hjust = .5, vjust = 0, face = "plain", colour ="#3C5488FF"),
axis.title.y = element_text( size = text.size, angle = 90, hjust = .5, vjust = .5, face = "plain", colour ="#3C5488FF"),
legend.title = element_blank(),
legend.text = element_text(color = "#3C5488FF",face ="italic" ),
legend.position = "bottom")
legend <- cowplot::get_legend(GDI_plot)
GDI_plot =GDI_plot + theme(
legend.position = "none")
GDI_plot
For the Gene Pair Analysis, we can plot an heatmap with the coex values between two genes sets. To do so we need to define, in a list, the different gene sets (list.genes). Then in the function parameter sets we can decide which sets need to be considered (in the example from 1 to 3). In the condition parameter we should insert an array with each file name prefix to be considered (in the example, the file names is “E17.5_cortex”).
list.genes = list("Ref.col"= BB, "AA"=AA, "Const."=CC )
plot_heatmap(df_genes = list.genes,sets = c(1:3),conditions = "ERCC",dir = out_dir)
#> [1] "plot heatmap"
#> [1] "Loading condition ERCC"
#> [1] "ERCC-00016" "ERCC-00017" "ERCC-00019"
#> [1] "Get p-values on a set of genes on columns on a set of genes on rows"
#> [1] "Using function S"
#> [1] "function to generate S "
#> [1] "Ref.col"
#> [1] "AA"
#> [1] "Const."
#> [1] "min coex: 0 max coex 0.98246092315627"
We can also plot a general heatmap of coex values based on some markers like the following one.
plot_general.heatmap(prim.markers = c("ERCC-00014","ERCC-00019"), condition = "ERCC",dir = out_dir, p_value = 0.05)
#> [1] "ploting a general heatmap"
#> NULL
#> [1] "Get p-values genome wide on columns genome wide on rows"
#> [1] "Using function S"
#> [1] "function to generate S "
plot_general.heatmap(prim.markers = c("ERCC-00014","ERCC-00019"),markers.list =c("ERCC-00084" ,"ERCC-00085" ,"ERCC-00086") ,symmetric = FALSE, condition = "ERCC",dir = out_dir, p_value = 0.05)
#> [1] "ploting a general heatmap"
#> NULL
#> [1] "Get p-values genome wide on columns genome wide on rows"
#> [1] "Using function S"
#> [1] "function to generate S "
Sometimes we can also be interested in the numbers present directly in the contingency tables for two specific genes. To get them we can use two functions:
get.observed.ct(object = obj, g1 = "ERCC-00014",g2 = "ERCC-00019")
#> ERCC-00014.yes ERCC-00014.no
#> ERCC-00019.yes 241 423
#> ERCC-00019.no 109 231
get.expected.ct(object = obj, g1 = "ERCC-00014",g2 = "ERCC-00019")
#> ERCC-00014.yes ERCC-00014.no
#> ERCC-00019.yes 235.7119 428.2888
#> ERCC-00019.no 114.2885 225.7108
Another useful function is extract.coex. This can be used to extract the whole or a partial coex matrix from a COTAN object.
# For the whole matrix
coex <- extract.coex(object = obj)
coex[1:5,1:5]
#> ERCC-00012 ERCC-00013 ERCC-00014 ERCC-00016 ERCC-00017
#> ERCC-00012 0.769954876 0.03079731 -0.008469118 0.035409255 -0.004367265
#> ERCC-00013 0.030797310 0.98637914 0.020020333 -0.018534811 -0.013478967
#> ERCC-00014 -0.008469118 0.02002033 0.983490318 -0.003222142 0.094474696
#> ERCC-00016 0.035409255 -0.01853481 -0.003222142 0.982460923 0.028389361
#> ERCC-00017 -0.004367265 -0.01347897 0.094474696 0.028389361 0.185954347
# For a partial matrix
coex <- extract.coex(object = obj,genes = c("ERCC-00017", "ERCC-00019", "ERCC-00024", "ERCC-00025", "ERCC-00028", "ERCC-00031"))
head(coex)
#> ERCC-00017 ERCC-00019 ERCC-00024 ERCC-00025 ERCC-00028
#> ERCC-00012 -0.004367265 5.272805e-02 -0.011980011 0.017574171 -0.0120737279
#> ERCC-00013 -0.013478967 4.360570e-02 -0.005497781 0.068917037 0.0310017944
#> ERCC-00014 0.094474696 2.345844e-02 -0.016243015 -0.006247842 -0.0226624019
#> ERCC-00016 0.028389361 7.232674e-05 -0.004746929 0.033126881 0.0005889499
#> ERCC-00017 0.185954347 1.880656e-02 -0.010473707 0.012328879 0.0162721054
#> ERCC-00019 0.018806561 9.695314e-01 -0.010600940 -0.032637961 0.0473851891
#> ERCC-00031
#> ERCC-00012 0.013066001
#> ERCC-00013 -0.020289792
#> ERCC-00014 -0.054916592
#> ERCC-00016 -0.012314141
#> ERCC-00017 -0.017522836
#> ERCC-00019 -0.003396497
The next few lines are just to clean after compilation of the documentation.
if (file.exists(paste(out_dir,t,".cotan.RDS", sep = ""))) {
#Delete file if it exists
file.remove(paste(out_dir,t,".cotan.RDS", sep = ""))
}
#> [1] TRUE
unlink(paste(out_dir,"cleaning", sep = ""),recursive = TRUE)
print(sessionInfo())
#> R version 4.2.0 RC (2022-04-19 r82224)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 20.04.4 LTS
#>
#> Matrix products: default
#> BLAS: /home/biocbuild/bbs-3.15-bioc/R/lib/libRblas.so
#> LAPACK: /home/biocbuild/bbs-3.15-bioc/R/lib/libRlapack.so
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_GB LC_COLLATE=C
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] dendextend_1.15.2 MASS_7.3-57 htmlwidgets_1.5.4 forcats_0.5.1
#> [5] stringr_1.4.0 dplyr_1.0.8 purrr_0.3.4 readr_2.1.2
#> [9] tidyr_1.2.0 tibble_3.1.6 tidyverse_1.3.1 plotly_4.10.0
#> [13] Rtsne_0.16 factoextra_1.0.7 ggrepel_0.9.1 ggplot2_3.3.5
#> [17] Matrix_1.4-1 data.table_1.14.2 COTAN_1.0.0 BiocStyle_2.24.0
#>
#> loaded via a namespace (and not attached):
#> [1] colorspace_2.0-3 rjson_0.2.21 ellipsis_0.3.2
#> [4] rprojroot_2.0.3 circlize_0.4.14 GlobalOptions_0.1.2
#> [7] fs_1.5.2 clue_0.3-60 rstudioapi_0.13
#> [10] farver_2.1.0 fansi_1.0.3 lubridate_1.8.0
#> [13] xml2_1.3.3 codetools_0.2-18 doParallel_1.0.17
#> [16] knitr_1.38 jsonlite_1.8.0 Cairo_1.5-15
#> [19] broom_0.8.0 cluster_2.1.3 dbplyr_2.1.1
#> [22] png_0.1-7 BiocManager_1.30.17 compiler_4.2.0
#> [25] httr_1.4.2 basilisk_1.8.0 backports_1.4.1
#> [28] assertthat_0.2.1 RcppZiggurat_0.1.6 fastmap_1.1.0
#> [31] lazyeval_0.2.2 cli_3.3.0 htmltools_0.5.2
#> [34] tools_4.2.0 gtable_0.3.0 glue_1.6.2
#> [37] Rcpp_1.0.8.3 cellranger_1.1.0 jquerylib_0.1.4
#> [40] vctrs_0.4.1 iterators_1.0.14 xfun_0.30
#> [43] rvest_1.0.2 lifecycle_1.0.1 scales_1.2.0
#> [46] basilisk.utils_1.8.0 hms_1.1.1 parallel_4.2.0
#> [49] RColorBrewer_1.1-3 ComplexHeatmap_2.12.0 yaml_2.3.5
#> [52] reticulate_1.24 gridExtra_2.3 sass_0.4.1
#> [55] stringi_1.7.6 highr_0.9 S4Vectors_0.34.0
#> [58] foreach_1.5.2 BiocGenerics_0.42.0 filelock_1.0.2
#> [61] shape_1.4.6 rlang_1.0.2 pkgconfig_2.0.3
#> [64] matrixStats_0.62.0 evaluate_0.15 lattice_0.20-45
#> [67] labeling_0.4.2 Rfast_2.0.6 cowplot_1.1.1
#> [70] tidyselect_1.1.2 here_1.0.1 magrittr_2.0.3
#> [73] bookdown_0.26 R6_2.5.1 IRanges_2.30.0
#> [76] magick_2.7.3 generics_0.1.2 DBI_1.1.2
#> [79] pillar_1.7.0 haven_2.5.0 withr_2.5.0
#> [82] dir.expiry_1.4.0 modelr_0.1.8 crayon_1.5.1
#> [85] utf8_1.2.2 tzdb_0.3.0 rmarkdown_2.14
#> [88] viridis_0.6.2 GetoptLong_1.0.5 grid_4.2.0
#> [91] readxl_1.4.0 reprex_2.0.1 digest_0.6.29
#> [94] stats4_4.2.0 munsell_0.5.0 viridisLite_0.4.0
#> [97] bslib_0.3.1