COTAN 1.0.0
library(COTAN)
library(data.table)
library(Matrix)
library(ggrepel)
#> Loading required package: ggplot2
library(factoextra)
#> Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(Rtsne)
library(utils)
library(plotly)
#>
#> Attaching package: 'plotly'
#> The following object is masked from 'package:ggplot2':
#>
#> last_plot
#> The following object is masked from 'package:stats':
#>
#> filter
#> The following object is masked from 'package:graphics':
#>
#> layout
library(tidyverse)
#> ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
#> ✔ tibble 3.1.6 ✔ dplyr 1.0.8
#> ✔ tidyr 1.2.0 ✔ stringr 1.4.0
#> ✔ readr 2.1.2 ✔ forcats 0.5.1
#> ✔ purrr 0.3.4
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::between() masks data.table::between()
#> ✖ tidyr::expand() masks Matrix::expand()
#> ✖ dplyr::filter() masks plotly::filter(), stats::filter()
#> ✖ dplyr::first() masks data.table::first()
#> ✖ dplyr::lag() masks stats::lag()
#> ✖ dplyr::last() masks data.table::last()
#> ✖ tidyr::pack() masks Matrix::pack()
#> ✖ purrr::transpose() masks data.table::transpose()
#> ✖ tidyr::unpack() masks Matrix::unpack()
library(htmlwidgets)
library(MASS)
#>
#> Attaching package: 'MASS'
#> The following object is masked from 'package:dplyr':
#>
#> select
#> The following object is masked from 'package:plotly':
#>
#> select
library(dendextend)
#>
#> ---------------------
#> Welcome to dendextend version 1.15.2
#> Type citation('dendextend') for how to cite the package.
#>
#> Type browseVignettes(package = 'dendextend') for the package vignette.
#> The github page is: https://github.com/talgalili/dendextend/
#>
#> Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
#> You may ask questions at stackoverflow, use the r and dendextend tags:
#> https://stackoverflow.com/questions/tagged/dendextend
#>
#> To suppress this message use: suppressPackageStartupMessages(library(dendextend))
#> ---------------------
#>
#> Attaching package: 'dendextend'
#> The following object is masked from 'package:data.table':
#>
#> set
#> The following object is masked from 'package:stats':
#>
#> cutree
mycolours <- c("A" = "#8491B4B2","B"="#E64B35FF")
my_theme = theme(axis.text.x = element_text(size = 14, angle = 0, hjust = .5, vjust = .5, face = "plain", colour ="#3C5488FF" ),
axis.text.y = element_text( size = 14, angle = 0, hjust = 0, vjust = .5, face = "plain", colour ="#3C5488FF"),
axis.title.x = element_text( size = 14, angle = 0, hjust = .5, vjust = 0, face = "plain", colour ="#3C5488FF"),
axis.title.y = element_text( size = 14, angle = 90, hjust = .5, vjust = .5, face = "plain", colour ="#3C5488FF"))
data_dir = tempdir()
This is just an example on a toy dataset. If the user want to try on a real dataset it can be used the previous command to download it and the next commented line to use it as input data. There are also available online many other examples at link.
data("ERCCraw", package = "COTAN")
ERCCraw = as.data.frame(ERCCraw)
rownames(ERCCraw) = ERCCraw$V1
ERCCraw = ERCCraw[,2:ncol(ERCCraw)]
ERCCraw[1:5,1:5]
#> AAACCGTGAAGCCT.1 AAACGCACTGCTGA.1 AAACTTGACCGAAT.1 AAAGACGAGGAAGC.1
#> ERCC-00002 1662 4036 3919 4124
#> ERCC-00003 95 283 298 290
#> ERCC-00004 25 75 70 75
#> ERCC-00009 41 57 78 98
#> ERCC-00012 0 0 0 0
#> AAAGACGATCCGAA.1
#> ERCC-00002 4220
#> ERCC-00003 312
#> ERCC-00004 87
#> ERCC-00009 87
#> ERCC-00012 0
Define a directory where the output will be stored.
out_dir <- paste0(tempdir(),"/")
Inizialise the COTAN object with the row count table and the metadata for the experiment.
obj = new("scCOTAN",raw = ERCCraw)
#obj = initRaw(obj,GEO="GSM2861514" ,sc.method="Drop_seq",cond = "mouse cortex E17.5")
obj = initRaw(obj,GEO="ERCC" ,sc.method="10X",cond = "negative ERCC dataset")
#> [1] "Initializing S4 object"
Now we can start the cleaning. Analysis requires and starts from a matrix of raw UMI counts after removing possible cell doublets or multiplets and low quality or dying cells (with too high mtRNA percentage, easily done with Seurat or other tools).
If we do not want to consider the mitochondrial genes we can remove them before starting the analysis.
genes_to_rem = get.genes(obj)[grep('^mt', get.genes(obj))]
cells_to_rem = names(get.cell.size(obj)[which(get.cell.size(obj) == 0)])
obj = drop.genes.cells(obj,genes_to_rem,cells_to_rem )
We want also to define a prefix to identify the sample.
#t = "E17.5_cortex"
t = "ERCC"
print(paste("Condition ",t,sep = ""))
#> [1] "Condition ERCC"
#--------------------------------------
n_cells = length(get.cell.size(object = obj))
print(paste("n cells", n_cells, sep = " "))
#> [1] "n cells 1015"
n_it = 1
First we create the directory to store all information regarding the data cleaning.
if(!file.exists(out_dir)){
dir.create(file.path(out_dir))
}
if(!file.exists(paste(out_dir,"cleaning", sep = ""))){
dir.create(file.path(out_dir, "cleaning"))
}
ttm = clean(obj)
#> [1] "Start estimation mu with linear method"
#> [1] 88 1015
#> rowname AACCTACTAGTGTC.1 AACGTTCTTGCCCT.1 AAGAGATGGACGAG.1
#> 1 ERCC-00002 3868.72569 3666.17330 3843.20177
#> 50 ERCC-00096 2155.29007 2103.20280 2149.36255
#> 39 ERCC-00074 1193.60684 1170.86640 972.96247
#> 65 ERCC-00130 738.75666 754.80039 782.79254
#> 59 ERCC-00113 565.81363 774.40559 773.94742
#> 88 ERCC-00171 452.85084 417.15519 411.29777
#> 25 ERCC-00046 397.86895 393.19327 327.26920
#> 2 ERCC-00003 256.91538 281.00794 393.60755
#> 73 ERCC-00145 156.94830 144.86068 172.47971
#> 23 ERCC-00043 126.95818 123.07712 110.56392
#> 57 ERCC-00111 74.97530 79.51000 79.60602
#> 4 ERCC-00009 55.98156 81.68835 48.64812
#> 3 ERCC-00004 56.98123 69.70739 75.18346
#> 22 ERCC-00042 70.97662 68.61822 48.64812
#> 40 ERCC-00076 58.98057 64.26150 53.07068
#> AAGTAACTGAATAG.1 AATGTAACGAGATA.1 AGCAAGCTCCCACT.1 AGCGATACCTACCC.1
#> 1 3596.41754 3658.35653 3794.25176 3776.94978
#> 50 2231.89998 2293.19618 2171.19647 2065.55255
#> 39 1259.40899 1138.91426 1177.10994 1206.67289
#> 65 725.34382 713.74237 712.58352 713.61235
#> 59 595.61523 676.17698 552.78643 609.69852
#> 88 465.88663 448.22338 428.29336 483.51743
#> 25 438.42583 414.07303 432.93862 400.81050
#> 2 241.46563 275.76410 290.79354 366.87946
#> 73 155.29554 149.40779 194.17204 115.57763
#> 23 145.82630 122.94127 146.79035 148.44833
#> 57 66.28468 68.30071 67.82086 86.94831
#> 4 73.86008 74.27702 47.38169 55.13795
#> 3 64.39084 60.61688 79.89854 74.22417
#> 22 88.06394 58.90936 65.96275 54.07761
#> 40 57.76237 45.24922 75.25328 51.95692
#> AGCTGAACCTTCCG.1 AGTATCCTTCTATC.1 AGTCTACTCATTCT.1 AGTTGTCTAGAACA.1
#> 1 3850.52926 3662.89484 3742.58314 3732.19890
#> 50 2145.75739 2126.18440 2230.92416 2160.79985
#> 39 1132.04891 1190.99693 1136.99996 1138.43168
#> 65 684.36487 747.03776 730.28456 692.34409
#> 59 620.72899 601.52296 610.07311 626.74297
#> 88 445.45120 462.49608 433.76298 427.92113
#> 25 355.02125 425.42225 429.75593 446.08759
#> 2 320.41226 280.83430 254.44757 277.54318
#> 73 107.17623 160.34433 144.25374 157.44268
#> 23 147.36731 185.36917 125.22026 158.45193
#> 57 73.68366 76.92821 70.12335 73.67510
#> 4 79.26575 69.51344 75.13216 94.86931
#> 3 117.22400 54.68391 74.13039 62.57337
#> 22 54.70453 62.09867 77.13568 52.48089
#> 40 64.75230 69.51344 64.11277 68.62886
#> AGTTTGCTCATGCA.1 ATACTCTGCCAAGT.1 ATAGATTGGTGTAC.1 ATCAGGTGCGGAGA.1
#> 1 3598.95146 3861.52507 3712.95257 3684.84776
#> 50 2151.40591 2036.19303 2217.69942 2203.66257
#> 39 1184.35545 1060.69470 1164.70423 1146.58652
#> 65 780.91273 613.41380 693.14552 739.52699
#> 59 613.82165 609.15398 630.88146 613.78609
#> 88 444.99905 472.83980 444.08927 511.48841
#> 25 399.97969 472.83980 444.08927 425.17474
#> 2 304.74643 268.36854 248.14060 269.59702
#> 73 174.01714 236.41990 137.34720 136.39691
#> 23 170.55411 155.48336 165.73228 157.70893
#> 57 91.77023 63.89727 59.51712 67.13285
#> 4 83.11266 112.88518 75.99878 77.78886
#> 3 75.32085 83.06645 70.50489 83.11687
#> 22 58.87147 61.76736 64.09536 59.67365
#> 40 48.48239 57.50754 83.32397 56.47685
#> ATCCCGTGCCAACA.1 CAAAGCACGCTTCC.1 CAAGCTGAGCTTCC.1 CAAGGTTGGGAACG.1
#> 1 3823.53240 3655.08341 3638.06138 3657.20427
#> 50 2158.99362 2194.86149 2140.34343 2163.14214
#> 39 1190.84351 1251.90632 1217.33121 1259.28094
#> 65 709.59930 697.40441 635.65936 708.98352
#> 59 681.29082 590.73072 667.00694 649.59223
#> 88 420.85278 474.99983 447.57385 455.64255
#> 25 368.01028 416.63120 426.67546 405.53115
#> 2 279.31036 258.63338 404.03554 267.26080
#> 73 149.09134 174.09951 144.54720 152.19018
#> 23 143.42965 139.88342 142.80566 136.41437
#> 57 67.94036 70.44489 78.36896 79.80705
#> 4 79.26375 71.45125 90.55969 64.95922
#> 3 54.72973 68.43218 57.47057 101.15079
#> 22 56.61697 61.38769 74.88590 58.46330
#> 40 54.72973 71.45125 67.91977 63.10325
#> CAGACTGAACACCA.1 CAGCGTCTTTCACT.1 CATTACACACCCAA.1 CATTTGACGGTCTA.1
#> 1 3715.69806 3711.63105 3622.15916 3742.63053
#> 50 2182.81283 2206.38856 2096.84700 2194.21820
#> 39 1188.74807 1247.32477 1400.94618 1085.22019
#> 65 672.54233 677.30494 697.72972 718.09048
#> 59 592.89916 593.86097 641.94792 611.56580
#> 88 448.36156 438.89360 428.87974 466.04548
#> 25 439.51231 434.55885 393.21597 443.21876
#> 2 272.35998 252.49928 269.76444 282.48063
#> 73 149.45385 146.29787 162.77312 154.08034
#> 23 163.21934 153.88368 135.33945 142.66698
#> 57 81.60967 82.36028 74.98537 80.84462
#> 4 76.69342 59.60283 64.01190 86.55130
#> 3 68.82743 92.11347 76.81428 80.84462
#> 22 65.87768 63.93759 68.58418 57.06679
#> 40 54.07870 62.85390 57.61071 55.16457
#> CCAAAGTGCTGAAC.1 CCACGGGATTCGGA.1 CCCAACACATCTTC.1 CCTATTGAAGCCAT.1
#> 1 3813.94816 3765.13428 3790.41464 3671.13788
#> 50 2165.18213 2187.73734 2108.92219 2091.82163
#> 39 1110.83257 1158.64321 1230.87667 1344.87229
#> 65 684.07204 729.85399 696.58950 696.97098
#> 59 636.55458 626.76212 666.34683 663.34917
#> 88 451.86410 455.24643 420.37311 452.53135
#> 25 368.48441 388.64725 398.19515 408.00517
#> 2 338.00152 231.72864 246.98180 264.43097
#> 73 97.72458 142.32153 177.42366 139.03072
#> 23 141.65581 121.33823 150.20526 143.57420
#> 57 87.86246 123.16286 73.59050 62.70013
#> 4 75.31068 62.03759 70.56623 73.60450
#> 3 60.96579 69.33613 85.68756 77.23929
#> 22 64.55201 68.42381 66.53387 59.06534
#> 40 84.27624 59.30064 49.39636 60.88273
#> CGACTCTGAAGTGA.1 CGAGGCTGAAGTGA.1 CGCACGGACTCTAT.1 CGCATAGAAGCTAC.1
#> 1 3677.86374 3718.13071 3841.61207 3681.38464
#> 50 2269.64334 2221.08565 2155.18209 2090.32164
#> 39 1150.01049 1123.04424 1215.52270 1321.56699
#> 65 708.44986 733.41665 724.14118 649.55449
#> 59 699.77053 604.23531 558.19216 665.10234
#> 88 432.88131 459.42719 455.82101 426.70202
#> 25 403.58859 387.54402 397.63110 428.42956
#> 2 259.29482 378.16796 280.17367 281.58878
#> 73 141.03902 113.55457 133.62129 138.20308
#> 23 120.42563 137.51562 168.10420 162.38862
#> 57 90.04799 79.17566 58.18992 84.64939
#> 4 75.94409 67.71603 60.34510 79.46677
#> 3 67.26476 59.38175 64.65546 67.37400
#> 22 60.75527 78.13388 62.50028 72.55662
#> 40 68.34968 55.21461 51.72437 60.46385
#> CGGTCACTTACTGG.1 CTATGTACATGTGC.1 GACATTCTACTCTT.1 GATAGCACACCATG.1
#> 1 3644.24751 3751.26648 3703.12656 3685.11091
#> 50 2231.39769 2260.16572 2120.92139 2308.64506
#> 39 1185.58714 1176.65094 1186.92761 1132.93095
#> 65 679.77685 669.47381 706.24383 706.74487
#> 59 622.45839 575.41551 629.59721 546.30803
#> 88 418.32422 446.31587 526.67175 458.27345
#> 25 392.17895 414.96311 399.65736 462.38721
#> 2 367.03928 275.71993 277.02277 267.39474
#> 73 124.69280 136.47675 140.15381 146.45004
#> 23 148.82689 165.06310 173.00236 155.50033
#> 57 69.38551 67.31624 88.69109 69.93401
#> 4 83.46373 79.30406 67.88700 65.82024
#> 3 86.48049 52.56199 84.31128 80.62980
#> 22 67.37433 75.61550 59.12739 59.23822
#> 40 49.27377 60.86126 54.74758 60.88373
#> GATTCTACCCATGA.1 GCTTAACTTCTATC.1 GGACAACTACGTAC.1 GGCGCATGCACTAG.1
#> 1 3464.21213 3766.93854 3645.87095 3694.27792
#> 50 2205.08496 2399.38432 2164.73588 2267.94103
#> 39 1327.92409 1177.87958 1202.34333 1137.70683
#> 65 735.98383 648.68730 743.15693 717.37179
#> 59 703.01864 572.81744 650.80177 670.66790
#> 88 431.41409 407.80050 455.73387 460.50038
#> 25 404.18197 367.96882 387.54641 391.37862
#> 2 277.33763 265.54451 295.19126 240.99209
#> 73 165.54262 170.70718 153.63755 139.17760
#> 23 161.24281 127.08202 171.76333 142.91391
#> 57 73.09674 75.86986 72.50312 83.13293
#> 4 73.81337 106.21780 79.40817 72.85807
#> 3 54.46424 70.17962 68.18745 75.66031
#> 22 79.54645 45.52192 69.05059 56.97875
#> 40 53.74760 49.31541 57.82987 59.78098
#> GGTTGAACGTCGAT.1 GTCATACTAAACAG.1 GTGGAGGATAGAAG.1 GTTATAGAAAACGA.1
#> 1 3618.43371 3739.83042 3795.44448 3637.07900
#> 50 2201.27042 2130.02706 2162.74744 2221.33295
#> 39 1160.88586 1114.62746 1162.03356 1210.59109
#> 65 762.04350 716.75266 705.55203 710.96803
#> 59 636.45058 620.41494 629.32405 590.70478
#> 88 431.08919 492.28577 424.57214 464.25151
#> 25 378.47595 440.26340 426.34488 481.05300
#> 2 260.52044 274.56251 261.47971 275.01375
#> 73 168.02295 165.70088 130.29667 139.71760
#> 23 185.84357 143.54321 123.20569 162.70910
#> 57 86.55728 76.10680 78.88710 83.12313
#> 4 84.01148 60.69277 75.34161 56.59447
#> 3 75.52547 85.74057 67.36426 70.74309
#> 22 78.91987 97.30110 103.70551 65.43736
#> 40 57.70485 49.13224 52.29594 61.90020
#> TAAGCTCTAAGTAG.1 TACATCACGACTAC.1 TCAACACTTCAGAC.1 TCCAGAGATTCTTG.1
#> 1 3751.27681 3636.08433 3756.52188 3778.26520
#> 50 2180.26749 2249.19502 2269.78990 2136.34106
#> 39 1160.36395 1161.45551 1111.37381 1230.29329
#> 65 725.56296 747.44591 659.57193 712.88023
#> 59 634.30843 646.30070 610.56956 554.20689
#> 88 430.32772 378.86593 480.22325 413.93045
#> 25 453.58868 420.00974 367.51779 388.63470
#> 2 249.60797 252.00584 334.19618 423.12891
#> 73 155.66949 204.86189 104.86508 94.28416
#> 23 135.09249 138.86036 150.92731 170.17141
#> 57 68.88822 108.00250 75.46365 55.19073
#> 4 70.67753 86.57344 88.20427 75.88725
#> 3 70.67753 64.28721 67.62327 78.18686
#> 22 69.78287 70.28734 75.46365 62.08957
#> 40 47.41657 58.28707 60.76294 36.79382
#> TCGAATCTAAGGGC.1 TGATACCTCTTCCG.1 TGCCACTGTGCTTT.1
#> 1 3803.30743 3721.74044 3622.12519
#> 50 2210.40930 2216.40234 2269.69815
#> 39 1071.87308 1123.33019 1207.37506
#> 65 680.66572 742.26785 726.30341
#> 59 607.86256 594.75984 586.46913
#> 88 455.23906 423.61273 478.98458
#> 25 454.36191 430.23168 445.59132
#> 2 273.66972 239.22773 264.01546
#> 73 145.60633 174.92937 160.70506
#> 23 142.09774 148.45357 126.26827
#> 57 102.62615 83.20964 63.65590
#> 4 70.17172 62.40723 79.30899
#> 3 66.66314 74.69957 65.74298
#> 22 67.54028 70.91731 62.61236
#> 40 52.62879 71.86287 58.43821
obj = ttm$object
ttm$pca.cell.2
Run this when B cells need to be removed.
pdf(file.path(out_dir,"cleaning",paste(t,"_",n_it,"_plots_before_cells_exlusion.pdf", sep = "")))
ttm$pca.cell.2
ggplot(ttm$D, aes(x=n,y=means)) + geom_point() +
geom_text_repel(data=subset(ttm$D, n > (max(ttm$D$n)- 15) ), aes(n,means,label=rownames(ttm$D[ttm$D$n > (max(ttm$D$n)- 15),])),
nudge_y = 0.05,
nudge_x = 0.05,
direction = "x",
angle = 90,
vjust = 0,
segment.size = 0.2)+
ggtitle("B cell group genes mean expression")+my_theme +
theme(plot.title = element_text(color = "#3C5488FF", size = 20, face = "italic",vjust = - 5,hjust = 0.02 ))
dev.off()
if (length(ttm$cl1) < length(ttm$cl2)) {
to_rem = ttm$cl1
}else{
to_rem = ttm$cl2
}
n_it = n_it+1
obj = drop.genes.cells(object = obj,genes = c(),cells = to_rem)
gc()
ttm = clean(obj)
#ttm = clean.sqrt(obj, cells)
obj = ttm$object
ttm$pca.cell.2
Run this only in the last iteration, instead the previous code, when B cells group has not to be removed
pdf(file.path(out_dir,"cleaning",paste(t,"_",n_it,"_plots_before_cells_exlusion.pdf", sep = "")))
ttm$pca.cell.2
ggplot(ttm$D, aes(x=n,y=means)) + geom_point() +
geom_text_repel(data=subset(ttm$D, n > (max(ttm$D$n)- 15) ), aes(n,means,label=rownames(ttm$D[ttm$D$n > (max(ttm$D$n)- 15),])),
nudge_y = 0.05,
nudge_x = 0.05,
direction = "x",
angle = 90,
vjust = 0,
segment.size = 0.2)+
ggtitle(label = "B cell group genes mean expression", subtitle = " - B group NOT removed -")+my_theme +
theme(plot.title = element_text(color = "#3C5488FF", size = 20, face = "italic",vjust = - 10,hjust = 0.02 ),
plot.subtitle = element_text(color = "darkred",vjust = - 15,hjust = 0.01 ))
dev.off()
#> png
#> 2
To color the pca based on nu_j (so the cells’ efficiency)
nu_est = round(get.nu(object = obj), digits = 7)
plot.nu <-ggplot(ttm$pca_cells,aes(x=PC1,y=PC2, colour = log(nu_est)))
plot.nu = plot.nu + geom_point(size = 1,alpha= 0.8)+
scale_color_gradient2(low = "#E64B35B2",mid = "#4DBBD5B2", high = "#3C5488B2" ,
midpoint = log(mean(nu_est)),name = "ln(nu)")+
ggtitle("Cells PCA coloured by cells efficiency") +
my_theme + theme(plot.title = element_text(color = "#3C5488FF", size = 20),
legend.title=element_text(color = "#3C5488FF", size = 14,face = "italic"),
legend.text = element_text(color = "#3C5488FF", size = 11),
legend.key.width = unit(2, "mm"),
legend.position="right")
pdf(file.path(out_dir,"cleaning",paste(t,"_plots_PCA_efficiency_colored.pdf", sep = "")))
plot.nu
dev.off()
#> png
#> 2
plot.nu
The next part is use to remove the cells with efficiency too low.
nu_df = data.frame("nu"= sort(get.nu(obj)), "n"=c(1:length(get.nu(obj))))
ggplot(nu_df, aes(x = n, y=nu)) +
geom_point(colour = "#8491B4B2", size=1)+
my_theme #+ ylim(0,1) + xlim(0,70)
We can zoom on the smallest values and, if we detect a clear elbow, we can decide to remove the cells.
yset = 0.4#threshold to remove low UDE cells
plot.ude <- ggplot(nu_df, aes(x = n, y=nu)) +
geom_point(colour = "#8491B4B2", size=1) +
my_theme + ylim(0,1) + xlim(0,400) +
geom_hline(yintercept=yset, linetype="dashed", color = "darkred") +
annotate(geom="text", x=200, y=0.25,
label=paste("to remove cells with nu < ",yset,sep = " "),
color="darkred", size=4.5)
pdf(file.path(out_dir,"cleaning",paste(t,"_plots_efficiency.pdf", sep = "")))
plot.ude
#> Warning: Removed 668 rows containing missing values (geom_point).
dev.off()
#> png
#> 2
plot.ude
#> Warning: Removed 668 rows containing missing values (geom_point).
We also save the defined threshold in the metadata and re run the estimation
obj = add.row.to.meta(obj,c("Threshold low UDE cells:",yset))
to_rem = rownames(nu_df[which(nu_df$nu < yset),])
obj = drop.genes.cells(object = obj, genes = c(),cells = to_rem)
Repeat the estimation after the cells are removed
ttm = clean(obj)
#> [1] "Start estimation mu with linear method"
#> [1] 88 1004
#> rowname ATACTCTGCCAAGT.1
#> 1 ERCC-00002 3891.24110
#> 50 ERCC-00096 2051.86238
#> 39 ERCC-00074 1068.85718
#> 65 ERCC-00130 618.13427
#> 59 ERCC-00113 613.84167
#> 25 ERCC-00046 476.47850
#> 88 ERCC-00171 476.47850
#> 2 ERCC-00003 270.43374
#> 73 ERCC-00145 238.23925
#> 23 ERCC-00043 156.67987
#> 4 ERCC-00009 113.75388
#> 3 ERCC-00004 83.70568
#> 11 ERCC-00022 70.82789
#> 57 ERCC-00111 64.38899
#> 22 ERCC-00042 62.24269
obj = ttm$object
ttm$pca.cell.2
In case we do not want to remove anything, we can run:
pdf(file.path(out_dir,"cleaning",paste(t,"_plots_efficiency.pdf", sep = "")))
ggplot(nu_df, aes(x = n, y=nu)) + geom_point(colour = "#8491B4B2", size=1) +my_theme + #xlim(0,100)+
annotate(geom="text", x=50, y=0.25, label="nothing to remove ", color="darkred")
dev.off()
#> png
#> 2
Just to check again, we plot the final efficiency colored PCA
nu_est = round(get.nu(object = obj), digits = 7)
plot.nu <-ggplot(ttm$pca_cells,aes(x=PC1,y=PC2, colour = log(nu_est)))
plot.nu = plot.nu + geom_point(size = 2,alpha= 0.8)+
scale_color_gradient2(low = "#E64B35B2",mid = "#4DBBD5B2", high = "#3C5488B2" ,
midpoint = log(mean(nu_est)),name = "ln(nu)")+
ggtitle("Cells PCA coloured by cells efficiency: last") +
my_theme + theme(plot.title = element_text(color = "#3C5488FF", size = 20),
legend.title=element_text(color = "#3C5488FF", size = 14,face = "italic"),
legend.text = element_text(color = "#3C5488FF", size = 11),
legend.key.width = unit(2, "mm"),
legend.position="right")
pdf(file.path(out_dir,"cleaning",paste(t,"_plots_PCA_efficiency_colored_FINAL.pdf", sep = "")))
plot.nu
dev.off()
#> png
#> 2
plot.nu
In this part all the contingency tables are computed and used to get the statistics (S) To storage efficiency of all the observed tables only the yes_yes is stored. Note that if will be necessary re-computing the yes-yes table, the yes-yes table should be cancelled before running cotan_analysis.
obj = cotan_analysis(obj,cores = 2)
#> [1] "cotan analysis"
#> [1] "mu estimator creation"
#> [1] "save effective constitutive genes"
#> [1] "start a minimization"
#> [1] "Final trance!"
#> [1] "a min: -0.147216796875 | a max 13.5546875 | negative a %: 26.984126984127"
COEX evaluation and storing
obj = get.coex(obj)
#> [1] "coex dataframe creation"
#> [1] "creation of observed yes/yes contingency table"
#> [1] "mu estimator creation"
#> [1] "expected contingency tables creation"
#> [1] "The distance between estimated n of zeros and observed number of zero is 0.0041370202609155 over 63"
#> [1] "Done"
#> [1] "coex estimation"
#> [1] "Cleaning RAM"
# saving the structure
saveRDS(obj,file = paste(out_dir,t,".cotan.RDS", sep = ""))
The next function can directly plot the GDI for the dataset with the 1.5 threshold (in red) and the two higher quantiles (in blue).
plot_GDI(obj, cond = "ERCC")
#> [1] "GDI plot "
#> [1] "function to generate GDI dataframe"
#> [1] "Using S"
#> [1] "function to generate S "
If a more complex plot is needed, or if we want to analyze more in detail the GDI data, we can get directly the GDI dataframe.
quant.p = get.GDI(obj)
#> [1] "function to generate GDI dataframe"
#> [1] "Using S"
#> [1] "function to generate S "
head(quant.p)
#> sum.raw.norm GDI exp.cells
#> ERCC-00012 3.175703 1.1614957 2.390438
#> ERCC-00013 5.995188 1.3512649 27.788845
#> ERCC-00014 6.077256 1.3473405 34.860558
#> ERCC-00016 3.569692 1.1518864 3.685259
#> ERCC-00017 1.609829 0.7957451 0.498008
#> ERCC-00019 7.157739 1.4853109 66.135458
In the third column of this dataframe is reported the percentage of cells expressing the gene.
Next we can define some gene sets (in this case three) that we want to specifically label in the GDI plot.
AA=c("ERCC-00012","ERCC-00013","ERCC-00014")
BB=c("ERCC-00016","ERCC-00017","ERCC-00019")
CC=c("ERCC-00022","ERCC-00024","ERCC-00028")
text.size = 12
quant.p$highlight = with(quant.p, ifelse(rownames(quant.p) %in% AA, "AA",
ifelse(rownames(quant.p) %in% CC,"Constitutive" ,
ifelse(rownames(quant.p) %in% BB,"BB" , "normal"))))
textdf <- quant.p[rownames(quant.p) %in% c(AA,CC,BB), ]
mycolours <- c("Con" = "#00A087FF","AA"="#E64B35FF","BB"="#F39B7FFF","normal" = "#8491B4B2")
f1 = ggplot(subset(quant.p,highlight == "normal" ), aes(x=sum.raw.norm, y=GDI)) + geom_point(alpha = 0.1, color = "#8491B4B2", size=2.5)
GDI_plot = f1 + geom_point(data = subset(quant.p,highlight != "normal" ), aes(x=sum.raw.norm, y=GDI, colour=highlight),size=2.5,alpha = 0.8) +
geom_hline(yintercept=quantile(quant.p$GDI)[4], linetype="dashed", color = "darkblue") +
geom_hline(yintercept=quantile(quant.p$GDI)[3], linetype="dashed", color = "darkblue") +
geom_hline(yintercept=1.5, linetype="dotted", color = "red", size= 0.5) +
scale_color_manual("Status", values = mycolours) +
scale_fill_manual("Status", values = mycolours) +
xlab("log normalized counts")+ylab("GDI")+
geom_label_repel(data = textdf , aes(x=sum.raw.norm, y=GDI, label = rownames(textdf),fill=highlight),
label.size = NA,
alpha = 0.5,
direction ="both",
na.rm=TRUE,
seed = 1234) +
geom_label_repel(data =textdf , aes(x=sum.raw.norm, y=GDI, label = rownames(textdf)),
label.size = NA,
segment.color = 'black',
segment.size = 0.5,
direction = "both",
alpha = 0.8,
na.rm=TRUE,
fill = NA,
seed = 1234) +
theme(axis.text.x = element_text(size = text.size, angle = 0, hjust = .5, vjust = .5, face = "plain", colour ="#3C5488FF" ),
axis.text.y = element_text( size = text.size, angle = 0, hjust = 0, vjust = .5, face = "plain", colour ="#3C5488FF"),
axis.title.x = element_text( size = text.size, angle = 0, hjust = .5, vjust = 0, face = "plain", colour ="#3C5488FF"),
axis.title.y = element_text( size = text.size, angle = 90, hjust = .5, vjust = .5, face = "plain", colour ="#3C5488FF"),
legend.title = element_blank(),
legend.text = element_text(color = "#3C5488FF",face ="italic" ),
legend.position = "bottom")
legend <- cowplot::get_legend(GDI_plot)
GDI_plot =GDI_plot + theme(
legend.position = "none")
GDI_plot
For the Gene Pair Analysis, we can plot an heatmap with the coex values between two genes sets. To do so we need to define, in a list, the different gene sets (list.genes). Then in the function parameter sets we can decide which sets need to be considered (in the example from 1 to 3). In the condition parameter we should insert an array with each file name prefix to be considered (in the example, the file names is “E17.5_cortex”).
list.genes = list("Ref.col"= BB, "AA"=AA, "Const."=CC )
plot_heatmap(df_genes = list.genes,sets = c(1:3),conditions = "ERCC",dir = out_dir)
#> [1] "plot heatmap"
#> [1] "Loading condition ERCC"
#> [1] "ERCC-00016" "ERCC-00017" "ERCC-00019"
#> [1] "Get p-values on a set of genes on columns on a set of genes on rows"
#> [1] "Using function S"
#> [1] "function to generate S "
#> [1] "Ref.col"
#> [1] "AA"
#> [1] "Const."
#> [1] "min coex: 0 max coex 0.98246092315627"
We can also plot a general heatmap of coex values based on some markers like the following one.
plot_general.heatmap(prim.markers = c("ERCC-00014","ERCC-00019"), condition = "ERCC",dir = out_dir, p_value = 0.05)
#> [1] "ploting a general heatmap"
#> NULL
#> [1] "Get p-values genome wide on columns genome wide on rows"
#> [1] "Using function S"
#> [1] "function to generate S "
plot_general.heatmap(prim.markers = c("ERCC-00014","ERCC-00019"),markers.list =c("ERCC-00084" ,"ERCC-00085" ,"ERCC-00086") ,symmetric = FALSE, condition = "ERCC",dir = out_dir, p_value = 0.05)
#> [1] "ploting a general heatmap"
#> NULL
#> [1] "Get p-values genome wide on columns genome wide on rows"
#> [1] "Using function S"
#> [1] "function to generate S "
Sometimes we can also be interested in the numbers present directly in the contingency tables for two specific genes. To get them we can use two functions:
get.observed.ct(object = obj, g1 = "ERCC-00014",g2 = "ERCC-00019")
#> ERCC-00014.yes ERCC-00014.no
#> ERCC-00019.yes 241 423
#> ERCC-00019.no 109 231
get.expected.ct(object = obj, g1 = "ERCC-00014",g2 = "ERCC-00019")
#> ERCC-00014.yes ERCC-00014.no
#> ERCC-00019.yes 235.7119 428.2888
#> ERCC-00019.no 114.2885 225.7108
Another useful function is extract.coex. This can be used to extract the whole or a partial coex matrix from a COTAN object.
# For the whole matrix
coex <- extract.coex(object = obj)
coex[1:5,1:5]
#> ERCC-00012 ERCC-00013 ERCC-00014 ERCC-00016 ERCC-00017
#> ERCC-00012 0.769954876 0.03079731 -0.008469118 0.035409255 -0.004367265
#> ERCC-00013 0.030797310 0.98637914 0.020020333 -0.018534811 -0.013478967
#> ERCC-00014 -0.008469118 0.02002033 0.983490318 -0.003222142 0.094474696
#> ERCC-00016 0.035409255 -0.01853481 -0.003222142 0.982460923 0.028389361
#> ERCC-00017 -0.004367265 -0.01347897 0.094474696 0.028389361 0.185954347
# For a partial matrix
coex <- extract.coex(object = obj,genes = c("ERCC-00017", "ERCC-00019", "ERCC-00024", "ERCC-00025", "ERCC-00028", "ERCC-00031"))
head(coex)
#> ERCC-00017 ERCC-00019 ERCC-00024 ERCC-00025 ERCC-00028
#> ERCC-00012 -0.004367265 5.272805e-02 -0.011980011 0.017574171 -0.0120737279
#> ERCC-00013 -0.013478967 4.360570e-02 -0.005497781 0.068917037 0.0310017944
#> ERCC-00014 0.094474696 2.345844e-02 -0.016243015 -0.006247842 -0.0226624019
#> ERCC-00016 0.028389361 7.232674e-05 -0.004746929 0.033126881 0.0005889499
#> ERCC-00017 0.185954347 1.880656e-02 -0.010473707 0.012328879 0.0162721054
#> ERCC-00019 0.018806561 9.695314e-01 -0.010600940 -0.032637961 0.0473851891
#> ERCC-00031
#> ERCC-00012 0.013066001
#> ERCC-00013 -0.020289792
#> ERCC-00014 -0.054916592
#> ERCC-00016 -0.012314141
#> ERCC-00017 -0.017522836
#> ERCC-00019 -0.003396497
The next few lines are just to clean after compilation of the documentation.
if (file.exists(paste(out_dir,t,".cotan.RDS", sep = ""))) {
#Delete file if it exists
file.remove(paste(out_dir,t,".cotan.RDS", sep = ""))
}
#> [1] TRUE
unlink(paste(out_dir,"cleaning", sep = ""),recursive = TRUE)
print(sessionInfo())
#> R version 4.2.0 RC (2022-04-19 r82224)
#> Platform: x86_64-apple-darwin17.0 (64-bit)
#> Running under: macOS Mojave 10.14.6
#>
#> Matrix products: default
#> BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
#>
#> locale:
#> [1] C/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] dendextend_1.15.2 MASS_7.3-57 htmlwidgets_1.5.4 forcats_0.5.1
#> [5] stringr_1.4.0 dplyr_1.0.8 purrr_0.3.4 readr_2.1.2
#> [9] tidyr_1.2.0 tibble_3.1.6 tidyverse_1.3.1 plotly_4.10.0
#> [13] Rtsne_0.16 factoextra_1.0.7 ggrepel_0.9.1 ggplot2_3.3.5
#> [17] Matrix_1.4-1 data.table_1.14.2 COTAN_1.0.0 BiocStyle_2.24.0
#>
#> loaded via a namespace (and not attached):
#> [1] colorspace_2.0-3 rjson_0.2.21 ellipsis_0.3.2
#> [4] rprojroot_2.0.3 circlize_0.4.14 GlobalOptions_0.1.2
#> [7] fs_1.5.2 clue_0.3-60 rstudioapi_0.13
#> [10] farver_2.1.0 fansi_1.0.3 lubridate_1.8.0
#> [13] xml2_1.3.3 codetools_0.2-18 doParallel_1.0.17
#> [16] knitr_1.38 jsonlite_1.8.0 Cairo_1.5-15
#> [19] broom_0.8.0 cluster_2.1.3 dbplyr_2.1.1
#> [22] png_0.1-7 BiocManager_1.30.17 compiler_4.2.0
#> [25] httr_1.4.2 basilisk_1.8.0 backports_1.4.1
#> [28] assertthat_0.2.1 RcppZiggurat_0.1.6 fastmap_1.1.0
#> [31] lazyeval_0.2.2 cli_3.3.0 htmltools_0.5.2
#> [34] tools_4.2.0 gtable_0.3.0 glue_1.6.2
#> [37] Rcpp_1.0.8.3 cellranger_1.1.0 jquerylib_0.1.4
#> [40] vctrs_0.4.1 iterators_1.0.14 xfun_0.30
#> [43] rvest_1.0.2 lifecycle_1.0.1 scales_1.2.0
#> [46] basilisk.utils_1.8.0 hms_1.1.1 parallel_4.2.0
#> [49] RColorBrewer_1.1-3 ComplexHeatmap_2.12.0 yaml_2.3.5
#> [52] reticulate_1.24 gridExtra_2.3 sass_0.4.1
#> [55] stringi_1.7.6 highr_0.9 S4Vectors_0.34.0
#> [58] foreach_1.5.2 BiocGenerics_0.42.0 filelock_1.0.2
#> [61] shape_1.4.6 rlang_1.0.2 pkgconfig_2.0.3
#> [64] matrixStats_0.62.0 evaluate_0.15 lattice_0.20-45
#> [67] labeling_0.4.2 Rfast_2.0.6 cowplot_1.1.1
#> [70] tidyselect_1.1.2 here_1.0.1 magrittr_2.0.3
#> [73] bookdown_0.26 R6_2.5.1 IRanges_2.30.0
#> [76] magick_2.7.3 generics_0.1.2 DBI_1.1.2
#> [79] pillar_1.7.0 haven_2.5.0 withr_2.5.0
#> [82] dir.expiry_1.4.0 modelr_0.1.8 crayon_1.5.1
#> [85] utf8_1.2.2 tzdb_0.3.0 rmarkdown_2.14
#> [88] viridis_0.6.2 GetoptLong_1.0.5 grid_4.2.0
#> [91] readxl_1.4.0 reprex_2.0.1 digest_0.6.29
#> [94] stats4_4.2.0 munsell_0.5.0 viridisLite_0.4.0
#> [97] bslib_0.3.1