Contents

library(COTAN)
library(data.table)
library(Matrix)
library(ggrepel)
#> Loading required package: ggplot2
library(factoextra)
#> Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(Rtsne)
library(utils)
library(plotly)
#> 
#> Attaching package: 'plotly'
#> The following object is masked from 'package:ggplot2':
#> 
#>     last_plot
#> The following object is masked from 'package:stats':
#> 
#>     filter
#> The following object is masked from 'package:graphics':
#> 
#>     layout
library(tidyverse)
#> ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
#> ✔ tibble  3.1.6     ✔ dplyr   1.0.8
#> ✔ tidyr   1.2.0     ✔ stringr 1.4.0
#> ✔ readr   2.1.2     ✔ forcats 0.5.1
#> ✔ purrr   0.3.4
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::between()   masks data.table::between()
#> ✖ tidyr::expand()    masks Matrix::expand()
#> ✖ dplyr::filter()    masks plotly::filter(), stats::filter()
#> ✖ dplyr::first()     masks data.table::first()
#> ✖ dplyr::lag()       masks stats::lag()
#> ✖ dplyr::last()      masks data.table::last()
#> ✖ tidyr::pack()      masks Matrix::pack()
#> ✖ purrr::transpose() masks data.table::transpose()
#> ✖ tidyr::unpack()    masks Matrix::unpack()
library(htmlwidgets)
library(MASS)
#> 
#> Attaching package: 'MASS'
#> The following object is masked from 'package:dplyr':
#> 
#>     select
#> The following object is masked from 'package:plotly':
#> 
#>     select
library(dendextend)
#> 
#> ---------------------
#> Welcome to dendextend version 1.15.2
#> Type citation('dendextend') for how to cite the package.
#> 
#> Type browseVignettes(package = 'dendextend') for the package vignette.
#> The github page is: https://github.com/talgalili/dendextend/
#> 
#> Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
#> You may ask questions at stackoverflow, use the r and dendextend tags: 
#>   https://stackoverflow.com/questions/tagged/dendextend
#> 
#>  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
#> ---------------------
#> 
#> Attaching package: 'dendextend'
#> The following object is masked from 'package:data.table':
#> 
#>     set
#> The following object is masked from 'package:stats':
#> 
#>     cutree
mycolours <- c("A" = "#8491B4B2","B"="#E64B35FF")
my_theme = theme(axis.text.x = element_text(size = 14, angle = 0, hjust = .5, vjust = .5, face = "plain", colour ="#3C5488FF" ),
                 axis.text.y = element_text( size = 14, angle = 0, hjust = 0, vjust = .5, face = "plain", colour ="#3C5488FF"),
                 axis.title.x = element_text( size = 14, angle = 0, hjust = .5, vjust = 0, face = "plain", colour ="#3C5488FF"),
                 axis.title.y = element_text( size = 14, angle = 90, hjust = .5, vjust = .5, face = "plain", colour ="#3C5488FF"))
data_dir = tempdir()

This is just an example on a toy dataset. If the user want to try on a real dataset it can be used the previous command to download it and the next commented line to use it as input data. There are also available online many other examples at link.

data("ERCCraw", package = "COTAN")

ERCCraw = as.data.frame(ERCCraw)
rownames(ERCCraw) = ERCCraw$V1
ERCCraw = ERCCraw[,2:ncol(ERCCraw)]
ERCCraw[1:5,1:5]
#>            AAACCGTGAAGCCT.1 AAACGCACTGCTGA.1 AAACTTGACCGAAT.1 AAAGACGAGGAAGC.1
#> ERCC-00002             1662             4036             3919             4124
#> ERCC-00003               95              283              298              290
#> ERCC-00004               25               75               70               75
#> ERCC-00009               41               57               78               98
#> ERCC-00012                0                0                0                0
#>            AAAGACGATCCGAA.1
#> ERCC-00002             4220
#> ERCC-00003              312
#> ERCC-00004               87
#> ERCC-00009               87
#> ERCC-00012                0

Define a directory where the output will be stored.

out_dir <- paste0(tempdir(),"/")

1 Analytical pipeline

Inizialise the COTAN object with the row count table and the metadata for the experiment.

obj = new("scCOTAN",raw = ERCCraw)
#obj = initRaw(obj,GEO="GSM2861514" ,sc.method="Drop_seq",cond = "mouse cortex E17.5")
obj = initRaw(obj,GEO="ERCC" ,sc.method="10X",cond = "negative ERCC dataset")
#> [1] "Initializing S4 object"

Now we can start the cleaning. Analysis requires and starts from a matrix of raw UMI counts after removing possible cell doublets or multiplets and low quality or dying cells (with too high mtRNA percentage, easily done with Seurat or other tools).

If we do not want to consider the mitochondrial genes we can remove them before starting the analysis.

genes_to_rem = get.genes(obj)[grep('^mt', get.genes(obj))] 
cells_to_rem = names(get.cell.size(obj)[which(get.cell.size(obj) == 0)])
obj = drop.genes.cells(obj,genes_to_rem,cells_to_rem )

We want also to define a prefix to identify the sample.

#t = "E17.5_cortex"
t = "ERCC"

print(paste("Condition ",t,sep = ""))
#> [1] "Condition ERCC"
#--------------------------------------
n_cells = length(get.cell.size(object = obj))
print(paste("n cells", n_cells, sep = " "))
#> [1] "n cells 1015"

n_it = 1

1.1 Data cleaning

First we create the directory to store all information regarding the data cleaning.

if(!file.exists(out_dir)){
  dir.create(file.path(out_dir))
}

if(!file.exists(paste(out_dir,"cleaning", sep = ""))){   
  dir.create(file.path(out_dir, "cleaning"))
}
ttm = clean(obj)
#> [1] "Start estimation mu with linear method"
#> [1]   88 1015
#>       rowname AACCTACTAGTGTC.1 AACGTTCTTGCCCT.1 AAGAGATGGACGAG.1
#> 1  ERCC-00002       3868.72569       3666.17330       3843.20177
#> 50 ERCC-00096       2155.29007       2103.20280       2149.36255
#> 39 ERCC-00074       1193.60684       1170.86640        972.96247
#> 65 ERCC-00130        738.75666        754.80039        782.79254
#> 59 ERCC-00113        565.81363        774.40559        773.94742
#> 88 ERCC-00171        452.85084        417.15519        411.29777
#> 25 ERCC-00046        397.86895        393.19327        327.26920
#> 2  ERCC-00003        256.91538        281.00794        393.60755
#> 73 ERCC-00145        156.94830        144.86068        172.47971
#> 23 ERCC-00043        126.95818        123.07712        110.56392
#> 57 ERCC-00111         74.97530         79.51000         79.60602
#> 4  ERCC-00009         55.98156         81.68835         48.64812
#> 3  ERCC-00004         56.98123         69.70739         75.18346
#> 22 ERCC-00042         70.97662         68.61822         48.64812
#> 40 ERCC-00076         58.98057         64.26150         53.07068
#>    AAGTAACTGAATAG.1 AATGTAACGAGATA.1 AGCAAGCTCCCACT.1 AGCGATACCTACCC.1
#> 1        3596.41754       3658.35653       3794.25176       3776.94978
#> 50       2231.89998       2293.19618       2171.19647       2065.55255
#> 39       1259.40899       1138.91426       1177.10994       1206.67289
#> 65        725.34382        713.74237        712.58352        713.61235
#> 59        595.61523        676.17698        552.78643        609.69852
#> 88        465.88663        448.22338        428.29336        483.51743
#> 25        438.42583        414.07303        432.93862        400.81050
#> 2         241.46563        275.76410        290.79354        366.87946
#> 73        155.29554        149.40779        194.17204        115.57763
#> 23        145.82630        122.94127        146.79035        148.44833
#> 57         66.28468         68.30071         67.82086         86.94831
#> 4          73.86008         74.27702         47.38169         55.13795
#> 3          64.39084         60.61688         79.89854         74.22417
#> 22         88.06394         58.90936         65.96275         54.07761
#> 40         57.76237         45.24922         75.25328         51.95692
#>    AGCTGAACCTTCCG.1 AGTATCCTTCTATC.1 AGTCTACTCATTCT.1 AGTTGTCTAGAACA.1
#> 1        3850.52926       3662.89484       3742.58314       3732.19890
#> 50       2145.75739       2126.18440       2230.92416       2160.79985
#> 39       1132.04891       1190.99693       1136.99996       1138.43168
#> 65        684.36487        747.03776        730.28456        692.34409
#> 59        620.72899        601.52296        610.07311        626.74297
#> 88        445.45120        462.49608        433.76298        427.92113
#> 25        355.02125        425.42225        429.75593        446.08759
#> 2         320.41226        280.83430        254.44757        277.54318
#> 73        107.17623        160.34433        144.25374        157.44268
#> 23        147.36731        185.36917        125.22026        158.45193
#> 57         73.68366         76.92821         70.12335         73.67510
#> 4          79.26575         69.51344         75.13216         94.86931
#> 3         117.22400         54.68391         74.13039         62.57337
#> 22         54.70453         62.09867         77.13568         52.48089
#> 40         64.75230         69.51344         64.11277         68.62886
#>    AGTTTGCTCATGCA.1 ATACTCTGCCAAGT.1 ATAGATTGGTGTAC.1 ATCAGGTGCGGAGA.1
#> 1        3598.95146       3861.52507       3712.95257       3684.84776
#> 50       2151.40591       2036.19303       2217.69942       2203.66257
#> 39       1184.35545       1060.69470       1164.70423       1146.58652
#> 65        780.91273        613.41380        693.14552        739.52699
#> 59        613.82165        609.15398        630.88146        613.78609
#> 88        444.99905        472.83980        444.08927        511.48841
#> 25        399.97969        472.83980        444.08927        425.17474
#> 2         304.74643        268.36854        248.14060        269.59702
#> 73        174.01714        236.41990        137.34720        136.39691
#> 23        170.55411        155.48336        165.73228        157.70893
#> 57         91.77023         63.89727         59.51712         67.13285
#> 4          83.11266        112.88518         75.99878         77.78886
#> 3          75.32085         83.06645         70.50489         83.11687
#> 22         58.87147         61.76736         64.09536         59.67365
#> 40         48.48239         57.50754         83.32397         56.47685
#>    ATCCCGTGCCAACA.1 CAAAGCACGCTTCC.1 CAAGCTGAGCTTCC.1 CAAGGTTGGGAACG.1
#> 1        3823.53240       3655.08341       3638.06138       3657.20427
#> 50       2158.99362       2194.86149       2140.34343       2163.14214
#> 39       1190.84351       1251.90632       1217.33121       1259.28094
#> 65        709.59930        697.40441        635.65936        708.98352
#> 59        681.29082        590.73072        667.00694        649.59223
#> 88        420.85278        474.99983        447.57385        455.64255
#> 25        368.01028        416.63120        426.67546        405.53115
#> 2         279.31036        258.63338        404.03554        267.26080
#> 73        149.09134        174.09951        144.54720        152.19018
#> 23        143.42965        139.88342        142.80566        136.41437
#> 57         67.94036         70.44489         78.36896         79.80705
#> 4          79.26375         71.45125         90.55969         64.95922
#> 3          54.72973         68.43218         57.47057        101.15079
#> 22         56.61697         61.38769         74.88590         58.46330
#> 40         54.72973         71.45125         67.91977         63.10325
#>    CAGACTGAACACCA.1 CAGCGTCTTTCACT.1 CATTACACACCCAA.1 CATTTGACGGTCTA.1
#> 1        3715.69806       3711.63105       3622.15916       3742.63053
#> 50       2182.81283       2206.38856       2096.84700       2194.21820
#> 39       1188.74807       1247.32477       1400.94618       1085.22019
#> 65        672.54233        677.30494        697.72972        718.09048
#> 59        592.89916        593.86097        641.94792        611.56580
#> 88        448.36156        438.89360        428.87974        466.04548
#> 25        439.51231        434.55885        393.21597        443.21876
#> 2         272.35998        252.49928        269.76444        282.48063
#> 73        149.45385        146.29787        162.77312        154.08034
#> 23        163.21934        153.88368        135.33945        142.66698
#> 57         81.60967         82.36028         74.98537         80.84462
#> 4          76.69342         59.60283         64.01190         86.55130
#> 3          68.82743         92.11347         76.81428         80.84462
#> 22         65.87768         63.93759         68.58418         57.06679
#> 40         54.07870         62.85390         57.61071         55.16457
#>    CCAAAGTGCTGAAC.1 CCACGGGATTCGGA.1 CCCAACACATCTTC.1 CCTATTGAAGCCAT.1
#> 1        3813.94816       3765.13428       3790.41464       3671.13788
#> 50       2165.18213       2187.73734       2108.92219       2091.82163
#> 39       1110.83257       1158.64321       1230.87667       1344.87229
#> 65        684.07204        729.85399        696.58950        696.97098
#> 59        636.55458        626.76212        666.34683        663.34917
#> 88        451.86410        455.24643        420.37311        452.53135
#> 25        368.48441        388.64725        398.19515        408.00517
#> 2         338.00152        231.72864        246.98180        264.43097
#> 73         97.72458        142.32153        177.42366        139.03072
#> 23        141.65581        121.33823        150.20526        143.57420
#> 57         87.86246        123.16286         73.59050         62.70013
#> 4          75.31068         62.03759         70.56623         73.60450
#> 3          60.96579         69.33613         85.68756         77.23929
#> 22         64.55201         68.42381         66.53387         59.06534
#> 40         84.27624         59.30064         49.39636         60.88273
#>    CGACTCTGAAGTGA.1 CGAGGCTGAAGTGA.1 CGCACGGACTCTAT.1 CGCATAGAAGCTAC.1
#> 1        3677.86374       3718.13071       3841.61207       3681.38464
#> 50       2269.64334       2221.08565       2155.18209       2090.32164
#> 39       1150.01049       1123.04424       1215.52270       1321.56699
#> 65        708.44986        733.41665        724.14118        649.55449
#> 59        699.77053        604.23531        558.19216        665.10234
#> 88        432.88131        459.42719        455.82101        426.70202
#> 25        403.58859        387.54402        397.63110        428.42956
#> 2         259.29482        378.16796        280.17367        281.58878
#> 73        141.03902        113.55457        133.62129        138.20308
#> 23        120.42563        137.51562        168.10420        162.38862
#> 57         90.04799         79.17566         58.18992         84.64939
#> 4          75.94409         67.71603         60.34510         79.46677
#> 3          67.26476         59.38175         64.65546         67.37400
#> 22         60.75527         78.13388         62.50028         72.55662
#> 40         68.34968         55.21461         51.72437         60.46385
#>    CGGTCACTTACTGG.1 CTATGTACATGTGC.1 GACATTCTACTCTT.1 GATAGCACACCATG.1
#> 1        3644.24751       3751.26648       3703.12656       3685.11091
#> 50       2231.39769       2260.16572       2120.92139       2308.64506
#> 39       1185.58714       1176.65094       1186.92761       1132.93095
#> 65        679.77685        669.47381        706.24383        706.74487
#> 59        622.45839        575.41551        629.59721        546.30803
#> 88        418.32422        446.31587        526.67175        458.27345
#> 25        392.17895        414.96311        399.65736        462.38721
#> 2         367.03928        275.71993        277.02277        267.39474
#> 73        124.69280        136.47675        140.15381        146.45004
#> 23        148.82689        165.06310        173.00236        155.50033
#> 57         69.38551         67.31624         88.69109         69.93401
#> 4          83.46373         79.30406         67.88700         65.82024
#> 3          86.48049         52.56199         84.31128         80.62980
#> 22         67.37433         75.61550         59.12739         59.23822
#> 40         49.27377         60.86126         54.74758         60.88373
#>    GATTCTACCCATGA.1 GCTTAACTTCTATC.1 GGACAACTACGTAC.1 GGCGCATGCACTAG.1
#> 1        3464.21213       3766.93854       3645.87095       3694.27792
#> 50       2205.08496       2399.38432       2164.73588       2267.94103
#> 39       1327.92409       1177.87958       1202.34333       1137.70683
#> 65        735.98383        648.68730        743.15693        717.37179
#> 59        703.01864        572.81744        650.80177        670.66790
#> 88        431.41409        407.80050        455.73387        460.50038
#> 25        404.18197        367.96882        387.54641        391.37862
#> 2         277.33763        265.54451        295.19126        240.99209
#> 73        165.54262        170.70718        153.63755        139.17760
#> 23        161.24281        127.08202        171.76333        142.91391
#> 57         73.09674         75.86986         72.50312         83.13293
#> 4          73.81337        106.21780         79.40817         72.85807
#> 3          54.46424         70.17962         68.18745         75.66031
#> 22         79.54645         45.52192         69.05059         56.97875
#> 40         53.74760         49.31541         57.82987         59.78098
#>    GGTTGAACGTCGAT.1 GTCATACTAAACAG.1 GTGGAGGATAGAAG.1 GTTATAGAAAACGA.1
#> 1        3618.43371       3739.83042       3795.44448       3637.07900
#> 50       2201.27042       2130.02706       2162.74744       2221.33295
#> 39       1160.88586       1114.62746       1162.03356       1210.59109
#> 65        762.04350        716.75266        705.55203        710.96803
#> 59        636.45058        620.41494        629.32405        590.70478
#> 88        431.08919        492.28577        424.57214        464.25151
#> 25        378.47595        440.26340        426.34488        481.05300
#> 2         260.52044        274.56251        261.47971        275.01375
#> 73        168.02295        165.70088        130.29667        139.71760
#> 23        185.84357        143.54321        123.20569        162.70910
#> 57         86.55728         76.10680         78.88710         83.12313
#> 4          84.01148         60.69277         75.34161         56.59447
#> 3          75.52547         85.74057         67.36426         70.74309
#> 22         78.91987         97.30110        103.70551         65.43736
#> 40         57.70485         49.13224         52.29594         61.90020
#>    TAAGCTCTAAGTAG.1 TACATCACGACTAC.1 TCAACACTTCAGAC.1 TCCAGAGATTCTTG.1
#> 1        3751.27681       3636.08433       3756.52188       3778.26520
#> 50       2180.26749       2249.19502       2269.78990       2136.34106
#> 39       1160.36395       1161.45551       1111.37381       1230.29329
#> 65        725.56296        747.44591        659.57193        712.88023
#> 59        634.30843        646.30070        610.56956        554.20689
#> 88        430.32772        378.86593        480.22325        413.93045
#> 25        453.58868        420.00974        367.51779        388.63470
#> 2         249.60797        252.00584        334.19618        423.12891
#> 73        155.66949        204.86189        104.86508         94.28416
#> 23        135.09249        138.86036        150.92731        170.17141
#> 57         68.88822        108.00250         75.46365         55.19073
#> 4          70.67753         86.57344         88.20427         75.88725
#> 3          70.67753         64.28721         67.62327         78.18686
#> 22         69.78287         70.28734         75.46365         62.08957
#> 40         47.41657         58.28707         60.76294         36.79382
#>    TCGAATCTAAGGGC.1 TGATACCTCTTCCG.1 TGCCACTGTGCTTT.1
#> 1        3803.30743       3721.74044       3622.12519
#> 50       2210.40930       2216.40234       2269.69815
#> 39       1071.87308       1123.33019       1207.37506
#> 65        680.66572        742.26785        726.30341
#> 59        607.86256        594.75984        586.46913
#> 88        455.23906        423.61273        478.98458
#> 25        454.36191        430.23168        445.59132
#> 2         273.66972        239.22773        264.01546
#> 73        145.60633        174.92937        160.70506
#> 23        142.09774        148.45357        126.26827
#> 57        102.62615         83.20964         63.65590
#> 4          70.17172         62.40723         79.30899
#> 3          66.66314         74.69957         65.74298
#> 22         67.54028         70.91731         62.61236
#> 40         52.62879         71.86287         58.43821

obj = ttm$object

ttm$pca.cell.2

Run this when B cells need to be removed.

pdf(file.path(out_dir,"cleaning",paste(t,"_",n_it,"_plots_before_cells_exlusion.pdf", sep = "")))
ttm$pca.cell.2
ggplot(ttm$D, aes(x=n,y=means)) + geom_point() +
  geom_text_repel(data=subset(ttm$D, n > (max(ttm$D$n)- 15) ), aes(n,means,label=rownames(ttm$D[ttm$D$n > (max(ttm$D$n)- 15),])),
                  nudge_y      = 0.05,
                  nudge_x      = 0.05,
                  direction    = "x",
                  angle        = 90,
                  vjust        = 0,
                  segment.size = 0.2)+
  ggtitle("B cell group genes mean expression")+my_theme +
  theme(plot.title = element_text(color = "#3C5488FF", size = 20, face = "italic",vjust = - 5,hjust = 0.02 ))
dev.off()

if (length(ttm$cl1) < length(ttm$cl2)) {
  to_rem = ttm$cl1
}else{
  to_rem = ttm$cl2
}
n_it = n_it+1
obj = drop.genes.cells(object = obj,genes = c(),cells = to_rem)

gc()

ttm = clean(obj)
#ttm = clean.sqrt(obj, cells)
obj = ttm$object

ttm$pca.cell.2

Run this only in the last iteration, instead the previous code, when B cells group has not to be removed

pdf(file.path(out_dir,"cleaning",paste(t,"_",n_it,"_plots_before_cells_exlusion.pdf", sep = "")))
ttm$pca.cell.2
ggplot(ttm$D, aes(x=n,y=means)) + geom_point() +
  geom_text_repel(data=subset(ttm$D, n > (max(ttm$D$n)- 15) ), aes(n,means,label=rownames(ttm$D[ttm$D$n > (max(ttm$D$n)- 15),])),
                  nudge_y      = 0.05,
                  nudge_x      = 0.05,
                  direction    = "x",
                  angle        = 90,
                  vjust        = 0,
                  segment.size = 0.2)+
  ggtitle(label = "B cell group genes mean expression", subtitle = " - B group NOT removed -")+my_theme +
  theme(plot.title = element_text(color = "#3C5488FF", size = 20, face = "italic",vjust = - 10,hjust = 0.02 ),
        plot.subtitle = element_text(color = "darkred",vjust = - 15,hjust = 0.01 ))

dev.off()
#> png 
#>   2

To color the pca based on nu_j (so the cells’ efficiency)

nu_est = round(get.nu(object = obj), digits = 7)

plot.nu <-ggplot(ttm$pca_cells,aes(x=PC1,y=PC2, colour = log(nu_est)))

plot.nu = plot.nu + geom_point(size = 1,alpha= 0.8)+
  scale_color_gradient2(low = "#E64B35B2",mid =  "#4DBBD5B2", high =  "#3C5488B2" ,
                        midpoint = log(mean(nu_est)),name = "ln(nu)")+
  ggtitle("Cells PCA coloured by cells efficiency") +
  my_theme +  theme(plot.title = element_text(color = "#3C5488FF", size = 20),
                    legend.title=element_text(color = "#3C5488FF", size = 14,face = "italic"),
                    legend.text = element_text(color = "#3C5488FF", size = 11),
                    legend.key.width = unit(2, "mm"),
                    legend.position="right")

pdf(file.path(out_dir,"cleaning",paste(t,"_plots_PCA_efficiency_colored.pdf", sep = "")))
plot.nu
dev.off()
#> png 
#>   2

plot.nu

The next part is use to remove the cells with efficiency too low.

nu_df = data.frame("nu"= sort(get.nu(obj)), "n"=c(1:length(get.nu(obj))))

ggplot(nu_df, aes(x = n, y=nu)) + 
    geom_point(colour = "#8491B4B2", size=1)+
    my_theme #+ ylim(0,1) + xlim(0,70)

We can zoom on the smallest values and, if we detect a clear elbow, we can decide to remove the cells.

yset = 0.4#threshold to remove low UDE cells
plot.ude <- ggplot(nu_df, aes(x = n, y=nu)) + 
    geom_point(colour = "#8491B4B2", size=1) + 
    my_theme + ylim(0,1) + xlim(0,400) +
    geom_hline(yintercept=yset, linetype="dashed", color = "darkred") +
    annotate(geom="text", x=200, y=0.25, 
             label=paste("to remove cells with nu < ",yset,sep = " "), 
             color="darkred", size=4.5)


pdf(file.path(out_dir,"cleaning",paste(t,"_plots_efficiency.pdf", sep = "")))
plot.ude
#> Warning: Removed 668 rows containing missing values (geom_point).
dev.off()
#> png 
#>   2

plot.ude
#> Warning: Removed 668 rows containing missing values (geom_point).

We also save the defined threshold in the metadata and re run the estimation

obj = add.row.to.meta(obj,c("Threshold low UDE cells:",yset)) 

to_rem = rownames(nu_df[which(nu_df$nu < yset),])

obj = drop.genes.cells(object = obj, genes = c(),cells =  to_rem)

Repeat the estimation after the cells are removed

ttm = clean(obj)
#> [1] "Start estimation mu with linear method"
#> [1]   88 1004
#>       rowname ATACTCTGCCAAGT.1
#> 1  ERCC-00002       3891.24110
#> 50 ERCC-00096       2051.86238
#> 39 ERCC-00074       1068.85718
#> 65 ERCC-00130        618.13427
#> 59 ERCC-00113        613.84167
#> 25 ERCC-00046        476.47850
#> 88 ERCC-00171        476.47850
#> 2  ERCC-00003        270.43374
#> 73 ERCC-00145        238.23925
#> 23 ERCC-00043        156.67987
#> 4  ERCC-00009        113.75388
#> 3  ERCC-00004         83.70568
#> 11 ERCC-00022         70.82789
#> 57 ERCC-00111         64.38899
#> 22 ERCC-00042         62.24269
obj = ttm$object
ttm$pca.cell.2

In case we do not want to remove anything, we can run:

pdf(file.path(out_dir,"cleaning",paste(t,"_plots_efficiency.pdf", sep = "")))
ggplot(nu_df, aes(x = n, y=nu)) + geom_point(colour = "#8491B4B2", size=1) +my_theme + #xlim(0,100)+
  annotate(geom="text", x=50, y=0.25, label="nothing to remove ", color="darkred")
dev.off()
#> png 
#>   2

Just to check again, we plot the final efficiency colored PCA

nu_est = round(get.nu(object = obj), digits = 7)
plot.nu <-ggplot(ttm$pca_cells,aes(x=PC1,y=PC2, colour = log(nu_est)))
plot.nu = plot.nu + geom_point(size = 2,alpha= 0.8)+
  scale_color_gradient2(low = "#E64B35B2",mid =  "#4DBBD5B2", high =  "#3C5488B2" ,
                        midpoint = log(mean(nu_est)),name = "ln(nu)")+
  ggtitle("Cells PCA coloured by cells efficiency: last") +
  my_theme +  theme(plot.title = element_text(color = "#3C5488FF", size = 20),
                    legend.title=element_text(color = "#3C5488FF", size = 14,face = "italic"),
                    legend.text = element_text(color = "#3C5488FF", size = 11),
                    legend.key.width = unit(2, "mm"),
                    legend.position="right")

pdf(file.path(out_dir,"cleaning",paste(t,"_plots_PCA_efficiency_colored_FINAL.pdf", sep = "")))
plot.nu
dev.off()
#> png 
#>   2

plot.nu

1.2 COTAN analysis

In this part all the contingency tables are computed and used to get the statistics (S) To storage efficiency of all the observed tables only the yes_yes is stored. Note that if will be necessary re-computing the yes-yes table, the yes-yes table should be cancelled before running cotan_analysis.

obj = cotan_analysis(obj,cores = 2)
#> [1] "cotan analysis"
#> [1] "mu estimator creation"
#> [1] "save effective constitutive genes"
#> [1] "start a minimization"
#> [1] "Final trance!"
#> [1] "a min: -0.147216796875 | a max 13.5546875 | negative a %: 26.984126984127"

COEX evaluation and storing

obj = get.coex(obj)
#> [1] "coex dataframe creation"
#> [1] "creation of observed yes/yes contingency table"
#> [1] "mu estimator creation"
#> [1] "expected contingency tables creation"
#> [1] "The distance between estimated n of zeros and observed number of zero is 0.0041370202609155 over 63"
#> [1] "Done"
#> [1] "coex estimation"
#> [1] "Cleaning RAM"

# saving the structure 
saveRDS(obj,file = paste(out_dir,t,".cotan.RDS", sep = ""))

2 Analysis of the elaborated data

2.1 GDI

The next function can directly plot the GDI for the dataset with the 1.5 threshold (in red) and the two higher quantiles (in blue).

plot_GDI(obj, cond = "ERCC")
#> [1] "GDI plot "
#> [1] "function to generate GDI dataframe"
#> [1] "Using S"
#> [1] "function to generate S "

If a more complex plot is needed, or if we want to analyze more in detail the GDI data, we can get directly the GDI dataframe.

quant.p = get.GDI(obj)
#> [1] "function to generate GDI dataframe"
#> [1] "Using S"
#> [1] "function to generate S "

head(quant.p)
#>            sum.raw.norm       GDI exp.cells
#> ERCC-00012     3.175703 1.1614957  2.390438
#> ERCC-00013     5.995188 1.3512649 27.788845
#> ERCC-00014     6.077256 1.3473405 34.860558
#> ERCC-00016     3.569692 1.1518864  3.685259
#> ERCC-00017     1.609829 0.7957451  0.498008
#> ERCC-00019     7.157739 1.4853109 66.135458

In the third column of this dataframe is reported the percentage of cells expressing the gene.

Next we can define some gene sets (in this case three) that we want to specifically label in the GDI plot.

AA=c("ERCC-00012","ERCC-00013","ERCC-00014") 
BB=c("ERCC-00016","ERCC-00017","ERCC-00019")
CC=c("ERCC-00022","ERCC-00024","ERCC-00028")

text.size = 12

quant.p$highlight = with(quant.p, ifelse(rownames(quant.p) %in% AA, "AA",
                                                 ifelse(rownames(quant.p) %in% CC,"Constitutive" ,
                                                               ifelse(rownames(quant.p) %in% BB,"BB" , "normal"))))

textdf <- quant.p[rownames(quant.p) %in% c(AA,CC,BB), ]
mycolours <- c("Con" = "#00A087FF","AA"="#E64B35FF","BB"="#F39B7FFF","normal" = "#8491B4B2")
f1 = ggplot(subset(quant.p,highlight == "normal" ), aes(x=sum.raw.norm, y=GDI)) +  geom_point(alpha = 0.1, color = "#8491B4B2", size=2.5)
GDI_plot = f1 +  geom_point(data = subset(quant.p,highlight != "normal"  ), aes(x=sum.raw.norm, y=GDI, colour=highlight),size=2.5,alpha = 0.8) +
  geom_hline(yintercept=quantile(quant.p$GDI)[4], linetype="dashed", color = "darkblue") +
  geom_hline(yintercept=quantile(quant.p$GDI)[3], linetype="dashed", color = "darkblue") +
  geom_hline(yintercept=1.5, linetype="dotted", color = "red", size= 0.5) +
  scale_color_manual("Status", values = mycolours)  +
  scale_fill_manual("Status", values = mycolours)  +
  xlab("log normalized counts")+ylab("GDI")+
  geom_label_repel(data = textdf , aes(x=sum.raw.norm, y=GDI, label = rownames(textdf),fill=highlight),
                   label.size = NA, 
                   alpha = 0.5, 
                   direction ="both",
                   na.rm=TRUE,
                   seed = 1234) +
  geom_label_repel(data =textdf , aes(x=sum.raw.norm, y=GDI, label = rownames(textdf)),
                   label.size = NA, 
                   segment.color = 'black',
                   segment.size = 0.5,
                   direction = "both",
                   alpha = 0.8, 
                   na.rm=TRUE,
                   fill = NA,
                   seed = 1234) +
  theme(axis.text.x = element_text(size = text.size, angle = 0, hjust = .5, vjust = .5, face = "plain", colour ="#3C5488FF" ),
        axis.text.y = element_text( size = text.size, angle = 0, hjust = 0, vjust = .5, face = "plain", colour ="#3C5488FF"),  
        axis.title.x = element_text( size = text.size, angle = 0, hjust = .5, vjust = 0, face = "plain", colour ="#3C5488FF"),
        axis.title.y = element_text( size = text.size, angle = 90, hjust = .5, vjust = .5, face = "plain", colour ="#3C5488FF"),
        legend.title = element_blank(),
        legend.text = element_text(color = "#3C5488FF",face ="italic" ),
        legend.position = "bottom") 
legend <- cowplot::get_legend(GDI_plot)
GDI_plot =GDI_plot + theme(
        legend.position = "none") 
GDI_plot

2.2 Heatmaps

For the Gene Pair Analysis, we can plot an heatmap with the coex values between two genes sets. To do so we need to define, in a list, the different gene sets (list.genes). Then in the function parameter sets we can decide which sets need to be considered (in the example from 1 to 3). In the condition parameter we should insert an array with each file name prefix to be considered (in the example, the file names is “E17.5_cortex”).

list.genes = list("Ref.col"= BB, "AA"=AA, "Const."=CC )

plot_heatmap(df_genes = list.genes,sets = c(1:3),conditions = "ERCC",dir = out_dir)
#> [1] "plot heatmap"
#> [1] "Loading condition ERCC"
#> [1] "ERCC-00016" "ERCC-00017" "ERCC-00019"
#> [1] "Get p-values on a set of genes on columns on a set of genes on rows"
#> [1] "Using function S"
#> [1] "function to generate S "
#> [1] "Ref.col"
#> [1] "AA"
#> [1] "Const."
#> [1] "min coex: 0 max coex 0.98246092315627"

We can also plot a general heatmap of coex values based on some markers like the following one.

plot_general.heatmap(prim.markers = c("ERCC-00014","ERCC-00019"), condition = "ERCC",dir = out_dir, p_value = 0.05)
#> [1] "ploting a general heatmap"
#> NULL
#> [1] "Get p-values genome wide on columns genome wide on rows"
#> [1] "Using function S"
#> [1] "function to generate S "

plot_general.heatmap(prim.markers = c("ERCC-00014","ERCC-00019"),markers.list =c("ERCC-00084" ,"ERCC-00085" ,"ERCC-00086") ,symmetric = FALSE, condition = "ERCC",dir = out_dir, p_value = 0.05)
#> [1] "ploting a general heatmap"
#> NULL
#> [1] "Get p-values genome wide on columns genome wide on rows"
#> [1] "Using function S"
#> [1] "function to generate S "

2.3 Get data tables

Sometimes we can also be interested in the numbers present directly in the contingency tables for two specific genes. To get them we can use two functions:

  1. get.observed.ct to produce the observed data
 get.observed.ct(object = obj, g1 = "ERCC-00014",g2 = "ERCC-00019")
#>                ERCC-00014.yes ERCC-00014.no
#> ERCC-00019.yes            241           423
#> ERCC-00019.no             109           231
  1. get.expected.ct to produce the expected values
get.expected.ct(object = obj, g1 = "ERCC-00014",g2 = "ERCC-00019")
#>                ERCC-00014.yes ERCC-00014.no
#> ERCC-00019.yes       235.7119      428.2888
#> ERCC-00019.no        114.2885      225.7108

Another useful function is extract.coex. This can be used to extract the whole or a partial coex matrix from a COTAN object.

# For the whole matrix
coex <- extract.coex(object = obj)
coex[1:5,1:5]
#>              ERCC-00012  ERCC-00013   ERCC-00014   ERCC-00016   ERCC-00017
#> ERCC-00012  0.769954876  0.03079731 -0.008469118  0.035409255 -0.004367265
#> ERCC-00013  0.030797310  0.98637914  0.020020333 -0.018534811 -0.013478967
#> ERCC-00014 -0.008469118  0.02002033  0.983490318 -0.003222142  0.094474696
#> ERCC-00016  0.035409255 -0.01853481 -0.003222142  0.982460923  0.028389361
#> ERCC-00017 -0.004367265 -0.01347897  0.094474696  0.028389361  0.185954347
# For a partial matrix
coex <- extract.coex(object = obj,genes = c("ERCC-00017", "ERCC-00019", "ERCC-00024", "ERCC-00025", "ERCC-00028", "ERCC-00031"))
head(coex)
#>              ERCC-00017   ERCC-00019   ERCC-00024   ERCC-00025    ERCC-00028
#> ERCC-00012 -0.004367265 5.272805e-02 -0.011980011  0.017574171 -0.0120737279
#> ERCC-00013 -0.013478967 4.360570e-02 -0.005497781  0.068917037  0.0310017944
#> ERCC-00014  0.094474696 2.345844e-02 -0.016243015 -0.006247842 -0.0226624019
#> ERCC-00016  0.028389361 7.232674e-05 -0.004746929  0.033126881  0.0005889499
#> ERCC-00017  0.185954347 1.880656e-02 -0.010473707  0.012328879  0.0162721054
#> ERCC-00019  0.018806561 9.695314e-01 -0.010600940 -0.032637961  0.0473851891
#>              ERCC-00031
#> ERCC-00012  0.013066001
#> ERCC-00013 -0.020289792
#> ERCC-00014 -0.054916592
#> ERCC-00016 -0.012314141
#> ERCC-00017 -0.017522836
#> ERCC-00019 -0.003396497

The next few lines are just to clean after compilation of the documentation.

if (file.exists(paste(out_dir,t,".cotan.RDS", sep = ""))) {
  #Delete file if it exists
  file.remove(paste(out_dir,t,".cotan.RDS", sep = ""))
}
#> [1] TRUE
unlink(paste(out_dir,"cleaning", sep = ""),recursive = TRUE)

print(sessionInfo())
#> R version 4.2.0 RC (2022-04-19 r82224)
#> Platform: x86_64-apple-darwin17.0 (64-bit)
#> Running under: macOS Mojave 10.14.6
#> 
#> Matrix products: default
#> BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
#> 
#> locale:
#> [1] C/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#>  [1] dendextend_1.15.2 MASS_7.3-57       htmlwidgets_1.5.4 forcats_0.5.1    
#>  [5] stringr_1.4.0     dplyr_1.0.8       purrr_0.3.4       readr_2.1.2      
#>  [9] tidyr_1.2.0       tibble_3.1.6      tidyverse_1.3.1   plotly_4.10.0    
#> [13] Rtsne_0.16        factoextra_1.0.7  ggrepel_0.9.1     ggplot2_3.3.5    
#> [17] Matrix_1.4-1      data.table_1.14.2 COTAN_1.0.0       BiocStyle_2.24.0 
#> 
#> loaded via a namespace (and not attached):
#>  [1] colorspace_2.0-3      rjson_0.2.21          ellipsis_0.3.2       
#>  [4] rprojroot_2.0.3       circlize_0.4.14       GlobalOptions_0.1.2  
#>  [7] fs_1.5.2              clue_0.3-60           rstudioapi_0.13      
#> [10] farver_2.1.0          fansi_1.0.3           lubridate_1.8.0      
#> [13] xml2_1.3.3            codetools_0.2-18      doParallel_1.0.17    
#> [16] knitr_1.38            jsonlite_1.8.0        Cairo_1.5-15         
#> [19] broom_0.8.0           cluster_2.1.3         dbplyr_2.1.1         
#> [22] png_0.1-7             BiocManager_1.30.17   compiler_4.2.0       
#> [25] httr_1.4.2            basilisk_1.8.0        backports_1.4.1      
#> [28] assertthat_0.2.1      RcppZiggurat_0.1.6    fastmap_1.1.0        
#> [31] lazyeval_0.2.2        cli_3.3.0             htmltools_0.5.2      
#> [34] tools_4.2.0           gtable_0.3.0          glue_1.6.2           
#> [37] Rcpp_1.0.8.3          cellranger_1.1.0      jquerylib_0.1.4      
#> [40] vctrs_0.4.1           iterators_1.0.14      xfun_0.30            
#> [43] rvest_1.0.2           lifecycle_1.0.1       scales_1.2.0         
#> [46] basilisk.utils_1.8.0  hms_1.1.1             parallel_4.2.0       
#> [49] RColorBrewer_1.1-3    ComplexHeatmap_2.12.0 yaml_2.3.5           
#> [52] reticulate_1.24       gridExtra_2.3         sass_0.4.1           
#> [55] stringi_1.7.6         highr_0.9             S4Vectors_0.34.0     
#> [58] foreach_1.5.2         BiocGenerics_0.42.0   filelock_1.0.2       
#> [61] shape_1.4.6           rlang_1.0.2           pkgconfig_2.0.3      
#> [64] matrixStats_0.62.0    evaluate_0.15         lattice_0.20-45      
#> [67] labeling_0.4.2        Rfast_2.0.6           cowplot_1.1.1        
#> [70] tidyselect_1.1.2      here_1.0.1            magrittr_2.0.3       
#> [73] bookdown_0.26         R6_2.5.1              IRanges_2.30.0       
#> [76] magick_2.7.3          generics_0.1.2        DBI_1.1.2            
#> [79] pillar_1.7.0          haven_2.5.0           withr_2.5.0          
#> [82] dir.expiry_1.4.0      modelr_0.1.8          crayon_1.5.1         
#> [85] utf8_1.2.2            tzdb_0.3.0            rmarkdown_2.14       
#> [88] viridis_0.6.2         GetoptLong_1.0.5      grid_4.2.0           
#> [91] readxl_1.4.0          reprex_2.0.1          digest_0.6.29        
#> [94] stats4_4.2.0          munsell_0.5.0         viridisLite_0.4.0    
#> [97] bslib_0.3.1