We downloaded the data from (Mahata et al. 2014) (ArrayExpress: E-MTAB-2512). This consists of 96 single-cell RNA-seq samples.

The metadata of the ArrayExpress accession (downloaded from http://www.ebi.ac.uk/arrayexpress/files/E-MTAB-2512/E-MTAB-2512.sdrf.txt) indicate that there are two cells of low quality (noted as “low read count”) and three cells for which doublets were captured on the Fluidigm platform (noted as “multiple cells”). Removing these five cells from the 96 leaves 91, in agreement with (Mahata et al. 2014) and (Brennecke et al. 2013), who analyzed 91 cells.

# Fetch the SDRF sample sheet of the ArrayExpress accession and read it as a
# tab-separated table with a header row. The file is downloaded to a temporary
# path (so an existing file in the working directory is never read by mistake
# or deleted) and removed again once it has been parsed. download.file() is
# used instead of shelling out to wget so the chunk also runs where wget is
# not installed, and so that a failed download stops with an error here.
sdrf_url <- "http://www.ebi.ac.uk/arrayexpress/files/E-MTAB-2512/E-MTAB-2512.sdrf.txt"
sdrf_file <- tempfile(fileext = ".sdrf.txt")
download.file(sdrf_url, sdrf_file, quiet = TRUE)
metadata <- read.table(sdrf_file, header=TRUE, sep='\t')
unlink(sdrf_file)

# Names (first SDRF column) of the cells annotated as "multiple cells".
unique(as.character(
  metadata[metadata$Characteristics.single.cell.well.quality. %in% "multiple cells", 1]
))
## [1] "Cell 74" "Cell 92" "Cell 93"
# Names (first SDRF column) of the cells annotated as "low read count".
unique(as.character(
  metadata[metadata$Comment.read.count. %in% "low read count", 1]
))
## [1] "Cell 6" "Cell 7"

The data were also analyzed by (Buettner et al. 2015). In this paper the authors analyzed only 81 cells, because they removed “duplets and cells with low yield or poor quality cDNA.” The data used in (Buettner et al. 2015) can be downloaded from the scLVM github repository (https://github.com/PMBio/scLVM/blob/master/data/Tcell/data_Tcells.Rdata).

# Fetch the Rdata file distributed with scLVM and load its objects into the
# current environment (the code below uses `dataMouse` from it). The file is
# binary, hence mode = "wb"; it is written to a temporary path and removed
# after loading. download.file() replaces the wget system call so the chunk
# does not depend on an external program and stops with an error if the
# download fails.
rdata_url <- "https://github.com/PMBio/scLVM/raw/master/data/Tcell/data_Tcells.Rdata"
rdata_file <- tempfile(fileext = ".Rdata")
download.file(rdata_url, rdata_file, mode = "wb", quiet = TRUE)
load(rdata_file)
unlink(rdata_file)

# Number of rows and columns of the scLVM object.
c(nrow(dataMouse), ncol(dataMouse))
## [1] 38385    81
# Column labels of the scLVM object.
dimnames(dataMouse)[[2]]
##  [1] "X9108_01.txt" "X9108_02.txt" "X9108_03.txt" "X9108_04.txt"
##  [5] "X9108_05.txt" "X9108_06.txt" "X9108_07.txt" "X9108_08.txt"
##  [9] "X9108_09.txt" "X9108_10.txt" "X9108_12.txt" "X9108_13.txt"
## [13] "X9108_14.txt" "X9108_15.txt" "X9108_16.txt" "X9108_17.txt"
## [17] "X9108_18.txt" "X9108_20.txt" "X9108_21.txt" "X9108_22.txt"
## [21] "X9108_24.txt" "X9108_25.txt" "X9108_27.txt" "X9108_28.txt"
## [25] "X9108_29.txt" "X9108_30.txt" "X9108_31.txt" "X9108_32.txt"
## [29] "X9108_33.txt" "X9108_34.txt" "X9108_36.txt" "X9108_37.txt"
## [33] "X9108_39.txt" "X9108_40.txt" "X9108_41.txt" "X9108_42.txt"
## [37] "X9108_43.txt" "X9108_44.txt" "X9108_45.txt" "X9108_46.txt"
## [41] "X9108_47.txt" "X9108_48.txt" "X9108_49.txt" "X9108_50.txt"
## [45] "X9108_51.txt" "X9108_52.txt" "X9108_54.txt" "X9108_55.txt"
## [49] "X9108_56.txt" "X9108_57.txt" "X9108_58.txt" "X9108_59.txt"
## [53] "X9108_60.txt" "X9108_61.txt" "X9108_62.txt" "X9108_64.txt"
## [57] "X9108_65.txt" "X9108_66.txt" "X9108_67.txt" "X9108_70.txt"
## [61] "X9108_71.txt" "X9108_72.txt" "X9108_75.txt" "X9108_76.txt"
## [65] "X9108_77.txt" "X9108_78.txt" "X9108_79.txt" "X9108_80.txt"
## [69] "X9108_82.txt" "X9108_83.txt" "X9108_84.txt" "X9108_85.txt"
## [73] "X9108_86.txt" "X9108_88.txt" "X9108_90.txt" "X9108_91.txt"
## [77] "X9108_92.txt" "X9108_93.txt" "X9108_94.txt" "X9108_95.txt"
## [81] "X9108_96.txt"

As expected, the data consist of 81 cells. The column names of the R object suggest that the number after the underscore is the ID of the cell. To make sure that this is the case, we can look at the correlation between the values of the Rdata object and our version of the data, obtained from the reanalysis of the data downloaded from ArrayExpress.

To do this, we first have to map the ENSEMBL IDs to the gene symbols.

library(scRNAseq)
## Loading required package: SummarizedExperiment
## Loading required package: GenomicRanges
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colnames,
##     do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff,
##     sort, table, tapply, union, unique, unsplit
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     colMeans, colSums, expand.grid, rowMeans, rowSums
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
# Load the th2 data set.
data(th2)

# Annotation table read from disk: the first column supplies the names and the
# second column the values of the lookup vector `ids` (per the printed head:
# ENSEMBL gene ID -> gene symbol).
ens2sym <- read.table("../tmp/ens2symbols.txt", stringsAsFactors = FALSE)
ids <- setNames(ens2sym[[2]], ens2sym[[1]])
head(ids)
## ENSMUSG00000000001 ENSMUSG00000000003 ENSMUSG00000000028 
##            "Gnai3"             "Pbsn"            "Cdc45" 
## ENSMUSG00000000031 ENSMUSG00000000037 ENSMUSG00000000049 
##              "H19"            "Scml2"             "Apoh"
## only genes present in annotation: keep the annotation entries whose ENSEMBL
## ID is a row of dataMouse, so that an ID absent from dataMouse can never be
## chosen as the representative of a symbol below
ids_in_data <- ids[names(ids) %in% rownames(dataMouse)]
common <- intersect(rownames(th2), ids_in_data)

## not 1to1 correspondence; take the first ENSEMBL ID listed for each symbol
common_sym <- ids_in_data[ids_in_data %in% common]
common_sym <- common_sym[!duplicated(common_sym)]
## sym: gene symbols ordered by symbol, named by the ENSEMBL ID chosen for each
sym <- common_sym[order(common_sym)]
ens <- names(sym)

## sanity checks: every selected ID / symbol must exist in its data set
stopifnot(all(ens %in% rownames(dataMouse)))
stopifnot(all(sym %in% rownames(th2)))

## reduce to same genes: both matrices get the same rows, in the same order,
## labelled by gene symbol
dataMouse <- dataMouse[ens,]
counts <- assay(th2)[sym,]
rownames(dataMouse) <- sym

## neither matrix may contain missing values after subsetting
stopifnot(all(!is.na(counts)))
stopifnot(all(!is.na(dataMouse)))

## reorder the cells: label each column with the second space-separated token
## of the cell id, then sort the columns by the numeric value of that label.
## A plain " " with fixed=TRUE replaces the obscure "\ " escape, and vapply
## guarantees that exactly one label is produced per cell.
colnames(counts) <- vapply(strsplit(as.character(th2$id), " ", fixed=TRUE),
                           function(x) x[2], character(1))
counts <- counts[,order(as.numeric(colnames(counts)))]

We can now plot the correlation between the two matrices.

# One ECDF panel per re-processed cell, eight panels (2 x 4) per page.
par(mfrow=c(2, 4))
# Correlations of the log1p-transformed values between every column of
# `counts` (rows of cor_mat) and every column of `dataMouse` (columns of
# cor_mat), computed in a single call instead of one cor() call per pair.
cor_mat <- cor(log1p(counts), log1p(as.matrix(dataMouse)))
for(j in seq_len(NCOL(counts))) {
  cors <- cor_mat[j, ]
  # The title shows the label of the j-th column of `counts` (rather than the
  # loop index, which is only correct when the labels are exactly 1..n) and
  # the dataMouse column with the highest correlation.
  plot(ecdf(cors), main=paste("Cell", colnames(counts)[j], ":", colnames(dataMouse)[which.max(cors)]))
}

# Back to a single panel for the plots that follow.
par(mfrow=c(1, 1))

The title of each plot is the ID of our re-processed data matched with the ID of the data from the github repository that has the highest correlation.

Note that for the 81 retained cells, we can perfectly match the IDs in the two datasets.

This means that the cells that were filtered out by (Buettner et al. 2015) are:

# Numeric cell number of each dataMouse column: the second "_"-separated piece
# of the column name, cut at its first ".".
idx <- as.numeric(vapply(
  strsplit(colnames(dataMouse), "_"),
  function(parts) strsplit(parts[2], ".", fixed=TRUE)[[1]][1],
  character(1)
))
# Cell numbers in 1..96 that have no column in dataMouse.
print(setdiff(1:96, idx))
##  [1] 11 19 23 26 35 38 53 63 68 69 73 74 81 87 89

These IDs do not match the ones that are marked as doublets and low quality in the ArrayExpress metadata: only Cell 74 appears in both lists.

To compare these groups of cells, we can look at the total number of reads and the percentage of mapped reads of each cell:

# QC columns of the th2 column data, as selected by metadata(th2)$which_qc.
qc <- as.matrix(colData(th2)[,metadata(th2)$which_qc])

# Numeric cell number: second space-separated token of each cell id.
ids <- as.numeric(vapply(strsplit(as.character(colData(th2)$id), " ", fixed=TRUE),
                         function(x) x[2], character(1)))

# Highlight a group of cells on the current plot. x and y are passed
# separately so that a group containing a single cell is still drawn as one
# point (indexing qc[sel, c(1, 3)] would drop to a plain vector in that case).
mark_cells <- function(sel, col) {
  points(qc[sel, 1], qc[sel, 3], pch=19, col=col)
}

# QC column 1 against QC column 3 for all cells (axis labels as given below).
plot(qc[,1], qc[,3], xlab="Total number of reads", ylab="Percentage of mapped reads")
# Groups are drawn in this order, so later groups overplot earlier ones.
mark_cells(which(!(ids %in% idx)), 2) #filtered
mark_cells(which(ids %in% c(74, 92, 93)), 4) #doublets
mark_cells(which(ids %in% c(6, 7)), 6) #low reads
legend("bottomright", c("Filtered by Buettner", "Doublets", "Low reads"), fill=c(2, 4, 6))

References

Brennecke, Philip, Simon Anders, Jong Kyoung Kim, Aleksandra A Kołodziejczyk, Xiuwei Zhang, Valentina Proserpio, Bianka Baying, et al. 2013. “Accounting for Technical Noise in Single-Cell RNA-seq Experiments.” Nature Methods 10 (11): 1093–5.

Buettner, Florian, Kedar N Natarajan, F Paolo Casale, Valentina Proserpio, Antonio Scialdone, Fabian J Theis, Sarah A Teichmann, John C Marioni, and Oliver Stegle. 2015. “Computational Analysis of Cell-to-Cell Heterogeneity in Single-Cell RNA-sequencing Data Reveals Hidden Subpopulations of Cells.” Nature Biotechnology 33 (2): 155–60.

Mahata, B, X Zhang, AA Kolodziejczyk, V Proserpio, L Haim-Vilmovsky, AE Taylor, D Hebenstreit, et al. 2014. “Single-Cell RNA Sequencing Reveals T Helper Cells Synthesizing Steroids de Novo to Contribute to Immune Homeostasis.” Cell Reports 7 (4): 1130–42.