<!– badges: start –>
   <!– badges: end –>
  <!– badges: end –>
In this article we show some examples of the differences in coding between tidybulk/tidyverse and base R. We noted a decrease > 10x of assignments and a decrease of > 2x of line numbers.
Create tidybulk tibble.
tt = se_mini
Aggregate duplicated transcripts
Tidy transcriptomics
“{.r .yellow}
tt.aggr = tt %>% aggregate_duplicates()
”
Base R
“r
temp = data.frame(
    symbol = dge_list$genes$symbol,
    dge_list$counts
)
dge_list.nr <- by(temp, temp$symbol,
    function(df)
        if(length(df[1,1])>0)
            matrixStats:::colSums(as.matrix(df[,-1]))
)
dge_list.nr <- do.call("rbind", dge_list.nr)
colnames(dge_list.nr) <- colnames(dge_list)
```
Scale counts
Tidy transcriptomics
”r
tt.norm = tt.aggr %>% identify_abundant(factor_of_interest = condition) %>% scale_abundance()
“
Base R
”r
library(edgeR)
dgList <- DGEList(count_m=x,group=group)
keep <- filterByExpr(dgList)
dgList <- dgList[keep,,keep.lib.sizes=FALSE]
[...]
dgList <- calcNormFactors(dgList, method="TMM")
norm_counts.table <- cpm(dgList)
```
Filter variable transcripts
We may want to identify and filter variable transcripts.
Tidy transcriptomics
“r
tt.norm.variable = tt.norm %>% keep_variable()
”
Base R
“r
library(edgeR)
x = norm_counts.table
s <- rowMeans((x-rowMeans(x))^2)
o <- order(s,decreasing=TRUE)
x <- x[o[1L:top],,drop=FALSE]
norm_counts.table = norm_counts.table[rownames(x)]
norm_counts.table$cell_type = tibble_counts[
    match(
        tibble_counts$sample,
        rownames(norm_counts.table)
    ),
    "Cell type"
]
```
Reduce dimensions
Tidy transcriptomics
”r
tt.norm.MDS =
  tt.norm %>%
  reduce_dimensions(method=“MDS”, .dims = 2)
“
Base R
”r
library(limma)
count_m_log = log(count_m + 1)
cmds = limma::plotMDS(ndim = .dims, plot = FALSE)
cmds = cmds %$% 
    cmdscale.out %>%
    setNames(sprintf(“Dim%s”, 1:6))
cmds$cell_type = tibble_counts[
    match(tibble_counts$sample, rownames(cmds)),
    “Cell type”
]
“
PCA
Tidy transcriptomics
”r
tt.norm.PCA =
  tt.norm %>%
  reduce_dimensions(method=“PCA”, .dims = 2)
“
Base R
”r
count_m_log = log(count_m + 1)
pc = count_m_log %>% prcomp(scale = TRUE)
variance = pc$sdev^2
variance = (variance / sum(variance))[1:6]
pc$cell_type = counts[
    match(counts$sample, rownames(pc)),
    “Cell type”
]
“
tSNE
Tidy transcriptomics
”r
tt.norm.tSNE =
    breast_tcga_mini_SE %>%
    tidybulk(       sample, ens, count_scaled) %>%
    identify_abundant() %>%
    reduce_dimensions(
        method = “tSNE”,
        perplexity=10,
        pca_scale =TRUE
    )
“
Base R
”r
count_m_log = log(count_m + 1)
tsne = Rtsne::Rtsne(
    t(count_m_log),
    perplexity=10,
        pca_scale =TRUE
)$Y
tsne$cell_type = tibble_counts[
    match(tibble_counts$sample, rownames(tsne)),
    “Cell type”
]
“
Rotate dimensions
Tidy transcriptomics
”r
tt.norm.MDS.rotated =
  tt.norm.MDS %>%
    rotate_dimensions(Dim1, Dim2, rotation_degrees = 45, action=“get”)
“
Base R
”r
rotation = function(m, d) {
    r = d * pi / 180
    ((bind_rows(
        c(1 = cos®, 2 = -sin®),
        c(1 = sin®, 2 = cos®)
    ) %>% as_matrix) %*% m)
}
mds_r = pca %>% rotation(rotation_degrees)
mds_r$cell_type = counts[
    match(counts$sample, rownames(mds_r)),
    “Cell type”
]
“
Test differential abundance
Tidy transcriptomics
”r
tt.de =
    tt %>%
    test_differential_abundance( ~ condition, action=“get”)
tt.de
“
Base R
”r
library(edgeR)
dgList <- DGEList(counts=counts_m,group=group)
keep <- filterByExpr(dgList)
dgList <- dgList[keep,,keep.lib.sizes=FALSE]
dgList <- calcNormFactors(dgList)
design <- model.matrix(~group)
dgList <- estimateDisp(dgList,design)
fit <- glmQLFit(dgList,design)
qlf <- glmQLFTest(fit,coef=2)
topTags(qlf, n=Inf)
```
Adjust counts
Tidy transcriptomics
“r
tt.norm.adj =
    tt.norm %>% adjust_abundance(   ~ condition + time)
”
Base R
“r
library(sva)
count_m_log = log(count_m + 1)
design =
        model.matrix(
            object = ~ condition + time,
            data = annotation
        )
count_m_log.sva =
    ComBat(
            batch = design[,2],
            mod = design,
            …
        )
count_m_log.sva = ceiling(exp(count_m_log.sva) -1)
count_m_log.sva$cell_type = counts[
    match(counts$sample, rownames(count_m_log.sva)),
    "Cell type”
]
“
Deconvolve Cell type composition
Tidy transcriptomics
”r
tt.cibersort =
    tt %>%
    deconvolve_cellularity(action=“get”, cores=1)
“
Base R
”r
source(‘CIBERSORT.R’)
count_m %>% write.table(“mixture_file.txt”)
results <- CIBERSORT(
    "sig_matrix_file.txt",
    "mixture_file.txt",
    perm=100, QN=TRUE
)
results$cell_type = tibble_counts[
    match(tibble_counts$sample, rownames(results)),
    "Cell type"
]
```
Cluster samples
k-means
Tidy transcriptomics
“r
tt.norm.cluster = tt.norm.MDS %>%
  cluster_elements(method="kmeans”, centers = 2, action=“get” )
“
Base R
”r
count_m_log = log(count_m + 1)
k = kmeans(count_m_log, iter.max = 1000, …)
cluster = k$cluster
cluster$cell_type = tibble_counts[
    match(tibble_counts$sample, rownames(cluster)),
    c(“Cell type”, “Dim1”, “Dim2”)
]
“
SNN
Matrix package (v1.3-3) causes an error with Seurat::FindNeighbors used in this method. We are trying to solve this issue. At the moment this option in unaviable.
Tidy transcriptomics
”r
tt.norm.SNN =
    tt.norm.tSNE %>%
    cluster_elements(method = “SNN”)
“
Base R
”r
library(Seurat)
snn = CreateSeuratObject(count_m)
snn = ScaleData(
    snn, display.progress = TRUE,
    num.cores=4, do.par = TRUE
)
snn = FindVariableFeatures(snn, selection.method = “vst”)
snn = FindVariableFeatures(snn, selection.method = “vst”)
snn = RunPCA(snn, npcs = 30)
snn = FindNeighbors(snn)
snn = FindClusters(snn, method = “igraph”, …)
snn = snn[[“seurat_clusters”]]
snn$cell_type = tibble_counts[
    match(tibble_counts$sample, rownames(snn)),
    c(“Cell type”, “Dim1”, “Dim2”)
]
“
Drop redundant transcripts
Tidy transcriptomics
”r
tt.norm.non_redundant =
    tt.norm.MDS %>%
  remove_redundancy(    method = “correlation” )
“
Base R
”r
library(widyr)
.data.correlated =
    pairwise_cor(
        counts,
        sample,
        transcript,
        rc,
        sort = TRUE,
        diag = FALSE,
        upper = FALSE
    ) %>%
    filter(correlation > correlation_threshold) %>%
    distinct(item1) %>%
    rename(!!.element := item1)
# Return non redundant data frame
counts %>% anti_join(.data.correlated) %>%
    spread(sample, rc, - transcript) %>%
    left_join(annotation)
“
Draw heatmap
tidytranscriptomics
”r
tt.norm.MDS %>%
  # filter lowly abundant
  keep_abundant() %>%
  # extract 500 most variable genes
  keep_variable( .abundance = count_scaled, top = 500) %>%
  # create heatmap
  heatmap(sample, transcript, count_scaled, transform = log1p) %>%
    add_tile(Cell type) 
“
Base R
”r
# Example taken from airway dataset from BioC2020 workshop. 
dgList <- SE2DGEList(airway)
group <- factor(dgList$samples$`Cell type`)
keep.exprs <- filterByExpr(dgList, group=group)
dgList <- dgList[keep.exprs,, keep.lib.sizes=FALSE]
dgList <- calcNormFactors(dgList)
logcounts <- cpm(dgList, log=TRUE)
var_genes <- apply(logcounts, 1, var)
select_var <- names(sort(var_genes, decreasing=TRUE))[1:500]
highly_variable_lcpm <- logcounts[select_var,]
colours <- c("#440154FF", "#21908CFF", "#fefada" )
col.group <- c("red","grey")[group]
gplots::heatmap.2(highly_variable_lcpm, col=colours, trace="none", ColSideColors=col.group, scale="row")
```
Draw density plot
tidytranscriptomics
“r
# Example taken from airway dataset from BioC2020 workshop. 
airway %>%
    tidybulk() %>%
      identify_abundant() %>%
    scale_abundance() %>%
    pivot_longer(cols = starts_with("counts”), names_to = “source”, values_to = “abundance”) %>%
    filter(!lowly_abundant) %>%
    ggplot(aes(x=abundance + 1, color=sample)) +
    geom_density() +
    facet_wrap(~source) +
    scale_x_log10() 
“
Base R
”r
# Example taken from airway dataset from BioC2020 workshop. 
dgList <- SE2DGEList(airway)
group <- factor(dgList$samples$dex)
keep.exprs <- filterByExpr(dgList, group=group)
dgList <- dgList[keep.exprs,, keep.lib.sizes=FALSE]
dgList <- calcNormFactors(dgList)
logcounts <- cpm(dgList, log=TRUE)
var_genes <- apply(logcounts, 1, var)
select_var <- names(sort(var_genes, decreasing=TRUE))[1:500]
highly_variable_lcpm <- logcounts[select_var,]
colours <- c("#440154FF", "#21908CFF", "#fefada" )
col.group <- c("red","grey")[group]
gplots::heatmap.2(highly_variable_lcpm, col=colours, trace="none", ColSideColors=col.group, scale="row")
```
Appendix
sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
## 
## Matrix products: default
## BLAS:   /home/biocbuild/bbs-3.13-bioc/R/lib/libRblas.so
## LAPACK: /home/biocbuild/bbs-3.13-bioc/R/lib/libRlapack.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB              LC_COLLATE=C              
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] tidySummarizedExperiment_1.2.0 SummarizedExperiment_1.22.0   
##  [3] Biobase_2.52.0                 GenomicRanges_1.44.0          
##  [5] GenomeInfoDb_1.28.0            IRanges_2.26.0                
##  [7] S4Vectors_0.30.0               BiocGenerics_0.38.0           
##  [9] MatrixGenerics_1.4.0           matrixStats_0.58.0            
## [11] tidybulk_1.4.0                 ggrepel_0.9.1                 
## [13] ggplot2_3.3.3                  magrittr_2.0.1                
## [15] tibble_3.1.2                   tidyr_1.1.3                   
## [17] dplyr_1.0.6                    knitr_1.33                    
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-152           bitops_1.0-7           bit64_4.0.5           
##  [4] httr_1.4.2             SnowballC_0.7.0        backports_1.2.1       
##  [7] tools_4.1.0            utf8_1.2.1             R6_2.5.0              
## [10] mgcv_1.8-35            DBI_1.1.1              lazyeval_0.2.2        
## [13] colorspace_2.0-1       withr_2.4.2            tidyselect_1.1.1      
## [16] bit_4.0.4              compiler_4.1.0         preprocessCore_1.54.0 
## [19] cli_2.5.0              DelayedArray_0.18.0    plotly_4.9.3          
## [22] scales_1.1.1           readr_1.4.0            genefilter_1.74.0     
## [25] stringr_1.4.0          digest_0.6.27          XVector_0.32.0        
## [28] pkgconfig_2.0.3        htmltools_0.5.1.1      fastmap_1.1.0         
## [31] limma_3.48.0           htmlwidgets_1.5.3      rlang_0.4.11          
## [34] rstudioapi_0.13        RSQLite_2.2.7          generics_0.1.0        
## [37] jsonlite_1.7.2         BiocParallel_1.26.0    tokenizers_0.2.1      
## [40] RCurl_1.98-1.3         GenomeInfoDbData_1.2.6 Matrix_1.3-3          
## [43] Rcpp_1.0.6             munsell_0.5.0          fansi_0.4.2           
## [46] lifecycle_1.0.0        stringi_1.6.2          edgeR_3.34.0          
## [49] zlibbioc_1.38.0        plyr_1.8.6             Rtsne_0.15            
## [52] grid_4.1.0             blob_1.2.1             crayon_1.4.1          
## [55] lattice_0.20-44        Biostrings_2.60.0      splines_4.1.0         
## [58] annotate_1.70.0        hms_1.1.0              KEGGREST_1.32.0       
## [61] locfit_1.5-9.4         ps_1.6.0               pillar_1.6.1          
## [64] widyr_0.1.3            reshape2_1.4.4         codetools_0.2-18      
## [67] XML_3.99-0.6           glue_1.4.2             evaluate_0.14         
## [70] tidytext_0.3.1         data.table_1.14.0      vctrs_0.3.8           
## [73] png_0.1-7              gtable_0.3.0           purrr_0.3.4           
## [76] assertthat_0.2.1       cachem_1.0.5           xfun_0.23             
## [79] broom_0.7.6            xtable_1.8-4           janeaustenr_0.1.5     
## [82] survival_3.2-11        viridisLite_0.4.0      AnnotationDbi_1.54.0  
## [85] memoise_2.0.0          sva_3.40.0             ellipsis_0.3.2