Create <code>tidybulk</code> tibble.

<!– badges: start –> <!– badges: end –>

In this article we show some examples of the differences in coding between tidybulk/tidyverse and base R. We noted a decrease > 10x of assignments and a decrease of > 2x of line numbers.

Create `tidybulk` tibble.

tt = se_mini

Aggregate duplicated `transcripts`

Tidy transcriptomics “{.r .yellow} tt.aggr = tt %>% aggregate_duplicates() ”

Base R “r temp = data.frame( symbol = dge_list$genes$symbol, dge_list$counts ) dge_list.nr <- by(temp, temp$symbol, function(df) if(length(df[1,1])>0) matrixStats:::colSums(as.matrix(df[,-1])) ) dge_list.nr <- do.call("rbind", dge_list.nr) colnames(dge_list.nr) <- colnames(dge_list) ```

Scale `counts`

Tidy transcriptomics ”r tt.norm = tt.aggr %>% identify_abundant(factor_of_interest = condition) %>% scale_abundance() “

Base R ”r library(edgeR) dgList <- DGEList(count_m=x,group=group) keep <- filterByExpr(dgList) dgList <- dgList[keep,,keep.lib.sizes=FALSE] [...] dgList <- calcNormFactors(dgList, method="TMM") norm_counts.table <- cpm(dgList) ```

Filter `variable transcripts`

We may want to identify and filter variable transcripts.

Tidy transcriptomics “r tt.norm.variable = tt.norm %>% keep_variable() ”

Base R “r library(edgeR) x = norm_counts.table s <- rowMeans((x-rowMeans(x))^2) o <- order(s,decreasing=TRUE) x <- x[o[1L:top],,drop=FALSE] norm_counts.table = norm_counts.table[rownames(x)] norm_counts.table$cell_type = tibble_counts[ match( tibble_counts$sample, rownames(norm_counts.table) ), "Cell type" ] ```

Reduce `dimensions`

Tidy transcriptomics ”r tt.norm.MDS = tt.norm %>% reduce_dimensions(method=“MDS”, .dims = 2) “

Base R ”r library(limma) count_m_log = log(count_m + 1) cmds = limma::plotMDS(ndim = .dims, plot = FALSE) cmds = cmds %$% cmdscale.out %>% setNames(sprintf(“Dim%s”, 1:6)) cmds$cell_type = tibble_counts[ match(tibble_counts$sample, rownames(cmds)), “Cell type” ] “

PCA

Tidy transcriptomics ”r tt.norm.PCA = tt.norm %>% reduce_dimensions(method=“PCA”, .dims = 2) “

Base R ”r count_m_log = log(count_m + 1) pc = count_m_log %>% prcomp(scale = TRUE) variance = pc$sdev^2 variance = (variance / sum(variance))[1:6] pc$cell_type = counts[ match(counts$sample, rownames(pc)), “Cell type” ] “

tSNE

Tidy transcriptomics ”r tt.norm.tSNE = breast_tcga_mini_SE %>% tidybulk( sample, ens, count_scaled) %>% identify_abundant() %>% reduce_dimensions( method = “tSNE”, perplexity=10, pca_scale =TRUE ) “

Base R ”r count_m_log = log(count_m + 1) tsne = Rtsne::Rtsne( t(count_m_log), perplexity=10, pca_scale =TRUE )$Y tsne$cell_type = tibble_counts[ match(tibble_counts$sample, rownames(tsne)), “Cell type” ] “

Rotate `dimensions`

Tidy transcriptomics ”r tt.norm.MDS.rotated = tt.norm.MDS %>% rotate_dimensions(Dim1, Dim2, rotation_degrees = 45, action=“get”) “

Base R ”r rotation = function(m, d) { r = d * pi / 180 ((bind_rows( c(1 = cos®, 2 = -sin®), c(1 = sin®, 2 = cos®) ) %>% as_matrix) %*% m) } mds_r = pca %>% rotation(rotation_degrees) mds_r$cell_type = counts[ match(counts$sample, rownames(mds_r)), “Cell type” ] “

Test `differential abundance`

Tidy transcriptomics ”r tt.de = tt %>% test_differential_abundance( ~ condition, action=“get”) tt.de “

Base R ”r library(edgeR) dgList <- DGEList(counts=counts_m,group=group) keep <- filterByExpr(dgList) dgList <- dgList[keep,,keep.lib.sizes=FALSE] dgList <- calcNormFactors(dgList) design <- model.matrix(~group) dgList <- estimateDisp(dgList,design) fit <- glmQLFit(dgList,design) qlf <- glmQLFTest(fit,coef=2) topTags(qlf, n=Inf) ```

Adjust `counts`

Tidy transcriptomics “r tt.norm.adj = tt.norm %>% adjust_abundance( ~ condition + time) ”

Base R “r library(sva) count_m_log = log(count_m + 1) design = model.matrix( object = ~ condition + time, data = annotation ) count_m_log.sva = ComBat( batch = design[,2], mod = design, … ) count_m_log.sva = ceiling(exp(count_m_log.sva) -1) count_m_log.sva$cell_type = counts[ match(counts$sample, rownames(count_m_log.sva)), "Cell type” ] “

Deconvolve `Cell type composition`

Tidy transcriptomics ”r tt.cibersort = tt %>% deconvolve_cellularity(action=“get”, cores=1) “

Base R ”r source(‘CIBERSORT.R’) count_m %>% write.table(“mixture_file.txt”) results <- CIBERSORT( "sig_matrix_file.txt", "mixture_file.txt", perm=100, QN=TRUE ) results$cell_type = tibble_counts[ match(tibble_counts$sample, rownames(results)), "Cell type" ] ```

Cluster `samples`

k-means

Tidy transcriptomics “r tt.norm.cluster = tt.norm.MDS %>% cluster_elements(method="kmeans”, centers = 2, action=“get” ) “

Base R ”r count_m_log = log(count_m + 1) k = kmeans(count_m_log, iter.max = 1000, …) cluster = k$cluster cluster$cell_type = tibble_counts[ match(tibble_counts$sample, rownames(cluster)), c(“Cell type”, “Dim1”, “Dim2”) ] “

SNN

Matrix package (v1.3-3) causes an error with Seurat::FindNeighbors used in this method. We are trying to solve this issue. At the moment this option in unaviable.

Tidy transcriptomics ”r tt.norm.SNN = tt.norm.tSNE %>% cluster_elements(method = “SNN”) “

Base R ”r library(Seurat) snn = CreateSeuratObject(count_m) snn = ScaleData( snn, display.progress = TRUE, num.cores=4, do.par = TRUE ) snn = FindVariableFeatures(snn, selection.method = “vst”) snn = FindVariableFeatures(snn, selection.method = “vst”) snn = RunPCA(snn, npcs = 30) snn = FindNeighbors(snn) snn = FindClusters(snn, method = “igraph”, …) snn = snn[[“seurat_clusters”]] snn$cell_type = tibble_counts[ match(tibble_counts$sample, rownames(snn)), c(“Cell type”, “Dim1”, “Dim2”) ] “

Drop `redundant` transcripts

Tidy transcriptomics ”r tt.norm.non_redundant = tt.norm.MDS %>% remove_redundancy( method = “correlation” ) “

Base R ”r library(widyr) .data.correlated = pairwise_cor( counts, sample, transcript, rc, sort = TRUE, diag = FALSE, upper = FALSE ) %>% filter(correlation > correlation_threshold) %>% distinct(item1) %>% rename(!!.element := item1) # Return non redundant data frame counts %>% anti_join(.data.correlated) %>% spread(sample, rc, - transcript) %>% left_join(annotation) “

Draw `heatmap`

tidytranscriptomics ”r tt.norm.MDS %>% # filter lowly abundant keep_abundant() %>% # extract 500 most variable genes keep_variable( .abundance = count_scaled, top = 500) %>% # create heatmap heatmap(sample, transcript, count_scaled, transform = log1p) %>% add_tile(Cell type) “

Base R ”r # Example taken from airway dataset from BioC2020 workshop. dgList <- SE2DGEList(airway) group <- factor(dgList$samples$`Cell type`) keep.exprs <- filterByExpr(dgList, group=group) dgList <- dgList[keep.exprs,, keep.lib.sizes=FALSE] dgList <- calcNormFactors(dgList) logcounts <- cpm(dgList, log=TRUE) var_genes <- apply(logcounts, 1, var) select_var <- names(sort(var_genes, decreasing=TRUE))[1:500] highly_variable_lcpm <- logcounts[select_var,] colours <- c("#440154FF", "#21908CFF", "#fefada" ) col.group <- c("red","grey")[group] gplots::heatmap.2(highly_variable_lcpm, col=colours, trace="none", ColSideColors=col.group, scale="row") ```

Draw `density plot`

tidytranscriptomics “r # Example taken from airway dataset from BioC2020 workshop. airway %>% tidybulk() %>% identify_abundant() %>% scale_abundance() %>% pivot_longer(cols = starts_with("counts”), names_to = “source”, values_to = “abundance”) %>% filter(!lowly_abundant) %>% ggplot(aes(x=abundance + 1, color=sample)) + geom_density() + facet_wrap(~source) + scale_x_log10() “

Base R ”r # Example taken from airway dataset from BioC2020 workshop. dgList <- SE2DGEList(airway) group <- factor(dgList$samples$dex) keep.exprs <- filterByExpr(dgList, group=group) dgList <- dgList[keep.exprs,, keep.lib.sizes=FALSE] dgList <- calcNormFactors(dgList) logcounts <- cpm(dgList, log=TRUE) var_genes <- apply(logcounts, 1, var) select_var <- names(sort(var_genes, decreasing=TRUE))[1:500] highly_variable_lcpm <- logcounts[select_var,] colours <- c("#440154FF", "#21908CFF", "#fefada" ) col.group <- c("red","grey")[group] gplots::heatmap.2(highly_variable_lcpm, col=colours, trace="none", ColSideColors=col.group, scale="row") ```

Appendix

sessionInfo()

## R version 4.1.0 (2021-05-18)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
## 
## Matrix products: default
## BLAS:   /home/biocbuild/bbs-3.13-bioc/R/lib/libRblas.so
## LAPACK: /home/biocbuild/bbs-3.13-bioc/R/lib/libRlapack.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB              LC_COLLATE=C              
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] tidySummarizedExperiment_1.2.0 SummarizedExperiment_1.22.0   
##  [3] Biobase_2.52.0                 GenomicRanges_1.44.0          
##  [5] GenomeInfoDb_1.28.0            IRanges_2.26.0                
##  [7] S4Vectors_0.30.0               BiocGenerics_0.38.0           
##  [9] MatrixGenerics_1.4.0           matrixStats_0.58.0            
## [11] tidybulk_1.4.0                 ggrepel_0.9.1                 
## [13] ggplot2_3.3.3                  magrittr_2.0.1                
## [15] tibble_3.1.2                   tidyr_1.1.3                   
## [17] dplyr_1.0.6                    knitr_1.33                    
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-152           bitops_1.0-7           bit64_4.0.5           
##  [4] httr_1.4.2             SnowballC_0.7.0        backports_1.2.1       
##  [7] tools_4.1.0            utf8_1.2.1             R6_2.5.0              
## [10] mgcv_1.8-35            DBI_1.1.1              lazyeval_0.2.2        
## [13] colorspace_2.0-1       withr_2.4.2            tidyselect_1.1.1      
## [16] bit_4.0.4              compiler_4.1.0         preprocessCore_1.54.0 
## [19] cli_2.5.0              DelayedArray_0.18.0    plotly_4.9.3          
## [22] scales_1.1.1           readr_1.4.0            genefilter_1.74.0     
## [25] stringr_1.4.0          digest_0.6.27          XVector_0.32.0        
## [28] pkgconfig_2.0.3        htmltools_0.5.1.1      fastmap_1.1.0         
## [31] limma_3.48.0           htmlwidgets_1.5.3      rlang_0.4.11          
## [34] rstudioapi_0.13        RSQLite_2.2.7          generics_0.1.0        
## [37] jsonlite_1.7.2         BiocParallel_1.26.0    tokenizers_0.2.1      
## [40] RCurl_1.98-1.3         GenomeInfoDbData_1.2.6 Matrix_1.3-3          
## [43] Rcpp_1.0.6             munsell_0.5.0          fansi_0.4.2           
## [46] lifecycle_1.0.0        stringi_1.6.2          edgeR_3.34.0          
## [49] zlibbioc_1.38.0        plyr_1.8.6             Rtsne_0.15            
## [52] grid_4.1.0             blob_1.2.1             crayon_1.4.1          
## [55] lattice_0.20-44        Biostrings_2.60.0      splines_4.1.0         
## [58] annotate_1.70.0        hms_1.1.0              KEGGREST_1.32.0       
## [61] locfit_1.5-9.4         ps_1.6.0               pillar_1.6.1          
## [64] widyr_0.1.3            reshape2_1.4.4         codetools_0.2-18      
## [67] XML_3.99-0.6           glue_1.4.2             evaluate_0.14         
## [70] tidytext_0.3.1         data.table_1.14.0      vctrs_0.3.8           
## [73] png_0.1-7              gtable_0.3.0           purrr_0.3.4           
## [76] assertthat_0.2.1       cachem_1.0.5           xfun_0.23             
## [79] broom_0.7.6            xtable_1.8-4           janeaustenr_0.1.5     
## [82] survival_3.2-11        viridisLite_0.4.0      AnnotationDbi_1.54.0  
## [85] memoise_2.0.0          sva_3.40.0             ellipsis_0.3.2

Create tidybulk tibble.

Aggregate duplicated transcripts

Scale counts

Filter variable transcripts

Reduce dimensions

PCA

tSNE

Rotate dimensions

Test differential abundance

Adjust counts

Deconvolve Cell type composition

Cluster samples