<!– badges: start –> Lifecycle:maturing <!– badges: end –>

In this article we show some examples of the differences in coding between tidybulk/tidyverse and base R. We noted a decrease > 10x of assignments and a decrease of > 2x of line numbers.

Create tidybulk tibble.

tt = counts_mini %>% tidybulk(sample, transcript, count)

Aggregate duplicated transcripts

Tidy transcriptomics “{.r .yellow} tt.aggr = tt %>% aggregate_duplicates() ”
Base R “r temp = data.frame( symbol = dge_list$genes$symbol, dge_list$counts ) dge_list.nr <- by(temp, temp$symbol, function(df) if(length(df[1,1])>0) matrixStats:::colSums(as.matrix(df[,-1])) ) dge_list.nr <- do.call("rbind", dge_list.nr) colnames(dge_list.nr) <- colnames(dge_list) ```

Scale counts

Tidy transcriptomics ”r tt.norm = tt.aggr %>% identify_abundant(factor_of_interest = condition) %>% scale_abundance() “
Base R ”r library(edgeR) dgList <- DGEList(count_m=x,group=group) keep <- filterByExpr(dgList) dgList <- dgList[keep,,keep.lib.sizes=FALSE] [...] dgList <- calcNormFactors(dgList, method="TMM") norm_counts.table <- cpm(dgList) ```

Filter variable transcripts

We may want to identify and filter variable transcripts.

Tidy transcriptomics “r tt.norm.variable = tt.norm %>% keep_variable() ”
Base R “r library(edgeR) x = norm_counts.table s <- rowMeans((x-rowMeans(x))^2) o <- order(s,decreasing=TRUE) x <- x[o[1L:top],,drop=FALSE] norm_counts.table = norm_counts.table[rownames(x)] norm_counts.table$cell_type = tidybulk::counts[ match( tidybulk::counts$sample, rownames(norm_counts.table) ), "Cell type" ] ```

Reduce dimensions

Tidy transcriptomics ”r tt.norm.MDS = tt.norm %>% reduce_dimensions(method=“MDS”, .dims = 2) “
Base R ”r library(limma) count_m_log = log(count_m + 1) cmds = limma::plotMDS(ndim = .dims, plot = FALSE) cmds = cmds %$% cmdscale.out %>% setNames(sprintf(“Dim%s”, 1:6)) cmds$cell_type = tidybulk::counts[ match(tidybulk::counts$sample, rownames(cmds)), “Cell type” ] “

PCA

Tidy transcriptomics ”r tt.norm.PCA = tt.norm %>% reduce_dimensions(method=“PCA”, .dims = 2) “
Base R ”r count_m_log = log(count_m + 1) pc = count_m_log %>% prcomp(scale = TRUE) variance = pc$sdev^2 variance = (variance / sum(variance))[1:6] pc$cell_type = counts[ match(counts$sample, rownames(pc)), “Cell type” ] “

tSNE

Tidy transcriptomics ”r tt.norm.tSNE = breast_tcga_mini %>% tidybulk( sample, ens, count_scaled) %>% identify_abundant() %>% reduce_dimensions( method = “tSNE”, perplexity=10, pca_scale =TRUE ) “
Base R ”r count_m_log = log(count_m + 1) tsne = Rtsne::Rtsne( t(count_m_log), perplexity=10, pca_scale =TRUE )$Y tsne$cell_type = tidybulk::counts[ match(tidybulk::counts$sample, rownames(tsne)), “Cell type” ] “

Rotate dimensions

Tidy transcriptomics ”r tt.norm.MDS.rotated = tt.norm.MDS %>% rotate_dimensions(Dim1, Dim2, rotation_degrees = 45, action=“get”) “
Base R ”r rotation = function(m, d) { r = d * pi / 180 ((bind_rows( c(1 = cos®, 2 = -sin®), c(1 = sin®, 2 = cos®) ) %>% as_matrix) %*% m) } mds_r = pca %>% rotation(rotation_degrees) mds_r$cell_type = counts[ match(counts$sample, rownames(mds_r)), “Cell type” ] “

Test differential abundance

Tidy transcriptomics ”r tt.de = tt %>% test_differential_abundance( ~ condition, action=“get”) tt.de “
Base R ”r library(edgeR) dgList <- DGEList(counts=counts_m,group=group) keep <- filterByExpr(dgList) dgList <- dgList[keep,,keep.lib.sizes=FALSE] dgList <- calcNormFactors(dgList) design <- model.matrix(~group) dgList <- estimateDisp(dgList,design) fit <- glmQLFit(dgList,design) qlf <- glmQLFTest(fit,coef=2) topTags(qlf, n=Inf) ```

Adjust counts

Tidy transcriptomics “r tt.norm.adj = tt.norm %>% adjust_abundance( ~ condition + time) ”
Base R “r library(sva) count_m_log = log(count_m + 1) design = model.matrix( object = ~ condition + time, data = annotation ) count_m_log.sva = ComBat( batch = design[,2], mod = design, … ) count_m_log.sva = ceiling(exp(count_m_log.sva) -1) count_m_log.sva$cell_type = counts[ match(counts$sample, rownames(count_m_log.sva)), "Cell type” ] “

Deconvolve Cell type composition

Tidy transcriptomics ”r tt.cibersort = tt %>% deconvolve_cellularity(action=“get”, cores=1) “
Base R ”r source(‘CIBERSORT.R’) count_m %>% write.table(“mixture_file.txt”) results <- CIBERSORT( "sig_matrix_file.txt", "mixture_file.txt", perm=100, QN=TRUE ) results$cell_type = tidybulk::counts[ match(tidybulk::counts$sample, rownames(results)), "Cell type" ] ```

Cluster samples

k-means

Tidy transcriptomics “r tt.norm.cluster = tt.norm.MDS %>% cluster_elements(method="kmeans”, centers = 2, action=“get” ) “
Base R ”r count_m_log = log(count_m + 1) k = kmeans(count_m_log, iter.max = 1000, …) cluster = k$cluster cluster$cell_type = tidybulk::counts[ match(tidybulk::counts$sample, rownames(cluster)), c(“Cell type”, “Dim1”, “Dim2”) ] “

SNN

Tidy transcriptomics ”r tt.norm.SNN = tt.norm.tSNE %>% cluster_elements(method = “SNN”) “
Base R ”r library(Seurat) snn = CreateSeuratObject(count_m) snn = ScaleData( snn, display.progress = TRUE, num.cores=4, do.par = TRUE ) snn = FindVariableFeatures(snn, selection.method = “vst”) snn = FindVariableFeatures(snn, selection.method = “vst”) snn = RunPCA(snn, npcs = 30) snn = FindNeighbors(snn) snn = FindClusters(snn, method = “igraph”, …) snn = snn[[“seurat_clusters”]] snn$cell_type = tidybulk::counts[ match(tidybulk::counts$sample, rownames(snn)), c(“Cell type”, “Dim1”, “Dim2”) ] “

Drop redundant transcripts

Tidy transcriptomics ”r tt.norm.non_redundant = tt.norm.MDS %>% remove_redundancy( method = “correlation” ) “
Base R ”r library(widyr) .data.correlated = pairwise_cor( counts, sample, transcript, rc, sort = TRUE, diag = FALSE, upper = FALSE ) %>% filter(correlation > correlation_threshold) %>% distinct(item1) %>% rename(!!.element := item1) # Return non redundant data frame counts %>% anti_join(.data.correlated) %>% spread(sample, rc, - transcript) %>% left_join(annotation) “

Draw heatmap

tidytranscriptomics ”r tt.norm.MDS %>% # filter lowly abundant keep_abundant() %>% # extract 500 most variable genes keep_variable( .abundance = count_scaled, top = 500) %>% # create heatmap heatmap(sample, transcript, count_scaled, transform = log1p) %>% add_tile(Cell type) “
Base R ”r # Example taken from airway dataset from BioC2020 workshop. dgList <- SE2DGEList(airway) group <- factor(dgList$samples$`Cell type`) keep.exprs <- filterByExpr(dgList, group=group) dgList <- dgList[keep.exprs,, keep.lib.sizes=FALSE] dgList <- calcNormFactors(dgList) logcounts <- cpm(dgList, log=TRUE) var_genes <- apply(logcounts, 1, var) select_var <- names(sort(var_genes, decreasing=TRUE))[1:500] highly_variable_lcpm <- logcounts[select_var,] colours <- c("#440154FF", "#21908CFF", "#fefada" ) col.group <- c("red","grey")[group] gplots::heatmap.2(highly_variable_lcpm, col=colours, trace="none", ColSideColors=col.group, scale="row") ```

Draw density plot

tidytranscriptomics “r # Example taken from airway dataset from BioC2020 workshop. airway %>% tidybulk() %>% identify_abundant() %>% scale_abundance() %>% pivot_longer(cols = starts_with("counts”), names_to = “source”, values_to = “abundance”) %>% filter(!lowly_abundant) %>% ggplot(aes(x=abundance + 1, color=sample)) + geom_density() + facet_wrap(~source) + scale_x_log10() “
Base R ”r # Example taken from airway dataset from BioC2020 workshop. dgList <- SE2DGEList(airway) group <- factor(dgList$samples$dex) keep.exprs <- filterByExpr(dgList, group=group) dgList <- dgList[keep.exprs,, keep.lib.sizes=FALSE] dgList <- calcNormFactors(dgList) logcounts <- cpm(dgList, log=TRUE) var_genes <- apply(logcounts, 1, var) select_var <- names(sort(var_genes, decreasing=TRUE))[1:500] highly_variable_lcpm <- logcounts[select_var,] colours <- c("#440154FF", "#21908CFF", "#fefada" ) col.group <- c("red","grey")[group] gplots::heatmap.2(highly_variable_lcpm, col=colours, trace="none", ColSideColors=col.group, scale="row") ```

Appendix

sessionInfo()
## R Under development (unstable) (2020-10-17 r79346)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.1 LTS
## 
## Matrix products: default
## BLAS:   /home/biocbuild/bbs-3.13-bioc/R/lib/libRblas.so
## LAPACK: /home/biocbuild/bbs-3.13-bioc/R/lib/libRlapack.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=C              
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] tidybulk_1.3.0 ggrepel_0.8.2  ggplot2_3.3.2  magrittr_1.5   tibble_3.0.4  
## [6] tidyr_1.1.2    dplyr_1.0.2    knitr_1.30    
## 
## loaded via a namespace (and not attached):
##   [1] backports_1.1.10            tidytext_0.2.6             
##   [3] plyr_1.8.6                  igraph_1.2.6               
##   [5] lazyeval_0.2.2              splines_4.1.0              
##   [7] BiocParallel_1.25.0         listenv_0.8.0              
##   [9] SnowballC_0.7.0             GenomeInfoDb_1.27.0        
##  [11] sva_3.39.0                  digest_0.6.27              
##  [13] htmltools_0.5.0             fansi_0.4.1                
##  [15] memoise_1.1.0               tensor_1.5                 
##  [17] cluster_2.1.0               ROCR_1.0-11                
##  [19] limma_3.47.0                globals_0.13.1             
##  [21] readr_1.4.0                 annotate_1.69.0            
##  [23] matrixStats_0.57.0          colorspace_1.4-1           
##  [25] blob_1.2.1                  rappdirs_0.3.1             
##  [27] xfun_0.18                   crayon_1.3.4               
##  [29] RCurl_1.98-1.2              jsonlite_1.7.1             
##  [31] genefilter_1.73.0           spatstat_1.64-1            
##  [33] spatstat.data_1.4-3         survival_3.2-7             
##  [35] zoo_1.8-8                   glue_1.4.2                 
##  [37] polyclip_1.10-0             gtable_0.3.0               
##  [39] zlibbioc_1.37.0             XVector_0.31.0             
##  [41] leiden_0.3.3                DelayedArray_0.17.0        
##  [43] future.apply_1.6.0          BiocGenerics_0.37.0        
##  [45] abind_1.4-5                 scales_1.1.1               
##  [47] DBI_1.1.0                   edgeR_3.33.0               
##  [49] miniUI_0.1.1.1              Rcpp_1.0.5                 
##  [51] widyr_0.1.3                 viridisLite_0.3.0          
##  [53] xtable_1.8-4                reticulate_1.18            
##  [55] bit_4.0.4                   rsvd_1.0.3                 
##  [57] preprocessCore_1.53.0       stats4_4.1.0               
##  [59] htmlwidgets_1.5.2           httr_1.4.2                 
##  [61] RColorBrewer_1.1-2          ellipsis_0.3.1             
##  [63] Seurat_3.2.2                ica_1.0-2                  
##  [65] pkgconfig_2.0.3             XML_3.99-0.5               
##  [67] uwot_0.1.8                  deldir_0.1-29              
##  [69] locfit_1.5-9.4              utf8_1.1.4                 
##  [71] tidyselect_1.1.0            rlang_0.4.8                
##  [73] reshape2_1.4.4              later_1.1.0.1              
##  [75] AnnotationDbi_1.53.0        munsell_0.5.0              
##  [77] tools_4.1.0                 cli_2.1.0                  
##  [79] generics_0.0.2              RSQLite_2.2.1              
##  [81] broom_0.7.2                 ggridges_0.5.2             
##  [83] evaluate_0.14               stringr_1.4.0              
##  [85] fastmap_1.0.1               goftest_1.2-2              
##  [87] bit64_4.0.5                 fitdistrplus_1.1-1         
##  [89] purrr_0.3.4                 RANN_2.6.1                 
##  [91] pbapply_1.4-3               future_1.19.1              
##  [93] nlme_3.1-150                mime_0.9                   
##  [95] tokenizers_0.2.1            compiler_4.1.0             
##  [97] plotly_4.9.2.1              png_0.1-7                  
##  [99] e1071_1.7-4                 spatstat.utils_1.17-0      
## [101] stringi_1.5.3               lattice_0.20-41            
## [103] Matrix_1.2-18               vctrs_0.3.4                
## [105] pillar_1.4.6                lifecycle_0.2.0            
## [107] lmtest_0.9-38               RcppAnnoy_0.0.16           
## [109] data.table_1.13.2           cowplot_1.1.0              
## [111] bitops_1.0-6                irlba_2.3.3                
## [113] httpuv_1.5.4                patchwork_1.0.1            
## [115] GenomicRanges_1.43.0        R6_2.4.1                   
## [117] promises_1.1.1              KernSmooth_2.23-17         
## [119] gridExtra_2.3               janeaustenr_0.1.5          
## [121] IRanges_2.25.0              codetools_0.2-16           
## [123] MASS_7.3-53                 assertthat_0.2.1           
## [125] SummarizedExperiment_1.21.0 withr_2.3.0                
## [127] sctransform_0.3.1           S4Vectors_0.29.0           
## [129] GenomeInfoDbData_1.2.4      mgcv_1.8-33                
## [131] parallel_4.1.0              hms_0.5.3                  
## [133] grid_4.1.0                  rpart_4.1-15               
## [135] class_7.3-17                MatrixGenerics_1.3.0       
## [137] Rtsne_0.15                  Biobase_2.51.0             
## [139] shiny_1.5.0