library(scater)
library(ggplot2)
library(BiocParallel)
library(blase)
library(limma)
library(ggVennDiagram)
RNGversion("3.5.0")
#> Warning in RNGkind("Mersenne-Twister", "Inversion", "Rounding"): non-uniform
#> 'Rounding' sampler used
SEED <- 7
set.seed(SEED)
N_CORES <- 2
bpparam <- MulticoreParam(N_CORES)
This article will show how BLASE can be used for excluding the effect of developmental differences when analysing bulk RNA-seq data. We make use of scRNA-seq Dogga et al. 2024 and bulk RNA-seq data from Zhang et al. 2021. Code for generating the objects used here is available in the BLASE reproducibility repository.
First we’ll load in the data we’re using, pre-prepared from the BLASE reproducibility repository.
data(zhang_2021_heat_shock_bulk, package = "blase")
data(MCA_PF_SCE, package = "blase")
We can examine the true lifecycle stages, and also the calculated pseudotime trajectory (Slingshot).
plotUMAP(MCA_PF_SCE, colour_by = "STAGE_HR")
#| fig.alt: >
#| UMAP of Dogga et al. single-cell RNA-seq reference coloured by pseudotime,
#| starting with Rings, and ending with Schizonts.
plotUMAP(MCA_PF_SCE, colour_by = "slingPseudotime_1")
## Prepare BLASE
Now we’ll prepare BLASE for use.
First, we create the object, giving it the number of bins we want to use, and how to calculate them.
blase_data <- as.BlaseData(
MCA_PF_SCE,
pseudotime_slot = "slingPseudotime_1",
n_bins = 8,
split_by = "pseudotime_range"
)
# Add these bins to the sc metadata
MCA_PF_SCE <- assign_pseudotime_bins(MCA_PF_SCE,
pseudotime_slot = "slingPseudotime_1",
n_bins = 8,
split_by = "pseudotime_range"
)
Now we will select the genes we want to use, using BLASE’s gene peakedness metric.
gene_peakedness_info <- calculate_gene_peakedness(
MCA_PF_SCE,
window_pct = 5,
knots = 18,
BPPARAM = bpparam
)
genes_to_use <- gene_peakedness_spread_selection(
MCA_PF_SCE,
gene_peakedness_info,
genes_per_bin = 30,
n_gene_bins = 30
)
head(gene_peakedness_info)
#> gene peak_pseudotime mean_in_window mean_out_window ratio
#> 28 PF3D7-1401100 3.8311312 5.793085 1.528934 3.788969
#> 44 PF3D7-1401200 6.0203490 4.791942 1.921439 2.493934
#> 29 PF3D7-1401400 3.9679573 696.174509 39.689901 17.540344
#> 84 PF3D7-1401600 11.4933936 93.076303 7.631160 12.196875
#> 1 PF3D7-1401900 0.1368261 43.270750 3.321464 13.027613
#> 80 PF3D7-1402200 10.9460892 3.754015 1.554102 2.415553
#> window_start window_end deviance_explained
#> 28 3.4890659 4.1731965 0.1881312
#> 44 5.6782838 6.3624143 0.2035355
#> 29 3.6258920 4.3100226 0.2513795
#> 84 11.1513283 11.8354589 0.6891177
#> 1 -0.2052392 0.4788914 0.3846938
#> 80 10.6040239 11.2881544 0.1768597
Here, we add them to the BLASE object for mapping with.
genes(blase_data) <- genes_to_use
Now we can perform the actual mapping step, and review the results.
mapping_results <- map_all_best_bins(
blase_data,
zhang_2021_heat_shock_bulk,
BPPARAM = bpparam
)
plot_mapping_result_heatmap(mapping_results)
We calculate DE genes using Limma (ref) as we have access to only normalised counts.
metadata <- data.frame(row.names = seq_len(12))
metadata$strain <- c(
rep("NF54", 4),
rep("PB4", 4),
rep("PB31", 4)
)
metadata$growth_conditions <- rep(c("Normal", "Normal", "HS", "HS"), 3)
metadata$group <- paste0(metadata$strain, "_", metadata$growth_conditions)
rownames(metadata) <- c(
"NF54_37_1",
"NF54_37_2",
"NF54_41_1",
"NF54_41_2",
"PB4_37_1",
"PB4_37_2",
"PB4_41_1",
"PB4_41_2",
"PB31_37_1",
"PB31_37_2",
"PB31_41_1",
"PB31_41_2"
)
design <- model.matrix(~ 0 + group, metadata)
colnames(design) <- gsub("group", "", colnames(design))
contr.matrix <- makeContrasts(
WT_normal_v_HS = NF54_Normal - NF54_HS,
PB31_normal_v_HS = PB31_Normal - PB31_HS,
normal_NF54_v_PB31 = NF54_Normal - PB31_Normal,
levels = colnames(design)
)
vfit <- lmFit(zhang_2021_heat_shock_bulk, design)
vfit <- contrasts.fit(vfit, contrasts = contr.matrix)
efit <- eBayes(vfit)
summary(decideTests(efit))
#> WT_normal_v_HS PB31_normal_v_HS normal_NF54_v_PB31
#> Down 118 118 179
#> NotSig 822 616 767
#> Up 50 256 44
PB31_normal_v_HS_BULK_DE <- topTable(efit, n = Inf, coef = 2)
PB31_normal_v_HS_BULK_DE <- PB31_normal_v_HS_BULK_DE[PB31_normal_v_HS_BULK_DE$adj.P.Val < 0.05 & PB31_normal_v_HS_BULK_DE$logFC > 0, ]
PB31_normal_v_HS_BULK_DE <- PB31_normal_v_HS_BULK_DE[order(-PB31_normal_v_HS_BULK_DE$logFC), ]
First, let’s pseudobulk based on the pseudotime bins
pseudobulked_MCA_PF_SCE <- data.frame(row.names = rownames(MCA_PF_SCE))
for (r in mapping_results) {
pseudobulked_MCA_PF_SCE[bulk_name(r)] <- rowSums(counts(MCA_PF_SCE[, MCA_PF_SCE$pseudotime_bin == best_bin(r)]))
}
pseudobulked_MCA_PF_SCE <- as.matrix(pseudobulked_MCA_PF_SCE)
And now calculate DE genes for these
## Normalise, not needed for true bulks
par(mfrow = c(1, 2))
v <- voom(pseudobulked_MCA_PF_SCE, design, plot = FALSE)
vfit_sc <- lmFit(v, design)
vfit_sc <- contrasts.fit(vfit_sc, contrasts = contr.matrix)
efit_sc <- eBayes(vfit_sc)
summary(decideTests(efit_sc))
#> WT_normal_v_HS PB31_normal_v_HS normal_NF54_v_PB31
#> Down 0 257 127
#> NotSig 1746 1370 1343
#> Up 0 119 276
PB31_normal_v_HS_SC_DE <- topTable(efit_sc, n = Inf, coef = 2)
PB31_normal_v_HS_SC_DE <- PB31_normal_v_HS_SC_DE[PB31_normal_v_HS_SC_DE$adj.P.Val < 0.05 & PB31_normal_v_HS_SC_DE$logFC > 0, ]
PB31_normal_v_HS_SC_DE <- PB31_normal_v_HS_SC_DE[order(-PB31_normal_v_HS_SC_DE$logFC), ]
How many of these genes overlap? We can look at the intersection using a Venn Diagram:
ggVennDiagram(list(
Phenotype = rownames(PB31_normal_v_HS_BULK_DE),
Development = rownames(PB31_normal_v_HS_SC_DE)
)) + ggtitle("PB31 Normal vs Heat Shock")
Great, there are genes which we can correct. We can now remove these from the original DE genes list.
NB: This is a primitive approach, and there are likely many other and better ways to calculate which genes should be kept or removed.
PB31_normal_v_HS_corrected_DE <- rownames(PB31_normal_v_HS_BULK_DE[!(rownames(PB31_normal_v_HS_BULK_DE) %in% rownames(PB31_normal_v_HS_SC_DE)), ])
head(PB31_normal_v_HS_corrected_DE)
#> [1] "PF3D7-0725400" "PF3D7-0905400" "PF3D7-1010300" "PF3D7-0731500"
#> [5] "PF3D7-0913800" "PF3D7-1017500"
Further downstream analysis can now be performed, as desired by the researcher, for example Gene Ontology term enrichment.
sessionInfo()
#> R version 4.5.1 Patched (2025-08-23 r88802)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.3 LTS
#>
#> Matrix products: default
#> BLAS: /home/biocbuild/bbs-3.22-bioc/R/lib/libRblas.so
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0 LAPACK version 3.12.0
#>
#> Random number generation:
#> RNG: Mersenne-Twister
#> Normal: Inversion
#> Sample: Rounding
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_GB LC_COLLATE=C
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: America/New_York
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] stats4 stats graphics grDevices utils datasets methods
#> [8] base
#>
#> other attached packages:
#> [1] ggVennDiagram_1.5.4 limma_3.65.5
#> [3] patchwork_1.3.2 blase_0.99.3
#> [5] BiocParallel_1.43.4 scater_1.37.0
#> [7] ggplot2_4.0.0 scuttle_1.19.0
#> [9] SingleCellExperiment_1.31.1 SummarizedExperiment_1.39.2
#> [11] Biobase_2.69.1 GenomicRanges_1.61.5
#> [13] Seqinfo_0.99.2 IRanges_2.43.5
#> [15] S4Vectors_0.47.4 BiocGenerics_0.55.1
#> [17] generics_0.1.4 MatrixGenerics_1.21.0
#> [19] matrixStats_1.5.0 BiocStyle_2.37.1
#>
#> loaded via a namespace (and not attached):
#> [1] RcppAnnoy_0.0.22 splines_4.5.1 later_1.4.4
#> [4] tibble_3.3.0 polyclip_1.10-7 fastDummies_1.7.5
#> [7] lifecycle_1.0.4 globals_0.18.0 lattice_0.22-7
#> [10] MASS_7.3-65 magrittr_2.0.4 plotly_4.11.0
#> [13] sass_0.4.10 rmarkdown_2.30 jquerylib_0.1.4
#> [16] yaml_2.3.10 httpuv_1.6.16 Seurat_5.3.0
#> [19] sctransform_0.4.2 spam_2.11-1 sp_2.2-0
#> [22] spatstat.sparse_3.1-0 reticulate_1.43.0 cowplot_1.2.0
#> [25] pbapply_1.7-4 RColorBrewer_1.1-3 abind_1.4-8
#> [28] Rtsne_0.17 purrr_1.1.0 ggrepel_0.9.6
#> [31] irlba_2.3.5.1 listenv_0.9.1 spatstat.utils_3.2-0
#> [34] goftest_1.2-3 RSpectra_0.16-2 spatstat.random_3.4-2
#> [37] fitdistrplus_1.2-4 parallelly_1.45.1 codetools_0.2-20
#> [40] DelayedArray_0.35.3 tidyselect_1.2.1 farver_2.1.2
#> [43] ScaledMatrix_1.17.0 viridis_0.6.5 spatstat.explore_3.5-3
#> [46] jsonlite_2.0.0 BiocNeighbors_2.3.1 progressr_0.16.0
#> [49] ggridges_0.5.7 survival_3.8-3 tools_4.5.1
#> [52] ica_1.0-3 Rcpp_1.1.0 glue_1.8.0
#> [55] gridExtra_2.3 SparseArray_1.9.1 xfun_0.53
#> [58] mgcv_1.9-3 dplyr_1.1.4 withr_3.0.2
#> [61] BiocManager_1.30.26 fastmap_1.2.0 boot_1.3-32
#> [64] digest_0.6.37 rsvd_1.0.5 R6_2.6.1
#> [67] mime_0.13 scattermore_1.2 tensor_1.5.1
#> [70] dichromat_2.0-0.1 spatstat.data_3.1-8 tidyr_1.3.1
#> [73] data.table_1.17.8 httr_1.4.7 htmlwidgets_1.6.4
#> [76] S4Arrays_1.9.1 uwot_0.2.3 pkgconfig_2.0.3
#> [79] gtable_0.3.6 lmtest_0.9-40 S7_0.2.0
#> [82] XVector_0.49.1 htmltools_0.5.8.1 dotCall64_1.2
#> [85] bookdown_0.45 SeuratObject_5.2.0 scales_1.4.0
#> [88] png_0.1-8 spatstat.univar_3.1-4 knitr_1.50
#> [91] reshape2_1.4.4 nlme_3.1-168 cachem_1.1.0
#> [94] zoo_1.8-14 stringr_1.5.2 KernSmooth_2.23-26
#> [97] parallel_4.5.1 miniUI_0.1.2 vipor_0.4.7
#> [100] pillar_1.11.1 grid_4.5.1 vctrs_0.6.5
#> [103] RANN_2.6.2 promises_1.3.3 BiocSingular_1.25.0
#> [106] beachmat_2.25.5 xtable_1.8-4 cluster_2.1.8.1
#> [109] beeswarm_0.4.0 evaluate_1.0.5 tinytex_0.57
#> [112] magick_2.9.0 cli_3.6.5 compiler_4.5.1
#> [115] rlang_1.1.6 crayon_1.5.3 future.apply_1.20.0
#> [118] labeling_0.4.3 plyr_1.8.9 ggbeeswarm_0.7.2
#> [121] stringi_1.8.7 viridisLite_0.4.2 deldir_2.0-4
#> [124] lazyeval_0.2.2 spatstat.geom_3.6-0 Matrix_1.7-4
#> [127] RcppHNSW_0.6.0 sparseMatrixStats_1.21.0 future_1.67.0
#> [130] statmod_1.5.0 shiny_1.11.1 ROCR_1.0-11
#> [133] igraph_2.1.4 bslib_0.9.0