Chapter 4 Human PBMC with surface proteins (10X Genomics)

4.1 Introduction

Here, we describe a brief analysis of yet another peripheral blood mononuclear cell (PBMC) dataset from 10X Genomics (Zheng et al. 2017). Data are publicly available from the 10X Genomics website, from which we download the filtered gene/barcode count matrices for gene expression and cell surface proteins.

4.2 Data loading

library(BiocFileCache)
bfc <- BiocFileCache(ask=FALSE)
exprs.data <- bfcrpath(bfc, file.path(
    "http://cf.10xgenomics.com/samples/cell-vdj/3.1.0",
    "vdj_v1_hs_pbmc3",
    "vdj_v1_hs_pbmc3_filtered_feature_bc_matrix.tar.gz"))
untar(exprs.data, exdir=tempdir())

library(DropletUtils)
sce.pbmc <- read10xCounts(file.path(tempdir(), "filtered_feature_bc_matrix"))
sce.pbmc <- splitAltExps(sce.pbmc, rowData(sce.pbmc)$Type)

4.3 Quality control

unfiltered <- sce.pbmc

We discard cells with high mitochondrial proportions and few detectable ADT counts.

library(scater)
is.mito <- grep("^MT-", rowData(sce.pbmc)$Symbol)
stats <- perCellQCMetrics(sce.pbmc, subsets=list(Mito=is.mito))

high.mito <- isOutlier(stats$subsets_Mito_percent, type="higher")
low.adt <- stats$`altexps_Antibody Capture_detected` < nrow(altExp(sce.pbmc))/2

discard <- high.mito | low.adt
sce.pbmc <- sce.pbmc[,!discard]

We examine some of the statistics:

summary(high.mito)

##    Mode   FALSE    TRUE 
## logical    6660     571

summary(low.adt)

##    Mode   FALSE 
## logical    7231

summary(discard)

##    Mode   FALSE    TRUE 
## logical    6660     571

We examine the distribution of each QC metric (Figure 4.1).

colData(unfiltered) <- cbind(colData(unfiltered), stats)
unfiltered$discard <- discard

gridExtra::grid.arrange(
    plotColData(unfiltered, y="sum", colour_by="discard") +
        scale_y_log10() + ggtitle("Total count"),
    plotColData(unfiltered, y="detected", colour_by="discard") +
        scale_y_log10() + ggtitle("Detected features"),
    plotColData(unfiltered, y="subsets_Mito_percent",
        colour_by="discard") + ggtitle("Mito percent"),
    plotColData(unfiltered, y="altexps_Antibody Capture_detected",
        colour_by="discard") + ggtitle("ADT detected"),
    ncol=2
)

Distribution of each QC metric in the PBMC dataset, where each point is a cell and is colored by whether or not it was discarded by the outlier-based QC approach.

Figure 4.1: Distribution of each QC metric in the PBMC dataset, where each point is a cell and is colored by whether or not it was discarded by the outlier-based QC approach.

We also plot the mitochondrial proportion against the total count for each cell, as one does (Figure 4.2).

plotColData(unfiltered, x="sum", y="subsets_Mito_percent",
    colour_by="discard") + scale_x_log10()

Figure 4.2: Percentage of UMIs mapped to mitochondrial genes against the totalcount for each cell.

4.4 Normalization

Computing size factors for the gene expression and ADT counts.

library(scran)

set.seed(1000)
clusters <- quickCluster(sce.pbmc)
sce.pbmc <- computeSumFactors(sce.pbmc, cluster=clusters)
altExp(sce.pbmc) <- computeMedianFactors(altExp(sce.pbmc))
sce.pbmc <- applySCE(sce.pbmc, logNormCounts)

We generate some summary statistics for both sets of size factors:

summary(sizeFactors(sce.pbmc))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.074   0.719   0.908   1.000   1.133   8.858

summary(sizeFactors(altExp(sce.pbmc)))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.10    0.70    0.83    1.00    1.03  227.36

We also look at the distribution of size factors compared to the library size for each set of features (Figure 4.3).

par(mfrow=c(1,2))
plot(librarySizeFactors(sce.pbmc), sizeFactors(sce.pbmc), pch=16,
    xlab="Library size factors", ylab="Deconvolution factors", 
    main="Gene expression", log="xy")
plot(librarySizeFactors(altExp(sce.pbmc)), sizeFactors(altExp(sce.pbmc)), pch=16,
    xlab="Library size factors", ylab="Median-based factors", 
    main="Antibody capture", log="xy")

Plot of the deconvolution size factors for the gene expression values (left) or the median-based size factors for the ADT expression values (right) compared to the library size-derived factors for the corresponding set of features. Each point represents a cell.

Figure 4.3: Plot of the deconvolution size factors for the gene expression values (left) or the median-based size factors for the ADT expression values (right) compared to the library size-derived factors for the corresponding set of features. Each point represents a cell.

4.5 Dimensionality reduction

We omit the PCA step for the ADT expression matrix, given that it is already so low-dimensional, and progress directly to $t$-SNE and UMAP visualizations.

set.seed(100000)
altExp(sce.pbmc) <- runTSNE(altExp(sce.pbmc))

set.seed(1000000)
altExp(sce.pbmc) <- runUMAP(altExp(sce.pbmc))

4.6 Clustering

We perform graph-based clustering on the ADT data and use the assignments as the column labels of the alternative Experiment.

g.adt <- buildSNNGraph(altExp(sce.pbmc), k=10, d=NA)
clust.adt <- igraph::cluster_walktrap(g.adt)$membership
colLabels(altExp(sce.pbmc)) <- factor(clust.adt)

We examine some basic statistics about the size of each cluster, their separation (Figure 4.4) and their distribution in our $t$-SNE plot (Figure 4.5).

table(colLabels(altExp(sce.pbmc)))

## 
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
##  160  507  662   39  691 1415   32  650   76 1037  121   47   68   25   15  562 
##   17   18   19   20   21   22   23   24 
##  139   32   44  120   84   65   52   17

library(bluster)
mod <- pairwiseModularity(g.adt, clust.adt, as.ratio=TRUE)

library(pheatmap)
pheatmap::pheatmap(log10(mod + 10), cluster_row=FALSE, cluster_col=FALSE,
    color=colorRampPalette(c("white", "blue"))(101))

Heatmap of the pairwise cluster modularity scores in the PBMC dataset, computed based on the shared nearest neighbor graph derived from the ADT expression values.

Figure 4.4: Heatmap of the pairwise cluster modularity scores in the PBMC dataset, computed based on the shared nearest neighbor graph derived from the ADT expression values.

plotTSNE(altExp(sce.pbmc), colour_by="label", text_by="label", text_colour="red")

Obligatory $t$-SNE plot of PBMC dataset based on its ADT expression values, where each point is a cell and is colored by the cluster of origin. Cluster labels are also overlaid at the median coordinates across all cells in the cluster.

Figure 4.5: Obligatory $t$-SNE plot of PBMC dataset based on its ADT expression values, where each point is a cell and is colored by the cluster of origin. Cluster labels are also overlaid at the median coordinates across all cells in the cluster.

We perform some additional subclustering using the expression data to mimic an in silico FACS experiment.

set.seed(1010010)
subclusters <- quickSubCluster(sce.pbmc, clust.adt,
    prepFUN=function(x) {
        dec <- modelGeneVarByPoisson(x)
        top <- getTopHVGs(dec, prop=0.1)
        denoisePCA(x, dec, subset.row=top)
    }, 
    clusterFUN=function(x) {
        g.gene <- buildSNNGraph(x, k=10, use.dimred = 'PCA')
        igraph::cluster_walktrap(g.gene)$membership
    }
)

We counting the number of gene expression-derived subclusters in each ADT-derived parent cluster.

data.frame(
    Cluster=names(subclusters),
    Ncells=vapply(subclusters, ncol, 0L),
    Nsub=vapply(subclusters, function(x) length(unique(x$subcluster)), 0L)
)

##    Cluster Ncells Nsub
## 1        1    160    3
## 2        2    507    4
## 3        3    662    5
## 4        4     39    1
## 5        5    691    5
## 6        6   1415    7
## 7        7     32    1
## 8        8    650    7
## 9        9     76    2
## 10      10   1037    8
## 11      11    121    2
## 12      12     47    1
## 13      13     68    2
## 14      14     25    1
## 15      15     15    1
## 16      16    562    9
## 17      17    139    3
## 18      18     32    1
## 19      19     44    1
## 20      20    120    4
## 21      21     84    3
## 22      22     65    2
## 23      23     52    3
## 24      24     17    1

Session Info

R version 4.3.0 RC (2023-04-13 r84269)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 22.04.2 LTS

Matrix products: default
BLAS:   /home/biocbuild/bbs-3.17-bioc/R/lib/libRblas.so 
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_GB              LC_COLLATE=C              
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

time zone: America/New_York
tzcode source: system (glibc)

attached base packages:
[1] stats4    stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] pheatmap_1.0.12             bluster_1.10.0             
 [3] scran_1.28.0                scater_1.28.0              
 [5] ggplot2_3.4.2               scuttle_1.10.0             
 [7] DropletUtils_1.20.0         SingleCellExperiment_1.22.0
 [9] SummarizedExperiment_1.30.0 Biobase_2.60.0             
[11] GenomicRanges_1.52.0        GenomeInfoDb_1.36.0        
[13] IRanges_2.34.0              S4Vectors_0.38.0           
[15] BiocGenerics_0.46.0         MatrixGenerics_1.12.0      
[17] matrixStats_0.63.0          BiocFileCache_2.8.0        
[19] dbplyr_2.3.2                BiocStyle_2.28.0           
[21] rebook_1.10.0              

loaded via a namespace (and not attached):
  [1] DBI_1.1.3                 bitops_1.0-7             
  [3] gridExtra_2.3             CodeDepends_0.6.5        
  [5] rlang_1.1.0               magrittr_2.0.3           
  [7] RcppAnnoy_0.0.20          compiler_4.3.0           
  [9] RSQLite_2.3.1             dir.expiry_1.8.0         
 [11] DelayedMatrixStats_1.22.0 vctrs_0.6.2              
 [13] pkgconfig_2.0.3           fastmap_1.1.1            
 [15] XVector_0.40.0            labeling_0.4.2           
 [17] utf8_1.2.3                rmarkdown_2.21           
 [19] graph_1.78.0              ggbeeswarm_0.7.1         
 [21] purrr_1.0.1               bit_4.0.5                
 [23] xfun_0.39                 zlibbioc_1.46.0          
 [25] cachem_1.0.7              beachmat_2.16.0          
 [27] jsonlite_1.8.4            blob_1.2.4               
 [29] highr_0.10                rhdf5filters_1.12.0      
 [31] DelayedArray_0.26.0       Rhdf5lib_1.22.0          
 [33] BiocParallel_1.34.0       cluster_2.1.4            
 [35] irlba_2.3.5.1             parallel_4.3.0           
 [37] R6_2.5.1                  RColorBrewer_1.1-3       
 [39] bslib_0.4.2               limma_3.56.0             
 [41] jquerylib_0.1.4           Rcpp_1.0.10              
 [43] bookdown_0.33             knitr_1.42               
 [45] R.utils_2.12.2            igraph_1.4.2             
 [47] Matrix_1.5-4              tidyselect_1.2.0         
 [49] viridis_0.6.2             yaml_2.3.7               
 [51] codetools_0.2-19          curl_5.0.0               
 [53] lattice_0.21-8            tibble_3.2.1             
 [55] withr_2.5.0               Rtsne_0.16               
 [57] evaluate_0.20             pillar_1.9.0             
 [59] BiocManager_1.30.20       filelock_1.0.2           
 [61] generics_0.1.3            RCurl_1.98-1.12          
 [63] sparseMatrixStats_1.12.0  munsell_0.5.0            
 [65] scales_1.2.1              glue_1.6.2               
 [67] metapod_1.8.0             tools_4.3.0              
 [69] BiocNeighbors_1.18.0      ScaledMatrix_1.8.0       
 [71] locfit_1.5-9.7            XML_3.99-0.14            
 [73] cowplot_1.1.1             rhdf5_2.44.0             
 [75] grid_4.3.0                edgeR_3.42.0             
 [77] colorspace_2.1-0          GenomeInfoDbData_1.2.10  
 [79] beeswarm_0.4.0            BiocSingular_1.16.0      
 [81] HDF5Array_1.28.0          vipor_0.4.5              
 [83] cli_3.6.1                 rsvd_1.0.5               
 [85] fansi_1.0.4               viridisLite_0.4.1        
 [87] dplyr_1.1.2               uwot_0.1.14              
 [89] gtable_0.3.3              R.methodsS3_1.8.2        
 [91] sass_0.4.5                digest_0.6.31            
 [93] ggrepel_0.9.3             dqrng_0.3.0              
 [95] farver_2.1.1              memoise_2.0.1            
 [97] htmltools_0.5.5           R.oo_1.25.0              
 [99] lifecycle_1.0.3           httr_1.4.5               
[101] statmod_1.5.0             bit64_4.0.5

References

Zheng, G. X., J. M. Terry, P. Belgrader, P. Ryvkin, Z. W. Bent, R. Wilson, S. B. Ziraldo, et al. 2017. “Massively parallel digital transcriptional profiling of single cells.” Nat Commun 8 (January): 14049.