7/27/2017
library(SingleCellExperiment)
Light-weight container for single-cell genomics data that extends the RangedSummarizedExperiment
class with the following additional slots and methods specific to single-cell genomics datasets.
int_elementMetadata
int_colData
int_metadata
reducedDims
As suggested by the int_
prefix, the first three slots are not meant for direct manipulation.
isSpike<-
will set a proper column of int_elementMetadata
sizeFactors<-
will set a column in the int_colData
slot. There are two main ways to create instances of SingleCellExperiment
. The first is via the constructor.
sce <- SingleCellExperiment( assays = list(counts = matrix(rpois(100, lambda = 10), ncol=10, nrow=10))) sce
## class: SingleCellExperiment ## dim: 10 10 ## metadata(0): ## assays(1): counts ## rownames: NULL ## rowData names(0): ## colnames: NULL ## colData names(0): ## reduced(0): ## spikes(0):
The second is via coercion from SummarizedExperiment
objects.
se <- SummarizedExperiment( assays = list(counts = matrix(rpois(100, lambda = 10), ncol=10, nrow=10))) as(se, "SingleCellExperiment")
## class: SingleCellExperiment ## dim: 10 10 ## metadata(0): ## assays(1): counts ## rownames: NULL ## rowData names(0): ## colnames: NULL ## colData names(0): ## reduced(0): ## spikes(0):
library(scRNAseq) data(allen) allen
## class: SummarizedExperiment ## dim: 20908 379 ## metadata(2): SuppInfo which_qc ## assays(4): tophat_counts cufflinks_fpkm rsem_counts rsem_tpm ## rownames(20908): 0610007P14Rik 0610009B22Rik ... Zzef1 Zzz3 ## rowData names(0): ## colnames(379): SRR2140028 SRR2140022 ... SRR2139341 SRR2139336 ## colData names(22): NREADS NALIGNED ... Animal.ID ## passes_qc_checks_s
sce <- as(allen, "SingleCellExperiment") sce
## class: SingleCellExperiment ## dim: 20908 379 ## metadata(2): SuppInfo which_qc ## assays(4): tophat_counts cufflinks_fpkm rsem_counts rsem_tpm ## rownames(20908): 0610007P14Rik 0610009B22Rik ... Zzef1 Zzz3 ## rowData names(0): ## colnames(379): SRR2140028 SRR2140022 ... SRR2139341 SRR2139336 ## colData names(22): NREADS NALIGNED ... Animal.ID ## passes_qc_checks_s ## reduced(0): ## spikes(0):
isSpike(sce, "ERCC") <- grepl("^ERCC-", rownames(sce)) sce
## class: SingleCellExperiment ## dim: 20908 379 ## metadata(2): SuppInfo which_qc ## assays(4): tophat_counts cufflinks_fpkm rsem_counts rsem_tpm ## rownames(20908): 0610007P14Rik 0610009B22Rik ... Zzef1 Zzz3 ## rowData names(0): ## colnames(379): SRR2140028 SRR2140022 ... SRR2139341 SRR2139336 ## colData names(22): NREADS NALIGNED ... Animal.ID ## passes_qc_checks_s ## reduced(0): ## spikes(1): ERCC
table(isSpike(sce))
## ## FALSE TRUE ## 20816 92
spikeNames(sce)
## [1] "ERCC"
Let us pretend that the members of the Adam gene family have been spiked in as external genes in these data.
isSpike(sce, "Adam") <- grepl("^Adam[0-9]", rownames(sce)) sce
## class: SingleCellExperiment ## dim: 20908 379 ## metadata(2): SuppInfo which_qc ## assays(4): tophat_counts cufflinks_fpkm rsem_counts rsem_tpm ## rownames(20908): 0610007P14Rik 0610009B22Rik ... Zzef1 Zzz3 ## rowData names(0): ## colnames(379): SRR2140028 SRR2140022 ... SRR2139341 SRR2139336 ## colData names(22): NREADS NALIGNED ... Animal.ID ## passes_qc_checks_s ## reduced(0): ## spikes(2): ERCC Adam
table(isSpike(sce))
## ## FALSE TRUE ## 20783 125
table(isSpike(sce, "ERCC"))
## ## FALSE TRUE ## 20816 92
table(isSpike(sce, "Adam"))
## ## FALSE TRUE ## 20875 33
For illustration, we simply compute the total number of reads as size factors, but better ways to compute size factors are available (see, e.g., the scran package).
sizeFactors(sce) <- colSums(assay(sce)) head(sizeFactors(sce))
## SRR2140028 SRR2140022 SRR2140055 SRR2140083 SRR2139991 SRR2140067 ## 5173863 6445002 2343379 5438526 4757468 2364851
We can compute multiple size factors and store them in the object, by providing a name.
sizeFactors(sce, "ERCC") <- colSums(assay(sce)[isSpike(sce, "ERCC"),]) head(sizeFactors(sce, "ERCC"))
## SRR2140028 SRR2140022 SRR2140055 SRR2140083 SRR2139991 SRR2140067 ## 224648 186208 162370 512991 278034 64975
colData
and rowData
By default, spike-ins and size factors are not returned by such methods, as they are conceptually distinct from the rest of the metadata.
colnames(colData(sce))
## [1] "NREADS" "NALIGNED" ## [3] "RALIGN" "TOTAL_DUP" ## [5] "PRIMER" "PCT_RIBOSOMAL_BASES" ## [7] "PCT_CODING_BASES" "PCT_UTR_BASES" ## [9] "PCT_INTRONIC_BASES" "PCT_INTERGENIC_BASES" ## [11] "PCT_MRNA_BASES" "MEDIAN_CV_COVERAGE" ## [13] "MEDIAN_5PRIME_BIAS" "MEDIAN_3PRIME_BIAS" ## [15] "MEDIAN_5PRIME_TO_3PRIME_BIAS" "driver_1_s" ## [17] "dissection_s" "Core.Type" ## [19] "Primary.Type" "Secondary.Type" ## [21] "Animal.ID" "passes_qc_checks_s"
colData
and rowData
rowData(sce)
## DataFrame with 20908 rows and 0 columns
colData
and rowData
But they can be accessed by specifying internal=TRUE
.
colnames(colData(sce, internal=TRUE))
## [1] "NREADS" "NALIGNED" ## [3] "RALIGN" "TOTAL_DUP" ## [5] "PRIMER" "PCT_RIBOSOMAL_BASES" ## [7] "PCT_CODING_BASES" "PCT_UTR_BASES" ## [9] "PCT_INTRONIC_BASES" "PCT_INTERGENIC_BASES" ## [11] "PCT_MRNA_BASES" "MEDIAN_CV_COVERAGE" ## [13] "MEDIAN_5PRIME_BIAS" "MEDIAN_3PRIME_BIAS" ## [15] "MEDIAN_5PRIME_TO_3PRIME_BIAS" "driver_1_s" ## [17] "dissection_s" "Core.Type" ## [19] "Primary.Type" "Secondary.Type" ## [21] "Animal.ID" "passes_qc_checks_s" ## [23] "size_factor" "size_factor_ERCC"
colData
and rowData
rowData(sce, internal=TRUE)
## DataFrame with 20908 rows and 3 columns ## is_spike_ERCC is_spike is_spike_Adam ## <logical> <logical> <logical> ## 1 FALSE FALSE FALSE ## 2 FALSE FALSE FALSE ## 3 FALSE FALSE FALSE ## 4 FALSE FALSE FALSE ## 5 FALSE FALSE FALSE ## ... ... ... ... ## 20904 FALSE FALSE FALSE ## 20905 FALSE FALSE FALSE ## 20906 FALSE FALSE FALSE ## 20907 FALSE FALSE FALSE ## 20908 FALSE FALSE FALSE
library(Rtsne) set.seed(5252) pca_data <- prcomp(t(log1p(assay(sce_sub)))) tsne_data <- Rtsne(pca_data$x[,1:50], pca = FALSE) reducedDims(sce_sub) <- SimpleList(PCA=pca_data$x, TSNE=tsne_data$Y)
sce_sub
## class: SingleCellExperiment ## dim: 100 379 ## metadata(2): SuppInfo which_qc ## assays(4): tophat_counts cufflinks_fpkm rsem_counts rsem_tpm ## rownames(100): Lamp5 Fam19a1 ... Rnf2 Zfp35 ## rowData names(0): ## colnames(379): SRR2140028 SRR2140022 ... SRR2139341 SRR2139336 ## colData names(22): NREADS NALIGNED ... Animal.ID ## passes_qc_checks_s ## reduced(2): PCA TSNE ## spikes(2): ERCC Adam
reducedDims(sce_sub)
## List of length 2 ## names(2): PCA TSNE
head(reducedDim(sce_sub, "PCA")[,1:2])
## PC1 PC2 ## SRR2140028 17.557295 -7.717162 ## SRR2140022 21.468975 -1.198212 ## SRR2140055 4.303756 -11.360330 ## SRR2140083 21.440479 -9.435868 ## SRR2139991 15.592089 -11.043989 ## SRR2140067 16.539336 -9.831779
RangedSummarizedExperiment
rather than SummarizedExperiment
(rowRanges()
available).
SingleCellExperiment
class in specialized packages.
library(DelayedArray) saveHDF5SummarizedExperiment(sce, dir="./sce")
sce_h5 <- loadHDF5SummarizedExperiment("./sce/") sce_h5
## class: SingleCellExperiment ## dim: 20908 379 ## metadata(2): SuppInfo which_qc ## assays(4): tophat_counts cufflinks_fpkm rsem_counts rsem_tpm ## rownames(20908): 0610007P14Rik 0610009B22Rik ... Zzef1 Zzz3 ## rowData names(0): ## colnames(379): SRR2140028 SRR2140022 ... SRR2139341 SRR2139336 ## colData names(22): NREADS NALIGNED ... Animal.ID ## passes_qc_checks_s ## reduced(0): ## spikes(2): ERCC Adam
assay(sce_h5)
## DelayedMatrix object of 20908 x 379 doubles: ## SRR2140028 SRR2140022 ... SRR2139341 SRR2139336 ## 0610007P14Rik 234 486 . 1530 299 ## 0610009B22Rik 170 0 . 1182 719 ## 0610009L18Rik 0 0 . 0 0 ## 0610009O20Rik 0 1574 . 787 429 ## 0610010F05Rik 0 0 . 1125 254 ## ... . . . . . ## Zyg11a 0 0 . 0 0 ## Zyg11b 377 715 . 830 106 ## Zyx 0 11 . 803 0 ## Zzef1 37 698 . 0 678 ## Zzz3 1 85 . 4 0
library(pryr) object_size(sce)
## 256 MB
object_size(sce_h5)
## 2.53 MB