Contents

1 Theory

See slides.

A recent tweet provides a nice summary of efforts to benchmark gene set enrichement analysis methods using the GSEABenchmarkR package.

2 Practice

Data input and massage

library(airway)
data(airway)
airway$dex <- relevel(airway$dex, "untrt")

Differential expression analysis

library(DESeq2)
des <- DESeqDataSet(airway, design = ~ cell + dex)
des <- DESeq(des)
res <- results(des)

Transition to tidy data

library(dplyr)
library(tibble)
tbl <- res %>%
    as.data.frame() %>%
    as_tibble(rownames = "ENSEMBL")
tbl
## # A tibble: 64,102 x 7
##    ENSEMBL      baseMean log2FoldChange   lfcSE   stat     pvalue      padj
##    <chr>           <dbl>          <dbl>   <dbl>  <dbl>      <dbl>     <dbl>
##  1 ENSG0000000…  709.           -0.381   0.101  -3.79     1.52e-4   1.28e-3
##  2 ENSG0000000…    0            NA      NA      NA       NA        NA      
##  3 ENSG0000000…  520.            0.207   0.112   1.84     6.53e-2   1.97e-1
##  4 ENSG0000000…  237.            0.0379  0.143   0.264    7.92e-1   9.11e-1
##  5 ENSG0000000…   57.9          -0.0882  0.287  -0.307    7.59e-1   8.95e-1
##  6 ENSG0000000…    0.318        -1.38    3.50   -0.394    6.94e-1  NA      
##  7 ENSG0000000… 5817.            0.426   0.0883  4.83     1.38e-6   1.82e-5
##  8 ENSG0000000… 1282.           -0.241   0.0887 -2.72     6.58e-3   3.28e-2
##  9 ENSG0000000…  610.           -0.0476  0.167  -0.286    7.75e-1   9.03e-1
## 10 ENSG0000000…  369.           -0.500   0.121  -4.14     3.48e-5   3.42e-4
## # … with 64,092 more rows

2.1 Example: hypergeometric test using limma::goana()

Requires ENTREZ identifiers

library(org.Hs.eg.db)
tbl <- tbl %>%
    mutate(
        ENTREZID = mapIds(org.Hs.eg.db, ENSEMBL, "ENTREZID", "ENSEMBL")
    ) %>%
    dplyr::select(ENSEMBL, ENTREZID, everything())
tbl
## # A tibble: 64,102 x 8
##    ENSEMBL ENTREZID baseMean log2FoldChange   lfcSE   stat   pvalue
##    <chr>   <chr>       <dbl>          <dbl>   <dbl>  <dbl>    <dbl>
##  1 ENSG00… 7105      709.           -0.381   0.101  -3.79   1.52e-4
##  2 ENSG00… 64102       0            NA      NA      NA     NA      
##  3 ENSG00… 8813      520.            0.207   0.112   1.84   6.53e-2
##  4 ENSG00… 57147     237.            0.0379  0.143   0.264  7.92e-1
##  5 ENSG00… 55732      57.9          -0.0882  0.287  -0.307  7.59e-1
##  6 ENSG00… 2268        0.318        -1.38    3.50   -0.394  6.94e-1
##  7 ENSG00… 3075     5817.            0.426   0.0883  4.83   1.38e-6
##  8 ENSG00… 2519     1282.           -0.241   0.0887 -2.72   6.58e-3
##  9 ENSG00… 2729      610.           -0.0476  0.167  -0.286  7.75e-1
## 10 ENSG00… 4800      369.           -0.500   0.121  -4.14   3.48e-5
## # … with 64,092 more rows, and 1 more variable: padj <dbl>

Universe – must be testable for DE

tbl <- tbl %>%
    filter(!is.na(padj), !is.na(ENTREZID))
tbl
## # A tibble: 14,550 x 8
##    ENSEMBL   ENTREZID baseMean log2FoldChange  lfcSE   stat  pvalue    padj
##    <chr>     <chr>       <dbl>          <dbl>  <dbl>  <dbl>   <dbl>   <dbl>
##  1 ENSG0000… 7105        709.         -0.381  0.101  -3.79  1.52e-4 1.28e-3
##  2 ENSG0000… 8813        520.          0.207  0.112   1.84  6.53e-2 1.97e-1
##  3 ENSG0000… 57147       237.          0.0379 0.143   0.264 7.92e-1 9.11e-1
##  4 ENSG0000… 55732        57.9        -0.0882 0.287  -0.307 7.59e-1 8.95e-1
##  5 ENSG0000… 3075       5817.          0.426  0.0883  4.83  1.38e-6 1.82e-5
##  6 ENSG0000… 2519       1282.         -0.241  0.0887 -2.72  6.58e-3 3.28e-2
##  7 ENSG0000… 2729        610.         -0.0476 0.167  -0.286 7.75e-1 9.03e-1
##  8 ENSG0000… 4800        369.         -0.500  0.121  -4.14  3.48e-5 3.42e-4
##  9 ENSG0000… 90529       183.         -0.124  0.180  -0.689 4.91e-1 7.24e-1
## 10 ENSG0000… 57185      2814.         -0.0411 0.103  -0.400 6.89e-1 8.57e-1
## # … with 14,540 more rows

limma::goana() – Hypergeometric

library(limma)
go <-
    goana(tbl$ENTREZID[tbl$padj < .05], tbl$ENTREZID, "Hs") %>%
    as_tibble(rownames = "GOID") %>%
    dplyr::select(GOID, Ont, everything())
go
## # A tibble: 20,656 x 6
##    GOID      Ont   Term                                    N    DE     P.DE
##    <chr>     <chr> <chr>                               <dbl> <dbl>    <dbl>
##  1 GO:00017… BP    cell activation                       939   333 6.80e-12
##  2 GO:00022… BP    immune effector process               808   251 2.98e- 4
##  3 GO:00022… BP    cell activation involved in immune…   498   148 2.43e- 2
##  4 GO:00022… BP    myeloid leukocyte activation          463   147 2.01e- 3
##  5 GO:00022… BP    myeloid cell activation involved i…   399   118 4.61e- 2
##  6 GO:00022… BP    neutrophil activation involved in …   361   108 4.08e- 2
##  7 GO:00023… BP    leukocyte activation involved in i…   494   148 1.86e- 2
##  8 GO:00023… BP    immune system process                1958   652 8.14e-16
##  9 GO:00024… BP    leukocyte mediated immunity           538   165 5.33e- 3
## 10 GO:00024… BP    myeloid leukocyte mediated immunity   408   124 1.89e- 2
## # … with 20,646 more rows

What GO id’s are most differentially expressed? (Why do these gene sets seem to have a large size, N?)

go %>% arrange(P.DE)
## # A tibble: 20,656 x 6
##    GOID       Ont   Term                                   N    DE     P.DE
##    <chr>      <chr> <chr>                              <dbl> <dbl>    <dbl>
##  1 GO:0071944 CC    cell periphery                      3081  1104 8.45e-45
##  2 GO:0050896 BP    response to stimulus                5799  1859 8.77e-45
##  3 GO:0032502 BP    developmental process               4168  1410 9.73e-44
##  4 GO:0048856 BP    anatomical structure development    3891  1330 3.45e-43
##  5 GO:0005886 CC    plasma membrane                     3013  1078 3.83e-43
##  6 GO:0032501 BP    multicellular organismal process    4678  1546 1.83e-42
##  7 GO:0048731 BP    system development                  3202  1128 7.52e-42
##  8 GO:0023052 BP    signaling                           4060  1372 7.66e-42
##  9 GO:0007275 BP    multicellular organism development  3579  1231 1.46e-40
## 10 GO:0007154 BP    cell communication                  4076  1371 1.57e-40
## # … with 20,646 more rows

Sanity check?

go %>%
    filter(grepl("glucocorticoid", Term)) %>%
    arrange(P.DE)
## # A tibble: 22 x 6
##    GOID     Ont   Term                                     N    DE     P.DE
##    <chr>    <chr> <chr>                                <dbl> <dbl>    <dbl>
##  1 GO:0051… BP    response to glucocorticoid              92    43  1.11e-5
##  2 GO:0071… BP    cellular response to glucocorticoid…    42    21  6.42e-4
##  3 GO:2000… BP    positive regulation of glucocortico…     2     2  6.64e-2
##  4 GO:2000… BP    regulation of glucocorticoid recept…     8     4  1.25e-1
##  5 GO:0006… BP    glucocorticoid biosynthetic process      8     4  1.25e-1
##  6 GO:0043… BP    glucocorticoid mediated signaling p…     3     2  1.65e-1
##  7 GO:0008… BP    glucocorticoid metabolic process        13     5  2.26e-1
##  8 GO:0004… MF    glucocorticoid receptor activity         1     1  2.58e-1
##  9 GO:0031… BP    negative regulation of glucocortico…     4     2  2.75e-1
## 10 GO:0031… BP    negative regulation of glucocortico…     4     2  2.75e-1
## # … with 12 more rows

What genes in set?

genesets <-
    AnnotationDbi::select(org.Hs.eg.db, tbl$ENTREZID, "GOALL", "ENTREZID") %>%
    as_tibble() %>%
    dplyr::select(GOID = GOALL, Ont = ONTOLOGYALL, ENTREZID) %>%
    distinct() %>%
    arrange(Ont, GOID)
genesets
## # A tibble: 1,574,080 x 3
##    GOID       Ont   ENTREZID
##    <chr>      <chr> <chr>   
##  1 GO:0000002 BP    3980    
##  2 GO:0000002 BP    1890    
##  3 GO:0000002 BP    50484   
##  4 GO:0000002 BP    4205    
##  5 GO:0000002 BP    9093    
##  6 GO:0000002 BP    56652   
##  7 GO:0000002 BP    10891   
##  8 GO:0000002 BP    55186   
##  9 GO:0000002 BP    4358    
## 10 GO:0000002 BP    10000   
## # … with 1,574,070 more rows

3 Provenance

sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 19
## 
## Matrix products: default
## BLAS:   /home/msmith/Applications/R/R-3.6.0/lib/libRblas.so
## LAPACK: /home/msmith/Applications/R/R-3.6.0/lib/libRlapack.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=de_DE.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=de_DE.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] tibble_2.1.3                limma_3.40.2               
##  [3] GO.db_3.8.2                 org.Hs.eg.db_3.8.2         
##  [5] AnnotationDbi_1.46.0        dplyr_0.8.3                
##  [7] airway_1.4.0                DESeq2_1.24.0              
##  [9] SummarizedExperiment_1.14.0 DelayedArray_0.10.0        
## [11] BiocParallel_1.18.0         matrixStats_0.54.0         
## [13] Biobase_2.44.0              GenomicRanges_1.36.0       
## [15] GenomeInfoDb_1.20.0         IRanges_2.18.1             
## [17] S4Vectors_0.22.0            BiocGenerics_0.30.0        
## [19] BiocStyle_2.12.0           
## 
## loaded via a namespace (and not attached):
##  [1] bit64_0.9-7            splines_3.6.0          Formula_1.2-3         
##  [4] assertthat_0.2.1       BiocManager_1.30.4     latticeExtra_0.6-28   
##  [7] blob_1.2.0             GenomeInfoDbData_1.2.1 yaml_2.2.0            
## [10] pillar_1.4.2           RSQLite_2.1.1          backports_1.1.4       
## [13] lattice_0.20-38        glue_1.3.1             digest_0.6.20         
## [16] RColorBrewer_1.1-2     XVector_0.24.0         checkmate_1.9.4       
## [19] colorspace_1.4-1       htmltools_0.3.6        Matrix_1.2-17         
## [22] XML_3.98-1.20          pkgconfig_2.0.2        genefilter_1.66.0     
## [25] bookdown_0.12          zlibbioc_1.30.0        purrr_0.3.2           
## [28] xtable_1.8-4           scales_1.0.0           htmlTable_1.13.1      
## [31] annotate_1.62.0        ggplot2_3.2.0          nnet_7.3-12           
## [34] lazyeval_0.2.2         cli_1.1.0              survival_2.44-1.1     
## [37] magrittr_1.5           crayon_1.3.4           memoise_1.1.0         
## [40] evaluate_0.14          fansi_0.4.0            foreign_0.8-71        
## [43] tools_3.6.0            data.table_1.12.2      stringr_1.4.0         
## [46] locfit_1.5-9.1         munsell_0.5.0          cluster_2.1.0         
## [49] compiler_3.6.0         rlang_0.4.0            grid_3.6.0            
## [52] RCurl_1.95-4.12        rstudioapi_0.10        htmlwidgets_1.3       
## [55] bitops_1.0-6           base64enc_0.1-3        rmarkdown_1.14        
## [58] codetools_0.2-16       gtable_0.3.0           DBI_1.0.0             
## [61] R6_2.4.0               gridExtra_2.3          knitr_1.23            
## [64] utf8_1.1.4             zeallot_0.1.0          bit_1.1-14            
## [67] Hmisc_4.2-0            stringi_1.4.3          Rcpp_1.0.1            
## [70] geneplotter_1.62.0     vctrs_0.2.0            rpart_4.1-15          
## [73] acepack_1.4.1          tidyselect_0.2.5       xfun_0.8