License: GPL (>= 3)

1 Read GMT Files

GMT files, such as those available through the Molecular Signatures Database (MSigDB)(Liberzon et al. 2011, 2015), can be read as named lists with readGMT:

# Path to GMT file - MSigDB Gene Ontology sets
pathToGMT <- system.file(
    "extdata", "c5.go.v2023.2.Hs.symbols.gmt.gz",
    package = "TMSig"
)

geneSets <- readGMT(path = pathToGMT)

length(geneSets) # 10461 gene sets
#> [1] 10461

head(names(geneSets)) # first 6 gene set names
#> [1] "GOBP_10_FORMYLTETRAHYDROFOLATE_METABOLIC_PROCESS"          
#> [2] "GOBP_2FE_2S_CLUSTER_ASSEMBLY"                              
#> [3] "GOBP_2_OXOGLUTARATE_METABOLIC_PROCESS"                     
#> [4] "GOBP_3_PHOSPHOADENOSINE_5_PHOSPHOSULFATE_METABOLIC_PROCESS"
#> [5] "GOBP_3_UTR_MEDIATED_MRNA_DESTABILIZATION"                  
#> [6] "GOBP_3_UTR_MEDIATED_MRNA_STABILIZATION"

geneSets[[1]] # genes in first set
#> [1] "AASDHPPT" "ALDH1L1"  "ALDH1L2"  "MTHFD1"   "MTHFD1L"  "MTHFD2L"

2 Filter Sets

filterSets keeps sets that satisfy minimum and maximum size constraints. Optionally, sets can be restricted to a smaller background of genes before filtering by size.

# Filter by size - no background
filt <- filterSets(x = geneSets, 
                   min_size = 10L, 
                   max_size = 500L)
length(filt) # 7096 gene sets remain (down from 10461)
#> [1] 7096

Normally, the background is the set of genes quantified in a particular experiment. For the purposes of this example, the top 2000 most common genes will be selected to serve as the background. This ensures the gene sets will maintain some degree of overlap for later steps.

# Top 2000 most common genes
topGenes <- table(unlist(geneSets))
topGenes <- head(sort(topGenes, decreasing = TRUE), 2000L)
head(topGenes)
#> 
#>    TNF   AKT1  WNT5A CTNNB1  TGFB1 NOTCH1 
#>    809    675    669    655    609    588

background <- names(topGenes)

The gene sets will be restricted to elements of the background before filtering by size.

# Restrict genes sets to background of genes
geneSetsFilt <- filterSets(
    x = geneSets, 
    background = background, 
    min_size = 10L # min. overlap of each set with background
)

length(geneSetsFilt) # 4629 gene sets pass
#> [1] 4629

The proportion of genes in each set that overlap with the background will be calculated and used to further select sets.

# Calculate proportion of overlap with background
setSizes <- lengths(geneSetsFilt)
setSizesOld <- lengths(geneSets)[names(geneSetsFilt)]
overlapProp <- setSizes / setSizesOld

plot(setSizesOld, overlapProp, main = "Set Size vs. Overlap")