1 Exclude ranges

Genomic ranges of problematic genomic regions that should be avoided when working with genomic data. For human, mouse, and selected model organisms.

TL;DR - For human hg38 genome assembly, Anshul recommends ENCFF356LFX exclusion list regions.

BED files of exclusion regions are available on the ENCODE project website (Amemiya, Kundaje, and Boyle 2019). Human (hg19, hg38) and mouse (mm9, mm10) exclusion regions are available. However, exclusion lists generated by multiple labs often create uncertainty what to use. The purpose of this package is to provide a unified place for informed retrieval of exclusion regions.

Naming convention: <genome assembly>.<lab>.<original file name>, e.g., hg19.Birney.wgEncodeDacMapabilityConsensusExcludable.

See make-data.R how to create the excluderanges GRanges objects.

1.1 Install excluderanges

if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
}
# Install the development version of Bioconductor (need 3.14 and above)
# BiocManager::install(version = "devel")
# Check that you have a valid Bioconductor installation
# BiocManager::valid()
# Install the package
BiocManager::install("excluderanges", version = "devel")

# BiocManager::install("mdozmorov/excluderanges")

1.2 Use excluderanges

Get an overview of whatโ€™s available

suppressMessages(library(AnnotationHub))
ah <- AnnotationHub()
#> snapshotDate(): 2021-10-18
query_data <- query(ah, "excluderanges")
query_data
#> AnnotationHub with 42 records
#> # snapshotDate(): 2021-10-18
#> # $dataprovider: UCSC, ENCODE, mitra.stanford.edu/kundaje/akundaje/release/b...
#> # $species: Homo sapiens, Mus musculus, Drosophila melanogaster, Caenorhabdi...
#> # $rdataclass: GRanges
#> # additional mcols(): taxonomyid, genome, description,
#> #   coordinate_1_based, maintainer, rdatadateadded, preparerclass, tags,
#> #   rdatapath, sourceurl, sourcetype 
#> # retrieve records with, e.g., 'object[["AH95908"]]' 
#> 
#>             title                                                    
#>   AH95908 | ce10.Kundaje.ce10-Excludable.rds                         
#>   AH95909 | dm3.Kundaje.dm3-Excludable.rds                           
#>   AH95910 | hg19.Bernstein.Mint_Excludable_hg19.rds                  
#>   AH95911 | hg19.Birney.wgEncodeDacMapabilityConsensusExcludable.rds 
#>   AH95912 | hg19.Crawford.wgEncodeDukeMapabilityRegionsExcludable.rds
#>   ...       ...                                                      
#>   AH95945 | mm10.UCSC.telomere.rds                                   
#>   AH95946 | mm9.UCSC.centromere.rds                                  
#>   AH95947 | mm9.UCSC.contig.rds                                      
#>   AH95948 | mm9.UCSC.fragment.rds                                    
#>   AH95949 | mm10.UCSC.scaffold.rds

hg38 excluderanges coordinates recommended by Anshul

# Check titles
# as.data.frame(mcols(query_data[1:10])["title"]) 
excludeGR.hg38.Kundaje.1 <- query_data[["AH95917"]]
#> downloading 1 resources
#> retrieving 1 resource
#> loading from cache
excludeGR.hg38.Kundaje.1
#> Loading required package: GenomicRanges
#> Loading required package: stats4
#> Loading required package: S4Vectors
#> 
#> Attaching package: 'S4Vectors'
#> The following objects are masked from 'package:base':
#> 
#>     I, expand.grid, unname
#> Loading required package: IRanges
#> Loading required package: GenomeInfoDb
#> GRanges object with 910 ranges and 0 metadata columns:
#>         seqnames            ranges strand
#>            <Rle>         <IRanges>  <Rle>
#>     [1]     chr1     628903-635104      *
#>     [2]     chr1   5850087-5850571      *
#>     [3]     chr1   8909610-8910014      *
#>     [4]     chr1   9574580-9574997      *
#>     [5]     chr1 32043823-32044203      *
#>     ...      ...               ...    ...
#>   [906]     chrY 11290797-11334278      *
#>   [907]     chrY 11493053-11592850      *
#>   [908]     chrY 11671014-11671046      *
#>   [909]     chrY 11721528-11749472      *
#>   [910]     chrY 56694632-56889743      *
#>   -------
#>   seqinfo: 24 sequences from hg38 genome

Save the data in a BED file, if needed.

rtracklayer::export(excludeGR.hg38.Kundaje.1, "hg38.Kundaje.GRCh38_unified_Excludable.bed", format = "bed")

We can load other excludable regions for the hg38 genome assembly and compare them.

query_data <- query(ah, c("excluderanges", "hg38", "Exclusion regions"))
query_data
#> AnnotationHub with 6 records
#> # snapshotDate(): 2021-10-18
#> # $dataprovider: ENCODE
#> # $species: Homo sapiens
#> # $rdataclass: GRanges
#> # additional mcols(): taxonomyid, genome, description,
#> #   coordinate_1_based, maintainer, rdatadateadded, preparerclass, tags,
#> #   rdatapath, sourceurl, sourcetype 
#> # retrieve records with, e.g., 'object[["AH95915"]]' 
#> 
#>             title                                                       
#>   AH95915 | hg38.Bernstein.Mint_Excludable_GRCh38.rds                   
#>   AH95916 | hg38.Kundaje.GRCh38.Excludable.rds                          
#>   AH95917 | hg38.Kundaje.GRCh38_unified_Excludable.rds                  
#>   AH95918 | hg38.Reddy.wgEncodeDacMapabilityConsensusExcludable.hg38.rds
#>   AH95919 | hg38.Wold.hg38mitoExcludable.rds                            
#>   AH95920 | hg38.Yeo.eCLIP_Excludableregions.hg38liftover.bed.fixed.rds
excludeGR.hg38.Bernstein <- query_data[["AH95915"]]
#> downloading 1 resources
#> retrieving 1 resource
#> loading from cache
excludeGR.hg38.Kundaje.2 <- query_data[["AH95916"]]
#> downloading 1 resources
#> retrieving 1 resource
#> loading from cache
excludeGR.hg38.Reddy     <- query_data[["AH95918"]]
#> downloading 1 resources
#> retrieving 1 resource
#> loading from cache
excludeGR.hg38.Wold      <- query_data[["AH95919"]]
#> downloading 1 resources
#> retrieving 1 resource
#> loading from cache
excludeGR.hg38.Yeo       <- query_data[["AH95920"]]
#> downloading 1 resources
#> retrieving 1 resource
#> loading from cache

Compare the number of excludable regions.

library(ggplot2)
mtx_to_plot <- data.frame(Count = c(length(excludeGR.hg38.Bernstein), 
                                    length(excludeGR.hg38.Kundaje.1), 
                                    length(excludeGR.hg38.Kundaje.2), 
                                    length(excludeGR.hg38.Reddy), 
                                    length(excludeGR.hg38.Wold), 
                                    length(excludeGR.hg38.Yeo)),
                          Source = c("Bernstein.Mint_Excludable_GRCh38", 
                                     "Kundaje.GRCh38_unified_Excludable", 
                                     "Kundaje.GRCh38.Excludable", 
                                     "Reddy.wgEncodeDacMapabilityConsensusExcludable", 
                                     "Wold.hg38mitoExcludable", 
                                     "Yeo.eCLIP_Excludableregions.hg38liftover.bed"))
# Order Source by the number of regions
mtx_to_plot$Source <- factor(mtx_to_plot$Source, levels = mtx_to_plot$Source[order(mtx_to_plot$Count)])

ggplot(mtx_to_plot, aes(x = Source, y = Count, fill = Source)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  theme_bw() + theme(legend.position = "none")

# ggsave("man/figures/excluderanges_hg38_count.png", width = 5.5, height = 2)

Compare the width of excludable regions. log2 scale because of heavy right tail distributions.

library(ggridges)
mtx_to_plot <- data.frame(Width = c(width(excludeGR.hg38.Bernstein), 
                                    width(excludeGR.hg38.Kundaje.1), 
                                    width(excludeGR.hg38.Kundaje.2), 
                                    width(excludeGR.hg38.Reddy), 
                                    width(excludeGR.hg38.Wold), 
                                    width(excludeGR.hg38.Yeo)),
                          Source = c(rep("Bernstein.Mint_Excludable_GRCh38", length(excludeGR.hg38.Bernstein)),
                                     rep("Kundaje.GRCh38_unified_Excludable", length(excludeGR.hg38.Kundaje.1)),
                                     rep("Kundaje.GRCh38.Excludable", length(excludeGR.hg38.Kundaje.2)),
                                     rep("Reddy.wgEncodeDacMapabilityConsensusExcludable", length(excludeGR.hg38.Reddy)),
                                     rep("Wold.hg38mitoExcludable", length(excludeGR.hg38.Wold)),
                                     rep("Yeo.eCLIP_Excludableregions.hg38liftover.bed", length(excludeGR.hg38.Yeo))))

ggplot(mtx_to_plot, aes(x = log2(Width), y = Source, fill = Source)) +
  geom_density_ridges() +
  theme_bw() + theme(legend.position = "none")
#> Picking joint bandwidth of 0.372

# ggsave("man/figures/excluderanges_hg38_width.png", width = 5.5, height = 2)

We can investigate the total width of each set of excludable ranges.

mtx_to_plot <- data.frame(TotalWidth = c(sum(width(excludeGR.hg38.Bernstein)), 
                                         sum(width(excludeGR.hg38.Kundaje.1)), 
                                         sum(width(excludeGR.hg38.Kundaje.2)), 
                                         sum(width(excludeGR.hg38.Reddy)), 
                                         sum(width(excludeGR.hg38.Wold)), 
                                         sum(width(excludeGR.hg38.Yeo))), 
                          Source = c("Bernstein.Mint_Excludable_GRCh38", 
                                     "Kundaje.GRCh38_unified_Excludable", 
                                     "Kundaje.GRCh38.Excludable", 
                                     "Reddy.wgEncodeDacMapabilityConsensusExcludable", 
                                     "Wold.hg38mitoExcludable", 
                                     "Yeo.eCLIP_Excludableregions.hg38liftover"))
ggplot(mtx_to_plot, aes(x = TotalWidth, y = Source, fill = Source)) + 
  geom_bar(stat="identity") + scale_x_log10() + scale_y_discrete(label=abbreviate) +
  xlab("log10 total width")