DNA, amino acid, and other biological sequences. See earlier example in B.1 Introduction to Bioconductor.
GRanges()
: genomic coordinates to represent annotations (exons, genes, regulatory marks, …) and data (called peaks, variants, aligned reads)
GRangesList()
: genomic coordinates grouped into list elements (e.g., paired-end reads; exons grouped by transcript)
start()
/ end()
/ width()
length()
, subset, etc.mcols()
Seqinfo
, including seqlevels
and seqlengths
shift()
, narrow()
, flank()
, promoters()
, resize()
, restrict()
, trim()
?"intra-range-methods"
range()
, reduce()
, gaps()
, disjoin()
coverage()
(!)?"inter-range-methods"
findOverlaps()
, countOverlaps()
, …, %over%
, %within%
, %outside%
; union()
, intersect()
, setdiff()
, punion()
, pintersect()
, psetdiff()
library(GenomicRanges)
gr <- GRanges("A", IRanges(c(10, 20, 22), width=5), "+")
shift(gr, 1) # intra-range
## GRanges object with 3 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] A [11, 15] +
## [2] A [21, 25] +
## [3] A [23, 27] +
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
range(gr) # inter-range
## GRanges object with 1 range and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] A [10, 26] +
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
reduce(gr) # inter-range
## GRanges object with 2 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] A [10, 14] +
## [2] A [20, 26] +
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
snps <- GRanges("A", IRanges(c(11, 17, 24), width=1))
findOverlaps(snps, gr) # between-range
## Hits object with 3 hits and 0 metadata columns:
## queryHits subjectHits
## <integer> <integer>
## [1] 1 1
## [2] 3 2
## [3] 3 3
## -------
## queryLength: 3 / subjectLength: 3
setdiff(range(gr), gr) # 'introns'
## GRanges object with 1 range and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] A [15, 19] +
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
Representation of aligned reads. See exercises below.
colData()
data frame for desciption of samplesrowRanges()
GRanges / GRangeList or data frame for description of featuresexptData()
to describe the entire objectassays()
can be any matrix-like object, including very large on-disk representations such as HDF5Arraylibrary(SummarizedExperiment)
library(airway)
data(airway)
airway
## class: RangedSummarizedExperiment
## dim: 64102 8
## metadata(1): ''
## assays(1): counts
## rownames(64102): ENSG00000000003 ENSG00000000005 ... LRG_98 LRG_99
## rowData names(0):
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample
colData(airway)
## DataFrame with 8 rows and 9 columns
## SampleName cell dex albut Run avgLength
## <factor> <factor> <factor> <factor> <factor> <integer>
## SRR1039508 GSM1275862 N61311 untrt untrt SRR1039508 126
## SRR1039509 GSM1275863 N61311 trt untrt SRR1039509 126
## SRR1039512 GSM1275866 N052611 untrt untrt SRR1039512 126
## SRR1039513 GSM1275867 N052611 trt untrt SRR1039513 87
## SRR1039516 GSM1275870 N080611 untrt untrt SRR1039516 120
## SRR1039517 GSM1275871 N080611 trt untrt SRR1039517 126
## SRR1039520 GSM1275874 N061011 untrt untrt SRR1039520 101
## SRR1039521 GSM1275875 N061011 trt untrt SRR1039521 98
## Experiment Sample BioSample
## <factor> <factor> <factor>
## SRR1039508 SRX384345 SRS508568 SAMN02422669
## SRR1039509 SRX384346 SRS508567 SAMN02422675
## SRR1039512 SRX384349 SRS508571 SAMN02422678
## SRR1039513 SRX384350 SRS508572 SAMN02422670
## SRR1039516 SRX384353 SRS508575 SAMN02422682
## SRR1039517 SRX384354 SRS508576 SAMN02422673
## SRR1039520 SRX384357 SRS508579 SAMN02422683
## SRR1039521 SRX384358 SRS508580 SAMN02422677
airway[, airway$dex %in% "trt"]
## class: RangedSummarizedExperiment
## dim: 64102 4
## metadata(1): ''
## assays(1): counts
## rownames(64102): ENSG00000000003 ENSG00000000005 ... LRG_98 LRG_99
## rowData names(0):
## colnames(4): SRR1039509 SRR1039513 SRR1039517 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample
chr14 <- as(seqinfo(airway), "GRanges")["14"]
airway[airway %over% chr14,]
## class: RangedSummarizedExperiment
## dim: 2244 8
## metadata(1): ''
## assays(1): counts
## rownames(2244): ENSG00000006432 ENSG00000009830 ... ENSG00000273259
## ENSG00000273307
## rowData names(0):
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample
?select
?exonsBy
page to retrieve all exons grouped by gene or transcript.org.*: identifier mappings
select()
, columns()
, keys()
mapIds()
library(org.Hs.eg.db)
org <- org.Hs.eg.db
select(org, "BRCA1", c("ENSEMBL", "GENENAME"), "SYMBOL")
## 'select()' returned 1:1 mapping between keys and columns
## SYMBOL ENSEMBL GENENAME
## 1 BRCA1 ENSG00000012048 BRCA1, DNA repair associated
TxDb.*: gene models
exons()
, transcripts()
, genes()
, promoters()
, …exonsBy()
, transcriptsBy()
select()
, etc.library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
promoters(txdb)
## GRanges object with 82960 ranges and 2 metadata columns:
## seqnames ranges strand | tx_id tx_name
## <Rle> <IRanges> <Rle> | <integer> <character>
## [1] chr1 [ 9874, 12073] + | 1 uc001aaa.3
## [2] chr1 [ 9874, 12073] + | 2 uc010nxq.1
## [3] chr1 [ 9874, 12073] + | 3 uc010nxr.1
## [4] chr1 [ 67091, 69290] + | 4 uc001aal.1
## [5] chr1 [319084, 321283] + | 5 uc001aaq.2
## ... ... ... ... . ... ...
## [82956] chrUn_gl000237 [ 2487, 4686] - | 82956 uc011mgu.1
## [82957] chrUn_gl000241 [36676, 38875] - | 82957 uc011mgv.2
## [82958] chrUn_gl000243 [ 9501, 11700] + | 82958 uc011mgw.1
## [82959] chrUn_gl000243 [11608, 13807] + | 82959 uc022brq.1
## [82960] chrUn_gl000247 [ 5617, 7816] - | 82960 uc022brr.1
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
library(AnnotationHub)
hub = AnnotationHub()
## snapshotDate(): 2017-04-24
hub
## AnnotationHub with 38907 records
## # snapshotDate(): 2017-04-24
## # $dataprovider: BroadInstitute, UCSC, Ensembl, Haemcode, Inparanoid8, ft...
## # $species: Homo sapiens, Mus musculus, Bos taurus, Pan troglodytes, Dani...
## # $rdataclass: GRanges, BigWigFile, FaFile, TwoBitFile, ChainFile, Rle, I...
## # additional mcols(): taxonomyid, genome, description,
## # coordinate_1_based, maintainer, rdatadateadded, preparerclass,
## # tags, rdatapath, sourceurl, sourcetype
## # retrieve records with, e.g., 'object[["AH2"]]'
##
## title
## AH2 | Ailuropoda_melanoleuca.ailMel1.69.dna.toplevel.fa
## AH3 | Ailuropoda_melanoleuca.ailMel1.69.dna_rm.toplevel.fa
## AH4 | Ailuropoda_melanoleuca.ailMel1.69.dna_sm.toplevel.fa
## AH5 | Ailuropoda_melanoleuca.ailMel1.69.ncrna.fa
## AH6 | Ailuropoda_melanoleuca.ailMel1.69.pep.all.fa
## ... ...
## AH54202 | Xiphophorus_maculatus.Xipmac4.4.2.cdna.all.2bit
## AH54203 | Xiphophorus_maculatus.Xipmac4.4.2.dna.toplevel.2bit
## AH54204 | Xiphophorus_maculatus.Xipmac4.4.2.dna_rm.toplevel.2bit
## AH54205 | Xiphophorus_maculatus.Xipmac4.4.2.dna_sm.toplevel.2bit
## AH54206 | Xiphophorus_maculatus.Xipmac4.4.2.ncrna.2bit
query(hub, c("ensembl", "81.gtf"))
## AnnotationHub with 69 records
## # snapshotDate(): 2017-04-24
## # $dataprovider: Ensembl
## # $species: Ailuropoda melanoleuca, Anas platyrhynchos, Anolis carolinens...
## # $rdataclass: GRanges
## # additional mcols(): taxonomyid, genome, description,
## # coordinate_1_based, maintainer, rdatadateadded, preparerclass,
## # tags, rdatapath, sourceurl, sourcetype
## # retrieve records with, e.g., 'object[["AH47937"]]'
##
## title
## AH47937 | Ailuropoda_melanoleuca.ailMel1.81.gtf
## AH47938 | Anas_platyrhynchos.BGI_duck_1.0.81.gtf
## AH47939 | Anolis_carolinensis.AnoCar2.0.81.gtf
## AH47940 | Astyanax_mexicanus.AstMex102.81.gtf
## AH47941 | Bos_taurus.UMD3.1.81.gtf
## ... ...
## AH48001 | Tupaia_belangeri.TREESHREW.81.gtf
## AH48002 | Tursiops_truncatus.turTru1.81.gtf
## AH48003 | Vicugna_pacos.vicPac1.81.gtf
## AH48004 | Xenopus_tropicalis.JGI_4.2.81.gtf
## AH48005 | Xiphophorus_maculatus.Xipmac4.4.2.81.gtf
hub[["AH48004"]]
## loading from cache '/home/mtmorgan//.AnnotationHub/54310'
## using guess work to populate seqinfo
## GRanges object with 581787 ranges and 19 metadata columns:
## seqnames ranges strand | source type score
## <Rle> <IRanges> <Rle> | <factor> <factor> <numeric>
## [1] GL172637.1 [ 34, 148] - | ensembl gene <NA>
## [2] GL172637.1 [ 34, 148] - | ensembl transcript <NA>
## [3] GL172637.1 [ 34, 148] - | ensembl exon <NA>
## [4] GL172637.1 [606, 720] - | ensembl gene <NA>
## [5] GL172637.1 [606, 720] - | ensembl transcript <NA>
## ... ... ... ... . ... ... ...
## [581783] GL180121.1 [ 865, 867] + | ensembl start_codon <NA>
## [581784] GL180121.1 [ 992, 1334] + | ensembl exon <NA>
## [581785] GL180121.1 [ 992, 1334] + | ensembl CDS <NA>
## [581786] GL180121.1 [1817, 1835] + | ensembl exon <NA>
## [581787] GL180121.1 [1817, 1835] + | ensembl CDS <NA>
## phase gene_id gene_version gene_name gene_source
## <integer> <character> <numeric> <character> <character>
## [1] <NA> ENSXETG00000030486 1 U5 ensembl
## [2] <NA> ENSXETG00000030486 1 U5 ensembl
## [3] <NA> ENSXETG00000030486 1 U5 ensembl
## [4] <NA> ENSXETG00000031766 1 U5 ensembl
## [5] <NA> ENSXETG00000031766 1 U5 ensembl
## ... ... ... ... ... ...
## [581783] 0 ENSXETG00000033193 1 <NA> ensembl
## [581784] <NA> ENSXETG00000033193 1 <NA> ensembl
## [581785] 2 ENSXETG00000033193 1 <NA> ensembl
## [581786] <NA> ENSXETG00000033193 1 <NA> ensembl
## [581787] 1 ENSXETG00000033193 1 <NA> ensembl
## gene_biotype transcript_id transcript_version
## <character> <character>