## Now getting the GODb Object directly
## Now getting the OrgDb Object directly
## Now getting the TxDb Object directly

Author: Martin Morgan (mtmorgan@fredhutch.org)
Date: 7 September, 2015
Back to Workshop Outline

The material in this document requires R version 3.2 and Bioconductor version 3.1

stopifnot(
    getRversion() >= '3.2' && getRversion() < '3.3',
    BiocInstaller::biocVersion() >= "3.1"
)

Annotation

Gene model annotation resources – TxDb packages

TxDb.Hsapiens.UCSC.hg19.knownGene

library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
txdb
## TxDb object:
## # Db type: TxDb
## # Supporting package: GenomicFeatures
## # Data source: UCSC
## # Genome: hg19
## # Organism: Homo sapiens
## # Taxonomy ID: 9606
## # UCSC Table: knownGene
## # Resource URL: http://genome.ucsc.edu/
## # Type of Gene ID: Entrez Gene ID
## # Full dataset: yes
## # miRBase build ID: GRCh37
## # transcript_nrow: 82960
## # exon_nrow: 289969
## # cds_nrow: 237533
## # Db created by: GenomicFeatures package from Bioconductor
## # Creation time: 2015-08-20 18:12:27 -0700 (Thu, 20 Aug 2015)
## # GenomicFeatures version at creation time: 1.21.16
## # RSQLite version at creation time: 1.0.0
## # DBSCHEMAVERSION: 1.1
methods(class=class(txdb))
##  [1] $                      $<-                    ExpressionSet          annotatedDataFrameFrom
##  [5] as.list                asBED                  asGFF                  assayData             
##  [9] assayData<-            cds                    cdsBy                  cdsByOverlaps         
## [13] coerce                 columns                combine                contents              
## [17] dbInfo                 dbconn                 dbfile                 dbmeta                
## [21] dbschema               disjointExons          distance               exons                 
## [25] exonsBy                exonsByOverlaps        extractUpstreamSeqs    featureNames          
## [29] featureNames<-         fiveUTRsByTranscript   genes                  initialize            
## [33] intronsByTranscript    isActiveSeq            isActiveSeq<-          isNA                  
## [37] keys                   keytypes               locateVariants         mapIds                
## [41] mapToTranscripts       mappedkeys             metadata               microRNAs             
## [45] nhit                   organism               predictCoding          promoters             
## [49] revmap                 sample                 sampleNames            sampleNames<-         
## [53] saveDb                 select                 seqinfo                seqinfo<-             
## [57] seqlevels0             show                   species                storageMode           
## [61] storageMode<-          summarizeVariants      tRNAs                  taxonomyId            
## [65] threeUTRsByTranscript  transcripts            transcriptsBy          transcriptsByOverlaps 
## [69] updateObject          
## see '?methods' for accessing help and source code

TxDb objects

Accessing gene models

exons(txdb)
## GRanges object with 289969 ranges and 1 metadata column:
##            seqnames               ranges strand   |   exon_id
##               <Rle>            <IRanges>  <Rle>   | <integer>
##        [1]     chr1       [11874, 12227]      +   |         1
##        [2]     chr1       [12595, 12721]      +   |         2
##        [3]     chr1       [12613, 12721]      +   |         3
##        [4]     chr1       [12646, 12697]      +   |         4
##        [5]     chr1       [13221, 14409]      +   |         5
##        ...      ...                  ...    ... ...       ...
##   [289965]     chrY [27607404, 27607432]      -   |    277746
##   [289966]     chrY [27635919, 27635954]      -   |    277747
##   [289967]     chrY [59358329, 59359508]      -   |    277748
##   [289968]     chrY [59360007, 59360115]      -   |    277749
##   [289969]     chrY [59360501, 59360854]      -   |    277750
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome
exonsBy(txdb, "tx")
## GRangesList object of length 82960:
## $1 
## GRanges object with 3 ranges and 3 metadata columns:
##       seqnames         ranges strand |   exon_id   exon_name exon_rank
##          <Rle>      <IRanges>  <Rle> | <integer> <character> <integer>
##   [1]     chr1 [11874, 12227]      + |         1        <NA>         1
##   [2]     chr1 [12613, 12721]      + |         3        <NA>         2
##   [3]     chr1 [13221, 14409]      + |         5        <NA>         3
## 
## $2 
## GRanges object with 3 ranges and 3 metadata columns:
##       seqnames         ranges strand | exon_id exon_name exon_rank
##   [1]     chr1 [11874, 12227]      + |       1      <NA>         1
##   [2]     chr1 [12595, 12721]      + |       2      <NA>         2
##   [3]     chr1 [13403, 14409]      + |       6      <NA>         3
## 
## $3 
## GRanges object with 3 ranges and 3 metadata columns:
##       seqnames         ranges strand | exon_id exon_name exon_rank
##   [1]     chr1 [11874, 12227]      + |       1      <NA>         1
##   [2]     chr1 [12646, 12697]      + |       4      <NA>         2
##   [3]     chr1 [13221, 14409]      + |       5      <NA>         3
## 
## ...
## <82957 more elements>
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome

Identifier mapping – OrgDb

library(org.Hs.eg.db)
org.Hs.eg.db
## OrgDb object:
## | DBSCHEMAVERSION: 2.1
## | Db type: OrgDb
## | Supporting package: AnnotationDbi
## | DBSCHEMA: HUMAN_DB
## | ORGANISM: Homo sapiens
## | SPECIES: Human
## | EGSOURCEDATE: 2015-Aug11
## | EGSOURCENAME: Entrez Gene
## | EGSOURCEURL: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA
## | CENTRALID: EG
## | TAXID: 9606
## | GOSOURCENAME: Gene Ontology
## | GOSOURCEURL: ftp://ftp.geneontology.org/pub/go/godatabase/archive/latest-lite/
## | GOSOURCEDATE: 20150808
## | GOEGSOURCEDATE: 2015-Aug11
## | GOEGSOURCENAME: Entrez Gene
## | GOEGSOURCEURL: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA
## | KEGGSOURCENAME: KEGG GENOME
## | KEGGSOURCEURL: ftp://ftp.genome.jp/pub/kegg/genomes
## | KEGGSOURCEDATE: 2011-Mar15
## | GPSOURCENAME: UCSC Genome Bioinformatics (Homo sapiens)
## | GPSOURCEURL: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19
## | GPSOURCEDATE: 2010-Mar22
## | ENSOURCEDATE: 2015-Jul16
## | ENSOURCENAME: Ensembl
## | ENSOURCEURL: ftp://ftp.ensembl.org/pub/current_fasta
## | UPSOURCENAME: Uniprot
## | UPSOURCEURL: http://www.UniProt.org/
## | UPSOURCEDATE: Thu Aug 20 15:34:08 2015
## 
## Please see: help('select') for usage information

OrgDb objects

select()

Related functionality

Other annotation resources – biomaRt, AnnotationHub

biomaRt & friends

http://biomart.org; Bioconductor package biomaRt

## NEEDS INTERNET ACCESS !!
library(biomaRt)
head(listMarts(), 3)                      ## list marts
head(listDatasets(useMart("ensembl")), 3) ## mart datasets
ensembl <-                                ## fully specified mart
    useMart("ensembl", dataset = "hsapiens_gene_ensembl")

head(listFilters(ensembl), 3)             ## filters
myFilter <- "chromosome_name"
substr(filterOptions(myFilter, ensembl), 1, 50) ## return values
myValues <- c("21", "22")
head(listAttributes(ensembl), 3)          ## attributes
myAttributes <- c("ensembl_gene_id","chromosome_name")

## assemble and query the mart
res <- getBM(attributes =  myAttributes, filters =  myFilter,
             values =  myValues, mart = ensembl)

Other internet resources

AnnotationHub

Example: Ensembl 'GTF' files to R / Bioconductor GRanges and TxDb

library(AnnotationHub)
hub <- AnnotationHub()
hub
query(hub, c("Ensembl", "80", "gtf"))
## ensgtf = display(hub)                   # visual choice
hub["AH47107"]
gtf <- hub[["AH47107"]]
gtf
txdb <- GenomicFeatures::makeTxDbFromGRanges(gtf)

Example: non-model organism OrgDb packages

library(AnnotationHub)
hub <- AnnotationHub()
query(hub, "OrgDb")

Example: Map Roadmap epigenomic marks to hg38

Annotating Variants

Example: read variants from a VCF file, and annotate with respect to a known gene model

## input variants
library(VariantAnnotation)
fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation")
vcf <- readVcf(fl, "hg19")
seqlevels(vcf) <- "chr22"
## known gene model
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
coding <- locateVariants(rowRanges(vcf),
    TxDb.Hsapiens.UCSC.hg19.knownGene,
    CodingVariants())
head(coding)
## GRanges object with 6 ranges and 9 metadata columns:
##     seqnames               ranges strand | LOCATION  LOCSTART    LOCEND   QUERYID        TXID
##        <Rle>            <IRanges>  <Rle> | <factor> <integer> <integer> <integer> <character>
##   1    chr22 [50301422, 50301422]      - |   coding       939       939        24       75253
##   2    chr22 [50301476, 50301476]      - |   coding       885       885        25       75253
##   3    chr22 [50301488, 50301488]      - |   coding       873       873        26       75253
##   4    chr22 [50301494, 50301494]      - |   coding       867       867        27       75253
##   5    chr22 [50301584, 50301584]      - |   coding       777       777        28       75253
##   6    chr22 [50302962, 50302962]      - |   coding       698       698        57       75253
##             CDSID      GENEID       PRECEDEID        FOLLOWID
##     <IntegerList> <character> <CharacterList> <CharacterList>
##   1        218562       79087                                
##   2        218562       79087                                
##   3        218562       79087                                
##   4        218562       79087                                
##   5        218562       79087                                
##   6        218563       79087                                
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths