Authors: Martin Morgan (mtmorgan@fhcrc.org), Sonali Arora (sarora@fredhutch.org)
Date: 16 June, 2015

Mapping identifiers in org.* packages

library(org.Hs.eg.db)
org <- org.Hs.eg.db       # convenient alias
keytypes(org)             # map from keys...
##  [1] "ENTREZID"     "PFAM"         "IPI"          "PROSITE"     
##  [5] "ACCNUM"       "ALIAS"        "ENZYME"       "MAP"         
##  [9] "PATH"         "PMID"         "REFSEQ"       "SYMBOL"      
## [13] "UNIGENE"      "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS"
## [17] "GENENAME"     "UNIPROT"      "GO"           "EVIDENCE"    
## [21] "ONTOLOGY"     "GOALL"        "EVIDENCEALL"  "ONTOLOGYALL" 
## [25] "OMIM"         "UCSCKG"
columns(org)              # ...to columns
##  [1] "ENTREZID"     "PFAM"         "IPI"          "PROSITE"     
##  [5] "ACCNUM"       "ALIAS"        "CHR"          "CHRLOC"      
##  [9] "CHRLOCEND"    "ENZYME"       "MAP"          "PATH"        
## [13] "PMID"         "REFSEQ"       "SYMBOL"       "UNIGENE"     
## [17] "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS" "GENENAME"    
## [21] "UNIPROT"      "GO"           "EVIDENCE"     "ONTOLOGY"    
## [25] "GOALL"        "EVIDENCEALL"  "ONTOLOGYALL"  "OMIM"        
## [29] "UCSCKG"

mapIds() enforces 1:1 mapping by default

sym <- keys(org, "SYMBOL")
mapIds(org, c("BRCA1", "PTEN"), "GENENAME", "SYMBOL")
##                            BRCA1                             PTEN 
##   "breast cancer 1, early onset" "phosphatase and tensin homolog"
map <- mapIds(org, sym, "GENENAME", "SYMBOL")
length(map)
## [1] 56332
head(map)
##                                                    A1BG 
##                                "alpha-1-B glycoprotein" 
##                                                     A2M 
##                                 "alpha-2-macroglobulin" 
##                                                   A2MP1 
##                    "alpha-2-macroglobulin pseudogene 1" 
##                                                    NAT1 
## "N-acetyltransferase 1 (arylamine N-acetyltransferase)" 
##                                                    NAT2 
## "N-acetyltransferase 2 (arylamine N-acetyltransferase)" 
##                                                    NATP 
##                        "N-acetyltransferase pseudogene"

mapIds() supports 1:many mapping

head(select(org, keys(org), "ALIAS"))
##   ENTREZID    ALIAS
## 1        1      A1B
## 2        1      ABG
## 3        1      GAB
## 4        1 HYST2477
## 5        1     A1BG
## 6        2     A2MD
head(mapIds(org, keys(org), "ALIAS", "ENTREZID"))
##      1      2      3      9     10     11 
##  "A1B" "A2MD" "A2MP" "AAC1" "AAC2" "AACP"
head(mapIds(org, keys(org), "ALIAS", "ENTREZID", multiVals="CharacterList"))
## CharacterList of length 6
## [["1"]] A1B ABG GAB HYST2477 A1BG
## [["2"]] A2MD CPAMD5 FWP007 S863-7 A2M
## [["3"]] A2MP A2MP1
## [["9"]] AAC1 MNAT NAT-1 NATI NAT1
## [["10"]] AAC2 NAT-2 PNAT NAT2
## [["11"]] AACP NATP1 NATP
str(head(mapIds(org, keys(org), "ALIAS", "ENTREZID", multiVals="list")))
## List of 6
##  $ 1 : chr [1:5] "A1B" "ABG" "GAB" "HYST2477" ...
##  $ 2 : chr [1:5] "A2MD" "CPAMD5" "FWP007" "S863-7" ...
##  $ 3 : chr [1:2] "A2MP" "A2MP1"
##  $ 9 : chr [1:5] "AAC1" "MNAT" "NAT-1" "NATI" ...
##  $ 10: chr [1:4] "AAC2" "NAT-2" "PNAT" "NAT2"
##  $ 11: chr [1:3] "AACP" "NATP1" "NATP"

select() can be used to select several different columns

OrgDb versus OrganismDb packages

OrgDb (e.g., org.Hs.eg.db)

OrganismDb (e.g., Homo.sapiens) packages.

library(org.Hs.eg.db)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene

sym <- "BRCA1"

eid <- mapIds(org.Hs.eg.db, sym, "ENTREZID", "SYMBOL")
txid <- mapIds(txdb, eid, "TXNAME", "GENEID", multiVals="list")[[eid]]
txid
##  [1] "uc010whl.2" "uc002icp.4" "uc010whm.2" "uc002icu.3" "uc010cyx.3"
##  [6] "uc002icq.3" "uc002ict.3" "uc010whn.2" "uc010who.3" "uc010whp.2"
## [11] "uc010whq.1" "uc002idc.1" "uc010whr.1" "uc002idd.3" "uc002ide.1"
## [16] "uc010cyy.1" "uc010whs.1" "uc010cyz.2" "uc010cza.2" "uc010wht.1"
cds <- cdsBy(txdb, by="tx", use.names=TRUE)
cds[names(cds) %in% txid]
## GRangesList object of length 20:
## $uc010whl.2 
## GRanges object with 22 ranges and 3 metadata columns:
##        seqnames               ranges strand   |    cds_id    cds_name
##           <Rle>            <IRanges>  <Rle>   | <integer> <character>
##    [1]    chr17 [41276034, 41276113]      -   |    186246        <NA>
##    [2]    chr17 [41267743, 41267796]      -   |    186245        <NA>
##    [3]    chr17 [41258473, 41258550]      -   |    186243        <NA>
##    ...      ...                  ...    ... ...       ...         ...
##   [21]    chr17 [41199660, 41199720]      -   |    186214        <NA>
##   [22]    chr17 [41197695, 41197819]      -   |    186212        <NA>
##        exon_rank
##        <integer>
##    [1]         1
##    [2]         2
##    [3]         3
##    ...       ...
##   [21]        21
##   [22]        22
## 
## ...
## <19 more elements>
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
library(Homo.sapiens)

txid <- mapIds(Homo.sapiens, sym, "TXNAME", "SYMBOL", multiVals="list")[[sym]]
cds <- cdsBy(Homo.sapiens, by="tx", use.names=TRUE)

Possible to create custom versions, e.g., OrganismDbi::makeOrganismDbFromBiomart().

Discovery and retrieval from biomaRt

Web resource: http://biomart.org
Package: biomaRt

library(biomaRt)
head(listMarts())                               # available marts, 52!
##               biomart                             version
## 1             ensembl        ENSEMBL GENES 80 (SANGER UK)
## 2                 snp    ENSEMBL VARIATION 80 (SANGER UK)
## 3          regulation   ENSEMBL REGULATION 80 (SANGER UK)
## 4                vega                VEGA 60  (SANGER UK)
## 5       fungi_mart_26           ENSEMBL FUNGI 26 (EBI UK)
## 6 fungi_variations_26 ENSEMBL FUNGI VARIATION 26 (EBI UK)
head(listDatasets(useMart("ensembl")))          # datasets in mart, 69!
##                          dataset
## 1         oanatinus_gene_ensembl
## 2        cporcellus_gene_ensembl
## 3        gaculeatus_gene_ensembl
## 4         lafricana_gene_ensembl
## 5 itridecemlineatus_gene_ensembl
## 6        choffmanni_gene_ensembl
##                                  description version
## 1     Ornithorhynchus anatinus genes (OANA5)   OANA5
## 2            Cavia porcellus genes (cavPor3) cavPor3
## 3     Gasterosteus aculeatus genes (BROADS1) BROADS1
## 4         Loxodonta africana genes (loxAfr3) loxAfr3
## 5 Ictidomys tridecemlineatus genes (spetri2) spetri2
## 6        Choloepus hoffmanni genes (choHof1) choHof1
ensembl <-                                      # fully specified mart
  useMart("ensembl", dataset = "hsapiens_gene_ensembl")

head(listFilters(ensembl), 3)                   # filters, 296!
##              name     description
## 1 chromosome_name Chromosome name
## 2           start Gene Start (bp)
## 3             end   Gene End (bp)
myFilter <- "chromosome_name"
substr(filterOptions(myFilter, ensembl), 1, 50) # return values
## [1] "[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,2"
myValues <- c("21", "22")
head(listAttributes(ensembl), 3)                # attributes
##                    name           description
## 1       ensembl_gene_id       Ensembl Gene ID
## 2 ensembl_transcript_id Ensembl Transcript ID
## 3    ensembl_peptide_id    Ensembl Protein ID
myAttributes <- c("ensembl_gene_id","chromosome_name")

## assemble and query the mart
res <- getBM(myAttributes, myFilter, myValues, ensembl)

Discovery and retrieval from AnnotationHub

Package: AnnotationHub

library(AnnotationHub)
hub = AnnotationHub()
hub
## AnnotationHub with 16754 records
## # snapshotDate(): 2015-05-26 
## # $dataprovider: UCSC, Ensembl, NCBI, Haemcode, Inparanoid8, Pazar, dbS...
## # $species: Homo sapiens, Mus musculus, Bos taurus, Pan troglodytes, Da...
## # $rdataclass: FaFile, GRanges, OrgDb, ChainFile, BigWigFile, Inparanoi...
## # additional mcols(): taxonomyid, genome, description, tags,
## #   sourceurl, sourcetype 
## # retrieve records with, e.g., 'object[["AH2"]]' 
## 
##             title                                               
##   AH2     | Ailuropoda_melanoleuca.ailMel1.69.dna.toplevel.fa   
##   AH3     | Ailuropoda_melanoleuca.ailMel1.69.dna_rm.toplevel.fa
##   AH4     | Ailuropoda_melanoleuca.ailMel1.69.dna_sm.toplevel.fa
##   ...       ...                                                 
##   AH47935 | Xiphophorus_maculatus.Xipmac4.4.2.ncrna.fa          
##   AH47936 | Xiphophorus_maculatus.Xipmac4.4.2.pep.all.fa
query(hub, c("Ensembl", "80", "gtf"))
## AnnotationHub with 102 records
## # snapshotDate(): 2015-05-26 
## # $dataprovider: Ensembl
## # $species: Gadus morhua, Oryzias latipes, Xiphophorus maculatus, Ailur...
## # $rdataclass: GRanges
## # additional mcols(): taxonomyid, genome, description, tags,
## #   sourceurl, sourcetype 
## # retrieve records with, e.g., 'object[["AH7535"]]' 
## 
##             title                                   
##   AH7535  | Xiphophorus_maculatus.Xipmac4.4.2.69.gtf
##   AH7554  | Gadus_morhua.gadMor1.70.gtf             
##   AH7575  | Oryzias_latipes.MEDAKA1.70.gtf          
##   ...       ...                                     
##   AH47107 | Xenopus_tropicalis.JGI_4.2.80.gtf       
##   AH47108 | Xiphophorus_maculatus.Xipmac4.4.2.80.gtf
## ensgtf = display(hub)                   # visual choice
hub["AH47107"]
## AnnotationHub with 1 record
## # snapshotDate(): 2015-05-26 
## # names(): AH47107
## # $dataprovider: Ensembl
## # $species: Xenopus tropicalis
## # $rdataclass: GRanges
## # $title: Xenopus_tropicalis.JGI_4.2.80.gtf
## # $description: Gene Annotation for Xenopus tropicalis
## # $taxonomyid: 8364
## # $genome: JGI_4.2
## # $sourcetype: GTF
## # $sourceurl: ftp://ftp.ensembl.org/pub/release-80/gtf/xenopus_tropical...
## # $sourcelastmodifieddate: 2015-05-01
## # $sourcesize: 8492889
## # $tags: GTF, ensembl, Gene, Transcript, Annotation 
## # retrieve record with 'object[["AH47107"]]'
gtf <- hub[["AH47107"]]
gtf
## GRanges object with 581787 ranges and 19 metadata columns:
##              seqnames       ranges strand   |   source       type
##                 <Rle>    <IRanges>  <Rle>   | <factor>   <factor>
##        [1] GL172637.1    [34, 148]      -   |  ensembl       gene
##        [2] GL172637.1    [34, 148]      -   |  ensembl transcript
##        [3] GL172637.1    [34, 148]      -   |  ensembl       exon
##        ...        ...          ...    ... ...      ...        ...
##   [581786] GL180121.1 [1817, 1835]      +   |  ensembl       exon
##   [581787] GL180121.1 [1817, 1835]      +   |  ensembl        CDS
##                score     phase            gene_id gene_version   gene_name
##            <numeric> <integer>        <character>    <numeric> <character>
##        [1]      <NA>      <NA> ENSXETG00000030486            1          U5
##        [2]      <NA>      <NA> ENSXETG00000030486            1          U5
##        [3]      <NA>      <NA> ENSXETG00000030486            1          U5
##        ...       ...       ...                ...          ...         ...
##   [581786]      <NA>      <NA> ENSXETG00000033193            1        <NA>
##   [581787]      <NA>         1 ENSXETG00000033193            1        <NA>
##            gene_source   gene_biotype      transcript_id
##            <character>    <character>        <character>
##        [1]     ensembl          snRNA               <NA>
##        [2]     ensembl          snRNA ENSXETT00000065882
##        [3]     ensembl          snRNA ENSXETT00000065882
##        ...         ...            ...                ...
##   [581786]     ensembl protein_coding ENSXETT00000053735
##   [581787]     ensembl protein_coding ENSXETT00000053735
##            transcript_version transcript_name transcript_source
##                     <numeric>     <character>       <character>
##        [1]               <NA>            <NA>              <NA>
##        [2]                  1          U5-201           ensembl
##        [3]                  1          U5-201           ensembl
##        ...                ...             ...               ...
##   [581786]                  2            <NA>           ensembl
##   [581787]                  2            <NA>           ensembl
##            transcript_biotype exon_number            exon_id exon_version
##                   <character>   <numeric>        <character>    <numeric>
##        [1]               <NA>        <NA>               <NA>         <NA>
##        [2]              snRNA        <NA>               <NA>         <NA>
##        [3]              snRNA           1 ENSXETE00000393193            1
##        ...                ...         ...                ...          ...
##   [581786]     protein_coding           3 ENSXETE00000416553            1
##   [581787]     protein_coding           3               <NA>         <NA>
##                    protein_id protein_version
##                   <character>       <numeric>
##        [1]               <NA>            <NA>
##        [2]               <NA>            <NA>
##        [3]               <NA>            <NA>
##        ...                ...             ...
##   [581786]               <NA>            <NA>
##   [581787] ENSXETP00000053735               2
##   -------
##   seqinfo: 2375 sequences from an unspecified genome; no seqlengths
## org.* data bases available from AnnotationHub
query(hub, "OrgDb")
## AnnotationHub with 1164 records
## # snapshotDate(): 2015-05-26 
## # $dataprovider: NCBI, ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/
## # $species: Escherichia coli, Anopheles gambiae, Macaca mulatta, Pan tr...
## # $rdataclass: OrgDb
## # additional mcols(): taxonomyid, genome, description, tags,
## #   sourceurl, sourcetype 
## # retrieve records with, e.g., 'object[["AH12818"]]' 
## 
##             title                                      
##   AH12818 | org.Pseudomonas_mendocina_NK-01.eg.sqlite  
##   AH12819 | org.Streptomyces_coelicolor_A3(2).eg.sqlite
##   AH12820 | org.Cricetulus_griseus.eg.sqlite           
##   ...       ...                                        
##   AH47000 | org.Dr.eg.db.sqlite                        
##   AH47001 | org.Pf.plasmo.db.sqlite
mcols(query(hub, "OrgDb"))[, "species", drop=FALSE]
## DataFrame with 1164 rows and 1 column
##                               species
##                           <character>
## AH12818   Pseudomonas mendocina_NK-01
## AH12819 Streptomyces coelicolor_A3(2)
## AH12820            Cricetulus griseus
## ...                               ...
## AH47000                   Danio rerio
## AH47001         Plasmodium falciparum

In Bioc-devel, metadata(gtf) returns information about the AnnotationHub record gtf is derived from.

AnnotationHub for non-model-organism org.* packages

library(AnnotationHub)
hub = AnnotationHub()
query(hub, "OrgDb")
## AnnotationHub with 1164 records
## # snapshotDate(): 2015-05-26 
## # $dataprovider: NCBI, ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/
## # $species: Escherichia coli, Anopheles gambiae, Macaca mulatta, Pan tr...
## # $rdataclass: OrgDb
## # additional mcols(): taxonomyid, genome, description, tags,
## #   sourceurl, sourcetype 
## # retrieve records with, e.g., 'object[["AH12818"]]' 
## 
##             title                                      
##   AH12818 | org.Pseudomonas_mendocina_NK-01.eg.sqlite  
##   AH12819 | org.Streptomyces_coelicolor_A3(2).eg.sqlite
##   AH12820 | org.Cricetulus_griseus.eg.sqlite           
##   ...       ...                                        
##   AH47000 | org.Dr.eg.db.sqlite                        
##   AH47001 | org.Pf.plasmo.db.sqlite
hub[["AH12818"]]
## OrgDb object:
## | DBSCHEMAVERSION: 2.1
## | DBSCHEMA: NOSCHEMA_DB
## | ORGANISM: Pseudomonas mendocina_NK-01
## | SPECIES: Pseudomonas mendocina_NK-01
## | CENTRALID: GID
## | TAXID: 1001585
## | Db type: OrgDb
## | Supporting package: AnnotationDbi
## 
## Please see: help('select') for usage information

AnnotationHub for Ensembl GTF, FASTA, and TxDb resources

Discover and retrieve available GTF and FASTA resources for pufferfish

library(AnnotationHub)
hub = AnnotationHub()
query(hub, c("ensembl","release-80", "Takifugu"))
## AnnotationHub with 7 records
## # snapshotDate(): 2015-05-26 
## # $dataprovider: Ensembl
## # $species: Takifugu rubripes
## # $rdataclass: FaFile, GRanges
## # additional mcols(): taxonomyid, genome, description, tags,
## #   sourceurl, sourcetype 
## # retrieve records with, e.g., 'object[["AH47101"]]' 
## 
##             title                                     
##   AH47101 | Takifugu_rubripes.FUGU4.80.gtf            
##   AH47475 | Takifugu_rubripes.FUGU4.cdna.all.fa       
##   AH47476 | Takifugu_rubripes.FUGU4.dna_rm.toplevel.fa
##   ...       ...                                       
##   AH47479 | Takifugu_rubripes.FUGU4.ncrna.fa          
##   AH47480 | Takifugu_rubripes.FUGU4.pep.all.fa
gtf <- hub[["AH47101"]]
dna <- hub[["AH47477"]]

gtf
## GRanges object with 1388717 ranges and 19 metadata columns:
##                   seqnames         ranges strand   |   source       type
##                      <Rle>      <IRanges>  <Rle>   | <factor>   <factor>
##         [1]     scaffold_1 [10422, 11354]      -   |  ensembl       gene
##         [2]     scaffold_1 [10422, 11354]      -   |  ensembl transcript
##         [3]     scaffold_1 [10422, 11354]      -   |  ensembl       exon
##         ...            ...            ...    ... ...      ...        ...
##   [1388716] scaffold_16598   [1807, 1936]      +   |  ensembl       exon
##   [1388717] scaffold_16598   [1807, 1936]      +   |  ensembl        CDS
##                 score     phase            gene_id gene_version
##             <numeric> <integer>        <character>    <numeric>
##         [1]      <NA>      <NA> ENSTRUG00000003702            1
##         [2]      <NA>      <NA> ENSTRUG00000003702            1
##         [3]      <NA>      <NA> ENSTRUG00000003702            1
##         ...       ...       ...                ...          ...
##   [1388716]      <NA>      <NA> ENSTRUG00000004123            1
##   [1388717]      <NA>         1 ENSTRUG00000004123            1
##             gene_source   gene_biotype      transcript_id
##             <character>    <character>        <character>
##         [1]     ensembl protein_coding               <NA>
##         [2]     ensembl protein_coding ENSTRUT00000008740
##         [3]     ensembl protein_coding ENSTRUT00000008740
##         ...         ...            ...                ...
##   [1388716]     ensembl protein_coding ENSTRUT00000009819
##   [1388717]     ensembl protein_coding ENSTRUT00000009819
##             transcript_version transcript_source transcript_biotype
##                      <numeric>       <character>        <character>
##         [1]               <NA>              <NA>               <NA>
##         [2]                  1           ensembl     protein_coding
##         [3]                  1           ensembl     protein_coding
##         ...                ...               ...                ...
##   [1388716]                  1           ensembl     protein_coding
##   [1388717]                  1           ensembl     protein_coding
##             exon_number            exon_id exon_version         protein_id
##               <numeric>        <character>    <numeric>        <character>
##         [1]        <NA>               <NA>         <NA>               <NA>
##         [2]        <NA>               <NA>         <NA>               <NA>
##         [3]           1 ENSTRUE00000055472            1               <NA>
##         ...         ...                ...          ...                ...
##   [1388716]           4 ENSTRUE00000062911            1               <NA>
##   [1388717]           4               <NA>         <NA> ENSTRUP00000009764
##             protein_version     gene_name   transcript_name
##                   <numeric>   <character>       <character>
##         [1]            <NA>          <NA>              <NA>
##         [2]            <NA>          <NA>              <NA>
##         [3]            <NA>          <NA>              <NA>
##         ...             ...           ...               ...
##   [1388716]            <NA> IWS1 (1 of 2) IWS1 (1 of 2)-201
##   [1388717]               1 IWS1 (1 of 2) IWS1 (1 of 2)-201
##   -------
##   seqinfo: 2056 sequences from an unspecified genome; no seqlengths
dna
## class: FaFile 
## path: /home/mtmorgan/.AnnotationHub/53323
## index: /home/mtmorgan/.AnnotationHub/53324
## isOpen: FALSE 
## yieldSize: NA
head(seqlevels(dna))
## [1] "scaffold_1" "scaffold_2" "scaffold_3" "scaffold_4" "scaffold_5"
## [6] "scaffold_6"

Create a TxDb instance

library(GenomicFeatures)
txdb <- makeTxDbFromGRanges(gtf)
## saveDb(txdb, "TxDb.Takifugu.Ensembl.80.sqlite")
## loadDb("TxDb.Takifugu.Ensembl.80.sqlite")

Use txdb

library(Rsamtools)     # for getSeq,FaFile-method
## Loading required package: XVector
## Loading required package: Biostrings
exons <- exons(txdb)
getSeq(dna, exons)
##   A DNAStringSet instance of length 322622
##          width seq                                     names               
##      [1]    68 GCTAGCGTAGCTTAACCA...TGAAAAGTCCCGCAGGCA MT
##      [2]   947 CAAAAGCTTGGTCCTGAC...AGGTGCACTTGGAAAAAC MT
##      [3]    74 CGGAGCATAGCTTAACAG...AAATCGAGCTGCCCCGAC MT
##      ...   ... ...
## [322621]    93 CCGTCAGAGCAGCAGGTG...GACGGCGACTACCATCAG scaffold_9956
## [322622]   198 AGGATTGAGCTGCGCTCC...GAGGTGGACGGGGTCAAG scaffold_9956