ChipDb central key are the probe ids by default. The OrgDb org.Hs.eg.db
is an entrez orgDb (eg
in name) and therefore the default is Entrez Gene Id. To verify we print out the first 10 keys
head(keys(hugene20sttranscriptcluster.db), 10)
## [1] "16650001" "16650003" "16650005" "16650007" "16650009" "16650011"
## [7] "16650013" "16650015" "16650017" "16650019"
head(keys(org.Hs.eg.db), 10)
## [1] "1" "2" "3" "9" "10" "11" "12" "13" "14" "15"
keytypes(org.Hs.eg.db)
## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT"
## [5] "ENSEMBLTRANS" "ENTREZID" "ENZYME" "EVIDENCE"
## [9] "EVIDENCEALL" "GENENAME" "GO" "GOALL"
## [13] "IPI" "MAP" "OMIM" "ONTOLOGY"
## [17] "ONTOLOGYALL" "PATH" "PFAM" "PMID"
## [21] "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG"
## [25] "UNIGENE" "UNIPROT"
columns(org.Hs.eg.db)
## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT"
## [5] "ENSEMBLTRANS" "ENTREZID" "ENZYME" "EVIDENCE"
## [9] "EVIDENCEALL" "GENENAME" "GO" "GOALL"
## [13] "IPI" "MAP" "OMIM" "ONTOLOGY"
## [17] "ONTOLOGYALL" "PATH" "PFAM" "PMID"
## [21] "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG"
## [25] "UNIGENE" "UNIPROT"
setdiff(columns(hugene20sttranscriptcluster.db), keytypes(hugene20sttranscriptcluster.db))
## character(0)
# so in this case yes
select(org.Hs.eg.db, "1000","SYMBOL")
## 'select()' returned 1:1 mapping between keys and columns
## ENTREZID SYMBOL
## 1 1000 CDH2
select(hugene20sttranscriptcluster.db, "1000","SYMBOL", "ENTREZID")
## 'select()' returned 1:1 mapping between keys and columns
## ENTREZID SYMBOL
## 1 1000 CDH2
select(org.Hs.eg.db, "PPARG", "ENSEMBL", "SYMBOL")
## 'select()' returned 1:1 mapping between keys and columns
## SYMBOL ENSEMBL
## 1 PPARG ENSG00000132170
select(org.Hs.eg.db, "GAPDH", "UNIPROT","SYMBOL")
## 'select()' returned 1:many mapping between keys and columns
## SYMBOL UNIPROT
## 1 GAPDH P04406
## 2 GAPDH V9HVZ4
z <- mapIds(hugene20sttranscriptcluster.db, keys(hugene20sttranscriptcluster.db),
"ENTREZID","PROBEID", multiVals = "list")
## 'select()' returned 1:many mapping between keys and columns
#or
z <- mapIds(hugene20sttranscriptcluster.db, keys(hugene20sttranscriptcluster.db),
"SYMBOL","PROBEID", multiVals = "list")
## 'select()' returned 1:many mapping between keys and columns
table(sapply(z, function(x) length(x[!is.na(x)])))
##
## 0 1 2 3 4 5 6 7 8 9 10 11
## 21909 29422 1514 350 135 78 51 19 20 29 4 11
## 12 13 14 15 18 20 21 22 25 26 28 29
## 13 14 1 1 3 1 5 6 14 2 2 3
## 31 34 35
## 2 1 7
z <- mapIds(org.Hs.eg.db, keys(org.Hs.eg.db), "ALIAS", "ENTREZID", multiVals="CharacterList")
## 'select()' returned 1:many mapping between keys and columns
z[["1000"]]
## [1] "CD325" "CDHN" "CDw325" "NCAD" "CDH2"
# or
select(org.Hs.eg.db, "1000", "ALIAS")
## 'select()' returned 1:many mapping between keys and columns
## ENTREZID ALIAS
## 1 1000 CD325
## 2 1000 CDHN
## 3 1000 CDw325
## 4 1000 NCAD
## 5 1000 CDH2
# look on the website or
length(BiocManager::available("TxDb"))
## [1] 34
length(BiocManager::available("EnsDb"))
## [1] 8
The name infers the species, source, build, and table used to generate the species.
Resource | Species | Source | Build | Table |
---|---|---|---|---|
TxDb.Hsapiens.UCSC.hg19.knownGene | Homo sapiens | UCSC | hg19 (GRCh37) | knownGene |
TxDb.Ggallus.UCSC.galGal4.refGene | Gallus gallus | UCSC | galGal4 | refGene |
TxDb.Rnorvegicus.BioMart.igis | Rattus norvegicus | Biomart | igis | |
EnsDb.Rnorvegicus.v79 | Rattus norvegicus | Ensembl | v79 |
head(keys(TxDb.Hsapiens.UCSC.hg19.knownGene))
## [1] "1" "10" "100" "1000" "10000" "100008586"
keytypes(TxDb.Hsapiens.UCSC.hg19.knownGene)
## [1] "CDSID" "CDSNAME" "EXONID" "EXONNAME" "GENEID" "TXID"
## [7] "TXNAME"
# central key is TXID
setdiff(columns(TxDb.Hsapiens.UCSC.hg19.knownGene), keytypes(TxDb.Hsapiens.UCSC.hg19.knownGene))
## [1] "CDSCHROM" "CDSEND" "CDSSTART" "CDSSTRAND" "EXONCHROM"
## [6] "EXONEND" "EXONRANK" "EXONSTART" "EXONSTRAND" "TXCHROM"
## [11] "TXEND" "TXSTART" "TXSTRAND" "TXTYPE"
# These are columns available that are NOT available as keytypes...
# so No not all columns can be used as keys
transcriptsBy(TxDb.Hsapiens.UCSC.hg19.knownGene, by="gene")
## GRangesList object of length 23459:
## $`1`
## GRanges object with 2 ranges and 2 metadata columns:
## seqnames ranges strand | tx_id tx_name
## <Rle> <IRanges> <Rle> | <integer> <character>
## [1] chr19 58858172-58864865 - | 70455 uc002qsd.4
## [2] chr19 58859832-58874214 - | 70456 uc002qsf.2
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
##
## $`10`
## GRanges object with 1 range and 2 metadata columns:
## seqnames ranges strand | tx_id tx_name
## <Rle> <IRanges> <Rle> | <integer> <character>
## [1] chr8 18248755-18258723 + | 31944 uc003wyw.1
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
##
## $`100`
## GRanges object with 1 range and 2 metadata columns:
## seqnames ranges strand | tx_id tx_name
## <Rle> <IRanges> <Rle> | <integer> <character>
## [1] chr20 43248163-43280376 - | 72132 uc002xmj.3
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
##
## ...
## <23456 more elements>
promoters(TxDb.Hsapiens.UCSC.hg19.knownGene, upstream=1000, downstream=200, use.names=TRUE)
## GRanges object with 82960 ranges and 2 metadata columns:
## seqnames ranges strand | tx_id tx_name
## <Rle> <IRanges> <Rle> | <integer> <character>
## uc001aaa.3 chr1 10874-12073 + | 1 uc001aaa.3
## uc010nxq.1 chr1 10874-12073 + | 2 uc010nxq.1
## uc010nxr.1 chr1 10874-12073 + | 3 uc010nxr.1
## uc001aal.1 chr1 68091-69290 + | 4 uc001aal.1
## uc001aaq.2 chr1 320084-321283 + | 5 uc001aaq.2
## ... ... ... ... . ... ...
## uc011mgu.1 chrUn_gl000237 2487-3686 - | 82956 uc011mgu.1
## uc011mgv.2 chrUn_gl000241 36676-37875 - | 82957 uc011mgv.2
## uc011mgw.1 chrUn_gl000243 10501-11700 + | 82958 uc011mgw.1
## uc022brq.1 chrUn_gl000243 12608-13807 + | 82959 uc022brq.1
## uc022brr.1 chrUn_gl000247 5617-6816 - | 82960 uc022brr.1
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
select(org.Hs.eg.db, "PPARG", "ENTREZID", "SYMBOL")
## 'select()' returned 1:1 mapping between keys and columns
## SYMBOL ENTREZID
## 1 PPARG 5468
# so entrez id is 5468
transcriptsBy(TxDb.Hsapiens.UCSC.hg19.knownGene)[["5468"]]
## GRanges object with 10 ranges and 2 metadata columns:
## seqnames ranges strand | tx_id tx_name
## <Rle> <IRanges> <Rle> | <integer> <character>
## [1] chr3 12328984-12475855 + | 13253 uc031rym.1
## [2] chr3 12329349-12475855 + | 13254 uc003bwr.3
## [3] chr3 12329349-12475855 + | 13255 uc003bws.3
## [4] chr3 12330436-12421430 + | 13256 uc010hdz.1
## [5] chr3 12330436-12448345 + | 13257 uc003bwt.1
## [6] chr3 12330436-12475855 + | 13258 uc003bwu.3
## [7] chr3 12353879-12475855 + | 13259 uc003bwv.3
## [8] chr3 12393001-12448345 + | 13260 uc003bww.1
## [9] chr3 12393001-12475855 + | 13261 uc003bwx.3
## [10] chr3 12421203-12458653 + | 13262 uc010hea.1
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
select(org.Hs.eg.db, "PPARG", "ENSEMBL", "SYMBOL")
## 'select()' returned 1:1 mapping between keys and columns
## SYMBOL ENSEMBL
## 1 PPARG ENSG00000132170
# so ensembl id is ENSG00000132170
transcriptsBy(EnsDb.Hsapiens.v86)[["ENSG00000132170"]]
## GRanges object with 14 ranges and 6 metadata columns:
## seqnames ranges strand | tx_id
## <Rle> <IRanges> <Rle> | <character>
## [1] 3 12287368-12434344 + | ENST00000397010
## [2] 3 12287388-12381491 + | ENST00000397029
## [3] 3 12287860-12434344 + | ENST00000309576
## [4] 3 12287901-12434311 + | ENST00000397015
## [5] 3 12288838-12379834 + | ENST00000455517
## ... ... ... ... . ...
## [10] 3 12312388-12434216 + | ENST00000397000
## [11] 3 12351472-12406846 + | ENST00000477039
## [12] 3 12351472-12434356 + | ENST00000287820
## [13] 3 12351569-12434344 + | ENST00000397023
## [14] 3 12379706-12417154 + | ENST00000396999
## tx_biotype tx_cds_seq_start tx_cds_seq_end
## <character> <integer> <integer>
## [1] protein_coding 12379706 12434145
## [2] protein_coding 12379706 12381491
## [3] protein_coding 12379706 12434145
## [4] protein_coding 12379706 12434145
## [5] protein_coding 12379706 12379834
## ... ... ... ...
## [10] protein_coding 12379706 12433915
## [11] retained_intron <NA> <NA>
## [12] protein_coding 12351593 12434145
## [13] nonsense_mediated_decay 12351593 12371891
## [14] nonsense_mediated_decay 12379706 12399373
## gene_id tx_name
## <character> <character>
## [1] ENSG00000132170 ENST00000397010
## [2] ENSG00000132170 ENST00000397029
## [3] ENSG00000132170 ENST00000309576
## [4] ENSG00000132170 ENST00000397015
## [5] ENSG00000132170 ENST00000455517
## ... ... ...
## [10] ENSG00000132170 ENST00000397000
## [11] ENSG00000132170 ENST00000477039
## [12] ENSG00000132170 ENST00000287820
## [13] ENSG00000132170 ENST00000397023
## [14] ENSG00000132170 ENST00000396999
## -------
## seqinfo: 357 sequences from GRCh38 genome
# No they don't agree 10 from entrez and 14 from ensembl
# No the builds are different
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
transcriptsBy(TxDb.Hsapiens.UCSC.hg38.knownGene)[["5468"]]
## GRanges object with 17 ranges and 2 metadata columns:
## seqnames ranges strand | tx_id tx_name
## <Rle> <IRanges> <Rle> | <integer> <character>
## [1] chr3 12287368-12434344 + | 33852 ENST00000397010.6
## [2] chr3 12287388-12381491 + | 33853 ENST00000397029.7
## [3] chr3 12287860-12434344 + | 33854 ENST00000309576.10
## [4] chr3 12287901-12434311 + | 33855 ENST00000397015.6
## [5] chr3 12287943-12434162 + | 33856 ENST00000643888.1
## ... ... ... ... . ... ...
## [13] chr3 12312388-12434216 + | 33864 ENST00000397000.5
## [14] chr3 12351472-12406846 + | 33866 ENST00000477039.5
## [15] chr3 12351472-12434356 + | 33867 ENST00000287820.10
## [16] chr3 12351569-12434344 + | 33868 ENST00000397023.5
## [17] chr3 12379706-12417154 + | 33869 ENST00000396999.2
## -------
## seqinfo: 595 sequences (1 circular) from hg38 genome
# The still don't agree 17 to 14
# There is a difference. Which is better? for you to decide
GRanges
objectgns <- genes(TxDb.Hsapiens.UCSC.hg19.knownGene)
gns[gns %over% GRanges("chr2:2858473-3271812")]
## GRanges object with 1 range and 1 metadata column:
## seqnames ranges strand | gene_id
## <Rle> <IRanges> <Rle> | <character>
## 7260 chr2 3192741-3381653 - | 7260
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
## OR
subsetByOverlaps(gns, GRanges("chr2:2858473-3271812"))
## GRanges object with 1 range and 1 metadata column:
## seqnames ranges strand | gene_id
## <Rle> <IRanges> <Rle> | <character>
## 7260 chr2 3192741-3381653 - | 7260
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
select(Homo.sapiens, "BRCA1", "GO", "SYMBOL")
## 'select()' returned 1:many mapping between keys and columns
## SYMBOL GO EVIDENCE ONTOLOGY
## 1 BRCA1 GO:0000151 NAS CC
## 2 BRCA1 GO:0000724 IDA BP
## 3 BRCA1 GO:0000729 TAS BP
## 4 BRCA1 GO:0000800 IDA CC
## 5 BRCA1 GO:0003677 TAS MF
## 6 BRCA1 GO:0003684 IEA MF
## 7 BRCA1 GO:0003713 IMP MF
## 8 BRCA1 GO:0003713 NAS MF
## 9 BRCA1 GO:0003723 IDA MF
## 10 BRCA1 GO:0004842 IDA MF
## 11 BRCA1 GO:0005515 IPI MF
## 12 BRCA1 GO:0005634 IDA CC
## 13 BRCA1 GO:0005654 TAS CC
## 14 BRCA1 GO:0005694 ISS CC
## 15 BRCA1 GO:0005737 IDA CC
## 16 BRCA1 GO:0005886 IDA CC
## 17 BRCA1 GO:0006260 TAS BP
## 18 BRCA1 GO:0006301 IDA BP
## 19 BRCA1 GO:0006302 IDA BP
## 20 BRCA1 GO:0006302 IMP BP
## 21 BRCA1 GO:0006303 TAS BP
## 22 BRCA1 GO:0006349 IEA BP
## 23 BRCA1 GO:0006357 IMP BP
## 24 BRCA1 GO:0006359 TAS BP
## 25 BRCA1 GO:0006633 IEA BP
## 26 BRCA1 GO:0006915 TAS BP
## 27 BRCA1 GO:0006974 TAS BP
## 28 BRCA1 GO:0006978 TAS BP
## 29 BRCA1 GO:0007059 IMP BP
## 30 BRCA1 GO:0007098 IEA BP
## 31 BRCA1 GO:0008270 IEA MF
## 32 BRCA1 GO:0008274 NAS CC
## 33 BRCA1 GO:0008630 IDA BP
## 34 BRCA1 GO:0010212 IMP BP
## 35 BRCA1 GO:0010575 IMP BP
## 36 BRCA1 GO:0010628 IMP BP
## 37 BRCA1 GO:0015631 NAS MF
## 38 BRCA1 GO:0016567 IDA BP
## 39 BRCA1 GO:0016579 TAS BP
## 40 BRCA1 GO:0019899 IPI MF
## 41 BRCA1 GO:0030521 NAS BP
## 42 BRCA1 GO:0031398 IDA BP
## 43 BRCA1 GO:0031436 IDA CC
## 44 BRCA1 GO:0031625 IPI MF
## 45 BRCA1 GO:0032991 IDA CC
## 46 BRCA1 GO:0033147 IMP BP
## 47 BRCA1 GO:0035066 IDA BP
## 48 BRCA1 GO:0042127 TAS BP
## 49 BRCA1 GO:0042802 IPI MF
## 50 BRCA1 GO:0042981 TAS BP
## 51 BRCA1 GO:0043627 IDA BP
## 52 BRCA1 GO:0044030 IEA BP
## 53 BRCA1 GO:0044212 IDA MF
## 54 BRCA1 GO:0044818 IEA BP
## 55 BRCA1 GO:0045717 IMP BP
## 56 BRCA1 GO:0045739 IMP BP
## 57 BRCA1 GO:0045766 IMP BP
## 58 BRCA1 GO:0045892 IDA BP
## 59 BRCA1 GO:0045893 IDA BP
## 60 BRCA1 GO:0045893 NAS BP
## 61 BRCA1 GO:0045893 TAS BP
## 62 BRCA1 GO:0045944 IDA BP
## 63 BRCA1 GO:0045944 IMP BP
## 64 BRCA1 GO:0046600 NAS BP
## 65 BRCA1 GO:0050681 NAS MF
## 66 BRCA1 GO:0051571 IDA BP
## 67 BRCA1 GO:0051572 IEA BP
## 68 BRCA1 GO:0051573 IDA BP
## 69 BRCA1 GO:0051574 IEA BP
## 70 BRCA1 GO:0051865 IDA BP
## 71 BRCA1 GO:0070063 IDA MF
## 72 BRCA1 GO:0070317 TAS BP
## 73 BRCA1 GO:0070512 IDA BP
## 74 BRCA1 GO:0070531 IDA CC
## 75 BRCA1 GO:0071158 IDA BP
## 76 BRCA1 GO:0071356 IMP BP
## 77 BRCA1 GO:0071681 IDA BP
## 78 BRCA1 GO:0072425 IMP BP
## 79 BRCA1 GO:0085020 IDA BP
## 80 BRCA1 GO:1901796 TAS BP
## 81 BRCA1 GO:1902042 IMP BP
## 82 BRCA1 GO:1990904 IDA CC
## 83 BRCA1 GO:2000378 IMP BP
## 84 BRCA1 GO:2000617 IDA BP
## 85 BRCA1 GO:2000620 IDA BP
select(Homo.sapiens, "uc002fai.3", "SYMBOL", "TXNAME")
## 'select()' returned 1:1 mapping between keys and columns
## TXNAME SYMBOL
## 1 uc002fai.3 ZNF23
transcriptsBy(Homo.sapiens)[[mapIds(Homo.sapiens, "uc002fai.3", "ENTREZID","TXNAME")]]
## 'select()' returned 1:1 mapping between keys and columns
## GRanges object with 6 ranges and 2 metadata columns:
## seqnames ranges strand | tx_id tx_name
## <Rle> <IRanges> <Rle> | <integer> <character>
## [1] chr16 71481503-71496117 - | 59822 uc002faf.3
## [2] chr16 71481503-71496273 - | 59823 uc002fah.3
## [3] chr16 71481506-71487876 - | 59824 uc002fad.3
## [4] chr16 71481506-71496273 - | 59825 uc002fag.3
## [5] chr16 71481506-71496273 - | 59826 uc010vmf.2
## [6] chr16 71481506-71523217 - | 59827 uc002fai.3
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
transcriptsBy(Homo.sapiens, columns = c("ENSEMBL", "SYMBOL"))
## 'select()' returned 1:many mapping between keys and columns
## GRangesList object of length 23459:
## $`1`
## GRanges object with 2 ranges and 3 metadata columns:
## seqnames ranges strand | tx_name ENSEMBL
## <Rle> <IRanges> <Rle> | <character> <CharacterList>
## [1] chr19 58858172-58864865 - | uc002qsd.4 ENSG00000121410
## [2] chr19 58859832-58874214 - | uc002qsf.2 ENSG00000121410
## SYMBOL
## <CharacterList>
## [1] A1BG
## [2] A1BG
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
##
## $`10`
## GRanges object with 1 range and 3 metadata columns:
## seqnames ranges strand | tx_name ENSEMBL
## <Rle> <IRanges> <Rle> | <character> <CharacterList>
## [1] chr8 18248755-18258723 + | uc003wyw.1 ENSG00000156006
## SYMBOL
## <CharacterList>
## [1] NAT2
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
##
## $`100`
## GRanges object with 1 range and 3 metadata columns:
## seqnames ranges strand | tx_name ENSEMBL
## <Rle> <IRanges> <Rle> | <character> <CharacterList>
## [1] chr20 43248163-43280376 - | uc002xmj.3 ENSG00000196839
## SYMBOL
## <CharacterList>
## [1] ADA
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
##
## ...
## <23456 more elements>
library(Organism.dplyr)
supportedOrganisms()
## # A tibble: 21 x 3
## organism OrgDb TxDb
## <chr> <chr> <chr>
## 1 Bos taurus org.Bt.eg.db TxDb.Btaurus.UCSC.bosTau8.refGene
## 2 Caenorhabditis elegans org.Ce.eg.db TxDb.Celegans.UCSC.ce11.refGene
## 3 Caenorhabditis elegans org.Ce.eg.db TxDb.Celegans.UCSC.ce6.ensGene
## 4 Canis familiaris org.Cf.eg.db TxDb.Cfamiliaris.UCSC.canFam3.refGene
## 5 Drosophila melanogaster org.Dm.eg.db TxDb.Dmelanogaster.UCSC.dm3.ensGene
## 6 Drosophila melanogaster org.Dm.eg.db TxDb.Dmelanogaster.UCSC.dm6.ensGene
## 7 Danio rerio org.Dr.eg.db TxDb.Drerio.UCSC.danRer10.refGene
## 8 Gallus gallus org.Gg.eg.db TxDb.Ggallus.UCSC.galGal4.refGene
## 9 Homo sapiens org.Hs.eg.db TxDb.Hsapiens.UCSC.hg18.knownGene
## 10 Homo sapiens org.Hs.eg.db TxDb.Hsapiens.UCSC.hg19.knownGene
## # … with 11 more rows
dim(supportedOrganisms())[1]
## [1] 21
src <- src_organism(dbpath = hg38light())
select(src, "NAT2", c("ensembl", "genename"),"symbol")
## symbol ensembl genename
## 1 NAT2 ENSG00000156006 N-acetyltransferase 2
select(src, "NAT2", c("alias"),"symbol")
## symbol alias
## 1 NAT2 AAC2
## 2 NAT2 NAT-2
## 3 NAT2 PNAT
## 4 NAT2 NAT2
promoters(src)
## GRanges object with 77 ranges and 2 metadata columns:
## seqnames ranges strand |
## <Rle> <IRanges> <Rle> |
## ENST00000336199.9 chr1 243843037-243845236 - |
## ENST00000366540.5 chr1 243843385-243845584 - |
## ENST00000263826.9 chr1 243843083-243845282 - |
## ENST00000366539.5 chr1 243849929-243852128 - |
## ENST00000492957.1 chr1 243614947-243617146 - |
## ... ... ... ... .
## ENST00000619536.4 chr1_KI270763v1_alt 855483-857682 - |
## ENST00000621586.3 chr1_KI270763v1_alt 855069-857268 - |
## ENST00000629115.1 chr1_KI270763v1_alt 867343-869542 - |
## ENST00000629924.2 chr1_KI270763v1_alt 826591-828790 - |
## ENST00000627932.1 chr1_KI270763v1_alt 830085-832284 - |
## tx_id tx_name
## <integer> <character>
## ENST00000336199.9 18165 ENST00000336199.9
## ENST00000366540.5 18166 ENST00000366540.5
## ENST00000263826.9 18167 ENST00000263826.9
## ENST00000366539.5 18168 ENST00000366539.5
## ENST00000492957.1 18169 ENST00000492957.1
## ... ... ...
## ENST00000619536.4 206918 ENST00000619536.4
## ENST00000621586.3 206919 ENST00000621586.3
## ENST00000629115.1 206920 ENST00000629115.1
## ENST00000629924.2 206921 ENST00000629924.2
## ENST00000627932.1 206922 ENST00000627932.1
## -------
## seqinfo: 595 sequences (1 circular) from hg38 genome
tbl(src, "id")
## # Source: table<id> [?? x 6]
## # Database: sqlite 3.22.0 []
## entrez map ensembl symbol genename alias
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 1 19q13.43 ENSG00000121410 A1BG alpha-1-B glycoprotein A1B
## 2 1 19q13.43 ENSG00000121410 A1BG alpha-1-B glycoprotein ABG
## 3 1 19q13.43 ENSG00000121410 A1BG alpha-1-B glycoprotein GAB
## 4 1 19q13.43 ENSG00000121410 A1BG alpha-1-B glycoprotein HYST2477
## 5 1 19q13.43 ENSG00000121410 A1BG alpha-1-B glycoprotein A1BG
## 6 10 8p22 ENSG00000156006 NAT2 N-acetyltransferase 2 AAC2
## 7 10 8p22 ENSG00000156006 NAT2 N-acetyltransferase 2 NAT-2
## 8 10 8p22 ENSG00000156006 NAT2 N-acetyltransferase 2 PNAT
## 9 10 8p22 ENSG00000156006 NAT2 N-acetyltransferase 2 NAT2
## 10 100 20q13.12 ENSG00000196839 ADA adenosine deaminase ADA
## # … with more rows
src_tbls(src)
## [1] "id_accession" "id_transcript" "id" "id_omim_pm"
## [5] "id_protein" "id_go" "id_go_all" "ranges_gene"
## [9] "ranges_tx" "ranges_exon" "ranges_cds"
colnames(tbl(src, "id_go"))
## [1] "entrez" "go" "evidence" "ontology"
inner_join(tbl(src, "id"), tbl(src, "id_go")) %>%
filter(symbol == "NAT2") %>%
dplyr::select(entrez, ensembl, symbol, go, evidence, ontology)
## Joining, by = "entrez"
## # Source: lazy query [?? x 6]
## # Database: sqlite 3.22.0 []
## entrez ensembl symbol go evidence ontology
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 10 ENSG00000156006 NAT2 GO:0004060 TAS MF
## 2 10 ENSG00000156006 NAT2 GO:0005515 IPI MF
## 3 10 ENSG00000156006 NAT2 GO:0005829 TAS CC
## 4 10 ENSG00000156006 NAT2 GO:0006805 TAS BP
## 5 10 ENSG00000156006 NAT2 GO:0004060 TAS MF
## 6 10 ENSG00000156006 NAT2 GO:0005515 IPI MF
## 7 10 ENSG00000156006 NAT2 GO:0005829 TAS CC
## 8 10 ENSG00000156006 NAT2 GO:0006805 TAS BP
## 9 10 ENSG00000156006 NAT2 GO:0004060 TAS MF
## 10 10 ENSG00000156006 NAT2 GO:0005515 IPI MF
## # … with more rows
length(BiocManager::available("BSgenome"))
## [1] 94
library(BSgenome.Hsapiens.UCSC.hg19)
seq = getSeq(BSgenome.Hsapiens.UCSC.hg19, "chr1")
alphabetFrequency(seq)
## A C G T M R W S
## 65570891 47024412 47016562 65668756 0 0 0 0
## Y K V H D B N -
## 0 0 0 0 0 0 23970000 0
## + .
## 0 0
# only bases
alphabetFrequency(seq, baseOnly=TRUE)
## A C G T other
## 65570891 47024412 47016562 65668756 23970000
seq = getSeq(BSgenome.Hsapiens.UCSC.hg19, GRanges("chr6:35310335-35395968"))
complement(seq)
## A DNAStringSet instance of length 1
## width seq
## [1] 85634 CGCCTCGCACACTGCGACGCCGGCGGCGCCTG...TACCCTTAATTTATAAATTCTCGACTGACCTT
reverseComplement(seq)
## A DNAStringSet instance of length 1
## width seq
## [1] 85634 TTCCAGTCAGCTCTTAAATATTTAATTCCCAT...GTCCGCGGCGGCCGCAGCGTCACACGCTCCGC
tp53 <- transcriptsBy(Homo.sapiens)[[mapIds(Homo.sapiens, "TP53", "ENTREZID","SYMBOL")]]
## 'select()' returned 1:1 mapping between keys and columns
getSeq(Hsapiens, tp53)
## A DNAStringSet instance of length 17
## width seq
## [1] 14841 CCAGACTGCCTTCCGGGTCACTGCCATGGAGG...CAGCCTGTGCAACAGAGTGAGACTCTGTCTC
## [2] 10534 CCAGACTGCCTTCCGGGTCACTGCCATGGAGG...TAGCCTGGGCAACAGAGCAAGACTCCATCTC
## [3] 5207 CACTGCCCAACAACACCAGCTCCTCTCCCCAG...ACTTTGCTGCCACCTGTGTGTCTGAGGGGTG
## [4] 7092 TGAGGCCAGGAGATGGAGGCTGCAGTGAGCTG...ACTTTGCTGCCACCTGTGTGTCTGAGGGGTG
## [5] 7092 TGAGGCCAGGAGATGGAGGCTGCAGTGAGCTG...ACTTTGCTGCCACCTGTGTGTCTGAGGGGTG
## ... ... ...
## [13] 1056 TACTCCCCTGCCCTCAACAAGATGTTTTGCCA...TCCTCACCATCATCACACTGGAAGACTCCAG
## [14] 13370 GATGGGATTGGGGTTTTCCCCTCCCATGTGCT...TCCTCACCATCATCACACTGGAAGACTCCAG
## [15] 13370 GATGGGATTGGGGTTTTCCCCTCCCATGTGCT...TCCTCACCATCATCACACTGGAAGACTCCAG
## [16] 13018 GATGGGATTGGGGTTTTCCCCTCCCATGTGCT...GGTGAAACCCCGTCTCTACTGAAAAATACAA
## [17] 11557 GATGGGATTGGGGTTTTCCCCTCCCATGTGCT...TTCTGGGACAGCCAAGTCTGTGACTTGCACG
hub <- AnnotationHub()
## snapshotDate(): 2019-07-10
length(hub)
## [1] 46429
length(query(hub, "salmo salar"))
## [1] 1
dog <- query(hub, c("canis familiaris","ensembl","gtf"))
doggy <- dog[["AH69344"]]
## downloading 0 resources
## loading from cache
## Importing File into R ..
TxDoggy <- makeTxDbFromGRanges(doggy)
## Warning in .get_cds_IDX(mcols0$type, mcols0$phase): The "phase" metadata column contains non-NA values for features of
## type stop_codon. This information was ignored.
getInfoOnIds(hub, c("AH73986", "AH73881","AH64538", "AH69344"))
## ah_id fetch_id title rdataclass status
## 291824 AH73986 80732 Ensembl 79 EnsDb for Homo sapiens EnsDb Public
## 291719 AH73881 80627 Ensembl 97 EnsDb for Homo sapiens EnsDb Public
## 282384 AH64538 71284 Canis_familiaris.CanFam3.1.94.gtf GRanges Public
## 287182 AH69344 76090 Canis_familiaris.CanFam3.1.96.gtf GRanges Public
## biocversion rdatadateadded rdatadateremoved file_size
## 291824 3.9 2019-05-02 <NA> 334757888
## 291719 3.9 2019-05-02 <NA> 380694528
## 282384 3.8 2018-10-04 <NA> <NA>
## 287182 3.8 2019-04-22 <NA> <NA>
listMarts()
## biomart version
## 1 ENSEMBL_MART_ENSEMBL Ensembl Genes 97
## 2 ENSEMBL_MART_MOUSE Mouse strains 97
## 3 ENSEMBL_MART_SNP Ensembl Variation 97
## 4 ENSEMBL_MART_FUNCGEN Ensembl Regulation 97
mart <- useMart("ENSEMBL_MART_ENSEMBL")
head(listDatasets(mart))
## dataset description
## 1 abrachyrhynchus_gene_ensembl Pink-footed goose genes (ASM259213v1)
## 2 acalliptera_gene_ensembl Eastern happy genes (fAstCal1.2)
## 3 acarolinensis_gene_ensembl Anole lizard genes (AnoCar2.0)
## 4 acitrinellus_gene_ensembl Midas cichlid genes (Midas_v5)
## 5 ahaastii_gene_ensembl Great spotted kiwi genes (aptHaa1)
## 6 amelanoleuca_gene_ensembl Panda genes (ailMel1)
## version
## 1 ASM259213v1
## 2 fAstCal1.2
## 3 AnoCar2.0
## 4 Midas_v5
## 5 aptHaa1
## 6 ailMel1
mart <- useMart("ENSEMBL_MART_ENSEMBL","hsapiens_gene_ensembl")
getBM(c("ensembl_gene_id","hgnc_symbol","entrezgene_id"), "entrezgene_id", c("672","5468","7157"), mart)
## Cache found
## ensembl_gene_id hgnc_symbol entrezgene_id
## 1 ENSG00000132170 PPARG 5468
## 2 ENSG00000012048 BRCA1 672
## 3 ENSG00000141510 TP53 7157
res <- getBM(c("hgnc_symbol","gene_exon"),"hgnc_symbol","GAPDH",mart)
## Cache found
nms <- listAttributes(mart)$name
tempdir()
. We use a temporary directory so it will be cleaned up at the end. If using to manage file would want a more permanent location.# the ask is not needed here
# because this is a vignette and not an interactive session when knitting,
# this gives permission to create the cache on the system.
bfc <- BiocFileCache(tempdir(), ask=FALSE)
bfccache(bfc)
## [1] "/tmp/RtmpYoNrM1"
bfc
## class: BiocFileCache
## bfccache: /tmp/RtmpYoNrM1
## bfccount: 0
## For more information see: bfcinfo() or bfcquery()
names(bfcinfo(bfc))
## [1] "rid" "rname" "create_time"
## [4] "access_time" "rpath" "rtype"
## [7] "fpath" "last_modified_time" "etag"
## [10] "expires"
savepath <- bfcnew(bfc, "NewResource", ext=".RData")
savepath
## BFC1
## "/tmp/RtmpYoNrM1/4dd313a59cd5_4dd313a59cd5.RData"
# we can see the info has been added
bfcinfo(bfc)
## # A tibble: 1 x 10
## rid rname create_time access_time rpath rtype fpath last_modified_t…
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 BFC1 NewR… 2019-07-19… 2019-07-19… /tmp… rela… 4dd3… NA
## # … with 2 more variables: etag <chr>, expires <dbl>
# use this path to save an object
m = matrix(1:12, nrow=3)
save(m, file=savepath)
httpbin
site has mock urls that can be used for testing “http://httpbin.org/get”. Give this resource the name “TestWeb”url <- "http://httpbin.org/get"
resource1 <- bfcadd(bfc, "TestWeb", fpath=url)
rid1 <- names(resource1)
resource2 <- bfcadd(bfc, "TestNoDownload", fpath=url, download=FALSE)
rid2 <- names(resource2)
bfcinfo(bfc)
## # A tibble: 3 x 10
## rid rname create_time access_time rpath rtype fpath last_modified_t…
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 BFC1 NewR… 2019-07-19… 2019-07-19… /tmp… rela… 4dd3… NA
## 2 BFC2 Test… 2019-07-19… 2019-07-19… /tmp… web http… NA
## 3 BFC3 Test… 2019-07-19… 2019-07-19… /tmp… web http… NA
## # … with 2 more variables: etag <chr>, expires <dbl>
bfccount(bfc)
## [1] 3
res <- bfcquery(bfc, "Test")
res
## # A tibble: 2 x 10
## rid rname create_time access_time rpath rtype fpath last_modified_t…
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 BFC2 Test… 2019-07-19… 2019-07-19… /tmp… web http… NA
## 2 BFC3 Test… 2019-07-19… 2019-07-19… /tmp… web http… NA
## # … with 2 more variables: etag <chr>, expires <dbl>
bfccount(res)
## [1] 2
bfcneedsupdate(bfc)
## BFC2 BFC3
## NA TRUE
# the metdata data.frmae can be for a subset of data or the entire cache.
# the constrain is a column rid must match the rids in the cache.
rid = c("BFC1", "BFC3")
vls1 = c(1000, 2000)
vls2 = c("An experiment", "Online resource")
meta <- as.data.frame(list(rid=rid, vls1=vls1, vls2=vls2))
bfcmeta(bfc, name="resourceData") <- meta
names(bfcinfo(bfc))
## [1] "rid" "rname" "create_time"
## [4] "access_time" "rpath" "rtype"
## [7] "fpath" "last_modified_time" "etag"
## [10] "expires" "vls1" "vls2"
bfcquerycols(bfc)
## [1] "rid" "rname" "create_time"
## [4] "access_time" "rpath" "rtype"
## [7] "fpath" "last_modified_time" "etag"
## [10] "expires" "vls1" "vls2"
library(dplyr)
bfcinfo(bfc) %>% dplyr::select("rid", "vls2")
## # A tibble: 3 x 2
## rid vls2
## <chr> <chr>
## 1 BFC1 An experiment
## 2 BFC2 <NA>
## 3 BFC3 Online resource
res <- bfcquery(bfc, field="vls2", "experiment")
res
## # A tibble: 1 x 12
## rid rname create_time access_time rpath rtype fpath last_modified_t…
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 BFC1 NewR… 2019-07-19… 2019-07-19… /tmp… rela… 4dd3… NA
## # … with 4 more variables: etag <chr>, expires <dbl>, vls1 <dbl>, vls2 <chr>
# get id of resource matching query
# use that to load the resource
bfcrid(res)
## [1] "BFC1"
load(bfcrpath(bfc, rids=bfcrid(res)))