## ## 1. Retrieve enhancers, SNPs from AnnotationHub, gene coordinates ## from TxDb.*; harmonize genome and chromosome names ## library(AnnotationHub) hub <- AnnotationHub() hg19 <- hub[metadata(hub)$Genome %in% c("hg19", "GRCh37")] ## Enhancers, from ENCODE BroadHmm track ## - details of track ## http://hgwdev.sdsc.edu/cgi-bin/hgTrackUi?db=hg19&g=wgEncodeBroadHmm ## http://www.ncbi.nlm.nih.gov/pubmed/21441907 hmm <- hub$goldenpath.hg18.database.wgEncodeBroadHmm_0.0.1.RData ## wrong 'seqlengths' -- 'hg18' in title, but actually liftOver to hg19 library(BSgenome.Hsapiens.UCSC.hg19) seqlevels(hmm, force=TRUE) <- paste0("chr", 1:22) seqlengths(hmm) <- seqlengths(BSgenome.Hsapiens.UCSC.hg19)[names(seqlengths(hmm))] genome(hmm) <- "hg19" enhancers <- hmm[grep("Enhancer", hmm$name)] ## SNPs, e.g., from chr1 of CEU population snps <- query(hub, "dbSNP.*17.*CEU")[[1]] ## true (length 1) SNPs with single ALT allele snps <- snps[width(snps) == 1 & elementLengths(alt(snps)) == 1] seqlevelsStyle(snps) <- "UCSC" genome(snps) <- "hg19" ## gene coordinates library(TxDb.Hsapiens.UCSC.hg19.knownGene) genes <- genes(TxDb.Hsapiens.UCSC.hg19.knownGene) genes <- genes[seqnames(genes) == "chr17"] ## ## 2. Download (large!) conservation track as BED file from UCSC, ## query for chromosome 1 using rtracklayer ## library(rtracklayer) url <- paste("http://hgdownload.cse.ucsc.edu/goldenPath/hg19", "phastCons100way/hg19.100way.phastCons.bw", sep="/") dest <- "~/ISMB-2014/resources/hg19.100way.phastCons.bw" ## download.file(url, dest) bw <- BigWigFile(dest) chr17 <- as(seqinfo(BigWigFile(dest)), "GRanges")["chr17"] score <- import(bw, which=chr17, as="NumericList")[["chr17"]] ## ## 4. subset SNPs by enhancers ## enhancerSnps <- subsetByOverlaps(snps, enhancers) ## ## 5. Annotate enhancer SNPs with evolutionary conservation score ## score <- score[start(enhancerSnps)] ## info(enhancerSnps)$score <- score[start(enhancerSnps)] ## ## 6. find nearest and distanceToNearest genes ## hits <- distanceToNearest(enhancerSnps, genes) df <- data.frame( Near=names(genes)[subjectHits(hits)], Distance=mcols(hits)$distance, Score=score, stringsAsFactors=FALSE)