## ----echo = FALSE,hide=TRUE, message=FALSE,warning=FALSE---------------------- library(ELMER.data) library(DT) library(dplyr) ## ----eval = FALSE------------------------------------------------------------- # devtools::install_github(repo = "tiagochst/ELMER.data") # library("ELMER.data") # library("GenomicRanges") ## ----eval=FALSE, include=TRUE------------------------------------------------- # # getTranscripts <- function(genome = "hg38"){ # # tries <- 0L # msg <- character() # while (tries < 3L) { # tss <- tryCatch({ # host <- ifelse(genome == "hg19", "grch37.ensembl.org","www.ensembl.org") # message("Accessing ", host, " to get TSS information") # # ensembl <- tryCatch({ # useEnsembl("ensembl", dataset = "hsapiens_gene_ensembl", host = host) # }, error = function(e) { # message(e) # for(mirror in c("asia","useast","uswest")){ # x <- useEnsembl("ensembl", # dataset = "hsapiens_gene_ensembl", # mirror = mirror, # host = host) # if(class(x) == "Mart") { # return(x) # } # } # return(NULL) # }) # # if(is.null(host)) { # message("Problems accessing ensembl database") # return(NULL) # } # attributes <- c("chromosome_name", # "start_position", # "end_position", "strand", # "ensembl_gene_id", # "transcription_start_site", # "transcript_start", # "ensembl_transcript_id", # "transcript_end", # "external_gene_name") # chrom <- c(1:22, "X", "Y","M","*") # db.datasets <- listDatasets(ensembl) # description <- db.datasets[db.datasets$dataset=="hsapiens_gene_ensembl",]$description # message(paste0("Downloading transcripts information from ", ensembl@host, ". Using: ", description)) # # filename <- paste0(gsub("[[:punct:]]| ", "_",description),"_tss.rda") # tss <- getBM(attributes = attributes, filters = c("chromosome_name"), values = list(chrom), mart = ensembl) # tss <- tss[!duplicated(tss$ensembl_transcript_id),] # save(tss, file = filename, compress = "xz") # }) # } # return(tss) # } # # getGenes <- function (genome = "hg19"){ # tries <- 0L # msg <- character() # while (tries < 3L) { # gene.location <- tryCatch({ # host <- ifelse(genome == "hg19", "grch37.ensembl.org", # "www.ensembl.org") # message("Accessing ", host, " to get gene information") # ensembl <- tryCatch({ # useEnsembl("ensembl", dataset = "hsapiens_gene_ensembl", # host = host) # }, error = function(e) { # message(e) # for (mirror in c("asia", "useast", "uswest")) { # x <- useEnsembl("ensembl", dataset = "hsapiens_gene_ensembl", # mirror = mirror, host = host) # if (class(x) == "Mart") { # return(x) # } # } # return(NULL) # }) # if (is.null(host)) { # message("Problems accessing ensembl database") # return(NULL) # } # attributes <- c("chromosome_name", "start_position", # "end_position", "strand", "ensembl_gene_id", # "entrezgene", "external_gene_name") # db.datasets <- listDatasets(ensembl) # description <- db.datasets[db.datasets$dataset == # "hsapiens_gene_ensembl", ]$description # message(paste0("Downloading genome information (try:", # tries, ") Using: ", description)) # filename <- paste0(gsub("[[:punct:]]| ", "_", description), # ".rda") # if (!file.exists(filename)) { # chrom <- c(1:22, "X", "Y") # gene.location <- getBM(attributes = attributes, # filters = c("chromosome_name"), values = list(chrom), # mart = ensembl) # } # gene.location # }, error = function(e) { # msg <<- conditionMessage(e) # tries <<- tries + 1L # }) # if (!is.null(gene.location)) break # } # if (tries == 3L) # stop("failed to get URL after 3 tries:", "\n error: ", msg) # # return(gene.location) # } # # Human_genes__GRCh37_p13__tss <- getTranscripts(genome = "hg19") # Human_genes__GRCh38_p12__tss <- getTranscripts(genome = "hg38") # Human_genes__GRCh37_p13 <- getGenes("hg19") # Human_genes__GRCh38_p12 <- getGenes("hg38") # save(Human_genes__GRCh37_p13__tss, # file = "Human_genes__GRCh37_p13__tss.rda", # compress = "xz") # # save(Human_genes__GRCh38_p12, # file = "Human_genes__GRCh38_p12.rda", # compress = "xz") # # save(Human_genes__GRCh38_p12__tss, # file = "Human_genes__GRCh38_p12__tss.rda", # compress = "xz") # # save(Human_genes__GRCh37_p13, # file = "Human_genes__GRCh37_p13.rda", # compress = "xz") ## ----eval=FALSE, include=TRUE------------------------------------------------- # for(plat in c("450K","EPIC")) { # for(genome in c("hg38","hg19")) { # base <- "http://zwdzwd.io/InfiniumAnnotation/current/" # path <- file.path(base,plat,paste(plat,"hg19.manifest.rds", sep =".")) # if (grepl("hg38", genome)) path <- gsub("hg19","hg38",path) # if(plat == "EPIC") { # annotation <- paste0(base,"EPIC/EPIC.hg19.manifest.rds") # } else { # annotation <- paste0(base,"hm450/hm450.hg19.manifest.rds") # } # if(grepl("hg38", genome)) annotation <- gsub("hg19","hg38",annotation) # if(!file.exists(basename(annotation))) { # if(Sys.info()["sysname"] == "Windows") mode <- "wb" else mode <- "w" # downloader::download(annotation, basename(annotation), mode = mode) # } # } # } # # devtools::use_data(EPIC.hg19.manifest,overwrite = T,compress = "xz") # devtools::use_data(EPIC.hg38.manifest,overwrite = T,compress = "xz") # devtools::use_data(hm450.hg19.manifest,overwrite = T,compress = "xz") # devtools::use_data(hm450.hg38.manifest,overwrite = T,compress = "xz") ## ----message=FALSE------------------------------------------------------------ data("EPIC.hg19.manifest") as.data.frame(EPIC.hg19.manifest)[1:5,] %>% datatable(options = list(scrollX = TRUE,pageLength = 5)) data("EPIC.hg38.manifest") as.data.frame(EPIC.hg38.manifest)[1:5,] %>% datatable(options = list(scrollX = TRUE,pageLength = 5)) data("hm450.hg19.manifest") as.data.frame(hm450.hg19.manifest)[1:5,] %>% datatable(options = list(scrollX = TRUE,pageLength = 5)) data("hm450.hg38.manifest") as.data.frame(hm450.hg38.manifest)[1:5,] %>% datatable(options = list(scrollX = TRUE,pageLength = 5)) ## ----eval=FALSE, include=TRUE------------------------------------------------- # library(xml2) # library(httr) # library(dplyr) # library(rvest) # createMotifRelevantTfs <- function(classification = "family"){ # # message("Accessing hocomoco to get last version of TFs ", classification) # file <- paste0(classification,".motif.relevant.TFs.rda") # # # Download from http://hocomoco.autosome.ru/human/mono # tf.family <- "http://hocomoco11.autosome.ru/human/mono?full=true" %>% read_html() %>% html_table() # tf.family <- tf.family[[1]] # # Split TF for each family, this will help us map for each motif which are the some ones in the family # # basicaly: for a TF get its family then get all TF in that family # col <- ifelse(classification == "family", "TF family","TF subfamily") # family <- split(tf.family,f = tf.family[[col]]) # # motif.relevant.TFs <- plyr::alply(tf.family,1, function(x){ # f <- x[[col]] # if(f == "") return(x$`Transcription factor`) # Case without family, we will get only the same object # return(unique(family[as.character(f)][[1]]$`Transcription factor`)) # },.progress = "text") # #names(motif.relevant.TFs) <- tf.family$`Transcription factor` # names(motif.relevant.TFs) <- tf.family$Model # # Cleaning object # attr(motif.relevant.TFs,which="split_type") <- NULL # attr(motif.relevant.TFs,which="split_labels") <- NULL # # return(motif.relevant.TFs) # } # # updateTFClassList <- function(tf.list, classification = "family"){ # col <- ifelse(classification == "family","family.name","subfamily.name") # TFclass <- getTFClass() # # Hocomoco # tf.family <- "http://hocomoco11.autosome.ru/human/mono?full=true" %>% read_html() %>% html_table() # tf.family <- tf.family[[1]] # # tf.members <- plyr::alply(unique(TFclass %>% pull(col)),1, function(x){ # TFclass$Gene[which(x == TFclass[,col])] # },.progress = "text") # names(tf.members) <- unique(TFclass %>% pull(col)) # attr(tf.members,which="split_type") <- NULL # attr(tf.members,which="split_labels") <- NULL # # for(i in names(tf.list)){ # x <- tf.family[tf.family$Model == i,"Transcription factor"] # idx <- which(sapply(lapply(tf.members, function(ch) grep(paste0("^",x,"$"), ch)), function(x) length(x) > 0)) # if(length(idx) == 0) next # members <- tf.members[[idx]] # tf.list[[i]] <- sort(unique(c(tf.list[[i]],members))) # } # return(tf.list) # } # # getTFClass <- function(){ # # get TF classification # file <- "TFClass.rda" # if(file.exists(file)) { # return(get(load(file))) # } # file <- "http://tfclass.bioinf.med.uni-goettingen.de/suppl/tfclass.ttl.gz" # downloader::download(file,basename(file)) # char_vector <- readLines(basename(file)) # # Find TF idx # idx <- grep("genus",char_vector,ignore.case = T) # # # get TF names # TF <- char_vector[sort(c( idx +1, idx + 2, idx + 4))] # TF <- TF[-grep("LOGO_|rdf:type",TF)] # TF <- gsub(" rdfs:label | ;| rdfs:subClassOf ","",TF) # TF <- stringr::str_trim(gsub('"', '', TF)) # TF <- tibble::as.tibble(t(matrix(TF,nrow = 2))) # colnames(TF) <- c("Gene", "class") # # # Get family and subfamily classification # family.pattern <- "^" # # idx <- grep(family.pattern,char_vector) # family.names <- char_vector[ sort(c(idx,idx+ 2))] # family.names <- gsub(" rdfs:label | ;| rdfs:subClassOf |" # # idx <- grep(subfamily.pattern,char_vector) # subfamily.names <- char_vector[ sort(c(idx,idx+ 2))] # subfamily.names <- gsub(" rdfs:label | ;| rdfs:subClassOf | 0] # aux <- TFclass[rep(i,length(gene)),] # aux$Gene <- gene # df <- rbind(df,aux) # } # TFclass <- rbind(TFclass,df) # TFclass <- TFclass[!duplicated(TFclass),] # # # Break ( into multiple cases) # m <- grep("-",TFclass$Gene) # df <- NULL # for(i in m){ # gene <- gsub("-","",sort(stringr::str_trim(unlist(stringr::str_split(TFclass$Gene[i],"\\(|,|\\)|/"))))) # gene <- gene[stringr::str_length(gene) > 0] # aux <- TFclass[rep(i,length(gene)),] # aux$Gene <- gene # df <- rbind(df,aux) # } # TFclass <- rbind(TFclass,df) # # df <- NULL # for(i in 1:length(TFclass$Gene)){ # m <- TFclass$Gene[i] # gene <- unique(c(toupper(alias2Symbol(toupper(m))),toupper(m),toupper(alias2Symbol(m)))) # if(all(gene %in% TFclass$Gene)) next # aux <- TFclass[rep(i,length(gene)),] # aux$Gene <- gene # df <- rbind(df,aux) # } # TFclass <- rbind(TFclass,df) # TFclass <- TFclass[!duplicated(TFclass),] # TFclass <- TFclass[TFclass$Gene %in% human.TF$external_gene_name,] # save(TFclass,file = "TFClass.rda") # return(TFclass) # } # TF.family <- createMotifRelevantTfs("family") # TF.family <- updateTFClassList(TF.family,"family") # TF.subfamily <- createMotifRelevantTfs("subfamily") # TF.subfamily <- updateTFClassList(TF.subfamily,classification = "subfamily") # save(TF.family,file = "~/ELMER.data/data/TF.family.rda", compress = "xz") # save(TF.subfamily,file = "~/ELMER.data/data/TF.subfamily.rda", compress = "xz") ## ----eval=FALSE, include=TRUE------------------------------------------------- # hocomoco.table <- "http://hocomoco11.autosome.ru/human/mono?full=true" %>% read_html() %>% html_table() # hocomoco.table <- hocomoco.table[[1]] # save(hocomoco.table,file = "data/hocomoco.table.rda", compress = "xz") ## ----------------------------------------------------------------------------- data("Probes.motif.hg19.450K") dim(Probes.motif.hg19.450K) str(Probes.motif.hg19.450K) ## ----------------------------------------------------------------------------- data("Probes.motif.hg38.450K") dim(Probes.motif.hg38.450K) str(Probes.motif.hg38.450K) ## ----------------------------------------------------------------------------- data("Probes.motif.hg19.EPIC") dim(Probes.motif.hg19.EPIC) str(Probes.motif.hg19.EPIC) ## ----------------------------------------------------------------------------- data("Probes.motif.hg38.EPIC") dim(Probes.motif.hg38.EPIC) str(Probes.motif.hg38.EPIC) ## ----eval=FALSE, include=TRUE------------------------------------------------- # getInfiniumAnnotation <- function(plat = "450K", genome = "hg38"){ # message("Loading object: ",file) # newenv <- new.env() # if(plat == "EPIC" & genome == "hg19") data("EPIC.hg19.manifest", package = "ELMER.data",envir=newenv) # if(plat == "EPIC" & genome == "hg38") data("EPIC.hg38.manifest", package = "ELMER.data",envir=newenv) # if(plat == "450K" & genome == "hg19") data("hm450.hg19.manifest", package = "ELMER.data",envir=newenv) # if(plat == "450K" & genome == "hg38") data("hm450.hg38.manifest", package = "ELMER.data",envir=newenv) # annotation <- get(ls(newenv)[1],envir=newenv) # return(annotation) # } # # To find for each probe the know motif we will use HOMER software (http://homer.salk.edu/homer/) # # Step: # # 1 - get DNA methylation probes annotation with the regions # # 2 - Make a bed file from it # # 3 - Execute section: Finding Instance of Specific Motifs from http://homer.salk.edu/homer/ngs/peakMotifs.html to the HOCOMOCO TF motifs # # Also, As HOMER is using more RAM than the available we will split the files in to 100k probes. # # Obs: for each probe we create a winddow of 500 bp (-size 500) around it. This might lead to false positives, but will not have false negatives. # # The false posives will be removed latter with some statistical tests. # TFBS.motif <- "http://hocomoco11.autosome.ru/final_bundle/hocomoco11/full/HUMAN/mono/HOCOMOCOv11_full_HUMAN_mono_homer_format_0.0001.motif" # if(!file.exists(basename(TFBS.motif))) downloader::download(TFBS.motif,basename(TFBS.motif)) # for(plat in c("EPIC","450K")){ # for(gen in c("hg38","hg19")){ # # file <- paste0(plat,gen,".txt") # print(file) # if(!file.exists(file)){ # # STEP 1 # gr <- getInfiniumAnnotation(plat = plat,genome = gen) # # # This will remove masked probes. They have poor quality and might be arbitrarily positioned (Wanding Zhou) # gr <- gr[!gr$MASK_general] # # df <- data.frame(seqnames=seqnames(gr), # starts=as.integer(start(gr)), # ends=end(gr), # names=names(gr), # scores=c(rep(".", length(gr))), # strands=strand(gr)) # step <- 10000 # nb of lines in each file. 10K was selected to not explode RAM # n <- nrow(df) # pb <- txtProgressBar(max = floor(n/step), style = 3) # # for(j in 0:floor(n/step)){ # setTxtProgressBar(pb, j) # # STEP 2 # file.aux <- paste0(plat,gen,"_",j,".bed") # if(!file.exists(gsub(".bed",".txt",file.aux))){ # end <- ifelse(((j + 1) * step) > n, n,((j + 1) * step)) # write.table(df[((j * step) + 1):end,], file = file.aux, col.names = F, quote = F,row.names = F,sep = "\t") # # # STEP 3 use -mscore to get scores # cmd <- paste("source ~/.bash_rc; annotatePeaks.pl" ,file.aux, gen, "-m", basename(TFBS.motif), "-size 500 -cpu 12 >", gsub(".bed",".txt",file.aux)) # system(cmd) # } # } # } # close(pb) # # We will merge the results from each file into one # peaks <- NULL # pb <- txtProgressBar(max = floor(n/step), style = 3) # for(j in 0:floor(n/step)){ # setTxtProgressBar(pb, j) # aux <- readr::read_tsv(paste0(plat,gen,"_",j,".txt")) # colnames(aux)[1] <- "PeakID" # if(is.null(peaks)) { # peaks <- aux # } else { # peaks <- rbind(peaks, aux) # } # } # close(pb) # print("Writing file...") # readr::write_tsv(peaks,path=file,col_names = TRUE) # print("DONE!") # gc() # } # } # # getMatrix <- function(filename) { # motifs <- readr::read_tsv(file) # # From 1 to 21 we have annotations # matrix <- Matrix::Matrix(0, nrow = nrow(motifs), ncol = ncol(motifs) - 21 ,sparse = TRUE) # colnames(matrix) <- gsub(" Distance From Peak\\(sequence,strand,conservation\\)","",colnames(motifs)[-c(1:21)]) # rownames(matrix) <- motifs$PeakID # matrix[!is.na(motifs[,-c(1:21)])] <- 1 # matrix <- as(matrix, "nsparseMatrix") # return(matrix) # } # # for(plat in c("EPIC","450K")){ # for(gen in c("hg19","hg38")){ # file <- paste0(plat,gen,".txt") # # if(file == "450Khg19.txt"){ # if(file.exists("Probes.motif.hg19.450K.rda")) next # Probes.motif.hg19.450K <- getMatrix(file) # save(Probes.motif.hg19.450K, file = "Probes.motif.hg19.450K.rda", compress = "xz") # rm(Probes.motif.hg19.450K) # } # if(file == "450Khg38.txt"){ # if(file.exists("Probes.motif.hg38.450K.rda")) next # Probes.motif.hg38.450K <- getMatrix(file) # save(Probes.motif.hg38.450K, file = "Probes.motif.hg38.450K.rda", compress = "xz") # rm(Probes.motif.hg38.450K) # } # # if(file == "EPIChg19.txt"){ # if(file.exists("Probes.motif.hg19.EPIC.rda")) next # Probes.motif.hg19.EPIC <- getMatrix(file) # save(Probes.motif.hg19.EPIC, file = "Probes.motif.hg19.EPIC.rda", compress = "xz") # rm(Probes.motif.hg19.EPIC) # } # # if(file == "EPIChg38.txt"){ # if(file.exists("Probes.motif.hg38.EPIC.rda")) next # # Probes.motif.hg38.EPIC <- getMatrix(file) # save(Probes.motif.hg38.EPIC, file = "Probes.motif.hg38.EPIC.rda", compress = "xz") # rm(Probes.motif.hg38.EPIC) # } # } # } ## ----------------------------------------------------------------------------- data("Probes.motif.hg19.450K") as.data.frame(as.matrix(Probes.motif.hg19.450K[1:20,1:20])) %>% datatable(options = list(scrollX = TRUE,pageLength = 5)) ## ----eval=FALSE, include=TRUE------------------------------------------------- # human.TF <- readr::read_csv("http://humantfs.ccbr.utoronto.ca/download/v_1.01/DatabaseExtract_v_1.01.csv") # human.TF <- human.TF[human.TF$`Is TF?` == "Yes",] # colnames(human.TF)[1:2] <- c("ensembl_gene_id","external_gene_name") # save(human.TF,file = "~/ELMER.data/data/human.TF.rda",compress = "xz") ## ----------------------------------------------------------------------------- data("human.TF") as.data.frame(human.TF) %>% datatable(options = list(scrollX = TRUE,pageLength = 5)) ## ----sessionInfo-------------------------------------------------------------- sessionInfo()