## ----echo = FALSE,hide=TRUE, message=FALSE,warning=FALSE----------------------
library(ELMER.data)
library(DT)
library(dplyr)

## ----eval = FALSE-------------------------------------------------------------
#  devtools::install_github(repo = "tiagochst/ELMER.data")
#  library("ELMER.data")
#  library("GenomicRanges")

## ----eval=FALSE, include=TRUE-------------------------------------------------
#  
#  getTranscripts <- function(genome = "hg38"){
#  
#    tries <- 0L
#    msg <- character()
#    while (tries < 3L) {
#      tss <- tryCatch({
#        host <- ifelse(genome == "hg19",  "grch37.ensembl.org","www.ensembl.org")
#        message("Accessing ", host, " to get TSS information")
#  
#        ensembl <- tryCatch({
#          useEnsembl("ensembl", dataset = "hsapiens_gene_ensembl", host =  host)
#        },  error = function(e) {
#          message(e)
#          for(mirror in c("asia","useast","uswest")){
#            x <- useEnsembl("ensembl",
#                            dataset = "hsapiens_gene_ensembl",
#                            mirror = mirror,
#                            host =  host)
#            if(class(x) == "Mart") {
#              return(x)
#            }
#          }
#          return(NULL)
#        })
#  
#        if(is.null(host)) {
#          message("Problems accessing ensembl database")
#          return(NULL)
#        }
#        attributes <- c("chromosome_name",
#                        "start_position",
#                        "end_position", "strand",
#                        "ensembl_gene_id",
#                        "transcription_start_site",
#                        "transcript_start",
#                        "ensembl_transcript_id",
#                        "transcript_end",
#                        "external_gene_name")
#        chrom <- c(1:22, "X", "Y","M","*")
#        db.datasets <- listDatasets(ensembl)
#        description <- db.datasets[db.datasets$dataset=="hsapiens_gene_ensembl",]$description
#        message(paste0("Downloading transcripts information from ", ensembl@host, ". Using: ", description))
#  
#        filename <-  paste0(gsub("[[:punct:]]| ", "_",description),"_tss.rda")
#        tss <- getBM(attributes = attributes, filters = c("chromosome_name"), values = list(chrom), mart = ensembl)
#        tss <- tss[!duplicated(tss$ensembl_transcript_id),]
#        save(tss, file = filename, compress = "xz")
#      })
#    }
#    return(tss)
#  }
#  
#  getGenes <- function (genome = "hg19"){
#    tries <- 0L
#    msg <- character()
#    while (tries < 3L) {
#      gene.location <- tryCatch({
#        host <- ifelse(genome == "hg19", "grch37.ensembl.org",
#                       "www.ensembl.org")
#        message("Accessing ", host, " to get gene information")
#        ensembl <- tryCatch({
#          useEnsembl("ensembl", dataset = "hsapiens_gene_ensembl",
#                     host = host)
#        }, error = function(e) {
#          message(e)
#          for (mirror in c("asia", "useast", "uswest")) {
#            x <- useEnsembl("ensembl", dataset = "hsapiens_gene_ensembl",
#                            mirror = mirror, host = host)
#            if (class(x) == "Mart") {
#              return(x)
#            }
#          }
#          return(NULL)
#        })
#        if (is.null(host)) {
#          message("Problems accessing ensembl database")
#          return(NULL)
#        }
#        attributes <- c("chromosome_name", "start_position",
#                        "end_position", "strand", "ensembl_gene_id",
#                        "entrezgene", "external_gene_name")
#        db.datasets <- listDatasets(ensembl)
#        description <- db.datasets[db.datasets$dataset ==
#                                     "hsapiens_gene_ensembl", ]$description
#        message(paste0("Downloading genome information (try:",
#                       tries, ") Using: ", description))
#        filename <- paste0(gsub("[[:punct:]]| ", "_", description),
#                           ".rda")
#        if (!file.exists(filename)) {
#          chrom <- c(1:22, "X", "Y")
#          gene.location <- getBM(attributes = attributes,
#                                 filters = c("chromosome_name"), values = list(chrom),
#                                 mart = ensembl)
#        }
#        gene.location
#      }, error = function(e) {
#        msg <<- conditionMessage(e)
#        tries <<- tries + 1L
#      })
#      if (!is.null(gene.location)) break
#    }
#    if (tries == 3L)
#      stop("failed to get URL after 3 tries:", "\n  error: ", msg)
#  
#    return(gene.location)
#  }
#  
#  Human_genes__GRCh37_p13__tss <- getTranscripts(genome = "hg19")
#  Human_genes__GRCh38_p12__tss <- getTranscripts(genome = "hg38")
#  Human_genes__GRCh37_p13 <- getGenes("hg19")
#  Human_genes__GRCh38_p12 <- getGenes("hg38")
#  save(Human_genes__GRCh37_p13__tss,
#       file = "Human_genes__GRCh37_p13__tss.rda",
#       compress = "xz")
#  
#  save(Human_genes__GRCh38_p12,
#       file = "Human_genes__GRCh38_p12.rda",
#       compress = "xz")
#  
#  save(Human_genes__GRCh38_p12__tss,
#       file = "Human_genes__GRCh38_p12__tss.rda",
#       compress = "xz")
#  
#  save(Human_genes__GRCh37_p13,
#       file = "Human_genes__GRCh37_p13.rda",
#       compress = "xz")

## ----eval=FALSE, include=TRUE-------------------------------------------------
#  for(plat in c("450K","EPIC")) {
#    for(genome in c("hg38","hg19")) {
#      base <- "http://zwdzwd.io/InfiniumAnnotation/current/"
#      path <- file.path(base,plat,paste(plat,"hg19.manifest.rds", sep ="."))
#      if (grepl("hg38", genome)) path <- gsub("hg19","hg38",path)
#      if(plat == "EPIC") {
#        annotation <- paste0(base,"EPIC/EPIC.hg19.manifest.rds")
#      } else {
#        annotation <- paste0(base,"hm450/hm450.hg19.manifest.rds")
#      }
#      if(grepl("hg38", genome)) annotation <- gsub("hg19","hg38",annotation)
#      if(!file.exists(basename(annotation))) {
#        if(Sys.info()["sysname"] == "Windows") mode <- "wb" else  mode <- "w"
#        downloader::download(annotation, basename(annotation), mode = mode)
#      }
#    }
#  }
#  
#  devtools::use_data(EPIC.hg19.manifest,overwrite = T,compress = "xz")
#  devtools::use_data(EPIC.hg38.manifest,overwrite = T,compress = "xz")
#  devtools::use_data(hm450.hg19.manifest,overwrite = T,compress = "xz")
#  devtools::use_data(hm450.hg38.manifest,overwrite = T,compress = "xz")

## ----message=FALSE------------------------------------------------------------
data("EPIC.hg19.manifest")
as.data.frame(EPIC.hg19.manifest)[1:5,] %>% datatable(options = list(scrollX = TRUE,pageLength = 5)) 
data("EPIC.hg38.manifest")
as.data.frame(EPIC.hg38.manifest)[1:5,] %>% datatable(options = list(scrollX = TRUE,pageLength = 5)) 
data("hm450.hg19.manifest")
as.data.frame(hm450.hg19.manifest)[1:5,] %>% datatable(options = list(scrollX = TRUE,pageLength = 5)) 
data("hm450.hg38.manifest")
as.data.frame(hm450.hg38.manifest)[1:5,] %>% datatable(options = list(scrollX = TRUE,pageLength = 5)) 

## ----eval=FALSE, include=TRUE-------------------------------------------------
#  library(xml2)
#  library(httr)
#  library(dplyr)
#  library(rvest)
#  createMotifRelevantTfs <- function(classification = "family"){
#  
#    message("Accessing hocomoco to get last version of TFs ", classification)
#    file <- paste0(classification,".motif.relevant.TFs.rda")
#  
#    # Download from http://hocomoco.autosome.ru/human/mono
#    tf.family <- "http://hocomoco11.autosome.ru/human/mono?full=true" %>% read_html()  %>%  html_table()
#    tf.family <- tf.family[[1]]
#    # Split TF for each family, this will help us map for each motif which are the some ones in the family
#    # basicaly: for a TF get its family then get all TF in that family
#    col <- ifelse(classification == "family", "TF family","TF subfamily")
#    family <- split(tf.family,f = tf.family[[col]])
#  
#    motif.relevant.TFs <- plyr::alply(tf.family,1, function(x){
#      f <- x[[col]]
#      if(f == "") return(x$`Transcription factor`) # Case without family, we will get only the same object
#      return(unique(family[as.character(f)][[1]]$`Transcription factor`))
#    },.progress = "text")
#    #names(motif.relevant.TFs) <- tf.family$`Transcription factor`
#    names(motif.relevant.TFs) <- tf.family$Model
#    # Cleaning object
#    attr(motif.relevant.TFs,which="split_type") <- NULL
#    attr(motif.relevant.TFs,which="split_labels") <- NULL
#  
#    return(motif.relevant.TFs)
#  }
#  
#  updateTFClassList <- function(tf.list, classification = "family"){
#    col <- ifelse(classification == "family","family.name","subfamily.name")
#    TFclass <- getTFClass()
#    # Hocomoco
#    tf.family <- "http://hocomoco11.autosome.ru/human/mono?full=true" %>% read_html()  %>%  html_table()
#    tf.family <- tf.family[[1]]
#  
#    tf.members <- plyr::alply(unique(TFclass %>% pull(col)),1, function(x){
#      TFclass$Gene[which(x == TFclass[,col])]
#    },.progress = "text")
#    names(tf.members) <- unique(TFclass %>% pull(col))
#    attr(tf.members,which="split_type") <- NULL
#    attr(tf.members,which="split_labels") <- NULL
#  
#    for(i in names(tf.list)){
#      x <- tf.family[tf.family$Model == i,"Transcription factor"]
#      idx <- which(sapply(lapply(tf.members, function(ch) grep(paste0("^",x,"$"), ch)), function(x) length(x) > 0))
#      if(length(idx) == 0) next
#      members <- tf.members[[idx]]
#      tf.list[[i]] <- sort(unique(c(tf.list[[i]],members)))
#    }
#    return(tf.list)
#  }
#  
#  getTFClass <- function(){
#    # get TF classification
#    file <- "TFClass.rda"
#    if(file.exists(file)) {
#      return(get(load(file)))
#    }
#    file <- "http://tfclass.bioinf.med.uni-goettingen.de/suppl/tfclass.ttl.gz"
#    downloader::download(file,basename(file))
#    char_vector <- readLines(basename(file))
#    # Find TF idx
#    idx <- grep("genus",char_vector,ignore.case = T)
#  
#    # get TF names
#    TF <- char_vector[sort(c( idx +1, idx + 2, idx + 4))]
#    TF <- TF[-grep("LOGO_|rdf:type",TF)]
#    TF <- gsub("  rdfs:label | ;| rdfs:subClassOf <http://sybig.de/tfclass#|>","",TF)
#    TF <- stringr::str_trim(gsub('"', '', TF))
#    TF <- tibble::as.tibble(t(matrix(TF,nrow = 2)))
#    colnames(TF) <- c("Gene", "class")
#  
#    # Get family and subfamily classification
#    family.pattern <-  "^<http://sybig.de/tfclass#[0-9]+\\.[0-9]+\\.[0-9]+>"
#  
#    idx <- grep(family.pattern,char_vector)
#    family.names <- char_vector[ sort(c(idx,idx+ 2))]
#    family.names <- gsub("  rdfs:label | ;| rdfs:subClassOf <http://sybig.de/tfclass#|>|<http://sybig.de/tfclass#| rdf:type owl:Class","",family.names)
#    family.names <- stringr::str_trim(gsub('"', '', family.names))
#    family.names <- tibble::as.tibble(t(matrix(family.names,nrow = 2)))
#    colnames(family.names) <- c("family", "family.name")
#  
#  
#    subfamily.pattern <-  "^<http://sybig.de/tfclass#[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+>"
#  
#    idx <- grep(subfamily.pattern,char_vector)
#    subfamily.names <- char_vector[ sort(c(idx,idx+ 2))]
#    subfamily.names <- gsub("  rdfs:label | ;| rdfs:subClassOf <http://sybig.de/tfclass#|>|<http://sybig.de/tfclass#| rdf:type owl:Class","",subfamily.names)
#    subfamily.names <- stringr::str_trim(gsub('"', '', subfamily.names))
#    subfamily.names <- tibble::as.tibble(t(matrix(subfamily.names,nrow = 2)))
#    colnames(subfamily.names) <- c("subfamily", "subfamily.name")
#    subfamily.names$family <- stringr::str_sub(subfamily.names$subfamily,1,5)
#  
#    classification <- left_join(family.names,subfamily.names)
#    classification$class <- ifelse(is.na(classification$subfamily),classification$family,classification$subfamily)
#  
#    # Add classification to TF list
#    TFclass <- left_join(TF,classification)
#  
#    # Break ( into multiple cases)
#    m <- grep("\\(|/",TFclass$Gene)
#    df <- NULL
#    for(i in m){
#      gene <- sort(stringr::str_trim(unlist(stringr::str_split(TFclass$Gene[i],"\\(|,|\\)|/"))))
#      gene <- gene[stringr::str_length(gene) > 0]
#      aux <- TFclass[rep(i,length(gene)),]
#      aux$Gene <- gene
#      df <- rbind(df,aux)
#    }
#    TFclass <- rbind(TFclass,df)
#    TFclass <- TFclass[!duplicated(TFclass),]
#  
#    # Break ( into multiple cases)
#    m <- grep("-",TFclass$Gene)
#    df <- NULL
#    for(i in m){
#      gene <- gsub("-","",sort(stringr::str_trim(unlist(stringr::str_split(TFclass$Gene[i],"\\(|,|\\)|/")))))
#      gene <- gene[stringr::str_length(gene) > 0]
#      aux <- TFclass[rep(i,length(gene)),]
#      aux$Gene <- gene
#      df <- rbind(df,aux)
#    }
#    TFclass <- rbind(TFclass,df)
#  
#    df <- NULL
#    for(i in 1:length(TFclass$Gene)){
#      m <- TFclass$Gene[i]
#      gene <- unique(c(toupper(alias2Symbol(toupper(m))),toupper(m),toupper(alias2Symbol(m))))
#      if(all(gene %in% TFclass$Gene)) next
#      aux <- TFclass[rep(i,length(gene)),]
#      aux$Gene <- gene
#      df <- rbind(df,aux)
#    }
#    TFclass <- rbind(TFclass,df)
#    TFclass <- TFclass[!duplicated(TFclass),]
#    TFclass <- TFclass[TFclass$Gene %in% human.TF$external_gene_name,]
#    save(TFclass,file = "TFClass.rda")
#    return(TFclass)
#  }
#  TF.family <- createMotifRelevantTfs("family")
#  TF.family <- updateTFClassList(TF.family,"family")
#  TF.subfamily <- createMotifRelevantTfs("subfamily")
#  TF.subfamily <- updateTFClassList(TF.subfamily,classification = "subfamily")
#  save(TF.family,file = "~/ELMER.data/data/TF.family.rda", compress = "xz")
#  save(TF.subfamily,file = "~/ELMER.data/data/TF.subfamily.rda", compress = "xz")

## ----eval=FALSE, include=TRUE-------------------------------------------------
#  hocomoco.table <- "http://hocomoco11.autosome.ru/human/mono?full=true" %>% read_html()  %>%  html_table()
#  hocomoco.table <- hocomoco.table[[1]]
#  save(hocomoco.table,file = "data/hocomoco.table.rda", compress = "xz")

## -----------------------------------------------------------------------------
data("Probes.motif.hg19.450K")
dim(Probes.motif.hg19.450K)
str(Probes.motif.hg19.450K)

## -----------------------------------------------------------------------------
data("Probes.motif.hg38.450K")
dim(Probes.motif.hg38.450K)
str(Probes.motif.hg38.450K)

## -----------------------------------------------------------------------------
data("Probes.motif.hg19.EPIC")
dim(Probes.motif.hg19.EPIC)
str(Probes.motif.hg19.EPIC)

## -----------------------------------------------------------------------------
data("Probes.motif.hg38.EPIC")
dim(Probes.motif.hg38.EPIC)
str(Probes.motif.hg38.EPIC)

## ----eval=FALSE, include=TRUE-------------------------------------------------
#  getInfiniumAnnotation <- function(plat = "450K", genome = "hg38"){
#    message("Loading object: ",file)
#    newenv <- new.env()
#    if(plat == "EPIC" & genome == "hg19") data("EPIC.hg19.manifest", package = "ELMER.data",envir=newenv)
#    if(plat == "EPIC" & genome == "hg38") data("EPIC.hg38.manifest", package = "ELMER.data",envir=newenv)
#    if(plat == "450K" & genome == "hg19") data("hm450.hg19.manifest", package = "ELMER.data",envir=newenv)
#    if(plat == "450K" & genome == "hg38") data("hm450.hg38.manifest", package = "ELMER.data",envir=newenv)
#    annotation <- get(ls(newenv)[1],envir=newenv)
#    return(annotation)
#  }
#  # To find for each probe the know motif we will use HOMER software (http://homer.salk.edu/homer/)
#  # Step:
#  # 1 - get DNA methylation probes annotation with the regions
#  # 2 - Make a bed file from it
#  # 3 - Execute section: Finding Instance of Specific Motifs from http://homer.salk.edu/homer/ngs/peakMotifs.html to the HOCOMOCO TF motifs
#  # Also, As HOMER is using more RAM than the available we will split the files in to 100k probes.
#  # Obs: for each probe we create a winddow of 500 bp (-size 500) around it. This might lead to false positives, but will not have false negatives.
#  # The false posives will be removed latter with some statistical tests.
#  TFBS.motif <- "http://hocomoco11.autosome.ru/final_bundle/hocomoco11/full/HUMAN/mono/HOCOMOCOv11_full_HUMAN_mono_homer_format_0.0001.motif"
#  if(!file.exists(basename(TFBS.motif))) downloader::download(TFBS.motif,basename(TFBS.motif))
#  for(plat in c("EPIC","450K")){
#    for(gen in c("hg38","hg19")){
#  
#      file <- paste0(plat,gen,".txt")
#      print(file)
#      if(!file.exists(file)){
#        # STEP 1
#        gr <- getInfiniumAnnotation(plat = plat,genome =  gen)
#  
#        # This will remove masked probes. They have poor quality and might be arbitrarily positioned (Wanding Zhou)
#        gr <- gr[!gr$MASK_general]
#  
#        df <- data.frame(seqnames=seqnames(gr),
#                         starts=as.integer(start(gr)),
#                         ends=end(gr),
#                         names=names(gr),
#                         scores=c(rep(".", length(gr))),
#                         strands=strand(gr))
#        step <- 10000 # nb of lines in each file. 10K was selected to not explode RAM
#        n <- nrow(df)
#        pb <- txtProgressBar(max = floor(n/step), style = 3)
#  
#        for(j in 0:floor(n/step)){
#          setTxtProgressBar(pb, j)
#          # STEP 2
#          file.aux <- paste0(plat,gen,"_",j,".bed")
#          if(!file.exists(gsub(".bed",".txt",file.aux))){
#            end <- ifelse(((j + 1) * step) > n, n,((j + 1) * step))
#            write.table(df[((j * step) + 1):end,], file = file.aux, col.names = F, quote = F,row.names = F,sep = "\t")
#  
#            # STEP 3 use -mscore to get scores
#            cmd <- paste("source ~/.bash_rc; annotatePeaks.pl" ,file.aux, gen, "-m", basename(TFBS.motif), "-size 500 -cpu 12 >", gsub(".bed",".txt",file.aux))
#            system(cmd)
#          }
#        }
#      }
#      close(pb)
#      # We will merge the results from each file into one
#      peaks <- NULL
#      pb <- txtProgressBar(max = floor(n/step), style = 3)
#      for(j in 0:floor(n/step)){
#        setTxtProgressBar(pb, j)
#        aux <-  readr::read_tsv(paste0(plat,gen,"_",j,".txt"))
#        colnames(aux)[1] <- "PeakID"
#        if(is.null(peaks)) {
#          peaks <- aux
#        } else {
#          peaks <- rbind(peaks, aux)
#        }
#      }
#      close(pb)
#      print("Writing file...")
#      readr::write_tsv(peaks,path=file,col_names = TRUE)
#      print("DONE!")
#      gc()
#    }
#  }
#  
#  getMatrix <- function(filename) {
#    motifs <- readr::read_tsv(file)
#    # From 1 to 21 we have annotations
#    matrix <- Matrix::Matrix(0, nrow = nrow(motifs), ncol = ncol(motifs) - 21 ,sparse = TRUE)
#    colnames(matrix) <- gsub(" Distance From Peak\\(sequence,strand,conservation\\)","",colnames(motifs)[-c(1:21)])
#    rownames(matrix) <- motifs$PeakID
#    matrix[!is.na(motifs[,-c(1:21)])] <- 1
#    matrix <- as(matrix, "nsparseMatrix")
#    return(matrix)
#  }
#  
#  for(plat in c("EPIC","450K")){
#    for(gen in c("hg19","hg38")){
#      file <- paste0(plat,gen,".txt")
#  
#      if(file == "450Khg19.txt"){
#        if(file.exists("Probes.motif.hg19.450K.rda")) next
#        Probes.motif.hg19.450K <- getMatrix(file)
#        save(Probes.motif.hg19.450K, file = "Probes.motif.hg19.450K.rda", compress = "xz")
#        rm(Probes.motif.hg19.450K)
#      }
#      if(file == "450Khg38.txt"){
#        if(file.exists("Probes.motif.hg38.450K.rda")) next
#        Probes.motif.hg38.450K <- getMatrix(file)
#        save(Probes.motif.hg38.450K, file = "Probes.motif.hg38.450K.rda", compress = "xz")
#        rm(Probes.motif.hg38.450K)
#      }
#  
#      if(file == "EPIChg19.txt"){
#        if(file.exists("Probes.motif.hg19.EPIC.rda")) next
#        Probes.motif.hg19.EPIC <- getMatrix(file)
#        save(Probes.motif.hg19.EPIC, file = "Probes.motif.hg19.EPIC.rda", compress = "xz")
#        rm(Probes.motif.hg19.EPIC)
#      }
#  
#      if(file == "EPIChg38.txt"){
#        if(file.exists("Probes.motif.hg38.EPIC.rda")) next
#  
#        Probes.motif.hg38.EPIC <- getMatrix(file)
#        save(Probes.motif.hg38.EPIC, file = "Probes.motif.hg38.EPIC.rda", compress = "xz")
#        rm(Probes.motif.hg38.EPIC)
#      }
#    }
#  }

## -----------------------------------------------------------------------------
data("Probes.motif.hg19.450K")
as.data.frame(as.matrix(Probes.motif.hg19.450K[1:20,1:20])) %>% 
  datatable(options = list(scrollX = TRUE,pageLength = 5)) 

## ----eval=FALSE, include=TRUE-------------------------------------------------
#  human.TF <- readr::read_csv("http://humantfs.ccbr.utoronto.ca/download/v_1.01/DatabaseExtract_v_1.01.csv")
#  human.TF <- human.TF[human.TF$`Is TF?` == "Yes",]
#  colnames(human.TF)[1:2] <- c("ensembl_gene_id","external_gene_name")
#  save(human.TF,file = "~/ELMER.data/data/human.TF.rda",compress = "xz")

## -----------------------------------------------------------------------------
data("human.TF")
as.data.frame(human.TF) %>% 
  datatable(options = list(scrollX = TRUE,pageLength = 5)) 

## ----sessionInfo--------------------------------------------------------------
sessionInfo()