## ----setup, include = FALSE--------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", crop = NULL, ## Related to https://stat.ethz.ch/pipermail/bioc-devel/2020-April/016656.html warning = FALSE ) ## ----'install1', eval = TRUE, message=FALSE, warning=FALSE-------------------- # if (!require("BiocManager", quietly = TRUE)) # install.packages("BiocManager") # BiocManager::install(version = "3.16") ## ----'install2', eval = TRUE, message=FALSE, warning=FALSE-------------------- # BiocManager::install("AnnotationHub", update = FALSE) # BiocManager::install("GenomicRanges", update = FALSE) # BiocManager::install("plyranges", update = FALSE) ## ----get-data----------------------------------------------------------------- suppressMessages(library(AnnotationHub)) ah <- AnnotationHub() query_data <- subset(ah, preparerclass == "CTCF") # Explore the AnnotationHub object query_data # Get the list of data providers query_data$dataprovider %>% table() ## ----------------------------------------------------------------------------- subset(query_data, species == "Homo sapiens" & genome == "hg38" & dataprovider == "JASPAR 2022") # Same for mm10 mouse genome # subset(query_data, species == "Mus musculus" & genome == "mm10" & dataprovider == "JASPAR 2022") ## ----------------------------------------------------------------------------- # hg38.JASPAR2022_CORE_vertebrates_non_redundant_v2 CTCF_hg38_all <- query_data[["AH104727"]] CTCF_hg38_all ## ----------------------------------------------------------------------------- # hg38.MA0139.1 CTCF_hg38 <- query_data[["AH104729"]] CTCF_hg38 ## ----------------------------------------------------------------------------- suppressMessages(library(plyranges)) CTCF_hg38_all <- CTCF_hg38_all %>% keepStandardChromosomes() %>% sort() CTCF_hg38 <- CTCF_hg38 %>% keepStandardChromosomes() %>% sort() ## ----eval=FALSE--------------------------------------------------------------- # # Note that rtracklayer::import and rtracklayer::export perform unexplained # # start coordinate conversion, likely related to 0- and 1-based coordinate # # system. We recommend converting GRanges to a data frame and save tab-separated # write.table(CTCF_hg38_all %>% sort() %>% as.data.frame(), # file = "CTCF_hg38_all.bed", # sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE) # write.table(CTCF_hg38 %>% sort() %>% as.data.frame(), # file = "CTCF_hg38.bed", # sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE) ## ----eval=FALSE--------------------------------------------------------------- # library(tracktables) # BiocManager::install("tracktables") # # Sample sheet metadata # SampleSheet <- data.frame(SampleName = c("CTCF all", "CTCF MA0139.1"), # Description = c("All CTCF matrices from JASPAR2022", # "MA0139.1 CTCF matrix from JASPAR2022")) # # File sheet linking files with sample names # FileSheet <- data.frame(SampleName = c("CTCF all", "CTCF MA0139.1"), # bigwig = c(NA, NA), # interval = c("CTCF_hg38_all.bed", "CTCF_hg38.bed"), # bam = c(NA, NA)) # # Creating an IGV session XML file # MakeIGVSession(SampleSheet, FileSheet, # igvdirectory = getwd(), "CTCF_from_JASPAR2022", "hg38") # ## ----echo=FALSE--------------------------------------------------------------- knitr::include_graphics("../man/figures/Figure_human_pvalues_threshold.png") ## ----------------------------------------------------------------------------- # Check length before filtering print(paste("Number of CTCF motifs at the default 1e-4 threshold:", length(CTCF_hg38))) # Filter and check length after filtering CTCF_hg38_filtered <- CTCF_hg38 %>% plyranges::filter(pvalue < 1e-6) print(paste("Number of CTCF motifs at the 1e-6 threshold:", length(CTCF_hg38_filtered))) # Similarly, filter CTCF_hg38_all_filtered <- CTCF_hg38_all %>% plyranges::filter(pvalue < 1e-6) ## ----------------------------------------------------------------------------- # Proportion of overlapping enrtries tmp <- findOverlaps(CTCF_hg38_all, CTCF_hg38_all) prop_overlap <- sort(table(queryHits(tmp)) %>% table(), decreasing = TRUE) sum(prop_overlap[which(names(prop_overlap) != "1")]) / length(CTCF_hg38_all) ## ----------------------------------------------------------------------------- tmp <- findOverlaps(CTCF_hg38, CTCF_hg38) prop_overlap <- sort(table(queryHits(tmp)) %>% table(), decreasing = TRUE) sum(prop_overlap[which(names(prop_overlap) != "1")]) / length(CTCF_hg38) ## ----------------------------------------------------------------------------- print(paste("Number of CTCF_hg38 motifs at the 1e-6 threshold AND reduced:", length(CTCF_hg38_filtered %>% reduce()))) print(paste("Number of CTCF_hg38_all motifs at the 1e-6 threshold AND reduced:", length(CTCF_hg38_all_filtered %>% reduce()))) ## ----echo=FALSE--------------------------------------------------------------- knitr::include_graphics("../man/figures/Figure_liftOverJaccard.png") ## ----echo=FALSE--------------------------------------------------------------- mtx <- read.csv("../man/tables/Table_PWMs.csv") knitr::kable(mtx) ## ----echo=FALSE--------------------------------------------------------------- knitr::include_graphics("../man/figures/Supplementary_Figure_1.png") ## ----echo=FALSE--------------------------------------------------------------- mtx <- read.csv("../man/tables/Table_Predefined.csv") knitr::kable(mtx) ## ----echo=FALSE--------------------------------------------------------------- mtx <- read.csv("../man/tables/Table_log.csv") knitr::kable(mtx) ## ----'citation', eval = requireNamespace('CTCF')------------------------------ print(citation("CTCF"), bibtex = TRUE) ## ----------------------------------------------------------------------------- sessionInfo()