## ----include = FALSE----------------------------------------------------------
# Script extracted from a knitr vignette. It pulls the FlyFactorSurvey motifs
# out of MotifDb, converts them to a data.frame-like table, tidies the
# `name` / `altname` metadata columns, removes duplicate motifs, and converts
# the result back into a motif list.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----setup--------------------------------------------------------------------
library(memes)
library(magrittr)
library(universalmotif)

## -----------------------------------------------------------------------------
# Restrict the MotifDb collection to the FlyFactorSurvey entries
flyFactorDb <- MotifDb::MotifDb %>%
  MotifDb::query("FlyFactorSurvey")

## -----------------------------------------------------------------------------
flyFactorMotifs <- flyFactorDb %>%
  convert_motifs()

## -----------------------------------------------------------------------------
flyFactorMotifs %>%
  head(1)

## -----------------------------------------------------------------------------
# Tabular form of the motif list so the metadata can be edited with dplyr/tidyr
flyFactor_data <- flyFactorMotifs %>%
  to_df()

## -----------------------------------------------------------------------------
# The following columns can be changed to update motif metadata
flyFactor_data %>%
  names()

## -----------------------------------------------------------------------------
flyFactor_data %>%
  head(5)

## -----------------------------------------------------------------------------
# TRUE only when every `altname` value is unique
length(flyFactor_data$altname) == length(unique(flyFactor_data$altname))

## -----------------------------------------------------------------------------
# Swap the contents of the `name` and `altname` columns
flyFactor_data %<>%
  dplyr::rename(
    "altname" = "name",
    "name" = "altname"
  )

## -----------------------------------------------------------------------------
flyFactor_data %>%
  head(3)

## -----------------------------------------------------------------------------
flyFactor_data %<>%
  # Critical to set remove = FALSE to keep the `name` column
  tidyr::separate(name, c("tfid"), remove = FALSE, extra = "drop") %>%
  # Only use the tfid if the altname contains an FBgn
  dplyr::mutate(altname = ifelse(grepl("^FBgn", altname), tfid, altname))

## -----------------------------------------------------------------------------
flyFactor_data %>%
  head(3)

## -----------------------------------------------------------------------------
# Strip the "_FBgn<digits>" portion from `name`
flyFactor_data %<>%
  dplyr::mutate(name = gsub("_FBgn\\d+", "", name))

## -----------------------------------------------------------------------------
flyFactor_data %>%
  dplyr::filter(altname != tfid) %>%
  # I'm only showing the first 5 rows for brevity, but take a look at the full
  # data and see what patterns you notice
  head(5)

## -----------------------------------------------------------------------------
flyFactor_data %>%
  # calling tolower() on both columns removes capitalization as a difference
  dplyr::filter(
    tolower(altname) != tolower(tfid),
    # Select all altnames that do not contain "-", "." or "("
    !grepl("-|\\.|\\(", altname)
  ) %>%
  # I'll visualize only these columns for brevity
  dplyr::select(altname, tfid, name, consensus) %>%
  head(10)

## -----------------------------------------------------------------------------
flyFactor_data %<>%
  # rename all "da" instances using their tfid value instead
  dplyr::mutate(altname = ifelse(altname == "da", tfid, altname))

## -----------------------------------------------------------------------------
flyFactor_data %>%
  dplyr::filter(
    tolower(altname) != tolower(tfid),
    !grepl("-|\\.|\\(", altname)
  ) %>%
  dplyr::select(altname, tfid, name, consensus) %>%
  head(10)

## -----------------------------------------------------------------------------
flyFactor_data %>%
  dplyr::filter(
    tolower(altname) != tolower(tfid),
    !grepl("-|\\.|\\(", altname),
    # Remove CG genes from consideration
    !grepl("CG\\d+", tfid)
  ) %>%
  dplyr::select(altname, tfid, name, consensus)

## -----------------------------------------------------------------------------
# altname values to replace with their tfid, and altname values to drop
swap_alt_id <- c("CG6272", "Clk", "Max", "Mnt", "Jra")
# NOTE: this variable shares its name with base::remove(); the name is kept
# unchanged so any later code referring to it keeps working.
remove <- "Bgb"

flyFactor_data %<>%
  dplyr::mutate(altname = ifelse(altname %in% swap_alt_id, tfid, altname)) %>%
  dplyr::filter(!(altname %in% remove))

## -----------------------------------------------------------------------------
# Same check as before the swap/removal, to inspect what mismatches remain
flyFactor_data %>%
  dplyr::filter(
    tolower(altname) != tolower(tfid),
    !grepl("-|\\.|\\(", altname),
    # Remove CG genes from consideration
    !grepl("CG\\d+", tfid)
  ) %>%
  dplyr::select(altname, tfid, name, consensus)

## -----------------------------------------------------------------------------
# Inspect the rows that share this consensus sequence
flyFactor_data %>%
  dplyr::filter(consensus == "MMCACCTGYYV")

## -----------------------------------------------------------------------------
# This operation takes a while to run on large motif lists
flyFactor_dedup <- remove_duplicate_motifs(flyFactor_data)

## -----------------------------------------------------------------------------
# Rows before cleanup
nrow(flyFactor_data)
# Rows after cleanup
nrow(flyFactor_dedup)

## -----------------------------------------------------------------------------
flyFactor_dedup %>%
  dplyr::filter(consensus == "MMCACCTGYYV")

## -----------------------------------------------------------------------------
# extrainfo = FALSE drops the extra columns we added during data cleaning which
# are now unneeded
flyFactorMotifs_final <- to_list(flyFactor_dedup, extrainfo = FALSE)

## -----------------------------------------------------------------------------
flyFactorMotifs_final %>%
  head(1)

## ----eval=FALSE,include=TRUE--------------------------------------------------
# write_meme(flyFactorMotifs_final, "flyFactorSurvey_cleaned.meme")

## -----------------------------------------------------------------------------
sessionInfo()