## ----setup, include=FALSE----------------------------------------------------- knitr::opts_chunk$set(fig.width=5, fig.height=5, message=FALSE, warning=FALSE) ## ----eval=FALSE--------------------------------------------------------------- # eh <- ExperimentHub() # ah <- AnnotationHub() # # some default resources: # seg <- eh[["EH7307"]] # pre-built genome segmentation for hg38 # exclude <- ah[["AH107305"]] # Kundaje excluded regions for hg38, see below # # set.seed(5) # set seed for reproducibility # blockLength <- 5e5 # size of blocks to bootstrap # R <- 10 # number of iterations of the bootstrap # # # input `ranges` require seqlengths, if missing see `GenomeInfoDb::Seqinfo` # seqlengths(ranges) # # next, we remove non-standard chromosomes ... # ranges <- keepStandardChromosomes(ranges, pruning.mode="coarse") # # ... and mitochondrial genome, as these are too short # seqlevels(ranges, pruning.mode="coarse") <- setdiff(seqlevels(ranges), "MT") # # # generate bootstraps # boots <- bootRanges(ranges, blockLength=blockLength, R=R, # seg=seg, exclude=exclude) # # `boots` can then be used with plyranges commands, e.g. join_overlap_* ## ----echo=FALSE--------------------------------------------------------------- knitr::include_graphics("images/bootRanges.jpeg") ## ----excluderanges------------------------------------------------------------ suppressPackageStartupMessages(library(AnnotationHub)) ah <- AnnotationHub() # hg38.Kundaje.GRCh38_unified_Excludable exclude_1 <- ah[["AH107305"]] # hg38.UCSC.centromere exclude_2 <- ah[["AH107354"]] # hg38.UCSC.telomere exclude_3 <- ah[["AH107355"]] # hg38.UCSC.short_arm exclude_4 <- ah[["AH107356"]] # combine them suppressWarnings({ exclude <- trim(c(exclude_1, exclude_2, exclude_3, exclude_4)) }) exclude <- sort(GenomicRanges::reduce(exclude)) ## ----message=FALSE, warning=FALSE--------------------------------------------- suppressPackageStartupMessages(library(ExperimentHub)) eh <- ExperimentHub() seg_cbs <- eh[["EH7307"]] # prefer CBS for hg38 seg_hmm <- eh[["EH7308"]] seg <- seg_cbs ## ----------------------------------------------------------------------------- suppressPackageStartupMessages(library(ensembldb)) suppressPackageStartupMessages(library(EnsDb.Hsapiens.v86)) edb <- EnsDb.Hsapiens.v86 filt <- AnnotationFilterList(GeneIdFilter("ENSG", "startsWith")) g <- genes(edb, filter = filt) ## ----------------------------------------------------------------------------- library(GenomeInfoDb) g <- keepStandardChromosomes(g, pruning.mode = "coarse") # MT is too small for bootstrapping, so must be removed seqlevels(g, pruning.mode="coarse") <- setdiff(seqlevels(g), "MT") # normally we would assign a new style, but for recent host issues # that produced vignette build problems, we use `paste0` ## seqlevelsStyle(g) <- "UCSC" seqlevels(g) <- paste0("chr", seqlevels(g)) genome(g) <- "hg38" g <- sortSeqlevels(g) g <- sort(g) table(seqnames(g)) ## ----------------------------------------------------------------------------- library(nullranges) suppressPackageStartupMessages(library(plyranges)) library(patchwork) ## ----seg-cbs, fig.width=5, fig.height=4--------------------------------------- set.seed(5) exclude <- exclude %>% plyranges::filter(width(exclude) >= 500) L_s <- 1e6 seg_cbs <- segmentDensity(g, n = 3, L_s = L_s, exclude = exclude, type = "cbs") plots <- lapply(c("ranges","barplot","boxplot"), function(t) { plotSegment(seg_cbs, exclude, type = t) }) plots[[1]] plots[[2]] + plots[[3]] ## ----fig.width=4, fig.height=3------------------------------------------------ region <- GRanges("chr16", IRanges(3e7,4e7)) plotSegment(seg_cbs, exclude, type="ranges", region=region) ## ----seg-hmm, fig.width=5, fig.height=4--------------------------------------- seg_hmm <- segmentDensity(g, n = 3, L_s = L_s, exclude = exclude, type = "hmm") plots <- lapply(c("ranges","barplot","boxplot"), function(t) { plotSegment(seg_hmm, exclude, type = t) }) plots[[1]] plots[[2]] + plots[[3]] ## ----------------------------------------------------------------------------- suppressPackageStartupMessages(library(nullrangesData)) dhs <- DHSA549Hg38() dhs <- dhs %>% plyranges::filter(signalValue > 100) %>% mutate(id = seq_along(.)) %>% plyranges::select(id, signalValue) length(dhs) table(seqnames(dhs)) ## ----------------------------------------------------------------------------- set.seed(5) # for reproducibility R <- 50 blockLength <- 5e5 boots <- bootRanges(dhs, blockLength, R = R, seg = seg, exclude=exclude) boots ## ----------------------------------------------------------------------------- suppressPackageStartupMessages(library(tidyr)) combined <- dhs %>% mutate(iter=0) %>% bind_ranges(boots) %>% plyranges::select(iter) stats <- combined %>% group_by(iter) %>% summarize(n = n()) %>% as_tibble() head(stats) ## ----warning=FALSE------------------------------------------------------------ suppressPackageStartupMessages(library(ggridges)) suppressPackageStartupMessages(library(purrr)) suppressPackageStartupMessages(library(ggplot2)) interdist <- function(dat) { x <- dat[-1,] y <- dat[-nrow(dat),] ifelse(x$seqnames == y$seqnames, x$start + floor((x$width - 1)/2) - y$start - floor((y$width - 1)/2), NA) } # just looking at first 3 iterations... combined %>% plyranges::filter(iter %in% 0:3) %>% mutate(iter = droplevels(iter)) %>% plyranges::select(iter) %>% as_tibble() %>% nest(data = !iter) %>% mutate(interdist = map(data, interdist)) %>% dplyr::select(iter, interdist) %>% unnest(interdist) %>% mutate(type = ifelse(iter == 0, "original", "boot"), interdist = pmax(interdist, 0)) %>% filter(!is.na(interdist)) %>% ggplot(aes(log10(interdist + 1), iter, fill=type)) + geom_density_ridges(alpha = 0.75) + geom_text(data = head(stats, 4), aes(x=1.5, y=iter, label=paste0("n=",n), fill=NULL), vjust=1.5) ## ----eval=FALSE--------------------------------------------------------------- # # pseudocode for the general paradigm: # boots <- bootRanges(y) # make bootstrapped y # x %>% join_overlap_inner(boots) %>% # overlaps of x with bootstrapped y # group_by(x_id, iter) %>% # collate by x ID and the bootstrap iteration # summarize(some_statistic = ...) %>% # compute some summary on metadata # as_tibble() %>% # pass to tibble # complete( # x_id, iter, # for any missing combinations of x ID and iter... # fill=list(some_statistic = 0) # ...fill in missing values # ) ## ----------------------------------------------------------------------------- x <- GRanges("chr2", IRanges(1 + 50:99 * 1e6, width=1e6), x_id=1:50) ## ----------------------------------------------------------------------------- x <- x %>% mutate(n_overlaps = count_overlaps(., dhs)) sum( x$n_overlaps ) ## ----------------------------------------------------------------------------- boot_stats <- x %>% join_overlap_inner(boots) %>% group_by(x_id, iter) %>% summarize(n_overlaps = n()) %>% as_tibble() %>% complete(x_id, iter, fill=list(n_overlaps = 0)) %>% group_by(iter) %>% summarize(sumOverlaps = sum(n_overlaps)) ## ----boot-histI--------------------------------------------------------------- suppressPackageStartupMessages(library(ggplot2)) ggplot(boot_stats, aes(sumOverlaps)) + geom_histogram(binwidth=5)+ geom_vline(xintercept = sum(x$n_overlaps), linetype = "dashed") ## ----------------------------------------------------------------------------- x_obs <- x %>% join_overlap_inner(dhs,maxgap=1e3) sum(x_obs$signalValue ) boot_stats <- x %>% join_overlap_inner(boots,maxgap=1e3) %>% group_by(x_id, iter) %>% summarize(Signal = sum(signalValue)) %>% as_tibble() %>% complete(x_id, iter, fill=list(Signal = 0)) %>% group_by(iter) %>% summarize(sumSignal = sum(Signal)) ## ----boot-histII-------------------------------------------------------------- ggplot(boot_stats, aes(sumSignal)) + geom_histogram()+ geom_vline(xintercept = sum(x_obs$signalValue), linetype = "dashed") ## ----eval=FALSE--------------------------------------------------------------- # ## split sparse count matrix into NumericList # # sc_rna <- rna_Granges[-which(rna.sd==0)] %>% # # mutate(counts1 = NumericList(asplit(rna.scaled, 1))) %>% sort() # # sc_promoter <- promoter_Granges[-which(promoter.sd==0)] %>% # # mutate(counts2 = NumericList(asplit(promoter.scaled, 1))) %>% sort() # suppressPackageStartupMessages(library(nullrangesData)) # data("sc_rna") # sc_rna # sc_rna$counts1 # nct <- length(sc_rna$counts1[[1]]) # print(paste("There are", nct, "cell types")) # data("sc_promoter") # ## bootstrap promoters # library(BSgenome.Hsapiens.UCSC.hg38) # genome <- BSgenome.Hsapiens.UCSC.hg38 # seqlengths(sc_promoter) <- seqlengths(genome)[1:22] # pull chrom lens from USCS # bootranges <- bootRanges(sc_promoter, blockLength = 5e5, R=50) ## ----eval=FALSE--------------------------------------------------------------- # cor_gr <- sc_rna %>% join_overlap_inner(sc_promoter, maxgap=1000) %>% # mutate(rho = 1/(nct-1) * sum(counts1 * counts2)) %>% # summarise(meanCor = mean(rho)) # ## mean correlation distribution # cor_boot <- sc_rna %>% # join_overlap_inner(bootranges, maxgap=1000) %>% # # vectorized code are 10 times faster than cor(counts1, counts2) for plyranges pipeline # mutate(rho = 1/(nct-1) * sum(counts1 * counts2)) %>% # select(rho, iter) %>% # group_by(iter) %>% # summarise(meanCor = mean(rho)) %>% # as.data.frame() # ggplot(cor_boot, aes(meanCor)) + # geom_histogram(binwidth=0.01)+ # geom_vline(xintercept = cor_gr$meanCor,linetype = "dashed")+ # theme_classic() ## ----eval=FALSE--------------------------------------------------------------- # library(tidySummarizedExperiment) # library(purrr) # # # make an SE where each row is an overlap # makeOverlapSE <- function(se_rna, se_promoter) { # idx <- rowRanges(se_rna) %>% join_overlap_inner(rowRanges(se_promoter),maxgap = 1000) # assay_x <- assay(se_rna, "rna")[ idx$gene, ] # assay_y <- assay(se_promoter, "promoter")[ idx$peak, ] # # this is needed to build the SE # rownames(assay_x) <- rownames(assay_y) <- seq_along( idx$gene ) # names(idx) <- seq_along( idx$gene ) # SummarizedExperiment( # assays=list(x=assay_x, y=assay_y), # rowRanges=idx # ) # } # # # create SE for observed data # se_rna <- SummarizedExperiment( # assays=list(rna=do.call(rbind,sc_rna$counts1)), # rowRanges=sc_rna) # se_promoter <- SummarizedExperiment( # assays=list(promoter=do.call(rbind,sc_promoter$counts2)), # rowRanges=sc_promoter) # # se <- makeOverlapSE(se_rna, se_promoter) # se <- se %>% # as_tibble() %>% # nest(data = -.feature) %>% # mutate(rho = map(data, # function(data) data %>% summarize(rho = cor(x, y)) # )) %>% # unnest(rho) %>% # select(-data) # # print(paste("mean correlation is", mean(se$rho))) ## ----eval=FALSE--------------------------------------------------------------- # # create SE for bootranges data # se_promoter_boots <- SummarizedExperiment( # assays=list(promoter=do.call(rbind,bootranges$counts2)), # rowRanges=bootranges) # se_boots <- makeOverlapSE(se_rna, se_promoter_boots) # # se_boots <- se_boots %>% # as_tibble() %>% # nest(data = -c(.feature,iter)) %>% # # vectorized code similar to cor(counts1, counts2) for tidySE pipeline # mutate(rho = map(data, # function(data) data %>% summarize(rho = cor(x, y)) # )) %>% # unnest(rho) %>% # select(-data) # # cor_boot <- se_boots %>% group_by(iter) %>% # summarise(meanCor = mean(rho)) # ggplot(cor_boot, aes(meanCor)) + # geom_histogram(binwidth=0.01)+ # geom_vline(xintercept = mean(se$rho),linetype = "dashed")+ # theme_classic() ## ----------------------------------------------------------------------------- suppressPackageStartupMessages(library(nullrangesData)) dhs <- DHSA549Hg38() region <- GRanges("chr1", IRanges(10e6 + 1, width=1e6)) x <- GRanges("chr1", IRanges(10e6 + 0:9 * 1e5 + 1, width=1e4)) y <- dhs %>% filter_by_overlaps(region) %>% select(NULL) x %>% mutate(num_overlaps = count_overlaps(., y)) ## ----------------------------------------------------------------------------- seg <- oneRegionSegment(region, seqlength=248956422) y <- keepSeqlevels(y, "chr1") set.seed(1) boot <- bootRanges(y, blockLength=1e5, R=1, seg=seg, proportionLength=FALSE) boot x %>% mutate(num_overlaps = count_overlaps(., boot)) ## ----------------------------------------------------------------------------- suppressPackageStartupMessages(library(plotgardener)) my_palette <- function(n) { head(c("red","green3","red3","dodgerblue", "blue2","green4","darkred"), n) } plotGRanges <- function(gr) { pageCreate(width = 5, height = 5, xgrid = 0, ygrid = 0, showGuides = TRUE) for (i in seq_along(seqlevels(gr))) { chrom <- seqlevels(gr)[i] chromend <- seqlengths(gr)[[chrom]] suppressMessages({ p <- pgParams(chromstart = 0, chromend = chromend, x = 0.5, width = 4*chromend/500, height = 2, at = seq(0, chromend, 50), fill = colorby("state_col", palette=my_palette)) prngs <- plotRanges(data = gr, params = p, chrom = chrom, y = 2 * i, just = c("left", "bottom")) annoGenomeLabel(plot = prngs, params = p, y = 0.1 + 2 * i) }) } } ## ----------------------------------------------------------------------------- library(GenomicRanges) seq_nms <- rep(c("chr1","chr2"), c(4,3)) seg <- GRanges( seqnames = seq_nms, IRanges(start = c(1, 101, 201, 401, 1, 201, 301), width = c(100, 100, 200, 100, 200, 100, 100)), seqlengths=c(chr1=500,chr2=400), state = c(1,2,1,3,3,2,1), state_col = factor(1:7) ) ## ----toysegments, fig.width=5, fig.height=4, eval=FALSE----------------------- # plotGRanges(seg) ## ----toyrangesI, fig.width=5, fig.height=4, eval=FALSE------------------------ # set.seed(1) # n <- 200 # gr <- GRanges( # seqnames=sort(sample(c("chr1","chr2"), n, TRUE)), # IRanges(start=round(runif(n, 1, 500-10+1)), width=10) # ) # suppressWarnings({ # seqlengths(gr) <- seqlengths(seg) # }) # gr <- gr[!(seqnames(gr) == "chr2" & end(gr) > 400)] # gr <- sort(gr) # idx <- findOverlaps(gr, seg, type="within", select="first") # gr <- gr[!is.na(idx)] # idx <- idx[!is.na(idx)] # gr$state <- seg$state[idx] # gr$state_col <- factor(seg$state_col[idx]) # plotGRanges(gr) ## ----toy-no-prop, fig.width=5, fig.height=4, eval=FALSE----------------------- # set.seed(1) # gr_prime <- bootRanges(gr, blockLength = 25, seg = seg, # proportionLength = FALSE) # plotGRanges(gr_prime) ## ----toy-prop, fig.width=5, fig.height=4, eval=FALSE-------------------------- # set.seed(1) # gr_prime <- bootRanges(gr, blockLength = 50, seg = seg, # proportionLength = TRUE) # plotGRanges(gr_prime) ## ----------------------------------------------------------------------------- sessionInfo()