## ----setup, include = FALSE--------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----getPackage, eval=FALSE--------------------------------------------------- # if (!requireNamespace("BiocManager", quietly = TRUE)) # install.packages("BiocManager") # BiocManager::install("cicero") ## ----eval = FALSE------------------------------------------------------------- # BiocManager::install(cole-trapnell-lab/cicero) ## ----Load, message=FALSE------------------------------------------------------ library(cicero) ## ----------------------------------------------------------------------------- data(cicero_data) ## ----eval=TRUE---------------------------------------------------------------- input_cds <- make_atac_cds(cicero_data, binarize = TRUE) ## ----eval=TRUE---------------------------------------------------------------- set.seed(2017) input_cds <- detectGenes(input_cds) input_cds <- estimateSizeFactors(input_cds) input_cds <- reduceDimension(input_cds, max_components = 2, num_dim=6, reduction_method = 'tSNE', norm_method = "none") ## ----eval=FALSE--------------------------------------------------------------- # tsne_coords <- t(reducedDimA(input_cds)) # row.names(tsne_coords) <- row.names(pData(input_cds)) # cicero_cds <- make_cicero_cds(input_cds, reduced_coordinates = tsne_coords) ## ----eval=FALSE--------------------------------------------------------------- # data("human.hg19.genome") # sample_genome <- subset(human.hg19.genome, V1 == "chr18") # sample_genome$V2[1] <- 10000000 # conns <- run_cicero(cicero_cds, sample_genome, sample_num = 2) # Takes a few minutes to run # head(conns) ## ----fig.width = 7, fig.height = 4, fig.align='center', eval=FALSE------------ # data(gene_annotation_sample) # plot_connections(conns, "chr18", 8575097, 8839855, # gene_model = gene_annotation_sample, # coaccess_cutoff = .25, # connection_width = .5, # collapseTranscripts = "longest" ) ## ----eval=FALSE--------------------------------------------------------------- # chia_conns <- data.frame(Peak1 = c("chr18_10000_10200", "chr18_10000_10200", # "chr18_49500_49600"), # Peak2 = c("chr18_10600_10700", "chr18_111700_111800", # "chr18_10600_10700")) # # conns$in_chia <- compare_connections(conns, chia_conns) ## ----eval=FALSE--------------------------------------------------------------- # conns$in_chia_100 <- compare_connections(conns, chia_conns, maxgap=100) # # head(conns) ## ----eval=FALSE--------------------------------------------------------------- # # Add a column of 1s called "coaccess" # chia_conns <- data.frame(Peak1 = c("chr18_10000_10200", "chr18_10000_10200", # "chr18_49500_49600"), # Peak2 = c("chr18_10600_10700", "chr18_111700_111800", # "chr18_10600_10700"), # coaccess = c(1, 1, 1)) # # plot_connections(conns, "chr18", 10000, 112367, # gene_model = gene_annotation_sample, # coaccess_cutoff = 0, # connection_width = .5, # comparison_track = chia_conns, # include_axis_track = FALSE, # collapseTranscripts = "longest") ## ----eval=FALSE--------------------------------------------------------------- # CCAN_assigns <- generate_ccans(conns) # # head(CCAN_assigns) ## ----eval=FALSE--------------------------------------------------------------- # # #### Add a column for the pData table indicating the gene if a peak is a promoter #### # # Create a gene annotation set that only marks the transcription start sites of # # the genes. We use this as a proxy for promoters. # # To do this we need the first exon of each transcript # pos <- subset(gene_annotation_sample, strand == "+") # pos <- pos[order(pos$start),] # pos <- pos[!duplicated(pos$transcript),] # remove all but the first exons per transcript # pos$end <- pos$start + 1 # make a 1 base pair marker of the TSS # # neg <- subset(gene_annotation_sample, strand == "-") # neg <- neg[order(neg$start, decreasing = TRUE),] # neg <- neg[!duplicated(neg$transcript),] # remove all but the first exons per transcript # neg$start <- neg$end - 1 # # gene_annotation_sub <- rbind(pos, neg) # # # Make a subset of the TSS annotation columns containing just the coordinates # # and the gene name # gene_annotation_sub <- gene_annotation_sub[,c(1:3, 8)] # # # Rename the gene symbol column to "gene" # names(gene_annotation_sub)[4] <- "gene" # # input_cds <- annotate_cds_by_site(input_cds, gene_annotation_sub) # # head(fData(input_cds)) # # # #### Generate gene activity scores #### # # generate unnormalized gene activity matrix # unnorm_ga <- build_gene_activity_matrix(input_cds, conns) # # # remove any rows/columns with all zeroes # unnorm_ga <- unnorm_ga[!Matrix::rowSums(unnorm_ga) == 0, !Matrix::colSums(unnorm_ga) == 0] # # # make a list of num_genes_expressed # num_genes <- pData(input_cds)$num_genes_expressed # names(num_genes) <- row.names(pData(input_cds)) # # # normalize # cicero_gene_activities <- normalize_gene_activities(unnorm_ga, num_genes) # # # if you had two datasets to normalize, you would pass both: # # num_genes should then include all cells from both sets # unnorm_ga2 <- unnorm_ga # cicero_gene_activities <- normalize_gene_activities(list(unnorm_ga, unnorm_ga2), num_genes) # ## ----eval=TRUE---------------------------------------------------------------- data("cicero_data") input_cds <- make_atac_cds(cicero_data) # Add some cell meta-data data("cell_data") pData(input_cds) <- cbind(pData(input_cds), cell_data[row.names(pData(input_cds)),]) pData(input_cds)$cell <- NULL agg_cds <- aggregate_nearby_peaks(input_cds, distance = 10000) agg_cds <- detectGenes(agg_cds) agg_cds <- estimateSizeFactors(agg_cds) agg_cds <- estimateDispersions(agg_cds) ## ----eval=TRUE---------------------------------------------------------------- # This takes a few minutes to run diff_timepoint <- differentialGeneTest(agg_cds, fullModelFormulaStr="~timepoint + num_genes_expressed") # We chose a very high q-value cutoff because there are # so few sites in the sample dataset, in general a q-value # cutoff in the range of 0.01 to 0.1 would be appropriate ordering_sites <- row.names(subset(diff_timepoint, qval < .5)) length(ordering_sites) ## ----fig.show='hold', eval=FALSE---------------------------------------------- # plot_pc_variance_explained(agg_cds, return_all = FALSE) #Choose 2 PCs # agg_cds <- reduceDimension(agg_cds, # max_components = 2, # norm_method = 'log', # num_dim = 2, # reduction_method = 'tSNE', # verbose = TRUE) # # agg_cds <- clusterCells(agg_cds, verbose = FALSE) # # plot_cell_clusters(agg_cds, color_by = 'as.factor(Cluster)') + theme(text = element_text(size=8)) # clustering_DA_sites <- differentialGeneTest(agg_cds[1:10,], #Subset for example only # fullModelFormulaStr = '~Cluster') # # # Not run because using Option 1 to continue # # ordering_sites <- # # row.names(clustering_DA_sites)[order(clustering_DA_sites$qval)][1:1000] # ## ----eval=TRUE---------------------------------------------------------------- agg_cds <- setOrderingFilter(agg_cds, ordering_sites) ## ----fig.align='center', fig.height=4, fig.width=4, eval=TRUE----------------- agg_cds <- reduceDimension(agg_cds, max_components = 2, residualModelFormulaStr="~num_genes_expressed", reduction_method = 'DDRTree') # skipped due to monocle bug # agg_cds <- suppressWarnings(orderCells(agg_cds)) plot_cell_trajectory(agg_cds, color_by = "timepoint") ## ----fig.align='center', fig.height=4, fig.width=4, eval=TRUE----------------- # skipped due to monocle bug #plot_cell_trajectory(agg_cds, color_by = "State") ## ----fig.align='center', fig.height=4, fig.width=4, eval=TRUE----------------- # skipped due to monocle bug #agg_cds <- suppressWarnings(orderCells(agg_cds, root_state = 4)) #plot_cell_trajectory(agg_cds, color_by = "Pseudotime") ## ----fig.align='center', fig.height=4, fig.width=4, eval=TRUE----------------- # skipped due to monocle bug #pData(input_cds)$Pseudotime <- pData(agg_cds)[colnames(input_cds),]$Pseudotime #pData(input_cds)$State <- pData(agg_cds)[colnames(input_cds),]$State ## ----fig.width = 3, fig.height = 4, fig.align='center', eval=TRUE------------- # skipped due to monocle bug #input_cds_lin <- input_cds[,row.names(subset(pData(input_cds), State != 5))] #plot_accessibility_in_pseudotime(input_cds_lin[c("chr18_38156577_38158261", # "chr18_48373358_48374180", # "chr18_60457956_60459080")]) ## ----eval=TRUE---------------------------------------------------------------- # skipped due to monocle bug #pData(input_cds_lin)$cell_subtype <- cut(pData(input_cds_lin)$Pseudotime, 10) #binned_input_lin <- aggregate_by_cell_bin(input_cds_lin, "cell_subtype") ## ----eval=TRUE---------------------------------------------------------------- # skipped due to monocle bug #diff_test_res <- differentialGeneTest(binned_input_lin[1:10,], #Subset for example only # fullModelFormulaStr="~sm.ns(Pseudotime, df=3) + sm.ns(num_genes_expressed, df=3)", # reducedModelFormulaStr="~sm.ns(num_genes_expressed, df=3)", cores=1) #head(diff_test_res) ## ----------------------------------------------------------------------------- citation("cicero") ## ----------------------------------------------------------------------------- sessionInfo()