## ----style, echo = FALSE, results = 'asis'------------------------------------ BiocStyle::markdown() ## ----BiocManager, eval=FALSE-------------------------------------------------- # ## Installation Bioconductor: # if (!require("BiocManager")) # install.packages("BiocManager") # BiocManager::install("seqpac") # # ## Installation github: # devtools::install_github("Danis102/seqpac", # dependencies=TRUE, ref="development") # # # ## ----eval=TRUE, echo=TRUE, warning=FALSE, message=FALSE, results = "hide"----- # Setup library(seqpac) sys_path = system.file("extdata", package = "seqpac", mustWork = TRUE) fastq <- list.files(path = sys_path, pattern = "fastq", all.files = FALSE, full.names = TRUE) ref_tRNA <- system.file("extdata/trna", "tRNA.fa", package = "seqpac", mustWork = TRUE) # Trim NEB adapter, generate counts and create PAC-object (pheno, anno, counts) count_list <- make_counts(fastq, plot = FALSE, parse="default_neb", trimming="seqpac") pac <- make_PAC(count_list$counts) pheno(pac)$groups <- c(rep("gr1",3), rep("gr2",3)) #Add groups to pheno table # Preprocess PAC-object and creat means pac <- PAC_norm(pac) # Default normalization is cpm pac <- PAC_filter(pac, size=c(15,60)) # Here, filter on sequence length pac <- PAC_summary(pac, norm = "cpm", type = "means", pheno_target=list("groups", unique(pheno(pac)$groups))) # Quickly align (annotate) and plot PAC-object map_tRNA <- PAC_mapper(pac, ref=ref_tRNA, override=TRUE) plts <- PAC_covplot(pac, map_tRNA, summary_target=list("cpmMeans_groups")) plts[[13]] ## ----eval=FALSE, echo=TRUE---------------------------------------------------- # ## Using default settings for NEB type adapter # # NEB= NEBNext® Small RNA Library Prep Set for Illumina (E7300/E7330) # # For illumina type adapters, 'parse="default_illumina"' may be used # # To speed things up we use 4 threads. # # count_list <- make_counts(input=fastq, trimming="seqpac", threads=4, # parse="default_neb") # ## ----eval=FALSE, echo=TRUE, warning=FALSE, message=FALSE, results = "hide"---- # ###--------------------------------------------------------------------- # ## Evidence over two independent samples, saving single sample sequences # ## reaching 3 counts # test <- make_counts(input=fastq, trimming="seqpac", # parse="default_neb", threads=3, # evidence=c(experiment=2, sample=3)) # extras <- apply(test$counts, 1, function(x){sum(!x==0)}) # test$counts[extras==1,] # A few still have 3 alone # ## ----eval=TRUE, echo=TRUE, warning=FALSE, message=FALSE, results = "hide"----- ###--------------------------------------------------------------------- ## Generate a Pheno table using the file names of test fastq Sample_ID <- colnames(count_list$counts) pheno_fl <- data.frame(Sample_ID=Sample_ID, Treatment=c(rep("heat", times=1), rep("control", times=2)), Batch=rep(c("1", "2", "3"), times=1)) pheno <- make_pheno(pheno=pheno_fl, progress_report=count_list$progress_report, counts=count_list$counts) ## ----eval=TRUE, echo=TRUE, warning=FALSE, message=FALSE, results = "hide"----- ###--------------------------------------------------------------------- ## Generate PAC object (default S4) pac_master <- make_PAC(pheno=pheno, counts=count_list$counts) pac_master ## If TRUE the PAC object is ok PAC_check(pac_master) ## ----eval=TRUE, fig.show=FALSE, results="hide", fig.keep='none'--------------- ###--------------------------------------------------------------------- ## Load the PAC-object and inspect the columns in Pheno and Anno library(seqpac) load(system.file("extdata", "drosophila_sRNA_pac_filt_anno.Rdata", package = "seqpac", mustWork = TRUE)) pheno(pac)[,1:5] anno(pac)[1:5,1:5] counts(pac)[1:5,1:5] ## This PAC is an S4 object, but you can easily turn it to an S3 list with as(). ## using as.PAC() will turn it back into S4: isS4(pac) pac_s3 <- as(pac, "list") lapply(pac_s3[1:3], head) isS4(pac_s3) pac <- as.PAC(pac_s3) methods(class="PAC") #all S4 methods ## ----eval=TRUE, echo=TRUE----------------------------------------------------- ###--------------------------------------------------------------------- ## Extracts all sequences between 20-30 nt in length with at least 5 counts ## in 20% of all samples. pac_filt <- PAC_filter(pac, size=c(20,30), threshold=5, coverage=20, norm = "counts", pheno_target=NULL, anno_target=NULL) ## ----eval=FALSE, echo=TRUE---------------------------------------------------- # ###--------------------------------------------------------------------- # ## Optionally, a simple coverage/threshold graph at different filter depths # ## can be plotted with stat=TRUE (but it will promt you for a question). # pac_filt <- PAC_filter(pac, threshold=5, coverage=20, # norm = "counts", stat=TRUE, # pheno_target=NULL, anno_target=NULL) ## ----eval=TRUE, echo=TRUE----------------------------------------------------- ###--------------------------------------------------------------------- ## Extracts all sequences with 22 nt size and the samples in Batch1 and Batch2. pac_filt <- PAC_filter(pac, subset_only = TRUE, pheno_target=list("batch", c("Batch1", "Batch2")), anno_target=list("Size", "22")) ###--------------------------------------------------------------------- ## Extracts sequences with >=5 counts in 100% of samples within each stage filtsep <- PAC_filtsep(pac, norm="counts", threshold=5, coverage=100, pheno_target= list("stage")) pac_filt <- PAC_filter(pac, subset_only = TRUE, anno_target= unique(do.call("c", as.list(filtsep)))) ## ----eval=FALSE, echo=TRUE---------------------------------------------------- # ###--------------------------------------------------------------------- # ## Venn diagram using the venneuler package # olap <- reshape2::melt(filtsep, # measure.vars = c("Stage1", "Stage3", "Stage5"), # na.rm=TRUE) # plot(venneuler::venneuler(olap[,c(2,1)])) # # ###--------------------------------------------------------------------- # ## Setting output="binary", prepares data for upset plots (UpSetR package): # filtsep_bin <- PAC_filtsep(pac, norm="counts", threshold=5, # coverage=100, pheno_target= list("stage"), # output="binary") # # UpSetR::upset(filtsep_bin, sets = colnames(filtsep_bin), # mb.ratio = c(0.55, 0.45), order.by = "freq", # keep.order=TRUE) # ## ----eval=TRUE, results=FALSE, message=FALSE, warning=FALSE, error=FALSE------ ###--------------------------------------------------------------------- ## Example normalization in Seqpac load(system.file("extdata", "drosophila_sRNA_pac_filt_anno.Rdata", package = "seqpac", mustWork = TRUE)) # PAC_norm works on both pac <- PAC_norm(pac, norm="cpm") pac <- PAC_norm(pac, norm="vst") pac ## ----results=FALSE, eval=TRUE------------------------------------------------- ###--------------------------------------------------------------------- ## Filtering using cpm instead of raw counts ## filter >=100 cpm in 100% of samples in >= 1 full group filtsep <- PAC_filtsep(pac, norm="cpm", threshold=100, coverage=100, pheno_target= list("stage")) pac_cpm_filt <- PAC_filter(pac, subset_only = TRUE, anno_target= unique(do.call("c", as.list(filtsep)))) ## ----results=FALSE, eval=TRUE------------------------------------------------- load(system.file("extdata", "drosophila_sRNA_pac_filt_anno.Rdata", package = "seqpac", mustWork = TRUE)) anno(pac) <- anno(pac)[,1, drop = FALSE] head(anno(pac)) ## ----eval = TRUE-------------------------------------------------------------- ###--------------------------------------------------------------------- ## Examples of generating bowtie indexes path <- "/some/path/to/your/folder" #Genome mycoplasma_path <- system.file("extdata/mycoplasma_genome", "mycoplasma.fa", package = "seqpac", mustWork = TRUE) Rbowtie::bowtie_build(mycoplasma_path, outdir=gsub("mycoplasma.fa", "", mycoplasma_path), prefix="mycoplasma", force=TRUE) #tRNA trna_path <- system.file("extdata/trna", "tRNA.fa", package = "seqpac", mustWork = TRUE) Rbowtie::bowtie_build(trna_path, outdir=gsub("tRNA.fa", "", trna_path), prefix="tRNA", force=TRUE) #rRNA rrna_path <- system.file("extdata/rrna", "rRNA.fa", package = "seqpac", mustWork = TRUE) Rbowtie::bowtie_build(rrna_path, outdir=gsub("rRNA.fa", "", rrna_path), prefix="rRNA", force=TRUE) ## ----eval=TRUE, results=FALSE, message=FALSE, warning=FALSE, error=FALSE------ # Lets reload our pac library(seqpac) load(system.file("extdata", "drosophila_sRNA_pac_filt_anno.Rdata", package = "seqpac", mustWork = TRUE)) anno(pac) <- anno(pac)[,1, drop = FALSE] # Remove previous Anno ###--------------------------------------------------------------------- ## Genome mapping # Path to your output folder outpath_genome <- "/some/path/to/reanno_folder" # or a temp folder outpath_genome <- paste0(tempdir(), "/seqpac_reanno/reanno_genome") # Look up the your own Bowtie indexed reference genomes (fasta) ref_paths <- list(myco="") # For testing, you can use a small mycoplasma genome provided with Seqpac # But remember to provide single genomes as a named list anyway. # The name will be used to keep track of your genome. myco_path <- system.file("extdata/mycoplasma_genome", "mycoplasma.fa", package = "seqpac", mustWork = TRUE) ref_paths <- list(myco=myco_path) # Run map_reanno map_reanno(PAC=pac, ref_paths=ref_paths, output_path=outpath_genome, type="internal", mismatches=1, import="genome", threads=8, keep_temp=TRUE) ## ----eval=TRUE, results=FALSE, message=FALSE, warning=FALSE, error=FALSE------ ###--------------------------------------------------------------------- ## sRNA mapping using internal bowtie # Path to output folder outpath_biotype <- "/some/path/to/reanno_biotype" # or a temp folder outpath_biotype <- paste0(tempdir(), "/seqpac_reanno/reanno_biotype") # Seqpac contains two fasta for tRNA and rRNA we can use for testing trna_path <- system.file("extdata/trna", "tRNA.fa", package = "seqpac", mustWork = TRUE) rrna_path <- system.file("extdata/rrna", "rRNA.fa", package = "seqpac", mustWork = TRUE) # Provide paths to bowtie indexed fasta references as a list ref_paths <- list(tRNA=trna_path, rRNA=rrna_path) map_reanno(pac, ref_paths=ref_paths, output_path=outpath_biotype, type="internal", mismatches=1, import="biotype", threads=6, keep_temp=TRUE) ## ----eval=TRUE,fig.show=FALSE,results="hide",message=FALSE,fig.keep='none'---- ###--------------------------------------------------------------------- ## Generate a reanno object with make_reanno reanno_genome <- make_reanno(outpath_genome, PAC=pac, mis_fasta_check = TRUE) reanno_biotype <- make_reanno(outpath_biotype, PAC=pac, mis_fasta_check = TRUE) # Note, setting mis_fasta_check=TRUE will double check that the number of # sequences that failed to receive an alignment in the last mismatch cycle # agrees with the number sequences in the reanno object without an annotation. # (these sequences are stored in mis_fasta_x.txt where x is max mismatches+1) # List structure str(reanno_genome, max.level = 3, give.attr = FALSE) str(reanno_biotype, max.level = 3, give.attr = FALSE) # Simple pie charts using the Overview table pie(table(overview(reanno_genome)$myco)) # Very few mycoplasma with 0 mismatch pie(table(overview(reanno_biotype)$Any)) # Many hits for either rRNA or tRNA ## ----eval=TRUE, results=FALSE, message=FALSE, warning=FALSE------------------- ###--------------------------------------------------------------------- ### Genomic coordinates using add_reanno # Output as separate table anno_genome <- add_reanno(reanno_genome, type="genome", genome_max=10, mismatches=1) # Output merged with provided PAC object pac <- add_reanno(reanno_genome, type="genome", genome_max=10, mismatches=1, merge_pac=pac) # Example of original reference name annotations head(full(reanno_genome)$mis1$myco) # Finished genome annotation head(anno_genome) head(anno(pac)) ## ----eval=TRUE, results=FALSE, message=FALSE, warning=FALSE------------------- ###--------------------------------------------------------------------- ## Classify sequences using add_reanno # Lets start by exploring the names in the original fasta reference: ref_path <- "/some/path/to/fasta_reference" # For testing use the the fasta reference for tRNA provided with Seqpac: trna_path <- system.file("extdata/trna", "tRNA.fa", package = "seqpac", mustWork = TRUE) # Read fasta names fasta <- names(Biostrings::readDNAStringSet(trna_path)) # Different naming standards table(substr(fasta, 1, 10)) # Starting with FBgn discriminate mitochondrial tRNA fasta[grepl("FBgn", fasta)] # The other are nuclear head(fasta[!grepl("FBgn", fasta)]) # We can use as 'mt:tRNA' as search term to catch mitochondrial tRNA # and '_tRNA' to catch nuclear. # A useful expression for extracting strings up to 1st white space is # fasta <- do.call("rbind", strsplit(fasta, " "))[,1] # Similarly load the rrna reference provided with seqpaq rrna_path <- system.file("extdata/rrna", "rRNA.fa", package = "seqpac", mustWork = TRUE) # Read fasta names fasta <- names(Biostrings::readDNAStringSet(rrna_path)) # Many rRNA subtypes table(substr(fasta, 1, 10)) # Lets try two search term lists directed against each reference and written as # regular expressions: # Names in the search list needs to be the same as in the reanno object head(overview(reanno_biotype)) # 'rRNA' and 'tRNA' with some capital letters # Generate a search list with search terms bio_search <- list( rRNA=c("5S", "5.8S", "12S", "16S", "18S", "28S", "pre_S45"), tRNA =c("_tRNA", "mt:tRNA") ) test <- add_reanno(reanno_biotype, bio_search=bio_search, type="biotype", bio_perfect=FALSE, mismatches = 1, merge_pac=pac) ## ----eval=FALSE, results=FALSE, message=FALSE, warning=FALSE------------------ # # Throws an error because perfect matching was required: # anno_temp <- add_reanno(reanno_biotype, bio_search=bio_search, # type="biotype", bio_perfect=TRUE, mismatches = 1) # ## ----eval=TRUE, results=FALSE, message=FALSE, warning=FALSE------------------- # References with no search term hits are classified as "Other": anno_temp <- add_reanno(reanno_biotype, bio_search=bio_search, type="biotype", bio_perfect=FALSE, mismatches = 1) # Increasing number of hits allowing mismatches table(anno_temp$mis0) table(anno_temp$mis1) # You may also add your new annotaion with a PAC object using the 'merge_pac' # option. For even more options see ?add_reanno. pac <- add_reanno(reanno_biotype, bio_search=bio_search, type="biotype", bio_perfect=FALSE, mismatches = 1, merge_pac=pac) head(anno(pac)) ## ----eval=TRUE, results=FALSE, message=FALSE, warning=FALSE------------------- ###--------------------------------------------------------------------- ## Hierarchical classification with simplify_reanno # Currently classification does not discriminate table(anno(pac)$mis3_bio) # Set the hierarchy and remember that it is order sensitive. # Here: S5_rRNA >> Other_rRNA >> mt:tRNA >> tRNA # Remember to use 'regular expressions' if you wish to catch all: hierarchy_1 <- list(rRNA_5S="rRNA_5S", Other_rRNA="5.8S|12S|16S|18S|28S|pre_S45|rRNA_other", Mt_tRNA="tRNA_mt:tRNA", tRNA="tRNA__tRNA" ) # What happens if you don't catch all: hierarchy_2 <- list(rRNA_5S="rRNA_5S", Mt_tRNA="tRNA_mt:tRNA", tRNA="tRNA__tRNA") # No mismatch allowed using hierarchy_1 test <- simplify_reanno(input=pac, hierarchy=hierarchy_1, mismatches=0, bio_name="Biotypes_mis0", merge_pac=FALSE) table(test) # All remaining rRNA are classified as 'Other_rRNA' # Instead using hierarchy_2 test <- simplify_reanno(input=pac, hierarchy=hierarchy_2, mismatches=0, bio_name="Biotypes_mis0", merge_pac=FALSE) table(test)# Non-hits are classified as 'other' # Note, setting 'merge_pac=FALSE' returns a one-column data.frame only # containing the new hierarchical classifications. # Lets increase number of mismatches an merge it with the PAC # (How many mismatches depends on what you allowed previously in the workflow) colnames(anno(pac)) # mis0-1_bio = Upto 1 mismatches are available pac_test <- simplify_reanno(input=pac, hierarchy=hierarchy_1, mismatches=1, bio_name="Biotypes_mis1", merge_pac=TRUE) # Now we also have some mitochondrial tRNA table(anno(pac_test)$Biotypes_mis1) ## ----message=FALSE, eval=TRUE------------------------------------------------- load(system.file("extdata", "drosophila_sRNA_pac_filt_anno.Rdata", package = "seqpac", mustWork = TRUE)) ###--------------------------------------------------------------------- ## PAC_summary in Seqpac # Make means of counts over stages and return a data.frame tab <- PAC_summary(pac, norm = "counts", type = "means", pheno_target=list("stage"), merge_pac=FALSE) # When merge_pac=TRUE the table is added to the summary(PAC) 'folder' pac_test <- PAC_summary(pac, norm = "counts", type = "means", pheno_target=list("stage"), merge_pac=TRUE) pac # Structure of PAC before PAC_summary (S4) pac_test # Structure of PAC after PAC_summary (S4) names(summary(pac_test)) head(summary(pac_test)$countsMeans_stage) ## ----eval=TRUE, results=FALSE, message=FALSE, warning=FALSE------------------- # You may want to use normalized counts pac_test <- PAC_summary(pac_test, norm = "cpm", type = "means", pheno_target=list("stage"), merge_pac=TRUE) # Maybe only include a subset of the samples pac_test <- PAC_summary(pac_test, norm = "cpm", type = "means", pheno_target=list("batch", c("Batch1", "Batch2")), merge_pac=TRUE) # Generate standard errors pac_test <- PAC_summary(pac_test, norm = "cpm", type = "se", pheno_target=list("stage"), merge_pac=TRUE) # log2FC pac_test <- PAC_summary(pac_test, norm = "cpm", type = "log2FC", pheno_target=list("stage"), merge_pac=TRUE) # log2FC generated from a grand mean over all samples pac_test <- PAC_summary(pac_test, norm = "cpm", type = "log2FCgrand", pheno_target=list("stage"), merge_pac=TRUE) # All summarized tables have identical row names that can be merged names(summary(pac_test)) lapply(summary(pac_test), function(x){ identical(rownames(x), rownames(summary(pac_test)[[1]])) }) head(do.call("cbind", summary(pac_test))) ## ----message=FALSE, eval=TRUE, warning=FALSE---------------------------------- load(system.file("extdata", "drosophila_sRNA_pac_filt_anno.Rdata", package = "seqpac", mustWork = TRUE)) ###--------------------------------------------------------------------- ## Differential expression in Seqpac # Simple model testing stages against using Wald test with local fit (default) table(pheno(pac)$stage) output_deseq <- PAC_deseq(pac, model= ~stage, threads=6) ## ----eval=TRUE,fig.show=FALSE,results="hide",message=FALSE,fig.keep='none'---- # More complicated, but still graphs will be generated from 'stage' since it is # first in model output_deseq <- PAC_deseq(pac, model= ~stage + batch, threads=6) # Using pheno_target we can change the focus to batch # (no batch effect) output_deseq <- PAC_deseq(pac, model= ~stage + batch, pheno_target=list("batch")) # With pheno_target we can also change the direction of the comparison output_deseq <- PAC_deseq(pac, model= ~stage, pheno_target=list("stage", c("Stage5", "Stage3"))) ## In the output you find PAC merged results, target plots and output_deseq names(output_deseq) head(output_deseq$result) ## ----eval=TRUE, fig.show=FALSE, results="hide", fig.keep='none'--------------- ###--------------------------------------------------------------------- ## PCA analysis in Seqpac # As simple as possible output_pca <- PAC_pca(pac) # Two 'folders' in the output names(output_pca) # Using pheno_target output_pca <- PAC_pca(pac, pheno_target =list("stage")) ## ----message=FALSE, eval=TRUE, warning=FALSE---------------------------------- # Using pheno_target with sample labels output_pca <- PAC_pca(pac, pheno_target =list("stage"), label=pheno(pac)$sample) ## ----eval=TRUE, fig.show=FALSE, results="hide", fig.keep='none'--------------- # Plotting sequences instead output_pca <- PAC_pca(pac, type = "anno", anno_target =list("Biotypes_mis0")) ## ----eval=TRUE, fig.show=FALSE, results="hide", fig.keep='none'--------------- ###--------------------------------------------------------------------- ## Nucleotide bias in Seqpac load(system.file("extdata", "drosophila_sRNA_pac_filt_anno.Rdata", package = "seqpac", mustWork = TRUE)) # Test without annotations # (note, is equivalent to an unfiltered master PAC) pac_test <- pac anno(pac_test) <- anno(pac_test)[,1, drop = FALSE] output_nbias <- PAC_nbias(pac_test) cowplot::plot_grid(plotlist=output_nbias$Histograms) # Now lets use an anno_target in an annotated PAC targetting miRNA # (Oops, heavy T-bias on 1st nt; are they piRNA?) table(anno(pac)$Biotypes_mis0) output_nbias <- PAC_nbias(pac, anno_target = list("Biotypes_mis0", "miRNA") ) cowplot::plot_grid(plotlist=output_nbias$Histograms) # Switch to 10:th nt bias output_nbias <- PAC_nbias(pac, position=10, anno_target = list("Biotypes_mis0", "miRNA")) cowplot::plot_grid(plotlist=output_nbias$Histograms) ## ----eval=TRUE, echo=TRUE----------------------------------------------------- # Summarized over group cpm means pac <- PAC_summary(pac, norm = "cpm", type = "means", pheno_target=list("stage"), merge_pac=TRUE) output_nbias <- PAC_nbias(pac, summary_target = list("cpmMeans_stage") ) cowplot::plot_grid(plotlist=output_nbias$Histograms) ## ----eval=TRUE, fig.show=FALSE, results="hide", fig.keep='none'--------------- ###--------------------------------------------------------------------- ## Biotype size distribution # Divide stacked bars by biotype with no mismatch allowed output_sizedist_1 <- PAC_sizedist(pac, anno_target = list("Biotypes_mis0")) cowplot::plot_grid(plotlist=c(output_sizedist_1$Histograms), ncol=3, nrow=3) # Divide stacked bars by biotype with allowing up to 3 mismaches output_sizedist_2 <- PAC_sizedist(pac, anno_target = list("Biotypes_mis3")) cowplot::plot_grid(plotlist=c(output_sizedist_2$Histograms), ncol=3, nrow=3) # anno_target is order sensitive, thus can take care of color order issues: ord_bio <- as.character(unique(anno(pac)$Biotypes_mis3)) ord_bio <- ord_bio[c(1,5,2,4,3,6,7)] output_sizedist_2 <- PAC_sizedist(pac, anno_target = list("Biotypes_mis3", ord_bio)) # And again, we can use a summary_target instead: output_sizedist_sum <- PAC_sizedist(pac, summary_target = list("cpmMeans_stage"), anno_target = list("Biotypes_mis3", ord_bio)) cowplot::plot_grid(plotlist=output_sizedist_sum$Histograms) ## Note: ####################################################################### # 1. miRNA is clearly associated with the correct size (21-23) nt, which are # # dramatically increased in Stage 5 when zygotic transcription has started. # # 2. piRNA was deliberately left out from the fasta references. Note, however, # # that there is a broad peak with no annotations between 20-30 nt in Stage 1, # # which also showed a T-bias at the first nt. Possibly piRNA? # ################################################################################ ## ----message=FALSE, include = TRUE, eval=TRUE--------------------------------- ###--------------------------------------------------------------------- ## Stacked bars in Seqpac # Choose an anno_target and plot samples (summary="samples") PAC_stackbar(pac, anno_target=list("Biotypes_mis0")) ## ----eval=TRUE,fig.show=FALSE,results="hide",message=FALSE,fig.keep='none'---- # 'no_anno' and 'other' will always end on top not matter the order ord_bio <- as.character(sort(unique(anno(pac)$Biotypes_mis3))) p1 <- PAC_stackbar(pac, anno_target=list("Biotypes_mis0", ord_bio)) p2 <- PAC_stackbar(pac, anno_target=list("Biotypes_mis0", rev(ord_bio))) cowplot::plot_grid(plotlist=list(p1, p2)) # (Hint: if you want them to appear not on top, rename them) # Reorder samples by pheno_targets PAC_stackbar(pac, pheno_target=list("batch"), summary="samples", anno_target=list("Biotypes_mis0")) # Instead, summarized over pheno_target (summary="pheno") PAC_stackbar(pac, anno_target=list("Biotypes_mis0"), summary="pheno", pheno_target=list("stage")) ## ----eval=TRUE,fig.show=FALSE,results="hide",warning=FALSE,fig.keep='none'---- ###--------------------------------------------------------------------- ## Pie charts in Seqpac # Choose an anno_target and plot samples (summary="samples"; default) PAC_pie(pac, anno_target=list("Biotypes_mis0")) # Ordered pie charts of grand mean percent of all samples labeled with percent ord_bio <- as.character(sort(unique(anno(pac)$Biotypes_mis3)), unique(anno(pac)$Biotypes_mis0)) output_pie_1 <- PAC_pie(pac, anno_target=list("Biotypes_mis0", ord_bio), summary="all", labels="percent") output_pie_2 <- PAC_pie(pac, anno_target=list("Biotypes_mis3", ord_bio), summary="all", labels="percent") cowplot::plot_grid(plotlist=c(output_pie_1, output_pie_2), labels = c("mis 0","legend", "mis 3"), ncol=2, nrow=2, greedy=TRUE, scale=1.15) # Rotate PAC_pie(pac, anno_target=list("Biotypes_mis0"), summary="all", angle=180) PAC_pie(pac, anno_target=list("Biotypes_mis0"), summary="all", angle=40) # Compare biotype mapping with or without mismatches and group by pheno(PAC) ord_bio <- as.character(sort(unique(anno(pac)$Biotypes_mis0))) output_mis0 <- PAC_pie(pac, pheno_target=list("stage"), summary="pheno", anno_target=list("Biotypes_mis0", ord_bio)) output_mis3 <- PAC_pie(pac, pheno_target=list("stage"), summary="pheno", anno_target=list("Biotypes_mis3", ord_bio)) cowplot::plot_grid(plotlist=c(output_mis0, output_mis3), labels = names(c(output_mis0, output_mis3)), nrow=2, greedy = TRUE, scale=1.5) ## ----eval=TRUE,fig.show=FALSE,results="hide",message=FALSE,fig.keep='none'---- load(system.file("extdata", "drosophila_sRNA_pac_filt_anno.Rdata", package = "seqpac", mustWork = TRUE)) ###--------------------------------------------------------------------- ## tRNA analysis in Seqpac # First create an annotation blank PAC with group means anno(pac) <- anno(pac)[,1, drop=FALSE] pac_trna <- PAC_summary(pac, norm = "cpm", type = "means", pheno_target=list("stage"), merge_pac = TRUE) # Then reannotate only tRNA using the PAC_mapper function ref <- "/some/path/to/trna/fasta/recerence.fa" # or use the provided fasta ref <- system.file("extdata/trna2", "tRNA2.fa", package = "seqpac", mustWork = TRUE) map_object <- PAC_mapper(pac_trna, ref=ref, N_up = "NNN", N_down = "NNN", mismatches=0, threads=6, report_string=TRUE, override=TRUE) # Hint: By adding N_up ad N_down you can make sure that modified fragments (like # 3' -CAA in mature tRNA are included). ## Plot tRNA using xseq=TRUE gives you the reference sequence as X-axis: # (OBS! Long reference will not show well.) cov_tRNA <- PAC_covplot(pac_trna, map_object, summary_target = list("cpmMeans_stage"), xseq=TRUE, style="line", color=c("red", "black", "blue")) cov_tRNA[[1]] # Targeting a single tRNA using a summary data.frame PAC_covplot(pac_trna, map=map_object, summary_target= list("cpmMeans_stage"), map_target="tRNA-Lys-CTT-1-9") # Find tRNAs with many fragments # 1st extract number of rows from each alignment n_tRFs <- unlist(lapply(map_object, function(x){nrow(x[[2]])})) # The test data is highly down an filterd sampled, but still some with tRNA have # a few alignment table(n_tRFs) names(map_object)[n_tRFs>2] # Lets select a few of them and plot them selct <- names(map_object)[n_tRFs>2][c(1, 16, 27, 37)] cov_plt <- PAC_covplot(pac_trna, map=map_object, summary_target= list("cpmMeans_stage"), map_target=selct) ## ----eval=TRUE, warning=FALSE------------------------------------------------- cowplot::plot_grid(plotlist=cov_plt, nrow=2, ncol=2) ## ----eval=TRUE,fig.show=FALSE,results="hide",message=FALSE,fig.keep='none'---- ###--------------------------------------------------------------------- ## Generate range types using a ss-file # Download ss object from GtRNAdb # ("http://gtrnadb.ucsc.edu/") ss_file <- "/some/path/to/GtRNAdb/trna.ss" # or for testing use the provided ss-file in secpac ss_file <- system.file("extdata/trna2", "tRNA2.ss", package = "seqpac", mustWork = TRUE) # Classify fragments according to loop cleavage (small loops are omitted) map_object_ss <- map_rangetype(map_object, type="ss", ss=ss_file, min_loop_width=4) # Remove reference tRNAs with no hits map_object_ss <- map_object_ss[ !unlist(lapply(map_object_ss, function(x){x[[2]][1,1] == "no_hits"}))] # Now we have quite comprehensive tRNA loop annotations names(map_object_ss) map_object_ss[[1]][[2]] # Don't forget to check ?map_rangetype to obtain more options ###--------------------------------------------------------------------- ## Function classifying 5'-tRF, 5'halves, i-tRF, 3'-tRF, 3'halves # Does all tRNAs have 3 loops? table(unlist(lapply(map_object_ss, function(x){unique(x$Alignments$n_ssloop)}))) # Set tolerance for classification as a terminal tRF tolerance <- 5 # 2 nucleotides from start or end of full-length tRNA) ### Important: # We set N_up and N_down to "NNN" in the PAC_mapper step. To make sure # that we have a tolerance that include the original tRNA sequence # we set terminal= 2+3 (5). ## ----eval=TRUE, warning=FALSE------------------------------------------------- ## tRNA classifying function # Apply the tRNA_class function and make a tRNA type column pac_trna <- tRNA_class(pac_trna, map=map_object_ss, terminal=tolerance) anno(pac_trna)$type <- paste0(anno(pac_trna)$decoder, anno(pac_trna)$acceptor) anno(pac_trna)[1:5, 1:4] ## ----message=FALSE, echo=TRUE, eval=TRUE-------------------------------------- ###--------------------------------------------------------------------- ## Plot tRNA types # Now use PAC_trna to generate some graphs based on grand means trna_result <- PAC_trna(pac_trna, norm="cpm", filter = NULL, join = TRUE, top = 15, log2fc = TRUE, pheno_target = list("stage", c("Stage1", "Stage3")), anno_target_1 = list("type"), anno_target_2 = list("class")) names(trna_result) names(trna_result$plots) names(trna_result$plots$Expression_Anno_1) cowplot::plot_grid(trna_result$plots$Expression_Anno_1$Grand_means, trna_result$plots$Log2FC_Anno_1, trna_result$plots$Percent_bars$Grand_means, nrow=1, ncol=3) ## ----eval=TRUE,fig.show=FALSE,results="hide",message=FALSE,fig.keep='none'---- # By setting join = FALSE you will get group means trna_result <- PAC_trna(pac_trna, norm="cpm", filter = NULL, join = FALSE, top = 15, log2fc = TRUE, pheno_target = list("stage", c("Stage1", "Stage3")), anno_target_1 = list("type"), anno_target_2 = list("class")) names(trna_result$plots$Expression_Anno_1) cowplot::plot_grid(trna_result$plots$Expression_Anno_1$Stage1, trna_result$plots$Expression_Anno_1$Stage3, trna_result$plots$Log2FC_Anno_1, trna_result$plots$Percent_bars$Stage1, trna_result$plots$Percent_bars$Stage3, nrow=1, ncol=5) ## ----eval=TRUE---------------------------------------------------------------- sessionInfo()