## ----style, echo = FALSE, results = 'asis'------------------------------------

  BiocStyle::markdown()


## ----BiocManager, eval=FALSE--------------------------------------------------
#  ## Installation Bioconductor:
#  if (!require("BiocManager"))
#      install.packages("BiocManager")
#  BiocManager::install("seqpac")
#  
#  ## Installation github:
#  devtools::install_github("Danis102/seqpac",
#                           dependencies=TRUE, ref="development")
#  
#  
#  

## ----eval=TRUE, echo=TRUE, warning=FALSE, message=FALSE, results = "hide"-----
 
# Setup
 library(seqpac)
 
 sys_path = system.file("extdata", package = "seqpac", mustWork = TRUE)
 fastq <- list.files(path = sys_path, pattern = "fastq", all.files = FALSE,
                 full.names = TRUE)
 ref_tRNA <- system.file("extdata/trna", "tRNA.fa",
                         package = "seqpac", mustWork = TRUE)
 
 # Trim NEB adapter, generate counts and create PAC-object (pheno, anno, counts)
 count_list  <- make_counts(fastq, plot = FALSE, 
                             parse="default_neb", trimming="seqpac")
 pac <- make_PAC(count_list$counts)
 pheno(pac)$groups <- c(rep("gr1",3), rep("gr2",3)) #Add groups to pheno table
 
 # Preprocess PAC-object and creat means
 pac <- PAC_norm(pac)                        # Default normalization is cpm
 pac <- PAC_filter(pac, size=c(15,60))       # Here, filter on sequence length
 pac <- PAC_summary(pac, norm = "cpm", type = "means", 
                    pheno_target=list("groups", unique(pheno(pac)$groups)))
 
 # Quickly align (annotate) and plot PAC-object
 map_tRNA <- PAC_mapper(pac, ref=ref_tRNA, override=TRUE)
 plts <- PAC_covplot(pac, map_tRNA, summary_target=list("cpmMeans_groups"))
 plts[[13]]

 
## ----eval=FALSE, echo=TRUE----------------------------------------------------
#  ## Using default settings for NEB type adapter
#  # NEB= NEBNext® Small RNA Library Prep Set for Illumina (E7300/E7330)
#  # For illumina type adapters, 'parse="default_illumina"' may be used
#  # To speed things up we use 4 threads.
#  
#  count_list <- make_counts(input=fastq, trimming="seqpac", threads=4,
#                            parse="default_neb")
#  

## ----eval=FALSE, echo=TRUE, warning=FALSE, message=FALSE, results = "hide"----
#  ###---------------------------------------------------------------------
#  ## Evidence over two independent samples, saving single sample sequences
#  ## reaching 3 counts
#  test <- make_counts(input=fastq, trimming="seqpac",
#                      parse="default_neb", threads=3,
#                      evidence=c(experiment=2, sample=3))
#  extras <- apply(test$counts, 1, function(x){sum(!x==0)})
#  test$counts[extras==1,] # A few still have 3 alone
#  

## ----eval=TRUE, echo=TRUE, warning=FALSE, message=FALSE, results = "hide"-----
###--------------------------------------------------------------------- 
## Generate a Pheno table using the file names of test fastq 
Sample_ID <- colnames(count_list$counts)
pheno_fl <- data.frame(Sample_ID=Sample_ID,
                    Treatment=c(rep("heat", times=1), 
                                rep("control", times=2)),
                    Batch=rep(c("1", "2", "3"), times=1))

pheno <- make_pheno(pheno=pheno_fl, 
                    progress_report=count_list$progress_report, 
                    counts=count_list$counts)


## ----eval=TRUE, echo=TRUE, warning=FALSE, message=FALSE, results = "hide"-----
###--------------------------------------------------------------------- 
## Generate PAC object (default S4)
pac_master <- make_PAC(pheno=pheno, counts=count_list$counts)
pac_master

## If TRUE the PAC object is ok 
PAC_check(pac_master)

## ----eval=TRUE, fig.show=FALSE, results="hide", fig.keep='none'---------------
###--------------------------------------------------------------------- 
## Load the PAC-object  and inspect the columns in Pheno and Anno
library(seqpac)
load(system.file("extdata", "drosophila_sRNA_pac_filt_anno.Rdata", 
                 package = "seqpac", mustWork = TRUE))

pheno(pac)[,1:5]
anno(pac)[1:5,1:5]
counts(pac)[1:5,1:5]

## This PAC is an S4 object, but you can easily turn it to an S3 list with as().
## using as.PAC() will turn it back into S4:
isS4(pac)
pac_s3 <- as(pac, "list")
lapply(pac_s3[1:3], head) 
isS4(pac_s3)
pac <- as.PAC(pac_s3)

methods(class="PAC") #all S4 methods


## ----eval=TRUE, echo=TRUE-----------------------------------------------------
 ###--------------------------------------------------------------------- 
## Extracts all sequences between 20-30 nt in length with at least 5 counts 
## in 20% of all samples.
pac_filt <- PAC_filter(pac, size=c(20,30), threshold=5, 
                          coverage=20, norm = "counts",  
                          pheno_target=NULL, anno_target=NULL)

## ----eval=FALSE, echo=TRUE----------------------------------------------------
#  ###---------------------------------------------------------------------
#  ## Optionally, a simple coverage/threshold graph at different filter depths
#  ## can be plotted with stat=TRUE (but it will promt you for a question).
#  pac_filt <- PAC_filter(pac, threshold=5, coverage=20,
#                         norm = "counts", stat=TRUE,
#                         pheno_target=NULL, anno_target=NULL)

## ----eval=TRUE, echo=TRUE-----------------------------------------------------
###--------------------------------------------------------------------- 
## Extracts all sequences with 22 nt size and the samples in Batch1 and Batch2.
pac_filt <- PAC_filter(pac, subset_only = TRUE,
                       pheno_target=list("batch", c("Batch1", "Batch2")), 
                       anno_target=list("Size", "22"))

###--------------------------------------------------------------------- 
## Extracts  sequences with >=5 counts in 100% of samples within each stage
filtsep <- PAC_filtsep(pac, norm="counts", threshold=5, 
                       coverage=100, pheno_target= list("stage"))

pac_filt <- PAC_filter(pac, subset_only = TRUE,
                       anno_target= unique(do.call("c", as.list(filtsep))))
                       
                      
## ----eval=FALSE, echo=TRUE----------------------------------------------------
#  ###---------------------------------------------------------------------
#  ## Venn diagram using the venneuler package
#  olap <- reshape2::melt(filtsep,
#                         measure.vars = c("Stage1", "Stage3", "Stage5"),
#                         na.rm=TRUE)
#  plot(venneuler::venneuler(olap[,c(2,1)]))
#  
#  ###---------------------------------------------------------------------
#  ## Setting output="binary", prepares data for upset plots (UpSetR package):
#  filtsep_bin <- PAC_filtsep(pac, norm="counts", threshold=5,
#                             coverage=100, pheno_target= list("stage"),
#                             output="binary")
#  
#  UpSetR::upset(filtsep_bin, sets = colnames(filtsep_bin),
#                             mb.ratio = c(0.55, 0.45), order.by = "freq",
#                             keep.order=TRUE)
#  

## ----eval=TRUE, results=FALSE, message=FALSE, warning=FALSE, error=FALSE------
###--------------------------------------------------------------------- 
## Example normalization in Seqpac
load(system.file("extdata", "drosophila_sRNA_pac_filt_anno.Rdata", 
                 package = "seqpac", mustWork = TRUE))

# PAC_norm works on both
pac <- PAC_norm(pac, norm="cpm")
pac <- PAC_norm(pac, norm="vst")

pac


## ----results=FALSE, eval=TRUE-------------------------------------------------

###---------------------------------------------------------------------
## Filtering using cpm instead of raw counts
## filter >=100 cpm in 100% of samples in >= 1 full group
filtsep <- PAC_filtsep(pac, norm="cpm", threshold=100, coverage=100, 
                       pheno_target= list("stage")) 
pac_cpm_filt <- PAC_filter(pac, subset_only = TRUE,
                       anno_target= unique(do.call("c", as.list(filtsep))))


## ----results=FALSE, eval=TRUE-------------------------------------------------

load(system.file("extdata", "drosophila_sRNA_pac_filt_anno.Rdata", 
                 package = "seqpac", mustWork = TRUE))
anno(pac) <- anno(pac)[,1, drop = FALSE]
head(anno(pac))


## ----eval = TRUE--------------------------------------------------------------
###---------------------------------------------------------------------
## Examples of generating bowtie indexes

path <- "/some/path/to/your/folder"

#Genome
mycoplasma_path <- system.file("extdata/mycoplasma_genome", "mycoplasma.fa", 
                               package = "seqpac", mustWork = TRUE)
Rbowtie::bowtie_build(mycoplasma_path, 
                      outdir=gsub("mycoplasma.fa", "", mycoplasma_path), 
                      prefix="mycoplasma", force=TRUE)
#tRNA
trna_path <- system.file("extdata/trna", "tRNA.fa", 
                           package = "seqpac", mustWork = TRUE)
Rbowtie::bowtie_build(trna_path, 
                        outdir=gsub("tRNA.fa", "", trna_path), 
                        prefix="tRNA", force=TRUE)
  
#rRNA
rrna_path <- system.file("extdata/rrna", "rRNA.fa", 
                           package = "seqpac", mustWork = TRUE)
Rbowtie::bowtie_build(rrna_path, 
                        outdir=gsub("rRNA.fa", "", rrna_path), 
                        prefix="rRNA", force=TRUE) 


## ----eval=TRUE, results=FALSE, message=FALSE, warning=FALSE, error=FALSE------
# Lets reload our pac
library(seqpac)
load(system.file("extdata", "drosophila_sRNA_pac_filt_anno.Rdata", 
                 package = "seqpac", mustWork = TRUE))
anno(pac) <- anno(pac)[,1, drop = FALSE] # Remove previous Anno

###---------------------------------------------------------------------
## Genome mapping

# Path to your output folder 
outpath_genome <- "/some/path/to/reanno_folder" 
# or a temp folder
outpath_genome <- paste0(tempdir(), "/seqpac_reanno/reanno_genome")

# Look up the your own Bowtie indexed reference genomes (fasta)
ref_paths <- list(myco="<some/path/to/your/genome.fa>")

# For testing, you can use a small mycoplasma genome provided with Seqpac
# But remember to provide single genomes as a named list anyway.
# The name will be used to keep track of your genome.
myco_path <- system.file("extdata/mycoplasma_genome", "mycoplasma.fa", 
                         package = "seqpac", mustWork = TRUE)
ref_paths <- list(myco=myco_path)

# Run map_reanno
map_reanno(PAC=pac, ref_paths=ref_paths, output_path=outpath_genome, 
           type="internal", mismatches=1, import="genome", 
           threads=8, keep_temp=TRUE)


## ----eval=TRUE, results=FALSE, message=FALSE, warning=FALSE, error=FALSE------
###---------------------------------------------------------------------
## sRNA mapping using internal bowtie

# Path to output folder 
outpath_biotype <- "/some/path/to/reanno_biotype" 
# or a temp folder
outpath_biotype <- paste0(tempdir(), "/seqpac_reanno/reanno_biotype")


# Seqpac contains two fasta for tRNA and rRNA we can use for testing
trna_path <- system.file("extdata/trna", "tRNA.fa", 
                         package = "seqpac", mustWork = TRUE)  
rrna_path <- system.file("extdata/rrna", "rRNA.fa", 
                           package = "seqpac", mustWork = TRUE)

# Provide paths to bowtie indexed fasta references as a list
ref_paths <- list(tRNA=trna_path,
                  rRNA=rrna_path)

map_reanno(pac, ref_paths=ref_paths, output_path=outpath_biotype, 
           type="internal", mismatches=1,  import="biotype", 
           threads=6, keep_temp=TRUE)


## ----eval=TRUE,fig.show=FALSE,results="hide",message=FALSE,fig.keep='none'----
###---------------------------------------------------------------------
## Generate a reanno object with make_reanno 
reanno_genome <- make_reanno(outpath_genome, PAC=pac, mis_fasta_check = TRUE)
reanno_biotype <- make_reanno(outpath_biotype, PAC=pac, mis_fasta_check = TRUE)

# Note, setting mis_fasta_check=TRUE will double check that the number of
# sequences that failed to receive an alignment in the last mismatch cycle
# agrees with the number sequences in the reanno object without an annotation.
# (these sequences are stored in mis_fasta_x.txt where x is max mismatches+1)

# List structure
str(reanno_genome, max.level = 3, give.attr = FALSE)
str(reanno_biotype, max.level = 3, give.attr = FALSE)

# Simple pie charts using the Overview table
pie(table(overview(reanno_genome)$myco)) # Very few mycoplasma with 0 mismatch 
pie(table(overview(reanno_biotype)$Any))  # Many hits for either rRNA or tRNA


## ----eval=TRUE, results=FALSE, message=FALSE, warning=FALSE-------------------
###---------------------------------------------------------------------
### Genomic coordinates using add_reanno 

# Output as separate table
anno_genome <- add_reanno(reanno_genome, type="genome", 
                          genome_max=10, mismatches=1)

# Output merged with provided PAC object
pac <- add_reanno(reanno_genome, type="genome", 
                  genome_max=10, mismatches=1, merge_pac=pac)

# Example of original reference name annotations 
head(full(reanno_genome)$mis1$myco)

# Finished genome annotation 
head(anno_genome)
head(anno(pac))


## ----eval=TRUE, results=FALSE, message=FALSE, warning=FALSE-------------------
###---------------------------------------------------------------------
## Classify sequences using add_reanno 

# Lets start by exploring the names in the original fasta reference: 
ref_path <- "/some/path/to/fasta_reference" 
# For testing use the the fasta reference for tRNA provided with Seqpac:
trna_path <- system.file("extdata/trna", "tRNA.fa", 
                         package = "seqpac", mustWork = TRUE)  

#  Read fasta names
fasta <- names(Biostrings::readDNAStringSet(trna_path))
# Different naming standards  
table(substr(fasta, 1, 10))
# Starting with FBgn discriminate mitochondrial tRNA
fasta[grepl("FBgn", fasta)]
# The other are nuclear
head(fasta[!grepl("FBgn", fasta)])

# We can use as 'mt:tRNA' as search term to catch mitochondrial tRNA
# and '_tRNA' to catch nuclear. 

# A useful expression for extracting strings up to 1st white space is
# fasta <- do.call("rbind", strsplit(fasta, " "))[,1]
    
# Similarly load the rrna reference provided with seqpaq
rrna_path <- system.file("extdata/rrna", "rRNA.fa", 
                         package = "seqpac", mustWork = TRUE)
#  Read fasta names
fasta <- names(Biostrings::readDNAStringSet(rrna_path))
# Many rRNA subtypes  
table(substr(fasta, 1, 10))

# Lets try two search term lists directed against each reference and written as
# regular expressions:

# Names in the search list needs to be the same as in the reanno object
head(overview(reanno_biotype)) # 'rRNA' and 'tRNA' with some capital letters

# Generate a search list with  search terms 
bio_search <- list(
                rRNA=c("5S", "5.8S", "12S", "16S", "18S", "28S", "pre_S45"),
                tRNA =c("_tRNA", "mt:tRNA")
                    )
 
test <- add_reanno(reanno_biotype, bio_search=bio_search, 
                      type="biotype", bio_perfect=FALSE, 
                      mismatches = 1, merge_pac=pac)                

## ----eval=FALSE, results=FALSE, message=FALSE, warning=FALSE------------------
#  # Throws an error because perfect matching was required:
#  anno_temp <- add_reanno(reanno_biotype, bio_search=bio_search,
#                          type="biotype", bio_perfect=TRUE, mismatches = 1)
#  

## ----eval=TRUE, results=FALSE, message=FALSE, warning=FALSE-------------------
# References with no search term hits are classified as "Other":
anno_temp <- add_reanno(reanno_biotype, bio_search=bio_search, 
                        type="biotype", bio_perfect=FALSE, mismatches = 1)

# Increasing number of hits allowing mismatches  
table(anno_temp$mis0) 
table(anno_temp$mis1)

# You may also add your new annotaion with a PAC object using the 'merge_pac' 
# option. For even more options see ?add_reanno.
pac <- add_reanno(reanno_biotype, bio_search=bio_search, 
                        type="biotype", bio_perfect=FALSE, mismatches = 1,
                        merge_pac=pac)
head(anno(pac))


## ----eval=TRUE, results=FALSE, message=FALSE, warning=FALSE-------------------
###---------------------------------------------------------------------
## Hierarchical classification with simplify_reanno

#  Currently classification does not discriminate
table(anno(pac)$mis3_bio)

# Set the hierarchy and remember that it is order sensitive.
# Here: S5_rRNA >> Other_rRNA >> mt:tRNA >> tRNA
# Remember to use 'regular expressions' if you wish to catch all:
hierarchy_1 <- list(rRNA_5S="rRNA_5S",
                    Other_rRNA="5.8S|12S|16S|18S|28S|pre_S45|rRNA_other",
                    Mt_tRNA="tRNA_mt:tRNA",
                    tRNA="tRNA__tRNA"
                    )

# What happens if you don't catch all:
hierarchy_2 <- list(rRNA_5S="rRNA_5S",
                    Mt_tRNA="tRNA_mt:tRNA",
                    tRNA="tRNA__tRNA")

# No mismatch allowed using hierarchy_1
test <- simplify_reanno(input=pac, hierarchy=hierarchy_1, mismatches=0, 
                       bio_name="Biotypes_mis0", merge_pac=FALSE)
table(test) # All remaining rRNA are classified as 'Other_rRNA'

# Instead using hierarchy_2
test <- simplify_reanno(input=pac, hierarchy=hierarchy_2, mismatches=0, 
                       bio_name="Biotypes_mis0", merge_pac=FALSE)
table(test)# Non-hits are classified as 'other'

# Note, setting 'merge_pac=FALSE' returns a one-column data.frame only
# containing the new hierarchical classifications.

# Lets increase number of mismatches an merge it with the PAC
# (How many mismatches depends on what you allowed previously in the workflow) 
colnames(anno(pac))  # mis0-1_bio = Upto 1 mismatches are available

pac_test <- simplify_reanno(input=pac, hierarchy=hierarchy_1, mismatches=1, 
                        bio_name="Biotypes_mis1", merge_pac=TRUE)

# Now we also have some mitochondrial tRNA 
table(anno(pac_test)$Biotypes_mis1)


## ----message=FALSE, eval=TRUE-------------------------------------------------

load(system.file("extdata", "drosophila_sRNA_pac_filt_anno.Rdata", 
                 package = "seqpac", mustWork = TRUE))

###---------------------------------------------------------------------
## PAC_summary in Seqpac 

# Make means of counts over stages and return a data.frame
tab <- PAC_summary(pac, norm = "counts", type = "means", 
                   pheno_target=list("stage"), merge_pac=FALSE)

# When merge_pac=TRUE the table is added to the summary(PAC) 'folder'  
pac_test <- PAC_summary(pac, norm = "counts", type = "means", 
                        pheno_target=list("stage"), merge_pac=TRUE)

pac       # Structure of PAC before PAC_summary (S4)
pac_test  # Structure of PAC after PAC_summary  (S4)
names(summary(pac_test))
head(summary(pac_test)$countsMeans_stage)

## ----eval=TRUE, results=FALSE, message=FALSE, warning=FALSE-------------------
# You may want to use normalized counts   
pac_test <- PAC_summary(pac_test, norm = "cpm", type = "means", 
                        pheno_target=list("stage"), merge_pac=TRUE)

# Maybe only include a subset of the samples
pac_test <- PAC_summary(pac_test, norm = "cpm", type = "means", 
                        pheno_target=list("batch", c("Batch1", "Batch2")), 
                        merge_pac=TRUE)

# Generate standard errors
pac_test <- PAC_summary(pac_test, norm = "cpm", type = "se", 
                        pheno_target=list("stage"), merge_pac=TRUE)

# log2FC
pac_test <- PAC_summary(pac_test, norm = "cpm", type = "log2FC",  
                        pheno_target=list("stage"), merge_pac=TRUE)

# log2FC generated from a grand mean over all samples
pac_test <- PAC_summary(pac_test, norm = "cpm", type = "log2FCgrand",  
                        pheno_target=list("stage"), merge_pac=TRUE)

# All summarized tables have identical row names that can be merged
names(summary(pac_test))
lapply(summary(pac_test), function(x){
  identical(rownames(x), rownames(summary(pac_test)[[1]]))
  })
head(do.call("cbind", summary(pac_test)))


## ----message=FALSE, eval=TRUE, warning=FALSE----------------------------------

load(system.file("extdata", "drosophila_sRNA_pac_filt_anno.Rdata", 
                 package = "seqpac", mustWork = TRUE))

###---------------------------------------------------------------------
## Differential expression in Seqpac 

# Simple model testing stages against using Wald test with local fit (default)
table(pheno(pac)$stage)
output_deseq <- PAC_deseq(pac, model= ~stage, threads=6)

## ----eval=TRUE,fig.show=FALSE,results="hide",message=FALSE,fig.keep='none'----

# More complicated, but still graphs will be generated from 'stage' since it is
# first in model
output_deseq <- PAC_deseq(pac, model= ~stage + batch, threads=6)

# Using pheno_target we can change the focus to batch
# (no batch effect) 
output_deseq <- PAC_deseq(pac, model= ~stage + batch, 
                          pheno_target=list("batch")) 

# With pheno_target we can also change the direction of the comparison 
output_deseq <- PAC_deseq(pac, model= ~stage, 
                          pheno_target=list("stage", c("Stage5", "Stage3")))

## In the output you find PAC merged results, target plots and output_deseq   
names(output_deseq)
head(output_deseq$result)


## ----eval=TRUE, fig.show=FALSE, results="hide", fig.keep='none'---------------
###---------------------------------------------------------------------
## PCA analysis in Seqpac 

# As simple as possible 
output_pca <- PAC_pca(pac)
# Two 'folders' in the output
names(output_pca)  

# Using pheno_target
output_pca <- PAC_pca(pac, pheno_target =list("stage"))

## ----message=FALSE, eval=TRUE, warning=FALSE----------------------------------

# Using pheno_target with sample labels
output_pca <- PAC_pca(pac, pheno_target =list("stage"), 
                      label=pheno(pac)$sample)

## ----eval=TRUE, fig.show=FALSE, results="hide", fig.keep='none'---------------
# Plotting sequences instead
output_pca <- PAC_pca(pac, type = "anno", 
                      anno_target =list("Biotypes_mis0"))


## ----eval=TRUE, fig.show=FALSE, results="hide", fig.keep='none'---------------
###---------------------------------------------------------------------
## Nucleotide bias in Seqpac 

load(system.file("extdata", "drosophila_sRNA_pac_filt_anno.Rdata", 
                 package = "seqpac", mustWork = TRUE))

# Test without annotations
# (note, is equivalent to an unfiltered master PAC)
pac_test <- pac
anno(pac_test) <- anno(pac_test)[,1, drop = FALSE]

output_nbias <- PAC_nbias(pac_test)
cowplot::plot_grid(plotlist=output_nbias$Histograms)

# Now lets use an anno_target in an annotated PAC targetting miRNA 
# (Oops, heavy T-bias on 1st nt; are they piRNA?)  
table(anno(pac)$Biotypes_mis0)
output_nbias <- PAC_nbias(pac, anno_target = list("Biotypes_mis0", "miRNA") )
cowplot::plot_grid(plotlist=output_nbias$Histograms)

# Switch to 10:th nt bias
output_nbias <- PAC_nbias(pac, position=10, 
                          anno_target = list("Biotypes_mis0", "miRNA"))
cowplot::plot_grid(plotlist=output_nbias$Histograms)

## ----eval=TRUE, echo=TRUE-----------------------------------------------------
# Summarized over group cpm means
pac <- PAC_summary(pac, norm = "cpm", type = "means", 
                        pheno_target=list("stage"), merge_pac=TRUE)
output_nbias <- PAC_nbias(pac, summary_target = list("cpmMeans_stage") )
cowplot::plot_grid(plotlist=output_nbias$Histograms)


## ----eval=TRUE, fig.show=FALSE, results="hide", fig.keep='none'---------------

###---------------------------------------------------------------------
## Biotype size distribution

# Divide stacked bars by biotype with no mismatch allowed  
output_sizedist_1 <- PAC_sizedist(pac, anno_target = list("Biotypes_mis0"))
cowplot::plot_grid(plotlist=c(output_sizedist_1$Histograms), ncol=3, nrow=3)

# Divide stacked bars by biotype with allowing up to 3 mismaches  
output_sizedist_2 <- PAC_sizedist(pac, anno_target = list("Biotypes_mis3"))
cowplot::plot_grid(plotlist=c(output_sizedist_2$Histograms), ncol=3, nrow=3)

# anno_target is order sensitive, thus can take care of color order issues:
ord_bio <- as.character(unique(anno(pac)$Biotypes_mis3))
ord_bio <- ord_bio[c(1,5,2,4,3,6,7)] 
output_sizedist_2 <- PAC_sizedist(pac, 
                                  anno_target = list("Biotypes_mis3", ord_bio))

# And again, we can use a summary_target instead:
output_sizedist_sum <- PAC_sizedist(pac,
                                  summary_target = list("cpmMeans_stage"),
                                  anno_target = list("Biotypes_mis3", ord_bio))
cowplot::plot_grid(plotlist=output_sizedist_sum$Histograms)
## Note: #######################################################################
# 1. miRNA is clearly associated with the correct size (21-23) nt, which are   #
# dramatically increased in Stage 5 when zygotic transcription has started.    #
# 2. piRNA was deliberately left out from the fasta references. Note, however, #
# that there is a broad peak with no annotations between 20-30 nt in Stage 1,  #
# which also showed a T-bias at the first nt. Possibly piRNA?                  #
################################################################################


## ----message=FALSE, include = TRUE, eval=TRUE---------------------------------
###---------------------------------------------------------------------
## Stacked bars in Seqpac 

# Choose an anno_target and plot samples (summary="samples")
PAC_stackbar(pac, anno_target=list("Biotypes_mis0"))

## ----eval=TRUE,fig.show=FALSE,results="hide",message=FALSE,fig.keep='none'----

# 'no_anno' and 'other' will always end on top not matter the order
ord_bio <- as.character(sort(unique(anno(pac)$Biotypes_mis3)))
p1 <- PAC_stackbar(pac, anno_target=list("Biotypes_mis0", ord_bio))
p2 <- PAC_stackbar(pac, anno_target=list("Biotypes_mis0", rev(ord_bio)))
cowplot::plot_grid(plotlist=list(p1, p2))
# (Hint: if you want them to appear not on top, rename them)

# Reorder samples by pheno_targets
PAC_stackbar(pac, pheno_target=list("batch"), summary="samples", 
             anno_target=list("Biotypes_mis0"))

# Instead, summarized over pheno_target (summary="pheno") 
PAC_stackbar(pac, anno_target=list("Biotypes_mis0"), summary="pheno", 
             pheno_target=list("stage"))


## ----eval=TRUE,fig.show=FALSE,results="hide",warning=FALSE,fig.keep='none'----

###---------------------------------------------------------------------
## Pie charts in Seqpac 

# Choose an anno_target and plot samples (summary="samples"; default)
PAC_pie(pac, anno_target=list("Biotypes_mis0"))

# Ordered pie charts of grand mean percent of all samples labeled with percent
ord_bio <- as.character(sort(unique(anno(pac)$Biotypes_mis3)), 
                        unique(anno(pac)$Biotypes_mis0))

output_pie_1 <- PAC_pie(pac, anno_target=list("Biotypes_mis0", ord_bio), 
                        summary="all", labels="percent")
output_pie_2 <- PAC_pie(pac, anno_target=list("Biotypes_mis3", ord_bio), 
                        summary="all", labels="percent")

cowplot::plot_grid(plotlist=c(output_pie_1, output_pie_2), 
                   labels = c("mis 0","legend", "mis 3"),
                   ncol=2, nrow=2, greedy=TRUE, scale=1.15)

# Rotate
PAC_pie(pac, anno_target=list("Biotypes_mis0"), summary="all", angle=180)
PAC_pie(pac, anno_target=list("Biotypes_mis0"), summary="all", angle=40)

# Compare biotype mapping with or without mismatches and group by pheno(PAC)
ord_bio <- as.character(sort(unique(anno(pac)$Biotypes_mis0)))  
output_mis0 <- PAC_pie(pac, pheno_target=list("stage"), summary="pheno", 
                       anno_target=list("Biotypes_mis0", ord_bio))
output_mis3 <- PAC_pie(pac, pheno_target=list("stage"), summary="pheno", 
                       anno_target=list("Biotypes_mis3", ord_bio))

cowplot::plot_grid(plotlist=c(output_mis0, output_mis3), 
                   labels = names(c(output_mis0, output_mis3)), nrow=2,
                   greedy = TRUE, scale=1.5)


## ----eval=TRUE,fig.show=FALSE,results="hide",message=FALSE,fig.keep='none'----

load(system.file("extdata", "drosophila_sRNA_pac_filt_anno.Rdata", 
                 package = "seqpac", mustWork = TRUE))

###---------------------------------------------------------------------
## tRNA analysis in Seqpac 

# First create an annotation blank PAC with group means
anno(pac) <- anno(pac)[,1, drop=FALSE]
pac_trna <- PAC_summary(pac, norm = "cpm", type = "means", 
                        pheno_target=list("stage"), merge_pac = TRUE)

# Then reannotate only tRNA using the PAC_mapper function
ref <- "/some/path/to/trna/fasta/recerence.fa"  
# or use the provided fasta
ref <- system.file("extdata/trna2", "tRNA2.fa", 
                          package = "seqpac", mustWork = TRUE)          

map_object <- PAC_mapper(pac_trna, ref=ref, N_up = "NNN", N_down = "NNN", 
                         mismatches=0, threads=6, report_string=TRUE,
                         override=TRUE)

# Hint: By adding N_up ad N_down you can make sure that modified fragments (like
# 3' -CAA in mature tRNA are included).

## Plot tRNA using xseq=TRUE gives you the reference sequence as X-axis:
# (OBS! Long reference will not show well.)                     
cov_tRNA <- PAC_covplot(pac_trna, map_object, 
                        summary_target = list("cpmMeans_stage"), 
                        xseq=TRUE, style="line", 
                        color=c("red", "black", "blue"))
cov_tRNA[[1]]
                     
# Targeting a single tRNA using a summary data.frame 
PAC_covplot(pac_trna, map=map_object, summary_target= list("cpmMeans_stage"), 
            map_target="tRNA-Lys-CTT-1-9")

# Find tRNAs with many fragments
# 1st extract number of rows from each alignment 
n_tRFs <- unlist(lapply(map_object, function(x){nrow(x[[2]])}))
# The test data is highly down an filterd sampled, but still some with tRNA have
# a few alignment
table(n_tRFs)
names(map_object)[n_tRFs>2]
# Lets select a few of them and plot them
selct <- names(map_object)[n_tRFs>2][c(1, 16, 27, 37)]
cov_plt <- PAC_covplot(pac_trna, map=map_object, 
                       summary_target= list("cpmMeans_stage"), 
                       map_target=selct)

## ----eval=TRUE, warning=FALSE-------------------------------------------------

cowplot::plot_grid(plotlist=cov_plt, nrow=2, ncol=2)


## ----eval=TRUE,fig.show=FALSE,results="hide",message=FALSE,fig.keep='none'----

###---------------------------------------------------------------------
## Generate range types using a ss-file

# Download ss object from GtRNAdb 
# ("http://gtrnadb.ucsc.edu/")
ss_file <- "/some/path/to/GtRNAdb/trna.ss"
# or for testing use the provided ss-file in secpac
ss_file <- system.file("extdata/trna2", "tRNA2.ss", 
                          package = "seqpac", mustWork = TRUE)

# Classify fragments according to loop cleavage (small loops are omitted)       
map_object_ss <- map_rangetype(map_object, type="ss", 
                               ss=ss_file, min_loop_width=4)            

# Remove reference tRNAs with no hits
map_object_ss <-  map_object_ss[
  !unlist(lapply(map_object_ss, function(x){x[[2]][1,1] == "no_hits"}))]

# Now we have quite comprehensive tRNA loop annotations
names(map_object_ss)
map_object_ss[[1]][[2]]

# Don't forget to check ?map_rangetype to obtain more options 


###---------------------------------------------------------------------
## Function classifying 5'-tRF, 5'halves, i-tRF, 3'-tRF, 3'halves

# Does all tRNAs have 3 loops?
table(unlist(lapply(map_object_ss, function(x){unique(x$Alignments$n_ssloop)})))

# Set tolerance for classification as a terminal tRF
tolerance <- 5  # 2 nucleotides from start or end of full-length tRNA)

### Important:
# We set N_up and N_down to "NNN" in the PAC_mapper step. To make sure
# that we have a tolerance that include the original tRNA sequence
# we set terminal= 2+3 (5).  

## ----eval=TRUE, warning=FALSE-------------------------------------------------
## tRNA classifying function 
# Apply the tRNA_class function and make a tRNA type column
pac_trna <- tRNA_class(pac_trna, map=map_object_ss, terminal=tolerance)
anno(pac_trna)$type <- paste0(anno(pac_trna)$decoder, anno(pac_trna)$acceptor)
anno(pac_trna)[1:5, 1:4]


## ----message=FALSE, echo=TRUE, eval=TRUE--------------------------------------

###---------------------------------------------------------------------
## Plot tRNA types

# Now use PAC_trna to generate some graphs based on grand means
trna_result <- PAC_trna(pac_trna, norm="cpm", filter = NULL,
  join = TRUE, top = 15, log2fc = TRUE,
  pheno_target = list("stage", c("Stage1", "Stage3")), 
  anno_target_1 = list("type"),
  anno_target_2 = list("class"))

names(trna_result)
names(trna_result$plots)
names(trna_result$plots$Expression_Anno_1)

cowplot::plot_grid(trna_result$plots$Expression_Anno_1$Grand_means,
                   trna_result$plots$Log2FC_Anno_1,
                   trna_result$plots$Percent_bars$Grand_means,
                   nrow=1, ncol=3)

## ----eval=TRUE,fig.show=FALSE,results="hide",message=FALSE,fig.keep='none'----
# By setting join = FALSE you will get group means
trna_result <- PAC_trna(pac_trna, norm="cpm", filter = NULL,
  join = FALSE, top = 15, log2fc = TRUE,
  pheno_target = list("stage", c("Stage1", "Stage3")), 
  anno_target_1 = list("type"),
  anno_target_2 = list("class"))

names(trna_result$plots$Expression_Anno_1)

cowplot::plot_grid(trna_result$plots$Expression_Anno_1$Stage1,
                   trna_result$plots$Expression_Anno_1$Stage3,
                   trna_result$plots$Log2FC_Anno_1,
                   trna_result$plots$Percent_bars$Stage1,
                   trna_result$plots$Percent_bars$Stage3,
                   nrow=1, ncol=5)


## ----eval=TRUE----------------------------------------------------------------
sessionInfo()