## ----style, echo=FALSE, results='hide', message=FALSE------------------------- knitr::opts_chunk$set(tidy = FALSE, cache = FALSE, error = TRUE, warning = FALSE, message = FALSE, dev = 'png') library(CircSeqAlignTk) set.seed(1) ## ----install_package, eval=FALSE---------------------------------------------- # if (!requireNamespace('BiocManager', quietly = TRUE)) # install.packages('BiocManager') # BiocManager::install('CircSeqAlignTk') ## ----quick_start__tempws------------------------------------------------------ ws <- tempdir() ## ----quick_start__ws, eval=FALSE---------------------------------------------- # ws <- '~/desktop/viroid_project' ## ----quick_start__preparation_fastq------------------------------------------- fq <- system.file(package = 'CircSeqAlignTk', 'extdata', 'srna.fq.gz') ## ----quick_start__preparation_reference--------------------------------------- genome_seq <- system.file(package = 'CircSeqAlignTk', 'extdata', 'FR851463.fa') ## ----quick_start__triming----------------------------------------------------- library(R.utils) library(Rbowtie2) adapter <- 'AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC' # decompressed the gzip file for trimming to avoid errors from `remove_adapters` gunzip(fq, destname=file.path(ws, 'srna.fq'), overwrite = TRUE, remove = FALSE) trimmed_fq <- file.path(ws, 'srna_trimmed.fq') params <- '--maxns 1 --trimqualities --minquality 30 --minlength 21 --maxlength 24' remove_adapters(file1 = file.path(ws, 'srna.fq'), adapter1 = adapter, adapter2 = NULL, output1 = trimmed_fq, params, basename = file.path(ws, 'AdapterRemoval.log'), overwrite = TRUE) ## ----quick_start__alignment--------------------------------------------------- ref_index <- build_index(input = genome_seq, output = file.path(ws, 'index')) aln <- align_reads(input = trimmed_fq, index = ref_index, output = file.path(ws, 'align_results')) ## ----quick_start__summary----------------------------------------------------- alncov <- calc_coverage(aln) head(get_slot_contents(alncov, 'forward')) # alignment coverage in forward strand head(get_slot_contents(alncov, 'reversed')) # alignment coverage in reversed strand ## ----quickStartVisualization, fig.cap='Alignment coverage. The alignment coverage of the case study.'---- plot(alncov) ## ----packageImplementation, fig.cap='Two-stage alignment process. Overview of the two-stage alignment process and the related functions in the CircSeqAlignTk package', echo=FALSE, fig.wide=TRUE---- knitr::include_graphics('overview.png') ## ----implementation__build_index---------------------------------------------- genome_seq <- system.file(package='CircSeqAlignTk', 'extdata', 'FR851463.fa') ref_index <- build_index(input = genome_seq, output = file.path(ws, 'index')) ## ----implementation__build_index_output--------------------------------------- str(ref_index) ## ----implementation__build_index_output_refseq-------------------------------- get_slot_contents(ref_index, 'fasta') ## ----implementation__build_index_output_index--------------------------------- get_slot_contents(ref_index, 'index') ## ----access_slot_content, eval=FALSE------------------------------------------ # slot(ref_index, 'fasta') # slot(ref_index, 'index') # # ref_index@fasta # ref_index@index ## ----implementation__build_index_cutloci-------------------------------------- get_slot_contents(ref_index, 'cut_loc') ## ----implementation__build_ht2index, eval=FALSE------------------------------- # ref_ht2index <- build_index(input = genome_seq, # output = file.path(ws, 'ht2index'), # aligner = 'hisat2') ## ----implementation__align_read----------------------------------------------- fq <- system.file(package='CircSeqAlignTk', 'extdata', 'srna.fq.gz') # trimming the adapter sequences if needed before alignment, omitted here. aln <- align_reads(input = fq, index = ref_index, output = file.path(ws, 'align_results')) ## ----implementation__align_read_output---------------------------------------- str(aln) ## ----implementation__align_read_output_bam------------------------------------ get_slot_contents(aln, 'bam') get_slot_contents(aln, 'clean_bam') ## ----implementation__align_read_stats----------------------------------------- get_slot_contents(aln, 'stats') ## ----implementation__align_read__mismatch------------------------------------- aln <- align_reads(input = fq, index = ref_index, output = file.path(ws, 'align_results'), n_mismatch = 0) ## ----implementation__align_read__threads-------------------------------------- aln <- align_reads(input = fq, index = ref_index, output = file.path(ws, 'align_results'), n_threads = 4) ## ----implementation__align_read__add_params, eval=FALSE----------------------- # aln <- align_reads(input = fq, # index = ref_index, # output = file.path(ws, 'align_results'), # add_args = '-L 20 -N 1') ## ----implementation__align_read__hisat2, eval=FALSE--------------------------- # aln <- align_reads(input = fq, # index = ref_ht2index , # output = file.path(ws, 'align_results'), # aligner = 'hisat2') ## ----implementation__summary-------------------------------------------------- alncov <- calc_coverage(aln) ## ----implementation__summary_fwd---------------------------------------------- head(get_slot_contents(alncov, 'forward')) head(get_slot_contents(alncov, 'reversed')) ## ----implementationVisualization, fig.cap='Alignment coverage.'--------------- plot(alncov) ## ----implementationVisualizationReadlen, fig.cap='Alignment coverage of reads with the specific lengths.'---- plot(alncov, read_lengths = c(21, 22)) ## ----implementationVisualizationOption1, fig.cap='Alignment coverage arranged with ggplot2.'---- library(ggplot2) plot(alncov) + facet_grid(strand ~ read_length, scales = 'free_y') ## ----implementationVisualizationOption2, fig.cap='Alignment coverage represented in polar coordinate system.'---- plot(alncov) + coord_polar() ## ----sim__default_params------------------------------------------------------ sim <- generate_reads(output = file.path(ws, 'synthetic_reads.fq.gz')) ## ----sim__default_params_str-------------------------------------------------- str(sim) ## ----sim__default_params_peak------------------------------------------------- head(get_slot_contents(sim, 'peak')) ## ----sim__default_params_readinfo--------------------------------------------- dim(get_slot_contents(sim, 'read_info')) head(get_slot_contents(sim, 'read_info')) ## ----simDefaultParamsCoverageFig, fig.cap='Alignment coverage of the synthetic data.'---- alncov <- get_slot_contents(sim, 'coverage') head(get_slot_contents(alncov, 'forward')) head(get_slot_contents(alncov, 'reversed')) plot(alncov) ## ----sim__args_n-------------------------------------------------------------- sim <- generate_reads(n = 1e3, output = file.path(ws, 'synthetic_reads.fq.gz')) ## ----sim__args_seq------------------------------------------------------------ genome_seq <- 'CGGAACTAAACTCGTGGTTCCTGTGGTTCACACCTGACCTCCTGACAAGAAAAGAAAAAAGAAGGCGGCTCGGAGGAGCGCTTCAGGGATCCCCGGGGAAACCTGGAGCGAACTGGCAAAAAAGGACGGTGGGGAGTGCCCAGCGGCCGACAGGAGTAATTCCCGCCGAAACAGGGTTTTCACCCTTCCTTTCTTCGGGTGTCCTTCCTCGCGCCCGCAGGACCACCCCTCGCCCCCTTTGCGCTGTCGCTTCGGCTACTACCCGGTGGAAACAACTGAAGCTCCCGAGAACCGCTTTTTCTCTATCTTACTTGCTCCGGGGCGAGGGTGTTTAGCCCTTGGAACCGCAGTTGGTTCCT' sim <- generate_reads(seq = genome_seq, output = file.path(ws, 'synthetic_reads.fq.gz')) ## ----sim__args_adapter-------------------------------------------------------- adapter <- 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA' sim <- generate_reads(adapter = adapter, output = file.path(ws, 'synthetic_reads.fq.gz'), read_length = 150) ## ----sim__args_adapter_NA----------------------------------------------------- sim <- generate_reads(adapter = NA, output = file.path(ws, 'synthetic_reads.fq.gz'), read_length = 150) ## ----sim__args_mismatch_1----------------------------------------------------- sim <- generate_reads(output = file.path(ws, 'synthetic_reads.fq.gz'), mismatch_prob = 0.05) ## ----sim__args_mismatch_2----------------------------------------------------- sim <- generate_reads(output = file.path(ws, 'synthetic_reads.fq.gz'), mismatch_prob = c(0.05, 0.1)) ## ----simSetLengthAndPeaksScripts---------------------------------------------- peaks <- data.frame( mean = c( 0, 25, 70, 90, 150, 240, 260, 270, 330, 350), std = c( 5, 5, 5, 5, 10, 5, 5, 1, 2, 1), strand = c( '+', '+', '-', '-', '+', '+', '-', '+', '+', '-'), prob = c(0.10, 0.10, 0.18, 0.05, 0.03, 0.18, 0.15, 0.10, 0.06, 0.05) ) srna_length <- data.frame( length = c( 21, 22, 23, 24), prob = c(0.45, 0.40, 0.10, 0.05) ) sim <- generate_reads(n = 1e4, output = file.path(ws, 'synthetic_reads.fq.gz'), srna_length = srna_length, peaks = peaks) ## ----simSetLengthAndPeaks, fig.cap='Alignment coverage of the synthetic data.'---- plot(get_slot_contents(sim, 'coverage')) ## ----simMergeMultiSimObjectsScripts------------------------------------------- peaks_1 <- data.frame( mean = c( 100, 150, 250, 300), std = c( 5, 5, 5, 5), strand = c( '+', '+', '-', '-'), prob = c(0.25, 0.25, 0.40, 0.05) ) srna_length_1 <- data.frame( length = c( 21, 22), prob = c(0.45, 0.65) ) sim_1 <- generate_reads(n = 1e4, output = file.path(ws, 'synthetic_reads_1.fq.gz'), srna_length = srna_length_1, peaks = peaks_1) peaks_2 <- data.frame( mean = c( 50, 200, 300), std = c( 5, 5, 5), strand = c( '+', '-', '+'), prob = c(0.80, 0.10, 0.10) ) srna_length_2 <- data.frame( length = c( 21, 22, 23), prob = c(0.10, 0.10, 0.80) ) sim_2 <- generate_reads(n = 1e3, output = file.path(ws, 'synthetic_reads_2.fq.gz'), srna_length = srna_length_2, peaks = peaks_2) peaks_3 <- data.frame( mean = c( 80, 100, 220, 270), std = c( 5, 5, 1, 2), strand = c( '-', '+', '+', '-'), prob = c( 0.20, 0.30, 0.20, 0.30) ) srna_length_3 <- data.frame( length = c( 19, 20, 21, 22), prob = c(0.30, 0.30, 0.20, 0.20) ) sim_3 <- generate_reads(n = 5e3, output = file.path(ws, 'synthetic_reads_3.fq.gz'), srna_length = srna_length_3, peaks = peaks_3) # merge the three data sets sim <- merge(sim_1, sim_2, sim_3, output = file.path(ws, 'synthetic_reads.fq.gz')) ## ----simMergeMultiSimObjects, fig.cap='Alignment coverage of the synthetic data.'---- plot(get_slot_contents(sim, 'coverage')) ## ----simeval__chk_workflow_align---------------------------------------------- sim <- generate_reads(adapter = NA, mismatch_prob = 0, output = file.path(ws, 'synthetic_reads.fq.gz')) genome_seq <- system.file(package='CircSeqAlignTk', 'extdata', 'FR851463.fa') ref_index <- build_index(input = genome_seq, output = file.path(ws, 'index')) aln <- align_reads(input = file.path(ws, 'synthetic_reads.fq.gz'), index = ref_index, output = file.path(ws, 'align_results')) alncov <- calc_coverage(aln) ## ----simeval__chk_workflow_rmse----------------------------------------------- # coverage of reads in forward strand fwd_pred <- get_slot_contents(alncov, 'forward') fwd_true <- get_slot_contents(get_slot_contents(sim, 'coverage'), 'forward') sqrt(sum((fwd_pred - fwd_true) ^ 2) / length(fwd_true)) # coverage of reads in reversed strand rev_pred <- get_slot_contents(alncov, 'reversed') rev_true <- get_slot_contents(get_slot_contents(sim, 'coverage'), 'reversed') sqrt(sum((rev_pred - rev_true) ^ 2) / length(rev_true)) ## ----simeval_chk_workflow_rmse0, echo = FALSE--------------------------------- if ((sum((fwd_pred - fwd_true) ^ 2) / length(fwd_true)) != 0) stop() if ((sum((rev_pred - rev_true) ^ 2) / length(rev_true)) != 0) stop() ## ----simeval__sim1_syn-------------------------------------------------------- sim <- generate_reads(mismatch_prob = c(0.1, 0.2), output = file.path(ws, 'synthetic_reads.fq')) ## ----simeval__sim1_align------------------------------------------------------ library(R.utils) library(Rbowtie2) # quality control params <- '--maxns 1 --trimqualities --minquality 30 --minlength 21 --maxlength 24' remove_adapters(file1 = file.path(ws, 'synthetic_reads.fq'), adapter1 = get_slot_contents(sim, 'adapter'), adapter2 = NULL, output1 = file.path(ws, 'srna_trimmed.fq'), params, basename = file.path(ws, 'AdapterRemoval.log'), overwrite = TRUE) # alignment genome_seq <- system.file(package='CircSeqAlignTk', 'extdata', 'FR851463.fa') ref_index <- build_index(input = genome_seq, output = file.path(ws, 'index')) aln <- align_reads(input = file.path(ws, 'srna_trimmed.fq'), index = ref_index, output = file.path(ws, 'align_results'), n_mismatch = 2) # calculate alignment coverage alncov <- calc_coverage(aln) ## ----simeval__sim1_rmse------------------------------------------------------- # coverage of reads in forward strand fwd_pred <- get_slot_contents(alncov, 'forward') fwd_true <- get_slot_contents(get_slot_contents(sim, 'coverage'), 'forward') sqrt(sum((fwd_pred - fwd_true) ^ 2) / length(fwd_true)) # coverage of reads in reversed strand rev_pred <- get_slot_contents(alncov, 'reversed') rev_true <- get_slot_contents(get_slot_contents(sim, 'coverage'), 'reversed') sqrt(sum((rev_pred - rev_true) ^ 2) / length(rev_true)) ## ----simeval__sim2------------------------------------------------------------ library(R.utils) library(Rbowtie2) params <- '--maxns 1 --trimqualities --minquality 30 --minlength 21 --maxlength 24' genome_seq <- system.file(package='CircSeqAlignTk', 'extdata', 'FR851463.fa') ref_index <- build_index(input = genome_seq, output = file.path(ws, 'index')) fwd_rmse <- rev_rmse <- rep(NA, 10) for (i in seq(fwd_rmse)) { # prepare file names and directory to store the simulation results simset_dpath <- file.path(ws, paste0('sim_tries_', i)) dir.create(simset_dpath) syn_fq <- file.path(simset_dpath, 'synthetic_reads.fq') trimmed_syn_fq <- file.path(simset_dpath, 'srna_trimmed.fq') align_result <- file.path(simset_dpath, 'align_results') fig_coverage <- file.path(simset_dpath, 'alin_coverage.png') # generate synthetic reads set.seed(i) sim <- generate_reads(mismatch_prob = c(0.1, 0.2), output = syn_fq) # quality control remove_adapters(file1 = syn_fq, adapter1 = get_slot_contents(sim, 'adapter'), adapter2 = NULL, output1 = trimmed_syn_fq, params, basename = file.path(ws, 'AdapterRemoval.log'), overwrite = TRUE) # alignment aln <- align_reads(input = trimmed_syn_fq, index = ref_index, output = align_result, n_mismatch = 2) # calculate alignment coverage alncov <- calc_coverage(aln) # calculate RMSE fwd_pred <- get_slot_contents(alncov, 'forward') fwd_true <- get_slot_contents(get_slot_contents(sim, 'coverage'), 'forward') fwd_rmse[i] <- sqrt(sum((fwd_pred - fwd_true) ^ 2) / length(fwd_true)) rev_pred <- get_slot_contents(alncov, 'reversed') rev_true <- get_slot_contents(get_slot_contents(sim, 'coverage'), 'reversed') rev_rmse[i] <- sqrt(sum((rev_pred - rev_true) ^ 2) / length(rev_true)) } ## ----simeval__sim2_rmse------------------------------------------------------- rmse <- data.frame(forward = fwd_rmse, reversed = rev_rmse) rmse ## ----tutorial_viroid__preparation--------------------------------------------- library(utils) project_dpath <- tempdir() dir.create(project_dpath) options(timeout = 60 * 10) download.file(url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=U23058&rettype=fasta&retmode=text', destfile = file.path(project_dpath, 'U23058.fa')) download.file(url = 'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE70166&format=file', destfile = file.path(project_dpath, 'GSE70166.tar')) untar(file.path(project_dpath, 'GSE70166.tar'), exdir = project_dpath) ## ----tutorial_viorid__alignment----------------------------------------------- genome_seq <- file.path(project_dpath, 'U23058.fa') fq <- file.path(project_dpath, 'GSM1717894_PSTVd_RG1.txt.gz') ref_index <- build_index(input = genome_seq, output = file.path(project_dpath, 'index')) aln <- align_reads(input = fq, index = ref_index, output = file.path(project_dpath, 'align_results')) ## ----tutorial_viroid__alignment_result, echo = FALSE-------------------------- x <- as.matrix(get_slot_contents(aln, 'stats')) ## ----tutorial_viroid__alignment_stats----------------------------------------- get_slot_contents(aln, 'stats') ## ----tutorial_viroid__alignment_stats_calcov---------------------------------- alncov <- calc_coverage(aln) ## ----tutorialViroidCoverage, fig.cap='Alignment coverage of small RNA-Seq data obtained from the viroid-infected tomato plants.'---- head(get_slot_contents(alncov, 'forward')) head(get_slot_contents(alncov, 'reversed')) plot(alncov) ## ----------------------------------------------------------------------------- sessionInfo()