%\VignetteEngine{knitr::knitr} The objective of this workflow is to detect known *Pythium* species in several samples collected from soybean fields in Pennsylvania. *Pythium* is a eukaryotic microorganism that can help or harm plants depending on the species. Many *Pythium* species cause root rot in specific plants. However, some species of *Pythium* are used as biological control agents to prevent the growth of pathogens on crops. This workflow parallels an analysis of the same datasets performed by Coffua *et al.* in the journal Plant Disease (2016).

Here the Cytochrome c oxidase subunit 1 (COI) gene is used as a phylogenetic marker to identify species. The COI gene is part of the mitochondrial genome of all eukaryotes. In part #1 of this workflow, the goal is to design optimal primers to differentiate all of the *Pythium* species using the COI gene. The first step is to download the COI gene sequences of known *Pythium* species from the Internet and import them into a sequence database, as shown below. ```{r, results="hide"} # all paths are relative to the installed datasets data_dir <- system.file("extdata", package="BigBioSeqData") library(DECIPHER) # Create a connection to an on-disk SQLite database dbConn <- dbConnect(SQLite(), "./COIgenes.sqlite") # path to new database file # Import sequences from a GenBank formatted file Seqs2DB(paste(data_dir, "/Pythium_spp_COI.gb", sep=""), type="GenBank", dbFile=dbConn, identifier="Pythium") # View the database table that was constructed BrowseDB(dbConn) # Retrieve the imported sequences dna <- SearchDB(dbConn) dna # Align the sequences based on their translations DNA <- AlignTranslation(dna) DNA # Display the sequences in a web browser BrowseSeqs(DNA) # show differences with the first sequence BrowseSeqs(DNA, highlight=1) # show differences with the consensus sequence BrowseSeqs(DNA, highlight=0) # change the degree of consensus BrowseSeqs(DNA, highlight=0, threshold=0.2) # note the pattern common to most sequences pattern <- DNAStringSet("TAGATTTAGCWATTTTTAGTTTACA") BrowseSeqs(DNA, patterns=pattern) # The protein sequences are very similar AA <- AlignTranslation(dna, asAAStringSet=TRUE) BrowseSeqs(AA, highlight=1) # Choose a reference for frameshift correction REF <- translate(dna[11]) # sequence #11 # Correct the frameshift in sequence #12 correct <- CorrectFrameshifts(myXStringSet=dna[12], myAAStringSet=REF, type="both") correct dna[12] <- correct$sequence # Sequence #11 is now identical to #12 DNA <- AlignTranslation(dna) BrowseSeqs(DNA, highlight=11) # Identify clusters for primer design d <- DistanceMatrix(DNA) dim(d) # a symmetric matrix c <- IdClusters(d, method="UPGMA", cutoff=0.05, show=TRUE) head(c) # cluster numbers # Identify sequences by cluster name in the database Add2DB(data.frame(identifier=paste("cluster", c$cluster, sep="")), dbConn) BrowseDB(dbConn) # Design primers for next-generation sequencing primers <- DesignSignatures(dbConn, type="sequence", resolution=5, levels=5, minProductSize=400, maxProductSize=800, annealingTemp=55, maxPermutations=8) primers[1,] # the top scoring primer set # Highlight the primers' target sites BrowseSeqs(DNA, patterns=c(DNAStringSet(primers[1, 1]), reverseComplement(DNAStringSet(primers[1, 2])))) ``` Part #2 of the workflow uses sequences of the COI gene that were obtained from several locations in Pennsylvania. These DNA sequences are stored in FASTQ format along with their corresponding quality scores. After importing, the first step is to trim the sequences so that only the high quality center region remains. The subset of sequences that might belong to *Pythium* species will be identified by the presence of a conserved region common all *Pythium*. This analysis will be performed in batches so that all of the sequences do not need to fit in memory simultaneously. ```{r, results="hide"} # Import from the compressed FASTQ sequence files path <- paste(data_dir, "/FASTQ/", sep="") files <- list.files(path) samples <- substring(files, first=1, last=nchar(files) - 6) for (i in seq_along(files)) { cat(samples[i], ":\n", sep="") Seqs2DB(paste(path, files[i], sep=""), type="FASTQ", dbFile=dbConn, identifier=samples[i], tblName="Reads") } # Function for determining boundaries # of the high-quality central region bounds <- function(probs, thresh=0.001, width=21) { # Calculate a moving average padding <- floor(width/2) probs <- c(rep(thresh, padding), probs, rep(thresh, padding)) probs <- filter(probs, rep(1/width, width)) # Find region above the threshold w <- which(probs < thresh) - padding if (length(w)==0) w <- NA return(c(w[1], w[length(w)])) } # Trim the sequences by quality and identify # the subset belonging to the Pythium genus nSeqs <- SearchDB(dbConn, tbl="Reads", count=TRUE, verbose=FALSE) offset <- 0 ends <- starts <- counts <- integer(nSeqs) fprimer <- DNAString("TCAWCWMGATGGCTTTTTTCAAC") rprimer <- DNAString("") pBar <- txtProgressBar(max=nSeqs, style=3) while (offset < nSeqs) { # Select a batch of sequences dna <- SearchDB(dbConn, tbl="Reads", type="QualityScaledXStringSet", limit=paste(offset, 1e4, sep=","), verbose=FALSE) # Convert quality scores to error probabilities probs <- as(quality(dna), "NumericList") endpoints <- sapply(probs, bounds) # Store the results for later use index <- (offset + 1):(offset + length(dna)) starts[index] <- ifelse(endpoints[1,] >= 38L, endpoints[1,], 38L) # first base after the forward primer ends[index] <- ifelse(endpoints[2,] >= starts[index], endpoints[2,], starts[index] - 1L) # no high quality bases # Find the pattern expected in Pythium sequences counts[index] <- vcountPattern(pattern[[1]], subject=dna, max.mismatch=4, with.indels=TRUE, fixed="subject") # allow ambiguities offset <- offset + 1e4 setTxtProgressBar(pBar, ifelse(offset > nSeqs, nSeqs, offset)) } # Add the results to new columns in the database results <- data.frame(start=starts, end=ends, count=counts) Add2DB(results, dbFile=dbConn, tblName="Reads", verbose=FALSE) BrowseDB(dbConn, tblName="Reads", limit=1000) # Cluster the reads in each sample by percent identity for (i in seq_along(samples)) { cat(samples[i]) # Select moderately long sequences dna <- SearchDB(dbConn, tblName="Reads", identifier=samples[i], clause="count > 0 and (end - start + 1) >= 100", verbose=FALSE) cat(":", length(dna), "sequences") # Trim the sequences to the high-quality region index <- as.numeric(names(dna)) dna <- subseq(dna, start=starts[index], end=ends[index]) # Cluster the sequences without a distance matrix clusters <- IdClusters(myXStringSet=dna, method="inexact", cutoff=0.03, # > 97% identity verbose=FALSE) # Add the cluster numbers to the database Add2DB(clusters, dbFile=dbConn, tblName="Reads", verbose=FALSE) cat(",", length(unique(clusters[, 1])), "clusters\n") } # Now the database contains a column of clusters BrowseDB(dbConn, tblName="Reads", limit=1000, clause="cluster is not NULL") ``` In part #3 of the workflow, representatives from each sequence cluster are compared to known *Pythium* species. The goal of this analysis is to identify which organisms present in each sample are similar to known species. The known species are separated into two groups: those that are used as biocontrol agents (good strains) and those that are known to be plant pathogens (bad strains). ```{r, results="hide"} ids <- IdentifyByRank(dbConn, add2tbl=TRUE) lens <- IdLengths(dbConn, add2tbl=TRUE) BrowseDB(dbConn) # separate Pythium strains into good and bad groups biocontrol <- c('Pythium oligandrum', 'Pythium nunn', 'Pythium periplocum') pathogen <- c('Pythium acanthicum', # strawberries: 'Pythium rostratum', 'Pythium middletonii', 'Pythium aristosporum', # grasses/cereals: 'Pythium graminicola', 'Pythium okanoganense', 'Pythium paddicum', 'Pythium volutum', 'Pythium arrhenomanes', 'Pythium buismaniae', # flowers: 'Pythium spinosum', 'Pythium mastophorum', 'Pythium splendens', 'Pythium violae', # carrots: 'Pythium paroecandrum', 'Pythium sulcatum', 'Pythium dissotocum', # potatoes: 'Pythium scleroteichum', 'Pythium myriotylum', 'Pythium heterothallicum', # lettuce: 'Pythium tracheiphilum', 'Pythium ultimum', # multiple plants: 'Pythium irregulare', 'Pythium aphanidermatum', 'Pythium debaryanum', 'Pythium sylvaticum') # Select the longest sequence from each species species <- SearchDB(dbConn, nameBy="identifier", clause=paste("identifier in (", paste("'", c(biocontrol, pathogen), "'", sep="", collapse=", "), ") group by identifier having max(bases)", sep="")) # Select the longest sequence in each cluster dna <- SearchDB(dbConn, identifier="DauphinFarm", # choose a sample tblName="Reads", clause="cluster is not null group by cluster having max(end - start)") # Trim to the high quality central region index <- as.numeric(names(dna)) dna <- subseq(dna, start=starts[index], end=ends[index]) # Create a tree with known and unknown species combined <- AlignSeqs(c(dna, species)) dists <- DistanceMatrix(combined, verbose=FALSE, correction="Jukes-Cantor") tree <- IdClusters(dists, method="NJ", # Neighbor joining asDendrogram=TRUE, verbose=FALSE) plot(tree, nodePar=list(lab.cex=0.5, pch=NA)) # Color known species based on their pathogenicity tree_colored <- dendrapply(tree, function(x) { if (is.leaf(x)) { if (attr(x, "label") %in% pathogen) { attr(x, "edgePar") <- list(col="red") } else if (attr(x, "label") %in% biocontrol) { attr(x, "edgePar") <- list(col="green") } # remove the label attr(x, "label") <- "" } return(x) }) plot(tree_colored) # Disconnect from the sequence database dbDisconnect(dbConn) # permanently delete the database unlink("./COIgenes.sqlite") # optional! ```