## ---- results="hide"----------------------------------------------------- # all paths are relative to the installed datasets data_dir <- system.file("extdata", package="BigBioSeqData") library(DECIPHER) # Create a connection to an on-disk SQLite database dbConn <- dbConnect(SQLite(), "./COIgenes.sqlite") # path to new database file # Import sequences from a GenBank formatted file Seqs2DB(paste(data_dir, "/Pythium_spp_COI.gb", sep=""), type="GenBank", dbFile=dbConn, identifier="Pythium") # View the database table that was constructed BrowseDB(dbConn) # Retrieve the imported sequences dna <- SearchDB(dbConn) dna # Align the sequences based on their translations DNA <- AlignTranslation(dna) DNA # Display the sequences in a web browser BrowseSeqs(DNA) # show differences with the first sequence BrowseSeqs(DNA, highlight=1) # show differences with the consensus sequence BrowseSeqs(DNA, highlight=0) # change the degree of consensus BrowseSeqs(DNA, highlight=0, threshold=0.2) # note the pattern common to most sequences pattern <- DNAStringSet("TAGATTTAGCWATTTTTAGTTTACA") BrowseSeqs(DNA, patterns=pattern) # The protein sequences are very similar AA <- AlignTranslation(dna, asAAStringSet=TRUE) BrowseSeqs(AA, highlight=1) # Choose a reference for frameshift correction REF <- translate(dna[11]) # sequence #11 # Correct the frameshift in sequence #12 correct <- CorrectFrameshifts(myXStringSet=dna[12], myAAStringSet=REF, type="both") correct dna[12] <- correct$sequence # Sequence #11 is now identical to #12 DNA <- AlignTranslation(dna) BrowseSeqs(DNA, highlight=11) # Identify clusters for primer design d <- DistanceMatrix(DNA) dim(d) # a symmetric matrix c <- IdClusters(d, method="UPGMA", cutoff=0.05, show=TRUE) head(c) # cluster numbers # Identify sequences by cluster name in the database Add2DB(data.frame(identifier=paste("cluster", c$cluster, sep="")), dbConn) BrowseDB(dbConn) # Design primers for next-generation sequencing primers <- DesignSignatures(dbConn, type="sequence", resolution=5, levels=5, minProductSize=400, maxProductSize=800, annealingTemp=55, maxPermutations=8) primers[1,] # the top scoring primer set # Highlight the primers' target sites BrowseSeqs(DNA, patterns=c(DNAStringSet(primers[1, 1]), reverseComplement(DNAStringSet(primers[1, 2])))) ## ---- results="hide"----------------------------------------------------- # Import from the compressed FASTQ sequence files path <- paste(data_dir, "/FASTQ/", sep="") files <- list.files(path) samples <- substring(files, first=1, last=nchar(files) - 6) for (i in seq_along(files)) { cat(samples[i], ":\n", sep="") Seqs2DB(paste(path, files[i], sep=""), type="FASTQ", dbFile=dbConn, identifier=samples[i], tblName="Reads") } # Function for determining boundaries # of the high-quality central region bounds <- function(probs, thresh=0.001, width=21) { # Calculate a moving average padding <- floor(width/2) probs <- c(rep(thresh, padding), probs, rep(thresh, padding)) probs <- filter(probs, rep(1/width, width)) # Find region above the threshold w <- which(probs < thresh) - padding if (length(w)==0) w <- NA return(c(w[1], w[length(w)])) } # Trim the sequences by quality and identify # the subset belonging to the Pythium genus nSeqs <- SearchDB(dbConn, tbl="Reads", count=TRUE, verbose=FALSE) offset <- 0 ends <- starts <- counts <- integer(nSeqs) fprimer <- DNAString("TCAWCWMGATGGCTTTTTTCAAC") rprimer <- DNAString("") pBar <- txtProgressBar(max=nSeqs, style=3) while (offset < nSeqs) { # Select a batch of sequences dna <- SearchDB(dbConn, tbl="Reads", type="QualityScaledXStringSet", limit=paste(offset, 1e4, sep=","), verbose=FALSE) # Convert quality scores to error probabilities probs <- as(quality(dna), "NumericList") endpoints <- sapply(probs, bounds) # Store the results for later use index <- (offset + 1):(offset + length(dna)) starts[index] <- ifelse(endpoints[1,] >= 38L, endpoints[1,], 38L) # first base after the forward primer ends[index] <- ifelse(endpoints[2,] >= starts[index], endpoints[2,], starts[index] - 1L) # no high quality bases # Find the pattern expected in Pythium sequences counts[index] <- vcountPattern(pattern[[1]], subject=dna, max.mismatch=4, with.indels=TRUE, fixed="subject") # allow ambiguities offset <- offset + 1e4 setTxtProgressBar(pBar, ifelse(offset > nSeqs, nSeqs, offset)) } # Add the results to new columns in the database results <- data.frame(start=starts, end=ends, count=counts) Add2DB(results, dbFile=dbConn, tblName="Reads", verbose=FALSE) BrowseDB(dbConn, tblName="Reads", limit=1000) # Cluster the reads in each sample by percent identity for (i in seq_along(samples)) { cat(samples[i]) # Select moderately long sequences dna <- SearchDB(dbConn, tblName="Reads", identifier=samples[i], clause="count > 0 and (end - start + 1) >= 100", verbose=FALSE) cat(":", length(dna), "sequences") # Trim the sequences to the high-quality region index <- as.numeric(names(dna)) dna <- subseq(dna, start=starts[index], end=ends[index]) # Cluster the sequences without a distance matrix clusters <- IdClusters(myXStringSet=dna, method="inexact", cutoff=0.03, # > 97% identity verbose=FALSE) # Add the cluster numbers to the database Add2DB(clusters, dbFile=dbConn, tblName="Reads", verbose=FALSE) cat(",", length(unique(clusters[, 1])), "clusters\n") } # Now the database contains a column of clusters BrowseDB(dbConn, tblName="Reads", limit=1000, clause="cluster is not NULL") ## ---- results="hide"----------------------------------------------------- ids <- IdentifyByRank(dbConn, add2tbl=TRUE) lens <- IdLengths(dbConn, add2tbl=TRUE) BrowseDB(dbConn) # separate Pythium strains into good and bad groups biocontrol <- c('Pythium oligandrum', 'Pythium nunn', 'Pythium periplocum') pathogen <- c('Pythium acanthicum', # strawberries: 'Pythium rostratum', 'Pythium middletonii', 'Pythium aristosporum', # grasses/cereals: 'Pythium graminicola', 'Pythium okanoganense', 'Pythium paddicum', 'Pythium volutum', 'Pythium arrhenomanes', 'Pythium buismaniae', # flowers: 'Pythium spinosum', 'Pythium mastophorum', 'Pythium splendens', 'Pythium violae', # carrots: 'Pythium paroecandrum', 'Pythium sulcatum', 'Pythium dissotocum', # potatoes: 'Pythium scleroteichum', 'Pythium myriotylum', 'Pythium heterothallicum', # lettuce: 'Pythium tracheiphilum', 'Pythium ultimum', # multiple plants: 'Pythium irregulare', 'Pythium aphanidermatum', 'Pythium debaryanum', 'Pythium sylvaticum') # Select the longest sequence from each species species <- SearchDB(dbConn, nameBy="identifier", clause=paste("identifier in (", paste("'", c(biocontrol, pathogen), "'", sep="", collapse=", "), ") group by identifier having max(bases)", sep="")) # Select the longest sequence in each cluster dna <- SearchDB(dbConn, identifier="DauphinFarm", # choose a sample tblName="Reads", clause="cluster is not null group by cluster having max(end - start)") # Trim to the high quality central region index <- as.numeric(names(dna)) dna <- subseq(dna, start=starts[index], end=ends[index]) # Create a tree with known and unknown species combined <- AlignSeqs(c(dna, species)) dists <- DistanceMatrix(combined, verbose=FALSE, correction="Jukes-Cantor") tree <- IdClusters(dists, method="NJ", # Neighbor joining asDendrogram=TRUE, verbose=FALSE) plot(tree, nodePar=list(lab.cex=0.5, pch=NA)) # Color known species based on their pathogenicity tree_colored <- dendrapply(tree, function(x) { if (is.leaf(x)) { if (attr(x, "label") %in% pathogen) { attr(x, "edgePar") <- list(col="red") } else if (attr(x, "label") %in% biocontrol) { attr(x, "edgePar") <- list(col="green") } # remove the label attr(x, "label") <- "" } return(x) }) plot(tree_colored) # Disconnect from the sequence database dbDisconnect(dbConn) # permanently delete the database unlink("./COIgenes.sqlite") # optional!