############################################################################## ### ### Exercise 1 ### ========== ### Before you can load a Bioconductor package, you need to install it with: ### source("http://bioconductor.org/biocLite.R") ### biocLite("Biostrings") ### The biocLite() script will take care of installing any other BioC or CRAN ### package that the package you are trying to install depends on. library(Biostrings) library(BSgenome) available.genomes() library(BSgenome.Hsapiens.UCSC.hg18) Hsapiens Hsapiens$chr1 ############################################################################## ### ### Exercise 2 ### ========== library(BSgenome.Celegans.UCSC.ce2) Celegans$chrI class(Celegans$chrI) # a DNAString object length(Celegans$chrI) alphabetFrequency(Celegans$chrI) s1 <- subseq(Celegans$chrI, end=20) # extracts the first 20 nucleotides ### See ?subseq for more information. ### Note that subseq() doesn't copy the extracted sequence data in memory ### so subseq() is very fast and very efficient even when extracting a ### substring made of millions of nucleotides (e.g. from Human chr1). reverseComplement(s1) ############################################################################## ### ### Exercise 3 ### ========== library(hgu95av2probe) hgu95av2probe class(hgu95av2probe) colnames(hgu95av2probe) hgu95av2probe$sequence[1:5] dict0 <- DNAStringSet(hgu95av2probe) dict0 length(dict0) width(dict0)[1:10] # 'width(dict0)' is a very long integer vector! dict0[-2] rev(dict0) dict0[[1]] DNAStringSet(dict0, end=-3) # remove last 2 nucleotides DNAStringSet(dict0, start=-10) # keep the last 10 nucleotides af0 <- alphabetFrequency(dict0) dim(af0) head(af0) alphabetFrequency(dict0, collapse=TRUE) alphabetFrequency(reverseComplement(dict0), collapse=TRUE) sum(af0[,"C"] + af0[,"G"] >= 0.8 * width(dict0)) # only 1 probe ap0 <- alphabetFrequency(dict0, collapse=TRUE, as.prob=TRUE, baseOnly=TRUE) ap0["C"] + ap0["G"] # GC content for the entire microarray ############################################################################## ### ### Exercise 4 ### ========== library(BSgenome.Celegans.UCSC.ce2) chrI <- Celegans$chrI vI <- Views(chrI, start=c(5, 300, 20), width=50) vI subject(vI) start(vI) end(vI) gaps(vI) alphabetFrequency(vI) DNAStringSet(vI) ############################################################################## ### ### Exercise 5 ### ========== library(BSgenome.Hsapiens.UCSC.hg18) Hsapiens$chr1 unmasked(Hsapiens$chr1) ############################################################################## ### ### Exercise 6 ### ========== chrY <- Hsapiens$chrY chrY # See the 1st value in the maskedratio col for the % # of the sequence that is masked by the AGAPS mask. # => 55.6% of this assembly of Human chromosome Y is made # of assembly gaps! # Use maskedratio(masks(chrY))[1] to extract that information # programmatically. alphabetFrequency(chrY, baseOnly=TRUE) # all the Ns are masked alphabetFrequency(unmasked(chrY), as.prob=TRUE)["N"] as(chrY , "Views") gaps(as(chrY , "Views")) ### Lengths of the assembly gaps: width(gaps(as(chrY , "Views"))) ############################################################################## ### ### Exercise 7 ### ========== library(BSgenome) available.genomes() library(BSgenome.Celegans.UCSC.ce2) Celegans seqlengths(Celegans) # from the examples in ?Celegans ############################################################################## ### ### Exercise 8 ### ========== library(BSgenome.Celegans.UCSC.ce2) chrI <- Celegans$chrI pattern <- DNAString("TTACCGATTTCA") matchPattern(pattern, chrI) # hits in the plus strand matchPattern(reverseComplement(pattern), chrI) # hits in the minus strand ### Look at the examples in ?reverseComplement for a discussion about why ### reverse complementing the pattern and not the subject is generally better. matchPattern(pattern, chrI, max.mismatch=1) matchPattern(pattern, chrI, max.mismatch=2, with.indels=TRUE) ############################################################################## ### ### Exercise 9 ### ========== library(BSgenome.Hsapiens.UCSC.hg18) upstream1000 <- Hsapiens$upstream1000 pattern <- DNAString("TTACCGATTTCA") mi <- vmatchPattern(pattern, upstream1000) mi ### Extract the indices of the upstream1000 elements that received ### at least 1 hit: upstream1000_idx <- which(countIndex(mi) != 0) upstream1000[upstream1000_idx] ############################################################################## ### ### Exercise 10 ### =========== library(BSgenome.Celegans.UCSC.ce2) chrI <- Celegans$chrI pattern <- DNAString("GAACTTTGCCAC") matchPattern(pattern, chrI) # no match pattern2 <- DNAString("GAACNNNGCCAC") matchPattern(pattern2, chrI) # no match matchPattern(pattern2, chrI, fixed=FALSE) # 27 matches! ############################################################################## ### ### Exercise 11 ### =========== library(hgu95av2probe) dict0 <- DNAStringSet(hgu95av2probe) pdict0 <- PDict(dict0) pdict0 library(BSgenome.Hsapiens.UCSC.hg18) chr1 <- unmasked(Hsapiens$chr1) mi <- matchPDict(pdict0, chr1) mi ci <- countIndex(mi) ### Probes with the higher number of hits: dict0_idx <- which(ci == max(ci)) dict0[dict0_idx] Views(chr1, mi[[dict0_idx[1]]]) # showing hits for probe number dict0_idx[1] matchPattern(dict0[[dict0_idx[1]]], chr1) # same result ### Hits in the minus strand: rcdict0 <- reverseComplement(dict0) rcpdict0 <- PDict(rcdict0) rcmi <- matchPDict(rcpdict0, chr1) rcci <- countIndex(rcmi) ### Total count of hits: totalci <- ci + rcci tci <- table(totalci) tci ### Plot that table in logarithmic scale: plot(x=1.00+as.numeric(names(tci)), y=tci, log="xy", xlab="nb of hits", ylab="nb of probes") ### With up to 2 mismatching nucleotides in the last 12 nucleotides of ### each probe: pdict2 <- PDict(dict0, tb.end=-13) pdict2 mi2 <- matchPDict(pdict2, chr1, max.mismatch=2) ci2 <- countIndex(mi2) dict0_idx2 <- which(ci2 == max(ci2)) dict0[dict0_idx2] v <- Views(chr1, mi2[[dict0_idx2[1]]]) ### See how mismatches are distributed along the hits: consensusMatrix(v) consensusMatrix(v)[DNA_BASES, ] # lots of mismatches! consensusString(v) dict0[[dict0_idx2[1]]] == DNAString(consensusString(v)) # TRUE rcpdict2 <- PDict(rcdict0, tb.start=13) rcpdict2 rcmi2 <- matchPDict(rcpdict2, chr1, max.mismatch=2) rcci2 <- countIndex(rcmi2) rcdict0_idx2 <- which(rcci2 == max(rcci2)) dict0[rcdict0_idx2] vv <- Views(chr1, rcmi2[[rcdict0_idx2[1]]]) consensusMatrix(vv)[DNA_BASES, ] totalci2 <- ci2 + rcci2 tci2 <- table(totalci2) tci2 plot(x=1.00+as.numeric(names(tci2)), y=tci2, log="xy", xlab="nb of hits", ylab="nb of probes") ############################################################################## ### ### Exercise 12 ### =========== library(GenomicFeatures) # for extractTranscriptsFromGenome() library(GenomicFeatures.Hsapiens.UCSC.hg18) # load the gene table genes <- geneHuman() library(BSgenome.Hsapiens.UCSC.hg18) # load the genome ### Takes about 30 sec.: transcripts <- extractTranscriptsFromGenome(Hsapiens, genes) transcripts hg18tr_af <- alphabetFrequency(transcripts, collapse=TRUE) hg18_af <- sapply(seqnames(Hsapiens), function(nm) alphabetFrequency(unmasked(Hsapiens[[nm]]))) p1 <- hg18tr_af / sum(hg18tr_af) p1 p2 <- rowSums(hg18_af) / sum(as.numeric(hg18_af)) p2 ### This shows that the transcriptome is more CG rich than the genome: p1 > p2 library(hgu95av2probe) ### We count the total number of hits in the transcriptome ### for each the first 50 probes: sapply(1:50, function(i) sum(vcountPattern(hgu95av2probe$sequence[i], transcripts))) ### For the last question in this exercise, we need to use vmatchPattern() ### instead of vcountPattern() because we need to get the locations of the ### hits. We use probe 50: mi50 <- vmatchPattern(hgu95av2probe$sequence[50], transcripts) refstarts50 <- unlist(transcriptLocs2refLocs(startIndex(mi50), exonStarts=genes$exonStarts, exonEnds=genes$exonEnds, strand=genes$strand, reorder.exons.on.minus.strand=TRUE)) refends50 <- unlist(transcriptLocs2refLocs(endIndex(mi50), exonStarts=genes$exonStarts, exonEnds=genes$exonEnds, strand=genes$strand, reorder.exons.on.minus.strand=TRUE)) refends50 - refstarts50 ### The above tells us 2 things: ### 1. The fact that 'refends50' is < 'refstarts50' tells us that the 7 hits ### of probe 50 are in transcripts that belong to the minus strand (this ### is confirmed by 'genes$strand[countIndex(mi50) != 0]') ### 2. Once translated back into reference-based hits, their width is still ### 25 bases. This means that probe 50 is not a "junction" probe i.e. a ### probe that spans 2 exons in a transcript. ### Here is a little function that tries to detect whether a probe is a ### junction probe by using the naive approach above: isJunctionProbe <- function(probe, transcripts, genes) { if (is.character(probe)) probe <- DNAString(probe) mi <- vmatchPattern(probe, transcripts) refstarts <- unlist(transcriptLocs2refLocs(startIndex(mi), exonStarts=genes$exonStarts, exonEnds=genes$exonEnds, strand=genes$strand, reorder.exons.on.minus.strand=TRUE)) refends <- unlist(transcriptLocs2refLocs(endIndex(mi), exonStarts=genes$exonStarts, exonEnds=genes$exonEnds, strand=genes$strand, reorder.exons.on.minus.strand=TRUE)) any(abs(refends - refstarts) + 1L != length(probe)) } ### Searching for junction probes (stop after the 1st is found). ### Note that this will take several minutes... for (i in seq_len(length(hgu95av2probe$sequence))) { cat("Probe ", i, ": ", sep="") ok <- isJunctionProbe(hgu95av2probe$sequence[i], transcripts, genes) cat(ok, "\n") if (ok) break } ### And the winner is... probe 87! mi87 <- vmatchPattern(hgu95av2probe$sequence[87], transcripts) tidx87 <- countIndex(mi87) != 0 # index of transcripts with hits tstarts87 <- startIndex(mi87)[tidx87] tends87 <- endIndex(mi87)[tidx87] genes87 <- genes[tidx87, ] refstarts87 <- unlist(transcriptLocs2refLocs(tstarts87, exonStarts=genes87$exonStarts, exonEnds=genes87$exonEnds, strand=genes87$strand, reorder.exons.on.minus.strand=TRUE)) refends87 <- unlist(transcriptLocs2refLocs(tends87, exonStarts=genes87$exonStarts, exonEnds=genes87$exonEnds, strand=genes87$strand, reorder.exons.on.minus.strand=TRUE)) refends87 - refstarts87 # Probe 87 hits a transcript that is on the plus # strand. Once translated back into a reference-based # hit, its width is > 25 nucleotides. This means # it is a "junction" probe. ############################################################################## ### ### Exercise 13 ### =========== ### IMPORTANT NOTE: You need Biostrings >= 2.14.7 for this exercise. This ### version of the package fixes a problem in matchPDict()/countPDict() ### that was causing an error when trying to solve this exercise. ### Biostrings 2.14.7 will be available on Fri 20 Nov around noon (Seattle ### time). Update your installation with the standard method: ### source("http://bioconductor.org/biocLite.R") ### update.packages(repos=biocinstallRepos(), ask=FALSE) ### Sorry for the inconvenience... library(Biostrings) library(hgu95av2probe) dict0 <- DNAStringSet(hgu95av2probe) ### Using 'algorithm="ACtree"' is a temporary workaround. It won't be ### necessary anymore when the default algo (ACtree2) will support IUPAC ### ambiguities in the subject. pdict0 <- PDict(dict0, algorithm="ACtree") library(BSgenome.Hsapiens.UCSC.hg18) library(SNPlocs.Hsapiens.dbSNP.20080617) hg18snp <- injectSNPs(Hsapiens, 'SNPlocs.Hsapiens.dbSNP.20080617') hg18snp chr1snps <- hg18snp$chr1 alphabetFrequency(chr1snps) # lots of IUPAC extended letters (1 extended # letter per SNP!) alphabetFrequency(Hsapiens$chr1) # no IUPAC extended letters (beside the Ns) ### Make sure that the N-blocks are masked by checking that the AGAPS ### and AMB masks are active: active(masks(chr1snps)) ### Then call countPDict() with fixed="pattern". countPDict() supports masks ### so it will skip the N-blocks in the subject: nhits <- countPDict(pdict0, chr1snps, fixed="pattern") max(nhits) # 11498 ### Let's compare this result with what we get with the original chr1 ### (no SNPs): nhits0 <- countPDict(pdict0, unmasked(Hsapiens$chr1)) max(nhits0) # 11261 all(nhits >= nhits0) # we get more hits after SNP injection which(nhits == max(nhits)) # probe 201534 probe201534 <- dict0[[which.max(nhits)]] probe201534_hits <- matchPattern(probe201534, chr1snps, fixed="pattern") probe201534_hits consensusMatrix(probe201534_hits) ii <- which(probe201534_hits != probe201534) length(ii) # probe 201534 has 784 hits that contain at least 1 known SNP ### Display these hits: probe201534_hits[ii] ############################################################################## ### ### Exercise 14 ### =========== ### This takes between 1min30 and 2min: transcripts2 <- extractTranscriptsFromGenome(hg18snp, genes) ### Nb of SNPs in the transcriptome: hg18tr_af2 <- alphabetFrequency(transcripts2, collapse=TRUE) sum((hg18tr_af - hg18tr_af2)[1:4]) # 776939 ### NOTES: ### ### - Because the same exon can belong to several transcripts, the same SNP ### in the genome can also appear in several transcripts. So the "nb of ### SNPs in the transcriptome" is not the same as the "nb of SNPs in the ### genome that belong to at least one transcript". ### ### - The method we used to determine the nb of SNPs is making the assumption ### that the original reference genome has a non-ambiguous letter (ACGT) at ### each SNP location *before* the injection the SNP. Even if this is true ### most of the times, there are however a few exceptions. In a separate ### analysis (not shown here) we found 20 exceptions among the 10 million ### SNPs stored in SNPlocs.Hsapiens.dbSNP.20080617. So the method we used ### to determine the nb of SNPs in the transcriptome is not 100% accurate. ### We could use the same method to find the number of SNPs in the genome: hg18_af2 <- sapply(seqnames(Hsapiens), function(nm) alphabetFrequency(unmasked(hg18snp[[nm]]))) sum((rowSums(hg18_af) - rowSums(hg18_af2))[1:4]) # 10727966 ### Finally, we determine the nb of SNPs in the genome that belong to at ### least one transcript (i.e. to at least one exon): makeExonViews <- function(subject, exon_starts, exon_ends) { starts <- strsplit(paste(exon_starts, collapse=","), ",", fixed=TRUE)[[1]] ends <- strsplit(paste(exon_ends, collapse=","), ",", fixed=TRUE)[[1]] ranges <- IRanges(as.integer(starts), as.integer(ends)) masks(subject) <- NULL Views(subject, ranges) } hg18ex_af <- sapply(seqnames(Hsapiens), function(nm) alphabetFrequency( reduce( makeExonViews(Hsapiens[[nm]], genes$exonStarts[genes$chrom==nm], genes$exonEnds[genes$chrom==nm]) ), collapse=TRUE)) hg18ex_af2 <- sapply(seqnames(hg18snp), function(nm) alphabetFrequency( reduce( makeExonViews(hg18snp[[nm]], genes$exonStarts[genes$chrom==nm], genes$exonEnds[genes$chrom==nm]) ), collapse=TRUE)) ### Nb of SNPs in the genome that belong to at least one exon: sum((rowSums(hg18ex_af) - rowSums(hg18ex_af2))[1:4]) # 339688 SNPs ### This is 3.17% of all known SNPs.