##############################################################################
###
### Exercise 1
### ==========

### Before you can load a Bioconductor package, you need to install it with:
###   source("http://bioconductor.org/biocLite.R")
###   biocLite("Biostrings")
### The biocLite() script will take care of installing any other BioC or CRAN
### package that the package you are trying to install depends on.
library(Biostrings)

library(BSgenome)

available.genomes()

library(BSgenome.Hsapiens.UCSC.hg18)

Hsapiens

Hsapiens$chr1


##############################################################################
###
### Exercise 2
### ==========

library(BSgenome.Celegans.UCSC.ce2)

Celegans$chrI

class(Celegans$chrI)  # a DNAString object

length(Celegans$chrI)

alphabetFrequency(Celegans$chrI)

s1 <- subseq(Celegans$chrI, end=20)  # extracts the first 20 nucleotides

### See ?subseq for more information.

### Note that subseq() doesn't copy the extracted sequence data in memory
### so subseq() is very fast and very efficient even when extracting a
### substring made of millions of nucleotides (e.g. from Human chr1).

reverseComplement(s1)


##############################################################################
###
### Exercise 3
### ==========

library(hgu95av2probe)

hgu95av2probe

class(hgu95av2probe)

colnames(hgu95av2probe)

hgu95av2probe$sequence[1:5]

dict0 <- DNAStringSet(hgu95av2probe)

dict0

length(dict0)

width(dict0)[1:10]  # 'width(dict0)' is a very long integer vector!

dict0[-2]

rev(dict0)

dict0[[1]]

DNAStringSet(dict0, end=-3)  # remove last 2 nucleotides

DNAStringSet(dict0, start=-10)  # keep the last 10 nucleotides

af0 <- alphabetFrequency(dict0)
dim(af0)
head(af0)

alphabetFrequency(dict0, collapse=TRUE)

alphabetFrequency(reverseComplement(dict0), collapse=TRUE)

sum(af0[,"C"] + af0[,"G"] >= 0.8 * width(dict0))  # only 1 probe

ap0 <- alphabetFrequency(dict0, collapse=TRUE, as.prob=TRUE, baseOnly=TRUE)
ap0["C"] + ap0["G"]  # GC content for the entire microarray


##############################################################################
###
### Exercise 4
### ==========

library(BSgenome.Celegans.UCSC.ce2)

chrI <- Celegans$chrI

vI <- Views(chrI, start=c(5, 300, 20), width=50)
vI

subject(vI)
start(vI)
end(vI)
gaps(vI)

alphabetFrequency(vI)

DNAStringSet(vI)


##############################################################################
###
### Exercise 5
### ==========

library(BSgenome.Hsapiens.UCSC.hg18)

Hsapiens$chr1

unmasked(Hsapiens$chr1)


##############################################################################
###
### Exercise 6
### ==========

chrY <- Hsapiens$chrY

chrY  # See the 1st value in the maskedratio col for the %
      # of the sequence that is masked by the AGAPS mask.
      # => 55.6% of this assembly of Human chromosome Y is made
      # of assembly gaps!
      # Use maskedratio(masks(chrY))[1] to extract that information
      # programmatically.

alphabetFrequency(chrY, baseOnly=TRUE)  # all the Ns are masked
alphabetFrequency(unmasked(chrY), as.prob=TRUE)["N"]

as(chrY , "Views")
gaps(as(chrY , "Views"))

### Lengths of the assembly gaps:
width(gaps(as(chrY , "Views")))


##############################################################################
###
### Exercise 7
### ==========

library(BSgenome)

available.genomes()

library(BSgenome.Celegans.UCSC.ce2)

Celegans
seqlengths(Celegans)  # from the examples in ?Celegans


##############################################################################
###
### Exercise 8
### ==========

library(BSgenome.Celegans.UCSC.ce2)

chrI <- Celegans$chrI

pattern <- DNAString("TTACCGATTTCA")

matchPattern(pattern, chrI)  # hits in the plus strand
matchPattern(reverseComplement(pattern), chrI)  # hits in the minus strand

### Look at the examples in ?reverseComplement for a discussion about why
### reverse complementing the pattern and not the subject is generally better.

matchPattern(pattern, chrI, max.mismatch=1)

matchPattern(pattern, chrI, max.mismatch=2, with.indels=TRUE)


##############################################################################
###
### Exercise 9
### ==========

library(BSgenome.Hsapiens.UCSC.hg18)

upstream1000 <- Hsapiens$upstream1000

pattern <- DNAString("TTACCGATTTCA")

mi <- vmatchPattern(pattern, upstream1000)
mi

### Extract the indices of the upstream1000 elements that received
### at least 1 hit:
upstream1000_idx <- which(countIndex(mi) != 0)

upstream1000[upstream1000_idx]


##############################################################################
###
### Exercise 10
### ===========

library(BSgenome.Celegans.UCSC.ce2)

chrI <- Celegans$chrI

pattern <- DNAString("GAACTTTGCCAC")

matchPattern(pattern, chrI)  # no match

pattern2 <- DNAString("GAACNNNGCCAC")

matchPattern(pattern2, chrI)  # no match

matchPattern(pattern2, chrI, fixed=FALSE)  # 27 matches!


##############################################################################
###
### Exercise 11
### ===========

library(hgu95av2probe)

dict0 <- DNAStringSet(hgu95av2probe)

pdict0 <- PDict(dict0)
pdict0

library(BSgenome.Hsapiens.UCSC.hg18)

chr1 <- unmasked(Hsapiens$chr1)

mi <- matchPDict(pdict0, chr1)
mi

ci <- countIndex(mi)

### Probes with the higher number of hits:
dict0_idx <- which(ci == max(ci))
dict0[dict0_idx]

Views(chr1, mi[[dict0_idx[1]]])  # showing hits for probe number dict0_idx[1]

matchPattern(dict0[[dict0_idx[1]]], chr1)  # same result

### Hits in the minus strand:
rcdict0 <- reverseComplement(dict0)
rcpdict0 <- PDict(rcdict0)

rcmi <- matchPDict(rcpdict0, chr1)
rcci <- countIndex(rcmi)

### Total count of hits:
totalci <- ci + rcci
tci <- table(totalci)
tci

### Plot that table in logarithmic scale:
plot(x=1.00+as.numeric(names(tci)), y=tci,
     log="xy", xlab="nb of hits", ylab="nb of probes")

### With up to 2 mismatching nucleotides in the last 12 nucleotides of
### each probe:

pdict2 <- PDict(dict0, tb.end=-13)
pdict2

mi2 <- matchPDict(pdict2, chr1, max.mismatch=2)
ci2 <- countIndex(mi2)
dict0_idx2 <- which(ci2 == max(ci2))
dict0[dict0_idx2]
v <- Views(chr1, mi2[[dict0_idx2[1]]])
### See how mismatches are distributed along the hits:
consensusMatrix(v)
consensusMatrix(v)[DNA_BASES, ]  # lots of mismatches!
consensusString(v)
dict0[[dict0_idx2[1]]] == DNAString(consensusString(v))  # TRUE

rcpdict2 <- PDict(rcdict0, tb.start=13)
rcpdict2

rcmi2 <- matchPDict(rcpdict2, chr1, max.mismatch=2)
rcci2 <- countIndex(rcmi2)
rcdict0_idx2 <- which(rcci2 == max(rcci2))
dict0[rcdict0_idx2]
vv <- Views(chr1, rcmi2[[rcdict0_idx2[1]]])
consensusMatrix(vv)[DNA_BASES, ]

totalci2 <- ci2 + rcci2
tci2 <- table(totalci2)  
tci2
plot(x=1.00+as.numeric(names(tci2)), y=tci2,
     log="xy", xlab="nb of hits", ylab="nb of probes")


##############################################################################
###
### Exercise 12
### ===========

library(GenomicFeatures)  # for extractTranscriptsFromGenome()
library(GenomicFeatures.Hsapiens.UCSC.hg18)  # load the gene table
genes <- geneHuman()
library(BSgenome.Hsapiens.UCSC.hg18)  # load the genome

### Takes about 30 sec.:
transcripts <- extractTranscriptsFromGenome(Hsapiens, genes)
transcripts

hg18tr_af <- alphabetFrequency(transcripts, collapse=TRUE)

hg18_af <- sapply(seqnames(Hsapiens),
                  function(nm) alphabetFrequency(unmasked(Hsapiens[[nm]])))

p1 <- hg18tr_af / sum(hg18tr_af)
p1
p2 <- rowSums(hg18_af) / sum(as.numeric(hg18_af))
p2
### This shows that the transcriptome is more CG rich than the genome:
p1 > p2

library(hgu95av2probe)

### We count the total number of hits in the transcriptome
### for each the first 50 probes:
sapply(1:50, function(i)
  sum(vcountPattern(hgu95av2probe$sequence[i], transcripts)))

### For the last question in this exercise, we need to use vmatchPattern()
### instead of vcountPattern() because we need to get the locations of the
### hits. We use probe 50:
mi50 <- vmatchPattern(hgu95av2probe$sequence[50], transcripts)

refstarts50 <- unlist(transcriptLocs2refLocs(startIndex(mi50),
                                   exonStarts=genes$exonStarts,
                                   exonEnds=genes$exonEnds,
                                   strand=genes$strand,
                                   reorder.exons.on.minus.strand=TRUE))
refends50 <- unlist(transcriptLocs2refLocs(endIndex(mi50),
                                   exonStarts=genes$exonStarts,
                                   exonEnds=genes$exonEnds,
                                   strand=genes$strand,
                                   reorder.exons.on.minus.strand=TRUE))
refends50 - refstarts50
### The above tells us 2 things:
###   1. The fact that 'refends50' is < 'refstarts50' tells us that the 7 hits
###      of probe 50 are in transcripts that belong to the minus strand (this
###      is confirmed by 'genes$strand[countIndex(mi50) != 0]')
###   2. Once translated back into reference-based hits, their width is still
###      25 bases. This means that probe 50 is not a "junction" probe i.e. a
###      probe that spans 2 exons in a transcript.

### Here is a little function that tries to detect whether a probe is a
### junction probe by using the naive approach above:
isJunctionProbe <- function(probe, transcripts, genes)
{
    if (is.character(probe))
        probe <- DNAString(probe)
    mi <- vmatchPattern(probe, transcripts)
    refstarts <- unlist(transcriptLocs2refLocs(startIndex(mi),
                                   exonStarts=genes$exonStarts,
                                   exonEnds=genes$exonEnds,
                                   strand=genes$strand,
                                   reorder.exons.on.minus.strand=TRUE))
    refends <- unlist(transcriptLocs2refLocs(endIndex(mi),
                                   exonStarts=genes$exonStarts,
                                   exonEnds=genes$exonEnds,
                                   strand=genes$strand,
                                   reorder.exons.on.minus.strand=TRUE))
    any(abs(refends - refstarts) + 1L != length(probe))
}

### Searching for junction probes (stop after the 1st is found).
### Note that this will take several minutes...
for (i in seq_len(length(hgu95av2probe$sequence))) {
    cat("Probe ", i, ": ", sep="")
    ok <- isJunctionProbe(hgu95av2probe$sequence[i], transcripts, genes)
    cat(ok, "\n")
    if (ok)
        break
}
### And the winner is... probe 87!

mi87 <- vmatchPattern(hgu95av2probe$sequence[87], transcripts)
tidx87 <- countIndex(mi87) != 0  # index of transcripts with hits
tstarts87 <- startIndex(mi87)[tidx87]
tends87 <- endIndex(mi87)[tidx87]
genes87 <- genes[tidx87, ]

refstarts87 <- unlist(transcriptLocs2refLocs(tstarts87,
                                   exonStarts=genes87$exonStarts,
                                   exonEnds=genes87$exonEnds,
                                   strand=genes87$strand,
                                   reorder.exons.on.minus.strand=TRUE))
refends87 <- unlist(transcriptLocs2refLocs(tends87,
                                   exonStarts=genes87$exonStarts,
                                   exonEnds=genes87$exonEnds,
                                   strand=genes87$strand,
                                   reorder.exons.on.minus.strand=TRUE))
refends87 - refstarts87  # Probe 87 hits a transcript that is on the plus
                         # strand. Once translated back into a reference-based
                         # hit, its width is > 25 nucleotides. This means
                         # it is a "junction" probe.


##############################################################################
###
### Exercise 13
### ===========

### IMPORTANT NOTE: You need Biostrings >= 2.14.7 for this exercise. This
### version of the package fixes a problem in matchPDict()/countPDict()
### that was causing an error when trying to solve this exercise.
### Biostrings 2.14.7 will be available on Fri 20 Nov around noon (Seattle
### time). Update your installation with the standard method:
###   source("http://bioconductor.org/biocLite.R")
###   update.packages(repos=biocinstallRepos(), ask=FALSE)
### Sorry for the inconvenience...

library(Biostrings)
library(hgu95av2probe)
dict0 <- DNAStringSet(hgu95av2probe)

### Using 'algorithm="ACtree"' is a temporary workaround. It won't be
### necessary anymore when the default algo (ACtree2) will support IUPAC
### ambiguities in the subject.
pdict0 <- PDict(dict0, algorithm="ACtree")

library(BSgenome.Hsapiens.UCSC.hg18)

library(SNPlocs.Hsapiens.dbSNP.20080617)
hg18snp <- injectSNPs(Hsapiens, 'SNPlocs.Hsapiens.dbSNP.20080617')
hg18snp

chr1snps <- hg18snp$chr1
alphabetFrequency(chr1snps)  # lots of IUPAC extended letters (1 extended
                             # letter per SNP!)
alphabetFrequency(Hsapiens$chr1)  # no IUPAC extended letters (beside the Ns)

### Make sure that the N-blocks are masked by checking that the AGAPS
### and AMB masks are active:
active(masks(chr1snps))
### Then call countPDict() with fixed="pattern". countPDict() supports masks
### so it will skip the N-blocks in the subject:
nhits <- countPDict(pdict0, chr1snps, fixed="pattern")

max(nhits)  # 11498

### Let's compare this result with what we get with the original chr1
### (no SNPs):
nhits0 <- countPDict(pdict0, unmasked(Hsapiens$chr1))
max(nhits0)  # 11261
all(nhits >= nhits0)  # we get more hits after SNP injection

which(nhits == max(nhits))  # probe 201534 
probe201534 <- dict0[[which.max(nhits)]]
probe201534_hits <- matchPattern(probe201534, chr1snps, fixed="pattern")
probe201534_hits
consensusMatrix(probe201534_hits)
ii <- which(probe201534_hits != probe201534)
length(ii)  # probe 201534 has 784 hits that contain at least 1 known SNP
### Display these hits:
probe201534_hits[ii]


##############################################################################
###
### Exercise 14
### ===========

### This takes between 1min30 and 2min:
transcripts2 <- extractTranscriptsFromGenome(hg18snp, genes)

### Nb of SNPs in the transcriptome:
hg18tr_af2 <- alphabetFrequency(transcripts2, collapse=TRUE)
sum((hg18tr_af - hg18tr_af2)[1:4])  # 776939
### NOTES:
###
###   - Because the same exon can belong to several transcripts, the same SNP
###     in the genome can also appear in several transcripts. So the "nb of
###     SNPs in the transcriptome" is not the same as the "nb of SNPs in the
###     genome that belong to at least one transcript".
###
###   - The method we used to determine the nb of SNPs is making the assumption
###     that the original reference genome has a non-ambiguous letter (ACGT) at
###     each SNP location *before* the injection the SNP. Even if this is true
###     most of the times, there are however a few exceptions. In a separate
###     analysis (not shown here) we found 20 exceptions among the 10 million
###     SNPs stored in SNPlocs.Hsapiens.dbSNP.20080617. So the method we used
###     to determine the nb of SNPs in the transcriptome is not 100% accurate.

### We could use the same method to find the number of SNPs in the genome:
hg18_af2 <- sapply(seqnames(Hsapiens),
                   function(nm) alphabetFrequency(unmasked(hg18snp[[nm]])))
sum((rowSums(hg18_af) - rowSums(hg18_af2))[1:4])  # 10727966

### Finally, we determine the nb of SNPs in the genome that belong to at
### least one transcript (i.e. to at least one exon):
makeExonViews <- function(subject, exon_starts, exon_ends)
{
    starts <- strsplit(paste(exon_starts, collapse=","), ",", fixed=TRUE)[[1]]
    ends <- strsplit(paste(exon_ends, collapse=","), ",", fixed=TRUE)[[1]]
    ranges <- IRanges(as.integer(starts), as.integer(ends))
    masks(subject) <- NULL
    Views(subject, ranges)
}

hg18ex_af <- sapply(seqnames(Hsapiens),
                    function(nm)
                      alphabetFrequency(
                        reduce(
                          makeExonViews(Hsapiens[[nm]],
                                        genes$exonStarts[genes$chrom==nm],
                                        genes$exonEnds[genes$chrom==nm])
                          ), collapse=TRUE))

hg18ex_af2 <- sapply(seqnames(hg18snp),
                     function(nm)
                       alphabetFrequency(
                         reduce(
                           makeExonViews(hg18snp[[nm]],
                                         genes$exonStarts[genes$chrom==nm],
                                         genes$exonEnds[genes$chrom==nm])
                           ), collapse=TRUE))

### Nb of SNPs in the genome that belong to at least one exon:
sum((rowSums(hg18ex_af) - rowSums(hg18ex_af2))[1:4])  # 339688 SNPs
### This is 3.17% of all known SNPs.