### R code from vignette source 'Introduction.Rnw'

###################################################
### code chunk number 1: setup
###################################################
options(width=90)


###################################################
### code chunk number 2: biocLite (eval = FALSE)
###################################################
## source("http://bioconductor.org/biocLite.R")
## biocLite(c("ShortRead", "VariantAnnotation")) # new packages
## biocLite()                                    # update packages


###################################################
### code chunk number 3: help-start (eval = FALSE)
###################################################
## help.start()


###################################################
### code chunk number 4: help (eval = FALSE)
###################################################
## library(ShortRead)
## ?readFastq


###################################################
### code chunk number 5: S4
###################################################
library(Biostrings)
showMethods(complement)


###################################################
### code chunk number 6: S4-showMethods (eval = FALSE)
###################################################
## showMethods(class="DNAStringSet", where=getNamespace("Biostrings"))


###################################################
### code chunk number 7: S4-help (eval = FALSE)
###################################################
## class ? DNAStringSet
## method ? "complement,DNAStringSet"


###################################################
### code chunk number 8: vignette (eval = FALSE)
###################################################
## vignette(package="useR2013")


###################################################
### code chunk number 9: scavenge (eval = FALSE)
###################################################
## ??readFastq
## library(Biostrings)
## ?alphabetFrequency
## class?GappedAlignments
## vignette(package="GenomicRanges")


### R code from vignette source 'SequencesAndRanges.Rnw'

###################################################
### code chunk number 1: setup
###################################################
options(width=90)
library(useR2013)


###################################################
### code chunk number 2: GRanges-genes
###################################################
genes <- GRanges(seqnames=c("3R", "X"),
                 ranges=IRanges(
                     start=c(19967117, 18962306),
                     end=c(19973212, 18962925)),
                 strand=c("+", "-"),
                 seqlengths=c(`3R`=27905053L, `X`=22422827L))


###################################################
### code chunk number 3: GRanges-display
###################################################
genes


###################################################
### code chunk number 4: GRanges-help (eval = FALSE)
###################################################
## ?GRanges


###################################################
### code chunk number 5: GRanges-vignettes (eval = FALSE)
###################################################
## vignette(package="GenomicRanges")


###################################################
### code chunk number 6: ranges-ir
###################################################
ir <- IRanges(start=c(7, 9, 12, 14, 22:24),
              end=c(15, 11, 12, 18, 26, 27, 28))


###################################################
### code chunk number 7: ranges-ir-plot
###################################################
png("ranges-ir-plot.png", width=800, height=160)
plotRanges(ir, xlim=c(5, 35), main="Original")
dev.off()
png("ranges-shift-ir-plot.png", width=800, height=160)
plotRanges(shift(ir, 5), xlim=c(5, 35), main="Shift")
dev.off()
png("ranges-reduce-ir-plot.png", width=800, height=160)
plotRanges(reduce(ir), xlim=c(5, 35), main="Reduce")
dev.off()


###################################################
### code chunk number 8: GRanges-mcols
###################################################
mcols(genes) <- DataFrame(EntrezId=c("42865", "2768869"),
                          Symbol=c("kal-1", "CG34330"))


###################################################
### code chunk number 9: GRanges-metadata
###################################################
metadata(genes) <-
    list(CreatedBy="A. User", Date=date())


###################################################
### code chunk number 10: GRangesList-eg-setup
###################################################
library(TxDb.Dmelanogaster.UCSC.dm3.ensGene)
txdb <- TxDb.Dmelanogaster.UCSC.dm3.ensGene # alias
fbgn <- exonsBy(txdb, "gene")["FBgn0039155"]
seqlevels(fbgn) <- "chr3R"


###################################################
### code chunk number 11: GRangesList-eg
###################################################
fbgn


###################################################
### code chunk number 12: txdb
###################################################
library(TxDb.Dmelanogaster.UCSC.dm3.ensGene)
txdb <- TxDb.Dmelanogaster.UCSC.dm3.ensGene # alias
ex0 <- exonsBy(txdb, "gene")
head(table(elementLengths(ex0)))
ids <- c("FBgn0002183", "FBgn0003360", "FBgn0025111", "FBgn0036449")
ex <- reduce(ex0[ids])


###################################################
### code chunk number 13: gc-genome
###################################################
library(BSgenome.Dmelanogaster.UCSC.dm3)
nm <- as.character(unique(seqnames(ex[[1]])))
chr <- Dmelanogaster[[nm]]
v <- Views(chr, start=start(ex[[1]]), end=end(ex[[1]]))


###################################################
### code chunk number 14: gcFunction-genome
###################################################
gcFunction(v)


###################################################
### code chunk number 15: gcFunction-definition
###################################################
gcFunction


### R code from vignette source 'ReadsAndAlignments.Rnw'

###################################################
### code chunk number 1: setup
###################################################
options(width=90)
library(useR2013)


###################################################
### code chunk number 2: fastq-format
###################################################
fls <- dir(file.path(bigdata(), "fastq"), full=TRUE)
cat(noquote(readLines(fls[[1]], 4)), sep="\n")


###################################################
### code chunk number 3: ascii
###################################################
cat(rawToChar(as.raw(32+1:47)),
    rawToChar(as.raw(32+48:94)), sep="\n")


###################################################
### code chunk number 4: readFastq
###################################################
fastqDir <- file.path(bigdata(), "fastq")
fastqFiles <- dir(fastqDir, full=TRUE)
fq <- readFastq(fastqFiles[1])
fq


###################################################
### code chunk number 5: sread
###################################################
head(sread(fq), 3)
head(quality(fq), 3)


###################################################
### code chunk number 6: width-ShortReadQ
###################################################
abc <- alphabetByCycle(sread(fq))
abc[1:4, 1:8]


###################################################
### code chunk number 7: FastqSampler
###################################################
sampler <- FastqSampler(fastqFiles[1], 1000000)
yield(sampler) # sample of 1000000 reads


###################################################
### code chunk number 8: qa (eval = FALSE)
###################################################
## ## Bioc 2.13 only; see ?qa for Bioc 2.12
## qas <- qa(fastqFiles, type="fastq")
## rpt <- report(qas, dest=tempfile())
## browseURL(rpt)


###################################################
### code chunk number 9: report (eval = FALSE)
###################################################
## rpt <- system.file("GSM461176_81_qa_report", "index.html",
##                    package="useR2013")
## browseURL(rpt)


###################################################
### code chunk number 10: fastq-discovery
###################################################
dir(bigdata())
fls <- dir(file.path(bigdata(), "fastq"), full=TRUE)


###################################################
### code chunk number 11: fastq-input-gc
###################################################
rm(fq); invisible(gc())


###################################################
### code chunk number 12: fastq-input
###################################################
fq <- readFastq(fls[1])


###################################################
### code chunk number 13: gcC
###################################################
alf0 <- alphabetFrequency(sread(fq), as.prob=TRUE, collapse=TRUE)
sum(alf0[c("G", "C")])


###################################################
### code chunk number 14: gc-reads
###################################################
gc <- gcFunction(sread(fq))
hist(gc)


###################################################
### code chunk number 15: abc
###################################################
abc <- alphabetByCycle(sread(fq))
matplot(t(abc[c("A", "C", "G", "T"),]), type="l")


###################################################
### code chunk number 16: abc-mclapply (eval = FALSE)
###################################################
## library(parallel)
## gc0 <- mclapply(fls, function(fl) {
##   fq <- readFastq(fl)
##   gc <- gcFunction(sread(fq))
##   table(cut(gc, seq(0, 1, .05)))
## })
## ## simplify list of length 2 to 2-D array
## gc <- simplify2array(gc0)
## matplot(gc, type="s")


###################################################
### code chunk number 17: SAM
###################################################
fl <- system.file("extdata", "ex1.sam", package="Rsamtools")
strsplit(readLines(fl, 1), "\t")[[1]]


###################################################
### code chunk number 18: readGappedAlignments
###################################################
alnFile <- system.file("extdata", "ex1.bam", package="Rsamtools")
aln <- readGappedAlignments(alnFile)
head(aln, 3)


###################################################
### code chunk number 19: GappedAlignments-accessors
###################################################
table(strand(aln))
table(width(aln))
head(sort(table(cigar(aln)), decreasing=TRUE))


###################################################
### code chunk number 20: bam-ex-fls
###################################################
fls <- dir(file.path(bigdata(), "bam"), ".bam$", full=TRUE)
names(fls) <- sub("_.*", "", basename(fls))


###################################################
### code chunk number 21: bam-ex-input
###################################################
## input
aln <- readGappedAlignments(fls[1])
xtabs(~seqnames + strand, as.data.frame(aln))


###################################################
### code chunk number 22: bam-ex-roi
###################################################
data(ex)             # from an earlier exercise


###################################################
### code chunk number 23: bam-ex-strand
###################################################
strand(aln) <- "*"   # protocol not strand-aware


###################################################
### code chunk number 24: bam-ex-hits
###################################################
hits <- findOverlaps(aln, ex)


###################################################
### code chunk number 25: qhits
###################################################
qhits <- countQueryHits(hits)
table(qhits)


###################################################
### code chunk number 26: qhits-keep
###################################################
keep <- which(qhits == 1)


###################################################
### code chunk number 27: bam-ex-cnt
###################################################
cnt <- countSubjectHits(hits[queryHits(hits) %in% keep])


###################################################
### code chunk number 28: bam-count-fun
###################################################
counter <-
    function(filePath, range)
{
    hits <- findOverlaps(aln, ex)
    keep <- which(countQueryHits(hits) == 1)
    cnts <- countSubjectHits(hits[queryHits(hits) %in% keep])
    setNames(cnts, names(ex))
}


###################################################
### code chunk number 29: bam-count-all
###################################################
counts <- sapply(fls, counter, ex)
counts


###################################################
### code chunk number 30: bam-count-mclapply (eval = FALSE)
###################################################
## if (require(parallel))
##     simplify2array(mclapply(fls, counter, ex))


###################################################
### code chunk number 31: gc-read
###################################################
param <- ScanBamParam(what="seq")
seqs <- scanBam(fls[[1]], param=param)
readGC <- gcFunction(seqs[[1]][["seq"]])
hist(readGC)


### R code from vignette source 'RNASeq.Rnw'

###################################################
### code chunk number 1: setup
###################################################
options(width=90)
library(useR2013)


###################################################
### code chunk number 2: counts
###################################################
data(counts)
dim(counts)
grps <- factor(sub("[1-4].*", "", colnames(counts)),
               levels=c("untreated", "treated"))
pairs <- factor(c("single", "paired", "paired",
                  "single", "single", "paired", "paired"))
pData <- data.frame(Group=grps, PairType=pairs,
                    row.names=colnames(counts))


###################################################
### code chunk number 3: DGEList
###################################################
library(edgeR)
dge <- DGEList(counts, group=pData$Group)
dge <- calcNormFactors(dge)


###################################################
### code chunk number 4: DEGList-filter
###################################################
m <- sweep(dge$counts, 2, 1e6 / dge$samples$lib.size, `*`)
ridx <- rowSums(m > 1) >= 2
table(ridx)                        # number filtered / retained
dge <- dge[ridx,]


###################################################
### code chunk number 5: design
###################################################
(design <- model.matrix(~ Group, pData))


###################################################
### code chunk number 6: common.dispersion
###################################################
dge <- estimateTagwiseDisp(dge)
mean(sqrt(dge$tagwise.dispersion))


###################################################
### code chunk number 7: glmFit
###################################################
fit <- glmFit(dge, design)


###################################################
### code chunk number 8: lrt
###################################################
lrTest <- glmLRT(fit, coef=2)


###################################################
### code chunk number 9: topTags
###################################################
tt <- topTags(lrTest, n=10)
tt[1:3,]


###################################################
### code chunk number 10: sanity
###################################################
sapply(rownames(tt$table)[1:4],
       function(x) tapply(counts[x,], pData$Group, mean))


### R code from vignette source 'ChIPSeq.Rnw'

###################################################
### code chunk number 1: setup
###################################################
options(width=90)
library(useR2013)


###################################################
### code chunk number 2: chipseq-report
###################################################
rpt <- system.file("GSE30263_qa_report", "index.html",
           package="useR2013", mustWork=TRUE)
if (interactive())
    browseURL(rpt)


###################################################
### code chunk number 3: chipseq-halfpeak-stamFile
###################################################
stamFile <- system.file("data", "stam.Rda", package="useR2013")
load(stamFile)
stam


###################################################
### code chunk number 4: chipseq-stam
###################################################
head(colData(stam), 3)
head(rowData(stam), 3)
xtabs(~Replicate + CellLine, colData(stam))[,1:5]


###################################################
### code chunk number 5: chipseq-stam-detect
###################################################
m <- assays(stam)[["Tags"]] > 0 # peaks detected...
peaksPerSample <- table(rowSums(m))
head(peaksPerSample)
tail(peaksPerSample)


###################################################
### code chunk number 6: chipseq-stam-similarity-1
###################################################
library(bioDist)                   # for cor.dist
m <- asinh(assays(stam)[["Tags"]]) # transformed tag counts
d <- cor.dist(t(m))                # correlation distance
h <- hclust(d)                     # hierarchical clustering


###################################################
### code chunk number 7: chipseq-stam-similarity-plot (eval = FALSE)
###################################################
## plot(h, cex=.8, ann=FALSE)


###################################################
### code chunk number 8: chipseq-stam-similarity
###################################################
png("chipseq-stam-similarity.png", width=1280)
opar <- par(mar=c(0, 0, 0, 0))
plot(h, axes=FALSE, ann=FALSE)
par(opar)
invisible(dev.off())


###################################################
### code chunk number 9: chipseq-CTCF-PWM-setup
###################################################
library(Biostrings)
library(BSgenome.Hsapiens.UCSC.hg19)
library(seqLogo)
library(lattice)
pwm <- getJASPAR("MA0139.1") # useR2013::getJASPAR


###################################################
### code chunk number 10: chipseq-CTCF-PWM-binding
###################################################
chrid <- "chr1"
hits <-matchPWM(pwm, Hsapiens[[chrid]]) # '+' strand
scores <- PWMscoreStartingAt(pwm, subject(hits), start(hits))


###################################################
### code chunk number 11: chipseq-CTCF-PWM-densityplot (eval = FALSE)
###################################################
## densityplot(scores, xlim=range(scores), pch="|")


###################################################
### code chunk number 12: CTCF-PWM-found
###################################################
cm <- consensusMatrix(hits)[1:4,]
seqLogo(makePWM(scale(cm, FALSE, colSums(cm))))


### R code from vignette source 'Annotation.Rnw'

###################################################
### code chunk number 1: setup
###################################################
options(width=90)
library(useR2013)


###################################################
### code chunk number 2: select
###################################################
cols(org.Dm.eg.db)
keytypes(org.Dm.eg.db)
uniprotKeys <- head(keys(org.Dm.eg.db, keytype="UNIPROT"))
cols <- c("SYMBOL", "PATH")
select(org.Dm.eg.db, keys=uniprotKeys, cols=cols, keytype="UNIPROT")


###################################################
### code chunk number 3: select-kegg
###################################################
kegg <- select(org.Dm.eg.db, "00310", c("UNIPROT", "SYMBOL"), "PATH")
nrow(kegg)
head(kegg, 3)


###################################################
### code chunk number 4: chipseq-anno-data
###################################################
stamFile <- system.file("data", "stam.Rda", package="useR2013")
load(stamFile)


###################################################
### code chunk number 5: chipseq-anno-common
###################################################
ridx <- rowSums(assays(stam)[["Tags"]] > 0) == ncol(stam)
peak <- rowData(stam)[ridx]


###################################################
### code chunk number 6: chipseq-anno-centers
###################################################
peak <- resize(peak, width=1, fix="center")


###################################################
### code chunk number 7: chipseq-anno-tss
###################################################
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
tx <- transcripts(TxDb.Hsapiens.UCSC.hg19.knownGene)
tss <- resize(tx, width=1)


###################################################
### code chunk number 8: chipseq-anno-tss-dist
###################################################
idx <- nearest(peak, tss)
sgn <- as.integer(ifelse(strand(tss)[idx] == "+", 1, -1))
dist <- (start(peak) - start(tss)[idx]) * sgn


###################################################
### code chunk number 9: chipseq-anno-tss-dist
###################################################
bound <- 1000
ok <- abs(dist) < bound
dist <- dist[ok]
table(sign(dist))


###################################################
### code chunk number 10: anno-tss-disthist
###################################################
griddensityplot <-
    function(...)
    ## 'panel' function to plot a grid underneath density
{
    panel.grid()
    panel.densityplot(...)
}
print(densityplot(dist[ok], plot.points=FALSE,
    panel=griddensityplot,
    xlab="Distance to Nearest TSS"))


###################################################
### code chunk number 11: tss-dist-func
###################################################
distToTss <-
    function(peak, tx)
{
    peak <- resize(peak, width=1, fix="center")
    tss <- resize(tx, width=1)
    idx <- nearest(peak, tss)
    sgn <- as.numeric(ifelse(strand(tss)[idx] == "+", 1, -1))
    (start(peak) - start(tss)[idx]) * sgn
}


###################################################
### code chunk number 12: chipseq-anno-seq
###################################################
library(BSgenome.Hsapiens.UCSC.hg19)
ridx <- rowSums(assays(stam)[["Tags"]] > 0) == ncol(stam)
ridx <- ridx & (seqnames(rowData(stam)) == "chr6")
pk6 <- rowData(stam)[ridx]
seqs <- getSeq(Hsapiens, pk6)
head(seqs, 3)


###################################################
### code chunk number 13: chipseq-tss-dist-2
###################################################
pwm <- useR2013::getJASPAR("MA0139.1")
hits <- lapply(seqs, matchPWM, pwm=pwm)
hasPwmMatch <- sapply(hits, length) > 0
dist <- distToTss(pk6, tx)

ok <- abs(dist) < bound
df <- data.frame(Distance = dist[ok], HasPwmMatch = hasPwmMatch[ok])
print(densityplot(~Distance, group=HasPwmMatch, df,
    plot.points=FALSE, panel=griddensityplot,
    auto.key=list(
      columns=2,
      title="Has Position Weight Matrix?",
      cex.title=1),
    xlab="Distance to Nearest Tss"))


###################################################
### code chunk number 14: readVcf
###################################################
library(VariantAnnotation)
fl <- system.file("extdata", "chr22.vcf.gz",
                  package="VariantAnnotation")
(hdr <- scanVcfHeader(fl))
info(hdr)[c("VT", "RSQ"),]


###################################################
### code chunk number 15: readVcf
###################################################
(vcf <- readVcf(fl, "hg19"))
head(rowData(vcf), 3)


###################################################
### code chunk number 16: renameSeqlevels
###################################################
rowData(vcf) <- renameSeqlevels(rowData(vcf), c("22"="ch22"))


###################################################
### code chunk number 17: dbSNP
###################################################
library(SNPlocs.Hsapiens.dbSNP.20101109)
snpFilt <- useR2013::dbSNPFilter("SNPlocs.Hsapiens.dbSNP.20101109")
inDbSNP <- snpFilt(vcf)
table(inDbSNP)


###################################################
### code chunk number 18: SNP-quality
###################################################
metrics <-
    data.frame(inDbSNP=inDbSNP, RSQ=info(vcf)$RSQ)


###################################################
### code chunk number 19: RSQ-plot
###################################################
library(ggplot2)
ggplot(metrics, aes(RSQ, fill=inDbSNP)) +
    geom_density(alpha=0.5) +
    scale_x_continuous(name="MaCH / Thunder Imputation Quality") +
    scale_y_continuous(name="Density") +
    theme(legend.position="top")