### R code from vignette source 'A13_LargeData.Rnw'

###################################################
### code chunk number 1: setup
###################################################
library(SequenceAnalysisData)
library(RNAseqData.HNRNPC.bam.chr14)
library(TxDb.Dmelanogaster.UCSC.dm3.ensGene)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(Rsamtools)
library(ShortRead)
library(parallel); options(mc.cores=detectCores())
library(lattice)


###################################################
### code chunk number 2: io-sketch (eval = FALSE)
###################################################
## ## not evaluated
## colClasses <-
##   c("NULL", "integer", "numeric", "NULL")
## df <- read.table("myfile", colClasses=colClasses)


###################################################
### code chunk number 3: iterative
###################################################
x <- runif(100000); x2 <- x^2
m <- matrix(x2, nrow=1000); y <- rowSums(m)


###################################################
### code chunk number 4: preallocate-and-fill-sketch (eval = FALSE)
###################################################
## ## not evaluated
## result <- numeric(nrow(df))
## for (i in seq_len(nrow(df)))
##   result[[i]] <- some_calc(df[i,])


###################################################
### code chunk number 5: unnecessary-names
###################################################
unlist(list(a=1:2)) # name 'a' becomes 'a1', 'a2'
unlist(list(a=1:2), use.names=FALSE)   # no names


###################################################
### code chunk number 6: inappropriate-functions-2 (eval = FALSE)
###################################################
## ## not evaluated
## library(limma) # microarray linear models
## fit <- lmFit(eSet, design)


###################################################
### code chunk number 7: algo-poly
###################################################
x <- 1:100; s <- sample(x, 10)
inS <- x %in% s


###################################################
### code chunk number 8: system.time
###################################################
m <- matrix(runif(200000), 20000)
system.time(apply(m, 1, sum))


###################################################
### code chunk number 9: rbenchmark
###################################################
library(rbenchmark)
f0 <- function(x) apply(x, 1, sum)
f1 <- function(x) rowSums(x)
benchmark(f0(m), f1(m), 
          columns=c("test", "elapsed", "relative"), 
          replications=5)


###################################################
### code chunk number 10: identical
###################################################
res1 <- apply(m, 1, sum)
res2 <- rowSums(m)
identical(res1, res2)
identical(c(1, -1), c(x=1, y=-1))
all.equal(c(1, -1), c(x=1, y=-1),
          check.attributes=FALSE)


###################################################
### code chunk number 11: readGAlignments-restriciton
###################################################
fls <- RNAseqData.HNRNPC.bam.chr14_BAMFILES # 8 BAM files
bamfls <- BamFileList(fls, yieldSize=500000) # yieldSize can be larger
gal <- readGAlignments(bamfls[[1]])
head(gal, 3)
param <- ScanBamParam(what="seq")  ## also input sequence
gal <- readGAlignments(bamfls[[1]], param=param)
head(mcols(gal)$seq)


###################################################
### code chunk number 12: restriction-what
###################################################
param <- ScanBamParam(what=c("rname", "pos", "cigar"))


###################################################
### code chunk number 13: restriction-which
###################################################
library(TxDb.Dmelanogaster.UCSC.dm3.ensGene)
exByGn <- exonsBy(TxDb.Dmelanogaster.UCSC.dm3.ensGene, "gene")
seqlevels(exByGn, force=TRUE) <- "chr3L"
gns <- unlist(range(exByGn))
param <- ScanBamParam(which=gns)


###################################################
### code chunk number 14: restrictions-several
###################################################
param <- ScanBamParam(what=c("rname", "pos", "cigar"), which=gns)


###################################################
### code chunk number 15: bigdata
###################################################
bigdata <- system.file("bigdata", package="SequenceAnalysisData")
bamfls <- dir(file.path(bigdata, "bam"), ".bam$", full=TRUE) #$
names(bamfls) <- sub("_.*", "", basename(bamfls))


###################################################
### code chunk number 16: scanBam-restriction
###################################################
param <- ScanBamParam(what="rname")
bam <- scanBam(bamfls[1], param=param)[[1]]
table(bam$rname)


###################################################
### code chunk number 17: GC-sampling
###################################################
gcFunction <- 
    function(x)
{
    alf <- alphabetFrequency(x, as.prob=TRUE)
    rowSums(alf[,c("G", "C")])
}


###################################################
### code chunk number 18: fastq-files
###################################################
bigdata <- system.file("bigdata", package="SequenceAnalysisData")
fqfl <- dir(file.path(bigdata, "fastq"), ".fastq$", full=TRUE) #$


###################################################
### code chunk number 19: fastq-sampling
###################################################
sampler <- FastqSampler(fqfl, 100000)
fq <- yield(sampler)                    # 100,000 reads
lattice::densityplot(gcFunction(sread(fq)), plot.points=FALSE)
fq <- yield(sampler)                    # a different 100,000 reads


###################################################
### code chunk number 20: fastq-paired-end (eval = FALSE)
###################################################
## ## NOT RUN
## set.seed(123)
## end1 <- yield(FastqSampler("end_1.fastq"))
## set.seed(123)
## end2 <- yield(FastqSampler("end_2.fastq"))


###################################################
### code chunk number 21: readLines-chunks (eval = FALSE)
###################################################
## ## NOT RUN
## con <- file("<hypothetical-file>.txt")
## open(f)
## while (length(x <- readLines(f, n=10000))) {
##     ## work on character vector 'x'
## }
## close(f)


###################################################
### code chunk number 22: iteration-bam
###################################################
library(RNAseqData.HNRNPC.bam.chr14)
bamfl <- RNAseqData.HNRNPC.bam.chr14_BAMFILES[1]
countBam(bamfl)
bf <- BamFile(bamfl, yieldSize=200000) # could be larger, e.g., 2 million


###################################################
### code chunk number 23: BamFile-iter (eval = FALSE)
###################################################
## ## initialize, e.g., for step 3 ...
## open(bf)
## while (length(gal <- readGAlignments(bf))) {
##     ## step 2: do work...
##     ## step 3: aggregate results...
## }
## close(bf)


###################################################
### code chunk number 24: counter-iter
###################################################
counter <- 
    function(query, subject, ..., ignore.strand=TRUE)
    ## query: GRanges or GRangesList
    ## subject: GAlignments
{
    if (ignore.strand)
        strand(subject) <- "*"
    hits <- countOverlaps(subject, query)
    countOverlaps(query, subject[hits==1])
}


###################################################
### code chunk number 25: counter-roi
###################################################
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
query <- exonsBy(TxDb.Hsapiens.UCSC.hg19.knownGene, "gene")


###################################################
### code chunk number 26: counter-work (eval = FALSE)
###################################################
## ## initialize, e.g., for step 3 ...
## open(bf)
## while (length(gal <- readGAlignments(bf))) {
##     ## step 2: do work...
##     count0 <- counter(query, gal, ignore.strand=TRUE)
##     ## step 3: aggregate results...
## }
## close(bf)


###################################################
### code chunk number 27: counter-work
###################################################
counter1 <- 
    function(bf, query, ...)
{
    ## initialize, e.g., for step 3 ...
    counts <- integer(length(query))
    open(bf)
    while (length(gal <- readGAlignments(bf, ...))) {
        ## step 2: do work...
        count0 <- counter(query, gal, ignore.strand=TRUE)
        ## step 3: aggregate results...
        counts <- counts + count0
    }
    close(bf)
    counts
}


###################################################
### code chunk number 28: counter1
###################################################
bf <- BamFile(bamfl, yieldSize=500000)
counts <- counter1(bf, query)


###################################################
### code chunk number 29: count-bfl-make
###################################################
fls <- RNAseqData.HNRNPC.bam.chr14_BAMFILES # 8 BAM files
bamfls <- BamFileList(fls, yieldSize=500000) # yieldSize can be larger


###################################################
### code chunk number 30: count-bams (eval = FALSE)
###################################################
## counts <- simplify2array(lapply(bamfls, counter1, query))


###################################################
### code chunk number 31: count-bams-mclapply
###################################################
options(mc.cores=detectCores())         # use all cores
counts <- simplify2array(mclapply(bamfls, counter1, query))
head(counts[rowSums(counts) != 0,], 3)