## Build a DNAStringSet holding one copy of the reference sequence for the
## single range in roi per overlapping variant in x, each copy carrying that
## variant's alt letter(s).
##
## x:      variants with an `alt` column (a GRanges object returned by
##         callVariants)
## roi:    region of interest; must contain exactly one range
## genome: sequence source handed to getSeq()
## ...:    accepted but not used here
##
## Returns an empty set when no variant overlaps roi.

variantSequences <-
    function(x, roi, genome, ...)
{
    stopifnot(length(roi) == 1L)

    ## restrict to the variants overlapping the region of interest
    x <- subsetByOverlaps(x, roi)

    ## reference sequence of the region, one copy per variant
    reference <- DNAStringSet(getSeq(genome, roi))
    copies <- rep(reference, length(x))
    if (length(copies) == 0)
        return(copies)

    ## logical mask (one row per copy, one column per base) flagging the
    ## roi-relative start position of each variant
    n.bases <- max(0L, unique(width(copies)))
    mask <- matrix(FALSE, length(copies), n.bases)
    offset <- start(x) - start(roi) + 1L
    mask[cbind(seq_along(x), offset)] <- TRUE

    ## substitute alt nucleotide(s)
    ## NOTE(review): one position is flagged per variant, so this presumably
    ## expects single-nucleotide alts starting inside roi -- confirm
    replaceLetterAt(copies, mask, x$alt)
}

## Scan `subject` with every PWM in `pwms` and tabulate all hits.
##
## pwms:      named list of position weight matrices; each one is passed to
##            matchPWM() and its name labels its rows in the result
## subject:   sequence to scan; coerced to a DNAString
## min.score: minimum score forwarded to matchPWM()
## ...:       further arguments forwarded to matchPWM()
##
## Returns a DataFrame with one row per hit and columns PWM, score, start,
## end and seq (a DNAStringSet of the matched sequences).  When nothing
## matches, a zero-row DataFrame with the same columns is returned.

matchPWMs <-
    function(pwms, subject, min.score="80%", ...)
{
    subject <- as(subject, "DNAString")
    hits <- lapply(pwms, matchPWM, subject, min.score=min.score, ...)

    ## Keep every PWM that produced at least one hit.  Testing the number of
    ## hits (rather than isTRUE() on the vector of hit widths) retains PWMs
    ## that match more than once.
    n.hits <- vapply(hits, length, integer(1L))
    good.hits <- n.hits > 0L

    if (!any(good.hits))
        return(DataFrame(PWM=character(0),
                         score=numeric(0),
                         start=integer(0),
                         end=integer(0),
                         seq=DNAStringSet()))

    hits <- hits[good.hits]
    n.hits <- n.hits[good.hits]

    ## Post-calculate the scores of the hits.  pwms is subset by position so
    ## that each PWM stays paired with its own hits even if names repeat.
    scores <- Map(function(pwm, hit) {
        PWMscoreStartingAt(pwm, subject(hit), start(hit))
    }, pwms[good.hits], hits)

    DataFrame(PWM=rep(names(hits), n.hits),
              score=unlist(scores, use.names=FALSE),
              start=unlist(lapply(hits, start)),
              end=unlist(lapply(hits, end)),
              seq=do.call(c, lapply(unname(hits), as, "DNAStringSet")))
}


# given x, a GRanges object returned by callVariants, a reference genome and
# a genomic region of interest, create a PWM (actually a pfm) expressing
# the frequency of bases in that region

variantsToPWM <-
    function(x, genome, roi, ...)
{
    ## Turn the variants in x that fall within roi into sequences and hand
    ## them to PWM(); extra arguments in ... go to PWM().
    ## Mind the argument order: variantSequences() takes roi before genome.
    PWM(variantSequences(x, roi, genome), ...)
}

test.variantsToPWM <- function()
{
    print("--- test.variantsToPWM")

    ## load the fixtures into the global environment unless already present
    if (!exists("called"))
        load("called.RData", envir=.GlobalEnv)
    if (!exists("genome.5"))
        load("genome.5.RData", envir=.GlobalEnv)

    ## 11-base window on chromosome 5 used throughout this test
    region.start <- 1295225
    region.end <- 1295235
    region <- GRanges(seqnames="5", IRanges(start=region.start, end=region.end))

    test.pwm <- variantsToPWM(called, genome.5, region)
    checkEquals(dim(test.pwm), c(4,11))  # 4 bases, 11 locations
    checkTrue(all(colSums(test.pwm)==1))

    ## the motivating huang et al TERT promoter snp sits at chr5:1295228
    tbl.called <- as.data.frame(called)[, c("start", "alt", "file")]

    ## exactly 10 samples have a variant starting in this region
    tbl.region <- subset(tbl.called, start >= region.start & start <= region.end)
    sample.count <- length(unique(tbl.region$file))
    checkEquals(sample.count, 10)

    ## distribution of alt alleles at the crucial location
    snp.loc <- 1295228
    tbl.test <- subset(tbl.called, start==snp.loc)
    alt.counts <- as.list(table(tbl.test$alt))
    checkEquals(alt.counts, list(A=8,C=0,G=0,T=1))

    ## the reference base at that location must be G
    snp.range <- GRanges(seqnames="5", IRanges(snp.loc, snp.loc))
    ref.base <- toString(getSeq(genome.5, snp.range))
    checkEquals(ref.base, "G")

    ## the window starts at 1295225, so snp.loc is the 4th column of the pwm;
    ## the variants there are 8 As and 1 T, and the reference is G
    snp.column <- as.numeric(test.pwm[c("A","C","G","T"),4])
    checkEquals(snp.column, c(0.8,0,0.1,0.1))

    test.pwm

} # test.variantsToPWM
#------------------------------------------------------------------------------------------------------------------------
## Flatten the alignments of the first bestMatch entry of `match` into a
## data.frame with one row per alignment.
##
## match: object with a `bestMatch` slot (at least one element) whose first
##        element has an `aligns` slot holding the alignments
##
## Returns a data.frame with columns name, eVal, sequence, match and strand;
## an empty data.frame when there are no alignments.
MotIV.toTable <- function(match)
{
    stopifnot(length(match@bestMatch) >= 1)
    alignments <- match@bestMatch[[1]]@aligns

    ## no alignments: return the same empty data.frame as before
    if (length(alignments) == 0)
        return(data.frame(stringsAsFactors=FALSE))

    ## one single-row data.frame per alignment, bound together once at the
    ## end instead of growing the table inside a loop
    rows <- lapply(alignments, function(alignment) {
        data.frame(name=alignment@TF@name,
                   eVal=alignment@evalue,
                   sequence=alignment@sequence,
                   match=alignment@match,
                   strand=alignment@strand,
                   stringsAsFactors=FALSE)
    })

    ## unname so list names never leak into the row names
    do.call(rbind, unname(rows))

} # MotIV.toTable 
#-------------------------------------------------------------------------------
## Demo: build a PWM from the called variants in a chromosome 5 region and
## search it against the entries of `mdb` matching "jaspar" and "sapiens",
## returning the best alignments as a table.
##
## Takes no arguments; reads start.loc, end.loc, called, genome.5 and mdb
## from the calling environment, so they must be defined before the call.
demo.MotIV.search <- function()
{
    ## region of interest, built once
    roi <- GRanges(seqnames="5", IRanges(start=start.loc, end=end.loc))

    tert.promoter.pwm <- variantsToPWM(called, genome.5, roi)

    ## candidate motifs to compare against
    candidates <- as.list(query(query(mdb, "jaspar"), "sapiens"))

    x <- motifMatch(list(test=tert.promoter.pwm), candidates,
                    top=5, go=20, ge=20)
    MotIV.toTable(x)

} # demo.MotIV.search
#------------------------------------------------------------------------------------------------------------------------
rv <- function(seq)
{
    ## reverse-complement seq (converted via DNAString) and return the
    ## result as a string
    dna <- DNAString(seq)
    flipped <- reverseComplement(dna)
    toString(flipped)
}
#------------------------------------------------------------------------------------------------------------------------
