##############################################################################
# NOT USED

# Sanity-check a table of exons.
#
# 'exons' must provide (at least) the following columns:
#   - ID:     character vector of exon IDs of the form "<geneID>:<n>"
#   - strand: the strand of each exon
#
# Stops with an error if:
#   - an exon ID is duplicated,
#   - an exon ID is not made of exactly 2 colon-separated parts,
#   - the right part of an exon ID is not the canonical text representation
#     of an int >= 1,
#   - 2 exons in the same gene have the same right number.
# Emits a warning for each gene whose exons are not all on one single strand
# ("+" or "-"). An empty table is considered valid.
# Returns NULL invisibly.
checkExons <- function(exons)
{
    quoted <- function(x) paste0("\"", x, "\"")

    # Stop if 'x' contains duplicates, reporting the offending values and
    # their positions. 'tag' is the name used for the values in the message.
    checkDuplicated <- function(x, tag)
    {
        dup <- duplicated(x)
        if (any(dup))
            stop("duplicated ", tag, "(s) ", toString(quoted(x[dup])),
                 " at line(s) ", toString(which(dup)))
    }

    # Exon IDs have 2 parts separated by a colon (":"):
    #   - The left part is the gene ID it belongs to.
    #   - The right part is a number (int) >= 1. This number is
    #     unique among all exons in the same gene. It doesn't
    #     necessarily reflect the "real" order of the exons.
    ID <- exons$ID
    strand <- exons$strand

    checkDuplicated(ID, "ID")
    splittedID <- strsplit(ID, ":", fixed=TRUE)
    # vapply() is used instead of sapply() so that the result type does not
    # degrade to a list when there are no exons at all.
    isbad <- vapply(splittedID, length, integer(1), USE.NAMES=FALSE) != 2L
    if (any(isbad))
        stop("bad exon ID(s) ", toString(quoted(ID[isbad])),
             " at line(s) ", toString(which(isbad)))

    rightID <- vapply(splittedID, function(x) x[2], character(1),
                      USE.NAMES=FALSE)
    rightIDasInt <- as.integer(rightID)
    # The round trip through as.integer() rejects anything that is not the
    # canonical representation of an integer ("01", "1.0", "abc", ...).
    if (!identical(as.character(rightIDasInt), rightID)
        || !all(rightIDasInt >= 1))
        stop("bad exon ID(s): right part not always an int >= 1")

    leftID <- vapply(splittedID, function(x) x[1], character(1),
                     USE.NAMES=FALSE)
    for (gid in unique(leftID)) {
        ingene <- leftID == gid
        if (any(duplicated(rightIDasInt[ingene])))
            stop("bad exon ID(s): duplicated right numbers for exon in gene \"", gid, "\"")
        gstrand <- unique(strand[ingene])
        # '||' short-circuits, so '%in%' is only evaluated on a single value.
        if (length(gstrand) != 1 || !(gstrand %in% c("+", "-")))
            warning("exon in gene \"", gid, "\" not all in the same strand (+ or -)")
    }
    invisible(NULL)
}


##############################################################################
# exonID=>geneID mapping for a given chromosome

# Not vectorized!
# Legacy name-based heuristic to find the gene an exon belongs to.
#
# Arguments:
#   exonID:        a single exon ID (a row name of 'exons') containing at
#                  least one ":".
#   exons:         table of exons with columns "Name" and "Alias"
#                  ("Alias" is a comma-separated string or NA).
#   genes:         table of genes with gene IDs as row names and a "Name"
#                  column.
#   gene_ID2Alias: list with one element per row of 'genes' (same order),
#                  each element being the character vector of aliases of
#                  the gene.
#
# Gene candidates are obtained by stripping the part that follows the last
# ":" from the exon ID, the exon name and each of the exon aliases. The
# candidates are first looked up in the gene IDs and gene names, and only if
# nothing is found, in the gene aliases.
#
# Returns the ID of the matching gene, or NA if no gene matches.
# Stops if 'exonID' contains no ":" or if more than 1 gene matches.
.exon2gene.old <- function(exonID, exons, genes, gene_ID2Alias)
{
    # Not vectorized! ('tagval' must be a single string)
    # Returns what precedes the last ":" in 'tagval', or NA_character_ if
    # 'tagval' contains no ":".
    geneCandidate <- function(tagval)
    {
        colonPos <- gregexpr(":", tagval, fixed=TRUE)[[1L]]
        lastColon <- colonPos[length(colonPos)]
        # gregexpr() reports "no match" as a single -1
        if (lastColon == -1L)
            return(NA_character_)
        substr(tagval, 1L, lastColon - 1L)
    }
    # Not vectorized! ('tagvals' must be a single string)
    # Returns the candidates found in a comma-separated list of values.
    # Always a character vector, possibly of length 0.
    geneCandidates <- function(tagvals)
    {
        vals <- strsplit(tagvals, ",", fixed=TRUE)[[1L]]
        parts <- vapply(vals, geneCandidate, character(1), USE.NAMES=FALSE)
        parts[!is.na(parts)]
    }

    exon <- exons[exonID, ]
    candidates <- geneCandidate(exonID)
    if (is.na(candidates))
        stop("no colon in exon ID \"", exonID, "\"")
    if (!is.na(exon$Name)) {
        fromName <- geneCandidate(exon$Name) # can be NA
        if (!is.na(fromName))
            candidates <- c(candidates, fromName)
    }
    if (!is.na(exon$Alias)) {
        # can't contain NAs but length can vary (>= 0)
        candidates <- c(candidates, geneCandidates(exon$Alias))
    }
    candidates <- unique(candidates)

    allGeneIDs <- row.names(genes)
    geneID <- allGeneIDs[allGeneIDs %in% candidates | genes$Name %in% candidates]
    if (length(geneID) == 0L) {
        # Since 2 distinct genes can share a common alias, this is only used
        # as a last chance. Hopefully for those genes with a shared alias the
        # gene part of the exon ID and Names (candidates) will be found in
        # row.names(genes) or genes$Name so we won't get here (and avoid a
        # potential ambiguity).
        # vapply() keeps the mask logical even when there are no genes.
        hasAlias <- vapply(gene_ID2Alias,
                           function(aliases) any(aliases %in% candidates),
                           logical(1), USE.NAMES=FALSE)
        geneID <- allGeneIDs[hasAlias]
    }
    if (length(geneID) > 1L)
        stop("more than 1 gene for exon \"", exonID, "\" (ambiguity)")
    geneID[1L] # NA if length(geneID) is 0
}

# Not vectorized!
# Find the gene an exon belongs to by following the "Parent" links:
# exon -> parent feature(s) -> gene (the "grand-parent").
#
# Arguments:
#   exon.ID:            a single exon ID (a row name of 'exons').
#   exons:              table of exons with a "Parent" column holding the
#                       comma-separated IDs of the parents of each exon.
#   exonParentFeatures: table with the parent IDs as row names and a
#                       "Parent" column holding the ID of their own parent.
#   genes:              table of genes with gene IDs as row names.
#
# Returns the ID of the unique grand-parent of the exon.
# Stops if the parents of the exon do not all lead to one single
# grand-parent or if this grand-parent is not a row name of 'genes'.
.exon2gene <- function(exon.ID, exons, exonParentFeatures, genes)
{
    parent.IDs <- strsplit(exons[exon.ID, "Parent"], ",", fixed=TRUE)[[1]]
    # NAs are not removed from 'gene.ID', so a missing grand-parent is
    # caught by one of the 2 checks below.
    gene.ID <- unique(exonParentFeatures[parent.IDs, "Parent"])
    if (length(gene.ID) != 1)
        stop("nb of grand-parents found for exon \"", exon.ID, "\" is not 1")
    if (!(gene.ID %in% row.names(genes)))
        stop("grand-parent ID \"", gene.ID, "\" ",
             "found for exon \"", exon.ID, "\" is not a gene ID")
    gene.ID
}


##############################################################################
# 'makeAnnotations'

# Some quality control
# --------------------
#
# Expect 4 warnings:
#   1: Because "Alias" is NA for all exons in chrM.
#   2, 3 and 4: Because "putative_ortholog_of", "Ontology_term" and "Dbxref"
#   are NAs for all genes in chrM.
#
# The IDs of the 64489 exons are unique. The names are not.
# 51 exons are sharing the 9 following names:
#   EXON_DUPNAMES <- c( # Nb. of exons  Chr
#       "5SrRNA-&PSgr",  #            4  2R
#       "&agr",          #            3  2L
#       "&bgr",          #           14  2R/3R
#       "His-&PSgr",     #            8  2L
#       "Hsr&ohgr",      #            4  3R
#       "mRpS35:1",      #            2  2R/3L
#       "mRpS35:2",      #            2  2R/3L
#       "Pi4KII&agr",    #           12  3R
#       "Pros&bgr"       #            2  2R
#   )
#
# The IDs and the names of the 14486 genes are unique.
#
# The gene aliases are not unique!
# Unfortunately, with the GFF data from Flybase, 2 "different" genes (i.e.
# 2 genes with distinct IDs) can share a common alias. For example
# CG13984 is an alias for genes FBgn0053531 and FBgn0031796 in chr 2L.

# Build the annotation objects from the cleaned GFF files.
#
# For each chromosome in CHR_SHORTNAMES, the file <srcdir>/<chr>-clean.gff
# is loaded and the gene, exon-parent-feature and exon tables are extracted
# from it. Each table is assigned in the global environment and saved to
# <destdir>/<name>.rda, the names being taken from GENE_TBLNAMES,
# EXONPARENTFEATURES_TBLNAMES and EXON_TBLNAMES.
# 2 global maps are accumulated over the chromosomes and saved the same way:
#   - EXON2GENE:     exon ID => gene ID
#   - GENE_ID2NAMES: gene ID => gene name followed by the gene aliases
# Stops if exon IDs or gene IDs are duplicated across chromosomes.
makeAnnotations <- function(srcdir=".", destdir=".")
{
    # Assign 'obj' to 'objname' in the global environment, then save it
    # to <destdir>/<objname>.rda
    saveObj <- function(obj, objname)
    {
        assign(objname, obj, envir=.GlobalEnv)
        rda_file <- paste0(destdir, "/", objname, ".rda")
        cat("Saving object '", objname, "' to file ", rda_file, "...\n", sep="")
        save(list=objname, file=rda_file, envir=.GlobalEnv)
    }

    cat("Extracting the \"gene\" and \"exon\" features plus all the parent features\n",
        "of the \"exon\" feature (currently \"mRNA\", \"snRNA\", \"tRNA\", \"ncRNA\",\n",
        "\"snoRNA\", \"miRNA\", \"rRNA\" and \"pseudogene\") from the GFF files\n",
        "(please be patient):\n\n",
        sep="")

    EXON2GENE <- character(0) # exonID=>geneID global map
    GENE_ID2NAMES <- character(0) # geneID=>geneName+Alias global map
    for (chr in CHR_SHORTNAMES) {
        gff_file <- paste0(srcdir, "/", chr, "-clean.gff")
        cat("Loading data from GFF file ", gff_file, "... ", sep="")
        gff <- gff.read(gff_file, verbose=TRUE)

        # Extract and save the 3 tables (in this order)
        genes <- gff.extract.gene(gff)
        saveObj(genes, GENE_TBLNAMES[chr])

        exonParentFeatures <- gff.extract.exonParentFeatures(gff)
        saveObj(exonParentFeatures, EXONPARENTFEATURES_TBLNAMES[chr])

        exons <- gff.extract.exon(gff)
        saveObj(exons, EXON_TBLNAMES[chr])

        # exon ID => gene ID for the current chromosome
        cat("Mapping exon IDs to gene IDs for chromosome ", chr, "... ", sep="")
        chrExon2Gene <- sapply(
            row.names(exons),
            function(exonID) .exon2gene(exonID, exons, exonParentFeatures, genes)
        )
        chrGeneIDs <- row.names(genes)
        cat(length(chrGeneIDs) - length(unique(chrExon2Gene)),
            " unmapped gene(s)\n", sep="")
        EXON2GENE <- c(EXON2GENE, chrExon2Gene)

        # gene ID => gene name + aliases for the current chromosome
        # (the name comes first, duplicates are dropped)
        gene_ID2Alias <- strsplit(genes$Alias, ",", fixed=TRUE)
        names(gene_ID2Alias) <- chrGeneIDs
        gene_ID2Names <- list()
        for (geneID in chrGeneIDs) {
            nameAndAliases <- c(genes[geneID, "Name"], gene_ID2Alias[[geneID]])
            gene_ID2Names[[geneID]] <- unique(nameAndAliases)
        }
        GENE_ID2NAMES <- c(GENE_ID2NAMES, gene_ID2Names)
    }

    # IDs must be unique across all the chromosomes
    if (anyDuplicated(names(EXON2GENE)) > 0L)
        stop("duplicated exon IDs")
    saveObj(EXON2GENE, "EXON2GENE")

    if (anyDuplicated(names(GENE_ID2NAMES)) > 0L)
        stop("duplicated gene IDs")
    saveObj(GENE_ID2NAMES, "GENE_ID2NAMES")

    nb_ugenes <- length(GENE_ID2NAMES) - length(unique(EXON2GENE))
    cat(nb_ugenes, " unmapped gene(s) in all chromosomes\n", sep="")
}

