### =========================================================================
### Functions to read and extract features from a GFF file
### ------------------------------------------------------


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Low-level character vector manipulation.
###

### 'taggedvals' should be a character vector with "tag=value" elements.
### Elements with no "=" are considered to be values with an NA tag.
### Examples:   input     tag   value
###             "a=3"     "a"   "3"
###             "b="      "b"   ""
###             "=5"      ""    "5"
###             "="       ""    ""
###             "xy"      NA    "xy"
###             ""        NA    ""
###             "c=w=4"   "c"   "w=4"
###             NA        NA    NA
taggedValsToNamedVals <- function(taggedvals)
{
    ONE <- as.integer(1)
    TWO <- as.integer(2)
    equalPos <- regexpr("=", taggedvals, fixed=TRUE)
    valStartsAt <- equalPos + ONE
    tags <- substr(taggedvals, ONE, valStartsAt - TWO)
    tagIsNA <- equalPos == -ONE
    tags[tagIsNA] <- as.character(NA)
    ans <- substr(taggedvals, valStartsAt, nchar(taggedvals))
    names(ans) <- tags
    ans
}

### 'namedvals' must be a named character vector, like one returned by the
### 'taggedValsToNamedVals' function above.
### Key property: namedValsToTaggedVals(taggedValsToNamedVals(x)) is equal
### to x for _any_ character vector x (modulo the names).
### IOW, the taggedValsToNamedVals/namedValsToTaggedVals conversions
### don't loose information.
namedValsToTaggedVals <- function(namedvals)
{
    tags <- names(namedvals)
    tagIsNotNA <- !is.na(tags)
    namedvals[tagIsNotNA] <- paste(tags[tagIsNotNA], namedvals[tagIsNotNA], sep="=")
    namedvals
}

namedValsToString <- function(namedvals, sep=";")
{
    paste(namedValsToTaggedVals(namedvals), collapse=sep)
}

### A safe replacement for 'namedvals[tag]' that works properly when
### tag is "" or NA. In addition, allows checking for duplicated tags.
### Like 'namedvals[tag]', returns NA if 'tag' is not found.
### WARNING: Unlike 'namedvals[tag]', 'safeGetTagVal' is (deliberately)
### _not_ vectorized: 'tag' must be a _single_ string (or single NA).
safeGetTagVal <- function(namedvals, tag, errorOnDupTag=TRUE)
{
    tagvals <- namedvals[names(namedvals) %in% tag]
    ## sum(!duplicated(x)) is a fast equivalent to length(unique(x))
    if (length(tagvals) >= 2 && sum(!duplicated(tagvals)) >= 2 && errorOnDupTag) {
        taggedvals_in1string <- namedValsToString(namedvals)
        if (is.na(tag))
            stop("several values with no tag in \"", taggedvals_in1string, "\"")
        else
            stop("duplicated \"", tag, "\" tag in \"", taggedvals_in1string, "\"")
    }
    tagvals[1] # will return NA if length(tagvals) is 0
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Some generic manipulation of the gff "attrib" field.
###

### Converts the gff "attrib" field to a list of character vectors.
### The names/values for each returned vector are the tags/values found in the
### corresponding "attrib" field.
explodeGFFattrib <- function(gff_attrib)
{
    lapply(strsplit(gff_attrib, ";", fixed=TRUE), taggedValsToNamedVals)
}

implodeGFFattrib <- function(namedvals_list)
{
    sapply(namedvals_list, function(x) paste(namedValsToTaggedVals(x), collapse=";"))
}

testImplodeExplodeReciprocity <- function(gff_attrib)
{
    attrib <- implodeGFFattrib(explodeGFFattrib(gff_attrib))
    corrupted <- (attrib != gff_attrib) & (paste(attrib, ";", sep="") != gff_attrib)
    sum(corrupted)
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Reading a GFF file.
###

gff.read <- function(file, nrows=-1, verbose=FALSE)
{
    GFFfields <- c("seqname", "source", "feature", "start", "end",
                   "score", "strand", "frame", "attrib")
    ## Note: my first attempt to load tmp_dir/2L-clean.gff" without
    ## specifying 'quote=""' lead to the following R crash:
    ##   gff2L[337689,] produces Error: segfault from C stack overflow
    ##   gff2L[337680:337689,] CRASH!!!
    gff <- read.table(file, sep="\t", quote="", col.names=GFFfields,
                      na.strings=c("."), nrows=nrows, stringsAsFactors=FALSE)
    if (verbose)
        cat("Read", nrow(gff), "record(s)\n")
    gff
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Helper functions used by the "gff.extract.*" functions.
### They should make it easy to write other functions of the same
### family (feature extraction).

moveGFFtags2cols <- function(gff, tags)
{
    if (!is.data.frame(gff))
        stop("'gff' must be a data frame")
    if (!is.character(gff$attrib))
        stop("'gff' must have an \"attrib\" col of class \"character\"")
    if (!is.character(tags))
        stop("'tags' must be a character vector")
    if (any(is.na(tags)))
        stop("'tags' must not contain NAs")
    if (any(duplicated(tags)))
        stop("'tags' must not contain duplicates")
    if (any(tags %in% names(gff)))
        stop("'tags' must be different from any 'gff' col name")
    namedvals_list <- explodeGFFattrib(gff$attrib)
    for (tag in tags) {
        tag_vals <- sapply(namedvals_list, function(x) safeGetTagVal(x, tag))
        gff[[tag]] <- as.character(tag_vals)
        if (all(is.na(gff[[tag]])))
            warning("\"", tag, "\" is NA for all records")
    }
    namedvals_list <- lapply(namedvals_list, function(x) x[!(names(x) %in% tags)])
    gff$attrib <- as.character(implodeGFFattrib(namedvals_list))
    gff
}

### Names of 'col_list' must be the names of the cols to drop.
### The values associated with each col name are the expected
### vals in each col to drop
dropGFFcols <- function(gff, col_list)
{
    if (!is.data.frame(gff))
        stop("'gff' must be a data frame")
    if (!is.list(col_list))
        stop("'col_list' must be a list")
    col.names <- names(col_list)
    if (is.null(col.names))
        stop("'col_list' must have the names of the cols to drop")
    if (any(is.na(col.names)))
        stop("'col.names' must not contain NAs")
    if (any(duplicated(col.names)))
        stop("'col.names' must not contain duplicates")
    for (col.name in col.names) {
        if (sum(names(gff) %in% col.name) != 1)
            stop("'gff' must have exactly one col named \"", col.name, "\"")
        val <- col_list[[col.name]]
        if (!is.null(val) && !all(gff[[col.name]] %in% val))
            stop("col \"", col.name, "\" contains unexpected values")
    }
    gff[ , !(names(gff) %in% col.names)]
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### The "gff.extract.gene" function.
###
### Extract the "gene" feature from a GFF data frame.
###
### Returns a data frame where:
###   - col names are "seqname", "source", "start", "end",
###     "strand", "attrib", "Name", "Alias", "cyto_range",
###     "putative_ortholog_of", "Dbxref",
###   - row names are the gene IDs.
###
### Note: We can't move the "gbunit" tag out of the "attrib" field because
### 1 gene in chrom 3L has 2 "gbunit" tags with different values.

gff.extract.gene <- function(gff)
{
    gff <- gff[gff$feature == "gene", ]
    DROPPED_COLS <- list(feature=NULL, score=NA, frame=NA)
    gff <- dropGFFcols(gff, DROPPED_COLS)
    ADDED_COLS <- c("ID", "Name", "Alias", "cyto_range",
                    "putative_ortholog_of", "Dbxref")
    gff <- moveGFFtags2cols(gff, ADDED_COLS)
    row.names(gff) <- gff$ID
    if (any(row.names(gff) == ""))
        stop("one of the gene IDs is the empty string")
    dropGFFcols(gff, list(ID=NULL))
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### The "gff.extract.exonParentFeatures" function.
###
### Extract the parent features of the "exon" feature from a GFF data frame.
### Known parent features are: mRNA, snRNA, tRNA, ncRNA, snoRNA, miRNA, rRNA
### and pseudogene.
###
### Returns a data frame where:
###   - col names are "seqname", "source", "feature", "start", "end",
###     "strand", "Name", "Parent", "Alias", "Dbxref",
###   - row names are the feature IDs.

gff.extract.exonParentFeatures <- function(gff)
{
    EXON_PARENT_FEATURES <- c("mRNA", "snRNA", "tRNA", "ncRNA", "snoRNA",
                              "miRNA", "rRNA", "pseudogene")
    gff <- gff[gff$feature %in% EXON_PARENT_FEATURES, ]
    DROPPED_COLS <- list(score=NA, frame=NA)
    gff <- dropGFFcols(gff, DROPPED_COLS)
    ADDED_COLS <- c("ID", "Name", "Parent", "Alias", "Dbxref")
    gff <- moveGFFtags2cols(gff, ADDED_COLS)
    row.names(gff) <- gff$ID
    if (any(row.names(gff) == ""))
        stop("one of the feature IDs is the empty string")
    dropGFFcols(gff, list(ID=NULL, attrib=""))
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### The "gff.extract.exon" function.
###
### Extract the "exon" feature from a GFF data frame.
### Note that in FlyBase GFF r5.1, all "exon" records have exactly 4 tags
### ("ID", "Name", "Parent" and "Dbxref"), except in chrom M where
### the "Dbxref" tag is missing.
###
### Returns a data frame where:
###   - col names are "seqname", "source", "start", "end", "strand", "Name",
###     "Parent", "Dbxref",
###   - row names are the exon IDs.

gff.extract.exon <- function(gff)
{
    gff <- gff[gff$feature == "exon", ]
    DROPPED_COLS <- list(feature=NULL, score=NA, frame=NA)
    gff <- dropGFFcols(gff, DROPPED_COLS)
    ADDED_COLS <- c("ID", "Name", "Parent", "Dbxref")
    gff <- moveGFFtags2cols(gff, ADDED_COLS)
    row.names(gff) <- gff$ID
    if (any(row.names(gff) == ""))
        stop("one of the exon IDs is the empty string")
    dropGFFcols(gff, list(ID=NULL, attrib=""))
}

