

## Can get processed Marioni data from:
## http://bioinf.wehi.edu.au/folders/tmm_rnaseq/TMM.html

## Read the pre-processed data from the working directory; everything
## below expects this to make the object 'MA.subsetA' available.
## NOTE(review): the download URL used further down in this file spells
## the file name "LK_data.RData" -- confirm the case of the local file
## name on case-sensitive file systems.
load("LK_data.Rdata")

## Peek at the table of counts and at the gene annotation
head(MA.subsetA$M)
head(MA.subsetA$genes)

## Table of counts, with each row named after the entry in the first
## annotation column
toc <- MA.subsetA$M
rownames(toc) <- MA.subsetA$genes[, 1]
head(toc)

## Group labels: the column headings with every "R<digit>L<digit>" tag
## removed
group <- gsub(pattern = "R[0-9]L[0-9]", replacement = "", x = colnames(toc))



############ Analysis of Marioni Data for DE in edgeR #############
library(edgeR)

## Bundle the counts, the group labels and the gene annotation into a
## DGEList object -- the starting point for an edgeR analysis
d <- DGEList(counts = toc, group = group, genes = MA.subsetA$genes)
d
names(d)


## (Optional) Keep only the rows whose counts sum to more than 9, i.e.
## drop rows with fewer than 10 counts in total.
## Such tags cannot reach a p-value for DE below 0.01, and dispersion
## estimation is better once they have been filtered out.
d <- d[rowSums(d$counts) > 9, ]
d


## TMM normalization, computed from the count matrix with the first
## column as the reference sample
norm.fact <- calcNormFactors(d$counts, refColumn = 1)

## Divide the factors by their geometric mean so that they multiply to 1
norm.fact.GM <- norm.fact / prod(norm.fact)^(1 / length(norm.fact))
norm.fact.GM

## "Effective" library sizes: library size times the rescaled factor,
## rounded to whole numbers; they replace the library sizes stored in d
lib.size.norm <- round(d$samples$lib.size * norm.fact.GM)
lib.size.norm
d$samples$lib.size <- lib.size.norm


## MA plot (or "smear" plot) of the first two samples, to explore what
## the normalization factors tell us.  Note that the 'normalize'
## argument of maPlot() just divides by the sum of the vector (i.e. the
## total number of reads).  The red line is drawn at log2 of the second
## sample's normalization factor.
maPlot(d$counts[, 1], d$counts[, 2], normalize = TRUE, ylim = c(-8, 8),
       pch = 19, cex = 0.4)
grid(col = "blue")
abline(h = log2(norm.fact[2]), lwd = 4, col = "red")

## MDS plot, to check whether any sample is an obvious outlier.
## No outliers in this dataset -- Dimension 1 separates the Kidney
## samples from the Liver samples.
## NOTE(review): plotMDS.dge() is the function name this script was
## written against; confirm it exists in the installed edgeR version.
plotMDS.dge(d, xlim = c(-1, 1))


## A DE analysis requires estimates of the expression levels and of the
## common dispersion of the NB model
d <- estimateCommonDisp(d)
names(d)


## The estimate of the common dispersion
d$common.dispersion

## There is very, very little "extra" variability between technical
## replicates in these data.
## The coefficient of variation for these data (i.e. the proportion by
## which true expression levels may vary between replicate samples) is
## the square root of the common dispersion.
sqrt(d$common.dispersion)


## Statistical testing for DE under the common dispersion NB model
## (here, effectively a Poisson model)
de.common <- exactTest(d)


## Ranked list of the most DE genes, with expression level, fold change,
## p-value and adjusted p-value.
## Extreme amounts of DE for these data.
topTags(de.common)


## The top list can also be sorted by fold change
topTags(de.common, sort.by = "logFC")


## Look at the actual (and the quantile-adjusted) counts of the genes
## identified as the most DE, with the columns arranged by group.
## DE tags are definitely DE!  Incredible consistency between libraries
## as well.
tt.com <- topTags(de.common)
tt.com
topids.com <- rownames(tt.com$table)
d$counts[topids.com, order(group)]
d$pseudo.alt[topids.com, order(group)]


## Show the results on a "smear" plot (tags that would otherwise fall
## off the scale appear in a 'smear' on the left of the plot), with the
## top 500 DE tags highlighted in red
top500tags <- rownames(topTags(de.common, n = 500)$table)
plotSmear(d, de.tags = top500tags, cex = 0.4,
          main = 'FC plot using common dispersion')



############################################################################################
################################     organize dataset    ###################################
############################################################################################

## Fetch the processed data straight from the WEHI server.  The code
## below expects the loaded file to provide the object 'MA.subsetA'.
f <- url("http://bioinf.wehi.edu.au/folders/tmm_rnaseq/LK_data.RData")
load(f)
## load() does not dispose of a connection it was handed, so close it
## explicitly; otherwise the connection stays allocated until R warns
## about "closing unused connection" at garbage collection.
close(f)

## Table of counts, with rows named after the first annotation column
toc <- MA.subsetA$M
rownames(toc) <- MA.subsetA$genes[, 1]

## Group labels: column headings with the "R<digit>L<digit>" tag removed
group <- gsub("R[0-9]L[0-9]", "", colnames(toc))


############################################################################################
################################      edgeR Analysis     ###################################
############################################################################################

library(edgeR)

## DGEList holding the counts, group labels and gene annotation
d <- DGEList(counts = toc, group = group, genes = MA.subsetA$genes)

## TMM factors (first column as reference), divided by their geometric
## mean so that they multiply to 1
norm.fact <- calcNormFactors(d$counts, refColumn = 1)
norm.fact.GM <- norm.fact / prod(norm.fact)^(1 / length(norm.fact))

## Replace the library sizes by the rounded "effective" library sizes
lib.size.norm <- round(d$samples$lib.size * norm.fact.GM)
d$samples$lib.size <- lib.size.norm

## Optional filter on the total count per row (disabled):
#d <- d[rowSums(d$counts)>9,]

## Common dispersion estimate followed by the exact test for DE
d <- estimateCommonDisp(d)
de.common <- exactTest(d)

## Result table for all rows of the count matrix, not just the top few
detags.table <- topTags(de.common, n = nrow(d$counts))$table



############################################################################################
################################      DESeq Analysis     ###################################
############################################################################################

library(DESeq)

## Count data set built from the count table and the group labels
cds <- newCountDataSet(toc, group)

## Size factors, then variance functions, are estimated in place on cds
cds <- estimateSizeFactors(cds)
cds <- estimateVarianceFunctions(cds)

## Negative binomial test comparing the "Kidney" and "Liver" conditions
res <- nbinomTest(cds, "Kidney", "Liver")

## Compare the library sizes stored in the edgeR object d with the
## DESeq size factors
plot(d$samples$lib.size, sizeFactors(cds))

############################################################################################
################################      baySeq Analysis     ##################################
############################################################################################

library(baySeq)

## Run on a 4-node socket cluster when the 'snow' package is available,
## otherwise fall back to cl = NULL.  requireNamespace() is used for the
## availability check instead of scanning installed.packages(), which is
## slow and not meant for testing whether a single package is installed.
if (requireNamespace("snow", quietly = TRUE)) {
    library(snow)
    cl <- makeCluster(4, "SOCK")
} else {
    cl <- NULL
}

## Numeric replicate codes derived from the group labels, and the two
## models to compare: NDE (all samples in one group) and DE (samples
## split by replicate code)
replicates <- as.numeric(factor(group))
groups <- list(NDE = rep(1, length(group)), DE = replicates)

## countData object; library sizes are the column totals of the counts
CD <- new("countData", data = as.matrix(toc), replicates = replicates,
          libsizes = colSums(toc), groups = groups)
CD

######## Poisson-Gamma approach ########
CDP.Poi <- getPriors.Pois(CD, samplesize = 20, takemean = TRUE, cl = cl)
CDP.Poi@priors
CDPost.Poi <- getLikelihoods.Pois(CDP.Poi, pET = "BIC", cl = cl)
## Get estimate of the proportion of tags that are DE and not DE
CDPost.Poi@estProps
CDPost.Poi@posteriors[1:10, ]
topCounts(CDPost.Poi, group = 2)

######## Negative-Binomial approach #########
## Using samplesize=1000 means that the code runs in a couple of minutes
## Using the suggested samplesize=10000 for better results requires > 30 mins
CDP.NBML <- getPriors.NB(CD, samplesize = 1000, estimation = "QL", cl = cl)
CDPost.NBML <- getLikelihoods.NB(CDP.NBML, pET = "BIC", cl = cl)
CDPost.NBML@estProps
CDPost.NBML@posteriors[1:10, ]
topCounts(CDPost.NBML, group = 2)

## The cluster is not used past this point: shut the workers down so
## they do not linger as background R processes.
if (!is.null(cl)) {
    stopCluster(cl)
}



############################################################################################
################################      DEGSeq analysis     ##################################
############################################################################################

library(DEGseq)

## One expression matrix per group (first and second level of the group
## factor), each with the gene identifiers prepended as column 1
geneExpMatrix1 <- toc[, group == levels(factor(group))[1]]
geneExpMatrix1 <- cbind(rownames(toc), geneExpMatrix1)
head(geneExpMatrix1)
geneExpMatrix2 <- toc[, group == levels(factor(group))[2]]
geneExpMatrix2 <- cbind(rownames(toc), geneExpMatrix2)
head(geneExpMatrix2)
write.table(geneExpMatrix1[30:31, ], row.names = FALSE)

## MARS method on columns 2-6 of both matrices; the output goes to the
## current working directory
layout(matrix(c(1, 2, 3, 4, 5, 6), 3, 2, byrow = TRUE))
par(mar = c(2, 2, 2, 2))
DEGexp(geneExpMatrix1 = geneExpMatrix1, geneCol1 = 1,
       expCol1 = c(2, 3, 4, 5, 6), groupLabel1 = "kidney",
       geneExpMatrix2 = geneExpMatrix2, geneCol2 = 1,
       expCol2 = c(2, 3, 4, 5, 6), groupLabel2 = "liver",
       method = "MARS", outputDir = getwd())

## MATR method: one kidney column against one liver column, with a pair
## of technical replicates supplying the noise level.  Both replicate
## columns are taken from the kidney matrix (geneExpMatrix1), as their
## labels state; drawing the second one from the liver matrix would
## compare kidney with liver and treat real DE as replicate noise.
layout(matrix(c(1, 2, 3, 4, 5, 6), 3, 2, byrow = TRUE))
par(mar = c(2, 2, 2, 2))
DEGexp(geneExpMatrix1 = geneExpMatrix1, expCol1 = 2,
       groupLabel1 = "kidneyR1L1",
       geneExpMatrix2 = geneExpMatrix2, expCol2 = 2,
       groupLabel2 = "liverR1L2",
       replicateExpMatrix1 = geneExpMatrix1, expColR1 = 3,
       replicateExpMatrix2 = geneExpMatrix1, expColR2 = 4,
       replicateLabel1 = "kidneyR1L3", replicateLabel2 = "kidneyR1L7",
       method = "MATR")


############################################################################################
##########                Comparing different methods for analysis             #############
############################################################################################


## baySeq (Poisson-Gamma) results for as many tags as there are rows in
## the edgeR count matrix, matched to the edgeR result table by row name
tc <- topCounts(CDPost.Poi, group=2, normalise=TRUE, num=nrow(d$counts))
m1 <- match( rownames(de.common$table), rownames(tc) )

## DEGseq results, read back from "output_score.txt" in the working
## directory (presumably the file written by the earlier DEGexp() call
## with outputDir=getwd() -- confirm), matched by gene name
degseq <- read.table("output_score.txt", header=TRUE)
m2 <- match( rownames(de.common$table), degseq$GeneNames )

## DESeq results, matched by id
m3 <- match( rownames(de.common$table), res$id )

## One column per method, rows in edgeR order.  For baySeq the value is
## 1 - Likelihood so that, as with the p-values, small means "more DE".
df <- data.frame(edgeR=de.common$table$p.value, baySeq=1-tc$Likelihood[m1],
                 DEGSeq=degseq$p.value[m2], DESeq=res$pval[m3])

pairs(df, lower.panel=NULL)


## Indices of the top 'nGenes' tags of each method.  head() is used
## rather than [1:nGenes] so that a method reporting fewer than nGenes
## rows contributes only the rows it has, instead of NA padding that
## would end up as NA gene names in the Venn diagram.
nGenes <- 1000
o1 <- head(order(de.common$table$p.value), nGenes)
o2 <- head(order(tc$Likelihood, decreasing=TRUE), nGenes)
o3 <- head(order(degseq$p.value), nGenes)
o4 <- head(order(res$pval), nGenes)

## Gene identifiers of the top tags, one list element per method
input <- list(edgeR=rownames(de.common$table)[o1], baySeq=rownames(tc)[o2],
              DEGSeq=degseq$GeneNames[o3], DESeq=res$id[o4])

## Venn diagram of the overlap between the four top lists, drawn on a
## single-panel layout
library(gplots)
par(mfrow=c(1,1))
venn(input)


