---
title: "R Notebook"
output: html_notebook
---

```{r}
options( max.print=300 )
```

Differential expression
=======================

Our example data
----------------

Himes et al (PLOS One, 2014, PMID: 24926665):
  - effect of dexamethasone treatment on airway epithelium
  - 4 cell lines, derived from 4 donors
  - 2 sample each: dexamethasone and control

```{r}
library( airway )
data( airway )
airway
```

8 samples:

```{r}
colData(airway)
```

Fix sample names

```{r}
colnames(dds) <- c( "D1u", "D1t", "D2u", "D2t", "D3u", "D3t", "D4u", "D4t" )

# better:
colnames(dds) <- paste0( "D", as.integer(dds$cell), substr( dds$dex, 1, 1 ) )

colData(dds)
```

Count table:

```{r}
head( assay(airway) )
```

Convert the gene IDs to names:

```{r}
a <- AnnotationDbi::intraIDMapper( rownames(airway), species="HOMSA", 
        srcIDType="ENSEMBL", destIDType = "SYMBOL" )  

# Do not simply do this:
# rownames(airway) <- a 

rownames(airway)[ match( names(a), rownames(airway) ) ] <- unlist(a)

head( assay( airway ) )
```


A quick analysis with DESeq2
----------------------------

```{r}
library( DESeq2 )

dds <- DESeqDataSet( airway, ~ cell + dex )
dds <- DESeq( dds )
res <- results(dds)

res[ order(res$padj), ]
```

Plot the top hit

```{r}
library( ggplot2 )

ggplot( cbind( 
     as.data.frame( colData(dds) ), 
     expr_SPARCL1 = counts( dds, normalized=TRUE )["SPARCL1",] ) ) +
  geom_point( aes( 
     x = cell, y = expr_SPARCL1, col = dex ) ) +
  scale_y_log10()
```

Get an overview

```{r}
plotMA( dds, ylim = c( -5, 5 ) )
```


A more detailed look
--------------------

Open questions (that you may have)
- What determines the black/red decision boundary?
- What are "normalized counts"?
- Why the "trumpet shape"?


A naive try
-----------

Take only the first cell line, "N61311"

```{r fig.asp=1}
plot( counts(dds)[,1], counts(dds)[,2], asp=1 )
```

Log scale

```{r fig.asp=1}
plot( counts(dds)[,1] + 1, counts(dds)[,2] + 1, log="xy" ) #, pch="." )
```

Let's calculate the average and the ratio:

```{r}
avg_N61311 <- ( counts(dds)[,1] + counts(dds)[,2] )/2
ratio_N61311 <- counts(dds)[,2] / counts(dds)[,1]

plot( avg_N61311, ratio_N61311 )
```

```{r}
avg_N61311 <- ( counts(dds)[,1] + counts(dds)[,2] )/2
ratio_N61311 <- ( counts(dds)[,2] + 1 ) / ( counts(dds)[,1] + 1 )

plot( avg_N61311, ratio_N61311, log="xy", pch="." )
```

How well does this agree with the second cell line, N052611?

```{r fig.asp=1}
ratio_N052611 <- ( counts(dds)[,4] + 1 ) / ( counts(dds)[,3] + 1 )

plot( ratio_N61311, ratio_N052611, log="xy", pch="." )
```

Some genes agree well, others are in complete contradiction.

How to enforce consistency? And what exactly do we actually want?

But before, some more topics.


Normalization
-------------

Back to our MA plot, here for the second cell line (N052611):

```{r}
avg_N052611 <- ( counts(dds)[,3] + counts(dds)[,4] )/2
ratio_N052611 <- counts(dds)[,4] / counts(dds)[,3] 

plot( avg_N052611, ratio_N052611, log="xy", pch=".", ylim=c(.1, 10) )
abline( h=1, col=adjustcolor("green",.4), lwd=3 )
```

The line is too high

This may be because the untreated sample has more counts in total

```{r}
colSums( counts(dds) )/ 1e6
```

Should we simply move the green line to the ratio of the two sums?

```{r}
plot( avg_N052611, ratio_N052611, log="xy", pch=".", ylim=c(.1, 10) )
abline( h = 1, col=adjustcolor("green",.4), lwd=3 )
```

Should we simply move the green line to the ratio of the two sums?

```{r}
plot( avg_N052611, ratio_N052611, log="xy", pch=".", ylim=c(.1, 10) )
abline( h = 15.16342 / 25.34865, col=adjustcolor("green",.4), lwd=3 )
```

That works, but why not move it into the center of the point cloud?

```{r}
hist( log2( ratio_N052611 ), breaks=50 )
abline( v = log2( 15.16342 / 25.34865 ), col=adjustcolor("green",.4), lwd=3 )

median_ratio_N052611 <- median( ratio_N052611, na.rm=TRUE )
abline( v = log2( median_ratio_N052611 ), col=adjustcolor("orange",.4), lwd=3 )
```

The median is a tad better. Let's multiply

```{r}
plot( avg_N052611, ratio_N052611 / median_ratio_N052611, log="xy", pch=".", ylim=c(.1, 10) )
abline( h = 1, col=adjustcolor("orange",.4), lwd=3 )
```

We could calculate the medians of ratios of all pairs of samples

```{r}
sapply( 1:8, function(i)
   sapply( 1:8, function(j)
     median( counts(dds)[,j] / counts(dds)[,i], na.rm=TRUE ) ) )
```

That's too many numbers.

Let's make one averaged "reference" sample, and normalize everything against that one.

Virtual reference sample: geometric average of all counts

```{r}
vref_counts <- apply( counts(dds), 1, function(x) prod(x) ^ (1/length(x)) )

head( vref_counts, 20 )
```

Normalize each against that one

```{r}
sapply( 1:8, function(i) 
  median( counts(dds)[,i] / vref_counts, na.rm=TRUE ) )
```

DESeq does this automatically:

```{r}
sizeFactors(dds)
```

It's not the same. Why? Because, above, the `na.rm=TRUE` does not exclude zeroes. This is better:

```{r}
sapply( 1:8, function(i) 
  median( ( counts(dds)[,i] / vref_counts) [ vref_counts>0 ] ) ) 
```

Permutation test
----------------

Let's do a very simple, but actually statistically valid analysis.

First, calculate the log fold changes.

```{r}
#log norm counts
lnc <- log2( counts( dds, normalized=TRUE ) + 1 )
lmeans <- rowMeans( lnc )
lfc <- rowMeans( 
   lnc[ , c( "D1t", "D2t", "D3t", "D4t" ) ] - 
   lnc[ , c( "D1u", "D2u", "D3u", "D4u" ) ] )

plot( lmeans, lfc, cex=.1, ylim=c( -4, 4 ) )
```

Pick two cell lines at random: For these, swap control and treatment samples, then recalculate
log fold changes

```{r}
lfc_perm <- rowMeans( 
   lnc[ , c( "D1t", "D2u", "D3u", "D4t" ) ] - 
   lnc[ , c( "D1u", "D2t", "D3t", "D4u" ) ] )


plot( lmeans, lfc_perm, log="x", cex=.1, ylim=c( -4, 4 ) )

plot( lmeans, lfc, log="x", cex=.1, ylim=c( -4, 4 ) )
points( lmeans, lfc_perm, cex=.1, col="purple" )
```


Poisson noise
-------------

Imagine a bag with 10,000 balls, 10% of which are red

```{r fig.width=12}
a <- cbind( 
    expand.grid( x=1:125, y=1:80 ), 
    col=sample( c( rep( "lightyellow", 9000 ), rep( "indianred1", 1000 ) ) ) )

ggplot(a) + 
  geom_point( aes( x=x, y=y ), col=a$col, size=.5) + 
  theme_dark() + coord_fixed()
```

Each of you is allowed to look at 20 balls, then estimate the percentage of red balls.


```{r fig.height=12}
ggplot(a) + 
  geom_point( aes( x=x, y=y ), col=a$col, size=5) + theme_dark() + 
  coord_fixed( xlim=c(0,19.5), ylim=c(0,19.5))
```

```{r}
barplot( table( rowSums( matrix( a$col, ncol=20 ) == "indianred1" ) ) )
```

```{r}
expected_number_of_red_balls <- 0.1 * 20

plot( 
  0:10, 
  dpois( 0:10, expected_number_of_red_balls ), 
  type = "h", lwd=4,
  xlab = "number k of red balls",
  ylab = "probability",
  main = "Poisson distribution with expectation 2" )
```


```{r}
expected_number_of_red_balls <- 0.1 * 1000

plot( 
  0:200, 
  dpois( 0:200, expected_number_of_red_balls ), 
  type = "h", lwd=1,
  xlab = "number k of red balls",
  ylab = "probability",
  main = "Poisson distribution with expectation 100" )
```

What *fraction* of red balls do we estimate in the first case (expected number: 2 of 20),
and in the second case (100 of 1000)?

```{r fig.height=4,fig.width=10}
expected_number_of_red_balls <- 0.1 * 20

par( mfrow=c(1,2) )

plot( 
  0:20 / 20, 
  dpois( 0:20, 2 ), 
  type = "h", lwd=5, lend=2,
  xlab = "estimated fraction of red balls",
  ylab = "probability",
  xlim = c( 0, 1 ),
  main = "expected: 2 of 20 (10%)" )

plot( 
  0:1000 / 1000, 
  dpois( 0:1000, 100 ), 
  type = "h", lwd=1, lend=2,
  xlab = "estimated fraction of red balls",
  ylab = "probability",
  xlim = c( 0, 1 ),
  main = "expected: 100 of 1000 (10%)" )
```


The negative binomial distribution
----------------------------------

Assume a gene is expressed with 5 reads in your control sample and 8 reads in your treated samples

```{r}
library(magrittr)
library(ggbeeswarm)

conds <- rep( c( "C", "T" ), each=4 )
data.frame( 
  cond = conds, 
  nreads = rpois( 8, c( C=5, T=8 )[ conds ] ) ) %>%
ggplot +
  geom_beeswarm( aes( x=cond, y=nreads ) )
```

```{r}
table( replicate( 1000, mean( rpois( 4, 5 ) ) - mean( rpois( 4, 8 ) ) ) > 0 )
```

Now, C: 1000, T: 1010

```{r}
library(magrittr)
library(ggbeeswarm)

conds <- rep( c( "C", "T" ), each=4 )
data.frame( 
  cond = conds, 
  nreads = rpois( 8, c( C=1000, T=1100 )[ conds ] ) ) %>%
ggplot +
  geom_beeswarm( aes( x=cond, y=nreads ) )
```

```{r}
table( replicate( 10000, mean( rpois( 4, 1000 ) ) - mean( rpois( 4, 1100 ) ) ) > 0 )
```

More realistic: 

```{r}
ctrl <- rnorm( 4, mean=1000, sd=200 )
trt  <- rnorm( 4, mean=1100, sd=200 )
rpois( 4, lambda=ctrl )
rpois( 4, lambda=trt )


table(
  replicate( 10000, 
    mean( rpois( 4, lambda = rnorm( 4, mean=1000, sd=200 ) ) ) -
    mean( rpois( 4, lambda = rnorm( 4, mean=1100, sd=200 ) ) ) ) > 0 )
```

Getting the SD right is crucial.