Basic robustness concepts of statistics

Vince Carey

June 14, 2017

Road map:

Some code to enter in your workspace

Mystery functions?

ts_zpm = function(d) sqrt(length(d))*mean(d)/sd(d)
contam1 = function(x, slip=5) {x[1] = x[1]+slip; x}

Try them out

set.seed(12345)
X = rnorm(50)
ts_zpm(X)
## [1] 1.157889
ts_zpm(contam1(X))
## [1] 1.479476
ts_zpm(contam1(X,100))
## [1] 1.082075

The concept of the critical region

Assumptions underlying a test procedure

Computing and reporting a t test for a simple hypothesis

set.seed(12345)
X = rnorm(50)
tst = t.test(X)
tst
## 
##  One Sample t-test
## 
## data:  X
## t = 1.1579, df = 49, p-value = 0.2525
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -0.1320800  0.4912126
## sample estimates:
## mean of x 
## 0.1795663

Components of the result

str(tst)
## List of 9
##  $ statistic  : Named num 1.16
##   ..- attr(*, "names")= chr "t"
##  $ parameter  : Named num 49
##   ..- attr(*, "names")= chr "df"
##  $ p.value    : num 0.253
##  $ conf.int   : atomic [1:2] -0.132 0.491
##   ..- attr(*, "conf.level")= num 0.95
##  $ estimate   : Named num 0.18
##   ..- attr(*, "names")= chr "mean of x"
##  $ null.value : Named num 0
##   ..- attr(*, "names")= chr "mean"
##  $ alternative: chr "two.sided"
##  $ method     : chr "One Sample t-test"
##  $ data.name  : chr "X"
##  - attr(*, "class")= chr "htest"

The structure of the t statistic: a ratio of measures of location and dispersion

tst$statistic  # from R
##        t 
## 1.157889
sqrt(50)*mean(X)/sd(X) # by hand
## [1] 1.157889

Simulating the distribution of t under the null hypothesis: \(X \sim N(0,\sigma^2)\)

set.seed(12345)
simdist = replicate(10000,  ts_zpm( rnorm(50) ))
head(simdist)
## [1]  1.15788926  1.92818028 -0.08173422  0.82143100  0.39880144 -1.21706025

Simulated data and theoretical density

hist(simdist, freq=FALSE)
lines(seq(-3,3,.01), dt(seq(-3,3,.01), 49))

The one-sided p-value for \(\hat{t}\) = 1.1579

simulated:

mean(simdist > 1.1579)
## [1] 0.123

theoretical (exact):

integrate(function(x) dt(x,49), 1.1579, Inf)$value
## [1] 0.1262589

What if there is a contaminated observation?

contsim = replicate(10000, 
    ts_zpm( contam1( rnorm(50) ) ) )
critval_1sided = qt(.95, 49)
mean(simdist > critval_1sided) # uncontaminated
## [1] 0.0483
mean(contsim > critval_1sided) # contaminated
## [1] 0.0946

Can we fix this?

library(parody)
## Loading required package: tools
robust_t = function(x) {
 outchk = calout.detect(x, method="GESD")
 if (!is.na(outchk$ind[1])) x = x[-outchk$ind]
 sqrt(length(x))*mean(x)/sd(x)
}
set.seed(12345)
contsim_r = replicate(10000, 
    robust_t( contam1( rnorm(50) ) ) )
mean(contsim_r > critval_1sided) # robust test on contaminated
## [1] 0.0531

Recap

Power to reject \(H_0:\mu = 0\) when \(\mu = 0.4\)

power.t.test(n=50, type="one.sample", 
   alt="one.sided", delta=.4)
## 
##      One-sample t test power calculation 
## 
##               n = 50
##           delta = 0.4
##              sd = 1
##       sig.level = 0.05
##           power = 0.8737242
##     alternative = one.sided
# empirical
mean( replicate(10000, 
    ts_zpm(rnorm(50, .4))>qt(.95, 49)) )
## [1] 0.8764

Effect of contamination: a single large contaminant can slash power

mean( replicate(10000, ts_zpm(
     contam1(rnorm(50, .4), slip=25))>qt(.95, 49)) )
## [1] 0.5834

Exercises:

Sample median is a resistant estimator of location

We’ll use a series of outlier magnitudes (0:10) and summarize distributions of estimators

set.seed(12345)
mns <- sapply(0:10, function(o) 
  median(replicate(5000, mean(contam1(rnorm(50),o))))) 
set.seed(12345)
meds <- sapply(0:10, function(o) 
  median(replicate(5000, median(contam1(rnorm(50),o))))) 

Note sensitivity of mean

plot(0:10, mns, xlab="outlier magnitude", ylab="median of stat",
  pch=19)
points(0:10, meds, pch=19, col="blue")
legend(0, .1, pch=19, col=c("black", "blue"), legend=c("mean", "median"))

Sample MAD is a resistant estimator of dispersion

set.seed(12345)
sds <- sapply(0:10, function(o) 
  median(replicate(5000, sd(contam1(rnorm(50),o))))) 
set.seed(12345)
mads <- sapply(0:10, function(o) 
  median(replicate(5000, mad(contam1(rnorm(50),o))))) 

Note sensitivity of SD

plot(0:10, sds, xlab="outlier magnitude", ylab="median of stat",
  pch=19)
points(0:10, mads, pch=19, col="blue")
legend(0, 1.2, pch=19, col=c("black", "blue"), legend=c("SD", "MAD"))

Recap

Exercises

qsignrank(.95, 50)
## [1] 808
set.seed(12345)
wilcox.test(rnorm(50, .4))$statistic
##   V 
## 977

Deficiencies of scalar summaries

Anscombe’s data

Show that the (x,y) pairs have identical

- marginal means
- marginal SDs
- correlation coefficients
- linear regressions of y on x

Use MASS::rlm to get a model for y3, x3 that fits the majority of points exactly

Outliers in RNA-seq analysis

Simulating an RNA-seq experiment

suppressMessages({set.seed(12345)
library(DESeq2)
S1 = makeExampleDESeqDataSet(betaSD=.75)
D1 = DESeq(S1)
R1 = results(D1)})
summary(R1)
## 
## out of 997 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)     : 99, 9.9% 
## LFC < 0 (down)   : 95, 9.5% 
## outliers [1]     : 2, 0.2% 
## low counts [2]   : 211, 21% 
## (mean count < 6)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

Exercises

Conclusion