Input clean data, with Sex
and Year
as factors.
path <- file.choose() # look for BRFSS-subset.csv
stopifnot(file.exists(path))
library(tidyverse)
col_types <- cols(
Age = col_integer(),
Weight = col_double(),
Sex = col_factor(c("Female", "Male")),
Height = col_double(),
Year = col_factor(c("1990", "2010"))
)
brfss <- read_csv(path, col_types = col_types)
brfss
## # A tibble: 20,000 x 5
## Age Weight Sex Height Year
## <int> <dbl> <fctr> <dbl> <fctr>
## 1 31 48.98798 Female 157.48 1990
## 2 57 81.64663 Female 157.48 1990
## 3 43 80.28585 Male 177.80 1990
## 4 72 70.30682 Male 170.18 1990
## 5 31 49.89516 Female 154.94 1990
## 6 58 54.43108 Female 154.94 1990
## 7 45 69.85323 Male 172.72 1990
## 8 37 68.03886 Male 180.34 1990
## 9 33 65.77089 Female 170.18 1990
## 10 75 70.76041 Female 152.40 1990
## # ... with 19,990 more rows
t.test()
for Weight in 1990 vs. 2010 FemalesFilter the data to include females only, and use base plot()
function and the formula interface to visualize the relationship between Weight
and Year
.
brfss %>% filter(Sex %in% "Female") %>% plot(Weight ~ Year, data = .)
Use a t.test()
to test the hypothesis that female weight is the same in 2010 as in 1990
brfss %>% filter(Sex %in% "Female") %>% t.test(Weight ~ Year, data = .)
##
## Welch Two Sample t-test
##
## data: Weight by Year
## t = -27.133, df = 11079, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -8.723607 -7.548102
## sample estimates:
## mean in group 1990 mean in group 2010
## 64.81838 72.95424
Filter the data to contain 2010 Males. Use plot()
to visualize the relationship, and lm()
to model it.
male2010 <- brfss %>% filter(Year %in% "2010", Sex %in% "Male")
male2010 %>% plot( Weight ~ Height, data = .)
fit <- male2010 %>% lm( Weight ~ Height, data = .)
fit
##
## Call:
## lm(formula = Weight ~ Height, data = .)
##
## Coefficients:
## (Intercept) Height
## -86.8747 0.9873
summary(fit)
##
## Call:
## lm(formula = Weight ~ Height, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -54.867 -11.349 -2.677 8.263 180.227
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -86.87470 6.67835 -13.01 <2e-16 ***
## Height 0.98727 0.03748 26.34 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.88 on 3617 degrees of freedom
## (60 observations deleted due to missingness)
## Multiple R-squared: 0.1609, Adjusted R-squared: 0.1607
## F-statistic: 693.8 on 1 and 3617 DF, p-value: < 2.2e-16
Multiple regression: Weight and Height, accounting for difference between years
male <- brfss %>% filter(Sex %in% "Male")
male %>% lm(Weight ~ Year + Height, data = .) %>% summary()
##
## Call:
## lm(formula = Weight ~ Year + Height, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -53.037 -9.032 -1.546 6.846 181.022
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -80.56568 3.88047 -20.76 <2e-16 ***
## Year2010 7.86690 0.32684 24.07 <2e-16 ***
## Height 0.90764 0.02174 41.75 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.44 on 7854 degrees of freedom
## (104 observations deleted due to missingness)
## Multiple R-squared: 0.2262, Adjusted R-squared: 0.2261
## F-statistic: 1148 on 2 and 7854 DF, p-value: < 2.2e-16
Is there an interaction between Year
and Height
?
male %>% lm(Weight ~ Year * Height, data = .) %>% summary()
##
## Call:
## lm(formula = Weight ~ Year * Height, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -54.867 -9.080 -1.731 6.796 180.227
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -68.49361 5.27146 -12.993 < 2e-16 ***
## Year2010 -18.38109 7.77065 -2.365 0.018032 *
## Height 0.83990 0.02955 28.421 < 2e-16 ***
## Year2010:Height 0.14737 0.04359 3.381 0.000726 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.43 on 7853 degrees of freedom
## (104 observations deleted due to missingness)
## Multiple R-squared: 0.2274, Adjusted R-squared: 0.2271
## F-statistic: 770.3 on 3 and 7853 DF, p-value: < 2.2e-16
Check out other things to do with fitted model:
broom::tidy()
: P-value, etc., as data.framebroom::augment()
: fitted values, residuals, etc
library(broom)
male %>% lm(Weight ~ Year + Height, data = .) %>%
augment() %>% as.tibble()
## # A tibble: 7,857 x 11
## .rownames Weight Year Height .fitted .se.fit .resid
## <chr> <dbl> <fctr> <dbl> <dbl> <dbl> <dbl>
## 1 1 80.28585 1990 177.80 80.81238 0.2219842 -0.526530
## 2 2 70.30682 1990 170.18 73.89618 0.2823537 -3.589360
## 3 3 69.85323 1990 172.72 76.20158 0.2519471 -6.348353
## 4 4 68.03886 1990 180.34 83.11778 0.2265455 -15.078925
## 5 5 88.45051 1990 180.34 83.11778 0.2265455 5.332732
## 6 6 81.64663 1990 182.88 85.42318 0.2438568 -3.776555
## 7 7 92.98644 1990 182.88 85.42318 0.2438568 7.563255
## 8 8 97.52236 1990 193.04 94.64478 0.3911687 2.877575
## 9 9 78.01789 1990 170.18 73.89618 0.2823537 4.121711
## 10 10 77.11070 1990 175.26 78.50698 0.2309296 -1.396276
## # ... with 7,847 more rows, and 4 more variables: .hat <dbl>, .sigma <dbl>,
## # .cooksd <dbl>, .std.resid <dbl>
ggplot: “Grammar of Graphics”
ggplot2()
aes()
, ‘x’ and ‘y’ values, point colors, etc.geom_point()
: pointsgeom_smooth()
: fitted linegeom_*
: …facet_grid()
) to create ‘panels’ based on factor levels, with shared axes.Create a plot with data points
ggplot(male, aes(x=Height, y = Weight)) + geom_point()
Capture the base plot and points, and explore different smoothed relationships, e.g., linear model, non-parameteric smoother
plt <- ggplot(male, aes(x=Height, y = Weight)) + geom_point()
plt + geom_smooth(method = "lm")
plt + geom_smooth() # default: generalized additive model
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Use an aes()
thetic to color smoothed lines based on Year
, or facet_grid()
to separate years.
ggplot(male, aes(x = Weight)) + geom_density(aes(fill = Year), alpha = .2)
plt + geom_smooth(method = "lm", aes(color = Year))
plt + facet_grid( ~ Year ) + geom_smooth(method = "lm")
This is a classic microarray experiment. Microarrays consist of ‘probesets’ that interogate genes for their level of expression. In the experiment we’re looking at, there are 12625 probesets measured on each of the 128 samples. The raw expression levels estimated by microarray assays require considerable pre-processing, the data we’ll work with has been pre-processed.
Start by finding the expression data file on disk.
path <- file.choose() # look for ALL-expression.csv
stopifnot(file.exists(path))
The data is stored in ‘comma-separate value’ format, with each probeset occupying a line, and the expression value for each sample in that probeset separated by a comma. Input the data using read_csv()
. The sample identifiers are present in the first column.
exprs <- read_csv(path)
## Parsed with column specification:
## cols(
## .default = col_double(),
## Gene = col_character()
## )
## See spec(...) for full column specifications.
We’ll also input the data that describes each column
path <- file.choose() # look for ALL-phenoData.csv
stopifnot(file.exists(path))
pdata <- read_csv(path)
## Parsed with column specification:
## cols(
## .default = col_character(),
## age = col_integer(),
## `t(4;11)` = col_logical(),
## `t(9;22)` = col_logical(),
## cyto.normal = col_logical(),
## ccr = col_logical(),
## relapse = col_logical(),
## transplant = col_logical()
## )
## See spec(...) for full column specifications.
pdata
## # A tibble: 128 x 22
## Sample cod diagnosis sex age BT remission CR date.cr
## <chr> <chr> <chr> <chr> <int> <chr> <chr> <chr> <chr>
## 1 01005 1005 5/21/1997 M 53 B2 CR CR 8/6/1997
## 2 01010 1010 3/29/2000 M 19 B2 CR CR 6/27/2000
## 3 03002 3002 6/24/1998 F 52 B4 CR CR 8/17/1998
## 4 04006 4006 7/17/1997 M 38 B1 CR CR 9/8/1997
## 5 04007 4007 7/22/1997 M 57 B2 CR CR 9/17/1997
## 6 04008 4008 7/30/1997 M 17 B1 CR CR 9/27/1997
## 7 04010 4010 10/30/1997 F 18 B1 CR CR 1/7/1998
## 8 04016 4016 2/10/2000 M 16 B1 CR CR 4/17/2000
## 9 06002 6002 3/19/1997 M 15 B2 CR CR 6/9/1997
## 10 08001 8001 1/15/1997 M 40 B2 CR CR 3/26/1997
## # ... with 118 more rows, and 13 more variables: `t(4;11)` <lgl>,
## # `t(9;22)` <lgl>, cyto.normal <lgl>, citog <chr>, mol.biol <chr>, `fusion
## # protein` <chr>, mdr <chr>, kinet <chr>, ccr <lgl>, relapse <lgl>,
## # transplant <lgl>, f.u <chr>, `date last seen` <chr>
The expression data is presented in what is sometimes called ‘wide’ format; a different format is ‘tall’, where Sample and Gene group the single observation Expression. Use tidyr::gather()
to gather the columns of the wide format into two columns representing the tall format, excluding the Gene
column from the gather operation.
exprs <- exprs %>% gather("Sample", "Expression", -Gene)
Explore the data a little, e.g., a summary and histogram of the expression values, and a histogram of average expression values of each gene.
exprs %>% select(Expression) %>% summary()
## Expression
## Min. : 1.985
## 1st Qu.: 4.130
## Median : 5.469
## Mean : 5.625
## 3rd Qu.: 6.827
## Max. :14.127
exprs $ Expression %>% hist()
exprs %>% group_by(Gene) %>%
summarize(AveExprs = mean(Expression)) %$% AveExprs %>%
hist(breaks=50)
For subsequent analysis, we also want to simplify the ‘B or T’ cell type classification
pdata <- pdata %>% mutate(B_or_T = factor(substr(BT, 1, 1)))
We’d like to reduce high-dimensional data to lower dimension for visualization. To do so, we need the dist()
ance between samples. From ?dist
, the input can be a data.frame where rows represent Sample
and columns represent Expression
values. Use spread()
to create appropriate data from exprs
, and pipe the result to dist()
ance.x
input <- exprs %>% spread(Gene, Expression)
samples <- input $ Sample
input <- input %>% select(-Sample) %>% as.matrix
rownames(input) <- samples
Calculate distance between samples, and use that for MDS scaling
mds <- dist(input) %>% cmdscale()
The result is a matrix; make it ‘tidy’ by coercing to a tibble; add the Sample identifiers as a distinct column.
mds <- mds %>% as.tibble() %>% mutate(Sample = rownames(mds))
Visualize the result
ggplot(mds, aes(x=V1, y = V2)) + geom_point()
With the ‘eye of faith’, it seems like there are two groups of points. To explore this, join the MDS scaling with the phenotypic data
joined <- inner_join(mds, pdata)
## Joining, by = "Sample"
and use the B_or_T
column as an aesthetic to color points
ggplot(joined, aes(x = V1, y = V2)) + geom_point(aes(color = B_or_T))