Contents

Author: Sonali Arora (sarora@fredhutch.org)
Date: 20-22 July, 2015

The material in this course requires R version 3.2.1 and Bioconductor version 3.2

0.1 R

0.2 Useful Functions in base R

0.3 Getting help in R

0.4 Data types in R

0.5 R programming concepts

# arithmetic mean of the integers 1 to 10
mean(1:10)
## [1] 5.5
# draw 10 standard-normal values; the first argument of rnorm() is the number
# of observations (a longer vector such as 1:10 only contributes its length).
# No seed is set, so the values differ from run to run.
rnorm(10)
##  [1] -0.33290892 -1.03732032  0.00358368 -0.45695476  1.00505058  0.67227284 -0.90495556 -1.45142584
##  [9]  0.22424728 -0.20679577
# quartiles, extremes and mean of a fresh sample of 10 draws
summary(rnorm(10))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.9524 -0.4189  0.4298  0.3124  0.6072  2.2260
data(iris)

# find those rows where Petal.Width equals 0.2; the comparison uses a small
# tolerance because == on doubles can miss values that differ only by
# floating-point rounding error
iris[abs(iris$Petal.Width - 0.2) < 1e-8, ]
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1           5.1         3.5          1.4         0.2  setosa
## 2           4.9         3.0          1.4         0.2  setosa
## 3           4.7         3.2          1.3         0.2  setosa
## 4           4.6         3.1          1.5         0.2  setosa
## 5           5.0         3.6          1.4         0.2  setosa
## 8           5.0         3.4          1.5         0.2  setosa
## 9           4.4         2.9          1.4         0.2  setosa
## 11          5.4         3.7          1.5         0.2  setosa
## 12          4.8         3.4          1.6         0.2  setosa
## 15          5.8         4.0          1.2         0.2  setosa
## 21          5.4         3.4          1.7         0.2  setosa
## 23          4.6         3.6          1.0         0.2  setosa
## 25          4.8         3.4          1.9         0.2  setosa
## 26          5.0         3.0          1.6         0.2  setosa
## 28          5.2         3.5          1.5         0.2  setosa
## 29          5.2         3.4          1.4         0.2  setosa
## 30          4.7         3.2          1.6         0.2  setosa
## 31          4.8         3.1          1.6         0.2  setosa
## 34          5.5         4.2          1.4         0.2  setosa
## 35          4.9         3.1          1.5         0.2  setosa
## 36          5.0         3.2          1.2         0.2  setosa
## 37          5.5         3.5          1.3         0.2  setosa
## 39          4.4         3.0          1.3         0.2  setosa
## 40          5.1         3.4          1.5         0.2  setosa
## 43          4.4         3.2          1.3         0.2  setosa
## 47          5.1         3.8          1.6         0.2  setosa
## 48          4.6         3.2          1.4         0.2  setosa
## 49          5.3         3.7          1.5         0.2  setosa
## 50          5.0         3.3          1.4         0.2  setosa
# keep only the flowers whose Sepal.Length is below 4.5
subset(iris, Sepal.Length < 4.5)
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 9           4.4         2.9          1.4         0.2  setosa
## 14          4.3         3.0          1.1         0.1  setosa
## 39          4.4         3.0          1.3         0.2  setosa
## 43          4.4         3.2          1.3         0.2  setosa
# find all rows belonging to setosa and keep them in their own data frame
setosa_iris <- iris[iris$Species == "setosa", ]
# number of rows and columns in the subset
dim(setosa_iris)
## [1] 50  5
# first six rows of the subset
head(setosa_iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
# remove the non-numeric Species column, keeping the four measurement columns
iris <- iris[, names(iris) != "Species"]
dim(iris)
## [1] 150   4
# mean of every column, returned as a named list (shortcut: colMeans(iris))
lapply(X = iris, FUN = mean)
## $Sepal.Length
## [1] 5.843333
## 
## $Sepal.Width
## [1] 3.057333
## 
## $Petal.Length
## [1] 3.758
## 
## $Petal.Width
## [1] 1.199333
# same computation, with the list collapsed into a named numeric vector
sapply(X = iris, FUN = mean)
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##     5.843333     3.057333     3.758000     1.199333
# mean of every row: MARGIN = 1 applies the function across rows
# (shortcut: rowMeans(iris))
apply(iris, MARGIN = 1, FUN = mean)
##   [1] 2.550 2.375 2.350 2.350 2.550 2.850 2.425 2.525 2.225 2.400 2.700 2.500 2.325 2.125 2.800
##  [16] 3.000 2.750 2.575 2.875 2.675 2.675 2.675 2.350 2.650 2.575 2.450 2.600 2.600 2.550 2.425
##  [31] 2.425 2.675 2.725 2.825 2.425 2.400 2.625 2.500 2.225 2.550 2.525 2.100 2.275 2.675 2.800
##  [46] 2.375 2.675 2.350 2.675 2.475 4.075 3.900 4.100 3.275 3.850 3.575 3.975 2.900 3.850 3.300
##  [61] 2.875 3.650 3.300 3.775 3.350 3.900 3.650 3.400 3.600 3.275 3.925 3.550 3.800 3.700 3.725
##  [76] 3.850 3.950 4.100 3.725 3.200 3.200 3.150 3.400 3.850 3.600 3.875 4.000 3.575 3.500 3.325
##  [91] 3.425 3.775 3.400 2.900 3.450 3.525 3.525 3.675 2.925 3.475 4.525 3.875 4.525 4.150 4.375
## [106] 4.825 3.400 4.575 4.200 4.850 4.200 4.075 4.350 3.800 4.025 4.300 4.200 5.100 4.875 3.675
## [121] 4.525 3.825 4.800 3.925 4.450 4.550 3.900 3.950 4.225 4.400 4.550 5.025 4.250 3.925 3.925
## [136] 4.775 4.425 4.200 3.900 4.375 4.450 4.350 3.875 4.550 4.550 4.300 3.925 4.175 4.325 3.950

0.6 R as a Statistical Computing Environment

# simulate 1000 draws from a standard normal distribution
x <- rnorm(n = 1000)

# vectorized arithmetic: add Gaussian noise (standard deviation 0.8) to every
# element of x in a single expression
y <- x + rnorm(n = 1000, sd = 0.8)

# bundle predictor and response into a two-column data frame
df <- data.frame(x, y)

# least-squares linear regression of y on x
fit <- lm(formula = y ~ x, data = df)

0.7 Visualizing Data in R

# Base-graphics scatter plot of y against x, with the fitted regression line
# drawn on top in red.
# No par(mfrow = ...) layout is set here: ggplot2 draws with grid graphics,
# which ignores par(mfrow), so a 1 x 2 layout would only squeeze this plot
# into the left half of the device and leave the other half empty.
plot(y ~ x, data = df, cex.lab = 2)
abline(fit, col = "red", lwd = 2)

# The same data drawn with ggplot2: a point layer plus a linear-model smoother.
library(ggplot2)
ggplot(df, aes(x, y)) +
  geom_point() +
  stat_smooth(method = "lm")

0.8 sessionInfo()

# print the R version, platform, locale and package versions of this session
sessionInfo()
## R version 3.2.1 (2015-06-18)
## Platform: x86_64-unknown-linux-gnu (64-bit)
## Running under: Ubuntu 14.04.2 LTS
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8       
##  [4] LC_COLLATE=C               LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                  LC_ADDRESS=C              
## [10] LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggplot2_1.0.1   BiocStyle_1.7.4
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.11.6      codetools_0.2-14 digest_0.6.8     MASS_7.3-43      grid_3.2.1      
##  [6] plyr_1.8.3       gtable_0.1.2     formatR_1.2      magrittr_1.5     scales_0.2.5    
## [11] evaluate_0.7     stringi_0.5-5    reshape2_1.4.1   rmarkdown_0.7    labeling_0.3    
## [16] proto_0.3-10     tools_3.2.1      stringr_1.0.0    munsell_0.4.2    yaml_2.1.13     
## [21] colorspace_1.2-6 htmltools_0.2.6  knitr_1.10.5