## ----VROOM_CONNECTION_SIZE, eval = TRUE, echo = FALSE-------------------------

  Sys.setenv(VROOM_CONNECTION_SIZE='512000')


## ----getPackage, eval = FALSE-------------------------------------------------
#  
#    if (!requireNamespace('BiocManager', quietly = TRUE))
#      install.packages('BiocManager')
#  
#    BiocManager::install('PCAtools')
#  

## ----getPackageDevel, eval = FALSE--------------------------------------------
#  
#    if (!requireNamespace('remotes', quietly = TRUE))
#      install.packages('remotes')
#  
#    remotes::install_github('kevinblighe/PCAtools')
#  

## ----Load, message = FALSE----------------------------------------------------

  library(PCAtools)


## ----message = FALSE----------------------------------------------------------

  library(airway)
  library(magrittr)

  data('airway')
  airway$dex %<>% relevel('untrt')


## ----message = FALSE----------------------------------------------------------

  ens <- rownames(airway)

  library(org.Hs.eg.db)
  symbols <- mapIds(org.Hs.eg.db, keys = ens,
    column = c('SYMBOL'), keytype = 'ENSEMBL')
  symbols <- symbols[!is.na(symbols)]
  symbols <- symbols[match(rownames(airway), names(symbols))]
  rownames(airway) <- symbols
  keep <- !is.na(rownames(airway))
  airway <- airway[keep,]


## ----message = FALSE, warning = FALSE-----------------------------------------

  library('DESeq2')

  dds <- DESeqDataSet(airway, design = ~ cell + dex)
  dds <- DESeq(dds)
  vst <- assay(vst(dds))


## -----------------------------------------------------------------------------

  p <- pca(vst, metadata = colData(airway), removeVar = 0.1)


## ----ex1, warning = FALSE, fig.height = 7, fig.width = 6, fig.cap = 'Figure 1: A scree plot'----

  screeplot(p, axisLabSize = 18, titleLabSize = 22)


## ----ex2a, eval = FALSE-------------------------------------------------------
#  
#    biplot(p)
#  

## ----ex2b, fig.height = 7, fig.width = 7, fig.cap = 'Figure 2: A bi-plot'-----

  biplot(p, showLoadings = TRUE,
    labSize = 5, pointSize = 5, sizeLoadingsNames = 5)


## ----message = FALSE----------------------------------------------------------

  library(Biobase)
  library(GEOquery)

  # load series and platform data from GEO
    gset <- getGEO('GSE2990', GSEMatrix = TRUE, getGPL = FALSE)
    mat <- exprs(gset[[1]])

  # remove Affymetrix control probes
    mat <- mat[-grep('^AFFX', rownames(mat)),]

  # extract information of interest from the phenotype data (pdata)
   idx <- which(colnames(pData(gset[[1]])) %in%
      c('relation', 'age:ch1', 'distant rfs:ch1', 'er:ch1',
        'ggi:ch1', 'grade:ch1', 'size:ch1',
        'time rfs:ch1'))
    metadata <- data.frame(pData(gset[[1]])[,idx],
      row.names = rownames(pData(gset[[1]])))

  # tidy column names
    colnames(metadata) <- c('Study', 'Age', 'Distant.RFS', 'ER', 'GGI', 'Grade',
      'Size', 'Time.RFS')

  # prepare certain phenotypes of interest
    metadata$Study <- gsub('Reanalyzed by: ', '', as.character(metadata$Study))
    metadata$Age <- as.numeric(gsub('^KJ', NA, as.character(metadata$Age)))
    metadata$Distant.RFS <- factor(metadata$Distant.RFS,
      levels = c(0,1))
    metadata$ER <- factor(gsub('\\?', NA, as.character(metadata$ER)),
      levels = c(0,1))
    metadata$ER <- factor(ifelse(metadata$ER == 1, 'ER+', 'ER-'),
      levels = c('ER-', 'ER+'))
    metadata$GGI <- as.numeric(as.character(metadata$GGI))
    metadata$Grade <- factor(gsub('\\?', NA, as.character(metadata$Grade)),
      levels = c(1,2,3))
    metadata$Grade <- gsub(1, 'Grade 1', gsub(2, 'Grade 2', gsub(3, 'Grade 3', metadata$Grade)))
    metadata$Grade <- factor(metadata$Grade, levels = c('Grade 1', 'Grade 2', 'Grade 3'))
    metadata$Size <- as.numeric(as.character(metadata$Size))
    metadata$Time.RFS <- as.numeric(gsub('^KJX|^KJ', NA, metadata$Time.RFS))

  # remove samples from the pdata that have any NA value
    discard <- apply(metadata, 1, function(x) any(is.na(x)))
    metadata <- metadata[!discard,]

  # filter the expression data to match the samples in our pdata
    mat <- mat[,which(colnames(mat) %in% rownames(metadata))]

  # check that sample names match exactly between pdata and expression data 
    all(colnames(mat) == rownames(metadata))


## -----------------------------------------------------------------------------

  p <- pca(mat, metadata = metadata, removeVar = 0.1)


## ----ex3a, eval = FALSE-------------------------------------------------------
#  
#    biplot(p)
#  

## ----ex3b, fig.height = 6, fig.width = 7, fig.cap = 'Figure 3: A bi-plot'-----

  biplot(p, showLoadings = TRUE, lab = NULL)


## ----ex4, message = FALSE, fig.height = 10, fig.width = 10, fig.cap = 'Figure 4: A pairs plot'----

  pairsplot(p)


## ----ex5, fig.height = 6, fig.width = 8, fig.cap = 'Figure 5: A loadings plot'----

  plotloadings(p, labSize = 3)


## ----ex6, warning = FALSE, fig.height = 4, fig.width = 8, fig.cap = 'Figure 6: An eigencor plot'----

  eigencorplot(p,
    metavars = c('Study','Age','Distant.RFS','ER',
      'GGI','Grade','Size','Time.RFS'))


## -----------------------------------------------------------------------------

  p$rotated[1:5,1:5]

  p$loadings[1:5,1:5]


## ----messages = FALSE---------------------------------------------------------

  suppressMessages(require(hgu133a.db))
  newnames <- mapIds(hgu133a.db,
    keys = rownames(p$loadings),
    column = c('SYMBOL'),
    keytype = 'PROBEID')

  # tidy up for NULL mappings and duplicated gene symbols
  newnames <- ifelse(is.na(newnames) | duplicated(newnames),
    names(newnames), newnames)
  rownames(p$loadings) <- newnames
    

## ----warning = FALSE----------------------------------------------------------

  horn <- parallelPCA(mat)
  horn$n


## -----------------------------------------------------------------------------

  elbow <- findElbowPoint(p$variance)
  elbow


## ----ex7, fig.height = 7, fig.width = 9, fig.cap = 'Figure 7: Advanced scree plot illustrating optimum number of PCs'----

  library(ggplot2)

  screeplot(p,
    components = getComponents(p, 1:20),
    vline = c(horn$n, elbow)) +

    geom_label(aes(x = horn$n + 1, y = 50,
      label = 'Horn\'s', vjust = -1, size = 8)) +
    geom_label(aes(x = elbow + 1, y = 50,
      label = 'Elbow method', vjust = -1, size = 8))


## -----------------------------------------------------------------------------

  which(cumsum(p$variance) > 80)[1]


## ----ex8, fig.height = 7, fig.width = 7.5, fig.cap = 'Figure 8: Colour by a metadata factor, use a custom label, add lines through origin, and add legend'----

  biplot(p,
    lab = paste0(p$metadata$Age, ' años'),
    colby = 'ER',
    hline = 0, vline = 0,
    legendPosition = 'right')


## ----ex9, message = FALSE, fig.height = 7.5, fig.width = 7, fig.cap = 'Figure 9: Supply custom colours and encircle variables by group'----

  biplot(p,
    colby = 'ER', colkey = c('ER+' = 'forestgreen', 'ER-' = 'purple'),
    colLegendTitle = 'ER-\nstatus',
    # encircle config
      encircle = TRUE,
      encircleFill = TRUE,
    hline = 0, vline = c(-25, 0, 25),
    legendPosition = 'top', legendLabSize = 16, legendIconSize = 8.0)

  biplot(p,
    colby = 'ER', colkey = c('ER+' = 'forestgreen', 'ER-' = 'purple'),
    colLegendTitle = 'ER-\nstatus',
    # encircle config
      encircle = TRUE, encircleFill = FALSE,
      encircleAlpha = 1, encircleLineSize = 5,
    hline = 0, vline = c(-25, 0, 25),
    legendPosition = 'top', legendLabSize = 16, legendIconSize = 8.0)


## ----ex10, message = FALSE, fig.height = 7.5, fig.width = 7, fig.cap = 'Figure 10: Stat ellipses'----

  biplot(p,
    colby = 'ER', colkey = c('ER+' = 'forestgreen', 'ER-' = 'purple'),
    # ellipse config
      ellipse = TRUE,
      ellipseType = 't',
      ellipseLevel = 0.95,
      ellipseFill = TRUE,
      ellipseAlpha = 1/4,
      ellipseLineSize = 1.0,
    xlim = c(-125,125), ylim = c(-50, 80),
    hline = 0, vline = c(-25, 0, 25),
    legendPosition = 'top', legendLabSize = 16, legendIconSize = 8.0)

  biplot(p,
    colby = 'ER', colkey = c('ER+' = 'forestgreen', 'ER-' = 'purple'),
    # ellipse config
      ellipse = TRUE,
      ellipseType = 't',
      ellipseLevel = 0.95,
      ellipseFill = TRUE,
      ellipseAlpha = 1/4,
      ellipseLineSize = 0,
      ellipseFillKey = c('ER+' = 'yellow', 'ER-' = 'pink'),
    xlim = c(-125,125), ylim = c(-50, 80),
    hline = 0, vline = c(-25, 0, 25),
    legendPosition = 'top', legendLabSize = 16, legendIconSize = 8.0)


## ----ex11, message = FALSE, eval = FALSE--------------------------------------
#  
#    biplot(p,
#      colby = 'ER', colkey = c('ER+' = 'forestgreen', 'ER-' = 'purple'),
#      hline = c(-25, 0, 25), vline = c(-25, 0, 25),
#      legendPosition = 'top', legendLabSize = 13, legendIconSize = 8.0,
#      shape = 'Grade', shapekey = c('Grade 1' = 15, 'Grade 2' = 17, 'Grade 3' = 8),
#      drawConnectors = FALSE,
#      title = 'PCA bi-plot',
#      subtitle = 'PC1 versus PC2',
#      caption = '27 PCs ≈ 80%')
#  

## ----ex12a--------------------------------------------------------------------

  biplot(p,
    lab = NULL,
    colby = 'ER', colkey = c('ER+'='royalblue', 'ER-'='red3'),
    hline = c(-25, 0, 25), vline = c(-25, 0, 25),
    vlineType = c('dotdash', 'solid', 'dashed'),
    gridlines.major = FALSE, gridlines.minor = FALSE,
    pointSize = 5,
    legendPosition = 'left', legendLabSize = 14, legendIconSize = 8.0,
    shape = 'Grade', shapekey = c('Grade 1'=15, 'Grade 2'=17, 'Grade 3'=8),
    drawConnectors = FALSE,
    title = 'PCA bi-plot',
    subtitle = 'PC1 versus PC2',
    caption = '27 PCs ≈ 80%')


## ----ex12b, message = FALSE, fig.height = 5.5, fig.width = 11, fig.cap = 'Figure 11: Modify line types, remove gridlines, and increase point size'----

  biplot(p,
    # loadings parameters
      showLoadings = TRUE,
      lengthLoadingsArrowsFactor = 1.5,
      sizeLoadingsNames = 4,
      colLoadingsNames = 'red4',
    # other parameters
      lab = NULL,
      colby = 'ER', colkey = c('ER+'='royalblue', 'ER-'='red3'),
      hline = 0, vline = c(-25, 0, 25),
      vlineType = c('dotdash', 'solid', 'dashed'),
      gridlines.major = FALSE, gridlines.minor = FALSE,
      pointSize = 5,
      legendPosition = 'left', legendLabSize = 14, legendIconSize = 8.0,
      shape = 'Grade', shapekey = c('Grade 1'=15, 'Grade 2'=17, 'Grade 3'=8),
      drawConnectors = FALSE,
      title = 'PCA bi-plot',
      subtitle = 'PC1 versus PC2',
      caption = '27 PCs ≈ 80%')


## ----ex13a, fig.height = 5.5, fig.width = 6, fig.cap = 'Figure 12: Colour by a continuous variable and plot other PCs'----

  # add ESR1 gene expression to the metadata
  p$metadata$ESR1 <- mat['205225_at',]

  biplot(p,
    x = 'PC2', y = 'PC3',
    lab = NULL,
    colby = 'ESR1',
    shape = 'ER',
    hline = 0, vline = 0,
    legendPosition = 'right') +

  scale_colour_gradient(low = 'gold', high = 'red2')


## ----ex13b, eval = FALSE------------------------------------------------------
#  
#    # was always eval = FALSE
#    biplot(p, x = 'PC10', y = 'PC50',
#      lab = NULL,
#      colby = 'Age',
#      hline = 0, vline = 0,
#      hlineWidth = 1.0, vlineWidth = 1.0,
#      gridlines.major = FALSE, gridlines.minor = TRUE,
#      pointSize = 5,
#      legendPosition = 'left', legendLabSize = 16, legendIconSize = 8.0,
#      shape = 'Grade', shapekey = c('Grade 1'=15, 'Grade 2'=17, 'Grade 3'=8),
#      drawConnectors = FALSE,
#      title = 'PCA bi-plot',
#      subtitle = 'PC10 versus PC50',
#      caption = '27 PCs ≈ 80%')
#  

## ----ex14, message = FALSE, fig.height = 8, fig.width = 7, fig.cap = 'Figure 13: Quickly explore potentially informative PCs via a pairs plot'----

  pairsplot(p,
    components = getComponents(p, c(1:10)),
    triangle = TRUE, trianglelabSize = 12,
    hline = 0, vline = 0,
    pointSize = 0.4,
    gridlines.major = FALSE, gridlines.minor = FALSE,
    colby = 'Grade',
    title = 'Pairs plot', plotaxes = FALSE,
    margingaps = unit(c(-0.01, -0.01, -0.01, -0.01), 'cm'))


## ----ex15, fig.height = 5.5, fig.width = 6, fig.cap = 'Figure 14: arranging a pairs plot horizontally'----

  pairsplot(p,
    components = getComponents(p, c(4,33,11,1)),
    triangle = FALSE,
    hline = 0, vline = 0,
    pointSize = 0.8,
    gridlines.major = FALSE, gridlines.minor = FALSE,
    colby = 'ER',
    title = 'Pairs plot', titleLabSize = 22,
    axisLabSize = 14, plotaxes = TRUE,
    margingaps = unit(c(0.1, 0.1, 0.1, 0.1), 'cm'))


## ----ex16, fig.height = 5.5, fig.width = 7, fig.cap = 'Figure 15: Determine the variables that drive variation among each PC'----

  plotloadings(p,
    rangeRetain = 0.01,
    labSize = 4.0,
    title = 'Loadings plot',
    subtitle = 'PC1, PC2, PC3, PC4, PC5',
    caption = 'Top 1% variables',
    shape = 24,
    col = c('limegreen', 'black', 'red3'),
    drawConnectors = TRUE)


## ----ex17a, fig.height = 7, fig.width = 9, fig.cap = 'Figure 16: plotting absolute component loadings'----

  plotloadings(p,
    components = getComponents(p, c(4,33,11,1)),
    rangeRetain = 0.1,
    labSize = 4.0,
    absolute = FALSE,
    title = 'Loadings plot',
    subtitle = 'Misc PCs',
    caption = 'Top 10% variables',
    shape = 23, shapeSizeRange = c(1, 16),
    col = c('white', 'pink'),
    drawConnectors = FALSE)


## ----ex17b, fig.height = 4.5, fig.width = 9, fig.cap = 'Figure 17: plotting absolute component loadings'----

  plotloadings(p,
    components = getComponents(p, c(2)),
    rangeRetain = 0.12, absolute = TRUE,
    col = c('black', 'pink', 'red4'),
    drawConnectors = TRUE, labSize = 4) + coord_flip()


## ----ex18a, warning = FALSE, fig.height = 4.25, fig.width = 12, fig.cap = 'Figure 18: Correlate the principal components back to the clinical data'----

  eigencorplot(p,
    components = getComponents(p, 1:27),
    metavars = c('Study','Age','Distant.RFS','ER',
      'GGI','Grade','Size','Time.RFS'),
    col = c('darkblue', 'blue2', 'black', 'red2', 'darkred'),
    cexCorval = 0.7,
    colCorval = 'white',
    fontCorval = 2,
    posLab = 'bottomleft',
    rotLabX = 45,
    posColKey = 'top',
    cexLabColKey = 1.5,
    scale = TRUE,
    main = 'PC1-27 clinical correlations',
    colFrame = 'white',
    plotRsquared = FALSE)


## ----ex18b, warning = FALSE, fig.height = 5, fig.width = 12, fig.cap = 'Figure 19: Correlate the principal components back to the clinical data'----

  eigencorplot(p,
    components = getComponents(p, 1:horn$n),
    metavars = c('Study','Age','Distant.RFS','ER','GGI',
      'Grade','Size','Time.RFS'),
    col = c('white', 'cornsilk1', 'gold', 'forestgreen', 'darkgreen'),
    cexCorval = 1.2,
    fontCorval = 2,
    posLab = 'all',
    rotLabX = 45,
    scale = TRUE,
    main = bquote(Principal ~ component ~ Pearson ~ r^2 ~ clinical ~ correlates),
    plotRsquared = TRUE,
    corFUN = 'pearson',
    corUSE = 'pairwise.complete.obs',
    corMultipleTestCorrection = 'BH',
    signifSymbols = c('****', '***', '**', '*', ''),
    signifCutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 1))


## ----ex19, message = FALSE, warning = FALSE, fig.height = 10, fig.width = 15, fig.cap = 'Figure 20: a merged panel of all PCAtools plots'----

  pscree <- screeplot(p, components = getComponents(p, 1:30),
    hline = 80, vline = 27, axisLabSize = 14, titleLabSize = 20,
    returnPlot = FALSE) +
    geom_label(aes(20, 80, label = '80% explained variation', vjust = -1, size = 8))

  ppairs <- pairsplot(p, components = getComponents(p, c(1:3)),
    triangle = TRUE, trianglelabSize = 12,
    hline = 0, vline = 0,
    pointSize = 0.8, gridlines.major = FALSE, gridlines.minor = FALSE,
    colby = 'Grade',
    title = '', plotaxes = FALSE,
    margingaps = unit(c(0.01, 0.01, 0.01, 0.01), 'cm'),
    returnPlot = FALSE)

  pbiplot <- biplot(p,
    # loadings parameters
      showLoadings = TRUE,
      lengthLoadingsArrowsFactor = 1.5,
      sizeLoadingsNames = 4,
      colLoadingsNames = 'red4',
    # other parameters
      lab = NULL,
      colby = 'ER', colkey = c('ER+'='royalblue', 'ER-'='red3'),
      hline = 0, vline = c(-25, 0, 25),
      vlineType = c('dotdash', 'solid', 'dashed'),
      gridlines.major = FALSE, gridlines.minor = FALSE,
      pointSize = 5,
      legendPosition = 'none', legendLabSize = 16, legendIconSize = 8.0,
      shape = 'Grade', shapekey = c('Grade 1'=15, 'Grade 2'=17, 'Grade 3'=8),
      drawConnectors = FALSE,
      title = 'PCA bi-plot',
      subtitle = 'PC1 versus PC2',
      caption = '27 PCs ≈ 80%',
      returnPlot = FALSE)

  ploadings <- plotloadings(p, rangeRetain = 0.01, labSize = 4,
    title = 'Loadings plot', axisLabSize = 12,
    subtitle = 'PC1, PC2, PC3, PC4, PC5',
    caption = 'Top 1% variables',
    shape = 24, shapeSizeRange = c(4, 8),
    col = c('limegreen', 'black', 'red3'),
    legendPosition = 'none',
    drawConnectors = FALSE,
    returnPlot = FALSE)

  peigencor <- eigencorplot(p,
    components = getComponents(p, 1:10),
    metavars = c('Study','Age','Distant.RFS','ER',
      'GGI','Grade','Size','Time.RFS'),
    cexCorval = 1.0,
    fontCorval = 2,
    posLab = 'all', 
    rotLabX = 45,
    scale = TRUE,
    main = "PC clinical correlates",
    cexMain = 1.5,
    plotRsquared = FALSE,
    corFUN = 'pearson',
    corUSE = 'pairwise.complete.obs',
    signifSymbols = c('****', '***', '**', '*', ''),
    signifCutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 1),
    returnPlot = FALSE)

    library(cowplot)
    library(ggplotify)

    top_row <- plot_grid(pscree, ppairs, pbiplot,
      ncol = 3,
      labels = c('A', 'B  Pairs plot', 'C'),
      label_fontfamily = 'serif',
      label_fontface = 'bold',
      label_size = 22,
      align = 'h',
      rel_widths = c(1.10, 0.80, 1.10))

    bottom_row <- plot_grid(ploadings,
      as.grob(peigencor),
      ncol = 2,
      labels = c('D', 'E'),
      label_fontfamily = 'serif',
      label_fontface = 'bold',
      label_size = 22,
      align = 'h',
      rel_widths = c(0.8, 1.2))

    plot_grid(top_row, bottom_row, ncol = 1,
      rel_heights = c(1.1, 0.9))


## -----------------------------------------------------------------------------

  p <- pca(mat, metadata = metadata, removeVar = 0.1)
  p.prcomp <- list(sdev = p$sdev,
    rotation = data.matrix(p$loadings),
    x = data.matrix(p$rotated),
    center = TRUE, scale = TRUE)

  class(p.prcomp) <- 'prcomp'

  # for this simple example, just use a chunk of
  # the original data for the prediction
  newdata <- t(mat[,seq(1,20)])
  predict(p.prcomp, newdata = newdata)[,1:5]


## -----------------------------------------------------------------------------

sessionInfo()