## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
    collapse = TRUE,
    comment = "#>",
    eval=TRUE,
    warning=FALSE,
    message = FALSE
)

## ----install, eval=FALSE------------------------------------------------------
#  
#  if (!requireNamespace("BiocManager", quietly = TRUE))
#      install.packages("BiocManager")
#  
#  BiocManager::install("GSgalgoR")
#  library(GSgalgoR)

## ----install-github, eval=FALSE-----------------------------------------------
#  devtools::install_github("https://github.com/harpomaxx/GSgalgoR")
#  library(GSgalgoR)

## ----datasets, eval=FALSE-----------------------------------------------------
#  
#  BiocManager::install("breastCancerUPP",version = "devel")
#  BiocManager::install("breastCancerTRANSBIG",version = "devel")
#  

## ----load_data, message=FALSE-------------------------------------------------

library(breastCancerTRANSBIG)
library(breastCancerUPP)


## ----libraries, message=FALSE-------------------------------------------------
library(GSgalgoR)
library(Biobase)
library(genefu)
library(survival)
library(survminer)
library(ggplot2)
data(pam50)

## ----load_data2---------------------------------------------------------------
data(upp)
Train<- upp
rm(upp)

data(transbig)
Test<- transbig
rm(transbig)

#To access gene expression data
train_expr<- exprs(Train)
test_expr<- exprs(Test)

#To access feature data
train_features<- fData(Train)
test_features<- fData(Test)

#To access clinical data
train_clinic <- pData(Train) 
test_clinic <- pData(Test) 


## ----drop duplicates----------------------------------------------------------

#Custom function to drop duplicated genes (keep genes with highest variance)

DropDuplicates<- function(eset, map= "Gene.symbol"){

    #Drop NA's
    drop <- which(is.na(fData(eset)[,map]))
    eset <- eset[-drop,]

    #Drop duplicates
    drop <- NULL
    Dup <- as.character(unique(fData(eset)[which(duplicated
            (fData(eset)[,map])),map]))
    Var <- apply(exprs(eset),1,var)
    for(j in Dup){
        pos <- which(fData(eset)[,map]==j)
        drop <- c(drop,pos[-which.max(Var[pos])])
    }

    eset <- eset[-drop,]

    featureNames(eset) <- fData(eset)[,map]
    return(eset)
}


## ----expandprobesets----------------------------------------------------------

# Custom function to expand probesets mapping to multiple genes
expandProbesets <- function (eset, sep = "///", map="Gene.symbol"){
    x <- lapply(featureNames(eset), function(x) strsplit(x, sep)[[1]])
    y<- lapply(as.character(fData(eset)[,map]), function(x) strsplit(x,sep))
    eset <- eset[order(sapply(x, length)), ]
    x <- lapply(featureNames(eset), function(x) strsplit(x, sep)[[1]])
    y<- lapply(as.character(fData(eset)[,map]), function(x) strsplit(x,sep))
    idx <- unlist(sapply(1:length(x), function(i) rep(i,length(x[[i]]))))
    idy <- unlist(sapply(1:length(y), function(i) rep(i,length(y[[i]]))))
    xx <- !duplicated(unlist(x))
    idx <- idx[xx]
    idy <- idy[xx]
    x <- unlist(x)[xx]
    y <- unlist(y)[xx]
    eset <- eset[idx, ]
    featureNames(eset) <- x
    fData(eset)[,map] <- x
    fData(eset)$gene <- y
    return(eset)
}


## ----adapted_expression-------------------------------------------------------
Train=DropDuplicates(Train)
Train=expandProbesets(Train)
#Drop NAs in survival
Train <- Train[,!is.na(
    survival::Surv(time=pData(Train)$t.rfs,event=pData(Train)$e.rfs))] 

Test=DropDuplicates(Test)
Test=expandProbesets(Test)
#Drop NAs in survival
Test <- 
    Test[,!is.na(survival::Surv(
        time=pData(Test)$t.rfs,event=pData(Test)$e.rfs))] 

#Determine common probes (Genes)
Int= intersect(rownames(Train),rownames(Test))

Train= Train[Int,]
Test= Test[Int,]

identical(rownames(Train),rownames(Test))


## ----reduced_expr-------------------------------------------------------------

#First we will get PAM50 centroids from genefu package

PAM50Centroids <- pam50$centroids
PAM50Genes <- pam50$centroids.map$probe
PAM50Genes<- featureNames(Train)[ featureNames(Train) %in% PAM50Genes]

#Now we sample 200 random genes from expression matrix

Non_PAM50Genes<- featureNames(Train)[ !featureNames(Train) %in% PAM50Genes]
Non_PAM50Genes <- sample(Non_PAM50Genes,200, replace=FALSE)

reduced_set <- c(PAM50Genes, Non_PAM50Genes)

#Now we get the reduced training and test sets

Train<- Train[reduced_set,]
Test<- Test[reduced_set,]


## ----robust_scaling-----------------------------------------------------------

exprs(Train) <- t(apply(exprs(Train),1,genefu::rescale,na.rm=TRUE,q=0.05))
exprs(Test) <- t(apply(exprs(Test),1,genefu::rescale,na.rm=TRUE,q=0.05))

train_expr <- exprs(Train)
test_expr <- exprs(Test)

## ----Surv---------------------------------------------------------------------
train_clinic <- pData(Train) 
test_clinic <- pData(Test)

train_surv <- survival::Surv(time=train_clinic$t.rfs,event=train_clinic$e.rfs)
test_surv <- survival::Surv(time=test_clinic$t.rfs,event=test_clinic$e.rfs)


## ----parameters, eval=TRUE----------------------------------------------------
# For testing reasons it is set to a low number but ideally should be above 100
population <- 30 
# For testing reasons it is set to a low number but ideally should be above 150
generations <-15
nCV <- 5                      
distancetype <- "pearson"     
TournamentSize <- 2
period <- 3650

## ----galgo_run, eval= TRUE,results='hide'-------------------------------------
set.seed(264)
output <- GSgalgoR::galgo(generations = generations, 
                        population = population, 
                        prob_matrix = train_expr, 
                        OS = train_surv,
                        nCV = nCV, 
                        distancetype = distancetype,
                        TournamentSize = TournamentSize, 
                        period = period)

## -----------------------------------------------------------------------------
print(class(output))

## ----to_list, eval= TRUE------------------------------------------------------
outputList <- to_list(output)
head(names(outputList))

## ----example_1, eval=TRUE-----------------------------------------------------
outputList[["Solution.1"]]

## ----to_dataframe, eval= TRUE-------------------------------------------------
outputDF <- to_dataframe(output)
head(outputDF)

## ----plot_pareto, eval=TRUE---------------------------------------------------
plot_pareto(output)

## ----classify, eval=TRUE------------------------------------------------------
#The reduced UPP dataset will be used as training set 
train_expression <- exprs(Train) 
train_clinic<- pData(Train)
train_features<- fData(Train)
train_surv<- survival::Surv(time=train_clinic$t.rfs,event=train_clinic$e.rfs)

#The reduced TRANSBIG dataset will be used as test set 

test_expression <- exprs(Test) 
test_clinic<- pData(Test)
test_features<- fData(Test)
test_surv<- survival::Surv(time=test_clinic$t.rfs,event=test_clinic$e.rfs)


#PAM50 centroids
centroids<- pam50$centroids
#Extract features from both data.frames
inBoth<- Reduce(intersect, list(rownames(train_expression),rownames(centroids)))

#Classify samples 

PAM50_train<- cluster_classify(train_expression[inBoth,],centroids[inBoth,],
                            method = "spearman")
table(PAM50_train)

PAM50_test<- cluster_classify(test_expression[inBoth,],centroids[inBoth,],
                            method = "spearman")
table(PAM50_test)

# Classify samples using genefu
#annot<- fData(Train)
#colnames(annot)[3]="Gene.Symbol"
#PAM50_train<- molecular.subtyping(sbt.model = "pam50",
#         data = t(train_expression), annot = annot,do.mapping = TRUE)


## ----pam50_surv_UPP, eval=TRUE------------------------------------------------
surv_formula <- 
    as.formula("Surv(train_clinic$t.rfs,train_clinic$e.rfs)~ PAM50_train")
tumortotal1 <- surv_fit(surv_formula,data=train_clinic)
tumortotal1diff <- survdiff(surv_formula)
tumortotal1pval<- pchisq(tumortotal1diff$chisq, length(tumortotal1diff$n) - 1,
                         lower.tail = FALSE) 

p<-ggsurvplot(tumortotal1,
            data=train_clinic,
            risk.table=TRUE,
            pval=TRUE,
            palette="dark2",
            title="UPP breast cancer \n PAM50 subtypes survival",
            surv.scale="percent",
            conf.int=FALSE, 
            xlab="time (days)", 
            ylab="survival(%)", 
            xlim=c(0,3650),
            break.time.by = 365, 
            ggtheme = theme_minimal(), 
            risk.table.y.text.col = TRUE, 
            risk.table.y.text = FALSE,censor=FALSE)
print(p)

## ----pam50_surv_TRANSBIG, eval=TRUE-------------------------------------------
surv_formula <- 
    as.formula("Surv(test_clinic$t.rfs,test_clinic$e.rfs)~ PAM50_test")
tumortotal2 <- surv_fit(surv_formula,data=test_clinic)
tumortotal2diff <- survdiff(surv_formula)
tumortotal2pval<- pchisq(tumortotal2diff$chisq, length(tumortotal2diff$n) - 1,
                        lower.tail = FALSE) 

p<-ggsurvplot(tumortotal2,
            data=test_clinic,
            risk.table=TRUE,
            pval=TRUE,
            palette="dark2",
            title="TRANSBIG breast cancer \n PAM50 subtypes survival",
            surv.scale="percent",
            conf.int=FALSE,
            xlab="time (days)",
            ylab="survival(%)",
            xlim=c(0,3650),
            break.time.by = 365,
            ggtheme = theme_minimal(),
            risk.table.y.text.col = TRUE,
            risk.table.y.text = FALSE,
            censor=FALSE)
print(p)

## ----case_params, eval=TRUE---------------------------------------------------
population <- 15             
generations <-5             
nCV <- 5                      
distancetype <- "pearson"     
TournamentSize <- 2
period <- 3650

## ----galgo_train, results='hide'----------------------------------------------
output= GSgalgoR::galgo(generations = generations,
                    population = population,
                    prob_matrix = train_expression,
                    OS=train_surv,
                    nCV= nCV, 
                    distancetype=distancetype,
                    TournamentSize=TournamentSize,
                    period=period)
print(class(output))

## ----pareto_2,eval=TRUE, out.width='100%'-------------------------------------
plot_pareto(output)

## ----summary_results, eval=TRUE-----------------------------------------------

output_df<- to_dataframe(output)
NonDom_solutions<- output_df[output_df$Rank==1,]

# N of non-dominated solutions 
nrow(NonDom_solutions)

# N of partitions found
table(NonDom_solutions$k)

#Average N of genes per signature
mean(unlist(lapply(NonDom_solutions$Genes,length)))

#SC range
range(NonDom_solutions$SC.Fit)

# Survival fitnesss range
range(NonDom_solutions$Surv.Fit)


## ----best_perform, eval=TRUE--------------------------------------------------

RESULT<- non_dominated_summary(output=output,
                            OS=train_surv, 
                            prob_matrix= train_expression,
                            distancetype =distancetype 
                            )

best_sol=NULL
for(i in unique(RESULT$k)){
    best_sol=c(
    best_sol,
    RESULT[RESULT$k==i,"solution"][which.max(RESULT[RESULT$k==i,"C.Index"])])
}

print(best_sol)

## ----centroid_list------------------------------------------------------------
CentroidsList <- create_centroids(output, 
                                solution_names = best_sol,
                                trainset = train_expression)

## ----class--------------------------------------------------------------------

train_classes<- classify_multiple(prob_matrix=train_expression,
                                centroid_list= CentroidsList, 
                                distancetype = distancetype)

test_classes<- classify_multiple(prob_matrix=test_expression,
                                centroid_list= CentroidsList, 
                                distancetype = distancetype)


## ----pred_model---------------------------------------------------------------

Prediction.models<- list()

for(i in best_sol){

    OS<- train_surv
    predicted_class<- as.factor(train_classes[,i])
    predicted_classdf <- as.data.frame(predicted_class)
    colnames(predicted_classdf)<- i
    surv_formula <- as.formula(paste0("OS~ ",i))
    coxsimple=coxph(surv_formula,data=predicted_classdf)
    Prediction.models[[i]]<- coxsimple
}


## ----cindex-------------------------------------------------------------------

C.indexes<- data.frame(train_CI=rep(NA,length(best_sol)),
                    test_CI=rep(NA,length(best_sol)))
rownames(C.indexes)<- best_sol

for(i in best_sol){
    predicted_class_train<- as.factor(train_classes[,i])
    predicted_class_train_df <- as.data.frame(predicted_class_train)
    colnames(predicted_class_train_df)<- i
    CI_train<- 
        concordance.index(predict(Prediction.models[[i]],
                                predicted_class_train_df),
                                surv.time=train_surv[,1],
                                surv.event=train_surv[,2],
                                outx=FALSE)$c.index
    C.indexes[i,"train_CI"]<- CI_train
    predicted_class_test<- as.factor(test_classes[,i])
    predicted_class_test_df <- as.data.frame(predicted_class_test)
    colnames(predicted_class_test_df)<- i
    CI_test<- 
        concordance.index(predict(Prediction.models[[i]],
                                predicted_class_test_df),
                                surv.time=test_surv[,1],
                                surv.event=test_surv[,2],
                                outx=FALSE)$c.index
    C.indexes[i,"test_CI"]<- CI_test
    }

print(C.indexes)

best_signature<- best_sol[which.max(C.indexes$test_CI)]

print(best_signature)

## ----galgo_train_surv, eval=TRUE, out.width='100%'----------------------------

train_class <- train_classes[,best_signature]

surv_formula <- 
    as.formula("Surv(train_clinic$t.rfs,train_clinic$e.rfs)~ train_class")
tumortotal1 <- surv_fit(surv_formula,data=train_clinic)
tumortotal1diff <- survdiff(surv_formula)
tumortotal1pval<- pchisq(tumortotal1diff$chisq,
                        length(tumortotal1diff$n) - 1,
                        lower.tail = FALSE) 

p<-ggsurvplot(tumortotal1,
            data=train_clinic,
            risk.table=TRUE,pval=TRUE,palette="dark2",
            title="UPP breast cancer \n Galgo subtypes survival",
            surv.scale="percent",
            conf.int=FALSE, xlab="time (days)", 
            ylab="survival(%)", xlim=c(0,3650),
            break.time.by = 365,
            ggtheme = theme_minimal(), 
            risk.table.y.text.col = TRUE, 
            risk.table.y.text = FALSE,censor=FALSE)
print(p)


## ----galgo_test_surv, eval=TRUE, out.width='100%'-----------------------------

test_class <- test_classes[,best_signature]

surv_formula <- 
    as.formula("Surv(test_clinic$t.rfs,test_clinic$e.rfs)~ test_class")
tumortotal1 <- surv_fit(surv_formula,data=test_clinic)
tumortotal1diff <- survdiff(surv_formula)
tumortotal1pval<- pchisq(tumortotal1diff$chisq,
                        length(tumortotal1diff$n) - 1,
                        lower.tail = FALSE) 

p<-ggsurvplot(tumortotal1,
            data=test_clinic,
            risk.table=TRUE,
            pval=TRUE,palette="dark2",
            title="TRANSBIG breast cancer \n Galgo subtypes survival",
            surv.scale="percent",
            conf.int=FALSE, 
            xlab="time (days)",
            ylab="survival(%)",
            xlim=c(0,3650),
            break.time.by = 365, 
            ggtheme = theme_minimal(), 
            risk.table.y.text.col = TRUE,
            risk.table.y.text = FALSE,
            censor=FALSE)
print(p)

## ----test_pam50, eval=TRUE, out.width='100%'----------------------------------

surv_formula1 <- 
    as.formula("Surv(test_clinic$t.rfs,test_clinic$e.rfs)~ test_class")
tumortotal1 <- surv_fit(surv_formula1,data=test_clinic)
tumortotal1diff <- survdiff(surv_formula1)
tumortotal1pval<- pchisq(tumortotal1diff$chisq,
                        length(tumortotal1diff$n) - 1,
                        lower.tail = FALSE) 

surv_formula2 <- 
    as.formula("Surv(test_clinic$t.rfs,test_clinic$e.rfs)~ PAM50_test")
tumortotal2 <- surv_fit(surv_formula2,data=test_clinic)
tumortotal2diff <- survdiff(surv_formula2)
tumortotal2pval<- pchisq(tumortotal1diff$chisq,
                        length(tumortotal2diff$n) - 1,
                        lower.tail = FALSE) 

SURV=list(GALGO=tumortotal1,PAM50=tumortotal2 )
COLS=c(1:8,10)
par(cex=1.35, mar=c(3.8, 3.8, 2.5, 2.5) + 0.1)
p=ggsurvplot(SURV,
            combine=TRUE,
            data=test_clinic,
            risk.table=TRUE,
            pval=TRUE,
            palette="dark2",
            title="Galgo vs. PAM50 subtypes \n BRCA survival comparison",
            surv.scale="percent",
            conf.int=FALSE,
            xlab="time (days)",
            ylab="survival(%)",
            xlim=c(0,period),
            break.time.by = 365, 
            ggtheme = theme_minimal(),
            risk.table.y.text.col = TRUE,
            risk.table.y.text = FALSE,
            censor=FALSE)
print(p)


## ----sess_info, eval=TRUE-----------------------------------------------------
sessionInfo()