Skip to content.

bioconductor.org

Bioconductor is an open source and open development software project
for the analysis and comprehension of genomic data.

Sections

lab5.R

##load up required libraries

library(cluster)
library(Biobase)
library(annotate)
library(genefilter)
library(golubEsets)

############################
## data processing
###########################

data(golubTrain)
golubTrain
data(golubTest)
golubTest

LS<-exprs(golubTrain)
cl<-golubTrain$ALL.AML
TS<-exprs(golubTest)
clts<-golubTest$ALL.AML

##mmfilt is in the library

LS[LS<100]<-100
TS[TS<100]<-100
LS[LS>16000]<-16000
TS[TS>16000]<-16000

mmfilt <- function(r=5, d=500, na.rm=TRUE) {
function(x) {
minval <- min(x, na.rm=na.rm)
maxval <- max(x, na.rm=na.rm)
(maxval/minval > 5) && (maxval-minval > 500)
}
}

mmfun <- mmfilt()

ffun <- filterfun(mmfun)

good <- genefilter(cbind(LS, TS), ffun )

sum(good) ##I get 3571

LSsub <- log10(LS[good,])
TSsub <- log10(TS[good,])
LTsub <- cbind(LSsub, TSsub)


##############################
## Annotate : optional
##############################

##probably want to do a ttest filter here
##but we'll leave that to you

ll<- read.annotation("hgu68ll") #locus link
gname <- row.names(LSsub)[50]
llname<-get(gname, env=ll)
locuslink(llname) ##open your browser to the right thing

chroms <- read.annotation("hgu68Chrom")

whichC <- multiget(row.names(LSsub[1:100,]), env=chroms, iffail=NA)
cc <-as.numeric(unlist(whichC))
hist(cc)


###################################
## Clustering
#################################

gf <- gapFilter(550, 800, .1)
ff <- filterfun(gf)
xx <- 10^LSsub ##to get back to original units
good <- genefilter(xx, gf)
sum(good) # should be 1093

LS2 <- LSsub[good,]

library(mva)
clust.res<-hclust(as.dist(1-cor(LS2)), method = "complete")
plot.hclust(clust.res, labels=cl,hang = 0.1)

clust.res<-hclust(dist(t(LS2)), method = "average")
plot.hclust(clust.res, labels=cl,hang = 0.1)


pamLS <- pam(as.dist(1-cor(LS2)), 3, diss=TRUE)
plot(pamLS)



#######################################
## classification
##################################

library(class)

ALL <- c(as.character(golubTrain$ALL),
as.character(golubTest$ALL))
BT <- c(as.character(golubTrain$T.B),
as.character(golubTest$T.B))
BTs <- ifelse(BT=="B-cell", "B", "T")
BTs <- ifelse(BT=="T-cell", "T", BTs)
BTs <- ifelse(BT=="NA", "", BTs)
newALL <- factor(paste(ALL, BTs, sep=""))
nALL.train <- newALL[1:38]
nALL.test <-newALL[39:72]

##now do an Anova filter
af <- Anova(nALL.train, p=0.0000001)
ffA <- filterfun(af)
wh.ALL <- genefilter(LSsub, af) #use only training data
sum(wh.ALL) #how many


##how does it work on the test set
pred.test <- knn(t(LSsub[wh.ALL,]), t(TSsub[wh.ALL,]), nALL.train, k=5)
table(pred.test, nALL.test)

##how does it work on the training set
pred.tr <- knn(t(LSsub[wh.ALL,]), t(LSsub[wh.ALL,]), nALL.train, k=5)
table(pred.tr, nALL.train)

##
##use knn and cross-validation to choose the best k!


knn.cvk<-function(LS, cl, k=1:5, l=0, prob=FALSE, use.all=TRUE)
{
classes<-unique(cl)
cv.err<-rep(0,length(k))
cl.pred<-matrix(NA, nrow(LS), length(k))
for(j in (1:length(k)))
{
cl.pred[,j]<-knn.cv(LS, cl, k[j], l, prob, use.all)
cv.err[j]<-sum(cl!=cl.pred[,j])
}
k0<-k[which.min(cv.err)]
return(k=k0,pred=classes[cl.pred[,which.min(cv.err)]])
}

whichone <- knn.cvk(t(LSsub[wh.ALL,]), nALL.train)


##Tree based classifiers

library(rpart)

dorpart <- function(LS, TS, cl) {
LS <- as.data.frame(LS)
TS <- as.data.frame(TS)
dimnames(TS)[[2]] <- dimnames(LS)[[2]] <- paste("V",
1:ncol(LS), sep = "")
fit <- rpart(factor(cl) ~ ., data = LS)
predict.rpart(fit, TS, type = "class")
}

all.rp <- dorpart(t(LSsub[wh.ALL,]), t(TSsub[wh.ALL,]), nALL.train)
table(all.rp, nALL.test)


##Discriminant Analysis

library(MASS)

dolda <- function(LS, TS, cl, prior, pmethod="plug-in") {
ndata <- data.frame(LS)
ndata$cl <- cl
if( missing(prior) )
mn <- lda(cl~., data=ndata)
else
mn <- lda(cl~., data=ndata, prior=prior)
pdata<-data.frame(TS)
preds <- predict(mn, pdata, method=pmethod)
rval <- list(lda = mn, preds=preds)
class(rval) <- "mylda"
rval
}

lda.test <- dolda(t(LSsub[wh.ALL,]), t(TSsub[wh.ALL,]), nALL.train)
table(lda.test$preds$class, nALL.test)

##how does it work on the training set
##not very meaningful
lda.pred.tr <- dolda(t(LSsub[wh.ALL,]), t(LSsub[wh.ALL,]), nALL.train)
table(lda.pred.tr$preds$class, nALL.train)
News
2009-10-26

BioC 2.5, consisting of 352 packages and designed to work with R 2.10.z, was released today.

2009-01-07

R, the open source platform used by Bioconductor, featured in a series of articles in the New York Times.