##
## R commands for AMD analysis, May 2009 update
##

# Load the chromosome 1 data

#setwd( "P:/teaching/578b/Rgenet/AMD" )
setwd("C:/Documents and Settings/kenrice/Desktop/BiocWorkshop")
load("AMDchrom1.R")  # expected to put the amd.chrom object in the workspace

# amd.chrom is 9798 x 150:
#   - one row per SNP (9798 of them)
#   - columns 1:4 hold SNP, rsID, chromosome, pos
#   - columns 5:100 are the cases, columns 101:150 the controls

# The final 596 rows are entirely missing, so keep only the rows before them
amd1 <- amd.chrom[seq_len(9798 - 596), ]
dim(amd1)

# Quick look at every column ...
summary(amd1)

# ... and a more readable look at just the first ten
summary(amd1[, 1:10])

# Do zeros need recoding as NA? Tabulate how many zero codes appear
# per SNP (rows) and then per person (columns)
table(rowSums(amd1[, 5:150] == 0))
table(colSums(amd1[, 5:150] == 0))

# Split the data in two:
#   amd.inf  - data.frame of the four identifying columns
#   amd.snps - matrix of allele counts; zero codes become NA and the
#              remaining codes are shifted down by one
amd.inf <- amd1[, 1:4]
amd.snps <- as.matrix(amd1[, 5:150])
amd.snps[amd.snps == 0] <- NA
amd.snps <- amd.snps - 1
rm(amd1)

# Allele frequency per SNP: mean of (allele count / 2), ignoring missing genotypes
amd.inf$maf <- apply(amd.snps / 2, 1, mean, na.rm = TRUE)
dim(amd.inf) #9202 SNPs

# Recode so every SNP counts copies of its *minor* allele: wherever the allele
# currently being counted has frequency above 0.5, swap to the other allele
# (count -> 2 - count). Done for all affected rows at once rather than in a
# loop over a hard-coded number of SNPs. which() drops any SNP whose frequency
# is NA/NaN, which would otherwise make an if() test fail.
flip <- which(amd.inf$maf > 0.5)
amd.snps[flip, ] <- 2 - amd.snps[flip, , drop = FALSE]
rm(flip)

# everything now coded as number of minor alleles

# the histogram of MAFs
# (frequencies are recomputed first, because the flipped SNPs have changed)
amd.inf$maf <- apply(amd.snps, 1, function(x) mean(x / 2, na.rm = TRUE))
pdf("amdhist1.pdf", width = 6, height = 4)
hist(amd.inf$maf, xlab = "minor allele frequency", main = "AMD Chr 1, all SNPs", nclass = 50)
dev.off()

# get rid of the SNPs with 'low' MAFs: keep only those with MAF above 0.05
table(amd.inf$maf > 0.05)

# which() turns the test into row numbers, so a SNP with an NA/NaN MAF is
# dropped instead of being carried along as an all-NA row
keep.maf <- which(amd.inf$maf > 0.05)
amd.big.inf  <- amd.inf[keep.maf, ]
amd.big.snps <- amd.snps[keep.maf, , drop = FALSE]
rm(keep.maf)

dim(amd.big.snps) # 7154 rows

# Empty matrix to hold r^2 for every pair of retained SNPs. It is sized from
# the data and filled with numeric NA, so the first numeric assignment does
# not have to convert (and copy) the whole matrix.
r2.matrix <- matrix(NA_real_, nrow(amd.big.snps), nrow(amd.big.snps))

## slow but straightforward way; minimal NA hassles
##
## Fills the upper triangle of r2.matrix (i < j) with the squared correlation
## between SNP i and SNP j, using only the people with both genotypes observed.

# number of SNPs that survived the MAF filter (7154 for the workshop data)
n.big <- nrow(amd.big.snps)

# Timing run on the first 1000 SNPs only (fewer if the data set is smaller)
n.timed <- min(1000, n.big)
system.time( {
for (i in seq_len(n.timed - 1)) {
  for (j in (i + 1):n.timed) {
    r2.matrix[i, j] <- cor(amd.big.snps[i, ], amd.big.snps[j, ], use = "na.or.complete")^2
  }
}
})

# The full job; date() before and after shows the wall-clock time taken.
# (This recomputes the entries already filled in by the timing run.)
date()
for (i in seq_len(n.big - 1)) {
  for (j in (i + 1):n.big) {
    r2.matrix[i, j] <- cor(amd.big.snps[i, ], amd.big.snps[j, ], use = "na.or.complete")^2
  }
}
date()

# keep the result, so the slow loop never has to be re-run
save.image("amdRsquare.Rdata")

# draw some pictures (clunky ones!)

# Scatterplot of r^2 against the distance between the two SNPs, written to a
# PNG file.
#   file - name of the PNG file to create
#   r2   - square matrix of r^2 values; only entries with row < column are used
#   pos  - positions of the SNPs, in the same order as the rows/columns of r2
#   xlim - x-axis limits
# The file name is returned invisibly. The graphics device is closed even if
# the plotting fails part-way.
draw.r2.vs.distance <- function(file, r2, pos, xlim) {
  stopifnot(nrow(r2) == length(pos), ncol(r2) == length(pos))
  png(file, width = 6*600, height = 4*600, pointsize = 12*600/72)
  on.exit(dev.off(), add = TRUE)

  plot(0, xlab = expression(Delta(plain(position))), ylab = expression(r^2), type = "n",
       xlim = xlim,
       ylim = c(0, 1))

  n.pos <- length(pos)
  for (i in seq_len(max(n.pos - 1, 0))) {
    later <- (i + 1):n.pos
    points(x = pos[later] - pos[i], y = r2[i, later])
  }
  invisible(file)
}

# r2.matrix was computed from amd.big.snps, so row i belongs to the i-th SNP
# *after* the MAF filter. Its position is therefore amd.big.inf$pos[i], not
# amd.inf$pos[i], which is the i-th SNP before filtering.

# all pairs, whole chromosome
draw.r2.vs.distance("try.r2.png", r2.matrix, amd.big.inf$pos,
                    xlim = c(0, 1) * max(amd.big.inf$pos, na.rm = TRUE))

# same picture, zoomed in on pairs closer than 3E6
draw.r2.vs.distance("r2zoom.png", r2.matrix, amd.big.inf$pos,
                    xlim = c(0, 3E6))

rm(r2.matrix)

### A more elegant way
###
### Above, we used for() to do the loops, in R. This means the R interpreter runs the loop body
### once for every (i,j) pair of SNPs, making a separate call to cor() each time.
### R is not terrible at this, but it's not super-speedy either
###
### If you feed cor() a matrix, it defaults to giving you the whole correlation matrix
### i.e. the correlations of all pairs of columns
### It does this with all the looping steps *done directly in C*. 
### 
### Here's the code - it's one line;

system.time(
r2.matrix.quick <- cor(t(amd.big.snps), use = "pairwise.complete.obs")^2
)

# nb the t() bit is because you want correlation of rows of the original, not columns
# The whole job takes 2 mins (!) on my machine
# For e.g. GWAS analysis, note that the time the CPU spends on 'looping' is not
# much compared to the time spent on e.g. logistic regression
# So this particular trick would have a less dramatic effect (but it's still worth knowing)

dim(r2.matrix.quick)

# Plot only a random subset of SNPs, for speed (at most 100 of them)
n.big <- nrow(r2.matrix.quick)
select.rows <- sample.int(n.big, min(100, n.big), replace = FALSE)

# r2.matrix.quick has one row/column per MAF-filtered SNP, so positions must be
# taken from amd.big.inf (the filtered annotation), not from amd.inf
big.pos <- amd.big.inf$pos
stopifnot(length(big.pos) == n.big)

plot(0, xlab = expression(Delta(plain(position))), ylab = expression(r^2), type = "n",
     xlim = c(0, 1) * max(big.pos, na.rm = TRUE),
     ylim = c(0, 1))
for (i in select.rows) {
  # abs() because the selected SNPs may lie on either side of SNP i
  points(x = abs(big.pos[select.rows] - big.pos[i]), y = r2.matrix.quick[i, select.rows])
}

