## ----echo=FALSE--------------------------------------------------------------------------------------------- options(width=110) ## ----------------------------------------------------------------------------------------------------------- library(HIBAG) ## ----fig.width=10, fig.height=4, fig.align='center'--------------------------------------------------------- # load the published parameter estimates from European ancestry # e.g., filename <- "HumanOmniExpressExome-European-HLA4-hg19.RData" # here, we use example data in the package filename <- system.file("extdata", "ModelList.RData", package="HIBAG") model.list <- get(load(filename)) # HLA imputation at HLA-A hla.id <- "A" model <- hlaModelFromObj(model.list[[hla.id]]) summary(model) plot(model) # show the frequency of SNP marker in the model # SNPs in the model head(model$snp.id) head(model$snp.position) ## ----eval=FALSE--------------------------------------------------------------------------------------------- # ######################################################################### # # Import your PLINK BED file # # # yourgeno <- hlaBED2Geno(bed.fn=".bed", fam.fn=".fam", bim.fn=".bim") # summary(yourgeno) # # # best-guess genotypes and all posterior probabilities # pred.guess <- hlaPredict(model, yourgeno, type="response+prob") # summary(pred.guess) # # pred.guess$value # pred.guess$postprob ## ----eval=FALSE--------------------------------------------------------------------------------------------- # library(HIBAG) # # # load HLA types and SNP genotypes in the package # data(HLA_Type_Table, package="HIBAG") # data(HapMap_CEU_Geno, package="HIBAG") # # # a list of HLA genotypes # # e.g., train.HLA <- hlaAllele(sample.id, H1=c("01:02", "05:01", ...), # # H2=c("02:01", "03:01", ...), locus="A") # # the HLA type of the first individual is 01:02/02:01, # # the second is 05:01/03:01, ... # hla.id <- "A" # train.HLA <- hlaAllele(HLA_Type_Table$sample.id, # H1 = HLA_Type_Table[, paste(hla.id, ".1", sep="")], # H2 = HLA_Type_Table[, paste(hla.id, ".2", sep="")], # locus=hla.id, assembly="hg19") # # # training genotypes # # import your PLINK BED file, # # e.g., train.geno <- hlaBED2Geno(bed.fn=".bed", fam.fn=".fam", bim.fn=".bim") # # and select the SNPs in the flanking region of 500kb on each side # region <- 500 # kb # train.geno <- hlaGenoSubsetFlank(HapMap_CEU_Geno, hla.id, region*1000, assembly="hg19") # summary(train.geno) # # # train a HIBAG model # set.seed(100) # model <- hlaAttrBagging(train.HLA, train.geno, nclassifier=100) ## ----echo=FALSE--------------------------------------------------------------------------------------------- mobj <- get(load(system.file("extdata", "ModelList.RData", package="HIBAG"))) model <- hlaModelFromObj(mobj$A) ## ----------------------------------------------------------------------------------------------------------- summary(model) ## ----eval=FALSE--------------------------------------------------------------------------------------------- # library(HIBAG) # library(parallel) # # # load HLA types and SNP genotypes in the package # data(HLA_Type_Table, package="HIBAG") # data(HapMap_CEU_Geno, package="HIBAG") # # # a list of HLA genotypes # # e.g., train.HLA <- hlaAllele(sample.id, H1=c("01:02", "05:01", ...), # # H2=c("02:01", "03:01", ...), locus="A") # # the HLA type of the first individual is 01:02/02:01, # # the second is 05:01/03:01, ... # hla.id <- "A" # train.HLA <- hlaAllele(HLA_Type_Table$sample.id, # H1 = HLA_Type_Table[, paste(hla.id, ".1", sep="")], # H2 = HLA_Type_Table[, paste(hla.id, ".2", sep="")], # locus=hla.id, assembly="hg19") # # # training genotypes # # import your PLINK BED file, # # e.g., train.geno <- hlaBED2Geno(bed.fn=".bed", fam.fn=".fam", bim.fn=".bim") # # and select the SNPs in the flanking region of 500kb on each side # region <- 500 # kb # train.geno <- hlaGenoSubsetFlank(HapMap_CEU_Geno, hla.id, region*1000, assembly="hg19") # summary(train.geno) # # # # Multithreading # cl <- 2 # 2 -- # of threads # # # Building ... # set.seed(1000) # hlaParallelAttrBagging(cl, train.HLA, train.geno, nclassifier=100, # auto.save="AutoSaveModel.RData") # model.obj <- get(load("AutoSaveModel.RData")) # model <- hlaModelFromObj(model.obj) # summary(model) ## ----------------------------------------------------------------------------------------------------------- library(HIBAG) # load HLA types and SNP genotypes in the package data(HLA_Type_Table, package="HIBAG") data(HapMap_CEU_Geno, package="HIBAG") # make a list of HLA types hla.id <- "A" hla <- hlaAllele(HLA_Type_Table$sample.id, H1 = HLA_Type_Table[, paste(hla.id, ".1", sep="")], H2 = HLA_Type_Table[, paste(hla.id, ".2", sep="")], locus=hla.id, assembly="hg19") # divide HLA types randomly set.seed(100) hlatab <- hlaSplitAllele(hla, train.prop=0.5) names(hlatab) summary(hlatab$training) summary(hlatab$validation) # SNP predictors within the flanking region on each side region <- 500 # kb snpid <- hlaFlankingSNP(HapMap_CEU_Geno$snp.id, HapMap_CEU_Geno$snp.position, hla.id, region*1000, assembly="hg19") length(snpid) # training and validation genotypes train.geno <- hlaGenoSubset(HapMap_CEU_Geno, snp.sel = match(snpid, HapMap_CEU_Geno$snp.id), samp.sel = match(hlatab$training$value$sample.id, HapMap_CEU_Geno$sample.id)) summary(train.geno) test.geno <- hlaGenoSubset(HapMap_CEU_Geno, samp.sel=match( hlatab$validation$value$sample.id, HapMap_CEU_Geno$sample.id)) ## ----eval=FALSE--------------------------------------------------------------------------------------------- # # train a HIBAG model # set.seed(100) # model <- hlaAttrBagging(hlatab$training, train.geno, nclassifier=100) ## ----echo=FALSE--------------------------------------------------------------------------------------------- mobj <- get(load(system.file("extdata", "OutOfBag.RData", package="HIBAG"))) model <- hlaModelFromObj(mobj) ## ----------------------------------------------------------------------------------------------------------- summary(model) # validation pred <- hlaPredict(model, test.geno) summary(pred) # compare comp <- hlaCompareAllele(hlatab$validation, pred, allele.limit=model, call.threshold=0) comp$overall ## ----fig.align="center", fig.height=4, fig.width=5---------------------------------------------------------- # hierarchical cluster analysis d <- hlaDistance(model) p <- hclust(as.dist(d)) plot(p, xlab="HLA alleles") ## ----fig.align="center", fig.height=3.3, fig.width=5-------------------------------------------------------- # violin plot hlaReportPlot(pred, model=model, fig="matching") ## ----fig.align="center", fig.height=3.3, fig.width=5-------------------------------------------------------- hlaReportPlot(pred, hlatab$validation, fig="call.rate") hlaReportPlot(pred, hlatab$validation, fig="call.threshold") ## ----------------------------------------------------------------------------------------------------------- # report overall accuracy, per-allele sensitivity, specificity, etc hlaReport(comp, type="txt") ## ----results='asis'----------------------------------------------------------------------------------------- # report overall accuracy, per-allele sensitivity, specificity, etc hlaReport(comp, type="markdown") ## ----------------------------------------------------------------------------------------------------------- # report overall accuracy, per-allele sensitivity, specificity, etc hlaReport(comp, type="tex", header=FALSE) ## ----eval=FALSE--------------------------------------------------------------------------------------------- # library(HIBAG) # # # make a list of HLA types # hla.id <- "DQA1" # hla <- hlaAllele(HLA_Type_Table$sample.id, # H1 = HLA_Type_Table[, paste(hla.id, ".1", sep="")], # H2 = HLA_Type_Table[, paste(hla.id, ".2", sep="")], # locus = hla.id, assembly = "hg19") # # # training genotypes # region <- 500 # kb # snpid <- hlaFlankingSNP(HapMap_CEU_Geno$snp.id, HapMap_CEU_Geno$snp.position, # hla.id, region*1000, assembly="hg19") # train.geno <- hlaGenoSubset(HapMap_CEU_Geno, # snp.sel = match(snpid, HapMap_CEU_Geno$snp.id), # samp.sel = match(hla$value$sample.id, HapMap_CEU_Geno$sample.id)) # # set.seed(1000) # model <- hlaAttrBagging(hla, train.geno, nclassifier=100) # summary(model) # # # remove unused SNPs and sample IDs from the model # mobj <- hlaPublish(model, # platform = "Illumina 1M Duo", # information = "Training set -- HapMap Phase II", # warning = NULL, # rm.unused.snp=TRUE, anonymize=TRUE) # # save(mobj, file="Your_HIBAG_Model.RData") ## ----eval=FALSE--------------------------------------------------------------------------------------------- # # assume the HIBAG models are stored in R objects: mobj.A, mobj.B, ... # # ModelList <- list() # ModelList[["A"]] <- mobj.A # ModelList[["B"]] <- mobj.B # ... # # # save to an R data file # save(ModelList, file="HIBAG_Model_List.RData") ## ----------------------------------------------------------------------------------------------------------- sessionInfo()