## ----setup, include=FALSE----------------------------------------------------- knitr::opts_chunk$set(echo = TRUE, tidy = FALSE, warning = FALSE, message = FALSE) ## ----readWhippetWholeData, eval = FALSE--------------------------------------- # # Load packages # library(GeneStructureTools) # library(GenomicRanges) # library(stringr) # library(BSgenome.Mmusculus.UCSC.mm10) # library(Gviz) # library(rtracklayer) # # # # list of files in the whippet directory # whippet_file_directory <- "~/Downloads/GeneStructureTools_VignetteFiles/" # # # read in files as a whippetDataSet # wds <- readWhippetDataSet(whippet_file_directory) # # # create a sample table with sample id, condition and replicate # whippet_sampleTable <- data.frame(sample=c("A01","B01","A21","B21"), # condition=c("01","01","21","21"), # replicate=(c("A","B","A","B"))) # # # read in gtf annotation # gtf <- rtracklayer::import("~/Downloads/gencode.vM14.annotation.gtf.gz") # exons <- gtf[gtf$type=="exon"] # transcripts <- gtf[gtf$type=="transcript"] # # # add first/last annotation (speeds up later steps) # if(!("first_last" %in% colnames(mcols(exons)))){ # t <- as.data.frame(table(exons$transcript_id)) # exons$first_last <- NA # exons$first_last[exons$exon_number == 1] <- "first" # exons$first_last[exons$exon_number == # t$Freq[match(exons$transcript_id, t$Var1)]] <- "last" # } # # # specify the BSGenome annotation # g <- BSgenome.Mmusculus.UCSC.mm10::BSgenome.Mmusculus.UCSC.mm10 # ## ----readWhippet-------------------------------------------------------------- # Load packages suppressPackageStartupMessages({ library(GeneStructureTools) library(GenomicRanges) library(stringr) library(BSgenome.Mmusculus.UCSC.mm10) library(Gviz) library(rtracklayer) }) # list of files in the whippet directory whippet_file_directory <- system.file("extdata","whippet/", package = "GeneStructureTools") # read in files as a whippetDataSet wds <- readWhippetDataSet(whippet_file_directory) # create a sample table with sample id, condition and replicate whippet_sampleTable <- data.frame(sample=c("A01","B01","A21","B21"), condition=c("01","01","21","21"), replicate=(c("A","B","A","B"))) # read in gtf annotation gtf <- rtracklayer::import(system.file("extdata","example_gtf.gtf", package = "GeneStructureTools")) exons <- gtf[gtf$type=="exon"] transcripts <- gtf[gtf$type=="transcript"] # add first/last annotation (speeds up later steps) if(!("first_last" %in% colnames(mcols(exons)))){ t <- as.data.frame(table(exons$transcript_id)) exons$first_last <- NA exons$first_last[exons$exon_number == 1] <- "first" exons$first_last[exons$exon_number == t$Freq[match(exons$transcript_id, t$Var1)]] <- "last" } # specify the BSGenome annotation g <- BSgenome.Mmusculus.UCSC.mm10::BSgenome.Mmusculus.UCSC.mm10 ## ----findScript, eval=FALSE--------------------------------------------------- # #find location of the script # system.file("extdata","leafviz2table.R", package = "GeneStructureTools") ## ----readLeafcutter----------------------------------------------------------- # read in gtf annotation gtf <- rtracklayer::import(system.file("extdata","example_gtf.gtf", package = "GeneStructureTools")) exons <- gtf[gtf$type=="exon"] # specify the BSGenome annotation g <- BSgenome.Mmusculus.UCSC.mm10::BSgenome.Mmusculus.UCSC.mm10 # list of files in the leafcutter directory leafcutter_files <- list.files(system.file("extdata","leafcutter/", package = "GeneStructureTools"),full.names = TRUE) intron_results <- read.delim(leafcutter_files[grep("per_intron_results.tab", leafcutter_files)], stringsAsFactors = FALSE) ## ----filterWhippetEvents------------------------------------------------------ # filter events for significance wds <- filterWhippetEvents( whippetDataSet = wds, probability = 0.95, # min probability psiDelta = 0.1, # min change in PSI eventTypes = "all", # all event types minCounts = 100, # mean of at least 100 counts in one condition sampleTable = whippet_sampleTable) # check for changes in gene/transcript structure whippet_summary <- whippetTranscriptChangeSummary(wds, exons = exons, transcripts = transcripts, BSgenome = g, NMD = FALSE # ignore nonsense mediated decay ) head(whippet_summary) ## ----leafcutterSummary-------------------------------------------------------- leafcutter_summary <- leafcutterTranscriptChangeSummary(intron_results, exons = exons, BSgenome = g, NMD = FALSE, showProgressBar = FALSE) head(leafcutter_summary[!duplicated(leafcutter_summary$cluster),]) ## ----whippetSkippedFilter----------------------------------------------------- # filter out skipped exon events (coded as "CE") # we will be looking at Ndufv3 (ENSMUSG00000024038.16) wds.ce <- filterWhippetEvents(wds, psiDelta=0,probability=0, event="CE", idList="ENSMUSG00000024038.16") diffSplicingResults(wds.ce) # psi_a = 0.137, psi_b = 0.275 # percentage of transcripts skipping exon 3 decreases from timepoint 1 to 21 # whippet outputs the skipped exon coordinates coordinates(wds.ce) ## ----findOverlapsSkipped------------------------------------------------------ # find exons in the gtf that overlap the skipped exon event exons.ce <- findExonContainingTranscripts(wds.ce, exons = exons, transcripts = transcripts) # make skipped and included exon transcripts # removes the skipped exon from all transcripts which contain it skippedExonTranscripts <- skipExonInTranscript(skippedExons = exons.ce, exons=exons, match="skip", whippetDataSet = wds.ce) ## ----skippedGviz, eval=TRUE--------------------------------------------------- ## make Gvis models # set up for visualisation gtr <- GenomeAxisTrack() # all transcripts for the gene geneModel.all <- GeneRegionTrack(makeGeneModel(exons[exons$gene_id == skippedExonTranscripts$gene_id[1]]), name="Reference Gene", showId=TRUE, transcriptAnnotation = "transcript") # reference transcript geneModelNormal <- GeneRegionTrack(makeGeneModel( skippedExonTranscripts[skippedExonTranscripts$set=="included_exon"]), name="Reference Isoform", showId=TRUE, fill="#4D7ABE", transcriptAnnotation = "transcript") # for the skipped exon transcript geneModelSkipped <- GeneRegionTrack(makeGeneModel( skippedExonTranscripts[skippedExonTranscripts$set=="skipped_exon"]), name="Alternative Isoform", showId=TRUE, fill="#94AFD8", transcriptAnnotation = "transcript") plotTracks(list(gtr,geneModel.all,geneModelNormal, geneModelSkipped), extend.left = 1000, extend.right = 1000) # Only the transcript isoform containing the skipped exon (exon 3) # is used for analysis, and a 'novel' isoform is created by exon skipping ## ----retainedIntronWhippet---------------------------------------------------- # filter out retained events (coded as "RI") # we will be looking at Srsf1 (ENSMUSG00000018379.17) wds.ri <- filterWhippetEvents(wds, psiDelta=0,probability=0, event="RI", idList="ENSMUSG00000018379.17") diffSplicingResults(wds.ri) ## ----findIntron--------------------------------------------------------------- # find exons pairs in the gtf that bound the retained intron event exons.ri <- findIntronContainingTranscripts(wds.ri, exons) # make retained and non-retained transcripts # adds the intron into all transcripts which overlap it retainedIntronTranscripts <- addIntronInTranscript(exons.ri, exons = exons, whippetDataSet = wds.ri, glueExons = TRUE) ## ----gvizIntronRetention, eval=TRUE------------------------------------------- ## make Gviz models # all transcripts for the gene geneModel.all <- GeneRegionTrack(makeGeneModel(exons[exons$gene_id == retainedIntronTranscripts$gene_id[1]]), name="Reference Gene", showId=TRUE, transcriptAnnotation = "transcript") # reference transcript geneModelNormal <- GeneRegionTrack(makeGeneModel( retainedIntronTranscripts[retainedIntronTranscripts$set=="spliced_intron"]), name="Reference Isoform", showId=TRUE, fill="#4D7ABE", transcriptAnnotation = "transcript") # for the retained intron transcript geneModelRetained <- GeneRegionTrack(makeGeneModel( retainedIntronTranscripts[retainedIntronTranscripts$set=="retained_intron"]), name="Alternative Isoform", showId=TRUE, fill="#94AFD8", transcriptAnnotation = "transcript") # Only the transcript isoforms with exons at the boundries of the retained intron are used for analysis, and 'novel' isoforms are created by intron retention plotTracks(list(gtr,geneModel.all,geneModelNormal, geneModelRetained), extend.left = 1000, extend.right = 1000) ## ----altAccept---------------------------------------------------------------- # filter out alternative acceptor events (coded as "AA") wds.aa <- filterWhippetEvents(wds, psiDelta=0,probability=0, event="AA", idList="ENSMUSG00000034064.14") diffSplicingResults(wds.aa) # AA/AD coordinates range from the normal acceptor splice site to the alternative acceptor splice site coordinates(wds.aa) ## ----findJunctionsAA---------------------------------------------------------- # find exons pairs in the gtf that bound the retained intron event junctionPairs.aa <- findJunctionPairs(wds.aa, type="AA") junctionPairs.aa ## ----replaceJunctionAA-------------------------------------------------------- # make transcripts with alternative junction usage altTranscripts <- replaceJunction(wds.aa, junctionPairs.aa, exons, type="AA") # make transcripts using junction X xTranscripts <- altTranscripts[altTranscripts$set=="AA_X"] # make transcripts using junction Y yTranscripts <- altTranscripts[altTranscripts$set=="AA_Y"] ## ----altAceGvis--------------------------------------------------------------- geneModel.all <- GeneRegionTrack(makeGeneModel(exons[exons$gene_id == altTranscripts$gene_id[1]]), name="Reference Gene", showId=TRUE, transcriptAnnotation = "transcript") # transcript X geneModelX <- GeneRegionTrack(makeGeneModel(xTranscripts), name="Isoform X", showId=TRUE, fill="#4D7ABE", transcriptAnnotation = "transcript") # transcript Y geneModelY<- GeneRegionTrack(makeGeneModel(yTranscripts), name="Isoform Y", showId=TRUE, fill="#94AFD8", transcriptAnnotation = "transcript") plotTracks(list(gtr,geneModel.all,geneModelX, geneModelY), extend.left = 1000, extend.right = 1000) # Zoomed in at the alternative acceptor site plotTracks(list(gtr,geneModel.all,geneModelX, geneModelY), from = 38547500, to = 38551000) ## ----altDonor----------------------------------------------------------------- # filter out alternative acceptor events (coded as "AD") # we will be looking at Mdbd3 (ENSMUSG00000035478.14) wds.ad <- filterWhippetEvents(wds, psiDelta=0,probability=0, event="AD", idList="ENSMUSG00000035478.14") diffSplicingResults(wds.ad) # AD coordinates range from the normal donor splice site to the alternative donor splice site coordinates(wds.ad) ## ----createIsoformsAD--------------------------------------------------------- # find exons pairs in the gtf that bound the retained intron event junctionPairs.ad <- findJunctionPairs(wds.ad, type="AD") # make transcripts with alternative junction usage altTranscripts <- replaceJunction(wds.ad, junctionPairs.ad, exons, type="AD") # make transcripts using junction X xTranscripts <- altTranscripts[altTranscripts$set=="AD_X"] # make transcripts using junction Y yTranscripts <- altTranscripts[altTranscripts$set=="AD_Y"] ## ----altDonGvis--------------------------------------------------------------- geneModel.all <- GeneRegionTrack(makeGeneModel(exons[exons$gene_id == altTranscripts$gene_id[1]]), name="Reference Gene", showId=TRUE, transcriptAnnotation = "transcript") # transcript X geneModelX <- GeneRegionTrack(makeGeneModel(xTranscripts), name="Isoform X", showId=TRUE, fill="#4D7ABE", transcriptAnnotation = "transcript") # transcript Y geneModelY<- GeneRegionTrack(makeGeneModel(yTranscripts), name="Isoform Y", showId=TRUE, fill="#94AFD8", transcriptAnnotation = "transcript") plotTracks(list(gtr,geneModel.all,geneModelX, geneModelY), extend.left = 1000, extend.right = 1000) ## ----altFirst----------------------------------------------------------------- # filter out alternative acceptor events (coded as "AF") # we will be looking at Csrp1 (ENSMUSG00000026421.14) wds.af <- filterWhippetEvents(wds, psiDelta=0,probability=0, event="AF", idList="ENSMUSG00000026421.14") diffSplicingResults(wds.af) # whippet outputs first (or last) exon being tested only # AF/AL coordinates range are exon coordinates for the tested first/last exon coordinates(wds.af) ## ----createIsoformsAF--------------------------------------------------------- # find junction pairs that use the same acceptor/donor as the specified first/last exon # i.e. find the alternative first/last exon junctionPairs.af <- findJunctionPairs(wds.af, type="AF") ## ----replaceJunctionAF-------------------------------------------------------- # make transcripts with alternative junction usage altTranscripts <- replaceJunction(wds.af, junctionPairs.af, exons, type="AF") # make transcripts using exon X xTranscripts <- altTranscripts[altTranscripts$set=="AF_X"] # make transcripts using exon Y yTranscripts <- altTranscripts[altTranscripts$set=="AF_Y"] ## ----altFirstGvis------------------------------------------------------------- geneModel.all <- GeneRegionTrack(makeGeneModel(exons[exons$gene_id == altTranscripts$gene_id[1]]), name="Reference Gene", showId=TRUE, transcriptAnnotation = "transcript") # reference transcript geneModelX <- GeneRegionTrack(makeGeneModel(xTranscripts), name="Isoform X", showId=TRUE, fill="#4D7ABE", transcriptAnnotation = "transcript") # for the retained intron transcript geneModelY<- GeneRegionTrack(makeGeneModel(yTranscripts), name="Isoform Y", showId=TRUE, fill="#94AFD8", transcriptAnnotation = "transcript") # Only the transcript isoforms with exons at the boundries of the retained intron are used for analysis, and 'novel' isoforms are created by intron retention plotTracks(list(gtr,geneModel.all,geneModelX, geneModelY), extend.left = 1000, extend.right = 1000) ## ----altLast------------------------------------------------------------------ # filter out alternative acceptor events (coded as "AL") # we will be looking at Ppm1b (ENSMUSG00000061130.12) wds.al <- filterWhippetEvents(wds, psiDelta=0,probability=0, event="AL", idList="ENSMUSG00000061130.12") diffSplicingResults(wds.al) # whippet outputs first (or last) exon being tested only # AF/AL coordinates range are exon coordinates for the tested first/last exon coordinates(wds.al) ## ----createIsoformsAL--------------------------------------------------------- # find junction pairs that use the same acceptor/donor as the specified first/last exon # i.e. find the alternative first/last exon junctionPairs.al <- findJunctionPairs(wds.al, type="AL") # make transcripts with alternative junction usage altTranscripts <- replaceJunction(wds.al, junctionPairs.al, exons, type="AL") # make transcripts using junction X xTranscripts <- altTranscripts[altTranscripts$set=="AL_X"] # make transcripts using junction Y yTranscripts <- altTranscripts[altTranscripts$set=="AL_Y"] ## ----altLastGvis-------------------------------------------------------------- geneModel.all <- GeneRegionTrack(makeGeneModel(exons[exons$gene_id == altTranscripts$gene_id[1]]), name="Reference Gene", showId=TRUE, transcriptAnnotation = "transcript") # reference transcript geneModelX <- GeneRegionTrack(makeGeneModel(xTranscripts), name="Isoform X", showId=TRUE, fill="#4D7ABE", transcriptAnnotation = "transcript") # for the retained intron transcript geneModelY<- GeneRegionTrack(makeGeneModel(yTranscripts), name="Isoform Y", showId=TRUE, fill="#94AFD8", transcriptAnnotation = "transcript") # Only the transcript isoforms with exons at the boundries of the retained intron are used for analysis, and 'novel' isoforms are created by intron retention plotTracks(list(gtr,geneModel.all,geneModelX, geneModelY), extend.left = 1000, extend.right = 1000) ## ----leafcutterEif4a2--------------------------------------------------------- # select a single cluster cluster <- leafcutter_summary[leafcutter_summary$cluster=="chr16:clu_1396",] # generate alternative isoforms altIsoforms1396 <- alternativeIntronUsage(cluster, exons) # downregulated isoforms altIsoforms1396_dnreg <- altIsoforms1396[grep("dnre", altIsoforms1396$transcript_id)] # upregulated isoforms altIsoforms1396_upreg <- altIsoforms1396[grep("upre", altIsoforms1396$transcript_id)] # visualise gtr <- GenomeAxisTrack() geneModel.ref <- GeneRegionTrack(makeGeneModel( exons[exons$gene_id=="ENSMUSG00000022884.14"]), name="Reference Gene", showId=TRUE, transcriptAnnotation = "transcript") geneModel.dnreg <- GeneRegionTrack(makeGeneModel(altIsoforms1396_dnreg), name="Downregulated isoforms", showId=TRUE,fill="#4D7ABE", transcriptAnnotation = "transcript") geneModel.upreg <- GeneRegionTrack(makeGeneModel(altIsoforms1396_upreg), name="Upregulated isoforms",fill="#94AFD8", showId=TRUE, transcriptAnnotation = "transcript") plotTracks(list(geneModel.ref,geneModel.dnreg, geneModel.upreg), extend.left = 1000, extend.right = 1000) ## ----leafcutterRnps1---------------------------------------------------------- # select a single cluster cluster <- leafcutter_summary[leafcutter_summary$cluster=="chr17:clu_1281",] # generate alternative isoforms altIsoforms1281 <- alternativeIntronUsage(cluster, exons) # downregulated isoforms altIsoforms1281_dnreg <- altIsoforms1281[grep("dnre", altIsoforms1281$transcript_id)] # upregulated isoforms altIsoforms1281_upreg <- altIsoforms1281[grep("upre", altIsoforms1281$transcript_id)] # visualise gtr <- GenomeAxisTrack() geneModel.ref <- GeneRegionTrack(makeGeneModel( exons[exons$gene_id=="ENSMUSG00000034681.16"]), name="Reference Gene", showId=TRUE, transcriptAnnotation = "transcript") geneModel.dnreg <- GeneRegionTrack(makeGeneModel(altIsoforms1281_dnreg), name="Downregulated isoforms", showId=TRUE,fill="#4D7ABE", transcriptAnnotation = "transcript") geneModel.upreg <- GeneRegionTrack(makeGeneModel(altIsoforms1281_upreg), name="Upregulated isoforms",fill="#94AFD8", showId=TRUE, transcriptAnnotation = "transcript") plotTracks(list(geneModel.ref,geneModel.dnreg, geneModel.upreg), extend.left = 1000, extend.right = 1000) ## ----SkippedExonORF----------------------------------------------------------- # we will be looking at Ndufv3 (ENSMUSG00000024038.16) again wds.ce <- filterWhippetEvents(wds, psiDelta=0,probability=0, event="CE", idList="ENSMUSG00000024038.16") # find exons in the gtf that overlap the skipped exon event exons.ce <- findExonContainingTranscripts(wds.ce, exons = exons, transcripts = transcripts) # make skipped and included exon transcripts # removes the skipped exon from all transcripts which contain it skippedExonTranscripts <- skipExonInTranscript(skippedExons = exons.ce, exons=exons, match="exact", whippetDataSet=wds.ce) # make non-skipped exon transcripts normalTranscripts <- exons[exons$transcript_id %in% exons.ce$transcript_id] # get ORF details for each set of transcripts orfs_normal <- getOrfs(normalTranscripts, BSgenome = g, returnLongestOnly = FALSE, allFrames = TRUE) orfs_skipped <- getOrfs(skippedExonTranscripts[skippedExonTranscripts$set == "skipped_exon"], BSgenome = g, returnLongestOnly = FALSE, allFrames = TRUE) orfs_included <- getOrfs(skippedExonTranscripts[skippedExonTranscripts$set == "included_exon"], BSgenome = g, returnLongestOnly = FALSE, allFrames = TRUE) head(orfs_normal[,-8]) # id: transcript isoform id # gene_id: gene id # frame: which open reading frame (1:3) # seq_length: sequence length (in AA) # seq_length_nt: sequence length (in nt) # start_site: ORF start site (in AA) # stop_site: ORF stop site (in AA) # orf_sequence: ORF sequence (not shown) # orf_length: ORF length (in AA) # start_site_nt: ORF start site (in nt) / 5'UTR length # stop_site_nt: ORF stop site (in nt) # utr3_length: 3'UTR length (in nt) # min_dist_to_junction_a: distance from stop codon to upstream junction (junction A) # exon_a_from_start: junction A exon number # min_dist_to_junction_b: distance from stop codon to downstream junction (junction B), # exon_b_from_final: junction B exon number (counting backwards from the final exon) ## ----upstreamORFs------------------------------------------------------------- # either as an indivudual data.frame with all uORFs upstreamORFs <- getUOrfs(normalTranscripts, BSgenome = g, orfs=orfs_normal, findExonB=TRUE) head(upstreamORFs) # id: transcript id # frame: reading frame for ORIGINAL orf data # overlaps_main_orf: is the entire uorf upstream of the main orf (upstream), or is there some overlap with the main orf (downsteam) - i.e. uORF stop codon is within the main ORF # uorf_length: length of the uorf (in AA) # start_site_nt: position (in nt) where the uorf start codon occurs within the transcript # stop_site_nt: position (in nt) where the uorf stop codon occurs within the transcript # dist_to_start_nt: distance (in nt) from the uorf stop codon to the main orf start codon # min_dist_to_junction_b: distance from the uorf stop codon to the nearest downstream exon end/splice junction # exon_b_from_final: relative exon number (from the end) of the uorf stop codon containing exon # or as a summary by using the getOrfs() function # with uORFS=TRUE orfs_normal <- getOrfs(normalTranscripts, BSgenome = g, returnLongestOnly = FALSE, allFrames = TRUE, uORFs=TRUE) head(orfs_normal[,-8]) # this adds the following columns: # total_uorfs: total number of uorfs found for the transcript and annotated open reading frame. # upstream_count: number of uorfs that are located fully upstream of the main orf # downstream_count: number of uorfs which partially overlap the main orf # max_uorf: maximum length of an annotated uorf. If no uorfs annotated, = 0 # uorf_maxb: maximum distance from the uorf stop codon to the nearest downstream exon end/splice junction ## ----CompareORF--------------------------------------------------------------- # compare normal and skipped isoforms orfChange <- orfDiff(orfsX = orfs_included, orfsY = orfs_skipped, filterNMD = FALSE, compareBy="gene", geneSimilarity = TRUE, compareUTR=TRUE, allORFs = orfs_normal) orfChange # id: splicing event ID # orf_length_by_group_x: longest orf in first set of transcripts (included exon) # orf_length_by_group_y: longest orf in second set of transcripts (skipped exon) # utr3_length_by_group_x: 3'UTR length in first set of transcripts (included exon) # utr3_length_by_group_y: 3'UTR length in second set of transcripts (skipped exon) # utr5_length_by_group_x: 5'UTR length in first set of transcripts (included exon) # utr5_length_by_group_y: 5'UTR length in second set of transcripts (skipped exon) # filtered: filtered for NMD ? # percent_orf_shared: percent of the ORF shared between skipped and included exon transcripts # max_percent_orf_shared: theoretical maximum percent of the ORF that could be shared (orf_length_by_group_y / orf_length_by_group_x) or (orf_length_by_group_x / orf_length_by_group_y) # orf_percent_kept_x: percent of the ORF in group x (included exon) contained in group y (skipped exon) # orf_percent_kept_y: percent of the ORF in group y (skipped exon) contained in group x (included exon) # gene_similarity_x: max percent of a normal ORF shared in the group x (included exon) transcript. If multiple ORF frames and transcripts are available, this is the maximum value from comparing the skipped isoform ORF to ALL normal isoform ORFs. # gene_similarity_y: max percent of a normal ORF shared in the group y (skipped exon) transcript. If multiple ORF frames and transcripts are available, this is the maximum value from comparing the skipped isoform ORF to ALL normal isoform ORFs. ## ----CompareORFNMD, eval=FALSE------------------------------------------------ # # devtools::install_github("betsig/notNMD") # library(notNMD) # # # we will be looking at Ndufv3 (ENSMUSG00000024038.16) again # wds.ce <- filterWhippetEvents(wds, psiDelta=0,probability=0, # event="CE", idList="ENSMUSG00000024038.16") # # # find exons in the gtf that overlap the skipped exon event # exons.ce <- findExonContainingTranscripts(wds.ce, # exons = exons, # transcripts = transcripts) # # make skipped and included exon transcripts # # removes the skipped exon from all transcripts which contain it # skippedExonTranscripts <- skipExonInTranscript(skippedExons = exons.ce, # exons=exons, # match="exact", # whippetDataSet=wds.ce) # # make non-skipped exon transcripts # normalTranscripts <- exons[exons$transcript_id %in% exons.ce$transcript_id] # # # get ORF details for each set of transcripts # # note that notNMD requires upstream orf annotations # orfs_normal <- getOrfs(normalTranscripts, BSgenome = g, # returnLongestOnly = FALSE, allFrames = TRUE, uORFs=TRUE) # orfs_skipped <- getOrfs(skippedExonTranscripts[skippedExonTranscripts$set == # "skipped_exon"], # BSgenome = g, # returnLongestOnly = FALSE, allFrames = TRUE, uORFs=TRUE) # orfs_included <- getOrfs(skippedExonTranscripts[skippedExonTranscripts$set == # "included_exon"], # BSgenome = g, # returnLongestOnly = FALSE, allFrames = TRUE, uORFs=TRUE) # # # calculate NMD probability # # --- note that if you have a different method for assessing NMD potential, you may substitute the values here # orfs_normal$nmd_prob <- notNMD::predictNMD(orfs_normal, "prob") # orfs_normal$nmd_class <- notNMD::predictNMD(orfs_normal) # orfs_skipped$nmd_prob <- notNMD::predictNMD(orfs_skipped, "prob") # orfs_skipped$nmd_class <- notNMD::predictNMD(orfs_skipped) # orfs_included$nmd_prob <- notNMD::predictNMD(orfs_included, "prob") # orfs_included$nmd_class <- notNMD::predictNMD(orfs_included) # # orfs_normal <- orfs_normal[which(!is.na(orfs_normal$orf_length)),] # orfs_skipped <- orfs_skipped[which(!is.na(orfs_skipped$orf_length)),] # orfs_included <- orfs_included[which(!is.na(orfs_included$orf_length)),] # # # # compare normal and skipped isoforms # # this time setting filterNMD to TRUE, which removes NMD targeted frames/isoforms where possible # orfChange <- orfDiff(orfsX = orfs_included, # orfsY = orfs_skipped, # filterNMD = TRUE, # geneSimilarity = TRUE, # compareUTR=TRUE, # allORFs = orfs_normal) # nmdChange <- attrChangeAltSpliced(orfs_included,orfs_skipped, # attribute="nmd_prob", # compareBy="gene", # useMax=FALSE) # m <- match(orfChange$id, nmdChange$id) # orfChange <- cbind(orfChange, nmdChange[m,-1]) ## ----plotORFs, eval = TRUE---------------------------------------------------- # plot ORFs on transcripts # annotate UTR/CDS locations geneModel.skipped <- annotateGeneModel(skippedExonTranscripts[ skippedExonTranscripts$set=="skipped_exon"], orfs_skipped) geneModel.included <- annotateGeneModel(skippedExonTranscripts[ skippedExonTranscripts$set=="included_exon"], orfs_included) grtr.included <- GeneRegionTrack(geneModel.included, name="Included Isoform", showId=TRUE, fill="#4D7ABE", transcriptAnnotation = "transcript") # make tracks for non-nmd targeted CDS grtrCDS.included <- GeneRegionTrack( geneModel.included[geneModel.included$feature == "CDS",], name="Included Isoform CDS", showId=TRUE,fill="#CB3634", transcriptAnnotation = "transcript") grtr.skipped <- GeneRegionTrack(geneModel.skipped, name="Skipped Isoform", showId=TRUE, fill="#4D7ABE", transcriptAnnotation = "transcript") # make tracks for non-nmd targeted CDS grtrCDS.skipped <- GeneRegionTrack( geneModel.skipped[geneModel.skipped$feature == "CDS",], name="Skipped Isoform CDS", showId=TRUE,fill="#CB3634", transcriptAnnotation = "transcript") plotTracks(list(gtr, grtr.included, grtr.skipped, grtrCDS.included,grtrCDS.skipped), extend.left = 1000, extend.right = 1000) # Full length transcripts in blue, CDS only in red ## ----RetainedORF-------------------------------------------------------------- # we will be looking at Srsf1 (ENSMUSG00000018379.17) again wds.ri <- filterWhippetEvents(wds, psiDelta=0,probability=0, event="RI", idList="ENSMUSG00000018379.17") # find flanking exons exons.ri <- findIntronContainingTranscripts(wds.ri, exons) # make retained and non-retained transcripts # adds the intron into all transcripts which overlap it retainedIntronTranscripts <- addIntronInTranscript(exons.ri, exons = exons, glueExons = TRUE, whippetDataSet=wds.ri) # make non-retained intron transcripts normalTranscripts <- exons[exons$transcript_id %in% exons.ri$transcript_id] # get ORF details for each set of transcripts orfs_normal <- getOrfs(normalTranscripts, BSgenome = g, returnLongestOnly = FALSE, allFrames = TRUE) orfs_retained <- getOrfs( retainedIntronTranscripts[retainedIntronTranscripts$set == "retained_intron"], BSgenome = g, returnLongestOnly = FALSE, allFrames = TRUE) orfs_spliced <- getOrfs( retainedIntronTranscripts[retainedIntronTranscripts$set == "spliced_intron"], BSgenome = g, returnLongestOnly = FALSE, allFrames = TRUE) # compare normal and retained isoforms orfChange <- orfDiff(orfsX = orfs_spliced, orfsY = orfs_retained, filterNMD = FALSE, geneSimilarity = TRUE, compareUTR=TRUE) orfChange # plot ORFs on transcripts # annotate UTR/CDS locations geneModel.retained <- annotateGeneModel( retainedIntronTranscripts[retainedIntronTranscripts$set == "retained_intron"], orfs_retained) geneModel.spliced <- annotateGeneModel( retainedIntronTranscripts[retainedIntronTranscripts$set == "spliced_intron"], orfs_retained) grtr.spliced <- GeneRegionTrack(geneModel.spliced, name="Spliced Isoform", showId=TRUE, fill="#4D7ABE", transcriptAnnotation = "transcript") grtrCDS.spliced <- GeneRegionTrack( geneModel.spliced[geneModel.spliced$feature == "CDS",], name="Spliced Isoform CDS", showId=TRUE,fill="#CB3634", transcriptAnnotation = "transcript") grtr.retained <- GeneRegionTrack(geneModel.retained, name="Retained Isoform", showId=TRUE, fill="#4D7ABE", transcriptAnnotation = "transcript") grtrCDS.retained <- GeneRegionTrack( geneModel.retained[geneModel.retained$feature == "CDS",], name="Retained Isoform CDS", showId=TRUE,fill="#CB3634", transcriptAnnotation = "transcript") gtr <- GenomeAxisTrack() plotTracks(list(gtr, grtr.spliced, grtrCDS.spliced, grtr.retained, grtrCDS.retained), extend.left = 1000, extend.right = 1000) # Full length transcripts in blue, CDS only in red ## ----notNMDsummary, eval=FALSE------------------------------------------------ # summary <- whippetTranscriptChangeSummary( # whippetDataSet=wds, # exons = exons, # transcripts = transcripts, # BSgenome = g, # NMD = TRUE # ) ## ----gtfreannotate------------------------------------------------------------ gtf <- rtracklayer::import(system.file("extdata","example_gtf.gtf", package = "GeneStructureTools")) table(gtf$type) gtf_UTRannotated <- UTR2UTR53(gtf) #some transfer from exon annotation to UTR3/5 due to overlapping with a reannotated UTR table(gtf$type, gtf_UTRannotated$type) ## ----gtfreannotatetype-------------------------------------------------------- gtf <- addBroadTypes(gtf) table(gtf$transcript_type, gtf$transcript_type_broad) ## ----gtfreannotatetypeTable--------------------------------------------------- # Ful table of all transcript types and their broader version from gencode vM14 transcript_types <- read.delim("transcript_types_broad_table.txt") transcript_types ## ----dexseqImport, eval=FALSE------------------------------------------------- # # load dexseq processed data # load("dexseq_processed.Rdata") # # create results data.frame from the DEXSeqResults object # dexseq_results <- as.data.frame(dxr1) # # 3395 events significant # dexseq_results.significant <- dexseq_results[which(dexseq_results$padj < 1e-12 & abs(dexseq_results$log2fold_21_01) > 1),] # write.table(dexseq_results.significant, file="dexseq_results_significant.txt", # sep="\t", quote=FALSE) ## ----dexseq------------------------------------------------------------------- # import dexseq gtf gtf <- rtracklayer::import(system.file("extdata","example_gtf.gtf", package = "GeneStructureTools")) gtf <- UTR2UTR53(gtf) dexseq_ranges <- rtracklayer::import(system.file("extdata", "gencode.vM14.dexseq.gtf", package = "GeneStructureTools")) dexseq_results.significant <- read.delim(system.file("extdata", "dexseq_results_significant.txt", package = "GeneStructureTools")) # find the exon type of the significant events dexseq_results.significant$overlap_types <- findDEXexonType(rownames(dexseq_results.significant), dexseq_ranges, gtf=gtf) overlap_types <- table(dexseq_results.significant$overlap_types) # broader definition dexseq_results.significant$overlap_types_broad <- summariseExonTypes(dexseq_results.significant$overlap_types) table(dexseq_results.significant$overlap_types_broad) ## ----SessionInfo-------------------------------------------------------------- sessionInfo()