Why?
What?
library(Biostrings)
head( methods(class = "DNAStringSet") )
## [1] "!,List-method" "!=,ANY,Vector-method"
## [3] "!=,Vector,ANY-method" "!=,Vector,Vector-method"
## [5] "[,List-method" "[[,List-method"
methods(generic = "reverseComplement")
## [1] reverseComplement,DNAString-method
## [2] reverseComplement,DNAStringSet-method
## [3] reverseComplement,MaskedDNAString-method
## [4] reverseComplement,MaskedRNAString-method
## [5] reverseComplement,matrix-method
## [6] reverseComplement,QualityScaledDNAStringSet-method
## [7] reverseComplement,QualityScaledRNAStringSet-method
## [8] reverseComplement,RNAString-method
## [9] reverseComplement,RNAStringSet-method
## [10] reverseComplement,XStringViews-method
## see '?methods' for accessing help and source code
DNA, amino acid, and other biological sequences. See earlier example in 01: Introdction to R and Bioconductor.
GRanges()
: genomic coordinates to represent annotations (exons,
genes, regulatory marks, …) and data (called peaks, variants,
aligned reads)GRangesList()
: genomic coordinates grouped into list elements
(e.g., paired-end reads; exons grouped by transcript)start()
/ end()
/ width()
length()
, subset, etc.mcols()
Seqinfo
, including seqlevels
and seqlengths
shift()
, narrow()
, flank()
, promoters()
, resize()
,
restrict()
, trim()
?"intra-range-methods"
range()
, reduce()
, gaps()
, disjoin()
coverage()
(!)?"inter-range-methods"
findOverlaps()
, countOverlaps()
, …, %over%
, %within%
,
%outside%
; union()
, intersect()
, setdiff()
, punion()
,
pintersect()
, psetdiff()
library(GenomicRanges)
gr <- GRanges("A", IRanges(c(10, 20, 22), width=5), "+")
shift(gr, 1) # intra-range
## GRanges object with 3 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] A 11-15 +
## [2] A 21-25 +
## [3] A 23-27 +
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
range(gr) # inter-range
## GRanges object with 1 range and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] A 10-26 +
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
reduce(gr) # inter-range
## GRanges object with 2 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] A 10-14 +
## [2] A 20-26 +
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
snps <- GRanges("A", IRanges(c(11, 17, 24), width=1))
findOverlaps(snps, gr) # between-range
## Hits object with 3 hits and 0 metadata columns:
## queryHits subjectHits
## <integer> <integer>
## [1] 1 1
## [2] 3 2
## [3] 3 3
## -------
## queryLength: 3 / subjectLength: 3
setdiff(range(gr), gr) # 'introns'
## GRanges object with 1 range and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] A 15-19 +
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
colData()
data frame for desciption of samplesrowRanges()
GRanges / GRangeList or data frame for description
of featuresexptData()
to describe the entire objectassays()
can be any matrix-like object, including very large
on-disk representations such as HDF5Arraylibrary(SummarizedExperiment)
library(airway)
data(airway)
airway
## class: RangedSummarizedExperiment
## dim: 64102 8
## metadata(1): ''
## assays(1): counts
## rownames(64102): ENSG00000000003 ENSG00000000005 ... LRG_98 LRG_99
## rowData names(0):
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample
colData(airway)
## DataFrame with 8 rows and 9 columns
## SampleName cell dex albut Run avgLength
## <factor> <factor> <factor> <factor> <factor> <integer>
## SRR1039508 GSM1275862 N61311 untrt untrt SRR1039508 126
## SRR1039509 GSM1275863 N61311 trt untrt SRR1039509 126
## SRR1039512 GSM1275866 N052611 untrt untrt SRR1039512 126
## SRR1039513 GSM1275867 N052611 trt untrt SRR1039513 87
## SRR1039516 GSM1275870 N080611 untrt untrt SRR1039516 120
## SRR1039517 GSM1275871 N080611 trt untrt SRR1039517 126
## SRR1039520 GSM1275874 N061011 untrt untrt SRR1039520 101
## SRR1039521 GSM1275875 N061011 trt untrt SRR1039521 98
## Experiment Sample BioSample
## <factor> <factor> <factor>
## SRR1039508 SRX384345 SRS508568 SAMN02422669
## SRR1039509 SRX384346 SRS508567 SAMN02422675
## SRR1039512 SRX384349 SRS508571 SAMN02422678
## SRR1039513 SRX384350 SRS508572 SAMN02422670
## SRR1039516 SRX384353 SRS508575 SAMN02422682
## SRR1039517 SRX384354 SRS508576 SAMN02422673
## SRR1039520 SRX384357 SRS508579 SAMN02422683
## SRR1039521 SRX384358 SRS508580 SAMN02422677
head(assay(airway))
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003 679 448 873 408 1138
## ENSG00000000005 0 0 0 0 0
## ENSG00000000419 467 515 621 365 587
## ENSG00000000457 260 211 263 164 245
## ENSG00000000460 60 55 40 35 78
## ENSG00000000938 0 0 2 0 1
## SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003 1047 770 572
## ENSG00000000005 0 0 0
## ENSG00000000419 799 417 508
## ENSG00000000457 331 233 229
## ENSG00000000460 63 76 60
## ENSG00000000938 0 0 0
airway[, airway$dex %in% "trt"]
## class: RangedSummarizedExperiment
## dim: 64102 4
## metadata(1): ''
## assays(1): counts
## rownames(64102): ENSG00000000003 ENSG00000000005 ... LRG_98 LRG_99
## rowData names(0):
## colnames(4): SRR1039509 SRR1039513 SRR1039517 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample
chr14 <- as(seqinfo(rowRanges(airway)), "GRanges")["14"]
airway[airway %over% chr14,]
## class: RangedSummarizedExperiment
## dim: 2244 8
## metadata(1): ''
## assays(1): counts
## rownames(2244): ENSG00000006432 ENSG00000009830 ...
## ENSG00000273259 ENSG00000273307
## rowData names(0):
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample
The object allows very easy access to the assay, for instance to determine library size (total number of mapped reads in each sample)
colSums(assay(airway))
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517
## 20637971 18809481 25348649 15163415 24448408 30818215
## SRR1039520 SRR1039521
## 19126151 21164133
and average log count number across genes with non-zero counts
library(ggplot2)
ridx <- rowSums(assay(airway)) > 0
se <- airway[ridx,]
ave_log_expr <- rowMeans(log(1 + assay(se)))
tbl <- tibble::enframe(ave_log_expr, "gene", "ave_log_expr")
ggplot(tbl, aes(ave_log_expr)) +
geom_density()
NumericList
and friend
unlist()
/ relist()
for fast operationsil = IntegerList(sample(26, 5), sample(26, 7))
il > 7
## LogicalList of length 2
## [[1]] TRUE TRUE TRUE TRUE TRUE
## [[2]] TRUE TRUE TRUE TRUE FALSE FALSE TRUE
any(il > 7)
## [1] TRUE TRUE
il[il > 7]
## IntegerList of length 2
## [[1]] 12 26 18 15 9
## [[2]] 13 16 21 10 17
relist(letters[unlist(il)], il)
## CharacterList of length 2
## [[1]] l z r o i
## [[2]] m p u j e a q
DataFrame
length()
, [
, and [<-
methods.DataFrame(
i = 1:2,
il = IntegerList(sample(5), sample(7)),
dna = DNAStringSet(c("ATCG", "AAACGGG")),
gr = GRanges(c("chr1:1-20", "chr20:10-30"))
)
## DataFrame with 2 rows and 4 columns
## i il dna gr
## <integer> <IntegerList> <DNAStringSet> <GRanges>
## 1 1 3,4,5,... ATCG chr1:1-20
## 2 2 6,4,3,... AAACGGG chr20:10-30
E.g., GTF:
Component coordinates
7 protein_coding gene 27221129 27224842 . - . ...
...
7 protein_coding transcript 27221134 27224835 . - . ...
7 protein_coding exon 27224055 27224835 . - . ...
7 protein_coding CDS 27224055 27224763 . - 0 ...
7 protein_coding start_codon 27224761 27224763 . - 0 ...
7 protein_coding exon 27221134 27222647 . - . ...
7 protein_coding CDS 27222418 27222647 . - 2 ...
7 protein_coding stop_codon 27222415 27222417 . - 0 ...
7 protein_coding UTR 27224764 27224835 . - . ...
7 protein_coding UTR 27221134 27222414 . - . ...
Annotations
gene_id "ENSG00000005073"; gene_name "HOXA11"; gene_source "ensembl_havana"; gene_biotype "protein_coding";
...
... transcript_id "ENST00000006015"; transcript_name "HOXA11-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS5411";
... exon_number "1"; exon_id "ENSE00001147062";
... exon_number "1"; protein_id "ENSP00000006015";
... exon_number "1";
... exon_number "2"; exon_id "ENSE00002099557";
... exon_number "2"; protein_id "ENSP00000006015";
... exon_number "2";
...
import()
: import various formats to GRanges
and similar instancesexport()
: transform from GRanges
and similar types to BED, GTF, …library(rtracklayer)
data(cpgIslands, package="Gviz")
cpgIslands$name <- letters[1:10]
cpgIslands$score <- sample(10)
cpgIslands
## GRanges object with 10 ranges and 2 metadata columns:
## seqnames ranges strand | name score
## <Rle> <IRanges> <Rle> | <character> <integer>
## [1] chr7 26549019-26550183 * | a 9
## [2] chr7 26564119-26564500 * | b 1
## [3] chr7 26585667-26586158 * | c 7
## [4] chr7 26591772-26593309 * | d 4
## [5] chr7 26594192-26594570 * | e 8
## [6] chr7 26623835-26624150 * | f 3
## [7] chr7 26659284-26660352 * | g 2
## [8] chr7 26721294-26721717 * | h 10
## [9] chr7 26821518-26823297 * | i 6
## [10] chr7 26991322-26991841 * | j 5
## -------
## seqinfo: 1 sequence from hg19 genome; no seqlengths
bed_file <- tempfile(fileext = ".bed")
basename(bed_file)
## [1] "file5af33bf55cd0.bed"
export(cpgIslands, bed_file)
cat(readLines(bed_file), sep = "\n")
## chr7 26549018 26550183 a 9 .
## chr7 26564118 26564500 b 1 .
## chr7 26585666 26586158 c 7 .
## chr7 26591771 26593309 d 4 .
## chr7 26594191 26594570 e 8 .
## chr7 26623834 26624150 f 3 .
## chr7 26659283 26660352 g 2 .
## chr7 26721293 26721717 h 10 .
## chr7 26821517 26823297 i 6 .
## chr7 26991321 26991841 j 5 .
import(bed_file)
## GRanges object with 10 ranges and 2 metadata columns:
## seqnames ranges strand | name score
## <Rle> <IRanges> <Rle> | <character> <numeric>
## [1] chr7 26549019-26550183 * | a 9
## [2] chr7 26564119-26564500 * | b 1
## [3] chr7 26585667-26586158 * | c 7
## [4] chr7 26591772-26593309 * | d 4
## [5] chr7 26594192-26594570 * | e 8
## [6] chr7 26623835-26624150 * | f 3
## [7] chr7 26659284-26660352 * | g 2
## [8] chr7 26721294-26721717 * | h 10
## [9] chr7 26821518-26823297 * | i 6
## [10] chr7 26991322-26991841 * | j 5
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
gtf <- system.file(
package = "airway",
"extdata", "Homo_sapiens.GRCh37.75_subset.gtf"
)
import(gtf)
## GRanges object with 891 ranges and 16 metadata columns:
## seqnames ranges strand | source
## <Rle> <IRanges> <Rle> | <factor>
## [1] 1 11006528-11009808 - | retained_intron
## [2] 1 11007700-11009808 - | retained_intron
## [3] 1 11006528-11006795 - | retained_intron
## [4] 1 11009681-11009871 - | protein_coding
## [5] 1 11009681-11009871 - | protein_coding
## ... ... ... ... . ...
## [887] 1 11126678-11141274 - | retained_intron
## [888] 1 11126678-11159894 - | protein_coding
## [889] 1 11140758-11142680 - | processed_transcript
## [890] 1 11166592-11322564 - | protein_coding
## [891] 1 11166592-11322564 - | protein_coding
## type score phase gene_id transcript_id
## <factor> <numeric> <integer> <character> <character>
## [1] transcript <NA> <NA> ENSG00000175262 ENST00000476357
## [2] exon <NA> <NA> ENSG00000175262 ENST00000476357
## [3] exon <NA> <NA> ENSG00000175262 ENST00000476357
## [4] exon <NA> <NA> ENSG00000175262 ENST00000418570
## [5] CDS <NA> 2 ENSG00000175262 ENST00000418570
## ... ... ... ... ... ...
## [887] transcript <NA> <NA> ENSG00000171824 ENST00000474216
## [888] transcript <NA> <NA> ENSG00000171824 ENST00000544779
## [889] transcript <NA> <NA> ENSG00000171824 ENST00000485606
## [890] gene <NA> <NA> ENSG00000198793 <NA>
## [891] transcript <NA> <NA> ENSG00000198793 ENST00000361445
## gene_name gene_source gene_biotype transcript_name
## <character> <character> <character> <character>
## [1] C1orf127 ensembl_havana protein_coding C1orf127-002
## [2] C1orf127 ensembl_havana protein_coding C1orf127-002
## [3] C1orf127 ensembl_havana protein_coding C1orf127-002
## [4] C1orf127 ensembl_havana protein_coding C1orf127-001
## [5] C1orf127 ensembl_havana protein_coding C1orf127-001
## ... ... ... ... ...
## [887] EXOSC10 ensembl_havana protein_coding EXOSC10-003
## [888] EXOSC10 ensembl_havana protein_coding EXOSC10-201
## [889] EXOSC10 ensembl_havana protein_coding EXOSC10-011
## [890] MTOR ensembl_havana protein_coding <NA>
## [891] MTOR ensembl_havana protein_coding MTOR-001
## transcript_source exon_number exon_id tag
## <character> <character> <character> <character>
## [1] havana <NA> <NA> <NA>
## [2] havana 1 ENSE00001869344 <NA>
## [3] havana 2 ENSE00001938325 <NA>
## [4] havana 6 ENSE00001262890 mRNA_start_NF
## [5] havana 6 <NA> mRNA_start_NF
## ... ... ... ... ...
## [887] havana <NA> <NA> <NA>
## [888] ensembl <NA> <NA> <NA>
## [889] havana <NA> <NA> <NA>
## [890] <NA> <NA> <NA> <NA>
## [891] ensembl_havana <NA> <NA> CCDS
## protein_id ccds_id
## <character> <character>
## [1] <NA> <NA>
## [2] <NA> <NA>
## [3] <NA> <NA>
## [4] <NA> <NA>
## [5] ENSP00000387816 <NA>
## ... ... ...
## [887] <NA> <NA>
## [888] <NA> <NA>
## [889] <NA> <NA>
## [890] <NA> <NA>
## [891] <NA> CCDS127
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
## see also `GenonmicFeatures::makeTxDbFromGFF()`
Sequenced reads: FASTQ files
@ERR127302.1703 HWI-EAS350_0441:1:1:1460:19184#0/1
CCTGAGTGAAGCTGATCTTGATCTACGAAGAGAGATAGATCTTGATCGTCGAGGAGATGCTGACCTTGACCT
+
HHGHHGHHHHHHHHDGG<GDGGE@GDGGD<?B8??ADAD<BE@EE8EGDGA3CB85*,77@>>CE?=896=:
@ERR127302.1704 HWI-EAS350_0441:1:1:1460:16861#0/1
GCGGTATGCTGGAAGGTGCTCGAATGGAGAGCGCCAGCGCCCCGGCGCTGAGCCGCAGCCTCAGGTCCGCCC
+
DE?DD>ED4>EEE>DE8EEEDE8B?EB<@3;BA79?,881B?@73;1?########################
readFastq()
: inputFastqStreamer()
: iterate through FASTQ filesFastqSampler()
: sample from FASTQ files, e.g., for quality assessmentAligned reads: BAM files
Header
@HD VN:1.0 SO:coordinate
@SQ SN:chr1 LN:249250621
@SQ SN:chr10 LN:135534747
@SQ SN:chr11 LN:135006516
...
@SQ SN:chrY LN:59373566
@PG ID:TopHat VN:2.0.8b CL:/home/hpages/tophat-2.0.8b.Linux_x86_64/tophat --mate-inner-dist 150 --solexa-quals --max-multihits 5 --no-discordant --no-mixed --coverage-search --microexon-search --library-type fr-unstranded --num-threads 2 --output-dir tophat2_out/ERR127306 /home/hpages/bowtie2-2.1.0/indexes/hg19 fastq/ERR127306_1.fastq fastq/ERR127306_2.fastq
Alignments: ID, flag, alignment and mate
ERR127306.7941162 403 chr14 19653689 3 72M = 19652348 -1413 ...
ERR127306.22648137 145 chr14 19653692 1 72M = 19650044 -3720 ...
ERR127306.933914 339 chr14 19653707 1 66M120N6M = 19653686 -213 ...
Alignments: sequence and quality
... GAATTGATCAGTCTCATCTGAGAGTAACTTTGTACCCATCACTGATTCCTTCTGAGACTGCCTCCACTTCCC *'%%%%%#&&%''#'&%%%)&&%%$%%'%%'&*****$))$)'')'%)))&)%%%%$'%%%%&"))'')%))
... TTGATCAGTCTCATCTGAGAGTAACTTTGTACCCATCACTGATTCCTTCTGAGACTGCCTCCACTTCCCCAG '**)****)*'*&*********('&)****&***(**')))())%)))&)))*')&***********)****
... TGAGAGTAACTTTGTACCCATCACTGATTCCTTCTGAGACTGCCTCCACTTCCCCAGCAGCCTCTGGTTTCT '******&%)&)))&")')'')'*((******&)&'')'))$))'')&))$)**&&****************
Alignments: Tags
... AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:72 YT:Z:UU NH:i:2 CC:Z:chr22 CP:i:16189276 HI:i:0
... AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:72 YT:Z:UU NH:i:3 CC:Z:= CP:i:19921600 HI:i:0
... AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:4 MD:Z:72 YT:Z:UU XS:A:+ NH:i:3 CC:Z:= CP:i:19921465 HI:i:0
... AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:4 MD:Z:72 YT:Z:UU XS:A:+ NH:i:2 CC:Z:chr22 CP:i:16189138 HI:i:0
readGAlignments()
: Single-end readsreadGAlignmentPairs()
, readGAlignmentsList()
: paired end readslibrary(GenomicAlignments)
## Loading required package: Rsamtools
fls <- dir(
system.file(package = "airway", "extdata"),
pattern = "bam",
full.names = TRUE
)
readGAlignments(fls[1])
## GAlignments object with 14282 alignments and 0 metadata columns:
## seqnames strand cigar qwidth start end
## <Rle> <Rle> <character> <integer> <integer> <integer>
## [1] 1 + 63M 63 11053773 11053835
## [2] 1 - 63M 63 11053840 11053902
## [3] 1 + 63M 63 11067866 11067928
## [4] 1 - 63M 63 11067931 11067993
## [5] 1 + 63M 63 11072708 11072770
## ... ... ... ... ... ... ...
## [14278] 1 - 63M 63 11364733 11364795
## [14279] 1 + 63M 63 11365866 11365928
## [14280] 1 - 63M 63 11365987 11366049
## [14281] 1 + 2S61M 63 11386063 11386123
## [14282] 1 - 63M 63 11386132 11386194
## width njunc
## <integer> <integer>
## [1] 63 0
## [2] 63 0
## [3] 63 0
## [4] 63 0
## [5] 63 0
## ... ... ...
## [14278] 63 0
## [14279] 63 0
## [14280] 63 0
## [14281] 61 0
## [14282] 63 0
## -------
## seqinfo: 84 sequences from an unspecified genome
Header
##fileformat=VCFv4.2
##fileDate=20090805
##source=myImputationProgramV3.1
##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta
##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species="Homo sapiens",taxonomy=x>
##phasing=partial
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">
...
##FILTER=<ID=q10,Description="Quality below 10">
##FILTER=<ID=s50,Description="Less than 50% of samples have data">
...
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
Location
#CHROM POS ID REF ALT QUAL FILTER ...
20 14370 rs6054257 G A 29 PASS ...
20 17330 . T A 3 q10 ...
20 1110696 rs6040355 A G,T 67 PASS ...
Variant INFO
#CHROM POS ... INFO ...
20 14370 ... NS=3;DP=14;AF=0.5;DB;H2 ...
20 17330 ... NS=3;DP=11;AF=0.017 ...
20 1110696 ... NS=2;DP=10;AF=0.333,0.667;AA=T;DB ...
Genotype FORMAT and samples
... POS ... FORMAT NA00001 NA00002 NA00003
... 14370 ... GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
... 17330 ... GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3
... 1110696 ... GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4
readVcf()
: VCF inputScanVcfParam()
: restrict input to necessary fields / rangesVcfFile()
: indexing and iterating through large VCF fileslocateVariants()
: annotate in relation to genes, etc; see also
ensemblVEP, VariantFilteringfilterVcf()
: flexible filteringExample: BAM file
ScanBamParam()
: restrict inputBamFile(, yieldSize=)
: iterationreduceByYield()
Restrict input:
library(GenomicAlignments)
fl <- dir(
system.file(package = "airway", "extdata"),
pattern = "bam", full.name = TRUE
)
fl[1]
## [1] "/Users/ma38727/Library/R/3.6/Bioc/3.10/airway/extdata/SRR1039508_subset.bam"
readGAlignments(fl[1])
## GAlignments object with 14282 alignments and 0 metadata columns:
## seqnames strand cigar qwidth start end
## <Rle> <Rle> <character> <integer> <integer> <integer>
## [1] 1 + 63M 63 11053773 11053835
## [2] 1 - 63M 63 11053840 11053902
## [3] 1 + 63M 63 11067866 11067928
## [4] 1 - 63M 63 11067931 11067993
## [5] 1 + 63M 63 11072708 11072770
## ... ... ... ... ... ... ...
## [14278] 1 - 63M 63 11364733 11364795
## [14279] 1 + 63M 63 11365866 11365928
## [14280] 1 - 63M 63 11365987 11366049
## [14281] 1 + 2S61M 63 11386063 11386123
## [14282] 1 - 63M 63 11386132 11386194
## width njunc
## <integer> <integer>
## [1] 63 0
## [2] 63 0
## [3] 63 0
## [4] 63 0
## [5] 63 0
## ... ... ...
## [14278] 63 0
## [14279] 63 0
## [14280] 63 0
## [14281] 61 0
## [14282] 63 0
## -------
## seqinfo: 84 sequences from an unspecified genome
## hack: index the bam file in-place
invisible(indexBam(fl[1]))
which <- GRanges(c("1:11053773-11072770", "1:11362290-11386194"))
param <- ScanBamParam(which = which, what = "seq")
readGAlignments(fl[1], param = param)
## GAlignments object with 86 alignments and 1 metadata column:
## seqnames strand cigar qwidth start end width
## <Rle> <Rle> <character> <integer> <integer> <integer> <integer>
## [1] 1 + 63M 63 11053773 11053835 63
## [2] 1 - 63M 63 11053840 11053902 63
## [3] 1 + 63M 63 11067866 11067928 63
## [4] 1 - 63M 63 11067931 11067993 63
## [5] 1 + 63M 63 11072708 11072770 63
## ... ... ... ... ... ... ... ...
## [82] 1 - 63M 63 11364733 11364795 63
## [83] 1 + 63M 63 11365866 11365928 63
## [84] 1 - 63M 63 11365987 11366049 63
## [85] 1 + 2S61M 63 11386063 11386123 61
## [86] 1 - 63M 63 11386132 11386194 63
## njunc | seq
## <integer> | <DNAStringSet>
## [1] 0 | ATACAAAAAT...GAAGCTGAGG
## [2] 0 | AGAATCACTT...TGCACTCCAG
## [3] 0 | CTAAAAACAT...TTACTTTATT
## [4] 0 | TTTATTACTT...ACCTCCGCCA
## [5] 0 | GCCATTTTGT...TTCGGTGTCC
## ... ... . ...
## [82] 0 | CCCCGGCCCA...GTGTGGGAAG
## [83] 0 | CTCCTCACTT...AGACAGGGCG
## [84] 0 | TCACCTCCCA...GGGGCGGCGG
## [85] 0 | GGGCAGTGCA...TTGGGCCAAA
## [86] 0 | TTCTCTGGGT...GAGAAGCTAC
## -------
## seqinfo: 84 sequences from an unspecified genome
Iterate:
bf <- BamFile(fl[1], yieldSize = 5000)
open(bf)
repeat {
galn <- readGAlignments(bf)
if (length(galn) == 0)
break
## do work
message(length(galn))
}
## 5000
## 5000
## 4282
close(bf)
sessionInfo()
## R version 3.6.0 Patched (2019-04-26 r76431)
## Platform: x86_64-apple-darwin17.7.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
##
## Matrix products: default
## BLAS: /Users/ma38727/bin/R-3-6-branch/lib/libRblas.dylib
## LAPACK: /Users/ma38727/bin/R-3-6-branch/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] GenomicAlignments_1.21.4 Rsamtools_2.1.2
## [3] rtracklayer_1.45.1 ggplot2_3.2.0
## [5] airway_1.5.0 SummarizedExperiment_1.15.5
## [7] DelayedArray_0.11.2 BiocParallel_1.19.0
## [9] matrixStats_0.54.0 Biobase_2.45.0
## [11] GenomicRanges_1.37.14 GenomeInfoDb_1.21.1
## [13] Biostrings_2.53.0 XVector_0.25.0
## [15] IRanges_2.19.10 S4Vectors_0.23.17
## [17] BiocGenerics_0.31.4 BiocStyle_2.13.2
##
## loaded via a namespace (and not attached):
## [1] tidyselect_0.2.5 xfun_0.8 purrr_0.3.2
## [4] lattice_0.20-38 colorspace_1.4-1 htmltools_0.3.6
## [7] yaml_2.2.0 XML_3.98-1.20 rlang_0.4.0
## [10] pillar_1.4.2 glue_1.3.1 withr_2.1.2
## [13] GenomeInfoDbData_1.2.1 stringr_1.4.0 zlibbioc_1.31.0
## [16] munsell_0.5.0 gtable_0.3.0 codetools_0.2-16
## [19] evaluate_0.14 labeling_0.3 knitr_1.23
## [22] Rcpp_1.0.1 scales_1.0.0 BiocManager_1.30.5.1
## [25] digest_0.6.19 stringi_1.4.3 bookdown_0.11
## [28] dplyr_0.8.2 grid_3.6.0 tools_3.6.0
## [31] bitops_1.0-6 magrittr_1.5 RCurl_1.95-4.12
## [34] lazyeval_0.2.2 tibble_2.1.3 crayon_1.3.4
## [37] pkgconfig_2.0.2 Matrix_1.2-17 assertthat_0.2.1
## [40] rmarkdown_1.13 R6_2.4.0 compiler_3.6.0