linkSet 0.99.19
8 August 2025
linkSet 0.99.19
This vignette provides a step-by-step guide to using the linkSet package to analyze Hi-C/HiChIP data. We will use a toy example to illustrate the main functions and workflows. Our goal is to identify the enhancer-gene links in this example.
We will use the following datasets as input:
We highly recommend you to use custom data instead of the example data provided in this vignette.
suppressPackageStartupMessages({
library(linkSet)
# Load additional packages only if available
if (requireNamespace("TxDb.Mmusculus.UCSC.mm10.knownGene", quietly = TRUE)) {
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
}
if (requireNamespace("org.Mm.eg.db", quietly = TRUE)) {
library(org.Mm.eg.db)
}
if (requireNamespace("Organism.dplyr", quietly = TRUE)) {
library(Organism.dplyr)
}
if (requireNamespace("InteractionSet", quietly = TRUE)) {
library(InteractionSet)
}
if (requireNamespace("rtracklayer", quietly = TRUE)) {
library(rtracklayer)
}
})
We use our custom function readvalidPairs
to load the example data.
Firstly, we need to load into GInteractions object.
hic_file <- system.file("extdata", "toyExample.pair.gz",
package = "linkSet"
)
gi <- readvalidPairs(hic_file, format = "pair")
promoterGr <- tryCatch(
{
withTxDb("mm10", function(src) {
genes <- Organism.dplyr::genes(src, columns = "symbol")
IRanges::promoters(genes, upstream = 10000)
})
},
error = function(e) {
# Fallback approach using TxDb directly
if (requireNamespace("TxDb.Mmusculus.UCSC.mm10.knownGene", quietly = TRUE) &&
requireNamespace("org.Mm.eg.db", quietly = TRUE)) {
txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene::TxDb.Mmusculus.UCSC.mm10.knownGene
gene_ranges <- GenomicFeatures::genes(txdb)
# Add gene symbols
gene_ids <- names(gene_ranges)
symbols <- AnnotationDbi::select(org.Mm.eg.db::org.Mm.eg.db,
keys = gene_ids,
columns = "SYMBOL",
keytype = "ENTREZID"
)
symbol_map <- setNames(symbols$SYMBOL, symbols$ENTREZID)
mcols(gene_ranges)$symbol <- symbol_map[names(gene_ranges)]
genes <- gene_ranges[!is.na(mcols(gene_ranges)$symbol)]
IRanges::promoters(genes, upstream = 10000)
} else {
stop("Required packages not available for mm10 annotation")
}
}
)
file_path <- system.file("extdata", "Embryo_body.bed.gz", package = "linkSet")
enhancer <- rtracklayer::import(file_path, format = "BED")
Because the hic data only contains digist end, so we resize the region to upstream 5kb and downstream 5kb.
After that, we use baitGInteractions
to generate the linkSet
object.
gi <- resize(gi, 10000, fix = "center")
ls <- baitGInteractions(gi, promoterGr, enhancer, geneSymbol = "symbol")
ls
#> linkSet object with 27 interactions and 2 metadata columns:
#> bait seqnames_oe ranges_oe | anchor1.symbol
#> <character> <Rle> <IRanges> | <character>
#> [1] Xkr4 --- chr1 51753961-51754690 | Xkr4
#> [2] Xkr4 --- chr1 38494581-38496180 | Xkr4
#> [3] Xkr4 --- chr1 38496241-38496840 | Xkr4
#> [4] Rp1 --- chr1 14496641-14497000 | Rp1
#> [5] Sox17 --- chr1 90401091-90402520 | Sox17
#> ... ... ... ... ... . ...
#> [23] Npbwr1 --- chr1 59014721-59015250 | Npbwr1
#> [24] Rb1cc1 --- chr1 45369781-45370210 | Rb1cc1
#> [25] Rb1cc1 --- chr1 33869921-33870570 | Rb1cc1
#> [26] Rb1cc1 --- chr1 94074541-94075190 | Rb1cc1
#> [27] Alkal1 --- chr1 12683361-12683590 | Alkal1
#> anchor2.name
#> <character>
#> [1] 11.4716435093611
#> [2] 11.9206847715326
#> [3] 11.1575156050835
#> [4] 16.1891332694206
#> [5] 11.3810398990553
#> ... ...
#> [23] 11.2298655414852
#> [24] 16.4355294570208
#> [25] 11.2829690359357
#> [26] 10.9286062722993
#> [27] 14.0682990384515
#> -------
#> regions: 30 ranges and 0 metadata columns
#> seqinfo: 66 sequences (1 circular) from mm10 genome
When we print the linkSet
object, we can see the basic information of the linkSet
object.
By default, we don’t show the bait region. But you are interested in the bait region, you can set showBaitRegion = TRUE
.
showLinkSet(ls, baitRegion = TRUE)
#> linkSet object with 27 interactions and 2 metadata columns:
#> bait seqnames_bait ranges_bait seqnames_oe ranges_oe |
#> [1] Xkr4 chr1 3671299-3681498 --- chr1 51753961-51754690 |
#> [2] Xkr4 chr1 3671299-3681498 --- chr1 38494581-38496180 |
#> [3] Xkr4 chr1 3671299-3681498 --- chr1 38496241-38496840 |
#> [4] Rp1 chr1 4409042-4419241 --- chr1 14496641-14497000 |
#> [5] Sox17 chr1 4497155-4507354 --- chr1 90401091-90402520 |
#> ... ... ... ... ... ... ... .
#> [23] Npbwr1 chr1 5917199-5927398 --- chr1 59014721-59015250 |
#> [24] Rb1cc1 chr1 6196197-6206396 --- chr1 45369781-45370210 |
#> [25] Rb1cc1 chr1 6196197-6206396 --- chr1 33869921-33870570 |
#> [26] Rb1cc1 chr1 6196197-6206396 --- chr1 94074541-94075190 |
#> [27] Alkal1 chr1 6349218-6359417 --- chr1 12683361-12683590 |
#> anchor1.symbol anchor2.name
#> [1] Xkr4 11.4716435093611
#> [2] Xkr4 11.9206847715326
#> [3] Xkr4 11.1575156050835
#> [4] Rp1 16.1891332694206
#> [5] Sox17 11.3810398990553
#> ... ... ...
#> [23] Npbwr1 11.2298655414852
#> [24] Rb1cc1 16.4355294570208
#> [25] Rb1cc1 11.2829690359357
#> [26] Rb1cc1 10.9286062722993
#> [27] Alkal1 14.0682990384515
Now, we can run diagnoseLinkSet to check the distance distribution and inter/intra interaction percentage. First, let’s annotate the promoter regions with genome information:
#> linkSet object with 27 interactions and 4 metadata columns:
#> bait seqnames_oe ranges_oe | anchor1.symbol
#> <character> <Rle> <IRanges> | <character>
#> [1] Xkr4 --- chr1 51753961-51754690 | Xkr4
#> [2] Xkr4 --- chr1 38494581-38496180 | Xkr4
#> [3] Xkr4 --- chr1 38496241-38496840 | Xkr4
#> [4] Rp1 --- chr1 14496641-14497000 | Rp1
#> [5] Sox17 --- chr1 90401091-90402520 | Sox17
#> ... ... ... ... ... . ...
#> [23] Npbwr1 --- chr1 59014721-59015250 | Npbwr1
#> [24] Rb1cc1 --- chr1 45369781-45370210 | Rb1cc1
#> [25] Rb1cc1 --- chr1 33869921-33870570 | Rb1cc1
#> [26] Rb1cc1 --- chr1 94074541-94075190 | Rb1cc1
#> [27] Alkal1 --- chr1 12683361-12683590 | Alkal1
#> anchor2.name inter_type distance
#> <character> <character> <integer>
#> [1] 11.4716435093611 inter 48077927
#> [2] 11.9206847715326 inter 34818982
#> [3] 11.1575156050835 inter 34820142
#> [4] 16.1891332694206 inter 10082679
#> [5] 11.3810398990553 inter 85899551
#> ... ... ... ...
#> [23] 11.2298655414852 inter 53092687
#> [24] 16.4355294570208 inter 39168699
#> [25] 11.2829690359357 inter 27668949
#> [26] 10.9286062722993 inter 87873569
#> [27] 14.0682990384515 inter 6329158
#> -------
#> regions: 30 ranges and 0 metadata columns
#> seqinfo: 66 sequences (1 circular) from mm10 genome
Intrachromosomal interaction and long distance interaction are likely be noise, so we filter them.
ls <- countInteractibility(ls)
ls <- filterLinks(ls, filter_intra = TRUE, filter_unannotate = TRUE, distance = 50000000)
Duplicated links are associated with contact frequency, so it’s a good idea to count duplicated links.
ls <- countInteractions(ls)
orderLinks(ls, by = "count", decreasing = TRUE)
#> linkSet object with 15 interactions and 5 metadata columns:
#> bait seqnames_oe ranges_oe | anchor1.symbol
#> <character> <Rle> <IRanges> | <character>
#> [1] Xkr4 --- chr1 51753961-51754690 | Xkr4
#> [2] Xkr4 --- chr1 38494581-38496180 | Xkr4
#> [3] Xkr4 --- chr1 38496241-38496840 | Xkr4
#> [4] Rp1 --- chr1 14496641-14497000 | Rp1
#> [5] Sox17 --- chr1 4561881-4562150 | Sox17
#> ... ... ... ... ... . ...
#> [11] Rgs20 --- chr1 44099101-44099360 | Rgs20
#> [12] Npbwr1 --- chr1 39818891-39819140 | Npbwr1
#> [13] Rb1cc1 --- chr1 45369781-45370210 | Rb1cc1
#> [14] Rb1cc1 --- chr1 33869921-33870570 | Rb1cc1
#> [15] Alkal1 --- chr1 12683361-12683590 | Alkal1
#> anchor2.name inter_type count distance
#> <character> <character> <integer> <integer>
#> [1] 11.4716435093611 inter 1 48077927
#> [2] 11.9206847715326 inter 1 34818982
#> [3] 11.1575156050835 inter 1 34820142
#> [4] 16.1891332694206 inter 1 10082679
#> [5] 13.507493224644 inter 1 59761
#> ... ... ... ... ...
#> [11] 11.3026423239041 inter 1 39024045
#> [12] 12.9148641505382 inter 1 33896717
#> [13] 16.4355294570208 inter 1 39168699
#> [14] 11.2829690359357 inter 1 27668949
#> [15] 14.0682990384515 inter 1 6329158
#> -------
#> regions: 20 ranges and 0 metadata columns
#> seqinfo: 66 sequences (1 circular) from mm10 genome
We can notice that there is a significant link strength between Sulf1
and chr1:12785091-12785750
.
Enhancers that regulate multiple genes are biologically meaningful.
ls <- crossGeneEnhancer(ls)
ls <- orderLinks(ls, by = "crossFreq", decreasing = TRUE)
ls
#> linkSet object with 15 interactions and 6 metadata columns:
#> bait seqnames_oe ranges_oe | anchor1.symbol
#> <character> <Rle> <IRanges> | <character>
#> [1] Sox17 --- chr1 4561881-4562150 | Sox17
#> [2] Lypla1 --- chr1 4561881-4562150 | Lypla1
#> [3] Mrpl15 --- chr1 4561881-4562150 | Mrpl15
#> [4] Lypla1 --- chr1 33869921-33870570 | Lypla1
#> [5] Mrpl15 --- chr1 33869921-33870570 | Mrpl15
#> ... ... ... ... ... . ...
#> [11] Lypla1 --- chr1 34163531-34164100 | Lypla1
#> [12] Rgs20 --- chr1 44099101-44099360 | Rgs20
#> [13] Npbwr1 --- chr1 39818891-39819140 | Npbwr1
#> [14] Rb1cc1 --- chr1 45369781-45370210 | Rb1cc1
#> [15] Alkal1 --- chr1 12683361-12683590 | Alkal1
#> anchor2.name inter_type count distance crossFreq
#> <character> <character> <integer> <integer> <integer>
#> [1] 13.507493224644 inter 1 59761 3
#> [2] 13.507493224644 inter 1 240872 3
#> [3] 13.507493224644 inter 1 228624 3
#> [4] 11.2829690359357 inter 1 29067358 3
#> [5] 11.2829690359357 inter 1 29079606 3
#> ... ... ... ... ... ...
#> [11] 11.2903880454209 inter 1 29360928 1
#> [12] 11.3026423239041 inter 1 39024045 1
#> [13] 12.9148641505382 inter 1 33896717 1
#> [14] 16.4355294570208 inter 1 39168699 1
#> [15] 14.0682990384515 inter 1 6329158 1
#> -------
#> regions: 20 ranges and 0 metadata columns
#> seqinfo: 66 sequences (1 circular) from mm10 genome
We can use plot_genomic_ranges
to visualize the cross gene links.
We can also choose to visualze the bait center region.
# Check if regionsBait exists and has names
if (!is.null(regionsBait(ls)) && !is.null(names(regionsBait(ls)))) {
# Get available bait names from regionsBait
available_baits <- unique(names(regionsBait(ls)))
if (length(available_baits) > 0) {
# Use the first available bait for demonstration
plot_genomic_ranges(ls, showBait = available_baits[1])
} else {
cat("No named bait regions available for plotting")
}
} else {
# If no regionsBait, just plot without showBait
plot_genomic_ranges(ls)
}
sessionInfo()
#> R version 4.5.1 (2025-06-13)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.2 LTS
#>
#> Matrix products: default
#> BLAS: /home/biocbuild/bbs-3.22-bioc/R/lib/libRblas.so
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0 LAPACK version 3.12.0
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_GB LC_COLLATE=C
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: America/New_York
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] stats4 stats graphics grDevices utils datasets methods
#> [8] base
#>
#> other attached packages:
#> [1] rtracklayer_1.69.1
#> [2] InteractionSet_1.37.1
#> [3] SummarizedExperiment_1.39.1
#> [4] MatrixGenerics_1.21.0
#> [5] matrixStats_1.5.0
#> [6] Organism.dplyr_1.37.1
#> [7] AnnotationFilter_1.33.0
#> [8] dplyr_1.1.4
#> [9] org.Mm.eg.db_3.21.0
#> [10] TxDb.Mmusculus.UCSC.mm10.knownGene_3.10.0
#> [11] GenomicFeatures_1.61.6
#> [12] AnnotationDbi_1.71.1
#> [13] Biobase_2.69.0
#> [14] linkSet_0.99.19
#> [15] GenomicRanges_1.61.1
#> [16] Seqinfo_0.99.2
#> [17] IRanges_2.43.0
#> [18] S4Vectors_0.47.0
#> [19] BiocGenerics_0.55.1
#> [20] generics_0.1.4
#> [21] BiocStyle_2.37.1
#>
#> loaded via a namespace (and not attached):
#> [1] tidyselect_1.2.1 farver_2.1.2 blob_1.2.4
#> [4] filelock_1.0.3 Biostrings_2.77.2 bitops_1.0-9
#> [7] fastmap_1.2.0 lazyeval_0.2.2 RCurl_1.98-1.17
#> [10] BiocFileCache_2.99.5 GenomicAlignments_1.45.2 XML_3.99-0.18
#> [13] digest_0.6.37 lifecycle_1.0.4 KEGGREST_1.49.1
#> [16] RSQLite_2.4.2 magrittr_2.0.3 compiler_4.5.1
#> [19] rlang_1.1.6 sass_0.4.10 tools_4.5.1
#> [22] yaml_2.3.10 data.table_1.17.8 knitr_1.50
#> [25] labeling_0.4.3 S4Arrays_1.9.1 bit_4.6.0
#> [28] curl_6.4.0 DelayedArray_0.35.2 RColorBrewer_1.1-3
#> [31] abind_1.4-8 BiocParallel_1.43.4 withr_3.0.2
#> [34] purrr_1.1.0 grid_4.5.1 ggplot2_3.5.2
#> [37] scales_1.4.0 iterators_1.0.14 tinytex_0.57
#> [40] dichromat_2.0-0.1 cli_3.6.5 rmarkdown_2.29
#> [43] crayon_1.5.3 httr_1.4.7 rjson_0.2.23
#> [46] DBI_1.2.3 cachem_1.1.0 parallel_4.5.1
#> [49] BiocManager_1.30.26 XVector_0.49.0 restfulr_0.0.16
#> [52] vctrs_0.6.5 Matrix_1.7-3 jsonlite_2.0.0
#> [55] bookdown_0.43 patchwork_1.3.1 bit64_4.6.0-1
#> [58] magick_2.8.7 foreach_1.5.2 jquerylib_0.1.4
#> [61] glue_1.8.0 codetools_0.2-20 gtable_0.3.6
#> [64] GenomeInfoDb_1.45.9 BiocIO_1.19.0 UCSC.utils_1.5.0
#> [67] tibble_3.3.0 pillar_1.11.0 rappdirs_0.3.3
#> [70] htmltools_0.5.8.1 R6_2.6.1 dbplyr_2.5.0
#> [73] httr2_1.2.1 evaluate_1.0.4 lattice_0.22-7
#> [76] png_0.1-8 Rsamtools_2.25.2 memoise_2.0.1
#> [79] bslib_0.9.0 Rcpp_1.1.0 SparseArray_1.9.1
#> [82] xfun_0.52 pkgconfig_2.0.3