This vignette shows an example workflow for ensemble biclustering analysis
with the mosbi
package.
Every function of the package has a help page with a detailed documentation.
To access these type help(package=mosbi)
in the R
console.
Import dependencies.
library(mosbi)
Two additional functions are defined, to calculate z-scores of the data and to visualize the biclusters as a histogram.
z_score <- function(x, margin = 2) {
z_fun <- function(y) {
(y - mean(y, na.rm = TRUE)) / sd(y, na.rm = TRUE)
}
if (margin == 2) {
return(apply(x, margin, z_fun))
} else if (margin == 1) {
return(t(apply(x, margin, z_fun)))
}
}
bicluster_histo <- function(biclusters) {
cols <- mosbi::colhistogram(biclusters)
rows <- mosbi::rowhistogram(biclusters)
graphics::par(mfrow = c(1, 2))
hist(cols, main = "Column size ditribution")
hist(rows, main = "Row size ditribution")
}
Biclustering will be done on a data matrix. As an example,
lipidomics dataset from the metabolights database will be used
https://www.ebi.ac.uk/metabolights/MTBLS562
.
The data consists of 40 samples (columns) and 245 lipids (rows).
# get data
data(mouse_data)
mouse_data <- mouse_data[c(
grep(
"metabolite_identification",
colnames(mouse_data)
),
grep("^X", colnames(mouse_data))
)]
# Make data matrix
data_matrix <- z_score(log2(as.matrix(mouse_data[2:ncol(mouse_data)])), 1)
rownames(data_matrix) <- mouse_data$metabolite_identification
stats::heatmap(data_matrix)
The data has a gaussian-like distribution and no missing values, so we can proceed with biclustering.
The mosbi
package is able to work with results of different
biclustering algorithms. The approach unites the results
from different algorithms.
The results of four example algorithms will be computed and
converted to mosbi::bicluster
objects. For a list of all
supported biclustering algorithms/packages type ?mosbi::get_biclusters
in the R
console.
# Fabia
fb <- mosbi::run_fabia(data_matrix) # In case the algorithms throws an error,
#> Cycle: 0
Cycle: 20
Cycle: 40
Cycle: 60
Cycle: 80
Cycle: 100
Cycle: 120
Cycle: 140
Cycle: 160
Cycle: 180
Cycle: 200
Cycle: 220
Cycle: 240
Cycle: 260
Cycle: 280
Cycle: 300
Cycle: 320
Cycle: 340
Cycle: 360
Cycle: 380
Cycle: 400
Cycle: 420
Cycle: 440
Cycle: 460
Cycle: 480
Cycle: 500
# return an empty list
# isa2
BCisa <- mosbi::run_isa(data_matrix)
# Plaid
BCplaid <- mosbi::run_plaid(data_matrix)
#> layer: 0
#> 5882.744
#> layer: 1
#> [1] 0 95 10
#> [1] 1 101 10
#> [1] 30 101 10
#> [1] 31 39 10
#> [1] 32 39 9
#> [1] 33 37 9
#> [1] 34 37 9
#> [1] 35 37 9
#> [1] 60 37 9
#> [1] 2
#> [1] 154.413 0.000 0.000 0.000
#> back fitting 2 times
#> layer: 2
#> [1] 0 110 11
#> [1] 1 98 11
#> [1] 2 93 11
#> [1] 3 90 11
#> [1] 4 88 11
#> [1] 5 87 11
#> [1] 30 87 11
#> [1] 31 37 11
#> [1] 32 37 6
#> [1] 33 37 6
#> [1] 60 37 6
#> [1] 6
#> [1] 202.7203 0.0000 0.0000 0.0000
#> back fitting 2 times
#> layer: 3
#> [1] 0 120 20
#> [1] 1 110 17
#> [1] 2 106 16
#> [1] 3 106 15
#> [1] 4 107 15
#> [1] 30 107 15
#> [1] 31 42 15
#> [1] 32 42 10
#> [1] 33 37 10
#> [1] 34 37 10
#> [1] 35 37 10
#> [1] 60 37 10
#> [1] 5
#> [1] 104.5608 0.0000 0.0000 0.0000
#> back fitting 2 times
#> layer: 4
#> [1] 0 107 19
#> [1] 1 91 17
#> [1] 2 87 16
#> [1] 3 85 15
#> [1] 4 85 14
#> [1] 5 84 13
#> [1] 6 83 13
#> [1] 30 83 13
#> [1] 31 6 13
#> [1] 32 6 11
#> [1] 33 4 11
#> [1] 34 4 11
#> [1] 35 3 11
#> [1] 36 3 9
#> [1] 37 3 9
#> [1] 38 3 8
#> [1] 39 3 8
#> [1] 60 3 8
#> [1] 7
#> [1] 46.39038 0.00000 0.00000 0.00000
#> back fitting 2 times
#> layer: 5
#> [1] 0 101 17
#> [1] 1 83 17
#> [1] 2 76 17
#> [1] 3 69 17
#> [1] 4 66 17
#> [1] 30 66 17
#> [1] 31 2 17
#> [1] 32 2 10
#> [1] 33 2 10
#> [1] 34 2 9
#> [1] 35 2 9
#> [1] 60 2 9
#> [1] 5
#> [1] 6.682646 0.000000 0.000000 0.000000
#> back fitting 2 times
#> layer: 6
#> [1] 0 100 17
#> [1] 1 88 16
#> [1] 2 87 16
#> [1] 3 88 15
#> [1] 4 87 15
#> [1] 30 87 15
#> [1] 31 8 15
#> [1] 32 8 10
#> [1] 33 7 10
#> [1] 34 7 10
#> [1] 35 7 10
#> [1] 60 7 10
#> [1] 5
#> [1] 35.24584 0.00000 0.00000 0.00000
#> back fitting 2 times
#> layer: 7
#> [1] 0 106 18
#> [1] 1 96 18
#> [1] 2 91 17
#> [1] 3 90 17
#> [1] 30 90 17
#> [1] 31 3 17
#> [1] 32 3 11
#> [1] 33 3 11
#> [1] 34 3 10
#> [1] 35 3 10
#> [1] 36 3 9
#> [1] 37 3 9
#> [1] 60 3 9
#> [1] 4
#> [1] 14.51552 0.00000 0.00000 0.00000
#> back fitting 2 times
#> layer: 8
#> [1] 0 43 19
#> [1] 1 46 19
#> [1] 30 46 19
#> [1] "Zero residual degrees of freedom"
#> [1] 31 1 19
#> [1] "Zero residual degrees of freedom"
#> [1] 32 1 19
#> [1] 33 0 19
#> [1] 34
#> [1] 0 0 0 0
#>
#> Layer Rows Cols Df SS MS Convergence Rows Released Cols Released
#> 0 245 40 284 6321.62 22.26 NA NA NA
#> 1 37 9 45 282.80 6.28 1 64 1
#> 2 37 6 42 355.72 8.47 1 50 5
#> 3 37 10 46 233.94 5.09 1 70 5
#> 4 3 8 10 72.64 7.26 1 80 5
#> 5 2 9 10 9.12 0.91 1 64 8
#> 6 7 10 16 59.13 3.70 1 80 5
#> 7 3 9 11 22.92 2.08 1 87 8
# QUBIC
BCqubic <- mosbi::run_qubic(data_matrix)
# Merge results of all algorithms
all_bics <- c(fb, BCisa, BCplaid, BCqubic)
bicluster_histo(all_bics)
The histogram visualizes the distribution of bicluster sizes (separately for the number of rows and columns of each bicluster). The total number of found biclusters are given in the title.
The next step of the ensemble approach is the computation of a
similarity network of biclusters. To filter for for similarities due to
random overlaps of biclusters, we apply an error
model (For more details refer to our publication).
Different similarity metrics are available.
For details type mosbi::bicluster_network
in the R
console.
bic_net <- mosbi::bicluster_network(all_bics, # List of biclusters
data_matrix, # Data matrix
n_randomizations = 5,
# Number of randomizations for the
# error model
MARGIN = "both",
# Use datapoints for metric evaluation
metric = 4, # Fowlkes–Mallows index
# For information about the metrics,
# visit the "Similarity metrics
# evaluation" vignette
n_steps = 1000,
# At how many steps should
# the cut-of is evaluated
plot_edge_dist = TRUE
# Plot the evaluation of cut-off estimation
)
#> Esimated cut-off: 0.05105105