BCClong
is an R package for performing Bayesian
Consensus Clustering (BCC) model for clustering continuous, discrete and
categorical longitudinal data, which are commonly seen in many clinical
studies. This document gives a tour of BCClong package.
see help(package = "BCClong")
for more information and
references provided by citation("BCClong")
To download BCClong, use the following commands:
require("devtools")
devtools::install_github("ZhiwenT/BCClong", build_vignettes = TRUE)
library("BCClong")
To list all functions available in this package:
Currently, there are 5 function in this package which are BCC.multi, BayesT, model.selection.criteria, traceplot, trajplot.
BCC.multi function performs clustering on mixed-type (continuous, discrete and categorical) longitudinal markers using Bayesian consensus clustering method with MCMC sampling and provide a summary statistics for the computed model. This function will take in a data set and multiple parameters and output a BCC model with summary statistics.
BayesT function assess the model goodness of fit by calculate the discrepancy measure T(, ) with following steps (a) Generate T.obs based on the MCMC samples (b) Generate T.rep based on the posterior distribution of the parameters (c) Compare T.obs and T.rep, and calculate the P values.
model.selection.criteria function calculates DIC and WAIC for the fitted model traceplot function visualize the MCMC chain for model parameters trajplot function plot the longitudinal trajectory of features by local and global clustering
In this example, the epileptic.qol
data set from
joinrRML
package was used. The variables used here include
anxiety score
, depress score
and
AEP score
. All of the variables are continuous.
library(BCClong)
library(joineRML)
library(ggplot2)
library(cowplot)
# convert days to months
epileptic.qol$time_month <- epileptic.qol$time/30.25
# Sort by ID and time
epileptic.qol <- epileptic.qol[order(epileptic.qol$id,epileptic.qol$time_month),]
## Make Spaghetti Plots to Visualize
p1 <- ggplot(data =epileptic.qol, aes(x =time_month, y = anxiety, group = id))+
geom_point() + geom_line() +
geom_smooth(method = "loess", size = 1.5,group =1,se = FALSE, span=2) +
theme(legend.position = "none",
plot.title = element_text(size = 20, face = "bold"),
axis.text=element_text(size=20),
axis.title=element_text(size=20),
axis.text.x = element_text(angle = 0 ),
strip.text.x = element_text(size = 20, angle = 0),
strip.text.y = element_text(size = 20,face="bold")) +
xlab("Time (months)") + ylab("anxiety")
p2 <- ggplot(data =epileptic.qol, aes(x =time_month, y = depress, group = id))+
geom_point() +
geom_line() +
geom_smooth(method = "loess", size = 1.5,group =1,se = FALSE, span=2) +
theme(legend.position = "none",
plot.title = element_text(size = 20, face = "bold"),
axis.text=element_text(size=20),
axis.title=element_text(size=20),
axis.text.x = element_text(angle = 0 ),
strip.text.x = element_text(size = 20, angle = 0),
strip.text.y = element_text(size = 20,face="bold")) +
xlab("Time (months)") + ylab("depress")
p3 <- ggplot(data =epileptic.qol, aes(x =time_month, y = aep, group = id))+
geom_point() +
geom_line() +
geom_smooth(method = "loess", size = 1.5,group =1,se = FALSE, span=2) +
theme(legend.position = "none",
plot.title = element_text(size = 20, face = "bold"),
axis.text=element_text(size=20),
axis.title=element_text(size=20),
axis.text.x = element_text(angle = 0 ),
strip.text.x = element_text(size = 20, angle = 0),
strip.text.y = element_text(size = 20,face="bold")) +
xlab("Time (months)") + ylab("aep")
plot_grid(p1,NULL,p2,NULL,p3,NULL,labels=c("(A)","", "(B)","","(C)",""), nrow = 1,
align = "v", rel_widths = c(1,0.1,1,0.1,1,0.1))
We can compute the mean adjusted adherence to determine the number of clusters using the code below. Since this program takes a long time to run, this chunk of code will not run in this tutorial file.
# computed the mean adjusted adherence to determine the number of clusters
set.seed(20220929)
alpha.adjust <- NULL
DIC <- WAIC <- NULL
for (k in 1:5){
fit.BCC <- BCC.multi (
mydat = list(dat$anxiety_scale,dat$depress_scale,dat$aep_scale),
dist = c("gaussian"),
id = list(dat$id),
time = list(dat$time),
formula =list(y ~ time + (1 |id)),
num.cluster = k,
initials= NULL,
burn.in = 1000,
thin = 10,
per = 100,
max.iter = 2000)
alpha.adjust <- c(alpha.adjust, fit.BCC$alpha.adjust)
res <- model.selection.criteria(fit.BCC, fast_version=0)
DIC <- c(DIC,res$DIC)
WAIC <- c(WAIC,res$WAIC)}
num.cluster <- 1:5
par(mfrow=c(1,3))
plot(num.cluster[2:5], alpha.adjust, type="o",cex.lab=1.5,cex.axis=1.5,cex.main=1.5,lwd=2,
xlab="Number of Clusters",
ylab="mean adjusted adherence",main="mean adjusted adherence")
plot(num.cluster, DIC, type="o",cex=1.5, cex.lab=1.5,cex.axis=1.5,cex.main=1.5,lwd=2,
xlab="Number of Clusters",ylab="DIC",main="DIC")
plot(num.cluster, WAIC, type="o",cex=1.5, cex.lab=1.5,cex.axis=1.5,cex.main=1.5,lwd=2,
xlab="Number of Clusters",ylab="WAIC",main="WAIC")
Here, We used gaussian distribution for all three markers. The number of clusters was set to 2 because it has highest mean adjusted adherence. All hyper parameters were set to default.
For the purpose of this tutorial, the MCMC iteration will be set to a
small number to minimize the compile time and the result will be read
from the pre-compiled RData file using
data(epil1), data(epil1)
and data(epil1)
# Fit the final model with the number of cluster 2 (largest mean adjusted adherence)
fit.BCC2 <- BCC.multi (
mydat = list(dat$anxiety_scale,dat$depress_scale,dat$aep_scale),
dist = c("gaussian"),
id = list(dat$id),
time = list(dat$time),
formula =list(y ~ time + (1|id)),
num.cluster = 2,
burn.in = 10, # number of samples discarded
thin = 1, # thinning
per = 10, # output information every "per" iteration
max.iter = 30) # maximum number of iteration
fit.BCC2b <- BCC.multi (
mydat = list(dat$anxiety_scale,dat$depress_scale,dat$aep_scale),
dist = c("gaussian"),
id = list(dat$id),
time = list(dat$time),
formula =list(y ~ time + (1 + time|id)),
num.cluster = 2,
burn.in = 10,
thin = 1,
per = 10,
max.iter = 30)
fit.BCC2c <- BCC.multi (
mydat = list(dat$anxiety_scale,dat$depress_scale,dat$aep_scale),
dist = c("gaussian"),
id = list(dat$id),
time = list(dat$time),
formula =list(y ~ time + time2 + (1 + time|id)),
num.cluster = 2,
burn.in = 10,
thin = 1,
per = 10,
max.iter = 30)
Load the pre-compiled results
data(epil1)
data(epil2)
data(epil3)
fit.BCC2 <- epil1
fit.BCC2b <- epil2
fit.BCC2c <- epil3
fit.BCC2b$cluster.global <- factor(fit.BCC2b$cluster.global,
labels=c("Cluster 1","Cluster 2"))
table(fit.BCC2$cluster.global, fit.BCC2b$cluster.global)
#>
#> Cluster 1 Cluster 2
#> 1 231 4
#> 2 14 291
fit.BCC2c$cluster.global <- factor(fit.BCC2c$cluster.global,
labels=c("Cluster 1","Cluster 2"))
table(fit.BCC2$cluster.global, fit.BCC2c$cluster.global)
#>
#> Cluster 1 Cluster 2
#> 1 228 7
#> 2 6 299
To print the BCC model
To print the summary statistics for all parameters
To print the proportion for each cluster (mean, sd, 2.5% and 97.5% percentile) geweke statistics (geweke.stat) between -2 and 2 suggests the parameters converge
The code below prints out all major parameters
summary(fit.BCC2)
#> Total number of individual:
#> [1] 540
#>
#> Number of features:
#> [1] 3
#>
#> Cluster proportions statistics for global clusters:
#> V1 V2
#> Min. :0.4084 Min. :0.5288
#> 1st Qu.:0.4234 1st Qu.:0.5521
#> Median :0.4423 Median :0.5577
#> Mean :0.4379 Mean :0.5621
#> 3rd Qu.:0.4479 3rd Qu.:0.5766
#> Max. :0.4712 Max. :0.5916
#>
#> Globle clusters table:
#>
#> 1 2
#> 235 305
#>
#> Adherence parameters statistics by feature:
#> V1 V2 V3
#> Min. :0.9412 Min. :0.8999 Min. :0.9088
#> 1st Qu.:0.9593 1st Qu.:0.9160 1st Qu.:0.9200
#> Median :0.9655 Median :0.9267 Median :0.9257
#> Mean :0.9669 Mean :0.9251 Mean :0.9254
#> 3rd Qu.:0.9777 3rd Qu.:0.9330 3rd Qu.:0.9304
#> Max. :0.9901 Max. :0.9478 Max. :0.9429
#>
#> Local clusters statistics by feature:
#> Cluster statistics for feature 1 :
#> , , 1
#>
#> [,1] [,2]
#> mean 0.87729642 -0.64459070
#> sd 0.01927627 0.01666083
#> 2.5% 0.85349682 -0.66739415
#> 97.5% 0.91306409 -0.61823693
#> geweke.stat -0.19942773 0.29949578
#>
#> , , 2
#>
#> [,1] [,2]
#> mean -1.074188e-04 -1.576562e-04
#> sd 7.759632e-05 5.338211e-05
#> 2.5% -2.203863e-04 -2.533276e-04
#> 97.5% 3.290686e-05 -8.484365e-05
#> geweke.stat -2.814307e+00 7.024379e-01
#>
#> Cluster statistics for feature 2 :
#> , , 1
#>
#> [,1] [,2]
#> mean 0.88166260 -0.5921965
#> sd 0.02881597 0.0219339
#> 2.5% 0.83165270 -0.6358678
#> 97.5% 0.93839108 -0.5559157
#> geweke.stat -1.50786195 -1.5791791
#>
#> , , 2
#>
#> [,1] [,2]
#> mean -7.530200e-05 -2.009088e-04
#> sd 7.605123e-05 6.210093e-05
#> 2.5% -1.841543e-04 -3.131674e-04
#> 97.5% 7.958110e-05 -1.112733e-04
#> geweke.stat -9.565325e-01 3.748582e-01
#>
#> Cluster statistics for feature 3 :
#> , , 1
#>
#> [,1] [,2]
#> mean 0.79904742 -0.71860049
#> sd 0.01328303 0.01553592
#> 2.5% 0.78206752 -0.74887774
#> 97.5% 0.82017453 -0.69286688
#> geweke.stat -5.73392753 -1.63767793
#>
#> , , 2
#>
#> [,1] [,2]
#> mean 5.700862e-05 -2.019033e-04
#> sd 3.521174e-05 2.627128e-05
#> 2.5% 5.222274e-06 -2.475186e-04
#> 97.5% 1.338561e-04 -1.615889e-04
#> geweke.stat 1.888959e+00 -2.243578e+00
#>
#>
#> Variance-covariance matrix statistics for random effects by feature:
#> Variance-covariance matrix statistics for feature 1 :
#> , , 1
#>
#> [,1]
#> mean 2.936789e-05
#> sd 3.400319e-05
#> 2.5% 1.309234e-05
#> 97.5% 1.228279e-04
#> geweke.stat 3.123410e+00
#>
#> , , 2
#>
#> [,1]
#> mean 2.667787e-05
#> sd 3.324741e-05
#> 2.5% 1.177405e-05
#> 97.5% 1.177560e-04
#> geweke.stat 2.894896e+00
#>
#> Variance-covariance matrix statistics for feature 2 :
#> , , 1
#>
#> [,1]
#> mean 2.472793e-05
#> sd 1.884985e-05
#> 2.5% 1.377823e-05
#> 97.5% 7.222922e-05
#> geweke.stat 3.155393e+00
#>
#> , , 2
#>
#> [,1]
#> mean 2.111768e-05
#> sd 1.730134e-05
#> 2.5% 1.125047e-05
#> 97.5% 6.841069e-05
#> geweke.stat 3.381524e+00
#>
#> Variance-covariance matrix statistics for feature 3 :
#> , , 1
#>
#> [,1]
#> mean 3.120109e-05
#> sd 3.311745e-05
#> 2.5% 1.271328e-05
#> 97.5% 1.121231e-04
#> geweke.stat 4.103521e+00
#>
#> , , 2
#>
#> [,1]
#> mean 3.214810e-05
#> sd 3.918276e-05
#> 2.5% 1.320204e-05
#> 97.5% 1.399506e-04
#> geweke.stat 3.717880e+00
#>
#>
#> Residual variance of continuous features statistics by feature:
#> Residual variance statistics for feature 1 :
#> [,1] [,2]
#> mean 0.43798839 0.43798839
#> sd 0.02039686 0.02039686
#> 2.5% 0.40803863 0.40803863
#> 97.5% 0.47957980 0.47957980
#> geweke.stat 1.32597079 1.32597079
#> Residual variance statistics for feature 2 :
#> [,1] [,2]
#> mean 0.47252164 0.47252164
#> sd 0.01239170 0.01239170
#> 2.5% 0.45219075 0.45219075
#> 97.5% 0.49519381 0.49519381
#> geweke.stat 0.06409189 0.06409189
#> Residual variance statistics for feature 3 :
#> [,1] [,2]
#> mean 0.42012860 0.42012860
#> sd 0.01331219 0.01331219
#> 2.5% 0.39710081 0.39710081
#> 97.5% 0.44412052 0.44412052
#> geweke.stat 1.20076131 1.20076131
#>
#> Local clusters tables by feature:
#> Clusters table for feature 1 :
#>
#> 1 2
#> 233 307
#> Clusters table for feature 2 :
#>
#> 1 2
#> 224 316
#> Clusters table for feature 3 :
#>
#> 1 2
#> 260 280
Generic plot can be used on BCC object, all relevant plots will be generate one by one using return key
We can also use the traceplot function to plot the MCMC process and the trajplot function to plot the trajectory for each feature.
#=====================================================#
# Trace-plot for key model parameters
#=====================================================#
traceplot(fit=fit.BCC2, parameter="PPI",ylab="pi",xlab="MCMC samples")
traceplot(fit=fit.BCC2,cluster.indx = 1, feature.indx=1,parameter="GA",ylab="GA",xlab="MCMC samples")
traceplot(fit=fit.BCC2,cluster.indx = 1, feature.indx=2,parameter="GA",ylab="GA",xlab="MCMC samples")
traceplot(fit=fit.BCC2,cluster.indx = 1, feature.indx=3,parameter="GA",ylab="GA",xlab="MCMC samples")
traceplot(fit=fit.BCC2,cluster.indx = 2, feature.indx=1,parameter="GA",ylab="GA",xlab="MCMC samples")
traceplot(fit=fit.BCC2,cluster.indx = 2, feature.indx=2,parameter="GA",ylab="GA",xlab="MCMC samples")
traceplot(fit=fit.BCC2,cluster.indx = 2, feature.indx=3,parameter="GA",ylab="GA",xlab="MCMC samples")
#=====================================================#
# Trajectory plot for features
#=====================================================#
gp1 <- trajplot(fit=fit.BCC2,feature.ind=1,
which.cluster = "local.cluster",
title= bquote(paste("Local Clustering (",hat(alpha)[1] ==.(round(fit.BCC2$alpha[1],2)),")")),
xlab="time (months)",ylab="anxiety",color=c("#00BA38", "#619CFF"))
gp2 <- trajplot(fit=fit.BCC2,feature.ind=2,
which.cluster = "local.cluster",
title= bquote(paste("Local Clustering (",hat(alpha)[2] ==.(round(fit.BCC2$alpha[2],2)),")")),
xlab="time (months)",ylab="depress",color=c("#00BA38", "#619CFF"))
gp3 <- trajplot(fit=fit.BCC2,feature.ind=3,
which.cluster = "local.cluster",
title= bquote(paste("Local Clustering (",hat(alpha)[3] ==.(round(fit.BCC2$alpha[3],2)),")")),
xlab="time (months)",ylab="aep",color=c("#00BA38", "#619CFF"))
gp4 <- trajplot(fit=fit.BCC2,feature.ind=1,
which.cluster = "global.cluster",
title="Global Clustering",xlab="time (months)",ylab="anxiety",color=c("#00BA38", "#619CFF"))
gp5 <- trajplot(fit=fit.BCC2,feature.ind=2,
which.cluster = "global.cluster",
title="Global Clustering",xlab="time (months)",ylab="depress",color=c("#00BA38", "#619CFF"))
gp6 <- trajplot(fit=fit.BCC2,feature.ind=3,
which.cluster = "global.cluster",
title="Global Clustering",
xlab="time (months)",ylab="aep",color=c("#00BA38", "#619CFF"))
library(cowplot)
plot_grid(gp1, gp2,gp3,gp4,gp5,gp6,
labels=c("(A)", "(B)", "(C)", "(D)", "(E)", "(F)"),
ncol = 3, align = "v" )
The BayesT function will be used for
posterior check. Here we used the pre-compiled results, un-comment the
line res <- BayesT(fit=fit.BCC2)
to try your own. The
pre-compiled data file can be attached using data("conRes")
For this function, the p-value between 0.3 to 0.7 was consider
reasonable. In the scatter plot, the data pints should be evenly
distributed around y = x.
sessionInfo()
#> R version 4.3.2 (2023-10-31 ucrt)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 11 x64 (build 22631)
#>
#> Matrix products: default
#>
#>
#> locale:
#> [1] LC_COLLATE=C
#> [2] LC_CTYPE=English_United States.utf8
#> [3] LC_MONETARY=English_United States.utf8
#> [4] LC_NUMERIC=C
#> [5] LC_TIME=English_United States.utf8
#>
#> time zone: America/Toronto
#> tzcode source: internal
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] cowplot_1.1.3 ggplot2_3.5.0 joineRML_0.4.6 survival_3.5-7 nlme_3.1-163
#> [6] BCClong_1.0.3
#>
#> loaded via a namespace (and not attached):
#> [1] gtable_0.3.4 xfun_0.41 bslib_0.6.1
#> [4] lattice_0.21-9 vctrs_0.6.4 tools_4.3.2
#> [7] generics_0.1.3 stats4_4.3.2 parallel_4.3.2
#> [10] tibble_3.2.1 fansi_1.0.6 highr_0.10
#> [13] cluster_2.1.4 pkgconfig_2.0.3 Matrix_1.6-5
#> [16] lifecycle_1.0.4 farver_2.1.1 compiler_4.3.2
#> [19] mixAK_5.7 MatrixModels_0.5-3 mcmc_0.9-8
#> [22] munsell_0.5.0 mnormt_2.1.1 combinat_0.0-8
#> [25] fastGHQuad_1.0.1 codetools_0.2-19 SparseM_1.81
#> [28] quantreg_5.97 htmltools_0.5.7 sass_0.4.8
#> [31] evd_2.3-6.1 yaml_2.3.8 gmp_0.7-4
#> [34] pillar_1.9.0 nloptr_2.0.3 jquerylib_0.1.4
#> [37] MASS_7.3-60 randtoolbox_2.0.4 truncdist_1.0-2
#> [40] cachem_1.0.8 iterators_1.0.14 foreach_1.5.2
#> [43] boot_1.3-28.1 abind_1.4-5 mclust_6.0.1
#> [46] tidyselect_1.2.0 digest_0.6.34 mvtnorm_1.2-4
#> [49] LaplacesDemon_16.1.6 dplyr_1.1.4 labeling_0.4.3
#> [52] splines_4.3.2 fastmap_1.1.1 grid_4.3.2
#> [55] colorspace_2.1-0 cli_3.6.1 magrittr_2.0.3
#> [58] cobs_1.3-7 label.switching_1.8 utf8_1.2.4
#> [61] withr_3.0.0 Rmpfr_0.9-5 scales_1.3.0
#> [64] rmarkdown_2.25 rngWELL_0.10-9 nnet_7.3-19
#> [67] lme4_1.1-35.1 gridExtra_2.3 coda_0.19-4.1
#> [70] evaluate_0.23 lpSolve_5.6.20 knitr_1.45
#> [73] doParallel_1.0.17 mgcv_1.9-0 rlang_1.1.1
#> [76] MCMCpack_1.7-0 Rcpp_1.0.12 glue_1.6.2
#> [79] rstudioapi_0.15.0 minqa_1.2.6 jsonlite_1.8.8
#> [82] R6_2.5.1