lightgbm: Multiclass Classification

# nolint start
library(mlexperiments)
library(mllrnrs)

See https://github.com/kapsner/mllrnrs/blob/main/R/learner_lightgbm.R for implementation details.

Preprocessing

Import and Prepare Data

# Load the DNA benchmark data shipped with mlbench, convert it to a
# data.table and drop every row that contains a missing value.
library(mlbench)
data("DNA")
dataset <- DNA |>
  data.table::as.data.table() |>
  na.omit()

# The target column is named explicitly; every other column is used as a
# feature. Deriving the feature names (instead of indexing a fixed number
# of columns) keeps the selection correct if the column count changes.
target_col <- "Class"
feature_cols <- setdiff(colnames(dataset), target_col)

General Configurations

# Seed reused for the data partition, the fold creation and all
# experiments further down.
seed <- 123

# Number of cores handed to the experiments:
# - 2 when `_R_CHECK_LIMIT_CORES_` is set to a true value (CRAN checks),
# - otherwise the detected core count, clamped to the range [1, 4].
if (isTRUE(as.logical(Sys.getenv("_R_CHECK_LIMIT_CORES_")))) {
  # on cran
  ncores <- 2L
} else {
  # Query the core count once; detectCores() returns NA when the number
  # of cores cannot be determined, in which case fall back to one core.
  available_cores <- parallel::detectCores()
  if (is.na(available_cores)) {
    available_cores <- 1L
  }
  ncores <- min(4L, max(1L, available_cores))
}

# Package options consumed during hyperparameter tuning (presumably the
# cap on the bayesian initialization grid, the number of boosting rounds
# and the early-stopping patience -- confirm in the package docs).
options("mlexperiments.bayesian.max_init" = 10L)
options("mlexperiments.optim.lgb.nrounds" = 100L)
options("mlexperiments.optim.lgb.early_stopping_rounds" = 10L)

Generate Training and Test Data

# Stratified 70/30 partition of the row indices, keyed on the target.
data_split <- splitTools::partition(
  y = dataset[[target_col]],
  p = c(train = 0.7, test = 0.3),
  type = "stratified",
  seed = seed
)

# Training set: feature columns expanded to a numeric design matrix
# without an intercept column.
train_x <- model.matrix(
  object = ~ -1 + .,
  data = dataset[data_split$train, feature_cols, with = FALSE]
)
# Integer codes of the target shifted to start at 0
# (assumes the target column is a factor -- TODO confirm).
train_y <- as.integer(dataset[[target_col]][data_split$train]) - 1L


# Test set: same encoding as the training set.
test_x <- model.matrix(
  object = ~ -1 + .,
  data = dataset[data_split$test, feature_cols, with = FALSE]
)
test_y <- as.integer(dataset[[target_col]][data_split$test]) - 1L

Generate Training Data Folds

# Three stratified folds over the training labels, seeded with `seed`;
# reused below by the cross-validation experiments.
fold_list <- splitTools::create_folds(
  y = train_y, k = 3, type = "stratified", seed = seed
)

Experiments

Prepare Experiments

# required learner arguments, not optimized
# NOTE(review): num_class is passed as the string "3" and is kept
# unchanged here -- confirm whether an integer is intended.
learner_args <- list(
  max_depth = -1L,
  verbose = -1L,
  objective = "multiclass",
  metric = "multi_logloss",
  num_class = "3"
)

# set arguments for predict function and performance metric,
# required for mlexperiments::MLCrossValidation and
# mlexperiments::MLNestedCV
predict_args <- list(reshape = TRUE)
# namespace-qualified like the other package calls in this file, so the
# lookup does not depend on the attach order of the loaded packages
performance_metric <- mlexperiments::metric("bacc")
performance_metric_args <- NULL
return_models <- FALSE

# required for grid search and initialization of bayesian optimization
parameter_grid <- expand.grid(
  bagging_fraction = seq(0.6, 1, .2),
  feature_fraction = seq(0.6, 1, .2),
  min_data_in_leaf = seq(2, 10, 2),
  learning_rate = seq(0.1, 0.2, 0.1),
  num_leaves = seq(2, 20, 4)
)
# reduce to a maximum of 10 rows; the rows are drawn with the shared
# `seed` so the subset is reproducible and stays in sync with the
# seed used by the rest of the experiments
max_grid_rows <- 10L
if (nrow(parameter_grid) > max_grid_rows) {
  set.seed(seed)
  sample_rows <- sample(
    seq_len(nrow(parameter_grid)),
    size = max_grid_rows,
    replace = FALSE
  )
  parameter_grid <- kdry::mlh_subset(parameter_grid, sample_rows)
}

# required for bayesian optimization
parameter_bounds <- list(
  bagging_fraction = c(0.2, 1),
  feature_fraction = c(0.2, 1),
  min_data_in_leaf = c(2L, 12L),
  learning_rate = c(0.1, 0.2),
  num_leaves =  c(2L, 20L)
)
# extra arguments forwarded to the bayesian optimizer; `iters.n` is tied
# to the number of cores (presumably one evaluation per worker and
# iteration -- confirm in the optimizer documentation)
optim_args <- list(
  iters.n = ncores,
  kappa = 3.5,
  acq = "ucb"
)

Hyperparameter Tuning

# Grid search over `parameter_grid`; the learner is configured so that
# lower values of the optimization metric are treated as better.
lgb_learner <- mllrnrs::LearnerLightgbm$new(
  metric_optimization_higher_better = FALSE
)
tuner <- mlexperiments::MLTuneParameters$new(
  learner = lgb_learner,
  strategy = "grid",
  ncores = ncores,
  seed = seed
)

# Candidate settings, fixed learner arguments and fold type.
tuner$parameter_grid <- parameter_grid
tuner$learner_args <- learner_args
tuner$split_type <- "stratified"

# Tune on the training data only.
tuner$set_data(x = train_x, y = train_y)

# Run the search with k = 3.
tuner_results_grid <- tuner$execute(k = 3)
#> [LightGBM] [Info] Start training from score -1.422637
#> [LightGBM] [Info] Start training from score -1.428239
#> [LightGBM] [Info] Start training from score -0.655482
#> [LightGBM] [Info] Start training from score -1.424655
#> [LightGBM] [Info] Start training from score -1.424655
#> [LightGBM] [Info] Start training from score -0.656204
#> [LightGBM] [Info] Start training from score -1.422637
#> [LightGBM] [Info] Start training from score -1.428239
#> [LightGBM] [Info] Start training from score -0.655482
#> 
#> Parameter settings [==================>-----------------------------------------------------------------------------] 2/10 ( 20%)
#> [LightGBM] [Info] Start training from score -1.422637
#> [LightGBM] [Info] Start training from score -1.428239
#> [LightGBM] [Info] Start training from score -0.655482
#> [LightGBM] [Info] Start training from score -1.424655
#> [LightGBM] [Info] Start training from score -1.424655
#> [LightGBM] [Info] Start training from score -0.656204
#> [LightGBM] [Info] Start training from score -1.422637
#> [LightGBM] [Info] Start training from score -1.428239
#> [LightGBM] [Info] Start training from score -0.655482
#> 
#> Parameter settings [============================>-------------------------------------------------------------------] 3/10 ( 30%)
#> [LightGBM] [Info] Start training from score -1.422637
#> [LightGBM] [Info] Start training from score -1.428239
#> [LightGBM] [Info] Start training from score -0.655482
#> [LightGBM] [Info] Start training from score -1.424655
#> [LightGBM] [Info] Start training from score -1.424655
#> [LightGBM] [Info] Start training from score -0.656204
#> [LightGBM] [Info] Start training from score -1.422637
#> [LightGBM] [Info] Start training from score -1.428239
#> [LightGBM] [Info] Start training from score -0.655482
#> 
#> Parameter settings [=====================================>----------------------------------------------------------] 4/10 ( 40%)
#> [LightGBM] [Info] Start training from score -1.422637
#> [LightGBM] [Info] Start training from score -1.428239
#> [LightGBM] [Info] Start training from score -0.655482
#> [LightGBM] [Info] Start training from score -1.424655
#> [LightGBM] [Info] Start training from score -1.424655
#> [LightGBM] [Info] Start training from score -0.656204
#> [LightGBM] [Info] Start training from score -1.422637
#> [LightGBM] [Info] Start training from score -1.428239
#> [LightGBM] [Info] Start training from score -0.655482
#> 
#> Parameter settings [===============================================>------------------------------------------------] 5/10 ( 50%)
#> [LightGBM] [Info] Start training from score -1.422637
#> [LightGBM] [Info] Start training from score -1.428239
#> [LightGBM] [Info] Start training from score -0.655482
#> [LightGBM] [Info] Start training from score -1.424655
#> [LightGBM] [Info] Start training from score -1.424655
#> [LightGBM] [Info] Start training from score -0.656204
#> [LightGBM] [Info] Start training from score -1.422637
#> [LightGBM] [Info] Start training from score -1.428239
#> [LightGBM] [Info] Start training from score -0.655482
#> 
#> Parameter settings [=========================================================>--------------------------------------] 6/10 ( 60%)
#> [LightGBM] [Info] Start training from score -1.422637
#> [LightGBM] [Info] Start training from score -1.428239
#> [LightGBM] [Info] Start training from score -0.655482
#> [LightGBM] [Info] Start training from score -1.424655
#> [LightGBM] [Info] Start training from score -1.424655
#> [LightGBM] [Info] Start training from score -0.656204
#> [LightGBM] [Info] Start training from score -1.422637
#> [LightGBM] [Info] Start training from score -1.428239
#> [LightGBM] [Info] Start training from score -0.655482
#> 
#> Parameter settings [==================================================================>-----------------------------] 7/10 ( 70%)
#> [LightGBM] [Info] Start training from score -1.422637
#> [LightGBM] [Info] Start training from score -1.428239
#> [LightGBM] [Info] Start training from score -0.655482
#> [LightGBM] [Info] Start training from score -1.424655
#> [LightGBM] [Info] Start training from score -1.424655
#> [LightGBM] [Info] Start training from score -0.656204
#> [LightGBM] [Info] Start training from score -1.422637
#> [LightGBM] [Info] Start training from score -1.428239
#> [LightGBM] [Info] Start training from score -0.655482
#> 
#> Parameter settings [============================================================================>-------------------] 8/10 ( 80%)
#> [LightGBM] [Info] Start training from score -1.422637
#> [LightGBM] [Info] Start training from score -1.428239
#> [LightGBM] [Info] Start training from score -0.655482
#> [LightGBM] [Info] Start training from score -1.424655
#> [LightGBM] [Info] Start training from score -1.424655
#> [LightGBM] [Info] Start training from score -0.656204
#> [LightGBM] [Info] Start training from score -1.422637
#> [LightGBM] [Info] Start training from score -1.428239
#> [LightGBM] [Info] Start training from score -0.655482
#> 
#> Parameter settings [=====================================================================================>----------] 9/10 ( 90%)
#> [LightGBM] [Info] Start training from score -1.422637
#> [LightGBM] [Info] Start training from score -1.428239
#> [LightGBM] [Info] Start training from score -0.655482
#> [LightGBM] [Info] Start training from score -1.424655
#> [LightGBM] [Info] Start training from score -1.424655
#> [LightGBM] [Info] Start training from score -0.656204
#> [LightGBM] [Info] Start training from score -1.422637
#> [LightGBM] [Info] Start training from score -1.428239
#> [LightGBM] [Info] Start training from score -0.655482
#> 
#> Parameter settings [===============================================================================================] 10/10 (100%)                                                                                                                                  
#> [LightGBM] [Info] Start training from score -1.422637
#> [LightGBM] [Info] Start training from score -1.428239
#> [LightGBM] [Info] Start training from score -0.655482
#> [LightGBM] [Info] Start training from score -1.424655
#> [LightGBM] [Info] Start training from score -1.424655
#> [LightGBM] [Info] Start training from score -0.656204
#> [LightGBM] [Info] Start training from score -1.422637
#> [LightGBM] [Info] Start training from score -1.428239
#> [LightGBM] [Info] Start training from score -0.655482

utils::head(tuner_results_grid, n = 6L)
#>    setting_id metric_optim_mean nrounds bagging_fraction feature_fraction min_data_in_leaf learning_rate num_leaves max_depth
#> 1:          1         0.1353093      33              0.6              0.6                4           0.2         18        -1
#> 2:          2         0.1282925      59              0.8              1.0               10           0.2          6        -1
#> 3:          3         0.2360723     100              0.8              0.8                4           0.1          2        -1
#> 4:          4         0.1298904      71              1.0              0.8                4           0.1         10        -1
#> 5:          5         0.1357692      32              1.0              0.6                6           0.2         18        -1
#> 6:          6         0.1313455      64              1.0              1.0                8           0.1         14        -1
#>    verbose  objective        metric num_class
#> 1:      -1 multiclass multi_logloss         3
#> 2:      -1 multiclass multi_logloss         3
#> 3:      -1 multiclass multi_logloss         3
#> 4:      -1 multiclass multi_logloss         3
#> 5:      -1 multiclass multi_logloss         3
#> 6:      -1 multiclass multi_logloss         3

Bayesian Optimization

# Bayesian optimization: `parameter_grid` and `parameter_bounds` are both
# supplied, together with the optimizer arguments in `optim_args`.
lgb_learner <- mllrnrs::LearnerLightgbm$new(
  metric_optimization_higher_better = FALSE
)
tuner <- mlexperiments::MLTuneParameters$new(
  learner = lgb_learner,
  strategy = "bayesian",
  ncores = ncores,
  seed = seed
)

# Initial settings and the search bounds.
tuner$parameter_grid <- parameter_grid
tuner$parameter_bounds <- parameter_bounds

# Fixed learner arguments and optimizer configuration.
tuner$learner_args <- learner_args
tuner$optim_args <- optim_args

tuner$split_type <- "stratified"

# Tune on the training data only.
tuner$set_data(x = train_x, y = train_y)

# Run the optimization with k = 3.
tuner_results_bayesian <- tuner$execute(k = 3)
#> 
#> Registering parallel backend using 4 cores.

utils::head(tuner_results_bayesian, n = 6L)
#>    Epoch setting_id bagging_fraction feature_fraction min_data_in_leaf learning_rate num_leaves gpUtility acqOptimum inBounds
#> 1:     0          1              0.6              0.6                4           0.2         18        NA      FALSE     TRUE
#> 2:     0          2              0.8              1.0               10           0.2          6        NA      FALSE     TRUE
#> 3:     0          3              0.8              0.8                4           0.1          2        NA      FALSE     TRUE
#> 4:     0          4              1.0              0.8                4           0.1         10        NA      FALSE     TRUE
#> 5:     0          5              1.0              0.6                6           0.2         18        NA      FALSE     TRUE
#> 6:     0          6              1.0              1.0                8           0.1         14        NA      FALSE     TRUE
#>    Elapsed      Score metric_optim_mean nrounds errorMessage max_depth verbose  objective        metric num_class
#> 1:   1.283 -0.1353093         0.1353093      33           NA        -1      -1 multiclass multi_logloss         3
#> 2:   1.300 -0.1282925         0.1282925      59           NA        -1      -1 multiclass multi_logloss         3
#> 3:   1.277 -0.2360723         0.2360723     100           NA        -1      -1 multiclass multi_logloss         3
#> 4:   1.460 -0.1298904         0.1298904      71           NA        -1      -1 multiclass multi_logloss         3
#> 5:   0.360 -0.1357692         0.1357692      32           NA        -1      -1 multiclass multi_logloss         3
#> 6:   0.561 -0.1313455         0.1313455      64           NA        -1      -1 multiclass multi_logloss         3

k-Fold Cross Validation

# k-fold cross-validation on the predefined folds in `fold_list`.
lgb_learner <- mllrnrs::LearnerLightgbm$new(
  metric_optimization_higher_better = FALSE
)
validator <- mlexperiments::MLCrossValidation$new(
  learner = lgb_learner,
  fold_list = fold_list,
  ncores = ncores,
  seed = seed
)

# Reuse the best setting found by the most recent tuner run, without its
# first element (presumably the setting identifier -- confirm).
validator$learner_args <- tuner$results$best.setting[-1]

# Prediction and performance-metric configuration.
validator$predict_args <- predict_args
validator$performance_metric <- performance_metric
validator$performance_metric_args <- performance_metric_args
validator$return_models <- return_models

# Validate on the training data.
validator$set_data(x = train_x, y = train_y)

validator_results <- validator$execute()
#> 
#> CV fold: Fold1
#> 
#> CV fold: Fold2
#> CV progress [====================================================================>-----------------------------------] 2/3 ( 67%)
#> 
#> CV fold: Fold3
#> CV progress [========================================================================================================] 3/3 (100%)
#>                                                                                                                                   

utils::head(validator_results, n = 6L)
#>     fold performance bagging_fraction feature_fraction min_data_in_leaf learning_rate num_leaves nrounds max_depth verbose
#> 1: Fold1   0.9674260              0.8              0.6                8           0.1         14      66        -1      -1
#> 2: Fold2   0.9534347              0.8              0.6                8           0.1         14      66        -1      -1
#> 3: Fold3   0.9549840              0.8              0.6                8           0.1         14      66        -1      -1
#>     objective        metric num_class
#> 1: multiclass multi_logloss         3
#> 2: multiclass multi_logloss         3
#> 3: multiclass multi_logloss         3

Nested Cross Validation

# Nested cross-validation: grid-search tuning with `k_tuning = 3L`,
# evaluated on the predefined folds in `fold_list`.
lgb_learner <- mllrnrs::LearnerLightgbm$new(
  metric_optimization_higher_better = FALSE
)
validator <- mlexperiments::MLNestedCV$new(
  learner = lgb_learner,
  strategy = "grid",
  fold_list = fold_list,
  k_tuning = 3L,
  ncores = ncores,
  seed = seed
)

# Tuning configuration.
validator$parameter_grid <- parameter_grid
validator$learner_args <- learner_args
validator$split_type <- "stratified"

# Prediction and performance-metric configuration.
validator$predict_args <- predict_args
validator$performance_metric <- performance_metric
validator$performance_metric_args <- performance_metric_args
validator$return_models <- return_models

# Validate on the training data.
validator$set_data(x = train_x, y = train_y)

validator_results <- validator$execute()
#> 
#> CV fold: Fold1
#> [LightGBM] [Info] Start training from score -1.421241
#> [LightGBM] [Info] Start training from score -1.429645
#> [LightGBM] [Info] Start training from score -0.655482
#> [LightGBM] [Info] Start training from score -1.424424
#> [LightGBM] [Info] Start training from score -1.428634
#> [LightGBM] [Info] Start training from score -0.654471
#> [LightGBM] [Info] Start training from score -1.422251
#> [LightGBM] [Info] Start training from score -1.426444
#> [LightGBM] [Info] Start training from score -0.656491
#> [LightGBM] [Info] Start training from score -1.421241
#> [LightGBM] [Info] Start training from score -1.429645
#> [LightGBM] [Info] Start training from score -0.655482
#> [LightGBM] [Info] Start training from score -1.424424
#> [LightGBM] [Info] Start training from score -1.428634
#> [LightGBM] [Info] Start training from score -0.654471
#> [LightGBM] [Info] Start training from score -1.422251
#> [LightGBM] [Info] Start training from score -1.426444
#> [LightGBM] [Info] Start training from score -0.656491
#> 
#> 
#> Parameter settings [============================>-------------------------------------------------------------------] 3/10 ( 30%)
#> [LightGBM] [Info] Start training from score -1.421241
#> [LightGBM] [Info] Start training from score -1.429645
#> [LightGBM] [Info] Start training from score -0.655482
#> [LightGBM] [Info] Start training from score -1.424424
#> [LightGBM] [Info] Start training from score -1.428634
#> [LightGBM] [Info] Start training from score -0.654471
#> [LightGBM] [Info] Start training from score -1.422251
#> [LightGBM] [Info] Start training from score -1.426444
#> [LightGBM] [Info] Start training from score -0.656491
#> 
#> Parameter settings [=====================================>----------------------------------------------------------] 4/10 ( 40%)
#> [LightGBM] [Info] Start training from score -1.421241
#> [LightGBM] [Info] Start training from score -1.429645
#> [LightGBM] [Info] Start training from score -0.655482
#> [LightGBM] [Info] Start training from score -1.424424
#> [LightGBM] [Info] Start training from score -1.428634
#> [LightGBM] [Info] Start training from score -0.654471
#> [LightGBM] [Info] Start training from score -1.422251
#> [LightGBM] [Info] Start training from score -1.426444
#> [LightGBM] [Info] Start training from score -0.656491
#> 
#> Parameter settings [===============================================>------------------------------------------------] 5/10 ( 50%)
#> [LightGBM] [Info] Start training from score -1.421241
#> [LightGBM] [Info] Start training from score -1.429645
#> [LightGBM] [Info] Start training from score -0.655482
#> [LightGBM] [Info] Start training from score -1.424424
#> [LightGBM] [Info] Start training from score -1.428634
#> [LightGBM] [Info] Start training from score -0.654471
#> [LightGBM] [Info] Start training from score -1.422251
#> [LightGBM] [Info] Start training from score -1.426444
#> [LightGBM] [Info] Start training from score -0.656491
#> 
#> Parameter settings [=========================================================>--------------------------------------] 6/10 ( 60%)
#> [LightGBM] [Info] Start training from score -1.421241
#> [LightGBM] [Info] Start training from score -1.429645
#> [LightGBM] [Info] Start training from score -0.655482
#> [LightGBM] [Info] Start training from score -1.424424
#> [LightGBM] [Info] Start training from score -1.428634
#> [LightGBM] [Info] Start training from score -0.654471
#> [LightGBM] [Info] Start training from score -1.422251
#> [LightGBM] [Info] Start training from score -1.426444
#> [LightGBM] [Info] Start training from score -0.656491
#> 
#> Parameter settings [==================================================================>-----------------------------] 7/10 ( 70%)
#> [LightGBM] [Info] Start training from score -1.421241
#> [LightGBM] [Info] Start training from score -1.429645
#> [LightGBM] [Info] Start training from score -0.655482
#> [LightGBM] [Info] Start training from score -1.424424
#> [LightGBM] [Info] Start training from score -1.428634
#> [LightGBM] [Info] Start training from score -0.654471
#> [LightGBM] [Info] Start training from score -1.422251
#> [LightGBM] [Info] Start training from score -1.426444
#> [LightGBM] [Info] Start training from score -0.656491
#> 
#> Parameter settings [============================================================================>-------------------] 8/10 ( 80%)
#> [LightGBM] [Info] Start training from score -1.421241
#> [LightGBM] [Info] Start training from score -1.429645
#> [LightGBM] [Info] Start training from score -0.655482
#> [LightGBM] [Info] Start training from score -1.424424
#> [LightGBM] [Info] Start training from score -1.428634
#> [LightGBM] [Info] Start training from score -0.654471
#> [LightGBM] [Info] Start training from score -1.422251
#> [LightGBM] [Info] Start training from score -1.426444
#> [LightGBM] [Info] Start training from score -0.656491
#> 
#> Parameter settings [=====================================================================================>----------] 9/10 ( 90%)
#> [LightGBM] [Info] Start training from score -1.421241
#> [LightGBM] [Info] Start training from score -1.429645
#> [LightGBM] [Info] Start training from score -0.655482
#> [LightGBM] [Info] Start training from score -1.424424
#> [LightGBM] [Info] Start training from score -1.428634
#> [LightGBM] [Info] Start training from score -0.654471
#> [LightGBM] [Info] Start training from score -1.422251
#> [LightGBM] [Info] Start training from score -1.426444
#> [LightGBM] [Info] Start training from score -0.656491
#> 
#> Parameter settings [===============================================================================================] 10/10 (100%)                                                                                                                                  
#> [LightGBM] [Info] Start training from score -1.421241
#> [LightGBM] [Info] Start training from score -1.429645
#> [LightGBM] [Info] Start training from score -0.655482
#> [LightGBM] [Info] Start training from score -1.424424
#> [LightGBM] [Info] Start training from score -1.428634
#> [LightGBM] [Info] Start training from score -0.654471
#> [LightGBM] [Info] Start training from score -1.422251
#> [LightGBM] [Info] Start training from score -1.426444
#> [LightGBM] [Info] Start training from score -0.656491
#> 
#> CV fold: Fold2
#> CV progress [====================================================================>-----------------------------------] 2/3 ( 67%)
#> [LightGBM] [Info] Start training from score -1.423260
#> [LightGBM] [Info] Start training from score -1.427452
#> [LightGBM] [Info] Start training from score -0.655556
#> [LightGBM] [Info] Start training from score -1.428460
#> [LightGBM] [Info] Start training from score -1.420092
#> [LightGBM] [Info] Start training from score -0.656564
#> [LightGBM] [Info] Start training from score -1.422251
#> [LightGBM] [Info] Start training from score -1.426444
#> [LightGBM] [Info] Start training from score -0.656491
#> [LightGBM] [Info] Start training from score -1.423260
#> [LightGBM] [Info] Start training from score -1.427452
#> [LightGBM] [Info] Start training from score -0.655556
#> [LightGBM] [Info] Start training from score -1.428460
#> [LightGBM] [Info] Start training from score -1.420092
#> [LightGBM] [Info] Start training from score -0.656564
#> [LightGBM] [Info] Start training from score -1.422251
#> [LightGBM] [Info] Start training from score -1.426444
#> [LightGBM] [Info] Start training from score -0.656491
#> 
#> Parameter settings [============================>-------------------------------------------------------------------] 3/10 ( 30%)
#> [LightGBM] [Info] Start training from score -1.423260
#> [LightGBM] [Info] Start training from score -1.427452
#> [LightGBM] [Info] Start training from score -0.655556
#> [LightGBM] [Info] Start training from score -1.428460
#> [LightGBM] [Info] Start training from score -1.420092
#> [LightGBM] [Info] Start training from score -0.656564
#> [LightGBM] [Info] Start training from score -1.422251
#> [LightGBM] [Info] Start training from score -1.426444
#> [LightGBM] [Info] Start training from score -0.656491
#> 
#> Parameter settings [=====================================>----------------------------------------------------------] 4/10 ( 40%)
#> [LightGBM] [Info] Start training from score -1.423260
#> [LightGBM] [Info] Start training from score -1.427452
#> [LightGBM] [Info] Start training from score -0.655556
#> [LightGBM] [Info] Start training from score -1.428460
#> [LightGBM] [Info] Start training from score -1.420092
#> [LightGBM] [Info] Start training from score -0.656564
#> [LightGBM] [Info] Start training from score -1.422251
#> [LightGBM] [Info] Start training from score -1.426444
#> [LightGBM] [Info] Start training from score -0.656491
#> 
#> Parameter settings [===============================================>------------------------------------------------] 5/10 ( 50%)
#> [LightGBM] [Info] Start training from score -1.423260
#> [LightGBM] [Info] Start training from score -1.427452
#> [LightGBM] [Info] Start training from score -0.655556
#> [LightGBM] [Info] Start training from score -1.428460
#> [LightGBM] [Info] Start training from score -1.420092
#> [LightGBM] [Info] Start training from score -0.656564
#> [LightGBM] [Info] Start training from score -1.422251
#> [LightGBM] [Info] Start training from score -1.426444
#> [LightGBM] [Info] Start training from score -0.656491
#> 
#> Parameter settings [=========================================================>--------------------------------------] 6/10 ( 60%)
#> [LightGBM] [Info] Start training from score -1.423260
#> [LightGBM] [Info] Start training from score -1.427452
#> [LightGBM] [Info] Start training from score -0.655556
#> [LightGBM] [Info] Start training from score -1.428460
#> [LightGBM] [Info] Start training from score -1.420092
#> [LightGBM] [Info] Start training from score -0.656564
#> [LightGBM] [Info] Start training from score -1.422251
#> [LightGBM] [Info] Start training from score -1.426444
#> [LightGBM] [Info] Start training from score -0.656491
#> 
#> Parameter settings [==================================================================>-----------------------------] 7/10 ( 70%)
#> [LightGBM] [Info] Start training from score -1.423260
#> [LightGBM] [Info] Start training from score -1.427452
#> [LightGBM] [Info] Start training from score -0.655556
#> [LightGBM] [Info] Start training from score -1.428460
#> [LightGBM] [Info] Start training from score -1.420092
#> [LightGBM] [Info] Start training from score -0.656564
#> [LightGBM] [Info] Start training from score -1.422251
#> [LightGBM] [Info] Start training from score -1.426444
#> [LightGBM] [Info] Start training from score -0.656491
#> 
#> Parameter settings [============================================================================>-------------------] 8/10 ( 80%)
#> [LightGBM] [Info] Start training from score -1.423260
#> [LightGBM] [Info] Start training from score -1.427452
#> [LightGBM] [Info] Start training from score -0.655556
#> [LightGBM] [Info] Start training from score -1.428460
#> [LightGBM] [Info] Start training from score -1.420092
#> [LightGBM] [Info] Start training from score -0.656564
#> [LightGBM] [Info] Start training from score -1.422251
#> [LightGBM] [Info] Start training from score -1.426444
#> [LightGBM] [Info] Start training from score -0.656491
#> 
#> Parameter settings [=====================================================================================>----------] 9/10 ( 90%)
#> [LightGBM] [Info] Start training from score -1.423260
#> [LightGBM] [Info] Start training from score -1.427452
#> [LightGBM] [Info] Start training from score -0.655556
#> [LightGBM] [Info] Start training from score -1.428460
#> [LightGBM] [Info] Start training from score -1.420092
#> [LightGBM] [Info] Start training from score -0.656564
#> [LightGBM] [Info] Start training from score -1.422251
#> [LightGBM] [Info] Start training from score -1.426444
#> [LightGBM] [Info] Start training from score -0.656491
#> 
#> Parameter settings [===============================================================================================] 10/10 (100%)                                                                                                                                  
#> [LightGBM] [Info] Start training from score -1.423260
#> [LightGBM] [Info] Start training from score -1.427452
#> [LightGBM] [Info] Start training from score -0.655556
#> [LightGBM] [Info] Start training from score -1.428460
#> [LightGBM] [Info] Start training from score -1.420092
#> [LightGBM] [Info] Start training from score -0.656564
#> [LightGBM] [Info] Start training from score -1.422251
#> [LightGBM] [Info] Start training from score -1.426444
#> [LightGBM] [Info] Start training from score -0.656491
#> 
#> CV fold: Fold3
#> CV progress [========================================================================================================] 3/3 (100%)
#>                                                                                                                                   
#> [LightGBM] [Info] Start training from score -1.421241
#> [LightGBM] [Info] Start training from score -1.429645
#> [LightGBM] [Info] Start training from score -0.655482
#> [LightGBM] [Info] Start training from score -1.424424
#> [LightGBM] [Info] Start training from score -1.428634
#> [LightGBM] [Info] Start training from score -0.654471
#> [LightGBM] [Info] Start training from score -1.422251
#> [LightGBM] [Info] Start training from score -1.426444
#> [LightGBM] [Info] Start training from score -0.656491
#> [LightGBM] [Info] Start training from score -1.421241
#> [LightGBM] [Info] Start training from score -1.429645
#> [LightGBM] [Info] Start training from score -0.655482
#> [LightGBM] [Info] Start training from score -1.424424
#> [LightGBM] [Info] Start training from score -1.428634
#> [LightGBM] [Info] Start training from score -0.654471
#> [LightGBM] [Info] Start training from score -1.422251
#> [LightGBM] [Info] Start training from score -1.426444
#> [LightGBM] [Info] Start training from score -0.656491
#> 
#> Parameter settings [============================>-------------------------------------------------------------------] 3/10 ( 30%)
#> [LightGBM] [Info] Start training from score -1.421241
#> [LightGBM] [Info] Start training from score -1.429645
#> [LightGBM] [Info] Start training from score -0.655482
#> [LightGBM] [Info] Start training from score -1.424424
#> [LightGBM] [Info] Start training from score -1.428634
#> [LightGBM] [Info] Start training from score -0.654471
#> [LightGBM] [Info] Start training from score -1.422251
#> [LightGBM] [Info] Start training from score -1.426444
#> [LightGBM] [Info] Start training from score -0.656491
#> 
#> Parameter settings [=====================================>----------------------------------------------------------] 4/10 ( 40%)
#> [LightGBM] [Info] Start training from score -1.421241
#> [LightGBM] [Info] Start training from score -1.429645
#> [LightGBM] [Info] Start training from score -0.655482
#> [LightGBM] [Info] Start training from score -1.424424
#> [LightGBM] [Info] Start training from score -1.428634
#> [LightGBM] [Info] Start training from score -0.654471
#> [LightGBM] [Info] Start training from score -1.422251
#> [LightGBM] [Info] Start training from score -1.426444
#> [LightGBM] [Info] Start training from score -0.656491
#> 
#> Parameter settings [===============================================>------------------------------------------------] 5/10 ( 50%)
#> [LightGBM] [Info] Start training from score -1.421241
#> [LightGBM] [Info] Start training from score -1.429645
#> [LightGBM] [Info] Start training from score -0.655482
#> [LightGBM] [Info] Start training from score -1.424424
#> [LightGBM] [Info] Start training from score -1.428634
#> [LightGBM] [Info] Start training from score -0.654471
#> [LightGBM] [Info] Start training from score -1.422251
#> [LightGBM] [Info] Start training from score -1.426444
#> [LightGBM] [Info] Start training from score -0.656491
#> 
#> Parameter settings [=========================================================>--------------------------------------] 6/10 ( 60%)
#> [LightGBM] [Info] Start training from score -1.421241
#> [LightGBM] [Info] Start training from score -1.429645
#> [LightGBM] [Info] Start training from score -0.655482
#> [LightGBM] [Info] Start training from score -1.424424
#> [LightGBM] [Info] Start training from score -1.428634
#> [LightGBM] [Info] Start training from score -0.654471
#> [LightGBM] [Info] Start training from score -1.422251
#> [LightGBM] [Info] Start training from score -1.426444
#> [LightGBM] [Info] Start training from score -0.656491
#> 
#> Parameter settings [==================================================================>-----------------------------] 7/10 ( 70%)
#> [LightGBM] [Info] Start training from score -1.421241
#> [LightGBM] [Info] Start training from score -1.429645
#> [LightGBM] [Info] Start training from score -0.655482
#> [LightGBM] [Info] Start training from score -1.424424
#> [LightGBM] [Info] Start training from score -1.428634
#> [LightGBM] [Info] Start training from score -0.654471
#> [LightGBM] [Info] Start training from score -1.422251
#> [LightGBM] [Info] Start training from score -1.426444
#> [LightGBM] [Info] Start training from score -0.656491
#> 
#> Parameter settings [============================================================================>-------------------] 8/10 ( 80%)
#> [LightGBM] [Info] Start training from score -1.421241
#> [LightGBM] [Info] Start training from score -1.429645
#> [LightGBM] [Info] Start training from score -0.655482
#> [LightGBM] [Info] Start training from score -1.424424
#> [LightGBM] [Info] Start training from score -1.428634
#> [LightGBM] [Info] Start training from score -0.654471
#> [LightGBM] [Info] Start training from score -1.422251
#> [LightGBM] [Info] Start training from score -1.426444
#> [LightGBM] [Info] Start training from score -0.656491
#> 
#> Parameter settings [=====================================================================================>----------] 9/10 ( 90%)
#> [LightGBM] [Info] Start training from score -1.421241
#> [LightGBM] [Info] Start training from score -1.429645
#> [LightGBM] [Info] Start training from score -0.655482
#> [LightGBM] [Info] Start training from score -1.424424
#> [LightGBM] [Info] Start training from score -1.428634
#> [LightGBM] [Info] Start training from score -0.654471
#> [LightGBM] [Info] Start training from score -1.422251
#> [LightGBM] [Info] Start training from score -1.426444
#> [LightGBM] [Info] Start training from score -0.656491
#> 
#> Parameter settings [===============================================================================================] 10/10 (100%)                                                                                                                                  
#> [LightGBM] [Info] Start training from score -1.421241
#> [LightGBM] [Info] Start training from score -1.429645
#> [LightGBM] [Info] Start training from score -0.655482
#> [LightGBM] [Info] Start training from score -1.424424
#> [LightGBM] [Info] Start training from score -1.428634
#> [LightGBM] [Info] Start training from score -0.654471
#> [LightGBM] [Info] Start training from score -1.422251
#> [LightGBM] [Info] Start training from score -1.426444
#> [LightGBM] [Info] Start training from score -0.656491

# show the first rows of the cross-validation results (one row per fold)
head(validator_results)
#>     fold performance nrounds bagging_fraction feature_fraction min_data_in_leaf learning_rate num_leaves max_depth verbose
#> 1: Fold1   0.9674260      62              0.8              0.6                8           0.1         14        -1      -1
#> 2: Fold2   0.9506435      64              0.8              1.0               10           0.2          6        -1      -1
#> 3: Fold3   0.9559827     100              0.6              0.6                8           0.1          6        -1      -1
#>     objective        metric num_class
#> 1: multiclass multi_logloss         3
#> 2: multiclass multi_logloss         3
#> 3: multiclass multi_logloss         3

Inner Bayesian Optimization

# Nested cross-validation over the folds in `fold_list`; the hyperparameters
# are tuned with the "bayesian" strategy.
# `k_tuning = 3L` is presumably the number of folds used for the inner
# tuning -- confirm against the mlexperiments::MLNestedCV documentation.
validator <- mlexperiments::MLNestedCV$new(
  learner = mllrnrs::LearnerLightgbm$new(
    # the metric set in `learner_args` is "multi_logloss", a loss for which
    # lower values are better
    metric_optimization_higher_better = FALSE
  ),
  strategy = "bayesian",
  fold_list = fold_list,
  k_tuning = 3L,
  ncores = ncores,
  seed = seed
)

# `parameter_grid` and the fixed `learner_args` are defined earlier in the file
validator$parameter_grid <- parameter_grid
validator$learner_args <- learner_args
validator$split_type <- "stratified"


# settings for the bayesian strategy; `parameter_bounds` and `optim_args`
# are defined earlier in the file
validator$parameter_bounds <- parameter_bounds
validator$optim_args <- optim_args

# prediction arguments and performance metric used to score each fold
validator$predict_args <- predict_args
validator$performance_metric <- performance_metric
validator$performance_metric_args <- performance_metric_args
# keep the fitted models (hard-coded TRUE instead of the `return_models`
# variable); presumably required because this validator is passed to
# mlexperiments::predictions() for the holdout test set further below
validator$return_models <- TRUE

# attach the training features and labels
validator$set_data(
  x = train_x,
  y = train_y
)

# run the nested cross-validation and collect the results
validator_results <- validator$execute()
#> 
#> CV fold: Fold1
#> 
#> Registering parallel backend using 4 cores.
#> 
#> CV fold: Fold2
#> CV progress [====================================================================>-----------------------------------] 2/3 ( 67%)
#> 
#> Registering parallel backend using 4 cores.
#> 
#> CV fold: Fold3
#> CV progress [========================================================================================================] 3/3 (100%)
#>                                                                                                                                   
#> Registering parallel backend using 4 cores.

# show the per-fold performance together with the hyperparameters used
head(validator_results)
#>     fold performance bagging_fraction feature_fraction min_data_in_leaf learning_rate num_leaves nrounds max_depth verbose
#> 1: Fold1   0.9727424        0.2000000        0.2559232                2     0.1992638         20      52        -1      -1
#> 2: Fold2   0.9494955        0.8293692        0.5664407                7     0.1203691         11      76        -1      -1
#> 3: Fold3   0.9568462        0.4438889        0.3041453               10     0.1462295         11      67        -1      -1
#>     objective        metric num_class
#> 1: multiclass multi_logloss         3
#> 2: multiclass multi_logloss         3
#> 3: multiclass multi_logloss         3

Holdout Test Dataset Performance

Predict Outcome in Holdout Test Dataset

# predict the holdout test features `test_x` with the executed validator
preds_lightgbm <- mlexperiments::predictions(
  object = validator,
  newdata = test_x
)

Evaluate Performance on Holdout Test Dataset

# score the holdout predictions against the true test labels `test_y`;
# the printed result holds one performance value per fold model
perf_lightgbm <- mlexperiments::performance(
  object = validator,
  prediction_results = preds_lightgbm,
  y_ground_truth = test_y
)
perf_lightgbm
#>    model performance
#> 1: Fold1   0.9596127
#> 2: Fold2   0.9612778
#> 3: Fold3   0.9583793

Appendix I: Grid-Search with Target Weights

Here, lightgbm’s weight argument is used to rescale the case weights during training.

# Per-observation weights derived from the class label in `train_y`:
# every observation starts with weight 1, then class 1 is down-weighted to
# 0.8 and class 2 is up-weighted to 1.2.
y_weights <- rep(1, length(train_y))
y_weights[train_y == 1] <- 0.8
y_weights[train_y == 2] <- 1.2
head(y_weights)
#> [1] 1.2 1.2 1.0 0.8 0.8 1.0
# Hyperparameter tuning with the "grid" strategy, this time handing the
# weights `y_weights` to the learner.
tuner_w_weights <- mlexperiments::MLTuneParameters$new(
  learner = mllrnrs::LearnerLightgbm$new(
    # the metric set in `learner_args` is "multi_logloss", a loss for which
    # lower values are better
    metric_optimization_higher_better = FALSE
  ),
  strategy = "grid",
  ncores = ncores,
  seed = seed
)

# `parameter_grid` is defined earlier in the file
tuner_w_weights$parameter_grid <- parameter_grid
# extend the fixed learner arguments by the weights (`case_weights`)
tuner_w_weights$learner_args <- c(
  learner_args,
  list(case_weights = y_weights)
)
tuner_w_weights$split_type <- "stratified"

# attach the training features and labels
tuner_w_weights$set_data(
  x = train_x,
  y = train_y
)

# run the grid search; `k = 3` is presumably the number of cross-validation
# folds used for tuning -- confirm against mlexperiments::MLTuneParameters
tuner_results_grid <- tuner_w_weights$execute(k = 3)
#> 
#> Parameter settings [============================>-------------------------------------------------------------------] 3/10 ( 30%)
#> Parameter settings [=====================================>----------------------------------------------------------] 4/10 ( 40%)
#> Parameter settings [===============================================>------------------------------------------------] 5/10 ( 50%)
#> Parameter settings [=========================================================>--------------------------------------] 6/10 ( 60%)
#> Parameter settings [==================================================================>-----------------------------] 7/10 ( 70%)
#> Parameter settings [============================================================================>-------------------] 8/10 ( 80%)
#> Parameter settings [=====================================================================================>----------] 9/10 ( 90%)
#> Parameter settings [===============================================================================================] 10/10 (100%)                                                                                                                                  

# show the first evaluated parameter settings with their mean optimization
# metric (`metric_optim_mean`)
head(tuner_results_grid)
#>    setting_id metric_optim_mean nrounds bagging_fraction feature_fraction min_data_in_leaf learning_rate num_leaves max_depth
#>         <int>             <num>   <int>            <num>            <num>            <num>         <num>      <num>     <int>
#> 1:          1         0.1294454      29              0.6              0.6                4           0.2         18        -1
#> 2:          2         0.1221349      51              0.8              1.0               10           0.2          6        -1
#> 3:          3         0.2240799     100              0.8              0.8                4           0.1          2        -1
#> 4:          4         0.1194221      75              1.0              0.8                4           0.1         10        -1
#> 5:          5         0.1281037      32              1.0              0.6                6           0.2         18        -1
#> 6:          6         0.1245721      60              1.0              1.0                8           0.1         14        -1
#>    verbose  objective        metric num_class
#>      <int>     <char>        <char>    <char>
#> 1:      -1 multiclass multi_logloss         3
#> 2:      -1 multiclass multi_logloss         3
#> 3:      -1 multiclass multi_logloss         3
#> 4:      -1 multiclass multi_logloss         3
#> 5:      -1 multiclass multi_logloss         3
#> 6:      -1 multiclass multi_logloss         3

Appendix II: k-Fold Cross Validation with Target Weights

# k-fold cross-validation on the predefined folds in `fold_list`
validator <- mlexperiments::MLCrossValidation$new(
  learner = mllrnrs::LearnerLightgbm$new(
    # the metric in the learner arguments is "multi_logloss", a loss for
    # which lower values are better
    metric_optimization_higher_better = FALSE
  ),
  fold_list = fold_list,
  ncores = ncores,
  seed = seed
)

# append the optimized setting from above with the newly created weights
# (`[-1]` drops the first element of `best.setting` -- presumably the
# setting id; confirm against the tuner's result structure)
# NOTE(review): the settings are taken from `tuner` (defined earlier in the
# file), not from `tuner_w_weights`, which was tuned with the weights --
# confirm that this is intended.
validator$learner_args <- c(
  tuner$results$best.setting[-1],
  list("case_weights" = y_weights)
)

# prediction arguments and performance metric used to score each fold
validator$predict_args <- predict_args
validator$performance_metric <- performance_metric
validator$performance_metric_args <- performance_metric_args
validator$return_models <- return_models

# attach the training features and labels
validator$set_data(
  x = train_x,
  y = train_y
)

# run the cross-validation and collect the per-fold results
validator_results <- validator$execute()
#> 
#> CV fold: Fold1
#> 
#> CV fold: Fold2
#> 
#> CV fold: Fold3
#> CV progress [========================================================================================================] 3/3 (100%)
#>                                                                                                                                   

# show the per-fold performance; all folds share the same learner settings
head(validator_results)
#>      fold performance bagging_fraction feature_fraction min_data_in_leaf learning_rate num_leaves nrounds max_depth verbose
#>    <char>       <num>            <num>            <num>            <num>         <num>      <num>   <int>     <int>   <int>
#> 1:  Fold1   0.9635665              0.8              0.6                8           0.1         14      66        -1      -1
#> 2:  Fold2   0.9551452              0.8              0.6                8           0.1         14      66        -1      -1
#> 3:  Fold3   0.9521232              0.8              0.6                8           0.1         14      66        -1      -1
#>     objective        metric num_class
#>        <char>        <char>    <char>
#> 1: multiclass multi_logloss         3
#> 2: multiclass multi_logloss         3
#> 3: multiclass multi_logloss         3