Section 6 Small data set demo

6.1 Setup

6.1.1 Load libraries

library(ranger)
library(rlist)
library(h2o)
h2o.init(nthreads = -1)

6.1.2 AIC functions

#' Mean squared error function
#'
#' @param y_true a numeric vector of true values.
#' @param y_pred a numeric vector of predicted values.
#'
#' @return MSE
#'
mse <- function(y_true, y_pred) {
  return(mean((y_true - y_pred)^2))
}

#' Regression AIC
#'
#' @param y_true a numeric vector of true values.
#' @param y_pred a numeric vector of predicted values.
#' @param k number of parameters/variables.
#' @param eps a very small value used in place of an MSE of exactly zero,
#' to avoid the negative infinity generated by log(0).
#'
#' @return AIC
#'
aic_reg <- function(y_true, y_pred, k, eps = 1e-15) {
  mserr <- mse(y_true, y_pred)
  if (mserr == 0) mserr <- eps
  AIC <- length(y_true) * log(mserr) + 2 * k
  return(AIC)
}
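
# Minimal usage sketch (illustrative values only, not part of the original
# pipeline) for mse() and aic_reg():
y_obs <- c(2.1, 3.4, 1.8, 4.0)
y_hat <- c(2.0, 3.0, 2.0, 4.2)
mse(y_obs, y_hat) # mean squared error
aic_reg(y_obs, y_hat, k = 2) # n * log(MSE) + 2k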

#' Cross-entropy function
#'
#' @param y_true a factor vector representing true labels.
#' @param y_pred a numeric vector (probabilities of the second class/factor level)
#' or a matrix of predicted probabilities.
#' @param eps a very small value to avoid the negative infinities generated by log(p = 0).
#' If the function returns NaN, increasing eps may solve the issue. Alternatively, since
#' NaN is usually generated when p = 1 - eps rounds to 1, so that log(1 - p) = log(0)
#' in the binary cross-entropy, the user can pass the prediction values as a matrix to
#' calculate the categorical cross-entropy instead.
#'
#' @return H, cross-entropy (mean loss per sample)
#'
crossEntropy <- function(y_true, y_pred, eps = 1e-15) {
  stopifnot(is.factor(y_true))
  y_pred <- pmax(pmin(y_pred, 1 - eps), eps)
  n_levels <- nlevels(y_true)
  H <- NULL
  # Binary classification
  if (n_levels == 2 && is.vector(y_pred)) {
    y_true <- as.numeric(y_true == levels(y_true)[-1])
    H <- -mean(y_true * log2(y_pred) + (1 - y_true) * log2(1 - y_pred))
  }
  # Multi-class classification
  else if (n_levels == ncol(y_pred)) {
    y_true <- as.numeric(y_true)
    y_true_encoded <- t(sapply(y_true, function(x) {
      tmp <- rep(0, n_levels)
      tmp[x] <- 1
      return(tmp)
    }))
    H <- -mean(sapply(1:nrow(y_true_encoded), function(x) sum(y_true_encoded[x, ] * log2(y_pred[x, ]))))
  }
  return(H)
}

#' Classification AIC
#'
#' @param y_true a factor vector representing true labels.
#' @param y_pred a numeric vector or a matrix of predicted probabilities.
#' @param k number of parameters/variables.
#' @param ... other parameters to be passed to `crossEntropy()`
#'
#' @return AIC
#'
aic_clf <- function(y_true, y_pred, k, ...) {
  H <- crossEntropy(y_true, y_pred, ...)
  AIC <- 2 * log(2) * length(y_true) * H + 2 * k
  return(AIC)
}
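
# Minimal usage sketch (illustrative values only, not part of the original
# pipeline): binary cross-entropy from a probability vector and the
# equivalent categorical form from a two-column probability matrix.
y_lab <- factor(c("healthy", "sick", "sick", "healthy"))
p_sick <- c(0.2, 0.9, 0.7, 0.1) # predicted P(second level, "sick")
crossEntropy(y_lab, p_sick) # binary form
crossEntropy(y_lab, cbind(1 - p_sick, p_sick)) # categorical form, same value
aic_clf(y_lab, p_sick, k = 3) # 2 * log(2) * n * H + 2k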

6.1.3 Model parameters

# DeepLearning Grid 1
deeplearning_params_1 <- list(
  activation = "RectifierWithDropout",
  epochs = 10000, # early stopping
  epsilon = c(1e-6, 1e-7, 1e-8, 1e-9),
  hidden = list(c(50), c(200), c(500)),
  hidden_dropout_ratios = list(c(0.1), c(0.2), c(0.3), c(0.4), c(0.5)),
  input_dropout_ratio = c(0, 0.05, 0.1, 0.15, 0.2),
  rho = c(0.9, 0.95, 0.99)
)
# DeepLearning Grid 2
deeplearning_params_2 <- list(
  activation = "RectifierWithDropout",
  epochs = 10000, # early stopping
  epsilon = c(1e-6, 1e-7, 1e-8, 1e-9),
  hidden = list(c(50, 50), c(200, 200), c(500, 500)),
  hidden_dropout_ratios = list(
    c(0.1, 0.1), c(0.2, 0.2), c(0.3, 0.3),
    c(0.4, 0.4), c(0.5, 0.5)
  ),
  input_dropout_ratio = c(0, 0.05, 0.1, 0.15, 0.2),
  rho = c(0.9, 0.95, 0.99)
)
# DeepLearning Grid 3
deeplearning_params_3 <- list(
  activation = "RectifierWithDropout",
  epochs = 10000, # early stopping
  epsilon = c(1e-6, 1e-7, 1e-8, 1e-9),
  hidden = list(c(50, 50, 50), c(200, 200, 200), c(500, 500, 500)),
  hidden_dropout_ratios = list(
    c(0.1, 0.1, 0.1), c(0.2, 0.2, 0.2),
    c(0.3, 0.3, 0.3), c(0.4, 0.4, 0.4),
    c(0.5, 0.5, 0.5)
  ),
  input_dropout_ratio = c(0, 0.05, 0.1, 0.15, 0.2),
  rho = c(0.9, 0.95, 0.99)
)
# GBM
gbm_params <- list(
  col_sample_rate = c(0.4, 0.7, 1.0),
  col_sample_rate_per_tree = c(0.4, 0.7, 1.0),
  learn_rate = 0.1,
  max_depth = c(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17),
  min_rows = c(1, 5, 10, 15, 30, 100),
  min_split_improvement = c(1e-4, 1e-5),
  ntrees = 10000, # early stopping
  sample_rate = c(0.5, 0.6, 0.7, 0.8, 0.9, 1.0)
)
# XGBoost
xgboost_params <- list(
  booster = c("gbtree", "dart"),
  col_sample_rate = c(0.6, 0.8, 1.0),
  col_sample_rate_per_tree = c(0.7, 0.8, 0.9, 1.0),
  max_depth = c(5, 10, 15, 20),
  min_rows = c(0.01, 0.1, 1.0, 3.0, 5.0, 10.0, 15.0, 20.0),
  ntrees = 10000, # early stopping
  reg_alpha = c(0.001, 0.01, 0.1, 1, 10, 100),
  reg_lambda = c(0.001, 0.01, 0.1, 0.5, 1),
  sample_rate = c(0.6, 0.8, 1.0)
)
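
# Quick sanity check (an added sketch, not part of the original setup): count
# the number of candidate combinations in each grid that the RandomDiscrete
# searches below sample from.
sapply(
  list(
    deeplearning_1 = deeplearning_params_1,
    deeplearning_2 = deeplearning_params_2,
    deeplearning_3 = deeplearning_params_3,
    gbm = gbm_params,
    xgboost = xgboost_params
  ),
  function(grid) prod(vapply(grid, length, integer(1)))
)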

6.2 Variable reduction

variable_reduction <- function(predictor_variables, response_variable, seed = 12) {
  # Calculate initial variable importance
  set.seed(seed)
  # Train random forest using ranger
  ntree <- 100000
  rf <- ranger(
    x = predictor_variables, y = response_variable, num.trees = ntree,
    importance = "permutation", scale.permutation.importance = TRUE,
    verbose = TRUE, seed = 3, keep.inbag = TRUE
  )
  var_imp <- ranger::importance(rf)
  var_imp <- var_imp[order(-var_imp)]
  var_imp <- var_imp[var_imp >= 0]
  # Variable reduction iterations
  ntree <- 1000
  var_subset_res <- list()
  for (i in 1:length(var_imp)) {
    cat(paste0("Evaluating ", i, " variable(s) out of ", length(var_imp), " variables\n"))
    predictor_variables_ <- predictor_variables[, colnames(predictor_variables) %in% names(var_imp)[1:i], drop = FALSE]
    rf_ <- ranger(
      x = predictor_variables_, y = response_variable, num.trees = ntree,
      importance = "none", verbose = TRUE, seed = seed,
      probability = is.factor(response_variable)
    )
    if (!is.factor(response_variable)) { # Regression
      var_subset_res <- list.append(var_subset_res, list(
        rsq = rf$r.squared,
        aic = aic_reg(
          y_true = response_variable,
          y_pred = rf_$predictions,
          k = i
        )
      ))
    } else { # Classification
      var_subset_res <- list.append(var_subset_res, list(
        acc = 1 - rf$prediction.error,
        aic = aic_clf(
          y_true = response_variable,
          y_pred = rf_$predictions,
          k = i
        )
      ))
    }
  }
  aic_vec <- unlist(lapply(var_subset_res, function(x) x$aic))
  aic_cutoff <- which.min(aic_vec)
  var_imp <- ranger::importance(rf)
  var_imp <- var_imp[order(-var_imp)][1:aic_cutoff]
  cat(paste("Number of variables after variable reduction: ", aic_cutoff, "\n"))
  data <- predictor_variables[, colnames(predictor_variables) %in% names(var_imp), drop = FALSE]
  data$response <- response_variable
  return(data)
}
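
# Illustrative call on a built-in data set (an added sketch; the actual demo in
# Section 6.4 uses the Parkinson's data). Uncomment to try it; the initial
# 100,000-tree forest means even this small example takes a moment.
# reduced_iris <- variable_reduction(
#   predictor_variables = iris[, 1:4],
#   response_variable = iris$Species
# )
# str(reduced_iris)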

6.3 Model training

model_training <- function(train_set, prefix = "demo_models", exp_dir = "./demo_models", nfolds = 10, grid_seed = 1) {
  h2o.init(nthreads = -1)
  h2o.removeAll()
  dir.create(exp_dir, showWarnings = FALSE)
  tmp <- as.h2o(train_set, destination_frame = prefix)
  classification <- FALSE
  if (is.factor(train_set$response)) classification <- TRUE
  samp_factors <- NULL
  if (classification) {
    tmp["response"] <- as.factor(tmp["response"])
    samp_factors <- as.vector(mean(table(train_set$response)) / table(train_set$response))
  }
  y <- "response"
  x <- setdiff(names(tmp), y)
  res <- as.data.frame(tmp$response)
  # -------------------
  # base model training
  # -------------------
  cat("Deep learning grid 1\n")
  deeplearning_1 <- h2o.grid(
    algorithm = "deeplearning", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, hyper_params = deeplearning_params_1,
    stopping_rounds = 3, balance_classes = classification,
    class_sampling_factors = samp_factors,
    search_criteria = list(
      strategy = "RandomDiscrete",
      max_models = 5, seed = grid_seed
    ),
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo", parallelism = 0
  )
  for (model_id in deeplearning_1@model_ids) {
    tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  }
  cat("Deep learning grid 2\n")
  deeplearning_2 <- h2o.grid(
    algorithm = "deeplearning", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, hyper_params = deeplearning_params_2,
    stopping_rounds = 3, balance_classes = classification,
    class_sampling_factors = samp_factors,
    search_criteria = list(
      strategy = "RandomDiscrete",
      max_models = 5, seed = grid_seed
    ),
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo", parallelism = 0
  )
  for (model_id in deeplearning_2@model_ids) {
    tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  }
  cat("Deep learning grid 3\n")
  deeplearning_3 <- h2o.grid(
    algorithm = "deeplearning", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, hyper_params = deeplearning_params_3,
    stopping_rounds = 3, balance_classes = classification,
    class_sampling_factors = samp_factors,
    search_criteria = list(
      strategy = "RandomDiscrete",
      max_models = 5, seed = grid_seed
    ),
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo", parallelism = 0
  )
  for (model_id in deeplearning_3@model_ids) {
    tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  }
  cat("GBM grid\n")
  gbm <- h2o.grid(
    algorithm = "gbm", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, hyper_params = gbm_params, stopping_rounds = 3,
    balance_classes = classification, class_sampling_factors = samp_factors,
    search_criteria = list(strategy = "RandomDiscrete", max_models = 15, seed = grid_seed),
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo", parallelism = 0
  )
  for (model_id in gbm@model_ids) {
    tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  }
  cat("GBM 5 default models\n")
  gbm_1 <- h2o.gbm(
    model_id = "GBM_1", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
    balance_classes = classification, class_sampling_factors = samp_factors,
    col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8, learn_rate = 0.1,
    max_depth = 6, min_rows = 1, min_split_improvement = 1e-5, ntrees = 10000, sample_rate = 0.8,
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  gbm_2 <- h2o.gbm(
    model_id = "GBM_2", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
    balance_classes = classification, class_sampling_factors = samp_factors,
    col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8, learn_rate = 0.1,
    max_depth = 7, min_rows = 10, min_split_improvement = 1e-5, ntrees = 10000, sample_rate = 0.8,
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  gbm_3 <- h2o.gbm(
    model_id = "GBM_3", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
    balance_classes = classification, class_sampling_factors = samp_factors,
    col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8, learn_rate = 0.1,
    max_depth = 8, min_rows = 10, min_split_improvement = 1e-5, ntrees = 10000, sample_rate = 0.8,
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  gbm_4 <- h2o.gbm(
    model_id = "GBM_4", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
    balance_classes = classification, class_sampling_factors = samp_factors,
    col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8, learn_rate = 0.1,
    max_depth = 10, min_rows = 10, min_split_improvement = 1e-5, ntrees = 10000, sample_rate = 0.8,
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  # gbm_5 = h2o.gbm(model_id='GBM_5', x=x, y=y, training_frame=tmp, seed=1, nfolds=nfolds,
  #                 keep_cross_validation_predictions=TRUE, stopping_rounds=3, score_tree_interval=5,
  #                 balance_classes=classification, class_sampling_factors=samp_factors,
  #                 col_sample_rate=0.8, col_sample_rate_per_tree=0.8, learn_rate=0.1,
  #                 max_depth=15, min_rows=100, min_split_improvement=1e-5, ntrees=10000, sample_rate=0.8,
  #                 keep_cross_validation_models=FALSE, fold_assignment='Modulo')
  for (model_id in paste0("GBM_", 1:4)) {
    tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  }
  cat("XGBoost grid\n")
  xgboost <- h2o.grid(
    algorithm = "xgboost", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, hyper_params = xgboost_params, stopping_rounds = 3,
    search_criteria = list(strategy = "RandomDiscrete", max_models = 15, seed = grid_seed),
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo", parallelism = 0
  )
  for (model_id in xgboost@model_ids) {
    tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  }
  cat("XGBoost 3 default models\n")
  xgboost_1 <- h2o.xgboost(
    model_id = "XGBoost_1", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
    booster = "gbtree", col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8,
    max_depth = 10, min_rows = 5, ntrees = 10000, reg_alpha = 0, reg_lambda = 1,
    sample_rate = 0.6, keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  xgboost_2 <- h2o.xgboost(
    model_id = "XGBoost_2", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
    booster = "gbtree", col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8,
    max_depth = 20, min_rows = 10, ntrees = 10000, reg_alpha = 0, reg_lambda = 1,
    sample_rate = 0.6, keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  xgboost_3 <- h2o.xgboost(
    model_id = "XGBoost_3", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
    booster = "gbtree", col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8,
    max_depth = 5, min_rows = 3, ntrees = 10000, reg_alpha = 0, reg_lambda = 1,
    sample_rate = 0.8, keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  for (model_id in paste0("XGBoost_", 1:3)) {
    tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  }
  cat("GLM\n")
  glm <- h2o.glm(
    model_id = "GLM", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, alpha = c(0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
    balance_classes = classification, class_sampling_factors = samp_factors,
    max_after_balance_size = 0.5, keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  tmp_path <- h2o.saveModel(h2o.getModel("GLM"), path = exp_dir, force = TRUE)
  cat("DRF\n")
  drf <- h2o.randomForest(
    model_id = "DRF", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, ntrees = 10000,
    score_tree_interval = 5, stopping_rounds = 3,
    balance_classes = classification,
    class_sampling_factors = samp_factors,
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  tmp_path <- h2o.saveModel(h2o.getModel("DRF"), path = exp_dir, force = TRUE)
  cat("XRT\n")
  xrt <- h2o.randomForest(
    model_id = "XRT", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, ntrees = 10000, histogram_type = "Random",
    score_tree_interval = 5, stopping_rounds = 3,
    balance_classes = classification,
    class_sampling_factors = samp_factors,
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  tmp_path <- h2o.saveModel(h2o.getModel("XRT"), path = exp_dir, force = TRUE)
  # -----------------------
  # get holdout predictions
  # -----------------------
  base_models <- as.list(c(
    unlist(deeplearning_1@model_ids),
    unlist(deeplearning_2@model_ids),
    unlist(deeplearning_3@model_ids),
    unlist(gbm@model_ids), paste0("GBM_", 1:4),
    unlist(xgboost@model_ids), paste0("XGBoost_", 1:3),
    "GLM",
    "DRF",
    "XRT"
  ))
  for (model_id in base_models) {
    if (!classification) {
      res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id)),
        col.names = model_id
      ))
    } else {
      res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id))[3],
        col.names = model_id
      ))
    }
  }
  # ----------------------
  # super learner training
  # ----------------------
  sl_iter <- 0
  cat(paste0("Super learner iteration ", sl_iter, " (", length(base_models), " models)\n"))
  sl <- h2o.stackedEnsemble(
    x = x, y = y, model_id = paste0("superlearner_iter_", sl_iter),
    training_frame = tmp, seed = 1,
    base_models = base_models,
    metalearner_algorithm = "glm",
    metalearner_nfolds = nfolds,
    keep_levelone_frame = TRUE,
    metalearner_params = list(
      standardize = TRUE, keep_cross_validation_predictions = TRUE,
      balance_classes = classification,
      class_sampling_factors = samp_factors,
      max_after_balance_size = 0.5
    )
  )
  tmp_path <- h2o.saveModel(h2o.getModel(paste0("superlearner_iter_", sl_iter)), path = exp_dir, force = TRUE)
  model_id <- paste0("metalearner_glm_superlearner_iter_", sl_iter)
  tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  if (!classification) {
    res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id)),
      col.names = paste0(model_id, "_", length(base_models), "_models")
    ))
  } else {
    res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id))[3],
      col.names = paste0(model_id, "_", length(base_models), "_models")
    ))
  }
  # ----------------------------------
  # super learner base model reduction
  # ----------------------------------
  while (TRUE) {
    meta <- h2o.getModel(paste0("metalearner_glm_superlearner_iter_", sl_iter, "_cv_1"))
    names <- meta@model$coefficients_table[, "names"]
    coeffs <- (meta@model$coefficients_table[, "standardized_coefficients"] > 0)
    for (j in 2:nfolds) {
      meta <- h2o.getModel(paste0("metalearner_glm_superlearner_iter_", sl_iter, "_cv_", j))
      names <- meta@model$coefficients_table[, "names"]
      coeffs <- coeffs + (meta@model$coefficients_table[, "standardized_coefficients"] > 0)
    }
    base_models_ <- as.list(names[coeffs >= ceiling(nfolds / 2) & names != "Intercept"])
    if (length(base_models_) == 0) {
      cat("No base models passing the threshold\n\n")
      break
    }
    if (sum(base_models %in% base_models_) == length(base_models)) {
      cat("No further reduction of base models\n\n")
      break
    }
    sl_iter <- sl_iter + 1
    base_models <- base_models_
    cat(paste0("Super learner iteration ", sl_iter, " (", length(base_models), " models)\n"))
    sl <- h2o.stackedEnsemble(
      x = x, y = y, model_id = paste0("superlearner_iter_", sl_iter),
      training_frame = tmp, seed = 1,
      base_models = base_models,
      metalearner_algorithm = "glm",
      metalearner_nfolds = nfolds,
      keep_levelone_frame = TRUE,
      metalearner_params = list(
        standardize = TRUE, keep_cross_validation_predictions = TRUE,
        balance_classes = classification,
        class_sampling_factors = samp_factors,
        max_after_balance_size = 0.5
      )
    )
    tmp_path <- h2o.saveModel(h2o.getModel(paste0("superlearner_iter_", sl_iter)), path = exp_dir, force = TRUE)
    model_id <- paste0("metalearner_glm_superlearner_iter_", sl_iter)
    tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
    if (!classification) {
      res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id)),
        col.names = paste0(model_id, "_", length(base_models), "_models")
      ))
    } else {
      res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id))[3],
        col.names = paste0(model_id, "_", length(base_models), "_models")
      ))
    }
  }
  # -----------------------------------------
  # super learner for homogeneous base models
  # -----------------------------------------
  # DeepLearning
  base_models <- as.list(c(
    unlist(deeplearning_1@model_ids),
    unlist(deeplearning_2@model_ids),
    unlist(deeplearning_3@model_ids)
  ))
  cat(paste0("Super learner deep learning (", length(base_models), " models)\n"))
  sl <- h2o.stackedEnsemble(
    x = x, y = y, model_id = "superlearner_deeplearning",
    training_frame = tmp, seed = 1,
    base_models = base_models,
    metalearner_algorithm = "glm",
    metalearner_nfolds = nfolds,
    keep_levelone_frame = TRUE,
    metalearner_params = list(
      standardize = TRUE, keep_cross_validation_predictions = TRUE,
      balance_classes = classification,
      class_sampling_factors = samp_factors,
      max_after_balance_size = 0.5
    )
  )
  tmp_path <- h2o.saveModel(h2o.getModel("superlearner_deeplearning"), path = exp_dir, force = TRUE)
  model_id <- "metalearner_glm_superlearner_deeplearning"
  tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  if (!classification) {
    res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id)),
      col.names = paste0(model_id, "_", length(base_models), "_models")
    ))
  } else {
    res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id))[3],
      col.names = paste0(model_id, "_", length(base_models), "_models")
    ))
  }
  # GBM
  base_models <- as.list(c(unlist(gbm@model_ids), paste0("GBM_", 1:4)))
  cat(paste0("Super learner GBM (", length(base_models), " models)\n"))
  sl <- h2o.stackedEnsemble(
    x = x, y = y, model_id = "superlearner_gbm",
    training_frame = tmp, seed = 1,
    base_models = base_models,
    metalearner_algorithm = "glm",
    metalearner_nfolds = nfolds,
    keep_levelone_frame = TRUE,
    metalearner_params = list(
      standardize = TRUE, keep_cross_validation_predictions = TRUE,
      balance_classes = classification,
      class_sampling_factors = samp_factors,
      max_after_balance_size = 0.5
    )
  )
  tmp_path <- h2o.saveModel(h2o.getModel("superlearner_gbm"), path = exp_dir, force = TRUE)
  model_id <- "metalearner_glm_superlearner_gbm"
  tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  if (!classification) {
    res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id)),
      col.names = paste0(model_id, "_", length(base_models), "_models")
    ))
  } else {
    res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id))[3],
      col.names = paste0(model_id, "_", length(base_models), "_models")
    ))
  }
  # XGBoost
  base_models <- as.list(c(unlist(xgboost@model_ids), paste0("XGBoost_", 1:3)))
  cat(paste0("Super learner XGBoost (", length(base_models), " models)\n"))
  sl <- h2o.stackedEnsemble(
    x = x, y = y, model_id = "superlearner_xgboost",
    training_frame = tmp, seed = 1,
    base_models = base_models,
    metalearner_algorithm = "glm",
    metalearner_nfolds = nfolds,
    keep_levelone_frame = TRUE,
    metalearner_params = list(
      standardize = TRUE, keep_cross_validation_predictions = TRUE,
      balance_classes = classification,
      class_sampling_factors = samp_factors,
      max_after_balance_size = 0.5
    )
  )
  tmp_path <- h2o.saveModel(h2o.getModel("superlearner_xgboost"), path = exp_dir, force = TRUE)
  model_id <- "metalearner_glm_superlearner_xgboost"
  tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  if (!classification) {
    res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id)),
      col.names = paste0(model_id, "_", length(base_models), "_models")
    ))
  } else {
    res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id))[3],
      col.names = paste0(model_id, "_", length(base_models), "_models")
    ))
  }
  write.csv(res, file = paste0(exp_dir, "/cv_holdout_predictions.csv"), row.names = FALSE)
  cat("\n\n")
  h2o.removeAll()
}
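
# Everything model_training() saves ends up in exp_dir ("./demo_models" by
# default). A sketch of reloading a saved model afterwards; the exact file
# names depend on what the run produced, so list the directory first.
# list.files("./demo_models")
# sl <- h2o.loadModel("./demo_models/superlearner_iter_0")
# h2o.performance(sl, xval = TRUE)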

6.4 Run pipeline

data <- read.csv("https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/parkinsons.data", check.names = FALSE, row.names = 1)

# Random subset
set.seed(10)
data <- data[sample(1:nrow(data), size = 50, replace = FALSE), ]
predictor_variables <- data[, colnames(data) != "status"]
response_variable <- as.factor(data$status)

system.time({
  reduced_data <- variable_reduction(predictor_variables = predictor_variables, response_variable = response_variable)
  model_training(train_set = reduced_data)
})
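
# A brief look at the cross-validation holdout predictions written by
# model_training() (an added sketch): the first column is the observed
# response, the remaining columns are per-model holdout predictions.
preds <- read.csv("./demo_models/cv_holdout_predictions.csv")
dim(preds)
head(preds[, 1:min(3, ncol(preds))])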

6.5 Session info

sessionInfo()
## R version 4.2.2 (2022-10-31)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] rstudioapi_0.14   knitr_1.40        magrittr_2.0.3    R.cache_0.16.0   
##  [5] R6_2.5.1          rlang_1.0.4       fastmap_1.1.0     stringr_1.4.1    
##  [9] styler_1.8.0      tools_4.2.2       xfun_0.32         R.oo_1.25.0      
## [13] cli_3.3.0         jquerylib_0.1.4   htmltools_0.5.3   yaml_2.3.5       
## [17] digest_0.6.29     bookdown_0.28     purrr_0.3.4       vctrs_0.4.1      
## [21] sass_0.4.2        R.utils_2.12.0    cachem_1.0.6      evaluate_0.16    
## [25] rmarkdown_2.16    stringi_1.7.8     compiler_4.2.2    bslib_0.4.0      
## [29] R.methodsS3_1.8.2 jsonlite_1.8.0