model_training <- function(train_set, prefix = "demo_models", exp_dir = "./demo_models", nfolds = 10, grid_seed = 1) {
h2o.init(nthreads = -1)
h2o.removeAll()
dir.create(exp_dir)
tmp <- as.h2o(train_set, destination_frame = prefix)
classification <- FALSE
if (is.factor(train_set$response)) classification <- TRUE
samp_factors <- NULL
if (classification) {
tmp["response"] <- as.factor(tmp["response"])
samp_factors <- as.vector(mean(table(train_set$response)) / table(train_set$response))
}
y <- "response"
x <- setdiff(names(tmp), y)
res <- as.data.frame(tmp$response)
# -------------------
# base model training
# -------------------
cat("Deep learning grid 1\n")
deeplearning_1 <- h2o.grid(
algorithm = "deeplearning", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, hyper_params = deeplearning_params_1,
stopping_rounds = 3, balance_classes = classification,
class_sampling_factors = samp_factors,
search_criteria = list(
strategy = "RandomDiscrete",
max_models = 5, seed = grid_seed
),
keep_cross_validation_models = FALSE, fold_assignment = "Modulo", parallelism = 0
)
for (model_id in deeplearning_1@model_ids) {
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
}
cat("Deep learning grid 2\n")
deeplearning_2 <- h2o.grid(
algorithm = "deeplearning", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, hyper_params = deeplearning_params_2,
stopping_rounds = 3, balance_classes = classification,
class_sampling_factors = samp_factors,
search_criteria = list(
strategy = "RandomDiscrete",
max_models = 5, seed = grid_seed
),
keep_cross_validation_models = FALSE, fold_assignment = "Modulo", parallelism = 0
)
for (model_id in deeplearning_2@model_ids) {
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
}
cat("Deep learning grid 3\n")
deeplearning_3 <- h2o.grid(
algorithm = "deeplearning", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, hyper_params = deeplearning_params_3,
stopping_rounds = 3, balance_classes = classification,
class_sampling_factors = samp_factors,
search_criteria = list(
strategy = "RandomDiscrete",
max_models = 5, seed = grid_seed
),
keep_cross_validation_models = FALSE, fold_assignment = "Modulo", parallelism = 0
)
for (model_id in deeplearning_3@model_ids) {
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
}
cat("GBM grid\n")
gbm <- h2o.grid(
algorithm = "gbm", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, hyper_params = gbm_params, stopping_rounds = 3,
balance_classes = classification, class_sampling_factors = samp_factors,
search_criteria = list(strategy = "RandomDiscrete", max_models = 15, seed = grid_seed),
keep_cross_validation_models = FALSE, fold_assignment = "Modulo", parallelism = 0
)
for (model_id in gbm@model_ids) {
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
}
cat("GBM 5 default models\n")
gbm_1 <- h2o.gbm(
model_id = "GBM_1", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
balance_classes = classification, class_sampling_factors = samp_factors,
col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8, learn_rate = 0.1,
max_depth = 6, min_rows = 1, min_split_improvement = 1e-5, ntrees = 10000, sample_rate = 0.8,
keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
gbm_2 <- h2o.gbm(
model_id = "GBM_2", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
balance_classes = classification, class_sampling_factors = samp_factors,
col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8, learn_rate = 0.1,
max_depth = 7, min_rows = 10, min_split_improvement = 1e-5, ntrees = 10000, sample_rate = 0.8,
keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
gbm_3 <- h2o.gbm(
model_id = "GBM_3", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
balance_classes = classification, class_sampling_factors = samp_factors,
col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8, learn_rate = 0.1,
max_depth = 8, min_rows = 10, min_split_improvement = 1e-5, ntrees = 10000, sample_rate = 0.8,
keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
gbm_4 <- h2o.gbm(
model_id = "GBM_4", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
balance_classes = classification, class_sampling_factors = samp_factors,
col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8, learn_rate = 0.1,
max_depth = 10, min_rows = 10, min_split_improvement = 1e-5, ntrees = 10000, sample_rate = 0.8,
keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
# gbm_5 = h2o.gbm(model_id='GBM_5', x=x, y=y, training_frame=tmp, seed=1, nfolds=nfolds,
# keep_cross_validation_predictions=TRUE, stopping_rounds=3, score_tree_interval=5,
# balance_classes=classification, class_sampling_factors=samp_factors,
# col_sample_rate=0.8, col_sample_rate_per_tree=0.8, learn_rate=0.1,
# max_depth=15, min_rows=100, min_split_improvement=1e-5, ntrees=10000, sample_rate=0.8,
# keep_cross_validation_models=FALSE, fold_assignment='Modulo')
for (model_id in paste0("GBM_", 1:4)) {
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
}
cat("XGBoost grid\n")
xgboost <- h2o.grid(
algorithm = "xgboost", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, hyper_params = xgboost_params, stopping_rounds = 3,
search_criteria = list(strategy = "RandomDiscrete", max_models = 15, seed = grid_seed),
keep_cross_validation_models = FALSE, fold_assignment = "Modulo", parallelism = 0
)
for (model_id in xgboost@model_ids) {
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
}
cat("XGBoost 3 default models\n")
xgboost_1 <- h2o.xgboost(
model_id = "XGBoost_1", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
booster = "gbtree", col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8,
max_depth = 10, min_rows = 5, ntrees = 10000, reg_alpha = 0, reg_lambda = 1,
sample_rate = 0.6, keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
xgboost_2 <- h2o.xgboost(
model_id = "XGBoost_2", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
booster = "gbtree", col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8,
max_depth = 20, min_rows = 10, ntrees = 10000, reg_alpha = 0, reg_lambda = 1,
sample_rate = 0.6, keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
xgboost_3 <- h2o.xgboost(
model_id = "XGBoost_3", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
booster = "gbtree", col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8,
max_depth = 5, min_rows = 3, ntrees = 10000, reg_alpha = 0, reg_lambda = 1,
sample_rate = 0.8, keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
for (model_id in paste0("XGBoost_", 1:3)) {
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
}
cat("GLM\n")
glm <- h2o.glm(
model_id = "GLM", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, alpha = c(0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
balance_classes = classification, class_sampling_factors = samp_factors,
max_after_balance_size = 0.5, keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
tmp_path <- h2o.saveModel(h2o.getModel("GLM"), path = exp_dir, force = TRUE)
cat("DRF\n")
drf <- h2o.randomForest(
model_id = "DRF", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, ntrees = 10000,
score_tree_interval = 5, stopping_rounds = 3,
balance_classes = classification,
class_sampling_factors = samp_factors,
keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
tmp_path <- h2o.saveModel(h2o.getModel("DRF"), path = exp_dir, force = TRUE)
cat("XRT\n")
xrt <- h2o.randomForest(
model_id = "XRT", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, ntrees = 10000, histogram_type = "Random",
score_tree_interval = 5, stopping_rounds = 3,
balance_classes = classification,
class_sampling_factors = samp_factors,
keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
tmp_path <- h2o.saveModel(h2o.getModel("XRT"), path = exp_dir, force = TRUE)
# -----------------------
# get holdout predictions
# -----------------------
base_models <- as.list(c(
unlist(deeplearning_1@model_ids),
unlist(deeplearning_2@model_ids),
unlist(deeplearning_3@model_ids),
unlist(gbm@model_ids), paste0("GBM_", 1:4),
unlist(xgboost@model_ids), paste0("XGBoost_", 1:3),
"GLM",
"DRF",
"XRT"
))
for (model_id in base_models) {
if (!classification) {
res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id)),
col.names = model_id
))
} else {
res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id))[3],
col.names = model_id
))
}
}
# ----------------------
# super learner training
# ----------------------
sl_iter <- 0
cat(paste0("Super learner iteration ", sl_iter, " (", length(base_models), " models)\n"))
sl <- h2o.stackedEnsemble(
x = x, y = y, model_id = paste0("superlearner_iter_", sl_iter),
training_frame = tmp, seed = 1,
base_models = base_models,
metalearner_algorithm = "glm",
metalearner_nfolds = nfolds,
keep_levelone_frame = TRUE,
metalearner_params = list(
standardize = TRUE, keep_cross_validation_predictions = TRUE,
balance_classes = classification,
class_sampling_factors = samp_factors,
max_after_balance_size = 0.5
)
)
tmp_path <- h2o.saveModel(h2o.getModel(paste0("superlearner_iter_", sl_iter)), path = exp_dir, force = TRUE)
model_id <- paste0("metalearner_glm_superlearner_iter_", sl_iter)
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
if (!classification) {
res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id)),
col.names = paste0(model_id, "_", length(base_models), "_models")
))
} else {
res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id))[3],
col.names = paste0(model_id, "_", length(base_models), "_models")
))
}
# ----------------------------------
# super learner base model reduction
# ----------------------------------
while (TRUE) {
meta <- h2o.getModel(paste0("metalearner_glm_superlearner_iter_", sl_iter, "_cv_1"))
names <- meta@model$coefficients_table[, "names"]
coeffs <- (meta@model$coefficients_table[, "standardized_coefficients"] > 0)
for (j in 2:nfolds) {
meta <- h2o.getModel(paste0("metalearner_glm_superlearner_iter_", sl_iter, "_cv_", j))
names <- meta@model$coefficients_table[, "names"]
coeffs <- coeffs + (meta@model$coefficients_table[, "standardized_coefficients"] > 0)
}
base_models_ <- as.list(names[coeffs >= ceiling(nfolds / 2) & names != "Intercept"])
if (length(base_models_) == 0) {
cat("No base models passing the threshold\n\n")
break
}
if (sum(base_models %in% base_models_) == length(base_models)) {
cat("No further reduction of base models\n\n")
break
}
sl_iter <- sl_iter + 1
base_models <- base_models_
cat(paste0("Super learner iteration ", sl_iter, " (", length(base_models), " models)\n"))
sl <- h2o.stackedEnsemble(
x = x, y = y, model_id = paste0("superlearner_iter_", sl_iter),
training_frame = tmp, seed = 1,
base_models = base_models,
metalearner_algorithm = "glm",
metalearner_nfolds = nfolds,
keep_levelone_frame = TRUE,
metalearner_params = list(
standardize = TRUE, keep_cross_validation_predictions = TRUE,
balance_classes = classification,
class_sampling_factors = samp_factors,
max_after_balance_size = 0.5
)
)
tmp_path <- h2o.saveModel(h2o.getModel(paste0("superlearner_iter_", sl_iter)), path = exp_dir, force = TRUE)
model_id <- paste0("metalearner_glm_superlearner_iter_", sl_iter)
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
if (!classification) {
res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id)),
col.names = paste0(model_id, "_", length(base_models), "_models")
))
} else {
res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id))[3],
col.names = paste0(model_id, "_", length(base_models), "_models")
))
}
}
# -----------------------------------------
# super learner for homogeneous base models
# -----------------------------------------
# DeepLearning
base_models <- as.list(c(
unlist(deeplearning_1@model_ids),
unlist(deeplearning_2@model_ids),
unlist(deeplearning_3@model_ids)
))
cat(paste0("Super learner deep learning (", length(base_models), " models)\n"))
sl <- h2o.stackedEnsemble(
x = x, y = y, model_id = "superlearner_deeplearning",
training_frame = tmp, seed = 1,
base_models = base_models,
metalearner_algorithm = "glm",
metalearner_nfolds = nfolds,
keep_levelone_frame = TRUE,
metalearner_params = list(
standardize = TRUE, keep_cross_validation_predictions = TRUE,
balance_classes = classification,
class_sampling_factors = samp_factors,
max_after_balance_size = 0.5
)
)
tmp_path <- h2o.saveModel(h2o.getModel("superlearner_deeplearning"), path = exp_dir, force = TRUE)
model_id <- "metalearner_glm_superlearner_deeplearning"
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
if (!classification) {
res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id)),
col.names = paste0(model_id, "_", length(base_models), "_models")
))
} else {
res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id))[3],
col.names = paste0(model_id, "_", length(base_models), "_models")
))
}
# GBM
base_models <- as.list(c(unlist(gbm@model_ids), paste0("GBM_", 1:4)))
cat(paste0("Super learner GBM (", length(base_models), " models)\n"))
sl <- h2o.stackedEnsemble(
x = x, y = y, model_id = "superlearner_gbm",
training_frame = tmp, seed = 1,
base_models = base_models,
metalearner_algorithm = "glm",
metalearner_nfolds = nfolds,
keep_levelone_frame = TRUE,
metalearner_params = list(
standardize = TRUE, keep_cross_validation_predictions = TRUE,
balance_classes = classification,
class_sampling_factors = samp_factors,
max_after_balance_size = 0.5
)
)
tmp_path <- h2o.saveModel(h2o.getModel("superlearner_gbm"), path = exp_dir, force = TRUE)
model_id <- "metalearner_glm_superlearner_gbm"
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
if (!classification) {
res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id)),
col.names = paste0(model_id, "_", length(base_models), "_models")
))
} else {
res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id))[3],
col.names = paste0(model_id, "_", length(base_models), "_models")
))
}
# XGBoost
base_models <- as.list(c(unlist(xgboost@model_ids), paste0("XGBoost_", 1:3)))
cat(paste0("Super learner XGBoost (", length(base_models), " models)\n"))
sl <- h2o.stackedEnsemble(
x = x, y = y, model_id = "superlearner_xgboost",
training_frame = tmp, seed = 1,
base_models = base_models,
metalearner_algorithm = "glm",
metalearner_nfolds = nfolds,
keep_levelone_frame = TRUE,
metalearner_params = list(
standardize = TRUE, keep_cross_validation_predictions = TRUE,
balance_classes = classification,
class_sampling_factors = samp_factors,
max_after_balance_size = 0.5
)
)
tmp_path <- h2o.saveModel(h2o.getModel("superlearner_xgboost"), path = exp_dir, force = TRUE)
model_id <- "metalearner_glm_superlearner_xgboost"
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
if (!classification) {
res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id)),
col.names = paste0(model_id, "_", length(base_models), "_models")
))
} else {
res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id))[3],
col.names = paste0(model_id, "_", length(base_models), "_models")
))
}
write.csv(res, file = paste0(exp_dir, "/cv_holdout_predictions.csv"), row.names = FALSE)
cat("\n\n")
h2o.removeAll()
}