3 Model training
library(h2o)
library(MLmetrics)
library(mltools)
library(scales)
library(enrichvs)
library(rlist)
library(stringr)
library(digest)
library(DT)
library(reshape2)
library(ggplot2) # used by the coefficient plot in 3.3.8
library(dplyr) # provides the %>% pipe used below
3.2 General code/functions
# h2o.init(nthreads=-1, max_mem_size='100G', port=54321)
h2o.init(nthreads = -1)
h2o.removeAll()
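# Hyper-parameter search spaces for the random grid searches below. Each
# h2o.grid call samples combinations from these lists ("RandomDiscrete"
# strategy); epochs/ntrees are set very high and capped by early stopping.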
# DeepLearning Grid 1
deeplearning_params_1 <- list(
activation = "RectifierWithDropout",
epochs = 10000, # early stopping
epsilon = c(1e-6, 1e-7, 1e-8, 1e-9),
hidden = list(c(50), c(200), c(500)),
hidden_dropout_ratios = list(c(0.1), c(0.2), c(0.3), c(0.4), c(0.5)),
input_dropout_ratio = c(0, 0.05, 0.1, 0.15, 0.2),
rho = c(0.9, 0.95, 0.99)
)
# DeepLearning Grid 2
deeplearning_params_2 <- list(
activation = "RectifierWithDropout",
epochs = 10000, # early stopping
epsilon = c(1e-6, 1e-7, 1e-8, 1e-9),
hidden = list(c(50, 50), c(200, 200), c(500, 500)),
hidden_dropout_ratios = list(
c(0.1, 0.1), c(0.2, 0.2), c(0.3, 0.3),
c(0.4, 0.4), c(0.5, 0.5)
),
input_dropout_ratio = c(0, 0.05, 0.1, 0.15, 0.2),
rho = c(0.9, 0.95, 0.99)
)
# DeepLearning Grid 3
deeplearning_params_3 <- list(
activation = "RectifierWithDropout",
epochs = 10000, # early stopping
epsilon = c(1e-6, 1e-7, 1e-8, 1e-9),
hidden = list(c(50, 50, 50), c(200, 200, 200), c(500, 500, 500)),
hidden_dropout_ratios = list(
c(0.1, 0.1, 0.1), c(0.2, 0.2, 0.2),
c(0.3, 0.3, 0.3), c(0.4, 0.4, 0.4),
c(0.5, 0.5, 0.5)
),
input_dropout_ratio = c(0, 0.05, 0.1, 0.15, 0.2),
rho = c(0.9, 0.95, 0.99)
)
# GBM
gbm_params <- list(
col_sample_rate = c(0.4, 0.7, 1.0),
col_sample_rate_per_tree = c(0.4, 0.7, 1.0),
learn_rate = 0.1,
max_depth = c(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17),
min_rows = c(1, 5, 10, 15, 30, 100),
min_split_improvement = c(1e-4, 1e-5),
ntrees = 10000, # early stopping
sample_rate = c(0.5, 0.6, 0.7, 0.8, 0.9, 1.0)
)
# XGBoost
xgboost_params <- list(
booster = c("gbtree", "dart"),
col_sample_rate = c(0.6, 0.8, 1.0),
col_sample_rate_per_tree = c(0.7, 0.8, 0.9, 1.0),
max_depth = c(5, 10, 15, 20),
min_rows = c(0.01, 0.1, 1.0, 3.0, 5.0, 10.0, 15.0, 20.0),
ntrees = 10000, # early stopping
reg_alpha = c(0.001, 0.01, 0.1, 1, 10, 100),
reg_lambda = c(0.001, 0.01, 0.1, 0.5, 1),
sample_rate = c(0.6, 0.8, 1.0)
)
balanced_accuracy <- function(y_pred, y_true) {
# Use named arguments: MLmetrics::Sensitivity/Specificity expect y_true first
return(mean(c(
Sensitivity(y_true = y_true, y_pred = y_pred),
Specificity(y_true = y_true, y_pred = y_pred)
), na.rm = TRUE))
}
sem <- function(x, na.rm = TRUE) sd(x, na.rm = na.rm) / sqrt(length(na.omit(x)))
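A quick sanity check of these two helpers on toy vectors (illustrative values only, not study data):
# Perfect predictions give sensitivity = specificity = 1, so balanced accuracy = 1
balanced_accuracy(y_pred = c(1, 1, 1, 0, 0, 0), y_true = c(1, 1, 1, 0, 0, 0))
# Standard error of the mean: sd / sqrt(n) = 1.581 / 2.236, approx. 0.707
sem(c(1, 2, 3, 4, 5))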
statistical_testing <- function(data, metric_vec, decreasing_vec, p_value_threshold = 0.05, num_entries = 10,
output_path = NULL) {
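# For each metric: rank models by their CV mean, compare every model against
# the top-ranked model with a Wilcoxon rank-sum test over the 10 CV folds,
# and adjust the resulting p values with Benjamini-Hochberg.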
for (i in 1:length(metric_vec)) {
metric <- metric_vec[i]
ranked_models <- rownames(data[order(data[, paste(metric, "mean")], decreasing = decreasing_vec[i]), ])
pval <- c()
for (j in 2:length(ranked_models)) {
if (any(is.na(data[ranked_models[j], paste(metric, "CV", 1:10)]))) {
pval <- c(pval, NA)
} else {
pval <- c(pval, wilcox.test(as.numeric(data[ranked_models[1], paste(metric, "CV", 1:10)]),
as.numeric(data[ranked_models[j], paste(metric, "CV", 1:10)]),
exact = FALSE
)$p.value)
}
}
adj_pval <- c(NA, p.adjust(pval, method = "BH", n = length(pval)))
df <- data.frame(adj_pval)
rownames(df) <- ranked_models
colnames(df) <- paste(metric, "adj. <i>p</i> value")
data <- merge(data, df, by = "row.names", all = TRUE)
rownames(data) <- data$Row.names
data$Row.names <- NULL
}
for (i in 1:length(metric_vec)) {
if (decreasing_vec[i]) {
data[paste(metric_vec[i], "rank")] <- rank(-data[, paste(metric_vec[i], "mean")], ties.method = "average")
} else {
data[paste(metric_vec[i], "rank")] <- rank(data[, paste(metric_vec[i], "mean")], ties.method = "average")
}
}
data["Rank sum"] <- rowSums(data[(ncol(data) - length(metric_vec) + 1):ncol(data)])
data <- data[order(data$`Rank sum`), ]
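# Keep only "competitive" models: those not significantly worse than the
# top-ranked model on any metric (all adjusted p values >= threshold; NAs,
# including the top model's own row, count as competitive).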
competitive <- rowSums(data[paste(metric_vec, "adj. <i>p</i> value")] < p_value_threshold) == 0
competitive[is.na(competitive)] <- TRUE
data <- data[competitive, ]
if (!is.null(output_path)) {
data[c(
paste(metric_vec, "adj. <i>p</i> value"), paste(metric_vec, "rank"), "Rank sum",
melt(sapply(metric_vec, function(x) paste(x, c("mean", "s.e.m."))))$value
)] %>%
round(digits = 4) %>%
write.csv(., file = output_path)
}
data[c(paste(metric_vec, "adj. <i>p</i> value"), paste(metric_vec, "rank"), "Rank sum")] %>%
round(digits = 4) %>%
datatable(options = list(pageLength = num_entries), escape = FALSE)
}
3.3 Melanin binding (regression)
3.3.1 Model training
train_mb_models <- function(train_set, exp_dir, prefix, nfolds = 10, grid_seed = 1) {
tmp <- as.h2o(train_set, destination_frame = prefix)
y <- "log_intensity"
x <- setdiff(names(tmp), y)
res <- as.data.frame(tmp$log_intensity)
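# res accumulates the observed target plus one column of CV holdout
# predictions per trained model; it is written to csv at the end.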
# -------------------
# base model training
# -------------------
cat("Deep learning grid 1\n")
deeplearning_1 <- h2o.grid(
algorithm = "deeplearning", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, hyper_params = deeplearning_params_1,
stopping_rounds = 3, keep_cross_validation_models = FALSE,
search_criteria = list(
strategy = "RandomDiscrete",
max_models = 100, seed = grid_seed
),
fold_assignment = "Modulo", parallelism = 0
)
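# fold_assignment = "Modulo" deterministically assigns row i to fold
# i %% nfolds, so every model (and the evaluation code later on) uses
# identical folds.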
for (model_id in deeplearning_1@model_ids) {
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
}
cat("Deep learning grid 2\n")
deeplearning_2 <- h2o.grid(
algorithm = "deeplearning", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, hyper_params = deeplearning_params_2,
stopping_rounds = 3, keep_cross_validation_models = FALSE,
search_criteria = list(
strategy = "RandomDiscrete",
max_models = 100, seed = grid_seed
),
fold_assignment = "Modulo", parallelism = 0
)
for (model_id in deeplearning_2@model_ids) {
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
}
cat("Deep learning grid 3\n")
deeplearning_3 <- h2o.grid(
algorithm = "deeplearning", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, hyper_params = deeplearning_params_3,
stopping_rounds = 3, keep_cross_validation_models = FALSE,
search_criteria = list(
strategy = "RandomDiscrete",
max_models = 100, seed = grid_seed
),
fold_assignment = "Modulo", parallelism = 0
)
for (model_id in deeplearning_3@model_ids) {
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
}
cat("GBM grid\n")
gbm <- h2o.grid(
algorithm = "gbm", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, hyper_params = gbm_params, stopping_rounds = 3,
search_criteria = list(strategy = "RandomDiscrete", max_models = 300, seed = grid_seed),
keep_cross_validation_models = FALSE, fold_assignment = "Modulo", parallelism = 0
)
for (model_id in gbm@model_ids) {
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
}
cat("GBM 5 default models\n")
gbm_1 <- h2o.gbm(
model_id = "GBM_1", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8, learn_rate = 0.1,
max_depth = 6, min_rows = 1, min_split_improvement = 1e-5, ntrees = 10000, sample_rate = 0.8,
keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
gbm_2 <- h2o.gbm(
model_id = "GBM_2", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8, learn_rate = 0.1,
max_depth = 7, min_rows = 10, min_split_improvement = 1e-5, ntrees = 10000, sample_rate = 0.8,
keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
gbm_3 <- h2o.gbm(
model_id = "GBM_3", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8, learn_rate = 0.1,
max_depth = 8, min_rows = 10, min_split_improvement = 1e-5, ntrees = 10000, sample_rate = 0.8,
keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
gbm_4 <- h2o.gbm(
model_id = "GBM_4", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8, learn_rate = 0.1,
max_depth = 10, min_rows = 10, min_split_improvement = 1e-5, ntrees = 10000, sample_rate = 0.8,
keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
gbm_5 <- h2o.gbm(
model_id = "GBM_5", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8, learn_rate = 0.1,
max_depth = 15, min_rows = 100, min_split_improvement = 1e-5, ntrees = 10000, sample_rate = 0.8,
keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
for (model_id in paste0("GBM_", 1:5)) {
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
}
cat("XGBoost grid\n")
xgboost <- h2o.grid(
algorithm = "xgboost", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, hyper_params = xgboost_params, stopping_rounds = 3,
search_criteria = list(strategy = "RandomDiscrete", max_models = 300, seed = grid_seed),
keep_cross_validation_models = FALSE, fold_assignment = "Modulo", parallelism = 0
)
for (model_id in xgboost@model_ids) {
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
}
cat("XGBoost 3 default models\n")
xgboost_1 <- h2o.xgboost(
model_id = "XGBoost_1", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
booster = "gbtree", col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8,
max_depth = 10, min_rows = 5, ntrees = 10000, reg_alpha = 0, reg_lambda = 1,
sample_rate = 0.6, keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
xgboost_2 <- h2o.xgboost(
model_id = "XGBoost_2", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
booster = "gbtree", col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8,
max_depth = 20, min_rows = 10, ntrees = 10000, reg_alpha = 0, reg_lambda = 1,
sample_rate = 0.6, keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
xgboost_3 <- h2o.xgboost(
model_id = "XGBoost_3", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
booster = "gbtree", col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8,
max_depth = 5, min_rows = 3, ntrees = 10000, reg_alpha = 0, reg_lambda = 1,
sample_rate = 0.8, keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
for (model_id in paste0("XGBoost_", 1:3)) {
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
}
cat("GLM\n")
glm <- h2o.glm(
model_id = "GLM", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, alpha = c(0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
tmp_path <- h2o.saveModel(h2o.getModel("GLM"), path = exp_dir, force = TRUE)
cat("DRF\n")
drf <- h2o.randomForest(
model_id = "DRF", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, ntrees = 10000,
score_tree_interval = 5, stopping_rounds = 3,
keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
tmp_path <- h2o.saveModel(h2o.getModel("DRF"), path = exp_dir, force = TRUE)
cat("XRT\n")
xrt <- h2o.randomForest(
model_id = "XRT", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, ntrees = 10000, histogram_type = "Random",
score_tree_interval = 5, stopping_rounds = 3,
keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
tmp_path <- h2o.saveModel(h2o.getModel("XRT"), path = exp_dir, force = TRUE)
# -----------------------
# get holdout predictions
# -----------------------
base_models <- as.list(c(
unlist(deeplearning_1@model_ids),
unlist(deeplearning_2@model_ids),
unlist(deeplearning_3@model_ids),
unlist(gbm@model_ids), paste0("GBM_", 1:5),
unlist(xgboost@model_ids), paste0("XGBoost_", 1:3),
"GLM",
"DRF",
"XRT"
))
for (model_id in base_models) {
res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id)),
col.names = model_id
))
}
# ----------------------
# super learner training
# ----------------------
sl_iter <- 0
cat(paste0("Super learner iteration ", sl_iter, " (", length(base_models), " models)\n"))
sl <- h2o.stackedEnsemble(
x = x, y = y, model_id = paste0("superlearner_iter_", sl_iter),
training_frame = tmp, seed = 1,
base_models = base_models,
metalearner_algorithm = "glm",
metalearner_nfolds = nfolds,
keep_levelone_frame = TRUE,
metalearner_params = list(standardize = TRUE, keep_cross_validation_predictions = TRUE)
)
tmp_path <- h2o.saveModel(h2o.getModel(paste0("superlearner_iter_", sl_iter)), path = exp_dir, force = TRUE)
model_id <- paste0("metalearner_glm_superlearner_iter_", sl_iter)
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id)),
col.names = paste0(model_id, "_", length(base_models), "_models")
))
# ----------------------------------
# super learner base model reduction
# ----------------------------------
while (TRUE) {
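# Across the metalearner's CV folds, count how often each base model gets a
# positive standardized coefficient; keep models positive in at least half
# of the folds and refit the super learner on the reduced set.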
meta <- h2o.getModel(paste0("metalearner_glm_superlearner_iter_", sl_iter, "_cv_1"))
names <- meta@model$coefficients_table[, "names"]
coeffs <- (meta@model$coefficients_table[, "standardized_coefficients"] > 0)
for (j in 2:nfolds) {
meta <- h2o.getModel(paste0("metalearner_glm_superlearner_iter_", sl_iter, "_cv_", j))
names <- meta@model$coefficients_table[, "names"]
coeffs <- coeffs + (meta@model$coefficients_table[, "standardized_coefficients"] > 0)
}
base_models_ <- as.list(names[coeffs >= ceiling(nfolds / 2) & names != "Intercept"])
if (length(base_models_) == 0) {
cat("No base models passing the threshold\n\n")
break
}
if (sum(base_models %in% base_models_) == length(base_models)) {
cat("No further reduction of base models\n\n")
break
}
sl_iter <- sl_iter + 1
base_models <- base_models_
cat(paste0("Super learner iteration ", sl_iter, " (", length(base_models), " models)\n"))
sl <- h2o.stackedEnsemble(
x = x, y = y, model_id = paste0("superlearner_iter_", sl_iter),
training_frame = tmp, seed = 1,
base_models = base_models,
metalearner_algorithm = "glm",
metalearner_nfolds = nfolds,
keep_levelone_frame = TRUE,
metalearner_params = list(standardize = TRUE, keep_cross_validation_predictions = TRUE)
)
tmp_path <- h2o.saveModel(h2o.getModel(paste0("superlearner_iter_", sl_iter)), path = exp_dir, force = TRUE)
model_id <- paste0("metalearner_glm_superlearner_iter_", sl_iter)
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id)),
col.names = paste0(model_id, "_", length(base_models), "_models")
))
}
# -----------------------------------------
# super learner for homogeneous base models
# -----------------------------------------
# DeepLearning
base_models <- as.list(c(
unlist(deeplearning_1@model_ids),
unlist(deeplearning_2@model_ids),
unlist(deeplearning_3@model_ids)
))
cat(paste0("Super learner deep learning (", length(base_models), " models)\n"))
sl <- h2o.stackedEnsemble(
x = x, y = y, model_id = "superlearner_deeplearning",
training_frame = tmp, seed = 1,
base_models = base_models,
metalearner_algorithm = "glm",
metalearner_nfolds = nfolds,
keep_levelone_frame = TRUE,
metalearner_params = list(standardize = TRUE, keep_cross_validation_predictions = TRUE)
)
tmp_path <- h2o.saveModel(h2o.getModel("superlearner_deeplearning"), path = exp_dir, force = TRUE)
model_id <- "metalearner_glm_superlearner_deeplearning"
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id)),
col.names = paste0(model_id, "_", length(base_models), "_models")
))
# GBM
base_models <- as.list(c(unlist(gbm@model_ids), paste0("GBM_", 1:5)))
cat(paste0("Super learner GBM (", length(base_models), " models)\n"))
sl <- h2o.stackedEnsemble(
x = x, y = y, model_id = "superlearner_gbm",
training_frame = tmp, seed = 1,
base_models = base_models,
metalearner_algorithm = "glm",
metalearner_nfolds = nfolds,
keep_levelone_frame = TRUE,
metalearner_params = list(standardize = TRUE, keep_cross_validation_predictions = TRUE)
)
tmp_path <- h2o.saveModel(h2o.getModel("superlearner_gbm"), path = exp_dir, force = TRUE)
model_id <- "metalearner_glm_superlearner_gbm"
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id)),
col.names = paste0(model_id, "_", length(base_models), "_models")
))
# XGBoost
base_models <- as.list(c(unlist(xgboost@model_ids), paste0("XGBoost_", 1:3)))
cat(paste0("Super learner XGBoost (", length(base_models), " models)\n"))
sl <- h2o.stackedEnsemble(
x = x, y = y, model_id = "superlearner_xgboost",
training_frame = tmp, seed = 1,
base_models = base_models,
metalearner_algorithm = "glm",
metalearner_nfolds = nfolds,
keep_levelone_frame = TRUE,
metalearner_params = list(standardize = TRUE, keep_cross_validation_predictions = TRUE)
)
tmp_path <- h2o.saveModel(h2o.getModel("superlearner_xgboost"), path = exp_dir, force = TRUE)
model_id <- "metalearner_glm_superlearner_xgboost"
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id)),
col.names = paste0(model_id, "_", length(base_models), "_models")
))
write.csv(res, file = paste0(exp_dir, "/cv_holdout_predictions.csv"), row.names = FALSE)
cat("\n\n")
h2o.removeAll()
}
3.3.2 Inner cross-validation
load(file = "./rdata/var_reduct_mb_train_test_splits.RData")
for (i in 1:10) {
cat(paste0("Outer training set ", i, "\n"))
prefix <- paste0("outer_", i)
exp_dir <- paste0("/Users/renee/Downloads/melanin_binding/", prefix)
dir.create(exp_dir)
train_mb_models(train_set = outer_splits[[i]][["train"]], exp_dir = exp_dir, prefix = prefix)
}
3.3.3 Training on whole data set
load(file = "./rdata/var_reduct_mb_train_test_splits.RData")
prefix <- "whole_data_set"
exp_dir <- paste0("/Users/renee/Downloads/melanin_binding/", prefix)
dir.create(exp_dir)
train_mb_models(train_set = mb_data, exp_dir = exp_dir, prefix = prefix)
# Keep track of grid models
dl_grid <- list()
gbm_grid <- list()
xgboost_grid <- list()
dl_grid_params <- list()
gbm_grid_params <- list()
xgboost_grid_params <- list()
for (i in 1:11) {
if (i == 11) {
cat(paste0("Whole data set\n"))
prefix <- "whole_data_set"
} else {
cat(paste0("Outer training set ", i, "\n"))
prefix <- paste0("outer_", i)
}
dir <- paste0("/Users/renee/Downloads/melanin_binding/", prefix)
files <- list.files(dir)
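# Group the saved grid models by hyper-parameter combination: models trained
# with identical settings (identified via a sha1 hash of their parameters)
# are treated as the same grid model across all outer splits and the
# whole-data run.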
# Deep learning
dl <- files[str_detect(files, "DeepLearning_model")]
for (m in dl) {
model <- h2o.loadModel(paste0(dir, "/", m))
hs <- sha1(paste(c(
model@allparameters$epsilon,
model@allparameters$hidden,
model@allparameters$hidden_dropout_ratios,
model@allparameters$input_dropout_ratio,
model@allparameters$rho
), collapse = " "))
if (hs %in% names(dl_grid)) {
dl_grid[[hs]] <- c(dl_grid[[hs]], m)
} else {
dl_grid[[hs]] <- c(m)
dl_grid_params <- list.append(
dl_grid_params,
c(
"epsilon" = model@allparameters$epsilon,
"hidden" = paste0("[", paste(model@allparameters$hidden, collapse = ","), "]"),
"hidden_dropout_ratios" = paste0(
"[",
paste(model@allparameters$hidden_dropout_ratios,
collapse = ","
), "]"
),
"input_dropout_ratio" = model@allparameters$input_dropout_ratio,
"rho" = model@allparameters$rho
)
)
}
}
h2o.removeAll()
# GBM
gbm <- files[str_detect(files, "GBM_model")]
for (m in gbm) {
model <- h2o.loadModel(paste0(dir, "/", m))
hs <- sha1(paste(c(
model@allparameters$col_sample_rate,
model@allparameters$col_sample_rate_per_tree,
model@allparameters$max_depth,
model@allparameters$min_rows,
model@allparameters$min_split_improvement,
model@allparameters$sample_rate
), collapse = " "))
if (hs %in% names(gbm_grid)) {
gbm_grid[[hs]] <- c(gbm_grid[[hs]], m)
} else {
gbm_grid[[hs]] <- c(m)
gbm_grid_params <- list.append(
gbm_grid_params,
c(
"col_sample_rate" = model@allparameters$col_sample_rate,
"col_sample_rate_per_tree" = model@allparameters$col_sample_rate_per_tree,
"max_depth" = model@allparameters$max_depth,
"min_rows" = model@allparameters$min_rows,
"min_split_improvement" = model@allparameters$min_split_improvement,
"sample_rate" = model@allparameters$sample_rate
)
)
}
}
h2o.removeAll()
# XGBoost
xgboost <- files[str_detect(files, "XGBoost_model")]
for (m in xgboost) {
model <- h2o.loadModel(paste0(dir, "/", m))
hs <- sha1(paste(c(
model@allparameters$booster,
model@allparameters$col_sample_rate,
model@allparameters$col_sample_rate_per_tree,
model@allparameters$max_depth,
model@allparameters$min_rows,
model@allparameters$reg_alpha,
model@allparameters$reg_lambda,
model@allparameters$sample_rate
), collapse = " "))
if (hs %in% names(xgboost_grid)) {
xgboost_grid[[hs]] <- c(xgboost_grid[[hs]], m)
} else {
xgboost_grid[[hs]] <- c(m)
xgboost_grid_params <- list.append(
xgboost_grid_params,
c(
"booster" = model@allparameters$booster,
"col_sample_rate" = model@allparameters$col_sample_rate,
"col_sample_rate_per_tree" = model@allparameters$col_sample_rate_per_tree,
"max_depth" = model@allparameters$max_depth,
"min_rows" = model@allparameters$min_rows,
"reg_alpha" = model@allparameters$reg_alpha,
"reg_lambda" = model@allparameters$reg_lambda,
"sample_rate" = model@allparameters$sample_rate
)
)
}
}
h2o.removeAll()
}
dl_grid_params <- as.data.frame(t(data.frame(dl_grid_params)))
rownames(dl_grid_params) <- paste("Neural network grid model", 1:nrow(dl_grid_params))
gbm_grid_params <- as.data.frame(t(data.frame(gbm_grid_params)))
rownames(gbm_grid_params) <- paste("GBM grid model", 1:nrow(gbm_grid_params))
xgboost_grid_params <- as.data.frame(t(data.frame(xgboost_grid_params)))
rownames(xgboost_grid_params) <- paste("XGBoost grid model", 1:nrow(xgboost_grid_params))
write.csv(dl_grid_params, "./other_data/melanin_binding/neural_network_grid_params.csv")
write.csv(gbm_grid_params, "./other_data/melanin_binding/gbm_grid_params.csv")
write.csv(xgboost_grid_params, "./other_data/melanin_binding/xgboost_grid_params.csv")
save(dl_grid, gbm_grid, xgboost_grid, file = "./rdata/model_training_grid_models_mb.RData")
3.3.4 Model evaluation
model_evaluation <- function(holdout_pred, fold_assign, grid_name, grid_meta = NULL) {
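# Per-fold MAE, RMSE and R^2 computed from the saved CV holdout predictions;
# MAE and RMSE are linearly rescaled to 0-100 relative to the range of the
# observed log-intensities, making values comparable across experiments.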
load(file = "./rdata/var_reduct_mb_train_test_splits.RData")
res_ <- list()
model_num <- c()
if (startsWith(grid_name, "Super learner")) {
if (grid_name == "Super learner all models") {
m <- colnames(holdout_pred)[startsWith(colnames(holdout_pred), "metalearner_glm_superlearner_iter_0")]
} else if (grid_name == "Super learner final") {
sl_models <- colnames(holdout_pred)[startsWith(colnames(holdout_pred), "metalearner_glm_superlearner_iter")]
m <- sl_models[length(sl_models)]
} else if (grid_name == "Super learner neural network") {
m <- colnames(holdout_pred)[startsWith(colnames(holdout_pred), "metalearner_glm_superlearner_deeplearning")]
} else if (grid_name == "Super learner GBM") {
m <- colnames(holdout_pred)[startsWith(colnames(holdout_pred), "metalearner_glm_superlearner_gbm")]
} else if (grid_name == "Super learner XGBoost") {
m <- colnames(holdout_pred)[startsWith(colnames(holdout_pred), "metalearner_glm_superlearner_xgboost")]
}
mae <- c()
rmse <- c()
R2 <- c()
for (k in 1:10) {
y_true <- holdout_pred[fold_assign == k, "log_intensity"]
y_pred <- holdout_pred[fold_assign == k, m]
mae <- c(mae, MAE(y_pred = y_pred, y_true = y_true))
rmse <- c(rmse, RMSE(y_pred = y_pred, y_true = y_true))
R2 <- c(R2, R2_Score(y_pred = y_pred, y_true = y_true))
}
# Re-scale to 0-100
mae <- rescale(mae, to = c(0, 100), from = range(mb_data$log_intensity))
rmse <- rescale(rmse, to = c(0, 100), from = range(mb_data$log_intensity))
res_ <- list.append(res_, c(
mean(mae, na.rm = TRUE), sem(mae),
mean(rmse, na.rm = TRUE), sem(rmse),
mean(R2, na.rm = TRUE), sem(R2),
mae, rmse, R2
))
} else {
for (j in 1:length(grid_meta)) {
g <- grid_meta[[j]]
m <- intersect(colnames(holdout_pred), g)
if (length(m) == 1) {
model_num <- c(model_num, j)
mae <- c()
rmse <- c()
R2 <- c()
for (k in 1:10) {
y_true <- holdout_pred[fold_assign == k, "log_intensity"]
y_pred <- holdout_pred[fold_assign == k, m]
mae <- c(mae, MAE(y_pred = y_pred, y_true = y_true))
rmse <- c(rmse, RMSE(y_pred = y_pred, y_true = y_true))
R2 <- c(R2, R2_Score(y_pred = y_pred, y_true = y_true))
}
# Re-scale to 0-100
mae <- rescale(mae, to = c(0, 100), from = range(mb_data$log_intensity))
rmse <- rescale(rmse, to = c(0, 100), from = range(mb_data$log_intensity))
res_ <- list.append(res_, c(
mean(mae, na.rm = TRUE), sem(mae),
mean(rmse, na.rm = TRUE), sem(rmse),
mean(R2, na.rm = TRUE), sem(R2),
mae, rmse, R2
))
}
}
}
res <- as.data.frame(t(data.frame(res_)))
colnames(res) <- c(
"Norm. MAE mean", "Norm. MAE s.e.m.", "Norm. RMSE mean", "Norm. RMSE s.e.m.",
"R^2 mean", "R^2 s.e.m.",
paste("Norm. MAE CV", 1:10), paste("Norm. RMSE CV", 1:10), paste("R^2 CV", 1:10)
)
if (nrow(res) == 1) {
rownames(res) <- grid_name
} else {
rownames(res) <- paste(grid_name, model_num)
}
return(res)
}
3.3.5 Inner loop model selection
load(file = "./rdata/model_training_grid_models_mb.RData")
for (i in 1:10) {
holdout_pred <- read.csv(paste0("./other_data/melanin_binding/outer_", i, "/cv_holdout_predictions.csv"))
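# Reconstruct the deterministic "Modulo" fold assignment used during
# training: row i belongs to fold ((i - 1) %% 10) + 1.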
fold_assign <- rep(1:10, ceiling(nrow(holdout_pred) / 10))[1:nrow(holdout_pred)]
data_ <- list()
# Deep learning grid models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
grid_name = "Neural network grid model",
grid_meta = dl_grid
))
# GBM grid models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
grid_name = "GBM grid model",
grid_meta = gbm_grid
))
# GBM default models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
grid_name = "GBM model",
grid_meta = as.list(paste0("GBM_", 1:5))
))
# XGBoost grid models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
grid_name = "XGBoost grid model",
grid_meta = xgboost_grid
))
# XGBoost default models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
grid_name = "XGBoost model",
grid_meta = as.list(paste0("XGBoost_", 1:3))
))
# GLM
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "GLM model", grid_meta = list("GLM")))
# DRF
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "DRF model", grid_meta = list("DRF")))
# XRT
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "XRT model", grid_meta = list("XRT")))
# Super learner all
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner all models"))
# Super learner final
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner final"))
# Super learner deep learning
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner neural network"))
# Super learner GBM
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner GBM"))
# Super learner XGBoost
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner XGBoost"))
data <- do.call(rbind, data_)
write.csv(data, paste0("./other_data/melanin_binding/outer_", i, "/cv_res.csv"))
}
3.3.6 Final evaluation
load(file = "./rdata/var_reduct_mb_train_test_splits.RData")
dir <- "/Users/renee/Downloads/melanin_binding"
selected_models <- rep("Super learner final", 10)
mae <- c()
rmse <- c()
R2 <- c()
for (i in 1:10) {
holdout_pred <- read.csv(paste0("./other_data/melanin_binding/outer_", i, "/cv_holdout_predictions.csv"))
if (startsWith(selected_models[i], "Neural network grid model")) {
grid_num <- as.integer(str_extract(selected_models[i], "[0-9]+"))
m <- intersect(colnames(holdout_pred), dl_grid[[grid_num]])
model <- h2o.loadModel(paste0(dir, "/outer_", i, "/", m))
} else if (startsWith(selected_models[i], "GBM grid model")) {
grid_num <- as.integer(str_extract(selected_models[i], "[0-9]+"))
m <- intersect(colnames(holdout_pred), gbm_grid[[grid_num]])
model <- h2o.loadModel(paste0(dir, "/outer_", i, "/", m))
} else if (startsWith(selected_models[i], "GBM model")) {
grid_num <- as.integer(str_extract(selected_models[i], "[0-9]+"))
model <- h2o.loadModel(paste0(dir, "/outer_", i, "/GBM_", grid_num))
} else if (startsWith(selected_models[i], "XGBoost grid model")) {
grid_num <- as.integer(str_extract(selected_models[i], "[0-9]+"))
m <- intersect(colnames(holdout_pred), xgboost_grid[[grid_num]])
model <- h2o.loadModel(paste0(dir, "/outer_", i, "/", m))
} else if (startsWith(selected_models[i], "XGBoost model")) {
grid_num <- as.integer(str_extract(selected_models[i], "[0-9]+"))
model <- h2o.loadModel(paste0(dir, "/outer_", i, "/XGBoost_", grid_num))
} else if (selected_models[i] == "Super learner all models") {
model <- h2o.loadModel(paste0(dir, "/outer_", i, "/superlearner_iter_0"))
} else if (selected_models[i] == "Super learner final") {
files <- list.files(paste0(dir, "/outer_", i))
files <- files[startsWith(files, "superlearner_iter")]
model <- h2o.loadModel(paste0(dir, "/outer_", i, "/", files[length(files)]))
} else if (selected_models[i] == "Super learner neural network") {
model <- h2o.loadModel(paste0(dir, "/outer_", i, "/superlearner_deeplearning"))
} else if (selected_models[i] == "Super learner GBM") {
model <- h2o.loadModel(paste0(dir, "/outer_", i, "/superlearner_gbm"))
} else if (selected_models[i] == "Super learner XGBoost") {
model <- h2o.loadModel(paste0(dir, "/outer_", i, "/superlearner_xgboost"))
} else {
model <- h2o.loadModel(paste0(dir, "/outer_", i, "/", unlist(strsplit(selected_models[i], " "))[1]))
}
tmp <- as.h2o(subset(outer_splits[[i]][["test"]], select = -log_intensity))
y_true <- outer_splits[[i]][["test"]]$log_intensity
y_pred <- as.data.frame(h2o.predict(model, tmp))[, 1]
mae <- c(mae, MAE(y_pred = y_pred, y_true = y_true))
rmse <- c(rmse, RMSE(y_pred = y_pred, y_true = y_true))
R2 <- c(R2, R2_Score(y_pred = y_pred, y_true = y_true))
}
# Re-scale to 0-100
mae <- rescale(mae, to = c(0, 100), from = range(mb_data$log_intensity))
rmse <- rescale(rmse, to = c(0, 100), from = range(mb_data$log_intensity))
h2o.removeAll()
save(mae, rmse, R2, file = "./rdata/final_evaluation_mb.RData")
load(file = "./rdata/final_evaluation_mb.RData")
data.frame(
Metric = c("Norm. MAE", "Norm. RMSE", "R<sup>2</sup>"),
`Mean ± s.e.m.` = c(
paste0(round(mean(mae), 3), " ± ", round(sem(mae), 3)),
paste0(round(mean(rmse), 3), " ± ", round(sem(rmse), 3)),
paste0(round(mean(R2), 3), " ± ", round(sem(R2), 3))
),
check.names = FALSE
) %>%
datatable(escape = FALSE, rownames = FALSE)
3.3.7 Final model selection
load(file = "./rdata/model_training_grid_models_mb.RData")
holdout_pred <- read.csv("./other_data/melanin_binding/whole_data_set/cv_holdout_predictions.csv")
fold_assign <- rep(1:10, ceiling(nrow(holdout_pred) / 10))[1:nrow(holdout_pred)]
data_ <- list()
# Deep learning grid models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
grid_name = "Neural network grid model",
grid_meta = dl_grid
))
# GBM grid models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
grid_name = "GBM grid model",
grid_meta = gbm_grid
))
# GBM default models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
grid_name = "GBM model",
grid_meta = as.list(paste0("GBM_", 1:5))
))
# XGBoost grid models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
grid_name = "XGBoost grid model",
grid_meta = xgboost_grid
))
# XGBoost default models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
grid_name = "XGBoost model",
grid_meta = as.list(paste0("XGBoost_", 1:3))
))
# GLM
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "GLM model", grid_meta = list("GLM")))
# DRF
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "DRF model", grid_meta = list("DRF")))
# XRT
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "XRT model", grid_meta = list("XRT")))
# Super learner all
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner all models"))
# Super learner final
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner final"))
# Super learner deep learning
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner neural network"))
# Super learner GBM
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner GBM"))
# Super learner XGBoost
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner XGBoost"))
data <- do.call(rbind, data_)
write.csv(data, "./other_data/melanin_binding/whole_data_set/cv_res.csv")
data <- read.csv("./other_data/melanin_binding/whole_data_set/cv_res.csv", row.names = 1, check.names = FALSE)
statistical_testing(data, metric_vec = c("Norm. MAE", "Norm. RMSE", "R^2"), decreasing_vec = c(FALSE, FALSE, TRUE))
# statistical_testing(data, metric_vec = c("Norm. MAE", "Norm. RMSE", "R^2"), decreasing_vec = c(FALSE, FALSE, TRUE),
#                     output_path = "./other_data/melanin_binding/whole_data_set/cv_res_statistical_testing.csv")
3.3.8 Final reduced super learner
h2o.init(nthreads = -1)
model_path <- "/Users/renee/Downloads/melanin_binding/whole_data_set/superlearner_iter_3"
model <- h2o.loadModel(model_path)
load(file = "./rdata/model_training_grid_models_mb.RData")
data <- read.csv("./other_data/melanin_binding/whole_data_set/cv_res.csv", row.names = 1, check.names = FALSE)
df <- as.data.frame(model@model$metalearner_model@model$coefficients_table)[c("names", "standardized_coefficients")][-1, ]
for (i in 1:nrow(df)) {
name <- df$names[i]
if (startsWith(name, "DeepLearning_model")) {
for (j in 1:length(dl_grid)) {
if (name %in% dl_grid[[j]]) break
}
df$names[i] <- paste("Neural network grid model", j)
} else if (startsWith(name, "GBM_model")) {
for (j in 1:length(gbm_grid)) {
if (name %in% gbm_grid[[j]]) break
}
df$names[i] <- paste("GBM grid model", j)
} else if (startsWith(name, "XGBoost_model")) {
for (j in 1:length(xgboost_grid)) {
if (name %in% xgboost_grid[[j]]) break
}
df$names[i] <- paste("XGBoost grid model", j)
} else {
df$names[i] <- paste(strsplit(name, "_")[[1]], collapse = " model ")
}
}
rownames(df) <- df$names
df <- merge(df, data["R^2 mean"], by = "row.names")[, -1]
df <- df[df$standardized_coefficients > 0, ]
p1 <- ggplot(df, aes(x = reorder(names, standardized_coefficients), y = standardized_coefficients, fill = `R^2 mean`)) +
geom_col(color = "black", size = 0.2) +
scale_fill_gradient(low = "white", high = "red", n.breaks = 4, limits = c(0.30, 0.60), breaks = c(0.3, 0.4, 0.5, 0.6)) +
geom_text(aes(label = sprintf("%.2f", `R^2 mean`)), nudge_y = -0.002, size = 3, color = "white") +
geom_text(aes(label = sprintf("%.3f", standardized_coefficients)), nudge_y = 0.0025, size = 3) +
coord_flip() +
scale_y_continuous(limits = c(0, max(df$standardized_coefficients) + 0.005), expand = c(0.01, 0)) +
theme_bw() +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
strip.background = element_blank(),
panel.border = element_blank(),
axis.line = element_line(color = "black"),
legend.text = element_text(colour = "black", size = 10),
plot.title = element_text(hjust = 0.5, size = 18),
plot.margin = ggplot2::margin(10, 10, 10, 0, "pt"),
axis.title.x = element_text(colour = "black", size = 18),
axis.title.y = element_text(colour = "black", size = 18),
axis.text.x = element_text(colour = "black", size = 12),
axis.text.y = element_text(colour = "black", size = 12),
legend.title = element_text(size = 14),
legend.position = c(0.85, 0.5)
) +
guides(fill = guide_colorbar(title = expression(italic("R"^2)))) +
xlab("") +
ylab("Coefficient") +
ggtitle("Melanin binding")
3.4 Cell penetration (classification)
3.4.1 Model training
train_cpp_models <- function(train_set, exp_dir, prefix, nfolds = 10, grid_seed = 1) {
tmp <- as.h2o(train_set, destination_frame = prefix)
tmp["category"] <- as.factor(tmp["category"])
y <- "category"
x <- setdiff(names(tmp), y)
res <- as.data.frame(tmp$category)
samp_factors <- as.vector(mean(table(train_set$category)) / table(train_set$category))
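# Per-class sampling factors that balance both classes to the mean class
# size; passed via class_sampling_factors to the balance_classes-enabled
# learners below.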
# -------------------
# base model training
# -------------------
cat("Deep learning grid 1\n")
deeplearning_1 <- h2o.grid(
algorithm = "deeplearning", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, hyper_params = deeplearning_params_1,
stopping_rounds = 3, balance_classes = TRUE, class_sampling_factors = samp_factors,
search_criteria = list(
strategy = "RandomDiscrete",
max_models = 100, seed = grid_seed
),
keep_cross_validation_models = FALSE, fold_assignment = "Modulo", parallelism = 0
)
for (model_id in deeplearning_1@model_ids) {
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
}
cat("GBM grid\n")
gbm <- h2o.grid(
algorithm = "gbm", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, hyper_params = gbm_params, stopping_rounds = 3,
balance_classes = TRUE, class_sampling_factors = samp_factors,
search_criteria = list(strategy = "RandomDiscrete", max_models = 100, seed = grid_seed),
keep_cross_validation_models = FALSE, fold_assignment = "Modulo", parallelism = 0
)
for (model_id in gbm@model_ids) {
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
}
cat("GBM 5 default models\n")
gbm_1 <- h2o.gbm(
model_id = "GBM_1", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
balance_classes = TRUE, class_sampling_factors = samp_factors,
col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8, learn_rate = 0.1,
max_depth = 6, min_rows = 1, min_split_improvement = 1e-5, ntrees = 10000, sample_rate = 0.8,
keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
gbm_2 <- h2o.gbm(
model_id = "GBM_2", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
balance_classes = TRUE, class_sampling_factors = samp_factors,
col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8, learn_rate = 0.1,
max_depth = 7, min_rows = 10, min_split_improvement = 1e-5, ntrees = 10000, sample_rate = 0.8,
keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
gbm_3 <- h2o.gbm(
model_id = "GBM_3", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
balance_classes = TRUE, class_sampling_factors = samp_factors,
col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8, learn_rate = 0.1,
max_depth = 8, min_rows = 10, min_split_improvement = 1e-5, ntrees = 10000, sample_rate = 0.8,
keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
gbm_4 <- h2o.gbm(
model_id = "GBM_4", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
balance_classes = TRUE, class_sampling_factors = samp_factors,
col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8, learn_rate = 0.1,
max_depth = 10, min_rows = 10, min_split_improvement = 1e-5, ntrees = 10000, sample_rate = 0.8,
keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
gbm_5 <- h2o.gbm(
model_id = "GBM_5", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
balance_classes = TRUE, class_sampling_factors = samp_factors,
col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8, learn_rate = 0.1,
max_depth = 15, min_rows = 100, min_split_improvement = 1e-5, ntrees = 10000, sample_rate = 0.8,
keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
for (model_id in paste0("GBM_", 1:5)) {
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
}
cat("XGBoost grid\n")
xgboost <- h2o.grid(
algorithm = "xgboost", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, hyper_params = xgboost_params, stopping_rounds = 3,
search_criteria = list(strategy = "RandomDiscrete", max_models = 100, seed = grid_seed),
keep_cross_validation_models = FALSE, fold_assignment = "Modulo", parallelism = 0
)
for (model_id in xgboost@model_ids) {
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
}
cat("XGBoost 3 default models\n")
xgboost_1 <- h2o.xgboost(
model_id = "XGBoost_1", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
booster = "gbtree", col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8,
max_depth = 10, min_rows = 5, ntrees = 10000, reg_alpha = 0, reg_lambda = 1,
sample_rate = 0.6, keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
xgboost_2 <- h2o.xgboost(
model_id = "XGBoost_2", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
booster = "gbtree", col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8,
max_depth = 20, min_rows = 10, ntrees = 10000, reg_alpha = 0, reg_lambda = 1,
sample_rate = 0.6, keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
xgboost_3 <- h2o.xgboost(
model_id = "XGBoost_3", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
booster = "gbtree", col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8,
max_depth = 5, min_rows = 3, ntrees = 10000, reg_alpha = 0, reg_lambda = 1,
sample_rate = 0.8, keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
for (model_id in paste0("XGBoost_", 1:3)) {
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
}
cat("DRF\n")
drf <- h2o.randomForest(
model_id = "DRF", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, ntrees = 10000,
score_tree_interval = 5, stopping_rounds = 3,
balance_classes = TRUE, class_sampling_factors = samp_factors,
keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
tmp_path <- h2o.saveModel(h2o.getModel("DRF"), path = exp_dir, force = TRUE)
cat("XRT\n")
xrt <- h2o.randomForest(
model_id = "XRT", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, ntrees = 10000, histogram_type = "Random",
score_tree_interval = 5, stopping_rounds = 3,
balance_classes = TRUE, class_sampling_factors = samp_factors,
keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
tmp_path <- h2o.saveModel(h2o.getModel("XRT"), path = exp_dir, force = TRUE)
# -----------------------
# get holdout predictions
# -----------------------
base_models <- as.list(c(
unlist(deeplearning_1@model_ids),
unlist(gbm@model_ids), paste0("GBM_", 1:5),
unlist(xgboost@model_ids), paste0("XGBoost_", 1:3),
"DRF",
"XRT"
))
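# For classification, keep only the predicted probability of the
# "penetrating" class from each model's holdout-prediction frame.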
for (model_id in base_models) {
res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id))["penetrating"],
col.names = model_id
))
}
# ----------------------
# super learner training
# ----------------------
sl_iter <- 0
cat(paste0("Super learner iteration ", sl_iter, " (", length(base_models), " models)\n"))
sl <- h2o.stackedEnsemble(
x = x, y = y, model_id = paste0("superlearner_iter_", sl_iter),
training_frame = tmp, seed = 1,
base_models = base_models,
metalearner_algorithm = "glm",
metalearner_nfolds = nfolds,
keep_levelone_frame = TRUE,
metalearner_params = list(
standardize = TRUE, keep_cross_validation_predictions = TRUE,
balance_classes = TRUE, class_sampling_factors = samp_factors
)
)
tmp_path <- h2o.saveModel(h2o.getModel(paste0("superlearner_iter_", sl_iter)), path = exp_dir, force = TRUE)
model_id <- paste0("metalearner_glm_superlearner_iter_", sl_iter)
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id))["penetrating"],
col.names = paste0(model_id, "_", length(base_models), "_models")
))
# ----------------------------------
# super learner base model reduction
# ----------------------------------
while (TRUE) {
meta <- h2o.getModel(paste0("metalearner_glm_superlearner_iter_", sl_iter, "_cv_1"))
names <- meta@model$coefficients_table[, "names"]
coeffs <- (meta@model$coefficients_table[, "standardized_coefficients"] > 0)
for (j in 2:nfolds) {
meta <- h2o.getModel(paste0("metalearner_glm_superlearner_iter_", sl_iter, "_cv_", j))
names <- meta@model$coefficients_table[, "names"]
coeffs <- coeffs + (meta@model$coefficients_table[, "standardized_coefficients"] > 0)
}
base_models_ <- as.list(names[coeffs >= ceiling(nfolds / 2) & names != "Intercept"])
if (length(base_models_) == 0) {
cat("No base models passing the threshold\n\n")
break
}
if (sum(base_models %in% base_models_) == length(base_models)) {
cat("No further reduction of base models\n\n")
break
}
sl_iter <- sl_iter + 1
base_models <- base_models_
cat(paste0("Super learner iteration ", sl_iter, " (", length(base_models), " models)\n"))
sl <- h2o.stackedEnsemble(
x = x, y = y, model_id = paste0("superlearner_iter_", sl_iter),
training_frame = tmp, seed = 1,
base_models = base_models,
metalearner_algorithm = "glm",
metalearner_nfolds = nfolds,
keep_levelone_frame = TRUE,
metalearner_params = list(
standardize = TRUE, keep_cross_validation_predictions = TRUE,
balance_classes = TRUE, class_sampling_factors = samp_factors
)
)
tmp_path <- h2o.saveModel(h2o.getModel(paste0("superlearner_iter_", sl_iter)), path = exp_dir, force = TRUE)
model_id <- paste0("metalearner_glm_superlearner_iter_", sl_iter)
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id))["penetrating"],
col.names = paste0(model_id, "_", length(base_models), "_models")
))
}
# -----------------------------------------
# super learner for homogeneous base models
# -----------------------------------------
# DeepLearning
base_models <- deeplearning_1@model_ids
cat(paste0("Super learner deep learning (", length(base_models), " models)\n"))
sl <- h2o.stackedEnsemble(
x = x, y = y, model_id = "superlearner_deeplearning",
training_frame = tmp, seed = 1,
base_models = base_models,
metalearner_algorithm = "glm",
metalearner_nfolds = nfolds,
keep_levelone_frame = TRUE,
metalearner_params = list(
standardize = TRUE, keep_cross_validation_predictions = TRUE,
balance_classes = TRUE, class_sampling_factors = samp_factors
)
)
tmp_path <- h2o.saveModel(h2o.getModel("superlearner_deeplearning"), path = exp_dir, force = TRUE)
model_id <- "metalearner_glm_superlearner_deeplearning"
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id))["penetrating"],
col.names = paste0(model_id, "_", length(base_models), "_models")
))
# GBM
base_models <- as.list(c(unlist(gbm@model_ids), paste0("GBM_", 1:5)))
cat(paste0("Super learner GBM (", length(base_models), " models)\n"))
sl <- h2o.stackedEnsemble(
x = x, y = y, model_id = "superlearner_gbm",
training_frame = tmp, seed = 1,
base_models = base_models,
metalearner_algorithm = "glm",
metalearner_nfolds = nfolds,
keep_levelone_frame = TRUE,
metalearner_params = list(
standardize = TRUE, keep_cross_validation_predictions = TRUE,
balance_classes = TRUE, class_sampling_factors = samp_factors
)
)
tmp_path <- h2o.saveModel(h2o.getModel("superlearner_gbm"), path = exp_dir, force = TRUE)
model_id <- "metalearner_glm_superlearner_gbm"
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id))["penetrating"],
col.names = paste0(model_id, "_", length(base_models), "_models")
))
# XGBoost
base_models <- as.list(c(unlist(xgboost@model_ids), paste0("XGBoost_", 1:3)))
cat(paste0("Super learner XGBoost (", length(base_models), " models)\n"))
sl <- h2o.stackedEnsemble(
x = x, y = y, model_id = "superlearner_xgboost",
training_frame = tmp, seed = 1,
base_models = base_models,
metalearner_algorithm = "glm",
metalearner_nfolds = nfolds,
keep_levelone_frame = TRUE,
metalearner_params = list(
standardize = TRUE, keep_cross_validation_predictions = TRUE,
balance_classes = TRUE, class_sampling_factors = samp_factors
)
)
tmp_path <- h2o.saveModel(h2o.getModel("superlearner_xgboost"), path = exp_dir, force = TRUE)
model_id <- "metalearner_glm_superlearner_xgboost"
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id))["penetrating"],
col.names = paste0(model_id, "_", length(base_models), "_models")
))
write.csv(res, file = paste0(exp_dir, "/cv_holdout_predictions.csv"), row.names = FALSE)
cat("\n\n")
h2o.removeAll()
}
3.4.2 Inner cross-validation
load(file = "./rdata/var_reduct_cpp_train_test_splits.RData")
for (i in 1:10) {
cat(paste0("Outer training set ", i, "\n"))
prefix <- paste0("outer_", i)
exp_dir <- paste0("/Users/renee/Downloads/cell_penetration/", prefix)
dir.create(exp_dir)
train_cpp_models(train_set = outer_splits[[i]][["train"]], exp_dir = exp_dir, prefix = prefix)
}
3.4.3 Training on whole data set
load(file = "./rdata/var_reduct_cpp_train_test_splits.RData")
prefix <- "whole_data_set"
exp_dir <- paste0("/Users/renee/Downloads/cell_penetration/", prefix)
dir.create(exp_dir)
train_cpp_models(train_set = cpp_data, exp_dir = exp_dir, prefix = prefix)
# Keep track of grid models
dl_grid <- list()
gbm_grid <- list()
xgboost_grid <- list()
dl_grid_params <- list()
gbm_grid_params <- list()
xgboost_grid_params <- list()
for (i in 1:11) {
if (i == 11) {
cat(paste0("Whole data set\n"))
prefix <- "whole_data_set"
} else {
cat(paste0("Outer training set ", i, "\n"))
prefix <- paste0("outer_", i)
}
dir <- paste0("/Users/renee/Downloads/cell_penetration/", prefix)
files <- list.files(dir)
# Deep learning
dl <- files[str_detect(files, "DeepLearning_model")]
for (m in dl) {
model <- h2o.loadModel(paste0(dir, "/", m))
hs <- sha1(paste(c(
model@allparameters$epsilon,
model@allparameters$hidden,
model@allparameters$hidden_dropout_ratios,
model@allparameters$input_dropout_ratio,
model@allparameters$rho
), collapse = " "))
if (hs %in% names(dl_grid)) {
dl_grid[[hs]] <- c(dl_grid[[hs]], m)
} else {
dl_grid[[hs]] <- c(m)
dl_grid_params <- list.append(
dl_grid_params,
c(
"epsilon" = model@allparameters$epsilon,
"hidden" = paste0("[", paste(model@allparameters$hidden, collapse = ","), "]"),
"hidden_dropout_ratios" = paste0(
"[",
paste(model@allparameters$hidden_dropout_ratios,
collapse = ","
), "]"
),
"input_dropout_ratio" = model@allparameters$input_dropout_ratio,
"rho" = model@allparameters$rho
)
)
}
}
h2o.removeAll()
# GBM
gbm <- files[str_detect(files, "GBM_model")]
for (m in gbm) {
model <- h2o.loadModel(paste0(dir, "/", m))
hs <- sha1(paste(c(
model@allparameters$col_sample_rate,
model@allparameters$col_sample_rate_per_tree,
model@allparameters$max_depth,
model@allparameters$min_rows,
model@allparameters$min_split_improvement,
model@allparameters$sample_rate
), collapse = " "))
if (hs %in% names(gbm_grid)) {
gbm_grid[[hs]] <- c(gbm_grid[[hs]], m)
} else {
gbm_grid[[hs]] <- c(m)
gbm_grid_params <- list.append(
gbm_grid_params,
c(
"col_sample_rate" = model@allparameters$col_sample_rate,
"col_sample_rate_per_tree" = model@allparameters$col_sample_rate_per_tree,
"max_depth" = model@allparameters$max_depth,
"min_rows" = model@allparameters$min_rows,
"min_split_improvement" = model@allparameters$min_split_improvement,
"sample_rate" = model@allparameters$sample_rate
)
)
}
}
h2o.removeAll()
# XGBoost
xgboost <- files[str_detect(files, "XGBoost_model")]
for (m in xgboost) {
model <- h2o.loadModel(paste0(dir, "/", m))
hs <- sha1(paste(c(
model@allparameters$booster,
model@allparameters$col_sample_rate,
model@allparameters$col_sample_rate_per_tree,
model@allparameters$max_depth,
model@allparameters$min_rows,
model@allparameters$reg_alpha,
model@allparameters$reg_lambda,
model@allparameters$sample_rate
), collapse = " "))
if (hs %in% names(xgboost_grid)) {
xgboost_grid[[hs]] <- c(xgboost_grid[[hs]], m)
} else {
xgboost_grid[[hs]] <- c(m)
xgboost_grid_params <- list.append(
xgboost_grid_params,
c(
"booster" = model@allparameters$booster,
"col_sample_rate" = model@allparameters$col_sample_rate,
"col_sample_rate_per_tree" = model@allparameters$col_sample_rate_per_tree,
"max_depth" = model@allparameters$max_depth,
"min_rows" = model@allparameters$min_rows,
"reg_alpha" = model@allparameters$reg_alpha,
"reg_lambda" = model@allparameters$reg_lambda,
"sample_rate" = model@allparameters$sample_rate
)
)
}
}
h2o.removeAll()
}
dl_grid_params <- as.data.frame(t(data.frame(dl_grid_params)))
rownames(dl_grid_params) <- paste("Neural network grid model", 1:nrow(dl_grid_params))
gbm_grid_params <- as.data.frame(t(data.frame(gbm_grid_params)))
rownames(gbm_grid_params) <- paste("GBM grid model", 1:nrow(gbm_grid_params))
xgboost_grid_params <- as.data.frame(t(data.frame(xgboost_grid_params)))
rownames(xgboost_grid_params) <- paste("XGBoost grid model", 1:nrow(xgboost_grid_params))
write.csv(dl_grid_params, "./other_data/cell_penetration/neural_network_grid_params.csv")
write.csv(gbm_grid_params, "./other_data/cell_penetration/gbm_grid_params.csv")
write.csv(xgboost_grid_params, "./other_data/cell_penetration/xgboost_grid_params.csv")
save(dl_grid, gbm_grid, xgboost_grid, file = "./rdata/model_training_grid_models_cpp.RData")
3.4.4 Model evaluation
model_evaluation <- function(holdout_pred, fold_assign, grid_name, grid_meta = NULL) {
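# Per-fold classification metrics (log loss, MCC, F1, balanced accuracy,
# top-5% enrichment factor, BEDROC with alpha = 20) computed from the saved
# CV holdout predictions, thresholding probabilities at 0.5 for class labels.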
res_ <- list()
model_num <- c()
if (startsWith(grid_name, "Super learner")) {
if (grid_name == "Super learner all models") {
m <- colnames(holdout_pred)[startsWith(colnames(holdout_pred), "metalearner_glm_superlearner_iter_0")]
} else if (grid_name == "Super learner final") {
sl_models <- colnames(holdout_pred)[startsWith(colnames(holdout_pred), "metalearner_glm_superlearner_iter")]
m <- sl_models[length(sl_models)]
} else if (grid_name == "Super learner neural network") {
m <- colnames(holdout_pred)[startsWith(colnames(holdout_pred), "metalearner_glm_superlearner_deeplearning")]
} else if (grid_name == "Super learner GBM") {
m <- colnames(holdout_pred)[startsWith(colnames(holdout_pred), "metalearner_glm_superlearner_gbm")]
} else if (grid_name == "Super learner XGBoost") {
m <- colnames(holdout_pred)[startsWith(colnames(holdout_pred), "metalearner_glm_superlearner_xgboost")]
}
logloss <- c()
MCC <- c()
F1 <- c()
acc <- c()
EF <- c()
BEDROC <- c()
threshold <- 0.5
for (k in 1:10) {
y_true <- sapply(holdout_pred[fold_assign == k, "category"], function(x) if (x == "penetrating") 1 else 0)
y_pred <- holdout_pred[fold_assign == k, m]
y_pred_cls <- sapply(y_pred, function(x) if (x >= threshold) 1 else 0)
logloss <- c(logloss, LogLoss(y_pred = y_pred, y_true = y_true))
MCC <- c(MCC, mcc(preds = y_pred_cls, actuals = y_true))
F1 <- c(F1, F1_Score(y_pred = y_pred_cls, y_true = y_true))
acc <- c(acc, balanced_accuracy(y_pred = y_pred_cls, y_true = y_true))
EF <- c(EF, enrichment_factor(x = y_pred, y = y_true, top = 0.05))
BEDROC <- c(BEDROC, bedroc(x = y_pred, y = y_true, alpha = 20))
}
res_ <- list.append(res_, c(
mean(logloss, na.rm = TRUE), sem(logloss),
mean(MCC, na.rm = TRUE), sem(MCC),
mean(F1, na.rm = TRUE), sem(F1),
mean(acc, na.rm = TRUE), sem(acc),
mean(EF, na.rm = TRUE), sem(EF),
mean(BEDROC, na.rm = TRUE), sem(BEDROC),
logloss, MCC, F1, acc, EF, BEDROC
))
} else {
for (j in 1:length(grid_meta)) {
g <- grid_meta[[j]]
m <- intersect(colnames(holdout_pred), g)
if (length(m) == 1) {
model_num <- c(model_num, j)
logloss <- c()
MCC <- c()
F1 <- c()
acc <- c()
EF <- c()
BEDROC <- c()
threshold <- 0.5
for (k in 1:10) {
y_true <- sapply(holdout_pred[fold_assign == k, "category"], function(x) if (x == "penetrating") 1 else 0)
y_pred <- holdout_pred[fold_assign == k, m]
y_pred_cls <- sapply(y_pred, function(x) if (x >= threshold) 1 else 0)
logloss <- c(logloss, LogLoss(y_pred = y_pred, y_true = y_true))
MCC <- c(MCC, mcc(preds = y_pred_cls, actuals = y_true))
F1 <- c(F1, F1_Score(y_pred = y_pred_cls, y_true = y_true))
acc <- c(acc, balanced_accuracy(y_pred = y_pred_cls, y_true = y_true))
EF <- c(EF, enrichment_factor(x = y_pred, y = y_true, top = 0.05))
BEDROC <- c(BEDROC, bedroc(x = y_pred, y = y_true, alpha = 20))
}
res_ <- list.append(res_, c(
mean(logloss, na.rm = TRUE), sem(logloss),
mean(MCC, na.rm = TRUE), sem(MCC),
mean(F1, na.rm = TRUE), sem(F1),
mean(acc, na.rm = TRUE), sem(acc),
mean(EF, na.rm = TRUE), sem(EF),
mean(BEDROC, na.rm = TRUE), sem(BEDROC),
logloss, MCC, F1, acc, EF, BEDROC
))
}
}
}
res <- as.data.frame(t(data.frame(res_)))
colnames(res) <- c(
"Log loss mean", "Log loss s.e.m.", "MCC mean", "MCC s.e.m.", "F_1 mean", "F_1 s.e.m.",
"Balanced accuracy mean", "Balanced accuracy s.e.m.", "EF mean", "EF s.e.m.", "BEDROC mean", "BEDROC s.e.m.",
paste("Log loss CV", 1:10), paste("MCC CV", 1:10), paste("F_1 CV", 1:10),
paste("Balanced accuracy CV", 1:10), paste("EF CV", 1:10), paste("BEDROC CV", 1:10)
)
if (nrow(res) == 1) {
rownames(res) <- grid_name
} else {
rownames(res) <- paste(grid_name, model_num)
}
return(res)
}

3.4.5 Inner loop model selection
load(file = "./rdata/model_training_grid_models_cpp.RData")
for (i in 1:10) {
holdout_pred <- read.csv(paste0("./other_data/cell_penetration/outer_", i, "/cv_holdout_predictions.csv"))
fold_assign <- rep(1:10, ceiling(nrow(holdout_pred) / 10))[1:nrow(holdout_pred)]
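# This reconstructs the fold_assignment = "Modulo" scheme used during
# training: row i (1-based) belongs to fold ((i - 1) %% 10) + 1, e.g.
# rep(1:10, ceiling(23 / 10))[1:23] equals ((seq_len(23) - 1) %% 10) + 1.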
data_ <- list()
# Deep learning grid models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
grid_name = "Neural network grid model",
grid_meta = dl_grid
))
# GBM grid models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
grid_name = "GBM grid model",
grid_meta = gbm_grid
))
# GBM default models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
grid_name = "GBM model",
grid_meta = as.list(paste0("GBM_", 1:5))
))
# XGBoost grid models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
grid_name = "XGBoost grid model",
grid_meta = xgboost_grid
))
# XGBoost default models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
grid_name = "XGBoost model",
grid_meta = as.list(paste0("XGBoost_", 1:3))
))
# DRF
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "DRF model", grid_meta = list("DRF")))
# XRT
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "XRT model", grid_meta = list("XRT")))
# Super learner all
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner all models"))
# Super learner final
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner final"))
# Super learner deep learning
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner neural network"))
# Super learner GBM
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner GBM"))
# Super learner XGBoost
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner XGBoost"))
data <- do.call(rbind, data_)
write.csv(data, paste0("./other_data/cell_penetration/outer_", i, "/cv_res.csv"))
}

3.4.6 Final evaluation
load(file = "./rdata/var_reduct_cpp_train_test_splits.RData")
load(file = "./rdata/model_training_grid_models_cpp.RData")
dir <- "/Users/renee/Downloads/cell_penetration"
selected_models <- c(
"GBM grid model 30", "Neural network grid model 98", "Neural network grid model 72",
"GBM grid model 1", "GBM grid model 45", "GBM grid model 82", "GBM grid model 72",
"GBM grid model 15", "GBM grid model 74", "GBM grid model 24"
)
logloss <- c()
MCC <- c()
F1 <- c()
acc <- c()
EF <- c()
BEDROC <- c()
threshold <- 0.5
for (i in 1:10) {
holdout_pred <- read.csv(paste0("./other_data/cell_penetration/outer_", i, "/cv_holdout_predictions.csv"))
if (startsWith(selected_models[i], "Neural network grid model")) {
grid_num <- as.integer(str_extract(selected_models[i], "[0-9]+"))
m <- intersect(colnames(holdout_pred), dl_grid[[grid_num]])
model <- h2o.loadModel(paste0(dir, "/outer_", i, "/", m))
} else if (startsWith(selected_models[i], "GBM grid model")) {
grid_num <- as.integer(str_extract(selected_models[i], "[0-9]+"))
m <- intersect(colnames(holdout_pred), gbm_grid[[grid_num]])
model <- h2o.loadModel(paste0(dir, "/outer_", i, "/", m))
} else if (startsWith(selected_models[i], "GBM model")) {
grid_num <- as.integer(str_extract(selected_models[i], "[0-9]+"))
model <- h2o.loadModel(paste0(dir, "/outer_", i, "/GBM_", grid_num))
} else if (startsWith(selected_models[i], "XGBoost grid model")) {
grid_num <- as.integer(str_extract(selected_models[i], "[0-9]+"))
m <- intersect(colnames(holdout_pred), xgboost_grid[[grid_num]])
model <- h2o.loadModel(paste0(dir, "/outer_", i, "/", m))
} else if (startsWith(selected_models[i], "XGBoost model")) {
grid_num <- as.integer(str_extract(selected_models[i], "[0-9]+"))
model <- h2o.loadModel(paste0(dir, "/outer_", i, "/XGBoost_", grid_num))
} else if (selected_models[i] == "Super learner all models") {
model <- h2o.loadModel(paste0(dir, "/outer_", i, "/superlearner_iter_0"))
} else if (selected_models[i] == "Super learner final") {
files <- list.files(paste0(dir, "/outer_", i))
files <- files[startsWith(files, "superlearner_iter")]
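# Note: list.files() sorts lexicographically, so files[length(files)] is the
# latest iteration only while iteration numbers stay single-digit
# ("superlearner_iter_10" would sort before "superlearner_iter_2").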
model <- h2o.loadModel(paste0(dir, "/outer_", i, "/", files[length(files)]))
} else if (selected_models[i] == "Super learner neural network") {
model <- h2o.loadModel(paste0(dir, "/outer_", i, "/superlearner_deeplearning"))
} else if (selected_models[i] == "Super learner GBM") {
model <- h2o.loadModel(paste0(dir, "/outer_", i, "/superlearner_gbm"))
} else if (selected_models[i] == "Super learner XGBoost") {
model <- h2o.loadModel(paste0(dir, "/outer_", i, "/superlearner_xgboost"))
} else {
model <- h2o.loadModel(paste0(dir, "/outer_", i, "/", unlist(strsplit(selected_models[i], " "))[1]))
}
tmp <- as.h2o(subset(outer_splits[[i]][["test"]], select = -category))
y_true <- sapply(outer_splits[[i]][["test"]]$category, function(x) if (x == "penetrating") 1 else 0)
y_pred <- as.data.frame(h2o.predict(model, tmp))[, "penetrating"]
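# h2o.predict() returns a "predict" column plus one probability column per
# class label; "penetrating" selects the positive-class probability.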
y_pred_cls <- sapply(y_pred, function(x) if (x >= threshold) 1 else 0)
logloss <- c(logloss, LogLoss(y_pred = y_pred, y_true = y_true))
MCC <- c(MCC, mcc(preds = y_pred_cls, actuals = y_true))
F1 <- c(F1, F1_Score(y_pred = y_pred_cls, y_true = y_true))
acc <- c(acc, balanced_accuracy(y_pred = y_pred_cls, y_true = y_true))
EF <- c(EF, enrichment_factor(x = y_pred, y = y_true, top = 0.05))
BEDROC <- c(BEDROC, bedroc(x = y_pred, y = y_true, alpha = 20))
}
save(logloss, MCC, F1, acc, EF, BEDROC, file = "./rdata/final_evaluation_cpp.RData")

load(file = "./rdata/final_evaluation_cpp.RData")
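# A minimal sketch (not part of the original analysis): a t-based 95%
# confidence interval for a metric across the 10 outer test sets, e.g.
# balanced accuracy, using the sem() helper defined above.
acc_ci <- mean(acc) + qt(c(0.025, 0.975), df = length(acc) - 1) * sem(acc)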
data.frame(
Metric = c("Log loss", "MCC", "F<sub>1</sub>", "Balanced accuracy", "EF", "BEDROC"),
`Mean ± s.e.m.` = c(
paste0(round(mean(logloss), 3), " ± ", round(sem(logloss), 3)),
paste0(round(mean(MCC), 3), " ± ", round(sem(MCC), 3)),
paste0(round(mean(F1), 3), " ± ", round(sem(F1), 3)),
paste0(round(mean(acc), 3), " ± ", round(sem(acc), 3)),
paste0(round(mean(EF), 3), " ± ", round(sem(EF), 3)),
paste0(round(mean(BEDROC), 3), " ± ", round(sem(BEDROC), 3))
),
check.names = FALSE
) %>%
datatable(escape = FALSE, rownames = FALSE)

3.4.7 Final model selection
load(file = "./rdata/model_training_grid_models_cpp.RData")
holdout_pred <- read.csv("./other_data/cell_penetration/whole_data_set/cv_holdout_predictions.csv")
fold_assign <- rep(1:10, ceiling(nrow(holdout_pred) / 10))[1:nrow(holdout_pred)]
data_ <- list()
# Deep learning grid models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
grid_name = "Neural network grid model",
grid_meta = dl_grid
))
# GBM grid models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
grid_name = "GBM grid model",
grid_meta = gbm_grid
))
# GBM default models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
grid_name = "GBM model",
grid_meta = as.list(paste0("GBM_", 1:5))
))
# XGBoost grid models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
grid_name = "XGBoost grid model",
grid_meta = xgboost_grid
))
# XGBoost default models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
grid_name = "XGBoost model",
grid_meta = as.list(paste0("XGBoost_", 1:3))
))
# DRF
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "DRF model", grid_meta = list("DRF")))
# XRT
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "XRT model", grid_meta = list("XRT")))
# Super learner all
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner all models"))
# Super learner final
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner final"))
# Super learner deep learning
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner neural network"))
# Super learner GBM
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner GBM"))
# Super learner XGBoost
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner XGBoost"))
data <- do.call(rbind, data_)
write.csv(data, "./other_data/cell_penetration/whole_data_set/cv_res.csv")data <- read.csv("./other_data/cell_penetration/whole_data_set/cv_res.csv", row.names = 1, check.names = FALSE)
statistical_testing(data,
metric_vec = c("Log loss", "MCC", "F_1", "Balanced accuracy"),
decreasing_vec = c(FALSE, TRUE, TRUE, TRUE)
)

# statistical_testing(data, metric_vec=c('Log loss', 'MCC', 'F_1', 'Balanced accuracy'),
#   decreasing_vec=c(FALSE, TRUE, TRUE, TRUE),
#   output_path='./other_data/cell_penetration/whole_data_set/cv_res_statistical_testing.csv')

3.4.8 Final reduced SL
h2o.init(nthreads = -1)
model_path <- "/Users/renee/Downloads/cell_penetration/whole_data_set/superlearner_iter_1"
model <- h2o.loadModel(model_path)
load(file = "./rdata/model_training_grid_models_cpp.RData")
data <- read.csv("./other_data/cell_penetration/whole_data_set/cv_res.csv", row.names = 1, check.names = FALSE)
df <- as.data.frame(model@model$metalearner_model@model$coefficients_table)[c("names", "standardized_coefficients")][-1, ]
for (i in 1:nrow(df)) {
name <- df$names[i]
if (startsWith(name, "DeepLearning_model")) {
for (j in 1:length(dl_grid)) {
if (name %in% dl_grid[[j]]) break
}
df$names[i] <- paste("Neural network grid model", j)
} else if (startsWith(name, "GBM_model")) {
for (j in 1:length(gbm_grid)) {
if (name %in% gbm_grid[[j]]) break
}
df$names[i] <- paste("GBM grid model", j)
} else if (startsWith(name, "XGBoost_model")) {
for (j in 1:length(xgboost_grid)) {
if (name %in% xgboost_grid[[j]]) break
}
df$names[i] <- paste("XGBoost grid model", j)
} else if (startsWith(name, "DRF")) {
df$names[i] <- "DRF model"
} else if (startsWith(name, "XRT")) {
df$names[i] <- "XRT model"
} else {
df$names[i] <- paste(strsplit(name, "_")[[1]], collapse = " model ")
}
}
rownames(df) <- df$names
df <- merge(df, data["Balanced accuracy mean"], by = "row.names")[, -1]
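# Keep only base models the metalearner actually uses, i.e. those with a
# positive standardized coefficient.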
df <- df[df$standardized_coefficients > 0, ]

p2 <- ggplot(df, aes(x = reorder(names, standardized_coefficients), y = standardized_coefficients, fill = `Balanced accuracy mean`)) +
geom_col(color = "black", size = 0.2) +
scale_fill_gradient(low = "white", high = "red", n.breaks = 4, limits = c(0.85, 1.00)) +
geom_text(aes(label = sprintf("%.2f", `Balanced accuracy mean`)), nudge_y = -0.0025, size = 3, color = "white") +
geom_text(aes(label = sprintf("%.3f", standardized_coefficients)), nudge_y = 0.0035, size = 3) +
coord_flip() +
scale_y_continuous(limits = c(0, max(df$standardized_coefficients) + 0.006), expand = c(0.01, 0)) +
theme_bw() +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
strip.background = element_blank(),
panel.border = element_blank(),
axis.line = element_line(color = "black"),
legend.text = element_text(colour = "black", size = 10),
plot.title = element_text(hjust = 0.5, size = 18),
plot.margin = ggplot2::margin(10, 10, 10, 0, "pt"),
axis.title.x = element_text(colour = "black", size = 18),
axis.title.y = element_text(colour = "black", size = 18),
axis.text.x = element_text(colour = "black", size = 12),
axis.text.y = element_text(colour = "black", size = 12),
legend.title = element_text(size = 14),
legend.position = c(0.85, 0.5)
) +
guides(fill = guide_colorbar(title = "Balanced accuracy")) +
xlab("") +
ylab("Coefficient") +
ggtitle("Cell-penetration")
3.5 Toxicity (classification)
3.5.1 Model training
train_tx_models <- function(train_set, exp_dir, prefix, nfolds = 10, grid_seed = 1) {
tmp <- as.h2o(train_set, destination_frame = prefix)
tmp["category"] <- as.factor(tmp["category"])
y <- "category"
x <- setdiff(names(tmp), y)
res <- as.data.frame(tmp$category)
samp_factors <- as.vector(mean(table(train_set$category)) / table(train_set$category))
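# Toy example: with class counts of 300 and 100, mean(table(...)) is 200 and
# samp_factors = c(200/300, 200/100) ~ c(0.67, 2.0), so balance_classes
# undersamples the majority class and oversamples the minority class toward
# equal representation.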
# -------------------
# base model training
# -------------------
cat("Deep learning grid 1\n")
deeplearning_1 <- h2o.grid(
algorithm = "deeplearning", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, hyper_params = deeplearning_params_1,
stopping_rounds = 3, balance_classes = TRUE, class_sampling_factors = samp_factors,
max_after_balance_size = 0.5,
search_criteria = list(
strategy = "RandomDiscrete",
max_models = 100, seed = grid_seed
),
keep_cross_validation_models = FALSE, fold_assignment = "Modulo", parallelism = 0
)
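# RandomDiscrete search draws up to max_models = 100 configurations at random
# from the hyper-parameter grid instead of enumerating it exhaustively.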
for (model_id in deeplearning_1@model_ids) {
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
}
cat("GBM grid\n")
gbm <- h2o.grid(
algorithm = "gbm", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, hyper_params = gbm_params, stopping_rounds = 3,
balance_classes = TRUE, class_sampling_factors = samp_factors, max_after_balance_size = 0.5,
search_criteria = list(strategy = "RandomDiscrete", max_models = 100, seed = grid_seed),
keep_cross_validation_models = FALSE, fold_assignment = "Modulo", parallelism = 0
)
for (model_id in gbm@model_ids) {
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
}
cat("GBM 5 default models\n")
gbm_1 <- h2o.gbm(
model_id = "GBM_1", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
balance_classes = TRUE, class_sampling_factors = samp_factors, max_after_balance_size = 0.5,
col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8, learn_rate = 0.1,
max_depth = 6, min_rows = 1, min_split_improvement = 1e-5, ntrees = 10000, sample_rate = 0.8,
keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
gbm_2 <- h2o.gbm(
model_id = "GBM_2", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
balance_classes = TRUE, class_sampling_factors = samp_factors, max_after_balance_size = 0.5,
col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8, learn_rate = 0.1,
max_depth = 7, min_rows = 10, min_split_improvement = 1e-5, ntrees = 10000, sample_rate = 0.8,
keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
gbm_3 <- h2o.gbm(
model_id = "GBM_3", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
balance_classes = TRUE, class_sampling_factors = samp_factors, max_after_balance_size = 0.5,
col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8, learn_rate = 0.1,
max_depth = 8, min_rows = 10, min_split_improvement = 1e-5, ntrees = 10000, sample_rate = 0.8,
keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
gbm_4 <- h2o.gbm(
model_id = "GBM_4", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
balance_classes = TRUE, class_sampling_factors = samp_factors, max_after_balance_size = 0.5,
col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8, learn_rate = 0.1,
max_depth = 10, min_rows = 10, min_split_improvement = 1e-5, ntrees = 10000, sample_rate = 0.8,
keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
gbm_5 <- h2o.gbm(
model_id = "GBM_5", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
balance_classes = TRUE, class_sampling_factors = samp_factors, max_after_balance_size = 0.5,
col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8, learn_rate = 0.1,
max_depth = 15, min_rows = 100, min_split_improvement = 1e-5, ntrees = 10000, sample_rate = 0.8,
keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
for (model_id in paste0("GBM_", 1:5)) {
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
}
cat("XGBoost grid\n")
xgboost <- h2o.grid(
algorithm = "xgboost", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, hyper_params = xgboost_params, stopping_rounds = 3,
search_criteria = list(strategy = "RandomDiscrete", max_models = 100, seed = grid_seed),
keep_cross_validation_models = FALSE, fold_assignment = "Modulo", parallelism = 0
)
for (model_id in xgboost@model_ids) {
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
}
cat("XGBoost 3 default models\n")
xgboost_1 <- h2o.xgboost(
model_id = "XGBoost_1", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
booster = "gbtree", col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8,
max_depth = 10, min_rows = 5, ntrees = 10000, reg_alpha = 0, reg_lambda = 1,
sample_rate = 0.6, keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
xgboost_2 <- h2o.xgboost(
model_id = "XGBoost_2", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
booster = "gbtree", col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8,
max_depth = 20, min_rows = 10, ntrees = 10000, reg_alpha = 0, reg_lambda = 1,
sample_rate = 0.6, keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
xgboost_3 <- h2o.xgboost(
model_id = "XGBoost_3", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
booster = "gbtree", col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8,
max_depth = 5, min_rows = 3, ntrees = 10000, reg_alpha = 0, reg_lambda = 1,
sample_rate = 0.8, keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
for (model_id in paste0("XGBoost_", 1:3)) {
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
}
cat("GLM\n")
glm <- h2o.glm(
model_id = "GLM", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, alpha = c(0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
balance_classes = TRUE, class_sampling_factors = samp_factors, max_after_balance_size = 0.5,
keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
tmp_path <- h2o.saveModel(h2o.getModel("GLM"), path = exp_dir, force = TRUE)
cat("DRF\n")
drf <- h2o.randomForest(
model_id = "DRF", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, ntrees = 10000,
score_tree_interval = 5, stopping_rounds = 3,
balance_classes = TRUE, class_sampling_factors = samp_factors, max_after_balance_size = 0.5,
keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
tmp_path <- h2o.saveModel(h2o.getModel("DRF"), path = exp_dir, force = TRUE)
cat("XRT\n")
xrt <- h2o.randomForest(
model_id = "XRT", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
keep_cross_validation_predictions = TRUE, ntrees = 10000, histogram_type = "Random",
score_tree_interval = 5, stopping_rounds = 3,
balance_classes = TRUE, class_sampling_factors = samp_factors, max_after_balance_size = 0.5,
keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
)
tmp_path <- h2o.saveModel(h2o.getModel("XRT"), path = exp_dir, force = TRUE)
# -----------------------
# get holdout predictions
# -----------------------
base_models <- as.list(c(
unlist(deeplearning_1@model_ids),
unlist(gbm@model_ids), paste0("GBM_", 1:5),
unlist(xgboost@model_ids), paste0("XGBoost_", 1:3),
"GLM",
"DRF",
"XRT"
))
for (model_id in base_models) {
res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id))[3],
col.names = model_id
))
}
# ----------------------
# super learner training
# ----------------------
sl_iter <- 0
cat(paste0("Super learner iteration ", sl_iter, " (", length(base_models), " models)\n"))
sl <- h2o.stackedEnsemble(
x = x, y = y, model_id = paste0("superlearner_iter_", sl_iter),
training_frame = tmp, seed = 1,
base_models = base_models,
metalearner_algorithm = "glm",
metalearner_nfolds = nfolds,
keep_levelone_frame = TRUE,
metalearner_params = list(
standardize = TRUE, keep_cross_validation_predictions = TRUE,
balance_classes = TRUE, class_sampling_factors = samp_factors,
max_after_balance_size = 0.5
)
)
tmp_path <- h2o.saveModel(h2o.getModel(paste0("superlearner_iter_", sl_iter)), path = exp_dir, force = TRUE)
model_id <- paste0("metalearner_glm_superlearner_iter_", sl_iter)
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id))[3],
col.names = paste0(model_id, "_", length(base_models), "_models")
))
# ----------------------------------
# super learner base model reduction
# ----------------------------------
while (TRUE) {
meta <- h2o.getModel(paste0("metalearner_glm_superlearner_iter_", sl_iter, "_cv_1"))
names <- meta@model$coefficients_table[, "names"]
coeffs <- (meta@model$coefficients_table[, "standardized_coefficients"] > 0)
for (j in 2:nfolds) {
meta <- h2o.getModel(paste0("metalearner_glm_superlearner_iter_", sl_iter, "_cv_", j))
names <- meta@model$coefficients_table[, "names"]
coeffs <- coeffs + (meta@model$coefficients_table[, "standardized_coefficients"] > 0)
}
base_models_ <- as.list(names[coeffs >= ceiling(nfolds / 2) & names != "Intercept"])
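# Majority vote across the CV metalearners: a base model is retained only if
# its standardized coefficient is positive in at least ceiling(nfolds / 2) of
# the nfolds metalearner fits.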
if (length(base_models_) == 0) {
cat("No base models passing the threshold\n\n")
break
}
if (sum(base_models %in% base_models_) == length(base_models)) {
cat("No further reduction of base models\n\n")
break
}
sl_iter <- sl_iter + 1
base_models <- base_models_
cat(paste0("Super learner iteration ", sl_iter, " (", length(base_models), " models)\n"))
sl <- h2o.stackedEnsemble(
x = x, y = y, model_id = paste0("superlearner_iter_", sl_iter),
training_frame = tmp, seed = 1,
base_models = base_models,
metalearner_algorithm = "glm",
metalearner_nfolds = nfolds,
keep_levelone_frame = TRUE,
metalearner_params = list(
standardize = TRUE, keep_cross_validation_predictions = TRUE,
balance_classes = TRUE, class_sampling_factors = samp_factors,
max_after_balance_size = 0.5
)
)
tmp_path <- h2o.saveModel(h2o.getModel(paste0("superlearner_iter_", sl_iter)), path = exp_dir, force = TRUE)
model_id <- paste0("metalearner_glm_superlearner_iter_", sl_iter)
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id))[3],
col.names = paste0(model_id, "_", length(base_models), "_models")
))
}
# -----------------------------------------
# super learner for homogeneous base models
# -----------------------------------------
# DeepLearning
base_models <- deeplearning_1@model_ids
cat(paste0("Super learner deep learning (", length(base_models), " models)\n"))
sl <- h2o.stackedEnsemble(
x = x, y = y, model_id = "superlearner_deeplearning",
training_frame = tmp, seed = 1,
base_models = base_models,
metalearner_algorithm = "glm",
metalearner_nfolds = nfolds,
keep_levelone_frame = TRUE,
metalearner_params = list(
standardize = TRUE, keep_cross_validation_predictions = TRUE,
balance_classes = TRUE, class_sampling_factors = samp_factors,
max_after_balance_size = 0.5
)
)
tmp_path <- h2o.saveModel(h2o.getModel("superlearner_deeplearning"), path = exp_dir, force = TRUE)
model_id <- "metalearner_glm_superlearner_deeplearning"
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id))[3],
col.names = paste0(model_id, "_", length(base_models), "_models")
))
# GBM
base_models <- as.list(c(unlist(gbm@model_ids), paste0("GBM_", 1:5)))
cat(paste0("Super learner GBM (", length(base_models), " models)\n"))
sl <- h2o.stackedEnsemble(
x = x, y = y, model_id = "superlearner_gbm",
training_frame = tmp, seed = 1,
base_models = base_models,
metalearner_algorithm = "glm",
metalearner_nfolds = nfolds,
keep_levelone_frame = TRUE,
metalearner_params = list(
standardize = TRUE, keep_cross_validation_predictions = TRUE,
balance_classes = TRUE, class_sampling_factors = samp_factors,
max_after_balance_size = 0.5
)
)
tmp_path <- h2o.saveModel(h2o.getModel("superlearner_gbm"), path = exp_dir, force = TRUE)
model_id <- "metalearner_glm_superlearner_gbm"
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id))[3],
col.names = paste0(model_id, "_", length(base_models), "_models")
))
# XGBoost
base_models <- as.list(c(unlist(xgboost@model_ids), paste0("XGBoost_", 1:3)))
cat(paste0("Super learner XGBoost (", length(base_models), " models)\n"))
sl <- h2o.stackedEnsemble(
x = x, y = y, model_id = "superlearner_xgboost",
training_frame = tmp, seed = 1,
base_models = base_models,
metalearner_algorithm = "glm",
metalearner_nfolds = nfolds,
keep_levelone_frame = TRUE,
metalearner_params = list(
standardize = TRUE, keep_cross_validation_predictions = TRUE,
balance_classes = TRUE, class_sampling_factors = samp_factors,
max_after_balance_size = 0.5
)
)
tmp_path <- h2o.saveModel(h2o.getModel("superlearner_xgboost"), path = exp_dir, force = TRUE)
model_id <- "metalearner_glm_superlearner_xgboost"
tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id))[3],
col.names = paste0(model_id, "_", length(base_models), "_models")
))
write.csv(res, file = paste0(exp_dir, "/cv_holdout_predictions.csv"), row.names = FALSE)
cat("\n\n")
h2o.removeAll()
}

3.5.2 Inner cross-validation
load(file = "./rdata/var_reduct_tx_train_test_splits.RData")
for (i in 1:10) {
cat(paste0("Outer training set ", i, "\n"))
prefix <- paste0("outer_", i)
exp_dir <- paste0("/Users/renee/Downloads/toxicity/", prefix)
dir.create(exp_dir)
train_tx_models(train_set = outer_splits[[i]][["train"]], exp_dir = exp_dir, prefix = prefix)
}

3.5.3 Training on whole data set
load(file = "./rdata/var_reduct_tx_train_test_splits.RData")
prefix <- "whole_data_set"
exp_dir <- paste0("/Users/renee/Downloads/toxicity/", prefix)
dir.create(exp_dir)
train_tx_models(train_set = tx_data, exp_dir = exp_dir, prefix = prefix)

# Keep track of grid models
dl_grid <- list()
gbm_grid <- list()
xgboost_grid <- list()
dl_grid_params <- list()
gbm_grid_params <- list()
xgboost_grid_params <- list()
for (i in 1:11) {
if (i == 11) {
cat(paste0("Whole data set\n"))
prefix <- "whole_data_set"
} else {
cat(paste0("Outer training set ", i, "\n"))
prefix <- paste0("outer_", i)
}
dir <- paste0("/Users/renee/Downloads/toxicity/", prefix)
files <- list.files(dir)
# Deep learning
dl <- files[str_detect(files, "DeepLearning_model")]
for (m in dl) {
model <- h2o.loadModel(paste0(dir, "/", m))
hs <- sha1(paste(c(
model@allparameters$epsilon,
model@allparameters$hidden,
model@allparameters$hidden_dropout_ratios,
model@allparameters$input_dropout_ratio,
model@allparameters$rho
), collapse = " "))
if (hs %in% names(dl_grid)) {
dl_grid[[hs]] <- c(dl_grid[[hs]], m)
} else {
dl_grid[[hs]] <- c(m)
dl_grid_params <- list.append(
dl_grid_params,
c(
"epsilon" = model@allparameters$epsilon,
"hidden" = paste0("[", paste(model@allparameters$hidden, collapse = ","), "]"),
"hidden_dropout_ratios" = paste0(
"[",
paste(model@allparameters$hidden_dropout_ratios,
collapse = ","
), "]"
),
"input_dropout_ratio" = model@allparameters$input_dropout_ratio,
"rho" = model@allparameters$rho
)
)
}
}
h2o.removeAll()
# GBM
gbm <- files[str_detect(files, "GBM_model")]
for (m in gbm) {
model <- h2o.loadModel(paste0(dir, "/", m))
hs <- sha1(paste(c(
model@allparameters$col_sample_rate,
model@allparameters$col_sample_rate_per_tree,
model@allparameters$max_depth,
model@allparameters$min_rows,
model@allparameters$min_split_improvement,
model@allparameters$sample_rate
), collapse = " "))
if (hs %in% names(gbm_grid)) {
gbm_grid[[hs]] <- c(gbm_grid[[hs]], m)
} else {
gbm_grid[[hs]] <- c(m)
gbm_grid_params <- list.append(
gbm_grid_params,
c(
"col_sample_rate" = model@allparameters$col_sample_rate,
"col_sample_rate_per_tree" = model@allparameters$col_sample_rate_per_tree,
"max_depth" = model@allparameters$max_depth,
"min_rows" = model@allparameters$min_rows,
"min_split_improvement" = model@allparameters$min_split_improvement,
"sample_rate" = model@allparameters$sample_rate
)
)
}
}
h2o.removeAll()
# XGBoost
xgboost <- files[str_detect(files, "XGBoost_model")]
for (m in xgboost) {
model <- h2o.loadModel(paste0(dir, "/", m))
hs <- sha1(paste(c(
model@allparameters$booster,
model@allparameters$col_sample_rate,
model@allparameters$col_sample_rate_per_tree,
model@allparameters$max_depth,
model@allparameters$min_rows,
model@allparameters$reg_alpha,
model@allparameters$reg_lambda,
model@allparameters$sample_rate
), collapse = " "))
if (hs %in% names(xgboost_grid)) {
xgboost_grid[[hs]] <- c(xgboost_grid[[hs]], m)
} else {
xgboost_grid[[hs]] <- c(m)
xgboost_grid_params <- list.append(
xgboost_grid_params,
c(
"booster" = model@allparameters$booster,
"col_sample_rate" = model@allparameters$col_sample_rate,
"col_sample_rate_per_tree" = model@allparameters$col_sample_rate_per_tree,
"max_depth" = model@allparameters$max_depth,
"min_rows" = model@allparameters$min_rows,
"reg_alpha" = model@allparameters$reg_alpha,
"reg_lambda" = model@allparameters$reg_lambda,
"sample_rate" = model@allparameters$sample_rate
)
)
}
}
h2o.removeAll()
}
dl_grid_params <- as.data.frame(t(data.frame(dl_grid_params)))
rownames(dl_grid_params) <- paste("Neural network grid model", 1:nrow(dl_grid_params))
gbm_grid_params <- as.data.frame(t(data.frame(gbm_grid_params)))
rownames(gbm_grid_params) <- paste("GBM grid model", 1:nrow(gbm_grid_params))
xgboost_grid_params <- as.data.frame(t(data.frame(xgboost_grid_params)))
rownames(xgboost_grid_params) <- paste("XGBoost grid model", 1:nrow(xgboost_grid_params))
write.csv(dl_grid_params, "./other_data/toxicity/neural_network_grid_params.csv")
write.csv(gbm_grid_params, "./other_data/toxicity/gbm_grid_params.csv")
write.csv(xgboost_grid_params, "./other_data/toxicity/xgboost_grid_params.csv")
save(dl_grid, gbm_grid, xgboost_grid, file = "./rdata/model_training_grid_models_tx.RData")

3.5.4 Model evaluation
model_evaluation <- function(holdout_pred, fold_assign, grid_name, grid_meta = NULL) {
res_ <- list()
model_num <- c()
if (startsWith(grid_name, "Super learner")) {
if (grid_name == "Super learner all models") {
m <- colnames(holdout_pred)[startsWith(colnames(holdout_pred), "metalearner_glm_superlearner_iter_0")]
} else if (grid_name == "Super learner final") {
sl_models <- colnames(holdout_pred)[startsWith(colnames(holdout_pred), "metalearner_glm_superlearner_iter")]
m <- sl_models[length(sl_models)]
} else if (grid_name == "Super learner neural network") {
m <- colnames(holdout_pred)[startsWith(colnames(holdout_pred), "metalearner_glm_superlearner_deeplearning")]
} else if (grid_name == "Super learner GBM") {
m <- colnames(holdout_pred)[startsWith(colnames(holdout_pred), "metalearner_glm_superlearner_gbm")]
} else if (grid_name == "Super learner XGBoost") {
m <- colnames(holdout_pred)[startsWith(colnames(holdout_pred), "metalearner_glm_superlearner_xgboost")]
}
logloss <- c()
MCC <- c()
F1 <- c()
acc <- c()
EF <- c()
BEDROC <- c()
threshold <- 0.5
for (k in 1:10) {
y_true <- holdout_pred[fold_assign == k, "category"]
y_pred <- holdout_pred[fold_assign == k, m]
y_pred_cls <- sapply(y_pred, function(x) if (x >= threshold) 1 else 0)
logloss <- c(logloss, LogLoss(y_pred = y_pred, y_true = y_true))
MCC <- c(MCC, mcc(preds = y_pred_cls, actuals = y_true))
F1 <- c(F1, F1_Score(y_pred = y_pred_cls, y_true = y_true))
acc <- c(acc, balanced_accuracy(y_pred = y_pred_cls, y_true = y_true))
EF <- c(EF, enrichment_factor(x = y_pred, y = y_true, top = 0.05))
BEDROC <- c(BEDROC, bedroc(x = y_pred, y = y_true, alpha = 20))
}
res_ <- list.append(res_, c(
mean(logloss, na.rm = TRUE), sem(logloss),
mean(MCC, na.rm = TRUE), sem(MCC),
mean(F1, na.rm = TRUE), sem(F1),
mean(acc, na.rm = TRUE), sem(acc),
mean(EF, na.rm = TRUE), sem(EF),
mean(BEDROC, na.rm = TRUE), sem(BEDROC),
logloss, MCC, F1, acc, EF, BEDROC
))
} else {
for (j in 1:length(grid_meta)) {
g <- grid_meta[[j]]
m <- intersect(colnames(holdout_pred), g)
if (length(m) == 1) {
model_num <- c(model_num, j)
logloss <- c()
MCC <- c()
F1 <- c()
acc <- c()
EF <- c()
BEDROC <- c()
threshold <- 0.5
for (k in 1:10) {
y_true <- holdout_pred[fold_assign == k, "category"]
y_pred <- holdout_pred[fold_assign == k, m]
y_pred_cls <- sapply(y_pred, function(x) if (x >= threshold) 1 else 0)
logloss <- c(logloss, LogLoss(y_pred = y_pred, y_true = y_true))
MCC <- c(MCC, mcc(preds = y_pred_cls, actuals = y_true))
F1 <- c(F1, F1_Score(y_pred = y_pred_cls, y_true = y_true))
acc <- c(acc, balanced_accuracy(y_pred = y_pred_cls, y_true = y_true))
EF <- c(EF, enrichment_factor(x = y_pred, y = y_true, top = 0.05))
BEDROC <- c(BEDROC, bedroc(x = y_pred, y = y_true, alpha = 20))
}
res_ <- list.append(res_, c(
mean(logloss, na.rm = TRUE), sem(logloss),
mean(MCC, na.rm = TRUE), sem(MCC),
mean(F1, na.rm = TRUE), sem(F1),
mean(acc, na.rm = TRUE), sem(acc),
mean(EF, na.rm = TRUE), sem(EF),
mean(BEDROC, na.rm = TRUE), sem(BEDROC),
logloss, MCC, F1, acc, EF, BEDROC
))
}
}
}
res <- as.data.frame(t(data.frame(res_)))
colnames(res) <- c(
"Log loss mean", "Log loss s.e.m.", "MCC mean", "MCC s.e.m.", "F_1 mean", "F_1 s.e.m.",
"Balanced accuracy mean", "Balanced accuracy s.e.m.", "EF mean", "EF s.e.m.", "BEDROC mean", "BEDROC s.e.m.",
paste("Log loss CV", 1:10), paste("MCC CV", 1:10), paste("F_1 CV", 1:10),
paste("Balanced accuracy CV", 1:10), paste("EF CV", 1:10), paste("BEDROC CV", 1:10)
)
if (nrow(res) == 1) {
rownames(res) <- grid_name
} else {
rownames(res) <- paste(grid_name, model_num)
}
return(res)
}

3.5.5 Inner loop model selection
load(file = "./rdata/model_training_grid_models_tx.RData")
for (i in 1:10) {
holdout_pred <- read.csv(paste0("./other_data/toxicity/outer_", i, "/cv_holdout_predictions.csv"))
fold_assign <- rep(1:10, ceiling(nrow(holdout_pred) / 10))[1:nrow(holdout_pred)]
data_ <- list()
# Deep learning grid models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
grid_name = "Neural network grid model",
grid_meta = dl_grid
))
# GBM grid models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
grid_name = "GBM grid model",
grid_meta = gbm_grid
))
# GBM default models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
grid_name = "GBM model",
grid_meta = as.list(paste0("GBM_", 1:5))
))
# XGBoost grid models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
grid_name = "XGBoost grid model",
grid_meta = xgboost_grid
))
# XGBoost default models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
grid_name = "XGBoost model",
grid_meta = as.list(paste0("XGBoost_", 1:3))
))
# GLM
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "GLM model", grid_meta = list("GLM")))
# DRF
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "DRF model", grid_meta = list("DRF")))
# XRT
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "XRT model", grid_meta = list("XRT")))
# Super learner all
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner all models"))
# Super learner final
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner final"))
# Super learner deep learning
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner neural network"))
# Super learner GBM
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner GBM"))
# Super learner XGBoost
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner XGBoost"))
data <- do.call(rbind, data_)
write.csv(data, paste0("./other_data/toxicity/outer_", i, "/cv_res.csv"))
}

3.5.6 Final evaluation
load(file = "./rdata/var_reduct_tx_train_test_splits.RData")
load(file = "./rdata/model_training_grid_models_tx.RData")
dir <- "/Users/renee/Downloads/toxicity"
selected_models <- c(
"Super learner XGBoost", "Super learner final", "Super learner final", "Super learner final",
"Super learner final", "Super learner final", "Super learner final", "Super learner final",
"Super learner final", "Super learner final"
)
logloss <- c()
MCC <- c()
F1 <- c()
acc <- c()
EF <- c()
BEDROC <- c()
threshold <- 0.5
for (i in 1:10) {
holdout_pred <- read.csv(paste0("./other_data/toxicity/outer_", i, "/cv_holdout_predictions.csv"))
if (startsWith(selected_models[i], "Neural network grid model")) {
grid_num <- as.integer(str_extract(selected_models[i], "[0-9]+"))
m <- intersect(colnames(holdout_pred), dl_grid[[grid_num]])
model <- h2o.loadModel(paste0(dir, "/outer_", i, "/", m))
} else if (startsWith(selected_models[i], "GBM grid model")) {
grid_num <- as.integer(str_extract(selected_models[i], "[0-9]+"))
m <- intersect(colnames(holdout_pred), gbm_grid[[grid_num]])
model <- h2o.loadModel(paste0(dir, "/outer_", i, "/", m))
} else if (startsWith(selected_models[i], "GBM model")) {
grid_num <- as.integer(str_extract(selected_models[i], "[0-9]+"))
model <- h2o.loadModel(paste0(dir, "/outer_", i, "/GBM_", grid_num))
} else if (startsWith(selected_models[i], "XGBoost grid model")) {
grid_num <- as.integer(str_extract(selected_models[i], "[0-9]+"))
m <- intersect(colnames(holdout_pred), xgboost_grid[[grid_num]])
model <- h2o.loadModel(paste0(dir, "/outer_", i, "/", m))
} else if (startsWith(selected_models[i], "XGBoost model")) {
grid_num <- as.integer(str_extract(selected_models[i], "[0-9]+"))
model <- h2o.loadModel(paste0(dir, "/outer_", i, "/XGBoost_", grid_num))
} else if (selected_models[i] == "Super learner all models") {
model <- h2o.loadModel(paste0(dir, "/outer_", i, "/superlearner_iter_0"))
} else if (selected_models[i] == "Super learner final") {
files <- list.files(paste0(dir, "/outer_", i))
files <- files[startsWith(files, "superlearner_iter")]
model <- h2o.loadModel(paste0(dir, "/outer_", i, "/", files[length(files)]))
} else if (selected_models[i] == "Super learner neural network") {
model <- h2o.loadModel(paste0(dir, "/outer_", i, "/superlearner_deeplearning"))
} else if (selected_models[i] == "Super learner GBM") {
model <- h2o.loadModel(paste0(dir, "/outer_", i, "/superlearner_gbm"))
} else if (selected_models[i] == "Super learner XGBoost") {
model <- h2o.loadModel(paste0(dir, "/outer_", i, "/superlearner_xgboost"))
} else {
model <- h2o.loadModel(paste0(dir, "/outer_", i, "/", unlist(strsplit(selected_models[i], " "))[1]))
}
tmp <- as.h2o(subset(outer_splits[[i]][["test"]], select = -category))
y_true <- as.numeric(outer_splits[[i]][["test"]]$category) - 1
y_pred <- as.data.frame(h2o.predict(model, tmp))[, 3]
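# as.numeric() on a two-level factor gives 1/2 in level order, so the
# subtraction above encodes the first level as 0 and the second as 1;
# column 3 of the prediction frame is the probability of the second level,
# matching that encoding.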
y_pred_cls <- sapply(y_pred, function(x) if (x >= threshold) 1 else 0)
logloss <- c(logloss, LogLoss(y_pred = y_pred, y_true = y_true))
MCC <- c(MCC, mcc(preds = y_pred_cls, actuals = y_true))
F1 <- c(F1, F1_Score(y_pred = y_pred_cls, y_true = y_true))
acc <- c(acc, balanced_accuracy(y_pred = y_pred_cls, y_true = y_true))
EF <- c(EF, enrichment_factor(x = y_pred, y = y_true, top = 0.05))
BEDROC <- c(BEDROC, bedroc(x = y_pred, y = y_true, alpha = 20))
}
save(logloss, MCC, F1, acc, EF, BEDROC, file = "./rdata/final_evaluation_tx.RData")

load(file = "./rdata/final_evaluation_tx.RData")
data.frame(
Metric = c("Log loss", "MCC", "F<sub>1</sub>", "Balanced accuracy", "EF", "BEDROC"),
`Mean ± s.e.m.` = c(
paste0(round(mean(logloss), 3), " ± ", round(sem(logloss), 3)),
paste0(round(mean(MCC), 3), " ± ", round(sem(MCC), 3)),
paste0(round(mean(F1), 3), " ± ", round(sem(F1), 3)),
paste0(round(mean(acc), 3), " ± ", round(sem(acc), 3)),
paste0(round(mean(EF), 3), " ± ", round(sem(EF), 3)),
paste0(round(mean(BEDROC), 3), " ± ", round(sem(BEDROC), 3))
),
check.names = FALSE
) %>%
datatable(escape = FALSE, rownames = FALSE)

3.5.7 Final model selection
load(file = "./rdata/model_training_grid_models_tx.RData")
holdout_pred <- read.csv("./other_data/toxicity/whole_data_set/cv_holdout_predictions.csv")
fold_assign <- rep(1:10, ceiling(nrow(holdout_pred) / 10))[1:nrow(holdout_pred)]
data_ <- list()
# Deep learning grid models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
grid_name = "Neural network grid model",
grid_meta = dl_grid
))
# GBM grid models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
grid_name = "GBM grid model",
grid_meta = gbm_grid
))
# GBM default models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
grid_name = "GBM model",
grid_meta = as.list(paste0("GBM_", 1:5))
))
# XGBoost grid models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
grid_name = "XGBoost grid model",
grid_meta = xgboost_grid
))
# XGBoost default models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
grid_name = "XGBoost model",
grid_meta = as.list(paste0("XGBoost_", 1:3))
))
# GLM
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "GLM model", grid_meta = list("GLM")))
# DRF
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "DRF model", grid_meta = list("DRF")))
# XRT
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "XRT model", grid_meta = list("XRT")))
# Super learner all
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner all models"))
# Super learner final
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner final"))
# Super learner deep learning
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner neural network"))
# Super learner GBM
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner GBM"))
# Super learner XGBoost
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner XGBoost"))
data <- do.call(rbind, data_)
write.csv(data, "./other_data/toxicity/whole_data_set/cv_res.csv")data <- read.csv("./other_data/toxicity/whole_data_set/cv_res.csv", row.names = 1, check.names = FALSE)
statistical_testing(data,
metric_vec = c("Log loss", "MCC", "F_1", "Balanced accuracy"),
decreasing_vec = c(FALSE, TRUE, TRUE, TRUE)
)

# statistical_testing(data, metric_vec=c('Log loss', 'MCC', 'F_1', 'Balanced accuracy'),
#   decreasing_vec=c(FALSE, TRUE, TRUE, TRUE),
#   output_path='./other_data/toxicity/whole_data_set/cv_res_statistical_testing.csv')

3.5.8 Final reduced SL
h2o.init(nthreads = -1)
model_path <- "/Users/renee/Downloads/toxicity/whole_data_set/superlearner_iter_3"
model <- h2o.loadModel(model_path)
load(file = "./rdata/model_training_grid_models_tx.RData")
data <- read.csv("./other_data/toxicity/whole_data_set/cv_res.csv", row.names = 1, check.names = FALSE)
df <- as.data.frame(model@model$metalearner_model@model$coefficients_table)[c("names", "standardized_coefficients")][-1, ]
for (i in 1:nrow(df)) {
name <- df$names[i]
if (startsWith(name, "DeepLearning_model")) {
for (j in 1:length(dl_grid)) {
if (name %in% dl_grid[[j]]) break
}
df$names[i] <- paste("Neural network grid model", j)
} else if (startsWith(name, "GBM_model")) {
for (j in 1:length(gbm_grid)) {
if (name %in% gbm_grid[[j]]) break
}
df$names[i] <- paste("GBM grid model", j)
} else if (startsWith(name, "XGBoost_model")) {
for (j in 1:length(xgboost_grid)) {
if (name %in% xgboost_grid[[j]]) break
}
df$names[i] <- paste("XGBoost grid model", j)
} else {
df$names[i] <- paste(strsplit(name, "_")[[1]], collapse = " model ")
}
}
rownames(df) <- df$names
df <- merge(df, data["Balanced accuracy mean"], by = "row.names")[, -1]
df <- df[df$standardized_coefficients > 0, ]

p3 <- ggplot(df, aes(x = reorder(names, standardized_coefficients), y = standardized_coefficients, fill = `Balanced accuracy mean`)) +
geom_col(color = "black", size = 0.2) +
scale_fill_gradient(low = "white", high = "red", n.breaks = 4, limits = c(0.85, 1.00)) +
geom_text(aes(label = sprintf("%.2f", `Balanced accuracy mean`)), nudge_y = -0.008, size = 3, color = "white") +
geom_text(aes(label = sprintf("%.3f", standardized_coefficients)), nudge_y = 0.01, size = 3) +
coord_flip() +
scale_y_continuous(limits = c(0, max(df$standardized_coefficients) + 0.02), expand = c(0.01, 0)) +
theme_bw() +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
strip.background = element_blank(),
panel.border = element_blank(),
axis.line = element_line(color = "black"),
legend.text = element_text(colour = "black", size = 10),
plot.title = element_text(hjust = 0.5, size = 18),
plot.margin = ggplot2::margin(10, 10, 10, 0, "pt"),
axis.title.x = element_text(colour = "black", size = 18),
axis.title.y = element_text(colour = "black", size = 18),
axis.text.x = element_text(colour = "black", size = 12),
axis.text.y = element_text(colour = "black", size = 12),
legend.title = element_text(size = 14),
legend.position = c(0.85, 0.35)
) +
guides(fill = guide_colorbar(title = "Balanced accuracy")) +
xlab("") +
ylab("Coefficient") +
ggtitle("Non-toxic")
# Close h2o server
# h2o.shutdown()
sessionInfo()

## R version 4.2.2 (2022-10-31)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] reshape2_1.4.4 DT_0.24 digest_0.6.29 stringr_1.4.1
## [5] rlist_0.4.6.2 enrichvs_0.0.5 scales_1.2.1 mltools_0.3.5
## [9] MLmetrics_1.1.1 h2o_3.38.0.2
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.9 highr_0.9 plyr_1.8.7 bslib_0.4.0
## [5] compiler_4.2.2 jquerylib_0.1.4 R.methodsS3_1.8.2 R.utils_2.12.0
## [9] bitops_1.0-7 tools_4.2.2 lifecycle_1.0.1 jsonlite_1.8.0
## [13] evaluate_0.16 R.cache_0.16.0 lattice_0.20-45 rlang_1.0.4
## [17] Matrix_1.5-1 cli_3.3.0 rstudioapi_0.14 crosstalk_1.2.0
## [21] yaml_2.3.5 xfun_0.32 fastmap_1.1.0 styler_1.8.0
## [25] knitr_1.40 vctrs_0.4.1 htmlwidgets_1.5.4 sass_0.4.2
## [29] grid_4.2.2 data.table_1.14.2 R6_2.5.1 rmarkdown_2.16
## [33] bookdown_0.28 purrr_0.3.4 magrittr_2.0.3 codetools_0.2-18
## [37] htmltools_0.5.3 colorspace_2.0-3 stringi_1.7.8 munsell_0.5.0
## [41] RCurl_1.98-1.8 cachem_1.0.6 R.oo_1.25.0
