3 Model training
library(h2o)
library(MLmetrics)
library(mltools)
library(scales)
library(enrichvs)
library(rlist)
library(stringr)
library(digest)
library(DT)
library(reshape2)
library(ggplot2) # used for the coefficient plot in Section 3.3.8
library(dplyr) # provides the `%>%` pipe used below
3.2 General code/functions
# h2o.init(nthreads=-1, max_mem_size='100G', port=54321)
h2o.init(nthreads = -1)
h2o.removeAll()
# DeepLearning Grid 1
deeplearning_params_1 <- list(
  activation = "RectifierWithDropout",
  epochs = 10000, # early stopping
  epsilon = c(1e-6, 1e-7, 1e-8, 1e-9),
  hidden = list(c(50), c(200), c(500)),
  hidden_dropout_ratios = list(c(0.1), c(0.2), c(0.3), c(0.4), c(0.5)),
  input_dropout_ratio = c(0, 0.05, 0.1, 0.15, 0.2),
  rho = c(0.9, 0.95, 0.99)
)
# DeepLearning Grid 2
deeplearning_params_2 <- list(
  activation = "RectifierWithDropout",
  epochs = 10000, # early stopping
  epsilon = c(1e-6, 1e-7, 1e-8, 1e-9),
  hidden = list(c(50, 50), c(200, 200), c(500, 500)),
  hidden_dropout_ratios = list(
    c(0.1, 0.1), c(0.2, 0.2), c(0.3, 0.3),
    c(0.4, 0.4), c(0.5, 0.5)
  ),
  input_dropout_ratio = c(0, 0.05, 0.1, 0.15, 0.2),
  rho = c(0.9, 0.95, 0.99)
)
# DeepLearning Grid 3
deeplearning_params_3 <- list(
  activation = "RectifierWithDropout",
  epochs = 10000, # early stopping
  epsilon = c(1e-6, 1e-7, 1e-8, 1e-9),
  hidden = list(c(50, 50, 50), c(200, 200, 200), c(500, 500, 500)),
  hidden_dropout_ratios = list(
    c(0.1, 0.1, 0.1), c(0.2, 0.2, 0.2),
    c(0.3, 0.3, 0.3), c(0.4, 0.4, 0.4),
    c(0.5, 0.5, 0.5)
  ),
  input_dropout_ratio = c(0, 0.05, 0.1, 0.15, 0.2),
  rho = c(0.9, 0.95, 0.99)
)
# GBM
gbm_params <- list(
  col_sample_rate = c(0.4, 0.7, 1.0),
  col_sample_rate_per_tree = c(0.4, 0.7, 1.0),
  learn_rate = 0.1,
  max_depth = c(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17),
  min_rows = c(1, 5, 10, 15, 30, 100),
  min_split_improvement = c(1e-4, 1e-5),
  ntrees = 10000, # early stopping
  sample_rate = c(0.5, 0.6, 0.7, 0.8, 0.9, 1.0)
)
# XGBoost
xgboost_params <- list(
  booster = c("gbtree", "dart"),
  col_sample_rate = c(0.6, 0.8, 1.0),
  col_sample_rate_per_tree = c(0.7, 0.8, 0.9, 1.0),
  max_depth = c(5, 10, 15, 20),
  min_rows = c(0.01, 0.1, 1.0, 3.0, 5.0, 10.0, 15.0, 20.0),
  ntrees = 10000, # early stopping
  reg_alpha = c(0.001, 0.01, 0.1, 1, 10, 100),
  reg_lambda = c(0.001, 0.01, 0.1, 0.5, 1),
  sample_rate = c(0.6, 0.8, 1.0)
)
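Since the `RandomDiscrete` search only samples from these grids (up to `max_models` combinations per grid), it is worth knowing how large each search space actually is. A quick sketch computed from the lists above (fixed entries such as `epochs` or `learn_rate` contribute a factor of one):

# Number of distinct hyperparameter combinations per random grid
prod(lengths(deeplearning_params_1)) # 4 * 3 * 5 * 5 * 3 = 900 (grids 2 and 3 likewise)
prod(lengths(gbm_params)) # 9720
prod(lengths(xgboost_params)) # 69120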
balanced_accuracy <- function(y_pred, y_true) {
  return(mean(c(Sensitivity(y_pred, y_true), Specificity(y_pred, y_true)), na.rm = TRUE))
}
sem <- function(x, na.rm = TRUE) sd(x, na.rm) / sqrt(length(na.omit(x)))
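`sem` drops missing values before dividing by the square root of the remaining sample size. A toy check with made-up numbers:

sem(c(1, 2, 3, 4, NA)) # sd(1:4) / sqrt(4) = 1.291 / 2 ≈ 0.645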
statistical_testing <- function(data, metric_vec, decreasing_vec, p_value_threshold = 0.05, num_entries = 10,
                                output_path = NULL) {
  for (i in 1:length(metric_vec)) {
    metric <- metric_vec[i]
    ranked_models <- rownames(data[order(data[, paste(metric, "mean")], decreasing = decreasing_vec[i]), ])
    pval <- c()
    # Compare the top-ranked model's ten CV scores against every other model's
    for (j in 2:length(ranked_models)) {
      if (sum(is.na(data[ranked_models[j], paste(metric, "CV", 1:10)]))) {
        pval <- c(pval, NA)
      } else {
        pval <- c(pval, wilcox.test(as.numeric(data[ranked_models[1], paste(metric, "CV", 1:10)]),
          as.numeric(data[ranked_models[j], paste(metric, "CV", 1:10)]),
          exact = FALSE
        )$p.value)
      }
    }
    # Benjamini-Hochberg adjustment; the top-ranked model itself gets NA
    adj_pval <- c(NA, p.adjust(pval, method = "BH", n = length(pval)))
    df <- data.frame(adj_pval)
    rownames(df) <- ranked_models
    colnames(df) <- paste(metric, "adj. <i>p</i> value")
    data <- merge(data, df, by = "row.names", all = TRUE)
    rownames(data) <- data$Row.names
    data$Row.names <- NULL
  }
  for (i in 1:length(metric_vec)) {
    if (decreasing_vec[i]) {
      data[paste(metric_vec[i], "rank")] <- rank(-data[, paste(metric_vec[i], "mean")], ties.method = "average")
    } else {
      data[paste(metric_vec[i], "rank")] <- rank(data[, paste(metric_vec[i], "mean")], ties.method = "average")
    }
  }
  data["Rank sum"] <- rowSums(data[(ncol(data) - length(metric_vec) + 1):ncol(data)])
  data <- data[order(data$`Rank sum`), ]
  # Keep only models that are not significantly worse than the leader on any metric
  competitive <- rowSums(data[paste(metric_vec, "adj. <i>p</i> value")] < p_value_threshold) == 0
  competitive[is.na(competitive)] <- TRUE
  data <- data[competitive, ]
  if (!is.null(output_path)) {
    data[c(
      paste(metric_vec, "adj. <i>p</i> value"), paste(metric_vec, "rank"), "Rank sum",
      melt(sapply(metric_vec, function(x) paste(x, c("mean", "s.e.m."))))$value
    )] %>%
      round(digits = 4) %>%
      write.csv(., file = output_path)
  }
  data[c(paste(metric_vec, "adj. <i>p</i> value"), paste(metric_vec, "rank"), "Rank sum")] %>%
    round(digits = 4) %>%
    datatable(options = list(pageLength = num_entries), escape = FALSE)
}
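`statistical_testing` ranks the models by each metric's CV mean, compares the top-ranked model's ten CV scores against every other model's with a Wilcoxon rank-sum test, applies a Benjamini-Hochberg correction, and keeps only models that are not significantly worse than the leader on any metric. A minimal sketch of that core step with made-up CV scores:

best <- c(0.61, 0.58, 0.63, 0.60, 0.59, 0.62, 0.60, 0.61, 0.57, 0.64)
other <- c(0.52, 0.50, 0.55, 0.49, 0.53, 0.51, 0.54, 0.50, 0.52, 0.48)
pval <- wilcox.test(best, other, exact = FALSE)$p.value
p.adjust(pval, method = "BH") # models with adjusted p >= 0.05 count as competitive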
3.3 Melanin binding (regression)
3.3.1 Model training
train_mb_models <- function(train_set, exp_dir, prefix, nfolds = 10, grid_seed = 1) {
  tmp <- as.h2o(train_set, destination_frame = prefix)
  y <- "log_intensity"
  x <- setdiff(names(tmp), y)
  res <- as.data.frame(tmp$log_intensity)
  # -------------------
  # base model training
  # -------------------
  cat("Deep learning grid 1\n")
  deeplearning_1 <- h2o.grid(
    algorithm = "deeplearning", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, hyper_params = deeplearning_params_1,
    stopping_rounds = 3, keep_cross_validation_models = FALSE,
    search_criteria = list(
      strategy = "RandomDiscrete",
      max_models = 100, seed = grid_seed
    ),
    fold_assignment = "Modulo", parallelism = 0
  )
  for (model_id in deeplearning_1@model_ids) {
    tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  }
  cat("Deep learning grid 2\n")
  deeplearning_2 <- h2o.grid(
    algorithm = "deeplearning", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, hyper_params = deeplearning_params_2,
    stopping_rounds = 3, keep_cross_validation_models = FALSE,
    search_criteria = list(
      strategy = "RandomDiscrete",
      max_models = 100, seed = grid_seed
    ),
    fold_assignment = "Modulo", parallelism = 0
  )
  for (model_id in deeplearning_2@model_ids) {
    tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  }
  cat("Deep learning grid 3\n")
  deeplearning_3 <- h2o.grid(
    algorithm = "deeplearning", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, hyper_params = deeplearning_params_3,
    stopping_rounds = 3, keep_cross_validation_models = FALSE,
    search_criteria = list(
      strategy = "RandomDiscrete",
      max_models = 100, seed = grid_seed
    ),
    fold_assignment = "Modulo", parallelism = 0
  )
  for (model_id in deeplearning_3@model_ids) {
    tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  }
  cat("GBM grid\n")
  gbm <- h2o.grid(
    algorithm = "gbm", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, hyper_params = gbm_params, stopping_rounds = 3,
    search_criteria = list(strategy = "RandomDiscrete", max_models = 300, seed = grid_seed),
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo", parallelism = 0
  )
  for (model_id in gbm@model_ids) {
    tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  }
  cat("GBM 5 default models\n")
  gbm_1 <- h2o.gbm(
    model_id = "GBM_1", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
    col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8, learn_rate = 0.1,
    max_depth = 6, min_rows = 1, min_split_improvement = 1e-5, ntrees = 10000, sample_rate = 0.8,
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  gbm_2 <- h2o.gbm(
    model_id = "GBM_2", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
    col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8, learn_rate = 0.1,
    max_depth = 7, min_rows = 10, min_split_improvement = 1e-5, ntrees = 10000, sample_rate = 0.8,
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  gbm_3 <- h2o.gbm(
    model_id = "GBM_3", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
    col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8, learn_rate = 0.1,
    max_depth = 8, min_rows = 10, min_split_improvement = 1e-5, ntrees = 10000, sample_rate = 0.8,
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  gbm_4 <- h2o.gbm(
    model_id = "GBM_4", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
    col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8, learn_rate = 0.1,
    max_depth = 10, min_rows = 10, min_split_improvement = 1e-5, ntrees = 10000, sample_rate = 0.8,
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  gbm_5 <- h2o.gbm(
    model_id = "GBM_5", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
    col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8, learn_rate = 0.1,
    max_depth = 15, min_rows = 100, min_split_improvement = 1e-5, ntrees = 10000, sample_rate = 0.8,
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  for (model_id in paste0("GBM_", 1:5)) {
    tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  }
  cat("XGBoost grid\n")
  xgboost <- h2o.grid(
    algorithm = "xgboost", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, hyper_params = xgboost_params, stopping_rounds = 3,
    search_criteria = list(strategy = "RandomDiscrete", max_models = 300, seed = grid_seed),
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo", parallelism = 0
  )
  for (model_id in xgboost@model_ids) {
    tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  }
  cat("XGBoost 3 default models\n")
  xgboost_1 <- h2o.xgboost(
    model_id = "XGBoost_1", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
    booster = "gbtree", col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8,
    max_depth = 10, min_rows = 5, ntrees = 10000, reg_alpha = 0, reg_lambda = 1,
    sample_rate = 0.6, keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  xgboost_2 <- h2o.xgboost(
    model_id = "XGBoost_2", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
    booster = "gbtree", col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8,
    max_depth = 20, min_rows = 10, ntrees = 10000, reg_alpha = 0, reg_lambda = 1,
    sample_rate = 0.6, keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  xgboost_3 <- h2o.xgboost(
    model_id = "XGBoost_3", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
    booster = "gbtree", col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8,
    max_depth = 5, min_rows = 3, ntrees = 10000, reg_alpha = 0, reg_lambda = 1,
    sample_rate = 0.8, keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  for (model_id in paste0("XGBoost_", 1:3)) {
    tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  }
  cat("GLM\n")
  glm <- h2o.glm(
    model_id = "GLM", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, alpha = c(0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  tmp_path <- h2o.saveModel(h2o.getModel("GLM"), path = exp_dir, force = TRUE)
  cat("DRF\n")
  drf <- h2o.randomForest(
    model_id = "DRF", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, ntrees = 10000,
    score_tree_interval = 5, stopping_rounds = 3,
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  tmp_path <- h2o.saveModel(h2o.getModel("DRF"), path = exp_dir, force = TRUE)
  cat("XRT\n")
  xrt <- h2o.randomForest(
    model_id = "XRT", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, ntrees = 10000, histogram_type = "Random",
    score_tree_interval = 5, stopping_rounds = 3,
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  tmp_path <- h2o.saveModel(h2o.getModel("XRT"), path = exp_dir, force = TRUE)
  # -----------------------
  # get holdout predictions
  # -----------------------
  base_models <- as.list(c(
    unlist(deeplearning_1@model_ids),
    unlist(deeplearning_2@model_ids),
    unlist(deeplearning_3@model_ids),
    unlist(gbm@model_ids), paste0("GBM_", 1:5),
    unlist(xgboost@model_ids), paste0("XGBoost_", 1:3),
    "GLM",
    "DRF",
    "XRT"
  ))
  for (model_id in base_models) {
    res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id)),
      col.names = model_id
    ))
  }
  # ----------------------
  # super learner training
  # ----------------------
  sl_iter <- 0
  cat(paste0("Super learner iteration ", sl_iter, " (", length(base_models), " models)\n"))
  sl <- h2o.stackedEnsemble(
    x = x, y = y, model_id = paste0("superlearner_iter_", sl_iter),
    training_frame = tmp, seed = 1,
    base_models = base_models,
    metalearner_algorithm = "glm",
    metalearner_nfolds = nfolds,
    keep_levelone_frame = TRUE,
    metalearner_params = list(standardize = TRUE, keep_cross_validation_predictions = TRUE)
  )
  tmp_path <- h2o.saveModel(h2o.getModel(paste0("superlearner_iter_", sl_iter)), path = exp_dir, force = TRUE)
  model_id <- paste0("metalearner_glm_superlearner_iter_", sl_iter)
  tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id)),
    col.names = paste0(model_id, "_", length(base_models), "_models")
  ))
  # ----------------------------------
  # super learner base model reduction
  # ----------------------------------
  # Keep only base models whose metalearner standardized coefficient is positive
  # in at least ceiling(nfolds / 2) of the CV metalearner fits, refit the super
  # learner on the reduced set, and repeat until the set stops shrinking.
  while (TRUE) {
    meta <- h2o.getModel(paste0("metalearner_glm_superlearner_iter_", sl_iter, "_cv_1"))
    names <- meta@model$coefficients_table[, "names"]
    coeffs <- (meta@model$coefficients_table[, "standardized_coefficients"] > 0)
    for (j in 2:nfolds) {
      meta <- h2o.getModel(paste0("metalearner_glm_superlearner_iter_", sl_iter, "_cv_", j))
      names <- meta@model$coefficients_table[, "names"]
      coeffs <- coeffs + (meta@model$coefficients_table[, "standardized_coefficients"] > 0)
    }
    base_models_ <- as.list(names[coeffs >= ceiling(nfolds / 2) & names != "Intercept"])
    if (length(base_models_) == 0) {
      cat("No base models passing the threshold\n\n")
      break
    }
    if (sum(base_models %in% base_models_) == length(base_models)) {
      cat("No further reduction of base models\n\n")
      break
    }
    sl_iter <- sl_iter + 1
    base_models <- base_models_
    cat(paste0("Super learner iteration ", sl_iter, " (", length(base_models), " models)\n"))
    sl <- h2o.stackedEnsemble(
      x = x, y = y, model_id = paste0("superlearner_iter_", sl_iter),
      training_frame = tmp, seed = 1,
      base_models = base_models,
      metalearner_algorithm = "glm",
      metalearner_nfolds = nfolds,
      keep_levelone_frame = TRUE,
      metalearner_params = list(standardize = TRUE, keep_cross_validation_predictions = TRUE)
    )
    tmp_path <- h2o.saveModel(h2o.getModel(paste0("superlearner_iter_", sl_iter)), path = exp_dir, force = TRUE)
    model_id <- paste0("metalearner_glm_superlearner_iter_", sl_iter)
    tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
    res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id)),
      col.names = paste0(model_id, "_", length(base_models), "_models")
    ))
  }
  # -----------------------------------------
  # super learner for homogeneous base models
  # -----------------------------------------
  # DeepLearning
  base_models <- as.list(c(
    unlist(deeplearning_1@model_ids),
    unlist(deeplearning_2@model_ids),
    unlist(deeplearning_3@model_ids)
  ))
  cat(paste0("Super learner deep learning (", length(base_models), " models)\n"))
  sl <- h2o.stackedEnsemble(
    x = x, y = y, model_id = "superlearner_deeplearning",
    training_frame = tmp, seed = 1,
    base_models = base_models,
    metalearner_algorithm = "glm",
    metalearner_nfolds = nfolds,
    keep_levelone_frame = TRUE,
    metalearner_params = list(standardize = TRUE, keep_cross_validation_predictions = TRUE)
  )
  tmp_path <- h2o.saveModel(h2o.getModel("superlearner_deeplearning"), path = exp_dir, force = TRUE)
  model_id <- "metalearner_glm_superlearner_deeplearning"
  tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id)),
    col.names = paste0(model_id, "_", length(base_models), "_models")
  ))
  # GBM
  base_models <- as.list(c(unlist(gbm@model_ids), paste0("GBM_", 1:5)))
  cat(paste0("Super learner GBM (", length(base_models), " models)\n"))
  sl <- h2o.stackedEnsemble(
    x = x, y = y, model_id = "superlearner_gbm",
    training_frame = tmp, seed = 1,
    base_models = base_models,
    metalearner_algorithm = "glm",
    metalearner_nfolds = nfolds,
    keep_levelone_frame = TRUE,
    metalearner_params = list(standardize = TRUE, keep_cross_validation_predictions = TRUE)
  )
  tmp_path <- h2o.saveModel(h2o.getModel("superlearner_gbm"), path = exp_dir, force = TRUE)
  model_id <- "metalearner_glm_superlearner_gbm"
  tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id)),
    col.names = paste0(model_id, "_", length(base_models), "_models")
  ))
  # XGBoost
  base_models <- as.list(c(unlist(xgboost@model_ids), paste0("XGBoost_", 1:3)))
  cat(paste0("Super learner XGBoost (", length(base_models), " models)\n"))
  sl <- h2o.stackedEnsemble(
    x = x, y = y, model_id = "superlearner_xgboost",
    training_frame = tmp, seed = 1,
    base_models = base_models,
    metalearner_algorithm = "glm",
    metalearner_nfolds = nfolds,
    keep_levelone_frame = TRUE,
    metalearner_params = list(standardize = TRUE, keep_cross_validation_predictions = TRUE)
  )
  tmp_path <- h2o.saveModel(h2o.getModel("superlearner_xgboost"), path = exp_dir, force = TRUE)
  model_id <- "metalearner_glm_superlearner_xgboost"
  tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id)),
    col.names = paste0(model_id, "_", length(base_models), "_models")
  ))
  write.csv(res, file = paste0(exp_dir, "/cv_holdout_predictions.csv"), row.names = FALSE)
  cat("\n\n")
  h2o.removeAll()
}
3.3.2 Inner cross-validation
load(file = "./rdata/var_reduct_mb_train_test_splits.RData")
for (i in 1:10) {
cat(paste0("Outer training set ", i, "\n"))
<- paste0("outer_", i)
prefix <- paste0("/Users/renee/Downloads/melanin_binding/", prefix)
exp_dir dir.create(exp_dir)
train_mb_models(train_set = outer_splits[[i]][["train"]], exp_dir = exp_dir, prefix = prefix)
}
3.3.3 Training on whole data set
load(file = "./rdata/var_reduct_mb_train_test_splits.RData")
<- "whole_data_set"
prefix <- paste0("/Users/renee/Downloads/melanin_binding/", prefix)
exp_dir dir.create(exp_dir)
train_mb_models(train_set = mb_data, exp_dir = exp_dir, prefix = prefix)
# Keep track of grid models
dl_grid <- list()
gbm_grid <- list()
xgboost_grid <- list()
dl_grid_params <- list()
gbm_grid_params <- list()
xgboost_grid_params <- list()
for (i in 1:11) {
  if (i == 11) {
    cat(paste0("Whole data set\n"))
    prefix <- "whole_data_set"
  } else {
    cat(paste0("Outer training set ", i, "\n"))
    prefix <- paste0("outer_", i)
  }
  dir <- paste0("/Users/renee/Downloads/melanin_binding/", prefix)
  files <- list.files(dir)
  # Deep learning
  dl <- files[str_detect(files, "DeepLearning_model")]
  for (m in dl) {
    model <- h2o.loadModel(paste0(dir, "/", m))
    hs <- sha1(paste(c(
      model@allparameters$epsilon,
      model@allparameters$hidden,
      model@allparameters$hidden_dropout_ratios,
      model@allparameters$input_dropout_ratio,
      model@allparameters$rho
    ), collapse = " "))
    if (hs %in% names(dl_grid)) {
      dl_grid[[hs]] <- c(dl_grid[[hs]], m)
    } else {
      dl_grid[[hs]] <- c(m)
      dl_grid_params <- list.append(
        dl_grid_params,
        c(
          "epsilon" = model@allparameters$epsilon,
          "hidden" = paste0("[", paste(model@allparameters$hidden, collapse = ","), "]"),
          "hidden_dropout_ratios" = paste0(
            "[",
            paste(model@allparameters$hidden_dropout_ratios,
              collapse = ","
            ),
            "]"
          ),
          "input_dropout_ratio" = model@allparameters$input_dropout_ratio,
          "rho" = model@allparameters$rho
        )
      )
    }
  }
  h2o.removeAll()
  # GBM
  gbm <- files[str_detect(files, "GBM_model")]
  for (m in gbm) {
    model <- h2o.loadModel(paste0(dir, "/", m))
    hs <- sha1(paste(c(
      model@allparameters$col_sample_rate,
      model@allparameters$col_sample_rate_per_tree,
      model@allparameters$max_depth,
      model@allparameters$min_rows,
      model@allparameters$min_split_improvement,
      model@allparameters$sample_rate
    ), collapse = " "))
    if (hs %in% names(gbm_grid)) {
      gbm_grid[[hs]] <- c(gbm_grid[[hs]], m)
    } else {
      gbm_grid[[hs]] <- c(m)
      gbm_grid_params <- list.append(
        gbm_grid_params,
        c(
          "col_sample_rate" = model@allparameters$col_sample_rate,
          "col_sample_rate_per_tree" = model@allparameters$col_sample_rate_per_tree,
          "max_depth" = model@allparameters$max_depth,
          "min_rows" = model@allparameters$min_rows,
          "min_split_improvement" = model@allparameters$min_split_improvement,
          "sample_rate" = model@allparameters$sample_rate
        )
      )
    }
  }
  h2o.removeAll()
  # XGBoost
  xgboost <- files[str_detect(files, "XGBoost_model")]
  for (m in xgboost) {
    model <- h2o.loadModel(paste0(dir, "/", m))
    hs <- sha1(paste(c(
      model@allparameters$booster,
      model@allparameters$col_sample_rate,
      model@allparameters$col_sample_rate_per_tree,
      model@allparameters$max_depth,
      model@allparameters$min_rows,
      model@allparameters$reg_alpha,
      model@allparameters$reg_lambda,
      model@allparameters$sample_rate
    ), collapse = " "))
    if (hs %in% names(xgboost_grid)) {
      xgboost_grid[[hs]] <- c(xgboost_grid[[hs]], m)
    } else {
      xgboost_grid[[hs]] <- c(m)
      xgboost_grid_params <- list.append(
        xgboost_grid_params,
        c(
          "booster" = model@allparameters$booster,
          "col_sample_rate" = model@allparameters$col_sample_rate,
          "col_sample_rate_per_tree" = model@allparameters$col_sample_rate_per_tree,
          "max_depth" = model@allparameters$max_depth,
          "min_rows" = model@allparameters$min_rows,
          "reg_alpha" = model@allparameters$reg_alpha,
          "reg_lambda" = model@allparameters$reg_lambda,
          "sample_rate" = model@allparameters$sample_rate
        )
      )
    }
  }
  h2o.removeAll()
}
dl_grid_params <- as.data.frame(t(data.frame(dl_grid_params)))
rownames(dl_grid_params) <- paste("Neural network grid model", 1:nrow(dl_grid_params))
gbm_grid_params <- as.data.frame(t(data.frame(gbm_grid_params)))
rownames(gbm_grid_params) <- paste("GBM grid model", 1:nrow(gbm_grid_params))
xgboost_grid_params <- as.data.frame(t(data.frame(xgboost_grid_params)))
rownames(xgboost_grid_params) <- paste("XGBoost grid model", 1:nrow(xgboost_grid_params))
write.csv(dl_grid_params, "./other_data/melanin_binding/neural_network_grid_params.csv")
write.csv(gbm_grid_params, "./other_data/melanin_binding/gbm_grid_params.csv")
write.csv(xgboost_grid_params, "./other_data/melanin_binding/xgboost_grid_params.csv")
save(dl_grid, gbm_grid, xgboost_grid, file = "./rdata/model_training_grid_models_mb.RData")
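After this bookkeeping each `*_grid` list maps one hyperparameter combination, keyed by the `sha1` digest of its values, to all saved models trained with that combination across the ten outer runs and the whole data set, while `*_grid_params` keeps one human-readable row per combination. The shape of an entry, with hypothetical digest and model ids:

# dl_grid[["8c6c0b2..."]]
# [1] "DeepLearning_model_R_..._1" "DeepLearning_model_R_..._7" ... (up to 11 files)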
3.3.4 Model evaluation
model_evaluation <- function(holdout_pred, fold_assign, grid_name, grid_meta = NULL) {
  load(file = "./rdata/var_reduct_mb_train_test_splits.RData")
  res_ <- list()
  model_num <- c()
  if (startsWith(grid_name, "Super learner")) {
    if (grid_name == "Super learner all models") {
      m <- colnames(holdout_pred)[startsWith(colnames(holdout_pred), "metalearner_glm_superlearner_iter_0")]
    } else if (grid_name == "Super learner final") {
      sl_models <- colnames(holdout_pred)[startsWith(colnames(holdout_pred), "metalearner_glm_superlearner_iter")]
      m <- sl_models[length(sl_models)]
    } else if (grid_name == "Super learner neural network") {
      m <- colnames(holdout_pred)[startsWith(colnames(holdout_pred), "metalearner_glm_superlearner_deeplearning")]
    } else if (grid_name == "Super learner GBM") {
      m <- colnames(holdout_pred)[startsWith(colnames(holdout_pred), "metalearner_glm_superlearner_gbm")]
    } else if (grid_name == "Super learner XGBoost") {
      m <- colnames(holdout_pred)[startsWith(colnames(holdout_pred), "metalearner_glm_superlearner_xgboost")]
    }
    mae <- c()
    rmse <- c()
    R2 <- c()
    for (k in 1:10) {
      y_true <- holdout_pred[fold_assign == k, "log_intensity"]
      y_pred <- holdout_pred[fold_assign == k, m]
      mae <- c(mae, MAE(y_pred = y_pred, y_true = y_true))
      rmse <- c(rmse, RMSE(y_pred = y_pred, y_true = y_true))
      R2 <- c(R2, R2_Score(y_pred = y_pred, y_true = y_true))
    }
    # Re-scale to 0-100
    mae <- rescale(mae, to = c(0, 100), from = range(mb_data$log_intensity))
    rmse <- rescale(rmse, to = c(0, 100), from = range(mb_data$log_intensity))
    res_ <- list.append(res_, c(
      mean(mae, na.rm = TRUE), sem(mae),
      mean(rmse, na.rm = TRUE), sem(rmse),
      mean(R2, na.rm = TRUE), sem(R2),
      mae, rmse, R2
    ))
  } else {
    for (j in 1:length(grid_meta)) {
      g <- grid_meta[[j]]
      m <- intersect(colnames(holdout_pred), g)
      if (length(m) == 1) {
        model_num <- c(model_num, j)
        mae <- c()
        rmse <- c()
        R2 <- c()
        for (k in 1:10) {
          y_true <- holdout_pred[fold_assign == k, "log_intensity"]
          y_pred <- holdout_pred[fold_assign == k, m]
          mae <- c(mae, MAE(y_pred = y_pred, y_true = y_true))
          rmse <- c(rmse, RMSE(y_pred = y_pred, y_true = y_true))
          R2 <- c(R2, R2_Score(y_pred = y_pred, y_true = y_true))
        }
        # Re-scale to 0-100
        mae <- rescale(mae, to = c(0, 100), from = range(mb_data$log_intensity))
        rmse <- rescale(rmse, to = c(0, 100), from = range(mb_data$log_intensity))
        res_ <- list.append(res_, c(
          mean(mae, na.rm = TRUE), sem(mae),
          mean(rmse, na.rm = TRUE), sem(rmse),
          mean(R2, na.rm = TRUE), sem(R2),
          mae, rmse, R2
        ))
      }
    }
  }
  res <- as.data.frame(t(data.frame(res_)))
  colnames(res) <- c(
    "Norm. MAE mean", "Norm. MAE s.e.m.", "Norm. RMSE mean", "Norm. RMSE s.e.m.",
    "R^2 mean", "R^2 s.e.m.",
    paste("Norm. MAE CV", 1:10), paste("Norm. RMSE CV", 1:10), paste("R^2 CV", 1:10)
  )
  if (nrow(res) == 1) {
    rownames(res) <- grid_name
  } else {
    rownames(res) <- paste(grid_name, model_num)
  }
  return(res)
}
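For reference, the "Norm." metrics produced above follow directly from the `rescale` calls: with `r = mb_data$log_intensity`, each per-fold error is mapped onto a 0-100 scale as

# norm_err = (err - min(r)) / (max(r) - min(r)) * 100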
3.3.5 Inner loop model selection
load(file = "./rdata/model_training_grid_models_mb.RData")
for (i in 1:10) {
<- read.csv(paste0("./other_data/melanin_binding/outer_", i, "/cv_holdout_predictions.csv"))
holdout_pred <- rep(1:10, ceiling(nrow(holdout_pred) / 10))[1:nrow(holdout_pred)]
fold_assign <- list()
data_ # Deep learning grid models
<- list.append(data_, model_evaluation(holdout_pred, fold_assign,
data_ grid_name = "Neural network grid model",
grid_meta = dl_grid
))# GBM grid models
<- list.append(data_, model_evaluation(holdout_pred, fold_assign,
data_ grid_name = "GBM grid model",
grid_meta = gbm_grid
))# GBM default models
<- list.append(data_, model_evaluation(holdout_pred, fold_assign,
data_ grid_name = "GBM model",
grid_meta = as.list(paste0("GBM_", 1:5))
))# XGBoost grid models
<- list.append(data_, model_evaluation(holdout_pred, fold_assign,
data_ grid_name = "XGBoost grid model",
grid_meta = xgboost_grid
))# XGBoost default models
<- list.append(data_, model_evaluation(holdout_pred, fold_assign,
data_ grid_name = "XGBoost model",
grid_meta = as.list(paste0("XGBoost_", 1:3))
))# GLM
<- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "GLM model", grid_meta = list("GLM")))
data_ # DRF
<- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "DRF model", grid_meta = list("DRF")))
data_ # XRT
<- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "XRT model", grid_meta = list("XRT")))
data_ # Super learner all
<- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner all models"))
data_ # Super learner final
<- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner final"))
data_ # Super learner deep learning
<- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner neural network"))
data_ # Super learner GBM
<- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner GBM"))
data_ # Super learner XGBoost
<- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner XGBoost"))
data_
<- do.call(rbind, data_)
data write.csv(data, paste0("./other_data/melanin_binding/outer_", i, "/cv_res.csv"))
}
3.3.6 Final evaluation
load(file = "./rdata/var_reduct_mb_train_test_splits.RData")
<- "/Users/renee/Downloads/melanin_binding"
dir <- c(
selected_models "Super learner final", "Super learner final", "Super learner final",
"Super learner final", "Super learner final", "Super learner final",
"Super learner final", "Super learner final", "Super learner final",
"Super learner final"
)<- c()
mae <- c()
rmse <- c()
R2 for (i in 1:10) {
<- read.csv(paste0("./other_data/melanin_binding/outer_", i, "/cv_holdout_predictions.csv"))
holdout_pred if (startsWith(selected_models[i], "Neural network grid model")) {
<- as.integer(str_extract(selected_models[i], "[0-9]+"))
grid_num <- intersect(colnames(holdout_pred), dl_grid[[grid_num]])
m <- h2o.loadModel(paste0(dir, "/outer_", i, "/", m))
model else if (startsWith(selected_models[i], "GBM grid model")) {
} <- as.integer(str_extract(selected_models[i], "[0-9]+"))
grid_num <- intersect(colnames(holdout_pred), gbm_grid[[grid_num]])
m <- h2o.loadModel(paste0(dir, "/outer_", i, "/", m))
model else if (startsWith(selected_models[i], "GBM model")) {
} <- as.integer(str_extract(selected_models[i], "[0-9]+"))
grid_num <- h2o.loadModel(paste0(dir, "/outer_", i, "/GBM_", grid_num))
model else if (startsWith(selected_models[i], "XGBoost grid model")) {
} <- as.integer(str_extract(selected_models[i], "[0-9]+"))
grid_num <- intersect(colnames(holdout_pred), xgboost_grid[[grid_num]])
m <- h2o.loadModel(paste0(dir, "/outer_", i, "/", m))
model else if (startsWith(selected_models[i], "XGBoost model")) {
} <- as.integer(str_extract(selected_models[i], "[0-9]+"))
grid_num <- h2o.loadModel(paste0(dir, "/outer_", i, "/XGBoost_", grid_num))
model else if (selected_models[i] == "Super learner all models") {
} <- h2o.loadModel(paste0(dir, "/outer_", i, "/superlearner_iter_0"))
model else if (selected_models[i] == "Super learner final") {
} <- list.files(paste0(dir, "/outer_", i))
files <- files[startsWith(files, "superlearner_iter")]
files <- h2o.loadModel(paste0(dir, "/outer_", i, "/", files[length(files)]))
model else if (selected_models[i] == "Super learner neural network") {
} <- h2o.loadModel(paste0(dir, "/outer_", i, "/superlearner_deeplearning"))
model else if (selected_models[i] == "Super learner GBM") {
} <- h2o.loadModel(paste0(dir, "/outer_", i, "/superlearner_gbm"))
model else if (selected_models[i] == "Super learner XGBoost") {
} <- h2o.loadModel(paste0(dir, "/outer_", i, "/superlearner_xgboost"))
model else {
} <- h2o.loadModel(paste0(dir, "/outer_", i, "/", unlist(strsplit(selected_models[i], " "))[1]))
model
}<- as.h2o(subset(outer_splits[[i]][["test"]], select = -log_intensity))
tmp <- outer_splits[[i]][["test"]]$log_intensity
y_true <- as.data.frame(as.data.frame(h2o.predict(model, tmp)))[, 1]
y_pred <- c(mae, MAE(y_pred = y_pred, y_true = y_true))
mae <- c(rmse, RMSE(y_pred = y_pred, y_true = y_true))
rmse <- c(R2, R2_Score(y_pred = y_pred, y_true = y_true))
R2
}
# Re-scale to 0-100
<- rescale(mae, to = c(0, 100), from = range(mb_data$log_intensity))
mae <- rescale(rmse, to = c(0, 100), from = range(mb_data$log_intensity))
rmse
h2o.removeAll()
save(mae, rmse, R2, file = "./rdata/final_evaluation_mb.RData")
load(file = "./rdata/final_evaluation_mb.RData")
data.frame(
Metric = c("Norm. MAE", "Norm. RMSE", "R<sup>2</sup>"),
`Mean ± s.e.m.` = c(
paste0(round(mean(mae), 3), " ± ", round(sem(mae), 3)),
paste0(round(mean(rmse), 3), " ± ", round(sem(rmse), 3)),
paste0(round(mean(R2), 3), " ± ", round(sem(R2), 3))
),check.names = FALSE
%>%
) datatable(escape = FALSE, rownames = FALSE)
3.3.7 Final model selection
load(file = "./rdata/model_training_grid_models_mb.RData")
<- read.csv("./other_data/melanin_binding/whole_data_set/cv_holdout_predictions.csv")
holdout_pred <- rep(1:10, ceiling(nrow(holdout_pred) / 10))[1:nrow(holdout_pred)]
fold_assign <- list()
data_ # Deep learning grid models
<- list.append(data_, model_evaluation(holdout_pred, fold_assign,
data_ grid_name = "Neural network grid model",
grid_meta = dl_grid
))# GBM grid models
<- list.append(data_, model_evaluation(holdout_pred, fold_assign,
data_ grid_name = "GBM grid model",
grid_meta = gbm_grid
))# GBM default models
<- list.append(data_, model_evaluation(holdout_pred, fold_assign,
data_ grid_name = "GBM model",
grid_meta = as.list(paste0("GBM_", 1:5))
))# XGBoost grid models
<- list.append(data_, model_evaluation(holdout_pred, fold_assign,
data_ grid_name = "XGBoost grid model",
grid_meta = xgboost_grid
))# XGBoost default models
<- list.append(data_, model_evaluation(holdout_pred, fold_assign,
data_ grid_name = "XGBoost model",
grid_meta = as.list(paste0("XGBoost_", 1:3))
))# GLM
<- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "GLM model", grid_meta = list("GLM")))
data_ # DRF
<- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "DRF model", grid_meta = list("DRF")))
data_ # XRT
<- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "XRT model", grid_meta = list("XRT")))
data_ # Super learner all
<- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner all models"))
data_ # Super learner final
<- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner final"))
data_ # Super learner deep learning
<- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner neural network"))
data_ # Super learner GBM
<- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner GBM"))
data_ # Super learner XGBoost
<- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner XGBoost"))
data_
<- do.call(rbind, data_)
data write.csv(data, "./other_data/melanin_binding/whole_data_set/cv_res.csv")
<- read.csv("./other_data/melanin_binding/whole_data_set/cv_res.csv", row.names = 1, check.names = FALSE)
data statistical_testing(data, metric_vec = c("Norm. MAE", "Norm. RMSE", "R^2"), decreasing_vec = c(FALSE, FALSE, TRUE))
# statistical_testing(data, metric_vec=c('Norm. MAE', 'Norm. RMSE', 'R^2'), decreasing_vec=c(FALSE, FALSE, TRUE),
# output_path='./other_data/melanin_binding/whole_data_set/cv_res_statistical_testing.csv')
3.3.8 Final reduced SL
h2o.init(nthreads = -1)
<- "/Users/renee/Downloads/melanin_binding/whole_data_set/superlearner_iter_3"
model_path <- h2o.loadModel(model_path)
model load(file = "./rdata/model_training_grid_models_mb.RData")
<- read.csv("./other_data/melanin_binding/whole_data_set/cv_res.csv", row.names = 1, check.names = FALSE)
data <- as.data.frame(model@model$metalearner_model@model$coefficients_table)[c("names", "standardized_coefficients")][-1, ]
df
for (i in 1:nrow(df)) {
<- df$names[i]
name if (startsWith(name, "DeepLearning_model")) {
for (j in 1:length(dl_grid)) {
if (name %in% dl_grid[[j]]) break
}$names[i] <- paste("Neural network grid model", j)
dfelse if (startsWith(name, "GBM_model")) {
} for (j in 1:length(gbm_grid)) {
if (name %in% gbm_grid[[j]]) break
}$names[i] <- paste("GBM grid model", j)
dfelse if (startsWith(name, "XGBoost_model")) {
} for (j in 1:length(xgboost_grid)) {
if (name %in% xgboost_grid[[j]]) break
}$names[i] <- paste("XGBoost grid model", j)
dfelse {
} $names[i] <- paste(strsplit(name, "_")[[1]], collapse = " model ")
df
}
}
rownames(df) <- df$names
<- merge(df, data["R^2 mean"], by = "row.names")[, -1]
df <- df[df$standardized_coefficients > 0, ] df
<- ggplot(df, aes(x = reorder(names, standardized_coefficients), y = standardized_coefficients, fill = `R^2 mean`)) +
p1 geom_col(color = "black", size = 0.2) +
scale_fill_gradient(low = "white", high = "red", n.breaks = 4, limits = c(0.30, 0.60), breaks = c(0.3, 0.4, 0.5, 0.6)) +
geom_text(aes(label = sprintf("%.2f", `R^2 mean`)), nudge_y = -0.002, size = 3, color = "white") +
geom_text(aes(label = sprintf("%.3f", standardized_coefficients)), nudge_y = 0.0025, size = 3) +
coord_flip() +
scale_y_continuous(limits = c(0, max(df$standardized_coefficients) + 0.005), expand = c(0.01, 0)) +
theme_bw() +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
strip.background = element_blank(),
panel.border = element_blank(),
axis.line = element_line(color = "black"),
legend.text = element_text(colour = "black", size = 10),
plot.title = element_text(hjust = 0.5, size = 18),
plot.margin = ggplot2::margin(10, 10, 10, 0, "pt"),
axis.title.x = element_text(colour = "black", size = 18),
axis.title.y = element_text(colour = "black", size = 18),
axis.text.x = element_text(colour = "black", size = 12),
axis.text.y = element_text(colour = "black", size = 12),
legend.title = element_text(size = 14),
legend.position = c(0.85, 0.5)
+
) guides(fill = guide_colorbar(title = expression(italic("R"^2)))) +
xlab("") +
ylab("Coefficient") +
ggtitle("Melanin binding")
3.4 Cell-penetration (classification)
3.4.1 Model training
train_cpp_models <- function(train_set, exp_dir, prefix, nfolds = 10, grid_seed = 1) {
  tmp <- as.h2o(train_set, destination_frame = prefix)
  tmp["category"] <- as.factor(tmp["category"])
  y <- "category"
  x <- setdiff(names(tmp), y)
  res <- as.data.frame(tmp$category)
  samp_factors <- as.vector(mean(table(train_set$category)) / table(train_set$category))
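  # Toy illustration with hypothetical counts: for two classes with 80 and 20
  # rows, mean(table(...)) = 50, so samp_factors = c(50 / 80, 50 / 20) =
  # c(0.625, 2.5): the majority class is undersampled and the minority class
  # oversampled towards equal frequencies during training.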
  # -------------------
  # base model training
  # -------------------
  cat("Deep learning grid 1\n")
  deeplearning_1 <- h2o.grid(
    algorithm = "deeplearning", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, hyper_params = deeplearning_params_1,
    stopping_rounds = 3, balance_classes = TRUE, class_sampling_factors = samp_factors,
    search_criteria = list(
      strategy = "RandomDiscrete",
      max_models = 100, seed = grid_seed
    ),
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo", parallelism = 0
  )
  for (model_id in deeplearning_1@model_ids) {
    tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  }
  cat("GBM grid\n")
  gbm <- h2o.grid(
    algorithm = "gbm", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, hyper_params = gbm_params, stopping_rounds = 3,
    balance_classes = TRUE, class_sampling_factors = samp_factors,
    search_criteria = list(strategy = "RandomDiscrete", max_models = 100, seed = grid_seed),
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo", parallelism = 0
  )
  for (model_id in gbm@model_ids) {
    tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  }
  cat("GBM 5 default models\n")
  gbm_1 <- h2o.gbm(
    model_id = "GBM_1", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
    balance_classes = TRUE, class_sampling_factors = samp_factors,
    col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8, learn_rate = 0.1,
    max_depth = 6, min_rows = 1, min_split_improvement = 1e-5, ntrees = 10000, sample_rate = 0.8,
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  gbm_2 <- h2o.gbm(
    model_id = "GBM_2", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
    balance_classes = TRUE, class_sampling_factors = samp_factors,
    col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8, learn_rate = 0.1,
    max_depth = 7, min_rows = 10, min_split_improvement = 1e-5, ntrees = 10000, sample_rate = 0.8,
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  gbm_3 <- h2o.gbm(
    model_id = "GBM_3", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
    balance_classes = TRUE, class_sampling_factors = samp_factors,
    col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8, learn_rate = 0.1,
    max_depth = 8, min_rows = 10, min_split_improvement = 1e-5, ntrees = 10000, sample_rate = 0.8,
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  gbm_4 <- h2o.gbm(
    model_id = "GBM_4", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
    balance_classes = TRUE, class_sampling_factors = samp_factors,
    col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8, learn_rate = 0.1,
    max_depth = 10, min_rows = 10, min_split_improvement = 1e-5, ntrees = 10000, sample_rate = 0.8,
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  gbm_5 <- h2o.gbm(
    model_id = "GBM_5", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
    balance_classes = TRUE, class_sampling_factors = samp_factors,
    col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8, learn_rate = 0.1,
    max_depth = 15, min_rows = 100, min_split_improvement = 1e-5, ntrees = 10000, sample_rate = 0.8,
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  for (model_id in paste0("GBM_", 1:5)) {
    tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  }
  cat("XGBoost grid\n")
  xgboost <- h2o.grid(
    algorithm = "xgboost", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, hyper_params = xgboost_params, stopping_rounds = 3,
    search_criteria = list(strategy = "RandomDiscrete", max_models = 100, seed = grid_seed),
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo", parallelism = 0
  )
  for (model_id in xgboost@model_ids) {
    tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  }
  cat("XGBoost 3 default models\n")
  xgboost_1 <- h2o.xgboost(
    model_id = "XGBoost_1", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
    booster = "gbtree", col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8,
    max_depth = 10, min_rows = 5, ntrees = 10000, reg_alpha = 0, reg_lambda = 1,
    sample_rate = 0.6, keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  xgboost_2 <- h2o.xgboost(
    model_id = "XGBoost_2", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
    booster = "gbtree", col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8,
    max_depth = 20, min_rows = 10, ntrees = 10000, reg_alpha = 0, reg_lambda = 1,
    sample_rate = 0.6, keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  xgboost_3 <- h2o.xgboost(
    model_id = "XGBoost_3", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
    booster = "gbtree", col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8,
    max_depth = 5, min_rows = 3, ntrees = 10000, reg_alpha = 0, reg_lambda = 1,
    sample_rate = 0.8, keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  for (model_id in paste0("XGBoost_", 1:3)) {
    tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  }
  cat("DRF\n")
  drf <- h2o.randomForest(
    model_id = "DRF", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, ntrees = 10000,
    score_tree_interval = 5, stopping_rounds = 3,
    balance_classes = TRUE, class_sampling_factors = samp_factors,
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  tmp_path <- h2o.saveModel(h2o.getModel("DRF"), path = exp_dir, force = TRUE)
  cat("XRT\n")
  xrt <- h2o.randomForest(
    model_id = "XRT", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, ntrees = 10000, histogram_type = "Random",
    score_tree_interval = 5, stopping_rounds = 3,
    balance_classes = TRUE, class_sampling_factors = samp_factors,
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  tmp_path <- h2o.saveModel(h2o.getModel("XRT"), path = exp_dir, force = TRUE)
  # -----------------------
  # get holdout predictions
  # -----------------------
  base_models <- as.list(c(
    unlist(deeplearning_1@model_ids),
    unlist(gbm@model_ids), paste0("GBM_", 1:5),
    unlist(xgboost@model_ids), paste0("XGBoost_", 1:3),
    "DRF",
    "XRT"
  ))
  for (model_id in base_models) {
    res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id))["penetrating"],
      col.names = model_id
    ))
  }
  # ----------------------
  # super learner training
  # ----------------------
  sl_iter <- 0
  cat(paste0("Super learner iteration ", sl_iter, " (", length(base_models), " models)\n"))
  sl <- h2o.stackedEnsemble(
    x = x, y = y, model_id = paste0("superlearner_iter_", sl_iter),
    training_frame = tmp, seed = 1,
    base_models = base_models,
    metalearner_algorithm = "glm",
    metalearner_nfolds = nfolds,
    keep_levelone_frame = TRUE,
    metalearner_params = list(
      standardize = TRUE, keep_cross_validation_predictions = TRUE,
      balance_classes = TRUE, class_sampling_factors = samp_factors
    )
  )
  tmp_path <- h2o.saveModel(h2o.getModel(paste0("superlearner_iter_", sl_iter)), path = exp_dir, force = TRUE)
  model_id <- paste0("metalearner_glm_superlearner_iter_", sl_iter)
  tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id))["penetrating"],
    col.names = paste0(model_id, "_", length(base_models), "_models")
  ))
  # ----------------------------------
  # super learner base model reduction
  # ----------------------------------
  # As in the regression case: keep base models whose metalearner standardized
  # coefficient is positive in at least half of the CV fits, refit, repeat.
  while (TRUE) {
    meta <- h2o.getModel(paste0("metalearner_glm_superlearner_iter_", sl_iter, "_cv_1"))
    names <- meta@model$coefficients_table[, "names"]
    coeffs <- (meta@model$coefficients_table[, "standardized_coefficients"] > 0)
    for (j in 2:nfolds) {
      meta <- h2o.getModel(paste0("metalearner_glm_superlearner_iter_", sl_iter, "_cv_", j))
      names <- meta@model$coefficients_table[, "names"]
      coeffs <- coeffs + (meta@model$coefficients_table[, "standardized_coefficients"] > 0)
    }
    base_models_ <- as.list(names[coeffs >= ceiling(nfolds / 2) & names != "Intercept"])
    if (length(base_models_) == 0) {
      cat("No base models passing the threshold\n\n")
      break
    }
    if (sum(base_models %in% base_models_) == length(base_models)) {
      cat("No further reduction of base models\n\n")
      break
    }
    sl_iter <- sl_iter + 1
    base_models <- base_models_
    cat(paste0("Super learner iteration ", sl_iter, " (", length(base_models), " models)\n"))
    sl <- h2o.stackedEnsemble(
      x = x, y = y, model_id = paste0("superlearner_iter_", sl_iter),
      training_frame = tmp, seed = 1,
      base_models = base_models,
      metalearner_algorithm = "glm",
      metalearner_nfolds = nfolds,
      keep_levelone_frame = TRUE,
      metalearner_params = list(
        standardize = TRUE, keep_cross_validation_predictions = TRUE,
        balance_classes = TRUE, class_sampling_factors = samp_factors
      )
    )
    tmp_path <- h2o.saveModel(h2o.getModel(paste0("superlearner_iter_", sl_iter)), path = exp_dir, force = TRUE)
    model_id <- paste0("metalearner_glm_superlearner_iter_", sl_iter)
    tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
    res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id))["penetrating"],
      col.names = paste0(model_id, "_", length(base_models), "_models")
    ))
  }
  # -----------------------------------------
  # super learner for homogeneous base models
  # -----------------------------------------
  # DeepLearning
  base_models <- deeplearning_1@model_ids
  cat(paste0("Super learner deep learning (", length(base_models), " models)\n"))
  sl <- h2o.stackedEnsemble(
    x = x, y = y, model_id = "superlearner_deeplearning",
    training_frame = tmp, seed = 1,
    base_models = base_models,
    metalearner_algorithm = "glm",
    metalearner_nfolds = nfolds,
    keep_levelone_frame = TRUE,
    metalearner_params = list(
      standardize = TRUE, keep_cross_validation_predictions = TRUE,
      balance_classes = TRUE, class_sampling_factors = samp_factors
    )
  )
  tmp_path <- h2o.saveModel(h2o.getModel("superlearner_deeplearning"), path = exp_dir, force = TRUE)
  model_id <- "metalearner_glm_superlearner_deeplearning"
  tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id))["penetrating"],
    col.names = paste0(model_id, "_", length(base_models), "_models")
  ))
  # GBM
  base_models <- as.list(c(unlist(gbm@model_ids), paste0("GBM_", 1:5)))
  cat(paste0("Super learner GBM (", length(base_models), " models)\n"))
  sl <- h2o.stackedEnsemble(
    x = x, y = y, model_id = "superlearner_gbm",
    training_frame = tmp, seed = 1,
    base_models = base_models,
    metalearner_algorithm = "glm",
    metalearner_nfolds = nfolds,
    keep_levelone_frame = TRUE,
    metalearner_params = list(
      standardize = TRUE, keep_cross_validation_predictions = TRUE,
      balance_classes = TRUE, class_sampling_factors = samp_factors
    )
  )
  tmp_path <- h2o.saveModel(h2o.getModel("superlearner_gbm"), path = exp_dir, force = TRUE)
  model_id <- "metalearner_glm_superlearner_gbm"
  tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id))["penetrating"],
    col.names = paste0(model_id, "_", length(base_models), "_models")
  ))
  # XGBoost
  base_models <- as.list(c(unlist(xgboost@model_ids), paste0("XGBoost_", 1:3)))
  cat(paste0("Super learner XGBoost (", length(base_models), " models)\n"))
  sl <- h2o.stackedEnsemble(
    x = x, y = y, model_id = "superlearner_xgboost",
    training_frame = tmp, seed = 1,
    base_models = base_models,
    metalearner_algorithm = "glm",
    metalearner_nfolds = nfolds,
    keep_levelone_frame = TRUE,
    metalearner_params = list(
      standardize = TRUE, keep_cross_validation_predictions = TRUE,
      balance_classes = TRUE, class_sampling_factors = samp_factors
    )
  )
  tmp_path <- h2o.saveModel(h2o.getModel("superlearner_xgboost"), path = exp_dir, force = TRUE)
  model_id <- "metalearner_glm_superlearner_xgboost"
  tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id))["penetrating"],
    col.names = paste0(model_id, "_", length(base_models), "_models")
  ))
  write.csv(res, file = paste0(exp_dir, "/cv_holdout_predictions.csv"), row.names = FALSE)
  cat("\n\n")
  h2o.removeAll()
}
3.4.2 Inner cross-validation
load(file = "./rdata/var_reduct_cpp_train_test_splits.RData")
for (i in 1:10) {
cat(paste0("Outer training set ", i, "\n"))
<- paste0("outer_", i)
prefix <- paste0("/Users/renee/Downloads/cell_penetration/", prefix)
exp_dir dir.create(exp_dir)
train_cpp_models(train_set = outer_splits[[i]][["train"]], exp_dir = exp_dir, prefix = prefix)
}
3.4.3 Training on whole data set
load(file = "./rdata/var_reduct_cpp_train_test_splits.RData")
<- "whole_data_set"
prefix <- paste0("/Users/renee/Downloads/cell_penetration/", prefix)
exp_dir dir.create(exp_dir)
train_cpp_models(train_set = cpp_data, exp_dir = exp_dir, prefix = prefix)
# Keep track of grid models
dl_grid <- list()
gbm_grid <- list()
xgboost_grid <- list()
dl_grid_params <- list()
gbm_grid_params <- list()
xgboost_grid_params <- list()
for (i in 1:11) {
  if (i == 11) {
    cat(paste0("Whole data set\n"))
    prefix <- "whole_data_set"
  } else {
    cat(paste0("Outer training set ", i, "\n"))
    prefix <- paste0("outer_", i)
  }
  dir <- paste0("/Users/renee/Downloads/cell_penetration/", prefix)
  files <- list.files(dir)
  # Deep learning
  dl <- files[str_detect(files, "DeepLearning_model")]
  for (m in dl) {
    model <- h2o.loadModel(paste0(dir, "/", m))
    hs <- sha1(paste(c(
      model@allparameters$epsilon,
      model@allparameters$hidden,
      model@allparameters$hidden_dropout_ratios,
      model@allparameters$input_dropout_ratio,
      model@allparameters$rho
    ), collapse = " "))
    if (hs %in% names(dl_grid)) {
      dl_grid[[hs]] <- c(dl_grid[[hs]], m)
    } else {
      dl_grid[[hs]] <- c(m)
      dl_grid_params <- list.append(
        dl_grid_params,
        c(
          "epsilon" = model@allparameters$epsilon,
          "hidden" = paste0("[", paste(model@allparameters$hidden, collapse = ","), "]"),
          "hidden_dropout_ratios" = paste0(
            "[",
            paste(model@allparameters$hidden_dropout_ratios,
              collapse = ","
            ),
            "]"
          ),
          "input_dropout_ratio" = model@allparameters$input_dropout_ratio,
          "rho" = model@allparameters$rho
        )
      )
    }
  }
  h2o.removeAll()
  # GBM
  gbm <- files[str_detect(files, "GBM_model")]
  for (m in gbm) {
    model <- h2o.loadModel(paste0(dir, "/", m))
    hs <- sha1(paste(c(
      model@allparameters$col_sample_rate,
      model@allparameters$col_sample_rate_per_tree,
      model@allparameters$max_depth,
      model@allparameters$min_rows,
      model@allparameters$min_split_improvement,
      model@allparameters$sample_rate
    ), collapse = " "))
    if (hs %in% names(gbm_grid)) {
      gbm_grid[[hs]] <- c(gbm_grid[[hs]], m)
    } else {
      gbm_grid[[hs]] <- c(m)
      gbm_grid_params <- list.append(
        gbm_grid_params,
        c(
          "col_sample_rate" = model@allparameters$col_sample_rate,
          "col_sample_rate_per_tree" = model@allparameters$col_sample_rate_per_tree,
          "max_depth" = model@allparameters$max_depth,
          "min_rows" = model@allparameters$min_rows,
          "min_split_improvement" = model@allparameters$min_split_improvement,
          "sample_rate" = model@allparameters$sample_rate
        )
      )
    }
  }
  h2o.removeAll()
  # XGBoost
  xgboost <- files[str_detect(files, "XGBoost_model")]
  for (m in xgboost) {
    model <- h2o.loadModel(paste0(dir, "/", m))
    hs <- sha1(paste(c(
      model@allparameters$booster,
      model@allparameters$col_sample_rate,
      model@allparameters$col_sample_rate_per_tree,
      model@allparameters$max_depth,
      model@allparameters$min_rows,
      model@allparameters$reg_alpha,
      model@allparameters$reg_lambda,
      model@allparameters$sample_rate
    ), collapse = " "))
    if (hs %in% names(xgboost_grid)) {
      xgboost_grid[[hs]] <- c(xgboost_grid[[hs]], m)
    } else {
      xgboost_grid[[hs]] <- c(m)
      xgboost_grid_params <- list.append(
        xgboost_grid_params,
        c(
          "booster" = model@allparameters$booster,
          "col_sample_rate" = model@allparameters$col_sample_rate,
          "col_sample_rate_per_tree" = model@allparameters$col_sample_rate_per_tree,
          "max_depth" = model@allparameters$max_depth,
          "min_rows" = model@allparameters$min_rows,
          "reg_alpha" = model@allparameters$reg_alpha,
          "reg_lambda" = model@allparameters$reg_lambda,
          "sample_rate" = model@allparameters$sample_rate
        )
      )
    }
  }
  h2o.removeAll()
}
dl_grid_params <- as.data.frame(t(data.frame(dl_grid_params)))
rownames(dl_grid_params) <- paste("Neural network grid model", 1:nrow(dl_grid_params))
gbm_grid_params <- as.data.frame(t(data.frame(gbm_grid_params)))
rownames(gbm_grid_params) <- paste("GBM grid model", 1:nrow(gbm_grid_params))
xgboost_grid_params <- as.data.frame(t(data.frame(xgboost_grid_params)))
rownames(xgboost_grid_params) <- paste("XGBoost grid model", 1:nrow(xgboost_grid_params))
write.csv(dl_grid_params, "./other_data/cell_penetration/neural_network_grid_params.csv")
write.csv(gbm_grid_params, "./other_data/cell_penetration/gbm_grid_params.csv")
write.csv(xgboost_grid_params, "./other_data/cell_penetration/xgboost_grid_params.csv")
save(dl_grid, gbm_grid, xgboost_grid, file = "./rdata/model_training_grid_models_cpp.RData")
3.4.4 Model evaluation
# Compute per-fold performance metrics (log loss, MCC, F1, balanced accuracy,
# EF, BEDROC) for a model family from the cross-validation holdout predictions.
# Note: the second argument was misspelled "fold_asign" in the original source;
# the body always uses fold_assign, so the signature is corrected here.
model_evaluation <- function(holdout_pred, fold_assign, grid_name, grid_meta = NULL) {
  res_ <- list()
  model_num <- c()
  if (startsWith(grid_name, "Super learner")) {
    if (grid_name == "Super learner all models") {
      m <- colnames(holdout_pred)[startsWith(colnames(holdout_pred), "metalearner_glm_superlearner_iter_0")]
    } else if (grid_name == "Super learner final") {
      sl_models <- colnames(holdout_pred)[startsWith(colnames(holdout_pred), "metalearner_glm_superlearner_iter")]
      m <- sl_models[length(sl_models)]
    } else if (grid_name == "Super learner neural network") {
      m <- colnames(holdout_pred)[startsWith(colnames(holdout_pred), "metalearner_glm_superlearner_deeplearning")]
    } else if (grid_name == "Super learner GBM") {
      m <- colnames(holdout_pred)[startsWith(colnames(holdout_pred), "metalearner_glm_superlearner_gbm")]
    } else if (grid_name == "Super learner XGBoost") {
      m <- colnames(holdout_pred)[startsWith(colnames(holdout_pred), "metalearner_glm_superlearner_xgboost")]
    }
    logloss <- c()
    MCC <- c()
    F1 <- c()
    acc <- c()
    EF <- c()
    BEDROC <- c()
    threshold <- 0.5
    for (k in 1:10) {
      y_true <- sapply(holdout_pred[fold_assign == k, "category"], function(x) if (x == "penetrating") 1 else 0)
      y_pred <- holdout_pred[fold_assign == k, m]
      y_pred_cls <- sapply(y_pred, function(x) if (x >= threshold) 1 else 0)
      logloss <- c(logloss, LogLoss(y_pred = y_pred, y_true = y_true))
      MCC <- c(MCC, mcc(preds = y_pred_cls, actuals = y_true))
      F1 <- c(F1, F1_Score(y_pred = y_pred_cls, y_true = y_true))
      acc <- c(acc, balanced_accuracy(y_pred = y_pred_cls, y_true = y_true))
      EF <- c(EF, enrichment_factor(x = y_pred, y = y_true, top = 0.05))
      BEDROC <- c(BEDROC, bedroc(x = y_pred, y = y_true, alpha = 20))
    }
    res_ <- list.append(res_, c(
      mean(logloss, na.rm = TRUE), sem(logloss),
      mean(MCC, na.rm = TRUE), sem(MCC),
      mean(F1, na.rm = TRUE), sem(F1),
      mean(acc, na.rm = TRUE), sem(acc),
      mean(EF, na.rm = TRUE), sem(EF),
      mean(BEDROC, na.rm = TRUE), sem(BEDROC),
      logloss, MCC, F1, acc, EF, BEDROC
    ))
  } else {
    for (j in 1:length(grid_meta)) {
      g <- grid_meta[[j]]
      m <- intersect(colnames(holdout_pred), g)
      if (length(m) == 1) {
        model_num <- c(model_num, j)
        logloss <- c()
        MCC <- c()
        F1 <- c()
        acc <- c()
        EF <- c()
        BEDROC <- c()
        threshold <- 0.5
        for (k in 1:10) {
          y_true <- sapply(holdout_pred[fold_assign == k, "category"], function(x) if (x == "penetrating") 1 else 0)
          y_pred <- holdout_pred[fold_assign == k, m]
          y_pred_cls <- sapply(y_pred, function(x) if (x >= threshold) 1 else 0)
          logloss <- c(logloss, LogLoss(y_pred = y_pred, y_true = y_true))
          MCC <- c(MCC, mcc(preds = y_pred_cls, actuals = y_true))
          F1 <- c(F1, F1_Score(y_pred = y_pred_cls, y_true = y_true))
          acc <- c(acc, balanced_accuracy(y_pred = y_pred_cls, y_true = y_true))
          EF <- c(EF, enrichment_factor(x = y_pred, y = y_true, top = 0.05))
          BEDROC <- c(BEDROC, bedroc(x = y_pred, y = y_true, alpha = 20))
        }
        res_ <- list.append(res_, c(
          mean(logloss, na.rm = TRUE), sem(logloss),
          mean(MCC, na.rm = TRUE), sem(MCC),
          mean(F1, na.rm = TRUE), sem(F1),
          mean(acc, na.rm = TRUE), sem(acc),
          mean(EF, na.rm = TRUE), sem(EF),
          mean(BEDROC, na.rm = TRUE), sem(BEDROC),
          logloss, MCC, F1, acc, EF, BEDROC
        ))
      }
    }
  }
  res <- as.data.frame(t(data.frame(res_)))
  colnames(res) <- c(
    "Log loss mean", "Log loss s.e.m.", "MCC mean", "MCC s.e.m.", "F_1 mean", "F_1 s.e.m.",
    "Balanced accuracy mean", "Balanced accuracy s.e.m.", "EF mean", "EF s.e.m.", "BEDROC mean", "BEDROC s.e.m.",
    paste("Log loss CV", 1:10), paste("MCC CV", 1:10), paste("F_1 CV", 1:10),
    paste("Balanced accuracy CV", 1:10), paste("EF CV", 1:10), paste("BEDROC CV", 1:10)
  )
  if (nrow(res) == 1) {
    rownames(res) <- grid_name
  } else {
    rownames(res) <- paste(grid_name, model_num)
  }
  return(res)
}
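The metric block inside model_evaluation can be exercised in isolation. The sketch below applies the same six metrics to a single hypothetical fold of random scores and 0/1 labels, using the identical 0.5 threshold to derive class labels; all values shown are synthetic:
set.seed(1)
y_true <- rbinom(100, 1, 0.3)           # hypothetical 0/1 labels
y_pred <- runif(100)                    # hypothetical predicted probabilities
y_pred_cls <- as.integer(y_pred >= 0.5) # same 0.5 threshold as above
LogLoss(y_pred = y_pred, y_true = y_true)
mcc(preds = y_pred_cls, actuals = y_true)
F1_Score(y_pred = y_pred_cls, y_true = y_true)
balanced_accuracy(y_pred = y_pred_cls, y_true = y_true)
enrichment_factor(x = y_pred, y = y_true, top = 0.05) # top 5 % of ranked scores
bedroc(x = y_pred, y = y_true, alpha = 20)            # early-recognition weight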
3.4.5 Inner loop model selection
load(file = "./rdata/model_training_grid_models_cpp.RData")
for (i in 1:10) {
<- read.csv(paste0("./other_data/cell_penetration/outer_", i, "/cv_holdout_predictions.csv"))
holdout_pred <- rep(1:10, ceiling(nrow(holdout_pred) / 10))[1:nrow(holdout_pred)]
fold_assign <- list()
data_ # Deep learning grid models
<- list.append(data_, model_evaluation(holdout_pred, fold_assign,
data_ grid_name = "Neural network grid model",
grid_meta = dl_grid
))# GBM grid models
<- list.append(data_, model_evaluation(holdout_pred, fold_assign,
data_ grid_name = "GBM grid model",
grid_meta = gbm_grid
))# GBM default models
<- list.append(data_, model_evaluation(holdout_pred, fold_assign,
data_ grid_name = "GBM model",
grid_meta = as.list(paste0("GBM_", 1:5))
))# XGBoost grid models
<- list.append(data_, model_evaluation(holdout_pred, fold_assign,
data_ grid_name = "XGBoost grid model",
grid_meta = xgboost_grid
))# XGBoost default models
<- list.append(data_, model_evaluation(holdout_pred, fold_assign,
data_ grid_name = "XGBoost model",
grid_meta = as.list(paste0("XGBoost_", 1:3))
))# DRF
<- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "DRF model", grid_meta = list("DRF")))
data_ # XRT
<- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "XRT model", grid_meta = list("XRT")))
data_ # Super learner all
<- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner all models"))
data_ # Super learner final
<- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner final"))
data_ # Super learner deep learning
<- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner neural network"))
data_ # Super learner GBM
<- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner GBM"))
data_ # Super learner XGBoost
<- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner XGBoost"))
data_
<- do.call(rbind, data_)
data write.csv(data, paste0("./other_data/cell_penetration/outer_", i, "/cv_res.csv"))
}
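The fold_assign vector reconstructs the fold membership of each holdout row: the base models were trained with fold_assignment = "Modulo", and since the holdout predictions are written in training-frame row order, 1-based row i belongs to fold ((i - 1) mod 10) + 1, which the rep() construction reproduces. A quick check with a hypothetical row count:
# h2o's "Modulo" assignment puts 1-based row i into fold ((i - 1) %% 10) + 1;
# rep(1:10, ...) truncated to n rows yields the same mapping.
n <- 23 # hypothetical row count
fold_assign <- rep(1:10, ceiling(n / 10))[1:n]
all(fold_assign == ((seq_len(n) - 1) %% 10) + 1) # TRUE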
3.4.6 Final evaluation
load(file = "./rdata/var_reduct_cpp_train_test_splits.RData")
load(file = "./rdata/model_training_grid_models_cpp.RData")
<- "/Users/renee/Downloads/cell_penetration"
dir <- c(
selected_models "GBM grid model 30", "Neural network grid model 98", "Neural network grid model 72",
"GBM grid model 1", "GBM grid model 45", "GBM grid model 82", "GBM grid model 72",
"GBM grid model 15", "GBM grid model 74", "GBM grid model 24"
)<- c()
logloss <- c()
MCC <- c()
F1 <- c()
acc <- c()
EF <- c()
BEDROC <- 0.5
threshold for (i in 1:10) {
<- read.csv(paste0("./other_data/cell_penetration/outer_", i, "/cv_holdout_predictions.csv"))
holdout_pred if (startsWith(selected_models[i], "Neural network grid model")) {
<- as.integer(str_extract(selected_models[i], "[0-9]+"))
grid_num <- intersect(colnames(holdout_pred), dl_grid[[grid_num]])
m <- h2o.loadModel(paste0(dir, "/outer_", i, "/", m))
model else if (startsWith(selected_models[i], "GBM grid model")) {
} <- as.integer(str_extract(selected_models[i], "[0-9]+"))
grid_num <- intersect(colnames(holdout_pred), gbm_grid[[grid_num]])
m <- h2o.loadModel(paste0(dir, "/outer_", i, "/", m))
model else if (startsWith(selected_models[i], "GBM model")) {
} <- as.integer(str_extract(selected_models[i], "[0-9]+"))
grid_num <- h2o.loadModel(paste0(dir, "/outer_", i, "/GBM_", grid_num))
model else if (startsWith(selected_models[i], "XGBoost grid model")) {
} <- as.integer(str_extract(selected_models[i], "[0-9]+"))
grid_num <- intersect(colnames(holdout_pred), xgboost_grid[[grid_num]])
m <- h2o.loadModel(paste0(dir, "/outer_", i, "/", m))
model else if (startsWith(selected_models[i], "XGBoost model")) {
} <- as.integer(str_extract(selected_models[i], "[0-9]+"))
grid_num <- h2o.loadModel(paste0(dir, "/outer_", i, "/XGBoost_", grid_num))
model else if (selected_models[i] == "Super learner all models") {
} <- h2o.loadModel(paste0(dir, "/outer_", i, "/superlearner_iter_0"))
model else if (selected_models[i] == "Super learner final") {
} <- list.files(paste0(dir, "/outer_", i))
files <- files[startsWith(files, "superlearner_iter")]
files <- h2o.loadModel(paste0(dir, "/outer_", i, "/", files[length(files)]))
model else if (selected_models[i] == "Super learner neural network") {
} <- h2o.loadModel(paste0(dir, "/outer_", i, "/superlearner_deeplearning"))
model else if (selected_models[i] == "Super learner GBM") {
} <- h2o.loadModel(paste0(dir, "/outer_", i, "/superlearner_gbm"))
model else if (selected_models[i] == "Super learner XGBoost") {
} <- h2o.loadModel(paste0(dir, "/outer_", i, "/superlearner_xgboost"))
model else {
} <- h2o.loadModel(paste0(dir, "/outer_", i, "/", unlist(strsplit(selected_models[i], " "))[1]))
model
}<- as.h2o(subset(outer_splits[[i]][["test"]], select = -category))
tmp <- sapply(outer_splits[[i]][["test"]]$category, function(x) if (x == "penetrating") 1 else 0)
y_true <- as.data.frame(as.data.frame(h2o.predict(model, tmp)))[, "penetrating"]
y_pred <- sapply(y_pred, function(x) if (x >= threshold) 1 else 0)
y_pred_cls <- c(logloss, LogLoss(y_pred = y_pred, y_true = y_true))
logloss <- c(MCC, mcc(preds = y_pred_cls, actuals = y_true))
MCC <- c(F1, F1_Score(y_pred = y_pred_cls, y_true = y_true))
F1 <- c(acc, balanced_accuracy(y_pred = y_pred_cls, y_true = y_true))
acc <- c(EF, enrichment_factor(x = y_pred, y = y_true, top = 0.05))
EF <- c(BEDROC, bedroc(x = y_pred, y = y_true, alpha = 20))
BEDROC
}save(logloss, MCC, F1, acc, EF, BEDROC, file = "./rdata/final_evaluation_cpp.RData")
load(file = "./rdata/final_evaluation_cpp.RData")
data.frame(
Metric = c("Log loss", "MCC", "F<sub>1</sub>", "Balanced accuracy", "EF", "BEDROC"),
`Mean ± s.e.m.` = c(
paste0(round(mean(logloss), 3), " ± ", round(sem(logloss), 3)),
paste0(round(mean(MCC), 3), " ± ", round(sem(MCC), 3)),
paste0(round(mean(F1), 3), " ± ", round(sem(F1), 3)),
paste0(round(mean(acc), 3), " ± ", round(sem(acc), 3)),
paste0(round(mean(EF), 3), " ± ", round(sem(EF), 3)),
paste0(round(mean(BEDROC), 3), " ± ", round(sem(BEDROC), 3))
),check.names = FALSE
%>%
) datatable(escape = FALSE, rownames = FALSE)
3.4.7 Final model selection
load(file = "./rdata/model_training_grid_models_cpp.RData")
holdout_pred <- read.csv("./other_data/cell_penetration/whole_data_set/cv_holdout_predictions.csv")
fold_assign <- rep(1:10, ceiling(nrow(holdout_pred) / 10))[1:nrow(holdout_pred)]
data_ <- list()
# Deep learning grid models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
  grid_name = "Neural network grid model",
  grid_meta = dl_grid
))
# GBM grid models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
  grid_name = "GBM grid model",
  grid_meta = gbm_grid
))
# GBM default models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
  grid_name = "GBM model",
  grid_meta = as.list(paste0("GBM_", 1:5))
))
# XGBoost grid models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
  grid_name = "XGBoost grid model",
  grid_meta = xgboost_grid
))
# XGBoost default models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
  grid_name = "XGBoost model",
  grid_meta = as.list(paste0("XGBoost_", 1:3))
))
# DRF
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "DRF model", grid_meta = list("DRF")))
# XRT
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "XRT model", grid_meta = list("XRT")))
# Super learner all
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner all models"))
# Super learner final
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner final"))
# Super learner deep learning
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner neural network"))
# Super learner GBM
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner GBM"))
# Super learner XGBoost
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner XGBoost"))

data <- do.call(rbind, data_)
write.csv(data, "./other_data/cell_penetration/whole_data_set/cv_res.csv")
data <- read.csv("./other_data/cell_penetration/whole_data_set/cv_res.csv", row.names = 1, check.names = FALSE)
statistical_testing(data,
  metric_vec = c("Log loss", "MCC", "F_1", "Balanced accuracy"),
  decreasing_vec = c(FALSE, TRUE, TRUE, TRUE)
)
# statistical_testing(data, metric_vec=c('Log loss', 'MCC', 'F_1', 'Balanced accuracy'),
#                     decreasing_vec=c(FALSE, TRUE, TRUE, TRUE),
#                     output_path='./other_data/cell_penetration/whole_data_set/cv_res_statistical_testing.csv')
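statistical_testing (defined in Section 3.2) ranks the models by each metric's CV mean and compares the top-ranked model's ten CV values against every other model with a Wilcoxon rank-sum test, adjusting the resulting p values by the Benjamini-Hochberg procedure. A toy version of that comparison on hypothetical CV vectors:
set.seed(1)
# Hypothetical 10-fold CV values for the top-ranked model and two challengers
best <- c(0.91, 0.93, 0.90, 0.92, 0.94, 0.91, 0.95, 0.92, 0.93, 0.90)
challengers <- list(m2 = best - runif(10, 0, 0.05), m3 = best - runif(10, 0, 0.15))
pval <- sapply(challengers, function(x) wilcox.test(best, x, exact = FALSE)$p.value)
p.adjust(pval, method = "BH") # BH-adjusted p values, as in statistical_testing()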
3.4.8 Final reduced super learner
h2o.init(nthreads = -1)
model_path <- "/Users/renee/Downloads/cell_penetration/whole_data_set/superlearner_iter_1"
model <- h2o.loadModel(model_path)
load(file = "./rdata/model_training_grid_models_cpp.RData")
data <- read.csv("./other_data/cell_penetration/whole_data_set/cv_res.csv", row.names = 1, check.names = FALSE)
df <- as.data.frame(model@model$metalearner_model@model$coefficients_table)[c("names", "standardized_coefficients")][-1, ]

for (i in 1:nrow(df)) {
  name <- df$names[i]
  if (startsWith(name, "DeepLearning_model")) {
    for (j in 1:length(dl_grid)) {
      if (name %in% dl_grid[[j]]) break
    }
    df$names[i] <- paste("Neural network grid model", j)
  } else if (startsWith(name, "GBM_model")) {
    for (j in 1:length(gbm_grid)) {
      if (name %in% gbm_grid[[j]]) break
    }
    df$names[i] <- paste("GBM grid model", j)
  } else if (startsWith(name, "XGBoost_model")) {
    for (j in 1:length(xgboost_grid)) {
      if (name %in% xgboost_grid[[j]]) break
    }
    df$names[i] <- paste("XGBoost grid model", j)
  } else if (startsWith(name, "DRF")) {
    df$names[i] <- "DRF model"
  } else if (startsWith(name, "XRT")) {
    df$names[i] <- "XRT model"
  } else {
    df$names[i] <- paste(strsplit(name, "_")[[1]], collapse = " model ")
  }
}
rownames(df) <- df$names
df <- merge(df, data["Balanced accuracy mean"], by = "row.names")[, -1]
df <- df[df$standardized_coefficients > 0, ]
p2 <- ggplot(df, aes(x = reorder(names, standardized_coefficients), y = standardized_coefficients, fill = `Balanced accuracy mean`)) +
  geom_col(color = "black", size = 0.2) +
  scale_fill_gradient(low = "white", high = "red", n.breaks = 4, limits = c(0.85, 1.00)) +
  geom_text(aes(label = sprintf("%.2f", `Balanced accuracy mean`)), nudge_y = -0.0025, size = 3, color = "white") +
  geom_text(aes(label = sprintf("%.3f", standardized_coefficients)), nudge_y = 0.0035, size = 3) +
  coord_flip() +
  scale_y_continuous(limits = c(0, max(df$standardized_coefficients) + 0.006), expand = c(0.01, 0)) +
  theme_bw() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.background = element_blank(),
    panel.border = element_blank(),
    axis.line = element_line(color = "black"),
    legend.text = element_text(colour = "black", size = 10),
    plot.title = element_text(hjust = 0.5, size = 18),
    plot.margin = ggplot2::margin(10, 10, 10, 0, "pt"),
    axis.title.x = element_text(colour = "black", size = 18),
    axis.title.y = element_text(colour = "black", size = 18),
    axis.text.x = element_text(colour = "black", size = 12),
    axis.text.y = element_text(colour = "black", size = 12),
    legend.title = element_text(size = 14),
    legend.position = c(0.85, 0.5)
  ) +
  guides(fill = guide_colorbar(title = "Balanced accuracy")) +
  xlab("") +
  ylab("Coefficient") +
  ggtitle("Cell-penetration")
3.5 Toxicity (classification)
3.5.1 Model training
train_tx_models <- function(train_set, exp_dir, prefix, nfolds = 10, grid_seed = 1) {
  tmp <- as.h2o(train_set, destination_frame = prefix)
  tmp["category"] <- as.factor(tmp["category"])
  y <- "category"
  x <- setdiff(names(tmp), y)
  res <- as.data.frame(tmp$category)
  samp_factors <- as.vector(mean(table(train_set$category)) / table(train_set$category))
  # -------------------
  # base model training
  # -------------------
  cat("Deep learning grid 1\n")
  deeplearning_1 <- h2o.grid(
    algorithm = "deeplearning", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, hyper_params = deeplearning_params_1,
    stopping_rounds = 3, balance_classes = TRUE, class_sampling_factors = samp_factors,
    max_after_balance_size = 0.5,
    search_criteria = list(
      strategy = "RandomDiscrete",
      max_models = 100, seed = grid_seed
    ),
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo", parallelism = 0
  )
  for (model_id in deeplearning_1@model_ids) {
    tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  }
  cat("GBM grid\n")
  gbm <- h2o.grid(
    algorithm = "gbm", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, hyper_params = gbm_params, stopping_rounds = 3,
    balance_classes = TRUE, class_sampling_factors = samp_factors, max_after_balance_size = 0.5,
    search_criteria = list(strategy = "RandomDiscrete", max_models = 100, seed = grid_seed),
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo", parallelism = 0
  )
  for (model_id in gbm@model_ids) {
    tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  }
  cat("GBM 5 default models\n")
  gbm_1 <- h2o.gbm(
    model_id = "GBM_1", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
    balance_classes = TRUE, class_sampling_factors = samp_factors, max_after_balance_size = 0.5,
    col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8, learn_rate = 0.1,
    max_depth = 6, min_rows = 1, min_split_improvement = 1e-5, ntrees = 10000, sample_rate = 0.8,
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  gbm_2 <- h2o.gbm(
    model_id = "GBM_2", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
    balance_classes = TRUE, class_sampling_factors = samp_factors, max_after_balance_size = 0.5,
    col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8, learn_rate = 0.1,
    max_depth = 7, min_rows = 10, min_split_improvement = 1e-5, ntrees = 10000, sample_rate = 0.8,
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  gbm_3 <- h2o.gbm(
    model_id = "GBM_3", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
    balance_classes = TRUE, class_sampling_factors = samp_factors, max_after_balance_size = 0.5,
    col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8, learn_rate = 0.1,
    max_depth = 8, min_rows = 10, min_split_improvement = 1e-5, ntrees = 10000, sample_rate = 0.8,
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  gbm_4 <- h2o.gbm(
    model_id = "GBM_4", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
    balance_classes = TRUE, class_sampling_factors = samp_factors, max_after_balance_size = 0.5,
    col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8, learn_rate = 0.1,
    max_depth = 10, min_rows = 10, min_split_improvement = 1e-5, ntrees = 10000, sample_rate = 0.8,
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  gbm_5 <- h2o.gbm(
    model_id = "GBM_5", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
    balance_classes = TRUE, class_sampling_factors = samp_factors, max_after_balance_size = 0.5,
    col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8, learn_rate = 0.1,
    max_depth = 15, min_rows = 100, min_split_improvement = 1e-5, ntrees = 10000, sample_rate = 0.8,
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  for (model_id in paste0("GBM_", 1:5)) {
    tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  }
  cat("XGBoost grid\n")
  xgboost <- h2o.grid(
    algorithm = "xgboost", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, hyper_params = xgboost_params, stopping_rounds = 3,
    search_criteria = list(strategy = "RandomDiscrete", max_models = 100, seed = grid_seed),
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo", parallelism = 0
  )
  for (model_id in xgboost@model_ids) {
    tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  }
  cat("XGBoost 3 default models\n")
  xgboost_1 <- h2o.xgboost(
    model_id = "XGBoost_1", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
    booster = "gbtree", col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8,
    max_depth = 10, min_rows = 5, ntrees = 10000, reg_alpha = 0, reg_lambda = 1,
    sample_rate = 0.6, keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  xgboost_2 <- h2o.xgboost(
    model_id = "XGBoost_2", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
    booster = "gbtree", col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8,
    max_depth = 20, min_rows = 10, ntrees = 10000, reg_alpha = 0, reg_lambda = 1,
    sample_rate = 0.6, keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  xgboost_3 <- h2o.xgboost(
    model_id = "XGBoost_3", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, stopping_rounds = 3, score_tree_interval = 5,
    booster = "gbtree", col_sample_rate = 0.8, col_sample_rate_per_tree = 0.8,
    max_depth = 5, min_rows = 3, ntrees = 10000, reg_alpha = 0, reg_lambda = 1,
    sample_rate = 0.8, keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  for (model_id in paste0("XGBoost_", 1:3)) {
    tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  }
  cat("GLM\n")
  glm <- h2o.glm(
    model_id = "GLM", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, alpha = c(0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
    balance_classes = TRUE, class_sampling_factors = samp_factors, max_after_balance_size = 0.5,
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  tmp_path <- h2o.saveModel(h2o.getModel("GLM"), path = exp_dir, force = TRUE)
  cat("DRF\n")
  drf <- h2o.randomForest(
    model_id = "DRF", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, ntrees = 10000,
    score_tree_interval = 5, stopping_rounds = 3,
    balance_classes = TRUE, class_sampling_factors = samp_factors, max_after_balance_size = 0.5,
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  tmp_path <- h2o.saveModel(h2o.getModel("DRF"), path = exp_dir, force = TRUE)
  cat("XRT\n")
  xrt <- h2o.randomForest(
    model_id = "XRT", x = x, y = y, training_frame = tmp, seed = 1, nfolds = nfolds,
    keep_cross_validation_predictions = TRUE, ntrees = 10000, histogram_type = "Random",
    score_tree_interval = 5, stopping_rounds = 3,
    balance_classes = TRUE, class_sampling_factors = samp_factors, max_after_balance_size = 0.5,
    keep_cross_validation_models = FALSE, fold_assignment = "Modulo"
  )
  tmp_path <- h2o.saveModel(h2o.getModel("XRT"), path = exp_dir, force = TRUE)
  # -----------------------
  # get holdout predictions
  # -----------------------
  base_models <- as.list(c(
    unlist(deeplearning_1@model_ids),
    unlist(gbm@model_ids), paste0("GBM_", 1:5),
    unlist(xgboost@model_ids), paste0("XGBoost_", 1:3),
    "GLM",
    "DRF",
    "XRT"
  ))
  for (model_id in base_models) {
    res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id))[3],
      col.names = model_id
    ))
  }
  # ----------------------
  # super learner training
  # ----------------------
  sl_iter <- 0
  cat(paste0("Super learner iteration ", sl_iter, " (", length(base_models), " models)\n"))
  sl <- h2o.stackedEnsemble(
    x = x, y = y, model_id = paste0("superlearner_iter_", sl_iter),
    training_frame = tmp, seed = 1,
    base_models = base_models,
    metalearner_algorithm = "glm",
    metalearner_nfolds = nfolds,
    keep_levelone_frame = TRUE,
    metalearner_params = list(
      standardize = TRUE, keep_cross_validation_predictions = TRUE,
      balance_classes = TRUE, class_sampling_factors = samp_factors,
      max_after_balance_size = 0.5
    )
  )
  tmp_path <- h2o.saveModel(h2o.getModel(paste0("superlearner_iter_", sl_iter)), path = exp_dir, force = TRUE)
  model_id <- paste0("metalearner_glm_superlearner_iter_", sl_iter)
  tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id))[3],
    col.names = paste0(model_id, "_", length(base_models), "_models")
  ))
  # ----------------------------------
  # super learner base model reduction
  # ----------------------------------
  while (TRUE) {
    meta <- h2o.getModel(paste0("metalearner_glm_superlearner_iter_", sl_iter, "_cv_1"))
    names <- meta@model$coefficients_table[, "names"]
    coeffs <- (meta@model$coefficients_table[, "standardized_coefficients"] > 0)
    for (j in 2:nfolds) {
      meta <- h2o.getModel(paste0("metalearner_glm_superlearner_iter_", sl_iter, "_cv_", j))
      names <- meta@model$coefficients_table[, "names"]
      coeffs <- coeffs + (meta@model$coefficients_table[, "standardized_coefficients"] > 0)
    }
    base_models_ <- as.list(names[coeffs >= ceiling(nfolds / 2) & names != "Intercept"])
    if (length(base_models_) == 0) {
      cat("No base models passing the threshold\n\n")
      break
    }
    if (sum(base_models %in% base_models_) == length(base_models)) {
      cat("No further reduction of base models\n\n")
      break
    }
    sl_iter <- sl_iter + 1
    base_models <- base_models_
    cat(paste0("Super learner iteration ", sl_iter, " (", length(base_models), " models)\n"))
    sl <- h2o.stackedEnsemble(
      x = x, y = y, model_id = paste0("superlearner_iter_", sl_iter),
      training_frame = tmp, seed = 1,
      base_models = base_models,
      metalearner_algorithm = "glm",
      metalearner_nfolds = nfolds,
      keep_levelone_frame = TRUE,
      metalearner_params = list(
        standardize = TRUE, keep_cross_validation_predictions = TRUE,
        balance_classes = TRUE, class_sampling_factors = samp_factors,
        max_after_balance_size = 0.5
      )
    )
    tmp_path <- h2o.saveModel(h2o.getModel(paste0("superlearner_iter_", sl_iter)), path = exp_dir, force = TRUE)
    model_id <- paste0("metalearner_glm_superlearner_iter_", sl_iter)
    tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
    res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id))[3],
      col.names = paste0(model_id, "_", length(base_models), "_models")
    ))
  }
  # -----------------------------------------
  # super learner for homogeneous base models
  # -----------------------------------------
  # DeepLearning
  base_models <- deeplearning_1@model_ids
  cat(paste0("Super learner deep learning (", length(base_models), " models)\n"))
  sl <- h2o.stackedEnsemble(
    x = x, y = y, model_id = "superlearner_deeplearning",
    training_frame = tmp, seed = 1,
    base_models = base_models,
    metalearner_algorithm = "glm",
    metalearner_nfolds = nfolds,
    keep_levelone_frame = TRUE,
    metalearner_params = list(
      standardize = TRUE, keep_cross_validation_predictions = TRUE,
      balance_classes = TRUE, class_sampling_factors = samp_factors,
      max_after_balance_size = 0.5
    )
  )
  tmp_path <- h2o.saveModel(h2o.getModel("superlearner_deeplearning"), path = exp_dir, force = TRUE)
  model_id <- "metalearner_glm_superlearner_deeplearning"
  tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id))[3],
    col.names = paste0(model_id, "_", length(base_models), "_models")
  ))
  # GBM
  base_models <- as.list(c(unlist(gbm@model_ids), paste0("GBM_", 1:5)))
  cat(paste0("Super learner GBM (", length(base_models), " models)\n"))
  sl <- h2o.stackedEnsemble(
    x = x, y = y, model_id = "superlearner_gbm",
    training_frame = tmp, seed = 1,
    base_models = base_models,
    metalearner_algorithm = "glm",
    metalearner_nfolds = nfolds,
    keep_levelone_frame = TRUE,
    metalearner_params = list(
      standardize = TRUE, keep_cross_validation_predictions = TRUE,
      balance_classes = TRUE, class_sampling_factors = samp_factors,
      max_after_balance_size = 0.5
    )
  )
  tmp_path <- h2o.saveModel(h2o.getModel("superlearner_gbm"), path = exp_dir, force = TRUE)
  model_id <- "metalearner_glm_superlearner_gbm"
  tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id))[3],
    col.names = paste0(model_id, "_", length(base_models), "_models")
  ))
  # XGBoost
  base_models <- as.list(c(unlist(xgboost@model_ids), paste0("XGBoost_", 1:3)))
  cat(paste0("Super learner XGBoost (", length(base_models), " models)\n"))
  sl <- h2o.stackedEnsemble(
    x = x, y = y, model_id = "superlearner_xgboost",
    training_frame = tmp, seed = 1,
    base_models = base_models,
    metalearner_algorithm = "glm",
    metalearner_nfolds = nfolds,
    keep_levelone_frame = TRUE,
    metalearner_params = list(
      standardize = TRUE, keep_cross_validation_predictions = TRUE,
      balance_classes = TRUE, class_sampling_factors = samp_factors,
      max_after_balance_size = 0.5
    )
  )
  tmp_path <- h2o.saveModel(h2o.getModel("superlearner_xgboost"), path = exp_dir, force = TRUE)
  model_id <- "metalearner_glm_superlearner_xgboost"
  tmp_path <- h2o.saveModel(h2o.getModel(model_id), path = exp_dir, force = TRUE)
  res <- cbind(res, as.data.frame(h2o.getFrame(paste0("cv_holdout_prediction_", model_id))[3],
    col.names = paste0(model_id, "_", length(base_models), "_models")
  ))
  write.csv(res, file = paste0(exp_dir, "/cv_holdout_predictions.csv"), row.names = FALSE)
  cat("\n\n")
  h2o.removeAll()
}
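The base-model reduction loop above implements a majority vote across the CV metalearners: a base model survives an iteration only if its standardized coefficient is positive in at least ceiling(nfolds / 2) of the nfolds metalearner fits, and the super learner is refit on the survivors until the set stabilizes or empties. A self-contained sketch of the voting rule on a hypothetical coefficient matrix:
# Rows = CV metalearners, columns = base models (all coefficients hypothetical)
set.seed(1)
nfolds <- 10
coef_mat <- matrix(rnorm(nfolds * 4, mean = 0.02, sd = 0.05),
  nrow = nfolds, dimnames = list(NULL, paste0("model_", 1:4))
)
votes <- colSums(coef_mat > 0) # folds voting "keep" per model
keep <- names(votes)[votes >= ceiling(nfolds / 2)]
keep # models retained for the next super learner iteration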
3.5.2 Inner cross-validation
load(file = "./rdata/var_reduct_tx_train_test_splits.RData")
for (i in 1:10) {
cat(paste0("Outer training set ", i, "\n"))
<- paste0("outer_", i)
prefix <- paste0("/Users/renee/Downloads/toxicity/", prefix)
exp_dir dir.create(exp_dir)
train_tx_models(train_set = outer_splits[[i]][["train"]], exp_dir = exp_dir, prefix = prefix)
}
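Each call to train_tx_models balances the toxicity classes during training via class_sampling_factors, computed as the mean class size divided by each class's size, so minority classes are upsampled and majority classes downsampled. A toy illustration with hypothetical class counts:
# Hypothetical imbalanced counts: the factor per class is mean(class sizes)
# divided by that class's size, so the minority class gets the larger factor.
category <- factor(rep(c("non-toxic", "toxic"), times = c(80, 20)))
samp_factors <- as.vector(mean(table(category)) / table(category))
samp_factors # 0.625 and 2.5: both classes are resampled toward 50 rows each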
3.5.3 Training on whole data set
load(file = "./rdata/var_reduct_tx_train_test_splits.RData")
prefix <- "whole_data_set"
exp_dir <- paste0("/Users/renee/Downloads/toxicity/", prefix)
dir.create(exp_dir)
train_tx_models(train_set = tx_data, exp_dir = exp_dir, prefix = prefix)
# Keep track of grid models
dl_grid <- list()
gbm_grid <- list()
xgboost_grid <- list()
dl_grid_params <- list()
gbm_grid_params <- list()
xgboost_grid_params <- list()
for (i in 1:11) {
  if (i == 11) {
    cat(paste0("Whole data set\n"))
    prefix <- "whole_data_set"
  } else {
    cat(paste0("Outer training set ", i, "\n"))
    prefix <- paste0("outer_", i)
  }
  dir <- paste0("/Users/renee/Downloads/toxicity/", prefix)
  files <- list.files(dir)
  # Deep learning
  dl <- files[str_detect(files, "DeepLearning_model")]
  for (m in dl) {
    model <- h2o.loadModel(paste0(dir, "/", m))
    hs <- sha1(paste(c(
      model@allparameters$epsilon,
      model@allparameters$hidden,
      model@allparameters$hidden_dropout_ratios,
      model@allparameters$input_dropout_ratio,
      model@allparameters$rho
    ), collapse = " "))
    if (hs %in% names(dl_grid)) {
      dl_grid[[hs]] <- c(dl_grid[[hs]], m)
    } else {
      dl_grid[[hs]] <- c(m)
      dl_grid_params <- list.append(
        dl_grid_params,
        c(
          "epsilon" = model@allparameters$epsilon,
          "hidden" = paste0("[", paste(model@allparameters$hidden, collapse = ","), "]"),
          "hidden_dropout_ratios" = paste0(
            "[",
            paste(model@allparameters$hidden_dropout_ratios, collapse = ","),
            "]"
          ),
          "input_dropout_ratio" = model@allparameters$input_dropout_ratio,
          "rho" = model@allparameters$rho
        )
      )
    }
  }
  h2o.removeAll()
  # GBM
  gbm <- files[str_detect(files, "GBM_model")]
  for (m in gbm) {
    model <- h2o.loadModel(paste0(dir, "/", m))
    hs <- sha1(paste(c(
      model@allparameters$col_sample_rate,
      model@allparameters$col_sample_rate_per_tree,
      model@allparameters$max_depth,
      model@allparameters$min_rows,
      model@allparameters$min_split_improvement,
      model@allparameters$sample_rate
    ), collapse = " "))
    if (hs %in% names(gbm_grid)) {
      gbm_grid[[hs]] <- c(gbm_grid[[hs]], m)
    } else {
      gbm_grid[[hs]] <- c(m)
      gbm_grid_params <- list.append(
        gbm_grid_params,
        c(
          "col_sample_rate" = model@allparameters$col_sample_rate,
          "col_sample_rate_per_tree" = model@allparameters$col_sample_rate_per_tree,
          "max_depth" = model@allparameters$max_depth,
          "min_rows" = model@allparameters$min_rows,
          "min_split_improvement" = model@allparameters$min_split_improvement,
          "sample_rate" = model@allparameters$sample_rate
        )
      )
    }
  }
  h2o.removeAll()
  # XGBoost
  xgboost <- files[str_detect(files, "XGBoost_model")]
  for (m in xgboost) {
    model <- h2o.loadModel(paste0(dir, "/", m))
    hs <- sha1(paste(c(
      model@allparameters$booster,
      model@allparameters$col_sample_rate,
      model@allparameters$col_sample_rate_per_tree,
      model@allparameters$max_depth,
      model@allparameters$min_rows,
      model@allparameters$reg_alpha,
      model@allparameters$reg_lambda,
      model@allparameters$sample_rate
    ), collapse = " "))
    if (hs %in% names(xgboost_grid)) {
      xgboost_grid[[hs]] <- c(xgboost_grid[[hs]], m)
    } else {
      xgboost_grid[[hs]] <- c(m)
      xgboost_grid_params <- list.append(
        xgboost_grid_params,
        c(
          "booster" = model@allparameters$booster,
          "col_sample_rate" = model@allparameters$col_sample_rate,
          "col_sample_rate_per_tree" = model@allparameters$col_sample_rate_per_tree,
          "max_depth" = model@allparameters$max_depth,
          "min_rows" = model@allparameters$min_rows,
          "reg_alpha" = model@allparameters$reg_alpha,
          "reg_lambda" = model@allparameters$reg_lambda,
          "sample_rate" = model@allparameters$sample_rate
        )
      )
    }
  }
  h2o.removeAll()
}

dl_grid_params <- as.data.frame(t(data.frame(dl_grid_params)))
rownames(dl_grid_params) <- paste("Neural network grid model", 1:nrow(dl_grid_params))
gbm_grid_params <- as.data.frame(t(data.frame(gbm_grid_params)))
rownames(gbm_grid_params) <- paste("GBM grid model", 1:nrow(gbm_grid_params))
xgboost_grid_params <- as.data.frame(t(data.frame(xgboost_grid_params)))
rownames(xgboost_grid_params) <- paste("XGBoost grid model", 1:nrow(xgboost_grid_params))
write.csv(dl_grid_params, "./other_data/toxicity/neural_network_grid_params.csv")
write.csv(gbm_grid_params, "./other_data/toxicity/gbm_grid_params.csv")
write.csv(xgboost_grid_params, "./other_data/toxicity/xgboost_grid_params.csv")
save(dl_grid, gbm_grid, xgboost_grid, file = "./rdata/model_training_grid_models_tx.RData")
3.5.4 Model evaluation
# Same evaluation as for cell penetration (Section 3.4.4), except that the
# category column is used directly as the 0/1 label vector, so no label
# mapping is needed. The misspelled "fold_asign" argument is corrected here.
model_evaluation <- function(holdout_pred, fold_assign, grid_name, grid_meta = NULL) {
  res_ <- list()
  model_num <- c()
  if (startsWith(grid_name, "Super learner")) {
    if (grid_name == "Super learner all models") {
      m <- colnames(holdout_pred)[startsWith(colnames(holdout_pred), "metalearner_glm_superlearner_iter_0")]
    } else if (grid_name == "Super learner final") {
      sl_models <- colnames(holdout_pred)[startsWith(colnames(holdout_pred), "metalearner_glm_superlearner_iter")]
      m <- sl_models[length(sl_models)]
    } else if (grid_name == "Super learner neural network") {
      m <- colnames(holdout_pred)[startsWith(colnames(holdout_pred), "metalearner_glm_superlearner_deeplearning")]
    } else if (grid_name == "Super learner GBM") {
      m <- colnames(holdout_pred)[startsWith(colnames(holdout_pred), "metalearner_glm_superlearner_gbm")]
    } else if (grid_name == "Super learner XGBoost") {
      m <- colnames(holdout_pred)[startsWith(colnames(holdout_pred), "metalearner_glm_superlearner_xgboost")]
    }
    logloss <- c()
    MCC <- c()
    F1 <- c()
    acc <- c()
    EF <- c()
    BEDROC <- c()
    threshold <- 0.5
    for (k in 1:10) {
      y_true <- holdout_pred[fold_assign == k, "category"]
      y_pred <- holdout_pred[fold_assign == k, m]
      y_pred_cls <- sapply(y_pred, function(x) if (x >= threshold) 1 else 0)
      logloss <- c(logloss, LogLoss(y_pred = y_pred, y_true = y_true))
      MCC <- c(MCC, mcc(preds = y_pred_cls, actuals = y_true))
      F1 <- c(F1, F1_Score(y_pred = y_pred_cls, y_true = y_true))
      acc <- c(acc, balanced_accuracy(y_pred = y_pred_cls, y_true = y_true))
      EF <- c(EF, enrichment_factor(x = y_pred, y = y_true, top = 0.05))
      BEDROC <- c(BEDROC, bedroc(x = y_pred, y = y_true, alpha = 20))
    }
    res_ <- list.append(res_, c(
      mean(logloss, na.rm = TRUE), sem(logloss),
      mean(MCC, na.rm = TRUE), sem(MCC),
      mean(F1, na.rm = TRUE), sem(F1),
      mean(acc, na.rm = TRUE), sem(acc),
      mean(EF, na.rm = TRUE), sem(EF),
      mean(BEDROC, na.rm = TRUE), sem(BEDROC),
      logloss, MCC, F1, acc, EF, BEDROC
    ))
  } else {
    for (j in 1:length(grid_meta)) {
      g <- grid_meta[[j]]
      m <- intersect(colnames(holdout_pred), g)
      if (length(m) == 1) {
        model_num <- c(model_num, j)
        logloss <- c()
        MCC <- c()
        F1 <- c()
        acc <- c()
        EF <- c()
        BEDROC <- c()
        threshold <- 0.5
        for (k in 1:10) {
          y_true <- holdout_pred[fold_assign == k, "category"]
          y_pred <- holdout_pred[fold_assign == k, m]
          y_pred_cls <- sapply(y_pred, function(x) if (x >= threshold) 1 else 0)
          logloss <- c(logloss, LogLoss(y_pred = y_pred, y_true = y_true))
          MCC <- c(MCC, mcc(preds = y_pred_cls, actuals = y_true))
          F1 <- c(F1, F1_Score(y_pred = y_pred_cls, y_true = y_true))
          acc <- c(acc, balanced_accuracy(y_pred = y_pred_cls, y_true = y_true))
          EF <- c(EF, enrichment_factor(x = y_pred, y = y_true, top = 0.05))
          BEDROC <- c(BEDROC, bedroc(x = y_pred, y = y_true, alpha = 20))
        }
        res_ <- list.append(res_, c(
          mean(logloss, na.rm = TRUE), sem(logloss),
          mean(MCC, na.rm = TRUE), sem(MCC),
          mean(F1, na.rm = TRUE), sem(F1),
          mean(acc, na.rm = TRUE), sem(acc),
          mean(EF, na.rm = TRUE), sem(EF),
          mean(BEDROC, na.rm = TRUE), sem(BEDROC),
          logloss, MCC, F1, acc, EF, BEDROC
        ))
      }
    }
  }
  res <- as.data.frame(t(data.frame(res_)))
  colnames(res) <- c(
    "Log loss mean", "Log loss s.e.m.", "MCC mean", "MCC s.e.m.", "F_1 mean", "F_1 s.e.m.",
    "Balanced accuracy mean", "Balanced accuracy s.e.m.", "EF mean", "EF s.e.m.", "BEDROC mean", "BEDROC s.e.m.",
    paste("Log loss CV", 1:10), paste("MCC CV", 1:10), paste("F_1 CV", 1:10),
    paste("Balanced accuracy CV", 1:10), paste("EF CV", 1:10), paste("BEDROC CV", 1:10)
  )
  if (nrow(res) == 1) {
    rownames(res) <- grid_name
  } else {
    rownames(res) <- paste(grid_name, model_num)
  }
  return(res)
}
3.5.5 Inner loop model selection
load(file = "./rdata/model_training_grid_models_tx.RData")
for (i in 1:10) {
  holdout_pred <- read.csv(paste0("./other_data/toxicity/outer_", i, "/cv_holdout_predictions.csv"))
  fold_assign <- rep(1:10, ceiling(nrow(holdout_pred) / 10))[1:nrow(holdout_pred)]
  data_ <- list()
  # Deep learning grid models
  data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
    grid_name = "Neural network grid model",
    grid_meta = dl_grid
  ))
  # GBM grid models
  data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
    grid_name = "GBM grid model",
    grid_meta = gbm_grid
  ))
  # GBM default models
  data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
    grid_name = "GBM model",
    grid_meta = as.list(paste0("GBM_", 1:5))
  ))
  # XGBoost grid models
  data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
    grid_name = "XGBoost grid model",
    grid_meta = xgboost_grid
  ))
  # XGBoost default models
  data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
    grid_name = "XGBoost model",
    grid_meta = as.list(paste0("XGBoost_", 1:3))
  ))
  # GLM
  data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "GLM model", grid_meta = list("GLM")))
  # DRF
  data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "DRF model", grid_meta = list("DRF")))
  # XRT
  data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "XRT model", grid_meta = list("XRT")))
  # Super learner all
  data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner all models"))
  # Super learner final
  data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner final"))
  # Super learner deep learning
  data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner neural network"))
  # Super learner GBM
  data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner GBM"))
  # Super learner XGBoost
  data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner XGBoost"))

  data <- do.call(rbind, data_)
  write.csv(data, paste0("./other_data/toxicity/outer_", i, "/cv_res.csv"))
}
3.5.6 Final evaluation
load(file = "./rdata/var_reduct_tx_train_test_splits.RData")
load(file = "./rdata/model_training_grid_models_tx.RData")
dir <- "/Users/renee/Downloads/toxicity"
selected_models <- c(
  "Super learner XGBoost", "Super learner final", "Super learner final", "Super learner final",
  "Super learner final", "Super learner final", "Super learner final", "Super learner final",
  "Super learner final", "Super learner final"
)
logloss <- c()
MCC <- c()
F1 <- c()
acc <- c()
EF <- c()
BEDROC <- c()
threshold <- 0.5
for (i in 1:10) {
  holdout_pred <- read.csv(paste0("./other_data/toxicity/outer_", i, "/cv_holdout_predictions.csv"))
  if (startsWith(selected_models[i], "Neural network grid model")) {
    grid_num <- as.integer(str_extract(selected_models[i], "[0-9]+"))
    m <- intersect(colnames(holdout_pred), dl_grid[[grid_num]])
    model <- h2o.loadModel(paste0(dir, "/outer_", i, "/", m))
  } else if (startsWith(selected_models[i], "GBM grid model")) {
    grid_num <- as.integer(str_extract(selected_models[i], "[0-9]+"))
    m <- intersect(colnames(holdout_pred), gbm_grid[[grid_num]])
    model <- h2o.loadModel(paste0(dir, "/outer_", i, "/", m))
  } else if (startsWith(selected_models[i], "GBM model")) {
    grid_num <- as.integer(str_extract(selected_models[i], "[0-9]+"))
    model <- h2o.loadModel(paste0(dir, "/outer_", i, "/GBM_", grid_num))
  } else if (startsWith(selected_models[i], "XGBoost grid model")) {
    grid_num <- as.integer(str_extract(selected_models[i], "[0-9]+"))
    m <- intersect(colnames(holdout_pred), xgboost_grid[[grid_num]])
    model <- h2o.loadModel(paste0(dir, "/outer_", i, "/", m))
  } else if (startsWith(selected_models[i], "XGBoost model")) {
    grid_num <- as.integer(str_extract(selected_models[i], "[0-9]+"))
    model <- h2o.loadModel(paste0(dir, "/outer_", i, "/XGBoost_", grid_num))
  } else if (selected_models[i] == "Super learner all models") {
    model <- h2o.loadModel(paste0(dir, "/outer_", i, "/superlearner_iter_0"))
  } else if (selected_models[i] == "Super learner final") {
    files <- list.files(paste0(dir, "/outer_", i))
    files <- files[startsWith(files, "superlearner_iter")]
    model <- h2o.loadModel(paste0(dir, "/outer_", i, "/", files[length(files)]))
  } else if (selected_models[i] == "Super learner neural network") {
    model <- h2o.loadModel(paste0(dir, "/outer_", i, "/superlearner_deeplearning"))
  } else if (selected_models[i] == "Super learner GBM") {
    model <- h2o.loadModel(paste0(dir, "/outer_", i, "/superlearner_gbm"))
  } else if (selected_models[i] == "Super learner XGBoost") {
    model <- h2o.loadModel(paste0(dir, "/outer_", i, "/superlearner_xgboost"))
  } else {
    model <- h2o.loadModel(paste0(dir, "/outer_", i, "/", unlist(strsplit(selected_models[i], " "))[1]))
  }
  tmp <- as.h2o(subset(outer_splits[[i]][["test"]], select = -category))
  y_true <- as.numeric(outer_splits[[i]][["test"]]$category) - 1
  y_pred <- as.data.frame(h2o.predict(model, tmp))[, 3]
  y_pred_cls <- sapply(y_pred, function(x) if (x >= threshold) 1 else 0)
  logloss <- c(logloss, LogLoss(y_pred = y_pred, y_true = y_true))
  MCC <- c(MCC, mcc(preds = y_pred_cls, actuals = y_true))
  F1 <- c(F1, F1_Score(y_pred = y_pred_cls, y_true = y_true))
  acc <- c(acc, balanced_accuracy(y_pred = y_pred_cls, y_true = y_true))
  EF <- c(EF, enrichment_factor(x = y_pred, y = y_true, top = 0.05))
  BEDROC <- c(BEDROC, bedroc(x = y_pred, y = y_true, alpha = 20))
}
save(logloss, MCC, F1, acc, EF, BEDROC, file = "./rdata/final_evaluation_tx.RData")
load(file = "./rdata/final_evaluation_tx.RData")
data.frame(
  Metric = c("Log loss", "MCC", "F<sub>1</sub>", "Balanced accuracy", "EF", "BEDROC"),
  `Mean ± s.e.m.` = c(
    paste0(round(mean(logloss), 3), " ± ", round(sem(logloss), 3)),
    paste0(round(mean(MCC), 3), " ± ", round(sem(MCC), 3)),
    paste0(round(mean(F1), 3), " ± ", round(sem(F1), 3)),
    paste0(round(mean(acc), 3), " ± ", round(sem(acc), 3)),
    paste0(round(mean(EF), 3), " ± ", round(sem(EF), 3)),
    paste0(round(mean(BEDROC), 3), " ± ", round(sem(BEDROC), 3))
  ),
  check.names = FALSE
) %>%
  datatable(escape = FALSE, rownames = FALSE)
3.5.7 Final model selection
load(file = "./rdata/model_training_grid_models_tx.RData")
holdout_pred <- read.csv("./other_data/toxicity/whole_data_set/cv_holdout_predictions.csv")
fold_assign <- rep(1:10, ceiling(nrow(holdout_pred) / 10))[1:nrow(holdout_pred)]
data_ <- list()
# Deep learning grid models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
  grid_name = "Neural network grid model",
  grid_meta = dl_grid
))
# GBM grid models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
  grid_name = "GBM grid model",
  grid_meta = gbm_grid
))
# GBM default models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
  grid_name = "GBM model",
  grid_meta = as.list(paste0("GBM_", 1:5))
))
# XGBoost grid models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
  grid_name = "XGBoost grid model",
  grid_meta = xgboost_grid
))
# XGBoost default models
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign,
  grid_name = "XGBoost model",
  grid_meta = as.list(paste0("XGBoost_", 1:3))
))
# GLM
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "GLM model", grid_meta = list("GLM")))
# DRF
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "DRF model", grid_meta = list("DRF")))
# XRT
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "XRT model", grid_meta = list("XRT")))
# Super learner all
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner all models"))
# Super learner final
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner final"))
# Super learner deep learning
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner neural network"))
# Super learner GBM
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner GBM"))
# Super learner XGBoost
data_ <- list.append(data_, model_evaluation(holdout_pred, fold_assign, grid_name = "Super learner XGBoost"))

data <- do.call(rbind, data_)
write.csv(data, "./other_data/toxicity/whole_data_set/cv_res.csv")
data <- read.csv("./other_data/toxicity/whole_data_set/cv_res.csv", row.names = 1, check.names = FALSE)
statistical_testing(data,
  metric_vec = c("Log loss", "MCC", "F_1", "Balanced accuracy"),
  decreasing_vec = c(FALSE, TRUE, TRUE, TRUE)
)
# statistical_testing(data, metric_vec=c('Log loss', 'MCC', 'F_1', 'Balanced accuracy'),
#                     decreasing_vec=c(FALSE, TRUE, TRUE, TRUE),
#                     output_path='./other_data/toxicity/whole_data_set/cv_res_statistical_testing.csv')
3.5.8 Final reduced super learner
h2o.init(nthreads = -1)
model_path <- "/Users/renee/Downloads/toxicity/whole_data_set/superlearner_iter_3"
model <- h2o.loadModel(model_path)
load(file = "./rdata/model_training_grid_models_tx.RData")
data <- read.csv("./other_data/toxicity/whole_data_set/cv_res.csv", row.names = 1, check.names = FALSE)
df <- as.data.frame(model@model$metalearner_model@model$coefficients_table)[c("names", "standardized_coefficients")][-1, ]

for (i in 1:nrow(df)) {
  name <- df$names[i]
  if (startsWith(name, "DeepLearning_model")) {
    for (j in 1:length(dl_grid)) {
      if (name %in% dl_grid[[j]]) break
    }
    df$names[i] <- paste("Neural network grid model", j)
  } else if (startsWith(name, "GBM_model")) {
    for (j in 1:length(gbm_grid)) {
      if (name %in% gbm_grid[[j]]) break
    }
    df$names[i] <- paste("GBM grid model", j)
  } else if (startsWith(name, "XGBoost_model")) {
    for (j in 1:length(xgboost_grid)) {
      if (name %in% xgboost_grid[[j]]) break
    }
    df$names[i] <- paste("XGBoost grid model", j)
  } else {
    df$names[i] <- paste(strsplit(name, "_")[[1]], collapse = " model ")
  }
}
rownames(df) <- df$names
df <- merge(df, data["Balanced accuracy mean"], by = "row.names")[, -1]
df <- df[df$standardized_coefficients > 0, ]
p3 <- ggplot(df, aes(x = reorder(names, standardized_coefficients), y = standardized_coefficients, fill = `Balanced accuracy mean`)) +
  geom_col(color = "black", size = 0.2) +
  scale_fill_gradient(low = "white", high = "red", n.breaks = 4, limits = c(0.85, 1.00)) +
  geom_text(aes(label = sprintf("%.2f", `Balanced accuracy mean`)), nudge_y = -0.008, size = 3, color = "white") +
  geom_text(aes(label = sprintf("%.3f", standardized_coefficients)), nudge_y = 0.01, size = 3) +
  coord_flip() +
  scale_y_continuous(limits = c(0, max(df$standardized_coefficients) + 0.02), expand = c(0.01, 0)) +
  theme_bw() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.background = element_blank(),
    panel.border = element_blank(),
    axis.line = element_line(color = "black"),
    legend.text = element_text(colour = "black", size = 10),
    plot.title = element_text(hjust = 0.5, size = 18),
    plot.margin = ggplot2::margin(10, 10, 10, 0, "pt"),
    axis.title.x = element_text(colour = "black", size = 18),
    axis.title.y = element_text(colour = "black", size = 18),
    axis.text.x = element_text(colour = "black", size = 12),
    axis.text.y = element_text(colour = "black", size = 12),
    legend.title = element_text(size = 14),
    legend.position = c(0.85, 0.35)
  ) +
  guides(fill = guide_colorbar(title = "Balanced accuracy")) +
  xlab("") +
  ylab("Coefficient") +
  ggtitle("Non-toxic")
# Close h2o server
# h2o.shutdown()
sessionInfo()
## R version 4.2.2 (2022-10-31)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] reshape2_1.4.4 DT_0.24 digest_0.6.29 stringr_1.4.1
## [5] rlist_0.4.6.2 enrichvs_0.0.5 scales_1.2.1 mltools_0.3.5
## [9] MLmetrics_1.1.1 h2o_3.38.0.2
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.9 highr_0.9 plyr_1.8.7 bslib_0.4.0
## [5] compiler_4.2.2 jquerylib_0.1.4 R.methodsS3_1.8.2 R.utils_2.12.0
## [9] bitops_1.0-7 tools_4.2.2 lifecycle_1.0.1 jsonlite_1.8.0
## [13] evaluate_0.16 R.cache_0.16.0 lattice_0.20-45 rlang_1.0.4
## [17] Matrix_1.5-1 cli_3.3.0 rstudioapi_0.14 crosstalk_1.2.0
## [21] yaml_2.3.5 xfun_0.32 fastmap_1.1.0 styler_1.8.0
## [25] knitr_1.40 vctrs_0.4.1 htmlwidgets_1.5.4 sass_0.4.2
## [29] grid_4.2.2 data.table_1.14.2 R6_2.5.1 rmarkdown_2.16
## [33] bookdown_0.28 purrr_0.3.4 magrittr_2.0.3 codetools_0.2-18
## [37] htmltools_0.5.3 colorspace_2.0-3 stringi_1.7.8 munsell_0.5.0
## [41] RCurl_1.98-1.8 cachem_1.0.6 R.oo_1.25.0