Section 2 Variable reduction
library(Peptides)
library(protr)
library(rlist)
library(ranger)
library(ggplot2)
library(cowplot)
library(scales)
2.1 Methods
- Formula of AIC
\[\begin{equation} AIC = -2\ln(\hat{L}) + 2k, \tag{2.1} \end{equation}\]
\[\begin{equation} \begin{split} \mathrm{where}\:\hat{L}\:\mathrm{is\:the\:maximum\:likelihood\:value,}\\ \mathrm{and}\:k\:\mathrm{is\:the\:number\:of\:parameters.} \end{split} \end{equation}\]
2.1.1 Regression
- Likelihood of normal distribution
Assume the residuals are normally distributed; then \[\begin{equation} L(\theta) = \prod_{i = 1}^{N} \frac{1}{\sqrt{2\pi\sigma^2}}e^{-\frac{{(\hat{y}_{i} - y_i)}^2}{2\sigma^2}} \tag{2.2} \end{equation}\]
\[\begin{equation} \begin{split} \ln{L(\theta)} & = \sum_{i = 1}^{N} [\ln{(\frac{1}{\sqrt{2\pi\sigma^2}})} - {\frac{{(\hat{y}_{i} - y_i)}^2}{2\sigma^2}}] \\ & = -\frac{N}{2}\ln{(2\pi)} -\frac{N}{2}\ln{(\sigma^2)} - \frac{1}{2\sigma^2}\sum_{i = 1}^{N}{{(\hat{y}_{i} - y_i)}^2.} \\ \end{split} \tag{2.3} \end{equation}\] \(\mathrm{Since}\:{\hat{\sigma}^2} = \frac{\sum_{i = 1}^{N}{{(\hat{y}_{i} - y_i)}^2}}{N} = {MSE},\) \[\begin{equation} \ln{L(\theta)} = -\frac{N}{2}\ln{(MSE)} + C, \mathrm{where}\:C\:\mathrm{is\:a\:constant.} \tag{2.4} \end{equation}\]
- Derived regression AIC
\[\begin{equation} AIC_{reg} = N\ln{(MSE)} + 2k \tag{2.5} \end{equation}\]
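Equation (2.5) drops the additive constant, which is harmless for model comparison. As a sanity check (not part of the original analysis; the toy data below are simulated purely for illustration), \(AIC_{reg}\) can be compared with stats::AIC on nested linear models: the two differ by an offset that does not depend on \(k\), so they rank models identically.

set.seed(1)
N <- 100
d <- data.frame(y = rnorm(N), x1 = rnorm(N), x2 = rnorm(N), x3 = rnorm(N))
for (k in 1:3) {
  fit <- lm(reformulate(paste0("x", 1:k), response = "y"), data = d)
  aic_derived <- N * log(mean(residuals(fit)^2)) + 2 * k
  # stats::AIC keeps the dropped constant and also counts the intercept and
  # sigma, so the difference N * (log(2 * pi) + 1) + 4 is the same for every k
  cat(k, ":", AIC(fit) - aic_derived, "\n")
}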
2.1.2 Classification
- Likelihood of Bernoulli distribution
\[\begin{equation} L(\theta) = \prod_{i = 1}^{N} [q_{\theta}(y_i = 1)^{y_i}q_{\theta}(y_i = 0)^{1-y_i}] \tag{2.6} \end{equation}\]
\[\begin{equation} \begin{split} \ln{L(\theta)} = \sum_{i = 1}^{N} [y_i\ln{q_{\theta}(y_i = 1)} + (1 - y_i)\ln{q_{\theta}(y_i = 0)}], \end{split} \tag{2.7} \end{equation}\]
\[\begin{equation} \begin{split} \mathrm{where}\:q_{\theta}\:\mathrm{is\:the\:estimated\:probability\:with\:parameters\:\theta}\:\mathrm{and\:}\:y_i \in \{0, 1\} \end{split} \end{equation}\]
- Binary cross-entropy
\[\begin{equation} \begin{split} H_p(q_{\theta}) & = -\frac{1}{N}\sum_{i = 1}^{N} [y_i\log_2{q_{\theta}(y_i = 1)} + (1 - y_i)\log_2{q_{\theta}(y_i = 0)}] \\ & = -\frac{1}{N{\cdot}\ln{2}}\sum_{i = 1}^{N} [y_i\ln{q_{\theta}(y_i = 1)} + (1 - y_i)\ln{q_{\theta}(y_i = 0)}] \end{split} \tag{2.8} \end{equation}\]
- Derived classification AIC
\[\begin{equation} AIC_{clf} = 2{\cdot}\ln{2}{\cdot}N{\cdot}H_p(q_{\theta}) + 2k \tag{2.9} \end{equation}\]
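Equation (2.9) follows from the identity \(-2\ln{\hat{L}} = 2{\cdot}\ln{2}{\cdot}N{\cdot}H_p(q_{\theta})\), which is easy to verify numerically. A minimal sketch with simulated labels and probabilities (illustrative only, not from the original analysis):

set.seed(1)
N <- 10
y <- rbinom(N, size = 1, prob = 0.5) # true labels
p <- runif(N) # predicted probabilities q(y = 1)
logL <- sum(y * log(p) + (1 - y) * log(1 - p))
H <- -mean(y * log2(p) + (1 - y) * log2(1 - p))
all.equal(-2 * logL, 2 * log(2) * N * H) # TRUE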
2.1.2.1 Extension to multi-class classification
- Generalized likelihood estimation
Assuming the classes are one-hot encoded, we can generalize equation (2.9) to multi-class problems.
\[\begin{equation} L(\theta) = \prod_{i = 1}^{N}\prod_{j = 1}^{M}\hat{y}_{ij}^{y_{ij}} \tag{2.10} \end{equation}\]
\[\begin{equation} \begin{split} \ln{L(\theta)} = \sum_{i = 1}^{N}\sum_{j = 1}^{M}y_{ij}\ln{\hat{y}_{ij}}, \end{split} \tag{2.11} \end{equation}\]
\[\begin{equation} \begin{split} \mathrm{where}\:\hat{y}_{ij}\:\mathrm{is\:the\:estimated\:probability\:of\:class}\:j\:\mathrm{of\:sample\:}i\:\mathrm{and\:}\:y_{ij} \in \{0, 1\} \end{split} \end{equation}\]
- Categorical cross-entropy
\[\begin{equation} H_p(q_{\theta}) = -\frac{1}{N}\sum_{i = 1}^{N}\sum_{j = 1}^{M}y_{ij}\log_2{\hat{y}_{ij}} \tag{2.12} \end{equation}\]
- Derived generalized classification AIC
\[\begin{equation} AIC_{clf} = 2{\cdot}\ln{2}{\cdot}N{\cdot}H_p(q_{\theta}) + 2k \tag{2.13} \end{equation}\]
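The same identity holds in the multi-class case with one-hot encoded labels, as this short sketch on simulated data (illustrative only) confirms:

set.seed(1)
N <- 10
M <- 3
P <- matrix(runif(N * M), nrow = N, ncol = M)
P <- P / rowSums(P) # predicted probabilities; each row sums to 1
y <- sample(1:M, N, replace = TRUE) # true class indices
Y <- diag(M)[y, ] # one-hot encoding
logL <- sum(Y * log(P))
H <- -mean(rowSums(Y * log2(P)))
all.equal(-2 * logL, 2 * log(2) * N * H) # TRUE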
2.2 ML input generating function
generate_ml_input <- function(fasta_file, label_file, output_file) {
  # --------------
  # Peptides_2.4.4
  # --------------
  # read in fasta file (read.fasta() is provided by the seqinr package)
  peptides <- seqinr::read.fasta(fasta_file, seqtype = "AA", as.string = TRUE, set.attributes = FALSE)
  # Molecular weight
  weights <- as.matrix(sapply(peptides, function(x) mw(x, monoisotopic = FALSE)))
  colnames(weights) <- "weight"
  # Amino acid composition
  aa.comp <- sapply(peptides, function(x) aaComp(x))
  aa.comp.matrix <- t(sapply(aa.comp, function(x) x[, "Mole%"]))
  # Isoelectric point
  pI.values <- as.matrix(sapply(peptides, function(x) pI(x, pKscale = "EMBOSS")))
  colnames(pI.values) <- "pI.value"
  # Hydrophobicity
  hydrophobicity.values <- as.matrix(sapply(peptides, function(x) hydrophobicity(x, scale = "KyteDoolittle")))
  colnames(hydrophobicity.values) <- "hydrophobicity.value"
  # Net charge at pH 7
  net.charges <- as.matrix(sapply(peptides, function(x) charge(x, pH = 7, pKscale = "EMBOSS")))
  colnames(net.charges) <- "net.charge"
  # Boman index
  boman.indices <- as.matrix(sapply(peptides, function(x) boman(x)))
  colnames(boman.indices) <- "boman.index"
  # combine Peptides results
  peptides_res <- data.frame(cbind(weights, aa.comp.matrix, pI.values, hydrophobicity.values, net.charges, boman.indices))

  # -----------
  # protr_1.6-2
  # -----------
  length <- 7 # lag/lambda below is length - 1 = 6
  # read in FASTA file
  sequences <- readFASTA(fasta_file)
  sequences <- sequences[unlist(lapply(sequences, function(x) nchar(x) > 1))]
  # calculate amino acid composition descriptors (dim = 20)
  x1 <- t(sapply(sequences, extractAAC))
  colnames(x1)[colnames(x1) == "Y"] <- "Y_tyrosine"
  # calculate dipeptide composition descriptors (dim = 400)
  x2 <- t(sapply(sequences, extractDC))
  colnames(x2)[colnames(x2) == "NA"] <- "NA_dipeptide"
  # calculate Moreau-Broto autocorrelation descriptors (dim = 8 * (length - 1))
  x3 <- t(sapply(sequences, extractMoreauBroto, nlag = length - 1L))
  colnames(x3) <- paste("moreau_broto", colnames(x3), sep = "_")
  # calculate composition descriptors (dim = 21)
  x4 <- t(sapply(sequences, extractCTDC))
  # calculate transition descriptors (dim = 21)
  x5 <- t(sapply(sequences, extractCTDT))
  # calculate distribution descriptors (dim = 105)
  x6 <- t(sapply(sequences, extractCTDD))
  # calculate conjoint triad descriptors (dim = 343)
  x7 <- t(sapply(sequences, extractCTriad))
  # calculate sequence-order-coupling numbers (dim = 2 * (length - 1))
  x8 <- t(sapply(sequences, extractSOCN, nlag = length - 1L))
  # calculate quasi-sequence-order descriptors (dim = 40 + 2 * (length - 1))
  x9 <- t(sapply(sequences, extractQSO, nlag = length - 1L))
  # calculate pseudo-amino acid composition (dim = 20 + (length - 1))
  x10 <- t(sapply(sequences, extractPAAC, lambda = length - 1L))
  # calculate amphiphilic pseudo-amino acid composition (dim = 20 + 2 * (length - 1))
  x11 <- t(sapply(sequences, extractAPAAC, lambda = length - 1L))
  # combine all of the result datasets
  protr_res <- data.frame(cbind(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11))

  # merge the descriptor tables with the labels by row names
  labels <- read.csv(label_file, row.names = 1)
  merge.all <- function(x, ..., by = "row.names") { # https://stackoverflow.com/a/22618297
    L <- list(...)
    for (i in seq_along(L)) {
      x <- merge(x, L[[i]], by = by)
      rownames(x) <- x$Row.names
      x$Row.names <- NULL
    }
    return(x)
  }
  data <- merge.all(peptides_res, protr_res, labels)
  write.csv(data, file = output_file)
}
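A hypothetical smoke test of the function (the file names, peptide names, and label values below are made up for illustration; the label CSV only needs row names that match the FASTA headers, and the seqinr package must be installed for read.fasta()):

writeLines(c(">pep1", "ACDEFGHIK", ">pep2", "LMNPQRSTVWY"), "toy.fasta")
write.csv(
  data.frame(log_intensity = c(1.2, 0.3), row.names = c("pep1", "pep2")),
  "toy_labels.csv"
)
generate_ml_input("toy.fasta", "toy_labels.csv", "toy_ml_input.csv")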
2.3 AIC functions
#' Mean squared error function
#'
#' @param y_true a numeric vector of true values.
#' @param y_pred a numeric vector of predicted values.
#'
#' @return MSE
#'
mse <- function(y_true, y_pred) {
  return(mean((y_true - y_pred)^2))
}
#' Regression AIC
#'
#' @param y_true a numeric vector of true values.
#' @param y_pred a numeric vector of predicted values.
#' @param k number of parameters/variables.
#' @param eps a very small value to avoid the negative infinity generated by
#'   log(MSE) when the MSE is 0.
#'
#' @return AIC
#'
aic_reg <- function(y_true, y_pred, k, eps = 1e-15) {
  mserr <- mse(y_true, y_pred)
  if (mserr == 0) mserr <- eps
  AIC <- length(y_true) * log(mserr) + 2 * k
  return(AIC)
}
#' Cross-entropy function
#'
#' @param y_true a factor vector representing true labels.
#' @param y_pred a numeric vector (probabilities of the second class/factor level)
#'   or a matrix of predicted probabilities.
#' @param eps a very small value to avoid negative infinities generated by log(p = 0).
#'   If the function returns NaN, increasing eps may solve the issue. Alternatively,
#'   as NaN usually arises when p = 1 - eps evaluates to 1, so that log(1 - p) =
#'   log(0) in the binary cross-entropy, the user can pass the predictions as a
#'   matrix to calculate the categorical cross-entropy instead.
#'
#' @return H, cross-entropy (mean loss per sample)
#'
crossEntropy <- function(y_true, y_pred, eps = 1e-15) {
  stopifnot(is.factor(y_true))
  y_pred <- pmax(pmin(y_pred, 1 - eps), eps)
  n_levels <- nlevels(y_true)
  H <- NULL
  # Binary classification
  if (n_levels == 2 && is.vector(y_pred)) {
    y_true <- as.numeric(y_true == levels(y_true)[-1])
    H <- -mean(y_true * log2(y_pred) + (1 - y_true) * log2(1 - y_pred))
  }
  # Multi-class classification
  else if (n_levels == ncol(y_pred)) {
    y_true <- as.numeric(y_true)
    y_true_encoded <- t(sapply(y_true, function(x) {
      tmp <- rep(0, n_levels)
      tmp[x] <- 1
      return(tmp)
    }))
    H <- -mean(sapply(1:nrow(y_true_encoded), function(x) sum(y_true_encoded[x, ] * log2(y_pred[x, ]))))
  }
  return(H)
}
#' Classification AIC
#'
#' @param y_true a factor vector representing true labels.
#' @param y_pred a numeric vector or a matrix of predicted probabilities.
#' @param k number of parameters/variables.
#' @param ... other parameters to be passed to `crossEntropy()`
#'
#' @return AIC
#'
aic_clf <- function(y_true, y_pred, k, ...) {
  H <- crossEntropy(y_true, y_pred, ...)
  AIC <- 2 * log(2) * length(y_true) * H + 2 * k
  return(AIC)
}
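A quick illustration of both branches of crossEntropy() on toy labels (the values are made up): a probability vector is interpreted as \(q_{\theta}(y_i = 1)\) for the second factor level and takes the binary branch, while the same predictions passed as a two-column matrix take the categorical branch and yield the same entropy.

y <- factor(c("neg", "pos", "pos"), levels = c("neg", "pos"))
p <- c(0.2, 0.7, 0.9) # predicted probabilities of "pos", the second level
crossEntropy(y, p) # binary cross-entropy in bits
all.equal(crossEntropy(y, p), crossEntropy(y, cbind(1 - p, p))) # TRUE
aic_clf(y, p, k = 3) # classification AIC with k = 3 variables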
2.4 Melanin binding (regression)
2.4.1 Generating ML input
generate_ml_input(
fasta_file = "./other_data/mb_second_peptide_array.fasta",
label_file = "./other_data/mb_second_peptide_array_labels.csv",
output_file = "./data/mb_second_peptide_array_ml_input.csv"
)
2.4.2 Variable importance
set.seed(12)
mb_data <- read.csv("./data/mb_second_peptide_array_ml_input.csv", check.names = FALSE, row.names = 1)
# random shuffle
mb_data <- mb_data[sample(1:nrow(mb_data), replace = FALSE), ]

# train random forest using ranger
ntree <- 100000
predictor_variables <- subset(mb_data, select = -log_intensity)
response_variable <- mb_data$log_intensity
rf <- ranger(
  x = predictor_variables, y = response_variable, num.trees = ntree, importance = "permutation", verbose = TRUE,
  scale.permutation.importance = TRUE, seed = 3, keep.inbag = TRUE
)
save(rf, file = "./rdata/var_reduct_mb_rf.RData")
2.4.3 Variable reduction
load(file = "./rdata/var_reduct_mb_rf.RData")
var_imp <- ranger::importance(rf)
var_imp <- var_imp[order(-var_imp)]
var_imp <- var_imp[var_imp >= 0]

set.seed(12)
mb_data <- read.csv("./data/mb_second_peptide_array_ml_input.csv", check.names = FALSE, row.names = 1)
# random shuffle
mb_data <- mb_data[sample(1:nrow(mb_data), replace = FALSE), ]

ntree <- 1000
var_subset_res <- list()
for (i in 1:length(var_imp)) {
  cat(paste0(i, "\n"))
  predictor_variables <- mb_data[, colnames(mb_data) %in% names(var_imp)[1:i], drop = FALSE]
  response_variable <- mb_data$log_intensity
  rf <- ranger(x = predictor_variables, y = response_variable, num.trees = ntree, importance = "none", verbose = TRUE, seed = 3)
  # rf$r.squared and rf$predictions are based on out-of-bag samples,
  # so the AIC below uses the OOB error of each variable subset
  var_subset_res <- list.append(var_subset_res, list(
    rsq = rf$r.squared,
    aic = aic_reg(
      y_true = response_variable,
      y_pred = rf$predictions,
      k = i
    )
  ))
}
save(var_subset_res, file = "./rdata/var_reduct_mb_var_subset_res.RData")
load(file = "./rdata/var_reduct_mb_var_subset_res.RData")
rsq_vec <- unlist(lapply(var_subset_res, function(x) x$rsq))
aic_vec <- unlist(lapply(var_subset_res, function(x) x$aic))
rsq_cutoff <- which.max(rsq_vec)
aic_cutoff <- which.min(aic_vec)

p1_1 <- ggplot(data = data.frame(x = 1:length(rsq_vec), y = rsq_vec), aes(x = x, y = y)) +
  geom_point(colour = "black", alpha = 0.3, size = 0.5) +
  geom_vline(xintercept = rsq_cutoff, colour = "red", linetype = "dashed") +
  scale_x_continuous(breaks = c(1, rsq_cutoff, 200, 400, 600, 735)) +
  scale_y_continuous(
    breaks = c(0.40, 0.45, 0.50, 0.55),
    labels = c("0.40", "0.45", "0.50", "0.55")
  ) +
  theme_bw() +
  theme(
    plot.margin = unit(c(10, 10, -10, 10), "pt"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.background = element_blank(),
    panel.border = element_blank(),
    axis.line = element_line(color = "black"),
    plot.title = element_text(hjust = 0.5),
    axis.title = element_text(colour = "black"),
    axis.text = element_text(colour = "black")
  ) +
  xlab("") +
  ylab(expression(italic(R^2))) +
  ggtitle("Melanin binding")
p1_2 <- ggplot(data = data.frame(x = 1:length(aic_vec), y = aic_vec), aes(x = x, y = y)) +
  geom_point(colour = "black", alpha = 0.3, size = 0.5) +
  geom_vline(xintercept = aic_cutoff, colour = "red", linetype = "dashed") +
  scale_x_continuous(breaks = c(1, aic_cutoff, 200, 400, 600, 735)) +
  scale_y_continuous(
    breaks = c(-500, 0, 500, 1000),
    labels = c("-500", "0", "500", "1,000")
  ) +
  theme_bw() +
  theme(
    plot.margin = unit(c(-15, 10, 10, 10), "pt"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.background = element_blank(),
    panel.border = element_blank(),
    axis.line = element_line(color = "black"),
    plot.title = element_text(hjust = 0.5),
    axis.title = element_text(colour = "black"),
    axis.text = element_text(colour = "black")
  ) +
  xlab("Number of variables") +
  ylab("AIC") +
  ggtitle("")
cat(paste0("Maximum R-squared: ", formatC(rsq_vec[rsq_cutoff], format = "f", digits = 3), "\n"))
cat(paste0("R-squared at AIC cutoff: ", formatC(rsq_vec[aic_cutoff], format = "f", digits = 3), "\n"))
2.4.4 Generating train-test splits
set.seed(12)
mb_data <- read.csv("./data/mb_second_peptide_array_ml_input.csv", check.names = FALSE, row.names = 1)
# random shuffle
mb_data <- mb_data[sample(1:nrow(mb_data), replace = FALSE), ]

cutoff <- aic_cutoff
load(file = "./rdata/var_reduct_mb_rf.RData")
var_imp <- ranger::importance(rf)
var_imp <- var_imp[order(-var_imp)][1:cutoff]
mb_data <- mb_data[, colnames(mb_data) %in% c(names(var_imp), "log_intensity")]

outer_splits <- list()
n_train <- round(nrow(mb_data) * 0.9)
set.seed(22)
for (i in 1:10) {
  # random shuffle
  mb_data_ <- mb_data[sample(1:nrow(mb_data), replace = FALSE), ]
  outer_train <- mb_data_[1:n_train, ]
  outer_test <- mb_data_[(n_train + 1):nrow(mb_data_), ]
  outer_splits <- list.append(outer_splits, list(train = outer_train, test = outer_test))
}
save(mb_data, outer_splits, file = "./rdata/var_reduct_mb_train_test_splits.RData")
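An optional sanity check on the splits (a sketch, not part of the original workflow): each of the ten train/test pairs should be disjoint, with roughly 10% of the rows held out.

stopifnot(all(sapply(outer_splits, function(s) {
  length(intersect(rownames(s$train), rownames(s$test))) == 0 &&
    nrow(s$test) == nrow(mb_data) - n_train
})))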
2.5 Cell-penetration (classification)
2.5.1 Generating ML input
generate_ml_input(
fasta_file = "./other_data/CPP924.fasta",
label_file = "./other_data/cpp_labels.csv",
output_file = "./data/cpp_ml_input.csv"
)
2.5.2 Variable importance
set.seed(12)
cpp_data <- read.csv("./data/cpp_ml_input.csv", check.names = FALSE, row.names = 1)
# random shuffle
cpp_data <- cpp_data[sample(1:nrow(cpp_data), replace = FALSE), ]

# train random forest using ranger
ntree <- 100000
predictor_variables <- subset(cpp_data, select = -category)
response_variable <- factor(cpp_data$category, levels = c("non-penetrating", "penetrating"))
rf <- ranger(
  x = predictor_variables, y = response_variable, num.trees = ntree, sample.fraction = rep(0.5, 2),
  importance = "permutation", verbose = TRUE, scale.permutation.importance = TRUE, seed = 3, keep.inbag = TRUE
)
save(rf, file = "./rdata/var_reduct_cpp_rf.RData")
2.5.3 Variable reduction
load(file = "./rdata/var_reduct_cpp_rf.RData")
var_imp <- ranger::importance(rf)
var_imp <- var_imp[order(-var_imp)]
var_imp <- var_imp[var_imp >= 0]

set.seed(12)
cpp_data <- read.csv("./data/cpp_ml_input.csv", check.names = FALSE, row.names = 1)
# random shuffle
cpp_data <- cpp_data[sample(1:nrow(cpp_data), replace = FALSE), ]

ntree <- 1000
var_subset_res <- list()
for (i in 1:length(var_imp)) {
  cat(paste0(i, "\n"))
  predictor_variables <- cpp_data[, colnames(cpp_data) %in% names(var_imp)[1:i], drop = FALSE]
  response_variable <- factor(cpp_data$category, levels = c("non-penetrating", "penetrating"))
  rf <- ranger(
    x = predictor_variables, y = response_variable, num.trees = ntree, importance = "none", verbose = TRUE, seed = 3,
    probability = TRUE
  )
  var_subset_res <- list.append(var_subset_res, list(
    acc = 1 - rf$prediction.error,
    aic = aic_clf(
      y_true = response_variable,
      y_pred = rf$predictions,
      k = i
    )
  ))
}
save(var_subset_res, file = "./rdata/var_reduct_cpp_var_subset_res.RData")
load(file = "./rdata/var_reduct_cpp_var_subset_res.RData")
acc_vec <- unlist(lapply(var_subset_res, function(x) x$acc))
aic_vec <- unlist(lapply(var_subset_res, function(x) x$aic))
acc_cutoff <- which.max(acc_vec)
aic_cutoff <- which.min(aic_vec)

p2_1 <- ggplot(data = data.frame(x = 1:length(acc_vec), y = acc_vec), aes(x = x, y = y)) +
  geom_point(colour = "black", alpha = 0.3, size = 0.5) +
  geom_vline(xintercept = acc_cutoff, colour = "red", linetype = "dashed") +
  scale_x_continuous(breaks = c(1, acc_cutoff, 250, 500, 750, 955)) +
  scale_y_continuous(
    breaks = c(0.88, 0.90, 0.92),
    labels = c("0.88", "0.90", "0.92")
  ) +
  theme_bw() +
  theme(
    plot.margin = unit(c(10, 10, -10, 10), "pt"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.background = element_blank(),
    panel.border = element_blank(),
    axis.line = element_line(color = "black"),
    plot.title = element_text(hjust = 0.5),
    axis.title = element_text(colour = "black"),
    axis.text = element_text(colour = "black")
  ) +
  xlab("") +
  ylab("Accuracy") +
  ggtitle("Cell-penetration")
p2_2 <- ggplot(data = data.frame(x = 1:length(aic_vec), y = aic_vec), aes(x = x, y = y)) +
  geom_point(colour = "black", alpha = 0.3, size = 0.5) +
  geom_vline(xintercept = aic_cutoff, colour = "red", linetype = "dashed") +
  scale_x_continuous(breaks = c(aic_cutoff, 250, 500, 750, 955)) +
  scale_y_continuous(
    breaks = c(500, 1000, 1500, 2000, 2500),
    labels = c("500", "1,000", "1,500", "2,000", "2,500")
  ) +
  theme_bw() +
  theme(
    plot.margin = unit(c(-15, 10, 10, 10), "pt"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.background = element_blank(),
    panel.border = element_blank(),
    axis.line = element_line(color = "black"),
    plot.title = element_text(hjust = 0.5),
    axis.title = element_text(colour = "black"),
    axis.text = element_text(colour = "black")
  ) +
  xlab("Number of variables") +
  ylab("AIC") +
  ggtitle("")
cat(paste0("Maximum accuracy: ", formatC(acc_vec[acc_cutoff], format = "f", digits = 3), "\n"))
cat(paste0("Accuracy at AIC cutoff: ", formatC(acc_vec[aic_cutoff], format = "f", digits = 3), "\n"))
2.5.4 Generating train-test splits
set.seed(12)
cpp_data <- read.csv("./data/cpp_ml_input.csv", check.names = FALSE, row.names = 1)
# random shuffle
cpp_data <- cpp_data[sample(1:nrow(cpp_data), replace = FALSE), ]

cutoff <- aic_cutoff
load(file = "./rdata/var_reduct_cpp_rf.RData")
var_imp <- ranger::importance(rf)
var_imp <- var_imp[order(-var_imp)][1:cutoff]
cpp_data <- cpp_data[, colnames(cpp_data) %in% c(names(var_imp), "category")]

outer_splits <- list()
n_train <- round(nrow(cpp_data) * 0.9)
set.seed(22)
for (i in 1:10) {
  # random shuffle
  cpp_data_ <- cpp_data[sample(1:nrow(cpp_data), replace = FALSE), ]
  outer_train <- cpp_data_[1:n_train, ]
  outer_test <- cpp_data_[(n_train + 1):nrow(cpp_data_), ]
  outer_splits <- list.append(outer_splits, list(train = outer_train, test = outer_test))
}
save(cpp_data, outer_splits, file = "./rdata/var_reduct_cpp_train_test_splits.RData")
2.6 Toxicity (classification)
2.6.1 Generating ML input
generate_ml_input(
fasta_file = "./other_data/toxicity.fasta",
label_file = "./other_data/tx_labels.csv",
output_file = "./data/tx_ml_input.csv"
)
2.6.2 Variable importance
set.seed(12)
tx_data <- read.csv("./data/tx_ml_input.csv", check.names = FALSE, row.names = 1)
# random shuffle
tx_data <- tx_data[sample(1:nrow(tx_data), replace = FALSE), ]

# train random forest using ranger
ntree <- 100000
predictor_variables <- subset(tx_data, select = -category)
response_variable <- factor(tx_data$category, levels = c("non-toxic", "toxic"))
rf <- ranger(
  x = predictor_variables, y = response_variable, num.trees = ntree, sample.fraction = rep(0.5, 2),
  importance = "permutation", verbose = TRUE, scale.permutation.importance = TRUE, seed = 3, keep.inbag = TRUE
)
save(rf, file = "./rdata/var_reduct_tx_rf.RData")
2.6.3 Variable reduction
load(file = "./rdata/var_reduct_tx_rf.RData")
var_imp <- ranger::importance(rf)
var_imp <- var_imp[order(-var_imp)]
var_imp <- var_imp[var_imp >= 0]

set.seed(12)
tx_data <- read.csv("./data/tx_ml_input.csv", check.names = FALSE, row.names = 1)
# random shuffle
tx_data <- tx_data[sample(1:nrow(tx_data), replace = FALSE), ]

ntree <- 1000
var_subset_res <- list()
for (i in 1:length(var_imp)) {
  cat(paste0(i, "\n"))
  predictor_variables <- tx_data[, colnames(tx_data) %in% names(var_imp)[1:i], drop = FALSE]
  response_variable <- factor(tx_data$category, levels = c("non-toxic", "toxic"))
  rf <- ranger(
    x = predictor_variables, y = response_variable, num.trees = ntree, importance = "none", verbose = TRUE, seed = 3,
    probability = TRUE
  )
  var_subset_res <- list.append(var_subset_res, list(
    acc = 1 - rf$prediction.error,
    aic = aic_clf(
      y_true = response_variable,
      y_pred = rf$predictions,
      k = i
    )
  ))
}
save(var_subset_res, file = "./rdata/var_reduct_tx_var_subset_res.RData")
load(file = "./rdata/var_reduct_tx_var_subset_res.RData")
acc_vec <- unlist(lapply(var_subset_res, function(x) x$acc))
aic_vec <- unlist(lapply(var_subset_res, function(x) x$aic))
acc_cutoff <- which.max(acc_vec)
aic_cutoff <- which.min(aic_vec)

p3_1 <- ggplot(data = data.frame(x = 1:length(acc_vec), y = acc_vec), aes(x = x, y = y)) +
  geom_point(colour = "black", alpha = 0.3, size = 0.5) +
  geom_vline(xintercept = acc_cutoff, colour = "red", linetype = "dashed") +
  scale_x_continuous(
    breaks = c(1, acc_cutoff, 300, 600, 900, 1076),
    labels = c("1", "148", "300", "600", "900", "1,076")
  ) +
  scale_y_continuous(
    breaks = c(0.91, 0.92, 0.93, 0.94, 0.95, 0.96),
    labels = c("0.91", "0.92", "0.93", "0.94", "0.95", "0.96")
  ) +
  theme_bw() +
  theme(
    plot.margin = unit(c(10, 10, -10, 10), "pt"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.background = element_blank(),
    panel.border = element_blank(),
    axis.line = element_line(color = "black"),
    plot.title = element_text(hjust = 0.5),
    axis.title = element_text(colour = "black"),
    axis.text = element_text(colour = "black")
  ) +
  xlab("") +
  ylab("Accuracy") +
  ggtitle("Toxicity")
p3_2 <- ggplot(data = data.frame(x = 1:length(aic_vec), y = aic_vec), aes(x = x, y = y)) +
  geom_point(colour = "black", alpha = 0.3, size = 0.5) +
  geom_vline(xintercept = aic_cutoff, colour = "red", linetype = "dashed") +
  scale_x_continuous(
    breaks = c(aic_cutoff, 300, 600, 900, 1076),
    labels = c("56", "300", "600", "900", "1,076")
  ) +
  scale_y_continuous(
    breaks = c(2000, 3000, 4000, 5000, 6000, 7000),
    labels = c("2,000", "3,000", "4,000", "5,000", "6,000", "7,000")
  ) +
  theme_bw() +
  theme(
    plot.margin = unit(c(-15, 10, 10, 10), "pt"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.background = element_blank(),
    panel.border = element_blank(),
    axis.line = element_line(color = "black"),
    plot.title = element_text(hjust = 0.5),
    axis.title = element_text(colour = "black"),
    axis.text = element_text(colour = "black")
  ) +
  xlab("Number of variables") +
  ylab("AIC") +
  ggtitle("")
cat(paste0("Maximum accuracy: ", formatC(acc_vec[acc_cutoff], format = "f", digits = 3), "\n"))
cat(paste0("Accuracy at AIC cutoff: ", formatC(acc_vec[aic_cutoff], format = "f", digits = 3), "\n"))
2.6.4 Generating train-test splits
set.seed(12)
tx_data <- read.csv("./data/tx_ml_input.csv", check.names = FALSE, row.names = 1)
# random shuffle
tx_data <- tx_data[sample(1:nrow(tx_data), replace = FALSE), ]
# convert category so that the positive class represents non-toxic,
# for evaluation metrics that focus on true positives more than
# true negatives
tx_data$category <- as.factor(sapply(tx_data$category, function(x) if (x == "non-toxic") 1 else 0))

cutoff <- aic_cutoff
load(file = "./rdata/var_reduct_tx_rf.RData")
var_imp <- ranger::importance(rf)
var_imp <- var_imp[order(-var_imp)][1:cutoff]
tx_data <- tx_data[, colnames(tx_data) %in% c(names(var_imp), "category")]

outer_splits <- list()
n_train <- round(nrow(tx_data) * 0.9)
set.seed(22)
for (i in 1:10) {
  # random shuffle
  tx_data_ <- tx_data[sample(1:nrow(tx_data), replace = FALSE), ]
  outer_train <- tx_data_[1:n_train, ]
  outer_test <- tx_data_[(n_train + 1):nrow(tx_data_), ]
  outer_splits <- list.append(outer_splits, list(train = outer_train, test = outer_test))
}
save(tx_data, outer_splits, file = "./rdata/var_reduct_tx_train_test_splits.RData")
2.7 Plotting
p_combined <- plot_grid(p1_1, p2_1, p3_1,
  p1_2, p2_2, p3_2,
  nrow = 2,
  labels = c("a", "b", "c", "", "", ""),
  align = "v"
)
p_combined
sessionInfo()
## R version 4.2.2 (2022-10-31)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] scales_1.2.1 cowplot_1.1.1 ggplot2_3.3.6 ranger_0.14.1 rlist_0.4.6.2
## [6] protr_1.6-2 Peptides_2.4.4
##
## loaded via a namespace (and not attached):
## [1] styler_1.8.0 tidyselect_1.1.2 xfun_0.32 bslib_0.4.0
## [5] purrr_0.3.4 lattice_0.20-45 colorspace_2.0-3 vctrs_0.4.1
## [9] generics_0.1.3 htmltools_0.5.3 yaml_2.3.5 utf8_1.2.2
## [13] rlang_1.0.4 R.oo_1.25.0 jquerylib_0.1.4 pillar_1.8.1
## [17] withr_2.5.0 glue_1.6.2 DBI_1.1.3 R.utils_2.12.0
## [21] R.cache_0.16.0 lifecycle_1.0.1 stringr_1.4.1 munsell_0.5.0
## [25] gtable_0.3.0 R.methodsS3_1.8.2 codetools_0.2-18 evaluate_0.16
## [29] knitr_1.40 fastmap_1.1.0 fansi_1.0.3 highr_0.9
## [33] Rcpp_1.0.9 cachem_1.0.6 jsonlite_1.8.0 digest_0.6.29
## [37] stringi_1.7.8 bookdown_0.28 dplyr_1.0.9 grid_4.2.2
## [41] cli_3.3.0 tools_4.2.2 magrittr_2.0.3 sass_0.4.2
## [45] tibble_3.1.8 pkgconfig_2.0.3 Matrix_1.5-1 data.table_1.14.2
## [49] assertthat_0.2.1 rmarkdown_2.16 rstudioapi_0.14 R6_2.5.1
## [53] compiler_4.2.2