Section 2 Model training
2.1 Hyper-parameter tuning for PURF
2.1.1 Analysis
In Python:
library(reticulate)
use_condaenv("/Users/renee/Library/r-miniconda/envs/purf/bin/python")
import pandas as pd
from sklearn.utils import shuffle
import numpy as np
import pickle
from sklearn.impute import SimpleImputer
from purf.pu_ensemble import PURandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from scipy.spatial import distance
from joblib import Parallel, delayed
from sklearn.ensemble._forest import _generate_unsampled_indices
import os
import re
import session_info
# private function for train_purf()
def _get_ref_antigen_stats(idx, tree, X, y, ref_indices, max_samples=None):
    if max_samples is None:
        max_samples = y.shape[0]
    oob_indices = _generate_unsampled_indices(tree.random_state, y.shape[0], max_samples)
    ref_oob = [i in oob_indices for i in ref_indices]
    pred = tree.predict_proba(X[ref_indices, :], check_input=False)
    ref_pred = pred[:, 1]
    return ref_oob, ref_pred

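# For each tree, the helper above returns (1) a boolean list marking which reference
# antigens fall in that tree's out-of-bag (OOB) set and (2) the tree's predicted
# positive-class probability for every reference antigen; train_purf() below uses these
# to weight trees when tree filtering is enabled.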
def train_purf(features, outcome, pos_level=0.5, purf=None, tree_filtering=False, ref_antigens=None, n_jobs=1):
    features, outcome = shuffle(features, outcome, random_state=0)
    # Imputation
    imputer = SimpleImputer(strategy='median')
    X = imputer.fit_transform(features)
    X = pd.DataFrame(X, index=features.index, columns=features.columns)
    y = outcome
    features = X
    print('There are %d positives out of %d samples before feature space weighting.' % (sum(y), len(y)))
    # Feature space weighting
    lab_pos = X.loc[y == 1, :]
    median = np.median(lab_pos, axis=0)
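    # The next block up-weights the labeled positives in feature space: each positive's
    # Euclidean distance to the median positive profile is rescaled to the range 1-10 and
    # rounded, and that positive is then duplicated that many times, so positives farther
    # from the median contribute more copies to the training set.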
    scaler = MinMaxScaler(feature_range=(1, 10))
    dist = list()
    for i in range(lab_pos.shape[0]):
        dist.append(distance.euclidean(lab_pos.iloc[i, :], median))
    dist = np.asarray(dist).reshape(-1, 1)
    counts = np.round(scaler.fit_transform(dist))
    counts = np.array(counts, dtype=np.int64)[:, 0]
    X_temp = X.iloc[y == 1, :]
    X = X.iloc[y == 0, :]
    y = np.asarray([0] * X.shape[0] + [1] * (sum(counts)))
    appended_data = [X]
    for i in range(len(counts)):
        appended_data.append(pd.concat([X_temp.iloc[[i]]] * counts[i]))
    X = pd.concat(appended_data)
    print('There are %d positives out of %d samples after feature space weighting.' % (sum(y), len(y)))
    res = pd.DataFrame({'protein_id': X.index, 'antigen_label': y})
    if tree_filtering is True:
        # get ref antigen indices
        ref_index_dict = {ref: list() for ref in list(ref_antigens.values())}
        for i in range(res.shape[0]):
            if res['protein_id'][i] in list(ref_antigens.values()):
                ref_index_dict[res['protein_id'][i]].append(res.index[i])
        ref_indices = sum(ref_index_dict.values(), [])
        # get OOB stats and predictions
        X = X.astype('float32')
        trees = purf.estimators_
        idx_list = [i for i in range(len(trees))]
        stats_res = Parallel(n_jobs=n_jobs)(
            delayed(_get_ref_antigen_stats)(idx, trees[idx], np.array(X), y, ref_indices)
            for idx in idx_list)
        # ref_oob data structure:
        # rows represent individual trees
        # columns represent reference antigens
        # cells indicate whether the reference antigen is in the OOB samples of the tree
        ref_oob = np.array([ref_oob for ref_oob, ref_pred in stats_res])
        # ref_pred data structure:
        # rows represent individual trees
        # columns represent reference antigens
        # cells indicate the prediction of the reference antigen by the tree
        ref_pred = np.array([ref_pred for ref_oob, ref_pred in stats_res])
        # analyze duplicated reference antigens as a group
        cumsum_num_ref = np.cumsum(np.array([len(v) for k, v in ref_index_dict.items()]))
        ref_oob_all = np.array([ref_oob[:, 0:cumsum_num_ref[i]].any(axis=1) if i == 0 else
                                ref_oob[:, cumsum_num_ref[i - 1]:cumsum_num_ref[i]].any(axis=1)
                                for i in range(len(ref_antigens))]).T
        ref_pred_all = np.array([ref_pred[:, 0:cumsum_num_ref[i]].any(axis=1) if i == 0 else
                                 ref_pred[:, cumsum_num_ref[i - 1]:cumsum_num_ref[i]].sum(axis=1)
                                 for i in range(len(ref_antigens))]).T
        # calculate number of reference antigens as OOB samples for each tree
        oob_total = ref_oob_all.sum(axis=1)
        # assign score of 1 to trees that correctly predict all OOB reference antigens; otherwise, assign 0 score
        weights = np.zeros(len(trees))
        # iterate through the trees and calculate the stats
        for i in range(len(trees)):
            oob_list = list(ref_oob_all[i, :])
            pred_list = list(ref_pred_all[i, :])
            if oob_total[i] == 0:
                weights[i] = 0
            else:
                if sum(np.array(pred_list)[oob_list] != 0) == oob_total[i]:
                    weights[i] = 1
    if tree_filtering is False:
        # Training PURF
        purf = PURandomForestClassifier(
            n_estimators=100000,
            oob_score=True,
            n_jobs=64,
            random_state=42,
            pos_level=pos_level
        )
        purf.fit(X, y)
    else:
        purf._set_oob_score_with_weights(np.array(X), y.reshape(-1, 1), weights=weights)
    # Storing results
    res['OOB score'] = purf.oob_decision_function_[:, 1]
    res = features.merge(res.groupby('protein_id').mean(), left_index=True, right_on='protein_id')
    res = res[['antigen_label', 'OOB score']]
    if tree_filtering is False:
        return (purf, res)
    else:
        return ({'model': purf, 'weights': weights}, res)
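
The tree filtering above follows a simple rule: a tree receives weight 1 only when every reference antigen in its out-of-bag (OOB) set is predicted positive; a tree with no reference antigens OOB receives weight 0. A minimal, self-contained sketch with hypothetical toy arrays (not part of the pipeline) illustrates the rule:
import numpy as np

# hypothetical OOB membership (trees x reference antigens) and per-tree predictions
toy_oob = np.array([[True, False, True],     # tree 0: antigens 0 and 2 are OOB
                    [False, False, False],   # tree 1: no reference antigen is OOB
                    [True, True, True]])     # tree 2: all antigens are OOB
toy_pred = np.array([[1, 0, 1],
                     [0, 0, 0],
                     [1, 1, 0]])
toy_weights = np.zeros(len(toy_oob))
for i in range(len(toy_oob)):
    n_oob = toy_oob[i].sum()
    if n_oob > 0 and (toy_pred[i][toy_oob[i]] != 0).sum() == n_oob:
        toy_weights[i] = 1
print(toy_weights)  # [1. 0. 0.]: only tree 0 predicts all of its OOB reference antigens positive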
2.1.1.1 Pv data set
ref_antigens = {'CSP': 'PVP01_0835600.1-p1', 'DBP': 'PVP01_0623800.1-p1', 'MSP1': 'PVP01_0728900.1-p1'}
data = pd.read_csv('./other_data/pv_ml_input.csv', index_col=0)
features = data.iloc[:, 1:]
outcome = np.array(data.antigen_label)

for pos_level in [0.5, 0.4, 0.6, 0.3, 0.7, 0.2, 0.8, 0.1, 0.9]:
    (purf, res) = train_purf(features, outcome, pos_level=pos_level, tree_filtering=False)
    (purf_filtered, res_filtered) = train_purf(features, outcome, pos_level=pos_level, purf=purf, tree_filtering=True, ref_antigens=ref_antigens)
    res['OOB score filtered'] = res_filtered['OOB score']
    res.to_csv('~/Downloads/pv_pos_level/%.1f_res.tsv' % pos_level, sep='\t')
    with open(os.path.expanduser('~/Downloads/pv_pos_level/%.1f_purf_tree_filtering.pkl' % pos_level), 'wb') as out:
        pickle.dump(purf_filtered, out, pickle.HIGHEST_PROTOCOL)

dir = os.path.expanduser('~/Downloads/pv_pos_level/')
files = os.listdir(dir)
tmp = pd.read_csv(dir + '0.1_res.tsv', sep='\t', index_col=0)['antigen_label']

data_frames = [pd.read_csv(dir + '%.1f_res.tsv' % pos_level, sep='\t', index_col=0)['OOB score filtered'] for pos_level in np.arange(0.1, 1, 0.1)]
merged_df = pd.concat([tmp] + data_frames, join='outer', axis=1)
colnames = ['antigen_label'] + ['%.1f' % pos_level for pos_level in np.arange(0.1, 1, 0.1)]
merged_df.columns = colnames
merged_df.to_csv('./other_data/pv_pos_level_parameter_tuning.csv')
2.1.1.2 Pv + Pf combined data set
ref_antigens = {'CSP (Pf)': 'PF3D7_0304600.1-p1', 'RH5 (Pf)': 'PF3D7_0424100.1-p1', 'MSP5 (Pf)': 'PF3D7_0206900.1-p1',
                'P230 (Pf)': 'PF3D7_0209000.1-p1',
                'CSP (Pv)': 'PVP01_0835600.1-p1', 'DBP (Pv)': 'PVP01_0623800.1-p1', 'MSP1 (Pv)': 'PVP01_0728900.1-p1'}
data = pd.read_csv('./data/supplementary_data_4_pfpv_ml_input.csv', index_col=0)
features = data.iloc[:, 1:]
outcome = np.array(data.antigen_label)

for pos_level in [0.5, 0.4, 0.6, 0.3, 0.7, 0.2, 0.8, 0.1, 0.9]:
    (purf, res) = train_purf(features, outcome, pos_level=pos_level, tree_filtering=False)
    (purf_filtered, res_filtered) = train_purf(features, outcome, pos_level=pos_level, purf=purf, tree_filtering=True, ref_antigens=ref_antigens)
    res['OOB score filtered'] = res_filtered['OOB score']
    res.to_csv('~/Downloads/pfpv_pos_level/%.1f_res.tsv' % pos_level, sep='\t')
    with open(os.path.expanduser('~/Downloads/pfpv_pos_level/%.1f_purf_tree_filtering.pkl' % pos_level), 'wb') as out:
        pickle.dump(purf_filtered, out, pickle.HIGHEST_PROTOCOL)

dir = os.path.expanduser('~/Downloads/pfpv_pos_level/')
files = os.listdir(dir)
tmp = pd.read_csv(dir + '0.1_res.tsv', sep='\t', index_col=0)['antigen_label']

data_frames = [pd.read_csv(dir + '%.1f_res.tsv' % pos_level, sep='\t', index_col=0)['OOB score filtered'] for pos_level in np.arange(0.1, 1, 0.1)]
merged_df = pd.concat([tmp] + data_frames, join='outer', axis=1)
colnames = ['antigen_label'] + ['%.1f' % pos_level for pos_level in np.arange(0.1, 1, 0.1)]
merged_df.columns = colnames
merged_df.to_csv('./other_data/pfpv_pos_level_parameter_tuning.csv')
2.1.2 Plotting
In R:
library(mixR)
library(pracma)
library(rlist)
library(ggplot2)
library(cowplot)
library(grid)
library(gridExtra)
2.1.2.1 Pv data set
data <- read.csv("./other_data/pv_pos_level_parameter_tuning.csv", header = TRUE, row.names = 1, check.names = FALSE)
# Extract data with only unlabeled proteins
data_unl <- data[data$antigen_label == 0, ]

plot_list <- list()
plot_list2 <- list()
for (i in seq(0.1, 0.9, 0.1)) {
  fit <- mixfit(data_unl[[as.character(i)]], ncomp = 2)

  # Calculate receiver operating characteristic (ROC) curve
  # for putative positive and negative samples
  x <- seq(-0.5, 1.5, by = 0.01)
  neg_cum <- pnorm(x, mean = fit$mu[1], sd = fit$sd[1])
  pos_cum <- pnorm(x, mean = fit$mu[2], sd = fit$sd[2])
  fpr <- (1 - neg_cum) / ((1 - neg_cum) + neg_cum) # false positive / (false positive + true negative)
  tpr <- (1 - pos_cum) / ((1 - pos_cum) + pos_cum) # true positive / (true positive + false negative)
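  # Note: (1 - neg_cum) + neg_cum and (1 - pos_cum) + pos_cum both equal 1, so fpr and tpr
  # reduce to the upper-tail probabilities of the putative-negative and putative-positive
  # mixture components at each threshold x; trapz(-fpr, tpr) below then gives the AUROC.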

  p <- plot(fit, title = paste0(
    "Positive level = ", i,
    " (AUROC = ", round(trapz(-fpr, tpr), 2), ")"
  )) +
    scale_fill_manual(values = c("#0080FF", "#FF007F"), labels = c("Putative negative", "Putative positive")) +
    theme_bw() +
    {if (i == 0.1) {
      theme(
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.title = element_blank(),
        axis.text = element_text(colour = "black"),
        plot.title = element_text(hjust = 0.5, colour = "black"),
        legend.title = element_blank(),
        legend.text = element_text(colour = "black"),
        legend.position = c(0.7, 0.85),
        legend.background = element_blank()
      )
    } else {
      theme(
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.title = element_blank(),
        axis.text = element_text(colour = "black"),
        plot.title = element_text(hjust = 0.5, colour = "black"),
        legend.position = "none"
      )
    }} +
    xlim(-0.8, 1.5)
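
  # The block below ranks all proteins by OOB score, keeps the labeled positives, prints the
  # fraction of them scoring at least 0.5 (reported as "EPR"), and plots their percent ranks;
  # the AUC in the plot title summarises how highly the known antigens rank overall.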
  # Calculate percent rank for labeled positives
  data_ <- data[c("antigen_label", as.character(i))]
  colnames(data_) <- c("antigen_label", "OOB score")
  data_$percent_rank <- rank(data_[["OOB score"]]) / nrow(data)
  data_ <- data_[data$antigen_label == 1, ]
  data_ <- data_[order(-data_$percent_rank), ]
  data_$x <- 1:nrow(data_) / nrow(data_)
  cat(paste0("EPR: ", sum(data_$`OOB score` >= 0.5) / nrow(data_), "\n"))

  p2 <- ggplot(data_, aes(x = x, y = `percent_rank`)) +
    geom_hline(yintercept = 0.5, linetype = "dashed", color = "grey50") +
    geom_line(linewidth = 0.1, color = "grey30") +
    geom_point(aes(fill = `OOB score`), size = 2.2, shape = 21, color = "grey30", stroke = 0.1) +
    scale_fill_gradient2(low = "#0080FF", mid = "white", high = "#FF007F", midpoint = 0.5, limits = c(0, 1)) +
    theme_bw() +
    {if (i == 0.1) {
      theme(
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.title = element_blank(),
        axis.text = element_text(colour = "black"),
        plot.title = element_text(hjust = 0.5, colour = "black"),
        legend.title = element_text(hjust = 0.5, colour = "black", angle = 0),
        legend.text = element_text(colour = "black"),
        legend.position = c(0.5, 0.15),
        legend.background = element_blank(),
        legend.title.align = -130
      )
    } else {
      theme(
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.title = element_blank(),
        axis.text = element_text(colour = "black"),
        plot.title = element_text(hjust = 0.5, colour = "black"),
        legend.position = "none"
      )
    }} +
    {if (i == 0.1) {
      guides(fill = guide_colourbar(title.position = "top", direction = "horizontal"))
    }} +
    {if (i == 0.1) {
      labs(fill = "Score (proportion of votes)")
    }} +
    ggtitle(paste0(
      "Positive level = ", i,
      " (AUC = ", round(trapz(c(0, data_$x, 1), c(1, data_$percent_rank, 0)), 2), ")"
    )) +
    ylim(0, 1)

  plot_list <- list.append(plot_list, p)
  plot_list2 <- list.append(plot_list2, p2)
}
2.1.2.2 Bimodal distribution plot
x_grob <- textGrob("Score (proportion of votes)", gp = gpar(fontsize = 15))
y_grob <- textGrob("Density", gp = gpar(fontsize = 15), rot = 90)
grid.arrange(arrangeGrob(plot_grid(plotlist = plot_list, ncol = 3), left = y_grob, bottom = x_grob))
2.1.2.3 Known antigen ranking
x_grob2 <- textGrob("Ranked known antigens (scaled)", gp = gpar(fontsize = 15))
y_grob2 <- textGrob("Percent rank", gp = gpar(fontsize = 15), rot = 90)
grid.arrange(arrangeGrob(plot_grid(plotlist = plot_list2, ncol = 3), left = y_grob2, bottom = x_grob2))
2.1.2.4 Pv + Pf combined data set
data <- read.csv("./other_data/pfpv_pos_level_parameter_tuning.csv", header = TRUE, row.names = 1, check.names = FALSE)
# Extract data with only unlabeled proteins
data_unl <- data[data$antigen_label == 0, ]

plot_list3 <- list()
plot_list4 <- list()
for (i in seq(0.1, 0.9, 0.1)) {
  fit <- mixfit(data_unl[[as.character(i)]], ncomp = 2)

  # Calculate receiver operating characteristic (ROC) curve
  # for putative positive and negative samples
  x <- seq(-0.5, 1.5, by = 0.01)
  neg_cum <- pnorm(x, mean = fit$mu[1], sd = fit$sd[1])
  pos_cum <- pnorm(x, mean = fit$mu[2], sd = fit$sd[2])
  fpr <- (1 - neg_cum) / ((1 - neg_cum) + neg_cum) # false positive / (false positive + true negative)
  tpr <- (1 - pos_cum) / ((1 - pos_cum) + pos_cum) # true positive / (true positive + false negative)

  p <- plot(fit, title = paste0(
    "Positive level = ", i,
    " (AUROC = ", round(trapz(-fpr, tpr), 3), ")"
  )) +
    scale_fill_manual(values = c("#0080FF", "#FF007F"), labels = c("Putative negative", "Putative positive")) +
    theme_bw() +
    {if (i == 0.1) {
      theme(
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.title = element_blank(),
        axis.text = element_text(colour = "black"),
        plot.title = element_text(hjust = 0.5, colour = "black"),
        legend.title = element_blank(),
        legend.text = element_text(colour = "black"),
        legend.position = c(0.7, 0.85),
        legend.background = element_blank()
      )
    } else {
      theme(
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.title = element_blank(),
        axis.text = element_text(colour = "black"),
        plot.title = element_text(hjust = 0.5, colour = "black"),
        legend.position = "none"
      )
    }} +
    xlim(-0.8, 1.5)
  # Calculate percent rank for labeled positives
  data_ <- data[c("antigen_label", as.character(i))]
  colnames(data_) <- c("antigen_label", "OOB score")
  data_$percent_rank <- rank(data_[["OOB score"]]) / nrow(data)
  data_ <- data_[data$antigen_label == 1, ]
  data_ <- data_[order(-data_$percent_rank), ]
  data_$x <- 1:nrow(data_) / nrow(data_)
  cat(paste0("EPR: ", sum(data_$`OOB score` >= 0.5) / nrow(data_), "\n"))

  p2 <- ggplot(data_, aes(x = x, y = `percent_rank`)) +
    geom_hline(yintercept = 0.5, linetype = "dashed", color = "grey50") +
    geom_line(linewidth = 0.1, color = "grey30") +
    geom_point(aes(fill = `OOB score`), size = 1.2, shape = 21, color = "grey30", stroke = 0.1) +
    scale_fill_gradient2(low = "#0080FF", mid = "white", high = "#FF007F", midpoint = 0.5, limits = c(0, 1)) +
    theme_bw() +
    {if (i == 0.1) {
      theme(
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.title = element_blank(),
        axis.text = element_text(colour = "black"),
        plot.title = element_text(hjust = 0.5, colour = "black"),
        legend.title = element_text(hjust = 0.5, colour = "black", angle = 0),
        legend.text = element_text(colour = "black"),
        legend.position = c(0.5, 0.15),
        legend.background = element_blank(),
        legend.title.align = -130
      )
    } else {
      theme(
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.title = element_blank(),
        axis.text = element_text(colour = "black"),
        plot.title = element_text(hjust = 0.5, colour = "black"),
        legend.position = "none"
      )
    }} +
    {if (i == 0.1) {
      guides(fill = guide_colourbar(title.position = "top", direction = "horizontal"))
    }} +
    {if (i == 0.1) {
      labs(fill = "Score (proportion of votes)")
    }} +
    ggtitle(paste0(
      "Positive level = ", i,
      " (AUC = ", round(trapz(c(0, data_$x, 1), c(1, data_$percent_rank, 0)), 2), ")"
    )) +
    ylim(0, 1)

  plot_list3 <- list.append(plot_list3, p)
  plot_list4 <- list.append(plot_list4, p2)
}
2.1.3 Results for positive level = 0.5
plot_list[[5]]$labels$title <- ""
plot_list2[[5]]$labels$title <- ""
plot_list3[[5]]$labels$title <- ""
plot_list4[[5]]$labels$title <- ""

plot_list[[5]] <- plot_list[[5]] +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.title = element_blank(),
    axis.text = element_text(colour = "black"),
    plot.title = element_text(hjust = 0.5, colour = "black"),
    legend.title = element_blank(),
    legend.text = element_text(colour = "black"),
    legend.position = c(0.2, 0.91),
    legend.background = element_blank()
  )

plot_list2[[5]] <- plot_list2[[5]] +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.title = element_blank(),
    axis.text = element_text(colour = "black"),
    plot.title = element_text(hjust = 0.5, colour = "black"),
    legend.title = element_text(hjust = 0.5, colour = "black", angle = 0),
    legend.text = element_text(colour = "black"),
    legend.position = c(0.35, 0.13),
    legend.background = element_blank(),
    legend.title.align = -130
  ) +
  guides(fill = guide_colourbar(title.position = "top", direction = "horizontal")) +
  labs(fill = "Score (proportion of votes)")

x_grob <- textGrob("Score (proportion of votes)", gp = gpar(fontsize = 10))
y_grob <- textGrob("Density", gp = gpar(fontsize = 10), rot = 90)
x_grob2 <- textGrob("Ranked known antigens (scaled)", gp = gpar(fontsize = 10))
y_grob2 <- textGrob("Percent rank", gp = gpar(fontsize = 10), rot = 90)
plot_grid(
  arrangeGrob(plot_list[[5]], left = y_grob, bottom = x_grob),
  arrangeGrob(plot_list2[[5]], left = y_grob2, bottom = x_grob2),
  arrangeGrob(plot_list3[[5]], left = y_grob, bottom = x_grob),
  arrangeGrob(plot_list4[[5]], left = y_grob2, bottom = x_grob2),
  nrow = 2, labels = c("a", "b", "c", "d")
)
sessionInfo()
## R version 4.2.3 (2023-03-15)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] gridExtra_2.3 cowplot_1.1.1 ggplot2_3.4.2 rlist_0.4.6.2
## [5] pracma_2.4.2 mixR_0.2.0 reticulate_1.28
##
## loaded via a namespace (and not attached):
## [1] styler_1.9.1 tidyselect_1.2.0 xfun_0.39 bslib_0.4.2
## [5] purrr_1.0.1 lattice_0.21-8 colorspace_2.1-0 vctrs_0.6.2
## [9] generics_0.1.3 htmltools_0.5.5 yaml_2.3.7 utf8_1.2.3
## [13] rlang_1.1.1 R.oo_1.25.0 jquerylib_0.1.4 pillar_1.9.0
## [17] glue_1.6.2 withr_2.5.0 R.utils_2.12.2 R.cache_0.16.0
## [21] lifecycle_1.0.3 munsell_0.5.0 gtable_0.3.3 R.methodsS3_1.8.2
## [25] codetools_0.2-19 evaluate_0.21 knitr_1.42 fastmap_1.1.1
## [29] fansi_1.0.4 highr_0.10 Rcpp_1.0.10 scales_1.2.1
## [33] cachem_1.0.8 jsonlite_1.8.4 png_0.1-8 digest_0.6.31
## [37] bookdown_0.34 dplyr_1.1.2 rprojroot_2.0.3 here_1.0.1
## [41] cli_3.6.1 tools_4.2.3 magrittr_2.0.3 sass_0.4.6
## [45] tibble_3.2.1 pkgconfig_2.0.3 Matrix_1.5-4 data.table_1.14.8
## [49] rmarkdown_2.21 rstudioapi_0.14 R6_2.5.1 compiler_4.2.3
session_info.show()
## -----
## joblib 1.1.1
## numpy 1.19.0
## pandas 1.3.2
## purf NA
## scipy 1.8.0
## session_info 1.0.0
## sklearn 0.24.2
## -----
## Python 3.8.2 (default, Mar 26 2020, 10:45:18) [Clang 4.0.1 (tags/RELEASE_401/final)]
## macOS-10.16-x86_64-i386-64bit
## -----
## Session information updated at 2023-05-18 12:34