Section 3 Model evaluation
3.1 Known antigen prediction accuracy
3.1.1 Analysis
In Python:
library(reticulate)
use_condaenv("/Users/renee/Library/r-miniconda/envs/purf/bin/python")
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from scipy.spatial import distance
from purf.pu_ensemble import PURandomForestClassifier
import pickle
import os
import re
import session_info
3.1.1.1 Pv model cross-species predictions
= pd.read_csv('./other_data/pv_ml_input.csv', index_col=0)
data_ = data_.iloc[:,1:]
X_ = SimpleImputer(strategy='median')
imputer
imputer.fit(X_)
= pd.read_csv('./other_data/pf_ml_input.csv', index_col=0)
data = data.iloc[:,1:]
X = np.array(data.antigen_label)
y
= './pv_0.5_purf_tree_filtering.pkl'
pickle_path
with open(pickle_path, 'rb') as infile:
= pickle.load(infile)
purf
# Convert float type
= X.select_dtypes(np.float64).astype(np.float32)
X[X.select_dtypes(np.float64).columns] # Impute missing values
= imputer.transform(X)
X
= purf['model'].predict_proba(X)[:,1]
pred
= pd.DataFrame({'protein_id': data.index, 'antigen_label': y})
res 'OOB score filtered'] = pred
res['~/Downloads/pv_single_cross_predictions.csv', index=False) res.to_csv(
3.1.1.2 Pf model cross-species predictions
= pd.read_csv('./other_data/pf_ml_input.csv', index_col=0)
data_ = data_.iloc[:,1:]
X_ = SimpleImputer(strategy='median')
imputer
imputer.fit(X_)
= pd.read_csv('./other_data/pv_ml_input.csv', index_col=0)
data = data.iloc[:,1:]
X = np.array(data.antigen_label)
y
= './pf_0.5_purf_tree_filtering.pkl'
pickle_path
with open(pickle_path, 'rb') as infile:
= pickle.load(infile)
purf
# Convert float type
= X.select_dtypes(np.float64).astype(np.float32)
X[X.select_dtypes(np.float64).columns] # Impute missing values
= imputer.transform(X)
X
= purf['model'].predict_proba(X)[:,1]
pred
= pd.DataFrame({'protein_id': data.index, 'antigen_label': y})
res 'OOB score filtered'] = pred
res['~/Downloads/pf_single_cross_predictions.csv', index=False) res.to_csv(
In R:
library(DT)
<- read.csv("./other_data/pf_single_pv_single_scores.csv", check.names = FALSE)
single_model_res <- read.csv("./other_data/pf_single_pv_single_cross_predictions.csv", check.names = FALSE)
cross_pred_res <- read.csv("./data/supplementary_data_5_pfpv_purf_oob_predictions.csv", check.names = FALSE) combined_model_res
<- data.frame(
df "PURF model" = c("Pv single model", "Pf single model", "Pv + Pf combined model"),
"Accuracy for Pv known antigens" = c(
paste0(
nrow(single_model_res[single_model_res$antigen_label == 1 & single_model_res$species == "pv" & single_model_res$`OOB score filtered` >= 0.5, ]),
" / ", nrow(single_model_res[single_model_res$antigen_label == 1 & single_model_res$species == "pv", ]),
" = ", sprintf("%0.2f", nrow(single_model_res[single_model_res$antigen_label == 1 & single_model_res$species == "pv" & single_model_res$`OOB score filtered` >= 0.5, ]) / nrow(single_model_res[single_model_res$antigen_label == 1 & single_model_res$species == "pv", ]))
),paste0(
nrow(cross_pred_res[cross_pred_res$antigen_label == 1 & cross_pred_res$species == "pv" & cross_pred_res$`OOB score filtered` >= 0.5, ]),
" / ", nrow(cross_pred_res[cross_pred_res$antigen_label == 1 & cross_pred_res$species == "pv", ]),
" = ", sprintf("%0.2f", nrow(cross_pred_res[cross_pred_res$antigen_label == 1 & cross_pred_res$species == "pv" & cross_pred_res$`OOB score filtered` >= 0.5, ]) / nrow(cross_pred_res[cross_pred_res$antigen_label == 1 & cross_pred_res$species == "pv", ]))
),paste0(
nrow(combined_model_res[combined_model_res$antigen_label == 1 & combined_model_res$species == "pv" & combined_model_res$`OOB score filtered` >= 0.5, ]),
" / ", nrow(combined_model_res[combined_model_res$antigen_label == 1 & combined_model_res$species == "pv", ]),
" = ", sprintf("%0.2f", nrow(combined_model_res[combined_model_res$antigen_label == 1 & combined_model_res$species == "pv" & combined_model_res$`OOB score filtered` >= 0.5, ]) / nrow(combined_model_res[combined_model_res$antigen_label == 1 & combined_model_res$species == "pv", ]))
)
),"Accuracy for Pf known antigens" = c(
paste0(
nrow(cross_pred_res[cross_pred_res$antigen_label == 1 & cross_pred_res$species == "pf" & cross_pred_res$`OOB score filtered` >= 0.5, ]),
" / ", nrow(cross_pred_res[cross_pred_res$antigen_label == 1 & cross_pred_res$species == "pf", ]),
" = ", sprintf("%0.2f", nrow(cross_pred_res[cross_pred_res$antigen_label == 1 & cross_pred_res$species == "pf" & cross_pred_res$`OOB score filtered` >= 0.5, ]) / nrow(cross_pred_res[cross_pred_res$antigen_label == 1 & cross_pred_res$species == "pf", ]))
),paste0(
nrow(single_model_res[single_model_res$antigen_label == 1 & single_model_res$species == "pf" & single_model_res$`OOB score filtered` >= 0.5, ]),
" / ", nrow(single_model_res[single_model_res$antigen_label == 1 & single_model_res$species == "pf", ]),
" = ", sprintf("%0.2f", nrow(single_model_res[single_model_res$antigen_label == 1 & single_model_res$species == "pf" & single_model_res$`OOB score filtered` >= 0.5, ]) / nrow(single_model_res[single_model_res$antigen_label == 1 & single_model_res$species == "pf", ]))
),paste0(
nrow(combined_model_res[combined_model_res$antigen_label == 1 & combined_model_res$species == "pf" & combined_model_res$`OOB score filtered` >= 0.5, ]),
" / ", nrow(combined_model_res[combined_model_res$antigen_label == 1 & combined_model_res$species == "pf", ]),
" = ", sprintf("%0.2f", nrow(combined_model_res[combined_model_res$antigen_label == 1 & combined_model_res$species == "pf" & combined_model_res$`OOB score filtered` >= 0.5, ]) / nrow(combined_model_res[combined_model_res$antigen_label == 1 & combined_model_res$species == "pf", ]))
)
),check.names = FALSE
)
save(df, file = "./rdata/known_antigen_pred_acc.RData")
load("./rdata/known_antigen_pred_acc.RData")
%>%
df datatable(rownames = FALSE)
<- single_model_res[single_model_res$antigen_label == 1 & single_model_res$species == "pv", ]$protein_id
pv_antigens <- cross_pred_res[cross_pred_res$antigen_label == 1 & cross_pred_res$species == "pf" & cross_pred_res$`OOB score filtered` >= 0.5, ]$protein_id
pv_model_pf_antigens <- single_model_res[single_model_res$antigen_label == 1 & single_model_res$species == "pv" & single_model_res$`OOB score filtered` >= 0.5, ]$protein_id
pv_model_pv_antigens
<- single_model_res[single_model_res$antigen_label == 1 & single_model_res$species == "pf", ]$protein_id
pf_antigens <- single_model_res[single_model_res$antigen_label == 1 & single_model_res$species == "pf" & single_model_res$`OOB score filtered` >= 0.5, ]$protein_id
pf_model_pf_antigens <- cross_pred_res[cross_pred_res$antigen_label == 1 & cross_pred_res$species == "pv" & cross_pred_res$`OOB score filtered` >= 0.5, ]$protein_id
pf_model_pv_antigens
<- intersect(pf_model_pv_antigens, pv_model_pv_antigens)
pv_intersect <- intersect(pf_model_pf_antigens, pv_model_pf_antigens)
pf_intersect <- union(pf_model_pv_antigens, pv_model_pv_antigens)
pv_union <- union(pf_model_pf_antigens, pv_model_pf_antigens)
pf_union
<- data.frame(
df "Set" = c("Pv model only", "Intersect", "Pf model only", "Complement"),
"Pv known antigens" = c(
paste(pv_model_pv_antigens[!pv_model_pv_antigens %in% pv_intersect], collapse = ", "),
length(pv_intersect),
paste(pf_model_pv_antigens[!pf_model_pv_antigens %in% pv_intersect], collapse = ", "),
paste(pv_antigens[!pv_antigens %in% pv_union], collapse = ", ")
),"Pf known antigens" = c(
paste(pv_model_pf_antigens[!pv_model_pf_antigens %in% pf_intersect], collapse = ", "),
length(pf_intersect),
paste(pf_model_pf_antigens[!pf_model_pf_antigens %in% pf_intersect], collapse = ", "),
paste(pf_antigens[!pf_antigens %in% pf_union], collapse = ", ")
),check.names = FALSE
)
save(df, file = "./rdata/known_antigen_pred_set_analysis.RData")
load("./rdata/known_antigen_pred_set_analysis.RData")
%>%
df datatable(rownames = FALSE)
3.2 Adversarial controls
3.2.1 Analysis
In Python:
library(reticulate)
use_condaenv("/Users/renee/Library/r-miniconda/envs/purf/bin/python")
import pandas as pd
from sklearn.utils import shuffle
import numpy as np
import pickle
from sklearn.impute import SimpleImputer
from purf.pu_ensemble import PURandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from scipy.spatial import distance
from joblib import Parallel, delayed
from sklearn.ensemble._forest import _generate_unsampled_indices
import os
import re
import session_info
# private function for train_purf()
def _get_ref_antigen_stats(idx, tree, X, y, ref_indices, max_samples=None):
if max_samples is None:
= y.shape[0]
max_samples = _generate_unsampled_indices(tree.random_state, y.shape[0], max_samples)
oob_indices = [i in oob_indices for i in ref_indices]
ref_oob = list()
ref_pred = tree.predict_proba(X[ref_indices,:], check_input=False)
pred = pred[:,1]
ref_pred return ref_oob, ref_pred
def train_purf(features, outcome, pos_level=0.5, purf=None, tree_filtering=False, ref_antigens=None, n_jobs=1):
= shuffle(features, outcome, random_state=0)
features, outcome # Imputation
= SimpleImputer(strategy='median')
imputer = imputer.fit_transform(features)
X = pd.DataFrame(X, index=features.index, columns=features.columns)
X = outcome
y = X
features print('There are %d positives out of %d samples before feature space weighting.' % (sum(y), len(y)))
# Feature space weighting
= X.loc[y==1,:]
lab_pos = np.median(lab_pos, axis=0)
median # Feature space weighting
= X.loc[y==1,:]
lab_pos = np.median(lab_pos, axis=0)
median = MinMaxScaler(feature_range=(1,10))
scaler = list()
dist for i in range(lab_pos.shape[0]):
dist.append(distance.euclidean(lab_pos.iloc[i, :], median))= np.asarray(dist).reshape(-1, 1)
dist = np.round(scaler.fit_transform(dist))
counts = np.array(counts, dtype=np.int64)[:, 0]
counts = X.iloc[y==1, :]
X_temp = X.iloc[y==0, :]
X = np.asarray([0] * X.shape[0] + [1] * (sum(counts)))
y = [X]
appended_data for i in range(len(counts)):
* counts[i]))
appended_data.append(pd.concat([X_temp.iloc[[i]]] = pd.concat(appended_data)
X print('There are %d positives out of %d samples after feature space weighting.' % (sum(y), len(y)))
= pd.DataFrame({'protein_id': X.index, 'antigen_label' : y})
res if tree_filtering is True:
# get ref antigen indices
= {ref:list() for ref in list(ref_antigens.values())}
ref_index_dict for i in range(res.shape[0]):
if res['protein_id'][i] in list(ref_antigens.values()):
'protein_id'][i]].append(res.index[i])
ref_index_dict[res[= sum(ref_index_dict.values(), [])
ref_indices # get OOB stats and predictions
= X.astype('float32')
X = purf.estimators_
trees = [i for i in range(len(trees))]
idx_list = Parallel(n_jobs=n_jobs)(
stats_res for idx in idx_list)
delayed(_get_ref_antigen_stats)(idx, trees[idx], np.array(X), y, ref_indices) # ref_oob data structure:
# rows represent individual trees
# column represent reference antigens
# cells indicate whether the reference antigen is in the OOB samples of the tree
= np.array([ref_oob for ref_oob, ref_pred in stats_res])
ref_oob # ref_pred data structure:
# rows represent individual trees
# column represent reference antigens
# cells indicate the prediction of the reference antigen by the tree
= np.array([ref_pred for ref_oob, ref_pred in stats_res])
ref_pred # analyze duplicated reference antigens as a group
= np.cumsum(np.array([len(v) for k,v in ref_index_dict.items()]))
cumsum_num_ref = np.array([ref_oob[:, 0:cumsum_num_ref[i]].any(axis=1) if i == 0 else \
ref_oob_all - 1]:cumsum_num_ref[i]].any(axis=1) \
ref_oob[:, cumsum_num_ref[i for i in range(len(ref_antigens))]).T
= np.array([ref_pred[:, 0:cumsum_num_ref[i]].any(axis=1) if i == 0 else \
ref_pred_all - 1]:cumsum_num_ref[i]].sum(axis=1) \
ref_pred[:, cumsum_num_ref[i for i in range(len(ref_antigens))]).T
# calculate number of reference antigens as OOB samples for each tree
= ref_oob_all.sum(axis=1)
oob_total # assign score of 1 to trees that correctly predict all OOB reference antigens; otherwise, assign 0 score
= np.zeros(len(trees))
weights # iterate through the trees and calculate the stats
for i in range(len(trees)):
= list(ref_oob_all[i,:])
oob_list = list(ref_pred_all[i,:])
pred_list if oob_total[i] == 0:
= 0
weights[i] else:
if sum(np.array(pred_list)[oob_list] != 0) == oob_total[i]:
= 1
weights[i] if tree_filtering is False:
# Training PURF
= PURandomForestClassifier(
purf = 100000,
n_estimators = True,
oob_score = 64,
n_jobs = 42,
random_state = pos_level
pos_level
)
purf.fit(X, y)else:
-1,1), weights=weights)
purf._set_oob_score_with_weights(np.array(X), y.reshape(# Storing results
'OOB score'] = purf.oob_decision_function_[:,1]
res[= features.merge(res.groupby('protein_id').mean(), left_index=True, right_on='protein_id')
res = res[['antigen_label', 'OOB score']]
res if tree_filtering is False:
return (purf, res)
else:
return ({'model': purf, 'weights': weights}, res)
3.2.1.1 Pv data set
= {'CSP': 'PVP01_0835600.1-p1', 'DBP': 'PVP01_0623800.1-p1', 'MSP1': 'PVP01_0728900.1-p1'}
ref_antigens = pd.read_csv('./other_data/pv_ml_input.csv', index_col=0)
data = data.iloc[:, 1:]
features = np.array(data.antigen_label)
outcome
for (idx, (antigen, out)) in enumerate(zip(features.index, outcome)):
if out == 1:
if antigen in ref_antigens.values():
continue
= outcome.copy()
outcome_ = 0
outcome_[idx] = train_purf(features, outcome_, pos_level=0.5, tree_filtering=False)
(purf, res) = train_purf(features, outcome_, pos_level=0.5, purf=purf, tree_filtering=True, ref_antigens=ref_antigens)
(purf_filtered, res_filtered) '~/Downloads/pv_jackknife/' + antigen + '_res.csv') res_filtered.to_csv(
dir = '~/Downloads/pv_jackknife/'
= os.listdir(dir)
files
for file in files:
if file.endswith('csv'):
= pd.read_csv(dir + file, index_col=0)['antigen_label']
tmp break
= [pd.read_csv(dir + file, index_col=0)['OOB score'] for file in files if file.endswith('csv')]
data_frames = pd.concat([tmp] + data_frames, join='outer', axis=1)
merged_df = ['antigen_label'] + [re.match('PVP01_[0-9]+\.[0-9]-p1', file)[0] for file in files if file.endswith('csv')]
colnames = colnames
merged_df.columns './other_data/pv_adversarial_controls.csv') merged_df.to_csv(
3.2.1.2 Pv + Pf combined data set
= {'CSP (Pf)': 'PF3D7_0304600.1-p1', 'RH5 (Pf)': 'PF3D7_0424100.1-p1', 'MSP5 (Pf)': 'PF3D7_0206900.1-p1',
ref_antigens 'P230 (Pf)': 'PF3D7_0209000.1-p1',
'CSP (Pv)': 'PVP01_0835600.1-p1', 'DBP (Pv)': 'PVP01_0623800.1-p1', 'MSP1 (Pv)': 'PVP01_0728900.1-p1'}
= pd.read_csv('./data/supplementary_data_4_pfpv_ml_input.csv', index_col=0)
data = data.iloc[:, 1:]
features = np.array(data.antigen_label)
outcome
for (idx, (antigen, out)) in enumerate(zip(features.index, outcome)):
if out == 1:
if antigen in ref_antigens.values():
continue
= outcome.copy()
outcome_ = 0
outcome_[idx] = train_purf(features, outcome_, pos_level=0.5, tree_filtering=False)
(purf, res) = train_purf(features, outcome_, pos_level=0.5, purf=purf, tree_filtering=True, ref_antigens=ref_antigens)
(purf_filtered, res_filtered) '~/Downloads/pfpv_jackknife/' + antigen + '_res.csv') res_filtered.to_csv(
dir = '~/Downloads/pfpv_jackknife/'
= os.listdir(dir)
files
for file in files:
if file.endswith('csv'):
= pd.read_csv(dir + file, index_col=0)['antigen_label']
tmp break
= [pd.read_csv(dir + file, index_col=0)['OOB score'] for file in files if file.endswith('csv')]
data_frames = pd.concat([tmp] + data_frames, join='outer', axis=1)
merged_df = ['antigen_label'] + [re.match('PF3D7_[0-9]+\.[0-9]-p1', file)[0] if file.startswith('PF3D7') else re.match('PVP01_[0-9]+\.[0-9]-p1', file)[0] for file in files if file.endswith('csv')]
colnames = colnames
merged_df.columns './other_data/pfpv_adversarial_controls.csv') merged_df.to_csv(
3.2.2 Plotting
In R:
library(ggplot2)
library(ggdist)
library(gghalves)
library(cowplot)
library(grid)
library(ggbeeswarm)
library(ggpubr)
<- read.csv("./data/supplementary_data_3_pv_purf_oob_predictions.csv", check.names = FALSE, row.names = 1)
pv_data <- read.csv("./other_data/pf_single_pv_single_scores.csv", check.names = FALSE, row.names = 1)
pf_data <- read.csv("./data/supplementary_data_5_pfpv_purf_oob_predictions.csv", check.names = FALSE, row.names = 1)
pfpv_data
<- read.csv("./other_data/pv_adversarial_controls.csv", check.names = FALSE, row.names = 1)
pv_ad_ctrl <- read.csv("./other_data/pf_adversarial_controls.csv", check.names = FALSE, row.names = 1)
pf_ad_ctrl <- read.csv("./other_data/pfpv_adversarial_controls.csv", check.names = FALSE, row.names = 1) pfpv_ad_ctrl
3.2.2.1 Mean differences in scores
<- read.csv("./other_data/pvp01_gene_products_v62.csv", row.names = 1)
pv_products <- read.csv("./other_data/pf3d7_gene_products_v62.csv", row.names = 1)
pf_products <- rbind(pf_products, pv_products)
gene_products
<- function(validation_data, baseline_scores) {
calculate_known_antigen_scores <- c()
scores for (i in 2:ncol(validation_data)) {
<- sort(colnames(validation_data))[i]
known_antigen <- sort(colnames(validation_data))[-c(1, i)]
other_antigens <- c(scores, mean(validation_data[other_antigens, known_antigen] - baseline_scores[other_antigens, ]))
scores
}names(scores) <- sort(colnames(validation_data))[2:ncol(validation_data)]
return(scores)
}
<- calculate_known_antigen_scores(pv_ad_ctrl, pv_data["OOB score filtered"])
pv_ad_ctrl_scores <- calculate_known_antigen_scores(pf_ad_ctrl, pf_data[rownames(pf_ad_ctrl), "OOB score filtered", drop = FALSE])
pf_ad_ctrl_scores <- calculate_known_antigen_scores(pfpv_ad_ctrl, pfpv_data["OOB score filtered"])
pfpv_ad_ctrl_scores
<- data.frame(
pv_ad_ctrl_res "score" = pv_ad_ctrl_scores,
"gene_product" = gene_products[names(pv_ad_ctrl_scores), "gene_product"],
row.names = names(pv_ad_ctrl_scores)
)$source <- "Intersect"
pv_ad_ctrl_res<- c(
literature "PVP01_0102300.1-p1", "PVP01_0118900.1-p1", "PVP01_0202200.1-p1", "PVP01_0205500.1-p1",
"PVP01_0210500.1-p1", "PVP01_0304300.1-p1", "PVP01_0317900.1-p1", "PVP01_0418300.1-p1",
"PVP01_0418400.1-p1", "PVP01_0529100.1-p1", "PVP01_0532400.1-p1", "PVP01_0616000.1-p1",
"PVP01_0616100.1-p1", "PVP01_0701200.1-p1", "PVP01_0707700.1-p1", "PVP01_1026000.1-p1",
"PVP01_1030900.1-p1", "PVP01_1129100.1-p1"
)<- c(
IEDB "PVP01_0106200.1-p1", "PVP01_0106300.1-p1", "PVP01_0215600.1-p1", "PVP01_0307400.1-p1",
"PVP01_0522300.1-p1", "PVP01_0813700.1-p1", "PVP01_0829000.1-p1", "PVP01_0835600.1-p1",
"PVP01_0923000.1-p1", "PVP01_0932400.1-p1", "PVP01_1240600.1-p1", "PVP01_1306000.1-p1",
"PVP01_1412900.1-p1", "PVP01_1453900.1-p1"
)$source[rownames(pv_ad_ctrl_res) %in% literature] <- "Literature"
pv_ad_ctrl_res$source[rownames(pv_ad_ctrl_res) %in% IEDB] <- "IEDB"
pv_ad_ctrl_res
<- data.frame(
pfpv_ad_ctrl_res "score" = pfpv_ad_ctrl_scores,
"gene_product" = gene_products[names(pfpv_ad_ctrl_scores), "gene_product"],
row.names = names(pfpv_ad_ctrl_scores)
)$source <- "Intersect"
pfpv_ad_ctrl_res$source[rownames(pfpv_ad_ctrl_res) %in% literature] <- "Literature"
pfpv_ad_ctrl_res$source[rownames(pfpv_ad_ctrl_res) %in% IEDB] <- "IEDB"
pfpv_ad_ctrl_res
write.csv(pv_ad_ctrl_res, file = "./other_data/pv_ad_ctrl_res.csv", row.names = TRUE)
write.csv(pfpv_ad_ctrl_res, file = "./other_data/pfpv_ad_ctrl_res.csv", row.names = TRUE)
<- read.csv("./other_data/pv_ad_ctrl_res.csv", row.names = 1)
pv_ad_ctrl_res <- read.csv("./other_data/pfpv_ad_ctrl_res.csv", row.names = 1)
pfpv_ad_ctrl_res <- data.frame(
data_ group = factor(c(
rep(0, length(pv_ad_ctrl_scores)), rep(1, length(pf_ad_ctrl_scores)),
rep(2, length(pfpv_ad_ctrl_scores))
)),score = c(pv_ad_ctrl_scores, pf_ad_ctrl_scores, pfpv_ad_ctrl_scores)
)
<- ggplot(data_, aes(x = group, y = score)) +
p geom_hline(yintercept = 0, color = "grey80", linetype = "dashed") +
::stat_halfeye(aes(color = group, fill = group), adjust = 0.5, width = 0.7, .width = 0, justification = -0.1, point_colour = NA, alpha = 0.9) +
ggdistgeom_boxplot(aes(color = group), width = 0.1, outlier.color = NA, lwd = 0.5, show.legend = FALSE) +
::geom_half_point(aes(fill = group), side = "l", range_scale = 0.4, alpha = 0.6, shape = 21, color = "black", stroke = 0.1, size = 2) +
gghalvescoord_flip() +
scale_color_manual(values = c("#03a1fc", "#984EA3", "#FF7F00")) +
scale_fill_manual(values = c("#03a1fc", "#984EA3", "#FF7F00")) +
scale_x_discrete(
breaks = c(0, 1, 2),
labels = c(
expression(paste(italic("P. vivax"), " model")),
expression(paste(italic("P. falciparum"), " model")),
"Combined model"
),limits = rev
+
) theme_bw() +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.title.x = element_text(colour = "black"),
axis.title.y = element_blank(),
axis.text = element_text(colour = "black"),
plot.title = element_text(hjust = 0.5, colour = "black"),
plot.margin = ggplot2::margin(5, 5, 5, 5, "pt"),
legend.title = element_blank(),
legend.text = element_text(colour = "black"),
legend.position = "none"
+
) ylab("Mean difference in scores (proportion of votes)")
png(
file = "./figures/Supplementary Fig 5.png",
width = 5000, height = 2000, res = 600
)print(p)
dev.off()
pdf(file = "../supplementary_figures/Supplementary Fig 5.pdf", width = 10, height = 4)
print(p)
dev.off()
3.2.2.2 Investigation of known antigen groups
library(umap)
library(ggplot2)
library(ggrepel)
library(cowplot)
<- read.csv("./other_data/pv_ad_ctrl_res.csv", row.names = 1)
pv_ad_ctrl_res # pv_ad_ctrl_res = pv_ad_ctrl_res[pv_ad_ctrl_res$source != 'Intersect', ]
$score_group <- ""
pv_ad_ctrl_res$score_group[pv_ad_ctrl_res$score < 0.01] <- "Group 1"
pv_ad_ctrl_res$score_group[pv_ad_ctrl_res$score >= 0.01] <- "Group 2"
pv_ad_ctrl_restable(pv_ad_ctrl_res$score_group, pv_ad_ctrl_res$source)
##
## IEDB Intersect Literature
## Group 1 11 2 3
## Group 2 2 2 15
fisher.test(pv_ad_ctrl_res$score_group, pv_ad_ctrl_res$source)
##
## Fisher's Exact Test for Count Data
##
## data: pv_ad_ctrl_res$score_group and pv_ad_ctrl_res$source
## p-value = 0.0002597
## alternative hypothesis: two.sided
<- read.csv("./other_data/pfpv_ad_ctrl_res.csv", row.names = 1)
pfpv_ad_ctrl_res # pfpv_ad_ctrl_res = pfpv_ad_ctrl_res[pfpv_ad_ctrl_res$source != 'Intersect', ]
$score_group <- ""
pfpv_ad_ctrl_res$score_group[pfpv_ad_ctrl_res$score < -0.09] <- "Group 1"
pfpv_ad_ctrl_res$score_group[pfpv_ad_ctrl_res$score >= -0.09] <- "Group 2"
pfpv_ad_ctrl_restable(pfpv_ad_ctrl_res$score_group, pfpv_ad_ctrl_res$source)
##
## IEDB Intersect Literature
## Group 1 12 27 2
## Group 2 1 25 16
fisher.test(pfpv_ad_ctrl_res$score_group, pfpv_ad_ctrl_res$source)
##
## Fisher's Exact Test for Count Data
##
## data: pfpv_ad_ctrl_res$score_group and pfpv_ad_ctrl_res$source
## p-value = 1.176e-05
## alternative hypothesis: two.sided
<- read.csv("./other_data/pfpv_ad_ctrl_res.csv", row.names = 1)
pfpv_ad_ctrl_res $score_group <- ""
pfpv_ad_ctrl_res$score_group[pfpv_ad_ctrl_res$score < -0.09] <- "Group 1"
pfpv_ad_ctrl_res$score_group[pfpv_ad_ctrl_res$score >= -0.09] <- "Group 2"
pfpv_ad_ctrl_res$species <- ""
pfpv_ad_ctrl_res$species[startsWith(rownames(pfpv_ad_ctrl_res), "PF3D7")] <- "Pf"
pfpv_ad_ctrl_res$species[startsWith(rownames(pfpv_ad_ctrl_res), "PVP01")] <- "Pv"
pfpv_ad_ctrl_restable(pfpv_ad_ctrl_res$score_group, pfpv_ad_ctrl_res$species)
##
## Pf Pv
## Group 1 25 16
## Group 2 23 19
fisher.test(pfpv_ad_ctrl_res$score_group, pfpv_ad_ctrl_res$species)
##
## Fisher's Exact Test for Count Data
##
## data: pfpv_ad_ctrl_res$score_group and pfpv_ad_ctrl_res$species
## p-value = 0.6584
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.4934941 3.3881568
## sample estimates:
## odds ratio
## 1.286776
3.2.2.3 Labeled known antigen accuracies
<- function(validation_data, baseline_scores) {
calculate_known_antigen_accuracies <- c()
accuracies for (i in 2:ncol(validation_data)) {
<- sort(colnames(validation_data))[i]
known_antigen <- sort(colnames(validation_data))[-c(1, i)]
other_antigens <- c(accuracies, sum(validation_data[other_antigens, known_antigen] >= 0.5) / length(validation_data[other_antigens, known_antigen]))
accuracies
}return(accuracies)
}
<- calculate_known_antigen_accuracies(pv_ad_ctrl, pv_data["OOB score filtered"])
pv_ad_ctrl_accuracies <- calculate_known_antigen_accuracies(pf_ad_ctrl, pf_data[rownames(pf_ad_ctrl), "OOB score filtered", drop = FALSE])
pf_ad_ctrl_accuracies <- calculate_known_antigen_accuracies(pfpv_ad_ctrl, pfpv_data["OOB score filtered"])
pfpv_ad_ctrl_accuracies
<- data.frame(
df "PURF model" = c("Pv single model", "Pf single model", "Pv + Pf combined model"),
"Accuracy (mean ± SD)" = c(
paste0(
sprintf("%0.2f", mean(pv_ad_ctrl_accuracies)), " ± ",
sprintf("%0.2f", sd(pv_ad_ctrl_accuracies))
),paste0(
sprintf("%0.2f", mean(pf_ad_ctrl_accuracies)), " ± ",
sprintf("%0.2f", sd(pf_ad_ctrl_accuracies))
),paste0(
sprintf("%0.2f", mean(pfpv_ad_ctrl_accuracies)), " ± ",
sprintf("%0.2f", sd(pfpv_ad_ctrl_accuracies))
)
),check.names = FALSE
)
save(df, file = "./rdata/ad_ctrl_pred_accuracy.RData")
load("./rdata/ad_ctrl_pred_accuracy.RData")
%>%
df datatable(rownames = FALSE)
sessionInfo()
## R version 4.2.3 (2023-03-15)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] ggrepel_0.9.3 umap_0.2.10.0 ggpubr_0.6.0 ggbeeswarm_0.7.2
## [5] cowplot_1.1.1 gghalves_0.1.4 ggdist_3.2.1 ggplot2_3.4.2
## [9] DT_0.27 reticulate_1.28
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.10 here_1.0.1 lattice_0.21-8
## [4] tidyr_1.3.0 png_0.1-8 rprojroot_2.0.3
## [7] digest_0.6.31 utf8_1.2.3 RSpectra_0.16-1
## [10] R6_2.5.1 backports_1.4.1 evaluate_0.21
## [13] highr_0.10 pillar_1.9.0 rlang_1.1.1
## [16] rstudioapi_0.14 car_3.1-2 jquerylib_0.1.4
## [19] R.utils_2.12.2 R.oo_1.25.0 Matrix_1.5-4
## [22] rmarkdown_2.21 styler_1.9.1 htmlwidgets_1.6.2
## [25] munsell_0.5.0 broom_1.0.4 compiler_4.2.3
## [28] vipor_0.4.5 xfun_0.39 askpass_1.1
## [31] pkgconfig_2.0.3 htmltools_0.5.5 openssl_2.0.6
## [34] tidyselect_1.2.0 tibble_3.2.1 bookdown_0.34
## [37] codetools_0.2-19 fansi_1.0.4 dplyr_1.1.2
## [40] withr_2.5.0 R.methodsS3_1.8.2 distributional_0.3.2
## [43] jsonlite_1.8.4 gtable_0.3.3 lifecycle_1.0.3
## [46] magrittr_2.0.3 scales_1.2.1 carData_3.0-5
## [49] cli_3.6.1 cachem_1.0.8 farver_2.1.1
## [52] ggsignif_0.6.4 bslib_0.4.2 ellipsis_0.3.2
## [55] generics_0.1.3 vctrs_0.6.2 tools_4.2.3
## [58] R.cache_0.16.0 glue_1.6.2 beeswarm_0.4.0
## [61] purrr_1.0.1 crosstalk_1.2.0 abind_1.4-5
## [64] fastmap_1.1.1 yaml_2.3.7 colorspace_2.1-0
## [67] rstatix_0.7.2 knitr_1.42 sass_0.4.6
session_info.show()
## -----
## joblib 1.1.1
## numpy 1.19.0
## pandas 1.3.2
## purf NA
## scipy 1.8.0
## session_info 1.0.0
## sklearn 0.24.2
## -----
## Python 3.8.2 (default, Mar 26 2020, 10:45:18) [Clang 4.0.1 (tags/RELEASE_401/final)]
## macOS-10.16-x86_64-i386-64bit
## -----
## Session information updated at 2023-05-18 12:34