Section 1 Data engineering
1.1 purf
package installization
See instructions on the GitHub webpage.
1.2 Retrieving P. vivax protein variables
In Bash:
echo "create database pfpv_reverse_vaccinology" | mariadb -u <dbuser> -p
mariadb -u <dbuser> -p pfpv_reverse_vaccinology < ./other_data/pfpv_reverse_vaccinology.sql
Overall database schema:
In R:
library(RMariaDB)
library(DBI)
library(rlist)
<- "./other_data/pv_assembled_data.csv"
output_path <- dbConnect(RMariaDB::MariaDB(), user = "root", password = "", dbname = "pfpv_reverse_vaccinology")
db
<- function(table_name, db) {
get_data <- sqlInterpolate(db, "SELECT * FROM ?table", table = dbQuoteIdentifier(db, table_name))
sql return(subset(dbGetQuery(db, sql), select = -c(id)))
}
# protein ID and transcript ID map table
<- dbGetQuery(db, 'SELECT dbxref_1.accession AS accession, dbxref_1.dbxref_id AS protein_id FROM dbxref AS dbxref_1
id_map LEFT OUTER JOIN feature AS feature_1 ON dbxref_1.dbxref_id = feature_1.dbxref_id
LEFT OUTER JOIN cvterm ON feature_1.type_id = cvterm.cvterm_id
LEFT OUTER JOIN feature_relationship ON feature_1.feature_id = feature_relationship.subject_id
LEFT OUTER JOIN feature AS feature_2 ON feature_relationship.object_id = feature_2.feature_id
LEFT OUTER JOIN dbxref AS dbxref_2 ON feature_2.dbxref_id = dbxref_2.dbxref_id
LEFT OUTER JOIN organism ON feature_1.organism_id = organism.organism_id
WHERE feature_1.is_obsolete = 0 AND cvterm.name = "polypeptide" AND organism.species="vivax_P01"')
1.2.1 Immunological data sets
In R:
<- list()
immu
# predivac_processed result for T-cell epitopes
<- list.append(immu, get_data("predivac_processed", db))
immu
# bepipred_2_0 result for B-cell epitopes
<- list.append(immu, get_data("bepipred_2_0_processed", db))
immu
# bepipred_1_0 result for B-cell epitopes
<- list.append(immu, get_data("bepipred_1_0_processed", db))
immu
# abcpred result for B-cell epitopes
<- list.append(immu, get_data("abcpred_processed", db))
immu
# ctlpred result for cytotoxic T-cell epitopes
<- list.append(immu, get_data("ctlpred_processed", db))
immu
# il_10pred result for interleukine-10 inducing epitopes
<- list.append(immu, get_data("il_10pred_processed", db))
immu
# ifnepitope result for IFN-gamma inducing epitopes
<- list.append(immu, get_data("ifnepitope_processed", db))
immu
# tappred for high binding affinity epitopes toward the TAP transporter
<- list.append(immu, get_data("tappred_processed", db))
immu
# mhc_i for MHC class I epitopes
<- list.append(immu, get_data("mhc_i", db))
immu
# mhc_ii for MHC class II epitopes
<- list.append(immu, get_data("mhc_ii", db))
immu
# antigenicity
<- list.append(immu, get_data("antigenicity", db))
immu
# immunogenicity
<- list.append(immu, get_data("immunogenicity", db))
immu
# merge immunological data sets
<- id_map["protein_id"]
immunological_ds for (ds in immu) {
<- merge(x = immunological_ds, y = ds, by = "protein_id", all.x = TRUE)
immunological_ds }
1.2.2 Proteomic data sets
In R:
<- list()
pro
# cello result for subcellualr localization
<- list.append(pro, subset(get_data("cello", db), select = -c(location_id)))
pro
# maap result for malaria adhesin/adhesin-like proteins
<- list.append(pro, get_data("maap", db))
pro
# peptides result for physicochemical properties
<- list.append(pro, get_data("r_peptides", db))
pro
# protr result for physicochemical properties
<- list.append(pro, get_data("protr", db))
pro
# hydrophilicity
<- list.append(pro, get_data("hydrophilicity", db))
pro
# predgpi result for gpi anchors prediction
<- list.append(pro, get_data("predgpi", db))
pro
# signalp result for signal cleavage prediction
<- list.append(pro, subset(get_data("signalp", db), select = -c(is_secprotein, cleavage_site, cs_probability)))
pro
# abpred results for amino acid compositions and other variables
<- list.append(pro, get_data("abpred", db))
pro
# glycoep results for N-linked and O-linked glycosylation
<- list.append(pro, get_data("glycoep_processed", db))
pro
# blastp result for similarity to human proteins
<- list.append(pro, get_data("blastp", db))
pro
# merge proteomic data sets
<- id_map["protein_id"]
proteomic_ds for (ds in pro) {
<- merge(x = proteomic_ds, y = ds, by = "protein_id", all.x = TRUE)
proteomic_ds }
1.2.3 Structural data sets
In R:
<- list()
struc
# tmhmm result for transmembrane helices prediction
<- list.append(struc, get_data("tmhmm_processed", db))
struc
# seg result for sequence complexity
<- list.append(struc, get_data("seg", db))
struc
# seg + tmhmm result
<- list.append(struc, get_data("seg_processed", db))
struc
# b_cell_epitope_methods for structural predictions
<- list.append(struc, get_data("beta_turn", db))
struc <- list.append(struc, get_data("surface_accessibility", db))
struc <- list.append(struc, get_data("flexibility", db))
struc
# merge structural analysis data sets
<- id_map["protein_id"]
structural_ds for (ds in struc) {
<- merge(x = structural_ds, y = ds, by = "protein_id", all.x = TRUE)
structural_ds }
1.2.4 Genomic data sets
In R:
<- list()
geno
# snp result
<- list.append(geno, get_data("snp", db))
geno
# merge structural analysis data sets
<- id_map["protein_id"]
genomic_ds for (ds in geno) {
<- merge(x = genomic_ds, y = ds, by = "protein_id", all.x = TRUE)
genomic_ds }
1.2.5 Final assembly
In R:
# prepare predictor variables
<- merge(x = immunological_ds, y = proteomic_ds, by = "protein_id", all = FALSE)
data <- merge(x = data, y = structural_ds, by = "protein_id", all = FALSE)
data <- merge(x = data, y = genomic_ds, by = "protein_id", all = FALSE)
data <- merge(x = id_map, y = data, by = "protein_id", all.x = TRUE)
data <- data$accession
accession <- subset(data, select = -c(protein_id, accession))
data rownames(data) <- accession
# write to output
write.csv(data, output_path)
# close database connection
dbDisconnect(db)
1.3 Generating ML input
In R:
<- read.csv("./other_data/pv_antigen_labels.csv", row.names = 1)
response_var <- read.csv("./other_data/pv_assembled_data.csv", row.names = 1)
predictor_vars <- merge(x = response_var, y = predictor_vars, by = "row.names", all = TRUE)
ml_input <- ml_input$Row.names
row_names <- subset(ml_input, select = -c(Row.names))
ml_input rownames(ml_input) <- row_names
# write to output
write.csv(ml_input, "./data/supplementary_data_2_pv_ml_input.csv")
sessionInfo()
## R version 4.2.3 (2023-03-15)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] rlist_0.4.6.2 DBI_1.1.3 RMariaDB_1.2.2
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.10 bslib_0.4.2 compiler_4.2.3 jquerylib_0.1.4
## [5] highr_0.10 R.methodsS3_1.8.2 R.utils_2.12.2 tools_4.2.3
## [9] digest_0.6.31 bit_4.0.5 jsonlite_1.8.4 evaluate_0.21
## [13] lifecycle_1.0.3 R.cache_0.16.0 pkgconfig_2.0.3 rlang_1.1.1
## [17] cli_3.6.1 rstudioapi_0.14 yaml_2.3.7 xfun_0.39
## [21] fastmap_1.1.1 styler_1.9.1 knitr_1.42 vctrs_0.6.2
## [25] sass_0.4.6 hms_1.1.3 bit64_4.0.5 glue_1.6.2
## [29] data.table_1.14.8 R6_2.5.1 rmarkdown_2.21 bookdown_0.34
## [33] purrr_1.0.1 magrittr_2.0.3 codetools_0.2-19 htmltools_0.5.5
## [37] cachem_1.0.8 R.oo_1.25.0