Section 1 Melanin binding pilot peptide array
library(seqinr)
library(Peptides)
library(stringr)
library(protr)
library(plyr)
library(randomForest)
library(ggplot2)
1.2 Generating ML input
# --------------
# Peptides_2.4.4
# --------------
# read in fasta file
<- read.fasta("./other_data/mb_pilot_peptide_array.fasta", seqtype = "AA", as.string = TRUE, set.attributes = FALSE)
peptides # Molecular weight
<- as.matrix(sapply(peptides, function(x) mw(x, monoisotopic = FALSE)))
weights colnames(weights) <- "weight"
# Amino acid composition
<- sapply(peptides, function(x) aaComp(x))
aa.comp <- t(sapply(aa.comp, function(x) x[, "Mole%"]))
aa.comp.matrix # Isoelectric point
<- as.matrix(sapply(peptides, function(x) pI(x, pKscale = "EMBOSS")))
pI.values colnames(pI.values) <- "pI.value"
# Hydrophobicity
<- as.matrix(sapply(peptides, function(x) hydrophobicity(x, scale = "KyteDoolittle")))
hydrophobicity.values colnames(hydrophobicity.values) <- "hydrophobicity.value"
# Net charge at pH 7
<- as.matrix(sapply(peptides, function(x) charge(x, pH = 7, pKscale = "EMBOSS")))
net.charges colnames(net.charges) <- "net.charge"
# Boman index
<- as.matrix(sapply(peptides, function(x) boman(x)))
boman.indices colnames(boman.indices) <- "boman.index"
# combine peptides results
<- data.frame(cbind(weights, aa.comp.matrix, pI.values, hydrophobicity.values, net.charges, boman.indices))
peptides_res
# -----------
# protr_1.6-2
# -----------
<- 7
length # read in FASTA file
<- readFASTA("./other_data/mb_pilot_peptide_array.fasta")
sequences <- sequences[unlist(lapply(sequences, function(x) nchar(x) > 1))]
sequences # calculate amino acid composition descriptors (dim = 20)
<- t(sapply(sequences, extractAAC))
x1 colnames(x1)[colnames(x1) == "Y"] <- "Y_tyrosine"
# calculate dipeptide composition descriptors (dim = 400)
<- t(sapply(sequences, extractDC))
x2 colnames(x2)[colnames(x2) == "NA"] <- "NA_dipeptide"
# calculate Moreau-Broto autocorrelation descriptors (dim = 8 * (length - 1))
<- t(sapply(sequences, extractMoreauBroto, nlag = length - 1L))
x3 colnames(x3) <- paste("moreau_broto", colnames(x3), sep = "_")
# calculate composition descriptors (dim = 21)
<- t(sapply(sequences, extractCTDC))
x4 # calculate transition descriptors (dim = 21)
<- t(sapply(sequences, extractCTDT))
x5 # calculate distribution descriptors (dim = 105)
<- t(sapply(sequences, extractCTDD))
x6 # calculate conjoint triad descriptors (dim = 343)
<- t(sapply(sequences, extractCTriad))
x7 # calculate sequence-order-coupling numbers (dim = 2 * (length - 1))
<- t(sapply(sequences, extractSOCN, nlag = length - 1L))
x8 # calculate quasi-sequence-order descriptors (dim = 40 + 2 * (length - 1))
<- t(sapply(sequences, extractQSO, nlag = length - 1L))
x9 # calculate pseudo-amino acid composition (dim = 20 + (length - 1))
<- t(sapply(sequences, extractPAAC, lambda = length - 1L))
x10 # calculate amphiphilic pseudo-amino acid composition (dim = 20 + 2 * (length - 1))
<- t(sapply(sequences, extractAPAAC, lambda = length - 1L))
x11 # combine all of the result datasets
<- data.frame(cbind(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11))
protr_res
<- read.csv("./other_data/mb_pilot_peptide_array_labels.csv", row.names = 1)
labels <- function(x, ..., by = "row.names") {
merge.all <- list(...)
L for (i in seq_along(L)) {
<- merge(x, L[[i]], by = by)
x rownames(x) <- x$Row.names
$Row.names <- NULL
x
}return(x)
}<- merge.all(peptides_res, protr_res, labels)
data write.csv(data, file = "./data/mb_pilot_peptide_array_ml_input.csv")
1.3 Training initial RF model
<- read.csv("./data/mb_pilot_peptide_array_ml_input.csv", check.names = FALSE, row.names = 1)
data <- subset(data, select = -category)
predictor_variables <- factor(data$category, levels = c("non-bind", "bind"))
response_variable # impute missing values
<- na.roughfix(response_variable)
response_variable # perform balance sampling
<- min(table(response_variable)) samp_size
# build RF model
set.seed(22)
<- 100000
ntree <- randomForest(
rf x = predictor_variables, y = response_variable, ntree = ntree, sampsize = rep(samp_size, 2),
importance = TRUE, proximity = TRUE, do.trace = 0.1 * ntree
)save(rf, file = "./rdata/mb_pilot_rf.RData")
1.4 Melanin binding variable importance
load(file = "./rdata/mb_pilot_rf.RData")
<- as.data.frame(importance(rf)[, "MeanDecreaseAccuracy", drop = FALSE])
imp_ds $`Feature` <- rownames(imp_ds)
imp_dscolnames(imp_ds) <- c("Mean decrease accuracy", "Feature")
<- imp_ds[order(-imp_ds$`Mean decrease accuracy`), ][1:20, ]
imp_ds
<- ggplot(imp_ds, aes(x = reorder(`Feature`, `Mean decrease accuracy`), y = `Mean decrease accuracy`)) +
p geom_col(color = "grey10", fill = "#f5b8b8", size = 0.2) +
geom_text(aes(label = sprintf("%.2f", `Mean decrease accuracy`)), nudge_y = 4.2, size = 3) +
coord_flip() +
ylim(0, max(imp_ds$`Mean decrease accuracy`) + 5) +
theme_bw() +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
strip.background = element_blank(),
panel.border = element_blank(),
axis.line = element_line(color = "black"),
legend.text = element_text(colour = "black", size = 10),
plot.title = element_text(hjust = 0.5, size = 18),
plot.margin = ggplot2::margin(10, 10, 10, 0, "pt"),
axis.title.x = element_text(colour = "black", size = 18),
axis.title.y = element_text(colour = "black", size = 18),
axis.text.x = element_text(colour = "black", size = 12),
axis.text.y = element_text(colour = "black", size = 12),
legend.title = element_text(size = 14),
legend.position = c(0.85, 0.5)
+
) xlab("") +
ggtitle("")
sessionInfo()
## R version 4.2.2 (2022-10-31)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggplot2_3.3.6 randomForest_4.7-1.1 plyr_1.8.7
## [4] protr_1.6-2 stringr_1.4.1 Peptides_2.4.4
## [7] seqinr_4.2-16
##
## loaded via a namespace (and not attached):
## [1] styler_1.8.0 tidyselect_1.1.2 xfun_0.32 bslib_0.4.0
## [5] purrr_0.3.4 colorspace_2.0-3 vctrs_0.4.1 generics_0.1.3
## [9] htmltools_0.5.3 yaml_2.3.5 utf8_1.2.2 rlang_1.0.4
## [13] R.oo_1.25.0 jquerylib_0.1.4 pillar_1.8.1 glue_1.6.2
## [17] withr_2.5.0 DBI_1.1.3 R.utils_2.12.0 R.cache_0.16.0
## [21] lifecycle_1.0.1 munsell_0.5.0 gtable_0.3.0 R.methodsS3_1.8.2
## [25] codetools_0.2-18 evaluate_0.16 knitr_1.40 fastmap_1.1.0
## [29] fansi_1.0.3 highr_0.9 Rcpp_1.0.9 scales_1.2.1
## [33] cachem_1.0.6 jsonlite_1.8.0 digest_0.6.29 stringi_1.7.8
## [37] bookdown_0.28 dplyr_1.0.9 grid_4.2.2 ade4_1.7-19
## [41] cli_3.3.0 tools_4.2.2 magrittr_2.0.3 sass_0.4.2
## [45] tibble_3.1.8 pkgconfig_2.0.3 MASS_7.3-58.1 assertthat_0.2.1
## [49] rmarkdown_2.16 rstudioapi_0.14 R6_2.5.1 compiler_4.2.2