This tutorial demonstrates how to build a machine learning pipeline to predict vaccine response outcomes using transcriptomic data from the Immune Signatures Data Resource (ISDR). The ISDR is a standardized compendium of systems vaccinology datasets that integrates pre- and post-vaccination gene expression profiles alongside matched immune response measurements from multiple vaccine studies.
Citation: Diray-Arce, J., Miller, H.E.R., Henrich, E. et al. The Immune Signatures data resource, a compendium of systems vaccinology datasets. Sci Data 9, 635 (2022). https://doi.org/10.1038/s41597-022-01714-7 [doi.org]
We walk through each step from data preparation to model evaluation, focusing on:
This tutorial supports reproducible research and serves as a reference pipeline for systems vaccinology studies using public immunology data. It can be adapted to other datasets and extended to test additional classifiers.
all_immdata_with_response.rds and
all_norm_eset.rds are storedall_immdata_with_response.rds)all_norm_eset.rds)pathogen_vaccine_type identifierimmdataR<-readRDS(file.path("/Users/joannarce/BCH Dropbox/Joann Arce/NAIRR/ISDR", "all_immdata_with_response.rds"))
eset<-readRDS(file.path("/Users/joannarce/BCH Dropbox/Joann Arce/NAIRR/ISDR", "all_norm_eset.rds"))
youngIDs<-eset@phenoData@data[which(eset@phenoData@data$age_imputed<50),"participant_id"]
youngIDswithResponse<-intersect(youngIDs,immdataR$participant_id)
eset@phenoData@data<-eset@phenoData@data[which(eset@phenoData@data$participant_id %in% youngIDswithResponse),]
immdataR_ordered<-eset@phenoData@data %>% left_join(immdataR, by = "participant_id")
MissingCols<-which(colnames(immdataR_ordered)%in%colnames(eset@phenoData@data)==FALSE)
cols_to_add<- immdataR_ordered %>% select(all_of(MissingCols))
eset@phenoData@data<- bind_cols(eset@phenoData@data,cols_to_add)
exprs_matrix <-Biobase::exprs(eset)[,which(colnames(Biobase::exprs(eset))%in%eset@phenoData@data$uid)]
rownames(eset@phenoData@data) <- eset@phenoData@data$uid
eset@phenoData@data <- eset@phenoData@data[colnames(exprs_matrix), ]
eset$pathogen_vaccine_type=paste0(eset$pathogen,"_",eset$vaccine_type)
#Timepoints of interest
#tp_int=c(-7,0)
tp_int=c(0)
# Get sample indices for timepoints of interest
ind=lapply(tp_int, function(x){ which(eset$study_time_collected==x)})
# Retain only samples from timepoints of interest
eset=eset[,Reduce(union,ind)]
#Recompute timepoint indices after removing extraneous timepoints
ind=lapply(tp_int, function(x){which(eset$study_time_collected==x)})
# Remove samples from a single study with fewer than sample_cutoff samples at any timepoint
sample_cutoff = 10
matrix_uni_tp=lapply(ind,function(x){unique(eset[,x]$matrix)})
matrix_ind=lapply(1:length(ind),function(x){
lapply(1:length(matrix_uni_tp[[x]]), function(y){which(matrix_uni_tp[[x]][[y]]==eset[,ind[[x]]]$matrix)})})
# Create empty vector
ind_cut_all=vector()
for (i in 1:length(matrix_ind)) {
ind_cut=which(sapply(matrix_ind[[i]],length)<sample_cutoff)
if (is_empty(ind_cut)==FALSE) {
for (j in 1:length(ind_cut)) {
ind_cut_all=c(ind_cut_all,ind[[i]][matrix_ind[[i]][[ind_cut[j]]]])
}
}
}
if (is_empty(ind_cut_all)==FALSE) {
eset=eset[,-ind_cut_all]
}
# Recompute timepoint indices after removing samples
tp_int=unique(eset$study_time_collected[which(eset$study_time_collected>=0)])
ind=lapply(tp_int, function(x) {which(eset$study_time_collected==x)})
# Get Pheno Data
PD<-pData(eset)
eset=eset[complete.cases(Biobase::exprs(eset)),]
eset=eset[,eset$uid!="SUB187457.1289_0_Days_BS974314"]
data.ML) for ML: rows =
gene-time points, cols = participantsPatientInfo table with demographics and
outcomesPD.use <- droplevels(pData(eset))
EXP <- Biobase::exprs(eset)
EXP <- EXP[complete.cases(EXP), ]
EXP.melt <- reshape2::melt(EXP)
colnames(EXP.melt)[2] <- "uid"
EXP.melt$uid <- as.character(EXP.melt$uid)
EXP.PD <- inner_join(EXP.melt, PD.use, by = "uid")
EXP.PD$Gene_Time <- paste0(EXP.PD$Var1, "_", EXP.PD$study_time_collected)
data.ML <- dcast(EXP.PD, Var1 + Gene_Time + study_time_collected ~ participant_id, value.var = "value")
rownames(data.ML) <- data.ML$Gene_Time
data.ML <- data.ML[, -c(1:3)]
PatientInfo <- EXP.PD[!duplicated(EXP.PD$participant_id),
c("age_imputed", "gender", "pathogen_vaccine_type", "maxRBA_p30", "MFC_p30", "participant_id", "maxRBA", "MFC")]
rownames(PatientInfo) <- PatientInfo$participant_id
We filter PatientInfo to include only participants
labeled as highResponder or lowResponder, and
set highResponder as the reference level for
classification.
PatientInfo %>%
filter(MFC_p30%in%c("lowResponder","highResponder")) %>%
droplevels() %>%
mutate(MFC_p30=fct_relevel(MFC_p30, c("highResponder", "lowResponder"))) -> PatientInfo
X.Train and response vector
Y.TrainDevID<-rownames(PatientInfo)
DevID<-intersect(DevID, colnames(data.ML))
X.Dev<-data.ML[,DevID]
rownames(X.Dev)<-strsplit2(rownames(X.Dev), "_")[,1]#didn't work, have genes at two timepoints
Y.Dev<-PatientInfo[DevID, 'MFC_p30']
names(Y.Dev)<-DevID
## Keep top 500 most varying genes ##
VarVec<-sort(apply(X.Dev, 1, var), decreasing = T)
keep.genes<-names(VarVec[1:500])
X.Train.t1<-t(X.Dev[keep.genes,])
Y.Train1<-PatientInfo[rownames(X.Train.t1), 'MFC_p30']
names(Y.Train1)<-rownames(X.Train.t1)
mtry grid for Random Forest tuningSEED<-1987 # a great scientist was born this year
set.seed(SEED)
ctrl=trainControl(method = "adaptive_cv",
number=10,
adaptive = list(min = 5, alpha = 0.05,
method = "gls", complete = TRUE),
selectionFunction = 'best',
search="grid",
returnResamp = "final",
classProbs = TRUE,
returnData = F,
verboseIter = F,
summaryFunction = twoClassSummary,
savePredictions = T)
## Parallel on mac
# doMC::registerDoMC(detectCores()-2)
## Parallel on windows
#library(doParallel)
#cl<-makePSOCKcluster(detectCores()-2)
#registerDoParallel(cl)
set.seed(SEED)
model_list <- train(x=X.Train.t1,
y=Y.Train1[rownames(X.Train.t1)],
trControl=ctrl,
method="rf",
metric="ROC",
tuneLength=10,
maximize=TRUE,
preProc = c("center", "scale"))
#stopCluster(cl)
#save(model_list, file="Top500_Var_Model.rda")
## Feature importance ##
db.imp<-arrange(varImp(model_list)$importance, desc(Overall))
#write.csv(db.imp, file="Top500_Var_VarImp.csv")
## Model Predictions 10-CV ##
db.preds<-model_list$pred %>%
mutate(participant_id=rownames(X.Train.t1)[rowIndex],
pathogen_vaccine_type=PatientInfo[rownames(X.Train.t1)[rowIndex],"pathogen_vaccine_type"])
preds.train <- predict(model_list, newdata=X.Train.t1, type="prob")
preds.train <- data.frame(preds.train) %>% mutate(obs=Y.Train1[rownames(preds.train)])
We retrieve the top predictive genes, generate cross-validated probability predictions, and plot the ROC curve with its confidence interval.
#dev.new()
#par(pty="s")
### 10-CV
ROC.OBJ1<-pROC::roc(response = db.preds$obs,
predictor = db.preds$highResponder,
levels=c("lowResponder", "highResponder"),
direction="<",
plot=TRUE, legacy.axes=TRUE, main= "Overall ROC", percent=TRUE,
xlab="False Positive %", ylab="True Positive %", identity=FALSE,
#main=paste(time, type, meth, bag),
col="black",lwd=4, lty=1, print.auc.x=77, print.auc.y=28,
print.auc.cex=0.9,
ci=TRUE)
#plot(ROC.OBJ1, legacy.axes = TRUE, percent = TRUE, col = "black", lwd = 4,lty = 1, print.auc = TRUE)
### Training-this is exactly the same data as test data
#ROC.OBJ2<-pROC::roc(response = preds.train$obs,
# levels=c("lowResponder", "highResponder"),
# predictor = preds.train$highResponder,
# direction="<",
# legacy.axes=TRUE,
# percent=TRUE,
# ci=TRUE)
#plot(ROC.OBJ2, legacy.axes = TRUE,
# percent = TRUE,
# col = "gray60",
# lwd = 4,
# lty = 3,
# print.auc = TRUE)
ci.auc (ROC.OBJ1)
## 95% CI: 65.55%-75.89% (DeLong)
This section of code uses the 10-CV predictions in db.preds to calculate performance metrics for each vaccine.
db.preds.filtered <- db.preds %>% group_by(pathogen_vaccine_type) %>% filter(n_distinct(obs) == 2) %>% ungroup()
perf.stats.by.vaccine<-plyr::ddply(.data=db.preds.filtered,
.variable=c("pathogen_vaccine_type"),
.fun=function(df){
CONF.MAT<-caret::confusionMatrix(data=relevel(df$pred, ref="highResponder"),
reference=relevel(df$obs, ref="highResponder"))
ROC.OBJ<-pROC::roc(response = df$obs,levels=c("lowResponder", "highResponder"),
direction="<",
predictor = df$highResponder)
AUC.CI<-as.vector(pROC::ci.auc(ROC.OBJ))
names(AUC.CI)<-c("AUC.CI.LOW", "AUC", "AUC.CI.HIGH")
BRIER = ModelMetrics::brier(actual = (as.numeric(df$obs)-1), predicted = df$highResponder)
N = length(df$pathogen_vaccine_type)
PROP_TEST<-prop.test((CONF.MAT$overall["Accuracy"]*N)[1], N)
FISHER_TEST<-fisher.test(df$pred, df$obs)
VEC=c(CONF.MAT$overall, CONF.MAT$byClass, AUC.CI,
Brier=BRIER, n_vac=N, n_correct=(CONF.MAT$overall["Accuracy"]*N)[[1]],
n_fail=((1-CONF.MAT$overall["Accuracy"])*N)[[1]],
PropTestPvalue=PROP_TEST$p.value,
FisherTestPvalue=FISHER_TEST$p.value)
return(VEC)})
perf.stats.by.vaccine<-arrange(perf.stats.by.vaccine, desc(AUC))
perf.stats.by.vaccine$n_correct<-perf.stats.by.vaccine$Accuracy*perf.stats.by.vaccine$n_vac
perf.stats.by.vaccine$n_fail<-perf.stats.by.vaccine$n_vac-perf.stats.by.vaccine$n_correct
perf.stats.by.vaccine<-arrange(perf.stats.by.vaccine, desc(n_vac))
perf.stats.by.vaccine
This section of code uses ggplot() to show the AUC and confidence intervals for vaccine.
ggplot(perf.stats.by.vaccine, aes(x=reorder(pathogen_vaccine_type, -AUC),
y=AUC, fill=pathogen_vaccine_type))+
geom_bar(stat='identity', show.legend = F)+
scale_fill_brewer(palette = "Set3")+
geom_errorbar(aes(ymin = AUC.CI.LOW, ymax = AUC.CI.HIGH),width = 0.5)+
labs(y="AUC (CI)", x = "Vaccine")+
theme_bw()+
theme(axis.text.x = element_text(angle = 45, hjust = 1, size=8))+
geom_text(aes(y=0.05, label=n_vac))+
ylim(0,1)
X.Train <- t(X.Dev[keep.genes, ])
Y.Train <- PatientInfo[rownames(X.Train), "MFC_p30"]
# Perform PCA on top variable genes from training data
pca_res <- prcomp(X.Train, scale. = TRUE)
# Build data frame for plotting
pca_df <- data.frame(
PC1 = pca_res$x[,1],
PC2 = pca_res$x[,2],
Vaccine = PatientInfo[rownames(X.Train), "pathogen_vaccine_type"],
Responder = PatientInfo[rownames(X.Train), "MFC_p30"]
)
# Plot PCA
ggplot(pca_df, aes(x = PC1, y = PC2, color = Vaccine, shape = Responder)) +
geom_point(size = 3) +
theme_bw() +
labs(
title = "PCA of Top 500 Most Variable Genes",
x = paste0("PC1 (", round(summary(pca_res)$importance[2,1]*100,1), "% variance)"),
y = paste0("PC2 (", round(summary(pca_res)$importance[2,2]*100,1), "% variance)")
)
# Predicted probability distribution
ggplot(db.preds, aes(x = highResponder, fill = obs)) +
geom_density(alpha = 0.5) +
theme_bw() +
labs(title = "Predicted Probability Distribution by Class",
x = "Predicted Probability for High Responder", y = "Density")
# Confusion matrix heatmap
library(reshape2)
pred_labels <- factor(ifelse(db.preds$highResponder > 0.5, "highResponder", "lowResponder"),
levels = c("lowResponder", "highResponder"))
true_labels <- factor(db.preds$obs, levels = c("lowResponder", "highResponder"))
cm_table <- table(Predicted = pred_labels, Actual = true_labels)
cm_melt <- melt(cm_table)
ggplot(cm_melt, aes(x = Actual, y = Predicted, fill = value)) +
geom_tile(color = "white") +
geom_text(aes(label = value), size = 5) +
scale_fill_gradient(low = "white", high = "steelblue") +
theme_minimal() +
labs(title = "Confusion Matrix Heatmap")
var_df_all <- data.frame(Gene = names(VarVec), Variance = VarVec)
# Stick-dot plot for variance
ggplot(var_df_all[1:20, ], aes(x = reorder(Gene, Variance), y = Variance)) +
geom_segment(aes(xend = Gene, y = 0, yend = Variance), color = "grey") +
geom_point(size = 3, color = "coral") +
coord_flip() +
theme_bw() +
labs(title = "Top 20 Most Variable Genes - Entire Dataset", x = "Gene", y = "Variance")
# Re-extract feature importance for clarity
db.imp <- arrange(varImp(model_list)$importance, desc(Overall))
db.imp$Feature <- rownames(db.imp)
# Stick-dot plot for feature importance
ggplot(db.imp[1:20, ], aes(x = reorder(Feature, Overall), y = Overall)) +
geom_segment(aes(xend = Feature, y = 0, yend = Overall), color = "grey") +
geom_point(size = 3, color = "steelblue") +
coord_flip() +
theme_bw() +
labs(title = "Top 20 Feature Importance - Entire Dataset", x = "Gene", y = "Importance")
customGrid <- expand.grid(mtry = c(2, 5, 10))
influenza_ids <- PatientInfo %>% filter(pathogen_vaccine_type == "Influenza_Inactivated") %>% rownames()
influenza_ids <- intersect(influenza_ids, colnames(data.ML))
X.Influenza <- data.ML[, influenza_ids]
rownames(X.Influenza) <- strsplit2(rownames(X.Influenza), "_")[, 1]
VarVec_influenza <- sort(apply(X.Influenza, 1, var), decreasing = TRUE)
keep.genes.influenza <- names(VarVec_influenza[1:500])
X.Influenza.Train <- t(X.Influenza[keep.genes.influenza, ])
Y.Influenza.Train <- PatientInfo[rownames(X.Influenza.Train), "MFC_p30"]
var_df <- data.frame(Gene = names(VarVec_influenza), Variance = VarVec_influenza)
ggplot(var_df[1:20, ], aes(x = reorder(Gene, Variance), y = Variance)) +
geom_bar(stat = "identity", fill = "coral") +
coord_flip() +
theme_bw() +
labs(title = "Top 20 Most Variable Genes - Influenza (Inactivated)", x = "Gene", y = "Variance")
model_influenza <- train(
x = X.Influenza.Train,
y = Y.Influenza.Train,
trControl = ctrl,
method = "rf",
metric = "ROC",
tuneGrid = customGrid,
preProc = c("center", "scale")
)
db.imp.influenza <- arrange(varImp(model_influenza)$importance, desc(Overall))
db.imp.influenza$Feature <- rownames(db.imp.influenza)
ggplot(db.imp.influenza[1:20, ], aes(x = reorder(Feature, Overall), y = Overall)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
theme_bw() +
labs(title = "Top 20 Feature Importance - Influenza (Inactivated)", x = "Gene", y = "Importance")
db.preds.influenza <- model_influenza$pred %>% mutate(participant_id = rownames(X.Influenza.Train)[rowIndex])
if (length(unique(db.preds.influenza$obs)) == 2) {
pROC::roc(
response = db.preds.influenza$obs,
predictor = db.preds.influenza$highResponder,
levels = c("lowResponder", "highResponder"),
direction = "<",
plot = TRUE,
legacy.axes = TRUE,
percent = TRUE,
col = "purple",
lwd = 3,
main = "ROC Curve - Influenza (Inactivated) Subgroup"
)
} else {
print("Cannot compute ROC: Only one response class present.")
}
##
## Call:
## roc.default(response = db.preds.influenza$obs, predictor = db.preds.influenza$highResponder, levels = c("lowResponder", "highResponder"), percent = TRUE, direction = "<", plot = TRUE, legacy.axes = TRUE, col = "purple", lwd = 3, main = "ROC Curve - Influenza (Inactivated) Subgroup")
##
## Data: db.preds.influenza$highResponder in 57 controls (db.preds.influenza$obs lowResponder) < 69 cases (db.preds.influenza$obs highResponder).
## Area under the curve: 70.09%
sessionInfo()
## R version 4.5.0 (2025-04-11)
## Platform: aarch64-apple-darwin20
## Running under: macOS 26.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] rlang_1.1.6 ModelMetrics_1.2.2.2 pROC_1.19.0.1
## [4] reshape2_1.4.4 randomForest_4.7-1.2 limma_3.64.3
## [7] caret_7.0-1 lattice_0.22-7 ggplot2_4.0.0
## [10] forcats_1.0.1 dplyr_1.1.4 data.table_1.17.8
## [13] Biobase_2.68.0 BiocGenerics_0.54.0 generics_0.1.4
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 xfun_0.53 bslib_0.9.0
## [4] recipes_1.3.1 vctrs_0.6.5 tools_4.5.0
## [7] stats4_4.5.0 parallel_4.5.0 proxy_0.4-27
## [10] tibble_3.3.0 pkgconfig_2.0.3 Matrix_1.7-4
## [13] RColorBrewer_1.1-3 S7_0.2.0 lifecycle_1.0.4
## [16] compiler_4.5.0 farver_2.1.2 stringr_1.5.2
## [19] statmod_1.5.0 codetools_0.2-20 htmltools_0.5.8.1
## [22] class_7.3-23 sass_0.4.10 yaml_2.3.10
## [25] prodlim_2025.04.28 pillar_1.11.1 jquerylib_0.1.4
## [28] MASS_7.3-65 cachem_1.1.0 gower_1.0.2
## [31] iterators_1.0.14 rpart_4.1.24 foreach_1.5.2
## [34] nlme_3.1-168 parallelly_1.45.1 lava_1.8.1
## [37] tidyselect_1.2.1 digest_0.6.37 stringi_1.8.7
## [40] future_1.67.0 purrr_1.1.0 listenv_0.9.1
## [43] labeling_0.4.3 splines_4.5.0 fastmap_1.2.0
## [46] grid_4.5.0 cli_3.6.5 magrittr_2.0.4
## [49] survival_3.8-3 e1071_1.7-16 future.apply_1.20.0
## [52] withr_3.0.2 scales_1.4.0 lubridate_1.9.4
## [55] timechange_0.3.0 rmarkdown_2.29 globals_0.18.0
## [58] nnet_7.3-20 timeDate_4041.110 evaluate_1.0.5
## [61] knitr_1.50 hardhat_1.4.2 Rcpp_1.1.0
## [64] glue_1.8.0 ipred_0.9-15 rstudioapi_0.17.1
## [67] jsonlite_2.0.0 R6_2.6.1 plyr_1.8.9