Step1: Pre-vaccination Data Preparation

1.1: Load Data

  • Immune response and gene expression datasets are loaded.
  • Samples are selected with age <50
immdataR<-readRDS(file.path("c:/Work/ImmPort/NAIRR",  "all_immdata_with_response.rds"))
eset<-readRDS(file.path("c:/Work/ImmPort/NAIRR",  "all_norm_eset.rds"))
youngIDs<-eset@phenoData@data[which(eset@phenoData@data$age_imputed<50),"participant_id"]
youngIDswithResponse<-intersect(youngIDs,immdataR$participant_id)
eset@phenoData@data<-eset@phenoData@data[which(eset@phenoData@data$participant_id %in% youngIDswithResponse),]
immdataR_ordered<-eset@phenoData@data %>% left_join(immdataR, by = "participant_id")
MissingCols<-which(colnames(immdataR_ordered)%in%colnames(eset@phenoData@data)==FALSE)
cols_to_add<- immdataR_ordered %>% select(all_of(MissingCols))
eset@phenoData@data<- bind_cols(eset@phenoData@data,cols_to_add)
exprs_matrix <-Biobase::exprs(eset)[,which(colnames(Biobase::exprs(eset))%in%eset@phenoData@data$uid)]
rownames(eset@phenoData@data) <- eset@phenoData@data$uid
eset@phenoData@data <- eset@phenoData@data[colnames(exprs_matrix), ]
eset$pathogen_vaccine_type=paste0(eset$pathogen,"_",eset$vaccine_type)

1.2: Data Cleaning

  • Select time points of interest. In this tutorial time 0 is selected.
  • Remove samples with fewer than sample cut off at any timepoint
#Timepoints of interest
#tp_int=c(-7,0)
tp_int=c(0)
# Get sample indices for timepoints of interest
ind=lapply(tp_int, function(x){ which(eset$study_time_collected==x)})

# Retain only samples from timepoints of interest
eset=eset[,Reduce(union,ind)]

#Recompute timepoint indices after removing extraneous timepoints
ind=lapply(tp_int, function(x){which(eset$study_time_collected==x)})

# Remove samples from a single study with fewer than sample_cutoff samples at any timepoint
sample_cutoff = 10
matrix_uni_tp=lapply(ind,function(x){unique(eset[,x]$matrix)})
matrix_ind=lapply(1:length(ind),function(x){
  lapply(1:length(matrix_uni_tp[[x]]), function(y){which(matrix_uni_tp[[x]][[y]]==eset[,ind[[x]]]$matrix)})})

# Create empty vector
ind_cut_all=vector()

for (i in 1:length(matrix_ind)) {
  ind_cut=which(sapply(matrix_ind[[i]],length)<sample_cutoff)
  if (is_empty(ind_cut)==FALSE) {
    for (j in 1:length(ind_cut)) {
      ind_cut_all=c(ind_cut_all,ind[[i]][matrix_ind[[i]][[ind_cut[j]]]])
    }
  }
}

if (is_empty(ind_cut_all)==FALSE) {
  eset=eset[,-ind_cut_all]
}

# Recompute timepoint indices after removing samples
tp_int=unique(eset$study_time_collected[which(eset$study_time_collected>=0)])
ind=lapply(tp_int, function(x) {which(eset$study_time_collected==x)})

# Get Pheno Data
PD<-pData(eset)

eset=eset[complete.cases(Biobase::exprs(eset)),]
eset=eset[,eset$uid!="SUB187457.1289_0_Days_BS974314"]

1.3: Wrangle Data to Machine-Learning format

  • Wrangle data to a format suitable for Machine-Learning
  • Repeat this for both BTM and Genes
### 2 Preparing BL data for ML ----

## get pheno data, PD
PD.use<-droplevels(pData(eset))

UID<-PD.use$uid

## Prepare expression data
EXP<-Biobase::exprs(eset)
EXP<-EXP[complete.cases(EXP),]

## Need to wrangle this into ML format: Gene_Time by PTID
EXP.melt<-reshape2::melt(EXP)
colnames(EXP.melt)[2]="uid"
EXP.melt$uid<-as.character(EXP.melt$uid)

## Final Wrangle, can be slow
ptm<-proc.time()
EXP.PD<-inner_join(x=EXP.melt[EXP.melt$uid%in%PD.use$uid,],
                   y=PD.use[,c("uid","participant_id","study_time_collected","time_post_last_vax","age_imputed",
                               "gender","pathogen","vaccine_type","maxRBA_p30","MFC_p30","MFC","maxRBA", "pathogen_vaccine_type")], by="uid")

proc.time()[3]-ptm[3]
## elapsed 
##    0.32
EXP.PD$Gene_Time<-paste0(EXP.PD$Var1, "_",EXP.PD$study_time_collected)

data.ML<-as.data.frame(dcast.data.table(as.data.table(EXP.PD),
                                        formula=Var1+Gene_Time+study_time_collected~participant_id,
                                        value.var = "value"))

rownames(data.ML)<-data.ML$Gene_Time

# Save none filtered ML
data.ML<-data.ML[,-c(1:3)]

# Create and Save Patient Info
PatientInfo<-EXP.PD[!duplicated(EXP.PD$participant_id),
                    c('age_imputed','gender','pathogen_vaccine_type',
                      'maxRBA_p30','MFC_p30',"participant_id","maxRBA","MFC")]

rownames(PatientInfo)<-PatientInfo$participant_id
PatientInfo<-as.data.frame(PatientInfo)

1.4: Train Random Forest

Loading Pre-vaccination gene-expression data

The phenotype data is filtered to include only high and low responders.

PatientInfo %>% 
  filter(MFC_p30%in%c("lowResponder","highResponder")) %>%
  droplevels() %>%
  mutate(MFC_p30=fct_relevel(MFC_p30, c("highResponder", "lowResponder"))) -> PatientInfo

DevID contains PatientID for high and low responders.

The X and Y is subset to include only these patients, X.Dev Y.Dev (development data)

The development data is then filtered to include only the top 500 most varying genes.

DevID<-rownames(PatientInfo)
DevID<-intersect(DevID, colnames(data.ML))
X.Dev<-data.ML[,DevID]
rownames(X.Dev)<-strsplit2(rownames(X.Dev), "_")[,1]#didn't work, have genes at two timepoints

Y.Dev<-PatientInfo[DevID, 'MFC_p30']
names(Y.Dev)<-DevID

## Keep top 500 most varying genes ##
VarVec<-sort(apply(X.Dev, 1, var), decreasing = T)
keep.genes<-names(VarVec[1:500])

X.Train.t1<-t(X.Dev[keep.genes,])
Y.Train1<-PatientInfo[rownames(X.Train.t1), 'MFC_p30']
names(Y.Train1)<-rownames(X.Train.t1)

Define parameters for training using trainControl()

Ten fold-CV (adaptive), selecting the tuning parameters with highest AUROC.

SEED<-1987 # a great scientist was born this year

set.seed(SEED)
ctrl=trainControl(method = "adaptive_cv", 
                  number=10,
                  adaptive = list(min = 5, alpha = 0.05,
                                  method = "gls", complete = TRUE),
                  selectionFunction = 'best',
                  search="grid",
                  returnResamp = "final",
                  classProbs = TRUE,
                  returnData = F,
                  verboseIter = F,
                  summaryFunction = twoClassSummary,
                  savePredictions = T)

Run model training for random forest classifier

Training data is centered and scaled.

A tune length of 10 is used.

Model is trained to maximize the AUROC.

Parallelization code is provided for mac or windows.

## Parallel on mac
# doMC::registerDoMC(detectCores()-2)

## Parallel on windows
#library(doParallel)
#cl<-makePSOCKcluster(detectCores()-2)
#registerDoParallel(cl)

set.seed(SEED)
model_list <- train(x=X.Train.t1,
                    y=Y.Train1[rownames(X.Train.t1)],
                    trControl=ctrl,
                    method="rf",
                    metric="ROC", 
                    tuneLength=10,
                    maximize=TRUE,
                    preProc = c("center", "scale"))

#stopCluster(cl)

#save(model_list, file="Top500_Var_Model.rda")

Get Model Predictions and Variable Importance

Extracts 10-CV predictions, Training predictions and feature importance

## Feature importance ##
db.imp<-arrange(varImp(model_list)$importance, desc(Overall))
#write.csv(db.imp, file="Top500_Var_VarImp.csv")

## Model Predictions 10-CV ##
db.preds<-model_list$pred %>%
  mutate(participant_id=rownames(X.Train.t1)[rowIndex],
         pathogen_vaccine_type=PatientInfo[rownames(X.Train.t1)[rowIndex],"pathogen_vaccine_type"])

preds.train <- predict(model_list, newdata=X.Train.t1, type="prob")
preds.train <- data.frame(preds.train) %>% mutate(obs=Y.Train1[rownames(preds.train)])

1.5: Evaluate model

The script then creates an ROC curve for both sets of predictions.

#dev.new()
#par(pty="s")

### 10-CV 
ROC.OBJ1<-pROC::roc(response = db.preds$obs, 
                    predictor = db.preds$highResponder, 
                    levels=c("lowResponder", "highResponder"),
                    direction="<",
                    plot=TRUE, legacy.axes=TRUE, percent=TRUE,
                    xlab="False Positive %", ylab="True Positive %", identity=FALSE,
                    #main=paste(time, type, meth, bag),
                    col="black",lwd=4, lty=1, print.auc.x=77, print.auc.y=28,
                    print.auc.cex=0.9,
                    ci=TRUE)

#plot(ROC.OBJ1, legacy.axes = TRUE, percent = TRUE, col = "black", lwd = 4,lty = 1, print.auc = TRUE)

### Training-this is exactly the same data as test data
#ROC.OBJ2<-pROC::roc(response = preds.train$obs, 
#                    levels=c("lowResponder", "highResponder"),
#                    predictor = preds.train$highResponder, 
#                    direction="<",
#                    legacy.axes=TRUE, 
#                    percent=TRUE,
#                    ci=TRUE)

#plot(ROC.OBJ2, legacy.axes = TRUE,
#         percent = TRUE,
#         col = "gray60",
#         lwd = 4,
#         lty = 3,
#         print.auc = TRUE)

This section of code uses the 10-CV predictions in db.preds to calculate performance metrics for each vaccine.

db.preds.filtered <- db.preds %>%  group_by(pathogen_vaccine_type) %>%  filter(n_distinct(obs) == 2) %>% ungroup()
perf.stats.by.vaccine<-ddply(.data=db.preds.filtered,
                             .variable=c("pathogen_vaccine_type"),
                             .fun=function(df){
                               
                               CONF.MAT<-caret::confusionMatrix(data=relevel(df$pred, ref="highResponder"),
                                                                reference=relevel(df$obs, ref="highResponder"))
                               
                               ROC.OBJ<-pROC::roc(response = df$obs,levels=c("lowResponder", "highResponder"),
                                                  direction="<", 
                                                  predictor = df$highResponder)
                               
                               AUC.CI<-as.vector(pROC::ci.auc(ROC.OBJ))
                               names(AUC.CI)<-c("AUC.CI.LOW", "AUC", "AUC.CI.HIGH")
                               
                               BRIER = ModelMetrics::brier(actual = (as.numeric(df$obs)-1), predicted = df$highResponder)
                               
                               N = length(df$pathogen_vaccine_type)
                               
                               PROP_TEST<-prop.test((CONF.MAT$overall["Accuracy"]*N)[1], N)
                               
                               FISHER_TEST<-fisher.test(df$pred, df$obs)
                               
                               VEC=c(CONF.MAT$overall, CONF.MAT$byClass, AUC.CI, 
                                     Brier=BRIER, n_vac=N, n_correct=(CONF.MAT$overall["Accuracy"]*N)[[1]],
                                     n_fail=((1-CONF.MAT$overall["Accuracy"])*N)[[1]],
                                     PropTestPvalue=PROP_TEST$p.value,
                                     FisherTestPvalue=FISHER_TEST$p.value)
                               
                               return(VEC)})

perf.stats.by.vaccine<-arrange(perf.stats.by.vaccine, desc(AUC))

perf.stats.by.vaccine$n_correct<-perf.stats.by.vaccine$Accuracy*perf.stats.by.vaccine$n_vac
perf.stats.by.vaccine$n_fail<-perf.stats.by.vaccine$n_vac-perf.stats.by.vaccine$n_correct

perf.stats.by.vaccine<-arrange(perf.stats.by.vaccine, desc(n_vac))

perf.stats.by.vaccine

This section of code uses ggplot() to show the AUC and confidence intervals for vaccine.

ggplot(perf.stats.by.vaccine, aes(x=reorder(pathogen_vaccine_type, -AUC),
                       y=AUC, fill=pathogen_vaccine_type))+
  geom_bar(stat='identity', show.legend = F)+
  scale_fill_brewer(palette = "Set3")+
  geom_errorbar(aes(ymin = AUC.CI.LOW, ymax = AUC.CI.HIGH),width = 0.5)+
  labs(y="AUC (CI)", x = "Vaccine")+
  theme_bw()+
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size=8))+
  geom_text(aes(y=0.05, label=n_vac))+
  ylim(0,1)

Session Info

sessionInfo()
## R version 4.4.2 (2024-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 10 x64 (build 19045)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] randomForest_4.7-1.2 limma_3.62.2         caret_7.0-1         
##  [4] lattice_0.22-6       ggplot2_3.5.1        forcats_1.0.0       
##  [7] dplyr_1.1.4          plyr_1.8.9           rlang_1.1.5         
## [10] data.table_1.16.4    Biobase_2.66.0       BiocGenerics_0.52.0 
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6         xfun_0.51            bslib_0.9.0         
##  [4] recipes_1.1.1        vctrs_0.6.5          tools_4.4.2         
##  [7] generics_0.1.3       stats4_4.4.2         parallel_4.4.2      
## [10] proxy_0.4-27         tibble_3.2.1         pkgconfig_2.0.3     
## [13] ModelMetrics_1.2.2.2 Matrix_1.7-1         RColorBrewer_1.1-3  
## [16] lifecycle_1.0.4      farver_2.1.2         compiler_4.4.2      
## [19] stringr_1.5.1        statmod_1.5.0        munsell_0.5.1       
## [22] codetools_0.2-20     htmltools_0.5.8.1    class_7.3-22        
## [25] sass_0.4.10          yaml_2.3.10          prodlim_2024.06.25  
## [28] pillar_1.10.1        jquerylib_0.1.4      MASS_7.3-61         
## [31] cachem_1.1.0         gower_1.0.2          iterators_1.0.14    
## [34] rpart_4.1.23         foreach_1.5.2        nlme_3.1-166        
## [37] parallelly_1.42.0    lava_1.8.1           tidyselect_1.2.1    
## [40] digest_0.6.37        stringi_1.8.4        future_1.34.0       
## [43] reshape2_1.4.4       purrr_1.0.4          listenv_0.9.1       
## [46] labeling_0.4.3       splines_4.4.2        fastmap_1.2.0       
## [49] grid_4.4.2           colorspace_2.1-1     cli_3.6.4           
## [52] magrittr_2.0.3       survival_3.7-0       e1071_1.7-16        
## [55] future.apply_1.11.3  withr_3.0.2          scales_1.3.0        
## [58] lubridate_1.9.4      timechange_0.3.0     rmarkdown_2.29      
## [61] globals_0.16.3       nnet_7.3-19          timeDate_4041.110   
## [64] evaluate_1.0.3       knitr_1.49           hardhat_1.4.1       
## [67] Rcpp_1.0.14          glue_1.8.0           pROC_1.18.5         
## [70] ipred_0.9-15         jsonlite_2.0.0       R6_2.6.1