immdataR<-readRDS(file.path("c:/Work/ImmPort/NAIRR", "all_immdata_with_response.rds"))
eset<-readRDS(file.path("c:/Work/ImmPort/NAIRR", "all_norm_eset.rds"))
youngIDs<-eset@phenoData@data[which(eset@phenoData@data$age_imputed<50),"participant_id"]
youngIDswithResponse<-intersect(youngIDs,immdataR$participant_id)
eset@phenoData@data<-eset@phenoData@data[which(eset@phenoData@data$participant_id %in% youngIDswithResponse),]
immdataR_ordered<-eset@phenoData@data %>% left_join(immdataR, by = "participant_id")
MissingCols<-which(colnames(immdataR_ordered)%in%colnames(eset@phenoData@data)==FALSE)
cols_to_add<- immdataR_ordered %>% select(all_of(MissingCols))
eset@phenoData@data<- bind_cols(eset@phenoData@data,cols_to_add)
exprs_matrix <-Biobase::exprs(eset)[,which(colnames(Biobase::exprs(eset))%in%eset@phenoData@data$uid)]
rownames(eset@phenoData@data) <- eset@phenoData@data$uid
eset@phenoData@data <- eset@phenoData@data[colnames(exprs_matrix), ]
eset$pathogen_vaccine_type=paste0(eset$pathogen,"_",eset$vaccine_type)
#Timepoints of interest
#tp_int=c(-7,0)
tp_int=c(0)
# Get sample indices for timepoints of interest
ind=lapply(tp_int, function(x){ which(eset$study_time_collected==x)})
# Retain only samples from timepoints of interest
eset=eset[,Reduce(union,ind)]
#Recompute timepoint indices after removing extraneous timepoints
ind=lapply(tp_int, function(x){which(eset$study_time_collected==x)})
# Remove samples from a single study with fewer than sample_cutoff samples at any timepoint
sample_cutoff = 10
matrix_uni_tp=lapply(ind,function(x){unique(eset[,x]$matrix)})
matrix_ind=lapply(1:length(ind),function(x){
lapply(1:length(matrix_uni_tp[[x]]), function(y){which(matrix_uni_tp[[x]][[y]]==eset[,ind[[x]]]$matrix)})})
# Create empty vector
ind_cut_all=vector()
for (i in 1:length(matrix_ind)) {
ind_cut=which(sapply(matrix_ind[[i]],length)<sample_cutoff)
if (is_empty(ind_cut)==FALSE) {
for (j in 1:length(ind_cut)) {
ind_cut_all=c(ind_cut_all,ind[[i]][matrix_ind[[i]][[ind_cut[j]]]])
}
}
}
if (is_empty(ind_cut_all)==FALSE) {
eset=eset[,-ind_cut_all]
}
# Recompute timepoint indices after removing samples
tp_int=unique(eset$study_time_collected[which(eset$study_time_collected>=0)])
ind=lapply(tp_int, function(x) {which(eset$study_time_collected==x)})
# Get Pheno Data
PD<-pData(eset)
eset=eset[complete.cases(Biobase::exprs(eset)),]
eset=eset[,eset$uid!="SUB187457.1289_0_Days_BS974314"]
### 2 Preparing BL data for ML ----
## get pheno data, PD
PD.use<-droplevels(pData(eset))
UID<-PD.use$uid
## Prepare expression data
EXP<-Biobase::exprs(eset)
EXP<-EXP[complete.cases(EXP),]
## Need to wrangle this into ML format: Gene_Time by PTID
EXP.melt<-reshape2::melt(EXP)
colnames(EXP.melt)[2]="uid"
EXP.melt$uid<-as.character(EXP.melt$uid)
## Final Wrangle, can be slow
ptm<-proc.time()
EXP.PD<-inner_join(x=EXP.melt[EXP.melt$uid%in%PD.use$uid,],
y=PD.use[,c("uid","participant_id","study_time_collected","time_post_last_vax","age_imputed",
"gender","pathogen","vaccine_type","maxRBA_p30","MFC_p30","MFC","maxRBA", "pathogen_vaccine_type")], by="uid")
proc.time()[3]-ptm[3]
## elapsed
## 0.32
EXP.PD$Gene_Time<-paste0(EXP.PD$Var1, "_",EXP.PD$study_time_collected)
data.ML<-as.data.frame(dcast.data.table(as.data.table(EXP.PD),
formula=Var1+Gene_Time+study_time_collected~participant_id,
value.var = "value"))
rownames(data.ML)<-data.ML$Gene_Time
# Save none filtered ML
data.ML<-data.ML[,-c(1:3)]
# Create and Save Patient Info
PatientInfo<-EXP.PD[!duplicated(EXP.PD$participant_id),
c('age_imputed','gender','pathogen_vaccine_type',
'maxRBA_p30','MFC_p30',"participant_id","maxRBA","MFC")]
rownames(PatientInfo)<-PatientInfo$participant_id
PatientInfo<-as.data.frame(PatientInfo)
The phenotype data is filtered to include only high and low responders.
PatientInfo %>%
filter(MFC_p30%in%c("lowResponder","highResponder")) %>%
droplevels() %>%
mutate(MFC_p30=fct_relevel(MFC_p30, c("highResponder", "lowResponder"))) -> PatientInfo
DevID contains PatientID for high and low responders.
The X and Y is subset to include only these patients, X.Dev Y.Dev (development data)
The development data is then filtered to include only the top 500 most varying genes.
DevID<-rownames(PatientInfo)
DevID<-intersect(DevID, colnames(data.ML))
X.Dev<-data.ML[,DevID]
rownames(X.Dev)<-strsplit2(rownames(X.Dev), "_")[,1]#didn't work, have genes at two timepoints
Y.Dev<-PatientInfo[DevID, 'MFC_p30']
names(Y.Dev)<-DevID
## Keep top 500 most varying genes ##
VarVec<-sort(apply(X.Dev, 1, var), decreasing = T)
keep.genes<-names(VarVec[1:500])
X.Train.t1<-t(X.Dev[keep.genes,])
Y.Train1<-PatientInfo[rownames(X.Train.t1), 'MFC_p30']
names(Y.Train1)<-rownames(X.Train.t1)
Ten fold-CV (adaptive), selecting the tuning parameters with highest AUROC.
SEED<-1987 # a great scientist was born this year
set.seed(SEED)
ctrl=trainControl(method = "adaptive_cv",
number=10,
adaptive = list(min = 5, alpha = 0.05,
method = "gls", complete = TRUE),
selectionFunction = 'best',
search="grid",
returnResamp = "final",
classProbs = TRUE,
returnData = F,
verboseIter = F,
summaryFunction = twoClassSummary,
savePredictions = T)
Training data is centered and scaled.
A tune length of 10 is used.
Model is trained to maximize the AUROC.
Parallelization code is provided for mac or windows.
## Parallel on mac
# doMC::registerDoMC(detectCores()-2)
## Parallel on windows
#library(doParallel)
#cl<-makePSOCKcluster(detectCores()-2)
#registerDoParallel(cl)
set.seed(SEED)
model_list <- train(x=X.Train.t1,
y=Y.Train1[rownames(X.Train.t1)],
trControl=ctrl,
method="rf",
metric="ROC",
tuneLength=10,
maximize=TRUE,
preProc = c("center", "scale"))
#stopCluster(cl)
#save(model_list, file="Top500_Var_Model.rda")
Extracts 10-CV predictions, Training predictions and feature importance
## Feature importance ##
db.imp<-arrange(varImp(model_list)$importance, desc(Overall))
#write.csv(db.imp, file="Top500_Var_VarImp.csv")
## Model Predictions 10-CV ##
db.preds<-model_list$pred %>%
mutate(participant_id=rownames(X.Train.t1)[rowIndex],
pathogen_vaccine_type=PatientInfo[rownames(X.Train.t1)[rowIndex],"pathogen_vaccine_type"])
preds.train <- predict(model_list, newdata=X.Train.t1, type="prob")
preds.train <- data.frame(preds.train) %>% mutate(obs=Y.Train1[rownames(preds.train)])
The script then creates an ROC curve for both sets of predictions.
#dev.new()
#par(pty="s")
### 10-CV
ROC.OBJ1<-pROC::roc(response = db.preds$obs,
predictor = db.preds$highResponder,
levels=c("lowResponder", "highResponder"),
direction="<",
plot=TRUE, legacy.axes=TRUE, percent=TRUE,
xlab="False Positive %", ylab="True Positive %", identity=FALSE,
#main=paste(time, type, meth, bag),
col="black",lwd=4, lty=1, print.auc.x=77, print.auc.y=28,
print.auc.cex=0.9,
ci=TRUE)
#plot(ROC.OBJ1, legacy.axes = TRUE, percent = TRUE, col = "black", lwd = 4,lty = 1, print.auc = TRUE)
### Training-this is exactly the same data as test data
#ROC.OBJ2<-pROC::roc(response = preds.train$obs,
# levels=c("lowResponder", "highResponder"),
# predictor = preds.train$highResponder,
# direction="<",
# legacy.axes=TRUE,
# percent=TRUE,
# ci=TRUE)
#plot(ROC.OBJ2, legacy.axes = TRUE,
# percent = TRUE,
# col = "gray60",
# lwd = 4,
# lty = 3,
# print.auc = TRUE)
This section of code uses the 10-CV predictions in db.preds to calculate performance metrics for each vaccine.
db.preds.filtered <- db.preds %>% group_by(pathogen_vaccine_type) %>% filter(n_distinct(obs) == 2) %>% ungroup()
perf.stats.by.vaccine<-ddply(.data=db.preds.filtered,
.variable=c("pathogen_vaccine_type"),
.fun=function(df){
CONF.MAT<-caret::confusionMatrix(data=relevel(df$pred, ref="highResponder"),
reference=relevel(df$obs, ref="highResponder"))
ROC.OBJ<-pROC::roc(response = df$obs,levels=c("lowResponder", "highResponder"),
direction="<",
predictor = df$highResponder)
AUC.CI<-as.vector(pROC::ci.auc(ROC.OBJ))
names(AUC.CI)<-c("AUC.CI.LOW", "AUC", "AUC.CI.HIGH")
BRIER = ModelMetrics::brier(actual = (as.numeric(df$obs)-1), predicted = df$highResponder)
N = length(df$pathogen_vaccine_type)
PROP_TEST<-prop.test((CONF.MAT$overall["Accuracy"]*N)[1], N)
FISHER_TEST<-fisher.test(df$pred, df$obs)
VEC=c(CONF.MAT$overall, CONF.MAT$byClass, AUC.CI,
Brier=BRIER, n_vac=N, n_correct=(CONF.MAT$overall["Accuracy"]*N)[[1]],
n_fail=((1-CONF.MAT$overall["Accuracy"])*N)[[1]],
PropTestPvalue=PROP_TEST$p.value,
FisherTestPvalue=FISHER_TEST$p.value)
return(VEC)})
perf.stats.by.vaccine<-arrange(perf.stats.by.vaccine, desc(AUC))
perf.stats.by.vaccine$n_correct<-perf.stats.by.vaccine$Accuracy*perf.stats.by.vaccine$n_vac
perf.stats.by.vaccine$n_fail<-perf.stats.by.vaccine$n_vac-perf.stats.by.vaccine$n_correct
perf.stats.by.vaccine<-arrange(perf.stats.by.vaccine, desc(n_vac))
perf.stats.by.vaccine
This section of code uses ggplot() to show the AUC and confidence intervals for vaccine.
ggplot(perf.stats.by.vaccine, aes(x=reorder(pathogen_vaccine_type, -AUC),
y=AUC, fill=pathogen_vaccine_type))+
geom_bar(stat='identity', show.legend = F)+
scale_fill_brewer(palette = "Set3")+
geom_errorbar(aes(ymin = AUC.CI.LOW, ymax = AUC.CI.HIGH),width = 0.5)+
labs(y="AUC (CI)", x = "Vaccine")+
theme_bw()+
theme(axis.text.x = element_text(angle = 45, hjust = 1, size=8))+
geom_text(aes(y=0.05, label=n_vac))+
ylim(0,1)
sessionInfo()
## R version 4.4.2 (2024-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 10 x64 (build 19045)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] randomForest_4.7-1.2 limma_3.62.2 caret_7.0-1
## [4] lattice_0.22-6 ggplot2_3.5.1 forcats_1.0.0
## [7] dplyr_1.1.4 plyr_1.8.9 rlang_1.1.5
## [10] data.table_1.16.4 Biobase_2.66.0 BiocGenerics_0.52.0
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 xfun_0.51 bslib_0.9.0
## [4] recipes_1.1.1 vctrs_0.6.5 tools_4.4.2
## [7] generics_0.1.3 stats4_4.4.2 parallel_4.4.2
## [10] proxy_0.4-27 tibble_3.2.1 pkgconfig_2.0.3
## [13] ModelMetrics_1.2.2.2 Matrix_1.7-1 RColorBrewer_1.1-3
## [16] lifecycle_1.0.4 farver_2.1.2 compiler_4.4.2
## [19] stringr_1.5.1 statmod_1.5.0 munsell_0.5.1
## [22] codetools_0.2-20 htmltools_0.5.8.1 class_7.3-22
## [25] sass_0.4.10 yaml_2.3.10 prodlim_2024.06.25
## [28] pillar_1.10.1 jquerylib_0.1.4 MASS_7.3-61
## [31] cachem_1.1.0 gower_1.0.2 iterators_1.0.14
## [34] rpart_4.1.23 foreach_1.5.2 nlme_3.1-166
## [37] parallelly_1.42.0 lava_1.8.1 tidyselect_1.2.1
## [40] digest_0.6.37 stringi_1.8.4 future_1.34.0
## [43] reshape2_1.4.4 purrr_1.0.4 listenv_0.9.1
## [46] labeling_0.4.3 splines_4.4.2 fastmap_1.2.0
## [49] grid_4.4.2 colorspace_2.1-1 cli_3.6.4
## [52] magrittr_2.0.3 survival_3.7-0 e1071_1.7-16
## [55] future.apply_1.11.3 withr_3.0.2 scales_1.3.0
## [58] lubridate_1.9.4 timechange_0.3.0 rmarkdown_2.29
## [61] globals_0.16.3 nnet_7.3-19 timeDate_4041.110
## [64] evaluate_1.0.3 knitr_1.49 hardhat_1.4.1
## [67] Rcpp_1.0.14 glue_1.8.0 pROC_1.18.5
## [70] ipred_0.9-15 jsonlite_2.0.0 R6_2.6.1