make permutation importance test optional, functionality to use any outcome variables, optional hyperparameter input #11

Open
wants to merge 11 commits into base: master
1 change: 1 addition & 0 deletions README.md
@@ -87,6 +87,7 @@ cd DeepLearning
2. `Rscript code/learning/main.R` sources 4 other scripts that are part of the pipeline.

* To choose the model and model hyperparameters: `source('code/learning/model_selection.R')`
* This function (and `get_results`) also takes an optional argument, `hyperparameters`, to specify your own hyperparameters to use for cross-validation. This argument should be a list whose names are the hyperparameter names and whose values are the values to be tested (a minimal sketch is given below).

Depending on your ML task, the model hyperparameter range to tune will be different. This is hard-coded for our study but will be updated to accept a user-defined range in the future (Issue #10).
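A minimal sketch of how the new `hyperparameters` argument could be supplied (the cost values below are illustrative, not recommendations; `data` and `seed` are assumed to be defined as in `main.R`):

```r
# Hypothetical example: tune L2 logistic regression over a custom cost grid.
# The list names must match the hyperparameter names used by the chosen model.
my_grid <- list(cost = c(0.01, 0.1, 1))
results <- get_results(data, "L2_Logistic_Regression", seed,
                       hyperparameters = my_grid)
```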

4 changes: 2 additions & 2 deletions code/learning/generateAUCs.R
@@ -34,9 +34,9 @@
######################################################################
#------------------------- DEFINE FUNCTION -------------------#
######################################################################
get_results <- function(dataset, models, split_number){
get_results <- function(dataset, models, split_number, perm=T, outcome=NA, hyperparameters=NULL){
# Save results of the modeling pipeline as a list
results <- pipeline(dataset, models, split_number)
results <- pipeline(dataset, models, split_number, outcome=outcome, perm=perm, hyperparameters=hyperparameters)
# These results have
# 1. cv_auc,
# 2. test_auc
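As a brief sketch (assuming the new defaults shown above, with `data` and `seed` defined as in `main.R`), existing three-argument calls keep their previous behaviour, while the new arguments are opt-in:

```r
# Defaults: permutation importance on, outcome taken from the first column.
get_results(data, "Random_Forest", seed)

# Skip the permutation test and name the outcome column explicitly.
get_results(data, "Random_Forest", seed, perm = FALSE, outcome = "dx")
```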
16 changes: 12 additions & 4 deletions code/learning/main.R
@@ -74,10 +74,10 @@ data <- inner_join(meta, shared, by=c("sample"="Group")) %>%
select(-sample, -Dx_Bin, -fit_result) %>%
drop_na()
# We want the diagnosis column to be a factor
data$dx <- factor(data$dx)
# data$dx <- factor(data$dx)
# We want the first sample to be a cancer case, so we shuffle the dataset with a specific seed to get cancer as the first sample
set.seed(0)
data <- data[sample(1:nrow(data)), ]
# set.seed(0)
# data <- data[sample(1:nrow(data)), ]
###################################################################

######################## RUN PIPELINE #############################
@@ -98,6 +98,11 @@ data <- data[sample(1:nrow(data)), ]
input <- commandArgs(trailingOnly=TRUE)
seed <- as.numeric(input[1])
model <- input[2]
permutation <- as.numeric(input[3]) # 1 if permutation performed
# 0 if permutation not performed
outcome <- input[4]



# Then arguments 1 and 2 will be placed respectively into the functions:
# 1. set.seed() : creates reproducibility and variability
@@ -108,7 +113,10 @@ set.seed(seed)
# Start walltime for running model
tic("model")
# Run the model
get_results(data, model, seed)
# The user can define the outcome and whether to run the permutation test here:
# example: get_results(data, model, seed, 0, "dx")
# or pass NA if no argument is given on the command line
get_results(data, model, seed, permutation, outcome)
# Stop walltime for running model
secs <- toc()
# Save elapsed time
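For reference, a hedged sketch of how the two new command-line arguments are consumed (the invocation below is illustrative; the script path follows the README):

```r
# Hypothetical invocation:
#   Rscript code/learning/main.R 1 L2_Logistic_Regression 0 dx
# main.R would then parse the arguments as:
input <- c("1", "L2_Logistic_Regression", "0", "dx")
seed        <- as.numeric(input[1])  # 1
model       <- input[2]              # "L2_Logistic_Regression"
permutation <- as.numeric(input[3])  # 0 -> skip the permutation importance test
outcome     <- input[4]              # "dx"
get_results(data, model, seed, permutation, outcome)
```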
102 changes: 73 additions & 29 deletions code/learning/model_pipeline.R
@@ -37,7 +37,18 @@
######################################################################


pipeline <- function(dataset, model, split_number){
pipeline <- function(dataset, model, split_number, outcome=NA, hyperparameters=NULL, perm=T){

# -----------------------Get outcome variable----------------------------->
# If no outcome specified, use first column in dataset
if(is.na(outcome)){
outcome <- colnames(dataset)[1]
}else{
# check to see if outcome is in column names of dataset
if(!outcome %in% names(dataset)){
stop(paste('Outcome',outcome,'not in column names of dataset.'))
}
}

# ------------------Pre-process the full Dataset------------------------->
# We are doing the pre-processing on the full dataset and then splitting 80-20
@@ -46,20 +57,29 @@ pipeline <- function(dataset, model, split_number){
dataTransformed <- predict(preProcValues, dataset)
# ----------------------------------------------------------------------->

# Get outcome variables
first_outcome = as.character(dataset[,outcome][1])
outcome_vals = unique(dataset[,outcome])
if(length(outcome_vals) != 2) stop('A binary outcome variable is required.')
second_outcome = as.character(outcome_vals[!outcome_vals == first_outcome])
print(paste(c('first outcome:','second outcome:'),c(first_outcome,second_outcome)))


# ------------------80-20 Datasplit for each seed------------------------->
# Do the 80-20 data-split
# Stratified data partitioning: 80% training - 20% testing
inTraining <- createDataPartition(dataTransformed$dx, p = .80, list = FALSE)
inTraining <- createDataPartition(dataTransformed[,outcome], p = .80, list = FALSE)
trainTransformed <- dataTransformed[ inTraining,]
testTransformed <- dataTransformed[-inTraining,]
# ----------------------------------------------------------------------->

# -------------Define hyper-parameter and cv settings-------------------->
# Define hyper-parameter tuning grid and the training method
# Uses function tuning_grid() in file ('code/learning/model_selection.R')
grid <- tuning_grid(trainTransformed, model)[[1]]
method <- tuning_grid(trainTransformed, model)[[2]]
cv <- tuning_grid(trainTransformed, model)[[3]]
tune <- tuning_grid(trainTransformed, model, outcome, hyperparameters)
grid <- tune[[1]]
method <- tune[[2]]
cv <- tune[[3]]
# ----------------------------------------------------------------------->

# ---------------------------Train the model ---------------------------->
@@ -85,11 +105,15 @@ pipeline <- function(dataset, model, split_number){
# - If the model is random forest, we need to add a ntree=1000 parameter.
# We chose ntree=1000 empirically.
# ----------------------------------------------------------------------->
# Make formula based on outcome
f <- as.formula(paste(outcome, '~ .'))
print('Machine learning formula:')
print(f)
# Start walltime for training model
tic("train")
if(model=="L2_Logistic_Regression"){
print(model)
trained_model <- train(dx ~ ., # label
trained_model <- train(f, # label
data=trainTransformed, #total data
method = method,
trControl = cv,
@@ -99,7 +123,7 @@ }
}
else if(model=="Random_Forest"){
print(model)
trained_model <- train(dx ~ .,
trained_model <- train(f,
data=trainTransformed,
method = method,
trControl = cv,
@@ -109,7 +133,7 @@ }
}
else{
print(model)
trained_model <- train(dx ~ .,
trained_model <- train(f,
data=trainTransformed,
method = method,
trControl = cv,
@@ -133,27 +157,47 @@ pipeline <- function(dataset, model, split_number){
# if linear: Output the weights of features of linear models
# else: Output the feature importances based on random permutation for non-linear models
# Here we look at the top 20 important features
if(model=="L1_Linear_SVM" || model=="L2_Linear_SVM" || model=="L2_Logistic_Regression"){
# We will use the permutation_importance function here to:
# 1. Predict held-out test-data
# 2. Calculate ROC and AUROC values on this prediction
# 3. Get the feature importances for correlated and uncorrelated features
roc_results <- permutation_importance(trained_model, testTransformed)
test_auc <- roc_results[[1]] # Predict the base test importance
feature_importance_non_cor <- roc_results[2] # save permutation results
# Get feature weights
feature_importance_cor <- trained_model$finalModel$W
}
else{
# We will use the permutation_importance function here to:
# 1. Predict held-out test-data
# 2. Calculate ROC and AUROC values on this prediction
# 3. Get the feature importances for correlated and uncorrelated features
roc_results <- permutation_importance(trained_model, testTransformed)
test_auc <- roc_results[[1]] # Predict the base test importance
feature_importance_non_cor <- roc_results[2] # save permutation results of non-cor
feature_importance_cor <- roc_results[3] # save permutation results of cor
}
if(perm==T){
if(model=="L1_Linear_SVM" || model=="L2_Linear_SVM" || model=="L2_Logistic_Regression"){
# We will use the permutation_importance function here to:
# 1. Predict held-out test-data
# 2. Calculate ROC and AUROC values on this prediction
# 3. Get the feature importances for correlated and uncorrelated features
roc_results <- permutation_importance(trained_model, testTransformed, first_outcome)
test_auc <- roc_results[[1]] # Predict the base test importance
feature_importance_non_cor <- roc_results[2] # save permutation results
# Get feature weights
feature_importance_cor <- trained_model$finalModel$W
}
else{
# We will use the permutation_importance function here to:
# 1. Predict held-out test-data
# 2. Calculate ROC and AUROC values on this prediction
# 3. Get the feature importances for correlated and uncorrelated features
roc_results <- permutation_importance(trained_model, testTransformed, first_outcome)
test_auc <- roc_results[[1]] # Predict the base test importance
feature_importance_non_cor <- roc_results[2] # save permutation results of non-cor
feature_importance_cor <- roc_results[3] # save permutation results of cor
}
}else{
print("No permutation test being performed.")
if(model=="L1_Linear_SVM" || model=="L2_Linear_SVM" || model=="L2_Logistic_Regression"){
# Get feature weights
feature_importance_non_cor <- trained_model$finalModel$W
# Get feature weights
feature_importance_cor <- trained_model$finalModel$W
}else{
# No permutation test, so feature importances are not computed for this model type
feature_importance_non_cor <- NULL
feature_importance_cor <- NULL
}
# Calculate the test-auc for the actual pre-processed held-out data
rpartProbs <- predict(trained_model, testTransformed, type="prob")
test_roc <- roc(ifelse(testTransformed[,outcome] == first_outcome, 1, 0), rpartProbs[[1]])
test_auc <- test_roc$auc
}

# ---------------------------------------------------------------------------------->

# ----------------------------Save metrics as vector ------------------------------->
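A short sketch (argument values assumed, not taken from the PR) of calling `pipeline()` directly with the new arguments; note that the chosen outcome column must contain exactly two unique values:

```r
# Hypothetical direct call: name the outcome, skip permutation importance,
# and supply a custom cost grid. 'data' is the joined OTU table from main.R.
res <- pipeline(data, "L2_Logistic_Regression", split_number = 1,
                outcome = "dx", perm = FALSE,
                hyperparameters = list(cost = c(0.1, 1)))
# An outcome column with anything other than two unique values stops with
# "A binary outcome variable is required."
```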
84 changes: 68 additions & 16 deletions code/learning/model_selection.R
@@ -33,8 +33,15 @@
######################################################################
#------------------------- DEFINE FUNCTION -------------------#
######################################################################
tuning_grid <- function(train_data, model){
tuning_grid <- function(train_data, model, outcome, hyperparameters=NULL){

# NOTE: Hyperparameters should be a list where the names of the list are the
# hyperparameters and the values are the values to be tested

# set outcome as first column if null
#if(is.null(outcome)){
# outcome <- colnames(train_data)[1]
#}

# -------------------------CV method definition--------------------------------------->
# ADDED cv index to make sure
@@ -45,7 +52,7 @@ tuning_grid <- function(train_data, model){
# 2. Return 2class summary and save predictions to calculate cvROC
# 3. Save the predictions and class probabilities/decision values.
folds <- 5
cvIndex <- createMultiFolds(factor(train_data$dx), folds, times=100)
cvIndex <- createMultiFolds(factor(train_data[,outcome]), folds, times=100)
cv <- trainControl(method="repeatedcv",
number=folds,
index = cvIndex,
@@ -126,44 +133,89 @@

# Grid and caret method defined for each classification models
if(model=="L2_Logistic_Regression") {
grid <- expand.grid(cost = c(0.0001, 0.001, 0.0025, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5, 1, 10),
if(is.null(hyperparameters)){
hyperparameters <- list()
hyperparameters$cost <- c(0.0001, 0.001, 0.0025, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5, 1, 10) # maybe change these default parameters?
}
grid <- expand.grid(cost = hyperparameters$cost,
loss = "L2_primal",
# This chooses type=0 for liblinear R package
# which is logistic loss, primal solve for L2 regularized logistic regression
epsilon = 0.01) #default epsilon recommended from liblinear
method <- "regLogistic"
}
else if (model=="L1_Linear_SVM"){ #
grid <- expand.grid(cost = c(0.0001, 0.001, 0.01, 0.015, 0.025, 0.05, 0.1, 0.5, 1),
if(is.null(hyperparameters)){
hyperparameters <- list()
hyperparameters$cost <- c(0.0001, 0.001, 0.01, 0.015, 0.025, 0.05, 0.1, 0.5, 1) # maybe change these default parameters?
}
grid <- expand.grid(cost = hyperparameters$cost,
Loss = "L2")
method <- "svmLinear4" # I wrote this function in caret
}
else if (model=="L2_Linear_SVM"){
grid <- expand.grid(cost = c(0.0001, 0.0005, 0.001, 0.0025, 0.005, 0.01, 0.025, 0.05, 0.1, 0.5, 1),
if(is.null(hyperparameters)){
hyperparameters <- list()
hyperparameters$cost <- c(0.0001, 0.0005, 0.001, 0.0025, 0.005, 0.01, 0.025, 0.05, 0.1, 0.5, 1) # maybe change these default parameters?
}
grid <- expand.grid(cost = hyperparameters$cost,
Loss = "L2")
method <- "svmLinear3" # I changed this function in caret
}
else if (model=="RBF_SVM"){
grid <- expand.grid(sigma = c(0.00000001, 0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1),
C = c(0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10))
if(is.null(hyperparameters)){
hyperparameters <- list()
hyperparameters$sigma <- c(0.00000001, 0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1) # maybe change these default parameters?
hyperparameters$C <- c(0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10) # maybe change these default parameters?
}
grid <- expand.grid(sigma = hyperparameters$sigma,
C = hyperparameters$C)
method <-"svmRadial"
}
else if (model=="Decision_Tree"){
grid <- expand.grid(maxdepth = c(1,2,3,4,5,6))
if(is.null(hyperparameters)){
hyperparameters <- list()
hyperparameters$maxdepth <- c(1,2,3,4,5,6)
}
grid <- expand.grid(maxdepth = hyperparameters$maxdepth) # maybe change these default parameters?
method <-"rpart2"
}
else if (model=="Random_Forest"){
grid <- expand.grid(mtry = c(80,500,1000,1500))
if(is.null(hyperparameters)){
# get number of features
n_features <- ncol(train_data) - 1
if(n_features > 20000) n_features <- 20000
# if few features
if(n_features < 19){ mtry <- 1:6
} else {
# if many features
mtry <- floor(seq(1, n_features, length=6))
}
# only keep mtry values that do not exceed the number of features
hyperparameters <- list()
hyperparameters$mtry <- mtry[mtry <= n_features]
}
grid <- expand.grid(mtry = hyperparameters$mtry)
method = "rf"
}
else if (model=="XGBoost"){
grid <- expand.grid(nrounds=500,
gamma=0,
eta=c(0.001, 0.01, 0.1, 1),
max_depth=8,
colsample_bytree= 0.8,
min_child_weight=1,
subsample=c(0.4, 0.5, 0.6, 0.7))
if(is.null(hyperparameters)){
hyperparameters <- list()
hyperparameters$nrounds <- 500 # maybe change these default parameters?
hyperparameters$gamma <- 0 # maybe change these default parameters?
hyperparameters$eta <- c(0.001, 0.01, 0.1, 1) # maybe change these default parameters?
hyperparameters$max_depth <- 8 # maybe change these default parameters?
hyperparameters$colsample_bytree <- 0.8 # maybe change these default parameters?
hyperparameters$min_child_weight <- 1 # maybe change these default parameters?
hyperparameters$subsample <- c(0.4, 0.5, 0.6, 0.7) # maybe change these default parameters?
}
grid <- expand.grid(nrounds=hyperparameters$nrounds,
gamma=hyperparameters$gamma,
eta=hyperparameters$eta,
max_depth=hyperparameters$max_depth,
colsample_bytree=hyperparameters$colsample_bytree,
min_child_weight=hyperparameters$min_child_weight,
subsample=hyperparameters$subsample)
method <- "xgbTree"
}
else {
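A hedged sketch of overriding one model's grid through `tuning_grid()` (the sigma and C values are illustrative):

```r
# Hypothetical example: a custom RBF-SVM grid instead of the defaults above.
custom <- list(sigma = c(0.001, 0.01), C = c(0.1, 1, 10))
tune   <- tuning_grid(trainTransformed, "RBF_SVM", outcome = "dx",
                      hyperparameters = custom)
grid   <- tune[[1]]  # expand.grid(sigma, C) -> 2 x 3 = 6 candidate settings
method <- tune[[2]]  # "svmRadial"
cv     <- tune[[3]]  # repeated 5-fold cross-validation settings
```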
20 changes: 14 additions & 6 deletions code/learning/permutation_importance.R
@@ -50,12 +50,17 @@ for (dep in deps){


####################### DEFINE FUNCTION #############################
permutation_importance <- function(model, full){
permutation_importance <- function(model, full, first_outcome, outcome=NULL){

# Set outcome as first column if null
if(is.null(outcome)){
outcome <- colnames(full)[1]
}

# -----------Get the original testAUC from held-out test data--------->
# Calculate the test-auc for the actual pre-processed held-out data
rpartProbs <- predict(model, full, type="prob")
base_roc <- roc(ifelse(full$dx == "cancer", 1, 0), rpartProbs[[1]])
base_roc <- roc(ifelse(full[,outcome] == first_outcome, 1, 0), rpartProbs[[1]])
base_auc <- base_roc$auc
# -------------------------------------------------------------------->

@@ -78,8 +83,11 @@ permutation_importance <- function(model, full){
# Remove those names as columns from full test data
# Remove the diagnosis column to only keep non-correlated features
non_correlated_otus <- full %>%
select(-correlated_otus) %>%
select(-dx) %>%
select(-correlated_otus)

non_correlated_otus[,outcome] <- NULL

non_correlated_otus <- non_correlated_otus %>%
colnames()
# -------------------------------------------------------------------->

@@ -101,7 +109,7 @@ permutation_importance <- function(model, full){
# Predict the diagnosis outcome with the one-feature-permuted test dataset
rpartProbs_permuted <- predict(model, full_permuted, type="prob")
# Calculate the new auc
new_auc <- roc(ifelse(full_permuted$dx == "cancer", 1, 0), rpartProbs_permuted[[1]])$auc
new_auc <- roc(ifelse(full_permuted[,outcome] == first_outcome, 1, 0), rpartProbs_permuted[[1]])$auc
# Return how permuting this feature affects the auc
return(new_auc)
}))
@@ -182,7 +190,7 @@ permutation_importance <- function(model, full){
# Predict the diagnosis outcome with the group-permuted test dataset
rpartProbs_permuted_corr <- predict(model, full_permuted_corr, type="prob")
# Calculate the new auc
new_auc <- roc(ifelse(full_permuted_corr$dx == "cancer", 1, 0), rpartProbs_permuted_corr[[1]])$auc
new_auc <- roc(ifelse(full_permuted_corr[,outcome] == first_outcome, 1, 0), rpartProbs_permuted_corr[[1]])$auc
list <- list(new_auc, unlist(i))
return(list)
}))
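Finally, a sketch of how the updated `permutation_importance()` is called from `model_pipeline.R`; note that the optional `outcome` argument defaults to the first column of the test data when it is not supplied:

```r
# As used in model_pipeline.R: testTransformed is the held-out 20% split and
# first_outcome is the first value of the outcome column (e.g. "cancer").
roc_results <- permutation_importance(trained_model, testTransformed, first_outcome)
test_auc                   <- roc_results[[1]]  # AUROC on the unpermuted test data
feature_importance_non_cor <- roc_results[2]    # per-feature permutation results
feature_importance_cor     <- roc_results[3]    # grouped (correlated-feature) permutation results
# (for the linear models, the pipeline reads weights from trained_model$finalModel$W instead)
```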