make permutation importance test optional, functionality to use any outcome variables, optional hyperparameter input #11

Open
wants to merge 11 commits into base: master
1 change: 1 addition & 0 deletions README.md
@@ -87,6 +87,7 @@ cd DeepLearning
2. `Rscript code/learning/main.R` sources 4 other scripts that are part of the pipeline.

* To choose the model and model hyperparameters: `source('code/learning/model_selection.R')`
* This function (and `get_results`) also takes an optional argument, `hyperparameters`, to specify your own hyperparameters to use for cross-validation. This argument should be a list whose names are the hyperparameter names and whose values are the values to be tested (a minimal sketch is given below).

Depending on your ML task, the model hyperparameter range to tune will be different. This is hard-coded for our study but will be updated to accept a user-defined range in the future (Issue #10).
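A minimal sketch of how the new `hyperparameters` argument could be supplied (the cost values below are illustrative, not recommendations; `data` and `seed` are assumed to be defined as in `main.R`):

```r
# Hypothetical example: tune L2 logistic regression over a custom cost grid.
# The list names must match the hyperparameter names used by the chosen model.
my_grid <- list(cost = c(0.01, 0.1, 1))
results <- get_results(data, "L2_Logistic_Regression", seed,
                       hyperparameters = my_grid)
```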

4 changes: 2 additions & 2 deletions code/learning/generateAUCs.R
@@ -34,9 +34,9 @@
######################################################################
#------------------------- DEFINE FUNCTION -------------------#
######################################################################
get_results <- function(dataset, models, split_number){
get_results <- function(dataset, models, split_number, perm=T, outcome=NA, hyperparameters=NULL){
# Save results of the modeling pipeline as a list
results <- pipeline(dataset, models, split_number)
results <- pipeline(dataset, models, split_number, outcome=outcome, perm=perm, hyperparameters=hyperparameters)
# These results have
# 1. cv_auc,
# 2. test_auc
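As a brief sketch (assuming the new defaults shown above, with `data` and `seed` defined as in `main.R`), existing three-argument calls keep their previous behaviour, while the new arguments are opt-in:

```r
# Defaults: permutation importance on, outcome taken from the first column.
get_results(data, "Random_Forest", seed)

# Skip the permutation test and name the outcome column explicitly.
get_results(data, "Random_Forest", seed, perm = FALSE, outcome = "dx")
```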
16 changes: 12 additions & 4 deletions code/learning/main.R
@@ -74,10 +74,10 @@ data <- inner_join(meta, shared, by=c("sample"="Group")) %>%
select(-sample, -Dx_Bin, -fit_result) %>%
drop_na()
# We want the diagnosis column to be a factor
data$dx <- factor(data$dx)
# data$dx <- factor(data$dx)
# We want the first sample to be a cancer case, so we shuffle the dataset with a specific seed to get cancer as the first sample
set.seed(0)
data <- data[sample(1:nrow(data)), ]
# set.seed(0)
# data <- data[sample(1:nrow(data)), ]
###################################################################

######################## RUN PIPELINE #############################
@@ -98,6 +98,11 @@ data <- data[sample(1:nrow(data)), ]
input <- commandArgs(trailingOnly=TRUE)
seed <- as.numeric(input[1])
model <- input[2]
permutation <- as.numeric(input[3]) # 1 if permutation performed
# 0 if permutation not performed
outcome <- input[4]



# Then arguments 1 and 2 will be placed respectively into the functions:
# 1. set.seed() : creates reproducibility and variability
@@ -108,7 +113,10 @@ set.seed(seed)
# Start walltime for running model
tic("model")
# Run the model
get_results(data, model, seed)
# The user can define the outcome and whether to run the permutation test here:
# example: get_results(data, model, seed, 0, "dx")
# or pass NA if no argument is given on the command line
get_results(data, model, seed, permutation, outcome)
# Stop walltime for running model
secs <- toc()
# Save elapsed time
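For reference, a hedged sketch of how the two new command-line arguments are consumed (the invocation below is illustrative; the script path follows the README):

```r
# Hypothetical invocation:
#   Rscript code/learning/main.R 1 L2_Logistic_Regression 0 dx
# main.R would then parse the arguments as:
input <- c("1", "L2_Logistic_Regression", "0", "dx")
seed        <- as.numeric(input[1])  # 1
model       <- input[2]              # "L2_Logistic_Regression"
permutation <- as.numeric(input[3])  # 0 -> skip the permutation importance test
outcome     <- input[4]              # "dx"
get_results(data, model, seed, permutation, outcome)
```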
102 changes: 73 additions & 29 deletions code/learning/model_pipeline.R
@@ -37,7 +37,18 @@
######################################################################


pipeline <- function(dataset, model, split_number){
pipeline <- function(dataset, model, split_number, outcome=NA, hyperparameters=NULL, perm=T){

# -----------------------Get outcome variable----------------------------->
# If no outcome specified, use first column in dataset
if(is.na(outcome)){
outcome <- colnames(dataset)[1]
}else{
# check to see if outcome is in column names of dataset
if(!outcome %in% names(dataset)){
stop(paste('Outcome',outcome,'not in column names of dataset.'))
}
}

# ------------------Pre-process the full Dataset------------------------->
# We are doing the pre-processing on the full dataset and then splitting 80-20
@@ -46,20 +57,29 @@ pipeline <- function(dataset, model, split_number){
dataTransformed <- predict(preProcValues, dataset)
# ----------------------------------------------------------------------->

# Get outcome variables
first_outcome = as.character(dataset[,outcome][1])
outcome_vals = unique(dataset[,outcome])
if(length(outcome_vals) != 2) stop('A binary outcome variable is required.')
second_outcome = as.character(outcome_vals[!outcome_vals == first_outcome])
print(paste(c('first outcome:','second outcome:'),c(first_outcome,second_outcome)))


# ------------------80-20 Datasplit for each seed------------------------->
# Do the 80-20 data-split
# Stratified data partitioning: 80% training - 20% testing
inTraining <- createDataPartition(dataTransformed$dx, p = .80, list = FALSE)
inTraining <- createDataPartition(dataTransformed[,outcome], p = .80, list = FALSE)
trainTransformed <- dataTransformed[ inTraining,]
testTransformed <- dataTransformed[-inTraining,]
# ----------------------------------------------------------------------->

# -------------Define hyper-parameter and cv settings-------------------->
# Define hyper-parameter tuning grid and the training method
# Uses function tuning_grid() in file ('code/learning/model_selection.R')
grid <- tuning_grid(trainTransformed, model)[[1]]
method <- tuning_grid(trainTransformed, model)[[2]]
cv <- tuning_grid(trainTransformed, model)[[3]]
tune <- tuning_grid(trainTransformed, model, outcome, hyperparameters)
grid <- tune[[1]]
method <- tune[[2]]
cv <- tune[[3]]
# ----------------------------------------------------------------------->

# ---------------------------Train the model ---------------------------->
@@ -85,11 +105,15 @@ pipeline <- function(dataset, model, split_number){
# - If the model is random forest, we need to add a ntree=1000 parameter.
# We chose ntree=1000 empirically.
# ----------------------------------------------------------------------->
# Make formula based on outcome
f <- as.formula(paste(outcome, '~ .'))
print('Machine learning formula:')
print(f)
# Start walltime for training model
tic("train")
if(model=="L2_Logistic_Regression"){
print(model)
trained_model <- train(dx ~ ., # label
trained_model <- train(f, # label
data=trainTransformed, #total data
method = method,
trControl = cv,
@@ -99,7 +123,7 @@ }
}
else if(model=="Random_Forest"){
print(model)
trained_model <- train(dx ~ .,
trained_model <- train(f,
data=trainTransformed,
method = method,
trControl = cv,
@@ -109,7 +133,7 @@ }
}
else{
print(model)
trained_model <- train(dx ~ .,
trained_model <- train(f,
data=trainTransformed,
method = method,
trControl = cv,
@@ -133,27 +157,47 @@ pipeline <- function(dataset, model, split_number){
# if linear: Output the weights of features of linear models
# else: Output the feature importances based on random permutation for non-linear models
# Here we look at the top 20 important features
if(model=="L1_Linear_SVM" || model=="L2_Linear_SVM" || model=="L2_Logistic_Regression"){
# We will use the permutation_importance function here to:
# 1. Predict held-out test-data
# 2. Calculate ROC and AUROC values on this prediction
# 3. Get the feature importances for correlated and uncorrelated features
roc_results <- permutation_importance(trained_model, testTransformed)
test_auc <- roc_results[[1]] # Predict the base test importance
feature_importance_non_cor <- roc_results[2] # save permutation results
# Get feature weights
feature_importance_cor <- trained_model$finalModel$W
}
else{
# We will use the permutation_importance function here to:
# 1. Predict held-out test-data
# 2. Calculate ROC and AUROC values on this prediction
# 3. Get the feature importances for correlated and uncorrelated features
roc_results <- permutation_importance(trained_model, testTransformed)
test_auc <- roc_results[[1]] # Predict the base test importance
feature_importance_non_cor <- roc_results[2] # save permutation results of non-cor
feature_importance_cor <- roc_results[3] # save permutation results of cor
}
if(perm==T){
if(model=="L1_Linear_SVM" || model=="L2_Linear_SVM" || model=="L2_Logistic_Regression"){
# We will use the permutation_importance function here to:
# 1. Predict held-out test-data
# 2. Calculate ROC and AUROC values on this prediction
# 3. Get the feature importances for correlated and uncorrelated features
roc_results <- permutation_importance(trained_model, testTransformed, first_outcome)
test_auc <- roc_results[[1]] # Predict the base test importance
feature_importance_non_cor <- roc_results[2] # save permutation results
# Get feature weights
feature_importance_cor <- trained_model$finalModel$W
}
else{
# We will use the permutation_importance function here to:
# 1. Predict held-out test-data
# 2. Calculate ROC and AUROC values on this prediction
# 3. Get the feature importances for correlated and uncorrelated features
roc_results <- permutation_importance(trained_model, testTransformed, first_outcome)
test_auc <- roc_results[[1]] # Predict the base test importance
feature_importance_non_cor <- roc_results[2] # save permutation results of non-cor
feature_importance_cor <- roc_results[3] # save permutation results of cor
}
}else{
print("No permutation test being performed.")
if(model=="L1_Linear_SVM" || model=="L2_Linear_SVM" || model=="L2_Logistic_Regression"){
# Get feature weights
feature_importance_non_cor <- trained_model$finalModel$W
# Get feature weights
feature_importance_cor <- trained_model$finalModel$W
}else{
# No permutation test, so feature importances are not computed for this model type
feature_importance_non_cor <- NULL
feature_importance_cor <- NULL
}
# Calculate the test-auc for the actual pre-processed held-out data
rpartProbs <- predict(trained_model, testTransformed, type="prob")
test_roc <- roc(ifelse(testTransformed[,outcome] == first_outcome, 1, 0), rpartProbs[[1]])
test_auc <- test_roc$auc
}

# ---------------------------------------------------------------------------------->

# ----------------------------Save metrics as vector ------------------------------->
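A short sketch (argument values assumed, not taken from the PR) of calling `pipeline()` directly with the new arguments; note that the chosen outcome column must contain exactly two unique values:

```r
# Hypothetical direct call: name the outcome, skip permutation importance,
# and supply a custom cost grid. 'data' is the joined OTU table from main.R.
res <- pipeline(data, "L2_Logistic_Regression", split_number = 1,
                outcome = "dx", perm = FALSE,
                hyperparameters = list(cost = c(0.1, 1)))
# An outcome column with anything other than two unique values stops with
# "A binary outcome variable is required."
```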
84 changes: 68 additions & 16 deletions code/learning/model_selection.R
@@ -33,8 +33,15 @@
######################################################################
#------------------------- DEFINE FUNCTION -------------------#
######################################################################
tuning_grid <- function(train_data, model){
tuning_grid <- function(train_data, model, outcome, hyperparameters=NULL){

# NOTE: Hyperparameters should be a list where the names of the list are the
# hyperparameters and the values are the values to be tested

# set outcome as first column if null
#if(is.null(outcome)){
# outcome <- colnames(train_data)[1]
#}

# -------------------------CV method definition--------------------------------------->
# ADDED cv index to make sure
@@ -45,7 +52,7 @@ tuning_grid <- function(train_data, model){
# 2. Return 2class summary and save predictions to calculate cvROC
# 3. Save the predictions and class probabilities/decision values.
folds <- 5
cvIndex <- createMultiFolds(factor(train_data$dx), folds, times=100)
cvIndex <- createMultiFolds(factor(train_data[,outcome]), folds, times=100)
cv <- trainControl(method="repeatedcv",
number=folds,
index = cvIndex,
@@ -126,44 +133,89 @@

# Grid and caret method defined for each classification models
if(model=="L2_Logistic_Regression") {
grid <- expand.grid(cost = c(0.0001, 0.001, 0.0025, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5, 1, 10),
if(is.null(hyperparameters)){
hyperparameters <- list()
hyperparameters$cost <- c(0.0001, 0.001, 0.0025, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5, 1, 10) # maybe change these default parameters?
}
grid <- expand.grid(cost = hyperparameters$cost,
loss = "L2_primal",
# This chooses type=0 for liblinear R package
# which is logistic loss, primal solve for L2 regularized logistic regression
epsilon = 0.01) #default epsilon recommended from liblinear
method <- "regLogistic"
}
else if (model=="L1_Linear_SVM"){ #
grid <- expand.grid(cost = c(0.0001, 0.001, 0.01, 0.015, 0.025, 0.05, 0.1, 0.5, 1),
if(is.null(hyperparameters)){
hyperparameters <- list()
hyperparameters$cost <- c(0.0001, 0.001, 0.01, 0.015, 0.025, 0.05, 0.1, 0.5, 1) # maybe change these default parameters?
}
grid <- expand.grid(cost = hyperparameters$cost,
Loss = "L2")
method <- "svmLinear4" # I wrote this function in caret
}
else if (model=="L2_Linear_SVM"){
grid <- expand.grid(cost = c(0.0001, 0.0005, 0.001, 0.0025, 0.005, 0.01, 0.025, 0.05, 0.1, 0.5, 1),
if(is.null(hyperparameters)){
hyperparameters <- list()
hyperparameters$cost <- c(0.0001, 0.0005, 0.001, 0.0025, 0.005, 0.01, 0.025, 0.05, 0.1, 0.5, 1) # maybe change these default parameters?
}
grid <- expand.grid(cost = hyperparameters$cost,
Loss = "L2")
method <- "svmLinear3" # I changed this function in caret
}
else if (model=="RBF_SVM"){
grid <- expand.grid(sigma = c(0.00000001, 0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1),
C = c(0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10))
if(is.null(hyperparameters)){
hyperparameters <- list()
hyperparameters$sigma <- c(0.00000001, 0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1) # maybe change these default parameters?
hyperparameters$C <- c(0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10) # maybe change these default parameters?
}
grid <- expand.grid(sigma = hyperparameters$sigma,
C = hyperparameters$C)
method <-"svmRadial"
}
else if (model=="Decision_Tree"){
grid <- expand.grid(maxdepth = c(1,2,3,4,5,6))
if(is.null(hyperparameters)){
hyperparameters <- list()
hyperparameters$maxdepth <- c(1,2,3,4,5,6)
}
grid <- expand.grid(maxdepth = hyperparameters$maxdepth) # maybe change these default parameters?
method <-"rpart2"
}
else if (model=="Random_Forest"){
grid <- expand.grid(mtry = c(80,500,1000,1500))
if(is.null(hyperparameters)){
# get number of features
n_features <- ncol(train_data) - 1
if(n_features > 20000) n_features <- 20000
# if few features
if(n_features < 19){ mtry <- 1:6
} else {
# if many features
mtry <- floor(seq(1, n_features, length=6))
}
# only keep mtry values that do not exceed the number of features
hyperparameters <- list()
hyperparameters$mtry <- mtry[mtry <= n_features]
}
grid <- expand.grid(mtry = hyperparameters$mtry)
method = "rf"
}
else if (model=="XGBoost"){
grid <- expand.grid(nrounds=500,
gamma=0,
eta=c(0.001, 0.01, 0.1, 1),
max_depth=8,
colsample_bytree= 0.8,
min_child_weight=1,
subsample=c(0.4, 0.5, 0.6, 0.7))
if(is.null(hyperparameters)){
hyperparameters <- list()
hyperparameters$nrounds <- 500 # maybe change these default parameters?
hyperparameters$gamma <- 0 # maybe change these default parameters?
hyperparameters$eta <- c(0.001, 0.01, 0.1, 1) # maybe change these default parameters?
hyperparameters$max_depth <- 8 # maybe change these default parameters?
hyperparameters$colsample_bytree <- 0.8 # maybe change these default parameters?
hyperparameters$min_child_weight <- 1 # maybe change these default parameters?
hyperparameters$subsample <- c(0.4, 0.5, 0.6, 0.7) # maybe change these default parameters?
}
grid <- expand.grid(nrounds=hyperparameters$nrounds,
gamma=hyperparameters$gamma,
eta=hyperparameters$eta,
max_depth=hyperparameters$max_depth,
colsample_bytree=hyperparameters$colsample_bytree,
min_child_weight=hyperparameters$min_child_weight,
subsample=hyperparameters$subsample)
method <- "xgbTree"
}
else {
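A hedged sketch of overriding one model's grid through `tuning_grid()` (the sigma and C values are illustrative):

```r
# Hypothetical example: a custom RBF-SVM grid instead of the defaults above.
custom <- list(sigma = c(0.001, 0.01), C = c(0.1, 1, 10))
tune   <- tuning_grid(trainTransformed, "RBF_SVM", outcome = "dx",
                      hyperparameters = custom)
grid   <- tune[[1]]  # expand.grid(sigma, C) -> 2 x 3 = 6 candidate settings
method <- tune[[2]]  # "svmRadial"
cv     <- tune[[3]]  # repeated 5-fold cross-validation settings
```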
20 changes: 14 additions & 6 deletions code/learning/permutation_importance.R
@@ -50,12 +50,17 @@ for (dep in deps){


####################### DEFINE FUNCTION #############################
permutation_importance <- function(model, full){
permutation_importance <- function(model, full, first_outcome, outcome=NULL){

# Set outcome as first column if null
if(is.null(outcome)){
outcome <- colnames(full)[1]
}

# -----------Get the original testAUC from held-out test data--------->
# Calculate the test-auc for the actual pre-processed held-out data
rpartProbs <- predict(model, full, type="prob")
base_roc <- roc(ifelse(full$dx == "cancer", 1, 0), rpartProbs[[1]])
base_roc <- roc(ifelse(full[,outcome] == first_outcome, 1, 0), rpartProbs[[1]])
base_auc <- base_roc$auc
# -------------------------------------------------------------------->

@@ -78,8 +83,11 @@ permutation_importance <- function(model, full){
# Remove those names as columns from full test data
# Remove the diagnosis column to only keep non-correlated features
non_correlated_otus <- full %>%
select(-correlated_otus) %>%
select(-dx) %>%
select(-correlated_otus)

non_correlated_otus[,outcome] <- NULL

non_correlated_otus <- non_correlated_otus %>%
colnames()
# -------------------------------------------------------------------->

@@ -101,7 +109,7 @@ permutation_importance <- function(model, full){
# Predict the diagnosis outcome with the one-feature-permuted test dataset
rpartProbs_permuted <- predict(model, full_permuted, type="prob")
# Calculate the new auc
new_auc <- roc(ifelse(full_permuted$dx == "cancer", 1, 0), rpartProbs_permuted[[1]])$auc
new_auc <- roc(ifelse(full_permuted[,outcome] == first_outcome, 1, 0), rpartProbs_permuted[[1]])$auc
# Return how permuting this feature affects the auc
return(new_auc)
}))
@@ -182,7 +190,7 @@ permutation_importance <- function(model, full){
# Predict the diagnosis outcome with the group-permuted test dataset
rpartProbs_permuted_corr <- predict(model, full_permuted_corr, type="prob")
# Calculate the new auc
new_auc <- roc(ifelse(full_permuted_corr$dx == "cancer", 1, 0), rpartProbs_permuted_corr[[1]])$auc
new_auc <- roc(ifelse(full_permuted_corr[,outcome] == first_outcome, 1, 0), rpartProbs_permuted_corr[[1]])$auc
list <- list(new_auc, unlist(i))
return(list)
}))
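Finally, a sketch of how the updated `permutation_importance()` is called from `model_pipeline.R`; note that the optional `outcome` argument defaults to the first column of the test data when it is not supplied:

```r
# As used in model_pipeline.R: testTransformed is the held-out 20% split and
# first_outcome is the first value of the outcome column (e.g. "cancer").
roc_results <- permutation_importance(trained_model, testTransformed, first_outcome)
test_auc                   <- roc_results[[1]]  # AUROC on the unpermuted test data
feature_importance_non_cor <- roc_results[2]    # per-feature permutation results
feature_importance_cor     <- roc_results[3]    # grouped (correlated-feature) permutation results
# (for the linear models, the pipeline reads weights from trained_model$finalModel$W instead)
```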