From 3012f090d0985da577ae8da058c1caced173b606 Mon Sep 17 00:00:00 2001 From: Zena Lapp Date: Mon, 11 Nov 2019 13:13:39 -0500 Subject: [PATCH 01/11] make permutation test optional and fix hard-coding of outcome variable --- code/learning/generateAUCs.R | 4 +- code/learning/model_pipeline.R | 95 +++++++++++++++++++------- code/learning/model_selection.R | 9 ++- code/learning/permutation_importance.R | 15 ++-- 4 files changed, 87 insertions(+), 36 deletions(-) diff --git a/code/learning/generateAUCs.R b/code/learning/generateAUCs.R index e90906e..b3d0510 100644 --- a/code/learning/generateAUCs.R +++ b/code/learning/generateAUCs.R @@ -34,9 +34,9 @@ ###################################################################### #------------------------- DEFINE FUNCTION -------------------# ###################################################################### -get_results <- function(dataset, models, split_number){ +get_results <- function(dataset, models, split_number,outcome=NULL,perm=T){ # Save results of the modeling pipeline as a list - results <- pipeline(dataset, models, split_number) + results <- pipeline(dataset, models, split_number,outcome=NULL,perm=perm) # These results have # 1. cv_auc, # 2. test_auc diff --git a/code/learning/model_pipeline.R b/code/learning/model_pipeline.R index a53b658..fa6bc07 100644 --- a/code/learning/model_pipeline.R +++ b/code/learning/model_pipeline.R @@ -37,7 +37,18 @@ ###################################################################### -pipeline <- function(dataset, model, split_number){ +pipeline <- function(dataset, model, split_number, outcome=NULL, hyperparameters=NULL, perm=T){ + + # -----------------------Get outcome variable-----------------------------> + # If no outcome specified, use first column in dataset + if(is.null(outcome)){ + outcome <- colnames(dataset)[1] + }else{ + # check to see if outcome is in column names of dataset + if(!outcome %in% names(dataset)){ + stop(paste('Outcome',outcome,'not in column names of dataset.')) + } + } # ------------------Pre-process the full Dataset-------------------------> # We are doing the pre-processing to the full dataset and then splitting 80-20 @@ -46,10 +57,18 @@ pipeline <- function(dataset, model, split_number){ dataTransformed <- predict(preProcValues, dataset) # -----------------------------------------------------------------------> + # Get outcome variables + first_outcome = as.character(dataset[,outcome][1]) + outcome_vals = unique(dataset[,outcome]) + if(length(outcome_vals) != 2) stop('A binary outcome variable is required.') + second_outcome = as.character(outcome_vals[!outcome_vals == first_outcome]) + print(paste(c('first outcome:','second outcome:'),c(first_outcome,second_outcome))) + + # ------------------80-20 Datasplit for each seed-------------------------> # Do the 80-20 data-split # Stratified data partitioning %80 training - %20 testing - inTraining <- createDataPartition(dataTransformed$dx, p = .80, list = FALSE) + inTraining <- createDataPartition(dataTransformed[,outcome], p = .80, list = FALSE) trainTransformed <- dataTransformed[ inTraining,] testTransformed <- dataTransformed[-inTraining,] # -----------------------------------------------------------------------> @@ -85,11 +104,15 @@ pipeline <- function(dataset, model, split_number){ # - If the model is random forest, we need to add a ntree=1000 parameter. # We chose ntree=1000 empirically. 
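# ----------------------------------------------------------------------->
# A minimal sketch of how the new `outcome` argument behaves, assuming
# pipeline() has been sourced from code/learning/model_pipeline.R; the
# toy data frame and its column names below are hypothetical:
#
#   toy <- data.frame(dx = c("cancer", "normal", "cancer", "normal"),
#                     otu1 = rnorm(4), otu2 = rnorm(4))
#   pipeline(toy, "L2_Logistic_Regression", 1)                 # outcome defaults to "dx", the first column
#   pipeline(toy, "L2_Logistic_Regression", 1, outcome = "dx") # same outcome, passed explicitly
#   pipeline(toy, "L2_Logistic_Regression", 1, outcome = "Dx") # stops: "Outcome Dx not in column names of dataset."
# ----------------------------------------------------------------------->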
 # ----------------------------------------------------------------------->
+  # Make formula based on outcome
+  f <- as.formula(paste(outcome, '~ .'))
+  print('Machine learning formula:')
+  print(f)
   # Start walltime for training model
   tic("train")
   if(model=="L2_Logistic_Regression"){
     print(model)
-    trained_model <- train(dx ~ ., # label
+    trained_model <- train(f, # label
                            data=trainTransformed, #total data
                            method = method,
                            trControl = cv,
@@ -99,7 +122,7 @@ pipeline <- function(dataset, model, split_number){
   }
   else if(model=="Random_Forest"){
     print(model)
-    trained_model <- train(dx ~ .,
+    trained_model <- train(f,
                            data=trainTransformed,
                            method = method,
                            trControl = cv,
@@ -109,7 +132,7 @@ pipeline <- function(dataset, model, split_number){
   }
   else{
     print(model)
-    trained_model <- train(dx ~ .,
+    trained_model <- train(f,
                            data=trainTransformed,
                            method = method,
                            trControl = cv,
@@ -133,27 +156,47 @@ pipeline <- function(dataset, model, split_number){
   # if linear: Output the weights of features of linear models
   # else: Output the feature importances based on random permutation for non-linear models
   # Here we look at the top 20 important features
-  if(model=="L1_Linear_SVM" || model=="L2_Linear_SVM" || model=="L2_Logistic_Regression"){
-    # We will use the permutation_importance function here to:
-    # 1. Predict held-out test-data
-    # 2. Calculate ROC and AUROC values on this prediction
-    # 3. Get the feature importances for correlated and uncorrelated features
-    roc_results <- permutation_importance(trained_model, testTransformed)
-    test_auc <- roc_results[[1]] # Predict the base test importance
-    feature_importance_non_cor <- roc_results[2] # save permutation results
-    # Get feature weights
-    feature_importance_cor <- trained_model$finalModel$W
-  }
-  else{
-    # We will use the permutation_importance function here to:
-    # 1. Predict held-out test-data
-    # 2. Calculate ROC and AUROC values on this prediction
-    # 3. Get the feature importances for correlated and uncorrelated features
-    roc_results <- permutation_importance(trained_model, testTransformed)
-    test_auc <- roc_results[[1]] # Predict the base test importance
-    feature_importance_non_cor <- roc_results[2] # save permutation results of non-cor
-    feature_importance_cor <- roc_results[3] # save permutation results of cor
-  }
+  if(perm==T){
+    if(model=="L1_Linear_SVM" || model=="L2_Linear_SVM" || model=="L2_Logistic_Regression"){
+      # We will use the permutation_importance function here to:
+      # 1. Predict held-out test-data
+      # 2. Calculate ROC and AUROC values on this prediction
+      # 3. Get the feature importances for correlated and uncorrelated features
+      roc_results <- permutation_importance(trained_model, testTransformed, first_outcome)
+      test_auc <- roc_results[[1]] # Predict the base test importance
+      feature_importance_non_cor <- roc_results[2] # save permutation results
+      # Get feature weights
+      feature_importance_cor <- trained_model$finalModel$W
+    }
+    else{
+      # We will use the permutation_importance function here to:
+      # 1. Predict held-out test-data
+      # 2. Calculate ROC and AUROC values on this prediction
+      # 3. Get the feature importances for correlated and uncorrelated features
+      roc_results <- permutation_importance(trained_model, testTransformed, first_outcome)
+      test_auc <- roc_results[[1]] # Predict the base test importance
+      feature_importance_non_cor <- roc_results[2] # save permutation results of non-cor
+      feature_importance_cor <- roc_results[3] # save permutation results of cor
+    }
+  }else{
+    print("No permutation test being performed.")
+    if(model=="L1_Linear_SVM" || model=="L2_Linear_SVM" || model=="L2_Logistic_Regression"){
+      # Get feature weights
+      feature_importance_non_cor <- trained_model$finalModel$W
+      # Get feature weights
+      feature_importance_cor <- trained_model$finalModel$W
+    }else{
+      # No feature weights to extract for non-linear models,
+      feature_importance_non_cor <- NULL
+      # and no permutation importances when the permutation test is off
+      feature_importance_cor <- NULL
+    }
+    # Calculate the test-auc for the actual pre-processed held-out data
+    rpartProbs <- predict(trained_model, testTransformed, type="prob")
+    test_roc <- roc(ifelse(testTransformed[,outcome] == first_outcome, 1, 0), rpartProbs[[1]])
+    test_auc <- test_roc$auc
+  }
+
 # ---------------------------------------------------------------------------------->

 # ----------------------------Save metrics as vector ------------------------------->
diff --git a/code/learning/model_selection.R b/code/learning/model_selection.R
index e9f6122..7eecd6d 100644
--- a/code/learning/model_selection.R
+++ b/code/learning/model_selection.R
@@ -33,8 +33,11 @@
 ######################################################################
 #------------------------- DEFINE FUNCTION -------------------#
 ######################################################################
-tuning_grid <- function(train_data, model){
-
+tuning_grid <- function(train_data, model, outcome=NULL, hyperparameters=NULL){
+  # set outcome as first column if null
+  if(is.null(outcome)){
+    outcome <- colnames(train_data)[1]
+  }
 # -------------------------CV method definition--------------------------------------->
 # ADDED cv index to make sure
@@ -45,7 +48,7 @@
 # 2. Return 2class summary and save predictions to calculate cvROC
 # 3. Save the predictions and class probabilities/decision values.
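# ----------------------------------------------------------------------->
# A small sketch of the two outcome-agnostic pieces: the formula built in
# model_pipeline.R above and the fold index built just below (assumes caret
# is loaded; the toy data frame is hypothetical):
#
#   outcome <- "dx"
#   f <- as.formula(paste(outcome, '~ .'))   # dx ~ ., usable by caret::train()
#   toy <- data.frame(dx = factor(rep(c("cancer", "normal"), 50)), otu1 = rnorm(100))
#   cvIndex <- createMultiFolds(factor(toy[,outcome]), k = 5, times = 100)
#   length(cvIndex)   # 500 training index sets (5 folds x 100 repeats)
# ----------------------------------------------------------------------->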
 folds <- 5
-  cvIndex <- createMultiFolds(factor(train_data$dx), folds, times=100)
+  cvIndex <- createMultiFolds(factor(train_data[,outcome]), folds, times=100)
   cv <- trainControl(method="repeatedcv",
                      number=folds,
                      index = cvIndex,
diff --git a/code/learning/permutation_importance.R b/code/learning/permutation_importance.R
index aaada06..d90f974 100644
--- a/code/learning/permutation_importance.R
+++ b/code/learning/permutation_importance.R
@@ -50,12 +50,17 @@ for (dep in deps){
 ####################### DEFINE FUNCTION #############################
-permutation_importance <- function(model, full){
+permutation_importance <- function(model, full, first_outcome, outcome=NULL, perm=T){
+
+  # Set outcome as first column if null
+  if(is.null(outcome)){
+    outcome <- colnames(full)[1]
+  }

   # -----------Get the original testAUC from held-out test data--------->
   # Calculate the test-auc for the actual pre-processed held-out data
   rpartProbs <- predict(model, full, type="prob")
-  base_roc <- roc(ifelse(full$dx == "cancer", 1, 0), rpartProbs[[1]])
+  base_roc <- roc(ifelse(full[,outcome] == first_outcome, 1, 0), rpartProbs[[1]])
   base_auc <- base_roc$auc
   # -------------------------------------------------------------------->
@@ -79,7 +84,7 @@ permutation_importance <- function(model, full){
   # Remove the diagnosis column to only keep non-correlated features
   non_correlated_otus <- full %>%
     select(-correlated_otus) %>%
-    select(-dx) %>%
+    select(-sym(first_outcome)) %>%
     colnames()
   # -------------------------------------------------------------------->
@@ -101,7 +106,7 @@ permutation_importance <- function(model, full){
     # Predict the diagnosis outcome with the one-feature-permuted test dataset
     rpartProbs_permuted <- predict(model, full_permuted, type="prob")
     # Calculate the new auc
-    new_auc <- roc(ifelse(full_permuted$dx == "cancer", 1, 0), rpartProbs_permuted[[1]])$auc
+    new_auc <- roc(ifelse(full_permuted[,outcome] == first_outcome, 1, 0), rpartProbs_permuted[[1]])$auc
     # Return how this feature being permuted affects the auc
     return(new_auc)
   }))
@@ -182,7 +187,7 @@ permutation_importance <- function(model, full){
     # Predict the diagnosis outcome with the group-permuted test dataset
     rpartProbs_permuted_corr <- predict(model, full_permuted_corr, type="prob")
     # Calculate the new auc
-    new_auc <- roc(ifelse(full_permuted_corr$dx == "cancer", 1, 0), rpartProbs_permuted_corr[[1]])$auc
+    new_auc <- roc(ifelse(full_permuted_corr[,outcome] == first_outcome, 1, 0), rpartProbs_permuted_corr[[1]])$auc
     list <- list(new_auc, unlist(i))
     return(list)
   }))

From f84804f574c75fb9c7ebd97f54f597bd74129b6b Mon Sep 17 00:00:00 2001
From: Zena Lapp
Date: Mon, 11 Nov 2019 13:34:06 -0500
Subject: [PATCH 02/11] remove perm=T from function because not needed

---
 code/learning/permutation_importance.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/code/learning/permutation_importance.R b/code/learning/permutation_importance.R
index d90f974..d1c0747 100644
--- a/code/learning/permutation_importance.R
+++ b/code/learning/permutation_importance.R
@@ -50,7 +50,7 @@ for (dep in deps){
 ####################### DEFINE FUNCTION #############################
-permutation_importance <- function(model, full, first_outcome, outcome=NULL, perm=T){
+permutation_importance <- function(model, full, first_outcome, outcome=NULL){

From 958d642b9e87ce31383c7421338906301547934e Mon Sep 17 00:00:00 2001
From: Zena Lapp
Date: Mon, 11 Nov 2019 15:29:52 -0500
Subject: [PATCH 03/11]
add option to specify hyperparameters to test for cross-validation --- code/learning/generateAUCs.R | 4 +- code/learning/model_pipeline.R | 7 +-- code/learning/model_selection.R | 77 +++++++++++++++++++++++++++------ 3 files changed, 69 insertions(+), 19 deletions(-) diff --git a/code/learning/generateAUCs.R b/code/learning/generateAUCs.R index b3d0510..a85be2f 100644 --- a/code/learning/generateAUCs.R +++ b/code/learning/generateAUCs.R @@ -34,9 +34,9 @@ ###################################################################### #------------------------- DEFINE FUNCTION -------------------# ###################################################################### -get_results <- function(dataset, models, split_number,outcome=NULL,perm=T){ +get_results <- function(dataset, models, split_number,outcome=NULL,perm=T,hyperparameters=NULL){ # Save results of the modeling pipeline as a list - results <- pipeline(dataset, models, split_number,outcome=NULL,perm=perm) + results <- pipeline(dataset, models, split_number,outcome=NULL,perm=perm,hyperparameters=hyperparameters) # These results have # 1. cv_auc, # 2. test_auc diff --git a/code/learning/model_pipeline.R b/code/learning/model_pipeline.R index fa6bc07..7f814e2 100644 --- a/code/learning/model_pipeline.R +++ b/code/learning/model_pipeline.R @@ -76,9 +76,10 @@ pipeline <- function(dataset, model, split_number, outcome=NULL, hyperparameters # -------------Define hyper-parameter and cv settings--------------------> # Define hyper-parameter tuning grid and the training method # Uses function tuning_grid() in file ('code/learning/model_selection.R') - grid <- tuning_grid(trainTransformed, model)[[1]] - method <- tuning_grid(trainTransformed, model)[[2]] - cv <- tuning_grid(trainTransformed, model)[[3]] + tune <- tuning_grid(trainTransformed, model, outcome, hyperparameters) + grid <- tune[[1]] + method <- tune[[2]] + cv <- tune[[3]] # -----------------------------------------------------------------------> # ---------------------------Train the model ----------------------------> diff --git a/code/learning/model_selection.R b/code/learning/model_selection.R index 7eecd6d..5915857 100644 --- a/code/learning/model_selection.R +++ b/code/learning/model_selection.R @@ -34,6 +34,10 @@ #------------------------- DEFINE FUNCTION -------------------# ###################################################################### tuning_grid <- function(train_data, model, outcome=NULL, hyperparameters=NULL){ + + # NOTE: Hyperparameters should be a list where the names of the list are the + # hyperparameters and the values are the values to be tested + # set outcome as first column if null if(is.null(outcome)){ outcome <- colnames(train_data)[1] @@ -129,7 +133,11 @@ tuning_grid <- function(train_data, model, outcome=NULL, hyperparameters=NULL){ # Grid and caret method defined for each classification models if(model=="L2_Logistic_Regression") { - grid <- expand.grid(cost = c(0.0001, 0.001, 0.0025, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5, 1, 10), + if(is.null(hyperparameters)){ + hyperparameters <- list() + hyperparameters$cost <- c(0.0001, 0.001, 0.0025, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5, 1, 10) # maybe change these default parameters? 
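# ----------------------------------------------------------------------->
# A sketch of overriding the default cost grid above via the new
# `hyperparameters` argument; the list names must match the tuning
# parameters, and the values shown are arbitrary:
#
#   hyperparameters <- list(cost = c(0.01, 0.1, 1))
#   get_results(data, "L2_Logistic_Regression", seed, outcome = "dx",
#               hyperparameters = hyperparameters)
# ----------------------------------------------------------------------->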
+    }
+    grid <- expand.grid(cost = hyperparameters$cost,
                         loss = "L2_primal", # This chooses type=0 for liblinear R package
                                             # which is logistic loss, primal solve for L2 regularized logistic regression
     method <- "regLogistic"
   }
   else if (model=="L1_Linear_SVM"){ #
-    grid <- expand.grid(cost = c(0.0001, 0.001, 0.01, 0.015, 0.025, 0.05, 0.1, 0.5, 1),
+    if(is.null(hyperparameters)){
+      hyperparameters <- list()
+      hyperparameters$cost <- c(0.0001, 0.001, 0.01, 0.015, 0.025, 0.05, 0.1, 0.5, 1) # maybe change these default parameters?
+    }
+    grid <- expand.grid(cost = hyperparameters$cost,
                         Loss = "L2")
     method <- "svmLinear4" # I wrote this function in caret
   }
   else if (model=="L2_Linear_SVM"){
-    grid <- expand.grid(cost = c(0.0001, 0.0005, 0.001, 0.0025, 0.005, 0.01, 0.025, 0.05, 0.1, 0.5, 1),
+    if(is.null(hyperparameters)){
+      hyperparameters <- list()
+      hyperparameters$cost <- c(0.0001, 0.0005, 0.001, 0.0025, 0.005, 0.01, 0.025, 0.05, 0.1, 0.5, 1) # maybe change these default parameters?
+    }
+    grid <- expand.grid(cost = hyperparameters$cost,
                         Loss = "L2")
     method <- "svmLinear3" # I changed this function in caret
   }
   else if (model=="RBF_SVM"){
-    grid <- expand.grid(sigma = c(0.00000001, 0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1),
-                        C = c(0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10))
+    if(is.null(hyperparameters)){
+      hyperparameters <- list()
+      hyperparameters$sigma <- c(0.00000001, 0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1) # maybe change these default parameters?
+      hyperparameters$C <- c(0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10) # maybe change these default parameters?
+    }
+    grid <- expand.grid(sigma = hyperparameters$sigma,
+                        C = hyperparameters$C)
     method <- "svmRadial"
   }
   else if (model=="Decision_Tree"){
-    grid <- expand.grid(maxdepth = c(1,2,3,4,5,6))
+    if(is.null(hyperparameters)){
+      hyperparameters <- list()
+      hyperparameters$maxdepth <- c(1,2,3,4,5,6)
+    }
+    grid <- expand.grid(maxdepth = hyperparameters$maxdepth) # maybe change these default parameters?
     method <- "rpart2"
   }
   else if (model=="Random_Forest"){
-    grid <- expand.grid(mtry = c(80,500,1000,1500))
+    if(is.null(hyperparameters)){
+      # get number of features
+      n_features <- ncol(train_data) - 1
+      if(n_features > 20000) n_features <- 20000
+      # if few features
+      if(n_features < 19){ mtry <- 1:6
+      } else {
+        # if many features
+        mtry <- floor(seq(1, n_features, length=6))
+      }
+      # only keep mtry values less than or equal to the number of features
+      hyperparameters <- list()
+      hyperparameters$mtry <- mtry[mtry <= n_features]
+    }
+    grid <- expand.grid(mtry = hyperparameters$mtry)
     method = "rf"
   }
   else if (model=="XGBoost"){
-    grid <- expand.grid(nrounds=500,
-                        gamma=0,
-                        eta=c(0.001, 0.01, 0.1, 1),
-                        max_depth=8,
-                        colsample_bytree= 0.8,
-                        min_child_weight=1,
-                        subsample=c(0.4, 0.5, 0.6, 0.7))
+    if(is.null(hyperparameters)){
+      hyperparameters <- list()
+      hyperparameters$nrounds <- 500 # maybe change these default parameters?
+      hyperparameters$gamma <- 0 # maybe change these default parameters?
+      hyperparameters$eta <- c(0.001, 0.01, 0.1, 1) # maybe change these default parameters?
+      hyperparameters$max_depth <- 8 # maybe change these default parameters?
+      hyperparameters$colsample_bytree <- 0.8 # maybe change these default parameters?
+      hyperparameters$min_child_weight <- 1 # maybe change these default parameters?
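# ----------------------------------------------------------------------->
# Two worked examples of the defaults above, following the branch logic:
# - Random_Forest with, say, 5000 features: floor(seq(1, 5000, length=6))
#   gives mtry = c(1, 1000, 2000, 3000, 4000, 5000).
# - XGBoost (its subsample default is completed just below): expand.grid()
#   spans 4 eta values x 4 subsample values = 16 candidate settings.
# ----------------------------------------------------------------------->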
+      hyperparameters$subsample <- c(0.4, 0.5, 0.6, 0.7) # maybe change these default parameters?
+    }
+    grid <- expand.grid(nrounds=hyperparameters$nrounds,
+                        gamma=hyperparameters$gamma,
+                        eta=hyperparameters$eta,
+                        max_depth=hyperparameters$max_depth,
+                        colsample_bytree=hyperparameters$colsample_bytree,
+                        min_child_weight=hyperparameters$min_child_weight,
+                        subsample=hyperparameters$subsample)
     method <- "xgbTree"
   }
   else {

From c9e4dba0f844b7edfeab9130a89d79b24c418e5e Mon Sep 17 00:00:00 2001
From: zmml
Date: Mon, 11 Nov 2019 15:45:14 -0500
Subject: [PATCH 04/11] include info about specifying hyperparameters

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 9f4ecf2..da8050d 100644
--- a/README.md
+++ b/README.md
@@ -87,6 +87,7 @@ cd DeepLearning
 2. `Rscript code/learning/main.R` sources 4 other scripts that are part of the pipeline.
 	* To choose the model and model hyperparameters: `source('code/learning/model_selection.R')`
+	* This function (and `get_results`) also takes an optional argument to specify your own hyperparameters to be used for cross-validation (`hyperparameters`). This argument should be a list where the names of the list are the hyperparameters and the values are the values to be tested.
 	Depending on your ML task, the model hyperparameter range to tune will be different. This is hard-coded for our study but will be updated to integrate user-defined range in the future (Issue # 10)

From b188ed24d30139c0fcb7854c4929adb0c58e2a58 Mon Sep 17 00:00:00 2001
From: Begüm D. Topçuoğlu <39833095+BTopcuoglu@users.noreply.github.com>
Date: Mon, 11 Nov 2019 17:34:24 -0500
Subject: [PATCH 05/11] fix the issue of passing outcome argument

---
 code/learning/generateAUCs.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/code/learning/generateAUCs.R b/code/learning/generateAUCs.R
index a85be2f..acf7df8 100644
--- a/code/learning/generateAUCs.R
+++ b/code/learning/generateAUCs.R
@@ -36,7 +36,7 @@
 ######################################################################
 get_results <- function(dataset, models, split_number,outcome=NULL,perm=T,hyperparameters=NULL){
 # Save results of the modeling pipeline as a list
-results <- pipeline(dataset, models, split_number,outcome=NULL,perm=perm,hyperparameters=hyperparameters)
+results <- pipeline(dataset, models, split_number,outcome=outcome,perm=perm,hyperparameters=hyperparameters)
 # These results have
 # 1. cv_auc,
 # 2. test_auc

From 2ffd5930992dc29089901225e5cf662af31aaa0d Mon Sep 17 00:00:00 2001
From: Begüm D. Topçuoğlu <39833095+BTopcuoglu@users.noreply.github.com>
Date: Tue, 12 Nov 2019 11:23:11 -0500
Subject: [PATCH 06/11] add arguments to pass to outcome and permutation

address the last comment in #11
---
 code/learning/main.R | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/code/learning/main.R b/code/learning/main.R
index a5dbfd7..2474530 100644
--- a/code/learning/main.R
+++ b/code/learning/main.R
@@ -74,10 +74,10 @@ data <- inner_join(meta, shared, by=c("sample"="Group")) %>%
   select(-sample, -Dx_Bin, -fit_result) %>%
   drop_na()
 # We want the diagnosis column to be a factor
-data$dx <- factor(data$dx)
+# data$dx <- factor(data$dx)
 # We want the first sample to be a cancer so we shuffle the dataset with a specific seed to get cancer as the first sample
-set.seed(0)
-data <- data[sample(1:nrow(data)), ]
+# set.seed(0)
+# data <- data[sample(1:nrow(data)), ]
 ###################################################################
 ######################## RUN PIPELINE #############################
@@ -98,6 +98,9 @@
 input <- commandArgs(trailingOnly=TRUE)
 seed <- as.numeric(input[1])
 model <- input[2]
+outcome <- input[3] # pass outcome column name in quotation marks (e.g. "dx")
+permutation <- as.numeric(input[4]) # 1 if permutation performed
+                                    # 0 if permutation not performed

 # Then arguments 1 and 2 will be placed respectively into the functions:
 # 1. set.seed() : creates reproducibility and variability
@@ -108,7 +111,9 @@ set.seed(seed)
 # Start walltime for running model
 tic("model")
 # Run the model
-get_results(data, model, seed)
+ # User can define the outcome here:
+ # get_results(data, model, seed, "dx", 0)
+get_results(data, model, seed, outcome, permutation)
 # Stop walltime for running model
 secs <- toc()
 # Save elapsed time

From 8cc92b26ce53768eba3a9f9d8d30d87b6485a7c9 Mon Sep 17 00:00:00 2001
From: Begüm D. Topçuoğlu <39833095+BTopcuoglu@users.noreply.github.com>
Date: Tue, 12 Nov 2019 11:39:54 -0500
Subject: [PATCH 07/11] change from `NULL` to `NA`

---
 code/learning/generateAUCs.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/code/learning/generateAUCs.R b/code/learning/generateAUCs.R
index acf7df8..867381a 100644
--- a/code/learning/generateAUCs.R
+++ b/code/learning/generateAUCs.R
@@ -34,7 +34,7 @@
 ######################################################################
 #------------------------- DEFINE FUNCTION -------------------#
 ######################################################################
-get_results <- function(dataset, models, split_number,outcome=NULL,perm=T,hyperparameters=NULL){
+get_results <- function(dataset, models, split_number, perm=T, outcome=NA, hyperparameters=NULL){
 # Save results of the modeling pipeline as a list
 results <- pipeline(dataset, models, split_number,outcome=outcome,perm=perm,hyperparameters=hyperparameters)
 # These results have
 # 1. cv_auc,
 # 2. test_auc

From 076fdcb0796734f900cc22c24a2e6552e1b78edd Mon Sep 17 00:00:00 2001
From: Begüm D. Topçuoğlu <39833095+BTopcuoglu@users.noreply.github.com>
Date: Tue, 12 Nov 2019 11:40:55 -0500
Subject: [PATCH 08/11] outcome changed from NULL to NA

---
 code/learning/model_pipeline.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/code/learning/model_pipeline.R b/code/learning/model_pipeline.R
index 7f814e2..8dd44fa 100644
--- a/code/learning/model_pipeline.R
+++ b/code/learning/model_pipeline.R
@@ -37,11 +37,11 @@
 ######################################################################
-pipeline <- function(dataset, model, split_number, outcome=NULL, hyperparameters=NULL, perm=T){
+pipeline <- function(dataset, model, split_number, outcome=NA, hyperparameters=NULL, perm=T){

   # -----------------------Get outcome variable----------------------------->
   # If no outcome specified, use first column in dataset
-  if(is.null(outcome)){
+  if(is.na(outcome)){
     outcome <- colnames(dataset)[1]
   }else{
     # check to see if outcome is in column names of dataset

From 185e2105359f65fb040ccdf17dd9a0a60ca91a87 Mon Sep 17 00:00:00 2001
From: Begüm D. Topçuoğlu <39833095+BTopcuoglu@users.noreply.github.com>
Date: Tue, 12 Nov 2019 11:42:47 -0500
Subject: [PATCH 09/11] change the position of perm and outcome

---
 code/learning/main.R | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/code/learning/main.R b/code/learning/main.R
index 2474530..6904389 100644
--- a/code/learning/main.R
+++ b/code/learning/main.R
@@ -98,9 +98,11 @@
 input <- commandArgs(trailingOnly=TRUE)
 seed <- as.numeric(input[1])
 model <- input[2]
-outcome <- input[3] # pass outcome column name in quotation marks (e.g. "dx")
-permutation <- as.numeric(input[4]) # 1 if permutation performed
+permutation <- as.numeric(input[3]) # 1 if permutation performed
                                     # 0 if permutation not performed
+outcome <- input[4]
+
+
 # Then arguments 1 and 2 will be placed respectively into the functions:
@@ -111,9 +113,10 @@ set.seed(seed)
 # Start walltime for running model
 tic("model")
 # Run the model
- # User can define the outcome here:
- # get_results(data, model, seed, "dx", 0)
-get_results(data, model, seed, outcome, permutation)
+ # User can define the outcome and whether to do the permutation test here:
+ # example: get_results(data, model, seed, 0, "dx")
+ # OR pass as NA if no argument is given on the command line
+get_results(data, model, seed, permutation, outcome)
 # Stop walltime for running model
 secs <- toc()
 # Save elapsed time

From 1e6862ea9a9ebb054a644f7a09c9dde08a1df3f7 Mon Sep 17 00:00:00 2001
From: Begüm D. Topçuoğlu <39833095+BTopcuoglu@users.noreply.github.com>
Date: Tue, 12 Nov 2019 11:49:12 -0500
Subject: [PATCH 10/11] remove outcome=NULL, it is already set before

---
 code/learning/model_selection.R | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/code/learning/model_selection.R b/code/learning/model_selection.R
index 5915857..5a15308 100644
--- a/code/learning/model_selection.R
+++ b/code/learning/model_selection.R
@@ -33,15 +33,15 @@
 ######################################################################
 #------------------------- DEFINE FUNCTION -------------------#
 ######################################################################
-tuning_grid <- function(train_data, model, outcome=NULL, hyperparameters=NULL){
+tuning_grid <- function(train_data, model, outcome, hyperparameters=NULL){

   # NOTE: Hyperparameters should be a list where the names of the list are the
   # hyperparameters and the values are the values to be tested

   # set outcome as first column if null
-  if(is.null(outcome)){
-    outcome <- colnames(train_data)[1]
-  }
+  #if(is.null(outcome)){
+  #  outcome <- colnames(train_data)[1]
+  #}
 # -------------------------CV method definition--------------------------------------->
 # ADDED cv index to make sure

From ac0b248704354c2b2cd7f610d2f6b2e38ab02191 Mon Sep 17 00:00:00 2001
From: Begüm D. Topçuoğlu <39833095+BTopcuoglu@users.noreply.github.com>
Date: Tue, 12 Nov 2019 15:35:31 -0500
Subject: [PATCH 11/11] fix the tidyverse bug (select out outcome column)

Fixes #8
---
 code/learning/permutation_importance.R | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/code/learning/permutation_importance.R b/code/learning/permutation_importance.R
index d1c0747..217e6bc 100644
--- a/code/learning/permutation_importance.R
+++ b/code/learning/permutation_importance.R
@@ -83,8 +83,11 @@ permutation_importance <- function(model, full, first_outcome, outcome=NULL){
   # Remove those names as columns from full test data
   # Remove the diagnosis column to only keep non-correlated features
   non_correlated_otus <- full %>%
-    select(-correlated_otus) %>%
-    select(-sym(first_outcome)) %>%
+    select(-correlated_otus)
+
+  non_correlated_otus[,outcome] <- NULL
+
+  non_correlated_otus <- non_correlated_otus %>%
     colnames()
   # -------------------------------------------------------------------->
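# ----------------------------------------------------------------------->
# Taken together, after this series main.R reads the seed, the model, the
# permutation flag, and the outcome column from the command line, in that
# order. A hypothetical end-to-end invocation and its scripted equivalent
# (the seed, data object, column name, and cost values are examples only):
#
#   # shell: seed = 1, model, skip the permutation test (0), outcome = "dx"
#   Rscript code/learning/main.R 1 L2_Logistic_Regression 0 dx
#
#   # equivalent call from R, optionally with a user-defined tuning grid:
#   get_results(data, "L2_Logistic_Regression", 1, 0, "dx",
#               hyperparameters = list(cost = c(0.01, 0.1, 1)))
# ----------------------------------------------------------------------->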