Orchestrate (#14)
* Added orchestrator and template

* Added class for parameter grids

* Added entry points support for custom models with Pipeline

Co-authored-by: gwarmstrong <[email protected]>

* Added the ability to set_params on custom models (Pipelines)

* Added a _parameters.py containing defined parameter grids

* Parameter grids

* Parameter getter

* Fixed file-locating bug

* Added templates

* Fixed template issues

* Orchestrator JSON fixes in template

* Final Random Forest grids

* More updates to parameter grid for RF

* Grid updates

* Randomized parameters; minor template tweak

* Added template to setup

* Removed extra template

* Reduced parameter grids for RF

* Updated template; fixed preprocess bug

* Added option to use a reduced parameter grid, where defined

* Added option flags

* (untested) Added force option and info text

* Tested previous additions

* Fixed result-skipping behavior; removed LightGBM

* Fixed check for existing results

* Specified intel in the resource list

* Update q2_mlab/orchestrator.py

Co-authored-by: Yoshiki Vázquez Baeza <[email protected]>

Co-authored-by: gwarmstrong <[email protected]>
Co-authored-by: Patrick McGrath <[email protected]>
Co-authored-by: Patrick McGrath <[email protected]>
Co-authored-by: Patrick McGrath <[email protected]>
Co-authored-by: Yoshiki Vázquez Baeza <[email protected]>
6 people authored Aug 14, 2020
1 parent 3f137e4 commit 808a6d0
Showing 8 changed files with 442 additions and 15 deletions.
2 changes: 2 additions & 0 deletions q2_mlab/__init__.py
@@ -12,6 +12,7 @@
 from .learningtask import LearningTask, ClassificationTask, RegressionTask
 from ._type import Target, Results
 from ._format import ResultsDirectoryFormat, ResultsFormat
+from ._parameters import ParameterGrids
 
 __version__ = get_versions()["version"]
 
@@ -28,4 +29,5 @@
     "LearningTask",
     "ClassificationTask",
     "RegressionTask",
+    "ParameterGrids",
 ]
103 changes: 103 additions & 0 deletions q2_mlab/_parameters.py
@@ -0,0 +1,103 @@
import numpy as np
from sklearn.model_selection import ParameterGrid


class ParameterGrids:
def get(algorithm):
grids = {
"LinearSVC": {
'penalty': ['l2'],
'tol': [1e-4, 1e-3, 1e-2, 1e-1],
'loss': ['hinge', 'squared_hinge'],
'random_state': [2018]
},
"LinearSVR": {
"C": [1e-4, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e4],
"epsilon": [1e-2, 1e-1, 0, 1],
"loss": ["squared_epsilon_insensitive", "epsilon_insensitive"],
"random_state": [2018],
},
"RidgeClassifier": {
"alpha": [1e-15, 1e-10, 1e-8, 1e-4],
"fit_intercept": [True],
"normalize": [True, False],
"tol": [1e-1, 1e-2, 1e-3],
"solver": [
"svd",
"cholesky",
"lsqr",
"sparse_cg",
"sag",
"saga",
],
"random_state": [2018],
},
"RidgeRegressor": {
"alpha": [1e-15, 1e-10, 1e-8, 1e-4],
"fit_intercept": [True],
"normalize": [True, False],
"tol": [1e-1, 1e-2, 1e-3],
"solver": [
"svd",
"cholesky",
"lsqr",
"sparse_cg",
"sag",
"saga",
],
"random_state": [2018],
},
"RandomForestClassifier": {
"n_estimators": [1000, 5000],
"criterion": ["gini", "entropy"],
"max_features": ["sqrt", "log2", None] + list(np.arange(0.2, 1, 0.2)),
"max_samples": [0.25, 0.5, 0.75, None],
"max_depth": [None],
"n_jobs": [-1],
"random_state": [2020],
"bootstrap": [True],
"min_samples_split": list(np.arange(0.2, 1, 0.2)) + [2],
"min_samples_leaf": list(np.arange(0.01, 0.5, 0.2)) + [1],
},
"RandomForestRegressor": {
'n_estimators': [1000, 5000],
'criterion': ['mse', 'mae'],
"max_features": ["sqrt", "log2", None] + list(np.arange(0.2, 1, 0.2)),
"max_samples": [0.25, 0.5, 0.75, None],
'max_depth': [None],
'n_jobs': [-1],
'random_state': [2020],
'bootstrap': [True],
'min_samples_split': list(np.arange(0.2, 1, 0.2)) + [2],
'min_samples_leaf': list(np.arange(0.01, .5, 0.2)) + [1],
},
}
return grids[algorithm]

def get_reduced(algorithm):
grids = {
"RandomForestClassifier": {
"n_estimators": [5000],
"criterion": ["gini"],
"max_features": ["sqrt", "log2", None] + list(np.arange(0.2, 1, 0.2)),
"max_samples": [0.25, 0.5, 0.75, None],
"max_depth": [None],
"n_jobs": [-1],
"random_state": [2020],
"bootstrap": [True],
},
"RandomForestRegressor": {
'n_estimators': [5000],
'criterion': ['mse'],
"max_features": ["sqrt", "log2", None] + list(np.arange(0.2, 1, 0.2)),
"max_samples": [0.25, 0.5, 0.75, None],
'max_depth': [None],
'n_jobs': [-1],
'random_state': [2020],
'bootstrap': [True],
},
}
return grids[algorithm]

def get_size(algorithm):
return len(list(ParameterGrid(ParameterGrids.get(algorithm))))
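For reference, a quick usage sketch (not part of this diff): the grids are plain dicts compatible with scikit-learn's ParameterGrid, so a caller can look one up by algorithm name and enumerate every concrete parameter combination. For "LinearSVC" above, get_size counts 1 * 4 * 2 * 1 = 8 combinations.

from sklearn.model_selection import ParameterGrid
from q2_mlab import ParameterGrids

grid = ParameterGrids.get("LinearSVC")
print(ParameterGrids.get_size("LinearSVC"))  # 8 combinations
for params in ParameterGrid(grid):
    # e.g. {'loss': 'hinge', 'penalty': 'l2', 'random_state': 2018, 'tol': 0.0001}
    print(params)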
5 changes: 3 additions & 2 deletions q2_mlab/_preprocess.py
@@ -59,7 +59,8 @@ def preprocess(
     initial_ids_to_keep = table.view(biom.Table).ids()
     table_id_set = set(initial_ids_to_keep)
     metadata_id_set = set(metadata.ids)
-    num_shared_ids = len(table_id_set.intersection(metadata_id_set))
+    shared_ids = table_id_set.intersection(metadata_id_set)
+    num_shared_ids = len(shared_ids)
     if num_shared_ids == 0:
         raise ValueError("No sample IDs are shared between Table and Metadata")
     print(
@@ -69,7 +70,7 @@
 
     # Filter metadata by samples in table
     print("Filtering Metadata by samples in table")
-    filteredmetadata = metadata.filter_ids(ids_to_keep=initial_ids_to_keep)
+    filteredmetadata = metadata.filter_ids(ids_to_keep=shared_ids)
     print_datasize(table, filteredmetadata)
 
     # Filter samples from metadata where NaN in target_variable column
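The fix above matters because qiime2's Metadata.filter_ids rejects IDs it does not contain, so filtering by every table ID crashed whenever the table held samples absent from the metadata. A standalone sketch of the corrected logic, with made-up sample IDs:

table_ids = {"S1", "S2", "S3"}     # samples in the feature table
metadata_ids = {"S2", "S3", "S4"}  # samples in the metadata

shared_ids = table_ids.intersection(metadata_ids)
assert shared_ids == {"S2", "S3"}
# Passing shared_ids (not table_ids) to filter_ids keeps only samples
# present in both, so no unknown ID can trigger an error.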
38 changes: 29 additions & 9 deletions q2_mlab/learningtask.py
@@ -2,6 +2,7 @@
 import numpy as np
 import pandas as pd
 import time
+import pkg_resources
 from abc import ABC
 
 # CV Methods
@@ -21,6 +22,7 @@
 )
 
 # Algorithms
+from sklearn.pipeline import Pipeline
 from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
 from sklearn.linear_model import RidgeClassifier, Ridge
 from xgboost import XGBRegressor, XGBClassifier
@@ -39,14 +41,18 @@
 )
 from sklearn.mixture import BayesianGaussianMixture
 from sklearn.naive_bayes import ComplementNB
-from lightgbm import LGBMClassifier, LGBMRegressor
 from sklearn.neural_network import MLPClassifier, MLPRegressor
 from sklearn.linear_model import ElasticNet, Lasso
 
 
 class LearningTask(ABC):
     algorithms = {}
 
+    def iter_entry_points(cls):
+        for entry_point in pkg_resources.iter_entry_points(
+                group='q2_mlab.models'):
+            yield entry_point
+
     def __init__(
         self,
         table,
@@ -56,12 +62,26 @@ def __init__(
         n_repeats,
         distance_matrix=None,
     ):
-        self.distance_matrix = distance_matrix
+        # Add any custom algorithms from entry points
+        for entry_point in self.iter_entry_points():
+            name = entry_point.name
+            method = entry_point.load()
+            self.algorithms.update({name: method})
+
+        self.learner = self.algorithms[algorithm]
+        print(params)
         self.params = json.loads(params)
+        if isinstance(self.learner, Pipeline):
+            # Assumes that the last step in the pipeline is the model:
+            prefix = list(self.learner.named_steps)[-1] + "__"
+            # And adds the prefix of that last step to our param dict's keys
+            # so we can access that step's parameters.
+            newparams = {prefix + key: val for key, val in self.params.items()}
+            self.params = newparams
         self.X = table.transpose().matrix_data
         self.metadata = metadata
         self.y = self.metadata.to_numpy()
-        self.learner = self.algorithms[algorithm]
+        self.distance_matrix = distance_matrix
         self.cv_idx = 0
         self.idx = 0
         self.n_repeats = n_repeats
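The key rewriting above follows scikit-learn's "step__param" convention for nested Pipeline parameters. A minimal sketch of what it accomplishes; the step names and estimator here are illustrative, not from this commit:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

pipe = Pipeline([("scale", StandardScaler()), ("svc", LinearSVC())])
params = {"C": 10, "loss": "hinge"}

# Same trick as above: prefix each key with the last step's name.
prefix = list(pipe.named_steps)[-1] + "__"  # -> "svc__"
pipe.set_params(**{prefix + k: v for k, v in params.items()})
assert pipe.named_steps["svc"].C == 10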
@@ -109,7 +129,6 @@ class ClassificationTask(LearningTask):
         "BaggingClassifier": BaggingClassifier,
         "ExtraTreesClassifier": ExtraTreesClassifier,
         "HistGradientBoostingClassifier": HistGradientBoostingClassifier,
-        "LGBMClassifier": LGBMClassifier,
         "BayesianGaussianMixture": BayesianGaussianMixture,
         "ComplementNB": ComplementNB,
         "BayesianGaussianMixture": BayesianGaussianMixture,
@@ -150,7 +169,8 @@ def cv_fold(self, train_index, test_index):
 
         # Start timing
         start = time.process_time()
-        model = self.learner(**self.params)
+        model = self.learner()
+        model.set_params(**self.params)
         model.fit(X_train, y_train)
         y_pred = model.predict(X_test)
         # End timing
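Building the model with defaults and then routing parameters through set_params (instead of constructor kwargs) is what allows the same fold code to apply the prefixed Pipeline parameters set up in __init__. A standalone sketch of the pattern; the estimator, parameters, and data are illustrative:

import time
from sklearn.datasets import make_classification
from sklearn.linear_model import RidgeClassifier

X_train, y_train = make_classification(n_samples=100, random_state=0)

start = time.process_time()
model = RidgeClassifier()                # construct with defaults...
model.set_params(alpha=1e-4, tol=1e-2)   # ...then apply the grid entry
model.fit(X_train, y_train)
end = time.process_time()
print(f"fit took {end - start:.3f} CPU-seconds")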
@@ -225,7 +245,6 @@ class RegressionTask(LearningTask):
         "BaggingRegressor": BaggingRegressor,
         "ExtraTreesRegressor": ExtraTreesRegressor,
         "HistGradientBoostingRegressor": HistGradientBoostingRegressor,
-        "LGBMRegressor": LGBMRegressor,
         "LinearSVR": LinearSVR,
         "RidgeRegressor": Ridge,
         "MLPRegressor": MLPRegressor,
@@ -263,9 +282,10 @@ def cv_fold(self, train_index, test_index):
 
         # Start timing
         start = time.process_time()
-        m = self.learner(**self.params)
-        m.fit(X_train, y_train)
-        y_pred = m.predict(X_test)
+        model = self.learner()
+        model.set_params(**self.params)
+        model.fit(X_train, y_train)
+        y_pred = model.predict(X_test)
         # End timing
         end = time.process_time()
 
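For context on the entry-point hook added above: a third-party package could register a custom model (for example, a scikit-learn Pipeline) under the q2_mlab.models group, and LearningTask.__init__ would then list it in self.algorithms alongside the built-ins. A hedged sketch of such a package's setup.py; only the group name comes from this commit, every other name is hypothetical:

from setuptools import setup

setup(
    name="my-mlab-models",        # hypothetical package
    packages=["my_mlab_models"],
    entry_points={
        "q2_mlab.models": [
            # exposed algorithm name = module:object to load
            "MyCustomPipeline = my_mlab_models.models:pipeline",
        ],
    },
)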
(Diffs for the remaining four changed files are not shown.)
