Orchestrate (#14)
* Added orchestrator and template

* Added class for parameter grids

* Added entry points support for custom models with Pipeline

Co-authored-by: gwarmstrong <[email protected]>

* Added the ability to set_params on custom models (Pipelines)

* Added a _parameters.py containing defined parameter grids

* Parameter grids

* Parameter getter

* Fixed file-locating bug

* Added templates

* Fixed template issues

* Orchestrator JSON fixes in template

* Final Random Forest grids

* More updates to parameter grid for RF

* Grid updates

* Randomized parameters; minor template tweak

* Added template to setup

* Removed extra template

* Reduced parameter grids for RF

* Updated template; fixed preprocess bug

* Added option to use a reduced parameter grid, where defined

* Added option flags

* (untested) Added force option and info text

* Tested previous additions

* Fixed result-skipping behavior; removed LightGBM

* Fixed check for existing results

* Specified intel in the resource list

* Update q2_mlab/orchestrator.py

Co-authored-by: Yoshiki Vázquez Baeza <[email protected]>

Co-authored-by: gwarmstrong <[email protected]>
Co-authored-by: Patrick McGrath <[email protected]>
Co-authored-by: Patrick McGrath <[email protected]>
Co-authored-by: Patrick McGrath <[email protected]>
Co-authored-by: Yoshiki Vázquez Baeza <[email protected]>
6 people authored Aug 14, 2020
1 parent 3f137e4 commit 808a6d0
Showing 8 changed files with 442 additions and 15 deletions.
2 changes: 2 additions & 0 deletions q2_mlab/__init__.py
@@ -12,6 +12,7 @@
 from .learningtask import LearningTask, ClassificationTask, RegressionTask
 from ._type import Target, Results
 from ._format import ResultsDirectoryFormat, ResultsFormat
+from ._parameters import ParameterGrids
 
 __version__ = get_versions()["version"]
 
@@ -28,4 +29,5 @@
     "LearningTask",
     "ClassificationTask",
     "RegressionTask",
+    "ParameterGrids",
 ]
103 changes: 103 additions & 0 deletions q2_mlab/_parameters.py
@@ -0,0 +1,103 @@
import numpy as np
from sklearn.model_selection import ParameterGrid


class ParameterGrids:
def get(algorithm):
grids = {
"LinearSVC": {
'penalty': ['l2'],
'tol': [1e-4, 1e-3, 1e-2, 1e-1],
'loss': ['hinge', 'squared_hinge'],
'random_state': [2018]
},
"LinearSVR": {
"C": [1e-4, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e4],
"epsilon": [1e-2, 1e-1, 0, 1],
"loss": ["squared_epsilon_insensitive", "epsilon_insensitive"],
"random_state": [2018],
},
"RidgeClassifier": {
"alpha": [1e-15, 1e-10, 1e-8, 1e-4],
"fit_intercept": [True],
"normalize": [True, False],
"tol": [1e-1, 1e-2, 1e-3],
"solver": [
"svd",
"cholesky",
"lsqr",
"sparse_cg",
"sag",
"saga",
],
"random_state": [2018],
},
"RidgeRegressor": {
"alpha": [1e-15, 1e-10, 1e-8, 1e-4],
"fit_intercept": [True],
"normalize": [True, False],
"tol": [1e-1, 1e-2, 1e-3],
"solver": [
"svd",
"cholesky",
"lsqr",
"sparse_cg",
"sag",
"saga",
],
"random_state": [2018],
},
"RandomForestClassifier": {
"n_estimators": [1000, 5000],
"criterion": ["gini", "entropy"],
"max_features": ["sqrt", "log2", None] + list(np.arange(0.2, 1, 0.2)),
"max_samples": [0.25, 0.5, 0.75, None],
"max_depth": [None],
"n_jobs": [-1],
"random_state": [2020],
"bootstrap": [True],
"min_samples_split": list(np.arange(0.2, 1, 0.2)) + [2],
"min_samples_leaf": list(np.arange(0.01, 0.5, 0.2)) + [1],
},
"RandomForestRegressor": {
'n_estimators': [1000, 5000],
'criterion': ['mse', 'mae'],
"max_features": ["sqrt", "log2", None] + list(np.arange(0.2, 1, 0.2)),
"max_samples": [0.25, 0.5, 0.75, None],
'max_depth': [None],
'n_jobs': [-1],
'random_state': [2020],
'bootstrap': [True],
'min_samples_split': list(np.arange(0.2, 1, 0.2)) + [2],
'min_samples_leaf': list(np.arange(0.01, .5, 0.2)) + [1],
},
}
return grids[algorithm]

def get_reduced(algorithm):
grids = {
"RandomForestClassifier": {
"n_estimators": [5000],
"criterion": ["gini"],
"max_features": ["sqrt", "log2", None] + list(np.arange(0.2, 1, 0.2)),
"max_samples": [0.25, 0.5, 0.75, None],
"max_depth": [None],
"n_jobs": [-1],
"random_state": [2020],
"bootstrap": [True],
},
"RandomForestRegressor": {
'n_estimators': [5000],
'criterion': ['mse'],
"max_features": ["sqrt", "log2", None] + list(np.arange(0.2, 1, 0.2)),
"max_samples": [0.25, 0.5, 0.75, None],
'max_depth': [None],
'n_jobs': [-1],
'random_state': [2020],
'bootstrap': [True],
},
}
return grids[algorithm]

def get_size(algorithm):
return len(list(ParameterGrid(ParameterGrids.get(algorithm))))
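For reference, a quick usage sketch (not part of this diff): the grids are plain dicts compatible with scikit-learn's ParameterGrid, so a caller can look one up by algorithm name and enumerate every concrete parameter combination. For "LinearSVC" above, get_size counts 1 * 4 * 2 * 1 = 8 combinations.

from sklearn.model_selection import ParameterGrid
from q2_mlab import ParameterGrids

grid = ParameterGrids.get("LinearSVC")
print(ParameterGrids.get_size("LinearSVC"))  # 8 combinations
for params in ParameterGrid(grid):
    # e.g. {'loss': 'hinge', 'penalty': 'l2', 'random_state': 2018, 'tol': 0.0001}
    print(params)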
5 changes: 3 additions & 2 deletions q2_mlab/_preprocess.py
@@ -59,7 +59,8 @@ def preprocess(
     initial_ids_to_keep = table.view(biom.Table).ids()
     table_id_set = set(initial_ids_to_keep)
     metadata_id_set = set(metadata.ids)
-    num_shared_ids = len(table_id_set.intersection(metadata_id_set))
+    shared_ids = table_id_set.intersection(metadata_id_set)
+    num_shared_ids = len(shared_ids)
     if num_shared_ids == 0:
         raise ValueError("No sample IDs are shared between Table and Metadata")
     print(
@@ -69,7 +70,7 @@
 
     # Filter metadata by samples in table
     print("Filtering Metadata by samples in table")
-    filteredmetadata = metadata.filter_ids(ids_to_keep=initial_ids_to_keep)
+    filteredmetadata = metadata.filter_ids(ids_to_keep=shared_ids)
     print_datasize(table, filteredmetadata)
 
     # Filter samples from metadata where NaN in target_variable column
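The fix above matters because qiime2's Metadata.filter_ids rejects IDs it does not contain, so filtering by every table ID crashed whenever the table held samples absent from the metadata. A standalone sketch of the corrected logic, with made-up sample IDs:

table_ids = {"S1", "S2", "S3"}     # samples in the feature table
metadata_ids = {"S2", "S3", "S4"}  # samples in the metadata

shared_ids = table_ids.intersection(metadata_ids)
assert shared_ids == {"S2", "S3"}
# Passing shared_ids (not table_ids) to filter_ids keeps only samples
# present in both, so no unknown ID can trigger an error.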
38 changes: 29 additions & 9 deletions q2_mlab/learningtask.py
@@ -2,6 +2,7 @@
 import numpy as np
 import pandas as pd
 import time
+import pkg_resources
 from abc import ABC
 
 # CV Methods
@@ -21,6 +22,7 @@
 )
 
 # Algorithms
+from sklearn.pipeline import Pipeline
 from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
 from sklearn.linear_model import RidgeClassifier, Ridge
 from xgboost import XGBRegressor, XGBClassifier
@@ -39,14 +41,18 @@
 )
 from sklearn.mixture import BayesianGaussianMixture
 from sklearn.naive_bayes import ComplementNB
-from lightgbm import LGBMClassifier, LGBMRegressor
 from sklearn.neural_network import MLPClassifier, MLPRegressor
 from sklearn.linear_model import ElasticNet, Lasso
 
 
 class LearningTask(ABC):
     algorithms = {}
 
+    def iter_entry_points(cls):
+        for entry_point in pkg_resources.iter_entry_points(
+                group='q2_mlab.models'):
+            yield entry_point
+
     def __init__(
         self,
         table,
@@ -56,12 +62,26 @@ def __init__(
         n_repeats,
         distance_matrix=None,
     ):
-        self.distance_matrix = distance_matrix
+        # Add any custom algorithms from entry points
+        for entry_point in self.iter_entry_points():
+            name = entry_point.name
+            method = entry_point.load()
+            self.algorithms.update({name: method})
+
+        self.learner = self.algorithms[algorithm]
+        print(params)
         self.params = json.loads(params)
+        if isinstance(self.learner, Pipeline):
+            # Assumes that the last step in the pipeline is the model:
+            prefix = list(self.learner.named_steps)[-1] + "__"
+            # And adds the prefix of that last step to our param dict's keys
+            # so we can access that step's parameters.
+            newparams = {prefix + key: val for key, val in self.params.items()}
+            self.params = newparams
         self.X = table.transpose().matrix_data
         self.metadata = metadata
         self.y = self.metadata.to_numpy()
-        self.learner = self.algorithms[algorithm]
+        self.distance_matrix = distance_matrix
         self.cv_idx = 0
         self.idx = 0
         self.n_repeats = n_repeats
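The key rewriting above follows scikit-learn's "step__param" convention for nested Pipeline parameters. A minimal sketch of what it accomplishes; the step names and estimator here are illustrative, not from this commit:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

pipe = Pipeline([("scale", StandardScaler()), ("svc", LinearSVC())])
params = {"C": 10, "loss": "hinge"}

# Same trick as above: prefix each key with the last step's name.
prefix = list(pipe.named_steps)[-1] + "__"  # -> "svc__"
pipe.set_params(**{prefix + k: v for k, v in params.items()})
assert pipe.named_steps["svc"].C == 10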
@@ -109,7 +129,6 @@ class ClassificationTask(LearningTask):
         "BaggingClassifier": BaggingClassifier,
         "ExtraTreesClassifier": ExtraTreesClassifier,
         "HistGradientBoostingClassifier": HistGradientBoostingClassifier,
-        "LGBMClassifier": LGBMClassifier,
         "BayesianGaussianMixture": BayesianGaussianMixture,
         "ComplementNB": ComplementNB,
         "BayesianGaussianMixture": BayesianGaussianMixture,
@@ -150,7 +169,8 @@ def cv_fold(self, train_index, test_index):
 
         # Start timing
         start = time.process_time()
-        model = self.learner(**self.params)
+        model = self.learner()
+        model.set_params(**self.params)
         model.fit(X_train, y_train)
         y_pred = model.predict(X_test)
         # End timing
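Building the model with defaults and then routing parameters through set_params (instead of constructor kwargs) is what allows the same fold code to apply the prefixed Pipeline parameters set up in __init__. A standalone sketch of the pattern; the estimator, parameters, and data are illustrative:

import time
from sklearn.datasets import make_classification
from sklearn.linear_model import RidgeClassifier

X_train, y_train = make_classification(n_samples=100, random_state=0)

start = time.process_time()
model = RidgeClassifier()                # construct with defaults...
model.set_params(alpha=1e-4, tol=1e-2)   # ...then apply the grid entry
model.fit(X_train, y_train)
end = time.process_time()
print(f"fit took {end - start:.3f} CPU-seconds")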
@@ -225,7 +245,6 @@ class RegressionTask(LearningTask):
         "BaggingRegressor": BaggingRegressor,
         "ExtraTreesRegressor": ExtraTreesRegressor,
         "HistGradientBoostingRegressor": HistGradientBoostingRegressor,
-        "LGBMRegressor": LGBMRegressor,
         "LinearSVR": LinearSVR,
         "RidgeRegressor": Ridge,
         "MLPRegressor": MLPRegressor,
@@ -263,9 +282,10 @@ def cv_fold(self, train_index, test_index):
 
         # Start timing
         start = time.process_time()
-        m = self.learner(**self.params)
-        m.fit(X_train, y_train)
-        y_pred = m.predict(X_test)
+        model = self.learner()
+        model.set_params(**self.params)
+        model.fit(X_train, y_train)
+        y_pred = model.predict(X_test)
         # End timing
         end = time.process_time()
 
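For context on the entry-point hook added above: a third-party package could register a custom model (for example, a scikit-learn Pipeline) under the q2_mlab.models group, and LearningTask.__init__ would then list it in self.algorithms alongside the built-ins. A hedged sketch of such a package's setup.py; only the group name comes from this commit, every other name is hypothetical:

from setuptools import setup

setup(
    name="my-mlab-models",        # hypothetical package
    packages=["my_mlab_models"],
    entry_points={
        "q2_mlab.models": [
            # exposed algorithm name = module:object to load
            "MyCustomPipeline = my_mlab_models.models:pipeline",
        ],
    },
)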
(Diffs for the remaining four changed files are not shown.)
