From a3facd35eacd521552bcf4a93e99293949b3de3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Taneli=20Mielik=C3=A4inen?= Date: Sun, 19 Aug 2018 19:13:27 -0700 Subject: [PATCH 01/11] update xgboost dependency --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index b65911d6cb..00623f264d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,7 @@ numpy>=1.9.0 scipy>=0.14.1 scikit-learn>=0.19,<0.20 -xgboost==0.7.post3 +xgboost>=0.80 lockfile joblib diff --git a/setup.py b/setup.py index 4ae310adf0..573b06902a 100644 --- a/setup.py +++ b/setup.py @@ -48,7 +48,7 @@ "pynisher>=0.4,<0.5", "pyrfr>=0.6.1,<0.8", "smac>=0.8,<0.9", - "xgboost==0.7.post3", + "xgboost>=0.80", ] with open("autosklearn/__version__.py") as fh: From e998e73e002eca85977877f2b5cfeceb5b4fcdf1 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Mon, 12 Nov 2018 13:01:20 +0100 Subject: [PATCH 02/11] Closes #494 - remove dependency on six --- autosklearn/data/xy_data_manager.py | 3 +-- autosklearn/metalearning/metafeatures/metafeature.py | 7 +++---- autosklearn/metalearning/mismbo.py | 7 ------- autosklearn/metalearning/optimizers/optimizer_base.py | 4 ++-- autosklearn/util/common.py | 5 ++--- autosklearn/util/data.py | 2 -- requirements.txt | 2 -- setup.py | 1 - test/test_metalearning/pyMetaLearn/test_meta_features.py | 4 ++-- .../pyMetaLearn/test_meta_features_sparse.py | 2 +- 10 files changed, 11 insertions(+), 26 deletions(-) diff --git a/autosklearn/data/xy_data_manager.py b/autosklearn/data/xy_data_manager.py index 95539d98dc..3ee760a0f9 100644 --- a/autosklearn/data/xy_data_manager.py +++ b/autosklearn/data/xy_data_manager.py @@ -2,7 +2,6 @@ import numpy as np from scipy import sparse -import six from autosklearn.constants import * from autosklearn.data.abstract_data_manager import AbstractDataManager @@ -13,7 +12,7 @@ class XYDataManager(AbstractDataManager): def __init__(self, X, y, X_test, y_test, task, feat_type, dataset_name): super(XYDataManager, self).__init__(dataset_name) - if isinstance(task, six.string_types): + if isinstance(task, str): task = STRING_TO_TASK_TYPES[task] self.info['task'] = task diff --git a/autosklearn/metalearning/metafeatures/metafeature.py b/autosklearn/metalearning/metafeatures/metafeature.py index 7c5c714ed3..b14c138cf5 100644 --- a/autosklearn/metalearning/metafeatures/metafeature.py +++ b/autosklearn/metalearning/metafeatures/metafeature.py @@ -1,11 +1,10 @@ from abc import ABCMeta, abstractmethod -from six import StringIO +from io import StringIO import time import types import arff import scipy.sparse -import six from autosklearn.util.logging_ import get_logger @@ -108,7 +107,7 @@ def dumps(self): def dump(self, path_or_filehandle): output = self._get_arff() - if isinstance(path_or_filehandle, six.string_types): + if isinstance(path_or_filehandle, str): with open(path_or_filehandle, "w") as fh: arff.dump(output, fh) else: @@ -117,7 +116,7 @@ def dump(self, path_or_filehandle): @classmethod def load(cls, path_or_filehandle): - if isinstance(path_or_filehandle, six.string_types): + if isinstance(path_or_filehandle, str): with open(path_or_filehandle) as fh: input = arff.load(fh) else: diff --git a/autosklearn/metalearning/mismbo.py b/autosklearn/metalearning/mismbo.py index 12d8111382..a60b4ca42b 100644 --- a/autosklearn/metalearning/mismbo.py +++ b/autosklearn/metalearning/mismbo.py @@ -1,13 +1,6 @@ # -*- encoding: utf-8 -*- -import os import time -from six import StringIO - -import numpy as 
np -from autosklearn.metalearning.metafeatures.metafeatures import \ - calculate_all_metafeatures_with_labels, \ - calculate_all_metafeatures_encoded_labels, subsets from autosklearn.metalearning.optimizers.metalearn_optimizer.metalearner \ import MetaLearningOptimizer diff --git a/autosklearn/metalearning/optimizers/optimizer_base.py b/autosklearn/metalearning/optimizers/optimizer_base.py index 6a24c20381..3e2ad92f76 100644 --- a/autosklearn/metalearning/optimizers/optimizer_base.py +++ b/autosklearn/metalearning/optimizers/optimizer_base.py @@ -1,6 +1,6 @@ from collections import OrderedDict from itertools import product -from six import StringIO +from io import StringIO import subprocess @@ -90,4 +90,4 @@ def build_grid(hyperparameters): parameter_tuples = zip(hyperparameters.keys(), parameters) parameter_dict = dict(parameter_tuples) parameter_dicts.append(parameter_dict) - return parameter_dicts \ No newline at end of file + return parameter_dicts diff --git a/autosklearn/util/common.py b/autosklearn/util/common.py index f11ce63978..6728961644 100644 --- a/autosklearn/util/common.py +++ b/autosklearn/util/common.py @@ -1,7 +1,6 @@ # -*- encoding: utf-8 -*- import os -from sklearn.externals import six import warnings __all__ = [ @@ -15,7 +14,7 @@ def warn_if_not_float(X, estimator='This algorithm'): Returns True if a warning was raised (i.e. the input is not float) and False otherwise, for easier input validation. """ - if not isinstance(estimator, six.string_types): + if not isinstance(estimator, str): estimator = estimator.__class__.__name__ if X.dtype.kind != 'f': warnings.warn("%s assumes floating point values as input, " @@ -58,4 +57,4 @@ def check_for_bool(p): elif check_true(p): return True else: - raise ValueError("%s is not a bool" % str(p)) \ No newline at end of file + raise ValueError("%s is not a bool" % str(p)) diff --git a/autosklearn/util/data.py b/autosklearn/util/data.py index 0d8e87ddde..5bae60c99d 100644 --- a/autosklearn/util/data.py +++ b/autosklearn/util/data.py @@ -2,8 +2,6 @@ # Functions performing various data conversions for the ChaLearn AutoML # challenge -from six.moves import range - import numpy as np __all__ = [ diff --git a/requirements.txt b/requirements.txt index 994063a6b4..b2cc529a3c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,5 @@ setuptools nose - -six Cython numpy>=1.9.0<=1.14.5 diff --git a/setup.py b/setup.py index d72d364c8d..223977cf45 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,6 @@ requirements = [ "setuptools", "nose", - "six", "Cython", # Numpy version of higher than 1.14.5 causes libgcc_s.so.1 error. 
"numpy>=1.9.0<=1.14.5", diff --git a/test/test_metalearning/pyMetaLearn/test_meta_features.py b/test/test_metalearning/pyMetaLearn/test_meta_features.py index 0634298ec1..b89808d122 100644 --- a/test/test_metalearning/pyMetaLearn/test_meta_features.py +++ b/test/test_metalearning/pyMetaLearn/test_meta_features.py @@ -1,6 +1,6 @@ import os import tempfile -from six import StringIO +from io import StringIO from unittest import TestCase import unittest @@ -479,4 +479,4 @@ def test_calculate_all_metafeatures_multilabel(self): t = unittest.TestLoader().loadTestsFromName( "pyMetaLearn.metafeatures.test_meta_features.TestMetaFeatures" ".test_calculate_all_metafeatures") - unittest.TextTestRunner(verbosity=2).run(t) \ No newline at end of file + unittest.TextTestRunner(verbosity=2).run(t) diff --git a/test/test_metalearning/pyMetaLearn/test_meta_features_sparse.py b/test/test_metalearning/pyMetaLearn/test_meta_features_sparse.py index e837adfc1c..f43f4b4c8f 100644 --- a/test/test_metalearning/pyMetaLearn/test_meta_features_sparse.py +++ b/test/test_metalearning/pyMetaLearn/test_meta_features_sparse.py @@ -1,4 +1,4 @@ -from six import StringIO +from io import StringIO import os import sys import unittest From 66d88942879a752a82110efa725c6c1e0c33721c Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Mon, 12 Nov 2018 13:57:21 +0100 Subject: [PATCH 03/11] FIX xgboost fixtures --- .../classification/xgradient_boosting.py | 8 +---- .../data_preprocessing/balancing/balancing.py | 6 ++-- .../regression/xgradient_boosting.py | 6 +--- .../components/classification/test_base.py | 4 +-- .../classification/test_xgradient_boosting.py | 2 +- .../data_preprocessing/test_balancing.py | 33 ++++++++++--------- .../regression/test_xgradient_boosting.py | 16 ++++----- 7 files changed, 35 insertions(+), 40 deletions(-) diff --git a/autosklearn/pipeline/components/classification/xgradient_boosting.py b/autosklearn/pipeline/components/classification/xgradient_boosting.py index c359a26b8a..eb8f2d1077 100644 --- a/autosklearn/pipeline/components/classification/xgradient_boosting.py +++ b/autosklearn/pipeline/components/classification/xgradient_boosting.py @@ -29,6 +29,7 @@ def __init__(self, # (Conditional) DART Hyperparameters sample_type=None, normalize_type=None, rate_drop=None, ): + self.seed = random_state.randint(0, 10000) self.learning_rate = learning_rate self.n_estimators = n_estimators @@ -60,12 +61,6 @@ def __init__(self, else: self.silent = True - # Random number seed. - if random_state is None: - self.seed = 1 - else: - self.seed = random_state.randint(1, 10000, size=1)[0] - ## new paramaters # Subsample ratio of columns when constructing each tree. self.colsample_bytree = colsample_bytree @@ -158,7 +153,6 @@ def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None): reg_alpha=self.reg_alpha, reg_lambda=self.reg_lambda, base_score=self.base_score, - seed=self.seed, random_state=self.seed, **self.booster_args ) diff --git a/autosklearn/pipeline/components/data_preprocessing/balancing/balancing.py b/autosklearn/pipeline/components/data_preprocessing/balancing/balancing.py index 9105412ab5..78fcdc80c5 100644 --- a/autosklearn/pipeline/components/data_preprocessing/balancing/balancing.py +++ b/autosklearn/pipeline/components/data_preprocessing/balancing/balancing.py @@ -42,8 +42,10 @@ def get_weights(self, Y, classifier, preprocessor, init_params, fit_params): Y_ = Y unique, counts = np.unique(Y_, return_counts=True) - cw = 1. 
/ counts - cw = cw / np.mean(cw) + # This will result in an average weight of 1! + cw = 1 / (counts / np.sum(counts)) / 2 + if len(Y.shape) == 2: + cw /= Y.shape[1] sample_weights = np.ones(Y_.shape) diff --git a/autosklearn/pipeline/components/regression/xgradient_boosting.py b/autosklearn/pipeline/components/regression/xgradient_boosting.py index 09f943446e..0fccaec710 100644 --- a/autosklearn/pipeline/components/regression/xgradient_boosting.py +++ b/autosklearn/pipeline/components/regression/xgradient_boosting.py @@ -62,10 +62,7 @@ def __init__(self, self.silent = True # Random number seed. - if random_state is None: - self.seed = 1 - else: - self.seed = random_state.randint(1, 10000, size=1)[0] + self.seed = random_state.randint(1, 10000, size=1)[0] ## new paramaters # Subsample ratio of columns when constructing each tree. @@ -152,7 +149,6 @@ def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None): reg_alpha=self.reg_alpha, reg_lambda=self.reg_lambda, base_score=self.base_score, - seed=self.seed, random_state=self.seed, **self.booster_args ) diff --git a/test/test_pipeline/components/classification/test_base.py b/test/test_pipeline/components/classification/test_base.py index 9b6bc54c38..428735b6c2 100644 --- a/test/test_pipeline/components/classification/test_base.py +++ b/test/test_pipeline/components/classification/test_base.py @@ -202,7 +202,7 @@ def check_classifier(cls): params.append(p) if i > 0: - self.assertEquals( + self.assertEqual( params[-1], params[0], ) @@ -221,4 +221,4 @@ def check_classifier(cls): classifier = classifier(random_state=np.random.RandomState(1), **{hp_name: config[hp_name] for hp_name in config if config[hp_name] is not None}) - check_classifier(classifier) \ No newline at end of file + check_classifier(classifier) diff --git a/test/test_pipeline/components/classification/test_xgradient_boosting.py b/test/test_pipeline/components/classification/test_xgradient_boosting.py index a83be5082f..9b9df8968d 100644 --- a/test/test_pipeline/components/classification/test_xgradient_boosting.py +++ b/test/test_pipeline/components/classification/test_xgradient_boosting.py @@ -13,7 +13,7 @@ class XGradientBoostingComponentTest(BaseClassificationComponentTest): res["default_iris"] = 0.94 res["iris_n_calls"] = 6 res["default_iris_iterative"] = 0.94 - res["default_iris_proba"] = 0.1512353178486228 + res["default_iris_proba"] = 0.15122245267033577 res["default_iris_sparse"] = 0.74 res["default_digits"] = 0.8160291438979964 res["digits_n_calls"] = 7 diff --git a/test/test_pipeline/components/data_preprocessing/test_balancing.py b/test/test_pipeline/components/data_preprocessing/test_balancing.py index db37965195..93b66dcb66 100644 --- a/test/test_pipeline/components/data_preprocessing/test_balancing.py +++ b/test/test_pipeline/components/data_preprocessing/test_balancing.py @@ -30,12 +30,13 @@ def test_balancing_get_weights_treed_single_label(self): balancing = Balancing(strategy='weighting') init_params, fit_params = balancing.get_weights( Y, 'adaboost', None, None, None) - self.assertTrue(np.allclose(fit_params['classifier:sample_weight'], - np.array([0.4] * 80 + [1.6] * 20))) - #init_params, fit_params = balancing.get_weights( - # Y, None, 'extra_trees_preproc_for_classification', None, None) - #self.assertTrue(np.allclose(fit_params['preprocessor:sample_weight'], - # np.array([0.4] * 80 + [1.6] * 20))) + self.assertAlmostEqual( + np.mean(fit_params['classifier:sample_weight']), 1, + ) + np.testing.assert_allclose( + 
fit_params['classifier:sample_weight'], + np.array([0.625] * 80 + [2.5] * 20), + ) def test_balancing_get_weights_treed_multilabel(self): Y = np.array([[0, 0, 0]] * 100 + [[1, 0, 0]] * 100 + [[0, 1, 0]] * 100 + @@ -43,12 +44,14 @@ def test_balancing_get_weights_treed_multilabel(self): balancing = Balancing(strategy='weighting') init_params, fit_params = balancing.get_weights( Y, 'adaboost', None, None, None) - self.assertTrue(np.allclose(fit_params['classifier:sample_weight'], - np.array([0.4] * 500 + [4.0] * 10))) - #init_params, fit_params = balancing.get_weights( - # Y, None, 'extra_trees_preproc_for_classification', None, None) - #self.assertTrue(np.allclose(fit_params['preprocessor:sample_weight'], - # np.array([0.4] * 500 + [4.0] * 10))) + print(fit_params['classifier:sample_weight']) + self.assertAlmostEqual( + np.mean(fit_params['classifier:sample_weight']), 1, + ) + np.testing.assert_allclose( + fit_params['classifier:sample_weight'], + np.array([0.85] * 500 + [8.5] * 10), + ) def test_balancing_get_weights_svm_sgd(self): Y = np.array([0] * 80 + [1] * 20) @@ -77,8 +80,8 @@ def test_weighting_effect(self): ('random_forest', RandomForest, 0.780, 0.789, 3), ('libsvm_svc', LibSVM_SVC, 0.769, 0.72, 3), ('liblinear_svc', LibLinear_SVC, 0.762, 0.735, 3), - ('passive_aggressive', PassiveAggressive, 0.642, 0.449, 3), - ('sgd', SGD, 0.818, 0.575, 2) + ('passive_aggressive', PassiveAggressive, 0.642, 0.444, 3), + ('sgd', SGD, 0.818, 0.567, 2) ]: for strategy, acc in [ ('none', acc_no_weighting), @@ -127,7 +130,7 @@ def test_weighting_effect(self): [('extra_trees_preproc_for_classification', ExtraTreesPreprocessorClassification, 0.810, 0.563), ('liblinear_svc_preprocessor', LibLinear_Preprocessor, - 0.837, 0.567)]: + 0.837, 0.576)]: for strategy, acc in [('none', acc_no_weighting), ('weighting', acc_weighting)]: data_ = copy.copy(data) diff --git a/test/test_pipeline/components/regression/test_xgradient_boosting.py b/test/test_pipeline/components/regression/test_xgradient_boosting.py index 5bb0c3ea95..a6a8f856a8 100644 --- a/test/test_pipeline/components/regression/test_xgradient_boosting.py +++ b/test/test_pipeline/components/regression/test_xgradient_boosting.py @@ -10,16 +10,16 @@ class XGradientBoostingComponentTest(BaseRegressionComponentTest): __test__ = True res = dict() - res["default_boston"] = 0.8155209334566791 + res["default_boston"] = 0.7950690273856177 res["boston_n_calls"] = 7 - res["default_boston_iterative"] = 0.8155209334566791 - res["default_boston_sparse"] = 0.5734978224089335 - res["default_boston_iterative_sparse"] = 0.5734978224089335 - res["default_diabetes"] = 0.29100776654206073 + res["default_boston_iterative"] = 0.7950690350340658 + res["default_boston_sparse"] = 0.4089636428137894 + res["default_boston_iterative_sparse"] = 0.40896364129803287 + res["default_diabetes"] = 0.3252009519763832 res["diabetes_n_calls"] = 7 - res["default_diabetes_iterative"] = 0.29100776654206073 - res["default_diabetes_sparse"] = 0.1996773189850003 - res["default_diabetes_iterative_sparse"] = 0.1996773189850003 + res["default_diabetes_iterative"] = 0.3252009519763832 + res["default_diabetes_sparse"] = 0.15356041856907898 + res["default_diabetes_iterative_sparse"] = 0.15356041856907898 res['ignore_hps'] = ['n_estimators'] sk_mod = autosklearn.pipeline.implementations.xgb.CustomXGBRegressor From 8bdcba15caa28cb4336d9cb6ee4108078ab6d8a2 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Mon, 12 Nov 2018 13:58:14 +0100 Subject: [PATCH 04/11] FIXES #558 - use less memory for ensemble 
builder, allow setting memory limit --- autosklearn/automl.py | 29 ++++++++++++--------- autosklearn/ensembles/ensemble_selection.py | 7 ++++- autosklearn/estimators.py | 14 +++++++--- 3 files changed, 34 insertions(+), 16 deletions(-) diff --git a/autosklearn/automl.py b/autosklearn/automl.py index 91d66d4bd2..de76fd682f 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -66,6 +66,7 @@ def __init__(self, initial_configurations_via_metalearning=25, ensemble_size=1, ensemble_nbest=1, + ensemble_memory_limit=1000, seed=1, ml_memory_limit=3072, metadata_directory=None, @@ -94,6 +95,7 @@ def __init__(self, initial_configurations_via_metalearning self._ensemble_size = ensemble_size self._ensemble_nbest = ensemble_nbest + self._ensemble_memory_limit = ensemble_memory_limit self._seed = seed self._ml_memory_limit = ml_memory_limit self._data_memory_limit = None @@ -635,18 +637,21 @@ def _get_ensemble_process(self, time_left_for_ensembles, else: self._ensemble_size = ensemble_size - return EnsembleBuilder(backend=self._backend, - dataset_name=dataset_name, - task_type=task, - metric=metric, - limit=time_left_for_ensembles, - ensemble_size=ensemble_size, - ensemble_nbest=ensemble_nbest, - seed=self._seed, - shared_mode=self._shared_mode, - precision=precision, - max_iterations=max_iterations, - read_at_most=np.inf) + return EnsembleBuilder( + backend=self._backend, + dataset_name=dataset_name, + task_type=task, + metric=metric, + limit=time_left_for_ensembles, + ensemble_size=ensemble_size, + ensemble_nbest=ensemble_nbest, + seed=self._seed, + shared_mode=self._shared_mode, + precision=precision, + max_iterations=max_iterations, + read_at_most=np.inf, + memory_limit=self._ensemble_memory_limit, + ) def _load_models(self): if self._shared_mode: diff --git a/autosklearn/ensembles/ensemble_selection.py b/autosklearn/ensembles/ensemble_selection.py index 6540d1488b..e6a43cbf2f 100644 --- a/autosklearn/ensembles/ensemble_selection.py +++ b/autosklearn/ensembles/ensemble_selection.py @@ -74,7 +74,12 @@ def _fast(self, predictions, labels): if s == 0: weighted_ensemble_prediction = np.zeros(predictions[0].shape) else: - ensemble_prediction = np.mean(np.array(ensemble), axis=0) + # Memory-efficient averaging! + ensemble_prediction = np.zeros(ensemble[0].shape) + for pred in ensemble: + ensemble_prediction += pred + ensemble_prediction /= s + weighted_ensemble_prediction = (s / float(s + 1)) * \ ensemble_prediction fant_ensemble_prediction = np.zeros(weighted_ensemble_prediction.shape) diff --git a/autosklearn/estimators.py b/autosklearn/estimators.py index 6adedd0f56..267087ef65 100644 --- a/autosklearn/estimators.py +++ b/autosklearn/estimators.py @@ -14,6 +14,7 @@ def __init__(self, initial_configurations_via_metalearning=25, ensemble_size=50, ensemble_nbest=50, + ensemble_memory_limit=1024, seed=1, ml_memory_limit=3072, include_estimators=None, @@ -63,6 +64,11 @@ def __init__(self, ensemble. Implements `Model Library Pruning` from `Getting the most out of ensemble selection`. + ensemble_memory_limit : int, optional (1024) + Memory limit in MB for the ensemble building process. + `auto-sklearn` will reduce the number of considered models + (``ensemble_nbest``) if the memory limit is reached. + seed : int, optional (default=1) Used to seed SMAC. Will determine the output file names. @@ -157,16 +163,16 @@ def __init__(self, optimization/validation set, which would later on be used to build an ensemble. 
* ``'model'`` : do not save any model files - + smac_scenario_args : dict, optional (None) Additional arguments inserted into the scenario of SMAC. See the `SMAC documentation `_ for a list of available arguments. - + get_smac_object_callback : callable Callback function to create an object of class `smac.optimizer.smbo.SMBO `_. - The function must accept the arguments ``scenario_dict``, + The function must accept the arguments ``scenario_dict``, ``instances``, ``num_params``, ``runhistory``, ``seed`` and ``ta``. This is an advanced feature. Use only if you are familiar with `SMAC `_. @@ -191,6 +197,7 @@ def __init__(self, self.initial_configurations_via_metalearning = initial_configurations_via_metalearning self.ensemble_size = ensemble_size self.ensemble_nbest = ensemble_nbest + self.ensemble_memory_limit = ensemble_memory_limit self.seed = seed self.ml_memory_limit = ml_memory_limit self.include_estimators = include_estimators @@ -236,6 +243,7 @@ def build_automl(self): self.initial_configurations_via_metalearning, ensemble_size=self.ensemble_size, ensemble_nbest=self.ensemble_nbest, + ensemble_memory_limit=self.ensemble_memory_limit, seed=self.seed, ml_memory_limit=self.ml_memory_limit, include_estimators=self.include_estimators, From 66d9f09e500ca9284cc1c170e7a07b289c181566 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Mon, 12 Nov 2018 14:15:48 +0100 Subject: [PATCH 05/11] FIXES #517 - add seed to ensemble builder --- autosklearn/automl.py | 1 + autosklearn/ensemble_builder.py | 127 +++++++++++--------- autosklearn/ensembles/ensemble_selection.py | 16 ++- 3 files changed, 81 insertions(+), 63 deletions(-) diff --git a/autosklearn/automl.py b/autosklearn/automl.py index de76fd682f..2173875166 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -651,6 +651,7 @@ def _get_ensemble_process(self, time_left_for_ensembles, max_iterations=max_iterations, read_at_most=np.inf, memory_limit=self._ensemble_memory_limit, + random_state=self._seed, ) def _load_models(self): diff --git a/autosklearn/ensemble_builder.py b/autosklearn/ensemble_builder.py index 6d96e30562..a719c42362 100644 --- a/autosklearn/ensemble_builder.py +++ b/autosklearn/ensemble_builder.py @@ -6,13 +6,15 @@ import re import time import traceback +from typing import Optional, Union import numpy as np import pynisher +from sklearn.utils.validation import check_random_state from autosklearn.util.backend import Backend from autosklearn.constants import BINARY_CLASSIFICATION -from autosklearn.metrics import calculate_score +from autosklearn.metrics import calculate_score, Scorer from autosklearn.ensembles.ensemble_selection import EnsembleSelection from autosklearn.ensembles.abstract_ensemble import AbstractEnsemble from autosklearn.util.logging_ import get_logger @@ -28,7 +30,7 @@ def __init__( backend: Backend, dataset_name: str, task_type: int, - metric: str, + metric: Scorer, limit: int, ensemble_size: int=10, ensemble_nbest: int=100, @@ -39,10 +41,11 @@ def __init__( sleep_duration: int=2, memory_limit: int=1000, read_at_most: int=5, + random_state: Optional[Union[int, np.random.RandomState]]=None, ): """ Constructor - + Parameters ---------- backend: util.backend.Backend @@ -68,12 +71,12 @@ def __init__( maximal number of iterations to run this script (default None --> deactivated) precision: ["16","32","64","128"] - precision of floats to read the predictions + precision of floats to read the predictions sleep_duration: int duration of sleeping time between two iterations of this script (in sec) 
memory_limit: int memory limit in mb - read_at_most: int + read_at_most: int read at most n new prediction files in each iteration """ @@ -93,7 +96,8 @@ def __init__( self.sleep_duration = sleep_duration self.memory_limit = memory_limit self.read_at_most = read_at_most - + self.random_state = check_random_state(random_state) + # part of the original training set # used to build the ensemble self.dir_ensemble = os.path.join( @@ -120,7 +124,7 @@ def __init__( self.start_time = 0 self.model_fn_re = re.compile(r'_([0-9]*)_([0-9]*)\.npy') - + # already read prediction files # {"file name": { # "ens_score": float @@ -167,7 +171,7 @@ def main(self): self.start_time = time.time() iteration = 0 - + while True: #maximal number of iterations @@ -176,29 +180,29 @@ def main(self): self.logger.info("Terminate ensemble building because of max iterations: %d of %d", self.max_iterations, iteration) - break - + break + used_time = time.time() - self.start_time self.logger.debug( 'Starting iteration %d, time left: %f', iteration, self.time_limit - used_time, ) - + # populates self.read_preds if not self.read_ensemble_preds(): time.sleep(self.sleep_duration) continue - + selected_models = self.get_n_best_preds() if not selected_models: # nothing selected continue - + # populates predictions in self.read_preds # reduces selected models if file reading failed n_sel_valid, n_sel_test = self.\ get_valid_test_preds(selected_keys=selected_models) - + selected_models_set = set(selected_models) if selected_models_set.intersection(n_sel_test): selected_models = list(selected_models_set.intersection(n_sel_test)) @@ -207,12 +211,12 @@ def main(self): else: # use selected_models only defined by ensemble data set pass - + # train ensemble ensemble = self.fit_ensemble(selected_keys=selected_models) - + if ensemble is not None: - + self.predict(set_="valid", ensemble=ensemble, selected_keys=n_sel_valid, @@ -220,22 +224,22 @@ def main(self): index_run=iteration) # TODO if predictions fails, build the model again during the # next iteration! 
- self.predict(set_="test", - ensemble=ensemble, - selected_keys=n_sel_test, - n_preds=len(selected_models), + self.predict(set_="test", + ensemble=ensemble, + selected_keys=n_sel_test, + n_preds=len(selected_models), index_run=iteration) iteration += 1 else: time.sleep(self.sleep_duration) - + def read_ensemble_preds(self): """ - reading predictions on ensemble building data set; + reading predictions on ensemble building data set; populates self.read_preds """ self.logger.debug("Read ensemble data set predictions") - + if self.y_true_ensemble is None: try: self.y_true_ensemble = self.backend.load_targets_ensemble() @@ -245,12 +249,12 @@ def read_ensemble_preds(self): traceback.format_exc(), ) return False - + # no validation predictions so far -- no dir if not os.path.isdir(self.dir_ensemble): self.logger.debug("No ensemble dataset prediction directory found") return False - + if self.shared_mode is False: pred_path = os.path.join( self.dir_ensemble, @@ -267,23 +271,23 @@ def read_ensemble_preds(self): self.logger.debug("Found no prediction files on ensemble data set:" " %s" % pred_path) return False - + n_read_files = 0 for y_ens_fn in y_ens_files: - + if self.read_at_most and n_read_files >= self.read_at_most: - # limit the number of files that will be read + # limit the number of files that will be read # to limit memory consumption break - + if not y_ens_fn.endswith(".npy"): self.logger.info('Error loading file (not .npy): %s', y_ens_fn) continue - + match = self.model_fn_re.search(y_ens_fn) _seed = int(match.group(1)) _num_run = int(match.group(2)) - + if not self.read_preds.get(y_ens_fn): self.read_preds[y_ens_fn] = { "ens_score": -1, @@ -301,7 +305,7 @@ def read_ensemble_preds(self): # 2 - loaded but dropped again "loaded": 0 } - + if self.read_preds[y_ens_fn]["mtime_ens"] == os.path.getmtime(y_ens_fn): # same time stamp; nothing changed; continue @@ -351,13 +355,13 @@ def read_ensemble_preds(self): np.sum([pred["loaded"] > 0 for pred in self.read_preds.values()]) ) return True - + def get_n_best_preds(self): """ get best n predictions (i.e., keys of self.read_preds) - according to score on "ensemble set" + according to score on "ensemble set" n: self.ensemble_nbest - + Side effect: delete predictions of non-winning models """ @@ -377,7 +381,7 @@ def get_n_best_preds(self): sorted_keys = filter(lambda x: x[1] > dummy_score[1], sorted_keys) # remove Dummy Classifier sorted_keys = list(filter(lambda x: x[2] > 1, sorted_keys)) - if not sorted_keys: + if not sorted_keys: # no model left; try to use dummy score (num_run==0) self.logger.warning("No models better than random - " "using Dummy Score!") @@ -446,7 +450,7 @@ def get_valid_test_preds(self, selected_keys: list): """ success_keys_valid = [] success_keys_test = [] - + for k in selected_keys: valid_fn = glob.glob( os.path.join(self.dir_valid, 'predictions_valid_%d_%d.npy' @@ -456,7 +460,7 @@ def get_valid_test_preds(self, selected_keys: list): os.path.join(self.dir_test, 'predictions_test_%d_%d.npy' % (self.read_preds[k]["seed"], self.read_preds[k]["num_run"]))) - + # TODO don't read valid and test if not changed if len(valid_fn) == 0: # self.logger.debug("Not found validation prediction file " @@ -478,7 +482,7 @@ def get_valid_test_preds(self, selected_keys: list): except Exception as e: self.logger.warning('Error loading %s: %s', valid_fn, traceback.format_exc()) - + if len(test_fn) == 0: # self.logger.debug("Not found test prediction file (although " # "ensemble predictions available):%s" % @@ -500,18 +504,18 @@ def 
get_valid_test_preds(self, selected_keys: list): except Exception as e: self.logger.warning('Error loading %s: %s', test_fn, traceback.format_exc()) - + return success_keys_valid, success_keys_test - + def fit_ensemble(self, selected_keys:list): """ - fit ensemble - + fit ensemble + Parameters --------- selected_keys: list list of selected keys of self.read_preds - + Returns ------- ensemble: EnsembleSelection @@ -520,7 +524,7 @@ def fit_ensemble(self, selected_keys:list): predictions_train = np.array([self.read_preds[k][Y_ENSEMBLE] for k in selected_keys]) include_num_runs = [(self.read_preds[k]["seed"], self.read_preds[k]["num_run"]) for k in selected_keys] - + # check hash if ensemble training data changed current_hash = hash(predictions_train.data.tobytes()) if self.last_hash == current_hash: @@ -531,11 +535,14 @@ def fit_ensemble(self, selected_keys:list): ) return None self.last_hash = current_hash - - ensemble = EnsembleSelection(ensemble_size=self.ensemble_size, - task_type=self.task_type, - metric=self.metric) - + + ensemble = EnsembleSelection( + ensemble_size=self.ensemble_size, + task_type=self.task_type, + metric=self.metric, + random_state=self.random_state, + ) + try: self.logger.debug( "Fitting the ensemble on %d models.", @@ -563,17 +570,17 @@ def fit_ensemble(self, selected_keys:list): self.logger.error('Caught IndexError: %s' + traceback.format_exc()) time.sleep(self.sleep_duration) return None - + return ensemble - + def predict(self, set_: str, ensemble: AbstractEnsemble, - selected_keys: list, - n_preds:int, + selected_keys: list, + n_preds:int, index_run:int): """ save preditions on ensemble, validation and test data on disc - + Parameters ---------- set_: ["valid","test"] @@ -587,13 +594,13 @@ def predict(self, set_: str, same number of predictions on valid and test are necessary index_run: int n-th time that ensemble predictions are written to disc - + Return ------ y: np.ndarray """ self.logger.debug("Predicting the %s set with the ensemble!", set_) - + # Save the ensemble for later use in the main auto-sklearn module! 
if self.SAVE2DISC: self.backend.save_ensemble(ensemble, index_run, self.seed) @@ -602,7 +609,7 @@ def predict(self, set_: str, self.read_preds[k][Y_VALID if set_ == 'valid' else Y_TEST] for k in selected_keys ]) - + if n_preds == predictions.shape[0]: y = ensemble.predict(predictions) if self.task_type == BINARY_CLASSIFICATION: @@ -626,7 +633,7 @@ def predict(self, set_: str, ) return None # TODO: ADD saving of predictions on "ensemble data" - + def _read_np_fn(self, fp): if self.precision is "16": predictions = np.load(fp).astype(dtype=np.float16) diff --git a/autosklearn/ensembles/ensemble_selection.py b/autosklearn/ensembles/ensemble_selection.py index e6a43cbf2f..9c0b7be849 100644 --- a/autosklearn/ensembles/ensemble_selection.py +++ b/autosklearn/ensembles/ensemble_selection.py @@ -2,6 +2,7 @@ import random import numpy as np +from sklearn.utils.validation import check_random_state from autosklearn.constants import * from autosklearn.ensembles.abstract_ensemble import AbstractEnsemble @@ -10,14 +11,23 @@ class EnsembleSelection(AbstractEnsemble): - def __init__(self, ensemble_size, task_type, metric, - sorted_initialization=False, bagging=False, mode='fast'): + def __init__( + self, + ensemble_size: int, + task_type: int, + metric: Scorer, + sorted_initialization: bool=False, + bagging: bool=False, + mode: str='fast', + random_state: np.random.RandomState=None, + ): self.ensemble_size = ensemble_size self.task_type = task_type self.metric = metric self.sorted_initialization = sorted_initialization self.bagging = bagging self.mode = mode + self.random_state = random_state def fit(self, predictions, labels, identifiers): self.ensemble_size = int(self.ensemble_size) @@ -96,7 +106,7 @@ def _fast(self, predictions, labels): all_scoring_functions=False) all_best = np.argwhere(scores == np.nanmin(scores)).flatten() - best = np.random.choice(all_best) + best = self.random_state.choice(all_best) ensemble.append(predictions[best]) trajectory.append(scores[best]) order.append(best) From b58220607d09b5e709478dd8c7a04096e5c28dd9 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Thu, 29 Nov 2018 17:32:29 +0100 Subject: [PATCH 06/11] MAINT fix smac version to 0.8 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index b2cc529a3c..61d2f92ef6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,4 +18,4 @@ pandas ConfigSpace>=0.4.0,<0.5 pynisher>=0.4 pyrfr>=0.7,<0.8 -smac +smac==0.8 From 78003ac4d300c8a85131e667dbd73ccf7720dabf Mon Sep 17 00:00:00 2001 From: Diogo Bastos Date: Mon, 3 Dec 2018 09:38:02 +0000 Subject: [PATCH 07/11] Refactor hyperparameter name 'max_depth' to 'max_depth_factor' (#590) * refactor: changed max_depth parameter name * Changed the parameter name 'max_depth' to 'max_depth_factor' in the decision_tree.py file in both the 'regression/decision_tree' and 'classification/decision_tree' folders. Related to: #569 * fix: fixed invalid parameter name * Fixed invalid parameter name of 'max_depth_factor' to 'max_depth' while initializing the sklearn's DecisionTreeClassifier and DecisionTreeRegressor. Related to: #569 * refactor: added line breaks * Added line breaks in decision_tree.py files to enforce the rule that no line should have more then 79 characters. 
Related to: #569 * fix: removed trailing whitespaces * fix: replaced tabs with spaces * fix: fixed identation * fix: fixed identation * fix: reverted incorrect Makefile changes * fix: reverted Makefile change --- .../classification/decision_tree.py | 22 ++++++++++--------- .../components/regression/decision_tree.py | 22 ++++++++++--------- 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/autosklearn/pipeline/components/classification/decision_tree.py b/autosklearn/pipeline/components/classification/decision_tree.py index 245a0fe0c0..4b33a7f88f 100644 --- a/autosklearn/pipeline/components/classification/decision_tree.py +++ b/autosklearn/pipeline/components/classification/decision_tree.py @@ -13,13 +13,13 @@ class DecisionTree(AutoSklearnClassificationAlgorithm): - def __init__(self, criterion, max_features, max_depth, + def __init__(self, criterion, max_features, max_depth_factor, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, max_leaf_nodes, min_impurity_decrease, class_weight=None, random_state=None): self.criterion = criterion self.max_features = max_features - self.max_depth = max_depth + self.max_depth_factor = max_depth_factor self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf self.max_leaf_nodes = max_leaf_nodes @@ -34,12 +34,14 @@ def fit(self, X, y, sample_weight=None): self.max_features = float(self.max_features) # Heuristic to set the tree depth - if check_none(self.max_depth): - max_depth = self.max_depth = None + if check_none(self.max_depth_factor): + max_depth_factor = self.max_depth_factor = None else: num_features = X.shape[1] - self.max_depth = int(self.max_depth) - max_depth = max(1, int(np.round(self.max_depth * num_features, 0))) + self.max_depth_factor = int(self.max_depth_factor) + max_depth_factor = max( + 1, + int(np.round(self.max_depth_factor * num_features, 0))) self.min_samples_split = int(self.min_samples_split) self.min_samples_leaf = int(self.min_samples_leaf) if check_none(self.max_leaf_nodes): @@ -51,7 +53,7 @@ def fit(self, X, y, sample_weight=None): self.estimator = DecisionTreeClassifier( criterion=self.criterion, - max_depth=max_depth, + max_depth=max_depth_factor, min_samples_split=self.min_samples_split, min_samples_leaf=self.min_samples_leaf, max_leaf_nodes=self.max_leaf_nodes, @@ -92,8 +94,8 @@ def get_hyperparameter_search_space(dataset_properties=None): criterion = CategoricalHyperparameter( "criterion", ["gini", "entropy"], default_value="gini") - max_depth = UniformFloatHyperparameter( - 'max_depth', 0., 2., default_value=0.5) + max_depth_factor = UniformFloatHyperparameter( + 'max_depth_factor', 0., 2., default_value=0.5) min_samples_split = UniformIntegerHyperparameter( "min_samples_split", 2, 20, default_value=2) min_samples_leaf = UniformIntegerHyperparameter( @@ -103,7 +105,7 @@ def get_hyperparameter_search_space(dataset_properties=None): max_leaf_nodes = UnParametrizedHyperparameter("max_leaf_nodes", "None") min_impurity_decrease = UnParametrizedHyperparameter('min_impurity_decrease', 0.0) - cs.add_hyperparameters([criterion, max_features, max_depth, + cs.add_hyperparameters([criterion, max_features, max_depth_factor, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, max_leaf_nodes, min_impurity_decrease]) diff --git a/autosklearn/pipeline/components/regression/decision_tree.py b/autosklearn/pipeline/components/regression/decision_tree.py index fdd73e8603..eb17fd12a5 100644 --- a/autosklearn/pipeline/components/regression/decision_tree.py +++ 
b/autosklearn/pipeline/components/regression/decision_tree.py @@ -11,12 +11,12 @@ from autosklearn.util.common import check_none class DecisionTree(AutoSklearnRegressionAlgorithm): - def __init__(self, criterion, max_features, max_depth, + def __init__(self, criterion, max_features, max_depth_factor, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, max_leaf_nodes, min_impurity_decrease, random_state=None): self.criterion = criterion self.max_features = max_features - self.max_depth = max_depth + self.max_depth_factor = max_depth_factor self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf self.max_leaf_nodes = max_leaf_nodes @@ -29,12 +29,14 @@ def fit(self, X, y, sample_weight=None): from sklearn.tree import DecisionTreeRegressor self.max_features = float(self.max_features) - if check_none(self.max_depth): - max_depth = self.max_depth = None + if check_none(self.max_depth_factor): + max_depth_factor = self.max_depth_factor = None else: num_features = X.shape[1] - self.max_depth = int(self.max_depth) - max_depth = max(1, int(np.round(self.max_depth * num_features, 0))) + self.max_depth_factor = int(self.max_depth_factor) + max_depth_factor = max( + 1, + int(np.round(self.max_depth_factor * num_features, 0))) self.min_samples_split = int(self.min_samples_split) self.min_samples_leaf = int(self.min_samples_leaf) if check_none(self.max_leaf_nodes): @@ -46,7 +48,7 @@ def fit(self, X, y, sample_weight=None): self.estimator = DecisionTreeRegressor( criterion=self.criterion, - max_depth=max_depth, + max_depth=max_depth_factor, min_samples_split=self.min_samples_split, min_samples_leaf=self.min_samples_leaf, max_leaf_nodes=self.max_leaf_nodes, @@ -80,8 +82,8 @@ def get_hyperparameter_search_space(dataset_properties=None): criterion = CategoricalHyperparameter('criterion', ['mse', 'friedman_mse', 'mae']) max_features = Constant('max_features', 1.0) - max_depth = UniformFloatHyperparameter( - 'max_depth', 0., 2., default_value=0.5) + max_depth_factor = UniformFloatHyperparameter( + 'max_depth_factor', 0., 2., default_value=0.5) min_samples_split = UniformIntegerHyperparameter( "min_samples_split", 2, 20, default_value=2) min_samples_leaf = UniformIntegerHyperparameter( @@ -91,7 +93,7 @@ def get_hyperparameter_search_space(dataset_properties=None): min_impurity_decrease = UnParametrizedHyperparameter( 'min_impurity_decrease', 0.0) - cs.add_hyperparameters([criterion, max_features, max_depth, + cs.add_hyperparameters([criterion, max_features, max_depth_factor, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, max_leaf_nodes, min_impurity_decrease]) From 5f598a6194e32de312d2e3746ae7cdd62ef09213 Mon Sep 17 00:00:00 2001 From: Zeyi Wen Date: Mon, 3 Dec 2018 17:40:36 +0800 Subject: [PATCH 08/11] Update requirements.txt (#592) add "," for numpy versions for backward compatibility --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 61d2f92ef6..dd69c4b371 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ setuptools nose Cython -numpy>=1.9.0<=1.14.5 +numpy>=1.9.0,<=1.14.5 scipy>=0.14.1 scikit-learn>=0.19,<0.20 From ff2a91c18a87d5745fcb2c103a9cc3ef481e17ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Teresa=20Concei=C3=A7=C3=A3o?= Date: Thu, 6 Dec 2018 09:49:32 +0000 Subject: [PATCH 09/11] Fix floating point issues (Issue #538) (#589) * Pass train_size and test_size as integers (number of samples) instead of floats (ratio of samples) * Changed assigment of train 
samples for general use in every cv case * Added Unit Tests * Changed for simpler solution. Remove 'raveling' for multilabel cases * Fix PEP8 errors * Addition Fix PEP8 errors * Addition Fix PEP8 errors * Delete competition_c_functions.c --- autosklearn/evaluation/train_evaluator.py | 16 +-- test/test_evaluation/test_train_evaluator.py | 111 ++++++++++++++++++- 2 files changed, 119 insertions(+), 8 deletions(-) diff --git a/autosklearn/evaluation/train_evaluator.py b/autosklearn/evaluation/train_evaluator.py index 465f3b3a8b..013af9ade5 100644 --- a/autosklearn/evaluation/train_evaluator.py +++ b/autosklearn/evaluation/train_evaluator.py @@ -540,30 +540,32 @@ def get_splitter(self, D): return cv - y = D.data['Y_train'].ravel() + y = D.data['Y_train'] shuffle = self.resampling_strategy_args.get('shuffle', True) train_size = 0.67 if self.resampling_strategy_args: train_size = self.resampling_strategy_args.get('train_size', train_size) - test_size = 1 - train_size + test_size = float("%.4f" % (1 - train_size)) + if D.info['task'] in CLASSIFICATION_TASKS and \ D.info['task'] != MULTILABEL_CLASSIFICATION: + y = y.ravel() if self.resampling_strategy in ['holdout', 'holdout-iterative-fit']: + if shuffle: try: cv = StratifiedShuffleSplit(n_splits=1, - train_size=train_size, test_size=test_size, random_state=1) test_cv = copy.deepcopy(cv) next(test_cv.split(y, y)) except ValueError as e: if 'The least populated class in y has only' in e.args[0]: - cv = ShuffleSplit(n_splits=1, train_size=train_size, - test_size=test_size, random_state=1) + cv = ShuffleSplit(n_splits=1, test_size=test_size, + random_state=1) else: raise e else: @@ -588,8 +590,8 @@ def get_splitter(self, D): 'holdout-iterative-fit']: # TODO shuffle not taken into account for this if shuffle: - cv = ShuffleSplit(n_splits=1, train_size=train_size, - test_size=test_size, random_state=1) + cv = ShuffleSplit(n_splits=1, test_size=test_size, + random_state=1) else: tmp_train_size = int(np.floor(train_size * y.shape[0])) test_fold = np.zeros(y.shape[0]) diff --git a/test/test_evaluation/test_train_evaluator.py b/test/test_evaluation/test_train_evaluator.py index d5b402f839..577cb2809a 100644 --- a/test/test_evaluation/test_train_evaluator.py +++ b/test/test_evaluation/test_train_evaluator.py @@ -23,7 +23,10 @@ eval_holdout, eval_iterative_holdout, eval_cv, eval_partial_cv from autosklearn.util import backend from autosklearn.util.pipeline import get_configuration_space -from autosklearn.constants import * +from autosklearn.constants import BINARY_CLASSIFICATION, \ + MULTILABEL_CLASSIFICATION,\ + MULTICLASS_CLASSIFICATION,\ + REGRESSION from autosklearn.metrics import accuracy, r2, f1_macro this_directory = os.path.dirname(__file__) @@ -1226,6 +1229,112 @@ def test_get_splitter_cv_object(self, te_mock): next(cv.split(D.data['Y_train'], D.data['Y_train'] , groups=evaluator.resampling_strategy_args['groups'])) + @unittest.mock.patch.object(TrainEvaluator, "__init__") + def test_holdout_split_size(self, te_mock): + te_mock.return_value = None + D = unittest.mock.Mock(spec=AbstractDataManager) + D.feat_type = [] + + evaluator = TrainEvaluator() + evaluator.resampling_strategy = 'holdout' + + # Exact Ratio + D.data = dict(Y_train=np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])) + D.info = dict(task=BINARY_CLASSIFICATION) + evaluator.resampling_strategy_args = {'shuffle': True, + 'train_size': 0.7} + cv = evaluator.get_splitter(D) + + self.assertEqual(cv.get_n_splits(), 1) + train_samples, test_samples = next(cv.split(D.data['Y_train'], + 
D.data['Y_train'])) + self.assertEqual(len(train_samples), 7) + self.assertEqual(len(test_samples), 3) + + # No Shuffle + evaluator.resampling_strategy_args = {'shuffle': False, + 'train_size': 0.7} + cv = evaluator.get_splitter(D) + + self.assertEqual(cv.get_n_splits(), 1) + train_samples, test_samples = next(cv.split(D.data['Y_train'], + D.data['Y_train'])) + self.assertEqual(len(train_samples), 7) + self.assertEqual(len(test_samples), 3) + + # Rounded Ratio + D.data = dict(Y_train=np.array([0, 0, 0, 0, 0, 1, 1, 1, 1])) + + evaluator.resampling_strategy_args = {'shuffle': True, + 'train_size': 0.7} + cv = evaluator.get_splitter(D) + + self.assertEqual(cv.get_n_splits(), 1) + train_samples, test_samples = next(cv.split(D.data['Y_train'], + D.data['Y_train'])) + self.assertEqual(len(train_samples), 6) + self.assertEqual(len(test_samples), 3) + + # Rounded Ratio No Shuffle + evaluator.resampling_strategy_args = {'shuffle': False, + 'train_size': 0.7} + cv = evaluator.get_splitter(D) + + self.assertEqual(cv.get_n_splits(), 1) + train_samples, test_samples = next(cv.split(D.data['Y_train'], + D.data['Y_train'])) + self.assertEqual(len(train_samples), 6) + self.assertEqual(len(test_samples), 3) + + # More data + evaluator.resampling_strategy_args = {'shuffle': True, + 'train_size': 0.7} + + D.data = dict(Y_train=np.zeros((900, 1))) + cv = evaluator.get_splitter(D) + self.assertEqual(cv.get_n_splits(), 1) + train_samples, test_samples = next(cv.split(D.data['Y_train'], + D.data['Y_train'])) + self.assertEqual(len(train_samples), 630) + self.assertEqual(len(test_samples), 270) + + evaluator.resampling_strategy_args = {'train_size': 0.752} + D.data = dict(Y_train=np.zeros((900, 1))) + cv = evaluator.get_splitter(D) + self.assertEqual(cv.get_n_splits(), 1) + train_samples, test_samples = next(cv.split(D.data['Y_train'], + D.data['Y_train'])) + self.assertEqual(len(train_samples), 676) + self.assertEqual(len(test_samples), 224) + + # Multilabel Exact Ratio + D.data = dict(Y_train=np.array([[0, 0], [0, 1], [1, 1], [1, 0], [1, 1], + [1, 1], [1, 1], [1, 0], [1, 1], [1, 1]] + )) + D.info = dict(task=MULTILABEL_CLASSIFICATION) + evaluator.resampling_strategy_args = {'shuffle': True, + 'train_size': 0.7} + cv = evaluator.get_splitter(D) + + self.assertEqual(cv.get_n_splits(), 1) + train_samples, test_samples = next(cv.split(D.data['Y_train'], + D.data['Y_train'])) + self.assertEqual(len(train_samples), 7) + self.assertEqual(len(test_samples), 3) + + # Multilabel No Shuffle + D.data = dict(Y_train=np.array([[0, 0], [0, 1], [1, 1], [1, 0], [1, 1], + [1, 1], [1, 1], [1, 0], [1, 1]])) + evaluator.resampling_strategy_args = {'shuffle': False, + 'train_size': 0.7} + cv = evaluator.get_splitter(D) + + self.assertEqual(cv.get_n_splits(), 1) + train_samples, test_samples = next(cv.split(D.data['Y_train'], + D.data['Y_train'])) + self.assertEqual(len(train_samples), 6) + self.assertEqual(len(test_samples), 3) + class FunctionsTest(unittest.TestCase): def setUp(self): From b53c7e1fc510eaa3154c933e549de1580425a144 Mon Sep 17 00:00:00 2001 From: JinWoo <31531627+ahn1340@users.noreply.github.com> Date: Thu, 6 Dec 2018 16:31:28 +0100 Subject: [PATCH 10/11] Fix classifier bug (#585) * . * . * AutoSklearnClassifier/Regressor's fit, refit, fit_ensemble now return self. * Initial commit. Work in Progress. * Fix minor printing error in sprint_statistics. * Revert "Fix#460" * Raise error if ensemble is not built (#480) * . * . * AutoSklearnClassifier/Regressor's fit, refit, fit_ensemble now return self. * Initial commit. 
Work in Progress. * Fix minor printing error in sprint_statistics. * Revert "Fix#460" * Resolve rebase conflict * combined unittests to reduce travis runtime * . * . * . * . * . * Check target type at the beginning of the fitting process. * . * Fixed minor error in uniitest * . * Add unittest for target type checking. * . * . * [Debug] try with numpy version 1.14.5 * [Debug] Check if numpy version 1.14.6 raises error. * Check target type at the beginning of the fitting process. * . * Fixed minor error in uniitest * . * Add unittest for target type checking. * . * . * [Debug] Check if numpy version 1.14.6 raises error. * Fix numpy version to 1.14.5 * Add comment to Mock in test_type_of_target * Fix line length in example_parallel.py * Fix minor error * FIX classifier returning prediction larger than 1 * Remove comments * ADD unittest for ensemble_selection.predict() * minor FIX * ADD assertion in predict_proba to check probabilities sum up to 1. * REVERT changes in autosklearn/ensemble_builder.py * simplify ensemble prediction method * Modify assertion statement * ADD case check in ensemble_selection.predict() * Fix minor error in pred_probs verficiation. * Modify unittest for ensemble_selection.predict() * FIX flake8 errors * FIX flake8 error * ADD Ignore assertion for multilabel, check probabilities lie between [0, 1]. * Debug flake8 error --- autosklearn/ensembles/ensemble_selection.py | 21 ++++-- autosklearn/estimators.py | 22 ++++++- test/test_ensemble_builder/test_ensemble.py | 71 ++++++++++++++++++++- 3 files changed, 106 insertions(+), 8 deletions(-) diff --git a/autosklearn/ensembles/ensemble_selection.py b/autosklearn/ensembles/ensemble_selection.py index 9c0b7be849..8677aa58cc 100644 --- a/autosklearn/ensembles/ensemble_selection.py +++ b/autosklearn/ensembles/ensemble_selection.py @@ -209,10 +209,23 @@ def _bagging(self, predictions, labels, fraction=0.5, n_bags=20): return np.array(order_of_each_bag) def predict(self, predictions): - non_null_weights = (weight for weight in self.weights_ if weight > 0) - for i, weight in enumerate(non_null_weights): - predictions[i] *= weight - return np.sum(predictions, axis=0) + predictions = np.asarray(predictions) + + # if predictions.shape[0] == len(self.weights_), + # predictions include those of zero-weight models. + if predictions.shape[0] == len(self.weights_): + return np.average(predictions, axis=0, weights=self.weights_) + + # if prediction model.shape[0] == len(non_null_weights), + # predictions do not include those of zero-weight models. + elif predictions.shape[0] == np.count_nonzero(self.weights_): + non_null_weights = [w for w in self.weights_ if w > 0] + return np.average(predictions, axis=0, weights=non_null_weights) + + # If none of the above applies, then something must have gone wrong. 
+ else: + raise ValueError("The dimensions of ensemble predictions" + " and ensemble weights do not match!") def __str__(self): return 'Ensemble Selection:\n\tTrajectory: %s\n\tMembers: %s' \ diff --git a/autosklearn/estimators.py b/autosklearn/estimators.py index 267087ef65..cba56f5995 100644 --- a/autosklearn/estimators.py +++ b/autosklearn/estimators.py @@ -1,5 +1,6 @@ # -*- encoding: utf-8 -*- from sklearn.base import BaseEstimator +import numpy as np from autosklearn.automl import AutoMLClassifier, AutoMLRegressor from autosklearn.util.backend import create @@ -486,6 +487,9 @@ def fit(self, X, y, raise ValueError("classification with data of type %s is" " not supported" % target_type) + # remember target type for using in predict_proba later. + self.target_type = target_type + super().fit( X=X, y=y, @@ -527,9 +531,25 @@ def predict_proba(self, X, batch_size=None, n_jobs=1): The predicted class probabilities. """ - return super().predict_proba( + pred_proba = super().predict_proba( X, batch_size=batch_size, n_jobs=n_jobs) + # Check if all probabilities sum up to 1. + # Assert only if target type is not multilabel-indicator. + if self.target_type not in ['multilabel-indicator']: + assert( + np.allclose( + np.sum(pred_proba, axis=1), + np.ones_like(pred_proba[:, 0])) + ), "prediction probability does not sum up to 1!" + + # Check that all probability values lie between 0 and 1. + assert( + (pred_proba >= 0).all() and (pred_proba <= 1).all() + ), "found prediction probability value outside of [0, 1]!" + + return pred_proba + def _get_automl_class(self): return AutoMLClassifier diff --git a/test/test_ensemble_builder/test_ensemble.py b/test/test_ensemble_builder/test_ensemble.py index 7adc7c5e3b..7515f75e6d 100644 --- a/test/test_ensemble_builder/test_ensemble.py +++ b/test/test_ensemble_builder/test_ensemble.py @@ -6,14 +6,14 @@ import unittest import unittest.mock +from autosklearn.metrics import roc_auc, accuracy +from autosklearn.ensembles.ensemble_selection import EnsembleSelection +from autosklearn.ensemble_builder import EnsembleBuilder, Y_VALID, Y_TEST import numpy as np this_directory = os.path.dirname(__file__) sys.path.append(this_directory) -from autosklearn.ensemble_builder import EnsembleBuilder, Y_ENSEMBLE, Y_VALID, Y_TEST -from autosklearn.metrics import roc_auc - class BackendMock(object): @@ -260,3 +260,68 @@ def testLimit(self): # it should try to reduce ensemble_nbest until it also failed at 2 self.assertEqual(ensbuilder.ensemble_nbest,1) + + +class EnsembleSelectionTest(unittest.TestCase): + def testPredict(self): + # Test that ensemble prediction applies weights correctly to given + # predictions. There are two possible cases: + # 1) predictions.shape[0] == len(self.weights_). In this case, + # predictions include those made by zero-weighted models. Therefore, + # we simply apply each weights to the corresponding model preds. + # 2) predictions.shape[0] < len(self.weights_). In this case, + # predictions exclude those made by zero-weighted models. Therefore, + # we first exclude all occurrences of zero in self.weights_, and then + # apply the weights. + # If none of the above is the case, predict() raises Error. + ensemble = EnsembleSelection(ensemble_size=3, + task_type=1, + metric=accuracy, + ) + # Test for case 1. Create (3, 2, 2) predictions. 
+ per_model_pred = np.array([ + [[0.9, 0.1], + [0.4, 0.6]], + [[0.8, 0.2], + [0.3, 0.7]], + [[1.0, 0.0], + [0.1, 0.9]] + ]) + # Weights of 3 hypothetical models + ensemble.weights_ = [0.7, 0.2, 0.1] + pred = ensemble.predict(per_model_pred) + truth = np.array([[0.89, 0.11], # This should be the true prediction. + [0.35, 0.65]]) + self.assertTrue(np.allclose(pred, truth)) + + # Test for case 2. + per_model_pred = np.array([ + [[0.9, 0.1], + [0.4, 0.6]], + [[0.8, 0.2], + [0.3, 0.7]], + [[1.0, 0.0], + [0.1, 0.9]] + ]) + # The third model now has weight of zero. + ensemble.weights_ = [0.7, 0.2, 0.0, 0.1] + pred = ensemble.predict(per_model_pred) + truth = np.array([[0.89, 0.11], + [0.35, 0.65]]) + self.assertTrue(np.allclose(pred, truth)) + + # Test for error case. + per_model_pred = np.array([ + [[0.9, 0.1], + [0.4, 0.6]], + [[0.8, 0.2], + [0.3, 0.7]], + [[1.0, 0.0], + [0.1, 0.9]] + ]) + # Now the weights have 2 zero weights and 2 non-zero weights, + # which is incompatible. + ensemble.weights_ = [0.6, 0.0, 0.0, 0.4] + + with self.assertRaises(ValueError): + ensemble.predict(per_model_pred) From 7efa57fdde1960b0b05489be9244bddb1237d389 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Fri, 7 Dec 2018 11:28:10 +0100 Subject: [PATCH 11/11] MAINT prepare 0.4.2 release --- autosklearn/__version__.py | 2 +- doc/releases.rst | 27 +++++++++++++++++++++++++++ requirements.txt | 2 +- setup.py | 4 ++-- 4 files changed, 31 insertions(+), 4 deletions(-) diff --git a/autosklearn/__version__.py b/autosklearn/__version__.py index 4bfd2e72b7..93432f12db 100644 --- a/autosklearn/__version__.py +++ b/autosklearn/__version__.py @@ -1,4 +1,4 @@ """Version information.""" # The following line *must* be the last in the module, exactly as formatted: -__version__ = "0.4.1" +__version__ = "0.4.2" diff --git a/doc/releases.rst b/doc/releases.rst index d4d5beea23..2e2ca11cfe 100644 --- a/doc/releases.rst +++ b/doc/releases.rst @@ -11,6 +11,33 @@ Releases ======== +Version 0.4.2 +============= + +* Fixes #538: Remove rounding errors when giving a training set fraction for + holdout. +* Fixes #558: Ensemble script now uses less memory and the memory limit can be + given to Auto-sklearn. +* Fixes #585: Auto-sklearn's ensemble script produced wrong results when + called directly (and not via one of Auto-sklearn's estimator classes). +* Fixes an error in the ensemble script which made it non-deterministic. +* MAINT #569: Rename hyperparameter to have a different name than a + scikit-learn hyperparameter with different meaning. +* MAINT #592: backwards compatible requirements.txt +* MAINT #588: Fix SMAC version to 0.8.0 +* MAINT: remove dependency on the six package +* MAINT: upgrade to XGBoost 0.80 + +Contributors +************ + +* Taneli Mielikäinen +* Matthias Feurer +* Diogo Bastos +* Zeyi Wen +* Teresa Conceição +* Jin Woo Ahn + Version 0.4.1 ============= diff --git a/requirements.txt b/requirements.txt index dd69c4b371..b6586a1fc2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,6 +16,6 @@ liac-arff pandas ConfigSpace>=0.4.0,<0.5 -pynisher>=0.4 +pynisher>=0.4.2 pyrfr>=0.7,<0.8 smac==0.8 diff --git a/setup.py b/setup.py index 223977cf45..bdca3c90af 100644 --- a/setup.py +++ b/setup.py @@ -35,7 +35,7 @@ "nose", "Cython", # Numpy version of higher than 1.14.5 causes libgcc_s.so.1 error. 
- "numpy>=1.9.0<=1.14.5", + "numpy>=1.9.0,<=1.14.5", "scipy>=0.14.1", "scikit-learn>=0.19,<0.20", "lockfile", @@ -45,7 +45,7 @@ "liac-arff", "pandas", "ConfigSpace>=0.4.0,<0.5", - "pynisher>=0.4,<0.5", + "pynisher>=0.4.2", "pyrfr>=0.6.1,<0.8", "smac>=0.8,<0.9", "xgboost>=0.80",