From 14200f24b233f3ccb9fb78d58d186f28810a9e31 Mon Sep 17 00:00:00 2001
From: Dainis Boumber
Date: Tue, 14 Nov 2017 06:34:47 -0600
Subject: [PATCH] Add an nK parameter to the complexity constructor that
 should speed things up a lot (larger is faster)

The same parameter is useful when datasets vary greatly in size: because it
controls the space K lives in, it can be used to scale them. For example, if
the target set is 10 times bigger than the source, we can set nK to 10 for
it.

Other minor fixes.

Signed-off-by: Dainis Boumber
---
 complexity.py                   | 18 +++++++++---------
 demo.py                         |  2 +-
 modules/complexity_estimator.py | 19 ++++++++++++++-----
 3 files changed, 24 insertions(+), 15 deletions(-)

diff --git a/complexity.py b/complexity.py
index 76a8c00..0a82dbf 100644
--- a/complexity.py
+++ b/complexity.py
@@ -3,8 +3,7 @@
 import numpy as np
 from sklearn.datasets import make_moons, make_circles
 from sklearn.manifold.t_sne import TSNE
-from sklearn.metrics import auc
-from sklearn.naive_bayes import GaussianNB
+from sklearn.tree import DecisionTreeClassifier
 
 import modules.complexity_estimator as ce
 import modules.util as u
@@ -38,7 +37,7 @@
 def active(classifiers, datasets, experiments, quota=25, plot_every_n=5):
     for dsix, ((X_src, y_src), (X_tgt, y_tgt)) in enumerate(datasets):
         u_tgt = [None] * len(X_tgt)
-        est_src = ce.ComplexityEstimator(X_src, y_src)
+        est_src = ce.ComplexityEstimator(X_src, y_src, n_windows=10, nK=2)
         est_tgt = ce.ComplexityEstimator(X_tgt, y_tgt)
         # declare Dataset instance, X is the feature, y is the label (None if unlabeled)
         X = np.vstack((X_src, X_tgt))
@@ -46,8 +45,8 @@
         X_tgt_plt = TSNE().fit_transform(X_tgt)
         X_plt = np.vstack((X_src_plt, X_tgt_plt))
         h = .05 # step size in the mesh
-        x_min, x_max = X_plt[:, 0].min() - .5, X_plt[:, 0].max() + .5
-        y_min, y_max = X_plt[:, 1].min() - .5, X_plt[:, 1].max() + .5
+        x_min, x_max = X_plt[:, 0].min() - h, X_plt[:, 0].max() + h
+        y_min, y_max = X_plt[:, 1].min() - h, X_plt[:, 1].max() + h
         xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
 
         figure = plt.figure(figsize=(27, 13))
@@ -62,7 +61,7 @@
         ax.set_title('Src complexity')
         Ks, Es = est_src.get_k_complexity()
         ax.plot(Ks, Es)
-        ax.set_xlabel('AUC=' + ('%.2f' % auc(Ks, Es, reorder=True)).lstrip('0'))
+        ax.set_xlabel('AUC=' + ('%.2f' % est_src.auc()).lstrip('0'))
 
         #plt tgt
         plot_ds(grid_size, (0, 3), X_tgt_plt, y_tgt, xx, yy, 'Tgt', est_tgt.seeds)
@@ -70,7 +69,7 @@
         Ks, Es = est_tgt.get_k_complexity()
         ax.set_title('Tgt complexity')
         ax.plot(Ks, Es)
-        ax.set_xlabel('AUC=' + ('%.2f' % auc(Ks, Es, reorder=True)).lstrip('0'))
+        ax.set_xlabel('AUC=' + ('%.2f' % est_tgt.auc()).lstrip('0'))
         w = 0
         X_known = X_src.tolist()
         y_known = y_src.tolist()
@@ -95,18 +94,19 @@
             figure.tight_layout()
     fname = './vis/' + str(experiments[dsix]) + '.png'
     figure.savefig(filename=fname)
+    plt.tight_layout()
     plt.show()

def main():
    # clfs = [SVC(), LinearSVC(), AdaBoostClassifier(), GaussianNB()]
    datasets = []
    experiments = []
-    clfs = [GaussianNB()]
+    clfs = [DecisionTreeClassifier()]
 
    # datasets.append(
    #     (u.hastie(500), make_gaussian_quantiles(n_samples=500, n_features=10, n_classes=2)))
    # experiments.append('hastie_10_2_vs_gauss_quant_10_2')
-    datasets.append((make_moons(), make_circles()))
+    datasets.append((make_moons(n_samples=1000), make_circles()))
     experiments.append('moons')
 
     active(classifiers=clfs, datasets=datasets, experiments=experiments)
diff --git a/demo.py b/demo.py
index 0adcd09..6c9d385 100644
--- a/demo.py
+++ b/demo.py
@@ -20,7 +20,7 @@
 
 
 #This experiment is generic and is best used to demonstrate our approach
-#
+# May be out of date
 def demo(datasets, dsnames, classifiers, nwindows):
     h = .05 # step size in the mesh
     figure = plt.figure(figsize=(27, 9))
diff --git a/modules/complexity_estimator.py b/modules/complexity_estimator.py
index abf9187..51d87d0 100644
--- a/modules/complexity_estimator.py
+++ b/modules/complexity_estimator.py
@@ -1,24 +1,28 @@
 import numpy as np
 import scipy
+import sklearn.metrics as metr
 
 
 class ComplexityEstimator:
-
-    def __init__(self, X, y, n_windows=10):
+    def __init__(self, X, y, n_windows=10, nK=1):
         assert (n_windows > 0)
+        assert (len(set(y)) > 1)
+        assert (len(X) == len(y))
+        assert (nK < len(y))
         self.X = X
         self.y = y
         self.seeds = np.random.random_integers(0, len(X) - 1, n_windows)
         self.tree = scipy.spatial.cKDTree(X)
         self.labels = set(y)
-        self.Ks = np.arange(1, len(self.X) + 1) # ckdTree starts counting from 1
+
+        self.Ks = np.arange(1, len(self.X) + 1, step=nK) # ckdTree starts counting from 1
         self.Hs = np.zeros(len(self.Ks))
         self.ws = np.ndarray((n_windows, len(self.Ks)))
         for i, k in enumerate(self.Ks):
             for j, seed in enumerate(self.seeds):
                 h = self._H(k=k, seed=seed)
-                self.ws[j, k-1] = h
-            self.Hs[i] = np.sum(self.ws[:, k-1]) / len(self.seeds)
+                self.ws[j, i] = h
+            self.Hs[i] = np.sum(self.ws[:, i]) / len(self.seeds)
 
         for h in self.Hs:
             assert h >= 0.0 and h <= 1.0
@@ -26,6 +30,11 @@ def __init__(self, X, y, n_windows=10):
 
     def get_k_complexity(self):
         return self.Ks, self.Hs
 
+    def auc(self, normalize=True):
+        if normalize:
+            return metr.auc(self.Ks, self.Hs, reorder=True) / len(self.y)
+        return metr.auc(self.Ks, self.Hs, reorder=True)
+
     def get_w_complexity(self):
         return self.ws
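
For reviewers, a minimal usage sketch of the new nK parameter (not part of
the patch): it assumes the patched modules.complexity_estimator is on the
path, and the make_moons toy data and variable names below are illustrative
only.

    # Sketch only: exercises the nK stride added by this patch.
    from sklearn.datasets import make_moons

    import modules.complexity_estimator as ce

    X, y = make_moons(n_samples=1000, noise=0.1)

    # nK=1 (the default) evaluates the windowed entropy at every K in
    # 1..len(X); nK=10 strides over K in steps of 10, so only ~1/10 of
    # the K values are evaluated. The same stride keeps the K-grid
    # comparable when one dataset is ~10x the size of the other.
    est = ce.ComplexityEstimator(X, y, n_windows=10, nK=10)

    Ks, Hs = est.get_k_complexity()
    print('evaluated %d of %d possible K values' % (len(Ks), len(X)))
    print('AUC (normalized by dataset size): %.3f' % est.auc())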