Commit

Add an optimization that should speed things up a lot; see the nK parameter to the complexity constructor (higher is faster)

The same parameter is useful when datasets vary a lot in size: since it controls the grid that K lives on, it can be used to put them on a comparable scale. For example, if the target set is 10 times bigger than the source, we can set nK to 10 for it.

Other minor fixes.

Signed-off-by: Dainis Boumber <[email protected]>
dainis-boumber committed Nov 14, 2017
1 parent 4db42eb commit 14200f2
Showing 3 changed files with 24 additions and 15 deletions.
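
As a concrete illustration of the scaling use case from the commit message, here is a minimal sketch (not part of this commit; the sample sizes, noise, and n_windows values are made up) of how nK might be set when the target set is 10 times bigger than the source:

from sklearn.datasets import make_circles, make_moons

import modules.complexity_estimator as ce

X_src, y_src = make_moons(n_samples=100, noise=0.1)
X_tgt, y_tgt = make_circles(n_samples=1000, noise=0.1)

# Source grid: K = 1, 2, ..., 100; target grid: K = 1, 11, ..., 991.
# Both estimators sweep about 100 values of K, so profiling the 10x
# larger target costs roughly the same as profiling the source.
est_src = ce.ComplexityEstimator(X_src, y_src, n_windows=10, nK=1)
est_tgt = ce.ComplexityEstimator(X_tgt, y_tgt, n_windows=10, nK=10)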
18 changes: 9 additions & 9 deletions complexity.py
@@ -3,8 +3,7 @@
import numpy as np
from sklearn.datasets import make_moons, make_circles
from sklearn.manifold.t_sne import TSNE
-from sklearn.metrics import auc
-from sklearn.naive_bayes import GaussianNB
+from sklearn.tree import DecisionTreeClassifier

import modules.complexity_estimator as ce
import modules.util as u

@@ -38,16 +37,16 @@ def active(classifiers, datasets, experiments, quota=25, plot_every_n=5):

    for dsix, ((X_src, y_src), (X_tgt, y_tgt)) in enumerate(datasets):
        u_tgt = [None] * len(X_tgt)
-        est_src = ce.ComplexityEstimator(X_src, y_src)
+        est_src = ce.ComplexityEstimator(X_src, y_src, n_windows=10, nK=2)
        est_tgt = ce.ComplexityEstimator(X_tgt, y_tgt)
        # declare Dataset instance, X is the feature, y is the label (None if unlabeled)
        X = np.vstack((X_src, X_tgt))
        X_src_plt = TSNE().fit_transform(X_src)
        X_tgt_plt = TSNE().fit_transform(X_tgt)
        X_plt = np.vstack((X_src_plt, X_tgt_plt))
        h = .05 # step size in the mesh
-        x_min, x_max = X_plt[:, 0].min() - .5, X_plt[:, 0].max() + .5
-        y_min, y_max = X_plt[:, 1].min() - .5, X_plt[:, 1].max() + .5
+        x_min, x_max = X_plt[:, 0].min() - h, X_plt[:, 0].max() + h
+        y_min, y_max = X_plt[:, 1].min() - h, X_plt[:, 1].max() + h
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
        figure = plt.figure(figsize=(27, 13))

@@ -62,15 +61,15 @@ def active(classifiers, datasets, experiments, quota=25, plot_every_n=5):
        ax.set_title('Src complexity')
        Ks, Es = est_src.get_k_complexity()
        ax.plot(Ks, Es)
-        ax.set_xlabel('AUC=' + ('%.2f' % auc(Ks, Es, reorder=True)).lstrip('0'))
+        ax.set_xlabel('AUC=' + ('%.2f' % est_src.auc()).lstrip('0'))

        #plt tgt
        plot_ds(grid_size, (0, 3), X_tgt_plt, y_tgt, xx, yy, 'Tgt', est_tgt.seeds)
        ax = plt.subplot2grid(grid_size, (0,4), colspan=2)
        Ks, Es = est_tgt.get_k_complexity()
        ax.set_title('Tgt complexity')
        ax.plot(Ks, Es)
-        ax.set_xlabel('AUC=' + ('%.2f' % auc(Ks, Es, reorder=True)).lstrip('0'))
+        ax.set_xlabel('AUC=' + ('%.2f' % est_tgt.auc()).lstrip('0'))
        w = 0
        X_known = X_src.tolist()
        y_known = y_src.tolist()

@@ -95,18 +94,19 @@ def active(classifiers, datasets, experiments, quota=25, plot_every_n=5):
        figure.tight_layout()
        fname = './vis/' + str(experiments[dsix]) + '.png'
        figure.savefig(filename=fname)
+        plt.tight_layout()
        plt.show()

def main():
    # clfs = [SVC(), LinearSVC(), AdaBoostClassifier(), GaussianNB()]
    datasets = []
    experiments = []
-    clfs = [GaussianNB()]
+    clfs = [DecisionTreeClassifier()]

    # datasets.append(
    #     (u.hastie(500), make_gaussian_quantiles(n_samples=500, n_features=10, n_classes=2)))
    # experiments.append('hastie_10_2_vs_gauss_quant_10_2')
-    datasets.append((make_moons(), make_circles()))
+    datasets.append((make_moons(n_samples=1000), make_circles()))
    experiments.append('moons')

    active(classifiers=clfs, datasets=datasets, experiments=experiments)
2 changes: 1 addition & 1 deletion demo.py
@@ -20,7 +20,7 @@


#This experiment is generic and is best used to demonstrate our approach
-#
+# May be out of date
def demo(datasets, dsnames, classifiers, nwindows):
    h = .05 # step size in the mesh
    figure = plt.figure(figsize=(27, 9))
19 changes: 14 additions & 5 deletions modules/complexity_estimator.py
@@ -1,31 +1,40 @@
import numpy as np
import scipy
+import sklearn.metrics as metr

class ComplexityEstimator:

-    def __init__(self, X, y, n_windows=10):
+    def __init__(self, X, y, n_windows=10, nK=1):
        assert (n_windows > 0)
        assert (len(set(y)) > 1)
        assert (len(X) == len(y))
+        assert (nK < len(y))
        self.X = X
        self.y = y
        self.seeds = np.random.random_integers(0, len(X) - 1, n_windows)
        self.tree = scipy.spatial.cKDTree(X)
        self.labels = set(y)
-        self.Ks = np.arange(1, len(self.X) + 1) # ckdTree starts counting from 1
-
+        self.Ks = np.arange(1, len(self.X) + 1, step=nK) # ckdTree starts counting from 1
        self.Hs = np.zeros(len(self.Ks))
        self.ws = np.ndarray((n_windows, len(self.Ks)))

        for i, k in enumerate(self.Ks):
            for j, seed in enumerate(self.seeds):
                h = self._H(k=k, seed=seed)
-                self.ws[j, k-1] = h
-            self.Hs[i] = np.sum(self.ws[:, k-1]) / len(self.seeds)
+                self.ws[j, i] = h
+            self.Hs[i] = np.sum(self.ws[:, i]) / len(self.seeds)

        for h in self.Hs:
            assert h >= 0.0 and h <= 1.0

    def get_k_complexity(self):
        return self.Ks, self.Hs

+    def auc(self, normalize=True):
+        if normalize:
+            return metr.auc(self.Ks, self.Hs, reorder=True) / len(self.y)
+        return metr.auc(self.Ks, self.Hs, reorder=True)
+
    def get_w_complexity(self):
        return self.ws

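For context, a minimal sketch (not part of the commit; n, nK, and n_windows are made-up values) of why the window matrix is now indexed with the loop counter i rather than k-1: once Ks is strided, column k-1 no longer corresponds to the i-th sampled K, and for nK > 1 it runs past the end of ws. With nK == 1 the two indexings coincide, which is why this only surfaced with the new stride.

import numpy as np

n, nK, n_windows = 10, 3, 2
Ks = np.arange(1, n + 1, step=nK)      # array([ 1,  4,  7, 10])
ws = np.ndarray((n_windows, len(Ks)))  # 4 columns, one per sampled K

for i, k in enumerate(Ks):
    # old indexing ws[:, k - 1] targets columns 0, 3, 6, 9 and raises
    # IndexError at k = 7; new indexing ws[:, i] targets columns 0..3
    ws[:, i] = 0.0

Likewise, the new auc(normalize=True) helper divides the area under the (K, H) curve by len(self.y): since H is bounded by 1 and K runs up to len(y), this keeps the reported AUC roughly in [0, 1] and comparable across datasets of different sizes.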
