Commit

Add an optimization that should speed things up a lot; see the nK parameter to the complexity constructor (higher is faster)

The same parameter is useful when datasets vary a lot in size: since it controls the grid that K lives on, it can be used to put them on a comparable scale. For example, if the target set is 10 times bigger than the source, we can set nK to 10 for it.

Other minor fixes.

Signed-off-by: Dainis Boumber <[email protected]>
dainis-boumber committed Nov 14, 2017
1 parent 4db42eb commit 14200f2
Showing 3 changed files with 24 additions and 15 deletions.
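
As a concrete illustration of the scaling use case from the commit message, here is a minimal sketch (not part of this commit; the sample sizes, noise, and n_windows values are made up) of how nK might be set when the target set is 10 times bigger than the source:

from sklearn.datasets import make_circles, make_moons

import modules.complexity_estimator as ce

X_src, y_src = make_moons(n_samples=100, noise=0.1)
X_tgt, y_tgt = make_circles(n_samples=1000, noise=0.1)

# Source grid: K = 1, 2, ..., 100; target grid: K = 1, 11, ..., 991.
# Both estimators sweep about 100 values of K, so profiling the 10x
# larger target costs roughly the same as profiling the source.
est_src = ce.ComplexityEstimator(X_src, y_src, n_windows=10, nK=1)
est_tgt = ce.ComplexityEstimator(X_tgt, y_tgt, n_windows=10, nK=10)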
18 changes: 9 additions & 9 deletions complexity.py
@@ -3,8 +3,7 @@
import numpy as np
from sklearn.datasets import make_moons, make_circles
from sklearn.manifold.t_sne import TSNE
-from sklearn.metrics import auc
-from sklearn.naive_bayes import GaussianNB
+from sklearn.tree import DecisionTreeClassifier

import modules.complexity_estimator as ce
import modules.util as u

@@ -38,16 +37,16 @@ def active(classifiers, datasets, experiments, quota=25, plot_every_n=5):

    for dsix, ((X_src, y_src), (X_tgt, y_tgt)) in enumerate(datasets):
        u_tgt = [None] * len(X_tgt)
-        est_src = ce.ComplexityEstimator(X_src, y_src)
+        est_src = ce.ComplexityEstimator(X_src, y_src, n_windows=10, nK=2)
        est_tgt = ce.ComplexityEstimator(X_tgt, y_tgt)
        # declare Dataset instance, X is the feature, y is the label (None if unlabeled)
        X = np.vstack((X_src, X_tgt))
        X_src_plt = TSNE().fit_transform(X_src)
        X_tgt_plt = TSNE().fit_transform(X_tgt)
        X_plt = np.vstack((X_src_plt, X_tgt_plt))
        h = .05 # step size in the mesh
-        x_min, x_max = X_plt[:, 0].min() - .5, X_plt[:, 0].max() + .5
-        y_min, y_max = X_plt[:, 1].min() - .5, X_plt[:, 1].max() + .5
+        x_min, x_max = X_plt[:, 0].min() - h, X_plt[:, 0].max() + h
+        y_min, y_max = X_plt[:, 1].min() - h, X_plt[:, 1].max() + h
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
        figure = plt.figure(figsize=(27, 13))

@@ -62,15 +61,15 @@ def active(classifiers, datasets, experiments, quota=25, plot_every_n=5):
        ax.set_title('Src complexity')
        Ks, Es = est_src.get_k_complexity()
        ax.plot(Ks, Es)
-        ax.set_xlabel('AUC=' + ('%.2f' % auc(Ks, Es, reorder=True)).lstrip('0'))
+        ax.set_xlabel('AUC=' + ('%.2f' % est_src.auc()).lstrip('0'))

        #plt tgt
        plot_ds(grid_size, (0, 3), X_tgt_plt, y_tgt, xx, yy, 'Tgt', est_tgt.seeds)
        ax = plt.subplot2grid(grid_size, (0,4), colspan=2)
        Ks, Es = est_tgt.get_k_complexity()
        ax.set_title('Tgt complexity')
        ax.plot(Ks, Es)
-        ax.set_xlabel('AUC=' + ('%.2f' % auc(Ks, Es, reorder=True)).lstrip('0'))
+        ax.set_xlabel('AUC=' + ('%.2f' % est_tgt.auc()).lstrip('0'))
        w = 0
        X_known = X_src.tolist()
        y_known = y_src.tolist()

@@ -95,18 +94,19 @@ def active(classifiers, datasets, experiments, quota=25, plot_every_n=5):
        figure.tight_layout()
        fname = './vis/' + str(experiments[dsix]) + '.png'
        figure.savefig(filename=fname)
+        plt.tight_layout()
        plt.show()

def main():
    # clfs = [SVC(), LinearSVC(), AdaBoostClassifier(), GaussianNB()]
    datasets = []
    experiments = []
-    clfs = [GaussianNB()]
+    clfs = [DecisionTreeClassifier()]

    # datasets.append(
    #     (u.hastie(500), make_gaussian_quantiles(n_samples=500, n_features=10, n_classes=2)))
    # experiments.append('hastie_10_2_vs_gauss_quant_10_2')
-    datasets.append((make_moons(), make_circles()))
+    datasets.append((make_moons(n_samples=1000), make_circles()))
    experiments.append('moons')

    active(classifiers=clfs, datasets=datasets, experiments=experiments)
2 changes: 1 addition & 1 deletion demo.py
@@ -20,7 +20,7 @@


#This experiment is generic and is best used to demonstrate our approach
-#
+# May be out of date
def demo(datasets, dsnames, classifiers, nwindows):
    h = .05 # step size in the mesh
    figure = plt.figure(figsize=(27, 9))
19 changes: 14 additions & 5 deletions modules/complexity_estimator.py
@@ -1,31 +1,40 @@
import numpy as np
import scipy
+import sklearn.metrics as metr

class ComplexityEstimator:

-    def __init__(self, X, y, n_windows=10):
+    def __init__(self, X, y, n_windows=10, nK=1):
        assert (n_windows > 0)
        assert (len(set(y)) > 1)
        assert (len(X) == len(y))
+        assert (nK < len(y))
        self.X = X
        self.y = y
        self.seeds = np.random.random_integers(0, len(X) - 1, n_windows)
        self.tree = scipy.spatial.cKDTree(X)
        self.labels = set(y)
-        self.Ks = np.arange(1, len(self.X) + 1) # ckdTree starts counting from 1
-
+        self.Ks = np.arange(1, len(self.X) + 1, step=nK) # ckdTree starts counting from 1
        self.Hs = np.zeros(len(self.Ks))
        self.ws = np.ndarray((n_windows, len(self.Ks)))

        for i, k in enumerate(self.Ks):
            for j, seed in enumerate(self.seeds):
                h = self._H(k=k, seed=seed)
-                self.ws[j, k-1] = h
-            self.Hs[i] = np.sum(self.ws[:, k-1]) / len(self.seeds)
+                self.ws[j, i] = h
+            self.Hs[i] = np.sum(self.ws[:, i]) / len(self.seeds)

        for h in self.Hs:
            assert h >= 0.0 and h <= 1.0

    def get_k_complexity(self):
        return self.Ks, self.Hs

+    def auc(self, normalize=True):
+        if normalize:
+            return metr.auc(self.Ks, self.Hs, reorder=True) / len(self.y)
+        return metr.auc(self.Ks, self.Hs, reorder=True)
+
    def get_w_complexity(self):
        return self.ws

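For context, a minimal sketch (not part of the commit; n, nK, and n_windows are made-up values) of why the window matrix is now indexed with the loop counter i rather than k-1: once Ks is strided, column k-1 no longer corresponds to the i-th sampled K, and for nK > 1 it runs past the end of ws. With nK == 1 the two indexings coincide, which is why this only surfaced with the new stride.

import numpy as np

n, nK, n_windows = 10, 3, 2
Ks = np.arange(1, n + 1, step=nK)      # array([ 1,  4,  7, 10])
ws = np.ndarray((n_windows, len(Ks)))  # 4 columns, one per sampled K

for i, k in enumerate(Ks):
    # old indexing ws[:, k - 1] targets columns 0, 3, 6, 9 and raises
    # IndexError at k = 7; new indexing ws[:, i] targets columns 0..3
    ws[:, i] = 0.0

Likewise, the new auc(normalize=True) helper divides the area under the (K, H) curve by len(self.y): since H is bounded by 1 and K runs up to len(y), this keeps the reported AUC roughly in [0, 1] and comparable across datasets of different sizes.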
