Skip to content

Commit

Permalink
Added Uncertainty sampling strategy
Browse files Browse the repository at this point in the history
  • Loading branch information
Mikhail Mekhedkin Meskhi committed Jun 20, 2018
1 parent 8c9ab11 commit b7a4a37
Show file tree
Hide file tree
Showing 6 changed files with 193 additions and 92 deletions.
164 changes: 105 additions & 59 deletions complexity.py
Original file line number Diff line number Diff line change
@@ -1,46 +1,61 @@
# Necessities
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import numpy as np
# Sklearn imports (models, synthetic data, etc...)
from sklearn.datasets import make_moons
from sklearn.datasets import make_gaussian_quantiles
from sklearn.manifold.t_sne import TSNE
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

import modules.complexity_estimator as ce
# Active Learning and Complexity Modules
import modules.util as u
from modules.oracle import Oracle
import modules.complexity_estimator as ce
from nd_boundary_plot.plots import nd_boundary_plot

# Data pre-processing and import
from modules import mnist

################################################################################################33
#scatter plot of a dataset helper
#
def plot_ds(grid_size, loc, X, y, xx, yy, title, seeds=None, colspan=1, rowspan=1):
####################################################

ax = plt.subplot2grid(grid_size, loc, rowspan=rowspan, colspan=colspan)
'''
Scatter plot for the dataset
'''
def plot_ds(grid_size, loc, X, y, xx, yy, title, seeds=None, colspan=1, rowspan=1):
ax = plt.subplot2grid(grid_size, loc, rowspan=rowspan, colspan=colspan)
ax.set_title(title)
# Plot also the training points

# Plot the training points
ax.scatter(X[:, 0],X[:, 1], c=y)
# and seeds

# Plot the seeds
if seeds is not None:
ax.scatter(X[seeds, 0], X[seeds, 1],
alpha=1.0, facecolors='magenta')
ax.scatter(X[seeds, 0], X[seeds, 1], alpha=1.0, facecolors='magenta')

ax.set_xlim(xx.min(), xx.max())
ax.set_ylim(yy.min(), yy.max())
ax.set_xticks(())
ax.set_yticks(())

#perform active learning
#
def active(classifiers, datasets, experiments, quota=25, plot_every_n=5):
# USE THIS INSTEAD OF YTGT WHICH WE PRETEND TO NOT KNOW

for dsix, ((X_src, y_src), (X_tgt, y_tgt)) in enumerate(datasets):
'''
Perform Active Learning
QueryStrategy (Random Sampling or Uncertainty Sampling)
'''
def active(classifiers, datasets, experiments, qs, quota=25, plot_every_n=5):
for dataset_index, ((X_src, y_src), (X_tgt, y_tgt)) in enumerate(datasets):
u_tgt = [None] * len(X_tgt)
est_src = ce.ComplexityEstimator(X_src, y_src, n_windows=10, nK=1)
est_tgt = ce.ComplexityEstimator(X_tgt, y_tgt, n_windows=10, nK=1)
# declare Dataset instance, X is the feature, y is the label (None if unlabeled)

# Declare Dataset instance, X is the feature, y is the label (None if unlabeled)
X = np.vstack((X_src, X_tgt))

if X.shape[1] > 2:
X_src_plt = TSNE().fit_transform(X_src)
X_tgt_plt = TSNE().fit_transform(X_tgt)
Expand All @@ -52,82 +67,113 @@ def active(classifiers, datasets, experiments, quota=25, plot_every_n=5):
else:
raise AttributeError

h = .05 # step size in the mesh
h = .05 # Step size in the mesh
x_min, x_max = X_plt[:, 0].min() - h, X_plt[:, 0].max() + h
y_min, y_max = X_plt[:, 1].min() - h, X_plt[:, 1].max() + h
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
figure = plt.figure(figsize=(27, 13))

grid_size = (1+len(classifiers), 6)
for n, classifier in enumerate(classifiers):

for classifier_index, classifier in enumerate(classifiers):
model = classifier
oracle = Oracle(X_tgt, y_tgt)
# plot src
plot_ds(grid_size, (0, 0), X_src_plt, y_src, xx, yy, 'Src', est_src.seeds)

# Plot source dataset
plot_ds(grid_size, (0, 0), X_src_plt, y_src, xx, yy, 'Source', est_src.seeds)
ax = plt.subplot2grid(grid_size, (0,1), colspan=2)
ax.set_title('Src complexity')
ax.set_title('Source complexity')
Ks, Es = est_src.get_k_complexity()
ax.plot(Ks, Es)
ax.set_xlabel('AUC=' + ('%.2f' % est_src.auc()).lstrip('0'))

#plt tgt
plot_ds(grid_size, (0, 3), X_tgt_plt, y_tgt, xx, yy, 'Tgt', est_tgt.seeds)
# Plot target dataset
plot_ds(grid_size, (0, 3), X_tgt_plt, y_tgt, xx, yy, 'Target', est_tgt.seeds)
ax = plt.subplot2grid(grid_size, (0,4), colspan=2)
Ks, Es = est_tgt.get_k_complexity()
ax.set_title('Tgt complexity')
ax.set_title('Target complexity')
ax.plot(Ks, Es)
ax.set_xlabel('AUC=' + ('%.2f' % est_tgt.auc()).lstrip('0'))
w = 0
X_known = X_src.tolist()
y_known = y_src.tolist()
for i in range(quota): # loop through the number of queries
loc, y_loc = oracle.random_query() # let the specified QueryStrategy suggest a data to query
u_tgt[loc] = y_loc
X_known.append(X_tgt[loc])
y_known.append(y_tgt[loc])
if i == 0 or i % plot_every_n == 0 or i == quota - 1:
model.fit(X_known, y_known) # train model with newly-updated Dataset
score = model.score(X_tgt, y_tgt)
y_predicted = model.predict(X_tgt)
ax = plt.subplot2grid(grid_size, (n + 1, w))
nd_boundary_plot(X_tgt, y_predicted, model, ax)
if i == 0:
ax.set_ylabel(u.classname(model))
if n == 0:
ax.set_title('# queries=' + str(i))
ax.set_xlabel('Accuracy='+('%.2f' % score).lstrip('0'))
w += 1

figure.suptitle(experiments[dsix])

for i in range(quota): # Loop through the number of queries

if qs == 1 :
qs_name = 'RandomSampling'
loc, y_loc = oracle.random_query() # Sample target using RandomSampling strategy
u_tgt[loc] = y_loc
X_known.append(X_tgt[loc])
y_known.append(y_tgt[loc])

if i == 0 or i % plot_every_n == 0 or i == quota - 1:
model.fit(X_known, y_known) # Train model with newly-updated dataset
score = model.score(X_tgt, y_tgt)
y_predicted = model.predict(X_tgt)
ax = plt.subplot2grid(grid_size, (classifier_index + 1, w))
nd_boundary_plot(X_tgt, y_predicted, model, ax)

if i == 0:
ax.set_ylabel(u.classname(model))

if classifier_index == 0:
ax.set_title('# Queries=' + str(i))

ax.set_xlabel('Accuracy='+('%.2f' % score).lstrip('0'))
w += 1

elif qs == 2:
qs_name = 'UncertaintySampling'
model.fit(X_known, y_known) # Fit model on source only to predict probabilities
loc, X_chosen = oracle.uncertainty_sampling(model) # Sample target using UncertaintySampling strategy
X_known.append(X_tgt[loc])
y_known.append(y_tgt[loc])

if i == 0 or i % plot_every_n == 0 or i == quota - 1:
model.fit(X_known, y_known) # Train model with newly-updated dataset
score = model.score(X_tgt, y_tgt)
y_predicted = model.predict(X_tgt)
ax = plt.subplot2grid(grid_size, (classifier_index + 1, w))
nd_boundary_plot(X_tgt, y_predicted, model, ax)

if i == 0:
ax.set_ylabel(u.classname(model))

if classifier_index == 0:
ax.set_title('# Queries=' + str(i))

ax.set_xlabel('Accuracy='+('%.2f' % score).lstrip('0'))
w += 1

figure.suptitle(experiments[dataset_index] + qs_name )
figure.tight_layout()
fname = './vis/' + str(experiments[dsix]) + '.png'
figure.savefig(filename=fname)
fname = './vis/' + str(experiments[dataset_index] + qs_name) + '.png'
figure.savefig(fname)

plt.tight_layout()
plt.show()


def main():
# clfs = [SVC(), LinearSVC(), AdaBoostClassifier(), GaussianNB()]
clfs = [SVC(), GaussianNB(), DecisionTreeClassifier(), MLPClassifier(hidden_layer_sizes=(10,10,10,10,10,10), solver='lbfgs', alpha=2, random_state=1, activation='relu')]
datasets = []
experiments = []
clfs = [DecisionTreeClassifier()]

# datasets.append(
# (, make_gaussian_quantiles(n_samples=500, n_features=10, n_classes=2)))
# datasets.append((make_gaussian_quantiles(n_samples=500, n_features=10, n_classes=2),
# make_gaussian_quantiles(n_samples=500, n_features=10, n_classes=2)))
# experiments.append('hastie_10_2_vs_gauss_quant_10_2')
# datasets.append((make_moons(n_samples=1000), make_moons(n_samples=1000)))

# experiments.append('moons')
# datasets.append((u.hastie(1000), u.hastie(1000)))

datasets.append((make_gaussian_quantiles(n_samples=500, n_features=10, n_classes=3),
make_gaussian_quantiles(n_samples=500, n_features=10, n_classes=3)))
experiments.append('gauus')
# datasets.append((mnist.load_mnist(), mnist.load_mnist_rotated()))
# experiments.append('MNIST_vs_MNIST_Rotated')
# datasets.append((make_gaussian_quantiles(n_samples=2000, n_features=10, n_classes=3),
# make_gaussian_quantiles(n_samples=2000, n_features=10, n_classes=3)))
# experiments.append('gauus')

datasets.append((mnist.load_mnist(), mnist.load_mnist_rotated()))
experiments.append('MNIST_vs_MNIST_Rotated')

active(classifiers=clfs, datasets=datasets, experiments=experiments)
active(classifiers=clfs, datasets=datasets, experiments=experiments, qs=1)

if __name__ == "__main__":
main()
3 changes: 3 additions & 0 deletions data/MNIST Data.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# MNIST Data

[Download Here](https://drive.google.com/open?id=12E----VtVc03jqRQc8QaFuAbi1SMtI7i)
32 changes: 1 addition & 31 deletions demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,6 @@ def demo(datasets, dsnames, classifiers, nwindows):
f1 = figure.number
figure2 = plt.figure(figsize=(27, 9))
f2 = figure2.number
figure3 = plt.figure(figsize=(27, 9))
f3 = figure3.number

i = 1
j = 1
Expand Down Expand Up @@ -112,32 +110,6 @@ def demo(datasets, dsnames, classifiers, nwindows):
ax.set_title('Avg. Complexity')
ax.plot(Ks, Es)
j+=1


# plot data and
figure3, a = plt.subplots(nrows=len(datasets), ncols=2,figsize=(27,9))
a = a.ravel()

for idx,ax in enumerate(a):
if idx % 2 == 0:
ax.set_title(dsnames[ds_cnt])
# Plot also the training points
ax.scatter(X[:, 0], X[:, 1], c=y)
# and seeds
ax.scatter(X[estimator.seeds, 0], X[estimator.seeds, 1],
alpha=1.0, facecolors='black')
ax.set_xlim(xx.min(), xx.max())
ax.set_ylim(yy.min(), yy.max())
ax.set_xticks(())
ax.set_yticks(())
else:
ax.hist(Es, 10)
ax.set_xlabel('E')
ax.set_ylabel('frequency')
ax.set_title('Hist. of Entropy')
figure3.tight_layout()
figure3.savefig(filename=('./vis/' + dsnames[ds_cnt] + 'Histograms.png'))

'''
ws = estimator.get_w_complexity()
for wi, w in enumerate(ws):
Expand All @@ -149,18 +121,16 @@ def demo(datasets, dsnames, classifiers, nwindows):

figure.tight_layout()
figure2.tight_layout()


figure.savefig(filename=('./vis/'+ ''.join(dsnames)+'Classifications.png'))
figure2.savefig(filename=('./vis/'+''.join(dsnames) + 'Complexities.png'))

plt.show()

def main():
classifiers = [
LinearDiscriminantAnalysis(),
QuadraticDiscriminantAnalysis(),
KNeighborsClassifier(3),
MLPClassifier(alpha=1),
SVC(gamma=2, C=1),
LinearSVC(),
GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
Expand Down
59 changes: 59 additions & 0 deletions modules/mnist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import numpy as np
import pandas as pd
from scipy import ndimage
import matplotlib.pyplot as plt
from sklearn.utils import shuffle


# Import data and preprocess.
# NOTE: this runs at import time and reads ./data/mnist.csv relative to the
# current working directory; the CSV must contain a 'label' column.
mnist = pd.read_csv('./data/mnist.csv')
# `DataFrame.as_matrix` was deprecated in pandas 0.23 and removed in 1.0;
# `.values` yields the same arrays on old and new pandas alike.
labels = mnist[['label']].values  # 2-D column of labels, shape (n_samples, 1)
# Copy so the in-place binarization below never writes through to (or is
# blocked by) the DataFrame's own buffer.
dataset = mnist.drop('label', axis=1).values.copy()
dataset[dataset > 0] = 1  # Binarize: every non-zero pixel becomes 1, zero pixels stay 0


def load_mnist():
    """Return the binarized MNIST samples as flat feature rows, shuffled.

    Reads the module-level ``dataset`` (one row of pixel values per image)
    and ``labels`` arrays prepared at import time.

    Returns:
        tuple: ``(X, y)`` where ``X`` has one 784-value row per image and
        ``y`` is the flattened label array, both shuffled together with a
        fixed ``random_state`` of 5 so the order is reproducible.

    Raises:
        ValueError: if the rows of ``dataset`` do not hold 28 * 28 values.
    """
    # Take the sample count from the data itself: the previous hard-coded
    # 42000 made the reshape/shuffle fail for any other CSV size.
    n_samples = len(dataset)
    # The explicit 28 * 28 keeps the sanity check that every row is a full
    # 28x28 image (the original reshaped each row to [28, 28] and back).
    X = np.asarray(dataset).reshape(n_samples, 28 * 28)
    y = labels.flatten()

    print("Completed with X shape: ", X.shape)
    print("Flattened y shape: ", y.shape)

    X_shuffled, y_shuffled = shuffle(X, y, random_state=5)
    return X_shuffled, y_shuffled


def load_mnist_rotated():
    """Return the binarized MNIST samples rotated by 90 degrees, shuffled.

    Each row of the module-level ``dataset`` is viewed as a 28x28 image,
    rotated with ``scipy.ndimage.rotate`` and flattened back to one row.

    Returns:
        tuple: ``(X, y)`` where ``X`` holds one flattened rotated image per
        row and ``y`` is the flattened module-level ``labels`` array, both
        shuffled together with a fixed ``random_state`` of 15.

    Raises:
        ValueError: if the rows of ``dataset`` do not hold 28 * 28 values.
    """
    # Take the sample count from the data itself: the previous hard-coded
    # 42000 made the reshape/shuffle fail for any other CSV size.
    n_samples = len(dataset)
    rotated_images = [
        ndimage.rotate(np.reshape(image, [28, 28]), 90)  # Rotate the image by 90 degrees
        for image in dataset
    ]
    # A 90-degree turn of a square image keeps its 28x28 shape, so every
    # rotated image flattens back to 784 values.
    X = np.array(rotated_images).reshape(n_samples, 28 * 28)
    y = labels.flatten()

    print("Completed with X shape: ", X.shape)
    print("Flattened y shape: ", y.shape)

    X_shuffled, y_shuffled = shuffle(X, y, random_state=15)
    return X_shuffled, y_shuffled
25 changes: 24 additions & 1 deletion modules/oracle.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,27 @@ def random_query(self):
if loc not in self.queried:
self.queried.append(loc)
break
return loc, self.y[loc]
return loc, self.y[loc]

def classifier_uncertainty(self, classifier):
    """Score how unsure ``classifier`` is about every sample in ``self.X``.

    Uses ``classifier.predict_proba`` when available and falls back to
    ``classifier.decision_function`` when accessing or calling
    ``predict_proba`` raises ``AttributeError``.

    Args:
        classifier: a fitted estimator exposing ``predict_proba`` or
            ``decision_function``.

    Returns:
        numpy.ndarray: one value per sample; larger means less certain.
        For 2-D scores this is ``1 - max`` over the class axis; for 1-D
        scores it is ``1 - abs(score)``.
    """
    try:
        scores = classifier.predict_proba(self.X)
    except AttributeError:
        scores = classifier.decision_function(self.X)

    scores = np.asarray(scores)
    if scores.ndim == 1:
        # Binary decision_function output is a single signed margin per
        # sample (no class axis), so np.max(..., axis=1) would fail here.
        # Samples with a margin close to zero are the least certain.
        return 1 - np.abs(scores)

    return 1 - np.max(scores, axis=1)

def uncertainty_sampling(self, classifier, n_instances=1):
    """Pick the sample(s) in ``self.X`` the classifier is least sure about.

    Samples already recorded in ``self.queried`` are excluded, and the new
    picks are appended to it, mirroring what ``random_query`` does.

    Args:
        classifier: fitted estimator handed to ``classifier_uncertainty``.
        n_instances: how many samples to select.

    Returns:
        tuple: ``(index, rows)``. With a single pick ``index`` is a plain
        ``int`` and ``rows`` has shape ``(1, n_features)``; with several
        picks ``index`` is an integer array and ``rows`` holds one row per
        pick.
    """
    # Work on a float copy so masking never touches the caller's array.
    uncertainty = np.array(self.classifier_uncertainty(classifier), dtype=float)
    if self.queried:
        # Push already-labelled samples to the bottom of the ranking so the
        # same point is not queried over and over.
        uncertainty[list(self.queried)] = -np.inf

    picked = self.multi_argmax(uncertainty, n_instances=n_instances)
    query_idx = np.asarray(picked).ravel()
    self.queried.extend(int(i) for i in query_idx)

    chosen = self.X[query_idx]
    if query_idx.size == 1:
        # int() on the array itself only works for one element (and is
        # deprecated by NumPy), so unwrap the scalar explicitly.
        return int(query_idx[0]), chosen
    return query_idx, chosen

def multi_argmax(self, values, n_instances=1):
    """Return the indices of the ``n_instances`` largest entries of ``values``.

    Args:
        values: array of scores; it is negated, so it must support unary
            minus.
        n_instances: number of indices to return; must not exceed
            ``len(values)``.

    Returns:
        numpy.ndarray: ``n_instances`` indices into ``values``.
    """
    assert n_instances <= len(values), 'n_instances must be less or equal than the size of utility'
    # Partitioning the negated scores moves the largest originals to the front.
    partitioned = np.argpartition(-values, n_instances - 1, axis=0)
    return partitioned[:n_instances]

2 changes: 1 addition & 1 deletion nd_boundary_plot
Submodule nd_boundary_plot updated 1 files
+1 −1 LICENSE

0 comments on commit b7a4a37

Please sign in to comment.