Added Uncertainty sampling strategy

dainis-boumber · Jun 20, 2018 · b7a4a37 · b7a4a37
1 parent 8c9ab11
commit b7a4a37
Show file tree

Hide file tree

Showing 6 changed files with 193 additions and 92 deletions.
diff --git a/complexity.py b/complexity.py
@@ -1,46 +1,61 @@
+# Necessities
+import numpy as np
+import pandas as pd
 import matplotlib.pyplot as plt
 
-import numpy as np
+# Sklearn imports (models, synthetic data, etc...)
+from sklearn.datasets import make_moons
 from sklearn.datasets import make_gaussian_quantiles
 from sklearn.manifold.t_sne import TSNE
+from sklearn.svm import SVC
+from sklearn.naive_bayes import GaussianNB
 from sklearn.tree import DecisionTreeClassifier
+from sklearn.neural_network import MLPClassifier
 
-import modules.complexity_estimator as ce
+# Active Learning and Complexity Modules
 import modules.util as u
 from modules.oracle import Oracle
+import modules.complexity_estimator as ce
 from nd_boundary_plot.plots import nd_boundary_plot
 
+# Data pre-processing and import
+from modules import mnist
 
-################################################################################################33
-#scatter plot of a dataset helper
-#
-def plot_ds(grid_size, loc, X, y, xx, yy, title, seeds=None, colspan=1, rowspan=1):
+####################################################
 
-    ax = plt.subplot2grid(grid_size, loc, rowspan=rowspan, colspan=colspan)
+'''
+    Scatter plot for the dataset
 
+'''
+def plot_ds(grid_size, loc, X, y, xx, yy, title, seeds=None, colspan=1, rowspan=1):
+    ax = plt.subplot2grid(grid_size, loc, rowspan=rowspan, colspan=colspan)
     ax.set_title(title)
-    # Plot also the training points
+
+    # Plot the training points
     ax.scatter(X[:, 0],X[:, 1], c=y)
-    # and seeds
+
+    # Plot the seeds
     if seeds is not None:
-        ax.scatter(X[seeds, 0], X[seeds, 1],
-                   alpha=1.0, facecolors='magenta')
+        ax.scatter(X[seeds, 0], X[seeds, 1], alpha=1.0, facecolors='magenta')
+
     ax.set_xlim(xx.min(), xx.max())
     ax.set_ylim(yy.min(), yy.max())
     ax.set_xticks(())
     ax.set_yticks(())
 
-#perform active learning
-#
-def active(classifiers, datasets, experiments, quota=25, plot_every_n=5):
-    # USE THIS INSTEAD OF YTGT WHICH WE PRETEND TO NOT KNOW
-
-    for dsix, ((X_src, y_src), (X_tgt, y_tgt)) in enumerate(datasets):
+'''
+    Perform Active Learning
+    qs selects the QueryStrategy: 1 = Random Sampling, 2 = Uncertainty Sampling
+'''
+def active(classifiers, datasets, experiments, qs, quota=25, plot_every_n=5):
+    for dataset_index, ((X_src, y_src), (X_tgt, y_tgt)) in enumerate(datasets):
         u_tgt = [None] * len(X_tgt)
         est_src = ce.ComplexityEstimator(X_src, y_src, n_windows=10, nK=1)
         est_tgt = ce.ComplexityEstimator(X_tgt, y_tgt, n_windows=10, nK=1)
-        # declare Dataset instance, X is the feature, y is the label (None if unlabeled)
+
+        # Stack source and target features into one matrix
         X = np.vstack((X_src, X_tgt))
+
         if X.shape[1] > 2:
             X_src_plt = TSNE().fit_transform(X_src)
             X_tgt_plt = TSNE().fit_transform(X_tgt)
@@ -52,82 +67,113 @@ def active(classifiers, datasets, experiments, quota=25, plot_every_n=5):
         else:
             raise AttributeError
 
-        h = .05  # step size in the mesh
+        h = .05  # Step size in the mesh
         x_min, x_max = X_plt[:, 0].min() - h, X_plt[:, 0].max() + h
         y_min, y_max = X_plt[:, 1].min() - h, X_plt[:, 1].max() + h
         xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
         figure = plt.figure(figsize=(27, 13))
-
         grid_size = (1+len(classifiers), 6)
-        for n, classifier in enumerate(classifiers):
 
+        for classifier_index, classifier in enumerate(classifiers):
             model = classifier
             oracle = Oracle(X_tgt, y_tgt)
-            # plot src
-            plot_ds(grid_size, (0, 0), X_src_plt, y_src, xx, yy, 'Src', est_src.seeds)
+
+            # Plot source dataset
+            plot_ds(grid_size, (0, 0), X_src_plt, y_src, xx, yy, 'Source', est_src.seeds)
             ax = plt.subplot2grid(grid_size, (0,1), colspan=2)
-            ax.set_title('Src complexity')
+            ax.set_title('Source complexity')
             Ks, Es = est_src.get_k_complexity()
             ax.plot(Ks, Es)
             ax.set_xlabel('AUC=' + ('%.2f' % est_src.auc()).lstrip('0'))
 
-            #plt tgt
-            plot_ds(grid_size, (0, 3), X_tgt_plt, y_tgt, xx, yy, 'Tgt', est_tgt.seeds)
+            # Plot target dataset
+            plot_ds(grid_size, (0, 3), X_tgt_plt, y_tgt, xx, yy, 'Target', est_tgt.seeds)
             ax = plt.subplot2grid(grid_size, (0,4), colspan=2)
             Ks, Es = est_tgt.get_k_complexity()
-            ax.set_title('Tgt complexity')
+            ax.set_title('Target complexity')
             ax.plot(Ks, Es)
             ax.set_xlabel('AUC=' + ('%.2f' % est_tgt.auc()).lstrip('0'))
             w = 0
             X_known = X_src.tolist()
             y_known = y_src.tolist()
-            for i in range(quota):  # loop through the number of queries
-                loc, y_loc = oracle.random_query()  # let the specified QueryStrategy suggest a data to query
-                u_tgt[loc] = y_loc
-                X_known.append(X_tgt[loc])
-                y_known.append(y_tgt[loc])
-                if i == 0 or i % plot_every_n == 0 or i == quota - 1:
-                    model.fit(X_known, y_known)  # train model with newly-updated Dataset
-                    score = model.score(X_tgt, y_tgt)
-                    y_predicted = model.predict(X_tgt)
-                    ax = plt.subplot2grid(grid_size, (n + 1, w))
-                    nd_boundary_plot(X_tgt, y_predicted, model, ax)
-                    if i == 0:
-                        ax.set_ylabel(u.classname(model))
-                    if n == 0:
-                        ax.set_title('# queries=' + str(i))
-                    ax.set_xlabel('Accuracy='+('%.2f' % score).lstrip('0'))
-                    w += 1
-
-        figure.suptitle(experiments[dsix])
+
+            for i in range(quota):  # Loop through the number of queries
+
+                if qs == 1 :
+                    qs_name = 'RandomSampling'
+                    loc, y_loc = oracle.random_query()  # Sample target using RandomSampling strategy
+                    u_tgt[loc] = y_loc
+                    X_known.append(X_tgt[loc])
+                    y_known.append(y_tgt[loc])
+
+                    if i == 0 or i % plot_every_n == 0 or i == quota - 1:
+                        model.fit(X_known, y_known)  # Train model with newly-updated dataset
+                        score = model.score(X_tgt, y_tgt)
+                        y_predicted = model.predict(X_tgt)
+                        ax = plt.subplot2grid(grid_size, (classifier_index + 1, w))
+                        nd_boundary_plot(X_tgt, y_predicted, model, ax)
+
+                        if i == 0:
+                            ax.set_ylabel(u.classname(model))
+
+                        if classifier_index == 0:
+                            ax.set_title('# Queries=' + str(i))
+
+                        ax.set_xlabel('Accuracy='+('%.2f' % score).lstrip('0'))
+                        w += 1
+
+                elif qs == 2:
+                    qs_name = 'UncertaintySampling'
+                    model.fit(X_known, y_known) # Fit on all currently known samples (source plus any target points queried so far) before scoring uncertainty
+                    loc, X_chosen = oracle.uncertainty_sampling(model) # Sample target using UncertaintySampling strategy
+                    X_known.append(X_tgt[loc])
+                    y_known.append(y_tgt[loc])
+
+                    if i == 0 or i % plot_every_n == 0 or i == quota - 1:
+                        model.fit(X_known, y_known)  # Train model with newly-updated dataset
+                        score = model.score(X_tgt, y_tgt)
+                        y_predicted = model.predict(X_tgt)
+                        ax = plt.subplot2grid(grid_size, (classifier_index + 1, w))
+                        nd_boundary_plot(X_tgt, y_predicted, model, ax)
+
+                        if i == 0:
+                            ax.set_ylabel(u.classname(model))
+
+                        if classifier_index == 0:
+                            ax.set_title('# Queries=' + str(i))
+
+                        ax.set_xlabel('Accuracy='+('%.2f' % score).lstrip('0'))
+                        w += 1
+
+        figure.suptitle(experiments[dataset_index] + qs_name )
         figure.tight_layout()
-        fname = './vis/' + str(experiments[dsix]) + '.png'
-        figure.savefig(filename=fname)
+        fname = './vis/' + str(experiments[dataset_index] + qs_name) + '.png'
+        figure.savefig(fname)
+
     plt.tight_layout()
     plt.show()
 
-
 def main():
-    # clfs = [SVC(), LinearSVC(), AdaBoostClassifier(), GaussianNB()]
+    clfs = [SVC(), GaussianNB(), DecisionTreeClassifier(), MLPClassifier(hidden_layer_sizes=(10,10,10,10,10,10), solver='lbfgs', alpha=2, random_state=1, activation='relu')]
     datasets = []
     experiments = []
-    clfs = [DecisionTreeClassifier()]
 
-    # datasets.append(
-    #    (, make_gaussian_quantiles(n_samples=500, n_features=10, n_classes=2)))
+    # datasets.append((make_gaussian_quantiles(n_samples=500, n_features=10, n_classes=2), 
+    #                  make_gaussian_quantiles(n_samples=500, n_features=10, n_classes=2)))
     # experiments.append('hastie_10_2_vs_gauss_quant_10_2')
     # datasets.append((make_moons(n_samples=1000), make_moons(n_samples=1000)))
 
     # experiments.append('moons')
     # datasets.append((u.hastie(1000), u.hastie(1000)))
 
-    datasets.append((make_gaussian_quantiles(n_samples=500, n_features=10, n_classes=3),
-                     make_gaussian_quantiles(n_samples=500, n_features=10, n_classes=3)))
-    experiments.append('gauus')
-    # datasets.append((mnist.load_mnist(), mnist.load_mnist_rotated()))
-    # experiments.append('MNIST_vs_MNIST_Rotated')
+    # datasets.append((make_gaussian_quantiles(n_samples=2000, n_features=10, n_classes=3),
+    #                 make_gaussian_quantiles(n_samples=2000, n_features=10, n_classes=3)))
+    # experiments.append('gauus')
+
+    datasets.append((mnist.load_mnist(), mnist.load_mnist_rotated()))
+    experiments.append('MNIST_vs_MNIST_Rotated')
 
-    active(classifiers=clfs, datasets=datasets, experiments=experiments)
+    active(classifiers=clfs, datasets=datasets, experiments=experiments, qs=1)
 
 if __name__ == "__main__":
     main()
diff --git a/data/MNIST Data.md b/data/MNIST Data.md
@@ -0,0 +1,3 @@
+# MNIST Data
+
+[Download Here](https://drive.google.com/open?id=12E----VtVc03jqRQc8QaFuAbi1SMtI7i)
diff --git a/demo.py b/demo.py
@@ -27,8 +27,6 @@ def demo(datasets, dsnames, classifiers, nwindows):
     f1 = figure.number
     figure2 = plt.figure(figsize=(27, 9))
     f2 = figure2.number
-    figure3 = plt.figure(figsize=(27, 9))
-    f3 = figure3.number
 
     i = 1
     j = 1
@@ -112,32 +110,6 @@ def demo(datasets, dsnames, classifiers, nwindows):
             ax.set_title('Avg. Complexity')
         ax.plot(Ks, Es)
         j+=1
-
-
-        # plot data and
-        figure3, a = plt.subplots(nrows=len(datasets), ncols=2,figsize=(27,9))
-        a = a.ravel()
-
-        for idx,ax in enumerate(a):
-            if idx % 2 == 0:
-                ax.set_title(dsnames[ds_cnt])
-                # Plot also the training points
-                ax.scatter(X[:, 0], X[:, 1], c=y)
-                # and seeds
-                ax.scatter(X[estimator.seeds, 0], X[estimator.seeds, 1],
-                           alpha=1.0, facecolors='black')
-                ax.set_xlim(xx.min(), xx.max())
-                ax.set_ylim(yy.min(), yy.max())
-                ax.set_xticks(())
-                ax.set_yticks(())
-            else:
-                ax.hist(Es, 10)
-                ax.set_xlabel('E')
-                ax.set_ylabel('frequency')
-                ax.set_title('Hist. of Entropy')
-        figure3.tight_layout()
-        figure3.savefig(filename=('./vis/' + dsnames[ds_cnt] + 'Histograms.png'))
-
         '''
                 ws = estimator.get_w_complexity()
                 for wi, w in enumerate(ws):
@@ -149,18 +121,16 @@ def demo(datasets, dsnames, classifiers, nwindows):
 
     figure.tight_layout()
     figure2.tight_layout()
-
-
     figure.savefig(filename=('./vis/'+ ''.join(dsnames)+'Classifications.png'))
     figure2.savefig(filename=('./vis/'+''.join(dsnames) + 'Complexities.png'))
-
     plt.show()
 
 def main():
     classifiers = [
         LinearDiscriminantAnalysis(),
         QuadraticDiscriminantAnalysis(),
         KNeighborsClassifier(3),
+        MLPClassifier(alpha=1),
         SVC(gamma=2, C=1),
         LinearSVC(),
         GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),

diff --git a/modules/mnist.py b/modules/mnist.py
@@ -0,0 +1,59 @@
+import numpy as np
+import pandas as pd
+from scipy import ndimage
+import matplotlib.pyplot as plt
+from sklearn.utils import shuffle
+
+
+# Import data and preprocess 
+mnist = pd.read_csv('./data/mnist.csv') # NOTE(review): the loaders below hard-code rows = 42000, so the full CSV appears to be expected - confirm
+labels = mnist.as_matrix(columns=['label'])
+dataset = mnist.drop('label', axis = 1).as_matrix()
+dataset[dataset > 0] = 1 # Binarize pixels: every nonzero intensity becomes 1, zeros stay 0
+
+
+def load_mnist():
+
+    rows = 42000
+    columns = 784
+    index = 1
+    X = []
+    for image in dataset[:rows*columns]:
+        img = np.reshape(image, [28, 28])
+        X.append(img)
+        index += 1
+    X = np.array(X).reshape(rows, -1)
+    mnist = pd.DataFrame(X)
+    mnist = mnist.as_matrix()
+    y = labels.flatten()
+
+    print("Completed with X shape: ", mnist.shape)
+    print("Flattened y shape: ", y.shape)
+
+    mnist, y = shuffle(X, y, random_state = 5)
+    return mnist, y
+
+
+def load_mnist_rotated():
+
+    rows = 42000
+    columns = 784
+    indx = 1
+    X = []
+    for image in dataset[:rows*columns]:
+        img = np.reshape(image, [28, 28])
+        rotated = ndimage.rotate(img, 90) # Rotate the images by 90 degrees
+        X.append(rotated)
+        indx += 1
+    X = np.array(X).reshape(rows, -1)
+
+    mnist_rotated = pd.DataFrame(X)
+    # mnist_rotated.to_csv('./data/mnist_rotated/minst_rotated_21000.csv', index=False, header=False)
+    mnist_rotated = mnist_rotated.as_matrix()
+
+    y = labels.flatten()
+    print("Completed with X shape: ", mnist_rotated.shape)
+    print("Flattened y shape: ", y.shape)
+
+    mnist_rotated, y = shuffle(X, y, random_state = 15)
+    return mnist_rotated, y
diff --git a/modules/oracle.py b/modules/oracle.py
@@ -16,4 +16,27 @@ def random_query(self):
             if loc not in self.queried:
                 self.queried.append(loc)
                 break
-        return loc, self.y[loc]
+        return loc, self.y[loc]
+
+    def classifier_uncertainty(self, classifier):
+        try:
+            classwise_uncertainty = classifier.predict_proba(self.X)
+        except AttributeError:
+            classwise_uncertainty = classifier.decision_function(self.X)
+
+        uncertainty = 1 - np.max(classwise_uncertainty, axis=1)
+
+        return uncertainty
+
+    def uncertainty_sampling(self, classifier, n_instances=1):
+        uncertainty = self.classifier_uncertainty(classifier)
+        query_idx = self.multi_argmax(uncertainty, n_instances=n_instances)
+
+        return int(query_idx), self.X[query_idx]
+
+    def multi_argmax(self, values, n_instances=1):
+        assert n_instances <= len(values), 'n_instances must be less or equal than the size of utility'
+        max_idx = np.argpartition(-values, n_instances-1, axis=0)[:n_instances]
+
+        return max_idx
+
diff --git a/nd_boundary_plot b/nd_boundary_plot
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# MNIST Data

		[Download Here](https://drive.google.com/open?id=12E----VtVc03jqRQc8QaFuAbi1SMtI7i)