From b7a4a370ef85daabff706fbef6109f08e52f5113 Mon Sep 17 00:00:00 2001 From: Mikhail Mekhedkin Meskhi Date: Wed, 20 Jun 2018 19:44:59 +0400 Subject: [PATCH 1/3] Added Uncertainty sampling strategy --- complexity.py | 164 +++++++++++++++++++++++++++++---------------- data/MNIST Data.md | 3 + demo.py | 32 +-------- modules/mnist.py | 59 ++++++++++++++++ modules/oracle.py | 25 ++++++- nd_boundary_plot | 2 +- 6 files changed, 193 insertions(+), 92 deletions(-) create mode 100644 data/MNIST Data.md create mode 100644 modules/mnist.py diff --git a/complexity.py b/complexity.py index be0f046..0ac730a 100644 --- a/complexity.py +++ b/complexity.py @@ -1,46 +1,61 @@ +# Necessities +import numpy as np +import pandas as pd import matplotlib.pyplot as plt -import numpy as np +# Sklearn imports (models, synthetic data, etc...) +from sklearn.datasets import make_moons from sklearn.datasets import make_gaussian_quantiles from sklearn.manifold.t_sne import TSNE +from sklearn.svm import SVC +from sklearn.naive_bayes import GaussianNB from sklearn.tree import DecisionTreeClassifier +from sklearn.neural_network import MLPClassifier -import modules.complexity_estimator as ce +# Active Learning and Complexity Modules import modules.util as u from modules.oracle import Oracle +import modules.complexity_estimator as ce from nd_boundary_plot.plots import nd_boundary_plot +# Data pre-processing and import +from modules import mnist -################################################################################################33 -#scatter plot of a dataset helper -# -def plot_ds(grid_size, loc, X, y, xx, yy, title, seeds=None, colspan=1, rowspan=1): +#################################################### - ax = plt.subplot2grid(grid_size, loc, rowspan=rowspan, colspan=colspan) +''' + Scatter plot for the dataset +''' +def plot_ds(grid_size, loc, X, y, xx, yy, title, seeds=None, colspan=1, rowspan=1): + ax = plt.subplot2grid(grid_size, loc, rowspan=rowspan, colspan=colspan) ax.set_title(title) - # Plot also the training points + + # Plot the training points ax.scatter(X[:, 0],X[:, 1], c=y) - # and seeds + + # Plot the seeds if seeds is not None: - ax.scatter(X[seeds, 0], X[seeds, 1], - alpha=1.0, facecolors='magenta') + ax.scatter(X[seeds, 0], X[seeds, 1], alpha=1.0, facecolors='magenta') + ax.set_xlim(xx.min(), xx.max()) ax.set_ylim(yy.min(), yy.max()) ax.set_xticks(()) ax.set_yticks(()) -#perform active learning -# -def active(classifiers, datasets, experiments, quota=25, plot_every_n=5): - # USE THIS INSTEAD OF YTGT WHICH WE PRETEND TO NOT KNOW - - for dsix, ((X_src, y_src), (X_tgt, y_tgt)) in enumerate(datasets): +''' + Perform Active Learning + QueryStrategy (Random Sampling or Uncertainty Sampling) +''' +def active(classifiers, datasets, experiments, qs, quota=25, plot_every_n=5): + for dataset_index, ((X_src, y_src), (X_tgt, y_tgt)) in enumerate(datasets): u_tgt = [None] * len(X_tgt) est_src = ce.ComplexityEstimator(X_src, y_src, n_windows=10, nK=1) est_tgt = ce.ComplexityEstimator(X_tgt, y_tgt, n_windows=10, nK=1) - # declare Dataset instance, X is the feature, y is the label (None if unlabeled) + + # Declare Dataset instance, X is the feature, y is the label (None if unlabeled) X = np.vstack((X_src, X_tgt)) + if X.shape[1] > 2: X_src_plt = TSNE().fit_transform(X_src) X_tgt_plt = TSNE().fit_transform(X_tgt) @@ -52,82 +67,113 @@ def active(classifiers, datasets, experiments, quota=25, plot_every_n=5): else: raise AttributeError - h = .05 # step size in the mesh + h = .05 # Step size in the mesh x_min, x_max = X_plt[:, 0].min() - h, X_plt[:, 0].max() + h y_min, y_max = X_plt[:, 1].min() - h, X_plt[:, 1].max() + h xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) figure = plt.figure(figsize=(27, 13)) - grid_size = (1+len(classifiers), 6) - for n, classifier in enumerate(classifiers): + for classifier_index, classifier in enumerate(classifiers): model = classifier oracle = Oracle(X_tgt, y_tgt) - # plot src - plot_ds(grid_size, (0, 0), X_src_plt, y_src, xx, yy, 'Src', est_src.seeds) + + # Plot source dataset + plot_ds(grid_size, (0, 0), X_src_plt, y_src, xx, yy, 'Source', est_src.seeds) ax = plt.subplot2grid(grid_size, (0,1), colspan=2) - ax.set_title('Src complexity') + ax.set_title('Source complexity') Ks, Es = est_src.get_k_complexity() ax.plot(Ks, Es) ax.set_xlabel('AUC=' + ('%.2f' % est_src.auc()).lstrip('0')) - #plt tgt - plot_ds(grid_size, (0, 3), X_tgt_plt, y_tgt, xx, yy, 'Tgt', est_tgt.seeds) + # Plot target dataset + plot_ds(grid_size, (0, 3), X_tgt_plt, y_tgt, xx, yy, 'Target', est_tgt.seeds) ax = plt.subplot2grid(grid_size, (0,4), colspan=2) Ks, Es = est_tgt.get_k_complexity() - ax.set_title('Tgt complexity') + ax.set_title('Target complexity') ax.plot(Ks, Es) ax.set_xlabel('AUC=' + ('%.2f' % est_tgt.auc()).lstrip('0')) w = 0 X_known = X_src.tolist() y_known = y_src.tolist() - for i in range(quota): # loop through the number of queries - loc, y_loc = oracle.random_query() # let the specified QueryStrategy suggest a data to query - u_tgt[loc] = y_loc - X_known.append(X_tgt[loc]) - y_known.append(y_tgt[loc]) - if i == 0 or i % plot_every_n == 0 or i == quota - 1: - model.fit(X_known, y_known) # train model with newly-updated Dataset - score = model.score(X_tgt, y_tgt) - y_predicted = model.predict(X_tgt) - ax = plt.subplot2grid(grid_size, (n + 1, w)) - nd_boundary_plot(X_tgt, y_predicted, model, ax) - if i == 0: - ax.set_ylabel(u.classname(model)) - if n == 0: - ax.set_title('# queries=' + str(i)) - ax.set_xlabel('Accuracy='+('%.2f' % score).lstrip('0')) - w += 1 - - figure.suptitle(experiments[dsix]) + + for i in range(quota): # Loop through the number of queries + + if qs == 1 : + qs_name = 'RandomSampling' + loc, y_loc = oracle.random_query() # Sample target using RandomSampling strategy + u_tgt[loc] = y_loc + X_known.append(X_tgt[loc]) + y_known.append(y_tgt[loc]) + + if i == 0 or i % plot_every_n == 0 or i == quota - 1: + model.fit(X_known, y_known) # Train model with newly-updated dataset + score = model.score(X_tgt, y_tgt) + y_predicted = model.predict(X_tgt) + ax = plt.subplot2grid(grid_size, (classifier_index + 1, w)) + nd_boundary_plot(X_tgt, y_predicted, model, ax) + + if i == 0: + ax.set_ylabel(u.classname(model)) + + if classifier_index == 0: + ax.set_title('# Queries=' + str(i)) + + ax.set_xlabel('Accuracy='+('%.2f' % score).lstrip('0')) + w += 1 + + elif qs == 2: + qs_name = 'UncertaintySampling' + model.fit(X_known, y_known) # Fit model on source only to predict probabilities + loc, X_chosen = oracle.uncertainty_sampling(model) # Sample target using UncertaintySampling strategy + X_known.append(X_tgt[loc]) + y_known.append(y_tgt[loc]) + + if i == 0 or i % plot_every_n == 0 or i == quota - 1: + model.fit(X_known, y_known) # Train model with newly-updated dataset + score = model.score(X_tgt, y_tgt) + y_predicted = model.predict(X_tgt) + ax = plt.subplot2grid(grid_size, (classifier_index + 1, w)) + nd_boundary_plot(X_tgt, y_predicted, model, ax) + + if i == 0: + ax.set_ylabel(u.classname(model)) + + if classifier_index == 0: + ax.set_title('# Queries=' + str(i)) + + ax.set_xlabel('Accuracy='+('%.2f' % score).lstrip('0')) + w += 1 + + figure.suptitle(experiments[dataset_index] + qs_name ) figure.tight_layout() - fname = './vis/' + str(experiments[dsix]) + '.png' - figure.savefig(filename=fname) + fname = './vis/' + str(experiments[dataset_index] + qs_name) + '.png' + figure.savefig(fname) + plt.tight_layout() plt.show() - def main(): - # clfs = [SVC(), LinearSVC(), AdaBoostClassifier(), GaussianNB()] + clfs = [SVC(), GaussianNB(), DecisionTreeClassifier(), MLPClassifier(hidden_layer_sizes=(10,10,10,10,10,10), solver='lbfgs', alpha=2, random_state=1, activation='relu')] datasets = [] experiments = [] - clfs = [DecisionTreeClassifier()] - # datasets.append( - # (, make_gaussian_quantiles(n_samples=500, n_features=10, n_classes=2))) + # datasets.append((make_gaussian_quantiles(n_samples=500, n_features=10, n_classes=2), + # make_gaussian_quantiles(n_samples=500, n_features=10, n_classes=2))) # experiments.append('hastie_10_2_vs_gauss_quant_10_2') # datasets.append((make_moons(n_samples=1000), make_moons(n_samples=1000))) # experiments.append('moons') # datasets.append((u.hastie(1000), u.hastie(1000))) - datasets.append((make_gaussian_quantiles(n_samples=500, n_features=10, n_classes=3), - make_gaussian_quantiles(n_samples=500, n_features=10, n_classes=3))) - experiments.append('gauus') - # datasets.append((mnist.load_mnist(), mnist.load_mnist_rotated())) - # experiments.append('MNIST_vs_MNIST_Rotated') + # datasets.append((make_gaussian_quantiles(n_samples=2000, n_features=10, n_classes=3), + # make_gaussian_quantiles(n_samples=2000, n_features=10, n_classes=3))) + # experiments.append('gauus') + + datasets.append((mnist.load_mnist(), mnist.load_mnist_rotated())) + experiments.append('MNIST_vs_MNIST_Rotated') - active(classifiers=clfs, datasets=datasets, experiments=experiments) + active(classifiers=clfs, datasets=datasets, experiments=experiments, qs=1) if __name__ == "__main__": main() diff --git a/data/MNIST Data.md b/data/MNIST Data.md new file mode 100644 index 0000000..6149480 --- /dev/null +++ b/data/MNIST Data.md @@ -0,0 +1,3 @@ +# MNIST Data + +[Download Here](https://drive.google.com/open?id=12E----VtVc03jqRQc8QaFuAbi1SMtI7i) \ No newline at end of file diff --git a/demo.py b/demo.py index 2d1f929..6c9d385 100644 --- a/demo.py +++ b/demo.py @@ -27,8 +27,6 @@ def demo(datasets, dsnames, classifiers, nwindows): f1 = figure.number figure2 = plt.figure(figsize=(27, 9)) f2 = figure2.number - figure3 = plt.figure(figsize=(27, 9)) - f3 = figure3.number i = 1 j = 1 @@ -112,32 +110,6 @@ def demo(datasets, dsnames, classifiers, nwindows): ax.set_title('Avg. Complexity') ax.plot(Ks, Es) j+=1 - - - # plot data and - figure3, a = plt.subplots(nrows=len(datasets), ncols=2,figsize=(27,9)) - a = a.ravel() - - for idx,ax in enumerate(a): - if idx % 2 == 0: - ax.set_title(dsnames[ds_cnt]) - # Plot also the training points - ax.scatter(X[:, 0], X[:, 1], c=y) - # and seeds - ax.scatter(X[estimator.seeds, 0], X[estimator.seeds, 1], - alpha=1.0, facecolors='black') - ax.set_xlim(xx.min(), xx.max()) - ax.set_ylim(yy.min(), yy.max()) - ax.set_xticks(()) - ax.set_yticks(()) - else: - ax.hist(Es, 10) - ax.set_xlabel('E') - ax.set_ylabel('frequency') - ax.set_title('Hist. of Entropy') - figure3.tight_layout() - figure3.savefig(filename=('./vis/' + dsnames[ds_cnt] + 'Histograms.png')) - ''' ws = estimator.get_w_complexity() for wi, w in enumerate(ws): @@ -149,11 +121,8 @@ def demo(datasets, dsnames, classifiers, nwindows): figure.tight_layout() figure2.tight_layout() - - figure.savefig(filename=('./vis/'+ ''.join(dsnames)+'Classifications.png')) figure2.savefig(filename=('./vis/'+''.join(dsnames) + 'Complexities.png')) - plt.show() def main(): @@ -161,6 +130,7 @@ def main(): LinearDiscriminantAnalysis(), QuadraticDiscriminantAnalysis(), KNeighborsClassifier(3), + MLPClassifier(alpha=1), SVC(gamma=2, C=1), LinearSVC(), GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True), diff --git a/modules/mnist.py b/modules/mnist.py new file mode 100644 index 0000000..d4ae5dc --- /dev/null +++ b/modules/mnist.py @@ -0,0 +1,59 @@ +import numpy as np +import pandas as pd +from scipy import ndimage +import matplotlib.pyplot as plt +from sklearn.utils import shuffle + + +# Import data and preprocess +mnist = pd.read_csv('./data/mnist.csv') # Using 100 samples only for this test run +labels = mnist.as_matrix(columns=['label']) +dataset = mnist.drop('label', axis = 1).as_matrix() +dataset[dataset > 0] = 1 # Convert each pixel either 0 for white and 1 for black for better classification + + +def load_mnist(): + + rows = 42000 + columns = 784 + index = 1 + X = [] + for image in dataset[:rows*columns]: + img = np.reshape(image, [28, 28]) + X.append(img) + index += 1 + X = np.array(X).reshape(rows, -1) + mnist = pd.DataFrame(X) + mnist = mnist.as_matrix() + y = labels.flatten() + + print("Completed with X shape: ", mnist.shape) + print("Flattened y shape: ", y.shape) + + mnist, y = shuffle(X, y, random_state = 5) + return mnist, y + + +def load_mnist_rotated(): + + rows = 42000 + columns = 784 + indx = 1 + X = [] + for image in dataset[:rows*columns]: + img = np.reshape(image, [28, 28]) + rotated = ndimage.rotate(img, 90) # Rotate the images by 90 degrees + X.append(rotated) + indx += 1 + X = np.array(X).reshape(rows, -1) + + mnist_rotated = pd.DataFrame(X) + # mnist_rotated.to_csv('./data/mnist_rotated/minst_rotated_21000.csv', index=False, header=False) + mnist_rotated = mnist_rotated.as_matrix() + + y = labels.flatten() + print("Completed with X shape: ", mnist_rotated.shape) + print("Flattened y shape: ", y.shape) + + mnist_rotated, y = shuffle(X, y, random_state = 15) + return mnist_rotated, y \ No newline at end of file diff --git a/modules/oracle.py b/modules/oracle.py index bfd4b0a..57047c0 100644 --- a/modules/oracle.py +++ b/modules/oracle.py @@ -16,4 +16,27 @@ def random_query(self): if loc not in self.queried: self.queried.append(loc) break - return loc, self.y[loc] \ No newline at end of file + return loc, self.y[loc] + + def classifier_uncertainty(self, classifier): + try: + classwise_uncertainty = classifier.predict_proba(self.X) + except AttributeError: + classwise_uncertainty = classifier.decision_function(self.X) + + uncertainty = 1 - np.max(classwise_uncertainty, axis=1) + + return uncertainty + + def uncertainty_sampling(self, classifier, n_instances=1): + uncertainty = self.classifier_uncertainty(classifier) + query_idx = self.multi_argmax(uncertainty, n_instances=n_instances) + + return int(query_idx), self.X[query_idx] + + def multi_argmax(self, values, n_instances=1): + assert n_instances <= len(values), 'n_instances must be less or equal than the size of utility' + max_idx = np.argpartition(-values, n_instances-1, axis=0)[:n_instances] + + return max_idx + diff --git a/nd_boundary_plot b/nd_boundary_plot index 12dd5fd..974a0ed 160000 --- a/nd_boundary_plot +++ b/nd_boundary_plot @@ -1 +1 @@ -Subproject commit 12dd5fdcef68344bac04b08bac14ab0db8d8bf8b +Subproject commit 974a0ed54f0bf820e602ef026b71c2955d796a9b From 0de3ebc68f1cdc19608015908c7a21b1ac153f56 Mon Sep 17 00:00:00 2001 From: Mikhail Mekhedkin Meskhi Date: Wed, 20 Jun 2018 19:46:20 +0400 Subject: [PATCH 2/3] Added Data Link --- data/{MNIST Data.md => README.md} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename data/{MNIST Data.md => README.md} (100%) diff --git a/data/MNIST Data.md b/data/README.md similarity index 100% rename from data/MNIST Data.md rename to data/README.md From cac0b57df8e6b1cc8358ed22b51c6ea88335978b Mon Sep 17 00:00:00 2001 From: Mikhail Mekhedkin Meskhi Date: Wed, 20 Jun 2018 19:49:30 +0400 Subject: [PATCH 3/3] Fixed Query Startegy Parameter --- complexity.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/complexity.py b/complexity.py index 0ac730a..eee1f07 100644 --- a/complexity.py +++ b/complexity.py @@ -47,7 +47,7 @@ def plot_ds(grid_size, loc, X, y, xx, yy, title, seeds=None, colspan=1, rowspan= Perform Active Learning QueryStrategy (Random Sampling or Uncertainty Sampling) ''' -def active(classifiers, datasets, experiments, qs, quota=25, plot_every_n=5): +def active(classifiers, datasets, experiments, query_strat, quota=25, plot_every_n=5): for dataset_index, ((X_src, y_src), (X_tgt, y_tgt)) in enumerate(datasets): u_tgt = [None] * len(X_tgt) est_src = ce.ComplexityEstimator(X_src, y_src, n_windows=10, nK=1) @@ -99,8 +99,7 @@ def active(classifiers, datasets, experiments, qs, quota=25, plot_every_n=5): for i in range(quota): # Loop through the number of queries - if qs == 1 : - qs_name = 'RandomSampling' + if query_strat == 'RandomSampling' : loc, y_loc = oracle.random_query() # Sample target using RandomSampling strategy u_tgt[loc] = y_loc X_known.append(X_tgt[loc]) @@ -122,8 +121,7 @@ def active(classifiers, datasets, experiments, qs, quota=25, plot_every_n=5): ax.set_xlabel('Accuracy='+('%.2f' % score).lstrip('0')) w += 1 - elif qs == 2: - qs_name = 'UncertaintySampling' + elif query_strat == 'UncertaintySampling': model.fit(X_known, y_known) # Fit model on source only to predict probabilities loc, X_chosen = oracle.uncertainty_sampling(model) # Sample target using UncertaintySampling strategy X_known.append(X_tgt[loc]) @@ -145,9 +143,9 @@ def active(classifiers, datasets, experiments, qs, quota=25, plot_every_n=5): ax.set_xlabel('Accuracy='+('%.2f' % score).lstrip('0')) w += 1 - figure.suptitle(experiments[dataset_index] + qs_name ) + figure.suptitle(experiments[dataset_index] + query_strat ) figure.tight_layout() - fname = './vis/' + str(experiments[dataset_index] + qs_name) + '.png' + fname = './vis/' + str(experiments[dataset_index] + query_strat ) + '.png' figure.savefig(fname) plt.tight_layout() @@ -157,6 +155,7 @@ def main(): clfs = [SVC(), GaussianNB(), DecisionTreeClassifier(), MLPClassifier(hidden_layer_sizes=(10,10,10,10,10,10), solver='lbfgs', alpha=2, random_state=1, activation='relu')] datasets = [] experiments = [] + query_strat = 'RandomSampling' # datasets.append((make_gaussian_quantiles(n_samples=500, n_features=10, n_classes=2), # make_gaussian_quantiles(n_samples=500, n_features=10, n_classes=2))) @@ -173,7 +172,7 @@ def main(): datasets.append((mnist.load_mnist(), mnist.load_mnist_rotated())) experiments.append('MNIST_vs_MNIST_Rotated') - active(classifiers=clfs, datasets=datasets, experiments=experiments, qs=1) + active(classifiers=clfs, datasets=datasets, experiments=experiments, query_strat=query_strat) if __name__ == "__main__": main()