
Commit fd3d5ca
all classifiers also working
seekshreyas committed Mar 31, 2014
1 parent c63a177 commit fd3d5ca
Showing 4 changed files with 523 additions and 453 deletions.
110 changes: 90 additions & 20 deletions appClassifierBenchmark.py
@@ -29,6 +29,7 @@ def getUserInput(models):
     optionparser = OptionParser(add_help_option=False, epilog="multiline")
 
     optionparser.add_option('-c', '--classifier', dest='classifier', default="all")
+    optionparser.add_option('-s', '--sample', dest='sample', default="all")
     optionparser.add_option('-h', '--help', dest='help', action='store_true',
                             help='show this help message and exit')
     optionparser.add_option('-f', '--file', dest='file')
@@ -58,7 +59,11 @@ def getUserInput(models):
         return optionparser.error('Data File path not provided.\n Usage: --file="path.to.appData"')
 
 
-    return { 'classifier' : option.classifier, 'file': option.file }
+    return {
+        'classifier' : option.classifier,
+        'file': option.file,
+        'sample' : option.sample
+    }
 
 
 
@@ -101,9 +106,9 @@ def trimDf(df):
 
 
 
-def prepareClassifier(df, models, choice):
+def prepareSplitClassifier(df, models, choice):
     """
-    Classify the apps
+    Classify the apps for equal splits
     """
 
 
@@ -119,7 +124,7 @@ def classificationOutput(clf, X, Y):
     print "#" * 79
     # classifier_gnb = naive_bayes.GaussianNB() # initiating the classifier
 
-    Y_pred = clf.fit(X[:n_samples], Y[:n_samples]) # train on first n_samples and test on last 10
+    clf.fit(X[:n_samples], Y[:n_samples]) # train on first n_samples and test on last 10
 
     expected = Y[n_samples:]
     predicted = clf.predict(X[n_samples:])
@@ -129,7 +134,7 @@ def classificationOutput(clf, X, Y):
 
 
 
-def classify(cDf):
+def splitclassify(cDf):
     """
     Given the dataframe combined with equal fair and unfair apps,
     classify them
@@ -139,6 +144,7 @@ def classify(cDf):
     featCols.remove('appLabel')
 
     features = cDf[list(featCols)].astype('float')
+
     ## Scale the features to a common range
     min_max_scaler = preprocessing.MinMaxScaler()
     X = min_max_scaler.fit_transform(features.values)
@@ -173,12 +179,79 @@ def classify(cDf):
         # print fairDf.values, unfairDf.values
         print "Classifying %d th split of fair apps with unfair app" % (i)
         print "-" * 79
-        classify(clDf)
+        splitclassify(clDf)
         print "\n\n"
 
 
 
 
+def performClassification(clf, featVector, labelVector, fold=4):
+    """
+    Perform Classification
+    """
+
+    (numrow, numcol) = featVector.shape
+
+    foldsize = int(numrow//fold)
+
+    print "FoldSize: %s" % (foldsize)
+
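+    # Manual k-fold cross-validation: each pass below holds out the i-th
+    # contiguous block as the test fold and trains on the remaining rows.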
+    for i in range(fold):
+        X_test = featVector[i*foldsize:(i+1)*foldsize]
+        Y_test = labelVector[i*foldsize:(i+1)*foldsize]
+
+        X_train = np.concatenate((featVector[:i*foldsize], featVector[(i+1)*foldsize:]))
+        Y_train = np.concatenate((labelVector[:i*foldsize], labelVector[(i+1)*foldsize:]))
+
+        print " X_train: %s, Y_train: %s, X_test: %s, Y_test: %s" % (X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)
+
+        print "#### Classifier: \n %s" % (clf)
+
+
+        clf.fit(X_train, Y_train) # train on the k-1 training folds
+
+        expected = Y_test
+        predicted = clf.predict(X_test)
+        print "Classification report:\n%s\n" % metrics.classification_report(expected, predicted)
+        print "\nConfusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted)
+
+
+
+
+def allClassifier(cDf, models, modelchoice):
+    """
+    Classifier for all apps
+    """
+
+    print "Data Size: %s, \t Model Choice: %s" % (cDf.shape, modelchoice)
+
+    cDf = cDf.reindex(np.random.permutation(cDf.index)) # shuffle the dataframe
+    featCols = set(cDf.columns)
+    featCols.remove('appLabel')
+
+    features = cDf[list(featCols)].astype('float')
+
+    ## Scale the features to a common range
+    min_max_scaler = preprocessing.MinMaxScaler()
+    featVector = min_max_scaler.fit_transform(features.values) # scaled feature vector
+
+    labelVector = cDf['appLabel'].values # label vector
+
+
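+    # The non-linear SVM ('svm-nl') is excluded in both branches below.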
+    if modelchoice == 'all':
+        for key in models:
+            if key != 'svm-nl':
+                classifier = models[key]
+                performClassification(classifier, featVector, labelVector)
+    else:
+        if modelchoice in models and modelchoice != 'svm-nl':
+            classifier = models[modelchoice]
+            performClassification(classifier, featVector, labelVector)
+        else:
+            print "Incorrect Choice"
+
+
+
+
+
+
@@ -187,26 +260,23 @@ def main():
     # Supported classifier models
     n_neighbors = 3
     models = {
         'nb' : naive_bayes.GaussianNB(),
         'svm-l' : svm.SVC(),
         'svm-nl' : svm.NuSVC(),
         'tree' : tree.DecisionTreeClassifier(),
         'forest': AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200),
         'knn-uniform' : neighbors.KNeighborsClassifier(n_neighbors, weights='uniform'),
         'knn-distance' : neighbors.KNeighborsClassifier(n_neighbors, weights='distance')
     }
 
     userInput = getUserInput(models)
     appDf = loadAppData(userInput['file'])
     appDf = trimDf(appDf)
 
 
-    # print "Sample Data"
-    # print "-" * 79
-    # print appDf.head()
-
-
-    prepareClassifier(appDf, models, userInput['classifier'])
+    if userInput['sample'] == 'all':
+        allClassifier(appDf, models, userInput['classifier'])
+    else:
+        prepareSplitClassifier(appDf, models, userInput['classifier'])
 
 
 
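With the new -s/--sample option, the benchmark can be run either over the full shuffled dataset or over the equal fair/unfair splits. A minimal invocation sketch (the data-file path is a placeholder):

    python appClassifierBenchmark.py --file="path/to/appData" --classifier=nb --sample=all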
2 changes: 1 addition & 1 deletion classifier.py
@@ -375,7 +375,7 @@ def classifier(alldata, fold=4):
     # pprint(data)
 
     claccuracy = []
-    size = int(math.floor(len(data) / 10.0))
+    size = int(math.floor(len(data) / fold))
 
     for i in range(fold):
         test_this_round = data[i*size:][:size]
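This makes the chunk size track the requested number of folds: e.g., with fold=4 and len(data) == 100, size is now 25 rather than the 10 produced by the old hard-coded divisor.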
