Commit 4852a7f (1 parent: a34df51). Showing 3 changed files with 305 additions and 0 deletions.
@@ -0,0 +1,285 @@
#! /usr/bin/env python
# -*- coding: UTF-8 -*-
"""
Benchmark different classifiers
Classifiers tried:
- Naive Bayes
- Random Forest
- SVM
    - linear
    - kernelized
"""
from __future__ import division
import sys
import pandas as pd
import numpy as np
from optparse import OptionParser
from sklearn import metrics, preprocessing
from sklearn import svm, naive_bayes, neighbors, tree
from sklearn.ensemble import AdaBoostClassifier
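
# A minimal usage sketch (script name and CSV path are illustrative, not from
# the original repo); any CSV with a 'label' column of 'fair'/'unfair' values
# plus numeric feature columns should work:
#
#   python benchmark.py --file='data/appFeatures.csv' --classifier=nb
#   python benchmark.py --file='data/appFeatures.csv' --classifier=all --sample=split
#
# Any --sample value other than the default 'all' routes to
# prepareSplitClassifier instead of allClassifier.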


def getUserInput(models):
    """
    Get user input from the command line.
    """
    optionparser = OptionParser(add_help_option=False, epilog="multiline")

    optionparser.add_option('-c', '--classifier', dest='classifier', default="all")
    optionparser.add_option('-s', '--sample', dest='sample', default="all")
    optionparser.add_option('-h', '--help', dest='help', action='store_true',
                            help='show this help message and exit')
    optionparser.add_option('-f', '--file', dest='file')

    (option, args) = optionparser.parse_args()

    if option.help:
        optionparser.print_help()  # print_help() writes to stdout itself
        print __doc__
        print "Supported Classifier Models:"

        for index, key in enumerate(models):
            print "%2s %20s" % (index, key)

        print "Default option: 'all'\n"

        print "To run the program, provide the app features file path"
        print "Usage: --file='path.to.appData'"

        sys.exit()

    if not option.file:
        return optionparser.error('Data File path not provided.\n Usage: --file="path.to.data"')

    return {
        'classifier': option.classifier,
        'file': option.file,
        'sample': option.sample
    }
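
# For example, a hypothetical invocation `--classifier=nb --file='apps.csv'`
# returns: {'classifier': 'nb', 'file': 'apps.csv', 'sample': 'all'}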


def loadAppData(datafile):
    """
    Load the app data CSV and map the label column to a boolean:
    { 'fair': False, 'unfair': True }
    """
    df = pd.read_csv(datafile)

    ## Remove the unnamed column (purpose unclear; left disabled)
    # cols = set(df.columns)
    # cols.remove('Unnamed: 7')
    # df = df[list(cols)]

    ## Convert label to boolean: True for 'unfair'
    df['label'] = df['label'].map(lambda x: x == 'unfair')

    return df
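
# Sketch of the label mapping on toy values (not project data):
#   pd.Series(['fair', 'unfair', 'fair']).map(lambda x: x == 'unfair')
#   -> 0    False
#      1     True
#      2    False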


def trimDf(df):
    """
    Trim the dataframe provided:
    remove features that we don't think are helping.
    """
    cols = set(df.columns)

    cols.remove('feat3')  # bug in our feature extraction code
    cols.remove('feat8')  # considered only free apps

    return df[list(cols)]


def prepareSplitClassifier(df, models, choice):
    """
    Classify the apps over equal-sized fair/unfair splits.
    """

    def classificationOutput(clf, X, Y):
        """
        Fit the model and print the classification results
        - confusion_matrix
        - avg scores etc
        """
        n_samples = 36

        print "\n\nClassifier: \n %s" % (clf)
        print "#" * 79
        # classifier_gnb = naive_bayes.GaussianNB() # initiating the classifier

        clf.fit(X[:n_samples], Y[:n_samples])  # train on the first n_samples rows, test on the rest

        expected = Y[n_samples:]
        predicted = clf.predict(X[n_samples:])
        print "Classification report:\n%s\n" % (metrics.classification_report(expected, predicted))
        print "\nConfusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted)

    def splitclassify(cDf):
        """
        Given the dataframe combined with equal fair and unfair apps,
        classify them.
        """
        cDf = cDf.reindex(np.random.permutation(cDf.index))  # shuffle the dataframe
        featCols = set(cDf.columns)
        featCols.remove('label')

        features = cDf[list(featCols)].astype('float')

        ## Scale the features to a common range
        min_max_scaler = preprocessing.MinMaxScaler()
        X = min_max_scaler.fit_transform(features.values)

        Y = cDf['label'].values

        if choice == 'all':
            for key in models:
                classifier = models[key]
                classificationOutput(classifier, X, Y)
        else:
            if choice in models:
                classifier = models[choice]
                classificationOutput(classifier, X, Y)
            else:
                print "Incorrect Choice"

    fairDf = df[df['label'] == False]
    unfairDf = df[df['label'] == True]

    # calculate the total possible splits of the fair dataframe relative to
    # the size of the unfair dataframe
    splits = len(fairDf) // len(unfairDf)
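
    # Worked example (illustrative numbers, not from the dataset): with 180
    # fair rows and 36 unfair rows, splits = 180 // 36 = 5, and each split
    # classifies 36 fair + 36 unfair = 72 shuffled rows; classificationOutput
    # then trains on the first n_samples = 36 rows and tests on the other 36.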

    for i in range(splits):
        # take the i-th block of fair rows (same size as unfairDf) so the
        # splits are disjoint, then append the unfair rows
        clDf = fairDf[i * len(unfairDf):(i + 1) * len(unfairDf)].append(unfairDf)

        # print fairDf.values, unfairDf.values
        print "Classifying %d-th split of fair with unfair" % (i)
        print "-" * 79
        splitclassify(clDf)
        print "\n\n"


def performClassification(clf, featVector, labelVector, fold=4):
    """
    Perform k-fold cross-validated classification.
    """
    (numrow, numcol) = featVector.shape

    foldsize = int(numrow // fold)

    print "FoldSize: %s" % (foldsize)

    for i in range(fold):
        X_test = featVector[i * foldsize:(i + 1) * foldsize]
        Y_test = labelVector[i * foldsize:(i + 1) * foldsize]

        X_train = np.concatenate((featVector[:i * foldsize], featVector[(i + 1) * foldsize:]))
        Y_train = np.concatenate((labelVector[:i * foldsize], labelVector[(i + 1) * foldsize:]))

        print " X_train: %s, Y_train: %s, X_test: %s, Y_test: %s" % (X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)

        print "#### Classifier: \n %s" % (clf)

        clf.fit(X_train, Y_train)  # train on everything outside the i-th fold

        expected = Y_test
        predicted = clf.predict(X_test)
        print "Classification report:\n%s\n" % metrics.classification_report(expected, predicted)
        print "\nConfusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted)


def allClassifier(cDf, models, modelchoice):
    """
    Classifier for all apps.
    """
    print "Data Size: %s, \t Model Choice: %s" % (cDf.shape, modelchoice)

    cDf = cDf.reindex(np.random.permutation(cDf.index))  # shuffle the dataframe
    featCols = set(cDf.columns)
    featCols.remove('label')

    features = cDf[list(featCols)].astype('float')

    ## Scale the features to a common range
    min_max_scaler = preprocessing.MinMaxScaler()
    featVector = min_max_scaler.fit_transform(features.values)  # scaled feature vector

    labelVector = cDf['label'].values  # label vector

    if modelchoice == 'all':
        for key in models:
            if key != 'svm-nl':  # NuSVC is skipped for the full dataset
                classifier = models[key]
                performClassification(classifier, featVector, labelVector)
    else:
        if modelchoice in models and modelchoice != 'svm-nl':
            classifier = models[modelchoice]
            performClassification(classifier, featVector, labelVector)
        else:
            print "Incorrect Choice"


def main():
    # Supported classifier models
    n_neighbors = 3
    models = {
        'nb': naive_bayes.GaussianNB(),
        'svm-l': svm.SVC(kernel='linear'),  # SVC defaults to an RBF kernel, so make 'linear' explicit
        'svm-nl': svm.NuSVC(),
        'tree': tree.DecisionTreeClassifier(),
        # note: despite the docstring's "Random Forest", this is AdaBoost over decision stumps
        'forest': AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=1),
                                     algorithm="SAMME", n_estimators=200),
        'knn-uniform': neighbors.KNeighborsClassifier(n_neighbors, weights='uniform'),
        'knn-distance': neighbors.KNeighborsClassifier(n_neighbors, weights='distance')
    }

    userInput = getUserInput(models)
    appDf = loadAppData(userInput['file'])
    appDf = trimDf(appDf)

    if userInput['sample'] == 'all':
        allClassifier(appDf, models, userInput['classifier'])
    else:
        prepareSplitClassifier(appDf, models, userInput['classifier'])


if __name__ == '__main__':
    main()