added the classifier changes
seekshreyas committed Apr 25, 2014
1 parent a34df51 commit 4852a7f
Showing 3 changed files with 305 additions and 0 deletions.
19 changes: 19 additions & 0 deletions README.md
@@ -2,3 +2,22 @@ classifier-scaffold
===================

My reusable code for running many different classifiers on a dataset.

The script takes a few command-line flags, for example:

```bash
python appClassifierBenchmark.py --file="exports/appFeatures.csv" --classifier="all" --sample="split"
```

where:

- `--classifier`: selects which classifier to run. Defaults to `all`; pass a single model key to run only that model (see the example below).
- `--sample`: selects whether to run the classifiers on the full dataset (`all`) or on equal fair/unfair splits (`split`). Defaults to `all`.
- `--help`: prints usage information for the flags above (it still needs a little improvement).
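
For example, to run only Gaussian Naive Bayes (model key `nb` in `classifier.py`) on the full dataset:

```bash
python appClassifierBenchmark.py --file="exports/appFeatures.csv" --classifier="nb" --sample="all"
```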


Save console output to a file with:

```bash
python appClassifierBenchmark.py --file="exports/appFeatures.csv" > <outputfile>
```
285 changes: 285 additions & 0 deletions classifier.py
@@ -0,0 +1,285 @@
#! /usr/bin/env python
# -*- coding: UTF-8 -*-
"""
Benchmark different classifiers
Classifiers tried:
- Naive Bayes
- Random Forest
- SVM
- linear
- kernelized
"""
from __future__ import division
import sys
import pandas as pd
import numpy as np
from optparse import OptionParser
from sklearn import metrics, preprocessing
from sklearn import svm, naive_bayes, neighbors, tree
from sklearn.ensemble import AdaBoostClassifier



def getUserInput(models):
"""
Get User Input
"""
optionparser = OptionParser(add_help_option=False, epilog="multiline")

optionparser.add_option('-c', '--classifier', dest='classifier', default="all")
optionparser.add_option('-s', '--sample', dest='sample', default="all")
optionparser.add_option('-h', '--help', dest='help', action='store_true',
help='show this help message and exit')
optionparser.add_option('-f', '--file', dest='file')


(option, args) = optionparser.parse_args()

if option.help:
        optionparser.print_help()  # print_help() writes directly to stdout
print __doc__
print "Supported Classifier Models:"


# print models
for index, key in enumerate(models):
print "%2s % 20s" % (index, key)

print "Default option: 'all'\n"

print "To run the program, provide app features file path"
print "Usage: --file='path.to.appData'"

sys.exit()


    if not option.file:
        optionparser.error('Data file path not provided.\n Usage: --file="path.to.data"')  # prints usage and exits


return {
'classifier' : option.classifier,
'file': option.file,
'sample' : option.sample
}





def loadAppData(datafile):
"""
Data File added
{
'fair' : False,
'unfair': True
}
"""
df = pd.read_csv(datafile)

    ## Remove the unnamed column (left commented out since we are not sure it is needed)
# cols = set(df.columns)
# cols.remove('Unnamed: 7')
# df = df[list(cols)]

## Convert appLabel to boolean: True for 'unfair'
df['label'] = df['label'].map(lambda x: x=='unfair')

return df


def trimDf(df):
"""
Trim the dataframe provided
Remove features that we don't think are helping
"""
cols = set(df.columns)

cols.remove('feat3') # bug in our feature extraction code
cols.remove('feat8') # considered only free apps


return df[list(cols)]




def prepareSplitClassifier(df, models, choice):
"""
Classify the apps for equal splits
"""


def classificationOutput(clf, X, Y):
"""
Fit the model and print the classification results
- confusion_matrix
- avg scores etc
"""
        n_samples = 36  # hard-coded training-set size; assumes each batch has more than 36 rows

        print "\n\nClassifier: \n %s" % (clf)
        print "#" * 79

        clf.fit(X[:n_samples], Y[:n_samples])  # train on the first n_samples rows, test on the rest

expected = Y[n_samples:]
predicted = clf.predict(X[n_samples:])
print("Classification report:\n%s\n" % (metrics.classification_report(expected, predicted)))
print("\nConfusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))




def splitclassify(cDf):
"""
Given the dataframe combined with equal fair and unfair apps,
classify them
"""
cDf = cDf.reindex(np.random.permutation(cDf.index)) # shuffle the dataframe
featCols = set(cDf.columns)
featCols.remove('label')

features = cDf[list(featCols)].astype('float')

## Scale the features to a common range
min_max_scaler = preprocessing.MinMaxScaler()
X = min_max_scaler.fit_transform(features.values)

Y = cDf['label'].values


if choice == 'all':
for key in models:
classifier = models[key]
classificationOutput(classifier, X, Y)
else:
if choice in models:
classifier = models[choice]
classificationOutput(classifier, X, Y)
else:
print "Incorrect Choice"



fairDf = df[df['label'] == False]
unfairDf = df[df['label'] == True]


    # calculate how many disjoint splits of the fair data frame fit,
    # relative to the size of the unfair dataframe
splits = len(fairDf) // len(unfairDf)
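    # illustrative example (hypothetical numbers): with 360 fair apps and
    # 36 unfair apps, splits == 10 and each balanced batch below has 72 rows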

for i in range(splits):
        clDf = fairDf[i*len(unfairDf) : (i+1)*len(unfairDf)].append(unfairDf)  # i-th disjoint chunk of fair apps plus all unfair apps

# print fairDf.values, unfairDf.values
print "Classifying %d th split of fair with unfair " % (i)
print "-" * 79
splitclassify(clDf)
print "\n\n"




def performClassification(clf, featVector, labelVector, fold=4):
    """
    Perform k-fold cross-validation: hold out each fold in turn for
    testing and train on the remaining folds
    """

(numrow, numcol) = featVector.shape

foldsize = int(numrow//fold)

print "FoldSize: %s" % (foldsize)

for i in range(fold):
X_test = featVector[i*foldsize:(i+1)*foldsize]
Y_test = labelVector[i*foldsize:(i+1)*foldsize]

X_train = np.concatenate((featVector[:i*foldsize], featVector[(i+1)*foldsize:]))
Y_train = np.concatenate((labelVector[:i*foldsize], labelVector[(i+1)*foldsize:]))

print " X_train: %s, Y_train: %s, X_test: %s, Y_test: %s" % (X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)

print "#### Classifier: \n %s" % (clf)


        clf.fit(X_train, Y_train)  # train on all folds except the i-th

expected = Y_test
predicted = clf.predict(X_test)
print "Classification report:\n%s\n" % metrics.classification_report(expected, predicted)
print "\nConfusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted)




def allClassifier(cDf, models, modelchoice):
"""
Classifier for all apps
"""

print "Data Size: %s, \t Model Choice: %s" % (cDf.shape, modelchoice)

cDf = cDf.reindex(np.random.permutation(cDf.index)) # shuffle the dataframe
featCols = set(cDf.columns)
featCols.remove('label')

features = cDf[list(featCols)].astype('float')

## Scale the features to a common range
min_max_scaler = preprocessing.MinMaxScaler()
featVector = min_max_scaler.fit_transform(features.values) #scaled feature vector

labelVector = cDf['label'].values #label vector


    if modelchoice == 'all':
        for key in models:
            if key != 'svm-nl':  # skip the kernelized NuSVC on the full dataset
                classifier = models[key]
                performClassification(classifier, featVector, labelVector)
    else:
        if modelchoice in models and modelchoice != 'svm-nl':
            classifier = models[modelchoice]
            performClassification(classifier, featVector, labelVector)
        else:
            print "Incorrect Choice"






def main():

# Supported classifier models
    n_neighbors = 3
    models = {
        'nb': naive_bayes.GaussianNB(),
        'svm-l': svm.SVC(kernel='linear'),  # linear SVM; plain svm.SVC() would default to the RBF kernel
        'svm-nl': svm.NuSVC(),  # kernelized (RBF) SVM
        'tree': tree.DecisionTreeClassifier(),
        'forest': AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=1),
                                     algorithm="SAMME", n_estimators=200),
        'knn-uniform': neighbors.KNeighborsClassifier(n_neighbors, weights='uniform'),
        'knn-distance': neighbors.KNeighborsClassifier(n_neighbors, weights='distance')
    }

userInput = getUserInput(models)
appDf = loadAppData(userInput['file'])
appDf = trimDf(appDf)

if userInput['sample'] == 'all':
allClassifier(appDf, models, userInput['classifier'])
else:
prepareSplitClassifier(appDf, models, userInput['classifier'])




if __name__ == '__main__':
main()