
Commit fd3d5ca
all classifiers also working
seekshreyas committed Mar 31, 2014
1 parent c63a177 commit fd3d5ca
Showing 4 changed files with 523 additions and 453 deletions.
110 changes: 90 additions & 20 deletions appClassifierBenchmark.py
@@ -29,6 +29,7 @@ def getUserInput(models):
     optionparser = OptionParser(add_help_option=False, epilog="multiline")
 
     optionparser.add_option('-c', '--classifier', dest='classifier', default="all")
+    optionparser.add_option('-s', '--sample', dest='sample', default="all")
     optionparser.add_option('-h', '--help', dest='help', action='store_true',
                             help='show this help message and exit')
     optionparser.add_option('-f', '--file', dest='file')
@@ -58,7 +59,11 @@ def getUserInput(models):
         return optionparser.error('Data File path not provided.\n Usage: --file="path.to.appData"')
 
 
-    return { 'classifier' : option.classifier, 'file': option.file }
+    return {
+        'classifier' : option.classifier,
+        'file': option.file,
+        'sample' : option.sample
+    }
 
 
 
@@ -101,9 +106,9 @@ def trimDf(df):
 
 
 
-def prepareClassifier(df, models, choice):
+def prepareSplitClassifier(df, models, choice):
     """
-    Classify the apps
+    Classify the apps for equal splits
     """
 
 
@@ -119,7 +124,7 @@ def classificationOutput(clf, X, Y):
     print "#" * 79
     # classifier_gnb = naive_bayes.GaussianNB() # initiating the classifier
 
-    Y_pred = clf.fit(X[:n_samples], Y[:n_samples]) # train on first n_samples and test on last 10
+    clf.fit(X[:n_samples], Y[:n_samples]) # train on first n_samples and test on last 10
 
     expected = Y[n_samples:]
     predicted = clf.predict(X[n_samples:])
@@ -129,7 +134,7 @@ def classificationOutput(clf, X, Y):
 
 
 
-def classify(cDf):
+def splitclassify(cDf):
     """
     Given the dataframe combined with equal fair and unfair apps,
     classify them
@@ -139,6 +144,7 @@ def classify(cDf):
     featCols.remove('appLabel')
 
     features = cDf[list(featCols)].astype('float')
+
     ## Scale the features to a common range
     min_max_scaler = preprocessing.MinMaxScaler()
     X = min_max_scaler.fit_transform(features.values)
@@ -173,12 +179,79 @@ def classify(cDf):
         # print fairDf.values, unfairDf.values
         print "Classifying %d th split of fair apps with unfair app" % (i)
         print "-" * 79
-        classify(clDf)
+        splitclassify(clDf)
         print "\n\n"
 
 
 
 
+def performClassification(clf, featVector, labelVector, fold=4):
+    """
+    Perform Classification
+    """
+
+    (numrow, numcol) = featVector.shape
+
+    foldsize = int(numrow//fold)
+
+    print "FoldSize: %s" % (foldsize)
+
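+    # Manual k-fold cross-validation: each pass below holds out the i-th
+    # contiguous block as the test fold and trains on the remaining rows.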
+    for i in range(fold):
+        X_test = featVector[i*foldsize:(i+1)*foldsize]
+        Y_test = labelVector[i*foldsize:(i+1)*foldsize]
+
+        X_train = np.concatenate((featVector[:i*foldsize], featVector[(i+1)*foldsize:]))
+        Y_train = np.concatenate((labelVector[:i*foldsize], labelVector[(i+1)*foldsize:]))
+
+        print " X_train: %s, Y_train: %s, X_test: %s, Y_test: %s" % (X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)
+
+        print "#### Classifier: \n %s" % (clf)
+
+
+        clf.fit(X_train, Y_train) # train on the k-1 training folds
+
+        expected = Y_test
+        predicted = clf.predict(X_test)
+        print "Classification report:\n%s\n" % metrics.classification_report(expected, predicted)
+        print "\nConfusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted)
+
+
+
+
+def allClassifier(cDf, models, modelchoice):
+    """
+    Classifier for all apps
+    """
+
+    print "Data Size: %s, \t Model Choice: %s" % (cDf.shape, modelchoice)
+
+    cDf = cDf.reindex(np.random.permutation(cDf.index)) # shuffle the dataframe
+    featCols = set(cDf.columns)
+    featCols.remove('appLabel')
+
+    features = cDf[list(featCols)].astype('float')
+
+    ## Scale the features to a common range
+    min_max_scaler = preprocessing.MinMaxScaler()
+    featVector = min_max_scaler.fit_transform(features.values) # scaled feature vector
+
+    labelVector = cDf['appLabel'].values # label vector
+
+
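+    # The non-linear SVM ('svm-nl') is excluded in both branches below.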
+    if modelchoice == 'all':
+        for key in models:
+            if key != 'svm-nl':
+                classifier = models[key]
+                performClassification(classifier, featVector, labelVector)
+    else:
+        if modelchoice in models and modelchoice != 'svm-nl':
+            classifier = models[modelchoice]
+            performClassification(classifier, featVector, labelVector)
+        else:
+            print "Incorrect Choice"
+
+
+
+
+
+
@@ -187,26 +260,23 @@ def main():
     # Supported classifier models
     n_neighbors = 3
     models = {
         'nb' : naive_bayes.GaussianNB(),
         'svm-l' : svm.SVC(),
         'svm-nl' : svm.NuSVC(),
         'tree' : tree.DecisionTreeClassifier(),
         'forest': AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200),
         'knn-uniform' : neighbors.KNeighborsClassifier(n_neighbors, weights='uniform'),
         'knn-distance' : neighbors.KNeighborsClassifier(n_neighbors, weights='distance')
     }
 
     userInput = getUserInput(models)
     appDf = loadAppData(userInput['file'])
     appDf = trimDf(appDf)
 
 
-    # print "Sample Data"
-    # print "-" * 79
-    # print appDf.head()
-
-
-    prepareClassifier(appDf, models, userInput['classifier'])
+    if userInput['sample'] == 'all':
+        allClassifier(appDf, models, userInput['classifier'])
+    else:
+        prepareSplitClassifier(appDf, models, userInput['classifier'])
 
 
 
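With the new -s/--sample option, the benchmark can be run either over the full shuffled dataset or over the equal fair/unfair splits. A minimal invocation sketch (the data-file path is a placeholder):

    python appClassifierBenchmark.py --file="path/to/appData" --classifier=nb --sample=all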
2 changes: 1 addition & 1 deletion classifier.py
@@ -375,7 +375,7 @@ def classifier(alldata, fold=4):
     # pprint(data)
 
     claccuracy = []
-    size = int(math.floor(len(data) / 10.0))
+    size = int(math.floor(len(data) / fold))
 
     for i in range(fold):
         test_this_round = data[i*size:][:size]
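This makes the chunk size track the requested number of folds: e.g., with fold=4 and len(data) == 100, size is now 25 rather than the 10 produced by the old hard-coded divisor.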
