From e398fc15fa3c5fec723b3858068cbba5a8d0daf5 Mon Sep 17 00:00:00 2001
From: Shreyas
Date: Sun, 15 Dec 2013 19:16:39 -0800
Subject: [PATCH] adding NLP features for malware classifier

---
 classifier.py | 137 ++++++++++++++++++++++++++++++++++++++++++++------
 dataExport.py |   2 +
 2 files changed, 123 insertions(+), 16 deletions(-)

diff --git a/classifier.py b/classifier.py
index f97d15e..dfaa307 100644
--- a/classifier.py
+++ b/classifier.py
@@ -23,6 +23,13 @@ from collections import OrderedDict
 from collections import Counter
 import pdb
 
+from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
+from nltk.collocations import BigramCollocationFinder
+from nltk.collocations import TrigramCollocationFinder
+from nltk.metrics import BigramAssocMeasures as BAM
+from nltk.metrics import TrigramAssocMeasures as TAM
+from itertools import chain
 
 # from nltk.classify import apply_features
 
@@ -76,16 +83,30 @@ def posReviewsBySentence(tokenizedReviews):
 
 
+def getAllReviewsAsString(app):
+    revStr = ''
+    for rev in app['reviews']:
+        # combine title and review sentence, separated by a space
+        revStr += rev[0] + ' ' + rev[1] + ' '
+
+    return revStr
+
+
 def featureExtractor(app):
     featDict = {}
 
+    revStr = getAllReviewsAsString(app)
+    stopwordSet = set(stopwords.words('english'))
+    revWords = [w.lower() for w in word_tokenize(revStr)
+                if w.lower() not in stopwordSet]
+
     tokenizedReviews = tokenizeReviewsBySentence(app['reviews'])
     posReviews = posReviewsBySentence(tokenizedReviews)
 
-    # fObj = open('mySentClassifier.pickle')
-    # cl = load(fObj)
-    # fObj.close()
+    fObj = open('mySentClassifier.pickle', 'rb')
+    cl = load(fObj)
+    fObj.close()
 
 
@@ -98,7 +119,7 @@ def featureExtractor(app):
     # featDict['5starRating'] = getFiveStarRating(app)
     featDict['avgRating'] = getAverageRating(app)
     featDict['hasPrivacy'] = getPrivacyState(app)
-    # featDict['revSent'] = getReviewSentiment(tokenizedReviews, cl)
+    featDict['revSent'] = getReviewSentiment(tokenizedReviews, cl)
     featDict['hasDeveloperEmail'] = getDeveloperEmailState(app)
     featDict['hasDeveloperWebsite'] = getDeveloperWebsiteState(app)
     featDict['hasMultipleApps'] = getDeveloperHasMultipleApps(app)
@@ -106,6 +127,12 @@
     featDict['exclamationCount'] = getExclamationCount(app)
     featDict['adjectiveCount'] = getAdjectiveCount(posReviews)
 
+    # featDict.update(getUnigramWordFeatures(revWords))
+    # featDict.update(getBigramWordFeatures(revWords))
+    # featDict.update(getTrigramWordFeatures(revWords))
+
     return { 'appFeatures': featDict, 'appName': app['name'] }
 
 def getAdjectiveCount(pos_revs):
@@ -116,6 +143,91 @@ def getAdjectiveCount(pos_revs):
     return int(adj_counter)
 
+
+def getPositiveWordCount(revStr):
+    positive_keywords = ["good", "happy", "love", "great", "reasonable", "glad", "simple", "outstanding", "easy",
+                         "wonderful", "cool", "remarkably", "remarkable", "enjoy", "nice", "thoughtful", "pretty",
+                         "responsive", "comfortable", "favorite", "desire", "best", "solid", "impressed",
+                         "sleek", "appealing", "rocks", "blazing", "amazing", "plus", "blessing", "awesome", "loved",
+                         "enjoyed", "desired", "impressive", "impress", "rocked", "bless", "positive", "fabulous"]
+    positiveCount = 0
+    for word in revStr.split(" "):
+        # strip surrounding punctuation before matching
+        word = word.strip('.,!?#()*')
+        if word.lower() in positive_keywords:
+            positiveCount += 1
+    return positiveCount
+
+
+def getNegativeWordCount(revStr):
+    negative_keywords = ["bad", "sad", "don't", "could not", "crappy", "unfortunately", "remove", "why", "poor",
+                         "bothersome", "terrible", "although", "complaints", "outrageous", "isn't", "poorly",
+                         "drawback", "annoying", "against", "irritating", "wouldn't", "won't", "wasn't", "couldn't",
+                         "awful", "didn't", "hasn't", "difficult", "hate", "incorrect", "junk", "trash", "removed",
+                         "complain", "complained", "hated", "negative"]
+    negativeCount = 0
+    for word in revStr.split(" "):
+        # strip surrounding punctuation before matching
+        word = word.strip('.,!?#()*')
+        if word.lower() in negative_keywords:
+            negativeCount += 1
+    return negativeCount
+
+
+def getUnigramWordFeatures(words):
+    """
+    Unigram features for malware-indicator words in the app's reviews
+    """
+    malIndicatorWords = ['spam', 'virus', 'viruses', 'permissions', 'security',
+                         'spying', 'access', 'warning', 'facebook', 'contacts', 'fake', 'permission',
+                         'beware', 'lies', 'liar', 'why', 'age']
+    return dict(('contains("%s")' % word, True) for word in words if word in malIndicatorWords)
+
+
+def getBigramWordFeatures(words):
+    """
+    Top bigram collocations of the app's reviews, plus the unigrams themselves
+    """
+    filtered_words = [w for w in words if w not in ('.', '?', '(', ')', '-')]
+
+    bigram_finder = BigramCollocationFinder.from_words(filtered_words)
+    bigrams = bigram_finder.nbest(BAM.likelihood_ratio, 20)
+
+    return dict((bg, True) for bg in chain(words, bigrams))
+
+
+def getTrigramWordFeatures(words):
+    """
+    Top trigram collocations of the app's reviews, plus the unigrams themselves
+    """
+    filtered_words = [w for w in words if w not in ('.', '?', '(', ')', '-')]
+
+    trigram_finder = TrigramCollocationFinder.from_words(filtered_words)
+    trigrams = trigram_finder.nbest(TAM.raw_freq, 3)
+
+    return dict((tg, True) for tg in chain(words, trigrams))
+
+
 def getExclamationCount(app):
     exclaimCount = 0
     for rev in app['reviews']:
@@ -246,12 +358,11 @@ def getReviewSentiment(tknRevs, classifier):
 
 def classifier(alldata, fold=4):
 
-    # name = alldata[0]
-    data = alldata[1]
+    data = [(row['appFeatures'], row['appLabel']) for row in alldata]
 
     random.shuffle(data)
 
-    pprint(data)
+    # pprint(data)
 
     claccuracy = []
     size = int(math.floor(len(data) / 10.0))
@@ -259,7 +370,7 @@
     for i in range(fold):
         test_this_round = data[i*size:][:size]
         train_this_round = data[:i*size] + data[(i+1)*size:]
-        #pdb.set_trace()
+
         acc = myclassifier(train_this_round, test_this_round)
 
         claccuracy.append(acc)
@@ -276,13 +387,13 @@ def myclassifier(train_data, test_data):
     print "Train Data"
     print "=" * 79
     print len(train_data)
-    pprint(train_data[0])
+    # pprint(train_data[0])
 
     print "Test Data"
     print "=" * 79
     print len(test_data)
-    pprint(test_data[0])
+    # pprint(test_data[0])
 
 
@@ -341,12 +452,6 @@ def main():
 
     data = getAnalysisData(userinput)
 
-
-
-    # extract = fileExtractor(userinput['file'])
-
-    # pprint(data)
-    # features = featureAggregator(extract)
 
     classifier(data)
 
diff --git a/dataExport.py b/dataExport.py
index 6c5abd5..c3ee036 100644
--- a/dataExport.py
+++ b/dataExport.py
@@ -69,6 +69,8 @@ def export(data):
         # if k == 'hasPrivacy' or k == 'hasDeveloperEmail' or k == 'hasDeveloperWebsite' or k == 'hasMultipleApps':
         if isinstance(v, bool):
             v = int(bool(v))
+        elif isinstance(v, basestring):
+            pass
         else:
             try:
                 v = float(v)