From e398fc15fa3c5fec723b3858068cbba5a8d0daf5 Mon Sep 17 00:00:00 2001
From: Shreyas
Date: Sun, 15 Dec 2013 19:16:39 -0800
Subject: [PATCH] adding NLP features for malware classifier

---
 classifier.py | 137 ++++++++++++++++++++++++++++++++++++++++++++------
 dataExport.py |   2 +
 2 files changed, 123 insertions(+), 16 deletions(-)

diff --git a/classifier.py b/classifier.py
index f97d15e..dfaa307 100644
--- a/classifier.py
+++ b/classifier.py
@@ -23,6 +23,13 @@ from collections import OrderedDict
 from collections import Counter
 import pdb
 
+from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
+from nltk.collocations import BigramCollocationFinder
+from nltk.collocations import TrigramCollocationFinder
+from nltk.metrics import BigramAssocMeasures as BAM
+from nltk.metrics import TrigramAssocMeasures as TAM
+from itertools import chain
 
 # from nltk.classify import apply_features
 
@@ -76,16 +83,30 @@ def posReviewsBySentence(tokenizedReviews):
 
 
+def getAllReviewsAsString(app):
+    revStr = ''
+    for rev in app['reviews']:
+        # combine title and review sentence, separated by a space
+        revStr += rev[0] + ' ' + rev[1] + ' '
+
+    return revStr
+
+
 def featureExtractor(app):
     featDict = {}
 
+    revStr = getAllReviewsAsString(app)
+    stopwordSet = set(stopwords.words('english'))
+    revWords = [w.lower() for w in word_tokenize(revStr)
+                if w.lower() not in stopwordSet]
+
     tokenizedReviews = tokenizeReviewsBySentence(app['reviews'])
     posReviews = posReviewsBySentence(tokenizedReviews)
 
-    # fObj = open('mySentClassifier.pickle')
-    # cl = load(fObj)
-    # fObj.close()
+    fObj = open('mySentClassifier.pickle', 'rb')
+    cl = load(fObj)
+    fObj.close()
 
 
@@ -98,7 +119,7 @@ def featureExtractor(app):
     # featDict['5starRating'] = getFiveStarRating(app)
     featDict['avgRating'] = getAverageRating(app)
     featDict['hasPrivacy'] = getPrivacyState(app)
-    # featDict['revSent'] = getReviewSentiment(tokenizedReviews, cl)
+    featDict['revSent'] = getReviewSentiment(tokenizedReviews, cl)
     featDict['hasDeveloperEmail'] = getDeveloperEmailState(app)
     featDict['hasDeveloperWebsite'] = getDeveloperWebsiteState(app)
     featDict['hasMultipleApps'] = getDeveloperHasMultipleApps(app)
@@ -106,6 +127,12 @@
     featDict['exclamationCount'] = getExclamationCount(app)
     featDict['adjectiveCount'] = getAdjectiveCount(posReviews)
 
+    # featDict.update(getUnigramWordFeatures(revWords))
+    # featDict.update(getBigramWordFeatures(revWords))
+    # featDict.update(getTrigramWordFeatures(revWords))
+
     return { 'appFeatures': featDict, 'appName': app['name'] }
 
 def getAdjectiveCount(pos_revs):
@@ -116,6 +143,91 @@ def getAdjectiveCount(pos_revs):
     return int(adj_counter)
 
+
+def getPositiveWordCount(revStr):
+    positive_keywords = ["good", "happy", "love", "great", "reasonable", "glad", "simple", "outstanding", "easy",
+                         "wonderful", "cool", "remarkably", "remarkable", "enjoy", "nice", "thoughtful", "pretty",
+                         "responsive", "comfortable", "favorite", "desire", "best", "solid", "impressed",
+                         "sleek", "appealing", "rocks", "blazing", "amazing", "plus", "blessing", "awesome", "loved",
+                         "enjoyed", "desired", "impressive", "impress", "rocked", "bless", "positive", "fabulous"]
+    positiveCount = 0
+    for word in revStr.split(" "):
+        # strip surrounding punctuation before matching
+        word = word.strip('.,!?#()*')
+        if word.lower() in positive_keywords:
+            positiveCount += 1
+    return positiveCount
+
+
+def getNegativeWordCount(revStr):
+    negative_keywords = ["bad", "sad", "don't", "could not", "crappy", "unfortunately", "remove", "why", "poor",
+                         "bothersome", "terrible", "although", "complaints", "outrageous", "isn't", "poorly",
+                         "drawback", "annoying", "against", "irritating", "wouldn't", "won't", "wasn't", "couldn't",
+                         "awful", "didn't", "hasn't", "difficult", "hate", "incorrect", "junk", "trash", "removed",
+                         "complain", "complained", "hated", "negative"]
+    negativeCount = 0
+    for word in revStr.split(" "):
+        # strip surrounding punctuation before matching
+        word = word.strip('.,!?#()*')
+        if word.lower() in negative_keywords:
+            negativeCount += 1
+    return negativeCount
+
+
+def getUnigramWordFeatures(words):
+    """
+    Unigram features for malware-indicator words in the app's reviews
+    """
+    malIndicatorWords = ['spam', 'virus', 'viruses', 'permissions', 'security',
+                         'spying', 'access', 'warning', 'facebook', 'contacts', 'fake', 'permission',
+                         'beware', 'lies', 'liar', 'why', 'age']
+    return dict(('contains("%s")' % word, True) for word in words if word in malIndicatorWords)
+
+
+def getBigramWordFeatures(words):
+    """
+    Top bigram collocations of the app's reviews, plus the unigrams themselves
+    """
+    filtered_words = [w for w in words if w not in ('.', '?', '(', ')', '-')]
+
+    bigram_finder = BigramCollocationFinder.from_words(filtered_words)
+    bigrams = bigram_finder.nbest(BAM.likelihood_ratio, 20)
+
+    return dict((bg, True) for bg in chain(words, bigrams))
+
+
+def getTrigramWordFeatures(words):
+    """
+    Top trigram collocations of the app's reviews, plus the unigrams themselves
+    """
+    filtered_words = [w for w in words if w not in ('.', '?', '(', ')', '-')]
+
+    trigram_finder = TrigramCollocationFinder.from_words(filtered_words)
+    trigrams = trigram_finder.nbest(TAM.raw_freq, 3)
+
+    return dict((tg, True) for tg in chain(words, trigrams))
+
+
 def getExclamationCount(app):
     exclaimCount = 0
     for rev in app['reviews']:
@@ -246,12 +358,11 @@ def getReviewSentiment(tknRevs, classifier):
 
 def classifier(alldata, fold=4):
 
-    # name = alldata[0]
-    data = alldata[1]
+    data = [(row['appFeatures'], row['appLabel']) for row in alldata]
 
     random.shuffle(data)
 
-    pprint(data)
+    # pprint(data)
 
     claccuracy = []
     size = int(math.floor(len(data) / 10.0))
@@ -259,7 +370,7 @@
     for i in range(fold):
         test_this_round = data[i*size:][:size]
         train_this_round = data[:i*size] + data[(i+1)*size:]
-        #pdb.set_trace()
+
         acc = myclassifier(train_this_round, test_this_round)
 
         claccuracy.append(acc)
@@ -276,13 +387,13 @@ def myclassifier(train_data, test_data):
     print "Train Data"
     print "=" * 79
     print len(train_data)
-    pprint(train_data[0])
+    # pprint(train_data[0])
 
     print "Test Data"
     print "=" * 79
     print len(test_data)
-    pprint(test_data[0])
+    # pprint(test_data[0])
 
 
@@ -341,12 +452,6 @@ def main():
 
     data = getAnalysisData(userinput)
 
-
-
-    # extract = fileExtractor(userinput['file'])
-
-    # pprint(data)
-    # features = featureAggregator(extract)
 
     classifier(data)
 
diff --git a/dataExport.py b/dataExport.py
index 6c5abd5..c3ee036 100644
--- a/dataExport.py
+++ b/dataExport.py
@@ -69,6 +69,8 @@ def export(data):
         # if k == 'hasPrivacy' or k == 'hasDeveloperEmail' or k == 'hasDeveloperWebsite' or k == 'hasMultipleApps':
         if isinstance(v, bool):
             v = int(bool(v))
+        elif isinstance(v, basestring):
+            pass
         else:
             try:
                 v = float(v)