Skip to content

Commit

Permalink
adding NLP features for malware classifier
Browse files Browse the repository at this point in the history
  • Loading branch information
seekshreyas committed Dec 16, 2013
1 parent df44116 commit e398fc1
Show file tree
Hide file tree
Showing 2 changed files with 123 additions and 16 deletions.
137 changes: 121 additions & 16 deletions classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,13 @@
from collections import OrderedDict
from collections import Counter
import pdb
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.collocations import *
# from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures as BAM
from nltk.metrics import TrigramAssocMeasures as TAM
from itertools import chain
# from nltk.classify import apply_features


Expand Down Expand Up @@ -76,16 +83,30 @@ def posReviewsBySentence(tokenizedReviews):



def getAllReviewsAsString(app):
    """
    Join the title and text of every review of an app into one string.

    Each entry of app['reviews'] is read as rev[0] (title) and rev[1]
    (review sentence). The pieces are separated by a single space so the
    last word of one piece and the first word of the next remain distinct
    tokens when the result is tokenized.

    Returns an empty string when the app has no reviews.
    """
    pieces = []
    for rev in app['reviews']:
        # combine title and review sentence
        pieces.append(rev[0])
        pieces.append(rev[1])

    # a single join avoids repeated string concatenation in the loop
    return ' '.join(pieces)


def featureExtractor(app):
featDict = {}
revStr = getAllReviewsAsString(app)
revWords = [w.lower()
for w in word_tokenize(revStr)
if w not in stopwords.words('english') ]

tokenizedReviews = tokenizeReviewsBySentence(app['reviews'])
posReviews = posReviewsBySentence(tokenizedReviews)



# fObj = open('mySentClassifier.pickle')
# cl = load(fObj)
# fObj.close()
fObj = open('mySentClassifier.pickle')
cl = load(fObj)
fObj.close()



Expand All @@ -98,14 +119,20 @@ def featureExtractor(app):
# featDict['5starRating'] = getFiveStarRating(app)
featDict['avgRating'] = getAverageRating(app)
featDict['hasPrivacy'] = getPrivacyState(app)
# featDict['revSent'] = getReviewSentiment(tokenizedReviews, cl)
featDict['revSent'] = getReviewSentiment(tokenizedReviews, cl)
featDict['hasDeveloperEmail'] = getDeveloperEmailState(app)
featDict['hasDeveloperWebsite'] = getDeveloperWebsiteState(app)
featDict['hasMultipleApps'] = getDeveloperHasMultipleApps(app)
featDict['installRange'] = getInstallRange(app)
featDict['exclamationCount'] = getExclamationCount(app)
featDict['adjectiveCount'] = getAdjectiveCount(posReviews)

# featDict.update(getUnigramWordFeatures(revWords))
# featDict.update(getBigramWordFeatures(revWords))
# featDict.update(getTrigramWordFeatures(revWords))



return { 'appFeatures': featDict, 'appName': app['name'] }

def getAdjectiveCount(pos_revs):
Expand All @@ -116,6 +143,91 @@ def getAdjectiveCount(pos_revs):

return int(adj_counter)







def getPostiveWordCount(revStr):
    """
    Count the words in a review string that are positive keywords.

    revStr is split on whitespace; every token has the punctuation
    markers listed below removed and is lower-cased before it is looked
    up in the keyword set.

    Returns the number of matching tokens (0 for an empty string).
    """
    positive_keywords = frozenset([
        "good", "happy", "love", "great", "reasonable", "glad", "simple",
        "outstanding", "easy", "wonderful", "cool", "remarkably",
        "remarkable", "enjoy", "nice", "thoughtful", "pretty", "responsive",
        "comfortable", "favorite", "desire", "best", "solid", "impressed",
        "sleek", "appealing", "rocks", "blazing", "amazing", "plus",
        "blessing", "awesome", "loved", "enjoyed", "desired", "impressive",
        "impress", "rocked", "bless", "positive", "fabulous",
    ])
    # markers removed from each token before the keyword lookup
    strip_markers = (".", ",", "!", "?", "##", "(", ")", "**")

    postiveCount = 0
    for word in revStr.split():
        for marker in strip_markers:
            word = word.replace(marker, "")
        if word.lower() in positive_keywords:
            postiveCount += 1
    return postiveCount



def getNegativeWordCount(revStr):
    """
    Count the negative keywords and phrases in a review string.

    revStr is split on whitespace; every token has the punctuation
    markers listed below removed and is lower-cased. Single-word keywords
    are matched per token; the two-word phrase "could not" is matched
    against adjacent token pairs, since it can never equal one token.

    Returns the total number of matches (0 for an empty string).
    """
    negative_keywords = frozenset([
        "bad", "sad", "don't", "crappy", "unfortunately", "remove", "why",
        "poor", "bothersome", "terrible", "although", "complaints",
        "outrageous", "isn't", "poorly", "drawback", "annoying", "against",
        "irritating", "wouldn't", "won't", "wasn't", "couldn't", "awful",
        "didn't", "hasn't", "difficult", "hate", "incorrect", "junk",
        "trash", "removed", "complain", "complained", "hated", "negative",
    ])
    # multi-word keywords, matched on consecutive tokens
    negative_phrases = frozenset(["could not"])
    # markers removed from each token before the keyword lookup
    strip_markers = (".", ",", "!", "?", "##", "(", ")", "**")

    tokens = []
    for word in revStr.split():
        for marker in strip_markers:
            word = word.replace(marker, "")
        tokens.append(word.lower())

    negativeCount = 0
    for token in tokens:
        if token in negative_keywords:
            negativeCount += 1
    for first, second in zip(tokens, tokens[1:]):
        if first + " " + second in negative_phrases:
            negativeCount += 1
    return negativeCount




def getUnigramWordFeatures(words):
    """
    Unigrams of the apps reviews

    Returns a dict with one 'contains("<word>")': True entry for every
    word in `words` that is one of the indicator words below. Words are
    compared exactly as given (no case folding is done here).
    """
    # NOTE: 'viruses' and 'permissions' must be separate entries; without
    # the comma Python concatenates the adjacent string literals.
    malIndicatorWords = frozenset([
        'spam', 'virus', 'viruses', 'permissions', 'security',
        'spying', 'access', 'warning', 'facebook', 'contacts', 'fake',
        'permission', 'beware', 'lies', 'liar', 'why', 'age',
    ])
    return dict(('contains("%s")' % word, True)
                for word in words if word in malIndicatorWords)



def getBigramWordFeatures(words):
    """
    Get Relevant Bigrams

    Drops a few punctuation tokens, finds the 20 best bigrams of the
    remaining words by likelihood ratio, and returns a dict mapping every
    original word and every selected bigram to True.
    """
    ignored_tokens = ('.', '?', ')', '(', '-')
    kept_words = [tok for tok in words if tok not in ignored_tokens]

    finder = BigramCollocationFinder.from_words(kept_words)
    top_bigrams = finder.nbest(BAM.likelihood_ratio, 20)

    # the unfiltered `words` (not `kept_words`) are used as features
    return dict.fromkeys(chain(words, top_bigrams), True)




def getTrigramWordFeatures(words):
    """
    Return relevant Trigrams

    Drops a few punctuation tokens, finds the 3 most frequent trigrams of
    the remaining words, and returns a dict mapping every original word
    and every selected trigram to True.
    """
    ignored_tokens = ('.', '?', ')', '(', '-')
    filtered_words = [w for w in words if w not in ignored_tokens]

    trigram_finder = TrigramCollocationFinder.from_words(filtered_words)
    # Use the TAM alias imported at the top of the file, parallel to BAM
    # in getBigramWordFeatures, rather than the bare `nltk` module name.
    trigrams = trigram_finder.nbest(TAM.raw_freq, 3)

    # the unfiltered `words` (not `filtered_words`) are used as features
    return dict((tg, True) for tg in chain(words, trigrams))




def getExclamationCount(app):
exclaimCount = 0
for rev in app['reviews']:
Expand Down Expand Up @@ -246,20 +358,19 @@ def getReviewSentiment(tknRevs, classifier):

def classifier(alldata, fold=4):

# name = alldata[0]
data = alldata[1]
data = [(row['appFeatures'], row['appLabel']) for row in alldata]


random.shuffle(data)
pprint(data)
# pprint(data)

claccuracy = []
size = int(math.floor(len(data) / 10.0))

for i in range(fold):
test_this_round = data[i*size:][:size]
train_this_round = data[:i*size] + data[(i+1)*size:]
#pdb.set_trace()

acc = myclassifier(train_this_round, test_this_round)

claccuracy.append(acc)
Expand All @@ -276,13 +387,13 @@ def myclassifier(train_data, test_data):
print "Train Data"
print "=" * 79
print len(train_data)
pprint(train_data[0])
# pprint(train_data[0])


print "Test Data"
print "=" * 79
print len(test_data)
pprint(test_data[0])
# pprint(test_data[0])



Expand Down Expand Up @@ -341,12 +452,6 @@ def main():
data = getAnalysisData(userinput)




# extract = fileExtractor(userinput['file'])

# pprint(data)
# features = featureAggregator(extract)
classifier(data)


Expand Down
2 changes: 2 additions & 0 deletions dataExport.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ def export(data):
# if k == 'hasPrivacy' or k == 'hasDeveloperEmail' or k == 'hasDeveloperWebsite' or k == 'hasMultipleApps':
if isinstance(v, bool):
v = int(bool(v))
elif isinstance(v, basestring):
pass
else:
try:
v = float(v)
Expand Down

0 comments on commit e398fc1

Please sign in to comment.