Skip to content

Commit

Permalink
data Export working well
Browse files Browse the repository at this point in the history
  • Loading branch information
seekshreyas committed Dec 16, 2013
1 parent 84e5813 commit df44116
Show file tree
Hide file tree
Showing 4 changed files with 433 additions and 27 deletions.
45 changes: 30 additions & 15 deletions classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ def featureAggregator(extract):

return outputdata



def tokenizeReviewsBySentence(reviews):
rev_tokenized = list()
for rev in reviews:
Expand All @@ -81,9 +83,9 @@ def featureExtractor(app):



fObj = open('mySentClassifier.pickle')
cl = load(fObj)
fObj.close()
# fObj = open('mySentClassifier.pickle')
# cl = load(fObj)
# fObj.close()



Expand All @@ -96,22 +98,24 @@ def featureExtractor(app):
# featDict['5starRating'] = getFiveStarRating(app)
featDict['avgRating'] = getAverageRating(app)
featDict['hasPrivacy'] = getPrivacyState(app)
featDict['revSent'] = getReviewSentiment(tokenizedReviews, cl)
# featDict['revSent'] = getReviewSentiment(tokenizedReviews, cl)
featDict['hasDeveloperEmail'] = getDeveloperEmailState(app)
featDict['hasDeveloperWebsite'] = getDeveloperWebsiteState(app)
featDict['hasMultipleApps'] = getDeveloperHasMultipleApps(app)
featDict['installRange'] = getInstallRange(app)
featDict['exclamationCount'] = getExclamationCount(app)
featDict['adjectiveCount'] = getAdjectiveCount(posReviews)

return featDict
return { 'appFeatures': featDict, 'appName': app['name'] }

def getAdjectiveCount(pos_revs):
    """Return how many tokens across all tagged sentences carry the tag 'JJ'.

    pos_revs -- iterable of POS-tagged sentences, each an iterable of
                (word, tag) pairs.

    Only tags exactly equal to 'JJ' are counted (so e.g. 'JJR' and 'JJS'
    are not). Empty input, or sentences with no tokens, yield 0.
    """
    # Count matching tags directly instead of building a full Counter of
    # every tag per sentence just to read a single key.
    return sum(1 for pos_sent in pos_revs
               for _word, tag in pos_sent
               if tag == 'JJ')

def getExclamationCount(app):
exclaimCount = 0
for rev in app['reviews']:
Expand Down Expand Up @@ -240,8 +244,10 @@ def getReviewSentiment(tknRevs, classifier):



def classifier(data, fold=4):
def classifier(alldata, fold=4):

# name = alldata[0]
data = alldata[1]


random.shuffle(data)
Expand Down Expand Up @@ -297,24 +303,33 @@ def myclassifier(train_data, test_data):


def getAnalysisData(uinput):
data = []
appData = []
for f in listdir(uinput['dir']):
fname = f.split('_')

if fname[-1] == 'all.json':
print uinput['dir'] + f
fdata = fileExtractor(uinput['dir'] + f)
features = featureAggregator(fdata)
appAggData = featureAggregator(fdata)

if fname[0] == 'malapps':
for apps in features:
data.append([apps, 'unfair'])
else:
for apps in features:
data.append([apps, 'fair'])
# appDict = {}

for apps in appAggData:

return data
if fname[0] == 'malapps':
apps['appLabel'] = 'unfair'
else:
apps['appLabel'] = 'fair'

# appDict['appName'] = apps['appName']
# appDict['appFeatures'] = apps['appFeatures']


appData.append(apps)



return appData



Expand Down
89 changes: 78 additions & 11 deletions dataExport.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from pprint import pprint
from classifier import getAnalysisData

import json
import csv


Expand All @@ -33,56 +34,122 @@ def getUserInput():

def export(data):
# pprint(data)

header = []
vectors = []
# vectors = []
labels = []

appFeaturesFileObj = open('appFeatures.csv', 'wb')
appFeaturesFileObj = open('exports/appFeatures.csv', 'wb')
wr = csv.writer(appFeaturesFileObj)




counter = 0
for row in data:
name = row[0][0]
rowval = []
labels = []
labelval = True

if counter == 0:
header.append('appName')
else:
rowval.append(name)

if row[1] == 'unfair':
labelval = False
labels.append(labelval)

for k,v in row[0].iteritems():



for k,v in row[0][1].iteritems():
if counter == 0:
header.append(k)

if k == 'hasPrivacy' or k == 'hasDeveloperEmail' or k == 'hasDeveloperWebsite' or k == 'hasMultipleApps':
# if k == 'hasPrivacy' or k == 'hasDeveloperEmail' or k == 'hasDeveloperWebsite' or k == 'hasMultipleApps':
if isinstance(v, bool):
v = int(bool(v))
else:
v = float(v)
try:
v = float(v)
except:
print "exception: ", k, v

rowval.append(v)

if counter == 0:
header.append('Fair')
wr.writerow(header)

vectors.append(rowval)
wr.writerow(rowval)
rowval.append(labelval)


try:
wr.writerow(rowval)
except:
pprint(rowval)
counter += 1


pprint(vectors)

appFeaturesFileObj.close()



def exportFile(data):
    """Write one CSV row per app to ``exports/appFeatures.csv``.

    data -- list of dicts, each with the keys ``appName``, ``appFeatures``
            (a dict of feature name -> value) and ``appLabel``.

    The header row is ``appName``, then the feature names of the first app
    in sorted order, then ``appLabel``; every data row is laid out in that
    same column order. An empty ``data`` produces a header-only file.

    The ``exports/`` directory must already exist. Rows the csv writer
    cannot encode are reported on stdout and skipped.
    """
    # Column names come from the first app's *features* (not from the row
    # dict itself) and are spliced in flat, so header and rows line up.
    featNames = sorted(data[0]['appFeatures']) if data else []
    headers = ['appName'] + featNames + ['appLabel']

    # 'wb' is the mode the Python 2 csv module expects; the with-block
    # closes the file even if writing a row raises.
    with open('exports/appFeatures.csv', 'wb') as appFeaturesFileObj:
        wr = csv.writer(appFeaturesFileObj)
        wr.writerow(headers)

        for row in data:
            name = row['appName'].encode('UTF-8').strip()
            appFeatures = row['appFeatures']

            rowvals = [name]
            # Look features up by header name so a row with a missing key
            # gets an empty cell instead of shifting its columns.
            rowvals.extend(appFeatures.get(k, '') for k in featNames)
            rowvals.append(row['appLabel'])

            try:
                wr.writerow(rowvals)
            except UnicodeError:
                # Best-effort export: skip rows with unencodable text.
                print("Skipping %s app for ASCII error: " % name)





def main():
userinput = getUserInput()

data = getAnalysisData(userinput)

export(data)

# pprint(data)

# export(data)

exportFile(data)



Expand Down
Loading

0 comments on commit df44116

Please sign in to comment.