Skip to content

Commit

Permalink
revising the mapreduce for review id
Browse files Browse the repository at this point in the history
  • Loading branch information
seekshreyas committed May 5, 2014
1 parent f8074e3 commit 4641faa
Show file tree
Hide file tree
Showing 3 changed files with 99 additions and 25 deletions.
6 changes: 4 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ Thumbs.db
.ipynb_checkpoints/

## aws credentials
*.conf
*.pem
mrjob.conf
aws.pem
credentials.json
db/features.txt

46 changes: 23 additions & 23 deletions obidroidMR.py → mapreduce/obidroidMR.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
import re
import nltk
# import pattern
# from textblob import TextBlob
# from textblob.sentiments import NaiveBayesAnalyzer
import simplejson
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
# import simplejson


class ObidroidReview(MRJob):
Expand Down Expand Up @@ -37,27 +37,27 @@ def getFeatures(rev):

revAdjCount = 0

# revPosTokens = nltk.pos_tag(nltk.word_tokenize(rev))
revPosTokens = nltk.pos_tag(nltk.word_tokenize(rev))

# for _, pos in revPosTokens:
# if pos == 'JJ' or pos == 'VBP':
# revAdjCount += 1
for _, pos in revPosTokens:
if pos == 'JJ' or pos == 'VBP':
revAdjCount += 1


## Sentiment Classifiers:
# revSentAgg = sentClassify(rev)
## overall production sentiment classifier
# blob = TextBlob(rev, analyzer=NaiveBayesAnalyzer())
# blobSent = blob.sentiment
# Sentiment Classifiers:
revSentAgg = sentClassify(rev)
# overall production sentiment classifier
blob = TextBlob(rev, analyzer=NaiveBayesAnalyzer())
blobSent = blob.sentiment

# print blobSent
print blobSent

# if blobSent[0] == 'pos':
# revSent = 1 * blobSent[1]
# elif blobSent[0] == 'neg':
# revSent = -1 * blobSent[2]
# else:
# revSent = 0
if blobSent[0] == 'pos':
revSent = 1 * blobSent[1]
elif blobSent[0] == 'neg':
revSent = -1 * blobSent[2]
else:
revSent = 0



Expand All @@ -67,10 +67,10 @@ def getFeatures(rev):
revWordsLength,
revUniqueWordLength,
revCapCount,
revExclaimCount
# revAdjCount
# revSentAgg,
# revSent
revExclaimCount,
revAdjCount,
revSentAgg,
revSent
]


Expand Down
72 changes: 72 additions & 0 deletions mapreduce/outputparserMR.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#! /usr/bin/env python
# -*- coding: UTF-8 -*-
"""
OutputParser
==========
Parse the output of map reduce jobs
"""

from __future__ import division
from optparse import OptionParser
from collections import defaultdict
import ast
import pandas as pd

def getUserInput():
    """Parse the command-line options of the output parser.

    Recognised options:
        -f / --file   path of the map-reduce output file to parse (required)
        -d / --dir    a directory path (optional; ``None`` when omitted)

    Returns:
        dict: ``{'file': <value of --file>, 'dir': <value of --dir>}``.

    Raises:
        SystemExit: via ``OptionParser.error`` when ``--file`` is missing;
            the usage text and the message are written to stderr first.
    """
    optionparser = OptionParser()

    optionparser.add_option('-f', '--file', dest='inputfile')
    optionparser.add_option('-d', '--dir', dest='directory')

    # Positional arguments are not used by this script.
    option, _ = optionparser.parse_args()

    if not option.inputfile:
        # error() terminates the process; the message names the option this
        # script actually defines (the old text referred to a non-existent
        # --url option and to an "html file").
        return optionparser.error(
            'input file not provided.\n Usage: --file="path/to/mapreduce/output"')

    return {'file': option.inputfile, 'dir': option.directory}



def parseInputFile(path):
    """Load a map-reduce output file into a DataFrame.

    The file is read with ``pandas.read_table`` (tab-delimited by default)
    and its two columns are named ``appid`` and ``appfeatures``.  Every
    ``appfeatures`` cell is expected to hold the text of a Python literal
    and is decoded with ``ast.literal_eval``.

    Args:
        path: location of the file to read.

    Returns:
        pandas.DataFrame: columns ``appid`` and ``appfeatures``, the latter
        holding the decoded Python objects.

    Raises:
        ValueError / SyntaxError: from ``ast.literal_eval`` when a cell is
            not a valid Python literal.
    """
    df = pd.read_table(path, names=['appid', 'appfeatures'])

    # literal_eval only accepts literals, so arbitrary code in the file
    # cannot be executed (unlike eval).
    df['appfeatures'] = df['appfeatures'].apply(ast.literal_eval)

    # Debug output; written as a call so it parses on Python 2 and 3 alike.
    print(df['appfeatures'].dtype)

    return df



def getItem(x):
    """Return the first element of the first entry of a feature record.

    Args:
        x: either the text of a Python literal (e.g. ``"[[12, 3], [4]]"``)
            or an already-decoded list/tuple of sequences.

    Returns:
        ``y[0][0]`` where ``y`` is the decoded record.

    Raises:
        ValueError / SyntaxError: when ``x`` is text that is not a valid
            Python literal.
        IndexError / TypeError: when the record is empty or not nested.
    """
    # parseInputFile() already decodes the 'appfeatures' column, and
    # ast.literal_eval() raises ValueError when handed a list, so only
    # decode values that have not been decoded yet.
    if isinstance(x, (list, tuple)):
        y = x
    else:
        y = ast.literal_eval(x)

    # Debug output; written as a call so it parses on Python 2 and 3 alike.
    print(type(y))

    return y[0][0]







def main():
    """Script entry point: parse the options, load the file, show a preview.

    Reads the input path from the command line, loads it with
    ``parseInputFile``, derives a ``revCharLength`` column by applying
    ``getItem`` to every ``appfeatures`` value, and prints the first rows of
    the resulting DataFrame.
    """
    userInput = getUserInput()

    rawdf = parseInputFile(userInput['file'])

    # Derive the new column directly from 'appfeatures' instead of copying
    # the column first and then overwriting the copy.
    rawdf['revCharLength'] = rawdf['appfeatures'].apply(getItem)

    # Written as a call so it parses on Python 2 and 3 alike.
    print(rawdf.head())


if __name__ == '__main__':
    main()


0 comments on commit 4641faa

Please sign in to comment.