revising the mapreduce for review id

seekshreyas · May 5, 2014 · 4641faa · 4641faa
1 parent f8074e3
commit 4641faa
Show file tree

Hide file tree

Showing 3 changed files with 99 additions and 25 deletions.
diff --git a/.gitignore b/.gitignore
@@ -67,6 +67,8 @@ Thumbs.db
 .ipynb_checkpoints/
 
 ## aws credentials
-*.conf
-*.pem
+mrjob.conf
+aws.pem
+credentials.json
 db/features.txt
+
diff --git a/obidroidMR.py → mapreduce/obidroidMR.py b/obidroidMR.py → mapreduce/obidroidMR.py
@@ -4,9 +4,9 @@
 import re
 import nltk
 # import pattern
-# from textblob import TextBlob
-# from textblob.sentiments import NaiveBayesAnalyzer
-import simplejson
+from textblob import TextBlob
+from textblob.sentiments import NaiveBayesAnalyzer
+# import simplejson
 
 
 class ObidroidReview(MRJob):
@@ -37,27 +37,27 @@ def getFeatures(rev):
 
 		revAdjCount = 0
 
-		# revPosTokens = nltk.pos_tag(nltk.word_tokenize(rev))
+		revPosTokens = nltk.pos_tag(nltk.word_tokenize(rev))
 
-		# for _, pos in revPosTokens:
-		# 	if pos == 'JJ' or pos == 'VBP':
-		# 		revAdjCount += 1
+		for _, pos in revPosTokens:
+			if pos == 'JJ' or pos == 'VBP':
+				revAdjCount += 1
 
 
-		## Sentiment Classifiers:
-		# revSentAgg = sentClassify(rev)
-		## overall production sentiment classifier
-		# blob = TextBlob(rev, analyzer=NaiveBayesAnalyzer())
-		# blobSent = blob.sentiment
+		# Sentiment Classifiers:
+		revSentAgg = sentClassify(rev)
+		# overall production sentiment classifier
+		blob = TextBlob(rev, analyzer=NaiveBayesAnalyzer())
+		blobSent = blob.sentiment
 
-		# print blobSent
+		print blobSent
 
-		# if blobSent[0] == 'pos':
-		# 	revSent = 1 * blobSent[1]
-		# elif blobSent[0] == 'neg':
-		# 	revSent = -1 * blobSent[2]
-		# else:
-		# 	revSent = 0
+		if blobSent[0] == 'pos':
+			revSent = 1 * blobSent[1]
+		elif blobSent[0] == 'neg':
+			revSent = -1 * blobSent[2]
+		else:
+			revSent = 0
 
 
 
@@ -67,10 +67,10 @@ def getFeatures(rev):
 			revWordsLength,
 			revUniqueWordLength,
 			revCapCount,
-			revExclaimCount
-			# revAdjCount
-			# revSentAgg,
-			# revSent
+			revExclaimCount,
+			revAdjCount,
+			revSentAgg,
+			revSent
 		]
 
 

diff --git a/mapreduce/outputparserMR.py b/mapreduce/outputparserMR.py
@@ -0,0 +1,72 @@
+#! /usr/bin/env python
+# -*- coding: UTF-8 -*-
+"""
+OutputParser
+============
+Parse the output of MapReduce jobs.
+"""
+
+from __future__ import division
+from optparse import OptionParser
+from collections import defaultdict
+import ast
+import pandas as pd
+
+def getUserInput():
+    optionparser = OptionParser()
+
+    optionparser.add_option('-f', '--file', dest='inputfile')
+    optionparser.add_option('-d', '--dir', dest='directory')
+
+
+    (option, args) = optionparser.parse_args()
+
+    if not option.inputfile:
+        return optionparser.error('html file input not provided.\n Usage: --url="path.to.appurl"')
+
+    return { 'file' : option.inputfile, 'dir' : option.directory }
+
+
+
+def parseInputFile(path):
+    df = pd.read_table(path, names=['appid', 'appfeatures'])
+
+    df['appfeatures'] = df['appfeatures'].apply(lambda x: ast.literal_eval(x))
+
+    print df['appfeatures'].dtype
+
+    return df
+
+
+
+def getItem(x):
+    y = ast.literal_eval(x)
+
+    print type(y)
+
+    return y[0][0]
+
+
+
+
+
+
+
+def main():
+    userInput = getUserInput()
+
+    rawdf = parseInputFile(userInput['file'])
+
+    rawdf['revCharLength'] = rawdf['appfeatures']
+
+    rawdf['revCharLength'] = rawdf['revCharLength'].apply(lambda x: getItem(x))
+
+
+    print rawdf.head()
+
+
+
+if __name__ == '__main__':  # run the parser only when executed as a script
+    main()
+
+