-
Notifications
You must be signed in to change notification settings - Fork 2
/
find_accuracy.py
61 lines (51 loc) · 1.73 KB
/
find_accuracy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.collocations import BigramCollocationFinder
from nltk.collocations import BigramAssocMeasures
from nltk import pos_tag
import random
import re
import json
import sys
import pickle
# Matches tokens whose first character is a word character (\w).
# Compiled once here so the bigram filter does not look it up per token.
_WORD_START_RE = re.compile(r'\w')


def word_feats(words):
    """Build the presence-feature dict for one tokenized document.

    Every token becomes a ``{token: True}`` feature.  In addition, up to
    20000 bigrams, ranked by the chi-squared association measure, are added
    as ``{(w1, w2): True}`` features.  The word filter is given a predicate
    that is true for tokens not starting with a word character, so bigrams
    containing e.g. bare punctuation tokens are not used as bigram features
    (the tokens themselves still appear as unigram features).

    Args:
        words: iterable of token strings.  It is materialized into a list
            first because it is traversed twice (unigrams, then bigrams);
            without that, a generator would yield no bigrams.

    Returns:
        dict mapping each token and each selected bigram tuple to ``True``.
    """
    words = list(words)
    features = {word: True for word in words}

    finder = BigramCollocationFinder.from_words(words)
    # The filter receives each token; returning True marks it for removal.
    finder.apply_word_filter(lambda token: not _WORD_START_RE.match(token))
    bigrams = finder.nbest(BigramAssocMeasures.chi_sq, 20000)

    features.update({bigram: True for bigram in bigrams})
    return features
# Lazily filled cache: language name -> frozenset of stopwords.  Filled on
# first use so the stopword corpus is only touched when tokenize() is called.
_STOPWORD_CACHE = {}


def _stopword_set(language='english'):
    """Return the stopword list for *language* as a cached frozenset."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def tokenize(sentence):
    """Lower-case *sentence*, split it into tokens and drop English stopwords.

    Punctuation tokens are intentionally kept; only stopwords are removed.

    Args:
        sentence: the raw text to tokenize.

    Returns:
        list of lower-cased tokens, in original order, without stopwords.
    """
    tokens = word_tokenize(sentence.lower())
    # Fetch the stopword set once per call instead of once per token.
    stop_words = _stopword_set()
    return [w for w in tokens if w not in stop_words]
# ---------------------------------------------------------------------------
# Script body: classify every review in review.json with the pickled
# classifier, write each review (plus its predicted sentiment) to
# review_sentiment_full_2.json, and tally a confusion matrix in which a
# review with stars >= 3.5 counts as an actual positive.
# ---------------------------------------------------------------------------

# SECURITY NOTE: pickle.load can execute arbitrary code contained in the
# file; only load a classifier pickle that you produced yourself.
# Pickle data is binary, so the file is opened in 'rb' mode.
with open('my_classifier.pickle', 'rb') as model_file:
    classifier = pickle.load(model_file)

read = 0                    # number of review records parsed so far
tp = tn = fp = fn = 0       # confusion-matrix counters

with open('review.json', 'r') as f, \
        open('review_sentiment_full_2.json', 'w') as f_out:
    for line in f:
        if not line.strip():
            # Tolerate blank lines instead of crashing on them.
            continue

        # SECURITY NOTE: this used to be eval(line), which runs arbitrary
        # code from the data file, fails on JSON true/false/null, and
        # leaves \uXXXX escapes undecoded.  Each line is parsed as JSON.
        line = json.loads(line)
        read += 1

        try:
            output_class = classifier.classify(
                word_feats(tokenize(line['text'])))
        except Exception:
            # Best effort: report the record number and move on.  Catching
            # Exception (not a bare except) keeps Ctrl-C working.
            print("error %d" % read)
            continue

        line['predicted_sentiment'] = output_class

        actually_positive = line['stars'] >= 3.5
        if output_class == 'pos':
            if actually_positive:
                tp += 1
            else:
                fp += 1
        elif output_class == 'neg':
            if actually_positive:
                fn += 1
            else:
                tn += 1
        # Any other label is written out but not counted.

        f_out.write(json.dumps(line) + "\n")

        if read % 1000 == 0:
            # Progress indicator every 1000 records.
            print(read)

# Single-argument print(...) calls give the same output on Python 2 and 3.
print("total %d tp %d tn %d fp %d fn %d" % (read, tp, tn, fp, fn))
classifier.show_most_informative_features()