""" Classification
The objective of this task is to build a classifier that can tell us whether a new, unseen deal
requires a coupon code or not.
We would like to see a couple of steps:
1. You should use bad_deals.txt and good_deals.txt as training data
2. You should use test_deals.txt to test your code
3. Each time you tune your code, please commit that change so we can see your tuning over time
Also, provide comments on:
- How general is your classifier?
- How did you test your classifier?
"""
############################################################################################
# Solution 2 for Q3
# 1. This solution follows the classic word-feature approach to sentiment analysis (see below)
# 2. It shows how to split deal lines into words and generate word-presence features
# 3. The raw data is transformed into a new training set, unlike the textblob-based method;
#    ref: http://www.laurentluce.com/posts/twitter-sentiment-analysis-using-python-and-nltk/
#
# note: in our empirical results, this approach performed worse than the textblob
# classifier discussed in the first solution
############################################################################################
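# pipeline sketch (the deal text here is hypothetical, for illustration only):
#   "Extra 20% off with coupon code SAVE20"                          raw line
#   -> ['Extra', '20%', 'off', 'with', 'coupon', 'code', 'SAVE20']   tokens
#   -> {'contains(coupon)': True, 'contains(code)': True, ...}       featureset
# an nltk naive Bayes classifier is then trained on these featuresets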
# import the tools used below; the scikit-learn imports are kept commented
# out together with the classifier-comparison experiment further down
from __future__ import print_function
import random
import nltk
##from sklearn.model_selection import train_test_split
##from sklearn.neighbors import KNeighborsClassifier
##from sklearn.svm import SVC
##from sklearn.tree import DecisionTreeClassifier
##from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
##from sklearn.naive_bayes import GaussianNB
# 1. load the training data for a simple naive Bayes text classifier;
# each deal is stored as (list_of_words, label) so that the feature
# extractor below sees words rather than individual characters
deals = []
with open('../data/good_deals.txt', 'r') as good_data:
    for line in good_data:
        if line.strip():
            deals.append((line.split(), 'good'))
with open('../data/bad_deals.txt', 'r') as bad_data:
    for line in bad_data:
        if line.strip():
            deals.append((line.split(), 'bad'))
with open('../data/test_deals.txt', 'r') as test_data:
    test_dat = [line.split() for line in test_data if line.strip()]
# end
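# quick sanity check on the loaded data (a minimal sketch, assuming the
# deal files contain one deal per line as described in the task)
assert deals, 'no training deals loaded'
assert all(words and label in ('good', 'bad') for words, label in deals)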
##names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
##         "Random Forest", "AdaBoost", "Naive Bayes"]
##classifiers = [
##    KNeighborsClassifier(3),
##    SVC(kernel="linear", C=0.025),
##    SVC(gamma=2, C=1),
##    DecisionTreeClassifier(max_depth=5),
##    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
##    AdaBoostClassifier(),
##    GaussianNB()]
def get_word_features(wordlist):
    # build the vocabulary: every distinct word seen in the training deals
    wordlist = nltk.FreqDist(wordlist)
    word_features = list(wordlist.keys())
    return word_features

def get_words_in_deals(deals):
    # flatten the (words, label) pairs into one list of all training words
    all_words = []
    for (words, t) in deals:
        all_words.extend(words)
    return all_words

word_features = get_word_features(get_words_in_deals(deals))

def extract_features(deal):
    # map a tokenised deal to a boolean 'contains(word)' feature dict
    deal_words = set(deal)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in deal_words)
    return features
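# illustrative check of the feature extractor (hypothetical tokens; which
# keys come out True depends on the vocabulary built from the deal files)
_example = extract_features(['coupon', 'code'])
assert all(isinstance(v, bool) for v in _example.values())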
# shuffle before splitting: good deals were appended before bad deals, so
# a plain tail split would put only bad deals in the held-out set
random.seed(0)
random.shuffle(deals)
training_set = nltk.classify.apply_features(extract_features, deals)
# evaluation by a 90/10 train/test split; nltk's classifier and accuracy
# helper are used here (rather than textblob's), since training_set
# already holds nltk-style featuresets
split = int(len(training_set) * 0.9)
X_train = training_set[:split]
X_test = training_set[split:]
clf = nltk.NaiveBayesClassifier.train(X_train)
# accuracy on the held-out 10%
print(nltk.classify.accuracy(clf, X_test))
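# the most informative features give a quick qualitative check on what the
# model picked up (e.g. whether coupon/code words dominate the decision)
clf.show_most_informative_features(10)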
## the scikit-learn comparison below is kept commented out; note that the
## nltk featuresets are dicts, so they need a DictVectorizer before any
## sklearn estimator can fit them
##from sklearn.feature_extraction import DictVectorizer
##vec = DictVectorizer(sparse=False)
##train = vec.fit_transform([feats for (feats, label) in training_set])
##target = [label for (feats, label) in training_set]
##
##X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=.4)
##
### iterate over classifiers
##for name, clf in zip(names, classifiers):
##    clf.fit(X_train, y_train)
##    score = clf.score(X_test, y_test)
##    print(name, score)
# classify: retrain on all labelled deals, then label each unseen test deal
# (test_dat entries are already tokenised above, so no further splitting)
classifier = nltk.NaiveBayesClassifier.train(training_set)
for deal in test_dat:
    print(classifier.classify(extract_features(deal)))
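# a rough summary for "how did you test your classifier?": besides the
# hold-out accuracy above, the predicted label counts on the unseen test
# deals help spot a degenerate model that labels everything one way
from collections import Counter
print(Counter(classifier.classify(extract_features(deal)) for deal in test_dat))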