-
-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathspam-sms.py
128 lines (87 loc) · 3.92 KB
/
spam-sms.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn import metrics
import re
import nltk
# nltk.download('stopwords') #used once to download stopwords.
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import pickle
import keras
from keras.models import Sequential
from keras.layers import Dense
# https://medium.com/coinmonks/spam-detector-using-naive-bayes-c22cc740e257
# Load the labelled SMS corpus: each row is "<label>,\t<message>".
messages = pd.read_csv('data/spam-ham.csv', delimiter=',\t', names=['label', 'sms'], engine='python')
messages['label'] = messages.label.map({'ham':0, 'spam':1}) #Making spam/ham as 0/1 instead
# Cleaning the data: keep letters only, lowercase, drop stopwords, stem.
ps = PorterStemmer()                            # hoisted: one stemmer serves every document
stop_words = set(stopwords.words('english'))    # hoisted: originally rebuilt for every word of every message
corpus = []
for sms in messages['sms']:
    tokens = re.sub('[^a-zA-Z]', ' ', sms).lower().split()
    corpus.append(' '.join(ps.stem(word) for word in tokens if word not in stop_words))
smsDocuments = pd.Series(corpus)
################### Summary #####################
# Demonstration of the bag-of-words representation on the cleaned corpus.
count_vector = CountVectorizer()
'''
CountVectorizer will convert all the text into lowercase, will remove all the punctuations and all the stopwords and make
a frequency matrix similar to document term matrix.
'''
count_vector.fit(smsDocuments) #Bag of words algorithm.
'''
The basic idea of BoW is to take a piece of text and count the frequency of the words in that text.
It is important to note that the BoW concept treats each word individually and the order in which
the words occur does not matter.
'''
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the supported replacement.
feature_names = count_vector.get_feature_names_out()
doc_array = count_vector.transform(smsDocuments).toarray()
'''
Now we have a clean representation of the documents in terms of the frequency distribution of the words in them.
'''
freq_matrix = pd.DataFrame(doc_array, columns = feature_names)
################## Real Detector ################
# Build the document-term (frequency) matrix, capping the vocabulary size
# at the number of messages in the corpus.
count_vector = CountVectorizer(max_features = len(messages['sms']))
smsDocuments = count_vector.fit_transform(smsDocuments)

# Hold out 20% of the messages for evaluation; fixed seed for reproducibility.
training_data, testing_data, y_train, y_test = train_test_split(
    smsDocuments,
    messages['label'],
    test_size=0.20,
    random_state=0,
)

# Multinomial naive Bayes is well suited to word-count features.
model = MultinomialNB().fit(training_data, y_train)
y_score = model.predict(testing_data)
acc = accuracy_score(y_test, y_score)
print("Accuracy Score Bayes: ", acc)
'''
filename = 'data/spam_model.sav'
pickle.dump(model, open(filename, 'wb')) #Saving the model
model = pickle.load(open(filename, 'rb')) #To load saved model from local directory
'''
#Building Neural Net
# Two hidden ReLU layers of 50 units and a sigmoid output for binary spam/ham.
classifier = Sequential()
# `units` / `kernel_initializer` replace the Keras 1 `output_dim` / `init`
# arguments, which were removed in Keras 2.
# input_dim must equal the width of the document-term matrix: the fitted
# vocabulary can be smaller than the max_features cap, so len(messages['sms'])
# could mismatch the actual feature count and crash fit().
classifier.add(Dense(units = 50, kernel_initializer = 'uniform', activation = 'relu',
                     input_dim = training_data.shape[1]))
classifier.add(Dense(units = 50, kernel_initializer = 'uniform', activation = 'relu'))
classifier.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
classifier.summary()
# NOTE(review): training_data is a scipy sparse matrix; some Keras versions
# require a dense array here (training_data.toarray()) — confirm against the
# installed Keras before relying on this call.
classifier.fit(training_data, y_train, batch_size = 10, epochs = 100)
y_pred = classifier.predict(testing_data)
y_pred = (y_pred > 0.5) # Converting probability to 0 or 1 depending upon 0.5 threshold
# Making the Confusion Matrix
acc_nn = accuracy_score(y_test, y_pred)
print('Accuracy Score NN: ', acc_nn)
'''
fpr, tpr, _ = metrics.roc_curve(y_test, y_score)
auc = metrics.roc_auc_score(y_test, y_score) #Find AUC score for the data set based on LR
print("AUC Score: ", auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.plot(fpr,tpr,label="spam-ham.csv, auc="+str(auc))
plt.legend(loc=4)
plt.show()
'''