"""
Choose best
Likelihood of letters appearing in that order (n-grams of CMU dict or pokedex?)
→ Namelette in “Checking the phonetic likelihood of the new word”
In case all are bad: pokemon suffixes? (e.g. horsemon)
"""
import math
import operator
import pickle
import re
from collections import Counter

import nltk
import nltk.tokenize.sonority_sequencing as sequencing
import pandas as pd
from nltk.util import ngrams

nltk.download('cmudict')  # pronunciation dictionary used as the English training corpus
def load_poke_data():
    """
    Loads the Pokémon name data
    :return: list of purely alphabetic Pokémon names
    """
    poke_df = pd.read_csv("Data/pokemon.csv")
    poke_data = []
    for name in poke_df["name"]:
        # keep only names made up entirely of ASCII letters
        if re.fullmatch("[a-zA-Z]+", name):
            poke_data.append(name)
    return poke_data
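
# For illustration: a purely alphabetic name such as "Pikachu" passes the filter,
# while a name containing spaces or punctuation (e.g. "Mr. Mime", if present in
# the CSV) is dropped.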
def load_cmu_data():
    """
    Loads the CMU pronouncing dictionary data
    :return: list of purely alphabetic CMU words with at least 2 characters
    """
    entries = nltk.corpus.cmudict.entries()
    cmu_data = []
    for word, pronunciation in entries:
        # keep only words of two or more ASCII letters (drops apostrophes, digits, etc.)
        if re.fullmatch("[a-zA-Z]{2,}", word):
            cmu_data.append(word)
    return cmu_data
def ngram_lists_syllables(data):
    """
    Creates unigrams and bigrams from the training corpus based on the syllables of words
    :param data: list of training words
    :return bigram_list: list of all the syllable bigrams that were created
    :return unigram_list: list of all the syllable unigrams that were created
    :return V: total number of bigrams, used as the smoothing denominator
    """
    bigram_list = []
    unigram_list = []
    tok = sequencing.SyllableTokenizer()
    for word in data:
        syllables = tok.tokenize(word.lower())
        # only words of at least two syllables yield a bigram
        if len(syllables) >= 2:
            bigram_list.extend(ngrams(syllables, 2))
            unigram_list.extend(syllables)
    V = len(bigram_list)
    return bigram_list, unigram_list, V
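
# For illustration: NLTK's sonority-based SyllableTokenizer splits a word such as
# "pikachu" into syllables (roughly ['pi', 'ka', 'chu']), from which ngrams(..., 2)
# yields the bigrams [('pi', 'ka'), ('ka', 'chu')]. The exact split depends on the
# tokenizer's sonority hierarchy.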
def ngram_lists_characters(data):
    """
    Creates unigrams and bigrams from the training corpus based on the characters of words
    :param data: list of training words
    :return bigram_list: list of all the character bigrams that were created
    :return unigram_list: list of all the character unigrams that were created
    :return V: total number of bigrams, used as the smoothing denominator
    """
    bigram_list = []
    unigram_list = []
    for word in data:
        characters = list(word.lower())
        bigram_list.extend(ngrams(characters, 2))
        unigram_list.extend(characters)
    V = len(bigram_list)
    return bigram_list, unigram_list, V
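
# For illustration: for the word "abra", the character unigrams are
# ['a', 'b', 'r', 'a'] and ngrams(..., 2) yields the bigrams
# [('a', 'b'), ('b', 'r'), ('r', 'a')].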
def frequency_count(ngram_list):
    """
    Creates a list of unique ngrams and counts the frequency of the ngrams of the input list
    :param ngram_list: a list of all the ngrams in the training data
    :return unique_ngrams: list which contains each ngram once
    :return count_ngram: dictionary with the ngram as the key and its count in the training set as the value
    """
    # Counter does the counting in a single pass; the original list.count() loop
    # was quadratic. Insertion order is preserved, so unique_ngrams keeps the
    # first-seen order of the input list.
    count_ngram = Counter(ngram_list)
    unique_ngrams = list(count_ngram.keys())
    return unique_ngrams, count_ngram
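
# For illustration: frequency_count([('p', 'i'), ('p', 'i'), ('i', 'k')]) returns
# the unique list [('p', 'i'), ('i', 'k')] and the counts {('p', 'i'): 2, ('i', 'k'): 1}.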
def probability(bi_count, uni_count, V):
    """
    Calculates the add-k smoothed log probability of a bigram
    :param bi_count: count of the occurrence of the bigram in the training set
    :param uni_count: count of the occurrence of the first unigram of the bigram in the training data
    :param V: total number of bigrams in the training data (smoothing denominator)
    :return: the log2 probability of the bigram
    """
    k = 1  # add-one (Laplace) smoothing
    prob = math.log2((bi_count + k) / (uni_count + (k * V)))
    return prob
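
# Worked example (illustrative numbers): with bi_count=3, uni_count=10 and V=1000,
# probability() returns log2((3 + 1) / (10 + 1000)) = log2(4 / 1010) ≈ -7.98.
# Smoothing guarantees a nonzero probability even when bi_count is 0.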
def train_model(vocab, bi_count, uni_count, V, path):
    """
    Calculates and stores a model with a bigram as the dictionary key and its probability as the value
    :param vocab: unique list of bigrams of the training set
    :param bi_count: counts of the bigrams in the training set
    :param uni_count: counts of the unigrams in the training set
    :param V: total number of bigrams in the training set
    :param path: path under which to store the model
    :return: dictionary with the bigram as key and its log2 probability as the value
    """
    probability_list_bigram = {}
    for bigram in vocab:
        first, second = bigram
        probability_list_bigram[bigram] = probability(bi_count[bigram], uni_count[first], V)
    # store the model together with the unigram counts and V, which the evaluation needs
    with open(path, "wb") as model_file:
        pickle.dump([probability_list_bigram, uni_count, V], model_file)
    return probability_list_bigram
def evaluation_prob(input_bigrams, input_unigram, prob_list, uni_count, V):
    """
    Calculates the probability of the input word
    :param input_bigrams: a list of the bigrams of the input word
    :param input_unigram: a list of the unigrams of the input word
    :param prob_list: model with the stored log2 probabilities for each bigram of the training set
    :param uni_count: count of occurrence of a unigram in the training set
    :param V: total number of bigrams in the training set
    :return: probability of the input word
    """
    prob = 0
    for bigram in input_bigrams:
        first, second = bigram
        if bigram in prob_list:
            # stored values are log2 probabilities; convert back to linear space
            prob += 2 ** prob_list[bigram]
        elif first in uni_count:
            # unseen bigram with a known first unigram: add-one smoothed probability
            prob += (0 + 1) / (uni_count[first] + (1 * V))
        else:
            # completely unknown context: smooth against V alone
            prob += (0 + 1) / (0 + (1 * V))
    # normalise by the number of unigrams in the input word
    prob = prob / len(input_unigram)
    return prob
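
# Design note: the score averages the per-bigram probabilities instead of
# multiplying them, which keeps scores comparable across words of different
# lengths and avoids the numeric underflow a product of small probabilities
# would cause.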
def evaluation_syllable(poke_name):
    """
    Evaluates an input word that has several syllables based on its syllables
    :param poke_name: input word
    :return: probability of the input word
    """
    tok = sequencing.SyllableTokenizer()
    with open("Data/model_syllables_poke.pckl", "rb") as poke_file:
        prob_list_poke, uni_count_poke, V_poke = pickle.load(poke_file)
    with open("Data/model_syllables_cmu.pckl", "rb") as cmu_file:
        prob_list_cmu, uni_count_cmu, V_cmu = pickle.load(cmu_file)
    input_unigram = tok.tokenize(poke_name.lower())
    input_bigrams = list(ngrams(input_unigram, 2))
    # weighted blend of both corpora; the English model counts slightly more
    prob = (0.4 * evaluation_prob(input_bigrams, input_unigram, prob_list_poke, uni_count_poke, V_poke)
            + 0.6 * evaluation_prob(input_bigrams, input_unigram, prob_list_cmu, uni_count_cmu, V_cmu))
    return prob
def evaluation_character(poke_name):
    """
    Evaluates an input word with an artificial suffix based on its characters
    :param poke_name: input word
    :return: probability of the input word
    """
    prob = 0
    with open("Data/model_characters_poke.pckl", "rb") as poke_file:
        prob_list_poke, uni_count_poke, V_poke = pickle.load(poke_file)
    with open("Data/model_characters_cmu.pckl", "rb") as cmu_file:
        prob_list_cmu, uni_count_cmu, V_cmu = pickle.load(cmu_file)
    endings = ['saur', 'bat', 'puff', 'don', 'gon', 'low', 'pede', 'no', 'ta']
    for suffix in endings:
        if poke_name.endswith(suffix):
            # strip the suffix from the end only; str.replace() would also
            # remove any earlier occurrence of the same substring
            stem = poke_name[:-len(suffix)]
            input_unigram = list(stem.lower())
            input_bigrams = list(ngrams(input_unigram, 2))
            prob = (0.4 * evaluation_prob(input_bigrams, input_unigram, prob_list_poke, uni_count_poke, V_poke)
                    + 0.6 * evaluation_prob(input_bigrams, input_unigram, prob_list_cmu, uni_count_cmu, V_cmu))
            break
    return prob
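
# For illustration: a candidate such as "horsebat" ends in the suffix 'bat', so
# only the stem "horse" is scored by the character models; a candidate without
# any of the listed suffixes never reaches this function (see evaluation_name).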
def get_model():
    """
    Builds the n-gram statistics for both corpora and trains and stores the four
    bigram models (syllable- and character-based, for the Pokémon and the CMU data)
    """
    poke_data = load_poke_data()
    bigram_list_syl_poke, unigram_list_syl_poke, V_syl_poke = ngram_lists_syllables(poke_data)
    bigram_vocab_syl_poke, bigram_count_syl_poke = frequency_count(bigram_list_syl_poke)
    unigram_vocab_syl_poke, unigram_count_syl_poke = frequency_count(unigram_list_syl_poke)
    bigram_list_char_poke, unigram_list_char_poke, V_char_poke = ngram_lists_characters(poke_data)
    bigram_vocab_char_poke, bigram_count_char_poke = frequency_count(bigram_list_char_poke)
    unigram_vocab_char_poke, unigram_count_char_poke = frequency_count(unigram_list_char_poke)
    train_model(bigram_vocab_syl_poke, bigram_count_syl_poke, unigram_count_syl_poke,
                V_syl_poke, "Data/model_syllables_poke.pckl")
    train_model(bigram_vocab_char_poke, bigram_count_char_poke, unigram_count_char_poke,
                V_char_poke, "Data/model_characters_poke.pckl")
    cmu_data = load_cmu_data()
    bigram_list_syl_cmu, unigram_list_syl_cmu, V_syl_cmu = ngram_lists_syllables(cmu_data)
    bigram_vocab_syl_cmu, bigram_count_syl_cmu = frequency_count(bigram_list_syl_cmu)
    unigram_vocab_syl_cmu, unigram_count_syl_cmu = frequency_count(unigram_list_syl_cmu)
    bigram_list_char_cmu, unigram_list_char_cmu, V_char_cmu = ngram_lists_characters(cmu_data)
    bigram_vocab_char_cmu, bigram_count_char_cmu = frequency_count(bigram_list_char_cmu)
    unigram_vocab_char_cmu, unigram_count_char_cmu = frequency_count(unigram_list_char_cmu)
    train_model(bigram_vocab_syl_cmu, bigram_count_syl_cmu, unigram_count_syl_cmu,
                V_syl_cmu, "Data/model_syllables_cmu.pckl")
    train_model(bigram_vocab_char_cmu, bigram_count_char_cmu, unigram_count_char_cmu,
                V_char_cmu, "Data/model_characters_cmu.pckl")
def evaluation_name(blended_words):
    """
    Evaluates a list of input names and decides which one is the best name
    :param blended_words: list of candidate names
    :return: best input name
    """
    cmu_data = load_cmu_data()
    # get_model()  # uncomment to (re)train and store the four bigram models
    endings = ['saur', 'bat', 'puff', 'don', 'gon', 'low', 'pede', 'no', 'ta']
    input_probs = {}
    for name in blended_words:
        # skip candidates that are already existing English words
        if name in cmu_data:
            continue
        for suffix in endings:
            if name.endswith(suffix):
                # names with an artificial Pokémon suffix are scored on characters
                input_probs[name] = evaluation_character(name)
                break
        else:
            # no suffix matched: score the name on its syllables
            input_probs[name] = evaluation_syllable(name)
    best_name = max(input_probs.items(), key=operator.itemgetter(1))[0]
    return best_name
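
# A minimal usage sketch with hypothetical candidate names (assumes the pickled
# models under Data/ already exist, e.g. after running get_model() once):
if __name__ == "__main__":
    candidates = ["pikahorse", "horsebat", "eaglepuff"]  # illustrative blends
    print(evaluation_name(candidates))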