w2v initial.py
from wolframclient.evaluation import WolframLanguageSession
from wolframclient.language import wl, wlexpr
#session = WolframLanguageSession()
#session.evaluate(wlexpr('')) # Wolfram expressions can be embedded in the program to use its functionality
#session.evaluate(wl.MinMax()) # Any built-in Wolfram Language function can be evaluated this way
#session.terminate()
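# A minimal sketch of a full session round-trip (assumes a local Wolfram Engine
# or Mathematica installation; WolframLanguageSession raises on startup otherwise).
# The expression below is an arbitrary illustration, not part of this script's logic.
#
# with WolframLanguageSession() as session:
#     result = session.evaluate(wlexpr('MinMax[{3, 1, 4, 1, 5}]'))
#     print(result)  # the min/max pair of the list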
#Error is generally taken as the magnitude of the distance.
#Word2vec model using the gensim library in Python
from gensim.models import Word2Vec
import re
import io
import os  # needed below for listing the site-text directory
def line_array(file):
    word_array = []
    for line in file.readlines():
        words = re.findall(r"\b\w+\b", line)
        word_array.append([w.lower() for w in words if w.isalpha()])  # Keep pure alphabetic tokens only - no digits or other characters - and lowercase them
    return word_array
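# Quick sanity check of the tokenizer on an in-memory file (input text is illustrative):
print(line_array(io.StringIO("Solar power, 2030 targets!\nWind energy.")))
# Expected: [['solar', 'power', 'targets'], ['wind', 'energy']] - the digits are filtered out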
#Generate the lines, then filter down to words only
with open(r'C:\Users\retro\Documents\1Hack reservoir\sustainability texts.txt', 'r') as f:
    sentences = line_array(f)
model = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, workers=4)  # Tweak workers for the number of processors dedicated to the training
model.save("sustainability.model")  # Save so the model can be loaded for re-training later
model = Word2Vec.load("sustainability.model")
############### We feed in a sentence which makes logical sense; it can be used to adjust the weights of its words
sentence = 'Sustainability is very important to save the environment'
sent_list = [w.lower() for w in sentence.split(' ') if w.isalpha()]  # The sentence, split into a list of words, can be added to the training data
print(sent_list)
model.build_vocab([sent_list], update=True)  # Register any unseen words before continuing training
model.train([sent_list], total_examples=1, epochs=1)  # train() expects a list of tokenised sentences, not a bare list of words
#1. Function that generates associated words from preference words for each category
vector = model.wv['secondary']  # get the numpy vector of a word (raises KeyError if the word is absent from the vocabulary)
sims = model.wv.most_similar(positive='life', topn=15)  # get similar words - positive pulls matches closer, negative pushes them farther away
print("\n Similarity tuples:")
#Checking the list of similar words
for i in sims:  # sims stores (word, similarity score) tuples
    print(i)
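# A couple of other KeyedVectors queries worth knowing (the word choices here are
# illustrative and assume each word made it into the trained vocabulary):
# model.wv.similarity('life', 'environment')  # cosine similarity between two words
# model.wv.doesnt_match(['life', 'environment', 'sustainability'])  # pick the odd word out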
from gensim.models import KeyedVectors
# Store just the words + their trained embeddings.
word_vectors = model.wv
word_vectors.save("sustainability.wordvectors")
# Load back with memory-mapping = read-only, shared across processes.
wv = KeyedVectors.load("sustainability.wordvectors", mmap='r')
#Example: print the vector positions of a limited set of words:
def sample_vectors(lines):
    for l_num, line in enumerate(lines):
        if l_num > 10:
            break
        for word in line:
            print('\n {}'.format(word))
            print(wv[word])
sample_vectors(sentences)
import itertools
def load_vectors(fname):
    """Load a fastText-style .vec file: a header line '<n> <dim>' followed by '<word> <v1> ... <vdim>' rows."""
    data = {}
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        n, d = map(int, fin.readline().split())  # header: vocabulary size and vector dimension
        for line in fin:
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = list(map(float, tokens[1:]))  # materialise the floats; a bare map object would be exhausted after one pass
    return data
fpath = r'C:\Users\retro\Documents\1Hack reservoir\samplevc.vec'
pretrained_vectors = load_vectors(fpath)
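# Side note (assumes samplevc.vec follows the standard word2vec/fastText text
# format): gensim can load the same file directly into a queryable KeyedVectors
# object, which avoids the hand-rolled parser above.
# pretrained_kv = KeyedVectors.load_word2vec_format(fpath)
# pretrained_kv.most_similar('life', topn=15)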
model = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, workers=4)  # Re-train a fresh model on the same corpus
model.save("sustainability.model")  # Save so the model can be loaded for re-training later
sims = model.wv.most_similar(positive='life', topn=15)  # get similar words - positive pulls matches closer, negative pushes them farther away
print(sims)
##########################################################################################
class Categories:
    def __init__(self, industries, goal, regions, interests):
        self.industries = industries[0]  # industries of interest
        self.industries_reject = industries[1]  # industries of lower interest
        self.goal = goal  # Company ethos/goals
        self.regions = regions
        self.technologies = interests  # More general, associated with specific areas of industry (e.g. within vehicles: EV charging, EV battery coolant, etc.)
    @property
    def industry_sims(self):
        pos = list(self.industries)
        neg = list(self.industries_reject)
        sims = model.wv.most_similar(positive=pos, negative=neg, topn=15)  # positive and negative both accept lists of words
        return [word for word, _ in sims]  # Keep just the similar words, dropping the similarity scores
    @property
    def goals_sim(self):
        pos = list(self.goal)
        sims = model.wv.most_similar(positive=pos, topn=15)
        return [word for word, _ in sims]  # Keep just the similar words, dropping the similarity scores
    #Need to do the same for the 2 remaining categories
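# Illustrative usage of Categories (every word below is a hypothetical example and
# must exist in the trained vocabulary for most_similar to succeed):
# prefs = Categories(
#     industries=[['energy', 'transport'], ['coal']],
#     goal=['sustainability'],
#     regions=['europe'],
#     interests=['charging'],
# )
# print(prefs.industry_sims)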
#site = io.open(sitename, 'r', encoding='utf-8', newline='\n', errors='ignore')  # single-file variant (sitename undefined); superseded by the directory walk below
dir = "C:\Users\retro\Documents\1Hack reservoir\Sitewords"
for filename in os.listdir(r"{}".format(dir)):
if filename.endswith(".txt"):
with open(r"{}/{}".format(dir,filename), 'r') as sitefile:
            #Similarity matchup: generates a reservoir of vectors for each industry
            #numpy.unique() will count the number of occurrences of each word
            #Generate a set of similar words associated with sustainability, based on preferences (set P)
            #Check the maximum number of matches within the scraped vector of a website to give match 1.
            #OR
            #Alternatively: perform a similarity test against the vectors from set P - via itertools (set A)
            #Complexity order 2?
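# A minimal sketch of the first matching strategy described above: count how often
# a site's words land in the preference-derived set P. The names are hypothetical;
# `preference_set` would come from something like Categories.industry_sims.
import numpy as np

def overlap_score(site_words, preference_set):
    """Score a site by how many of its word occurrences fall in set P."""
    flat = [w for line in site_words for w in line]       # flatten list-of-lines into one token list
    words, counts = np.unique(flat, return_counts=True)   # distinct words plus occurrence counts
    mask = np.isin(words, list(preference_set))           # which distinct words belong to set P
    return int(counts[mask].sum())                        # total matching occurrences

# e.g. overlap_score(site_words, set(prefs.industry_sims)) - a higher score means a better match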