-
Notifications
You must be signed in to change notification settings - Fork 14
/
create_word2vec.py
41 lines (31 loc) · 1.19 KB
/
create_word2vec.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import logging
from gensim.models import Word2Vec
from sentences import SentenceIter
class Word2VecFactory:
    """
    A simple wrapper around the Word2Vec implementation of gensim.

    For reproducibility, just use the default settings.
    """

    @staticmethod
    def create(basedir, num_workers=12, size=320, threshold=5):
        """
        Train a word2vec model with gensim and export it to disk.

        Sentences are read through ``SentenceIter(root=basedir)`` and fed to
        gensim's ``Word2Vec`` with ``sg=True``, ``window=11`` and
        ``negative=15``. After training, the vectors and the vocabulary are
        written in word2vec format to ``<size>-<threshold>.wordvecs`` and
        ``<size>-<threshold>.vocab`` (relative paths).

        Calling this also configures the root logger at INFO level via
        ``logging.basicConfig``.

        :param basedir: the dir from which to get the documents.
        :param num_workers: the number of workers to use for training word2vec
        :param size: the size of the resulting vectors.
        :param threshold: the frequency threshold (passed to gensim as
            ``min_count``).
        :return: the trained model.
        """
        logging.basicConfig(level=logging.INFO)
        sentences = SentenceIter(root=basedir)
        model = Word2Vec(sentences=sentences,
                         sg=True,
                         size=size,
                         workers=num_workers,
                         min_count=threshold,
                         window=11,
                         negative=15)

        # The "{0}-{1}" templates used to be passed through unformatted, so
        # every run wrote to files literally named "{0}-{1}.wordvecs" and
        # "{0}-{1}.vocab" and overwrote the previous run's output.
        # NOTE(review): filling the placeholders with (size, threshold) is an
        # assumption about the intended naming scheme -- confirm.
        prefix = "{0}-{1}".format(size, threshold)
        # NOTE(review): newer gensim releases expose this exporter on
        # ``model.wv`` and rename ``size`` to ``vector_size``; confirm the
        # gensim version this project pins before upgrading.
        model.save_word2vec_format(prefix + ".wordvecs", prefix + ".vocab")
        return model
# Script entry point: running this module directly is currently a no-op.
# Import Word2VecFactory and call Word2VecFactory.create() to train a model.
if __name__ == "__main__":
    pass