forked project; updated libraries and all calls; dockerized app
GregSilverman committed Jul 17, 2018
1 parent 7da58f8 commit 5793e47
Showing 9 changed files with 151 additions and 28 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -52,3 +52,5 @@ coverage.xml
 # Sphinx documentation
 docs/_build/
 
+.DS_Store
+word2vec-api/.DS_Store
9 changes: 9 additions & 0 deletions Dockerfile
@@ -0,0 +1,9 @@
FROM ubuntu:latest
RUN apt-get update -y
RUN apt-get install -y python3-pip python3-dev build-essential
ADD word2vec-api /word2vec
WORKDIR /word2vec
RUN pip3 install -r requirements.txt
WORKDIR /word2vec
ENTRYPOINT ["python3"]
CMD ["word2vec-api.py", "--model", "/data/test.model"]
4 changes: 0 additions & 4 deletions requirements.txt

This file was deleted.

Binary file added word2vec-api/.DS_Store
Binary file not shown.
54 changes: 54 additions & 0 deletions word2vec-api/.gitignore
@@ -0,0 +1,54 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
.Python
env/
bin/
build/
develop-eggs/
dist/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.cache
nosetests.xml
coverage.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject

# Rope
.ropeproject

# Django stuff:
*.log
*.pot

# Sphinx documentation
docs/_build/

55 changes: 55 additions & 0 deletions word2vec-api/README.md
@@ -0,0 +1,55 @@
word2vec-api
============

A simple web service providing a word-embedding API. The methods are based on the Gensim Word2Vec implementation. Models are passed as parameters and must be in the Word2Vec text or binary format.
* Install Dependencies
```
pip3 install -r requirements.txt
```

* Launching the service
```
python3 word2vec-api.py --model path/to/the/model [--host host --port 1234]
```
or
```
python3 word2vec-api.py --model /path/to/GoogleNews-vectors-negative300.bin --binary BINARY --path /word2vec --host 0.0.0.0 --port 5000
```



* Example calls
```
curl "http://127.0.0.1:5000/word2vec/n_similarity?ws1=Sushi&ws1=Shop&ws2=Japanese&ws2=Restaurant"
curl "http://127.0.0.1:5000/word2vec/similarity?w1=Sushi&w2=Japanese"
curl "http://127.0.0.1:5000/word2vec/most_similar?positive=indian&positive=food[&negative=][&topn=]"
curl "http://127.0.0.1:5000/word2vec/model?word=restaurant"
curl "http://127.0.0.1:5000/word2vec/model_word_set"
```

Note: The "model" method returns a base64 encoding of the vector. "model\_word\_set" returns a base64 encoded pickle of the model's vocabulary.

## Where to get a pretrained model

If you do not have domain-specific data to train on, it can be convenient to use a pretrained model.
Please feel free to submit additions to this list through a pull request.


| Model file | Number of dimensions | Corpus (size)| Vocabulary size | Author | Architecture | Training Algorithm | Context window - size | Web page |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| [Google News](https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/) | 300 |Google News (100B) | 3M | Google | word2vec | negative sampling | BoW - ~5| [link](http://code.google.com/p/word2vec/) |
| [Freebase IDs](https://docs.google.com/file/d/0B7XkCwpI5KDYaDBDQm1tZGNDRHc/edit?usp=sharing) | 1000 | Google News (100B) | 1.4M | Google | word2vec, skip-gram | ? | BoW - ~10 | [link](http://code.google.com/p/word2vec/) |
| [Freebase names](https://docs.google.com/file/d/0B7XkCwpI5KDYeFdmcVltWkhtbmM/edit?usp=sharing) | 1000 | Google News (100B) | 1.4M | Google | word2vec, skip-gram | ? | BoW - ~10 | [link](http://code.google.com/p/word2vec/) |
| [Wikipedia+Gigaword 5](http://nlp.stanford.edu/data/glove.6B.zip) | 50 | Wikipedia+Gigaword 5 (6B) | 400,000 | GloVe | GloVe | AdaGrad | 10+10 | [link](http://nlp.stanford.edu/projects/glove/) |
| [Wikipedia+Gigaword 5](http://nlp.stanford.edu/data/glove.6B.zip) | 100 | Wikipedia+Gigaword 5 (6B) | 400,000 | GloVe | GloVe | AdaGrad | 10+10 | [link](http://nlp.stanford.edu/projects/glove/) |
| [Wikipedia+Gigaword 5](http://nlp.stanford.edu/data/glove.6B.zip) | 200 | Wikipedia+Gigaword 5 (6B) | 400,000 | GloVe | GloVe | AdaGrad | 10+10 | [link](http://nlp.stanford.edu/projects/glove/) |
| [Wikipedia+Gigaword 5](http://nlp.stanford.edu/data/glove.6B.zip) | 300 | Wikipedia+Gigaword 5 (6B) | 400,000 | GloVe | GloVe | AdaGrad | 10+10 | [link](http://nlp.stanford.edu/projects/glove/) |
| [Common Crawl 42B](http://nlp.stanford.edu/data/glove.42B.300d.zip) | 300 | Common Crawl (42B) | 1.9M | GloVe | GloVe | AdaGrad | ? | [link](http://nlp.stanford.edu/projects/glove/) |
| [Common Crawl 840B](http://nlp.stanford.edu/data/glove.840B.300d.zip) | 300 | Common Crawl (840B) | 2.2M | GloVe | GloVe | AdaGrad | ? | [link](http://nlp.stanford.edu/projects/glove/) |
| [Twitter (2B Tweets)](http://www-nlp.stanford.edu/data/glove.twitter.27B.zip) | 25 | Twitter (27B) | ? | GloVe | GloVe | AdaGrad | ? | [link](http://nlp.stanford.edu/projects/glove/) |
| [Twitter (2B Tweets)](http://www-nlp.stanford.edu/data/glove.twitter.27B.zip) | 50 | Twitter (27B) | ? | GloVe | GloVe | AdaGrad | ? | [link](http://nlp.stanford.edu/projects/glove/) |
| [Twitter (2B Tweets)](http://www-nlp.stanford.edu/data/glove.twitter.27B.zip) | 100 | Twitter (27B) | ? | GloVe | GloVe | AdaGrad | ? | [link](http://nlp.stanford.edu/projects/glove/) |
| [Twitter (2B Tweets)](http://www-nlp.stanford.edu/data/glove.twitter.27B.zip) | 200 | Twitter (27B) | ? | GloVe | GloVe | AdaGrad | ? | [link](http://nlp.stanford.edu/projects/glove/) |
| [Wikipedia dependency](http://u.cs.biu.ac.il/~yogo/data/syntemb/deps.words.bz2) | 300 | Wikipedia (?) | 174,015 | Levy & Goldberg | word2vec modified | word2vec | syntactic dependencies | [link](https://levyomer.wordpress.com/2014/04/25/dependency-based-word-embeddings/) |
| [DBPedia vectors (wiki2vec)](https://github.com/idio/wiki2vec/raw/master/torrents/enwiki-gensim-word2vec-1000-nostem-10cbow.torrent) | 1000 | Wikipedia (?) | ? | Idio | word2vec | word2vec, skip-gram | BoW, 10 | [link](https://github.com/idio/wiki2vec#prebuilt-models) |
| [60 Wikipedia embeddings with 4 kinds of context](http://vsmlib.readthedocs.io/en/latest/tutorial/getting_vectors.html#) | 25,50,100,250,500 | Wikipedia | varies | Li, Liu et al. | Skip-Gram, CBOW, GloVe | original and modified | 2 | [link](http://vsmlib.readthedocs.io/en/latest/tutorial/getting_vectors.html#) |
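
Note that the GloVe downloads above are not in word2vec format. One way to convert them is gensim's bundled script (a sketch against the gensim 3.x API pinned in requirements.txt; the file names are placeholders for whichever archive you extracted):
```
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# Prepend the "vocab_size vector_size" header required by the word2vec text format.
glove2word2vec("glove.6B.300d.txt", "glove.6B.300d.w2v.txt")

# Sanity check: the converted file now loads as word2vec text vectors.
vectors = KeyedVectors.load_word2vec_format("glove.6B.300d.w2v.txt", binary=False)
print(vectors.most_similar("restaurant", topn=3))
```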
File renamed without changes.
4 changes: 4 additions & 0 deletions word2vec-api/requirements.txt
@@ -0,0 +1,4 @@
Flask
Flask-RESTful
gensim==3.4.0

51 changes: 27 additions & 24 deletions word2vec-api.py → word2vec-api/word2vec-api.py
@@ -7,43 +7,43 @@
 '''
 
 from flask import Flask, request, jsonify
-from flask.ext.restful import Resource, Api, reqparse
+from flask_restful import Resource, Api, reqparse
 from gensim.models.word2vec import Word2Vec as w
 from gensim import utils, matutils
 from numpy import exp, dot, zeros, outer, random, dtype, get_include, float32 as REAL,\
     uint32, seterr, array, uint8, vstack, argsort, fromstring, sqrt, newaxis, ndarray, empty, sum as np_sum
-import cPickle
+import pickle
 import argparse
 import base64
 import sys
 
 parser = reqparse.RequestParser()
 
 
 def filter_words(words):
     if words is None:
         return
-    return [word for word in words if word in model.vocab]
+    return [word for word in words if word in model.wv.vocab]
 
 # http://127.0.0.1:5000/word2vec/n_similarity?ws1=data&ws1=science&ws2=computer&ws2=science
 class N_Similarity(Resource):
     def get(self):
         parser = reqparse.RequestParser()
         parser.add_argument('ws1', type=str, required=True, help="Word set 1 cannot be blank!", action='append')
         parser.add_argument('ws2', type=str, required=True, help="Word set 2 cannot be blank!", action='append')
         args = parser.parse_args()
-        return model.n_similarity(filter_words(args['ws1']),filter_words(args['ws2']))
+        return model.wv.n_similarity(filter_words(args['ws1']),filter_words(args['ws2']))
 
 class Similarity(Resource):
     def get(self):
         parser = reqparse.RequestParser()
         parser.add_argument('w1', type=str, required=True, help="Word 1 cannot be blank!")
         parser.add_argument('w2', type=str, required=True, help="Word 2 cannot be blank!")
         args = parser.parse_args()
-        return model.similarity(args['w1'], args['w2'])
+        return model.wv.similarity(w1=args['w1'], w2=args['w2'])
 
 
 # http://127.0.0.1:5000/word2vec/most_similar?positive=kolaczkowski&topn=5
 class MostSimilar(Resource):
     def get(self):
         parser = reqparse.RequestParser()
@@ -57,14 +57,13 @@ def get(self):
         pos = [] if pos == None else pos
         neg = [] if neg == None else neg
         t = 10 if t == None else t
-        print "positive: " + str(pos) + " negative: " + str(neg) + " topn: " + str(t)
+        print("positive: " + str(pos) + " negative: " + str(neg) + " topn: " + str(t))
         try:
             res = model.most_similar_cosmul(positive=pos,negative=neg,topn=t)
             return res
-        except Exception, e:
-            print e
-            print res
+        except Exception as e:
+            print(e)
+            print(res)
 
 class Model(Resource):
     def get(self):
@@ -74,18 +73,20 @@ def get(self):
         try:
             res = model[args['word']]
             res = base64.b64encode(res)
+            res = str(res, 'utf-8', 'ignore')
             return res
-        except Exception, e:
-            print e
+        except Exception as e:
+            print(e)
             return
 
 class ModelWordSet(Resource):
     def get(self):
         try:
-            res = base64.b64encode(cPickle.dumps(set(model.index2word)))
-            return res
-        except Exception, e:
-            print e
+            res = base64.b64encode(pickle.dumps(set(model.wv.index2word)))
+            res = str(res, 'utf-8', 'ignore')
+            return str(res)
+        except Exception as e:
+            print(e)
             return
 
 app = Flask(__name__)
@@ -112,16 +113,18 @@ def raiseError(error):
     args = p.parse_args()
 
     model_path = args.model if args.model else "./model.bin.gz"
+    print(model_path)
     binary = True if args.binary else False
     host = args.host if args.host else "localhost"
     path = args.path if args.path else "/word2vec"
     port = int(args.port) if args.port else 5000
     if not args.model:
-        print "Usage: word2vec-apy.py --model path/to/the/model [--host host --port 1234]"
-    model = w.load_word2vec_format(model_path, binary=binary)
+        print("Usage: word2vec-api.py --model path/to/the/model [--host host --port 1234]")
+    #model = w.load_word2vec_format(model_path, binary=binary)
+    model = w.load(model_path)
     api.add_resource(N_Similarity, path+'/n_similarity')
     api.add_resource(Similarity, path+'/similarity')
     api.add_resource(MostSimilar, path+'/most_similar')
     api.add_resource(Model, path+'/model')
     api.add_resource(ModelWordSet, '/word2vec/model_word_set')
-    app.run(host=host, port=port)
+    app.run(host='0.0.0.0', port=port, debug=True)
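
Because the load call changes from `load_word2vec_format` to `w.load`, the `--model` argument (including the `/data/test.model` baked into the Dockerfile's CMD) must now point at a model saved in gensim's native format rather than word2vec text/binary. A minimal sketch of producing such a file (the toy sentences are an obvious placeholder; `size` is the gensim 3.x parameter name):
```
from gensim.models.word2vec import Word2Vec

# Toy corpus purely for illustration; train on real data in practice.
sentences = [["sushi", "shop"], ["japanese", "restaurant"], ["indian", "food"]]

model = Word2Vec(sentences, size=50, min_count=1)
model.save("data/test.model")  # native gensim format, loadable with Word2Vec.load()
```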
