Merge pull request #107 from BrikerMan/develop

release v0.2.4
BrikerMan · Jun 6, 2019 · 90fcd52 · 90fcd52
2 parents 1a281bf + 929de04
commit 90fcd52
Show file tree

Hide file tree

Showing 6 changed files with 72 additions and 18 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,9 @@
+## release v0.2.4
+
+* Add fine-tuning support for the BERT output feature layers. Discussion: #103
+* Add selection of the number of BERT output feature layers (default 4, following the BERT paper).
+* Fix BERT embedding token index offset issue #104.
+
 ## release v0.1.4
 
 * fix classification model evaluate result output

diff --git a/README.md b/README.md
@@ -47,11 +47,12 @@ Kashgari is:
 
 ## Performance
 
-| Task                     | Language | Dataset                   | Score          | Detail                                                                         |
-| ------------------------ | -------- | ------------------------- | -------------- | ------------------------------------------------------------------------------ |
+| Task                     | Language | Dataset                   | Score          | Detail                                                                   |
+| ------------------------ | -------- | ------------------------- | -------------- | ------------------------------------------------------------------------ |
 | Named Entity Recognition | Chinese  | People's Daily Ner Corpus | **92.20** (F1) | [基于 BERT 的中文命名实体识别](https://eliyar.biz/nlp_chinese_bert_ner/) |
 
 ## Roadmap
+
 * [ ] **[Migrate to tf.keras](https://github.com/BrikerMan/Kashgari/issues/77)**
 * [ ] ELMo Embedding
 * [ ] Pre-trained models
@@ -160,7 +161,7 @@ from kashgari.embeddings import BERTEmbedding
 from kashgari.tasks.classification import CNNLSTMModel
 from kashgari.corpus import SMP2017ECDTClassificationCorpus
 
-bert_embedding = BERTEmbedding('bert-base-chinese', sequence_length=30)                                   
+bert_embedding = BERTEmbedding('<bert-model-folder>', sequence_length=30)                                   
 model = CNNLSTMModel(bert_embedding)
 
 train_x, train_y = SMP2017ECDTClassificationCorpus.get_classification_data()

diff --git a/kashgari/__init__.py b/kashgari/__init__.py
@@ -19,6 +19,7 @@
 
 from kashgari.macros import config
 
+from kashgari.version import __version__
 
 if __name__ == "__main__":
     print("Hello world")
diff --git a/kashgari/embeddings/embeddings.py b/kashgari/embeddings/embeddings.py
@@ -293,6 +293,30 @@ class BERTEmbedding(BaseEmbedding):
                                    'chinese_L-12_H-768_A-12.zip',
     }
 
+
+    def __init__(self,
+                 name_or_path: str,
+                 sequence_length: int = None,
+                 embedding_size: int = None,
+                 layer_nums: int = 4,
+                 trainable: bool = False,
+                 **kwargs,):
+        """
+        init a BERTEmbedding
+        :param name_or_path: pre-trained model name such as `bert-base-chinese`, or path to a BERT model folder
+        :param sequence_length: length of max sequence, all embedding is shaped as (sequence_length, embedding_size)
+        :param embedding_size: embedding vector size, only need to set when using a CustomEmbedding
+        :param layer_nums: number of layers whose outputs will be concatenated as a single output.
+                           default `4`, the last 4 hidden layers
+        :param trainable: whether the output feature layer is trainable; defaults to `False`, set it to `True` to fine-tune
+        :param kwargs: kwargs to pass to the method, func: `BaseEmbedding.build`
+        """
+        self.layer_nums = layer_nums
+        self.trainable = trainable
+        self.training = False # We do not need to train the whole bert model so set it to `False`
+        super(BERTEmbedding, self).__init__(name_or_path, sequence_length, embedding_size, **kwargs)
+
+
     def build(self):
         self.embedding_type = 'bert'
         url = self.pre_trained_models.get(self.model_key_map.get(self.name, self.name))
@@ -305,23 +329,26 @@ def build(self):
         logging.info('loading bert model from {}\n'.format(self.model_path))
         model = keras_bert.load_trained_model_from_checkpoint(config_path,
                                                               check_point_path,
-                                                              seq_len=self.sequence_length)
-        num_layers = len(model.layers)
-        features_layers = [model.get_layer(index=num_layers-1+idx*8).output\
-                            for idx in range(-3, 1)]
-        embedding_layer = concatenate(features_layers)
-        output_layer = NonMaskingLayer()(embedding_layer)
-        #output_layer = NonMaskingLayer()(model.output)
+                                                              seq_len=self.sequence_length,
+                                                              output_layer_num=self.layer_nums,
+                                                              training=self.training,
+                                                              trainable=self.trainable
+                                                              )
+        #num_layers = len(model.layers)
+        #features_layers = [model.get_layer(index=num_layers-1+idx*8).output\
+        #                    for idx in range(-3, 1)]
+        #embedding_layer = concatenate(features_layers)
+        #output_layer = NonMaskingLayer()(embedding_layer)
+        output_layer = NonMaskingLayer()(model.output)
         self._model = Model(model.inputs, output_layer)
 
         self.embedding_size = self.model.output_shape[-1]
         dict_path = os.path.join(self.model_path, 'vocab.txt')
         word2idx = {}
         with open(dict_path, 'r', encoding='utf-8') as f:
             words = f.read().splitlines()
-        for idx, word in enumerate(words):
-            word2idx[word] = idx
-            #word2idx[word] = len(word2idx)
+        for word in words:
+            word2idx[word] = len(word2idx)
         for key, value in self.special_tokens.items():
             word2idx[key] = word2idx[value]
 

diff --git a/version.py → kashgari/version.py b/version.py → kashgari/version.py
@@ -11,4 +11,4 @@
 
 """
 
-__version__ = '0.2.3'
+__version__ = '0.2.4'
diff --git a/setup.py b/setup.py
@@ -11,11 +11,29 @@
 @time: 2019-01-24 16:42
 
 """
+import os
+import re
+import codecs
 import pathlib
 
 from setuptools import find_packages, setup
 
-from version import __version__
+HERE = pathlib.Path(__file__).parent
+
+
+def read(*parts):
+    with codecs.open(os.path.join(HERE, *parts), 'r') as fp:
+        return fp.read()
+
+
+def find_version(*file_paths):
+    version_file = read(*file_paths)
+    version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]",
+                              version_file, re.M)
+    if version_match:
+        return version_match.group(1)
+    raise RuntimeError("Unable to find version string.")
+
 
 # Package meta-data.
 NAME = 'kashgari'
@@ -28,13 +46,14 @@
 AUTHOR = 'BrikerMan'
 LICENSE = 'Apache License 2.0'
 
-HERE = pathlib.Path(__file__).parent
 README = (HERE / "README.md").read_text()
 
+__version__ = find_version('kashgari', 'version.py')
+
 required = [
     'Keras>=2.2.0',
     'h5py>=2.7.1',
-    'keras-bert==0.41.0',
+    'keras-bert==0.57.1',
     'scikit-learn>=0.19.1',
     'numpy>=1.14.3',
     'download>=0.3.3',
@@ -44,7 +63,7 @@
     # 'bz2file>=0.98',
     'sklearn',
     'pandas>=0.23.0',
-    'keras-gpt-2==0.7.0'
+    'keras-gpt-2==0.11.1'
 ]
 
 # long_description = ""
Original file line number	Diff line number	Diff line change
Expand Up		@@ -11,4 +11,4 @@

		"""

		-__version__ = '0.2.3'
		+__version__ = '0.2.4'