Skip to content

Commit

Permalink
Merge pull request #107 from BrikerMan/develop
Browse files Browse the repository at this point in the history
release v0.2.4
  • Loading branch information
BrikerMan authored Jun 6, 2019
2 parents 1a281bf + 929de04 commit 90fcd52
Show file tree
Hide file tree
Showing 6 changed files with 72 additions and 18 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
## release v0.2.4

* Add fine-tuning support for the BERT output feature layer. Discussion: #103
* Add selection of the number of BERT output feature layers (default 4, following the BERT paper).
* Fix BERT embedding token index offset issue #104.

## release v0.1.4

* fix classification model evaluate result output
Expand Down
7 changes: 4 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,12 @@ Kashgari is:

## Performance

| Task | Language | Dataset | Score | Detail |
| ------------------------ | -------- | ------------------------- | -------------- | ------------------------------------------------------------------------------ |
| Task | Language | Dataset | Score | Detail |
| ------------------------ | -------- | ------------------------- | -------------- | ------------------------------------------------------------------------ |
| Named Entity Recognition | Chinese | People's Daily Ner Corpus | **92.20** (F1) | [基于 BERT 的中文命名实体识别](https://eliyar.biz/nlp_chinese_bert_ner/) |

## Roadmap

* [ ] **[Migrate to tf.keras](https://github.com/BrikerMan/Kashgari/issues/77)**
* [ ] ELMo Embedding
* [ ] Pre-trained models
Expand Down Expand Up @@ -160,7 +161,7 @@ from kashgari.embeddings import BERTEmbedding
from kashgari.tasks.classification import CNNLSTMModel
from kashgari.corpus import SMP2017ECDTClassificationCorpus

bert_embedding = BERTEmbedding('bert-base-chinese', sequence_length=30)
bert_embedding = BERTEmbedding('<bert-model-folder>', sequence_length=30)
model = CNNLSTMModel(bert_embedding)

train_x, train_y = SMP2017ECDTClassificationCorpus.get_classification_data()
Expand Down
1 change: 1 addition & 0 deletions kashgari/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

from kashgari.macros import config

from kashgari.version import __version__

if __name__ == "__main__":
print("Hello world")
47 changes: 37 additions & 10 deletions kashgari/embeddings/embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,30 @@ class BERTEmbedding(BaseEmbedding):
'chinese_L-12_H-768_A-12.zip',
}


def __init__(self,
             name_or_path: str,
             sequence_length: int = None,
             embedding_size: int = None,
             layer_nums: int = 4,
             trainable: bool = False,
             **kwargs):
    """
    init a BERTEmbedding
    :param name_or_path: name of a pre-trained BERT model, or path to a BERT model folder
    :param sequence_length: length of max sequence, all embedding is shaped as (sequence_length, embedding_size)
    :param embedding_size: embedding vector size, forwarded to the base class unchanged
    :param layer_nums: number of layers whose outputs will be concatenated as a single output.
                       default `4`, the last 4 hidden layers
    :param trainable: whether the output feature layer is trainable, default `False`;
                      set it to `True` for fine-tuning
    :param kwargs: kwargs to pass to the method, func: `BaseEmbedding.build`
    """
    # These attributes are assigned before the base initializer runs.
    # NOTE(review): presumably `BaseEmbedding.__init__` triggers `build()`,
    # which reads them -- confirm before reordering.
    self.layer_nums = layer_nums
    self.trainable = trainable
    # We do not need to train the whole bert model so set it to `False`
    self.training = False
    super(BERTEmbedding, self).__init__(name_or_path, sequence_length, embedding_size, **kwargs)


def build(self):
self.embedding_type = 'bert'
url = self.pre_trained_models.get(self.model_key_map.get(self.name, self.name))
Expand All @@ -305,23 +329,26 @@ def build(self):
logging.info('loading bert model from {}\n'.format(self.model_path))
model = keras_bert.load_trained_model_from_checkpoint(config_path,
check_point_path,
seq_len=self.sequence_length)
num_layers = len(model.layers)
features_layers = [model.get_layer(index=num_layers-1+idx*8).output\
for idx in range(-3, 1)]
embedding_layer = concatenate(features_layers)
output_layer = NonMaskingLayer()(embedding_layer)
#output_layer = NonMaskingLayer()(model.output)
seq_len=self.sequence_length,
output_layer_num=self.layer_nums,
training=self.training,
trainable=self.trainable
)
#num_layers = len(model.layers)
#features_layers = [model.get_layer(index=num_layers-1+idx*8).output\
# for idx in range(-3, 1)]
#embedding_layer = concatenate(features_layers)
#output_layer = NonMaskingLayer()(embedding_layer)
output_layer = NonMaskingLayer()(model.output)
self._model = Model(model.inputs, output_layer)

self.embedding_size = self.model.output_shape[-1]
dict_path = os.path.join(self.model_path, 'vocab.txt')
word2idx = {}
with open(dict_path, 'r', encoding='utf-8') as f:
words = f.read().splitlines()
for idx, word in enumerate(words):
word2idx[word] = idx
#word2idx[word] = len(word2idx)
for word in words:
word2idx[word] = len(word2idx)
for key, value in self.special_tokens.items():
word2idx[key] = word2idx[value]

Expand Down
2 changes: 1 addition & 1 deletion version.py → kashgari/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@
"""

__version__ = '0.2.3'
__version__ = '0.2.4'
27 changes: 23 additions & 4 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,29 @@
@time: 2019-01-24 16:42
"""
import os
import re
import codecs
import pathlib

from setuptools import find_packages, setup

from version import __version__
HERE = pathlib.Path(__file__).parent


def read(*parts):
    """Return the text content of a file located relative to ``HERE``.

    :param parts: path components joined onto ``HERE`` to locate the file
    :return: the whole file content as a ``str``
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(os.path.join(HERE, *parts), 'r', encoding='utf-8') as fp:
        return fp.read()


def find_version(*file_paths):
    """Extract the ``__version__`` string from a source file.

    The file is loaded through ``read`` and scanned for a line of the form
    ``__version__ = '<value>'`` (single or double quotes).

    :param file_paths: path components handed to ``read``
    :return: the text between the quotes of the first matching line
    :raises RuntimeError: when no such assignment is found
    """
    source = read(*file_paths)
    pattern = r"^__version__ = ['\"]([^'\"]*)['\"]"
    found = re.search(pattern, source, re.M)
    if found is None:
        raise RuntimeError("Unable to find version string.")
    return found.group(1)


# Package meta-data.
NAME = 'kashgari'
Expand All @@ -28,13 +46,14 @@
AUTHOR = 'BrikerMan'
LICENSE = 'Apache License 2.0'

HERE = pathlib.Path(__file__).parent
README = (HERE / "README.md").read_text()

__version__ = find_version('kashgari', 'version.py')

required = [
'Keras>=2.2.0',
'h5py>=2.7.1',
'keras-bert==0.41.0',
'keras-bert==0.57.1',
'scikit-learn>=0.19.1',
'numpy>=1.14.3',
'download>=0.3.3',
Expand All @@ -44,7 +63,7 @@
# 'bz2file>=0.98',
'sklearn',
'pandas>=0.23.0',
'keras-gpt-2==0.7.0'
'keras-gpt-2==0.11.1'
]

# long_description = ""
Expand Down

0 comments on commit 90fcd52

Please sign in to comment.