diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 7cb88b34..fc67a068 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -19,17 +19,23 @@ jobs:
         python-version: 3.8
     - name: Install deps
       run: |
+        python -m pip install --upgrade pip
+        python scripts/install_tf.py 2.2
+        python scripts/install_addons.py 2.2
         pip install -r requirements.dev.txt
+        pip install -r requirements.txt
     - name: Run lint script
       run: sh ./scripts/lint.sh
+
   test:
     if: always()
     name: "Test with TF ${{ matrix.tensorflow_version }} - ${{ matrix.group }}"
+    needs: lint
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        group: [ 1, 2, 3, 4, 5, 6 ]
-        tensorflow_version: [ 2.2.0, 2.3.0 ]
+        group: [ 1, 2, 3 ]
+        tensorflow_version: [2.2, 2.3, 2.4, 2.5]
     steps:
     - uses: actions/checkout@v2
     - name: Set up Python 3.8
@@ -39,8 +45,8 @@ jobs:
     - name: Install deps
       run: |
         python -m pip install --upgrade pip
-        pip install tensorflow==${{ matrix.tensorflow_version }}
-        python scripts/install_addons.py
+        python scripts/install_tf.py '${{ matrix.tensorflow_version }}'
+        python scripts/install_addons.py '${{ matrix.tensorflow_version }}'
         pip install -r requirements.dev.txt
         pip install -r requirements.txt

@@ -53,7 +59,7 @@ jobs:
           --cov-report term
           --cov-config .coveragerc
           --cov
-          --splits 6
+          --splits 3
           --group ${{ matrix.group }}
           tests/'
diff --git a/README.md b/README.md
index 638a4678..da651d37 100644
--- a/README.md
+++ b/README.md
@@ -97,6 +97,9 @@ Here is a set of quick tutorials to get you started with the library:

 There are also articles and posts that illustrate how to use Kashgari:

+- [基于 Kashgari 2 的短文本分类: 数据分析和预处理](https://eliyar.biz/nlp/short_text_classificaion_with_kashgari_v2_part_1)
+- [基于 Kashgari 2 的短文本分类: 训练模型和调优](https://eliyar.biz/nlp/short_text_classificaion_with_kashgari_v2_part_2)
+- [基于 Kashgari 2 的短文本分类: 模型部署](https://eliyar.biz/nlp/short_text_classificaion_with_kashgari_v2_part_3)
 - [15 分钟搭建中文文本分类模型](https://eliyar.biz/nlp_chinese_text_classification_in_15mins/)
 - [基于 BERT 的中文命名实体识别(NER)](https://eliyar.biz/nlp_chinese_bert_ner/)
 - [BERT/ERNIE 文本分类和部署](https://eliyar.biz/nlp_train_and_deploy_bert_text_classification/)
diff --git a/docs/about/release-notes.md b/docs/about/release-notes.md
index 546c24a3..4d0f09a3 100644
--- a/docs/about/release-notes.md
+++ b/docs/about/release-notes.md
@@ -17,6 +17,8 @@ pip show kashgari

 ## Current Release

+### [2.0.2] - 2020.11.18
+- 🐛 Fixed Custom Model load issue.
 ### [2.0.1] - 2020.10.28

 - ✨ Add `convert_to_saved_model` API for tf-serving use case.
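The CI matrix now passes a bare `major.minor` string (for example `2.4`) to the two install scripts instead of pinning an exact TensorFlow release. A minimal sketch of the pip command this is expected to produce for one matrix entry, mirroring the logic of `scripts/install_tf.py` added later in this patch; the `2.4` value is only an illustrative input, not part of the workflow itself:

    # Sketch: mirrors the version-range logic of scripts/install_tf.py (added below).
    # '2.4' stands in for one ${{ matrix.tensorflow_version }} entry.
    tf_arg = '2.4'
    major, minor = tf_arg.split('.')
    command = (
        f"pip install 'tensorflow>={major}.{minor}.0,"
        f"<{major}.{int(minor) + 1}.0'"
    )
    print(command)  # -> pip install 'tensorflow>=2.4.0,<2.5.0'
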
diff --git a/kashgari/__init__.py b/kashgari/__init__.py
index 2509f016..f7d80513 100644
--- a/kashgari/__init__.py
+++ b/kashgari/__init__.py
@@ -12,25 +12,41 @@
 """
 import os
-from typing import Dict, Any
+from distutils.version import LooseVersion
+from typing import Any, Dict

-os.environ['TF_KERAS'] = '1'
-os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+os.environ["TF_KERAS"] = "1"
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

 custom_objects: Dict[str, Any] = {}

-from kashgari.__version__ import __version__
-from kashgari.macros import config
-from kashgari import layers
-from kashgari import corpus
-from kashgari import embeddings
-from kashgari import macros
-from kashgari import processors
-from kashgari import tasks
-from kashgari import utils
-from kashgari.utils.dependency_check import dependency_check
+
+def check_tfa_version(tf_version: str) -> str:
+    if LooseVersion(tf_version) < "2.2.0":
+        return "0.9.1"
+    elif LooseVersion(tf_version) < "2.3.0":
+        return "0.11.2"
+    else:
+        return "0.13.0"
+
+
+def dependency_check() -> None:
+    import tensorflow as tf
+
+    tfa_version = check_tfa_version(tf_version=tf.__version__)
+    try:
+        import tensorflow_addons as tfa
+    except ImportError:
+        raise ImportError(
+            "Kashgari requires tensorflow_addons, please install it via "
+            f"`$ pip install tensorflow_addons=={tfa_version}`"
+        )
+

-custom_objects = layers.resigter_custom_layers(custom_objects)
 dependency_check()
+
+from kashgari import corpus, embeddings, layers, macros, processors, tasks, utils
+from kashgari.__version__ import __version__
+from kashgari.macros import config
+
+custom_objects = layers.resigter_custom_layers(custom_objects)
diff --git a/kashgari/__version__.py b/kashgari/__version__.py
index c77c520a..74c65ca9 100644
--- a/kashgari/__version__.py
+++ b/kashgari/__version__.py
@@ -7,4 +7,4 @@
 # file: __version__.py.py
 # time: 2019-05-20 16:32

-__version__ = '2.0.1'
+__version__ = '2.0.2'
diff --git a/kashgari/callbacks/save_callback.py b/kashgari/callbacks/save_callback.py
new file mode 100644
index 00000000..c9f3bd9a
--- /dev/null
+++ b/kashgari/callbacks/save_callback.py
@@ -0,0 +1,103 @@
+import os
+import numpy as np
+from typing import Union, Any, AnyStr
+
+import tensorflow as tf
+from kashgari.tasks.abs_task_model import ABCTaskModel
+from kashgari.logger import logger
+
+
+class KashgariModelCheckpoint(tf.keras.callbacks.ModelCheckpoint):
+    """Save the model after every epoch.
+
+    Arguments:
+        filepath: string, path to save the model file.
+        monitor: quantity to monitor.
+        verbose: verbosity mode, 0 or 1.
+        save_best_only: if `save_best_only=True`, the latest best model according
+            to the quantity monitored will not be overwritten.
+        mode: one of {auto, min, max}. If `save_best_only=True`, the decision to
+            overwrite the current save file is made based on either the maximization
+            or the minimization of the monitored quantity. For `val_acc`, this
+            should be `max`, for `val_loss` this should be `min`, etc. In `auto`
+            mode, the direction is automatically inferred from the name of the
+            monitored quantity.
+        save_weights_only: if True, then only the model's weights will be saved
+            (`model.save_weights(filepath)`), else the full model is saved
+            (`model.save(filepath)`).
+        save_freq: `'epoch'` or integer. When using `'epoch'`, the callback saves
+            the model after each epoch. When using an integer, the callback saves the
+            model at the end of a batch at which this many samples have been seen since
+            last saving. Note that if the saving isn't aligned to epochs, the
+            monitored metric may potentially be less reliable (it could reflect as
+            little as 1 batch, since the metrics get reset every epoch). Defaults to
+            `'epoch'`.
+        **kwargs: Additional arguments for backwards compatibility. Possible key
+            is `period`.
+    """
+
+    def __init__(self,
+                 filepath: AnyStr,
+                 monitor: str = 'val_loss',
+                 verbose: int = 1,
+                 save_best_only: bool = False,
+                 save_weights_only: bool = False,
+                 mode: str = 'auto',
+                 save_freq: Union[str, int] = 'epoch',
+                 kash_model: ABCTaskModel = None,
+                 **kwargs: Any) -> None:
+        super(KashgariModelCheckpoint, self).__init__(
+            filepath=filepath,
+            monitor=monitor,
+            verbose=verbose,
+            save_best_only=save_best_only,
+            save_weights_only=save_weights_only,
+            mode=mode,
+            save_freq=save_freq,
+            **kwargs)
+        self.kash_model = kash_model
+
+    def _save_model(self, epoch: int, logs: dict) -> None:
+        """Saves the model.
+
+        Arguments:
+            epoch: the epoch this iteration is in.
+            logs: the `logs` dict passed in to `on_batch_end` or `on_epoch_end`.
+        """
+        logs = logs or {}
+
+        if isinstance(self.save_freq,
+                      int) or self.epochs_since_last_save >= self.period:
+            self.epochs_since_last_save: int = 0
+            filepath = self._get_file_path(epoch, logs)
+
+            if self.save_best_only:
+                current = logs.get(self.monitor)
+                if current is None:
+                    logger.warning('Can save best model only with %s available, skipping.', self.monitor)
+                else:
+                    if self.monitor_op(current, self.best):
+                        if self.verbose > 0:
+                            print('\nEpoch %d: %s improved from %0.5f to %0.5f,'
+                                  ' saving model to %s' % (epoch + 1, self.monitor, self.best,
+                                                           current, filepath))
+                        self.best: float = current
+                        if self.save_weights_only:
+                            filepath = os.path.join(filepath, 'cp')
+                            self.model.save_weights(filepath, overwrite=True)
+                            logger.info(f'checkpoint saved to {filepath}')
+                        else:
+                            self.kash_model.save(filepath)
+                    else:
+                        if self.verbose > 0:
+                            print('\nEpoch %d: %s did not improve from %0.5f' %
+                                  (epoch + 1, self.monitor, self.best))
+            else:
+                if self.verbose > 0:
+                    print('\nEpoch %d: saving model to %s' % (epoch + 1, filepath))
+                if self.save_weights_only:
+                    filepath = os.path.join(filepath, 'cp')
+                    self.model.save_weights(filepath, overwrite=True)
+                    logger.info(f'checkpoint saved to {filepath}')
+                else:
+                    self.kash_model.save(filepath)
+
+            self._maybe_remove_file()
diff --git a/kashgari/metrics/sequence_labeling.py b/kashgari/metrics/sequence_labeling.py
index 4299966d..8ed5b434 100644
--- a/kashgari/metrics/sequence_labeling.py
+++ b/kashgari/metrics/sequence_labeling.py
@@ -320,8 +320,8 @@ def sequence_labeling_report(y_true: List[List[str]],
     pred_entities = set(bulk_get_entities(y_pred, suffix=suffix))

     name_width = 0
-    d1 = defaultdict(set)
-    d2 = defaultdict(set)
+    d1: Dict = defaultdict(set)
+    d2: Dict = defaultdict(set)
     for e in true_entities:
         d1[e[0]].add((e[1], e[2]))
         name_width = max(name_width, len(e[0]))
diff --git a/kashgari/tasks/abs_task_model.py b/kashgari/tasks/abs_task_model.py
index c85e7847..c9008587 100644
--- a/kashgari/tasks/abs_task_model.py
+++ b/kashgari/tasks/abs_task_model.py
@@ -11,20 +11,20 @@
 import os
 import pathlib
 from abc import ABC, abstractmethod
-from typing import Dict, Any, TYPE_CHECKING, Union
+from typing import TYPE_CHECKING, Any, Dict, Union

 import tensorflow as tf

 import kashgari
 from kashgari.embeddings import ABCEmbedding
+from kashgari.layers import KConditionalRandomField
 from kashgari.logger import logger
 from kashgari.processors.abc_processor import ABCProcessor
 from kashgari.utils import load_data_object
-from kashgari.layers import KConditionalRandomField

 if TYPE_CHECKING:
-    from kashgari.tasks.labeling import ABCLabelingModel
     from kashgari.tasks.classification import ABCClassificationModel
+    from kashgari.tasks.labeling import ABCLabelingModel


 class ABCTaskModel(ABC):
@@ -76,11 +76,11 @@ def default_hyper_parameters(cls) -> Dict[str, Dict[str, Any]]:
         """
         raise NotImplementedError

-    def save(self, model_path: str) -> str:
+    def save(self, model_path: str, encoding: str = 'utf-8') -> str:
         pathlib.Path(model_path).mkdir(exist_ok=True, parents=True)
         model_path = os.path.abspath(model_path)

-        with open(os.path.join(model_path, 'model_config.json'), 'w', encoding='utf8') as f:
+        with open(os.path.join(model_path, 'model_config.json'), 'w', encoding=encoding) as f:
             f.write(json.dumps(self.to_dict(), indent=2, ensure_ascii=False))
             f.close()

@@ -90,14 +90,22 @@ def save(self, model_path: str) -> str:
         return model_path

     @classmethod
-    def load_model(cls, model_path: str) -> Union["ABCLabelingModel", "ABCClassificationModel"]:
+    def load_model(cls, model_path: str,
+                   custom_objects: Dict = None,
+                   encoding: str = 'utf-8') -> Union["ABCLabelingModel", "ABCClassificationModel"]:
+        if custom_objects is None:
+            custom_objects = {}
+
+        if cls.__name__ not in custom_objects:
+            custom_objects[cls.__name__] = cls
+
         model_config_path = os.path.join(model_path, 'model_config.json')
-        model_config = json.loads(open(model_config_path, 'r').read())
-        model = load_data_object(model_config)
+        model_config = json.loads(open(model_config_path, 'r', encoding=encoding).read())
+        model = load_data_object(model_config, custom_objects)

-        model.embedding = load_data_object(model_config['embedding'])
-        model.text_processor = load_data_object(model_config['text_processor'])
-        model.label_processor = load_data_object(model_config['label_processor'])
+        model.embedding = load_data_object(model_config['embedding'], custom_objects)
+        model.text_processor = load_data_object(model_config['text_processor'], custom_objects)
+        model.label_processor = load_data_object(model_config['label_processor'], custom_objects)

         tf_model_str = json.dumps(model_config['tf_model'])

@@ -105,7 +113,7 @@ def load_model(cls, model_path: str) -> Union["ABCLabelingModel", "ABCClassifica
                                                custom_objects=kashgari.custom_objects)

         if isinstance(model.tf_model.layers[-1], KConditionalRandomField):
-            model.layer_crf = model.tf_model.layers[-1]
+            model.crf_layer = model.tf_model.layers[-1]

         model.tf_model.load_weights(os.path.join(model_path, 'model_weights.h5'))
         model.embedding.embed_model.load_weights(os.path.join(model_path, 'embed_model_weights.h5'))
diff --git a/kashgari/tokenizers/bert_tokenizer.py b/kashgari/tokenizers/bert_tokenizer.py
index d3b91b3d..25b2f6f6 100644
--- a/kashgari/tokenizers/bert_tokenizer.py
+++ b/kashgari/tokenizers/bert_tokenizer.py
@@ -98,7 +98,7 @@ def _tokenize(self, text: str) -> List[str]:
                 spaced += ch

         if len(self._token_dict) > 0:
-            tokens = []
+            tokens: List[str] = []
             for word in spaced.strip().split():
                 tokens += self._word_piece_tokenize(word)
             return tokens
diff --git a/kashgari/utils/dependency_check.py b/kashgari/utils/dependency_check.py
deleted file mode 100644
index 1b69525e..00000000
--- a/kashgari/utils/dependency_check.py
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-# Author : BrikerMan
-# Site : https://eliyar.biz
-
-# Time : 2020/9/2 12:12 下午
-# File : dependency_check.py
-# Project : Kashgari
-
-import tensorflow as tf
-
-from distutils.version import LooseVersion
-
-
-def dependency_check() -> None:
-    if LooseVersion(tf.__version__) < '2.2.0':
-        try:
-            import tensorflow_addons as tfa
-            if LooseVersion(tfa.__version__) > '0.10.0':
-                raise ImportError("TF 2.1 required lower version of tensorflow_addons, "
-                                  "install using `$pip install tensorflow_addons<=0.10.0`")
-        except ImportError:
-            raise ImportError("TF 2.1 required lower version of tensorflow_addons, "
-                              "install using `$pip install tensorflow_addons<=0.10.0`")
diff --git a/kashgari/utils/serialize.py b/kashgari/utils/serialize.py
index a8897e02..12668c91 100644
--- a/kashgari/utils/serialize.py
+++ b/kashgari/utils/serialize.py
@@ -11,18 +11,27 @@ from typing import Dict, Any


-def load_data_object(data: Dict, **kwargs: Dict) -> Any:
+def load_data_object(data: Dict,
+                     custom_objects: Dict = None,
+                     **kwargs: Dict) -> Any:
     """
     Load Object From Dict

     Args:
         data:
+        custom_objects:
         **kwargs:

     Returns:

     """
-    module_name = f"{data['__module__']}.{data['__class_name__']}"
-    obj: Any = pydoc.locate(module_name)(**data['config'], **kwargs)  # type: ignore
+    if custom_objects is None:
+        custom_objects = {}
+
+    if data['__class_name__'] in custom_objects:
+        obj: Any = custom_objects[data['__class_name__']](**data['config'], **kwargs)
+    else:
+        module_name = f"{data['__module__']}.{data['__class_name__']}"
+        obj: Any = pydoc.locate(module_name)(**data['config'], **kwargs)  # type: ignore

     if hasattr(obj, '_override_load_model'):
         obj._override_load_model(data)
diff --git a/requirements.dev.txt b/requirements.dev.txt
index 8bc9b794..69169687 100644
--- a/requirements.dev.txt
+++ b/requirements.dev.txt
@@ -1,8 +1,8 @@
 # test & coverage
-flake8
+flake8==3.8.4
 flake8-builtins
-mypy
-pytest>=5.4.3
+mypy==0.790
+pytest==5.4.3
 pytest-cov
 pytest-split
 coveralls
diff --git a/scripts/install_addons.py b/scripts/install_addons.py
index 40e89e4c..0491ba48 100644
--- a/scripts/install_addons.py
+++ b/scripts/install_addons.py
@@ -1,25 +1,18 @@
 import os
-from distutils.version import LooseVersion
-from importlib.metadata import version
+import sys

-tf_version = LooseVersion(version('tensorflow'))
-
-print(f'TF version: {tf_version}')
-
-addons_version = ''
+tf_version = str(sys.argv[1])

 # TF 2.0, 2.1
-if tf_version < LooseVersion('2.2.0'):
+if tf_version in ['2.0', '2.1']:
     addons_version = '0.9.1'

 # TF 2.2
-elif tf_version < LooseVersion('2.3.0'):
+elif tf_version == '2.2':
     addons_version = '0.11.2'

 # TF 2.3+
-elif tf_version < LooseVersion('2.6.0'):
+elif tf_version in ['2.3', '2.4', '2.5']:
     addons_version = '0.13.0'
-else:
-    print(f'New Version, {tf_version}.')

 if addons_version:
     print(f'Should Install tensorflow-addons=={addons_version}')
-    os.system(f"pip install tensorflow-addons=={addons_version}")
+    os.system(f"pip install 'tensorflow-addons=={addons_version}'")
diff --git a/scripts/install_tf.py b/scripts/install_tf.py
new file mode 100644
index 00000000..9c647051
--- /dev/null
+++ b/scripts/install_tf.py
@@ -0,0 +1,12 @@
+import os
+import sys
+
+tf_args = str(sys.argv[1])
+major_version, minor_version = tf_args.split('.')
+
+command = (
+    f"pip install 'tensorflow>={major_version}.{minor_version}.0,"
+    f"<{major_version}.{int(minor_version)+1}.0'"
+)
+print(command)
+os.system(command)
diff --git a/tests/test_classification/test_custom_model.py b/tests/test_classification/test_custom_model.py
new file mode 100644
index 00000000..d6b08203
--- /dev/null
+++ b/tests/test_classification/test_custom_model.py
@@ -0,0 +1,67 @@
+# encoding: utf-8
+
+# author: BrikerMan
+# contact: eliyar917@gmail.com
+# blog: https://eliyar.biz
+
+# file: test_custom_model.py
+# time: 6:07 下午
+
+import unittest
+from typing import Dict, Any
+
+from tensorflow import keras
+
+import tests.test_classification.test_bi_lstm_model as base
+from kashgari.embeddings import WordEmbedding
+from kashgari.layers import L
+from kashgari.tasks.classification.abc_model import ABCClassificationModel
+from tests.test_macros import TestMacros
+
+
+class Double_BiLSTM_Model(ABCClassificationModel):
+    @classmethod
+    def default_hyper_parameters(cls) -> Dict[str, Any]:
+        return {
+            'layer_lstm1': {
+                'units': 128,
+                'return_sequences': True
+            },
+            'layer_lstm2': {
+                'units': 64,
+                'return_sequences': False
+            },
+            'layer_dropout': {
+                'rate': 0.5
+            },
+            'layer_output': {
+
+            }
+        }
+
+    def build_model_arc(self) -> None:
+        config = self.hyper_parameters
+        output_dim = self.label_processor.vocab_size
+        embed_model = self.embedding.embed_model
+
+        # Define the model architecture
+        self.tf_model = keras.Sequential([
+            embed_model,
+            L.Bidirectional(L.LSTM(**config['layer_lstm1'])),
+            L.Bidirectional(L.LSTM(**config['layer_lstm2'])),
+            L.Dropout(**config['layer_dropout']),
+            L.Dense(output_dim, **config['layer_output']),
+            self._activation_layer()
+        ])
+
+
+class TestCustom_Model(base.TestBiLSTM_Model):
+    @classmethod
+    def setUpClass(cls):
+        cls.EPOCH_COUNT = 1
+        cls.TASK_MODEL_CLASS = Double_BiLSTM_Model
+        cls.w2v_embedding = WordEmbedding(TestMacros.w2v_path)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_utils.py b/tests/test_utils.py
index bbee710b..511b7573 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -8,7 +8,7 @@
 # time: 10:48 上午

 import unittest
-import numpy as np

+import numpy as np
 from kashgari.utils import unison_shuffled_copies
 from kashgari.utils import get_list_subset
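Taken together, the `custom_objects` plumbing above is what lets a user-defined model class such as `Double_BiLSTM_Model` be restored from disk. A minimal usage sketch, assuming the class is defined or imported as in the new test file and that `saved_custom_model` is a placeholder directory produced by an earlier `model.save(...)` call:

    from kashgari.tasks.classification.abc_model import ABCClassificationModel

    # Option 1: load via the subclass itself; load_model() now registers
    # cls under cls.__name__ in custom_objects automatically.
    model = Double_BiLSTM_Model.load_model('saved_custom_model')

    # Option 2: load via a base class and pass the custom class explicitly,
    # which load_data_object() looks up by the saved __class_name__.
    model = ABCClassificationModel.load_model(
        'saved_custom_model',
        custom_objects={'Double_BiLSTM_Model': Double_BiLSTM_Model},
    )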