Merge pull request #470 from BrikerMan/v2-dev

V2 dev
BrikerMan · Jul 4, 2021 · 2a4433b · 2a4433b
2 parents 3dfc846 + e4d4db4
commit 2a4433b
Show file tree

Hide file tree

Showing 16 changed files with 274 additions and 79 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -19,17 +19,23 @@ jobs:
           python-version: 3.8
       - name: Install deps
         run: |
+          python -m pip install --upgrade pip
+          python scripts/install_tf.py 2.2
+          python scripts/install_addons.py 2.2
           pip install -r requirements.dev.txt
+          pip install -r requirements.txt
       - name: Run lint script
         run: sh ./scripts/lint.sh
+
   test:
     if: always()
     name: "Test with TF ${{ matrix.tensorflow_version }} - ${{ matrix.group }}"
+    needs: lint
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        group: [ 1, 2, 3, 4, 5, 6 ]
-        tensorflow_version: [ 2.2.0, 2.3.0 ]
+        group: [ 1, 2, 3 ]
+        tensorflow_version: [2.2, 2.3, 2.4, 2.5]
     steps:
       - uses: actions/checkout@v2
       - name: Set up Python 3.8
@@ -39,8 +45,8 @@ jobs:
       - name: Install deps
         run: |
           python -m pip install --upgrade pip
-          pip install tensorflow==${{ matrix.tensorflow_version }}
-          python scripts/install_addons.py
+          python scripts/install_tf.py '${{ matrix.tensorflow_version }}'
+          python scripts/install_addons.py '${{ matrix.tensorflow_version }}'
           pip install -r requirements.dev.txt
           pip install -r requirements.txt
 
@@ -53,7 +59,7 @@ jobs:
          --cov-report term
          --cov-config .coveragerc
          --cov
-         --splits 6
+         --splits 3
          --group ${{ matrix.group }}
          tests/'
 

diff --git a/README.md b/README.md
@@ -97,6 +97,9 @@ Here is a set of quick tutorials to get you started with the library:
 
 There are also articles and posts that illustrate how to use Kashgari:
 
+- [基于 Kashgari 2 的短文本分类: 数据分析和预处理](https://eliyar.biz/nlp/short_text_classificaion_with_kashgari_v2_part_1)
+- [基于 Kashgari 2 的短文本分类: 训练模型和调优](https://eliyar.biz/nlp/short_text_classificaion_with_kashgari_v2_part_2)
+- [基于 Kashgari 2 的短文本分类: 模型部署](https://eliyar.biz/nlp/short_text_classificaion_with_kashgari_v2_part_3)
 - [15 分钟搭建中文文本分类模型](https://eliyar.biz/nlp_chinese_text_classification_in_15mins/)
 - [基于 BERT 的中文命名实体识别（NER)](https://eliyar.biz/nlp_chinese_bert_ner/)
 - [BERT/ERNIE 文本分类和部署](https://eliyar.biz/nlp_train_and_deploy_bert_text_classification/)

diff --git a/docs/about/release-notes.md b/docs/about/release-notes.md
@@ -17,6 +17,8 @@ pip show kashgari
 
 ## Current Release
 
+### [2.0.2] - 2020.11.18
+
+- 🐛 Fixed Custom Model load issue.
+
 ### [2.0.1] - 2020.10.28
 
 - ✨ Add `convert_to_saved_model` API for tf-serving use case.

diff --git a/kashgari/__init__.py b/kashgari/__init__.py
@@ -12,25 +12,41 @@
 """
 
 import os
-from typing import Dict, Any
+from distutils.version import LooseVersion
+from typing import Any, Dict
 
-os.environ['TF_KERAS'] = '1'
-os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+os.environ["TF_KERAS"] = "1"
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
 
 custom_objects: Dict[str, Any] = {}
 
-from kashgari.__version__ import __version__
-from kashgari.macros import config
-from kashgari import layers
-from kashgari import corpus
-from kashgari import embeddings
-from kashgari import macros
-from kashgari import processors
-from kashgari import tasks
-from kashgari import utils
 
-from kashgari.utils.dependency_check import dependency_check
+def check_tfa_version(tf_version: str) -> str:
+    """Return the tensorflow_addons release to install for a TensorFlow version.
+
+    Args:
+        tf_version: TensorFlow version string, either full (``"2.3.0"``) or
+            short (``"2.3"``).
+
+    Returns:
+        ``"0.9.1"`` for TensorFlow older than 2.2, ``"0.11.2"`` for 2.2.x,
+        and ``"0.13.0"`` for anything newer.
+    """
+    version = LooseVersion(tf_version)
+    # Compare against two-component thresholds: LooseVersion compares the
+    # parsed components as lists, so "2.2" sorts *before* "2.2.0". Using
+    # "2.2"/"2.3" keeps short strings such as "2.2" in the correct bucket
+    # while giving the same answer as before for full "x.y.z" versions.
+    if version < LooseVersion("2.2"):
+        return "0.9.1"
+    if version < LooseVersion("2.3"):
+        return "0.11.2"
+    return "0.13.0"
+
+
+def dependency_check() -> None:
+    """Verify that ``tensorflow_addons`` can be imported.
+
+    TensorFlow is imported first so that the error message can name the
+    tensorflow_addons release selected by ``check_tfa_version`` for the
+    installed TensorFlow version.
+
+    Raises:
+        ImportError: If ``tensorflow_addons`` cannot be imported. The original
+            import error is attached as the cause.
+    """
+    import tensorflow as tf
+
+    tfa_version = check_tfa_version(tf_version=tf.__version__)
+    try:
+        import tensorflow_addons  # noqa: F401  (imported only to probe availability)
+    except ImportError as err:
+        # Catch ImportError only: a bare ``except`` would also convert
+        # KeyboardInterrupt/SystemExit and unrelated failures into a
+        # misleading "please install" message.
+        raise ImportError(
+            "Kashgari request tensorflow_addons, please install via the "
+            f"`$pip install tensorflow_addons=={tfa_version}`"
+        ) from err
 
-custom_objects = layers.resigter_custom_layers(custom_objects)
 
 dependency_check()
+
+from kashgari import corpus, embeddings, layers, macros, processors, tasks, utils
+from kashgari.__version__ import __version__
+from kashgari.macros import config
+
+custom_objects = layers.resigter_custom_layers(custom_objects)
diff --git a/kashgari/__version__.py b/kashgari/__version__.py
@@ -7,4 +7,4 @@
 # file: __version__.py.py
 # time: 2019-05-20 16:32
 
-__version__ = '2.0.1'
+__version__ = '2.0.2'
diff --git a/kashgari/callbacks/save_callback.py b/kashgari/callbacks/save_callback.py
@@ -0,0 +1,103 @@
+import os
+import numpy as np
+from typing import Union, Any, AnyStr
+
+import tensorflow as tf
+from kashgari.tasks.abs_task_model import ABCTaskModel
+from kashgari.logger import logger
+
+
+class KashgariModelCheckpoint(tf.keras.callbacks.ModelCheckpoint):
+    """Checkpoint callback that saves through a Kashgari task model.
+
+    Behaves like ``tf.keras.callbacks.ModelCheckpoint`` for deciding *when* to
+    save, but when a full save is requested it calls ``kash_model.save`` instead
+    of saving the bare Keras model.
+
+    Arguments:
+        filepath: string, path to save the model file.
+        monitor: quantity to monitor.
+        verbose: verbosity mode, 0 or 1.
+        save_best_only: if `save_best_only=True`, the latest best model according
+          to the quantity monitored will not be overwritten.
+        mode: one of {auto, min, max}. If `save_best_only=True`, the decision to
+          overwrite the current save file is made based on either the maximization
+          or the minimization of the monitored quantity. For `val_acc`, this
+          should be `max`, for `val_loss` this should be `min`, etc. In `auto`
+          mode, the direction is automatically inferred from the name of the
+          monitored quantity.
+        save_weights_only: if True, then only the model's weights are saved
+          (`model.save_weights(os.path.join(filepath, 'cp'))`), else the whole
+          Kashgari model is saved (`kash_model.save(filepath)`).
+        save_freq: `'epoch'` or integer. When using `'epoch'`, the callback saves
+          the model after each epoch. When using integer, the callback saves the
+          model at end of a batch at which this many samples have been seen since
+          last saving. Note that if the saving isn't aligned to epochs, the
+          monitored metric may potentially be less reliable (it could reflect as
+          little as 1 batch, since the metrics get reset every epoch). Defaults to
+          `'epoch'`
+        kash_model: the Kashgari task model to save. Required whenever
+          `save_weights_only` is False.
+        **kwargs: Additional arguments for backwards compatibility. Possible key
+          is `period`.
+    """
+
+    def __init__(self,
+                 filepath: AnyStr,
+                 monitor: str = 'val_loss',
+                 verbose: int = 1,
+                 save_best_only: bool = False,
+                 save_weights_only: bool = False,
+                 mode: str = 'auto',
+                 save_freq: Union[str, int] = 'epoch',
+                 kash_model: ABCTaskModel = None,
+                 **kwargs: Any) -> None:
+        super(KashgariModelCheckpoint, self).__init__(
+            filepath=filepath,
+            monitor=monitor,
+            verbose=verbose,
+            save_best_only=save_best_only,
+            save_weights_only=save_weights_only,
+            mode=mode,
+            save_freq=save_freq,
+            **kwargs)
+        self.kash_model = kash_model
+
+    def _write_checkpoint(self, filepath: str) -> None:
+        """Write either the weights or the full Kashgari model to ``filepath``.
+
+        Raises:
+            ValueError: If a full save is requested but no ``kash_model`` is set.
+        """
+        if self.save_weights_only:
+            # Weights go into a 'cp' entry below the checkpoint path.
+            filepath = os.path.join(filepath, 'cp')
+            self.model.save_weights(filepath, overwrite=True)
+            logger.info(f'checkpoint saved to {filepath}')
+            return
+
+        if self.kash_model is None:
+            # Fail with an explicit message instead of an AttributeError on None.
+            raise ValueError('`kash_model` is required when `save_weights_only` is False.')
+        self.kash_model.save(filepath)
+
+    def _save_model(self, epoch: int, logs: dict) -> None:
+        """Saves the model.
+        Arguments:
+            epoch: the epoch this iteration is in.
+            logs: the `logs` dict passed in to `on_batch_end` or `on_epoch_end`.
+        """
+        logs = logs or {}
+
+        save_due = isinstance(self.save_freq, int) or self.epochs_since_last_save >= self.period
+        if not save_due:
+            return
+
+        self.epochs_since_last_save: int = 0
+        filepath = self._get_file_path(epoch, logs)
+
+        if self.save_best_only:
+            current = logs.get(self.monitor)
+            if current is None:
+                logger.warning('Can save best model only with %s available, skipping.', self.monitor)
+            elif self.monitor_op(current, self.best):
+                if self.verbose > 0:
+                    print('\nEpoch %d: %s improved from %0.5f to %0.5f,'
+                          ' saving model to %s' % (epoch + 1, self.monitor, self.best,
+                                                   current, filepath))
+                self.best: float = current
+                self._write_checkpoint(filepath)
+            elif self.verbose > 0:
+                print('\nEpoch %d: %s did not improve from %0.5f' %
+                      (epoch + 1, self.monitor, self.best))
+        else:
+            if self.verbose > 0:
+                print('\nEpoch %d: saving model to %s' % (epoch + 1, filepath))
+            self._write_checkpoint(filepath)
+
+        self._maybe_remove_file()
diff --git a/kashgari/metrics/sequence_labeling.py b/kashgari/metrics/sequence_labeling.py
@@ -320,8 +320,8 @@ def sequence_labeling_report(y_true: List[List[str]],
     pred_entities = set(bulk_get_entities(y_pred, suffix=suffix))
 
     name_width = 0
-    d1 = defaultdict(set)
-    d2 = defaultdict(set)
+    d1: Dict = defaultdict(set)
+    d2: Dict = defaultdict(set)
     for e in true_entities:
         d1[e[0]].add((e[1], e[2]))
         name_width = max(name_width, len(e[0]))

diff --git a/kashgari/tasks/abs_task_model.py b/kashgari/tasks/abs_task_model.py
@@ -11,20 +11,20 @@
 import os
 import pathlib
 from abc import ABC, abstractmethod
-from typing import Dict, Any, TYPE_CHECKING, Union
+from typing import TYPE_CHECKING, Any, Dict, Union
 
 import tensorflow as tf
 
 import kashgari
 from kashgari.embeddings import ABCEmbedding
+from kashgari.layers import KConditionalRandomField
 from kashgari.logger import logger
 from kashgari.processors.abc_processor import ABCProcessor
 from kashgari.utils import load_data_object
-from kashgari.layers import KConditionalRandomField
 
 if TYPE_CHECKING:
-    from kashgari.tasks.labeling import ABCLabelingModel
     from kashgari.tasks.classification import ABCClassificationModel
+    from kashgari.tasks.labeling import ABCLabelingModel
 
 
 class ABCTaskModel(ABC):
@@ -76,11 +76,11 @@ def default_hyper_parameters(cls) -> Dict[str, Dict[str, Any]]:
         """
         raise NotImplementedError
 
-    def save(self, model_path: str) -> str:
+    def save(self, model_path: str, encoding: str = 'utf-8') -> str:
         pathlib.Path(model_path).mkdir(exist_ok=True, parents=True)
         model_path = os.path.abspath(model_path)
 
-        with open(os.path.join(model_path, 'model_config.json'), 'w', encoding='utf8') as f:
+        with open(os.path.join(model_path, 'model_config.json'), 'w', encoding=encoding) as f:
             f.write(json.dumps(self.to_dict(), indent=2, ensure_ascii=False))
             f.close()
 
@@ -90,22 +90,30 @@ def save(self, model_path: str) -> str:
         return model_path
 
     @classmethod
-    def load_model(cls, model_path: str) -> Union["ABCLabelingModel", "ABCClassificationModel"]:
+    def load_model(cls, model_path: str,
+                   custom_objects: Dict = None,
+                   encoding: str = 'utf-8') -> Union["ABCLabelingModel", "ABCClassificationModel"]:
+        if custom_objects is None:
+            custom_objects = {}
+
+        if cls.__name__ not in custom_objects:
+            custom_objects[cls.__name__] = cls
+
         model_config_path = os.path.join(model_path, 'model_config.json')
-        model_config = json.loads(open(model_config_path, 'r').read())
-        model = load_data_object(model_config)
+        model_config = json.loads(open(model_config_path, 'r', encoding=encoding).read())
+        model = load_data_object(model_config, custom_objects)
 
-        model.embedding = load_data_object(model_config['embedding'])
-        model.text_processor = load_data_object(model_config['text_processor'])
-        model.label_processor = load_data_object(model_config['label_processor'])
+        model.embedding = load_data_object(model_config['embedding'], custom_objects)
+        model.text_processor = load_data_object(model_config['text_processor'], custom_objects)
+        model.label_processor = load_data_object(model_config['label_processor'], custom_objects)
 
         tf_model_str = json.dumps(model_config['tf_model'])
 
         model.tf_model = tf.keras.models.model_from_json(tf_model_str,
                                                          custom_objects=kashgari.custom_objects)
 
         if isinstance(model.tf_model.layers[-1], KConditionalRandomField):
-            model.layer_crf = model.tf_model.layers[-1]
+            model.crf_layer = model.tf_model.layers[-1]
 
         model.tf_model.load_weights(os.path.join(model_path, 'model_weights.h5'))
         model.embedding.embed_model.load_weights(os.path.join(model_path, 'embed_model_weights.h5'))

diff --git a/kashgari/tokenizers/bert_tokenizer.py b/kashgari/tokenizers/bert_tokenizer.py
@@ -98,7 +98,7 @@ def _tokenize(self, text: str) -> List[str]:
                 spaced += ch
 
         if len(self._token_dict) > 0:
-            tokens = []
+            tokens: List[str] = []
             for word in spaced.strip().split():
                 tokens += self._word_piece_tokenize(word)
             return tokens

diff --git a/kashgari/utils/dependency_check.py b/kashgari/utils/dependency_check.py
diff --git a/kashgari/utils/serialize.py b/kashgari/utils/serialize.py
@@ -11,18 +11,27 @@
 from typing import Dict, Any
 
 
-def load_data_object(data: Dict, **kwargs: Dict) -> Any:
+def load_data_object(data: Dict,
+                     custom_objects: Dict = None,
+                     **kwargs: Dict) -> Any:
     """
     Load Object From Dict
     Args:
         data:
+        custom_objects:
         **kwargs:
 
     Returns:
 
     """
-    module_name = f"{data['__module__']}.{data['__class_name__']}"
-    obj: Any = pydoc.locate(module_name)(**data['config'], **kwargs)  # type: ignore
+    if custom_objects is None:
+        custom_objects = {}
+
+    if data['__class_name__'] in custom_objects:
+        obj: Any = custom_objects[data['__class_name__']](**data['config'], **kwargs)
+    else:
+        module_name = f"{data['__module__']}.{data['__class_name__']}"
+        obj: Any = pydoc.locate(module_name)(**data['config'], **kwargs)  # type: ignore
     if hasattr(obj, '_override_load_model'):
         obj._override_load_model(data)
 

diff --git a/requirements.dev.txt b/requirements.dev.txt
@@ -1,8 +1,8 @@
 # test & coverage
-flake8
+flake8==3.8.4
 flake8-builtins
-mypy
-pytest>=5.4.3
+mypy==0.790
+pytest==5.4.3
 pytest-cov
 pytest-split
 coveralls