Skip to content

Commit

Permalink
Merge pull request #470 from BrikerMan/v2-dev
Browse files Browse the repository at this point in the history
V2 dev
  • Loading branch information
BrikerMan authored Jul 4, 2021
2 parents 3dfc846 + e4d4db4 commit 2a4433b
Show file tree
Hide file tree
Showing 16 changed files with 274 additions and 79 deletions.
16 changes: 11 additions & 5 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,23 @@ jobs:
python-version: 3.8
- name: Install deps
run: |
python -m pip install --upgrade pip
python scripts/install_tf.py 2.2
python scripts/install_addons.py 2.2
pip install -r requirements.dev.txt
pip install -r requirements.txt
- name: Run lint script
run: sh ./scripts/lint.sh

test:
if: always()
name: "Test with TF ${{ matrix.tensorflow_version }} - ${{ matrix.group }}"
needs: lint
runs-on: ubuntu-latest
strategy:
matrix:
group: [ 1, 2, 3, 4, 5, 6 ]
tensorflow_version: [ 2.2.0, 2.3.0 ]
group: [ 1, 2, 3 ]
tensorflow_version: [2.2, 2.3, 2.4, 2.5]
steps:
- uses: actions/checkout@v2
- name: Set up Python 3.8
Expand All @@ -39,8 +45,8 @@ jobs:
- name: Install deps
run: |
python -m pip install --upgrade pip
pip install tensorflow==${{ matrix.tensorflow_version }}
python scripts/install_addons.py
python scripts/install_tf.py '${{ matrix.tensorflow_version }}'
python scripts/install_addons.py '${{ matrix.tensorflow_version }}'
pip install -r requirements.dev.txt
pip install -r requirements.txt
Expand All @@ -53,7 +59,7 @@ jobs:
--cov-report term
--cov-config .coveragerc
--cov
--splits 6
--splits 3
--group ${{ matrix.group }}
tests/'

Expand Down
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,9 @@ Here is a set of quick tutorials to get you started with the library:

There are also articles and posts that illustrate how to use Kashgari:

- [基于 Kashgari 2 的短文本分类: 数据分析和预处理](https://eliyar.biz/nlp/short_text_classificaion_with_kashgari_v2_part_1)
- [基于 Kashgari 2 的短文本分类: 训练模型和调优](https://eliyar.biz/nlp/short_text_classificaion_with_kashgari_v2_part_2)
- [基于 Kashgari 2 的短文本分类: 模型部署](https://eliyar.biz/nlp/short_text_classificaion_with_kashgari_v2_part_3)
- [15 分钟搭建中文文本分类模型](https://eliyar.biz/nlp_chinese_text_classification_in_15mins/)
- [基于 BERT 的中文命名实体识别(NER)](https://eliyar.biz/nlp_chinese_bert_ner/)
- [BERT/ERNIE 文本分类和部署](https://eliyar.biz/nlp_train_and_deploy_bert_text_classification/)
Expand Down
2 changes: 2 additions & 0 deletions docs/about/release-notes.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ pip show kashgari

## Current Release

### [2.0.2] - 2020.11.18

- 🐛 Fixed custom model loading issue.

### [2.0.1] - 2020.10.28

- ✨ Add `convert_to_saved_model` API for tf-serving use case.
Expand Down
44 changes: 30 additions & 14 deletions kashgari/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,25 +12,41 @@
"""

import os
from typing import Dict, Any
from distutils.version import LooseVersion
from typing import Any, Dict

os.environ['TF_KERAS'] = '1'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ["TF_KERAS"] = "1"
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

custom_objects: Dict[str, Any] = {}

from kashgari.__version__ import __version__
from kashgari.macros import config
from kashgari import layers
from kashgari import corpus
from kashgari import embeddings
from kashgari import macros
from kashgari import processors
from kashgari import tasks
from kashgari import utils

from kashgari.utils.dependency_check import dependency_check
def check_tfa_version(tf_version: str) -> str:
    """Return the ``tensorflow_addons`` version to suggest for a TensorFlow version.

    Only the leading numeric ``major.minor`` part of ``tf_version`` is
    inspected, so patch numbers and suffixes such as ``rc1`` or ``-dev`` do
    not influence the result, and ``"2.2"`` is treated the same as ``"2.2.0"``.

    Args:
        tf_version: TensorFlow version string, e.g. ``"2.3.0"``.

    Returns:
        ``"0.9.1"`` for TensorFlow older than 2.2, ``"0.11.2"`` for 2.2.x and
        ``"0.13.0"`` for 2.3 and newer.

    Raises:
        ValueError: if ``tf_version`` does not start with a number.
    """
    # Local import: avoids relying on ``distutils`` (deprecated by PEP 632)
    # while keeping this function self-contained.
    import re

    matched = re.match(r"\s*(\d+)(?:\.(\d+))?", tf_version)
    if matched is None:
        raise ValueError(f"Unrecognized TensorFlow version: {tf_version!r}")

    # A missing minor component counts as 0, so "2" is compared as (2, 0).
    release = (int(matched.group(1)), int(matched.group(2) or 0))

    if release < (2, 2):
        return "0.9.1"
    if release < (2, 3):
        return "0.11.2"
    return "0.13.0"


def dependency_check() -> None:
    """Verify that ``tensorflow_addons`` can be imported.

    Raises:
        ImportError: if ``tensorflow_addons`` cannot be imported. The message
            suggests the version returned by ``check_tfa_version`` for the
            installed TensorFlow, and the original import error is chained
            as the cause.
    """
    import tensorflow as tf

    try:
        # Imported only to probe availability; the module itself is not used here.
        import tensorflow_addons  # noqa: F401
    except ImportError as err:
        # Resolve the suggested version only when it is needed for the hint.
        tfa_version = check_tfa_version(tf_version=tf.__version__)
        raise ImportError(
            "Kashgari request tensorflow_addons, please install via the "
            f"`$pip install tensorflow_addons=={tfa_version}`"
        ) from err

custom_objects = layers.resigter_custom_layers(custom_objects)

dependency_check()

from kashgari import corpus, embeddings, layers, macros, processors, tasks, utils
from kashgari.__version__ import __version__
from kashgari.macros import config

custom_objects = layers.resigter_custom_layers(custom_objects)
2 changes: 1 addition & 1 deletion kashgari/__version__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@
# file: __version__.py
# time: 2019-05-20 16:32

__version__ = '2.0.1'
__version__ = '2.0.2'
103 changes: 103 additions & 0 deletions kashgari/callbacks/save_callback.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import os
import numpy as np
from typing import Union, Any, AnyStr

import tensorflow as tf
from kashgari.tasks.abs_task_model import ABCTaskModel
from kashgari.logger import logger


class KashgariModelCheckpoint(tf.keras.callbacks.ModelCheckpoint):
    """Checkpoint callback that saves full models through a Kashgari task model.

    Behaves like ``tf.keras.callbacks.ModelCheckpoint``, except that a full
    (non weights-only) save is delegated to ``kash_model.save(filepath)`` when
    a Kashgari model was supplied.

    Arguments:
        filepath: string, path to save the model file.
        monitor: quantity to monitor.
        verbose: verbosity mode, 0 or 1.
        save_best_only: if `save_best_only=True`, a checkpoint is written only
            when the monitored quantity improves on the best value seen so far.
        mode: one of {auto, min, max}. If `save_best_only=True`, the decision to
            overwrite the current save file is made based on either the
            maximization or the minimization of the monitored quantity. For
            `val_acc`, this should be `max`, for `val_loss` this should be
            `min`, etc. In `auto` mode, the direction is automatically inferred
            from the name of the monitored quantity.
        save_weights_only: if True, then only the model's weights are saved
            (`model.save_weights(...)`, written to `<filepath>/cp`), else the
            full model is saved.
        save_freq: `'epoch'` or integer. When using `'epoch'`, the callback
            saves the model after each epoch. When using integer, the callback
            saves the model at end of a batch at which this many samples have
            been seen since last saving. Note that if the saving isn't aligned
            to epochs, the monitored metric may potentially be less reliable
            (it could reflect as little as 1 batch, since the metrics get reset
            every epoch). Defaults to `'epoch'`.
        kash_model: Kashgari task model whose `save` method is used for full
            model saves. When it is None, the attached Keras model is saved
            with `model.save(filepath)` instead.
        **kwargs: Additional arguments for backwards compatibility. Possible
            key is `period`.
    """

    def __init__(self,
                 filepath: AnyStr,
                 monitor: str = 'val_loss',
                 verbose: int = 1,
                 save_best_only: bool = False,
                 save_weights_only: bool = False,
                 mode: str = 'auto',
                 save_freq: Union[str, int] = 'epoch',
                 kash_model: Union[ABCTaskModel, None] = None,
                 **kwargs: Any) -> None:
        super(KashgariModelCheckpoint, self).__init__(
            filepath=filepath,
            monitor=monitor,
            verbose=verbose,
            save_best_only=save_best_only,
            save_weights_only=save_weights_only,
            mode=mode,
            save_freq=save_freq,
            **kwargs)
        self.kash_model = kash_model

    def _write_checkpoint(self, filepath: Any) -> None:
        """Write one checkpoint to ``filepath`` (weights only or full model)."""
        if self.save_weights_only:
            # Weights go to a fixed 'cp' prefix inside the checkpoint path.
            weights_path = os.path.join(filepath, 'cp')
            self.model.save_weights(weights_path, overwrite=True)
            logger.info(f'checkpoint saved to {weights_path}')
        elif self.kash_model is not None:
            self.kash_model.save(filepath)
        else:
            # No Kashgari model was supplied: save the attached Keras model
            # rather than failing on `None.save`.
            self.model.save(filepath, overwrite=True)

    def _save_model(self, epoch: int, logs: dict) -> None:
        """Saves the model.

        Arguments:
            epoch: the epoch this iteration is in.
            logs: the `logs` dict passed in to `on_batch_end` or `on_epoch_end`.
        """
        logs = logs or {}

        if isinstance(self.save_freq,
                      int) or self.epochs_since_last_save >= self.period:
            self.epochs_since_last_save: int = 0
            filepath = self._get_file_path(epoch, logs)

            if self.save_best_only:
                current = logs.get(self.monitor)
                if current is None:
                    logger.warning('Can save best model only with %s available, skipping.', self.monitor)
                elif self.monitor_op(current, self.best):
                    if self.verbose > 0:
                        print('\nEpoch %d: %s improved from %0.5f to %0.5f,'
                              ' saving model to %s' % (epoch + 1, self.monitor, self.best,
                                                       current, filepath))
                    self.best: float = current
                    self._write_checkpoint(filepath)
                elif self.verbose > 0:
                    print('\nEpoch %d: %s did not improve from %0.5f' %
                          (epoch + 1, self.monitor, self.best))
            else:
                if self.verbose > 0:
                    print('\nEpoch %d: saving model to %s' % (epoch + 1, filepath))
                self._write_checkpoint(filepath)

            self._maybe_remove_file()
4 changes: 2 additions & 2 deletions kashgari/metrics/sequence_labeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,8 +320,8 @@ def sequence_labeling_report(y_true: List[List[str]],
pred_entities = set(bulk_get_entities(y_pred, suffix=suffix))

name_width = 0
d1 = defaultdict(set)
d2 = defaultdict(set)
d1: Dict = defaultdict(set)
d2: Dict = defaultdict(set)
for e in true_entities:
d1[e[0]].add((e[1], e[2]))
name_width = max(name_width, len(e[0]))
Expand Down
32 changes: 20 additions & 12 deletions kashgari/tasks/abs_task_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,20 @@
import os
import pathlib
from abc import ABC, abstractmethod
from typing import Dict, Any, TYPE_CHECKING, Union
from typing import TYPE_CHECKING, Any, Dict, Union

import tensorflow as tf

import kashgari
from kashgari.embeddings import ABCEmbedding
from kashgari.layers import KConditionalRandomField
from kashgari.logger import logger
from kashgari.processors.abc_processor import ABCProcessor
from kashgari.utils import load_data_object
from kashgari.layers import KConditionalRandomField

if TYPE_CHECKING:
from kashgari.tasks.labeling import ABCLabelingModel
from kashgari.tasks.classification import ABCClassificationModel
from kashgari.tasks.labeling import ABCLabelingModel


class ABCTaskModel(ABC):
Expand Down Expand Up @@ -76,11 +76,11 @@ def default_hyper_parameters(cls) -> Dict[str, Dict[str, Any]]:
"""
raise NotImplementedError

def save(self, model_path: str) -> str:
def save(self, model_path: str, encoding: str = 'utf-8') -> str:
pathlib.Path(model_path).mkdir(exist_ok=True, parents=True)
model_path = os.path.abspath(model_path)

with open(os.path.join(model_path, 'model_config.json'), 'w', encoding='utf8') as f:
with open(os.path.join(model_path, 'model_config.json'), 'w', encoding=encoding) as f:
f.write(json.dumps(self.to_dict(), indent=2, ensure_ascii=False))
f.close()

Expand All @@ -90,22 +90,30 @@ def save(self, model_path: str) -> str:
return model_path

@classmethod
def load_model(cls, model_path: str) -> Union["ABCLabelingModel", "ABCClassificationModel"]:
def load_model(cls, model_path: str,
custom_objects: Dict = None,
encoding: str = 'utf-8') -> Union["ABCLabelingModel", "ABCClassificationModel"]:
if custom_objects is None:
custom_objects = {}

if cls.__name__ not in custom_objects:
custom_objects[cls.__name__] = cls

model_config_path = os.path.join(model_path, 'model_config.json')
model_config = json.loads(open(model_config_path, 'r').read())
model = load_data_object(model_config)
model_config = json.loads(open(model_config_path, 'r', encoding=encoding).read())
model = load_data_object(model_config, custom_objects)

model.embedding = load_data_object(model_config['embedding'])
model.text_processor = load_data_object(model_config['text_processor'])
model.label_processor = load_data_object(model_config['label_processor'])
model.embedding = load_data_object(model_config['embedding'], custom_objects)
model.text_processor = load_data_object(model_config['text_processor'], custom_objects)
model.label_processor = load_data_object(model_config['label_processor'], custom_objects)

tf_model_str = json.dumps(model_config['tf_model'])

model.tf_model = tf.keras.models.model_from_json(tf_model_str,
custom_objects=kashgari.custom_objects)

if isinstance(model.tf_model.layers[-1], KConditionalRandomField):
model.layer_crf = model.tf_model.layers[-1]
model.crf_layer = model.tf_model.layers[-1]

model.tf_model.load_weights(os.path.join(model_path, 'model_weights.h5'))
model.embedding.embed_model.load_weights(os.path.join(model_path, 'embed_model_weights.h5'))
Expand Down
2 changes: 1 addition & 1 deletion kashgari/tokenizers/bert_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def _tokenize(self, text: str) -> List[str]:
spaced += ch

if len(self._token_dict) > 0:
tokens = []
tokens: List[str] = []
for word in spaced.strip().split():
tokens += self._word_piece_tokenize(word)
return tokens
Expand Down
24 changes: 0 additions & 24 deletions kashgari/utils/dependency_check.py

This file was deleted.

15 changes: 12 additions & 3 deletions kashgari/utils/serialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,27 @@
from typing import Dict, Any


def load_data_object(data: Dict, **kwargs: Dict) -> Any:
def load_data_object(data: Dict,
custom_objects: Dict = None,
**kwargs: Dict) -> Any:
"""
Load Object From Dict
Args:
data:
custom_objects:
**kwargs:
Returns:
"""
module_name = f"{data['__module__']}.{data['__class_name__']}"
obj: Any = pydoc.locate(module_name)(**data['config'], **kwargs) # type: ignore
if custom_objects is None:
custom_objects = {}

if data['__class_name__'] in custom_objects:
obj: Any = custom_objects[data['__class_name__']](**data['config'], **kwargs)
else:
module_name = f"{data['__module__']}.{data['__class_name__']}"
obj: Any = pydoc.locate(module_name)(**data['config'], **kwargs) # type: ignore
if hasattr(obj, '_override_load_model'):
obj._override_load_model(data)

Expand Down
6 changes: 3 additions & 3 deletions requirements.dev.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# test & coverage
flake8
flake8==3.8.4
flake8-builtins
mypy
pytest>=5.4.3
mypy==0.790
pytest==5.4.3
pytest-cov
pytest-split
coveralls
Expand Down
Loading

0 comments on commit 2a4433b

Please sign in to comment.