diff --git a/README.md b/README.md index 778a17f..2420482 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,20 @@ print(convert('在搜尋欄位使用萬用字元。')) # 在搜索字段使用通配符。 ``` +## 高級用法 Advanced Usage + +### 在簡轉繁時使用外部分詞 Use external segmentation tools when converting from Simplified to Traditional + +此功能已預設開啓 This function is enabled by default + +```python +from StarCC import PresetConversion +convert = PresetConversion(src='cn', dst='hk', with_phrase=False, use_seg=True) +convert('拥有 116 年历史') # Correct: 擁有 116 年歷史 +convert = PresetConversion(src='cn', dst='hk', with_phrase=False, use_seg=False) +convert('拥有 116 年历史') # Wrong: 擁有 116 年曆史 +``` + ## 轉換模式一覽 Supported conversion modes - `cn`: Simplified Chinese (Mainland China) diff --git a/setup.py b/setup.py index 967092d..8e93626 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ setup( name='starcc', - version='0.0.2', + version='0.0.3', description='Python implementation of StarCC', long_description=long_description, long_description_content_type='text/markdown', @@ -36,7 +36,7 @@ }, include_package_data=True, python_requires='>=3.7, <4', - install_requires=['pygtrie'], + install_requires=['jieba', 'pygtrie'], entry_points={}, project_urls={ 'Bug Reports': 'https://github.com/StarCC0/StarCC/issues', diff --git a/src/StarCC/__init__.py b/src/StarCC/__init__.py index 308aceb..aadea19 100644 --- a/src/StarCC/__init__.py +++ b/src/StarCC/__init__.py @@ -1,5 +1,7 @@ +import jieba from os import path -import pygtrie +from pygtrie import CharTrie +from typing import Callable, Optional, Sequence here = path.abspath(path.dirname(__file__)) @@ -16,8 +18,8 @@ class Dicts: ST2TWP = ('TWVariants', 'TWPhrasesIT', 'TWPhrasesName', 'TWPhrasesOther') ST2JP = ('JPVariants',) -def _dicts2trie(dicts): - trie = pygtrie.CharTrie() +def _dicts2trie(dicts: str) -> CharTrie: + trie = CharTrie() for filename in dicts: if not path.exists(filename): @@ -39,7 +41,7 @@ def _dicts2trie(dicts): return trie -def _convert(trie, s: str) -> str: +def _convert(trie: CharTrie, s: str) -> str: results = [] total_len = len(s) @@ -64,16 +66,38 @@ def _convert(trie, s: str) -> str: return ''.join(results) class Conversion: - def __init__(self, dicts_list) -> None: + def __init__(self, dicts_list: Sequence[str], seg_funcs: Optional[Sequence[Callable]]=None) -> None: self.tries = [_dicts2trie(dicts) for dicts in dicts_list] + if seg_funcs is None: + self.seg_funcs = [None for _ in dicts_list] + else: + if len(dicts_list) != len(seg_funcs): + raise ValueError('`seg_funcs` should either be `None`, or has the same length with `dicts_list`') + self.seg_funcs = seg_funcs + def __call__(self, s: str) -> str: - for trie in self.tries: - s = _convert(trie, s) + for trie, seg_func in zip(self.tries, self.seg_funcs): + if seg_func is None: + s = _convert(trie, s) + else: + results = [] + for segment in seg_func(s): + segment = _convert(trie, segment) + results.append(segment) + s = ''.join(results) return s class PresetConversion(Conversion): - def __init__(self, src='cn', dst='hk', with_phrase: bool=False) -> None: + def __init__(self, src: str='cn', dst: str='hk', with_phrase: bool=False, use_seg: bool=True) -> None: + ''' + Initialize a `PresetConversion` object. + + `use_seg` Whether to use an external segmentation tool (i.e. jieba) or not + when converting from Simplified to Traditional. If the conversion is not + from Simplified to Traditional, this option has no effect. + ''' + if src not in ('st', 'cn', 'hk', 'tw', 'jp'): raise ValueError(f'Invalid src value: {src}') if dst not in ('st', 'cn', 'hk', 'tw', 'jp'): @@ -81,6 +105,7 @@ def __init__(self, src='cn', dst='hk', with_phrase: bool=False) -> None: assert src != dst dicts_list = [] + seg_funcs = [] if src != 'st': if not with_phrase: @@ -98,6 +123,11 @@ def __init__(self, src='cn', dst='hk', with_phrase: bool=False) -> None: 'tw': Dicts.TWP2ST, }[src]) + if src == 'cn' and use_seg: + seg_funcs.append(jieba.cut) + else: + seg_funcs.append(None) + if dst != 'st': if not with_phrase: dicts_list.append({ @@ -114,4 +144,6 @@ def __init__(self, src='cn', dst='hk', with_phrase: bool=False) -> None: 'tw': Dicts.ST2TWP, }[dst]) - super().__init__(dicts_list) + seg_funcs.append(None) + + super().__init__(dicts_list, seg_funcs)