Skip to content

Commit

Permalink
Add jieba segmentation
Browse files Browse the repository at this point in the history
  • Loading branch information
ayaka14732 committed Apr 26, 2022
1 parent 675ed02 commit 7f24e71
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 11 deletions.
14 changes: 14 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,20 @@ print(convert('在搜尋欄位使用萬用字元。'))
# 在搜索字段使用通配符。
```

## 高級用法 Advanced Usage

### 在簡轉繁時使用外部分詞 Use external segmentation tools when converting from Simplified to Traditional

此功能已預設開啓 This feature is enabled by default.

```python
from StarCC import PresetConversion
convert = PresetConversion(src='cn', dst='hk', with_phrase=False, use_seg=True)
convert('拥有 116 年历史') # Correct: 擁有 116 年歷史
convert = PresetConversion(src='cn', dst='hk', with_phrase=False, use_seg=False)
convert('拥有 116 年历史') # Wrong: 擁有 116 年曆史
```

## 轉換模式一覽 Supported conversion modes

- `cn`: Simplified Chinese (Mainland China)
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

setup(
name='starcc',
version='0.0.2',
version='0.0.3',
description='Python implementation of StarCC',
long_description=long_description,
long_description_content_type='text/markdown',
Expand Down Expand Up @@ -36,7 +36,7 @@
},
include_package_data=True,
python_requires='>=3.7, <4',
install_requires=['pygtrie'],
install_requires=['jieba', 'pygtrie'],
entry_points={},
project_urls={
'Bug Reports': 'https://github.com/StarCC0/StarCC/issues',
Expand Down
50 changes: 41 additions & 9 deletions src/StarCC/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import jieba
from os import path
import pygtrie
from pygtrie import CharTrie
from typing import Callable, Optional, Sequence

here = path.abspath(path.dirname(__file__))

Expand All @@ -16,8 +18,8 @@ class Dicts:
ST2TWP = ('TWVariants', 'TWPhrasesIT', 'TWPhrasesName', 'TWPhrasesOther')
ST2JP = ('JPVariants',)

def _dicts2trie(dicts):
trie = pygtrie.CharTrie()
def _dicts2trie(dicts: str) -> CharTrie:
trie = CharTrie()

for filename in dicts:
if not path.exists(filename):
Expand All @@ -39,7 +41,7 @@ def _dicts2trie(dicts):

return trie

def _convert(trie, s: str) -> str:
def _convert(trie: CharTrie, s: str) -> str:
results = []

total_len = len(s)
Expand All @@ -64,23 +66,46 @@ def _convert(trie, s: str) -> str:
return ''.join(results)

class Conversion:
    '''
    A pipeline of dictionary-based conversion stages applied to a string in order.

    Each stage owns one trie (built by `_dicts2trie` from one entry of
    `dicts_list`) and, optionally, a segmentation function. When a stage has a
    segmentation function, the input is first split into segments, every
    segment is converted on its own, and the converted segments are
    concatenated again.
    '''

    def __init__(self, dicts_list: Sequence[Sequence[str]], seg_funcs: Optional[Sequence[Optional[Callable]]]=None) -> None:
        '''
        Build one trie per entry of `dicts_list`.

        `dicts_list` One group of dictionaries per conversion stage; each group
        is handed to `_dicts2trie` as a whole.
        `seg_funcs` Either `None` (no stage uses segmentation), or one entry per
        stage. An entry is either `None` or a callable that takes the string
        and returns an iterable of string segments (e.g. `jieba.cut`).

        Raises `ValueError` if `seg_funcs` is given but its length differs from
        that of `dicts_list`.
        '''
        self.tries = [_dicts2trie(dicts) for dicts in dicts_list]

        if seg_funcs is None:
            # No segmentation anywhere: pair every stage with `None`.
            self.seg_funcs = [None for _ in dicts_list]
        else:
            # `zip` in `__call__` would silently drop stages on a length
            # mismatch, so reject it up front.
            if len(dicts_list) != len(seg_funcs):
                raise ValueError('`seg_funcs` should either be `None`, or has the same length with `dicts_list`')
            self.seg_funcs = seg_funcs

    def __call__(self, s: str) -> str:
        '''
        Run `s` through every stage in order and return the converted string.
        '''
        for trie, seg_func in zip(self.tries, self.seg_funcs):
            if seg_func is None:
                s = _convert(trie, s)
            else:
                # Convert segment by segment so that a dictionary entry cannot
                # match across a segment boundary.
                s = ''.join(_convert(trie, segment) for segment in seg_func(s))
        return s

class PresetConversion(Conversion):
def __init__(self, src='cn', dst='hk', with_phrase: bool=False) -> None:
def __init__(self, src: str='cn', dst: str='hk', with_phrase: bool=False, use_seg: bool=True) -> None:
'''
Initialize a `PresetConversion` object.
`use_seg` Whether to use an external segmentation tool (i.e. jieba) or not
when converting from Simplified to Traditional. If the conversion is not
from Simplified to Traditional, this option has no effect.
'''

if src not in ('st', 'cn', 'hk', 'tw', 'jp'):
raise ValueError(f'Invalid src value: {src}')
if dst not in ('st', 'cn', 'hk', 'tw', 'jp'):
raise ValueError(f'Invalid dst value: {dst}')
assert src != dst

dicts_list = []
seg_funcs = []

if src != 'st':
if not with_phrase:
Expand All @@ -98,6 +123,11 @@ def __init__(self, src='cn', dst='hk', with_phrase: bool=False) -> None:
'tw': Dicts.TWP2ST,
}[src])

if src == 'cn' and use_seg:
seg_funcs.append(jieba.cut)
else:
seg_funcs.append(None)

if dst != 'st':
if not with_phrase:
dicts_list.append({
Expand All @@ -114,4 +144,6 @@ def __init__(self, src='cn', dst='hk', with_phrase: bool=False) -> None:
'tw': Dicts.ST2TWP,
}[dst])

super().__init__(dicts_list)
seg_funcs.append(None)

super().__init__(dicts_list, seg_funcs)

0 comments on commit 7f24e71

Please sign in to comment.