forked from go-ego/gse
-
Notifications
You must be signed in to change notification settings - Fork 0
/
seg_utils.go
executable file
·254 lines (218 loc) · 5.92 KB
/
seg_utils.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
// Copyright 2013 Hui Chen
// Copyright 2016 ego authors
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may
// not use this file except in compliance with the License. You may obtain
// a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
package gse
import (
"bytes"
"fmt"
)
// ToString renders the segmentation result as a single string in which every
// entry has the form "text/pos " (note the trailing space).
//
// There are two output modes; taking "山达尔星联邦共和国" as an example:
//
//	normal mode (searchMode=false) emits one entry:
//	    "山达尔星联邦共和国/ns "
//	search mode (searchMode=true) additionally emits the finer-grained
//	splits of each segment:
//	    "山达尔星/nz 联邦/n 共和/nz 国/n 共和国/ns 联邦共和国/nt 山达尔星联邦共和国/ns "
//
// searchMode defaults to false; only its first value is consulted. Search mode
// is mainly intended to hand a search engine as many keywords as possible; see
// the comments on the Token struct for details.
func ToString(segs []Segment, searchMode ...bool) (output string) {
	mode := len(searchMode) > 0 && searchMode[0]

	// Accumulate into one buffer instead of repeated string concatenation,
	// which would re-copy the whole output for every segment.
	var buf bytes.Buffer
	for i := range segs {
		token := segs[i].token
		if mode {
			buf.WriteString(tokenToString(token))
			continue
		}
		fmt.Fprintf(&buf, "%s/%s ", textSliceToString(token.text), token.pos)
	}
	return buf.String()
}
// tokenToString renders token as "text/pos " and, where applicable, prefixes
// it with the rendering of its sub-segments.
//
// Sub-segments are walked in order. Expansion is switched on by the first
// sub-segment whose token has more than one sub-segment of its own, or whose
// first text unit satisfies IsJp (defined elsewhere; presumably a Japanese
// check). Once switched on it stays on, so that sub-segment and every later
// one are expanded recursively, while earlier ones are not.
//
// Nil sub-segments and sub-segments without a token are skipped.
func tokenToString(token *Token) (output string) {
	var buf bytes.Buffer
	hasOnlyTerminalToken := true
	for _, s := range token.segments {
		// The nil check must come before any dereference of s; the previous
		// code touched s.token first, so the check could never take effect.
		if s == nil || s.token == nil {
			continue
		}
		// Only inspect the first text unit when there is one, to avoid an
		// index-out-of-range panic on an empty token text.
		if len(s.token.segments) > 1 ||
			(len(s.token.text) > 0 && IsJp(string(s.token.text[0]))) {
			hasOnlyTerminalToken = false
		}
		if !hasOnlyTerminalToken {
			buf.WriteString(tokenToString(s.token))
		}
	}
	fmt.Fprintf(&buf, "%s/%s ", textSliceToString(token.text), token.pos)
	return buf.String()
}
// tokenToBytes returns the byte form of token: the recursive rendering of
// every sub-segment's token, in order, followed by this token's own
// "text/pos " entry.
func tokenToBytes(token *Token) (output []byte) {
	for i := range token.segments {
		nested := tokenToBytes(token.segments[i].token)
		output = append(output, nested...)
	}
	own := fmt.Sprintf("%s/%s ", textSliceToString(token.text), token.pos)
	return append(output, own...)
}
// ToSlice collects the segmentation result into a slice of strings.
//
// There are two output modes; taking "山达尔星联邦共和国" as an example:
//
//	normal mode (searchMode=false) yields one entry:
//	    "[山达尔星联邦共和国]"
//	search mode (searchMode=true) additionally yields the finer-grained
//	splits of each segment:
//	    "[山达尔星 联邦 共和 国 共和国 联邦共和国 山达尔星联邦共和国]"
//
// searchMode defaults to false; only its first value is consulted. Search mode
// is mainly intended to hand a search engine as many keywords as possible; see
// the comments on the Token struct for details.
func ToSlice(segs []Segment, searchMode ...bool) (output []string) {
	search := len(searchMode) > 0 && searchMode[0]
	for i := range segs {
		tok := segs[i].token
		if search {
			output = append(output, tokenToSlice(tok)...)
			continue
		}
		output = append(output, tok.Text())
	}
	return output
}
// tokenToSlice returns the text of token, preceded where applicable by the
// texts of its sub-segments.
//
// Sub-segments are walked in order. Expansion is switched on by the first
// sub-segment whose token has more than one sub-segment of its own, or whose
// first text unit satisfies IsJp (defined elsewhere; presumably a Japanese
// check). Once switched on it stays on, so that sub-segment and every later
// one are expanded recursively, while earlier ones are not.
//
// Nil sub-segments and sub-segments without a token are skipped.
func tokenToSlice(token *Token) (output []string) {
	hasOnlyTerminalToken := true
	for _, s := range token.segments {
		// Skip unusable entries before dereferencing them.
		if s == nil || s.token == nil {
			continue
		}
		// Only inspect the first text unit when there is one, to avoid an
		// index-out-of-range panic on an empty token text.
		if len(s.token.segments) > 1 ||
			(len(s.token.text) > 0 && IsJp(string(s.token.text[0]))) {
			hasOnlyTerminalToken = false
		}
		if !hasOnlyTerminalToken {
			output = append(output, tokenToSlice(s.token)...)
		}
	}
	output = append(output, textSliceToString(token.text))
	return output
}
// ToPos converts the segmentation result into a slice of SegPos values, each
// carrying a token's text and its pos.
//
// With searchMode[0] == true every segment is expanded through tokenToPos;
// otherwise (the default) exactly one SegPos is produced per segment.
func ToPos(segs []Segment, searchMode ...bool) (output []SegPos) {
	search := len(searchMode) > 0 && searchMode[0]
	for i := range segs {
		tok := segs[i].token
		if search {
			output = append(output, tokenToPos(tok)...)
			continue
		}
		output = append(output, SegPos{
			Text: textSliceToString(tok.text),
			Pos:  tok.pos,
		})
	}
	return output
}
// tokenToPos returns the SegPos (text and pos) of token, preceded where
// applicable by the SegPos values of its sub-segments.
//
// Sub-segments are walked in order. Expansion is switched on by the first
// sub-segment whose token has more than one sub-segment of its own, or whose
// first text unit satisfies IsJp (defined elsewhere; presumably a Japanese
// check). Once switched on it stays on, so that sub-segment and every later
// one are expanded recursively, while earlier ones are not.
//
// Nil sub-segments and sub-segments without a token are skipped.
func tokenToPos(token *Token) (output []SegPos) {
	hasOnlyTerminalToken := true
	for _, s := range token.segments {
		// Skip unusable entries before dereferencing them.
		if s == nil || s.token == nil {
			continue
		}
		// Only inspect the first text unit when there is one, to avoid an
		// index-out-of-range panic on an empty token text.
		if len(s.token.segments) > 1 ||
			(len(s.token.text) > 0 && IsJp(string(s.token.text[0]))) {
			hasOnlyTerminalToken = false
		}
		if !hasOnlyTerminalToken {
			output = append(output, tokenToPos(s.token)...)
		}
	}
	output = append(output, SegPos{
		Text: textSliceToString(token.text),
		Pos:  token.pos,
	})
	return output
}
// textToString concatenates several text units into one string.
//
// The units are written into a single buffer so the cost stays linear in the
// total byte length, rather than re-copying the partial result on every step
// as repeated string concatenation does.
func textToString(text []Text) (output string) {
	var buf bytes.Buffer
	for _, word := range text {
		buf.Write(word)
	}
	return buf.String()
}
// textSliceToString concatenates several text units into one string.
// It is a thin wrapper that delegates to Join.
func textSliceToString(text []Text) string {
return Join(text)
}
// textSliceByteLen returns the total length, in bytes, of all text units.
func textSliceByteLen(text []Text) (length int) {
	for i := range text {
		length += len(text[i])
	}
	return length
}
// textSliceToBytes concatenates the bytes of all text units, in order, and
// returns the buffer's contents.
func textSliceToBytes(text []Text) []byte {
	joined := new(bytes.Buffer)
	for i := range text {
		joined.Write(text[i])
	}
	return joined.Bytes()
}
// Join concatenates the text units into a single string.
//
// Slices of up to three units are handled by direct concatenation; longer
// slices are assembled in one buffer sized to the total byte length.
func Join(text []Text) string {
	count := len(text)
	if count == 0 {
		return ""
	}
	if count == 1 {
		return string(text[0])
	}
	// Special cases for common small values.
	// Remove if github.com/golang/go/issues/6714 is fixed.
	if count == 2 {
		return string(text[0]) + string(text[1])
	}
	if count == 3 {
		return string(text[0]) + string(text[1]) + string(text[2])
	}

	// General case: measure first, then fill a buffer of exactly that size.
	total := 0
	for _, part := range text {
		total += len(part)
	}
	buf := make([]byte, 0, total)
	for _, part := range text {
		buf = append(buf, part...)
	}
	return string(buf)
}
// printTokens renders the first numTokens tokens as their concatenated text,
// each followed by a single space.
//
// numTokens is clamped to len(tokens), so asking for more tokens than are
// available renders all of them instead of panicking. A non-positive
// numTokens yields the empty string.
func printTokens(tokens []*Token, numTokens int) (output string) {
	if numTokens > len(tokens) {
		numTokens = len(tokens)
	}

	// A single buffer keeps the work linear in the output size.
	var buf bytes.Buffer
	for iToken := 0; iToken < numTokens; iToken++ {
		for _, word := range tokens[iToken].text {
			buf.Write(word)
		}
		buf.WriteByte(' ')
	}
	return buf.String()
}
// toWords converts each given string into a Text holding its bytes and returns
// them in the same order. With no arguments it returns an empty, non-nil slice.
func toWords(strings ...string) []Text {
	words := make([]Text, len(strings))
	for i, s := range strings {
		words[i] = []byte(s)
	}
	return words
}
// bytesToString renders every text unit followed by a "/" separator and
// returns the concatenation, e.g. {"a", "bc"} becomes "a/bc/".
//
// The result is assembled in one byte slice so the cost stays linear in the
// output size. The parameter name shadows the standard "bytes" package inside
// this function, which is why a plain []byte accumulator is used here.
func bytesToString(bytes []Text) (output string) {
	var buf []byte
	for _, b := range bytes {
		buf = append(buf, b...)
		buf = append(buf, '/')
	}
	return string(buf)
}