forked from aceimnorstuvwxz/chatbot-zh-torch7
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtokenizer.lua
executable file
·60 lines (48 loc) · 1.09 KB
/
tokenizer.lua
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
-- Lexer library used by M.tokenize to scan text against a rule table.
local lexer = require "pl.lexer"
-- Short alias: every token action below hands its result back via yield.
local yield = coroutine.yield
-- Module table; populated below and returned at the end of the file.
local M = {}
--- Token action for the "word" rule.
-- Yields the token type "word" together with the matched text.
-- @param token matched text
-- @return whatever `yield` returns when the coroutine is resumed
local function word(token)
  local kind = "word"
  return yield(kind, token)
end
--- Token action for the "quote" rule.
-- Yields the token type "quote" together with the matched text.
-- @param token matched text
-- @return whatever `yield` returns when the coroutine is resumed
local function quote(token)
  local kind = "quote"
  return yield(kind, token)
end
--- Token action for the "space" rules.
-- Yields the token type "space" together with the matched text.
-- This function is also used as a key in the filter table that
-- M.tokenize passes to lexer.scan.
-- @param token matched text
-- @return whatever `yield` returns when the coroutine is resumed
local function space(token)
  local kind = "space"
  return yield(kind, token)
end
--- Token action for the "tag" rule.
-- Yields the token type "tag" together with the matched text.
-- This function is also used as a key in the filter table that
-- M.tokenize passes to lexer.scan.
-- @param token matched text
-- @return whatever `yield` returns when the coroutine is resumed
local function tag(token)
  local kind = "tag"
  return yield(kind, token)
end
--- Token action for the "punct" rule.
-- Yields the token type "punct" together with the matched text.
-- @param token matched text
-- @return whatever `yield` returns when the coroutine is resumed
local function punct(token)
  local kind = "punct"
  return yield(kind, token)
end
--- Token action for the "endpunct" rules.
-- Yields the token type "endpunct" together with the matched text.
-- @param token matched text
-- @return whatever `yield` returns when the coroutine is resumed
local function endpunct(token)
  local kind = "endpunct"
  return yield(kind, token)
end
--- Token action for the catch-all rule.
-- Prints the literal line "unknown" to stdout (the offending text itself is
-- not printed), then yields the token type "unknown" with the matched text.
-- @param token matched text
-- @return whatever `yield` returns when the coroutine is resumed
local function unknown(token)
  local kind = "unknown"
  print(kind)
  return yield(kind, token)
end
--- Tokenize a string with the module's rule table.
--
-- Passes `text` to `lexer.scan` together with an ordered list of
-- {pattern, action} rules and a filter table keyed by the `space` and `tag`
-- actions.
-- NOTE(review): this relies on pl.lexer trying the rules top to bottom and
-- dropping tokens whose action is a filter key -- confirm against the
-- installed Penlight version.
--
-- @param text     string to tokenize
-- @param verbose  optional; when truthy, `text` is echoed to stdout before
--                 scanning. Off by default.
-- @return the value returned by `lexer.scan` (presumably an iterator over
--         (type, token) pairs -- verify against pl.lexer)
function M.tokenize(text, verbose)
  -- The input used to be printed unconditionally, which floods stdout when
  -- many strings are tokenized. The echo is kept as an opt-in debugging aid.
  if verbose then
    print(text)
  end

  -- Rule order matters: an earlier rule shadows a later one that could match
  -- at the same position.
  local rules = {
    -- Disabled rule kept from the original source:
    --{ "^[\128-\193]+", word },
    { "^%s+", space },
    { "^['\"]", quote },
    { "^%w+", word },
    -- Runs of hyphens are classed as "space" ...
    { "^%-+", space },
    -- ... so the "%-" in this class appears to be unreachable.
    { "^[,:;%-]", punct },
    -- A run of dots becomes a single "endpunct" token ...
    { "^%.+", endpunct },
    -- ... so only "?" and "!" appear to reach this rule.
    { "^[%.%?!]", endpunct },
    { "^</?.->", tag },
    -- Catch-all for a single character not matched above.
    -- NOTE(review): Lua patterns work on bytes, so a multi-byte UTF-8
    -- character presumably ends up as several "unknown" tokens -- confirm
    -- whether that is intended.
    { "^.", unknown },
  }

  return lexer.scan(text, rules, { [space]=true, [tag]=true })
end
--- Join a list of tokens back into a sentence string.
--
-- Steps, in order:
--   1. concatenate the tokens with single spaces;
--   2. upper-case the first character if it is a lowercase letter;
--   3. collapse " ' " (space, apostrophe, space) to a bare apostrophe;
--   4. remove the space in front of any of , : ; - . ? !
--
-- @param words array of strings (anything `table.concat` accepts)
-- @return the assembled string (a single value)
function M.join(words)
  local joined = table.concat(words, " ")
  -- string.gsub returns (result, count); the parentheses keep only the result.
  local capitalised = (string.gsub(joined, "^%l", string.upper))
  local apostrophes = (string.gsub(capitalised, " (') ", "%1"))
  local tightened = (string.gsub(apostrophes, " ([,:;%-%.%?!])", "%1"))
  return tightened
end
return M