-
-
Notifications
You must be signed in to change notification settings - Fork 275
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #89 from rain1024/chunking
add chunking api
- Loading branch information
Showing
22 changed files
with
360 additions
and
39 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
" CH O | ||
Chỗ N B-NP | ||
này P I-NP | ||
xưa N I-NP | ||
là V B-VP | ||
đồn bót N B-NP | ||
nè T I-NP | ||
! CH O | ||
Kia P B-NP | ||
là V B-VP | ||
những L B-NP | ||
ngôi nhà N I-NP | ||
vách N I-NP | ||
đất N I-NP | ||
. CH O | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
Thủ tướng N B-NP | ||
: CH I-NP | ||
Những L I-NP | ||
tồn tại N I-NP | ||
đại biểu N I-NP | ||
Quốc hội N I-NP | ||
chỉ R B-VP | ||
ra V I-VP | ||
, CH O | ||
tôi P B-NP | ||
rất R B-VP | ||
thấm thía A O |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
Bác sĩ N B-NP | ||
bây giờ P I-NP | ||
có thể R B-VP | ||
thản nhiên V I-VP | ||
báo tin N B-NP | ||
bệnh nhân N I-NP | ||
bị V B-VP | ||
ung thư N I-VP | ||
? CH O |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
" CH B-NP | ||
Bộ ba N I-NP | ||
quyền lực N I-NP | ||
" CH I-NP | ||
phía N I-NP | ||
sau A I-NP | ||
chương trình N I-NP | ||
tên lửa N I-NP | ||
của E B-PP | ||
Triều Tiên Np B-NP |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
Dàn N B-NP | ||
sao P I-NP | ||
MU Np I-NP | ||
tưng bừng A B-VP | ||
khoe V I-VP | ||
cúp N B-NP | ||
vô địch N I-NP | ||
Europa League Np O |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
# -*- coding: utf-8 -*- | ||
import io
from os import listdir
from os.path import dirname, join
from unittest import TestCase, skip

from underthesea import chunk
|
||
samples_dir = join(dirname(__file__), "samples") | ||
|
||
|
||
def load_input(input_file):
    """
    Rebuild the raw sentence from a tab-separated sample file.

    Only the first column (the word) of each line is used; the words are
    joined with single spaces.

    :param str input_file: path to a UTF-8 encoded sample file
    :return: the words of the sample joined by spaces
    :rtype: unicode
    """
    # io.open decodes once at the I/O boundary and works on Python 2 and 3;
    # the context manager guarantees the handle is closed.
    with io.open(input_file, "r", encoding="utf-8") as handle:
        rows = handle.read().strip().split("\n")
    return u" ".join(row.split("\t")[0] for row in rows)
|
||
|
||
def load_output(input_file):
    """
    Load the expected chunking result from a tab-separated sample file.

    Every line must hold exactly three tab-separated fields: word, POS tag
    and chunk tag.

    :param str input_file: path to a UTF-8 encoded sample file
    :return: list of ``(word, pos_tag, chunk_tag)`` tuples, one per line
    :rtype: list
    :raises ValueError: if a line does not split into exactly three fields
    """
    # Decode at the I/O boundary (Python 2 and 3 compatible) and always close
    # the file handle.
    with io.open(input_file, "r", encoding="utf-8") as handle:
        rows = handle.read().strip().split("\n")
    output = []
    for row in rows:
        # Unpacking enforces the three-column layout of the sample files.
        word, pos_tag, chunk_tag = row.split("\t")
        output.append((word, pos_tag, chunk_tag))
    return output
|
||
|
||
def save_temp(id, output):
    """
    Dump an actual chunking result to ``<samples_dir>/<id>.actual``.

    The file uses the same layout as the samples: one item per line, fields
    separated by tabs.

    :param id: sample identifier used to build the output file name
    :param list output: iterable of tuples of text fields
    """
    temp_file = join(samples_dir, "%s.actual" % id)
    content = u"\n".join(u"\t".join(item) for item in output)
    # Let io.open do the UTF-8 encoding: writing pre-encoded bytes to a
    # text-mode file fails on Python 3, and the original never closed the file.
    with io.open(temp_file, "w", encoding="utf-8") as handle:
        handle.write(content)
|
||
|
||
class TestChunking(TestCase):
    """Tests for ``underthesea.chunk`` driven by the tagged sample files."""

    def test_simple_cases(self):
        """Chunking an empty sentence is expected to return an empty list."""
        sentence = u""
        actual = chunk(sentence)
        expected = []
        self.assertEqual(actual, expected)

    def test_accuracy(self):
        """Compare ``chunk`` output with the expected tags of every sample."""
        # Only ``*.txt`` files are samples. The directory may also contain
        # ``*.actual`` dumps written by save_temp on earlier failures (or other
        # stray files), which must not be turned into sample ids.
        suffix = ".txt"
        sample_ids = sorted(
            name[:-len(suffix)]
            for name in listdir(samples_dir)
            if name.endswith(suffix)
        )
        for sample_id in sample_ids:
            sample_file = join(samples_dir, "%s.txt" % sample_id)
            sentence = load_input(sample_file)
            actual = chunk(sentence)
            expected = load_output(sample_file)
            if actual != expected:
                # Keep the failing output on disk so it can be diffed by hand.
                print("Fail {}".format(sample_id))
                save_temp(sample_id, actual)
            self.assertEqual(actual, expected)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
Hãy tham dự Cuộc thi viết phóng sự - ký sự báo Tuổi Trẻ 2004 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
Hãy | ||
tham dự | ||
Cuộc thi | ||
viết | ||
phóng sự | ||
- | ||
ký | ||
sự | ||
báo | ||
Tuổi Trẻ | ||
2004 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
from underthesea import pos_tag | ||
from underthesea.chunking.model import ChunkingCRFModel | ||
|
||
|
||
def chunk(sentence, format=None):
    """
    Split a sentence into phrases (chunks).

    The sentence is first POS-tagged with ``pos_tag``; the tagged result is
    then handed to the shared ``ChunkingCRFModel`` instance.

    :param unicode sentence: raw sentence
    :param format: forwarded unchanged to ``ChunkingCRFModel.predict``
    :return: list of tuple with word, pos tag, chunking tag
    :rtype: list
    """
    tagged = pos_tag(sentence)
    return ChunkingCRFModel.Instance().predict(tagged, format)
|
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
# =========================== | ||
# token syntax | ||
# =========================== | ||
# _ row 1 | ||
# / _ row 2 | ||
# / / _ column | ||
# / / / | ||
# T[0,2][0] | ||
# .is_digit | ||
# \_ function | ||
# | ||
# =========================== | ||
# sample tagged sentence | ||
# =========================== | ||
# this A | ||
# is B | ||
# a C | ||
# sample D | ||
# sentence E | ||
# | ||
|
||
import re | ||
|
||
|
||
def text_lower(word):
    """
    Return the lower-cased form of ``word``.

    :param word: text to convert
    :return: result of ``word.lower()``
    """
    lowered = word.lower()
    return lowered
|
||
|
||
def text_istitle(word):
    """
    Tell whether every space-separated part of ``word`` starts title-cased.

    :param word: text to inspect; UTF-8 encoded bytes are decoded first
    :return: ``True`` when the first character of each part satisfies
        ``istitle()``; ``False`` for empty input, for input containing an
        empty part (e.g. consecutive spaces) and for non-text input
    :rtype: bool
    """
    if not word:
        return False
    # On Python 2 ``bytes`` is ``str``, so this matches the original decoding
    # step; on Python 3 it no longer calls the non-existent ``str.decode``.
    if isinstance(word, bytes):
        word = word.decode("utf-8")
    try:
        parts = word.split(" ")
    except AttributeError:
        # Best effort kept from the original: non-text input is "not a title".
        return False
    # part[:1] is "" for an empty part, and "".istitle() is False.
    return all(part[:1].istitle() for part in parts)
|
||
|
||
def apply_function(name, word):
    """
    Apply the feature function registered under ``name`` to ``word``.

    :param str name: ``"lower"`` or ``"istitle"``
    :param word: text handed to the selected function
    :return: whatever the selected function returns
    :raises KeyError: if ``name`` is not a registered function name
    """
    dispatch = {"lower": text_lower, "istitle": text_istitle}
    handler = dispatch[name]
    return handler(word)
|
||
|
||
# Token grammar: T[<index1>(,<index2>)?]([<column>])?(.<function>)?
# Written as raw strings so the regex escapes are not parsed as (invalid)
# string escape sequences; compiled once instead of on every call.
_TOKEN_PATTERN = re.compile(
    r"T\[(?P<index1>\-?\d+)(\,(?P<index2>\-?\d+))?\]"
    r"(\[(?P<column>.*)\])?(\.(?P<function>.*))?"
)


def template2features(sent, i, token, debug=True):
    """
    Evaluate one template token at position ``i`` of a tagged sentence.

    A token such as ``T[0,2][1].lower`` selects rows ``i + 0`` to ``i + 2``
    (inclusive) of column ``1``, joins the values with spaces and passes the
    result to the function ``lower``. The second index, the column (default
    ``0``) and the function are optional.

    :param list sent: rows of the sentence; each row is indexable by column
    :param int i: position of the current row in ``sent``
    :param str token: template token to evaluate
    :param debug: unused; kept so existing callers keep working
    :return: single-element list ``["<token>=<value>"]``; the value is ``BOS``
        when the first index points before the sentence and ``EOS`` when
        either index points past its end
    :rtype: list
    :raises ValueError: if ``token`` does not follow the token grammar
    """
    matched = _TOKEN_PATTERN.match(token)
    if matched is None:
        raise ValueError("invalid feature template token: %r" % (token,))
    column = matched.group("column")
    column = int(column) if column else 0
    index1 = int(matched.group("index1"))
    index2 = matched.group("index2")
    index2 = int(index2) if index2 else None
    func = matched.group("function")

    start = i + index1
    if start < 0:
        return ["%s=BOS" % token]
    if start >= len(sent):
        return ["%s=EOS" % token]
    if index2 is not None:
        stop = i + index2
        if stop >= len(sent):
            return ["%s=EOS" % token]
        # Slice the needed rows only; this also lets range templates use any
        # column, exactly like single-index templates.
        word = " ".join(row[column] for row in sent[start:stop + 1])
    else:
        word = sent[start][column]
    if func is not None:
        result = apply_function(func, word)
    else:
        result = word
    return ["%s=%s" % (token, result)]
|
||
|
||
def word2features(sent, i, template):
    """
    Build the feature list for position ``i`` of a tagged sentence.

    :param list sent: rows of the sentence
    :param int i: position of the current row in ``sent``
    :param template: iterable of template tokens
    :return: features of all tokens, concatenated in template order
    :rtype: list
    """
    return [
        feature
        for token in template
        for feature in template2features(sent, i, token)
    ]
Oops, something went wrong.