Skip to content

Commit

Permalink
Merge pull request #89 from rain1024/chunking
Browse files Browse the repository at this point in the history
add chunking api
  • Loading branch information
Brother Rain authored May 26, 2017
2 parents 1f0dd30 + 19a596e commit ac65ab0
Show file tree
Hide file tree
Showing 22 changed files with 360 additions and 39 deletions.
37 changes: 34 additions & 3 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ Usage
* `1. Corpus <#1-corpus>`_
* `2. Word Segmentation <#2-word-segmentation>`_
* `3. POS Tagging <#3-pos-tagging>`_
* `4. Chunking <#4-chunking>`_

****************************************
1. Corpus
Expand Down Expand Up @@ -78,7 +79,7 @@ Vietnamese Word Segmentation using Conditional Random Fields

.. code-block:: python
# -*- coding: utf-8 -*-
>>> # -*- coding: utf-8 -*-
>>> from underthesea import word_sent
>>> sentence = u"Chúng ta thường nói đến Rau sạch, Rau an toàn để phân biệt với các rau bình thường bán ngoài chợ."
Expand Down Expand Up @@ -106,7 +107,7 @@ Vietnamese Part of Speech Tagging using Conditional Random Fields

.. code-block:: python
# -*- coding: utf-8 -*-
>>> # -*- coding: utf-8 -*-
>>> from underthesea import pos_tag
>>> text = u"Chợ thịt chó nổi tiếng ở TP Hồ Chí Minh bị truy quét"
>>> pos_tag(text)
Expand All @@ -119,11 +120,41 @@ Vietnamese Part of Speech Tagging using Conditional Random Fields
(u'bị', 'V'),
(u'truy quét', 'V')]
****************************************
4. Chunking
****************************************

.. image:: https://img.shields.io/badge/F1-85.1%25-red.svg
:target: https://github.com/magizbox/underthesea.chunking

.. image:: https://img.shields.io/badge/%E2%98%85-can%20beat%20it%3F-blue.svg
:target: https://github.com/magizbox/underthesea.chunking

Vietnamese Chunking using Conditional Random Fields

* `Chunking API <https://magizbox-underthesea.readthedocs-hosted.com/en/latest/api.html#chunking-package>`_
* `Chunking Experiences <https://github.com/magizbox/underthesea.chunking>`_

.. code-block:: python
>>> # -*- coding: utf-8 -*-
>>> from underthesea import chunk
>>> text = u"Bác sĩ bây giờ có thể thản nhiên báo tin bệnh nhân bị ung thư?"
>>> chunk(text)
[(u'Bác sĩ', 'N', 'B-NP'),
(u'bây giờ', 'P', 'I-NP'),
(u'có thể', 'R', 'B-VP'),
(u'thản nhiên', 'V', 'I-VP'),
(u'báo tin', 'N', 'B-NP'),
(u'bệnh nhân', 'N', 'I-NP'),
(u'bị', 'V', 'B-VP'),
(u'ung thư', 'N', 'I-VP'),
(u'?', 'CH', 'O')]
Upcoming Features
----------------------------------------

* Word Representation (`Word Representation Experiences <https://github.com/magizbox/underthesea.word_representation>`_)
* Chunking (Experiences)
* Dependency Parsing (Experiences)
* Named Entity Recognition
* Sentiment Analysis
Expand Down
31 changes: 29 additions & 2 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ API

.. code-block:: python
# -*- coding: utf-8 -*-
>>> # -*- coding: utf-8 -*-
>>> from underthesea import word_sent
>>> sentence = u"Chúng ta thường nói đến Rau sạch , Rau an toàn để phân biệt với các rau bình thường bán ngoài chợ ."
Expand All @@ -118,7 +118,7 @@ API

.. code-block:: python
# -*- coding: utf-8 -*-
>>> # -*- coding: utf-8 -*-
>>> from underthesea import pos_tag
>>> text = u"Chợ thịt chó nổi tiếng ở TP Hồ Chí Minh bị truy quét"
>>> pos_tag(text)
Expand All @@ -130,3 +130,30 @@ API
(u'TP HCM', 'Np'),
(u'bị', 'V'),
(u'truy quét', 'V')]
:mod:`chunking` Package
-------------------------

.. py:function:: underthesea.chunk(sentence)
Chunk a sentence into phrases.

:param unicode sentence: raw sentence
:return: list of (word, POS tag, chunking tag) tuples
:rtype: list

.. code-block:: python
>>> # -*- coding: utf-8 -*-
>>> from underthesea import chunk
>>> text = u"Bác sĩ bây giờ có thể thản nhiên báo tin bệnh nhân bị ung thư?"
>>> chunk(text)
[(u'Bác sĩ', 'N', 'B-NP'),
(u'bây giờ', 'P', 'I-NP'),
(u'có thể', 'R', 'B-VP'),
(u'thản nhiên', 'V', 'I-VP'),
(u'báo tin', 'N', 'B-NP'),
(u'bệnh nhân', 'N', 'I-NP'),
(u'bị', 'V', 'B-VP'),
(u'ung thư', 'N', 'I-VP'),
(u'?', 'CH', 'O')]
19 changes: 19 additions & 0 deletions docs/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,22 @@ POS Tagging
(u'bị', 'V'),
(u'truy quét', 'V')]
Chunking
-------------------------

.. code-block:: python
>>> # -*- coding: utf-8 -*-
>>> from underthesea import chunk
>>> text = u"Bác sĩ bây giờ có thể thản nhiên báo tin bệnh nhân bị ung thư?"
>>> chunk(text)
[(u'Bác sĩ', 'N', 'B-NP'),
(u'bây giờ', 'P', 'I-NP'),
(u'có thể', 'R', 'B-VP'),
(u'thản nhiên', 'V', 'I-VP'),
(u'báo tin', 'N', 'B-NP'),
(u'bệnh nhân', 'N', 'I-NP'),
(u'bị', 'V', 'B-VP'),
(u'ung thư', 'N', 'I-VP'),
(u'?', 'CH', 'O')]
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

setup(
name='underthesea',
version='1.0.19',
version='1.0.20',
description="Vietnamese NLP Toolkit",
long_description=readme + '\n\n' + history,
author="Vu Anh",
Expand Down
16 changes: 16 additions & 0 deletions tests/chunking/samples/1.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
" CH O
Chỗ N B-NP
này P I-NP
xưa N I-NP
là V B-VP
đồn bót N B-NP
nè T I-NP
! CH O
Kia P B-NP
là V B-VP
những L B-NP
ngôi nhà N I-NP
vách N I-NP
đất N I-NP
. CH O

12 changes: 12 additions & 0 deletions tests/chunking/samples/2.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
Thủ tướng N B-NP
: CH I-NP
Những L I-NP
tồn tại N I-NP
đại biểu N I-NP
Quốc hội N I-NP
chỉ R B-VP
ra V I-VP
, CH O
tôi P B-NP
rất R B-VP
thấm thía A O
9 changes: 9 additions & 0 deletions tests/chunking/samples/3.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
Bác sĩ N B-NP
bây giờ P I-NP
có thể R B-VP
thản nhiên V I-VP
báo tin N B-NP
bệnh nhân N I-NP
bị V B-VP
ung thư N I-VP
? CH O
10 changes: 10 additions & 0 deletions tests/chunking/samples/4.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
" CH B-NP
Bộ ba N I-NP
quyền lực N I-NP
" CH I-NP
phía N I-NP
sau A I-NP
chương trình N I-NP
tên lửa N I-NP
của E B-PP
Triều Tiên Np B-NP
8 changes: 8 additions & 0 deletions tests/chunking/samples/5.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
Dàn N B-NP
sao P I-NP
MU Np I-NP
tưng bừng A B-VP
khoe V I-VP
cúp N B-NP
vô địch N I-NP
Europa League Np O
50 changes: 50 additions & 0 deletions tests/chunking/test_chunking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# -*- coding: utf-8 -*-
import io
from os import listdir
from os.path import dirname, join
from unittest import TestCase, skip

from underthesea import chunk

samples_dir = join(dirname(__file__), "samples")


def load_input(input_file):
    """Rebuild the raw sentence stored in a tab-separated sample file.

    Every non-blank line of ``input_file`` is split on tabs and only the
    first column (the word) is kept.

    :param str input_file: path to a UTF-8 encoded sample file
    :return: the first-column words joined by single spaces
    :rtype: unicode
    """
    # io.open decodes once at the I/O boundary on both Python 2 and 3, and
    # the context manager closes the handle that used to be leaked.
    with io.open(input_file, "r", encoding="utf-8") as sample:
        lines = sample.read().strip().split(u"\n")
    # Blank lines carry no word; skipping them avoids doubled spaces.
    return u" ".join(line.split(u"\t")[0] for line in lines if line)


def load_output(input_file):
    """Load the expected chunking result from a tab-separated sample file.

    Each non-blank line must hold exactly three tab-separated fields:
    word, POS tag and chunk tag.

    :param str input_file: path to a UTF-8 encoded sample file
    :return: list of ``(word, pos_tag, chunk_tag)`` tuples in file order;
        an empty list when the file has no content
    :rtype: list
    :raises ValueError: if a non-blank line does not have three fields
    """
    # io.open decodes once at the I/O boundary on both Python 2 and 3, and
    # the context manager closes the handle that used to be leaked.
    with io.open(input_file, "r", encoding="utf-8") as sample:
        lines = sample.read().strip().split(u"\n")
    output = []
    for line in lines:
        if not line:
            # An empty file (or stray blank line) contributes no token.
            continue
        # Unpacking enforces the three-column layout.
        word, pos, chunk_tag = line.split(u"\t")
        output.append((word, pos, chunk_tag))
    return output


def save_temp(id, output):
    """Dump an actual chunking result next to the samples for inspection.

    The result is written to ``<samples_dir>/<id>.actual`` as UTF-8 text,
    one token per line with its fields separated by tabs.

    :param id: sample identifier used to build the file name
    :param list output: sequence of tuples of text fields to write
    """
    temp_file = join(samples_dir, "%s.actual" % id)
    content = u"\n".join(u"\t".join(item) for item in output)
    # io.open encodes on write (Python 2 and 3 alike) and the context
    # manager guarantees the dump is flushed and closed.
    with io.open(temp_file, "w", encoding="utf-8") as dump:
        dump.write(content)


class TestChunking(TestCase):
    """Regression tests for ``underthesea.chunk``."""

    def test_simple_cases(self):
        """An empty sentence produces an empty result list."""
        sentence = u""
        actual = chunk(sentence)
        expected = []
        self.assertEqual(actual, expected)

    def test_accuracy(self):
        """Compare ``chunk`` output with every ``<id>.txt`` sample file."""
        test_dir = join(dirname(__file__), "samples")
        suffix = ".txt"
        # Only "*.txt" files are samples: save_temp() writes "<id>.actual"
        # dumps into this same directory, and listing them would produce
        # duplicate ids. Sorting keeps the run order deterministic.
        sample_ids = sorted(
            name[:-len(suffix)]
            for name in listdir(test_dir)
            if name.endswith(suffix)
        )
        for sample_id in sample_ids:
            sample_file = join(test_dir, sample_id + suffix)
            sentence = load_input(sample_file)
            actual = chunk(sentence)
            expected = load_output(sample_file)
            if actual != expected:
                # Keep the failing output on disk so it can be diffed
                # against the sample file.
                print("Fail {}".format(sample_id))
                save_temp(sample_id, actual)
            self.assertEqual(actual, expected)
1 change: 1 addition & 0 deletions tests/word_sent/samples/accuracy/17.in
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Hãy tham dự Cuộc thi viết phóng sự - ký sự báo Tuổi Trẻ 2004
11 changes: 11 additions & 0 deletions tests/word_sent/samples/accuracy/17.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
Hãy
tham dự
Cuộc thi
viết
phóng sự
-
sự
báo
Tuổi Trẻ
2004
1 change: 1 addition & 0 deletions underthesea/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from word_sent.word_sent import word_sent
from pos_tag.pos_tag import pos_tag
from chunking.chunk import chunk


def info(version):
Expand Down
Empty file.
17 changes: 17 additions & 0 deletions underthesea/chunking/chunk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from underthesea import pos_tag
from underthesea.chunking.model import ChunkingCRFModel


def chunk(sentence, format=None):
    """
    Tag the words of a sentence with chunk labels.

    The sentence is first passed through ``pos_tag``; the tagged result is
    then handed to the shared ``ChunkingCRFModel`` instance for prediction.

    :param unicode sentence: raw sentence
    :param format: forwarded unchanged to ``ChunkingCRFModel.predict``
        (NOTE(review): accepted values are not visible here -- confirm)
    :return: list of tuple with word, pos tag, chunking tag
    :rtype: list
    """
    tagged_tokens = pos_tag(sentence)
    model = ChunkingCRFModel.Instance()
    return model.predict(tagged_tokens, format)

Binary file added underthesea/chunking/chunking_crf_v1.model
Binary file not shown.
88 changes: 88 additions & 0 deletions underthesea/chunking/feature.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# ===========================
# token syntax
# ===========================
#           _ row 1
#          / _ row 2
#         / /  _ column
#        / /  /
#     T[0,2][0]
#              .is_digit
#                \_ function
#
# ===========================
# sample tagged sentence
# ===========================
# this A
# is B
# a C
# sample D
# sentence E
#

import re


def text_lower(word):
    """Return ``word`` converted to lower case via its ``lower()`` method."""
    lowered = word.lower()
    return lowered


def text_istitle(word):
    """Tell whether every space-separated part of ``word`` is capitalised.

    :param word: text to inspect; byte strings are decoded as UTF-8 first
    :return: ``True`` when the first character of every part separated by
        a single space satisfies ``istitle()``; ``False`` for empty input
        or when any part is empty (leading, trailing or doubled spaces)
    :rtype: bool
    """
    if not word:
        return False
    # ``bytes`` is ``str`` on Python 2, so this decodes exactly what the
    # old ``type(word) == str`` check did, without calling ``.decode`` on
    # Python 3 text (which has no such method).
    if isinstance(word, bytes):
        word = word.decode("utf-8")
    # An empty part has no first character and therefore is not
    # title-cased; testing for it replaces the former bare ``except``.
    return all(
        bool(part) and part[0].istitle()
        for part in word.split(" ")
    )


def apply_function(name, word):
    """Apply the text function registered under ``name`` to ``word``.

    Known names are ``"lower"`` and ``"istitle"``; any other name raises
    ``KeyError`` from the lookup.
    """
    dispatch = dict(lower=text_lower, istitle=text_istitle)
    handler = dispatch[name]
    return handler(word)


# Grammar of a template token: T[index1(,index2)?]([column])?(.function)?
# Compiled once at import time; raw strings keep the backslashes literal.
_TEMPLATE_TOKEN = re.compile(
    r"T\[(?P<index1>\-?\d+)(\,(?P<index2>\-?\d+))?\]"
    r"(\[(?P<column>.*)\])?(\.(?P<function>.*))?"
)


def template2features(sent, i, token, debug=True):
    """Evaluate one template token at position ``i`` of a tagged sentence.

    :param sent: sequence of rows (e.g. ``(word, tag)`` tuples)
    :param int i: index of the current row in ``sent``
    :param str token: template such as ``T[0]``, ``T[-1,0]``, ``T[0][1]``
        or ``T[0].lower``; indices are offsets relative to ``i``
    :param debug: unused; kept so existing callers keep working
    :type token: object
    :return: a one-element list ``["<token>=<value>"]`` where the value is
        ``BOS`` when the first offset falls before the sentence, ``EOS``
        when an offset falls past its end, otherwise the selected text
        (optionally transformed by the named function)
    :rtype: list
    :raises ValueError: if ``token`` does not follow the template grammar
    """
    matched = _TEMPLATE_TOKEN.match(token)
    if matched is None:
        raise ValueError("invalid template token: %r" % (token,))
    column = matched.group("column")
    column = int(column) if column else 0
    index1 = int(matched.group("index1"))
    index2 = matched.group("index2")
    index2 = int(index2) if index2 else None
    func = matched.group("function")

    start = i + index1
    if start < 0:
        return ["%s=BOS" % token]
    if start >= len(sent):
        return ["%s=EOS" % token]
    if index2 is not None:
        stop = i + index2
        if stop >= len(sent):
            return ["%s=EOS" % token]
        # Read the requested column straight from the needed rows so that
        # ranges work for any column, exactly like the single-index case,
        # and no per-call copy of the whole sentence is built.
        word = " ".join(row[column] for row in sent[start:stop + 1])
    else:
        word = sent[start][column]
    result = word if func is None else apply_function(func, word)
    return ["%s=%s" % (token, result)]


def word2features(sent, i, template):
    """Collect the features of row ``i`` for every token in ``template``.

    Each token is evaluated with ``template2features`` and the resulting
    lists are concatenated in template order.
    """
    return [
        feature
        for token in template
        for feature in template2features(sent, i, token)
    ]
Loading

0 comments on commit ac65ab0

Please sign in to comment.