Merge pull request #89 from rain1024/chunking

add chunking api
undertheseanlp · May 26, 2017 · ac65ab0 · ac65ab0
2 parents 1f0dd30 + 19a596e
commit ac65ab0
Show file tree

Hide file tree

Showing 22 changed files with 360 additions and 39 deletions.
diff --git a/README.rst b/README.rst
@@ -43,6 +43,7 @@ Usage
 * `1. Corpus <#1-corpus>`_
 * `2. Word Segmentation <#2-word-segmentation>`_
 * `3. POS Tagging <#3-pos-tagging>`_
+* `4. Chunking <#4-chunking>`_
 
 ****************************************
 1. Corpus
@@ -78,7 +79,7 @@ Vietnamese Word Segmentation using Conditional Random Fields
 
 .. code-block:: python
 
-    # -*- coding: utf-8 -*-
+    >>> # -*- coding: utf-8 -*-
     >>> from underthesea import word_sent
     >>> sentence = u"Chúng ta thường nói đến Rau sạch, Rau an toàn để phân biệt với các rau bình thường bán ngoài chợ."
 
@@ -106,7 +107,7 @@ Vietnamese Part of Speech Tagging using Conditional Random Fields
 
 .. code-block:: python
 
-    # -*- coding: utf-8 -*-
+    >>> # -*- coding: utf-8 -*-
     >>> from underthesea import pos_tag
     >>> text = u"Chợ thịt chó nổi tiếng ở TP Hồ Chí Minh bị truy quét"
     >>> pos_tag(text)
@@ -119,11 +120,41 @@ Vietnamese Part of Speech Tagging using Conditional Random Fields
      (u'bị', 'V'),
      (u'truy quét', 'V')]
 
+****************************************
+4. Chunking
+****************************************
+
+.. image:: https://img.shields.io/badge/F1-85.1%25-red.svg
+		:target: https://github.com/magizbox/underthesea.chunking
+
+.. image:: https://img.shields.io/badge/%E2%98%85-can%20beat%20it%3F-blue.svg
+		:target: https://github.com/magizbox/underthesea.chunking
+
+Vietnamese Chunking using Conditional Random Fields
+
+* `Chunking API <https://magizbox-underthesea.readthedocs-hosted.com/en/latest/api.html#chunking-package>`_
+* `Chunking Experiences <https://github.com/magizbox/underthesea.chunking>`_
+
+.. code-block:: python
+
+    >>> # -*- coding: utf-8 -*-
+    >>> from underthesea import chunk
+    >>> text = u"Bác sĩ bây giờ có thể thản nhiên báo tin bệnh nhân bị ung thư?"
+    >>> chunk(text)
+    [(u'Bác sĩ', 'N', 'B-NP'),
+     (u'bây giờ', 'P', 'I-NP'),
+     (u'có thể', 'R', 'B-VP'),
+     (u'thản nhiên', 'V', 'I-VP'),
+     (u'báo tin', 'N', 'B-NP'),
+     (u'bệnh nhân', 'N', 'I-NP'),
+     (u'bị', 'V', 'B-VP'),
+     (u'ung thư', 'N', 'I-VP'),
+     (u'?', 'CH', 'O')]
+
 Upcoming Features
 ----------------------------------------
 
 * Word Representation (`Word Representation Experiences <https://github.com/magizbox/underthesea.word_representation>`_)
-* Chunking (Experiences)
 * Dependency Parsing (Experiences)
 * Named Entity Recognition
 * Sentiment Analysis

diff --git a/docs/api.rst b/docs/api.rst
@@ -94,7 +94,7 @@ API
 
 .. code-block:: python
 
-    # -*- coding: utf-8 -*-
+    >>> # -*- coding: utf-8 -*-
     >>> from underthesea import word_sent
     >>> sentence = u"Chúng ta thường nói đến Rau sạch , Rau an toàn để phân biệt với các rau bình thường bán ngoài chợ ."
 
@@ -118,7 +118,7 @@ API
 
 .. code-block:: python
 
-    # -*- coding: utf-8 -*-
+    >>> # -*- coding: utf-8 -*-
     >>> from underthesea import pos_tag
     >>> text = u"Chợ thịt chó nổi tiếng ở TP Hồ Chí Minh bị truy quét"
     >>> pos_tag(text)
@@ -130,3 +130,30 @@ API
      (u'TP HCM', 'Np'),
      (u'bị', 'V'),
      (u'truy quét', 'V')]
+
+:mod:`chunking` Package
+-------------------------
+
+.. py:function:: underthesea.chunk(sentence)
+
+    Chunk a sentence into phrases.
+
+    :param unicode sentence: raw sentence
+    :return: list of ``(word, POS tag, chunking tag)`` tuples
+    :rtype: list
+
+.. code-block:: python
+
+	>>> # -*- coding: utf-8 -*-
+	>>> from underthesea import chunk
+	>>> text = u"Bác sĩ bây giờ có thể thản nhiên báo tin bệnh nhân bị ung thư?"
+	>>> chunk(text)
+	[(u'Bác sĩ', 'N', 'B-NP'),
+	 (u'bây giờ', 'P', 'I-NP'),
+	 (u'có thể', 'R', 'B-VP'),
+	 (u'thản nhiên', 'V', 'I-VP'),
+	 (u'báo tin', 'N', 'B-NP'),
+	 (u'bệnh nhân', 'N', 'I-NP'),
+	 (u'bị', 'V', 'B-VP'),
+	 (u'ung thư', 'N', 'I-VP'),
+	 (u'?', 'CH', 'O')]
diff --git a/docs/usage.rst b/docs/usage.rst
@@ -40,3 +40,22 @@ POS Tagging
      (u'bị', 'V'),
      (u'truy quét', 'V')]
 
+Chunking
+-------------------------
+
+.. code-block:: python
+
+    >>> # -*- coding: utf-8 -*-
+    >>> from underthesea import chunk
+    >>> text = u"Bác sĩ bây giờ có thể thản nhiên báo tin bệnh nhân bị ung thư?"
+    >>> chunk(text)
+    [(u'Bác sĩ', 'N', 'B-NP'),
+     (u'bây giờ', 'P', 'I-NP'),
+     (u'có thể', 'R', 'B-VP'),
+     (u'thản nhiên', 'V', 'I-VP'),
+     (u'báo tin', 'N', 'B-NP'),
+     (u'bệnh nhân', 'N', 'I-NP'),
+     (u'bị', 'V', 'B-VP'),
+     (u'ung thư', 'N', 'I-VP'),
+     (u'?', 'CH', 'O')]
+
diff --git a/setup.py b/setup.py
@@ -20,7 +20,7 @@
 
 setup(
     name='underthesea',
-    version='1.0.19',
+    version='1.0.20',
     description="Vietnamese NLP Toolkit",
     long_description=readme + '\n\n' + history,
     author="Vu Anh",

diff --git a/tests/chunking/samples/1.txt b/tests/chunking/samples/1.txt
@@ -0,0 +1,16 @@
+"	CH	O
+Chỗ	N	B-NP
+này	P	I-NP
+xưa	N	I-NP
+là	V	B-VP
+đồn bót	N	B-NP
+nè	T	I-NP
+!	CH	O
+Kia	P	B-NP
+là	V	B-VP
+những	L	B-NP
+ngôi nhà	N	I-NP
+vách	N	I-NP
+đất	N	I-NP
+.	CH	O
+
diff --git a/tests/chunking/samples/2.txt b/tests/chunking/samples/2.txt
@@ -0,0 +1,12 @@
+Thủ tướng	N	B-NP
+:	CH	I-NP
+Những	L	I-NP
+tồn tại	N	I-NP
+đại biểu	N	I-NP
+Quốc hội	N	I-NP
+chỉ	R	B-VP
+ra	V	I-VP
+,	CH	O
+tôi	P	B-NP
+rất	R	B-VP
+thấm thía	A	O
diff --git a/tests/chunking/samples/3.txt b/tests/chunking/samples/3.txt
@@ -0,0 +1,9 @@
+Bác sĩ	N	B-NP
+bây giờ	P	I-NP
+có thể	R	B-VP
+thản nhiên	V	I-VP
+báo tin	N	B-NP
+bệnh nhân	N	I-NP
+bị	V	B-VP
+ung thư	N	I-VP
+?	CH	O
diff --git a/tests/chunking/samples/4.txt b/tests/chunking/samples/4.txt
@@ -0,0 +1,10 @@
+"	CH	B-NP
+Bộ ba	N	I-NP
+quyền lực	N	I-NP
+"	CH	I-NP
+phía	N	I-NP
+sau	A	I-NP
+chương trình	N	I-NP
+tên lửa	N	I-NP
+của	E	B-PP
+Triều Tiên	Np	B-NP
diff --git a/tests/chunking/samples/5.txt b/tests/chunking/samples/5.txt
@@ -0,0 +1,8 @@
+Dàn	N	B-NP
+sao	P	I-NP
+MU	Np	I-NP
+tưng bừng	A	B-VP
+khoe	V	I-VP
+cúp	N	B-NP
+vô địch	N	I-NP
+Europa League	Np	O
diff --git a/tests/chunking/test_chunking.py b/tests/chunking/test_chunking.py
@@ -0,0 +1,50 @@
+# -*- coding: utf-8 -*-
+from os import listdir
+from unittest import TestCase, skip
+from underthesea import chunk
+from os.path import dirname, join
+
+samples_dir = join(dirname(__file__), "samples")
+
+
+def load_input(input_file):
+    content = [text.split("\t")[0].decode("utf-8") for text in open(input_file, "r").read().strip().split("\n")]
+    content = u" ".join(content)
+    return content
+
+
+def load_output(input_file):
+    lines = [text.split("\t") for text in open(input_file, "r").read().strip().split("\n")]
+    output = []
+    for item in lines:
+        word, pos_tag, chunk_tag = item
+        output.append((word.decode("utf-8"), pos_tag, chunk_tag))
+    return output
+
+
+def save_temp(id, output):
+    temp_file = join(samples_dir, "%s.actual" % id)
+    content = u"\n".join([u"\t".join(item) for item in output])
+    open(temp_file, "w").write(content.encode("utf-8"))
+
+
+class TestChunking(TestCase):
+    def test_simple_cases(self):
+        sentence = u""
+        actual = chunk(sentence)
+        expected = []
+        self.assertEqual(actual, expected)
+
+    def test_accuracy(self):
+        test_dir = join(dirname(__file__), "samples")
+        files = listdir(test_dir)
+        ids = [f.split(".")[0] for f in files]
+        for id in ids:
+            file = join(test_dir, "%s.txt" % id)
+            sentence = load_input(file)
+            actual = chunk(sentence)
+            expected = load_output(file)
+            if actual != expected:
+                print("Fail {}".format(id))
+                save_temp(id, actual)
+            self.assertEqual(actual, expected)
diff --git a/tests/word_sent/samples/accuracy/17.in b/tests/word_sent/samples/accuracy/17.in
@@ -0,0 +1 @@
+Hãy tham dự Cuộc thi viết phóng sự - ký sự báo Tuổi Trẻ 2004
diff --git a/tests/word_sent/samples/accuracy/17.out b/tests/word_sent/samples/accuracy/17.out
@@ -0,0 +1,11 @@
+Hãy
+tham dự
+Cuộc thi
+viết
+phóng sự
+-
+ký
+sự
+báo
+Tuổi Trẻ
+2004
diff --git a/underthesea/__init__.py b/underthesea/__init__.py
@@ -6,6 +6,7 @@
 
 from word_sent.word_sent import word_sent
 from pos_tag.pos_tag import pos_tag
+from chunking.chunk import chunk
 
 
 def info(version):

diff --git a/underthesea/chunking/__init__.py b/underthesea/chunking/__init__.py
diff --git a/underthesea/chunking/chunk.py b/underthesea/chunking/chunk.py
@@ -0,0 +1,17 @@
+from underthesea import pos_tag
+from underthesea.chunking.model import ChunkingCRFModel
+
+
+def chunk(sentence, format=None):
+    """
+    chunk a sentence to phrases 
+    
+    :param unicode sentence: raw sentence
+    :return: list of tuple with word, pos tag, chunking tag 
+    :rtype: list 
+    """
+    sentence = pos_tag(sentence)
+    crf_model = ChunkingCRFModel.Instance()
+    result = crf_model.predict(sentence, format)
+    return result
+
diff --git a/underthesea/chunking/chunking_crf_v1.model b/underthesea/chunking/chunking_crf_v1.model
diff --git a/underthesea/chunking/feature.py b/underthesea/chunking/feature.py
@@ -0,0 +1,88 @@
+# ===========================
+# token syntax
+# ===========================
+#         _ row 1
+#        /  _ row 2
+#       /  /  _ column
+#      /  /  /
+#    T[0,2][0]
+#          .is_digit
+#            \_ function
+#
+# ===========================
+# sample tagged sentence
+# ===========================
+# this     A
+# is       B
+# a        C
+# sample   D
+# sentence E
+#
+
+import re
+
+
+def text_lower(word):
+    return word.lower()
+
+
+def text_istitle(word):
+    if len(word) == 0:
+        return False
+    if type(word) == str:
+        word = word.decode("utf-8")
+    try:
+        titles = [s[0] for s in word.split(" ")]
+        for token in titles:
+            if token[0].istitle() is False:
+                return False
+        return True
+    except:
+        return False
+
+
+def apply_function(name, word):
+    functions = {
+        "lower": text_lower,
+        "istitle": text_istitle
+    }
+    return functions[name](word)
+
+
+def template2features(sent, i, token, debug=True):
+    """
+    :type token: object
+    """
+    columns = [
+        [t[0] for t in sent],
+        [t[1] for t in sent]
+    ]
+    matched = re.match("T\[(?P<index1>\-?\d+)(\,(?P<index2>\-?\d+))?\](\[(?P<column>.*)\])?(\.(?P<function>.*))?", token)
+    column = matched.group("column")
+    column = int(column) if column else 0
+    index1 = int(matched.group("index1"))
+    index2 = matched.group("index2")
+    index2 = int(index2) if index2 else None
+    func = matched.group("function")
+    if i + index1 < 0:
+        return ["%s=BOS" % token]
+    if i + index1 >= len(sent):
+        return ["%s=EOS" % token]
+    if index2 is not None:
+        if i + index2 >= len(sent):
+            return ["%s=EOS" % token]
+        word = " ".join(columns[column][i + index1: i + index2 + 1])
+    else:
+        word = sent[i + index1][column]
+    if func is not None:
+        result = apply_function(func, word)
+    else:
+        result = word
+    return ["%s=%s" % (token, result)]
+
+
+def word2features(sent, i, template):
+    features = []
+    for token in template:
+        features.extend(template2features(sent, i, token))
+    return features
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Hãy tham dự Cuộc thi viết phóng sự - ký sự báo Tuổi Trẻ 2004