Skip to content

Commit

Permalink
adds emoji and smiley support, some refactoring, adds prioritized method calls support
Browse files Browse the repository at this point in the history
  • Loading branch information
s committed Jan 27, 2016
1 parent 490f864 commit c0e78d4
Show file tree
Hide file tree
Showing 6 changed files with 72 additions and 22 deletions.
24 changes: 20 additions & 4 deletions preprocessor/constants.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,30 @@
# -*- coding: utf-8 -*-
"""
preprocessor.constants
~~~~~~~~~~~~
This module includes the constant variables used in Preprocessor
"""
import re

# Method-name prefixes used to discover worker methods via dir() reflection.
PREPROCESS_METHODS_PREFIX = 'preprocess_'
PARSE_METHODS_PREFIX = 'parse_'
# Worker suffixes (without prefix) moved to the front of the run order.
# NOTE(review): presumably URLs are handled first so other patterns do not
# match inside a URL — confirm against the cleaner behaviour.
PRIORITISED_METHODS = ['urls']

class Patterns:
    """Pre-compiled regular expressions used by the cleaner and the parser.

    Patterns are compiled once at import time so the worker methods do not
    pay the compilation cost on every call.
    """

    # Matches http(s)://..., www...., and bare domain/path URLs.  Written as
    # a non-raw unicode literal (regex backslashes doubled, \uXXXX escapes
    # left single) so the same source works on Python 2 and Python 3; the
    # old ``ur'...'`` form is a syntax error on Python 3.
    URL_PATTERN = re.compile(u'(?i)\\b((?:https?://|www\\d{0,3}[.]|[a-z0-9.\\-]+[.][a-z]{2,4}/)(?:[^\\s()<>]+|\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\))+(?:\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\)|[^\\s`!()\\[\\]{};:\'".,<>?\\xab\\xbb\u201c\u201d\u2018\u2019]))')
    HASHTAG_PATTERN = re.compile(r'#\w*')
    MENTION_PATTERN = re.compile(r'@\w*')
    # Tweets may begin with RT (retweet) or FAV (favourite) markers.
    RESERVED_WORDS_PATTERN = re.compile(r'^(RT|FAV)')

    try:
        # UCS-4 (wide) builds: match astral code points directly.
        EMOJIS_PATTERN = re.compile(u'([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])')
    except re.error:
        # UCS-2 (narrow) builds reject the ranges above; match the
        # surrogate-pair encoding of the same emoji blocks instead.
        EMOJIS_PATTERN = re.compile(u'([\u2600-\u27BF])|([\uD83C][\uDF00-\uDFFF])|([\uD83D][\uDC00-\uDE4F])|'
                                    u'([\uD83D][\uDE80-\uDEFF])')

    # Text smileys such as ":)", ";-(", "=D", ":-PP".
    SMILEYS_PATTERN = re.compile(r"(?::|;|=)(?:-)?(?:\)|\(|D|P|S){1,}")

class Functions:
CLEAN=1
Expand Down
20 changes: 14 additions & 6 deletions preprocessor/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,13 @@
"""

import re
from .constants import Patterns
from .utils import Util
from .constants import *

class ParseResult:
urls = None
emojis = None
smileys = None
hashtags = None
mentions = None
reserved_words = None
Expand All @@ -37,11 +39,11 @@ def __init__(self):
def parse(self, tweet_string):
parse_result_obj = ParseResult()

parser_methods = self.u.get_worker_methods(self, 'parse_')
parser_methods = self.u.get_worker_methods(self, PARSE_METHODS_PREFIX)

for a_cleaner_method in parser_methods:
method_to_call = getattr(self, a_cleaner_method)
attr = a_cleaner_method.split('_')[1]
for a_parser_method in parser_methods:
method_to_call = getattr(self, a_parser_method)
attr = a_parser_method.split('_')[1]

items = method_to_call(tweet_string)
setattr(parse_result_obj, attr, items)
Expand All @@ -53,7 +55,7 @@ def parser(self, pattern, string):
items = []

for match_object in re.finditer(pattern, string):
parse_item = ParseItem(match_object.start(), match_object.end(), match_object.group())
parse_item = ParseItem(match_object.start(), match_object.end(), match_object.group().encode('utf-8'))
items.append(parse_item)

if len(items):
Expand All @@ -71,3 +73,9 @@ def parse_mentions(self, tweet_string):
def parse_reserved_words(self, tweet_string):
return self.parser(Patterns.RESERVED_WORDS_PATTERN, tweet_string)

def parse_emojis(self, tweet_string):
    """Return the parse result for every emoji found in the tweet.

    EMOJIS_PATTERN is a unicode pattern, so the utf-8 byte string is
    decoded before matching.
    """
    return self.parser(Patterns.EMOJIS_PATTERN, tweet_string.decode('utf-8'))

def parse_smileys(self, tweet_string):
    """Return the parse result for every smiley (e.g. ':)') in the tweet."""
    return self.parser(Patterns.SMILEYS_PATTERN, tweet_string)
19 changes: 13 additions & 6 deletions preprocessor/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
"""

import re
from .constants import Patterns, Functions
from .constants import *
from .utils import Util

class Preprocess:
Expand All @@ -21,7 +21,7 @@ def __init__(self):

def clean(self, tweet_string, repl):

cleaner_methods = self.u.get_worker_methods(self, 'preprocess_')
cleaner_methods = self.u.get_worker_methods(self, PREPROCESS_METHODS_PREFIX)

for a_cleaner_method in cleaner_methods:
token = self.get_token_string_from_method_name(a_cleaner_method)
Expand All @@ -36,16 +36,23 @@ def clean(self, tweet_string, repl):
return tweet_string

def preprocess_urls(self, tweet_string, repl):
    """Replace every URL in *tweet_string* with *repl*.

    Uses the pre-compiled pattern; the stale duplicate ``re.sub`` call left
    over from the old implementation is removed.
    """
    return Patterns.URL_PATTERN.sub(repl, tweet_string)

def preprocess_hashtags(self, tweet_string, repl):
    """Replace every #hashtag in *tweet_string* with *repl*.

    Uses the pre-compiled pattern; the stale duplicate ``re.sub`` call left
    over from the old implementation is removed.
    """
    return Patterns.HASHTAG_PATTERN.sub(repl, tweet_string)

def preprocess_mentions(self, tweet_string, repl):
    """Replace every @mention in *tweet_string* with *repl*.

    Uses the pre-compiled pattern; the stale duplicate ``re.sub`` call left
    over from the old implementation is removed.
    """
    return Patterns.MENTION_PATTERN.sub(repl, tweet_string)

def preprocess_reserved_words(self, tweet_string, repl):
    """Replace a leading reserved word (RT/FAV) in *tweet_string* with *repl*.

    Uses the pre-compiled pattern; the stale duplicate ``re.sub`` call left
    over from the old implementation is removed.
    """
    return Patterns.RESERVED_WORDS_PATTERN.sub(repl, tweet_string)

def preprocess_emojis(self, tweet_string, repl):
    """Replace every emoji in the tweet with *repl*.

    EMOJIS_PATTERN is a unicode pattern, so the utf-8 byte string is
    decoded before substituting.
    """
    return Patterns.EMOJIS_PATTERN.sub(repl, tweet_string.decode('utf-8'))

def preprocess_smileys(self, tweet_string, repl):
    """Replace every smiley (e.g. ':)', ':-D') in the tweet with *repl*."""
    return Patterns.SMILEYS_PATTERN.sub(repl, tweet_string)

def remove_unneccessary_characters(self, tweet_string):
    """Collapse consecutive whitespace into single spaces and trim the ends."""
    tokens = tweet_string.split()
    return ' '.join(tokens)
Expand Down
9 changes: 9 additions & 0 deletions preprocessor/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
This module includes utility methods which are used in Preprocessor
"""

from .constants import PRIORITISED_METHODS

class Util:

def __init__(self):
Expand All @@ -12,4 +14,11 @@ def __init__(self):
def get_worker_methods(self, object, prefix):
    """Return the names of *object*'s methods that start with *prefix*.

    Methods whose suffix appears in PRIORITISED_METHODS are moved to the
    front (keeping PRIORITISED_METHODS order) so they run first; the rest
    keep dir()'s alphabetical order.
    """
    # list() so this works whether filter() returns a list (Python 2) or
    # an iterator (Python 3, where .remove() would otherwise fail).
    relevant_methods = list(filter(lambda x: x.startswith(prefix), dir(object)))

    prefixed_prioritised_methods = [prefix + m for m in PRIORITISED_METHODS]
    insert_at = 0
    for pri_method in prefixed_prioritised_methods:
        # Skip prioritised names the object does not implement instead of
        # letting list.remove() raise ValueError.
        if pri_method in relevant_methods:
            relevant_methods.remove(pri_method)
            relevant_methods.insert(insert_at, pri_method)
            insert_at += 1

    return relevant_methods
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

setup(
name='tweet-preprocessor',
version='0.2.0',
version='0.3.0',
description='Elegant tweet preprocessing',
long_description=long_description,
author='Said Özcan',
Expand Down
20 changes: 15 additions & 5 deletions tests/test_api.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-

import io
import unittest

Expand All @@ -6,17 +8,17 @@
class PreprocessorTest(unittest.TestCase):

def test_clean(self):
    """clean() strips mentions, hashtags, emojis and URLs from the tweet.

    The stale pre-commit assignment and the contradictory duplicate
    assertEqual (which would always fail first) are removed.
    """
    tweet = "Hello there! @pyistanbul #packathon was awesome 😀. http://packathon.org"
    cleaned_tweet = p.clean(tweet)
    self.assertEqual(cleaned_tweet, 'Hello there! was awesome .')

def test_tokenize(self):
    """tokenize() replaces each entity with its $TOKEN$ placeholder.

    The stale pre-commit assignment and the contradictory duplicate
    assertEqual (which would always fail first) are removed.
    """
    tweet = 'Packathon was a really #nice :) challenging 👌. @packathonorg http://packathon.org'
    tokenized_tweet = p.tokenize(tweet)
    self.assertEqual(tokenized_tweet, 'Packathon was a really $HASHTAG$ $SMILEY$ challenging $EMOJI$. $MENTION$ $URL$')

def test_parse(self):
tweet = 'A tweet with #hashtag @mention and http://github.com/s.'
tweet = 'A tweet with #hashtag :) @mention 😀 and http://github.com/s.'
parsed_tweet = p.parse(tweet)

self.assertIsNotNone(parsed_tweet.urls)
Expand All @@ -29,6 +31,14 @@ def test_parse(self):
self.assertEqual(1, len(parsed_tweet.mentions))

self.assertIsNone(parsed_tweet.reserved_words)

self.assertIsNotNone(parsed_tweet.emojis)
self.assertEqual(1, len(parsed_tweet.emojis))
self.assertEqual("😀", parsed_tweet.emojis[0].match)

self.assertIsNotNone(parsed_tweet.smileys)
self.assertEqual(1, len(parsed_tweet.smileys))
self.assertEqual(":)", parsed_tweet.smileys[0].match)

if __name__ == '__main__':
unittest.main()

0 comments on commit c0e78d4

Please sign in to comment.