From a5f4010033dffa1e93a0a0f9d8b4a0a2c87920c2 Mon Sep 17 00:00:00 2001 From: ranzaka Date: Fri, 21 Jun 2024 14:10:31 +0530 Subject: [PATCH] Added text romanization logics and added proper dockstrings --- .idea/.gitignore | 3 + .../inspectionProfiles/profiles_settings.xml | 6 + .idea/misc.xml | 10 + .idea/modules.xml | 8 + .idea/sinlib.iml | 14 + .idea/vcs.xml | 6 + README.md | 19 +- data/char_map.json | 1 + data/vocab_map.json | 2276 +++++++++++++++++ examples/examples.ipynb | 340 ++- pyproject.toml | 2 +- src/sinlib/__init__.py | 4 +- src/sinlib/romanize.py | 43 + src/sinlib/tokenizer.py | 152 +- src/sinlib/utils/chars.py | 315 ++- src/sinlib/utils/preprocessing.py | 113 +- 16 files changed, 3119 insertions(+), 193 deletions(-) create mode 100644 .idea/.gitignore create mode 100644 .idea/inspectionProfiles/profiles_settings.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/sinlib.iml create mode 100644 .idea/vcs.xml create mode 100644 data/char_map.json create mode 100644 data/vocab_map.json create mode 100644 src/sinlib/romanize.py diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..26d3352 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,3 @@ +# Default ignored files +/shelf/ +/workspace.xml diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..8596b18 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,10 @@ + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..5d481c1 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/sinlib.iml b/.idea/sinlib.iml new file 
mode 100644 index 0000000..ee28fd3 --- /dev/null +++ b/.idea/sinlib.iml @@ -0,0 +1,14 @@ + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..35eb1dd --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/README.md b/README.md index b7a1c66..2e5fb41 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Sinlib (Buggy alpha version) +# Sinlib ![Alt text](sinlib.png) @@ -29,14 +29,27 @@ encoding = tokenizer("මේ අතර, පෙබරවාරි මාසයේ [tokenizer.token_id_to_token_map[id] for id in encoding] ['මේ', ' ', 'අ', 'ත', 'ර', ',', ' ', 'පෙ', 'බ', 'ර', 'වා', 'රි', ' ', 'මා', 'ස', 'යේ', ' ', 'ප', 'ළ', 'මු'] ``` + 02. Preprocessor ```python sent = ['මෙය සිංහල වාක්‍යක්', 'මෙය සිංහල වාක්‍යක් සමග english character කීපයක්','This is complete english sentence'] print(sent) -['මෙය සිංහල වාක්\u200dයක්', 'මෙය සිංහල වාක්\u200dයක් සමග english character කීපයක්', 'This is complete english sentence'] +#['මෙය සිංහල වාක්\u200dයක්', 'මෙය සිංහල වාක්\u200dයක් සමග english character කීපයක්', 'This is #complete english sentence'] from sinlib.preprocessing import get_sinhala_character_ratio get_sinhala_character_ratio(sent) -[0.9, 0.46875, 0.0] +#[0.9, 0.46875, 0.0] +``` + +03. 
Sinhala Romanizer + ```python +texts = ["hello, මේ මාසයේ ගත වූ දින 15ක කාලය තුළ කොළඹ නගරය ආශ්‍රිත ව", "මෑතකාලීන ව රට මුහුණ දුන් අභියෝගාත්මකම ආර්ථික කාරණාව ණය ප්‍රතිව්‍යුගතකරණය බව මුදල් රාජ්‍ය අමාත්‍ය ආචාර්ය රංජිත් සියඹ$$$ mahatha see more****"] + +from sinlib import Romanizer + +romanizer = Romanizer(char_mapper_fp = None, tokenizer_vocab_path = None) +romanizer(texts) +#['hello, me masaye gatha wu dina 15ka kalaya thula kolaba nagaraya ashritha wa', +# 'methakaleena wa rata muhuna dun abhiyogathmakama arthika karanawa naya prathiwyugathakaranaya #bawa mudal rajya amathya acharya ranjith siyaba$$$ mahatha see more****'] ``` diff --git a/data/char_map.json b/data/char_map.json new file mode 100644 index 0000000..96e71af --- /dev/null +++ b/data/char_map.json @@ -0,0 +1 @@ +{"\u0dc6\u0dd2": "fi", "\u0db6\u0dd8": "bru", "\u0d86\u0dd9": "aa", "\u0d9f\u0dd4": "gu", "\u0dc1\u0dd9": "she", "\u0dbb\u0dd8": "ru", "\u0daa": "ta", "\u0db4\u0d83": "ph", "\u0dab\u0dd6": "nu", "\u0db4\u0dd3": "pe", "\u0dc6\u0dcf": "fa", "\u0d8c": "u", "\u0dab": "na", "\u0daf\u0ddb": "dai", "\u0dc4\u0dd8": "hur", "\u0daf\u0dd1": "de", "\u0d9d\u0dd9": "ghe", "\u0dbd\u0dd9\u0dcf": "lo", "\u0dbd\u0d82": "lan", "\u0dc5": "la", "\u0dc0\u0dd0": "wa", "\u0d9a\u0d82": "kan", "\u0d87\u0dd4": "e", "\u0dab\u0dd8": "nru", "\u0db4\u0dd1": "pe", "\u0d9a\u0ddb": "kai", "\u0d85\u0d83": "a", "\u0d9c\u0dd9\u0dcf": "go", "\u0dba\u0d83": "yan", "\u0dba\u0dd9": "ye", "\u0db1\u0dd0": "ne", "\u0dbb": "ra", "\u0da1\u0dd9": "che", "\u0db6\u0dd1": "be", "\u0d9c\u0dd2": "gi", "\u0dae\u0dd9\u0dcf": "tho", "\u0d95": "o", "\u0da7\u0dd9\u0dcf": "to", "\u0dae\u0dd9": "the", "\u0da0\u0dd3": "chi", "\u0da9\u0dd4": "du", "\u0db6\u0dd2": "bi", "\u0d8b": "u", "\u0d9c\u0df2": "gru", "\u0dbd\u0dd4": "lu", "\u0dc2\u0dd8": "shru", "\u0dc2\u0dd3": "shi", "\u0da8\u0d82": "tan", "\u0d91\u0dd0": "e", "\u0dbd\u0dd0": "la", "\u0dc3\u0dd9\u0ddf": "sau", "\u0d9a\u0dd9": "ke", "\u0d9c\u0dd3": "ge", "\u0db5\u0dd4": "pu", "\u0d87\u0dd0": "e", 
"\u0d9f\u0dd0": "ge", "\u0db8\u0dd6": "mu", "\u0d9c\u0dd0": "ge", "\u0dae": "tha", "\u0d9f\u0dd9\u0dcf": "go", "\u0da4\u0dd4": "du", "\u0db1\u0d82": "nam", "\u0d91\u0ddb": "e", "\u0da7\u0dd3": "ti", "\u0daf\u0dd3": "di", "\u0d9f\u0dd2": "gi", "\u0da7\u0dd8": "tru", "\u0dac\u0dd2": "di", "\u0db5\u0ddb": "pi", "\u0db8\u0dcf": "ma", "\u0db7\u0dd4": "bhu", "\u0db3\u0dd0": "dhe", "\u0da8\u0dd9\u0dcf": "tho", "\u0d9c\u0dd9\u0ddf": "gau", "\u0daf\u0dd9": "de", "\u0da8": "ta", "\u0d88\u0d82": "een", "\u0d9d\u0dcf": "gha", "\u0da9\u0dcf": "da", "\u0dc3\u0dd8": "ru", "\u0d95\u0d82": "oon", "\u0d8b\u0dd1": "ue", "\u0db9\u0dd9\u0ddf": "au", "\u0da2\u0dd9\u0dcf": "jo", "\u0da0\u0dd9\u0ddf": "chau", "\u0dbd": "la", "\u0db8\u0dd9\u0ddf": "mau", "\u0db8\u0dd9\u0dcf": "mo", "\u0dc6\u0dd1": "fa", "\u0da1\u0dd4": "ju", "\u0dc3\u0dd9": "se", "\u0da9\u0dd0": "da", "\u0db0\u0ddb": "dhai", "\u0d8b\u0dd6": "u", "\u0da6": "cha", "\u0d8b\u0dd4": "u", "\u0d87\u0d82": "en", "\u0dbb\u0dd9\u0dcf": "ro", "\u0dba": "ya", "\u0d9b\u0dd4": "ku", "\u0dab\u0dd9\u0dcf": "no", "\u0da4\u0dd9\u0dcf": "gho", "\u0daa\u0dcf": "dha", "\u0dbb\u0dd6": "ru", "\u0dbb\u0dd4": "ru", "\u0d9b\u0dd6": "ku", "\u0dab\u0dd2": "ni", "\u0db6\u0dd9\u0ddf": "bau", "\u0d8d\u0dd0": "ru", "\u0da7\u0dd4": "tu", "\u0dc4\u0d82": "han", "\u0da9\u0dd1": "de", "\u0da0\u0ddb": "chai", "\u0dc5\u0dd9": "le", "\u0db0\u0dd3": "dhi", "\u0da9\u0dd9": "de", "\u0dc4\u0dd9\u0dcf": "ho", "\u0d9a\u0dd0": "ke", "\u0d9a\u0dd2": "ki", "\u0db3\u0dd6": "du", "\u0db3\u0dd9": "dhe", "\u0d9e": "n", "\u0d9f": "ga", "\u0da9\u0df2": "dru", "\u0dba\u0dd9\u0dcf": "yo", "\u0db4\u0dd0": "pe", "\u0dc2": "sha", "\u0dae\u0dcf": "tha", "\u0dad\u0d83": "th", "\u0d86\u0d82": "aan", "\u0db5": "pa", "\u0dc0\u0dd9\u0ddf": "chau", "\u0dba\u0dd6": "yu", "\u0dc6\u0dd9\u0dcf": "fo", "\u0db1\u0dd9\u0dcf": "no", "\u0d9c\u0d82": "gan", "\u0da7\u0dd1": "te", "\u0db3\u0dcf": "dha", "\u0d92": "e", "\u0dc4\u0d83": "ha", "\u0d96": "au", "\u0d9a\u0dd1": "ke", "\u0d8a": "e", 
"\u0dac\u0dcf": "da", "\u0dad\u0dd9": "the", "\u0dc2\u0d82": "shan", "\u0d9a\u0d83": "kan", "\u0da2\u0dd8": "jru", "\u0db3\u0dd9\u0dcf": "dho", "\u0da8\u0dd3": "ti", "\u0db4\u0d82": "pan", "\u0dc5\u0dd9\u0dcf": "lo", "\u0dbd\u0ddb": "lai", "\u0db0\u0dd1": "dee", "\u0d85\u0dd1": "ee", "\u0db0\u0dd9\u0dcf": "dho", "\u0db9\u0dd3": "bi", "\u0d9f\u0dd9": "ge", "\u0da6\u0dcf": "ja", "\u0db9": "ba", "\u0dbb\u0dd9": "re", "\u0da8\u0dd0": "te", "\u0dc3\u0dd9\u0dcf": "so", "\u0dc2\u0dd9\u0dcf": "sho", "\u0dc0\u0dd6": "wu", "\u0db8\u0d82": "man", "\u0dc6\u0dd6": "fu", "\u0da7\u0d82": "tan", "\u0d94\u0dd1": "we", "\u0d8d\u0dd3": "ri", "\u0da4\u0dd0": "ke", "\u0daf\u0dd9\u0ddf": "dau", "\u0da3\u0dcf": "ja", "\u0dc1": "sha", "\u0dc2\u0d83": "sha", "\u0d9e\u0dcf": "dha", "\u0da0\u0dd2": "chi", "\u0dad\u0dd4": "thu", "\u0db9\u0dd6": "bhu", "\u0dc0\u0dd4": "wu", "\u0da2\u0dd9": "je", "\u0dc0\u0dd9\u0dcf": "wo", "\u0d86\u0dcf": "aa", "\u0dc2\u0dd2": "shi", "\u0dae\u0dd2": "thi", "\u0da9": "da", "\u0db6\u0d82": "ban", "\u0dc3\u0dd1": "se", "\u0dad\u0dd9\u0dcf": "tho", "\u0db0\u0dd0": "dhe", "\u0dc6\u0dd3": "fee", "\u0dc5\u0dd0": "le", "\u0d89": "e", "\u0da0\u0dcf": "cha", "\u0dc6\u0dd9": "fe", "\u0daf\u0dd6": "du", "\u0d9b\u0dd3": "ki", "\u0dc4\u0dcf": "ha", "\u0d94\u0dd3": "o", "\u0dbb\u0dd2": "ri", "\u0dc4\u0ddb": "hai", "\u0d9b": "ka", "\u0db7": "bha", "\u0db7\u0dd9": "bhe", "\u0d91\u0dd3": "ee", "\u0db1\u0dd9": "ne", "\u0dc3": "sa", "\u0daf\u0dd2": "di", "\u0dab\u0dd9": "ne", "\u0c02": "n", "\u0db4": "pa", "\u0da0\u0dd1": "che", "\u0dc5\u0dcf": "la", "\u0db0\u0dd9\u0ddf": "dhau", "\u0db7\u0ddb": "bhi", "\u0dae\u0d83": "tha", "\u0d9f\u0dcf": "gha", "\u0db4\u0dd9": "pe", "\u0d9c\u0dd1": "ge", "\u0dc1\u0dd3": "shi", "\u0dc3\u0dd2": "si", "\u0db7\u0dd9\u0ddf": "bhau", "\u0d9e\u0dd3": "n", "\u0dbd\u0dd2": "li", "\u0d86": "a", "\u0dbb\u0df2": "ru", "\u0d85\u0dd9": "a", "\u0dad\u0dd1": "the", "\u0db6\u0ddb": "bai", "\u0dc4\u0dd9": "he", "\u0db6\u0dd9": "be", "\u0dba\u0ddb": "yai", 
"\u0dc0\u0ddb": "wai", "\u0db0\u0df2": "dru", "\u0dc4\u0dd3": "he", "\u0da5": "gha", "\u0dc2\u0dd9": "she", "\u0dc0\u0dd2": "vi", "\u0da2\u0dcf": "ja", "\u0da7\u0dd9": "te", "\u0dba\u0dd2": "i", "\u0da7": "ta", "\u0d9b\u0dcf": "ka", "\u0dac\u0dd4": "du", "\u0da2\u0dd2": "ji", "\u0dac": "da", "\u0dc2\u0dd1": "she", "\u0da1": "cha", "\u0db0\u0dd8": "dhru", "\u0da2\u0ddb": "jai", "\u0db8": "ma", "\u0dac\u0dd1": "de", "\u0dc2\u0ddb": "shai", "\u0da1\u0dd2": "chi", "\u0d9b\u0dd9": "ke", "\u0db3": "da", "\u0db6\u0dd9\u0dcf": "bo", "\u0dc4": "ha", "\u0da1\u0dd0": "je", "\u0da7\u0dd6": "tu", "\u0d9b\u0d82": "kan", "\u0d9c": "ga", "\u0db5\u0dd1": "pe", "\u0daa\u0dd2": "dhi", "\u0dc1\u0dd9\u0dcf": "sho", "\u03bf": "n", "\u0d89\u0dd0": "e", "\u0dc3\u0d83": "san", "\u0d9d": "gha", "\u0d9b\u0dd1": "ke", "\u0dc0\u0dd1": "we", "\u0d8a\u0dd9": "ee", "\u0db6\u0dd3": "bi", "\u0dc3\u0dd6": "su", "\u0da9\u0dd3": "di", "\u0dbb\u0dcf": "ra", "\u0dc1\u0d82": "shan", "\u0db7\u0dd9\u0dcf": "bho", "\u0daf\u0d82": "dan", "\u0da2\u0dd6": "ju", "\u0dba\u0dd8": "yur", "\u0d89\u0dd2": "e", "\u0dc0": "wa", "\u0db7\u0dcf": "bha", "\u0db7\u0dd2": "bhi", "\u0daa\u0dd4": "dhu", "\u0d94\u0dd8": "au", "\u0d9a\u0dd6": "ku", "\u0dbd\u0dd3": "lee", "\u0daf\u0dd9\u0dcf": "do", "\u0db6\u0df2": "bru", "\u0da7\u0dd2": "ti", "\u0dc4\u0dd6": "hu", "\u0db8\u0ddb": "mai", "\u0dc3\u0df2": "ru", "\u0db9\u0dd9": "bhe", "\u0d9c\u0dd9": "ge", "\u0dba\u0dd3": "yi", "\u0da8\u0dd2": "ti", "\u0db7\u0d82": "bhan", "\u0daf\u0dd4": "du", "\u0d9f\u0dd6": "ghu", "\u0da9\u0d82": "dan", "\u0da7\u0df2": "tru", "\u0dad\u0dd2": "thi", "\u0dad": "tha", "\u0dc4\u0dd9\u0ddf": "bhau", "\u0dc2\u0dd4": "shu", "\u0d89\u0dcf": "e", "\u0d9a": "ka", "\u0d85\u0dd2": "a", "\u0da7\u0ddb": "tai", "\u0dbd\u0dd1": "le", "\u0dac\u0dd9\u0dcf": "dho", "\u0d85\u0dd0": "e", "\u0d9a\u0dcf": "ka", "\u0db4\u0dcf": "pa", "\u0db1\u0dd1": "ne", "\u0db5\u0dd9\u0dcf": "po", "\u0da2\u0dd0": "je", "\u0da9\u0dd9\u0dcf": "do", "\u0dad\u0dd6": "thu", 
"\u0db3\u0dd2": "dhi", "\u0d92\u0d82": "en", "\u0d9d\u0dd9\u0dcf": "gho", "\u0d9c\u0dd8": "gru", "\u0da4\u0dd9": "ke", "\u0db7\u0dd1": "bhe", "\u0db5\u0dd3": "pi", "\u0dba\u0dcf": "ya", "\u0d9c\u0ddb": "gai", "\u0d9d\u0dd6": "ghu", "\u0db0\u0dcf": "dha", "\u0dbd\u0dd9": "le", "\u0d9d\u0dd8": "ru", "\u0dc6\u0dd8": "fru", "\u0dc0\u0dd8": "wru", "\u0da9\u0ddb": "dai", "\u0d8f": "pru", "\u0dac\u0dd6": "du", "\u0d85\u0dcf": "a", "\u0db7\u0dd3": "bhi", "\u0dc4\u0dd2": "hi", "\u0db6\u0dcf": "ba", "\u0dbb\u0dd3": "ri", "\u0d8d": "ru", "\u0dbb\u0dd0": "re", "\u0da1\u0dd3": "chi", "\u0da3\u0dd3": "jhi", "\u0da4": "gha", "\u0db1": "na", "\u0d91\u0dd2": "e", "\u0dc1\u0dd8": "shru", "\u0dab\u0dd0": "ne", "\u0d87\u0dd9": "e", "\u0dc1\u0dd1": "she", "\u0d87": "e", "\u0dc3\u0dd0": "se", "\u0dae\u0dd6": "thu", "\u0d9f\u0dd3": "gi", "\u0d95\u0dd8": "o", "\u0db5\u0dcf": "pa", "\u0db1\u0dd9\u0ddf": "nau", "\u0d8b\u0dd9": "u", "\u0dbd\u0dd9\u0ddf": "lau", "\u0dad\u0d82": "than", "\u0d9e\u0dd4": "du", "\u0dbb\u0ddb": "rai", "\u0da9\u0dd2": "di", "\u0d9c\u0dd4": "gu", "\u0da4\u0d82": "ghan", "\u0dc2\u0dcf": "sha", "\u0d85": "a", "\u0dc6\u0dd4": "fu", "\u0db0\u0dd9": "dhe", "\u0d9a\u0dd9\u0ddf": "kau", "\u0d85\u0d82": "an", "\u0dc5\u0dd6": "lu", "\u0dc5\u0dd8": "lu", "\u0da1\u0d82": "chan", "\u0db4\u0dd9\u0ddf": "pau", "\u0dc1\u0dd2": "shi", "\u0dc1\u0dd6": "shu", "\u0dad\u0dd8": "thru", "\u0da0\u0dd6": "chu", "\u0da0\u0d82": "chan", "\u0db0\u0d82": "dhan", "\u0dab\u0dcf": "na", "\u0db1\u0dd3": "ni", "\u0dac\u0dd0": "dhe", "\u0d9c\u0dcf": "ga", "\u0db3\u0dd3": "di", "\u0dc2\u0dd0": "she", "\u0d91": "e", "\u0d8b\u0dcf": "u", "\u0d89\u0dd9": "e", "\u0dbb\u0d82": "ran", "\u0dae\u0dd0": "the", "\u0db1\u0dd8": "nru", "\u0daa\u0dd9\u0dcf": "to", "\u0db5\u0d82": "pan", "\u0d93": "e", "\u0db6\u0dd4": "bu", "\u0da7\u0dd9\u0ddf": "tau", "\u0d9a\u0dd8": "kru", "\u0db8\u0d83": "man", "\u0dab\u0dd4": "nu", "\u0dc3\u0dd3": "si", "\u0db4\u0dd6": "pu", "\u0da8\u0dcf": "ta", "\u0da0\u0dd9": "che", 
"\u0db5\u0dd2": "phi", "\u0dc2\u0dd6": "shu", "\u0d9e\u0dd9\u0dcf": "do", "\u0db7\u0dd6": "bhu", "\u0dad\u0ddb": "thai", "\u0da2\u0d82": "jan", "\u0db9\u0dd0": "be", "\u0d94": "o", "\u0daf\u0df2": "dhru", "\u0da9\u0dd8": "dru", "\u0da4\u0dcf": "ghan", "\u0dbd\u0dd6": "lu", "\u0dc0\u0dd9": "we", "\u0d94\u0dcf": "o", "\u0d8b\u0d82": "un", "\u0db6": "ba", "\u0db1\u0dd2": "ni", "\u0d9d\u0dd3": "ghi", "\u0dbd\u0dcf": "la", "\u0db7\u0dd8": "bru", "\u0da3": "gha", "\u0dab\u0dd1": "ne", "\u0d88": "e", "\u0dc6\u0dd0": "fa", "\u0dc6": "fa", "\u0dad\u0dd0": "the", "\u0d92\u0dd2": "e", "\u0da7\u0dd0": "te", "\u0db1\u0dd4": "nu", "\u0dba\u0d82": "yan", "\u0db1\u0dd6": "nu", "\u0d9a\u0dd4": "ku", "\u0dba\u0dd1": "ye", "\u0db8\u0dd1": "me", "\u0dc5\u0d82": "lan", "\u0da0\u0dd9\u0dcf": "cho", "\u0da9\u0dd9\u0ddf": "dau", "\u0db4\u0ddb": "pai", "\u0da7\u0dcf": "ta", "\u0db3\u0dd4": "du", "\u0d9a\u0dd9\u0dcf": "ko", "\u0d9d\u0d82": "ghan", "\u0dba\u0dd9\u0ddf": "yau", "\u0d9e\u0dd2": "di", "\u0dc4\u0dd0": "he", "\u0db4\u0dd4": "pu", "\u0dc5\u0dd4": "lu", "\u0d9d\u0dd4": "gu", "\u0db1\u0dcf": "na", "\u0db4\u0df2": "pru", "\u0db3\u0dd1": "de", "\u0dc1\u0dd4": "shu", "\u0da3\u0dd9\u0dcf": "gha", "\u0da2\u0dd1": "je", "\u0da4\u0dd2": "di", "\u0da1\u0dcf": "cha", "\u0dc5\u0dd3": "li", "\u0da0\u0dd4": "chu", "\u0db0": "dha", "\u0d9f\u0d82": "ghan", "\u0db0\u0dd2": "dhi", "\u0d91\u0dcf": "e", "\u0d8b\u0dd3": "u", "\u0db9\u0dd2": "bhi", "\u0db8\u0dd3": "me", "\u0db5\u0dd0": "pe", "\u0d9d\u0dd2": "ghi", "\u0da4\u0dd6": "du", "\u0db0\u0dd6": "dhu", "\u0daf\u0dd8": "dru", "\u0dba\u0dd4": "yu", "\u0db9\u0dd4": "bu", "\u0dc1\u0dd9\u0ddf": "shau", "\u0db8\u0dd9": "me", "\u0da4\u0dd3": "di", "\u0daf": "da", "\u0d94\u0d82": "on", "\u0d94\u0dd2": "o", "\u0dc5\u0dd2": "li", "\u0da2\u0dd3": "ji", "\u0dc1\u0dcf": "sha", "\u0da8\u0dd9": "te", "\u0db4\u0dd8": "pru", "\u0d91\u0dd9": "e", "\u0dc1\u0d83": "shan", "\u0d90": "pau", "\u0db8\u0dd2": "mi", "\u0db0\u0dd4": "dhu", "\u0dc3\u0ddb": "sai", 
"\u0da2\u0dd9\u0ddf": "jau", "\u0db9\u0dd1": "be", "\u0d9b\u0dd2": "ki", "\u043e": "n", "\u0da3\u0d82": "jan", "\u0dab\u0dd3": "ni", "\u0dc4\u0dd4": "hu", "\u0d9c\u0dd6": "gu", "\u0da0": "cha", "\u0db5\u0dd9": "pe", "\u0dc0\u0df2": "wru", "\u0d94\u0dd9": "o", "\u0dae\u0d82": "than", "\u0db5\u0dd6": "pu", "\u0d8b\u0dd8": "u", "\u0db6\u0dd0": "be", "\u0dc3\u0dcf": "sa", "\u0db3\u0d82": "dan", "\u0da2\u0d83": "jah", "\u0da9\u0dd6": "du", "\u0dbb\u0dd1": "re", "\u0dc4\u0df2": "hru", "\u0d9a\u0dd3": "ki", "\u0da1\u0dd8": "chru", "\u0daf\u0dcf": "da", "\u0dc5\u0dd1": "le", "\u0da1\u0dd9\u0dcf": "cho", "\u0dad\u0df2": "thru", "\u0db4\u0dd9\u0dcf": "po", "\u0d9b\u0dd9\u0dcf": "ko", "\u0d9e\u0dd9": "de", "\u0db8\u0dd8": "mur", "\u0dc0\u0d83": "wah", "\u0d86\u0d83": "an", "\u0d9f\u0dd1": "ge", "\u0dc3\u0d82": "san", "\u0daf\u0dd0": "de", "\u0da8\u0dd4": "tu", "\u0dba\u0dd0": "ye", "\u0dbb\u0dd9\u0ddf": "rau", "\u0db9\u0dcf": "bha", "\u0db9\u0dd9\u0dcf": "bho", "\u0d9a\u0df2": "kru", "\u0dc0\u0d82": "wan", "\u0da0\u0dd0": "che", "\u0da2": "ja", "\u0dc4\u0dd1": "he", "\u0dc1\u0dd0": "sha", "\u0dc1\u0ddb": "shai", "\u0dae\u0dd8": "tru", "\u0dc0\u0dcf": "wa", "\u0dac\u0dd9": "de", "\u0db4\u0dd2": "pi", "\u0da0\u0d83": "chah", "\u0da3\u0dd2": "dhi", "\u0dad\u0dd3": "thi", "\u0db8\u0dd4": "mu", "\u0d94\u0dd4": "o", "\u0da2\u0dd4": "ju", "\u0d91\u0d82": "en", "\u0dac\u0dd3": "di", "\u0db8\u0dd0": "me", "\u0d92\u0dd9": "e", "\u0dc0\u0dd3": "we", "\u0dad\u0dcf": "tha", "\u0db1\u0ddb": "nai", "\u0db6\u0dd6": "bu", "\u0dae\u0dd4": "thu", "\u0dae\u0dd3": "thi", "\u0dc3\u0dd4": "su", "\u0dac\u0d82": "ghan", "\u0d89\u0d82": "en", "\u0da7\u0dca": "t", "\u0dc2\u0ddc": "sho", "\u0dab\u0dda": "no", "\u0dab\u0dca": "n", "\u0dbd\u0dde": "lau", "\u0da3\u0dca": "j", "\u0dc3\u0dde": "sau", "\u0dba\u0dde": "yau", "\u0d9a\u0dca": "k", "\u0db5\u0ddd": "po", "\u0db6\u0ddc": "bo", "\u0dbb\u0dde": "rau", "\u0dc1\u0ddd": "sho", "\u0dc5\u0ddd": "lo", "\u0d9b\u0dda": "ke", "\u0dc0\u0ddd": "wo", 
"\u0daa\u0dca": "d", "\u0db9\u0ddd": "bho", "\u0db3\u0ddc": "dho", "\u0db3\u0ddd": "dho", "\u0dc6\u0ddd": "fho", "\u0da2\u0dda": "je", "\u0dbb\u0dca": "r", "\u0dc0\u0ddc": "wo", "\u0da9\u0dca": "d", "\u0dba\u0ddc": "yo", "\u0db0\u0dda": "dhe", "\u0da0\u0dda": "che", "\u0da1\u0dda": "che", "\u0db7\u0ddc": "bho", "\u0dac\u0dda": "de", "\u0db7\u0dda": "bhe", "\u0da9\u0ddc": "do", "\u0dbb\u0ddc": "ro", "\u0dc1\u0dda": "she", "\u0d91\u0dca": "e", "\u0d9c\u0dde": "gau", "\u0da9\u0dde": "dau", "\u0da4\u0ddc": "gho", "\u0daf\u0dde": "dhou", "\u0db3\u0dca": "d", "\u0da3\u0ddd": "do", "\u0dac\u0dca": "d", "\u0db5\u0ddc": "po", "\u0dba\u0ddd": "yo", "\u0dc1\u0dca": "sh", "\u0db0\u0dca": "dh", "\u0dbb\u0dda": "re", "\u0d9f\u0dca": "g", "\u0dae\u0ddc": "tho", "\u0dae\u0dda": "the", "\u0da4\u0dca": "ghe", "\u0da8\u0dca": "t", "\u0dc4\u0dca": "h", "\u0d9d\u0ddc": "gho", "\u0da0\u0ddc": "cho", "\u0dc4\u0dda": "he", "\u0dab\u0ddc": "no", "\u0dc5\u0ddc": "lo", "\u0dc0\u0dda": "we", "\u0dad\u0ddd": "tho", "\u0da2\u0ddd": "jo", "\u0da9\u0ddd": "do", "\u0db9\u0dda": "be", "\u0dc5\u0dca": "l", "\u0dae\u0dca": "th", "\u0db1\u0dda": "ne", "\u0db8\u0ddc": "mo", "\u0d9d\u0dda": "ghe", "\u0dc3\u0dda": "se", "\u0db4\u0dca": "p", "\u0d87\u0dca": "e", "\u0d9a\u0dde": "kau", "\u0d9e\u0ddc": "do", "\u0db8\u0dda": "me", "\u0d9b\u0ddc": "ko", "\u0dc4\u0ddc": "ho", "\u0dba\u0dca": "y", "\u0db1\u0dca": "n", "\u0dc1\u0ddc": "sho", "\u0da4\u0ddd": "gho", "\u0dc3\u0dca": "s", "\u0da1\u0ddc": "cho", "\u0daf\u0dca": "d", "\u0db6\u0dca": "b", "\u0d9d\u0ddd": "gho", "\u0db5\u0dca": "e", "\u0d9a\u0ddc": "ko", "\u0dc0\u0dde": "wau", "\u0da8\u0ddc": "to", "\u0dae\u0ddd": "tho", "\u0da8\u0ddd": "to", "\u0dc3\u0ddd": "so", "\u0db6\u0ddd": "bo", "\u0d92\u0dca": "e", "\u0db7\u0dde": "bhau", "\u0db9\u0ddc": "bho", "\u0da7\u0dda": "te", "\u0da0\u0ddd": "cho", "\u0da9\u0dda": "de", "\u0da1\u0ddd": "cho", "\u0d9f\u0ddc": "go", "\u0da0\u0dca": "ch", "\u0db1\u0ddd": "no", "\u0da2\u0dca": "j", "\u0db0\u0ddc": "dho", 
"\u0db4\u0dda": "je", "\u0dbb\u0ddd": "ro", "\u0dbd\u0dca": "l", "\u0db1\u0ddc": "no", "\u0d94\u0dca": "o", "\u0dc6\u0dca": "f", "\u0dc2\u0dca": "sh", "\u0d89\u0dca": "e", "\u0dad\u0ddc": "tho", "\u0dad\u0dda": "the", "\u0dad\u0dca": "th", "\u0db3\u0dda": "dhe", "\u0dc3\u0ddc": "so", "\u0db8\u0dca": "m", "\u0daa\u0ddc": "to", "\u0d9f\u0ddd": "go", "\u0daa\u0ddd": "to", "\u0d9a\u0dda": "ke", "\u0db4\u0ddc": "po", "\u0dac\u0ddd": "do", "\u0da2\u0dde": "jau", "\u0dba\u0dda": "ye", "\u0dc6\u0ddc": "fo", "\u0db9\u0dde": "bhau", "\u0d9e\u0dca": "n", "\u0db4\u0dde": "pau", "\u0d8a\u0dca": "e", "\u0dc5\u0dda": "le", "\u0db9\u0dca": "b", "\u0db6\u0dde": "bau", "\u0db8\u0ddd": "mo", "\u0d9c\u0dca": "g", "\u0dc2\u0dda": "she", "\u0d9c\u0dda": "ge", "\u0db4\u0ddd": "po", "\u0da1\u0dca": "ch", "\u0d9b\u0dca": "k", "\u0dbd\u0ddd": "lo", "\u0d85\u0dda": "a", "\u0d9e\u0ddd": "do", "\u0d9c\u0ddc": "go", "\u0da8\u0dda": "dhe", "\u0d9e\u0dda": "n", "\u0dc1\u0dde": "shau", "\u0dc6\u0dda": "fe", "\u0db1\u0dde": "nau", "\u0da2\u0ddc": "jo", "\u0d9c\u0ddd": "go", "\u0db8\u0dde": "mau", "\u0da4\u0dda": "g", "\u0dbd\u0dda": "le", "\u0da7\u0ddc": "to", "\u0daf\u0ddd": "dho", "\u0d85\u0dca": "a", "\u0dab\u0ddd": "no", "\u0db7\u0dca": "b", "\u0dbd\u0ddc": "lo", "\u0daf\u0ddc": "do", "\u0dc4\u0dde": "bhau", "\u0da7\u0dde": "tau", "\u0d9d\u0dca": "g", "\u0d9b\u0ddd": "ko", "\u0db0\u0dde": "dhau", "\u0db7\u0ddd": "bho", "\u0db0\u0ddd": "dho", "\u0dc2\u0ddd": "sho", "\u0d9a\u0ddd": "ko", "\u0daf\u0dda": "de", "\u0dc4\u0ddd": "ho", "\u0db6\u0dda": "be", "\u0da7\u0ddd": "to", "\u0da0\u0dde": "chau", "\u0d9f\u0dda": "ge", "\u0dc0\u0dca": "w"} \ No newline at end of file diff --git a/data/vocab_map.json b/data/vocab_map.json new file mode 100644 index 0000000..7410d4f --- /dev/null +++ b/data/vocab_map.json @@ -0,0 +1,2276 @@ +{ + "»": 0, + "መ": 1, + "💻": 2, + "ˈ": 3, + "🙊": 4, + "D": 5, + "ծ": 6, + "නේ": 7, + "😐": 8, + "ධෑ": 9, + "연": 10, + "አ": 11, + "යඃ": 12, + "ู": 13, + "ඔ්": 14, + "ථ්": 15, + 
"ඇ": 16, + "専": 17, + "ஆ": 18, + "ʒ": 19, + "ඝී": 20, + "پ": 21, + "శ": 22, + "Ã": 23, + "📚": 24, + "부": 25, + "ሊ": 26, + "배": 27, + "බි": 28, + "?": 29, + "³": 30, + "官": 31, + "ඨ්": 32, + "🐼": 33, + "ج": 34, + "ඳේ": 35, + "ඟෑ": 36, + "∶": 37, + "ඩි": 38, + "නී": 39, + "≡": 40, + "දු": 41, + "මි": 42, + "නෝ": 43, + "ව්": 44, + "↑": 45, + "ए": 46, + "स": 47, + "බෞ": 48, + "ℜ": 49, + "出": 50, + "ජ්": 51, + "😖": 52, + "🌻": 53, + "ගි": 54, + "ด": 55, + "ሰ": 56, + "එැ": 57, + "ង": 58, + "ලු": 59, + "වූ": 60, + "ʰ": 61, + "⇪": 62, + "ő": 63, + "ඤු": 64, + "N": 65, + "⇝": 66, + "ෂී": 67, + "දං": 68, + "저": 69, + "ඝි": 70, + "۷": 71, + "ජැ": 72, + "ੱ": 73, + "^": 74, + "ነ": 75, + "ã": 76, + "චෙ": 77, + "♡": 78, + "の": 79, + "Ş": 80, + "සේ": 81, + "ਾ": 82, + "௨": 83, + "📅": 84, + "영": 85, + "ክ": 86, + "ආෙ": 87, + "ඍැ": 88, + "ඬෑ": 89, + "දෙ": 90, + "በ": 91, + "語": 92, + "̴": 93, + "🇸": 94, + "ฉ": 95, + "පෘ": 96, + "😳": 97, + "ශ": 98, + "➚": 99, + "ක": 100, + "ब": 101, + "☹": 102, + "නෑ": 103, + "භේ": 104, + "北": 105, + "有": 106, + "ك": 107, + "吏": 108, + "බූ": 109, + "👇": 110, + "ෂො": 111, + "😠": 112, + "ع": 113, + "י": 114, + "හෲ": 115, + "ḍ": 116, + "ඞේ": 117, + "紀": 118, + "豪": 119, + "නෛ": 120, + "Ε": 121, + "හෞ": 122, + "╗": 123, + "යෛ": 124, + "සෞ": 125, + "这": 126, + "ති": 127, + "ㄸ": 128, + "즈": 129, + "♐": 130, + "කං": 131, + "ළෑ": 132, + "̯": 133, + "ඔැ": 134, + "通": 135, + "ක්": 136, + "ඝු": 137, + "ඞි": 138, + "局": 139, + "ඬේ": 140, + "❉": 141, + "신": 142, + "ඛු": 143, + "ඞී": 144, + "₹": 145, + "・": 146, + "§": 147, + "": 148, + "−": 149, + "α": 150, + "☀": 151, + "➨": 152, + "ඡු": 153, + "♬": 154, + "ﻲ": 155, + "}": 156, + "ඵෛ": 157, + "පූ": 158, + "·": 159, + "චු": 160, + "④": 161, + "වේ": 162, + "Æ": 163, + "ਹ": 164, + "Š": 165, + "医": 166, + "็": 167, + "නැ": 168, + "ණ": 169, + "려": 170, + "钱": 171, + "{": 172, + "呼": 173, + "": 174, + "p": 175, + "­": 176, + "ඝෘ": 177, + "ණැ": 178, + "鬲": 179, + "’": 180, + "ው": 181, + "ς": 182, + "х": 183, + "ඉෙ": 184, 
+ "🏽": 185, + "ʊ": 186, + "ඩු": 187, + "🌹": 188, + "⋆": 189, + "丂": 190, + "్": 191, + "โ": 192, + "๑": 193, + "樂": 194, + "ካ": 195, + "😱": 196, + "දූ": 197, + "满": 198, + "ධු": 199, + "⏰": 200, + "ආ්": 201, + "ටු": 202, + "💡": 203, + "ලඃ": 204, + "ぎ": 205, + "Г": 206, + "병": 207, + "🤣": 208, + "ど": 209, + "Ś": 210, + "就": 211, + "ร": 212, + "笑": 213, + "體": 214, + "ශඃ": 215, + "¨": 216, + "ン": 217, + "印": 218, + "": 219, + "උැ": 220, + "➠": 221, + "හෛ": 222, + "ගෘ": 223, + "杯": 224, + "බෛ": 225, + "٦": 226, + "🎸": 227, + "ா": 228, + "ô": 229, + "ḱ": 230, + "යං": 231, + "වෙ": 232, + "据": 233, + "කේ": 234, + "ඨි": 235, + "ඛි": 236, + "も": 237, + "ມ": 238, + "ょ": 239, + "ო": 240, + "ங": 241, + "ඳෑ": 242, + "野": 243, + "ඩේ": 244, + "ಕ": 245, + "ɐ": 246, + "호": 247, + "국": 248, + "ඹා": 249, + "ගී": 250, + "Œ": 251, + "株": 252, + "사": 253, + "w": 254, + "පං": 255, + "ණූ": 256, + "වු": 257, + "院": 258, + "ඹ්": 259, + "ළා": 260, + "o": 261, + "සු": 262, + "චේ": 263, + "ყ": 264, + "☜": 265, + "↳": 266, + "🍕": 267, + "у": 268, + "හී": 269, + "な": 270, + "õ": 271, + "ና": 272, + "අා": 273, + "ඐ": 274, + "අු": 275, + "O": 276, + "ඤි": 277, + "ගෛ": 278, + "උෑ": 279, + "진": 280, + "එා": 281, + "": 282, + "。": 283, + "පේ": 284, + "ඬැ": 285, + "ෂෞ": 286, + "व": 287, + "ో": 288, + "කෑ": 289, + "َ": 290, + "Ⅲ": 291, + "ජෑ": 292, + "චෞ": 293, + "ባ": 294, + "🙈": 295, + "උෘ": 296, + "☻": 297, + "නු": 298, + "(": 299, + "琮": 300, + "˙": 301, + "日": 302, + "පා": 303, + "…": 304, + "භ": 305, + "": 306, + "ï": 307, + "Λ": 308, + "ඒි": 309, + "•": 310, + "開": 311, + "い": 312, + "ል": 313, + "ඇ්": 314, + "": 315, + "당": 316, + "/": 317, + "ल": 318, + "할": 319, + "□": 320, + "": 321, + "서": 322, + "س": 323, + "ඬි": 324, + "🍶": 325, + "ඔෘ": 326, + "ඳෝ": 327, + "]": 328, + "සං": 329, + "Ú": 330, + "ស": 331, + "Κ": 332, + "": 333, + "යෝ": 334, + "🔴": 335, + "ት": 336, + "▪": 337, + "ලෘ": 338, + "ɪ": 339, + "魏": 340, + "රෑ": 341, + "🔥": 342, + "ඩෑ": 343, + "⦁": 344, + "ළෝ": 345, + "事": 346, + 
"한": 347, + "Ф": 348, + "体": 349, + "": 350, + "✺": 351, + "පී": 352, + "ඝ්": 353, + "ඬෝ": 354, + "ටෘ": 355, + "芸": 356, + "ඨෝ": 357, + "යෑ": 358, + "ඤී": 359, + "Ø": 360, + "⚑": 361, + "මී": 362, + "ª": 363, + "В": 364, + "የ": 365, + "ලෙ": 366, + "📱": 367, + "学": 368, + "ඪෘ": 369, + "ិ": 370, + "克": 371, + "ඣ්": 372, + "ī": 373, + "ṃ": 374, + "🏦": 375, + "주": 376, + "ŗ": 377, + "个": 378, + "_": 379, + "Ӂ": 380, + "葉": 381, + "ビ": 382, + "": 383, + "ගේ": 384, + "නෞ": 385, + "軍": 386, + "ْ": 387, + "₂": 388, + "ගෲ": 389, + "💧": 390, + "셔": 391, + "歌": 392, + "u": 393, + "ටා": 394, + "": 395, + "ඟූ": 396, + "ෂැ": 397, + "ඳෙ": 398, + "": 399, + "ජෝ": 400, + "නඃ": 401, + "ゅ": 402, + "❖": 403, + "ඛෙ": 404, + "ɡ": 405, + "#": 406, + "თ": 407, + "Ž": 408, + "察": 409, + "ਮ": 410, + "公": 411, + "र": 412, + "ɨ": 413, + "ණා": 414, + "ɕ": 415, + "ʌ": 416, + "☢": 417, + "ෆෛ": 418, + "": 419, + "ெ": 420, + "ෆෲ": 421, + "ඊ්": 422, + "අෙ": 423, + "ṭ": 424, + "—": 425, + "🌍": 426, + "ම්": 427, + "ण": 428, + "鑑": 429, + "잘": 430, + "ඝං": 431, + "I": 432, + "ජෘ": 433, + "ය": 434, + "Ñ": 435, + "Ʌ": 436, + "ă": 437, + "ඹෑ": 438, + "ㅂ": 439, + "ඔෑ": 440, + "යූ": 441, + "料": 442, + "ඔූ": 443, + "చ": 444, + "◙": 445, + "ථෝ": 446, + "ජු": 447, + "์": 448, + "ň": 449, + "ไ": 450, + "使": 451, + "지": 452, + "බ්": 453, + "😕": 454, + "Т": 455, + "භ්": 456, + "ලැ": 457, + "ه": 458, + "ታ": 459, + "පෛ": 460, + "▌": 461, + "٩": 462, + "η": 463, + "格": 464, + "ශි": 465, + "記": 466, + "ටෑ": 467, + "8": 468, + "ෆ": 469, + "දෘ": 470, + "ඇු": 471, + "ඡී": 472, + "ģ": 473, + " ": 474, + "◘": 475, + "◣": 476, + "ලේ": 477, + "ඤඃ": 478, + " ": 479, + "Ħ": 480, + "බ": 481, + "ඤේ": 482, + "ඩෞ": 483, + "デ": 484, + "‟": 485, + "ළ්": 486, + "ी": 487, + "ඩ": 488, + "z": 489, + "ඵැ": 490, + "පි": 491, + "治": 492, + "ෂේ": 493, + "조": 494, + "ෳ": 495, + "ගං": 496, + "ć": 497, + "චඃ": 498, + "Å": 499, + "送": 500, + "ත": 501, + "‚": 502, + "තෘ": 503, + "😒": 504, + "و": 505, + "讀": 506, + "字": 507, + "テ": 508, + 
"¼": 509, + "උං": 510, + "ح": 511, + "Þ": 512, + "詰": 513, + "ద": 514, + "📑": 515, + "෦": 516, + "ሌ": 517, + "π": 518, + "චැ": 519, + "ඳ්": 520, + "ඤො": 521, + "ඓ": 522, + "두": 523, + "フ": 524, + "💀": 525, + "ሁ": 526, + "ෆැ": 527, + "生": 528, + "라": 529, + "門": 530, + "ñ": 531, + "ੀ": 532, + "ඵ": 533, + "ม": 534, + "ෆේ": 535, + "": 536, + "🌸": 537, + "🌷": 538, + "වා": 539, + "н": 540, + "රේ": 541, + "ෆෑ": 542, + "💁": 543, + "J": 544, + "ਿ": 545, + "සෘ": 546, + "ඬො": 547, + "ගැ": 548, + "ෆ්": 549, + "ච්": 550, + "ඳී": 551, + "ච": 552, + "ඦ": 553, + "⁣": 554, + "त": 555, + "ඡි": 556, + "ඟ්": 557, + "と": 558, + "장": 559, + "‍": 560, + "ඡෙ": 561, + "න්": 562, + "ඤං": 563, + "親": 564, + "Û": 565, + "ථො": 566, + "ෟ": 567, + "ł": 568, + "‬": 569, + "〒": 570, + "අ": 571, + "ළී": 572, + "社": 573, + "ඟේ": 574, + "▻": 575, + "♥": 576, + "し": 577, + "ム": 578, + "📞": 579, + "っ": 580, + "º": 581, + "": 582, + "ඛං": 583, + "i": 584, + "ඨෘ": 585, + "m": 586, + "තූ": 587, + "乃": 588, + "ส": 589, + "ቅ": 590, + "උෙ": 591, + "在": 592, + "ા": 593, + "‪": 594, + "🐱": 595, + "ඩෲ": 596, + "රැ": 597, + "ටෛ": 598, + "博": 599, + "️": 600, + "ហ": 601, + "වෞ": 602, + "ρ": 603, + "ළ": 604, + "ටෙ": 605, + "😍": 606, + "入": 607, + "†": 608, + "∘": 609, + "보": 610, + "ೀ": 611, + "ඵ්": 612, + "😓": 613, + "文": 614, + "投": 615, + "": 616, + "ඵී": 617, + "資": 618, + "භි": 619, + "↕": 620, + "A": 621, + "⚓": 622, + "න": 623, + "ද": 624, + "ඹ": 625, + "ㅉ": 626, + "ዳ": 627, + "ඍ": 628, + "่": 629, + "\u0003": 630, + "ù": 631, + "ලං": 632, + "😏": 633, + "ు": 634, + "➡": 635, + "۞": 636, + "א": 637, + "兀": 638, + "ඨං": 639, + "ਨ": 640, + "й": 641, + "เ": 642, + "ඝූ": 643, + "決": 644, + "晉": 645, + "🚀": 646, + "": 647, + "කෝ": 648, + "까": 649, + "ி": 650, + "රූ": 651, + "ಳ": 652, + "ඒ්": 653, + "ඞෝ": 654, + "ථු": 655, + "つ": 656, + "販": 657, + "ළූ": 658, + "ශා": 659, + "ඬා": 660, + "옹": 661, + "ð": 662, + "ூ": 663, + "ગ": 664, + "诶": 665, + "💕": 666, + "場": 667, + "ථඃ": 668, + "ی": 669, + "👨": 670, + 
"": 671, + "👌": 672, + "ඹැ": 673, + "ජ": 674, + ",": 675, + "F": 676, + "Ç": 677, + "ó": 678, + "": 679, + "Ô": 680, + "😬": 681, + "කු": 682, + "б": 683, + "ඹේ": 684, + "ν": 685, + "¾": 686, + "🙁": 687, + "ග්": 688, + "ප්": 689, + "Ā": 690, + "閉": 691, + "ඉු": 692, + "డ": 693, + "🏆": 694, + "සූ": 695, + "ඊ": 696, + "비": 697, + "∙": 698, + "ඣං": 699, + "ọ": 700, + "ṉ": 701, + "す": 702, + "රෘ": 703, + "札": 704, + "දෛ": 705, + "🇰": 706, + "※": 707, + "පෝ": 708, + "繁": 709, + "K": 710, + "☸": 711, + "ඩී": 712, + "ජෞ": 713, + "ー": 714, + "河": 715, + "\\": 716, + "ב": 717, + "මෑ": 718, + "ටො": 719, + "ǐ": 720, + "ඞො": 721, + "↯": 722, + "ష": 723, + "ஞ": 724, + "⇔": 725, + "ඩූ": 726, + "ඟො": 727, + "✿": 728, + "": 729, + "м": 730, + "❶": 731, + "ඳැ": 732, + "ښ": 733, + "✉": 734, + "ඕෘ": 735, + "භී": 736, + "ඔි": 737, + "哪": 738, + "ග": 739, + "🏻": 740, + "ε": 741, + "ழ": 742, + "符": 743, + "": 744, + "නා": 745, + "ǣ": 746, + "ණෝ": 747, + "ロ": 748, + "ಜ": 749, + "": 750, + "À": 751, + "와": 752, + "ඕ": 753, + "♣": 754, + "බෘ": 755, + "🙃": 756, + "ෂං": 757, + "ඵෝ": 758, + "に": 759, + "9": 760, + "漢": 761, + "කෘ": 762, + "📖": 763, + "ɳ": 764, + "😭": 765, + "ɛ": 766, + "Ξ": 767, + "ධං": 768, + "😢": 769, + "😯": 770, + "දො": 771, + "ச": 772, + "සි": 773, + "ඡ": 774, + "ඇැ": 775, + "ඊෙ": 776, + "🌲": 777, + "ラ": 778, + "‡": 779, + "ண": 780, + "«": 781, + "М": 782, + "ች": 783, + "ශැ": 784, + "ɖ": 785, + "හා": 786, + "贝": 787, + "ගූ": 788, + "😞": 789, + "6": 790, + "ლ": 791, + "İ": 792, + "ㅆ": 793, + "ُ": 794, + "🔷": 795, + "ණෑ": 796, + "😗": 797, + "ત": 798, + "⇨": 799, + "ಇ": 800, + "✍": 801, + "ට": 802, + "曹": 803, + " ": 804, + "ධො": 805, + "කෙ": 806, + "ෆෘ": 807, + "ඞ": 808, + "": 809, + ")": 810, + "舊": 811, + "@": 812, + "ඪූ": 813, + "수": 814, + "🔘": 815, + "ದ": 816, + "ළේ": 817, + "ś": 818, + "간": 819, + "豆": 820, + "ፍ": 821, + "": 822, + "θ": 823, + "සො": 824, + "め": 825, + "ඩා": 826, + "ː": 827, + "く": 828, + "k": 829, + "අ්": 830, + "තං": 831, + "◼": 832, + "”": 
833, + "සෙ": 834, + "節": 835, + "ඪු": 836, + "담": 837, + "±": 838, + "🕒": 839, + "房": 840, + "ගො": 841, + "ඔී": 842, + "௧": 843, + "": 844, + "😅": 845, + "協": 846, + "к": 847, + "ධී": 848, + "ණේ": 849, + "ट": 850, + "භෘ": 851, + "麗": 852, + "ලා": 853, + "⋅": 854, + "ඳු": 855, + "දැ": 856, + "₰": 857, + "යෙ": 858, + "خ": 859, + "😂": 860, + "ŭ": 861, + "경": 862, + "ټ": 863, + "ජී": 864, + "育": 865, + "ﺎ": 866, + "ẻ": 867, + "ඵෘ": 868, + "යෘ": 869, + "🛌": 870, + "ậ": 871, + "平": 872, + "ீ": 873, + "Е": 874, + "ල": 875, + "経": 876, + "ш": 877, + "g": 878, + "Ŵ": 879, + "😘": 880, + "ਗ": 881, + "향": 882, + "県": 883, + "ㅇ": 884, + "울": 885, + "안": 886, + "උු": 887, + "ඡ්": 888, + ":": 889, + "♀": 890, + "ළො": 891, + "園": 892, + "´": 893, + "甫": 894, + "ඛෝ": 895, + "ඳ": 896, + "인": 897, + "තෲ": 898, + "ඬු": 899, + "💓": 900, + "\u0014": 901, + "ඹි": 902, + "鄕": 903, + "ඩෘ": 904, + "高": 905, + "̈": 906, + "て": 907, + "ඤ්": 908, + "ර": 909, + "部": 910, + "ʼ": 911, + "😎": 912, + "ඝෝ": 913, + "~": 914, + "ඹෙ": 915, + "👍": 916, + "✅": 917, + "ඤැ": 918, + "¹": 919, + "Ƹ": 920, + "ඹෞ": 921, + "圓": 922, + "μ": 923, + "ඵි": 924, + "Đ": 925, + "💗": 926, + "च": 927, + "ඡෘ": 928, + "¢": 929, + "ö": 930, + "▃": 931, + "🐌": 932, + "西": 933, + "─": 934, + "ட": 935, + "ජේ": 936, + "": 937, + "👎": 938, + "‏": 939, + "ㅈ": 940, + "≤": 941, + " ": 942, + "商": 943, + "∕": 944, + "╔": 945, + "ඇං": 946, + "릴": 947, + "℘": 948, + "ණු": 949, + "0": 950, + "මෝ": 951, + "🌺": 952, + "වො": 953, + "✓": 954, + "ይ": 955, + "寶": 956, + "කෛ": 957, + "知": 958, + "警": 959, + "": 960, + "்": 961, + "시": 962, + "&": 963, + "භෞ": 964, + "ගෙ": 965, + "ה": 966, + "💞": 967, + "运": 968, + "": 969, + "සා": 970, + " ": 971, + "✊": 972, + "♜": 973, + "】": 974, + "ඤ": 975, + "✪": 976, + "प": 977, + "ඛා": 978, + "‫": 979, + "ந": 980, + "ን": 981, + "": 982, + "ँ": 983, + "භෝ": 984, + "Â": 985, + "ඉ": 986, + "උ්": 987, + "т": 988, + "වි": 989, + "තෝ": 990, + "f": 991, + "ก": 992, + "ط": 993, + "٠": 994, + "ඛඃ": 995, + 
"舞": 996, + "ㅣ": 997, + "÷": 998, + "ඈ": 999, + "ඞූ": 1000, + "定": 1001, + "ධෙ": 1002, + "ෆෙ": 1003, + "යැ": 1004, + "ඹෝ": 1005, + "年": 1006, + "නි": 1007, + "ı": 1008, + "අං": 1009, + "": 1010, + "ย": 1011, + "à": 1012, + "З": 1013, + "ि": 1014, + "G": 1015, + "ඨැ": 1016, + "店": 1017, + "🔆": 1018, + "ඞු": 1019, + "♭": 1020, + "අඃ": 1021, + "ටේ": 1022, + "ෆූ": 1023, + ";": 1024, + "พ": 1025, + "ළි": 1026, + "ʟ": 1027, + "う": 1028, + "ა": 1029, + "天": 1030, + "உ": 1031, + "英": 1032, + "ò": 1033, + "හැ": 1034, + "": 1035, + "📝": 1036, + "රෛ": 1037, + "වෑ": 1038, + "ජං": 1039, + "ඔ": 1040, + "🚈": 1041, + "ㅃ": 1042, + "λ": 1043, + "🌞": 1044, + "✚": 1045, + "Q": 1046, + "හේ": 1047, + "ඏ": 1048, + "\"": 1049, + "ඔං": 1050, + "₦": 1051, + "ˌ": 1052, + "¥": 1053, + "网": 1054, + "☞": 1055, + "່": 1056, + "ඝ": 1057, + "▒": 1058, + "陽": 1059, + "ඈං": 1060, + "蘭": 1061, + "ು": 1062, + "⁄": 1063, + "නෘ": 1064, + "ඡේ": 1065, + "ㄲ": 1066, + "ඬී": 1067, + "චෑ": 1068, + "ඟෝ": 1069, + "პ": 1070, + ".": 1071, + "රෲ": 1072, + "ධා": 1073, + "ළැ": 1074, + "í": 1075, + "මූ": 1076, + "ථි": 1077, + "ό": 1078, + "동": 1079, + "ά": 1080, + "用": 1081, + "€": 1082, + "V": 1083, + "鼎": 1084, + "💖": 1085, + "女": 1086, + "ा": 1087, + "ප": 1088, + "ஜ": 1089, + "ශූ": 1090, + "ඪි": 1091, + "κ": 1092, + "℉": 1093, + "þ": 1094, + "ቁ": 1095, + "එේ": 1096, + "ර්": 1097, + "팔": 1098, + "ੈ": 1099, + "Y": 1100, + "🏠": 1101, + "க": 1102, + "ජි": 1103, + "▲": 1104, + "ụ": 1105, + "ṁ": 1106, + "名": 1107, + "۶": 1108, + "ඪී": 1109, + "、": 1110, + "任": 1111, + "චෝ": 1112, + "ඡැ": 1113, + "ධෘ": 1114, + "ධෲ": 1115, + "족": 1116, + "甦": 1117, + "💹": 1118, + "ඤා": 1119, + "研": 1120, + "υ": 1121, + "සී": 1122, + "ֻ": 1123, + "►": 1124, + "+": 1125, + "ෆී": 1126, + "‘": 1127, + "😁": 1128, + "ඣු": 1129, + "ล": 1130, + "": 1131, + "е": 1132, + "ය්": 1133, + "מ": 1134, + "唐": 1135, + "ष": 1136, + "ِ": 1137, + "Ò": 1138, + "හෝ": 1139, + "😩": 1140, + "放": 1141, + "ु": 1142, + "👏": 1143, + "🙂": 1144, + "ℯ": 1145, + "-": 
1146, + "整": 1147, + "එි": 1148, + "雨": 1149, + "ب": 1150, + "ʃ": 1151, + "Д": 1152, + "サ": 1153, + "ශෞ": 1154, + "ለ": 1155, + "∎": 1156, + "ι": 1157, + "블": 1158, + "ඝො": 1159, + "🚇": 1160, + "Ö": 1161, + "ț": 1162, + "7": 1163, + "කැ": 1164, + "ගෞ": 1165, + "ඟැ": 1166, + "ス": 1167, + "පඃ": 1168, + "만": 1169, + "📷": 1170, + "食": 1171, + "හං": 1172, + "ඓං": 1173, + "γ": 1174, + "🔵": 1175, + "ệ": 1176, + "ே": 1177, + "තෛ": 1178, + "伎": 1179, + "වඃ": 1180, + "ද්": 1181, + "ુ": 1182, + "➦": 1183, + "": 1184, + "සෑ": 1185, + "ඬ්": 1186, + "А": 1187, + "තඃ": 1188, + "ණී": 1189, + "😦": 1190, + "ஸ": 1191, + "¡": 1192, + "ʂ": 1193, + "ඣී": 1194, + "මේ": 1195, + "ලො": 1196, + "ෂි": 1197, + "ያ": 1198, + "工": 1199, + "ע": 1200, + "빈": 1201, + "▆": 1202, + "ගු": 1203, + "µ": 1204, + "á": 1205, + "ඒ": 1206, + "ණො": 1207, + "": 1208, + "": 1209, + "た": 1210, + "▅": 1211, + "ப": 1212, + "도": 1213, + "ටී": 1214, + "a": 1215, + "e": 1216, + "ඹූ": 1217, + "а": 1218, + ",": 1219, + "✈": 1220, + "შ": 1221, + "ω": 1222, + "[": 1223, + "щ": 1224, + "ê": 1225, + "â": 1226, + "☚": 1227, + "\u0002": 1228, + "⌖": 1229, + "순": 1230, + "🍂": 1231, + "ඊැ": 1232, + "℃": 1233, + "කූ": 1234, + "ඌ": 1235, + "තෞ": 1236, + "ඛෘ": 1237, + "ң": 1238, + "戌": 1239, + "อ": 1240, + "😨": 1241, + "ֹ": 1242, + "ඩ්": 1243, + "า": 1244, + "☆": 1245, + "ඪ්": 1246, + "😡": 1247, + "千": 1248, + "ඡෛ": 1249, + "田": 1250, + "යී": 1251, + "대": 1252, + "결": 1253, + "💎": 1254, + "ඦා": 1255, + "හ": 1256, + "😄": 1257, + "චි": 1258, + "෴": 1259, + "දෑ": 1260, + "<": 1261, + "ණෙ": 1262, + "බං": 1263, + "ん": 1264, + "": 1265, + "හෑ": 1266, + "": 1267, + "🔑": 1268, + "ලි": 1269, + "👈": 1270, + "බා": 1271, + "": 1272, + "වෘ": 1273, + "තැ": 1274, + "චො": 1275, + "哈": 1276, + "ɒ": 1277, + "": 1278, + "口": 1279, + "¦": 1280, + "කා": 1281, + "鈴": 1282, + "ứ": 1283, + "反": 1284, + "ළං": 1285, + "ㄱ": 1286, + "බී": 1287, + "උූ": 1288, + "ŋ": 1289, + "者": 1290, + "'": 1291, + "T": 1292, + "【": 1293, + "ሸ": 1294, + "⚜": 1295, + 
"≈": 1296, + "ጣ": 1297, + "య": 1298, + "💩": 1299, + "🎤": 1300, + "É": 1301, + "ළු": 1302, + "👆": 1303, + "☼": 1304, + "δ": 1305, + "ආඃ": 1306, + "씨": 1307, + "💥": 1308, + "г": 1309, + "Б": 1310, + "ඩො": 1311, + "ุ": 1312, + "И": 1313, + "๏": 1314, + "😝": 1315, + "ඵා": 1316, + "ジ": 1317, + "නං": 1318, + "생": 1319, + "": 1320, + "ඣෝ": 1321, + "ඉි": 1322, + "ō": 1323, + "ෂෙ": 1324, + "වං": 1325, + "ज": 1326, + "": 1327, + "කෞ": 1328, + "市": 1329, + "ü": 1330, + "ל": 1331, + "භූ": 1332, + "ů": 1333, + "ව": 1334, + "P": 1335, + "🇱": 1336, + "🕛": 1337, + "か": 1338, + "Z": 1339, + "ل": 1340, + "ტ": 1341, + "▼": 1342, + "⛅": 1343, + "☝": 1344, + "ē": 1345, + "ి": 1346, + "営": 1347, + "ශෙ": 1348, + "W": 1349, + "": 1350, + "සඃ": 1351, + "⛺": 1352, + "": 1353, + "ෂු": 1354, + "ž": 1355, + "තා": 1356, + "Ê": 1357, + "👹": 1358, + "උී": 1359, + "合": 1360, + "ඨේ": 1361, + "ಧ": 1362, + "ᄊ": 1363, + "♂": 1364, + "ඖ": 1365, + " ": 1366, + "මෛ": 1367, + "😃": 1368, + "📲": 1369, + "古": 1370, + "ж": 1371, + "j": 1372, + "ශු": 1373, + "➲": 1374, + "с": 1375, + "ā": 1376, + "3": 1377, + "お": 1378, + "ඹු": 1379, + "ნ": 1380, + "証": 1381, + "🤔": 1382, + "고": 1383, + "එං": 1384, + "ř": 1385, + "∏": 1386, + "ථූ": 1387, + "◀": 1388, + "Õ": 1389, + "ඇෙ": 1390, + "艾": 1391, + "Р": 1392, + "ෆො": 1393, + "ගා": 1394, + "c": 1395, + "🔫": 1396, + "ඨෙ": 1397, + "": 1398, + "ජා": 1399, + "의": 1400, + "書": 1401, + "ت": 1402, + "๐": 1403, + "🌴": 1404, + "క": 1405, + "श": 1406, + "බෲ": 1407, + "犬": 1408, + " ": 1409, + "රු": 1410, + "": 1411, + "ʔ": 1412, + "🙄": 1413, + "왕": 1414, + "Л": 1415, + "Ο": 1416, + "未": 1417, + "♨": 1418, + "ز": 1419, + "හු": 1420, + "ඟා": 1421, + "성": 1422, + "क": 1423, + "තො": 1424, + "ලෛ": 1425, + "ढ": 1426, + "ແ": 1427, + "〜": 1428, + "ජෙ": 1429, + "–": 1430, + "🌟": 1431, + "": 1432, + "🏳": 1433, + "ඍී": 1434, + "オ": 1435, + "ا": 1436, + "නූ": 1437, + "Α": 1438, + "චූ": 1439, + "😮": 1440, + "ඵෑ": 1441, + "🏃": 1442, + "ඛො": 1443, + "人": 1444, + "ෂා": 1445, + "狄": 
1446, + "රෙ": 1447, + "ṅ": 1448, + "අි": 1449, + "ඤෝ": 1450, + "ਡ": 1451, + "ನ": 1452, + "及": 1453, + "ඩං": 1454, + "р": 1455, + "⇻": 1456, + "🌼": 1457, + "ٌ": 1458, + "ர": 1459, + "τ": 1460, + "ǚ": 1461, + "ඒෙ": 1462, + "活": 1463, + "මඃ": 1464, + "ට්": 1465, + "n": 1466, + "行": 1467, + "私": 1468, + "යෞ": 1469, + "ல": 1470, + "X": 1471, + "උා": 1472, + "ඪෝ": 1473, + "🚴": 1474, + "ਬ": 1475, + "ደ": 1476, + "ඵො": 1477, + "ו": 1478, + "▶": 1479, + "ய": 1480, + "❤": 1481, + "අෘ": 1482, + "′": 1483, + "★": 1484, + "Δ": 1485, + "î": 1486, + "ር": 1487, + "나": 1488, + "۩": 1489, + "ඎ": 1490, + "ඨූ": 1491, + "主": 1492, + "๬": 1493, + "": 1494, + "攻": 1495, + "숙": 1496, + "": 1497, + "小": 1498, + "곰": 1499, + "ن": 1500, + "ඹී": 1501, + "మ": 1502, + "三": 1503, + "由": 1504, + "ඬං": 1505, + "🍅": 1506, + "තු": 1507, + "ඉ්": 1508, + "ㅅ": 1509, + "ç": 1510, + "数": 1511, + "මෞ": 1512, + "후": 1513, + "ඟ": 1514, + "$": 1515, + "←": 1516, + "ள": 1517, + "ת": 1518, + "€": 1519, + "ṇ": 1520, + "イ": 1521, + "ವ": 1522, + "ک": 1523, + "ಂ": 1524, + "ඛ්": 1525, + "☯": 1526, + "缶": 1527, + "역": 1528, + "": 1529, + "වැ": 1530, + "%": 1531, + "☛": 1532, + "▫": 1533, + "තී": 1534, + "≥": 1535, + "E": 1536, + "成": 1537, + "È": 1538, + "前": 1539, + "¿": 1540, + "බු": 1541, + "ம": 1542, + "レ": 1543, + "ධෝ": 1544, + "ශ්": 1545, + "そ": 1546, + "里": 1547, + "ற": 1548, + "û": 1549, + "◇": 1550, + "្": 1551, + "許": 1552, + "ድ": 1553, + "壬": 1554, + "`": 1555, + "🛑": 1556, + "國": 1557, + "‌": 1558, + "ʻ": 1559, + "ධ්": 1560, + "ஷ": 1561, + "ク": 1562, + "ථැ": 1563, + "⚪": 1564, + "ภ": 1565, + "ඡා": 1566, + "£": 1567, + "ο": 1568, + "​": 1569, + "乎": 1570, + "ě": 1571, + "ר": 1572, + "Ŧ": 1573, + "➢": 1574, + "ɔ": 1575, + "චෛ": 1576, + "S": 1577, + "ી": 1578, + "ඝේ": 1579, + "හො": 1580, + "q": 1581, + "識": 1582, + "з": 1583, + "එෛ": 1584, + "в": 1585, + "カ": 1586, + "➩": 1587, + "රා": 1588, + "🤷": 1589, + "�": 1590, + "💙": 1591, + "―": 1592, + "あ": 1593, + "明": 1594, + "තේ": 1595, + "ņ": 1596, + "හඃ": 
1597, + "U": 1598, + "R": 1599, + "ਤ": 1600, + "ඨා": 1601, + "م": 1602, + "节": 1603, + "♦": 1604, + "පෞ": 1605, + "මෙ": 1606, + "ශී": 1607, + "🔨": 1608, + "ජො": 1609, + "ე": 1610, + "ّ": 1611, + "े": 1612, + "午": 1613, + "එ්": 1614, + "4": 1615, + "ණ්": 1616, + "*": 1617, + "校": 1618, + "̵": 1619, + "ස්": 1620, + "": 1621, + "ඣ": 1622, + "п": 1623, + "아": 1624, + "Ω": 1625, + "é": 1626, + "y": 1627, + "දෞ": 1628, + "ግ": 1629, + "↓": 1630, + "ﻨ": 1631, + "ඪා": 1632, + "ජෛ": 1633, + "।": 1634, + "斯": 1635, + "ඛ": 1636, + "ي": 1637, + "„": 1638, + "ť": 1639, + "": 1640, + "සෝ": 1641, + "式": 1642, + "л": 1643, + "රො": 1644, + "ධි": 1645, + "і": 1646, + "චා": 1647, + "වෝ": 1648, + "ආා": 1649, + "ø": 1650, + "わ": 1651, + "එු": 1652, + "👀": 1653, + "ඞා": 1654, + "ä": 1655, + "ַ": 1656, + "ලෝ": 1657, + "த": 1658, + "ወ": 1659, + "මං": 1660, + "ත්": 1661, + "ま": 1662, + "パ": 1663, + "広": 1664, + "会": 1665, + "問": 1666, + "": 1667, + "志": 1668, + "리": 1669, + "丝": 1670, + "о": 1671, + "😊": 1672, + "責": 1673, + "ඛූ": 1674, + "❣": 1675, + "を": 1676, + "😌": 1677, + "ඬෙ": 1678, + "杜": 1679, + "∫": 1680, + "එී": 1681, + "ඝා": 1682, + "군": 1683, + "š": 1684, + "ඵූ": 1685, + "ί": 1686, + "😆": 1687, + "😛": 1688, + "卡": 1689, + "": 1690, + "කී": 1691, + "ටං": 1692, + "를": 1693, + "බැ": 1694, + "💵": 1695, + "™": 1696, + "산": 1697, + "🌎": 1698, + "ශො": 1699, + "乇": 1700, + "↠": 1701, + "💐": 1702, + "බෙ": 1703, + "元": 1704, + "ඛේ": 1705, + "2": 1706, + "උ": 1707, + "드": 1708, + "දා": 1709, + "ඳෘ": 1710, + "මු": 1711, + "後": 1712, + "ë": 1713, + "➤": 1714, + "ׁ": 1715, + "č": 1716, + "材": 1717, + "ණි": 1718, + "¤": 1719, + "″": 1720, + "究": 1721, + "ගෑ": 1722, + "쉬": 1723, + "d": 1724, + "訥": 1725, + "•": 1726, + "城": 1727, + "🚆": 1728, + "→": 1729, + "こ": 1730, + "H": 1731, + "म": 1732, + "දේ": 1733, + "රෞ": 1734, + "“": 1735, + "ඪො": 1736, + "ሽ": 1737, + "C": 1738, + "ं": 1739, + "ඳූ": 1740, + "吗": 1741, + "😵": 1742, + "": 1743, + "▂": 1744, + "ඡො": 1745, + "දඃ": 1746, + "이": 1747, 
+ "⚘": 1748, + "රී": 1749, + "り": 1750, + "🙉": 1751, + "ф": 1752, + "\b": 1753, + "ඨී": 1754, + "正": 1755, + "催": 1756, + "찰": 1757, + "어": 1758, + "👉": 1759, + "ş": 1760, + "ෆි": 1761, + "ෆං": 1762, + "ටෲ": 1763, + "ማ": 1764, + "ධ": 1765, + "्": 1766, + "業": 1767, + "🎁": 1768, + "": 1769, + "භු": 1770, + "ь": 1771, + "近": 1772, + "බෝ": 1773, + "尺": 1774, + "ㄷ": 1775, + "ر": 1776, + "වෛ": 1777, + "√": 1778, + "": 1779, + "👓": 1780, + "බේ": 1781, + "L": 1782, + "“": 1783, + "ඕං": 1784, + "ෆෝ": 1785, + "මැ": 1786, + "ʈ": 1787, + "ở": 1788, + "😀": 1789, + "ṣ": 1790, + "©": 1791, + "덕": 1792, + "ం": 1793, + "薦": 1794, + "රි": 1795, + "ם": 1796, + "ハ": 1797, + "ਰ": 1798, + "ෂ": 1799, + "®": 1800, + "∋": 1801, + "Ш": 1802, + "සැ": 1803, + "؟": 1804, + "˝": 1805, + "원": 1806, + "ස": 1807, + "и": 1808, + "物": 1809, + "🙌": 1810, + "妇": 1811, + "여": 1812, + "˜": 1813, + "べ": 1814, + "ඟී": 1815, + "ඛී": 1816, + "✴": 1817, + "‹": 1818, + "💇": 1819, + "💃": 1820, + "ύ": 1821, + "හ්": 1822, + "භෛ": 1823, + "ੁ": 1824, + "வ": 1825, + "යේ": 1826, + "▬": 1827, + "යු": 1828, + "දෝ": 1829, + "ධැ": 1830, + "": 1831, + "😔": 1832, + "ŝ": 1833, + "職": 1834, + "П": 1835, + "✌": 1836, + "ටෞ": 1837, + "გ": 1838, + "¯": 1839, + "น": 1840, + "说": 1841, + "飛": 1842, + "ة": 1843, + "යි": 1844, + "ገ": 1845, + "ಯ": 1846, + "පො": 1847, + "ථේ": 1848, + "å": 1849, + "ථ": 1850, + "ෂෘ": 1851, + "!": 1852, + "육": 1853, + "ह": 1854, + "ú": 1855, + "ටූ": 1856, + "祭": 1857, + "せ": 1858, + "භෙ": 1859, + "░": 1860, + "烧": 1861, + "🧗": 1862, + "තෙ": 1863, + "r": 1864, + "☂": 1865, + "කො": 1866, + "ඩෛ": 1867, + "æ": 1868, + "∞": 1869, + "ิ": 1870, + "和": 1871, + "ටි": 1872, + "ථෙ": 1873, + "ﷺ": 1874, + "": 1875, + "ෂෑ": 1876, + "ලී": 1877, + "ගෝ": 1878, + "❋": 1879, + "ඈෑ": 1880, + "ト": 1881, + "හි": 1882, + "ටැ": 1883, + "❸": 1884, + "ශෲ": 1885, + "ලෑ": 1886, + "➧": 1887, + "📕": 1888, + "ණං": 1889, + "ලූ": 1890, + "◄": 1891, + "ඟු": 1892, + "ශේ": 1893, + "К": 1894, + "Ó": 1895, + "ඵු": 1896, + "": 1897, + 
"ਵ": 1898, + "͡": 1899, + "": 1900, + "🌊": 1901, + "ತ": 1902, + "5": 1903, + "සෛ": 1904, + "⭐": 1905, + "‎": 1906, + "⇒": 1907, + "ෆෞ": 1908, + "幣": 1909, + "🖤": 1910, + "シ": 1911, + "ṟ": 1912, + "╮": 1913, + "☟": 1914, + "පෲ": 1915, + "כ": 1916, + "遅": 1917, + "😉": 1918, + "අැ": 1919, + "■": 1920, + "我": 1921, + "අෑ": 1922, + "B": 1923, + "მ": 1924, + "◢": 1925, + "භො": 1926, + "タ": 1927, + "දෲ": 1928, + "ū": 1929, + "ዬ": 1930, + "ኋ": 1931, + "½": 1932, + "族": 1933, + "β": 1934, + "ល": 1935, + "ඵෙ": 1936, + "ඤෑ": 1937, + "જ": 1938, + "ෂෝ": 1939, + "🤗": 1940, + "ვ": 1941, + "×": 1942, + "ش": 1943, + "ℓ": 1944, + "ง": 1945, + "වෲ": 1946, + "维": 1947, + "녕": 1948, + "Ţ": 1949, + "M": 1950, + "": 1951, + "කඃ": 1952, + "🏁": 1953, + "💰": 1954, + "本": 1955, + "█": 1956, + "හෘ": 1957, + "l": 1958, + "ඩෝ": 1959, + "යා": 1960, + "😥": 1961, + "ः": 1962, + "Ʒ": 1963, + "ඳි": 1964, + "曜": 1965, + "弘": 1966, + "ළෙ": 1967, + "動": 1968, + "☺": 1969, + "යො": 1970, + "应": 1971, + "プ": 1972, + "ජූ": 1973, + "φ": 1974, + "ძ": 1975, + "✖": 1976, + "h": 1977, + "井": 1978, + "錢": 1979, + "ɑ": 1980, + "ඨ": 1981, + "ශං": 1982, + "的": 1983, + "පැ": 1984, + "෿": 1985, + "ְ": 1986, + "ඳො": 1987, + "නෙ": 1988, + "": 1989, + "⇜": 1990, + "ÿ": 1991, + "ල්": 1992, + "ඩැ": 1993, + "ு": 1994, + "ඛෑ": 1995, + "特": 1996, + "は": 1997, + "۱": 1998, + "🚗": 1999, + "ඡෝ": 2000, + "ශෛ": 2001, + "భ": 2002, + "౦": 2003, + "ඟං": 2004, + "እ": 2005, + "තෑ": 2006, + "ශෑ": 2007, + "ධේ": 2008, + "ை": 2009, + "ы": 2010, + "Ὑ": 2011, + "ບ": 2012, + "정": 2013, + "ඹො": 2014, + "ථී": 2015, + "ඟෙ": 2016, + "›": 2017, + "🙏": 2018, + "➣": 2019, + "ධෞ": 2020, + "🎰": 2021, + "ථා": 2022, + "ف": 2023, + "ə": 2024, + "ඪ": 2025, + "👼": 2026, + "ರ": 2027, + "Ŝ": 2028, + "新": 2029, + "ටෝ": 2030, + "ም": 2031, + ")": 2032, + "v": 2033, + "ص": 2034, + "ئ": 2035, + "兵": 2036, + "ඛැ": 2037, + "ෂඃ": 2038, + "Ü": 2039, + "न": 2040, + "ම": 2041, + "я": 2042, + "ہ": 2043, + "Γ": 2044, + "ශෝ": 2045, + "இ": 2046, + "°": 2047, + "": 
2048, + "(": 2049, + "訣": 2050, + "එෙ": 2051, + "=": 2052, + "ッ": 2053, + "ป": 2054, + "Σ": 2055, + "堡": 2056, + "ඬ": 2057, + "✘": 2058, + " ": 2059, + "۹": 2060, + "包": 2061, + "д": 2062, + "Π": 2063, + "ඉා": 2064, + "♪": 2065, + "ලෞ": 2066, + "": 2067, + "පු": 2068, + "ධෛ": 2069, + "헌": 2070, + "එ": 2071, + "භැ": 2072, + "ಅ": 2073, + "‑": 2074, + "Ἀ": 2075, + "¶": 2076, + "": 2077, + "є": 2078, + "s": 2079, + "リ": 2080, + "ඉං": 2081, + "ඨො": 2082, + "🤓": 2083, + "إ": 2084, + "ோ": 2085, + "ඍ්": 2086, + "💠": 2087, + "击": 2088, + "ඕෙ": 2089, + "☖": 2090, + "ඒං": 2091, + "ή": 2092, + "ඨු": 2093, + "මො": 2094, + "x": 2095, + "د": 2096, + "ﻟ": 2097, + "ආ": 2098, + "ඔු": 2099, + "客": 2100, + "🏿": 2101, + "少": 2102, + "☎": 2103, + "ඔෙ": 2104, + "↔": 2105, + "ඳං": 2106, + "租": 2107, + "😲": 2108, + "🔖": 2109, + "💯": 2110, + "ඉැ": 2111, + "구": 2112, + "が": 2113, + "": 2114, + "වී": 2115, + "ඳා": 2116, + "රං": 2117, + "චං": 2118, + ":": 2119, + "බෑ": 2120, + "ש": 2121, + "ʁ": 2122, + "ණෘ": 2123, + "සෲ": 2124, + "ả": 2125, + "ጥ": 2126, + "ඇඃ": 2127, + "ಿ": 2128, + "😇": 2129, + "🚘": 2130, + "并": 2131, + "천": 2132, + "რ": 2133, + "ඤූ": 2134, + "ඬූ": 2135, + "ช": 2136, + "¸": 2137, + "": 2138, + "ਸ": 2139, + "♚": 2140, + "﴿": 2141, + "මා": 2142, + "⏳": 2143, + "හූ": 2144, + "-": 2145, + "": 2146, + "නො": 2147, + "╰": 2148, + "ඵං": 2149, + "̃": 2150, + "汉": 2151, + "●": 2152, + "ḷ": 2153, + "²": 2154, + "▷": 2155, + "ი": 2156, + "ア": 2157, + "ෂෛ": 2158, + "🎼": 2159, + "": 2160, + "භෑ": 2161, + "♠": 2162, + "රෝ": 2163, + "🚉": 2164, + "ህ": 2165, + "乾": 2166, + "අේ": 2167, + "ٹ": 2168, + "卒": 2169, + ">": 2170, + "ર": 2171, + "දී": 2172, + "ඩෙ": 2173, + "推": 2174, + "ඥ": 2175, + "ኛ": 2176, + "络": 2177, + "ἄ": 2178, + "t": 2179, + "ඞ්": 2180, + "😋": 2181, + "ý": 2182, + "∆": 2183, + "ජඃ": 2184, + "?": 2185, + "れ": 2186, + "ふ": 2187, + "最": 2188, + "🔬": 2189, + "ἀ": 2190, + "ථං": 2191, + "排": 2192, + "요": 2193, + "್": 2194, + "◎": 2195, + "無": 2196, + "ொ": 2197, + "🔓": 2198, + 
"භා": 2199, + "": 2200, + "ඝෙ": 2201, + "😜": 2202, + "С": 2203, + "ඣා": 2204, + "♊": 2205, + "◆": 2206, + "书": 2207, + "☠": 2208, + "χ": 2209, + "b": 2210, + "👭": 2211, + "ﻬ": 2212, + "德": 2213, + "‐": 2214, + "ඣි": 2215, + "ධූ": 2216, + "": 2217, + "1": 2218, + "공": 2219, + "σ": 2220, + "ළෘ": 2221, + "அ": 2222, + "贼": 2223, + "래": 2224, + "මෘ": 2225, + "කෲ": 2226, + "ì": 2227, + "˚": 2228, + "è": 2229, + "🍔": 2230, + "館": 2231, + "ථෛ": 2232, + "๒": 2233, + "ц": 2234, + "ඡං": 2235, + "☐": 2236, + "ෂ්": 2237, + "✔": 2238, + "චී": 2239, + "👊": 2240, + "ඔා": 2241, + "ඤෙ": 2242, + "ಗ": 2243, + "ශෘ": 2244, + "ಾ": 2245, + "ආං": 2246, + "ඟි": 2247, + "හෙ": 2248, + "大": 2249, + "ෆු": 2250, + "භං": 2251, + "ன": 2252, + "කි": 2253, + "ෂූ": 2254, + "♫": 2255, + "පෙ": 2256, + "ミ": 2257, + "🇺": 2258, + "Н": 2259, + "尔": 2260, + "▣": 2261, + "ч": 2262, + "‛": 2263, + "දි": 2264, + "පෑ": 2265, + "ෆා": 2266, + "ථෘ": 2267, + "ャ": 2268, + "ϕ": 2269, + "ח": 2270, + "多": 2271, + "බො": 2272, + "": 2273 +} \ No newline at end of file diff --git a/examples/examples.ipynb b/examples/examples.ipynb index 461880a..811cf4e 100644 --- a/examples/examples.ipynb +++ b/examples/examples.ipynb @@ -16,8 +16,14 @@ "metadata": {}, "outputs": [], "source": [ - "from sinlib import Tokenizer\n", - "from sinlib import preprocessing" + "from sinlib import Tokenizer, preprocessing, Romanizer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training a sinlib tokenizer" ] }, { @@ -26,7 +32,14 @@ "metadata": {}, "outputs": [], "source": [ - "text = [\"ක්‍රමවත්ව, ඉවසිලිවන්තව\"] * 10" + "corpus = [\n", + " \"\"\"මෙරටට බුදදහම දායාද කරමින් අනුබුදු මිහිඳු හිමිගේ ලංකා ගමනය සිදුවූ උතුම් පොසොන් පුර පසළොස්වක පොහොය දිනය අදට යෙදී තිබේ.\n", + "\n", + "මිහිඳු මහරහතන් වහන්සේ ප්‍රමුඛ ඉට්ඨිය, උත්ථිය, සම්බල, බද්දසාල යන රහතන් වහන්සේලාත් සුමන සාමණේරයන් වහන්සේත් භණ්ඩුක උපාසකක් බුදුරජාණන් වහන්සේගේ නිර්මල බුදුදහම රැගෙන මිහින්තලා පව්වට වැඩම කරවීම අද වැනි පොසොන් පුර පසළොස්වක පෙහොය දිනක සිදුවූ බව 
බෞද්ධ ඉතිහාසයේ සඳහන් වෙයි.\n", + "\n", + "දේවානම් පියතිස්ස රජු ඇතුළු පිරිස චුල්ලහත්ථි පදෝපම සූත්‍රය අසා තෙරුවන් සරණ යාම සිදු වූයේද අද වැනි පොසොන් පොහොය දිනකය.\"\"\",\n", + "\"මේ අතර පොසොන් පොහෝ දින පණිවුඩයක් නිකුත් කරමින් ජනාධිපතිවරයා පෙන්වා දෙන්නේ මිහිඳු මහරහතන් වහන්සේ විසින් අනු දැන වදාළ ධර්ම මාර්ගය මෙරට පත්වී ඇති දේශපාලන, සමාජ හා ආර්ථික ගැටළු නිරාකරණය කර ගනිමින් දියුණු රටක් ගොඩනැඟීමට ඉවහල් කරගන්නා ලෙස සියලු දෙනාගෙන් ඉල්ලා සිටින බවය.\"\n", + "]" ] }, { @@ -35,44 +48,41 @@ "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "['ක්\\u200dරමවත්ව, ඉවසිලිවන්තව',\n", - " 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව',\n", - " 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව',\n", - " 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව',\n", - " 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව',\n", - " 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව',\n", - " 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව',\n", - " 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව',\n", - " 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව',\n", - " 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව']" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[0;31mSignature:\u001b[0m \u001b[0mtokenizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext_list\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mDocstring:\u001b[0m\n", + "Train the tokenizer on a list of text strings.\n", + "\n", + "Parameters\n", + "----------\n", + "text_list : list of str\n", + " List of text strings to be used for training the tokenizer.\n", + "\n", + "Examples\n", + "--------\n", + ">>> from sinlib import Tokenizer\n", + ">>> corpus = [...]\n", + ">>> tokenizer = Tokenizer()\n", + ">>> tokenizer.train(corpus)\n", + "\u001b[0;31mFile:\u001b[0m ~/learning/sinlib/src/sinlib/tokenizer.py\n", + "\u001b[0;31mType:\u001b[0m method" + ] } ], "source": [ - "text" + "tokenizer = Tokenizer()\n", + "tokenizer.train?" 
] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['ක්\\u200dරමවත්ව, ඉවසිලිවන්තව', 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව', 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව', 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව', 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව', 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව', 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව', 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව', 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව', 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව']\n" - ] - } - ], + "outputs": [], "source": [ - "print(text) # have non printables \\u200d" + "tokenizer.train(corpus)" ] }, { @@ -83,16 +93,7 @@ { "data": { "text/plain": [ - "[0.9333333333333333,\n", - " 0.9333333333333333,\n", - " 0.9333333333333333,\n", - " 0.9333333333333333,\n", - " 0.9333333333333333,\n", - " 0.9333333333333333,\n", - " 0.9333333333333333,\n", - " 0.9333333333333333,\n", - " 0.9333333333333333,\n", - " 0.9333333333333333]" + "127" ] }, "execution_count": 6, @@ -101,237 +102,364 @@ } ], "source": [ - "preprocessing.get_sinhala_character_ratio(text, consider_special_character_as_sinhala=False)" + "len(tokenizer)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Encoding text" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, + "outputs": [], + "source": [ + "text = \"උතුම් පොසොන් පොහොය අද\"" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "encodings = tokenizer(text)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]" + "[51, 118, 33, 54, 121, 13, 97, 54, 121, 29, 50, 54, 52, 120]" ] }, - "execution_count": 7, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "preprocessing.get_sinhala_character_ratio(text, consider_special_character_as_sinhala=True)" + "encodings" ] }, { "cell_type": "code", - "execution_count": 8, + 
"execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]" + "['උ', 'තු', 'ම්', ' ', 'පො', 'සො', 'න්', ' ', 'පො', 'හො', 'ය', ' ', 'අ', 'ද']" ] }, - "execution_count": 8, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "preprocessing.get_sinhala_character_ratio(text, consider_special_character_as_sinhala=True, ignore_non_printable=True)" + "[tokenizer.token_id_to_token_map[tok] for tok in encodings]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save trained tokenizer and load from disk" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "tokenizer.save_tokenizer(\".\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "loaded_tokenizer = Tokenizer().load_from_pretrained(\"./vocab.json\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "assert loaded_tokenizer(text)==tokenizer(text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Sinhala text romanization" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "romanizer = Romanizer(char_mapper_fp=None, tokenizer_vocab_path=None) #pass both none to load from default configs" + ] + }, + { + "cell_type": "code", + "execution_count": 15, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "[0.9375,\n", - " 0.9375,\n", - " 0.9375,\n", - " 0.9375,\n", - " 0.9375,\n", - " 0.9375,\n", - " 0.9375,\n", - " 0.9375,\n", - " 0.9375,\n", - " 0.9375]" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "meratata budadahama dayada karamin anubudu mihidu himige lanka gamanaya siduwu uthum 
poson pura pasaloswaka pohoya dinaya adata yedi thibe.mihidu maharahathan wahanse pramuka ettiya, uththiya, sambala, baddasala yana rahathan wahanselath sumana samanorayan wahanseth bhanduka upasakak budurajanan wahansege nirmala bududahama regena mihinthala pawwata wadama karawema ada wani poson pura pasaloswaka pehoya dinaka siduwu bawa bauddha ethihasaye sadahan wei.dewanam piyathissa raju ethulu pirisa chullahaththi padhopama suthraya asa theruwan sarana yama sidu wuyeda ada wani poson pohoya dinakaya.\n" + ] } ], "source": [ - "preprocessing.get_sinhala_character_ratio(text, consider_special_character_as_sinhala=True, ignore_non_printable=False)" + "print(romanizer(corpus[0]))" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ - "tokeniser = Tokenizer()" + "more_complex_text = corpus[1]" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ - "corpus = [\"\"\"මේ මාසයේ ගත වූ දින 15ක කාලය තුළ කොළඹ නගරය ආශ්‍රිත ව සීසීටීවී දර්ශන මඟින් වැරදිවලට සම්බන්ධ පුද්ගලයන් 793 දෙනෙකු හදුනාගත් බව පොලීසිය නිවේදනය කර තිබේ.\"\"\"\n", - " \"\"\"මෑතකාලීන ව රට මුහුණ දුන් අභියෝගාත්මකම ආර්ථික කාරණාව ණය ප්‍රතිව්‍යුගතකරණය බව මුදල් රාජ්‍ය අමාත්‍ය ආචාර්ය රංජිත් සියඹලාපිටිය මහතා පවසයි.\"\"\",\n", - " \"භාෂාව\"\n", - " ]" + "more_complex_text = more_complex_text[:100] + \".... \\nIn linguistics, romanization is the conversion...., adding special chars ^^*#(&#&$^)\"" ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 18, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'me athara poson poho dina paniwudayak nikuth karamin janadhipathiwaraya penwa denne mihidu maharahathan wahanse visi.... 
In linguistics, romanization is the conversion...., adding special chars ^^*#(&#&$^)'" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "tokeniser.train(corpus)" + "romanizer(more_complex_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Few available preprocessing methods on Sinhala texts" ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ - "tokens = tokeniser(\"රට මුහුණ දුන් සිද්ධියේ\")" + "_, token_count = preprocessing.process_text_with_token_counts(corpus[0], consider_special_character_as_sinhala=False, ignore_non_printable=True)" ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[73, 37, 2, 68, 56, 38, 2, 62, 29, 2, 46, 54, 87, 4]" + "271" ] }, - "execution_count": 34, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "tokens" + "token_count" ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ - "decoded_tokens = [tokeniser.token_id_to_token_map[id] for id in tokens]" + "more_complex_text += \"ශ්‍රී ලංකා ප්‍රජාතාන්ත්‍රික සමාජවාදී\"" ] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "මේ අතර පොසොන් පොහෝ දින පණිවුඩයක් නිකුත් කරමින් ජනාධිපතිවරයා පෙන්වා දෙන්නේ මිහිඳු මහරහතන් වහන්සේ විසි.... 
\n", + "In linguistics, romanization is the conversion...., adding special chars ^^*#(&#&$^)ශ්‍රී ලංකා ප්‍රජාතාන්ත්‍රික සමාජවාදී\n" + ] + } + ], + "source": [ + "print(more_complex_text)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'රට මුහුණ දුන් සිද්යේ'" + "'rs ^^*#(&#&$^)ශ්\\u200dරී ලංකා ප්\\u200dරජාතාන්ත්\\u200dරික සමාජවාදී'" ] }, - "execution_count": 36, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "\"\".join(decoded_tokens)" + "more_complex_text[-50:]" ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'රට මුහුණ දුන් සිද්යේ'" + "'මේ අතර පොසොන් පොහෝ දින පණිවුඩයක් නිකුත් කරමින් ජනාධිපතිවරයා පෙන්වා දෙන්නේ මිහිඳු මහරහතන් වහන්සේ විසි.... , ...., ^^*#(&#&$^)ශ්\\u200dරී ලංකා ප්\\u200dරජාතාන්ත්\\u200dරික සමාජවාදී'" ] }, - "execution_count": 37, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "tokeniser.decode(tokens)" + "preprocessing.remove_english_characters(more_complex_text)" ] }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['භා', 'ෂා', 'ව']" + "'rs ^^*#(&#&$^)ශ්රී ලංකා ප්රජාතාන්ත්රික සමාජවාදී'" ] }, - "execution_count": 38, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "[tokeniser.token_id_to_token_map[id] for id in tokeniser(\"භාෂාව\")]" + "preprocessing.remove_non_printable(more_complex_text[-50:])" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['ස', 'ි', 'ං', 'හ', 'ල']" + "0.610738255033557" ] }, - "execution_count": 29, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "list(\"සිංහල\")" + 
"preprocessing.get_sinhala_character_ratio(more_complex_text)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "1.0" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "preprocessing.get_sinhala_character_ratio(\n", + " preprocessing.remove_english_characters(\n", + " more_complex_text\n", + " )\n", + ")" + ] } ], "metadata": { + "kernelspec": { + "display_name": "analysis-env", + "language": "python", + "name": "analysis" + }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" } }, "nbformat": 4, diff --git a/pyproject.toml b/pyproject.toml index 4398668..704f290 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "sinlib" -version = "0.0.8.5" +version = "0.0.8.6" description = "Sinhala NLP Toolkit" authors = [ { name = "Ransaka", email = "ransaka.ravihara@gmail.com" } diff --git a/src/sinlib/__init__.py b/src/sinlib/__init__.py index e48d083..129cf78 100644 --- a/src/sinlib/__init__.py +++ b/src/sinlib/__init__.py @@ -1,7 +1,9 @@ from sinlib.tokenizer import Tokenizer from sinlib.utils import preprocessing +from sinlib.romanize import Romanizer __all__ = [ "Tokenizer", - "preprocessing" + "preprocessing", + "Romanizer" ] diff --git a/src/sinlib/romanize.py b/src/sinlib/romanize.py new file mode 100644 index 0000000..dafa4d3 --- /dev/null +++ b/src/sinlib/romanize.py @@ -0,0 +1,43 @@ +from .utils.preprocessing import load_char_mapper +from .tokenizer import Tokenizer +from .utils.preprocessing import DEFAULT_VOCAB_MAP_FP, CHAR_MAPPER_FP +from .utils.chars import ALL_SINHALA_CHARACTERS, NUBERS_AND_PUNKTS +from .utils.preprocessing 
import remove_non_printable +import numpy as np + + +class Romanizer: + def __init__(self, char_mapper_fp: str, tokenizer_vocab_path: str): + if char_mapper_fp is None: + char_mapper_fp = CHAR_MAPPER_FP + if tokenizer_vocab_path is None: + tokenizer_vocab_path = DEFAULT_VOCAB_MAP_FP + self.char_mapper = load_char_mapper(char_mapper_fp) + self.tokenizer = Tokenizer() + self.tokenizer.load_from_pretrained(tokenizer_vocab_path) + + def __call__(self, text): + return self.__romanize(text) + + def __romanize(self, text: str): + text = remove_non_printable(text) + chars = np.array(list(text)) + sinhala_mask = [ + True + if ch in ALL_SINHALA_CHARACTERS + list(NUBERS_AND_PUNKTS) + [" "] + else False + for ch in chars + ] + sinhala_text = "".join(chars[sinhala_mask]).strip() + encodings = self.tokenizer(sinhala_text) + decoded_sinhala_chars = [ + self.tokenizer.token_id_to_token_map[c] for c in encodings + ] + romanized_sinhala = [ + self.char_mapper.get(ch, ch if ch in NUBERS_AND_PUNKTS.union(" ") else None) + for ch in decoded_sinhala_chars + ] + romanized_sinhala = "".join(romanized_sinhala) + word_2_word_mapping = dict(zip(sinhala_text.split(), romanized_sinhala.split())) + romanized_text = [word_2_word_mapping.get(word, word) for word in text.split()] + return " ".join(romanized_text) diff --git a/src/sinlib/tokenizer.py b/src/sinlib/tokenizer.py index f2436b9..b5bc0ee 100644 --- a/src/sinlib/tokenizer.py +++ b/src/sinlib/tokenizer.py @@ -1,26 +1,96 @@ +import json +import warnings +from pathlib import Path import concurrent.futures -from .utils.preprocessing import process_text +from .utils.preprocessing import process_text, load_default_vocab_map + class Tokenizer: def __init__(self): + self.unknown_token_id = None + self.token_id_to_token_map = None + self.vocab_map = None self.unknown_token = "" self.tokenized_chars = [] self.unique_chars = [] - - def __encode(self, text): + + def __encode(self, text) -> list: processed_text = self.__process_text(text) - 
encoded_text = [self.vocab_map.get(char, self.unknown_token_id) for char in processed_text] + encoded_text = [ + self.vocab_map.get(char, self.unknown_token_id) for char in processed_text + ] return encoded_text - - def __call__(self, text): + + def __call__(self, text) -> list: + """ + Encode the given text into a list of tokens. + + Parameters + ---------- + text : str + Text to be encoded. + + Returns + ------- + encoded_tokens : list of int + List of tokens representing the encoded text. + + Examples + -------- + >>> from sinlib import Tokenizer + >>> corpus = [...] + >>> tokenizer = Tokenizer() + >>> tokenizer.train(corpus) + >>> tokenizer("මම ගෙදර ගියා") + [2041, 2041, 942, 965, 624, 909, 942, 54, 1960] + """ return self.__encode(text) - - def decode(self, ids): - return "".join([self.token_id_to_token_map.get(token,self.unknown_token) for token in ids]) - def train(self, text_list): - self.__train_chracter_level_tokenizer(text_list) - + def decode(self, ids) -> str: + """ + Decode a list of token IDs into a string. + + Parameters + ---------- + ids : list of int + List of token IDs to be decoded. + + Returns + ------- + decoded_text : str + The decoded text string. + + Examples + -------- + >>> from sinlib import Tokenizer + >>> tokenizer = Tokenizer() + >>> tokenizer.train([...]) + >>> encoded_tokens = [2041, 2041, 942, 965, 624, 909, 942, 54, 1960] + >>> tokenizer.decode(encoded_tokens) + 'මම ගෙදර ගියා' + """ + return "".join( + [self.token_id_to_token_map.get(token, self.unknown_token) for token in ids] + ) + + def train(self, text_list) -> None: + """ + Train the tokenizer on a list of text strings. + + Parameters + ---------- + text_list : list of str + List of text strings to be used for training the tokenizer. + + Examples + -------- + >>> from sinlib import Tokenizer + >>> corpus = [...] 
+ >>> tokenizer = Tokenizer() + >>> tokenizer.train(corpus) + """ + self.__train_character_level_tokenizer(text_list) + def __len__(self): return len(self.vocab_map) @@ -28,12 +98,64 @@ def __len__(self): def __process_text(t): return process_text(t) - def __train_chracter_level_tokenizer(self, text_list): + def __train_character_level_tokenizer(self, text_list): with concurrent.futures.ThreadPoolExecutor() as executor: results = list(executor.map(self.__process_text, text_list)) self.tokenized_chars = [char for sublist in results for char in sublist] self.unique_chars = set(self.tokenized_chars) - self.vocab_map = dict(zip(self.unique_chars,range(len(self.unique_chars)))) + self.vocab_map = dict(zip(self.unique_chars, range(len(self.unique_chars)))) self.vocab_map[self.unknown_token] = len(self.vocab_map) self.unknown_token_id = self.vocab_map[self.unknown_token] - self.token_id_to_token_map = {value:key for key,value in self.vocab_map.items()} \ No newline at end of file + self.token_id_to_token_map = { + value: key for key, value in self.vocab_map.items() + } + + def load_from_pretrained(self, file_path: str) -> None: + """ + Load the vocabulary map from a pre-trained file. + + Parameters + ---------- + file_path : str + Path to the file containing the pre-trained vocabulary map. + + Returns + ------- + None + + Warns + ----- + UserWarning + If the file is not found at the specified path, a default vocabulary map is loaded and a warning is issued. + + Examples + -------- + >>> from sinlib import Tokenizer + >>> tokenizer = Tokenizer() + >>> tokenizer.load_from_pretrained("pretrained_vocab.json") + """ + if Path(file_path).is_file(): + with open(file_path, "r") as f: + self.vocab_map = json.load(f) + else: + warnings.warn( + "File not found at the specified path. 
Loaded default vocab map.", + UserWarning, + ) + self.vocab_map = load_default_vocab_map() + + self.token_id_to_token_map = { + value: key for key, value in self.vocab_map.items() + } + self.unknown_token_id = self.vocab_map[self.unknown_token] + return self + + def save_tokenizer(self, save_path: str): + save_path = Path(save_path) + configurations = {"unknown_token": self.unknown_token} + + with open(save_path / "vocab.json", "w", encoding="utf-8") as file: + json.dump(self.vocab_map, file, ensure_ascii=False, indent=4) + + with open(save_path / "config.json", "w") as file: + json.dump(configurations, file, indent=4) \ No newline at end of file diff --git a/src/sinlib/utils/chars.py b/src/sinlib/utils/chars.py index 3dd2cb1..228b247 100644 --- a/src/sinlib/utils/chars.py +++ b/src/sinlib/utils/chars.py @@ -1,43 +1,188 @@ from string import punctuation -BASE_CONSONANTS = [ - 'ක', 'ඛ', 'ග', 'ඝ', 'ඞ', 'ඟ', - 'ච', 'ඡ', 'ජ', 'ඣ', 'ඤ', 'ඦ', - 'ට', 'ඨ', 'ඩ', 'ඪ', 'ණ', 'ඬ', - 'ත', 'ථ', 'ද', 'ධ', 'න', 'ඳ', - 'ප', 'ඵ', 'බ', 'භ', 'ම', 'ඹ', - 'ය', 'ර', 'ල', 'ව', - 'ශ', 'ෂ', 'ස', 'හ', 'ළ', 'ෆ', +ALL_SINHALA_CHARACTERS = [ + "ඏ", + "ඛ", + "ම", + "ඍ", + "ு", + "ා", + "ප", + "ඝ", + "ඹ", + "ඓ", + "ෑ", + "ෂ", + "ැ", + "ෲ", + "ි", + "ක", + "ණ", + "ධ", + "்", + "ඵ", + "ඞ", + "ජ", + "හ", + "ෝ", + "ඤ", + "ට", + "ඇ", + "ෞ", + "ඒ", + "ූ", + "ව", + "ඣ", + "ච", + "ඖ", + "ෘ", + "ු", + "ඳ", + "ඌ", + "ෙ", + "්", + "ඥ", + "ீ", + "ෛ", + "ෳ", + "ඔ", + "ආ", + "ළ", + "උ", + "ඟ", + "ඃ", + "ඈ", + "ඪ", + "බ", + "අ", + "ෆ", + "ත", + "ේ", + "ඬ", + "ය", + "ො", + "ශ", + "භ", + "ං", + "ර", + "ඉ", + "ඨ", + "ී", + "ඕ", + "ඡ", + "න", + "ස", + "ද", + "ඩ", + "ෟ", + "ග", + "එ", + "ඊ", + "ල", + "ථ", ] -SAN = [ - 'ඟ', 'ඦ', 'ඬ', 'ඳ', 'ඹ' +BASE_CONSONANTS = [ + "ක", + "ඛ", + "ග", + "ඝ", + "ඞ", + "ඟ", + "ච", + "ඡ", + "ජ", + "ඣ", + "ඤ", + "ඦ", + "ට", + "ඨ", + "ඩ", + "ඪ", + "ණ", + "ඬ", + "ත", + "ථ", + "ද", + "ධ", + "න", + "ඳ", + "ප", + "ඵ", + "බ", + "භ", + "ම", + "ඹ", + "ය", + "ර", + "ල", + "ව", + "ශ", + "ෂ", + "ස", 
+ "හ", + "ළ", + "ෆ", ] -SAN_MAPPING = {'ඟ': 'ංග', 'ඦ': 'ඤ්ජ', 'ඬ': 'ණ්ඩ', 'ඳ': 'න්ද', 'ඹ': 'ම්බ'} +SAN = ["ඟ", "ඦ", "ඬ", "ඳ", "ඹ"] + +SAN_MAPPING = {"ඟ": "ංග", "ඦ": "ඤ්ජ", "ඬ": "ණ්ඩ", "ඳ": "න්ද", "ඹ": "ම්බ"} REVERSE_SAN_MAPPING = {d: v for v, d in SAN_MAPPING.items()} -CONSONANTS = [c + '්' for c in BASE_CONSONANTS] +CONSONANTS = [c + "්" for c in BASE_CONSONANTS] VOWELS = [ - 'අ', 'ආ', 'ඇ', 'ඈ', 'ඉ', 'ඊ', 'උ', 'ඌ', - 'ඍ', 'ඎ', 'එ', 'ඒ', 'ඓ', 'ඔ', 'ඕ', 'ඖ', - 'අං', 'අඃ', + "අ", + "ආ", + "ඇ", + "ඈ", + "ඉ", + "ඊ", + "උ", + "ඌ", + "ඍ", + "ඎ", + "එ", + "ඒ", + "ඓ", + "ඔ", + "ඕ", + "ඖ", + "අං", + "අඃ", ] VOWEL_DIACRITICS = [ - '', 'ා', 'ැ', 'ෑ', 'ි', 'ී', 'ු', 'ූ', 'ෘ', - 'ෲ', 'ෙ', 'ේ', 'ෛ', 'ො', 'ෝ', 'ෞ', - 'ං', 'ඃ', '්', 'ෳ' + "", + "ා", + "ැ", + "ෑ", + "ි", + "ී", + "ු", + "ූ", + "ෘ", + "ෲ", + "ෙ", + "ේ", + "ෛ", + "ො", + "ෝ", + "ෞ", + "ං", + "ඃ", + "්", + "ෳ", ] LONG_TO_SHORT_VOWEL_DIACRITICS_MAPPING = { - '': 'ා', - 'ෑ': 'ැ', - 'ී': 'ි', - 'ූ': 'ු', - 'ේ': 'ෙ', - 'ෝ': 'ො' + "": "ා", + "ෑ": "ැ", + "ී": "ි", + "ූ": "ු", + "ේ": "ෙ", + "ෝ": "ො", } DIACRITICS_MAPPING = {v: d for v, d in zip(VOWELS, VOWEL_DIACRITICS)} @@ -45,49 +190,111 @@ REVERSE_DIACRITICS_MAPPING = {d: v for v, d in zip(VOWELS, VOWEL_DIACRITICS)} CONJUNCT_CONSONANTS = [ - 'ක්ර', 'ඛ්ර', 'ග්ර', 'ඝ්ර', 'ඞ්ර', 'ඟ්ර', - 'ක්ය', 'ඛ්ය', 'ග්ය', 'ඝ්ය', 'ඞ්ය', 'ඟ්ය', - 'ක්ෂ', '෴', + "ක්ර", + "ඛ්ර", + "ග්ර", + "ඝ්ර", + "ඞ්ර", + "ඟ්ර", + "ක්ය", + "ඛ්ය", + "ග්ය", + "ඝ්ය", + "ඞ්ය", + "ඟ්ය", + "ක්ෂ", + "෴", ] NUMERALS = [ - '𑇡', '𑇢', '𑇣', '𑇤', '𑇥', '𑇦', '𑇧', '𑇨', '𑇩', '𑇪', - '𑇫', '𑇬', '𑇭', '𑇮', '𑇯', '𑇰', '𑇱', '𑇲', '𑇳', '𑇴', + "𑇡", + "𑇢", + "𑇣", + "𑇤", + "𑇥", + "𑇦", + "𑇧", + "𑇨", + "𑇩", + "𑇪", + "𑇫", + "𑇬", + "𑇭", + "𑇮", + "𑇯", + "𑇰", + "𑇱", + "𑇲", + "𑇳", + "𑇴", ] GOSHA_LETTERS = [ - 'අ', 'ආ', 'ඇ', 'ඈ', 'ඉ', 'ඊ', 'උ', 'ඌ', - 'ඍ', 'ඎ', 'එ', 'ඒ', 'ඓ', 'ඔ', 'ඕ', 'ඖ', - 'අං', 'අඃ', - 'ග', 'ඝ', 'ඞ', - 'ජ', 'ඣ', 'ඤ', - 'ඩ', 'ඪ', 'ණ', - 'ද', 'ධ', 'න', - 'බ', 'භ', 'ම', - 'ය', 'ර', 'ල', 'ව', - 'හ' + "අ", + "ආ", + "ඇ", + "ඈ", + 
"ඉ", + "ඊ", + "උ", + "ඌ", + "ඍ", + "ඎ", + "එ", + "ඒ", + "ඓ", + "ඔ", + "ඕ", + "ඖ", + "අං", + "අඃ", + "ග", + "ඝ", + "ඞ", + "ජ", + "ඣ", + "ඤ", + "ඩ", + "ඪ", + "ණ", + "ද", + "ධ", + "න", + "බ", + "භ", + "ම", + "ය", + "ර", + "ල", + "ව", + "හ", ] AGOSHA_LETTERS = [ - 'ක්', 'ඛ්', - 'ච්', 'ඡ්', - 'ට්', 'ඨ්', - 'ත්', 'ථ්', - 'ප්', 'ඵ්', + "ක්", + "ඛ්", + "ච්", + "ඡ්", + "ට්", + "ඨ්", + "ත්", + "ථ්", + "ප්", + "ඵ්", ] AGOSHA_TO_GOSHA_MAPPING = { - 'ක්': 'ග්', - 'ඛ්': 'ඝ්', - 'ච්': 'ජ්', - 'ඡ්': 'ඣ්', - 'ට්': 'ඩ්', - 'ඨ්': 'ඪ්', - 'ත්': 'ද්', - 'ථ්': 'ධ්', - 'ප්': 'බ්', - 'ඵ්': 'භ්', + "ක්": "ග්", + "ඛ්": "ඝ්", + "ච්": "ජ්", + "ඡ්": "ඣ්", + "ට්": "ඩ්", + "ඨ්": "ඪ්", + "ත්": "ද්", + "ථ්": "ධ්", + "ප්": "බ්", + "ඵ්": "භ්", } PUNKT = set(punctuation) NUMBERS = set("1234567890") diff --git a/src/sinlib/utils/preprocessing.py b/src/sinlib/utils/preprocessing.py index b2be711..3ce505f 100644 --- a/src/sinlib/utils/preprocessing.py +++ b/src/sinlib/utils/preprocessing.py @@ -2,13 +2,32 @@ import multiprocessing import re from .chars import VOWEL_DIACRITICS, NUBERS_AND_PUNKTS, ALL_LETTERS -import numpy as np -import os +import json +from pathlib import Path +import warnings -# file_path = os.path.join(os.path.dirname(__file__), '../data', 'sinhala_chars_with_special_chars.txt') +DEFAULT_VOCAB_MAP_FP = "../data/vocab_map.json" +CHAR_MAPPER_FP = "../data/char_map.json" -# with open(file_path,'r') as f: -# SINHALA_CHARS_WITH_SPECIAL_CHARS = f.read().split("\n") + +def load_char_mapper(char_mapper_fp): + if Path(char_mapper_fp).is_file(): + with open(char_mapper_fp, "r") as f: + char_mapper = json.load(f) + else: + warnings.warn( + "File not found at the specified path. 
Loaded default char map.", + UserWarning, + ) + with open(CHAR_MAPPER_FP, "r") as f: + char_mapper = json.load(f) + return char_mapper + + +def load_default_vocab_map(): + with open(DEFAULT_VOCAB_MAP_FP, "r") as f: + vocab_map = json.load(f) + return vocab_map def remove_non_printable(input_string): @@ -69,7 +88,38 @@ def process_text(t): return tokenized_chars -def process_text_with_token_counts(t:str, consider_special_character_as_sinhala:bool, ignore_non_printable:bool): +def process_text_with_token_counts( + t: str, consider_special_character_as_sinhala: bool, ignore_non_printable: bool +): + """ + Process the given text, tokenizing it and counting the tokens. + + Parameters + ---------- + t : str + The text to be processed. + consider_special_character_as_sinhala : bool + If True, special characters will be considered as Sinhala characters. + ignore_non_printable : bool + If True, non-printable characters will be removed from the text. + + Returns + ------- + tokenized_chars : list of str + List of tokenized characters from the text. + token_counts : int + Total count of tokens in the text. + + Examples + -------- + >>> from sinlib.utils.preprocessing import process_text_with_token_counts + >>> text = "මම ගෙදර ගියා." + >>> tokenized_chars, token_counts = process_text_with_token_counts(text, True, True) + >>> print(tokenized_chars) + ['ම', 'ම', ' ', 'ගෙ', 'ද', 'ර', ' ', 'ගි', 'යා', '.'] + >>> print(token_counts) + 10 + """ if ignore_non_printable: t = remove_non_printable(t) @@ -92,25 +142,62 @@ def process_text_with_token_counts(t:str, consider_special_character_as_sinhala: tokenized_chars.append(char + t[i + 1]) else: tokenized_chars.append(char) - else: tokenized_chars.append(char) return tokenized_chars, token_counts -def get_sinhala_character_ratio(text, consider_special_character_as_sinhala:bool=True, ignore_non_printable:bool=True): - """Retuning sinhala character ratio for given text string for given settings. Expects optional two parameters. 
- consider_special_character_as_sinhala: if this set to true all numbers and special characters will consider as sinhala. - ignore_non_printable: if this set to true non printables will remove before start processing +def get_sinhala_character_ratio( + text, + consider_special_character_as_sinhala: bool = True, + ignore_non_printable: bool = True, +): + """ + Calculate the ratio of Sinhala characters in the given text. + + Parameters + ---------- + text : str or list of str + The text or list of text strings to be processed. + consider_special_character_as_sinhala : bool, default=True + If True, numbers and special characters will be considered as Sinhala characters. + ignore_non_printable : bool, default=True + If True, non-printable characters will be removed before processing. + + Returns + ------- + ratio : float or list of float + The ratio of Sinhala characters in the text. If the input is a list, returns a list of ratios for each text string. + + Examples + -------- + >>> from sinlib.utils.preprocessing import get_sinhala_character_ratio + >>> text = "මම ගෙදර ගියා." 
+ >>> ratio = get_sinhala_character_ratio(text, True, True) + >>> print(ratio) + 1.0 + + >>> texts = ["මම ගෙදර ගියා.", "This is an example."] + >>> ratio = get_sinhala_character_ratio(texts, False, True) + >>> print(ratios) + [0.875, 0.0] """ if isinstance(text, str): - tokenized_text, sinhala_token_count = process_text_with_token_counts(text,consider_special_character_as_sinhala,ignore_non_printable=ignore_non_printable) + tokenized_text, sinhala_token_count = process_text_with_token_counts( + text, + consider_special_character_as_sinhala, + ignore_non_printable=ignore_non_printable, + ) tokenized_text = [tok for tok in tokenized_text if tok != " "] return sinhala_token_count / len(tokenized_text) elif isinstance(text, list): pool = multiprocessing.Pool() - partial_process_text = partial(process_text_with_token_counts, consider_special_character_as_sinhala=consider_special_character_as_sinhala, ignore_non_printable=ignore_non_printable) + partial_process_text = partial( + process_text_with_token_counts, + consider_special_character_as_sinhala=consider_special_character_as_sinhala, + ignore_non_printable=ignore_non_printable, + ) results = pool.map(partial_process_text, text) pool.close() pool.join()