diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..26d3352
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,3 @@
+# Default ignored files
+/shelf/
+/workspace.xml
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..8596b18
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,10 @@
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..5d481c1
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/sinlib.iml b/.idea/sinlib.iml
new file mode 100644
index 0000000..ee28fd3
--- /dev/null
+++ b/.idea/sinlib.iml
@@ -0,0 +1,14 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..35eb1dd
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/README.md b/README.md
index b7a1c66..2e5fb41 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# Sinlib (Buggy alpha version)
+# Sinlib
![Alt text](sinlib.png)
@@ -29,14 +29,27 @@ encoding = tokenizer("මේ අතර, පෙබරවාරි මාසයේ
[tokenizer.token_id_to_token_map[id] for id in encoding]
['මේ', ' ', 'අ', 'ත', 'ර', ',', ' ', 'පෙ', 'බ', 'ර', 'වා', 'රි', ' ', 'මා', 'ස', 'යේ', ' ', 'ප', 'ළ', 'මු']
```
+
02. Preprocessor
```python
sent = ['මෙය සිංහල වාක්යක්', 'මෙය සිංහල වාක්යක් සමග english character කීපයක්','This is complete english sentence']
print(sent)
-['මෙය සිංහල වාක්\u200dයක්', 'මෙය සිංහල වාක්\u200dයක් සමග english character කීපයක්', 'This is complete english sentence']
+#['මෙය සිංහල වාක්\u200dයක්', 'මෙය සිංහල වාක්\u200dයක් සමග english character කීපයක්', 'This is complete english sentence']
from sinlib.preprocessing import get_sinhala_character_ratio
get_sinhala_character_ratio(sent)
-[0.9, 0.46875, 0.0]
+#[0.9, 0.46875, 0.0]
+```
+
+03. Sinhala Romanizer
+ ```python
+texts = ["hello, මේ මාසයේ ගත වූ දින 15ක කාලය තුළ කොළඹ නගරය ආශ්රිත ව", "මෑතකාලීන ව රට මුහුණ දුන් අභියෝගාත්මකම ආර්ථික කාරණාව ණය ප්රතිව්යුගතකරණය බව මුදල් රාජ්ය අමාත්ය ආචාර්ය රංජිත් සියඹ$$$ mahatha see more****"]
+
+from sinlib import Romanizer
+
+romanizer = Romanizer(char_mapper_fp = None, tokenizer_vocab_path = None)
+romanizer(texts)
+#['hello, me masaye gatha wu dina 15ka kalaya thula kolaba nagaraya ashritha wa',
+# 'methakaleena wa rata muhuna dun abhiyogathmakama arthika karanawa naya prathiwyugathakaranaya bawa mudal rajya amathya acharya ranjith siyaba$$$ mahatha see more****']
```
diff --git a/data/char_map.json b/data/char_map.json
new file mode 100644
index 0000000..96e71af
--- /dev/null
+++ b/data/char_map.json
@@ -0,0 +1 @@
+{"\u0dc6\u0dd2": "fi", "\u0db6\u0dd8": "bru", "\u0d86\u0dd9": "aa", "\u0d9f\u0dd4": "gu", "\u0dc1\u0dd9": "she", "\u0dbb\u0dd8": "ru", "\u0daa": "ta", "\u0db4\u0d83": "ph", "\u0dab\u0dd6": "nu", "\u0db4\u0dd3": "pe", "\u0dc6\u0dcf": "fa", "\u0d8c": "u", "\u0dab": "na", "\u0daf\u0ddb": "dai", "\u0dc4\u0dd8": "hur", "\u0daf\u0dd1": "de", "\u0d9d\u0dd9": "ghe", "\u0dbd\u0dd9\u0dcf": "lo", "\u0dbd\u0d82": "lan", "\u0dc5": "la", "\u0dc0\u0dd0": "wa", "\u0d9a\u0d82": "kan", "\u0d87\u0dd4": "e", "\u0dab\u0dd8": "nru", "\u0db4\u0dd1": "pe", "\u0d9a\u0ddb": "kai", "\u0d85\u0d83": "a", "\u0d9c\u0dd9\u0dcf": "go", "\u0dba\u0d83": "yan", "\u0dba\u0dd9": "ye", "\u0db1\u0dd0": "ne", "\u0dbb": "ra", "\u0da1\u0dd9": "che", "\u0db6\u0dd1": "be", "\u0d9c\u0dd2": "gi", "\u0dae\u0dd9\u0dcf": "tho", "\u0d95": "o", "\u0da7\u0dd9\u0dcf": "to", "\u0dae\u0dd9": "the", "\u0da0\u0dd3": "chi", "\u0da9\u0dd4": "du", "\u0db6\u0dd2": "bi", "\u0d8b": "u", "\u0d9c\u0df2": "gru", "\u0dbd\u0dd4": "lu", "\u0dc2\u0dd8": "shru", "\u0dc2\u0dd3": "shi", "\u0da8\u0d82": "tan", "\u0d91\u0dd0": "e", "\u0dbd\u0dd0": "la", "\u0dc3\u0dd9\u0ddf": "sau", "\u0d9a\u0dd9": "ke", "\u0d9c\u0dd3": "ge", "\u0db5\u0dd4": "pu", "\u0d87\u0dd0": "e", "\u0d9f\u0dd0": "ge", "\u0db8\u0dd6": "mu", "\u0d9c\u0dd0": "ge", "\u0dae": "tha", "\u0d9f\u0dd9\u0dcf": "go", "\u0da4\u0dd4": "du", "\u0db1\u0d82": "nam", "\u0d91\u0ddb": "e", "\u0da7\u0dd3": "ti", "\u0daf\u0dd3": "di", "\u0d9f\u0dd2": "gi", "\u0da7\u0dd8": "tru", "\u0dac\u0dd2": "di", "\u0db5\u0ddb": "pi", "\u0db8\u0dcf": "ma", "\u0db7\u0dd4": "bhu", "\u0db3\u0dd0": "dhe", "\u0da8\u0dd9\u0dcf": "tho", "\u0d9c\u0dd9\u0ddf": "gau", "\u0daf\u0dd9": "de", "\u0da8": "ta", "\u0d88\u0d82": "een", "\u0d9d\u0dcf": "gha", "\u0da9\u0dcf": "da", "\u0dc3\u0dd8": "ru", "\u0d95\u0d82": "oon", "\u0d8b\u0dd1": "ue", "\u0db9\u0dd9\u0ddf": "au", "\u0da2\u0dd9\u0dcf": "jo", "\u0da0\u0dd9\u0ddf": "chau", "\u0dbd": "la", "\u0db8\u0dd9\u0ddf": "mau", "\u0db8\u0dd9\u0dcf": "mo", "\u0dc6\u0dd1": 
"fa", "\u0da1\u0dd4": "ju", "\u0dc3\u0dd9": "se", "\u0da9\u0dd0": "da", "\u0db0\u0ddb": "dhai", "\u0d8b\u0dd6": "u", "\u0da6": "cha", "\u0d8b\u0dd4": "u", "\u0d87\u0d82": "en", "\u0dbb\u0dd9\u0dcf": "ro", "\u0dba": "ya", "\u0d9b\u0dd4": "ku", "\u0dab\u0dd9\u0dcf": "no", "\u0da4\u0dd9\u0dcf": "gho", "\u0daa\u0dcf": "dha", "\u0dbb\u0dd6": "ru", "\u0dbb\u0dd4": "ru", "\u0d9b\u0dd6": "ku", "\u0dab\u0dd2": "ni", "\u0db6\u0dd9\u0ddf": "bau", "\u0d8d\u0dd0": "ru", "\u0da7\u0dd4": "tu", "\u0dc4\u0d82": "han", "\u0da9\u0dd1": "de", "\u0da0\u0ddb": "chai", "\u0dc5\u0dd9": "le", "\u0db0\u0dd3": "dhi", "\u0da9\u0dd9": "de", "\u0dc4\u0dd9\u0dcf": "ho", "\u0d9a\u0dd0": "ke", "\u0d9a\u0dd2": "ki", "\u0db3\u0dd6": "du", "\u0db3\u0dd9": "dhe", "\u0d9e": "n", "\u0d9f": "ga", "\u0da9\u0df2": "dru", "\u0dba\u0dd9\u0dcf": "yo", "\u0db4\u0dd0": "pe", "\u0dc2": "sha", "\u0dae\u0dcf": "tha", "\u0dad\u0d83": "th", "\u0d86\u0d82": "aan", "\u0db5": "pa", "\u0dc0\u0dd9\u0ddf": "chau", "\u0dba\u0dd6": "yu", "\u0dc6\u0dd9\u0dcf": "fo", "\u0db1\u0dd9\u0dcf": "no", "\u0d9c\u0d82": "gan", "\u0da7\u0dd1": "te", "\u0db3\u0dcf": "dha", "\u0d92": "e", "\u0dc4\u0d83": "ha", "\u0d96": "au", "\u0d9a\u0dd1": "ke", "\u0d8a": "e", "\u0dac\u0dcf": "da", "\u0dad\u0dd9": "the", "\u0dc2\u0d82": "shan", "\u0d9a\u0d83": "kan", "\u0da2\u0dd8": "jru", "\u0db3\u0dd9\u0dcf": "dho", "\u0da8\u0dd3": "ti", "\u0db4\u0d82": "pan", "\u0dc5\u0dd9\u0dcf": "lo", "\u0dbd\u0ddb": "lai", "\u0db0\u0dd1": "dee", "\u0d85\u0dd1": "ee", "\u0db0\u0dd9\u0dcf": "dho", "\u0db9\u0dd3": "bi", "\u0d9f\u0dd9": "ge", "\u0da6\u0dcf": "ja", "\u0db9": "ba", "\u0dbb\u0dd9": "re", "\u0da8\u0dd0": "te", "\u0dc3\u0dd9\u0dcf": "so", "\u0dc2\u0dd9\u0dcf": "sho", "\u0dc0\u0dd6": "wu", "\u0db8\u0d82": "man", "\u0dc6\u0dd6": "fu", "\u0da7\u0d82": "tan", "\u0d94\u0dd1": "we", "\u0d8d\u0dd3": "ri", "\u0da4\u0dd0": "ke", "\u0daf\u0dd9\u0ddf": "dau", "\u0da3\u0dcf": "ja", "\u0dc1": "sha", "\u0dc2\u0d83": "sha", "\u0d9e\u0dcf": "dha", "\u0da0\u0dd2": "chi", 
"\u0dad\u0dd4": "thu", "\u0db9\u0dd6": "bhu", "\u0dc0\u0dd4": "wu", "\u0da2\u0dd9": "je", "\u0dc0\u0dd9\u0dcf": "wo", "\u0d86\u0dcf": "aa", "\u0dc2\u0dd2": "shi", "\u0dae\u0dd2": "thi", "\u0da9": "da", "\u0db6\u0d82": "ban", "\u0dc3\u0dd1": "se", "\u0dad\u0dd9\u0dcf": "tho", "\u0db0\u0dd0": "dhe", "\u0dc6\u0dd3": "fee", "\u0dc5\u0dd0": "le", "\u0d89": "e", "\u0da0\u0dcf": "cha", "\u0dc6\u0dd9": "fe", "\u0daf\u0dd6": "du", "\u0d9b\u0dd3": "ki", "\u0dc4\u0dcf": "ha", "\u0d94\u0dd3": "o", "\u0dbb\u0dd2": "ri", "\u0dc4\u0ddb": "hai", "\u0d9b": "ka", "\u0db7": "bha", "\u0db7\u0dd9": "bhe", "\u0d91\u0dd3": "ee", "\u0db1\u0dd9": "ne", "\u0dc3": "sa", "\u0daf\u0dd2": "di", "\u0dab\u0dd9": "ne", "\u0c02": "n", "\u0db4": "pa", "\u0da0\u0dd1": "che", "\u0dc5\u0dcf": "la", "\u0db0\u0dd9\u0ddf": "dhau", "\u0db7\u0ddb": "bhi", "\u0dae\u0d83": "tha", "\u0d9f\u0dcf": "gha", "\u0db4\u0dd9": "pe", "\u0d9c\u0dd1": "ge", "\u0dc1\u0dd3": "shi", "\u0dc3\u0dd2": "si", "\u0db7\u0dd9\u0ddf": "bhau", "\u0d9e\u0dd3": "n", "\u0dbd\u0dd2": "li", "\u0d86": "a", "\u0dbb\u0df2": "ru", "\u0d85\u0dd9": "a", "\u0dad\u0dd1": "the", "\u0db6\u0ddb": "bai", "\u0dc4\u0dd9": "he", "\u0db6\u0dd9": "be", "\u0dba\u0ddb": "yai", "\u0dc0\u0ddb": "wai", "\u0db0\u0df2": "dru", "\u0dc4\u0dd3": "he", "\u0da5": "gha", "\u0dc2\u0dd9": "she", "\u0dc0\u0dd2": "vi", "\u0da2\u0dcf": "ja", "\u0da7\u0dd9": "te", "\u0dba\u0dd2": "i", "\u0da7": "ta", "\u0d9b\u0dcf": "ka", "\u0dac\u0dd4": "du", "\u0da2\u0dd2": "ji", "\u0dac": "da", "\u0dc2\u0dd1": "she", "\u0da1": "cha", "\u0db0\u0dd8": "dhru", "\u0da2\u0ddb": "jai", "\u0db8": "ma", "\u0dac\u0dd1": "de", "\u0dc2\u0ddb": "shai", "\u0da1\u0dd2": "chi", "\u0d9b\u0dd9": "ke", "\u0db3": "da", "\u0db6\u0dd9\u0dcf": "bo", "\u0dc4": "ha", "\u0da1\u0dd0": "je", "\u0da7\u0dd6": "tu", "\u0d9b\u0d82": "kan", "\u0d9c": "ga", "\u0db5\u0dd1": "pe", "\u0daa\u0dd2": "dhi", "\u0dc1\u0dd9\u0dcf": "sho", "\u03bf": "n", "\u0d89\u0dd0": "e", "\u0dc3\u0d83": "san", "\u0d9d": "gha", "\u0d9b\u0dd1": 
"ke", "\u0dc0\u0dd1": "we", "\u0d8a\u0dd9": "ee", "\u0db6\u0dd3": "bi", "\u0dc3\u0dd6": "su", "\u0da9\u0dd3": "di", "\u0dbb\u0dcf": "ra", "\u0dc1\u0d82": "shan", "\u0db7\u0dd9\u0dcf": "bho", "\u0daf\u0d82": "dan", "\u0da2\u0dd6": "ju", "\u0dba\u0dd8": "yur", "\u0d89\u0dd2": "e", "\u0dc0": "wa", "\u0db7\u0dcf": "bha", "\u0db7\u0dd2": "bhi", "\u0daa\u0dd4": "dhu", "\u0d94\u0dd8": "au", "\u0d9a\u0dd6": "ku", "\u0dbd\u0dd3": "lee", "\u0daf\u0dd9\u0dcf": "do", "\u0db6\u0df2": "bru", "\u0da7\u0dd2": "ti", "\u0dc4\u0dd6": "hu", "\u0db8\u0ddb": "mai", "\u0dc3\u0df2": "ru", "\u0db9\u0dd9": "bhe", "\u0d9c\u0dd9": "ge", "\u0dba\u0dd3": "yi", "\u0da8\u0dd2": "ti", "\u0db7\u0d82": "bhan", "\u0daf\u0dd4": "du", "\u0d9f\u0dd6": "ghu", "\u0da9\u0d82": "dan", "\u0da7\u0df2": "tru", "\u0dad\u0dd2": "thi", "\u0dad": "tha", "\u0dc4\u0dd9\u0ddf": "bhau", "\u0dc2\u0dd4": "shu", "\u0d89\u0dcf": "e", "\u0d9a": "ka", "\u0d85\u0dd2": "a", "\u0da7\u0ddb": "tai", "\u0dbd\u0dd1": "le", "\u0dac\u0dd9\u0dcf": "dho", "\u0d85\u0dd0": "e", "\u0d9a\u0dcf": "ka", "\u0db4\u0dcf": "pa", "\u0db1\u0dd1": "ne", "\u0db5\u0dd9\u0dcf": "po", "\u0da2\u0dd0": "je", "\u0da9\u0dd9\u0dcf": "do", "\u0dad\u0dd6": "thu", "\u0db3\u0dd2": "dhi", "\u0d92\u0d82": "en", "\u0d9d\u0dd9\u0dcf": "gho", "\u0d9c\u0dd8": "gru", "\u0da4\u0dd9": "ke", "\u0db7\u0dd1": "bhe", "\u0db5\u0dd3": "pi", "\u0dba\u0dcf": "ya", "\u0d9c\u0ddb": "gai", "\u0d9d\u0dd6": "ghu", "\u0db0\u0dcf": "dha", "\u0dbd\u0dd9": "le", "\u0d9d\u0dd8": "ru", "\u0dc6\u0dd8": "fru", "\u0dc0\u0dd8": "wru", "\u0da9\u0ddb": "dai", "\u0d8f": "pru", "\u0dac\u0dd6": "du", "\u0d85\u0dcf": "a", "\u0db7\u0dd3": "bhi", "\u0dc4\u0dd2": "hi", "\u0db6\u0dcf": "ba", "\u0dbb\u0dd3": "ri", "\u0d8d": "ru", "\u0dbb\u0dd0": "re", "\u0da1\u0dd3": "chi", "\u0da3\u0dd3": "jhi", "\u0da4": "gha", "\u0db1": "na", "\u0d91\u0dd2": "e", "\u0dc1\u0dd8": "shru", "\u0dab\u0dd0": "ne", "\u0d87\u0dd9": "e", "\u0dc1\u0dd1": "she", "\u0d87": "e", "\u0dc3\u0dd0": "se", "\u0dae\u0dd6": "thu", 
"\u0d9f\u0dd3": "gi", "\u0d95\u0dd8": "o", "\u0db5\u0dcf": "pa", "\u0db1\u0dd9\u0ddf": "nau", "\u0d8b\u0dd9": "u", "\u0dbd\u0dd9\u0ddf": "lau", "\u0dad\u0d82": "than", "\u0d9e\u0dd4": "du", "\u0dbb\u0ddb": "rai", "\u0da9\u0dd2": "di", "\u0d9c\u0dd4": "gu", "\u0da4\u0d82": "ghan", "\u0dc2\u0dcf": "sha", "\u0d85": "a", "\u0dc6\u0dd4": "fu", "\u0db0\u0dd9": "dhe", "\u0d9a\u0dd9\u0ddf": "kau", "\u0d85\u0d82": "an", "\u0dc5\u0dd6": "lu", "\u0dc5\u0dd8": "lu", "\u0da1\u0d82": "chan", "\u0db4\u0dd9\u0ddf": "pau", "\u0dc1\u0dd2": "shi", "\u0dc1\u0dd6": "shu", "\u0dad\u0dd8": "thru", "\u0da0\u0dd6": "chu", "\u0da0\u0d82": "chan", "\u0db0\u0d82": "dhan", "\u0dab\u0dcf": "na", "\u0db1\u0dd3": "ni", "\u0dac\u0dd0": "dhe", "\u0d9c\u0dcf": "ga", "\u0db3\u0dd3": "di", "\u0dc2\u0dd0": "she", "\u0d91": "e", "\u0d8b\u0dcf": "u", "\u0d89\u0dd9": "e", "\u0dbb\u0d82": "ran", "\u0dae\u0dd0": "the", "\u0db1\u0dd8": "nru", "\u0daa\u0dd9\u0dcf": "to", "\u0db5\u0d82": "pan", "\u0d93": "e", "\u0db6\u0dd4": "bu", "\u0da7\u0dd9\u0ddf": "tau", "\u0d9a\u0dd8": "kru", "\u0db8\u0d83": "man", "\u0dab\u0dd4": "nu", "\u0dc3\u0dd3": "si", "\u0db4\u0dd6": "pu", "\u0da8\u0dcf": "ta", "\u0da0\u0dd9": "che", "\u0db5\u0dd2": "phi", "\u0dc2\u0dd6": "shu", "\u0d9e\u0dd9\u0dcf": "do", "\u0db7\u0dd6": "bhu", "\u0dad\u0ddb": "thai", "\u0da2\u0d82": "jan", "\u0db9\u0dd0": "be", "\u0d94": "o", "\u0daf\u0df2": "dhru", "\u0da9\u0dd8": "dru", "\u0da4\u0dcf": "ghan", "\u0dbd\u0dd6": "lu", "\u0dc0\u0dd9": "we", "\u0d94\u0dcf": "o", "\u0d8b\u0d82": "un", "\u0db6": "ba", "\u0db1\u0dd2": "ni", "\u0d9d\u0dd3": "ghi", "\u0dbd\u0dcf": "la", "\u0db7\u0dd8": "bru", "\u0da3": "gha", "\u0dab\u0dd1": "ne", "\u0d88": "e", "\u0dc6\u0dd0": "fa", "\u0dc6": "fa", "\u0dad\u0dd0": "the", "\u0d92\u0dd2": "e", "\u0da7\u0dd0": "te", "\u0db1\u0dd4": "nu", "\u0dba\u0d82": "yan", "\u0db1\u0dd6": "nu", "\u0d9a\u0dd4": "ku", "\u0dba\u0dd1": "ye", "\u0db8\u0dd1": "me", "\u0dc5\u0d82": "lan", "\u0da0\u0dd9\u0dcf": "cho", "\u0da9\u0dd9\u0ddf": 
"dau", "\u0db4\u0ddb": "pai", "\u0da7\u0dcf": "ta", "\u0db3\u0dd4": "du", "\u0d9a\u0dd9\u0dcf": "ko", "\u0d9d\u0d82": "ghan", "\u0dba\u0dd9\u0ddf": "yau", "\u0d9e\u0dd2": "di", "\u0dc4\u0dd0": "he", "\u0db4\u0dd4": "pu", "\u0dc5\u0dd4": "lu", "\u0d9d\u0dd4": "gu", "\u0db1\u0dcf": "na", "\u0db4\u0df2": "pru", "\u0db3\u0dd1": "de", "\u0dc1\u0dd4": "shu", "\u0da3\u0dd9\u0dcf": "gha", "\u0da2\u0dd1": "je", "\u0da4\u0dd2": "di", "\u0da1\u0dcf": "cha", "\u0dc5\u0dd3": "li", "\u0da0\u0dd4": "chu", "\u0db0": "dha", "\u0d9f\u0d82": "ghan", "\u0db0\u0dd2": "dhi", "\u0d91\u0dcf": "e", "\u0d8b\u0dd3": "u", "\u0db9\u0dd2": "bhi", "\u0db8\u0dd3": "me", "\u0db5\u0dd0": "pe", "\u0d9d\u0dd2": "ghi", "\u0da4\u0dd6": "du", "\u0db0\u0dd6": "dhu", "\u0daf\u0dd8": "dru", "\u0dba\u0dd4": "yu", "\u0db9\u0dd4": "bu", "\u0dc1\u0dd9\u0ddf": "shau", "\u0db8\u0dd9": "me", "\u0da4\u0dd3": "di", "\u0daf": "da", "\u0d94\u0d82": "on", "\u0d94\u0dd2": "o", "\u0dc5\u0dd2": "li", "\u0da2\u0dd3": "ji", "\u0dc1\u0dcf": "sha", "\u0da8\u0dd9": "te", "\u0db4\u0dd8": "pru", "\u0d91\u0dd9": "e", "\u0dc1\u0d83": "shan", "\u0d90": "pau", "\u0db8\u0dd2": "mi", "\u0db0\u0dd4": "dhu", "\u0dc3\u0ddb": "sai", "\u0da2\u0dd9\u0ddf": "jau", "\u0db9\u0dd1": "be", "\u0d9b\u0dd2": "ki", "\u043e": "n", "\u0da3\u0d82": "jan", "\u0dab\u0dd3": "ni", "\u0dc4\u0dd4": "hu", "\u0d9c\u0dd6": "gu", "\u0da0": "cha", "\u0db5\u0dd9": "pe", "\u0dc0\u0df2": "wru", "\u0d94\u0dd9": "o", "\u0dae\u0d82": "than", "\u0db5\u0dd6": "pu", "\u0d8b\u0dd8": "u", "\u0db6\u0dd0": "be", "\u0dc3\u0dcf": "sa", "\u0db3\u0d82": "dan", "\u0da2\u0d83": "jah", "\u0da9\u0dd6": "du", "\u0dbb\u0dd1": "re", "\u0dc4\u0df2": "hru", "\u0d9a\u0dd3": "ki", "\u0da1\u0dd8": "chru", "\u0daf\u0dcf": "da", "\u0dc5\u0dd1": "le", "\u0da1\u0dd9\u0dcf": "cho", "\u0dad\u0df2": "thru", "\u0db4\u0dd9\u0dcf": "po", "\u0d9b\u0dd9\u0dcf": "ko", "\u0d9e\u0dd9": "de", "\u0db8\u0dd8": "mur", "\u0dc0\u0d83": "wah", "\u0d86\u0d83": "an", "\u0d9f\u0dd1": "ge", "\u0dc3\u0d82": "san", 
"\u0daf\u0dd0": "de", "\u0da8\u0dd4": "tu", "\u0dba\u0dd0": "ye", "\u0dbb\u0dd9\u0ddf": "rau", "\u0db9\u0dcf": "bha", "\u0db9\u0dd9\u0dcf": "bho", "\u0d9a\u0df2": "kru", "\u0dc0\u0d82": "wan", "\u0da0\u0dd0": "che", "\u0da2": "ja", "\u0dc4\u0dd1": "he", "\u0dc1\u0dd0": "sha", "\u0dc1\u0ddb": "shai", "\u0dae\u0dd8": "tru", "\u0dc0\u0dcf": "wa", "\u0dac\u0dd9": "de", "\u0db4\u0dd2": "pi", "\u0da0\u0d83": "chah", "\u0da3\u0dd2": "dhi", "\u0dad\u0dd3": "thi", "\u0db8\u0dd4": "mu", "\u0d94\u0dd4": "o", "\u0da2\u0dd4": "ju", "\u0d91\u0d82": "en", "\u0dac\u0dd3": "di", "\u0db8\u0dd0": "me", "\u0d92\u0dd9": "e", "\u0dc0\u0dd3": "we", "\u0dad\u0dcf": "tha", "\u0db1\u0ddb": "nai", "\u0db6\u0dd6": "bu", "\u0dae\u0dd4": "thu", "\u0dae\u0dd3": "thi", "\u0dc3\u0dd4": "su", "\u0dac\u0d82": "ghan", "\u0d89\u0d82": "en", "\u0da7\u0dca": "t", "\u0dc2\u0ddc": "sho", "\u0dab\u0dda": "no", "\u0dab\u0dca": "n", "\u0dbd\u0dde": "lau", "\u0da3\u0dca": "j", "\u0dc3\u0dde": "sau", "\u0dba\u0dde": "yau", "\u0d9a\u0dca": "k", "\u0db5\u0ddd": "po", "\u0db6\u0ddc": "bo", "\u0dbb\u0dde": "rau", "\u0dc1\u0ddd": "sho", "\u0dc5\u0ddd": "lo", "\u0d9b\u0dda": "ke", "\u0dc0\u0ddd": "wo", "\u0daa\u0dca": "d", "\u0db9\u0ddd": "bho", "\u0db3\u0ddc": "dho", "\u0db3\u0ddd": "dho", "\u0dc6\u0ddd": "fho", "\u0da2\u0dda": "je", "\u0dbb\u0dca": "r", "\u0dc0\u0ddc": "wo", "\u0da9\u0dca": "d", "\u0dba\u0ddc": "yo", "\u0db0\u0dda": "dhe", "\u0da0\u0dda": "che", "\u0da1\u0dda": "che", "\u0db7\u0ddc": "bho", "\u0dac\u0dda": "de", "\u0db7\u0dda": "bhe", "\u0da9\u0ddc": "do", "\u0dbb\u0ddc": "ro", "\u0dc1\u0dda": "she", "\u0d91\u0dca": "e", "\u0d9c\u0dde": "gau", "\u0da9\u0dde": "dau", "\u0da4\u0ddc": "gho", "\u0daf\u0dde": "dhou", "\u0db3\u0dca": "d", "\u0da3\u0ddd": "do", "\u0dac\u0dca": "d", "\u0db5\u0ddc": "po", "\u0dba\u0ddd": "yo", "\u0dc1\u0dca": "sh", "\u0db0\u0dca": "dh", "\u0dbb\u0dda": "re", "\u0d9f\u0dca": "g", "\u0dae\u0ddc": "tho", "\u0dae\u0dda": "the", "\u0da4\u0dca": "ghe", "\u0da8\u0dca": "t", 
"\u0dc4\u0dca": "h", "\u0d9d\u0ddc": "gho", "\u0da0\u0ddc": "cho", "\u0dc4\u0dda": "he", "\u0dab\u0ddc": "no", "\u0dc5\u0ddc": "lo", "\u0dc0\u0dda": "we", "\u0dad\u0ddd": "tho", "\u0da2\u0ddd": "jo", "\u0da9\u0ddd": "do", "\u0db9\u0dda": "be", "\u0dc5\u0dca": "l", "\u0dae\u0dca": "th", "\u0db1\u0dda": "ne", "\u0db8\u0ddc": "mo", "\u0d9d\u0dda": "ghe", "\u0dc3\u0dda": "se", "\u0db4\u0dca": "p", "\u0d87\u0dca": "e", "\u0d9a\u0dde": "kau", "\u0d9e\u0ddc": "do", "\u0db8\u0dda": "me", "\u0d9b\u0ddc": "ko", "\u0dc4\u0ddc": "ho", "\u0dba\u0dca": "y", "\u0db1\u0dca": "n", "\u0dc1\u0ddc": "sho", "\u0da4\u0ddd": "gho", "\u0dc3\u0dca": "s", "\u0da1\u0ddc": "cho", "\u0daf\u0dca": "d", "\u0db6\u0dca": "b", "\u0d9d\u0ddd": "gho", "\u0db5\u0dca": "e", "\u0d9a\u0ddc": "ko", "\u0dc0\u0dde": "wau", "\u0da8\u0ddc": "to", "\u0dae\u0ddd": "tho", "\u0da8\u0ddd": "to", "\u0dc3\u0ddd": "so", "\u0db6\u0ddd": "bo", "\u0d92\u0dca": "e", "\u0db7\u0dde": "bhau", "\u0db9\u0ddc": "bho", "\u0da7\u0dda": "te", "\u0da0\u0ddd": "cho", "\u0da9\u0dda": "de", "\u0da1\u0ddd": "cho", "\u0d9f\u0ddc": "go", "\u0da0\u0dca": "ch", "\u0db1\u0ddd": "no", "\u0da2\u0dca": "j", "\u0db0\u0ddc": "dho", "\u0db4\u0dda": "je", "\u0dbb\u0ddd": "ro", "\u0dbd\u0dca": "l", "\u0db1\u0ddc": "no", "\u0d94\u0dca": "o", "\u0dc6\u0dca": "f", "\u0dc2\u0dca": "sh", "\u0d89\u0dca": "e", "\u0dad\u0ddc": "tho", "\u0dad\u0dda": "the", "\u0dad\u0dca": "th", "\u0db3\u0dda": "dhe", "\u0dc3\u0ddc": "so", "\u0db8\u0dca": "m", "\u0daa\u0ddc": "to", "\u0d9f\u0ddd": "go", "\u0daa\u0ddd": "to", "\u0d9a\u0dda": "ke", "\u0db4\u0ddc": "po", "\u0dac\u0ddd": "do", "\u0da2\u0dde": "jau", "\u0dba\u0dda": "ye", "\u0dc6\u0ddc": "fo", "\u0db9\u0dde": "bhau", "\u0d9e\u0dca": "n", "\u0db4\u0dde": "pau", "\u0d8a\u0dca": "e", "\u0dc5\u0dda": "le", "\u0db9\u0dca": "b", "\u0db6\u0dde": "bau", "\u0db8\u0ddd": "mo", "\u0d9c\u0dca": "g", "\u0dc2\u0dda": "she", "\u0d9c\u0dda": "ge", "\u0db4\u0ddd": "po", "\u0da1\u0dca": "ch", "\u0d9b\u0dca": "k", "\u0dbd\u0ddd": 
"lo", "\u0d85\u0dda": "a", "\u0d9e\u0ddd": "do", "\u0d9c\u0ddc": "go", "\u0da8\u0dda": "dhe", "\u0d9e\u0dda": "n", "\u0dc1\u0dde": "shau", "\u0dc6\u0dda": "fe", "\u0db1\u0dde": "nau", "\u0da2\u0ddc": "jo", "\u0d9c\u0ddd": "go", "\u0db8\u0dde": "mau", "\u0da4\u0dda": "g", "\u0dbd\u0dda": "le", "\u0da7\u0ddc": "to", "\u0daf\u0ddd": "dho", "\u0d85\u0dca": "a", "\u0dab\u0ddd": "no", "\u0db7\u0dca": "b", "\u0dbd\u0ddc": "lo", "\u0daf\u0ddc": "do", "\u0dc4\u0dde": "bhau", "\u0da7\u0dde": "tau", "\u0d9d\u0dca": "g", "\u0d9b\u0ddd": "ko", "\u0db0\u0dde": "dhau", "\u0db7\u0ddd": "bho", "\u0db0\u0ddd": "dho", "\u0dc2\u0ddd": "sho", "\u0d9a\u0ddd": "ko", "\u0daf\u0dda": "de", "\u0dc4\u0ddd": "ho", "\u0db6\u0dda": "be", "\u0da7\u0ddd": "to", "\u0da0\u0dde": "chau", "\u0d9f\u0dda": "ge", "\u0dc0\u0dca": "w"}
\ No newline at end of file
diff --git a/data/vocab_map.json b/data/vocab_map.json
new file mode 100644
index 0000000..7410d4f
--- /dev/null
+++ b/data/vocab_map.json
@@ -0,0 +1,2276 @@
+{
+ "»": 0,
+ "መ": 1,
+ "💻": 2,
+ "ˈ": 3,
+ "🙊": 4,
+ "D": 5,
+ "ծ": 6,
+ "නේ": 7,
+ "😐": 8,
+ "ධෑ": 9,
+ "연": 10,
+ "አ": 11,
+ "යඃ": 12,
+ "ู": 13,
+ "ඔ්": 14,
+ "ථ්": 15,
+ "ඇ": 16,
+ "専": 17,
+ "ஆ": 18,
+ "ʒ": 19,
+ "ඝී": 20,
+ "پ": 21,
+ "శ": 22,
+ "Ã": 23,
+ "📚": 24,
+ "부": 25,
+ "ሊ": 26,
+ "배": 27,
+ "බි": 28,
+ "?": 29,
+ "³": 30,
+ "官": 31,
+ "ඨ්": 32,
+ "🐼": 33,
+ "ج": 34,
+ "ඳේ": 35,
+ "ඟෑ": 36,
+ "∶": 37,
+ "ඩි": 38,
+ "නී": 39,
+ "≡": 40,
+ "දු": 41,
+ "මි": 42,
+ "නෝ": 43,
+ "ව්": 44,
+ "↑": 45,
+ "ए": 46,
+ "स": 47,
+ "බෞ": 48,
+ "ℜ": 49,
+ "出": 50,
+ "ජ්": 51,
+ "😖": 52,
+ "🌻": 53,
+ "ගි": 54,
+ "ด": 55,
+ "ሰ": 56,
+ "එැ": 57,
+ "ង": 58,
+ "ලු": 59,
+ "වූ": 60,
+ "ʰ": 61,
+ "⇪": 62,
+ "ő": 63,
+ "ඤු": 64,
+ "N": 65,
+ "⇝": 66,
+ "ෂී": 67,
+ "දං": 68,
+ "저": 69,
+ "ඝි": 70,
+ "۷": 71,
+ "ජැ": 72,
+ "ੱ": 73,
+ "^": 74,
+ "ነ": 75,
+ "ã": 76,
+ "චෙ": 77,
+ "♡": 78,
+ "の": 79,
+ "Ş": 80,
+ "සේ": 81,
+ "ਾ": 82,
+ "௨": 83,
+ "📅": 84,
+ "영": 85,
+ "ክ": 86,
+ "ආෙ": 87,
+ "ඍැ": 88,
+ "ඬෑ": 89,
+ "දෙ": 90,
+ "በ": 91,
+ "語": 92,
+ "̴": 93,
+ "🇸": 94,
+ "ฉ": 95,
+ "පෘ": 96,
+ "😳": 97,
+ "ශ": 98,
+ "➚": 99,
+ "ක": 100,
+ "ब": 101,
+ "☹": 102,
+ "නෑ": 103,
+ "භේ": 104,
+ "北": 105,
+ "有": 106,
+ "ك": 107,
+ "吏": 108,
+ "බූ": 109,
+ "👇": 110,
+ "ෂො": 111,
+ "😠": 112,
+ "ع": 113,
+ "י": 114,
+ "හෲ": 115,
+ "ḍ": 116,
+ "ඞේ": 117,
+ "紀": 118,
+ "豪": 119,
+ "නෛ": 120,
+ "Ε": 121,
+ "හෞ": 122,
+ "╗": 123,
+ "යෛ": 124,
+ "සෞ": 125,
+ "这": 126,
+ "ති": 127,
+ "ㄸ": 128,
+ "즈": 129,
+ "♐": 130,
+ "කං": 131,
+ "ළෑ": 132,
+ "̯": 133,
+ "ඔැ": 134,
+ "通": 135,
+ "ක්": 136,
+ "ඝු": 137,
+ "ඞි": 138,
+ "局": 139,
+ "ඬේ": 140,
+ "❉": 141,
+ "신": 142,
+ "ඛු": 143,
+ "ඞී": 144,
+ "₹": 145,
+ "・": 146,
+ "§": 147,
+ "": 148,
+ "−": 149,
+ "α": 150,
+ "☀": 151,
+ "➨": 152,
+ "ඡු": 153,
+ "♬": 154,
+ "ﻲ": 155,
+ "}": 156,
+ "ඵෛ": 157,
+ "පූ": 158,
+ "·": 159,
+ "චු": 160,
+ "④": 161,
+ "වේ": 162,
+ "Æ": 163,
+ "ਹ": 164,
+ "Š": 165,
+ "医": 166,
+ "็": 167,
+ "නැ": 168,
+ "ණ": 169,
+ "려": 170,
+ "钱": 171,
+ "{": 172,
+ "呼": 173,
+ "": 174,
+ "p": 175,
+ "": 176,
+ "ඝෘ": 177,
+ "ණැ": 178,
+ "鬲": 179,
+ "’": 180,
+ "ው": 181,
+ "ς": 182,
+ "х": 183,
+ "ඉෙ": 184,
+ "🏽": 185,
+ "ʊ": 186,
+ "ඩු": 187,
+ "🌹": 188,
+ "⋆": 189,
+ "丂": 190,
+ "్": 191,
+ "โ": 192,
+ "๑": 193,
+ "樂": 194,
+ "ካ": 195,
+ "😱": 196,
+ "දූ": 197,
+ "满": 198,
+ "ධු": 199,
+ "⏰": 200,
+ "ආ්": 201,
+ "ටු": 202,
+ "💡": 203,
+ "ලඃ": 204,
+ "ぎ": 205,
+ "Г": 206,
+ "병": 207,
+ "🤣": 208,
+ "ど": 209,
+ "Ś": 210,
+ "就": 211,
+ "ร": 212,
+ "笑": 213,
+ "體": 214,
+ "ශඃ": 215,
+ "¨": 216,
+ "ン": 217,
+ "印": 218,
+ "": 219,
+ "උැ": 220,
+ "➠": 221,
+ "හෛ": 222,
+ "ගෘ": 223,
+ "杯": 224,
+ "බෛ": 225,
+ "٦": 226,
+ "🎸": 227,
+ "ா": 228,
+ "ô": 229,
+ "ḱ": 230,
+ "යං": 231,
+ "වෙ": 232,
+ "据": 233,
+ "කේ": 234,
+ "ඨි": 235,
+ "ඛි": 236,
+ "も": 237,
+ "ມ": 238,
+ "ょ": 239,
+ "ო": 240,
+ "ங": 241,
+ "ඳෑ": 242,
+ "野": 243,
+ "ඩේ": 244,
+ "ಕ": 245,
+ "ɐ": 246,
+ "호": 247,
+ "국": 248,
+ "ඹා": 249,
+ "ගී": 250,
+ "Œ": 251,
+ "株": 252,
+ "사": 253,
+ "w": 254,
+ "පං": 255,
+ "ණූ": 256,
+ "වු": 257,
+ "院": 258,
+ "ඹ්": 259,
+ "ළා": 260,
+ "o": 261,
+ "සු": 262,
+ "චේ": 263,
+ "ყ": 264,
+ "☜": 265,
+ "↳": 266,
+ "🍕": 267,
+ "у": 268,
+ "හී": 269,
+ "な": 270,
+ "õ": 271,
+ "ና": 272,
+ "අා": 273,
+ "ඐ": 274,
+ "අු": 275,
+ "O": 276,
+ "ඤි": 277,
+ "ගෛ": 278,
+ "උෑ": 279,
+ "진": 280,
+ "එා": 281,
+ "": 282,
+ "。": 283,
+ "පේ": 284,
+ "ඬැ": 285,
+ "ෂෞ": 286,
+ "व": 287,
+ "ో": 288,
+ "කෑ": 289,
+ "َ": 290,
+ "Ⅲ": 291,
+ "ජෑ": 292,
+ "චෞ": 293,
+ "ባ": 294,
+ "🙈": 295,
+ "උෘ": 296,
+ "☻": 297,
+ "නු": 298,
+ "(": 299,
+ "琮": 300,
+ "˙": 301,
+ "日": 302,
+ "පා": 303,
+ "…": 304,
+ "භ": 305,
+ "": 306,
+ "ï": 307,
+ "Λ": 308,
+ "ඒි": 309,
+ "": 310,
+ "開": 311,
+ "い": 312,
+ "ል": 313,
+ "ඇ්": 314,
+ "": 315,
+ "당": 316,
+ "/": 317,
+ "ल": 318,
+ "할": 319,
+ "□": 320,
+ "": 321,
+ "서": 322,
+ "س": 323,
+ "ඬි": 324,
+ "🍶": 325,
+ "ඔෘ": 326,
+ "ඳෝ": 327,
+ "]": 328,
+ "සං": 329,
+ "Ú": 330,
+ "ស": 331,
+ "Κ": 332,
+ "": 333,
+ "යෝ": 334,
+ "🔴": 335,
+ "ት": 336,
+ "▪": 337,
+ "ලෘ": 338,
+ "ɪ": 339,
+ "魏": 340,
+ "රෑ": 341,
+ "🔥": 342,
+ "ඩෑ": 343,
+ "⦁": 344,
+ "ළෝ": 345,
+ "事": 346,
+ "한": 347,
+ "Ф": 348,
+ "体": 349,
+ "": 350,
+ "✺": 351,
+ "පී": 352,
+ "ඝ්": 353,
+ "ඬෝ": 354,
+ "ටෘ": 355,
+ "芸": 356,
+ "ඨෝ": 357,
+ "යෑ": 358,
+ "ඤී": 359,
+ "Ø": 360,
+ "⚑": 361,
+ "මී": 362,
+ "ª": 363,
+ "В": 364,
+ "የ": 365,
+ "ලෙ": 366,
+ "📱": 367,
+ "学": 368,
+ "ඪෘ": 369,
+ "ិ": 370,
+ "克": 371,
+ "ඣ්": 372,
+ "ī": 373,
+ "ṃ": 374,
+ "🏦": 375,
+ "주": 376,
+ "ŗ": 377,
+ "个": 378,
+ "_": 379,
+ "Ӂ": 380,
+ "葉": 381,
+ "ビ": 382,
+ "": 383,
+ "ගේ": 384,
+ "නෞ": 385,
+ "軍": 386,
+ "ْ": 387,
+ "₂": 388,
+ "ගෲ": 389,
+ "💧": 390,
+ "셔": 391,
+ "歌": 392,
+ "u": 393,
+ "ටා": 394,
+ "": 395,
+ "ඟූ": 396,
+ "ෂැ": 397,
+ "ඳෙ": 398,
+ "": 399,
+ "ජෝ": 400,
+ "නඃ": 401,
+ "ゅ": 402,
+ "❖": 403,
+ "ඛෙ": 404,
+ "ɡ": 405,
+ "#": 406,
+ "თ": 407,
+ "Ž": 408,
+ "察": 409,
+ "ਮ": 410,
+ "公": 411,
+ "र": 412,
+ "ɨ": 413,
+ "ණා": 414,
+ "ɕ": 415,
+ "ʌ": 416,
+ "☢": 417,
+ "ෆෛ": 418,
+ "": 419,
+ "ெ": 420,
+ "ෆෲ": 421,
+ "ඊ්": 422,
+ "අෙ": 423,
+ "ṭ": 424,
+ "—": 425,
+ "🌍": 426,
+ "ම්": 427,
+ "ण": 428,
+ "鑑": 429,
+ "잘": 430,
+ "ඝං": 431,
+ "I": 432,
+ "ජෘ": 433,
+ "ය": 434,
+ "Ñ": 435,
+ "Ʌ": 436,
+ "ă": 437,
+ "ඹෑ": 438,
+ "ㅂ": 439,
+ "ඔෑ": 440,
+ "යූ": 441,
+ "料": 442,
+ "ඔූ": 443,
+ "చ": 444,
+ "◙": 445,
+ "ථෝ": 446,
+ "ජු": 447,
+ "์": 448,
+ "ň": 449,
+ "ไ": 450,
+ "使": 451,
+ "지": 452,
+ "බ්": 453,
+ "😕": 454,
+ "Т": 455,
+ "භ්": 456,
+ "ලැ": 457,
+ "ه": 458,
+ "ታ": 459,
+ "පෛ": 460,
+ "▌": 461,
+ "٩": 462,
+ "η": 463,
+ "格": 464,
+ "ශි": 465,
+ "記": 466,
+ "ටෑ": 467,
+ "8": 468,
+ "ෆ": 469,
+ "දෘ": 470,
+ "ඇු": 471,
+ "ඡී": 472,
+ "ģ": 473,
+ " ": 474,
+ "◘": 475,
+ "◣": 476,
+ "ලේ": 477,
+ "ඤඃ": 478,
+ " ": 479,
+ "Ħ": 480,
+ "බ": 481,
+ "ඤේ": 482,
+ "ඩෞ": 483,
+ "デ": 484,
+ "‟": 485,
+ "ළ්": 486,
+ "ी": 487,
+ "ඩ": 488,
+ "z": 489,
+ "ඵැ": 490,
+ "පි": 491,
+ "治": 492,
+ "ෂේ": 493,
+ "조": 494,
+ "ෳ": 495,
+ "ගං": 496,
+ "ć": 497,
+ "චඃ": 498,
+ "Å": 499,
+ "送": 500,
+ "ත": 501,
+ "‚": 502,
+ "තෘ": 503,
+ "😒": 504,
+ "و": 505,
+ "讀": 506,
+ "字": 507,
+ "テ": 508,
+ "¼": 509,
+ "උං": 510,
+ "ح": 511,
+ "Þ": 512,
+ "詰": 513,
+ "ద": 514,
+ "📑": 515,
+ "෦": 516,
+ "ሌ": 517,
+ "π": 518,
+ "චැ": 519,
+ "ඳ්": 520,
+ "ඤො": 521,
+ "ඓ": 522,
+ "두": 523,
+ "フ": 524,
+ "💀": 525,
+ "ሁ": 526,
+ "ෆැ": 527,
+ "生": 528,
+ "라": 529,
+ "門": 530,
+ "ñ": 531,
+ "ੀ": 532,
+ "ඵ": 533,
+ "ม": 534,
+ "ෆේ": 535,
+ "": 536,
+ "🌸": 537,
+ "🌷": 538,
+ "වා": 539,
+ "н": 540,
+ "රේ": 541,
+ "ෆෑ": 542,
+ "💁": 543,
+ "J": 544,
+ "ਿ": 545,
+ "සෘ": 546,
+ "ඬො": 547,
+ "ගැ": 548,
+ "ෆ්": 549,
+ "ච්": 550,
+ "ඳී": 551,
+ "ච": 552,
+ "ඦ": 553,
+ "": 554,
+ "त": 555,
+ "ඡි": 556,
+ "ඟ්": 557,
+ "と": 558,
+ "장": 559,
+ "": 560,
+ "ඡෙ": 561,
+ "න්": 562,
+ "ඤං": 563,
+ "親": 564,
+ "Û": 565,
+ "ථො": 566,
+ "ෟ": 567,
+ "ł": 568,
+ "": 569,
+ "〒": 570,
+ "අ": 571,
+ "ළී": 572,
+ "社": 573,
+ "ඟේ": 574,
+ "▻": 575,
+ "♥": 576,
+ "し": 577,
+ "ム": 578,
+ "📞": 579,
+ "っ": 580,
+ "º": 581,
+ "": 582,
+ "ඛං": 583,
+ "i": 584,
+ "ඨෘ": 585,
+ "m": 586,
+ "තූ": 587,
+ "乃": 588,
+ "ส": 589,
+ "ቅ": 590,
+ "උෙ": 591,
+ "在": 592,
+ "ા": 593,
+ "": 594,
+ "🐱": 595,
+ "ඩෲ": 596,
+ "රැ": 597,
+ "ටෛ": 598,
+ "博": 599,
+ "️": 600,
+ "ហ": 601,
+ "වෞ": 602,
+ "ρ": 603,
+ "ළ": 604,
+ "ටෙ": 605,
+ "😍": 606,
+ "入": 607,
+ "†": 608,
+ "∘": 609,
+ "보": 610,
+ "ೀ": 611,
+ "ඵ්": 612,
+ "😓": 613,
+ "文": 614,
+ "投": 615,
+ "": 616,
+ "ඵී": 617,
+ "資": 618,
+ "භි": 619,
+ "↕": 620,
+ "A": 621,
+ "⚓": 622,
+ "න": 623,
+ "ද": 624,
+ "ඹ": 625,
+ "ㅉ": 626,
+ "ዳ": 627,
+ "ඍ": 628,
+ "่": 629,
+ "\u0003": 630,
+ "ù": 631,
+ "ලං": 632,
+ "😏": 633,
+ "ు": 634,
+ "➡": 635,
+ "۞": 636,
+ "א": 637,
+ "兀": 638,
+ "ඨං": 639,
+ "ਨ": 640,
+ "й": 641,
+ "เ": 642,
+ "ඝූ": 643,
+ "決": 644,
+ "晉": 645,
+ "🚀": 646,
+ "": 647,
+ "කෝ": 648,
+ "까": 649,
+ "ி": 650,
+ "රූ": 651,
+ "ಳ": 652,
+ "ඒ්": 653,
+ "ඞෝ": 654,
+ "ථු": 655,
+ "つ": 656,
+ "販": 657,
+ "ළූ": 658,
+ "ශා": 659,
+ "ඬා": 660,
+ "옹": 661,
+ "ð": 662,
+ "ூ": 663,
+ "ગ": 664,
+ "诶": 665,
+ "💕": 666,
+ "場": 667,
+ "ථඃ": 668,
+ "ی": 669,
+ "👨": 670,
+ "": 671,
+ "👌": 672,
+ "ඹැ": 673,
+ "ජ": 674,
+ ",": 675,
+ "F": 676,
+ "Ç": 677,
+ "ó": 678,
+ "": 679,
+ "Ô": 680,
+ "😬": 681,
+ "කු": 682,
+ "б": 683,
+ "ඹේ": 684,
+ "ν": 685,
+ "¾": 686,
+ "🙁": 687,
+ "ග්": 688,
+ "ප්": 689,
+ "Ā": 690,
+ "閉": 691,
+ "ඉු": 692,
+ "డ": 693,
+ "🏆": 694,
+ "සූ": 695,
+ "ඊ": 696,
+ "비": 697,
+ "∙": 698,
+ "ඣං": 699,
+ "ọ": 700,
+ "ṉ": 701,
+ "す": 702,
+ "රෘ": 703,
+ "札": 704,
+ "දෛ": 705,
+ "🇰": 706,
+ "※": 707,
+ "පෝ": 708,
+ "繁": 709,
+ "K": 710,
+ "☸": 711,
+ "ඩී": 712,
+ "ජෞ": 713,
+ "ー": 714,
+ "河": 715,
+ "\\": 716,
+ "ב": 717,
+ "මෑ": 718,
+ "ටො": 719,
+ "ǐ": 720,
+ "ඞො": 721,
+ "↯": 722,
+ "ష": 723,
+ "ஞ": 724,
+ "⇔": 725,
+ "ඩූ": 726,
+ "ඟො": 727,
+ "✿": 728,
+ "": 729,
+ "м": 730,
+ "❶": 731,
+ "ඳැ": 732,
+ "ښ": 733,
+ "✉": 734,
+ "ඕෘ": 735,
+ "භී": 736,
+ "ඔි": 737,
+ "哪": 738,
+ "ග": 739,
+ "🏻": 740,
+ "ε": 741,
+ "ழ": 742,
+ "符": 743,
+ "": 744,
+ "නා": 745,
+ "ǣ": 746,
+ "ණෝ": 747,
+ "ロ": 748,
+ "ಜ": 749,
+ "": 750,
+ "À": 751,
+ "와": 752,
+ "ඕ": 753,
+ "♣": 754,
+ "බෘ": 755,
+ "🙃": 756,
+ "ෂං": 757,
+ "ඵෝ": 758,
+ "に": 759,
+ "9": 760,
+ "漢": 761,
+ "කෘ": 762,
+ "📖": 763,
+ "ɳ": 764,
+ "😭": 765,
+ "ɛ": 766,
+ "Ξ": 767,
+ "ධං": 768,
+ "😢": 769,
+ "😯": 770,
+ "දො": 771,
+ "ச": 772,
+ "සි": 773,
+ "ඡ": 774,
+ "ඇැ": 775,
+ "ඊෙ": 776,
+ "🌲": 777,
+ "ラ": 778,
+ "‡": 779,
+ "ண": 780,
+ "«": 781,
+ "М": 782,
+ "ች": 783,
+ "ශැ": 784,
+ "ɖ": 785,
+ "හා": 786,
+ "贝": 787,
+ "ගූ": 788,
+ "😞": 789,
+ "6": 790,
+ "ლ": 791,
+ "İ": 792,
+ "ㅆ": 793,
+ "ُ": 794,
+ "🔷": 795,
+ "ණෑ": 796,
+ "😗": 797,
+ "ત": 798,
+ "⇨": 799,
+ "ಇ": 800,
+ "✍": 801,
+ "ට": 802,
+ "曹": 803,
+ " ": 804,
+ "ධො": 805,
+ "කෙ": 806,
+ "ෆෘ": 807,
+ "ඞ": 808,
+ "": 809,
+ ")": 810,
+ "舊": 811,
+ "@": 812,
+ "ඪූ": 813,
+ "수": 814,
+ "🔘": 815,
+ "ದ": 816,
+ "ළේ": 817,
+ "ś": 818,
+ "간": 819,
+ "豆": 820,
+ "ፍ": 821,
+ "": 822,
+ "θ": 823,
+ "සො": 824,
+ "め": 825,
+ "ඩා": 826,
+ "ː": 827,
+ "く": 828,
+ "k": 829,
+ "අ්": 830,
+ "තං": 831,
+ "◼": 832,
+ "”": 833,
+ "සෙ": 834,
+ "節": 835,
+ "ඪු": 836,
+ "담": 837,
+ "±": 838,
+ "🕒": 839,
+ "房": 840,
+ "ගො": 841,
+ "ඔී": 842,
+ "௧": 843,
+ "": 844,
+ "😅": 845,
+ "協": 846,
+ "к": 847,
+ "ධී": 848,
+ "ණේ": 849,
+ "ट": 850,
+ "භෘ": 851,
+ "麗": 852,
+ "ලා": 853,
+ "⋅": 854,
+ "ඳු": 855,
+ "දැ": 856,
+ "₰": 857,
+ "යෙ": 858,
+ "خ": 859,
+ "😂": 860,
+ "ŭ": 861,
+ "경": 862,
+ "ټ": 863,
+ "ජී": 864,
+ "育": 865,
+ "ﺎ": 866,
+ "ẻ": 867,
+ "ඵෘ": 868,
+ "යෘ": 869,
+ "🛌": 870,
+ "ậ": 871,
+ "平": 872,
+ "ீ": 873,
+ "Е": 874,
+ "ල": 875,
+ "経": 876,
+ "ш": 877,
+ "g": 878,
+ "Ŵ": 879,
+ "😘": 880,
+ "ਗ": 881,
+ "향": 882,
+ "県": 883,
+ "ㅇ": 884,
+ "울": 885,
+ "안": 886,
+ "උු": 887,
+ "ඡ්": 888,
+ ":": 889,
+ "♀": 890,
+ "ළො": 891,
+ "園": 892,
+ "´": 893,
+ "甫": 894,
+ "ඛෝ": 895,
+ "ඳ": 896,
+ "인": 897,
+ "තෲ": 898,
+ "ඬු": 899,
+ "💓": 900,
+ "\u0014": 901,
+ "ඹි": 902,
+ "鄕": 903,
+ "ඩෘ": 904,
+ "高": 905,
+ "̈": 906,
+ "て": 907,
+ "ඤ්": 908,
+ "ර": 909,
+ "部": 910,
+ "ʼ": 911,
+ "😎": 912,
+ "ඝෝ": 913,
+ "~": 914,
+ "ඹෙ": 915,
+ "👍": 916,
+ "✅": 917,
+ "ඤැ": 918,
+ "¹": 919,
+ "Ƹ": 920,
+ "ඹෞ": 921,
+ "圓": 922,
+ "μ": 923,
+ "ඵි": 924,
+ "Đ": 925,
+ "💗": 926,
+ "च": 927,
+ "ඡෘ": 928,
+ "¢": 929,
+ "ö": 930,
+ "▃": 931,
+ "🐌": 932,
+ "西": 933,
+ "─": 934,
+ "ட": 935,
+ "ජේ": 936,
+ "": 937,
+ "👎": 938,
+ "": 939,
+ "ㅈ": 940,
+ "≤": 941,
+ " ": 942,
+ "商": 943,
+ "∕": 944,
+ "╔": 945,
+ "ඇං": 946,
+ "릴": 947,
+ "℘": 948,
+ "ණු": 949,
+ "0": 950,
+ "මෝ": 951,
+ "🌺": 952,
+ "වො": 953,
+ "✓": 954,
+ "ይ": 955,
+ "寶": 956,
+ "කෛ": 957,
+ "知": 958,
+ "警": 959,
+ "": 960,
+ "்": 961,
+ "시": 962,
+ "&": 963,
+ "භෞ": 964,
+ "ගෙ": 965,
+ "ה": 966,
+ "💞": 967,
+ "运": 968,
+ "": 969,
+ "සා": 970,
+ " ": 971,
+ "✊": 972,
+ "♜": 973,
+ "】": 974,
+ "ඤ": 975,
+ "✪": 976,
+ "प": 977,
+ "ඛා": 978,
+ "": 979,
+ "ந": 980,
+ "ን": 981,
+ "": 982,
+ "ँ": 983,
+ "භෝ": 984,
+ "Â": 985,
+ "ඉ": 986,
+ "උ්": 987,
+ "т": 988,
+ "වි": 989,
+ "තෝ": 990,
+ "f": 991,
+ "ก": 992,
+ "ط": 993,
+ "٠": 994,
+ "ඛඃ": 995,
+ "舞": 996,
+ "ㅣ": 997,
+ "÷": 998,
+ "ඈ": 999,
+ "ඞූ": 1000,
+ "定": 1001,
+ "ධෙ": 1002,
+ "ෆෙ": 1003,
+ "යැ": 1004,
+ "ඹෝ": 1005,
+ "年": 1006,
+ "නි": 1007,
+ "ı": 1008,
+ "අං": 1009,
+ "": 1010,
+ "ย": 1011,
+ "à": 1012,
+ "З": 1013,
+ "ि": 1014,
+ "G": 1015,
+ "ඨැ": 1016,
+ "店": 1017,
+ "🔆": 1018,
+ "ඞු": 1019,
+ "♭": 1020,
+ "අඃ": 1021,
+ "ටේ": 1022,
+ "ෆූ": 1023,
+ ";": 1024,
+ "พ": 1025,
+ "ළි": 1026,
+ "ʟ": 1027,
+ "う": 1028,
+ "ა": 1029,
+ "天": 1030,
+ "உ": 1031,
+ "英": 1032,
+ "ò": 1033,
+ "හැ": 1034,
+ "": 1035,
+ "📝": 1036,
+ "රෛ": 1037,
+ "වෑ": 1038,
+ "ජං": 1039,
+ "ඔ": 1040,
+ "🚈": 1041,
+ "ㅃ": 1042,
+ "λ": 1043,
+ "🌞": 1044,
+ "✚": 1045,
+ "Q": 1046,
+ "හේ": 1047,
+ "ඏ": 1048,
+ "\"": 1049,
+ "ඔං": 1050,
+ "₦": 1051,
+ "ˌ": 1052,
+ "¥": 1053,
+ "网": 1054,
+ "☞": 1055,
+ "່": 1056,
+ "ඝ": 1057,
+ "▒": 1058,
+ "陽": 1059,
+ "ඈං": 1060,
+ "蘭": 1061,
+ "ು": 1062,
+ "⁄": 1063,
+ "නෘ": 1064,
+ "ඡේ": 1065,
+ "ㄲ": 1066,
+ "ඬී": 1067,
+ "චෑ": 1068,
+ "ඟෝ": 1069,
+ "პ": 1070,
+ ".": 1071,
+ "රෲ": 1072,
+ "ධා": 1073,
+ "ළැ": 1074,
+ "í": 1075,
+ "මූ": 1076,
+ "ථි": 1077,
+ "ό": 1078,
+ "동": 1079,
+ "ά": 1080,
+ "用": 1081,
+ "": 1082,
+ "V": 1083,
+ "鼎": 1084,
+ "💖": 1085,
+ "女": 1086,
+ "ा": 1087,
+ "ප": 1088,
+ "ஜ": 1089,
+ "ශූ": 1090,
+ "ඪි": 1091,
+ "κ": 1092,
+ "℉": 1093,
+ "þ": 1094,
+ "ቁ": 1095,
+ "එේ": 1096,
+ "ර්": 1097,
+ "팔": 1098,
+ "ੈ": 1099,
+ "Y": 1100,
+ "🏠": 1101,
+ "க": 1102,
+ "ජි": 1103,
+ "▲": 1104,
+ "ụ": 1105,
+ "ṁ": 1106,
+ "名": 1107,
+ "۶": 1108,
+ "ඪී": 1109,
+ "、": 1110,
+ "任": 1111,
+ "චෝ": 1112,
+ "ඡැ": 1113,
+ "ධෘ": 1114,
+ "ධෲ": 1115,
+ "족": 1116,
+ "甦": 1117,
+ "💹": 1118,
+ "ඤා": 1119,
+ "研": 1120,
+ "υ": 1121,
+ "සී": 1122,
+ "ֻ": 1123,
+ "►": 1124,
+ "+": 1125,
+ "ෆී": 1126,
+ "‘": 1127,
+ "😁": 1128,
+ "ඣු": 1129,
+ "ล": 1130,
+ "": 1131,
+ "е": 1132,
+ "ය්": 1133,
+ "מ": 1134,
+ "唐": 1135,
+ "ष": 1136,
+ "ِ": 1137,
+ "Ò": 1138,
+ "හෝ": 1139,
+ "😩": 1140,
+ "放": 1141,
+ "ु": 1142,
+ "👏": 1143,
+ "🙂": 1144,
+ "ℯ": 1145,
+ "-": 1146,
+ "整": 1147,
+ "එි": 1148,
+ "雨": 1149,
+ "ب": 1150,
+ "ʃ": 1151,
+ "Д": 1152,
+ "サ": 1153,
+ "ශෞ": 1154,
+ "ለ": 1155,
+ "∎": 1156,
+ "ι": 1157,
+ "블": 1158,
+ "ඝො": 1159,
+ "🚇": 1160,
+ "Ö": 1161,
+ "ț": 1162,
+ "7": 1163,
+ "කැ": 1164,
+ "ගෞ": 1165,
+ "ඟැ": 1166,
+ "ス": 1167,
+ "පඃ": 1168,
+ "만": 1169,
+ "📷": 1170,
+ "食": 1171,
+ "හං": 1172,
+ "ඓං": 1173,
+ "γ": 1174,
+ "🔵": 1175,
+ "ệ": 1176,
+ "ே": 1177,
+ "තෛ": 1178,
+ "伎": 1179,
+ "වඃ": 1180,
+ "ද්": 1181,
+ "ુ": 1182,
+ "➦": 1183,
+ "": 1184,
+ "සෑ": 1185,
+ "ඬ්": 1186,
+ "А": 1187,
+ "තඃ": 1188,
+ "ණී": 1189,
+ "😦": 1190,
+ "ஸ": 1191,
+ "¡": 1192,
+ "ʂ": 1193,
+ "ඣී": 1194,
+ "මේ": 1195,
+ "ලො": 1196,
+ "ෂි": 1197,
+ "ያ": 1198,
+ "工": 1199,
+ "ע": 1200,
+ "빈": 1201,
+ "▆": 1202,
+ "ගු": 1203,
+ "µ": 1204,
+ "á": 1205,
+ "ඒ": 1206,
+ "ණො": 1207,
+ "": 1208,
+ "": 1209,
+ "た": 1210,
+ "▅": 1211,
+ "ப": 1212,
+ "도": 1213,
+ "ටී": 1214,
+ "a": 1215,
+ "e": 1216,
+ "ඹූ": 1217,
+ "а": 1218,
+ ",": 1219,
+ "✈": 1220,
+ "შ": 1221,
+ "ω": 1222,
+ "[": 1223,
+ "щ": 1224,
+ "ê": 1225,
+ "â": 1226,
+ "☚": 1227,
+ "\u0002": 1228,
+ "⌖": 1229,
+ "순": 1230,
+ "🍂": 1231,
+ "ඊැ": 1232,
+ "℃": 1233,
+ "කූ": 1234,
+ "ඌ": 1235,
+ "තෞ": 1236,
+ "ඛෘ": 1237,
+ "ң": 1238,
+ "戌": 1239,
+ "อ": 1240,
+ "😨": 1241,
+ "ֹ": 1242,
+ "ඩ්": 1243,
+ "า": 1244,
+ "☆": 1245,
+ "ඪ්": 1246,
+ "😡": 1247,
+ "千": 1248,
+ "ඡෛ": 1249,
+ "田": 1250,
+ "යී": 1251,
+ "대": 1252,
+ "결": 1253,
+ "💎": 1254,
+ "ඦා": 1255,
+ "හ": 1256,
+ "😄": 1257,
+ "චි": 1258,
+ "෴": 1259,
+ "දෑ": 1260,
+ "<": 1261,
+ "ණෙ": 1262,
+ "බං": 1263,
+ "ん": 1264,
+ "": 1265,
+ "හෑ": 1266,
+ "": 1267,
+ "🔑": 1268,
+ "ලි": 1269,
+ "👈": 1270,
+ "බා": 1271,
+ "": 1272,
+ "වෘ": 1273,
+ "තැ": 1274,
+ "චො": 1275,
+ "哈": 1276,
+ "ɒ": 1277,
+ "": 1278,
+ "口": 1279,
+ "¦": 1280,
+ "කා": 1281,
+ "鈴": 1282,
+ "ứ": 1283,
+ "反": 1284,
+ "ළං": 1285,
+ "ㄱ": 1286,
+ "බී": 1287,
+ "උූ": 1288,
+ "ŋ": 1289,
+ "者": 1290,
+ "'": 1291,
+ "T": 1292,
+ "【": 1293,
+ "ሸ": 1294,
+ "⚜": 1295,
+ "≈": 1296,
+ "ጣ": 1297,
+ "య": 1298,
+ "💩": 1299,
+ "🎤": 1300,
+ "É": 1301,
+ "ළු": 1302,
+ "👆": 1303,
+ "☼": 1304,
+ "δ": 1305,
+ "ආඃ": 1306,
+ "씨": 1307,
+ "💥": 1308,
+ "г": 1309,
+ "Б": 1310,
+ "ඩො": 1311,
+ "ุ": 1312,
+ "И": 1313,
+ "๏": 1314,
+ "😝": 1315,
+ "ඵා": 1316,
+ "ジ": 1317,
+ "නං": 1318,
+ "생": 1319,
+ "": 1320,
+ "ඣෝ": 1321,
+ "ඉි": 1322,
+ "ō": 1323,
+ "ෂෙ": 1324,
+ "වං": 1325,
+ "ज": 1326,
+ "": 1327,
+ "කෞ": 1328,
+ "市": 1329,
+ "ü": 1330,
+ "ל": 1331,
+ "භූ": 1332,
+ "ů": 1333,
+ "ව": 1334,
+ "P": 1335,
+ "🇱": 1336,
+ "🕛": 1337,
+ "か": 1338,
+ "Z": 1339,
+ "ل": 1340,
+ "ტ": 1341,
+ "▼": 1342,
+ "⛅": 1343,
+ "☝": 1344,
+ "ē": 1345,
+ "ి": 1346,
+ "営": 1347,
+ "ශෙ": 1348,
+ "W": 1349,
+ "": 1350,
+ "සඃ": 1351,
+ "⛺": 1352,
+ "": 1353,
+ "ෂු": 1354,
+ "ž": 1355,
+ "තා": 1356,
+ "Ê": 1357,
+ "👹": 1358,
+ "උී": 1359,
+ "合": 1360,
+ "ඨේ": 1361,
+ "ಧ": 1362,
+ "ᄊ": 1363,
+ "♂": 1364,
+ "ඖ": 1365,
+ " ": 1366,
+ "මෛ": 1367,
+ "😃": 1368,
+ "📲": 1369,
+ "古": 1370,
+ "ж": 1371,
+ "j": 1372,
+ "ශු": 1373,
+ "➲": 1374,
+ "с": 1375,
+ "ā": 1376,
+ "3": 1377,
+ "お": 1378,
+ "ඹු": 1379,
+ "ნ": 1380,
+ "証": 1381,
+ "🤔": 1382,
+ "고": 1383,
+ "එං": 1384,
+ "ř": 1385,
+ "∏": 1386,
+ "ථූ": 1387,
+ "◀": 1388,
+ "Õ": 1389,
+ "ඇෙ": 1390,
+ "艾": 1391,
+ "Р": 1392,
+ "ෆො": 1393,
+ "ගා": 1394,
+ "c": 1395,
+ "🔫": 1396,
+ "ඨෙ": 1397,
+ "": 1398,
+ "ජා": 1399,
+ "의": 1400,
+ "書": 1401,
+ "ت": 1402,
+ "๐": 1403,
+ "🌴": 1404,
+ "క": 1405,
+ "श": 1406,
+ "බෲ": 1407,
+ "犬": 1408,
+ " ": 1409,
+ "රු": 1410,
+ "": 1411,
+ "ʔ": 1412,
+ "🙄": 1413,
+ "왕": 1414,
+ "Л": 1415,
+ "Ο": 1416,
+ "未": 1417,
+ "♨": 1418,
+ "ز": 1419,
+ "හු": 1420,
+ "ඟා": 1421,
+ "성": 1422,
+ "क": 1423,
+ "තො": 1424,
+ "ලෛ": 1425,
+ "ढ": 1426,
+ "ແ": 1427,
+ "〜": 1428,
+ "ජෙ": 1429,
+ "–": 1430,
+ "🌟": 1431,
+ "": 1432,
+ "🏳": 1433,
+ "ඍී": 1434,
+ "オ": 1435,
+ "ا": 1436,
+ "නූ": 1437,
+ "Α": 1438,
+ "චූ": 1439,
+ "😮": 1440,
+ "ඵෑ": 1441,
+ "🏃": 1442,
+ "ඛො": 1443,
+ "人": 1444,
+ "ෂා": 1445,
+ "狄": 1446,
+ "රෙ": 1447,
+ "ṅ": 1448,
+ "අි": 1449,
+ "ඤෝ": 1450,
+ "ਡ": 1451,
+ "ನ": 1452,
+ "及": 1453,
+ "ඩං": 1454,
+ "р": 1455,
+ "⇻": 1456,
+ "🌼": 1457,
+ "ٌ": 1458,
+ "ர": 1459,
+ "τ": 1460,
+ "ǚ": 1461,
+ "ඒෙ": 1462,
+ "活": 1463,
+ "මඃ": 1464,
+ "ට්": 1465,
+ "n": 1466,
+ "行": 1467,
+ "私": 1468,
+ "යෞ": 1469,
+ "ல": 1470,
+ "X": 1471,
+ "උා": 1472,
+ "ඪෝ": 1473,
+ "🚴": 1474,
+ "ਬ": 1475,
+ "ደ": 1476,
+ "ඵො": 1477,
+ "ו": 1478,
+ "▶": 1479,
+ "ய": 1480,
+ "❤": 1481,
+ "අෘ": 1482,
+ "′": 1483,
+ "★": 1484,
+ "Δ": 1485,
+ "î": 1486,
+ "ር": 1487,
+ "나": 1488,
+ "۩": 1489,
+ "ඎ": 1490,
+ "ඨූ": 1491,
+ "主": 1492,
+ "": 1493,
+ "": 1494,
+ "攻": 1495,
+ "숙": 1496,
+ "": 1497,
+ "小": 1498,
+ "곰": 1499,
+ "ن": 1500,
+ "ඹී": 1501,
+ "మ": 1502,
+ "三": 1503,
+ "由": 1504,
+ "ඬං": 1505,
+ "🍅": 1506,
+ "තු": 1507,
+ "ඉ්": 1508,
+ "ㅅ": 1509,
+ "ç": 1510,
+ "数": 1511,
+ "මෞ": 1512,
+ "후": 1513,
+ "ඟ": 1514,
+ "$": 1515,
+ "←": 1516,
+ "ள": 1517,
+ "ת": 1518,
+ "€": 1519,
+ "ṇ": 1520,
+ "イ": 1521,
+ "ವ": 1522,
+ "ک": 1523,
+ "ಂ": 1524,
+ "ඛ්": 1525,
+ "☯": 1526,
+ "缶": 1527,
+ "역": 1528,
+ "": 1529,
+ "වැ": 1530,
+ "%": 1531,
+ "☛": 1532,
+ "▫": 1533,
+ "තී": 1534,
+ "≥": 1535,
+ "E": 1536,
+ "成": 1537,
+ "È": 1538,
+ "前": 1539,
+ "¿": 1540,
+ "බු": 1541,
+ "ம": 1542,
+ "レ": 1543,
+ "ධෝ": 1544,
+ "ශ්": 1545,
+ "そ": 1546,
+ "里": 1547,
+ "ற": 1548,
+ "û": 1549,
+ "◇": 1550,
+ "្": 1551,
+ "許": 1552,
+ "ድ": 1553,
+ "壬": 1554,
+ "`": 1555,
+ "🛑": 1556,
+ "國": 1557,
+ "": 1558,
+ "ʻ": 1559,
+ "ධ්": 1560,
+ "ஷ": 1561,
+ "ク": 1562,
+ "ථැ": 1563,
+ "⚪": 1564,
+ "ภ": 1565,
+ "ඡා": 1566,
+ "£": 1567,
+ "ο": 1568,
+ "": 1569,
+ "乎": 1570,
+ "ě": 1571,
+ "ר": 1572,
+ "Ŧ": 1573,
+ "➢": 1574,
+ "ɔ": 1575,
+ "චෛ": 1576,
+ "S": 1577,
+ "ી": 1578,
+ "ඝේ": 1579,
+ "හො": 1580,
+ "q": 1581,
+ "識": 1582,
+ "з": 1583,
+ "එෛ": 1584,
+ "в": 1585,
+ "カ": 1586,
+ "➩": 1587,
+ "රා": 1588,
+ "🤷": 1589,
+ "�": 1590,
+ "💙": 1591,
+ "―": 1592,
+ "あ": 1593,
+ "明": 1594,
+ "තේ": 1595,
+ "ņ": 1596,
+ "හඃ": 1597,
+ "U": 1598,
+ "R": 1599,
+ "ਤ": 1600,
+ "ඨා": 1601,
+ "م": 1602,
+ "节": 1603,
+ "♦": 1604,
+ "පෞ": 1605,
+ "මෙ": 1606,
+ "ශී": 1607,
+ "🔨": 1608,
+ "ජො": 1609,
+ "ე": 1610,
+ "ّ": 1611,
+ "े": 1612,
+ "午": 1613,
+ "එ්": 1614,
+ "4": 1615,
+ "ණ්": 1616,
+ "*": 1617,
+ "校": 1618,
+ "̵": 1619,
+ "ස්": 1620,
+ "": 1621,
+ "ඣ": 1622,
+ "п": 1623,
+ "아": 1624,
+ "Ω": 1625,
+ "é": 1626,
+ "y": 1627,
+ "දෞ": 1628,
+ "ግ": 1629,
+ "↓": 1630,
+ "ﻨ": 1631,
+ "ඪා": 1632,
+ "ජෛ": 1633,
+ "।": 1634,
+ "斯": 1635,
+ "ඛ": 1636,
+ "ي": 1637,
+ "„": 1638,
+ "ť": 1639,
+ "": 1640,
+ "සෝ": 1641,
+ "式": 1642,
+ "л": 1643,
+ "රො": 1644,
+ "ධි": 1645,
+ "і": 1646,
+ "චා": 1647,
+ "වෝ": 1648,
+ "ආා": 1649,
+ "ø": 1650,
+ "わ": 1651,
+ "එු": 1652,
+ "👀": 1653,
+ "ඞා": 1654,
+ "ä": 1655,
+ "ַ": 1656,
+ "ලෝ": 1657,
+ "த": 1658,
+ "ወ": 1659,
+ "මං": 1660,
+ "ත්": 1661,
+ "ま": 1662,
+ "パ": 1663,
+ "広": 1664,
+ "会": 1665,
+ "問": 1666,
+ "": 1667,
+ "志": 1668,
+ "리": 1669,
+ "丝": 1670,
+ "о": 1671,
+ "😊": 1672,
+ "責": 1673,
+ "ඛූ": 1674,
+ "❣": 1675,
+ "を": 1676,
+ "😌": 1677,
+ "ඬෙ": 1678,
+ "杜": 1679,
+ "∫": 1680,
+ "එී": 1681,
+ "ඝා": 1682,
+ "군": 1683,
+ "š": 1684,
+ "ඵූ": 1685,
+ "ί": 1686,
+ "😆": 1687,
+ "😛": 1688,
+ "卡": 1689,
+ "": 1690,
+ "කී": 1691,
+ "ටං": 1692,
+ "를": 1693,
+ "බැ": 1694,
+ "💵": 1695,
+ "™": 1696,
+ "산": 1697,
+ "🌎": 1698,
+ "ශො": 1699,
+ "乇": 1700,
+ "↠": 1701,
+ "💐": 1702,
+ "බෙ": 1703,
+ "元": 1704,
+ "ඛේ": 1705,
+ "2": 1706,
+ "උ": 1707,
+ "드": 1708,
+ "දා": 1709,
+ "ඳෘ": 1710,
+ "මු": 1711,
+ "後": 1712,
+ "ë": 1713,
+ "➤": 1714,
+ "ׁ": 1715,
+ "č": 1716,
+ "材": 1717,
+ "ණි": 1718,
+ "¤": 1719,
+ "″": 1720,
+ "究": 1721,
+ "ගෑ": 1722,
+ "쉬": 1723,
+ "d": 1724,
+ "訥": 1725,
+ "•": 1726,
+ "城": 1727,
+ "🚆": 1728,
+ "→": 1729,
+ "こ": 1730,
+ "H": 1731,
+ "म": 1732,
+ "දේ": 1733,
+ "රෞ": 1734,
+ "“": 1735,
+ "ඪො": 1736,
+ "ሽ": 1737,
+ "C": 1738,
+ "ं": 1739,
+ "ඳූ": 1740,
+ "吗": 1741,
+ "😵": 1742,
+ "": 1743,
+ "▂": 1744,
+ "ඡො": 1745,
+ "දඃ": 1746,
+ "이": 1747,
+ "⚘": 1748,
+ "රී": 1749,
+ "り": 1750,
+ "🙉": 1751,
+ "ф": 1752,
+ "\b": 1753,
+ "ඨී": 1754,
+ "正": 1755,
+ "催": 1756,
+ "찰": 1757,
+ "어": 1758,
+ "👉": 1759,
+ "ş": 1760,
+ "ෆි": 1761,
+ "ෆං": 1762,
+ "ටෲ": 1763,
+ "ማ": 1764,
+ "ධ": 1765,
+ "्": 1766,
+ "業": 1767,
+ "🎁": 1768,
+ "": 1769,
+ "භු": 1770,
+ "ь": 1771,
+ "近": 1772,
+ "බෝ": 1773,
+ "尺": 1774,
+ "ㄷ": 1775,
+ "ر": 1776,
+ "වෛ": 1777,
+ "√": 1778,
+ "": 1779,
+ "👓": 1780,
+ "බේ": 1781,
+ "L": 1782,
+ "": 1783,
+ "ඕං": 1784,
+ "ෆෝ": 1785,
+ "මැ": 1786,
+ "ʈ": 1787,
+ "ở": 1788,
+ "😀": 1789,
+ "ṣ": 1790,
+ "©": 1791,
+ "덕": 1792,
+ "ం": 1793,
+ "薦": 1794,
+ "රි": 1795,
+ "ם": 1796,
+ "ハ": 1797,
+ "ਰ": 1798,
+ "ෂ": 1799,
+ "®": 1800,
+ "∋": 1801,
+ "Ш": 1802,
+ "සැ": 1803,
+ "؟": 1804,
+ "˝": 1805,
+ "원": 1806,
+ "ස": 1807,
+ "и": 1808,
+ "物": 1809,
+ "🙌": 1810,
+ "妇": 1811,
+ "여": 1812,
+ "˜": 1813,
+ "べ": 1814,
+ "ඟී": 1815,
+ "ඛී": 1816,
+ "✴": 1817,
+ "‹": 1818,
+ "💇": 1819,
+ "💃": 1820,
+ "ύ": 1821,
+ "හ්": 1822,
+ "භෛ": 1823,
+ "ੁ": 1824,
+ "வ": 1825,
+ "යේ": 1826,
+ "▬": 1827,
+ "යු": 1828,
+ "දෝ": 1829,
+ "ධැ": 1830,
+ "": 1831,
+ "😔": 1832,
+ "ŝ": 1833,
+ "職": 1834,
+ "П": 1835,
+ "✌": 1836,
+ "ටෞ": 1837,
+ "გ": 1838,
+ "¯": 1839,
+ "น": 1840,
+ "说": 1841,
+ "飛": 1842,
+ "ة": 1843,
+ "යි": 1844,
+ "ገ": 1845,
+ "ಯ": 1846,
+ "පො": 1847,
+ "ථේ": 1848,
+ "å": 1849,
+ "ථ": 1850,
+ "ෂෘ": 1851,
+ "!": 1852,
+ "육": 1853,
+ "ह": 1854,
+ "ú": 1855,
+ "ටූ": 1856,
+ "祭": 1857,
+ "せ": 1858,
+ "භෙ": 1859,
+ "░": 1860,
+ "烧": 1861,
+ "🧗": 1862,
+ "තෙ": 1863,
+ "r": 1864,
+ "☂": 1865,
+ "කො": 1866,
+ "ඩෛ": 1867,
+ "æ": 1868,
+ "∞": 1869,
+ "ิ": 1870,
+ "和": 1871,
+ "ටි": 1872,
+ "ථෙ": 1873,
+ "ﷺ": 1874,
+ "": 1875,
+ "ෂෑ": 1876,
+ "ලී": 1877,
+ "ගෝ": 1878,
+ "❋": 1879,
+ "ඈෑ": 1880,
+ "ト": 1881,
+ "හි": 1882,
+ "ටැ": 1883,
+ "❸": 1884,
+ "ශෲ": 1885,
+ "ලෑ": 1886,
+ "➧": 1887,
+ "📕": 1888,
+ "ණං": 1889,
+ "ලූ": 1890,
+ "◄": 1891,
+ "ඟු": 1892,
+ "ශේ": 1893,
+ "К": 1894,
+ "Ó": 1895,
+ "ඵු": 1896,
+ "": 1897,
+ "ਵ": 1898,
+ "͡": 1899,
+ "": 1900,
+ "🌊": 1901,
+ "ತ": 1902,
+ "5": 1903,
+ "සෛ": 1904,
+ "⭐": 1905,
+ "": 1906,
+ "⇒": 1907,
+ "ෆෞ": 1908,
+ "幣": 1909,
+ "🖤": 1910,
+ "シ": 1911,
+ "ṟ": 1912,
+ "╮": 1913,
+ "☟": 1914,
+ "පෲ": 1915,
+ "כ": 1916,
+ "遅": 1917,
+ "😉": 1918,
+ "අැ": 1919,
+ "■": 1920,
+ "我": 1921,
+ "අෑ": 1922,
+ "B": 1923,
+ "მ": 1924,
+ "◢": 1925,
+ "භො": 1926,
+ "タ": 1927,
+ "දෲ": 1928,
+ "ū": 1929,
+ "ዬ": 1930,
+ "ኋ": 1931,
+ "½": 1932,
+ "族": 1933,
+ "β": 1934,
+ "ល": 1935,
+ "ඵෙ": 1936,
+ "ඤෑ": 1937,
+ "જ": 1938,
+ "ෂෝ": 1939,
+ "🤗": 1940,
+ "ვ": 1941,
+ "×": 1942,
+ "ش": 1943,
+ "ℓ": 1944,
+ "ง": 1945,
+ "වෲ": 1946,
+ "维": 1947,
+ "녕": 1948,
+ "Ţ": 1949,
+ "M": 1950,
+ "": 1951,
+ "කඃ": 1952,
+ "🏁": 1953,
+ "💰": 1954,
+ "本": 1955,
+ "█": 1956,
+ "හෘ": 1957,
+ "l": 1958,
+ "ඩෝ": 1959,
+ "යා": 1960,
+ "😥": 1961,
+ "ः": 1962,
+ "Ʒ": 1963,
+ "ඳි": 1964,
+ "曜": 1965,
+ "弘": 1966,
+ "ළෙ": 1967,
+ "動": 1968,
+ "☺": 1969,
+ "යො": 1970,
+ "应": 1971,
+ "プ": 1972,
+ "ජූ": 1973,
+ "φ": 1974,
+ "ძ": 1975,
+ "✖": 1976,
+ "h": 1977,
+ "井": 1978,
+ "錢": 1979,
+ "ɑ": 1980,
+ "ඨ": 1981,
+ "ශං": 1982,
+ "的": 1983,
+ "පැ": 1984,
+ "": 1985,
+ "ְ": 1986,
+ "ඳො": 1987,
+ "නෙ": 1988,
+ "": 1989,
+ "⇜": 1990,
+ "ÿ": 1991,
+ "ල්": 1992,
+ "ඩැ": 1993,
+ "ு": 1994,
+ "ඛෑ": 1995,
+ "特": 1996,
+ "は": 1997,
+ "۱": 1998,
+ "🚗": 1999,
+ "ඡෝ": 2000,
+ "ශෛ": 2001,
+ "భ": 2002,
+ "౦": 2003,
+ "ඟං": 2004,
+ "እ": 2005,
+ "තෑ": 2006,
+ "ශෑ": 2007,
+ "ධේ": 2008,
+ "ை": 2009,
+ "ы": 2010,
+ "Ὑ": 2011,
+ "ບ": 2012,
+ "정": 2013,
+ "ඹො": 2014,
+ "ථී": 2015,
+ "ඟෙ": 2016,
+ "›": 2017,
+ "🙏": 2018,
+ "➣": 2019,
+ "ධෞ": 2020,
+ "🎰": 2021,
+ "ථා": 2022,
+ "ف": 2023,
+ "ə": 2024,
+ "ඪ": 2025,
+ "👼": 2026,
+ "ರ": 2027,
+ "Ŝ": 2028,
+ "新": 2029,
+ "ටෝ": 2030,
+ "ም": 2031,
+ ")": 2032,
+ "v": 2033,
+ "ص": 2034,
+ "ئ": 2035,
+ "兵": 2036,
+ "ඛැ": 2037,
+ "ෂඃ": 2038,
+ "Ü": 2039,
+ "न": 2040,
+ "ම": 2041,
+ "я": 2042,
+ "ہ": 2043,
+ "Γ": 2044,
+ "ශෝ": 2045,
+ "இ": 2046,
+ "°": 2047,
+ "": 2048,
+ "(": 2049,
+ "訣": 2050,
+ "එෙ": 2051,
+ "=": 2052,
+ "ッ": 2053,
+ "ป": 2054,
+ "Σ": 2055,
+ "堡": 2056,
+ "ඬ": 2057,
+ "✘": 2058,
+ " ": 2059,
+ "۹": 2060,
+ "包": 2061,
+ "д": 2062,
+ "Π": 2063,
+ "ඉා": 2064,
+ "♪": 2065,
+ "ලෞ": 2066,
+ "": 2067,
+ "පු": 2068,
+ "ධෛ": 2069,
+ "헌": 2070,
+ "එ": 2071,
+ "භැ": 2072,
+ "ಅ": 2073,
+ "‑": 2074,
+ "Ἀ": 2075,
+ "¶": 2076,
+ "": 2077,
+ "є": 2078,
+ "s": 2079,
+ "リ": 2080,
+ "ඉං": 2081,
+ "ඨො": 2082,
+ "🤓": 2083,
+ "إ": 2084,
+ "ோ": 2085,
+ "ඍ්": 2086,
+ "💠": 2087,
+ "击": 2088,
+ "ඕෙ": 2089,
+ "☖": 2090,
+ "ඒං": 2091,
+ "ή": 2092,
+ "ඨු": 2093,
+ "මො": 2094,
+ "x": 2095,
+ "د": 2096,
+ "ﻟ": 2097,
+ "ආ": 2098,
+ "ඔු": 2099,
+ "客": 2100,
+ "🏿": 2101,
+ "少": 2102,
+ "☎": 2103,
+ "ඔෙ": 2104,
+ "↔": 2105,
+ "ඳං": 2106,
+ "租": 2107,
+ "😲": 2108,
+ "🔖": 2109,
+ "💯": 2110,
+ "ඉැ": 2111,
+ "구": 2112,
+ "が": 2113,
+ "": 2114,
+ "වී": 2115,
+ "ඳා": 2116,
+ "රං": 2117,
+ "චං": 2118,
+ ":": 2119,
+ "බෑ": 2120,
+ "ש": 2121,
+ "ʁ": 2122,
+ "ණෘ": 2123,
+ "සෲ": 2124,
+ "ả": 2125,
+ "ጥ": 2126,
+ "ඇඃ": 2127,
+ "ಿ": 2128,
+ "😇": 2129,
+ "🚘": 2130,
+ "并": 2131,
+ "천": 2132,
+ "რ": 2133,
+ "ඤූ": 2134,
+ "ඬූ": 2135,
+ "ช": 2136,
+ "¸": 2137,
+ "": 2138,
+ "ਸ": 2139,
+ "♚": 2140,
+ "﴿": 2141,
+ "මා": 2142,
+ "⏳": 2143,
+ "හූ": 2144,
+ "-": 2145,
+ "": 2146,
+ "නො": 2147,
+ "╰": 2148,
+ "ඵං": 2149,
+ "̃": 2150,
+ "汉": 2151,
+ "●": 2152,
+ "ḷ": 2153,
+ "²": 2154,
+ "▷": 2155,
+ "ი": 2156,
+ "ア": 2157,
+ "ෂෛ": 2158,
+ "🎼": 2159,
+ "": 2160,
+ "භෑ": 2161,
+ "♠": 2162,
+ "රෝ": 2163,
+ "🚉": 2164,
+ "ህ": 2165,
+ "乾": 2166,
+ "අේ": 2167,
+ "ٹ": 2168,
+ "卒": 2169,
+ ">": 2170,
+ "ર": 2171,
+ "දී": 2172,
+ "ඩෙ": 2173,
+ "推": 2174,
+ "ඥ": 2175,
+ "ኛ": 2176,
+ "络": 2177,
+ "ἄ": 2178,
+ "t": 2179,
+ "ඞ්": 2180,
+ "😋": 2181,
+ "ý": 2182,
+ "∆": 2183,
+ "ජඃ": 2184,
+ "?": 2185,
+ "れ": 2186,
+ "ふ": 2187,
+ "最": 2188,
+ "🔬": 2189,
+ "ἀ": 2190,
+ "ථං": 2191,
+ "排": 2192,
+ "요": 2193,
+ "್": 2194,
+ "◎": 2195,
+ "無": 2196,
+ "ொ": 2197,
+ "🔓": 2198,
+ "භා": 2199,
+ "": 2200,
+ "ඝෙ": 2201,
+ "😜": 2202,
+ "С": 2203,
+ "ඣා": 2204,
+ "♊": 2205,
+ "◆": 2206,
+ "书": 2207,
+ "☠": 2208,
+ "χ": 2209,
+ "b": 2210,
+ "👭": 2211,
+ "ﻬ": 2212,
+ "德": 2213,
+ "‐": 2214,
+ "ඣි": 2215,
+ "ධූ": 2216,
+ "": 2217,
+ "1": 2218,
+ "공": 2219,
+ "σ": 2220,
+ "ළෘ": 2221,
+ "அ": 2222,
+ "贼": 2223,
+ "래": 2224,
+ "මෘ": 2225,
+ "කෲ": 2226,
+ "ì": 2227,
+ "˚": 2228,
+ "è": 2229,
+ "🍔": 2230,
+ "館": 2231,
+ "ථෛ": 2232,
+ "๒": 2233,
+ "ц": 2234,
+ "ඡං": 2235,
+ "☐": 2236,
+ "ෂ්": 2237,
+ "✔": 2238,
+ "චී": 2239,
+ "👊": 2240,
+ "ඔා": 2241,
+ "ඤෙ": 2242,
+ "ಗ": 2243,
+ "ශෘ": 2244,
+ "ಾ": 2245,
+ "ආං": 2246,
+ "ඟි": 2247,
+ "හෙ": 2248,
+ "大": 2249,
+ "ෆු": 2250,
+ "භං": 2251,
+ "ன": 2252,
+ "කි": 2253,
+ "ෂූ": 2254,
+ "♫": 2255,
+ "පෙ": 2256,
+ "ミ": 2257,
+ "🇺": 2258,
+ "Н": 2259,
+ "尔": 2260,
+ "▣": 2261,
+ "ч": 2262,
+ "‛": 2263,
+ "දි": 2264,
+ "පෑ": 2265,
+ "ෆා": 2266,
+ "ථෘ": 2267,
+ "ャ": 2268,
+ "ϕ": 2269,
+ "ח": 2270,
+ "多": 2271,
+ "බො": 2272,
+ "": 2273
+}
\ No newline at end of file
diff --git a/examples/examples.ipynb b/examples/examples.ipynb
index 461880a..811cf4e 100644
--- a/examples/examples.ipynb
+++ b/examples/examples.ipynb
@@ -16,8 +16,14 @@
"metadata": {},
"outputs": [],
"source": [
- "from sinlib import Tokenizer\n",
- "from sinlib import preprocessing"
+ "from sinlib import Tokenizer, preprocessing, Romanizer"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Training a sinlib tokenizer"
]
},
{
@@ -26,7 +32,14 @@
"metadata": {},
"outputs": [],
"source": [
- "text = [\"ක්රමවත්ව, ඉවසිලිවන්තව\"] * 10"
+ "corpus = [\n",
+ " \"\"\"මෙරටට බුදදහම දායාද කරමින් අනුබුදු මිහිඳු හිමිගේ ලංකා ගමනය සිදුවූ උතුම් පොසොන් පුර පසළොස්වක පොහොය දිනය අදට යෙදී තිබේ.\n",
+ "\n",
+ "මිහිඳු මහරහතන් වහන්සේ ප්රමුඛ ඉට්ඨිය, උත්ථිය, සම්බල, බද්දසාල යන රහතන් වහන්සේලාත් සුමන සාමණේරයන් වහන්සේත් භණ්ඩුක උපාසකක් බුදුරජාණන් වහන්සේගේ නිර්මල බුදුදහම රැගෙන මිහින්තලා පව්වට වැඩම කරවීම අද වැනි පොසොන් පුර පසළොස්වක පෙහොය දිනක සිදුවූ බව බෞද්ධ ඉතිහාසයේ සඳහන් වෙයි.\n",
+ "\n",
+ "දේවානම් පියතිස්ස රජු ඇතුළු පිරිස චුල්ලහත්ථි පදෝපම සූත්රය අසා තෙරුවන් සරණ යාම සිදු වූයේද අද වැනි පොසොන් පොහොය දිනකය.\"\"\",\n",
+ "\"මේ අතර පොසොන් පොහෝ දින පණිවුඩයක් නිකුත් කරමින් ජනාධිපතිවරයා පෙන්වා දෙන්නේ මිහිඳු මහරහතන් වහන්සේ විසින් අනු දැන වදාළ ධර්ම මාර්ගය මෙරට පත්වී ඇති දේශපාලන, සමාජ හා ආර්ථික ගැටළු නිරාකරණය කර ගනිමින් දියුණු රටක් ගොඩනැඟීමට ඉවහල් කරගන්නා ලෙස සියලු දෙනාගෙන් ඉල්ලා සිටින බවය.\"\n",
+ "]"
]
},
{
@@ -35,44 +48,41 @@
"metadata": {},
"outputs": [
{
- "data": {
- "text/plain": [
- "['ක්\\u200dරමවත්ව, ඉවසිලිවන්තව',\n",
- " 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව',\n",
- " 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව',\n",
- " 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව',\n",
- " 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව',\n",
- " 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව',\n",
- " 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව',\n",
- " 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව',\n",
- " 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව',\n",
- " 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව']"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[0;31mSignature:\u001b[0m \u001b[0mtokenizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext_list\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mDocstring:\u001b[0m\n",
+ "Train the tokenizer on a list of text strings.\n",
+ "\n",
+ "Parameters\n",
+ "----------\n",
+ "text_list : list of str\n",
+ " List of text strings to be used for training the tokenizer.\n",
+ "\n",
+ "Examples\n",
+ "--------\n",
+ ">>> from sinlib import Tokenizer\n",
+ ">>> corpus = [...]\n",
+ ">>> tokenizer = Tokenizer()\n",
+ ">>> tokenizer.train(corpus)\n",
+ "\u001b[0;31mFile:\u001b[0m ~/learning/sinlib/src/sinlib/tokenizer.py\n",
+ "\u001b[0;31mType:\u001b[0m method"
+ ]
}
],
"source": [
- "text"
+ "tokenizer = Tokenizer()\n",
+ "tokenizer.train?"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "['ක්\\u200dරමවත්ව, ඉවසිලිවන්තව', 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව', 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව', 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව', 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව', 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව', 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව', 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව', 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව', 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව']\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
- "print(text) # have non printables \\u200d"
+ "tokenizer.train(corpus)"
]
},
{
@@ -83,16 +93,7 @@
{
"data": {
"text/plain": [
- "[0.9333333333333333,\n",
- " 0.9333333333333333,\n",
- " 0.9333333333333333,\n",
- " 0.9333333333333333,\n",
- " 0.9333333333333333,\n",
- " 0.9333333333333333,\n",
- " 0.9333333333333333,\n",
- " 0.9333333333333333,\n",
- " 0.9333333333333333,\n",
- " 0.9333333333333333]"
+ "127"
]
},
"execution_count": 6,
@@ -101,237 +102,364 @@
}
],
"source": [
- "preprocessing.get_sinhala_character_ratio(text, consider_special_character_as_sinhala=False)"
+ "len(tokenizer)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Encoding text"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
+ "outputs": [],
+ "source": [
+ "text = \"උතුම් පොසොන් පොහොය අද\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "encodings = tokenizer(text)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]"
+ "[51, 118, 33, 54, 121, 13, 97, 54, 121, 29, 50, 54, 52, 120]"
]
},
- "execution_count": 7,
+ "execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "preprocessing.get_sinhala_character_ratio(text, consider_special_character_as_sinhala=True)"
+ "encodings"
]
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]"
+ "['උ', 'තු', 'ම්', ' ', 'පො', 'සො', 'න්', ' ', 'පො', 'හො', 'ය', ' ', 'අ', 'ද']"
]
},
- "execution_count": 8,
+ "execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "preprocessing.get_sinhala_character_ratio(text, consider_special_character_as_sinhala=True, ignore_non_printable=True)"
+ "[tokenizer.token_id_to_token_map[tok] for tok in encodings]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Save trained tokenizer and load from disk"
]
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tokenizer.save_tokenizer(\".\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "loaded_tokenizer = Tokenizer().load_from_pretrained(\"./vocab.json\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "assert loaded_tokenizer(text)==tokenizer(text)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Sinhala text romanization"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "romanizer = Romanizer(char_mapper_fp=None, tokenizer_vocab_path=None)  # pass None for both to load the default configs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
"metadata": {},
"outputs": [
{
- "data": {
- "text/plain": [
- "[0.9375,\n",
- " 0.9375,\n",
- " 0.9375,\n",
- " 0.9375,\n",
- " 0.9375,\n",
- " 0.9375,\n",
- " 0.9375,\n",
- " 0.9375,\n",
- " 0.9375,\n",
- " 0.9375]"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "meratata budadahama dayada karamin anubudu mihidu himige lanka gamanaya siduwu uthum poson pura pasaloswaka pohoya dinaya adata yedi thibe.mihidu maharahathan wahanse pramuka ettiya, uththiya, sambala, baddasala yana rahathan wahanselath sumana samanorayan wahanseth bhanduka upasakak budurajanan wahansege nirmala bududahama regena mihinthala pawwata wadama karawema ada wani poson pura pasaloswaka pehoya dinaka siduwu bawa bauddha ethihasaye sadahan wei.dewanam piyathissa raju ethulu pirisa chullahaththi padhopama suthraya asa theruwan sarana yama sidu wuyeda ada wani poson pohoya dinakaya.\n"
+ ]
}
],
"source": [
- "preprocessing.get_sinhala_character_ratio(text, consider_special_character_as_sinhala=True, ignore_non_printable=False)"
+ "print(romanizer(corpus[0]))"
]
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
- "tokeniser = Tokenizer()"
+ "more_complex_text = corpus[1]"
]
},
{
"cell_type": "code",
- "execution_count": 31,
+ "execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
- "corpus = [\"\"\"මේ මාසයේ ගත වූ දින 15ක කාලය තුළ කොළඹ නගරය ආශ්රිත ව සීසීටීවී දර්ශන මඟින් වැරදිවලට සම්බන්ධ පුද්ගලයන් 793 දෙනෙකු හදුනාගත් බව පොලීසිය නිවේදනය කර තිබේ.\"\"\"\n",
- " \"\"\"මෑතකාලීන ව රට මුහුණ දුන් අභියෝගාත්මකම ආර්ථික කාරණාව ණය ප්රතිව්යුගතකරණය බව මුදල් රාජ්ය අමාත්ය ආචාර්ය රංජිත් සියඹලාපිටිය මහතා පවසයි.\"\"\",\n",
- " \"භාෂාව\"\n",
- " ]"
+ "more_complex_text = more_complex_text[:100] + \".... \\nIn linguistics, romanization is the conversion...., adding special chars ^^*#(&$^)\""
]
},
{
"cell_type": "code",
- "execution_count": 32,
+ "execution_count": 18,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'me athara poson poho dina paniwudayak nikuth karamin janadhipathiwaraya penwa denne mihidu maharahathan wahanse visi.... In linguistics, romanization is the conversion...., adding special chars ^^*#(&$^)'"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "tokeniser.train(corpus)"
+ "romanizer(more_complex_text)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## A few of the preprocessing methods available for Sinhala text"
]
},
{
"cell_type": "code",
- "execution_count": 33,
+ "execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
- "tokens = tokeniser(\"රට මුහුණ දුන් සිද්ධියේ\")"
+ "_, token_count = preprocessing.process_text_with_token_counts(corpus[0], consider_special_character_as_sinhala=False, ignore_non_printable=True)"
]
},
{
"cell_type": "code",
- "execution_count": 34,
+ "execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "[73, 37, 2, 68, 56, 38, 2, 62, 29, 2, 46, 54, 87, 4]"
+ "271"
]
},
- "execution_count": 34,
+ "execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "tokens"
+ "token_count"
]
},
{
"cell_type": "code",
- "execution_count": 35,
+ "execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
- "decoded_tokens = [tokeniser.token_id_to_token_map[id] for id in tokens]"
+ "more_complex_text += \"ශ්රී ලංකා ප්රජාතාන්ත්රික සමාජවාදී\""
]
},
{
"cell_type": "code",
- "execution_count": 36,
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "මේ අතර පොසොන් පොහෝ දින පණිවුඩයක් නිකුත් කරමින් ජනාධිපතිවරයා පෙන්වා දෙන්නේ මිහිඳු මහරහතන් වහන්සේ විසි.... \n",
+ "In linguistics, romanization is the conversion...., adding special chars ^^*#(&$^)ශ්රී ලංකා ප්රජාතාන්ත්රික සමාජවාදී\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(more_complex_text)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "'රට මුහුණ දුන් සිද්යේ'"
+ "'rs ^^*#(&$^)ශ්\\u200dරී ලංකා ප්\\u200dරජාතාන්ත්\\u200dරික සමාජවාදී'"
]
},
- "execution_count": 36,
+ "execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "\"\".join(decoded_tokens)"
+ "more_complex_text[-50:]"
]
},
{
"cell_type": "code",
- "execution_count": 37,
+ "execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "'රට මුහුණ දුන් සිද්යේ'"
+ "'මේ අතර පොසොන් පොහෝ දින පණිවුඩයක් නිකුත් කරමින් ජනාධිපතිවරයා පෙන්වා දෙන්නේ මිහිඳු මහරහතන් වහන්සේ විසි.... , ...., ^^*#(&$^)ශ්\\u200dරී ලංකා ප්\\u200dරජාතාන්ත්\\u200dරික සමාජවාදී'"
]
},
- "execution_count": 37,
+ "execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "tokeniser.decode(tokens)"
+ "preprocessing.remove_english_characters(more_complex_text)"
]
},
{
"cell_type": "code",
- "execution_count": 38,
+ "execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "['භා', 'ෂා', 'ව']"
+ "'rs ^^*#(&$^)ශ්රී ලංකා ප්රජාතාන්ත්රික සමාජවාදී'"
]
},
- "execution_count": 38,
+ "execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "[tokeniser.token_id_to_token_map[id] for id in tokeniser(\"භාෂාව\")]"
+ "preprocessing.remove_non_printable(more_complex_text[-50:])"
]
},
{
"cell_type": "code",
- "execution_count": 29,
+ "execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "['ස', 'ි', 'ං', 'හ', 'ල']"
+ "0.610738255033557"
]
},
- "execution_count": 29,
+ "execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "list(\"සිංහල\")"
+ "preprocessing.get_sinhala_character_ratio(more_complex_text)"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 27,
"metadata": {},
- "outputs": [],
- "source": []
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1.0"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "preprocessing.get_sinhala_character_ratio(\n",
+ " preprocessing.remove_english_characters(\n",
+ " more_complex_text\n",
+ " )\n",
+ ")"
+ ]
}
],
"metadata": {
+ "kernelspec": {
+ "display_name": "analysis-env",
+ "language": "python",
+ "name": "analysis"
+ },
"language_info": {
- "name": "python"
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.9"
}
},
"nbformat": 4,
diff --git a/pyproject.toml b/pyproject.toml
index 4398668..704f290 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "sinlib"
-version = "0.0.8.5"
+version = "0.0.8.6"
description = "Sinhala NLP Toolkit"
authors = [
{ name = "Ransaka", email = "ransaka.ravihara@gmail.com" }
diff --git a/src/sinlib/__init__.py b/src/sinlib/__init__.py
index e48d083..129cf78 100644
--- a/src/sinlib/__init__.py
+++ b/src/sinlib/__init__.py
@@ -1,7 +1,9 @@
from sinlib.tokenizer import Tokenizer
from sinlib.utils import preprocessing
+from sinlib.romanize import Romanizer
__all__ = [
"Tokenizer",
- "preprocessing"
+ "preprocessing",
+ "Romanizer"
]
diff --git a/src/sinlib/romanize.py b/src/sinlib/romanize.py
new file mode 100644
index 0000000..dafa4d3
--- /dev/null
+++ b/src/sinlib/romanize.py
@@ -0,0 +1,71 @@
+from .utils.preprocessing import load_char_mapper
+from .tokenizer import Tokenizer
+from .utils.preprocessing import DEFAULT_VOCAB_MAP_FP, CHAR_MAPPER_FP
+from .utils.chars import ALL_SINHALA_CHARACTERS, NUBERS_AND_PUNKTS
+from .utils.preprocessing import remove_non_printable
+import numpy as np
+
+
+class Romanizer:
+    """Romanize Sinhala text with a character mapper and a sinlib ``Tokenizer``."""
+
+    def __init__(self, char_mapper_fp: str = None, tokenizer_vocab_path: str = None):
+        """
+        Load the character mapper and the tokenizer vocabulary.
+
+        Parameters
+        ----------
+        char_mapper_fp : str, optional
+            Path given to ``load_char_mapper``; ``None`` selects ``CHAR_MAPPER_FP``.
+        tokenizer_vocab_path : str, optional
+            Path given to ``Tokenizer.load_from_pretrained``; ``None`` selects
+            ``DEFAULT_VOCAB_MAP_FP``.
+        """
+        if char_mapper_fp is None:
+            char_mapper_fp = CHAR_MAPPER_FP
+        if tokenizer_vocab_path is None:
+            tokenizer_vocab_path = DEFAULT_VOCAB_MAP_FP
+        self.char_mapper = load_char_mapper(char_mapper_fp)
+        self.tokenizer = Tokenizer()
+        self.tokenizer.load_from_pretrained(tokenizer_vocab_path)
+        # Built once so filtering is a set lookup per character instead of
+        # rebuilding and scanning a list for every character.
+        self._romanizable_chars = (
+            set(ALL_SINHALA_CHARACTERS) | set(NUBERS_AND_PUNKTS) | {" "}
+        )
+
+    def __call__(self, text: str) -> str:
+        """Return ``text`` with its Sinhala words romanized."""
+        return self.__romanize(text)
+
+    def __romanize(self, text: str) -> str:
+        """
+        Romanize the Sinhala words of ``text``.
+
+        Parameters
+        ----------
+        text : str
+            Text to romanize; it is passed through ``remove_non_printable`` first.
+
+        Returns
+        -------
+        str
+            The whitespace-separated words of the cleaned text joined by single
+            spaces, each replaced by its romanized form when one was produced.
+        """
+        text = remove_non_printable(text)
+        allowed = self._romanizable_chars
+        sinhala_text = "".join(ch for ch in text if ch in allowed).strip()
+        encodings = self.tokenizer(sinhala_text)
+        id_to_token = self.tokenizer.token_id_to_token_map
+        # A token with no mapping is kept as is; a ``None`` fallback here would
+        # make ``str.join`` raise TypeError.
+        romanized_sinhala = "".join(
+            self.char_mapper.get(token, token)
+            for token in (id_to_token[token_id] for token_id in encodings)
+        )
+        # Pair each filtered source word with its romanized form by position.
+        word_2_word_mapping = dict(
+            zip(sinhala_text.split(), romanized_sinhala.split())
+        )
+        return " ".join(word_2_word_mapping.get(word, word) for word in text.split())
diff --git a/src/sinlib/tokenizer.py b/src/sinlib/tokenizer.py
index f2436b9..b5bc0ee 100644
--- a/src/sinlib/tokenizer.py
+++ b/src/sinlib/tokenizer.py
@@ -1,26 +1,96 @@
+import json
+import warnings
+from pathlib import Path
import concurrent.futures
-from .utils.preprocessing import process_text
+from .utils.preprocessing import process_text, load_default_vocab_map
+
class Tokenizer:
def __init__(self):
+ self.unknown_token_id = None
+ self.token_id_to_token_map = None
+ self.vocab_map = None
self.unknown_token = ""
self.tokenized_chars = []
self.unique_chars = []
-
- def __encode(self, text):
+
+ def __encode(self, text) -> list:
processed_text = self.__process_text(text)
- encoded_text = [self.vocab_map.get(char, self.unknown_token_id) for char in processed_text]
+ encoded_text = [
+ self.vocab_map.get(char, self.unknown_token_id) for char in processed_text
+ ]
return encoded_text
-
- def __call__(self, text):
+
+ def __call__(self, text) -> list:
+ """
+ Encode the given text into a list of integer token IDs.
+
+ Parameters
+ ----------
+ text : str
+ Text to be encoded.
+
+ Returns
+ -------
+ encoded_tokens : list of int
+ List of token IDs representing the encoded text.
+
+ Examples
+ --------
+ >>> from sinlib import Tokenizer
+ >>> corpus = [...]
+ >>> tokenizer = Tokenizer()
+ >>> tokenizer.train(corpus)
+ >>> tokenizer("මම ගෙදර ගියා")
+ [2041, 2041, 942, 965, 624, 909, 942, 54, 1960]
+ """
return self.__encode(text)
-
- def decode(self, ids):
- return "".join([self.token_id_to_token_map.get(token,self.unknown_token) for token in ids])
- def train(self, text_list):
- self.__train_chracter_level_tokenizer(text_list)
-
+ def decode(self, ids) -> str:
+ """
+ Decode a list of token IDs into a string.
+
+ Parameters
+ ----------
+ ids : list of int
+ List of token IDs to be decoded.
+
+ Returns
+ -------
+ decoded_text : str
+ The decoded text string.
+
+ Examples
+ --------
+ >>> from sinlib import Tokenizer
+ >>> tokenizer = Tokenizer()
+ >>> tokenizer.train([...])
+ >>> encoded_tokens = [2041, 2041, 942, 965, 624, 909, 942, 54, 1960]
+ >>> tokenizer.decode(encoded_tokens)
+ 'මම ගෙදර ගියා'
+ """
+ return "".join(
+ [self.token_id_to_token_map.get(token, self.unknown_token) for token in ids]
+ )
+
+ def train(self, text_list) -> None:
+ """
+ Train the tokenizer on a list of text strings.
+
+ Parameters
+ ----------
+ text_list : list of str
+ List of text strings to be used for training the tokenizer.
+
+ Examples
+ --------
+ >>> from sinlib import Tokenizer
+ >>> corpus = [...]
+ >>> tokenizer = Tokenizer()
+ >>> tokenizer.train(corpus)
+ """
+ self.__train_character_level_tokenizer(text_list)
+
def __len__(self):
return len(self.vocab_map)
@@ -28,12 +98,64 @@ def __len__(self):
def __process_text(t):
return process_text(t)
- def __train_chracter_level_tokenizer(self, text_list):
+ def __train_character_level_tokenizer(self, text_list):
with concurrent.futures.ThreadPoolExecutor() as executor:
results = list(executor.map(self.__process_text, text_list))
self.tokenized_chars = [char for sublist in results for char in sublist]
self.unique_chars = set(self.tokenized_chars)
- self.vocab_map = dict(zip(self.unique_chars,range(len(self.unique_chars))))
+ self.vocab_map = dict(zip(self.unique_chars, range(len(self.unique_chars))))
self.vocab_map[self.unknown_token] = len(self.vocab_map)
self.unknown_token_id = self.vocab_map[self.unknown_token]
- self.token_id_to_token_map = {value:key for key,value in self.vocab_map.items()}
\ No newline at end of file
+ self.token_id_to_token_map = {
+ value: key for key, value in self.vocab_map.items()
+ }
+
+ def load_from_pretrained(self, file_path: str) -> None:
+ """
+ Load the vocabulary map from a pre-trained file.
+
+ Parameters
+ ----------
+ file_path : str
+ Path to the file containing the pre-trained vocabulary map.
+
+ Returns
+ -------
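+ Tokenizer — this same instance; the method ends with ``return self`` despite the ``-> None`` annotation.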
+
+ Warns
+ -----
+ UserWarning
+ If the file is not found at the specified path, a default vocabulary map is loaded and a warning is issued.
+
+ Examples
+ --------
+ >>> from sinlib import Tokenizer
+ >>> tokenizer = Tokenizer()
+ >>> tokenizer.load_from_pretrained("pretrained_vocab.json")
+ """
+ if Path(file_path).is_file():
+ with open(file_path, "r") as f:
+ self.vocab_map = json.load(f)
+ else:
+ warnings.warn(
+ "File not found at the specified path. Loaded default vocab map.",
+ UserWarning,
+ )
+ self.vocab_map = load_default_vocab_map()
+
+ self.token_id_to_token_map = {
+ value: key for key, value in self.vocab_map.items()
+ }
+ self.unknown_token_id = self.vocab_map[self.unknown_token]
+ return self
+
+ def save_tokenizer(self, save_path: str):
+ save_path = Path(save_path)
+ configurations = {"unknown_token": self.unknown_token}
+
+ with open(save_path / "vocab.json", "w", encoding="utf-8") as file:
+ json.dump(self.vocab_map, file, ensure_ascii=False, indent=4)
+
+ with open(save_path / "config.json", "w") as file:
+ json.dump(configurations, file, indent=4)
\ No newline at end of file
diff --git a/src/sinlib/utils/chars.py b/src/sinlib/utils/chars.py
index 3dd2cb1..228b247 100644
--- a/src/sinlib/utils/chars.py
+++ b/src/sinlib/utils/chars.py
@@ -1,43 +1,188 @@
from string import punctuation
-BASE_CONSONANTS = [
- 'ක', 'ඛ', 'ග', 'ඝ', 'ඞ', 'ඟ',
- 'ච', 'ඡ', 'ජ', 'ඣ', 'ඤ', 'ඦ',
- 'ට', 'ඨ', 'ඩ', 'ඪ', 'ණ', 'ඬ',
- 'ත', 'ථ', 'ද', 'ධ', 'න', 'ඳ',
- 'ප', 'ඵ', 'බ', 'භ', 'ම', 'ඹ',
- 'ය', 'ර', 'ල', 'ව',
- 'ශ', 'ෂ', 'ස', 'හ', 'ළ', 'ෆ',
+ALL_SINHALA_CHARACTERS = [
+ "ඏ",
+ "ඛ",
+ "ම",
+ "ඍ",
+ "ு",
+ "ා",
+ "ප",
+ "ඝ",
+ "ඹ",
+ "ඓ",
+ "ෑ",
+ "ෂ",
+ "ැ",
+ "ෲ",
+ "ි",
+ "ක",
+ "ණ",
+ "ධ",
+ "்",
+ "ඵ",
+ "ඞ",
+ "ජ",
+ "හ",
+ "ෝ",
+ "ඤ",
+ "ට",
+ "ඇ",
+ "ෞ",
+ "ඒ",
+ "ූ",
+ "ව",
+ "ඣ",
+ "ච",
+ "ඖ",
+ "ෘ",
+ "ු",
+ "ඳ",
+ "ඌ",
+ "ෙ",
+ "්",
+ "ඥ",
+ "ீ",
+ "ෛ",
+ "ෳ",
+ "ඔ",
+ "ආ",
+ "ළ",
+ "උ",
+ "ඟ",
+ "ඃ",
+ "ඈ",
+ "ඪ",
+ "බ",
+ "අ",
+ "ෆ",
+ "ත",
+ "ේ",
+ "ඬ",
+ "ය",
+ "ො",
+ "ශ",
+ "භ",
+ "ං",
+ "ර",
+ "ඉ",
+ "ඨ",
+ "ී",
+ "ඕ",
+ "ඡ",
+ "න",
+ "ස",
+ "ද",
+ "ඩ",
+ "ෟ",
+ "ග",
+ "එ",
+ "ඊ",
+ "ල",
+ "ථ",
]
-SAN = [
- 'ඟ', 'ඦ', 'ඬ', 'ඳ', 'ඹ'
+BASE_CONSONANTS = [
+ "ක",
+ "ඛ",
+ "ග",
+ "ඝ",
+ "ඞ",
+ "ඟ",
+ "ච",
+ "ඡ",
+ "ජ",
+ "ඣ",
+ "ඤ",
+ "ඦ",
+ "ට",
+ "ඨ",
+ "ඩ",
+ "ඪ",
+ "ණ",
+ "ඬ",
+ "ත",
+ "ථ",
+ "ද",
+ "ධ",
+ "න",
+ "ඳ",
+ "ප",
+ "ඵ",
+ "බ",
+ "භ",
+ "ම",
+ "ඹ",
+ "ය",
+ "ර",
+ "ල",
+ "ව",
+ "ශ",
+ "ෂ",
+ "ස",
+ "හ",
+ "ළ",
+ "ෆ",
]
-SAN_MAPPING = {'ඟ': 'ංග', 'ඦ': 'ඤ්ජ', 'ඬ': 'ණ්ඩ', 'ඳ': 'න්ද', 'ඹ': 'ම්බ'}
+SAN = ["ඟ", "ඦ", "ඬ", "ඳ", "ඹ"]
+
+SAN_MAPPING = {"ඟ": "ංග", "ඦ": "ඤ්ජ", "ඬ": "ණ්ඩ", "ඳ": "න්ද", "ඹ": "ම්බ"}
REVERSE_SAN_MAPPING = {d: v for v, d in SAN_MAPPING.items()}
-CONSONANTS = [c + '්' for c in BASE_CONSONANTS]
+CONSONANTS = [c + "්" for c in BASE_CONSONANTS]
VOWELS = [
- 'අ', 'ආ', 'ඇ', 'ඈ', 'ඉ', 'ඊ', 'උ', 'ඌ',
- 'ඍ', 'ඎ', 'එ', 'ඒ', 'ඓ', 'ඔ', 'ඕ', 'ඖ',
- 'අං', 'අඃ',
+ "අ",
+ "ආ",
+ "ඇ",
+ "ඈ",
+ "ඉ",
+ "ඊ",
+ "උ",
+ "ඌ",
+ "ඍ",
+ "ඎ",
+ "එ",
+ "ඒ",
+ "ඓ",
+ "ඔ",
+ "ඕ",
+ "ඖ",
+ "අං",
+ "අඃ",
]
VOWEL_DIACRITICS = [
- '', 'ා', 'ැ', 'ෑ', 'ි', 'ී', 'ු', 'ූ', 'ෘ',
- 'ෲ', 'ෙ', 'ේ', 'ෛ', 'ො', 'ෝ', 'ෞ',
- 'ං', 'ඃ', '්', 'ෳ'
+ "",
+ "ා",
+ "ැ",
+ "ෑ",
+ "ි",
+ "ී",
+ "ු",
+ "ූ",
+ "ෘ",
+ "ෲ",
+ "ෙ",
+ "ේ",
+ "ෛ",
+ "ො",
+ "ෝ",
+ "ෞ",
+ "ං",
+ "ඃ",
+ "්",
+ "ෳ",
]
LONG_TO_SHORT_VOWEL_DIACRITICS_MAPPING = {
- '': 'ා',
- 'ෑ': 'ැ',
- 'ී': 'ි',
- 'ූ': 'ු',
- 'ේ': 'ෙ',
- 'ෝ': 'ො'
+ "": "ා",
+ "ෑ": "ැ",
+ "ී": "ි",
+ "ූ": "ු",
+ "ේ": "ෙ",
+ "ෝ": "ො",
}
DIACRITICS_MAPPING = {v: d for v, d in zip(VOWELS, VOWEL_DIACRITICS)}
@@ -45,49 +190,111 @@
REVERSE_DIACRITICS_MAPPING = {d: v for v, d in zip(VOWELS, VOWEL_DIACRITICS)}
CONJUNCT_CONSONANTS = [
- 'ක්ර', 'ඛ්ර', 'ග්ර', 'ඝ්ර', 'ඞ්ර', 'ඟ්ර',
- 'ක්ය', 'ඛ්ය', 'ග්ය', 'ඝ්ය', 'ඞ්ය', 'ඟ්ය',
- 'ක්ෂ', '෴',
+ "ක්ර",
+ "ඛ්ර",
+ "ග්ර",
+ "ඝ්ර",
+ "ඞ්ර",
+ "ඟ්ර",
+ "ක්ය",
+ "ඛ්ය",
+ "ග්ය",
+ "ඝ්ය",
+ "ඞ්ය",
+ "ඟ්ය",
+ "ක්ෂ",
+ "෴",
]
NUMERALS = [
- '𑇡', '𑇢', '𑇣', '𑇤', '𑇥', '𑇦', '𑇧', '𑇨', '𑇩', '𑇪',
- '𑇫', '𑇬', '𑇭', '𑇮', '𑇯', '𑇰', '𑇱', '𑇲', '𑇳', '𑇴',
+ "𑇡",
+ "𑇢",
+ "𑇣",
+ "𑇤",
+ "𑇥",
+ "𑇦",
+ "𑇧",
+ "𑇨",
+ "𑇩",
+ "𑇪",
+ "𑇫",
+ "𑇬",
+ "𑇭",
+ "𑇮",
+ "𑇯",
+ "𑇰",
+ "𑇱",
+ "𑇲",
+ "𑇳",
+ "𑇴",
]
GOSHA_LETTERS = [
- 'අ', 'ආ', 'ඇ', 'ඈ', 'ඉ', 'ඊ', 'උ', 'ඌ',
- 'ඍ', 'ඎ', 'එ', 'ඒ', 'ඓ', 'ඔ', 'ඕ', 'ඖ',
- 'අං', 'අඃ',
- 'ග', 'ඝ', 'ඞ',
- 'ජ', 'ඣ', 'ඤ',
- 'ඩ', 'ඪ', 'ණ',
- 'ද', 'ධ', 'න',
- 'බ', 'භ', 'ම',
- 'ය', 'ර', 'ල', 'ව',
- 'හ'
+ "අ",
+ "ආ",
+ "ඇ",
+ "ඈ",
+ "ඉ",
+ "ඊ",
+ "උ",
+ "ඌ",
+ "ඍ",
+ "ඎ",
+ "එ",
+ "ඒ",
+ "ඓ",
+ "ඔ",
+ "ඕ",
+ "ඖ",
+ "අං",
+ "අඃ",
+ "ග",
+ "ඝ",
+ "ඞ",
+ "ජ",
+ "ඣ",
+ "ඤ",
+ "ඩ",
+ "ඪ",
+ "ණ",
+ "ද",
+ "ධ",
+ "න",
+ "බ",
+ "භ",
+ "ම",
+ "ය",
+ "ර",
+ "ල",
+ "ව",
+ "හ",
]
AGOSHA_LETTERS = [
- 'ක්', 'ඛ්',
- 'ච්', 'ඡ්',
- 'ට්', 'ඨ්',
- 'ත්', 'ථ්',
- 'ප්', 'ඵ්',
+ "ක්",
+ "ඛ්",
+ "ච්",
+ "ඡ්",
+ "ට්",
+ "ඨ්",
+ "ත්",
+ "ථ්",
+ "ප්",
+ "ඵ්",
]
AGOSHA_TO_GOSHA_MAPPING = {
- 'ක්': 'ග්',
- 'ඛ්': 'ඝ්',
- 'ච්': 'ජ්',
- 'ඡ්': 'ඣ්',
- 'ට්': 'ඩ්',
- 'ඨ්': 'ඪ්',
- 'ත්': 'ද්',
- 'ථ්': 'ධ්',
- 'ප්': 'බ්',
- 'ඵ්': 'භ්',
+ "ක්": "ග්",
+ "ඛ්": "ඝ්",
+ "ච්": "ජ්",
+ "ඡ්": "ඣ්",
+ "ට්": "ඩ්",
+ "ඨ්": "ඪ්",
+ "ත්": "ද්",
+ "ථ්": "ධ්",
+ "ප්": "බ්",
+ "ඵ්": "භ්",
}
PUNKT = set(punctuation)
NUMBERS = set("1234567890")
diff --git a/src/sinlib/utils/preprocessing.py b/src/sinlib/utils/preprocessing.py
index b2be711..3ce505f 100644
--- a/src/sinlib/utils/preprocessing.py
+++ b/src/sinlib/utils/preprocessing.py
@@ -2,13 +2,32 @@
import multiprocessing
import re
from .chars import VOWEL_DIACRITICS, NUBERS_AND_PUNKTS, ALL_LETTERS
-import numpy as np
-import os
+import json
+from pathlib import Path
+import warnings
-# file_path = os.path.join(os.path.dirname(__file__), '../data', 'sinhala_chars_with_special_chars.txt')
+DEFAULT_VOCAB_MAP_FP = "../data/vocab_map.json"
+CHAR_MAPPER_FP = "../data/char_map.json"
-# with open(file_path,'r') as f:
-# SINHALA_CHARS_WITH_SPECIAL_CHARS = f.read().split("\n")
+
+def load_char_mapper(char_mapper_fp):
+ if Path(char_mapper_fp).is_file():
+ with open(char_mapper_fp, "r") as f:
+ char_mapper = json.load(f)
+ else:
+ warnings.warn(
+ "File not found at the specified path. Loaded default char map.",
+ UserWarning,
+ )
+ with open(CHAR_MAPPER_FP, "r") as f:
+ char_mapper = json.load(f)
+ return char_mapper
+
+
+def load_default_vocab_map():
+ with open(DEFAULT_VOCAB_MAP_FP, "r") as f:
+ vocab_map = json.load(f)
+ return vocab_map
def remove_non_printable(input_string):
@@ -69,7 +88,38 @@ def process_text(t):
return tokenized_chars
-def process_text_with_token_counts(t:str, consider_special_character_as_sinhala:bool, ignore_non_printable:bool):
+def process_text_with_token_counts(
+ t: str, consider_special_character_as_sinhala: bool, ignore_non_printable: bool
+):
+ """
+ Process the given text, tokenizing it and counting the tokens.
+
+ Parameters
+ ----------
+ t : str
+ The text to be processed.
+ consider_special_character_as_sinhala : bool
+ If True, special characters will be considered as Sinhala characters.
+ ignore_non_printable : bool
+ If True, non-printable characters will be removed from the text.
+
+ Returns
+ -------
+ tokenized_chars : list of str
+ List of tokenized characters from the text.
+ token_counts : int
+ Count of tokens treated as Sinhala; get_sinhala_character_ratio divides it by the number of non-space tokens.
+
+ Examples
+ --------
+ >>> from sinlib.utils.preprocessing import process_text_with_token_counts
+ >>> text = "මම ගෙදර ගියා."
+ >>> tokenized_chars, token_counts = process_text_with_token_counts(text, True, True)
+ >>> print(tokenized_chars)
+ ['ම', 'ම', ' ', 'ගෙ', 'ද', 'ර', ' ', 'ගි', 'යා', '.']
+ >>> print(token_counts)
+ 10
+ """
if ignore_non_printable:
t = remove_non_printable(t)
@@ -92,25 +142,62 @@ def process_text_with_token_counts(t:str, consider_special_character_as_sinhala:
tokenized_chars.append(char + t[i + 1])
else:
tokenized_chars.append(char)
-
else:
tokenized_chars.append(char)
return tokenized_chars, token_counts
-def get_sinhala_character_ratio(text, consider_special_character_as_sinhala:bool=True, ignore_non_printable:bool=True):
- """Retuning sinhala character ratio for given text string for given settings. Expects optional two parameters.
- consider_special_character_as_sinhala: if this set to true all numbers and special characters will consider as sinhala.
- ignore_non_printable: if this set to true non printables will remove before start processing
+def get_sinhala_character_ratio(
+ text,
+ consider_special_character_as_sinhala: bool = True,
+ ignore_non_printable: bool = True,
+):
+ """
+ Calculate the ratio of Sinhala characters in the given text.
+
+ Parameters
+ ----------
+ text : str or list of str
+ The text or list of text strings to be processed.
+ consider_special_character_as_sinhala : bool, default=True
+ If True, numbers and special characters will be considered as Sinhala characters.
+ ignore_non_printable : bool, default=True
+ If True, non-printable characters will be removed before processing.
+
+ Returns
+ -------
+ ratio : float or list of float
+ The ratio of Sinhala characters in the text. If the input is a list, returns a list of ratios for each text string.
+
+ Examples
+ --------
+ >>> from sinlib.utils.preprocessing import get_sinhala_character_ratio
+ >>> text = "මම ගෙදර ගියා."
+ >>> ratio = get_sinhala_character_ratio(text, True, True)
+ >>> print(ratio)
+ 1.0
+
+ >>> texts = ["මම ගෙදර ගියා.", "This is an example."]
+ >>> ratios = get_sinhala_character_ratio(texts, False, True)
+ >>> print(ratios)
+ [0.875, 0.0]
"""
if isinstance(text, str):
- tokenized_text, sinhala_token_count = process_text_with_token_counts(text,consider_special_character_as_sinhala,ignore_non_printable=ignore_non_printable)
+ tokenized_text, sinhala_token_count = process_text_with_token_counts(
+ text,
+ consider_special_character_as_sinhala,
+ ignore_non_printable=ignore_non_printable,
+ )
tokenized_text = [tok for tok in tokenized_text if tok != " "]
return sinhala_token_count / len(tokenized_text)
elif isinstance(text, list):
pool = multiprocessing.Pool()
- partial_process_text = partial(process_text_with_token_counts, consider_special_character_as_sinhala=consider_special_character_as_sinhala, ignore_non_printable=ignore_non_printable)
+ partial_process_text = partial(
+ process_text_with_token_counts,
+ consider_special_character_as_sinhala=consider_special_character_as_sinhala,
+ ignore_non_printable=ignore_non_printable,
+ )
results = pool.map(partial_process_text, text)
pool.close()
pool.join()