diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..26d3352
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,3 @@
+# Default ignored files
+/shelf/
+/workspace.xml
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..8596b18
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="Black">
+    <option name="enabledOnReformat" value="true" />
+    <option name="executionMode" value="BINARY" />
+    <option name="pathToExecutable" value="/opt/homebrew/bin/black" />
+    <option name="sdkName" value="Python 3.10 (sinlib)" />
+  </component>
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (sinlib)" project-jdk-type="Python SDK" />
+</project>
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..5d481c1
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/sinlib.iml" filepath="$PROJECT_DIR$/.idea/sinlib.iml" />
+    </modules>
+  </component>
+</project>
\ No newline at end of file
diff --git a/.idea/sinlib.iml b/.idea/sinlib.iml
new file mode 100644
index 0000000..ee28fd3
--- /dev/null
+++ b/.idea/sinlib.iml
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$">
+      <excludeFolder url="file://$MODULE_DIR$/.venv" />
+    </content>
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+  <component name="PyDocumentationSettings">
+    <option name="format" value="GOOGLE" />
+    <option name="myDocStringFormat" value="Google" />
+  </component>
+</module>
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..35eb1dd
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="" vcs="Git" />
+  </component>
+</project>
\ No newline at end of file
diff --git a/README.md b/README.md
index b7a1c66..2e5fb41 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# Sinlib (Buggy alpha version)
+# Sinlib
 
 ![Alt text](sinlib.png)
 
@@ -29,14 +29,27 @@ encoding = tokenizer("මේ අතර, පෙබරවාරි මාසයේ
 [tokenizer.token_id_to_token_map[id] for id in encoding]
 ['මේ', ' ', 'අ', 'ත', 'ර', ',', ' ', 'පෙ', 'බ', 'ර', 'වා', 'රි', ' ', 'මා', 'ස', 'යේ', ' ', 'ප', 'ළ', 'මු']
 ```
+
 02. Preprocessor
    ```python
 sent = ['මෙය සිංහල වාක්‍යක්', 'මෙය සිංහල වාක්‍යක් සමග english character කීපයක්','This is complete english sentence']
 print(sent)
-['මෙය සිංහල වාක්\u200dයක්', 'මෙය සිංහල වාක්\u200dයක් සමග english character කීපයක්', 'This is complete english sentence']
+#['මෙය සිංහල වාක්\u200dයක්', 'මෙය සිංහල වාක්\u200dයක් සමග english character කීපයක්', 'This is complete english sentence']
 
 from sinlib.preprocessing import get_sinhala_character_ratio
 
 get_sinhala_character_ratio(sent)
-[0.9, 0.46875, 0.0]
+#[0.9, 0.46875, 0.0]
+```
+
+03. Sinhala Romanizer
+   ```python
+texts = ["hello, මේ මාසයේ ගත වූ දින 15ක කාලය තුළ කොළඹ නගරය ආශ්‍රිත ව", "මෑතකාලීන ව රට මුහුණ දුන් අභියෝගාත්මකම ආර්ථික කාරණාව ණය ප්‍රතිව්‍යුගතකරණය බව මුදල් රාජ්‍ය අමාත්‍ය ආචාර්ය රංජිත් සියඹ$$$ mahatha see more****"]
+
+from sinlib import Romanizer
+
+romanizer = Romanizer(char_mapper_fp = None, tokenizer_vocab_path = None)
+romanizer(texts)
+#['hello, me masaye gatha wu dina 15ka kalaya thula kolaba nagaraya ashritha wa',
+# 'methakaleena wa rata muhuna dun abhiyogathmakama arthika karanawa naya prathiwyugathakaranaya bawa mudal rajya amathya acharya ranjith siyaba$$$ mahatha see more****']
 ```
diff --git a/data/char_map.json b/data/char_map.json
new file mode 100644
index 0000000..96e71af
--- /dev/null
+++ b/data/char_map.json
@@ -0,0 +1 @@
+{"\u0dc6\u0dd2": "fi", "\u0db6\u0dd8": "bru", "\u0d86\u0dd9": "aa", "\u0d9f\u0dd4": "gu", "\u0dc1\u0dd9": "she", "\u0dbb\u0dd8": "ru", "\u0daa": "ta", "\u0db4\u0d83": "ph", "\u0dab\u0dd6": "nu", "\u0db4\u0dd3": "pe", "\u0dc6\u0dcf": "fa", "\u0d8c": "u", "\u0dab": "na", "\u0daf\u0ddb": "dai", "\u0dc4\u0dd8": "hur", "\u0daf\u0dd1": "de", "\u0d9d\u0dd9": "ghe", "\u0dbd\u0dd9\u0dcf": "lo", "\u0dbd\u0d82": "lan", "\u0dc5": "la", "\u0dc0\u0dd0": "wa", "\u0d9a\u0d82": "kan", "\u0d87\u0dd4": "e", "\u0dab\u0dd8": "nru", "\u0db4\u0dd1": "pe", "\u0d9a\u0ddb": "kai", "\u0d85\u0d83": "a", "\u0d9c\u0dd9\u0dcf": "go", "\u0dba\u0d83": "yan", "\u0dba\u0dd9": "ye", "\u0db1\u0dd0": "ne", "\u0dbb": "ra", "\u0da1\u0dd9": "che", "\u0db6\u0dd1": "be", "\u0d9c\u0dd2": "gi", "\u0dae\u0dd9\u0dcf": "tho", "\u0d95": "o", "\u0da7\u0dd9\u0dcf": "to", "\u0dae\u0dd9": "the", "\u0da0\u0dd3": "chi", "\u0da9\u0dd4": "du", "\u0db6\u0dd2": "bi", "\u0d8b": "u", "\u0d9c\u0df2": "gru", "\u0dbd\u0dd4": "lu", "\u0dc2\u0dd8": "shru", "\u0dc2\u0dd3": "shi", "\u0da8\u0d82": "tan", "\u0d91\u0dd0": "e", "\u0dbd\u0dd0": "la", "\u0dc3\u0dd9\u0ddf": "sau", "\u0d9a\u0dd9": "ke", "\u0d9c\u0dd3": "ge", "\u0db5\u0dd4": "pu", "\u0d87\u0dd0": "e", "\u0d9f\u0dd0": "ge", "\u0db8\u0dd6": "mu", "\u0d9c\u0dd0": "ge", "\u0dae": "tha", "\u0d9f\u0dd9\u0dcf": "go", "\u0da4\u0dd4": "du", "\u0db1\u0d82": "nam", "\u0d91\u0ddb": "e", "\u0da7\u0dd3": "ti", "\u0daf\u0dd3": "di", "\u0d9f\u0dd2": "gi", "\u0da7\u0dd8": "tru", "\u0dac\u0dd2": "di", "\u0db5\u0ddb": "pi", "\u0db8\u0dcf": "ma", "\u0db7\u0dd4": "bhu", "\u0db3\u0dd0": "dhe", "\u0da8\u0dd9\u0dcf": "tho", "\u0d9c\u0dd9\u0ddf": "gau", "\u0daf\u0dd9": "de", "\u0da8": "ta", "\u0d88\u0d82": "een", "\u0d9d\u0dcf": "gha", "\u0da9\u0dcf": "da", "\u0dc3\u0dd8": "ru", "\u0d95\u0d82": "oon", "\u0d8b\u0dd1": "ue", "\u0db9\u0dd9\u0ddf": "au", "\u0da2\u0dd9\u0dcf": "jo", "\u0da0\u0dd9\u0ddf": "chau", "\u0dbd": "la", "\u0db8\u0dd9\u0ddf": "mau", "\u0db8\u0dd9\u0dcf": "mo", "\u0dc6\u0dd1": 
"fa", "\u0da1\u0dd4": "ju", "\u0dc3\u0dd9": "se", "\u0da9\u0dd0": "da", "\u0db0\u0ddb": "dhai", "\u0d8b\u0dd6": "u", "\u0da6": "cha", "\u0d8b\u0dd4": "u", "\u0d87\u0d82": "en", "\u0dbb\u0dd9\u0dcf": "ro", "\u0dba": "ya", "\u0d9b\u0dd4": "ku", "\u0dab\u0dd9\u0dcf": "no", "\u0da4\u0dd9\u0dcf": "gho", "\u0daa\u0dcf": "dha", "\u0dbb\u0dd6": "ru", "\u0dbb\u0dd4": "ru", "\u0d9b\u0dd6": "ku", "\u0dab\u0dd2": "ni", "\u0db6\u0dd9\u0ddf": "bau", "\u0d8d\u0dd0": "ru", "\u0da7\u0dd4": "tu", "\u0dc4\u0d82": "han", "\u0da9\u0dd1": "de", "\u0da0\u0ddb": "chai", "\u0dc5\u0dd9": "le", "\u0db0\u0dd3": "dhi", "\u0da9\u0dd9": "de", "\u0dc4\u0dd9\u0dcf": "ho", "\u0d9a\u0dd0": "ke", "\u0d9a\u0dd2": "ki", "\u0db3\u0dd6": "du", "\u0db3\u0dd9": "dhe", "\u0d9e": "n", "\u0d9f": "ga", "\u0da9\u0df2": "dru", "\u0dba\u0dd9\u0dcf": "yo", "\u0db4\u0dd0": "pe", "\u0dc2": "sha", "\u0dae\u0dcf": "tha", "\u0dad\u0d83": "th", "\u0d86\u0d82": "aan", "\u0db5": "pa", "\u0dc0\u0dd9\u0ddf": "chau", "\u0dba\u0dd6": "yu", "\u0dc6\u0dd9\u0dcf": "fo", "\u0db1\u0dd9\u0dcf": "no", "\u0d9c\u0d82": "gan", "\u0da7\u0dd1": "te", "\u0db3\u0dcf": "dha", "\u0d92": "e", "\u0dc4\u0d83": "ha", "\u0d96": "au", "\u0d9a\u0dd1": "ke", "\u0d8a": "e", "\u0dac\u0dcf": "da", "\u0dad\u0dd9": "the", "\u0dc2\u0d82": "shan", "\u0d9a\u0d83": "kan", "\u0da2\u0dd8": "jru", "\u0db3\u0dd9\u0dcf": "dho", "\u0da8\u0dd3": "ti", "\u0db4\u0d82": "pan", "\u0dc5\u0dd9\u0dcf": "lo", "\u0dbd\u0ddb": "lai", "\u0db0\u0dd1": "dee", "\u0d85\u0dd1": "ee", "\u0db0\u0dd9\u0dcf": "dho", "\u0db9\u0dd3": "bi", "\u0d9f\u0dd9": "ge", "\u0da6\u0dcf": "ja", "\u0db9": "ba", "\u0dbb\u0dd9": "re", "\u0da8\u0dd0": "te", "\u0dc3\u0dd9\u0dcf": "so", "\u0dc2\u0dd9\u0dcf": "sho", "\u0dc0\u0dd6": "wu", "\u0db8\u0d82": "man", "\u0dc6\u0dd6": "fu", "\u0da7\u0d82": "tan", "\u0d94\u0dd1": "we", "\u0d8d\u0dd3": "ri", "\u0da4\u0dd0": "ke", "\u0daf\u0dd9\u0ddf": "dau", "\u0da3\u0dcf": "ja", "\u0dc1": "sha", "\u0dc2\u0d83": "sha", "\u0d9e\u0dcf": "dha", "\u0da0\u0dd2": "chi", 
"\u0dad\u0dd4": "thu", "\u0db9\u0dd6": "bhu", "\u0dc0\u0dd4": "wu", "\u0da2\u0dd9": "je", "\u0dc0\u0dd9\u0dcf": "wo", "\u0d86\u0dcf": "aa", "\u0dc2\u0dd2": "shi", "\u0dae\u0dd2": "thi", "\u0da9": "da", "\u0db6\u0d82": "ban", "\u0dc3\u0dd1": "se", "\u0dad\u0dd9\u0dcf": "tho", "\u0db0\u0dd0": "dhe", "\u0dc6\u0dd3": "fee", "\u0dc5\u0dd0": "le", "\u0d89": "e", "\u0da0\u0dcf": "cha", "\u0dc6\u0dd9": "fe", "\u0daf\u0dd6": "du", "\u0d9b\u0dd3": "ki", "\u0dc4\u0dcf": "ha", "\u0d94\u0dd3": "o", "\u0dbb\u0dd2": "ri", "\u0dc4\u0ddb": "hai", "\u0d9b": "ka", "\u0db7": "bha", "\u0db7\u0dd9": "bhe", "\u0d91\u0dd3": "ee", "\u0db1\u0dd9": "ne", "\u0dc3": "sa", "\u0daf\u0dd2": "di", "\u0dab\u0dd9": "ne", "\u0c02": "n", "\u0db4": "pa", "\u0da0\u0dd1": "che", "\u0dc5\u0dcf": "la", "\u0db0\u0dd9\u0ddf": "dhau", "\u0db7\u0ddb": "bhi", "\u0dae\u0d83": "tha", "\u0d9f\u0dcf": "gha", "\u0db4\u0dd9": "pe", "\u0d9c\u0dd1": "ge", "\u0dc1\u0dd3": "shi", "\u0dc3\u0dd2": "si", "\u0db7\u0dd9\u0ddf": "bhau", "\u0d9e\u0dd3": "n", "\u0dbd\u0dd2": "li", "\u0d86": "a", "\u0dbb\u0df2": "ru", "\u0d85\u0dd9": "a", "\u0dad\u0dd1": "the", "\u0db6\u0ddb": "bai", "\u0dc4\u0dd9": "he", "\u0db6\u0dd9": "be", "\u0dba\u0ddb": "yai", "\u0dc0\u0ddb": "wai", "\u0db0\u0df2": "dru", "\u0dc4\u0dd3": "he", "\u0da5": "gha", "\u0dc2\u0dd9": "she", "\u0dc0\u0dd2": "vi", "\u0da2\u0dcf": "ja", "\u0da7\u0dd9": "te", "\u0dba\u0dd2": "i", "\u0da7": "ta", "\u0d9b\u0dcf": "ka", "\u0dac\u0dd4": "du", "\u0da2\u0dd2": "ji", "\u0dac": "da", "\u0dc2\u0dd1": "she", "\u0da1": "cha", "\u0db0\u0dd8": "dhru", "\u0da2\u0ddb": "jai", "\u0db8": "ma", "\u0dac\u0dd1": "de", "\u0dc2\u0ddb": "shai", "\u0da1\u0dd2": "chi", "\u0d9b\u0dd9": "ke", "\u0db3": "da", "\u0db6\u0dd9\u0dcf": "bo", "\u0dc4": "ha", "\u0da1\u0dd0": "je", "\u0da7\u0dd6": "tu", "\u0d9b\u0d82": "kan", "\u0d9c": "ga", "\u0db5\u0dd1": "pe", "\u0daa\u0dd2": "dhi", "\u0dc1\u0dd9\u0dcf": "sho", "\u03bf": "n", "\u0d89\u0dd0": "e", "\u0dc3\u0d83": "san", "\u0d9d": "gha", "\u0d9b\u0dd1": 
"ke", "\u0dc0\u0dd1": "we", "\u0d8a\u0dd9": "ee", "\u0db6\u0dd3": "bi", "\u0dc3\u0dd6": "su", "\u0da9\u0dd3": "di", "\u0dbb\u0dcf": "ra", "\u0dc1\u0d82": "shan", "\u0db7\u0dd9\u0dcf": "bho", "\u0daf\u0d82": "dan", "\u0da2\u0dd6": "ju", "\u0dba\u0dd8": "yur", "\u0d89\u0dd2": "e", "\u0dc0": "wa", "\u0db7\u0dcf": "bha", "\u0db7\u0dd2": "bhi", "\u0daa\u0dd4": "dhu", "\u0d94\u0dd8": "au", "\u0d9a\u0dd6": "ku", "\u0dbd\u0dd3": "lee", "\u0daf\u0dd9\u0dcf": "do", "\u0db6\u0df2": "bru", "\u0da7\u0dd2": "ti", "\u0dc4\u0dd6": "hu", "\u0db8\u0ddb": "mai", "\u0dc3\u0df2": "ru", "\u0db9\u0dd9": "bhe", "\u0d9c\u0dd9": "ge", "\u0dba\u0dd3": "yi", "\u0da8\u0dd2": "ti", "\u0db7\u0d82": "bhan", "\u0daf\u0dd4": "du", "\u0d9f\u0dd6": "ghu", "\u0da9\u0d82": "dan", "\u0da7\u0df2": "tru", "\u0dad\u0dd2": "thi", "\u0dad": "tha", "\u0dc4\u0dd9\u0ddf": "bhau", "\u0dc2\u0dd4": "shu", "\u0d89\u0dcf": "e", "\u0d9a": "ka", "\u0d85\u0dd2": "a", "\u0da7\u0ddb": "tai", "\u0dbd\u0dd1": "le", "\u0dac\u0dd9\u0dcf": "dho", "\u0d85\u0dd0": "e", "\u0d9a\u0dcf": "ka", "\u0db4\u0dcf": "pa", "\u0db1\u0dd1": "ne", "\u0db5\u0dd9\u0dcf": "po", "\u0da2\u0dd0": "je", "\u0da9\u0dd9\u0dcf": "do", "\u0dad\u0dd6": "thu", "\u0db3\u0dd2": "dhi", "\u0d92\u0d82": "en", "\u0d9d\u0dd9\u0dcf": "gho", "\u0d9c\u0dd8": "gru", "\u0da4\u0dd9": "ke", "\u0db7\u0dd1": "bhe", "\u0db5\u0dd3": "pi", "\u0dba\u0dcf": "ya", "\u0d9c\u0ddb": "gai", "\u0d9d\u0dd6": "ghu", "\u0db0\u0dcf": "dha", "\u0dbd\u0dd9": "le", "\u0d9d\u0dd8": "ru", "\u0dc6\u0dd8": "fru", "\u0dc0\u0dd8": "wru", "\u0da9\u0ddb": "dai", "\u0d8f": "pru", "\u0dac\u0dd6": "du", "\u0d85\u0dcf": "a", "\u0db7\u0dd3": "bhi", "\u0dc4\u0dd2": "hi", "\u0db6\u0dcf": "ba", "\u0dbb\u0dd3": "ri", "\u0d8d": "ru", "\u0dbb\u0dd0": "re", "\u0da1\u0dd3": "chi", "\u0da3\u0dd3": "jhi", "\u0da4": "gha", "\u0db1": "na", "\u0d91\u0dd2": "e", "\u0dc1\u0dd8": "shru", "\u0dab\u0dd0": "ne", "\u0d87\u0dd9": "e", "\u0dc1\u0dd1": "she", "\u0d87": "e", "\u0dc3\u0dd0": "se", "\u0dae\u0dd6": "thu", 
"\u0d9f\u0dd3": "gi", "\u0d95\u0dd8": "o", "\u0db5\u0dcf": "pa", "\u0db1\u0dd9\u0ddf": "nau", "\u0d8b\u0dd9": "u", "\u0dbd\u0dd9\u0ddf": "lau", "\u0dad\u0d82": "than", "\u0d9e\u0dd4": "du", "\u0dbb\u0ddb": "rai", "\u0da9\u0dd2": "di", "\u0d9c\u0dd4": "gu", "\u0da4\u0d82": "ghan", "\u0dc2\u0dcf": "sha", "\u0d85": "a", "\u0dc6\u0dd4": "fu", "\u0db0\u0dd9": "dhe", "\u0d9a\u0dd9\u0ddf": "kau", "\u0d85\u0d82": "an", "\u0dc5\u0dd6": "lu", "\u0dc5\u0dd8": "lu", "\u0da1\u0d82": "chan", "\u0db4\u0dd9\u0ddf": "pau", "\u0dc1\u0dd2": "shi", "\u0dc1\u0dd6": "shu", "\u0dad\u0dd8": "thru", "\u0da0\u0dd6": "chu", "\u0da0\u0d82": "chan", "\u0db0\u0d82": "dhan", "\u0dab\u0dcf": "na", "\u0db1\u0dd3": "ni", "\u0dac\u0dd0": "dhe", "\u0d9c\u0dcf": "ga", "\u0db3\u0dd3": "di", "\u0dc2\u0dd0": "she", "\u0d91": "e", "\u0d8b\u0dcf": "u", "\u0d89\u0dd9": "e", "\u0dbb\u0d82": "ran", "\u0dae\u0dd0": "the", "\u0db1\u0dd8": "nru", "\u0daa\u0dd9\u0dcf": "to", "\u0db5\u0d82": "pan", "\u0d93": "e", "\u0db6\u0dd4": "bu", "\u0da7\u0dd9\u0ddf": "tau", "\u0d9a\u0dd8": "kru", "\u0db8\u0d83": "man", "\u0dab\u0dd4": "nu", "\u0dc3\u0dd3": "si", "\u0db4\u0dd6": "pu", "\u0da8\u0dcf": "ta", "\u0da0\u0dd9": "che", "\u0db5\u0dd2": "phi", "\u0dc2\u0dd6": "shu", "\u0d9e\u0dd9\u0dcf": "do", "\u0db7\u0dd6": "bhu", "\u0dad\u0ddb": "thai", "\u0da2\u0d82": "jan", "\u0db9\u0dd0": "be", "\u0d94": "o", "\u0daf\u0df2": "dhru", "\u0da9\u0dd8": "dru", "\u0da4\u0dcf": "ghan", "\u0dbd\u0dd6": "lu", "\u0dc0\u0dd9": "we", "\u0d94\u0dcf": "o", "\u0d8b\u0d82": "un", "\u0db6": "ba", "\u0db1\u0dd2": "ni", "\u0d9d\u0dd3": "ghi", "\u0dbd\u0dcf": "la", "\u0db7\u0dd8": "bru", "\u0da3": "gha", "\u0dab\u0dd1": "ne", "\u0d88": "e", "\u0dc6\u0dd0": "fa", "\u0dc6": "fa", "\u0dad\u0dd0": "the", "\u0d92\u0dd2": "e", "\u0da7\u0dd0": "te", "\u0db1\u0dd4": "nu", "\u0dba\u0d82": "yan", "\u0db1\u0dd6": "nu", "\u0d9a\u0dd4": "ku", "\u0dba\u0dd1": "ye", "\u0db8\u0dd1": "me", "\u0dc5\u0d82": "lan", "\u0da0\u0dd9\u0dcf": "cho", "\u0da9\u0dd9\u0ddf": 
"dau", "\u0db4\u0ddb": "pai", "\u0da7\u0dcf": "ta", "\u0db3\u0dd4": "du", "\u0d9a\u0dd9\u0dcf": "ko", "\u0d9d\u0d82": "ghan", "\u0dba\u0dd9\u0ddf": "yau", "\u0d9e\u0dd2": "di", "\u0dc4\u0dd0": "he", "\u0db4\u0dd4": "pu", "\u0dc5\u0dd4": "lu", "\u0d9d\u0dd4": "gu", "\u0db1\u0dcf": "na", "\u0db4\u0df2": "pru", "\u0db3\u0dd1": "de", "\u0dc1\u0dd4": "shu", "\u0da3\u0dd9\u0dcf": "gha", "\u0da2\u0dd1": "je", "\u0da4\u0dd2": "di", "\u0da1\u0dcf": "cha", "\u0dc5\u0dd3": "li", "\u0da0\u0dd4": "chu", "\u0db0": "dha", "\u0d9f\u0d82": "ghan", "\u0db0\u0dd2": "dhi", "\u0d91\u0dcf": "e", "\u0d8b\u0dd3": "u", "\u0db9\u0dd2": "bhi", "\u0db8\u0dd3": "me", "\u0db5\u0dd0": "pe", "\u0d9d\u0dd2": "ghi", "\u0da4\u0dd6": "du", "\u0db0\u0dd6": "dhu", "\u0daf\u0dd8": "dru", "\u0dba\u0dd4": "yu", "\u0db9\u0dd4": "bu", "\u0dc1\u0dd9\u0ddf": "shau", "\u0db8\u0dd9": "me", "\u0da4\u0dd3": "di", "\u0daf": "da", "\u0d94\u0d82": "on", "\u0d94\u0dd2": "o", "\u0dc5\u0dd2": "li", "\u0da2\u0dd3": "ji", "\u0dc1\u0dcf": "sha", "\u0da8\u0dd9": "te", "\u0db4\u0dd8": "pru", "\u0d91\u0dd9": "e", "\u0dc1\u0d83": "shan", "\u0d90": "pau", "\u0db8\u0dd2": "mi", "\u0db0\u0dd4": "dhu", "\u0dc3\u0ddb": "sai", "\u0da2\u0dd9\u0ddf": "jau", "\u0db9\u0dd1": "be", "\u0d9b\u0dd2": "ki", "\u043e": "n", "\u0da3\u0d82": "jan", "\u0dab\u0dd3": "ni", "\u0dc4\u0dd4": "hu", "\u0d9c\u0dd6": "gu", "\u0da0": "cha", "\u0db5\u0dd9": "pe", "\u0dc0\u0df2": "wru", "\u0d94\u0dd9": "o", "\u0dae\u0d82": "than", "\u0db5\u0dd6": "pu", "\u0d8b\u0dd8": "u", "\u0db6\u0dd0": "be", "\u0dc3\u0dcf": "sa", "\u0db3\u0d82": "dan", "\u0da2\u0d83": "jah", "\u0da9\u0dd6": "du", "\u0dbb\u0dd1": "re", "\u0dc4\u0df2": "hru", "\u0d9a\u0dd3": "ki", "\u0da1\u0dd8": "chru", "\u0daf\u0dcf": "da", "\u0dc5\u0dd1": "le", "\u0da1\u0dd9\u0dcf": "cho", "\u0dad\u0df2": "thru", "\u0db4\u0dd9\u0dcf": "po", "\u0d9b\u0dd9\u0dcf": "ko", "\u0d9e\u0dd9": "de", "\u0db8\u0dd8": "mur", "\u0dc0\u0d83": "wah", "\u0d86\u0d83": "an", "\u0d9f\u0dd1": "ge", "\u0dc3\u0d82": "san", 
"\u0daf\u0dd0": "de", "\u0da8\u0dd4": "tu", "\u0dba\u0dd0": "ye", "\u0dbb\u0dd9\u0ddf": "rau", "\u0db9\u0dcf": "bha", "\u0db9\u0dd9\u0dcf": "bho", "\u0d9a\u0df2": "kru", "\u0dc0\u0d82": "wan", "\u0da0\u0dd0": "che", "\u0da2": "ja", "\u0dc4\u0dd1": "he", "\u0dc1\u0dd0": "sha", "\u0dc1\u0ddb": "shai", "\u0dae\u0dd8": "tru", "\u0dc0\u0dcf": "wa", "\u0dac\u0dd9": "de", "\u0db4\u0dd2": "pi", "\u0da0\u0d83": "chah", "\u0da3\u0dd2": "dhi", "\u0dad\u0dd3": "thi", "\u0db8\u0dd4": "mu", "\u0d94\u0dd4": "o", "\u0da2\u0dd4": "ju", "\u0d91\u0d82": "en", "\u0dac\u0dd3": "di", "\u0db8\u0dd0": "me", "\u0d92\u0dd9": "e", "\u0dc0\u0dd3": "we", "\u0dad\u0dcf": "tha", "\u0db1\u0ddb": "nai", "\u0db6\u0dd6": "bu", "\u0dae\u0dd4": "thu", "\u0dae\u0dd3": "thi", "\u0dc3\u0dd4": "su", "\u0dac\u0d82": "ghan", "\u0d89\u0d82": "en", "\u0da7\u0dca": "t", "\u0dc2\u0ddc": "sho", "\u0dab\u0dda": "no", "\u0dab\u0dca": "n", "\u0dbd\u0dde": "lau", "\u0da3\u0dca": "j", "\u0dc3\u0dde": "sau", "\u0dba\u0dde": "yau", "\u0d9a\u0dca": "k", "\u0db5\u0ddd": "po", "\u0db6\u0ddc": "bo", "\u0dbb\u0dde": "rau", "\u0dc1\u0ddd": "sho", "\u0dc5\u0ddd": "lo", "\u0d9b\u0dda": "ke", "\u0dc0\u0ddd": "wo", "\u0daa\u0dca": "d", "\u0db9\u0ddd": "bho", "\u0db3\u0ddc": "dho", "\u0db3\u0ddd": "dho", "\u0dc6\u0ddd": "fho", "\u0da2\u0dda": "je", "\u0dbb\u0dca": "r", "\u0dc0\u0ddc": "wo", "\u0da9\u0dca": "d", "\u0dba\u0ddc": "yo", "\u0db0\u0dda": "dhe", "\u0da0\u0dda": "che", "\u0da1\u0dda": "che", "\u0db7\u0ddc": "bho", "\u0dac\u0dda": "de", "\u0db7\u0dda": "bhe", "\u0da9\u0ddc": "do", "\u0dbb\u0ddc": "ro", "\u0dc1\u0dda": "she", "\u0d91\u0dca": "e", "\u0d9c\u0dde": "gau", "\u0da9\u0dde": "dau", "\u0da4\u0ddc": "gho", "\u0daf\u0dde": "dhou", "\u0db3\u0dca": "d", "\u0da3\u0ddd": "do", "\u0dac\u0dca": "d", "\u0db5\u0ddc": "po", "\u0dba\u0ddd": "yo", "\u0dc1\u0dca": "sh", "\u0db0\u0dca": "dh", "\u0dbb\u0dda": "re", "\u0d9f\u0dca": "g", "\u0dae\u0ddc": "tho", "\u0dae\u0dda": "the", "\u0da4\u0dca": "ghe", "\u0da8\u0dca": "t", 
"\u0dc4\u0dca": "h", "\u0d9d\u0ddc": "gho", "\u0da0\u0ddc": "cho", "\u0dc4\u0dda": "he", "\u0dab\u0ddc": "no", "\u0dc5\u0ddc": "lo", "\u0dc0\u0dda": "we", "\u0dad\u0ddd": "tho", "\u0da2\u0ddd": "jo", "\u0da9\u0ddd": "do", "\u0db9\u0dda": "be", "\u0dc5\u0dca": "l", "\u0dae\u0dca": "th", "\u0db1\u0dda": "ne", "\u0db8\u0ddc": "mo", "\u0d9d\u0dda": "ghe", "\u0dc3\u0dda": "se", "\u0db4\u0dca": "p", "\u0d87\u0dca": "e", "\u0d9a\u0dde": "kau", "\u0d9e\u0ddc": "do", "\u0db8\u0dda": "me", "\u0d9b\u0ddc": "ko", "\u0dc4\u0ddc": "ho", "\u0dba\u0dca": "y", "\u0db1\u0dca": "n", "\u0dc1\u0ddc": "sho", "\u0da4\u0ddd": "gho", "\u0dc3\u0dca": "s", "\u0da1\u0ddc": "cho", "\u0daf\u0dca": "d", "\u0db6\u0dca": "b", "\u0d9d\u0ddd": "gho", "\u0db5\u0dca": "e", "\u0d9a\u0ddc": "ko", "\u0dc0\u0dde": "wau", "\u0da8\u0ddc": "to", "\u0dae\u0ddd": "tho", "\u0da8\u0ddd": "to", "\u0dc3\u0ddd": "so", "\u0db6\u0ddd": "bo", "\u0d92\u0dca": "e", "\u0db7\u0dde": "bhau", "\u0db9\u0ddc": "bho", "\u0da7\u0dda": "te", "\u0da0\u0ddd": "cho", "\u0da9\u0dda": "de", "\u0da1\u0ddd": "cho", "\u0d9f\u0ddc": "go", "\u0da0\u0dca": "ch", "\u0db1\u0ddd": "no", "\u0da2\u0dca": "j", "\u0db0\u0ddc": "dho", "\u0db4\u0dda": "je", "\u0dbb\u0ddd": "ro", "\u0dbd\u0dca": "l", "\u0db1\u0ddc": "no", "\u0d94\u0dca": "o", "\u0dc6\u0dca": "f", "\u0dc2\u0dca": "sh", "\u0d89\u0dca": "e", "\u0dad\u0ddc": "tho", "\u0dad\u0dda": "the", "\u0dad\u0dca": "th", "\u0db3\u0dda": "dhe", "\u0dc3\u0ddc": "so", "\u0db8\u0dca": "m", "\u0daa\u0ddc": "to", "\u0d9f\u0ddd": "go", "\u0daa\u0ddd": "to", "\u0d9a\u0dda": "ke", "\u0db4\u0ddc": "po", "\u0dac\u0ddd": "do", "\u0da2\u0dde": "jau", "\u0dba\u0dda": "ye", "\u0dc6\u0ddc": "fo", "\u0db9\u0dde": "bhau", "\u0d9e\u0dca": "n", "\u0db4\u0dde": "pau", "\u0d8a\u0dca": "e", "\u0dc5\u0dda": "le", "\u0db9\u0dca": "b", "\u0db6\u0dde": "bau", "\u0db8\u0ddd": "mo", "\u0d9c\u0dca": "g", "\u0dc2\u0dda": "she", "\u0d9c\u0dda": "ge", "\u0db4\u0ddd": "po", "\u0da1\u0dca": "ch", "\u0d9b\u0dca": "k", "\u0dbd\u0ddd": 
"lo", "\u0d85\u0dda": "a", "\u0d9e\u0ddd": "do", "\u0d9c\u0ddc": "go", "\u0da8\u0dda": "dhe", "\u0d9e\u0dda": "n", "\u0dc1\u0dde": "shau", "\u0dc6\u0dda": "fe", "\u0db1\u0dde": "nau", "\u0da2\u0ddc": "jo", "\u0d9c\u0ddd": "go", "\u0db8\u0dde": "mau", "\u0da4\u0dda": "g", "\u0dbd\u0dda": "le", "\u0da7\u0ddc": "to", "\u0daf\u0ddd": "dho", "\u0d85\u0dca": "a", "\u0dab\u0ddd": "no", "\u0db7\u0dca": "b", "\u0dbd\u0ddc": "lo", "\u0daf\u0ddc": "do", "\u0dc4\u0dde": "bhau", "\u0da7\u0dde": "tau", "\u0d9d\u0dca": "g", "\u0d9b\u0ddd": "ko", "\u0db0\u0dde": "dhau", "\u0db7\u0ddd": "bho", "\u0db0\u0ddd": "dho", "\u0dc2\u0ddd": "sho", "\u0d9a\u0ddd": "ko", "\u0daf\u0dda": "de", "\u0dc4\u0ddd": "ho", "\u0db6\u0dda": "be", "\u0da7\u0ddd": "to", "\u0da0\u0dde": "chau", "\u0d9f\u0dda": "ge", "\u0dc0\u0dca": "w"}
\ No newline at end of file
diff --git a/data/vocab_map.json b/data/vocab_map.json
new file mode 100644
index 0000000..7410d4f
--- /dev/null
+++ b/data/vocab_map.json
@@ -0,0 +1,2276 @@
+{
+    "»": 0,
+    "መ": 1,
+    "💻": 2,
+    "ˈ": 3,
+    "🙊": 4,
+    "D": 5,
+    "ծ": 6,
+    "නේ": 7,
+    "😐": 8,
+    "ධෑ": 9,
+    "연": 10,
+    "አ": 11,
+    "යඃ": 12,
+    "ู": 13,
+    "ඔ්": 14,
+    "ථ්": 15,
+    "ඇ": 16,
+    "専": 17,
+    "ஆ": 18,
+    "ʒ": 19,
+    "ඝී": 20,
+    "پ": 21,
+    "శ": 22,
+    "Ã": 23,
+    "📚": 24,
+    "부": 25,
+    "ሊ": 26,
+    "배": 27,
+    "බි": 28,
+    "?": 29,
+    "³": 30,
+    "官": 31,
+    "ඨ්": 32,
+    "🐼": 33,
+    "ج": 34,
+    "ඳේ": 35,
+    "ඟෑ": 36,
+    "∶": 37,
+    "ඩි": 38,
+    "නී": 39,
+    "≡": 40,
+    "දු": 41,
+    "මි": 42,
+    "නෝ": 43,
+    "ව්": 44,
+    "↑": 45,
+    "ए": 46,
+    "स": 47,
+    "බෞ": 48,
+    "ℜ": 49,
+    "出": 50,
+    "ජ්": 51,
+    "😖": 52,
+    "🌻": 53,
+    "ගි": 54,
+    "ด": 55,
+    "ሰ": 56,
+    "එැ": 57,
+    "ង": 58,
+    "ලු": 59,
+    "වූ": 60,
+    "ʰ": 61,
+    "⇪": 62,
+    "ő": 63,
+    "ඤු": 64,
+    "N": 65,
+    "⇝": 66,
+    "ෂී": 67,
+    "දං": 68,
+    "저": 69,
+    "ඝි": 70,
+    "۷": 71,
+    "ජැ": 72,
+    "ੱ": 73,
+    "^": 74,
+    "ነ": 75,
+    "ã": 76,
+    "චෙ": 77,
+    "♡": 78,
+    "の": 79,
+    "Ş": 80,
+    "සේ": 81,
+    "ਾ": 82,
+    "௨": 83,
+    "📅": 84,
+    "영": 85,
+    "ክ": 86,
+    "ආෙ": 87,
+    "ඍැ": 88,
+    "ඬෑ": 89,
+    "දෙ": 90,
+    "በ": 91,
+    "語": 92,
+    "̴": 93,
+    "🇸": 94,
+    "ฉ": 95,
+    "පෘ": 96,
+    "😳": 97,
+    "ශ": 98,
+    "➚": 99,
+    "ක": 100,
+    "ब": 101,
+    "☹": 102,
+    "නෑ": 103,
+    "භේ": 104,
+    "北": 105,
+    "有": 106,
+    "ك": 107,
+    "吏": 108,
+    "බූ": 109,
+    "👇": 110,
+    "ෂො": 111,
+    "😠": 112,
+    "ع": 113,
+    "י": 114,
+    "හෲ": 115,
+    "ḍ": 116,
+    "ඞේ": 117,
+    "紀": 118,
+    "豪": 119,
+    "නෛ": 120,
+    "Ε": 121,
+    "හෞ": 122,
+    "╗": 123,
+    "යෛ": 124,
+    "සෞ": 125,
+    "这": 126,
+    "ති": 127,
+    "ㄸ": 128,
+    "즈": 129,
+    "♐": 130,
+    "කං": 131,
+    "ළෑ": 132,
+    "̯": 133,
+    "ඔැ": 134,
+    "通": 135,
+    "ක්": 136,
+    "ඝු": 137,
+    "ඞි": 138,
+    "局": 139,
+    "ඬේ": 140,
+    "❉": 141,
+    "신": 142,
+    "ඛු": 143,
+    "ඞී": 144,
+    "₹": 145,
+    "・": 146,
+    "§": 147,
+    "": 148,
+    "−": 149,
+    "α": 150,
+    "☀": 151,
+    "➨": 152,
+    "ඡු": 153,
+    "♬": 154,
+    "ﻲ": 155,
+    "}": 156,
+    "ඵෛ": 157,
+    "පූ": 158,
+    "·": 159,
+    "චු": 160,
+    "④": 161,
+    "වේ": 162,
+    "Æ": 163,
+    "ਹ": 164,
+    "Š": 165,
+    "医": 166,
+    "็": 167,
+    "නැ": 168,
+    "ණ": 169,
+    "려": 170,
+    "钱": 171,
+    "{": 172,
+    "呼": 173,
+    "": 174,
+    "p": 175,
+    "­": 176,
+    "ඝෘ": 177,
+    "ණැ": 178,
+    "鬲": 179,
+    "’": 180,
+    "ው": 181,
+    "ς": 182,
+    "х": 183,
+    "ඉෙ": 184,
+    "🏽": 185,
+    "ʊ": 186,
+    "ඩු": 187,
+    "🌹": 188,
+    "⋆": 189,
+    "丂": 190,
+    "్": 191,
+    "โ": 192,
+    "๑": 193,
+    "樂": 194,
+    "ካ": 195,
+    "😱": 196,
+    "දූ": 197,
+    "满": 198,
+    "ධු": 199,
+    "⏰": 200,
+    "ආ්": 201,
+    "ටු": 202,
+    "💡": 203,
+    "ලඃ": 204,
+    "ぎ": 205,
+    "Г": 206,
+    "병": 207,
+    "🤣": 208,
+    "ど": 209,
+    "Ś": 210,
+    "就": 211,
+    "ร": 212,
+    "笑": 213,
+    "體": 214,
+    "ශඃ": 215,
+    "¨": 216,
+    "ン": 217,
+    "印": 218,
+    "": 219,
+    "උැ": 220,
+    "➠": 221,
+    "හෛ": 222,
+    "ගෘ": 223,
+    "杯": 224,
+    "බෛ": 225,
+    "٦": 226,
+    "🎸": 227,
+    "ா": 228,
+    "ô": 229,
+    "ḱ": 230,
+    "යං": 231,
+    "වෙ": 232,
+    "据": 233,
+    "කේ": 234,
+    "ඨි": 235,
+    "ඛි": 236,
+    "も": 237,
+    "ມ": 238,
+    "ょ": 239,
+    "ო": 240,
+    "ங": 241,
+    "ඳෑ": 242,
+    "野": 243,
+    "ඩේ": 244,
+    "ಕ": 245,
+    "ɐ": 246,
+    "호": 247,
+    "국": 248,
+    "ඹා": 249,
+    "ගී": 250,
+    "Œ": 251,
+    "株": 252,
+    "사": 253,
+    "w": 254,
+    "පං": 255,
+    "ණූ": 256,
+    "වු": 257,
+    "院": 258,
+    "ඹ්": 259,
+    "ළා": 260,
+    "o": 261,
+    "සු": 262,
+    "චේ": 263,
+    "ყ": 264,
+    "☜": 265,
+    "↳": 266,
+    "🍕": 267,
+    "у": 268,
+    "හී": 269,
+    "な": 270,
+    "õ": 271,
+    "ና": 272,
+    "අා": 273,
+    "ඐ": 274,
+    "අු": 275,
+    "O": 276,
+    "ඤි": 277,
+    "ගෛ": 278,
+    "උෑ": 279,
+    "진": 280,
+    "එා": 281,
+    "": 282,
+    "。": 283,
+    "පේ": 284,
+    "ඬැ": 285,
+    "ෂෞ": 286,
+    "व": 287,
+    "ో": 288,
+    "කෑ": 289,
+    "َ": 290,
+    "Ⅲ": 291,
+    "ජෑ": 292,
+    "චෞ": 293,
+    "ባ": 294,
+    "🙈": 295,
+    "උෘ": 296,
+    "☻": 297,
+    "නු": 298,
+    "(": 299,
+    "琮": 300,
+    "˙": 301,
+    "日": 302,
+    "පා": 303,
+    "…": 304,
+    "භ": 305,
+    "": 306,
+    "ï": 307,
+    "Λ": 308,
+    "ඒි": 309,
+    "": 310,
+    "開": 311,
+    "い": 312,
+    "ል": 313,
+    "ඇ්": 314,
+    "": 315,
+    "당": 316,
+    "/": 317,
+    "ल": 318,
+    "할": 319,
+    "□": 320,
+    "": 321,
+    "서": 322,
+    "س": 323,
+    "ඬි": 324,
+    "🍶": 325,
+    "ඔෘ": 326,
+    "ඳෝ": 327,
+    "]": 328,
+    "සං": 329,
+    "Ú": 330,
+    "ស": 331,
+    "Κ": 332,
+    "": 333,
+    "යෝ": 334,
+    "🔴": 335,
+    "ት": 336,
+    "▪": 337,
+    "ලෘ": 338,
+    "ɪ": 339,
+    "魏": 340,
+    "රෑ": 341,
+    "🔥": 342,
+    "ඩෑ": 343,
+    "⦁": 344,
+    "ළෝ": 345,
+    "事": 346,
+    "한": 347,
+    "Ф": 348,
+    "体": 349,
+    "": 350,
+    "✺": 351,
+    "පී": 352,
+    "ඝ්": 353,
+    "ඬෝ": 354,
+    "ටෘ": 355,
+    "芸": 356,
+    "ඨෝ": 357,
+    "යෑ": 358,
+    "ඤී": 359,
+    "Ø": 360,
+    "⚑": 361,
+    "මී": 362,
+    "ª": 363,
+    "В": 364,
+    "የ": 365,
+    "ලෙ": 366,
+    "📱": 367,
+    "学": 368,
+    "ඪෘ": 369,
+    "ិ": 370,
+    "克": 371,
+    "ඣ්": 372,
+    "ī": 373,
+    "ṃ": 374,
+    "🏦": 375,
+    "주": 376,
+    "ŗ": 377,
+    "个": 378,
+    "_": 379,
+    "Ӂ": 380,
+    "葉": 381,
+    "ビ": 382,
+    "": 383,
+    "ගේ": 384,
+    "නෞ": 385,
+    "軍": 386,
+    "ْ": 387,
+    "₂": 388,
+    "ගෲ": 389,
+    "💧": 390,
+    "셔": 391,
+    "歌": 392,
+    "u": 393,
+    "ටා": 394,
+    "": 395,
+    "ඟූ": 396,
+    "ෂැ": 397,
+    "ඳෙ": 398,
+    "": 399,
+    "ජෝ": 400,
+    "නඃ": 401,
+    "ゅ": 402,
+    "❖": 403,
+    "ඛෙ": 404,
+    "ɡ": 405,
+    "#": 406,
+    "თ": 407,
+    "Ž": 408,
+    "察": 409,
+    "ਮ": 410,
+    "公": 411,
+    "र": 412,
+    "ɨ": 413,
+    "ණා": 414,
+    "ɕ": 415,
+    "ʌ": 416,
+    "☢": 417,
+    "ෆෛ": 418,
+    "": 419,
+    "ெ": 420,
+    "ෆෲ": 421,
+    "ඊ්": 422,
+    "අෙ": 423,
+    "ṭ": 424,
+    "—": 425,
+    "🌍": 426,
+    "ම්": 427,
+    "ण": 428,
+    "鑑": 429,
+    "잘": 430,
+    "ඝං": 431,
+    "I": 432,
+    "ජෘ": 433,
+    "ය": 434,
+    "Ñ": 435,
+    "Ʌ": 436,
+    "ă": 437,
+    "ඹෑ": 438,
+    "ㅂ": 439,
+    "ඔෑ": 440,
+    "යූ": 441,
+    "料": 442,
+    "ඔූ": 443,
+    "చ": 444,
+    "◙": 445,
+    "ථෝ": 446,
+    "ජු": 447,
+    "์": 448,
+    "ň": 449,
+    "ไ": 450,
+    "使": 451,
+    "지": 452,
+    "බ්": 453,
+    "😕": 454,
+    "Т": 455,
+    "භ්": 456,
+    "ලැ": 457,
+    "ه": 458,
+    "ታ": 459,
+    "පෛ": 460,
+    "▌": 461,
+    "٩": 462,
+    "η": 463,
+    "格": 464,
+    "ශි": 465,
+    "記": 466,
+    "ටෑ": 467,
+    "8": 468,
+    "ෆ": 469,
+    "දෘ": 470,
+    "ඇු": 471,
+    "ඡී": 472,
+    "ģ": 473,
+    "　": 474,
+    "◘": 475,
+    "◣": 476,
+    "ලේ": 477,
+    "ඤඃ": 478,
+    " ": 479,
+    "Ħ": 480,
+    "බ": 481,
+    "ඤේ": 482,
+    "ඩෞ": 483,
+    "デ": 484,
+    "‟": 485,
+    "ළ්": 486,
+    "ी": 487,
+    "ඩ": 488,
+    "z": 489,
+    "ඵැ": 490,
+    "පි": 491,
+    "治": 492,
+    "ෂේ": 493,
+    "조": 494,
+    "ෳ": 495,
+    "ගං": 496,
+    "ć": 497,
+    "චඃ": 498,
+    "Å": 499,
+    "送": 500,
+    "ත": 501,
+    "‚": 502,
+    "තෘ": 503,
+    "😒": 504,
+    "و": 505,
+    "讀": 506,
+    "字": 507,
+    "テ": 508,
+    "¼": 509,
+    "උං": 510,
+    "ح": 511,
+    "Þ": 512,
+    "詰": 513,
+    "ద": 514,
+    "📑": 515,
+    "෦": 516,
+    "ሌ": 517,
+    "π": 518,
+    "චැ": 519,
+    "ඳ්": 520,
+    "ඤො": 521,
+    "ඓ": 522,
+    "두": 523,
+    "フ": 524,
+    "💀": 525,
+    "ሁ": 526,
+    "ෆැ": 527,
+    "生": 528,
+    "라": 529,
+    "門": 530,
+    "ñ": 531,
+    "ੀ": 532,
+    "ඵ": 533,
+    "ม": 534,
+    "ෆේ": 535,
+    "": 536,
+    "🌸": 537,
+    "🌷": 538,
+    "වා": 539,
+    "н": 540,
+    "රේ": 541,
+    "ෆෑ": 542,
+    "💁": 543,
+    "J": 544,
+    "ਿ": 545,
+    "සෘ": 546,
+    "ඬො": 547,
+    "ගැ": 548,
+    "ෆ්": 549,
+    "ච්": 550,
+    "ඳී": 551,
+    "ච": 552,
+    "ඦ": 553,
+    "⁣": 554,
+    "त": 555,
+    "ඡි": 556,
+    "ඟ්": 557,
+    "と": 558,
+    "장": 559,
+    "‍": 560,
+    "ඡෙ": 561,
+    "න්": 562,
+    "ඤං": 563,
+    "親": 564,
+    "Û": 565,
+    "ථො": 566,
+    "ෟ": 567,
+    "ł": 568,
+    "‬": 569,
+    "〒": 570,
+    "අ": 571,
+    "ළී": 572,
+    "社": 573,
+    "ඟේ": 574,
+    "▻": 575,
+    "♥": 576,
+    "し": 577,
+    "ﾑ": 578,
+    "📞": 579,
+    "っ": 580,
+    "º": 581,
+    "": 582,
+    "ඛං": 583,
+    "i": 584,
+    "ඨෘ": 585,
+    "m": 586,
+    "තූ": 587,
+    "乃": 588,
+    "ส": 589,
+    "ቅ": 590,
+    "උෙ": 591,
+    "在": 592,
+    "ા": 593,
+    "‪": 594,
+    "🐱": 595,
+    "ඩෲ": 596,
+    "රැ": 597,
+    "ටෛ": 598,
+    "博": 599,
+    "️": 600,
+    "ហ": 601,
+    "වෞ": 602,
+    "ρ": 603,
+    "ළ": 604,
+    "ටෙ": 605,
+    "😍": 606,
+    "入": 607,
+    "†": 608,
+    "∘": 609,
+    "보": 610,
+    "ೀ": 611,
+    "ඵ්": 612,
+    "😓": 613,
+    "文": 614,
+    "投": 615,
+    "": 616,
+    "ඵී": 617,
+    "資": 618,
+    "භි": 619,
+    "↕": 620,
+    "A": 621,
+    "⚓": 622,
+    "න": 623,
+    "ද": 624,
+    "ඹ": 625,
+    "ㅉ": 626,
+    "ዳ": 627,
+    "ඍ": 628,
+    "่": 629,
+    "\u0003": 630,
+    "ù": 631,
+    "ලං": 632,
+    "😏": 633,
+    "ు": 634,
+    "➡": 635,
+    "۞": 636,
+    "א": 637,
+    "兀": 638,
+    "ඨං": 639,
+    "ਨ": 640,
+    "й": 641,
+    "เ": 642,
+    "ඝූ": 643,
+    "決": 644,
+    "晉": 645,
+    "🚀": 646,
+    "": 647,
+    "කෝ": 648,
+    "까": 649,
+    "ி": 650,
+    "රූ": 651,
+    "ಳ": 652,
+    "ඒ්": 653,
+    "ඞෝ": 654,
+    "ථු": 655,
+    "つ": 656,
+    "販": 657,
+    "ළූ": 658,
+    "ශා": 659,
+    "ඬා": 660,
+    "옹": 661,
+    "ð": 662,
+    "ூ": 663,
+    "ગ": 664,
+    "诶": 665,
+    "💕": 666,
+    "場": 667,
+    "ථඃ": 668,
+    "ی": 669,
+    "👨": 670,
+    "": 671,
+    "👌": 672,
+    "ඹැ": 673,
+    "ජ": 674,
+    ",": 675,
+    "F": 676,
+    "Ç": 677,
+    "ó": 678,
+    "": 679,
+    "Ô": 680,
+    "😬": 681,
+    "කු": 682,
+    "б": 683,
+    "ඹේ": 684,
+    "ν": 685,
+    "¾": 686,
+    "🙁": 687,
+    "ග්": 688,
+    "ප්": 689,
+    "Ā": 690,
+    "閉": 691,
+    "ඉු": 692,
+    "డ": 693,
+    "🏆": 694,
+    "සූ": 695,
+    "ඊ": 696,
+    "비": 697,
+    "∙": 698,
+    "ඣං": 699,
+    "ọ": 700,
+    "ṉ": 701,
+    "す": 702,
+    "රෘ": 703,
+    "札": 704,
+    "දෛ": 705,
+    "🇰": 706,
+    "※": 707,
+    "පෝ": 708,
+    "繁": 709,
+    "K": 710,
+    "☸": 711,
+    "ඩී": 712,
+    "ජෞ": 713,
+    "ー": 714,
+    "河": 715,
+    "\\": 716,
+    "ב": 717,
+    "මෑ": 718,
+    "ටො": 719,
+    "ǐ": 720,
+    "ඞො": 721,
+    "↯": 722,
+    "ష": 723,
+    "ஞ": 724,
+    "⇔": 725,
+    "ඩූ": 726,
+    "ඟො": 727,
+    "✿": 728,
+    "": 729,
+    "м": 730,
+    "❶": 731,
+    "ඳැ": 732,
+    "ښ": 733,
+    "✉": 734,
+    "ඕෘ": 735,
+    "භී": 736,
+    "ඔි": 737,
+    "哪": 738,
+    "ග": 739,
+    "🏻": 740,
+    "ε": 741,
+    "ழ": 742,
+    "符": 743,
+    "": 744,
+    "නා": 745,
+    "ǣ": 746,
+    "ණෝ": 747,
+    "ロ": 748,
+    "ಜ": 749,
+    "": 750,
+    "À": 751,
+    "와": 752,
+    "ඕ": 753,
+    "♣": 754,
+    "බෘ": 755,
+    "🙃": 756,
+    "ෂං": 757,
+    "ඵෝ": 758,
+    "に": 759,
+    "9": 760,
+    "漢": 761,
+    "කෘ": 762,
+    "📖": 763,
+    "ɳ": 764,
+    "😭": 765,
+    "ɛ": 766,
+    "Ξ": 767,
+    "ධං": 768,
+    "😢": 769,
+    "😯": 770,
+    "දො": 771,
+    "ச": 772,
+    "සි": 773,
+    "ඡ": 774,
+    "ඇැ": 775,
+    "ඊෙ": 776,
+    "🌲": 777,
+    "ラ": 778,
+    "‡": 779,
+    "ண": 780,
+    "«": 781,
+    "М": 782,
+    "ች": 783,
+    "ශැ": 784,
+    "ɖ": 785,
+    "හා": 786,
+    "贝": 787,
+    "ගූ": 788,
+    "😞": 789,
+    "6": 790,
+    "ლ": 791,
+    "İ": 792,
+    "ㅆ": 793,
+    "ُ": 794,
+    "🔷": 795,
+    "ණෑ": 796,
+    "😗": 797,
+    "ત": 798,
+    "⇨": 799,
+    "ಇ": 800,
+    "✍": 801,
+    "ට": 802,
+    "曹": 803,
+    " ": 804,
+    "ධො": 805,
+    "කෙ": 806,
+    "ෆෘ": 807,
+    "ඞ": 808,
+    "": 809,
+    "）": 810,
+    "舊": 811,
+    "@": 812,
+    "ඪූ": 813,
+    "수": 814,
+    "🔘": 815,
+    "ದ": 816,
+    "ළේ": 817,
+    "ś": 818,
+    "간": 819,
+    "豆": 820,
+    "ፍ": 821,
+    "": 822,
+    "θ": 823,
+    "සො": 824,
+    "め": 825,
+    "ඩා": 826,
+    "ː": 827,
+    "く": 828,
+    "k": 829,
+    "අ්": 830,
+    "තං": 831,
+    "◼": 832,
+    "”": 833,
+    "සෙ": 834,
+    "節": 835,
+    "ඪු": 836,
+    "담": 837,
+    "±": 838,
+    "🕒": 839,
+    "房": 840,
+    "ගො": 841,
+    "ඔී": 842,
+    "௧": 843,
+    "": 844,
+    "😅": 845,
+    "協": 846,
+    "к": 847,
+    "ධී": 848,
+    "ණේ": 849,
+    "ट": 850,
+    "භෘ": 851,
+    "麗": 852,
+    "ලා": 853,
+    "⋅": 854,
+    "ඳු": 855,
+    "දැ": 856,
+    "₰": 857,
+    "යෙ": 858,
+    "خ": 859,
+    "😂": 860,
+    "ŭ": 861,
+    "경": 862,
+    "ټ": 863,
+    "ජී": 864,
+    "育": 865,
+    "ﺎ": 866,
+    "ẻ": 867,
+    "ඵෘ": 868,
+    "යෘ": 869,
+    "🛌": 870,
+    "ậ": 871,
+    "平": 872,
+    "ீ": 873,
+    "Е": 874,
+    "ල": 875,
+    "経": 876,
+    "ш": 877,
+    "g": 878,
+    "Ŵ": 879,
+    "😘": 880,
+    "ਗ": 881,
+    "향": 882,
+    "県": 883,
+    "ㅇ": 884,
+    "울": 885,
+    "안": 886,
+    "උු": 887,
+    "ඡ්": 888,
+    ":": 889,
+    "♀": 890,
+    "ළො": 891,
+    "園": 892,
+    "´": 893,
+    "甫": 894,
+    "ඛෝ": 895,
+    "ඳ": 896,
+    "인": 897,
+    "තෲ": 898,
+    "ඬු": 899,
+    "💓": 900,
+    "\u0014": 901,
+    "ඹි": 902,
+    "鄕": 903,
+    "ඩෘ": 904,
+    "高": 905,
+    "̈": 906,
+    "て": 907,
+    "ඤ්": 908,
+    "ර": 909,
+    "部": 910,
+    "ʼ": 911,
+    "😎": 912,
+    "ඝෝ": 913,
+    "~": 914,
+    "ඹෙ": 915,
+    "👍": 916,
+    "✅": 917,
+    "ඤැ": 918,
+    "¹": 919,
+    "Ƹ": 920,
+    "ඹෞ": 921,
+    "圓": 922,
+    "μ": 923,
+    "ඵි": 924,
+    "Đ": 925,
+    "💗": 926,
+    "च": 927,
+    "ඡෘ": 928,
+    "¢": 929,
+    "ö": 930,
+    "▃": 931,
+    "🐌": 932,
+    "西": 933,
+    "─": 934,
+    "ட": 935,
+    "ජේ": 936,
+    "": 937,
+    "👎": 938,
+    "‏": 939,
+    "ㅈ": 940,
+    "≤": 941,
+    " ": 942,
+    "商": 943,
+    "∕": 944,
+    "╔": 945,
+    "ඇං": 946,
+    "릴": 947,
+    "℘": 948,
+    "ණු": 949,
+    "0": 950,
+    "මෝ": 951,
+    "🌺": 952,
+    "වො": 953,
+    "✓": 954,
+    "ይ": 955,
+    "寶": 956,
+    "කෛ": 957,
+    "知": 958,
+    "警": 959,
+    "￼": 960,
+    "்": 961,
+    "시": 962,
+    "&": 963,
+    "භෞ": 964,
+    "ගෙ": 965,
+    "ה": 966,
+    "💞": 967,
+    "运": 968,
+    "": 969,
+    "සා": 970,
+    " ": 971,
+    "✊": 972,
+    "♜": 973,
+    "】": 974,
+    "ඤ": 975,
+    "✪": 976,
+    "प": 977,
+    "ඛා": 978,
+    "‫": 979,
+    "ந": 980,
+    "ን": 981,
+    "": 982,
+    "ँ": 983,
+    "භෝ": 984,
+    "Â": 985,
+    "ඉ": 986,
+    "උ්": 987,
+    "т": 988,
+    "වි": 989,
+    "තෝ": 990,
+    "f": 991,
+    "ก": 992,
+    "ط": 993,
+    "٠": 994,
+    "ඛඃ": 995,
+    "舞": 996,
+    "ㅣ": 997,
+    "÷": 998,
+    "ඈ": 999,
+    "ඞූ": 1000,
+    "定": 1001,
+    "ධෙ": 1002,
+    "ෆෙ": 1003,
+    "යැ": 1004,
+    "ඹෝ": 1005,
+    "年": 1006,
+    "නි": 1007,
+    "ı": 1008,
+    "අං": 1009,
+    "": 1010,
+    "ย": 1011,
+    "à": 1012,
+    "З": 1013,
+    "ि": 1014,
+    "G": 1015,
+    "ඨැ": 1016,
+    "店": 1017,
+    "🔆": 1018,
+    "ඞු": 1019,
+    "♭": 1020,
+    "අඃ": 1021,
+    "ටේ": 1022,
+    "ෆූ": 1023,
+    ";": 1024,
+    "พ": 1025,
+    "ළි": 1026,
+    "ʟ": 1027,
+    "う": 1028,
+    "ა": 1029,
+    "天": 1030,
+    "உ": 1031,
+    "英": 1032,
+    "ò": 1033,
+    "හැ": 1034,
+    "": 1035,
+    "📝": 1036,
+    "රෛ": 1037,
+    "වෑ": 1038,
+    "ජං": 1039,
+    "ඔ": 1040,
+    "🚈": 1041,
+    "ㅃ": 1042,
+    "λ": 1043,
+    "🌞": 1044,
+    "✚": 1045,
+    "Q": 1046,
+    "හේ": 1047,
+    "ඏ": 1048,
+    "\"": 1049,
+    "ඔං": 1050,
+    "₦": 1051,
+    "ˌ": 1052,
+    "¥": 1053,
+    "网": 1054,
+    "☞": 1055,
+    "່": 1056,
+    "ඝ": 1057,
+    "▒": 1058,
+    "陽": 1059,
+    "ඈං": 1060,
+    "蘭": 1061,
+    "ು": 1062,
+    "⁄": 1063,
+    "නෘ": 1064,
+    "ඡේ": 1065,
+    "ㄲ": 1066,
+    "ඬී": 1067,
+    "චෑ": 1068,
+    "ඟෝ": 1069,
+    "პ": 1070,
+    ".": 1071,
+    "රෲ": 1072,
+    "ධා": 1073,
+    "ළැ": 1074,
+    "í": 1075,
+    "මූ": 1076,
+    "ථි": 1077,
+    "ό": 1078,
+    "동": 1079,
+    "ά": 1080,
+    "用": 1081,
+    "": 1082,
+    "V": 1083,
+    "鼎": 1084,
+    "💖": 1085,
+    "女": 1086,
+    "ा": 1087,
+    "ප": 1088,
+    "ஜ": 1089,
+    "ශූ": 1090,
+    "ඪි": 1091,
+    "κ": 1092,
+    "℉": 1093,
+    "þ": 1094,
+    "ቁ": 1095,
+    "එේ": 1096,
+    "ර්": 1097,
+    "팔": 1098,
+    "ੈ": 1099,
+    "Y": 1100,
+    "🏠": 1101,
+    "க": 1102,
+    "ජි": 1103,
+    "▲": 1104,
+    "ụ": 1105,
+    "ṁ": 1106,
+    "名": 1107,
+    "۶": 1108,
+    "ඪී": 1109,
+    "、": 1110,
+    "任": 1111,
+    "චෝ": 1112,
+    "ඡැ": 1113,
+    "ධෘ": 1114,
+    "ධෲ": 1115,
+    "족": 1116,
+    "甦": 1117,
+    "💹": 1118,
+    "ඤා": 1119,
+    "研": 1120,
+    "υ": 1121,
+    "සී": 1122,
+    "ֻ": 1123,
+    "►": 1124,
+    "+": 1125,
+    "ෆී": 1126,
+    "‘": 1127,
+    "😁": 1128,
+    "ඣු": 1129,
+    "ล": 1130,
+    "": 1131,
+    "е": 1132,
+    "ය්": 1133,
+    "מ": 1134,
+    "唐": 1135,
+    "ष": 1136,
+    "ِ": 1137,
+    "Ò": 1138,
+    "හෝ": 1139,
+    "😩": 1140,
+    "放": 1141,
+    "ु": 1142,
+    "👏": 1143,
+    "🙂": 1144,
+    "ℯ": 1145,
+    "-": 1146,
+    "整": 1147,
+    "එි": 1148,
+    "雨": 1149,
+    "ب": 1150,
+    "ʃ": 1151,
+    "Д": 1152,
+    "サ": 1153,
+    "ශෞ": 1154,
+    "ለ": 1155,
+    "∎": 1156,
+    "ι": 1157,
+    "블": 1158,
+    "ඝො": 1159,
+    "🚇": 1160,
+    "Ö": 1161,
+    "ț": 1162,
+    "7": 1163,
+    "කැ": 1164,
+    "ගෞ": 1165,
+    "ඟැ": 1166,
+    "ス": 1167,
+    "පඃ": 1168,
+    "만": 1169,
+    "📷": 1170,
+    "食": 1171,
+    "හං": 1172,
+    "ඓං": 1173,
+    "γ": 1174,
+    "🔵": 1175,
+    "ệ": 1176,
+    "ே": 1177,
+    "තෛ": 1178,
+    "伎": 1179,
+    "වඃ": 1180,
+    "ද්": 1181,
+    "ુ": 1182,
+    "➦": 1183,
+    "": 1184,
+    "සෑ": 1185,
+    "ඬ්": 1186,
+    "А": 1187,
+    "තඃ": 1188,
+    "ණී": 1189,
+    "😦": 1190,
+    "ஸ": 1191,
+    "¡": 1192,
+    "ʂ": 1193,
+    "ඣී": 1194,
+    "මේ": 1195,
+    "ලො": 1196,
+    "ෂි": 1197,
+    "ያ": 1198,
+    "工": 1199,
+    "ע": 1200,
+    "빈": 1201,
+    "▆": 1202,
+    "ගු": 1203,
+    "µ": 1204,
+    "á": 1205,
+    "ඒ": 1206,
+    "ණො": 1207,
+    "": 1208,
+    "": 1209,
+    "た": 1210,
+    "▅": 1211,
+    "ப": 1212,
+    "도": 1213,
+    "ටී": 1214,
+    "a": 1215,
+    "e": 1216,
+    "ඹූ": 1217,
+    "а": 1218,
+    "，": 1219,
+    "✈": 1220,
+    "შ": 1221,
+    "ω": 1222,
+    "[": 1223,
+    "щ": 1224,
+    "ê": 1225,
+    "â": 1226,
+    "☚": 1227,
+    "\u0002": 1228,
+    "⌖": 1229,
+    "순": 1230,
+    "🍂": 1231,
+    "ඊැ": 1232,
+    "℃": 1233,
+    "කූ": 1234,
+    "ඌ": 1235,
+    "තෞ": 1236,
+    "ඛෘ": 1237,
+    "ң": 1238,
+    "戌": 1239,
+    "อ": 1240,
+    "😨": 1241,
+    "ֹ": 1242,
+    "ඩ්": 1243,
+    "า": 1244,
+    "☆": 1245,
+    "ඪ්": 1246,
+    "😡": 1247,
+    "千": 1248,
+    "ඡෛ": 1249,
+    "田": 1250,
+    "යී": 1251,
+    "대": 1252,
+    "결": 1253,
+    "💎": 1254,
+    "ඦා": 1255,
+    "හ": 1256,
+    "😄": 1257,
+    "චි": 1258,
+    "෴": 1259,
+    "දෑ": 1260,
+    "<": 1261,
+    "ණෙ": 1262,
+    "බං": 1263,
+    "ん": 1264,
+    "": 1265,
+    "හෑ": 1266,
+    "": 1267,
+    "🔑": 1268,
+    "ලි": 1269,
+    "👈": 1270,
+    "බා": 1271,
+    "": 1272,
+    "වෘ": 1273,
+    "තැ": 1274,
+    "චො": 1275,
+    "哈": 1276,
+    "ɒ": 1277,
+    "": 1278,
+    "口": 1279,
+    "¦": 1280,
+    "කා": 1281,
+    "鈴": 1282,
+    "ứ": 1283,
+    "反": 1284,
+    "ළං": 1285,
+    "ㄱ": 1286,
+    "බී": 1287,
+    "උූ": 1288,
+    "ŋ": 1289,
+    "者": 1290,
+    "'": 1291,
+    "T": 1292,
+    "【": 1293,
+    "ሸ": 1294,
+    "⚜": 1295,
+    "≈": 1296,
+    "ጣ": 1297,
+    "య": 1298,
+    "💩": 1299,
+    "🎤": 1300,
+    "É": 1301,
+    "ළු": 1302,
+    "👆": 1303,
+    "☼": 1304,
+    "δ": 1305,
+    "ආඃ": 1306,
+    "씨": 1307,
+    "💥": 1308,
+    "г": 1309,
+    "Б": 1310,
+    "ඩො": 1311,
+    "ุ": 1312,
+    "И": 1313,
+    "๏": 1314,
+    "😝": 1315,
+    "ඵා": 1316,
+    "ジ": 1317,
+    "නං": 1318,
+    "생": 1319,
+    "": 1320,
+    "ඣෝ": 1321,
+    "ඉි": 1322,
+    "ō": 1323,
+    "ෂෙ": 1324,
+    "වං": 1325,
+    "ज": 1326,
+    "": 1327,
+    "කෞ": 1328,
+    "市": 1329,
+    "ü": 1330,
+    "ל": 1331,
+    "භූ": 1332,
+    "ů": 1333,
+    "ව": 1334,
+    "P": 1335,
+    "🇱": 1336,
+    "🕛": 1337,
+    "か": 1338,
+    "Z": 1339,
+    "ل": 1340,
+    "ტ": 1341,
+    "▼": 1342,
+    "⛅": 1343,
+    "☝": 1344,
+    "ē": 1345,
+    "ి": 1346,
+    "営": 1347,
+    "ශෙ": 1348,
+    "W": 1349,
+    "": 1350,
+    "සඃ": 1351,
+    "⛺": 1352,
+    "": 1353,
+    "ෂු": 1354,
+    "ž": 1355,
+    "තා": 1356,
+    "Ê": 1357,
+    "👹": 1358,
+    "උී": 1359,
+    "合": 1360,
+    "ඨේ": 1361,
+    "ಧ": 1362,
+    "ﾶ": 1363,
+    "♂": 1364,
+    "ඖ": 1365,
+    " ": 1366,
+    "මෛ": 1367,
+    "😃": 1368,
+    "📲": 1369,
+    "古": 1370,
+    "ж": 1371,
+    "j": 1372,
+    "ශු": 1373,
+    "➲": 1374,
+    "с": 1375,
+    "ā": 1376,
+    "3": 1377,
+    "お": 1378,
+    "ඹු": 1379,
+    "ნ": 1380,
+    "証": 1381,
+    "🤔": 1382,
+    "고": 1383,
+    "එං": 1384,
+    "ř": 1385,
+    "∏": 1386,
+    "ථූ": 1387,
+    "◀": 1388,
+    "Õ": 1389,
+    "ඇෙ": 1390,
+    "艾": 1391,
+    "Р": 1392,
+    "ෆො": 1393,
+    "ගා": 1394,
+    "c": 1395,
+    "🔫": 1396,
+    "ඨෙ": 1397,
+    "": 1398,
+    "ජා": 1399,
+    "의": 1400,
+    "書": 1401,
+    "ت": 1402,
+    "๐": 1403,
+    "🌴": 1404,
+    "క": 1405,
+    "श": 1406,
+    "බෲ": 1407,
+    "犬": 1408,
+    " ": 1409,
+    "රු": 1410,
+    "": 1411,
+    "ʔ": 1412,
+    "🙄": 1413,
+    "왕": 1414,
+    "Л": 1415,
+    "Ο": 1416,
+    "未": 1417,
+    "♨": 1418,
+    "ز": 1419,
+    "හු": 1420,
+    "ඟා": 1421,
+    "성": 1422,
+    "क": 1423,
+    "තො": 1424,
+    "ලෛ": 1425,
+    "ढ": 1426,
+    "ແ": 1427,
+    "〜": 1428,
+    "ජෙ": 1429,
+    "–": 1430,
+    "🌟": 1431,
+    "": 1432,
+    "🏳": 1433,
+    "ඍී": 1434,
+    "オ": 1435,
+    "ا": 1436,
+    "නූ": 1437,
+    "Α": 1438,
+    "චූ": 1439,
+    "😮": 1440,
+    "ඵෑ": 1441,
+    "🏃": 1442,
+    "ඛො": 1443,
+    "人": 1444,
+    "ෂා": 1445,
+    "狄": 1446,
+    "රෙ": 1447,
+    "ṅ": 1448,
+    "අි": 1449,
+    "ඤෝ": 1450,
+    "ਡ": 1451,
+    "ನ": 1452,
+    "及": 1453,
+    "ඩං": 1454,
+    "р": 1455,
+    "⇻": 1456,
+    "🌼": 1457,
+    "ٌ": 1458,
+    "ர": 1459,
+    "τ": 1460,
+    "ǚ": 1461,
+    "ඒෙ": 1462,
+    "活": 1463,
+    "මඃ": 1464,
+    "ට්": 1465,
+    "n": 1466,
+    "行": 1467,
+    "私": 1468,
+    "යෞ": 1469,
+    "ல": 1470,
+    "X": 1471,
+    "උා": 1472,
+    "ඪෝ": 1473,
+    "🚴": 1474,
+    "ਬ": 1475,
+    "ደ": 1476,
+    "ඵො": 1477,
+    "ו": 1478,
+    "▶": 1479,
+    "ய": 1480,
+    "❤": 1481,
+    "අෘ": 1482,
+    "′": 1483,
+    "★": 1484,
+    "Δ": 1485,
+    "î": 1486,
+    "ር": 1487,
+    "나": 1488,
+    "۩": 1489,
+    "ඎ": 1490,
+    "ඨූ": 1491,
+    "主": 1492,
+    "๬": 1493,
+    "": 1494,
+    "攻": 1495,
+    "숙": 1496,
+    "": 1497,
+    "小": 1498,
+    "곰": 1499,
+    "ن": 1500,
+    "ඹී": 1501,
+    "మ": 1502,
+    "三": 1503,
+    "由": 1504,
+    "ඬං": 1505,
+    "🍅": 1506,
+    "තු": 1507,
+    "ඉ්": 1508,
+    "ㅅ": 1509,
+    "ç": 1510,
+    "数": 1511,
+    "මෞ": 1512,
+    "후": 1513,
+    "ඟ": 1514,
+    "$": 1515,
+    "←": 1516,
+    "ள": 1517,
+    "ת": 1518,
+    "€": 1519,
+    "ṇ": 1520,
+    "ｲ": 1521,
+    "ವ": 1522,
+    "ک": 1523,
+    "ಂ": 1524,
+    "ඛ්": 1525,
+    "☯": 1526,
+    "缶": 1527,
+    "역": 1528,
+    "": 1529,
+    "වැ": 1530,
+    "%": 1531,
+    "☛": 1532,
+    "▫": 1533,
+    "තී": 1534,
+    "≥": 1535,
+    "E": 1536,
+    "成": 1537,
+    "È": 1538,
+    "前": 1539,
+    "¿": 1540,
+    "බු": 1541,
+    "ம": 1542,
+    "レ": 1543,
+    "ධෝ": 1544,
+    "ශ්": 1545,
+    "そ": 1546,
+    "里": 1547,
+    "ற": 1548,
+    "û": 1549,
+    "◇": 1550,
+    "្": 1551,
+    "許": 1552,
+    "ድ": 1553,
+    "壬": 1554,
+    "`": 1555,
+    "🛑": 1556,
+    "國": 1557,
+    "‌": 1558,
+    "ʻ": 1559,
+    "ධ්": 1560,
+    "ஷ": 1561,
+    "ク": 1562,
+    "ථැ": 1563,
+    "⚪": 1564,
+    "ภ": 1565,
+    "ඡා": 1566,
+    "£": 1567,
+    "ο": 1568,
+    "​": 1569,
+    "乎": 1570,
+    "ě": 1571,
+    "ר": 1572,
+    "Ŧ": 1573,
+    "➢": 1574,
+    "ɔ": 1575,
+    "චෛ": 1576,
+    "S": 1577,
+    "ી": 1578,
+    "ඝේ": 1579,
+    "හො": 1580,
+    "q": 1581,
+    "識": 1582,
+    "з": 1583,
+    "එෛ": 1584,
+    "в": 1585,
+    "カ": 1586,
+    "➩": 1587,
+    "රා": 1588,
+    "🤷": 1589,
+    "�": 1590,
+    "💙": 1591,
+    "―": 1592,
+    "あ": 1593,
+    "明": 1594,
+    "තේ": 1595,
+    "ņ": 1596,
+    "හඃ": 1597,
+    "U": 1598,
+    "R": 1599,
+    "ਤ": 1600,
+    "ඨා": 1601,
+    "م": 1602,
+    "节": 1603,
+    "♦": 1604,
+    "පෞ": 1605,
+    "මෙ": 1606,
+    "ශී": 1607,
+    "🔨": 1608,
+    "ජො": 1609,
+    "ე": 1610,
+    "ّ": 1611,
+    "े": 1612,
+    "午": 1613,
+    "එ්": 1614,
+    "4": 1615,
+    "ණ්": 1616,
+    "*": 1617,
+    "校": 1618,
+    "̵": 1619,
+    "ස්": 1620,
+    "": 1621,
+    "ඣ": 1622,
+    "п": 1623,
+    "아": 1624,
+    "Ω": 1625,
+    "é": 1626,
+    "y": 1627,
+    "දෞ": 1628,
+    "ግ": 1629,
+    "↓": 1630,
+    "ﻨ": 1631,
+    "ඪා": 1632,
+    "ජෛ": 1633,
+    "।": 1634,
+    "斯": 1635,
+    "ඛ": 1636,
+    "ي": 1637,
+    "„": 1638,
+    "ť": 1639,
+    "": 1640,
+    "සෝ": 1641,
+    "式": 1642,
+    "л": 1643,
+    "රො": 1644,
+    "ධි": 1645,
+    "і": 1646,
+    "චා": 1647,
+    "වෝ": 1648,
+    "ආා": 1649,
+    "ø": 1650,
+    "わ": 1651,
+    "එු": 1652,
+    "👀": 1653,
+    "ඞා": 1654,
+    "ä": 1655,
+    "ַ": 1656,
+    "ලෝ": 1657,
+    "த": 1658,
+    "ወ": 1659,
+    "මං": 1660,
+    "ත්": 1661,
+    "ま": 1662,
+    "パ": 1663,
+    "広": 1664,
+    "会": 1665,
+    "問": 1666,
+    "": 1667,
+    "志": 1668,
+    "리": 1669,
+    "丝": 1670,
+    "о": 1671,
+    "😊": 1672,
+    "責": 1673,
+    "ඛූ": 1674,
+    "❣": 1675,
+    "を": 1676,
+    "😌": 1677,
+    "ඬෙ": 1678,
+    "杜": 1679,
+    "∫": 1680,
+    "එී": 1681,
+    "ඝා": 1682,
+    "군": 1683,
+    "š": 1684,
+    "ඵූ": 1685,
+    "ί": 1686,
+    "😆": 1687,
+    "😛": 1688,
+    "卡": 1689,
+    "": 1690,
+    "කී": 1691,
+    "ටං": 1692,
+    "를": 1693,
+    "බැ": 1694,
+    "💵": 1695,
+    "™": 1696,
+    "산": 1697,
+    "🌎": 1698,
+    "ශො": 1699,
+    "乇": 1700,
+    "↠": 1701,
+    "💐": 1702,
+    "බෙ": 1703,
+    "元": 1704,
+    "ඛේ": 1705,
+    "2": 1706,
+    "උ": 1707,
+    "드": 1708,
+    "දා": 1709,
+    "ඳෘ": 1710,
+    "මු": 1711,
+    "後": 1712,
+    "ë": 1713,
+    "➤": 1714,
+    "ׁ": 1715,
+    "č": 1716,
+    "材": 1717,
+    "ණි": 1718,
+    "¤": 1719,
+    "″": 1720,
+    "究": 1721,
+    "ගෑ": 1722,
+    "쉬": 1723,
+    "d": 1724,
+    "訥": 1725,
+    "•": 1726,
+    "城": 1727,
+    "🚆": 1728,
+    "→": 1729,
+    "こ": 1730,
+    "H": 1731,
+    "म": 1732,
+    "දේ": 1733,
+    "රෞ": 1734,
+    "“": 1735,
+    "ඪො": 1736,
+    "ሽ": 1737,
+    "C": 1738,
+    "ं": 1739,
+    "ඳූ": 1740,
+    "吗": 1741,
+    "😵": 1742,
+    "": 1743,
+    "▂": 1744,
+    "ඡො": 1745,
+    "දඃ": 1746,
+    "이": 1747,
+    "⚘": 1748,
+    "රී": 1749,
+    "り": 1750,
+    "🙉": 1751,
+    "ф": 1752,
+    "\b": 1753,
+    "ඨී": 1754,
+    "正": 1755,
+    "催": 1756,
+    "찰": 1757,
+    "어": 1758,
+    "👉": 1759,
+    "ş": 1760,
+    "ෆි": 1761,
+    "ෆං": 1762,
+    "ටෲ": 1763,
+    "ማ": 1764,
+    "ධ": 1765,
+    "्": 1766,
+    "業": 1767,
+    "🎁": 1768,
+    "": 1769,
+    "භු": 1770,
+    "ь": 1771,
+    "近": 1772,
+    "බෝ": 1773,
+    "尺": 1774,
+    "ㄷ": 1775,
+    "ر": 1776,
+    "වෛ": 1777,
+    "√": 1778,
+    "": 1779,
+    "👓": 1780,
+    "බේ": 1781,
+    "L": 1782,
+    "": 1783,
+    "ඕං": 1784,
+    "ෆෝ": 1785,
+    "මැ": 1786,
+    "ʈ": 1787,
+    "ở": 1788,
+    "😀": 1789,
+    "ṣ": 1790,
+    "©": 1791,
+    "덕": 1792,
+    "ం": 1793,
+    "薦": 1794,
+    "රි": 1795,
+    "ם": 1796,
+    "ハ": 1797,
+    "ਰ": 1798,
+    "ෂ": 1799,
+    "®": 1800,
+    "∋": 1801,
+    "Ш": 1802,
+    "සැ": 1803,
+    "؟": 1804,
+    "˝": 1805,
+    "원": 1806,
+    "ස": 1807,
+    "и": 1808,
+    "物": 1809,
+    "🙌": 1810,
+    "妇": 1811,
+    "여": 1812,
+    "˜": 1813,
+    "べ": 1814,
+    "ඟී": 1815,
+    "ඛී": 1816,
+    "✴": 1817,
+    "‹": 1818,
+    "💇": 1819,
+    "💃": 1820,
+    "ύ": 1821,
+    "හ්": 1822,
+    "භෛ": 1823,
+    "ੁ": 1824,
+    "வ": 1825,
+    "යේ": 1826,
+    "▬": 1827,
+    "යු": 1828,
+    "දෝ": 1829,
+    "ධැ": 1830,
+    "﻿": 1831,
+    "😔": 1832,
+    "ŝ": 1833,
+    "職": 1834,
+    "П": 1835,
+    "✌": 1836,
+    "ටෞ": 1837,
+    "გ": 1838,
+    "¯": 1839,
+    "น": 1840,
+    "说": 1841,
+    "飛": 1842,
+    "ة": 1843,
+    "යි": 1844,
+    "ገ": 1845,
+    "ಯ": 1846,
+    "පො": 1847,
+    "ථේ": 1848,
+    "å": 1849,
+    "ථ": 1850,
+    "ෂෘ": 1851,
+    "!": 1852,
+    "육": 1853,
+    "ह": 1854,
+    "ú": 1855,
+    "ටූ": 1856,
+    "祭": 1857,
+    "せ": 1858,
+    "භෙ": 1859,
+    "░": 1860,
+    "烧": 1861,
+    "🧗": 1862,
+    "තෙ": 1863,
+    "r": 1864,
+    "☂": 1865,
+    "කො": 1866,
+    "ඩෛ": 1867,
+    "æ": 1868,
+    "∞": 1869,
+    "ิ": 1870,
+    "和": 1871,
+    "ටි": 1872,
+    "ථෙ": 1873,
+    "ﷺ": 1874,
+    "": 1875,
+    "ෂෑ": 1876,
+    "ලී": 1877,
+    "ගෝ": 1878,
+    "❋": 1879,
+    "ඈෑ": 1880,
+    "ト": 1881,
+    "හි": 1882,
+    "ටැ": 1883,
+    "❸": 1884,
+    "ශෲ": 1885,
+    "ලෑ": 1886,
+    "➧": 1887,
+    "📕": 1888,
+    "ණං": 1889,
+    "ලූ": 1890,
+    "◄": 1891,
+    "ඟු": 1892,
+    "ශේ": 1893,
+    "К": 1894,
+    "Ó": 1895,
+    "ඵු": 1896,
+    "": 1897,
+    "ਵ": 1898,
+    "͡": 1899,
+    "": 1900,
+    "🌊": 1901,
+    "ತ": 1902,
+    "5": 1903,
+    "සෛ": 1904,
+    "⭐": 1905,
+    "‎": 1906,
+    "⇒": 1907,
+    "ෆෞ": 1908,
+    "幣": 1909,
+    "🖤": 1910,
+    "シ": 1911,
+    "ṟ": 1912,
+    "╮": 1913,
+    "☟": 1914,
+    "පෲ": 1915,
+    "כ": 1916,
+    "遅": 1917,
+    "😉": 1918,
+    "අැ": 1919,
+    "■": 1920,
+    "我": 1921,
+    "අෑ": 1922,
+    "B": 1923,
+    "მ": 1924,
+    "◢": 1925,
+    "භො": 1926,
+    "タ": 1927,
+    "දෲ": 1928,
+    "ū": 1929,
+    "ዬ": 1930,
+    "ኋ": 1931,
+    "½": 1932,
+    "族": 1933,
+    "β": 1934,
+    "ល": 1935,
+    "ඵෙ": 1936,
+    "ඤෑ": 1937,
+    "જ": 1938,
+    "ෂෝ": 1939,
+    "🤗": 1940,
+    "ვ": 1941,
+    "×": 1942,
+    "ش": 1943,
+    "ℓ": 1944,
+    "ง": 1945,
+    "වෲ": 1946,
+    "维": 1947,
+    "녕": 1948,
+    "Ţ": 1949,
+    "M": 1950,
+    "": 1951,
+    "කඃ": 1952,
+    "🏁": 1953,
+    "💰": 1954,
+    "本": 1955,
+    "█": 1956,
+    "හෘ": 1957,
+    "l": 1958,
+    "ඩෝ": 1959,
+    "යා": 1960,
+    "😥": 1961,
+    "ः": 1962,
+    "Ʒ": 1963,
+    "ඳි": 1964,
+    "曜": 1965,
+    "弘": 1966,
+    "ළෙ": 1967,
+    "動": 1968,
+    "☺": 1969,
+    "යො": 1970,
+    "应": 1971,
+    "プ": 1972,
+    "ජූ": 1973,
+    "φ": 1974,
+    "ძ": 1975,
+    "✖": 1976,
+    "h": 1977,
+    "井": 1978,
+    "錢": 1979,
+    "ɑ": 1980,
+    "ඨ": 1981,
+    "ශං": 1982,
+    "的": 1983,
+    "පැ": 1984,
+    "෿": 1985,
+    "ְ": 1986,
+    "ඳො": 1987,
+    "නෙ": 1988,
+    "": 1989,
+    "⇜": 1990,
+    "ÿ": 1991,
+    "ල්": 1992,
+    "ඩැ": 1993,
+    "ு": 1994,
+    "ඛෑ": 1995,
+    "特": 1996,
+    "は": 1997,
+    "۱": 1998,
+    "🚗": 1999,
+    "ඡෝ": 2000,
+    "ශෛ": 2001,
+    "భ": 2002,
+    "౦": 2003,
+    "ඟං": 2004,
+    "እ": 2005,
+    "තෑ": 2006,
+    "ශෑ": 2007,
+    "ධේ": 2008,
+    "ை": 2009,
+    "ы": 2010,
+    "Ὑ": 2011,
+    "ບ": 2012,
+    "정": 2013,
+    "ඹො": 2014,
+    "ථී": 2015,
+    "ඟෙ": 2016,
+    "›": 2017,
+    "🙏": 2018,
+    "➣": 2019,
+    "ධෞ": 2020,
+    "🎰": 2021,
+    "ථා": 2022,
+    "ف": 2023,
+    "ə": 2024,
+    "ඪ": 2025,
+    "👼": 2026,
+    "ರ": 2027,
+    "Ŝ": 2028,
+    "新": 2029,
+    "ටෝ": 2030,
+    "ም": 2031,
+    ")": 2032,
+    "v": 2033,
+    "ص": 2034,
+    "ئ": 2035,
+    "兵": 2036,
+    "ඛැ": 2037,
+    "ෂඃ": 2038,
+    "Ü": 2039,
+    "न": 2040,
+    "ම": 2041,
+    "я": 2042,
+    "ہ": 2043,
+    "Γ": 2044,
+    "ශෝ": 2045,
+    "இ": 2046,
+    "°": 2047,
+    "": 2048,
+    "（": 2049,
+    "訣": 2050,
+    "එෙ": 2051,
+    "=": 2052,
+    "ッ": 2053,
+    "ป": 2054,
+    "Σ": 2055,
+    "堡": 2056,
+    "ඬ": 2057,
+    "✘": 2058,
+    " ": 2059,
+    "۹": 2060,
+    "包": 2061,
+    "д": 2062,
+    "Π": 2063,
+    "ඉා": 2064,
+    "♪": 2065,
+    "ලෞ": 2066,
+    "": 2067,
+    "පු": 2068,
+    "ධෛ": 2069,
+    "헌": 2070,
+    "එ": 2071,
+    "භැ": 2072,
+    "ಅ": 2073,
+    "‑": 2074,
+    "Ἀ": 2075,
+    "¶": 2076,
+    "": 2077,
+    "є": 2078,
+    "s": 2079,
+    "リ": 2080,
+    "ඉං": 2081,
+    "ඨො": 2082,
+    "🤓": 2083,
+    "إ": 2084,
+    "ோ": 2085,
+    "ඍ්": 2086,
+    "💠": 2087,
+    "击": 2088,
+    "ඕෙ": 2089,
+    "☖": 2090,
+    "ඒං": 2091,
+    "ή": 2092,
+    "ඨු": 2093,
+    "මො": 2094,
+    "x": 2095,
+    "د": 2096,
+    "ﻟ": 2097,
+    "ආ": 2098,
+    "ඔු": 2099,
+    "客": 2100,
+    "🏿": 2101,
+    "少": 2102,
+    "☎": 2103,
+    "ඔෙ": 2104,
+    "↔": 2105,
+    "ඳං": 2106,
+    "租": 2107,
+    "😲": 2108,
+    "🔖": 2109,
+    "💯": 2110,
+    "ඉැ": 2111,
+    "구": 2112,
+    "が": 2113,
+    "": 2114,
+    "වී": 2115,
+    "ඳා": 2116,
+    "රං": 2117,
+    "චං": 2118,
+    "：": 2119,
+    "බෑ": 2120,
+    "ש": 2121,
+    "ʁ": 2122,
+    "ණෘ": 2123,
+    "සෲ": 2124,
+    "ả": 2125,
+    "ጥ": 2126,
+    "ඇඃ": 2127,
+    "ಿ": 2128,
+    "😇": 2129,
+    "🚘": 2130,
+    "并": 2131,
+    "천": 2132,
+    "რ": 2133,
+    "ඤූ": 2134,
+    "ඬූ": 2135,
+    "ช": 2136,
+    "¸": 2137,
+    "": 2138,
+    "ਸ": 2139,
+    "♚": 2140,
+    "﴿": 2141,
+    "මා": 2142,
+    "⏳": 2143,
+    "හූ": 2144,
+    "－": 2145,
+    "": 2146,
+    "නො": 2147,
+    "╰": 2148,
+    "ඵං": 2149,
+    "̃": 2150,
+    "汉": 2151,
+    "●": 2152,
+    "ḷ": 2153,
+    "²": 2154,
+    "▷": 2155,
+    "ი": 2156,
+    "ア": 2157,
+    "ෂෛ": 2158,
+    "🎼": 2159,
+    "": 2160,
+    "භෑ": 2161,
+    "♠": 2162,
+    "රෝ": 2163,
+    "🚉": 2164,
+    "ህ": 2165,
+    "乾": 2166,
+    "අේ": 2167,
+    "ٹ": 2168,
+    "卒": 2169,
+    ">": 2170,
+    "ર": 2171,
+    "දී": 2172,
+    "ඩෙ": 2173,
+    "推": 2174,
+    "ඥ": 2175,
+    "ኛ": 2176,
+    "络": 2177,
+    "ἄ": 2178,
+    "t": 2179,
+    "ඞ්": 2180,
+    "😋": 2181,
+    "ý": 2182,
+    "∆": 2183,
+    "ජඃ": 2184,
+    "？": 2185,
+    "れ": 2186,
+    "ふ": 2187,
+    "最": 2188,
+    "🔬": 2189,
+    "ἀ": 2190,
+    "ථං": 2191,
+    "排": 2192,
+    "요": 2193,
+    "್": 2194,
+    "◎": 2195,
+    "無": 2196,
+    "ொ": 2197,
+    "🔓": 2198,
+    "භා": 2199,
+    "": 2200,
+    "ඝෙ": 2201,
+    "😜": 2202,
+    "С": 2203,
+    "ඣා": 2204,
+    "♊": 2205,
+    "◆": 2206,
+    "书": 2207,
+    "☠": 2208,
+    "χ": 2209,
+    "b": 2210,
+    "👭": 2211,
+    "ﻬ": 2212,
+    "德": 2213,
+    "‐": 2214,
+    "ඣි": 2215,
+    "ධූ": 2216,
+    "": 2217,
+    "1": 2218,
+    "공": 2219,
+    "σ": 2220,
+    "ළෘ": 2221,
+    "அ": 2222,
+    "贼": 2223,
+    "래": 2224,
+    "මෘ": 2225,
+    "කෲ": 2226,
+    "ì": 2227,
+    "˚": 2228,
+    "è": 2229,
+    "🍔": 2230,
+    "館": 2231,
+    "ථෛ": 2232,
+    "๒": 2233,
+    "ц": 2234,
+    "ඡං": 2235,
+    "☐": 2236,
+    "ෂ්": 2237,
+    "✔": 2238,
+    "චී": 2239,
+    "👊": 2240,
+    "ඔා": 2241,
+    "ඤෙ": 2242,
+    "ಗ": 2243,
+    "ශෘ": 2244,
+    "ಾ": 2245,
+    "ආං": 2246,
+    "ඟි": 2247,
+    "හෙ": 2248,
+    "大": 2249,
+    "ෆු": 2250,
+    "භං": 2251,
+    "ன": 2252,
+    "කි": 2253,
+    "ෂූ": 2254,
+    "♫": 2255,
+    "පෙ": 2256,
+    "ミ": 2257,
+    "🇺": 2258,
+    "Н": 2259,
+    "尔": 2260,
+    "▣": 2261,
+    "ч": 2262,
+    "‛": 2263,
+    "දි": 2264,
+    "පෑ": 2265,
+    "ෆා": 2266,
+    "ථෘ": 2267,
+    "ャ": 2268,
+    "ϕ": 2269,
+    "ח": 2270,
+    "多": 2271,
+    "බො": 2272,
+    "<unk>": 2273
+}
\ No newline at end of file
diff --git a/examples/examples.ipynb b/examples/examples.ipynb
index 461880a..811cf4e 100644
--- a/examples/examples.ipynb
+++ b/examples/examples.ipynb
@@ -16,8 +16,14 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from sinlib import Tokenizer\n",
-    "from sinlib import preprocessing"
+    "from sinlib import Tokenizer, preprocessing, Romanizer"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Training a sinlib tokenizer"
    ]
   },
   {
@@ -26,7 +32,14 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "text = [\"ක්‍රමවත්ව, ඉවසිලිවන්තව\"] * 10"
+    "corpus = [\n",
+    "    \"\"\"මෙරටට බුදදහම දායාද කරමින් අනුබුදු මිහිඳු හිමිගේ ලංකා ගමනය සිදුවූ උතුම් පොසොන් පුර පසළොස්වක පොහොය දිනය අදට යෙදී තිබේ.\n",
+    "\n",
+    "මිහිඳු මහරහතන් වහන්සේ ප්‍රමුඛ ඉට්ඨිය, උත්ථිය, සම්බල, බද්දසාල යන රහතන් වහන්සේලාත් සුමන සාමණේරයන් වහන්සේත් භණ්ඩුක උපාසකක් බුදුරජාණන් වහන්සේගේ නිර්මල බුදුදහම රැගෙන මිහින්තලා පව්වට වැඩම කරවීම අද වැනි පොසොන් පුර පසළොස්වක පෙහොය දිනක සිදුවූ බව බෞද්ධ ඉතිහාසයේ සඳහන් වෙයි.\n",
+    "\n",
+    "දේවානම් පියතිස්ස රජු ඇතුළු පිරිස චුල්ලහත්ථි පදෝපම සූත්‍රය අසා තෙරුවන් සරණ යාම සිදු වූයේද අද වැනි පොසොන් පොහොය දිනකය.\"\"\",\n",
+    "\"මේ අතර පොසොන් පොහෝ දින පණිවුඩයක් නිකුත් කරමින් ජනාධිපතිවරයා පෙන්වා දෙන්නේ මිහිඳු මහරහතන් වහන්සේ විසින් අනු දැන වදාළ ධර්ම මාර්ගය මෙරට පත්වී ඇති දේශපාලන, සමාජ හා ආර්ථික ගැටළු නිරාකරණය කර ගනිමින් දියුණු රටක් ගොඩනැඟීමට ඉවහල් කරගන්නා ලෙස සියලු දෙනාගෙන් ඉල්ලා සිටින බවය.\"\n",
+    "]"
    ]
   },
   {
@@ -35,44 +48,41 @@
    "metadata": {},
    "outputs": [
     {
-     "data": {
-      "text/plain": [
-       "['ක්\\u200dරමවත්ව, ඉවසිලිවන්තව',\n",
-       " 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව',\n",
-       " 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව',\n",
-       " 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව',\n",
-       " 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව',\n",
-       " 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව',\n",
-       " 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව',\n",
-       " 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව',\n",
-       " 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව',\n",
-       " 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව']"
-      ]
-     },
-     "execution_count": 4,
-     "metadata": {},
-     "output_type": "execute_result"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[0;31mSignature:\u001b[0m \u001b[0mtokenizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext_list\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mDocstring:\u001b[0m\n",
+      "Train the tokenizer on a list of text strings.\n",
+      "\n",
+      "Parameters\n",
+      "----------\n",
+      "text_list : list of str\n",
+      "    List of text strings to be used for training the tokenizer.\n",
+      "\n",
+      "Examples\n",
+      "--------\n",
+      ">>> from sinlib import Tokenizer\n",
+      ">>> corpus = [...]\n",
+      ">>> tokenizer = Tokenizer()\n",
+      ">>> tokenizer.train(corpus)\n",
+      "\u001b[0;31mFile:\u001b[0m      ~/learning/sinlib/src/sinlib/tokenizer.py\n",
+      "\u001b[0;31mType:\u001b[0m      method"
+     ]
     }
    ],
    "source": [
-    "text"
+    "tokenizer = Tokenizer()\n",
+    "tokenizer.train?"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 5,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "['ක්\\u200dරමවත්ව, ඉවසිලිවන්තව', 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව', 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව', 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව', 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව', 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව', 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව', 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව', 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව', 'ක්\\u200dරමවත්ව, ඉවසිලිවන්තව']\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "print(text) # have non printables \\u200d"
+    "tokenizer.train(corpus)"
    ]
   },
   {
@@ -83,16 +93,7 @@
     {
      "data": {
       "text/plain": [
-       "[0.9333333333333333,\n",
-       " 0.9333333333333333,\n",
-       " 0.9333333333333333,\n",
-       " 0.9333333333333333,\n",
-       " 0.9333333333333333,\n",
-       " 0.9333333333333333,\n",
-       " 0.9333333333333333,\n",
-       " 0.9333333333333333,\n",
-       " 0.9333333333333333,\n",
-       " 0.9333333333333333]"
+       "127"
       ]
      },
      "execution_count": 6,
@@ -101,237 +102,364 @@
     }
    ],
    "source": [
-    "preprocessing.get_sinhala_character_ratio(text, consider_special_character_as_sinhala=False)"
+    "len(tokenizer)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Encoding text"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 7,
    "metadata": {},
+   "outputs": [],
+   "source": [
+    "text = \"උතුම් පොසොන් පොහොය අද\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "encodings = tokenizer(text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]"
+       "[51, 118, 33, 54, 121, 13, 97, 54, 121, 29, 50, 54, 52, 120]"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "preprocessing.get_sinhala_character_ratio(text, consider_special_character_as_sinhala=True)"
+    "encodings"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]"
+       "['උ', 'තු', 'ම්', ' ', 'පො', 'සො', 'න්', ' ', 'පො', 'හො', 'ය', ' ', 'අ', 'ද']"
       ]
      },
-     "execution_count": 8,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "preprocessing.get_sinhala_character_ratio(text, consider_special_character_as_sinhala=True, ignore_non_printable=True)"
+    "[tokenizer.token_id_to_token_map[tok] for tok in encodings]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Save trained tokenizer and load from disk"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tokenizer.save_tokenizer(\".\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loaded_tokenizer = Tokenizer().load_from_pretrained(\"./vocab.json\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "assert loaded_tokenizer(text)==tokenizer(text)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Sinhala text romanization"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "romanizer = Romanizer(char_mapper_fp=None, tokenizer_vocab_path=None) #pass both none to load from default configs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
    "metadata": {},
    "outputs": [
     {
-     "data": {
-      "text/plain": [
-       "[0.9375,\n",
-       " 0.9375,\n",
-       " 0.9375,\n",
-       " 0.9375,\n",
-       " 0.9375,\n",
-       " 0.9375,\n",
-       " 0.9375,\n",
-       " 0.9375,\n",
-       " 0.9375,\n",
-       " 0.9375]"
-      ]
-     },
-     "execution_count": 9,
-     "metadata": {},
-     "output_type": "execute_result"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "meratata budadahama dayada karamin anubudu mihidu himige lanka gamanaya siduwu uthum poson pura pasaloswaka pohoya dinaya adata yedi thibe.mihidu maharahathan wahanse pramuka ettiya, uththiya, sambala, baddasala yana rahathan wahanselath sumana samanorayan wahanseth bhanduka upasakak budurajanan wahansege nirmala bududahama regena mihinthala pawwata wadama karawema ada wani poson pura pasaloswaka pehoya dinaka siduwu bawa bauddha ethihasaye sadahan wei.dewanam piyathissa raju ethulu pirisa chullahaththi padhopama suthraya asa theruwan sarana yama sidu wuyeda ada wani poson pohoya dinakaya.\n"
+     ]
     }
    ],
    "source": [
-    "preprocessing.get_sinhala_character_ratio(text, consider_special_character_as_sinhala=True, ignore_non_printable=False)"
+    "print(romanizer(corpus[0]))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [],
    "source": [
-    "tokeniser = Tokenizer()"
+    "more_complex_text = corpus[1]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 17,
    "metadata": {},
    "outputs": [],
    "source": [
-    "corpus = [\"\"\"මේ මාසයේ ගත වූ දින 15ක කාලය තුළ කොළඹ නගරය ආශ්‍රිත ව සීසීටීවී දර්ශන මඟින් වැරදිවලට සම්බන්ධ පුද්ගලයන් 793 දෙනෙකු හදුනාගත් බව පොලීසිය නිවේදනය කර තිබේ.\"\"\"\n",
-    "          \"\"\"මෑතකාලීන ව රට මුහුණ දුන් අභියෝගාත්මකම ආර්ථික කාරණාව ණය ප්‍රතිව්‍යුගතකරණය බව මුදල් රාජ්‍ය අමාත්‍ය ආචාර්ය රංජිත් සියඹලාපිටිය මහතා පවසයි.\"\"\",\n",
-    "          \"භාෂාව\"\n",
-    "          ]"
+    "more_complex_text = more_complex_text[:100] + \".... \\nIn linguistics, romanization is the conversion...., adding special chars ^^*#(&#&$^)\""
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": 18,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'me athara poson poho dina paniwudayak nikuth karamin janadhipathiwaraya penwa denne mihidu maharahathan wahanse visi.... In linguistics, romanization is the conversion...., adding special chars ^^*#(&#&$^)'"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "tokeniser.train(corpus)"
+    "romanizer(more_complex_text)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Few available preprocessing methods on Sinhala texts"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": 19,
    "metadata": {},
    "outputs": [],
    "source": [
-    "tokens = tokeniser(\"රට මුහුණ දුන් සිද්ධියේ\")"
+    "_, token_count = preprocessing.process_text_with_token_counts(corpus[0], consider_special_character_as_sinhala=False, ignore_non_printable=True)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": 20,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "[73, 37, 2, 68, 56, 38, 2, 62, 29, 2, 46, 54, 87, 4]"
+       "271"
       ]
      },
-     "execution_count": 34,
+     "execution_count": 20,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "tokens"
+    "token_count"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": 21,
    "metadata": {},
    "outputs": [],
    "source": [
-    "decoded_tokens = [tokeniser.token_id_to_token_map[id] for id in tokens]"
+    "more_complex_text += \"ශ්‍රී ලංකා ප්‍රජාතාන්ත්‍රික සමාජවාදී\""
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "මේ අතර පොසොන් පොහෝ දින පණිවුඩයක් නිකුත් කරමින් ජනාධිපතිවරයා පෙන්වා දෙන්නේ මිහිඳු මහරහතන් වහන්සේ විසි.... \n",
+      "In linguistics, romanization is the conversion...., adding special chars ^^*#(&#&$^)ශ්‍රී ලංකා ප්‍රජාතාන්ත්‍රික සමාජවාදී\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(more_complex_text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "'රට මුහුණ දුන් සිද්<unk>යේ'"
+       "'rs ^^*#(&#&$^)ශ්\\u200dරී ලංකා ප්\\u200dරජාතාන්ත්\\u200dරික සමාජවාදී'"
       ]
      },
-     "execution_count": 36,
+     "execution_count": 23,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "\"\".join(decoded_tokens)"
+    "more_complex_text[-50:]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": 24,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "'රට මුහුණ දුන් සිද්<unk>යේ'"
+       "'මේ අතර පොසොන් පොහෝ දින පණිවුඩයක් නිකුත් කරමින් ජනාධිපතිවරයා පෙන්වා දෙන්නේ මිහිඳු මහරහතන් වහන්සේ විසි.... , ...., ^^*#(&#&$^)ශ්\\u200dරී ලංකා ප්\\u200dරජාතාන්ත්\\u200dරික සමාජවාදී'"
       ]
      },
-     "execution_count": 37,
+     "execution_count": 24,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "tokeniser.decode(tokens)"
+    "preprocessing.remove_english_characters(more_complex_text)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": 25,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "['භා', 'ෂා', 'ව']"
+       "'rs ^^*#(&#&$^)ශ්රී ලංකා ප්රජාතාන්ත්රික සමාජවාදී'"
       ]
      },
-     "execution_count": 38,
+     "execution_count": 25,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "[tokeniser.token_id_to_token_map[id] for id in tokeniser(\"භාෂාව\")]"
+    "preprocessing.remove_non_printable(more_complex_text[-50:])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 26,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "['ස', 'ි', 'ං', 'හ', 'ල']"
+       "0.610738255033557"
       ]
      },
-     "execution_count": 29,
+     "execution_count": 26,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "list(\"සිංහල\")"
+    "preprocessing.get_sinhala_character_ratio(more_complex_text)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 27,
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "1.0"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "preprocessing.get_sinhala_character_ratio(\n",
+    "    preprocessing.remove_english_characters(\n",
+    "        more_complex_text\n",
+    "    )\n",
+    ")"
+   ]
   }
  ],
  "metadata": {
+  "kernelspec": {
+   "display_name": "analysis-env",
+   "language": "python",
+   "name": "analysis"
+  },
   "language_info": {
-   "name": "python"
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
   }
  },
  "nbformat": 4,
diff --git a/pyproject.toml b/pyproject.toml
index 4398668..704f290 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "sinlib"
-version = "0.0.8.5"
+version = "0.0.8.6"
 description = "Sinhala NLP Toolkit"
 authors = [
     { name = "Ransaka", email = "ransaka.ravihara@gmail.com" }
diff --git a/src/sinlib/__init__.py b/src/sinlib/__init__.py
index e48d083..129cf78 100644
--- a/src/sinlib/__init__.py
+++ b/src/sinlib/__init__.py
@@ -1,7 +1,9 @@
 from sinlib.tokenizer import Tokenizer
 from sinlib.utils import preprocessing
+from sinlib.romanize import Romanizer
 
 __all__ = [
     "Tokenizer",
-    "preprocessing"
+    "preprocessing",
+    "Romanizer"
 ]
diff --git a/src/sinlib/romanize.py b/src/sinlib/romanize.py
new file mode 100644
index 0000000..dafa4d3
--- /dev/null
+++ b/src/sinlib/romanize.py
@@ -0,0 +1,43 @@
+from .utils.preprocessing import load_char_mapper
+from .tokenizer import Tokenizer
+from .utils.preprocessing import DEFAULT_VOCAB_MAP_FP, CHAR_MAPPER_FP
+from .utils.chars import ALL_SINHALA_CHARACTERS, NUBERS_AND_PUNKTS
+from .utils.preprocessing import remove_non_printable
+import numpy as np
+
+
+class Romanizer:
+    """Romanize Sinhala text with a character map and a pretrained tokenizer."""
+
+    def __init__(self, char_mapper_fp: str, tokenizer_vocab_path: str):
+        """Load both resources; passing ``None`` selects the default path."""
+        if char_mapper_fp is None:
+            char_mapper_fp = CHAR_MAPPER_FP
+        if tokenizer_vocab_path is None:
+            tokenizer_vocab_path = DEFAULT_VOCAB_MAP_FP
+        self.char_mapper = load_char_mapper(char_mapper_fp)
+        self.tokenizer = Tokenizer()
+        self.tokenizer.load_from_pretrained(tokenizer_vocab_path)
+
+    def __call__(self, text):
+        """Return ``text`` with its Sinhala words replaced by romanized forms."""
+        return self.__romanize(text)
+
+    def __romanize(self, text: str):
+        text = remove_non_printable(text)
+        # Build the lookup set once instead of concatenating lists per character.
+        allowed = set(ALL_SINHALA_CHARACTERS) | set(NUBERS_AND_PUNKTS) | {" "}
+        chars = np.array(list(text))
+        sinhala_mask = [ch in allowed for ch in chars]
+        # Keep only Sinhala characters, NUBERS_AND_PUNKTS members and spaces.
+        sinhala_text = "".join(chars[sinhala_mask]).strip()
+        encodings = self.tokenizer(sinhala_text)
+        tokens = [self.tokenizer.token_id_to_token_map[c] for c in encodings]
+        # Tokens without a mapping (e.g. the unknown token) pass through as-is;
+        # the previous ``None`` default made ``"".join`` raise TypeError.
+        romanized_sinhala = "".join(self.char_mapper.get(tok, tok) for tok in tokens)
+        # Pair Sinhala words with their romanized forms positionally, then
+        # substitute word by word; words with no pairing are kept unchanged.
+        word_2_word_mapping = dict(zip(sinhala_text.split(), romanized_sinhala.split()))
+        romanized_text = [word_2_word_mapping.get(word, word) for word in text.split()]
+        return " ".join(romanized_text)
diff --git a/src/sinlib/tokenizer.py b/src/sinlib/tokenizer.py
index f2436b9..b5bc0ee 100644
--- a/src/sinlib/tokenizer.py
+++ b/src/sinlib/tokenizer.py
@@ -1,26 +1,96 @@
+import json
+import warnings
+from pathlib import Path
 import concurrent.futures
-from .utils.preprocessing import process_text
+from .utils.preprocessing import process_text, load_default_vocab_map
+
 
 class Tokenizer:
     def __init__(self):
+        self.unknown_token_id = None
+        self.token_id_to_token_map = None
+        self.vocab_map = None
         self.unknown_token = "<unk>"
         self.tokenized_chars = []
         self.unique_chars = []
-    
-    def __encode(self, text):
+
+    def __encode(self, text) -> list:
         processed_text = self.__process_text(text)
-        encoded_text = [self.vocab_map.get(char, self.unknown_token_id) for char in processed_text]
+        encoded_text = [
+            self.vocab_map.get(char, self.unknown_token_id) for char in processed_text
+        ]
         return encoded_text
-    
-    def __call__(self, text):
+
+    def __call__(self, text) -> list:
+        """
+        Encode the given text into a list of tokens.
+
+        Parameters
+        ----------
+        text : str
+            Text to be encoded.
+
+        Returns
+        -------
+        encoded_tokens : list of int
+            List of tokens representing the encoded text.
+
+        Examples
+        --------
+        >>> from sinlib import Tokenizer
+        >>> corpus = [...]
+        >>> tokenizer = Tokenizer()
+        >>> tokenizer.train(corpus)
+        >>> tokenizer("මම ගෙදර ගියා")
+        [2041, 2041, 942, 965, 624, 909, 942, 54, 1960]
+        """
         return self.__encode(text)
-    
-    def decode(self, ids):
-        return "".join([self.token_id_to_token_map.get(token,self.unknown_token) for token in ids])
 
-    def train(self, text_list):
-        self.__train_chracter_level_tokenizer(text_list)
-    
+    def decode(self, ids) -> str:
+        """
+        Decode a list of token IDs into a string.
+
+        Parameters
+        ----------
+        ids : list of int
+            List of token IDs to be decoded.
+
+        Returns
+        -------
+        decoded_text : str
+            The decoded text string.
+
+        Examples
+        --------
+        >>> from sinlib import Tokenizer
+        >>> tokenizer = Tokenizer()
+        >>> tokenizer.train([...])
+        >>> encoded_tokens = [2041, 2041, 942, 965, 624, 909, 942, 54, 1960]
+        >>> tokenizer.decode(encoded_tokens)
+        'මම ගෙදර ගියා'
+        """
+        return "".join(
+            [self.token_id_to_token_map.get(token, self.unknown_token) for token in ids]
+        )
+
+    def train(self, text_list) -> None:
+        """
+        Train the tokenizer on a list of text strings.
+
+        Parameters
+        ----------
+        text_list : list of str
+            List of text strings to be used for training the tokenizer.
+
+        Examples
+        --------
+        >>> from sinlib import Tokenizer
+        >>> corpus = [...]
+        >>> tokenizer = Tokenizer()
+        >>> tokenizer.train(corpus)
+        """
+        self.__train_character_level_tokenizer(text_list)
+
     def __len__(self):
         return len(self.vocab_map)
 
@@ -28,12 +98,64 @@ def __len__(self):
     def __process_text(t):
         return process_text(t)
 
-    def __train_chracter_level_tokenizer(self, text_list):
+    def __train_character_level_tokenizer(self, text_list):
         with concurrent.futures.ThreadPoolExecutor() as executor:
             results = list(executor.map(self.__process_text, text_list))
             self.tokenized_chars = [char for sublist in results for char in sublist]
         self.unique_chars = set(self.tokenized_chars)
-        self.vocab_map = dict(zip(self.unique_chars,range(len(self.unique_chars))))
+        self.vocab_map = dict(zip(self.unique_chars, range(len(self.unique_chars))))
         self.vocab_map[self.unknown_token] = len(self.vocab_map)
         self.unknown_token_id = self.vocab_map[self.unknown_token]
-        self.token_id_to_token_map = {value:key for key,value in self.vocab_map.items()}
\ No newline at end of file
+        self.token_id_to_token_map = {
+            value: key for key, value in self.vocab_map.items()
+        }
+
+    def load_from_pretrained(self, file_path: str) -> "Tokenizer":
+        """
+        Load the vocabulary map from a pre-trained file.
+
+        Parameters
+        ----------
+        file_path : str
+            Path to the file containing the pre-trained vocabulary map.
+
+        Returns
+        -------
+        Tokenizer
+            This instance, so the call can be chained after the constructor.
+
+        Warns
+        -----
+        UserWarning
+            If the file is not found at the specified path, a default vocabulary map is loaded and a warning is issued.
+
+        Examples
+        --------
+        >>> from sinlib import Tokenizer
+        >>> tokenizer = Tokenizer()
+        >>> tokenizer = tokenizer.load_from_pretrained("pretrained_vocab.json")
+        """
+        if Path(file_path).is_file():
+            # save_tokenizer writes vocab.json as UTF-8, so read it back the same way.
+            with open(file_path, "r", encoding="utf-8") as f:
+                self.vocab_map = json.load(f)
+        else:
+            msg = "File not found at the specified path. Loaded default vocab map."
+            warnings.warn(msg, UserWarning)
+            self.vocab_map = load_default_vocab_map()
+
+        self.token_id_to_token_map = {
+            value: key for key, value in self.vocab_map.items()
+        }
+        self.unknown_token_id = self.vocab_map[self.unknown_token]
+        return self
+
+    def save_tokenizer(self, save_path: str):
+        save_path = Path(save_path)
+        configurations = {"unknown_token": self.unknown_token}
+
+        with open(save_path / "vocab.json", "w", encoding="utf-8") as file:
+            json.dump(self.vocab_map, file, ensure_ascii=False, indent=4)
+
+        with open(save_path / "config.json", "w") as file:
+            json.dump(configurations, file, indent=4)
\ No newline at end of file
diff --git a/src/sinlib/utils/chars.py b/src/sinlib/utils/chars.py
index 3dd2cb1..228b247 100644
--- a/src/sinlib/utils/chars.py
+++ b/src/sinlib/utils/chars.py
@@ -1,43 +1,188 @@
 from string import punctuation
 
-BASE_CONSONANTS = [
-    'ක', 'ඛ', 'ග', 'ඝ', 'ඞ', 'ඟ',
-    'ච', 'ඡ', 'ජ', 'ඣ', 'ඤ', 'ඦ',
-    'ට', 'ඨ', 'ඩ', 'ඪ', 'ණ', 'ඬ',
-    'ත', 'ථ', 'ද', 'ධ', 'න', 'ඳ',
-    'ප', 'ඵ', 'බ', 'භ', 'ම', 'ඹ',
-    'ය', 'ර', 'ල', 'ව',
-    'ශ', 'ෂ', 'ස', 'හ', 'ළ', 'ෆ',
+ALL_SINHALA_CHARACTERS = [
+    "ඏ",
+    "ඛ",
+    "ම",
+    "ඍ",
+    "ு",
+    "ා",
+    "ප",
+    "ඝ",
+    "ඹ",
+    "ඓ",
+    "ෑ",
+    "ෂ",
+    "ැ",
+    "ෲ",
+    "ි",
+    "ක",
+    "ණ",
+    "ධ",
+    "்",
+    "ඵ",
+    "ඞ",
+    "ජ",
+    "හ",
+    "ෝ",
+    "ඤ",
+    "ට",
+    "ඇ",
+    "ෞ",
+    "ඒ",
+    "ූ",
+    "ව",
+    "ඣ",
+    "ච",
+    "ඖ",
+    "ෘ",
+    "ු",
+    "ඳ",
+    "ඌ",
+    "ෙ",
+    "්",
+    "ඥ",
+    "ீ",
+    "ෛ",
+    "ෳ",
+    "ඔ",
+    "ආ",
+    "ළ",
+    "උ",
+    "ඟ",
+    "ඃ",
+    "ඈ",
+    "ඪ",
+    "බ",
+    "අ",
+    "ෆ",
+    "ත",
+    "ේ",
+    "ඬ",
+    "ය",
+    "ො",
+    "ශ",
+    "භ",
+    "ං",
+    "ර",
+    "ඉ",
+    "ඨ",
+    "ී",
+    "ඕ",
+    "ඡ",
+    "න",
+    "ස",
+    "ද",
+    "ඩ",
+    "ෟ",
+    "ග",
+    "එ",
+    "ඊ",
+    "ල",
+    "ථ",
 ]
 
-SAN = [
-    'ඟ', 'ඦ', 'ඬ', 'ඳ', 'ඹ'
+BASE_CONSONANTS = [
+    "ක",
+    "ඛ",
+    "ග",
+    "ඝ",
+    "ඞ",
+    "ඟ",
+    "ච",
+    "ඡ",
+    "ජ",
+    "ඣ",
+    "ඤ",
+    "ඦ",
+    "ට",
+    "ඨ",
+    "ඩ",
+    "ඪ",
+    "ණ",
+    "ඬ",
+    "ත",
+    "ථ",
+    "ද",
+    "ධ",
+    "න",
+    "ඳ",
+    "ප",
+    "ඵ",
+    "බ",
+    "භ",
+    "ම",
+    "ඹ",
+    "ය",
+    "ර",
+    "ල",
+    "ව",
+    "ශ",
+    "ෂ",
+    "ස",
+    "හ",
+    "ළ",
+    "ෆ",
 ]
 
-SAN_MAPPING = {'ඟ': 'ංග', 'ඦ': 'ඤ්ජ', 'ඬ': 'ණ්ඩ', 'ඳ': 'න්ද', 'ඹ': 'ම්බ'}
+SAN = ["ඟ", "ඦ", "ඬ", "ඳ", "ඹ"]
+
+SAN_MAPPING = {"ඟ": "ංග", "ඦ": "ඤ්ජ", "ඬ": "ණ්ඩ", "ඳ": "න්ද", "ඹ": "ම්බ"}
 REVERSE_SAN_MAPPING = {d: v for v, d in SAN_MAPPING.items()}
 
-CONSONANTS = [c + '්' for c in BASE_CONSONANTS]
+CONSONANTS = [c + "්" for c in BASE_CONSONANTS]
 
 VOWELS = [
-    'අ', 'ආ', 'ඇ', 'ඈ', 'ඉ', 'ඊ', 'උ', 'ඌ',
-    'ඍ', 'ඎ', 'එ', 'ඒ', 'ඓ', 'ඔ', 'ඕ', 'ඖ',
-    'අං', 'අඃ',
+    "අ",
+    "ආ",
+    "ඇ",
+    "ඈ",
+    "ඉ",
+    "ඊ",
+    "උ",
+    "ඌ",
+    "ඍ",
+    "ඎ",
+    "එ",
+    "ඒ",
+    "ඓ",
+    "ඔ",
+    "ඕ",
+    "ඖ",
+    "අං",
+    "අඃ",
 ]
 
 VOWEL_DIACRITICS = [
-    '', 'ා', 'ැ', 'ෑ', 'ි', 'ී', 'ු', 'ූ', 'ෘ',
-    'ෲ', 'ෙ', 'ේ', 'ෛ', 'ො', 'ෝ', 'ෞ',
-    'ං', 'ඃ', '්', 'ෳ'
+    "",
+    "ා",
+    "ැ",
+    "ෑ",
+    "ි",
+    "ී",
+    "ු",
+    "ූ",
+    "ෘ",
+    "ෲ",
+    "ෙ",
+    "ේ",
+    "ෛ",
+    "ො",
+    "ෝ",
+    "ෞ",
+    "ං",
+    "ඃ",
+    "්",
+    "ෳ",
 ]
 
 LONG_TO_SHORT_VOWEL_DIACRITICS_MAPPING = {
-    '': 'ා',
-    'ෑ': 'ැ',
-    'ී': 'ි',
-    'ූ': 'ු',
-    'ේ': 'ෙ',
-    'ෝ': 'ො'
+    "": "ා",
+    "ෑ": "ැ",
+    "ී": "ි",
+    "ූ": "ු",
+    "ේ": "ෙ",
+    "ෝ": "ො",
 }
 
 DIACRITICS_MAPPING = {v: d for v, d in zip(VOWELS, VOWEL_DIACRITICS)}
@@ -45,49 +190,111 @@
 REVERSE_DIACRITICS_MAPPING = {d: v for v, d in zip(VOWELS, VOWEL_DIACRITICS)}
 
 CONJUNCT_CONSONANTS = [
-    'ක්ර', 'ඛ්ර', 'ග්ර', 'ඝ්ර', 'ඞ්ර', 'ඟ්ර',
-    'ක්ය', 'ඛ්ය', 'ග්ය', 'ඝ්ය', 'ඞ්ය', 'ඟ්ය',
-    'ක්ෂ', '෴',
+    "ක්ර",
+    "ඛ්ර",
+    "ග්ර",
+    "ඝ්ර",
+    "ඞ්ර",
+    "ඟ්ර",
+    "ක්ය",
+    "ඛ්ය",
+    "ග්ය",
+    "ඝ්ය",
+    "ඞ්ය",
+    "ඟ්ය",
+    "ක්ෂ",
+    "෴",
 ]
 
 NUMERALS = [
-    '𑇡', '𑇢', '𑇣', '𑇤', '𑇥', '𑇦', '𑇧', '𑇨', '𑇩', '𑇪',
-    '𑇫', '𑇬', '𑇭', '𑇮', '𑇯', '𑇰', '𑇱', '𑇲', '𑇳', '𑇴',
+    "𑇡",
+    "𑇢",
+    "𑇣",
+    "𑇤",
+    "𑇥",
+    "𑇦",
+    "𑇧",
+    "𑇨",
+    "𑇩",
+    "𑇪",
+    "𑇫",
+    "𑇬",
+    "𑇭",
+    "𑇮",
+    "𑇯",
+    "𑇰",
+    "𑇱",
+    "𑇲",
+    "𑇳",
+    "𑇴",
 ]
 
 
 GOSHA_LETTERS = [
-    'අ', 'ආ', 'ඇ', 'ඈ', 'ඉ', 'ඊ', 'උ', 'ඌ',
-    'ඍ', 'ඎ', 'එ', 'ඒ', 'ඓ', 'ඔ', 'ඕ', 'ඖ',
-    'අං', 'අඃ',
-    'ග', 'ඝ', 'ඞ',
-    'ජ', 'ඣ', 'ඤ',
-    'ඩ', 'ඪ', 'ණ',
-    'ද', 'ධ', 'න',
-    'බ', 'භ', 'ම',
-    'ය', 'ර', 'ල', 'ව',
-    'හ'
+    "අ",
+    "ආ",
+    "ඇ",
+    "ඈ",
+    "ඉ",
+    "ඊ",
+    "උ",
+    "ඌ",
+    "ඍ",
+    "ඎ",
+    "එ",
+    "ඒ",
+    "ඓ",
+    "ඔ",
+    "ඕ",
+    "ඖ",
+    "අං",
+    "අඃ",
+    "ග",
+    "ඝ",
+    "ඞ",
+    "ජ",
+    "ඣ",
+    "ඤ",
+    "ඩ",
+    "ඪ",
+    "ණ",
+    "ද",
+    "ධ",
+    "න",
+    "බ",
+    "භ",
+    "ම",
+    "ය",
+    "ර",
+    "ල",
+    "ව",
+    "හ",
 ]
 
 AGOSHA_LETTERS = [
-    'ක්', 'ඛ්',
-    'ච්', 'ඡ්',
-    'ට්', 'ඨ්',
-    'ත්', 'ථ්',
-    'ප්', 'ඵ්',
+    "ක්",
+    "ඛ්",
+    "ච්",
+    "ඡ්",
+    "ට්",
+    "ඨ්",
+    "ත්",
+    "ථ්",
+    "ප්",
+    "ඵ්",
 ]
 
 AGOSHA_TO_GOSHA_MAPPING = {
-    'ක්': 'ග්',
-    'ඛ්': 'ඝ්',
-    'ච්': 'ජ්',
-    'ඡ්': 'ඣ්',
-    'ට්': 'ඩ්',
-    'ඨ්': 'ඪ්',
-    'ත්': 'ද්',
-    'ථ්': 'ධ්',
-    'ප්': 'බ්',
-    'ඵ්': 'භ්',
+    "ක්": "ග්",
+    "ඛ්": "ඝ්",
+    "ච්": "ජ්",
+    "ඡ්": "ඣ්",
+    "ට්": "ඩ්",
+    "ඨ්": "ඪ්",
+    "ත්": "ද්",
+    "ථ්": "ධ්",
+    "ප්": "බ්",
+    "ඵ්": "භ්",
 }
 PUNKT = set(punctuation)
 NUMBERS = set("1234567890")
diff --git a/src/sinlib/utils/preprocessing.py b/src/sinlib/utils/preprocessing.py
index b2be711..3ce505f 100644
--- a/src/sinlib/utils/preprocessing.py
+++ b/src/sinlib/utils/preprocessing.py
@@ -2,13 +2,32 @@
 import multiprocessing
 import re
 from .chars import VOWEL_DIACRITICS, NUBERS_AND_PUNKTS, ALL_LETTERS
-import numpy as np
-import os
+import json
+from pathlib import Path
+import warnings
 
-# file_path = os.path.join(os.path.dirname(__file__), '../data', 'sinhala_chars_with_special_chars.txt')
+DEFAULT_VOCAB_MAP_FP = "../data/vocab_map.json"
+CHAR_MAPPER_FP = "../data/char_map.json"
 
-# with open(file_path,'r') as f:
-#     SINHALA_CHARS_WITH_SPECIAL_CHARS = f.read().split("\n")
+
+def load_char_mapper(char_mapper_fp):
+    """Load the character map JSON, falling back to ``CHAR_MAPPER_FP`` if missing."""
+    if not Path(char_mapper_fp).is_file():
+        warnings.warn(
+            "File not found at the specified path. Loaded default char map.",
+            UserWarning,
+        )
+        # Fall back to the default mapping path.
+        char_mapper_fp = CHAR_MAPPER_FP
+    # Decode explicitly as UTF-8 so the result does not depend on the locale.
+    with open(char_mapper_fp, "r", encoding="utf-8") as f:
+        return json.load(f)
+
+
+def load_default_vocab_map():
+    """Return the vocabulary map parsed from ``DEFAULT_VOCAB_MAP_FP`` (UTF-8 JSON)."""
+    with open(DEFAULT_VOCAB_MAP_FP, "r", encoding="utf-8") as f:
+        return json.load(f)
 
 
 def remove_non_printable(input_string):
@@ -69,7 +88,38 @@ def process_text(t):
     return tokenized_chars
 
 
-def process_text_with_token_counts(t:str, consider_special_character_as_sinhala:bool, ignore_non_printable:bool):
+def process_text_with_token_counts(
+    t: str, consider_special_character_as_sinhala: bool, ignore_non_printable: bool
+):
+    """
+    Process the given text, tokenizing it and counting the tokens.
+
+    Parameters
+    ----------
+    t : str
+        The text to be processed.
+    consider_special_character_as_sinhala : bool
+        If True, special characters will be considered as Sinhala characters.
+    ignore_non_printable : bool
+        If True, non-printable characters will be removed from the text.
+
+    Returns
+    -------
+    tokenized_chars : list of str
+        List of tokenized characters from the text.
+    token_counts : int
+        Total count of tokens in the text.
+
+    Examples
+    --------
+    >>> from sinlib.utils.preprocessing import process_text_with_token_counts
+    >>> text = "මම ගෙදර ගියා."
+    >>> tokenized_chars, token_counts = process_text_with_token_counts(text, True, True)
+    >>> print(tokenized_chars)
+    ['ම', 'ම', ' ', 'ගෙ', 'ද', 'ර', ' ', 'ගි', 'යා', '.']
+    >>> print(token_counts)
+    10
+    """
     if ignore_non_printable:
         t = remove_non_printable(t)
 
@@ -92,25 +142,62 @@ def process_text_with_token_counts(t:str, consider_special_character_as_sinhala:
                 tokenized_chars.append(char + t[i + 1])
             else:
                 tokenized_chars.append(char)
-
         else:
             tokenized_chars.append(char)
 
     return tokenized_chars, token_counts
 
 
-def get_sinhala_character_ratio(text, consider_special_character_as_sinhala:bool=True, ignore_non_printable:bool=True):
-    """Retuning sinhala character ratio for given text string for given settings. Expects optional two parameters.
-    consider_special_character_as_sinhala: if this set to true all numbers and special characters will consider as sinhala.
-    ignore_non_printable: if this set to true non printables will remove before start processing
+def get_sinhala_character_ratio(
+    text,
+    consider_special_character_as_sinhala: bool = True,
+    ignore_non_printable: bool = True,
+):
+    """
+    Calculate the ratio of Sinhala characters in the given text.
+
+    Parameters
+    ----------
+    text : str or list of str
+        The text or list of text strings to be processed.
+    consider_special_character_as_sinhala : bool, default=True
+        If True, numbers and special characters will be considered as Sinhala characters.
+    ignore_non_printable : bool, default=True
+        If True, non-printable characters will be removed before processing.
+
+    Returns
+    -------
+    ratio : float or list of float
+        The ratio of Sinhala characters in the text. If the input is a list, returns a list of ratios for each text string.
+
+    Examples
+    --------
+    >>> from sinlib.utils.preprocessing import get_sinhala_character_ratio
+    >>> text = "මම ගෙදර ගියා."
+    >>> ratio = get_sinhala_character_ratio(text, True, True)
+    >>> print(ratio)
+    1.0
+
+    >>> texts = ["මම ගෙදර ගියා.", "This is an example."]
+    >>> ratios = get_sinhala_character_ratio(texts, False, True)
+    >>> print(ratios)
+    [0.875, 0.0]
     """
     if isinstance(text, str):
-        tokenized_text, sinhala_token_count = process_text_with_token_counts(text,consider_special_character_as_sinhala,ignore_non_printable=ignore_non_printable)
+        tokenized_text, sinhala_token_count = process_text_with_token_counts(
+            text,
+            consider_special_character_as_sinhala,
+            ignore_non_printable=ignore_non_printable,
+        )
         tokenized_text = [tok for tok in tokenized_text if tok != " "]
         return sinhala_token_count / len(tokenized_text)
     elif isinstance(text, list):
         pool = multiprocessing.Pool()
-        partial_process_text = partial(process_text_with_token_counts, consider_special_character_as_sinhala=consider_special_character_as_sinhala, ignore_non_printable=ignore_non_printable)
+        partial_process_text = partial(
+            process_text_with_token_counts,
+            consider_special_character_as_sinhala=consider_special_character_as_sinhala,
+            ignore_non_printable=ignore_non_printable,
+        )
         results = pool.map(partial_process_text, text)
         pool.close()
         pool.join()