Added text romanization logic and added proper docstrings

Ransaka · Jun 21, 2024 · a5f4010 · a5f4010
1 parent b995ba4
commit a5f4010
Show file tree

Hide file tree

Showing 16 changed files with 3,119 additions and 193 deletions.
diff --git a/.idea/.gitignore b/.idea/.gitignore
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
diff --git a/.idea/misc.xml b/.idea/misc.xml
diff --git a/.idea/modules.xml b/.idea/modules.xml
diff --git a/.idea/sinlib.iml b/.idea/sinlib.iml
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-# Sinlib (Buggy alpha version)
+# Sinlib
 
 ![Alt text](sinlib.png)
 
@@ -29,14 +29,27 @@ encoding = tokenizer("මේ අතර, පෙබරවාරි මාසයේ
 [tokenizer.token_id_to_token_map[id] for id in encoding]
 ['මේ', ' ', 'අ', 'ත', 'ර', ',', ' ', 'පෙ', 'බ', 'ර', 'වා', 'රි', ' ', 'මා', 'ස', 'යේ', ' ', 'ප', 'ළ', 'මු']
 ```
+
 02. Preprocessor
    ```python
 sent = ['මෙය සිංහල වාක්‍යක්', 'මෙය සිංහල වාක්‍යක් සමග english character කීපයක්','This is complete english sentence']
 print(sent)
-['මෙය සිංහල වාක්\u200dයක්', 'මෙය සිංහල වාක්\u200dයක් සමග english character කීපයක්', 'This is complete english sentence']
+#['මෙය සිංහල වාක්\u200dයක්', 'මෙය සිංහල වාක්\u200dයක් සමග english character කීපයක්', 'This is complete english sentence']
 
 from sinlib.preprocessing import get_sinhala_character_ratio
 
 get_sinhala_character_ratio(sent)
-[0.9, 0.46875, 0.0]
+#[0.9, 0.46875, 0.0]
+```
+
+03. Sinhala Romanizer
+   ```python
+texts = ["hello, මේ මාසයේ ගත වූ දින 15ක කාලය තුළ කොළඹ නගරය ආශ්‍රිත ව", "මෑතකාලීන ව රට මුහුණ දුන් අභියෝගාත්මකම ආර්ථික කාරණාව ණය ප්‍රතිව්‍යුගතකරණය බව මුදල් රාජ්‍ය අමාත්‍ය ආචාර්ය රංජිත් සියඹ$$$ mahatha see more****"]
+
+from sinlib import Romanizer
+
+romanizer = Romanizer(char_mapper_fp = None, tokenizer_vocab_path = None)
+romanizer(texts)
+#['hello, me masaye gatha wu dina 15ka kalaya thula kolaba nagaraya ashritha wa',
+# 'methakaleena wa rata muhuna dun abhiyogathmakama arthika karanawa naya prathiwyugathakaranaya bawa mudal rajya amathya acharya ranjith siyaba$$$ mahatha see more****']
 ```
diff --git a/data/char_map.json b/data/char_map.json