Releasing new version with new decoding method
ranzaka committed Sep 3, 2024
1 parent 82b301a commit a9b2d80
Showing 4 changed files with 98 additions and 12 deletions.
.gitignore: 3 additions, 0 deletions

@@ -5,3 +5,6 @@ src/sinhala_tokenizers/.DS_Store
.DS_Store
examples/config.json
examples/vocab.json
+experiments/experiments.ipynb
+experiments/test.py
+experiments/*

pyproject.toml: 3 additions, 2 deletions

@@ -1,13 +1,14 @@
[project]
name = "sinlib"
-version = "0.1.3"
+version = "0.1.4"
description = "Sinhala NLP Toolkit"
authors = [
    { name = "Ransaka", email = "ransaka.ravihara@gmail.com" }
]
dependencies = [
    "numpy",
-    "torch"
+    "torch",
+    "tqdm"
]
license = { file = "LICENSE" }
readme = "README.md"

requirements.txt: 1 addition, 0 deletions

@@ -1,2 +1,3 @@
numpy
torch
+tqdm

src/sinlib/tokenizer.py: 91 additions, 10 deletions

@@ -3,11 +3,13 @@
from pathlib import Path
import concurrent.futures
from .utils.preprocessing import process_text, load_default_vocab_map
+from tqdm import tqdm


class Tokenizer:
    def __init__(
-        self, max_length: int, unknown_token: str = "<unk>", pad_token: str = "<pad>"
+        self, max_length: int, unknown_token: str = "<|unk|>", pad_token: str = "<|pad|>", end_of_text_token: str = "<|endoftext|>"
    ):
        self.unknown_token_id = None
        self.token_id_to_token_map = None

@@ -19,12 +21,44 @@ def __init__(
        self.special_tokens = [self.unknown_token, self.pad_token]
        self.max_length = max_length
        self.pad_token_id = None
+        self.end_of_text_token = end_of_text_token
+        self.end_of_text_token_id = None

-    def __encode(self, text, truncate_and_pad: bool) -> list:
-        processed_text = self.__process_text(text)
-        text_encodings = [
-            self.vocab_map.get(char, self.unknown_token_id) for char in processed_text
-        ]
+    def __encode(self, text, truncate_and_pad: bool, allowed_special_tokens: list = []) -> list:
+        """
+        Encode the given text into a list of tokens.
+
+        Parameters
+        ----------
+        text : str
+            Text to be encoded.
+        truncate_and_pad: bool
+            Set as True if you need to truncate/pad encodings, False otherwise.
+        """
+        allowed_special_tokens = [self.vocab_map[tok] for tok in allowed_special_tokens]
+        text_encodings = []
+        parts = text.split(self.end_of_text_token)
+        if len(parts) > 1:
+            for part in parts:
+                processed_text = self.__process_text(part)
+                for token in processed_text:
+                    if token in self.special_tokens:
+                        if token in allowed_special_tokens:
+                            text_encodings.append(self.vocab_map[token])
+                        else:
+                            continue
+                    else:
+                        text_encodings.append(self.vocab_map.get(token, self.unknown_token_id))
+                text_encodings.append(self.end_of_text_token_id)
+        else:
+            processed_text = self.__process_text(text)
+            for token in processed_text:
+                if token in self.special_tokens:
+                    if token in allowed_special_tokens:
+                        text_encodings.append(self.vocab_map[token])
+                    else:
+                        continue
+                else:
+                    text_encodings.append(self.vocab_map.get(token, self.unknown_token_id))

        if truncate_and_pad:
            return self.pad_or_truncate(
                sequence=text_encodings,

@@ -43,7 +77,7 @@ def pad_or_truncate(sequence, max_length, padding_value):
        else:
            return sequence

-    def __call__(self, text, truncate_and_pad: bool = True) -> list:
+    def __call__(self, text, truncate_and_pad: bool = True, allowed_special_tokens: list = []) -> list:
        """
        Encode the given text into a list of tokens.

@@ -68,7 +102,7 @@ def __call__(self, text, truncate_and_pad: bool = True) -> list:
        >>> tokenizer("මම ගෙදර ගියා")
        [2041, 2041, 942, 965, 624, 909, 942, 54, 1960]
        """
-        return self.__encode(text, truncate_and_pad=truncate_and_pad)
+        return self.__encode(text, truncate_and_pad=truncate_and_pad, allowed_special_tokens=allowed_special_tokens)

    def decode(self, ids, skip_special_tokens: bool = False) -> str:
        """

@@ -112,7 +146,7 @@ def decode(self, ids, skip_special_tokens: bool = False) -> str:
            ]
        )

-    def train(self, text_list) -> None:
+    def train(self, text_list, memory_efficient: bool = False, chunk_size: int = 1000) -> None:
        """
        Train the tokenizer on a list of text strings.

@@ -128,7 +162,10 @@ def train(self, text_list) -> None:
        >>> tokenizer = Tokenizer()
        >>> tokenizer.train(corpus)
        """
-        self.__train_character_level_tokenizer(text_list)
+        if memory_efficient:
+            self.__train_character_level_tokenizer_memory_efficient(text_list, chunk_size)
+        else:
+            self.__train_character_level_tokenizer(text_list)

    def __len__(self):
        return len(self.vocab_map)

@@ -140,6 +177,43 @@ def vocab_size(self):
    @staticmethod
    def __process_text(t):
        return process_text(t)

+    def __train_character_level_tokenizer_memory_efficient(self, text_list, chunk_size):
+        """
+        Train the tokenizer on a list of text strings in a memory-efficient manner.
+        This method processes the text list in chunks, updating the vocabulary
+        incrementally without storing all tokenized characters in memory.
+        Parameters
+        ----------
+        text_list : list of str
+            List of text strings to be used for training the tokenizer.
+        """
+        unique_chars = set()
+
+        for i in tqdm(range(0, len(text_list), chunk_size), desc="Training tokenizer", total=len(text_list) // chunk_size):
+            chunk = text_list[i:i+chunk_size]
+
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                results = list(executor.map(self.__process_text, chunk))
+
+            for sublist in results:
+                unique_chars.update(sublist)
+
+        self.unique_chars = unique_chars
+        self.vocab_map = {char: i for i, char in enumerate(self.unique_chars)}
+
+        # Add special tokens
+        self.vocab_map[self.unknown_token] = len(self.vocab_map)
+        self.vocab_map[self.pad_token] = len(self.vocab_map)
+        self.vocab_map[self.end_of_text_token] = len(self.vocab_map)
+
+        self.unknown_token_id = self.vocab_map[self.unknown_token]
+        self.pad_token_id = self.vocab_map[self.pad_token]
+        self.end_of_text_token_id = self.vocab_map[self.end_of_text_token]
+
+        self.token_id_to_token_map = {value: key for key, value in self.vocab_map.items()}
+
    def __train_character_level_tokenizer(self, text_list):
        with concurrent.futures.ThreadPoolExecutor() as executor:
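
The method added above is what train(..., memory_efficient=True) dispatches to: it walks the corpus in chunks, keeps only the running set of unique characters, and builds the vocab map once at the end. A minimal sketch of driving it through the public train API, under the same assumed import as before:

# Usage sketch (not part of the diff); import path and corpus are assumptions.
from sinlib import Tokenizer

# Synthetic corpus large enough for chunking to matter; real usage would load
# actual Sinhala text here.
corpus = ["මම ගෙදර ගියා"] * 50_000

tokenizer = Tokenizer(max_length=128)

# Processes the corpus 1,000 texts at a time (the default chunk_size) with a
# tqdm progress bar, instead of mapping __process_text over the whole list at once.
tokenizer.train(corpus, memory_efficient=True, chunk_size=1_000)

print(len(tokenizer))  # vocab size, including <|unk|>, <|pad|> and <|endoftext|>

A smaller chunk_size lowers peak memory and gives finer progress updates, at the cost of spinning up a fresh ThreadPoolExecutor per chunk.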

@@ -149,8 +223,10 @@ def __train_character_level_tokenizer(self, text_list):
        self.vocab_map = dict(zip(self.unique_chars, range(len(self.unique_chars))))
        self.vocab_map[self.unknown_token] = len(self.vocab_map)
        self.vocab_map[self.pad_token] = len(self.vocab_map)
+        self.vocab_map[self.end_of_text_token] = len(self.vocab_map)
        self.unknown_token_id = self.vocab_map[self.unknown_token]
        self.pad_token_id = self.vocab_map[self.pad_token]
+        self.end_of_text_token_id = self.vocab_map[self.end_of_text_token]
        self.token_id_to_token_map = {
            value: key for key, value in self.vocab_map.items()
        }

@@ -190,6 +266,8 @@ def load_from_pretrained(self, file_path: str) -> None:
            self.unknown_token_id = configurations["unknown_token_id"]
            self.pad_token_id = configurations["pad_token_id"]
            self.max_length = configurations["max_length"]
+            self.end_of_text_token = configurations["end_of_text_token"]
+            self.end_of_text_token_id = configurations["end_of_text_token_id"]
        else:
            warnings.warn(
                "File not found at the specified path. Loaded default vocab map.",

@@ -202,6 +280,7 @@ def load_from_pretrained(self, file_path: str) -> None:
            }
            self.unknown_token_id = self.vocab_map[self.unknown_token]
            self.pad_token_id = self.vocab_map[self.pad_token]
+            self.end_of_text_token_id = self.vocab_map[self.end_of_text_token]
        return self

    def save_tokenizer(self, save_path: str):

@@ -212,6 +291,8 @@ def save_tokenizer(self, save_path: str):
            "unknown_token_id": self.unknown_token_id,
            "pad_token_id": self.pad_token_id,
            "max_length": self.max_length,
+            "end_of_text_token": self.end_of_text_token,
+            "end_of_text_token_id": self.end_of_text_token_id,
        }

        with open(save_path / "vocab.json", "w", encoding="utf-8") as file:
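
The last three hunks thread the new token through persistence: save_tokenizer writes end_of_text_token and end_of_text_token_id into the saved configuration, and load_from_pretrained restores them, falling back to the default vocab map when the file is missing. A round-trip sketch follows; the import path, the save directory, and the guess that load_from_pretrained takes the path of the saved vocab.json are assumptions this diff does not confirm.

# Round-trip sketch (not part of the diff); paths and import are assumptions.
from pathlib import Path

from sinlib import Tokenizer

tokenizer = Tokenizer(max_length=32)
tokenizer.train(["මම ගෙදර ගියා"])

save_dir = Path("sinlib_tokenizer")  # hypothetical output directory
save_dir.mkdir(exist_ok=True)

# The visible hunk writes <save_path>/vocab.json, with the two new end-of-text
# fields stored alongside unknown_token_id, pad_token_id and max_length.
tokenizer.save_tokenizer(save_dir)

# load_from_pretrained returns self, so construction and loading can be chained.
restored = Tokenizer(max_length=32).load_from_pretrained(str(save_dir / "vocab.json"))
print(restored.end_of_text_token, restored.end_of_text_token_id)

One consequence of the direct configurations["end_of_text_token"] lookup added above: vocab files written by sinlib 0.1.3 will not contain the new keys, so loading them with 0.1.4 would presumably raise a KeyError rather than fall back to a default.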
