From a9b2d80186c4219269894ed63aff31a482dfc989 Mon Sep 17 00:00:00 2001
From: ranzaka <145642022+ranzaka@users.noreply.github.com>
Date: Tue, 3 Sep 2024 11:32:39 +0530
Subject: [PATCH] releasing new version with new decoding method

---
 .gitignore              |   3 ++
 pyproject.toml          |   5 +-
 requirements.txt        |   1 +
 src/sinlib/tokenizer.py | 101 ++++++++++++++++++++++++++++++++++++----
 4 files changed, 98 insertions(+), 12 deletions(-)

diff --git a/.gitignore b/.gitignore
index b1e9670..8bed4db 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,6 @@ src/sinhala_tokenizers/.DS_Store
 .DS_Store
 examples/config.json
 examples/vocab.json
+experiments/experiments.ipynb
+experiments/test.py
+experiments/*
diff --git a/pyproject.toml b/pyproject.toml
index 555500b..8dd5447 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,13 +1,14 @@
 [project]
 name = "sinlib"
-version = "0.1.3"
+version = "0.1.4"
 description = "Sinhala NLP Toolkit"
 authors = [
     { name = "Ransaka", email = "ransaka.ravihara@gmail.com" }
 ]
 dependencies = [
     "numpy",
-    "torch"
+    "torch",
+    "tqdm"
 ]
 license = { file = "LICENSE" }
 readme = "README.md"
diff --git a/requirements.txt b/requirements.txt
index 3b7480f..3b55e1b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,3 @@
 numpy
 torch
+tqdm
diff --git a/src/sinlib/tokenizer.py b/src/sinlib/tokenizer.py
index 80020c7..4d85203 100644
--- a/src/sinlib/tokenizer.py
+++ b/src/sinlib/tokenizer.py
@@ -3,11 +3,13 @@
 from pathlib import Path
 import concurrent.futures

 from .utils.preprocessing import process_text, load_default_vocab_map
+from tqdm import tqdm
+

 class Tokenizer:
     def __init__(
-        self, max_length: int, unknown_token: str = "", pad_token: str = ""
+        self, max_length: int, unknown_token: str = "<|unk|>", pad_token: str = "<|pad|>", end_of_text_token: str = "<|endoftext|>"
     ):
         self.unknown_token_id = None
         self.token_id_to_token_map = None
@@ -19,12 +21,45 @@ def __init__(
         self.special_tokens = [self.unknown_token, self.pad_token]
         self.max_length = max_length
         self.pad_token_id = None
+        self.end_of_text_token = end_of_text_token
+        self.end_of_text_token_id = None
+
+    def __encode(self, text, truncate_and_pad: bool, allowed_special_tokens: list = []) -> list:
+        """
+        Encode the given text into a list of tokens.

-    def __encode(self, text, truncate_and_pad: bool) -> list:
-        processed_text = self.__process_text(text)
-        text_encodings = [
-            self.vocab_map.get(char, self.unknown_token_id) for char in processed_text
-        ]
+
+        Parameters
+        ----------
+        text : str
+            Text to be encoded.
+        truncate_and_pad : bool
+            Set to True to truncate/pad the encodings to max_length, False otherwise.
+        """
+        text_encodings = []
+        parts = text.split(self.end_of_text_token)
+        if len(parts) > 1:
+            for part in parts:
+                processed_text = self.__process_text(part)
+                for token in processed_text:
+                    if token in self.special_tokens:
+                        if token in allowed_special_tokens:
+                            text_encodings.append(self.vocab_map[token])
+                        else:
+                            continue
+                    else:
+                        text_encodings.append(self.vocab_map.get(token, self.unknown_token_id))
+                text_encodings.append(self.end_of_text_token_id)
+        else:
+            processed_text = self.__process_text(text)
+            for token in processed_text:
+                if token in self.special_tokens:
+                    if token in allowed_special_tokens:
+                        text_encodings.append(self.vocab_map[token])
+                    else:
+                        continue
+                else:
+                    text_encodings.append(self.vocab_map.get(token, self.unknown_token_id))
+
         if truncate_and_pad:
             return self.pad_or_truncate(
                 sequence=text_encodings,
@@ -43,7 +77,7 @@ def pad_or_truncate(sequence, max_length, padding_value):
         else:
             return sequence

-    def __call__(self, text, truncate_and_pad: bool = True) -> list:
+    def __call__(self, text, truncate_and_pad: bool = True, allowed_special_tokens: list = []) -> list:
         """
         Encode the given text into a list of tokens.

@@ -68,7 +102,7 @@ def __call__(self, text, truncate_and_pad: bool = True) -> list:
         >>> tokenizer("මම ගෙදර ගියා")
         [2041, 2041, 942, 965, 624, 909, 942, 54, 1960]
         """
-        return self.__encode(text, truncate_and_pad=truncate_and_pad)
+        return self.__encode(text, truncate_and_pad=truncate_and_pad, allowed_special_tokens=allowed_special_tokens)

     def decode(self, ids, skip_special_tokens: bool = False) -> str:
         """
@@ -112,7 +146,7 @@ def decode(self, ids, skip_special_tokens: bool = False) -> str:
             ]
         )

-    def train(self, text_list) -> None:
+    def train(self, text_list, memory_efficient: bool = False, chunk_size: int = 1000) -> None:
         """
         Train the tokenizer on a list of text strings.

@@ -128,7 +162,10 @@ def train(self, text_list) -> None:
         >>> tokenizer = Tokenizer()
         >>> tokenizer.train(corpus)
         """
-        self.__train_character_level_tokenizer(text_list)
+        if memory_efficient:
+            self.__train_character_level_tokenizer_memory_efficient(text_list, chunk_size)
+        else:
+            self.__train_character_level_tokenizer(text_list)

     def __len__(self):
         return len(self.vocab_map)
@@ -140,6 +177,43 @@ def vocab_size(self):
     @staticmethod
     def __process_text(t):
         return process_text(t)
+
+    def __train_character_level_tokenizer_memory_efficient(self, text_list, chunk_size):
+        """
+        Train the tokenizer on a list of text strings in a memory-efficient manner.
+
+        This method processes the text list in chunks, updating the vocabulary
+        incrementally without storing all tokenized characters in memory.
+
+        Parameters
+        ----------
+        text_list : list of str
+            List of text strings to be used for training the tokenizer.
+        """
+        unique_chars = set()
+
+        for i in tqdm(range(0, len(text_list), chunk_size), desc="Training tokenizer", total=len(text_list) // chunk_size):
+            chunk = text_list[i:i+chunk_size]
+
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                results = list(executor.map(self.__process_text, chunk))
+
+            for sublist in results:
+                unique_chars.update(sublist)
+
+        self.unique_chars = unique_chars
+        self.vocab_map = {char: i for i, char in enumerate(self.unique_chars)}
+
+        # Add special tokens
+        self.vocab_map[self.unknown_token] = len(self.vocab_map)
+        self.vocab_map[self.pad_token] = len(self.vocab_map)
+        self.vocab_map[self.end_of_text_token] = len(self.vocab_map)
+
+        self.unknown_token_id = self.vocab_map[self.unknown_token]
+        self.pad_token_id = self.vocab_map[self.pad_token]
+        self.end_of_text_token_id = self.vocab_map[self.end_of_text_token]
+
+        self.token_id_to_token_map = {value: key for key, value in self.vocab_map.items()}

     def __train_character_level_tokenizer(self, text_list):
         with concurrent.futures.ThreadPoolExecutor() as executor:
@@ -149,8 +223,10 @@ def __train_character_level_tokenizer(self, text_list):
         self.vocab_map = dict(zip(self.unique_chars, range(len(self.unique_chars))))
         self.vocab_map[self.unknown_token] = len(self.vocab_map)
         self.vocab_map[self.pad_token] = len(self.vocab_map)
+        self.vocab_map[self.end_of_text_token] = len(self.vocab_map)
         self.unknown_token_id = self.vocab_map[self.unknown_token]
         self.pad_token_id = self.vocab_map[self.pad_token]
+        self.end_of_text_token_id = self.vocab_map[self.end_of_text_token]
         self.token_id_to_token_map = {
             value: key for key, value in self.vocab_map.items()
         }
@@ -190,6 +266,8 @@ def load_from_pretrained(self, file_path: str) -> None:
             self.unknown_token_id = configurations["unknown_token_id"]
             self.pad_token_id = configurations["pad_token_id"]
             self.max_length = configurations["max_length"]
+            self.end_of_text_token = configurations["end_of_text_token"]
+            self.end_of_text_token_id = configurations["end_of_text_token_id"]
         else:
             warnings.warn(
                 "File not found at the specified path. Loaded default vocab map.",
@@ -202,6 +280,7 @@ def load_from_pretrained(self, file_path: str) -> None:
             }
             self.unknown_token_id = self.vocab_map[self.unknown_token]
             self.pad_token_id = self.vocab_map[self.pad_token]
+            self.end_of_text_token_id = self.vocab_map[self.end_of_text_token]
         return self

     def save_tokenizer(self, save_path: str):
@@ -212,6 +291,8 @@ def save_tokenizer(self, save_path: str):
             "unknown_token_id": self.unknown_token_id,
             "pad_token_id": self.pad_token_id,
             "max_length": self.max_length,
+            "end_of_text_token": self.end_of_text_token,
+            "end_of_text_token_id": self.end_of_text_token_id,
         }

         with open(save_path / "vocab.json", "w", encoding="utf-8") as file:
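
Usage sketch for the new encoding behaviour (illustrative only, not part of the patch): the snippet shows how text containing the <|endoftext|> marker is split and encoded by the patched Tokenizer. The sample sentence comes from the docstring example in tokenizer.py; max_length, the variable names, and the duplicated sentence are assumptions made for the example.

    from sinlib.tokenizer import Tokenizer

    corpus = ["මම ගෙදර ගියා"]  # tiny illustrative corpus

    tokenizer = Tokenizer(max_length=32)
    tokenizer.train(corpus)

    # Text containing <|endoftext|> is split on the marker; each part is encoded
    # at character level and followed by end_of_text_token_id.
    ids = tokenizer("මම ගෙදර ගියා<|endoftext|>මම ගෙදර ගියා", truncate_and_pad=False)

    # Round-trip back to text; the <|endoftext|> markers are recovered via
    # token_id_to_token_map.
    print(tokenizer.decode(ids))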
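A second sketch covers the memory-efficient training path and persistence of the new end-of-text fields. The corpus size and directory name are placeholders, and the save/load calls assume that vocab.json is written into the directory passed to save_tokenizer, as the hunks above suggest.

    from pathlib import Path
    from sinlib.tokenizer import Tokenizer

    corpus = ["මම ගෙදර ගියා"] * 10_000  # stand-in for a large corpus

    tokenizer = Tokenizer(max_length=64)
    # Chunked training with a tqdm progress bar instead of tokenizing the whole
    # corpus in a single ThreadPoolExecutor pass.
    tokenizer.train(corpus, memory_efficient=True, chunk_size=1000)

    save_dir = Path("tokenizer_artifacts")  # hypothetical location
    save_dir.mkdir(exist_ok=True)
    tokenizer.save_tokenizer(save_dir)  # writes vocab.json including the end-of-text fields

    restored = Tokenizer(max_length=64)
    restored.load_from_pretrained(save_dir / "vocab.json")
    assert restored.end_of_text_token_id == tokenizer.end_of_text_token_id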