Releasing new version with new decoding method
ranzaka committed Sep 3, 2024
1 parent 82b301a commit a9b2d80
Showing 4 changed files with 98 additions and 12 deletions.
.gitignore: 3 additions, 0 deletions

@@ -5,3 +5,6 @@ src/sinhala_tokenizers/.DS_Store
.DS_Store
examples/config.json
examples/vocab.json
+experiments/experiments.ipynb
+experiments/test.py
+experiments/*

pyproject.toml: 3 additions, 2 deletions

@@ -1,13 +1,14 @@
[project]
name = "sinlib"
-version = "0.1.3"
+version = "0.1.4"
description = "Sinhala NLP Toolkit"
authors = [
    { name = "Ransaka", email = "ransaka.ravihara@gmail.com" }
]
dependencies = [
    "numpy",
-    "torch"
+    "torch",
+    "tqdm"
]
license = { file = "LICENSE" }
readme = "README.md"

requirements.txt: 1 addition, 0 deletions

@@ -1,2 +1,3 @@
numpy
torch
+tqdm

src/sinlib/tokenizer.py: 91 additions, 10 deletions

@@ -3,11 +3,13 @@
from pathlib import Path
import concurrent.futures
from .utils.preprocessing import process_text, load_default_vocab_map
+from tqdm import tqdm


class Tokenizer:
    def __init__(
-        self, max_length: int, unknown_token: str = "<unk>", pad_token: str = "<pad>"
+        self, max_length: int, unknown_token: str = "<|unk|>", pad_token: str = "<|pad|>", end_of_text_token: str = "<|endoftext|>"
    ):
        self.unknown_token_id = None
        self.token_id_to_token_map = None

@@ -19,12 +21,44 @@ def __init__(
        self.special_tokens = [self.unknown_token, self.pad_token]
        self.max_length = max_length
        self.pad_token_id = None
+        self.end_of_text_token = end_of_text_token
+        self.end_of_text_token_id = None

-    def __encode(self, text, truncate_and_pad: bool) -> list:
-        processed_text = self.__process_text(text)
-        text_encodings = [
-            self.vocab_map.get(char, self.unknown_token_id) for char in processed_text
-        ]
+    def __encode(self, text, truncate_and_pad: bool, allowed_special_tokens: list = []) -> list:
+        """
+        Encode the given text into a list of tokens.
+
+        Parameters
+        ----------
+        text : str
+            Text to be encoded.
+        truncate_and_pad: bool
+            Set as True if you need to truncate/pad encodings, False otherwise.
+        """
+        allowed_special_tokens = [self.vocab_map[tok] for tok in allowed_special_tokens]
+        text_encodings = []
+        parts = text.split(self.end_of_text_token)
+        if len(parts) > 1:
+            for part in parts:
+                processed_text = self.__process_text(part)
+                for token in processed_text:
+                    if token in self.special_tokens:
+                        if token in allowed_special_tokens:
+                            text_encodings.append(self.vocab_map[token])
+                        else:
+                            continue
+                    else:
+                        text_encodings.append(self.vocab_map.get(token, self.unknown_token_id))
+                text_encodings.append(self.end_of_text_token_id)
+        else:
+            processed_text = self.__process_text(text)
+            for token in processed_text:
+                if token in self.special_tokens:
+                    if token in allowed_special_tokens:
+                        text_encodings.append(self.vocab_map[token])
+                    else:
+                        continue
+                else:
+                    text_encodings.append(self.vocab_map.get(token, self.unknown_token_id))

        if truncate_and_pad:
            return self.pad_or_truncate(
                sequence=text_encodings,

@@ -43,7 +77,7 @@ def pad_or_truncate(sequence, max_length, padding_value):
        else:
            return sequence

-    def __call__(self, text, truncate_and_pad: bool = True) -> list:
+    def __call__(self, text, truncate_and_pad: bool = True, allowed_special_tokens: list = []) -> list:
        """
        Encode the given text into a list of tokens.

@@ -68,7 +102,7 @@ def __call__(self, text, truncate_and_pad: bool = True) -> list:
        >>> tokenizer("මම ගෙදර ගියා")
        [2041, 2041, 942, 965, 624, 909, 942, 54, 1960]
        """
-        return self.__encode(text, truncate_and_pad=truncate_and_pad)
+        return self.__encode(text, truncate_and_pad=truncate_and_pad, allowed_special_tokens=allowed_special_tokens)

    def decode(self, ids, skip_special_tokens: bool = False) -> str:
        """

@@ -112,7 +146,7 @@ def decode(self, ids, skip_special_tokens: bool = False) -> str:
            ]
        )

-    def train(self, text_list) -> None:
+    def train(self, text_list, memory_efficient: bool = False, chunk_size: int = 1000) -> None:
        """
        Train the tokenizer on a list of text strings.

@@ -128,7 +162,10 @@ def train(self, text_list) -> None:
        >>> tokenizer = Tokenizer()
        >>> tokenizer.train(corpus)
        """
-        self.__train_character_level_tokenizer(text_list)
+        if memory_efficient:
+            self.__train_character_level_tokenizer_memory_efficient(text_list, chunk_size)
+        else:
+            self.__train_character_level_tokenizer(text_list)

    def __len__(self):
        return len(self.vocab_map)

@@ -140,6 +177,43 @@ def vocab_size(self):
    @staticmethod
    def __process_text(t):
        return process_text(t)

+    def __train_character_level_tokenizer_memory_efficient(self, text_list, chunk_size):
+        """
+        Train the tokenizer on a list of text strings in a memory-efficient manner.
+        This method processes the text list in chunks, updating the vocabulary
+        incrementally without storing all tokenized characters in memory.
+        Parameters
+        ----------
+        text_list : list of str
+            List of text strings to be used for training the tokenizer.
+        """
+        unique_chars = set()
+
+        for i in tqdm(range(0, len(text_list), chunk_size), desc="Training tokenizer", total=len(text_list) // chunk_size):
+            chunk = text_list[i:i+chunk_size]
+
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                results = list(executor.map(self.__process_text, chunk))
+
+            for sublist in results:
+                unique_chars.update(sublist)
+
+        self.unique_chars = unique_chars
+        self.vocab_map = {char: i for i, char in enumerate(self.unique_chars)}
+
+        # Add special tokens
+        self.vocab_map[self.unknown_token] = len(self.vocab_map)
+        self.vocab_map[self.pad_token] = len(self.vocab_map)
+        self.vocab_map[self.end_of_text_token] = len(self.vocab_map)
+
+        self.unknown_token_id = self.vocab_map[self.unknown_token]
+        self.pad_token_id = self.vocab_map[self.pad_token]
+        self.end_of_text_token_id = self.vocab_map[self.end_of_text_token]
+
+        self.token_id_to_token_map = {value: key for key, value in self.vocab_map.items()}
+
    def __train_character_level_tokenizer(self, text_list):
        with concurrent.futures.ThreadPoolExecutor() as executor:
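
The method added above is what train(..., memory_efficient=True) dispatches to: it walks the corpus in chunks, keeps only the running set of unique characters, and builds the vocab map once at the end. A minimal sketch of driving it through the public train API, under the same assumed import as before:

# Usage sketch (not part of the diff); import path and corpus are assumptions.
from sinlib import Tokenizer

# Synthetic corpus large enough for chunking to matter; real usage would load
# actual Sinhala text here.
corpus = ["මම ගෙදර ගියා"] * 50_000

tokenizer = Tokenizer(max_length=128)

# Processes the corpus 1,000 texts at a time (the default chunk_size) with a
# tqdm progress bar, instead of mapping __process_text over the whole list at once.
tokenizer.train(corpus, memory_efficient=True, chunk_size=1_000)

print(len(tokenizer))  # vocab size, including <|unk|>, <|pad|> and <|endoftext|>

A smaller chunk_size lowers peak memory and gives finer progress updates, at the cost of spinning up a fresh ThreadPoolExecutor per chunk.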

@@ -149,8 +223,10 @@ def __train_character_level_tokenizer(self, text_list):
        self.vocab_map = dict(zip(self.unique_chars, range(len(self.unique_chars))))
        self.vocab_map[self.unknown_token] = len(self.vocab_map)
        self.vocab_map[self.pad_token] = len(self.vocab_map)
+        self.vocab_map[self.end_of_text_token] = len(self.vocab_map)
        self.unknown_token_id = self.vocab_map[self.unknown_token]
        self.pad_token_id = self.vocab_map[self.pad_token]
+        self.end_of_text_token_id = self.vocab_map[self.end_of_text_token]
        self.token_id_to_token_map = {
            value: key for key, value in self.vocab_map.items()
        }

@@ -190,6 +266,8 @@ def load_from_pretrained(self, file_path: str) -> None:
            self.unknown_token_id = configurations["unknown_token_id"]
            self.pad_token_id = configurations["pad_token_id"]
            self.max_length = configurations["max_length"]
+            self.end_of_text_token = configurations["end_of_text_token"]
+            self.end_of_text_token_id = configurations["end_of_text_token_id"]
        else:
            warnings.warn(
                "File not found at the specified path. Loaded default vocab map.",

@@ -202,6 +280,7 @@ def load_from_pretrained(self, file_path: str) -> None:
            }
            self.unknown_token_id = self.vocab_map[self.unknown_token]
            self.pad_token_id = self.vocab_map[self.pad_token]
+            self.end_of_text_token_id = self.vocab_map[self.end_of_text_token]
        return self

    def save_tokenizer(self, save_path: str):

@@ -212,6 +291,8 @@ def save_tokenizer(self, save_path: str):
            "unknown_token_id": self.unknown_token_id,
            "pad_token_id": self.pad_token_id,
            "max_length": self.max_length,
+            "end_of_text_token": self.end_of_text_token,
+            "end_of_text_token_id": self.end_of_text_token_id,
        }

        with open(save_path / "vocab.json", "w", encoding="utf-8") as file:
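
The last three hunks thread the new token through persistence: save_tokenizer writes end_of_text_token and end_of_text_token_id into the saved configuration, and load_from_pretrained restores them, falling back to the default vocab map when the file is missing. A round-trip sketch follows; the import path, the save directory, and the guess that load_from_pretrained takes the path of the saved vocab.json are assumptions this diff does not confirm.

# Round-trip sketch (not part of the diff); paths and import are assumptions.
from pathlib import Path

from sinlib import Tokenizer

tokenizer = Tokenizer(max_length=32)
tokenizer.train(["මම ගෙදර ගියා"])

save_dir = Path("sinlib_tokenizer")  # hypothetical output directory
save_dir.mkdir(exist_ok=True)

# The visible hunk writes <save_path>/vocab.json, with the two new end-of-text
# fields stored alongside unknown_token_id, pad_token_id and max_length.
tokenizer.save_tokenizer(save_dir)

# load_from_pretrained returns self, so construction and loading can be chained.
restored = Tokenizer(max_length=32).load_from_pretrained(str(save_dir / "vocab.json"))
print(restored.end_of_text_token, restored.end_of_text_token_id)

One consequence of the direct configurations["end_of_text_token"] lookup added above: vocab files written by sinlib 0.1.3 will not contain the new keys, so loading them with 0.1.4 would presumably raise a KeyError rather than fall back to a default.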
