-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Add tokenizers dependency and update Python version to 3.12
- Loading branch information
1 parent
c6e6ff5
commit 89f810c
Showing
19 changed files
with
110,058 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,12 @@ | ||
"""Multi-tokenizer package.""" | ||
|
||
from multi_tokenizer.language_detect import LanguageDetector | ||
from multi_tokenizer.pretrained import LanguageSpecificTokenizer, PretrainedTokenizers | ||
from multi_tokenizer.tokenizer import MultiTokenizer | ||
|
||
__all__ = [ | ||
"MultiTokenizer", | ||
"PretrainedTokenizers", | ||
"LanguageSpecificTokenizer", | ||
"LanguageDetector", | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
"""Language Detection Module.""" | ||
|
||
|
||
class LanguageDetector:
    """Base class for per-language span detectors.

    A detector is responsible for one language and reports which character
    spans of a given text belong to it.
    """

    def __init__(self, language: str) -> None:
        """Store the language code this detector is responsible for."""
        self.language = language

    def detect(self, text: str) -> list[tuple[int, int]]:
        """Return (start, end) character spans of this language in ``text``.

        Subclasses must override this method.
        """
        raise NotImplementedError
|
||
|
||
class EnglishDetector(LanguageDetector):
    """English Language Detector."""

    def __init__(self) -> None:
        """Initialize English Detector with the 'en' language code."""
        super().__init__("en")

    def detect(self, text: str) -> list[tuple[int, int]]:
        """Detect English.

        NOTE(review): currently splits ``text`` into two fixed halves
        regardless of content — appears to be a placeholder; confirm.
        For empty input this yields [(0, -1), (0, -1)].
        """
        # Floor division is exact for any int length (no float round-trip)
        # and replaces the less idiomatic int(len(text) / 2).
        mid = len(text) // 2
        return [(0, mid - 1), (mid, len(text) - 1)]
|
||
|
||
class SpanishDetector(LanguageDetector):
    """Spanish Language Detector."""

    def __init__(self) -> None:
        """Initialize Spanish Detector with the 'es' language code."""
        super().__init__("es")

    def detect(self, text: str) -> list[tuple[int, int]]:
        """Detect Spanish.

        NOTE(review): currently splits ``text`` into two fixed halves
        regardless of content — appears to be a placeholder; confirm.
        For empty input this yields [(0, -1), (0, -1)].
        """
        # Floor division is exact for any int length (no float round-trip)
        # and replaces the less idiomatic int(len(text) / 2).
        mid = len(text) // 2
        return [(0, mid - 1), (mid, len(text) - 1)]
|
||
|
||
class CantoneseDetector(LanguageDetector):
    """Cantonese Language Detector."""

    def __init__(self) -> None:
        """Initialize Cantonese Detector with the 'zh' language code."""
        super().__init__("zh")

    def detect(self, text: str) -> list[tuple[int, int]]:
        """Detect Cantonese.

        NOTE(review): currently splits ``text`` into two fixed halves
        regardless of content — appears to be a placeholder; confirm.
        For empty input this yields [(0, -1), (0, -1)].
        """
        # Floor division is exact for any int length (no float round-trip)
        # and replaces the less idiomatic int(len(text) / 2).
        mid = len(text) // 2
        return [(0, mid - 1), (mid, len(text) - 1)]
|
||
|
||
class HindiDetector(LanguageDetector):
    """Hindi Language Detector."""

    def __init__(self) -> None:
        """Initialize Hindi Detector with the 'hi' language code."""
        super().__init__("hi")

    def detect(self, text: str) -> list[tuple[int, int]]:
        """Detect Hindi.

        NOTE(review): currently splits ``text`` into two fixed halves
        regardless of content — appears to be a placeholder; confirm.
        For empty input this yields [(0, -1), (0, -1)].
        """
        # Floor division is exact for any int length (no float round-trip)
        # and replaces the less idiomatic int(len(text) / 2).
        mid = len(text) // 2
        return [(0, mid - 1), (mid, len(text) - 1)]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
"""Pretrained Tokenizers for Specific Languages.""" | ||
|
||
import os | ||
from enum import Enum | ||
from typing import Callable | ||
|
||
from multi_tokenizer.language_detect import ( | ||
CantoneseDetector, | ||
EnglishDetector, | ||
HindiDetector, | ||
LanguageDetector, | ||
SpanishDetector, | ||
) | ||
|
||
from tokenizers import Tokenizer | ||
|
||
|
||
# Directory containing this module; the pretrained *_tokenizer.json files
# referenced below are expected to live alongside it.
file_dir = os.path.dirname(__file__)
|
||
|
||
class LanguageSpecificTokenizer:
    """Tokenizer for a single language, wrapping a serialized ``tokenizers.Tokenizer``.

    Unknown attribute lookups are delegated to the wrapped tokenizer, so this
    class transparently exposes the wrapped ``Tokenizer`` API.
    """

    def __init__(
        self, tokenizer_path: str, language_detector: LanguageDetector | None = None
    ) -> None:
        """Initialize Language Specific Tokenizer.

        Args:
            tokenizer_path: Path to a tokenizer JSON file (``tokenizers`` format).
            language_detector: Optional detector used to find spans of this
                tokenizer's language in mixed-language text.
        """
        self.language_detector = language_detector
        self.tokenizer = Tokenizer.from_file(tokenizer_path)

    def __getattr__(self, name: str) -> Callable:
        """Delegate unknown attribute access to the wrapped tokenizer.

        The guard prevents infinite recursion when ``tokenizer`` itself is
        missing — e.g. on instances created without running ``__init__``, as
        pickle does during unpickling. Without it, ``__getattr__("x")`` reads
        ``self.tokenizer``, which re-enters ``__getattr__("tokenizer")``
        forever instead of raising AttributeError.
        """
        if name == "tokenizer":
            raise AttributeError(name)
        return getattr(self.tokenizer, name)
|
||
|
||
class PretrainedTokenizers(Enum):
    """Pretrained Tokenizers for Specific Languages.

    Each member eagerly loads its tokenizer JSON file (shipped alongside this
    module — see ``file_dir``) and pairs it with the matching language
    detector. Loading happens at import time of this module.
    """

    ENGLISH = LanguageSpecificTokenizer(
        os.path.join(file_dir, "english_tokenizer.json"), EnglishDetector()
    )
    SPANISH = LanguageSpecificTokenizer(
        os.path.join(file_dir, "spanish_tokenizer.json"), SpanishDetector()
    )
    CANTONESE = LanguageSpecificTokenizer(
        os.path.join(file_dir, "cantonese_tokenizer.json"), CantoneseDetector()
    )
    HINDI = LanguageSpecificTokenizer(
        os.path.join(file_dir, "hindi_tokenizer.json"), HindiDetector()
    )


# LanguageSpecificTokenizer is part of this module's public surface (the
# package __init__ imports it from here), so advertise both names.
__all__ = ["LanguageSpecificTokenizer", "PretrainedTokenizers"]
Empty file.
Empty file.
Empty file.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
"""Multi Tokenizer Module.""" | ||
|
||
import pickle | ||
|
||
from multi_tokenizer.pretrained import LanguageSpecificTokenizer | ||
|
||
from tokenizers import Encoding | ||
|
||
|
||
class MultiTokenizer:
    """MultiTokenizer Class.

    Combines several language-specific tokenizers behind one interface.
    """

    def __init__(self, tokenizers: list[LanguageSpecificTokenizer]) -> None:
        """Initialize MultiTokenizer with the tokenizers to combine."""
        self.tokenizers = tokenizers

    def encode(self, text: str) -> Encoding:
        """Encode ``text`` into a token encoding (not implemented yet)."""
        raise NotImplementedError

    def decode(self, encoding: Encoding) -> str:
        """Decode ``encoding`` back into text (not implemented yet)."""
        raise NotImplementedError

    def save(self, path: str) -> None:
        """Serialize this tokenizer to ``path`` using pickle."""
        with open(path, "wb") as sink:
            pickle.dump(self, sink)

    @staticmethod
    def load(path: str) -> "MultiTokenizer":
        """Deserialize a tokenizer previously written by ``save``.

        NOTE: pickle deserialization runs arbitrary code — only load files
        you produced and trust.
        """
        with open(path, "rb") as source:
            return pickle.load(source)
Oops, something went wrong.