diff --git a/examples/muon.py b/muon.py
similarity index 100%
rename from examples/muon.py
rename to muon.py
diff --git a/zeta/tokenizers/__init__.py b/zeta/tokenizers/__init__.py
deleted file mode 100644
index a2db2cc7..00000000
--- a/zeta/tokenizers/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# from zeta.tokenizers.gptx_tokenizer import LanguageTokenizerGPTX
-# from zeta.tokenizers.llama_sentencepiece import LLamaTokenizer
-# from zeta.tokenizers.multi_modal_tokenizer import MultiModalTokenizer
-# from zeta.tokenizers.sentence_piece import SentencePieceTokenizer
-# from zeta.tokenizers.tokenmonster import TokenMonster
-
-# __all__ = [
-#     "LanguageTokenizerGPTX",
-#     "MultiModalTokenizer",
-#     "SentencePieceTokenizer",
-#     "TokenMonster",
-#     "LLamaTokenizer",
-# ]
diff --git a/zeta/tokenizers/gptx_tokenizer.py b/zeta/tokenizers/gptx_tokenizer.py
deleted file mode 100644
index 60c54ce1..00000000
--- a/zeta/tokenizers/gptx_tokenizer.py
+++ /dev/null
@@ -1,52 +0,0 @@
-from transformers import AutoTokenizer
-
-
-class LanguageTokenizerGPTX:
-    """
-    LanguageTokenizerGPTX is a class that provides tokenization and decoding functionality using the GPT-Neox-20B model.
-    """
-
-    def __init__(self):
-        self.tokenizer = AutoTokenizer.from_pretrained(
-            "EleutherAI/gpt-neox-20b",
-            eos_token="<eos>",
-            pad_token="<pad>",
-            extra_ids=0,
-            model_max_length=8192,
-        )
-
-    def tokenize_texts(self, texts):
-        """
-        Tokenizes a list of texts using the GPT-Neox-20B tokenizer.
-
-        Args:
-            texts (List[str]): A list of texts to be tokenized.
-
-        Returns:
-            torch.Tensor: The tokenized input IDs as a PyTorch tensor.
-        """
-        return self.tokenizer(
-            texts, return_tensors="pt", padding=True, truncation=True
-        ).input_ids
-
-    def decode(self, texts):
-        """
-        Decodes a list of tokenized input IDs into text.
-
-        Args:
-            texts (torch.Tensor): The tokenized input IDs as a PyTorch tensor.
-
-        Returns:
-            str: The decoded text.
-        """
-        return self.tokenizer.decode(texts)
-
-    def __len__(self):
-        """
-        Returns the number of tokens in the tokenizer's vocabulary.
-
-        Returns:
-            int: The number of tokens in the vocabulary.
-        """
-        num_tokens = len(self.tokenizer)
-        return num_tokens
diff --git a/zeta/tokenizers/llama_sentencepiece.py b/zeta/tokenizers/llama_sentencepiece.py
deleted file mode 100644
index 1b5fc618..00000000
--- a/zeta/tokenizers/llama_sentencepiece.py
+++ /dev/null
@@ -1,92 +0,0 @@
-# Using LLAMA tokenizer
-import os
-from logging import getLogger
-
-import requests
-from sentencepiece import SentencePieceProcessor
-
-logger = getLogger()
-
-PRETRAINED_VOCAB_FILES_MAP = {
-    "vocab_file": {
-        "hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model",
-    },
-    "tokenizer_file": {
-        "hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer_config.json",
-    },
-}
-
-
-class LLamaTokenizer:
-    """
-    A tokenizer that uses a pretrained SentencePiece model for text tokenization.
-
-    Args:
-        model_path: Path to a pretrained SentencePiece model file.
-        tokenizer_name: Name of a pretrained SentencePiece model hosted on HuggingFace Hub.
-
-    Examples:
-    >>> tokenizer_name = "hf-internal-testing/llama-tokenizer"
-    >>> tokenizer = Tokenizer(tokenizer_name=tokenizer_name)
-    >>> encoded_text = tokenizer.encode("This is a sample text")
-    >>> decoded_text = tokenizer.decode(encoded_text)
-    >>> print("Encoded text:", encoded_text)
-    >>> print("Decoded text:", decoded_text)
-    """
-
-    def __init__(self, model_path: str = None, tokenizer_name: str = None):
-        if model_path:
-            assert os.path.isfile(model_path), model_path
-        elif tokenizer_name:
-            model_path = self.download_tokenizer(tokenizer_name)
-        else:
-            raise ValueError(
-                "Either model_path or tokenizer_name must be provided."
-            )
-
-        self.sp_model = SentencePieceProcessor(model_file=model_path)
-        logger.info(f"Reloaded SentencePiece model from {model_path}")
-
-    @staticmethod
-    def download_tokenizer(tokenizer_name: str) -> str:
-        if tokenizer_name not in PRETRAINED_VOCAB_FILES_MAP["vocab_file"]:
-            raise ValueError(f"Tokenizer {tokenizer_name} is not available.")
-
-        model_url = PRETRAINED_VOCAB_FILES_MAP["vocab_file"][tokenizer_name]
-        model_path = os.path.join("data", "tokenizer.model")
-
-        if not os.path.exists("data"):
-            os.makedirs("data")
-
-        # Downloading the tokenizer model file
-        response = requests.get(model_url)
-        if response.status_code == 200:
-            with open(model_path, "wb") as file:
-                file.write(response.content)
-            logger.info(f"Downloaded SentencePiece model to {model_path}")
-        else:
-            raise Exception(f"Failed to download model from {model_url}")
-
-        return model_path
-
-    def encode(self, s: str) -> [int]:
-        """Encodes a string into a list of token ids.
-
-        Args:
-            s (str): _description_
-
-        Returns:
-            [int]: _description_
-        """
-        return self.sp_model.encode(s, out_type=int)
-
-    def decode(self, ids: [int]) -> str:
-        """decodes a list of token ids into a string.
-
-        Args:
-            ids (int]): _description_
-
-        Returns:
-            str: _description_
-        """
-        return self.sp_model.decode(ids)
diff --git a/zeta/tokenizers/multi_modal_tokenizer.py b/zeta/tokenizers/multi_modal_tokenizer.py
deleted file mode 100644
index 66327807..00000000
--- a/zeta/tokenizers/multi_modal_tokenizer.py
+++ /dev/null
@@ -1,119 +0,0 @@
-import logging
-
-import torch
-from transformers import AutoTokenizer, CLIPProcessor
-
-logging.basicConfig(
-    level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s"
-)
-
-
-class MultiModalTokenizer:
-    """
-    A tokenizer class for the kosmos model
-
-    Attributes:
-        processor(CLIPProcessor): The processor to tokenize images
-        tokenizer: (AutoTokenizer): The tokenizer to tokenize text
-        im_idx: (int): The Index of the "<image>" token.
-        im_end_idx (int): The index of the "</image>" token.
-    """
-
-    def __init__(self, max_length: int = 8192):
-        self.max_length = max_length
-
-        try:
-            self.processor = CLIPProcessor.from_pretrained(
-                "laion/CLIP-ViT-L-14-laion2B-s32B-b82K"
-            )
-            self.tokenizer = AutoTokenizer.from_pretrained(
-                "EleutherAI/gpt-neox-20b",
-                additional_special_tokens=["<image>", "</image>"],
-                eos_token="<eos>",
-                pad_token="<pad>",
-                extra_ids=0,
-                model_max_length=self.max_length,
-            )
-        except Exception as e:
-            logging.error(f"Failed to initialize KosmosTokenizer: {e}")
-            raise
-
-        self.im_idx, self.im_end_idx = self.tokenizer.convert_tokens_to_ids(
-            ["<image>", "</image>"]
-        )
-
-    def tokenize_texts(self, texts: str):
-        """
-        Tokenize given texts.
-
-        Args:
-            Texts (str): The Text to be tokenized
-
-
-        Returns:
-            A tuple containing the tokenized texts and only the text tokens.
-        """
-        try:
-            texts = self.tokenizer(
-                texts, return_tensors="pt", padding=True, truncation=True
-            ).input_ids
-            # Add image tokens to text as "<s> <image> </image> text </s>"
-            image_tokens = torch.tensor(
-                [[self.im_idx, self.im_end_idx]] * texts.shape[0]
-            )
-            return (
-                torch.cat([texts[:, 0:1], image_tokens, texts[:, 1:]], dim=1),
-                texts,
-            )
-        except Exception as e:
-            logging.error(f"Failed to tokenize texts: {e}")
-            raise
-
-    def tokenize_images(self, images):
-        """
-        Tokenizes given images.
-
-        Args:
-            images: The images to be tokenized
-
-        Returns:
-            The tokenized images.
-
-        """
-        try:
-            return self.processor(
-                images=images, return_tensors="pt"
-            ).pixel_values
-        except Exception as e:
-            logging.error(f"Failed to tokenize images: {e}")
-            raise
-
-    def tokenize(self, sample):
-        """
-        Tokenizes given sample.
-
-        Args:
-            Sample: The sample to be tokenized
-
-        Returns:
-            A dictionary containing the tokenized text tokens, images, labels, and attention mask.
-
-        """
-        try:
-            text_tokens, only_text_tokens = self.tokenize_texts(
-                sample["target_text"]
-            )
-            attention_mask = text_tokens != self.tokenizer.pad_token_id
-            dummy_image_features = torch.ones((text_tokens.shape[0], 64))
-            attention_mask = torch.cat(
-                [dummy_image_features, attention_mask], dim=1
-            )
-            return {
-                "text_tokens": text_tokens,
-                "images": self.tokenize_images(sample["image"]),
-                "labels": only_text_tokens,
-                "attention_mask": attention_mask,
-            }
-        except Exception as e:
-            logging.error(f"Failed to tokenize sample: {e}")
-            raise
diff --git a/zeta/tokenizers/sentence_piece.py b/zeta/tokenizers/sentence_piece.py
deleted file mode 100644
index b09de319..00000000
--- a/zeta/tokenizers/sentence_piece.py
+++ /dev/null
@@ -1,97 +0,0 @@
-import os
-from logging import getLogger
-from typing import List, Optional
-
-from sentencepiece import SentencePieceProcessor
-
-logger = getLogger()
-
-
-class SentencePieceTokenizer:
-    """
-    A SentencePieceTokenizer is a tokenizer that uses a pretrained SentencePiece model to convert text into tokens and vice versa.
-    It includes the ability to add special tokens for infilling tasks and provides functionality to encode and decode text with or without implicit leading spaces.
-    Parameters:
-    - model_path (str): Path to the pretrained SentencePiece model file.
-
-    Attributes:
-    - n_words (int): Vocabulary size of the SentencePiece model.
-    - bos_id (int): Token ID of the beginning-of-sentence (BOS) token.
-    - eos_id (int): Token ID of the end-of-sentence (EOS) token.
-    - pad_id (int): Token ID of the padding (PAD) token.
-    - prefix_id (int, optional): Token ID of the prefix token. Default: None.
-    - middle_id (int, optional): Token ID of the middle token. Default: None.
-    - suffix_id (int, optional): Token ID of the suffix token. Default: None.
-    - eot_id (int, optional): Token ID of the end-of-turn (EOT) token. Default: None.
-    """
-
-    def __init__(self, model_path: str):
-        # reload tokenizer
-        assert os.path.isfile(model_path), model_path
-        self.sp_model = SentencePieceProcessor(model_file=model_path)
-        logger.info(f"Reloaded SentencePiece model from {model_path}")
-
-        # BOS / EOS token IDs
-        self.n_words: int = self.sp_model.vocab_size()
-        self.bos_id: int = self.sp_model.bos_id()
-        self.eos_id: int = self.sp_model.eos_id()
-        self.pad_id: int = self.sp_model.pad_id()
-
-        # token IDs for special infilling tokens
-        self.prefix_id: Optional[int] = (
-            self.sp_model.piece_to_id("▁<PRE>") or None
-        )
-        self.middle_id: Optional[int] = (
-            self.sp_model.piece_to_id("▁<MID>") or None
-        )
-        self.suffix_id: Optional[int] = (
-            self.sp_model.piece_to_id("▁<SUF>") or None
-        )
-        self.eot_id: Optional[int] = self.sp_model.piece_to_id("▁<EOT>") or None
-        logger.info(
-            f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID:"
-            f" {self.eos_id} - PRE ID: {self.prefix_id} - MID ID:"
-            f" {self.middle_id} - SUF ID: {self.suffix_id} - EOT ID:"
-            f" {self.eot_id}"
-        )
-        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()
-
-    def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
-        """
-        Encodes a given string using the SentencePiece tokenizer.
-
-        Args:
-            s (str): The input string to be encoded.
-            bos (bool): Whether to add a beginning of sentence token.
-            eos (bool): Whether to add an end of sentence token.
-
-        Returns:
-            List[int]: The list of encoded tokens.
-
-        """
-        assert isinstance(s, str)
-        t = self.sp_model.encode(s)
-        if bos:
-            t = [self.bos_id] + t
-        if eos:
-            t = t + [self.eos_id]
-        return t
-
-    def decode(self, t: List[int]) -> str:
-        """Decode a list of token IDs into a string.
-
-        Args:
-            t (List[int]): _description_
-
-        Returns:
-            str: _description_
-        """
-        return self.sp_model.decode(t)
-
-    def encode_infilling(self, s: str) -> List[int]:
-        """Encode a string without an implicit leading space."""
-        return self.sp_model.encode("☺" + s)[2:]
-
-    def decode_infilling(self, t: List[int]) -> str:
-        """Decode a string without an implicit leading space."""
-        return self.sp_model.decode([self.sp_model.piece_to_id("☺")] + t)[1:]
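Note for downstream users: SentencePieceTokenizer is removed above without an in-tree replacement, but it is a thin wrapper over the sentencepiece package, so callers can either vendor the deleted class or call the library directly. A minimal sketch of the direct route, assuming a local SentencePiece model at "llama/tokenizer.model" (a hypothetical path, not shipped with this repo):

    from sentencepiece import SentencePieceProcessor

    # Load the model file; equivalent to SentencePieceTokenizer(model_path=...).
    sp = SentencePieceProcessor(model_file="llama/tokenizer.model")

    # Equivalent of SentencePieceTokenizer.encode(text, bos=True, eos=True).
    ids = [sp.bos_id()] + sp.encode("This is a sample text") + [sp.eos_id()]

    # Equivalent of SentencePieceTokenizer.decode(ids).
    print(sp.decode(ids))
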
diff --git a/zeta/tokenizers/tokenmonster.py b/zeta/tokenizers/tokenmonster.py
deleted file mode 100644
index b6302b4a..00000000
--- a/zeta/tokenizers/tokenmonster.py
+++ /dev/null
@@ -1,347 +0,0 @@
-import tokenmonster
-
-
-class TokenMonster:
-    """
-    A class that encapsulates the functionality of the tokenmonster library.
-
-    >>> from zeta.tokenizers import TokenMonster
-    >>> tokenizer = TokenMonster("englishcode-32000-consistent-v1")
-    >>> tokenizer.tokenize("Hello world!")
-    """
-
-    def __init__(self, path):
-        """
-        Initializes the TokenMonster class and loads a vocabulary.
-
-        Args:
-            path (str): A filepath, URL or pre-built vocabulary name.
-        """
-        self.vocab = tokenmonster.load(path)
-
-    def set_local_directory(self, dir=None):
-        """
-        Sets the local directory for TokenMonster.
-
-        Args:
-            dir (str, optional): The local directory to use. Defaults to None.
-        """
-        tokenmonster.set_local_directory(dir)
-
-    def load(self, path):
-        """
-        Loads a TokenMonster vocabulary from file, URL or by name.
-
-        Args:
-            path (str): A filepath, URL or pre-built vocabulary name.
-        """
-        self.vocab = tokenmonster.load(path)
-
-    def load_multiprocess_safe(self, path):
-        """
-        Loads a TokenMonster vocabulary from file, URL or by name. It's safe for multiprocessing,
-        but vocabulary modification is disabled and tokenization is slightly slower.
-
-        Args:
-            path (str): A filepath, URL or pre-built vocabulary name.
-        """
-        self.vocab = tokenmonster.load_multiprocess_safe(path)
-
-    def new(self, yaml):
-        """
-        Creates a new vocabulary from a YAML string.
-
-        Args:
-            yaml (str): The YAML file.
-        """
-        self.vocab = tokenmonster.new(yaml)
-
-    def save(self, fname):
-        """
-        Saves the current vocabulary to a file.
-
-        Args:
-            fname (str): The filename to save the vocabulary to.
-        """
-        self.vocab.save(fname)
-
-    def export_yaml(self, order_by_score=False):
-        """
-        Exports the vocabulary as a YAML file, which is returned as a bytes string.
-
-        Args:
-            order_by_score (bool, optional): If true the tokens are order by score instead of alphabetically. Defaults to False.
-
-        Returns:
-            bytes: The vocabulary in YAML format.
-        """
-        return self.vocab.export_yaml(order_by_score)
-
-    def tokenize(self, text):
-        """
-        Tokenizes a string into tokens according to the vocabulary.
-
-        Args:
-            text (str): A string or bytes string, or list of strings or bytes strings.
-
-        Returns:
-            numpy array: The tokens IDs
-        """
-        return self.vocab.tokenize(text)
-
-    def tokenize_count(self, text):
-        """
-        Same as tokenize, but it returns only the number of tokens.
-
-        Args:
-            text (str): A string or bytes string, or list of strings or bytes strings.
-
-        Returns:
-            int: The number of tokens for each input string
-        """
-        return self.vocab.tokenize_count(text)
-
-    def decode(self, tokens):
-        """
-        Decodes tokens into a string.
-
-        Args:
-            tokens (int, list of int, or numpy array): The tokens to decode into a string.
-
-        Returns:
-            str: The composed string from the input tokens.
-        """
-        return self.vocab.decode(tokens)
-
-    def decoder(self):
-        """
-        Returns a new decoder instance used for decoding tokens into text.
-
-        Returns:
-            tokenmonster.DecoderInstance: A new decoder instance.
-        """
-        return self.vocab.decoder()
-
-    def get_dictionary(self):
-        """
-        Returns a dictionary of all tokens in the vocabulary.
-
-        Returns:
-            list: A list of dictionaries where the index is the token ID and each is a dictionary.
-        """
-        return self.vocab.get_dictionary()
-
-    def charset(self):
-        """
-        Returns the character set used by the vocabulary.
-
-        Returns:
-            str: The character set used by the vocabulary. Possible values are "UTF-8", "None".
-        """
-        return self.vocab.charset()
-
-    def normalization(self):
-        """
-        Returns the normalization of the vocabulary.
-
-        Returns:
-            str: The normalization of the vocabulary. Possible values are "None", "NFD", "Lowercase", "Accents", "Quotemarks", "Collapse", "Trim", "LeadingSpace", "UnixLines".
-        """
-        return self.vocab.normalization()
-
-    def capcode(self):
-        """
-        Returns the capcode level of the vocabulary.
-
-        Returns:
-            int: The capcode level (0-2).
-        """
-        return self.vocab.capcode()
-
-    def mode(self):
-        """
-        Returns the optimization mode of the vocabulary.
-
-        Returns:
-            int: The optimization mode (0-5).
-        """
-        return self.vocab.mode()
-
-    def id_to_token(self, id):
-        """
-        Get the token string from a single token ID, in its capcode-encoded form.
-
-        Args:
-            id (int): The token ID.
-
-        Returns:
-            str or None: The token string corresponding to the input ID. None if the ID is not in the vocabulary.
-        """
-        return self.vocab.id_to_token(id)
-
-    def id_to_token_decoded(self, id):
-        """
-        Get the token string from a single token ID, in its capcode-decoded form.
-
-        Args:
-            id (int): The token ID.
-
-        Returns:
-            str or None: The token string corresponding to the input ID. None if the ID is not in the vocabulary.
-        """
-        return self.vocab.id_to_token_decoded(id)
-
-    def token_to_id(self, token):
-        """
-        Returns the ID of a single token.
-
-        Args:
-            token (str): The token to get the ID for.
-
-        Returns:
-            int or None: The ID of the token. None if the token is not in the vocabulary.
-        """
-        return self.vocab.token_to_id(token)
-
-    def modify(
-        self,
-        add_special_tokens=None,
-        add_regular_tokens=None,
-        delete_tokens=None,
-        resize=None,
-        change_unk=None,
-    ):
-        """
-        Modifies the vocabulary.
-
-        Args:
-            add_special_tokens (str or list of str, optional): Special tokens to add to the vocabulary.
-            add_regular_tokens (str or list of str, optional): Regular tokens to add to the vocabulary.
-            delete_tokens (str or list of str, optional): Regular or Special tokens to delete.
-            resize (int, optional): Resizes the vocabulary to this size.
-            change_unk (bool, optional): If set, it enables or disables the UNK token.
-
-        Returns:
-            int: The new size of the vocabulary.
-        """
-        return self.vocab.modify(
-            add_special_tokens,
-            add_regular_tokens,
-            delete_tokens,
-            resize,
-            change_unk,
-        )
-
-    def add_token(self, token):
-        """
-        Add one or more regular tokens.
-
-        Args:
-            token (str or list of str): The regular tokens to add.
-
-        Returns:
-            int: The new size of the vocabulary.
-        """
-        return self.vocab.add_token(token)
-
-    def delete_token(self, token):
-        """
-        Delete one or more regular or special tokens.
-
-        Args:
-            token (str or list of str): The tokens to delete.
-
-        Returns:
-            int: The new size of the vocabulary.
-        """
-        return self.vocab.delete_token(token)
-
-    def delete_token_by_id(self, id):
-        """
-        Delete one or more regular or special token by specifying the token ID.
-
-        Args:
-            id (int or list of int): The IDs of the tokens to delete.
-
-        Returns:
-            int: The new size of the vocabulary.
-        """
-        return self.vocab.delete_token_by_id(id)
-
-    def add_special_token(self, token):
-        """
-        Add one or more special tokens.
-
-        Args:
-            token (str or list of str): The special tokens to add.
-
-        Returns:
-            int: The new size of the vocabulary.
-        """
-        return self.vocab.add_special_token(token)
-
-    def resize(self, size):
-        """
-        Changes the size of the vocabulary.
-
-        Args:
-            size (int): The new size of the vocabulary.
-
-        Returns:
-            int: The new size of the vocabulary.
-        """
-        return self.vocab.resize(size)
-
-    def reset_token_ids(self):
-        """
-        Resets the token IDs to be sequential beginning from zero.
-        """
-        self.vocab.reset_token_ids()
-
-    def enable_unk_token(self):
-        """
-        Enables the UNK token.
-
-        Returns:
-            int: The new size of the vocabulary.
-        """
-        return self.vocab.enable_unk_token()
-
-    def disable_unk_token(self):
-        """
-        Disables the UNK token.
-
-        Returns:
-            int: The new size of the vocabulary.
-        """
-        return self.vocab.disable_unk_token()
-
-    def disconnect(self):
-        """
-        Disconnects and closes tokenmonsterserver.
-        """
-        tokenmonster.disconnect()
-
-    def serialize_tokens(self, integer_list):
-        """
-        Serializes tokens from a list of ints or numpy array into a binary string.
-
-        Args:
-            integer_list (list of int or numpy array): The tokens to serialize.
-
-        Returns:
-            bytes: The serialized binary string.
-        """
-        return self.vocab.serialize_tokens(integer_list)
-
-    def deserialize_tokens(self, binary_string):
-        """
-        Deserializes a binary string into a numpy array of token IDs.
-
-        Args:
-            binary_string (bytes): The binary string to deserialize.
-
-        Returns:
-            np.array: The deserialized tokens.
-        """
-        return self.vocab.deserialize_tokens(binary_string)
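
With the wrapper above gone, code that relied on zeta.tokenizers.TokenMonster can depend on the tokenmonster package directly, since every method shown is a pass-through to the underlying vocabulary object. A minimal sketch, reusing the pre-built vocabulary name from the removed docstring ("englishcode-32000-consistent-v1"); continued availability of that vocabulary is an assumption:

    import tokenmonster

    # Load a pre-built vocabulary by name; tokenmonster fetches it into its
    # local directory on first use.
    vocab = tokenmonster.load("englishcode-32000-consistent-v1")

    tokens = vocab.tokenize("Hello world!")  # token IDs as a numpy array
    print(vocab.decode(tokens))              # reconstructs "Hello world!"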