"""Tokenizer from tiktoken."""importtiktokenfromtypingimportList# from adalflow.core.component import BaseComponent
class Tokenizer:
    r"""Tokenizer component that wraps around the tokenizer from tiktoken.

    ``__call__`` is the same as forward/encode, so that we can use it in
    ``Sequential``. Additionally, you can also use the ``encode`` and
    ``decode`` methods directly.

    Args:
        name (str, optional): The name of the tiktoken encoding. Defaults to
            "cl100k_base". You can find more information at the tiktoken
            documentation.
        remove_stop_words (bool, optional): If True, :meth:`preprocess`
            filters out a small built-in set of English stop words.
            Defaults to False.
    """

    def __init__(self, name: str = "cl100k_base", remove_stop_words: bool = False):
        super().__init__()
        self.name = name
        self.tokenizer = tiktoken.get_encoding(name)
        # Empty set when remove_stop_words is False, so filtering in
        # preprocess() is a no-op by default.
        self.stop_words = (
            {"and", "the", "is", "in", "at", "of", "a", "an"}
            if remove_stop_words
            else set()
        )

    # call is the same as forward/encode, so that we can use it in Sequential
    def __call__(self, input: str) -> List[int]:
        # Annotation fixed: encode() returns token IDs (ints), not strings.
        return self.encode(input)

    def preprocess(self, text: str) -> List[str]:
        r"""Lowercase and whitespace-split *text*, dropping configured stop words.

        Bug fix: the stop-word set built in ``__init__`` was never applied.
        With the default ``remove_stop_words=False`` the set is empty, so the
        default behavior is unchanged.
        """
        return [word for word in text.lower().split() if word not in self.stop_words]

    def encode(self, text: str) -> List[int]:
        r"""Encodes the input text/word into token IDs."""
        return self.tokenizer.encode(text)

    def decode(self, tokens: List[int]) -> str:
        r"""Decodes the input token IDs into text.

        Annotation fixed: tiktoken's ``Encoding.decode`` expects integer
        token IDs, not strings.
        """
        return self.tokenizer.decode(tokens)

    def count_tokens(self, text: str) -> int:
        r"""Counts the number of tokens in the input text."""
        return len(self.encode(text))

    def get_string_tokens(self, text: str) -> List[str]:
        r"""Returns the string tokens from the input text."""
        token_ids = self.encode(text)
        return [self.tokenizer.decode([token_id]) for token_id in token_ids]