Source code for core.tokenizer

"""
Tokenizer from tiktoken.
"""

from typing import List

import tiktoken

# from adalflow.core.component import BaseComponent


class Tokenizer:
    __doc__ = r"""
    Tokenizer component that wraps around the tokenizer from tiktoken.

    __call__ is the same as forward/encode, so that we can use it in Sequential.
    Additionally, you can also use the encode and decode methods.

    Args:
        name (str, optional): The name of the tokenizer. Defaults to "cl100k_base".
            You can find more information at the tiktoken documentation.
    """

    def __init__(self, name: str = "cl100k_base", remove_stop_words: bool = False):
        super().__init__()
        self.name = name
        self.tokenizer = tiktoken.get_encoding(name)
        self.stop_words = (
            set(["and", "the", "is", "in", "at", "of", "a", "an"])
            if remove_stop_words
            else set()
        )

    # __call__ is the same as forward/encode, so that we can use it in Sequential
    def __call__(self, input: str) -> List[int]:
        return self.encode(input)
    def preprocess(self, text: str) -> List[str]:
        # Lowercase the text and split it on whitespace
        words = text.lower().split()
        return words
    def encode(self, text: str) -> List[int]:
        r"""Encodes the input text/word into token IDs."""
        return self.tokenizer.encode(text)
    def decode(self, tokens: List[int]) -> str:
        r"""Decodes the input token IDs into text."""
        return self.tokenizer.decode(tokens)
    def count_tokens(self, text: str) -> int:
        r"""Counts the number of tokens in the input text."""
        return len(self.encode(text))
    def get_string_tokens(self, text: str) -> List[str]:
        r"""Returns the string tokens from the input text."""
        token_ids = self.encode(text)
        return [self.tokenizer.decode([token_id]) for token_id in token_ids]
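
# A minimal usage sketch of the methods above (not part of the original
# module). Exact token IDs depend on the "cl100k_base" vocabulary that
# tiktoken ships, so the values in the comments are illustrative.
if __name__ == "__main__":
    tokenizer = Tokenizer()  # defaults to the "cl100k_base" encoding

    text = "Hello, world!"
    token_ids = tokenizer(text)  # same as tokenizer.encode(text)
    print(token_ids)  # e.g. [9906, 11, 1917, 0]
    print(tokenizer.count_tokens(text))  # e.g. 4
    print(tokenizer.get_string_tokens(text))  # e.g. ['Hello', ',', ' world', '!']
    print(tokenizer.decode(token_ids))  # "Hello, world!"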