"""Tokenizer from tiktoken."""importtiktokenfromtypingimportList# from adalflow.core.component import BaseComponent
class Tokenizer:
    r"""Tokenizer component that wraps around the tokenizer from tiktoken.

    ``__call__`` is the same as forward/encode, so that we can use it in
    ``Sequential``. Additionally, you can also use the ``encode`` and
    ``decode`` methods directly.

    Args:
        name (str, optional): The name of the tiktoken encoding. Defaults to
            "cl100k_base". You can find more information at the tiktoken
            documentation.
        remove_stop_words (bool, optional): If True, :meth:`preprocess`
            filters out a small built-in set of English stop words.
            Defaults to False.
    """

    def __init__(self, name: str = "cl100k_base", remove_stop_words: bool = False):
        super().__init__()
        self.name = name
        self.tokenizer = tiktoken.get_encoding(name)
        # Empty set when remove_stop_words is False, so filtering in
        # preprocess() is a no-op by default.
        self.stop_words = (
            {"and", "the", "is", "in", "at", "of", "a", "an"}
            if remove_stop_words
            else set()
        )

    # call is the same as forward/encode, so that we can use it in Sequential
    def __call__(self, input: str) -> List[int]:
        # Annotation fixed: encode() returns token IDs (ints), not strings.
        return self.encode(input)

    def preprocess(self, text: str) -> List[str]:
        r"""Lowercase and whitespace-split *text*, dropping configured stop words.

        Bug fix: the stop-word set built in ``__init__`` was never applied.
        With the default ``remove_stop_words=False`` the set is empty, so the
        default behavior is unchanged.
        """
        return [word for word in text.lower().split() if word not in self.stop_words]

    def encode(self, text: str) -> List[int]:
        r"""Encodes the input text/word into token IDs."""
        return self.tokenizer.encode(text)

    def decode(self, tokens: List[int]) -> str:
        r"""Decodes the input token IDs into text.

        Annotation fixed: tiktoken's ``Encoding.decode`` expects integer
        token IDs, not strings.
        """
        return self.tokenizer.decode(tokens)

    def count_tokens(self, text: str) -> int:
        r"""Counts the number of tokens in the input text."""
        return len(self.encode(text))

    def get_string_tokens(self, text: str) -> List[str]:
        r"""Returns the string tokens from the input text."""
        token_ids = self.encode(text)
        return [self.tokenizer.decode([token_id]) for token_id in token_ids]