o
    ̳i                     @   s4   d dl mZmZmZ d dlmZ G dd deZdS )    )AnyDictList)SentencePieceBaseTokenizerc                	       st   e Zd ZdZddededef fddZd	ed
ee f fddZ		dde
eef ded
e
eef fddZ  ZS )T5Tokenizerz
    Text tokenizer for T5.

    Args:
        path (str): the path to the T5 sentencepiece tokenizer file
        max_seq_len (int): the context length
        truncate (bool): whether to truncate the token sequence when longer than max_seq_len
       Tpathmax_seq_lentruncatec                    s   t  | || _|| _d S )N)super__init__r	   r
   )selfr   r	   r
   	__class__ R/home/ubuntu/.local/lib/python3.10/site-packages/torchtune/models/t5/_tokenizer.pyr      s   
zT5Tokenizer.__init__textreturnc                    sN   t  j|ddddd}t|| jkr%| jsJ d|d| j }| j|d< |S )z
        Given a string, return the encoded list of token ids.

        Args:
            text (str): The text to encode.

        Returns:
            List[int]: The encoded list of token ids.
        FTN)add_bosadd_eostrim_leading_whitespaceprefixzWTokenized text is larger than the maximum sequence length but truncate is set to False.)r   encodelenr	   r
   eos_id)r   r   tokensr   r   r   r      s   

zT5Tokenizer.encodeFsample	inferencec                 C   s   | d}| ||d< |S )aW  
        Tokenize the "text" field in the sample.

        Args:
            sample (Dict[str, Any]): A sample with a "text" field containing a string to tokenize
            inference (bool): Unused by this tokenizer

        Returns:
            Dict[str, Any]: The sample with added "tokens" field and the "messages" field removed.
        r   r   )popr   )r   r   r   r   r   r   r   __call__6   s   
zT5Tokenizer.__call__)r   T)F)__name__
__module____qualname____doc__strintboolr   r   r   r   r   r    __classcell__r   r   r   r   r      s    	

r   N)typingr   r   r   6torchtune.modules.transforms.tokenizers._sentencepiecer   r   r   r   r   r   <module>   s   