o
    ̳iR                     @   sv   d dl mZmZmZmZmZ d dlmZmZ d dl	m
Z
 d dlmZ d dlmZmZmZ g dZG dd deeZd	S )
    )AnyListMappingOptionalTuple)MessagePromptTemplate)MistralChatTemplate)	Transform)ModelTokenizerSentencePieceBaseTokenizer#tokenize_messages_no_special_tokens) 
	c                   @   s   e Zd ZdZde fdedee dee fddZ	e
dd	 Ze
d
d Ze
dd Ze
dd Z			d$dededededee f
ddZdee defddZdddee dedeee ee f fddZ	d%d eeef d!edeeef fd"d#ZdS )&MistralTokenizera  
    Mistral's implementation of the SentencePiece tokenizer

    Args:
        path (str): Path to pretrained tokenizer file.
        max_seq_len (Optional[int]): A max sequence length to truncate tokens to.
            Default: None
        prompt_template (Optional[PromptTemplate]): template used to format the messages based on their role. This is used
            to add structured text around the actual messages. The structured text is used in three scenarios:

            - Task-specific templates to gear models for a particular task that it will expect after training
            - Model-specific templates that are required whenever the model is prompted, such as the [INST]
              tags in Llama2 and in Mistral
            - Community standardized templates, such as :class:`~torchtune.data.ChatMLTemplate`

            The extra text will still get tokenized as normal text, not as special tokens.
            Default is :class:`~torchtune.models.mistral.MistralChatTemplate`.

    Examples:
        >>> tokenizer = MistralTokenizer("/path/to/spm_model")
        >>> tokenized_text = tokenizer.encode("Hello world!", add_bos=True, add_eos=True)
        >>> print(tokenized_text)
        [1, 31587, 29644, 102, 2]
    Npathmax_seq_lenprompt_templatec                 C   s,   t || _d| j_| jg| _|| _|| _d S )Nr   )r   
_spm_modelpad_ideos_idstop_tokensr   r   )selfr   r   r    r   W/home/ubuntu/.local/lib/python3.10/site-packages/torchtune/models/mistral/_tokenizer.py__init__/   s
   


zMistralTokenizer.__init__c                 C      | j jS N)r   r   r   r   r   r   r   A      zMistralTokenizer.eos_idc                 C   r   r    )r   bos_idr!   r   r   r   r#   E   r"   zMistralTokenizer.bos_idc                 C   r   r    )r   r   r!   r   r   r   r   I   r"   zMistralTokenizer.pad_idc                 C   r   r    )r   
vocab_sizer!   r   r   r   r$   M   r"   zMistralTokenizer.vocab_sizeTFtextadd_bosadd_eostrim_leading_whitespacereturnc                 C   s   | j j||||dS )a4  
        Encode a string into a list of token IDs

        Args:
            text (str): The input text to be encoded, unbatched.
            add_bos (bool): Whether to prepend BOS special token (Beginning of Sentence) to the input, defaults to True.
            add_eos (bool): Whether to append EOS special token (End of Sentence) to the input, defaults to True.
            trim_leading_whitespace (bool): Whether to trim leading whitespace from
                underlying sentencepiece tokenization. Sentencepiece normally prepends
                whitespace to any tokenized text, which can cause differences where
                encode(s1) + encode(s2) != encode(s1 + s2) due to leading whitespace
                added to s2. Default: False
        Returns:
            List[int]: The encoded token IDs.
        )r&   r'   r(   )r   encode)r   r%   r&   r'   r(   r   r   r   r*   Q   s   zMistralTokenizer.encode	token_idsc                 C   s   | j |S )zDecode token IDs to strings.

        Args:
            token_ids (List[int]): The input token IDs to be decoded.

        Returns:
            str: The decoded text.
        )r   decode)r   r+   r   r   r   r,   n   s   zMistralTokenizer.decode)r'   messagesc                C   s8   | j dur
|  |n|}t| || j|r| jdS ddS )a  Tokenize a list of messages one at a time then concatenate them,
        returning a list of tokens and a list of masks.

        Note:
            sentencepiece has problems where in general
            encode(s1 + s2) != encode(s1) + encode(s2) due to whitespace handling.
            We can get around this by prepending s2 with a known token and slicing the
            beginning off the tokenized s2.

        Example:
            >>> tokenizer = MistralTokenizer(tokenizer_path, max_seq_len)
            >>> messages = [
                Message(role="system", content="system message\n", masked=True),
                Message(role="user", content="user prompt\n", masked=True),
                Message(role="assistant", content="assistant response\n"),
            ]

            >>> # tokenize_messages encodes messages separately and concats
            >>> tokenizer.tokenize_messages(messages)[0]
            [1, 1788, 2643, 13, 1792, 9508, 13, 465, 22137, 2933, 2]


            >>> # Same result as encoding the full string in one go
            >>> tokenizer.encode(''.join([message.content for message in messages]))
            [1, 1788, 2643, 13, 1792, 9508, 13, 465, 22137, 2933, 2]


        Args:
            messages (List[Message]): A list of messages, each containing role, content,
                and masked attributes.
            add_eos (bool): Whether to append EOS after assistant message, default to True

        Returns:
            Tuple[List[int], List[bool]]: The tokenized messages
        N)	tokenizerr-   r#   r   )r   r   r#   r   )r   r-   r'   templated_messagesr   r   r   tokenize_messages|   s   
+
z"MistralTokenizer.tokenize_messagessample	inferencec                 C   s,   | d}| |\}}||d< ||d< |S )a%  
        Apply ``tokenize_messages`` to the "messages" field in the sample.

        Args:
            sample (Mapping[str, Any]): A sample with a "messages" field containing
                a List[Message] to tokenize
            inference (bool): Whether the template is being used for inference or not.

        Returns:
            Mapping[str, Any]: The sample with added "tokens" and "mask" fields
                and the "messages" field removed.
            inference (bool): Whether the template is being used for inference or not.
        r-   tokensmask)popr0   )r   r1   r2   r-   r3   r4   r   r   r   __call__   s
   
zMistralTokenizer.__call__)TTF)F)__name__
__module____qualname____doc__r	   strr   intr   r   propertyr   r#   r   r$   boolr   r*   r,   r   r   r0   r   r   r6   r   r   r   r   r      sl    







6

r   N)typingr   r   r   r   r   torchtune.datar   r   )torchtune.models.mistral._prompt_templater	   torchtune.modules.transformsr
   'torchtune.modules.transforms.tokenizersr   r   r   WHITESPACE_CHARSr   r   r   r   r   <module>   s   