o
    ̳i(                     @   s   d dl mZmZmZmZmZmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZmZ ddd	d
ddddddddZG dd deeZdS )    )AnyDictListMappingOptionalTuple)Message)PromptTemplate)truncate)	Transform)ModelTokenizerSentencePieceBaseTokenizer }  i}  i}  i}  i}  i}  i}  i}  i}  i	}  i
}  )<|endoftext|><|assistant|>z<|placeholder1|>z<|placeholder2|>z<|placeholder3|>z<|placeholder4|>
<|system|><|end|>z<|placeholder5|>z<|placeholder6|><|user|>c                   @   s  e Zd ZdZ			d#dedeeeef  dee dee fddZ	e
d	d
 Ze
dd Z			d$dededededee f
ddZd%dee dedefddZddddee dededeee ee f fddZ	d&deeef d edeeef fd!d"ZdS )'Phi3MiniTokenizera  
    SentencePiece tokenizer configured with Phi3 Mini's special tokens.

    Args:
        path (str): Path to pretrained tokenizer file.
        special_tokens (Optional[Dict[str, int]]): mapping containing special text tokens and
            their registered token IDs. If left as None, this will be set to the canonical
            Phi3 special tokens.
        max_seq_len (Optional[int]): A max sequence length to truncate tokens to.
            Default: None
        prompt_template (Optional[PromptTemplate]): template used to format the messages based on their role. This is used
            to add structured text around the actual messages. The structured text is used in three scenarios:

            - Task-specific templates to gear models for a particular task that it will expect after training
            - Model-specific templates that are required whenever the model is prompted, such as the [INST]
              tags in Llama2 and in Mistral
            - Community standardized templates, such as :class:`~torchtune.data.ChatMLTemplate`

            The extra text will still get tokenized as normal text, not as special tokens. Default is None.

    Examples:
        >>> tokenizer = Phi3MiniTokenizer("/path/to/spm_model")
        >>> tokenized_text = tokenizer.encode("Hello world!", add_bos=True, add_eos=True)
        >>> print(tokenized_text)
        [1, 31587, 29644, 102, 2]
    Npathspecial_tokensmax_seq_lenprompt_templatec                 C   sN   t || _|d ur|nt| _| jd | _| jd | _| jg| _|| _|| _d S )Nr   )	r   
_spm_modelPHI3_SPECIAL_TOKENSr   eos_idpad_idstop_tokensr   r   )selfr   r   r   r    r   T/home/ubuntu/.local/lib/python3.10/site-packages/torchtune/models/phi3/_tokenizer.py__init__=   s   


zPhi3MiniTokenizer.__init__c                 C      | j jS N)r   
vocab_sizer   r   r   r    r$   U      zPhi3MiniTokenizer.vocab_sizec                 C   r"   r#   )r   bos_idr%   r   r   r    r'   Y   r&   zPhi3MiniTokenizer.bos_idTFtextadd_bosadd_eostrim_leading_whitespacereturnc                 C   s   | j j||||dS )Nr)   r*   r+   )r   encode)r   r(   r)   r*   r+   r   r   r    r.   ]   s   zPhi3MiniTokenizer.encodeidsskip_special_tokensc                 C   s:   g }|D ]}|r|dkr|dkrq| | q| j|S )a0  Decode token IDs to strings.

        Args:
            ids (List[int]): The input token IDs to be decoded.
            skip_special_tokens (bool): Whether to show or skip special tokens in the decoded string.
                Default is True.

        Returns:
            str: The decoded text.
        r   i@}  )appendr   decode)r   r/   r0   ids_for_decodetoken_idr   r   r    r2   k   s   zPhi3MiniTokenizer.decode)r*   ignore_system_promptmessagesr5   c             	   C   s@  | j dur
|  |n|}d}d}g }g }| jdddd}	|D ]}
|r(|
jdkr(q|r6|| j ||
j |
jdkrJ|| jd  ||
j n7|
jd	kr`|| jd
  d}||
j n!|
jdkrt|| jd  ||
j ntd|
j d|
j d|	|	 |	|
jgt
|	  g }|
jD ]!}|d dkr|| j|d ddddd }qtd|d  || jd g |	 }|	| |	|
jgt
|  |r|r|| j ||
j d}d}nd}| jrt
|| jkr nq| jrt
|| jkrt|| j|r| jnd}t|| j|r|
jnd}||fS )a  Tokenize a list of messages one at a time then concatenate them,
        returning a list of tokens and a list of masks.

        Example:
            >>> tokenizer = Phi3MiniTokenizer(tokenizer_path, max_seq_len)
            >>> messages = [
                Message(role="system", content="system message\n", masked=True),
                Message(role="user", content="user prompt\n", masked=True),
                Message(role="assistant", content="assistant response\n"),
            ]

            >>> # tokenize_messages encodes messages separately and concats
            >>> tokenizer.tokenize_messages(messages)[0]
            [1, 1788, 2643, 13, 1792, 9508, 13, 465, 22137, 2933, 2]

            >>> # Same result as encoding the full string in one go
            >>> tokenizer.encode(''.join([message.content for message in messages]))
            [1, 1788, 2643, 13, 1792, 9508, 13, 465, 22137, 2933, 2]


        Args:
            messages (List[Message]): A list of messages, each containing role, content,
                and masked attributes.
            add_eos (bool): Whether to append EOS after assistant message, default to False
            ignore_system_prompt (bool): Whether to ignore system prompt, defaults to False.

        Raises:
            ValueError: If the role is not "user", "assistant", or "system".
            RuntimeError: If ``message["type"] != "text``.

        Returns:
            Tuple[List[int], List[bool]]: The tokenized messages
        NTF
)r)   r*   systemuserr   	assistantr   r   zUnknown role 'z' for message: ''typer(   content r-   z"Unsupported message content type: r   )r   r.   roler1   r'   maskedr   
ValueErrorr=   extendlenrstripRuntimeErrorr   r   r
   )r   r6   r*   r5   templated_messagesstart_of_turnend_of_turntokenized_messagesmasknew_line_token_idmessagetokensitemr   r   r    tokenize_messages   sx   
*







z#Phi3MiniTokenizer.tokenize_messagessample	inferencec                 C   s,   | d}| |\}}||d< ||d< |S )a%  
        Apply ``tokenize_messages`` to the "messages" field in the sample.

        Args:
            sample (Mapping[str, Any]): A sample with a "messages" field containing
                a List[Message] to tokenize
            inference (bool): Whether the template is being used for inference or not.

        Returns:
            Mapping[str, Any]: The sample with added "tokens" and "mask" fields
                and the "messages" field removed.
            inference (bool): Whether the template is being used for inference or not.
        r6   rM   rJ   )poprO   )r   rP   rQ   r6   rM   rJ   r   r   r    __call__   s
   
zPhi3MiniTokenizer.__call__)NNN)TTF)T)F)__name__
__module____qualname____doc__strr   r   intr	   r!   propertyr$   r'   boolr   r.   r2   r   r   rO   r   r   rS   r   r   r   r    r   !   sh    






r   N)typingr   r   r   r   r   r   torchtune.data._messagesr    torchtune.data._prompt_templatesr	   torchtune.data._utilsr
   torchtune.modules.transformsr   'torchtune.modules.transforms.tokenizersr   r   r   r   r   r   r   r    <module>   s&    