o
    ̳i~3                     @   s   d dl Z d dlmZmZmZmZmZmZ d dlm	Z	m
Z
mZ d dlmZ d dlmZmZ dZddd	d
ddddddddddZdZdd eeee D Zi eeZG dd deeZdS )    N)AnyDictListMappingOptionalTuple)MessagePromptTemplatetruncate)	Transform)ModelTokenizerTikTokenBaseTokenizerzs(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+i  i i i i i i i i i	 i
 i  i )<|begin_of_text|><|end_of_text|>z<|reserved_special_token_0|>z<|reserved_special_token_1|><|finetune_right_pad_id|><|step_id|><|start_header_id|><|end_header_id|>
<|eom_id|>
<|eot_id|><|python_tag|>	<|image|>z	<|video|>   c                 C   s"   i | ]}d d|  dd| qS )z<|reserved_special_token_   z|>i  ).0ir   r   V/home/ubuntu/.local/lib/python3.10/site-packages/torchtune/models/llama3/_tokenizer.py
<dictcomp>&   s    r   c                
   @   s  e Zd ZdZ			d2dedeeeef  dee dee fddZ	d	d
 Z
dedefddZedefddZedefddZ		d3dedededee fddZ		d3dee dededefddZdedee fddZdedee fd d!Zdedee fd"d#Zddd$ded%ed&edee fd'd(Zdd)d*ee d&edeee ee f fd+d,Z	-d4d.eeef d/edeeef fd0d1ZdS )5Llama3Tokenizera&  
    tiktoken tokenizer configured with Llama3 Instruct's special tokens, as described in
    https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3

    Args:
        path (str): Path to pretrained tiktoken tokenizer file.
        special_tokens (Optional[Dict[str, int]]): mapping containing special text tokens and
            their registered token IDs. If left as None, this will be set to the canonical
            Llama3 special tokens.
        max_seq_len (Optional[int]): maximum sequence length for tokenizing a single list of messages,
            after which the input will be truncated. Default is None.
        prompt_template (Optional[PromptTemplate]): template used to format the messages based on their role. This is used
            to add structured text around the actual messages. The structured text is used in three scenarios:

            - Task-specific templates to gear models for a particular task that it will expect after training
            - Model-specific templates that are required whenever the model is prompted, such as the [INST]
              tags in Llama2 and in Mistral
            - Community standardized templates, such as :class:`~torchtune.data.ChatMLTemplate`

            The extra text will still get tokenized as normal text, not as special tokens. Default is None.

    Examples:
        >>> tokenizer = Llama3Tokenizer("/path/to/tt_model")
        >>> tokenized_text = tokenizer.encode("Hello world!", add_bos=True, add_eos=True)
        >>> print(tokenized_text)
        [1, 31587, 29644, 102, 2]
    Npathspecial_tokensmax_seq_lenprompt_templatec                 C   s   |d ur|nt | _|   | jd | _| jd | _| jd | _| jd | _| jd | _| jd | _| jd | _	| jd | _
| jd	 | _| jd
 | _| j| j	| j
g| _t|dt| j| j| jd| _|| _|| _td| _td| _d S )Nr   r   r   r   r   r   r   r   r   r   llama3_tiktoken)r    namepatternbos_ideos_idr!   z	<\|.*?\|>z/<\|start_header_id\|>.*?<\|end_header_id\|>\n\n)LLAMA3_SPECIAL_TOKENSr!   _validate_special_tokensr'   r(   pad_idstep_idstart_header_idend_header_ideot_ideom_id
python_tagimage_idstop_tokensr   CL100K_PATTERNtt_modelr"   r#   recompile_special_token_regex_special_token_header_regex)selfr    r!   r"   r#   r   r   r   __init__K   s8   
zLlama3Tokenizer.__init__c                 C   s&   dD ]}|| j vrt| dqdS )zV
        Validate that required special tokens are passed into the tokenizer.
        )r   r   r   r   r   r   r   z missing from special_tokensN)r!   
ValueError)r:   tokenr   r   r   r*   ~   s
   
	z(Llama3Tokenizer._validate_special_tokenstextreturnc                 C   s   | j d| jd|S )z@
        Remove special tokens from the decoded string.
         )r8   subr9   )r:   r>   r   r   r   _remove_special_tokens   s   z&Llama3Tokenizer._remove_special_tokensc                 C      | j jS N)r5   base_vocab_sizer:   r   r   r   rE         zLlama3Tokenizer.base_vocab_sizec                 C   rC   rD   )r5   
vocab_sizerF   r   r   r   rH      rG   zLlama3Tokenizer.vocab_sizeTadd_bosadd_eosc                 C   s   | j j|||dS )N)r>   rI   rJ   )r5   encode)r:   r>   rI   rJ   r   r   r   rK      s   zLlama3Tokenizer.encode	token_idstruncate_at_eosskip_special_tokensc                 C   s"   | j j||d}|r| |S |S )a  
        Decode a list of token ids into a string.

        Args:
            token_ids (List[int]): The list of token ids.
            truncate_at_eos (bool): Whether to truncate the string at the end of
                sequence token. Default is True.
            skip_special_tokens (bool): Whether to show or skip special tokens in the decoded string.
                Default is True.

        Returns:
            str: The decoded string.
        )rL   rM   )r5   decoderB   )r:   rL   rM   rN   decoded_stringr   r   r   rO      s   
zLlama3Tokenizer.decodemessagec                 C   s6   | j g| j|j ddd | jg | jdddd S )zT
        Tokenize header start, message role, and header end as list of ids
        FrI   rJ   z

)r-   rK   rolestripr.   r:   rQ   r   r   r   _tokenize_header   s   z Llama3Tokenizer._tokenize_headerc                 C   s   |j r| jgS | jgS )z>
        Add eot or eom id at the end of the message.
        )eotr/   r0   rU   r   r   r   _tokenize_end   s   zLlama3Tokenizer._tokenize_endc                 C   sz   g }|j D ],}|d dkr|| j|d  ddd7 }q|d dkr)|| jg7 }qtd|d  |jr;| jg| }|S )z9
        Tokenize message content as list of ids
        typer>   contentFrR   imagez"Unsupported message content type: )rZ   rK   rT   r2   RuntimeErroripythonr1   )r:   rQ   tokenized_bodyitemr   r   r   _tokenize_body   s   

zLlama3Tokenizer._tokenize_body)add_start_tokensadd_end_tokensra   rb   c                C   s>   |r|  |ng }| |}|r| |ng }|| | }|S )a  
        Tokenize a message into a list of token ids.

        Args:
            message (Message): The message to tokenize.
            add_start_tokens (bool): Whether to prepend a tokenized header to the message. Default is True.
            add_end_tokens (bool): Whether to append eot or eom id at the end of the message. Default is True.

        Returns:
            List[int]: The list of token ids.
        )rV   r`   rX   )r:   rQ   ra   rb   tokenized_headerr^   tokenized_endtokenized_messager   r   r   tokenize_message   s
   
z Llama3Tokenizer.tokenize_messagerb   messagesc                C   s   | j dur
|  |n|}| jg}dg}t|}t|D ]/\}}||d kr'|nd}	| j||	d}
||
 }||jgt|
  }| jrJt|| jkrJ nq|rX|| jg }|dg }| jrrt|| j|rd| jnd}t|| j|rodnd}||fS )a  
        Tokenize a list of messages into a list of token ids and masks.

        Args:
            messages (List[Message]): The list of messages to tokenize.
            add_end_tokens (bool): Whether to append end tokens ids (end-of-seq, end-of-turn, end-of-message) at the end of the
                last assistant message. This value should be set to False for generation. Default is True.

        Examples:
            >>> # Tokenize a list of messages with default settings
            >>> messages = [
            ...     Message(role="user", content="Hello world!", masked=True),
            ...     Message(role="assistant", content="How are you?", masked=False),
            ... ]
            >>> tokenizer = Llama3Tokenizer("/path/to/tt_model")
            >>> tokenizer.tokenize_messages(messages)
            ([1, 31587, 29644, 102, 1, 31587, 29644, 102, 2], [True, True, True, True, True, False, False, False, True])

            >>> # Tokenize a list of messages with add_end_tokens set to False
            >>> tokenizer.tokenize_messages(messages, add_end_tokens=False)
            ([1, 31587, 29644, 102, 1, 31587, 29644], [True, True, True, True, True, False, False])

        Returns:
            Tuple[List[int], List[bool]]: The list of token ids and the list of masks.
        NT   rg   )	r#   r'   len	enumeraterf   maskedr"   r(   r
   )r:   rh   rb   templated_messagestokensmasknum_messagesr   rQ   add_end_tokens_to_messagere   r   r   r   tokenize_messages  s6   
!

z!Llama3Tokenizer.tokenize_messagesFsample	inferencec                 C   s2   | d}| j|| d\}}||d< ||d< |S )a  
        Apply ``tokenize_messages`` to the "messages" field in the sample.

        Args:
            sample (Mapping[str, Any]): A sample with a "messages" field containing
                a List[Message] to tokenize
            inference (bool): Whether the template is being used for inference or not.

        Returns:
            Mapping[str, Any]: The sample with added "tokens" and "mask" fields
                and the "messages" field removed.
        rh   rg   rn   ro   )poprr   )r:   rs   rt   rh   rn   ro   r   r   r   __call__L  s
   
zLlama3Tokenizer.__call__)NNN)TT)F)__name__
__module____qualname____doc__strr   r   intr	   r;   r*   rB   propertyrE   rH   boolr   rK   rO   r   rV   rX   r`   rf   r   rr   r   r   rv   r   r   r   r   r   .   s    
3	

!

E

r   )r6   typingr   r   r   r   r   r   torchtune.datar   r	   r
   torchtune.modules.transformsr   'torchtune.modules.transforms.tokenizersr   r   r4   SPECIAL_TOKENSNUM_RESERVED_SPECIAL_TOKENSrangerj   RESERVED_TOKENSr)   r   r   r   r   r   <module>   s4    