from typing import Any, Dict, List, Mapping, Optional, Tuple

from torchtune.data import Message, PromptTemplate, truncate
from torchtune.modules.tokenizers import ModelTokenizer
from torchtune.modules.transforms import Transform
from torchtune.modules.transforms.tokenizers import GPT2BaseTokenizer

PHI4_SPECIAL_TOKENS = {
    "<|dummy_0|>": 100_256,
    "<|endoftext|>": 100_257,
    "<|fim_prefix|>": 100_258,
    "<|fim_middle|>": 100_259,
    "<|fim_suffix|>": 100_260,
    "<|dummy_1|>": 100_261,
    "<|dummy_2|>": 100_262,
    "<|dummy_3|>": 100_263,
    "<|im_start|>": 100_264,
    "<|im_end|>": 100_265,
    "<|im_sep|>": 100_266,
    "<|endofprompt|>": 100_276,
}

# Register the remaining reserved IDs as <|dummy_4|> ... <|dummy_87|> placeholder
# tokens, skipping the slot already taken by <|endofprompt|>.
current_dummy_index = 4
for token_id in range(100_267, 100_352):
    if token_id == 100_276:
        continue
    PHI4_SPECIAL_TOKENS[f"<|dummy_{current_dummy_index}|>"] = token_id
    current_dummy_index += 1

# Pre-tokenization regex used by the cl100k BPE, Phi-4's base tokenizer.
CL100K_PATTERN = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"


class Phi4Tokenizer(ModelTokenizer, Transform):
    """
    TikToken tokenizer configured with Phi4 (14B) special tokens.

    Args:
        merges_path (str): Path to merges.txt file.
        vocab_path (str): Path to vocab.json file.
        special_tokens (Optional[Dict[str, int]]): Mapping containing special text tokens and
            their registered token IDs. If left as None, this will be set to the canonical
            Phi4 special tokens.
        max_seq_len (Optional[int]): Max sequence length to truncate tokens to.
        prompt_template (Optional[PromptTemplate]): Template used to format the messages based on their role.
    """

    def __init__(
        self,
        merges_path: str = None,
        vocab_path: str = None,
        special_tokens: Optional[Dict[str, int]] = None,
        max_seq_len: Optional[int] = None,
        prompt_template: Optional[PromptTemplate] = None,
    ):
        self.special_tokens = special_tokens or PHI4_SPECIAL_TOKENS

        # Phi-4 uses <|endoftext|> for both BOS and EOS, and <|dummy_85|> for padding.
        self.eos_id = self.special_tokens["<|endoftext|>"]
        self.bos_id = self.special_tokens["<|endoftext|>"]
        self.pad_id = self.special_tokens["<|dummy_85|>"]

        self.stop_tokens = [self.eos_id]

        self.max_seq_len = max_seq_len
        self.prompt_template = prompt_template

        self.tokenizer_model = GPT2BaseTokenizer(
            vocab_path,
            merges_path,
            self.eos_id,
            self.bos_id,
            self.eos_id,
            self.pad_id,
        )

    @property
    def vocab_size(self):
        return self.tokenizer_model.vocab_size

    def encode(
        self, text: str, add_bos: bool = True, add_eos: bool = True
    ) -> List[int]:
        return self.tokenizer_model.encode(text=text, add_bos=add_bos, add_eos=add_eos)

    def decode(self, ids: List[int], skip_special_tokens: bool = True) -> str:
        """Decode token IDs to strings."""
        ids_for_decode = [
            token_id
            for token_id in ids
            if not (skip_special_tokens and 100_256 <= token_id <= 100_351)
        ]
        return self.tokenizer_model.decode(ids_for_decode)

    def _tokenize_header(self, role: str) -> List[int]:
        # Turn header: <|im_start|>{role}<|im_sep|>
        tokenized_messages = [self.special_tokens["<|im_start|>"]]
        tokenized_messages.extend(self.encode(role, add_bos=False, add_eos=False))
        tokenized_messages.append(self.special_tokens["<|im_sep|>"])
        return tokenized_messages

    def tokenize_messages(
        self,
        messages: List[Message],
        *,
        add_eos: bool = False,
        ignore_system_prompt: bool = False,
    ) -> Tuple[List[int], List[bool]]:
        templated_messages = (
            self.prompt_template(messages) if self.prompt_template else messages
        )

        tokenized_messages = []
        mask = []
        for message in templated_messages:
            if ignore_system_prompt and message.role == "system":
                continue

            tokenized_header = self._tokenize_header(message.role)
            tokenized_messages.extend(tokenized_header)
            mask.extend([message.masked] * len(tokenized_header))

            tokens = []
            for item in message.content:
                if item["type"] == "text":
                    tokens += self.encode(
                        item["content"].rstrip(" "), add_bos=False, add_eos=False
                    )
                else:
                    raise RuntimeError(
                        f"Unsupported message content type: {item['type']}"
                    )

            # Close the turn with <|im_end|>; assistant turns only get it when
            # add_eos is requested.
            if add_eos and message.role == "assistant":
                tokens.append(self.special_tokens["<|im_end|>"])
            elif message.role != "assistant":
                tokens.append(self.special_tokens["<|im_end|>"])

            tokenized_messages.extend(tokens)
            mask.extend([message.masked] * len(tokens))

            if self.max_seq_len and len(tokenized_messages) >= self.max_seq_len:
                break

        if self.max_seq_len and len(tokenized_messages) >= self.max_seq_len:
            tokenized_messages = truncate(
                tokenized_messages, self.max_seq_len, self.eos_id if add_eos else None
            )
            mask = truncate(
                mask, self.max_seq_len, message.masked if add_eos else None
            )

        return tokenized_messages, mask

    def __call__(self, sample: Mapping[str, Any]) -> Mapping[str, Any]:
        """
        Apply `tokenize_messages` to the "messages" field in the sample.
        """
        messages = sample.pop("messages")
        tokens, mask = self.tokenize_messages(messages)
        sample["tokens"] = tokens
        sample["mask"] = mask
        return sample
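
# A minimal usage sketch (illustration only, not part of the module above): the
# merges/vocab paths are placeholders for local copies of the Phi-4 BPE files,
# and `Message` comes from torchtune.data as imported above.
#
#     >>> tokenizer = Phi4Tokenizer(
#     ...     merges_path="/path/to/phi4/merges.txt",
#     ...     vocab_path="/path/to/phi4/vocab.json",
#     ...     max_seq_len=4096,
#     ... )
#     >>> messages = [
#     ...     Message(role="user", content="What is the capital of France?", masked=True),
#     ...     Message(role="assistant", content="The capital of France is Paris."),
#     ... ]
#     >>> tokens, mask = tokenizer.tokenize_messages(messages, add_eos=True)
#     >>> len(tokens) == len(mask)
#     True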