o
    }oi
!                  	   @   s   d dl Z d dlZd dlZd dlmZ d dlmZmZmZ zd dl	Z	W n	 e
y+   Y nw d dlmZ dgZ	ddedee deeef fd	d
ZdZdZg dZdZG dd deZdS )    N)Path)DictListOptional)TokenizerSpecTiktokenTokenizerpath	max_vocabreturnc                 C   s0  |  dsJ t| d}t|}W d   n1 sw   Y  t|ts(J tdt|  |durE|d| }tdt| d i }t|D ]0\}}|	 h dksYJ |d |ksaJ t
|d	 }|d
ksu|t|gksuJ |d ||< qKt|t|ksJ t| ttt|ksJ |S )zK
    Reload the tokenizer JSON file and convert it to Tiktoken format.
    z.jsonrNzVocab size: zCutting vocab to first z tokens.>   rank	token_strtoken_bytesr   r      )endswithopenjsonload
isinstancelistprintlen	enumeratekeysbase64	b64decodebytessetvaluesrange)r   r	   fvocabranksixmerge r&   i/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/common/tokenizers/tiktoken_tokenizer.pyreload_mergeable_ranks   s&    r(   z[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+i   )<unk><s></s><mask><pad><cls><sep>z<SPECIAL_{id}>c                   @   sJ  e Zd ZdZeeddfdededededee	e  f
d	d
Z
defddZde	e fddZdd Zdd Zdd Zdd ZdefddZ	d:de	e defddZedd  Zed!d" Zed#d$ Zed%d& Zed'd( Zed)d* Zed+d, Zed-d. Zed/d0 Zed1d2 Zed3d4 Zed5efd6d7Z ed8d9 Z!dS );r   a.  
    TiktokenTokenizer https://github.com/openai/tiktoken.

    Args:
        model_path: path to tokenizer vocabulary
        num_special_tokens: number of special tokens to generate
        special_tokens: template for user-defined special tokens
        pattern: Regex pattern to split the text
    i  N
vocab_filepattern
vocab_sizenum_special_tokensspecial_tokensc           	      C   s.  |rt j|std| d|d u rt }t|tt|ks)J d| t||  kr6|k s9J  J ttt|ksHJ dt |d| _	|d| _
|d| _|d| _|d	| _|d
| _|d| _|| _td| j || _dd tt||D }|| _|rtd|d  d|d   || | _tt| jt| j  kr|ksn J | j|| | _t|| jd| _dd | j D | _tt| jt| j ksJ dd t| jD | _| j D ]\}}|jddd| j|| j < qtj t!|j"j#|| ji d| _$d S )Nzvocab_file: z is invalidz!Special tokens should be unique: z%Custom special tokens should include r)   r*   r+   r,   r-   r.   r/   zself._vocab_size = c                 S   s   g | ]}t j|d qS ))id)SPECIAL_TOKEN_TEMPLATEformat).0r#   r&   r&   r'   
<listcomp>o       z.TiktokenTokenizer.__init__.<locals>.<listcomp>zAdding special tokens r   z, ..., )r	   c                 S   s   i | ]\}}||qS r&   r&   )r8   kvr&   r&   r'   
<dictcomp>y       z.TiktokenTokenizer.__init__.<locals>.<dictcomp>c                 S   s   i | ]\}}||qS r&   r&   )r8   r#   tokr&   r&   r'   r>   |   r?   utf-8replaceerrors)namepat_strmergeable_ranksr4   )%osr   exists
ValueErrorSPECIAL_TOKENScopyr   r   index_unk_id_bos_id_eos_id_mask_id_pad_id_cls_id_sep_id_vocab_sizer   r3   r   special_fillerr4   inner_vocab_sizer(   token2iditemsid2tokenr   r   shifted_id2tokendecodetiktokenEncodingr   parentrE   	tokenizer)	selfr0   r1   r2   r3   r4   rV   keyvaluer&   r&   r'   __init__R   sH   " 
0
 
zTiktokenTokenizer.__init__textc                    s    j |} fdd|D S )Nc                       g | ]} j |qS r&   )r`   decode_single_token_bytesr8   tokenra   r&   r'   r9      r:   z4TiktokenTokenizer.text_to_tokens.<locals>.<listcomp>r`   encode)ra   re   	token_idsr&   rj   r'   text_to_tokens   s   z TiktokenTokenizer.text_to_tokenstokensc                    s    fdd|D } j |S )Nc                    rf   r&   )r`   encode_single_token)r8   ro   rj   r&   r'   r9      r:   z4TiktokenTokenizer.tokens_to_text.<locals>.<listcomp>)r`   r\   )ra   ro   rm   r&   rj   r'   tokens_to_text   s   z TiktokenTokenizer.tokens_to_textc                 C   s(   || j v r| j |S | j|| j S N)r4   rM   r`   rp   r3   )ra   ri   r&   r&   r'   token_to_id   s   
zTiktokenTokenizer.token_to_idc                    s    fdd|D S )Nc                    s   g | ]}  |qS r&   rs   rh   rj   r&   r'   r9      r?   z3TiktokenTokenizer.tokens_to_ids.<locals>.<listcomp>r&   )ra   ro   r&   rj   r'   tokens_to_ids   s   zTiktokenTokenizer.tokens_to_idsc                 C   s8   || j k r
| j| S || j 8 }| j|}|jdddS )NrA   rB   rC   )r3   r4   r`   rg   r\   )ra   token_idr   r&   r&   r'   id_to_token   s
   


zTiktokenTokenizer.id_to_tokenc                 C   s"   g }|D ]
}| | | q|S rr   )appendrw   )ra   rm   ro   rv   r&   r&   r'   ids_to_tokens   s   zTiktokenTokenizer.ids_to_tokensc                    s"    j |} fdd|D }|S )Nc                    s   g | ]}| j  qS r&   )r3   r8   trj   r&   r'   r9      r?   z1TiktokenTokenizer.text_to_ids.<locals>.<listcomp>rk   )ra   re   ro   r&   rj   r'   text_to_ids   s   zTiktokenTokenizer.text_to_idsTremove_special_tokensc                    s4   |r fdd|D }n|}|rd  |S dS )Nc                    s*   g | ]}| j  jhvr| jkr|qS r&   )boseosr3   rz   rj   r&   r'   r9      s   * z1TiktokenTokenizer.ids_to_text.<locals>.<listcomp> )joinry   )ra   ro   r}   adjusted_tokensr&   rj   r'   ids_to_text   s   zTiktokenTokenizer.ids_to_textc                 C      | j S rr   )rO   rj   r&   r&   r'   bos_id      zTiktokenTokenizer.bos_idc                 C   r   rr   )rP   rj   r&   r&   r'   eos_id   r   zTiktokenTokenizer.eos_idc                 C   r   rr   )rN   rj   r&   r&   r'   unk_id   r   zTiktokenTokenizer.unk_idc                 C   r   rr   )rQ   rj   r&   r&   r'   mask_id   r   zTiktokenTokenizer.mask_idc                 C   r   rr   )rR   rj   r&   r&   r'   pad_id   r   zTiktokenTokenizer.pad_idc                 C   r   rr   )rS   rj   r&   r&   r'   cls_id   r   zTiktokenTokenizer.cls_idc                 C   r   rr   )rT   rj   r&   r&   r'   sep_id   r   zTiktokenTokenizer.sep_idc                 C   r   rr   )rX   rj   r&   r&   r'   r!      r   zTiktokenTokenizer.vocabc                    s6    jjjgj   fddjD }|S )z
        Returns a list of the additional special tokens, excluding [bos, eos, pad, unk] and special_filler.
        Used to return sentinel tokens for e.g. T5.
        c                    s   g | ]}| vr |qS r&   rt   rh   excluding_tokensra   r&   r'   r9      s    zCTiktokenTokenizer.additional_special_tokens_ids.<locals>.<listcomp>)ry   rN   rO   rP   rV   r4   )ra   resultr&   r   r'   additional_special_tokens_ids   s   z/TiktokenTokenizer.additional_special_tokens_idsc                 C   r   rr   r[   rj   r&   r&   r'   decoder   r   zTiktokenTokenizer.decoderc                 C   r   rr   )r!   rj   r&   r&   r'   encoder   r   zTiktokenTokenizer.encoderr
   c                 C   r   rr   )rU   rj   r&   r&   r'   r2      r   zTiktokenTokenizer.vocab_sizec                 C   r   rr   r   rj   r&   r&   r'   	inv_vocab   r   zTiktokenTokenizer.inv_vocab)T)"__name__
__module____qualname____doc__PATTERN_TIKTOKENDEFAULT_TIKTOKEN_MAX_VOCABstrintr   r   rd   rn   rq   rs   ru   rw   ry   r|   boolr   propertyr   r   r   r   r   r   r   r!   r   r   r   r2   r   r&   r&   r&   r'   r   G   sp    

5









	

rr   )r   r   rH   pathlibr   typingr   r   r   r]   ImportError1nemo.collections.common.tokenizers.tokenizer_specr   __all__r   r   r   r(   r   r   rK   r6   r   r&   r&   r&   r'   <module>   s2   

"