# nemo/export/tiktoken_tokenizer.py

import base64
import json
from pathlib import Path
from typing import Dict, Optional

import numpy as np
import tiktoken
import torch

# Splitting regex handed to tiktoken. It uses \p{...} Unicode classes, which the
# Rust regex engine behind tiktoken supports (Python's `re` module does not).
PATTERN_TIKTOKEN = r"[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+"
DEFAULT_TIKTOKEN_MAX_VOCAB = 2**17  # 131072
SPECIAL_TOKENS = ["<unk>", "<s>", "</s>"]
SPECIAL_TOKEN_TEMPLATE = "<SPECIAL_{id}>"


def reload_mergeable_ranks(path: str, max_vocab: Optional[int] = None) -> Dict[bytes, int]:
    """
    Reload the tokenizer JSON file and convert it to Tiktoken format.
    """
    assert path.endswith(".json")

    # Reload the vocab list from disk.
    with open(path, "r", encoding="utf-8") as f:
        vocab = json.load(f)
    assert isinstance(vocab, list)
    print(f"Vocab size: {len(vocab)}")
    if max_vocab is not None:
        vocab = vocab[:max_vocab]
        print(f"Cutting vocab to first {len(vocab)} tokens.")

    # Build the bytes -> rank mapping that tiktoken expects.
    ranks: Dict[bytes, int] = {}
    for i, x in enumerate(vocab):
        assert x.keys() == {"rank", "token_bytes", "token_str"}
        assert x["rank"] == i
        merge = base64.b64decode(x["token_bytes"])
        # The first 256 ranks must be the single raw bytes 0..255.
        assert i >= 256 or merge == bytes([i])
        ranks[merge] = x["rank"]

    # Sanity check: the ranks must form a dense permutation of 0..len(vocab)-1.
    assert len(ranks) == len(vocab)
    assert set(ranks.values()) == set(range(len(ranks)))

    return ranks
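
# For reference, each entry in the vocab JSON consumed above is expected to look
# like the records below (token_bytes is the base64 encoding of the merged byte
# sequence). The concrete values are illustrative, derived from the asserts in
# reload_mergeable_ranks rather than taken from a real vocab file:
#
#     {"rank": 0, "token_bytes": "AA==", "token_str": "\u0000"}
#     {"rank": 1, "token_bytes": "AQ==", "token_str": "\u0001"}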
d Z	edd Z
edd ZdS )TiktokenTokenizer
vocab_filec                 C   sd   d| _ t}t}t }|| j  }t||d}tjt|j	j
||i d| _|d| _|d| _d S )Ni  )r   )namepat_strmergeable_ranksspecial_tokensr   r   )num_special_tokensDEFAULT_TIKTOKEN_MAX_VOCABPATTERN_TIKTOKENSPECIAL_TOKENScopyr(   tiktokenEncodingr   parentr+   	tokenizerindex_bos_id_eos_id)selfr*   
vocab_sizepatternr.   inner_vocab_sizetoken2idr&   r&   r'   __init__A   s   

zTiktokenTokenizer.__init__c                    s"    j |} fdd|D }|S )Nc                    s   g | ]}| j  qS r&   )r/   .0tr;   r&   r'   
<listcomp>W   s    z,TiktokenTokenizer.encode.<locals>.<listcomp>)r7   encode)r;   texttokensr&   rD   r'   rF   U   s   zTiktokenTokenizer.encodec                    s&    fdd|D }|r j |S dS )Nc                    s0   g | ]}| j  jhvr| jkr| j qS r&   )r9   r:   r/   rA   rD   r&   r'   rE   \   s
    z,TiktokenTokenizer.decode.<locals>.<listcomp> )r7   decode)r;   rH   adjusted_tokensr&   rD   r'   rJ   Z   s   
zTiktokenTokenizer.decodec                 C   s>   t |tjst|r| }t |d tr|d }| |S )Nr   )r   npndarraytorch	is_tensortolistr   rJ   )r;   idsr&   r&   r'   batch_decodeh   s
   
zTiktokenTokenizer.batch_decodec                 C      | j S Nr:   rD   r&   r&   r'   pad_idq      zTiktokenTokenizer.pad_idc                 C   rS   rT   )r9   rD   r&   r&   r'   bos_token_idu   rW   zTiktokenTokenizer.bos_token_idc                 C   rS   rT   rU   rD   r&   r&   r'   eos_token_idy   rW   zTiktokenTokenizer.eos_token_idN)__name__
__module____qualname__strr@   rF   rJ   rR   propertyrV   rX   rY   r&   r&   r&   r'   r)   @   s    	

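# Minimal usage sketch (the path is hypothetical; any vocab JSON with the
# {"rank", "token_bytes", "token_str"} schema accepted above should work):
#
#     tokenizer = TiktokenTokenizer("/path/to/vocab.json")
#     ids = tokenizer.encode("Hello world")  # IDs offset by num_special_tokens (1000)
#     text = tokenizer.decode(ids)           # filters BOS/EOS, undoes the offset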