import os
from typing import Dict, List, Optional, Union

import numpy as np
import sentencepiece
import torch


class SentencePieceTokenizer:
    """
    SentencePieceTokenizer https://github.com/google/sentencepiece

    Args:
        model_path: path to sentence piece tokenizer model.
        special_tokens: either list of special tokens or dictionary of token name to token value
        legacy: when set to True, the previous behavior of the SentencePiece wrapper will be restored,
            including the possibility to add special tokens inside wrapper.
        tokenizer: wraps an existing tokenizer
    """
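
    # Illustrative construction, sketched from the constructor below; the model
    # path is a placeholder, not a value from this module:
    #
    #     tok = SentencePieceTokenizer(model_path="/path/to/tokenizer.model")
    #
    # or, wrapping an already-loaded processor instead of a path:
    #
    #     sp = sentencepiece.SentencePieceProcessor()
    #     sp.Load("/path/to/tokenizer.model")
    #     tok = SentencePieceTokenizer(tokenizer=sp)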

    def __init__(
        self,
        model_path: Optional[str] = None,
        special_tokens: Optional[Union[Dict[str, str], List[str]]] = None,
        legacy: bool = False,
        tokenizer: Optional[sentencepiece.SentencePieceProcessor] = None,
    ):
        model_path_provided = model_path is not None
        tokenizer_provided = tokenizer is not None
        if not (model_path_provided ^ tokenizer_provided):
            raise ValueError("Exactly only one of the arguments 'model_path', 'tokenizer' should be provided")

        if tokenizer_provided:
            self.tokenizer = tokenizer
        else:
            if not os.path.exists(model_path):
                raise ValueError(f"model_path: {model_path} is invalid")
            self.tokenizer = sentencepiece.SentencePieceProcessor()
            self.tokenizer.Load(model_path)

        self.original_vocab_size = self.tokenizer.get_piece_size()
        self.vocab_size = self.tokenizer.get_piece_size()
        self.legacy = legacy
        self.special_token_to_id = {}
        self.id_to_special_token = {}
        if special_tokens:
            if not self.legacy:
                raise ValueError(
                    "Special tokens must be None when legacy is set to False. Provide special tokens at train time."
                )
            self.add_special_tokens(special_tokens)
        self.space_sensitive = self.text_to_tokens('x y') != self.text_to_tokens('x') + self.text_to_tokens('y')

    def text_to_tokens(self, text):
        if self.legacy:
            tokens = []
            idx = 0

            while True:
                indices = {}

                # Find the earliest occurrence of any registered special token.
                for token in self.special_token_to_id:
                    try:
                        indices[token] = text[idx:].index(token)
                    except ValueError:
                        continue

                if len(indices) == 0:
                    break

                next_token = min(indices, key=indices.get)
                next_idx = idx + indices[next_token]

                tokens.extend(self.tokenizer.encode_as_pieces(text[idx:next_idx]))
                tokens.append(next_token)
                idx = next_idx + len(next_token)

            tokens.extend(self.tokenizer.encode_as_pieces(text[idx:]))
            return tokens

        return self.tokenizer.encode_as_pieces(text)

    def encode(self, text):
        if self.legacy:
            ids = []
            idx = 0

            while True:
                indices = {}

                for token in self.special_token_to_id:
                    try:
                        indices[token] = text[idx:].index(token)
                    except ValueError:
                        continue

                if len(indices) == 0:
                    break

                next_token = min(indices, key=indices.get)
                next_idx = idx + indices[next_token]

                ids.extend(self.tokenizer.encode_as_ids(text[idx:next_idx]))
                ids.append(self.special_token_to_id[next_token])
                idx = next_idx + len(next_token)

            ids.extend(self.tokenizer.encode_as_ids(text[idx:]))
            return ids

        return self.tokenizer.encode_as_ids(text)

    def tokens_to_text(self, tokens):
        if isinstance(tokens, np.ndarray):
            tokens = tokens.tolist()
        return self.tokenizer.decode_pieces(tokens)

    def batch_decode(self, ids):
        if isinstance(ids, np.ndarray) or torch.is_tensor(ids):
            ids = ids.tolist()

        if self.legacy:
            text = ""
            last_i = 0

            for i, id in enumerate(ids):
                if id in self.id_to_special_token:
                    text += self.tokenizer.decode_ids(ids[last_i:i]) + " "
                    text += self.id_to_special_token[id] + " "
                    last_i = i + 1

            text += self.tokenizer.decode_ids(ids[last_i:])
            return text.strip()

        return self.tokenizer.decode(ids)

    def token_to_id(self, token):
        if self.legacy and token in self.special_token_to_id:
            return self.special_token_to_id[token]
        return self.tokenizer.piece_to_id(token)

    def ids_to_tokens(self, ids):
        tokens = []
        for id in ids:
            if id >= self.original_vocab_size:
                tokens.append(self.id_to_special_token[id])
            else:
                tokens.append(self.tokenizer.id_to_piece(id))
        return tokens

    def tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
        if isinstance(tokens, str):
            tokens = [tokens]
        ids = []
        for token in tokens:
            ids.append(self.token_to_id(token))
        return ids

    def add_special_tokens(self, special_tokens):
        if not self.legacy:
            raise AttributeError("Special Token addition does not work when legacy is set to False.")

        if isinstance(special_tokens, list):
            for token in special_tokens:
                if (
                    self.tokenizer.piece_to_id(token) == self.tokenizer.unk_id()
                    and token not in self.special_token_to_id
                ):
                    self.special_token_to_id[token] = self.vocab_size
                    self.id_to_special_token[self.vocab_size] = token
                    self.vocab_size += 1
        elif isinstance(special_tokens, dict):
            for token_name, token in special_tokens.items():
                setattr(self, token_name, token)
                if (
                    self.tokenizer.piece_to_id(token) == self.tokenizer.unk_id()
                    and token not in self.special_token_to_id
                ):
                    self.special_token_to_id[token] = self.vocab_size
                    self.id_to_special_token[self.vocab_size] = token
                    self.vocab_size += 1

    @property
    def pad_id(self):
        if self.legacy:
            pad_id = self.tokens_to_ids([self.pad_token])[0]
        else:
            pad_id = self.tokenizer.pad_id()
        return pad_id

    @property
    def bos_token_id(self):
        if self.legacy:
            bos_id = self.tokens_to_ids([self.bos_token])[0]
        else:
            bos_id = self.tokenizer.bos_id()
        return bos_id

    @property
    def eos_token_id(self):
        if self.legacy:
            eos_id = self.tokens_to_ids([self.eos_token])[0]
        else:
            eos_id = self.tokenizer.eos_id()
        return eos_id

    @property
    def sep_id(self):
        if self.legacy:
            return self.tokens_to_ids([self.sep_token])[0]
        raise NameError("Use function token_to_id to retrieve special tokens other than unk, pad, bos, and eos.")

    @property
    def cls_id(self):
        if self.legacy:
            return self.tokens_to_ids([self.cls_token])[0]
        raise NameError("Use function token_to_id to retrieve special tokens other than unk, pad, bos, and eos.")

    @property
    def mask_id(self):
        if self.legacy:
            return self.tokens_to_ids([self.mask_token])[0]
        raise NameError("Use function token_to_id to retrieve special tokens other than unk, pad, bos, and eos.")

    @property
    def unk_id(self):
        return self.tokenizer.unk_id()

    @property
    def additional_special_tokens_ids(self):
        """Returns a list of the additional special tokens (excluding bos, eos, pad, unk). Used to return sentinel tokens for e.g. T5."""
        special_tokens = set(
            [self.bos_token, self.eos_token, self.pad_token, self.mask_token, self.cls_token, self.sep_token]
        )
        return [v for k, v in self.special_token_to_id.items() if k not in special_tokens]

    @property
    def vocab(self):
        main_vocab = [self.tokenizer.id_to_piece(id) for id in range(self.tokenizer.get_piece_size())]
        special_tokens = [
            self.id_to_special_token[self.original_vocab_size + i]
            for i in range(self.vocab_size - self.original_vocab_size)
        ]
        return main_vocab + special_tokens

    # The methods below mimic a small subset of the Hugging Face tokenizer interface.

    def convert_ids_to_tokens(self, ids, skip_special_tokens: bool = False):
        return self.ids_to_tokens(ids)

    def convert_tokens_to_string(self, tokens: List[str]):
        return self.tokens_to_text(tokens)

    def __len__(self):
        return self.vocab_size

    @property
    def is_fast(self):
        return True

    def get_added_vocab(self):
        return None
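

# Minimal usage sketch, added for illustration. The model path is a
# placeholder for any trained SentencePiece .model file, and the special
# token shown is hypothetical.
if __name__ == "__main__":
    tok = SentencePieceTokenizer(model_path="/path/to/tokenizer.model")

    ids = tok.encode("Hello world.")   # piece ids via encode_as_ids
    pieces = tok.ids_to_tokens(ids)    # ids back to piece strings
    text = tok.batch_decode(ids)       # typically round-trips the input text

    print(len(tok), tok.bos_token_id, tok.eos_token_id)
    print(pieces, "->", text)

    # Legacy mode registers extra special tokens on top of the model's
    # vocabulary; encode() then splices their ids into the output wherever
    # the token string appears in the text.
    legacy_tok = SentencePieceTokenizer(
        model_path="/path/to/tokenizer.model",
        special_tokens=["<extra_id_0>"],
        legacy=True,
    )
    print(legacy_tok.encode("Hello <extra_id_0> world."))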