import json
import os
import warnings
from collections import Counter
from enum import Enum
from pathlib import Path
from typing import Dict, List, NewType, Optional, Union

from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec

__all__ = ['CharTokenizer']

# Number of characters read at once when ``build_vocab`` streams a text file.
# The exact value could not be recovered from the compiled dump; 10 ** 7 is an assumption.
NUMBER_OF_CHARACTERS_READ_BUFFER_SIZE = 10 ** 7


class SpecialTokenString(Enum):
    MASK = 'mask'
    BOS = 'bos'
    EOS = 'eos'
    PAD = 'pad'
    SEP = 'sep'
    CLS = 'cls'
    UNK = 'unk'

    @classmethod
    def has_value(cls, value):
        return value in cls._value2member_map_


SpecialTokenStringType = NewType('SpecialTokenStringType', Union[SpecialTokenString, str])


class CharTokenizer(TokenizerSpec):
    """
    Each character is a token.
    Args:
        vocab_file: path to file with vocabulary for a tokenizer. The file consists of valid Python string literals 
            separated by the new line character. Such literals must contain 1 character. Examples of valid Python 
            literals: ``'a'``, ``'\n'``, ``"'"``, ``'ж'``, ``'\u8976'``. Optionally the first line in the file can be a
            JSON dictionary of special tokens. The keys of the special tokens dictionary are ``'mask_token'``,
            ``'bos_token'`` and so on. Some special token names can be omitted from the special tokens dictionary line.
            A file ``vocab_file`` has to be in ``'utf-8'`` encoding.
        mask_token: mask token. The following is applicable to all special tokens. Parameter ``mask_token`` is used
            for adding mask token to vocabulary or for modification of mask token present in special tokens dictionary
            in the first line of file ``vocab_file``. Parameter ``mask_token`` can be either of type ``bool`` or a 
            ``str`` of length 1. 
            
            If ``mask_token`` is ``bool`` it has to be ``False``. If ``mask_token`` is ``True`` an exception is raised.
            If ``mask_token`` is ``False`` and ``mask_token`` is present in special tokens dictionary in vocabulary
            file ``vocab_file``, then ``mask_token`` is removed from the special tokens dictionary.
            
            If the parameter ``mask_token`` is a string, then such strings in the input sequence are interpreted as
            mask tokens.
        bos_token: the beginning of sequence token. See more in ``mask_token`` parameter description.
        eos_token: the end of sequence token. Usually equal to sep_token. See more in ``mask_token`` parameter 
            description.
        pad_token: token to use for padding. See more in ``mask_token`` parameter description.
        sep_token: token used for separating sequences. See more in ``mask_token`` parameter description.
        cls_token: class token. Usually equal to bos_token. See more in ``mask_token`` parameter description.
        unk_token: token to use for unknown tokens. If the parameter ``unk_token`` is set and there is a character
            in the input of ``text_to_ids`` or ``text_to_tokens`` methods which is not in the vocabulary, then
            such an unknown character is tokenized into ``unk_token``. If the parameter ``unk_token`` is ``False``,
            then unknown tokens are discarded. See more in ``mask_token`` parameter description.
        special_token_to_prepend: special token to prepend to the output of ``text_to_ids`` or ``text_to_tokens``
            methods. This option can be used if you decide to add EOS and BOS tokens to the input at the stage of
            tokenization. Possible options are: ``'mask'``, ``'bos'``, ``'eos'``, ``'pad'``, ``'sep'``, ``'cls'``,
            ``'unk'``.
        special_token_to_append: special token to append to the output of ``text_to_ids`` or ``text_to_tokens``
            methods. See more in the description of ``special_token_to_prepend`` parameter.
        special_tokens_to_remove_while_decoding: which special tokens are removed before detokenization. If this
            parameter equals ``'all'``, then all special tokens are removed. The parameter
            ``special_tokens_to_remove_while_decoding`` can also be a list of values from this set
            ``'mask'``, ``'bos'``, ``'eos'``, ``'pad'``, ``'sep'``, ``'cls'``, ``'unk'``.
    """
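
    # Example (an illustrative sketch only; the file name and the token strings below are
    # placeholders, not values shipped with NeMo). A vocabulary file whose first line is a
    # special tokens dictionary,
    #
    #     {"bos_token": "<BOS>", "eos_token": "<EOS>", "pad_token": "<PAD>", "unk_token": "<UNK>"}
    #     'a'
    #     'b'
    #     ' '
    #     '\n'
    #
    # could be loaded and used like this:
    #
    #     tokenizer = CharTokenizer(
    #         "char_vocab.txt",
    #         special_token_to_prepend='bos',
    #         special_token_to_append='eos',
    #     )
    #     ids = tokenizer.text_to_ids("ab a")   # BOS and EOS ids are added automatically
    #     text = tokenizer.ids_to_text(ids)     # special tokens are stripped while decoding
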
    def __init__(
        self,
        vocab_file: str,
        mask_token: Optional[Union[SpecialTokenString, str]] = None,
        bos_token: Optional[Union[SpecialTokenString, str]] = None,
        eos_token: Optional[Union[SpecialTokenString, str]] = None,
        pad_token: Optional[Union[SpecialTokenString, str]] = None,
        sep_token: Optional[Union[SpecialTokenString, str]] = None,
        cls_token: Optional[Union[SpecialTokenString, str]] = None,
        unk_token: Optional[Union[SpecialTokenString, str]] = None,
        special_token_to_prepend: Optional[SpecialTokenStringType] = None,
        special_token_to_append: Optional[SpecialTokenStringType] = None,
        special_tokens_to_remove_while_decoding: Union[List[SpecialTokenStringType], str] = 'all',
    ):
        vocab_file = Path(vocab_file).expanduser()
        with vocab_file.open(encoding='utf-8') as f:
            first_line = f.readline()
            if first_line[0] == '{':
                special_tokens_dict = json.loads(first_line)
                self.check_special_tokens_dict_from_file(special_tokens_dict, vocab_file)
                vocab_list = f.readlines()
            else:
                special_tokens_dict = {}
                vocab_list = [first_line] + f.readlines()
        special_tokens_dict = self.update_special_tokens_dict(
            special_tokens_dict, mask_token, bos_token, eos_token, pad_token, sep_token, cls_token, unk_token
        )
        for e in SpecialTokenString:
            name = e.value + '_token'
            setattr(self, name, special_tokens_dict[name] if name in special_tokens_dict else None)
        for k, v in special_tokens_dict.items():
            setattr(self, k, v)
        for value, name in [
            (special_token_to_prepend, 'special_token_to_prepend'),
            (special_token_to_append, 'special_token_to_append'),
        ]:
            self.check_special_token_name(name, value, special_tokens_dict)
            # The attribute stores the name of the corresponding ``*_token`` attribute, e.g. ``'bos_token'``.
            if isinstance(value, SpecialTokenString):
                value = value.value
            setattr(self, name, value if value is None else value + '_token')
        self.vocab = {}
        count = 0
        for value in special_tokens_dict.values():
            self.vocab[value] = count
            count += 1
        for i, token in enumerate(vocab_list):
            # Every remaining line is a Python string literal such as 'a' or '\n': strip the surrounding
            # quotes and the trailing newline, unescaping backslash sequences if present.
            if token[1:-2].startswith('\\'):
                token = token[1:-2].encode('ascii').decode('unicode_escape')
            else:
                token = token[1:-2]
            self.check_token_from_file(token, vocab_file, i)
            if token not in self.vocab:
                self.vocab[token] = count
                count += 1
        self.inv_vocab = {v: k for k, v in self.vocab.items()}
        self.vocab_size = len(self.vocab)
        self.check_special_tokens_to_remove_while_decoding(
            special_tokens_to_remove_while_decoding, special_tokens_dict
        )
        if special_tokens_to_remove_while_decoding == 'all':
            self.special_token_ids_to_remove_while_decoding = self.tokens_to_ids(
                [v for v in special_tokens_dict.values()]
            )
        else:
            self.special_token_ids_to_remove_while_decoding = [
                getattr(self, e + '_id') for e in special_tokens_to_remove_while_decoding
            ]

    @classmethod
    def check_special_tokens_dict_from_file(cls, special_tokens_dict, vocab_file):
        for k, v in special_tokens_dict.items():
            if k[-6:] != '_token' or not SpecialTokenString.has_value(k[:-6]):
                raise ValueError(
                    f"Unsupported key {repr(k)} in special tokens dictionary in vocabulary file {vocab_file} "
                    f"(first line). Supported keys are {[e.value + '_token' for e in SpecialTokenString]}."
                )
            if not isinstance(v, str):
                raise ValueError(
                    f"Values of special tokens dictionary in vocabulary file {vocab_file} (first line) have to be of "
                    f"type `str`, whereas type of item '{k}' value {repr(v)} is `{type(v)}`."
                )
            elif len(v) == 0:
                raise ValueError(
                    f"Values of special tokens dictionary in vocabulary file {vocab_file} (first line) have to be "
                    f"non-empty strings, whereas value of item '{k}' is an empty string."
                )
        cls.check_special_tokens_dict_for_duplicate_values(
            special_tokens_dict, f"Loaded from vocabulary file {vocab_file}"
        )

    @staticmethod
    def check_special_tokens_dict_for_duplicate_values(special_tokens_dict, err_msg_prefix):
        if len(special_tokens_dict) != len(set(special_tokens_dict.values())):
            tokens_with_equal_values = []
            duplicate_values = []
            # For every token, collect the preceding tokens that share its value.
            for k, v in list(reversed(list(special_tokens_dict.items())))[:-1]:
                tokens = [k]
                for kk, vv in special_tokens_dict.items():
                    if kk == k:
                        break
                    if v == vv:
                        tokens.append(kk)
                if len(tokens) > 1:
                    tokens_with_equal_values.append(tokens)
                    duplicate_values.append(v)
            if tokens_with_equal_values:
                dup_values_msg = '. '.join(
                    [f"Tokens {t} have value '{v}'" for t, v in zip(tokens_with_equal_values, duplicate_values)]
                )
                raise ValueError(
                    err_msg_prefix + ' special tokens dictionary has duplicate values. ' + dup_values_msg
                )

    @classmethod
    def update_special_tokens_dict(
        cls,
        init_special_tokens_dict: Dict[str, str],
        mask_token: Optional[Union[SpecialTokenString, str]] = None,
        bos_token: Optional[Union[SpecialTokenString, str]] = None,
        eos_token: Optional[Union[SpecialTokenString, str]] = None,
        pad_token: Optional[Union[SpecialTokenString, str]] = None,
        sep_token: Optional[Union[SpecialTokenString, str]] = None,
        cls_token: Optional[Union[SpecialTokenString, str]] = None,
        unk_token: Optional[Union[SpecialTokenString, str]] = None,
    ) -> Dict[str, str]:
        special_tokens_dict = init_special_tokens_dict.copy()
        for value, name in zip(
            [mask_token, bos_token, eos_token, pad_token, sep_token, cls_token, unk_token],
            ['mask_token', 'bos_token', 'eos_token', 'pad_token', 'sep_token', 'cls_token', 'unk_token'],
        ):
            if value is not None:
                if isinstance(value, bool):
                    if value:
                        raise ValueError(
                            f"If `CharTokenizer` constructor parameter `{name}` is `bool` it has to be `False`"
                        )
                    if name in special_tokens_dict:
                        del special_tokens_dict[name]
                    else:
                        warnings.warn(
                            f"Cannot remove special token `{name}` since it is not in special tokens dictionary "
                            f"{init_special_tokens_dict}."
                        )
                elif not isinstance(value, str):
                    raise ValueError(
                        f"`CharTokenizer` constructor parameter `{name}` has to be either `False` or belong to type "
                        f"`str`, whereas type of `{name}` is `{type(value)}`."
                    )
                else:
                    special_tokens_dict[name] = value
        cls.check_special_tokens_dict_for_duplicate_values(
            special_tokens_dict,
            "After updating special tokens dictionary with tokens passed in `CharTokenizer` constructor parameters,",
        )
        return special_tokens_dict

    @staticmethod
    def check_token_from_file(token, vocab_file, line_i):
        if not isinstance(token, str) or len(token) != 1:
            raise ValueError(
                f"Each line in vocabulary has to be a Python string literal containing 1 character. "
                f"Encountered {repr(token)} on line {line_i} in file {vocab_file}."
            )

    @staticmethod
    def check_special_token_name(parameter_name, value, special_tokens_dict):
        if value is not None:
            # Accept either a ``SpecialTokenString`` member or its string value.
            if isinstance(value, SpecialTokenString):
                value = value.value
            if not SpecialTokenString.has_value(value):
                raise ValueError(
                    f"Value {repr(value)} of parameter `{parameter_name}` is wrong. Supported values are "
                    f"{[e.value for e in SpecialTokenString]}."
                )
            elif value + '_token' not in special_tokens_dict:
                raise ValueError(
                    f"You should provide `{value + '_token'}` parameter to `CharTokenizer` constructor if you wish "
                    f"to pass token {repr(value)} in parameter `{parameter_name}`."
                )

    @staticmethod
    def check_special_tokens_to_remove_while_decoding(special_tokens_to_remove_while_decoding, special_tokens_dict):
        if isinstance(special_tokens_to_remove_while_decoding, list):
            for i, value in enumerate(special_tokens_to_remove_while_decoding):
                if not SpecialTokenString.has_value(value):
                    raise ValueError(
                        f"Wrong element with value {repr(value)} in position {i} of parameter "
                        f"`special_tokens_to_remove_while_decoding` of `CharTokenizer` constructor. Supported values "
                        f"are {[e.value for e in SpecialTokenString]}."
                    )
                elif value + '_token' not in special_tokens_dict:
                    raise ValueError(
                        f"You should provide `{value + '_token'}` parameter to `CharTokenizer` constructor if you "
                        f"wish to pass token {repr(value)} in parameter `special_tokens_to_remove_while_decoding`. "
                        f"`{value}` was detected in position {i} in `special_tokens_to_remove_while_decoding`."
                    )
        elif not (
            isinstance(special_tokens_to_remove_while_decoding, str)
            and special_tokens_to_remove_while_decoding == 'all'
        ):
            raise ValueError(
                "Parameter `special_tokens_to_remove_while_decoding` of `CharTokenizer` constructor has to be equal "
                "to a string 'all' or be a list of values from set "
                f"{', '.join(e.value for e in SpecialTokenString)} whereas "
                f"`special_tokens_to_remove_while_decoding={repr(special_tokens_to_remove_while_decoding)}`."
            )

    def text_to_tokens(self, text: str) -> List[str]:
        token_candidates = [char for char in text]
        tokens = []
        if self.special_token_to_prepend is not None:
            tokens.append(getattr(self, self.special_token_to_prepend))
        for i, token in enumerate(token_candidates):
            if token in self.vocab:
                tokens.append(token)
            elif self.unk_token is not None:
                tokens.append(self.unk_token)
            else:
                warnings.warn(
                    f"Character {repr(token)} in position {i} is not present in vocabulary and no `<UNK>` token was "
                    f"set. Character {repr(token)} is discarded."
                )
        if self.special_token_to_append is not None:
            tokens.append(getattr(self, self.special_token_to_append))
        return tokens

    def tokens_to_text(self, tokens: List[str]) -> str:
        return self.ids_to_text(self.tokens_to_ids(tokens))

    def text_to_ids(self, text: str) -> List[int]:
        ids = [self.vocab[token] for token in self.text_to_tokens(text)]
        return ids

    def ids_to_text(self, ids: List[int]) -> str:
        ids_ = [id_ for id_ in ids if id_ not in self.special_token_ids_to_remove_while_decoding]
        return ''.join(self.ids_to_tokens(ids_))

    def tokens_to_ids(self, tokens: List[str]) -> List[int]:
        return [self.vocab[token] for token in tokens]

    def token_to_id(self, token: str) -> int:
        return self.vocab[token]

    def ids_to_tokens(self, ids: List[int]) -> List[str]:
        return [self.inv_vocab[id_] for id_ in ids]

    @staticmethod
    def check_special_token_id_getting(special_token, id_name):
        if special_token is None:
            token_param = id_name[:-3] + '_token'
            raise ValueError(
                f"Cannot return `{id_name}` since `{token_param}` is not set. To obtain `{id_name}` you need to pass "
                f"parameter `{token_param}` to `CharTokenizer` constructor."
            )

    @property
    def pad_id(self):
        self.check_special_token_id_getting(self.pad_token, 'pad_id')
        return self.vocab[self.pad_token]

    @property
    def bos_id(self):
        self.check_special_token_id_getting(self.bos_token, 'bos_id')
        return self.vocab[self.bos_token]

    @property
    def eos_id(self):
        self.check_special_token_id_getting(self.eos_token, 'eos_id')
        return self.vocab[self.eos_token]

    @property
    def unk_id(self):
        self.check_special_token_id_getting(self.unk_token, 'unk_id')
        return self.vocab[self.unk_token]

    @property
    def mask_id(self):
        self.check_special_token_id_getting(self.mask_token, 'mask_id')
        return self.vocab[self.mask_token]

    @property
    def sep_id(self):
        self.check_special_token_id_getting(self.sep_token, 'sep_id')
        return self.vocab[self.sep_token]

    @property
    def cls_id(self):
        self.check_special_token_id_getting(self.cls_token, 'cls_id')
        return self.vocab[self.cls_token]

    @classmethod
    def create_special_tokens_dict(
        cls,
        mask_token: Optional[str] = None,
        bos_token: Optional[str] = None,
        eos_token: Optional[str] = None,
        pad_token: Optional[str] = None,
        sep_token: Optional[str] = None,
        cls_token: Optional[str] = None,
        unk_token: Optional[str] = None,
    ) -> Dict[str, str]:
        special_tokens_dict = {}
        for value, name in zip(
            [mask_token, bos_token, eos_token, pad_token, sep_token, cls_token, unk_token],
            ['mask_token', 'bos_token', 'eos_token', 'pad_token', 'sep_token', 'cls_token', 'unk_token'],
        ):
            if value is not None:
                if not isinstance(value, str):
                    raise ValueError(
                        f"The type of parameter `{name}` has to be `None` or `str`, found `{type(value)}`."
                    )
                elif len(value) == 0:
                    raise ValueError(f"If the parameter `{name}` is `str`, then its length has to be nonzero.")
                elif value in special_tokens_dict.values():
                    other_name = None
                    for k, v in special_tokens_dict.items():
                        if v == value:
                            other_name = k
                    raise ValueError(
                        f"The value {repr(value)} of special token `{name}` is the same as the value of special "
                        f"token `{other_name}`."
                    )
                special_tokens_dict[name] = value
        return special_tokens_dict

    @staticmethod
    def check_characters_to_exclude_from_vocabulary(characters_to_exclude_from_vocabulary: List[str]):
        for i, char in enumerate(characters_to_exclude_from_vocabulary):
            if not isinstance(char, str):
                raise ValueError(
                    f"Character to exclude from vocabulary has to be `str`, whereas an element in position {i} is of "
                    f"type `{type(char)}`."
                )
            elif len(char) != 1:
                raise ValueError(
                    f"A length of an element of `characters_to_exclude_from_vocabulary` parameter has to be 1. The "
                    f"length of an element in position {i} is {len(char)}."
                )

    @staticmethod
    def check_text_and_text_file_name(text, text_file_name):
        if text is None and text_file_name is None:
            raise ValueError(
                "Exactly one of parameters `text` and `text_file_name` should be provided whereas both parameters "
                "are `None`."
            )
        if text is not None and text_file_name is not None:
            raise ValueError(
                "Exactly one of parameters `text` and `text_file_name` has to be provided, whereas both parameters "
                "are not `None`."
            )
        if text is not None and not isinstance(text, str):
            raise ValueError(f"Parameter `text` has to be of type `str`, whereas it belongs to type `{type(text)}`.")

    @classmethod
    def build_vocab(
        cls,
        save_path: Union[str, bytes, os.PathLike],
        text: Optional[str] = None,
        text_file_name: Optional[Union[str, bytes, os.PathLike]] = None,
        characters_to_exclude: Optional[List[str]] = None,
        vocab_size: Optional[int] = None,
        mask_token: Optional[str] = None,
        bos_token: Optional[str] = None,
        eos_token: Optional[str] = None,
        pad_token: Optional[str] = None,
        sep_token: Optional[str] = None,
        cls_token: Optional[str] = None,
        unk_token: Optional[str] = None,
    ) -> None:
        """
        Creates character vocabulary and saves it to file ``save_path``. You should provide one of parameters ``text``
        and ``text_file_name``. The format of the created character vocabulary file is as follows:
        ```
        {['mask_token': "ANY NON EMPTY STRING", ]['bos_token': "ANY NON EMPTY STRING", ] and so on}
        ' '
        'e'
        ...
        ```
        The first line is a JSON dictionary which contains special tokens. These special tokens are set using the
        parameters ``mask_token``, ``bos_token``, ``eos_token``, ``pad_token``, ``sep_token``, ``cls_token``,
        ``unk_token``. Other lines in the created vocabulary file are Python string literals containing one
        character each.

        Args:
            save_path: path to the output text file. If the ``save_path`` parent directory does not exist, it will be
                created.
            text: a string whose characters are used for vocabulary creation.
            text_file_name: path to a file whose characters are used for vocabulary creation. Use this parameter if
                the text in the file is too large to be loaded in memory.
            characters_to_exclude: a list of characters which will not be added to vocabulary.
            vocab_size: vocabulary size. If this parameter is set, only the most frequent ``vocab_size`` characters are added
                to vocabulary.
            mask_token: mask token
            bos_token: the beginning of sequence token
            eos_token: the end of sequence token. Usually equal to sep_token.
            pad_token: token to use for padding.
            sep_token: token used for separating sequences.
            cls_token: class token. Usually equal to bos_token.
            unk_token: token to use for unknown tokens. If the parameter ``unk_token`` is set and there is a character
                in the input of ``text_to_ids`` or ``text_to_tokens`` methods which is not in the vocabulary, then
                such an unknown character is tokenized into ``unk_token``. If the parameter ``unk_token`` is ``False``,
                then unknown tokens are discarded.
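
        Example (an illustrative usage sketch; the file names here are placeholders):
        ```
        CharTokenizer.build_vocab(
            'char_vocab.txt', text_file_name='train.txt', pad_token='<PAD>', unk_token='<UNK>', vocab_size=512
        )
        tokenizer = CharTokenizer('char_vocab.txt')
        ```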
        """
        special_tokens_dict = cls.create_special_tokens_dict(
            mask_token, bos_token, eos_token, pad_token, sep_token, cls_token, unk_token
        )
        if characters_to_exclude is None:
            characters_to_exclude = []
        else:
            cls.check_characters_to_exclude_from_vocabulary(characters_to_exclude)
        cls.check_text_and_text_file_name(text, text_file_name)
        if text is not None:
            counter = Counter(text)
        else:
            assert text_file_name is not None
            text_file_name = Path(text_file_name).expanduser()
            counter = Counter()
            # Stream the file in fixed-size chunks so very large corpora do not have to fit in memory.
            with text_file_name.open(encoding='utf-8') as f:
                while True:
                    segment = f.read(NUMBER_OF_CHARACTERS_READ_BUFFER_SIZE)
                    if not segment:
                        break
                    counter.update(segment)
        for char in characters_to_exclude:
            if char in counter:
                del counter[char]
        save_path = Path(save_path).expanduser()
        save_path.parent.mkdir(exist_ok=True, parents=True)
        with save_path.open('w') as f:
            f.write(json.dumps(special_tokens_dict) + '\n')
            if vocab_size is None:
                for char, _ in sorted(counter.items(), key=lambda x: -x[1]):
                    f.write(repr(char) + '\n')
            else:
                # Special tokens occupy the first vocabulary entries, so fewer characters are written.
                vocab_size -= len(special_tokens_dict)
                for i, (char, _) in enumerate(sorted(counter.items(), key=lambda x: -x[1])):
                    if i < vocab_size:
                        f.write(repr(char) + '\n')
                    else:
                        break