o
    }oiH                     @   sN   d dl mZmZ d dlmZ d dlmZ d dlm	Z	 dgZ
G dd deZdS )    )ListOptional)AutoTokenizer)TokenizerSpec)loggingr   c                    @   s  e Zd ZdZdddddddddg ddddfdedee dee dee d	ee d
ee dee dee dee dee dee dee dee dedee fddZ					dLdedee dee dee dee dee fddZ	e
dd ZdedefddZe
dd Zd d! Zd"d# Zd$d% Zd&d' Zd(d) Zd*d+ Zd,d- ZdMd.d/Ze
d0d1 Ze
d2d3 Ze
d4d5 Ze
d6d7 Ze
d8d9 Ze
d:d; Ze
d<d= Ze
d>d? Ze
d@dA Z e
dBdC Z!e
dDdE Z"dNdFedGefdHdIZ#dFefdJdKZ$dS )Or   z
    Wrapper of HuggingFace AutoTokenizer
    https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoTokenizer.

    NTFpretrained_model_name
vocab_filemerges_file
mask_token	bos_token	eos_token	pad_token	sep_token	cls_token	unk_tokenadditional_special_tokensuse_fasttrust_remote_codeinclude_special_tokenschat_templatec                 C   sl  z|  |||||| | jsJ dW n4 tyG   z|  |||| || | js-J dW n tyD } z
td| d| d}~ww Y nw || _t| j| _i }|
dur[|
|d< |durc||d< |durk||d< |durt||d< n| jjdu r| jjr| jj|d< |dur||d	< n| jjdu r| jjr| jj|d	< |dur||d
< n| jj	du r| jj
r| jj
|d
< |	dur|	|d< n| jj
du r| jj	r| jj	|d< |dur||d< g }||||||	|
fD ]}|dur|| j vr|| q|D ]}|dur|| j vr|| qt|dkr	 t| d | | | d| d| d k| _i | _dS )a.  
        Args:
            pretrained_model_name: corresponds to HuggingFace-AutoTokenizer's 'pretrained_model_name_or_path' input
                argument. For more details please refer to the documentation of the `from_pretrained` method here:
                https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoTokenizer.
                The list of all supported models can be found here: https://huggingface.co/models
            vocab_file: path to file with vocabulary which consists
                of characters separated by newlines.
            mask_token: mask token
            bos_token: the beginning of sequence token
            eos_token: the end of sequence token. Usually equal to sep_token
            pad_token: token to use for padding
            sep_token: token used for separating sequences
            cls_token: class token. Usually equal to bos_token
            unk_token: token to use for unknown tokens
            additional_special_tokens: list of other tokens beside standard special tokens (bos, eos, pad, etc.). For
                example, sentinel tokens for T5 (<extra_id_0>, <extra_id_1>, etc.)
            use_fast: whether to use fast HuggingFace tokenizer
            include_special_tokens: when True, converting text to ids will include special tokens / prompt tokens (if
                any), yielding self.tokenizer(text).input_ids
            chat_template: The chat template string to format "messages" with against the underlying HF tokneizer with
                apply_chat_template function
        ztokenizer not initializedz4Unable to instantiate HuggingFace AUTOTOKENIZER for z. Exception: Nr   r
   r   r   r   r   r   r   r   zt 
 will be added to the vocabulary.
Please resize your model accordingly, see NLP_Tokenizers.ipynb for more details.zx yxy)_initialize_tokenizer	tokenizer	Exception
ValueErrorr   lenoriginal_vocab_sizer   r   r   r   	get_vocabappendr   warningadd_special_tokenstext_to_tokensspace_sensitive_inv_vocab_dict)selfr   r   r	   r
   r   r   r   r   r   r   r   r   r   r   r   especial_tokens_dictnew_tokens_in_vocabtoken r*   q/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py__init__"   s~   )






 
zAutoTokenizer.__init__c                 C   s   |d u rt j|||d| _n|d u rt j||||d| _nt j|||||d| _|d urDt| jdd d ur:td || j_d| j_d S d S )N)pretrained_model_name_or_pathr   r   )r-   r   r   r   )r-   r   r	   r   r   r   zHYou are overwriting tokenizer's chat template, confirm this is intended.jinja)AUTOTOKENIZERfrom_pretrainedr   getattrr   infor   chat_template_format)r%   r   r   r	   r   r   r   r*   r*   r+   r      s4   



z#AutoTokenizer._initialize_tokenizerc                 C   s
   t | jS )z
        Returns the size of the tokenizer's vocabulary.

        Returns:
            int: The number of tokens in the vocabulary.
        )r   r   r%   r*   r*   r+   
vocab_size   s   
zAutoTokenizer.vocab_sizer'   returnc                 C   sL   | j |}|dkrt| d | j jD ]}t| |t| j |d q|S )a  
        Adds a dictionary of special tokens (eos, pad, cls...). If special tokens are NOT in the vocabulary, they are
        added to it (indexed starting from the last index of the current vocabulary).

        Args:
            special_tokens_dict: dict of string. Keys should be in the list of predefined special attributes:
                [``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``,
                ``mask_token``, ``additional_special_tokens``].
                Tokens are only added if they are not already in the vocabulary.

        Returns:
            Number of tokens added to the vocabulary.
        r   z5 special tokens added, resize your model accordingly.N)r   r!   r   r2   SPECIAL_TOKENS_ATTRIBUTESsetattrr1   )r%   r'   num_tokens_addedkr*   r*   r+   r!      s   z AutoTokenizer.add_special_tokensc                    s    fdd j D S )z
        Returns a list of the additional special tokens' IDs (excluding bos, eos, pad, unk).

        Returns:
            List[int]: List of token IDs for additional special tokens, such as sentinel tokens for T5.
        c                    s   g | ]}  |qS r*   )token_to_id).0r)   r4   r*   r+   
<listcomp>       z?AutoTokenizer.additional_special_tokens_ids.<locals>.<listcomp>)r   r4   r*   r4   r+   additional_special_tokens_ids   s   z+AutoTokenizer.additional_special_tokens_idsc                 C      | j |}|S )z
        Converts text into a list of tokens.

        Args:
            text (str): Input text to be tokenized.

        Returns:
            List[str]: List of tokens.
        )r   tokenize)r%   texttokensr*   r*   r+   r"         
zAutoTokenizer.text_to_tokensc                 C   r@   )z
        Converts a list of tokens back into text.

        Args:
            tokens (List[str]): List of tokens to be converted.

        Returns:
            str: The reconstructed text.
        )r   convert_tokens_to_string)r%   rC   rB   r*   r*   r+   tokens_to_text  rD   zAutoTokenizer.tokens_to_textc                 C   s   |  |gd S )z
        Converts a single token to its corresponding ID.

        Args:
            token (str): The token to convert.

        Returns:
            int: The ID corresponding to the token.
        r   )tokens_to_ids)r%   r)   r*   r*   r+   r;     s   
zAutoTokenizer.token_to_idc                 C   r@   )z
        Converts a list of tokens to their corresponding IDs.

        Args:
            tokens (List[str]): List of tokens to convert.

        Returns:
            List[int]: List of token IDs.
        )r   convert_tokens_to_ids)r%   rC   idsr*   r*   r+   rG     rD   zAutoTokenizer.tokens_to_idsc                 C   r@   )z
        Converts a list of token IDs back to tokens.

        Args:
            ids (List[int]): List of token IDs to convert.

        Returns:
            List[str]: List of tokens.
        )r   convert_ids_to_tokens)r%   rI   rC   r*   r*   r+   ids_to_tokens,  rD   zAutoTokenizer.ids_to_tokensc                 C   s*   | j r	| |jS | |}| |}|S )a+  
        Converts text directly to token IDs.

        Args:
            text (str): Input text to be converted to IDs.

        Returns:
            List[int]: List of token IDs. If include_special_tokens is True, will include special tokens from the
            tokenizer's configuration.
        )r   r   	input_idsr"   rG   )r%   rB   rC   rI   r*   r*   r+   text_to_ids9  s
   

zAutoTokenizer.text_to_idsc                 O   s   | j j|i |S )z*Appies chat template and tokenizes results)r   apply_chat_template)r%   argskwargsr*   r*   r+   rN   J  s   z!AutoTokenizer.apply_chat_templatec                    s4     |}|r fdd|D }n|} |}|S )aF  
        Converts token IDs back to text.

        Args:
            ids (List[int]): List of token IDs to convert to text.
            remove_special_tokens (bool): Whether to remove special tokens (like [PAD], [CLS], etc.) from the output
            text.

        Returns:
            str: The reconstructed text.
        c                    s   g | ]
}| j jvr|qS r*   )r   all_special_tokens)r<   tr4   r*   r+   r=   \  s    z-AutoTokenizer.ids_to_text.<locals>.<listcomp>)rK   rF   )r%   rI   remove_special_tokensrC   tokens_cleanrB   r*   r4   r+   ids_to_textN  s   

zAutoTokenizer.ids_to_textc                    s0   dd | j j D   fddtt D S )z
        Returns the vocabulary as a list where the index corresponds to the token ID.

        Returns:
            List[str]: List of tokens in the vocabulary.
        c                 S      i | ]\}}||qS r*   r*   r<   r:   vr*   r*   r+   
<dictcomp>j  r>   z'AutoTokenizer.vocab.<locals>.<dictcomp>c                    s   g | ]} | qS r*   r*   )r<   iid2vocabr*   r+   r=   k  s    z'AutoTokenizer.vocab.<locals>.<listcomp>)r   vocabitemsranger   r4   r*   r[   r+   r]   b  s   zAutoTokenizer.vocabc                 C   s(   | j i krdd | jj D | _ | j S )z
        Returns the inverse vocabulary mapping (token to ID).

        Returns:
            Dict[str, int]: Dictionary mapping tokens to their IDs.
        c                 S   rV   r*   r*   rW   r*   r*   r+   rY   v  r>   z+AutoTokenizer.inv_vocab.<locals>.<dictcomp>)r$   r   r]   r^   r4   r*   r*   r+   	inv_vocabm  s   
zAutoTokenizer.inv_vocabc                 C   (   t | ddu r	dS | t | dgd S )z
        Gets the ID of the padding token.

        Returns:
            int or None: The ID of the padding token if it exists, None otherwise.
        r   Nr   r1   rG   r4   r*   r*   r+   pad_idy     zAutoTokenizer.pad_idc                 C   ra   )z
        Gets the ID of the beginning-of-sequence token.

        Returns:
            int or None: The ID of the BOS token if it exists, None otherwise.
        r   Nr   rb   r4   r*   r*   r+   bos_id  rd   zAutoTokenizer.bos_idc                 C   ra   )z
        Gets the ID of the end-of-sequence token.

        Returns:
            int or None: The ID of the EOS token if it exists, None otherwise.
        r   Nr   rb   r4   r*   r*   r+   eos_id  rd   zAutoTokenizer.eos_idc                 C   s   |  t| dgd S )z
        Gets the ID of the end-of-document token (same as EOS token). Required for megatron-core compatibility.

        Returns:
            int: The ID of the EOD/EOS token.
        r   r   )rG   r1   r4   r*   r*   r+   eod  s   zAutoTokenizer.eodc                 C   ra   )z
        Gets the ID of the separator token.

        Returns:
            int or None: The ID of the separator token if it exists, None otherwise.
        r   Nr   rb   r4   r*   r*   r+   sep_id  rd   zAutoTokenizer.sep_idc                 C   ra   )z
        Gets the ID of the classifier token.

        Returns:
            int or None: The ID of the classifier token if it exists, None otherwise.
        r   Nr   rb   r4   r*   r*   r+   cls_id  rd   zAutoTokenizer.cls_idc                 C   ra   )z
        Gets the ID of the unknown token.

        Returns:
            int or None: The ID of the unknown token if it exists, None otherwise.
        r   Nr   rb   r4   r*   r*   r+   unk_id  rd   zAutoTokenizer.unk_idc                 C   ra   )z
        Gets the ID of the mask token.

        Returns:
            int or None: The ID of the mask token if it exists, None otherwise.
        r
   Nr   rb   r4   r*   r*   r+   mask_id  rd   zAutoTokenizer.mask_idc                 C   s   t | jjS )z
        Returns the name of the underlying HuggingFace tokenizer class.

        Returns:
            str: Name of the tokenizer class.
        )typer   __name__r4   r*   r*   r+   name  s   zAutoTokenizer.namesave_directoryfilename_prefixc                 C   s   | j j||dS )KSaves tokenizer's vocabulary and other artifacts to the specified directory)ro   rp   )r   save_vocabulary)r%   ro   rp   r*   r*   r+   rr     s   zAutoTokenizer.save_vocabularyc                 C   s   | j |S )rq   )r   save_pretrained)r%   ro   r*   r*   r+   rs     s   zAutoTokenizer.save_pretrained)NNFFN)T)N)%rm   
__module____qualname____doc__strr   r   boolr,   r   propertyr5   dictintr!   r?   r"   rF   r;   rG   rK   rM   rN   rU   r]   r`   rc   re   rf   rg   rh   ri   rj   rk   rn   rr   rs   r*   r*   r*   r+   r      s    		

 
&
	
	







	




	N)typingr   r   transformersr   r/   1nemo.collections.common.tokenizers.tokenizer_specr   
nemo.utilsr   __all__r*   r*   r*   r+   <module>   s   