import os.path
from dataclasses import MISSING, dataclass
from typing import Dict, List, Optional

from nemo.utils import logging

from .huggingface.huggingface_utils import get_huggingface_pretrained_lm_models_list

__all__ = ['get_tokenizer', 'get_tokenizer_list']


megatron_tokenizer_model_map = {
    'BertWordPieceLowerCase': 'megatron-bert-345m-uncased',
    'BertWordPieceCase': 'megatron-bert-345m-cased',
    'GPT2BPETokenizer': 'megatron-gpt-345m',
}


def get_tokenizer_list() -> List[str]:
    """
    Returns all supported tokenizer names
    """
    s = set(get_huggingface_pretrained_lm_models_list(include_external=False))
    s.update(set(get_huggingface_pretrained_lm_models_list(include_external=True)))
    return ["sentencepiece", "char", "word"] + list(s)


@dataclass
class TokenizerConfig:
    """
    Tokenizer Configuration Dataclass.
    """

    library: str = MISSING
    tokenizer_model: Optional[str] = None
    vocab_size: Optional[int] = None
    vocab_file: Optional[str] = None
    special_tokens: Optional[Dict[str, str]] = None
    bpe_dropout: Optional[float] = 0.0
    coverage: Optional[float] = 0.999
    training_sample_size: Optional[int] = None
    r2l: Optional[bool] = False
    sentencepiece_legacy: Optional[bool] = False
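
# A minimal usage sketch, assuming a SentencePiece model file exists at the placeholder path
# below; the values are illustrative, not defaults shipped with the module.
_EXAMPLE_SENTENCEPIECE_TOKENIZER_CONFIG = TokenizerConfig(
    library='sentencepiece',
    tokenizer_model='/path/to/tokenizer.model',  # placeholder path
    vocab_size=32000,  # placeholder vocab size
    sentencepiece_legacy=True,
)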
n|}
d| v rQzddlm	}m
}m} W n ttfy@   tdw |du rM|| }|| }|| } | dkrlddlm} td	t|  |||d
|dS | dkr~ddlm} |||d dS | dkrddlm} |d d|i|
S | dkrddlm} |d d|i|
S | dkrddlm} | j||dS td|  d| d| d|
 d| 
 ddlm} |d | ||d|
||d}|S )!a$  
    Args:
        tokenizer_name: sentencepiece or pretrained model from the hugging face list,
            for example: bert-base-cased
            To see the list of all HuggingFace pretrained models, use:
            nemo_nlp.modules.common.get_huggingface_pretrained_lm_models_list()
        tokenizer_model: tokenizer model file of sentencepiece
        special_tokens: dict of special tokens.
            For additional special tokens besides standard special tokens (bos, eos, pad, etc.), such as sentinel
            tokens for T5 (<extra_id_0>, <extra_id_1>, etc.), use key 'additional_special_tokens'
        vocab_file: path to vocab file
        use_fast: (only for HuggingFace AutoTokenizer) set to True to use fast HuggingFace tokenizer
        bpe_dropout: (experimental) BPE dropout tries to corrupt the standard segmentation
            procedure of BPE to help
            model better learn word compositionality and become robust to segmentation errors.
            It has empirically been shown to improve inference time BLEU scores.
    r   N	OmegaConfmegatron)get_megatron_merges_fileget_megatron_tokenizerget_megatron_vocab_filezMegatron-core was not found. Please see the NeMo README for installation instructions:  https://github.com/NVIDIA/NeMo#megatron-gpt.r   SentencePieceTokenizertokenizer_model: T
model_pathr   legacyr2   tiktokenTiktokenTokenizeradditional_special_tokens)r   r   r   )WordTokenizerr   r   )CharTokenizerregexRegExTokenizer
regex_filer   >Getting HuggingFace AutoTokenizer with pretrained_model_name: z, vocab_file: z,  merges_files: z, special_tokens_dict: z, and use_fast: AutoTokenizerpretrained_model_namer   r0   )r1   r2   r   )	omegaconfr4   
isinstance
listconfig
ListConfig
dictconfig
DictConfigto_container;nemo.collections.nlp.modules.common.megatron.megatron_utilsr6   r7   r8   ImportErrorModuleNotFoundError:nemo.collections.common.tokenizers.sentencepiece_tokenizerr:   r   infor*   5nemo.collections.common.tokenizers.tiktoken_tokenizerrA   1nemo.collections.common.tokenizers.word_tokenizerrC   1nemo.collections.common.tokenizers.char_tokenizerrD   2nemo.collections.common.tokenizers.regex_tokenizerrG   load_tokenizer=nemo.collections.common.tokenizers.huggingface.auto_tokenizerrL   )r/   r   r   r0   r   r1   r!   r2   rO   r4   special_tokens_dictr6   r7   r8   r:   rA   rC   rD   rG   rL   	tokenizerr   r   r   r
   ;   s   

r   r   
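
# A minimal usage sketch for get_tokenizer, assuming network access for the HuggingFace model id
# and a SentencePiece model file at the placeholder path; the helper name is illustrative only
# and is never called at import time.
def _example_get_tokenizer():
    # Names without a dedicated branch above fall through to the HuggingFace AutoTokenizer path,
    # e.g. any pretrained model id such as 'bert-base-cased'.
    hf_tokenizer = get_tokenizer(tokenizer_name='bert-base-cased', use_fast=True)

    # 'sentencepiece' routes to SentencePieceTokenizer and needs tokenizer_model to point at a
    # trained .model file.
    spm_tokenizer = get_tokenizer(
        tokenizer_name='sentencepiece',
        tokenizer_model='/path/to/tokenizer.model',  # placeholder path
    )
    return hf_tokenizer, spm_tokenizer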

def get_nmt_tokenizer(
    library: str = 'sentencepiece',
    model_name: Optional[str] = None,
    tokenizer_model: Optional[str] = None,
    vocab_file: Optional[str] = None,
    merges_file: Optional[str] = None,
    special_tokens: Optional[Dict[str, str]] = None,
    use_fast: Optional[bool] = False,
    bpe_dropout: Optional[float] = 0.0,
    r2l: Optional[bool] = False,
    legacy: Optional[bool] = False,
    delimiter: Optional[str] = None,
    trust_remote_code: Optional[bool] = False,
    chat_template: Optional[Dict] = None,
    vocab_size: Optional[int] = None,
):
    """
    Args:
        model_name: if using a pretrained model from NeMo, HuggingFace, or Megatron
        tokenizer_model: tokenizer model file of sentencepiece
        special_tokens: dict of special tokens
        vocab_file: path to vocab file
        use_fast: (only for HuggingFace AutoTokenizer) set to True to use fast HuggingFace tokenizer
        bpe_dropout: (experimental) BPE dropout tries to corrupt the standard segmentation procedure
            of BPE to help model better learn word compositionality and become robust to segmentation errors.
            It has empirically been shown to improve inference time BLEU scores.
        r2l: Whether to return subword IDs from right to left
    """
    import omegaconf
    from omegaconf import OmegaConf

    if isinstance(special_tokens, (omegaconf.listconfig.ListConfig, omegaconf.dictconfig.DictConfig)):
        special_tokens = OmegaConf.to_container(special_tokens)
    if special_tokens is None:
        special_tokens_dict = {}
    else:
        special_tokens_dict = special_tokens

    # Except for byte-level tokenization, either a model name or an existing tokenizer model file is required.
    if (library != 'byte-level') and (
        model_name is None and (tokenizer_model is None or not os.path.isfile(tokenizer_model))
    ):
        raise ValueError("No Tokenizer path provided or file does not exist!")

    if library == 'huggingface':
        from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer

        logging.info(f'Getting HuggingFace AutoTokenizer with pretrained_model_name: {model_name}')
        tokenizer = AutoTokenizer(
            pretrained_model_name=model_name,
            vocab_file=vocab_file,
            merges_file=merges_file,
            **special_tokens_dict,
            use_fast=use_fast,
            trust_remote_code=trust_remote_code,
        )
        if chat_template:
            tokenizer.tokenizer.chat_template = chat_template
        return tokenizer
    elif library == 'sentencepiece':
        from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer

        logging.info(f'Getting SentencePiece with model: {tokenizer_model}')
        return SentencePieceTokenizer(
            model_path=tokenizer_model,
            special_tokens=special_tokens,
            legacy=legacy,
            chat_template=chat_template,
        )
    elif library == 'byte-level':
        from nemo.collections.common.tokenizers.bytelevel_tokenizers import ByteLevelTokenizer

        logging.info('Using byte-level tokenization')
        return ByteLevelTokenizer(special_tokens_dict)
    elif library == 'regex':
        from nemo.collections.common.tokenizers.regex_tokenizer import RegExTokenizer

        logging.info('Using regex tokenization')
        return RegExTokenizer().load_tokenizer(regex_file=tokenizer_model, vocab_file=vocab_file)
    elif library == 'megatron':
        if model_name == 'GPTSentencePieceTokenizer':
            from nemo.collections.common.tokenizers.sentencepiece_tokenizer import GPTSentencePieceTokenizer

            logging.info("tokenizer_model: " + str(tokenizer_model))
            return GPTSentencePieceTokenizer(model_path=tokenizer_model, legacy=legacy)

        if model_name in megatron_tokenizer_model_map:
            model_name = megatron_tokenizer_model_map[model_name]
        logging.info(
            f'Getting Megatron tokenizer for pretrained model name: {model_name}, custom vocab file: {vocab_file}, '
            f'and merges file: {merges_file}'
        )
        return get_tokenizer(
            tokenizer_name=model_name,
            vocab_file=vocab_file,
            merges_file=merges_file,
            special_tokens=special_tokens_dict,
            chat_template=chat_template,
        )
    elif library == 'tabular':
        from nemo.collections.common.tokenizers.tabular_tokenizer import TabularTokenizer

        return TabularTokenizer(vocab_file, delimiter=delimiter)
    elif library == 'tiktoken':
        from nemo.collections.common.tokenizers.tiktoken_tokenizer import TiktokenTokenizer

        return TiktokenTokenizer(vocab_file=vocab_file)
    elif library == 'null':
        assert vocab_size is not None
        from nemo.collections.common.tokenizers.null_tokenizer import NullTokenizer

        return NullTokenizer(vocab_size)
    else:
        raise NotImplementedError(
            'Currently we only support "huggingface", "sentencepiece", "megatron", "byte-level", "regex", '
            '"tabular", "tiktoken", and "null" tokenizer libraries.'
        )
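
# A minimal usage sketch for get_nmt_tokenizer, assuming the placeholder model id can be
# downloaded and the placeholder model path exists; guarded so nothing runs on import.
if __name__ == "__main__":
    # HuggingFace route: resolves through the AutoTokenizer wrapper for any pretrained model id.
    hf_tok = get_nmt_tokenizer(library='huggingface', model_name='gpt2', use_fast=True)
    print(type(hf_tok).__name__)

    # SentencePiece route: requires a trained .model file on disk.
    spm_tok = get_nmt_tokenizer(
        library='sentencepiece',
        tokenizer_model='/path/to/tokenizer.model',  # placeholder path
        legacy=False,
    )
    print(type(spm_tok).__name__)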