o
    wi_                  '   @   s  d dl Z d dlZd dlmZmZmZmZ d dlZd dl	Z	d dl
Z
d dlmZ d dlmZ d dlmZ d dlmZ ddgZG d	d deeZ	
														d#dedededededee dededededededee dee dededed ed!ef&d"dZdS )$    N)DictListOptionalUnion)if_exist)ChatTemplateMixin)TokenizerSpec)loggingSentencePieceTokenizercreate_spt_modelc                   @   sH  e Zd ZdZ						d6dedeeeeef ee f  de	d	e	d
ee f
ddZ
dd Zd7ddZd7ddZd7ddZdd Zdd Zdd Zdd Zg fdeeee f dee deeee f fd d!Zd"d# Zed$d% Zed&d' Zed(d) Zed*d+ Zed,d- Zed.d/ Zed0d1 Zed2d3 Zed4d5 ZdS )8r
   a  Sentencepiecetokenizer https://github.com/google/sentencepiece.

    Args:
        model_path: path to sentence piece tokenizer model. To create the model use create_spt_model()
        special_tokens: either list of special tokens or dictionary of token name to token value
        legacy: when set to True, the previous behavior of the SentecePiece wrapper will be restored,
            including the possibility to add special tokens inside wrapper.
        ignore_extra_whitespaces: whether to ignore extra whitespaces in the input text while encoding.
            Note:
            This is done for the current models tokenizers that don't handle extra whitespaces as by default tokenizer
            learned to ignore it. To check if the tokenizer by default ignores extra whitespaces refer to
            `self.removed_extra_spaces` attribute of the tokenizer. We added a parameter to process_asr_tokenizer.py
            for upcoming models to handle it inbuilt.
    NFT   ▁
model_pathspecial_tokenslegacyignore_extra_whitespaceschat_templatec                 C   s   || _ |rtj|std| dt | _| j| | j	 | _
| j	 | _|| _|| _d| _i | _i | _|| _|| _| j|| _|rT| jsOtd| | | jd| jdk| _| d| d| d k| _d S )	Nzmodel_path: z is invalidu   ☯z^Special tokens must be None when legacy is set to False. Provide special tokens at train time.zx  yzx yxy)r   ospathexists
ValueErrorsentencepieceSentencePieceProcessor	tokenizerLoadget_piece_sizeoriginal_vocab_size
vocab_sizer   r   extra_space_tokenspecial_token_to_idid_to_special_token&trim_spm_separator_after_special_tokenspm_separatorpiece_to_idspm_separator_idadd_special_tokensencode_as_piecesremoved_extra_spacestext_to_tokensspace_sensitive)selfr   r   r   r   r   r"   r#    r,   w/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py__init__/   s.   


$zSentencePieceTokenizer.__init__c           	   	      s`   j r jstdd j d|} jrg }d}	 i } jD ]}z||d |||< W q ty7   Y qw t	|dkr?nJt
||jd}|||  } j||| } jrxt	|dkrx|d  jv rxt	|dkrx|d  jkrx|d || || |t	| }q| j||d  n j|} j r jstt fdd	|}|S )
zConverts input text to a list of tokens.

        If legacy mode is enabled, handles special tokens separately.

        Args:
            text: The input string to tokenize.

        Returns:
            A list of string tokens.
        (?<= )(?= )|^ | $ r      Nkeyc                    s
   |  j kS N)r   )r   r+   r,   r-   <lambda>   s   
 z7SentencePieceTokenizer.text_to_tokens.<locals>.<lambda>)r(   r   resubr   r   r    indexr   lenmingetr   r'   r"   r#   popextendappendlistfilter)	r+   texttokenscur_idx
st_indicestokennext_special_tokennext_start_idxtext_tokensr,   r6   r-   r)   U   sD   



"z%SentencePieceTokenizer.text_to_tokensc                 C   s<   t |tr| ||S t |tr| |S tdt| )al  Converts input text to a list of token IDs.

        Handles chat formatting or raw string tokenization depending on input type.

        Args:
            text: A string or list representing chat template inputs.
            sample_alpha: Optional float to enable subword sampling for data augmentation.

        Returns:
            A list of token IDs.
        z+Expected either str or list input, but got )
isinstancestr_text_to_idsrA   apply_chat_templater   type)r+   rC   sample_alphar,   r,   r-   text_to_ids   s
   


z"SentencePieceTokenizer.text_to_idsc           
   	   C   s  | j r| jstdd| j d| }| jrg }d}	 i }| jD ]}z||d |||< W q! t	y9   Y q!w t
|dkrAnMt||jd}|||  }| j||| }	| jrzt
|dkrz|d | jv rzt
|	dkrz|	d | jkrz|	d ||	 || j|  |t
| }q| j r| js|| ||d  |S || j||d  |S | j r| js| ||S |dur| jj|d|dd	S | j|S )
a  Internal method to convert text to token IDs, handling optional sampling and special token logic.

        Args:
            text: Input string.
            sample_alpha: Optional alpha value for stochastic subword sampling.

        Returns:
            A list of token IDs.
        r/   r0   r   r1   Nr2   r4   Tenable_samplingalpha
nbest_size)r(   r   r8   r9   r   rstripr   r    r:   r   r;   r<   r=   r   encoder"   r!   r%   r>   r?   r@   _text_to_ids_extra_spaceencode_as_ids)
r+   rC   rP   idsrE   rF   rG   rH   rI   rJ   r,   r,   r-   rM      sN   



"z#SentencePieceTokenizer._text_to_idsc                 C   sh   g }i }|durd|dd}| | jD ]}|sq|| j7 }| jj|fi |}||dd  q|S )a  Tokenizes text while preserving extra space tokens for legacy mode.

        Args:
            text: Input string.
            sample_alpha: Optional alpha value for subword sampling.

        Returns:
            A list of token IDs with preserved extra space markers.
        NTr4   rR   )splitr   r   rY   r?   )r+   rC   rP   rZ   encoding_kwargspartpart_idsr,   r,   r-   rX      s   

z/SentencePieceTokenizer._text_to_ids_extra_spacec                 C   s&   t |tjtjfr| }| j|S )zConverts a list of tokens back to the corresponding string.

        Args:
            tokens: A list of string tokens or a tensor/array of token IDs.

        Returns:
            The decoded string.
        )rK   npndarraytorchTensortolistr   decode_pieces)r+   rD   r,   r,   r-   tokens_to_text   s   	z%SentencePieceTokenizer.tokens_to_textc                 C   s   t |tjtjfr| }| jrMd}d}t|D ]$\}}|| jv r<|| j	
||| d 7 }|| j| d 7 }|d }q|| j	
||d 7 }| S | j	
|S )zDecodes a list of token IDs into a string, handling special tokens if in legacy mode.

        Args:
            ids: A list or tensor/array of token IDs.

        Returns:
            The decoded string.
         r   r0   r1   N)rK   r_   r`   ra   rb   rc   r   	enumerater!   r   
decode_idsstrip)r+   rZ   rC   last_iiidr,   r,   r-   ids_to_text
  s   	
z"SentencePieceTokenizer.ids_to_textc                 C   s&   | j r|| jv r| j| S | j|S )zGets the ID corresponding to a token.

        Args:
            token: Token string.

        Returns:
            Token ID as an integer.
        )r   r    r   r$   )r+   rG   r,   r,   r-   token_to_id%  s   	
z"SentencePieceTokenizer.token_to_idc                 C   sZ   t |tjtjfr| }g }|D ]}|| jkr!|| j|  q|| j	
| q|S )zConverts a list of token IDs into corresponding token strings.

        Args:
            ids: A list or array/tensor of token IDs.

        Returns:
            List of string tokens.
        )rK   r_   r`   ra   rb   rc   r   r@   r!   r   id_to_piece)r+   rZ   rD   rl   r,   r,   r-   ids_to_tokens3  s   	
z$SentencePieceTokenizer.ids_to_tokensrD   tokens_to_skipreturnc                 C   s:   t |tr|g}g }|D ]}||vr|| | q|S )a&  Converts one or more tokens into their respective IDs, skipping any specified tokens.

        Args:
            tokens: A string or list of token strings.
            tokens_to_skip: List of tokens to ignore during conversion.

        Returns:
            A single ID or list of IDs.
        )rK   rL   r@   rn   )r+   rD   rq   rZ   rG   r,   r,   r-   tokens_to_idsF  s   

z$SentencePieceTokenizer.tokens_to_idsc                 C   sh  | j stdt|trS|D ]B}| j|| j kr4|| jvr4| j| j|< || j	| j< |  jd7  _q| j|| j krP| j|| j|< || j	| j| < qdS t|t
r| D ]J\}}t| || | j|| j kr|| jvr| j| j|< || j	| j< |  jd7  _q\| j|| j kr| j|| j|< || j	| j| < q\dS tdtt| )a-  Adds new special tokens to the tokenizer's vocabulary (only if legacy=True).

        Args:
            special_tokens: List or dict of special tokens to add.

        Raises:
            AttributeError: If not in legacy mode.
            ValueError: If the input is not a list or dictionary.
        zASpecial Token addition does not work when legacy is set to False.r1   z/Expected special_tokens to be a list or a dict N)r   AttributeErrorrK   rA   r   r$   unk_idr    r   r!   dictitemssetattrr   rL   rO   )r+   r   rG   
token_namer,   r,   r-   r&   X  s8   




z)SentencePieceTokenizer.add_special_tokensc                 C   *   | j r| | jgd }|S | j }|S )z%Returns the ID for the padding token.r   )r   rs   	pad_tokenr   pad_id)r+   r|   r,   r,   r-   r|     
   
zSentencePieceTokenizer.pad_idc                 C   rz   )z3Returns the ID for the beginning-of-sequence token.r   )r   rs   	bos_tokenr   bos_id)r+   r   r,   r,   r-   r     r}   zSentencePieceTokenizer.bos_idc                 C   rz   )z-Returns the ID for the end-of-sequence token.r   )r   rs   	eos_tokenr   eos_id)r+   r   r,   r,   r-   r     r}   zSentencePieceTokenizer.eos_idc                 C       | j r| | jgd S td)z=Returns the ID for the separator token (only in legacy mode).r   VUse function token_to_id to retrieve special tokens other than unk, pad, bos, and eos.)r   rs   	sep_token	NameErrorr6   r,   r,   r-   sep_id     zSentencePieceTokenizer.sep_idc                 C   r   )zBReturns the ID for the classification token (only in legacy mode).r   r   )r   rs   	cls_tokenr   r6   r,   r,   r-   cls_id  r   zSentencePieceTokenizer.cls_idc                 C   r   )z8Returns the ID for the mask token (only in legacy mode).r   r   )r   rs   
mask_tokenr   r6   r,   r,   r-   mask_id  r   zSentencePieceTokenizer.mask_idc                 C   s
   | j  S )z%Returns the ID for the unknown token.)r   ru   r6   r,   r,   r-   ru     s   
zSentencePieceTokenizer.unk_idc                    s8   t | j| j| j| j| j| jg  fdd| j D S )zReturns a list of the additional special tokens (excluding bos, eos, pad, unk).

        Used to return sentinel tokens for e.g. T5.
        c                    s   g | ]
\}}| vr|qS r,   r,   ).0kvr   r,   r-   
<listcomp>  s    zHSentencePieceTokenizer.additional_special_tokens_ids.<locals>.<listcomp>)	setr~   r   r{   r   r   r   r    rw   r6   r,   r   r-   additional_special_tokens_ids  s   z4SentencePieceTokenizer.additional_special_tokens_idsc                    sB    fddt  j D } fddt  j j D }|| S )zHReturns the combined vocabulary list, including base and special tokens.c                    s   g | ]} j |qS r,   )r   ro   )r   rl   r6   r,   r-   r     s    z0SentencePieceTokenizer.vocab.<locals>.<listcomp>c                    s   g | ]
} j  j|  qS r,   )r!   r   )r   rk   r6   r,   r-   r     s    )ranger   r   r   r   )r+   
main_vocabr   r,   r6   r-   vocab  s
   
zSentencePieceTokenizer.vocab)NFTNTr   r5   ) __name__
__module____qualname____doc__rL   r   r   r   r   boolr.   r)   rQ   rM   rX   re   rm   rn   rp   intrs   r&   propertyr|   r   r   r   r   r   ru   r   r   r,   r,   r,   r-   r
      s^    
&
<

A6*








unigram      ?Fr4   T	data_filer   sample_sizedo_lower_casetokenizer_type
output_dircharacter_coveragetrain_extremely_large_corpusmax_sentencepiece_lengthboseospadcontrol_symbolsuser_defined_symbolsbyte_fallbacksplit_digitssplit_by_whitespacesplit_by_unicode_scriptremove_extra_whitespacesc               
   C   s  | rt j| std|  t j| }g }g d}|s"| d}t|dgr;td| d | d| dfS td	|  d
|  t j|dd d|  d| d| d| d| 
}d}|	sj|d8 }|d7 }|
st|d8 }|d7 }|r}|d| 7 }|rd	|}|d| 7 }||7 }|rd	|}|d| 7 }||7 }|r|d7 }|dkr|d| 7 }|r|d7 }|dkr|d| 7 }|r|d7 }|r|d 7 }|s|d!7 }|s|d"7 }|s|d#7 }t
j| g }t| d$d%d&d'?}|D ]4}|d(d }||v rq|d)r|dd* nd+| }t|dkr#|| q||d  qW d*   n	1 s6w   Y  || | d}t|d,d&d'}|D ]}|| d- qOW d*   n	1 sfw   Y  | d|fS ).u  Creates sentence piece tokenizer model from data file.

    Args:
        data_file: data file
        vocab_size: vocabulary size
        sample_size: maximum size of sentences the trainer loads
        do_lower_case: if text should be lower cased before tokenizer model is created
        character_coverage: float value between 0 and 1 (as a percentage). For languages with a vast charset,
            can be < 1.0, but for all other languages, it should be set as 1.0
        output_dir: folder to save created tokenizer model. If not specified will store at data_file/../spt folder
        train_extremely_large_corpus: If training on huge datasets, pass this flag to allow SentencePiece
            to build the tokenizer.
        max_sentencepiece_length: Limits the maximum length of the SentencePiece subword that can be constructed.
            By default, no limit is placed.
        bos: when True, bos token "<s>" is added to the vocabulary.
        eos: when True, eos token "</s>" is added to the vocabulary.
        pad: when True, pad token "<pad>" is added to the vocabulary.
        control_symbols: control symbols to add to tokenizer, as defined by sentencepiece.
            These tokens get removed at decode time and are not encoded from the text - can only be added to the input
            programatically.
        user_defined_symbols: user symbols to add to tokenizer, as defined by sentencepiece.
            These tokens remain in the decoded text and are encoded automatically when present in the input text.
        byte_fallback: If <unk>, fallback to a byte sequence of the character.
        split_digits: If true, digits are split into individual tokens.
        split_by_whitespace: Whether to respect white space while creating subwords.
            If False, will learn merges across whitespace.
        split_by_unicode_script: Whether to include multiple Unicode scripts.
            Ex. is Arabic diacritics which are considered part of the letter (عِدَّةُ).
        remove_extra_whitespaces: Whether to remove leading, trailing, and duplicate internal whitespace.
            If true, will skip double spaces during encoding.
    z+data_file must be valid file path, but got )z<s>z</s>z<pad>z<unk>z/sptztokenizer.modelztokenizer model z/tokenizer.model already existsz/tokenizer.modelz
/vocab.txtzProcessing z and store at T)exist_okz--input=z --model_prefix=z/tokenizer --vocab_size=zE --shuffle_input_sentence=true --hard_vocab_limit=false --model_type=z --character_coverage=   r1   z --bos_id=-1z --eos_id=-1z
 --pad_id=,z --control_symbols=z --user_defined_symbols=z& --normalization_rule_name=nmt_nfkc_cfr   z --input_sentence_size=z$ --train_extremely_large_corpus=truez --max_sentencepiece_length=z --byte_fallback=truez --split_digits=truez --split_by_whitespace=falsez  --split_by_unicode_script=falsez! --remove_extra_whitespaces=falsez/tokenizer.vocabrutf8)encoding	r   Nz##w
)r   r   r   r   dirnamer   r	   infomakedirsjoinr   SentencePieceTrainerTrainopenr[   
startswithr;   r@   r?   write) r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   data_dirr   r   cmdr|   control_stringuser_stringrD   flinepiecerG   
vocab_filer,   r,   r-   r     s   5



"

)r   Nr   Fr4   FFFNNFFTTF)r   r8   typingr   r   r   r   numpyr_   r   ra   #nemo.collections.common.parts.utilsr   6nemo.collections.common.tokenizers.chat_template_mixinr   1nemo.collections.common.tokenizers.tokenizer_specr   
nemo.utilsr	   __all__r
   rL   r   r   floatr   r,   r,   r,   r-   <module>   s      9	
