o
    c۷i%                     @   s  d dl Z d dlZd dlZd dlmZ d dlmZ d dlZd dl	m
Z
 d dlmZmZ d dlmZmZmZ d dlmZmZ d dlmZ ejd	ed
d ejd	edd e r[d dlmZ deeB defddZddeeB dedefddZdeeB dedB fddZG dd deZ dS )    N)cached_property)Path)TokenizerException)assert_sentencepiece_installedis_sentencepiece_installed)SpecialTokenPolicy	TokenizerTokenizerVersion)ImageConfigMultiModalVersion)ModelSettingsBuilderoncez%.*`get_control_token` is deprecated.*)actioncategorymessagez#.*`_control_tokens` is deprecated.*)SentencePieceProcessorpathreturnc                    sb   t tr	tttj}ttjdg   fdd|D dg } o0tfdd|D S )z1Check if the given path is a SentencePiece model. c                    s$   g | ]} D ]	}d | | qqS )z.model. ).0vm)mm_versionsr   d/home/ubuntu/vllm_env/lib/python3.10/site-packages/mistral_common/tokens/tokenizers/sentencepiece.py
<listcomp>*   s   $ z$is_sentencepiece.<locals>.<listcomp>z.modelc                 3   s    | ]	} j |V  qd S N)nameendswith)r   suffix)r   r   r   	<genexpr>,   s    z#is_sentencepiece.<locals>.<genexpr>)	
isinstancestrr   listr	   __members__r   is_fileany)r   instruct_versionssuffixesr   )r   r   r   is_sentencepiece#   s   

r)   Ftokenizer_filenameraise_deprecatedc                 C   sp   t | } | dd }|dkr|dd }|dkr(|r$td|  dtdS |tjvr4td	|  t|S )
z3Get the version of the tokenizer from the filename..modelr   r   z4Make sure to rename your tokenizer file to end with z.v1.v1!Unrecognized tokenizer filename: )r"   splitr   r	   r$   )r*   r+   _version_strr   r   r   get_spm_version/   s   
r3   c                 C   s^   t | } | dd }|dksd|vrdS d|dd  }|tjvr*td|  t|jS )z1Get the image config from the tokenizer filename.r,   r-   r.   r   Nr0   )r"   r1   r   r$   r   config)r*   r2   _mm_version_strr   r   r   get_image_configD   s   

r6   c                	       s  e Zd ZdZd7deeB dedB ddf fddZedefdd	Z	edefd
dZ
ededB fddZdedefddZdedefddZedefddZedefddZdee fddZedefddZedefddZdeejB eB defddZedee fd d!Zedee fd"d#Zded$ed%edee fd&d'Zej fd(ee d)edefd*d+Z!d,edefd-d.Z"d(ee d)edefd/d0Z#d(ee defd1d2Z$edefd3d4Z%edefd5d6Z&  Z'S )8SentencePieceTokenizerzC[SentencePiece](https://github.com/google/sentencepiece) tokenizer.N
model_pathtokenizer_versionr   c                    s   t   t jj _tj|sJ |t	t
|tr|n| d _ j  j ks0J  fddt jD  _|pDt|dd _t| _t   dS )zInitialize the `SentencePieceTokenizer`.

        Args:
            model_path: The path to the `SentencePiece` model.
            tokenizer_version: The version of the tokenizer. If not provided, it will be inferred from the model path.
        )
model_filec                    s   g | ]} j |qS r   _modelid_to_piece)r   iselfr   r   r   h   s    z3SentencePieceTokenizer.__init__.<locals>.<listcomp>F)r+   N)r   logging	getLogger	__class____name___loggerosr   isfiler   r!   r"   as_posixr<   
vocab_sizeget_piece_sizerangen_words_vocabr3   _versionr   
_file_pathsuper__init__)r@   r8   r9   rC   r?   r   rQ   W   s   
zSentencePieceTokenizer.__init__c                 C      | j S )z The path to the tokenizer model.)rO   r?   r   r   r   	file_patho      z SentencePieceTokenizer.file_pathc                 C   rS   )zThe version of the tokenizer.)rN   r?   r   r   r   versiont   rU   zSentencePieceTokenizer.versionc                 C   s   | j jrtd| j  dS )zOAlways returns None as SentencePiece does not support `model_settings_builder`.zCSentencePieceTokenizer does not support model settings for version N)rV   supports_model_settings
ValueErrorr?   r   r   r   model_settings_buildery   s   z-SentencePieceTokenizer.model_settings_buildersc                 C      | j |S )z+Get the special token for the given string.)r<   piece_to_idr@   rZ   r   r   r   get_special_token      z(SentencePieceTokenizer.get_special_tokenc                 C   s   t dt | |S )NzC`get_control_token` is deprecated. Use `get_special_token` instead.)warningswarnFutureWarningr^   r]   r   r   r   get_control_token   s   
z(SentencePieceTokenizer.get_control_tokenc                 C   
   | j  S )z!Vocabulary size of the tokenizer.)r<   rI   r?   r   r   r   rL         
zSentencePieceTokenizer.n_wordsc                 C   s
   t | jS )z.The number of special tokens of the tokenizer.)lenspecial_idsr?   r   r   r   num_special_tokens   re   z)SentencePieceTokenizer.num_special_tokensc                 C   rS   )z,Get all tokens in the vocabulary as strings.)rM   r?   r   r   r   vocab   s   zSentencePieceTokenizer.vocabc                 C   rd   )z#The beginning of sentence token id.)r<   bos_idr?   r   r   r   rj      re   zSentencePieceTokenizer.bos_idc                 C   rd   )zThe end of sentence token id.)r<   eos_idr?   r   r   r   rk      re   zSentencePieceTokenizer.eos_idtokenc                 C   sV   t |ttjfr| jt|S t |tr!| j|}| j|S tdt	|j
 )z7Return `True` if the passed `token` is a special token.zExpected int or str, got )r!   intnpintegerr<   	IsControlr"   r\   	TypeErrortyperD   )r@   rl   	token_intr   r   r   
is_special   s   
z!SentencePieceTokenizer.is_specialc                 C   s   t dt | jS )NzS`_control_tokens` is deprecated. Make use of `is_special` or `special_ids` instead.)r`   ra   rb   rg   r?   r   r   r   _control_tokens   s   z&SentencePieceTokenizer._control_tokensc                    s    fddt  jD S )zIds of the special tokens.c                    s   h | ]
} j |r|qS r   )r<   rp   r   tokr?   r   r   	<setcomp>   s    z5SentencePieceTokenizer.special_ids.<locals>.<setcomp>)rK   rL   r?   r   r?   r   rg      s   z"SentencePieceTokenizer.special_idsboseosc                 C   s@   t |tsJ | j|}|r| jg|}|rg || j}|S )a  Encode the given string into a list of token ids.

        Args:
            s: The string to encode.
            bos: Whether to add the beginning of sentence token.
            eos: Whether to add the end of sentence token.

        Returns:
            The list of token ids.
        )r!   r"   r<   encoderj   rk   )r@   rZ   ry   rz   tr   r   r   r{      s   zSentencePieceTokenizer.encodetokensspecial_token_policyc                 C   sJ   t |ttfstdt| d|tjtjfv r| ||S | j	|S )a  Decode the given list of token ids into a string.

        Note:
            Using `special_token_policy=SpecialTokenPolicy.KEEP` will keep the special tokens and the normal tokens as
            SentencePiece pieces.

        Args:
            tokens: The list of token ids.
            special_token_policy: The policy to use for special tokens.

        Returns:
            The decoded string.
        z@Expected `special_token_policy` to be a SpecialTokenPolicy, got r,   )
r!   r"   r   rX   rr   KEEPRAISE_decode_with_special_tokensr<   decode)r@   r}   r~   r   r   r   r      s   zSentencePieceTokenizer.decodetoken_idc                 C   r[   )z,Convert the given token id to a token piece.r;   )r@   r   r   r   r   r=      r_   z"SentencePieceTokenizer.id_to_piecec                    s   g }g }|D ].}  |r/|tjkrtd|r&| fdd|D  g }| | q|| q|rC| fdd|D  d|S )NzNDecoding `tokens` that contain special tokens with special_token_policy=RAISE.c                       g | ]}  |qS r   r=   rv   r?   r   r   r          zFSentencePieceTokenizer._decode_with_special_tokens.<locals>.<listcomp>c                    r   r   r   rv   r?   r   r   r      r   r   )rt   r   r   rX   extendappendr=   join)r@   r}   r~   	text_listcurr_tokensrw   r   r?   r   r      s   


z2SentencePieceTokenizer._decode_with_special_tokensc                 C   s   | j |tjdS )N)r~   )r   r   r   )r@   r}   r   r   r   
_to_string   s   z!SentencePieceTokenizer._to_stringc                 C   rd   )zThe padding token id.)r<   pad_idr?   r   r   r   r      re   zSentencePieceTokenizer.pad_idc                 C   rd   )zThe unknown token id.)r<   unk_idr?   r   r   r   r     re   zSentencePieceTokenizer.unk_idr   )(rD   
__module____qualname____doc__r"   r   r	   rQ   propertyrT   rV   r   rY   rm   r^   rc   rL   rh   r#   ri   r   rj   rk   rn   ro   boolrt   setru   rg   r{   r   IGNOREr   r=   r   r   r   r   __classcell__r   r   rR   r   r7   T   sD    $
 r7   )F)!rA   rF   r`   	functoolsr   pathlibr   numpyrn   mistral_common.exceptionsr   mistral_common.importsr   r   %mistral_common.tokens.tokenizers.baser   r   r	   &mistral_common.tokens.tokenizers.imager
   r   7mistral_common.tokens.tokenizers.model_settings_builderr   filterwarningsrb   sentencepiecer   r"   r   r)   r3   r6   r7   r   r   r   r   <module>   s6    