o
    *i)                     @   s
  d dl Z d dlZd dlZd dlmZ d dlmZ d dlZd dl	m
Z
 d dlmZmZ d dlmZmZmZ d dlmZmZ ejded	d
 ejdedd
 e rUd dlmZ deeB defddZddeeB dedefddZdeeB dedB fddZG dd deZdS )    N)cached_property)Path)TokenizerException)assert_sentencepiece_installedis_sentencepiece_installed)SpecialTokenPolicy	TokenizerTokenizerVersion)ImageConfigMultiModalVersiononcez%.*`get_control_token` is deprecated.*)actioncategorymessagez#.*`_control_tokens` is deprecated.*)SentencePieceProcessorpathreturnc                    sb   t tr	tttj}ttjdg   fdd|D dg } o0tfdd|D S )z1Check if the given path is a SentencePiece model. c                    s$   g | ]} D ]	}d | | qqS )z.model. ).0vm)mm_versionsr   k/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/mistral_common/tokens/tokenizers/sentencepiece.py
<listcomp>)   s   $ z$is_sentencepiece.<locals>.<listcomp>z.modelc                 3   s    | ]	} j |V  qd S N)nameendswith)r   suffix)r   r   r   	<genexpr>+   s    z#is_sentencepiece.<locals>.<genexpr>)	
isinstancestrr   listr	   __members__r   is_fileany)r   instruct_versionssuffixesr   )r   r   r   is_sentencepiece"   s   

r(   Ftokenizer_filenameraise_deprecatedc                 C   sp   t | } | dd }|dkr|dd }|dkr(|r$td|  dtdS |tjvr4td	|  t|S )
z3Get the version of the tokenizer from the filename..modelr   r   z4Make sure to rename your tokenizer file to end with z.v1.v1!Unrecognized tokenizer filename: )r!   splitr   r	   r#   )r)   r*   _version_strr   r   r   get_spm_version.   s   
r2   c                 C   s^   t | } | dd }|dksd|vrdS d|dd  }|tjvr*td|  t|jS )z1Get the image config from the tokenizer filename.r+   r,   r-   r   Nr/   )r!   r0   r   r#   r   config)r)   r1   _mm_version_strr   r   r   get_image_configC   s   

r5   c                	       s  e Zd ZdZd7deeB dedB ddf fddZedefdd	Z	edefd
dZ
dedefddZdedefddZedefddZedefddZdee fddZedefddZedefddZdeejB eB defddZedee fddZedee fd d!Zded"ed#edee fd$d%Zd7d&ee d'edB defd(d)Zd*edefd+d,Zd&ee d'edefd-d.Z d&ee defd/d0Z!d&ee defd1d2Z"edefd3d4Z#edefd5d6Z$  Z%S )8SentencePieceTokenizerzC[SentencePiece](https://github.com/google/sentencepiece) tokenizer.N
model_pathtokenizer_versionr   c                    s   t   t jj _tj|sJ |t	t
|tr|n| d _ j  j ks0J  fddt jD  _|pDt|dd _t| _t   dS )zInitialize the `SentencePieceTokenizer`.

        Args:
            model_path: The path to the `SentencePiece` model.
            tokenizer_version: The version of the tokenizer. If not provided, it will be inferred from the model path.
        )
model_filec                    s   g | ]} j |qS r   _modelid_to_piece)r   iselfr   r   r   g   s    z3SentencePieceTokenizer.__init__.<locals>.<listcomp>F)r*   N)r   logging	getLogger	__class____name___loggerosr   isfiler   r    r!   as_posixr;   
vocab_sizeget_piece_sizerangen_words_vocabr2   _versionr   
_file_pathsuper__init__)r?   r7   r8   rB   r>   r   rP   V   s   
zSentencePieceTokenizer.__init__c                 C      | j S )z The path to the tokenizer model.)rN   r>   r   r   r   	file_pathn      z SentencePieceTokenizer.file_pathc                 C   rR   )zThe version of the tokenizer.)rM   r>   r   r   r   versions   rT   zSentencePieceTokenizer.versionsc                 C      | j |S )z+Get the special token for the given string.)r;   piece_to_idr?   rV   r   r   r   get_special_tokenx      z(SentencePieceTokenizer.get_special_tokenc                 C      t dt | |S )NzC`get_control_token` is deprecated. Use `get_special_token` instead.)warningswarnFutureWarningrZ   rY   r   r   r   get_control_token|   s   
z(SentencePieceTokenizer.get_control_tokenc                 C   
   | j  S )z!Vocabulary size of the tokenizer.)r;   rH   r>   r   r   r   rK         
zSentencePieceTokenizer.n_wordsc                 C   s
   t | jS )z.The number of special tokens of the tokenizer.)lenspecial_idsr>   r   r   r   num_special_tokens   rb   z)SentencePieceTokenizer.num_special_tokensc                 C   rR   )z,Get all tokens in the vocabulary as strings.)rL   r>   r   r   r   vocab   s   zSentencePieceTokenizer.vocabc                 C   ra   )z#The beginning of sentence token id.)r;   bos_idr>   r   r   r   rg      rb   zSentencePieceTokenizer.bos_idc                 C   ra   )zThe end of sentence token id.)r;   eos_idr>   r   r   r   rh      rb   zSentencePieceTokenizer.eos_idtokenc                 C   sV   t |ttjfr| jt|S t |tr!| j|}| j|S tdt	|j
 )z7Return `True` if the passed `token` is a special token.zExpected int or str, got )r    intnpintegerr;   	IsControlr!   rX   	TypeErrortyperC   )r?   ri   	token_intr   r   r   
is_special   s   
z!SentencePieceTokenizer.is_specialc                 C   s   t dt | jS )NzS`_control_tokens` is deprecated. Make use of `is_special` or `special_ids` instead.)r]   r^   r_   rd   r>   r   r   r   _control_tokens   s   z&SentencePieceTokenizer._control_tokensc                    s    fddt  jD S )zIds of the special tokens.c                    s   h | ]
} j |r|qS r   )r;   rm   r   tokr>   r   r   	<setcomp>   s    z5SentencePieceTokenizer.special_ids.<locals>.<setcomp>)rJ   rK   r>   r   r>   r   rd      s   z"SentencePieceTokenizer.special_idsboseosc                 C   s@   t |tsJ | j|}|r| jg|}|rg || j}|S )a  Encode the given string into a list of token ids.

        Args:
            s: The string to encode.
            bos: Whether to add the beginning of sentence token.
            eos: Whether to add the end of sentence token.

        Returns:
            The list of token ids.
        )r    r!   r;   encoderg   rh   )r?   rV   rv   rw   tr   r   r   rx      s   zSentencePieceTokenizer.encodetokensspecial_token_policyc                 C   sl   |durt |ttfstdt| d|du r"tdt tj}|tj	tj
fv r0| ||S | j|S )aa  Decode the given list of token ids into a string.

        Note:
            Using `special_token_policy=SpecialTokenPolicy.KEEP` will keep the special tokens and the normal tokens as
            SentencePiece pieces.

        Args:
            tokens: The list of token ids.
            special_token_policy: The policy to use for special tokens. If `None`, the default policy
                is `SpecialTokenPolicy.IGNORE`.  Passing `None` is deprecated and will be changed
                to `SpecialTokenPolicy.IGNORE` in `mistral_common=1.10.0`.

        Returns:
            The decoded string.
        NzFExpected `special_token_policy` to be None or SpecialTokenPolicy, got r+   zUsing the tokenizer's special token policy `None` is deprecated. It will be removed in 1.10.0. Please pass a special token policy explicitly. Future default will be SpecialTokenPolicy.IGNORE.)r    r!   r   
ValueErrorro   r]   r^   r_   IGNOREKEEPRAISE_decode_with_special_tokensr;   decode)r?   rz   r{   r   r   r   r      s   	zSentencePieceTokenizer.decodetoken_idc                 C   rW   )z,Convert the given token id to a token piece.r:   )r?   r   r   r   r   r<      r[   z"SentencePieceTokenizer.id_to_piecec                    s   g }g }|D ].}  |r/|tjkrtd|r&| fdd|D  g }| | q|| q|rC| fdd|D  d|S )NzNDecoding `tokens` that contain special tokens with special_token_policy=RAISE.c                       g | ]}  |qS r   r<   rs   r>   r   r   r          zFSentencePieceTokenizer._decode_with_special_tokens.<locals>.<listcomp>c                    r   r   r   rs   r>   r   r   r      r   r   )rq   r   r   r|   extendappendr<   join)r?   rz   r{   	text_listcurr_tokensrt   r   r>   r   r      s   


z2SentencePieceTokenizer._decode_with_special_tokensc                 C   r\   )z[DEPRECATED] Converts a list of token ids into a string, keeping special tokens.

        Use `decode` with `special_token_policy=SpecialTokenPolicy.KEEP` instead.

        This is a convenient method for debugging.
        z`to_string` is deprecated and will be removed in 1.10.0. Use `decode` with `special_token_policy=SpecialTokenPolicy.KEEP` instead.)r]   r^   r_   
_to_stringr?   rz   r   r   r   	to_string   s
   
z SentencePieceTokenizer.to_stringc                 C   s   | j |tjdS )N)r{   )r   r   r~   r   r   r   r   r     s   z!SentencePieceTokenizer._to_stringc                 C   ra   )zThe padding token id.)r;   pad_idr>   r   r   r   r     rb   zSentencePieceTokenizer.pad_idc                 C   ra   )zThe unknown token id.)r;   unk_idr>   r   r   r   r     rb   zSentencePieceTokenizer.unk_idr   )&rC   
__module____qualname____doc__r!   r   r	   rP   propertyrS   rU   rj   rZ   r`   rK   re   r"   rf   r   rg   rh   rk   rl   boolrq   setrr   rd   rx   r   r   r<   r   r   r   r   r   __classcell__r   r   rQ   r   r6   S   sB    $
 &r6   )F)r@   rE   r]   	functoolsr   pathlibr   numpyrk   mistral_common.exceptionsr   mistral_common.importsr   r   %mistral_common.tokens.tokenizers.baser   r   r	   &mistral_common.tokens.tokenizers.imager
   r   filterwarningsr_   sentencepiecer   r!   r   r(   r2   r5   r6   r   r   r   r   <module>   s4    