o
    i                     @   s`   d dl mZmZmZmZmZ d dlmZ d dlm	Z	 d dl
mZ G dd dZdefdd	Zd
S )    )Regex	Tokenizerdecoderspre_tokenizers
processors)BPE)LlamaTokenizerFast)bytes_to_unicodec                   @   sF   e Zd ZdZ				dddZdefdd	Zd
d ZdefddZ	dS )MistralConverterz'
    A general tiktoken converter.
    Ns(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+Fc                 K   s   || _ || _|| _|| _d S )N)vocabpatternadd_prefix_spaceadditional_special_tokens)selfr   r   r   r   kwargs r   U/home/ubuntu/.local/lib/python3.10/site-packages/transformers/integrations/mistral.py__init__   s   
zMistralConverter.__init__r   c           
         s  | t  fddg }i }t  D ]\\}\}}|| jvrm|||< t|dkr-qg }tdt|D ]%}|d | ||d  }}	| v r[|	 v r[||	  v r[|||	|f q6t| fdddd}|| q|||< qt|dd dd}fd	d
|D }||fS )Nc                    s   d  fdd| dD S )N c                    s   g | ]} t | qS r   )ord).0charbyte_encoderr   r   
<listcomp>       zcMistralConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string.<locals>.<listcomp>zlatin-1)joindecode)br   r   r   token_bytes_to_string   s   zOMistralConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string   c                    s    | d   | d  fS )Nr   r!   r   )x)	bpe_ranksr   r   <lambda>-   r   zBMistralConverter.extract_vocab_merges_from_model.<locals>.<lambda>F)keyreversec                 S   s   | d S )N   r   )valr   r   r   r$   1   s    c                    s$   g | ]} |d   |d fqS )r   r!   r   )r   r(   )r    r   r   r   2   s   $ zDMistralConverter.extract_vocab_merges_from_model.<locals>.<listcomp>)	r	   	enumerateitemsr   lenrangeappendsortedextend)
r   r   mergesidxtokenranklocalindexpiece_lpiece_rr   )r#   r   r    r   extract_vocab_merges_from_model   s,   

z0MistralConverter.extract_vocab_merges_from_modelc                 C   s:   |  | j\}}tt||dd}t|jdrd|j_|S )NF)fuse_unkignore_mergesT)r8   r   r   r   hasattrmodelr:   )r   vocab_scoresr0   	tokenizerr   r   r   r>   5   s
   zMistralConverter.tokenizerreturnc                 C   s^   |   }ttjt| jdddtj| jddg|_t	 |_
|| j tjdd|_|S )NisolatedF)behaviorinvert)r   	use_regex)trim_offsets)r>   r   SequenceSplitr   r   	ByteLevelr   pre_tokenizerr   decoderadd_special_tokensr   r   post_processor)r   r>   r   r   r   	converted<   s   
zMistralConverter.converted)Nr   FN)
__name__
__module____qualname____doc__r   strr8   r>   r   rL   r   r   r   r   r
      s    
r
   tokenizer_filec                    s|   ddl m} || }|jjj}dd |jjjD   fdd D }|| |}tt	| d
 d}|d	 i |S )
z1Convert a "tekken" tokenizer to a fast Tokenizer.r   )MistralTokenizerc                 S   s    g | ]}t |d r|jn|qS )value)r;   rT   r   r2   r   r   r   r   W   s    z,convert_tekken_tokenizer.<locals>.<listcomp>c                    s   i | ]}|  |qS r   )r5   rU   all_specialr   r   
<dictcomp>[   r   z,convert_tekken_tokenizer.<locals>.<dictcomp>)r   r   )tokenizer_objectr   )(mistral_common.tokens.tokenizers.mistralrS   	from_fileinstruct_tokenizerr>   _tekken_token2id_nospecial_all_special_tokensupdater   r
   rL   rJ   )rR   rS   mistral_tokenizerr   specials_tokensr>   r   rV   r   convert_tekken_tokenizerL   s   


rb   N)
tokenizersr   r   r   r   r   tokenizers.modelsr   transformersr   #transformers.convert_slow_tokenizerr	   r
   rQ   rb   r   r   r   r   <module>   s    D