o
    iZ                    @   s  d Z ddlZddlmZ ddlmZ ddlmZ ddlm	Z	m
Z
mZmZmZmZmZ ddlmZmZmZ ddlmZ d	d
lmZmZmZmZ d	dlmZ eeZdddZde de!fddZ"dd Z#G dd dZ$G dd de$Z%de!de fddZ&G dd dZ'G dd de'Z(G d d! d!e'Z)G d"d# d#e'Z*G d$d% d%e'Z+G d&d' d'e'Z,G d(d) d)e'Z-G d*d+ d+e'Z.G d,d- d-e'Z/G d.d/ d/e'Z0G d0d1 d1e'Z1G d2d3 d3e'Z2G d4d5 d5e'Z3G d6d7 d7e3Z4G d8d9 d9e3Z5G d:d; d;e3Z6G d<d= d=e3Z7G d>d? d?e3Z8G d@dA dAe3Z9G dBdC dCe3Z:G dDdE dEe3Z;G dFdG dGe3Z<G dHdI dIe3Z=G dJdK dKe3Z>G dLdM dMe3Z?G dNdO dOe3Z@G dPdQ dQe3ZAG dRdS dSe3ZBG dTdU dUe3ZCG dVdW dWe'ZDG dXdY dYe3ZEG dZd[ d[e'ZFG d\d] d]e'ZGG d^d_ d_e'ZHG d`da dae3ZIG dbdc dce3ZJG ddde dee3ZKG dfdg dge'ZLG dhdi die3ZMG djdk dke3ZNG dldm dme3ZOdndo ZPG dpdq dqZQG drds dsZRi dte4due0dve5dwe(dxeEdyeHdze6d{eFd|e-d}e(d~e2de7de(de(de(de(de(i de4de*de-de.de(de(de0de<de0de0de(deLde8de9de+de(de0i de:de,deAde/de(de>de?de(de0de1de;de(deBdeCdeDde<de=e)eIeKeKeJeKdZSddefddZTdS )z
Utilities to convert slow tokenizers in their fast tokenizers counterparts.

All the conversions are grouped here to gather SentencePiece dependencies outside of the fast tokenizers files and
allow to make our dependency on SentencePiece optional.
    N)	lru_cache)Optional)version)
AddedTokenRegex	Tokenizerdecodersnormalizerspre_tokenizers
processors)BPEUnigram	WordPiece)tqdm   )is_protobuf_availableis_sentencepiece_availableloggingrequires_backends)PROTOBUF_IMPORT_ERROR c                 C   sj   t  rddlm} |S t r.dd l}t|jjtdk r&ddl	m} |S ddl	m
} |S tt| )Nr   )sentencepiece_model_pb2z4.0.0)sentencepiece_model_pb2_new)r   sentencepiecer   r   google.protobufr   parseprotobuf__version__transformers.utilsr   ImportErrorr   format)error_messager   google r#   \/home/ubuntu/LTX-2/.venv/lib/python3.10/site-packages/transformers/convert_slow_tokenizer.pyimport_protobuf%   s   r%   add_prefix_spacereturnc                 C   s$   | rd}t |ddsd}|S d}|S )NalwayslegacyTfirstnever)getattr)r&   original_tokenizerprepend_schemer#   r#   r$   _get_prepend_scheme6   s   r/   c           
         s   |d u}|r
t |n }g }| D ]<\}}g }tdt|D ]}|d | ||d  }}	| v r>|	 v r>|||	|f qt| fddd}|| qt|dd |d}dd |D }|S )	Nr   c                        | d   | d  fS Nr   r   r#   xvocabr#   r$   <lambda>K       z!generate_merges.<locals>.<lambda>keyc                 S   s   | d t | d t | d fS )N   r   r   )lenvalr#   r#   r$   r6   N   s    r9   reversec                 S   s   g | ]
}|d  |d fqS r   r   r#   .0r=   r#   r#   r$   
<listcomp>O       z#generate_merges.<locals>.<listcomp>)dictitemsranger;   appendsortedextend)
r5   vocab_scoresr?   mergesmergepiece_scorelocalindexpiece_lpiece_rr#   r4   r$   generate_merges@   s   rS   c                   @   sB   e Zd ZdZdefddZd	deeeef e	e f fddZ
dS )
SentencePieceExtractorzl
    Extractor implementation for SentencePiece trained models. https://github.com/google/sentencepiece
    modelc                 C   s.   t | d ddlm} | | _| j| d S )Nr   r   )SentencePieceProcessor)r   r   rV   spLoad)selfrU   rV   r#   r#   r$   __init__X   s   
zSentencePieceExtractor.__init__Nr'   c                    s2   | j   fddt  D }t||}||fS )
        By default will return vocab and merges with respect to their order, by sending `vocab_scores` we're going to
        order the merges with respect to the piece scores instead.
        c                       i | ]}  ||qS r#   id_to_piecerB   rP   rW   r#   r$   
<dictcomp>e   r7   z2SentencePieceExtractor.extract.<locals>.<dictcomp>)rW   rG   GetPieceSizerS   rY   rK   r5   rL   r#   r`   r$   extract_   s   
zSentencePieceExtractor.extractN)__name__
__module____qualname____doc__strrZ   tuplerE   intlistrd   r#   r#   r#   r$   rT   S   s    (rT   c                   @   s0   e Zd Zddeeeef ee f fddZdS )GemmaSentencePieceExtractorNr'   c                    sH   | j   fddt  D }d|vr|d|d< t||}||fS )r[   c                    r\   r#   r]   r_   r`   r#   r$   ra   s   r7   z7GemmaSentencePieceExtractor.extract.<locals>.<dictcomp>	<0x09>)rW   rG   rb   getrS   rc   r#   r`   r$   rd   m   s   
z#GemmaSentencePieceExtractor.extractre   )	rf   rg   rh   rk   rE   rj   rl   rm   rd   r#   r#   r#   r$   rn   l   s    (rn   piecec                 C   s&   t | dk p| d dkp| d   S )Nr:   ,)r;   isdigit)rr   r#   r#   r$   check_number_comma}   s   &rw   c                   @   s"   e Zd Zdd ZdefddZdS )	Converterc                 C   s
   || _ d S re   )r-   )rY   r-   r#   r#   r$   rZ      s   
zConverter.__init__r'   c                 C   s   t  re   )NotImplementedErrorrY   r#   r#   r$   	converted   s   zConverter.convertedN)rf   rg   rh   rZ   r   r{   r#   r#   r#   r$   rx      s    rx   c                   @      e Zd ZdefddZdS )BertConverterr'   c           
      C      | j j}tt|t| j jd}d}d}d}t| j dr+| j jj}| j jj	}| j jj
}tjd|||d|_t |_t| j j}t| j j}| j j}| j j}	tj| d| d| d| d| d	||f||	fgd
|_tjdd|_|S )N	unk_tokenFbasic_tokenizerT
clean_texthandle_chinese_charsstrip_accents	lowercase:0 $A:0 :0:0 $B:1 :1singlepairspecial_tokens##prefixr-   r5   r   r   rj   r   hasattrr   tokenize_chinese_charsr   do_lower_caser	   BertNormalizer
normalizerr
   BertPreTokenizerpre_tokenizer	cls_token	sep_tokencls_token_idsep_token_idr   TemplateProcessingpost_processorr   decoder
rY   r5   	tokenizerr   r   r   clssepr   r   r#   r#   r$   r{      :   



zBertConverter.convertedNrf   rg   rh   r   r{   r#   r#   r#   r$   r}          r}   c                   @   r|   )SplinterConverterr'   c              
   C   sZ  | j j}tt|t| j jd}d}d}d}t| j dr+| j jj}| j jj	}| j jj
}tjd|||d|_t |_t| j j}t| j j}t| j j}d}	| j j}
| j j}| j j}| j d}| j jdkrx| d| d	|	 d	| d
| d
}n| d| d
| d	|	 d	| d
}tj| d| d|||
f||f||f|	|fgd|_tjdd|_|S )Nr   Fr   Tr   .rightr    r   r   r   r   r   r   )r-   r5   r   r   rj   r   r   r   r   r   r   r	   r   r   r
   r   r   r   r   question_tokenr   r   question_token_idconvert_tokens_to_idspadding_sider   r   r   r   r   )rY   r5   r   r   r   r   r   r   questiondotr   r   r   dot_token_idr   r#   r#   r$   r{      sL   



$"
zSplinterConverter.convertedNr   r#   r#   r#   r$   r      r   r   c                   @   r|   )FunnelConverterr'   c           
      C   r~   )Nr   Fr   Tr   z:2 $A:0 r   r   r   r   r   r   r   r   r#   r#   r$   r{      r   zFunnelConverter.convertedNr   r#   r#   r#   r$   r      r   r   c                   @   r|   )MPNetConverterr'   c           
   
   C   s   | j j}tt|t| j jd}d}d}d}t| j dr+| j jj}| j jj	}| j jj
}tjd|||d|_t |_t| j j}t| j j}| j j}| j j}	tj| d| d| d| d| d	| d
||f||	fgd|_tjdd|_|S )Nr   Fr   Tr   r   r   z:0 r   r   r   r   r   r   r   r#   r#   r$   r{   
  s:   



zMPNetConverter.convertedNr   r#   r#   r#   r$   r   	  r   r   c                   @   r|   )OpenAIGPTConverterr'   c              	   C   s   | j j}t| j j }| j j}tt||d t|ddd}|	t|d ur/|
t|g tjdd|_t |_tjdd|_|S )N</w>F)r5   rL   dropoutr   end_of_word_suffixfuse_unkT)r   suffix)r-   encoderrm   	bpe_rankskeysr   r   r   rj   token_to_idadd_special_tokensr	   r   r   r
   r   r   r   
BPEDecoderr   rY   r5   rL   r   r   r#   r#   r$   r{   1  s&   
zOpenAIGPTConverter.convertedNr   r#   r#   r#   r$   r   0  r   r   c                	   @   B   e Zd Z	ddeeeef  deeeeef   de	fddZ
dS )GPT2ConverterNr5   rL   r'   c              	   C   s   |s| j j}|st| j j}tt||d dddd}t| j dd}tj|d|_	t
 |_t| j ddrP| j j}| j j}tj| d| d||fgd	|_|S tjdd
|_|S )Nr   Fr5   rL   r   continuing_subword_prefixr   r   r&   r&   add_bos_tokenz:0 $A:0z:0 $A:0 $B:1r   trim_offsets)r-   r   rm   r   r   r   r,   r
   	ByteLevelr   r   r   	bos_tokenbos_token_idr   r   r   )rY   r5   rL   r   r&   bosr   r#   r#   r$   r{   L  s:   
zGPT2Converter.convertedNNrf   rg   rh   r   rE   rj   rl   rm   rk   r   r{   r#   r#   r#   r$   r   K      r   c                   @   r|   )HerbertConverterr'   c                 C   s   d}d}| j j}t| j j }||d d v r|dd  }tt||d | j j|d}tj	ddd|_
t |_tj|d|_tj| j j| j jf| j j| j jfd	|_|S )
Nz	#version:r   r   r   )r   r   r   F)r   r   r   )r   r   )r-   r   rm   r   r   r   r   r   r	   r   r   r
   r   r   r   r   r   r   BertProcessingr   r   r   r   r   )rY   tokenizer_info_strtoken_suffixr5   rL   r   r#   r#   r$   r{   t  s.   

zHerbertConverter.convertedNr   r#   r#   r#   r$   r   s  r   r   c                	   @   r   )Qwen2ConverterNr5   rL   r'   c                 C   s   |s| j j}|st| j j }tt||d d ddddd}t |_	t
t
jtddddt
jt| j ddddg|_t |_tjdd	|_|S )
Nr   F)r5   rL   r   r   r   r   r   byte_fallbackzn(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+isolatedbehaviorinvertr&   r&   	use_regexr   )r-   r   rm   r   r   r   r   r	   NFCr   r
   SequenceSplitr   r   r,   r   r   r   r   r   )rY   r5   rL   r   r#   r#   r$   r{     sD   

zQwen2Converter.convertedr   r   r#   r#   r#   r$   r     r   r   c                   @   r|   )RobertaConverterr'   c              	   C   sv   | j }|j}t|j }tt||d dddd}tj|j	d|_
t |_tj|j|jf|j|jf|j	dd|_|S )Nr   Fr   r   Tr   r   r&   r   )r-   r   rm   r   r   r   r   r
   r   r&   r   r   r   r   RobertaProcessingr   r   r   r   r   rY   otr5   rL   r   r#   r#   r$   r{     s,   


zRobertaConverter.convertedNr   r#   r#   r#   r$   r     r   r   c                   @   r|   )RoFormerConverterr'   c           
      C   s   ddl m} | jj}tt|t| jjd}d}d}t| jdr*| jj	j
}| jj	j}tjdd||d|_tj|||_t| jj}t| jj}| jj}| jj}	tj| d| d	| d| d
| d||f||	fgd|_tjdd|_|S )Nr   )JiebaPreTokenizerr   Fr   Tr   r   r   r   r   r   r   r   )"models.roformer.tokenization_utilsr   r-   r5   r   r   rj   r   r   r   r   r   r	   r   r   r
   PreTokenizercustomr   r   r   r   r   r   r   r   r   r   )
rY   r   r5   r   r   r   r   r   r   r   r#   r#   r$   r{     s8   

zRoFormerConverter.convertedNr   r#   r#   r#   r$   r     r   r   c                   @   r|   )DebertaConverterr'   c              	   C   s~   | j }|j}t|j }tt||d dddd}tj|j	d|_
t |_tjddd| j dfd| j dfgd	|_|S )
Nr   Fr   r   [CLS]:0 $A:0 [SEP]:0![CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1[CLS][SEP]r   )r-   r   rm   r   r   r   r   r
   r   r&   r   r   r   r   r   r   r   r   r#   r#   r$   r{     s.   
	zDebertaConverter.convertedNr   r#   r#   r#   r$   r     r   r   c                       sn   e Zd ZdZeZi Z fddZdd Zdd Z	dd	 Z
d
d Zdd Zdd Zdd ZdefddZ  ZS )SpmConverterFc                    s   t | d t j|  t }| }t| jjd}||	  W d    n1 s+w   Y  || _
| j
jjrB| jsDtd d S d S d S )Nr   rba  The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.)r   superrZ   r%   
ModelProtoopenr-   
vocab_fileParseFromStringreadprototrainer_specr   handle_byte_fallbackwarningswarn)rY   args	model_pb2mf	__class__r#   r$   rZ   (  s   
zSpmConverter.__init__c                 C      dd |j D S )Nc                 S      g | ]}|j |jfqS r#   rr   scorerB   rr   r#   r#   r$   rC   >  r7   z&SpmConverter.vocab.<locals>.<listcomp>piecesrY   r   r#   r#   r$   r5   =     zSpmConverter.vocabc                 C   s   |j jS re   )r   unk_idr
  r#   r#   r$   r  @     zSpmConverter.unk_idc           	   	      s   |j j} |}|dkrtt| | jd}n-|dkrD  jj	
|\}}dd t|D }tt|||j jd jd d}ntd fd	d
t|jD }|dd
 t|dd dD  |S )Nr   r  r   r:   c                 S      i | ]	\}\}}||qS r#   r#   rB   iwordr  r#   r#   r$   ra   R      z*SpmConverter.tokenizer.<locals>.<dictcomp>Tr   r   r   r   z]You're trying to run a `Unigram` model but you're file was trained with a different algorithmc                    8   g | ]\}}|j d v r||j|j dkp|j jv fqS )      r  typerr   r   rB   idprz   r#   r$   rC   g  
    
z*SpmConverter.tokenizer.<locals>.<listcomp>c                 S       g | ]\}}}t |d |dqS F
normalizedspecialr   rB   r  tokenr#  r#   r#   r$   rC   m      c                 S      | d S Nr   r#   r2   r#   r#   r$   r6   o      z(SpmConverter.tokenizer.<locals>.<lambda>r8   )r   
model_typer5   r   r   r  r   SpmExtractorr-   r   rd   	enumerater   	unk_piece	Exceptionr	  
add_tokensrI   )	rY   r   r+  rK   r   _rL   	bpe_vocabspm_added_tokensr#   rz   r$   r   C  sF   

zSpmConverter.tokenizerc                 C   sJ   |j j}tjdddttddg}|st|S tt|g| S )NFT)leftr    {2,}   ▁)normalizer_specprecompiled_charsmapr	   StripReplacer   r   PrecompiledrY   r   r8  _normalizersr#   r#   r$   r   u  s   
zSpmConverter.normalizerc                 C      t || j}tj||dS Nreplacementr.   )r/   r-   r
   	MetaspacerY   rA  r&   r.   r#   r#   r$   r        zSpmConverter.pre_tokenizerc                 C      d S re   r#   rz   r#   r#   r$   r        zSpmConverter.post_processorc                 C   r>  r?  )r/   r-   r   rB  rC  r#   r#   r$   r     rD  zSpmConverter.decoderr'   c                 C   s   |  | j}| | j}|d ur||_d}d}t| jdr!| jj}| ||}|d ur.||_| |||_|  }|r>||_|S )Nr6  Tr&   )	r   r   r   r   r-   r&   r   r   r   )rY   r   r   rA  r&   r   r   r#   r#   r$   r{     s    zSpmConverter.converted)rf   rg   rh   r   rT   r,  r   rZ   r5   r  r   r   r   r   r   r   r{   __classcell__r#   r#   r  r$   r   #  s    2r   c                   @   $   e Zd Zdd Zdd Zdd ZdS )AlbertConverterc                 C   r  )Nc                 S   2   g | ]}t |jr|j|jfn|j|jd  fqS d   rw   rr   r  r  r#   r#   r$   rC         $z)AlbertConverter.vocab.<locals>.<listcomp>r  r
  r#   r#   r$   r5        zAlbertConverter.vocabc                 C      t ddt ddg}| jjs|t   |t   | jjr)|t   |j	j
}|r7|t | |t tdd t |S Nz``"z''r5  r   r	   r:  r-   keep_accentsrH   NFKDStripAccentsr   	Lowercaser7  r8  r;  r   r   rY   r   list_normalizersr8  r#   r#   r$   r        


zAlbertConverter.normalizerc                 C   ,   t jddd| jdfd| jdfgdS Nr   r   r   r   r   r   r   r-   r   rz   r#   r#   r$   r        zAlbertConverter.post_processorNrf   rg   rh   r5   r   r   r#   r#   r#   r$   rI        rI  c                   @      e Zd Zdd Zdd ZdS )BarthezConverterc                 C      d}|S Nr  r#   rY   r   r  r#   r#   r$   r       zBarthezConverter.unk_idc                 C   r[  Nz<s> $A </s>z<s> $A </s> </s> $B </s><s></s>r   r]  rz   r#   r#   r$   r     r^  zBarthezConverter.post_processorN)rf   rg   rh   r  r   r#   r#   r#   r$   rb    s    rb  c                   @   rH  )CamembertConverterc                 C   2   g d}|dd |j dd  D 7 }|dg7 }|S )N))z
<s>NOTUSED        <pad>rl  )z</s>NOTUSEDrl  z<unk>rl  )z<unk>NOTUSEDic                 S   r  r#   r  r  r#   r#   r$   rC     r7   z,CamembertConverter.vocab.<locals>.<listcomp>r   z<mask>rl  r  rY   r   r5   r#   r#   r$   r5     s   
zCamembertConverter.vocabc                 C      dS rd  r#   r
  r#   r#   r$   r       zCamembertConverter.unk_idc                 C   r[  rg  r]  rz   r#   r#   r$   r     r^  z!CamembertConverter.post_processorNrf   rg   rh   r5   r  r   r#   r#   r#   r$   rj    s    rj  c                   @   rH  )DebertaV2Converterc                 C   sH   g }| j jr|tjdd t|| j }|tj||d t|S )Nr   )r   r@  )r-   split_by_punctrH   r
   Punctuationr/   rB  r   )rY   rA  r&   list_pretokenizersr.   r#   r#   r$   r     s   
z DebertaV2Converter.pre_tokenizerc                 C   sd   g }| j jr|t  |t  |jj}|r"|t| |t	t
dd t|S )Nr5  r   )r-   r   rH   r	   rW  r9  r7  r8  r;  r:  r   r   rX  r#   r#   r$   r     s   
zDebertaV2Converter.normalizerc                 C   r[  r\  r]  rz   r#   r#   r$   r     r^  z!DebertaV2Converter.post_processorN)rf   rg   rh   r   r   r   r#   r#   r#   r$   ru    s    ru  c                   @   rH  )MBartConverterc                 C   >   g d}|dd |j dd  D 7 }|g d7 }|dg7 }|S )Nrh  rl  rm  ri  rl  ro  c                 S   r  r#   r  r  r#   r#   r$   rC     r7   z(MBartConverter.vocab.<locals>.<listcomp>r  )ar_ARrl  cs_CZrl  de_DErl  en_XXrl  es_XXrl  et_EErl  fi_FIrl  fr_XXrl  gu_INrl  hi_INrl  it_ITrl  ja_XXrl  kk_KZrl  ko_KRrl  lt_LTrl  lv_LVrl  my_MMrl  ne_NPrl  nl_XXrl  ro_ROrl  ru_RUrl  si_LKrl  tr_TRrl  vi_VNrl  zh_CNrl  rp  r  rq  r#   r#   r$   r5     s
   
zMBartConverter.vocabc                 C   rr  rd  r#   r
  r#   r#   r$   r  >  rF  zMBartConverter.unk_idc                 C   r[  )Nz$A </s> en_XXz$A $B </s> en_XXr  ri  r   r]  rz   r#   r#   r$   r   A  r^  zMBartConverter.post_processorNrt  r#   r#   r#   r$   ry    s    &ry  c                   @   rH  )MBart50Converterc                 C   rz  )Nr{  c                 S   r  r#   r  r  r#   r#   r$   rC   T  r7   z*MBart50Converter.vocab.<locals>.<listcomp>r  )4r~  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  )af_ZArl  )az_AZrl  )bn_INrl  )fa_IRrl  )he_ILrl  )hr_HRrl  )id_IDrl  )ka_GErl  )km_KHrl  )mk_MKrl  )ml_INrl  )mn_MNrl  )mr_INrl  )pl_PLrl  )ps_AFrl  )pt_XXrl  )sv_SErl  )sw_KErl  )ta_INrl  )te_INrl  )th_THrl  )tl_XXrl  )uk_UArl  )ur_PKrl  )xh_ZArl  )gl_ESrl  )sl_SIrl  rp  r  rq  r#   r#   r$   r5   M  s
   
zMBart50Converter.vocabc                 C   rr  rd  r#   r
  r#   r#   r$   r  Y  rF  zMBart50Converter.unk_idc                 C   r[  )Nzen_XX $A </s>zen_XX $A $B </s>r  ri  r   r]  rz   r#   r#   r$   r   \  r^  zMBart50Converter.post_processorNrt  r#   r#   r#   r$   r  L  s    r  c                   @   rH  )NllbConverterc                 C   (   g d}|dd |j dd  D 7 }|S )Nr{  c                 S   r  r#   r  r  r#   r#   r$   rC   o  r7   z'NllbConverter.vocab.<locals>.<listcomp>r  r  rq  r#   r#   r$   r5   h     zNllbConverter.vocabc                 C   rr  rd  r#   r
  r#   r#   r$   r  r  rF  zNllbConverter.unk_idc                 C   r[  )Nzeng_Latn $A </s>zeng_Latn $A $B </s>eng_Latnri  r   r]  rz   r#   r#   r$   r   u  r^  zNllbConverter.post_processorNrt  r#   r#   r#   r$   r  g      
r  c                   @   rH  )SeamlessM4TConverterc                 C   r  )N)rm  ro  r|  r}  c                 S   r  r#   r  r  r#   r#   r$   rC     r7   z.SeamlessM4TConverter.vocab.<locals>.<listcomp>r  r  rq  r#   r#   r$   r5     r  zSeamlessM4TConverter.vocabc                 C   s   | j jS re   )r-   unk_token_idr
  r#   r#   r$   r    r  zSeamlessM4TConverter.unk_idc                 C   r[  )Nz__eng__ $A </s>z__eng__ $A $B </s>__eng__ri  r   r]  rz   r#   r#   r$   r     r^  z#SeamlessM4TConverter.post_processorNrt  r#   r#   r#   r$   r    r  r  c                   @   rH  )XLMRobertaConverterc                 C   rk  )Nr{  c                 S   r  r#   r  r  r#   r#   r$   rC     r7   z-XLMRobertaConverter.vocab.<locals>.<listcomp>r  rp  r  rq  r#   r#   r$   r5     s   
zXLMRobertaConverter.vocabc                 C   rc  rd  r#   re  r#   r#   r$   r    rf  zXLMRobertaConverter.unk_idc                 C   r[  rg  r]  rz   r#   r#   r$   r     r^  z"XLMRobertaConverter.post_processorNrt  r#   r#   r#   r$   r        r  c                   @   rH  )XLNetConverterc                 C   r  )Nc                 S   rJ  rK  rM  r  r#   r#   r$   rC     rN  z(XLNetConverter.vocab.<locals>.<listcomp>r  r
  r#   r#   r$   r5     rO  zXLNetConverter.vocabc                 C   rP  rQ  rS  rX  r#   r#   r$   r     rZ  zXLNetConverter.normalizerc                 C   r[  )Nz$A:0 <sep>:0 <cls>:2z!$A:0 <sep>:0 $B:1 <sep>:1 <cls>:2z<sep>z<cls>r   r]  rz   r#   r#   r$   r     r^  zXLNetConverter.post_processorNr_  r#   r#   r#   r$   r    r`  r  c                   @      e Zd ZdS )ReformerConverterNrf   rg   rh   r#   r#   r#   r$   r        r  c                   @   ra  )RemBertConverterc                 C   s   t ddt ddt tddg}| jjs%|t   |t   | jjr0|t 	  |j
j}|r>|t | t |S rQ  )r	   r:  r   r-   rT  rH   rU  rV  r   rW  r7  r8  r;  r   rX  r#   r#   r$   r     s   


zRemBertConverter.normalizerc                 C   r[  r\  r]  rz   r#   r#   r$   r     r^  zRemBertConverter.post_processorN)rf   rg   rh   r   r   r#   r#   r#   r$   r    s    r  c                   @   r  )BertGenerationConverterNr  r#   r#   r#   r$   r    r  r  c                   @   s,   e Zd Zdd Zdd Zdd Zdd Zd	S )
PegasusConverterc                 C   s   | j jdf| j jdfg}| j jd ur|| j jdfg7 }| j jd ur2| j j| j jk r2|| j jdfg7 }|dd td| j jD 7 }|dd |jdd  D 7 }|S )Nrl  c                 S      g | ]
}d | ddfqS )z<unk_>g      Yr#   rB   r  r#   r#   r$   rC     rD   z*PegasusConverter.vocab.<locals>.<listcomp>r:   c                 S   r  r#   r  r  r#   r#   r$   rC     r7   )	r-   	pad_token	eos_tokenmask_token_sent
mask_tokenmask_token_idoffsetrG   r	  rq  r#   r#   r$   r5     s   

zPegasusConverter.vocabc                 C   s   |j j| jj S re   )r   r  r-   r  r
  r#   r#   r$   r    r  zPegasusConverter.unk_idc                 C   s(   t || j}tt tj||dgS r?  )r/   r-   r
   r   WhitespaceSplitrB  rC  r#   r#   r$   r     s   zPegasusConverter.pre_tokenizerc                 C   s0   | j j}|| j jfg}tjd|gdd|g|dS )N$A$Br   )r-   r  eos_token_idr   r   )rY   eosr   r#   r#   r$   r   !  s   
zPegasusConverter.post_processorN)rf   rg   rh   r5   r  r   r   r#   r#   r#   r$   r    s
    	r  c                   @   ra  )T5Converterc                 C   s:   | j j}dd |jD }|dd t|d ddD 7 }|S )Nc                 S   r  r#   r  r  r#   r#   r$   rC   ,  r7   z%T5Converter.vocab.<locals>.<listcomp>c                 S   r  )z
<extra_id_r  rl  r#   r  r#   r#   r$   rC   -  rD   r   rs   )r-   
_extra_idsr	  rG   )rY   r   num_extra_idsr5   r#   r#   r$   r5   *  s   zT5Converter.vocabc                 C   &   t jddgg dd| jdfgdS Nr  ri  )r  ri  r  ri  r   r]  rz   r#   r#   r$   r   0     zT5Converter.post_processorN)rf   rg   rh   r5   r   r#   r#   r#   r$   r  )  s    r  c                   @      e Zd Zdd ZdS )UdopConverterc                 C   r  r  r]  rz   r#   r#   r$   r   ;  r  zUdopConverter.post_processorNrf   rg   rh   r   r#   r#   r#   r$   r  :      r  c                   @   r|   )WhisperConverterr'   c           	   	   C   s   | j j}t| j j }tt||d dddd}tj| j j	d|_
t |_| j j}| j |}| j j}| j j}ddd |D }tj| d| d	| d
| d||fgt||d|_|S )Nr   Fr   r   r   c                 S   s   g | ]}| d qS )r   r#   rB   r&  r#   r#   r$   rC   \  s    z.WhisperConverter.converted.<locals>.<listcomp>z $A:0 r   z $A:0 $B:1 r   r   )r-   r   rm   r   r   r   r   r
   r   r&   r   r   r   prefix_tokensconvert_ids_to_tokensr  r  joinr   r   zipr   )	rY   r5   rL   r   prefix_token_idsprefixesr  r  prefix_templater#   r#   r$   r{   F  s8   
	zWhisperConverter.convertedNr   r#   r#   r#   r$   r  E  r   r  c                   @   r  )BigBirdConverterc                 C   r[  r\  r]  rz   r#   r#   r$   r   j  r^  zBigBirdConverter.post_processorNr  r#   r#   r#   r$   r  i  r  r  c                   @   r|   )CLIPConverterr'   c              
   C   s   | j j}t| j j }| j j}tt||d dddt|d}t	
t	 t	tddt	 g|_t
tjtddd	d
tjddg|_t |_tj| j j| j jf| j j| j jfddd|_|S )Nr   r   Fr5   rL   r   r   r   r   r   z\s+r   z9's|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+removedTr   r   r   )r-   r   rm   r   r   r   r   r   rj   r	   r   r   r:  r   rW  r   r
   r   r   r   r   r   r   r   r  r  r   r   r   r   r#   r#   r$   r{   v  sD   


zCLIPConverter.convertedNr   r#   r#   r#   r$   r   u  r   r   c                   @   r|   )LayoutLMv2Converterr'   c           
      C   s   | j j}tt|t| j jd}d}d}d}t| j dr+| j jj}| j jj	}| j jj
}tjd|||d|_t |_t| j j}t| j j}| j j}| j j}	tj| d| d| d| d| d	||f||	fgd
|_tjdd|_|S )Nr   FTr   r   r   r   r   r   r   r   r   r   r   r#   r#   r$   r{     r   zLayoutLMv2Converter.convertedNr   r#   r#   r#   r$   r    r   r  c                   @   r|   )BlenderbotConverterr'   c              	   C   st   | j }|j}t|j }tt||d dddd}tj|j	d|_
t |_tjd|j d|j|jfgd|_|S )Nr   Fr   r   z$A:0 r   )r   r   )r-   r   rm   r   r   r   r   r
   r   r&   r   r   r   r   r   r  r  r   r   r#   r#   r$   r{     s*   

zBlenderbotConverter.convertedNr   r#   r#   r#   r$   r    r   r  c                   @   rH  )XGLMConverterc                 C   s4   g d}|dd |j dd  D 7 }|g d7 }|S )Nr{  c                 S   r  r#   r  r  r#   r#   r$   rC     r7   z'XGLMConverter.vocab.<locals>.<listcomp>r  ))z<madeupword0>rl  )z<madeupword1>rl  )z<madeupword2>rl  )z<madeupword3>rl  )z<madeupword4>rl  )z<madeupword5>rl  )z<madeupword6>rl  r  rq  r#   r#   r$   r5     s   zXGLMConverter.vocabc                 C   rc  rd  r#   re  r#   r#   r$   r    rf  zXGLMConverter.unk_idc                 C   r[  )Nz</s> $Az</s> $A </s> </s> $Brh  ri  r   r]  rz   r#   r#   r$   r     r^  zXGLMConverter.post_processorNrt  r#   r#   r#   r$   r    r  r  c                   @   sF   e Zd ZdZeZddhZ	 dd Zdd Zdd	 Z	d
d Z
dd ZdS )GemmaConverterTz<start_of_turn>z<end_of_turn>c                 C      t ddS Nr   r6  )r	   r:  r
  r#   r#   r$   r        zGemmaConverter.normalizerc                 C   s|   | j jdf| j jdf| j jdfg}|dd |jdd  D 7 }tdd |D s<tdd t|D d }|d ur<d||< |S )	Nrl  c                 S   r  r#   r  r  r#   r#   r$   rC     r7   z(GemmaConverter.vocab.<locals>.<listcomp>r  c                 s   s    | ]	}|d  dkV  qdS )r   ro   Nr#   )rB   r3   r#   r#   r$   	<genexpr>  s    z'GemmaConverter.vocab.<locals>.<genexpr>c                 s   s$    | ]\}}|d  dkr|V  qdS )r   rp   Nr#   )rB   r  r3   r#   r#   r$   r
    s   " )ro   rl  )r-   r  r  r   r	  anynextr-  )rY   r   r5   override_indexr#   r#   r$   r5     s   


zGemmaConverter.vocabc                 C   r  )Nr   merged_with_previous)r
   r   rY   rA  r&   r#   r#   r$   r   "  r	  zGemmaConverter.pre_tokenizerc                 C   rc  rd  r#   re  r#   r#   r$   r  %  rf  zGemmaConverter.unk_idc                 C   s    t t ddt  t  gS )Nr6  r   )r   r   r:  ByteFallbackFuser  r#   r#   r$   r   )  s   
zGemmaConverter.decoderN)rf   rg   rh   r   rn   r,  r   r   r5   r   r  r   r#   r#   r#   r$   r    s    
r  c                   @   s@   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dS )LlamaConverterTc                 C   sN   | j ddf| j ddf| j ddfg}|dd |jdd  D 7 }|S )Nr   rl  r   r:   c                 S   r  r#   r  r  r#   r#   r$   rC   <  r7   z(LlamaConverter.vocab.<locals>.<listcomp>r  )r-   r  r	  rq  r#   r#   r$   r5   6  s   zLlamaConverter.vocabc                 C   rc  r)  r#   re  r#   r#   r$   r  ?  rf  zLlamaConverter.unk_idc                 C   <   t ddt  t  g}|r|t jdddg7 }t |S Nr6  r   r   )contentr4  r   r:  r  r  r9  r   rY   rA  r&   sequencer#   r#   r$   r   C     

zLlamaConverter.decoderc                 C   sT   t | jddr(g }t | jddr|tjddg7 }|tjdddg7 }t|S d S )Nr)   Tr&   r6  )prependr   )patternr  )r,   r-   r	   Prependr:  r   )rY   r   r  r#   r#   r$   r   M  s   
zLlamaConverter.normalizerc                 C   s.   t | jddst|| j}tj||ddS d S )Nr)   TFrA  r.   split)r,   r-   r/   r
   rB  rC  r#   r#   r$   r   V  s   zLlamaConverter.pre_tokenizerc                 C   rE  re   r#   rz   r#   r#   r$   r   \  rs  zLlamaConverter.post_processorN)
rf   rg   rh   r   r5   r  r   r   r   r   r#   r#   r#   r$   r  3  s    	
	r  c                   @   r|   )MarkupLMConverterr'   c           	   
   C   s   | j }|j}t|j }tt||d ddd| j jd}tj	|j
d|_t	 |_t| j j}t| j j}| j j}| j j}tj| d| | d| d| ||f||fgd|_|S )Nr   Fr  r   z $A z $B r   )r-   r   rm   r   r   r   r   r   r
   r   r&   r   r   r   rj   r   r   r   r   r   r   r   )	rY   r   r5   rL   r   r   r   r   r   r#   r#   r$   r{   b  s8   
	zMarkupLMConverter.convertedNr   r#   r#   r#   r$   r  a  r   r  c                   @   s2   e Zd ZdZdddZdd Zdd Zd	d
 ZdS )MoshiConverterTNc                 K   sf   t | d t| | t }| }t|d}||  W d    n1 s)w   Y  || _d S Nr   r   	r   rx   rZ   r%   r   r   r   r   r   )rY   r   model_max_lengthkwargsr   r   r   r#   r#   r$   rZ     s   

zMoshiConverter.__init__c                 C   s:   |j j}tddg}|st|S tt|g| S r  )r7  r8  r	   r:  r   r;  r<  r#   r#   r$   r     s   

zMoshiConverter.normalizerc                 C   r  r  r  r  r#   r#   r$   r     r  zMoshiConverter.decoderc                 C   s   d}t j||ddS )Nr*   Fr  )r
   rB  rC  r#   r#   r$   r     s   zMoshiConverter.pre_tokenizerre   )rf   rg   rh   r   rZ   r   r   r   r#   r#   r#   r$   r     s    


r   c                   @   sR   e Zd ZdZdddZdd Zdd Zd	d
 Zdd Zdd Z	dd Z
dd ZdS )HeliumConverterTNc                 K   sf   t | d t| | t }| }t|d}||  W d    n1 s)w   Y  || _d S r!  r"  )rY   r   r$  r   r   r   r#   r#   r$   rZ     s   

zHeliumConverter.__init__c                    s     |}tt| | jd} fddt|jD }|dd t|dd dD  |t	dd	d	d
g |j
ddd |S )Nr  c                    r  r  r  r  rz   r#   r$   rC     r  z-HeliumConverter.tokenizer.<locals>.<listcomp>c                 S   s"   g | ]\}}}t |d |ddqS )FT)r"  r#  single_wordr$  r%  r#   r#   r$   rC     s    c                 S   r(  r)  r#   r2   r#   r#   r$   r6     r*  z+HeliumConverter.tokenizer.<locals>.<lambda>r8   
Fr!  rn  r  )r  pad_id)r5   r   r   r  r   r-  r	  r0  rI   r   enable_padding)rY   r   rK   r   r3  r#   rz   r$   r     s&   

zHeliumConverter.tokenizerc                 C   sB   g }|j D ]}|jdkr|d|jfg7 }q||j|jfg7 }q|S )Nz<0x0A>r'  )r	  rr   r  )rY   r   r5   rr   r#   r#   r$   r5     s   

zHeliumConverter.vocabc                 C   rc  r)  r#   re  r#   r#   r$   r    rf  zHeliumConverter.unk_idc                 C   s8   t ddt  t  g}|t jdddg7 }t |S r  r  r  r#   r#   r$   r     s   

zHeliumConverter.decoderc                 C   s   t t dt ddgS r  )r	   r   r  r:  r
  r#   r#   r$   r     s   zHeliumConverter.normalizerc                 C   s   t t ddgS )Nr'  
contiguous)r
   r   r   r  r#   r#   r$   r     s   zHeliumConverter.pre_tokenizerc                 C   s   t jddgg ddgdS )Nrh  r  )rh  r  rh  r  )rh  r   r   )r   r   rz   r#   r#   r$   r     s   zHeliumConverter.post_processorre   )rf   rg   rh   r   rZ   r   r5   r  r   r   r   r   r#   r#   r#   r$   r%    s    
		r%  c                   @   s"   e Zd ZdZdddZdd ZdS )ParakeetConverterTNc                 G   sl   || _ t| d t| | t }| }t|d}||  W d    n1 s,w   Y  || _	d S r!  )
r   r   rx   rZ   r%   r   r   r   r   r   )rY   r   r   r   r   r   r#   r#   r$   rZ     s   

zParakeetConverter.__init__c              	      s     |}  j|\}}dd t|D }tt|||jjd j	d d} fddt|j
D }|dd t|dd	 d
D  |S )Nc                 S   r  r#   r#   r  r#   r#   r$   ra     r  z/ParakeetConverter.tokenizer.<locals>.<dictcomp>Tr  c                    r  r  r  r  rz   r#   r$   rC   *  r  z/ParakeetConverter.tokenizer.<locals>.<listcomp>c                 S   r  r   r$  r%  r#   r#   r$   rC   0  r'  c                 S   r(  r)  r#   r2   r#   r#   r$   r6   2  r*  z-ParakeetConverter.tokenizer.<locals>.<lambda>r8   )r5   r,  r   rd   r-  r   r   r   r.  r   r	  r0  rI   )rY   r   rK   r1  rL   r2  r   r3  r#   rz   r$   r     s,   

zParakeetConverter.tokenizerre   )rf   rg   rh   r   rZ   r   r#   r#   r#   r$   r+  	  s    
r+  c                  C   s   t ttdtdd t ttdtdd  t ttdtdd  } | dd }d	}td
D ]}|| vrI| | |d
|  |d7 }q3dd |D }tt| |S )a8  
    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
    characters the bpe code barfs on.

    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
    tables between utf-8 bytes and unicode strings.
    !~r      ¡   ¬   ®   ÿNr      c                 S   s   g | ]}t |qS r#   )chr)rB   nr#   r#   r$   rC   N  s    z$bytes_to_unicode.<locals>.<listcomp>)rm   rG   ordrH   rE   r  )bscsr4  br#   r#   r$   bytes_to_unicode:  s   L
r9  c                   @   sF   e Zd ZdZ				dddZdefdd	Zd
d ZdefddZ	dS )TikTokenConverterz'
    A general tiktoken converter.
    Ns(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+Fc                 K   4   || _ || _|| _t|tr| | _d S || _d S re   r   r  r&   
isinstancerE   r   additional_special_tokensrY   r   r  r&   r?  r$  r#   r#   r$   rZ   W     zTikTokenConverter.__init__tiktoken_urlc                    s   zddl m} W n ty   tdw || t fddg }i }  D ]P\}}|||< t|dkr:q)g }tdt|D ]%}|d | ||d  }	}
|	 v rh|
 v rh|	|
  v rh||	|
|f qCt	| fddd	d
}|
| q)t	|dd d	d
}fdd|D }||fS )Nr   )load_tiktoken_bpezY`tiktoken` is required to read a `tiktoken` file. Install it with `pip install tiktoken`.c                       d  fdd| dD S )Nr   c                       g | ]} t | qS r#   r5  rB   charbyte_encoderr#   r$   rC   t  r7   zdTikTokenConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string.<locals>.<listcomp>latin-1r  decoder8  rI  r#   r$   token_bytes_to_strings  s   zPTikTokenConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_stringr   c                    r0   r1   r#   r2   r   r#   r$   r6     r7   zCTikTokenConverter.extract_vocab_merges_from_model.<locals>.<lambda>Fr>   c                 S   r(  Nr:   r#   r<   r#   r#   r$   r6     r*  c                    $   g | ]} |d   |d fqS r@   r#   rA   rO  r#   r$   rC        $ zETikTokenConverter.extract_vocab_merges_from_model.<locals>.<listcomp>)tiktoken.loadrC  r/  
ValueErrorr9  rF   r;   rG   rH   rI   rJ   )rY   rB  rC  rL   r5   r&  rankrO   rP   rQ   rR   r#   )r   rJ  rO  r$   extract_vocab_merges_from_modelh  s6   z1TikTokenConverter.extract_vocab_merges_from_modelc                 C   :   |  | j\}}tt||dd}t|jdrd|j_|S NF)r   ignore_mergesTrX  r   r   r   r   rU   r[  rY   rK   rL   r   r#   r#   r$   r     
   zTikTokenConverter.tokenizerr'   c                 C   sh   |   }ttjt| jdddtj| jddg|_t	 |_
|dd | jD  tjdd|_|S )Nr   Fr   r   c                 S   s   g | ]	}t |d ddqS )FTr!  r$  r  r#   r#   r$   rC     r  z/TikTokenConverter.converted.<locals>.<listcomp>r   )r   r
   r   r   r   r  r   r&   r   r   r   r   r?  r   r   rY   r   r#   r#   r$   r{     s   
zTikTokenConverter.convertedNr;  FN)
rf   rg   rh   ri   rZ   rj   rX  r   r   r{   r#   r#   r#   r$   r:  R  s    
r:  c                   @   sB   e Zd Z				dddZdefddZd	d
 ZdefddZdS )MistralConverterNr;  Fc                 K   r<  re   r=  r@  r#   r#   r$   rZ     rA  zMistralConverter.__init__rB  c                    s  dd l  dd l}t| jddd}||}W d    n1 s w   Y  |d d | _dd |d	 D | _|d
 t tfddg }i }t	| jD ]	\}}|||j
< qN fddD t}	t	tddD ]P\}
}|
||< t|dkr~qmg }tdt|D ]%}|d | ||d  }}||	v r||	v r|| |	v r||||
f qt|fdddd}|| qmt|dd dd}fdd|D }||fS )Nr   rzutf-8)encodingconfigr  c                 S   s    g | ]}t |d  |d dqS )	token_str
is_control)r#  r$  rB   kr#   r#   r$   rC     s    zDMistralConverter.extract_vocab_merges_from_model.<locals>.<listcomp>r   r5   c                    rD  )Nr   c                    rE  r#   rF  rG  rI  r#   r$   rC     r7   zcMistralConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string.<locals>.<listcomp>rK  rL  rN  rI  r#   r$   rO    s   zOMistralConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_stringc                    s   g | ]	}  |d  qS )token_bytes)	b64decoderg  )base64r#   r$   rC     r  z(Converting tekken.json to tokenizer.json)descr   c                    s     | d   | d fS r1   )rP   r2   rP  r#   r$   r6     rD   zBMistralConverter.extract_vocab_merges_from_model.<locals>.<lambda>Fr>   c                 S   r(  rQ  r#   r<   r#   r#   r$   r6     r*  c                    rR  r@   r#   rA   rS  r#   r$   rC     rT  )rk  jsonr   r   loadr  r?  r9  r   r-  r  setr   r;   rG   rH   rI   rJ   )rY   rB  rm  r   untypedrL   r5   idxr&  rank_setrW  rO   rP   rQ   rR   r#   )rk  r   rJ  rO  r$   rX    sD   z0MistralConverter.extract_vocab_merges_from_modelc                 C   rY  rZ  r\  r]  r#   r#   r$   r     r^  zMistralConverter.tokenizerr'   c                 C   s^   |   }ttjt| jdddtj| jddg|_t	 |_
|| j tjdd|_|S )Nr   Fr   r   r   )r   r
   r   r   r   r  r   r&   r   r   r   r0  r?  r   r   r_  r#   r#   r$   r{     s   
zMistralConverter.convertedr`  )	rf   rg   rh   rZ   rj   rX  r   r   r{   r#   r#   r#   r$   ra    s    
&ra  AlbertTokenizerBartTokenizerBarthezTokenizerBertTokenizerBigBirdTokenizerBlenderbotTokenizerCamembertTokenizerCLIPTokenizerCodeGenTokenizerConvBertTokenizerDebertaTokenizerDebertaV2TokenizerDistilBertTokenizerDPRReaderTokenizerDPRQuestionEncoderTokenizerDPRContextEncoderTokenizerElectraTokenizerFNetTokenizerFunnelTokenizerGPT2TokenizerHerbertTokenizerLayoutLMTokenizerLayoutLMv2TokenizerLayoutLMv3TokenizerLayoutXLMTokenizerLongformerTokenizerLEDTokenizerLxmertTokenizerMarkupLMTokenizerMBartTokenizerMBart50TokenizerMPNetTokenizerMobileBertTokenizerMvpTokenizerNllbTokenizerOpenAIGPTTokenizerPegasusTokenizerQwen2TokenizerRealmTokenizerReformerTokenizerRemBertTokenizerRetriBertTokenizerRobertaTokenizerRoFormerTokenizerSeamlessM4TTokenizerSqueezeBertTokenizerT5TokenizerUdopTokenizerWhisperTokenizerXLMRobertaTokenizerXLNetTokenizer)SplinterTokenizerXGLMTokenizerLlamaTokenizerCodeLlamaTokenizerGemmaTokenizerPhi3TokenizerFc                 C   s   | j j}|tv r|st| }||  S | jdr)| | _td t	| j S ztd t
| j| jd W S  tyK   tdtt  w )a  
    Utilities to convert a slow tokenizer instance in a fast tokenizer instance.

    Args:
        transformer_tokenizer ([`~tokenization_utils_base.PreTrainedTokenizer`]):
            Instance of a slow tokenizer to convert in the backend tokenizer for
            [`~tokenization_utils_base.PreTrainedTokenizerFast`].
       from_tiktoken (bool, optional): Whether to use the `tiktoken` library to convert the tokenizer instead of sentencepiece.
            Defaults to False.

    Return:
        A instance of [`~tokenizers.Tokenizer`] to be used as the backend tokenizer of a
        [`~tokenization_utils_base.PreTrainedTokenizerFast`]
    ztekken.jsonz#Converting from Mistral tekken.jsonzConverting from Tiktoken)r   r?  zConverting from SentencePiece and Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast converters: )r  rf   SLOW_TO_FAST_CONVERTERSr{   r   endswithr-   loggerinfora  r:  r?  r/  rV  rm   r   )transformer_tokenizerfrom_tiktokentokenizer_class_nameconverter_classr#   r#   r$   convert_slow_tokenizer-  s.   


r  )r   )F)Uri   r   	functoolsr   typingr   	packagingr   
tokenizersr   r   r   r   r	   r
   r   tokenizers.modelsr   r   r   r   utilsr   r   r   r   utils.import_utilsr   
get_loggerrf   r  r%   boolrj   r/   rS   rT   rn   rw   rx   r}   r   r   r   r   r   r   r   r   r   r   r   rI  rb  rj  ru  ry  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r  r  r  r  r   r%  r+  r9  r:  ra  r  r  r#   r#   r#   r$   <module>   sZ  $


'2''(.' %!5% ($+'4.&)Y1OO	
 !"#$%&'()*+,-./01234=