from __future__ import annotations

from enum import Enum
import re
import logging
import json
import os
from pathlib import Path
from typing import Any, Callable, Sequence, Mapping, Iterable, Protocol, ClassVar, runtime_checkable

try:
    from sentencepiece import SentencePieceProcessor
except ImportError:
    SentencePieceProcessor = None

try:
    from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
    from mistral_common.tokens.tokenizers.tekken import Tekkenizer
    from mistral_common.tokens.tokenizers.utils import _filter_valid_tokenizer_files
    from mistral_common.tokens.tokenizers.sentencepiece import SentencePieceTokenizer

    _mistral_common_installed = True
except ImportError:
    MistralTokenizer = None
    Tekkenizer = None
    _filter_valid_tokenizer_files = None
    SentencePieceTokenizer = None
    _mistral_common_installed = False

try:
    from mistral_common.tokens.tokenizers.utils import get_one_valid_tokenizer_file
except ImportError:
    get_one_valid_tokenizer_file = None

import gguf

from .gguf_writer import GGUFWriter

logger = logging.getLogger(__name__)


class SpecialVocab:
    merges: list[str]
    add_special_token: dict[str, bool]
    special_token_ids: dict[str, int]
    chat_template: str | Sequence[Mapping[str, str]] | None

    def __init__(
        self, path: str | os.PathLike[str], load_merges: bool = False,
        special_token_types: Iterable[str] | None = None,
        n_vocab: int | None = None,
    ):
        self.special_token_ids = {}
        self.add_special_token = {}
        self.n_vocab = n_vocab
        self.load_merges = load_merges
        self.merges = []
        self.chat_template = None
        if special_token_types is not None:
            self.special_token_types = special_token_types
        else:
            self.special_token_types = ('bos', 'eos', 'unk', 'sep', 'pad', 'cls', 'mask')
        self._load(Path(path))

    def __repr__(self) -> str:
        return '<SpecialVocab with {} merges, special tokens {}, add special tokens {}>'.format(
            len(self.merges), self.special_token_ids or "unset", self.add_special_token or "unset",
        )

    def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None:
        if self.merges:
            if not quiet:
                logger.info(f'Adding {len(self.merges)} merge(s).')
            gw.add_token_merges(self.merges)
        elif self.load_merges:
            logger.warning('Adding merges requested but no merges found, output may be non-functional.')
        for typ, tokid in self.special_token_ids.items():
            id_handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None)
            if id_handler is None:
                logger.warning(f'No handler for special token type {typ} with id {tokid} - skipping')
                continue
            if not quiet:
                logger.info(f'Setting special token type {typ} to {tokid}')
            id_handler(tokid)
        for typ, value in self.add_special_token.items():
            add_handler: Callable[[bool], None] | None = getattr(gw, f'add_add_{typ}_token', None)
            if add_handler is None:
                logger.warning(f'No handler for add_{typ}_token with value {value} - skipping')
                continue
            if not quiet:
                logger.info(f'Setting add_{typ}_token to {value}')
            add_handler(value)
        if self.chat_template is not None:
            if not quiet:
                logger.info(f'Setting chat_template to {self.chat_template}')
            gw.add_chat_template(self.chat_template)

    def _load(self, path: Path) -> None:
        self._try_load_from_tokenizer_json(path)
        self._try_load_from_config_json(path)
        if self.load_merges and not self.merges:
            self._try_load_merges_txt(path)

    def _try_load_merges_txt(self, path: Path) -> bool:
        merges_file = path / 'merges.txt'
        if not merges_file.is_file():
            return False
        with open(merges_file, 'r', encoding='utf-8') as fp:
            first_line = next(fp, '').strip()
            if not first_line.startswith('#'):
                fp.seek(0)
                line_num = 0
            else:
                line_num = 1
            merges = []
            for line in fp:
                line_num += 1
                line = line.strip()
                if not line:
                    continue
                parts = line.split(None, 3)
                if len(parts) != 2:
                    logger.warning(f'{merges_file.name}: Line {line_num}: Entry malformed, ignoring')
                    continue
                merges.append(f'{parts[0]} {parts[1]}')
        self.merges = merges
        return True

    def _set_special_token(self, typ: str, tid: Any) -> None:
        if not isinstance(tid, int):
            return
        if tid < 0:
            raise ValueError(f'invalid value for special token type {typ}: {tid}')
        if self.n_vocab is None or tid < self.n_vocab:
            if typ in self.special_token_ids:
                return
            self.special_token_ids[typ] = tid
            return
        logger.warning(f'Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping')

    def _try_load_from_tokenizer_json(self, path: Path) -> bool:
        tokenizer = None
        tokenizer_file = path / 'tokenizer.json'
        if tokenizer_file.is_file():
            with open(tokenizer_file, encoding='utf-8') as f:
                tokenizer = json.load(f)
            if self.load_merges:
                merges = tokenizer.get('model', {}).get('merges')
                if isinstance(merges, list) and merges:
                    if isinstance(merges[0], str):
                        self.merges = merges
                    elif isinstance(merges[0], list) and len(merges[0]) == 2 and isinstance(merges[0][0], str):
                        # New format since transformers 4.45 to support spaces in merges
                        if any(' ' in s for pair in merges for s in pair):
                            logger.warning(f'Spaces in merges detected, encoding as {chr(ord(" ") + 256)!r}')
                        self.merges = [
                            ' '.join(
                                [
                                    # ensure the spaces are properly encoded
                                    ''.join(chr(ord(c) + 256) if c == ' ' else c for c in part)
                                    for part in pair
                                ]
                            )
                            for pair in merges
                        ]
                    else:
                        raise ValueError('Unknown tokenizer merges format')
            added_tokens = tokenizer.get('added_tokens', {})
        else:
            added_tokens = {}
        tokenizer_config = None
        tokenizer_config_file = path / 'tokenizer_config.json'
        if tokenizer_config_file.is_file():
            with open(tokenizer_config_file, encoding='utf-8') as f:
                tokenizer_config = json.load(f)
        if tokenizer:
            special_bos = (tokenizer_config or {}).get('bos_token')
            special_cls = (tokenizer_config or {}).get('cls_token')
            special_eos = (tokenizer_config or {}).get('eos_token')
            special_sep = (tokenizer_config or {}).get('sep_token')
            if not special_bos and special_cls and tokenizer_config:
                tokenizer_config['bos_token'] = special_bos = special_cls
            if not special_eos and special_sep and tokenizer_config:
                tokenizer_config['eos_token'] = special_eos = special_sep
            if post_processor := tokenizer.get('post_processor'):
                for processor in post_processor.get('processors', [post_processor]):
                    if processor.get('type') == 'RobertaProcessing':
                        self.add_special_token['bos'] = True
                        self.add_special_token['eos'] = True
                        self.add_special_token['sep'] = True
                        if not special_cls and tokenizer_config:
                            special_cls = processor.get('cls', [special_bos])[0]
                            tokenizer_config['cls_token'] = special_cls
                        if not special_sep and tokenizer_config:
                            special_sep = processor.get('sep', [special_eos])[0]
                            tokenizer_config['sep_token'] = special_sep
                        continue
                    # Crude parsing of TemplateProcessing to determine if BOS/SEP/EOS should be added;
                    # this only works for simple templates and can get unusual sequences wrong
                    if processor.get('type') == 'TemplateProcessing':
                        tmpl_single = processor.get('single', [])
                        tmpl_pair = processor.get('pair', [])
                        special_first = None
                        special_last = None
                        if len(tmpl_single) > 1:
                            if special_first := tmpl_single[0].get('SpecialToken', {}).get('id'):
                                if not tokenizer_config:
                                    special_bos = special_first
                                self.add_special_token['bos'] = True if special_first in (special_bos, special_cls) else False
                                if special_first not in (special_bos, special_cls):
                                    logger.warning(f'Unknown leading special token {special_first!r} in TemplateProcessing<single>')
                            if special_last := tmpl_single[-1].get('SpecialToken', {}).get('id'):
                                if not tokenizer_config:
                                    special_eos = special_last
                                elif special_last != special_eos:
                                    if 'eot' not in self.special_token_types:
                                        # do not steal the EOS token handling, treat the trailing token as EOT instead
                                        self.special_token_types = tuple(self.special_token_types) + ('eot',)
                                        tokenizer_config['eot_token'] = special_last
                                    elif 'eom' not in self.special_token_types:
                                        self.special_token_types = tuple(self.special_token_types) + ('eom',)
                                        tokenizer_config['eom_token'] = special_last
                                    else:
                                        logger.warning(f'Overriding EOS token {special_eos!r} with {special_last!r} without EOT/EOM fallback!')
                                        tokenizer_config['eos_token'] = special_eos = special_last
                                self.add_special_token['eos'] = True if special_last == special_eos else False
                                if special_last != special_eos:
                                    logger.warning(f'Unknown trailing special token {special_last!r} in TemplateProcessing<single>')
                        if tmpl_pair:
                            seq_start = 1 if special_first and tmpl_pair[0].get('SpecialToken', {}).get('id') == special_first else 0
                            seq_stop = -1 if special_last and tmpl_pair[-1].get('SpecialToken', {}).get('id') == special_last else None
                            if (special_first and seq_start == 0) or (special_last and seq_stop is None):
                                logger.warning('TemplateProcessing<single> leading/trailing special tokens do not match TemplateProcessing<pair>')
                            if tmpl_pair := tmpl_pair[slice(seq_start, seq_stop)]:
                                tmpl_a = tmpl_pair[0].get('Sequence', {}).get('id')
                                tmpl_b = tmpl_pair[-1].get('Sequence', {}).get('id')
                                if tmpl_a != 'A' or tmpl_b != 'B':
                                    logger.warning(f'Unknown sequence {tmpl_a}...{tmpl_b} in TemplateProcessing<pair>')
                                # A [sep] [eos] B
                                if tmpl_a == 'A' and tmpl_b == 'B' and (tmpl_pair := tmpl_pair[1:-1]):
                                    add_sep = False
                                    if special_entry := tmpl_pair[0].get('SpecialToken', {}).get('id'):
                                        if special_entry in (special_sep, special_eos) and not special_last:
                                            add_sep = True
                                        if special_entry not in (special_sep, special_eos):
                                            logger.warning(f'Unknown separator token {special_entry!r} in TemplateProcessing<pair>')
                                    else:
                                        logger.warning(f'Unknown middle sequence {tmpl_pair[0]!r} in TemplateProcessing<pair>')
                                    if len(tmpl_pair) == 2:
                                        if special_entry := tmpl_pair[1].get('SpecialToken', {}).get('id'):
                                            if special_entry in (special_sep, special_eos):
                                                add_sep = True
                                            if special_entry not in (special_sep, special_eos):
                                                logger.warning(f'Unknown second separator token {special_entry!r} in TemplateProcessing<pair>')
                                        else:
                                            logger.warning(f'Unknown second middle sequence {tmpl_pair[1]!r} in TemplateProcessing<pair>')
                                    self.add_special_token['sep'] = add_sep
                                    if add_sep and not special_sep and tokenizer_config:
                                        tokenizer_config['sep_token'] = special_entry
                        continue
        if not tokenizer_config:
            return True
        chat_template_alt = None
        chat_template_json = path / 'chat_template.json'
        chat_template_jinja = path / 'chat_template.jinja'
        if chat_template_jinja.is_file():
            with open(chat_template_jinja, encoding='utf-8') as f:
                chat_template_alt = f.read()
            if additional_templates := list((path / 'additional_chat_templates').glob('*.jinja')):
                chat_template_alt = [{'name': 'default', 'template': chat_template_alt}]
                for template_path in additional_templates:
                    with open(template_path, encoding='utf-8') as fp:
                        chat_template_alt.append({'name': template_path.stem, 'template': fp.read()})
        elif chat_template_json.is_file():
            with open(chat_template_json, encoding='utf-8') as f:
                chat_template_alt = json.load(f).get('chat_template')
        chat_template = tokenizer_config.get('chat_template', chat_template_alt)
        if chat_template is None or isinstance(chat_template, (str, list)):
            self.chat_template = chat_template
        else:
            logger.warning(f'Bad type for chat_template field in {tokenizer_config_file!r} - ignoring')
        for typ in self.special_token_types:
            add_entry = tokenizer_config.get(f'add_{typ}_token')
            if isinstance(add_entry, bool):
                self.add_special_token[typ] = add_entry
            entry = tokenizer_config.get(f'{typ}_token')
            if isinstance(entry, str):
                tc_content = entry
            elif isinstance(entry, dict):
                entry_content = entry.get('content')
                if not isinstance(entry_content, str):
                    continue
                tc_content = entry_content
            else:
                continue
            # We only need the first match here.
            maybe_token_id = next(
                (atok.get('id') for atok in added_tokens if atok.get('content') == tc_content),
                None,
            )
            self._set_special_token(typ, maybe_token_id)
        return True

    def _try_load_from_config_json(self, path: Path) -> bool:
        config_file = path / 'config.json'
        if not config_file.is_file():
            return False
        with open(config_file, encoding='utf-8') as f:
            config = json.load(f)
        for typ in self.special_token_types:
            token_id = config.get(f'{typ}_token_id')
            if token_id is None and 'text_config' in config:
                # may be a multimodal model
                token_id = config['text_config'].get(f'{typ}_token_id')
            self._set_special_token(typ, token_id)
        return True
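
# Typical usage (an illustrative sketch, not part of the upstream module): a
# converter builds a SpecialVocab from a model directory and then hands the
# discovered merges, special token IDs, and chat template to a GGUFWriter. The
# directory path and n_vocab value below are hypothetical.
#
#     special_vocab = SpecialVocab("path/to/model", load_merges=True, n_vocab=32000)
#     special_vocab.add_to_gguf(writer)  # `writer` is an open gguf.GGUFWriter
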
@runtime_checkable
class BaseVocab(Protocol):
    tokenizer_model: ClassVar[str]
    name: ClassVar[str]


@runtime_checkable
class Vocab(BaseVocab, Protocol):
    vocab_size: int
    added_tokens_dict: dict[str, int]
    added_tokens_list: list[str]
    fname_tokenizer: Path

    def __init__(self, base_path: Path): ...
    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ...


class NoVocab(BaseVocab):
    tokenizer_model = "no_vocab"
    name = "no_vocab"

    def __repr__(self) -> str:
        return "<NoVocab for a model without integrated vocabulary>"


class BpeVocab(Vocab):
    tokenizer_model = "gpt2"
    name = "bpe"

    def __init__(self, base_path: Path):
        added_tokens: dict[str, int] = {}

        if (fname_tokenizer := base_path / 'vocab.json').exists():
            # "slow" tokenizer
            with open(fname_tokenizer, encoding="utf-8") as f:
                self.vocab = json.load(f)

            try:
                # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
                with open(base_path / 'added_tokens.json', encoding="utf-8") as f:
                    added_tokens = json.load(f)
            except FileNotFoundError:
                pass
        else:
            # "fast" tokenizer
            fname_tokenizer = base_path / 'tokenizer.json'

            # if this fails, FileNotFoundError propagates to caller
            with open(fname_tokenizer, encoding="utf-8") as f:
                tokenizer_json = json.load(f)

            tokenizer_model: dict[str, Any] = tokenizer_json['model']
            if (
                tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False)
                or tokenizer_json['decoder']['type'] != 'ByteLevel'
            ):
                raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer')

            self.vocab = tokenizer_model["vocab"]

            if (added := tokenizer_json.get('added_tokens')) is not None:
                # Added tokens here can be duplicates of the main vocabulary.
                added_tokens = {item['content']: item['id']
                                for item in added
                                if item['content'] not in self.vocab}

        vocab_size = len(self.vocab)
        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
        actual_ids = sorted(added_tokens.values())
        if expected_ids != actual_ids:
            expected_end_id = vocab_size + len(actual_ids) - 1
            raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range "
                             f"{vocab_size} - {expected_end_id}; got {actual_ids}")

        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
        self.added_tokens_dict = added_tokens
        self.added_tokens_list = [text for (text, idx) in items]
        self.vocab_size_base = vocab_size
        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
        self.fname_tokenizer = fname_tokenizer

    def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}

        for i, _ in enumerate(self.vocab):
            yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL

    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        for text in self.added_tokens_list:
            score = -1000.0
            yield text.encode("utf-8"), score, gguf.TokenType.CONTROL

    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        yield from self.bpe_tokens()
        yield from self.added_tokens()

    def __repr__(self) -> str:
        return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
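
# Illustrative sketch (not part of the upstream module): every concrete vocab
# class here exposes the same all_tokens() iterator of (text, score, token_type)
# triples, which is what the GGUF metadata writer ultimately consumes. The model
# directory below is hypothetical.
#
#     vocab = BpeVocab(Path("path/to/model"))
#     tokens, scores, toktypes = [], [], []
#     for text, score, toktype in vocab.all_tokens():
#         tokens.append(text)
#         scores.append(score)
#         toktypes.append(toktype)
#     # then: writer.add_token_list(tokens); writer.add_token_scores(scores);
#     #       writer.add_token_types(toktypes)
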
class SentencePieceVocab(Vocab):
    tokenizer_model = "llama"
    name = "spm"

    def __init__(self, base_path: Path):
        if SentencePieceProcessor is None:
            raise RuntimeError("sentencepiece is not installed")

        added_tokens: dict[str, int] = {}
        if (fname_tokenizer := base_path / 'tokenizer.model').exists():
            # normal location
            try:
                with open(base_path / 'added_tokens.json', encoding="utf-8") as f:
                    added_tokens = json.load(f)
            except FileNotFoundError:
                pass
        elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists():
            # not found in alternate location either
            raise FileNotFoundError('Cannot find tokenizer.model')

        self.sentencepiece_tokenizer = SentencePieceProcessor()
        self.sentencepiece_tokenizer.LoadFromFile(str(fname_tokenizer))
        vocab_size = self.sentencepiece_tokenizer.vocab_size()

        new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
        expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
        actual_new_ids = sorted(new_tokens.keys())

        if expected_new_ids != actual_new_ids:
            raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")

        # Token pieces that were added to the base vocabulary.
        self.added_tokens_dict = added_tokens
        self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
        self.vocab_size_base = vocab_size
        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
        self.fname_tokenizer = fname_tokenizer

    def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        tokenizer = self.sentencepiece_tokenizer
        for i in range(tokenizer.vocab_size()):
            piece = tokenizer.IdToPiece(i)
            text = piece.encode("utf-8")
            score: float = tokenizer.GetScore(i)

            toktype = gguf.TokenType.NORMAL
            if tokenizer.IsUnknown(i):
                toktype = gguf.TokenType.UNKNOWN
            if tokenizer.IsControl(i):
                toktype = gguf.TokenType.CONTROL
            if tokenizer.IsUnused(i):
                toktype = gguf.TokenType.UNUSED
            if tokenizer.IsByte(i):
                toktype = gguf.TokenType.BYTE

            yield text, score, toktype

    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        for text in self.added_tokens_list:
            score = -1000.0
            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED

    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        yield from self.sentencepiece_tokens()
        yield from self.added_tokens()

    def __repr__(self) -> str:
        return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"


class LlamaHfVocab(Vocab):
    tokenizer_model = "llama"
    name = "hfft"

    def __init__(self, base_path: Path):
        fname_tokenizer = base_path / 'tokenizer.json'
        # if this fails, FileNotFoundError propagates to caller
        with open(fname_tokenizer, encoding='utf-8') as f:
            tokenizer_json = json.load(f)

        # pre-check so we know if we need transformers
        tokenizer_model: dict[str, Any] = tokenizer_json['model']
        is_llama3 = (
            tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
            and not tokenizer_model.get('byte_fallback', True)
        )
        if is_llama3:
            raise TypeError('Llama 3 must be converted with BpeVocab')

        if not is_llama3 and (
            tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False)
            or tokenizer_json['decoder']['type'] != 'Sequence'
        ):
            raise FileNotFoundError('Cannot find Llama BPE tokenizer')

        try:
            from transformers import AutoTokenizer
        except ImportError as e:
            raise ImportError(
                "To use LlamaHfVocab, please install the `transformers` package. "
                "You can install it with `pip install transformers`."
            ) from e

        # Allow the tokenizer to default to slow or fast versions.
        # Explicitly set tokenizer to use local paths.
        self.tokenizer = AutoTokenizer.from_pretrained(
            base_path,
            cache_dir=base_path,
            local_files_only=True,
        )
        assert self.tokenizer.is_fast  # assume tokenizer.json is used

        # Initialize lists and dictionaries for added tokens
        self.added_tokens_list = []
        self.added_tokens_dict = dict()
        self.added_tokens_ids  = set()

        # Process added tokens
        for tok, tokidx in sorted(
            self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]
        ):
            # Only consider added tokens that are not in the base vocabulary
            if tokidx >= self.tokenizer.vocab_size:
                self.added_tokens_list.append(tok)
                self.added_tokens_dict[tok] = tokidx
                self.added_tokens_ids.add(tokidx)

        # Store special tokens and their IDs
        self.specials = {
            tok: self.tokenizer.get_vocab()[tok]
            for tok in self.tokenizer.all_special_tokens
        }
        self.special_ids = set(self.tokenizer.all_special_ids)

        # Set vocabulary sizes
        self.vocab_size_base = self.tokenizer.vocab_size
        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)

        self.fname_tokenizer = fname_tokenizer

    def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        reverse_vocab = {
            id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
        }

        for token_id in range(self.vocab_size_base):
            # Skip processing added tokens here
            if token_id in self.added_tokens_ids:
                continue

            # Convert token text to bytes
            token_text = reverse_vocab[token_id].encode("utf-8")

            # Yield token text, score, and type
            yield token_text, self.get_token_score(token_id), self.get_token_type(
                token_id, token_text, self.special_ids  # Reuse already stored special IDs
            )

    def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType:
        # Special case for byte tokens
        if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
            return gguf.TokenType.BYTE

        # Determine token type based on whether it's a special token
        return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL

    def get_token_score(self, token_id: int) -> float:
        # Placeholder for actual logic to determine the token's score
        # This needs to be implemented based on specific requirements
        return -1000.0  # Default score

    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        for text in self.added_tokens_list:
            if text in self.specials:
                toktype = self.get_token_type(self.specials[text], b'', self.special_ids)
                score = self.get_token_score(self.specials[text])
            else:
                toktype = gguf.TokenType.USER_DEFINED
                score = -1000.0

            yield text.encode("utf-8"), score, toktype

    def has_newline_token(self):
        return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab

    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        yield from self.hf_tokens()
        yield from self.added_tokens()

    def __repr__(self) -> str:
        return f"<LlamaHfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"


class MistralTokenizerType(str, Enum):
    spm = "spm"
    tekken = "tekken"


def bytes_to_unicode() -> dict[int, str]:
    """
    Returns a list of utf-8 bytes and a mapping to unicode strings. We specifically avoid mapping to whitespace/control
    characters the bpe code barfs on.

    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
    tables between utf-8 bytes and unicode strings.
    """
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    cs_str = [chr(n) for n in cs]
    return dict(zip(bs, cs_str))
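
# A small illustration of the mapping (a sketch, not in the upstream file):
# printable bytes map to themselves, while unprintable ones are shifted up by
# 0x100. The space byte 0x20 therefore becomes chr(0x20 + 0x100) = 'Ġ', which is
# why GPT-2 style vocabularies show 'Ġ' where a leading space was.
#
#     byte_encoder = bytes_to_unicode()
#     assert byte_encoder[ord("A")] == "A"       # printable ASCII is unchanged
#     assert byte_encoder[ord(" ")] == "\u0120"  # 'Ġ'
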
    !~r      ¡   ¬   ®   ÿNr   rt   c                 S  s   g | ]}t |qS r-   )ry   )rp   nr-   r-   r.   r     r   z$bytes_to_unicode.<locals>.<listcomp>)r   r   rz   ra   r   zip)bscsrd  bcs_strr-   r-   r.   bytes_to_unicode}  s    
rj  c                   @  s   e Zd ZU dZdZi Zded< g Zded< d0dd	Ze	d1ddZ
e	d1ddZd2ddZd2ddZd3ddZe	d4ddZe	d4ddZe	d4ddZe	d4dd Ze	d1d!d"Ze	d1d#d$Ze	d1d%d&Ze	d1d'd(Zd2d)d*Zed+d, Zd-d. Zd/S )5MistralVocabmistralr   r   r   r   r   r   c                 C  s:  t stdtd usJ dtd usJ dtd usJ dtd|  dd |dD }td ur9t|}n8t|}t	|dkrJt
d| t	|d	krid
|v rWd
}nt|d }td| d|  n|d }|| }t|jj| _t| jtrtjntj| _| jj| _|| _d| jj d | jj | _d S )NzwTo use MistralVocab, please install the `mistral-common` package. You can install it with `pip install mistral-common`.mistral_common is not installedzLoading Mistral tokenizer from c                 S  s   g | ]
}|  r| qS r-   )rY   as_posix)rp   r   r-   r-   r.   r     s    z)MistralVocab.__init__.<locals>.<listcomp>z**/*r   z*No tokenizer file found in the directory: r   ztekken.jsonr   z"Multiple tokenizer files found in z. Using zmistral--)_mistral_common_installedr?  r   r   r   r?   r@   r   r   r4   rl   r   rB   	from_fileinstruct_tokenizerr   rj   r[  r\  r  tokenizer_typen_wordsr   r   rI   version_name)r,   r   	all_filestokenizer_file_pathvalid_tokenizer_filesr   r-   r-   r.   r/     sH   


zMistralVocab.__init__r0   r1   c                 C  s   | j S rL   )rv  r5   r-   r-   r.   tokenizer_name  s   zMistralVocab.tokenizer_namec                 C  s   | j tjkrdS dS )Nr  r   )rs  r[  r  r5   r-   r-   r.   gguf_tokenizer_model  s   z!MistralVocab.gguf_tokenizer_modelr   c                 c  s    t d us	J dt| jt sJ dt| j t| jj D ]K}| jj|}|d}| jj	|}t
jj}| jj|rEt
jj}| jj|rPt
jj}| jj|r[t
jj}| jj|rft
jj}|||fV  q!d S )Nrm  z%Expected SentencePieceTokenizer, got rQ   )r   rj   r   r   r   _modelr   r'  r  r(  r  r  r  r)  r*  r+  r  r,  r-  r.  r/  )r,   r	  r  r   r  r0  r-   r-   r.   _sentencepiece_tokens  s(   
z"MistralVocab._sentencepiece_tokensc                 c  s    t d us	J dt| jt sJ dt| j t }t| jjD ]}| j|ddt	j
jfV  q"| jjD ]}| ||ddt	j
jfV  q8d S )Nrm  Expected Tekkenizer, got rQ   r   )r   rj   r   r   rj  r   num_special_tokensid_to_piecer  r  r  r  _tekken_token2id_nospecialtoken_bytes_to_stringr  )r,   byte_encoderr   tokenr-   r-   r.   _tekken_tokens  s"   zMistralVocab._tekken_tokensr  rk   c                 C  s   t d urtd usJ d| jtjkr!t| jt sJ | jj|S | jtj	kr:t| jts/J | jj|| jj
 S td| j )Nrm  Unknown tokenizer type: )r   r   rs  r[  r  rj   r   _vocabindexr\  r  rl   )r,   r  r-   r-   r.   get_token_id  s   zMistralVocab.get_token_idc                 C     | j jS rL   )r   bos_idr5   r-   r-   r.   r       zMistralVocab.bos_idc                 C  r  rL   )r   eos_idr5   r-   r-   r.   r    r  zMistralVocab.eos_idc                 C  s   | j jdkr	| jS | j jS )Nr   )r   pad_idr  r5   r-   r-   r.   r    s   zMistralVocab.pad_idc                 C  r  rL   )r   unk_idr5   r-   r-   r.   r    r  zMistralVocab.unk_idc                 C     | j | j jS rL   )r   r  r  r5   r-   r-   r.   r   #     zMistralVocab.bos_tokenc                 C  r  rL   )r   r  r  r5   r-   r-   r.   r   '  r  zMistralVocab.eos_tokenc                 C  r  rL   )r   r  r  r5   r-   r-   r.   	pad_token+  r  zMistralVocab.pad_tokenc                 C  r  rL   )r   r  r  r5   r-   r-   r.   	unk_token/  r  zMistralVocab.unk_tokenc                 c  sN    | j tjkr|  E d H  d S | j tjkr|  E d H  d S td| j  )Nr  )rs  r[  r  r}  r\  r  rl   r5   r-   r-   r.   r   3  s   zMistralVocab.all_tokensc                   s   d  fdd| dD S )NrT   c                   s   g | ]} t | qS r-   )rz   )rp   charr  r-   r.   r   ?      z6MistralVocab.token_bytes_to_string.<locals>.<listcomp>latin-1)r~   decode)rh  r  r-   r  r.   r  =  s   z"MistralVocab.token_bytes_to_stringc                   sB  t d ur
tjt sJ dtj jjjdd  D }g }tdjjj	 D ]S}|| }g }tdt
|D ]$}|d | }||d  }|v ra|v ra|| v ra||||f q=|sqtd| d|d t|fd	d
dd}|| q.t|dd
 dd}t   fdd|D }	dd |	D }
|
S )Nr~  c                 S  r  r-   r-   )rp   token_bytesrankr-   r-   r.   r   H  rM  z@MistralVocab.extract_vocab_merges_from_model.<locals>.<dictcomp>rt   r   z-Could not find valid merge for token at rank ri   r  c                   s    | d   | d  fS )Nr   r   r-   r9  )mergeable_ranksr-   r.   r   `  r  z>MistralVocab.extract_vocab_merges_from_model.<locals>.<lambda>F)r   reversec                 S  r   )NrW   r-   )valr-   r-   r.   r   d  r   c                   s,   g | ]} |d    |d  gqS )r   r   )r  )rp   r  )r  r,   r-   r.   r   h  s    z@MistralVocab.extract_vocab_merges_from_model.<locals>.<listcomp>c                 S  ru   )rX   c                 S  ru   )rT   c                 s  rv   rw   rx   r{   r-   r-   r.   rs   t  s   * zUMistralVocab.extract_vocab_merges_from_model.<locals>.<listcomp>.<listcomp>.<genexpr>r}   r   r-   r-   r.   r   r  s    zKMistralVocab.extract_vocab_merges_from_model.<locals>.<listcomp>.<listcomp>r}   r   r-   r-   r.   r   p  s    )r   rj   r   r   r|  _mergeable_ranksrC   r   r   r  r4   ra   rl   r  r   extendrj  )r,   token_bytes_mapmerge_pairsr	  merged_tokenlocaljleftrightdecoded_merge_pairsr   r-   )r  r  r,   r.   extract_vocab_merges_from_modelA  sL   

z,MistralVocab.extract_vocab_merges_from_modelNr   r   r   )r  r1   r0   rk   )r0   rk   )r   r   r   r   r`   r   r   r   r/   propertyrz  r{  r}  r  r  r  r  r  r  r   r   r  r  r   staticmethodr  r  r-   r-   r-   r.   rk    sB   
 
2





rk  )r0   r]  )1
__future__r   enumr   rU  loggingr   ospathlibr   typingr   r   r   r   r	   r
   r   r   sentencepiecer   r?  (mistral_common.tokens.tokenizers.mistralr   'mistral_common.tokens.tokenizers.tekkenr   &mistral_common.tokens.tokenizers.utilsr   .mistral_common.tokens.tokenizers.sentencepiecer   rp  r   r  gguf_writerr   	getLoggerr   r?   r   r   r   r   r   r  r3  r1   r[  rj  rk  r-   r-   r-   r.   <module>   s`    (
  
KK{

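

if __name__ == "__main__":
    # Minimal smoke test (an illustrative sketch, not part of the upstream
    # module): point this at a Hugging Face model directory and print the
    # special-token metadata SpecialVocab can discover there. Assumes the
    # package is importable as `gguf` so the relative import above resolves.
    import sys

    if len(sys.argv) != 2:
        print("usage: python -m gguf.vocab <model-dir>", file=sys.stderr)
        sys.exit(1)

    special_vocab = SpecialVocab(sys.argv[1], load_merges=True)
    print(special_vocab)
    for token_type, token_id in special_vocab.special_token_ids.items():
        print(f"  {token_type}: {token_id}")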