from __future__ import annotations

import re
import logging
import json
import os
from pathlib import Path
from typing import Any, Callable, Sequence, Mapping, Iterable, Protocol, ClassVar, runtime_checkable

try:
    from sentencepiece import SentencePieceProcessor
except ImportError:
    SentencePieceProcessor = None

import gguf

from .gguf_writer import GGUFWriter

logger = logging.getLogger(__name__)

class SpecialVocab:
    merges: list[str]
    add_special_token: dict[str, bool]
    special_token_ids: dict[str, int]
    chat_template: str | Sequence[Mapping[str, str]] | None

    def __init__(
        self, path: str | os.PathLike[str], load_merges: bool = False,
        special_token_types: Iterable[str] | None = None,
        n_vocab: int | None = None,
    ):
        self.special_token_ids = {}
        self.add_special_token = {}
        self.n_vocab = n_vocab
        self.load_merges = load_merges
        self.merges = []
        self.chat_template = None
        if special_token_types is not None:
            self.special_token_types = special_token_types
        else:
            self.special_token_types = ('bos', 'eos', 'unk', 'sep', 'pad', 'cls', 'mask')
        self._load(Path(path))

    def __repr__(self) -> str:
        return '<SpecialVocab with {} merges, special tokens {}, add special tokens {}>'.format(
            len(self.merges), self.special_token_ids or "unset", self.add_special_token or "unset",
        )

    def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None:
        if self.merges:
            if not quiet:
                logger.info(f'Adding {len(self.merges)} merge(s).')
            gw.add_token_merges(self.merges)
        elif self.load_merges:
            logger.warning('Adding merges requested but no merges found, output may be non-functional.')
        for typ, tokid in self.special_token_ids.items():
            id_handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None)
            if id_handler is None:
                logger.warning(f'No handler for special token type {typ} with id {tokid} - skipping')
                continue
            if not quiet:
                logger.info(f'Setting special token type {typ} to {tokid}')
            id_handler(tokid)
        for typ, value in self.add_special_token.items():
            add_handler: Callable[[bool], None] | None = getattr(gw, f'add_add_{typ}_token', None)
            if add_handler is None:
                logger.warning(f'No handler for add_{typ}_token with value {value} - skipping')
                continue
            if not quiet:
                logger.info(f'Setting add_{typ}_token to {value}')
            add_handler(value)
        if self.chat_template is not None:
            if not quiet:
                logger.info(f'Setting chat_template to {self.chat_template}')
            gw.add_chat_template(self.chat_template)

    def _load(self, path: Path) -> None:
        self._try_load_from_tokenizer_json(path)
        self._try_load_from_config_json(path)
        if self.load_merges and not self.merges:
            self._try_load_merges_txt(path)

    def _try_load_merges_txt(self, path: Path) -> bool:
        merges_file = path / 'merges.txt'
        if not merges_file.is_file():
            return False
        with open(merges_file, 'r', encoding='utf-8') as fp:
            first_line = next(fp, '').strip()
            if not first_line.startswith('#'):
                fp.seek(0)
                line_num = 0
            else:
                line_num = 1
            merges = []
            for line in fp:
                line_num += 1
                line = line.strip()
                if not line:
                    continue
                parts = line.split(None, 3)
                if len(parts) != 2:
                    logger.warning(f'{merges_file.name}: Line {line_num}: Entry malformed, ignoring')
                    continue
                merges.append(f'{parts[0]} {parts[1]}')
        self.merges = merges
        return True

    def _set_special_token(self, typ: str, tid: Any) -> None:
        if not isinstance(tid, int):
            return
        if tid < 0:
            raise ValueError(f'invalid value for special token type {typ}: {tid}')
        if self.n_vocab is None or tid < self.n_vocab:
            if typ in self.special_token_ids:
                return
            self.special_token_ids[typ] = tid
            return
        logger.warning(f'Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping')

    def _try_load_from_tokenizer_json(self, path: Path) -> bool:
        tokenizer_file = path / 'tokenizer.json'
        if tokenizer_file.is_file():
            with open(tokenizer_file, encoding='utf-8') as f:
                tokenizer = json.load(f)
            if self.load_merges:
                merges = tokenizer.get('model', {}).get('merges')
                if isinstance(merges, list) and merges:
                    if isinstance(merges[0], str):
                        self.merges = merges
                    elif isinstance(merges[0], list) and len(merges[0]) == 2 and isinstance(merges[0][0], str):
                        # New format since transformers 4.45 to support spaces in merges
                        if any(' ' in s for pair in merges for s in pair):
                            logger.warning(f'Spaces in merges detected, encoding as {chr(ord(" ") + 256)!r}')
                        self.merges = [
                            ' '.join(
                                [
                                    # ensure the spaces are properly encoded
                                    ''.join(chr(ord(c) + 256) if c == ' ' else c for c in part)
                                    for part in pair
                                ]
                            )
                            for pair in merges
                        ]
                    else:
                        raise ValueError('Unknown tokenizer merges format')
            added_tokens = tokenizer.get('added_tokens', {})
        else:
            added_tokens = {}
        tokenizer_config_file = path / 'tokenizer_config.json'
        if not tokenizer_config_file.is_file():
            return True
        with open(tokenizer_config_file, encoding='utf-8') as f:
            tokenizer_config = json.load(f)
        chat_template_alt = None
        chat_template_file = path / 'chat_template.json'
        if chat_template_file.is_file():
            with open(chat_template_file, encoding='utf-8') as f:
                chat_template_alt = json.load(f).get('chat_template')
        chat_template = tokenizer_config.get('chat_template', chat_template_alt)
        if chat_template is None or isinstance(chat_template, (str, list)):
            self.chat_template = chat_template
        else:
            logger.warning(f'Bad type for chat_template field in {tokenizer_config_file!r} - ignoring')
        for typ in self.special_token_types:
            add_entry = tokenizer_config.get(f'add_{typ}_token')
            if isinstance(add_entry, bool):
                self.add_special_token[typ] = add_entry
            entry = tokenizer_config.get(f'{typ}_token')
            if isinstance(entry, str):
                tc_content = entry
            elif isinstance(entry, dict):
                entry_content = entry.get('content')
                if not isinstance(entry_content, str):
                    continue
                tc_content = entry_content
            else:
                continue
            # We only need the first match here.
            maybe_token_id = next(
                (atok.get('id') for atok in added_tokens if atok.get('content') == tc_content),
                None,
            )
            self._set_special_token(typ, maybe_token_id)
        return True

    def _try_load_from_config_json(self, path: Path) -> bool:
        config_file = path / 'config.json'
        if not config_file.is_file():
            return False
        with open(config_file, encoding='utf-8') as f:
            config = json.load(f)
        for typ in self.special_token_types:
            self._set_special_token(typ, config.get(f'{typ}_token_id'))
        return True


@runtime_checkable
class BaseVocab(Protocol):
    tokenizer_model: ClassVar[str]
    name: ClassVar[str]


@runtime_checkable
class Vocab(BaseVocab, Protocol):
    vocab_size: int
    added_tokens_dict: dict[str, int]
    added_tokens_list: list[str]
    fname_tokenizer: Path

    def __init__(self, base_path: Path): ...
    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ...


class NoVocab(BaseVocab):
    tokenizer_model = "no_vocab"
    name = "no_vocab"

    def __repr__(self) -> str:
        return "<NoVocab for a model without integrated vocabulary>"


class BpeVocab(Vocab):
    tokenizer_model = "gpt2"
    name = "bpe"

    def __init__(self, base_path: Path):
        added_tokens: dict[str, int] = {}

        if (fname_tokenizer := base_path / 'vocab.json').exists():
            # "slow" tokenizer
            with open(fname_tokenizer, encoding="utf-8") as f:
                self.vocab = json.load(f)

            try:
                with open(base_path / 'added_tokens.json', encoding="utf-8") as f:
                    added_tokens = json.load(f)
            except FileNotFoundError:
                pass
        else:
            # "fast" tokenizer
            fname_tokenizer = base_path / 'tokenizer.json'

            # if this fails, FileNotFoundError propagates to caller
            with open(fname_tokenizer, encoding="utf-8") as f:
                tokenizer_json = json.load(f)

            tokenizer_model: dict[str, Any] = tokenizer_json['model']
            if (
                tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False)
                or tokenizer_json['decoder']['type'] != 'ByteLevel'
            ):
                raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer')

            self.vocab = tokenizer_model["vocab"]

            if (added := tokenizer_json.get('added_tokens')) is not None:
                # Added tokens here can be duplicates of the main vocabulary.
                added_tokens = {item['content']: item['id']
                                for item in added
                                if item['content'] not in self.vocab}

        vocab_size = len(self.vocab)
        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
        actual_ids = sorted(added_tokens.values())
        if expected_ids != actual_ids:
            expected_end_id = vocab_size + len(actual_ids) - 1
            raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range "
                             f"{vocab_size} - {expected_end_id}; got {actual_ids}")

        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
        self.added_tokens_dict = added_tokens
        self.added_tokens_list = [text for (text, idx) in items]
        self.vocab_size_base = vocab_size
        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
        self.fname_tokenizer = fname_tokenizer

    def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}

        for i, _ in enumerate(self.vocab):
            yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL

    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        for text in self.added_tokens_list:
            score = -1000.0
            yield text.encode("utf-8"), score, gguf.TokenType.CONTROL

    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        yield from self.bpe_tokens()
        yield from self.added_tokens()

    def __repr__(self) -> str:
        return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"


class SentencePieceVocab(Vocab):
    tokenizer_model = "llama"
    name = "spm"

    def __init__(self, base_path: Path):
        if SentencePieceProcessor is None:
            raise RuntimeError("sentencepiece is not installed")

        added_tokens: dict[str, int] = {}
        if (fname_tokenizer := base_path / 'tokenizer.model').exists():
            # normal location
            try:
                with open(base_path / 'added_tokens.json', encoding="utf-8") as f:
                    added_tokens = json.load(f)
            except FileNotFoundError:
                pass
        elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists():
            # not found in alternate location either
            raise FileNotFoundError('Cannot find tokenizer.model')

        self.sentencepiece_tokenizer = SentencePieceProcessor()
        self.sentencepiece_tokenizer.LoadFromFile(str(fname_tokenizer))
        vocab_size = self.sentencepiece_tokenizer.vocab_size()

        new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
        expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
        actual_new_ids = sorted(new_tokens.keys())

        if expected_new_ids != actual_new_ids:
            raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")

        # Token pieces that were added to the base vocabulary.
        self.added_tokens_dict = added_tokens
        self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
        self.vocab_size_base = vocab_size
        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
        self.fname_tokenizer = fname_tokenizer

    def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        tokenizer = self.sentencepiece_tokenizer
        for i in range(tokenizer.vocab_size()):
            piece = tokenizer.IdToPiece(i)
            text = piece.encode("utf-8")
            score: float = tokenizer.GetScore(i)

            toktype = gguf.TokenType.NORMAL
            if tokenizer.IsUnknown(i):
                toktype = gguf.TokenType.UNKNOWN
            if tokenizer.IsControl(i):
                toktype = gguf.TokenType.CONTROL
            if tokenizer.IsUnused(i):
                toktype = gguf.TokenType.UNUSED
            if tokenizer.IsByte(i):
                toktype = gguf.TokenType.BYTE

            yield text, score, toktype

    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        for text in self.added_tokens_list:
            score = -1000.0
            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED

    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        yield from self.sentencepiece_tokens()
        yield from self.added_tokens()

    def __repr__(self) -> str:
        return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"


class LlamaHfVocab(Vocab):
    tokenizer_model = "llama"
    name = "hfft"

    def __init__(self, base_path: Path):
        fname_tokenizer = base_path / 'tokenizer.json'
        # if this fails, FileNotFoundError propagates to caller
        with open(fname_tokenizer, encoding='utf-8') as f:
            tokenizer_json = json.load(f)

        # pre-check so we know if we need transformers
        tokenizer_model: dict[str, Any] = tokenizer_json['model']
        is_llama3 = (
            tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
            and not tokenizer_model.get('byte_fallback', True)
        )
        if is_llama3:
            raise TypeError('Llama 3 must be converted with BpeVocab')

        if not is_llama3 and (
            tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False)
            or tokenizer_json['decoder']['type'] != 'Sequence'
        ):
            raise FileNotFoundError('Cannot find Llama BPE tokenizer')

        try:
            from transformers import AutoTokenizer
        except ImportError as e:
            raise ImportError(
                "To use LlamaHfVocab, please install the `transformers` package. "
                "You can install it with `pip install transformers`."
            ) from e

        # Allow the tokenizer to default to slow or fast versions, but
        # explicitly use only local files.
        self.tokenizer = AutoTokenizer.from_pretrained(
            base_path,
            cache_dir=base_path,
            local_files_only=True,
        )
        assert self.tokenizer.is_fast  # assume tokenizer.json is used

        # Initialize lists and dictionaries for added tokens
        self.added_tokens_list = []
        self.added_tokens_dict = dict()
        self.added_tokens_ids = set()

        # Process added tokens
        for tok, tokidx in sorted(
            self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]
        ):
            # Only consider added tokens that are not in the base vocabulary
            if tokidx >= self.tokenizer.vocab_size:
                self.added_tokens_list.append(tok)
                self.added_tokens_dict[tok] = tokidx
                self.added_tokens_ids.add(tokidx)

        # Store special tokens and their IDs
        self.specials = {
            tok: self.tokenizer.get_vocab()[tok]
            for tok in self.tokenizer.all_special_tokens
        }
        self.special_ids = set(self.tokenizer.all_special_ids)

        # Set vocabulary sizes
        self.vocab_size_base = self.tokenizer.vocab_size
        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)

        self.fname_tokenizer = fname_tokenizer

    def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        reverse_vocab = {
            id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
        }

        for token_id in range(self.vocab_size_base):
            # Skip processing added tokens here
            if token_id in self.added_tokens_ids:
                continue

            # Convert token text to bytes
            token_text = reverse_vocab[token_id].encode("utf-8")

            # Yield token text, score, and type
            yield token_text, self.get_token_score(token_id), self.get_token_type(
                token_id, token_text, self.special_ids  # Reuse already stored special IDs
            )

    def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType:
        # Special case for byte tokens
        if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
            return gguf.TokenType.BYTE

        # Determine token type based on whether it's a special token
        return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL

    def get_token_score(self, token_id: int) -> float:
        # Placeholder for actual logic to determine the token's score
        return -1000.0

    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        for text in self.added_tokens_list:
            if text in self.specials:
                toktype = self.get_token_type(self.specials[text], b'', self.special_ids)
                score = self.get_token_score(self.specials[text])
            else:
                toktype = gguf.TokenType.USER_DEFINED
                score = -1000.0

            yield text.encode("utf-8"), score, toktype

    def has_newline_token(self):
        return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab

    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        yield from self.hf_tokens()
        yield from self.added_tokens()

    def __repr__(self) -> str:
        return f"<LlamaHfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
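
# Illustrative usage sketch (not part of the module itself): roughly how a conversion
# script might combine one of the Vocab implementations above with SpecialVocab when
# writing a GGUF file. The model directory, output path, and architecture name are
# hypothetical placeholders; the GGUFWriter(path, arch) call is an assumption about
# its constructor.
#
#     from pathlib import Path
#     from gguf import GGUFWriter
#     from gguf.vocab import BpeVocab, SpecialVocab
#
#     model_dir = Path("path/to/hf-model")        # hypothetical checkout directory
#     writer = GGUFWriter("model.gguf", "llama")  # assumed signature
#
#     vocab = BpeVocab(model_dir)
#     for text, score, toktype in vocab.all_tokens():
#         ...  # collect and pass to the writer's token-list methods
#
#     special_vocab = SpecialVocab(model_dir, load_merges=True, n_vocab=vocab.vocab_size)
#     special_vocab.add_to_gguf(writer)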