o
    㥵ig                     @   s   d dl Z d dlZd dlZd dlZd dlmZ d dlZeeZ	d
g dZdZdZdZdZd	Zd
ZdZdZdZdZdZdZdZdZdZdZeeedZdZdd edD ZeeeeeeeeeeeeeeegeZ G dd dZ!dS )    N)Path|)z(?i:'s|'t|'re|'ve|'m|'ll|'d)z\p{P}z[^\r\n\p{L}\p{N}]?\p{L}+z\p{N}z ?[^\s\p{L}\p{N}]+[\r\n]*z
\s*[\r\n]+z
\s+(\?!\S)z\s+i z<|begin_of_text|>z<|end_of_text|>z<|pad|>z<|im_start|>z
<|im_end|>z<|phoneme_start|>z<|phoneme_end|>z<|tool_call_start|>z<|tool_call_end|>z<|text|>z	<|voice|>z<|interleave|>z<|audio_start|>z<|audio_end|>z	<|audio|>)textvoice
interleavez<|semantic:{i}|>c                 C   s   g | ]}t j|d qS ))i)SEMANTIC_TOKEN_TEMPLATEformat).0r    r   I/home/ubuntu/.local/lib/python3.10/site-packages/fish_speech/tokenizer.py
<listcomp>1   s    r   i   c                   @   s   e Zd Zefdedee ddfddZedd Zed	d
 Z	e
dedeeef fddZdedefddZddedeee B dee fddZdee defddZdefddZe
defddZdS )FishTokenizer
model_pathspecial_tokensreturnNc                    s   |  |}t|  fddt|D | _i | _d}|D ] }|dr<ttd|	d}| j| | j|< ||kr<|}q| jd | _
| j| | _tjjt|jt|| jd| _d S )Nc                    s   i | ]	\}}| | qS r   r   )r
   r   tokenspecial_token_beginr   r   
<dictcomp>N   s    z*FishTokenizer.__init__.<locals>.<dictcomp>r   z<|semantic:z<\|semantic:(\d+)\|>   )namepat_strmergeable_ranksr   )load_tiktoken_bpelen	enumerateall_special_tokens_with_idssemantic_id_to_token_id
startswithintrematchgroupsemantic_begin_idsemantic_end_idtiktokencoreEncodingr   stemFISH_TIKTOKEN_PATTERN	tkt_model)selfr   r   r   end_idxr   idxr   r   r   __init__I   s0   



zFishTokenizer.__init__c                 C   s   t | jjS N)r   r+   _mergeable_ranksr,   r   r   r   
vocab_sizeh   s   zFishTokenizer.vocab_sizec                 C   s
   t | jS r0   )r   r   r2   r   r   r   num_special_tokensl   s   
z FishTokenizer.num_special_tokenstiktoken_bpe_filec                 C   sL   i }t |   D ]}|sq
| \}}|dkrq
t||t|< q
|S )N=)openread
splitlinessplitr    base64	b64decode)r5   dataliner   rankr   r   r   r   p   s   zFishTokenizer.load_tiktoken_bper   c                 C   s
   | j | S r0   )r   )r,   r   r   r   r   get_token_id|   s   
zFishTokenizer.get_token_idTsallowed_specialc                 C   sz   t |tsJ g }tdt|tD ]}||||t   q|du r(| jj}n|du r/t }t	| jj
||t dg dS )Nr   TF)rB   disallowed_special)start)
isinstancestrranger   TIKTOKEN_MAX_ENCODE_CHARSappendr+   special_tokens_setsetsumencode_batch)r,   rA   rB   subsr   r   r   r   encode   s   
zFishTokenizer.encodetokensc                 C   s   | j |S r0   )r+   decode)r,   rP   r   r   r   rQ      s   zFishTokenizer.decodepathc                 C   s   t |}|jddd t|d d+}| jj D ]\}}t| }|dkr*d}|	| d| d qW d    n1 s@w   Y  t|d	 d}t
j| j|d
dd W d    d S 1 sbw   Y  d S )NT)parentsexist_oktokenizer.tiktokenw r6    
special_tokens.json   F)indentensure_ascii)r   mkdirr7   r+   r1   itemsr;   	b64encoderQ   writejsondumpr   )r,   rR   fr   r?   ar   r   r   save_pretrained   s$   "zFishTokenizer.save_pretrainedc                 C   s^   t | d }| r$t|}t|}W d    n1 sw   Y  nt}tt | d |S )NrZ   rU   )r   existsr7   rb   loadALL_SPECIAL_TOKENSr   )rR   special_tokens_pathrd   r   r   r   r   from_pretrained   s   
zFishTokenizer.from_pretrained)T)__name__
__module____qualname__ri   rF   listr/   propertyr3   r4   staticmethoddictbytesr    r   r@   boolrK   rO   rQ   rf   rk   r   r   r   r   r   H   s*    


$r   )"r;   rb   loggingr!   pathlibr   r&   	getLoggerrl   loggerjoinr*   rH   	BOS_TOKEN	EOS_TOKEN	PAD_TOKENIM_START_TOKENIM_END_TOKENPHONEME_START_TOKENPHONEME_END_TOKENTOOL_CALL_START_TOKENTOOL_CALL_END_TOKENMODALITY_TEXT_TOKENMODALITY_VOICE_TOKENMODALITY_INTERLEAVE_TOKENAUDIO_START_TOKENAUDIO_END_TOKENAUDIO_EMBED_TOKENMODALITY_TOKENSr   rG   SEMANTIC_TOKENSri   r   r   r   r   r   <module>   sf    
