o
    i7                     @   s  d dl Z d dlZd dlZd dlmZmZ d dlmZmZ d dl	m
Z
mZmZmZ d dlZi ddddd	d
dddddddddddddddddddddd d!d"d#d$d%d&i d'd(d)d*d+d,d-d.d/d0d1d2d3d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCdDdEdFdGdHi dIdJdKdLdMdNdOdPdQdRdSdTdUdVdWdXdYdZd[d\d]d^d_d`dadbdcdddedfdgdhdidji dkdldmdndodpdqdrdsdtdudvdwdxdydzd{d|d}d~ddddddddddddddi ddddddddddddddddddddddddddddddddddi ddddddddddddddddddddddēddƓddȓddʓdd̓dd͓ddΓddddҜZi ddԄ e D dddddddd7d7dddd՜ZddddddddddddZdddddZeG dd dZeddddededefddZedddddddddededee dee dee dee defddZdS )    N)	dataclassfield)cached_property	lru_cache)DictListOptionalTupleenenglishzhchinesedegermanesspanishrurussiankokoreanfrfrenchjajapanesept
portuguesetrturkishplpolishcacatalannldutchararabicsvswedishititalianid
indonesianhihindififinnishvi
vietnamesehehebrewuk	ukrainianelgreekmsmalaycsczechroromaniandadanishhu	hungariantatamilno	norwegianththaiururduhrcroatianbg	bulgarianlt
lithuanianlalatinmimaoriml	malayalamcywelshskslovaktetelugufapersianlvlatvianbnbengalisrserbianazazerbaijanisl	slovenianknkannadaetestonianmk
macedonianbrbretoneubasqueis	icelandichyarmeniannenepalimn	mongolianbsbosniankkkazakhsqalbanianswswahiliglgalicianmrmarathipapunjabisisinhalakmkhmersnshonayoyorubasosomaliaf	afrikaansococcitankageorgianbe
belarusiantgtajiksdsindhigugujaratiamamharicyiyiddishlolaouzuzbekfofaroesehtzhaitian creolepspashtotkturkmennnnynorskmtmaltesesasanskritlbluxembourgishmymyanmarbotibetantltagalogmgmalagasyasassamesetttatarhawhawaiianlnlingalahahausababashkirjwjavanesesu	sundaneseyue	cantoneseminnanwuyudialectzh/enen/zh)r   r   r   c                 C   s   i | ]\}}||qS  r   ).0codelanguager   r   c/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/sense_voice/whisper_lib/tokenizer.py
<dictcomp>y   s    r   )burmese	valencianflemishhaitianletzeburgeschpushtopanjabi	moldavianmoldovan	sinhalese	castilianmandarinASRAEDSERSpeech/SpeechBGM/BGMLaughter	/LaughterApplause	/Applause)r   r   r   r   r   r   r   r   r   r   r   HAPPYSADANGRYNEUTRAL)r   r   r   r   c                   @   s  e Zd ZU dZejed< eed< dZe	e
 ed< dZe	e
 ed< dZee ed< eed	Zee
ef ed
< dd Zdd Zdee de
fddZdee de
fddZdefddZedefddZedefddZedefddZedefddZedefdd Zedefd!d"Zedefd#d$Zedefd%d&Z edefd'd(Z!edefd)d*Z"edefd+d,Z#d-d. Z$edee fd/d0Z%edee
 fd1d2Z&edee fd3d4Z'edee fd5d6Z(d7ee fd8d9Z)d7ee fd:d;Z*d7ee fd<d=Z+dS )>	TokenizerzIA thin wrapper around `tiktoken` providing quick access to special tokensencodingnum_languagesNr   taskr   sot_sequence)default_factoryspecial_tokensc           	      C   s   | j jD ]}| j |}|| j|< q| jd }| jd }| jd }tt d | j }|g}| jd urM| jdkr@|	| j
 n|	|d || j  | jd ur`| jdkrY|n|}|	| t|| _d S )N<|startoftranscript|><|translate|><|transcribe|>nospeech   
transcribe)r   special_tokens_setencode_single_tokenr   tuple	LANGUAGESkeysr   r   append	no_speechindexr   r   )	selfspecialspecial_tokensot	translater  langsr   
task_tokenr   r   r   __post_init__   s    






zTokenizer.__post_init__c                 K      | j j|fi |S N)r   encode)r  textkwargsr   r   r   r     s   zTokenizer.encode	token_idsreturnc                    s&    fdd|D } j j|fi |S )Nc                    s   g | ]	}| j k r|qS r   )timestamp_begin)r   tr  r   r   
<listcomp>   s    z$Tokenizer.decode.<locals>.<listcomp>r   decoder  r  r  r   r  r   r      s   zTokenizer.decodec                 K   r  )z
        Timestamp tokens are above other special tokens' id range and are ignored by `decode()`.
        This method decodes given tokens with timestamps tokens annotated, e.g. "<|1.08|>".
        r  r!  r   r   r   decode_with_timestamps   s   z Tokenizer.decode_with_timestampsc                 C      | j jS r  )r   n_vocabr  r   r   r   get_vocab_size   s   zTokenizer.get_vocab_sizec                 C   r#  r  )r   	eot_tokenr  r   r   r   eot   s   zTokenizer.eotc                 C   
   | j d S )Nr   r   r  r   r   r   r        
zTokenizer.transcribec                 C   r(  )Nr   r)  r  r   r   r   r     r*  zTokenizer.translatec                 C   r(  Nr   r)  r  r   r   r   r     r*  zTokenizer.sotc                 C   r(  r+  r)  r  r   r   r   	sot_sense   r*  zTokenizer.sot_sensec                 C   r(  )N<|startoflm|>r)  r  r   r   r   sot_lm   r*  zTokenizer.sot_lmc                 C   r(  )N<|startofprev|>r)  r  r   r   r   sot_prev   r*  zTokenizer.sot_prevc                 C   r(  )N<|nospeech|>r)  r  r   r   r   r
     r*  zTokenizer.no_speechc                 C   r(  )N<|notimestamps|>r)  r  r   r   r   no_timestamps   r*  zTokenizer.no_timestampsc                 C   r(  )Nz<|0.00|>r)  r  r   r   r   r     r*  zTokenizer.timestamp_beginc                 C   s   | j du r	td| | j S )zGReturns the token id corresponding to the value of the `language` fieldNz6This tokenizer does not have language token configured)r   
ValueErrorto_language_tokenr  r   r   r   language_token   s   
zTokenizer.language_tokenc                 C   s.   | j d| dd  }r|S td| d)N<||>z	Language z not found in tokenizer.)r   getKeyError)r  r   tokenr   r   r   r5    s   zTokenizer.to_language_tokenc                 C   sB   g }| j  D ]\}}|dtv r|| qt|d | j S )N<|>)r   itemsstripr  r	  r  r   )r  resultr;  token_idr   r   r   all_language_tokens	  s   
zTokenizer.all_language_tokensc                    s   t  fdd jD S )Nc                 3   s"    | ]}  |gd V  qdS )r<  N)r   r>  )r   _lr  r   r   	<genexpr>  s     z/Tokenizer.all_language_codes.<locals>.<genexpr>)r  rA  r  r   r  r   all_language_codes  s   zTokenizer.all_language_codesc                 C   s   t t| j| jg S r  )r  listr   r3  r  r   r   r   #sot_sequence_including_notimestamps  s   z-Tokenizer.sot_sequence_including_notimestampsc                 C   s   t d}|d 7 }td}tdd |D sJ | jdd | jdd h}|t | D ]$}| j|| jd	| fD ]}t|d
ksK||v rR||d  q?q/tt	|S )u  
        Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech
        annotations, to prevent sampling texts that are not actually spoken in the audio, e.g.

        - ♪♪♪
        - ( SPEAKING FOREIGN LANGUAGE )
        - [DAVID] Hey there,

        keeping basic punctuations like commas, periods, question marks, exclamation points, etc.
        u#   "#()*+/:;<=>@[\]^_`{|}~「」『』uK   << >> <<< >>> -- --- -( -[ (' (" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪u   ♩♪♫♬♭♮♯c                 s   s,    | ]}d t |  kodkn  V  qdS )i@&  i&  N)ord)r   cr   r   r   rC  -  s   * z.Tokenizer.non_speech_tokens.<locals>.<genexpr>z -r   z ' r  )
rE  splitsetallr   r  lenaddr  sorted)r  symbolsmiscellaneousr?  symboltokensr   r   r   non_speech_tokens  s    
zTokenizer.non_speech_tokensrS  c                 C   s   | j dv r
| |S | |S )N>   r   r   r   rF   r   r   )r   split_tokens_on_unicodesplit_tokens_on_spaces)r  rS  r   r   r   split_to_word_tokens;  s   


zTokenizer.split_to_word_tokensc           
      C   s   |  |}d}g }g }g }d}|D ]-}|| |  |}	||	vs,|||	|  |kr>||	 || g }|t|	7 }q||fS )Nu   �r   )r"  r	  r  rM  )
r  rS  decoded_fullreplacement_charwordsword_tokenscurrent_tokensunicode_offsetr;  decodedr   r   r   rU  D  s&   




z!Tokenizer.split_tokens_on_unicodec                 C   s   |  |\}}g }g }t||D ]=\}}|d | jk}|d}	| tjv }
|s3|	s3|
s3t|dkr>|| || q|d | |d< |d 	| q||fS )Nr   rI  )
rU  zipr'  
startswithr>  stringpunctuationrM  r	  extend)r  rS  subwordssubword_tokens_listrZ  r[  subwordsubword_tokensr  
with_spacerc  r   r   r   rV  ]  s   

z Tokenizer.split_tokens_on_spaces),__name__
__module____qualname____doc__tiktokenEncoding__annotations__intr   r   strr   r   r	   r   dictr   r   r  r  r   r   r"  r%  r   r'  r  r  r  r,  r.  r0  r
  r3  r  r6  r5  rA  rD  rF  rT  rW  rU  rV  r   r   r   r   r      s^   
 
!	r   )maxsizegpt2c   namer   
vocab_pathc                 C   s  |d u rt jt jtd|  d}dd dd t|D D }t|}i }	 dd	gdd tt	 d | D dd tt	 D dd tt	 D dddddddd t
ddD dd t
dD }|D ]
}|||< |d7 }qrtjt j||d||dS )Nassetsz	.tiktokenc                 S   s    i | ]\}}t |t|qS r   )base64	b64decoderq  )r   r;  rankr   r   r   r   u  s    z get_encoding.<locals>.<dictcomp>c                 s   s    | ]	}|r|  V  qd S r  )rJ  )r   liner   r   r   rC  w  s    zget_encoding.<locals>.<genexpr>Fz<|endoftext|>r   c                 S      g | ]}d | dqS r7  r8  r   r   langr   r   r   r        z get_encoding.<locals>.<listcomp>r   r   r-  r/  r1  r2  c                 S      g | ]}d |d ddqS r7  g{Gz?z.2fr8  r   r   ir   r   r   r        i  c                 S   r~  r  r   r  r   r   r   r    r  c                 S   r~  r  r   )r   audio_eventr   r   r   r    r  c                 S   r~  r  r   )r   emotionr   r   r   r    r  c                 S   r~  )z<|SPECIAL_TOKEN_r8  r   r  r   r   r   r    r  r  3   c                 S   r  r  r   r  r   r   r   r    r  zJ's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+)rw  explicit_n_vocabpat_strmergeable_ranksr   )ospathjoindirname__file__openrM  rE  r  r  rangeAUDIO_EVENTEMOTIONrn  ro  basename)rw  r   rx  ranksr$  r   specialsr;  r   r   r   get_encodingp  sV   	


r  )r   r   r   encoding_pathrx  multilingualr   r   r  r  c                C   s   |d ur!|  }|tvr!|tv rt| }n|dkrntd| | r.d}|p(d}|p,d}nd}d }d }|d ur:|}t|||d}t||||dS )	Nr  zUnsupported language: r  r
   r  ru  )rw  r   rx  )r   r   r   r   )lowerr  TO_LANGUAGE_CODEr4  r  r   )r  r   r   r   r  rx  encoding_namer   r   r   r   get_tokenizer  s&   


r  )ru  rv  N)rz  r  rb  dataclassesr   r   	functoolsr   r   typingr   r   r   r	   rn  r  r=  r  r  r  r   rr  rq  r  boolr  r   r   r   r   <module>   s&   	
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefgm Q6