o
    Xim&                     @   s$  d dl Z d dlmZ d dlmZ d dlmZmZmZm	Z	 d dl
Zd dlZd dlmZ i dddd	d
dddddddddddddddddddddd d!d"d#d$d%d&d'i d(d)d*d+d,d-d.d/d0d1d2d3d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCdDdEdFdGdHdIi dJdKdLdMdNdOdPdQdRdSdTdUdVdWdXdYdZd[d\d]d^d_d`dadbdcdddedfdgdhdidjdki dldmdndodpdqdrdsdtdudvdwdxdydzd{d|d}d~dddddddddddddddi dddddddddddddddddddddddddddddddddddddddddddddddddZi dd e D dddddddd8d8dddÜZeddōG ddǄ dǃZeddȍddefdd̄Zeddȍddd͜dedee dee defddӄZdS )    N)	dataclass)	lru_cache)ListOptionalTupleUnion)GPT2TokenizerFastenenglishzhchinesedegermanesspanishrurussiankokoreanfrfrenchjajapanesept
portuguesetrturkishplpolishcacatalannldutchararabicsvswedishititalianid
indonesianhihindififinnishvi
vietnamesehehebrewuk	ukrainianelgreekmsmalaycsczechroromaniandadanishhu	hungariantatamilno	norwegianththaiururduhrcroatianbg	bulgarianlt
lithuanianlalatinmimaoriml	malayalamcywelshskslovaktetelugufapersianlvlatvianbnbengalisrserbianazazerbaijanisl	slovenianknkannadaetestonianmk
macedonianbrbretoneubasqueis	icelandichyarmeniannenepalimn	mongolianbsbosniankkkazakhsqalbanianswswahiliglgalicianmrmarathipapunjabisisinhalakmkhmersnshonayoyorubasosomaliaf	afrikaansococcitankageorgianbe
belarusiantgtajiksdsindhigugujaratiamamharicyiyiddishlolaouzuzbekfofaroesehtzhaitian creolepspashtotkturkmennnnynorskmtmaltesesanskritluxembourgishmyanmartibetantagalogmalagasyassamesetatarhawaiianlingalahausabashkirjavanese	sundanese)salbmybotlmgastthawlnhabajwsuc                 C   s   i | ]\}}||qS  r   ).0codelanguager   r   E/home/ubuntu/.local/lib/python3.10/site-packages/whisper/tokenizer.py
<dictcomp>r   s    r   r   r   )burmese	valencianflemishhaitianletzeburgeschpushtopanjabi	moldavianmoldovan	sinhalese	castilianT)frozenc                   @   s  e Zd ZU dZded< ee ed< ee ed< dd Z	de
eee ejejf fd	d
ZdefddZee defddZee defddZee defddZee defddZee defddZee defddZee defddZee defddZee dee fddZee dee fd d!Zee dee fd"d#Zee dee fd$d%Zdefd&d'Z d(S ))	TokenizerzRA thin wrapper around `GPT2TokenizerFast` providing quick access to special tokensr   	tokenizerr   sot_sequencec                 K      | j j|fi |S N)r   encode)selftextkwargsr   r   r   r         zTokenizer.encode	token_idsc                 K   r   r   )r   decode)r   r   r   r   r   r   r      r   zTokenizer.decodereturnc                    sr   g g}|D ]%}| j kr#d| j  d dd}|| |g  q|d | q fdd|D }d|S )	z
        Timestamp tokens are above the special tokens' id range and are ignored by `decode()`.
        This method decodes given tokens with timestamps tokens annotated, e.g. "<|1.08|>".
        <|g{Gz?z.2f|>c                    s&   g | ]}t |tr|n j|qS r   )
isinstancestrr   r   )r   sr   r   r   
<listcomp>   s   & z4Tokenizer.decode_with_timestamps.<locals>.<listcomp> )timestamp_beginappendjoin)r   tokensoutputstoken	timestampr   r   r   decode_with_timestamps   s   


z Tokenizer.decode_with_timestampsc                 C   s   | j jS r   )r   eos_token_idr   r   r   r   eot   s   zTokenizer.eotc                 C   
   |  dS )N<|startoftranscript|>_get_single_token_idr   r   r   r   sot      
zTokenizer.sotc                 C   r   )N<|startoflm|>r  r   r   r   r   sot_lm   r  zTokenizer.sot_lmc                 C   r   )N<|startofprev|>r  r   r   r   r   sot_prev   r  zTokenizer.sot_prevc                 C   r   )N<|nospeech|>r  r   r   r   r   	no_speech   r  zTokenizer.no_speechc                 C   r   )N<|notimestamps|>r  r   r   r   r   no_timestamps   r  zTokenizer.no_timestampsc                 C   s   | j jd d S )Nr      )r   all_special_idsr   r   r   r   r      s   zTokenizer.timestamp_beginc                 C   sX   | j du r	tdtt| jj| jj}d| j  d}||v r#|| S td| j  d)zGReturns the token id corresponding to the value of the `language` fieldNz6This tokenizer does not have language token configuredr   r   z	Language z not found in tokenizer.)r   
ValueErrordictzipr   additional_special_tokensadditional_special_tokens_idsKeyError)r   additional_tokens	candidater   r   r   language_token   s   
zTokenizer.language_tokenc                 C   s@   g }t | jj| jjD ]\}}|dtv r|| qt|S )N<|>)r  r   r  r  strip	LANGUAGESr   tuple)r   resultr   token_idr   r   r   all_language_tokens   s   
zTokenizer.all_language_tokensc                    s   t  fdd jD S )Nc                 3   s"    | ]}  |gd V  qdS )r  N)r   r  )r   lr   r   r   	<genexpr>   s     z/Tokenizer.all_language_codes.<locals>.<genexpr>)r  r  r   r   r   r   all_language_codes   s   zTokenizer.all_language_codesc                 C   s   t t| j| jg S r   )r  listr   r  r   r   r   r   #sot_sequence_including_notimestamps   s   z-Tokenizer.sot_sequence_including_notimestampsc                 C   s   t d}|d 7 }td}tdd |D sJ | jdd | jdd h}|t | D ]$}| j|| jd	| fD ]}t|d
ksK||v rR||d  q?q/tt	|S )u  
        Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech
        annotations, to prevent sampling texts that are not actually spoken in the audio, e.g.

        - ♪♪♪
        - ( SPEAKING FOREIGN LANGUAGE )
        - [DAVID] Hey there,

        keeping basic punctuations like commas, periods, question marks, exclamation points, etc.
        u#   "#()*+/:;<=>@[\]^_`{|}~「」『』uK   << >> <<< >>> -- --- -( -[ (' (" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪u   ♩♪♫♬♭♮♯c                 s   s,    | ]}d t |  kodkn  V  qdS )i@&  i&  N)ord)r   cr   r   r   r!     s   * z.Tokenizer.non_speech_tokens.<locals>.<genexpr>z -r   z ' r  )
r#  splitsetallr   r   lenaddr  sorted)r   symbolsmiscellaneousr  symbolr   r   r   r   non_speech_tokens   s     zTokenizer.non_speech_tokensc                 C   s.   | j |}t|dksJ | d|d S )Nr  z! is not encoded as a single tokenr   )r   r   r+  )r   r   r   r   r   r   r    s   zTokenizer._get_single_token_idN)!__name__
__module____qualname____doc____annotations__r   r   r   intr   r   r   npndarraytorchTensorr   r   propertyr   r   r  r  r	  r  r  r   r  r  r"  r$  r1  r  r   r   r   r   r      sZ   
 "
r   )maxsizegpt2namec                 C   sp   dt jd< t jt jtd| }t|}dgdd t	 D ddd	d
dd}|
t|d |S )NfalseTOKENIZERS_PARALLELISMassetsr  c                 S   s   g | ]}d | dqS )r   r   r   )r   langr   r   r   r     s    z#build_tokenizer.<locals>.<listcomp>z<|translate|>z<|transcribe|>r  r  r
  r  )r  )osenvironpathr   dirname__file__r   from_pretrainedr  keysadd_special_tokensr  )r?  rF  r   specialsr   r   r   build_tokenizer  s*   

rM  )taskr   multilingualrN  r   r   c                C   s   |d ur|  }|tvr|tv rt| }ntd| | r)d}|p#d}|p'd}nd}d }d }t|d}|j}|d }|d }|d	 }tt }	|g}
|d ur\|
|d |		|  |d urk|
|dkrh|n| t
||t|
d
S )NzUnsupported language: rO  
transcriber	   r>  )r?  r  i)r   r   r   )lowerr  TO_LANGUAGE_CODEr  rM  r  r  rJ  r   indexr   )rO  rN  r   tokenizer_namer   r  r  	translaterP  langsr   r   r   r   get_tokenizer&  s2   


rX  )r>  )rD  dataclassesr   	functoolsr   typingr   r   r   r   numpyr8  r:  transformersr   r  itemsrS  r   r   rM  boolrX  r   r   r   r   <module>   s   	
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVg 