o
    }oiE#                     @   s   d dl Z d dlZd dlmZ d dlmZ d dlmZmZ d dl	m
Z
 d dlmZmZ d dlmZ dgZd	Zd
ZdZdZdZdZdZeeeeeegZdZG dd de
ZG dd deZdedee defddZdS )    N)cached_property)Path)DictList)AggregateTokenizer)SentencePieceTokenizercreate_spt_model)loggingCanaryTokenizerz<|startoftranscript|>z<|endoftext|>z<pad>z<|nospeech|>z<|pnc|>z	<|nopnc|>z<|startofcontext|>
spl_tokensc                
       s   e Zd ZdZdef fddZedefddZedefdd	Z	edefd
dZ
edefddZdee f fddZdee f fddZdee fddZdedee fddZdd Ze	ddee deeB dedefddZ  ZS ) r
   zZ
    Thin wrapper around AggregateTokenizer to provide quick access to special tokens
    
tokenizersc                    sV   t  | i | _|t jD ]}|dr|ds|tkr(| j|td| j|< qd S )N<||>)lang_id)	super__init__special_tokensCANARY_SPECIAL_TOKENIZERvocab
startswithendswith
CANARY_PADtoken_to_id)selfr   special	__class__ g/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/common/tokenizers/canary_tokenizer.pyr   -   s   zCanaryTokenizer.__init__returnc                 C   
   | j t S N)r   
CANARY_EOSr   r   r   r   eos_id7      
zCanaryTokenizer.eos_idc                 C   r    r!   )r   
CANARY_BOSr#   r   r   r   bos_id;   r%   zCanaryTokenizer.bos_idc                 C   r    r!   )r   CANARY_NOSPEECHr#   r   r   r   nospeech_id?   r%   zCanaryTokenizer.nospeech_idc                 C   r    r!   )r   r   r#   r   r   r   pad_idC   r%   zCanaryTokenizer.pad_idc                    s0  |  }| |}g }g }d}d}	d}
|D ]^}|
dkr,|t|k r,|||  |d7 }|t ||7 }|t ||7 }|
d7 }
|
|	 dkro|
dkro|t|k ro|||  |d7 }|t|k rn|||  |d7 }q|d7 }q|t|k r||d  |t|k r|d |d kr||d  |S )Nr         )split_tokenize_special_promptlenappendr   text_to_ids)r   text_without_timestamps	time_textr   trans_wordstime_idsword_ids
result_ids
time_indextimestamp_every_n_words
word_indexwordr   r   r   _text_with_timestamps_to_idsG   s6   
 
z,CanaryTokenizer._text_with_timestamps_to_idsc                    sR   t d}d||}t|}|st ||S |d| }| 	|||S )Nz	<\|\d+\|> )
recompilejoinfindallboolr   r2   substripr=   )r   text_no_eosr   time_patternr4   has_timestampr3   r   r   r   "_text_to_ids_maybe_with_timestampso   s   
z2CanaryTokenizer._text_to_ids_maybe_with_timestampsc                 C   sV   |t kr	| |S t|| j}|tr%| |d tt  || jg S | ||S r!   )	r   r/   _map_canary1_to_canary2_langlangsr   r"   rI   r0   r$   )r   textr   r   r   r   r2   y   s   

"zCanaryTokenizer.text_to_idsrL   c           	      C   s   g }| tr>|| jt  |ttd }|t}|d|  }r>|dd dd}|	| 
|| ||d }|d}t|D ]}|d|dd  }|| j|  |t|d }qGt|dksoJ ||S )	z
        Tokenize the input special prompt of Canary family of models.

        Required because otherwise self.text_to_ids() returns a different result than what Canary had been trained with.
        Nr      r   r>   >r+   r   )r   CANARY2_BOCTXr1   r   r0   findr&   r.   replaceextendr2   countrange)	r   rL   ansctx_end_idxdecoder_ctxtarget_langnum_special_tokens_tokenr   r   r   r/      s    


z(CanaryTokenizer._tokenize_special_promptc                 C   s.   | j d| dd  }r|S td| d)Nr   r   zToken z not found in tokenizer.)r   getKeyError)r   r[   token_idr   r   r   spl_token_to_id   s   zCanaryTokenizer.spl_token_to_idFtokens	model_dirforce_rebuildc           	         s   |rt d dD ]}tj|rt| q	td t fdd| D  } t	t
| } t|}|jddd |d }d	| }|| |d
 }tt|t| d ddt|| d tt|}|S )NzBuilding special tokenizer)tokenizer.modelztokenizer.vocabz	vocab.txttrain_text.txtz<\|.+\|>c                    s*   g | ]}  |d u rd| dn|qS )Nr   r   )match).0t
spl_tok_rer   r   
<listcomp>   s   * z;CanaryTokenizer.build_special_tokenizer.<locals>.<listcomp>T)exist_okparentsrd   
rc   r,   r-   F)
vocab_sizesample_sizedo_lower_case
output_diruser_defined_symbols)r	   infoospathexistsremover?   r@   DEFAULT_TOKENSlistdictfromkeysr   mkdirrA   
write_textr   strr0   r   )	r`   ra   rb   filerq   	text_path
train_text
model_pathspl_tokenizerr   rh   r   build_special_tokenizer   s2   





z'CanaryTokenizer.build_special_tokenizer)F)__name__
__module____qualname____doc__r   r   r   intr$   r'   r)   r*   ry   r=   rI   r2   r~   r/   r_   staticmethodr   r   rC   r   r   __classcell__r   r   r   r   r
   (   s6    
(
c                   @   sX   e Zd ZdZedefddZedefddZedefddZedefd	d
Z	dS )CanaryBPETokenizerz
    Thin wrapper around SPE tokenizer that overwrites SPE's BOS/EOS/PAD with Canary's special tokens
    for compatibility with CanaryTokenizer (aggregate).
    r   c                 C   
   |  tS r!   )r   r"   r#   r   r   r   r$      r%   zCanaryBPETokenizer.eos_idc                 C   r   r!   )r   r&   r#   r   r   r   r'      r%   zCanaryBPETokenizer.bos_idc                 C   r   r!   )r   r(   r#   r   r   r   r)      r%   zCanaryBPETokenizer.nospeech_idc                 C   r   r!   )r   r   r#   r   r   r   r*      r%   zCanaryBPETokenizer.pad_idN)
r   r   r   r   r   r   r$   r'   r)   r*   r   r   r   r   r      s    r   langavailable_langsr   c                 C   sT   t | dks
| |v r| S ddddd|  }d ur ||v r |S td|  d| )	Nr,   zen-USzes-ESzfr-FRzde-DE)enesfrdezUnsupported language: 'z&' for CanaryTokenizer with languages: )r0   r\   RuntimeError)r   r   mappedr   r   r   rJ      s   rJ   )rt   r?   	functoolsr   pathlibr   typingr   r   6nemo.collections.common.tokenizers.aggregate_tokenizerr   :nemo.collections.common.tokenizers.sentencepiece_tokenizerr   r   
nemo.utilsr	   __all__r&   r"   r   r(   
CANARY_PNCCANARY_NOPNCrO   rx   r   r
   r   r~   ry   rJ   r   r   r   r   <module>   s,    