o
    i=O                     @   sB  d dl Z d dlZd dlmZmZmZmZmZmZ d dl	Z
d dlZd dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZmZmZmZmZ d dlmZ d d	lmZ d d
lm Z  d dl!m"Z"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z- d dl.m/Z/ d dl0m1Z1 d dl2m3Z3 d dl4m5Z5 d dl6m7Z7 d dl8m9Z9 d dl:m;Z; d dl<m=Z= d dl>m?Z? d dl@mAZA d dlBmCZC d dlDmEZE d dlFmGZG d dlHmIZI d dlJmKZK d d lLmMZM d d!lNmOZO d d"lPmQZQ d d#lRmSZS d d$lTmUZU d d%lVmWZW d d&lXmYZY d d'lZm[Z[ d d(l\m]Z]m^Z^m_Z_m`Z` eQd)eae/e3e1d*e-d+d,ZbeQd-eaeAd.e?dd/d0ZceQd1eaeEeGd2eCd3d/d4ZdeQd5eae=e;d6e9dd/d0ZeeQd7eaee'e e)e%e+e"e#d8ed9d:ZfeQd;eae7d<e5dd/d0ZgeQd=eaeeeeeed>ed9d:ZheQd?eaeeeeeed>ed9d:ZieQd@eaeeeeeed>ed9d:ZjG dAdB dBeKZkdS )C    N)Callable
CollectionDictListOptionalTuple)check_argument_typescheck_return_type)CTC)
AbsDecoder)
RNNDecoder)&DynamicConvolution2DTransformerDecoder$DynamicConvolutionTransformerDecoder*LightweightConvolution2DTransformerDecoder(LightweightConvolutionTransformerDecoderTransformerDecoder)
AbsEncoder)ConformerEncoder)!ContextualBlockTransformerEncoder)FairseqHubertEncoderFairseqHubertPretrainEncoder)
RNNEncoder)TransformerEncoder)VGGRNNEncoder)FairSeqWav2Vec2Encoder)AbsFrontend)DefaultFrontend)S3prlFrontend)SlidingWindow)AbsPostEncoder)"HuggingFaceTransformersPostEncoder)AbsPreEncoder)LinearProjection)LightweightSincConvs)
AbsSpecAug)SpecAug)AbsNormalize)	GlobalMVN)UtteranceMVN)ESPnetSTModel)AbsTask)g2p_choices)
initialize)ClassChoices)CommonCollateFn) MutliTokenizerCommonPreprocessor)Trainer)get_default_kwargs)NestedDictAction)float_or_noneint_or_nonestr2boolstr_or_nonefrontend)defaultsliding_windows3prlr8   )nameclasses
type_checkr8   specaug)r>   T)r;   r<   r=   r8   optional	normalize)
global_mvnutterance_mvnrB   )r<   r=   r8   r?   
preencoder)sinclinearencoder)	conformertransformercontextual_block_transformervgg_rnnrnnwav2vec2huberthubert_pretrainrK   )r<   r=   r8   postencoder)hugging_face_transformersdecoder)rH   lightweight_convlightweight_conv2ddynamic_convdynamic_conv2drK   extra_asr_decoderextra_mt_decoderc                   @   s>  e Zd ZU dZeed< eeee	e
eeeeg	ZeZedejfddZedejdedeeeeeeejf f  geee eee j!f f f fd	d
Z"edejdede#eeeeej$f geeejf f  fddZ%e	ddededeedf fddZ&e	ddededeedf fddZ'edejde(fddZ)dS )STTask   num_optimizersparserc                 C   s  |j dd}|d}|dg7 }|jdtd dd |jdtd d	d |jd
dd d dg dd |jdtd dd |jdtttdd |jdtttdd |j dd}|jdt	ddd |jdt
dg ddd |jd t
dg dd!d |jd"td d#d |jd$td d%d |jd&td'd( |jd)tg d*d d+d, |jd-ttd d.d, |jd/td d0d |jd1td d2d |jd3td4d5d |jd6td d7d |jd8td4d9d |jd:t
d;d<d |jd=td>d?d | jD ]}|| qd S )@NzTask related)descriptionrequired
token_listz--token_listz4A text mapping int-id to token (for target language))typer8   helpz--src_token_listz4A text mapping int-id to token (for source language)z--initc                 S   s   t |  S )N)r6   lower)x rc   D/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/tasks/st.py<lambda>   s    z+STTask.add_task_arguments.<locals>.<lambda>zThe initialization method)chainerxavier_uniformxavier_normalkaiming_uniformkaiming_normalN)r_   r8   r`   choicesz--input_sizez,The number of input dimension of the featurez
--ctc_confz$The keyword arguments for CTC class.)actionr8   r`   z--model_confz&The keyword arguments for model class.zPreprocess relatedz--use_preprocessorTz"Apply preprocessing to data or notz--token_typebpe)rm   charwordphnz>The target text will be tokenized in the specified level token)r_   r8   rk   r`   z--src_token_typez>The source text will be tokenized in the specified level tokenz
--bpemodelz5The model file of sentencepiece (for target language)z--src_bpemodelz5The model file of sentencepiece (for source language)z--non_linguistic_symbolsz non_linguistic_symbols file path)r_   r`   z	--cleaner)Ntacotronjaconv
vietnamesezApply text cleaning)r_   rk   r8   r`   z--g2pz&Specify g2p method if --token_type=phnz--speech_volume_normalizez/Scale the maximum amplitude to the given value.z	--rir_scpzThe file path of rir scp file.z--rir_apply_prob      ?z-THe probability for applying RIR convolution.z--noise_scpz The file path of noise scp file.z--noise_apply_probz&The probability applying Noise adding.z--noise_db_range13_15z!The range of noise decibel level.z--short_noise_thres      ?znIf len(noise) / len(speech) is smaller than this threshold during dynamic mixing, a warning will be displayed.)add_argument_groupget_defaultadd_argumentr6   r4   r2   r1   r
   r)   r5   strr+   r3   floatclass_choices_listadd_arguments)clsr[   groupr]   class_choicesrc   rc   rd   add_task_arguments   s  


zSTTask.add_task_argumentsargstrainreturnc                 C   s   t  sJ tdddS )Ng        )float_pad_valueint_pad_value)r   r.   )r~   r   r   rc   rc   rd   build_collate_fn\  s   
zSTTask.build_collate_fnc                 C   s"  t  sJ |jrtdi d|d|j|jgd|j|jgd|j|jgd|j	d|j
d|jdt|dr8|jnd d	t|d	rC|jnd
dt|drN|jnd dt|drY|jnd
dt|drd|jnddt|dro|jnddt|drz|jnd dddddg}nd }t|sJ |S )Nr   
token_typer^   bpemodelnon_linguistic_symbolstext_cleanerg2p_typerir_scprir_apply_probrt   	noise_scpnoise_apply_probnoise_db_rangeru   short_noise_thresrv   speech_volume_normalizespeech_namespeech	text_nametextsrc_textrc   )r   use_preprocessorr/   r   src_token_typer^   src_token_listr   src_bpemodelr   cleanerg2phasattrr   r   r   r   r   r   r   r	   )r~   r   r   retvalrc   rc   rd   build_preprocess_fng  sn   
	zSTTask.build_preprocess_fnTF	inference.c                 C   s   |sd}|S d}|S )N)r   r   )r   rc   r~   r   r   r   rc   rc   rd   required_data_names  s
   zSTTask.required_data_namesc                 C   s   |sd}nd}t |sJ |S )N)r   rc   )r	   r   rc   rc   rd   optional_data_names  s
   zSTTask.optional_data_namesc                 C   s  t  sJ t|jtr/t|jdd}dd |D }W d    n1 s$w   Y  t||_nt|jttfr=t|j}ntdt|}t	
d|  |jd urt|jtr|t|jdd}dd |D }W d    n1 sqw   Y  t||_nt|jttfrt|j}ntdt|}t	
d|  nd	\}}|jd u rt|j}|di |j}| }	nd |_i |_d }|j}	|jd urt|j}
|
di |j}nd }|jd urt|j}|di |j}nd }t|d
d d ur
t|j}|di |j}| }	nd }t|j}|dd|	i|j}| }t|dd d ur>t|j }|dd|i|j!}| }nd }t"|j#}|d||d|j$}|d urbt%d||d|j&}nd }t|dd d ur|d urt'|j(}|d||d|j)}nd }t|dd d urt*|j+}|d||d|j,}nd }t-d||||||||||||||d|j.}|j/d urt0||j/ t1|sJ |S )Nzutf-8)encodingc                 S      g | ]}|  qS rc   rstrip.0linerc   rc   rd   
<listcomp>      z&STTask.build_model.<locals>.<listcomp>ztoken_list must be str or listzVocabulary size: c                 S   r   rc   r   r   rc   rc   rd   r     r   zSource vocabulary size: )NNrC   
input_sizerO   )
vocab_sizeencoder_output_size)odimr   rV   rW   )r   src_vocab_sizer7   r>   r@   rC   rF   rO   rQ   ctcrV   rW   r^   r   rc   )2r   
isinstancer^   rz   openlisttupleRuntimeErrorlenlogginginfor   r   frontend_choices	get_classr7   frontend_confoutput_sizer>   specaug_choicesspecaug_confr@   normalize_choicesnormalize_confgetattrpreencoder_choicesrC   preencoder_confencoder_choicesrF   encoder_confpostencoder_choicesrO   postencoder_confdecoder_choicesrQ   decoder_confr
   ctc_confextra_asr_decoder_choicesrV   extra_asr_decoder_confextra_mt_decoder_choicesrW   extra_mt_decoder_confr)   
model_confinitr,   r	   )r~   r   fr^   r   r   r   frontend_classr7   r   specaug_classr>   normalize_classr@   preencoder_classrC   encoder_classrF   r   postencoder_classrO   decoder_classrQ   r   extra_asr_decoder_classrV   extra_mt_decoder_classrW   modelrc   rc   rd   build_model  s   









zSTTask.build_modelN)TF)*__name__
__module____qualname__rZ   int__annotations__r   r   r   r   r   r   r   r   r   r|   r0   trainerclassmethodargparseArgumentParserr   	Namespaceboolr   r   r   rz   r   npndarrayr   torchTensorr   r   arrayr   r   r   r)   r   rc   rc   rc   rd   rX      sp   
  
&'



rX   )lr   r   typingr   r   r   r   r   r   numpyr   r   	typeguardr   r	   espnet2.asr.ctcr
   espnet2.asr.decoder.abs_decoderr   espnet2.asr.decoder.rnn_decoderr   'espnet2.asr.decoder.transformer_decoderr   r   r   r   r   espnet2.asr.encoder.abs_encoderr   %espnet2.asr.encoder.conformer_encoderr   8espnet2.asr.encoder.contextual_block_transformer_encoderr   "espnet2.asr.encoder.hubert_encoderr   r   espnet2.asr.encoder.rnn_encoderr   'espnet2.asr.encoder.transformer_encoderr   #espnet2.asr.encoder.vgg_rnn_encoderr   $espnet2.asr.encoder.wav2vec2_encoderr   !espnet2.asr.frontend.abs_frontendr   espnet2.asr.frontend.defaultr   espnet2.asr.frontend.s3prlr   espnet2.asr.frontend.windowingr   'espnet2.asr.postencoder.abs_postencoderr   =espnet2.asr.postencoder.hugging_face_transformers_postencoderr    %espnet2.asr.preencoder.abs_preencoderr!   espnet2.asr.preencoder.linearr"   espnet2.asr.preencoder.sincr#   espnet2.asr.specaug.abs_specaugr$   espnet2.asr.specaug.specaugr%   espnet2.layers.abs_normalizer&   espnet2.layers.global_mvnr'   espnet2.layers.utterance_mvnr(   espnet2.st.espnet_modelr)   espnet2.tasks.abs_taskr*   espnet2.text.phoneme_tokenizerr+   espnet2.torch_utils.initializer,   espnet2.train.class_choicesr-   espnet2.train.collate_fnr.   espnet2.train.preprocessorr/   espnet2.train.trainerr0    espnet2.utils.get_default_kwargsr1    espnet2.utils.nested_dict_actionr2   espnet2.utils.typesr3   r4   r5   r6   dictr   r   r   r   r   r   r   r   r   rX   rc   rc   rc   rd   <module>   s     



	