o
    iE                     @   s  d dl Z d dlZd dlmZmZmZmZmZmZ d dl	Z
d dlZd dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZmZmZmZ d d
lm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z*m+Z+ d dl,m-Z- d dl.m/Z/ d dl0m1Z1 d dl2m3Z3 d dl4m5Z5 d dl6m7Z7 d dl8m9Z9 d dl:m;Z; d dl<m=Z= d dl>m?Z? d dl@mAZA d dlBmCZC d dlDmEZE d dlFmGZG d dlHmIZI d dlJmKZK d d lLmMZM d d!lNmOZO d d"lPmQZQ d d#lRmSZS d d$lTmUZU d d%lVmWZW d d&lXmYZY d d'lZm[Z[ d d(l\m]Z] d d)l^m_Z_ d d*l`maZa d d+lbmcZc d d,ldmeZe d d-lfmgZg d d.lhmiZi d d/ljmkZk d d0llmmZm d d1lnmoZompZpmqZqmrZr ecd2ese;eAe?e=d3e9d4d5Ztecd6eseQd7eOdd8d9Zuecd:eseWeYd;eUd<d8d=Zvecd>ese7eCd?ead@dAZwecdBeseMeKdCeIdd8d9ZxecdDese$e1e(e&e3e/e5e*e+e-e"dEe dFdAZyecdGeseGdHeEdd8d9ZzecdIeseeeeeeeedJedFdAZ{G dKdL dLe[Z|dS )M    N)Callable
CollectionDictListOptionalTuple)check_argument_typescheck_return_type)CTC)
AbsDecoder)
MLMDecoder)
RNNDecoder)TransducerDecoder)&DynamicConvolution2DTransformerDecoder$DynamicConvolutionTransformerDecoder*LightweightConvolution2DTransformerDecoder(LightweightConvolutionTransformerDecoderTransformerDecoder)
AbsEncoder)BranchformerEncoder)ConformerEncoder)ContextualBlockConformerEncoder)!ContextualBlockTransformerEncoder)FairseqHubertEncoderFairseqHubertPretrainEncoder)LongformerEncoder)
RNNEncoder)TransformerEncoder)VGGRNNEncoder)FairSeqWav2Vec2Encoder)ESPnetASRModel)AbsFrontend)DefaultFrontend)FusedFrontends)S3prlFrontend)SlidingWindow)MaskCTCModel)AbsPostEncoder)"HuggingFaceTransformersPostEncoder)AbsPreEncoder)LinearProjection)LightweightSincConvs)
AbsSpecAug)SpecAug)JointNetwork)AbsNormalize)	GlobalMVN)UtteranceMVN)AbsTask)g2p_choices)
initialize)AbsESPnetModel)ClassChoices)CommonCollateFn)CommonPreprocessor)Trainer)get_default_kwargs)NestedDictAction)float_or_noneint_or_nonestr2boolstr_or_nonefrontend)defaultsliding_windows3prlfusedrA   )nameclasses
type_checkrA   specaug)rH   T)rE   rF   rG   rA   optional	normalize)
global_mvnutterance_mvnrL   )rF   rG   rA   rI   model)espnetmaskctcrN   )rF   rG   rA   
preencoder)sinclinearencoder)	conformertransformercontextual_block_transformercontextual_block_conformervgg_rnnrnnwav2vec2huberthubert_pretrain
longformerbranchformerrY   postencoder)hugging_face_transformersdecoder)rU   lightweight_convlightweight_conv2ddynamic_convdynamic_conv2drY   
transducermlmc                   @   s<  e Zd ZU dZeed< eeee	e
eeegZeZedejfddZedejdedeeeeeeejf f  geee eeej f f f fd	d
Z!edejdede"eeeeej#f geeejf f  fddZ$e	ddededeedf fddZ%e	ddededeedf fddZ&edejde'fddZ(dS )ASRTask   num_optimizersparserc                 C   s  |j dd}|d}|dg7 }|jdtd dd |jdd	d
 d dg dd |jdtd dd |jdtttdd |jdtd dd |j dd}|jdtddd |jdt	dg ddd |jdtd dd |jd td!d" |jd#tg d$d d%d& |jd'tt
d d(d& |jd)td d*d |jd+td d,d |jd-td.d/d |jd0td d1d |jd2td.d3d |jd4t	d5d6d |jd7td8d9d | jD ]}|| qd S ):NzTask related)descriptionrequired
token_listz--token_listzA text mapping int-id to token)typerA   helpz--initc                 S   s   t |  S )N)r?   lower)x rs   E/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/tasks/asr.py<lambda>   s    z,ASRTask.add_task_arguments.<locals>.<lambda>zThe initialization method)chainerxavier_uniformxavier_normalkaiming_uniformkaiming_normalN)ro   rA   rp   choicesz--input_sizez,The number of input dimension of the featurez
--ctc_confz$The keyword arguments for CTC class.)actionrA   rp   z--joint_net_confz.The keyword arguments for joint network class.zPreprocess relatedz--use_preprocessorTz"Apply preprocessing to data or notz--token_typebpe)r}   charwordphnz7The text will be tokenized in the specified level token)ro   rA   r{   rp   z
--bpemodelzThe model file of sentencepiecez--non_linguistic_symbolsz non_linguistic_symbols file path)ro   rp   z	--cleaner)Ntacotronjaconv
vietnamesezApply text cleaning)ro   r{   rA   rp   z--g2pz&Specify g2p method if --token_type=phnz--speech_volume_normalizez/Scale the maximum amplitude to the given value.z	--rir_scpzThe file path of rir scp file.z--rir_apply_prob      ?z-THe probability for applying RIR convolution.z--noise_scpz The file path of noise scp file.z--noise_apply_probz&The probability applying Noise adding.z--noise_db_range13_15z!The range of noise decibel level.z--short_noise_thres      ?znIf len(noise) / len(speech) is smaller than this threshold during dynamic mixing, a warning will be displayed.)add_argument_groupget_defaultadd_argumentr?   r=   r;   r:   r
   r>   strr3   r<   floatclass_choices_listadd_arguments)clsrk   grouprm   class_choicesrs   rs   rt   add_task_arguments   s   


zASRTask.add_task_argumentsargstrainreturnc                 C   s   t  sJ tdddS )Ng        )float_pad_valueint_pad_value)r   r7   )r   r   r   rs   rs   rt   build_collate_fnI  s   
zASRTask.build_collate_fnc                 C   s   t  sJ |jrYt||j|j|j|j|j|jt	|dr|j
nd t	|dr'|jndt	|dr0|jnd t	|dr9|jndt	|drB|jndt	|drK|jnd	t	|drT|jnd d
}nd }t|saJ |S )Nrir_scprir_apply_probr   	noise_scpnoise_apply_probnoise_db_ranger   short_noise_thresr   )r   
token_typern   bpemodelnon_linguistic_symbolstext_cleanerg2p_typer   r   r   r   r   r   speech_volume_normalize)r   use_preprocessorr8   r   rn   r   r   cleanerg2phasattrr   r   r   r   r   r   r   r	   )r   r   r   retvalrs   rs   rt   build_preprocess_fnT  s>   
zASRTask.build_preprocess_fnTF	inference.c                 C   s   |sd}|S d}|S )N)speechtext)r   rs   r   r   r   r   rs   rs   rt   required_data_namesz  s
   zASRTask.required_data_namesc                 C   s   d}t |sJ |S )Nrs   )r	   r   rs   rs   rt   optional_data_names  s   zASRTask.optional_data_namesc                 C   s  t  sJ t|jtr/t|jdd}dd |D }W d    n1 s$w   Y  t||_nt|jttfr=t|j}ntdt|}t	
d|  |jd u ret|j}|di |j}| }nd |_i |_d }|j}|jd urt|j}|di |j}	nd }	|jd urt|j}
|
di |j}nd }t|dd d urt|j}|di |j}| }nd }t|j}|dd|i|j}| }t|d	d d urt|j}|dd|i|j }| }nd }t!|j"}|j"d
kr||fddi|j#}t$|| |j%fi |j&}n|d||d|j#}d }t'd||d|j(}zt)|j*}W n t+yA   t)d}Y nw |d|||	||||||||d|j,}|j-d urbt.||j- t/|siJ |S )Nzutf-8)encodingc                 S   s   g | ]}|  qS rs   )rstrip).0liners   rs   rt   
<listcomp>  s    z'ASRTask.build_model.<locals>.<listcomp>ztoken_list must be str or listzVocabulary size: rP   
input_sizer_   rf   	embed_padr   )
vocab_sizeencoder_output_size)odimr   rN   )r   r@   rH   rJ   rP   rS   r_   ra   ctcjoint_networkrn   rs   )0r   
isinstancern   r   openlisttupleRuntimeErrorlenlogginginfor   frontend_choices	get_classr@   frontend_confoutput_sizerH   specaug_choicesspecaug_confrJ   normalize_choicesnormalize_confgetattrpreencoder_choicesrP   preencoder_confencoder_choicesrS   encoder_confpostencoder_choicesr_   postencoder_confdecoder_choicesra   decoder_confr.   dunitsjoint_net_confr
   ctc_confmodel_choicesrM   AttributeError
model_confinitr4   r	   )r   r   frn   r   frontend_classr@   r   specaug_classrH   normalize_classrJ   preencoder_classrP   encoder_classrS   r   postencoder_classr_   decoder_classra   r   r   model_classrM   rs   rs   rt   build_model  s   






zASRTask.build_modelN)TF))__name__
__module____qualname__rj   int__annotations__r   r   r   r   r   r   r   r   r   r9   trainerclassmethodargparseArgumentParserr   	Namespaceboolr   r   r   r   r   npndarrayr   torchTensorr   r   arrayr   r   r   r    r   rs   rs   rs   rt   rh      sn   
  

&%


rh   )}r   r   typingr   r   r   r   r   r   numpyr   r   	typeguardr   r	   espnet2.asr.ctcr
   espnet2.asr.decoder.abs_decoderr   espnet2.asr.decoder.mlm_decoderr   espnet2.asr.decoder.rnn_decoderr   &espnet2.asr.decoder.transducer_decoderr   'espnet2.asr.decoder.transformer_decoderr   r   r   r   r   espnet2.asr.encoder.abs_encoderr   (espnet2.asr.encoder.branchformer_encoderr   %espnet2.asr.encoder.conformer_encoderr   6espnet2.asr.encoder.contextual_block_conformer_encoderr   8espnet2.asr.encoder.contextual_block_transformer_encoderr   "espnet2.asr.encoder.hubert_encoderr   r   &espnet2.asr.encoder.longformer_encoderr   espnet2.asr.encoder.rnn_encoderr   'espnet2.asr.encoder.transformer_encoderr   #espnet2.asr.encoder.vgg_rnn_encoderr   $espnet2.asr.encoder.wav2vec2_encoderr   espnet2.asr.espnet_modelr    !espnet2.asr.frontend.abs_frontendr!   espnet2.asr.frontend.defaultr"   espnet2.asr.frontend.fusedr#   espnet2.asr.frontend.s3prlr$   espnet2.asr.frontend.windowingr%   espnet2.asr.maskctc_modelr&   'espnet2.asr.postencoder.abs_postencoderr'   =espnet2.asr.postencoder.hugging_face_transformers_postencoderr(   %espnet2.asr.preencoder.abs_preencoderr)   espnet2.asr.preencoder.linearr*   espnet2.asr.preencoder.sincr+   espnet2.asr.specaug.abs_specaugr,   espnet2.asr.specaug.specaugr-   $espnet2.asr_transducer.joint_networkr.   espnet2.layers.abs_normalizer/   espnet2.layers.global_mvnr0   espnet2.layers.utterance_mvnr1   espnet2.tasks.abs_taskr2   espnet2.text.phoneme_tokenizerr3   espnet2.torch_utils.initializer4   espnet2.train.abs_espnet_modelr5   espnet2.train.class_choicesr6   espnet2.train.collate_fnr7   espnet2.train.preprocessorr8   espnet2.train.trainerr9    espnet2.utils.get_default_kwargsr:    espnet2.utils.nested_dict_actionr;   espnet2.utils.typesr<   r=   r>   r?   dictr   r   r   r   r   r   r   r   rh   rs   rs   rs   rt   <module>   s     	
	
	
