o
    iQ                     @   s  d dl Z d dlZd dlmZmZmZmZ d dlZd dl	m
Z
mZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZmZmZmZ d d
lmZ d dlmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z'm(Z( d dl)m*Z* d dl+m,Z, d dl-m.Z. d dl/m0Z0 d dl1m2Z2 d dl3m4Z4 d dl5m6Z6 d dl7m8Z8 d dl9m:Z: d dl;m<Z< d dl=m>Z> d dl?m@Z@ d dlAmBZB d dlCmDZD d dlEmFZF d dlGmHZH d d lImJZJ d d!lKmLZL d d"lMmNZN d d#lOmPZP d d$lQmRZR d d%lSmTZT d d&lUmVZV d d'lWmXZX d d(lYmZZZ d d)l[m\Z\ d d*l]m^Z^ d d+l_m`Z` d d,lambZb d d-lcmdZd d d.lemfZf d d/lgmhZh d d0limjZj d d1lkmlZl d d2lmmnZn d d3lompZpmqZqmrZrmsZs efd4ete6e<e:e8d5e4d6d7Zuefd8eteJd9eHdd:d;Zvefd<etePeRd=eNd>d:d?Zwefd@eteTdAeddBdCZxefdDeteFeDdEeBdd:d;ZyefdFete!e.e%e#e0e,e2e'e(e*edGedHdCZzefdIete@eZe\dJe>dd:d;Z{efdKete@eZe\dJe>dd:d;Z|efdLeteeeeeeeedMedHdCZ}efdNeteXdOeVdd:d;Z~G dPdQ dQe^ZdS )R    N)CallableDictOptionalTuple)check_argument_typescheck_return_type)CTC)
AbsDecoder)
MLMDecoder)
RNNDecoder)TransducerDecoder)&DynamicConvolution2DTransformerDecoder$DynamicConvolutionTransformerDecoder*LightweightConvolution2DTransformerDecoder(LightweightConvolutionTransformerDecoderTransformerDecoder)
AbsEncoder)BranchformerEncoder)ConformerEncoder)ContextualBlockConformerEncoder)!ContextualBlockTransformerEncoder)FairseqHubertEncoderFairseqHubertPretrainEncoder)LongformerEncoder)
RNNEncoder)TransformerEncoder)VGGRNNEncoder)FairSeqWav2Vec2Encoder)AbsFrontend)DefaultFrontend)FusedFrontends)S3prlFrontend)SlidingWindow)AbsPostEncoder)"HuggingFaceTransformersPostEncoder)AbsPreEncoder)LinearProjection)LightweightSincConvs)
AbsSpecAug)SpecAug)JointNetwork)AbsNormalize)	GlobalMVN)UtteranceMVN)ESPnetSLUModel)AbsPostDecoder)"HuggingFaceTransformersPostDecoder)ConformerPostEncoder)TransformerPostEncoder)ASRTask)g2p_choices)
initialize)AbsESPnetModel)ClassChoices)SLUPreprocessor)Trainer)get_default_kwargs)NestedDictAction)float_or_noneint_or_nonestr2boolstr_or_nonefrontend)defaultsliding_windows3prlfusedrA   )nameclasses
type_checkrA   specaug)rH   T)rE   rF   rG   rA   optional	normalize)
global_mvnutterance_mvnrL   )rF   rG   rA   rI   model)espnetrN   )rF   rG   rA   
preencoder)sinclinearencoder)	conformertransformercontextual_block_transformercontextual_block_conformervgg_rnnrnnwav2vec2huberthubert_pretrain
longformerbranchformerrX   postencoder)hugging_face_transformersrS   rT   deliberationencoderdecoder)rT   lightweight_convlightweight_conv2ddynamic_convdynamic_conv2drX   
transducermlmpostdecoder)r_   c                   @   s   e Zd ZU dZeed< eeee	e
eeeeeg
ZeZedejfddZedejdedeeeeeejf geeejf f  fd	d
Z e	ddedede!edf fddZ"e	ddedede!edf fddZ#edejde$fddZ%dS )SLUTask   num_optimizersparserc                 C   s  |j dd}|d}|dg7 }|jdtd dd |jdtd d	d |jd
tddd |jdtddd |jddd d dg dd |jdtd dd |jdtttdd |jdtd dd |j dd}|jdtddd |jd t	d!g d"d#d$ |jd%td d&d |jd'td(d) |jd*tg d+d d,d- |jd.tt
d d/d- |jd0td d1d |jd2td d3d |jd4td5d6d |jd7td d8d |jd9td5d:d |jd;t	d<d=d |jd>td?d@d | jD ]}|| qd S )ANzTask related)descriptionrequired
token_listz--token_listzA text mapping int-id to token)typerA   helpz--transcript_token_listz.A text mapping int-id to token for transcriptsz
--two_passFzRun 2-pass SLUz--pre_postencoder_normpre_postencoder_normz--initc                 S   s   t |  S )N)r?   lower)x ru   E/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/tasks/slu.py<lambda>   s    z,SLUTask.add_task_arguments.<locals>.<lambda>zThe initialization method)chainerxavier_uniformxavier_normalkaiming_uniformkaiming_normalN)rp   rA   rq   choicesz--input_sizez,The number of input dimension of the featurez
--ctc_confz$The keyword arguments for CTC class.)actionrA   rq   z--joint_net_confz.The keyword arguments for joint network class.zPreprocess relatedz--use_preprocessorTz"Apply preprocessing to data or notz--token_typebpe)r   charwordphnz7The text will be tokenized in the specified level token)rp   rA   r}   rq   z
--bpemodelzThe model file of sentencepiecez--non_linguistic_symbolsz non_linguistic_symbols file path)rp   rq   z	--cleaner)Ntacotronjaconv
vietnamesezApply text cleaning)rp   r}   rA   rq   z--g2pz&Specify g2p method if --token_type=phnz--speech_volume_normalizez/Scale the maximum amplitude to the given value.z	--rir_scpzThe file path of rir scp file.z--rir_apply_prob      ?z-THe probability for applying RIR convolution.z--noise_scpz The file path of noise scp file.z--noise_apply_probz&The probability applying Noise adding.z--noise_db_range13_15z!The range of noise decibel level.z--short_noise_thres      ?znIf len(noise) / len(speech) is smaller than this threshold during dynamic mixing, a warning will be displayed.)add_argument_groupget_defaultadd_argumentr?   r>   r=   r;   r:   r   strr4   r<   floatclass_choices_listadd_arguments)clsrl   grouprn   class_choicesru   ru   rv   add_task_arguments   s  


zSLUTask.add_task_argumentsargstrainreturnc                 C   s   t  sJ |jrat||j|jd|vrd n|j|j|j|j|j	t
|dr&|jnd t
|dr/|jndt
|dr8|jnd t
|drA|jndt
|drJ|jndt
|d	rS|jnd
t
|dr\|jnd d}nd }t|siJ |S )Ntranscript_token_listrir_scprir_apply_probr   	noise_scpnoise_apply_probnoise_db_ranger   short_noise_thresr   )r   
token_typero   r   bpemodelnon_linguistic_symbolstext_cleanerg2p_typer   r   r   r   r   r   speech_volume_normalize)r   use_preprocessorr8   r   ro   r   r   r   cleanerg2phasattrr   r   r   r   r   r   r   r   )r   r   r   retvalru   ru   rv   build_preprocess_fnu  sD   
zSLUTask.build_preprocess_fnTF	inference.c                 C   s   |sd}|S d}|S )N)speechtext)r   ru   r   r   r   r   ru   ru   rv   required_data_names  s
   zSLUTask.required_data_namesc                 C   s   d}t |sJ |S )N)
transcript)r   r   ru   ru   rv   optional_data_names  s   zSLUTask.optional_data_namesc                 C   s  t  sJ t|jtr/t|jdd}dd |D }W d    n1 s$w   Y  t||_nt|jttfr=t|j}ntdd|v r|jd urt|jtrtt|jdd}dd |D }W d    n1 siw   Y  t||_nt|jttfrt|j}ntdt	|}t
d	|  |jd u rt|j}|di |j}| }nd |_i |_d }|j}|jd urt|j}	|	di |j}
nd }
|jd urt|j}|di |j}nd }t|d
d d urt|j}|di |j}| }nd }t|j}|dd|i|j}| }t|dd d ur0t|j }|dd|i|j!}| }nd }t|dd d urPt"|j#}|dd|i|j$}| }nd }t|dd d urlt%|j&}|di |j'}|}nd }t(|j)}|j)dkr||fddi|j*}t+|| |j,fi |j-}n|d||d|j*}d }t.d||d|j/}zt0|j1}W n t2y   t0d}Y nw d|v r|jd ur||j3d< |j4|j3d< |j5|j3d< |d|||
||||||||||d|j3}|j6d urt7||j6 t8|s	J |S )Nzutf-8)encodingc                 S      g | ]}|  qS ru   rstrip.0lineru   ru   rv   
<listcomp>      z'SLUTask.build_model.<locals>.<listcomp>ztoken_list must be str or listr   c                 S   r   ru   r   r   ru   ru   rv   r     r   z* Transcript token_list must be str or listzVocabulary size: rO   
input_sizer^   r`   rh   rf   	embed_padr   )
vocab_sizeencoder_output_size)odimr   rN   two_passrr   )r   r@   rH   rJ   rO   rR   r^   r`   ra   rh   ctcjoint_networkro   ru   )9r   
isinstancero   r   openlisttupleRuntimeErrorr   lenlogginginfor   frontend_choices	get_classr@   frontend_confoutput_sizerH   specaug_choicesspecaug_confrJ   normalize_choicesnormalize_confgetattrpreencoder_choicesrO   preencoder_confencoder_choicesrR   encoder_confpostencoder_choicesr^   postencoder_confdeliberationencoder_choicesr`   deliberationencoder_confpostdecoder_choicesrh   postdecoder_confdecoder_choicesra   decoder_confr*   dunitsjoint_net_confr   ctc_confmodel_choicesrM   AttributeError
model_confr   rr   initr5   r   )r   r   fro   r   r   frontend_classr@   r   specaug_classrH   normalize_classrJ   preencoder_classrO   encoder_classrR   r   postencoder_classr^   deliberationencoder_classr`   postdecoder_classrh   decoder_classra   r   r   model_classrM   ru   ru   rv   build_model  s  










zSLUTask.build_modelN)TF)&__name__
__module____qualname__rk   int__annotations__r   r   r   r   r   r   r   r   r   r   r   r9   trainerclassmethodargparseArgumentParserr   	Namespaceboolr   r   r   r   nparrayndarrayr   r   r   r   r.   r   ru   ru   ru   rv   ri      sZ   
  &(


ri   )r   r   typingr   r   r   r   numpyr   	typeguardr   r   espnet2.asr.ctcr   espnet2.asr.decoder.abs_decoderr	   espnet2.asr.decoder.mlm_decoderr
   espnet2.asr.decoder.rnn_decoderr   &espnet2.asr.decoder.transducer_decoderr   'espnet2.asr.decoder.transformer_decoderr   r   r   r   r   espnet2.asr.encoder.abs_encoderr   (espnet2.asr.encoder.branchformer_encoderr   %espnet2.asr.encoder.conformer_encoderr   6espnet2.asr.encoder.contextual_block_conformer_encoderr   8espnet2.asr.encoder.contextual_block_transformer_encoderr   "espnet2.asr.encoder.hubert_encoderr   r   &espnet2.asr.encoder.longformer_encoderr   espnet2.asr.encoder.rnn_encoderr   'espnet2.asr.encoder.transformer_encoderr   #espnet2.asr.encoder.vgg_rnn_encoderr   $espnet2.asr.encoder.wav2vec2_encoderr   !espnet2.asr.frontend.abs_frontendr   espnet2.asr.frontend.defaultr   espnet2.asr.frontend.fusedr    espnet2.asr.frontend.s3prlr!   espnet2.asr.frontend.windowingr"   'espnet2.asr.postencoder.abs_postencoderr#   =espnet2.asr.postencoder.hugging_face_transformers_postencoderr$   %espnet2.asr.preencoder.abs_preencoderr%   espnet2.asr.preencoder.linearr&   espnet2.asr.preencoder.sincr'   espnet2.asr.specaug.abs_specaugr(   espnet2.asr.specaug.specaugr)   $espnet2.asr_transducer.joint_networkr*   espnet2.layers.abs_normalizer+   espnet2.layers.global_mvnr,   espnet2.layers.utterance_mvnr-   espnet2.slu.espnet_modelr.   'espnet2.slu.postdecoder.abs_postdecoderr/   =espnet2.slu.postdecoder.hugging_face_transformers_postdecoderr0   -espnet2.slu.postencoder.conformer_postencoderr1   /espnet2.slu.postencoder.transformer_postencoderr2   espnet2.tasks.asrr3   espnet2.text.phoneme_tokenizerr4   espnet2.torch_utils.initializer5   espnet2.train.abs_espnet_modelr6   espnet2.train.class_choicesr7   espnet2.train.preprocessorr8   espnet2.train.trainerr9    espnet2.utils.get_default_kwargsr:    espnet2.utils.nested_dict_actionr;   espnet2.utils.typesr<   r=   r>   r?   dictr   r   r   r   r   r   r   r   r   r   ri   ru   ru   ru   rv   <module>   sH   


