o
    i2                     @   s  d Z ddlZddlZddlmZmZmZmZmZm	Z	 ddl
ZddlZddlmZmZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddl m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/ ddl0m1Z1 ddl2m3Z3 ddl4m5Z5 ddl6m7Z7 ddl8m9Z9 ddl:m;Z; ddl<m=Z=m>Z>m?Z?m@Z@ e1deAeededdZBe1d eAed!edd"d#ZCe1d$eAe)e+d%e'd&d"d#ZDe1d'eAeed(ed)d*ZEG d+d, d,e-ZFdS )-zASR Transducer Task.    N)Callable
CollectionDictListOptionalTuple)check_argument_typescheck_return_type)AbsFrontend)DefaultFrontend)SlidingWindow)
AbsSpecAug)SpecAug)
AbsDecoder)
RNNDecoder)StatelessDecoder)Encoder)ESPnetASRTransducerModel)JointNetwork)AbsNormalize)	GlobalMVN)UtteranceMVN)AbsTask)g2p_choices)ClassChoices)CommonCollateFn)CommonPreprocessor)Trainer)get_default_kwargs)NestedDictAction)float_or_noneint_or_nonestr2boolstr_or_nonefrontend)defaultsliding_windowr%   )nameclasses
type_checkr%   specaug)r*   T)r(   r)   r%   optional	normalize)
global_mvnutterance_mvnr.   decoder)rnn	statelessr0   )r(   r)   r%   c                   @   s8  e Zd ZU dZdZeed< eee	e
gZeZedejfddZedejded	eeeeeeejf f  geee eeejf f f fd
dZedejded	eeeeeej f geeejf f  fddZ!e	ddeded	eedf fddZ"e	ddeded	eedf fddZ#edejd	e$fddZ%dS )ASRTransducerTaskzASR Transducer Task definition.   num_optimizersparserc                 C   s  |j dd}|d}|dg7 }|jdtddd |jd	tdd
d |jdtddd |jdtttdd |jdti dd |jdti dd |j dd}|jdtddd |jdt	dg ddd |jdtddd |jdtd d! |jd"tg d#dd$d% |jd&tt
dd'd% |jd(tdd)d |jd*tdd+d |jd,td-d.d |jd/tdd0d |jd1td-d2d |jd3t	d4d5d | jD ]}|| qdS )6zAdd Transducer task arguments.

        Args:
            cls: ASRTransducerTask object.
            parser: Transducer arguments parser.

        zTask related.)descriptionrequired
token_listz--token_listNz!Integer-string mapper for tokens.)typer%   helpz--input_sizez,The number of dimensions for input features.z--initz$Type of model initialization to use.z--model_confz*The keyword arguments for the model class.)actionr%   r:   z--encoder_confz,The keyword arguments for the encoder class.z--joint_network_confz2The keyword arguments for the joint network class.zPreprocess related.z--use_preprocessorTz-Whether to apply preprocessing to input data.z--token_typebpe)r<   charwordphnz.The type of tokens to use during tokenization.)r9   r%   choicesr:   z
--bpemodelz$The path of the sentencepiece model.z--non_linguistic_symbolsz'The 'non_linguistic_symbols' file path.)r9   r:   z	--cleaner)Ntacotronjaconv
vietnamesezText cleaner to use.)r9   r@   r%   r:   z--g2pz&g2p method to use if --token_type=phn.z--speech_volume_normalizez2Normalization value for maximum amplitude scaling.z	--rir_scpzThe RIR SCP file path.z--rir_apply_prob      ?z/The probability of the applied RIR convolution.z--noise_scpzThe path of noise SCP file.z--noise_apply_probz.The probability of the applied noise addition.z--noise_db_range13_15z%The range of the noise decibel level.)add_argument_groupget_defaultadd_argumentr#   r!   r   r   r   r"   strr   r    floatclass_choices_listadd_arguments)clsr5   groupr7   class_choices rP   P/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/tasks/asr_transducer.pyadd_task_argumentsX   s   	


z$ASRTransducerTask.add_task_argumentsargstrainreturnc                 C   s   t  sJ tdddS )zBuild collate function.

        Args:
            cls: ASRTransducerTask object.
            args: Task arguments.
            train: Training mode.

        Return:
            : Callable collate function.

        g        )float_pad_valueint_pad_value)r   r   )rM   rS   rT   rP   rP   rQ   build_collate_fn   s   
z"ASRTransducerTask.build_collate_fnc                 C   s   t  sJ |jrPt||j|j|j|j|j|jt	|dr|j
ndt	|dr'|jndt	|dr0|jndt	|dr9|jndt	|drB|jndt	|drK|jndd	}nd}t|sXJ |S )
zBuild pre-processing function.

        Args:
            cls: ASRTransducerTask object.
            args: Task arguments.
            train: Training mode.

        Return:
            : Callable pre-processing function.

        rir_scpNrir_apply_probrD   	noise_scpnoise_apply_probnoise_db_rangerE   )rT   
token_typer8   bpemodelnon_linguistic_symbolstext_cleanerg2p_typerZ   r[   r\   r]   r^   speech_volume_normalize)r   use_preprocessorr   r_   r8   r`   ra   cleanerg2phasattrrZ   r[   r\   r]   r^   rd   r	   )rM   rS   rT   retvalrP   rP   rQ   build_preprocess_fn   s8   
z%ASRTransducerTask.build_preprocess_fnTF	inference.c                 C   s   |sd}|S d}|S )zRequired data depending on task mode.

        Args:
            cls: ASRTransducerTask object.
            train: Training mode.
            inference: Inference mode.

        Return:
            retval: Required task data.

        )speechtext)rl   rP   rM   rT   rk   ri   rP   rP   rQ   required_data_names   s
   z%ASRTransducerTask.required_data_namesc                 C   s   d}t |sJ |S )zOptional data depending on task mode.

        Args:
            cls: ASRTransducerTask object.
            train: Training mode.
            inference: Inference mode.

        Return:
            retval: Optional task data.

        rP   )r	   rn   rP   rP   rQ   optional_data_names6  s   z%ASRTransducerTask.optional_data_namesc                 C   s  t  sJ t|jtr/t|jdd}dd |D }W d   n1 s$w   Y  t||_nt|jttfr=t|j}ntdt|}t	
d|  |jdu ret|j}|di |j}| }nd}|j}|jdur~t|j}|di |j}	nd}	|jdurt|j}
|
di |j}nd}t|fi |j}|j}t|j}||fi |j}|j}t|||fi |j}td||||	||||d|j}|j durt!d	d
t"|sJ |S )zRequired data depending on task mode.

        Args:
            cls: ASRTransducerTask object.
            args: Task arguments.

        Return:
            model: ASR Transducer model.

        zutf-8)encodingc                 S   s   g | ]}|  qS rP   )rstrip).0linerP   rP   rQ   
<listcomp>Z  s    z1ASRTransducerTask.build_model.<locals>.<listcomp>Nztoken_list must be str or listzVocabulary size: )
vocab_sizer8   r$   r*   r,   encoderr/   joint_networkzCurrently not supported.z7Initialization part will be reworked in a short future.rP   )#r   
isinstancer8   rI   openlisttupleRuntimeErrorlenlogginginfo
input_sizefrontend_choices	get_classr$   frontend_confoutput_sizer*   specaug_choicesspecaug_confr,   normalize_choicesnormalize_confr   encoder_confdecoder_choicesr/   decoder_confr   joint_network_confr   
model_confinitNotImplementedErrorr	   )rM   rS   fr8   rv   frontend_classr$   r   specaug_classr*   normalize_classr,   rw   encoder_output_sizedecoder_classr/   decoder_output_sizerx   modelrP   rP   rQ   build_modelJ  sx   




	
zASRTransducerTask.build_modelN)TF)&__name__
__module____qualname____doc__r4   int__annotations__r   r   r   r   rK   r   trainerclassmethodargparseArgumentParserrR   	Namespaceboolr   r   r   rI   r   npndarrayr   torchTensorrY   r   arrayrj   ro   rp   r   r   rP   rP   rP   rQ   r2   J   sh   
  &.

r2   )Gr   r   r   typingr   r   r   r   r   r   numpyr   r   	typeguardr   r	   !espnet2.asr.frontend.abs_frontendr
   espnet2.asr.frontend.defaultr   espnet2.asr.frontend.windowingr   espnet2.asr.specaug.abs_specaugr   espnet2.asr.specaug.specaugr   *espnet2.asr_transducer.decoder.abs_decoderr   *espnet2.asr_transducer.decoder.rnn_decoderr   0espnet2.asr_transducer.decoder.stateless_decoderr   &espnet2.asr_transducer.encoder.encoderr   .espnet2.asr_transducer.espnet_transducer_modelr   $espnet2.asr_transducer.joint_networkr   espnet2.layers.abs_normalizer   espnet2.layers.global_mvnr   espnet2.layers.utterance_mvnr   espnet2.tasks.abs_taskr   espnet2.text.phoneme_tokenizerr   espnet2.train.class_choicesr   espnet2.train.collate_fnr   espnet2.train.preprocessorr   espnet2.train.trainerr    espnet2.utils.get_default_kwargsr    espnet2.utils.nested_dict_actionr   espnet2.utils.typesr    r!   r"   r#   dictr   r   r   r   r2   rP   rP   rP   rQ   <module>   s     		
