o
    iL4                     @   s  d dl Z d dlZd dlmZmZmZmZmZmZ d dl	Z
d dlZd dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z, d dl-m.Z. d dl/m0Z0 d dl1m2Z2 d dl3m4Z4 d dl5m6Z6 d dl7m8Z8 d dl9m:Z: d dl;m<Z<m=Z=m>Z>m?Z? e0de@eededdZAe0de@e d edd!d"ZBe0d#e@e&e(d$e$d%d!d&ZCe0d'e@ed(edd!d"ZDe0d)e@ed*ed+d,ZEG d-d. d.e*ZFdS )/    N)Callable
CollectionDictListOptionalTuple)check_argument_typescheck_return_type)
AbsEncoder)FairseqHubertPretrainEncoder)AbsFrontend)DefaultFrontend)SlidingWindow)AbsPreEncoder)LightweightSincConvs)
AbsSpecAug)SpecAug)HubertPretrainModel)AbsNormalize)	GlobalMVN)UtteranceMVN)AbsTask)g2p_choices)
initialize)ClassChoices)CommonCollateFn)CommonPreprocessor)Trainer)get_default_kwargs)NestedDictAction)float_or_noneint_or_nonestr2boolstr_or_nonefrontend)defaultsliding_windowr%   )nameclasses
type_checkr%   specaug)r*   T)r'   r(   r)   r%   optional	normalize)
global_mvnutterance_mvnr.   )r(   r)   r%   r+   
preencoder)sincencoder)hubert_pretrainr2   )r(   r)   r%   c                   @   s6  e Zd ZU dZeed< eeee	e
gZeZedejfddZedejdedeeeeeeejf f  geee eeejf f f fd	d
Zedejdedeeeeeej f geeejf f  fddZ!e	ddededeedf fddZ"e	ddededeedf fddZ#edejde$fddZ%dS )
HubertTask   num_optimizersparserc                 C   s  |j dd}|d}|dg7 }|jdtd dd |jdd	d
 d dg dd |jdtd dd |jdtttdd |j dd}|jdtddd |jdt	dg ddd |jdtd dd |jdtdd  |jd!tg d"d d#d$ |jd%tt
d d&d$ |jd'td d(d |jd)td d*d |jd+td,d-d |jd.td d/d |jd0td,d1d |jd2t	d3d4d |jd5td,d6d |jd7td8d9d |jd:td8d;d |jd<t	d=d>d | jD ]}|| qd S )?NzTask related)descriptionrequired
token_listz--token_listzA text mapping int-id to token)typer%   helpz--initc                 S   s   t |  S )N)r#   lower)x r>   H/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/tasks/hubert.py<lambda>z   s    z/HubertTask.add_task_arguments.<locals>.<lambda>zThe initialization method)chainerxavier_uniformxavier_normalkaiming_uniformkaiming_normalN)r:   r%   r;   choicesz--input_sizez,The number of input dimension of the featurez--model_confz&The keyword arguments for model class.)actionr%   r;   zPreprocess relatedz--use_preprocessorTz"Apply preprocessing to data or notz--token_typebpe)rH   charwordphnz7The text will be tokenized in the specified level token)r:   r%   rF   r;   z
--bpemodelzThe model file of sentencepiecez--non_linguistic_symbolsz non_linguistic_symbols file path)r:   r;   z	--cleaner)Ntacotronjaconv
vietnamesezApply text cleaning)r:   rF   r%   r;   z--g2pz&Specify g2p method if --token_type=phnz--speech_volume_normalizez/Scale the maximum amplitude to the given value.z	--rir_scpzThe file path of rir scp file.z--rir_apply_prob      ?z-THe probability for applying RIR convolution.z--noise_scpz The file path of noise scp file.z--noise_apply_probz&The probability applying Noise adding.z--noise_db_range13_15z!The range of noise decibel level.z--pred_masked_weightz,weight for predictive loss for masked framesz--pred_nomask_weight        z.weight for predictive loss for unmasked framesz--loss_weightsz1weights for additional loss terms (not first one)z--hubert_dictz
./dict.txtz9word-based target dictionary for Hubert pretraining stage)add_argument_groupget_defaultadd_argumentr#   r!   r   r   r   r"   strr   r    floatclass_choices_listadd_arguments)clsr6   groupr8   class_choicesr>   r>   r?   add_task_argumentsi   s  


zHubertTask.add_task_argumentsargstrainreturnc                 C   s   t  sJ tdddS )NrQ   )float_pad_valueint_pad_value)r   r   )rY   r]   r^   r>   r>   r?   build_collate_fn   s   
zHubertTask.build_collate_fnc                 C   s   t  sJ |jrYt||j|j|j|j|j|jt	|dr|j
nd t	|dr'|jndt	|dr0|jnd t	|dr9|jndt	|drB|jndt	|drK|jnd	t	|drT|jnd d
}nd }t|saJ |S )Nrir_scprir_apply_probrO   	noise_scpnoise_apply_probnoise_db_rangerP   short_noise_thresg      ?)r^   
token_typer9   bpemodelnon_linguistic_symbolstext_cleanerg2p_typerd   re   rf   rg   rh   ri   speech_volume_normalize)r   use_preprocessorr   rj   r9   rk   rl   cleanerg2phasattrrd   re   rf   rg   rh   ri   ro   r	   )rY   r]   r^   retvalr>   r>   r?   build_preprocess_fn  s>   
zHubertTask.build_preprocess_fnTF	inference.c                 C   s   |sd}|S d}|S )N)speechtext)rw   r>   rY   r^   rv   rt   r>   r>   r?   required_data_names.  s
   zHubertTask.required_data_namesc                 C   s   d}t |sJ |S )Nr>   )r	   ry   r>   r>   r?   optional_data_names9  s   zHubertTask.optional_data_namesc              
   C   s  t  sJ t|jtr/t|jdd}dd |D }W d    n1 s$w   Y  t||_nt|jttfr=t|j}ntdt|}t	
d|  |jd u ret|j}|d
i |j}| }nd |_i |_d }|j}|jd urt|j}|d
i |j}	nd }	|jd urt|j}
|
d
i |j}nd }t|dd d urt|j}|d
i |j}| }nd }t|j}|d
||j|jd|j}t d
|||	||||d	|j!}|j"d urt#||j" t$|sJ |S )Nzutf-8)encodingc                 S   s   g | ]}|  qS r>   )rstrip).0liner>   r>   r?   
<listcomp>F  s    z*HubertTask.build_model.<locals>.<listcomp>ztoken_list must be str or listzVocabulary size: r/   )
input_sizeuse_amphubert_dict)
vocab_sizer$   r*   r,   r/   r1   r9   r>   )%r   
isinstancer9   rU   openlisttupleRuntimeErrorlenlogginginfor   frontend_choices	get_classr$   frontend_confoutput_sizer*   specaug_choicesspecaug_confr,   normalize_choicesnormalize_confgetattrpreencoder_choicesr/   preencoder_confencoder_choicesr1   r   r   encoder_confr   
model_confinitr   r	   )rY   r]   fr9   r   frontend_classr$   r   specaug_classr*   normalize_classr,   preencoder_classr/   encoder_classr1   modelr>   r>   r?   build_modelA  sn   






zHubertTask.build_modelN)TF)&__name__
__module____qualname__r5   int__annotations__r   r   r   r   r   rW   r   trainerclassmethodargparseArgumentParserr\   	Namespaceboolr   r   r   rU   r   npndarrayr   torchTensorrc   r   arrayru   rz   r{   r   r   r>   r>   r>   r?   r3   T   sh   
  	&%


r3   )Gr   r   typingr   r   r   r   r   r   numpyr   r   	typeguardr   r	   espnet2.asr.encoder.abs_encoderr
   "espnet2.asr.encoder.hubert_encoderr   !espnet2.asr.frontend.abs_frontendr   espnet2.asr.frontend.defaultr   espnet2.asr.frontend.windowingr   %espnet2.asr.preencoder.abs_preencoderr   espnet2.asr.preencoder.sincr   espnet2.asr.specaug.abs_specaugr   espnet2.asr.specaug.specaugr   espnet2.hubert.espnet_modelr   espnet2.layers.abs_normalizer   espnet2.layers.global_mvnr   espnet2.layers.utterance_mvnr   espnet2.tasks.abs_taskr   espnet2.text.phoneme_tokenizerr   espnet2.torch_utils.initializer   espnet2.train.class_choicesr   espnet2.train.collate_fnr   espnet2.train.preprocessorr   espnet2.train.trainerr    espnet2.utils.get_default_kwargsr    espnet2.utils.nested_dict_actionr   espnet2.utils.typesr    r!   r"   r#   dictr   r   r   r   r   r3   r>   r>   r>   r?   <module>   s    

	
