o
    wi                     @   s`  d dl mZ d dlmZmZmZmZmZ d dlZd dl	m
Z
 d dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dl m!Z! d dl"m#Z# d dl$m%Z%m&Z& d dl'm(Z(m)Z) d dl*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1 d dl2m3Z3 g dZ4G dd de#ee(Z5G dd de5Z6G dd de6Z7dS )    )ceil)AnyDictListOptionalUnionN)Trainer)
DictConfig)audio_to_text_datasetssl_dataset)DALIOutputs)LhotseSpeechToTextBpeDataset)ConvFeatureMaksingWrapper)ASRModuleMixin)process_augmentations)!get_lhotse_dataloader_from_configmove_data_to_device)make_parser)ModelPT)PretrainedModelInfo	typecheck)AccessMixinset_access_cfg)AcousticEncodedRepresentationAudioSignal
LabelsTypeLengthsTypeLogprobsType
NeuralTypeSpectrogramType)logging)SpeechEncDecSelfSupervisedModelEncDecMaskedTokenPredModel!EncDecDenoiseMaskedTokenPredModelc                       s  e Zd ZdZedee fddZd%dede	f fdd	Z
d
ee fddZdeeeef  fddZdeeeef  fddZedeeeef  fddZedeeeef  fddZe 				d&ddZd'ddZdd Zd(ddZd(d d!Zd(d"efd#d$Z  ZS ))r"   zSBase class for encoder-decoder models used for self-supervised encoder pre-trainingreturnc                 C   s8   g }t dddd}|| t dddd}|| |S )
        This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud.

        Returns:
            List of available pre-trained models.
        ssl_en_conformer_largezsFor details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:ssl_en_conformer_largezyhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/ssl_en_conformer_large/versions/1.10.1/files/ssl_en_conformer_large.nemo)pretrained_model_namedescriptionlocationssl_en_conformer_xlargeztFor details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:ssl_en_conformer_xlargez{https://api.ngc.nvidia.com/v2/models/nvidia/nemo/ssl_en_conformer_xlarge/versions/1.10.0/files/ssl_en_conformer_xlarge.nemo)r   append)clsresultsmodel r0   c/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/asr/models/ssl_models.pylist_available_models4   s   

z5SpeechEncDecSelfSupervisedModel.list_available_modelsNcfgtrainerc                    s  d| _ |d ur|j | _ t j||d t| jj| _t| jj| _d | _d| jv ri | _i | _	i | _
i | _i | _i | _i | _| jj D ]T\}}|ddsQqFt|jt|jd}t|}|| j|< |dd| j	|< |d	d | j|< |d
d | j|< |dd| j
|< |dd| j|< d| j|< qFt| j| _nt| jj| _t| jj| _t| jj| _d| jv rtj| jjnd | _d| jv rtj| jjnd | _d| jv rd| jj| _| _nd\| _| _d| jv rt | jj!| j" d| _#d S )N   )r3   r4   	loss_list	is_activeT)decoderloss
loss_alphag      ?output_from_layertargets_from_loss
start_stepr   transpose_encodedFdropout_featuresdropout_features_qfeature_penalty        NNaccess)$
world_sizesuper__init__r"   from_config_dict_cfgpreprocessorencoderdecoder_lossesloss_alphasr=   r;   r>   r<   decoder_losses_activer6   itemsgetr8   r9   nn
ModuleDictdecoder_sslspec_augmentspec_augmentationtorchDropoutr?   r@   rA   feat_pen
pen_factorr   rD   
model_guidapply_masking)selfr3   r4   decoder_loss_namedecoder_loss_cfgnew_decoder_loss	__class__r0   r1   rG   N   sV   







z(SpeechEncDecSelfSupervisedModel.__init__configc           	      C   s  d|v rt |d }nd }tj| j|dd |drCt|| j| jtt	|dd |dd|dd	|d
d	|dddddS |d }t
j rNdnd}|ddrp|dkr]| jnd }tj|||| j| j| jjd}|S |ddrd|v r|d d u sd|v r|d d u rtd|  d S |r|dd|d  nd}tj||| j| j|d}d}nd|v r|d d u rtd|  d S tj||d}t|d r|j}nt|jd d r|jd j}n	|jd jd j}t
jjj||d ||d!d||d"d|d#dd$S )%N	augmentorsample_ratekey
use_lhotselabelsparseren	unk_indexblank_indexnormalize_transcriptsF)rh   nameunk_idblank_iddo_normalize)	tokenizerglobal_rankrE   datasetshufflegpucpuuse_dali)rb   rw   	device_idru   rE   preprocessor_cfg	is_tarredtarred_audio_filepathsmanifest_filepathznCould not load dataset as `manifest_filepath` was None or `tarred_audio_filepaths` is None. Provided config : 	shuffle_n   
batch_sizer   )rb   r   ru   rE   rc   zJCould not load dataset as `manifest_filepath` was None. Provided config : )rb   rc   
collate_fn	drop_lastnum_workers
pin_memoryrv   r   r   r   rw   r   r   )r   r
   )inject_dataloader_value_from_model_configr3   rP   r   ru   rE   r   r   rV   cudais_available
local_rankget_dali_char_datasetrI   rJ   r!   warningget_tarred_datasetget_char_datasethasattrr   datasetsutilsdata
DataLoader)	r\   rb   rc   rw   devicer{   rv   r   r   r0   r0   r1   _setup_dataloader_from_config   s   









z=SpeechEncDecSelfSupervisedModel._setup_dataloader_from_configtrain_data_configc                 C   s   d|vrd|d< | j d|d | j|d| _| jdur\t| jdr^t| jjtjjj	r`| j
durPt| j
jtrPt| j
jtt| jj| j |d   | j
_dS | j
du rbtd	 dS dS dS dS dS )
a  
        Sets up the training data loader via a Dict-like object.

        Args:
            train_data_config: A config that contains the information regarding construction
                of an ASR Training dataset.

        Supported Datasets:
            -   :class:`~nemo.collections.asr.data.audio_to_text.AudioToCharDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text.AudioToBPEDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToBPEDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset`
        rw   Ttraindataset_namerb   rb   Nrv   r   zModel Trainer was not set before constructing the dataset, incorrect number of training batches will be used. Please set the trainer and rebuild the dataset.)_update_dataset_configr   	_train_dlr   
isinstancerv   rV   r   r   IterableDataset_trainerlimit_train_batchesfloatintr   lenrE   r!   r   )r\   r   r0   r0   r1   setup_training_data   s*   


z3SpeechEncDecSelfSupervisedModel.setup_training_dataval_data_configc                 C   s   d|vrd|d< | j d|d | j|d| _| jdurKt| jdrMt| jjtjjj	rOt| j
jtrQt| j
jtt| jj| j |d   | j
_dS dS dS dS dS )	a  
        Sets up the validation data loader via a Dict-like object.

        Args:
            val_data_config: A config that contains the information regarding construction
                of an ASR Training dataset.

        Supported Datasets:
            -   :class:`~nemo.collections.asr.data.audio_to_text.AudioToCharDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text.AudioToBPEDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToBPEDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset`
        rw   F
validationr   r   Nrv   r   )r   r   _validation_dlr   r   rv   rV   r   r   r   r   limit_val_batchesr   r   r   r   rE   )r\   r   r0   r0   r1   setup_validation_data  s"   

z5SpeechEncDecSelfSupervisedModel.setup_validation_datac              
   C   s   t | jdrt| jjd}nt }td|ddttdt ddtdt ddttdt ddtdt ddttdt dddS )	N_sample_ratefreqBTToptionalr   r   Dr   )input_signalinput_signal_lengthprocessed_signalprocessed_signal_lengthtargetstarget_lengths)	r   rJ   r   r   r   tupler   r    r   r\   input_signal_eltyper0   r0   r1   input_types>  s   z+SpeechEncDecSelfSupervisedModel.input_typesc                 C   s2   t dt t dt t dt t tdt dS )Nr   r   )spectrograms
spec_masksencodedencoded_len)r   r    r   r   r   r\   r0   r0   r1   output_typesM  s
   


z,SpeechEncDecSelfSupervisedModel.output_typesc                 C   s  |  | jr
|   t| dr| j}nd}| js|rJ| jdurJ| jdurJt| jdkrJt	| j
 }tdd |D }|rJd| jd< | jd| jd	 |duoQ|du}|duoY|du}	||	A dkrgt|  d
|	sr| j||d\}}| jr| d | j | _|  }
| jr| |}| jr| |
}
| jr| j||d}| }t|dk |dk }t|D ]\}}d||dd|df< q| j||d\}}|
|||fS )aS  
        Forward pass of the model.

        Args:
            input_signal: Tensor that represents a batch of raw audio signals,
                of shape [B, T]. T here represents timesteps, with 1 second of audio represented as
                `self.sample_rate` number of floating point values.
            input_signal_length: Vector of length B, that contains the individual lengths of the audio
                sequences.
            processed_signal: Tensor that represents a batch of processed audio signals,
                of shape (B, D, T) that has undergone processing via some DALI preprocessor.
            processed_signal_length: Vector of length B, that contains the individual lengths of the
                processed audio sequences.

        Returns:
            A tuple of 4 elements -
            1) Processed spectrograms of shape [B, D, T].
            2) Masks applied to spectrograms of shape [B, D, T].
            3) The encoded features tensor of shape [B, D, T].
            2) The lengths of the acoustic sequence after propagation through the encoder, of shape [B].
        _in_validation_stepFNr   c                 S   s   g | ]}|d uqS Nr0   ).0ro   r0   r0   r1   
<listcomp>      z;SpeechEncDecSelfSupervisedModel.forward.<locals>.<listcomp>Tsave_encoder_tensors)access_enabledguid Arguments ``input_signal`` and ``input_signal_length`` are mutually exclusive  with ``processed_signal`` and ``processed_signal_len`` arguments.r   length   )
input_specr   gh㈵>gh㈵rB   audio_signalr   )is_access_enabledrZ   reset_registryr   r   trainingrL   r;   r   listvaluesany
access_cfgset_access_enabled
ValueErrorrJ   rY   r   powmeanrX   detachcloner?   r@   r[   rU   rV   logical_and	enumeraterK   )r\   r   r   r   r   in_validation_steplayer_namesregister_layerhas_input_signalhas_processed_signalr   masked_spectrogramsr   idxproc_lenr   r   r0   r0   r1   forwardV  sV   






z'SpeechEncDecSelfSupervisedModel.forwardc                 C   s  i }| j du r>t| jdr| jjr| j|||d}n| j|d}| jjr2| j|||||d}	|	|fS | j|||d}	|	|fS |d}	i }| | j}
| j  D ]\}}| j	| sZqP| j
| du rd|}n|
| j
|  d d	 }| j| rz|d
d	}| j| dur| j| }| j | d j}| j | d j}|du r|}t|d dr|d jr|d |||d||< n	|d |d||< |d }|jr|||| |||d}n
||||| |d}|	|| j|   }	|||< qP|	|fS )a  
        Forward pass through all decoders and calculate corresponding losses.
        Args:
            spectrograms: Processed spectrograms of shape [B, D, T].
            spec_masks: Masks applied to spectrograms of shape [B, D, T].
            encoded: The encoded features tensor of shape [B, D, T].
            encoded_len: The lengths of the acoustic sequence after propagation through the encoder, of shape [B].
            targets: Optional target labels of shape [B, T]
            target_lengths: Optional target label lengths of shape [B]

        Returns:
            A tuple of 2 elements -
            1) Total sum of losses weighted by corresponding loss_alphas
            2) Dictionary of unweighted losses
        Nneeds_labels)encoder_outputr   r   r   )r   decoder_outputsr   decoder_lengthsr   )r   r   r   r5   rK   rl   r9   r8   )r   r   r   r   )rL   r   rS   r   r9   	new_zerosget_module_registryrK   rO   rN   r;   r>   	transposer<   
target_idsr   rM   )r\   r   r   r   r   r   r   loss_val_dictoutputs
loss_valueregistrydec_loss_namedec_loss	dec_inputtarget_losscurrent_losscurrent_loss_valuer0   r0   r1   decoder_loss_step  sp   
@8




z1SpeechEncDecSelfSupervisedModel.decoder_loss_stepc                 C   s0  |\}}}}t |tr|jr| j||d\}}}	}
n| j||d\}}}	}
| jd urQ| j D ] \}}| jj| j| k| j	|< |d }t
|drO|| jj q/nt
| jdr_| j| jj | |||	|
||\}}| jjd d | jjd}| D ]
\}}||d| < q|| jr|| j7 }|   ||d	S )
Nr   r   r   r   r9   set_num_updatesr   lr)learning_rateglobal_steptrain_r9   log)r   r   r   r   rL   rO   r4   r  r=   rN   r   r  r9   r   
_optimizerparam_groupsrX   r   )r\   batchbatch_nbsignal
signal_lenr   r   r   r   r   r   r   r   r9   r   r   tensorboard_logs	loss_nameloss_valr0   r0   r1   training_step	  s@   



z-SpeechEncDecSelfSupervisedModel.training_stepr   c                 C   s   d| _ |\}}}}t|tr|jr| j||d\}}	}
}n| j||d\}}	}
}| jd urC| j D ]\}}| jj| j	| k| j
|< q2| ||	|
|||\}}| jrW|| j7 }|   | ` d|i}|S )NTr  r  val_loss)r   r   r   r   r   rL   rO   r4   r  r=   rN   r   rX   r   )r\   r  	batch_idxdataloader_idxr  r  r   r   r   r   r   r   r   r   r   _metricsr0   r0   r1   validation_pass4  s*   

z/SpeechEncDecSelfSupervisedModel.validation_passc                 C   sR   |  |||}t| jjtkr!t| jjdkr!| j| | |S | j| |S )Nr5   )r  typer4   val_dataloadersr   r   validation_step_outputsr,   r\   r  r  r  r  r0   r0   r1   validation_stepU  s    z/SpeechEncDecSelfSupervisedModel.validation_stepr  c                 C   *   t dd |D  }d|i}||dS )Nc                 S      g | ]}|d  qS )r  r0   r   xr0   r0   r1   r   ^  r   zNSpeechEncDecSelfSupervisedModel.multi_validation_epoch_end.<locals>.<listcomp>r  r  r	  rV   stackr   )r\   r   r  val_loss_meanr  r0   r0   r1   multi_validation_epoch_end]     
z:SpeechEncDecSelfSupervisedModel.multi_validation_epoch_endr   )NNNNrC   r   )__name__
__module____qualname____doc__classmethodr   r   r2   r	   r   rG   r   r   r   r   r   r   propertystrr   r   r   r   r   r   r  r  r  r   r'  __classcell__r0   r0   r`   r1   r"   1   s.    FU-(
W[
+
!r"   c                       s  e Zd ZdZdedejdedefddZe	de
e fdd	Zd'dedef fddZedefddZedeeeef  fddZedeeeef  fddZe 	
	
	
	
	d(ddZd)ddZd*ddZd+ddZd+d d!Zd)d"edefd#d$Z d)defd%d&Z!  Z"S ),r#   zc
    Speech self-supervised model that performs masked token prediction on the encoder output.
    r  r   r  r%   c                 C   s   t ||}|S )z
        PTL hook: https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#transfer-batch-to-device
        r   )r\   r  r   r  r0   r0   r1   transfer_batch_to_deviceh  s   
z3EncDecMaskedTokenPredModel.transfer_batch_to_devicec                 C   s   g }|S )r&   r0   )r-   r.   r0   r0   r1   r2   o  s   z0EncDecMaskedTokenPredModel.list_available_modelsNr3   r4   c                    s   t  || | `| jdddkr4| jjj| jj_| jjj| jj	_| jj	j
| jjj | jj	_
d| jj_| | jj| _| | jj	| _| | jj| _| | jj| _| | jj| _d | _| jdddkrxt| jj| j| _| j| j_d S d S )Nmask_positionpre_conv	post_convr5   )rF   rG   rS   r3   rP   rK   d_model	quantizerfeat_inmasking
block_sizesubsampling_factorr9   combine_time_stepsrH   mask_processorr8   pre_encoderr   
pre_encoder\   r3   r4   r`   r0   r1   rG   z  s"   
z#EncDecMaskedTokenPredModel.__init__c                 C   s*   t tdt ddtdt ddgdS )
        Return a typing schema for optimal batch size calibration for various
        sequence lengths using OOMptimizer.
        r   input)r  
seq_lengthr   r-   inputs)r   r   r   r   r   r0   r0   r1   oomptimizer_schema  s
   z-EncDecMaskedTokenPredModel.oomptimizer_schemac                 C   sn   t | jdrt| jjd}nt }td|ddttdt ddtdt ddttdt ddtdddS )	Nr   r   r   Tr   r   r   )r   r   r   r   
apply_maskr   rJ   r   r   r   r   r   r    r   r0   r0   r1   r     s   z&EncDecMaskedTokenPredModel.input_typesc                 C   h   | j jdkr| j jrtdt }tdt }ntdt }tdt }|ttdt tdt |dS 	Nr5   )r   r   Cr   )r   r   rL  H)r   r   rM  r   r   )logprobsr   maskstokens	r3   	num_bookssqueeze_singler   r   r   r   r   r    r\   rN  rP  r0   r0   r1   r        
z'EncDecMaskedTokenPredModel.output_typesFc                 C   s  |d uo|d u}|d uo|d u}||A dkrt |  d|s(| j||d\}}| jd urT| jj|d | j||d\}}	| j }
| j }| j|ddd\}}n$| j|d\}}|rh| j	||d	\}}
n|}t
|}
| j||d\}}	| j|d
}||	|
|fS )NFr   r   rH  r   r5   r   r   input_featsinput_lengthsr   )r   rJ   r>  set_masking_enabledrK   get_current_maskget_current_featr7  r   r=  rV   
zeros_liker8   )r\   r   r   r   r   rH  r   r   r   r   rO  featsr  rP  masked_signal	log_probsr0   r0   r1   r     s6   	




z"EncDecMaskedTokenPredModel.forwardr   c                 C   s   |d |d }}t |tr|jr| j||dd\}}}}n| j||dd\}}}}| j||||d}	| jjd d | jj|	d}
|	|
d	S )
Nr   r5   Tr   r   rH  r   r   rH  rO  r   r   r   r  r  r  
train_lossr  )	r   r   r   r   r9   r
  r  r4   r  )r\   r  r  r   r   ra  r   rO  rP  r   r  r0   r0   r1   r    s   
z(EncDecMaskedTokenPredModel.training_stepvalc                 C   st   |d |d }}t |tr|jr| j|||d\}}	}
}n| j|||d\}}	}
}| j|
|||	d}| d|iS )Nr   r5   rb  rc  rd  _loss)r   r   r   r   r9   )r\   r  r  r  moderH  r   r   ra  r   rO  rP  r   r0   r0   r1   inference_pass  s   z)EncDecMaskedTokenPredModel.inference_passc                 C   sV   | j |||dd}t| jjtkr#t| jjdkr#| j| | |S | j| |S )NTrV  r5   rj  r  r4   r  r   r   r  r,   r  r0   r0   r1   r    s    z*EncDecMaskedTokenPredModel.validation_stepc                 C   sX   | j |||ddd}t| jjtkr$t| jjdkr$| j| | |S | j| |S )NtestT)ri  rH  r5   rk  r  r0   r0   r1   	test_step  s    z$EncDecMaskedTokenPredModel.test_stepr   c              	   C   s   g }t |D ]1\}}t|tstd| d| d|  d|v r)||d  qtd| d| d|  qt|dkrMtd| j d| d	 i S t	|
 }d|i}||d
S )NzBatch z! output in validation dataloader z is not a dictionary: r  z does not have key `val_loss`: r   zEpoch z/ received no batches for validation dataloader .r#  )r   r   dictr!   r   r,   r   current_epochrV   r%  r   )r\   r   r  r6   ir"  r&  r  r0   r0   r1   r'    s"   

z5EncDecMaskedTokenPredModel.multi_validation_epoch_endc                 C   r  )Nc                 S   r   )	test_lossr0   r!  r0   r0   r1   r   4  r   zCEncDecMaskedTokenPredModel.multi_test_epoch_end.<locals>.<listcomp>rr  )rr  r	  r$  )r\   r   r  test_loss_meanr  r0   r0   r1   multi_test_epoch_end3  r(  z/EncDecMaskedTokenPredModel.multi_test_epoch_endr   )NNNNFr)  )r   r   rg  F)r   r   )#r*  r+  r,  r-  r   rV   r   r   r2  r.  r   r   r2   r	   r   rG   r/  ro  rG  r   r   r0  r   r   r   r   r   r  rj  r  rm  r   r'  rt  r1  r0   r0   r`   r1   r#   c  s2    

,


r#   c                       s   e Zd ZdZedefddZd dedef fdd	Z	d
e
e fddZede
eeef  fddZede
eeef  fddZe 													d!ddZdejdefddZ			d"dejdedededef
ddZ  ZS )#r$   z
    Model class that performs denoising and masked token prediction for speech self-supervised learning.
    Please refer to the NEST paper for more details: https://arxiv.org/abs/2408.13106
    r%   c              
   C   sx   t jtdt dddtdt dddtdt dddtdt dddtdt dd	dtdt dd
dgdS )rA  r   rB  audio)r  rC  ro   rD  	audio_lennoise	noise_lennoisy_audionoisy_audio_lenrE  )r   AudioNoiseBatchr   r   r   r   r0   r0   r1   rG  ?  s   z4EncDecDenoiseMaskedTokenPredModel.oomptimizer_schemaNr3   r4   c                    s   t  || d S r   )rF   rG   r@  r`   r0   r1   rG   Q  s   z*EncDecDenoiseMaskedTokenPredModel.__init__rb   c                 C   s   t j| j|dd |dr%t|| j| jtj|dd |dd ddS tj	|| j| jd}|d	 }t
|tjjjr=d
}t|drF|j}nt|jd drU|jd j}n	|jd jd j}tjjj||d ||dd
||dd|dd
dS )Nrd   re   rg   noise_manifestbatch_augmentor)r|  batch_augmentor_cfgrt   )ru   rE   rw   Fr   r   r   r   r   r   r   )r
   r   r3   rP   r   ru   rE   r   LhotseAudioNoiseDataset#get_audio_noise_dataset_from_configr   rV   r   r   r   r   r   r   r   )r\   rb   rv   rw   r   r0   r0   r1   r   T  sB   







z?EncDecDenoiseMaskedTokenPredModel._setup_dataloader_from_configc                 C   s   t | jdrt| jjd}nt }td|ddttdt ddtdt ddttdt ddtd|ddttdt ddtdt ddttdt ddtd|ddttdt ddtdt ddttdt ddtdddS )	Nr   r   r   Tr   r   r   )r   r   r   r   noise_signalnoise_signal_lengthprocessed_noise_signalprocessed_noise_lengthnoisy_input_signalnoisy_input_signal_lengthprocessed_noisy_signalprocessed_noisy_signal_lengthrH  rI  r   r0   r0   r1   r     s"   z-EncDecDenoiseMaskedTokenPredModel.input_typesc                 C   rJ  rK  rQ  rT  r0   r0   r1   r     rU  z.EncDecDenoiseMaskedTokenPredModel.output_typesFc                 C   sb  |d uo|d u}|d uo|d u}||A dkrt |  d|s(| j||d\}}|	d uo/|
d u}|d uo7|d u}||A dkrEt |  d|sP| j|	|
d\}}| jd ur| jj||d\}}| j|ddd\}}| jj|d	 | j||d
\}}| j }n$| j|d\}}|r| j	||d\}}n|}t
|}| j||d
\}}| j|d}||||fS )NFr   r   z Arguments ``noisy_input_signal`` and ``noisy_input_signal_length`` are mutually exclusive  with ``processed_noisy_input_signal`` and ``processed_noisy_input_signal_len`` arguments.)r"  lengthsr5   r   rW  rV  r   rX  r   )r   rJ   r>  r?  r7  r   r[  rK   r\  r=  rV   r^  r8   )r\   r   r   r   r   r  r  r  processed_noise_signal_lengthr  r  processed_noisy_input_signal#processed_noisy_input_signal_lengthrH  r   r   has_noisy_input_signal has_processed_noisy_input_signalr_  r  rP  r   r   rO  r`  ra  r0   r0   r1   r     sR   




z)EncDecDenoiseMaskedTokenPredModel.forwardr  r  c           	   	   C   sd   | j |j|j|j|j|j|jdd\}}}}| j||||d}| jj	d d | j
j|d}||dS )NTr   r   r  r  r  r  rH  rd  r   r  re  r  )r   ru  rv  rw  rx  ry  rz  r9   r
  r  r4   r  )	r\   r  r  ra  r   rO  rP  r   r  r0   r0   r1   r    s   

z/EncDecDenoiseMaskedTokenPredModel.training_stepr   rg  Tr  ri  rH  c              	   C   sL   | j |j|j|j|j|j|j|d\}}}}	| j|||	|d}
| d|
iS )Nr  rd  rh  )r   ru  rv  rw  rx  ry  rz  r9   )r\   r  r  r  ri  rH  ra  r   rO  rP  r   r0   r0   r1   rj    s   
z0EncDecDenoiseMaskedTokenPredModel.inference_passr   )NNNNNNNNNNNNF)r   rg  T)r*  r+  r,  r-  r/  ro  rG  r	   r   rG   r   r   r   r0  r   r   r   r   r   r   r{  r   r  boolrj  r1  r0   r0   r`   r1   r$   9  sP    +Tr$   )8mathr   typingr   r   r   r   r   rV   torch.nnrQ   lightning.pytorchr   	omegaconfr	   nemo.collections.asr.datar
   r   ,nemo.collections.asr.data.audio_to_text_dalir   .nemo.collections.asr.data.audio_to_text_lhotser   0nemo.collections.asr.modules.ssl_modules.maskingr   !nemo.collections.asr.parts.mixinsr   0nemo.collections.asr.parts.preprocessing.perturbr   #nemo.collections.common.data.lhotser   "nemo.collections.common.data.utilsr   3nemo.collections.common.parts.preprocessing.parsersr   nemo.core.classesr   nemo.core.classes.commonr   r   nemo.core.classes.mixinsr   r   nemo.core.neural_typesr   r   r   r   r   r   r    
nemo.utilsr!   __all__r"   r#   r$   r0   r0   r0   r1   <module>   s:   $	    6 W