o
    }oi                     @   s  d dl Z d dlZd dlZd dlmZ d dlmZmZ d dlm	Z	m
Z
 d dlmZmZmZmZmZ d dlZd dlmZ d dlmZmZmZ d dlmZ d d	lmZ d d
lmZmZ d dlm Z m!Z! d dl"m#Z#m$Z$ d dl%m&Z& d dl'm(Z(m)Z) d dl*m+Z+ d dl,m-Z- d dl.m/Z/ d dl0m1Z1m2Z2 d dl3m4Z4 d dl5m6Z6m7Z7 d dl8T d dl9m:Z:m;Z; d dl<m=Z= ddgZ>eG dd dZ?eG dd dZ@G dd de#e$e(ZAG d d de&e(ZBG d!d deAZCG d"d# d#eBZDdS )$    N)abstractmethod)	dataclassfield)ceilfloor)AnyDictListOptionalUnion)Trainer)
DictConfig
ListConfig	OmegaConf)
DataLoader)Accuracy)MeanAbsoluteErrorMeanSquaredError)audio_to_label_datasetfeature_to_label_dataset)ASRModelExportableEncDecModel)EncDecSpeakerLabelModel)TranscriptionMixinTranscriptionReturnTypeInternalTranscribeConfig)WaveformFeaturizer)process_augmentations)CrossEntropyLossMSELoss)TopKClassificationAccuracy)PretrainedModelInfo	typecheck)*)loggingmodel_utils)cast_allEncDecClassificationModelEncDecRegressionModelc                   @   <   e Zd ZU dZeed< dZeed< edd dZ	e
ed< d	S )
ClassificationInferConfig   
batch_sizeFlogprobsc                   C      t  S Nr    r1   r1   e/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/asr/models/classification_models.py<lambda>5       z"ClassificationInferConfig.<lambda>default_factory	_internalN__name__
__module____qualname__r-   int__annotations__r.   boolr   r7   r   r1   r1   r1   r2   r+   0      
 r+   c                   @   r*   )
RegressionInferConfigr,   r-   Tr.   c                   C   r/   r0   r   r1   r1   r1   r2   r3   =   r4   zRegressionInferConfig.<lambda>r5   r7   Nr8   r1   r1   r1   r2   r@   8   r?   r@   c                
       s  e Zd ZdZdAdedef fddZedd Zed	d
 Z	edd Z
edd Zedd Zedeeeef  fddZeedeeeef  fddZ	dBddZdeeeef  fddZdeeeef  fddZdCdeeeef  d efd!d"Zd#d$ Zd%efd&d'Zd%edejjjfd(d)Z e! 	*		dDd+ee"e ef d,e#d-ee$ ee% B de&f fd.d/Z'	 d0e"e d1ed2e$fd3d4Z(d5e)d2e$fd6d7Z*d2e$dee"e e"ej+ f fd8d9Z,d%edd:fd;d<Z-ed=d> Z.e/de$fd?d@Z0  Z1S )E_EncDecBaseModelz&Encoder decoder Classification models.Ncfgtrainerc                    s   d| _ |d ur|j|j | _ t|}t|}|dd| _| |j	|j
 t j||d t| jdrD| jjd urDt| jj| _nd | _t| jdr\| jjd ur\t| jj| _nd | _|  | _|  | _|  | _
|  | _|   d S )N   is_regression_taskFrB   rC   spec_augmentcrop_or_pad_augment)
world_size	num_nodesnum_devicesr&   #convert_model_config_to_dict_configmaybe_update_config_versiongetrE   _update_decoder_configlabelsdecodersuper__init__hasattr_cfgrG   r   from_config_dictspec_augmentationrH   crop_or_pad_setup_preprocessorpreprocessor_setup_encoderencoder_setup_decoder_setup_lossloss_setup_metricsselfrB   rC   	__class__r1   r2   rS   C   s&   





z_EncDecBaseModel.__init__c                 C      dS )zR
        Setup preprocessor for audio data
        Returns: Preprocessor

        Nr1   rb   r1   r1   r2   rY   d      z$_EncDecBaseModel._setup_preprocessorc                 C   re   )zX
        Setup encoder for the Encoder-Decoder network
        Returns: Encoder
        Nr1   rf   r1   r1   r2   r[   m      z_EncDecBaseModel._setup_encoderc                 C   re   )zX
        Setup decoder for the Encoder-Decoder network
        Returns: Decoder
        Nr1   rf   r1   r1   r2   r]   u   rh   z_EncDecBaseModel._setup_decoderc                 C   re   )zR
        Setup loss function for training
        Returns: Loss function

        Nr1   rf   r1   r1   r2   r^   }   rg   z_EncDecBaseModel._setup_lossc                 C   re   )zX
        Setup metrics to be tracked in addition to loss
        Returns: void

        Nr1   rf   r1   r1   r2   r`      rg   z_EncDecBaseModel._setup_metricsreturnc                 C   sf   t | jdrt| jjd}nt }td|ddttdt ddtdt ddttdt dddS )	N_sample_rate)freq)BTT)optionalrl   )rl   Drm   )input_signalinput_signal_lengthprocessed_signalprocessed_signal_length)rT   rZ   AudioSignalrj   
NeuralTypetupleLengthsTypeSpectrogramType)rb   audio_eltyper1   r1   r2   input_types   s   z_EncDecBaseModel.input_typesc                 C      d S r0   r1   rf   r1   r1   r2   output_types   s   z_EncDecBaseModel.output_typesc           
      C   s   |d uo|d u}|d uo|d u}||A dkrt |  d|s(| j||d\}}| jd ur6| j||d\}}| jd urE| jrE| j||d}| j||d\}}| j|d}	|	S )NF Arguments ``input_signal`` and ``input_signal_length`` are mutually exclusive  with ``processed_signal`` and ``processed_signal_length`` arguments.rp   length
input_specr   audio_signalr   )encoder_output)
ValueErrorrZ   rX   rW   trainingr\   rQ   
rb   rp   rq   rr   rs   has_input_signalhas_processed_signalencodedencoded_lenlogitsr1   r1   r2   forward   s(   


z_EncDecBaseModel.forwardtrain_data_configc                 C   s   d|vrd|d< | j d|d | jt|d| _| jd urMt| jdrOt| jjtjj	j
rQt| jjtrSt| jjtt| jj| j |d   | j_d S d S d S d S d S )NshuffleTtraindataset_nameconfigr   datasetr-   )_update_dataset_config_setup_dataloader_from_configr   	_train_dlrT   
isinstancer   torchutilsdataIterableDataset_trainerlimit_train_batchesfloatr<   r   lenrI   )rb   r   r1   r1   r2   setup_training_data   s"   

z$_EncDecBaseModel.setup_training_dataval_data_configc                 C   s4   d|vrd|d< | j d|d | jt|d| _d S )Nr   F
validationr   r   )r   r   r   _validation_dl)rb   r   r1   r1   r2   setup_validation_data   s   z&_EncDecBaseModel.setup_validation_dataFtest_data_configuse_featc                 C   X   d|vrd|d< | j d|d |r!t| dr!| jt|d| _d S | jt|d| _d S Nr   Ftestr   _setup_feature_label_dataloaderr   r   rT   r   r   _test_dlr   rb   r   r   r1   r1   r2   setup_test_data      z _EncDecBaseModel.setup_test_datac                 C   s   | j d ur| j S d S r0   )r   rf   r1   r1   r2   test_dataloader   s   
z _EncDecBaseModel.test_dataloaderr   c           	      C   .  t |d | j|_t |d d|v rt|d }nd }t|d |dd|d}|d }|ddrd	|v r>|d	 d u sHd
|v rR|d
 d u rRtd|  d S d|v ra|d ratd d S |rm|dd|d  nd}tj	|||| j
| jd}d}|d }t|dr|j}nst|jd dr|jd j}nd|jd jd j}nZd
|v r|d
 d u rtd|  d S d|v r|d rtd tj||d}d}|j}n,tj||d}|d }t|dr|j}nt|jd dr|jd j}n	|jd jd j}tjjj||||dd||dd|dddS NFT	augmentorsample_rate
int_values)r   r   r   r   	is_tarredtarred_audio_filepathsmanifest_filepathzmCould not load dataset as `manifest_filepath` is None or `tarred_audio_filepaths` is None. Provided config : 
vad_streamz1VAD inference does not support tarred dataset now	shuffle_nr,   r-   r   )
featurizerr   r   global_rankrI   
collate_fnICould not load dataset as `manifest_filepath` is None. Provided config : z!Perform streaming frame-level VAD)r   r   rD   	drop_lastnum_workers
pin_memoryr   r-   r   r   r   r   r   r   
set_structrE   r   r   rN   r%   warningr   'get_tarred_classification_label_datasetr   rI   rT   r   datasetsinfoget_speech_label_datasetvad_frame_seq_collate_fn get_classification_label_datasetr   r   r   r   	rb   r   r   r   r   r   r   r-   r   r1   r1   r2   r      sz   






z._EncDecBaseModel._setup_dataloader_from_configc                 C      t |d | j|_t |d d|v rt|d }nd}d|v r1|d du r1td|  dS tj||d}d|v rH|d rH|j}d	}d}n|j	}|d
 }|d }t
jjj||||dd||dd|dddS Q
        setup dataloader for VAD inference with audio features as input
        FTr   Nr   r   r   r   r   rD   r-   r   r   r   r   r   r   r   r   rE   r   r%   r   r   get_feature_label_dataset_vad_segment_collate_fn_collate_fnr   r   r   r   rN   rb   r   r   r   collate_funcr-   r   r1   r1   r2   r   A  4   


z0_EncDecBaseModel._setup_feature_label_dataloaderr,   audior-   override_configc                    x   |du r| j }|du r| j st||d}nt||d}nt|ts2t|ts2tdt dt| |}t j||dS a  
        Generate class labels for provided audio files. Use this method for debugging and prototyping.

        Args:
            audio: (a single or list) of paths to audio files or a np.ndarray audio array.
                Can also be a dataloader object that provides values that can be consumed by the model.
                Recommended length per file is approximately 1 second.
            batch_size: (int) batch size to use during inference.                 Bigger will result in better throughput performance but would use more memory.
            logprobs: (bool) pass True to get log probabilities instead of class labels.
            override_config: (Optional) ClassificationInferConfig to use for this inference call.
                If None, will use the default config.

        Returns:

            A list of transcriptions (or raw log probabilities if logprobs is True) in the same order as paths2audio_files
        Nr-   r.    override_config must be of type 
, but got )r   r   rE   r+   r@   r   r   typerR   
transcriberb   r   r-   r.   r   trcfgrc   r1   r2   r   f     z_EncDecBaseModel.transcribeaudio_filestemp_dirr   c           	      C      t tj|dddd(}|D ]}| jrdn| jjd }|d|d}|t	|d	  qW d    n1 s7w   Y  ||j
|d
}|S Nmanifest.jsonwzutf-8)encodingg        r   g     j@)audio_filepathdurationlabel
)paths2audio_filesr-   r   openospathjoinrE   rB   rP   writejsondumpsr-   	rb   r   r   r   fp
audio_filer   entryr   r1   r1   r2   %_transcribe_input_manifest_processing     z6_EncDecBaseModel._transcribe_input_manifest_processingbatchc                 C   $   | j |d |d d}t|d}|S Nr   rD   rp   rq   )r   r   dictrb   r  r   r   outputr1   r1   r2   _transcribe_forward     
z$_EncDecBaseModel._transcribe_forwardc                 C      | d}g }|jr#t|jd D ]}|| }||   q|S g }| jj}|D ]}	|	| j_| j	|}
|
 }
||
 q+t
|dkrK|d }||7 }|| j_|S Nr   r   rD   popr.   rangeshapeappendcpunumpy	_accuracytop_ktop_k_predicted_labelsr   rb   outputsr   r   rP   idxlglabels_ktop_kstop_k_i
labels_k_ir1   r1   r2   _transcribe_output_processing  &   
z._EncDecBaseModel._transcribe_output_processingtorch.utils.data.DataLoaderc                 C   N   t j|d d| jj| jjt|d t|d ddd}| j	t
|d}|S a  
        Setup function for a temporary data loader which wraps the provided audio file.

        Args:
            config: A python dictionary which contains the following keys:

        Returns:
            A pytorch DataLoader for the given audio file(s).
        r   r   r-   r   F)r   r   rP   r-   trim_silencer   r   r   r   r   rZ   rj   rB   rP   minr   r   r   rb   r   	dl_configtemporary_datalayerr1   r1   r2   _setup_transcribe_dataloader     	z-_EncDecBaseModel._setup_transcribe_dataloaderc                 C   r{   r0   r1   rb   rP   rB   r1   r1   r2   rO     s   z'_EncDecBaseModel._update_decoder_configc                 C   r/   )z
        Utility method that returns the default config for transcribe() function.
        Returns:
            A dataclass
        )r+   )clsr1   r1   r2   get_transcribe_config  s   z&_EncDecBaseModel.get_transcribe_configr0   NNNNFr,   NN)2r9   r:   r;   __doc__r   r   rS   r   rY   r[   r]   r^   r`   propertyr
   r   strru   rz   r|   r   r   r   r   r>   r   r   r   r   r   r   r   r   no_gradr	   r<   r+   r@   r   r   r   r   r	  Tensorr  r*  rO   classmethodr.  __classcell__r1   r1   rc   r2   rA   @   st    !





 	R%+


rA   c                
       s`  e Zd Zd/deeeef  defddZdede	j
jjfdd	Zdefd
dZdd Zdd Zd0dedef fddZdee fddZedeee  fddZdeddfddZe	 			d1deee ef ded ee ee B def fd!d"Z	 d#ee d$ed%efd&d'Z d(e!d%efd)d*Z"d%edeee ee	j# f fd+d,Z$ fd-d.Z%  Z&S )2r(   Fr   r   c                 C   r   r   r   r   r1   r1   r2   r     r   z)EncDecClassificationModel.setup_test_datar   ri   c                 C   r   r   r   r   r1   r1   r2   r     r   z9EncDecClassificationModel._setup_feature_label_dataloaderc           	      C   r   r   r   r   r1   r1   r2   r     sz   






z7EncDecClassificationModel._setup_dataloader_from_configc                 C   s$   | j ||d\}}| j||d}|S )Nr   )r   r   )r\   rQ   )rb   r   r   r   r   r1   r1   r2   forward_for_exportl  s   z,EncDecClassificationModel.forward_for_exportc                 C   s:   t |d d|v rt||j_t||_t |d dS )z
        Update the number of classes in the decoder based on labels provided.

        Args:
            labels: The current labels of the model
            cfg: The config of the decoder which will be updated.
        FparamsTN)r   r   r   r:  num_classesr,  r1   r1   r2   rO   q  s
   
z0EncDecClassificationModel._update_decoder_configNrB   rC   c                    s|   t d | |j|j t|dr|jd ur|j| _nd| _t || t|dr9|j	d ur9t
|j	| _d S d | _d S )NzPlease use the EncDecSpeakerLabelModel instead of this model. EncDecClassificationModel model is kept for backward compatibility with older models.rE   FrH   )r%   r   rO   rP   rQ   rT   rE   rR   rS   rH   r   rV   rX   ra   rc   r1   r2   rS     s   

z"EncDecClassificationModel.__init__
new_labelsc                 C   sB  |durt |tst|}| jj|kr#td| jj d| d dS |du s-t|dkr4td| || j_| j	 }t
|}| || | `t|| _t| jjd || j_t| jjd d	| jv rt| jjdurt|| jj_d
| jv r| jjdur|| jj_d| jv r| jjdur|| jj_td| jj d dS )aZ  
        Changes labels used by the decoder model. Use this method when fine-tuning on from pre-trained model.
        This method changes only decoder and leaves encoder and pre-processing modules unchanged. For example, you would
        use it if you want to use pretrained encoder when fine-tuning on a data in another dataset.

        If new_labels == self.decoder.vocabulary then nothing will be changed.

        Args:

            new_labels: list with new labels. Must contain at least 2 elements. Typically,             this is set of labels for the dataset.

        Returns: None

        NzOld labels (z) and new labels (z) match. Not changing anythingr   z8New labels must be non-empty list of labels. But I got: FTtrain_dsvalidation_dstest_dszChanged decoder output to z labels.)r   r   rU   rP   r%   r   r   r   rQ   to_config_dictcopydeepcopyrO   r(   rV   r   r   r=  r>  r?  r   r;  )rb   r<  decoder_confignew_decoder_configr1   r1   r2   change_labels  s0   




z'EncDecClassificationModel.change_labelsc                 C   s   g }t dddd}|| t dddd}|| t dd	d
d}|| t dddd}|| t dddd}|| t dddd}|| t dddd}|| t dddd}|| t dddd}|| |S )z
        This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud.

        Returns:
            List of available pre-trained models.
        vad_multilingual_marblenetzFor details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/vad_multilingual_marblenetzhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/vad_multilingual_marblenet/versions/1.10.0/files/vad_multilingual_marblenet.nemopretrained_model_namedescriptionlocationvad_telephony_marblenetztFor details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:vad_telephony_marblenetz}https://api.ngc.nvidia.com/v2/models/nvidia/nemo/vad_telephony_marblenet/versions/1.0.0rc1/files/vad_telephony_marblenet.nemovad_marblenetzjFor details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:vad_marblenetzihttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/vad_marblenet/versions/1.0.0rc1/files/vad_marblenet.nemo*commandrecognition_en_matchboxnet3x1x64_v1zFor details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:commandrecognition_en_matchboxnet3x1x64_v1zhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/commandrecognition_en_matchboxnet3x1x64_v1/versions/1.0.0rc1/files/commandrecognition_en_matchboxnet3x1x64_v1.nemo*commandrecognition_en_matchboxnet3x2x64_v1zFor details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:commandrecognition_en_matchboxnet3x2x64_v1zhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/commandrecognition_en_matchboxnet3x2x64_v1/versions/1.0.0rc1/files/commandrecognition_en_matchboxnet3x2x64_v1.nemo*commandrecognition_en_matchboxnet3x1x64_v2zFor details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:commandrecognition_en_matchboxnet3x1x64_v2zhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/commandrecognition_en_matchboxnet3x1x64_v2/versions/1.0.0rc1/files/commandrecognition_en_matchboxnet3x1x64_v2.nemo*commandrecognition_en_matchboxnet3x2x64_v2zFor details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:commandrecognition_en_matchboxnet3x2x64_v2zhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/commandrecognition_en_matchboxnet3x2x64_v2/versions/1.0.0rc1/files/commandrecognition_en_matchboxnet3x2x64_v2.nemo6commandrecognition_en_matchboxnet3x1x64_v2_subset_taskzFor details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:commandrecognition_en_matchboxnet3x1x64_v2_subset_taskzhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/commandrecognition_en_matchboxnet3x1x64_v2_subset_task/versions/1.0.0rc1/files/commandrecognition_en_matchboxnet3x1x64_v2_subset_task.nemo6commandrecognition_en_matchboxnet3x2x64_v2_subset_taskzFor details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:commandrecognition_en_matchboxnet3x2x64_v2_subset_taskzhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/commandrecognition_en_matchboxnet3x2x64_v2_subset_task/versions/1.0.0rc1/files/commandrecognition_en_matchboxnet3x2x64_v2_subset_task.nemor"   r  r-  resultsmodelr1   r1   r2   list_available_models  sp   








z/EncDecClassificationModel.list_available_modelsr!  c                 C   r"  r#  r%  r'  r1   r1   r2   r*    r+  z6EncDecClassificationModel._setup_transcribe_dataloaderr,   r   r-   r   c                    r   r   r   r   rc   r1   r2   r   "  r   z$EncDecClassificationModel.transcriber   r   r   c           	      C   r   r   r   r   r1   r1   r2   r   P  r   z?EncDecClassificationModel._transcribe_input_manifest_processingr  c                 C   r  r  r  r  r1   r1   r2   r	  \  r
  z-EncDecClassificationModel._transcribe_forwardc                 C   r  r  r  r  r1   r1   r2   r  a  r   z7EncDecClassificationModel._transcribe_output_processingc                    s   t  ||\}}|S r0   )rR   r   )rb   rp   rq   r   _rc   r1   r2   r     s   z!EncDecClassificationModel.forwardr0  r0   r1  )'r9   r:   r;   r
   r   r   r   r>   r   r   r   r   r   r   r   r9  rO   r   rS   r	   r4  rE  r7  r"   rW  r*  r5  r<   r+   r@   r   r   r   r   r	  r6  r  r   r8  r1   r1   rc   r2   r(     sN     %Q3I+

c                       s   e Zd ZdZedee fddZd,dede	f fdd	Z
d
d Zdd Zdd Zdd Zdd Zedeeeef  fddZe  fddZdd Zd-defddZd-defddZd-defd d!Zd-defd"d#Ze 	d.d%ee d&ed'ee  dee! f fd(d)Z"d*d+ Z#  Z$S )/r)   zEncoder decoder class for speech regression models.
    Model class creates training, validation methods for setting up data
    performing model forward pass.
    ri   c                 C   s   g }|S )z
        This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud.
        Returns:
            List of available pre-trained models.
        r1   )r-  resultr1   r1   r2   rW    s   z+EncDecRegressionModel.list_available_modelsNrB   rC   c                    s(   | dds
tdt j||d d S )NrE   FzLEndDecRegressionModel requires the flag is_regression_task to be set as truerF   )rN   r   rR   rS   ra   rc   r1   r2   rS     s   zEncDecRegressionModel.__init__c                 C      t | jjS r0   )r)   rV   rU   rZ   rf   r1   r1   r2   rY        z)EncDecRegressionModel._setup_preprocessorc                 C   rZ  r0   )r)   rV   rU   r\   rf   r1   r1   r2   r[     r[  z$EncDecRegressionModel._setup_encoderc                 C   rZ  r0   )r)   rV   rU   rQ   rf   r1   r1   r2   r]     r[  z$EncDecRegressionModel._setup_decoderc                 C   r/   r0   )r    rf   r1   r1   r2   r^     s   z!EncDecRegressionModel._setup_lossc                 C   s   t  | _t | _d S r0   )r   _mser   _maerf   r1   r1   r2   r`     s   z$EncDecRegressionModel._setup_metricsc                 C   s   dt tdt iS )Npredsrl   )ru   rv   RegressionValuesTyperf   r1   r1   r2   r|     s   z"EncDecRegressionModel.output_typesc                    s   t  j||d}|dS )Nr  )rR   r   view)rb   rp   rq   r   rc   r1   r2   r     s   
zEncDecRegressionModel.forwardc                 C   sl   |\}}}}| j ||d}| j||d}| j||d}	| j||d}
| ||	|
| jjd d d d|iS )Nr  r^  rP   r^  targetr   lr)
train_loss	train_mse	train_maelearning_rater_   )r   r_   r\  r]  log_dict
_optimizerparam_groups)rb   r  	batch_idxr   audio_signal_lentargetstargets_lenr   r_   rg  rh  r1   r1   r2   training_step  s   	z#EncDecRegressionModel.training_stepr   dataloader_idxc                 C   sP   |\}}}}| j ||d}| j||d}	| j||d}
| j||d}|	|
|dS )Nr  rb  rc  val_lossval_mseval_mae)r   r_   r\  r]  )rb   r  rm  rr  r   rn  ro  rp  r   
loss_valueru  rv  r1   r1   r2   validation_step  s   z%EncDecRegressionModel.validation_stepc                 C   s&   |  |||}|d |d |d dS )Nrt  test_mserv  	test_lossry  test_maerx  )rb   r  rm  rr  logsr1   r1   r2   	test_step  s   zEncDecRegressionModel.test_stepc                 C   Z   t dd |D  }| j }| j  | j }| j  |||d}||||dS )Nc                 S      g | ]}|d  qS )rt  r1   .0xr1   r1   r2   
<listcomp>      zDEncDecRegressionModel.multi_validation_epoch_end.<locals>.<listcomp>rs  )rt  ru  rv  logr   stackmeanr\  computeresetr]  )rb   r  rr  val_loss_meanru  rv  tensorboard_logsr1   r1   r2   multi_validation_epoch_end     



z0EncDecRegressionModel.multi_validation_epoch_endc                 C   r  )Nc                 S   r  )r{  r1   r  r1   r1   r2   r    r  z>EncDecRegressionModel.multi_test_epoch_end.<locals>.<listcomp>rz  )r{  ry  r|  r  r  )rb   r  rr  test_loss_meanry  r|  r  r1   r1   r2   multi_test_epoch_end  r  z*EncDecRegressionModel.multi_test_epoch_endr,   r   r-   r   c                    sZ   |du rt |dd}nt|t stdt  dt| |}t j||d}dd |D S )	a  
        Generate class labels for provided audio files. Use this method for debugging and prototyping.

        Args:
            paths2audio_files: (a list) of paths to audio files.                 Recommended length per file is approximately 1 second.
            batch_size: (int) batch size to use during inference.                 Bigger will result in better throughput performance but would use more memory.

        Returns:

            A list of predictions in the same order as paths2audio_files
        NTr   r   r   )r   c                 S   s   g | ]}t |qS r1   )r   )r  predr1   r1   r2   r    r  z4EncDecRegressionModel.transcribe.<locals>.<listcomp>)r@   r   r   r   rR   r   )rb   r   r-   r   r   predictionsrc   r1   r2   r     s   
z EncDecRegressionModel.transcribec                 C   s4   t |d d|v rd|j_nd|_t |d d S )NFr:  rD   T)r   r   r:  r;  r,  r1   r1   r2   rO     s
   
z,EncDecRegressionModel._update_decoder_configr0   r   )r,   N)%r9   r:   r;   r2  r7  r	   r"   rW  r   r   rS   rY   r[   r]   r^   r`   r3  r
   r   r4  ru   r|   r#   r   rq  r<   rx  r  r  r  r   r5  r@   r   r   rO   r8  r1   r1   rc   r2   r)     s>    
	c                       s  e Zd Zedeeeef  fddZd,de	de
f fddZedeee  fd	d
Zdd Zdd Zde	fddZde	dejjjfddZdd Ze 	d-ddZdd Zd.dedefddZd.dedefd d!Zd/d"d#Zd/defd$d%Z d&d' Z!d(d) Z"	d-d*d+Z#  Z$S )0EncDecFrameClassificationModelri   c                 C   s   dt dt iS )Nr  )rl   rm   C)ru   
LogitsTyperf   r1   r1   r2   r|     s   z+EncDecFrameClassificationModel.output_typesNrB   rC   c                    sH   t |j| _d| _|dd| _t j||d | j| j	_| j| j	_
d S )Nr   ratio_thresholdg?rF   )r   rP   r;  eval_loop_cntrN   r  rR   rS   r|   rQ   output_types_for_exportra   rc   r1   r2   rS     s   
z'EncDecFrameClassificationModel.__init__c                 C   s    g }t dddd}|| |S )N vad_multilingual_frame_marblenetzFor details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/vad_multilingual_frame_marblenetzhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/vad_multilingual_frame_marblenet/versions/1.20.0/files/vad_multilingual_frame_marblenet.nemorG  rS  rT  r1   r1   r2   rW  !  s   
z4EncDecFrameClassificationModel.list_available_modelsc                 C   s"   t dd| _t| jddd| _d S )NT)dist_sync_on_stepmacro
multiclass)r;  averagetask)r!   r  r   r;  _macro_accuracyrf   r1   r1   r2   r`   ,  s   z-EncDecFrameClassificationModel._setup_metricsc                 C   sX   d| j v r | j jdd }|dv rdg| j }td|  ndg| j }td|dS )Nr_   weight)NnoneNoneg      ?z"Using cross-entropy with weights:    )logits_ndimr  )rB   r_   rN   r;  r%   r   r   )rb   r  r1   r1   r2   r^   0  s   
z*EncDecFrameClassificationModel._setup_lossr   c                 C   s2  t |d | j|_t |d |dd}|ddrbd|v r&|d d u s0d|v r7|d d u r7td| |rC|dd	|d
  nd}tj||| j| jd}d}t	|dr[|j
}n |jd j
}nd|v rs|d d u rstd| t|}|j
}tjjj||d
d||dd||dd|dddS )NFTr   r   r   r   zjCould not load dataset as `manifest_filepath` is None or `tarred_audio_filepaths` is None. Provided cfg : r   r,   r-   r   )rB   r   r   rI   r   zFCould not load dataset as `manifest_filepath` is None. Provided cfg : rD   r   r   r   r   )r   r   rE   rN   r   r   $get_tarred_audio_multi_label_datasetr   rI   rT   r   r   get_audio_multi_label_datasetr   r   r   r   )rb   r   r   r   r   r   r1   r1   r2   r   :  sH   





z<EncDecFrameClassificationModel._setup_dataloader_from_configc                 C   s   t |d | j|_t |d d|v rt|d }nd}d|v r1|d du r1td|  dS tj||d}tj	j
j||dd	|j|d
d|dd|dd|dddS )r   FTr   Nr   r   r   r-   rD   r   r   r   r   r   r   )r   r   rE   r   r%   r   r   get_feature_multi_label_datasetr   r   r   r   rN   r   )rb   r   r   r   r1   r1   r2   r   e  s&   




z>EncDecFrameClassificationModel._setup_feature_label_dataloaderc                 C   sD   t |dd d d f |j|d d d f k }|j|jtdS )NrD   dtype)r   arangesizetodevicer>   )rb   rP   
labels_lenmaskr1   r1   r2   get_label_masks  s   4z.EncDecFrameClassificationModel.get_label_masksc           
      C   s   |d uo|d u}|d uo|d u}||A dkrt |  d|s(| j||d\}}| jd ur6| j||d\}}| jd urE| jrE| j||d}| j||d\}}| |dd}	|	S )NFr}   r~   r   r   rD      )r   rZ   rX   rW   r   r\   rQ   	transposer   r1   r1   r2   r     s(   


z&EncDecFrameClassificationModel.forwardc                 C   s   |\}}}}| j ||d}| ||||\}}| ||}| j|||d}	|	| jjd d tj| jj	tj
dd}
| |||\}}| j||d | j }| j  t| jj|D ]\}}||
d| < qZ|	|
d	S )
Nr  r   rP   	loss_maskr   re  r  )rf  ri  global_stepr   rP   ztraining_batch_accuracy_top@)r_   r  )r   reshape_labelsr  r_   rk  rl  r   tensorrC   r  float32get_metric_logits_labelsr  r  r  zipr  )rb   r  rm  r   rn  rP   r  r   masksrw  r  metric_logitsmetric_labelstopk_scoresr  scorer1   r1   r2   rq    s    


z,EncDecFrameClassificationModel.training_stepr   valrr  tagc              
   C   s   |\}}}}| j ||d}	| |	|||\}}| ||}
| j|	||
d}| |	||
\}}| j||d}| jj| jj}}| jj	||d | j
 }| d|| d|| d|| d|| d	|iS )
Nr  r  r  rc  _loss_correct_counts_total_counts
_acc_micro
_acc_stats)r   r  r  r_   r  r  correct_counts_ktotal_counts_kr  update_final_state)rb   r  rm  rr  r  r   rn  rP   r  r   r  rw  r  r  acccorrect_countstotal_countsstatsr1   r1   r2   rx    s    





z.EncDecFrameClassificationModel.validation_stepc                    sx  t  fdd|D  }t  fdd|D jdd}t  fdd|D jdd}|| j_|| j_| j }t  fdd|D jdd| j_	t  fdd|D jdd| j_
t  fd	d|D jdd| j_t  fd
d|D jdd| j_| j }| j  | j    d|  d|i}	t| jj|D ]\}
}||	  d|
 < q| j|	dd |	S )Nc                       g | ]	}|  d  qS )r  r1   r  r  r1   r2   r        zMEncDecFrameClassificationModel.multi_validation_epoch_end.<locals>.<listcomp>c                    r  )r  r1   r  r  r1   r2   r    r  r   )axisc                    r  )r  r1   r  r  r1   r2   r    r  c                       g | ]}|  d  d qS )r  r   r1   r  r  r1   r2   r        c                    r  )r  rD   r1   r  r  r1   r2   r    r  c                    r  )r  r  r1   r  r  r1   r2   r    r  c                    r  )r  r  r1   r  r  r1   r2   r    r  r  
_acc_macroz_acc_micro_top@T)	sync_dist)r   r  r  sumr  r  r  r  r  tpr   tnfnr  r  r  rj  )rb   r  rr  r  r  r  r  r  macro_accuracy_scoretensorboard_logr  r  r1   r  r2   r    s(     
$$$$




z9EncDecFrameClassificationModel.multi_validation_epoch_endc                 C   s   | j |||ddS Nr   r  r}  )rb   r  rm  rr  r1   r1   r2   r    s   z(EncDecFrameClassificationModel.test_stepc                 C   s   | j ||ddS r  )r  )rb   r  rr  r1   r1   r2   r    s   z3EncDecFrameClassificationModel.multi_test_epoch_endc                 C   s  | d}| d}| d}||k r|| }|| }	t|| | jk rc|  }t|t| dkrE|dgt|t|t|   7 }t| 	|j
}|dt|d}| ||||S |	dkr|ddd|	 f }|||	 k}
||
|||	    }|||dd}tj||dd}tjtj|dddf |dddf gddddd }| | fS ||krB|| }|| }	t|| | jk r|jt|dd }|ddd|f }|t| }||k}
||
||   }n(|jt|dd }|t| }|	dkrtj||dd|	 df gdd}tjtj|dddf |dddf gddddd }| | fS tjtj|dddf |dddf gddddd }||fS )a  
        Reshape labels to match logits shape. For example, each label is expected to cover a 40ms frame, while each frme prediction from the
        model covers 20ms. If labels are shorter than logits, labels are repeated, otherwise labels are folded and argmax is applied to obtain
        the label of each frame. When lengths of labels and logits are not factors of each other, labels are truncated or padded with zeros.
        The ratio_threshold=0.2 is used to determine whether to pad or truncate labels, where the value 0.2 is not important as in real cases the ratio
        is very close to either ceil(ratio) or floor(ratio). We use 0.2 here for easier unit-testing. This implementation does not allow frame length
        and label length that are not multiples of each other.
        Args:
            logits: logits tensor with shape [B, T1, C]
            labels: labels tensor with shape [B, T2]
            logits_len: logits length tensor with shape [B]
            labels_len: labels length tensor with shape [B]
        Returns:
            labels: labels tensor with shape [B, T1]
            labels_len: labels length tensor with shape [B]
        rD   r   r`  Nr   )rounding_mode)dim)r  r   r  r  tolistr   r   r  longr  r  ra  amaxr  divr&  cat
contiguousrepeat_interleaver   )rb   r   rP   
logits_lenr  logits_max_lenlabels_max_lenr-   ratioresr  r1   r1   r2   r    sL   


"8

$88z-EncDecFrameClassificationModel.reshape_labelsc                 C   sh   | d}|d|}|d }|d}| }|jd|ddd}|jd|dd}||fS )a+  
        Computes valid logits and labels for metric computation.
        Args:
           logits: tensor of shape [B, T, C]
           labels: tensor of shape [B, T]
           masks: tensor of shape [B, T]
        Returns:
           logits of shape [N, C]
           labels of shape [N,]
        r  r`  r   rD   )r  index)r  ra  r  nonzerogatherrepeat)rb   r   rP   r  r  r  r1   r1   r2   r  8  s   

z7EncDecFrameClassificationModel.get_metric_logits_labelsc           
      C   s   t | jd| jj}|du r|||d}t|tr|d }n||||||d\}}}}}t | jd| jj}||ddd}	t|	trF|	d }	|durQ|	||||f}	t|	tj	tj
d	S )
at  
        This forward is used when we need to export the model to ONNX format.
        Inputs cache_last_channel and cache_last_time are needed to be passed for exporting streaming models.
        Args:
            input: Tensor that represents a batch of raw audio signals,
                of shape [B, T]. T here represents timesteps.
            length: Vector of length B, that contains the individual lengths of the audio sequences.
            cache_last_channel: Tensor of shape [N, B, T, H] which contains the cache for last channel layers
            cache_last_time: Tensor of shape [N, B, H, T] which contains the cache for last time layers
                N is the number of such layers which need caching, B is batch size, H is the hidden size of activations,
                and T is the length of the cache

        Returns:
            the output of the model
        r9  Nr   r   )r   r   cache_last_channelcache_last_timecache_last_channel_lenrD   r  )hidden_states)
from_dtypeto_dtype)getattrinput_moduler   r   rv   output_moduler  r'   r   float16r  )
rb   inputr   r  r  r  enc_funr   dec_funretr1   r1   r2   r9  N  s(   

z1EncDecFrameClassificationModel.forward_for_exportr0   r/  )r   r  r  )%r9   r:   r;   r3  r
   r   r4  ru   r|   r   r   rS   r7  r	   r"   rW  r`   r^   r   r   r   r   r   r   r  r#   r   rq  r<   rx  r  r  r  r  r  r9  r8  r1   r1   rc   r2   r    s.    

+
@r  )ErA  r   r   abcr   dataclassesr   r   mathr   r   typingr   r   r	   r
   r   r   lightning.pytorchr   	omegaconfr   r   r   torch.utils.datar   torchmetricsr   torchmetrics.regressionr   r   nemo.collections.asr.datar   r   %nemo.collections.asr.models.asr_modelr   r   (nemo.collections.asr.models.label_modelsr   !nemo.collections.asr.parts.mixinsr   r   /nemo.collections.asr.parts.mixins.transcriptionr   1nemo.collections.asr.parts.preprocessing.featuresr   0nemo.collections.asr.parts.preprocessing.perturbr   nemo.collections.common.lossesr   r    nemo.collections.common.metricsr!   nemo.core.classes.commonr"   r#   nemo.core.neural_types
nemo.utilsr%   r&   nemo.utils.cast_utilsr'   __all__r+   r@   rA   r(   r)   r  r1   r1   r1   r2   <module>   sT      +     