import os
from math import ceil
from typing import Any, Dict, List, Optional, Union

import torch
from omegaconf import DictConfig, OmegaConf, open_dict
from torch.utils.data import DataLoader

from nemo.collections.asr.data import audio_to_text_dataset
from nemo.collections.asr.data.audio_to_text_dali import DALIOutputs
from nemo.collections.asr.metrics.wer import WER
from nemo.collections.asr.models.asr_model import ASRModel, ExportableEncDecModel
from nemo.collections.asr.parts.mixins import (
    ASRBPEMixin,
    ASRModuleMixin,
    ASRTranscriptionMixin,
    TranscribeConfig,
    TranscriptionReturnType,
)
from nemo.collections.asr.parts.preprocessing.perturb import process_augmentations
from nemo.collections.asr.parts.submodules.ctc_decoding import CTCBPEDecoding, CTCBPEDecodingConfig
from nemo.collections.asr.parts.utils.slu_utils import SequenceGenerator, SequenceGeneratorConfig, get_seq_mask
from nemo.collections.common.losses import SmoothedNLLLoss
from nemo.core.classes.common import PretrainedModelInfo, typecheck
from nemo.core.neural_types import AudioSignal, LabelsType, LengthsType, LogprobsType, NeuralType, SpectrogramType
from nemo.utils import logging, model_utils

__all__ = ['SLUIntentSlotBPEModel']


class SLUIntentSlotBPEModel(ASRModel, ExportableEncDecModel, ASRBPEMixin, ASRModuleMixin, ASRTranscriptionMixin):
    """Model for end-to-end speech intent classification and slot filling, which is formulated as a speech-to-sequence task"""

    def __init__(self, cfg: DictConfig, trainer=None):
        # Convert to a DictConfig and update the config version if needed.
        cfg = model_utils.convert_model_config_to_dict_config(cfg)
        cfg = model_utils.maybe_update_config_version(cfg)

        if 'tokenizer' not in cfg:
            raise ValueError("`cfg` must have `tokenizer` config to create a tokenizer !")

        # Setup the tokenizer before the rest of the model is constructed.
        self._setup_tokenizer(cfg.tokenizer)

        super().__init__(cfg=cfg, trainer=trainer)

        # Feature extractor, acoustic encoder and semantic decoder.
        self.preprocessor = self.from_config_dict(self.cfg.preprocessor)
        self.encoder = self.from_config_dict(self.cfg.encoder)
        self.decoder = self.from_config_dict(self.cfg.decoder)

        if hasattr(self._cfg, 'spec_augment') and self._cfg.spec_augment is not None:
            self.spec_augmentation = self.from_config_dict(self._cfg.spec_augment)
        else:
            self.spec_augmentation = None

        # Setup optional optimization flags and adapter support.
        self.setup_optimization_flags()
        self.setup_adapters()

        # Token embedding and classifier are sized from the tokenizer vocabulary.
        self.vocabulary = self.tokenizer.tokenizer.get_vocab()
        vocab_size = len(self.vocabulary)

        self.cfg.embedding['vocab_size'] = vocab_size
        self.embedding = self.from_config_dict(self.cfg.embedding)

        self.cfg.classifier['num_classes'] = vocab_size
        self.classifier = self.from_config_dict(self.cfg.classifier)

        self.loss = SmoothedNLLLoss(label_smoothing=self.cfg.loss.label_smoothing)

        self.sequence_generator = SequenceGenerator(
            cfg=self.cfg.sequence_generator,
            embedding=self.embedding,
            decoder=self.decoder,
            log_softmax=self.classifier,
            tokenizer=self.tokenizer,
        )

        # Setup decoding object for the WER metric; fall back to the default config if none is given.
        decoding_cfg = self.cfg.get('decoding', None)
        if decoding_cfg is None:
            decoding_cfg = OmegaConf.structured(CTCBPEDecodingConfig)
            with open_dict(self.cfg):
                self.cfg.decoding = decoding_cfg

        self.decoding = CTCBPEDecoding(self.cfg.decoding, tokenizer=self.tokenizer)
        self.wer = WER(
            decoding=self.decoding,
            use_cer=self._cfg.get('use_cer', False),
            dist_sync_on_step=True,
            log_prediction=self._cfg.get('log_prediction', False),
            fold_consecutive=False,
        )

    @property
    def input_types(self) -> Optional[Dict[str, NeuralType]]:
        if hasattr(self.preprocessor, '_sample_rate'):
            input_signal_eltype = AudioSignal(freq=self.preprocessor._sample_rate)
        else:
            input_signal_eltype = AudioSignal()
        return {
            "input_signal": NeuralType(('B', 'T'), input_signal_eltype, optional=True),
            "input_signal_length": NeuralType(tuple('B'), LengthsType(), optional=True),
            "target_semantics": NeuralType(('B', 'T'), LabelsType(), optional=True),
            "target_semantics_length": NeuralType(tuple('B'), LengthsType(), optional=True),
            "processed_signal": NeuralType(('B', 'D', 'T'), SpectrogramType(), optional=True),
            "processed_signal_length": NeuralType(tuple('B'), LengthsType(), optional=True),
            "sample_id": NeuralType(tuple('B'), LengthsType(), optional=True),
        }

    @property
    def output_types(self) -> Optional[Dict[str, NeuralType]]:
        return {
            "log_probs": NeuralType(('B', 'T', 'D'), LogprobsType()),
            "lengths": NeuralType(tuple('B'), LengthsType()),
            "greedy_predictions": NeuralType(('B', 'T'), LabelsType()),
        }

    def set_decoding_strategy(self, cfg: SequenceGeneratorConfig):
        # Keep the maximum generation length of the existing generator when swapping strategies.
        cfg.max_sequence_length = self.sequence_generator.generator.max_seq_length
        self.sequence_generator = SequenceGenerator(cfg, self.embedding, self.decoder, self.classifier, self.tokenizer)

    @typecheck()
    def forward(
        self,
        input_signal=None,
        input_signal_length=None,
        target_semantics=None,
        target_semantics_length=None,
        processed_signal=None,
        processed_signal_length=None,
    ):
        """
        Forward pass of the model.

        Params:
            input_signal: Tensor that represents a batch of raw audio signals, of shape [B, T]. T here represents
                timesteps, with 1 second of audio represented as `self.sample_rate` number of floating point values.

            input_signal_length: Vector of length B, that contains the individual lengths of the audio sequences.

            target_semantics: Tensor that represents a batch of semantic tokens, of shape [B, L].

            target_semantics_length: Vector of length B, that contains the individual lengths of the semantic sequences.

            processed_signal: Tensor that represents a batch of processed audio signals, of shape (B, D, T) that has
                undergone processing via some DALI preprocessor.

            processed_signal_length: Vector of length B, that contains the individual lengths of the processed audio
                sequences.

        Returns:
            A tuple of 3 elements -
            1) The log probabilities tensor of shape [B, T, D].
            2) The lengths of the output sequence after decoder, of shape [B].
            3) The token predictions of the model of shape [B, T].
        """
        has_input_signal = input_signal is not None and input_signal_length is not None
        has_processed_signal = processed_signal is not None and processed_signal_length is not None
        if (has_input_signal ^ has_processed_signal) is False:
            raise ValueError(
                f"{self} Arguments ``input_signal`` and ``input_signal_length`` are mutually exclusive "
                " with ``processed_signal`` and ``processed_signal_len`` arguments."
            )

        if not has_processed_signal:
            processed_signal, processed_signal_length = self.preprocessor(
                input_signal=input_signal, length=input_signal_length,
            )

        # Spec augment is only applied during training.
        if self.spec_augmentation is not None and self.training:
            processed_signal = self.spec_augmentation(input_spec=processed_signal, length=processed_signal_length)

        encoded, encoded_len = self.encoder(audio_signal=processed_signal, length=processed_signal_length)
        encoded = encoded.transpose(1, 2)  # BxDxT -> BxTxD
        encoded_mask = get_seq_mask(encoded, encoded_len)

        if target_semantics is None:
            # Inference mode: autoregressively generate the semantic token sequence.
            predictions = self.sequence_generator(encoded, encoded_mask)
            return None, None, predictions

        # Teacher forcing: feed the target sequence without its final token to the decoder.
        bos_semantics_tokens = target_semantics[:, :-1]
        bos_semantics = self.embedding(bos_semantics_tokens)
        bos_semantics_mask = get_seq_mask(bos_semantics, target_semantics_length - 1)

        decoded = self.decoder(
            encoder_states=encoded,
            encoder_mask=encoded_mask,
            decoder_states=bos_semantics,
            decoder_mask=bos_semantics_mask,
        )
        log_probs = self.classifier(decoded)

        predictions = log_probs.argmax(dim=-1, keepdim=False)
        pred_len = self.sequence_generator.get_seq_length(predictions)
        return log_probs, pred_len, predictions

    def training_step(self, batch, batch_nb):
        if len(batch) == 4:
            signal, signal_len, semantics, semantics_len = batch
        else:
            signal, signal_len, semantics, semantics_len, sample_id = batch

        log_probs, pred_len, predictions = self.forward(
            input_signal=signal,
            input_signal_length=signal_len,
            target_semantics=semantics,
            target_semantics_length=semantics_len,
        )

        # Drop the bos token from the targets when computing the loss.
        eos_semantics = semantics[:, 1:]
        eos_semantics_len = semantics_len - 1

        loss_value = self.loss(log_probs=log_probs, labels=eos_semantics, lengths=eos_semantics_len)

        tensorboard_logs = {'train_loss': loss_value.item()}
        if len(self._optimizer.param_groups) == 1:
            tensorboard_logs['learning_rate'] = self._optimizer.param_groups[0]['lr']
        else:
            for i, group in enumerate(self._optimizer.param_groups):
                tensorboard_logs[f'learning_rate_g{i}'] = group['lr']

        if hasattr(self, '_trainer') and self._trainer is not None:
            log_every_n_steps = self._trainer.log_every_n_steps
        else:
            log_every_n_steps = 1

        if (batch_nb + 1) % log_every_n_steps == 0:
            self.wer.update(
                predictions=predictions,
                targets=eos_semantics,
                predictions_lengths=pred_len,
                targets_lengths=eos_semantics_len,
            )
            wer, _, _ = self.wer.compute()
            self.wer.reset()
            tensorboard_logs.update({'training_batch_wer': wer})

        return {'loss': loss_value, 'log': tensorboard_logs}

    @torch.no_grad()
    def predict(
        self, input_signal=None, input_signal_length=None, processed_signal=None, processed_signal_length=None
    ) -> List[str]:
        has_input_signal = input_signal is not None and input_signal_length is not None
        has_processed_signal = processed_signal is not None and processed_signal_length is not None
        if (has_input_signal ^ has_processed_signal) is False:
            raise ValueError(
                f"{self} Arguments ``input_signal`` and ``input_signal_length`` are mutually exclusive "
                " with ``processed_signal`` and ``processed_signal_len`` arguments."
            )

        if not has_processed_signal:
            processed_signal, processed_signal_length = self.preprocessor(
                input_signal=input_signal, length=input_signal_length,
            )

        if self.spec_augmentation is not None and self.training:
            processed_signal = self.spec_augmentation(input_spec=processed_signal, length=processed_signal_length)

        encoded, encoded_len = self.encoder(audio_signal=processed_signal, length=processed_signal_length)
        encoded = encoded.transpose(1, 2)
        encoded_mask = get_seq_mask(encoded, encoded_len)

        # Generate semantic tokens and convert them back to text.
        pred_tokens = self.sequence_generator(encoded, encoded_mask)
        predictions = self.sequence_generator.decode_semantics_from_tokens(pred_tokens)
        return predictions

    def validation_pass(self, batch, batch_idx, dataloader_idx=0):
        if len(batch) == 4:
            signal, signal_len, semantics, semantics_len = batch
        else:
            signal, signal_len, semantics, semantics_len, sample_id = batch

        if isinstance(batch, DALIOutputs) and batch.has_processed_signal:
            log_probs, pred_len, predictions = self.forward(
                processed_signal=signal,
                processed_signal_length=signal_len,
                target_semantics=semantics,
                target_semantics_length=semantics_len,
            )
        else:
            log_probs, pred_len, predictions = self.forward(
                input_signal=signal,
                input_signal_length=signal_len,
                target_semantics=semantics,
                target_semantics_length=semantics_len,
            )

        eos_semantics = semantics[:, 1:]
        eos_semantics_len = semantics_len - 1

        loss_value = self.loss(log_probs=log_probs, labels=eos_semantics, lengths=eos_semantics_len)

        self.wer.update(
            predictions=predictions,
            targets=eos_semantics,
            predictions_lengths=pred_len,
            targets_lengths=eos_semantics_len,
        )
        wer, wer_num, wer_denom = self.wer.compute()
        self.wer.reset()

        return {
            'val_loss': loss_value,
            'val_wer_num': wer_num,
            'val_wer_denom': wer_denom,
            'val_wer': wer,
        }

    def validation_step(self, batch, batch_idx, dataloader_idx=0):
        metrics = self.validation_pass(batch, batch_idx, dataloader_idx)
        if type(self.trainer.val_dataloaders) == list and len(self.trainer.val_dataloaders) > 1:
            self.validation_step_outputs[dataloader_idx].append(metrics)
        else:
            self.validation_step_outputs.append(metrics)
        return metrics

    def test_step(self, batch, batch_idx, dataloader_idx=0):
        logs = self.validation_pass(batch, batch_idx, dataloader_idx=dataloader_idx)
        test_logs = {name.replace("val_", "test_"): value for name, value in logs.items()}
        if type(self.trainer.test_dataloaders) == list and len(self.trainer.test_dataloaders) > 1:
            self.test_step_outputs[dataloader_idx].append(test_logs)
        else:
            self.test_step_outputs.append(test_logs)
        return test_logs

    def test_dataloader(self):
        if self._test_dl is None:
            self._test_dl = []
        return self._test_dl

    def _setup_dataloader_from_config(self, config: Optional[Dict]):
        if 'augmentor' in config:
            augmentor = process_augmentations(config['augmentor'])
        else:
            augmentor = None

        shuffle = config['shuffle']
        device = 'gpu' if torch.cuda.is_available() else 'cpu'
        if config.get('use_dali', False):
            device_id = self.local_rank if device == 'gpu' else None
            dataset = audio_to_text_dataset.get_dali_bpe_dataset(
                config=config,
                tokenizer=self.tokenizer,
                shuffle=shuffle,
                device_id=device_id,
                global_rank=self.global_rank,
                world_size=self.world_size,
                preprocessor_cfg=self._cfg.preprocessor,
            )
            return dataset

        # Instantiate a tarred dataset loader or an ordinary dataset loader.
        if config.get('is_tarred', False):
            if ('tarred_audio_filepaths' in config and config['tarred_audio_filepaths'] is None) or (
                'manifest_filepath' in config and config['manifest_filepath'] is None
            ):
                logging.warning(
                    "Could not load dataset as `manifest_filepath` was None or "
                    f"`tarred_audio_filepaths` is None. Provided config : {config}"
                )
                return None

            shuffle_n = config.get('shuffle_n', 4 * config['batch_size']) if shuffle else 0
            dataset = audio_to_text_dataset.get_tarred_dataset(
                config=config,
                tokenizer=self.tokenizer,
                shuffle_n=shuffle_n,
                global_rank=self.global_rank,
                world_size=self.world_size,
                augmentor=augmentor,
            )
            shuffle = False
        else:
            if 'manifest_filepath' in config and config['manifest_filepath'] is None:
                logging.warning(f"Could not load dataset as `manifest_filepath` was None. Provided config : {config}")
                return None

            dataset = audio_to_text_dataset.get_bpe_dataset(
                config=config, tokenizer=self.tokenizer, augmentor=augmentor
            )

        if hasattr(dataset, 'collate_fn'):
            collate_fn = dataset.collate_fn
        elif hasattr(dataset.datasets[0], 'collate_fn'):
            # Support datasets that are lists of entries.
            collate_fn = dataset.datasets[0].collate_fn
        else:
            # Support datasets that are lists of lists.
            collate_fn = dataset.datasets[0].datasets[0].collate_fn

        return torch.utils.data.DataLoader(
            dataset=dataset,
            batch_size=config['batch_size'],
            collate_fn=collate_fn,
            drop_last=config.get('drop_last', False),
            shuffle=shuffle,
            num_workers=config.get('num_workers', 0),
            pin_memory=config.get('pin_memory', False),
        )

    def setup_training_data(self, train_data_config: Optional[Union[DictConfig, Dict]]):
        """
        Sets up the training data loader via a Dict-like object.

        Args:
            train_data_config: A config that contains the information regarding construction
                of an ASR Training dataset.

        Supported Datasets:
            -   :class:`~nemo.collections.asr.data.audio_to_text.AudioToCharDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text.AudioToBPEDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToBPEDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset`
        """
        if 'shuffle' not in train_data_config:
            train_data_config['shuffle'] = True

        # Preserve config
        self._update_dataset_config(dataset_name='train', config=train_data_config)

        self._train_dl = self._setup_dataloader_from_config(config=train_data_config)

        # With an IterableDataset, len(dataloader) reports the number of samples rather than the number of
        # batches, so the trainer's limit_train_batches has to be converted to a batch count manually.
        if (
            self._train_dl is not None
            and hasattr(self._train_dl, 'dataset')
            and isinstance(self._train_dl.dataset, torch.utils.data.IterableDataset)
        ):
            if self._trainer is not None and isinstance(self._trainer.limit_train_batches, float):
                self._trainer.limit_train_batches = int(
                    self._trainer.limit_train_batches
                    * ceil((len(self._train_dl.dataset) / self.world_size) / train_data_config['batch_size'])
                )
            elif self._trainer is None:
                logging.warning(
                    "Model Trainer was not set before constructing the dataset, incorrect number of "
                    "training batches will be used. Please set the trainer and rebuild the dataset."
                )

    def setup_validation_data(self, val_data_config: Optional[Union[DictConfig, Dict]]):
        """
        Sets up the validation data loader via a Dict-like object.

        Args:
            val_data_config: A config that contains the information regarding construction
                of an ASR Training dataset.

        Supported Datasets:
            -   :class:`~nemo.collections.asr.data.audio_to_text.AudioToCharDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text.AudioToBPEDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToBPEDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset`
        """
        if 'shuffle' not in val_data_config:
            val_data_config['shuffle'] = False

        # Preserve config
        self._update_dataset_config(dataset_name='validation', config=val_data_config)

        self._validation_dl = self._setup_dataloader_from_config(config=val_data_config)

    def setup_test_data(self, test_data_config: Optional[Union[DictConfig, Dict]]):
        """
        Sets up the test data loader via a Dict-like object.

        Args:
            test_data_config: A config that contains the information regarding construction
                of an ASR Training dataset.

        Supported Datasets:
            -   :class:`~nemo.collections.asr.data.audio_to_text.AudioToCharDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text.AudioToBPEDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToBPEDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset`
        """
        if 'shuffle' not in test_data_config:
            test_data_config['shuffle'] = False

        # Preserve config
        self._update_dataset_config(dataset_name='test', config=test_data_config)

        self._test_dl = self._setup_dataloader_from_config(config=test_data_config)

    def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLoader':
        """
        Setup function for a temporary data loader which wraps the provided audio file.

        Args:
            config: A python dictionary which contains the following keys:
                paths2audio_files: (a list) of paths to audio files. The files should be relatively short fragments.
                    Recommended length per file is between 5 and 25 seconds.
                batch_size: (int) batch size to use during inference.
                    Bigger will result in better throughput performance but would use more memory.
                temp_dir: (str) A temporary directory where the audio manifest is temporarily stored.
                num_workers: (int) number of workers. Depends of the batch_size and machine.
                    0 - only the main process will load batches, 1 - one worker (not main process)

        Returns:
            A pytorch DataLoader for the given audio file(s).
        """
        if 'manifest_filepath' in config:
            manifest_filepath = config['manifest_filepath']
            batch_size = config['batch_size']
        else:
            manifest_filepath = os.path.join(config['temp_dir'], 'manifest.json')
            batch_size = min(config['batch_size'], len(config['paths2audio_files']))

        dl_config = {
            'manifest_filepath': manifest_filepath,
            'sample_rate': self.preprocessor._sample_rate,
            'batch_size': batch_size,
            'shuffle': False,
            'num_workers': config.get('num_workers', min(batch_size, os.cpu_count() - 1)),
            'pin_memory': True,
            'use_start_end_token': self.cfg.validation_ds.get('use_start_end_token', False),
        }

        temporary_datalayer = self._setup_dataloader_from_config(config=DictConfig(dl_config))
        return temporary_datalayer

    @torch.no_grad()
    def transcribe(
        self,
        audio: Union[List[str], DataLoader],
        batch_size: int = 4,
        return_hypotheses: bool = False,
        num_workers: int = 0,
        verbose: bool = True,
    ) -> TranscriptionReturnType:
        """
        Uses greedy decoding to transcribe audio files into SLU semantics.
        Use this method for debugging and prototyping.

        Args:
            audio: (a single or list) of paths to audio files or a np.ndarray audio array.
                Can also be a dataloader object that provides values that can be consumed by the model.
                Recommended length per file is between 5 and 25 seconds.
                But it is possible to pass a few hours long file if enough GPU memory is available.
            batch_size: (int) batch size to use during inference.
                Bigger will result in better throughput performance but would use more memory.
            return_hypotheses: (bool) Either return hypotheses or text
                With hypotheses can do some postprocessing like getting timestamp or rescoring
            num_workers: (int) number of workers for DataLoader
            verbose: (bool) whether to display tqdm progress bar

        Returns:
            A list of transcriptions (or raw log probabilities if logprobs is True) in the same order as paths2audio_files
        """
        return super().transcribe(
            audio=audio,
            batch_size=batch_size,
            return_hypotheses=return_hypotheses,
            num_workers=num_workers,
            verbose=verbose,
        )

    def _transcribe_forward(self, batch: Any, trcfg: TranscribeConfig):
        # The temporary dataloader yields (audio, audio_len, ...) tuples; only the signal is needed here.
        predictions = self.predict(input_signal=batch[0], input_signal_length=batch[1])
        output = dict(predictions=predictions)
        return output

    def _transcribe_output_processing(self, outputs, trcfg: TranscribeConfig) -> List[str]:
        hypotheses = outputs.pop('predictions')
        return hypotheses

    @classmethod
    def list_available_models(cls) -> Optional[List[PretrainedModelInfo]]:
        """
        This method returns a list of pre-trained models which can be instantiated directly from NVIDIA's NGC cloud.

        Returns:
            List of available pre-trained models.
        """
        results = []

        model = PretrainedModelInfo(
            pretrained_model_name="slu_conformer_transformer_large_slurp",
            description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:slu_conformer_transformer_large_slurp",
            location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/slu_conformer_transformer_large_slurp/versions/1.13.0/files/slu_conformer_transformer_large_slurp.nemo",
        )
        results.append(model)

        return results

    @property
    def wer(self):
        return self._wer

    @wer.setter
    def wer(self, wer):
        self._wer = wer
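

# Minimal usage sketch (illustrative, not part of the original NeMo module): restore the
# pretrained SLURP checkpoint named in `list_available_models()` and decode a few audio
# files with greedy decoding. The .wav paths below are placeholders; a working NeMo
# installation and network access to NGC are assumed.
if __name__ == "__main__":
    model = SLUIntentSlotBPEModel.from_pretrained("slu_conformer_transformer_large_slurp")
    model.eval()

    # Transcribe short (roughly 5-25 second) utterances into semantic strings.
    semantics = model.transcribe(audio=["utterance_1.wav", "utterance_2.wav"], batch_size=2)
    for sem in semantics:
        print(sem)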