o
    }oi                     @   s  d dl Z d dlZd dlmZ d dlmZmZmZmZm	Z	 d dl
Zd dlZd dlmZ d dlmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZm Z  d dl!m"Z" d dl#m$Z$m%Z% d dl&m'Z' d dl(m)Z)m*Z*m+Z+m,Z, d dl-m.Z. d dl/m0Z0m1Z1 d dl2m3Z3 d dl4m5Z5 d dl6m7Z7 d dl8m9Z9 d dl:m;Z; d dl<m=Z=m>Z> d dl?m@Z@ d dlAmBZBmCZCmDZDmEZEmFZF d dlGmHZH G dd de$e)e%e*ZIdS )    N)ceil)AnyDictListOptionalUnion)Trainer)
DictConfig	OmegaConf	open_dict)
DataLoader)audio_to_text_dataset)_AudioTextDataset)AudioToCharDALIDatasetDALIOutputs)LhotseSpeechToTextBpeDataset)RNNTLossresolve_rnnt_default_loss_name)WER)ASRModelExportableEncDecModel)RNNTDecoderJoint)ASRModuleMixinASRTranscriptionMixinTranscribeConfigTranscriptionReturnType)ChannelSelectorType)RNNTDecodingRNNTDecodingConfig)get_semi_sorted_batch_sampler)
Hypothesis)process_timestamp_outputs)!get_lhotse_dataloader_from_config)make_parser)PretrainedModelInfo	typecheck)AccessMixin)AcousticEncodedRepresentationAudioSignalLengthsType
NeuralTypeSpectrogramType)loggingc                       s  e Zd ZdZd\dedef fddZdd Zdee fd	d
Z	dd Z
e 									d]deeee ejef dededeed  dedee dededee dee def fddZd\dee d ee fd!d"Zd^d efd#d$Zd%ee fd&d'Zd(eeeef  fd)d*Zd+eeeef  fd,d-Zd.eeeef  fd/d0Ze deeee!f  fd1d2Z"e deeee!f  fd3d4Z#e$ 	d_d5d6Z%d7d8 Z&d`d9d:Z'd`d;d<Z(d`d=d>Z)d`d?d@Z*d`dAefdBdCZ+d`dAefdDdEZ,	 dFe-dGefdHdIZ.dGedeed eed  f fdJdKZ/d%eddLfdMdNZ0 fdOdPZ1dQdR Z2e dSdT Z3 fdUdVZ4e5dee6 fdWdXZ7e dYdZ Z8e8j9d[dZ Z8  Z:S )aEncDecRNNTModelz1Base class for encoder decoder RNNT-based models.Ncfgtrainerc                    sn  d| _ |d ur|j | _ t j||d t| jj| _t| jj| _t| jj	 t
| jj| jj	_W d    n1 s=w   Y  t| jj* t
| jj| jj_| jj| jj_| jjj| jjj_| jjj| jjj_W d    n1 suw   Y  t| jj	| _	t| jj| _| | jdd \}}| jjd }|dkr|| jj }t|||| jddd| _t| jdr| jjd urt| jj| _nd | _| | jj | j_ t!| jj | j	| j| jjd	| _ t"| j d
| jdd| jdddd| _#d| jv r| jj$| _$nd| _$| jj%s| j j&d ur)| j j&d
kr)| j'| j | j(| j# | )  | *  | +  d S )N   )r.   r/   losstdtrnnt_reduction
mean_batch)num_classes	loss_nameloss_kwargs	reductionspec_augmentdecoding_cfgdecoderjoint
vocabularyr   use_cerFlog_predictionTdecodingbatch_dim_indexr?   r@   dist_sync_on_stepcompute_eval_loss),
world_sizesuper__init__r-   from_config_dictr.   preprocessorencoderr   r<   lenlabels
vocab_sizer=   r5   r>   model_defaults
enc_hiddenjointnetencoder_hiddenpred_hiddenextract_rnnt_loss_cfggetnum_classes_with_blanknum_extra_outputsr   r1   hasattr_cfgr9   spec_augmentation#set_decoding_type_according_to_lossrB   r   r   werrE   fuse_loss_werjoint_fused_batch_sizeset_lossset_wersetup_optim_normalizationsetup_optimization_flagssetup_adapters)selfr.   r/   r6   r7   r5   	__class__ [/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/asr/models/rnnt_models.pyrH   8   sl   	
zEncDecRNNTModel.__init__c                 C   s   t | jdr| jd dd| _| jd dd| _nd| _d| _| jdd| _d| _| jdd| _| jd	d| _| jd
d| _	dS )aI  
        Helper method to setup normalization of certain parts of the model prior to the optimization step.

        Supported pre-optimization normalizations are as follows:

        .. code-block:: yaml

            # Variation Noise injection
            model:
                variational_noise:
                    std: 0.0
                    start_step: 0

            # Joint - Length normalization
            model:
                normalize_joint_txu: false

            # Encoder Network - gradient normalization
            model:
                normalize_encoder_norm: false

            # Decoder / Prediction Network - gradient normalization
            model:
                normalize_decoder_norm: false

            # Joint - gradient normalization
            model:
                normalize_joint_norm: false
        variational_noisestdr   
start_stepnormalize_joint_txuFNnormalize_encoder_normnormalize_decoder_normnormalize_joint_norm)
rX   r.   rU   _optim_variational_noise_std_optim_variational_noise_start_optim_normalize_joint_txu_optim_normalize_txu_optim_normalize_encoder_norm_optim_normalize_decoder_norm_optim_normalize_joint_normrd   rg   rg   rh   ra      s   z)EncDecRNNTModel.setup_optim_normalizationc                 C   s`   |du rt i }|dd}|dkrt }|| dd}td| d| d|  ||fS )a  
        Helper method to extract the rnnt loss name, and potentially its kwargs
        to be passed.

        Args:
            cfg: Should contain `loss_name` as a string which is resolved to a RNNT loss name.
                If the default should be used, then `default` can be used.
                Optionally, one can pass additional kwargs to the loss function. The subdict
                should have a keyname as follows : `{loss_name}_kwargs`.

                Note that whichever loss_name is selected, that corresponding kwargs will be
                selected. For the "default" case, the "{resolved_default}_kwargs" will be used.

        Examples:
            .. code-block:: yaml

                loss_name: "default"
                warprnnt_numba_kwargs:
                    kwargs2: some_other_val

        Returns:
            A tuple, the resolved loss name as well as its kwargs (if found).
        Nr6   default_kwargszUsing RNNT Loss : z
Loss z	_kwargs: )r	   rU   r   r,   info)rd   r.   r6   r7   rg   rg   rh   rT      s   z%EncDecRNNTModel.extract_rnnt_loss_cfgc                 C   s@   |  | jdd \}}|dkr|j|_|S |dkr|j|_|S )Nr1   r2   multiblank_rnnt)rT   r.   rU   	durationsbig_blank_durations)rd   r;   r6   r7   rg   rg   rh   r[      s   z3EncDecRNNTModel.set_decoding_type_according_to_loss   Fr   Taudio
batch_sizereturn_hypothesespartial_hypothesisr    num_workerschannel_selector	augmentorverbose
timestampsoverride_configreturnc                    s   |	p
|
dur	|
j nd}	|	durk|	s|
dur@|
j r@td d}t| jj d| jj_d| jj_W d   n1 s:w   Y  n"d}t| jj d| jj_d| jj_W d   n1 s]w   Y  | j| jjdd t	 j
||||||||	|
|d
S )a+
  
        Uses greedy decoding to transcribe audio files. Use this method for debugging and prototyping.

        Args:
            audio: (a single or list) of paths to audio files or a np.ndarray/tensor audio array or path 
                to a manifest file.
                Can also be a dataloader object that provides values that can be consumed by the model.
                Recommended length per file is between 5 and 25 seconds.                 But it is possible to pass a few hours long file if enough GPU memory is available.
            batch_size: (int) batch size to use during inference.                 Bigger will result in better throughput performance but would use more memory.
            return_hypotheses: (bool) Either return hypotheses or text
                With hypotheses can do some postprocessing like getting timestamp or rescoring
            partial_hypothesis: Optional[List['Hypothesis']] - A list of partial hypotheses to be used during rnnt
                decoding. This is useful for streaming rnnt decoding. If this is not None, then the length of this
                list should be equal to the length of the audio list.
            num_workers: (int) number of workers for DataLoader
            channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels 
                from multi-channel audio. If set to `'average'`, it performs averaging across channels. 
                Disabled if set to `None`. Defaults to `None`. Uses zero-based indexing.
            augmentor: (DictConfig): Augment audio samples during transcription if augmentor is applied.
            verbose: (bool) whether to display tqdm progress bar
            timestamps: Optional(Bool): timestamps will be returned if set to True as part of hypothesis object 
                (output.timestep['segment']/output.timestep['word']). Refer to `Hypothesis` class for more details. 
                Default is None and would retain the previous state set by using self.change_decoding_strategy().
            override_config: (Optional[TranscribeConfig]) override transcription config pre-defined by the user.
                **Note**: All other arguments in the function will be ignored if override_config is passed.
                You should call this argument as `model.transcribe(audio, override_config=TranscribeConfig(...))`.

        Returns:
            Returns a tuple of 2 items -
            * A list of greedy transcript texts / Hypothesis
            * An optional list of beam search transcript texts / Hypothesis / NBestHypothesis.
        NzTimestamps requested, setting decoding timestamps to True. Capture them in Hypothesis object,                         with output[0][idx].timestep['word'/'segment'/'char']TF)r   )
r   r   r   r   r   r   r   r   r   r   )r   r,   rz   r   r.   rB   compute_timestampspreserve_alignmentschange_decoding_strategyrG   
transcribe)rd   r   r   r   r   r   r   r   r   r   r   re   rg   rh   r      s<   1

zEncDecRNNTModel.transcribenew_vocabularyr;   c              	   C   s  | j j|krtd| j j d| d dS |du s t|dkr'td| | j  }t|}||d< t||d< | ` t	
|| _ | j }t|}t||_| `t	
|| _| `| | jd	d\}}t| j jd
 ||d| _|du r|| jj}tt}	tt|	}	t|	|}| |}t|| j| j | j jd| _t| j| jj| jj| jj dd| _| j j!s| jj"dur| jj"dkr| j #| j | j $| j t%| jj  || j_ W d   n1 sw   Y  t%| jj || j_W d   n1 sw   Y  t%| jj || j_W d   n	1 sw   Y  g d}
|
D ]+}|| jv rOt%| j|  t|| j| d< W d   n	1 sJw   Y  q%t&d| j j d dS )a@  
        Changes vocabulary used during RNNT decoding process. Use this method when fine-tuning a 
        pre-trained model. This method changes only decoder and leaves encoder and pre-processing 
        modules unchanged. For example, you would use it if you want to use pretrained encoder when 
        fine-tuning on data in another language, or when you'd need model to learn capitalization, 
        punctuation and/or special characters.

        Args:
            new_vocabulary: list with new vocabulary. Must contain at least 2 elements. Typically,                 this is target alphabet.
            decoding_cfg: A config for the decoder, which is optional. If the decoding type
                needs to be changed (from say Greedy to Beam decoding etc), the config can be passed here.

        Returns: None

        zOld z	 and new z match. Not changing anything.Nr   z;New vocabulary must be non-empty list of chars. But I got: r>   r5   r1   r0   )r5   r6   r7   r:   TrA   )train_dsvalidation_dstest_dsrM   zChanged decoder to output to z vocabulary.)'r=   r>   r,   warningrL   
ValueErrorto_config_dictcopydeepcopyr-   rI   r<   rN   r1   rT   r.   rU   r   rV   rB   r
   
structuredr   createto_containermerger[   r   r   r\   rC   r?   r@   r]   r^   r_   r`   r   rz   )rd   r   r;   joint_confignew_joint_configdecoder_confignew_decoder_configr6   r7   decoding_clsds_keyskeyrg   rg   rh   change_vocabularyB  sx    






	


z!EncDecRNNTModel.change_vocabularyc                 C   s,  |du rt d | jj}tt}tt|}t	||}| 
|}t|| j| j| jjd| _t| j| jj| jj| jjdd| _| jjsS| jjdura| jjdkra| j| j | j| j |dd| j_t| jj || j_W d   n1 s~w   Y  |rt d	t| jj  dS dS )
ag  
        Changes decoding strategy used during RNNT decoding process.

        Args:
            decoding_cfg: A config for the decoder, which is optional. If the decoding type
                needs to be changed (from say Greedy to Beam decoding etc), the config can be passed here.
            verbose: (bool) whether to display logging information
        NzONo `decoding_cfg` passed when changing decoding strategy, using internal configr:   TrA   r   temperatureg      ?zChanged decoding strategy to 
)r,   rz   r.   rB   r
   r   r   r   r   r   r[   r   r<   r=   r>   r   r\   rC   r?   r@   r]   r^   r_   r1   r`   rU   r   r   to_yaml)rd   r;   r   r   rg   rg   rh   r     s>   	


	
z(EncDecRNNTModel.change_decoding_strategyconfigc                 C   s  t j| j|dd t j| j|dd |drZt||dds"| jn|d|dds/| jn|dtt|dd |d	d
|dd|dd|ddd|ddddS t j	|| j
| j| j| jdd d}|d u rrd S t|try|S |d }t|tjjjrd}t|dr|j}nt|jd dr|jd j}n	|jd jd j}d }|ddrt|tstdt| t| ||}d |d< d|d< d}tjjj||d |d ||dd||dd|ddd	S )Nsample_rate)r   rM   
use_lhotsedo_transcribeFglobal_rankrF   parseren	unk_indexblank_indexnormalize_transcripts)rM   nameunk_idblank_iddo_normalize)	tokenizerreturn_cuts)r   rF   datasetrJ   )r   
local_rankr   rF   preprocessor_cfgshuffle
collate_fnr   use_semi_sorted_batchingzmSemi Sorted Batch sampler can be used with AudioToCharDataset or AudioToBPEDataset but found dataset of type r   	drop_lastr   
pin_memory)	r   r   samplerbatch_samplerr   r   r   r   r   )r   )inject_dataloader_value_from_model_configr.   rU   r"   r   rF   r   r#   *get_audio_to_text_char_dataset_from_configr   rY   
isinstancer   torchutilsdataIterableDatasetrX   r   datasetsr   RuntimeErrortyper   r   )rd   r   r   r   r   r   rg   rg   rh   _setup_dataloader_from_config  sx   












z-EncDecRNNTModel._setup_dataloader_from_configtrain_data_configc                 C   s   d|vrd|d< | j d|d | j|d| _| jdur\t| jdr^t| jjtjjj	r`| j
durPt| j
jtrPt| j
jtt| jj| j |d   | j
_dS | j
du rbtd	 dS dS dS dS dS )
a  
        Sets up the training data loader via a Dict-like object.

        Args:
            train_data_config: A config that contains the information regarding construction
                of an ASR Training dataset.

        Supported Datasets:
            -   :class:`~nemo.collections.asr.data.audio_to_text.AudioToCharDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text.AudioToBPEDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToBPEDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset`
        r   Ttraindataset_namer   r   Nr   r   zModel Trainer was not set before constructing the dataset, incorrect number of training batches will be used. Please set the trainer and rebuild the dataset.)_update_dataset_configr   	_train_dlrX   r   r   r   r   r   r   _trainerlimit_train_batchesfloatintr   rL   rF   r,   r   )rd   r   rg   rg   rh   setup_training_data  s*   


z#EncDecRNNTModel.setup_training_dataval_data_configc                 C   0   d|vrd|d< | j d|d | j|d| _dS )a  
        Sets up the validation data loader via a Dict-like object.

        Args:
            val_data_config: A config that contains the information regarding construction
                of an ASR Training dataset.

        Supported Datasets:
            -   :class:`~nemo.collections.asr.data.audio_to_text.AudioToCharDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text.AudioToBPEDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToBPEDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset`
        r   F
validationr   r   N)r   r   _validation_dl)rd   r   rg   rg   rh   setup_validation_dataM     z%EncDecRNNTModel.setup_validation_datatest_data_configc                 C   r   )a  
        Sets up the test data loader via a Dict-like object.

        Args:
            test_data_config: A config that contains the information regarding construction
                of an ASR Training dataset.

        Supported Datasets:
            -   :class:`~nemo.collections.asr.data.audio_to_text.AudioToCharDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text.AudioToBPEDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToBPEDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset`
        r   Ftestr   r   N)r   r   _test_dl)rd   r   rg   rg   rh   setup_test_datad  r   zEncDecRNNTModel.setup_test_datac                 C   sf   t | jdrt| jjd}nt }td|ddttdt ddtdt ddttdt dddS )	N_sample_rate)freq)BTT)optionalr   r   Dr   )input_signalinput_signal_lengthprocessed_signalprocessed_signal_length)rX   rJ   r(   r   r*   tupler)   r+   )rd   input_signal_eltyperg   rg   rh   input_types{  s   zEncDecRNNTModel.input_typesc                 C   s   t dt t tdt dS )Nr   r   )outputsencoded_lengths)r*   r'   r   r)   rw   rg   rg   rh   output_types  s   
zEncDecRNNTModel.output_typesc           	      C   s   |duo|du}|duo|du}||A du rt |  d|s(| j||d\}}| jdur7| jr7| j||d}| j||d\}}||fS )a  
        Forward pass of the model. Note that for RNNT Models, the forward pass of the model is a 3 step process,
        and this method only performs the first step - forward of the acoustic model.

        Please refer to the `training_step` in order to see the full `forward` step for training - which
        performs the forward of the acoustic model, the prediction network and then the joint network.
        Finally, it computes the loss and possibly compute the detokenized text via the `decoding` step.

        Please refer to the `validation_step` in order to see the full `forward` step for inference - which
        performs the forward of the acoustic model, the prediction network and then the joint network.
        Finally, it computes the decoded tokens via the `decoding` step and possibly compute the batch metrics.

        Args:
            input_signal: Tensor that represents a batch of raw audio signals,
                of shape [B, T]. T here represents timesteps, with 1 second of audio represented as
                `self.sample_rate` number of floating point values.
            input_signal_length: Vector of length B, that contains the individual lengths of the audio
                sequences.
            processed_signal: Tensor that represents a batch of processed audio signals,
                of shape (B, D, T) that has undergone processing via some DALI preprocessor.
            processed_signal_length: Vector of length B, that contains the individual lengths of the
                processed audio sequences.

        Returns:
            A tuple of 2 elements -
            1) The log probabilities tensor of shape [B, T, D].
            2) The lengths of the acoustic sequence after propagation through the encoder, of shape [B].
        NFz Arguments ``input_signal`` and ``input_signal_length`` are mutually exclusive  with ``processed_signal`` and ``processed_signal_len`` arguments.)r   length)
input_specr   )audio_signalr   )r   rJ   rZ   trainingrK   )	rd   r   r   r   r   has_input_signalhas_processed_signalencodedencoded_lenrg   rg   rh   forward  s    
zEncDecRNNTModel.forwardc                 C   s   t | jrt |  |\}}}}t|tr#|jr#| j||d\}}n	| j||d\}}~| j||d\}	}
}t	| drJ| j
d urJ| j
j}| j
j}nd}|}| jjs| j||	d}| j||||
d}| |}t | jrrt |  || jjd d	 tj| jjtjd
d}|d | dkr| jj||||d | j \}}}| j  |d| | i nI|d | dkrd}nd}| j||	||||d\}}}}| |}t | jrt |  || jjd d	 tj| jjtjd
d}|r|d|i | | | jr| | g| _d|iS )Nr   r   r   r   targetstarget_lengthr   r0   encoder_outputsdecoder_outputs	log_probsr   input_lengthstarget_lengthsr   lrdtype)
train_losslearning_rateglobal_steppredictionspredictions_lengthsr   targets_lengthstraining_batch_werTFr  r  encoder_lengthstranscriptstranscript_lengthscompute_werr1   ) r&   is_access_enabled
model_guidreset_registryr   r   r   r   r<   rX   r   log_every_n_stepsr  r=   r]   r1   add_auxiliary_losses
_optimizerparam_groupsr   tensorr/   float32r\   updatecomputeresetr   log_dictrr   maxrs   )rd   batchbatch_nbsignal
signal_len
transcripttranscript_lenr   r   r<   r   statesr  	sample_idr=   
loss_valuetensorboard_logs_scoreswordsr  r\   rg   rg   rh   training_step  sx   








zEncDecRNNTModel.training_stepc                 C   s   |\}}}}}t |tr|jr| j||d\}	}
n	| j||d\}	}
~| jj|	|
dd}t |tjr:| 	 
 }tt||S )Nr   r   T)encoder_outputr   r   )r   r   r   r   rB   rnnt_decoder_predictions_tensorr   Tensorcpudetachnumpylistzip)rd   r%  	batch_idxdataloader_idxr'  r(  r)  r*  r,  r   r   best_hyp_textrg   rg   rh   predict_step(  s   zEncDecRNNTModel.predict_stepc                 C   sh  |\}}}}t |tr|jr| j||d\}}	n	| j||d\}}	~i }
| jjsm| jrI| j||d\}}}| j||d}| j|||	|d}||
d< | j	j
||	||d | j	 \}}}| j	  ||
d< ||
d	< ||
d
< n7d}| jr}| j||d\}}}nd }|}| j|||	|||d\}}}}|d ur||
d< ||
d< ||
d	< ||
d
< | dtj| jjtjd |
S )Nr   r   r   r   r  val_lossr  val_wer_numval_wer_denomval_werTr  r  r  )r   r   r   r   r=   r]   rE   r<   r1   r\   r   r!  r"  logr   r  r/   r  r  )rd   r%  r;  r<  r'  r(  r)  r*  r   r   r.  r<   r   r+  r=   r-  r\   wer_num	wer_denomr  decoded
target_lenrg   rg   rh   validation_pass:  sZ   

	zEncDecRNNTModel.validation_passc                 C   sR   |  |||}t| jjtkr!t| jjdkr!| j| | |S | j| |S )Nr0   )rH  r   r/   val_dataloadersr9  rL   validation_step_outputsappend)rd   r%  r;  r<  metricsrg   rg   rh   validation_step~  s    zEncDecRNNTModel.validation_stepc                 C   sf   | j |||d}dd | D }t| jjtkr+t| jjdkr+| j| | |S | j| |S )N)r<  c                 S   s   i | ]\}}| d d|qS )val_test_)replace).0r   valuerg   rg   rh   
<dictcomp>  s    z-EncDecRNNTModel.test_step.<locals>.<dictcomp>r0   )	rH  itemsr   r/   test_dataloadersr9  rL   test_step_outputsrK  )rd   r%  r;  r<  logs	test_logsrg   rg   rh   	test_step  s    zEncDecRNNTModel.test_stepr<  c                 C      | j rtdd |D  }d|i}ni }tdd |D  }tdd |D  }i |d| | i}i |d|iS )Nc                 S      g | ]}|d  qS )r?  rg   rQ  xrg   rg   rh   
<listcomp>      z>EncDecRNNTModel.multi_validation_epoch_end.<locals>.<listcomp>r?  c                 S   r[  )r@  rg   r\  rg   rg   rh   r^    r_  c                 S   r[  )rA  rg   r\  rg   rg   rh   r^    r_  rB  rC  rE   r   stackmeansumr   )rd   r   r<  val_loss_meanval_loss_logrD  rE  r.  rg   rg   rh   multi_validation_epoch_end     
z*EncDecRNNTModel.multi_validation_epoch_endc                 C   rZ  )Nc                 S   r[  )	test_lossrg   r\  rg   rg   rh   r^    r_  z8EncDecRNNTModel.multi_test_epoch_end.<locals>.<listcomp>rh  c                 S   r[  )test_wer_numrg   r\  rg   rg   rh   r^    r_  c                 S   r[  )test_wer_denomrg   r\  rg   rg   rh   r^    r_  test_werrC  r`  )rd   r   r<  test_loss_meantest_loss_logrD  rE  r.  rg   rg   rh   multi_test_epoch_end  rg  z$EncDecRNNTModel.multi_test_epoch_endr%  trcfgc                 C   s*   | j |d |d d\}}t||d}|S )Nr   r0   r   )r   r   )r   dict)rd   r%  ro  r   r   outputrg   rg   rh   _transcribe_forward  s   z#EncDecRNNTModel._transcribe_forwardc                 C   sT   | d}| d}| jj|||j|jd}~~|jr(t|| jj| j	d d }|S )Nr   r   )r   partial_hypothesesrJ   window_stride)
poprB   r4  r   r   r   r!   rK   subsampling_factorr.   )rd   r   ro  r   r   hyprg   rg   rh   _transcribe_output_processing  s   

z-EncDecRNNTModel._transcribe_output_processingztorch.utils.data.DataLoaderc                 C   s   d|v r|d }|d }nt j|d d}t|d t|d }|| jj| jj|dd|	dt|t 
 d d	d
}|	drG|	d|d< | jt|d}|S )a  
        Setup function for a temporary data loader which wraps the provided audio file.

        Args:
            config: A python dictionary which contains the following keys:
            paths2audio_files: (a list) of paths to audio files. The files should be relatively short fragments.                 Recommended length per file is between 5 and 25 seconds.
            batch_size: (int) batch size to use during inference.                 Bigger will result in better throughput performance but would use more memory.
            temp_dir: (str) A temporary directory where the audio manifest is temporarily
                stored.

        Returns:
            A pytorch DataLoader for the given audio file(s).
        manifest_filepathr   temp_dirzmanifest.jsonpaths2audio_filesFr   r0   T)ry  r   rM   r   trim_silencer   r   r   r   r   )ospathjoinminrL   rJ   r   r=   r>   rU   	cpu_countr   r	   )rd   r   ry  r   	dl_configtemporary_datalayerrg   rg   rh   _setup_transcribe_dataloader  s$   

z,EncDecRNNTModel._setup_transcribe_dataloaderc                    s  t    | jdkr5| j| jkr5| j D ]\}}|jd ur4tj	d| j|
 |j|jd}|jj| q| jrq| j\}}|d urq|d urq| j D ]\}}|jd urZ|jj| qJ| j D ]\}}|jd urp|jj| q`| jr| j D ]\}}|jd ur|j }|jj| qy| jr| j D ]\}}|jd ur|j }|jj| q| jr| j D ]\}}|jd ur|j }|jj| qd S d S )Nr   g        )rb  rj   sizedevicer	  )rG   on_after_backwardrp   r  rq   r<   named_parametersgradr   normalr  r  r	  r   add_rr   rs   rK   div_rt   normru   rv   r=   )rd   
param_nameparamnoiser   Ur  re   rg   rh   r    sX   










z!EncDecRNNTModel.on_after_backwardc                 C   s   ddgS )NrK   decoder_jointrg   rw   rg   rg   rh   list_export_subnets  s   z#EncDecRNNTModel.list_export_subnetsc                 C   s   t | j| jS N)r   r<   r=   rw   rg   rg   rh   r    s   zEncDecRNNTModel.decoder_jointc                    s<   d|v rt | dr| j|d d ntdt | d S )Ndecoder_typer   )r  z'Model does not have decoder type option)rX   r   	ExceptionrG   set_export_config)rd   argsre   rg   rh   r    s
   
z!EncDecRNNTModel.set_export_configc                 C   s    g }t dddd}|| |S )z
        This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud.

        Returns:
            List of available pre-trained models.
        !stt_zh_conformer_transducer_largezFor details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_zh_conformer_transducer_largezhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_zh_conformer_transducer_large/versions/1.8.0/files/stt_zh_conformer_transducer_large.nemo)pretrained_model_namedescriptionlocation)r$   rK  )clsresultsmodelrg   rg   rh   list_available_models&  s   
z%EncDecRNNTModel.list_available_modelsc                 C   s   | j S r  _werrw   rg   rg   rh   r\   9  s   zEncDecRNNTModel.werc                 C   s
   || _ d S r  r  )rd   r\   rg   rg   rh   r\   =  s   
r  )	r~   FNr   NNTNN)T)NNNN)r   );__name__
__module____qualname____doc__r	   r   rH   ra   r   rT   r[   r   no_gradr   strr   npndarrayr   r   boolr   r   r   r   r   r   r   r   r   r   r   propertyr*   r   r   r%   r   r2  r>  rH  rM  rY  rf  rn  r   rr  rx  r  r  r  r  r  classmethodr$   r  r\   setter__classcell__rg   rg   re   rh   r-   5   s    U3&

	
Q\3N.5
b

D
	
(,

r-   )Jr   r}  mathr   typingr   r   r   r   r   r8  r  r   lightning.pytorchr   	omegaconfr	   r
   r   torch.utils.datar   nemo.collections.asr.datar   'nemo.collections.asr.data.audio_to_textr   ,nemo.collections.asr.data.audio_to_text_dalir   r   .nemo.collections.asr.data.audio_to_text_lhotser    nemo.collections.asr.losses.rnntr   r    nemo.collections.asr.metrics.werr   %nemo.collections.asr.models.asr_modelr   r   !nemo.collections.asr.modules.rnntr   !nemo.collections.asr.parts.mixinsr   r   r   r   0nemo.collections.asr.parts.preprocessing.segmentr   3nemo.collections.asr.parts.submodules.rnnt_decodingr   r   -nemo.collections.asr.parts.utils.asr_batchingr   +nemo.collections.asr.parts.utils.rnnt_utilsr    0nemo.collections.asr.parts.utils.timestamp_utilsr!   #nemo.collections.common.data.lhotser"   3nemo.collections.common.parts.preprocessing.parsersr#   nemo.core.classes.commonr$   r%   nemo.core.classes.mixinsr&   nemo.core.neural_typesr'   r(   r)   r*   r+   
nemo.utilsr,   r-   rg   rg   rg   rh   <module>   s<   