o
    }oi~                     @   s  d dl Z d dlZd dlmZ d dlmZmZmZmZm	Z	 d dl
Zd dlZd dlmZ d dlmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZ d dl m!Z! d dl"m#Z#m$Z$ d dl%m&Z&m'Z'm(Z(m)Z) d dl*m+Z+m,Z, d dl-m.Z. d dl/m0Z0m1Z1 d dl2m3Z3 d dl4m5Z5 d dl6m7Z7 d dl8m9Z9 d dl:m;Z; d dl<m=Z=m>Z> d dl?m@Z@ d dlAmBZBmCZCmDZDmEZEmFZFmGZG d dlHmIZI dgZJG dd de#e$e&e(e'ZKdS )    N)ceil)AnyDictListOptionalUnion)Trainer)
DictConfig	OmegaConf	open_dict)
DataLoader)audio_to_text_dataset)_AudioTextDataset)AudioToCharDALIDatasetDALIOutputs)LhotseSpeechToTextBpeDataset)CTCLoss)WER)ASRModelExportableEncDecModel)ASRModuleMixinASRTranscriptionMixinInterCTCMixinTranscribeConfig)GenericTranscriptionTypeTranscriptionReturnType)ChannelSelectorType)CTCDecodingCTCDecodingConfig)get_semi_sorted_batch_sampler)
Hypothesis)process_timestamp_outputs)!get_lhotse_dataloader_from_config)make_parser)PretrainedModelInfo	typecheck)AccessMixin)AudioSignal
LabelsTypeLengthsTypeLogprobsType
NeuralTypeSpectrogramType)loggingEncDecCTCModelc                       sh  e Zd ZdZdSdedef fddZ							
		dTdeee	e e
jejef dedededee dededee dee def fddZdSde	e dee fddZdUdedefddZdee fddZd eeeef  fd!d"Zd#eeeef  fd$d%Zd&eeeef  fd'd(Zedeeeef  fd)d*Zedeeeef  fd+d,Z e! 	dVd-d.Z"d/d0 Z#dWd1d2Z$dWd3d4Z%dWd5d6Z&dWd7ef fd8d9Z'dWd7ef fd:d;Z(dWd<d=Z)d>d? Z*	 d@e+dAefdBdCZ,dAede-fdDdEZ.dFe/e/e0  fdGdHZ1deddIfdJdKZ2e3de	e4 fdLdMZ5ede	e fdNdOZ6edPdQ Z7e7j8dRdQ Z7  Z9S )Xr.   z0Base class for encoder decoder CTC-based models.Ncfgtrainerc                    s,  d| _ |d ur|j | _ t j||d t| jj| _t| jj| _t| jX d| jj	vs:| jj	j
sAt| jdrA| jj| jj	_
d| jj	vsL| jj	j
sPtd| jj	jdk ry| jj	jd urytd| jj	jt| jj	j t| jj	j|j	d< W d    n1 sw   Y  t| jj	| _	t| j	jd d| jd	d
d| _t| jdr| jjd urt| jj| _nd | _| jdd }|d u rtt}t| j || j_W d    n1 sw   Y  t| jjt | j	jd| _t!| j| jddd| jddd| _"| #  | j$dddd | %  d S )N   )r/   r0   feat_in	_feat_outz1param feat_in of the decoder's config is not set!zP
Replacing placeholder number of classes ({}) with actual number of classes - {}num_classesTctc_reduction
mean_batchr4   zero_infinity	reductionspec_augmentdecoding)
vocabularyuse_cerFlog_predictionr;   r=   dist_sync_on_stepr>   decoderlosswer)decoder_name	loss_namewer_name)&
world_sizesuper__init__r.   from_config_dict_cfgpreprocessorencoderr   rA   r2   hasattrr3   
ValueErrorr/   r4   r<   r-   infoformatlenr   num_classes_with_blankgetrB   r:   spec_augmentationr
   
structuredr   r;   r   to_containerr   rC   setup_optimization_flagssetup_interctcsetup_adapters)selfr/   r0   decoding_cfg	__class__ Z/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/asr/models/ctc_models.pyrI   4   sb   



zEncDecCTCModel.__init__   Fr   Taudio
batch_sizereturn_hypothesesnum_workerschannel_selector	augmentorverbose
timestampsoverride_configreturnc
           
         s  |p
|	dur	|	j nd}|dur~|s|	durI|	j rItd d}t| jj d| jj_d| jj_W d   n1 s:w   Y  | j| jjdd n5t| jj | jj	dd| jj_| jj	dd| jj_W d   n1 spw   Y  | j| jjdd t
 j|||||||||	d	S )	a  
        Uses greedy decoding to transcribe audio files. Use this method for debugging and prototyping.

        Args:
            audio: (a single or list) of paths to audio files or a np.ndarray/tensor audio array or 
                path to a manifest file.
                Can also be a dataloader object that provides values that can be consumed by the model.
                Recommended length per file is between 5 and 25 seconds.                 But it is possible to pass a few hours long file if enough GPU memory is available.
            batch_size: (int) batch size to use during inference.
                Bigger will result in better throughput performance but would use more memory.
            return_hypotheses: (bool) Either return hypotheses or text
                With hypotheses can do some postprocessing like getting timestamp or rescoring
            num_workers: (int) number of workers for DataLoader
            channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels 
                from multi-channel audio. If set to `'average'`, it performs averaging across channels. 
                Disabled if set to `None`. Defaults to `None`.
            augmentor: (DictConfig): Augment audio samples during transcription if augmentor is applied.
            timestamps: Optional(Bool): timestamps will be returned if set to True as part of hypothesis 
                object (output.timestep['segment']/output.timestep['word']). Refer to `Hypothesis` class 
                for more details. Default is None and would retain the previous state set by 
                using self.change_decoding_strategy().
            verbose: (bool) whether to display tqdm progress bar
            override_config: (Optional[TranscribeConfig]) override transcription config pre-defined by the user.
                **Note**: All other arguments in the function will be ignored if override_config is passed.
                You should call this argument as `model.transcribe(audio, override_config=TranscribeConfig(...))`.

        Returns:
            A list of transcriptions (or raw log probabilities if logprobs is True) in the same order as 
            paths2audio_files
        NzTimestamps requested, setting decoding timestamps to True. Capture them in Hypothesis object,                         with output[idx].timestep['word'/'segment'/'char']TF)rh   compute_timestampspreserve_alignments)	rb   rc   rd   re   rf   rg   rh   ri   rj   )ri   r-   rP   r   r/   r;   rl   rm   change_decoding_strategyrT   rH   
transcribe)
r[   rb   rc   rd   re   rf   rg   rh   ri   rj   r]   r_   r`   ro   x   s8   +
zEncDecCTCModel.transcribenew_vocabularyr\   c              	   C   s  | j j|krtd| j j d| d dS |du s t|dkr'td| | j  }t|}||d< t||d< | ` t	
|| _ | `t| j jd	 d
| jddd| _|du r^| jj}tt}tt|}t||}t|t| j jd| _t| j| jddd
| jddd| _t| jj  || j_ W d   n1 sw   Y  t| jj || j_W d   n1 sw   Y  g d}|D ](}|| jv rt| j|  t|| j| d< W d   n1 sw   Y  qtd| j j d dS )a  
        Changes vocabulary used during CTC decoding process. Use this method when fine-tuning on from pre-trained model.
        This method changes only decoder and leaves encoder and pre-processing modules unchanged. For example, you would
        use it if you want to use pretrained encoder when fine-tuning on a data in another language, or when you'd need
        model to learn capitalization, punctuation and/or special characters.

        If new_vocabulary == self.decoder.vocabulary then nothing will be changed.

        Args:

            new_vocabulary: list with new vocabulary. Must contain at least 2 elements. Typically,             this is target alphabet.

        Returns: None

        zOld z	 and new z match. Not changing anything.Nr   z;New vocabulary must be non-empty list of chars. But I got: r<   r4   r1   Tr5   r6   r7   r\   r<   r=   Fr>   r?   )train_dsvalidation_dstest_dslabelszChanged decoder to output to z vocabulary.)rA   r<   r-   warningrR   rO   to_config_dictcopydeepcopyr.   rJ   rB   r   rS   rK   rT   r/   r;   r
   rV   r   createrW   merger   r   rC   r   rP   )r[   rp   r\   decoder_confignew_decoder_configdecoding_clsds_keyskeyr_   r_   r`   change_vocabulary   sX    






z EncDecCTCModel.change_vocabularyc                 C   s   |du rt d | jj}tt}tt|}t	||}t
|t| jjd| _t| j| jj| jjdd| _|dd| j_t| jj || j_W d   n1 sWw   Y  |rmt dt| jj  dS dS )	af  
        Changes decoding strategy used during CTC decoding process.

        Args:
            decoding_cfg: A config for the decoder, which is optional. If the decoding type
                needs to be changed (from say Greedy to Beam decoding etc), the config can be passed here.
            verbose: (bool) whether to display logging information
        NzONo `decoding_cfg` passed when changing decoding strategy, using internal configrq   T)r;   r=   r>   r@   temperatureg      ?zChanged decoding strategy to 
)r-   rP   r/   r;   r
   rV   r   rz   rW   r{   r   rA   r<   r   rC   r=   r>   rT   r   r   to_yaml)r[   r\   rh   r~   r_   r_   r`   rn   	  s,   	


z'EncDecCTCModel.change_decoding_strategyconfigc                 C   s  t j| j|dd t j| j|dd |drZt||dds"| jn|d|dds/| jn|dtt|dd |d	d
|dd|dd|ddd|ddddS t j	|| j
| j| j| jdd d}|d u rrd S t|try|S |d }t|tjjjrd}t|dr|j}nt|jd dr|jd j}n	|jd jd j}d }|ddrt|tstdt| t| ||}d |d< d|d< d}tjjj||d |d ||dd||dd|ddd	S )Nsample_rate)r   ru   
use_lhotsedo_transcribeFglobal_rankrG   parseren	unk_indexblank_indexnormalize_transcripts)ru   nameunk_idblank_iddo_normalize)	tokenizerreturn_cuts)r   rG   datasetrL   )r   
local_rankr   rG   preprocessor_cfgshuffle
collate_fnr   use_semi_sorted_batchingzmSemi Sorted Batch sampler can be used with AudioToCharDataset or AudioToBPEDataset but found dataset of type rc   	drop_lastre   
pin_memory)	r   rc   samplerbatch_samplerr   r   r   re   r   )r   )inject_dataloader_value_from_model_configr/   rT   r"   r   rG   r   r#   *get_audio_to_text_char_dataset_from_configr   rK   
isinstancer   torchutilsdataIterableDatasetrN   r   datasetsr   RuntimeErrortyper   r   )r[   r   r   r   r   r   r_   r_   r`   _setup_dataloader_from_config0  sx   












z,EncDecCTCModel._setup_dataloader_from_configtrain_data_configc                 C   s   d|vrd|d< | j d|d | j|d| _| jdur\t| jdr^t| jjtjjj	r`| j
durPt| j
jtrPt| j
jtt| jj| j |d   | j
_dS | j
du rbtd	 dS dS dS dS dS )
a  
        Sets up the training data loader via a Dict-like object.

        Args:
            train_data_config: A config that contains the information regarding construction
                of an ASR Training dataset.

        Supported Datasets:
            -   :class:`~nemo.collections.asr.data.audio_to_text.AudioToCharDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text.AudioToBPEDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToBPEDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset`
        r   Ttraindataset_namer   r   Nr   rc   zModel Trainer was not set before constructing the dataset, incorrect number of training batches will be used. Please set the trainer and rebuild the dataset.)_update_dataset_configr   	_train_dlrN   r   r   r   r   r   r   _trainerlimit_train_batchesfloatintr   rR   rG   r-   rv   )r[   r   r_   r_   r`   setup_training_data~  s*   


z"EncDecCTCModel.setup_training_dataval_data_configc                 C   0   d|vrd|d< | j d|d | j|d| _dS )a  
        Sets up the validation data loader via a Dict-like object.

        Args:
            val_data_config: A config that contains the information regarding construction
                of an ASR Training dataset.

        Supported Datasets:
            -   :class:`~nemo.collections.asr.data.audio_to_text.AudioToCharDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text.AudioToBPEDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToBPEDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset`
        r   F
validationr   r   N)r   r   _validation_dl)r[   r   r_   r_   r`   setup_validation_data     z$EncDecCTCModel.setup_validation_datatest_data_configc                 C   r   )a  
        Sets up the test data loader via a Dict-like object.

        Args:
            test_data_config: A config that contains the information regarding construction
                of an ASR Training dataset.

        Supported Datasets:
            -   :class:`~nemo.collections.asr.data.audio_to_text.AudioToCharDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text.AudioToBPEDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToBPEDataset`
            -   :class:`~nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset`
        r   Ftestr   r   N)r   r   _test_dl)r[   r   r_   r_   r`   setup_test_data  r   zEncDecCTCModel.setup_test_datac              	   C   sx   t | jdrt| jjd}nt }td|ddttdt ddtdt ddttdt ddttdt dddS )	N_sample_rate)freqBTT)optionalr   )r   Dr   )input_signalinput_signal_lengthprocessed_signalprocessed_signal_length	sample_id)rN   rL   r'   r   r+   tupler)   r,   )r[   input_signal_eltyper_   r_   r`   input_types  s   zEncDecCTCModel.input_typesc                 C   s(   t dt t tdt t dt dS )N)r   r   r   r   r   )outputsencoded_lengthsgreedy_predictions)r+   r*   r   r)   r(   r[   r_   r_   r`   output_types  s   

zEncDecCTCModel.output_typesc                 C   s   |duo|du}|duo|du}||A dkrt |  d|s(| j||d\}}| jdur7| jr7| j||d}| j||d}|d }|d }	| j|d	}
|
jd
dd}|
|	|fS )a/  
        Forward pass of the model.

        Args:
            input_signal: Tensor that represents a batch of raw audio signals,
                of shape [B, T]. T here represents timesteps, with 1 second of audio represented as
                `self.sample_rate` number of floating point values.
            input_signal_length: Vector of length B, that contains the individual lengths of the audio
                sequences.
            processed_signal: Tensor that represents a batch of processed audio signals,
                of shape (B, D, T) that has undergone processing via some DALI preprocessor.
            processed_signal_length: Vector of length B, that contains the individual lengths of the
                processed audio sequences.

        Returns:
            A tuple of 3 elements -
            1) The log probabilities tensor of shape [B, T, D].
            2) The lengths of the acoustic sequence after propagation through the encoder, of shape [B].
            3) The greedy token predictions of the model of shape [B, T] (via argmax)
        NFz Arguments ``input_signal`` and ``input_signal_length`` are mutually exclusive  with ``processed_signal`` and ``processed_signal_len`` arguments.)r   length)
input_specr   )audio_signalr   r   r1   )encoder_outputr   )dimkeepdim)rO   rL   rU   trainingrM   rA   argmax)r[   r   r   r   r   has_input_signalhas_processed_signalr   encodedencoded_len	log_probsr   r_   r_   r`   forward  s,   
zEncDecCTCModel.forwardc                 C   sn  t | jrt |  |  rt jd| jd |\}}}}t|tr0|jr0| j	||d\}}}	n
| j	||d\}}}	t
| drI| jd urI| jj}
nd}
| j||||d}| |}| j||||d |
 dkd	\}}t | jrut |  ||| jjd d
 tj| jjtjdd |d |
 dkr| jj||||d | j \}}}| j  |d|i ||dS )NTaccess_enabledguidr   r   r   r   r   r1   r   targetsinput_lengthstarget_lengthsr   )compute_werlrdtype)
train_losslearning_rateglobal_steppredictionsr   targets_lengthspredictions_lengthstraining_batch_wer)rB   log)r&   is_access_enabled
model_guidreset_registryis_interctc_enabledset_access_enabledr   r   r   r   rN   r   log_every_n_stepsrB   add_auxiliary_lossesadd_interctc_lossesupdate
_optimizerparam_groupsr   tensorr0   r   float32rC   computereset)r[   batchbatch_nbsignal
signal_len
transcripttranscript_lenr   r   r   r   
loss_valuetensorboard_logsrC   _r_   r_   r`   training_step%  sN   






zEncDecCTCModel.training_stepc                 C   s   |\}}}}}t |tr|jr| j||d\}	}
}n
| j||d\}	}
}| jjj|	|
dd}t |tjr<|	 
  }tt||S )Nr   r   F)decoder_outputsdecoder_lengthsrd   )r   r   r   r   rC   r;   ctc_decoder_predictions_tensorr   Tensorcpudetachnumpylistzip)r[   r  	batch_idxdataloader_idxr	  r
  r  r  r   r   r   r   transcribed_textsr_   r_   r`   predict_step^  s   zEncDecCTCModel.predict_stepc                 C   s  |   rtjd| jd |\}}}}t|tr%|jr%| j||d\}}	}
n
| j||d\}}	}
| j|||	|d}| j	|||dddd\}}| j
j||||	d | j
 \}}}| j
  |||||d	 | d
tj| jjtjd t| jrt|  |S )NTr   r   r   r   val_)r   log_wer_num_denom
log_prefixr   )val_lossval_wer_numval_wer_denomval_werr   r   )r   r&   r   r   r   r   r   r   rB   r   rC   r   r  r  r   r   r  r0   r   r  r   r   )r[   r  r  r  r	  r
  r  r  r   r   r   r  metricsrC   wer_num	wer_denomr_   r_   r`   validation_passq  s@   
	

zEncDecCTCModel.validation_passc                 C   sR   |  |||}t| jjtkr!t| jjdkr!| j| | |S | j| |S )Nr1   )r(  r   r0   val_dataloadersr  rR   validation_step_outputsappend)r[   r  r  r  r%  r_   r_   r`   validation_step  s    zEncDecCTCModel.validation_stepr  c                    "   t  ||}| j||dd |S )Nr  prefix)rH   multi_validation_epoch_endfinalize_interctc_metricsr[   r   r  r%  r]   r_   r`   r0       z)EncDecCTCModel.multi_validation_epoch_endc                    r-  )Ntest_r.  )rH   multi_test_epoch_endr1  r2  r]   r_   r`   r5    r3  z#EncDecCTCModel.multi_test_epoch_endc                 C   sf   | j |||d}dd | D }t| jjtkr+t| jjdkr+| j| | |S | j| |S )N)r  c                 S   s   i | ]\}}| d d|qS )r  r4  )replace).0r   valuer_   r_   r`   
<dictcomp>  s    z,EncDecCTCModel.test_step.<locals>.<dictcomp>r1   )	r(  itemsr   r0   test_dataloadersr  rR   test_step_outputsr+  )r[   r  r  r  logs	test_logsr_   r_   r`   	test_step  s    zEncDecCTCModel.test_stepc                 C   s   | j d ur| j S d S N)r   r   r_   r_   r`   test_dataloader  s   
zEncDecCTCModel.test_dataloaderr  trcfgc                 C   s.   | j |d |d d\}}}t||d}~|S )Nr   r1   r   )logits
logits_len)r   dict)r[   r  rB  rC  rD  r   outputr_   r_   r`   _transcribe_forward  s   z"EncDecCTCModel._transcribe_forwardc                 C   s   | d}| d}| jj|||jd}|jr_|jr0tj|j|jt	ddd}|j
|dd n|}| }t|jd D ] }||d || f  || _|| jd u r]|| j|| _q=~~~|jrqt|| jj| jd	 d
 }|S )NrC  rD  )r  rd   r  T)r   devicer   )non_blockingr   rL   window_stride)popr;   r  rd   is_cudar   emptyshaper   rH  copy_r  rangeclone
y_sequence
alignmentsri   r!   rM   subsampling_factorr/   )r[   r   rB  rC  rD  
hypotheses
logits_cpuidxr_   r_   r`   _transcribe_output_processing  s2   

z,EncDecCTCModel._transcribe_output_processingall_hypothesisc                 C   s   dd |D S )Nc                 S   s   g | ]}|d  qS r   r_   )r7  hypr_   r_   r`   
<listcomp>  s    z5EncDecCTCModel.get_best_hyptheses.<locals>.<listcomp>r_   )r[   rY  r_   r_   r`   get_best_hyptheses  s   z!EncDecCTCModel.get_best_hypthesesztorch.utils.data.DataLoaderc                 C   s   d|v r|d }|d }nt j|d d}t|d t|d }|| jjt| j	j
|dd|dt|t  d d	|d
dd	}|drO|d|d< | jt|d}|S )a  
        Setup function for a temporary data loader which wraps the provided audio file.

        Args:
            config: A python dictionary which contains the following keys:
            paths2audio_files: (a list) of paths to audio files. The files should be relatively short fragments.                 Recommended length per file is between 5 and 25 seconds.
            batch_size: (int) batch size to use during inference.                 Bigger will result in better throughput performance but would use more memory.
            temp_dir: (str) A temporary directory where the audio manifest is temporarily
                stored.
            num_workers: (int) number of workers. Depends of the batch_size and machine.                 0 - only the main process will load batches, 1 - one worker (not main process)

        Returns:
            A pytorch DataLoader for the given audio file(s).
        manifest_filepathrc   temp_dirzmanifest.jsonpaths2audio_filesFre   r1   Trf   N)	r^  r   ru   rc   trim_silencer   re   r   rf   rg   r   )ospathjoinminrR   rL   r   r
   rW   rA   r<   rT   	cpu_countr   r	   )r[   r   r^  rc   	dl_configtemporary_datalayerr_   r_   r`   _setup_transcribe_dataloader  s&   


z+EncDecCTCModel._setup_transcribe_dataloaderc                 C   sX  g }t dddd}|| t dddd}|| t dd	d
d}|| t dddd}|| t dddd}|| t dddd}|| t dddd}|| t dddd}|| t dddd}|| t dddd}|| t d d!d"d}|| t d#d$d%d}|| t d#d$d%d}|| t d&d'd(d}|| |S ))z
        This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud.

        Returns:
            List of available pre-trained models.
        zQuartzNet15x5Base-Ena   QuartzNet15x5 model trained on six datasets: LibriSpeech, Mozilla Common Voice                 (validated clips from en_1488h_2019-12-10), WSJ, Fisher, Switchboard, and NSC Singapore English.                     It was trained with Apex/Amp optimization level O1 for 600 epochs. The model achieves a WER of                     3.79% on LibriSpeech dev-clean, and a WER of 10.05% on dev-other. Please visit                         https://ngc.nvidia.com/catalog/models/nvidia:nemospeechmodels for further details.zmhttps://api.ngc.nvidia.com/v2/models/nvidia/nemospeechmodels/versions/1.0.0a5/files/QuartzNet15x5Base-En.nemo)pretrained_model_namedescriptionlocationstt_en_quartznet15x5zqFor details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_quartznet15x5zwhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_quartznet15x5/versions/1.0.0rc1/files/stt_en_quartznet15x5.nemostt_en_jasper10x5drzpFor details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_jasper10x5drzuhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_jasper10x5dr/versions/1.0.0rc1/files/stt_en_jasper10x5dr.nemostt_ca_quartznet15x5zqFor details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_ca_quartznet15x5zwhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_ca_quartznet15x5/versions/1.0.0rc1/files/stt_ca_quartznet15x5.nemostt_it_quartznet15x5zqFor details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_it_quartznet15x5zwhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_it_quartznet15x5/versions/1.0.0rc1/files/stt_it_quartznet15x5.nemostt_fr_quartznet15x5zqFor details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_fr_quartznet15x5zwhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_fr_quartznet15x5/versions/1.0.0rc1/files/stt_fr_quartznet15x5.nemostt_es_quartznet15x5zqFor details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_quartznet15x5zwhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_es_quartznet15x5/versions/1.0.0rc1/files/stt_es_quartznet15x5.nemostt_de_quartznet15x5zqFor details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_de_quartznet15x5zwhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_de_quartznet15x5/versions/1.0.0rc1/files/stt_de_quartznet15x5.nemostt_pl_quartznet15x5zqFor details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_pl_quartznet15x5zwhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_pl_quartznet15x5/versions/1.0.0rc1/files/stt_pl_quartznet15x5.nemostt_ru_quartznet15x5zqFor details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_ru_quartznet15x5zwhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_ru_quartznet15x5/versions/1.0.0rc1/files/stt_ru_quartznet15x5.nemostt_zh_citrinet_512zpFor details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_citrinet_512zuhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_zh_citrinet_512/versions/1.0.0rc1/files/stt_zh_citrinet_512.nemostt_zh_citrinet_1024_gamma_0_25z|For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_citrinet_1024_gamma_0_25zhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_zh_citrinet_1024_gamma_0_25/versions/1.0.0/files/stt_zh_citrinet_1024_gamma_0_25.nemoasr_talknet_alignerzpFor details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:asr_talknet_alignerzzhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/asr_talknet_aligner/versions/1.0.0rc1/files/qn5x5_libri_tts_phonemes.nemo)r$   r+  )clsresultsmodelr_   r_   r`   list_available_models  s   
	












z$EncDecCTCModel.list_available_modelsc                 C   s   g dS )N) rM   rA   r_   r   r_   r_   r`   adapter_module_names  s   z#EncDecCTCModel.adapter_module_namesc                 C   s   | j S r@  _werr   r_   r_   r`   rC     s   zEncDecCTCModel.werc                 C   s
   || _ d S r@  r  )r[   rC   r_   r_   r`   rC     s   
r@  )ra   Fr   NNTNN)T)NNNNrZ  ):__name__
__module____qualname____doc__r	   r   rI   r   strr   r   r  npndarrayr   r   boolr   r   r   r   ro   r   rn   r   r   r   r   r   propertyr+   r   r   r%   r   r  r  r(  r,  r0  r5  r?  rA  r   rG  r   rX  r  r    r]  ri  classmethodr$   r|  r~  rC   setter__classcell__r_   r_   r]   r`   r.   1   s    G	
JG'N-5
9

*
	(*r
)Lrx   rb  mathr   typingr   r   r   r   r   r  r  r   lightning.pytorchr   	omegaconfr	   r
   r   torch.utils.datar   nemo.collections.asr.datar   'nemo.collections.asr.data.audio_to_textr   ,nemo.collections.asr.data.audio_to_text_dalir   r   .nemo.collections.asr.data.audio_to_text_lhotser   nemo.collections.asr.losses.ctcr    nemo.collections.asr.metrics.werr   %nemo.collections.asr.models.asr_modelr   r   !nemo.collections.asr.parts.mixinsr   r   r   r   /nemo.collections.asr.parts.mixins.transcriptionr   r   0nemo.collections.asr.parts.preprocessing.segmentr   2nemo.collections.asr.parts.submodules.ctc_decodingr   r   -nemo.collections.asr.parts.utils.asr_batchingr   +nemo.collections.asr.parts.utils.rnnt_utilsr    0nemo.collections.asr.parts.utils.timestamp_utilsr!   #nemo.collections.common.data.lhotser"   3nemo.collections.common.parts.preprocessing.parsersr#   nemo.core.classes.commonr$   r%   nemo.core.classes.mixinsr&   nemo.core.neural_typesr'   r(   r)   r*   r+   r,   
nemo.utilsr-   __all__r.   r_   r_   r_   r`   <module>   s>    