o
    }oiJ                     @   s0  d dl Z d dlZd dlmZmZ d dlmZmZ d dlm	Z	 d dl
mZmZmZmZmZ d dlZd dlZd dlmZ d dlmZmZmZmZ d dlmZ d d	lmZmZ d d
lm Z m!Z! d dl"m#Z#m$Z$ d dl%m&Z&m'Z'm(Z( d dl)m*Z*m+Z+m,Z, d dl-m.Z. d dl/m0Z0m1Z1 d dl2m3Z3 d dl4m5Z5 d dl6m7Z7 d dl8m9Z9 d dl:m;Z; d dl<m=Z= d dl>m?Z? d dl@mAZA d dlBmCZC d dlDmEZE d dlFmGZGmHZHmIZImJZJmKZKmLZLmMZMmNZN d dlOmPZPmQZQ dgZRdd ZSdd  ZTeG d!d" d"e+ZUeG d#d$ d$e,ZVG d%d de#e$e&e'e(ZWd&eXdB d'eYeX fd(d)ZZdS )*    N)MappingSequence)	dataclassfield)ceil)AnyDictListOptionalUnion)Trainer)
DictConfig
ListConfig	OmegaConf	open_dict)
DataLoader) PromptedAudioToTextLhotseDatasetPromptedAudioToTextMiniBatch)BLEUWER)ASRModelExportableEncDecModel)ASRBPEMixinASRModuleMixinASRTranscriptionMixin)GenericTranscriptionTypeInternalTranscribeConfigTranscribeConfig)ChannelSelectorType)MultiTaskDecodingMultiTaskDecodingConfig)TokenClassifier)
Hypothesis)process_aed_timestamp_outputs)
tokenizers)!get_lhotse_dataloader_from_config)GlobalAverageLossMetrictransformer_weights_init)get_full_path)PromptFormatter)	typecheck)AudioSignalChannelType
LabelsTypeLengthsTypeLogprobsTypeMaskType
NeuralTypeSpectrogramType)loggingmodel_utilsEncDecMultiTaskModelc                 C   s4   | j d }tj|| jd}|||| dk }|S )z1
    Create a mask from a tensor of lengths.
    r   )device   )shapetorcharanger7   expand	unsqueeze)lens
max_length
batch_sizer;   mask rB   d/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/asr/models/aed_multitask_models.pylens_to_maskC   s   
rD   c                 C   sj   d| vrt dd| vrt dd| vrt dd| jvr!t dd	| jvr*t d
d| jvr3t dd S )N	tokenizerz:`cfg` must have `tokenizer` config to create a tokenizer !prompt_formatzE`cfg` must have `prompt_format` config to create a multi task model !model_defaultsz;`cfg` must have `model_defaults` config to create a model !asr_enc_hiddenz5`cfg.model_defaults` must have `asr_enc_hidden` key !lm_enc_hiddenz4`cfg.model_defaults` must have `lm_enc_hidden` key !lm_dec_hiddenz4`cfg.model_defaults` must have `lm_dec_hidden` key !)
ValueErrorrG   )cfgrB   rB   rC   _config_checkM   s   


rM   c                   @   s2   e Zd ZU dZdZee ed< dZee ed< dS )$MultiTaskTranscriptionInternalConfig4
    Configuration for Multi Task Transcription
    Nmanifest_filepathprimary_language)	__name__
__module____qualname____doc__rP   r
   str__annotations__rQ   rB   rB   rB   rC   rN   ^   s   
 rN   c                   @   sp   e Zd ZU dZdZeeeeeef f  dB ed< dZ	eed< dZ
eed< edd	 d
Zee ed< dd ZdS )MultiTaskTranscriptionConfigrO   Npromptanswer
text_fieldtarget_lang
lang_fieldc                   C      t  S N)rN   rB   rB   rB   rC   <lambda>s   s    z%MultiTaskTranscriptionConfig.<lambda>)default_factory	_internalc                 C   s   t | j| _d S r_   )parse_multitask_promptrY   selfrB   rB   rC   __post_init__v   s   z*MultiTaskTranscriptionConfig.__post_init__)rR   rS   rT   rU   rY   listdictrV   rW   r[   r]   r   rb   r
   rN   rf   rB   rB   rB   rC   rX   h   s   
 $rX   c                       s  e Zd ZdZd[dedef fddZdefdd	Z		d\d
ee	ef de	de
e de
e	 fddZ	d\de
e	 de
eee	ef   fddZe 								d]dee	ee	 ejef dededede
e dedede
e de
e deee	 ee f f fd d!Zd"e
e fd#d$Zd%e
e fd&d'Zd(e
eeef  fd)d*Zd+e
eeef  fd,d-Zede
ee	e f  fd.d/Z!ede
ee	e f  fd0d1Z"e# 						d^d2d3Z$d4e%fd5d6Z&d_d4e%fd8d9Z'd`d:d;Z(d`d<d=Z)d>d? Z*	 d@ef fdAdBZ+dCee	 dDe	d@edee	ef f fdEdFZ,d4e%e-ej.dGf B d@ede/fdHdIZ0d@ede1fdJdKZ2d"eddLfdMdNZ3d@ef fdOdPZ4d@efdQdRZ5e6defdSdTZ7	dad4e%fdUdVZ8edee	 fdWdXZ9ede/fdYdZZ:  Z;S )br6   z$Base class for AED multi-task modelsNrL   trainerc                    s~  t |}t |}t| |j| _|j| _| |j t	| j}|| j|
d }d ur3t|nd d| _t j||d t| jj| _t| jj| _| jjj}| jjj}||krgtj||| _ntj | _| j
dd }d| _|d ur|d dkrd| _t|| _d	| jjjd
   | j  fdd |j!}dt"| jj#d  }	t$| d|v r|	|d d< W d    n1 sw   Y  t|| _!t$| jj% |	| jj%_&W d    n1 sw   Y  t| jj%| _'t(| j't)r| j!j*j+j,| j'j-j._,d	| jjjd
   | j!  fdd | j'  fdd | j
dd }
|
d u rJt/t0}
t$| j |
| j_1W d    n	1 sEw   Y  t2| jj1| j!| j'| jd| _1t$| jj3 | jj4| jj3_4W d    n	1 sqw   Y  t| jj3| _3t5| jdr| jj6d urt| jj6| _7nd | _7t8ddd| _9t:| j1| j
dd| _;t<| j1| j
dddd| _=| >  d S )Nprompt_defaultsrE   defaults)rL   ri   transf_encoderF
num_layersr   Tr8         ?c                    
   t |  S r_   r'   modulestd_init_rangerB   rC   r`         
 z/EncDecMultiTaskModel.__init__.<locals>.<lambda>   config_dict
vocab_sizec                    rp   r_   r'   rq   rs   rB   rC   r`      ru   c                    rp   r_   r'   rq   rs   rB   rC   r`      ru   decodingdecoding_cfgtransformer_decoderlog_softmax_modulerE   spec_augment)dist_sync_on_steptake_avg_losslog_prediction)r   bleu_tokenizer13a)tokenizer   )?r5   #convert_model_config_to_dict_configmaybe_update_config_versionrM   rF   sample_rate_setup_tokenizerrE   r*   resolvegetr   to_containerrY   super__init__r6   from_config_dictrL   preprocessorencoderrG   rH   rJ   r:   nnLinearencoder_decoder_projIdentityuse_transf_encoderrm   rI   applytransf_decoderr   rx   r   headnum_classeslog_softmax
isinstancer!   	embeddingtoken_embeddingweightmlplayer0
structuredr    ry   r   losspad_idhasattrr~   spec_augmentationr&   val_lossr   werr   bleusetup_adapters)re   rL   ri   
prompt_clspdasr_enc_hidden_sizedecoder_hidden_sizetransf_encoder_cfg_dicttransf_decoder_cfg_dictrx   r{   	__class__rs   rC   r   }   s   







zEncDecMultiTaskModel.__init__r{   c                 C   s   |du rt d | jj}tt}tt|}t	||}t
|| j| j| jd| _t| jj || j_W d   n1 sAw   Y  t dt| jj  dS )a*  
        Changes decoding strategy used during Multi Task decoding process.

        Args:
            decoding_cfg: A config for the decoder, which is optional. If the decoding type
                needs to be changed (from say Greedy to Beam decoding etc), the config can be passed here.
        NzONo `decoding_cfg` passed when changing decoding strategy, using internal configrz   zChanged decoding strategy to 
)r4   inforL   ry   r   r   r    creater   merger   r   r   rE   r   to_yaml)re   r{   decoding_clsrB   rB   rC   change_decoding_strategy   s    


z-EncDecMultiTaskModel.change_decoding_strategynew_tokenizer_dirnew_tokenizer_typerF   c              
      sF  t |ttfr|dkrt |tst|}|}n	td| d}|dur(|}ntj|s5t	d| |
 dvr?tdt||d}|du rO| jj}| | | jj }| j }dt| jjd  }	t| d	|v rx|	|d	 d
< W d   n1 sw   Y  | j }
t|| _| j }|
 D ]'\}}||v r|j|| jkr|||< qtd| d|j d|| j  q| j| t| jj |	| jj_W d   n1 sw   Y  | `t| jj| _t | jt r| jj!j"j#| jj$j%_#d| jj&j'd   | j( fdd |du r| jj)}t*t+}tt,|}t-||}| `)t.|| j| j| jd| _)t| jj) || j_)W d   n	1 sQw   Y  t| jj/ | jj0| jj/_0W d   n	1 sow   Y  | `/t| jj/| _/t| j || j_W d   n	1 sw   Y  t1d| d dS )a*  
        Changes vocabulary used during AED decoding process. Use this method when fine-tuning on
        from pre-trained model. This method changes only decoder and leaves encoder and pre-processing
        modules unchanged. For example, you would use it if you want to use pretrained encoder when
        fine-tuning on data in another language, or when you'd need model to learn capitalization,
        punctuation and/or special characters.

        Args:
            new_tokenizer_dir: Directory path to tokenizer or a config for a new tokenizer
                (if the tokenizer type is `agg`)
            new_tokenizer_type: Type of tokenizer. Can be either `agg`, `bpe` or `wpe`.
            decoding_cfg: A config for the decoding, which is optional. If the decoding type
                needs to be changed (from say Greedy to Beam decoding etc), the config can be passed here.
            prompt_format: A string alias of the object that represents the prompt structure.
                If not None, it will be used to update the prompt format.
        aggzyNew tokenizer dir should be a string unless the tokenizer is `agg`, but this                          tokenizer type is: NzJNew tokenizer dir must be non-empty path to a directory. But instead got: )bpewpez0New tokenizer type must be either `bpe` or `wpe`)dirtyperv   rw   rx   zSkipping key `z|` in the `transf_decoder` module from original state dict due to shape mismatch after change in vocabulary.
Original shape: z, New shape: r8   ro   c                    rp   r_   r'   rq   rs   rB   rC   r`   m  ru   z8EncDecMultiTaskModel.change_vocabulary.<locals>.<lambda>rz   zChanged decoder to output to z vocabulary.)2r   rh   r   r   r   rK   ospathisdirNotADirectoryErrorlowerrL   rF   r   rE   	get_vocabr   to_config_dictr   rx   r   
state_dictr6   r   itemsr9   r4   warningload_state_dictr   r   r   r!   r   r   r   r   r   rG   rJ   r   ry   r   r    r   r   r   r   r   r   )re   r   r   r{   rF   new_tokenizer_cfgtokenizer_cfg
vocabularyr   rx   original_decoder_state_dictdecoder_state_dictog_keyog_valuer   rB   rs   rC   change_vocabulary
  s   











z&EncDecMultiTaskModel.change_vocabularyrj   c                 C   s  |dur|| _ |dur<t|tstd|D ]}t|ts!tdd|vr)tdd|vr1tdqt|ts<t|}t	| j }|| j
| jd }durTt|ndd| _t| j | j | j_ || j_W d   n1 srw   Y  td	| j  d
 dS )a^  
        Changes the prompt format used during Multi Task decoding process.

        Args:
            prompt_format: A string alias of the object that represents the prompt structure.
                If not None, it will be used to update the prompt format.
            prompt_defaults: A dictionary of default values for the prompt format.
        Nz0`prompt_defaults` must be a list of dictionariesrolezR`prompt_defaults` must have a `role` key for each item in the list of dictionariesslotszS`prompt_defaults` must have a `slots` key for each item in the list of dictionariesrj   rk   zChanged prompt format to ``)rF   r   r   rK   r   r   r   r   r*   r   rE   rL   r   r   rY   r   rj   r4   r   )re   rF   rj   itemr   r   rB   rB   rC   change_prompt  s:   



 

z"EncDecMultiTaskModel.change_prompt   Fr   Taudior@   return_hypothesesnum_workerschannel_selector	augmentorverbose
timestampsoverride_configreturnc
              	      s   |dur |du rd}n|du rd}n
t |}|dv sJ ||
d< |	du r0t|||||||
d}nt|	tsAtd	t d
t|	 |	}t j||dS )a	  
        Uses greedy decoding to transcribe audio files. Use this method for debugging and prototyping.
        Args:
            audio: (a single or list) of paths to audio files or a np.ndarray/tensor audio array or path 
                to a manifest file.
                Can also be a dataloader object that provides values that can be consumed by the model.
                Recommended length per file is between 5 and 25 seconds.                 But it is possible to pass a few hours long file if enough GPU memory is available.
            batch_size: (int) batch size to use during inference.
                Bigger will result in better throughput performance but would use more memory.
            return_hypotheses: (bool) Either return hypotheses or text
                With hypotheses can do some postprocessing like getting timestamp or rescoring
            num_workers: (int) number of workers for DataLoader
            channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels 
                from multi-channel audio. If set to `'average'`, it performs averaging across channels. 
                Disabled if set to `None`. Defaults to `None`.
            augmentor: (DictConfig): Augment audio samples during transcription if augmentor is applied.
            timestamps: Optional(Bool): timestamps will be returned if set to True as part of hypothesis 
                object (output.timestep['segment']/output.timestep['word']). Refer to `Hypothesis` class 
                for more details. Default is None and would retain the previous state set by using 
                self.change_decoding_strategy(). 
            Note: Currently its not supported for AED models.
            verbose: (bool) whether to display tqdm progress bar
            override_config: (Optional[MultiTaskTranscriptionConfig]) A config to override the 
                default config.
            **prompt: Optional input to construct the prompts for the model. Accepted formats are: 
                1) legacy Canary-1B API source_lang=<lang>, target_lang=<lang>, etc. 
                2) explicit single-turn role=<role>, slots={<slot>: <value>, ...} 
                3) explicit multi-turn: turns=[{"role": <role>, "slots": {<slot>: <value>, ...}}]

        Returns:
            A list of transcriptions (or raw log probabilities if logprobs is True) in the same order 
            as paths2audio_files
        NTyesFno)r   r   	timestampnotimestamp10r   )r@   r   r   r   r   r   rY   z override_config must be of type z
, but got )r   r   )rV   rX   r   rK   r   r   
transcribe)re   r   r@   r   r   r   r   r   r   r   rY   trcfgr   rB   rC   r     s4   0

zEncDecMultiTaskModel.transcribeconfigc                 C   sP   | dds
J d| d| j}| d| j}t|||t| j| jd| jdS )N
use_lhotseFztMulti-task model only supports dataloading with Lhotse. Please set config.{train,validation,test}_ds.use_lhotse=Trueglobal_rank
world_size)rE   rY   )r   r   datasetrE   )r   r   r   r%   r   rE   rY   )re   r   r   r   rB   rB   rC   _setup_dataloader_from_config  s   z2EncDecMultiTaskModel._setup_dataloader_from_configtrain_data_configc                 C   s   | j d|d | j|d| _d|v rG|d rI| jd ur;t| jjtr;t| jjtt	| jj
| j |d   | j_d S | jd u rKtd d S d S d S d S )Ntraindataset_namer   r   	is_tarredr@   zModel Trainer was not set before constructing the dataset, incorrect number of training batches will be used. Please set the trainer and rebuild the dataset.)_update_dataset_configr   	_train_dl_trainerr   limit_train_batchesfloatintr   lenr   r   r4   r   )re   r   rB   rB   rC   setup_training_data(  s   

z(EncDecMultiTaskModel.setup_training_dataval_data_configc                 C   0   d|vrd|d< | j d|d | j|d| _dS )ap  
        Sets up the validation data loader via a Dict-like object.
        Args:
            val_data_config: A config that contains the information regarding construction
                of an ASR Training dataset.
        Supported Datasets:
            -   :class:`~nemo.collections.asr.data.audio_to_text_lhotse_prompted.PromptedAudioToTextLhotseDataset`
        shuffleF
validationr   r   N)r   r   _validation_dl)re   r   rB   rB   rC   setup_validation_dataB     	z*EncDecMultiTaskModel.setup_validation_datatest_data_configc                 C   r   )ak  
        Sets up the test data loader via a Dict-like object.
        Args:
            test_data_config: A config that contains the information regarding construction
                of an ASR Training dataset.
        Supported Datasets:
            -   :class:`~nemo.collections.asr.data.audio_to_text_lhotse_prompted.PromptedAudioToTextLhotseDataset`
        r   Ftestr   r   N)r   r   _test_dl)re   r  rB   rB   rC   setup_test_dataR  r  z$EncDecMultiTaskModel.setup_test_datac                 C   s   t | jdrt| jjd}nt }td|ddttdt ddtdt ddttdt ddtdt ddttdt ddtdt ddttdt ddttdt ddd	S )	N_sample_rate)freqBTT)optionalr  )r  Dr  )	input_signalinput_signal_lengthprocessed_signalprocessed_signal_length
transcripttranscript_lengthrY   prompt_length	sample_id)	r   r   r,   r  r2   tupler/   r3   r.   )re   input_signal_eltyperB   rB   rC   input_typesb  s   z EncDecMultiTaskModel.input_typesc                 C   s2   t dt t tdt t dt t dt dS )N)r  r  r  r  r
  )transf_log_probsencoded_lengthsencoder_statesencoder_mask)r2   r0   r  r/   r-   r1   rd   rB   rB   rC   output_typest  s
   


z!EncDecMultiTaskModel.output_typesc                 C   s  |duo|du}|duo|du}||A dkrt |  d|s(| j||d\}}| jdur7| jr7| j||d}| j||d\}	}
|	ddd	}| |}t|
|jd	 	|j
}| jrb| j||d
}d}|durt||jd	 	|j
}| j||||d}| j|d}||
||fS )aO  
        Forward pass of the model.
        Args:
            input_signal: Tensor that represents a batch of raw audio signals,
                of shape [B, T]. T here represents timesteps, with 1 second of audio represented as
                `self.sample_rate` number of floating point values.
            input_signal_length: Vector of length B, that contains the individual lengths of the audio
                sequences.
            processed_signal: Tensor that represents a batch of processed audio signals,
                of shape (B, D, T).
            processed_signal_length: Vector of length B, that contains the individual lengths of the
                processed audio sequences.
            # TODO: Add support for `transcript` and `transcript_length` in the docstring

        Returns:
            A tuple of 3 elements -
            1) The log probabilities tensor of shape [B, T, D].
            2) The lengths of the acoustic sequence after propagation through the encoder, of shape [B].
            3) The greedy token predictions of the model of shape [B, T] (via argmax)
        NFz Arguments ``input_signal`` and ``input_signal_length`` are mutually exclusive  with ``processed_signal`` and ``processed_signal_len`` arguments.)r  length)
input_specr  )audio_signalr  r      r8   )r  r  )	input_idsdecoder_maskencoder_embeddingsr  )hidden_states)rK   r   r   trainingr   permuter   rD   r9   todtyper   rm   r   r   )re   r  r  r  r  r  r  has_input_signalhas_processed_signalencodedencoded_len
enc_statesenc_maskr  dec_mask
dec_statesrB   rB   rC   forward}  s4   

zEncDecMultiTaskModel.forwardbatchc                 C   s.  |d u r
t dgS | \}}|jd }|j  }|j  }t j|j	 |j
t jd}t j|j	 |j
t jd}	| j|j|j||d\}
}}}| jddrj|jjd d }t||t|jd | @ }nd }| j|
||d}|t | jjd d	 t |jjd |||| ||	 d
}||dS )Ng        r8   )r7   r*  r  r  r  r  use_loss_mask_for_promptF	log_probslabelsoutput_maskr   lr)
train_losslearning_rater@   
num_frames
num_tokensinput_to_padding_ratiooutput_to_padding_ratio)r   log)r:   tensorget_decoder_inputs_outputsprompted_transcript_lens
audio_lenssumr   	as_tensorr   numelr7   prompted_transcriptr3  rL   r   r9   rD   prompt_lensr   
_optimizerparam_groups)re   r4  batch_nbr#  r9  input_ids_lensr>  r?  
tot_frames
tot_tokensr  r.  r/  r0  maxlen	loss_mask
audio_losstensorboard_logsrB   rB   rC   training_step  s8   



z"EncDecMultiTaskModel.training_stepvalc                 C   sV  |  \}}|jd }| j|j|j||jd\}}	}
}| jddr?|jjd d }t	||t	|j
d | @ }|  }nd }|jd |jd  }| j|||d}| j||d | d|i}| jj|
|	|j|j||jd	 | j \}}}||||d
 | j  | jj|
|	|j|j||jd	 | jj| dd}|| | j  |S )Nr8   r5  r6  Fr   r7  )r   num_measurements_loss)predictionspredictions_lengthstargetstargets_lengthspredictions_maskr#  )val_werval_wer_numval_wer_denom_)prefix)rD  rE  r3  r   rF  rL   r   rJ  r9   rD   rK  longrG  r   r   r   updater  transcript_lensrY   computeresetr   )re   r4  	batch_idxdataloader_idx	eval_moder#  r9  rO  r  r.  r/  r0  rR  rS  rX  transf_lossoutput_dictr   wer_num	wer_denombleu_metricsrB   rB   rC   validation_pass  sP   




z$EncDecMultiTaskModel.validation_passc                 C   V   | j |||dd}t| jjtkr#t| jjdkr#| j| | |S | j| |S )NrW  rk  r8   rq  r   ri   val_dataloadersrg   r   validation_step_outputsappendre   r4  ri  rj  metricsrB   rB   rC   validation_step      z$EncDecMultiTaskModel.validation_stepc                 C   rr  )Nr  rs  r8   rt  rx  rB   rB   rC   	test_step#  r{  zEncDecMultiTaskModel.test_stepc                 C   s   | j d ur| j S d S r_   )r  rd   rB   rB   rC   test_dataloader+  s   
z$EncDecMultiTaskModel.test_dataloaderr   c                    s   t  || | j  t|trHtdt| d td t| j	t
jrJt|drLt|jdrN| j	jd |j_td|jj d d	S d	S d	S d	S d	S )
aQ  
        Transcription setup method.
        Args:
            audio: A list of paths to audio files or a path to a manifest file.
            trcfg: A config for the transcription, which is optional. If the decoding type
                needs to be changed (from say Greedy to Beam decoding etc), the config can be passed here.
        zFound 'audio' to be a list of z items.z6Assuming each item in 'audio' is a path to audio file.rb   rQ   r   z%Transcribing with default setting of .N)r   _transcribe_on_beginr   freezer   rg   r4   debugr   rE   r$   AggregateTokenizerr   rb   langsrQ   )re   r   r   r   rB   rC   r  1  s   


z)EncDecMultiTaskModel._transcribe_on_beginaudio_filestemp_dirc                    s&   |j j}| |||}t |||S )aj  
        Internal function to process the input audio filepaths and return a config dict for the dataloader.
        This implementation adds support for dictionaries as manifest items.

        Args:
            audio_files: A list of string filepaths for audio files, or a single string filepath for a manifest file.
            temp_dir: A temporary directory to store intermediate files.
            trcfg: The transcription config dataclass. Subclasses can change this to a different dataclass if needed.

        Returns:
            A config dict that is used to setup the dataloader for transcription.
        )rb   rP   _may_be_make_dict_and_fix_pathsr   %_transcribe_input_manifest_processing)re   r  r  r   rP   r   rB   rC   r  G  s   z:EncDecMultiTaskModel._transcribe_input_manifest_processing.c                    sf  t |tr|j}|j}|j}n|d |d }}t|dkr#|d }nd}|jd }| j||d\}}}	}
|du r| j }|jsD|}nQ|j	 }|D ]I}|d   fdd	|D  }rt|dkrnt
d
 d d |d d }|d 	 |d< | D ]\}}|d |du r||d |< qqK| jj|dd d|d|jj}t|||	|
|dS )au  
        Internal function to perform the model's custom forward pass to return outputs that are processed by
        `_transcribe_output_processing()`.
        This function is called by `transcribe()` and `transcribe_generator()` to perform the model's forward pass.

        Args:
            batch: A batch of input data from the data loader that is used to perform the model's forward pass.
            trcfg: The transcription config dataclass. Subclasses can change this to a different dataclass if needed.

        Returns:
            The model's outputs that are processed by `_transcribe_output_processing()`.
        r   r8      r   N)r  r  r   c                    s   g | ]
}|d   kr|qS r   rB   .0tr  rB   rC   
<listcomp>      z<EncDecMultiTaskModel._transcribe_forward.<locals>.<listcomp>z-More than one default turn detected for role=z@. We'll be using default slot values for the first turn of role=z only.r   )turnscontext_ids)r8  r  r  r  decoder_input_ids)r   r   r   rF  rY   r   r9   r3  get_default_dialog_slotscopywarningswarnr   r   encode_dialogr=   repeatr)  rb   r7   rh   )re   r4  r   r   rF  r  r@   r8  r.  r/  r0  default_turnsr  turndefault_turns_for_roledefault_slotsslotrW  rB   r  rC   _transcribe_forward\  sT   





z(EncDecMultiTaskModel._transcribe_forwardc           	      C   sp   | d}| d}| d}| d}| d}~~| jj||||jd}~~~t|| jj| jd d }|S )	aW  
        Internal function to process the model's outputs to return the results to the user. This function is called by
        `transcribe()` and `transcribe_generator()` to process the model's outputs.

        Args:
            outputs: The model's outputs that are processed by `_transcribe_forward()`.
            trcfg: The transcription config dataclass. Subclasses can change this to a different dataclass if needed.

        Returns:
            The output can be a list of
            objects, list of list of objects.
            Its type is defined in `TranscriptionReturnType`.
        r8  r  r  r  r  encoder_hidden_statesencoder_input_maskr  r   r   window_stride)popry   decode_predictions_tensorr   r#   r   subsampling_factorrL   )	re   outputsr   r8  r.  r/  r0  r  
hypothesesrB   rB   rC   _transcribe_output_processing  s"   




z2EncDecMultiTaskModel._transcribe_output_processingztorch.utils.data.DataLoaderc                 C   s   d|v r|d }|d }nt j|d d}t|d t|d }|| jj|dd|dt|t  d d	d	dd|d
d|dd|dd|dd|ddd}| j	t
|d}|S )a  
        Setup function for a temporary data loader which wraps the provided audio file.
        Args:
            config: A python dictionary which contains keys such as:
                paths2audio_files: (a list) of paths to audio files. The files should be relatively short fragments.                     Recommended length per file is between 5 and 25 seconds.
                batch_size: (int) batch size to use during inference.                     Bigger will result in better throughput performance but would use more memory.
                temp_dir: (str) A temporary directory where the audio manifest is temporarily
                    stored.
        Returns:
            A pytorch DataLoader for the given audio file(s).
        rP   r@   r  zmanifest.jsonpaths2audio_filesFr   r8   Tr[   rZ   r]   r\   r   Npad_min_durationg      ?pad_directionboth)rP   r   r@   trim_silencer   r   
pin_memoryr   use_bucketing	drop_lastr[   r]   r   r  r  r   )r   r   joinminr   r   r  r   	cpu_countr   r   )re   r   rP   r@   	dl_configtemporary_datalayerrB   rB   rC   _setup_transcribe_dataloader  s.   





z1EncDecMultiTaskModel._setup_transcribe_dataloaderc                    s   t  | | jjdd dS )z
        Internal function to teardown the model after transcription. Perform all teardown and post-checks here.

        Args:
            trcfg: The transcription config dataclass. Subclasses can change this to a different dataclass if needed.
        T)partialN)r   _transcribe_on_endr   unfreeze)re   r   r   rB   rC   r    s   z'EncDecMultiTaskModel._transcribe_on_endc           
      C   s   g }|D ]o}t |tr|dd}nt |tr#|}t|d |d|d< n	tdt| dd |jD }|r<|d d	 ni }| jd
kr[d|v rO|d rOtdd|v r[|d r[tddD ]\}}	||vrm|||	||< q]|	| q|S )a  
        Utility method to convert a list of strings to a list of dictionaries.

        Args:
            json_items: A list of strings or dictionaries.
            manifest_path: A path to a manifest file.
            trcfg: The transcription config dataclass. Subclasses can change this to a different dataclass if needed.

        Returns:
            A list of dictionaries with the audio file paths fixed.
        i )audio_filepathdurationr  )manifest_filezExpected str or dict, got c                 S   s   g | ]
}|d  dkr|qS )r   userrB   r  rB   rB   rC   r  !  r  zHEncDecMultiTaskModel._may_be_make_dict_and_fix_paths.<locals>.<listcomp>r   r   canaryr   zrTimestamp feature is not supported in Canary prompt format. Please use latest canary-1b-flash or canary-180m-flashcontextzpContext feature is not supported in Canary prompt format. Please use latest canary-1b-flash or canary-180m-flash))source_langen)r\   r  )tasknameasr)pncr   )r   )r   r   )
r   rV   rh   r)   rK   r   rY   rF   r   rw  )
re   
json_itemsmanifest_pathr   out_json_itemsr   entrydefault_turnkdvrB   rB   rC   r    s6   


z4EncDecMultiTaskModel._may_be_make_dict_and_fix_pathsc                 C   r^   )z
        Utility method that returns the default config for transcribe() function.

        Returns:
            A dataclass
        )rX   )clsrB   rB   rC   get_transcribe_config=  s   z*EncDecMultiTaskModel.get_transcribe_configc                 C   s   |r|j }|j}d }d }n
d }d }|j }|j}| j||||d\}	}	}
}| jj|
||jdd}t|| jj| j	d d }|j
rGtt|j
|S |S )N)r  r  r  r  Fr  r   r  )r   rF  r3  ry   r  rY   r#   r   r  rL   cutsrg   zip)re   r4  ri  rj  r,  r  r  signal
signal_lenrb  r/  r0  r  rB   rB   rC   predict_stepG  s6   z!EncDecMultiTaskModel.predict_stepc                 C   s   g dS )N)r  r   rm   r   rB   rd   rB   rB   rC   adapter_module_namesl  s   z)EncDecMultiTaskModel.adapter_module_namesc                 C   sx   t dtdt dddtdt dddtdt d| jjd	d
tdt ddddddddddddddgdS )z
        Return a typing schema for optimal batch size calibration for various
        sequence lengths using OOMptimizer.
        r   r
  input)namer   
seq_lengthrF  )r  rJ  output)r  r   r  rx   rE  r  dummy)r  r   rf  rY   rK  )r  inputs)r   r2   r,   r/   r.   rE   rx   rd   rB   rB   rC   oomptimizer_schemap  s$   

z'EncDecMultiTaskModel.oomptimizer_schemar_   )NN)r   Fr   NNTNN)NNNNNN)r   rW  )r   )r   r   F)<rR   rS   rT   rU   r   r   r   r   r   rV   r
   r   r	   r   r   r   r:   no_gradnpndarrayr   r   boolr   rX   r"   r   r   r   r  r  propertyr2   r  r  r+   r3  r   rV  rq  rz  r|  r}  r  r  r  Tensorrh   r  r   r  r  r  r  classmethodr  r  r  r  __classcell__rB   rB   r   rC   r6   z   s    n#

 	
5	
O@*
3


P%*7

%rY   r   c                 C   s   | d u s| sg S d| v r0t | dkr$t| d tr$tdd | d D s,J d| d| d S tdd |  D }|rDJ d| d	| v r^d
| v r^t| d
 ts[J d| d| gS | d	d}t|| dgS )Nr  r8   c                 s   s*    | ]}t |tod |v od|v V  qdS )r   r   Nr   rh   r  rB   rB   rC   	<genexpr>  s   ( z)parse_multitask_prompt.<locals>.<genexpr>zWhen providing a multi-turn prompt through 'turns', no other keys are allowed and the value under prompt['turns'] must be a list of dicts with roles and slot values (we received prompt=)c                 s   s&    | ]\}}|d krt |tV  qdS )r   Nr  )r  r  vrB   rB   rC   r    s   $ zTWe don't support dict values for prompt keys other than 'slots'. We received prompt=r   r   zfWhen providing a single-turn prompt through 'role', 'slots' must also be provided (we received prompt=z).r  )r   r   )r   r   rg   allanyr   rh   r  )rY   values_are_dictsr   rB   rB   rC   rc     s4   	rc   )[r   r  collections.abcr   r   dataclassesr   r   mathr   typingr   r   r	   r
   r   numpyr  r:   lightning.pytorchr   	omegaconfr   r   r   r   torch.utils.datar   7nemo.collections.asr.data.audio_to_text_lhotse_promptedr   r   nemo.collections.asr.metricsr   r   %nemo.collections.asr.models.asr_modelr   r   !nemo.collections.asr.parts.mixinsr   r   r   /nemo.collections.asr.parts.mixins.transcriptionr   r   r   0nemo.collections.asr.parts.preprocessing.segmentr   8nemo.collections.asr.parts.submodules.multitask_decodingr   r    6nemo.collections.asr.parts.submodules.token_classifierr!   +nemo.collections.asr.parts.utils.rnnt_utilsr"   0nemo.collections.asr.parts.utils.timestamp_utilsr#   nemo.collections.commonr$   .nemo.collections.common.data.lhotse.dataloaderr%   nemo.collections.common.metricsr&   nemo.collections.common.partsr(   4nemo.collections.common.parts.preprocessing.manifestr)   )nemo.collections.common.prompts.formatterr*   nemo.core.classes.commonr+   nemo.core.neural_typesr,   r-   r.   r/   r0   r1   r2   r3   
nemo.utilsr4   r5   __all__rD   rM   rN   rX   r6   rh   rg   rc   rB   rB   rB   rC   <module>   s^   (

	        