o
    }oiq                     @   s   d dl Z d dlZd dlmZmZmZmZmZ d dlZ	d dl
Z
d dlmZmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ G d	d
 d
eZdS )    N)DictListOptionalTupleUnion)
DictConfig	OmegaConf	open_dict)tqdmFrameCtmUnit)DALIOutputs)ASRModel)loggingc                       s  e Zd ZdZdedef fddZdefddZdefdd	Zdefd
dZ	de
jde
jde
jde
jde
jf
ddZde
jde
jfddZde
jde
jdeee
j ee
j f fddZde
jde
jde
jde
jde
jdeeedf  fddZdee dee d ee d!ee d"ee deee ee ee ee f fd#d$Zdee dee d ee d!ee d"ee deee ee ee ee f fd%d&Zd'ed(e
jd)e
jdeeed f fd*d+Zde
jde
jde
jde
jde
jdeeedf  fd,d-Zde
jde
jde
jde
jde
jdeeedf  fd.d/Ze
 dDdeeedf  fd1d2Ze
 	5dEd6ee d7ed8ed9eded f
d:d;Zd<eeee f  fd=d>Z!d?eeee f  fd@dAZ"d?eeee f  fdBdCZ#  Z$S )FAlignerWrapperModelz}ASR model wrapper to perform alignment building.
    Functionality is limited to the components needed to build an alignment.modelcfgc                    s   |j }dD ]
}||v rd ||< qt j||jd || _|dd| _|dd| _|dd| _|d	d
| _	| jdkr>n*| jdkrDn$| jdkrRt
d| j d| jdkr`t
d| j dtd| j | | d S )N)train_dsvalidation_dstest_ds)r   traineralignment_typeforcedword_outputTcpu_decodingFdecode_batch_sizer   argmaxloosezalignment_type=`z!` is not supported at the moment.rnnt_decoding_auxzUnsupported alignment type: )r   super__init__r   _modelgetr   r   r   r   NotImplementedErrorRuntimeError_init_model_specific)selfr   r   	model_cfgds	__class__ `/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/asr/models/k2_aligner_model.pyr    "   s*   



zAlignerWrapperModel.__init__c                 C   s  | j dkrt| jdsdS ddlm} | j dkrat| jdr<| jjr,| jj| _d| j_n
|| jdd	d
d| _| j	| j_
n	|| j| j	d| _|dd}|dur_|d| jj_|d| jj_dS | j dkr| jjsu|| jdd	d
d| j_d| jj_d| jj_| j	| jj_
d| j_dS dS )zPart of __init__ intended to initialize attributes specific to the alignment type for CTC models.

        This method is not supposed to be called outside of __init__.
        r   use_graph_lmNr   ViterbiDecoderWithGraphr   Fk2topo1best)num_classesbackenddec_typereturn_type)r3   split_batch_sizedecoder_module_cfgintersect_prunedintersect_confT)r   hasattrr!   *nemo.collections.asr.modules.graph_decoderr/   r-   transcribe_decodergraph_decoderblank_idr   r7   r"   _decoderr9   r:   return_ilabelsoutput_aligned)r&   r   r/   r8   r+   r+   r,   _init_ctc_alignment_specific<   s>   








z0AlignerWrapperModel._init_ctc_alignment_specificc                    s   j dkrdS ddlm} j dkrk|jdd_|jdd_ddlm m	  fd	d
_
ddlm} |jddjt|djjdd_|dd}|duri|djj_|djj_dS dS )zPart of __init__ intended to initialize attributes specific to the alignment type for RNNT models.

        This method is not supposed to be called outside of __init__.
        r   Nr   r.   r   predictor_window_sizepredictor_step_size)apply_rnnt_prune_rangesget_uniform_rnnt_prune_rangesc                    s0    | |||j d j| dj| jdS )N   device)rD   rE   sizetorJ   )encoder_outputsencoded_lendecoder_outputstranscript_lenrF   rG   r&   r+   r,   <lambda>z   s    zCAlignerWrapperModel._init_rnnt_alignment_specific.<locals>.<lambda>)GraphModuleConfigr0   topo_rnnt_aliminimal)	topo_typerD   rE   )r3   r4   r5   r7   graph_module_cfgr8   r9   r:   )r   r<   r/   rnnt_cfgr"   rD   rE   #nemo.collections.asr.parts.k2.utilsrF   rG   prepare_pruned_outputs%nemo.collections.asr.parts.k2.classesrS   r?   r   r   
structuredr>   r@   r9   r:   )r&   r   r/   rS   r8   r+   rQ   r,   _init_rnnt_alignment_specificj   s8   

z1AlignerWrapperModel._init_rnnt_alignment_specificc                 C   s  ddl m} t| j|rod| _| jjjd | _| j| _	|j
dd}|j
dd}|dks1|dkr8td	| || jd  k sE|| jkrWtd
| j d  d| j d| |dk rb| jjj| n|| _|| _| | dS ddlm} t| j|rd| _| jjjd | _| jjjdu rdn| jjj | _| j| _	t| jjj}d|_t| d|_d|_W d   n1 sw   Y  | j| | | dS tdt | j )zPart of __init__ intended to initialize attributes specific to the model type.

        This method is not supposed to be called outside of __init__.
        r   )EncDecCTCModelctcrH   prob_suppress_indexprob_suppress_value      ?z&Suppression value has to be in (0,1]: z7Suppression index for the provided model has to be in [,z]: N)EncDecRNNTModelrnntgreedy_batchTzUnsupported model type: )!&nemo.collections.asr.models.ctc_modelsr^   
isinstancer!   
model_typedecodernum_classes_with_blankr?   _predict_impl_ctc_predict_implctc_cfgr"   
ValueErrorr`   rb   rC   'nemo.collections.asr.models.rnnt_modelsre   jointlog_softmax_predict_impl_rnntcopydeepcopyr   decodingstrategyr	   preserve_alignmentsfused_batch_sizechange_decoding_strategyr]   r$   type)r&   r   r^   r`   rb   re   decoding_configr+   r+   r,   r%      sH   
 

z(AlignerWrapperModel._init_model_specificrM   rN   rO   rP   returnc                 C   s   | j j|dd}| j j|dd}| ||||\}}| j j|| }| j jjdu r;|js9|jdd}|S | j jjrF|jdd}|S )zA variant of the RNNT Joiner tensor calculation with pruned Encoder and Predictor sum.
        Only the uniform pruning is supported at the moment.
        rH      Nra   dim)	r!   rr   enc	transposepredrZ   	joint_netrs   is_cuda)r&   rM   rN   rO   rP   encoder_outputs_pruneddecoder_outputs_prunedresr+   r+   r,   _rnnt_joint_pruned   s   

z&AlignerWrapperModel._rnnt_joint_pruned	log_probsc              
   C   s  |  }|dddd| jf }t|ddddd| jf |dddd| jd df gdd}tj|jd |jd df| j|jd}d| j|  | 	d
dd|jd d }|t|ddddd| jf ||dddd| jdf gd  S )ac  Multiplies probability of an element with index self.prob_suppress_index by self.prob_suppress_value times
        with stochasticity preservation of the log_probs tensor.
        
        Often used to suppress <blank> probability of the output of a CTC model.
        
        Example:
            For
                - log_probs = torch.log(torch.tensor([0.015, 0.085, 0.9]))
                - self.prob_suppress_index = -1
                - self.prob_suppress_value = 0.5
            the result of _apply_prob_suppress(log_probs) is
                - torch.log(torch.tensor([0.0825, 0.4675, 0.45]))
        NrH   r   ra   r   rI   )expr`   torchcatsumfullshaperb   rJ   	unsqueezerepeatlog)r&   r   	exp_probsxyb1b2r+   r+   r,   _apply_prob_suppress   s   <&,Dz(AlignerWrapperModel._apply_prob_suppressc                 C   s   t | jdr| jj||d\}}}||fS |jddd}| jddd\}}g g }}t|jd D ]C}|| }	|	||d|	f  ||d|	f 
 }
| j}t|	D ]}|
| }||kri|| jkri| j|
|< |}qU|	|
j|jd q3||fS )	zObtains argmax predictions with corresponding probabilities.
        Replaces consecutive repeated indices in the argmax predictions with the <blank> index.
        r=   )r   log_probs_lengthra   F)r   keepdimr   NrI   )r;   r!   r=   forwardr   r   maxranger   appendcpur?   rL   rJ   )r&   r   rN   predictions_probsgreedy_predictionsprobs_tensoriutt_lenpred_candidatepreviousjpr+   r+   r,   _prepare_ctc_argmax_predictions  s$   

z3AlignerWrapperModel._prepare_ctc_argmax_predictionsencoded
transcript	sample_idr   c              
   C   s*  | j jj||ddd }g }t||D ]\}}	|	j }
| j j|
}|	j}dd t||dd t|	j	g D }dgt| }| j
rd	d | j j|
d
D }t| j drb| |||||n| |||||\}}}}||dd t||||D f q||dd t||||D f q|S )a  Builds time alignment of an encoded sequence.
        This method assumes that the RNNT model is used and the alignment type is `argmax`.

        It produces a list of sample ids and fours: (label, start_frame, length, probability), called FrameCtmUnit.
        T)return_hypothesesr   c                 S   s   g | ]\}}|| qS r+   r+   .0r   r   r+   r+   r,   
<listcomp>0      zAAlignerWrapperModel._predict_impl_rnnt_argmax.<locals>.<listcomp>rH   Nrc   c                 S   s   g | ]}|d kr|qS ) r+   )r   wr+   r+   r,   r   4  r    	tokenizerc                 S   "   g | ]\}}}}t ||||qS r+   r   r   tblr   r+   r+   r,   r   ;     " c                 S   r   r+   r   r   r+   r+   r,   r   A  r   )r!   rw   rnnt_decoder_predictions_tensorzip
y_sequencetolistdecode_ids_to_tokenstimesteplen
alignmentsr   decode_tokens_to_strsplitr;   _process_tokens_to_words!_process_char_with_space_to_wordsr   )r&   r   rN   r   rP   r   
hypothesesresultss_id
hypothesispred_idstokenstoken_begin	token_len
token_probwords
word_beginword_len	word_probr+   r+   r,   _predict_impl_rnnt_argmax  s6   
(

z-AlignerWrapperModel._predict_impl_rnnt_argmaxr   r   r   r   r   c                    sn  t | jj|d t | jj|d d ksJ g g g }}}dd |D  d}	|D ]}
| jj|
}t |}|dkrn||	 ||	d < ||	d   ||	 7  <  |	d    |	 7  < ||	= ||	= ||	=  |	= |	= q-|dkr||d dkr||d8 }|	| }|||	  |t||	|  t |	| }|t fd	d
t|	|D |  |}	q-||||fS )z~Transforms alignment information from token level to word level.

        Used when self._model.tokenizer is present.
        r   r   c                 S      g | ]
}|d kr
|ndqS r   rH   r+   r   t_lr+   r+   r,   r   W      z@AlignerWrapperModel._process_tokens_to_words.<locals>.<listcomp>rH   r   ra   z??c                 3        | ]}|  |  V  qd S Nr+   r   ktoken_len_nonzeror   r+   r,   	<genexpr>k      z?AlignerWrapperModel._process_tokens_to_words.<locals>.<genexpr>)r   r!   r   text_to_tokensr   r   r   )r&   r   r   r   r   r   r   r   r   r   word
loc_tokensstepr   denominatorr+   r   r,   r   F  s0   
(z,AlignerWrapperModel._process_tokens_to_wordsc                    s  t |dk d  }t|t|d ksJ dd |D  t|dkrD|d g}t|g}t }	tdd t D |	 g}
nd}|d g}t|d	|d  g}t d	|d  }	t fd
dt|d D |	 g}
|d g}t|dd	 ||dd	 t|g D ]Z\}}}|||  |||  |
|  || |||d   |t||d |  t |d | }	|
t fddt|d |D |	  || q|}||||
fS )zTransforms alignment information from character level to word level.
        This method includes separator (typically the space) information in the results.

        Used with character-based models (no self._model.tokenizer).
        r   r   rH   c                 S   r   r   r+   r   r+   r+   r,   r     r   zIAlignerWrapperModel._process_char_with_space_to_words.<locals>.<listcomp>c                 s   s    | ]	\}}|| V  qd S r   r+   )r   t_pr   r+   r+   r,   r     s    zHAlignerWrapperModel._process_char_with_space_to_words.<locals>.<genexpr>z[SEP]Nc                 3   r   r   r+   r   r   r+   r,   r     r   c                 3   r   r   r+   r   r   r+   r,   r     r   )	nparraynonzeror   r   r   r   r   r   )r&   r   r   r   r   r   	space_idxr   r   r   r   
space_wordwords_with_spacer   r   r   r+   r   r,   r   o  s4   

 
&
0
,z5AlignerWrapperModel._process_char_with_space_to_wordsr   r   probc                    s  t |dkr
|g fS || jkjddd  }||  }|  | jdkrI| jj}|t	t | }t
|dd tt |t | gf}n| jj}|}t
|dd tt |gf}||}	||  }
| } fddt| |dd  t |g D }| jr||d	}t| jd
r| |	||
||n| |	||
||\}}}}|dd t||||D fS |dd t|	||
|D fS )a(  Transforms predictions with probabilities to a list of FrameCtmUnit objects, 
        containing frame-level alignment information (label, start, duration, probability), for a given sample id.

        Alignment information can be either token-based (char, wordpiece, ...) or word-based.
        r   T)as_tuplerf   rH   Nc                    s(   g | ]\}}t  || ||  qS r+   )r   r   	prob_listr+   r,   r     s    z<AlignerWrapperModel._results_to_ctmUnits.<locals>.<listcomp>r   r   c                 S   r   r+   r   r   r+   r+   r,   r     r   c                 S   r   r+   r   r   r+   r+   r,   r     r   )r   r?   r   r   r   rj   r!   rw   r   aranger   tensor_werr   r   r   r   r   r;   r   r   )r&   r   r   r   non_blank_idxr   
wer_moduler   	token_endr   r   r   r   r   r   r   r+   r   r,   _results_to_ctmUnits  s4   
,"

"

z(AlignerWrapperModel._results_to_ctmUnitsc           	         s   |} j dkr |} jdkr ||\}}n) jdkr@ jr4| | | | f\}}}} j||||\}}nt  fddt	|
 ||D S )zBuilds time alignment of an encoded sequence.
        This method assumes that the CTC model is used.

        It produces a list of sample ids and fours: (label, start_frame, length, probability), called FrameCtmUnit.
        rc   r   r   c                        g | ]\}}}  |||qS r+   r   r   r   r   r   r&   r+   r,   r         z9AlignerWrapperModel._predict_impl_ctc.<locals>.<listcomp>)rb   r   r   r   r   r   r>   alignr#   r   r   )	r&   r   rN   r   rP   r   r   r   r   r+   r  r,   rm     s"   




z%AlignerWrapperModel._predict_impl_ctcc                    s    j dkr |||||S  j dkr} jj||dd } jdkr1 j| k r1 ||||n jj||d} jdu rC|j	rCdn j}|rN|jdd	} j
rc| | | | f\}}}} j||||\}	}
 fd
dt| |	|
D S t )zBuilds time alignment of an encoded sequence.
        This method assumes that the RNNT model is used.

        It produces a list of sample ids and fours: (label, start_frame, length, probability), called FrameCtmUnit.
        r   r   )targetstarget_lengthr   )rM   rO   NTra   r   c                    r   r+   r   r   r  r+   r,   r   
  r  z:AlignerWrapperModel._predict_impl_rnnt.<locals>.<listcomp>)r   r   r!   rk   rD   r   r   rr   rs   r   r   r   r>   r  r   r   r#   )r&   r   rN   r   rP   r   decodedr   apply_log_softmaxr   r   r+   r  r,   rt     s,   


z&AlignerWrapperModel._predict_impl_rnntr   c                 C   sj   |\}}}}}t |tr|jr| jj||dd d \}	}
n| jj||dd d \}	}
| |	|
|||S )N)processed_signalprocessed_signal_lengthr   )input_signalinput_signal_length)ri   r   has_processed_signalr!   r   rn   )r&   batch	batch_idxdataloader_idxsignal
signal_lenr   rP   r   r   rN   r+   r+   r,   predict_step  s
   z AlignerWrapperModel.predict_step   NTmanifest
batch_sizenum_workersverbosec              	   C   s   g }| j j}t| j  j}| j jjj}| j jjj}	|du r't	|t
 d }zd| j jj_d| j jj_| j   | j j  | j j  t| j drQ| j j  t }
ttj |||d}| j |}t|d| dD ]"}|d ||d< |d ||d< |d	d
 | |dD 7 }~qoW | j j|d || j jj_|	| j jj_t|
 |du r| j j  | j j  t| j dr| j j  |S | j j|d || j jj_|	| j jj_t|
 |du r| j j  | j j  t| j dr| j j  w w w )a  
        Does alignment. Use this method for debugging and prototyping.

        Args:

            manifest: path to dataset JSON manifest file (in NeMo format).         Recommended length per audio file is between 5 and 25 seconds.
            batch_size: (int) batch size to use during inference.         Bigger will result in better throughput performance but would use more memory.
            num_workers: (int) number of workers for DataLoader
            verbose: (bool) whether to display tqdm progress bar

        Returns:
            A list of four: (label, start_frame, length, probability), called FrameCtmUnit,             in the same order as in the manifest.
        NrH   g        r   rr   )manifest_filepathr  r  Aligning)descdisablec                 S   s   g | ]\}}|qS r+   r+   )r   r   unitr+   r+   r,   r   Q  s    z2AlignerWrapperModel.transcribe.<locals>.<listcomp>)modeT)r!   trainingnext
parametersrJ   preprocessor
featurizerditherpad_tominos	cpu_countevalencoderfreezerk   r;   rr   r   get_verbosityset_verbosityWARNING_setup_transcribe_dataloaderr
   rL   r  trainunfreeze)r&   r  r  r  r  r   r  rJ   dither_valuepad_to_valuelogging_levelconfigtemporary_datalayer
test_batchr+   r+   r,   
transcribe  sb   


zAlignerWrapperModel.transcribetrain_data_configc                 C      t d)Nz'This module cannot be used in training.r$   )r&   r8  r+   r+   r,   setup_training_dataa     z'AlignerWrapperModel.setup_training_dataval_data_configc                 C   r9  )Nz)This module cannot be used in validation.r:  r&   r=  r+   r+   r,   setup_validation_datad  r<  z)AlignerWrapperModel.setup_validation_datac                 C   r9  )Nz&This module cannot be used in testing.r:  r>  r+   r+   r,   setup_test_datag  r<  z#AlignerWrapperModel.setup_test_data)r   )r  NT)%__name__
__module____qualname____doc__r   r   r    rC   r]   r%   r   Tensorr   r   r   r   r   intr   strfloatr   r   r   rm   rt   no_gradr  boolr7  r   r   r   r;  r?  r@  __classcell__r+   r+   r)   r,   r      s    .21


+
)
,
)
%
(
D"r   )ru   r&  typingr   r   r   r   r   numpyr   r   	omegaconfr   r   r	   	tqdm.autor
   .nemo.collections.asr.data.audio_to_ctm_datasetr   ,nemo.collections.asr.data.audio_to_text_dalir   %nemo.collections.asr.models.asr_modelr   
nemo.utilsr   r   r+   r+   r+   r,   <module>   s   