o
    }oil                     @   s  d dl mZ d dlmZ d dlmZ d dlmZmZm	Z	m
Z
 d dlZd dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZmZmZmZ d dlmZmZmZ d dlm Z  d dl!m"Z" G dd de#Z$G dd de$eZ%G dd de$eZ&G dd de$Z'G dd deZ(dS )    )abstractmethod)defaultdict)Path)ListOptionalTupleUnionN)wer)
DictConfig)GraphIntersectDenseConfig)
CtcK2MixinRnntK2Mixin)create_supervisioninvert_permutationlevenshtein_graph_k2
load_graph)AbstractWFSTDecoderWfstNbestHypothesiscollapse_tokenword_hypotheses)k2)loggingc                   @   sV  e Zd ZdZedde ddedfdeded	e	e
 d
ededededejfddZdejfddZd&ddZ			d'dejdejdededededeeej eej f f fddZ			d'dejdejdededededeeej eej f f fd d!Z			d'dejdejd"ejd#ejdedededeeej eej f fd$d%ZdS )(BaseDecoderaP  Base graph decoder with topology for decoding graph.
    Typically uses the same parameters as for the corresponding loss function.

    Can do decoding and forced alignment.

    cfg takes precedence over all optional parameters
    We keep explicit parameter setting to be able to create an instance without the need of a config.
    NFdefaultTcpunum_classesblankcfgintersect_prunedintersect_conf	topo_typetopo_with_self_loopsdevicec	           	      C   s   |d ur| d|}| d|}| d|}| d|}|| _|| _|| _|| _|| _|| _| jdk| _|| _d | _	d | _
d | _d S )Nr   r   r   r    ctc_compact)getr   r   r   r!   r   r    
pad_fsavecr   graph_compiler
base_graphdecoding_graph)	selfr   r   r   r   r   r   r    r!    r)   `/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/asr/parts/k2/graph_decoders.py__init__3   s    
zBaseDecoder.__init__c                 C   s`   | j j|kr| j | | jj|kr| j|| _| jd ur+| jj|kr+| j|| _|| _d S N)r%   r!   tor&   r'   )r(   r!   r)   r)   r*   r-   R   s   
zBaseDecoder.tographk2.Fsac                 C   s   t r,   NotImplementedError)r(   r.   r)   r)   r*   update_graph[   s   zBaseDecoder.update_graph	log_probssupervisionsreturn_latticesreturn_ilabelsoutput_alignedreturnc                 C   sD  | j d u r	| j| _ |j| jkr| |j | ||}| jr3tj| j || jj	| jj
| jj| jjd}n'tj| tj| jd}| j jd dkrNt| j |n| j }	t|	|| jj
}d | _ |d d df }
|rt|t|
j|jd}| jdkrt|jdk| j|jd |_|S tt|dt|
j|jd}| |||S )N)a_fsasb_fsassearch_beamoutput_beammin_active_statesmax_active_states)dtyper!   r      r!   T)r'   r&   r!   r-   _prepare_emissions_graphsr   r   intersect_dense_prunedr   r;   r<   r=   r>   torchzerosdim0int32shape	index_fsaintersect_denser   r   wherelabelsshortest_path!_extract_labels_and_probabilities)r(   r3   r4   r5   r6   r7   emissions_graphslatsindices
dec_graphsordershortest_path_fsasr)   r)   r*   _decode_impl^   s@   
	

zBaseDecoder._decode_impllog_probs_lengthc                 C   s,   |  ||d d \}}}}| j|||||dS )Nr5   r6   r7   )_prepare_log_probs_and_targetsrU   )r(   r3   rV   r5   r6   r7   r4   _r)   r)   r*   decode   s   zBaseDecoder.decodetargetstarget_lengthsc           
      C   s^   |  ||||\}}}}|d d df jtjd}	| j||	 ||	 | _| j|||||dS )Nr   r?   rW   )rX   r-   rD   longr%   compiler'   rU   )
r(   r3   rV   r[   r\   r5   r6   r7   r4   rS   r)   r)   r*   align   s   
zBaseDecoder.alignr.   r/   FFT)__name__
__module____qualname____doc__r   r   rD   r!   intr   r
   boolstrr+   r-   r2   Tensorr   r   r   rU   rZ   r`   r)   r)   r)   r*   r   )   s    		
	
5
	r   c                       s^   e Zd ZdZdde ddedfdeded	ee	 d
e
dedede
dejf fddZ  ZS )
CtcDecodera@  Regular CTC graph decoder with custom topologies.
    Available topologies:
        -   `default`, with or without self-loops
        -   `compact`, with or without self-loops
        -   `shared_blank`, with or without self-loops
        -   `minimal`, without self-loops

    Can do decoding and forced alignment.
    NFr   Tr   r   r   r   r   r   r   r    r!   c	           
   
      sd   t  |||||||| ddlm}	 |	| j| j| j| j| j| _	t
| j	j g| j| _d S )Nr   )CtcTopologyCompiler)superr+   -nemo.collections.asr.parts.k2.graph_compilersrl   r   r   r   r    r!   r%   r   create_fsa_vecctc_topo_invinvertr-   r&   )
r(   r   r   r   r   r   r   r    r!   rl   	__class__r)   r*   r+      s   "zCtcDecoder.__init__)rc   rd   re   rf   r   rD   r!   rg   r   r
   rh   ri   r+   __classcell__r)   r)   rr   r*   rk      s2    	rk   c                       s  e Zd ZdZdde ddddedfd	ed
edee	 de
dedede
dededejf fddZ			d"dejdejde
de
de
dedeeej eej f f fddZ			d"dejdejdejdejde
de
de
deeej eej f f fd d!Z  ZS )#RnntAligneraO  RNNT graph decoder with the `minimal` topology.
    If predictor_window_size is not provided, this decoder works as a Viterbi over regular RNNT lattice.
    With predictor_window_size provided, it applies uniform pruning when compiling Emission FSAs
    to reduce memory and compute consumption.

    Can only do forced alignment.
    NFr   Tr   r@   r   r   r   r   r   r   r   r    predictor_window_sizepredictor_step_sizer!   c              
      s   |d ur| d|}| d|}| d|	}	|dkrtdt ||||||||
 || _|	| _ddlm} || j| j	| j
| j| j| jd| _| jj| _d S )	Nr   rv   rw   minimalz4Only topo_type=`minimal` is supported at the moment.r   )RnntTopologyCompiler)max_adapter_length)r#   r1   rm   r+   rv   rw   rn   ry   r   r   r   r    r!   r%   r&   )r(   r   r   r   r   r   r   r    rv   rw   r!   ry   rr   r)   r*   r+      s*   zRnntAligner.__init__r3   rV   r5   r6   r7   r8   r/   c                 C   s   t d)NzGRNNT decoding is not implemented. Only .align(...) method is supported.r0   )r(   r3   rV   r5   r6   r7   r)   r)   r*   rZ     s   zRnntAligner.decoder[   r\   c              	      s<   | j dks|d| j d ksJ t j|||||||dS )Nr      r@   rW   )rv   sizerm   r`   )r(   r3   rV   r[   r\   r5   r6   r7   rr   r)   r*   r`     s   "
zRnntAligner.alignrb   )rc   rd   re   rf   r   rD   r!   rg   r   r
   rh   ri   r+   rj   r   r   r   rZ   r`   rt   r)   r)   rr   r*   ru      s    	
(
	ru   c                       sz   e Zd ZdZddde ddedfdeded	ee	 d
ee
def  dededededejf fddZdddZ  ZS )TokenLMDecodera  Graph decoder with token_lm-based decoding graph.
    Available topologies:
        -   `default`, with or without self-loops
        -   `compact`, with or without self-loops
        -   `shared_blank`, with or without self-loops
        -   `minimal`, without self-loops

    Can do decoding and forced alignment.

    cfg takes precedence over all optional parameters
    We keep explicit parameter setting to be able to create an instance without the need of a config.
    NFr   Tr   r   r   r   token_lmr/   r   r   r   r    r!   c
           
   
      s   t  ||||||||	 |d ur|d|}|d ur;t|tr$t|n|| _| jd ur4| | j d S t	d d S t	d d | _d S )Nr~   ztoken_lm was set to None. Use this for debug 
                                purposes only or call .update_graph(token_lm) before using.ztoken_lm was set to None. Use this for debug
                            purposes only or call .update_graph(token_lm) before using.)
rm   r+   r#   
isinstanceri   r   r~   r2   r   warning)
r(   r   r   r   r~   r   r   r   r    r!   rr   r)   r*   r+   5  s    

zTokenLMDecoder.__init__r.   c                 C   s   || _ | j  }t|drt|d |j}| | jd kr+td|  d| j t	| j| j
| j| j| j|| _t| jjg| j| _d S )N
aux_labelsr@   z1token_lm is not compatible with the num_classes: z, )r~   clonehasattrdelattrrL   maxr   
ValueErroruniqueCtcNumGraphCompilerr   r   r    r!   r%   r   ro   r&   r-   )r(   r.   r~   rL   r)   r)   r*   r2   V  s   


zTokenLMDecoder.update_graphra   )rc   rd   re   rf   r   rD   r!   rg   r   r
   r   ri   rh   r+   r2   rt   r)   r)   rr   r*   r}   '  s:    	
!r}   c                       s  e Zd ZdZ							d9d	ed
eef dededee	 de
dede
def fddZd:dee	 fddZdefddZeddd ZdefddZdefddZed d! Zejdefd"d!Zdefd#d$Zdefd%d&Zedd'd(d)ejd*d
fd+d,Zedd-ejd.ejd*eee ed
 f fd/d0Zedd1d
d*eee ed
 f fd2d3Zedd-ejd.ejd4ee d*eeef fd5d6Zedd-ejd.ejd4ee d*eeee f fd7d8Z   Z!S );K2WfstDecodera  
    Used for performing WFST decoding of the logprobs with the k2 WFST decoder.

    Args:
      lm_fst:
        Kaldi-type language model WFST or its path.

      decoding_mode:
        Decoding mode. Choices: `nbest`, `lattice`.

      beam_size:
        Beam width (float) for the WFST decoding.

      config:
        Riva Decoder config.

      tokenword_disambig_id:
        Tokenword disambiguation index. Set to -1 to disable the tokenword mode.

      lm_weight:
        Language model weight in decoding.

      nbest_size:
        N-best size for decoding_mode == `nbest`

      device:
        Device for running decoding. Choices: `cuda`, `cpu`.
    nbest      $@N      ?r@   cudalm_fstr/   decoding_mode	beam_sizeconfigtokenword_disambig_id	lm_weight
nbest_sizer!   c	           	         s&   || _ || _t |||||| d S r,   )_nbest_size_devicerm   r+   )	r(   r   r   r   r   r   r   r   r!   rr   r)   r*   r+     s   zK2WfstDecoder.__init__c                 C   s,   |d u rt  }d|_| j|_d|_|| _d S )Ng      4@i'  )r   r;   
_beam_sizer<   r>   _config)r(   r   r)   r)   r*   _set_decoder_config  s   
z!K2WfstDecoder._set_decoder_configc                 C   s    |dvrt d| || _d S )N)r   latticezUnsupported mode: )r   _decoding_mode)r(   r   r)   r)   r*   _set_decoding_mode  s   
z K2WfstDecoder._set_decoding_modeFc                    s<  t | jttfrt| jn| j }|j |_|j| j	d| _| j
d u r]dd | jj  dD | _
| j
tt| j
 }|d tfdd| _| D ]	\}}|| j|< qS| jd u rdd | jj  dD | _| jtt| j }|d  t fd	d| _| D ]\}}|| j|< qd S d S )
NrA   c                 S   &   i | ]}t | d  | d qS r@   r   rg   split.0liner)   r)   r*   
<dictcomp>  s    z/K2WfstDecoder._init_decoder.<locals>.<dictcomp>
z<unk>c                          S r,   r)   r)   )word_unk_idr)   r*   <lambda>      z-K2WfstDecoder._init_decoder.<locals>.<lambda>c                 S   r   r   r   r   r)   r)   r*   r     s    c                      r   r,   r)   r)   )token_unk_idr)   r*   r     r   )r   _lm_fstr   ri   r   r   scores	lm_scoresr-   r   _id2wordaux_labels_symto_strstripr   rs   mapreverseditemsr   _word2id	_id2token
labels_sym	_token2id)r(   r   word2idkvtoken2idr)   )r   r   r*   _init_decoder  s.   $

zK2WfstDecoder._init_decodervaluec                 C   s    | j |kr|| j_|| _ d S d S r,   )r   r   r<   r(   r   r)   r)   r*   _beam_size_setter  s   

zK2WfstDecoder._beam_size_setterc                 C      | j |kr
|| _ d S d S r,   )
_lm_weightr   r)   r)   r*   _lm_weight_setter     

zK2WfstDecoder._lm_weight_setterc                 C   s   | j S r,   r   r(   r)   r)   r*   r     s   zK2WfstDecoder.nbest_sizec                 C   s   |  | d S r,   )_nbest_size_setterr   r)   r)   r*   r     s   c                 C   r   r,   r   r   r)   r)   r*   r     r   z K2WfstDecoder._nbest_size_setterc                 C   s   | j |kr| | d S d S r,   )r   r   r   r)   r)   r*   _decoding_mode_setter  s   
z#K2WfstDecoder._decoding_mode_setteremissions_fsaszk2.DenseFsaVecrS   r8   c              
   C   s   t j| j|| jj| jj| jj| jjddd}t t 	|}|j
|j |_| jdkr4|j| j|j  |_
d|jd< t |t|j| jdS )a1  
        Decodes logprobs into k2-type lattices.

        Args:
          emissions_fsas:
            A k2.DenseFsaVec of the predicted log-probabilities.
          order:
            A torch.Tensor that stores the order of the emissions_fsas elements.

        Returns:
          k2-type FsaVec.
        	frame_idxT)r9   r:   r;   r<   r=   r>   frame_idx_nameallow_partialr   N_propertiesrA   )r   rC   r   r   r;   r<   r=   r>   connectexpand_ragged_attributesr   r   	am_scoresr   __dict__rI   r   r-   r   )r(   r   rS   rP   r)   r)   r*   _decode_lattice  s    


zK2WfstDecoder._decode_latticer3   rV   c                 C   sH   t |}|dddf }t|j| jd|}| ||}| |}|S )a  
        Decodes logprobs into recognition hypotheses.

        Args:
          log_probs:
            A torch.Tensor of the predicted log-probabilities of shape [Batch, Time, Vocabulary].

          log_probs_length:
            A torch.Tensor of length `Batch` which contains the lengths of the log_probs elements.

        Returns:
          List of recognition hypotheses.
        Nr   rA   )r   r   DenseFsaVecr-   r   r   _post_decode)r(   r3   rV   r4   rS   r   rP   
hypothesesr)   r)   r*   rZ     s   
zK2WfstDecoder.decoder   c                    s  j dkr6 }g  jdkrt|d}|dd }t|jd D ]a}|| }|jdk}fdd|j|  D }|j	|j	dk  }	|j
| }
|
dd	 }|
dd }|dk}|| ||< ||
dd< |
 }
 tttt|t|
t|	||  gg q$ntj|j}t|j
|jj|j_
|jdd }d
d t|jjD }t|jdD ]a\}}|j| }|jdk}fdd|j|  D }|j	|j	dk  }	|j
| }
|
dd	 }|
dd }|dk}|| ||< ||
dd< |
 }
|| tt|t|
t|	||  g q|D ]} tt| qjr4t jj S  S  fddtt D S )z
        Does various post-processing of the recognition hypotheses.

        Args:
          hypotheses:
            FsaVec of k2-type lattices.

        Returns:
          List of processed recognition hypotheses.
        r   r@   TFr   c                       g | ]} j | qS r)   r   r   lr   r)   r*   
<listcomp>      z.K2WfstDecoder._post_decode.<locals>.<listcomp>Nr   c                 S   s   g | ]}g qS r)   r)   )r   rY   r)   r)   r*   r   6  s    c                    r   r)   r   r   r   r)   r*   r   :  r   c                    s   g | ]
} | j d dqS )r   rA   )r-   )r   i)r   r)   r*   r   O      )r   r   r   rM   get_tot_scorestolistrangerH   r   rL   r   appendr   tupleNbestfrom_latticeindex_select	kept_pathvaluesfsarF   	enumeraterow_ids_open_vocabulary_decodingr   r   _tokenword_disambig_idlen)r(   r   hypotheses_fsarT   r   r   r   non_eps_maskwords	alignment	timestepstimesteps_lefttimesteps_righttimesteps_right_zero_mask
nbest_fsasnbest_hypothesis_listjnbest_hypothesisr)   )r   r(   r*   r   	  sl   


 


 zK2WfstDecoder._post_decodereference_textsc                 C   s   t |t |ks
J | j}| j}| j}d| _| ||}dtd}}	d| _d| _tddD ],}
|
d }|D ]}|j||j  |_	q6| 
|}tdd	 |D |}||	k rZ||}}	q.|| _|| _|| _||	fS )
a  
        Calibrates LM weight to achieve the best WER for given logprob-text pairs.

        Args:
          log_probs:
            A torch.Tensor of the predicted log-probabilities of shape [Batch, Time, Vocabulary].

          log_probs_length:
            A torch.Tensor of length `Batch` which contains the lengths of the log_probs elements.

          reference_texts:
            List of reference word sequences.

        Returns:
          Pair of (best_lm_weight, best_wer).
        r         infr   r@      
   c                 S   s   g | ]
}d  |d jqS ) r   )joinr   )r   hr)   r)   r*   r   s  r   z5K2WfstDecoder.calibrate_lm_weight.<locals>.<listcomp>)r   r   r   r   rZ   floatr   r   r   r   r   word_error_rate)r(   r3   rV   r   decoding_mode_backuplm_weight_backupnbest_size_backuplatticesbest_lm_weightbest_werr   lm_weight_actlatr   r	   r)   r)   r*   calibrate_lm_weightQ  s,   

z!K2WfstDecoder.calibrate_lm_weightc              
      s   j rtt|t|ksJ  fdd|D }tdd |D } j}d _ ||}t j	 d }||j
|j
dk< | }t|d t| }	tt|}
tj|
dd	\}
}|
j }|||d
k< ||
_d|
jd< t|
}
tj|	|
dd}t|}z	tj|dd}W n ty } ztd dg fW  Y d}~S d}~ww |ddjtjd }|| }| _| |   |  fS )a  
        Calculates the oracle (the best possible WER for given logprob-text pairs.

        Args:
          log_probs:
            A torch.Tensor of the predicted log-probabilities of shape [Batch, Time, Vocabulary].

          log_probs_length:
            A torch.Tensor of length `Batch` which contains the lengths of the log_probs elements.

          reference_texts:
            List of reference word sequences.

        Returns:
          Pair of (oracle_wer, oracle_wer_per_utterance).
        c                    s"   g | ]} fd d|  D qS )c                    r   r)   )r   )r   wr   r)   r*   r     r   zAK2WfstDecoder.calculate_oracle_wer.<locals>.<listcomp>.<listcomp>)r   )r   textr   r)   r*   r     s   " z6K2WfstDecoder.calculate_oracle_wer.<locals>.<listcomp>c                 S   s   g | ]}t |qS r)   )r   )r   widr)   r)   r*   r     s    r   r@   r   r   T)ret_arc_mapr   Nr   F)treat_epsilons_specially)use_double_scoreszcalculate_oracle_wer failedr   r]   )!r   r1   r   rD   tensorr   rZ   r   r   keysr   rq   r   r   r   
linear_fsaadd_epsilon_self_loopsrL   r   r   arc_sortcomposeremove_epsilon_self_loopsrM   RuntimeErrorr   r   r   r-   int64sumitemr   )r(   r3   rV   r   word_idscountsr  r  oracle_disambighypsrefsarc_maprL   ali_latsr   er   wer_per_uttr)   r   r*   calculate_oracle_wer{  sB   





z"K2WfstDecoder.calculate_oracle_wer)r   r   Nr   r   r@   r   r,   )"rc   rd   re   rf   r   r   ri   r  r   r   rg   r+   r   r   rD   inference_moder   r   r   propertyr   setterr   r   rj   r   r   r   rZ   r   r   r  r(  rt   r)   r)   rr   r*   r   d  s     	

$G
)r   ))abcr   collectionsr   pathlibr   typingr   r   r   r   rD   jiwerr	   r  	omegaconfr
   %nemo.collections.asr.parts.k2.classesr   )nemo.collections.asr.parts.k2.loss_mixinsr   r   #nemo.collections.asr.parts.k2.utilsr   r   r   r   2nemo.collections.asr.parts.submodules.wfst_decoderr   r   r   nemo.core.utils.k2_guardr   
nemo.utilsr   objectr   rk   ru   r}   r   r)   r)   r)   r*   <module>   s&    !N=