o
    }oi?                     @   sn   d dl mZ d dlmZ d dlZd dlm  mZ d dl	m
Z
mZ d dlmZ d dlmZ G dd de
ZdS )	    )nullcontext)UnionN)GraphRnntLossforce_float32_context)k2)PrettyStrEnumc                	       s   e Zd ZdZG dd deZdejddddfdeded	e	ee
f f fd
dZdejdeddfddZdededejddfddZdejdededdfddZdejdejdejdejfddZ  ZS )GraphWTransducerLossa  
    W-Transducer loss: RNN-T loss modification for training RNN-T model for the case
    when some text at the beginning/end of the utterance is missing.
    The resulting model behaves like the RNN-T model (no modification for decoding is required).
    For details see "Powerful and Extensible WFST Framework for RNN-Transducer Losses" paper
        https://ieeexplore.ieee.org/document/10096679
    c                   @   s   e Zd ZdZdZdS )z"GraphWTransducerLoss.LastBlankModeallow_ignoreforce_finalN)__name__
__module____qualname__ALLOW_IGNOREFORCE_FINAL r   r   ^/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/asr/parts/k2/w_transducer.pyLastBlankMode#   s    r   g        TFblank
eps_weightlast_blank_modec                    s,   t  j|||||d || _| || _dS )a  
        Init method

        Args:
            blank: blank label index
            eps_weight: weight of epsilon transitions, 0 means no penalty (default)
            last_blank_mode: allow to skip last blank in the prediction (default) or force it
            use_grid_implementation: Whether to use the grid implementation (Grid-Transducer).
            connect_composed: Connect graph after composing unit and temporal schemas
                (only for Compose-Transducer). `connect` operation is slow, it is useful for visualization,
                but not necessary for loss computation.
            double_scores: Use calculation of loss in double precision (float64) in the lattice.
                Does not significantly affect memory usage since the lattice is ~V/2 times smaller than the joint tensor.
            cast_to_float32: Force cast joint tensor to float32 before log-softmax calculation.
        )r   use_grid_implementationconnect_composeddouble_scorescast_to_float32N)super__init__r   r   r   )selfr   r   r   r   r   r   r   	__class__r   r   r   '   s   zGraphWTransducerLoss.__init__units_tensor
vocab_sizereturnzk2.Fsac                 C   sX  | j }|}|d }|j}|jd }tj|d d d dftj|d}tjd|d tj|d}	||d< |	|ddddf< |	|ddddf< ||ddddf< |	|ddddf< |	d |ddddf< ||ddddf< |d |d< ||d	< ||d
< d|d< |dddf   }
t	
||
}t|
|_|	dddd |jdd< d|jd< |S )ak  
        Get unit schema (target text) graph for W-Transducer loss (Compose-Transducer).
        Forward arcs represent text labels.

        Example graph: text [1, 2], blank=0. Eps ids: 3, 4.

        graph::

                3:3:0                  0:0:1                  0:0:2
              +-------+              +-------+              +-------+
              v       |              v       |              v       |
            +-----------+  1:1:0   +-----------+  2:2:1   +-----------+  -1:-1:-1  #===#
            |     0     | -------> |     1     | -------> |     2     | ---------> H 3 H
            +-----------+          +-----------+          +-----------+            #===#
              ^ 0:0:0 |                                     ^ 4:4:2 |
              +-------+                                     +-------+

        Args:
            units_tensor: 1d tensor with text units
            vocab_size: number of total labels (vocab size including blank)

        Returns:
            unit schema graph (k2.Fsa).
            Labels: <unit>:<unit>:<unit_position> (k2.Fsa: labels, aux_labels, unit_positions)
           r         dtypedevice)r   r#   )r)   r"   )r)   r#   )r(   r#   N)r   r'   shapetorchzerosint32arangedetachcloner   Fsa
zeros_likeunit_positionsexpand	transposeflatten)r   r   r    blank_idstart_eps_id
end_eps_idr'   text_lenarcstext_indicesolabelsfsa_textr   r   r   get_unit_schemaJ   s0   
""
z$GraphWTransducerLoss.get_unit_schema
num_framesr'   c                 C   s   | j }|}|d }d}|| |d |  d }tj|dftj|d}	tjd|tj|d}
|
d }|
|| |dd }|dd |	d|df< |dd |	d|df< tjd|| tj|d|||  dd |	d|df< ||	|||| df< d|	|||| df< |
d |	|||| df< |
dd |	|||| df< | j| j	j
kr|d n||	|||| df< tj||d dftj|d|	dddf< tj|	dddf dd	\}}|	| }	|	dddf   }d|d< t|	|}t|}|S )
a  
        Get temporal schema graph for W-Transducer loss (Compose-Transducer).

        Example graph: blank=0, num_frames=3, vocab_size=3, last_blank_mode="force_final".
        Labels: <unit>:<frame_index>. <unit> is a unit from vocab + special eps ids `vocab_size`, `vocab_size+1`.

        graph for force_final::

                                                         4:0
                       +--------------------------------------------+
                       |                               4:1          |
                       |                     +--------------------+ |
                1:0    |              1:1    |              1:2   | |
              +-----+  |            +-----+  |            +-----+ | |
              v     |  |            v     |  |            v     | v v
            +--------------+  0:0  +------------+  0:1   +------------+  0:2   +---+  -1:-1   #===#
            |    0         | ----> |    1       | -----> |    2       | -----> | 3 | -------> H 4 H
            +--------------+       +------------+        +------------+        +---+          #===#
              ^ 2:0 |  |  |         ^ 2:1 |  ^            ^ 2:2 |  ^
              +-----+  |  |         +-----+  |            +-----+  |
                       |  |     3:0          |                     |
                       |  +------------------+     3:0             |
                       +-------------------------------------------+


        Args:
            num_frames: length of the sequence (in frames)
            vocab_size: number of labels (including blank)
            device: device for tensor to construct

        Returns:
            temporal schema graph (k2.Fsa).
            Labels: <unit>:<frame_index>. <unit> is a unit from vocab + special units (e.g., additional eps).
        r"   r#   r$   r%   r   Nr(      dim)r   r+   r,   r-   r.   r4   r5   r6   r   r   r   tensorsortr/   r0   r   r1   arc_sort)r   r@   r    r'   r7   r8   r9   num_epsnum_sequence_arcsfsa_temporal_arcssequence_statessequence_states_nextstart_states_indicesr=   fsa_temporalr   r   r   get_temporal_schema   s<   #
(
z(GraphWTransducerLoss.get_temporal_schemac                 C   s  | j }|}|jd }|j}||d  }|d |d  }	|d d }
|	|
 }|| }tj|| d dftj|d}tj|	|d}||d  }||d|	df< ||d|	df< ||d|	df< ttj|d |d|d  |tj|d |d|d   g}||d  }|||	|	|d d  df< |||	|	|d d  df< |||	|	|d  df< |d ||	|d  |	|d d  df< d||	|	|d  df< |d ||	|d  |	|d d  df< tj|tj|d||d ddddf 	 }|d }|
|d	 }|||d	df< |||d	df< |||d	df< tj|d ||ftj|d|d	dd
f< tj||d dftj|d|ddd
f< tj|dddf |d dd}|dddf |d  }d|d< d|d< | |dd	df |d ||dd	df< | |dddf |d ||dddf< | j| jjkr|||	|d  |	|d d  df< tj|dddf dd\}}|| }|| }|| }t||}||_|S )a  
        Construct W-Transducer lattice directly (Grid-Transducer).

        Args:
            units_tensor: 1d tensor with text units
            num_frames: length of the sequence (number of frames)
            vocab_size: number of total labels (vocab size including blank)

        Returns:
            transducer lattice (k2.Fsa).
            Labels: <unit>:<frame_index>:<unit_position> (k2.Fsa: labels, aux_labels, unit_positions)
        r   r"   r#   r$   r%   )r'   Nr(   r)   rA   floor)rounding_moderB   )r   r*   r'   r+   r,   r-   r.   catreshaper6   r4   rD   divrelabel_statesr   r   r   rE   r   r1   r3   )r   r   r@   r    r7   eps_idtext_lengthr'   num_grid_statesnum_forward_arcs_basenum_forward_arcs_additionalnum_forward_arcsnum_text_arcsr;   from_states	to_statesilabelsr=   r3   rM   rN   
rnnt_graphr   r   r   get_grid   st   
("(( **zGraphWTransducerLoss.get_gridactslabelsact_lens
label_lensc              	   C   s  ||||f\}}}}|j d }	| ||||	}
| jrt nt }|^ tj|dd}t  | 	|
|j }d||
j
dk< d||
j
|	k< W d   n1 sNw   Y  | d|}d||
j
dk< | j||
j
|	k< ||
_d|
j| jdd }|W  d   S 1 sw   Y  dS )z
        Forward method is similar to RNN-T Graph-Transducer forward method,
        but we need to assign eps weight to eps-transitions.
        r(   rB   r   NT)use_double_scoreslog_semiring)r*   get_graphs_batchedr   r   r   Flog_softmaxr+   no_gradget_logits_indicesre   r6   index_selectr   scoresget_tot_scoresr   )r   rd   re   rf   rg   logitstargetslogits_lengthstarget_lengthsr    target_fsas_veccast_context	log_probsrN   rp   r   r   r   forward3  s$   	

$zGraphWTransducerLoss.forward)r   r   r   __doc__r   r   r   intfloatr   strr   r+   Tensorr?   r'   rP   rc   ry   __classcell__r   r   r   r   r      s8    
#<P]r   )
contextlibr   typingr   r+   torch.nn.functionalnn
functionalrk   .nemo.collections.asr.parts.k2.graph_transducerr   r   nemo.core.utils.k2_guardr   nemo.utils.enumr   r   r   r   r   r   <module>   s   