o
    }oi^                     @   s   d dl Z d dlmZ d dlmZ d dlZd dlm  mZ	 d dl
mZ d dlmZ d dlmZ d dlmZ er=d dlmZ d	efd
dZG dd deZG dd deZdS )    N)nullcontext)ContextManager)Loss)k2)TRITON_AVAILABLE)logging)rnnt_logprobs_tritonreturnc                   C   s    t  rt jjdt jdS t S )z@Get context manager to force float32 precision in autocast mode.cuda)dtype)torchis_autocast_enabledampautocastfloat32r    r   r   b/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/asr/parts/k2/graph_transducer.pyforce_float32_context   s   r   c                
       s   e Zd ZdZ	ddef fddZejdej	de
dd	fd
dZejde
de
dejdd	fddZejdej	de
de
dd	fddZdej	de
de
dd	fddZdej	dej	dej	de
dd	f
ddZdejdej	fddZdejdejdej	fddZ  ZS ) GraphTransducerLossBasea  
    Base class for graph transducer losses.
    Implementation of the approach described in "Powerful and Extensible WFST Framework for RNN-Transducer Losses"
    https://ieeexplore.ieee.org/document/10096679

    Compose-Transducer: compose the unit (target text) and temporal schemas (graphs) into lattice.
        Subclass should implement `get_unit_schema` and `get_temporal_schema` methods.
    Grid-Transducer: construct the RNN-T lattice (grid) directly in code.
        Subclass should implement `get_grid` method.
    Fuse_grid_implementationc                    s&   t    || _|| _|| _|| _dS )a  

        Args:
            use_grid_implementation: Whether to use the grid implementation (Grid-Transducer).
            connect_composed: Connect graph after composing unit and temporal schemas (only for Compose-Transducer).
                `connect` operation is slow, it is useful for visualization, but not necessary for loss computation.
            double_scores: Use calculation of loss in double precision (float64) in the lattice.
                Does not significantly affect memory usage since the lattice is ~V/2 times smaller
                than the joint tensor.
            cast_to_float32: Force cast joint tensor to float32 before log-softmax calculation.
        N)super__init__r   connect_composeddouble_scorescast_to_float32)selfr   r   r   r   	__class__r   r   r   2   s
   

z GraphTransducerLossBase.__init__units_tensor
vocab_sizer	   k2.Fsac                 C      dS )a  
        Get unit schema (target text) graph for Compose-Transducer.

        Args:
            units_tensor: tensor with target text
            vocab_size: number of labels (including blank). Needed to construct additional eps-arcs (in some cases).

        Returns:
            unit schema graph (k2.Fsa).
            Labels: <unit>:<unit>:<unit_position> (k2.Fsa: labels, aux_labels, unit_positions)
        Nr   )r   r   r   r   r   r   get_unit_schemaF   s   z'GraphTransducerLossBase.get_unit_schema
num_framesdevicec                 C   r!   )a  
        Get temporal schema graph for Compose-Transducer.

        Args:
            num_frames: length of the sequence (in frames)
            vocab_size: number of labels (including blank)
            device: device for tensor to construct

        Returns:
            temporal schema graph (k2.Fsa).
            Labels: <unit>:<frame_index>. <unit> is a unit from vocab + special units (e.g., additional eps).
        Nr   )r   r#   r   r$   r   r   r   get_temporal_schemaU      z+GraphTransducerLossBase.get_temporal_schemac                 C   r!   )a  
        Construct the transducer lattice (grid) directly for Grid-Transducer.

        Args:
            units_tensor: tensor with target text
            num_frames: length of the sequence (in frames)
            vocab_size: number of labels (including blank)

        Returns:
            transducer lattice (k2.Fsa).
            Labels: <unit>:<frame_index>:<unit_position> (k2.Fsa: labels, aux_labels, unit_positions)
        Nr   )r   r   r#   r   r   r   r   get_gride   r&   z GraphTransducerLossBase.get_gridc                 C   s@   |  ||}| |||j}tj||dd}| jrt|}|S )a  
        Get composed lattice (unit and temporal schemas) for Compose-Transducer. Useful for visualization.
        Should be equivalent to the lattice from `get_grid` method.

        Args:
            units_tensor: tensor with target text
            num_frames: length of the sequence (in frames)
            vocab_size: vocab size (including blank)

        Returns:
            composed lattice (k2.Fsa) from unit and temporal schemas
        Ftreat_epsilons_specially)r"   r%   r$   r   composer   connect)r   r   r#   r   fsa_textfsa_temporalcomposedr   r   r   get_composed_latticeu   s   
z,GraphTransducerLossBase.get_composed_latticesource_lengthstargetstarget_lengthsc           	         s   j d }t g  jr.  t fddt|D W  d   S  fddt|D } fddt|D }tjt|t|dd} j	rht
|}W d   |S W d   |S 1 ssw   Y  |S )	a{  
        Get batched lattice (grid or composed) for the batch of sequences.

        Args:
            source_lengths: tensor with lengths of logits
            targets: tensor with target units
            target_lengths: tensor with lengths of targets
            vocab_size: vocab size (including blank)

        Returns:
            batched lattice - FsaVec (k2.Fsa)
        r   c                    s0   g | ]} j |d | f | dqS )N)r   r#   r   )r'   .0i)r   source_lengths_listtarget_lengths_listr1   r   r   r   
<listcomp>   s    z>GraphTransducerLossBase.get_graphs_batched.<locals>.<listcomp>Nc                    s.   g | ]} j |d |  f dqS )N)r   r   )r"   itemr3   )r   r2   r1   r   r   r   r8      s    c                    s&   g | ]} j |  jd qS ))r#   r   r$   )r%   r9   r$   r3   )r   r0   r1   r   r   r   r8      s    Fr(   )shaper   no_gradr   tolistr   create_fsa_vecranger*   r   r+   )	r   r0   r1   r2   r   
batch_size	text_fsastemporal_fsastarget_fsas_vecr   )r   r0   r6   r2   r7   r1   r   r   get_graphs_batched   s:   


"
""z*GraphTransducerLossBase.get_graphs_batchedrB   c                    sJ    j d } j}ttj||tjdtj fddt|D |d}|S )z
        Get batch indices (for logits) for each arc in the lattices.

        Args:
            target_fsas_vec: batch of target FSAs with lattices

        Returns:
            1d tensor with indices
        r   )r$   r   c                    s(   g | ]} j d |d   jd  qS )r   )arcsindexvaluesr:   r3   rB   r   r   r8      s   ( z=GraphTransducerLossBase.get_batch_indices.<locals>.<listcomp>r$   )r:   r$   r   repeat_interleavearangeint64tensorr>   )r   rB   r?   r$   scores_to_batch_ir   rG   r   get_batch_indices   s   

z)GraphTransducerLossBase.get_batch_indiceslogits_shapec                 C   sn   | j |d}||d  |d  |d  |jtj|d  |d   |jtj|d   |jtj }|S )a  
        Get indices of flatten logits for each arc in the lattices.

        Args:
            target_fsas_vec: batch of target FSAs with lattices
            logits_shape: shape of the logits tensor

        Returns:
            1d tensor with indices
        rG            )rN   
aux_labelstor   rK   unit_positionslabels)r   rB   rO   rM   indicesr   r   r   get_logits_indices   s   z*GraphTransducerLossBase.get_logits_indices)FFF)__name__
__module____qualname____doc__boolr   abcabstractmethodr   Tensorintr"   r$   r%   r'   r/   rC   r   FsarN   SizerX   __classcell__r   r   r   r   r   &   s4    
4$r   c                       s  e Zd ZdZ						d$def fddZdejded	d
fddZdededej	d	d
fddZ
edejdeded	ejfddZdejdeded	d
fddZ	d%dejdejdejdejd	d
f
ddZdejdejd ejd!ejd	ejeejd
f B f
d"d#Z  ZS )&GraphRnntLossz
    RNN-T loss implementation based on WFST according
    to "Powerful and Extensible WFST Framework for RNN-Transducer Losses"
    https://ieeexplore.ieee.org/document/10096679
    TFblankc                    sB   t  j||||d || _|| _|ot| _| jstd dS dS )a  
        Init method

        Args:
            blank: blank label index
            use_grid_implementation: Whether to use the grid implementation (Grid-Transducer).
            connect_composed: Connect graph after composing unit and temporal schemas (only for Compose-Transducer).
                `connect` operation is slow, it is useful for visualization, but not necessary for loss computation.
            double_scores: Use calculation of loss in double precision (float64) in the lattice.
                Does not significantly affect memory usage since the lattice is ~V/2 times smaller
                than the joint tensor.
            cast_to_float32: Force cast joint tensor to float32 before log-softmax calculation.
            return_graph: Return graph (along with loss) from `forward` function
            use_triton: use optimized log probs calculations with Triton (faster and more memory efficient)
        )r   r   r   r   z.Triton is disabled, memory usage can be largerN)r   r   rf   return_graphr   
use_tritonr   warning)r   rf   r   r   r   r   rg   rh   r   r   r   r      s   
zGraphRnntLoss.__init__r   r   r	   r    c           
      C   s  | j }|j}|jd }tj|d d dftj|d}tjd|d tj|d}||ddddf< ||ddddf< ||ddddf< ||ddddf< |d |ddddf< ||ddddf< d|d< |dddf   }t	
||}	|dddd |	_d|	jd< |	S )	a  
        Get unit schema (target text) graph for RNN-T loss (Compose-Transducer).
        Forward arcs represent text labels.

        Example graph: text [1, 2], blank=0.

        graph::

                0:0:0                  0:0:1                  0:0:2
              +-------+              +-------+              +-------+
              v       |              v       |              v       |
            +-----------+  1:1:0   +-----------+  2:2:1   +-----------+  -1:-1:-1  #===#
            |     0     | -------> |     1     | -------> |     2     | ---------> H 3 H
            +-----------+          +-----------+          +-----------+            #===#

        Args:
            units_tensor: 1d tensor with text units
            vocab_size: number of total labels (vocab size including blank)

        Returns:
            unit schema graph (k2.Fsa).
            Labels: <unit>:<unit>:<unit_position> (k2.Fsa: labels, aux_labels, unit_positions)
        r   rP   rQ      r   r$   N)rl   rQ   )rf   r$   r:   r   zerosint32rJ   detachcloner   rb   expand	transposeflattenrU   )
r   r   r   blank_idr$   text_lenrD   text_indicesolabelsr,   r   r   r   r"     s"   

zGraphRnntLoss.get_unit_schemar#   r$   c           
      C   s  | j }tj|| d dftj|d}tjd|tj|d}|||dd }||dddf< ||dddf< tjd|tj|d|| |dddf< |d ||d|df< tj||d dftj|d|dddf< |dddf 	 
 }d|d< t||}	t|	}	|	S )	a  
        Get temporal schema graph for RNN-T loss (Compose-Transducer).
        Forward arc - blank, self-loops - all labels excluding blank

        Example graph: blank=0, num_frames=3, vocab_size=3.
        Labels: <unit>:<frame_index>. <unit> is a unit from vocab.

        graph::

                1:0                1:1                1:2
              +-----+            +-----+            +-----+
              v     |            v     |            v     |
            +---------+  0:0   +---------+  0:1   +---------+  0:2   +---+  -1:-1   #===#
            |    0    | -----> |    1    | -----> |    2    | -----> | 3 | -------> H 4 H
            +---------+        +---------+        +---------+        +---+          #===#
              ^ 2:0 |            ^ 2:1 |            ^ 2:2 |
              +-----+            +-----+            +-----+

        Args:
            num_frames: length of the sequence (in frames)
            vocab_size: number of labels (including blank)
            device: device for tensor to construct

        Returns:
            temporal schema graph (k2.Fsa).
            Labels: <unit>:<frame_index>. <unit> is a unit from vocab.
        rP   rj   rk   r   Nrl   rQ   rR   )rf   r   rm   rn   rJ   rq   rr   rs   rL   ro   rp   r   rb   arc_sort)
r   r#   r   r$   rt   fsa_temporal_arcssequence_statesstart_statesrw   r-   r   r   r   r%   G  s   (
z!GraphRnntLoss.get_temporal_schemastatesnmc                 C   s  | | }t j| |dd}t||}t||}|| }|| d | }|| d }	||kr.|n|| d }
||||d  d? |  t ||||||d  d? || |  |
   |||	||d  d?  | |   }t j| || k| ||d |S )a  
        Relabel states to be in topological order: by diagonals

        Args:
            states: tensor with states
            n: number of rows
            m: number of columns

        Returns:
            tensor with relabeled states (same shape as `states`)
        floorrounding_moderP   )out)r   divminmaxltlogical_andgewhere)r|   r}   r~   r5   jmin_mnmax_mndiag	anti_diagmax_idxcur_diag_idx
new_statesr   r   r   relabel_states~  s$   

$zGraphRnntLoss.relabel_statesc                 C   sT  | j }|jd }|j}||d  }|d |d  }|| }	tj||	 d dftj|d}
tj||d}||d  }||
d|df< ||
d|df< ||
d|df< tj|tj|d||d ddddf  }|d }|	|d }||
|d	df< ||
|d	df< ||
|d	df< tj
|d ||ftj|d|
d	dd
f< tj
||d dftj|d|
ddd
f< tj|
dddf |d dd}|
dddf |d  }d|d< d|d< | |
dd	df |d ||
dd	df< | |
dddf |d ||
dddf< tj|
dddf dd\}}|
| }|| }|| }t||}||_|S )a  
        Construct the RNN-T lattice directly (Grid-Transducer).

        Args:
            units_tensor: 1d tensor with text units
            num_frames: length of the sequence (number of frames)
            vocab_size: number of total labels (vocab size including blank)

        Returns:
            transducer lattice (k2.Fsa).
            Labels: <unit>:<frame_index>:<unit_position> (k2.Fsa: labels, aux_labels, unit_positions)
        r   rP   rQ   rj   rk   rH   Nrl   rR   r   r   dim)rf   r:   r$   r   rm   rn   rJ   reshapers   rq   rL   r   r   sortr   rb   rU   )r   r   r#   r   rt   text_lengthr$   num_grid_statesnum_forward_arcsnum_text_arcsrD   from_states	to_statesilabelsrw   rU   _rW   sorted_arcs
rnnt_graphr   r   r   r'     sJ   
(( **zGraphRnntLoss.get_gridlogitsr1   r0   r2   c              	   C   s  |j d }| ||||}t 4 |jdk}| j|d}	|j tj	}
|j
 tj	}|j tj	}||d W d   n1 sHw   Y  | jrSt nt }|h | jr|jjdkrt||| j||d\}}|| jk}t|||	|
|f ||	|
|f tj}d||< ntj|dd}||	|
||f tj}d||< |r|j| |_n||_W d   |S W d   |S 1 sw   Y  |S )	a  
        Get batch of graphs (FsaVec) for RNN-T loss calculation.

        Args:
            logits: activations (joint tensor). NB: raw logits, not after log-softmax
            targets: target labels
            source_lengths: lengths of source sequences
            target_lengths: length of target sequences
            use_graph_weight: uses weight from graphs (if `get_graphs_batched` returns graphs with weights)

        Returns:
            FsaVec containing RNN-T graphs for all utterances.
        rl   rG   r   Nr
   )r   r1   rt   r0   r2   g        r   )r:   rC   r   r;   rV   rN   rS   rp   rT   rK   rU   masked_fill_r   r   r   rh   r$   typer   rf   r   r   Flog_softmaxscores)r   r   r1   r0   r2   use_graph_weightr   rB   last_transition_maskbatch_indicestime_indicesunit_indices
text_unitscast_contextunit_scoresblank_scorestext_units_blank_maskr   	log_probsr   r   r   get_weighted_graphs  sT   








z!GraphRnntLoss.get_weighted_graphsactsrV   act_lens
label_lensc                 C   s:   | j ||||dd}d|j| jdd }| jr||fS |S )aS  
        Compute forward method for RNN-T.

        Args:
            acts: activations (joint tensor). NB: raw logits, not after log-softmax
            labels: target labels
            act_lens: lengths of activations
            label_lens: length of labels sequences

        Returns:
            batch of RNN-T scores (loss)
        F)r   r1   r0   r2   r   rl   T)use_double_scoreslog_semiring)r   get_tot_scoresr   rg   )r   r   rV   r   r   rB   r   r   r   r   forward  s   
zGraphRnntLoss.forward)TFFFFT)F)rY   rZ   r[   r\   ra   r   r   r`   r"   r$   r%   staticmethodr   r'   r   tupler   rd   r   r   r   r   re      sN    	%37 I
?re   )r^   
contextlibr   typingr   r   torch.nn.functionalnn
functionalr   nemo.core.classes.lossr   nemo.core.utils.k2_guardr   nemo.core.utils.optional_libsr   
nemo.utilsr   2nemo.collections.asr.parts.k2.rnnt_logprobs_tritonr   r   r   re   r   r   r   r   <module>   s    C