o
    }oi                    @   sp  d dl mZmZmZmZmZmZ d dlZd dlm	Z	 d dl
mZ d dlmZ d dlmZmZ d dlmZ d dlmZmZ d d	lmZ d d
lmZ d dlmZmZmZmZmZm Z m!Z!m"Z"m#Z# d dl$m%Z% G dd dej&eZ'G dd dej&eeZ(G dd dej)eeZ*G dd dej+j,eZ-G dd dej+j,Z.G dd de*Z/e(e*e/fD ]Z0e1e0du re2e0e0 qdS )    )AnyDictListOptionalTupleUnionN)
DictConfig)rnnt_abstract)stateless_net)adapter_utils
rnnt_utils)rnn)adapter_mixins	typecheck)
Exportable)AdapterModuleMixin)	AcousticEncodedRepresentationElementTypeEmbeddedTextType
LabelsTypeLengthsTypeLogprobsTypeLossType
NeuralTypeSpectrogramType)loggingc                       s6  e Zd ZdZedd Zedd ZdGddZ fd	d
Z		dHde	e
ef dededee
 f fddZe dIddZ				dJdeej deej dedee deejeej f f
ddZdd Zdejde	ee ef deejeej ejf fd d!Zdejdeej fd"d#Zd$eeej  fd%d&Zd'eej d(edeeej  fd)d*Zd'eeej  deej fd+d,Ze	dId-e ejejf e!ej B d.e ejejf e!ej B d/ejd0ee ejejf e!ej B  fd1d2Z"e	dId-e!ej d.e!ej dedB fd3d4Z#ede!ej de!ej fd5d6Z$ed'e!ej de!e!ej  fd7d8Z%e	dKd'e!e!ej  de!ej fd9d:Z&	dId;eej d<eej d=ee d>ee' deej f
d?d@Z(dAeeej  d/ejdeeej  fdBdCZ)dDeej de	ee ef deeej eeej  f fdEdFZ*  Z+S )LStatelessTransducerDecodera  A Stateless Neural Network Transducer Decoder / Prediction Network.
    An RNN-T Decoder/Prediction stateless network that simply takes concatenation of embeddings of the history tokens as the output.

    Args:
        prednet: A dict-like object which contains the following key-value pairs.
            pred_hidden: int specifying the hidden dimension of the prediction net.

            dropout: float, set to 0.0 by default. Optional dropout applied at the end of the final LSTM RNN layer.

        vocab_size: int, specifying the vocabulary size of the embedding layer of the Prediction network,
            excluding the RNNT blank token.

        context_size: int, specifying the size of the history context used for this decoder.

        normalization_mode: Can be either None, 'layer'. By default, is set to None.
            Defines the type of normalization applied to the RNN layer.

    c                 C   s.   t dt t tdt t dt ddgdS )*Returns definitions of module input ports.BTr   Toptionaltargetstarget_lengthstates)r   r   tupler   self r*   U/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/asr/modules/rnnt.pyinput_typesK      
z&StatelessTransducerDecoder.input_typesc                 C   .   t dt t tdt t dt ddgdS )+Returns definitions of module output ports.r   Dr    r   r   Tr!   outputsprednet_lengthsr&   )r   r   r'   r   r   r(   r*   r*   r+   output_typesT   r-   z'StatelessTransducerDecoder.output_types   c                 C   n   |}t j| j||ft jdt|  j}t jd||ft jdt|  j}t	| 
| }|||fS s
        Generates input examples for tracing etc.
        Returns:
            A tuple of input examples.
        )
fill_valuesizedtyper   )r;   r<   torchfull	blank_idxint32tonext
parametersdevicerandintr'   initialize_statefloatr)   	max_batchmax_dimlengthr$   r%   r&   r*   r*   r+   input_example]      
z(StatelessTransducerDecoder.input_examplec                       d| _ t jdi | d S NTr*   _rnnt_exportsuper_prepare_for_exportr)   kwargs	__class__r*   r+   rT   m      z.StatelessTransducerDecoder._prepare_for_exportNprednet
vocab_sizecontext_sizenormalization_modec              
      sd   |d | _ || _|| _t j|| jdd |dd}| jdi ||| j | j||d| _d| _d S )	Npred_hiddenTr[   r@   blank_as_paddropout        )r\   r[   emb_dimr@   r]   ra   Fr*   )	r^   r@   r\   rS   __init__get_predict_modules
predictionrR   )r)   rZ   r[   r\   r]   ra   rW   r*   r+   rd   q   s   


z#StatelessTransducerDecoder.__init__c                 C   sD   t |}| jrd}nd}| j|||d\}}|dd}|||fS NFT)stateadd_sosr6      r   label_collaterR   predict	transpose)r)   r$   r%   r&   yrj   gri   r*   r*   r+   forward      

z"StatelessTransducerDecoder.forwardTrp   ri   rj   
batch_sizereturnc                 C   s   t |  }|j}|j}|dur#|j|kr||}| ||\}}n |du r5|du r-dn|d d}n|}tj|d| j	f||d}|rd|j
\}}	}
tj|d|
f|j|jd}tj||gdd }nd}~||fS )a  
        Stateful prediction of scores and state for a tokenset.

        Here:
        B - batch size
        U - label length
        C - context size for stateless decoder
        D - total embedding size

        Args:
            y: Optional torch tensor of shape [B, U] of dtype long which will be passed to the Embedding.
                If None, creates a zero tensor of shape [B, 1, D] which mimics output of pad-token on Embedding.

            state: An optional one-element list of one tensor. The tensor is used to store previous context labels.
                The tensor uses type long and is of shape [B, C].

            add_sos: bool flag, whether a zero vector describing a "start of signal" token should be
                prepended to the above "y" tensor. When set, output size is (B, U + 1, D).

            batch_size: An optional int, specifying the batch size of the `y` tensor.
                Can be infered if `y` and `state` is None. But if both are None, then batch_size cannot be None.

        Returns:
            A tuple  (g, state) such that -

            If add_sos is False:

                g:
                    (B, U, D)

                state:
                    [(B, C)] storing the history context including the new words in y.

            If add_sos is True:

                g:
                    (B, U + 1, D)

                state:
                    [(B, C)] storing the history context including the new words in y.

        Nr6   r   rE   r<   dim)rC   rD   rE   r<   rB   rg   r;   r>   zerosr^   shapecat
contiguous)r)   rp   ri   rj   rt   _prE   r<   r   Ur1   startr*   r*   r+   rn      s$   2

z"StatelessTransducerDecoder.predictc                 K   s   t jdi |}|S )aA  
        Prepare the trainable parameters of the Prediction Network.

        Args:
            vocab_size: Vocab size (excluding the blank token).
            pred_n_hidden: Hidden size of the RNNs.
            norm: Type of normalization to perform in RNN.
            dropout: Whether to apply dropout to RNN.
        Nr*   )r
   StatelessNet)r)   rV   netr*   r*   r+   rf      s   z+StatelessTransducerDecoder._predict_modules
hypothesiscachec                 C     |j dur|j d j}n	t|  }|j}t|jdkr'|jd | jkr'd}nd}tjddg|jd |tj	d}|dddf }t
|j}||v rP|| \}	}
n-|r^| jddddd\}	}
n| j||j ddd\}	}
|	ddddddf }	|	|
f||< |	|
|fS 	a  
        Similar to the predict() method, instead this method scores a Hypothesis during beam search.
        Hypothesis is a dataclass representing one hypothesis in a Beam Search.

        Args:
            hypothesis: Refer to rnnt_utils.Hypothesis.
            cache: Dict which contains a cache to avoid duplicate computations.

        Returns:
            Returns a tuple (y, states, lm_token) such that:
            y is a torch.Tensor of shape [1, 1, H] representing the score of the last token in the Hypothesis.
            state is a list of RNN states, each of shape [L, 1, H].
            lm_token is the final integer token of the hypothesis.
        Nr   TFr6   )r:   rE   r<   ri   rj   rt   	dec_staterE   rC   rD   len
y_sequencer@   r>   r?   longr'   rn   r)   r   r   rE   r}   blank_statetargetlm_tokensequencerp   	new_stater*   r*   r+   score_hypothesis   (   




z+StatelessTransducerDecoder.score_hypothesisc                 C   s2   | d}tj|| jd g| jtj|jdg}|S )Nr   r6   )r:   r<   rE   )r;   r>   r?   r\   r@   r   rE   r)   rp   batchri   r*   r*   r+   rG   5  s   
 z+StatelessTransducerDecoder.initialize_statedecoder_statesc                 C   s   t dd |D }|gS )a  
        Creates a stacked decoder states to be passed to prediction network.

        Args:
            decoder_states (list of list of torch.Tensor): list of decoder states
                [B, 1, C]
                    - B: Batch size.
                    - C: Dimensionality of the hidden state.

        Returns:
            batch_states (list of torch.Tensor): batch of decoder states [[B x C]]
        c                 S      g | ]}|d  qS r   r*   ).0sr*   r*   r+   
<listcomp>K      zFStatelessTransducerDecoder.batch_initialize_states.<locals>.<listcomp>r>   stack)r)   r   r   r*   r*   r+   batch_initialize_states>  s   z2StatelessTransducerDecoder.batch_initialize_statesbatch_statesidxc                 C   s&   |dur|d | }|  }|gS dS )a<  Get decoder state from batch of states, for given id.

        Args:
            batch_states (list): batch of decoder states
                [(B, C)]

            idx (int): index to extract state from batch of states

        Returns:
            (tuple): decoder states for given id
                [(C)]
        Nr   )r   )r)   r   r   r&   r*   r*   r+   batch_select_stateO  s   z-StatelessTransducerDecoder.batch_select_statec                 C   sL   g }g }t t|D ]}t|| }|| q
t|d}|| |S )zConcatenate a batch of decoder state to a packed state.

        Args:
            batch_states (list): batch of decoder states
                B x ([(C)]

        Returns:
            (tuple): decoder states
                [(B x C)]
        r   )ranger   r>   r   appendr{   )r)   r   
state_list
batch_list	sample_idtensorstate_tensorr*   r*   r+   batch_concat_statese  s   
z.StatelessTransducerDecoder.batch_concat_states
src_states
dst_statesmaskother_src_statesc                 C   s8   |dur|n|}t j|d|d |d |d d dS )a  
        Replaces states in `dst_states` with states from `src_states` based on the given `mask`.

        Args:
            mask (torch.Tensor): When True, selects values from `src_states`, otherwise `out` or `other_src_states`(if provided).
            src_states (tuple[torch.Tensor, torch.Tensor]): Values selected at indices where `mask` is True.
            dst_states (tuple[torch.Tensor, torch.Tensor], optional): The output states.
            other_src_states (tuple[torch.Tensor, torch.Tensor], optional): Values selected at indices where `mask` is False.

        Note:
            This operation is performed without CPU-GPU synchronization by using `torch.where`.
        Nr   r   out)r>   where	unsqueeze)clsr   r   r   r   otherr*   r*   r+   batch_replace_states_mask{  s   (z4StatelessTransducerDecoder.batch_replace_states_maskc                 C   sD   |du r|d  |d  dS |d d|  |d d|  dS )8Replace states in dst_states with states from src_statesNr   copy_r   r   r   rt   r*   r*   r+   batch_replace_states_all  s   &z3StatelessTransducerDecoder.batch_replace_states_allc                 C      dd |D S )Return copy of the statesc                 S   s   g | ]}|  qS r*   cloner   	sub_stater*   r*   r+   r     r   z:StatelessTransducerDecoder.clone_state.<locals>.<listcomp>r*   r   ri   r*   r*   r+   clone_state  s   z&StatelessTransducerDecoder.clone_statec                 C   r   )
        Split states into a list of states.
        Useful for splitting the final state for converting results of the decoding algorithm to Hypothesis class.
        c                 S   s   g | ]	}|j d ddqS )r6   r   rw   )splitr   r*   r*   r+   r     s    zAStatelessTransducerDecoder.batch_split_states.<locals>.<listcomp>r*   r   r   r*   r*   r+   batch_split_states  s   z-StatelessTransducerDecoder.batch_split_statesc                 C   s$   t jdd |D ddj||dgS )zj
        Concatenate a batch of decoder state to a packed state. Inverse of `batch_split_states`.
        c                 S   r   r   r*   r   ri   r*   r*   r+   r     r   zCStatelessTransducerDecoder.batch_unsplit_states.<locals>.<listcomp>r   rw   rv   r>   r   rB   r   r   rE   r<   r*   r*   r+   batch_unsplit_states  s    z/StatelessTransducerDecoder.batch_unsplit_states
old_states
new_statesidsvaluec                 C   s0   |du r|d |ddf |d |ddf< |S )a:  Copy states from new state to old state at certain indices.

        Args:
            old_states: packed decoder states
                single element list of (B x C)

            new_states: packed decoder states
                single element list of (B x C)

            ids (list): List of indices to copy states at.

            value (optional float): If a value should be copied instead of a state slice, a float should be provided

        Returns:
            batch of decoder states with partial copy at ids (or a specific value).
            (B x C)
        Nr   r*   )r)   r   r   r   r   r*   r*   r+   batch_copy_states  s   $z,StatelessTransducerDecoder.batch_copy_statesr&   c                 C   s   |du rdS |d | gS )  
        Return states by mask selection
        Args:
            states: states for the batch
            mask: boolean mask for selecting states; batch dimension should be the same as for states

        Returns:
            states filtered by mask
        Nr   r*   r)   r&   r   r*   r*   r+   mask_select_states  s   z-StatelessTransducerDecoder.mask_select_states
hypothesesc                 C   s`  t |}|dkrtdt|  }|j}g }g }dd t|D }t|D ]$\}	}
t|
j}||v r:|| ||	< q&|	|
jd  |	||
j
f q&|rt |}tj||tjd|d}| dd |D }| j||d|d	\}}d}t|D ](}	|r||	 d
u r| ||}|| |f||	< || |f||| d < |d7 }qydd |D dd |D fS )n  
        Used for batched beam search algorithms. Similar to score_hypothesis method.

        Args:
            hypothesis: List of Hypotheses. Refer to rnnt_utils.Hypothesis.
            cache: Dict which contains a cache to avoid duplicate computations.

        Returns:
            Returns a tuple (batch_dec_out, batch_dec_states) such that:
                batch_dec_out: a list of torch.Tensor [1, H] representing the prediction network outputs for the last tokens in the Hypotheses.
                batch_dec_states: a list of list of RNN states, each of shape [L, B, H]. Represented as B x List[states].
        r   )No hypotheses was provided for the batch!c                 S      g | ]}d qS Nr*   r   _r*   r*   r+   r         zEStatelessTransducerDecoder.batch_score_hypothesis.<locals>.<listcomp>r   rv   c                 S      g | ]\}}|qS r*   r*   r   r   d_stater*   r*   r+   r     r   Fr   Nr6   c                 S      g | ]\}}|qS r*   r*   r   dec_outr   r*   r*   r+   r   %  r   c                 S   r   r*   r*   r   r   
dec_statesr*   r*   r+   r   %  r   r   
ValueErrorrC   rD   rE   r   	enumerater'   r   r   r   r>   r   r   viewr   rn   r   )r)   r   r   final_batchr}   rE   tokens
to_processfinal	final_idxhypr   r   r   dec_outputsprocessed_idxr   r*   r*   r+   batch_score_hypothesis  s<   

z1StatelessTransducerDecoder.batch_score_hypothesisr6   r6   )r6   Nr   NNTNNN),__name__
__module____qualname____doc__propertyr,   r5   rM   rT   r   strr   intr   rd   r   rr   r>   Tensorboolr   r   rn   rf   r   
Hypothesisr   rG   r   r   r   classmethodr'   listr   r   r   r   r   rH   r   r   r   __classcell__r*   r*   rW   r+   r   7   s    




R
5	&" $

r   c                       s  e Zd ZdZedd Zedd ZdSddZ fd	d
Z			dTde	e
ef dedee
 dedef
 fddZe dUddZ				dVdeej deeej  dedee deejeej f f
ddZdd Zdejdeejejf fd d!Zd"ejd#e	ee ef deejeej ejf fd$d%Zd&eej d#e	ee ef deeej eeej  f fd'd(Zd)eeej  deej fd*d+Zd,eej d-edeeej  fd.d/Ze	dUd0e ejejf ded1ed2ejd3ee ejejf  de ejejf fd4d5Z!d,eeej  deej fd6d7Z"e	dUd0eejejf d3eejejf d8ejd9eeejejf  fd:d;Z#e	dUd0eejejf d3eejejf dedB fd<d=Z$ede ejejf de ejejf fd>d?Z%ed,e ejejf de&e ejejf  fd@dAZ'e	dWd,e&e ejejf  de ejejf fdBdCZ(	dUdDeej dEeej dFee dGee) deej f
dHdIZ*dJeejejf d8ejdeejejf fdKdLZ+dMe
dNe,f fdOdPZ-dNe,fdQdRZ.  Z/S )XRNNTDecoderaU  A Recurrent Neural Network Transducer Decoder / Prediction Network (RNN-T Prediction Network).
    An RNN-T Decoder/Prediction network, comprised of a stateful LSTM model.

    Args:
        prednet: A dict-like object which contains the following key-value pairs.

            pred_hidden:
                int specifying the hidden dimension of the prediction net.

            pred_rnn_layers:
                int specifying the number of rnn layers.

            Optionally, it may also contain the following:

                forget_gate_bias:
                    float, set by default to 1.0, which constructs a forget gate
                    initialized to 1.0.
                    Reference:
                    [An Empirical Exploration of Recurrent Network Architectures](http://proceedings.mlr.press/v37/jozefowicz15.pdf)

                t_max:
                    int value, set to None by default. If an int is specified, performs Chrono Initialization
                    of the LSTM network, based on the maximum number of timesteps `t_max` expected during the course
                    of training.
                    Reference:
                    [Can recurrent neural networks warp time?](https://openreview.net/forum?id=SJcKhk-Ab)

                weights_init_scale:
                    Float scale of the weights after initialization. Setting to lower than one
                    sometimes helps reduce variance between runs.

                hidden_hidden_bias_scale:
                    Float scale for the hidden-to-hidden bias scale. Set to 0.0 for
                    the default behaviour.

                dropout:
                    float, set to 0.0 by default. Optional dropout applied at the end of the final LSTM RNN layer.

        vocab_size: int, specifying the vocabulary size of the embedding layer of the Prediction network,
            excluding the RNNT blank token.

        normalization_mode: Can be either None, 'batch' or 'layer'. By default, is set to None.
            Defines the type of normalization applied to the RNN layer.

        random_state_sampling: bool, set to False by default. When set, provides normal-distribution
            sampled state tensors instead of zero tensors during training.
            Reference:
            [Recognizing long-form speech using streaming end-to-end models](https://arxiv.org/abs/1910.11455)

        blank_as_pad: bool, set to True by default. When set, will add a token to the Embedding layer of this
            prediction network, and will treat this token as a pad token. In essence, the RNNT pad token will
            be treated as a pad token, and the embedding layer will return a zero tensor for this token.

            It is set by default as it enables various batch optimizations required for batched beam search.
            Therefore, it is not recommended to disable this flag.
    c                 C   r.   )r   r   r   r1   r   r1   Tr!   r#   )r   r   r'   r   r   r(   r*   r*   r+   r,   b  r-   zRNNTDecoder.input_typesc                 C   r.   )r/   r0   r   r  Tr!   r2   )r   r   r'   r   r   r(   r*   r*   r+   r5   k  r-   zRNNTDecoder.output_typesr6   c                 C   r7   r8   r=   rI   r*   r*   r+   rM   t  rN   zRNNTDecoder.input_examplec                    rO   rP   rQ   rU   rW   r*   r+   rT     rY   zRNNTDecoder._prepare_for_exportNFTrZ   r[   r]   random_state_samplingr`   c                    s   |d | _ |d | _|| _t j|| j|d |dd}|dd }|dd}|dd	}	|d
d	}
|| _| j|| j | j|||||	|
|ddd
| _d| _	d S )Nr^   pred_rnn_layersr_   forget_gate_bias      ?t_maxweights_init_scalehidden_hidden_bias_scalerb   ra   rnn_hidden_sizer   )
r[   pred_n_hiddenr  r  r  normr  r	  ra   r
  F)
r^   r  r@   rS   rd   re   r  rf   rg   rR   )r)   rZ   r[   r]   r  r`   r  r  r  r	  ra   rW   r*   r+   rd     s.   
	


zRNNTDecoder.__init__c                 C   sD   t |}| jrd}nd}| j|||d\}}|dd}|||fS rh   rl   )r)   r$   r%   r&   rp   rj   rq   r*   r*   r+   rr     rs   zRNNTDecoder.forwardrp   ri   rj   rt   ru   c                 C   s4  t |  }|j}|j}|dur"|j|kr||}| jd |}n |du r4|du r,dn|d d}n|}tj|d| j	f||d}|rc|j
\}}	}
tj|d|
f|j|jd}tj||gdd }nd}|du rt| jrt| jrt| |}|dd}| jd ||\}}|dd}~~~|  r| |}||fS )at  
        Stateful prediction of scores and state for a (possibly null) tokenset.
        This method takes various cases into consideration :
        - No token, no state - used for priming the RNN
        - No token, state provided - used for blank token scoring
        - Given token, states - used for scores + new states

        Here:
        B - batch size
        U - label length
        H - Hidden dimension size of RNN
        L - Number of RNN layers

        Args:
            y: Optional torch tensor of shape [B, U] of dtype long which will be passed to the Embedding.
                If None, creates a zero tensor of shape [B, 1, H] which mimics output of pad-token on EmbeddiNg.

            state: An optional list of states for the RNN. Eg: For LSTM, it is the state list length is 2.
                Each state must be a tensor of shape [L, B, H].
                If None, and during training mode and `random_state_sampling` is set, will sample a
                normal distribution tensor of the above shape. Otherwise, None will be passed to the RNN.

            add_sos: bool flag, whether a zero vector describing a "start of signal" token should be
                prepended to the above "y" tensor. When set, output size is (B, U + 1, H).

            batch_size: An optional int, specifying the batch size of the `y` tensor.
                Can be infered if `y` and `state` is None. But if both are None, then batch_size cannot be None.

        Returns:
            A tuple  (g, hid) such that -

            If add_sos is False:

                g:
                    (B, U, H)

                hid:
                    (h, c) where h is the final sequence hidden state and c is the final cell state:

                        h (tensor), shape (L, B, H)

                        c (tensor), shape (L, B, H)

            If add_sos is True:
                g:
                    (B, U + 1, H)

                hid:
                    (h, c) where h is the final sequence hidden state and c is the final cell state:

                        h (tensor), shape (L, B, H)

                        c (tensor), shape (L, B, H)

        Nembedr6   r   rv   rw   dec_rnn)rC   rD   rE   r<   rB   rg   r;   r>   ry   r^   rz   r{   r|   r  trainingrG   ro   is_adapter_availableforward_enabled_adapters)r)   rp   ri   rj   rt   r}   rE   r<   r   r~   Hr   rq   hidr*   r*   r+   rn     s4   ?



zRNNTDecoder.predictc                 C   sv   | j rtjj|d || jd}ntj||}tj|tj||
dkr$|
n||||||	||||
k r2|ndd
d}|S )a  
        Prepare the trainable parameters of the Prediction Network.

        Args:
            vocab_size: Vocab size (excluding the blank token).
            pred_n_hidden: Hidden size of the RNNs.
            pred_rnn_layers: Number of RNN layers.
            forget_gate_bias: Whether to perform unit forget gate bias.
            t_max: Whether to perform Chrono LSTM init.
            norm: Type of normalization to perform in RNN.
            weights_init_scale: Float scale of the weights after initialization. Setting to lower than one
                sometimes helps reduce variance between runs.
            hidden_hidden_bias_scale: Float scale for the hidden-to-hidden bias scale. Set to 0.0 for
                the default behaviour.
            dropout: Whether to apply dropout to RNN.
            rnn_hidden_size: the hidden size of the RNN, if not specified, pred_n_hidden would be used
        r6   )padding_idxr   )

input_sizehidden_size
num_layersr  r  r  ra   r  r	  	proj_size)r  r  )r`   r>   nn	Embeddingr@   
ModuleDictr   )r)   r[   r  r  r  r  r  r  r	  ra   r
  r  layersr*   r*   r+   rf   .  s(   zRNNTDecoder._predict_modulesc                 C   s   | d}| jr)| jr)tj| j|| j|j|jdtj| j|| j|j|jdf}|S tj	| j|| j|j|jdtj	| j|| j|j|jdf}|S )a  
        Initialize the state of the LSTM layers, with same dtype and device as input `y`.
        LSTM accepts a tuple of 2 tensors as a state.

        Args:
            y: A torch.Tensor whose device the generated states will be placed on.

        Returns:
            Tuple of 2 tensors, each of shape [L, B, H], where

                L = Number of RNN layers

                B = Batch size

                H = Hidden size of RNN.
        r   )r<   rE   )
r;   r  r  r>   randnr  r^   r<   rE   ry   r   r*   r*   r+   rG   d  s   

zRNNTDecoder.initialize_stater   r   c                 C   r   r   r   r   r*   r*   r+   r     r   zRNNTDecoder.score_hypothesisr   c                 C   s\  t |}|dkrtdt|  }|j}g }g }dd t|D }t|D ]$\}	}
t|
j}||v r:|| ||	< q&|	|
jd  |	||
j
f q&|rt |}tj||tjd|d}| dd |D }| j||d|d	\}}d}t|D ]&}	||	 d
u r| ||}|| |f||	< || |f||| d < |d7 }qydd |D dd |D fS )r   r   r   c                 S   r   r   r*   r   r*   r*   r+   r     r   z6RNNTDecoder.batch_score_hypothesis.<locals>.<listcomp>r   rv   c                 S   r   r*   r*   r   r*   r*   r+   r     r   Fr   Nr6   c                 S   r   r*   r*   r   r*   r*   r+   r     r   c                 S   r   r*   r*   r   r*   r*   r+   r     r   r   )r)   r   r   r   r}   rE   r   r   r   r   r   r   r   r   r   r   r   r*   r*   r+   r     s<   

z"RNNTDecoder.batch_score_hypothesisr   c                 C   s0   t dd |D }|dddd}t| S )aM  
        Creates a stacked decoder states to be passed to prediction network

        Args:
            decoder_states (list of list of list of torch.Tensor): list of decoder states
                [B, C, L, H]
                    - B: Batch size.
                    - C: e.g., for LSTM, this is 2: hidden and cell states
                    - L: Number of layers in prediction RNN.
                    - H: Dimensionality of the hidden state.

        Returns:
            batch_states (list of torch.Tensor): batch of decoder states
                [C x torch.Tensor[L x B x H]
        c                 S   s   g | ]}t |qS r*   r   )r   decoder_stater*   r*   r+   r         z7RNNTDecoder.batch_initialize_states.<locals>.<listcomp>r6   rk   r      )r>   r   permuter   r|   )r)   r   stacked_statespermuted_statesr*   r*   r+   r     s   z#RNNTDecoder.batch_initialize_statesr   r   c                    s   |dur fdd|D S dS )ag  Get decoder state from batch of states, for given id.

        Args:
            batch_states (list): batch of decoder states
                ([L x (B, H)], [L x (B, H)])

            idx (int): index to extract state from batch of states

        Returns:
            (tuple): decoder states for given id
                ([L x (1, H)], [L x (1, H)])
        Nc                    s   g | ]
}|d d  f qS r   r*   r   r   r*   r+   r     s    z2RNNTDecoder.batch_select_state.<locals>.<listcomp>r*   )r)   r   r   r*   r$  r+   r     s   zRNNTDecoder.batch_select_stater   	beam_sizeindicesr   c                 C   s   |d j d }|d j d }t||||f}t||| |f}	|ddddddf |}
|dur\tj|d |d|
|d |d tj|d |d|
|d |d |S tj|d |d|
d|	tj|d |d|
d|	fS )a}  
        Aggregates decoder states based on the given indices.
        Args:
            src_states (Tuple[torch.Tensor, torch.Tensor]): source states of
                shape `([L x (batch_size * beam_size, H)], [L x (batch_size * beam_size, H)])`
            batch_size (int): The size of the batch.
            beam_size (int): The size of the beam.
            indices (torch.Tensor): A tensor of shape `(batch_size, beam_size)` containing
                the indices in beam that map the source states to the destination states.
            dst_states (Optional[Tuple[torch.Tensor, torch.Tensor]]): If provided, the method
                updates these tensors in-place.
        Returns:
            Tuple[torch.Tensor, torch.Tensor]:
        Note:
            - The `indices` tensor is expanded to match the shape of the source states
            during the gathering operation.
        r   r   Nrk   )rx   indexr   r6   )rx   r'  )rz   r>   Sizeexpandgatherr   )r   r   rt   r%  r&  r   
layers_num
layers_dim
beam_shape
flat_shapeindices_expandedr*   r*   r+   batch_aggregate_states_beam"  s    z'RNNTDecoder.batch_aggregate_states_beamc                 C   s   g }t t|d D ]A}g }t t|D ]%}t|| | tjs)t|| | n|| | }|d}|| qt|d}|	dd}|| q
|S )a  Concatenate a batch of decoder state to a packed state.

        Args:
            batch_states (list): batch of decoder states
                B x ([L x (H)], [L x (H)])

        Returns:
            (tuple): decoder states
                (L x B x H, L x B x H)
        r   r6   )
r   r   
isinstancer>   r   r   r   r   r{   ro   )r)   r   r   state_idr   r   r   r   r*   r*   r+   r   U  s   

zRNNTDecoder.batch_concat_statesr   r   c                 C   s   |dur|n|}|d j }tj|dd|d ||d ||d d tj|dd|d ||d ||d d dS )a  
        Replaces states in `dst_states` with states from `src_states` based on the given `mask`.

        Args:
            mask (torch.Tensor): When True, selects values from `src_states`, otherwise `out` or `other_src_states`(if provided).
            src_states (Tuple[torch.Tensor, torch.Tensor]): Values selected at indices where `mask` is True.
            dst_states (Tuple[torch.Tensor, torch.Tensor])): The output states.
            other_src_states (Tuple[torch.Tensor, torch.Tensor], optional): Values selected at indices where `mask` is False.

        Note:
            This operation is performed without CPU-GPU synchronization by using `torch.where`.
        Nr   r   r   r6   )r<   r>   r   r   rB   )r   r   r   r   r   r   r<   r*   r*   r+   r   s  s   
6:z%RNNTDecoder.batch_replace_states_maskc                 C   s   |du r|d  |d  |d  |d  dS |d ddd|f  |d ddd|f  |d ddd|f  |d ddd|f  dS )r   Nr   r6   r   r   r*   r*   r+   r     s
   26z$RNNTDecoder.batch_replace_states_allc                 C   s   |d   |d   fS )r   r   r6   r   r   r*   r*   r+   r     s   zRNNTDecoder.clone_statec                 C   s0   dd t |d jddd|d jdddD S )r   c                 S   s$   g | ]\}}| d | d fqS r6   )squeeze)r   sub_state_1sub_state_2r*   r*   r+   r     s    z2RNNTDecoder.batch_split_states.<locals>.<listcomp>r   r6   rw   )zipr   r   r*   r*   r+   r     s   $zRNNTDecoder.batch_split_statesc                 C   sD   t jdd |D ddj||dt jdd |D ddj||dfS )a:  
        Concatenate a batch of decoder state to a packed state. Inverse of `batch_split_states`.

        Args:
            batch_states (list): batch of decoder states
                B x ([L x (H)], [L x (H)])

        Returns:
            (tuple): decoder states
                (L x B x H, L x B x H)
        c                 S   r   r   r*   r   r*   r*   r+   r     r   z4RNNTDecoder.batch_unsplit_states.<locals>.<listcomp>r6   rw   rv   c                 S   r   r3  r*   r   r*   r*   r+   r     r   r   r   r*   r*   r+   r     s     z RNNTDecoder.batch_unsplit_statesr   r   r   r   c                 C   s   t t|D ]A}|du r%|| dd|ddf || dd|ddf< q|| dd|ddf  d9  < || dd|ddf  |7  < q|S )aC  Copy states from new state to old state at certain indices.

        Args:
            old_states(list): packed decoder states
                (L x B x H, L x B x H)

            new_states: packed decoder states
                (L x B x H, L x B x H)

            ids (list): List of indices to copy states at.

            value (optional float): If a value should be copied instead of a state slice, a float should be provided

        Returns:
            batch of decoder states with partial copy at ids (or a specific value).
                (L x B x H, L x B x H)
        Nrb   )r   r   )r)   r   r   r   r   r2  r*   r*   r+   r     s   2"$zRNNTDecoder.batch_copy_statesr&   c                 C   s(   |d dd|f |d dd|f fS )r   r   Nr6   r*   r   r*   r*   r+   r     s   (zRNNTDecoder.mask_select_statesnamecfgc                       |  |}t j||d d S N)r8  r9  _update_adapter_cfg_input_dimrS   add_adapterr)   r8  r9  rW   r*   r+   r>       
zRNNTDecoder.add_adapterc                 C      t j| || jd}|S N)
module_dim)r   update_adapter_cfg_input_dimr^   r)   r9  r*   r*   r+   r=       z)RNNTDecoder._update_adapter_cfg_input_dimr   )NFTr   r   r   )0r   r   r   r   r   r,   r5   rM   rT   r   r   r   r   r   r   rd   r   rr   r>   r   r   r   rn   rf   rG   r   r   r   r   r   r   r   r'   r0  r   r   r   r   r   r   r   rH   r   r   r   r>  r=  r   r*   r*   rW   r+   r  (  s   9



&
o 6
5
"A&"2,
!
r  c                       s  e Zd ZdZedd Zedd Z fddZdMd
dZedd Z									dNde
eef dededee dee dededee dedef fddZe 				dOdejdeej d eej d!eej d"eej d#ed$eejeeej  f fd%d&Zd'ejd$ejfd(d)Zd*ejd$ejfd+d,Zd-ejd.ejd$ejfd/d0Zd1d2 Zd3ed4ef fd5d6Zd4efd7d8Zed9d: Zed;d< Z ed=d> Z!d?d@ Z"edAdB Z#dCdD Z$edEdF Z%dPdGdHZ&edIdJ Z'dKdL Z(  Z)S )Q	RNNTJointa  A Recurrent Neural Network Transducer Joint Network (RNN-T Joint Network).
    An RNN-T Joint network, comprised of a feedforward model.

    Args:
        jointnet: A dict-like object which contains the following key-value pairs.
            encoder_hidden: int specifying the hidden dimension of the encoder net.
            pred_hidden: int specifying the hidden dimension of the prediction net.
            joint_hidden: int specifying the hidden dimension of the joint net
            activation: Activation function used in the joint step. Can be one of
            ['relu', 'tanh', 'sigmoid'].

            Optionally, it may also contain the following:
            dropout: float, set to 0.0 by default. Optional dropout applied at the end of the joint net.

        num_classes: int, specifying the vocabulary size that the joint network must predict,
            excluding the RNNT blank token.

        vocabulary: Optional list of strings/tokens that comprise the vocabulary of the joint network.
            Unused and kept only for easy access for character based encoding RNNT models.

        log_softmax: Optional bool, set to None by default. If set as None, will compute the log_softmax()
            based on the value provided.

        preserve_memory: Optional bool, set to False by default. If the model crashes due to the memory
            intensive joint step, one might try this flag to empty the tensor cache in pytorch.

            Warning: This will make the forward-backward pass much slower than normal.
            It also might not fix the OOM if the GPU simply does not have enough memory to compute the joint.

        fuse_loss_wer: Optional bool, set to False by default.

            Fuses the joint forward, loss forward and
            wer forward steps. In doing so, it trades of speed for memory conservation by creating sub-batches
            of the provided batch of inputs, and performs Joint forward, loss forward and wer forward (optional),
            all on sub-batches, then collates results to be exactly equal to results from the entire batch.

            When this flag is set, prior to calling forward, the fields `loss` and `wer` (either one) *must*
            be set using the `RNNTJoint.set_loss()` or `RNNTJoint.set_wer()` methods.

            Further, when this flag is set, the following argument `fused_batch_size` *must* be provided
            as a non negative integer. This value refers to the size of the sub-batch.

            When the flag is set, the input and output signature of `forward()` of this method changes.
            Input - in addition to `encoder_outputs` (mandatory argument), the following arguments can be provided.

                - decoder_outputs (optional). Required if loss computation is required.

                - encoder_lengths (required)

                - transcripts (optional). Required for wer calculation.

                - transcript_lengths (optional). Required for wer calculation.

                - compute_wer (bool, default false). Whether to compute WER or not for the fused batch.

            Output - instead of the usual `joint` log prob tensor, the following results can be returned.

                - loss (optional). Returned if decoder_outputs, transcripts and transript_lengths are not None.

                - wer_numerator + wer_denominator (optional). Returned if transcripts, transcripts_lengths are provided
                    and compute_wer is set.

        fused_batch_size: Optional int, required if `fuse_loss_wer` flag is set. Determines the size of the
            sub-batches. Should be any value below the actual batch size per GPU.
        masking_prob: Optional float, indicating the probability of masking out decoder output in HAINAN
            (Hybrid Autoregressive Inference Transducer) model, described in https://arxiv.org/pdf/2410.02597
            Default to -1.0, which runs standard Joint network computation; if > 0, then masking out decoder output
            with the specified probability.
    c              	   C   sT   t dt t dt t tdt ddt dt ddt tdt ddt dddS )r   r0   r   Tr!   r   encoder_outputsdecoder_outputsencoder_lengthstranscriptstranscript_lengthscompute_wer)r   r   r   r'   r   r   r(   r*   r*   r+   r,   G  s   

zRNNTJoint.input_typesc                 C   sL   | j sdtdt iS tt ddtt ddtt ddtt dddS )r/   r3   r   r    r    r1   T)elements_typer"   )losswer	wer_numer	wer_denom)_fuse_loss_werr   r   r   r   r(   r*   r*   r+   r5   S  s   zRNNTJoint.output_typesc                    s"   d| _ d| _t jdi | d S )NFr*   )rU  log_softmaxrS   rT   rU   rW   r*   r+   rT   c  s   zRNNTJoint._prepare_for_exportr6       c                 C   sX   |||}}}t || j|t|  j}t || j|t|  j}||fS )r9   )r>   r  encoder_hiddenrB   rC   rD   rE   r^   )r)   rJ   rK   r   r    r~   rI  rJ  r*   r*   r+   rM   h  s     zRNNTJoint.input_examplec                 C   s   t g dS )zHImplement this method to return a set of input names disabled for export)rK  rL  rM  rN  )setr(   r*   r*   r+   disabled_deployment_input_namess  s   z)RNNTJoint.disabled_deployment_input_namesr   NF      jointnetnum_classesnum_extra_outputs
vocabularyrV  preserve_memoryfuse_loss_werfused_batch_sizeexperimental_fuse_loss_wermasking_probc                    s  t    || _|| _|| _|d | | _|
| _| jdkr&| jdk s&J d|	d ur,|	}|| _|| _|r<|d u r<t	dd | _
d | _|| _|| _|rOtd |d | _|d | _|d	 | _|d
 | _|dd}| j| j| j| j| j| j|d\| _| _| _d| _d| _d S )Nr6   rb   r  z$masking_prob must be between 0 and 1BIf `fuse_loss_wer` is set, then `fused_batch_size` cannot be None!z`preserve_memory` was set for the Joint Model. Please be aware this will severely impact the forward-backward step time. It also might not solve OOM issues if the GPU simply does not have enough memory to compute the joint.rX  r^   joint_hidden
activationra   )r]  r  enc_n_hiddenjoint_n_hiddenrg  ra   F)rS   rd   r_  _vocab_size_num_extra_outputs_num_classesrd  rU  _fused_batch_sizer   _loss_werrV  r`  r   warningrX  r^   rf  rg  re   _joint_net_modulespredenc	joint_netrR   temperature)r)   r\  r]  r^  r_  rV  r`  ra  rb  rc  rd  ra   rW   r*   r+   rd   x  sJ   







zRNNTJoint.__init__rI  rJ  rK  rL  rM  rN  ru   c                 C   s
  | dd}|d ur| dd}| js#|d u rtd| ||}|S | jd u s-| jd u r1td| jd u r:td|d u sB|d u rFtdg }g g g }	}
}g }t|d}t	d|| jD ]}|}t
|| j |}|jd|t|| d}|jd|t|| d}||| }||| }| }| }|d ur|jd |kr|jddt|d}|jd|t|| d}|jd |d kr|jddt|d d}| ||}~|jd |kr|jddt|d}| jj}d | j_| j||||d	}|| || || j_nd }|rT| dd}| }| }| jr&| jj}d
| j_| jj||||d | j \}}}| j  | jrE|| j_|	| |
| || ~~~~q`|d ure| j||}|ryt|	t|	 }t|
}t|}nd }d }d }||||fS )Nr6   rk   zpdecoder_outputs passed is None, and `fuse_loss_wer` is not set. decoder_outputs can only be None for fused step!M`fuse_loss_wer` flag is set, but `loss` and `wer` modules were not provided! re  V`fuse_loss_wer` is set, therefore encoder and target lengths must be provided as well!r   rx   r   rL   	log_probsr$   input_lengthstarget_lengthsFpredictionspredictions_lengthsr$   targets_lengths)ro   rU  r   jointrn  ro  rm  r   r;   r   minnarrowmaxrz   rQ  	reductionr   detachr  rR  _to_syncupdatecomputeresetreducesumr   )r)   rI  rJ  rK  rL  rM  rN  r   losseswerswer_nums
wer_denomsr|  rt   	batch_idxbeginendsub_encsub_transcriptssub_enc_lenssub_transcript_lensmax_sub_enc_lengthmax_sub_transcript_lengthsub_dec	sub_jointloss_reduction
loss_batchoriginal_syncrR  wer_numrT  r*   r*   r+   rr     s   











zRNNTJoint.forwardencoder_outputc                 C   
   |  |S )z
        Project the encoder output to the joint hidden dimension.

        Args:
            encoder_output: A torch.Tensor of shape [B, T, D]

        Returns:
            A torch.Tensor of shape [B, T, H]
        )rs  )r)   r  r*   r*   r+   project_encoder`     

zRNNTJoint.project_encoderprednet_outputc                 C   r  )z
        Project the Prediction Network (Decoder) output to the joint hidden dimension.

        Args:
            prednet_output: A torch.Tensor of shape [B, U, D]

        Returns:
            A torch.Tensor of shape [B, U, H]
        )rr  )r)   r  r*   r*   r+   project_prednetl  r  zRNNTJoint.project_prednetfrq   c           	      C   s  |j dd}|j dd}| jr3| jdkr3|j\}}}}t|d|dg|j}t|| j}|| }|| }~~| 	 rB| 
|}| |}~| jrPtj  | jdu rp|jsn| jdkrh|| j jdd}|S |jdd}|S | jr| jdkr|| j jdd}|S |jdd}|S )aT  
        Compute the joint step of the network after projection.

        Here,
        B = Batch size
        T = Acoustic model timesteps
        U = Target sequence length
        H1, H2 = Hidden dimensions of the Encoder / Decoder respectively
        H = Hidden dimension of the Joint hidden step.
        V = Vocabulary size of the Decoder (excluding the RNNT blank token).

        NOTE:
            The implementation of this model is slightly modified from the original paper.
            The original paper proposes the following steps :
            (enc, dec) -> Expand + Concat + Sum [B, T, U, H1+H2] -> Forward through joint hidden [B, T, U, H] -- \*1
            \*1 -> Forward through joint final [B, T, U, V + 1].

            We instead split the joint hidden into joint_hidden_enc and joint_hidden_dec and act as follows:
            enc -> Forward through joint_hidden_enc -> Expand [B, T, 1, H] -- \*1
            dec -> Forward through joint_hidden_dec -> Expand [B, 1, U, H] -- \*2
            (\*1, \*2) -> Sum [B, T, U, H] -> Forward through joint final [B, T, U, V + 1].

        Args:
            f: Output of the Encoder model. A torch.Tensor of shape [B, T, H1]
            g: Output of the Decoder model. A torch.Tensor of shape [B, U, H2]

        Returns:
            Logits / log softmaxed tensor of shape (B, T, U, V + 1).
        rk   rw   r6   r   Nr  r   )r   r  rd  rz   r>   randrB   rE   gtr  r  rt  r`  cudaempty_cacherV  is_cudaru  )	r)   r  rq   r   r   r~   r  inpresr*   r*   r+   joint_after_projectionx  s8   






z RNNTJoint.joint_after_projectionc           
      C   s   t j||}t j||}|dvrtd| }|dkr&t jjdd}n|dkr0t j }n	|dkr9t j }|g|rEt jj|dgng  t j||g }	||t jj	|	 fS )	a  
        Prepare the trainable modules of the Joint Network

        Args:
            num_classes: Number of output classes (vocab size) excluding the RNNT blank token.
            pred_n_hidden: Hidden size of the prediction network.
            enc_n_hidden: Hidden size of the encoder network.
            joint_n_hidden: Hidden size of the joint network.
            activation: Activation of the joint. Can be one of [relu, tanh, sigmoid]
            dropout: Dropout value to apply to joint.
        )relusigmoidtanhzPUnsupported activation for joint step - please pass one of [relu, sigmoid, tanh]r  T)inplacer  r  )p)
r>   r  Linearr   lowerReLUSigmoidTanhDropout
Sequential)
r)   r]  r  rh  ri  rg  ra   rr  rs  r  r*   r*   r+   rq    s$   
zRNNTJoint._joint_net_modulesr8  r9  c                    r:  r;  r<  r?  rW   r*   r+   r>    r@  zRNNTJoint.add_adapterc                 C   rA  rB  )r   rD  rf  rE  r*   r*   r+   r=    rF  z'RNNTJoint._update_adapter_cfg_input_dimc                 C      | j S r   )rl  r(   r*   r*   r+   num_classes_with_blank     z RNNTJoint.num_classes_with_blankc                 C   r  r   )rk  r(   r*   r*   r+   r^    r  zRNNTJoint.num_extra_outputsc                 C   r  r   )rn  r(   r*   r*   r+   rQ    r  zRNNTJoint.lossc                 C      | j std|| _d S )NzEAttempting to set loss module even though `fuse_loss_wer` is not set!)rU  r   rn  )r)   rQ  r*   r*   r+   set_loss     
zRNNTJoint.set_lossc                 C   r  r   )ro  r(   r*   r*   r+   rR    r  zRNNTJoint.werc                 C   r  )NzDAttempting to set WER module even though `fuse_loss_wer` is not set!)rU  r   ro  )r)   rR  r*   r*   r+   set_wer  r  zRNNTJoint.set_werc                 C   r  r   )rU  r(   r*   r*   r+   ra    r  zRNNTJoint.fuse_loss_werc                 C   s   || _ || _|| _d S r   )rU  rn  ro  )r)   ra  rQ  metricr*   r*   r+   set_fuse_loss_wer  s   
zRNNTJoint.set_fuse_loss_werc                 C   r  r   rm  r(   r*   r*   r+   rb    r  zRNNTJoint.fused_batch_sizec                 C   s
   || _ d S r   r  )r)   rb  r*   r*   r+   set_fused_batch_size  s   
zRNNTJoint.set_fused_batch_size)r6   rW  )r   NNFFNNr[  NNNFr   )*r   r   r   r   r   r,   r5   rT   rM   rZ  r   r   r   r   r   r   r   rH   rd   r   r>   r   r   rr   r  r  r  rq  r   r>  r=  r  r^  rQ  r  rR  r  ra  r  rb  r  r   r*   r*   rW   r+   rG     s    F




	
I F#






rG  c                       sJ   e Zd ZdZ fddZedd ZdddZed	d
 Zdd Z	  Z
S )RNNTDecoderJointzB
    Utility class to export Decoder+Joint as a single module
    c                       t    || _|| _d S r   rS   rd   decoderr  r)   r  r  rW   r*   r+   rd        

zRNNTDecoderJoint.__init__c                 C   s<   t dt }t dt t dt t tdt ||d}|S )Nr  r0   r   r   )rI  r$   r%   input_states_1input_states_2)r   r   r   r   r'   r   )r)   
state_typemytypesr*   r*   r+   r,   #  s   

zRNNTDecoderJoint.input_typesr6   c                 C   sD   | j j||d}|d \}}t| j d g|d d  ||f S )N)rJ   rK   r   r   rk   )r  rM   r'   r  )r)   rJ   rK   decoder_examplestate1state2r*   r*   r+   rM   0  s   (zRNNTDecoderJoint.input_examplec                 C   s2   t dt t tdt t dt t dt dS )NrO  r   r  )r3   r4   output_states_1output_states_2)r   r   r'   r   r   r(   r*   r*   r+   r5   5  s
   


zRNNTDecoderJoint.output_typesc           
      C   sT   |  ||||f}|d }|d }|d d |d d }}| ||}	|	|||fS )Nr   r6   rk   r  r  )
r)   rI  r$   r%   r  r  rJ  decoder_outputdecoder_lengthjoint_outputr*   r*   r+   rr   >  s   zRNNTDecoderJoint.forwardr   )r   r   r   r   rd   r   r,   rM   r5   rr   r   r*   r*   rW   r+   r    s    


r  c                       sH   e Zd Z fddZedd Zedd Zedd Zd	d
 Z  Z	S )RNNTDecoderJointSSLc                    r  r   r  r  rW   r*   r+   rd   H  r  zRNNTDecoderJointSSL.__init__c                 C   s   dS )NTr*   r(   r*   r*   r+   needs_labelsM  s   z RNNTDecoderJointSSL.needs_labelsc                 C   s(   t dt t dt t tdt dS )Nr0   r   r   )r  r$   r|  )r   r   r   r'   r   r(   r*   r*   r+   r,   Q  s   

zRNNTDecoderJointSSL.input_typesc                 C   s   dt dt iS )Nrz  )r   r    r1   )r   r   r(   r*   r*   r+   r5   Y  s   z RNNTDecoderJointSSL.output_typesc                 C   s&   | j ||d\}}}| j||d}|S )N)r$   r%   )rI  rJ  r  )r)   r  r$   r|  r  r%   r&   rz  r*   r*   r+   rr   ]  s   zRNNTDecoderJointSSL.forward)
r   r   r   rd   r   r  r,   r5   rr   r   r*   r*   rW   r+   r  G  s    


r  c                       s   e Zd ZdZ					ddeeef dededee	 dee
 d	e
d
e
dee f fddZe 				ddejdeej deej deej deej de
deeje	eej  f f fddZdejdejdejdejdejf
 fddZ  ZS )SampledRNNTJointa  A Sampled Recurrent Neural Network Transducer Joint Network (RNN-T Joint Network).
    An RNN-T Joint network, comprised of a feedforward model, where the vocab size will be sampled instead
    of computing the full vocabulary joint.

    Args:
        jointnet: A dict-like object which contains the following key-value pairs.
            encoder_hidden: int specifying the hidden dimension of the encoder net.
            pred_hidden: int specifying the hidden dimension of the prediction net.
            joint_hidden: int specifying the hidden dimension of the joint net
            activation: Activation function used in the joint step. Can be one of
            ['relu', 'tanh', 'sigmoid'].

            Optionally, it may also contain the following:
            dropout: float, set to 0.0 by default. Optional dropout applied at the end of the joint net.

        num_classes: int, specifying the vocabulary size that the joint network must predict,
            excluding the RNNT blank token.

        n_samples: int, specifies the number of tokens to sample from the vocabulary space,
            excluding the RNNT blank token. If a given value is larger than the entire vocabulary size,
            then the full vocabulary will be used.

        vocabulary: Optional list of strings/tokens that comprise the vocabulary of the joint network.
            Unused and kept only for easy access for character based encoding RNNT models.

        log_softmax: Optional bool, set to None by default. If set as None, will compute the log_softmax()
            based on the value provided.

        preserve_memory: Optional bool, set to False by default. If the model crashes due to the memory
            intensive joint step, one might try this flag to empty the tensor cache in pytorch.

            Warning: This will make the forward-backward pass much slower than normal.
            It also might not fix the OOM if the GPU simply does not have enough memory to compute the joint.

        fuse_loss_wer: Optional bool, set to False by default.

            Fuses the joint forward, loss forward and
            wer forward steps. In doing so, it trades of speed for memory conservation by creating sub-batches
            of the provided batch of inputs, and performs Joint forward, loss forward and wer forward (optional),
            all on sub-batches, then collates results to be exactly equal to results from the entire batch.

            When this flag is set, prior to calling forward, the fields `loss` and `wer` (either one) *must*
            be set using the `RNNTJoint.set_loss()` or `RNNTJoint.set_wer()` methods.

            Further, when this flag is set, the following argument `fused_batch_size` *must* be provided
            as a non negative integer. This value refers to the size of the sub-batch.

            When the flag is set, the input and output signature of `forward()` of this method changes.
            Input - in addition to `encoder_outputs` (mandatory argument), the following arguments can be provided.

                - decoder_outputs (optional). Required if loss computation is required.

                - encoder_lengths (required)

                - transcripts (optional). Required for wer calculation.

                - transcript_lengths (optional). Required for wer calculation.

                - compute_wer (bool, default false). Whether to compute WER or not for the fused batch.

            Output - instead of the usual `joint` log prob tensor, the following results can be returned.

                - loss (optional). Returned if decoder_outputs, transcripts and transript_lengths are not None.

                - wer_numerator + wer_denominator (optional). Returned if transcripts, transcripts_lengths are provided
                    and compute_wer is set.

        fused_batch_size: Optional int, required if `fuse_loss_wer` flag is set. Determines the size of the
            sub-batches. Should be any value below the actual batch size per GPU.
    NFr\  r]  	n_samplesr_  rV  r`  ra  rb  c	           	   	      sB   t  j|||||||d || _| jdt| jd gdd d S )N)r\  r]  r_  rV  r`  ra  rb  blank_idr6   F)
persistent)rS   rd   r  register_bufferr>   r   r  )	r)   r\  r]  r  r_  rV  r`  ra  rb  rW   r*   r+   rd     s   	"zSampledRNNTJoint.__init__rI  rJ  rK  rL  rM  rN  ru   c                    s.  t  rt  rt j||||||dS |d u s|d u r%td td|dd}|d ur5|dd}| j	d u s?| j
d u rCtd| jd u rLtd|d u sT|d u rXtdg }g g g }}	}
g }t|d	}td	|| jD ]}|}t|| j |}|jd	|t|| d
}|jd	|t|| d
}||| }||| }| }| }|d ur-|jd |kr|jdd	t|d
}|jd	|t|| d
}|jd |d kr|jdd	t|d d
}|jd |kr|jdd	t|d
}| j||||d\}}~| jj}d | j_| jj	j}d	| jj	_| j||||d}|| || || j_|| jj	_nd }|rf|dd}| }| }| jj||||d | j \}}}| j  || |	| |
| ~~~~qr|d urw| j||}|rt|t| }t|	}t|
}nd }d }d }||||fS )NrH  z|Sampled RNNT Joint currently only works with `fuse_loss_wer` set to True, and when `fused_batch_size` is a positive integer.zSampled RNNT loss only works when the transcripts are provided during training.Please ensure that you correctly pass the `transcripts` and `transcript_lengths`.r6   rk   rv  re  rw  r   rx  )
transcriptrM  ry  r}  ) r>   is_grad_enabledis_inference_mode_enabledrS   rr   r   rp  r   ro   rn  ro  rm  r   r;   r   r  r  r  rz   sampled_jointrQ  r  blankr   r  rR  r  r  r  r  r  r   )r)   rI  rJ  rK  rL  rM  rN  r  r  r  r  r|  rt   r  r  r  r  r  r  r  r  r  r  r  sub_transcripts_remappedr  cached_blank_idr  rR  r  rT  rW   r*   r+   rr     s   	













zSampledRNNTJoint.forwardr  rq   r  c                    s  | j du st du st rt j||dS | |}|jdd | |}|jdd || }~~| 	 r<| 
|}| jdd D ]}||}qCt Z t|}t| j|g}tj|dd	d
}dd t|d	|D }	t|	  \}
}|j}tj||d
}tj|
|d
}
t| |
}|| |j}||}W d   n1 sw   Y  | jd j|ddf }| jd j| }t||dd| }t A tj| j d |jdd| j! }t"|dddf |dddf k}|# }tj$|tj%d}d||d < || }W d   n	1 sw   Y  | jd j|ddf }| jd j| }t||dd| }tj||gdd}~| j&rQtj'(  | j)du re|j*sa|j)dd}||fS | j)ro|j)dd}||fS )a  
        Compute the sampled joint step of the network.

        Reference: `Memory-Efficient Training of RNN-Transducer with Sampled Softmax <https://arxiv.org/abs/2203.16868>`__.

        Here,
        B = Batch size
        T = Acoustic model timesteps
        U = Target sequence length
        H1, H2 = Hidden dimensions of the Encoder / Decoder respectively
        H = Hidden dimension of the Joint hidden step.
        V = Vocabulary size of the Decoder (excluding the RNNT blank token).
        S = Sample size of vocabulary.

        NOTE:
            The implementation of this joint model is slightly modified from the original paper.
            The original paper proposes the following steps :
            (enc, dec) -> Expand + Concat + Sum [B, T, U, H1+H2] -> Forward through joint hidden [B, T, U, H] -- \*1
            \*1 -> Forward through joint final [B, T, U, V + 1].

            We instead split the joint hidden into joint_hidden_enc and joint_hidden_dec and act as follows:
            enc -> Forward through joint_hidden_enc -> Expand [B, T, 1, H] -- \*1
            dec -> Forward through joint_hidden_dec -> Expand [B, 1, U, H] -- \*2
            (\*1, \*2) -> Sum [B, T, U, H] -> Sample Vocab V_Pos (for target tokens) and V_Neg ->
            (V_Neg is sampled not uniformly by as a rand permutation of all vocab tokens, then eliminate
            all Intersection(V_Pos, V_Neg) common tokens to avoid duplication of loss) ->
            Concat new Vocab V_Sampled = Union(V_Pos, V_Neg)
            -> Forward partially through the joint final to create [B, T, U, V_Sampled]

        Args:
            f: Output of the Encoder model. A torch.Tensor of shape [B, T, H1]
            g: Output of the Decoder model. A torch.Tensor of shape [B, U, H2]
            transcript: Batch of transcripts. A torch.Tensor of shape [B, U]
            transcript_lengths: Batch of lengths of the transcripts. A torch.Tensor of shape [B]

        Returns:
            Logits / log softmaxed tensor of shape (B, T, U, V + 1).
        F)r  rq   rk   rw   r6   Nr   r   cpu)rE   c                 S   s   i | ]\}}||qS r*   r*   )r   kvr*   r*   r+   
<dictcomp>  r  z2SampledRNNTJoint.sampled_joint.<locals>.<dictcomp>)nrE   )r<   )+r  r>   r  r  rS   r  rs  
unsqueeze_rr  r  r  rt  no_graduniquer{   r  aranger;   r7  rB   itemsrE   r   	bucketizeravelreshaperz   weightbiasmatmulro   randpermr  r  r   r   	ones_liker   r`  r  r  rV  r  )r)   r  rq   r  rM  r  moduletranscript_vocab_idst_idsmappingpalettekeyt_devicer'  true_weights	true_biastranscript_scores
sample_idsreject_samplesaccept_samplessample_masksample_weightssample_biasnoise_scoresr  rW   r*   r+   r  s  sh   .





&
	&
'
zSampledRNNTJoint.sampled_joint)NNFFNr  )r   r   r   r   r   r   r   r   r   r   r   rd   r   r>   r   r   rr   r  r   r*   r*   rW   r+   r  e  sp    L
	 /r  )3typingr   r   r   r   r   r   r>   	omegaconfr   nemo.collections.asr.modulesr	   %nemo.collections.asr.parts.submodulesr
    nemo.collections.asr.parts.utilsr   r   nemo.collections.common.partsr   nemo.core.classesr   r   nemo.core.classes.exportabler   nemo.core.classes.mixinsr   nemo.core.neural_typesr   r   r   r   r   r   r   r   r   
nemo.utilsr   AbstractRNNTDecoderr   r  AbstractRNNTJointrG  r  Moduler  r  r  r   get_registered_adapterregister_adapterr*   r*   r*   r+   <module>   sL    ,   t     ]    .   L