o
    }oiB                     @   st   d dl mZmZ d dlmZmZmZmZmZ d dl	Z	d dl
mZ d dlmZ G dd deeZG dd	 d	eeZdS )
    )ABCabstractmethod)AnyDictListOptionalTupleN)
Hypothesis)NeuralModulec                   @   s   e Zd ZdZedejdejdefddZedejdejfdd	Z	ed
ejdejfddZ
dejdejdejfddZedd Zedd ZdS )AbstractRNNTJointa?  
    An abstract RNNT Joint framework, which can possibly integrate with GreedyRNNTInfer and BeamRNNTInfer classes.
    Represents the abstract RNNT Joint network, which accepts the acoustic model and prediction network
    embeddings in order to compute the joint of the two prior to decoding the output sequence.
    fgreturnc                 C      t  )a  
        Compute the joint step of the network after the projection step.
        Args:
            f: Output of the Encoder model after projection. A torch.Tensor of shape [B, T, H]
            g: Output of the Decoder model (Prediction Network) after projection. A torch.Tensor of shape [B, U, H]

        Returns:
            Logits / log softmaxed tensor of shape (B, T, U, V + 1).
            Arbitrary return type, preferably torch.Tensor, but not limited to (e.g., see HatJoint)
        NotImplementedErrorselfr   r    r   ^/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/asr/modules/rnnt_abstract.pyjoint_after_projection   s   z(AbstractRNNTJoint.joint_after_projectionencoder_outputc                 C   r   )z
        Project the encoder output to the joint hidden dimension.

        Args:
            encoder_output: A torch.Tensor of shape [B, T, D]

        Returns:
            A torch.Tensor of shape [B, T, H]
        r   )r   r   r   r   r   project_encoder,      z!AbstractRNNTJoint.project_encoderprednet_outputc                 C   r   )z
        Project the Prediction Network (Decoder) output to the joint hidden dimension.

        Args:
            prednet_output: A torch.Tensor of shape [B, U, D]

        Returns:
            A torch.Tensor of shape [B, U, H]
        r   )r   r   r   r   r   project_prednet9   r   z!AbstractRNNTJoint.project_prednetc                 C   s   |  | || |S )a=  
        Compute the joint step of the network.

        Here,
        B = Batch size
        T = Acoustic model timesteps
        U = Target sequence length
        H1, H2 = Hidden dimensions of the Encoder / Decoder respectively
        H = Hidden dimension of the Joint hidden step.
        V = Vocabulary size of the Decoder (excluding the RNNT blank token).

        NOTE:
            The implementation of this model is slightly modified from the original paper.
            The original paper proposes the following steps :
            (enc, dec) -> Expand + Concat + Sum [B, T, U, H1+H2] -> Forward through joint hidden [B, T, U, H] -- *1
            *1 -> Forward through joint final [B, T, U, V + 1].

            We instead split the joint hidden into joint_hidden_enc and joint_hidden_dec and act as follows:
            enc -> Forward through joint_hidden_enc -> Expand [B, T, 1, H] -- *1
            dec -> Forward through joint_hidden_dec -> Expand [B, 1, U, H] -- *2
            (*1, *2) -> Sum [B, T, U, H] -> Forward through joint final [B, T, U, V + 1].

        Args:
            f: Output of the Encoder model. A torch.Tensor of shape [B, T, H1]
            g: Output of the Decoder model. A torch.Tensor of shape [B, U, H2]

        Returns:
            Logits / log softmaxed tensor of shape (B, T, U, V + 1).
        )r   r   r   r   r   r   r   jointF   s   zAbstractRNNTJoint.jointc                 C   r   Nr   r   r   r   r   num_classes_with_blankf      z(AbstractRNNTJoint.num_classes_with_blankc                 C   r   r   r   r   r   r   r   num_extra_outputsj   r    z#AbstractRNNTJoint.num_extra_outputsN)__name__
__module____qualname____doc__r   torchTensorr   r   r   r   r   propertyr   r!   r   r   r   r   r      s     
r   c                       sn  e Zd ZdZ fddZe				d:deej deej de	d	ee
 d
eejeej f f
ddZedejd
eej fddZededeee
 ef d
eejeej ejf fddZdee deee
 ef d
eejeej ejf fddZdeeej  fddZdeej de
d
eeej  fddZe	d;deejejf eej B d	e
de
dejd eeejejf eej B  d
eejejf eej B fd!d"Ze	d;deejejf eej B d eejejf eej B d#ejd$eeejejf eej B  fd%d&Ze	d;deej d eej d	e
dB fd'd(Zed)eejejf eej B d
eejejf eej B fd*d+Zedeej d
eeej  fd,d-Ze	d<deeejejf eej B  d
eejejf eej B fd.d/Zdeeej  d
eej fd0d1Z	d;d2eej d3eej d4ee
 d5ee  d
eej f
d6d7Z!d)ed#ejd
efd8d9Z"  Z#S )=AbstractRNNTDecoderap  
    An abstract RNNT Decoder framework, which can possibly integrate with GreedyRNNTInfer and BeamRNNTInfer classes.
    Represents the abstract RNNT Prediction/Decoder stateful network, which performs autoregressive decoding
    in order to construct the output sequence.

    Args:
        vocab_size: Size of the vocabulary, excluding the RNNT blank token.
        blank_idx: Index of the blank token. Can be 0 or size(vocabulary).
        blank_as_pad: Bool flag, whether to allocate an additional token in the Embedding layer
            of this module in order to treat all RNNT `blank` tokens as pad tokens, thereby letting
            the Embedding layer batch tokens more efficiently.

            It is mandatory to use this for certain Beam RNNT Infer methods - such as TSD, ALSD.
            It is also more efficient to use greedy batch decoding with this flag.
    c                    s4   t    || _|| _|| _|d|fvrtdd S )Nr   zA`blank_idx` must be either 0 or the final token of the vocabulary)super__init__
vocab_size	blank_idxblank_as_pad
ValueError)r   r,   r-   r.   	__class__r   r   r+      s   
zAbstractRNNTDecoder.__init__NFystateadd_sos
batch_sizer   c                 C   r   )a  
        Stateful prediction of scores and state for a (possibly null) tokenset.
        This method takes various cases into consideration :
        - No token, no state - used for priming the RNN
        - No token, state provided - used for blank token scoring
        - Given token, states - used for scores + new states

        Here:
        B - batch size
        U - label length
        H - Hidden dimension size of RNN
        L - Number of RNN layers

        Args:
            y: Optional torch tensor of shape [B, U] of dtype long which will be passed to the Embedding.
                If None, creates a zero tensor of shape [B, 1, H] which mimics output of pad-token on Embedding.

            state: An optional list of states for the RNN. Eg: For LSTM, it is the state list length is 2.
                Each state must be a tensor of shape [L, B, H].
                If None, and during training mode and `random_state_sampling` is set, will sample a
                normal distribution tensor of the above shape. Otherwise, None will be passed to the RNN.

            add_sos: bool flag, whether a zero vector describing a "start of signal" token should be
                prepended to the above "y" tensor. When set, output size is (B, U + 1, H).

            batch_size: An optional int, specifying the batch size of the `y` tensor.
                Can be infered if `y` and `state` is None. But if both are None, then batch_size cannot be None.

        Returns:
            A tuple  (g, hid) such that -

            If add_sos is False:
                g: (B, U, H)
                hid: (h, c) where h is the final sequence hidden state and c is the final cell state:
                    h (tensor), shape (L, B, H)
                    c (tensor), shape (L, B, H)

            If add_sos is True:
                g: (B, U + 1, H)
                hid: (h, c) where h is the final sequence hidden state and c is the final cell state:
                    h (tensor), shape (L, B, H)
                    c (tensor), shape (L, B, H)

        r   )r   r2   r3   r4   r5   r   r   r   predict   s   4zAbstractRNNTDecoder.predictc                 C   r   )a  
        Initialize the state of the RNN layers, with same dtype and device as input `y`.

        Args:
            y: A torch.Tensor whose device the generated states will be placed on.

        Returns:
            List of torch.Tensor, each of shape [L, B, H], where
                L = Number of RNN layers
                B = Batch size
                H = Hidden size of RNN.
        r   )r   r2   r   r   r   initialize_state   s   z$AbstractRNNTDecoder.initialize_state
hypothesiscachec                 C   r   )a  
        Similar to the predict() method, instead this method scores a Hypothesis during beam search.
        Hypothesis is a dataclass representing one hypothesis in a Beam Search.

        Args:
            hypothesis: Refer to rnnt_utils.Hypothesis.
            cache: Dict which contains a cache to avoid duplicate computations.

        Returns:
            Returns a tuple (y, states, lm_token) such that:
            y is a torch.Tensor of shape [1, 1, H] representing the score of the last token in the Hypothesis.
            state is a list of RNN states, each of shape [L, 1, H].
            lm_token is the final integer token of the hypothesis.
        r   )r   r8   r9   r   r   r   score_hypothesis   s   z$AbstractRNNTDecoder.score_hypothesis
hypothesesc                 C   r   )an  
        Used for batched beam search algorithms. Similar to score_hypothesis method.

        Args:
            hypothesis: List of Hypotheses. Refer to rnnt_utils.Hypothesis.
            cache: Dict which contains a cache to avoid duplicate computations.

        Returns:
            Returns a tuple (batch_dec_out, batch_dec_states) such that:
                batch_dec_out: a list of torch.Tensor [1, H] representing the prediction network outputs for the last tokens in the Hypotheses.
                batch_dec_states: a list of list of RNN states, each of shape [L, B, H]. Represented as B x List[states].
        r   )r   r;   r9   r   r   r   batch_score_hypothesis   s   z*AbstractRNNTDecoder.batch_score_hypothesisdecoder_statesc                 C   r   )aM  
        Creates a stacked decoder states to be passed to prediction network

        Args:
            decoder_states (list of list of list of torch.Tensor): list of decoder states
                [B, C, L, H]
                    - B: Batch size.
                    - C: e.g., for LSTM, this is 2: hidden and cell states
                    - L: Number of layers in prediction RNN.
                    - H: Dimensionality of the hidden state.

        Returns:
            batch_states (list of torch.Tensor): batch of decoder states
                [C x torch.Tensor[L x B x H]
        r   )r   r=   r   r   r   batch_initialize_states   s   z+AbstractRNNTDecoder.batch_initialize_statesbatch_statesidxc                 C   r   )ag  Get decoder state from batch of states, for given id.

        Args:
            batch_states (list): batch of decoder states
                ([L x (B, H)], [L x (B, H)])

            idx (int): index to extract state from batch of states

        Returns:
            (tuple): decoder states for given id
                ([L x (1, H)], [L x (1, H)])
        r   )r   r?   r@   r   r   r   batch_select_state  s   z&AbstractRNNTDecoder.batch_select_state
src_states	beam_sizeindices
dst_statesc                 C   r   )a=  
        Aggregates decoder states based on the given indices.
        Args:
            src_states (tuple[torch.Tensor, torch.Tensor] | list[torch.Tensor]): source states of
                shape `([L x (batch_size * beam_size, H)], [L x (batch_size * beam_size, H)])`
            batch_size (int): The size of the batch.
            beam_size (int): The size of the beam.
            indices (torch.Tensor): A tensor of shape `(batch_size, beam_size)` containing
                the indices in beam that map the source states to the destination states.
            dst_states (tuple[torch.Tensor, torch.Tensor] | list[torch.Tensor], optional): If provided, the method
                updates these tensors in-place.
        Returns:
            tuple[torch.Tensor, torch.Tensor] | list[torch.Tensor]: aggregated states
        r   )clsrB   r5   rC   rD   rE   r   r   r   batch_aggregate_states_beam     z/AbstractRNNTDecoder.batch_aggregate_states_beammaskother_src_statesc                 C   r   )a  
        Replaces states in `dst_states` with states from `src_states` based on the given `mask`.

        Args:
            mask (torch.Tensor): When True, selects values from `src_states`, otherwise `out` or `other_src_states`(if provided).
            src_states (tuple[torch.Tensor, torch.Tensor] | list[torch.Tensor]): Values selected at indices where `mask` is True.
            dst_states (tuple[torch.Tensor, torch.Tensor] | list[torch.Tensor], optional): The output states.
            other_src_states (tuple[torch.Tensor, torch.Tensor] | list[torch.Tensor], optional): Values selected at indices where `mask` is False.

        Note:
            This operation is performed without CPU-GPU synchronization by using `torch.where`.
        r   )rF   rB   rE   rI   rJ   r   r   r   batch_replace_states_mask0  s   z-AbstractRNNTDecoder.batch_replace_states_maskc                 C   r   )z8Replace states in dst_states with states from src_statesr   )rF   rB   rE   r5   r   r   r   batch_replace_states_allF  s   z,AbstractRNNTDecoder.batch_replace_states_allstatesc                 C   r   )zReturn copy of the statesr   )rF   rM   r   r   r   clone_stateP  s   zAbstractRNNTDecoder.clone_statec                 C   r   )z
        Split states into a list of states.
        Useful for splitting the final state for converting results of the decoding algorithm to Hypothesis class.
        r   )rF   r?   r   r   r   batch_split_statesW  s   z&AbstractRNNTDecoder.batch_split_statesc                 C   r   )zj
        Concatenate a batch of decoder state to a packed state. Inverse of `batch_split_states`.
        r   )rF   r?   devicedtyper   r   r   batch_unsplit_states_  s   z(AbstractRNNTDecoder.batch_unsplit_statesc                 C   r   )a  Concatenate a batch of decoder state to a packed state.

        Args:
            batch_states (list): batch of decoder states
                B x ([L x (H)], [L x (H)])

        Returns:
            (tuple): decoder states
                (L x B x H, L x B x H)
        r   )r   r?   r   r   r   batch_concat_statesh  r   z'AbstractRNNTDecoder.batch_concat_states
old_states
new_statesidsvaluec                 C   r   )aC  Copy states from new state to old state at certain indices.

        Args:
            old_states(list): packed decoder states
                (L x B x H, L x B x H)

            new_states: packed decoder states
                (L x B x H, L x B x H)

            ids (list): List of indices to copy states at.

            value (optional float): If a value should be copied instead of a state slice, a float should be provided

        Returns:
            batch of decoder states with partial copy at ids (or a specific value).
                (L x B x H, L x B x H)
        r   )r   rT   rU   rV   rW   r   r   r   batch_copy_statesu  rH   z%AbstractRNNTDecoder.batch_copy_statesc                 C   r   )aO  
        Return states by mask selection
        Args:
            states: states for the batch (preferably a list of tensors, but not limited to)
            mask: boolean mask for selecting states; batch dimension should be the same as for states

        Returns:
            states filtered by mask (same type as `states`)
        r   )r   rM   rI   r   r   r   mask_select_states  s   
z&AbstractRNNTDecoder.mask_select_states)NNFNr   )NN)$r"   r#   r$   r%   r+   r   r   r&   r'   boolintr   r   r6   r7   r	   r   r   r:   r<   r>   rA   classmethodtuplelistrG   rK   rL   rN   rO   rR   rS   floatrX   rY   __classcell__r   r   r0   r   r)   o   s    
5
&	$"
 r)   )abcr   r   typingr   r   r   r   r   r&   +nemo.collections.asr.parts.utils.rnnt_utilsr	   	nemo.corer
   r   r)   r   r   r   r   <module>   s   X