o
    eicX                     @   sl   d Z ddlmZ ddlmZ ddlmZmZ ddlZeG dd dej	j
ZG dd	 d	ej	j
Zd
d ZdS )zsDecoders and output normalization for Transducer sequence.

Author:
    Abdelwahab HEBA 2020
    Sung-Lin Yeh 2020
    )	dataclass)partial)AnyOptionalNc                   @   s"   e Zd ZU dZdZee ed< dS )(TransducerGreedySearcherStreamingContextzSimple wrapper for the hidden state of the transducer greedy searcher.
    Used by :meth:`~TransducerBeamSearcher.transducer_greedy_decode_streaming`.
    Nhidden)__name__
__module____qualname____doc__r   r   r   __annotations__ r   r   ]/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/speechbrain/decoders/transducer.pyr      s   
 r   c                       s   e Zd ZdZ						d  fdd	Zd	d
 Z	d!ddZdejde	fddZ
dd Zdd Zdd Zdd Zdd Zd"ddZdd Z  ZS )#TransducerBeamSearcheru9  
    This class implements the beam-search algorithm for the transducer model.

    Arguments
    ---------
    decode_network_lst : list
        List of prediction network (PN) layers.
    tjoint: transducer_joint module
        This module perform the joint between TN and PN.
    classifier_network : list
        List of output layers (after performing joint between TN and PN)
        exp: (TN,PN) => joint => classifier_network_list [DNN block, Linear..] => chars prob
    blank_id : int
        The blank symbol/index.
    beam_size : int
        The width of beam. Greedy Search is used when beam_size = 1.
    nbest : int
        Number of hypotheses to keep.
    lm_module : torch.nn.ModuleList
        Neural networks modules for LM.
    lm_weight : float
        The weight of LM when performing beam search (λ).
        log P(y|x) + λ log P_LM(y). (default: 0.3)
    state_beam : float
        The threshold coefficient in log space to decide if hyps in A (process_hyps)
        is likely to compete with hyps in B (beam_hyps), if not, end the while loop.
        Reference: https://arxiv.org/pdf/1911.01629.pdf
    expand_beam : float
        The threshold coefficient to limit the number of expanded hypotheses
        that are added in A (process_hyp).
        Reference: https://arxiv.org/pdf/1911.01629.pdf
        Reference: https://github.com/kaldi-asr/kaldi/blob/master/src/decoder/simple-decoder.cc (See PruneToks)

    Example
    -------
    searcher = TransducerBeamSearcher(
        decode_network_lst=[hparams["emb"], hparams["dec"]],
        tjoint=hparams["Tjoint"],
        classifier_network=[hparams["transducer_lin"]],
        blank_id=0,
        beam_size=hparams["beam_size"],
        nbest=hparams["nbest"],
        lm_module=hparams["lm_model"],
        lm_weight=hparams["lm_weight"],
        state_beam=2.3,
        expand_beam=2.3,
    )
    >>> from speechbrain.nnet.transducer.transducer_joint import Transducer_joint
    >>> import speechbrain as sb
    >>> emb = sb.nnet.embedding.Embedding(
    ...     num_embeddings=35,
    ...     embedding_dim=3,
    ...     consider_as_one_hot=True,
    ...     blank_id=0
    ... )
    >>> dec = sb.nnet.RNN.GRU(
    ...     hidden_size=10, input_shape=(1, 40, 34), bidirectional=False
    ... )
    >>> lin = sb.nnet.linear.Linear(input_shape=(1, 40, 10), n_neurons=35)
    >>> joint_network= sb.nnet.linear.Linear(input_shape=(1, 1, 40, 35), n_neurons=35)
    >>> tjoint = Transducer_joint(joint_network, joint="sum")
    >>> searcher = TransducerBeamSearcher(
    ...     decode_network_lst=[emb, dec],
    ...     tjoint=tjoint,
    ...     classifier_network=[lin],
    ...     blank_id=0,
    ...     beam_size=1,
    ...     nbest=1,
    ...     lm_module=None,
    ...     lm_weight=0.0,
    ... )
    >>> enc = torch.rand([1, 20, 10])
    >>> hyps, _, _, _ = searcher(enc)
          N        ffffff@c                    s   t    || _|| _|| _|| _|| _|| _|| _|| _	|d u r)|dkr)t
d|	| _|
| _tjjdd| _| jdkrB| j| _d S | j| _d S )Nr   zLanguage model is not provided.dim   )super__init__decode_network_lsttjointclassifier_networkblank_id	beam_sizenbestlm	lm_weight
ValueError
state_beamexpand_beamtorchnn
LogSoftmaxsoftmaxtransducer_greedy_decodesearchertransducer_beam_search_decode)selfr   r   r   r   r   r   	lm_moduler!   r#   r$   	__class__r   r   r   e   s"   

zTransducerBeamSearcher.__init__c                 C   s   |  |}|S )z
        Arguments
        ---------
        tn_output : torch.Tensor
            Output from transcription network with shape
            [batch, time_len, hiddens].

        Returns
        -------
        Topk hypotheses
        )r*   )r,   	tn_outputhypsr   r   r   forward   s   
zTransducerBeamSearcher.forwardFc                 C   s  dd t |dD dd t |dD d}tj|ddf|jtjd| j }|du r9| || j\}}n|\}}t |dD ]}| 	|dd|ddf 
d
d|
d}	tj|	dddd	\}
}g }t |dD ]1}||  | jkr|d
 | ||   |d |  |
| 7  < || || d< || qvt|dkr| |||\}}| || j|\}}|||< | |||}qD|d
 t|d   ddf}|r|||ff7 }|S )a  Transducer greedy decoder is a greedy decoder over batch which apply Transducer rules:
            1- for each time step in the Transcription Network (TN) output:
                -> Update the ith utterance only if
                    the previous target != the new one (we save the hiddens and the target)
                -> otherwise:
                ---> keep the previous target prediction from the decoder

        Arguments
        ---------
        tn_output : torch.Tensor
            Output from transcription network with shape
            [batch, time_len, hiddens].
        hidden_state : (torch.Tensor, torch.Tensor)
            Hidden state to initially feed the decode network with. This is
            useful in conjunction with `return_hidden` to be able to perform
            beam search in a streaming context, so that you can reuse the last
            hidden state as an initial state across calls.
        return_hidden : bool
            Whether the return tuple should contain an extra 5th element with
            the hidden state at of the last step. See `hidden_state`.

        Returns
        -------
        Tuple of 4 or 5 elements (if `return_hidden`).

        First element: List[List[int]]
            List of decoded tokens

        Second element: torch.Tensor
            Outputs a logits tensor [B,T,1,Output_Dim]; padding
            has not been removed.

        Third element: None
            nbest; irrelevant for greedy decode

        Fourth element: None
            nbest scores; irrelevant for greedy decode

        Fifth element: Present if `return_hidden`, (torch.Tensor, torch.Tensor)
            Tuple representing the hidden state required to call
            `transducer_greedy_decode` where you left off in a streaming
            context.
        c                 S   s   g | ]}g qS r   r   .0_r   r   r   
<listcomp>       zCTransducerBeamSearcher.transducer_greedy_decode.<locals>.<listcomp>r   c                 S   s   g | ]}d qS )r   r   r3   r   r   r   r6      r7   )
predictionlogp_scoresr   devicedtypeNr   r8   r9   )rangesizer%   onesr;   int32r   _forward_PNr   _joint_forward_step	unsqueezemaxsqueezeitemappendlen_get_sentence_to_update_update_hiddensTensorexpmean)r,   r0   hidden_statereturn_hiddenhypinput_PNout_PNr   t_step	log_probslogp_targets	positionshave_update_hypiselected_input_PNselected_hiddenselected_out_PNretr   r   r   r)      sr   /	 

z/TransducerBeamSearcher.transducer_greedy_decodexcontextc                 C   s&   | j ||jdd\}}}}}||_|S )a  Tiny wrapper for
        :meth:`~TransducerBeamSearcher.transducer_greedy_decode` with an API
        that makes it suitable to be passed as a `decoding_function` for
        streaming.

        Arguments
        ---------
        x : torch.Tensor
            Outputs of the prediction network (equivalent to `tn_output`)
        context : TransducerGreedySearcherStreamingContext
            Mutable streaming context object, which must be specified and reused
            across calls when streaming.
            You can obtain an initial context by initializing a default object.

        Returns
        -------
        hyp : torch.Tensor
        T)rO   )r)   r   )r,   r]   r^   rP   _scoresr5   r   r   r   r   "transducer_greedy_decode_streaming  s
   z9TransducerBeamSearcher.transducer_greedy_decode_streamingc                 C   sB  g }g }t |dD ]}}tjd|jtjd| j }tjd|jtjd| j }| jgddd}| jdkr>ddi}|| |g}	t |dD ]}
|	}g }		 t	|	| j
krXnt|ttd
}t	|	dkr~t|	ttd
}|d }|d }|| j| kr~n|| |d d |d< | || j|d \}}| |||
ddf ddd|d}| jdkr| ||d \}}tj|d| j
dd\}}|d |kr|d n|d }t |dD ]j}|d dd |d ||  |d d}|| | jkr|	| | jdkr|d |d< q|| || j krL|d ||   ||d< | jdkrG||d< |d  | j|dd|| f  7  < || qqPqHt|	ttd	dd| j }g }g }|D ]}||d dd  ||d t	|d   qc|| || qdd |D tdd |D   ||fS )a.  Transducer beam search decoder is a beam search decoder over batch which apply Transducer rules:
            1- for each utterance:
                2- for each time steps in the Transcription Network (TN) output:
                    -> Do forward on PN and Joint network
                    -> Select topK <= beam
                    -> Do a while loop extending the hyps until we reach blank
                        -> otherwise:
                        --> extend hyp by the new token

        Arguments
        ---------
        tn_output : torch.Tensor
            Output from transcription network with shape
            [batch, time_len, hiddens].

        Returns
        -------
        torch.Tensor
            Outputs a logits tensor [B,T,1,Output_Dim]; padding
            has not been removed.
        r   )r   r   r:   r   N)r8   
logp_score
hidden_dec	hidden_lmr   T)keyra   r8   r   )r   r   rb   )kr   )rd   reversec                 S      g | ]}|d  qS r   r   )r4   	nbest_uttr   r   r   r6         zHTransducerBeamSearcher.transducer_beam_search_decode.<locals>.<listcomp>c                 S   rg   rh   r   )r4   nbest_utt_scorer   r   r   r6     rj   )r=   r>   r%   r?   r;   r@   r   r!   updaterH   r   rD   r   get_transducer_keyr#   removerA   r   rB   rC   _lm_forward_steptopkviewrG   r$   rF   sortedr   rK   rL   rM   )r,   r0   nbest_batchnbest_batch_scorei_batchblankrQ   rP   lm_dict	beam_hypsrS   process_hyps
a_best_hyp
b_best_hypa_best_probb_best_probrR   r   rT   log_probs_lmrc   rU   rV   	best_logpjtopk_hyp
nbest_hypsall_predictions
all_scoresr   r   r   r+   .  s   






R
z4TransducerBeamSearcher.transducer_beam_search_decodec                 C   sR   t   | ||}| || j}| |}W d   |S 1 s"w   Y  |S )zJoin predictions (TN & PN).N)r%   no_gradr   _forward_after_jointr   r(   )r,   h_irR   outrT   r   r   r   rB     s   

		z*TransducerBeamSearcher._joint_forward_stepc                 C   sR   t   | j||d\}}| |}W d   ||fS 1 s w   Y  ||fS )aV  This method should implement one step of
        forwarding operation for language model.

        Arguments
        ---------
        inp_tokens : torch.Tensor
            The input tensor of the current timestep.
        memory : No limit
            The memory variables input for this timestep.
            (e.g., RNN hidden states).

        Return
        ------
        log_probs : torch.Tensor
            Log-probabilities of the current timestep output.
        hs : No limit
            The memory variables are generated in this timestep.
            (e.g., RNN hidden states).
        )hxN)r%   r   r    r(   )r,   
inp_tokensmemorylogitshsrT   r   r   r   ro     s   

z'TransducerBeamSearcher._lm_forward_stepc                 C   s|   ||ddf }t |tr/|d dd|ddf }|d dd|ddf }||f}||fS |dd|ddf }||fS )a  Select and return the updated hiddens and output
        from the Prediction Network.

        Arguments
        ---------
        selected_sentences : list
            List of updated sentences (indexes).
        output_PN: torch.Tensor
            Output tensor from prediction network (PN).
        hidden : torch.Tensor
            Optional: None, hidden tensor to be used for
            recurrent layers in the prediction network.

        Returns
        -------
        selected_output_PN: torch.Tensor
            Outputs a logits tensor [B_selected,U, hiddens].
        hidden_update_hyp: torch.Tensor
            Selected hiddens tensor.
        Nr   r   
isinstancetuple)r,   selected_sentences	output_PNr   selected_output_PNhidden0_update_hyphidden1_update_hyphidden_update_hypr   r   r   rI     s   
z.TransducerBeamSearcher._get_sentence_to_updatec                 C   sd   t |tr%|d |d dd|ddf< |d |d dd|ddf< |S ||dd|ddf< |S )a  Update hidden tensor by a subset of hidden tensor (updated ones).

        Arguments
        ---------
        selected_sentences : list
            List of index to be updated.
        updated_hidden : torch.Tensor
            Hidden tensor of the selected sentences for update.
        hidden : torch.Tensor
            Hidden tensor to be updated.

        Returns
        -------
        torch.Tensor
            Updated hidden tensor.
        r   Nr   r   )r,   r   updated_hiddenr   r   r   r   rJ     s   
z&TransducerBeamSearcher._update_hiddensc                 C   s6   |D ]}|j jdv r|||\}}q||}q||fS )a  Compute forward-pass through a list of prediction network (PN) layers.

        Arguments
        ---------
        out_PN : torch.Tensor
            Input sequence from prediction network with shape
            [batch, target_seq_lens].
        decode_network_lst: list
            List of prediction network (PN) layers.
        hidden : torch.Tensor
            Optional: None, hidden tensor to be used for
                recurrent layers in the prediction network

        Returns
        -------
        out_PN : torch.Tensor
            Outputs a logits tensor [B,U, hiddens].
        hidden : torch.Tensor
            Hidden tensor to be used for the next step
            by recurrent layers in prediction network.
        )RNNLSTMGRULiGRULiGRU_Layer)r/   r   )r,   rR   r   r   layerr   r   r   rA   -  s
   
z"TransducerBeamSearcher._forward_PNc                 C   s   |D ]}||}q|S )a.  Compute forward-pass through a list of classifier neural network.

        Arguments
        ---------
        out : torch.Tensor
            Output from joint network with shape
            [batch, target_len, time_len, hiddens]
        classifier_network : list
            List of output layers (after performing joint between TN and PN)
            exp: (TN,PN) => joint => classifier_network_list [DNN block, Linear..] => chars prob

        Returns
        -------
        torch.Tensor
            Outputs a logits tensor [B, U,T, Output_Dim];
        r   )r,   r   r   r   r   r   r   r   Q  s   
z+TransducerBeamSearcher._forward_after_joint)r   r   Nr   r   r   )NF)N)r   r	   r
   r   r   r2   r)   r%   rK   r   r`   r+   rB   ro   rI   rJ   rA   r   __classcell__r   r   r.   r   r      s2    Q#
{
   
$r   c                 C   s   | d t | d  }|S )a  Argument function to customize the sort order (in sorted & max).
    To be used as `key=partial(get_transducer_key)`.

    Arguments
    ---------
    x : dict
        one of the items under comparison

    Returns
    -------
    float
        Normalized log-score.
    ra   r8   )rH   )r]   logp_keyr   r   r   rm   h  s   rm   )r   dataclassesr   	functoolsr   typingr   r   r%   r&   Moduler   r   rm   r   r   r   r   <module>   s    	    S