o
    i#Z                     @   s   d Z ddlmZ ddlmZmZmZmZmZm	Z	 ddl
ZddlZddlmZ ddlmZ eG dd dZeG d	d
 d
eZG dd dZdS )z(Search algorithms for Transducer models.    )	dataclass)AnyDictListOptionalTupleUnionN)
AbsDecoder)JointNetworkc                   @   sj   e Zd ZU dZeed< ee ed< dZe	e
eje	ej f  ed< dZe	eeeef ee f  ed< dS )
Hypothesisaf  Default hypothesis definition for Transducer search algorithms.

    Args:
        score: Total log-probability.
        yseq: Label sequence as integer ID sequence.
        dec_state: RNNDecoder or StatelessDecoder state.
                     ((N, 1, D_dec), (N, 1, D_dec) or None) or None
        lm_state: RNNLM state. ((N, D_lm), (N, D_lm)) or None

    scoreyseqN	dec_statelm_state)__name__
__module____qualname____doc__float__annotations__r   intr   r   r   torchTensorr   r   r   strr    r   r   a/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/asr_transducer/beam_search_transducer.pyr      s   
  (r   c                   @   s.   e Zd ZU dZdZejed< dZejed< dS )ExtendedHypothesiszExtended hypothesis definition for NSC beam search and mAES.

    Args:
        : Hypothesis dataclass arguments.
        dec_out: Decoder output sequence. (B, D_dec)
        lm_score: Log-probabilities of the LM for given label. (vocab_size)

    Ndec_outlm_score)	r   r   r   r   r   r   r   r   r   r   r   r   r   r       s   
 	r   c                       s~  e Zd ZdZ											
		d7dedededeej	j
 dededededededededededdf fddZ	d8dejdedee fdd Zd9d!d"Zd#ee dee fd$d%Zd#ee dee fd&d'Zd#ee d(ejd)ejdee fd*d+Zd,eee  dejfd-d.Zdejdee fd/d0Zdejdee fd1d2Zdejdee fd3d4Zdejdee fd5d6Z  ZS ):BeamSearchTransducerar  Beam search implementation for Transducer.

    Args:
        decoder: Decoder module.
        joint_network: Joint network module.
        beam_size: Size of the beam.
        lm: LM class.
        lm_weight: LM weight for soft fusion.
        search_type: Search algorithm to use during inference.
        max_sym_exp: Number of maximum symbol expansions at each time step. (TSD)
        u_max: Maximum expected target sequence length. (ALSD)
        nstep: Number of maximum expansion steps at each time step. (mAES)
        expansion_gamma: Allowed logp difference for prune-by-value method. (mAES)
        expansion_beta:
             Number of additional candidates for expanded hypotheses selection. (mAES)
        score_norm: Normalize final scores by length.
        nbest: Number of final hypothesis.
        streaming: Whether to perform chunk-by-chunk beam search.

    N皙?default   2      ffffff@F   decoderjoint_network	beam_sizelm	lm_weightsearch_typemax_sym_expu_maxnstepexpansion_gammaexpansion_beta
score_normnbest	streamingreturnc                    sZ  t    || _|| _|j| _|| jksJ d|| jf || _|dkr)| j| _n[|dkr?|dks7J d| || _| j	| _nE|dkrY|rIJ d|dksQJ d	|| _
| j| _n+|d
kr~| j|| ksnJ d||| jf || | _|	| _|
| _| j| _ntd| |du| _| jrt|dsJ d| jd | _|| _|| _|| _|| _|   dS )z(Construct a BeamSearchTransducer object.zGbeam_size (%d) should be smaller than or equal to vocabulary size (%d).r!   tsdr&   z,max_sym_exp (%d) should be greater than one.alsdz(ALSD is not available in streaming mode.r   z7u_max should be a positive integer, a portion of max_T.maeszYbeam_size (%d) + expansion_beta (%d)  should be smaller than or equal to vocab size (%d).z,Specified search type (%s) is not supported.Nrnn_typez*Transformer LM is currently not supported.)super__init__r'   r(   
vocab_sizer)   default_beam_searchsearch_algorithmr-   time_sync_decodingr.   align_length_sync_decodingmax_candidatesr/   r0   "modified_adaptive_expansion_searchNotImplementedErroruse_lmhasattrsosr*   r+   r2   r3   reset_inference_cache)selfr'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   	__class__r   r   r;   E   s^   







zBeamSearchTransducer.__init__Tenc_outis_finalc                 C   s8   | j |j | |}|r|   | |S || _|S )zPerform beam search.

        Args:
            enc_out: Encoder output sequence. (T, D_enc)
            is_final: Whether enc_out is the final chunk of data.

        Returns:
            nbest_hyps: N-best decoding results

        )r'   
set_devicedevicer>   rG   
sort_nbestsearch_cache)rH   rK   rL   hypsr   r   r   __call__   s   

zBeamSearchTransducer.__call__c                 C   s   i | j _d| _dS )z.Reset cache for decoder scoring and streaming.N)r'   score_cacherP   )rH   r   r   r   rG      s   
z*BeamSearchTransducer.reset_inference_cacherQ   c                 C   s:   | j r|jdd dd n	|jdd dd |d| j S )zSort in-place hypotheses by score or score given sequence length.

        Args:
            hyps: Hypothesis.

        Return:
            hyps: Sorted hypothesis.

        c                 S   s   | j t| j S N)r   lenr   xr   r   r   <lambda>   s    z1BeamSearchTransducer.sort_nbest.<locals>.<lambda>Tkeyreversec                 S      | j S rT   r   rV   r   r   r   rX          N)r2   sortr3   )rH   rQ   r   r   r   rO      s   
zBeamSearchTransducer.sort_nbestc                 C   sX   i }|D ]!}d tt|j}||v r!t|| j|j|| _q|||< qg | S )zRecombine hypotheses with same label ID sequence.

        Args:
            hyps: Hypotheses.

        Returns:
            final: Recombined hypotheses.

        _)joinmapr   r   np	logaddexpr   values)rH   rQ   finalhypstr_yseqr   r   r   recombine_hyps   s   

z#BeamSearchTransducer.recombine_hypstopk_idx	topk_logpc                    sv   g }t |D ]2\}  fddt|| || D }t|dd dd |ttfdd|dd d	d
 q|S )a  Return K hypotheses candidates for expansion from a list of hypothesis.

        K candidates are selected according to the extended hypotheses probabilities
        and a prune-by-value method. Where K is equal to beam_size + beta.

        Args:
            hyps: Hypotheses.
            topk_idx: Indices of candidates hypothesis.
            topk_logp: Log-probabilities of candidates hypothesis.

        Returns:
            k_expansions: Best K expansion hypotheses candidates.

        c                    s&   g | ]\}}t | jt| fqS r   )r   r   r   ).0kv)rg   r   r   
<listcomp>   s    z<BeamSearchTransducer.select_k_expansions.<locals>.<listcomp>c                 S      | d S Nr&   r   rV   r   r   r   rX          z:BeamSearchTransducer.select_k_expansions.<locals>.<lambda>rZ   r&   c                    s    j  | d kS rq   )r0   rV   )
k_best_exprH   r   r   rX          c                 S   rp   rq   r   rV   r   r   r   rX     rr   TrY   )	enumeratezipmaxappendsortedfilter)rH   rQ   rj   rk   k_expansionsihyp_ir   )rg   rt   rH   r   select_k_expansions   s    

z(BeamSearchTransducer.select_k_expansionshyps_seqc                    s4   t dd |D  tj fdd|D jjdS )zMake batch of inputs with left padding for LM scoring.

        Args:
            hyps_seq: Hypothesis sequences.

        Returns:
            : Padded batch of sequences.

        c                 S   s   g | ]}t |qS r   )rU   rl   hr   r   r   ro         z?BeamSearchTransducer.create_lm_batch_inputs.<locals>.<listcomp>c                    s2   g | ]}j gd g t|   |dd  qS )r   r&   N)rF   rU   r   max_lenrH   r   r   ro     s   2 rN   )rx   r   
LongTensorr'   rN   )rH   r   r   r   r   create_lm_batch_inputs  s
   
z+BeamSearchTransducer.create_lm_batch_inputsc              
      s  t | j| jd }t|}| jdur| j}ntddg| jddg}t|D ]}|}g }	 t	|dd d	}|
| tjd
|jd tj| jjd}| j||j|j\}	}
tj| |||d ddf |	ddd}|dd j|dd}|t|jt|dd  |j|j|jd | jr| jtj| jg|jdd  | jjd|jd\}}n|j}t| D ],\}}|jt| }| jr|| j||d   7 }|t||jt|d g |
|d qtt	|dd d	j t  fdd|D dd d	}t|| jkr|}nq.q'|S )zBeam search implementation without prefix search.

        Modified from https://arxiv.org/pdf/1211.3711.pdf

        Args:
            enc_out: Encoder output sequence. (T, D)

        Returns:
            nbest_hyps: N-best hypothesis.

        r&   N        r   )r   r   r   Tc                 S   r\   rT   r]   rV   r   r   r   rX   8  r^   z:BeamSearchTransducer.default_beam_search.<locals>.<lambda>rs   )r&   r&   )dtyperN   dimr   r   r   r   r   c                 S   r\   rT   r]   rV   r   r   r   rX   p  r^   c                    s   g | ]	}|j  kr|qS r   r]   )rl   rg   hyps_maxr   r   ro   r  s    z<BeamSearchTransducer.default_beam_search.<locals>.<listcomp>c                 S   r\   rT   r]   rV   r   r   r   rX   s  r^   )!minr)   r<   rU   rP   r   r'   
init_staterangerx   remover   fullr   longrN   r   r   log_softmaxr(   squeezetopkry   r   r   rD   r*   r   rF   rw   r+   r   rz   )rH   rK   beam_kmax_t	kept_hypstrQ   max_hyplabelr   statelogptop_k	lm_scoresr   rm   r   kept_most_probr   r   r   r=     s   


	
	@z(BeamSearchTransducer.default_beam_searchc              
   C   s  t |d}t| j|d }tdgd| jddg}g }| jr)| j	 |d _
t|| D ]}g }g }g }	|D ]"}
t|
jd }|| }||d krNq:||
 |	||| f q:|r<tdd |	D }| j|\}}tj| ||dd}|d	d	dd	f j| jdd}| jr| j| d
d |D dd |D d	\}}t|D ]}\}}
t|
jt||df  |
jd	d	 |
j|
j
d}|| |	| d |d kr|| t|d | |d | d D ]=\}}t|
jt| |
jd	d	 t |g | j|||
j
d}| jr"| j| j|||f  7  _|| |_
|| qqt|dd ddd	| j }| |}q/|rB|S |S )zAlignment-length synchronous beam search implementation.

        Based on https://ieeexplore.ieee.org/document/9053040

        Args:
            h: Encoder output sequences. (T, D)

        Returns:
            nbest_hyps: N-best hypothesis.

        r   r&   r   r   r   r   c                 S   s   g | ]}|d  qS )r&   r   rl   br   r   r   ro     r   zCBeamSearchTransducer.align_length_sync_decoding.<locals>.<listcomp>r   r   Nc                 S      g | ]}|j qS r   r   r   r   r   r   ro     ru   c                 S   r   r   r   r   r   r   r   ro     ru   r   c                 S   r\   rT   r]   rV   r   r   r   rX     r^   zABeamSearchTransducer.align_length_sync_decoding.<locals>.<lambda>TrY   ) r   sizer   r.   r   r'   r   rD   r*   
zero_stater   r   rU   r   ry   r   stackbatch_scorer   r(   r   r)   r   rv   r   r   r   rw   select_stater+   rz   ri   )rH   rK   t_maxr.   Brf   r}   AB_	B_enc_outrg   ur   beam_enc_outbeam_dec_out
beam_state	beam_logp	beam_topkbeam_lm_scoresbeam_lm_statesnew_hypr   rm   r   r   r   r@   {  sv   

 

&

z/BeamSearchTransducer.align_length_sync_decodingc                 C   sv  | j dur	| j }ntdgd| jddg}| jr!| j |d _|D ]}g }|}|d}t	| j
D ]}g }| j|\}}	tj| ||dd}
|
ddddf j| jdd}dd	 |D }t|D ]@\}}|j|vr|t|jt|
|df  |jdd |j|jd
 qf||j}t|| j|jt|
|df  || _qf|| j
d k r| jr| j| dd	 |D dd	 |D d\}}t|D ]M\}}t|d | |d | d D ]9\}}t|jt| |jt|g | j|	||jd
}| jr| j| j|||f  7  _|| |_|| qqt|dd ddd| j }q4t|dd ddd| j }q#|S )zTime synchronous beam search implementation.

        Based on https://ieeexplore.ieee.org/document/9053040

        Args:
            enc_out: Encoder output sequence. (T, D)

        Returns:
            nbest_hyps: N-best hypothesis.

        Nr   r   r&   r   r   r   c                 S   r   r   r   r   r   r   r   ro     ru   z;BeamSearchTransducer.time_sync_decoding.<locals>.<listcomp>r   c                 S   r   r   r   rl   cr   r   r   ro     ru   c                 S   r   r   r   r   r   r   r   ro     ru   c                 S   r\   rT   r]   rV   r   r   r   rX   +  r^   z9BeamSearchTransducer.time_sync_decoding.<locals>.<lambda>TrY   c                 S   r\   rT   r]   rV   r   r   r   rX   -  r^   ) rP   r   r'   r   rD   r*   r   r   	unsqueezer   r-   r   r   r   r(   r   r)   rv   r   ry   r   r   r   indexrc   rd   r   rw   r   r   r+   rz   )rH   rK   r   	enc_out_tr   Crn   Dr   r   r   r   seq_Ar}   rg   dict_posr   r   r   rm   r   r   r   r   r?     sx   




 
	&
z'BeamSearchTransducer.time_sync_decodingc                 C   sH  | j dur	| j }nOtdgd| jddg}| j|\}}| jrA| j| dd |D dd |D d\}}|d }|d }	nd}d}	tdgd| j|d|d ||	d	g}|D ]F}
|}g }|
	d}g }t
| jD ]2}td
d |D }tj| ||ddj| jdd\}}| |||}g }t|D ]I\}}|| D ]@\}}t|jdd ||j|j|j|jd}|dkr|| q|jt| | jr| j| jt|j|  7  _|| qq|st| |dd ddd| j  } n| j|\}}| jr| j| dd |D dd |D d\}}|| jd k rPt|D ] \}}|| |_| j|||_| jrG|| |_|| |_q(|dd }qmtj| ||dd}t|D ]-\}}| jt||df 7  _|| |_| j|||_| jr|| |_|| |_q_t| || dd ddd| j  }qmqZ|S )aF  Modified version of Adaptive Expansion Search (mAES).

        Based on AES (https://ieeexplore.ieee.org/document/9250505) and
                 NSC (https://arxiv.org/abs/2201.05420).

        Args:
            enc_out: Encoder output sequence. (T, D_enc)

        Returns:
            nbest_hyps: N-best hypothesis.

        Nr   r   r&   r   c                 S   r   r   r   r   r   r   r   ro   R  ru   zKBeamSearchTransducer.modified_adaptive_expansion_search.<locals>.<listcomp>c                 S   r   r   r   r   r   r   r   ro   S  ru   )r   r   r   r   r   r   c                 S   r   r   )r   r   r   r   r   ro   p  ru   r   r   )r   r   r   r   r   r   c                 S   r\   rT   r]   rV   r   r   r   rX     r^   zIBeamSearchTransducer.modified_adaptive_expansion_search.<locals>.<lambda>TrY   c                 S   r   r   r   r   r   r   r   ro     ru   c                 S   r   r   r   r   r   r   r   ro     ru   c                 S   r\   rT   r]   rV   r   r   r   rX     r^   )!rP   r   r'   r   r   rD   r*   r   r   r   r   r/   r   r   r   r(   r   rA   r   rv   r   r   r   r   r   ry   r   r   r+   r   rz   ri   r)   )rH   rK   r   init_tokensr   r   r   r   r   r   r   rQ   r   list_bnr   beam_idxr|   list_expr}   rg   rm   	new_scorer   r   r   r   rB   1  s   






	






z7BeamSearchTransducer.modified_adaptive_expansion_search)Nr    r!   r"   r#   r$   r%   r$   Fr&   F)T)r5   N)r   r   r   r   r	   r
   r   r   r   nnModuler   r   boolr;   r   r   r   rR   rG   rO   ri   r   r   r   r=   r@   r?   rB   __classcell__r   r   rI   r   r   /   s    
	
U


)b
[[r   )r   dataclassesr   typingr   r   r   r   r   r   numpyrc   r   *espnet2.asr_transducer.decoder.abs_decoderr	   $espnet2.asr_transducer.joint_networkr
   r   r   r   r   r   r   r   <module>   s     