o
    ir                     @   s   d Z ddlZddlmZ ddlmZmZmZmZm	Z	m
Z
 ddlZddlZddlmZ ddlmZ ddlmZ ddlmZmZmZmZ eG d	d
 d
ZeG dd deZG dd dZdS )z(Search algorithms for Transducer models.    N)	dataclass)AnyDictListOptionalTupleUnion)
AbsDecoder)JointNetwork)TransformerLM)	is_prefixrecombine_hypsselect_k_expansionssubtractc                   @   st   e Zd ZU dZeed< ee ed< ee	e
jee
j f eee
j  e
jf ed< dZeeeef ee f ed< dS )
Hypothesisz?Default hypothesis definition for Transducer search algorithms.scoreyseq	dec_stateNlm_state)__name__
__module____qualname____doc__float__annotations__r   intr   r   torchTensorr   r   r   strr    r   r   a/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/asr/transducer/beam_search_transducer.pyr      s   
 $r   c                   @   s2   e Zd ZU dZdZeej ed< dZ	ejed< dS )ExtendedHypothesisz<Extended hypothesis definition for NSC beam search and mAES.Ndec_out	lm_scores)
r   r   r   r   r"   r   r   r   r   r#   r   r   r   r    r!   #   s   
 r!   c                !   @   s  e Zd ZdZ													d1d
edededejj	de
dedededededededededeee  fddZdejdeee ee f fddZdeee ee f deee ee f fd d!Zdee d"ejdee fd#d$Zdejdee fd%d&Zdejdee fd'd(Zdejdee fd)d*Zdejdee fd+d,Zdejdee fd-d.Zdejdee fd/d0ZdS )2BeamSearchTransducerz*Beam search implementation for Transducer.N皙?default   2      ffffff@Tdecoderjoint_network	beam_sizelm	lm_weightsearch_typemax_sym_expu_maxnstepprefix_alphaexpansion_gammaexpansion_beta
score_normnbest
token_listc                 C   sn  || _ || _|| _|j| _|j| _| jd | _|| _|j	| _	| jdkr(| j
| _n||dkr1| j| _ns|dkrDt|tr<t|| _| j| _n`|dkrWt|trOt|| _| j| _nM|dkrmt|trbt|	| _|
| _| j| _n7|dkrt|trxt|	dkr~|	nd| _|
| _|| _| j|| ksJ d||| jf || | _| j| _nt|d	u| _|| _|| _|| _|| _d	S )
a  Initialize Transducer search module.

        Args:
            decoder: Decoder module.
            joint_network: Joint network module.
            beam_size: Beam size.
            lm: LM class.
            lm_weight: LM weight for soft fusion.
            search_type: Search algorithm to use during inference.
            max_sym_exp: Number of maximum symbol expansions at each time step. (TSD)
            u_max: Maximum output sequence length. (ALSD)
            nstep: Number of maximum expansion steps at each time step. (NSC/mAES)
            prefix_alpha: Maximum prefix length in prefix search. (NSC/mAES)
            expansion_beta:
              Number of additional candidates for expanded hypotheses selection. (mAES)
            expansion_gamma: Allowed logp difference for prune-by-value method. (mAES)
            score_norm: Normalize final scores by length. ("default")
            nbest: Number of final hypothesis.

        r)   r&   tsdalsdnscmaesr'   zXbeam_size (%d) + expansion_beta (%d) should be smaller or equal to vocabulary size (%d).N)r+   r,   r-   dunitshidden_sizeodim
vocab_sizesosr9   blank_idgreedy_searchsearch_algorithmdefault_beam_search
isinstancer   NotImplementedErrorr1   time_sync_decodingr2   align_length_sync_decodingr3   r4   nsc_beam_searchr5   max_candidates"modified_adaptive_expansion_searchuse_lmr.   r/   r7   r8   )selfr+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r   r   r    __init__.   s^   &














zBeamSearchTransducer.__init__enc_outreturnc                 C   s   | j |j | |}|S )zPerform beam search.

        Args:
            enc_out: Encoder output sequence. (T, D_enc)

        Returns:
            nbest_hyps: N-best decoding results

        )r+   
set_devicedevicerE   )rO   rQ   
nbest_hypsr   r   r    __call__   s   
zBeamSearchTransducer.__call__hypsc                 C   s:   | j r|jdd dd n	|jdd dd |d| j S )zSort hypotheses by score or score given sequence length.

        Args:
            hyps: Hypothesis.

        Return:
            hyps: Sorted hypothesis.

        c                 S   s   | j t| j S N)r   lenr   xr   r   r    <lambda>   s    z1BeamSearchTransducer.sort_nbest.<locals>.<lambda>Tkeyreversec                 S      | j S rX   r   rZ   r   r   r    r\          N)r7   sortr8   )rO   rW   r   r   r    
sort_nbest   s   zBeamSearchTransducer.sort_nbest	enc_out_tc              	   C   s   t |dd D ]n\}}||d d D ]a}t|j}t|j}t|j|jru|| | jkrutj| ||jd dd}|j	t
||j|   }	t||d D ]}
tj| ||j|
 dd}|	t
||j|
d   7 }	qOt|j	|	|_	qq|S )zjPrefix search for NSC and mAES strategies.

        Based on https://arxiv.org/pdf/1211.3711.pdf

        Nr)   dim)	enumeraterY   r   r   r4   r   log_softmaxr,   r"   r   r   rangenp	logaddexp)rO   rW   re   jhyp_jhyp_icurr_idpref_idlogp
curr_scorekr   r   r    prefix_search   s,   

z"BeamSearchTransducer.prefix_searchc                 C   s   | j d}td| jg|d}i }| j ||\}}}|D ]9}tj| ||dd}	tj|	dd\}
}|| jkrV|j	
t| | jt|
7  _||_| j ||\}}}q|gS )zGreedy search implementation.

        Args:
            enc_out: Encoder output sequence. (T, D_enc)

        Returns:
            hyp: 1-best hypotheses.

        r)           r   r   r   rf   rg   )r+   
init_stater   rC   r   r   rj   r,   maxr   appendr   r   r   )rO   rQ   r   hypcacher"   state_re   rs   top_logppredr   r   r    rD      s"   


z"BeamSearchTransducer.greedy_searchc                    s\  t jj}t |jd }jd}tdjg|dg}i }i }|D ]}|}	g }jdurIt	dd
fddt|	dd	 d
dD   	 t|	dd	 d}
|	|
 j|
|\}}}tj||dd}|dd j|dd}|t|
jt|dd  |
jdd |
j|
jd jrt|
j|vrjtjjg|
jdd  jjd|
jd\}}||f|t|
j< n|t|
j \}}n|
j}t| D ]0\}}|
jt| }jr|j||d   7 }|	t||
jdd t |d g ||d qtt|	dd	 dj t fdd|D dd	 d}t!||kr'|}nqJq%"|S )zBeam search implementation.

        Modified from https://arxiv.org/pdf/1211.3711.pdf

        Args:
            enc_out: Encoder output sequence. (T, D)

        Returns:
            nbest_hyps: N-best hypothesis.

        r)   rw   rx   N
c              	      sH   g | ] }d d  fdd|jdd D  dtt|jd  qS )zhypo:  c                    s   g | ]} j | qS r   )r9   .0r[   rO   r   r    
<listcomp>      zGBeamSearchTransducer.default_beam_search.<locals>.<listcomp>.<listcomp>r)   Nz	, score: r'   )joinr   roundr   r   r   r|   r   r   r    r     s     z<BeamSearchTransducer.default_beam_search.<locals>.<listcomp>c                 S   r`   rX   ra   rZ   r   r   r    r\      rb   z:BeamSearchTransducer.default_beam_search.<locals>.<lambda>Tr]   c                 S   r`   rX   ra   rZ   r   r   r    r\   &  rb   )r^   rf   rg   r   r   r   r   r   )rT   c                 S   r`   rX   ra   rZ   r   r   r    r\   Y  rb   c                    s   g | ]	}|j  kr|qS r   ra   r   )hyps_maxr   r    r   [  s    c                 S   r`   rX   ra   rZ   r   r   r    r\   \  rb   )#minr-   rA   r+   ry   r   rC   r9   loggingdebugr   sortedrz   remover   r   rj   r,   topkr{   r   r   r   r   rN   tupler.   
LongTensorrB   rT   zipr/   r   rY   rd   )rO   rQ   beambeam_kr   	kept_hypsr}   cache_lmre   rW   max_hypr"   r~   	lm_tokensrs   top_kr#   r   ru   r   kept_most_probr   )r   rO   r    rF      s   




		;
z(BeamSearchTransducer.default_beam_searchc                 C   s  t | j| j}| j|}t| jgd| j|ddg}i }| jr)| j	
 |d _|D ]}g }|}|d}t| jD ]}	g }
| j|||| j\}}}tj| ||dd}|ddddf j|dd}dd	 |D }t|D ]@\}}|j|vr|t|jt||df  |jdd |j|jd
 qr||j}t|| j|jt||df  || _qr|	| jd k r| jr| j	|dd	 |D d\}}t|D ]M\}}t|d | |d | d D ]9\}}t|jt| |jt|g | j|||jd
}| jr| j| j|||f  7  _|| |_|
| qqt |
dd ddd| }q<t |dd ddd| }q+| !|S )zTime synchronous beam search implementation.

        Based on https://ieeexplore.ieee.org/document/9053040

        Args:
            enc_out: Encoder output sequence. (T, D)

        Returns:
            nbest_hyps: N-best hypothesis.

        rw   r   r   r   r   rf   rg   Nr)   c                 S      g | ]}|j qS r   r   r   hr   r   r    r         z;BeamSearchTransducer.time_sync_decoding.<locals>.<listcomp>r   c                 S   r   r   r   )r   cr   r   r    r     r   c                 S   r`   rX   ra   rZ   r   r   r    r\     rb   z9BeamSearchTransducer.time_sync_decoding.<locals>.<lambda>Tr]   c                 S   r`   rX   ra   rZ   r   r   r    r\     rb   )"r   r-   rA   r+   ry   r   rC   select_staterN   r.   
zero_stater   	unsqueezerk   r1   batch_scorer   rj   r,   r   ri   r   r{   r   r   r   indexrl   rm   r   r   r/   r   rd   )rO   rQ   r   
beam_stateBr}   re   ACvDbeam_dec_outbeam_lm_tokens	beam_logp	beam_topkseq_Air|   dict_posbeam_lm_scoresbeam_lm_statesrs   ru   new_hypr   r   r    rI   d  s   




	&

z'BeamSearchTransducer.time_sync_decodingc              
   C   s  t | j| j}t|d}t | j|d }| j|}t| j	gd| j
|ddg}g }i }| jr:| j |d _t|| D ]}	g }
g }g }|D ]"}t|jd }|	| }||d kr_qK|| |||| f qK|rG| j|||| j\}}}tdd |D }tj| ||dd}|d	d	dd	f j|dd}| jr| j|d
d |D d	\}}t|D ]}\}	}t|jt||	df  |jd	d	 |j|jd}|
| ||	 d |d kr|| t|d |	 |d |	 d D ]=\}}t|jt| |jd	d	 t|g | j
||	|jd}| jr/| j| j||	|f  7  _||	 |_|
| qqt|
dd ddd	| }t |}q@|rP| !|S |S )zAlignment-length synchronous beam search implementation.

        Based on https://ieeexplore.ieee.org/document/9053040

        Args:
            h: Encoder output sequences. (T, D)

        Returns:
            nbest_hyps: N-best hypothesis.

        r   r)   rw   r   c                 S   s   g | ]}|d  qS )r)   r   r   r   r   r    r     s    zCBeamSearchTransducer.align_length_sync_decoding.<locals>.<listcomp>rf   rg   Nc                 S   r   r   r   )r   br   r   r    r     r   r   c                 S   r`   rX   ra   rZ   r   r   r    r\   %  rb   zABeamSearchTransducer.align_length_sync_decoding.<locals>.<lambda>Tr]   )"r   r-   rA   r   sizer2   r+   ry   r   rC   r   rN   r.   r   r   rk   rY   r   r{   r   r   stackrj   r,   r   ri   r   r   r   r   r/   r   r   rd   )rO   rQ   r   t_maxr2   r   r   finalr}   r   r   B_	B_enc_outr|   utr   r   beam_enc_outr   r   r   r   r   rs   ru   r   r   r    rJ     s   




&

z/BeamSearchTransducer.align_length_sync_decodingc                 C   s  t | j| j}t || jd }| j|}t| jgd| j|ddg}i }| j|||| j	\}}}| j|d}	| j	rU| j
|dd |D d\}
}|d }|
d }nd}d}t| jgd|	|d g||dg}|D ]}| t|d	d
 dd|}g }|d}g }g }t| jD ]_}tdd |D }tj| ||dd}|ddddf j|dd}t|D ]s\}}|t|jdd |jt||ddf  |jdd |j|j|jd t|d | |d | d D ]:\}}|jt| }| j	r	|| jt|j|  7 }|t|jdd t |g ||jdd |j|j|jd qq|j!dd
 dd t"||d| }| j#|dd |D dd |D }| j|||| j	\}}}| j	rn| j
|dd |D d\}
}|| jd k rt|D ]#\}}|j||  | j|||_| j	r|| |_|
| |_qz|dd }qtj| ||dd}t|D ]6\}}| jdkr| jt||df 7  _|j||  | j|||_| j	r|| |_|
| |_qqt|| dd
 ddd| }qk| $|S )aw  N-step constrained beam search implementation.

        Based on/Modified from https://arxiv.org/pdf/2002.03577.pdf.
        Please reference ESPnet (b-flo, PR #2444) for any usage outside ESPnet
        until further modifications.

        Args:
            enc_out: Encoder output sequence. (T, D_enc)

        Returns:
            nbest_hyps: N-best hypothesis.

        r)   rw   r   r   c                 S   r   r   r   r   r   r   r   r    r   V  r   z8BeamSearchTransducer.nsc_beam_search.<locals>.<listcomp>Nr   r   r   r"   r   r#   c                 S   
   t | jS rX   rY   r   rZ   r   r   r    r\   l     
 z6BeamSearchTransducer.nsc_beam_search.<locals>.<lambda>Tr]   c                 S      g | ]}|j d  qS rf   r"   r   r   r   r    r   v  r   rf   rg   r   r   r"   r   r   r#   c                 S   r`   rX   ra   rZ   r   r   r    r\     rb   c                 S   r   r   r   r   r   r   r   r    r     r   c                 S   r   r   r   r   r   r   r    r     r   c                 S   r   r   r   r   r   r   r    r     r   c                 S   r`   rX   ra   rZ   r   r   r    r\     rb   )%r   r-   rA   r+   ry   r!   rC   r   r   rN   r.   rv   r   r   rk   r3   r   r   rj   r,   r   ri   r{   r   r   r   r"   r   r   r#   r   r/   r   rc   r   create_batch_statesrd   )rO   rQ   r   r   r   init_tokensr}   r   r   r~   r   r   r   r#   r   re   rW   r   SVnr   r   r   r|   rs   ru   r   r   r   r   r    rK   -  s   




&





 
z$BeamSearchTransducer.nsc_beam_searchc                 C   s  t | j| j}| j|}t| jgd| j|ddg}i }| j|||| j	\}}}| j|d}| j	rM| j
|dd |D d\}	}
|
d }|	d }nd}d}t| jgd||d g||dg}|D ]w}| t|dd	 d
d|}g }|d}g }dd |D }t| jD ]Q}tdd |D }tj| ||ddj| jdd\}}t|||| j}g }t|D ]Y\}}|| D ]P\}}t|jdd ||jdd |j|j|jd}|dkr|| q|jt|g |vr|jt| | j	r	| j | j!t"|j|  7  _ || qq|s"t|dd	 d
dd| } n| j#|dd |D dd |D }| j|||| j	\}}}| j	rT| j
|dd |D d\}	}
|| jd k rt|D ]#\}}|j||  | j|||_| j	r|
| |_|	| |_q`|dd }qtj| ||dd}t|D ]0\}}| j t"||df 7  _ |j||  | j|||_| j	r|
| |_$|	| |_qt|| dd	 d
dd| }qqc| %|S )a'  It's the modified Adaptive Expansion Search (mAES) implementation.

        Based on/modified from https://ieeexplore.ieee.org/document/9250505 and NSC.

        Args:
            enc_out: Encoder output sequence. (T, D_enc)

        Returns:
            nbest_hyps: N-best hypothesis.

        rw   r   r   c                 S   r   r   r   r   r   r   r    r     r   zKBeamSearchTransducer.modified_adaptive_expansion_search.<locals>.<listcomp>Nr   c                 S   r   rX   r   rZ   r   r   r    r\     r   zIBeamSearchTransducer.modified_adaptive_expansion_search.<locals>.<lambda>Tr]   c                 S   r   r   r   r   r   r   r    r     r   c                 S   r   r   r   r   r   r   r    r     r   rf   rg   r   c                 S   r`   rX   ra   rZ   r   r   r    r\   ?  rb   c                 S   r   r   r   r   r   r   r    r   G  r   c                 S   r   r   r   r   r   r   r    r   H  r   c                 S   r   r   r   )r   ru   r   r   r    r   T  r   r)   c                 S   r`   rX   ra   rZ   r   r   r    r\   r  rb   )&r   r-   rA   r+   ry   r!   rC   r   r   rN   r.   rv   r   r   rk   r3   r   r   rj   r,   r   rL   r   r5   ri   r   r"   r   r   r#   r{   r   r   r/   r   r   	lm_statesrd   )rO   rQ   r   r   r   r}   r   r   r~   r   r   r   r#   r   re   rW   r   list_bduplication_checkr   r   beam_idxk_expansionslist_expr   r|   ru   	new_scorer   r   r   r    rM     s   





	








z7BeamSearchTransducer.modified_adaptive_expansion_search)Nr%   r&   r'   r(   r)   r)   r*   r'   Tr)   N)r   r   r   r   r	   r
   r   r   nnModuler   r   boolr   r   rP   r   r   r   r!   rV   rd   rv   rD   rF   rI   rJ   rK   rM   r   r   r   r    r$   +   s    	


f


$"e`i $r$   )r   r   dataclassesr   typingr   r   r   r   r   r   numpyrl   r   espnet2.asr.decoder.abs_decoderr	   $espnet2.asr_transducer.joint_networkr
   espnet2.lm.transformer_lmr   ,espnet.nets.pytorch_backend.transducer.utilsr   r   r   r   r   r!   r$   r   r   r   r    <module>   s     