o
    ix                     @   s   d Z ddlZddlmZmZ ddlZddlZddlm	Z	 ddl
mZ ddlmZ ddlmZmZmZmZmZmZmZ ddlmZmZ G d	d
 d
ZdS )z(Search algorithms for Transducer models.    N)ListUnion)CustomDecoder)JointNetwork)
RNNDecoder)create_lm_batch_statesinit_lm_state	is_prefixrecombine_hypsselect_k_expansionsselect_lm_statesubtract)ExtendedHypothesis
Hypothesisc                !   @   s  e Zd ZdZ												
		d4deeef dedede	j
jdedededededededededededef ddZde	jdeee ee f fd d!Zd"eee ee f deee ee f fd#d$Zd"ee d%e	jdee fd&d'Zde	jdee fd(d)Zde	jdee fd*d+Zde	jdee fd,d-Zde	jdee fd.d/Zde	jdee fd0d1Zde	jdee fd2d3ZdS )5BeamSearchTransducerz*Beam search implementation for Transducer.N皙?default   2      ffffff@T      ?Fdecoderjoint_network	beam_sizelm	lm_weightsearch_typemax_sym_expu_maxnstepprefix_alphaexpansion_gammaexpansion_beta
score_normsoftmax_temperaturenbestquantizationc                 C   s  || _ || _|| _|j| _|j| _|j| _| jdkr| j| _	n`|dkr(| j
| _	nW|dkr4|| _| j| _	nK|dkr@|| _| j| _	n?|dkrO|	| _|
| _| j| _	n0|dkr}|	dkrY|	nd| _|
| _|| _| j|| kssJ d||| jf || | _| j| _	nt|d	urd
| _|| _t|jdrd
nd| _| jr|jjn|j| _t| jj| _|| _ nd| _|dkr|d	urt!"d d| _#n|| _#|| _$|| _%|| _&d	S )a=  Initialize Transducer search module.

        Args:
            decoder: Decoder module.
            joint_network: Joint network module.
            beam_size: Beam size.
            lm: LM class.
            lm_weight: LM weight for soft fusion.
            search_type: Search algorithm to use during inference.
            max_sym_exp: Number of maximum symbol expansions at each time step. (TSD)
            u_max: Maximum output sequence length. (ALSD)
            nstep: Number of maximum expansion steps at each time step. (NSC/mAES)
            prefix_alpha: Maximum prefix length in prefix search. (NSC/mAES)
            expansion_beta:
              Number of additional candidates for expanded hypotheses selection. (mAES)
            expansion_gamma: Allowed logp difference for prune-by-value method. (mAES)
            score_norm: Normalize final scores by length. ("default")
            softmax_temperature: Penalization term for softmax function.
            nbest: Number of final hypothesis.
            quantization: Whether dynamic quantization is used.

        r   r   tsdalsdnscmaesr   zXbeam_size (%d) + expansion_beta (%d) should be smaller or equal to vocabulary size (%d).NTwordlmFr   z_Softmax temperature is not supported with LM decoding.Setting softmax-temperature value to 1.0.)'r   r   r   dunitshidden_sizeodim
vocab_sizeblank_idgreedy_searchsearch_algorithmdefault_beam_searchr   time_sync_decodingr   align_length_sync_decodingr    r!   nsc_beam_searchr"   max_candidates"modified_adaptive_expansion_searchNotImplementedErroruse_lmr   hasattr	predictor	is_wordlmr,   lm_predictorlenrnn	lm_layersr   loggingwarningr%   r'   r$   r&   )selfr   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'    rF   V/home/ubuntu/.local/lib/python3.10/site-packages/espnet/nets/beam_search_transducer.py__init__   sb   )









zBeamSearchTransducer.__init__enc_outreturnc                 C   s   | j |j | |}|S )zPerform beam search.

        Args:
            enc_out: Encoder output sequence. (T, D_enc)

        Returns:
            nbest_hyps: N-best decoding results

        )r   
set_devicedevicer3   )rE   rI   
nbest_hypsrF   rF   rG   __call__   s   
zBeamSearchTransducer.__call__hypsc                 C   s:   | j r|jdd dd n	|jdd dd |d| j S )zSort hypotheses by score or score given sequence length.

        Args:
            hyps: Hypothesis.

        Return:
            hyps: Sorted hypothesis.

        c                 S   s   | j t| j S N)scorer@   yseqxrF   rF   rG   <lambda>   s    z1BeamSearchTransducer.sort_nbest.<locals>.<lambda>Tkeyreversec                 S      | j S rP   rQ   rS   rF   rF   rG   rU          N)r$   sortr&   )rE   rO   rF   rF   rG   
sort_nbest   s   zBeamSearchTransducer.sort_nbest	enc_out_tc              	   C   s
  t |dd D ]z\}}||d d D ]m}t|j}t|j}t|j|jr|| | jkrtj| j||jd | j	d| j
 dd}|jt||j|   }	t||d D ]#}
tj| j||j|
 | j	d| j
 dd}|	t||j|
d   7 }	qUt|j|	|_qq|S )zjPrefix search for NSC and mAES strategies.

        Based on https://arxiv.org/pdf/1211.3711.pdf

        Nr   r'   dim)	enumerater@   rR   r	   r!   torchlog_softmaxr   dec_outr'   r%   rQ   floatrangenp	logaddexp)rE   rO   r^   jhyp_jhyp_icurr_idpref_idlogp
curr_scorekrF   rF   rG   prefix_search   s@   


!z"BeamSearchTransducer.prefix_searchc                 C   s   | j d}td| jg|d}i }| j ||\}}}|D ]?}tj| j||| jd| j	 dd}	tj
|	dd\}
}|| jkr\|jt| | jt|
7  _||_| j ||\}}}q|gS )zGreedy search implementation.

        Args:
            enc_out: Encoder output sequence. (T, D_enc)

        Returns:
            hyp: 1-best hypotheses.

        r           rQ   rR   	dec_stater`   r_   ra   )r   
init_stater   r1   rQ   rd   re   r   r'   r%   maxrR   appendintrg   rv   )rE   rI   rv   hypcacherf   state_r^   rp   top_logppredrF   rF   rG   r2      s&   

z"BeamSearchTransducer.greedy_searchc              
      s  t | j| j}t || jd }| jd}td| jg|dg}i }|D ]}|}g }	 t|dd d}	||	 | j	|	|\}
}}t
j| j||
| jd| j d	d
}|dd j|d	d
}|t|	j	t|dd  |	jdd |	j|	jd | jr| j|	j|\}}n|	j}t| D ]2\}}|	j	t| }| jr|| j|d |d   7 }|t||	jdd t|d g ||d qtt|dd dj	 t fdd|D dd d}t||kr|}nq*q#| |S )zBeam search implementation.

        Modified from https://arxiv.org/pdf/1211.3711.pdf

        Args:
            enc_out: Encoder output sequence. (T, D)

        Returns:
            nbest_hyps: N-best hypothesis.

        r   rt   ru   Tc                 S   rY   rP   rZ   rS   rF   rF   rG   rU     r[   z:BeamSearchTransducer.default_beam_search.<locals>.<lambda>)rW   r`   r_   ra   Nr   rQ   rR   rv   lm_statec                 S   rY   rP   rZ   rS   rF   rF   rG   rU   A  r[   c                    s   g | ]	}|j  kr|qS rF   rZ   .0r{   hyps_maxrF   rG   
<listcomp>C  s    z<BeamSearchTransducer.default_beam_search.<locals>.<listcomp>c                 S   rY   rP   rZ   rS   rF   rF   rG   rU   D  r[   )minr   r0   r   rw   r   r1   rx   removerQ   rd   re   r   r'   r%   topkry   rg   rR   rv   r   r;   r   predictzipr   rz   sortedr@   r]   )rE   rI   beambeam_krv   	kept_hypsr|   r^   rO   max_hyprf   r}   	lm_tokensrp   top_kr   	lm_scoresrr   rQ   kept_most_probrF   r   rG   r4      sn   
		3
z(BeamSearchTransducer.default_beam_searchc                 C   s  t | j| j}| j|}t| jgd| j|ddg}i }| jr,| j	s,t
| j|d _|D ]!}g }|}|d}t| jD ]}	g }
| j|||| j\}}}tj| ||| j dd}|ddddf j|dd}dd	 |D }t|D ]@\}}|j|vr|t|jt||df  |jdd |j|jd
 qy||j}t|| j|jt||df  || _qy|	| jd k r5| jrtdd	 |D | j | j	}| j!"||t#|\}}t|D ]R\}}t$|d | |d | d D ]>\}}t|jt| |jt%|g | j|||jd
}| jr.| j| j&|||f  7  _t'||| j | j	|_|
| qqt(|
dd ddd| }q?t(|dd ddd| }q.| )|S )zTime synchronous beam search implementation.

        Based on https://ieeexplore.ieee.org/document/9053040

        Args:
            enc_out: Encoder output sequence. (T, D)

        Returns:
            nbest_hyps: N-best hypothesis.

        rt   r   rR   rQ   rv   r_   ra   Nr   c                 S      g | ]}|j qS rF   rR   r   hrF   rF   rG   r         z;BeamSearchTransducer.time_sync_decoding.<locals>.<listcomp>r   c                 S   r   rF   r   )r   crF   rF   rG   r     r   c                 S   rY   rP   rZ   rS   rF   rF   rG   rU     r[   z9BeamSearchTransducer.time_sync_decoding.<locals>.<lambda>TrV   c                 S   rY   rP   rZ   rS   rF   rF   rG   rU     r[   )*r   r   r0   r   rw   r   r1   select_stater;   r>   r   r?   r   	unsqueezerh   r   batch_scorerd   re   r   r%   r   rc   rR   ry   rQ   rg   rv   indexri   rj   r   rB   r   buff_predictr@   r   rz   r   r   r   r]   )rE   rI   r   
beam_stateBr|   r^   ACvDbeam_dec_outbeam_lm_tokens	beam_logp	beam_topkseq_Air{   dict_posbeam_lm_statesbeam_lm_scoresrp   rr   new_hyprF   rF   rG   r5   L  s   




	
&
z'BeamSearchTransducer.time_sync_decodingc              
   C   s  t | j| j}t|d}t | j|d }| j|}t| j	gd| j
|ddg}g }i }| jr=| js=t| j|d _t|| D ]}	g }
g }g }|D ]"}t|jd }|	| }||d krbqN|| |||| f qN|r]| j|||| j\}}}tdd |D }tj| ||| j dd}|d	d	dd	f j|dd}| jrtd
d |D | j| j}| j||t|\}}t|D ]\}	}t|j t!||	df  |jd	d	 |j"|jd}|
| ||	 d |d kr|| t#|d |	 |d |	 d D ]C\}}t|j t!| |jd	d	 t|g | j
||	|jd}| jrD| j | j$||	|f  7  _ t%||	| j| j|_|
| qqt&|
dd ddd	| }t'|}qC|rf| (|S |S )zAlignment-length synchronous beam search implementation.

        Based on https://ieeexplore.ieee.org/document/9053040

        Args:
            h: Encoder output sequences. (T, D)

        Returns:
            nbest_hyps: N-best hypothesis.

        r   r   rt   r   c                 S   s   g | ]}|d  qS )r   rF   )r   rT   rF   rF   rG   r     s    zCBeamSearchTransducer.align_length_sync_decoding.<locals>.<listcomp>r_   ra   Nc                 S   r   rF   r   )r   brF   rF   rG   r     r   r   c                 S   rY   rP   rZ   rS   rF   rF   rG   rU     r[   zABeamSearchTransducer.align_length_sync_decoding.<locals>.<lambda>TrV   ))r   r   r0   rz   sizer   r   rw   r   r1   r   r;   r>   r   r?   r   rh   r@   rR   ry   r   rd   stackre   r   r%   r   r   rB   r   r   rc   rQ   rg   rv   r   r   r   r   r
   r]   )rE   rI   r   t_maxr   r   r   finalr|   r   r   B_	B_enc_outr{   utr   r   beam_enc_outr   r   r   r   r   rp   rr   rF   rF   rG   r6     s   





&
z/BeamSearchTransducer.align_length_sync_decodingc                 C   s6  t | j| j}t || jd }| j|}t| jgd| j|ddg}i }| j|||| j	\}}}| j|d}	| j	rU| j
d|d\}
}t|
d| j| j}|d }nd}d}t| jgd|	|d g||dg}|D ]}| t|dd d	d
|}g }|d}g }g }t| jD ]y}tdd |D }tj| ||| j dd}|ddddf j|dd}t|D ]s\}}|t|jdd |jt||ddf  |jdd |j |j!|j"d t#|d | |d | d D ]:\}}|jt| }| j	r|| j$t|j"|  7 }|t|jdd t%|g ||jdd |j |j!|j"d qq|j&dd d	d
 t'||d| }| j(|dd |D dd |D }| j|||| j	\}}}| j	r{t)dd |D | j| j}
| j
|
|t*|\}
}|| jd k rt|D ](\}}|j||  | j|||_ | j	rt|
|| j| j|_!|| |_"q|dd }qtj| ||| j dd}t|D ];\}}| jdkr| jt||df 7  _|j||  | j|||_ | j	rt|
|| j| j|_!|| |_"qqt|| dd d	d
d| }qk| +|S )aw  N-step constrained beam search implementation.

        Based on/Modified from https://arxiv.org/pdf/2002.03577.pdf.
        Please reference ESPnet (b-flo, PR #2444) for any usage outside ESPnet
        until further modifications.

        Args:
            enc_out: Encoder output sequence. (T, D_enc)

        Returns:
            nbest_hyps: N-best hypothesis.

        r   rt   r   r   NrR   rQ   rv   rf   r   r   c                 S   
   t | jS rP   r@   rR   rS   rF   rF   rG   rU   b     
 z6BeamSearchTransducer.nsc_beam_search.<locals>.<lambda>TrV   c                 S      g | ]}|j d  qS r_   rf   r   rF   rF   rG   r   l      z8BeamSearchTransducer.nsc_beam_search.<locals>.<listcomp>r_   ra   rR   rQ   rf   rv   r   r   c                 S   rY   rP   rZ   rS   rF   rF   rG   rU     r[   c                 S   r   rF   rv   r   r   rF   rF   rG   r     r   c                 S   r   rF   r   r   rF   rF   rG   r     r   c                 S   r   rF   r   r   rF   rF   rG   r     r   c                 S   rY   rP   rZ   rS   rF   rF   rG   rU     r[   ),r   r   r0   r   rw   r   r1   r   r   r;   r   r   r   rB   r>   rs   r   r   rh   r    rd   r   re   r   r%   r   rc   ry   rR   rQ   rg   rf   rv   r   r   r   r   rz   r\   r   create_batch_statesr   r@   r]   )rE   rI   r   r   r   init_tokensr|   r   r   r}   r   r   r   r   r   r^   rO   r   SVnr   r   r   r{   rp   rr   rQ   r   rF   rF   rG   r7   #  s  




&




 
z$BeamSearchTransducer.nsc_beam_searchc                 C   s  t | j| j}| j|}t| jgd| j|ddg}i }| j|||| j	\}}}| j|d}| j	rM| j
d|d\}	}
t|	d| j| j}|
d }nd}d}t| jgd||d g||dg}|D ]}| t|dd d	d
|}g }|d}g }dd |D }t| jD ]k}tdd |D }tj| ||| j ddj| jdd\}}t|||| j}g }t|D ]Y\}}|| D ]P\}}t|jdd ||jdd |j |j!|j"d}|dkr|#| q|jt$|g |vr|j#t$| | j	r| j%| j&t'|j"|  7  _%|#| qq|s%t|dd d	d
d| } n| j(|dd |D dd |D }| j|||| j	\}}}| j	rat)dd |D | j| j}	| j
|	|t*|\}	}
|| jd k rt|D ](\}}|j#||  | j|||_ | j	rt|	|| j| j|_!|
| |_"qm|dd }qtj| ||| j dd}t|D ]5\}}| j%t'||df 7  _%|j#||  | j|||_ | j	rt|	|| j| j|_!|
| |_"qt|| dd d	d
d| }qqc| +|S )a'  It's the modified Adaptive Expansion Search (mAES) implementation.

        Based on/modified from https://ieeexplore.ieee.org/document/9250505 and NSC.

        Args:
            enc_out: Encoder output sequence. (T, D_enc)

        Returns:
            nbest_hyps: N-best hypothesis.

        rt   r   r   Nr   r   c                 S   r   rP   r   rS   rF   rF   rG   rU     r   zIBeamSearchTransducer.modified_adaptive_expansion_search.<locals>.<lambda>TrV   c                 S   r   rF   r   r   rF   rF   rG   r     r   zKBeamSearchTransducer.modified_adaptive_expansion_search.<locals>.<listcomp>c                 S   r   r   r   r   rF   rF   rG   r     r   r_   ra   r   c                 S   rY   rP   rZ   rS   rF   rF   rG   rU   @  r[   c                 S   r   rF   r   r   rF   rF   rG   r   H  r   c                 S   r   rF   r   r   rF   rF   rG   r   I  r   c                 S   r   rF   r   r   rF   rF   rG   r   U  r   c                 S   rY   rP   rZ   rS   rF   rF   rG   rU   }  r[   ),r   r   r0   r   rw   r   r1   r   r   r;   r   r   r   rB   r>   rs   r   r   rh   r    rd   r   re   r   r%   r   r8   r   r"   rc   rR   rf   rv   r   r   ry   rz   rQ   r   rg   r   r   r@   r]   )rE   rI   r   r   r   r|   r   r   r}   r   r   r   r   r   r^   rO   r   list_bduplication_checkr   r   beam_idxk_expansionslist_expr   r{   rr   	new_scorer   rF   rF   rG   r9     s  





	







z7BeamSearchTransducer.modified_adaptive_expansion_search)Nr   r   r   r   r   r   r   r   Tr   r   F)__name__
__module____qualname____doc__r   r   r   r   rz   rd   nnModulerg   strboolrH   Tensorr   r   r   rN   r]   rs   r2   r4   r5   r6   r7   r9   rF   rF   rF   rG   r      s    
	

n


,#Oho -r   )r   rC   typingr   r   numpyri   rd   5espnet.nets.pytorch_backend.transducer.custom_decoderr   4espnet.nets.pytorch_backend.transducer.joint_networkr   2espnet.nets.pytorch_backend.transducer.rnn_decoderr   ,espnet.nets.pytorch_backend.transducer.utilsr   r   r	   r
   r   r   r   (espnet.nets.transducer_decoder_interfacer   r   r   rF   rF   rF   rG   <module>   s    $	