o
    i8,                     @   s   d Z ddlZddlmZ ddlmZ ddlmZ ddlmZ ddlZddlm	Z	 ddlm
Z
 dd	lmZ dd
lmZ G dd de	ZdS )z2Parallel beam search module for online simulation.    N)Any)Dict)List)Tuple)BatchBeamSearch)BatchHypothesis)
Hypothesis)
end_detectc                       s   e Zd ZdZddddddd fdd
Zd	d
 Zdedejde	e
eejf e
eef f fddZ			ddejdedededee f
ddZdd Zdd Zdejdedee fddZ  ZS )BatchBeamSearchOnlineao  Online beam search implementation.

    This simulates streaming decoding.
    It requires encoded features of entire utterance and
    extracts block by block from it as it shoud be done
    in streaming processing.
    This is based on Tsunoo et al, "STREAMING TRANSFORMER ASR
    WITH BLOCKWISE SYNCHRONOUS BEAM SEARCH"
    (https://arxiv.org/abs/2006.14941).
    (      Fr   )
block_sizehop_size
look_aheaddisable_repetition_detectionencoded_feat_length_limitdecoder_text_length_limitc          	         sB   t  j|i | || _|| _|| _|| _|| _|| _|   dS )zInitialize beam search.N)	super__init__r   r   r   r   r   r   reset)	selfr   r   r   r   r   r   argskwargs	__class__ X/home/ubuntu/.local/lib/python3.10/site-packages/espnet/nets/batch_beam_search_online.pyr      s   zBatchBeamSearchOnline.__init__c                 C   s.   d| _ d| _g | _g | _d| _d| _d| _dS )zReset parameters.Nr   )	encbufferrunning_hyps	prev_hyps
ended_hypsprocessed_blockprocess_idxprev_outputr   r   r   r   r   3   s   
zBatchBeamSearchOnline.resethypxreturnc                 C   s   t  }t  }| j D ]a\}}| jdkr[t|jdkr[t|jd | jkr[|jd| j | j }| j|dddf< dd | j	j
d D | j	j
d< |||j
| |\||< ||< q||j|j
| |\||< ||< q||fS )a  Score new hypothesis by `self.full_scorers`.

        Args:
            hyp (Hypothesis): Hypothesis with prefix tokens to score
            x (torch.Tensor): Corresponding input feature

        Returns:
            Tuple[Dict[str, torch.Tensor], Dict[str, Any]]: Tuple of
                score dict of `hyp` that has string keys of `self.full_scorers`
                and tensor score values of shape: `(self.n_vocab,)`,
                and state dict that has string keys
                and state values of `self.full_scorers`

        r      Nc                 S   s   g | ]}d qS Nr   ).0_r   r   r   
<listcomp>Z   s    z4BatchBeamSearchOnline.score_full.<locals>.<listcomp>decoder)dictfull_scorersitemsr   lenyseqnarrowclonesosr   statesbatch_score)r   r%   r&   scoresr6   kd	temp_yseqr   r   r   
score_full=   s$   

"$z BatchBeamSearchOnline.score_full        Tmaxlenratiominlenratiois_finalc           
      C   sz  | j du r	|| _ ntj| j |gdd| _ | j }|dkr!|jd }ntdt||d }d}	 | j| j | j	| j
  }||jd k rM|dd|}d}	n|rT|}d}	nnUtd| j
 td|jd | j | jdkr|jd | jkr|d|jd | j | j}| jdu r| || _| ||	||}td	| j
 |  j
d7  _
|	r|S q0|du r| jdu rg S | jS || _|S )
a  Perform beam search.

        Args:
            x (torch.Tensor): Encoded speech feature (T, D)
            maxlenratio (float): Input length ratio to obtain max output length.
                If maxlenratio=0.0 (default), it uses a end-detect function
                to automatically find maximum hypothesis lengths
            minlenratio (float): Input length ratio to obtain min output length.

        Returns:
            list[Hypothesis]: N-best decoding results

        Nr   )axisr(   TFzStart processing block: %dz*  Feature length: {}, current position: {}zFinished processing block: %d)r   torchcatshapemaxintsizer   r   r   r!   r3   loggingdebugformatr"   r   r   init_hypprocess_one_blockr#   )
r   r&   r>   r?   r@   maxlenretcur_end_framehblock_is_finalr   r   r   forwardb   sZ   


%
zBatchBeamSearchOnline.forwardc                 C   sd  |  || j | j|k rtdt| j  | | j|}| j|d kr0| | j|||| j| _|j	j
d }g }|j	t||jd f | jk}d}	t|j
d D ]+}
||
 rc| ||
}|| qQ| js||	s||j	|
df |j	|
ddf v r||s|d}	qQ|	rtd nu|r|d	krtd
d | jD | jrtd| j  | | jS t|dkr|std nF| j| _| | j|||| j| _|r|D ]}| j| qt| jdkrtd | | jS tdt| j  |  jd7  _| j|k s|r| | jS | jD ]}|| q| |}| jdkr0t| jdkr0| j| _|  jd8  _g | _|S )zRecognize one block.z	position r(   r   FNTzDetected repetition.r=   c                 S   s   g | ]}|  qS r   )asdict)r*   lhr   r   r   r,      s    z;BatchBeamSearchOnline.process_one_block.<locals>.<listcomp>zend detected at z+Detected hyp(s) reaching EOS in this block.zno hypothesis. Finish decoding.zremained hypotheses: )extendr   r"   rH   rI   strsearchpost_processr    r2   rD   rB   arangelengtheosrange_selectappendr   infor	   assemble_hypsr1   r   )r   rP   r@   rM   r>   bestn_batchlocal_ended_hypsis_local_eosprev_repeatir%   retsr   r   r   rL      s~   
  



D

z'BatchBeamSearchOnline.process_one_blockc              
      s  t |dd dd}t|dkrtd g S |d }|j D ]\}}t|dd j| d	d
| j|  dd|  qtd|jd td|jt|j	 d tdt|   j
durtdd fdd|j	dd D  d  |S )zAssemble the hypotheses.c                 S   s   | j S r)   )score)r&   r   r   r   <lambda>  s    z5BatchBeamSearchOnline.assemble_hyps.<locals>.<lambda>T)keyreverser   zOthere is no N-best results, perform recognition again with smaller minlenratio.z6.2fz * 3z = z for ztotal log probability: z.2fznormalized log probability: z"total number of ended hypotheses: Nzbest hypo:  c                    s   g | ]} j | qS r   )
token_list)r*   r&   r$   r   r   r,   !  s    z7BatchBeamSearchOnline.assemble_hyps.<locals>.<listcomp>r(   rS   
)sortedr1   rH   warningr8   r0   r`   weightsri   r2   ro   join)r   r    
nbest_hypsrb   r9   vr   r$   r   ra   
  s.   .
 z#BatchBeamSearchOnline.assemble_hypshypsc                 C   sL   | j  D ]\}}t|dr|| t|dr#||j| |j|< qdS )a  Extend probabilities and states with more encoded chunks.

        Args:
            x (torch.Tensor): The extended encoder output feature
            hyps (Hypothesis): Current list of hypothesis

        Returns:
            Hypothesis: The extended hypothesis

        extend_probextend_stateN)scorersr0   hasattrrx   ry   r6   )r   r&   rw   r9   r:   r   r   r   rV   &  s   


zBatchBeamSearchOnline.extend)r=   r=   T)__name__
__module____qualname____doc__r   r   r   rB   Tensorr   r   rW   r   r<   floatboolr   r   rR   rL   ra   rV   __classcell__r   r   r   r   r
      sD    

(
QW$r
   )r   rH   typingr   r   r   r   rB   espnet.nets.batch_beam_searchr   r   espnet.nets.beam_searchr   espnet.nets.e2e_asr_commonr	   r
   r   r   r   r   <module>   s    