o
    iK                     @   s  d dl mZ d dl mZ d dl mZ d dl mZ d dl mZ d dlZd dlZd dlm	Z	 d dl
m	  mZ d dlmZ d dlZd dlmZ d d	lmZ d d
lmZmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl$m&Z& d dl$m'Z' d dl$m(Z( d dl$m)Z) d dl$m*Z* d dl+m,Z,m-Z- d dl.m/Z/ d dl0m1Z1 G dd de	j2Z3e14ddG dd de	j2Z5dS )    )List)Optional)Sequence)Tuple)UnionN)overlap_chunk)	to_device)make_pad_mask)MultiHeadedAttentionMultiHeadedAttentionSANM)SinusoidalPositionEncoderStreamSinusoidalPositionEncoder)	LayerNorm)Conv1dLinear)MultiLayeredConv1d)PositionwiseFeedForward)repeat)Conv2dSubsampling)Conv2dSubsampling2)Conv2dSubsampling6)Conv2dSubsampling8)TooShortUttError)check_short_utt)subsequent_maskvad_mask)CTC)tablesc                       s8   e Zd Z			d fdd	ZdddZdd
dZ  ZS )EncoderLayerSANMTF        c	           	         sz   t t|   || _|| _t|| _t|| _t	|| _
|| _|| _|| _|| _| jr5t|| || _|| _|| _dS )z!Construct an EncoderLayer object.N)superr   __init__	self_attnfeed_forwardr   norm1norm2nnDropoutdropoutin_sizesizenormalize_beforeconcat_afterLinearconcat_linearstochastic_depth_ratedropout_rate)	selfr(   r)   r!   r"   r/   r*   r+   r.   	__class__ O/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/scama/encoder.pyr    -   s   


zEncoderLayerSANM.__init__Nc           
   
   C   sn  d}d}| j r| jdkrtd | jk }dd| j  }|r0|dur,tj||gdd}||fS |}| jr:| |}| jretj|| j	||||dfdd}	| j
| jkr]||| |	  }n-|| |	 }n%| j
| jkr|||| | j	||||d  }n|| | j	||||d }| js| |}|}| jr| |}||| | |  }| js| |}|||||fS )	  Compute encoded features.

        Args:
            x_input (torch.Tensor): Input tensor (#batch, time, size).
            mask (torch.Tensor): Mask tensor for the input (#batch, time).
            cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size).

        Returns:
            torch.Tensor: Output tensor (#batch, time, size).
            torch.Tensor: Mask tensor (#batch, time).

        Fg      ?r      Ndim)mask_shfit_chunkmask_att_chunk_encoder)trainingr.   torchranditemcatr*   r#   r+   r!   r(   r)   r-   r'   r$   r"   )
r0   xmaskcacher9   r:   
skip_layerstoch_layer_coeffresidualx_concatr3   r3   r4   forwardH   sj   
	
	


zEncoderLayerSANM.forwardr   c                 C   s   |}| j r
| |}| j| jkr | j||||\}}|| }n| j||||\}}| j s3| |}|}| j r=| |}|| | }| j sL| |}||fS )r5   )r*   r#   r(   r)   r!   forward_chunkr$   r"   )r0   rA   rC   
chunk_size	look_backrF   attnr3   r3   r4   rI      s    




zEncoderLayerSANM.forward_chunk)TFr   )NNNNNr   )__name__
__module____qualname__r    rH   rI   __classcell__r3   r3   r1   r4   r   ,   s    
Nr   encoder_classesSANMEncoderChunkOptc                5       s  e Zd ZdZdddddddded	d
dddg d
ddddddddddfdededededededededee d e	d!e	d"ed#ed$ed%e
e d&e	d'ed(ed)ed*eeee f d+eeee f d,eeee f d-eeee f d.eeee f d/ed0ef4 fd1d2Zd3efd4d5Z	6	6	dDd7ejd8ejd9ejd:ed;ed3eejejeej f fd<d=Zi fd>ejd?efd@dAZ	6dEd7ejd8ejd?efdBdCZ  ZS )FrS   z
    Author: Shiliang Zhang, Zhifu Gao, Haoneng Luo, Ming Lei, Jie Gao, Zhijie Yan, Lei Xie
    SCAMA: Streaming chunk-aware multihead attention for online end-to-end speech recognition
    https://arxiv.org/abs/2006.01712
          i      g?r   conv2dTFlinearr6   r;      r   sanm)   )
   )r   )r6   encoderzseq2seq/encoder
input_sizeoutput_sizeattention_headslinear_units
num_blocksr/   positional_dropout_rateattention_dropout_rateinput_layerr*   r+   positionwise_layer_typepositionwise_conv_kernel_sizepadding_idxinterctc_layer_idxinterctc_use_conditioningkernel_size
sanm_shfitselfattention_layer_typerJ   stridepad_leftencoder_att_look_back_factordecoder_att_look_back_factor!tf2torch_tensor_name_prefix_torchtf2torch_tensor_name_prefix_tfc                    s  t    | _|	dkr+tjtjtjtjtj	 |
|| _
nt|	dkr7t| _
nh|	dkrCt| _
n\|	dkrOt| _
nP|	dkr[t| _
nD|	dkrrtjtjj|d|
|| _
n-|	d u rkr~d | _
n!tj| _
n|	dkrt | _
n|	d	krt | _
ntd
|	 | _|dkrt|f	n|dkrt||f	n|dkrt||f	ntd|dkrt||fn|dkrt||||f||||ftd 	f	dd| _t|d  	fdd| _| jrt| _|| _t|dkr6dt |k r4t!||k s6J || _"d | _#|d d }t$||||||d| _%|| _&|| _'d S )NrX   rW   conv2d2conv2d6conv2d8embed)rh   pe	pe_onlinezunknown input_layer: conv1dzconv1d-linearzSupport only linear or conv1d.selfattnrZ   r6   c                    s   t    S Nr   lnum)	r+   r/   encoder_selfattn_layerencoder_selfattn_layer_args0r^   r*   r_   positionwise_layerpositionwise_layer_argsr3   r4   <lambda>;      z.SANMEncoderChunkOpt.__init__.<locals>.<lambda>c                    s   t    S r|   r}   r~   )r+   r/   r   encoder_selfattn_layer_argsr*   r_   r   r   r3   r4   r   H  r   r      )rJ   rn   ro   
shfit_fsmnrp   rq   )(r   r    _output_sizer=   r%   
Sequentialr,   r   r&   ReLUrw   r   r   r   r   	Embeddingr   r   
ValueErrorr*   r   r   r   NotImplementedErrorr
   r   r   	encoders0encoders
after_normri   lenminmaxrj   conditioning_layerr   overlap_chunk_clsrr   rs   )r0   r^   r_   r`   ra   rb   r/   rc   rd   re   pos_enc_classr*   r+   rf   rg   rh   ri   rj   rk   rl   rm   rJ   rn   ro   rp   rq   rr   rs   r   r1   )
r+   r/   r   r   r   r^   r*   r_   r   r   r4   r       s   






 
zSANMEncoderChunkOpt.__init__returnc                 C   s   | j S r|   )r   )r0   r3   r3   r4   r_   f  s   zSANMEncoderChunkOpt.output_sizeNxs_padilensprev_statesctcindc                 C   s  t |dddddf  |j}||  d 9 }| jdu r"|}nIt| jts:t| jts:t| jts:t| jt	rft
| j|d\}}|r]td|d dd| d |d|| ||\}}n| |}d\}	}
| jdur|dd}| j||}| jj|||d	\}}t |dddddf  |j}| jj||j|d
|jd}	| jj||j|d
|jd}
| ||d|	|
}|d
 |d }}g }t| jd
kr| ||d|	|
}|d
 |d }}nGt| jD ]A\}}|||d|	|
}|d
 |d }}|d | jv r2|}| jr| |}||d |f | jr2||}|| | }q| jr<| |}|dd}t|d
krR||f|dfS ||dfS )zEmbed positions in tensor.

        Args:
            xs_pad: input tensor (B, L, D)
            ilens: input length (B)
            prev_states: Not to be used now.
        Returns:
            position embedded tensor and mask
        N      ?r6   zhas z) frames and is too short for subsampling z(it needs more than z frames), return empty results)NN)
chunk_outsr   )dtype) r	   todevicer_   rw   
isinstancer   r   r   r   r   r)   r   r   squeezesumgen_chunk_masksplit_chunkget_mask_shfit_chunkr   get_mask_att_chunk_encoderr   r   ri   r   	enumerater*   r   appendrj   softmaxr   )r0   r   r   r   r   r   masksshort_status
limit_sizer9   r:   r   encoder_outsintermediate_outs	layer_idxencoder_layerencoder_outctc_outolensr3   r3   r4   rH   i  s~   $







$





zSANMEncoderChunkOpt.forwardfeatsrC   c                 C   st   t |dkr|S t|d |jd|d< tj|d |fdd}|d d |d d |d d   d d d f |d< |S )Nr   r   r   r6   r7   rJ   r   )r   r   r   r=   r@   )r0   r   rC   overlap_featsr3   r3   r4   _add_overlap_chunk  s   4z&SANMEncoderChunkOpt._add_overlap_chunkc                 K   sn  | dd}||  d 9 }| jd u r|}n| ||}|d r*t|d |jd}n| ||}|d d u rFt| jt| j }d g| }n|d }t	| jD ]\}}	|	
||| |d |d	 }
|
d
 |
d }|d
< qOt	| jD ]'\}}	|	
|||t| j  |d |d	 }
|
d
 |
d }||t| j < qr| jr| |}|d	 d
ks|d	 dkr||d< ||d fS )Nis_finalFr   
tail_chunkr   r   optrJ   encoder_chunk_look_backr   r6   r;   )getr_   rw   r   r   r   r   r   r   r   rI   r*   r   )r0   r   r   rC   kwargsr   cache_layer_num	new_cacher   r   r   r3   r3   r4   rI     s<   
"

z!SANMEncoderChunkOpt.forward_chunkrM   r|   )rN   rO   rP   __doc__r   intfloatr   strboolr   r   r   r    r_   r=   Tensorr   r   rH   npndarraydictr   rI   rQ   r3   r3   r1   r4   rS      s    		
 $
W)6typingr   r   r   r   r   loggingr=   torch.nnr%   torch.nn.functional
functionalF funasr.models.scama.chunk_utilisr   numpyr   funasr.train_utils.device_funcsr   *funasr.models.transformer.utils.nets_utilsr	   funasr.models.sanm.attentionr
   r   #funasr.models.transformer.embeddingr   r   $funasr.models.transformer.layer_normr   0funasr.models.transformer.utils.multi_layer_convr   r   3funasr.models.transformer.positionwise_feed_forwardr   &funasr.models.transformer.utils.repeatr   +funasr.models.transformer.utils.subsamplingr   r   r   r   r   r   $funasr.models.transformer.utils.maskr   r   funasr.models.ctc.ctcr   funasr.registerr   Moduler   registerrS   r3   r3   r3   r4   <module>   sB    
