o
    iF                     @   sL  d dl Z d dlmZmZmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d dlm Z m!Z!m"Z"m#Z# G dd de j$j%Z&e'ddG dd de j$j%Z(G dd de j$j%Z)e'ddG dd de j$j%Z*dS )    N)ListOptionalTuple)tables)CTC)repeat)	LayerNorm)MultiHeadedAttention)make_pad_mask)check_short_utt)TooShortUttError)SinusoidalPositionEncoder)Conv1dLinear)subsequent_maskvad_mask)MultiLayeredConv1d)PositionwiseFeedForward) MultiHeadedAttentionSANMwithMask)Conv2dSubsamplingConv2dSubsampling2Conv2dSubsampling6Conv2dSubsampling8c                       s8   e Zd Z			d fdd	ZdddZdd
dZ  ZS )EncoderLayerSANMTF        c	           	         s~   t t|   || _|| _t|| _t|| _tj	
|| _|| _|| _|| _|| _| jr7tj	|| || _|| _|| _dS z!Construct an EncoderLayer object.N)superr   __init__	self_attnfeed_forwardr   norm1norm2torchnnDropoutdropoutin_sizesizenormalize_beforeconcat_afterLinearconcat_linearstochastic_depth_ratedropout_rate)	selfr%   r&   r   r   r,   r'   r(   r+   	__class__ b/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/ct_transformer_streaming/encoder.pyr       s   


zEncoderLayerSANM.__init__Nc           
   
   C   sn  d}d}| j r| jdkrtd | jk }dd| j  }|r0|dur,tj||gdd}||fS |}| jr:| |}| jretj|| j	||||dfdd}	| j
| jkr]||| |	  }n-|| |	 }n%| j
| jkr|||| | j	||||d  }n|| | j	||||d }| js| |}|}| jr| |}||| | |  }| js| |}|||||fS )	  Compute encoded features.

        Args:
            x_input (torch.Tensor): Input tensor (#batch, time, size).
            mask (torch.Tensor): Mask tensor for the input (#batch, time).
            cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size).

        Returns:
            torch.Tensor: Output tensor (#batch, time, size).
            torch.Tensor: Mask tensor (#batch, time).

        Fg      ?r      N)dim)mask_shfit_chunkmask_att_chunk_encoder)trainingr+   r!   randitemcatr'   r   r(   r   r%   r&   r*   r$   r    r   )
r-   xmaskcacher5   r6   
skip_layerstoch_layer_coeffresidualx_concatr0   r0   r1   forward;   sj   
	
	


zEncoderLayerSANM.forwardr   c                 C   s   |}| j r
| |}| j| jkr | j||||\}}|| }n| j||||\}}| j s3| |}|}| j r=| |}|| | }| j sL| |}||fS )r2   )r'   r   r%   r&   r   forward_chunkr    r   )r-   r<   r>   
chunk_size	look_backrA   attnr0   r0   r1   rD      s    




zEncoderLayerSANM.forward_chunk)TFr   )NNN)NNr   )__name__
__module____qualname__r   rC   rD   __classcell__r0   r0   r.   r1   r      s    
Nr   encoder_classesSANMVadEncoderc                '       s   e Zd ZdZdddddddded	d
dddg d
dddfdededededededededee de	de	dededede
e d e	d!ed"ed#ef& fd$d%Zd&efd'd(Z	)	)d1d*ejd+ejd,ejd-ejd.ed&eejejeej f fd/d0Z  ZS )2rM   z<
    Author: Speech Lab of DAMO Academy, Alibaba Group

          i      g?r   conv2dTFlinearr3   r7      r   sanm
input_sizeoutput_sizeattention_headslinear_units
num_blocksr,   positional_dropout_rateattention_dropout_rateinput_layerr'   r(   positionwise_layer_typepositionwise_conv_kernel_sizepadding_idxinterctc_layer_idxinterctc_use_conditioningkernel_size
sanm_shfitselfattention_layer_typec                    sv  t    	_|	dkr+tjtjtjtjtj	 |
|	_
ni|	dkr7t	_
n]|	dkrCt	_
nQ|	dkrOt	_
nE|	dkr[t	_
n9|	dkrptjtjj|dt 	_
n$|	d u rkr|d 	_
ntj	_
n|	dkrt 	_
ntd	|	 	_|dkrt|fn|d
krt||fn|dkrt||fntd|dkrt}||fn|dkrt	_||||f||||ftd 	f	dd	_t|d  	fdd	_	jrt	_|	_t|dkr,dt |k r*t!||k s,J |	_"d 	_#tj	_$d S )NrR   rQ   conv2d2conv2d6conv2d8embed)r_   pezunknown input_layer: conv1dzconv1d-linearzSupport only linear or conv1d.selfattnrT   r3   c                    s   t j   S Nr   encoder_selfattn_layerlnum)	r(   r,   encoder_selfattn_layer_args0rU   r'   rV   positionwise_layerpositionwise_layer_argsr-   r0   r1   <lambda>&      z)SANMVadEncoder.__init__.<locals>.<lambda>c                    s   t j   S rl   rm   ro   )r(   r,   encoder_selfattn_layer_argsr'   rV   rr   rs   r-   r0   r1   rt   3  ru   r   )%r   r   _output_sizer!   r"   
Sequentialr)   r   r#   ReLUrh   r   r   r   r   	Embeddingr   
ValueErrorr'   r   r   r   NotImplementedErrorr	   r   rn   r   	encoders0encoders
after_normr`   lenminmaxra   conditioning_layerr$   )r-   rU   rV   rW   rX   rY   r,   rZ   r[   r\   pos_enc_classr'   r(   r]   r^   r_   r`   ra   rb   rc   rd   rn   r.   )
r(   r,   rv   rq   rU   r'   rV   rr   rs   r-   r1   r      s   




	
 zSANMVadEncoder.__init__returnc                 C   s   | j S rl   )rw   r-   r0   r0   r1   rV   G  s   zSANMVadEncoder.output_sizeNxs_padilensvad_indexesprev_statesctcc                 C   s(  t |dddddf  |j}t|d|jdd}||@ }||  d 9 }| jdu r3|}nIt| jt	sKt| jt
sKt| jtsKt| jtrwt| j|d\}	}
|	rntd|d dd	|
 d
 |d|
| ||\}}n| |}||g}| ||}|d |d }}g }t| jD ]Z\}}|d t| jkrtj|d|d|d|jtjd}t|D ]\}}t|d|| |jd||ddddf< q||@ }n|}||g}|||}|d |d }}q| jr| |}|dd}t|dkr||f|dfS ||dfS )zEmbed positions in tensor.

        Args:
            xs_pad: input tensor (B, L, D)
            ilens: input length (B)
            prev_states: Not to be used now.
        Returns:
            position embedded tensor and mask
        Nr7   )devicer         ?r3   zhas z) frames and is too short for subsampling z(it needs more than z frames), return empty results)r   dtype)r
   tor   r   r&   	unsqueezerV   rh   
isinstancer   r   r   r   r   r   r}   	enumerater~   r   r!   onesboolr   r'   r   squeezesum)r-   r   r   r   r   r   masks	sub_masksno_future_masksshort_status
limit_size	mask_tup0encoder_outs_intermediate_outs	layer_idxencoder_layer
coner_mask
word_indexlength
layer_mask	mask_tup1olensr0   r0   r1   rC   J  sj   $










zSANMVadEncoder.forward)NN)rH   rI   rJ   __doc__r   intfloatr   strr   r   r   rV   r!   Tensorr   r   rC   rK   r0   r0   r.   r1   rM      s    	
 c                       s$   e Zd Z fddZdd Z  ZS )EncoderLayerSANMExportc                    s>   t    |j| _|j| _|j| _|j| _|j| _|j| _dS r   )r   r   r   r   r   r    r%   r&   )r-   modelr.   r0   r1   r     s   
zEncoderLayerSANMExport.__init__c                 C   sV   |}|  |}| ||}| j| jkr|| }|}| |}| |}|| }||fS rl   )r   r   r%   r&   r    r   )r-   r<   r=   rA   r0   r0   r1   rC     s   


zEncoderLayerSANMExport.forward)rH   rI   rJ   r   rC   rK   r0   r0   r.   r1   r     s    r   SANMVadEncoderExportc                       s^   e Zd Z				ddef fddZdd	 Zd
ejdejdejdejfddZdd Z	  Z
S )r      0  encoderTonnxc           
         s   t    |j| _|| _|j| _ddlm} ||dd| _ddlm	} t
|drGt| jjD ]\}}	t|	jtr>||	j|	_t|	| jj|< q.t| jjD ]\}}	t|	jtr]||	j|	_t|	| jj|< qMd S )Nr   )sequence_maskF)flip)MultiHeadedAttentionSANMExportr}   )r   r   rh   r   rw   funasr.utils.torch_functionr   r
   funasr.models.sanm.attentionr   hasattrr   r}   r   r   r   r   r~   )
r-   r   max_seq_len	feats_dim
model_namer   r   r   idr.   r0   r1   r     s"   

zSANMVadEncoderExport.__init__c                 C   s*   |d d d d d f }d| d }||fS )Nr3   g     r0   )r-   r=   r   mask_3d_btdmask_4d_bhltr0   r0   r1   prepare_mask  s   z!SANMVadEncoderExport.prepare_maskspeechspeech_lengths	vad_masksr   c                 C   s   || j d  }| |}| ||}| ||}| jd u r |}n| |}| j||}|d |d }}t| jjD ]\}	}
|	t| jjd krK|}|
||}|d |d }}q;| j	|}||fS )Nr   r   r3   )
rw   r
   r   rh   r   r}   r   r~   r   r   )r-   r   r   r   r   r=   r   r   r   r   r   r0   r0   r1   rC     s    



zSANMVadEncoderExport.forwardc                 C   s   | j jd jS )Nr   )r   r~   r&   r   r0   r0   r1   get_output_size  s   z$SANMVadEncoderExport.get_output_size)r   r   r   T)rH   rI   rJ   r   r   r   r!   r   rC   r   rK   r0   r0   r.   r1   r     s&    
)+r!   typingr   r   r   funasr.registerr   funasr.models.ctc.ctcr   &funasr.models.transformer.utils.repeatr   $funasr.models.transformer.layer_normr   r   r	   *funasr.models.transformer.utils.nets_utilsr
   +funasr.models.transformer.utils.subsamplingr   r   #funasr.models.transformer.embeddingr   0funasr.models.transformer.utils.multi_layer_convr   $funasr.models.transformer.utils.maskr   r   r   3funasr.models.transformer.positionwise_feed_forwardr   0funasr.models.ct_transformer_streaming.attentionr   r   r   r   r   r"   Moduler   registerrM   r   r   r0   r0   r0   r1   <module>   s2    
 i
