o
    i2                     @   sR  d dl mZ d dl mZ d dl mZ d dl mZ d dl mZ d dlZd dlZd dlm	Z	 d dl
mZ d dlZd dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlm Z  d dlm!Z! d dlm"Z" d dlm#Z# d dl$m%Z% d dl&m'Z' G dd de	j(Z)G dd de'Z*dS )    )List)Optional)Sequence)Tuple)UnionN)overlap_chunk)make_pad_mask)MultiHeadSelfAttention)SinusoidalPositionEncoder)	LayerNorm)Conv1dLinear)MultiLayeredConv1d)PositionwiseFeedForward)repeat)Conv2dSubsampling)Conv2dSubsampling2)Conv2dSubsampling6)Conv2dSubsampling8)TooShortUttError)check_short_utt)CTC)
AbsEncoderc                       s.   e Zd Z			d	 fdd	Zd
ddZ  ZS )EncoderLayerTF        c	           	         sz   t t|   || _|| _t|| _t|| _t	|| _
|| _|| _|| _|| _| jr5t|| || _|| _|| _dS )z!Construct an EncoderLayer object.N)superr   __init__	self_attnfeed_forwardr   norm1norm2nnDropoutdropoutin_sizesizenormalize_beforeconcat_afterLinearconcat_linearstochastic_depth_ratedropout_rate)	selfr#   r$   r   r   r*   r%   r&   r)   	__class__ e/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/sond/encoder/self_attention_encoder.pyr       s   


zEncoderLayer.__init__Nc           	   	   C   sf  d}d}| j r| jdkrtd | jk }dd| j  }|r0|dur,tj||gdd}||fS |}| jr:| |}| jrdtj|| j	|||dfdd}| j
| jkr\||| |  }n+|| | }n#| j
| jkrz||| | j	|||d  }n|| | j	|||d }| js| |}|}| jr| |}||| | |  }| js| |}||||fS )	a  Compute encoded features.

        Args:
            x_input (torch.Tensor): Input tensor (#batch, time, size).
            mask (torch.Tensor): Mask tensor for the input (#batch, time).
            cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size).

        Returns:
            torch.Tensor: Output tensor (#batch, time, size).
            torch.Tensor: Mask tensor (#batch, time).

        Fg      ?r      N)dim)mask_att_chunk_encoder)trainingr)   torchranditemcatr%   r   r&   r   r#   r$   r(   r"   r   r   )	r+   xmaskcacher2   
skip_layerstoch_layer_coeffresidualx_concatr.   r.   r/   forward;   sF   




zEncoderLayer.forward)TFr   NN)__name__
__module____qualname__r   r@   __classcell__r.   r.   r,   r/   r      s    r   c                %       s   e Zd ZdZdddddddded	d
dddg d
dddfdededededededededee de	de	dededede
e d e	d!ed"ef$ fd#d$Zd%efd&d'Z		d.d(ejd)ejd*ejd+ed%eejejeej f f
d,d-Z  ZS )/SelfAttentionEncoderzk
    Author: Speech Lab of DAMO Academy, Alibaba Group
    Self attention encoder in OpenNMT framework
          i      g?r   conv2dTFlinearr0   r3   encoderzseq2seq/encoderN
input_sizeoutput_sizeattention_headslinear_units
num_blocksr*   positional_dropout_rateattention_dropout_rateinput_layerr%   r&   positionwise_layer_typepositionwise_conv_kernel_sizepadding_idxinterctc_layer_idxinterctc_use_conditioning!tf2torch_tensor_name_prefix_torchtf2torch_tensor_name_prefix_tfc                    s@  t    | _dkr+tjtjtjtjtj	 |
|| _
nqdkr7t| _
nedkrCt| _
nYdkrOt| _
nMdkr[t| _
nAdkrptjtjj|dt | _
n,d u rkr|d | _
n tj| _
ndkrt | _
nd	krd | _
ntd
 | _|dkrt|f	n|dkrt||f	n|dkrt||f	ntdt| 	f
dd| _| jrt| _|| _t|dkrdt|k rt||k sJ || _d | _t| _ || _!|| _"|| _#|d urt|| _$d S d S )NrK   rJ   conv2d2conv2d6conv2d8embed)rW   penullzunknown input_layer: conv1dzconv1d-linearzSupport only linear or conv1d.c                    s`   | dkrt t 	 S t tdks"dkr$n 	 S )Nr   r`   ra   )r   r	   )lnum
rS   rO   r&   r*   rT   rM   r%   rN   positionwise_layerpositionwise_layer_argsr.   r/   <lambda>   s:   z/SelfAttentionEncoder.__init__.<locals>.<lambda>r   )%r   r   _output_sizer5   r    
Sequentialr'   r   r!   ReLUr_   r   r   r   r   	Embeddingr
   
ValueErrorr%   r   r   r   NotImplementedErrorr   encoders
after_normrX   lenminmaxrY   conditioning_layerr"   rZ   r[   	out_unitsoutput_linear)r+   rM   rN   rO   rP   rQ   r*   rR   rS   rT   pos_enc_classr%   r&   rU   rV   rW   rX   rY   rZ   r[   rt   r,   rd   r/   r   }   s   



"

zSelfAttentionEncoder.__init__returnc                 C   s   | j S )N)rh   )r+   r.   r.   r/   rN     s   z SelfAttentionEncoder.output_sizexs_padilensprev_statesctcc                 C   s  t |dddddf  |j}||  d  }| jdu r"|}nIt| jts:t| jts:t| jts:t| jt	rft
| j|d\}}|r]td|d dd| d |d|| ||\}}n| |}| |}g }t| jdkr| ||}	|	d |	d }}nAt| jD ];\}
}|||}	|	d |	d }}|
d | jv r|}| jr| |}||
d |f | jr||}|| | }q| jr| |}| jdur| |}|dd}t|dkr||f|dfS ||dfS )	zEmbed positions in tensor.

        Args:
            xs_pad: input tensor (B, L, D)
            ilens: input length (B)
            prev_states: Not to be used now.
        Returns:
            position embedded tensor and mask
        Ng      ?r0   zhas z) frames and is too short for subsampling z(it needs more than z frames), return empty resultsr   )r   todevicerN   r_   
isinstancer   r   r   r   r   r$   r   r"   rp   rX   rn   	enumerater%   ro   appendrY   softmaxrs   rt   ru   squeezesum)r+   rx   ry   rz   r{   masksshort_status
limit_sizeintermediate_outsencoder_outs	layer_idxencoder_layerencoder_outctc_outolensr.   r.   r/   r@     s`   $














zSelfAttentionEncoder.forwardrA   )rB   rC   rD   __doc__r
   intfloatr   strboolr   r   rN   r5   Tensorr   r   r@   rE   r.   r.   r,   r/   rF   w   s    	
 rF   )+typingr   r   r   r   r   loggingr5   torch.nnr     funasr.models.scama.chunk_utilisr   numpynp*funasr.models.transformer.utils.nets_utilsr   funasr.models.sond.attentionr	   #funasr.models.transformer.embeddingr
   $funasr.models.transformer.layer_normr   0funasr.models.transformer.utils.multi_layer_convr   r   3funasr.models.transformer.positionwise_feed_forwardr   &funasr.models.transformer.utils.repeatr   +funasr.models.transformer.utils.subsamplingr   r   r   r   r   r   funasr.models.ctcr   !funasr.models.encoder.abs_encoderr   Moduler   rF   r.   r.   r.   r/   <module>   s8    X