o
    iD                     @   sB  d dl mZ d dl mZ d dlZd dlZd dlmZ d dlmZ d dlm	Z	 d dlm
Z
 d dlmZmZmZ d d	lmZmZmZmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlm Z  d dlm!Z! d dlm"Z" d dlm#Z# d dl$m%Z% d dl&Z&G dd dej'Z(G dd de%Z)dS )    )Optional)TupleN)nnEncoderLayer)get_activation)make_pad_mask)MultiHeadedAttentionRelPositionMultiHeadedAttention%LegacyRelPositionMultiHeadedAttention)PositionalEncodingScaledPositionalEncodingRelPositionalEncodingLegacyRelPositionalEncoding)	LayerNorm)Conv1dLinear)MultiLayeredConv1d)PositionwiseFeedForward)repeat)Conv2dSubsampling)Conv2dSubsampling2)Conv2dSubsampling6)Conv2dSubsampling8)TooShortUttError)check_short_utt)
AbsEncoderc                       s2   e Zd ZdZe df fdd	Zdd Z  ZS )ConvolutionModulezConvolutionModule in Conformer model.
    Args:
        channels (int): The number of channels of conv layers.
        kernel_size (int): Kernerl size of conv layers.
    Tc              	      s   t t|   |d d dksJ tj|d| ddd|d| _tj|||d|d d ||d| _t|| _tj||ddd|d| _	|| _
dS )z&Construct an ConvolutionModule object.      r   )kernel_sizestridepaddingbias)r    r!   groupsr"   N)superr   __init__r   Conv1dpointwise_conv1depthwise_convBatchNorm1dnormpointwise_conv2
activation)selfchannelsr   r,   r"   	__class__ U/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/mfcca/mfcca_encoder.pyr%   /   s:   
	
zConvolutionModule.__init__c                 C   sV   | dd}| |}tjj|dd}| |}| | |}| |}| ddS )zCompute convolution module.
        Args:
            x (torch.Tensor): Input tensor (#batch, time, channels).
        Returns:
            torch.Tensor: Output tensor (#batch, time, channels).
        r   r   )dim)		transposer'   r   
functionalglur(   r,   r*   r+   )r-   xr1   r1   r2   forwardQ   s   


zConvolutionModule.forward)	__name__
__module____qualname____doc__r   ReLUr%   r8   __classcell__r1   r1   r/   r2   r   (   s    "r   c                -       s*  e Zd ZdZ											
			
							
		d7dededededededededededededed ed!ed"ed#ed$ed%ed&ed'ed(ef, fd)d*Zd+efd,d-Z		.d8d/e
jd0e
jd1e
jd2e
jd+ee
je
jee
j f f
d3d4Z	.d8d/e
jd0e
jd2e
jd+ee
je
jee
j f fd5d6Z  ZS )9MFCCAEncodera  Conformer encoder module.
    Args:
        input_size (int): Input dimension.
        output_size (int): Dimention of attention.
        attention_heads (int): The number of heads of multi head attention.
        linear_units (int): The number of units of position-wise feed forward.
        num_blocks (int): The number of decoder blocks.
        dropout_rate (float): Dropout rate.
        attention_dropout_rate (float): Dropout rate in attention.
        positional_dropout_rate (float): Dropout rate after adding positional encoding.
        input_layer (Union[str, torch.nn.Module]): Input layer type.
        normalize_before (bool): Whether to use layer_norm before the first block.
        concat_after (bool): Whether to concat attention layer's input and output.
            If True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            If False, no additional linear will be applied. i.e. x -> x + att(x)
        positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
        positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
        rel_pos_type (str): Whether to use the latest relative positional encoding or
            the legacy one. The legacy relative positional encoding will be deprecated
            in the future. More Details can be found in
            https://github.com/espnet/espnet/pull/2816.
        encoder_pos_enc_layer_type (str): Encoder positional encoding layer type.
        encoder_attn_layer_type (str): Encoder attention layer type.
        activation_type (str): Encoder activation function type.
        macaron_style (bool): Whether to use macaron style for positionwise layer.
        use_cnn_module (bool): Whether to use convolution module.
        zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
        cnn_module_kernel (int): Kernerl size of convolution module.
        padding_idx (int): Padding idx for input_layer=embed.
                皙?        conv2dTFlinear   legacyrel_posrel_selfattnswish   
input_sizeoutput_sizeattention_headslinear_units
num_blocksdropout_ratepositional_dropout_rateattention_dropout_rateinput_layernormalize_beforeconcat_afterpositionwise_layer_typepositionwise_conv_kernel_sizemacaron_stylerel_pos_typepos_enc_layer_typeselfattention_layer_typeactivation_typeuse_cnn_module	zero_triucnn_module_kernelpadding_idxc                    s  t    
| _|dkr|dkrd}|dkrd}n|dkr*|dks#J |dks)J ntd| t|}|dkr;t}n,|d	krBt}n%|dkrO|dksLJ t}n|dkra|dksYJ t}t	
d
 ntd| |	dkrtjtj|
tj
tj|
|| _no|	dkrt|
|
|| _n_|	dkrt|
|
|| _nO|	dkrt|
|
|| _n?|	dkrtjtjj|
|d|
|| _n(t|	tjjrtj|	|
|| _n|	d u rtj|
|| _ntd|	 	| _|dkrt
||fn |dkrt
||fn|dkr"t
||fntd|dkr3t|
|fn4|dkrL|dks?J t|
|ft	
d n|dkra|dksXJ t|
||fntd| t 
||ft|
|ft!| 	
fdd| _"| jrt
| _#tjj$ddddgddgd d!| _%tjj$dd"ddgddgd d!| _&tjj$d"dddgddgd d!| _'tjj$ddddgddgd d!| _(d S )#NrI   rJ   legacy_rel_posrK   legacy_rel_selfattnlatestzunknown rel_pos_type: abs_posscaled_abs_posz=Using legacy_rel_pos and it will be deprecated in the future.zunknown pos_enc_layer: rG   rF   conv2d6conv2d8embed)rd   zunknown input_layer: conv1dzconv1d-linearzSupport only linear or conv1d.selfattnzBUsing legacy_rel_selfattn and it will be deprecated in the future.zunknown encoder_attn_layer: c              
      s<   t 
   r nd r nd 	 	S Nr   )lnumrY   convolution_layerconvolution_layer_argsrT   encoder_selfattn_layerencoder_selfattn_layer_argsencoder_selfattn_layer_args_rawencoder_selfattn_layer_rawr\   rX   rP   positionwise_layerpositionwise_layer_argsra   r1   r2   <lambda>+  s    z'MFCCAEncoder.__init__.<locals>.<lambda>            r   )r   rH   )r    r!       ))r$   r%   _output_size
ValueErrorr   r   r   r   r   loggingwarningtorchr   
SequentialLinearr   Dropoutrl   r   r   r   	Embedding
isinstanceModulerX   r   r   r   NotImplementedErrorr	   r   r
   r   r   encoders
after_normConv2dconv1conv2conv3conv4)r-   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   r,   pos_enc_classr/   rq   r2   r%      s  









$
   $zMFCCAEncoder.__init__returnc                 C   s   | j S ro   )r   )r-   r1   r1   r2   rP   A  s   zMFCCAEncoder.output_sizeNxs_padilenschannel_sizeprev_statesc                 C   s  t |dddddf  |j}t| jts$t| jts$t| jtrPt| j|	d\}}|rGt
d|	d dd| d |	d|| ||\}}n| |}| |||\}}}t|trh|d }|	d}|	d}	|d	|||	}|d
k rtd
| }
|d|
dddddd
ddddf }| |}| |}| |}| |}| d	||	}|	d}|d	|||dddddddf }| jr| |}|dd}||dfS )  Calculate forward propagation.
        Args:
            xs_pad (torch.Tensor): Input tensor (#batch, L, input_size).
            ilens (torch.Tensor): Input length (#batch).
            prev_states (torch.Tensor): Not to be used now.
        Returns:
            torch.Tensor: Output tensor (#batch, L, output_size).
            torch.Tensor: Output length (#batch).
            torch.Tensor: Not to be used now.
        Nr   has ) frames and is too short for subsampling (it needs more than  frames), return empty resultsr   r   rN   r{   )r   todevicer   rl   r   r   r   r   sizer   r   tuplereshapemathceilr   r   r   r   r   squeezerX   r   sum)r-   r   r   r   r   masksshort_status
limit_sizet_lengd_dim
repeat_nummask_tmpolensr1   r1   r2   r8   D  sL   $







,




(

zMFCCAEncoder.forwardc                 C   s:  t |dddddf  |j}t| jts$t| jts$t| jtrPt| j|	d\}}|rGt
d|	d dd| d |	d|| ||\}}n| |}t| j}t| jD ]\}}	|	||\}}||d d krt|}
q_t|tr|d }|
d }
| jr| |}| |
| _|dd}||dfS )	r   Nr   r   r   r   r   r   r   )r   r   r   r   rl   r   r   r   r   r   r   lenr   	enumerater   rX   r   hidden_featurer   r   )r-   r   r   r   r   r   r   	num_layeridxencoderr   r   r1   r1   r2   forward_hidden~  s@   $








zMFCCAEncoder.forward_hidden)r@   rA   rB   rC   rD   rD   rE   rF   TFrG   rH   FrI   rJ   rK   rL   TFrM   rN   ro   )r9   r:   r;   r<   intfloatstrboolr%   rP   r   Tensorr   r   r8   r   r>   r1   r1   r/   r2   r?   h   s    #	
 9
>r?   )*typingr   r   r   r   r   )funasr.models.encoder.encoder_layer_mfccar   *funasr.models.transformer.utils.nets_utilsr   r   #funasr.models.transformer.attentionr	   r
   r   #funasr.models.transformer.embeddingr   r   r   r   $funasr.models.transformer.layer_normr   0funasr.models.transformer.utils.multi_layer_convr   r   3funasr.models.transformer.positionwise_feed_forwardr   &funasr.models.transformer.utils.repeatr   +funasr.models.transformer.utils.subsamplingr   r   r   r   r   r   !funasr.models.encoder.abs_encoderr   r   r   r   r?   r1   r1   r1   r2   <module>   s2    @