o
    ic2                     @   s8  d Z ddlmZ ddlmZ ddlmZ ddlZddlmZ ddlZddlm	Z	 ddl
mZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddl m!Z! G dd dej"Z#e!$ddG dd dej"Z%dS )zTransformer encoder definition.    )List)Optional)TupleN)nn)MultiHeadedAttention)PositionalEncoding)	LayerNorm)Conv1dLinear)MultiLayeredConv1d)make_pad_mask)PositionwiseFeedForward)repeat)CTC)Conv2dSubsampling)Conv2dSubsampling2)Conv2dSubsampling6)Conv2dSubsampling8)TooShortUttError)check_short_utt)tablesc                       s2   e Zd ZdZ			d
 fdd	Zddd	Z  ZS )EncoderLayera%  Encoder layer module.

    Args:
        size (int): Input dimension.
        self_attn (torch.nn.Module): Self-attention module instance.
            `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance
            can be used as the argument.
        feed_forward (torch.nn.Module): Feed-forward module instance.
            `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
            can be used as the argument.
        dropout_rate (float): Dropout rate.
        normalize_before (bool): Whether to use layer_norm before the first block.
        concat_after (bool): Whether to concat attention layer's input and output.
            if True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            if False, no additional linear will be applied. i.e. x -> x + att(x)
        stochastic_depth_rate (float): Proability to skip this layer.
            During training, the layer may skip residual computation and return input
            as-is with given probability.
    TF        c                    sj   t    || _|| _t|| _t|| _t|| _	|| _
|| _|| _| jr0t|| || _|| _dS )z!Construct an EncoderLayer object.N)super__init__	self_attnfeed_forwardr   norm1norm2r   Dropoutdropoutsizenormalize_beforeconcat_afterLinearconcat_linearstochastic_depth_rate)selfr    r   r   dropout_rater!   r"   r%   	__class__ U/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/transformer/encoder.pyr   8   s   



zEncoderLayer.__init__Nc           	   
   C   s  d}d}| j r| jdkrtd | jk }dd| j  }|r0|dur,tj||gdd}||fS |}| jr:| |}|du rA|}n@|j|jd |jd d | j	fksTJ |ddddddf }|ddddddf }|du rtdn|ddddddf }| j
rtj|| ||||fdd}||| |  }n||| | ||||  }| js| |}|}| jr| |}||| | |  }| js| |}|durtj||gdd}||fS )a  Compute encoded features.

        Args:
            x_input (torch.Tensor): Input tensor (#batch, time, size).
            mask (torch.Tensor): Mask tensor for the input (#batch, time).
            cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size).

        Returns:
            torch.Tensor: Output tensor (#batch, time, size).
            torch.Tensor: Mask tensor (#batch, time).

        Fg      ?r      N)dim)trainingr%   torchranditemcatr!   r   shaper    r"   r   r$   r   r   r   )	r&   xmaskcache
skip_layerstoch_layer_coeffresidualx_qx_concatr*   r*   r+   forwardP   sB   
&&


zEncoderLayer.forward)TFr   N)__name__
__module____qualname____doc__r   r=   __classcell__r*   r*   r(   r+   r   "   s    r   encoder_classesTransformerEncoderc                !       s   e Zd ZdZdddddddded	d
dddg d
fdededededededededee de	de	dededede
e de	f  fddZd efd!d"Z	#	#d*d$ejd%ejd&ejd'ed eejejeej f f
d(d)Z  ZS )+rE   ac  Transformer encoder module.

    Args:
        input_size: input dim
        output_size: dimension of attention
        attention_heads: the number of heads of multi head attention
        linear_units: the number of units of position-wise feed forward
        num_blocks: the number of decoder blocks
        dropout_rate: dropout rate
        attention_dropout_rate: dropout rate in attention
        positional_dropout_rate: dropout rate after adding positional encoding
        input_layer: input layer type
        pos_enc_class: PositionalEncoding or ScaledPositionalEncoding
        normalize_before: whether to use layer_norm before the first block
        concat_after: whether to concat attention layer's input and output
            if True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            if False, no additional linear will be applied.
            i.e. x -> x + att(x)
        positionwise_layer_type: linear of conv1d
        positionwise_conv_kernel_size: kernel size of positionwise conv1d layer
        padding_idx: padding_idx for input_layer=embed
          i      g?r   conv2dTFlinearr,   r.   
input_sizeoutput_sizeattention_headslinear_units
num_blocksr'   positional_dropout_rateattention_dropout_rateinput_layerr!   r"   positionwise_layer_typepositionwise_conv_kernel_sizepadding_idxinterctc_layer_idxinterctc_use_conditioningc              
      s  t    | _|	dkr+tjtj|tjtjtj	 |
|| _
nb|	dkr7t|| _
nV|	dkrCt|| _
nJ|	dkrOt|| _
n>|	dkr[t|| _
n2|	dkrrtjtjj||d|
|| _
n|	d u r|kr~d | _
ntj|| _
ntd|	 | _|dkrt|fn|d	krt||fn|d
krt||fntdt| fdd| _| jrt| _|| _t|dkrdt|k rt||k sJ || _d | _d S )NrJ   rI   conv2d2conv2d6conv2d8embed)rU   zunknown input_layer: conv1dzconv1d-linearzSupport only linear or conv1d.c                    s   t t  S r>   )r   r   )lnumrQ   rM   r"   r'   r!   rL   positionwise_layerpositionwise_layer_argsr*   r+   <lambda>   s    
z-TransformerEncoder.__init__.<locals>.<lambda>r   )r   r   _output_sizer0   r   
Sequentialr#   r   r   ReLUr[   r   r   r   r   	Embedding
ValueErrorr!   r   r
   r	   NotImplementedErrorr   encoders
after_normrV   lenminmaxrW   conditioning_layer)r&   rK   rL   rM   rN   rO   r'   rP   rQ   rR   pos_enc_classr!   r"   rS   rT   rU   rV   rW   r(   r^   r+   r      sz   




zTransformerEncoder.__init__returnc                 C   s   | j S r>   )rb   )r&   r*   r*   r+   rL     s   zTransformerEncoder.output_sizeNxs_padilensprev_statesctcc                 C   s  t |dddddf  |j}| jdu r|}nIt| jts2t| jts2t| jts2t| jtr^t	| j|
d\}}|rUtd|
d dd| d |
d|| ||\}}n| |}g }t| jdkru| ||\}}n:t| jD ]4\}	}
|
||\}}|	d | jv r|}| jr| |}||	d |f | jr||}|| | }qz| jr| |}|dd}t|dkr||f|dfS ||dfS )zEmbed positions in tensor.

        Args:
            xs_pad: input tensor (B, L, D)
            ilens: input length (B)
            prev_states: Not to be used now.
        Returns:
            position embedded tensor and mask
        Nr,   zhas z) frames and is too short for subsampling z(it needs more than z frames), return empty resultsr   )r   todevicer[   
isinstancer   r   r   r   r   r    r   rj   rV   rh   	enumerater!   ri   appendrW   softmaxrm   squeezesum)r&   rp   rq   rr   rs   masksshort_status
limit_sizeintermediate_outs	layer_idxencoder_layerencoder_outctc_outolensr*   r*   r+   r=   	  sT   $










zTransformerEncoder.forward)NN)r?   r@   rA   rB   r   intfloatr   strboolr   r   rL   r0   Tensorr   r   r=   rC   r*   r*   r(   r+   rE      s    	
a)&rB   typingr   r   r   r0   r   logging#funasr.models.transformer.attentionr   #funasr.models.transformer.embeddingr   $funasr.models.transformer.layer_normr   0funasr.models.transformer.utils.multi_layer_convr	   r
   *funasr.models.transformer.utils.nets_utilsr   3funasr.models.transformer.positionwise_feed_forwardr   &funasr.models.transformer.utils.repeatr   funasr.models.ctc.ctcr   +funasr.models.transformer.utils.subsamplingr   r   r   r   r   r   funasr.registerr   Moduler   registerrE   r*   r*   r*   r+   <module>   s4   
i