o
    iG                     @   sD  d Z ddlmZ ddlmZ ddlmZ ddlZddlmZ ddlZddlm	Z	 ddl
mZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlm Z  ddlm!Z! ddlm"Z" ddlm#Z# ddlm$Z$ ddlm%Z% G dd dej&Z'G dd dej&Z(dS )zTransformer encoder definition.    )List)Optional)TupleN)nn)MultiHeadedAttention)PositionalEncoding)	LayerNorm)Conv1dLinear)MultiLayeredConv1d)make_pad_mask)PositionwiseFeedForward)repeat)DynamicConvolution)DynamicConvolution2D)LightweightConvolution)LightweightConvolution2D)Conv2dSubsampling)Conv2dSubsampling2)Conv2dSubsampling6)Conv2dSubsampling8)TooShortUttError)check_short_uttc                       s2   e Zd ZdZ			d
 fdd	Zddd	Z  ZS )EncoderLayera%  Encoder layer module.

    Args:
        size (int): Input dimension.
        self_attn (torch.nn.Module): Self-attention module instance.
            `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance
            can be used as the argument.
        feed_forward (torch.nn.Module): Feed-forward module instance.
            `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
            can be used as the argument.
        dropout_rate (float): Dropout rate.
        normalize_before (bool): Whether to use layer_norm before the first block.
        concat_after (bool): Whether to concat attention layer's input and output.
            if True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            if False, no additional linear will be applied. i.e. x -> x + att(x)
        stochastic_depth_rate (float): Proability to skip this layer.
            During training, the layer may skip residual computation and return input
            as-is with given probability.
    TF        c                    sn   t t|   || _|| _t|| _t|| _t	|| _
|| _|| _|| _| jr2t|| || _|| _dS )z!Construct an EncoderLayer object.N)superr   __init__	self_attnfeed_forwardr   norm1norm2r   Dropoutdropoutsizenormalize_beforeconcat_afterLinearconcat_linearstochastic_depth_rate)selfr"   r   r   dropout_rater#   r$   r'   	__class__ d/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/language_model/transformer_encoder.pyr   :   s   


zEncoderLayer.__init__Nc           	   
   C   s  d}d}| j r| jdkrtd | jk }dd| j  }|r0|dur,tj||gdd}||fS |}| jr:| |}|du rA|}n@|j|jd |jd d | j	fksTJ |ddddddf }|ddddddf }|du rtdn|ddddddf }| j
rtj|| ||||fdd}||| |  }n||| | ||||  }| js| |}|}| jr| |}||| | |  }| js| |}|durtj||gdd}||fS )a  Compute encoded features.

        Args:
            x_input (torch.Tensor): Input tensor (#batch, time, size).
            mask (torch.Tensor): Mask tensor for the input (#batch, time).
            cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size).

        Returns:
            torch.Tensor: Output tensor (#batch, time, size).
            torch.Tensor: Mask tensor (#batch, time).

        Fg      ?r      N)dim)trainingr'   torchranditemcatr#   r   shaper"   r$   r   r&   r!   r   r   )	r(   xmaskcache
skip_layerstoch_layer_coeffresidualx_qx_concatr,   r,   r-   forwardR   sB   
&&


zEncoderLayer.forward)TFr   N)__name__
__module____qualname____doc__r   r?   __classcell__r,   r,   r*   r-   r   $   s    r   c                       st   e Zd ZdZdddddddddd	d
eddddddd	dddf fdd	Z					dddZdd ZdddZ  Z	S )TransformerEncoder_lma  Transformer encoder module.

    Args:
        idim (int): Input dimension.
        attention_dim (int): Dimension of attention.
        attention_heads (int): The number of heads of multi head attention.
        conv_wshare (int): The number of kernel of convolution. Only used in
            selfattention_layer_type == "lightconv*" or "dynamiconv*".
        conv_kernel_length (Union[int, str]): Kernel size str of convolution
            (e.g. 71_71_71_71_71_71). Only used in selfattention_layer_type
            == "lightconv*" or "dynamiconv*".
        conv_usebias (bool): Whether to use bias in convolution. Only used in
            selfattention_layer_type == "lightconv*" or "dynamiconv*".
        linear_units (int): The number of units of position-wise feed forward.
        num_blocks (int): The number of decoder blocks.
        dropout_rate (float): Dropout rate.
        positional_dropout_rate (float): Dropout rate after adding positional encoding.
        attention_dropout_rate (float): Dropout rate in attention.
        input_layer (Union[str, torch.nn.Module]): Input layer type.
        pos_enc_class (torch.nn.Module): Positional encoding module class.
            `PositionalEncoding `or `ScaledPositionalEncoding`
        normalize_before (bool): Whether to use layer_norm before the first block.
        concat_after (bool): Whether to concat attention layer's input and output.
            if True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            if False, no additional linear will be applied. i.e. x -> x + att(x)
        positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
        positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
        selfattention_layer_type (str): Encoder attention layer type.
        padding_idx (int): Padding idx for input_layer=embed.
        stochastic_depth_rate (float): Maximum probability to skip the encoder layer.
        intermediate_layers (Union[List[int], None]): indices of intermediate CTC layer.
            indices start from 1.
            if not None, intermediate outputs are returned (which changes return type
            signature.)

          11F      皙?r   conv2dTlinearr.   selfattnr0   Nc                    s  t    d| _|dkr+tjtj| tj tjtj	 | |
| _
n|dkr:t| | _
d| _np|dkrMt| | |
| _
d| _n]|dkr\t| | _
d| _nN|dkrkt| | _
d	| _n?|d
krtjtjj| |d| |
| _
n(t|tjjrtj|| |
| _
n|du rtj| |
| _
ntd| 	| _| | ||\|dv rtd t| fg
 ns|dkrtd t fddt
D nX|dkrtd t fddt
D n<|dkrtd t fddt
D n |dkr;td t fddt
D nt|t
 	
f
dd| _| jr[t | _|| _ |durednd | _!| j!ry|| _"tj| | _#dS dS )!zConstruct an Encoder object.r.   rN   rM   rH   zconv2d-scaled-pos-encconv2d6rK   conv2d8   embed)padding_idxNzunknown input_layer: )rO   rel_selfattnlegacy_rel_selfattnz2encoder self-attention layer type = self-attention	lightconvz;encoder self-attention layer type = lightweight convolutionc              	      *   g | ]} t d | dfqS _Fintsplit.0lnumattention_dimattention_dropout_rateconv_kernel_lengthconv_usebiasconv_wsharer,   r-   
<listcomp>      	z2TransformerEncoder_lm.__init__.<locals>.<listcomp>lightconv2dzIencoder self-attention layer type = lightweight convolution 2-dimensionalc              	      rX   rY   r[   r^   ra   r,   r-   rg   "  rh   dynamicconvz7encoder self-attention layer type = dynamic convolutionc              	      rX   rY   r[   r^   ra   r,   r-   rg   0  rh   dynamicconv2dzEencoder self-attention layer type = dynamic convolution 2-dimensionalc              	      rX   rY   r[   r^   ra   r,   r-   rg   >  rh   c                    s0   t  |    	td|    S )Nr.   )r   float)r`   )
rb   r$   r)   encoder_selfattn_layerencoder_selfattn_layer_argsr#   
num_blockspositionwise_layerpositionwise_layer_argsr'   r,   r-   <lambda>N  s    
z0TransformerEncoder_lm.__init__.<locals>.<lambda>TF)$r   r   conv_subsampling_factorr2   r   
Sequentialr%   r   r    ReLUrS   r   r   r   	Embedding
isinstanceModule
ValueErrorr#   get_positionwise_layerlogginginfor   r   ranger   r   r   NotImplementedErrorr   encoders
after_normintermediate_layersuse_conditioningctc_softmaxconditioning_layer)r(   idimrb   attention_headsrf   rd   re   linear_unitsro   r)   positional_dropout_raterc   input_layerpos_enc_classr#   r$   positionwise_layer_typepositionwise_conv_kernel_sizeselfattention_layer_typerT   r'   r   r   conditioning_layer_dimr*   )rb   rc   r$   rd   re   rf   r)   rm   rn   r#   ro   rp   rq   r'   r-   r      s   




	
	

	

	
zTransformerEncoder_lm.__init__c                 C   sf   |dkrt }|||f}||fS |dkrt}||||f}||fS |dkr/t}||||f}||fS td)zDefine positionwise layer.rN   conv1dzconv1d-linearzSupport only linear or conv1d.)r   r
   r	   r~   )r(   r   rb   r   r)   r   rp   rq   r,   r,   r-   rz   a  s*   	
z,TransformerEncoder_lm.get_positionwise_layerc                 C   s   t | jtttfr| ||\}}n| |}| jdu r%| ||\}}n=g }t| jD ]5\}}|||\}}| jdura|d | jv ra|}| jrM| 	|}|
| | jra| |}|| | }q,| jrj| 	|}| jdurt|||fS ||fS )a>  Encode input sequence.

        Args:
            xs (torch.Tensor): Input tensor (#batch, time, idim).
            masks (torch.Tensor): Mask tensor (#batch, time).

        Returns:
            torch.Tensor: Output tensor (#batch, time, attention_dim).
            torch.Tensor: Mask tensor (#batch, time).

        Nr.   )rw   rS   r   r   r   r   r   	enumerater#   r   appendr   r   r   )r(   xsmasksintermediate_outputs	layer_idxencoder_layerencoder_outputintermediate_resultr,   r,   r-   r?     s4   








zTransformerEncoder_lm.forwardc                 C   s   t | jtr| ||\}}n| |}|du r$dd tt| jD }g }t|| jD ]\}}||||d\}}|| q,| jrG| 	|}|||fS )ad  Encode input frame.

        Args:
            xs (torch.Tensor): Input tensor.
            masks (torch.Tensor): Mask tensor.
            cache (List[torch.Tensor]): List of cache tensors.

        Returns:
            torch.Tensor: Output tensor.
            torch.Tensor: Mask tensor.
            List[torch.Tensor]: List of new cache tensors.

        Nc                 S   s   g | ]}d qS r@   r,   )r_   rZ   r,   r,   r-   rg     s    z:TransformerEncoder_lm.forward_one_step.<locals>.<listcomp>)r9   )
rw   rS   r   r}   lenr   zipr   r#   r   )r(   r   r   r9   	new_cachecer,   r,   r-   forward_one_step  s   


z&TransformerEncoder_lm.forward_one_step)rN   rG   rJ   rL   r.   r@   )
rA   rB   rC   rD   r   r   rz   r?   r   rE   r,   r,   r*   r-   rF      sD    ) 0
 0rF   ))rD   typingr   r   r   r2   r   r{   #funasr.models.transformer.attentionr   #funasr.models.transformer.embeddingr   $funasr.models.transformer.layer_normr   0funasr.models.transformer.utils.multi_layer_convr	   r
   *funasr.models.transformer.utils.nets_utilsr   3funasr.models.transformer.positionwise_feed_forwardr   &funasr.models.transformer.utils.repeatr   ,funasr.models.transformer.utils.dynamic_convr   .funasr.models.transformer.utils.dynamic_conv2dr   )funasr.models.transformer.utils.lightconvr   +funasr.models.transformer.utils.lightconv2dr   +funasr.models.transformer.utils.subsamplingr   r   r   r   r   r   rx   r   rF   r,   r,   r,   r-   <module>   s6   i