"""Transformer encoder definition."""

from typing import List
from typing import Optional
from typing import Tuple

import torch
from torch import nn
import logging

from funasr.models.transformer.attention import MultiHeadedAttention
from funasr.models.lcbnet.attention import MultiHeadedAttentionReturnWeight
from funasr.models.transformer.embedding import PositionalEncoding
from funasr.models.transformer.layer_norm import LayerNorm
from funasr.models.transformer.utils.nets_utils import make_pad_mask
from funasr.models.transformer.positionwise_feed_forward import PositionwiseFeedForward
from funasr.models.transformer.utils.repeat import repeat
from funasr.register import tables


class EncoderLayer(nn.Module):
    """Encoder layer module.

    Args:
        size (int): Input dimension.
        self_attn (torch.nn.Module): Self-attention module instance.
            `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance
            can be used as the argument.
        feed_forward (torch.nn.Module): Feed-forward module instance.
            `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
            can be used as the argument.
        dropout_rate (float): Dropout rate.
        normalize_before (bool): Whether to use layer_norm before the first block.
        concat_after (bool): Whether to concat attention layer's input and output.
            if True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            if False, no additional linear will be applied. i.e. x -> x + att(x)
        stochastic_depth_rate (float): Probability to skip this layer.
            During training, the layer may skip residual computation and return input
            as-is with given probability.
    """

    def __init__(
        self,
        size,
        self_attn,
        feed_forward,
        dropout_rate,
        normalize_before=True,
        concat_after=False,
        stochastic_depth_rate=0.0,
    ):
        """Construct an EncoderLayer object."""
        super(EncoderLayer, self).__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.norm1 = LayerNorm(size)
        self.norm2 = LayerNorm(size)
        self.dropout = nn.Dropout(dropout_rate)
        self.size = size
        self.normalize_before = normalize_before
        self.concat_after = concat_after
        if self.concat_after:
            self.concat_linear = nn.Linear(size + size, size)
        self.stochastic_depth_rate = stochastic_depth_rate

    def forward(self, x, mask, cache=None):
        """Compute encoded features.

        Args:
            x (torch.Tensor): Input tensor (#batch, time, size).
            mask (torch.Tensor): Mask tensor for the input (#batch, time).
            cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size).

        Returns:
            torch.Tensor: Output tensor (#batch, time, size).
            torch.Tensor: Mask tensor (#batch, time).

        Fg      ?r      Ndim)trainingr   torchranditemcatr   r   shaper   r   r   r   r   r   r   )	r   xmaskcache
        """
        skip_layer = False
        # With stochastic depth, the residual connection `x + f(x)` becomes
        # `x <- x + 1 / (1 - p) * f(x)` at training time.
        stoch_layer_coeff = 1.0
        if self.training and self.stochastic_depth_rate > 0:
            skip_layer = torch.rand(1).item() < self.stochastic_depth_rate
            stoch_layer_coeff = 1.0 / (1 - self.stochastic_depth_rate)

        if skip_layer:
            if cache is not None:
                x = torch.cat([cache, x], dim=1)
            return x, mask

        residual = x
        if self.normalize_before:
            x = self.norm1(x)

        if cache is None:
            x_q = x
        else:
            # Compute attention only for the last frame when a cache is given.
            assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size)
            x_q = x[:, -1:, :]
            residual = residual[:, -1:, :]
            mask = None if mask is None else mask[:, -1:, :]

        if self.concat_after:
            x_concat = torch.cat((x, self.self_attn(x_q, x, x, mask)), dim=-1)
            x = residual + stoch_layer_coeff * self.concat_linear(x_concat)
        else:
            x = residual + stoch_layer_coeff * self.dropout(self.self_attn(x_q, x, x, mask))
        if not self.normalize_before:
            x = self.norm1(x)

        residual = x
        if self.normalize_before:
            x = self.norm2(x)
        x = residual + stoch_layer_coeff * self.dropout(self.feed_forward(x))
        if not self.normalize_before:
            x = self.norm2(x)

        if cache is not None:
            x = torch.cat([cache, x], dim=1)

        return x, mask


@tables.register("encoder_classes", "TransformerTextEncoder")
class TransformerTextEncoder(nn.Module):
    """Transformer text encoder module.

    Args:
        input_size: input dim
        output_size: dimension of attention
        attention_heads: the number of heads of multi head attention
        linear_units: the number of units of position-wise feed forward
        num_blocks: the number of decoder blocks
        dropout_rate: dropout rate
        attention_dropout_rate: dropout rate in attention
        positional_dropout_rate: dropout rate after adding positional encoding
        input_layer: input layer type
        pos_enc_class: PositionalEncoding or ScaledPositionalEncoding
        normalize_before: whether to use layer_norm before the first block
        concat_after: whether to concat attention layer's input and output
            if True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            if False, no additional linear will be applied.
            i.e. x -> x + att(x)
        positionwise_layer_type: linear or conv1d
        positionwise_conv_kernel_size: kernel size of positionwise conv1d layer
        padding_idx: padding_idx for input_layer=embed
    """

    def __init__(
        self,
        input_size: int,
        output_size: int = 256,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        attention_dropout_rate: float = 0.0,
        pos_enc_class=PositionalEncoding,
        normalize_before: bool = True,
        concat_after: bool = False,
    ):
        super().__init__()
        self._output_size = output_size

        self.embed = torch.nn.Sequential(
            torch.nn.Embedding(input_size, output_size),
            pos_enc_class(output_size, positional_dropout_rate),
        )
        self.normalize_before = normalize_before

        positionwise_layer = PositionwiseFeedForward
        positionwise_layer_args = (
            output_size,
            linear_units,
            dropout_rate,
        )
        self.encoders = repeat(
            num_blocks,
            lambda lnum: EncoderLayer(
                output_size,
                MultiHeadedAttention(attention_heads, output_size, attention_dropout_rate),
                positionwise_layer(*positionwise_layer_args),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
        if self.normalize_before:
            self.after_norm = LayerNorm(output_size)

    def output_size(self) -> int:
        return self._output_size

    def forward(
        self,
        xs_pad: torch.Tensor,
        ilens: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
        """Embed positions in tensor.

        Args:
            xs_pad: input tensor (B, L, D)
            ilens: input length (B)
        Returns:
            position embedded tensor and mask
        """
        masks = (~make_pad_mask(ilens)[:, None, :]).to(xs_pad.device)

        xs_pad = self.embed(xs_pad)
        xs_pad, masks = self.encoders(xs_pad, masks)
        if self.normalize_before:
            xs_pad = self.after_norm(xs_pad)

        olens = masks.squeeze(1).sum(1)
        return xs_pad, olens, None


@tables.register("encoder_classes", "FusionSANEncoder")
class SelfSrcAttention(nn.Module):
    """Single decoder layer module.

    Args:
        size (int): Input dimension.
        self_attn (torch.nn.Module): Self-attention module instance.
            `MultiHeadedAttention` instance can be used as the argument.
        src_attn (torch.nn.Module): Source-attention (cross-attention) module instance.
            `MultiHeadedAttention` instance can be used as the argument.
        feed_forward (torch.nn.Module): Feed-forward module instance.
            `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
            can be used as the argument.
        dropout_rate (float): Dropout rate.
        normalize_before (bool): Whether to use layer_norm before the first block.
        concat_after (bool): Whether to concat attention layer's input and output.
            if True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            if False, no additional linear will be applied. i.e. x -> x + att(x)


    """

    def __init__(
        self,
        size,
        attention_heads,
        attention_dim,
        linear_units,
        self_attention_dropout_rate,
        src_attention_dropout_rate,
        positional_dropout_rate,
        dropout_rate,
        normalize_before=True,
        concat_after=False,
    ):
        """Construct an SelfSrcAttention object."""
        super(SelfSrcAttention, self).__init__()
        self.size = size
        self.self_attn = MultiHeadedAttention(
            attention_heads, attention_dim, self_attention_dropout_rate
        )
        self.src_attn = MultiHeadedAttentionReturnWeight(
            attention_heads, attention_dim, src_attention_dropout_rate
        )
        self.feed_forward = PositionwiseFeedForward(attention_dim, linear_units, dropout_rate)
        self.norm1 = LayerNorm(size)
        self.norm2 = LayerNorm(size)
        self.norm3 = LayerNorm(size)
        self.dropout = nn.Dropout(dropout_rate)
        self.normalize_before = normalize_before
        self.concat_after = concat_after
        if self.concat_after:
            self.concat_linear1 = nn.Linear(size + size, size)
            self.concat_linear2 = nn.Linear(size + size, size)

    def forward(self, tgt, tgt_mask, memory, memory_mask, cache=None):
        """Compute decoded features.

        Args:
            tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size).
            tgt_mask (torch.Tensor): Mask for input tensor (#batch, maxlen_out).
            memory (torch.Tensor): Encoded memory, float32 (#batch, maxlen_in, size).
            memory_mask (torch.Tensor): Encoded memory mask (#batch, maxlen_in).
            cache (List[torch.Tensor]): List of cached tensors.
                Each tensor shape should be (#batch, maxlen_out - 1, size).

        Returns:
            torch.Tensor: Output tensor (#batch, maxlen_out, size).
            torch.Tensor: Mask for output tensor (#batch, maxlen_out).
            torch.Tensor: Encoded memory (#batch, maxlen_in, size).
            torch.Tensor: Encoded memory mask (#batch, maxlen_in).

        """
        residual = tgt
        if self.normalize_before:
            tgt = self.norm1(tgt)

        if cache is None:
            tgt_q = tgt
            tgt_q_mask = tgt_mask
        else:
            # Compute self-attention only for the last frame when a cache is given.
            assert cache.shape == (
                tgt.shape[0],
                tgt.shape[1] - 1,
                self.size,
            ), f"{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}"
            tgt_q = tgt[:, -1:, :]
            residual = residual[:, -1:, :]
            tgt_q_mask = None
            if tgt_mask is not None:
                tgt_q_mask = tgt_mask[:, -1:, :]

        if self.concat_after:
            tgt_concat = torch.cat((tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)), dim=-1)
            x = residual + self.concat_linear1(tgt_concat)
        else:
            x = residual + self.dropout(self.self_attn(tgt_q, tgt, tgt, tgt_q_mask))
        if not self.normalize_before:
            x = self.norm1(x)

        residual = x
        if self.normalize_before:
            x = self.norm2(x)
        if self.concat_after:
            x_src, score = self.src_attn(x, memory, memory, memory_mask)
            x_concat = torch.cat((x, x_src), dim=-1)
            x = residual + self.concat_linear2(x_concat)
        else:
            x, score = self.src_attn(x, memory, memory, memory_mask)
            x = residual + self.dropout(x)
        if not self.normalize_before:
            x = self.norm2(x)

        residual = x
        if self.normalize_before:
            x = self.norm3(x)
        x = residual + self.dropout(self.feed_forward(x))
        if not self.normalize_before:
            x = self.norm3(x)

        if cache is not None:
            x = torch.cat([cache, x], dim=1)

        return x, tgt_mask, memory, memory_mask


@tables.register("encoder_classes", "ConvBiasPredictor")
class ConvPredictor(nn.Module):
    """Convolutional bias predictor.

    Cross-attends the text-encoder output to the ASR-encoder output and scores
    each text token with a depth-wise 1-D convolution head followed by a linear
    projection.
    """

    def __init__(
        self,
        size=256,
        l_order=3,
        r_order=3,
        attention_heads=4,
        attention_dropout_rate=0.1,
        linear_units=1024,
    ):
        super().__init__()
        self.atten = MultiHeadedAttention(attention_heads, size, attention_dropout_rate)
        self.norm1 = LayerNorm(size)
        self.feed_forward = PositionwiseFeedForward(size, linear_units, attention_dropout_rate)
        self.norm2 = LayerNorm(size)
        self.pad = nn.ConstantPad1d((l_order, r_order), 0)
        self.conv1d = nn.Conv1d(size, size, l_order + r_order + 1, groups=size)
        self.output_linear = nn.Linear(size, 1)
zConvPredictor.__init__c                 C   s   |}||  |||d  }|}| |}|| | }| |}|dd}| |}| |}|| }|dd}t|}| 	|}|
 dkrN|d}|S )Nr$      rx   )rz   r   r   r   	transposer|   r~   r)   relur   r&   r]   )r   text_encasr_encr3   contextqueriesrp   outputr"   r"   r#   r6   y  s    






zConvPredictor.forward)r@   rx   rx   rA   rD   rB   )r9   r:   r;   r   r6   r=   r"   r"   r    r#   rw   e  s    rw   )r<   typingr   r   r   r)   r   logging#funasr.models.transformer.attentionr   funasr.models.lcbnet.attentionr   #funasr.models.transformer.embeddingr   $funasr.models.transformer.layer_normr	   *funasr.models.transformer.utils.nets_utilsr
   3funasr.models.transformer.positionwise_feed_forwardr   &funasr.models.transformer.utils.repeatr   funasr.registerr   Moduler   registerr?   rf   rw   r"   r"   r"   r#   <module>   s.   
i
a 

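

# Minimal usage sketch: the vocabulary size, sequence lengths, and hyper-parameters
# below are arbitrary toy values chosen only to illustrate how the modules defined
# above fit together; they are not part of any published configuration.
if __name__ == "__main__":
    vocab_size = 100  # assumed toy vocabulary size
    text_encoder = TransformerTextEncoder(input_size=vocab_size, output_size=256, num_blocks=2)
    predictor = ConvPredictor(size=256)

    tokens = torch.randint(0, vocab_size, (2, 12))  # (batch, text_len) token ids
    token_lens = torch.tensor([12, 9])  # valid lengths per utterance
    asr_enc = torch.randn(2, 30, 256)  # stand-in for an ASR encoder output

    text_enc, text_lens, _ = text_encoder(tokens, token_lens)  # (2, 12, 256)
    bias_score = predictor(text_enc, asr_enc)  # (2, 12), one score per text token
    print(text_enc.shape, text_lens, bias_score.shape)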