o
    i9                     @   s  d Z ddlZddlmZmZmZmZ ddlZddlm	Z	 ddl
mZ ddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZmZmZ ddlmZmZmZmZ ddlmZ ddl m!Z!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z- G dd deZ.dS )zConformer encoder definition.    N)ListOptionalTupleUnion)check_argument_types)CTC)
AbsEncoder)ConvolutionModuleEncoderLayer)get_activationmake_pad_mask)%LegacyRelPositionMultiHeadedAttentionMultiHeadedAttentionRelPositionMultiHeadedAttention)LegacyRelPositionalEncodingPositionalEncodingRelPositionalEncodingScaledPositionalEncoding)	LayerNorm)Conv1dLinearMultiLayeredConv1d)PositionwiseFeedForward)repeat)Conv2dSubsamplingConv2dSubsampling2Conv2dSubsampling6Conv2dSubsampling8TooShortUttErrorcheck_short_uttc                6       s  e Zd ZdZddddddddd	d
ddd
ddddd	d
ddg d
dfdededededededededededededed ed!ed"ed#ed$ed%ed&ed'ed(ed)ee d*ed+e	eee f f2 fd,d-Z
d.efd/d0Z	1	1d8d2ejd3ejd4ejd5ed.eejejeej f f
d6d7Z  ZS )9ConformerEncodera  Conformer encoder module.

    Args:
        input_size (int): Input dimension.
        output_size (int): Dimension of attention.
        attention_heads (int): The number of heads of multi head attention.
        linear_units (int): The number of units of position-wise feed forward.
        num_blocks (int): The number of decoder blocks.
        dropout_rate (float): Dropout rate.
        attention_dropout_rate (float): Dropout rate in attention.
        positional_dropout_rate (float): Dropout rate after adding positional encoding.
        input_layer (Union[str, torch.nn.Module]): Input layer type.
        normalize_before (bool): Whether to use layer_norm before the first block.
        concat_after (bool): Whether to concat attention layer's input and output.
            If True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            If False, no additional linear will be applied. i.e. x -> x + att(x)
        positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
        positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
        rel_pos_type (str): Whether to use the latest relative positional encoding or
            the legacy one. The legacy relative positional encoding will be deprecated
            in the future. More Details can be found in
            https://github.com/espnet/espnet/pull/2816.
        encoder_pos_enc_layer_type (str): Encoder positional encoding layer type.
        encoder_attn_layer_type (str): Encoder attention layer type.
        activation_type (str): Encoder activation function type.
        macaron_style (bool): Whether to use macaron style for positionwise layer.
        use_cnn_module (bool): Whether to use convolution module.
        zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
        cnn_module_kernel (int): Kernerl size of convolution module.
        padding_idx (int): Padding idx for input_layer=embed.

          i      g?g        conv2dTFlinear   legacyrel_posrel_selfattnswish   
input_sizeoutput_sizeattention_headslinear_units
num_blocksdropout_ratepositional_dropout_rateattention_dropout_rateinput_layernormalize_beforeconcat_afterpositionwise_layer_typepositionwise_conv_kernel_sizemacaron_stylerel_pos_typepos_enc_layer_typeselfattention_layer_typeactivation_typeuse_cnn_module	zero_triucnn_module_kernelpadding_idxinterctc_layer_idxinterctc_use_conditioningstochastic_depth_ratec                    s  t  sJ t   | _|dkr|dkrd}|dkrd}n|dkr/|dks(J |dks.J ntd| t|}|dkr@t}n,|d	krGt}n%|dkrT|dksQJ t}n|dkrf|dks^J t	}t
d
 ntd| |	dkrtjtj|tjtj||| _n|	dkrt|||| _np|	dkrt|||| _n`|	dkrt|||| _nP|	dkrt|||| _n@|	dkrtjtjj||d||| _n)t|	tjjrtj|	||| _n|	d u rtj||| _ntd|	 | _|dkrt	||f
n |dkr*t	||f
n|dkr8t	||f
ntd|dkrIt||fn4|dkrb|dksUJ t ||ft
d n|dkrw|dksnJ t!|||fntd| t"||ftt#rg| t$|krtdt$ d| dt%| 	
fdd| _&| jrt| _'|| _(t$|dkrdt)|k rt*||k sJ || _+d | _,d S ) Nr'   r(   legacy_rel_posr)   legacy_rel_selfattnlatestzunknown rel_pos_type: abs_posscaled_abs_posz=Using legacy_rel_pos and it will be deprecated in the future.zunknown pos_enc_layer: r%   r$   conv2d2conv2d6conv2d8embed)rB   zunknown input_layer: conv1dzconv1d-linearzSupport only linear or conv1d.selfattnzBUsing legacy_rel_selfattn and it will be deprecated in the future.zunknown encoder_attn_layer: z!Length of stochastic_depth_rate (z!) should be equal to num_blocks ()c                    s<   t  	
 r	
 nd r nd  |  	S Nr
   )lnumr7   convolution_layerconvolution_layer_argsr2   encoder_selfattn_layerencoder_selfattn_layer_argsr:   r6   r.   positionwise_layerpositionwise_layer_argsrE   r?    Y/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/asr/encoder/conformer_encoder.py<lambda>	  s    z+ConformerEncoder.__init__.<locals>.<lambda>r   )-r   super__init___output_size
ValueErrorr   r   r   r   r   loggingwarningtorchnn
SequentialLinearr   DropoutrN   r   r   r   r   	Embedding
isinstanceModuler6   r   r   r   NotImplementedErrorr   r   r   r	   floatlenr   encoders
after_normrC   minmaxrD   conditioning_layer)selfr-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   
activationpos_enc_class	__class__rT   r\   r_   R   s&  












"
 
zConformerEncoder.__init__returnc                 C   s   | j S rR   )r`   )rt   r[   r[   r\   r.     s   zConformerEncoder.output_sizeNxs_padilensprev_statesctcc                 C   s  t |dddddf  |j}t| jts*t| jts*t| jts*t| jtrVt	| j|
d\}}|rMtd|
d dd| d |
d|| ||\}}n| |}g }t| jdkrm| ||\}}nXt| jD ]R\}	}
|
||\}}|	d | jv r|}t|tr|d }| jr| |}||	d |f | jr||}t|tr|\}}|| | }||f}qr|| | }qrt|tr|d }| jr| |}|dd}t|dkr||f|dfS ||dfS )a  Calculate forward propagation.

        Args:
            xs_pad (torch.Tensor): Input tensor (#batch, L, input_size).
            ilens (torch.Tensor): Input length (#batch).
            prev_states (torch.Tensor): Not to be used now.

        Returns:
            torch.Tensor: Output tensor (#batch, L, output_size).
            torch.Tensor: Output length (#batch).
            torch.Tensor: Not to be used now.

        N   zhas z) frames and is too short for subsampling z(it needs more than z frames), return empty resultsr   )r   todevicerj   rN   r   r   r   r   r   sizer   rn   rC   ro   	enumeratetupler6   rp   appendrD   softmaxrs   squeezesum)rt   rz   r{   r|   r}   masksshort_status
limit_sizeintermediate_outs	layer_idxencoder_layerencoder_outctc_outxpos_embolensr[   r[   r\   forward!  s`   $













zConformerEncoder.forward)NN)__name__
__module____qualname____doc__intrm   strboolr   r   r_   r.   rd   Tensorr   r   r   r   __classcell__r[   r[   rw   r\   r    /   s    %	
 Mr    )/r   rb   typingr   r   r   r   rd   	typeguardr   espnet2.asr.ctcr   espnet2.asr.encoder.abs_encoderr   1espnet.nets.pytorch_backend.conformer.convolutionr	   3espnet.nets.pytorch_backend.conformer.encoder_layerr   &espnet.nets.pytorch_backend.nets_utilsr   r   1espnet.nets.pytorch_backend.transformer.attentionr   r   r   1espnet.nets.pytorch_backend.transformer.embeddingr   r   r   r   2espnet.nets.pytorch_backend.transformer.layer_normr   8espnet.nets.pytorch_backend.transformer.multi_layer_convr   r   Aespnet.nets.pytorch_backend.transformer.positionwise_feed_forwardr   .espnet.nets.pytorch_backend.transformer.repeatr   3espnet.nets.pytorch_backend.transformer.subsamplingr   r   r   r   r   r   r    r[   r[   r[   r\   <module>   s$    
