o
    i:                     @   s   d Z ddlmZmZmZ ddlZddlmZ ddlm	Z	 ddl
mZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddlmZ ddlmZm Z m!Z!m"Z"m#Z#m$Z$ G dd deZ%dS )zConformer encoder definition.    )ListOptionalTupleN)check_argument_types)CTC)ConformerEncoder)ConvolutionModuleEncoderLayer)get_activationmake_pad_mask)PositionalEncoding)	LayerNorm)Conv1dLinearMultiLayeredConv1d)PositionwiseFeedForward)repeat)Conv2dSubsamplingConv2dSubsampling2Conv2dSubsampling6Conv2dSubsampling8TooShortUttErrorcheck_short_uttc                7       s  e Zd ZdZddddddddd	d
ddd
ddddd	d
ddg d
g dg ddfdedededededededededed ed!ed"ed#ed$ed%ed&ed'ed(ed)ed*ed+ed,ee d-ed.e	d/e	d0ef6 fd1d2Z
d3efd4d5Z	6	6d=d7ejd8ejd9ejd:ed3eejejeej f f
d;d<Z  ZS )>LongformerEncodera	  Longformer SA Conformer encoder module.

    Args:
        input_size (int): Input dimension.
        output_size (int): Dimension of attention.
        attention_heads (int): The number of heads of multi head attention.
        linear_units (int): The number of units of position-wise feed forward.
        num_blocks (int): The number of decoder blocks.
        dropout_rate (float): Dropout rate.
        attention_dropout_rate (float): Dropout rate in attention.
        positional_dropout_rate (float): Dropout rate after adding positional encoding.
        input_layer (Union[str, torch.nn.Module]): Input layer type.
        normalize_before (bool): Whether to use layer_norm before the first block.
        concat_after (bool): Whether to concat attention layer's input and output.
            If True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            If False, no additional linear will be applied. i.e. x -> x + att(x)
        positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
        positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
        rel_pos_type (str): Whether to use the latest relative positional encoding or
            the legacy one. The legacy relative positional encoding will be deprecated
            in the future. More Details can be found in
            https://github.com/espnet/espnet/pull/2816.
        encoder_pos_enc_layer_type (str): Encoder positional encoding layer type.
        encoder_attn_layer_type (str): Encoder attention layer type.
        activation_type (str): Encoder activation function type.
        macaron_style (bool): Whether to use macaron style for positionwise layer.
        use_cnn_module (bool): Whether to use convolution module.
        zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
        cnn_module_kernel (int): Kernerl size of convolution module.
        padding_idx (int): Padding idx for input_layer=embed.
        attention_windows (list): Layer-wise attention window sizes
            for longformer self-attn
        attention_dilation(list): Layer-wise attention dilation sizes
            for longformer self-attn
        attention_mode(str): Implementation for longformer self-attn.
            Default="sliding_chunks"
            Choose 'n2', 'tvm' or 'sliding_chunks'. More details in
            https://github.com/allenai/longformer

          i      g?g        conv2dTFlinear   legacyabs_poslf_selfattnswish   )d   r&   r&   r&   r&   r&   )   r'   r'   r'   r'   r'   sliding_chunks
input_sizeoutput_sizeattention_headslinear_units
num_blocksdropout_ratepositional_dropout_rateattention_dropout_rateinput_layernormalize_beforeconcat_afterpositionwise_layer_typepositionwise_conv_kernel_sizemacaron_stylerel_pos_typepos_enc_layer_typeselfattention_layer_typeactivation_typeuse_cnn_module	zero_triucnn_module_kernelpadding_idxinterctc_layer_idxinterctc_use_conditioningattention_windowsattention_dilationattention_modec           !         sR  t  sJ t | | _t|}|dkrt}ntd| d t||kr7tdtt| d t| t||krMtdtt| d t| |dkr_t	|dkr_td	| d
 |	dkr~t
jt
j|t
jt
j||| _n|	dkrt|||| _no|	dkrt|||| _n_|	dkrt|||| _nO|	dkrt|||| _n?|	dkrt
jt
jj||d||| _n(t|	t
jjrt
j|	||| _n|	d u rt
j||| _ntd|	 | _|dkrt	||f
n |dkrt	||f
n|dkr*t	||f
ntd|| _|dkrZ|dks=J ddlm} ddl m!} ||||d||d} | fntd| d t"||ft#| 	
fdd| _$| jrt| _%|| _&t|dkrdt'|k rt	||k sJ || _(d | _)d S ) Nr!   z$incorrect or unknown pos_enc_layer: zUse abs_posz0incorrect attention_dilation parameter of lengthz does not match num_blocksz/incorrect attention_windows parameter of lengthtvmr'   z'incorrect attention mode for dilation: z'Use attention_mode=tvm with Cuda Kernelr   r   conv2d2conv2d6conv2d8embed)r>   zunknown input_layer: conv1dzconv1d-linearzSupport only linear or conv1d.r"   r   )LongformerConfig)LongformerAttentionF)attention_windowrB   autoregressivenum_attention_headshidden_sizeattention_probs_dropout_probrC   z,incompatible or unknown encoder_attn_layer: z Use lf_selfattnc              	      s<   t | f  	
 r	
 nd r nd  S Nr	   )layer_idr3   convolution_layerconvolution_layer_argsr.   encoder_selfattn_layerencoder_selfattn_layer_argsr6   r2   r*   positionwise_layerpositionwise_layer_argsr;    Z/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/asr/encoder/longformer_encoder.py<lambda>  s    z,LongformerEncoder.__init__.<locals>.<lambda>)*r   super__init___output_sizer   r   
ValueErrorlenstrmaxtorchnn
SequentialLinearr   DropoutrH   r   r   r   r   	Embedding
isinstanceModuler2   r   r   r   NotImplementedErrorr9   longformer.longformerrJ   <espnet.nets.pytorch_backend.transformer.longformer_attentionrK   r   r   encoders
after_normr?   minr@   conditioning_layer)!selfr)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   
activationpos_enc_classrJ   rK   config	__class__rS   r[   r^   O   s,  








	
 
 
zLongformerEncoder.__init__returnc                 C   s   | j S rQ   )r_   )rs   rZ   rZ   r[   r*     s   zLongformerEncoder.output_sizeNxs_padilensprev_statesctcc                 C   sj  t |dddddf  |j}t| jts*t| jts*t| jts*t| jtrVt	| j|
d\}}|rMtd|
d dd| d |
d|| ||\}}n| |}| jdkr|jd }tdd	 | jD d
 }	|	||	  |	 }
tjj|ddd|
fdd}tjj|d|
fdd}| ||\}}g }t| jdkr| ||\}}nZt| jD ]T\}}|||\}}|d | jv r|}t|tr|d }| jr| |}||d |f | jr||}t|tr|\}}|| | }||f}q|| | }qt|tr|d }| jr| |}|dd}t|dkr0||f|dfS ||dfS )a  Calculate forward propagation.

        Args:
            xs_pad (torch.Tensor): Input tensor (#batch, L, input_size).
            ilens (torch.Tensor): Input length (#batch).
            prev_states (torch.Tensor): Not to be used now.

        Returns:
            torch.Tensor: Output tensor (#batch, L, output_size).
            torch.Tensor: Output length (#batch).
            torch.Tensor: Not to be used now.

        Nr'   zhas z) frames and is too short for subsampling z(it needs more than z frames), return empty resultsr"   c                 S   s   g | ]}|j jqS rZ   )	self_attnrL   ).0xrZ   rZ   r[   
<listcomp>F  s    z-LongformerEncoder.forward.<locals>.<listcomp>   r   constantF) r   todevicerj   rH   r   r   r   r   r   sizer   r9   shaperc   ro   rd   re   
functionalpadra   r?   	enumeratetupler2   rp   appendr@   softmaxrr   squeezesum)rs   rz   r{   r|   r}   masksshort_status
limit_sizeseq_lenrL   padding_lenintermediate_outs	layer_idxencoder_layerencoder_outctc_outr   pos_embolensrZ   rZ   r[   forward  sx   $















zLongformerEncoder.forward)NN)__name__
__module____qualname____doc__intfloatrb   boolr   listr^   r*   rd   Tensorr   r   r   r   __classcell__rZ   rZ   rw   r[   r   $   s    -	
 Jr   )&r   typingr   r   r   rd   	typeguardr   espnet2.asr.ctcr   %espnet2.asr.encoder.conformer_encoderr   1espnet.nets.pytorch_backend.conformer.convolutionr   3espnet.nets.pytorch_backend.conformer.encoder_layerr
   &espnet.nets.pytorch_backend.nets_utilsr   r   1espnet.nets.pytorch_backend.transformer.embeddingr   2espnet.nets.pytorch_backend.transformer.layer_normr   8espnet.nets.pytorch_backend.transformer.multi_layer_convr   r   Aespnet.nets.pytorch_backend.transformer.positionwise_feed_forwardr   .espnet.nets.pytorch_backend.transformer.repeatr   3espnet.nets.pytorch_backend.transformer.subsamplingr   r   r   r   r   r   r   rZ   rZ   rZ   r[   <module>   s     
