o
    i"                     @   s   d Z ddlmZmZmZ ddlZddlmZ ddlm	Z	 ddl
mZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddlmZ ddlmZmZm Z m!Z!m"Z"m#Z# G dd deZ$dS )zTransformer encoder definition.    )ListOptionalTupleN)check_argument_types)CTC)
AbsEncoder)make_pad_mask)MultiHeadedAttention)PositionalEncoding)EncoderLayer)	LayerNorm)Conv1dLinearMultiLayeredConv1d)PositionwiseFeedForward)repeat)Conv2dSubsamplingConv2dSubsampling2Conv2dSubsampling6Conv2dSubsampling8TooShortUttErrorcheck_short_uttc                !       s   e Zd ZdZdddddddded	d
dddg d
fdededededededededee de	de	dededede
e de	f  fddZd efd!d"Z	#	#d*d$ejd%ejd&ejd'ed eejejeej f f
d(d)Z  ZS )+TransformerEncoderac  Transformer encoder module.

    Args:
        input_size: input dim
        output_size: dimension of attention
        attention_heads: the number of heads of multi head attention
        linear_units: the number of units of position-wise feed forward
        num_blocks: the number of decoder blocks
        dropout_rate: dropout rate
        attention_dropout_rate: dropout rate in attention
        positional_dropout_rate: dropout rate after adding positional encoding
        input_layer: input layer type
        pos_enc_class: PositionalEncoding or ScaledPositionalEncoding
        normalize_before: whether to use layer_norm before the first block
        concat_after: whether to concat attention layer's input and output
            if True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            if False, no additional linear will be applied.
            i.e. x -> x + att(x)
        positionwise_layer_type: linear of conv1d
        positionwise_conv_kernel_size: kernel size of positionwise conv1d layer
        padding_idx: padding_idx for input_layer=embed
          i      g?g        conv2dTFlinear   
input_sizeoutput_sizeattention_headslinear_units
num_blocksdropout_ratepositional_dropout_rateattention_dropout_rateinput_layernormalize_beforeconcat_afterpositionwise_layer_typepositionwise_conv_kernel_sizepadding_idxinterctc_layer_idxinterctc_use_conditioningc              
      s  t  sJ t   | _|	dkr0tjtj|tjtj	tj
 |
|| _nb|	dkr<t|| _nV|	dkrHt|| _nJ|	dkrTt|| _n>|	dkr`t|| _n2|	dkrwtjtjj||d|
|| _n|	d u r|krd | _ntj|| _ntd|	 | _|dkrt|fn|d	krt||fn|d
krt||fntdt| fdd| _| jrt| _|| _t|dkrdt|k rt||k sJ || _d | _d S )Nr   r   conv2d2conv2d6conv2d8embed)r,   zunknown input_layer: conv1dzconv1d-linearzSupport only linear or conv1d.c                    s   t t  S N)r   r	   )lnumr&   r!   r)   r$   r(   r    positionwise_layerpositionwise_layer_args [/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/asr/encoder/transformer_encoder.py<lambda>   s    z-TransformerEncoder.__init__.<locals>.<lambda>r   ) r   super__init___output_sizetorchnn
SequentialLinearr   DropoutReLUr2   r   r   r   r   	Embedding
ValueErrorr(   r   r   r   NotImplementedErrorr   encoders
after_normr-   lenminmaxr.   conditioning_layer)selfr   r    r!   r"   r#   r$   r%   r&   r'   pos_enc_classr(   r)   r*   r+   r,   r-   r.   	__class__r6   r:   r=   =   s|   





zTransformerEncoder.__init__returnc                 C   s   | j S r4   )r>   )rN   r9   r9   r:   r       s   zTransformerEncoder.output_sizeNxs_padilensprev_statesctcc                 C   s  t |dddddf  |j}| jdu r|}nIt| jts2t| jts2t| jts2t| jtr^t	| j|
d\}}|rUtd|
d dd| d |
d|| ||\}}n| |}g }t| jdkru| ||\}}n:t| jD ]4\}	}
|
||\}}|	d | jv r|}| jr| |}||	d |f | jr||}|| | }qz| jr| |}|dd}t|dkr||f|dfS ||dfS )zEmbed positions in tensor.

        Args:
            xs_pad: input tensor (B, L, D)
            ilens: input length (B)
            prev_states: Not to be used now.
        Returns:
            position embedded tensor and mask
        Nr   zhas z) frames and is too short for subsampling z(it needs more than z frames), return empty resultsr   )r   todevicer2   
isinstancer   r   r   r   r   sizer   rJ   r-   rH   	enumerater(   rI   appendr.   softmaxrM   squeezesum)rN   rS   rT   rU   rV   masksshort_status
limit_sizeintermediate_outs	layer_idxencoder_layerencoder_outctc_outolensr9   r9   r:   forward   sT   $










zTransformerEncoder.forward)NN)__name__
__module____qualname____doc__r
   intfloatr   strboolr   r=   r    r?   Tensorr   r   ri   __classcell__r9   r9   rP   r:   r   $   s    	
dr   )%rm   typingr   r   r   r?   	typeguardr   espnet2.asr.ctcr   espnet2.asr.encoder.abs_encoderr   &espnet.nets.pytorch_backend.nets_utilsr   1espnet.nets.pytorch_backend.transformer.attentionr	   1espnet.nets.pytorch_backend.transformer.embeddingr
   5espnet.nets.pytorch_backend.transformer.encoder_layerr   2espnet.nets.pytorch_backend.transformer.layer_normr   8espnet.nets.pytorch_backend.transformer.multi_layer_convr   r   Aespnet.nets.pytorch_backend.transformer.positionwise_feed_forwardr   .espnet.nets.pytorch_backend.transformer.repeatr   3espnet.nets.pytorch_backend.transformer.subsamplingr   r   r   r   r   r   r   r9   r9   r9   r:   <module>   s     
