o
    iQ                     @   s   d Z ddlZddlmZmZmZmZ ddlZddlZddl	m
Z
 ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZmZmZ dd
lmZmZmZmZ ddlmZ ddlmZ ddl m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& G dd dej'j(Z)G dd deZ*dS )u   Branchformer encoder definition.

Reference:
    Yifan Peng, Siddharth Dalmia, Ian Lane, and Shinji Watanabe,
    “Branchformer: Parallel MLP-Attention Architectures to Capture
    Local and Global Context for Speech Recognition and Understanding,”
    in Proceedings of ICML, 2022.

    N)ListOptionalTupleUnion)check_argument_types)
AbsEncoder)ConvolutionalGatingMLP)FastSelfAttention)make_pad_mask)%LegacyRelPositionMultiHeadedAttentionMultiHeadedAttentionRelPositionMultiHeadedAttention)LegacyRelPositionalEncodingPositionalEncodingRelPositionalEncodingScaledPositionalEncoding)	LayerNorm)repeat)Conv2dSubsamplingConv2dSubsampling2Conv2dSubsampling6Conv2dSubsampling8TooShortUttErrorcheck_short_uttc                       sd   e Zd ZdZ			ddedeejj deejj de	de
d	e	d
e	de	f fddZdddZ  ZS )BranchformerEncoderLayeraw  Branchformer encoder layer module.

    Args:
        size (int): model dimension
        attn: standard self-attention or efficient attention, optional
        cgmlp: ConvolutionalGatingMLP, optional
        dropout_rate (float): dropout probability
        merge_method (str): concat, learned_ave, fixed_ave
        cgmlp_weight (float): weight of the cgmlp branch, between 0 and 1,
            used if merge_method is fixed_ave
        attn_branch_drop_rate (float): probability of dropping the attn branch,
            used if merge_method is learned_ave
        stochastic_depth_rate (float): stochastic depth probability
          ?        sizeattncgmlpdropout_ratemerge_methodcgmlp_weightattn_branch_drop_ratestochastic_depth_ratec	           	         s  t    |d us|d usJ d|| _|| _|| _|| _|| _|| _|| _|d uo-|d u| _	|d ur8t
|| _|d urAt
|| _t
|| _tj|| _| j	r|dkr`tj|| || _d S |dkrtj|d| _tj|d| _tj|d| _tj|d| _tj||| _d S |dkrd|  krdksJ d J d|dkrd	| _	d | _d | _n|dkrd	| _	d | _d | _tj||| _d S td
| tj | _d S )Nz#At least one branch should be validconcatlearned_ave   	fixed_aver         ?z*cgmlp weight should be between 0.0 and 1.0Funknown merge method: )super__init__r   r   r   r!   r"   r#   r$   use_two_branchesr   norm_mhanorm_mlp
norm_finaltorchnnDropoutdropoutLinear
merge_projpooling_proj1pooling_proj2weight_proj1weight_proj2
ValueErrorIdentity)	selfr   r   r   r    r!   r"   r#   r$   	__class__ \/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/asr/encoder/branchformer_encoder.pyr,   @   sZ   




z!BranchformerEncoderLayer.__init__Nc              
   C   sX  |durt dt|tr|d |d }}n|d}}d}d}| jr9| jdkr9td | jk }dd| j  }|rV|durHtj||gdd}|durR||f|fS ||fS |}|}	| j	dur| 
|}t| j	trq| 	||}
n|dur| 	|||||}
n| 	||||}
| |
}| jdur| |	}	|dur|	|f}	| |	|}	t|	tr|	d }	| |	}	| jr| jdkr||| | tj||	gd	d  }nH| jd
kr| jr| jdkrtd | jk rd\}}n| |dd| jd  }|dur)tttjd|jd jj}||d|}tj|d	d|dd}ntj|d	d}t|| d}| !|}| "|	dd| jd  }|durztttjd|jd jj}||d|}tj|d	d|dd}ntj|d	d}t||	 d}| #|}tjtj||gd	dd	d}|$d	$d	}|dddf |dddf }}||| | || ||	    }nO| jdkr||| | d| j% | | j%|	    }n2t&d| j | j	du r||| | |	  }n| jdu r||| | |  }nt&d| '|}|dur(||f|fS ||fS )a>  Compute encoded features.

        Args:
            x_input (Union[Tuple, torch.Tensor]): Input tensor w/ or w/o pos emb.
                - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)].
                - w/o pos emb: Tensor (#batch, time, size).
            mask (torch.Tensor): Mask tensor for the input (#batch, time).
            cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size).

        Returns:
            torch.Tensor: Output tensor (#batch, time, size).
            torch.Tensor: Mask tensor (#batch, time).
        Nz&cache is not None, which is not testedr   r'   Fr)   )dimr%   r&   )r   r)      r   )dtyper   r(   r*   z0Both branches are not None, which is unexpected.)(NotImplementedError
isinstancetupletrainingr$   r1   randitemcatr   r.   r	   r4   r   r/   r-   r!   r6   r#   r7   	transposer   floatnumpyfinfotensorrE   minmasked_filleqsoftmaxmatmulsqueezer9   r8   r:   	unsqueezer"   RuntimeErrorr0   )r=   x_inputmaskcachexpos_emb
skip_layerstoch_layer_coeffx1x2x_attw1w2score1	min_valuepooled1weight1score2pooled2weight2merge_weightsr@   r@   rA   forward   s   


















"



z BranchformerEncoderLayer.forward)r   r   r   N)__name__
__module____qualname____doc__intr   r1   r2   ModulerN   strr,   rn   __classcell__r@   r@   r>   rA   r   0   s.    

	Ir   c                2       s  e Zd ZdZ											
										
		d5dededededededededededededed eee	e f d!eee	e f d"ed#ed$ed%ed&e
e d'ed(ed)eee	e f f. fd*d+Zd,efd-d.Z	/d6d0ejd1ejd2ejd,eejeje
ej f fd3d4Z  ZS )7BranchformerEncoderzBranchformer encoder module.   T   rel_selfattnrel_poslatest      Fidentityr%   r   r      皙?conv2drC   
input_sizeoutput_sizeuse_attnattention_headsattention_layer_typepos_enc_layer_typerel_pos_type	use_cgmlpcgmlp_linear_unitscgmlp_conv_kerneluse_linear_after_convgate_activationr!   r"   r#   
num_blocksr    positional_dropout_rateattention_dropout_rateinput_layer	zero_triupadding_idxr$   c                    s  t  sJ t   | _|dkr|dkrd}|dkrd}n|dkr/|dks(J |dks.J ntd| |dkr<t}n,|d	krCt}n%|dkrP|dksMJ t}n|dkrb|dksZJ t}t	
d
 ntd| |dkrtjtj|tjtj||| _n|dkrt|||| _nv|dkrt|||| _nf|dkrt|||| _nV|dkrt|||| _nF|dkrtjtjj||d||| _n/t|tjjrtj|||| _n|d u r|krd | _ntj|| _ntd| |dkrt||fnH|dkr3|dks&J t||ft	
d n/|dkrH|dks?J t|||fn|dkr\|dv sTJ t||fntd| t|	|
||ft	trw	g| 	t	|krtdt	 d| dttrg| t|krtdt d| dt tr g|  t |krtdt  d| dt | 	
fdd| _!t| _"d S ) Nlegacyr|   legacy_rel_posr{   legacy_rel_selfattnr}   zunknown rel_pos_type: abs_posscaled_abs_posz=Using legacy_rel_pos and it will be deprecated in the future.zunknown pos_enc_layer: linearr   conv2d2conv2d6conv2d8embed)r   zunknown input_layer: selfattnzBUsing legacy_rel_selfattn and it will be deprecated in the future.fast_selfattn)r   r   zunknown encoder_attn_layer: z!Length of stochastic_depth_rate (z!) should be equal to num_blocks ()zLength of cgmlp_weight (z!Length of attn_branch_drop_rate (c              
      s:   t 
r nd r nd |   |  	|  S ro   )r   )lnumr#   cgmlp_layercgmlp_layer_argsr"   r    encoder_selfattn_layerencoder_selfattn_layer_argsr!   r   r$   r   r   r@   rA   <lambda>  s    
z.BranchformerEncoder.__init__.<locals>.<lambda>)#r   r+   r,   _output_sizer;   r   r   r   r   loggingwarningr1   r2   
Sequentialr5   r   r3   r   r   r   r   r   	EmbeddingrG   ru   r   r   r   r	   r   rN   lenr   encoders
after_norm)r=   r   r   r   r   r   r   r   r   r   r   r   r   r!   r"   r#   r   r    r   r   r   r   r   r$   pos_enc_classr>   r   rA   r,   *  s$  








	


 zBranchformerEncoder.__init__returnc                 C   s   | j S ro   )r   )r=   r@   r@   rA   r     s   zBranchformerEncoder.output_sizeNxs_padilensprev_statesc                 C   s  t |dddddf  |j}t| jts*t| jts*t| jts*t| jtrVt	| j|
d\}}|rMtd|
d dd| d |
d|| ||\}}n
| jdur`| |}| ||\}}t|trq|d }| |}|dd}||dfS )a  Calculate forward propagation.

        Args:
            xs_pad (torch.Tensor): Input tensor (#batch, L, input_size).
            ilens (torch.Tensor): Input length (#batch).
            prev_states (torch.Tensor): Not to be used now.

        Returns:
            torch.Tensor: Output tensor (#batch, L, output_size).
            torch.Tensor: Output length (#batch).
            torch.Tensor: Not to be used now.

        Nr'   zhas z) frames and is too short for subsampling z(it needs more than z frames), return empty resultsr   )r
   todevicerG   r   r   r   r   r   r   r   r   r   rH   r   rW   sum)r=   r   r   r   masksshort_status
limit_sizeolensr@   r@   rA   rn     s6   $









zBranchformerEncoder.forward)ry   Trz   r{   r|   r}   Tr~   r   Fr   r%   r   r   r   r   r   r   r   FrC   r   ro   )rp   rq   rr   rs   rt   boolrv   r   rN   r   r   r,   r   r1   Tensorr   rn   rw   r@   r@   r>   rA   rx   '  s    	
 Hrx   )+rs   r   typingr   r   r   r   rO   r1   	typeguardr   espnet2.asr.encoder.abs_encoderr   espnet2.asr.layers.cgmlpr   espnet2.asr.layers.fastformerr	   &espnet.nets.pytorch_backend.nets_utilsr
   1espnet.nets.pytorch_backend.transformer.attentionr   r   r   1espnet.nets.pytorch_backend.transformer.embeddingr   r   r   r   2espnet.nets.pytorch_backend.transformer.layer_normr   .espnet.nets.pytorch_backend.transformer.repeatr   3espnet.nets.pytorch_backend.transformer.subsamplingr   r   r   r   r   r   r2   ru   r   rx   r@   r@   r@   rA   <module>   s$   
 
 x