o
    iyO                     @   s
  d Z ddlZddlmZmZmZmZ ddlZddlZddl	m
Z
 ddlmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZmZmZmZ dd	lmZ dd
lmZ ddlmZm Z m!Z!m"Z"m#Z#m$Z$ ddl%m&Z& G dd dej
j'Z(e&)ddG dd de
j'Z*dS )u   Branchformer encoder definition.

Reference:
    Yifan Peng, Siddharth Dalmia, Ian Lane, and Shinji Watanabe,
    “Branchformer: Parallel MLP-Attention Architectures to Capture
    Local and Global Context for Speech Recognition and Understanding,”
    in Proceedings of ICML, 2022.

    N)ListOptionalTupleUnion)ConvolutionalGatingMLP)FastSelfAttention)make_pad_mask)%LegacyRelPositionMultiHeadedAttentionMultiHeadedAttentionRelPositionMultiHeadedAttention)LegacyRelPositionalEncodingPositionalEncodingRelPositionalEncodingScaledPositionalEncoding)	LayerNorm)repeat)Conv2dSubsamplingConv2dSubsampling2Conv2dSubsampling6Conv2dSubsampling8TooShortUttErrorcheck_short_utt)tablesc                       sd   e Zd ZdZ			ddedeejj deejj de	de
d	e	d
e	de	f fddZdddZ  ZS )BranchformerEncoderLayeraw  Branchformer encoder layer module.

    Args:
        size (int): model dimension
        attn: standard self-attention or efficient attention, optional
        cgmlp: ConvolutionalGatingMLP, optional
        dropout_rate (float): dropout probability
        merge_method (str): concat, learned_ave, fixed_ave
        cgmlp_weight (float): weight of the cgmlp branch, between 0 and 1,
            used if merge_method is fixed_ave
        attn_branch_drop_rate (float): probability of dropping the attn branch,
            used if merge_method is learned_ave
        stochastic_depth_rate (float): stochastic depth probability
          ?        sizeattncgmlpdropout_ratemerge_methodcgmlp_weightattn_branch_drop_ratestochastic_depth_ratec	           	         s  t    |d us|d usJ d|| _|| _|| _|| _|| _|| _|| _|d uo-|d u| _	|d ur8t
|| _|d urAt
|| _t
|| _tj|| _| j	r|dkr`tj|| || _d S |dkrtj|d| _tj|d| _tj|d| _tj|d| _tj||| _d S |dkrd|  krdksJ d J d|dkrd	| _	d | _d | _n|dkrd	| _	d | _d | _tj||| _d S td
| tj | _d S )Nz#At least one branch should be validconcatlearned_ave   	fixed_aver         ?z*cgmlp weight should be between 0.0 and 1.0Funknown merge method: )super__init__r   r   r   r    r!   r"   r#   use_two_branchesr   norm_mhanorm_mlp
norm_finaltorchnnDropoutdropoutLinear
merge_projpooling_proj1pooling_proj2weight_proj1weight_proj2
ValueErrorIdentity)	selfr   r   r   r   r    r!   r"   r#   	__class__ V/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/branchformer/encoder.pyr+   A   sL   



$z!BranchformerEncoderLayer.__init__Nc              
   C   sX  |durt dt|tr|d |d }}n|d}}d}d}| jr9| jdkr9td | jk }dd| j  }|rV|durHtj||gdd}|durR||f|fS ||fS |}|}	| j	dur| 
|}t| j	trq| 	||}
n|dur| 	|||||}
n| 	||||}
| |
}| jdur| |	}	|dur|	|f}	| |	|}	t|	tr|	d }	| |	}	| jr| jdkr||| | tj||	gd	d  }nH| jd
kr| jr| jdkrtd | jk rd\}}n| |dd| jd  }|dur)tttjd|jd jj}||d|}tj|d	d|dd}ntj|d	d}t|| d}| !|}| "|	dd| jd  }|durztttjd|jd jj}||d|}tj|d	d|dd}ntj|d	d}t||	 d}| #|}tjtj||gd	dd	d}|$d	$d	}|dddf |dddf }}||| | || ||	    }nO| jdkr||| | d| j% | | j%|	    }n2t&d| j | j	du r||| | |	  }n| jdu r||| | |  }nt&d| '|}|dur(||f|fS ||fS )aA  Compute encoded features.

        Args:
            x_input (Union[Tuple, torch.Tensor]): Input tensor w/ or w/o pos emb.
                - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)].
                - w/o pos emb: Tensor (#batch, time, size).
            mask (torch.Tensor): Mask tensor for the input (#batch, 1, time).
            cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size).

        Returns:
            torch.Tensor: Output tensor (#batch, time, size).
            torch.Tensor: Mask tensor (#batch, time).
        Nz&cache is not None, which is not testedr   r&   Fr(   )dimr$   r%   )r   r(      r   )dtyper   r'   r)   z0Both branches are not None, which is unexpected.)(NotImplementedError
isinstancetupletrainingr#   r0   randitemcatr   r-   r   r3   r   r.   r,   r    r5   r"   r6   	transposer   floatnumpyfinfotensorrD   minmasked_filleqsoftmaxmatmulsqueezer8   r7   r9   	unsqueezer!   RuntimeErrorr/   )r<   x_inputmaskcachexpos_emb
skip_layerstoch_layer_coeffx1x2x_attw1w2score1	min_valuepooled1weight1score2pooled2weight2merge_weightsr?   r?   r@   forward   s   















"&


z BranchformerEncoderLayer.forward)r   r   r   N)__name__
__module____qualname____doc__intr   r0   r1   ModulerM   strr+   rm   __classcell__r?   r?   r=   r@   r   1   s.    

	Er   encoder_classesBranchformerEncoderc                2       s  e Zd ZdZ											
										
		d5dededededededededededededed eee	e f d!eee	e f d"ed#ed$ed%ed&e
e d'ed(ed)eee	e f f. fd*d+Zd,efd-d.Z	/d6d0ejd1ejd2ejd,eejeje
ej f fd3d4Z  ZS )7rx   zBranchformer encoder module.   T   rel_selfattnrel_poslatest      Fidentityr$   r   r      皙?conv2drB   
input_sizeoutput_sizeuse_attnattention_headsattention_layer_typepos_enc_layer_typerel_pos_type	use_cgmlpcgmlp_linear_unitscgmlp_conv_kerneluse_linear_after_convgate_activationr    r!   r"   
num_blocksr   positional_dropout_rateattention_dropout_rateinput_layer	zero_triupadding_idxr#   c                    s  t    | _|dkr|dkrd}|dkrd}n|dkr*|dks#J |dks)J ntd| |dkr7t}n,|d	kr>t}n%|dkrK|dksHJ t}n|dkr]|dksUJ t}t	d
 ntd| |dkrt
jt
j|t
jt
j||| _n|dkrt|||| _nv|dkrt|||| _nf|dkrt|||| _nV|dkrt|||| _nF|dkrt
jt
jj||d||| _n/t|t
jjrt
j|||| _n|d u r|krd | _nt
j|| _ntd| |dkrt||fnH|dkr.|dks!J t||ft	d n/|dkrC|dks:J t|||fn|dkrW|dv sOJ t||fntd| t|	|
||ft	trr	g| 	t	|krtdt	 d| dttrg| t|krtdt d| dt tr g|  t |krtdt  d| dt| 	
fdd| _ t| _!d S ) Nlegacyr|   legacy_rel_posr{   legacy_rel_selfattnr}   zunknown rel_pos_type: abs_posscaled_abs_posz=Using legacy_rel_pos and it will be deprecated in the future.zunknown pos_enc_layer: linearr   conv2d2conv2d6conv2d8embed)r   zunknown input_layer: selfattnzBUsing legacy_rel_selfattn and it will be deprecated in the future.fast_selfattn)r   r   zunknown encoder_attn_layer: z!Length of stochastic_depth_rate (z!) should be equal to num_blocks ()zLength of cgmlp_weight (z!Length of attn_branch_drop_rate (c              
      s:   t 
r nd r nd |   |  	|  S rn   )r   )lnumr"   cgmlp_layercgmlp_layer_argsr!   r   encoder_selfattn_layerencoder_selfattn_layer_argsr    r   r#   r   r   r?   r@   <lambda>  s    z.BranchformerEncoder.__init__.<locals>.<lambda>)"r*   r+   _output_sizer:   r   r   r   r   loggingwarningr0   r1   
Sequentialr4   r   r2   r   r   r   r   r   	EmbeddingrF   rt   r
   r	   r   r   r   rM   lenr   encoders
after_norm)r<   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r   r   r   r   r   r   r   r#   pos_enc_classr=   r   r@   r+     s  







	


 zBranchformerEncoder.__init__returnc                 C   s   | j S rn   )r   )r<   r?   r?   r@   r     s   zBranchformerEncoder.output_sizeNxs_padilensprev_statesc                 C   s  t |dddddf  |j}t| jts*t| jts*t| jts*t| jtrVt	| j|
d\}}|rMtd|
d dd| d |
d|| ||\}}n
| jdur`| |}| ||\}}t|trq|d }| |}|dd}||dfS )a  Calculate forward propagation.

        Args:
            xs_pad (torch.Tensor): Input tensor (#batch, L, input_size).
            ilens (torch.Tensor): Input length (#batch).
            prev_states (torch.Tensor): Not to be used now.

        Returns:
            torch.Tensor: Output tensor (#batch, L, output_size).
            torch.Tensor: Output length (#batch).
            torch.Tensor: Not to be used now.

        Nr&   zhas z) frames and is too short for subsampling z(it needs more than z frames), return empty resultsr   )r   todevicerF   r   r   r   r   r   r   r   r   r   rG   r   rV   sum)r<   r   r   r   masksshort_status
limit_sizeolensr?   r?   r@   rm     s6   $









zBranchformerEncoder.forward)ry   Trz   r{   r|   r}   Tr~   r   Fr   r$   r   r   r   r   r   r   r   FrB   r   rn   )ro   rp   rq   rr   rs   boolru   r   rM   r   r   r+   r   r0   Tensorr   rm   rv   r?   r?   r=   r@   rx     s    	
 A)+rr   r   typingr   r   r   r   rN   r0   torch.nnr1    funasr.models.branchformer.cgmlpr   %funasr.models.branchformer.fastformerr   *funasr.models.transformer.utils.nets_utilsr   #funasr.models.transformer.attentionr	   r
   r   #funasr.models.transformer.embeddingr   r   r   r   $funasr.models.transformer.layer_normr   &funasr.models.transformer.utils.repeatr   +funasr.models.transformer.utils.subsamplingr   r   r   r   r   r   funasr.registerr   rt   r   registerrx   r?   r?   r?   r@   <module>   s&   
 	 
f