o
    iB                     @   s  d Z ddlZddlmZmZmZ ddlZddlmZ ddl	m
Z
 ddlmZ ddlmZ ddlmZmZ ddlmZmZmZ dd	lmZmZmZmZ dd
lmZ ddlmZ ddlm Z  ddl!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z' ddl(m)Z) G dd dejj*Z+e),ddG dd dej*Z-dS )zE-Branchformer encoder definition.
Reference:
    Kwangyoun Kim, Felix Wu, Yifan Peng, Jing Pan,
    Prashant Sridhar, Kyu J. Han, Shinji Watanabe,
    "E-Branchformer: Branchformer with Enhanced merging
    for speech recognition," in SLT 2022.
    N)ListOptionalTuple)CTC)ConvolutionalGatingMLP)FastSelfAttention)get_activationmake_pad_mask)%LegacyRelPositionMultiHeadedAttentionMultiHeadedAttentionRelPositionMultiHeadedAttention)LegacyRelPositionalEncodingPositionalEncodingRelPositionalEncodingScaledPositionalEncoding)	LayerNorm)PositionwiseFeedForward)repeat)Conv2dSubsamplingConv2dSubsampling2Conv2dSubsampling6Conv2dSubsampling8TooShortUttErrorcheck_short_utt)tablesc                       sd   e Zd ZdZ	ddedejjdejjdeejj deejj de	d	ef fd
dZ
dddZ  ZS )EBranchformerEncoderLayera  E-Branchformer encoder layer module.

    Args:
        size (int): model dimension
        attn: standard self-attention or efficient attention
        cgmlp: ConvolutionalGatingMLP
        feed_forward: feed-forward module, optional
        feed_forward: macaron-style feed-forward module, optional
        dropout_rate (float): dropout probability
        merge_conv_kernel (int): kernel size of the depth-wise conv in merge module
       sizeattncgmlpfeed_forwardfeed_forward_macarondropout_ratemerge_conv_kernelc              	      s   t    || _|| _|| _|| _|| _d| _| jd ur!t|| _	| jd ur.d| _t|| _
t|| _t|| _t|| _tj|| _tjj|| || |d|d d || dd| _tj|| || _d S )Ng      ?g      ?      T)kernel_sizestridepaddinggroupsbias)super__init__r   r   r   r    r!   ff_scaler   norm_ffnorm_ff_macaronnorm_mhanorm_mlp
norm_finaltorchnnDropoutdropoutConv1ddepthwise_conv_fusionLinear
merge_proj)selfr   r   r   r    r!   r"   r#   	__class__ X/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/e_branchformer/encoder.pyr,   =   s4   









	z"EBranchformerEncoderLayer.__init__Nc                 C   s  |durt dt|tr|d |d }}n|d}}| jdur5|}| |}|| j| | |  }|}|}| |}t| jt	rK| ||}	n|durY| |||||}	n| ||||}	| |	}| 
|}|durs||f}| ||}t|tr|d }| |}tj||gdd}
|
dd}| |}|dd}|| | |
|  }| jdur|}| |}|| j| | |  }| |}|dur||f|fS ||fS )a@  Compute encoded features.

        Args:
            x_input (Union[Tuple, torch.Tensor]): Input tensor w/ or w/o pos emb.
                - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)].
                - w/o pos emb: Tensor (#batch, time, size).
            mask (torch.Tensor): Mask tensor for the input (#batch, 1, time).
            cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size).
        Returns:
            torch.Tensor: Output tensor (#batch, time, size).
            torch.Tensor: Mask tensor (#batch, time).
        Nz&cache is not None, which is not testedr   r$   )dimr%   )NotImplementedError
isinstancetupler!   r/   r-   r6   r0   r   r   r1   r   r3   cat	transposer8   r:   r    r.   r2   )r;   x_inputmaskcachexpos_embresidualx1x2x_attx_concatx_tmpr>   r>   r?   forwardg   sL   












z!EBranchformerEncoderLayer.forward)r   N)__name__
__module____qualname____doc__intr3   r4   Moduler   floatr,   rR   __classcell__r>   r>   r<   r?   r   0   s&    

*r   encoder_classesEBranchformerEncoderc                5       s  e Zd ZdZ										
																					d;dededededededededededed ed!ed"ed#ee d$ed%ed&ed'ed(ed)ed*ed+ed,ed-ed.ef4 fd/d0Z	d1efd2d3Z
			d<d4ejd5ejd6ejd7ed8ed1eejejeej f fd9d:Z  ZS )=r]   zE-Branchformer encoder module.      rel_selfattnrel_poslatest      Fidentity   皙?        conv2dr@     swishlinearr   N
input_sizeoutput_sizeattention_headsattention_layer_typepos_enc_layer_typerel_pos_typecgmlp_linear_unitscgmlp_conv_kerneluse_linear_after_convgate_activation
num_blocksr"   positional_dropout_rateattention_dropout_rateinput_layer	zero_triupadding_idxlayer_drop_ratemax_pos_emb_lenuse_ffnmacaron_ffnffn_activation_typelinear_unitspositionwise_layer_typer#   interctc_use_conditioningc                    s  t    | _|dkr|dkrd}|dkrd}n|dkr*|dks#J |dks)J ntd| |dkr7t}n,|d	kr>t}n%|dkrK|dksHJ t}n|dkr]|dksUJ t}t	d
 ntd| |dkrt
jt
j|t
jt
j|||| _n|dkrt||||| _n||dkrt||||| _nk|dkrt||||| _nZ|dkrt||||| _nI|dkrt
jt
jj||d|||| _n1t|t
jjrt
j||||| _n|d u r
|krd | _nt
j|| _ntd| t|}|dkr"t||f	n|d u r-t	d ntd|dkr>t||fnH|dkrW|dksJJ t||ft	d n/|dkrl|dkscJ t|||fn|dkr|dv sxJ t||fntd| t |||	|
ft| 	
fdd|| _ t| _!|d u rg }|| _"t#|dkrdt$|k rt%||k sJ || _&d | _'d S )Nlegacyra   legacy_rel_posr`   legacy_rel_selfattnrb   zunknown rel_pos_type: abs_posscaled_abs_posz=Using legacy_rel_pos and it will be deprecated in the future.zunknown pos_enc_layer: rl   ri   conv2d2conv2d6conv2d8embed)r|   zunknown input_layer: zno macaron ffnzSupport only linear.selfattnzBUsing legacy_rel_selfattn and it will be deprecated in the future.fast_selfattn)r   r   zunknown encoder_attn_layer: c                    s>   t    
r	 nd 
rr	 S d S rS   )r   )lnumcgmlp_layercgmlp_layer_argsr"   encoder_selfattn_layerencoder_selfattn_layer_argsr   r#   rn   positionwise_layerpositionwise_layer_argsr   r>   r?   <lambda>d  s    z/EBranchformerEncoder.__init__.<locals>.<lambda>r   )(r+   r,   _output_size
ValueErrorr   r   r   r   loggingwarningr3   r4   
Sequentialr9   r   r5   r   r   r   r   r   	EmbeddingrC   rY   r   r   r   r
   r   r   r   r   encoders
after_norminterctc_layer_idxlenminmaxr   conditioning_layer)r;   rm   rn   ro   rp   rq   rr   rs   rt   ru   rv   rw   r"   rx   ry   rz   r{   r|   r}   r~   r   r   r   r   r   r#   r   r   pos_enc_class
activationr<   r   r?   r,      s  

















		

 
zEBranchformerEncoder.__init__returnc                 C   s   | j S rS   )r   )r;   r>   r>   r?   rn   y  s   z EBranchformerEncoder.output_sizexs_padilensprev_statesctc	max_layerc                 C   s4  t |dddddf  |j}t| jts*t| jts*t| jts*t| jtrVt	| j|
d\}}|rMtd|
d dd| d |
d|| ||\}}n
| jdur`| |}g }	t| jdkr|durd|  krzt| jk rn nt| jD ]\}
}|||\}}|
|kr nqn]| ||\}}nTt| jD ]N\}
}|||\}}|
d | jv r|}t|tr|d }|	|
d |f | jr||}t|trt|}|d | | |d< t|}q|| | }qt|tr|d }| |}|dd}t|	dkr||	f|dfS ||dfS )a!  Calculate forward propagation.

        Args:
            xs_pad (torch.Tensor): Input tensor (#batch, L, input_size).
            ilens (torch.Tensor): Input length (#batch).
            prev_states (torch.Tensor): Not to be used now.
            ctc (CTC): Intermediate CTC module.
            max_layer (int): Layer depth below which InterCTC is applied.
        Returns:
            torch.Tensor: Output tensor (#batch, L, output_size).
            torch.Tensor: Output length (#batch).
            torch.Tensor: Not to be used now.
        Nr$   zhas z) frames and is too short for subsampling z(it needs more than z frames), return empty resultsr   )r	   todevicerC   r   r   r   r   r   r   r   r   r   r   r   	enumeraterD   appendr   softmaxlistr   r   squeezesum)r;   r   r   r   r   r   masksshort_status
limit_sizeintermediate_outs	layer_idxencoder_layerencoder_outctc_outolensr>   r>   r?   rR   |  sj   $






&






zEBranchformerEncoder.forward)r^   r_   r`   ra   rb   rc   rd   Fre   rf   rg   rg   rh   ri   Fr@   rh   rj   FFrk   rc   rl   r   NF)NNN)rT   rU   rV   rW   rX   strboolrZ   r   r,   rn   r3   Tensorr   r   rR   r[   r>   r>   r<   r?   r]      s    	
 C).rW   r   typingr   r   r   r3   torch.nnr4   funasr.models.ctc.ctcr    funasr.models.branchformer.cgmlpr   %funasr.models.branchformer.fastformerr   *funasr.models.transformer.utils.nets_utilsr   r	   #funasr.models.transformer.attentionr
   r   r   #funasr.models.transformer.embeddingr   r   r   r   $funasr.models.transformer.layer_normr   3funasr.models.transformer.positionwise_feed_forwardr   &funasr.models.transformer.utils.repeatr   +funasr.models.transformer.utils.subsamplingr   r   r   r   r   r   funasr.registerr   rY   r   registerr]   r>   r>   r>   r?   <module>   s(     
