o
    il                     @   s  d Z ddlZddlmZmZmZmZmZ ddlZddlm	Z	 ddl
mZ ddlmZmZmZmZ ddlmZmZmZmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZm Z m!Z!m"Z" ddl#m$Z$ ddl%m&Z&m'Z' ddl(m)Z) ddl(m*Z* ddl(m+Z+ ddl(m,Z, ddl(mZ ddl(m Z  ddl(m-Z- ddl(m.Z. ddl/m0Z0 ddl1Z1G dd de	j2Z3G dd de	j2Z4e05ddG dd de	j2Z6G d d! d!ej	j2Z7G d"d# d#ej	j2Z8e05dd$G d%d& d&ej	j2Z9dS )'zConformer encoder definition.    N)UnionDictListTupleOptional)nn)CTC)MultiHeadedAttentionRelPositionMultiHeadedAttention%LegacyRelPositionMultiHeadedAttention$RelPositionMultiHeadedAttentionChunk)PositionalEncodingScaledPositionalEncodingRelPositionalEncodingLegacyRelPositionalEncodingStreamingRelPositionalEncoding)	LayerNorm)Conv1dLinear)MultiLayeredConv1d)get_activation)make_pad_mask)TooShortUttErrorcheck_short_uttmake_chunk_maskmake_source_mask)PositionwiseFeedForward)repeatMultiBlocks)Conv2dSubsampling)Conv2dSubsampling2)Conv2dSubsampling6)Conv2dSubsampling8)r   )r   )Conv2dSubsamplingPad)StreamingConvInput)tablesc                       s2   e Zd ZdZe df fdd	Zdd Z  ZS )ConvolutionModulezConvolutionModule in Conformer model.

    Args:
        channels (int): The number of channels of conv layers.
        kernel_size (int): Kernerl size of conv layers.

    Tc              	      s   t t|   |d d dksJ tj|d| ddd|d| _tj|||d|d d ||d| _t|| _tj||ddd|d| _	|| _
dS )z&Construct an ConvolutionModule object.      r   )kernel_sizestridepaddingbias)r)   r*   groupsr+   N)superr%   __init__r   Conv1dpointwise_conv1depthwise_convBatchNorm1dnormpointwise_conv2
activation)selfchannelsr(   r5   r+   	__class__ S/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/conformer/encoder.pyr.   >   s:   
	
zConvolutionModule.__init__c                 C   sV   | dd}| |}tjj|dd}| |}| | |}| |}| ddS )zCompute convolution module.

        Args:
            x (torch.Tensor): Input tensor (#batch, time, channels).

        Returns:
            torch.Tensor: Output tensor (#batch, time, channels).

        r&   r'   dim)		transposer0   r   
functionalglur1   r5   r3   r4   )r6   xr:   r:   r;   forward`   s   


zConvolutionModule.forward)	__name__
__module____qualname____doc__r   ReLUr.   rB   __classcell__r:   r:   r8   r;   r%   5   s    "r%   c                       s2   e Zd ZdZ			d
 fdd	Zddd	Z  ZS )EncoderLayera  Encoder layer module.

    Args:
        size (int): Input dimension.
        self_attn (torch.nn.Module): Self-attention module instance.
            `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance
            can be used as the argument.
        feed_forward (torch.nn.Module): Feed-forward module instance.
            `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
            can be used as the argument.
        feed_forward_macaron (torch.nn.Module): Additional feed-forward module instance.
            `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
            can be used as the argument.
        conv_module (torch.nn.Module): Convolution module instance.
            `ConvlutionModule` instance can be used as the argument.
        dropout_rate (float): Dropout rate.
        normalize_before (bool): Whether to use layer_norm before the first block.
        concat_after (bool): Whether to concat attention layer's input and output.
            if True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            if False, no additional linear will be applied. i.e. x -> x + att(x)
        stochastic_depth_rate (float): Proability to skip this layer.
            During training, the layer may skip residual computation and return input
            as-is with given probability.
    TF        c
           
         s   t t|   || _|| _|| _|| _t|| _t|| _	|dur*t|| _
d| _nd| _| jdur<t|| _t|| _t|| _|| _|| _|| _| jrWt|| || _|	| _dS )z!Construct an EncoderLayer object.N      ?      ?)r-   rI   r.   	self_attnfeed_forwardfeed_forward_macaronconv_moduler   norm_ffnorm_mhanorm_ff_macaronff_scale	norm_conv
norm_finalr   Dropoutdropoutsizenormalize_beforeconcat_afterLinearconcat_linearstochastic_depth_rate)
r6   rY   rM   rN   rO   rP   dropout_raterZ   r[   r^   r8   r:   r;   r.      s*   






zEncoderLayer.__init__Nc                 C   s  t |tr|d |d }}n|d}}d}d}| jr1| jdkr1td | jk }dd| j  }|rN|dur@tj||gdd}|durJ||f|fS ||fS | jdurt|}| j	r]| 
|}||| j | | |  }| j	st| 
|}|}| j	r~| |}|du r|}	n@|j|jd |jd d | jfksJ |ddddddf }	|ddddddf }|du rdn|ddddddf }|dur| |	||||}
n| |	|||}
| jrtj||
fdd}||| |  }n	||| |
  }| j	s| |}| jdur)|}| j	r| |}||| | |  }| j	s)| |}|}| j	r4| |}||| j | | |  }| j	sL| |}| jdurW| |}|duretj||gdd}|durp||f|fS ||fS )a?  Compute encoded features.

        Args:
            x_input (Union[Tuple, torch.Tensor]): Input tensor w/ or w/o pos emb.
                - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)].
                - w/o pos emb: Tensor (#batch, time, size).
            mask (torch.Tensor): Mask tensor for the input (#batch, time).
            cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size).

        Returns:
            torch.Tensor: Output tensor (#batch, time, size).
            torch.Tensor: Mask tensor (#batch, time).

        r   r&   NFrL   r<   )
isinstancetupletrainingr^   torchranditemcatrO   rZ   rS   rT   rX   rR   shaperY   rM   r[   r]   rP   rU   rQ   rN   rV   )r6   x_inputmaskcacherA   pos_emb
skip_layerstoch_layer_coeffresidualx_qx_attx_concatr:   r:   r;   rB      sz   





&&







zEncoderLayer.forward)TFrJ   N)rC   rD   rE   rF   r.   rB   rH   r:   r:   r8   r;   rI   z   s    "$rI   encoder_classesConformerEncoderc                6       s  e Zd ZdZddddddddd	d
ddd
ddddd	d
ddg d
dfdededededededededededededed ed!ed"ed#ed$ed%ed&ed'ed(ed)ee d*ed+e	eee f f2 fd,d-Z
d.efd/d0Z	1	1d8d2ejd3ejd4ejd5ed.eejejeej f f
d6d7Z  ZS )9ru   a  Conformer encoder module.

    Args:
        input_size (int): Input dimension.
        output_size (int): Dimension of attention.
        attention_heads (int): The number of heads of multi head attention.
        linear_units (int): The number of units of position-wise feed forward.
        num_blocks (int): The number of decoder blocks.
        dropout_rate (float): Dropout rate.
        attention_dropout_rate (float): Dropout rate in attention.
        positional_dropout_rate (float): Dropout rate after adding positional encoding.
        input_layer (Union[str, torch.nn.Module]): Input layer type.
        normalize_before (bool): Whether to use layer_norm before the first block.
        concat_after (bool): Whether to concat attention layer's input and output.
            If True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            If False, no additional linear will be applied. i.e. x -> x + att(x)
        positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
        positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
        rel_pos_type (str): Whether to use the latest relative positional encoding or
            the legacy one. The legacy relative positional encoding will be deprecated
            in the future. More Details can be found in
            https://github.com/espnet/espnet/pull/2816.
        encoder_pos_enc_layer_type (str): Encoder positional encoding layer type.
        encoder_attn_layer_type (str): Encoder attention layer type.
        activation_type (str): Encoder activation function type.
        macaron_style (bool): Whether to use macaron style for positionwise layer.
        use_cnn_module (bool): Whether to use convolution module.
        zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
        cnn_module_kernel (int): Kernerl size of convolution module.
        padding_idx (int): Padding idx for input_layer=embed.

                皙?rJ   conv2dTFlinear   legacyrel_posrel_selfattnswish   r`   
input_sizeoutput_sizeattention_headslinear_units
num_blocksr_   positional_dropout_rateattention_dropout_rateinput_layerrZ   r[   positionwise_layer_typepositionwise_conv_kernel_sizemacaron_stylerel_pos_typepos_enc_layer_typeselfattention_layer_typeactivation_typeuse_cnn_module	zero_triucnn_module_kernelpadding_idxinterctc_layer_idxinterctc_use_conditioningr^   c                    s  t    | _|dkr|dkrd}|dkrd}n|dkr*|dks#J |dks)J ntd| t|}|dkr;t}n,|d	krBt}n%|dkrO|dksLJ t}n|dkra|dksYJ t}t	
d
 ntd| |	dkrtjtj|tjtj||| _n|	dkrt|||| _n|	dkrt|||| _nq|	dkrt|||| _na|	dkrt|||| _nQ|	dkrt|||| _nA|	dkrtjtjj||d||| _n*t|	tjjrtj|	||| _n|	d u rtj||| _ntd|	 | _|dkr(t	||f
n |dkr6t	||f
n|dkrDt	||f
ntd|dkrUt||fn4|dkrn|dksaJ t ||ft	
d n|dkr|dkszJ t!|||fntd| t"||ftt#rg| t$|krtdt$ d| dt%| 	
fdd| _&| jrt| _'|| _(t$|d krd t)|k rt*||k sJ || _+d | _,d S )!Nr~   r   legacy_rel_posr   legacy_rel_selfattnlatestzunknown rel_pos_type: abs_posscaled_abs_posz=Using legacy_rel_pos and it will be deprecated in the future.zunknown pos_enc_layer: r|   r{   	conv2dpadconv2d2conv2d6conv2d8embed)r   zunknown input_layer: conv1dzconv1d-linearzSupport only linear or conv1d.selfattnzBUsing legacy_rel_selfattn and it will be deprecated in the future.zunknown encoder_attn_layer: z!Length of stochastic_depth_rate (z!) should be equal to num_blocks ()c                    s<   t  	
 r	
 nd r nd  |  	S rs   )rI   )lnumr[   convolution_layerconvolution_layer_argsr_   encoder_selfattn_layerencoder_selfattn_layer_argsr   rZ   r   positionwise_layerpositionwise_layer_argsr^   r   r:   r;   <lambda>  s    z+ConformerEncoder.__init__.<locals>.<lambda>r   )-r-   r.   _output_size
ValueErrorr   r   r   r   r   loggingwarningrd   r   
Sequentialr\   r   rW   r   r   r"   r   r    r!   	Embeddingra   ModulerZ   r   r   r   NotImplementedErrorr	   r   r
   r%   floatlenr   encoders
after_normr   minmaxr   conditioning_layer)r6   r   r   r   r   r   r_   r   r   r   rZ   r[   r   r   r   r   r   r   r   r   r   r   r   r   r   r^   r5   pos_enc_classr8   r   r;   r.   B  s&  











"
 
zConformerEncoder.__init__returnc                 C      | j S rs   r   r6   r:   r:   r;   r        zConformerEncoder.output_sizeNxs_padilensprev_statesctcc                 C   s  t |dddddf  |j}t| jts0t| jts0t| jts0t| jts0t| jt	r\t
| j|d\}}|rStd|d dd| d |d|| ||\}}n| |}g }t| jdkrs| ||\}}nXt| jD ]R\}	}
|
||\}}|	d | jv r|}t|tr|d }| jr| |}||	d |f | jr||}t|tr|\}}|| | }||f}qx|| | }qxt|tr|d }| jr| |}|dd}t|dkr||f|dfS ||dfS )a  Calculate forward propagation.

        Args:
            xs_pad (torch.Tensor): Input tensor (#batch, L, input_size).
            ilens (torch.Tensor): Input length (#batch).
            prev_states (torch.Tensor): Not to be used now.

        Returns:
            torch.Tensor: Output tensor (#batch, L, output_size).
            torch.Tensor: Output length (#batch).
            torch.Tensor: Not to be used now.

        Nr&   has ) frames and is too short for subsampling (it needs more than  frames), return empty resultsr   )r   todevicera   r   r   r   r    r!   r"   r   rY   r   r   r   r   	enumeraterb   rZ   r   appendr   softmaxr   squeezesum)r6   r   r   r   r   masksshort_status
limit_sizeintermediate_outs	layer_idxencoder_layerencoder_outctc_outrA   rl   olensr:   r:   r;   rB     sd   $














zConformerEncoder.forward)NN)rC   rD   rE   rF   intr   strboolr   r   r.   r   rd   Tensorr   r   r   rB   rH   r:   r:   r8   r;   ru     s    %	
 Mc                       s   e Zd ZdZej i dfdededejjde	de
dd	f fd
dZ			ddejdeej dedeejejf fddZ  ZS )CausalConvolutionaF  ConformerConvolution module definition.
    Args:
        channels: The number of channels.
        kernel_size: Size of the convolving kernel.
        activation: Type of activation function.
        norm_args: Normalization module arguments.
        causal: Whether to use causal convolution (set to True if streaming).
    Fr7   r(   r5   	norm_argscausalr   Nc                    s   t    |d d dksJ || _tjj|d| dddd| _|r*|d | _d}n	d| _|d d }tjj|||d||d| _tjj	|fi || _
tjj||dddd| _|| _dS )z)Construct an ConformerConvolution object.r&   r'   r   )r(   r)   r*   )r)   r*   r,   N)r-   r.   r(   rd   r   r/   r0   lorderr1   r2   r3   r4   r5   )r6   r7   r(   r5   r   r   r*   r8   r:   r;   r.   k  s@   
	

zCausalConvolution.__init__r   rA   rk   right_contextc                 C   s   |  |dd}tjjj|dd}| jdkrX|du r)tjj|| jdfdd}n/tj||gdd}|dkrI|dddd| j|  | f }n|dddd| j df }| 	|}| 
| |}| |dd}||fS )a  Compute convolution module.
        Args:
            x: ConformerConvolution input sequences. (B, T, D_hidden)
            cache: ConformerConvolution input cache. (1, conv_kernel, D_hidden)
            right_context: Number of frames in right context.
        Returns:
            x: ConformerConvolution output sequences. (B, T, D_hidden)
            cache: ConformerConvolution output cache. (1, conv_kernel, D_hidden)
        r&   r'   r<   r   NconstantrJ   )r0   r>   rd   r   r?   r@   r   padrg   r1   r5   r3   r4   )r6   rA   rk   r   r:   r:   r;   rB     s   
&
zCausalConvolution.forward)Nr   )rC   rD   rE   rF   rd   r   rG   r   r   r   r   r.   r   r   r   rB   rH   r:   r:   r8   r;   r   a  s:    4r   c                       s   e Zd ZdZei dfdedejjdejjdejjdejjdejjd	e	d
e
ddf fddZdedejddfddZ	ddejdejdejdeej deejejejf f
ddZ			d dejdejdejdedededeejejf fddZ  ZS )!ChunkEncoderLayera  Chunk Conformer module definition.
    Args:
        block_size: Input/output size.
        self_att: Self-attention module instance.
        feed_forward: Feed-forward module instance.
        feed_forward_macaron: Feed-forward module instance for macaron network.
        conv_mod: Convolution module instance.
        norm_class: Normalization module class.
        norm_args: Normalization module arguments.
        dropout_rate: Dropout rate.
    rJ   
block_sizeself_attrN   rO   conv_mod
norm_classr   r_   r   Nc	           	         s   t    || _|| _|| _d| _|| _||fi || _||fi || _||fi || _	||fi || _
||fi || _tj|| _|| _d| _dS )zConstruct a Conformer object.rK   N)r-   r.   r   rN   rO   feed_forward_scaler   norm_feed_forwardnorm_self_attnorm_macaronrU   rV   rd   r   rW   rX   r   rk   )	r6   r   r   rN   rO   r   r   r   r_   r8   r:   r;   r.     s   

zChunkEncoderLayer.__init__left_contextr   c                 C   s:   t jd|| jf|dt jd| j| jjd f|dg| _dS )zInitialize/Reset self-attention and convolution modules cache for streaming.
        Args:
            left_context: Number of left frames during chunk-by-chunk inference.
            device: Device to use for cache tensor.
        r&   r   N)rd   zerosr   r   r(   rk   r6   r   r   r:   r:   r;   reset_streaming_cache  s   


z'ChunkEncoderLayer.reset_streaming_cacherA   pos_encrj   
chunk_maskc                 C   s   |}|  |}|| j| | |  }|}| |}|}|| | j||||||d }|}| |}| |\}}|| | }|}| |}|| j| | 	|  }| 
|}|||fS )a  Encode input sequences.
        Args:
            x: Conformer input sequences. (B, T, D_block)
            pos_enc: Positional embedding sequences. (B, 2 * (T - 1), D_block)
            mask: Source mask. (B, T)
            chunk_mask: Chunk mask. (T_2, T_2)
        Returns:
            x: Conformer output sequences. (B, T, D_block)
            mask: Source mask. (B, T)
            pos_enc: Positional embedding sequences. (B, 2 * (T - 1), D_block)
        r   )r   r   rX   rO   r   r   rU   r   r   rN   rV   )r6   rA   r   rj   r   ro   rp   _r:   r:   r;   rB     s2   





zChunkEncoderLayer.forward   r   
chunk_sizer   c              	   C   s*  |}|  |}|| j| |  }|}| |}|dkr)tj| jd |gdd}n|}|}	|dkrC|dd||  | ddf }
n|dd| dddf }
|| j|||	|||d }|}| |}| j	|| jd |d\}}|| }|}| 
|}|| j| |  }| |}|
|g| _||fS )a  Encode chunk of input sequence.
        Args:
            x: Conformer input sequences. (B, T, D_block)
            pos_enc: Positional embedding sequences. (B, 2 * (T - 1), D_block)
            mask: Source mask. (B, T_2)
            left_context: Number of frames in left context.
            right_context: Number of frames in right context.
        Returns:
            x: Conformer output sequences. (B, T, D_block)
            pos_enc: Positional embedding sequences. (B, 2 * (T - 1), D_block)
        r   r&   r<   Nr   )rk   r   )r   r   rO   r   rd   rg   rk   r   rU   r   r   rN   rV   )r6   rA   r   rj   r   r   r   ro   keyval	att_cache
conv_cacher:   r:   r;   chunk_forward7  s<   

$	



zChunkEncoderLayer.chunk_forwardrs   )r   r   r   )rC   rD   rE   rF   r   r   rd   r   r   r   r   r.   r   r   r   r   r   rB   r   rH   r:   r:   r8   r;   r     sn    	
"
7r   ChunkConformerEncoderc                G       s  e Zd ZdZ												
																							dXdedededededededed ed!ed"ed#ed$ed%ed&ed'ed(ed)ed*ed+ed,ed-ed.ed/ed0ed1ed2ed3ed4ed5ed6ed7ed8ed9ed:d;fF fd<d=Zd:efd>d?Z	d@edAed:efdBdCZ
d@ed:efdDdEZdFedGejd:d;fdHdIZdJejdKejd:eejejf fdLdMZdJejdKejd:eejejf fdNdOZ		P	dYdJejdKejdQedFedRed:ejfdSdTZ		P	dYdJejdKejdUejdQedFedRed:ejfdVdWZ  ZS )ZConformerChunkEncoderzEncoder module definition.
    Args:
        input_size: Input size.
        body_conf: Encoder body configuration.
        input_conf: Encoder input configuration.
        main_conf: Encoder main configuration.
    rv   rw   rx   ry   rz   rJ   FTr|   r}   r~   r   r   r   
layer_normr   h㈵>      ?   r   r&   r   r   r   r   r   r   r_   r   r   embed_vgg_likerZ   r[   r   r   r   r   r   r   r   r   r   	norm_typer   conv_mod_norm_epsconv_mod_norm_momentumsimplified_att_scoredynamic_chunk_trainingshort_chunk_thresholdshort_chunk_sizeleft_chunk_sizetime_reduction_factorunified_model_trainingdefault_chunk_sizejitter_rangesubsampling_factorr   Nc#           (         s   t    t||"|	d| _t|| _t|}#|||#f||d}$||#|$|p+|f |||fg }%t|D ]}& fdd}'|%|' q9t	dd |%D | _
| _|| _|| _|| _|| _|| _| | _|!| _|| _dS )zConstruct an Encoder object.)r   	conv_sizer  vgg_liker   )epsmomentumc                      s$   t t t t t  dS )N)r_   )r   r   r   r   r:   conv_mod_argsr_   mult_att_argsr   pos_wise_argsr:   r;   r     s    z0ConformerChunkEncoder.__init__.<locals>.<lambda>c                 S   s   g | ]}| qS r:   r:   ).0fnr:   r:   r;   
<listcomp>  s    z2ConformerChunkEncoder.__init__.<locals>.<listcomp>N)r-   r.   r#   r   r   r   r   ranger   r   r   r   r  r  r  r  r  r  r  r  )(r6   r   r   r   r   r   r_   r   r   r  rZ   r[   r   r   r   r   r   r   r   r   r   r  r   r  r	  r
  r  r  r  r  r  r  r  r  r  r5   conv_mod_norm_args
fn_modulesr   moduler8   r  r;   r.   ~  sb   
&	
zConformerChunkEncoder.__init__c                 C   r   rs   r   r   r:   r:   r;   r     r   z!ConformerChunkEncoder.output_sizerY   
hop_lengthc                 C   s   | j || S )aL  Return the corresponding number of sample for a given chunk size, in frames.
        Where size is the number of features frames after applying subsampling.
        Args:
            size: Number of frames after subsampling.
            hop_length: Frontend's hop length
        Returns:
            : Number of raw samples
        r   get_size_before_subsampling)r6   rY   r#  r:   r:   r;   get_encoder_input_raw_size  s   	z0ConformerChunkEncoder.get_encoder_input_raw_sizec                 C   s   | j |S )a  Return the corresponding number of sample for a given chunk size, in frames.
        Where size is the number of features frames after applying subsampling.
        Args:
            size: Number of frames after subsampling.
        Returns:
            : Number of raw samples
        r$  )r6   rY   r:   r:   r;   get_encoder_input_size  s   z,ConformerChunkEncoder.get_encoder_input_sizer   r   c                 C   s   | j ||S )zInitialize/Reset encoder streaming cache.
        Args:
            left_context: Number of frames in left context.
            device: Device ID.
        )r   r   r   r:   r:   r;   r     s   z+ConformerChunkEncoder.reset_streaming_cacherA   x_lenc                 C   st  t | jj|d\}}|r$td|d dd| d |d|t||j}| jr| j	rD| j
t| j | jd d  }n| j
}| |||\}}| |}t|d|| j|jd}| j|||dd	}	| j||||d	}
|d
d}| jdkr|	dddd| jddf }	|
dddd| jddf }
t|d | jd }|	|
|fS | jr|d}| j	rtd|d }||| j kr|}n|| j d }n| j
}| |||\}}| |}t|d|| j|jd}n| ||d\}}| |}d}| j||||d	}|d
d}| jdkr5|dddd| jddf }t|d | jd }||dfS )  Encode input sequences.
        Args:
            x: Encoder input features. (B, T_in, F)
            x_len: Encoder input features lengths. (B,)
        Returns:
           x: Encoder outputs. (B, T_out, D_enc)
           x_len: Encoder outputs lenghts. (B,)
        r&   r   r   r   r   )r&   r  r   Nr   r   )r   r   r  rY   r   r   r   r   r  rc   r  rd   randintr  rf   r   r   r  r   eqr   r  floor_divider  r  r  )r6   rA   r(  r   r   rj   r   r   r   x_uttx_chunkr   max_lenr:   r:   r;   rB     s   







zConformerChunkEncoder.forwardc                 C   s   t | jj|d\}}|r$td|d dd| d |d|t||j}| ||d\}}| |}| j	|||dd}| j
dkrW|dddd| j
ddf }|S )r)  r&   r   r   r   r   Nr   )r   r   r  rY   r   r   r   r   r   r   r  )r6   rA   r(  r   r   rj   r   r.  r:   r:   r;   full_utt_forwards  s*   


z&ConformerChunkEncoder.full_utt_forward    r   r   c                 C   s   t | jj|d\}}|r$td|d dd| d |d|t|}| |||\}}| |}	t|d|| j|j	d}
| j
||	||
d}|dd}| jdkrh|d d d d | jd d f }|S )	Nr&   r   r   r   r   r*  r   r   )r   r   r  rY   r   r   r   r   r  r   r   r,  r   r  )r6   rA   r(  r   r   r   r   r   rj   r   r   r   r:   r:   r;   simu_chunk_forward  s8   


z(ConformerChunkEncoder.simu_chunk_forwardprocessed_framesc           
      C   s   t |}| ||d\}}|dkr-tj||jdd|d}||k}tj||gdd}| j||d}	| j	j
||	||||d}|dkrR|ddd| ddf }| jdkrf|dddd| jddf }|S )a  Encode input sequences as chunks.
        Args:
            x: Encoder input features. (1, T_in, F)
            x_len: Encoder input features lengths. (1,)
            processed_frames: Number of frames already seen.
            left_context: Number of frames in left context.
            right_context: Number of frames in right context.
        Returns:
           x: Encoder outputs. (B, T_out, D_enc)
        Nr   r   r&   r<   r   )r   r   r   )r   r   rd   aranger   viewfliprg   r   r   r   r  )
r6   rA   r(  r4  r   r   r   rj   processed_maskr   r:   r:   r;   r     s*   	
z#ConformerChunkEncoder.chunk_forward)!rv   rw   rx   ry   rz   rz   rJ   FTFr|   r}   Fr~   r   r   r   TFr  r   r  rz   FFr  r  r   r&   Fr   rw   r&   )r   r2  r   )rC   rD   rE   rF   r   r   r   r   r.   r   r&  r'  rd   r   r   r   r   rB   r1  r3  tensorr   rH   r:   r:   r8   r;   r  t  s<   	
 !"#$p

e
)
.r  ):rF   r   typingr   r   r   r   r   rd   r   funasr.models.ctc.ctcr   #funasr.models.transformer.attentionr	   r
   r   r   #funasr.models.transformer.embeddingr   r   r   r   r   $funasr.models.transformer.layer_normr   0funasr.models.transformer.utils.multi_layer_convr   r   *funasr.models.transformer.utils.nets_utilsr   r   r   r   r   r   3funasr.models.transformer.positionwise_feed_forwardr   &funasr.models.transformer.utils.repeatr   r   +funasr.models.transformer.utils.subsamplingr   r   r    r!   r"   r#   funasr.registerr$   pdbr   r%   rI   registerru   r   r   r  r:   r:   r:   r;   <module>   sL   E 
%  D` 
4