o
    iR                     @   s8   d Z ddlmZmZmZ ddlZG dd dejjZdS )z$Conv1d block for Transducer encoder.    )OptionalTupleUnionNc                       sF  e Zd ZdZ								d'dededeeef d	eeef d
eeef deeef dedededededdf fddZ	dede
jddfddZ		d(de
jde
jdee
j dee
j dee
je
je
jf f
ddZ		d)de
jde
jde
jded edee
je
jf fd!d"Zde
jde
jfd#d$Zde
jde
jfd%d&Z  ZS )*Conv1da  Conv1d module definition.

    Args:
        input_size: Input dimension.
        output_size: Output dimension.
        kernel_size: Size of the convolving kernel.
        stride: Stride of the convolution.
        dilation: Spacing between the kernel points.
        groups: Number of blocked connections from input channels to output channels.
        bias: Whether to add a learnable bias to the output.
        batch_norm: Whether to use batch normalization after convolution.
        relu: Whether to use a ReLU activation after convolution.
        causal: Whether to use causal convolution (set to True if streaming).
        dropout_rate: Dropout rate.

       TF        
input_sizeoutput_sizekernel_sizestridedilationgroupsbias
batch_normrelucausaldropout_ratereturnNc              	      s   t    |
r|d | _d}nd| _|}tjj|||||||d| _tjj|d| _|	r2tj	 | _
|r;tj|| _tj||| _|| _|| _|	| _|| _|
| _|| _||d  | _|| _d| _dS )zConstruct a Conv1d object.r   r   )r   r   r   r   )pN)super__init__lordertorchnnr   convDropoutdropoutReLU	relu_funcBatchNorm1dbnLinearout_posr   r	   r   r   r   r
   paddingr   cache)selfr   r	   r
   r   r   r   r   r   r   r   r   	__class__ `/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/asr_transducer/encoder/blocks/conv1d.pyr      s<   



zConv1d.__init__left_contextdevicec                 C   s"   t jd| j| jd f|d| _dS )zInitialize/Reset Conv1d cache for streaming.

        Args:
            left_context: Number of left frames during chunk-by-chunk inference.
            device: Device to use for cache tensor.

        r   )r+   N)r   zerosr   r
   r$   )r%   r*   r+   r(   r(   r)   reset_streaming_cacheS   s   zConv1d.reset_streaming_cachexpos_encmask
chunk_maskc                 C   s   | dd}| jdkrtjj|| jdfdd}n
| |}| |}| |}| j	r0| 
|}| |}| jr=| |}| dd}||| |fS )a  Encode input sequences.

        Args:
            x: Conv1d input sequences. (B, T, D_in)
            pos_enc: Positional embedding sequences. (B, 2 * (T - 1), D_in)
            mask: Source mask. (B, T)
            chunk_mask: Chunk mask. (T_2, T_2)

        Returns:
            x: Conv1d output sequences. (B, sub(T), D_out)
            mask: Source mask. (B, T) or (B, sub(T))
            pos_enc: Positional embedding sequences.
                       (B, 2 * (T - 1), D_att) or (B, 2 * (sub(T) - 1), D_out)

        r      r   constantr   )	transposer   r   r   
functionalpadcreate_new_maskcreate_new_pos_encr   r   r    r   r   r   r"   )r%   r.   r/   r0   r1   r(   r(   r)   forward_   s   






zConv1d.forwardr   right_contextc                 C   s   t j| j|ddgdd}|dkr&|dddd| j|  | f | _n|dddd| j df | _| |}| jrC| |}| |}| j	rP| 
|}|dd}|| |fS )a  Encode chunk of input sequence.

        Args:
            x: Conv1d input sequences. (B, T, D_in)
            pos_enc: Positional embedding sequences. (B, 2 * (T - 1), D_in)
            mask: Source mask. (B, T)
            left_context: Number of frames in left context.
            right_context: Number of frames in right context.

        Returns:
            x: Conv1d output sequences. (B, T, D_out)
            pos_enc: Positional embedding sequences. (B, 2 * (T - 1), D_out)

        r   r2   dimr   N)r   catr$   r4   r   r   r   r    r   r   r   r"   )r%   r.   r/   r0   r*   r:   r(   r(   r)   chunk_forward   s   ( 



zConv1d.chunk_forwardc                 C   s:   | j dkr|ddd| j  f }|dddd| jf S )zCreate new mask for output sequences.

        Args:
            mask: Mask of input sequences. (B, T)

        Returns:
            mask: Mask of output sequences. (B, sub(T))

        r   N)r#   r   )r%   r0   r(   r(   r)   r7      s   

zConv1d.create_new_maskc                 C   s   |ddd| dd d ddf }|dd| dd dddf }| jdkrI|ddd| j ddf }|ddd| j ddf }|dddd| jddf }|dddd| jddf }tj||ddddddf gdd}|S )a&  Create new positional embedding vector.

        Args:
            pos_enc: Input sequences positional embedding.
                     (B, 2 * (T - 1), D_in)

        Returns:
            pos_enc: Output sequences positional embedding.
                     (B, 2 * (sub(T) - 1), D_in)

        Nr   r2   r   r;   )sizer#   r   r   r=   )r%   r/   pos_enc_positivepos_enc_negativer(   r(   r)   r8      s   ($
(zConv1d.create_new_pos_enc)r   r   r   TFTFr   )NN)r   r   )__name__
__module____qualname____doc__intr   r   boolfloatr   r   r+   r-   Tensorr   r9   r>   r7   r8   __classcell__r(   r(   r&   r)   r      s    



	
9
1
+r   )	rE   typingr   r   r   r   r   Moduler   r(   r(   r(   r)   <module>   s    