o
    i'                     @   s   d Z ddlmZ ddlmZmZmZ ddlZddlZddl	m
Z
 ddlmZ ddlmZmZ G dd	 d	eZG d
d dejjZdS )z&Sinc convolutions for raw audio input.    )OrderedDict)OptionalTupleUnionN)check_argument_types)AbsPreEncoder)LogCompressionSincConvc                       s   e Zd ZdZ							d'd	eeeef d
edededededef fddZdd Z							d(d
ededededef
ddZ
dd Zd ejd!ejd"eejejf fd#d$Zd"efd%d&Z  ZS ))LightweightSincConvsu  Lightweight Sinc Convolutions.

    Instead of using precomputed features, end-to-end speech recognition
    can also be done directly from raw audio using sinc convolutions, as
    described in "Lightweight End-to-End Speech Recognition from Raw Audio
    Data Using Sinc-Convolutions" by Kürzinger et al.
    https://arxiv.org/abs/2010.07597

    To use Sinc convolutions in your model instead of the default f-bank
    frontend, set this module as your pre-encoder with `preencoder: sinc`
    and use the input of the sliding window frontend with
    `frontend: sliding_window` in your yaml configuration file.
    So that the process flow is:

    Frontend (SlidingWindow) -> SpecAug -> Normalization ->
    Pre-encoder (LightweightSincConvs) -> Encoder -> Decoder

    Note that this method also performs data augmentation in time domain
    (vs. in spectral domain in the default frontend).
    Use `plot_sinc_filters.py` to visualize the learned Sinc filters.
    >        	leakyreludropouthammingmelfsin_channelsout_channelsactivation_typedropout_typewindowing_type
scale_typec                    s   t  sJ t   t|trt|}|| _|| _|| _	|| _
|| _|| _|| _tjjttjjd| _|| jvrEtdt| j  tjjtjjd| _|| jvr`tdt| j  |   |   dS )a  Initialize the module.

        Args:
            fs: Sample rate.
            in_channels: Number of input channels.
            out_channels: Number of output channels (for each input channel).
            activation_type: Choice of activation function.
            dropout_type: Choice of dropout function.
            windowing_type: Choice of windowing function.
            scale_type:  Choice of filter-bank initialization scale.
        )r   spatial	dropout2dzDropout type has to be one of )r   reluz!Activation type has to be one of N)r   super__init__
isinstancestrhumanfriendly
parse_sizer   r   r   r   r   r   r   torchnnDropoutSpatialDropout	Dropout2dchoices_dropoutNotImplementedErrorlistkeys	LeakyReLUReLUchoices_activation_create_sinc_convsespnet_initialization_fn)selfr   r   r   r   r   r   r   	__class__ O/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/asr/preencoder/sinc.pyr   )   s@   





zLightweightSincConvs.__init__c              	   C   s   t  }d}t| j|dd| j| j| jd| _t d| jfdt fdtj	j
|dd	fd
tj	dfg}tj	||d< |}d}| j||dddddd|d< |}| j}dD ]}| j||ddd|d| < |}qR| j}| j||dddd|d< tj	|| _d S )N   e   r   )kernel_sizestrider   window_funcr   Filtersr   	BatchNormTaffineAvgPool   SincConvBlock   r   g?)depthwise_kernel_sizedepthwise_stridepointwise_groupsavgpooldropout_probabilityDConvBlock1)r?         	   )rB   rC   
DConvBlock   )rB   rC   rD   DConvBlock5)r   r	   r   r   r   r   filtersr   r"   r#   BatchNorm1d	AvgPool1d
Sequentialgen_lsc_blockr   blocks)r0   rS   r   blockr   layerr3   r3   r4   r.   d   s\   	
	
z'LightweightSincConvs._create_sinc_convsrJ   Nr   333333?FrB   rC   rF   c	                 C   s   t  }	|s||}}
|
dkr|||
 }}
|
dkstjj|||||d|	d< |r5tjj||dd|d|	d< | j| j  |	d< tjj|dd|	d	< |rRtjd
|	d< | j| j	 ||	d< tj
|	S )a  Generate a convolutional block for Lightweight Sinc convolutions.

        Each block consists of either a depthwise or a depthwise-separable
        convolutions together with dropout, (batch-)normalization layer, and
        an optional average-pooling layer.

        Args:
            in_channels: Number of input channels.
            out_channels: Number of output channels.
            depthwise_kernel_size: Kernel size of the depthwise convolution.
            depthwise_stride: Stride of the depthwise convolution.
            depthwise_groups: Number of groups of the depthwise convolution.
            pointwise_groups: Number of groups of the pointwise convolution.
            dropout_probability: Dropout probability in the block.
            avgpool: If True, an AvgPool layer is inserted.

        Returns:
            torch.nn.Sequential: Neural network building block.
        r   )groups	depthwiser   	pointwise
activationTr<   	batchnormr?   rE   r   )r   r"   r#   Conv1dr-   r   rO   rP   r'   r   rQ   )r0   r   r   rB   rC   depthwise_groupsrD   rF   rE   rT   rr3   r3   r4   rR      s.   



z"LightweightSincConvs.gen_lsc_blockc                 C   sZ   | j   | jD ]"}|D ]}t|tjjkr)|jr)d|jj	dd< d|j
j	dd< qqdS )z/Initialize sinc filters with filterbank values.g      ?Ng        )rN   init_filtersrS   typer"   r#   rO   r=   weightdatabias)r0   rT   rU   r3   r3   r4   r/      s   

z-LightweightSincConvs.espnet_initialization_fninputinput_lengthsreturnc                 C   sV   |  \}}}}||| ||}| j|}|  \}	}
}||||
| }||fS )a  Apply Lightweight Sinc Convolutions.

        The input shall be formatted as (B, T, C_in, D_in)
        with B as batch size, T as time dimension, C_in as channels,
        and D_in as feature dimension.

        The output will then be (B, T, C_out*D_out)
        with C_out and D_out as output dimensions.

        The current module structure only handles D_in=400, so that D_out=1.
        Remark for the multichannel case: C_out is the number of out_channels
        given at initialization multiplied with C_in.
        )sizeviewrS   forward)r0   rd   re   BTC_inD_ininput_framesoutput_frames_C_outD_outr3   r3   r4   ri      s   zLightweightSincConvs.forwardc                 C   s   | j | j S )zGet the output size.)r   r   )r0   r3   r3   r4   output_size   s   z LightweightSincConvs.output_size)r   r   r   r   r   r   r   )rJ   r   Nr   rV   F)__name__
__module____qualname____doc__r   intr   floatr   r.   rR   r/   r"   Tensorr   ri   rs   __classcell__r3   r3   r1   r4   r
      sf    ;>
6	
r
   c                       sR   e Zd ZdZ		ddedeeeef  f fddZ	de
jd	e
jfd
dZ  ZS )r%   z^Spatial dropout module.

    Apply dropout to full channels on tensors of input (B, C, D)
    rV   NrF   shapec                    s:   t  sJ t   |du rd}tj|| _|f| _dS )zInitialize.

        Args:
            dropout_probability: Dropout probability.
            shape (tuple, list): Shape of input tensors.
        N)r   r?   r   )r   r   r   r"   r#   r&   r   r|   )r0   rF   r|   r1   r3   r4   r     s   

zSpatialDropout.__init__xrf   c                 C   s"   |j | j }| |}|j | j S )z"Forward of spatial dropout module.)permuter|   r   )r0   r}   yr3   r3   r4   ri     s   
zSpatialDropout.forward)rV   N)rt   ru   rv   rw   ry   r   r   tupler)   r   r"   rz   ri   r{   r3   r3   r1   r4   r%      s    r%   )rw   collectionsr   typingr   r   r   r    r"   	typeguardr   %espnet2.asr.preencoder.abs_preencoderr   espnet2.layers.sinc_convr   r	   r
   r#   Moduler%   r3   r3   r3   r4   <module>   s    l