o
    ei`                     @   sh   d Z ddlZddlZddlmZmZmZ G dd dejj	Z
G dd dejj	ZG dd	 d	ejj	ZdS )
zDWide ResNet for Speech Enhancement.

Author
 * Peter Plantinga 2022
    N)ISTFTSTFTspectral_magnitudec                       sX   e Zd ZdZddddg dddej ejj	j
d	d
f fdd	Zdd Zdd Z  ZS )EnhanceResnetaY  Model for enhancement based on Wide ResNet.

    Full model description at: https://arxiv.org/pdf/2112.06068.pdf

    Arguments
    ---------
    n_fft : int
        Number of points in the fourier transform, see ``speechbrain.processing.features.STFT``
    win_length : int
        Length of stft window in ms, see ``speechbrain.processing.features.STFT``
    hop_length : int
        Time between windows in ms, see ``speechbrain.processing.features.STFT``
    sample_rate : int
        Number of samples per second of input audio.
    channel_counts : list of ints
        Number of output channels in each CNN block. Determines number of blocks.
    dense_count : int
        Number of dense layers.
    dense_nodes : int
        Number of nodes in the dense layers.
    activation : function
        Function to apply before convolution layers.
    normalization : class
        Name of class to use for constructing norm layers.
    dropout : float
        Portion of layer outputs to drop during training (between 0 and 1).
    mask_weight : float
        Amount of weight to give mask. 0 - no masking, 1 - full masking.

    Example
    -------
    >>> inputs = torch.rand([10, 16000])
    >>> model = EnhanceResnet()
    >>> outputs, feats = model(inputs)
    >>> outputs.shape
    torch.Size([10, 15872])
    >>> feats.shape
    torch.Size([10, 63, 257])
              i>  )   r	      r
   r   r      i   皙?gGz?c                    s  t    || _t||||d| _tjjjd d |d d gd| _	|D ]}| j	j
t|||	|
d q#tjjj| j	 d| _t|D ](}| jj
tjjj|dd | j
| | j
tjjj | j
tjj|
d qA| jj
tjjj|d d d	 t||||d| _d S )
N)n_fft
win_length
hop_lengthsample_rater      )input_shape)channels
activationnormalizationdropoutT)	n_neuronscombine_dims)p)r   )super__init__mask_weightr   stftsbnnet
containers
SequentialCNNappend	ConvBlockget_output_shapeDNNrangelinearLinearr   	LayerNormtorchnnDropoutr   istft)selfr   r   r   r   channel_countsdense_countdense_nodesr   r   r   r   channel_count_	__class__ d/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/speechbrain/lobes/models/EnhanceResnet.pyr   6   sL   
	zEnhanceResnet.__init__c                 C   sp   |  |}| |}| | |}|jdddd}| j| | }|d| j | 7 }| |}| ||fS )z;Processes the input tensor and outputs the enhanced speech.r   r   )minmax)r   extract_featsr&   r"   clamp	unsqueezer   r.   )r/   x
noisy_speclog_magmaskmasked_specenhanced_featuresr7   r7   r8   forwardv   s   


zEnhanceResnet.forwardc                 C   s   t t|ddS )z<Takes the stft output and produces features for computation.g      ?)power)r+   log1pr   )r/   r?   r7   r7   r8   r<      s   zEnhanceResnet.extract_feats)__name__
__module____qualname____doc__r+   r,   GELUr   r   r   BatchNorm2dr   rE   r<   __classcell__r7   r7   r5   r8   r      s     *@r   c                       s<   e Zd ZdZej ejj	j
df fdd	Zdd Z  ZS )r$   a  Convolution block, including squeeze-and-excitation.

    Arguments
    ---------
    input_shape : tuple of ints
        The expected size of the inputs.
    channels : int
        Number of output channels.
    activation : function
        Function applied before each block.
    normalization : class
        Name of a class to use for constructing norm layers.
    dropout : float
        Portion of block outputs to drop during training.

    Example
    -------
    >>> inputs = torch.rand([10, 20, 30, 128])
    >>> block = ConvBlock(input_shape=inputs.shape, channels=256)
    >>> outputs = block(inputs)
    >>> outputs.shape
    torch.Size([10, 20, 15, 256])
    r   c                    s   t    || _tjjj||ddd| _tjjj||dd| _||d| _	tjjj||dd| _
||d| _tjjj|d| _t|d| _d S )N   )r   r   )r   out_channelskernel_sizestride)in_channelsrP   rQ   )
input_size)	drop_rate)r   r   r   r   r   r"   Conv2d
downsampleconv1norm1conv2norm2r   	Dropout2dSEblockse_block)r/   r   r   r   r   r   r5   r7   r8   r      s(   
zConvBlock.__init__c                 C   sp   |  |}| |}| |}| |}| |}| |}| |}| |}| |}|| |9 }|| S )z6Processes the input tensor with a convolutional block.)rW   r   rY   r   rX   r[   rZ   r^   )r/   r?   residualr7   r7   r8   rE      s   








zConvBlock.forward)rH   rI   rJ   rK   r+   r,   rL   r   r   r   r*   r   rE   rN   r7   r7   r5   r8   r$      s    r$   c                       s(   e Zd ZdZ fddZdd Z  ZS )r]   a  Squeeze-and-excitation block.

    Defined: https://arxiv.org/abs/1709.01507

    Arguments
    ---------
    input_size : tuple of ints
        Expected size of the input tensor

    Example
    -------
    >>> inputs = torch.rand([10, 20, 30, 256])
    >>> se_block = SEblock(input_size=inputs.shape[-1])
    >>> outputs = se_block(inputs)
    >>> outputs.shape
    torch.Size([10, 1, 1, 256])
    c                    s6   t    tjjj||d| _tjjj||d| _d S )N)rT   r   )r   r   r   r   r(   r)   linear1linear2)r/   rT   r5   r7   r8   r      s   
zSEblock.__init__c                 C   sT   | d| d }tj|ddd| }| |}tjj|}| |}t|S )z;Processes the input tensor with a squeeze-and-excite block.r   r   )r   r   T)dimkeepdim)	sizer+   sumr`   r,   
functionalrelura   sigmoid)r/   r?   countr7   r7   r8   rE      s   


zSEblock.forward)rH   rI   rJ   rK   r   rE   rN   r7   r7   r5   r8   r]      s    	r]   )rK   r+   speechbrainr   speechbrain.processing.featuresr   r   r   r,   Moduler   r$   r]   r7   r7   r7   r8   <module>   s     F