o
    ½e¦iø6  ã                   @   s¾   d Z ddlZddlZddlmZ ddlm  mZ ddlm	Z	 ddl
mZ ddd„Zddd	„ZG d
d„ dejƒZG dd„ dejƒZG dd„ dejƒZG dd„ dejƒZG dd„ dejjƒZdS )zNResNet PreActivated for speaker verification

Authors
 * Mickael Rouvier 2022
é    N)ÚLinear)ÚBatchNorm1dé   c                 C   s   t j| |d|dddS )z#2D convolution with kernel_size = 3é   r   F)Úkernel_sizeÚstrideÚpaddingÚbias©ÚnnÚConv2d©Ú	in_planesÚ
out_planesr   © r   ú]/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/speechbrain/lobes/models/ResNet.pyÚconv3x3   s   úr   c                 C   s   t j| |d|ddS )z#2D convolution with kernel_size = 1r   F©r   r   r	   r
   r   r   r   r   Úconv1x1   s   
ÿr   c                       s0   e Zd ZdZdejf‡ fdd„	Zdd„ Z‡  ZS )ÚSEBlockaÙ  An implementation of Squeeze-and-Excitation Block.

    Arguments
    ---------
    channels : int
        The number of channels.
    reduction : int
        The reduction factor of channels.
    activation : Callable
        The function to apply between layers.

    Example
    -------
    >>> inp_tensor = torch.rand([1, 64, 80, 40])
    >>> se_layer = SEBlock(64)
    >>> out_tensor = se_layer(inp_tensor)
    >>> out_tensor.shape
    torch.Size([1, 64, 80, 40])
    r   c                    sN   t t| ƒ ¡  t d¡| _t t ||| ¡|ƒ t || |¡t ¡ ¡| _	d S )Nr   )
Úsuperr   Ú__init__r   ÚAdaptiveAvgPool2dÚavg_poolÚ
Sequentialr   ÚSigmoidÚfc)ÚselfÚchannelsÚ	reductionÚ
activation©Ú	__class__r   r   r   ;   s   
üzSEBlock.__init__c                 C   s@   |  ¡ \}}}}|  |¡ ||¡}|  |¡ ||dd¡}|| S )ú^Intermediate step. Processes the input tensor x
        and returns an output tensor.
        r   )Úsizer   Úviewr   )r   ÚxÚbÚcÚ_Úyr   r   r   ÚforwardG   s   zSEBlock.forward©	Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   ÚReLUr   r+   Ú__classcell__r   r   r!   r   r   &   s    r   c                       s2   e Zd ZdZddejf‡ fdd„	Zdd„ Z‡  ZS )Ú
BasicBlocka–  An implementation of ResNet Block.

    Arguments
    ---------
    in_channels : int
        Number of input channels.
    out_channels : int
        The number of output channels.
    stride : int
        Factor that reduce the spatial dimensionality
    downsample : torch function
        A function for downsample the identity of block when stride != 1
    activation : torch class
        A class for constructing the activation layers.

    Example
    -------
    >>> inp_tensor = torch.rand([1, 64, 80, 40])
    >>> layer = BasicBlock(64, 64, stride=1)
    >>> out_tensor = layer(inp_tensor)
    >>> out_tensor.shape
    torch.Size([1, 64, 80, 40])
    r   Nc                    sp   t t| ƒ ¡  |ƒ | _t |¡| _t|||ƒ| _t |¡| _	t||ƒ| _
t |¡| _t||ƒ| _|| _|| _d S ©N)r   r3   r   r    r   ÚBatchNorm2dÚbn1r   Úconv1Úbn2Úconv2Úbn3r   Úconv3Ú
downsampler   )r   Úin_channelsÚout_channelsr   r<   r    r!   r   r   r   j   s   
zBasicBlock.__init__c                 C   s~   |}|   |¡}|  |¡}|  |¡}|  |¡}|  |¡}|  |¡}|  |¡}|  |¡}|  |¡}| jdur9|  |¡}||7 }|S ©r#   N)r6   r    r7   r8   r9   r:   r;   r<   ©r   r&   ÚresidualÚoutr   r   r   r+      s   










zBasicBlock.forwardr,   r   r   r!   r   r3   Q   s    úr3   c                       s4   e Zd ZdZdddejf‡ fdd„	Zdd„ Z‡  ZS )ÚSEBasicBlockaí  An implementation of Squeeze-and-Excitation ResNet Block.

    Arguments
    ---------
    in_channels : int
        Number of input channels.
    out_channels : int
        The number of output channels.
    reduction : int
        The reduction factor of channels.
    stride : int
        Factor that reduce the spatial dimensionality
    downsample : torch function
        A function for downsample the identity of block when stride != 1
    activation : torch class
        A class for constructing the activation layers.

    Example
    -------
    >>> inp_tensor = torch.rand([1, 64, 80, 40])
    >>> layer = SEBasicBlock(64, 64, stride=1)
    >>> out_tensor = layer(inp_tensor)
    >>> out_tensor.shape
    torch.Size([1, 64, 80, 40])
    r   Nc                    s|   t t| ƒ ¡  |ƒ | _t |¡| _t|||ƒ| _t |¡| _	t||ƒ| _
t |¡| _t||ƒ| _|| _|| _t||ƒ| _d S r4   )r   rC   r   r    r   r5   r6   r   r7   r8   r9   r:   r   r;   r<   r   r   Úse)r   r=   r>   r   r   r<   r    r!   r   r   r   µ   s   	zSEBasicBlock.__init__c                 C   sˆ   |}|   |¡}|  |¡}|  |¡}|  |¡}|  |¡}|  |¡}|  |¡}|  |¡}|  |¡}|  |¡}| jdur>|  |¡}||7 }|S r?   )	r6   r    r7   r8   r9   r:   r;   rD   r<   r@   r   r   r   r+   Ï   s   











zSEBasicBlock.forwardr,   r   r   r!   r   rC   š   s    ùrC   c                       s^   e Zd ZdZddejjg d¢g d¢g d¢df‡ fdd	„	Zddd„Zddd„Z	ddd„Z
‡  ZS )ÚResNeta  An implementation of ResNet

    Arguments
    ---------
    input_size : int
        Expected size of the input dimension.
    device : str
        Device used, e.g., "cpu" or "cuda".
    activation : torch class
        A class for constructing the activation layers.
    channels : list of ints
        List of number of channels used per stage.
    block_sizes : list of ints
        List of number of groups created per stage.
    strides : list of ints
        List of stride per stage.
    lin_neurons : int
        Number of neurons in linear layers.

    Example
    -------
    >>> input_feats = torch.rand([2, 400, 80])
    >>> compute_embedding = ResNet(lin_neurons=256)
    >>> outputs = compute_embedding(input_feats)
    >>> outputs.shape
    torch.Size([2, 256])
    éP   Úcpu)é€   rH   é   rI   )r   é   é   r   )r   é   rL   rL   rI   c           
   
      s  t ƒ  ¡  t|ƒdksJ ‚t|ƒdksJ ‚t|ƒdksJ ‚t ||d |d  |d  |d   ¡}tjd|d ddddd| _t |d ¡| _	|ƒ | _
| j|d |d |d |d d| _| j|d |d |d |d d| _| j|d |d |d |d d| _| j|d |d |d |d d| _tj d| |d	  ¡| _t tj|d	 | d
ddt ¡ t d
¡tjd
|d	 | ddtjdd¡| _t d| |d	  |¡| _tj |¡| _|  ¡ D ]*}	t|	tjƒrótjj|	j ddd qàt|	tjƒr
tj !|	j d¡ tj !|	j"d¡ qàd S )NrJ   r   r   rL   r   F)r	   )r   éÿÿÿÿrH   )r   ©ÚdimÚfan_outÚrelu)ÚmodeÚnonlinearity)#r   r   ÚlenÚmathÚceilr   r   r7   r5   r6   Úactivation1Ú_make_layer_seÚlayer1Úlayer2Ú_make_layerÚlayer3Úlayer4Útorchr   Ú
norm_statsr   ÚConv1dr1   ÚSoftmaxÚ	attentionr   Úfc_embedÚ
norm_embedÚmodulesÚ
isinstanceÚinitÚkaiming_normal_ÚweightÚ	constant_r	   )
r   Ú
input_sizeÚdevicer    r   Úblock_sizesÚstridesÚlin_neuronsÚ	input_outÚmr!   r   r   r     sT   

"ÿÿÿÿÿ
ûÿ€ùzResNet.__init__r   c              	   C   s|   d}|dks
||krt  t j||d|ddt  |¡¡}g }| t||d||ƒ¡ td|ƒD ]}| t||dƒ¡ q-t j|Ž S )aù  Construct the squeeze-and-excitation block layer.

        Arguments
        ---------
        in_channels : int
            Number of input channels.
        out_channels : int
            The number of output channels.
        block_num: int
            Number of ResNet blocks for the network.
        stride : int
            Factor that reduce the spatial dimensionality. Default is 1

        Returns
        -------
        se_block : nn.Sequential
            Squeeze-and-excitation block
        Nr   Fr   )r   r   r   r5   ÚappendrC   Úrange©r   r=   r>   Ú	block_numr   r<   ÚlayersÚir   r   r   rX   C  s&   ûøÿ
zResNet._make_layer_sec              	   C   sx   d}|dks
||krt  t j||d|ddt  |¡¡}g }| t||||ƒ¡ td|ƒD ]
}| t||ƒ¡ q,t j|Ž S )aß  
        Construct the ResNet block layer.

        Arguments
        ---------
        in_channels : int
            Number of input channels.
        out_channels : int
            The number of output channels.
        block_num: int
            Number of ResNet blocks for the network.
        stride : int
            Factor that reduce the spatial dimensionality. Default is 1

        Returns
        -------
        block : nn.Sequential
            ResNet block
        Nr   Fr   )r   r   r   r5   rr   r3   rs   rt   r   r   r   r[   l  s"   ûø
zResNet._make_layerNc                 C   sä   |  d¡}|  |¡}|  |¡}|  |¡}|  |¡}|  |¡}|  |¡}|  |¡}| dd¡}| 	dd¡}|  
|¡}tj|| dd}t tj|d | dd|d  jdd¡}tj||gdd}|  |¡}|  |¡}|  |¡}|S )aM  Returns the embedding vector.

        Arguments
        ---------
        x : torch.Tensor
            Tensor of shape (batch, time, channel).
        lengths : torch.Tensor
            Corresponding relative lengths of the inputs.

        Returns
        -------
        x : torch.Tensor
            The embedding vector.
        r   rL   r   rN   gñhãˆµøä>)Úmin)Ú	unsqueezer7   r6   rW   rY   rZ   r\   r]   Ú	transposeÚflattenrb   r^   ÚsumÚsqrtÚclampÚcatr_   rc   rd   )r   r&   ÚlengthsÚwÚmuÚsgr   r   r   r+   ”  s$   








,


zResNet.forward©r   r4   )r-   r.   r/   r0   r^   r   r1   r   rX   r[   r+   r2   r   r   r!   r   rE   ë   s    ø
;
)(rE   c                       s2   e Zd ZdZ				d
‡ fdd„	Zdd	„ Z‡  ZS )Ú
Classifieraß  This class implements the cosine similarity on the top of features.

    Arguments
    ---------
    input_size : int
        Expected size of the inputs.
    device : str
        Device used, e.g., "cpu" or "cuda".
    lin_blocks : int
        Number of linear layers.
    lin_neurons : int
        Number of neurons in linear layers.
    out_neurons : int
        Number of classes.

    Example
    -------
    >>> classify = Classifier(input_size=2, lin_neurons=2, out_neurons=2)
    >>> outputs = torch.tensor([ [1., -1.], [-9., 1.], [0.9, 0.1], [0.1, 0.9] ])
    >>> outputs = outputs.unsqueeze(1)
    >>> cos = classify(outputs)
    >>> (cos < -1.0).long().sum()
    tensor(0)
    >>> (cos > 1.0).long().sum()
    tensor(0)
    rG   r   rI   é»  c                    sn   t ƒ  ¡  t ¡ | _t|ƒD ]}| j t|dt||dg¡ |}qt 	t
j|||d¡| _tj | j¡ d S )N)rk   )rk   Ú	n_neurons)rl   )r   r   r   Ú
ModuleListÚblocksrs   ÚextendÚ_BatchNorm1dr   Ú	Parameterr^   ÚFloatTensorri   rg   Úxavier_uniform_)r   rk   rl   Ú
lin_blocksro   Úout_neuronsÚblock_indexr!   r   r   r   Ú  s   


þÿÿzClassifier.__init__c                 C   s>   | j D ]}||ƒ}qt t | d¡¡t | j¡¡}| d¡S )zúReturns the output probabilities over speakers.

        Arguments
        ---------
        x : torch.Tensor
            Torch tensor.

        Returns
        -------
        x : torch.Tensor
            Output probabilities over speakers.
        r   )r‰   ÚFÚlinearÚ	normalizeÚsqueezeri   ry   )r   r&   Úlayerr   r   r   r+   ô  s   

 
zClassifier.forward)rG   r   rI   r†   )r-   r.   r/   r0   r   r+   r2   r   r   r!   r   r…   ¾  s    úr…   r„   )r0   rU   r^   Útorch.nnr   Útorch.nn.functionalÚ
functionalr’   Úspeechbrain.nnet.linearr   Úspeechbrain.nnet.normalizationr   r‹   r   r   ÚModuler   r3   rC   rE   r…   r   r   r   r   Ú<module>   s    

+IQ T