o
    9wÖiJ<  ã                   @   s  d dl mZ d dlmZ d dlZd dlmZ d dlm  mZ	 d dl
mZ d dlmZ d dlmZmZmZmZmZmZ G dd„ dejƒZdeiZG d	d
„ d
ejƒZG dd„ dejƒZG dd„ dejƒZddd„Zddd„Zddd„Zddd„Zddd„Zddd„Z ddd„Z!dS )é    )Ú	lru_cache)ÚOptionalN)Ú	rearrange)Ú	StatsPool)Úconv1d_num_framesÚconv1d_receptive_field_centerÚconv1d_receptive_field_sizeÚmulti_conv_num_framesÚ!multi_conv_receptive_field_centerÚmulti_conv_receptive_field_sizec                       s@   e Zd ZdZd‡ fdd„	Zddeej fdd„Zd	d
„ Z	‡  Z
S )ÚTSTPz©
    Temporal statistics pooling, concatenate mean and std, which is used in
    x-vector
    Comment: simple concatenation can not make full use of both statistics
    r   c                    s    t t| ƒ ¡  || _tƒ | _d S ©N)Úsuperr   Ú__init__Úin_dimr   Ú
stats_pool)Úselfr   Úkwargs©Ú	__class__© úm/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/pyannote/audio/models/embedding/wespeaker/resnet.pyr   ,   s   zTSTP.__init__NÚweightsc                 C   s   t |dƒ}| j||dS )zè

        Parameters
        ----------
        features : (batch, dimension, channel, frames) torch.Tensor
            Batch of features
        weights: (batch, frames) torch.Tensor, optional
            Batch of weights

        zBbatch dimension channel frames -> batch (dimension channel) frames©r   )r   r   )r   Úfeaturesr   r   r   r   Úforward1   s
   þzTSTP.forwardc                 C   s   | j d | _| jS )Né   )r   Úout_dim)r   r   r   r   Úget_out_dimL   s   zTSTP.get_out_dim©r   r   )Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r   ÚtorchÚTensorr   r   Ú__classcell__r   r   r   r   r   %   s
    r   c                       sh   e Zd ZdZd‡ fdd„	Zededefdd„ƒZddedefd	d
„Zddedefdd„Z	dd„ Z
‡  ZS )Ú
BasicBlocké   c              	      s´   t t| ƒ ¡  || _tj||d|ddd| _t |¡| _tj||ddddd| _	t |¡| _
t ¡ | _|dks>|| j| krXt tj|| j| d|ddt | j| ¡¡| _d S d S )Né   r(   F©Úkernel_sizeÚstrideÚpaddingÚbias©r+   r,   r.   )r   r'   r   r,   ÚnnÚConv2dÚconv1ÚBatchNorm2dÚbn1Úconv2Úbn2Ú
SequentialÚshortcutÚ	expansion©r   Ú	in_planesÚplanesr,   r   r   r   r   W   s.   ÿÿ
û
øÿzBasicBlock.__init__Únum_samplesÚreturnc                 C   ó$   t |ddg| jdgddgddgdS ©Nr)   r(   ©r+   r,   r-   Údilation©r	   r,   ©r   r=   r   r   r   Ú
num_framesp   s   ûzBasicBlock.num_framesrE   c                 C   r?   r@   ©r   r,   ©r   rE   r   r   r   Úreceptive_field_sizez   ó   ûzBasicBlock.receptive_field_sizer   Úframec                 C   r?   r@   ©r
   r,   ©r   rJ   r   r   r   Úreceptive_field_centerƒ   rI   z!BasicBlock.receptive_field_centerc                 C   sB   t  |  |  |¡¡¡}|  |  |¡¡}||  |¡7 }t  |¡}|S r   )ÚFÚrelur4   r2   r6   r5   r8   ©r   ÚxÚoutr   r   r   r   Œ   s
   
zBasicBlock.forward©r(   r   ©r    r!   r"   r9   r   r   ÚintrE   rH   rM   r   r&   r   r   r   r   r'   T   s    			r'   c                       sh   e Zd ZdZd‡ fdd„	Zededefdd„ƒZdd	edefd
d„Zddedefdd„Z	dd„ Z
‡  ZS )Ú
Bottlenecké   r(   c              	      sÜ   t t| ƒ ¡  || _tj||ddd| _t |¡| _tj||d|ddd| _	t |¡| _
tj|| j| ddd| _t | j| ¡| _t ¡ | _|dksR|| j| krlt tj|| j| d|ddt | j| ¡¡| _d S d S )Nr(   F)r+   r.   r)   r*   r/   )r   rV   r   r,   r0   r1   r2   r3   r4   r5   r6   r9   Úconv3Úbn3r7   r8   r:   r   r   r   r   —   s2   ÿÿ
û
øÿzBottleneck.__init__r=   r>   c                 C   ó&   t |g d¢d| jdgg d¢g d¢dS ©N)r(   r)   r(   r(   )r   r(   r   )r(   r(   r(   rA   rC   rD   r   r   r   rE   ²   s   
ûzBottleneck.num_framesrE   c                 C   rZ   r[   rF   rG   r   r   r   rH   ¼   ó   
ûzBottleneck.receptive_field_sizer   rJ   c                 C   rZ   r[   rK   rL   r   r   r   rM   Å   r\   z!Bottleneck.receptive_field_centerc                 C   sX   t  |  |  |¡¡¡}t  |  |  |¡¡¡}|  |  |¡¡}||  |¡7 }t  |¡}|S r   )	rN   rO   r4   r2   r6   r5   rY   rX   r8   rP   r   r   r   r   Î   s   
zBottleneck.forwardrS   r   rT   r   r   r   r   rV   ”   s    			rV   c                       sÀ   e Zd Z					d ‡ fdd„	Zdd	„ Zed
edefdd„ƒZd!dedefdd„Zd"dedefdd„Z	de
jde
jfdd„Z	d#de
jde
jde
jfdd„Zd#de
jdee
j fdd„Z‡  ZS )$ÚResNeté    é(   é€   r   Tc                    s@  t t| ƒ ¡  || _|| _|| _t|d ƒ| d | _|| _t	j
d|ddddd| _t	 |¡| _| j|||d dd| _| j||d |d dd| _| j||d	 |d dd| _| j||d |d dd| _t| | j|j d
| _| j ¡ | _t	 | j|¡| _| jr”t	j|dd| _t	 ||¡| _d S t	 ¡ | _t	 ¡ | _d S )Né   r(   r)   Fr*   r   )r,   r   rW   )r   )Úaffine)r   r]   r   r;   Úfeat_dimÚ	embed_dimrU   Ú	stats_dimÚtwo_emb_layerr0   r1   r2   r3   r4   Ú_make_layerÚlayer1Úlayer2Úlayer3Úlayer4ÚPOOLING_LAYERSr9   Úpoolr   Úpool_out_dimÚLinearÚseg_1ÚBatchNorm1dÚseg_bn_1Úseg_2ÚIdentity)r   ÚblockÚ
num_blocksÚ
m_channelsrc   rd   Úpooling_funcrf   r   r   r   r   Ø   s0   
ÿ
ÿ
zResNet.__init__c                 C   sL   |gdg|d   }g }|D ]}|  || j||ƒ¡ ||j | _qtj|Ž S )Nr(   )Úappendr;   r9   r0   r7   )r   ru   r<   rv   r,   ÚstridesÚlayersr   r   r   rg   þ   s   
zResNet._make_layerr=   r>   c                 C   sH   |}t |ddddd}| j| j| j| jfD ]}|D ]}| |¡}qq|S )zíCompute number of output frames

        Parameters
        ----------
        num_samples : int
            Number of input samples.

        Returns
        -------
        num_frames : int
            Number of output frames.
        r)   r(   rA   )r   rh   ri   rj   rk   rE   )r   r=   rE   r{   Úlayerr   r   r   rE     s   
ÿÿzResNet.num_framesr(   rE   c                 C   sP   |}t | j| j| j| jgƒD ]}t |ƒD ]}| |¡}qqt|ddddd}|S )a
  Compute size of receptive field

        Parameters
        ----------
        num_frames : int, optional
            Number of frames in the output signal

        Returns
        -------
        receptive_field_size : int
            Receptive field size.
        r)   r(   )rE   r+   r,   r-   rB   )Úreversedrh   ri   rj   rk   rH   r   )r   rE   rH   r{   r|   r   r   r   rH     s   ÿûzResNet.receptive_field_sizer   rJ   c                 C   sR   |}t | j| j| j| jgƒD ]}t |ƒD ]}|j|d}qqt|ddddd}|S )zúCompute center of receptive field

        Parameters
        ----------
        frame : int, optional
            Frame index

        Returns
        -------
        receptive_field_center : int
            Index of receptive field center.
        )rJ   r)   r(   )rJ   r+   r,   r-   rB   )r}   rh   ri   rj   rk   rM   r   )r   rJ   rM   r{   r|   r   r   r   rM   <  s   ÿÿûzResNet.receptive_field_centerÚfbankc                 C   sZ   |  ddd¡}| d¡}t |  |  |¡¡¡}|  |¡}|  |¡}|  |¡}|  	|¡}|S )a8  Extract frame-wise embeddings

        Parameters
        ----------
        fbanks : (batch, frames, features) torch.Tensor
            Batch of fbank features

        Returns
        -------
        embeddings : (batch, ..., embedding_frames) torch.Tensor
            Batch of frame-wise embeddings.

        r   r   r(   )
ÚpermuteÚ
unsqueeze_rN   rO   r4   r2   rh   ri   rj   rk   )r   r~   rR   r   r   r   Úforward_frames[  s   




zResNet.forward_framesNÚframesr   c                 C   sR   | j ||d}|  |¡}| jr"t |¡}|  |¡}|  |¡}||fS t d¡|fS )aÝ  Extract speaker embeddings

        Parameters
        ----------
        frames : torch.Tensor
            Batch of frames with shape (batch, ..., embedding_frames).
        weights : (batch, frames) or (batch, speakers, frames) torch.Tensor, optional
            Batch of weights passed to statistics pooling layer.

        Returns
        -------
        embeddings : (batch, dimension) or (batch, speakers, dimension) torch.Tensor
            Batch of embeddings.
        r   ç        )	rm   rp   rf   rN   rO   rr   rs   r$   Útensor)r   r‚   r   ÚstatsÚembed_arR   Úembed_br   r   r   Úforward_embeddingr  s   



zResNet.forward_embeddingc                 C   s¨   |  ddd¡}| d¡}t |  |  |¡¡¡}|  |¡}|  |¡}|  |¡}|  	|¡}| j
||d}|  |¡}| jrMt |¡}|  |¡}|  |¡}||fS t d¡|fS )aN  Extract speaker embeddings

        Parameters
        ----------
        fbank : (batch, frames, features) torch.Tensor
            Batch of features
        weights : (batch, frames) torch.Tensor, optional
            Batch of weights

        Returns
        -------
        embedding : (batch, embedding_dim) torch.Tensor
        r   r   r(   r   rƒ   )r   r€   rN   rO   r4   r2   rh   ri   rj   rk   rm   rp   rf   rr   rs   r$   r„   )r   r~   r   rR   r…   r†   r‡   r   r   r   r     s   








zResNet.forward)r^   r_   r`   r   TrS   r   r   )r    r!   r"   r   rg   r   rU   rE   rH   rM   r$   r%   r   rˆ   r   r   r&   r   r   r   r   r]   ×   s,    ø&ÿÿÿ
þ$r]   Tc                 C   ó   t tg d¢| |||dS )N)r   r   r   r   ©rc   rd   rx   rf   ©r]   r'   rŠ   r   r   r   ÚResNet18±  ó   úrŒ   c                 C   r‰   ©N)r)   rW   é   r)   rŠ   r‹   rŠ   r   r   r   ÚResNet34¼  r   r   c                 C   r‰   rŽ   ©r]   rV   rŠ   r   r   r   ÚResNet50Ç  r   r’   c                 C   r‰   )N)r)   rW   é   r)   rŠ   r‘   rŠ   r   r   r   Ú	ResNet101Ò  r   r”   c                 C   r‰   )N)r)   ra   é$   r)   rŠ   r‘   rŠ   r   r   r   Ú	ResNet152Ý  r   r–   c                 C   r‰   )N)r   é   é0   r)   rŠ   r‘   rŠ   r   r   r   Ú	ResNet221è  r   r™   c                 C   r‰   )N)é
   é   é@   r)   rŠ   r‘   rŠ   r   r   r   Ú	ResNet293ó  r   r   )r   T)"Ú	functoolsr   Útypingr   r$   Útorch.nnr0   Útorch.nn.functionalÚ
functionalrN   Úeinopsr   Ú$pyannote.audio.models.blocks.poolingr   Ú$pyannote.audio.utils.receptive_fieldr   r   r   r	   r
   r   ÚModuler   rl   r'   rV   r]   rŒ   r   r’   r”   r–   r™   r   r   r   r   r   Ú<module>   s*    
,@C 
[




