o
    oik	                     @   s`   d Z ddlZddlZddlmZ ddlmZmZ G dd dejZ	G dd deZ
dd	d
ZdS )a^  
 * Software Name : spk_embeddings.py
 * SPDX-FileCopyrightText: Copyright (c) Orange SA
 * SPDX-License-Identifier: CC-BY-SA-3.0
 *
 * This software is distributed under the Creative Commons Attribution Share Alike 3.0 Unported,
 * see the "LICENSE.txt" file for more details or https://huggingface.co/Orange/Speaker-wavLM-pro/blob/main/LICENSE.txt
    N)WavLMPreTrainedModel
WavLMModelc                       s&   e Zd Zd fdd	Zdd Z  ZS )	TopLayers      c                    sh   t t|   tjd|dd| _tj|ddd| _tj||dd| _tj|ddd| _	tj
dd| _d S )	Ni      )in_channelsout_channelskernel_sizeFgMbP?)num_featuresaffineepsT)inplace)superr   __init__nnConv1daffine1BatchNorm1d
batchnorm1affine2
batchnorm2ReLU
activation)self	embd_sizetop_interm_size	__class__ F/home/ubuntu/kanitts-2-dataset-pipeline/utils/speaker_emb/spk_wavLM.pyr      s   zTopLayers.__init__c                 C   sJ   |  | | |}| | | |}tj|d d d d df S )Nr   )r   r   r   r   r   r   
functional	normalize)r   xoutr   r   r    forward   s   zTopLayers.forward)r   r   __name__
__module____qualname__r   r%   __classcell__r   r   r   r    r      s    r   c                       s$   e Zd Z fddZdd Z  ZS )EmbeddingsModelc                    s*   t  | t|| _t|j|j| _d S )N)r   r   r   wavlmr   r   r   
top_layers)r   configr   r   r    r      s   
zEmbeddingsModel.__init__c                 C   s~   ||j ddd |jddd }| j|ddj}|jddjdd}tj|j dd|	dfddjdd}| 
|S )	Nr   dimF)input_valuesoutput_hidden_statesg|=)ming      ?   )mean	unsqueezestdr,   last_hidden_statevarclamptorchcatpowr-   )r   r1   x_normbase_outvx_statsr   r   r    r%   "   s
   ((
zEmbeddingsModel.forwardr&   r   r   r   r    r+      s    r+     c                 C   sn   t | \}}|dksJ d|jdd}|jd |kr)td|   |d | }||jdd}|  S )Ni>  z;please convert your audio file to a sampling rate of 16 kHzr   r/   ztruncating long signal )
torchaudioloadr5   shapeprintr6   clonedetach)fnmmodelmax_sizesigsrembdr   r   r    compute_embedding.   s   rO   )rB   )__doc__r;   rC   torch.nnr   (transformers.models.wavlm.modeling_wavlmr   r   Moduler   r+   rO   r   r   r   r    <module>   s    	