"""
Speaker Embedder Module for KaniTTS-2
====================================

Lightweight module for generating speaker embeddings from audio using WavLM model.
Model: Orange/Speaker-wavLM-tbr (16kHz input, 128-dim L2-normalized output)

Based on spk_embeddings.py from Orange SA (CC-BY-SA-3.0)
https://huggingface.co/Orange/Speaker-wavLM-tbr
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from typing import Union, Optional
from transformers.models.wavlm.modeling_wavlm import WavLMPreTrainedModel, WavLMModel

class TopLayers(nn.Module):
    """
    Projection layers on top of WavLM for speaker embedding extraction.

    Architecture:
        - Conv1d: 2048 → 512
        - BatchNorm + ReLU
        - Conv1d: 512 → embd_size (default 128)
        - BatchNorm + ReLU
        - L2 normalization
    """

    def __init__(self, embd_size: int = 128, top_interm_size: int = 512):
        super(TopLayers, self).__init__()
        self.affine1 = nn.Conv1d(in_channels=2048, out_channels=top_interm_size, kernel_size=1)
        self.batchnorm1 = nn.BatchNorm1d(num_features=top_interm_size, affine=False, eps=1e-3)
        self.affine2 = nn.Conv1d(in_channels=top_interm_size, out_channels=embd_size, kernel_size=1)
        self.batchnorm2 = nn.BatchNorm1d(num_features=embd_size, affine=False, eps=1e-3)
        self.activation = nn.ReLU(inplace=True)

    def forward(self, x):
        """
        Args:
            x: Stats pooling output [batch, 2048, 1]

        Returns:
            L2-normalized embeddings [batch, embd_size]
        """
        out = self.activation(self.batchnorm1(self.affine1(x)))
        out = self.activation(self.batchnorm2(self.affine2(out)))
        return F.normalize(out[:, :, 0], dim=1)
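
# Shape walkthrough for TopLayers (illustrative, default sizes):
#   x            [batch, 2048, 1]  stats-pooled WavLM features
#   affine1      [batch, 512, 1]   1x1 conv, 2048 -> top_interm_size
#   affine2      [batch, 128, 1]   1x1 conv, top_interm_size -> embd_size
#   out[:, :, 0] [batch, 128]      drop the singleton frame axis
#   normalize    [batch, 128]      unit L2 norm per embedding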

class EmbeddingsModel(WavLMPreTrainedModel):
    """
    Complete WavLM-based speaker embedding model.

    Architecture:
        1. MVN normalization on input audio
        2. WavLM encoder
        3. Stats pooling (mean + std)
        4. Top projection layers
        5. L2 normalization
    """

    def __init__(self, config):
        super().__init__(config)
        self.wavlm = WavLMModel(config)
        self.top_layers = TopLayers(config.embd_size, config.top_interm_size)

    def forward(self, input_values):
        """
        Args:
            input_values: Audio waveform [batch, time_samples]

        Returns:
            Speaker embeddings [batch, embd_size]
        """
        # Mean-variance normalization (MVN) of the raw waveform
        x_norm = (input_values - input_values.mean(dim=1, keepdim=True)) / (
            input_values.std(dim=1, keepdim=True) + 1e-10
        )
        base_out = self.wavlm(input_values=x_norm, output_hidden_states=False).last_hidden_state

        # Stats pooling: concatenate mean and std over the time axis
        mean = base_out.mean(dim=1)
        std = base_out.var(dim=1).clamp(min=1e-10).pow(0.5)
        x_stats = torch.cat((mean, std), dim=1).unsqueeze(dim=2)
        return self.top_layers(x_stats)
ej	de
dej	fddZ	dd
eejej	f dee
 dej	fddZdedej	fddZdS )SpeakerEmbedderu  
    Simple speaker embedder for single audio → embedding generation.

    Features:
        - Loads WavLM model once
        - Generates 128-dim L2-normalized speaker embeddings
        - Expects 16kHz audio input
        - Handles variable-length audio (max 20 seconds recommended)
        - Returns PyTorch tensors ready for TTS model

    Usage:
        embedder = SpeakerEmbedder()

        # From numpy array (16kHz)
        audio = np.random.randn(16000 * 5)  # 5 seconds
        embedding = embedder.embed_audio(audio)  # [1, 128]

        # From torch tensor
        audio_tensor = torch.randn(1, 16000 * 5)
        embedding = embedder.embed_audio(audio_tensor)
    nineninesix/speaker-emb-tbrN      >@
model_namedevicemax_duration_secc                 C   s   || _ d| _|| _t|| j | _|du r#ttj rdnd| _nt|| _t	d| d t
|| _| j| j | j  t	d| j  dS )a  
        Initialize speaker embedder.

        Args:
            model_name: HuggingFace model ID
            device: Target device ('cuda', 'cpu', or None for auto-detect)
            max_duration_sec: Maximum audio duration in seconds (longer audio will be truncated)
        >  Ncudacpuu)   🔊 Loading WavLM speaker embedder from z...u   ✅ Speaker embedder ready on )rI   	target_srrK   r,   max_samplesr@   rJ   rM   is_availableprintr.   from_pretrainedmodeltoeval)r   rI   rJ   rK   r!   r!   r"   r   w   s   
zSpeakerEmbedder.__init__audiosample_ratereturnc                 C   s   |  dkr|jd |jd k r|jdd}n|d }|  dkr)td|j || jkrNzddlm} W n tyA   tdw |j|| jd}||}|jd dkrYtd	|jd | j	krzt
d
|jd | dd| j d |d| j	 }|S )a(  
        Prepare audio for embedding extraction: resample if needed, convert to mono, truncate.

        Args:
            audio: Audio tensor (1D or 2D)
            sample_rate: Current sample rate of audio

        Returns:
            Prepared audio tensor [time_samples] at target_sr
        r9   r   r   r7   z#Expected 1D or 2D audio, got shape NzKtorchaudio is required for resampling. Install with: pip install torchaudio)	orig_freqnew_freqzAudio is emptyu   ⚠️  Audio is z.2fzs, truncating to s)r3   shaper:   
ValueErrorrO   torchaudio.transforms
transformsImportErrorResamplerP   rR   rK   )r   rW   rX   T	resamplerr!   r!   r"   _prepare_audio   s,   
$zSpeakerEmbedder._prepare_audioc                 C   s   t |tjrt| }n| }|du r| j}| ||}|d	| j
}t  | |}W d   |S 1 s=w   Y  |S )a  
        Generate speaker embedding from audio.

        Args:
            audio: Audio waveform as numpy array or torch tensor
                   - If 1D: shape [time_samples]
                   - If 2D: shape [batch, time_samples] or [channels, time_samples]
            sample_rate: Sample rate of input audio (if None, assumes 16kHz)

        Returns:
            Speaker embedding tensor [1, 128] (L2-normalized)

        Raises:
            ValueError: If audio is empty
        Nr   )
isinstancenpndarrayr@   
from_numpyfloatrO   re   rB   rU   rJ   no_gradrT   )r   rW   rX   audio_batch	embeddingr!   r!   r"   embed_audio   s   

zSpeakerEmbedder.embed_audio
audio_pathc                 C   sd   zddl }W n ty   tdw ||\}}|jd dkr'|jdd}n|d }| j||dS )ae  
        Generate speaker embedding from audio file.

        Args:
            audio_path: Path to audio file (supports any sample rate, will be resampled)

        Returns:
            Speaker embedding tensor [1, 128]

        Raises:
            ImportError: If torchaudio is not installed
            ValueError: If audio file cannot be loaded
        r   NzTtorchaudio is required for loading audio files. Install with: pip install torchaudior   r7   rX   )
torchaudiora   loadr]   r:   rn   )r   ro   rq   rW   srr!   r!   r"   embed_audio_file   s   z SpeakerEmbedder.embed_audio_file)rG   NrH   r/   )r(   r)   r*   r+   strr   rj   r   r@   Tensorr,   re   r   rg   rh   rn   rt   r!   r!   r!   r"   rF   `   s8    
 
6
*rF   rL   rG   rW   rX   rI   rJ   rY   c                 C   s.   t ||d}t| tr|| S |j| |dS )a+  
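
# Usage sketch for SpeakerEmbedder (illustrative; the file name and the 44.1 kHz
# input are assumptions, not part of this module):
#
#   embedder = SpeakerEmbedder(device="cpu")
#   emb = embedder.embed_audio_file("reference_speaker.wav")      # any sample rate
#   emb = embedder.embed_audio(waveform_44k, sample_rate=44100)   # resampled to 16 kHz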

def compute_speaker_embedding(
    audio: Union[np.ndarray, torch.Tensor, str],
    sample_rate: int = 16000,
    model_name: str = "nineninesix/speaker-emb-tbr",
    device: Optional[str] = None,
) -> torch.Tensor:
    """
    Convenience function to generate speaker embedding in one line.

    Args:
        audio: Audio as numpy array, torch tensor, or file path
        sample_rate: Sample rate of audio (ignored if audio is file path)
        model_name: HuggingFace model ID
        device: Target device

    Returns:
        Speaker embedding [1, 128]

    Example:
        # From numpy array
        audio_np = np.random.randn(16000 * 5)
        emb = compute_speaker_embedding(audio_np)

        # From file
        emb = compute_speaker_embedding("speaker.wav")
    """
    embedder = SpeakerEmbedder(model_name=model_name, device=device)
    if isinstance(audio, str):
        return embedder.embed_audio_file(audio)
    return embedder.embed_audio(audio, sample_rate=sample_rate)
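

if __name__ == "__main__":
    # Minimal smoke test (illustrative): embeds 3 seconds of random noise.
    # Assumes the checkpoint can be fetched from the HuggingFace Hub on first run.
    demo_audio = np.random.randn(16000 * 3).astype(np.float32)
    demo_embedding = compute_speaker_embedding(demo_audio, sample_rate=16000)
    print(f"embedding shape: {tuple(demo_embedding.shape)}")  # expected: (1, 128)
    print(f"L2 norm: {demo_embedding.norm().item():.4f}")  # expected: ~1.0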