o
    @Tib                     @   s   d dl Z d dlmZ d dlZd dlZd dlmZ d dlm  mZ	 ddl
mZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZ e ZeG dd dZeG dd dZG dd dejZdS )    N)	dataclass   )FiniteScalarQuantizer)GlobalEncoder)PostNet)SSLFeatureExtractor)Transformer)wav2vec2_model)freeze_modules
get_loggerc                   @   s   e Zd ZU dZeedf ed< dZeedf ed< dZe	ed< dZ
eed	< d
Zeed< dZe	ed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dS )LinaCodecConfig)   	   .local_ssl_layers)r      global_ssl_layersTnormalize_ssl_featuresr   downsample_factor   mel_upsample_factoruse_conv_downsamplelinearlocal_interpolation_modemel_interpolation_modei]  sample_ratei   n_fft   
hop_lengthd   n_melscenterpaddingN)__name__
__module____qualname__r   tupleint__annotations__r   r   boolr   r   r   r   strr   r   r   r   r   r!    r*   r*   C/home/ubuntu/.local/lib/python3.10/site-packages/linacodec/model.pyr      s   
 r   c                   @   sD   e Zd ZU dZejdB ed< dZejdB ed< dZejdB ed< dS )LinaCodecFeaturesNcontent_embeddingcontent_token_indicesglobal_embedding)	r"   r#   r$   r-   torchTensorr'   r.   r/   r*   r*   r*   r+   r,   *   s   
 r,   c                       s  e Zd ZdZdedededededB ded	ed
ede	f fddZ
defddZdedefddZdededededB fddZdefddZded	ed
ede	fddZdYdededefddZdedefd d!Zdedefd"d#Zd$eej d%ee dejfd&d'ZdZd$ejd)edejfd*d+Z	d[d,ejd-edB deejejf fd.d/Zd0ejdeejejejejf dB fd1d2Zd3ejdejfd4d5Zd6ejd7ejd8edejfd9d:Z d;ee de!eejf fd<d=Z"e#d>edd fd?d@Z$e#				d\dAedB dBedB d>edB dCedB ddDf
dEdFZ%e& d]d,ejdHedIede'fdJdKZ(dLejdejfdMdNZ)e& 			d^dOejdPejdB dQejdB dRedB dejf
dSdTZ*e& dUejdVejdejfdWdXZ+  Z,S )_LinaCodecModelz?Model architecture and forward pass logic for Kanade tokenizer.configssl_feature_extractorlocal_encoderlocal_quantizerfeature_decoderNglobal_encoder
mel_prenetmel_decodermel_postnetc
           
         sJ   t    || _| || | |||| | | | ||||	 d S N)super__init__r3   _init_ssl_extractor_init_local_branch_init_global_branch_init_mel_decoder)
selfr3   r4   r5   r6   r7   r8   r9   r:   r;   	__class__r*   r+   r>   4   s   

zLinaCodecModel.__init__pathc                 C   sF   t |}tdi |d }|j|d dd}| | _ddg| _dS )	z8Loads distilled wavlm model, 970m params --> 250m paramsr3   
state_dictFstrictr      Nr*   )r0   loadr	   load_state_dictcudawavlm_modeldistilled_layers)rC   rF   ckptrN   resultr*   r*   r+   load_distilled_wavlmG   s
   

z#LinaCodecModel.load_distilled_wavlmc                 C   s   || _ t| j g td| j j  t|j| _t| jdkr0tdt| j d| j  ntd| jd  d |jrDtd t|j	| _	t| j	dkrbtdt| j	 d	| j	  dS td| j	d  d
 dS )z/Initialize and configure SSL feature extractor.z;SSL feature extractor initialized and frozen, feature dim: r   zUsing average of z SSL layers for local branch: zUsing single SSL layer r   z for local branchz.Normalizing local SSL features before encodingz SSL layers for global branch: z for global branchN)
r4   r
   loggerdebugfeature_dimlistr   lenr   r   )rC   r3   r4   r*   r*   r+   r?   O   s&   
z"LinaCodecModel._init_ssl_extractorc                 C   s   || _ || _|| _|j| _| jdkrQtd| j  |jrD|j}tj	|||j|jd| _
tj|||j|jd| _td|j  dS d| _
d| _td dS d| _
d| _dS )zOInitialize local branch components (encoder, downsampling, quantizer, decoder).r   z(Using temporal downsampling with factor kernel_sizestridez6Using Conv1d downsampling/upsampling with kernel size NzJUsing average pooling and linear interpolation for downsampling/upsampling)r5   r6   r7   r   rS   rT   r   
output_dimnnConv1dconv_downsampleConvTranspose1dconv_upsample)rC   r3   r5   r6   r7   rU   r*   r*   r+   r@   l   s(   

z!LinaCodecModel._init_local_branchc                 C   s
   || _ dS )z$Initialize global branch components.Nr8   )rC   r8   r*   r*   r+   rA      s   
z"LinaCodecModel._init_global_branchc                 C   sZ   || _ || _|| _d| _|jdkr+|j}tj|||j|jd| _t	d|j  dS dS )zIInitialize mel decoder components (prenet, upsampling, decoder, postnet).Nr   rX   z5Using Conv1DTranspose for mel upsampling with factor )
r9   r:   r;   mel_conv_upsampler   r[   r\   r_   rS   rT   )rC   r3   r9   r:   r;   	input_dimr*   r*   r+   rB      s   
z LinaCodecModel._init_mel_decoderFaudio_lengthensure_recon_lengthreturnc                 C   s~   | j }| jj}|| |j }t||j }|r(|| j  }dkr(|| j| 7 }||}||j | }	t|	| d }
|
S )zWCalculate required padding for input waveform to ensure consistent SSL feature lengths.r   r   )	r4   r3   r   ssl_sample_ratemathceilhop_sizer   get_minimum_input_length)rC   rd   re   	extractorr   num_samples_after_resamplingexpected_ssl_output_length	remainder%num_samples_required_after_resamplingnum_samples_requiredr!   r*   r*   r+   _calculate_waveform_padding   s   
z*LinaCodecModel._calculate_waveform_paddingtoken_lengthc                 C   s:   | j }| jj}|| j }||}||j | }t|S )z:Calculate the original audio length based on token length.)r4   r3   r   r   rk   rg   rh   ri   )rC   rs   rl   r   feature_lengthrp   rq   r*   r*   r+    _calculate_original_audio_length   s   


z/LinaCodecModel._calculate_original_audio_lengthc                 C   sL   | j jdkr|| j j d S | j jdkr|| j j S || j j | j j d S )zBCalculate the target mel spectrogram length based on audio length.r    r   same)r3   r!   r   r   )rC   rd   r*   r*   r+   _calculate_target_mel_length   s
   z+LinaCodecModel._calculate_target_mel_lengthfeatureslayersc                    sL   t |dkr fdd|D }tj|ddjdd}|S  |d d  }|S )Nr   c                    s   g | ]} |d   qS )r   r*   ).0irx   r*   r+   
<listcomp>   s    z8LinaCodecModel._process_ssl_features.<locals>.<listcomp>r   )dim)rW   r0   stackmean)rC   rx   ry   selected_featuresmixed_featuresr*   r|   r+   _process_ssl_features   s   z$LinaCodecModel._process_ssl_features:0yE>epsc                 C   s<   | j js|S tj|ddd}tj|ddd}|| ||  S )Nr   T)r~   keepdim)r3   r   r0   r   std)rC   rx   r   r   r   r*   r*   r+   _normalize_ssl_features   s
   z&LinaCodecModel._normalize_ssl_featureswaveformr!   c                 C   s   |  dkr|d}|dkrtj|||fdd}t " | j|dd}| j|}| jj	|t
| jdd }W d   n1 sBw   Y  | || j}| |}| || j}||fS )	a  Forward pass to extract SSL features. (B, T, C)
        Args:
            waveform: Input waveform tensor of shape (B, channels, samples)
            padding: Optional padding to apply on both sides of the waveform. This is useful to ensure
                     that the SSL feature extractor produces consistent output lengths.
        Returns:
            local_ssl_features: Local SSL features for local branch. (B, T, C)
            global_ssl_features: Global SSL features for global branch. (B, T, C)
           r   r   constant)moder   )
num_layersN)r~   squeezeFpadr0   no_gradr4   	resamplerrN   extract_featuresmaxrO   r   r   r   )rC   r   r!   acoustic_wavlm_featuresdistilled_wavlm_featureslocal_ssl_featuresglobal_ssl_featuresr*   r*   r+   forward_ssl_features   s   


z#LinaCodecModel.forward_ssl_featuresr   c           	      C   s  |  |}| jdkr.| jjr| |dddd}ntj|dd| j| jddd}d}t	d}| j
dur| |\}}|d }t|d }|}| jdkrz| jjre| |dddd}ntj|dd|jd | jjddd}| 
|}n| j|\}}||||fS )	a  Forward pass to extract content embeddings from the local branch.
        Args:
            local_ssl_features: Local SSL features tensor of shape (B, T, C)
        Returns:
            local_quantized: Quantized local embeddings. (B, T/factor, C)
            indices: Content token indices. (B, T/factor)
            ssl_recon: Reconstructed SSL features (if feature decoder is present). (B, T, C)
            perplexity: Quantizer perplexity (if feature decoder is present). Scalar tensor.
        r   r   rX   Ng        indices
perplexitysizer   )r5   r   r3   r   r^   	transposer   
avg_pool1dr0   tensorr7   r6   r   r`   interpolateshaper   encode)	rC   r   local_encoded	ssl_reconr   local_quantizedlocal_quantize_infor   local_latent_for_sslr*   r*   r+   forward_content   s:   





zLinaCodecModel.forward_contentr   c                 C   s   |  |}|S )zForward pass to extract global embeddings from the global branch.
        Args:
            global_ssl_features: Global SSL features tensor of shape (B, T, C)
        Returns:
            global_encoded: Global embeddings. (B, C)
        ra   )rC   r   global_encodedr*   r*   r+   forward_global1  s   
zLinaCodecModel.forward_globalcontent_embeddingsglobal_embeddings
mel_lengthc                 C   s   |  |}| jdur| |dddd}tj|dd|| jjddd}| j||dd}|dd}| 	|}|S )a  Forward pass to generate mel spectrogram from content and global embeddings.
        Args:
            content_embeddings: Content embeddings tensor of shape (B, T, C)
            global_embeddings: Global embeddings tensor of shape (B, C)
            mel_length: Target mel spectrogram length (T_mel)
        Returns:
            mel_recon: Reconstructed mel spectrogram tensor of shape (B, n_mels, T_mel)
        Nr   r   r   )	condition)
r9   rb   r   r   r   r3   r   r:   	unsqueezer;   )rC   r   r   r   local_latent	mel_reconr*   r*   r+   forward_mel;  s   


zLinaCodecModel.forward_melinclude_modulesc                   s,   fdddD   fdd|   D }|S )zPGet model weights for saving. Excludes certain modules not needed for inference.c                    s   g | ]}| vr|qS r*   r*   )rz   m)r   r*   r+   r}   \  s
    z2LinaCodecModel.weights_to_save.<locals>.<listcomp>)r4   r7   r`   c                    s,   i | ]\ }t  fd dD s |qS )c                 3   s    | ]}  |V  qd S r<   )
startswith)rz   exclnamer*   r+   	<genexpr>d  s    z<LinaCodecModel.weights_to_save.<locals>.<dictcomp>.<genexpr>)any)rz   param)excluded_modulesr   r+   
<dictcomp>a  s    z2LinaCodecModel.weights_to_save.<locals>.<dictcomp>)named_parameters)rC   r   rG   r*   )r   r   r+   weights_to_saveZ  s   

zLinaCodecModel.weights_to_saveconfig_pathc                 C   s4   t jdd}|jdtd ||}||}|jS )zInstantiate KanadeModel from config file.
        Args:
            config_path (str): Path to model configuration file (.yaml).
        Returns:
            KanadeModel: Instantiated KanadeModel.
        F)exit_on_errorz--model)type)jsonargparseArgumentParseradd_argumentr2   
parse_pathinstantiate_classesmodel)clsr   parsercfgr*   r*   r+   from_hparamsh  s
   

zLinaCodecModel.from_hparamsrepo_idrevisionweights_pathKanadeModelc           	      C   s   |durddl m} ||d|d}||d|d}n|du s!|du r%td| |}ddlm} ||d	d
}|j|dd td|  |S )at  Load LinaCodec either from HuggingFace Hub or local config and weights files.
        Args:
            repo_id (str, optional): HuggingFace Hub repository ID. If provided, loads config and weights from the hub.
            revision (str, optional): Revision (branch, tag, commit) for the HuggingFace Hub repo.
            config_path (str, optional): Path to model configuration file (.yaml). Required if repo_id is not provided.
            weights_path (str, optional): Path to model weights file (.safetensors). Required if repo_id is not provided.
        Returns:
            LinaCodec: Loaded LinaCodec instance.
        Nr   )hf_hub_downloadzconfig.yaml)r   zmodel.safetensorszePlease provide either HuggingFace Hub repo_id or both config_path and weights_path for model loading.)	load_filecpu)deviceFrH   z&Loaded weights from safetensors file: )	huggingface_hubr   
ValueErrorr   safetensors.torchr   rL   rS   info)	r   r   r   r   r   r   r   r   rG   r*   r*   r+   from_pretrainedv  s   
zLinaCodecModel.from_pretrainedTreturn_contentreturn_globalc                 C   s   | d}| |}| j|d|d\}}t }tjdtjdd5 |r:| |\}	}
}}|		d|_
|
	d|_|rO| |}|	d|_W d   |S W d   |S 1 sZw   Y  |S )a  Extract content and/or global features from audio using Kanade model.
        Args:
            waveform (torch.Tensor): Input audio waveform tensor (samples,). The sample rate should match model config.
            return_content (bool): Whether to extract content features.
            return_global (bool): Whether to extract global features.
        Returns:
            dict[str, torch.Tensor]: Extracted features.
        r   )r!   rM   Tdevice_typedtypeenabledN)r   rr   r   r   r,   r0   autocastbfloat16r   r   r-   r.   r   r/   )rC   r   r   r   rd   r!   r   r   rQ   r-   token_indices_r/   r*   r*   r+   r     s&   








zLinaCodecModel.encoder   c                 C   s   | j |}|S )zXGet content embeddings from content token indices. (..., seq_len) -> (..., seq_len, dim))r6   decode)rC   r   r-   r*   r*   r+   decode_token_indices  s   z#LinaCodecModel.decode_token_indicesr/   r.   r-   target_audio_lengthc                 C   s   |du r|du rt d| |}|du r|d}| |}tjdtjdd | |}|d}|d}| j	|||d}W d   n1 sJw   Y  |
dS )a)  Synthesize audio from content and global features using LinaCodec model and Vocos.
        Args:
            global_embedding (torch.Tensor): Global embedding tensor (dim,).
            content_token_indices (torch.Tensor, optional): Optional content token indices tensor (seq_len).
            content_embedding (torch.Tensor, optional): Optional content embedding tensor (seq_len, dim).
                If both content_token_indices and content_embedding are provided, content_embedding takes precedence.
            target_audio_length (int, optional): Target length of the output audio in samples.
                If None, uses the original audio length estimated from the sequence length of content tokens.
        Returns:
            torch.Tensor: Generated mel spectrogram tensor (n_mels, T).
        NzCEither content_token_indices or content_embedding must be provided.r   rM   Tr   )r   )r   r   r   ru   r0   r   r   rw   r   r   r   )rC   r/   r.   r-   r   seq_lenr   mel_spectrogramr*   r*   r+   r     s   






zLinaCodecModel.decodesource_waveformreference_waveformc                 C   s>   | j |ddd}| j |ddd}| j|j|j|dd}|S )a  Convert voice using LinaCodec model and Vocos, keeping content from source and global characteristics from reference.
        Only supports single audio input. Just a convenient wrapper around encode and decode methods.
        Args:
            source_waveform (torch.Tensor): Source audio waveform tensor (samples,).
            reference_waveform (torch.Tensor): Reference audio waveform tensor (samples_ref,).
        Returns:
            torch.Tensor: Converted mel spectrogram tensor (n_mels, T).
        TF)r   r   r   )r-   r/   r   )r   r   r-   r/   r   )rC   r   r   source_featuresreference_featuresr   r*   r*   r+   voice_conversion  s   zLinaCodecModel.voice_conversion)F)r   r<   )NNNN)TT)NNN)-r"   r#   r$   __doc__r   r   r   r   r   r   r>   r)   rR   r?   r@   rA   rB   r&   r(   rr   ru   rw   rV   r0   r1   r   floatr   r%   r   r   r   r   dictr   classmethodr   r   inference_moder,   r   r   r   r   __classcell__r*   r*   rD   r+   r2   1   s    	

"

"	


 
4

 ) %&r2   )rh   dataclassesr   r   r0   torch.nnr\   torch.nn.functional
functionalr   
module.fsqr   module.global_encoderr   module.postnetr   module.ssl_extractorr   module.transformerr   module.distill_wavlmr	   utilr
   r   rS   r   r,   Moduler2   r*   r*   r*   r+   <module>   s&    