o
    ²i@   ã                   @   sL  d dl Z d dlmZ d dlmZ d dlmZmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ G dd„ dejƒZedkr¤edƒZejddZdZe  dde ed ƒ¡Z!e  de ed ƒd¡Z"e"e!e!dœZ#ee#ƒZ$e %e#¡\Z&Z'e (e&e'¡Z)e  *e$d  +¡ e)¡rže,dƒ dS e,dƒ dS dS )é    N)ÚPath)ÚDictÚAny)Ú
DictConfig)Ú	load_file)Úload_config)ÚSpeakerEncoder)ÚEncoder)ÚDecoder)ÚWaveGenerator)ÚFactorizedVectorQuantizec                       sÚ   e Zd ZdZdeeef dejdejdejdejdejdejd	d
f‡ fdd„Z	e
ded	d fdd„ƒZdeeef d	eeef fdd„Ze ¡ deeef fdd„ƒZe ¡ dd„ ƒZdeeef fdd„Zdd„ Z‡  ZS )ÚBiCodeczŠ
    BiCodec model for speech synthesis, incorporating a speaker encoder, feature encoder/decoder,
    quantizer, and wave generator.
    Ú
mel_paramsÚencoderÚdecoderÚ	quantizerÚspeaker_encoderÚprenetÚpostnetÚreturnNc           	         s<   t ƒ  ¡  || _|| _|| _|| _|| _|| _|  |¡ dS )aå  
        Initializes the BiCodec model with the required components.

        Args:
            mel_params (dict): Parameters for the mel-spectrogram transformer.
            encoder (nn.Module): Encoder module.
            decoder (nn.Module): Decoder module.
            quantizer (nn.Module): Quantizer module.
            speaker_encoder (nn.Module): Speaker encoder module.
            prenet (nn.Module): Prenet network.
            postnet (nn.Module): Postnet network.
        N)	ÚsuperÚ__init__r   r   r   r   r   r   Úinit_mel_transformer)	Úselfr   r   r   r   r   r   r   Úkwargs©Ú	__class__© úD/home/ubuntu/veenaModal/external/sparktts/sparktts/models/bicodec.pyr   %   s   
zBiCodec.__init__Ú	model_dirc              	   K   s  |› d}t |› dƒd }|d }tdi |d ¤Ž}tdi |d ¤Ž}tdi |d ¤Ž}tdi |d ¤Ž}	tdi |d	 ¤Ž}
tdi |d
 ¤Ž}| |||
||||	d}t|ƒ}|j|dd\}}|D ]	}td|› ƒ qb|D ]	}td|› ƒ qn| 	¡  | 
¡  |S )zí
        Loads the model from a checkpoint.

        Args:
            model_dir (Path): Path to the model directory containing checkpoint and config.
        
        Returns:
            BiCodec: The initialized BiCodec model.
        z/model.safetensorsz/config.yamlÚaudio_tokenizerr   r   r   r   r   r   r   )r   r   r   r   r   r   r   F)ÚstrictzMissing tensor: zUnexpected tensor: Nr   )r   r	   r   r
   r   r   r   Úload_state_dictÚprintÚevalÚremove_weight_norm)Úclsr   r   Ú	ckpt_pathÚconfigr   r   r   r   r   r   r   ÚmodelÚ
state_dictÚmissing_keysÚunexpected_keysÚkeyr   r   r   Úload_from_checkpointE   s6   
ù
zBiCodec.load_from_checkpointÚbatchc              
   C   s¶   |d }|   |d ¡ d¡}|  | dd¡¡}|  |¡}|  | dd¡¡\}}|}d}	|  |d |¡}
|  |
¡}|
| d¡ }
|  	|
¡}|d |d	 |d
 |||||d  d¡|	dœ	S )a%  
        Performs a forward pass through the model.

        Args:
            batch (dict): A dictionary containing features, reference waveform, and target waveform.
        
        Returns:
            dict: A dictionary containing the reconstruction, features, and other metrics.
        ÚfeatÚref_wavé   é   FÚz_qéÿÿÿÿÚvq_lossÚ
perplexityÚ
active_numÚwav)	r6   r7   Úcluster_sizeÚreconsÚ	pred_featÚx_vectorÚd_vectorÚaudiosÚwith_speaker_loss)
Úmel_transformerÚsqueezer   Ú	transposer   r   r   r   Ú	unsqueezer   )r   r/   r0   ÚmelÚzÚ
vq_outputsr=   r>   Ú
conditionsr@   Úxr<   Ú	wav_reconr   r   r   Úforwardq   s*   



÷zBiCodec.forwardc                 C   sV   |d }|   |d ¡ d¡}|  | dd¡¡}| j |¡}| j | dd¡¡}||fS )zî
        Tokenizes the input audio into semantic and global tokens.

        Args:
            batch (dict): The input audio features and reference waveform.

        Returns:
            tuple: Semantic tokens and global tokens.
        r0   r1   r2   r3   )rA   rB   r   rC   r   Útokenizer   )r   r/   r0   rE   rF   Úsemantic_tokensÚglobal_tokensr   r   r   rL   —   s   zBiCodec.tokenizec                 C   s@   | j  |¡}| j |¡}|  ||¡}|| d¡ }|  |¡}|S )a  
        Detokenizes the semantic and global tokens into a waveform.

        Args:
            semantic_tokens (tensor): Semantic tokens.
            global_tokens (tensor): Global tokens.

        Returns:
            tensor: Reconstructed waveform.
        r5   )r   Ú
detokenizer   r   rD   r   )r   rM   rN   r4   r>   rI   rJ   r   r   r   rO   «   s   
zBiCodec.detokenizer(   c                 C   sL   ddl m} |j|d |d |d |d |d |d |d	 d
ddd
| _dS )z¸
        Initializes the MelSpectrogram transformer based on the provided configuration.

        Args:
            config (dict): Configuration parameters for MelSpectrogram.
        r   NÚsample_rateÚn_fftÚ
win_lengthÚ
hop_lengthÚmel_fminÚmel_fmaxÚnum_melsr2   Úslaney)Ún_melsÚpowerÚnormÚ	mel_scale)Útorchaudio.transformsÚ
transformsÚMelSpectrogramrA   )r   r(   ÚTTr   r   r   r   ¿   s   özBiCodec.init_mel_transformerc                 C   s   dd„ }|   |¡ dS )z-Removes weight normalization from all layers.c                 S   s*   z
t jj | ¡ W d S  ty   Y d S w )N)ÚtorchÚnnÚutilsr%   Ú
ValueError)Úmr   r   r   Ú_remove_weight_norm×   s
   ÿz7BiCodec.remove_weight_norm.<locals>._remove_weight_normN)Úapply)r   re   r   r   r   r%   Õ   s   zBiCodec.remove_weight_norm)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   Ústrr   ra   ÚModuler   Úclassmethodr   r.   rK   r`   Úno_gradrL   rO   r   r%   Ú__classcell__r   r   r   r   r      s8    
þýüûúùø
ö "+&
r   Ú__main__z3pretrained_models/SparkTTS-0.5B/BiCodec/config.yamlz'pretrained_models/SparkTTS-0.5B/BiCodec)r   g¸…ëQ¸î?é   r2   i€>  é2   i   )r0   r9   r1   r;   zTest successfulzTest failed)-r`   Útorch.nnra   Úpathlibr   Útypingr   r   Ú	omegaconfr   Úsafetensors.torchr   Úsparktts.utils.filer   Ú(sparktts.modules.speaker.speaker_encoderr   Ú-sparktts.modules.encoder_decoder.feat_encoderr	   Ú-sparktts.modules.encoder_decoder.feat_decoderr
   Ú/sparktts.modules.encoder_decoder.wave_generatorr   Ú.sparktts.modules.vq.factorized_vector_quantizer   rl   r   rg   r(   r.   r)   ÚdurationÚrandnÚintrI   r0   ÚinputsÚoutputsrL   rM   rN   rO   rJ   ÚallcloseÚdetachr#   r   r   r   r   Ú<module>   s<    Cÿê