o
    i]                     @   s   d dl Z d dlZd dlmZ d dlmZmZmZ d dl	m
Z
mZ d dlmZ d dlmZ d dlmZ G dd	 d	Zed
krnd dlZe e j rJdndZededZdZee\ZZeed eZ e!de d dS dS )    N)Path)AnyDictTuple)Wav2Vec2FeatureExtractorWav2Vec2Model)load_config)
load_audio)BiCodecc                       s   e Zd ZdZddedejf fddZdd Zd	e	j
d
e	j
fddZded
ee	j
ejf fddZdejd
ejfddZdeeef d
ejfddZded
eejejf fddZdejdejd
e	jfddZ  ZS )BiCodecTokenizerz<BiCodec tokenizer for handling audio input and tokenization.N	model_dirdevicec                    s4   t    	 || _|| _t| d| _|   d S )Nz/config.yaml)super__init__r   r   r   config_initialize_model)selfr   r   kwargs	__class__ L/home/ubuntu/veenaModal/external/sparktts/sparktts/models/audio_tokenizer.pyr       s   
zBiCodecTokenizer.__init__c                 C   sZ   t | j d| j| _t| j d| _t	| j d| j| _
d| j
j_dS )zELoad and initialize the BiCodec model and Wav2Vec2 feature extractor.z/BiCodecz/wav2vec2-large-xlsr-53TN)r
   load_from_checkpointr   tor   modelr   from_pretrained	processorr   feature_extractorr   output_hidden_states)r   r   r   r   r   ,   s   

z"BiCodecTokenizer._initialize_modelwavreturnc                 C   s\   t | jd | jd  | jd  | jd  }t|}||kr(t||| d }|d| S )z/Get reference audio clip for speaker embedding.sample_rateref_segment_durationlatent_hop_length   N)intr   lennptile)r   r   ref_segment_length
wav_lengthr   r   r   get_ref_clip9   s   zBiCodecTokenizer.get_ref_clipwav_pathc                 C   s@   t || jd | jd d}| |}t|d }||fS )z0load auido and get reference audio from wav pathr!   volume_normalize)sampling_rater-   r   )r	   r   r+   torch
from_numpy	unsqueezefloat)r   r,   r   wav_refr   r   r   process_audioH   s   
zBiCodecTokenizer.process_audiowavsc                 C   sP   | j |dddddj}| || jj}|jd |jd  |jd  d }|S )	zextract wav2vec2 features>  ptT)r.   return_tensorspaddingr               )r   input_valuesr   r   r   hidden_states)r   r5   inputsfeat	feats_mixr   r   r   extract_wav2vec2_featuresU   s   z*BiCodecTokenizer.extract_wav2vec2_featuresbatchc                 C   s.   |  |d }||d< | j|\}}||fS )a  tokenize the batch of audio

        Args:
            batch:
                wavs (List[np.ndarray]): batch of audio
                ref_wavs (torch.Tensor): reference audio. shape: (batch_size, seq_len)

        Returns:
            semantic_tokens: semantic tokens. shape: (batch_size, seq_len, latent_dim)
            global_tokens: global tokens. shape: (batch_size, seq_len, global_dim)
        r   rA   )rC   r   tokenize)r   rD   featssemantic_tokensglobal_tokensr   r   r   tokenize_batche   s   zBiCodecTokenizer.tokenize_batch
audio_pathc                 C   sd   |  |\}}| |}t|d | j|| j|| jd}| j	|\}}||fS )ztokenize the audior   )r   ref_wavrA   )
r4   rC   r/   r0   r1   r2   r   r   r   rE   )r   rJ   r   rK   rA   rD   rG   rH   r   r   r   rE   w   s   


zBiCodecTokenizer.tokenizerH   rG   c                 C   s,   | d}| j||}|    S )aB  detokenize the tokens to waveform

        Args:
            global_tokens: global tokens. shape: (batch_size, global_dim)
            semantic_tokens: semantic tokens. shape: (batch_size, latent_dim)

        Returns:
            wav_rec: waveform. shape: (batch_size, seq_len) for batch or (seq_len,) for single
        r$   )r1   r   
detokenizedetachsqueezecpunumpy)r   rH   rG   wav_recr   r   r   rL      s   
zBiCodecTokenizer.detokenize)N)__name__
__module____qualname____doc__r   r/   r   r   r   r'   ndarrayr+   r   Tensorr4   rC   r   strr   rI   rE   arrayrL   __classcell__r   r   r   r   r      s     r   __main__cudarO   z pretrained_models/Spark-TTS-0.5B)r   r   zexample/prompt_audio.wavzexample/prompt_recon.wavr6   )"r/   rP   r'   pathlibr   typingr   r   r   transformersr   r   sparktts.utils.filer   sparktts.utils.audior	   sparktts.models.bicodecr
   r   rR   	soundfilesfr   r\   is_available	tokenizerr,   rE   rH   rG   rL   rN   rQ   writer   r   r   r   <module>   s*   y