o
    ꁱiE                     @   s   d Z ddlZddlZddlZddlmZmZmZmZm	Z	 ddl
ZddlZddlmZ ddlmZ eeZG dd dZG dd	 d	eZd	dgZdS )
z'
Processor class for VibeVoice models.
    N)ListOptionalUnionDictAny)FeatureExtractionMixin)loggingc                   @   sl   e Zd ZdZddedefddZdejd	efd
dZ	ddejde
e d	efddZdejd	ejfddZdS )AudioNormalizerz
    Audio normalization class for VibeVoice tokenizer.
    
    This class provides audio normalization to ensure consistent input levels
    for the VibeVoice tokenizer while maintaining audio quality.
    ư>target_dB_FSepsc                 C   s   || _ || _dS )z
        Initialize the audio normalizer.
        
        Args:
            target_dB_FS (float): Target dB FS level for the audio. Default: -25
            eps (float): Small value to avoid division by zero. Default: 1e-6
        Nr   r   )selfr   r    r   U/home/ubuntu/vibevoice-community/vibevoice/processor/vibevoice_tokenizer_processor.py__init__   s   
zAudioNormalizer.__init__audioreturnc                 C   s>   t t |d }d| jd  || j  }|| }|||fS )z
        Adjust the audio to the target dB FS level.
        
        Args:
            audio (np.ndarray): Input audio signal
            
        Returns:
            tuple: (normalized_audio, rms, scalar)
           
      )npsqrtmeanr   r   )r   r   rmsscalarnormalized_audior   r   r   tailor_dB_FS&   s   

zAudioNormalizer.tailor_dB_FSNr   c                 C   s<   |du rt t |}|dkr|| j }nd}|| |fS )a  
        Avoid clipping by scaling down if necessary.
        
        Args:
            audio (np.ndarray): Input audio signal
            scalar (float, optional): Explicit scaling factor
            
        Returns:
            tuple: (normalized_audio, scalar)
        Ng      ?)r   maxabsr   )r   r   r   max_valr   r   r   avoid_clipping5   s   zAudioNormalizer.avoid_clippingc                 C   s"   |  |\}}}| |\}}|S )z
        Normalize the audio by adjusting to target dB FS and avoiding clipping.
        
        Args:
            audio (np.ndarray): Input audio signal
            
        Returns:
            np.ndarray: Normalized audio signal
        )r   r"   )r   r   _r   r   r   __call__I   s   zAudioNormalizer.__call__)r
   r   N)__name__
__module____qualname____doc__floatr   r   ndarraytupler   r   r"   r$   r   r   r   r   r	      s    r	   c                       s  e Zd ZdZdgZ				d)deded	ed
ef fddZde	j
de	j
fddZdee	j
ee f de	j
fddZ			d*deee	j
ee ee	j
 eee  ee f dee dee fddZdede	j
fddZ	d+deee	j
f dee de	j
fddZdeeef fddZ	 		!	"d,deeje	j
eeeje	j
f  f d#edee ded$ef
d%d&Zde	j
dede	j
fd'd(Z  ZS )-VibeVoiceTokenizerProcessora}  
    Processor for VibeVoice acoustic tokenizer models.
    
    This processor handles audio preprocessing for VibeVoice models, including:
    - Audio format conversion (stereo to mono)
    - Optional audio normalization
    - Streaming support for infinite-length audio
    
    Args:
        sampling_rate (int, optional): Expected sampling rate. Defaults to 24000.
        normalize_audio (bool, optional): Whether to normalize audio. Defaults to True.
        target_dB_FS (float, optional): Target dB FS for normalization. Defaults to -25.
        eps (float, optional): Small value for numerical stability. Defaults to 1e-6.
    input_features]  Tr
   r   sampling_ratenormalize_audior   r   c                    sN   t  jdi | || _|| _| jrt||d| _nd | _||||d| _d S )Nr   )r0   r1   r   r   r   )superr   r0   r1   r	   
normalizerfeature_extractor_dict)r   r0   r1   r   r   kwargs	__class__r   r   r   l   s   z$VibeVoiceTokenizerProcessor.__init__r   r   c                 C   s   t |jdkr	|S t |jdkrL|jd dkrtj|ddS |jd dkr,tj|ddS |jd dkr8|dS |jd dkrD|dS td|j td|j )z
        Convert stereo audio to mono if needed.
        
        Args:
            audio (np.ndarray): Input audio array
            
        Returns:
            np.ndarray: Mono audio array
           r   r   )axiszUnexpected audio shape: z%Audio should be 1D or 2D, got shape: )lenshaper   r   squeeze
ValueErrorr   r   r   r   r   _ensure_mono   s   


z(VibeVoiceTokenizerProcessor._ensure_monoc                 C   sR   t |tjstj|tjd}n|tj}| |}| jr'| jdur'| |}|S )z
        Process a single audio array.
        
        Args:
            audio: Single audio input
            
        Returns:
            np.ndarray: Processed audio
        dtypeN)	
isinstancer   r+   arrayfloat32astyper?   r1   r3   r>   r   r   r   _process_single_audio   s   

z1VibeVoiceTokenizerProcessor._process_single_audioNreturn_tensorsc           	         s  |du rt d|dur| jkrtd| d j d t|tr+ |}d}n1t|trZt|dkr:t dt	d	d
 |D rO fdd|D }d}nt|d t
jtf}nd}|rh fdd|D }n |g}|dkrt|dkrt|d dd}nBtdd |D d}n4|dkrt|dkr|d t
jt
jddf }nt
|ddt
jddf }nt|dkr|d n|}d|i}|S )a  
        Process audio for VibeVoice models.
        
        Args:
            audio: Audio input(s) to process. Can be:
                - str: Path to audio file
                - np.ndarray: Audio array
                - List[float]: Audio as list of floats
                - List[np.ndarray]: Batch of audio arrays
                - List[str]: Batch of audio file paths
            sampling_rate (int, optional): Sampling rate of the input audio
            return_tensors (str, optional): Return format ('pt' for PyTorch, 'np' for NumPy)
            
        Returns:
            dict: Processed audio inputs with keys:
                - input_features: Audio tensor(s) ready for the model
        NzAudio input is requiredzInput sampling rate (z') differs from expected sampling rate (z). Please resample your audio.Fr   zEmpty audio list providedc                 s   s    | ]}t |tV  qd S r%   )rB   str).0itemr   r   r   	<genexpr>   s    z7VibeVoiceTokenizerProcessor.__call__.<locals>.<genexpr>c                       g | ]}  |qS r   )_load_audio_from_path)rI   pathr   r   r   
<listcomp>       z8VibeVoiceTokenizerProcessor.__call__.<locals>.<listcomp>Tc                    rL   r   )rF   rI   arO   r   r   rP      rQ   ptr8   c                 S   s   g | ]}t |qS r   )torch
from_numpyrR   r   r   r   rP      rQ   r   r   )r=   r0   loggerwarningrB   rH   rM   listr:   allr   r+   rF   rU   rV   	unsqueezestacknewaxis)	r   r   r0   rG   r5   
is_batchedprocessed_audior.   outputsr   rO   r   r$      sD   


 z$VibeVoiceTokenizerProcessor.__call__
audio_pathc                 C   s   t j|d  }|dv rddl}|j|| jdd\}}|S |dkrBtj|dd	 }t	|tj
r7| }nt|}|tjS |d
krQt|}|tjS td| d)z
        Load audio from file path.
        
        Args:
            audio_path (str): Path to audio file
            
        Returns:
            np.ndarray: Loaded audio array
        r8   ).wavz.mp3z.flacz.m4az.oggr   NT)srmonoz.ptcpu)map_locationz.npyzUnsupported file format: zC. Supported formats: .wav, .mp3, .flac, .m4a, .ogg, .pt, .npy, .npz)osrN   splitextlowerlibrosaloadr0   rU   r<   rB   Tensornumpyr   rC   rE   rD   r=   )r   ra   file_extrj   audio_arrayrc   audio_tensorr   r   r   rM     s*   




z1VibeVoiceTokenizerProcessor._load_audio_from_pathaudio_path_or_array	normalizec                 C   sZ   t |tr| |}ntj|tjd}| j}|dur|| _z| |}W || _|S || _w )a  
        Convenience method to preprocess audio from file path or array.
        This method is kept for backward compatibility but __call__ is recommended.
        
        Args:
            audio_path_or_array: Path to audio file or numpy array
            normalize: Whether to normalize (overrides default setting)
            
        Returns:
            np.ndarray: Preprocessed audio array
        r@   N)rB   rH   rM   r   rC   rD   r1   rF   )r   rq   rr   ro   original_normalize	processedr   r   r   preprocess_audio7  s   
z,VibeVoiceTokenizerProcessor.preprocess_audioc                 C   s   | j S )zb
        Convert the object to a dict containing all attributes needed for serialization.
        )r4   rO   r   r   r   to_dictZ  s   z#VibeVoiceTokenizerProcessor.to_dict
output.wavFaudio_output_pathbatch_prefixc                 C   s"  |du r| j }zddl}W n ty   tdw t|tjr*|   	 }n+t|t
jr3|}n"t|trLtdd |D rIdd |D }n|}n	tdt| g }t|tr|}	tj|	d	d
 t|D ]"\}
}| ||}tj|	| |
 d}|||| || qi|S t|jdkr|jd }|dkr|}	tj|	d	d
 t|D ]7}
||
 }t|jdkr|jd dkr|d}| ||}tj|	| |
 d}|||| || q|S | }| ||}|||| || |S | ||}|||| || |S )a;  
        Save audio data to WAV file(s).
        
        Args:
            audio: Audio data to save. Can be:
                - torch.Tensor: PyTorch tensor with shape (B, C, T) or (B, T) or (T)
                - np.ndarray: NumPy array with shape (B, C, T) or (B, T) or (T)
                - List of tensors or arrays
            output_path: Path where to save the audio. If saving multiple files,
                this is treated as a directory and individual files will be saved inside.
            sampling_rate: Sampling rate for the saved audio. Defaults to the processor's rate.
            normalize: Whether to normalize audio before saving.
            batch_prefix: Prefix for batch files when saving multiple audios.
                
        Returns:
            List[str]: Paths to the saved audio files.
        Nr   zQsoundfile is required to save audio files. Install it with: pip install soundfilec                 s   s    | ]	}t |tjV  qd S r%   )rB   rU   rl   rR   r   r   r   rK     s    z9VibeVoiceTokenizerProcessor.save_audio.<locals>.<genexpr>c                 S   s    g | ]}|     qS r   )r*   detachre   rm   rR   r   r   r   rP     s     z:VibeVoiceTokenizerProcessor.save_audio.<locals>.<listcomp>zUnsupported audio type: T)exist_okrb      r8   )r0   	soundfileImportErrorrB   rU   rl   r*   r{   re   rm   r   r+   rY   rZ   r=   typerg   makedirs	enumerate_prepare_audio_for_saverN   joinwriteappendr:   r;   ranger<   )r   r   ry   r0   rr   rz   sfaudio_npsaved_paths
output_diri
audio_item	file_path
batch_sizesingle_audior   r   r   
save_audio`  sh   

'



z&VibeVoiceTokenizerProcessor.save_audioc                 C   sL   t |jdkr|jd dkr|d}|r$t| }|dkr$|| }|S )a4  
        Prepare audio for saving by ensuring it's the right shape and optionally normalizing.
        
        Args:
            audio: Audio data as numpy array
            normalize: Whether to normalize audio
            
        Returns:
            np.ndarray: Processed audio ready for saving
        r8   r   )r:   r;   r<   r   r    r   )r   r   rr   r!   r   r   r   r     s   
z3VibeVoiceTokenizerProcessor._prepare_audio_for_save)r/   Tr
   r   )NNNr%   )rw   NFrx   )r&   r'   r(   r)   model_input_namesintboolr*   r   r   r+   r?   r   r   rF   rH   r   r$   rM   ru   r   r   rv   rU   rl   r   r   __classcell__r   r   r6   r   r-   [   sl    "*
S+
#	 
"kr-   )r)   rg   jsonwarningstypingr   r   r   r   r   rm   r   rU   %transformers.feature_extraction_utilsr   transformers.utilsr   
get_loggerr&   rW   r	   r-   __all__r   r   r   r   <module>   s     
H   