
    hE                         d Z ddlZddlZddlZddlmZmZmZmZm	Z	 ddl
ZddlZddlmZ ddlmZ  ej        e          Z G d d          Z G d d	e          Zd	dgZdS )
z'
Processor class for VibeVoice models.
    N)ListOptionalUnionDictAny)FeatureExtractionMixin)loggingc                       e Zd ZdZddedefdZdej        defd	Z	ddej        de
e         defdZdej        dej        fdZd
S )AudioNormalizerz
    Audio normalization class for VibeVoice tokenizer.
    
    This class provides audio normalization to ensure consistent input levels
    for the VibeVoice tokenizer while maintaining audio quality.
    ư>target_dB_FSepsc                 "    || _         || _        dS )z
        Initialize the audio normalizer.
        
        Args:
            target_dB_FS (float): Target dB FS level for the audio. Default: -25
            eps (float): Small value to avoid division by zero. Default: 1e-6
        Nr   r   )selfr   r   s      Y/workspace/chatterbox-finetuning/src/vibevoice/processor/vibevoice_tokenizer_processor.py__init__zAudioNormalizer.__init__   s     )    audioreturnc                     t          j        t          j        |dz                      }d| j        dz  z  || j        z   z  }||z  }|||fS )z
        Adjust the audio to the target dB FS level.
        
        Args:
            audio (np.ndarray): Input audio signal
            
        Returns:
            tuple: (normalized_audio, rms, scalar)
           
      )npsqrtmeanr   r   )r   r   rmsscalarnormalized_audios        r   tailor_dB_FSzAudioNormalizer.tailor_dB_FS&   sU     gbgeQh''(()B./3>B 6>f,,r   Nr    c                     |9t          j        t          j        |                    }|dk    r|| j        z   }nd}||z  |fS )a  
        Avoid clipping by scaling down if necessary.
        
        Args:
            audio (np.ndarray): Input audio signal
            scalar (float, optional): Explicit scaling factor
            
        Returns:
            tuple: (normalized_audio, scalar)
        Ng      ?)r   maxabsr   )r   r   r    max_vals       r   avoid_clippingzAudioNormalizer.avoid_clipping5   sK     >fRVE]]++G}} 48+v~v%%r   c                 h    |                      |          \  }}}|                     |          \  }}|S )z
        Normalize the audio by adjusting to target dB FS and avoiding clipping.
        
        Args:
            audio (np.ndarray): Input audio signal
            
        Returns:
            np.ndarray: Normalized audio signal
        )r"   r'   )r   r   _s      r   __call__zAudioNormalizer.__call__I   s9     ''..q!&&u--qr   )r   r   N)__name__
__module____qualname____doc__floatr   r   ndarraytupler"   r   r'   r*    r   r   r   r      s         	 	U 	u 	 	 	 	-"* - - - - -& &BJ & &SX & & & &(bj RZ      r   r   c                   r    e Zd ZdZdgZ	 	 	 	 d deded	ed
ef fdZde	j
        de	j
        fdZdee	j
        ee         f         de	j
        fdZ	 	 	 d!deee	j
        ee         ee	j
                 eee                  ee         f         dee         dee         fdZdede	j
        fdZ	 d"deee	j
        f         dee         de	j
        fdZdeeef         fdZ	 	 	 	 d#deej        e	j
        eeej        e	j
        f                  f         dedee         dedef
dZde	j
        dede	j
        fdZ xZS )$VibeVoiceTokenizerProcessora}  
    Processor for VibeVoice acoustic tokenizer models.
    
    This processor handles audio preprocessing for VibeVoice models, including:
    - Audio format conversion (stereo to mono)
    - Optional audio normalization
    - Streaming support for infinite-length audio
    
    Args:
        sampling_rate (int, optional): Expected sampling rate. Defaults to 24000.
        normalize_audio (bool, optional): Whether to normalize audio. Defaults to True.
        target_dB_FS (float, optional): Target dB FS for normalization. Defaults to -25.
        eps (float, optional): Small value for numerical stability. Defaults to 1e-6.
    input_features]  Tr   r   sampling_ratenormalize_audior   r   c                      t                      j        di | || _        || _        | j        rt	          ||          | _        nd | _        ||||d| _        d S )Nr   )r8   r9   r   r   r3   )superr   r8   r9   r   
normalizerfeature_extractor_dict)r   r8   r9   r   r   kwargs	__class__s         r   r   z$VibeVoiceTokenizerProcessor.__init__l   s     	""6"""*.  	#-<SQQQDOO"DO +.(	'
 '
###r   r   r   c                    t          |j                  dk    r|S t          |j                  dk    r|j        d         dk    rt          j        |d          S |j        d         dk    rt          j        |d          S |j        d         dk    r|                    d          S |j        d         dk    r|                    d          S t          d|j                   t          d|j                   )z
        Convert stereo audio to mono if needed.
        
        Args:
            audio (np.ndarray): Input audio array
            
        Returns:
            np.ndarray: Mono audio array
           r   r   )axiszUnexpected audio shape: z%Audio should be 1D or 2D, got shape: )lenshaper   r   squeeze
ValueErrorr   r   s     r   _ensure_monoz(VibeVoiceTokenizerProcessor._ensure_mono   s     u{q  L""{1~""wu1----Q1$$wu1---- ;q>Q&& ==+++[^q(( ==+++$%M%M%MNNNRU[RRSSSr   c                 *   t          |t          j                  s!t          j        |t          j                  }n|                    t          j                  }|                     |          }| j        r| j        |                     |          }|S )z
        Process a single audio array.
        
        Args:
            audio: Single audio input
            
        Returns:
            np.ndarray: Processed audio
        dtype)	
isinstancer   r1   arrayfloat32astyperH   r9   r<   rG   s     r   _process_single_audioz1VibeVoiceTokenizerProcessor._process_single_audio   s     %,, 	-HU"*555EELL,,E !!%((  	+DO$?OOE**Er   Nreturn_tensorsc                     |t          d          |1| j        k    r&t                              d| d j         d           t	          |t
                    r                     |          }d}nt	          |t                    rtt          |          dk    rt          d          t          d	 |D                       r fd
|D             }d}n*t	          |d         t          j        t          f          }nd}|r fd|D             }n                     |          g}|dk    rt          |          dk    rAt          j        |d                                       d                              d          }nt          j        d |D                                           d          }n|dk    rgt          |          dk    r(|d         t          j        t          j        ddf         }nIt          j        |          ddt          j        ddf         }nt          |          dk    r|d         n|}d|i}|S )a  
        Process audio for VibeVoice models.
        
        Args:
            audio: Audio input(s) to process. Can be:
                - str: Path to audio file
                - np.ndarray: Audio array
                - List[float]: Audio as list of floats
                - List[np.ndarray]: Batch of audio arrays
                - List[str]: Batch of audio file paths
            sampling_rate (int, optional): Sampling rate of the input audio
            return_tensors (str, optional): Return format ('pt' for PyTorch, 'np' for NumPy)
            
        Returns:
            dict: Processed audio inputs with keys:
                - input_features: Audio tensor(s) ready for the model
        NzAudio input is requiredzInput sampling rate (z') differs from expected sampling rate (z). Please resample your audio.Fr   zEmpty audio list providedc              3   @   K   | ]}t          |t                    V  d S r+   )rL   str).0items     r   	<genexpr>z7VibeVoiceTokenizerProcessor.__call__.<locals>.<genexpr>   s,      ;;T:dC((;;;;;;r   c                 :    g | ]}                     |          S r3   )_load_audio_from_path)rU   pathr   s     r   
<listcomp>z8VibeVoiceTokenizerProcessor.__call__.<locals>.<listcomp>   s'    LLLd33D99LLLr   Tc                 :    g | ]}                     |          S r3   )rP   )rU   ar   s     r   r[   z8VibeVoiceTokenizerProcessor.__call__.<locals>.<listcomp>   s'    LLLt99!<<LLLr   ptrA   c                 6    g | ]}t          j        |          S r3   )torch
from_numpyrU   r]   s     r   r[   z8VibeVoiceTokenizerProcessor.__call__.<locals>.<listcomp>   s#    -[-[-[ae.>q.A.A-[-[-[r   r   r   )rF   r8   loggerwarningrL   rT   rY   listrC   allr   r1   rP   r`   ra   	unsqueezestacknewaxis)	r   r   r8   rQ   r>   
is_batchedprocessed_audior6   outputss	   `        r   r*   z$VibeVoiceTokenizerProcessor.__call__   s   0 =6777 $$:L)L)LNNU U U"&"4U U U   eS!! 	..u55EJJt$$ 	5zzQ !<=== ;;U;;;;; FLLLLeLLL!

 (a2:t2DEE

 J  	BLLLLeLLLOO#99%@@AO T!!?##q((!&!1/!2D!E!E!O!OPQ!R!R!\!\]^!_!_ "'-[-[?-[-[-[!\!\!f!fgh!i!it##?##q((!0!3BJ
AAA4M!N!#/!:!:111bj!!!;K!L363G3G13L3L_Q//RaN ^
 r   
audio_pathc                 |   t           j                            |          d                                         }|dv r&ddl}|                    || j        d          \  }}|S |dk    rt          j        |d	                                          }t          |t          j
                  r|                                }nt          j        |          }|                    t          j                  S |d
k    r3t          j        |          }|                    t          j                  S t!          d| d          )z
        Load audio from file path.
        
        Args:
            audio_path (str): Path to audio file
            
        Returns:
            np.ndarray: Loaded audio array
        rA   ).wavz.mp3z.flacz.m4az.oggr   NT)srmonoz.ptcpu)map_locationz.npyzUnsupported file format: zC. Supported formats: .wav, .mp3, .flac, .m4a, .ogg, .pt, .npy, .npz)osrZ   splitextlowerlibrosaloadr8   r`   rE   rL   Tensornumpyr   rM   rO   rN   rF   )r   rm   file_extrw   audio_arrayrp   audio_tensors          r   rY   z1VibeVoiceTokenizerProcessor._load_audio_from_path  sB    7##J//288::@@@NNN%ll% +  OK
  :juEEEMMOOL,55 5*0022 h|44%%bj111'*--K%%bj111UH U U U  r   audio_path_or_array	normalizec                    t          |t                    r|                     |          }n t          j        |t          j                  }| j        }||| _        	 |                     |          }|| _        n# || _        w xY w|S )a  
        Convenience method to preprocess audio from file path or array.
        This method is kept for backward compatibility but __call__ is recommended.
        
        Args:
            audio_path_or_array: Path to audio file or numpy array
            normalize: Whether to normalize (overrides default setting)
            
        Returns:
            np.ndarray: Preprocessed audio array
        rJ   )rL   rT   rY   r   rM   rN   r9   rP   )r   r~   r   r|   original_normalize	processeds         r   preprocess_audioz,VibeVoiceTokenizerProcessor.preprocess_audio7  s      )3// 	J445HIIKK(#6bjIIIK "1 #,D 	622;??I $6D  #5D 5555s   A: :	Bc                     | j         S )zb
        Convert the object to a dict containing all attributes needed for serialization.
        )r=   )r   s    r   to_dictz#VibeVoiceTokenizerProcessor.to_dictZ  s     **r   
output.wavFaudio_output_pathbatch_prefixc                    || j         }	 ddl}n# t          $ r t          d          w xY wt          |t          j                  rK|                                                                                                	                                }nzt          |t          j                  r|}n]t          |t                    r)t          d |D                       rd |D             }n"|}nt          dt          |                     g }t          |t                    r|}	t!          j        |	d           t%          |          D ]l\  }
}|                     ||          }t           j                            |	| |
 d	          }|                    |||           |                    |           mnt1          |j                  d
k    rB|j        d         }|dk    r|}	t!          j        |	d           t5          |          D ]}
||
         }t1          |j                  dk    r&|j        d         dk    r|                    d          }|                     ||          }t           j                            |	| |
 d	          }|                    |||           |                    |           n|                                }|                     ||          }|                    |||           |                    |           nB|                     ||          }|                    |||           |                    |           |S )a;  
        Save audio data to WAV file(s).
        
        Args:
            audio: Audio data to save. Can be:
                - torch.Tensor: PyTorch tensor with shape (B, C, T) or (B, T) or (T)
                - np.ndarray: NumPy array with shape (B, C, T) or (B, T) or (T)
                - List of tensors or arrays
            output_path: Path where to save the audio. If saving multiple files,
                this is treated as a directory and individual files will be saved inside.
            sampling_rate: Sampling rate for the saved audio. Defaults to the processor's rate.
            normalize: Whether to normalize audio before saving.
            batch_prefix: Prefix for batch files when saving multiple audios.
                
        Returns:
            List[str]: Paths to the saved audio files.
        Nr   zQsoundfile is required to save audio files. Install it with: pip install soundfilec              3   J   K   | ]}t          |t          j                  V  d S r+   )rL   r`   ry   rb   s     r   rW   z9VibeVoiceTokenizerProcessor.save_audio.<locals>.<genexpr>  s.      >>1:a..>>>>>>r   c                     g | ]L}|                                                                                                                                 MS r3   )r0   detachrr   rz   rb   s     r   r[   z:VibeVoiceTokenizerProcessor.save_audio.<locals>.<listcomp>  sD    LLLAGGII,,..2244::<<LLLr   zUnsupported audio type: T)exist_okro      rA   )r8   	soundfileImportErrorrL   r`   ry   r0   r   rr   rz   r   r1   re   rf   rF   typert   makedirs	enumerate_prepare_audio_for_saverZ   joinwriteappendrC   rD   rangerE   )r   r   r   r8   r   r   sfaudio_npsaved_paths
output_diri
audio_item	file_path
batch_sizesingle_audios                  r   
save_audioz&VibeVoiceTokenizerProcessor.save_audio`  s   2   .M	""""" 	 	 	9  	 eU\** 	G{{}}++--113399;;HHrz** 		GHHt$$ 	G>>>>>>> !LLeLLL EUEEFFF h%% 1	0$J K
T2222 "+8!4!4 . .:!99*iPP
GLL5Ma5M5M5MNN	J>>>""9----	. 8>""a''%^A.
>>!,J K
T:::: #:.. 
6 
6'/{|122Q66+1!499/;/C/CA/F/F'+'C'CLR['\'\$&GLL=Ua=U=U=U$V$V	L-HHH#**95555
6 "*!1!1!3!3J!%!=!=j)!T!TJHH[*mDDD&&{3333 "99(INN
j-@@@"";///s    *c                     t          |j                  dk    r&|j        d         dk    r|                    d          }|r1t          j        |                                          }|dk    r||z  }|S )a4  
        Prepare audio for saving by ensuring it's the right shape and optionally normalizing.
        
        Args:
            audio: Audio data as numpy array
            normalize: Whether to normalize audio
            
        Returns:
            np.ndarray: Processed audio ready for saving
        rA   r   )rC   rD   rE   r   r%   r$   )r   r   r   r&   s       r   r   z3VibeVoiceTokenizerProcessor._prepare_audio_for_save  st     u{aEKNa$7$7MM!$$E  	(fUmm''))G{{r   )r7   Tr   r   )NNNr+   )r   NFr   )r,   r-   r.   r/   model_input_namesintboolr0   r   r   r1   rH   r   r   rP   rT   r   r*   rY   r   r   r   r   r`   ry   r   r   __classcell__)r?   s   @r   r5   r5   [   s         ** # $!
 

 
 	

 
 
 
 
 
 
6T"* T T T T T85T%[1H+I bj    6 fj'+(,	Q QS"*d5k4
3CT$u+EVX\]`XaabQ  }Q !	Q Q Q Qf& &
 & & & &V %)   "3
?3  D>  
	       F+c3h + + + + ('+$i iU\2:tE%,
:R4S/TTUi i  }	i
 i i i i iVRZ D RZ        r   r5   )r/   rt   jsonwarningstypingr   r   r   r   r   rz   r   r`   %transformers.feature_extraction_utilsr   transformers.utilsr	   
get_loggerr,   rc   r   r5   __all__r3   r   r   <module>r      s    
			   3 3 3 3 3 3 3 3 3 3 3 3 3 3      H H H H H H & & & & & &		H	%	%D D D D D D D DPE E E E E"8 E E EP )*;
<r   