o
    ꁱi~w                     @   s   d dl Z d dlZd dlmZmZmZmZmZmZ d dl	Z	d dl
Z
d dlZd dlZd dlmZmZmZmZmZ d dlmZmZ ddlmZ eeZG dd dZdgZdS )	    N)ListOptionalUnionDictAnyTuple)BatchEncodingPaddingStrategyPreTokenizedInput	TextInputTruncationStrategy)
TensorTypelogging   )AudioNormalizerc                   @   s  e Zd ZdZd@ddZedd Zd	eee	j
f fd
dZ							dAdeeeee eeee ee f  deeeeeejf  eeeeejf   f  deeeef deeeef dee deeeef  dedefddZ	dBdeeef deeeeejf   deeef fddZ					dCdeeeef  deeeef deeeef dee deeeef  dedefddZdeeeejf  deee eej ee f fddZ			dDdeej deeeef  d eeeej f  d!eej! deeef f
d"d#Z"d$edefd%d&Z#d'edefd(d)Z$d*edeeeef  fd+d,Z%d-ed.edefd/d0Z&d1d2 Z'd3d4 Z(e)d5d6 Z*	7			8dEd9eej+ejeeej+ejf  f d:ed;ee d<ed=edefd>d?Z,dS )FVibeVoiceProcessoraW  
    Constructs a VibeVoice processor which wraps a VibeVoice tokenizer and audio processor into a single processor.

    [`VibeVoiceProcessor`] offers all the functionalities of [`VibeVoiceTokenizer`] and [`VibeVoiceTokenizerProcessor`]. 
    See the [`~VibeVoiceProcessor.__call__`] and [`~VibeVoiceProcessor.decode`] for more information.

    Args:
        tokenizer (`VibeVoiceTextTokenizer` or `VibeVoiceTextTokenizerFast`):
            The tokenizer for text processing.
        audio_processor (`VibeVoiceTokenizerProcessor`):
            The audio processor for speech processing.
        speech_tok_compress_ratio (`int`, *optional*, defaults to 3200):
            The compression ratio for speech tokenization.
        db_normalize (`bool`, *optional*, defaults to True):
            Whether to apply decibel normalization to audio inputs.
    N  Tc                 K   s2   || _ || _|| _|| _|rt nd | _d| _d S )Nz~ Transform the text provided by various speakers into speech output, utilizing the distinct voice of each respective speaker.
)	tokenizeraudio_processorspeech_tok_compress_ratiodb_normalizer   audio_normalizersystem_prompt)selfr   r   r   r   kwargs r   K/home/ubuntu/vibevoice-community/vibevoice/processor/vibevoice_processor.py__init__#   s   
zVibeVoiceProcessor.__init__c              
   K   s  ddl }ddl}ddlm} ddlm} ddlm}m} |j	
|d}	d}
|j	|	rFt|	d}||}
W d   n1 s@w   Y  nMz%||dfi |}t|d}||}
W d   n1 sew   Y  W n' ty } ztd	| d
|  td ddd}
W Y d}~nd}~ww |
dd}|
dd}|
ddp|dd}td|  d| v r|j|fi |}ntd| dd|
v r|
d }||dd|dd|dd|ddd}n| }| ||||d S )!a  
        Instantiate a VibeVoiceProcessor from a pretrained VibeVoice processor.

        Args:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                This can be either:
                - a string, the *model id* of a pretrained model
                - a path to a *directory* containing processor config

        Returns:
            [`VibeVoiceProcessor`]: The processor object instantiated from pretrained model.
        r   N)cached_filer   )VibeVoiceTokenizerProcessor)VibeVoiceTextTokenizerVibeVoiceTextTokenizerFastpreprocessor_config.jsonrz-Could not load preprocessor_config.json from : zUsing default configurationr   T)r   r   r   r   language_model_pretrained_namezQwen/Qwen2.5-1.5BzLoading tokenizer from qwenzUnsupported tokenizer type for z&. Supported types: Qwen, Llama, Gemma.r   sampling_rate]  normalize_audiotarget_dB_FSepsư>)r'   r)   r*   r,   )r   r   r   r   )osjsontransformers.utilsr   vibevoice_tokenizer_processorr   2vibevoice.modular.modular_vibevoice_text_tokenizerr    r!   pathjoinexistsopenload	Exceptionloggerwarninggetpopinfolowerfrom_pretrained
ValueError)clspretrained_model_name_or_pathr   r.   r/   r   r   r    r!   config_pathconfigfconfig_fileer   r   r%   r   audio_configr   r   r   r   r?   +   sr   
	



z"VibeVoiceProcessor.from_pretrainedsave_directoryc                 K   s   ddl }ddl}|j|dd d| j| jdt| jddt| jd	dt| jd
dt| jdddd}|j|d}t	|d}|j
||dd W d   n1 sQw   Y  td|  dS )a  
        Save a processor to a directory, so that it can be re-loaded using the
        [`~VibeVoiceProcessor.from_pretrained`] class method.

        Args:
            save_directory (`str` or `os.PathLike`):
                Directory where the processor will be saved.
        r   NT)exist_okr   r   r'   r(   r)   r*   r+   r,   r-   )feature_extractor_typer'   r)   r*   r,   )processor_classr   r   r   r"   w   )indentz!Processor configuration saved in )r.   r/   makedirsr   r   getattrr   r3   r4   r6   dumpr9   r=   )r   rI   r   r.   r/   processor_configrC   rE   r   r   r   save_pretrained   s$   	z"VibeVoiceProcessor.save_pretrainedFtextvoice_samplespadding
truncation
max_lengthreturn_tensorsreturn_attention_maskreturnc                 K   s   t |tst |trt|dkrt |d ts|g}	d}
n|}	d}
|dur8|
r1t |d ttjfr5|g}n
|}ndgt|	 }g }t|	|D ]\}}| ||}|| qF| j	||||||d}|S )a  
        Main method to process one or more podcast scripts with optional voice samples.

        Args:
            text (`str`, `List[str]`):
                The input text(s) to process. Can be:
                - A single script string
                - A list of script strings for batch processing
                - A path to a .json or .txt file
                - A list of paths
            voice_samples (`List[Union[str, np.ndarray]]`, `List[List[Union[str, np.ndarray]]]`, *optional*):
                Voice samples for each script. Can be:
                - A list of samples for a single script
                - A list of lists for batch processing
            padding (`bool`, `str` or `PaddingStrategy`, defaults to `True`):
                Whether to pad sequences to the same length
            truncation (`bool`, `str` or `TruncationStrategy`, defaults to `False`):
                Whether to truncate sequences
            max_length (`int`, *optional*):
                Maximum length of the returned sequences
            return_tensors (`str` or `TensorType`, *optional*):
                If set, will return tensors of a particular framework
            return_attention_mask (`bool`, defaults to `True`):
                Whether to return the attention mask

        Returns:
            `BatchEncoding`: A BatchEncoding with the following fields:
                - **input_ids** -- List of token id sequences or tensor
                - **attention_mask** -- List of attention masks or tensor
                - **speech_tensors** -- Padded speech inputs (if voice_samples provided)
                - **speech_masks** -- Speech masks (if voice_samples provided)
                - **speech_input_mask** -- Boolean masks indicating speech token positions
        r   FTN)rW   rX   rY   rZ   r[   )

isinstancestrlistlennpndarrayzip_process_singleappend_batch_encode)r   rU   rV   rW   rX   rY   rZ   r[   r   texts
is_batchedvoice_samples_listall_encodings
text_inputvoice_inputencodingbatch_encodingr   r   r   __call__   s.   .-	zVibeVoiceProcessor.__call__c                 C   s  d}t |tr+|drtj|r| |}n|dr)tj|r)| |}n|}|du r6td| | 	|}t
tdd |D }| j| j}|r^| |dt| \}}}	ng g g }}}	|| }
dgt| |	 }|
| jjddd	7 }
|dgt| jjddd	 7 }|D ] \}}| jjd
| d| ddd	}|
|7 }
|dgt| 7 }q|
| jjddd	| jjg 7 }
|dgt| jjddd	d  7 }|
|r|nd|||dS )z Process a single podcast script.Nz.jsonz.txtzCould not process input text: c                 s   s    | ]\}}|V  qd S Nr   ).0
speaker_id_r   r   r   	<genexpr>      z5VibeVoiceProcessor._process_single.<locals>.<genexpr>Fz Text input:
add_special_tokens	 Speaker :
z Speech output:
r   )	input_idsspeech_inputsspeech_input_maskparsed_scriptall_speakers)r]   r^   endswithr.   r3   r5   _convert_json_to_script_convert_text_to_scriptr@   _parse_scriptr_   setr   encoder   _create_voice_promptr`   speech_start_id)r   rU   rV   scriptparsed_linesr   system_tokensvoice_tokensvoice_speech_inputsvoice_speech_masksfull_tokensr}   rr   speaker_textspeaker_text_tokensr   r   r   rd      s>   

"
z"VibeVoiceProcessor._process_single	encodingsc                 C   s  dd |D }dd |D }t |tr|rtjntj}	nt |tr&t|}	n|}	|	tjkr|	tjkr<tdd |D }
n|	tjkrH|durH|}
n	tdd |D }
g }g }g }t||D ]I\}}|rtt	||
krt|d|
 }|d|
 }|
t	| }| j
jg| | }dg| d	gt	|  }d
g| | }|| || || q\|}|}n|rdd |D nd}g }d
}|D ]}|d dur||d  d}qt }|durtj|tjd|d< |r|durtj|tjd|d< tj|tjd|d< n||d< |r
|dur
||d< ||d< |r%| j||d}|d |d< |d |d< nd|d< d|d< dd |D |d< dd |D |d< |S )z5Combine multiple encodings into a batch with padding.c                 S      g | ]}|d  qS )r{   r   rq   encr   r   r   
<listcomp>=      z4VibeVoiceProcessor._batch_encode.<locals>.<listcomp>c                 S   r   )r}   r   r   r   r   r   r   >  r   c                 s       | ]}t |V  qd S rp   r`   rq   idsr   r   r   rt   K  ru   z3VibeVoiceProcessor._batch_encode.<locals>.<genexpr>Nc                 s   r   rp   r   r   r   r   r   rt   O  ru   r   r   Fc                 S   s   g | ]	}d gt | qS )r   r   r   r   r   r   r   k  s    r|   Tdtyper{   attention_maskr}   )rZ   padded_speechesspeech_tensorsspeech_masksc                 S   r   )r~   r   r   r   r   r   r     r   parsed_scriptsc                 S   r   )r   r   r   r   r   r   r     r   all_speakers_list)r]   boolr	   LONGEST
DO_NOT_PADr^   max
MAX_LENGTHrc   r`   r   pad_idre   extendr   torchtensorlongprepare_speech_inputs)r   r   rW   rX   rY   rZ   r[   input_ids_listspeech_input_masks_listpadding_strategymax_lenpadded_input_idsattention_maskspadded_speech_input_masksr{   speech_maskpadding_length
padded_idsr   padded_speech_maskall_speech_inputs
has_speechr   rn   speech_dictr   r   r   rf   2  sv   






z VibeVoiceProcessor._batch_encodespeaker_samplesc                 C   s  | j j}| j jddd}g }dgt| }t|D ]\}}| j jd| ddd}t|tr5| j|}	n4t|t	rad|v rIt
j|d t
jd}	n d|v rXt
j|d t
jd}	ntd	|  t
j|t
jd}	| jrt| jrt| |	}	t|	jd
 | j }
|| j jg |g|
  | j jg | j jddd }dgt| dg dg|
  dg dg }|| || ||	 q|||fS )z
        Create voice prompt tokens and process audio samples.
        
        Returns:
            tuple: (voice_tokens, voice_speech_inputs, voice_speech_masks)
        z Voice input:
Frv   rx   ry   arrayr   audioz>Dictionary audio input must have 'array' or 'audio' key, got: r   rz   T)r   speech_diffusion_idr   r`   	enumerater]   r^   r   _load_audio_from_pathdictra   r   float32r@   keysr   r   mathceilshaper   r   speech_end_idr   re   )r   r   vae_token_idvoice_full_tokensr   r   rr   speaker_audioprefix_tokenswavvae_tok_lenspeaker_tokensvae_input_maskr   r   r   r     sP   






z'VibeVoiceProcessor._create_voice_promptr|   devicer   c                    s$  |sdddS  fdd|D }t dd |D }|d jdkr.tjt||fdtjd	}ntjt|||d jd
 fdtjd	}tjt|t |ftjd}t	t
||D ]\}	\}
}|
||	dt|
f< d||	d|f< qV||d}|dkrtj|||ptjd|d< tj||tjd|d< |S )ah  
        Prepare speech inputs for model consumption.
        
        Args:
            speech_inputs: List of speech arrays
            return_tensors: Output tensor type
            device: Device to place tensors on
            dtype: Data type for tensors
            
        Returns:
            Dictionary with padded_speeches and speech_masks
        N)r   r   c                    s"   g | ]}t |jd   j qS )r   )r   r   r   r   rq   sr   r   r   r     s   " z<VibeVoiceProcessor.prepare_speech_inputs.<locals>.<listcomp>c                 s   s    | ]}|j d  V  qdS )r   N)r   r   r   r   r   rt     s    z;VibeVoiceProcessor.prepare_speech_inputs.<locals>.<genexpr>r   r   )
fill_valuer   r   Tpt)r   r   r   r   )r   ndimra   fullr`   r   r   zerosbool_r   rc   r   r   r   )r   r|   rZ   r   r   vae_tok_seqlensmax_speech_lengthr   r   ispeechvae_tok_lengthresultr   r   r   r     s$   
&z(VibeVoiceProcessor.prepare_speech_inputs	json_filec           
   
   C   s   ddl }t|ddd}||}W d   n1 sw   Y  t|ts)tdg }|D ]W}t|ts=td|  q-|	d}|	d	}|du sO|du rXtd
|  q-zt
|}	W n ttfyr   td| d Y q-w | }|r|d|	 d|  q-|stdd|S )z
        Convert JSON format to script format.
        Expected JSON format:
        [
            {"speaker": "1", "text": "Hello everyone..."},
            {"speaker": "2", "text": "Great to be here..."}
        ]
        r   Nr#   utf-8rm   z0JSON file must contain a list of speaker entrieszSkipping non-dict entry: speakerrU   z(Skipping entry missing speaker or text: zInvalid speaker ID: z, skipping entrySpeaker r$   z#No valid entries found in JSON filerz   )r/   r6   r7   r]   r_   r@   r   r9   r:   r;   int	TypeErrorstripre   r4   )
r   r   r/   rE   datascript_linesitemr   rU   rr   r   r   r   r     s:   	




z*VibeVoiceProcessor._convert_json_to_script	text_filec           
      C   s   t |ddd}| }W d   n1 sw   Y  g }d}|D ]:}| }|s*q!td|tj}|rPt|d}|d }	|	rO|d| d	|	  q!|d| d	|  q!|sbt	d
d
|S )z
        Convert text file to script format.
        Handles multiple formats:
        1. Already formatted as "Speaker X: text"
        2. Plain text (assigns to Speaker 1)
        
        Handles edge cases like multiple colons in a line.
        r#   r   r   Nr   ^Speaker\s+(\d+)\s*:\s*(.*)$rN   r   r$   z#No valid content found in text filerz   )r6   	readlinesr   rematch
IGNORECASEr   groupre   r@   r4   )
r   r   rE   linesr   current_speakerlinespeaker_matchrr   rU   r   r   r   r   6  s(   	

z*VibeVoiceProcessor._convert_text_to_scriptr   c                 C   s   |  d}g }g }|D ]9}|  sqtd|  tj}|r=t|d}d|d   }|||f || qt	d| d q|sMt
dt|}	|	d	krig }
|D ]\}}|
|d |f qY|
S |S )
z4Parse script into list of (speaker_id, text) tuples.rz   r   r    rN   zCould not parse line: ''z&No valid speaker lines found in scriptr   )r   splitr   r   r   r   r   re   r9   r:   r@   min)r   r   r   r   speaker_idsr   r   rr   rU   min_speaker_idnormalized_linesr   r   r   r   \  s,   z VibeVoiceProcessor._parse_scripttext_inputsaudio_inputsc                 C   s4   t |}d|v r|d |d< d|v r|d |d< |S )z8Merge text and audio inputs into a single BatchEncoding.r   r|   	streaming)r   )r   r   r   mergedr   r   r   _merge_inputs  s   z VibeVoiceProcessor._merge_inputsc                 O      | j j|i |S )z
        This method forwards all its arguments to VibeVoiceTextTokenizer's [`~PreTrainedTokenizer.batch_decode`].
        Please refer to the docstring of this method for more information.
        )r   batch_decoder   argsr   r   r   r   r       zVibeVoiceProcessor.batch_decodec                 O   r  )z
        This method forwards all its arguments to VibeVoiceTextTokenizer's [`~PreTrainedTokenizer.decode`].
        Please refer to the docstring of this method for more information.
        )r   decoder  r   r   r   r	    r  zVibeVoiceProcessor.decodec                 C   s*   | j j}| jj}tt|| ddg S )zB
        Return the list of inputs accepted by the model.
        r|   r}   )r   model_input_namesr   r_   r   fromkeys)r   tokenizer_input_namesaudio_processor_input_namesr   r   r   r
    s   z$VibeVoiceProcessor.model_input_names
output.wavaudio_r   output_pathr'   	normalizebatch_prefixc                 C   s   | j j|||||dS )a  
        Save audio data to a file.
        Args:
            audio (Union[torch.Tensor, np.ndarray, List[Union[torch.Tensor, np.ndarray]]]):
                The audio data to save. Can be a single tensor/array or a list of them.
            output_path (str, optional): Path to save the audio file. Defaults to "output.wav".
            sampling_rate (int, optional): Sampling rate for the audio. If None, uses the processor's default.
            normalize (bool, optional): Whether to normalize the audio before saving. Defaults to False.
            batch_prefix (str, optional): Prefix for batch audio files. Defaults to "audio_".
        Returns:
            str: The path to the saved audio file.
        )r  r'   r  r  )r   
save_audio)r   r   r  r'   r  r  r   r   r   r    s   zVibeVoiceProcessor.save_audio)NNr   T)NNTFNNTrp   )TFNNT)NNN)r  NFr  )-__name__
__module____qualname____doc__r   classmethodr?   r   r^   r.   PathLikerT   r   r   r   r
   ra   rb   r   r	   r   r   r   r   ro   r   r   rd   rf   r   r   r   r   r   r   r   r   r   r  r  r	  propertyr
  Tensorr  r   r   r   r   r      s    

U$".

V


?
d
B

2/&%

 r   )r   warningstypingr   r   r   r   r   r   r.   r   numpyra   r   $transformers.tokenization_utils_baser   r	   r
   r   r   r0   r   r   r1   r   
get_loggerr  r9   r   __all__r   r   r   r   <module>   s&     
     /