o
    iXs                     @   s   d dl Z d dlZd dlmZmZmZmZmZmZ d dl	Z	d dl
Z
d dlZd dlZd dlmZmZmZmZmZ d dlmZmZ ddlmZ eeZG dd dZdgZdS )	    N)ListOptionalUnionDictAnyTuple)BatchEncodingPaddingStrategyPreTokenizedInput	TextInputTruncationStrategy)
TensorTypelogging   )AudioNormalizerc                   @   s  e Zd ZdZd@ddZedd Zd	eee	j
f fd
dZ							dAdeeeee eeee ee f  deeeeeejf  eeeeejf   f  deeeef deeeef dee deeeef  dedefddZ	dBdeeef deeeeejf   deeef fddZ					dCdeeeef  deeeef deeeef dee deeeef  dedefddZdeeeejf  deee eej ee f fddZ			dDdeej deeeef  d eeeej f  d!eej! deeef f
d"d#Z"d$edefd%d&Z#d'edefd(d)Z$d*edeeeef  fd+d,Z%d-ed.edefd/d0Z&d1d2 Z'd3d4 Z(e)d5d6 Z*	7			8dEd9eej+ejeeej+ejf  f d:ed;ee d<ed=edefd>d?Z,dS )FVibeVoiceProcessoraW  
    Constructs a VibeVoice processor which wraps a VibeVoice tokenizer and audio processor into a single processor.

    [`VibeVoiceProcessor`] offers all the functionalities of [`VibeVoiceTokenizer`] and [`VibeVoiceTokenizerProcessor`]. 
    See the [`~VibeVoiceProcessor.__call__`] and [`~VibeVoiceProcessor.decode`] for more information.

    Args:
        tokenizer (`VibeVoiceTextTokenizer` or `VibeVoiceTextTokenizerFast`):
            The tokenizer for text processing.
        audio_processor (`VibeVoiceTokenizerProcessor`):
            The audio processor for speech processing.
        speech_tok_compress_ratio (`int`, *optional*, defaults to 3200):
            The compression ratio for speech tokenization.
        db_normalize (`bool`, *optional*, defaults to True):
            Whether to apply decibel normalization to audio inputs.
    N  Tc                 K   s2   || _ || _|| _|| _|rt nd | _d| _d S )Nz~ Transform the text provided by various speakers into speech output, utilizing the distinct voice of each respective speaker.
)	tokenizeraudio_processorspeech_tok_compress_ratiodb_normalizer   audio_normalizersystem_prompt)selfr   r   r   r   kwargs r   P/home/ubuntu/VibeVoice-finetuning/src/vibevoice/processor/vibevoice_processor.py__init__#   s   
zVibeVoiceProcessor.__init__c                 K   s`  ddl }ddl}ddlm} ddlm}m} |j|d}|j	|r>t
|d}	||	}
W d   n1 s8w   Y  ntd| d	 d
dd}
|
dd
}|
dd}|
ddpc|dd}td|  d| v r||j|fi |}ntd| dd|
v r|
d }||dd|dd|dd|ddd}n| }| ||||dS )a  
        Instantiate a VibeVoiceProcessor from a pretrained VibeVoice processor.

        Args:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                This can be either:
                - a string, the *model id* of a pretrained model
                - a path to a *directory* containing processor config

        Returns:
            [`VibeVoiceProcessor`]: The processor object instantiated from pretrained model.
        r   Nr   )VibeVoiceTokenizerProcessor)VibeVoiceTextTokenizerVibeVoiceTextTokenizerFastpreprocessor_config.jsonrz%No preprocessor_config.json found at z, using defaultsr   T)r   r   r   r   language_model_pretrained_namezQwen/Qwen2.5-1.5BzLoading tokenizer from qwenzUnsupported tokenizer type for z&. Supported types: Qwen, Llama, Gemma.r   sampling_rate]  normalize_audiotarget_dB_FSepsư>)r%   r'   r(   r*   )r   r   r   r   )osjsonvibevoice_tokenizer_processorr   2vibevoice.modular.modular_vibevoice_text_tokenizerr   r    pathjoinexistsopenloadloggerwarninggetpopinfolowerfrom_pretrained
ValueError)clspretrained_model_name_or_pathr   r,   r-   r   r   r    config_pathfconfigr   r   r#   r   audio_configr   r   r   r   r;   +   sP   



z"VibeVoiceProcessor.from_pretrainedsave_directoryc                 K   s   ddl }ddl}|j|dd d| j| jdt| jddt| jd	dt| jd
dt| jdddd}|j|d}t	|d}|j
||dd W d   n1 sQw   Y  td|  dS )a  
        Save a processor to a directory, so that it can be re-loaded using the
        [`~VibeVoiceProcessor.from_pretrained`] class method.

        Args:
            save_directory (`str` or `os.PathLike`):
                Directory where the processor will be saved.
        r   NT)exist_okr   r   r%   r&   r'   r(   r)   r*   r+   )feature_extractor_typer%   r'   r(   r*   )processor_classr   r   r   r!   w   )indentz!Processor configuration saved in )r,   r-   makedirsr   r   getattrr   r0   r1   r3   dumpr5   r9   )r   rC   r   r,   r-   processor_configr?   r@   r   r   r   save_pretrainedr   s$   	z"VibeVoiceProcessor.save_pretrainedFtextvoice_samplespadding
truncation
max_lengthreturn_tensorsreturn_attention_maskreturnc                 K   s   t |tst |trt|dkrt |d ts|g}	d}
n|}	d}
|dur8|
r1t |d ttjfr5|g}n
|}ndgt|	 }g }t|	|D ]\}}| ||}|| qF| j	||||||d}|S )a  
        Main method to process one or more podcast scripts with optional voice samples.

        Args:
            text (`str`, `List[str]`):
                The input text(s) to process. Can be:
                - A single script string
                - A list of script strings for batch processing
                - A path to a .json or .txt file
                - A list of paths
            voice_samples (`List[Union[str, np.ndarray]]`, `List[List[Union[str, np.ndarray]]]`, *optional*):
                Voice samples for each script. Can be:
                - A list of samples for a single script
                - A list of lists for batch processing
            padding (`bool`, `str` or `PaddingStrategy`, defaults to `True`):
                Whether to pad sequences to the same length
            truncation (`bool`, `str` or `TruncationStrategy`, defaults to `False`):
                Whether to truncate sequences
            max_length (`int`, *optional*):
                Maximum length of the returned sequences
            return_tensors (`str` or `TensorType`, *optional*):
                If set, will return tensors of a particular framework
            return_attention_mask (`bool`, defaults to `True`):
                Whether to return the attention mask

        Returns:
            `BatchEncoding`: A BatchEncoding with the following fields:
                - **input_ids** -- List of token id sequences or tensor
                - **attention_mask** -- List of attention masks or tensor
                - **speech_tensors** -- Padded speech inputs (if voice_samples provided)
                - **speech_masks** -- Speech masks (if voice_samples provided)
                - **speech_input_mask** -- Boolean masks indicating speech token positions
        r   FTN)rQ   rR   rS   rT   rU   )

isinstancestrlistlennpndarrayzip_process_singleappend_batch_encode)r   rO   rP   rQ   rR   rS   rT   rU   r   texts
is_batchedvoice_samples_listall_encodings
text_inputvoice_inputencodingbatch_encodingr   r   r   __call__   s.   .-	zVibeVoiceProcessor.__call__c                 C   s  d}t |tr+|drtj|r| |}n|dr)tj|r)| |}n|}|du r6td| | 	|}t
tdd |D }| j| j}|r^| |dt| \}}}	ng g g }}}	|| }
dgt| |	 }|
| jjddd	7 }
|dgt| jjddd	 7 }|D ] \}}| jjd
| d| ddd	}|
|7 }
|dgt| 7 }q|
| jjddd	| jjg 7 }
|dgt| jjddd	d  7 }|
|r|nd|||dS )z Process a single podcast script.Nz.jsonz.txtzCould not process input text: c                 s   s    | ]\}}|V  qd S Nr   ).0
speaker_id_r   r   r   	<genexpr>       z5VibeVoiceProcessor._process_single.<locals>.<genexpr>Fz Text input:
add_special_tokens	 Speaker :
z Speech output:
r   )	input_idsspeech_inputsspeech_input_maskparsed_scriptall_speakers)rW   rX   endswithr,   r0   r2   _convert_json_to_script_convert_text_to_scriptr<   _parse_scriptrY   setr   encoder   _create_voice_promptrZ   speech_start_id)r   rO   rP   scriptparsed_linesry   system_tokensvoice_tokensvoice_speech_inputsvoice_speech_masksfull_tokensrw   rl   speaker_textspeaker_text_tokensr   r   r   r^      s>   

"
z"VibeVoiceProcessor._process_single	encodingsc                 C   s  dd |D }dd |D }t |tr|rtjntj}	nt |tr&t|}	n|}	|	tjkr|	tjkr<tdd |D }
n|	tjkrH|durH|}
n	tdd |D }
g }g }g }t||D ]I\}}|rtt	||
krt|d|
 }|d|
 }|
t	| }| j
jg| | }dg| d	gt	|  }d
g| | }|| || || q\|}|}n|rdd |D nd}g }d
}|D ]}|d dur||d  d}qt }|durtj|tjd|d< |r|durtj|tjd|d< tj|tjd|d< n||d< |r
|dur
||d< ||d< |r%| j||d}|d |d< |d |d< nd|d< d|d< dd |D |d< dd |D |d< |S )z5Combine multiple encodings into a batch with padding.c                 S      g | ]}|d  qS )ru   r   rk   encr   r   r   
<listcomp>.      z4VibeVoiceProcessor._batch_encode.<locals>.<listcomp>c                 S   r   )rw   r   r   r   r   r   r   /  r   c                 s       | ]}t |V  qd S rj   rZ   rk   idsr   r   r   rn   <  ro   z3VibeVoiceProcessor._batch_encode.<locals>.<genexpr>Nc                 s   r   rj   r   r   r   r   r   rn   @  ro   r   r   Fc                 S   s   g | ]	}d gt | qS )r   r   r   r   r   r   r   \  s    rv   Tdtyperu   attention_maskrw   )rT   padded_speechesspeech_tensorsspeech_masksc                 S   r   )rx   r   r   r   r   r   r     r   parsed_scriptsc                 S   r   )ry   r   r   r   r   r   r     r   all_speakers_list)rW   boolr	   LONGEST
DO_NOT_PADrX   max
MAX_LENGTHr]   rZ   r   pad_idr_   extendr   torchtensorlongprepare_speech_inputs)r   r   rQ   rR   rS   rT   rU   input_ids_listspeech_input_masks_listpadding_strategymax_lenpadded_input_idsattention_maskspadded_speech_input_masksru   speech_maskpadding_length
padded_idsr   padded_speech_maskall_speech_inputs
has_speechr   rh   speech_dictr   r   r   r`   #  sv   






z VibeVoiceProcessor._batch_encodespeaker_samplesc                 C   s,  | j j}| j jddd}g }dgt| }t|D ]w\}}| j jd| ddd}t|tr5| j|}	nt	j
|t	jd}	| jrH| jrH| |	}	t|	jd | j }
|| j jg |g|
  | j jg | j jddd }dgt| dg d	g|
  dg dg }|| || ||	 q|||fS )
z
        Create voice prompt tokens and process audio samples.
        
        Returns:
            tuple: (voice_tokens, voice_speech_inputs, voice_speech_masks)
        z Voice input:
Frp   rr   rs   r   r   rt   T)r   speech_diffusion_idr   rZ   	enumeraterW   rX   r   _load_audio_from_pathr[   arrayfloat32r   r   mathceilshaper   r   speech_end_idr   r_   )r   r   vae_token_idvoice_full_tokensr   r   rl   speaker_audioprefix_tokenswavvae_tok_lenspeaker_tokensvae_input_maskr   r   r   r     sD   





z'VibeVoiceProcessor._create_voice_promptrv   devicer   c                    s$  |sdddS  fdd|D }t dd |D }|d jdkr.tjt||fdtjd	}ntjt|||d jd
 fdtjd	}tjt|t |ftjd}t	t
||D ]\}	\}
}|
||	dt|
f< d||	d|f< qV||d}|dkrtj|||ptjd|d< tj||tjd|d< |S )ah  
        Prepare speech inputs for model consumption.
        
        Args:
            speech_inputs: List of speech arrays
            return_tensors: Output tensor type
            device: Device to place tensors on
            dtype: Data type for tensors
            
        Returns:
            Dictionary with padded_speeches and speech_masks
        N)r   r   c                    s"   g | ]}t |jd   j qS )r   )r   r   r   r   rk   sr   r   r   r     s   " z<VibeVoiceProcessor.prepare_speech_inputs.<locals>.<listcomp>c                 s   s    | ]}|j d  V  qdS )r   N)r   r   r   r   r   rn     s    z;VibeVoiceProcessor.prepare_speech_inputs.<locals>.<genexpr>r   r   )
fill_valuer   r   Tpt)r   r   r   r   )r   ndimr[   fullrZ   r   r   zerosbool_r   r]   r   r   r   )r   rv   rT   r   r   vae_tok_seqlensmax_speech_lengthr   r   ispeechvae_tok_lengthresultr   r   r   r     s$   
&z(VibeVoiceProcessor.prepare_speech_inputs	json_filec           
   
   C   s   ddl }t|ddd}||}W d   n1 sw   Y  t|ts)tdg }|D ]W}t|ts=td|  q-|	d}|	d	}|du sO|du rXtd
|  q-zt
|}	W n ttfyr   td| d Y q-w | }|r|d|	 d|  q-|stdd|S )z
        Convert JSON format to script format.
        Expected JSON format:
        [
            {"speaker": "1", "text": "Hello everyone..."},
            {"speaker": "2", "text": "Great to be here..."}
        ]
        r   Nr"   utf-8rg   z0JSON file must contain a list of speaker entrieszSkipping non-dict entry: speakerrO   z(Skipping entry missing speaker or text: zInvalid speaker ID: z, skipping entrySpeaker : z#No valid entries found in JSON filert   )r-   r3   r4   rW   rY   r<   dictr5   r6   r7   int	TypeErrorstripr_   r1   )
r   r   r-   r@   datascript_linesitemr   rO   rl   r   r   r   r{     s:   	




z*VibeVoiceProcessor._convert_json_to_script	text_filec           
      C   s   t |ddd}| }W d   n1 sw   Y  g }d}|D ]:}| }|s*q!td|tj}|rPt|d}|d }	|	rO|d| d	|	  q!|d| d	|  q!|sbt	d
d
|S )z
        Convert text file to script format.
        Handles multiple formats:
        1. Already formatted as "Speaker X: text"
        2. Plain text (assigns to Speaker 1)
        
        Handles edge cases like multiple colons in a line.
        r"   r   r   Nr   ^Speaker\s+(\d+)\s*:\s*(.*)$rH   r   r   z#No valid content found in text filert   )r3   	readlinesr   rematch
IGNORECASEr   groupr_   r<   r1   )
r   r   r@   linesr   current_speakerlinespeaker_matchrl   rO   r   r   r   r|     s(   	

z*VibeVoiceProcessor._convert_text_to_scriptr   c                 C   s   |  d}g }g }|D ]9}|  sqtd|  tj}|r=t|d}d|d   }|||f || qt	d| d q|sVt
dt|d	d
  t|}	|	dkrrg }
|D ]\}}|
|d |f qb|
S |S )z4Parse script into list of (speaker_id, text) tuples.rt   r   r    rH   zCould not parse line: ''z3No valid speaker lines found in script. Input was: N   r   )r   splitr   r   r   r   r   r_   r5   r6   r<   reprmin)r   r   r   r   speaker_idsr   r   rl   rO   min_speaker_idnormalized_linesr   r   r   r}   E  s,   z VibeVoiceProcessor._parse_scripttext_inputsaudio_inputsc                 C   s4   t |}d|v r|d |d< d|v r|d |d< |S )z8Merge text and audio inputs into a single BatchEncoding.audiorv   	streaming)r   )r   r   r   mergedr   r   r   _merge_inputsj  s   z VibeVoiceProcessor._merge_inputsc                 O      | j j|i |S )z
        This method forwards all its arguments to VibeVoiceTextTokenizer's [`~PreTrainedTokenizer.batch_decode`].
        Please refer to the docstring of this method for more information.
        )r   batch_decoder   argsr   r   r   r   r  w     zVibeVoiceProcessor.batch_decodec                 O   r   )z
        This method forwards all its arguments to VibeVoiceTextTokenizer's [`~PreTrainedTokenizer.decode`].
        Please refer to the docstring of this method for more information.
        )r   decoder  r   r   r   r  ~  r  zVibeVoiceProcessor.decodec                 C   s*   | j j}| jj}tt|| ddg S )zB
        Return the list of inputs accepted by the model.
        rv   rw   )r   model_input_namesr   rY   r   fromkeys)r   tokenizer_input_namesaudio_processor_input_namesr   r   r   r    s   z$VibeVoiceProcessor.model_input_names
output.wavaudio_r   output_pathr%   	normalizebatch_prefixc                 C   s   | j j|||||dS )a  
        Save audio data to a file.
        Args:
            audio (Union[torch.Tensor, np.ndarray, List[Union[torch.Tensor, np.ndarray]]]):
                The audio data to save. Can be a single tensor/array or a list of them.
            output_path (str, optional): Path to save the audio file. Defaults to "output.wav".
            sampling_rate (int, optional): Sampling rate for the audio. If None, uses the processor's default.
            normalize (bool, optional): Whether to normalize the audio before saving. Defaults to False.
            batch_prefix (str, optional): Prefix for batch audio files. Defaults to "audio_".
        Returns:
            str: The path to the saved audio file.
        )r  r%   r  r  )r   
save_audio)r   r   r  r%   r  r  r   r   r   r    s   zVibeVoiceProcessor.save_audio)NNr   T)NNTFNNTrj   )TFNNT)NNN)r
  NFr  )-__name__
__module____qualname____doc__r   classmethodr;   r   rX   r,   PathLikerN   r   r   r   r
   r[   r\   r   r	   r   r   r   r   ri   r   r   r^   r`   r   r   r   r   r   r   r{   r|   r}   r   r  r  propertyr  Tensorr  r   r   r   r   r      s    

F$".

V


?
d
:

2/&%

 r   )r   warningstypingr   r   r   r   r   r   r,   r   numpyr[   r   $transformers.tokenization_utils_baser   r	   r
   r   r   transformers.utilsr   r   r.   r   
get_loggerr  r5   r   __all__r   r   r   r   <module>   s&     
     