o
    ê±i<[  ã                   @   sÜ   d Z ddlZddlZddlZddlZddlmZmZmZm	Z	m
Z
mZ ddlZddlZddlmZ ddlmZmZ ddlmZmZ z
ddlmZ d	ZW n eyZ   d
Ze d¡ Y nw e e¡ZdZG dd„ dƒZ dgZ!dS )z+
Processor class for VibeVoice ASR models.
é    N)ÚListÚOptionalÚUnionÚDictÚAnyÚTuple)ÚBatchEncoding)Ú
TensorTypeÚloggingé   )ÚVibeVoiceTokenizerProcessorÚAudioNormalizer)Úload_audio_use_ffmpegTFzHaudio_utils not available, will fall back to soundfile for audio loadingzYYou are a helpful assistant that transcribes audio input into text output in JSON format.c                   @   s˜  e Zd ZdZ					d*dd„Zdd	„ Zed
d„ ƒZdee	e
jf fdd„Z									d+deee	ejejeee	ejejf  f  dee deee	ef  dedee dedededee	 defdd„Z				d,dee	ejejf dee dededee	 dee	ef fdd„Z				d-deee	ef  dedee dedee	 defdd „Zd!d"„ Zd#d$„ Zd%e	deee	ef  fd&d'„Zed(d)„ ƒZ dS ).ÚVibeVoiceASRProcessoraK  
    Processor for VibeVoice ASR (Automatic Speech Recognition) models.
    
    This processor handles audio preprocessing and tokenization for ASR tasks,
    following the exact format used in training with proper chat templates.
    
    Args:
        tokenizer: The text tokenizer for processing text
        audio_processor: The audio processor for processing speech
        speech_tok_compress_ratio (int): Compression ratio for speech tokenization
        target_sample_rate (int): Target sample rate for audio
        normalize_audio (bool): Whether to normalize audio input
    Né@  éÀ]  Tc                 K   sJ   || _ |p
t||d| _|| _|| _|| _|rtƒ | _nd | _|  ¡  d S )N)Úsampling_rateÚnormalize_audio)	Ú	tokenizerr   Úaudio_processorÚspeech_tok_compress_ratioÚtarget_sample_rater   r   Úaudio_normalizerÚ_cache_special_tokens)Úselfr   r   r   r   r   Úkwargs© r   úO/home/ubuntu/vibevoice-community/vibevoice/processor/vibevoice_asr_processor.pyÚ__init__-   s   	þ
zVibeVoiceASRProcessor.__init__c                 C   s¸   t | jdƒr| jj| _n| j d¡| _t | jdƒr| jj| _n| j d¡| _t | jdƒr2| jj| _n| j d¡| _t | jdƒrF| jj| _d
S t | jdƒrS| jj| _d
S | j d	¡| _d
S )z'Cache special token IDs for efficiency.Úspeech_start_idz<|speech_start|>Úspeech_end_idz<|speech_end|>Úspeech_pad_idz<|speech_pad|>Úpad_idÚpad_token_idz<|endoftext|>N)Úhasattrr   r   Úconvert_tokens_to_idsr    r!   r"   r#   ©r   r   r   r   r   G   s   z+VibeVoiceASRProcessor._cache_special_tokensc              
   K   sª  ddl }ddlm} ddlm} tj |d¡}i }tj |¡r:t	|dƒ}| 
|¡}W d  ƒ n1 s4w   Y  nEz%||dfi |¤Ž}	t	|	dƒ}| 
|¡}W d  ƒ n1 sYw   Y  W n ty~ }
 zt d|
› ¡ t d¡ W Y d}
~
nd}
~
ww | d	d
¡}| dd¡}| dd¡}| dd¡pœ| dd¡}t d|› ¡ d| ¡ v rµ|j|fi |¤Ž}ntd|› ƒ‚t||| dd¡| dd¡d}| |||||dS )a&  
        Load processor from a pretrained model path.
        
        Args:
            pretrained_model_name_or_path: Path to the pretrained model
            **kwargs: Additional keyword arguments
            
        Returns:
            VibeVoiceASRProcessor: The loaded processor
        r   N)Úcached_file)ÚVibeVoiceASRTextTokenizerFastúpreprocessor_config.jsonÚrz)Could not load preprocessor_config.json: zUsing default configurationr   i€  r   r   r   TÚlanguage_model_pretrained_namezQwen/Qwen2.5-1.5BzLoading tokenizer from ÚqwenzUnsupported tokenizer type for Útarget_dB_FSéçÿÿÿÚepsçíµ ÷Æ°>)r   r   r-   r/   )r   r   r   r   r   )ÚjsonÚtransformers.utilsr'   Ú2vibevoice.modular.modular_vibevoice_text_tokenizerr(   ÚosÚpathÚjoinÚexistsÚopenÚloadÚ	ExceptionÚloggerÚwarningÚgetÚpopÚinfoÚlowerÚfrom_pretrainedÚ
ValueErrorr   )ÚclsÚpretrained_model_name_or_pathr   r1   r'   r(   Úconfig_pathÚconfigÚfÚconfig_fileÚer   r   r   r+   r   r   r   r   r   rA   `   sf   ÿ€þýÿ€€þÿþ

üûz%VibeVoiceASRProcessor.from_pretrainedÚsave_directoryc                 K   sŠ   ddl }tj|dd d| j| j| jdddœ}tj |d	¡}t|d
ƒ}|j	||dd W d  ƒ n1 s6w   Y  t
 d|› ¡ dS )zÈ
        Save processor configuration to a directory.
        
        Args:
            save_directory: Directory to save the configuration
            **kwargs: Additional keyword arguments
        r   NT)Úexist_okr   r.   r0   )Úprocessor_classr   r   r   r-   r/   r)   Úwé   )Úindentz!Processor configuration saved in )r1   r4   Úmakedirsr   r   r   r5   r6   r8   Údumpr;   r?   )r   rJ   r   r1   Úprocessor_configrE   rG   r   r   r   Úsave_pretrained¥   s   ú	ÿz%VibeVoiceASRProcessor.save_pretrainedFÚaudior   Úreturn_tensorsÚpaddingÚ
max_lengthÚ
truncationÚadd_generation_promptÚuse_streamingÚcontext_infoÚreturnc
                 K   sr   |du rt dƒ‚t|tƒrd}|}nd}|g}g }|D ]}| j|||||	d}| |¡ q| j|||||d}|S )a  
        Process audio input for ASR model.
        
        Args:
            audio: Audio input(s). Can be:
                - str: Path to audio file
                - np.ndarray: Audio array
                - torch.Tensor: Audio tensor
                - List of the above for batch processing
            sampling_rate: Sampling rate of input audio
            return_tensors: Output format ('pt' for PyTorch, 'np' for NumPy)
            padding: Whether to pad batch inputs
            max_length: Maximum sequence length
            truncation: Whether to truncate long sequences
            add_generation_prompt: Whether to add generation prompt for inference
            use_streaming: Whether to use streaming mode (True by default, auto False if <60s)
            context_info: Optional context information (e.g., hotwords, metadata) to help transcription
            
        Returns:
            BatchEncoding with:
                - input_ids: Token IDs for the model
                - attention_mask: Attention mask
                - acoustic_input_mask: Mask indicating speech token positions
                - speech_tensors: Processed speech features
                - speech_masks: Valid speech masks
                - vae_tok_seqlens: Length of each speech segment in tokens
        Nz*Audio input is required for ASR processingTF)r   rY   rZ   r[   )rV   rW   rX   rU   )rB   Ú
isinstanceÚlistÚ_process_single_audioÚappendÚ_batch_encode)r   rT   r   rU   rV   rW   rX   rY   rZ   r[   r   Ú
is_batchedÚ
audio_listÚall_encodingsÚaudio_inputÚencodingÚbatch_encodingr   r   r   Ú__call__Á   s2   (
ûûzVibeVoiceASRProcessor.__call__c              
      sz  t |tƒrktrBz
t|dd\}}W nF tyA } z$t d|› ¡ ddl}	|	 |¡\}}|j	dkr7|j
dd}W Y d}~nd}~ww ddl}	|	 |¡\}}|j	dkrX|j
dd}|ˆ jkrjddl}
|
j||ˆ jd}n't |tjƒr| ¡  ¡ }|j	dkr€| ¡ }ntj|tjd	}|j	dkr’| ¡ }| tj¡}ˆ jr£ˆ jr£ˆ  |¡}t|ƒˆ j }|r²|d
k r²d}t t|ƒˆ j ¡}ˆ jjdtdœgdd}ˆ j  |¡}ˆ j !ˆ j"¡}ˆ j !ˆ j#¡}ˆ j !ˆ j$¡}g d¢}|rÿ| %¡ rÿd|d›d| %¡ › dd &|¡ }nd|d›dd &|¡ }d &|g|g|  |g ¡d | }ˆ jjd|dœgdd}|| }‡ fdd„|D ƒ}||||dœS )a¥  
        Process a single audio input.
        
        Args:
            audio: Single audio input
            sampling_rate: Audio sampling rate
            add_generation_prompt: Whether to add generation prompt
            context_info: Optional context information (e.g., hotwords, metadata) to help transcription
            
        Returns:
            Dictionary with processed tokens and audio features
        F)Úresamplez2ffmpeg loading failed, falling back to soundfile: r   Nr   )Úaxis)Úorig_srÚ	target_sr©Údtypeg      N@Úsystem)ÚroleÚcontent)Útokenize)ú
Start timeúEnd timeú
Speaker IDÚContentz
This is a z.2fz! seconds audio, with extra info: z(

Please transcribe it with these keys: z, z6 seconds audio, please transcribe it with these keys: Ú Ú
ÚuserTc                    s   g | ]}|ˆ j krd nd‘qS )r   r   )r!   )Ú.0Útokenr&   r   r   Ú
<listcomp>{  s    z?VibeVoiceASRProcessor._process_single_audio.<locals>.<listcomp>)Ú	input_idsÚacoustic_input_maskÚspeechÚvae_tok_len)'r]   ÚstrÚHAS_FFMPEG_UTILSr   r:   ÚwarningsÚwarnÚ	soundfileÚreadÚndimÚmeanr   Úlibrosari   ÚtorchÚTensorÚcpuÚnumpyÚsqueezeÚnpÚarrayÚfloat32Úastyper   r   ÚlenÚmathÚceilr   r   Úapply_chat_templateÚSYSTEM_PROMPTÚencodeÚconvert_ids_to_tokensr   r!   r    Ústripr6   )r   rT   r   rY   rZ   r[   Úaudio_arrayÚfile_srrI   Úsfr‰   Úaudio_durationr€   Úsystem_prompt_textÚsystem_tokensÚsp_start_tokenÚsp_pad_tokenÚsp_end_tokenÚ	show_keysÚuser_suffixÚuser_input_stringÚuser_tokensÚfull_tokensr~   r   r&   r   r_     sŠ   

€ú

ý€
€


þ$ÿþþ
þüz+VibeVoiceASRProcessor._process_single_audioÚ	encodingsc                 C   s²  dd„ |D ƒ}dd„ |D ƒ}dd„ |D ƒ}dd„ |D ƒ}	|r‡|dur%|}
n	t dd„ |D ƒƒ}
g }g }g }t||ƒD ]H\}}|rQt|ƒ|
krQ|d|
… }|d|
… }|
t|ƒ }| jg| | }d	g| | }d	g| d
gt|ƒ  }| |¡ | |¡ | |¡ q9|}|}ndd„ |D ƒ}t dd„ |D ƒƒ}tjt|ƒ|ftjd}tjt|ƒt |	ƒftd}t	t||	ƒƒD ]\}\}}|||dt|ƒ…f< d||d|…f< q·t
ƒ }|dkrtj|tjd|d< tj|tjd|d< tj|tjd|d< tj|tjd|d< tj|tjd|d< |S t|ƒd
kr|n|d	 |d< t|ƒd
kr$|n|d	 |d< t|ƒd
kr3|n|d	 |d< t|ƒd
krB|n|d	 |d< t|ƒd
krQ|n|d	 |d< |S )a~  
        Combine multiple encodings into a batch.
        
        Args:
            encodings: List of encoded samples
            padding: Whether to pad sequences
            max_length: Maximum sequence length
            truncation: Whether to truncate
            return_tensors: Output format
            
        Returns:
            BatchEncoding with batched data
        c                 S   ó   g | ]}|d  ‘qS )r}   r   ©rz   Úencr   r   r   r|   š  ó    z7VibeVoiceASRProcessor._batch_encode.<locals>.<listcomp>c                 S   rª   )r~   r   r«   r   r   r   r|   ›  r­   c                 S   rª   )r   r   r«   r   r   r   r|   œ  r­   c                 S   rª   )r€   r   r«   r   r   r   r|     r­   Nc                 s   ó    | ]}t |ƒV  qd S ©N©r“   ©rz   Úidsr   r   r   Ú	<genexpr>¤  ó   € z6VibeVoiceASRProcessor._batch_encode.<locals>.<genexpr>r   r   c                 S   s   g | ]	}d gt |ƒ ‘qS )r   r°   r±   r   r   r   r|   ¾  s    c                 s   r®   r¯   r°   )rz   Úsr   r   r   r³   Á  r´   rm   TÚptr}   Úattention_maskr~   Úspeech_tensorsÚspeech_masks)ÚmaxÚzipr“   r"   r`   r   Úzerosr‘   ÚboolÚ	enumerater   rŠ   ÚtensorÚlong)r   r©   rV   rW   rX   rU   Úinput_ids_listÚacoustic_masks_listÚspeech_listÚvae_tok_lensÚtarget_lengthÚpadded_input_idsÚpadded_acoustic_masksÚattention_masksr}   Úacoustic_maskÚpadding_lengthÚ
padded_idsÚpadded_acousticr·   Úmax_speech_lengthÚpadded_speechesr¹   Úir   Úvae_lenrg   r   r   r   ra   „  sZ   


	úz#VibeVoiceASRProcessor._batch_encodec                 O   ó   | j j|i |¤ŽS )zi
        Decode batch of token IDs to text.
        Forwards to tokenizer's batch_decode method.
        )r   Úbatch_decode©r   Úargsr   r   r   r   rÒ   Ü  ó   z"VibeVoiceASRProcessor.batch_decodec                 O   rÑ   )zZ
        Decode token IDs to text.
        Forwards to tokenizer's decode method.
        )r   ÚdecoderÓ   r   r   r   rÖ   ã  rÕ   zVibeVoiceASRProcessor.decodeÚtextc              
   C   sÊ  z¤d|v r|  d¡d }|  d|¡}|||…  ¡ }nH|  d¡}|dkr)|  d¡}|dkrad}|}t|t|ƒƒD ]!}|| dv rE|d	7 }q8|| d
v rY|d	8 }|dkrY|d	 } nq8|||… }n|}t |¡}t|tƒrp|g}g }|D ]-}	t|	tƒr¡i }
ddddddddœ}| ¡ D ]\}}||	v r™|	| |
|< q‹|
r¡| 	|
¡ qt|W S  tj
yÉ } zt d|› ¡ t d|› ¡ g W  Y d}~S d}~w tyä } zt d|› ¡ g W  Y d}~S d}~ww )zý
        Post-process the generated transcription text to extract structured data.
        
        Args:
            text: Generated text from the model
            
        Returns:
            List of dictionaries with transcription segments
        z```jsoné   z```ú[éÿÿÿÿÚ{r   z[{r   z]}Ú
start_timeÚend_timeÚ
speaker_idr×   )rs   ÚStartrt   ÚEndru   ÚSpeakerrv   z)Failed to parse JSON from transcription: z
Raw text: Nz%Error post-processing transcription: )Úfindrš   Úranger“   r1   Úloadsr]   ÚdictÚitemsr`   ÚJSONDecodeErrorr;   r<   Údebugr:   )r   r×   Ú
json_startÚjson_endÚjson_strÚbracket_countrÏ   ÚresultÚcleaned_resultÚitemÚcleaned_itemÚkey_mappingÚkeyÚ
mapped_keyrI   r   r   r   Úpost_process_transcriptionê  sn   



€


ù	€
€€€þz0VibeVoiceASRProcessor.post_process_transcriptionc                 C   s   g d¢S )z0Return the list of inputs accepted by the model.)r}   r·   r~   r¸   r¹   r   r&   r   r   r   Úmodel_input_names7  s   z'VibeVoiceASRProcessor.model_input_names)NNr   r   T)	NNNTNFTTN)NTTN)TNFN)!Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r   ÚclassmethodrA   r   r   r4   ÚPathLikerS   r   r   ÚndarrayrŠ   r‹   r   Úintr	   r½   r   rh   r   r   r_   ra   rÒ   rÖ   rô   Úpropertyrõ   r   r   r   r   r      s¢    
ú
Dö(þýüûúùø	÷
ö
ôMúþýüûú

ù|úþýüûú
ùXMr   )"rù   r4   r1   r”   rƒ   Útypingr   r   r   r   r   r   r   r   rŠ   Ú$transformers.tokenization_utils_baser   r2   r	   r
   Úvibevoice_tokenizer_processorr   r   Úaudio_utilsr   r‚   ÚImportErrorr„   Ú
get_loggerrö   r;   r—   r   Ú__all__r   r   r   r   Ú<module>   s4     þ
    
"