o
    ꁱi<[                     @   s   d Z ddlZddlZddlZddlZddlmZmZmZm	Z	m
Z
mZ ddlZddlZddlmZ ddlmZmZ ddlmZmZ z
ddlmZ d	ZW n eyZ   d
Zed Y nw eeZdZG dd dZ dgZ!dS )z+
Processor class for VibeVoice ASR models.
    N)ListOptionalUnionDictAnyTuple)BatchEncoding)
TensorTypelogging   )VibeVoiceTokenizerProcessorAudioNormalizer)load_audio_use_ffmpegTFzHaudio_utils not available, will fall back to soundfile for audio loadingzYYou are a helpful assistant that transcribes audio input into text output in JSON format.c                   @   s  e Zd ZdZ					d*ddZdd	 Zed
d Zdee	e
jf fddZ									d+deee	ejejeee	ejejf  f  dee deee	ef  dedee dedededee	 defddZ				d,dee	ejejf dee dededee	 dee	ef fddZ				d-deee	ef  dedee dedee	 defdd Zd!d" Zd#d$ Zd%e	deee	ef  fd&d'Zed(d) Z dS ).VibeVoiceASRProcessoraK  
    Processor for VibeVoice ASR (Automatic Speech Recognition) models.
    
    This processor handles audio preprocessing and tokenization for ASR tasks,
    following the exact format used in training with proper chat templates.
    
    Args:
        tokenizer: The text tokenizer for processing text
        audio_processor: The audio processor for processing speech
        speech_tok_compress_ratio (int): Compression ratio for speech tokenization
        target_sample_rate (int): Target sample rate for audio
        normalize_audio (bool): Whether to normalize audio input
    N@  ]  Tc                 K   sJ   || _ |p
t||d| _|| _|| _|| _|rt | _nd | _|   d S )N)sampling_ratenormalize_audio)		tokenizerr   audio_processorspeech_tok_compress_ratiotarget_sample_rater   r   audio_normalizer_cache_special_tokens)selfr   r   r   r   r   kwargs r   O/home/ubuntu/vibevoice-community/vibevoice/processor/vibevoice_asr_processor.py__init__-   s   	
zVibeVoiceASRProcessor.__init__c                 C   s   t | jdr| jj| _n| jd| _t | jdr| jj| _n| jd| _t | jdr2| jj| _n| jd| _t | jdrF| jj| _d
S t | jdrS| jj| _d
S | jd	| _d
S )z'Cache special token IDs for efficiency.speech_start_idz<|speech_start|>speech_end_idz<|speech_end|>speech_pad_idz<|speech_pad|>pad_idpad_token_idz<|endoftext|>N)hasattrr   r   convert_tokens_to_idsr    r!   r"   r#   r   r   r   r   r   G   s   z+VibeVoiceASRProcessor._cache_special_tokensc              
   K   s  ddl }ddlm} ddlm} tj|d}i }tj|r:t	|d}|
|}W d   n1 s4w   Y  nEz%||dfi |}	t	|	d}|
|}W d   n1 sYw   Y  W n ty~ }
 ztd|
  td W Y d}
~
nd}
~
ww |d	d
}|dd}|dd}|ddp|dd}td|  d| v r|j|fi |}ntd| t|||dd|ddd}| |||||dS )a&  
        Load processor from a pretrained model path.
        
        Args:
            pretrained_model_name_or_path: Path to the pretrained model
            **kwargs: Additional keyword arguments
            
        Returns:
            VibeVoiceASRProcessor: The loaded processor
        r   N)cached_file)VibeVoiceASRTextTokenizerFastpreprocessor_config.jsonrz)Could not load preprocessor_config.json: zUsing default configurationr   i  r   r   r   Tlanguage_model_pretrained_namezQwen/Qwen2.5-1.5BzLoading tokenizer from qwenzUnsupported tokenizer type for target_dB_FSepsư>)r   r   r-   r/   )r   r   r   r   r   )jsontransformers.utilsr'   2vibevoice.modular.modular_vibevoice_text_tokenizerr(   ospathjoinexistsopenload	Exceptionloggerwarninggetpopinfolowerfrom_pretrained
ValueErrorr   )clspretrained_model_name_or_pathr   r1   r'   r(   config_pathconfigfconfig_fileer   r   r   r+   r   r   r   r   r   rA   `   sf   

z%VibeVoiceASRProcessor.from_pretrainedsave_directoryc                 K   s   ddl }tj|dd d| j| j| jddd}tj|d	}t|d
}|j	||dd W d   n1 s6w   Y  t
d|  dS )z
        Save processor configuration to a directory.
        
        Args:
            save_directory: Directory to save the configuration
            **kwargs: Additional keyword arguments
        r   NT)exist_okr   r.   r0   )processor_classr   r   r   r-   r/   r)   w   )indentz!Processor configuration saved in )r1   r4   makedirsr   r   r   r5   r6   r8   dumpr;   r?   )r   rJ   r   r1   processor_configrE   rG   r   r   r   save_pretrained   s   	z%VibeVoiceASRProcessor.save_pretrainedFaudior   return_tensorspadding
max_length
truncationadd_generation_promptuse_streamingcontext_inforeturnc
                 K   sr   |du rt dt|trd}|}nd}|g}g }|D ]}| j|||||	d}|| q| j|||||d}|S )a  
        Process audio input for ASR model.
        
        Args:
            audio: Audio input(s). Can be:
                - str: Path to audio file
                - np.ndarray: Audio array
                - torch.Tensor: Audio tensor
                - List of the above for batch processing
            sampling_rate: Sampling rate of input audio
            return_tensors: Output format ('pt' for PyTorch, 'np' for NumPy)
            padding: Whether to pad batch inputs
            max_length: Maximum sequence length
            truncation: Whether to truncate long sequences
            add_generation_prompt: Whether to add generation prompt for inference
            use_streaming: Whether to use streaming mode (True by default, auto False if <60s)
            context_info: Optional context information (e.g., hotwords, metadata) to help transcription
            
        Returns:
            BatchEncoding with:
                - input_ids: Token IDs for the model
                - attention_mask: Attention mask
                - acoustic_input_mask: Mask indicating speech token positions
                - speech_tensors: Processed speech features
                - speech_masks: Valid speech masks
                - vae_tok_seqlens: Length of each speech segment in tokens
        Nz*Audio input is required for ASR processingTF)r   rY   rZ   r[   )rV   rW   rX   rU   )rB   
isinstancelist_process_single_audioappend_batch_encode)r   rT   r   rU   rV   rW   rX   rY   rZ   r[   r   
is_batched
audio_listall_encodingsaudio_inputencodingbatch_encodingr   r   r   __call__   s2   (
zVibeVoiceASRProcessor.__call__c              
      sz  t |trktrBz
t|dd\}}W nF tyA } z$td|  ddl}	|	|\}}|j	dkr7|j
dd}W Y d}~nd}~ww ddl}	|	|\}}|j	dkrX|j
dd}| jkrjddl}
|
j|| jd}n't |tjr|  }|j	dkr| }ntj|tjd	}|j	dkr| }|tj} jr jr |}t| j }|r|d
k rd}tt| j } jjdtdgdd} j |} j! j"} j! j#} j! j$}g d}|r|% rd|dd|%  dd&| }nd|ddd&| }d&|g|g|  |g d | } jjd|dgdd}|| } fdd|D }||||dS )a  
        Process a single audio input.
        
        Args:
            audio: Single audio input
            sampling_rate: Audio sampling rate
            add_generation_prompt: Whether to add generation prompt
            context_info: Optional context information (e.g., hotwords, metadata) to help transcription
            
        Returns:
            Dictionary with processed tokens and audio features
        F)resamplez2ffmpeg loading failed, falling back to soundfile: r   Nr   )axis)orig_sr	target_srdtypeg      N@system)rolecontent)tokenize)
Start timeEnd time
Speaker IDContentz
This is a z.2fz! seconds audio, with extra info: z(

Please transcribe it with these keys: z, z6 seconds audio, please transcribe it with these keys:  
userTc                    s   g | ]}| j krd ndqS )r   r   )r!   ).0tokenr&   r   r   
<listcomp>{  s    z?VibeVoiceASRProcessor._process_single_audio.<locals>.<listcomp>)	input_idsacoustic_input_maskspeechvae_tok_len)'r]   strHAS_FFMPEG_UTILSr   r:   warningswarn	soundfilereadndimmeanr   librosari   torchTensorcpunumpysqueezenparrayfloat32astyper   r   lenmathceilr   r   apply_chat_templateSYSTEM_PROMPTencodeconvert_ids_to_tokensr   r!   r    stripr6   )r   rT   r   rY   rZ   r[   audio_arrayfile_srrI   sfr   audio_durationr   system_prompt_textsystem_tokenssp_start_tokensp_pad_tokensp_end_token	show_keysuser_suffixuser_input_stringuser_tokensfull_tokensr~   r   r&   r   r_     s   







$
z+VibeVoiceASRProcessor._process_single_audio	encodingsc                 C   s  dd |D }dd |D }dd |D }dd |D }	|r|dur%|}
n	t dd |D }
g }g }g }t||D ]H\}}|rQt||
krQ|d|
 }|d|
 }|
t| }| jg| | }d	g| | }d	g| d
gt|  }|| || || q9|}|}ndd |D }t dd |D }tjt||ftjd}tjt|t |	ftd}t	t||	D ]\}\}}|||dt|f< d||d|f< qt
 }|dkrtj|tjd|d< tj|tjd|d< tj|tjd|d< tj|tjd|d< tj|tjd|d< |S t|d
kr|n|d	 |d< t|d
kr$|n|d	 |d< t|d
kr3|n|d	 |d< t|d
krB|n|d	 |d< t|d
krQ|n|d	 |d< |S )a~  
        Combine multiple encodings into a batch.
        
        Args:
            encodings: List of encoded samples
            padding: Whether to pad sequences
            max_length: Maximum sequence length
            truncation: Whether to truncate
            return_tensors: Output format
            
        Returns:
            BatchEncoding with batched data
        c                 S      g | ]}|d  qS )r}   r   rz   encr   r   r   r|         z7VibeVoiceASRProcessor._batch_encode.<locals>.<listcomp>c                 S   r   )r~   r   r   r   r   r   r|     r   c                 S   r   )r   r   r   r   r   r   r|     r   c                 S   r   )r   r   r   r   r   r   r|     r   Nc                 s       | ]}t |V  qd S Nr   rz   idsr   r   r   	<genexpr>      z6VibeVoiceASRProcessor._batch_encode.<locals>.<genexpr>r   r   c                 S   s   g | ]	}d gt | qS )r   r   r   r   r   r   r|     s    c                 s   r   r   r   )rz   sr   r   r   r     r   rm   Tptr}   attention_maskr~   speech_tensorsspeech_masks)maxzipr   r"   r`   r   zerosr   bool	enumerater   r   tensorlong)r   r   rV   rW   rX   rU   input_ids_listacoustic_masks_listspeech_listvae_tok_lenstarget_lengthpadded_input_idspadded_acoustic_masksattention_masksr}   acoustic_maskpadding_length
padded_idspadded_acousticr   max_speech_lengthpadded_speechesr   ir   vae_lenrg   r   r   r   ra     sZ   


	z#VibeVoiceASRProcessor._batch_encodec                 O      | j j|i |S )zi
        Decode batch of token IDs to text.
        Forwards to tokenizer's batch_decode method.
        )r   batch_decoder   argsr   r   r   r   r        z"VibeVoiceASRProcessor.batch_decodec                 O   r   )zZ
        Decode token IDs to text.
        Forwards to tokenizer's decode method.
        )r   decoder   r   r   r   r     r   zVibeVoiceASRProcessor.decodetextc              
   C   s  zd|v r| dd }| d|}|||  }nH| d}|dkr)| d}|dkrad}|}t|t|D ]!}|| dv rE|d	7 }q8|| d
v rY|d	8 }|dkrY|d	 } nq8||| }n|}t|}t|trp|g}g }|D ]-}	t|	tri }
dddddddd}| D ]\}}||	v r|	| |
|< q|
r|	|
 qt|W S  tj
y } ztd|  td|  g W  Y d}~S d}~w ty } ztd|  g W  Y d}~S d}~ww )z
        Post-process the generated transcription text to extract structured data.
        
        Args:
            text: Generated text from the model
            
        Returns:
            List of dictionaries with transcription segments
        z```json   z```[{r   z[{r   z]}
start_timeend_time
speaker_idr   )rs   Startrt   Endru   Speakerrv   z)Failed to parse JSON from transcription: z
Raw text: Nz%Error post-processing transcription: )findr   ranger   r1   loadsr]   dictitemsr`   JSONDecodeErrorr;   r<   debugr:   )r   r   
json_startjson_endjson_strbracket_countr   resultcleaned_resultitemcleaned_itemkey_mappingkey
mapped_keyrI   r   r   r   post_process_transcription  sn   






	
z0VibeVoiceASRProcessor.post_process_transcriptionc                 C   s   g dS )z0Return the list of inputs accepted by the model.)r}   r   r~   r   r   r   r&   r   r   r   model_input_names7  s   z'VibeVoiceASRProcessor.model_input_names)NNr   r   T)	NNNTNFTTN)NTTN)TNFN)!__name__
__module____qualname____doc__r   r   classmethodrA   r   r   r4   PathLikerS   r   r   ndarrayr   r   r   intr	   r   r   rh   r   r   r_   ra   r   r   r   propertyr   r   r   r   r   r      s    

D(	

M

|
XMr   )"r   r4   r1   r   r   typingr   r   r   r   r   r   r   r   r   $transformers.tokenization_utils_baser   r2   r	   r
   vibevoice_tokenizer_processorr   r   audio_utilsr   r   ImportErrorr   
get_loggerr   r;   r   r   __all__r   r   r   r   <module>   s4     
    
"