o
    ꁱiȋ                     @   s  d Z ddlmZ ddlmZmZmZmZmZm	Z	m
Z
 ddlmZ ddlZddlmZ ddlmZmZ ddlmZmZmZmZmZ ddlmZmZ dd	lmZ dd
lmZ ddlm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z,m-Z- ddl.m/Z/m0Z0m1Z1 ddl2m3Z3m4Z4 e"5e6Z7e8edrej9du rg de_9dZ:dZ;	d&dedee<ef de=dee<ef fddZ>eG dd  d eZ?eG d!d" d"eZ@G d#d$ d$e/eZAeBe*eA g d%ZCdS )'a6  
VibeVoice Streaming Inference Model (0.5B)

This module implements the inference engine for real-time streaming TTS.
Key features:
- Window-based text/speech interleaving for streaming
- Binary EOS classifier for end-of-speech detection
- Classifier-free guidance for speech quality
- Audio streaming support
    )	dataclass)AnyDictListOptionalTupleUnionCallable)tqdmN)	AutoModelAutoModelForCausalLM)GenerationMixinGenerationConfigLogitsProcessorLogitsProcessorListStoppingCriteriaList)BaseModelOutputWithPastModelOutput)modeling_utils)PreTrainedModel)FlashAttentionKwargs)logging   ) VibeVoiceTokenizerStreamingCache)VibeVoiceDiffusionHead)DPMSolverMultistepScheduler)VibeVoiceStreamingConfig)VibeVoiceTextTokenizerVibeVoiceTextTokenizerFast)!VibeVoiceStreamingPreTrainedModelVibeVoiceStreamingModelBinaryClassifier)AudioStreamerAsyncAudioStreamerALL_PARALLEL_STYLES)tpnonecolwiserowwise      outputsmodel_kwargsnum_new_tokensreturnc                 C   sx   t | d|d< |d }tj|||jd |fgdd|d< t|d d d |d d | d |d j|d< |S )a.  
    Update model_kwargs after adding new tokens.

    Mainly for the case num_new_tokens > 1 (e.g. a whole text window):
      - past_key_values: take from current outputs
      - attention_mask: append num_new_tokens ones
      - cache_position: advance by creating a range for all new positions
    past_key_valuesattention_maskr   dimcache_positionr   )getattrtorchcatnew_onesshapearangetodevice)r+   r,   r-   r0    r=   \/home/ubuntu/vibevoice-community/vibevoice/modular/modeling_vibevoice_streaming_inference.py#_update_model_kwargs_for_generation-   s   
8r?   c                   @   s    e Zd ZU dZeej ed< dS )VibeVoiceCausalLMOutputWithPastNlogits)__name__
__module____qualname__rA   r   r6   FloatTensor__annotations__r=   r=   r=   r>   r@   H   s   
 r@   c                   @   sH   e Zd ZU dZdZejed< dZe	e
ej  ed< dZe	ej ed< dS )VibeVoiceGenerationOutputaC  
    Output type for VibeVoice generation.

    Args:
        sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            The generated sequences.
        speech_outputs (`List[torch.FloatTensor]`, *optional*):
            List of generated speech waveforms or latents for each speech segment.
    N	sequencesspeech_outputsreach_max_step_sample)rB   rC   rD   __doc__rH   r6   
LongTensorrF   rI   r   r   rE   rJ   
BoolTensorr=   r=   r=   r>   rG   M   s
   
 	rG   c                (       s  e Zd ZdZ fddZedd Zedd Zedd	 Zed
d Z	edd Z
edd Zdd Zdd Zdd Zdd Zdd ZdOddZdOddZ											dPdejd eej d!eej d"eeeej   d#eej d$eej d%ee d&ee d'ee d(ee d)eej d*eeef fd+d,Z													dQdejd eej d!eej d"eeeej   d#eej d$eej d%ee d&ee d'ee d(ee d)eej d-eej d.eej d*eeef fd/d0Zd1d2 Z dRd4d5Z!e" 															6	7	dSd8eej d9ee# d:ee$ d;ee% d<ee&e'ejge(e' f  d=ee d>ed? d@eee)e*f  dAeej dBeej dCeej dDeej dEeej dFeej dGedHe+dIee&g ef  d*eeje,f f$dJdKZ-e" dTdMdNZ.  Z/S )U3VibeVoiceStreamingForConditionalGenerationInferenceaG  
    VibeVoice Streaming model for conditional generation inference.

    This class handles the full streaming TTS pipeline:
    1. Prefill text prompt with cached voice embeddings
    2. Window-based text/speech interleaving
    3. Diffusion-based speech token generation
    4. Binary EOS detection for natural stopping
    c                    s:   t  | t|| _t|jj| _|jj	| _
|   d S N)super__init__r    modelr!   decoder_confighidden_sizetts_eos_classifierdiffusion_head_configddpm_num_inference_stepsddpm_inference_steps	post_init)selfconfig	__class__r=   r>   rQ   h   s
   

z<VibeVoiceStreamingForConditionalGenerationInference.__init__c                 C      | j jS rO   )rR   noise_schedulerrZ   r=   r=   r>   r_   w      zCVibeVoiceStreamingForConditionalGenerationInference.noise_schedulerc                 C   r^   rO   )rR   prediction_headr`   r=   r=   r>   rb   {   ra   zCVibeVoiceStreamingForConditionalGenerationInference.prediction_headc                 C   r^   rO   )rR   speech_scaling_factorr`   r=   r=   r>   rc      ra   zIVibeVoiceStreamingForConditionalGenerationInference.speech_scaling_factorc                 C   r^   rO   )rR   speech_bias_factorr`   r=   r=   r>   rd      ra   zFVibeVoiceStreamingForConditionalGenerationInference.speech_bias_factorc                 C   r^   rO   )rR   acoustic_tokenizerr`   r=   r=   r>   re      ra   zFVibeVoiceStreamingForConditionalGenerationInference.acoustic_tokenizerc                 C   r^   rO   )rR   acoustic_connectorr`   r=   r=   r>   rf      ra   zFVibeVoiceStreamingForConditionalGenerationInference.acoustic_connectorc                 C   sF   t | jdds	dS t| drt| jjdr!| jjjj| j_dS dS dS )zY
        Tie the weights between the input embeddings and the output embeddings.
        tie_word_embeddingsFNlm_headembed_tokens)r5   r[   hasattrrR   language_modelri   weightrh   r`   r=   r=   r>   tie_weights   s
   z?VibeVoiceStreamingForConditionalGenerationInference.tie_weightsc                 C   s
   | j  S rO   )rR   get_input_embeddingsr`   r=   r=   r>   rn      s   
zHVibeVoiceStreamingForConditionalGenerationInference.get_input_embeddingsc                 C   s   | j | d S rO   )rR   set_input_embeddings)rZ   valuer=   r=   r>   ro      s   zHVibeVoiceStreamingForConditionalGenerationInference.set_input_embeddingsc                 C   s   dS )zR
        This model does not define an `lm_head` (vocabulary projection).
        Nr=   r`   r=   r=   r>   get_output_embeddings   s   zIVibeVoiceStreamingForConditionalGenerationInference.get_output_embeddingsc                 C      t d)z
        No-op because there is no `lm_head`. Provided only to satisfy optional API calls.
        To enable, first create `self.lm_head` then allow assignment.
        zvOutput embeddings (lm_head) are not defined for this model. Create one before calling set_output_embeddings if needed.RuntimeError)rZ   new_embeddingsr=   r=   r>   set_output_embeddings   s   zIVibeVoiceStreamingForConditionalGenerationInference.set_output_embeddingsNc                 C   s   | j | dS )z@Set the speech tokenizers used for encoding and decoding speech.N)rR   set_speech_tokenizers)rZ   re   r=   r=   r>   rw      s   zIVibeVoiceStreamingForConditionalGenerationInference.set_speech_tokenizersc                 C   s   |p| j jj| _d S rO   )r[   rV   rW   rX   )rZ   	num_stepsr=   r=   r>   set_ddpm_inference_steps   s   zLVibeVoiceStreamingForConditionalGenerationInference.set_ddpm_inference_steps	input_idsr0   position_idsr/   inputs_embedslabels	use_cacheoutput_attentionsoutput_hidden_statesreturn_dictr4   r.   c                 K   s   |
dur|
n| j j}
|du r| j |}| jjd|||||||	|
|d	|}|
s.|d n|j}|dur9tdt|j||j	dS )aN  
        Single pass of the base text LM.

        - Builds embeddings if `inputs_embeds` not provided.
        - Uses (and returns) `past_key_values` when `use_cache=True`.
        - No loss / no lm_head / no speech logic.

        Args:
            input_ids: (B, S) token ids.
            attention_mask: (B, S) mask.
            past_key_values: cache from previous steps.
            cache_position: positions for cached tokens.
            labels: unsupported (will raise).

        Returns:
            BaseModelOutputWithPast with `last_hidden_state` and `past_key_values`.
        N	r|   r0   r{   r/   r~   r   r   r   r4   r   4Loss computation is not implemented in this version.)r/   last_hidden_state
attentionsr=   )
r[   use_return_dictrR   rn   rk   r   NotImplementedErrorr   r/   r   )rZ   rz   r0   r{   r/   r|   r}   r~   r   r   r   r4   kwargsr+   hidden_statesr=   r=   r>   
forward_lm   s0    
z>VibeVoiceStreamingForConditionalGenerationInference.forward_lmlm_last_hidden_statetts_text_masksc                 K   s   |
dur|
n| j j}
|du r| j |}|jd |jd  }||dd|dddf< || j|  }| jjd|||||||	|
|d	|}|
sO|d n|j}| 	|dddddf }|durht
dt||j||jdS )	aV  
        Single pass of the TTS LM.

        - Overwrites tail embeddings with `lm_last_hidden_state`.
        - Adds type embedding via `tts_text_masks` (1=text, 0=speech).
        - Predicts EOS from last hidden state (binary classifier).
        - No loss / no full acoustic decoding here.

        Args:
            input_ids: (B, S) token ids.
            attention_mask: (B, S) mask.
            lm_last_hidden_state: (B, K, H) hidden states to splice into the tail.
            tts_text_masks: (B, 1) mask marking current position as text(1)/speech(0).
            past_key_values: cache from previous TTS steps.
            cache_position: positions for cached tokens.
            labels: unsupported (will raise).

        Returns:
            VibeVoiceCausalLMOutputWithPast with `logits` (EOS), `last_hidden_state`, `past_key_values`.
        Nr   r   r   r1   r   )rA   r/   r   r   r=   )r[   r   rR   rn   r9   tts_input_typeslongtts_language_modelr   rU   r   r@   r/   r   )rZ   rz   r0   r{   r/   r|   r}   r~   r   r   r   r4   r   r   r   	start_idxr+   r   rA   r=   r=   r>   forward_tts_lm   s:   %
zBVibeVoiceStreamingForConditionalGenerationInference.forward_tts_lmc                 O   rr   )a  
        Unified forward is intentionally disabled.

        Reasons:
          1. The inference pipeline is staged: base text LM, then TTS LM, plus streaming & diffusion handled in `generate`.
          2. A monolithic call would hide required sequencing (prefill, window stepping, speech diffusion sampling).

        Use instead:
          - self.forward_lm(...)       for a base text LM step (prefill or incremental).
          - self.forward_tts_lm(...)   for a single TTS LM step (needs LM hidden states).
          - self.generate(...)         for full streaming (text + speech + diffusion + audio assembly).

        Raises:
            RuntimeError: Always (by design).
        zWUnified forward is disabled. Use `forward_lm`, `forward_tts_lm`, or `generate` instead.rs   )rZ   argsr   r=   r=   r>   forward@  s   z;VibeVoiceStreamingForConditionalGenerationInference.forwardFc              	   K   s  |d u rt |j|j|jd}nt di ||j|j|jd}| j|df|j|j|jd|\}}|j|_|j|_|j|_| ||j|\}}}|j	d }	| j
}
| j|d|
d d|_|j|d< || j
}|j	d }|dd u ow|jd u}|d	d u o|jd u}| j||||||d
}|jd }| ||d |	||
 tj||
tjd|d< | D ]\}}t|tjr|j|
d||< q|r| j|||d t |j
|d}| j|t d}|||||fS |||fS )N)bos_token_ideos_token_idpad_token_idT)speech_start_idspeech_end_idspeech_diffusion_idr   r<   r~   r   
max_length
min_length)generation_confighas_default_max_lengthhas_default_min_lengthmodel_input_nameinputs_tensorinput_ids_length)r<   dtyper4   )r   input_ids_seq_lengthencoder_input_idsprefix_allowed_tokens_fnlogits_processorr<   r,   )r   stopping_criteriar=   )r   r   r   r   _prepare_generation_configr   r   r   _prepare_model_inputsr9   r<   _prepare_special_tokensr~   r;   getr   r   _prepare_generated_length_prepare_cache_for_generationr6   r:   r   items
isinstanceTensor_get_logits_processorr   _get_stopping_criteriar   )rZ   r   inputs	tokenizerreturn_processorsr   r,   r   r   
batch_sizer<   rz   r   r   r   max_cache_lengthkvr   r   r=   r=   r>   #_build_generate_config_model_kwargsT  s   





	

zWVibeVoiceStreamingForConditionalGenerationInference._build_generate_config_model_kwargsT      ?r   r   r   r   r   synced_gpusassistant_modelr   audio_streamernegative_prompt_idsnegative_prompt_attention_maskspeech_tensorsspeech_masksspeech_input_masktts_text_idsreturn_speech	cfg_scalestop_check_fnc           M      K   s	  | dd}|d}| dd}| dd}| dd}|| j}|dddu r7| jjj|jd  |d< | j	|||fd	d
i|\}}}}}t
j|d jd df|t
j|d jdt
j|d jd dft
j|d jd|ddd}| j	dd|fd	di|\}}}|||ddd}| j	dd|fd	di|\}} }t
j|d jd df|t
j|d jdt
j|d jd dft
j|d jd|ddd}!| j	dd|fd	di|!\}"}#}$t }%|jd }&|&dksJ d|j}'t
j|&t
j|'d}(|dd})dd t|&D }*d}+t
j|&t
j|'d},|jd tkrtn|jd }-|d }.|d }/|d }0|d }1t|.||-d}t|/| |-d} | j|0|dd}| j|1|#dd}#|jd }2d}3d}4|dd
rut|jd|2 d|2 d|j d |2dd!}5nd}5	 |dur| r|)rtd"|2d   |dur|  n|( rt|5d#r|5d$ n|dd|+t |+d t f }6|dd|+d t |+d% t f jd }7|+d7 }+|6jd dkrt
j||6gdd&}t
j||6gdd&}|jd |jkr|)rtd'|j d( t
j|&|'d)|(  }8|8 dkrd
|,|8< n%|2|6jd 7 }2|4|6jd 7 }4|5durG|5|6jd  |5d|4 d*|3 d+|2 d|j d 	 | j|fi |}9| j d2i |9d
ddd,}.t|.||7d}| j|fi | }:t
!|ddddf |.j"d-};| j#d2i |:|;d
ddd,}/| j|/| dd} t
$dg}<tt%D ]n}=|/j"|<dddf }>|1j"|<dddf }?| j&|>|?|d.'d}@|@| j(j)|@j | j(j*|@j }A| j(j+j,|A| j(j+j|%|<| j(j+jd
dd/}Bt-|<D ]\}C}D|D. }E|(|E s|*|E /|B|C  q|dur|0|B|< | j(1|@}Ft
j|t
!|ddddf gdd&}|jd |jkr9 n|2d7 }2|3d7 }3|5dur^|5d |5d|4 d*|3 d+|2 d|j d 	 | j|fi | }:t
2|ddddf |Fd-};| j#d2i |:|;d
ddd,}/|=t%d kr|7dkrt|/| |7d} n| j|/| dd} t
j|$t
!|ddddf gdd&}$| j|$fi |#}Gt
2|$ddddf |Fd-}H| j#d2i |G|Hd
ddd,}1| j|1|#dd}#t
3| 4|/j"|<dddf }I|Id . d0krd
|(|<< |dur||< q|jd |jkr<|)r&td'|j d( t
j|&|'d)|(  }8|8 dkr;d
|,|8< nqx|durG|  g }J|*D ]}K|Kr^t
j|Kdd&}L|J/|L qK|J/d qK|,durx|,5 rxtd'|j d( t6||r|J|,d1S d|,d1S )3u|  
        Generate speech from text using streaming TTS.

        Text is fed in small windows (dynamic slicing of `tts_text_ids`), which enables streaming text input:
        you don't need the full text upfront. After each text window, a loop samples several speech latents
        (diffusion). The interleaved text encoding + speech generation enables streaming text input and
        realtime speech output.

        The function only supports batch size = 1 currently.

        - Windowed text prefill → incremental LM + TTS LM updates.
        - Interleave speech token diffusion sampling (`sample_speech_tokens`).
        - Stops on EOS (binary classifier) or max length / external `stop_check_fn`.
        - Returns final token `sequences` and (optionally) concatenated speech audio.

        Args:
            inputs: Input tensor (usually from processor)
            generation_config: Generation configuration
            audio_streamer: If provided, emits audio chunks during generation
            tts_text_ids: Full text tokens to stream in windows
            cfg_scale: Classifier-free guidance scale for speech diffusion (default: 1.0)
            return_speech: If False, skips audio decode concatenation
            stop_check_fn: External early-stop hook (returns True to halt)
            **kwargs: Additional arguments including tokenizer, all_prefilled_outputs, etc.

        Returns:
            VibeVoiceGenerationOutput with:
              - sequences: final token ids
              - speech_outputs: list of concatenated audio tensors (or None)
              - reach_max_step_sample: flags for samples stopped by max length
        r   Nz<|image_pad|>tts_lm_input_idstts_lm_attention_maskall_prefilled_outputsmax_new_tokensr1   r   Trz   r   r   )r   r<   d   )rz   r0   r   Fz'Currently only supports batch size == 1verbosec                 S   s   g | ]}g qS r=   r=   ).0_r=   r=   r>   
<listcomp>  s    zPVibeVoiceStreamingForConditionalGenerationInference.generate.<locals>.<listcomp>lmtts_lmneg_lm
neg_tts_lm)r-   )is_encoder_decodershow_progress_barz
Prefilled z tokens, current step (z / ))totaldescinitialleavez&Generation stopped externally at step set_descriptionzGeneration complete   r2   z"Reached maximum generation length z, stopped it.r   z text tokens, generated z speech tokens, current step ()r   r   r   )r   r   )r   )cachesample_indicesr~   debugg      ?)rH   rI   rJ   r=   )7popconvert_tokens_to_idsr;   r<   r   r[   rS   max_position_embeddingsr9   r   r6   fullr   onesr   zerosboolrangeTTS_TEXT_WINDOW_SIZEr?   r
   r   printendallrj   r   r7   r:   numelupdateprepare_inputs_for_generationr   	ones_liker   r   rL   TTS_SPEECH_WINDOW_SIZEsample_speech_tokens	unsqueezerR   rc   rd   re   decode	enumerateitemappendputrf   
zeros_likesigmoidrU   anyrG   )MrZ   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   neg_text_input_idr   r   r   r,   rz   negative_kwargsnegative_generation_confignegative_model_kwargsnegative_input_idstts_lm_kwargstts_lm_generation_configtts_lm_model_kwargstts_lm_negative_kwargs!tts_lm_negative_generation_configtts_lm_negative_model_kwargstts_lm_negative_input_idsacoustic_cacher   r<   finished_tagsr   audio_chunkstts_text_window_indexrJ   first_text_window_sizer+   tts_lm_outputsnegative_outputstts_lm_negative_outputssteptotal_generated_speech_tokenstotal_prefilled_text_tokensprogress_barcur_input_tts_text_idsnext_text_window_sizereached_samplesmodel_inputstts_lm_model_inputstts_lm_additional_inputsdiffusion_indicescur_speech_indexpositive_conditionnegative_conditionspeech_latentscaled_latentaudio_chunki
sample_idxidxacoustic_embedtts_lm_negative_model_inputs!tts_lm_negative_additional_inputstts_eos_logitsfinal_audio_outputssample_chunksconcatenated_audior=   r=   r>   generate  s  6
&$

&$





 *
&

$	

(

&
(
 

  
z<VibeVoiceStreamingForConditionalGenerationInference.generate      @c                 C   s
  | j j| j tj||gdd| j jj}t	|j
d | jj|}| j jjD ]P}|dt|d  }tj||gdd}| j j|||j
d ||d}tj|t|d dd\}	}
|
||	|
   }tj||gdd}| j j|||j}q*|dt|d  S )as  
        Sample speech tokens using diffusion with classifier-free guidance.

        Args:
            condition: Positive conditioning from TTS LM hidden states
            neg_condition: Negative conditioning for CFG
            cfg_scale: Classifier-free guidance scale (higher = more adherence to text)

        Returns:
            Generated speech latents
        r   r2   Nr   )	condition)rR   r_   set_timestepsrX   r6   r7   r;   rb   r<   randnr9   r[   acoustic_vae_dim	timestepslenrepeatsplitr  prev_sample)rZ   r.  neg_conditionr   speechthalfcombinedepscond_eps
uncond_epshalf_epsr=   r=   r>   r     s   $zHVibeVoiceStreamingForConditionalGenerationInference.sample_speech_tokensrO   )NNNNNNNNNNN)NNNNNNNNNNNNN)F)NNNNNNNNNNNNNNTr   N)r-  )0rB   rC   rD   rK   rQ   propertyr_   rb   rc   rd   re   rf   rm   rn   ro   rq   rv   rw   ry   r6   rL   r   r   r   rE   r   r   r   r   rM   r@   r   r   r   no_gradr   r   r   r	   intr   r"   r#   floatrG   r,  r   __classcell__r=   r=   r\   r>   rN   ]   sB   








	


@	


M
H	
  <rN   )rN   rG   r@   r   r   )r   )DrK   dataclassesr   typingr   r   r   r   r   r   r	   r
   r6   torch.nnnntransformers.models.autor   r   transformers.generationr   r   r   r   r   transformers.modeling_outputsr   r   transformersr   transformers.modeling_utilsr   +transformers.modeling_flash_attention_utilsr   transformers.utilsr   modular_vibevoice_tokenizerr    modular_vibevoice_diffusion_headr   vibevoice.schedule.dpm_solverr   !configuration_vibevoice_streamingr    modular_vibevoice_text_tokenizerr   r   modeling_vibevoice_streamingr   r    r!   streamerr"   r#   
get_loggerrB   loggerrj   r$   r   r   strrB  r?   r@   rG   rN   register__all__r=   r=   r=   r>   <module>   s^    $




     