o
    ꁱi{                     @   s  d dl mZ d dlmZmZmZmZmZmZ d dl	m	Z	 d dl
Z
d dlmZ d dlmZmZ d dlmZmZmZmZmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZ d dlm Z  ddl!m"Z"m#Z# ddl$m%Z% d dl&m'Z' ddl(m)Z) ddl*m+Z+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1m2Z2 e 3e4Z5e6edrej7du rg de_7eG dd deZ8eG dd deZ9G dd deZ:G dd de/eZ;e<e)e; dgZ=dS )    )	dataclass)DictListOptionalTupleUnionCallable)tqdmN)	AutoModelAutoModelForCausalLM)GenerationMixinGenerationConfigLogitsProcessorLogitsProcessorListStoppingCriteriaList)BaseModelOutputWithPastModelOutput)modeling_utils)PreTrainedModel)FlashAttentionKwargs)logging   ) VibeVoiceTokenizerStreamingCacheVibeVoiceTokenizerEncoderOutput)VibeVoiceDiffusionHead)DPMSolverMultistepScheduler)VibeVoiceConfig)VibeVoiceTextTokenizerVibeVoiceTextTokenizerFast)VibeVoiceModelVibeVoicePreTrainedModel)AudioStreamerAsyncAudioStreamerALL_PARALLEL_STYLES)tpnonecolwiserowwisec                   @   s    e Zd ZU dZeej ed< dS )VibeVoiceCausalLMOutputWithPastNlogits)__name__
__module____qualname__r)   r   torchFloatTensor__annotations__ r0   r0   R/home/ubuntu/vibevoice-community/vibevoice/modular/modeling_vibevoice_inference.pyr(   "   s   
 r(   c                   @   sH   e Zd ZU dZdZejed< dZe	e
ej  ed< dZe	ej ed< dS )VibeVoiceGenerationOutputaH  
    Output type for VibeVoice generation.
    
    Args:
        sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            The generated sequences. 
        speech_outputs (`List[torch.FloatTensor]`, *optional*):
            List of generated speech waveforms or latents for each speech segment.
    N	sequencesspeech_outputsreach_max_step_sample)r*   r+   r,   __doc__r3   r-   
LongTensorr/   r4   r   r   r.   r5   
BoolTensorr0   r0   r0   r1   r2   &   s
   
 	r2   c                   @   sF   e Zd ZdZddee dejfddZdej	dej
d	ej
fd
dZdS )!VibeVoiceTokenConstraintProcessorzJConstrains token generation to only valid tokens during speech generation.Nvalid_token_idsdevicec                 C   s   t j|t j|d| _d S )Ndtyper;   )r-   tensorlongr:   )selfr:   r;   r0   r0   r1   __init__8   s   z*VibeVoiceTokenConstraintProcessor.__init__	input_idsscoresreturnc                 C   s.   t |td}d|d d | jf< || }|S )Nz-infr   )r-   	full_likefloatr:   )r@   rB   rC   maskr0   r0   r1   __call__;   s   z*VibeVoiceTokenConstraintProcessor.__call__N)r*   r+   r,   r6   r   intr-   r;   rA   r7   r.   rH   r0   r0   r0   r1   r9   5   s     r9   c                *       s  e Zd ZdgZddiZ fddZedd Zedd	 Zed
d Z	edd Z
edd Zedd Zedd Zedd Zdd Zdd Zdd Zdd Zdd ZdUd!d"ZdVd#d$ZdWd&d'Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	(dXd)ejd*eej d+eej d,eeeej   d-eej d.eej d/ee d0ee d1ee d2ee d3eej d4eej d5eej d6eej d7ee e!f d8eee"f f d9d:Z#dYd<d=Z$e% 	 	 	 	 	 	 	 	 	 	 	 	 	 	>	>	?	 	 dZd@eej dAee& dBee' dCee( dDee)e ejge*e  f  dEee dFedG dHeee+e,f  dIeej dJeej d4eej d5eej d6eej dKedLedMe-dNee)g ef  dOee. d8eeje/f f&dPdQZ0e% d[dSdTZ1  Z2S )\*VibeVoiceForConditionalGenerationInferencezlm_head.weightlm_headcolwise_repc                    sF   t  | t|| _tj|jj|jjdd| _	|j
j| _|   d S )NF)bias)superrA   r   modelnnLineardecoder_confighidden_size
vocab_sizerL   diffusion_head_configddpm_num_inference_stepsddpm_inference_steps	post_init)r@   config	__class__r0   r1   rA   H   s
   

z3VibeVoiceForConditionalGenerationInference.__init__c                 C      | j jS rI   )rP   noise_schedulerr@   r0   r0   r1   r^   W      z:VibeVoiceForConditionalGenerationInference.noise_schedulerc                 C   r]   rI   )rP   prediction_headr_   r0   r0   r1   ra   [   r`   z:VibeVoiceForConditionalGenerationInference.prediction_headc                 C   r]   rI   )rP   speech_scaling_factorr_   r0   r0   r1   rb   _   r`   z@VibeVoiceForConditionalGenerationInference.speech_scaling_factorc                 C   r]   rI   )rP   speech_bias_factorr_   r0   r0   r1   rc   c   r`   z=VibeVoiceForConditionalGenerationInference.speech_bias_factorc                 C   r]   rI   )rP   acoustic_tokenizerr_   r0   r0   r1   rd   g   r`   z=VibeVoiceForConditionalGenerationInference.acoustic_tokenizerc                 C   r]   rI   )rP   semantic_tokenizerr_   r0   r0   r1   re   k   r`   z=VibeVoiceForConditionalGenerationInference.semantic_tokenizerc                 C   r]   rI   )rP   acoustic_connectorr_   r0   r0   r1   rf   o   r`   z=VibeVoiceForConditionalGenerationInference.acoustic_connectorc                 C   r]   rI   )rP   semantic_connectorr_   r0   r0   r1   rg   s   r`   z=VibeVoiceForConditionalGenerationInference.semantic_connectorc                 C   sF   t | jdds	dS t| drt| jjdr!| jjjj| j_dS dS dS )zY
        Tie the weights between the input embeddings and the output embeddings.
        tie_word_embeddingsFNrL   embed_tokens)getattrrZ   hasattrrP   language_modelri   weightrL   r_   r0   r0   r1   tie_weightsw   s
   z6VibeVoiceForConditionalGenerationInference.tie_weightsc                 C   s
   | j  S rI   )rP   get_input_embeddingsr_   r0   r0   r1   ro         
z?VibeVoiceForConditionalGenerationInference.get_input_embeddingsc                 C   s   | j | d S rI   )rP   set_input_embeddings)r@   valuer0   r0   r1   rq      s   z?VibeVoiceForConditionalGenerationInference.set_input_embeddingsc                 C   s   | j S rI   rL   r_   r0   r0   r1   get_output_embeddings   s   z@VibeVoiceForConditionalGenerationInference.get_output_embeddingsc                 C   s
   || _ d S rI   rs   )r@   new_embeddingsr0   r0   r1   set_output_embeddings   rp   z@VibeVoiceForConditionalGenerationInference.set_output_embeddingsNc                 C   s   | j || dS )z@Set the speech tokenizers used for encoding and decoding speech.N)rP   set_speech_tokenizers)r@   rd   re   r0   r0   r1   rw      s   z@VibeVoiceForConditionalGenerationInference.set_speech_tokenizersc                 C   s   |p| j jj| _d S rI   )rZ   rV   rW   rX   )r@   	num_stepsr0   r0   r1   set_ddpm_inference_steps   s   zCVibeVoiceForConditionalGenerationInference.set_ddpm_inference_stepsaudioc                 C   s,  t   |dkrE| jj|d}|j| jjjdd }|| jj	|j
 | jj	|j
 }| j||  }||fW  d   S |dkrt|| jjjd}|j| jjjdd }|| jj	|j
 | jj	|j
 }| j||  }||fW  d   S td| d	1 sw   Y  dS )
z8Process speech inputs through tokenizers and connectors.rz   r   )	dist_typer   Npt)meanstdzSpeech type z not implemented)r-   no_gradrP   rd   encode	unsqueezesamplestd_dist_typerc   tor;   rb   rf   cpur   rZ   fix_stdNotImplementedError)r@   speech_tensorsspeech_masksspeech_typeencoder_outputacoustic_latentsacoustic_featuresacoustic_connectedr0   r0   r1   _process_speech_inputs   s"   
$$zAVibeVoiceForConditionalGenerationInference._process_speech_inputsr   rB   attention_maskposition_idspast_key_valuesinputs_embedslabels	use_cacheoutput_attentionsoutput_hidden_statesreturn_dictcache_positionr   r   speech_input_masklogits_to_keeprD   c                 K   s   |
dur|
n| j j}
|du r| j |}|dur1|dur1| || j|\}}|dur1|||< | jd|||||||	|
|d	|}|
sI|d n|j}t|t	rWt
| dn|}| |dd|ddf }|durotdt||j||jdS )a  
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
            speech_tensors (`torch.FloatTensor`, *optional*):
                Input speech waveforms for voice cloning or speech understanding.
            speech_masks (`torch.BoolTensor`, *optional*):
                Masks indicating valid speech frames.
            speech_input_mask (`torch.BoolTensor`, *optional*):
                Positions in the input sequence where speech embeddings should be inserted.
        
        Returns:
            `VibeVoiceCausalLMOutputWithPast` or tuple
        N)	r   r   r   r   r   r   r   r   r   r   z4Loss computation is not implemented in this version.)r)   r   last_hidden_state
attentionsr0   )rZ   use_return_dictrP   ro   r   r   r=   r   
isinstancerJ   slicerL   r   r(   r   r   )r@   rB   r   r   r   r   r   r   r   r   r   r   r   r   r   r   kwargsr   speech_embedsoutputshidden_statesslice_indicesr)   r0   r0   r1   forward   s>   #
z2VibeVoiceForConditionalGenerationInference.forwardFc              	   K   s  |d u rt |j|j|jd}nt di ||j|j|jd}| j|df|j|j|jd|\}}|j|_|j|_|j|_| ||j|\}}}|j	d }	| j
}
| j|d|
d d|_|j|d< || j
}|j	d }|dd u ow|jd u}|d	d u o|jd u}| j||||||d
}|jd }| ||d |	||
 tj||
tjd|d< | D ]\}}t|tjr|j|
d||< q|r| j|||d t |j
|d}| j|t d}|||||fS |||fS )N)bos_token_ideos_token_idpad_token_idT)speech_start_idspeech_end_idspeech_diffusion_idr   r;   r   r   
max_length
min_length)generation_confighas_default_max_lengthhas_default_min_lengthmodel_input_nameinputs_tensorinput_ids_length)r;   r=   r   )r   input_ids_seq_lengthencoder_input_idsprefix_allowed_tokens_fnlogits_processorr;   model_kwargs)r   stopping_criteriar0   )r   r   r   r   _prepare_generation_configr   r   r   _prepare_model_inputsshaper;   _prepare_special_tokensr   r   getr   r   _prepare_generated_length_prepare_cache_for_generationr-   aranger?   itemsr   Tensor_get_logits_processorr   _get_stopping_criteriar   )r@   r   inputs	tokenizerreturn_processorsr   r   r   r   
batch_sizer;   rB   r   r   r   max_cache_lengthkvr   r   r0   r0   r1   #_build_generate_config_model_kwargs   s   





	

zNVibeVoiceForConditionalGenerationInference._build_generate_config_model_kwargsT      ?r   r   r   r   r   synced_gpusassistant_modelr   audio_streamernegative_prompt_idsnegative_prompt_attention_mask
is_prefillreturn_speech	cfg_scalestop_check_fn
tqdm_classc           \   
   K   s,  | dd}| dd}| dd}| dd}|dddu r.| jjj|d jd	  |d< | j|||fd
di|\}}}}}tj|d jd df|j	tj
|d jdtj|d jd dftj
|d jd|ddd}| jdd|fd
di|\}}}t }t }|jd } |j}!tj| tj|!d}"tj| tj
|!d}#d}$|dd}%dd t| D }&|jd	 }'|d jd	d}(|j	|j|j|jg})t|dr|jdur|)|j t|)|!d}*|du rt }||* t|j|' t||' }+t|j|( ||( 
 },tj| tj|!d}-|ddr$|dur|nt}.|.t|+ddd}/nt|+}/|/D ]}0|durN| rN|%rBtd|0d   |durK|   n|durnt|drnt |j!rn|%rktd|0d    nr|"" rt|/dr~|/#d   n_|jd	 |jkrtd!|j d" tj$| |!d|"  }1|1% dkrd|-|1<  n5t|/dr|"  & }2|/#d#|2 d$|  d% | j'|fi |}3|ri }4|dur|j(|!d|4d&< |dur|(|!|4d'< |dur|(|!|4d(< d}n
|3 d)d}5d)|$i}4| d9i |3|4ddddd*}6| j)|6|dd+}|6j*ddd	ddf j(dtj+|jd,}7|||7}8|j,rLt-j.j/|8d	d}9tj0|9dd-1d}:ntj2|8d	d}:|j|:|"< tj3||:dddf gd	d}|d.ds| j'|fi |};|;d) du r|$dur|$|;d)< d|;d< | d9i |;ddddd*}<| j)|<|dd+}tj3||:dddf gd	d}|:|jk  r|:|jkj4dd/1d}=|=|"|=   }>|>% dkrd|"|>< |%rtd0|>5  d1|0d  d2dd3 |dur||> |0|,k}?tj4|?|" @ dd/1d}@|@% dkr2d|"|@< d|-|@< |%r(td0|@5  d4|0d  d2dd3 |dur2||@ |:|jkj4dd/1d}A|A% dkrO|6|A |6|A tj$| |!d|" |:|j	k@  }B|B% dkr|d.drt7|B5 D ]\}C}Dd|d |Dddf< d|d |Dd	f< qrt7t8|d5 j9|d5 j:D ]?\}E\}F}G|B5 D ]3}D|F|Ddddddf ; |F|Dddd	ddf< |G|Ddddddf ; |G|Dddd	ddf< qq|B5 D ]
}D|j	||Dd	f< q| j<= |:>d}Htj$| |!d|" |:|jk@  }I|I% dkr|d.drO| j'|fi |};|;d) du r*|$dur*|$|;d)< d|;d< | d9i |;ddddd*}<| j)|<|dd+}tj3||:dddf gd	d}|" |:|jk@ }J|J  rKtj$| |!d|J }K|#|K }L|d jd }Mt7t8|K5 |L5 D ].\}C\}D}N|Nd |Md k r|d |D|Nd	f ; |d |D|Nd df< d|d |D|Nf< q{t7t8|d5 j9|d5 j:D ]^\}E\}F}Gt8|K5 |L5 D ]M\}D}N|Nd |Fjd d k r|F|Ddd|Nd	ddf ; |F|Ddd|Nd dddf< |G|Ddd|Nd	ddf ; |G|Ddd|Nd dddf< qƐqt8|K5 |L5 D ]#\}D}N|Nd |jd d k rA||D|Nd	f ; ||D|Nd df< q|#|K  d7  < |6j?|Id	ddf }O|<j?|Id	ddf }P| j@|O|P|d6>d}Q|Q| j<jA(|Qj | j<jB(|Qj }R| j<jCjD|R(| j<jCj||I(| j<jCjddd7}St7|ID ]\}C}D|D& }T|"|T s|&|T |S|C  q|dur|E|S|I | j<jFjG|S||Iddd7jH}U| j<I|Q}V| j<J|U}W|V|W }X|X|H|I< |H}$q*|dur|  g }Y|&D ]}Z|Zr tj3|Zd	d}[|Y|[ q|Yd qtK||r|Y|-d8S d|-d8S ):a+  
        Generates sequences of token ids and optionally speech outputs.
        
        Args:
            All standard generation arguments from GenerationMixin
            negative_prompt_ids: Negative prompt for CFG in speech generation
            negative_prompt_attention_mask: Attention mask for negative prompt
            speech_tensors: Input speech for voice cloning
            speech_masks: Masks for speech tensors  
            speech_input_mask: Positions to insert speech embeddings
            return_speech: Whether to decode and return speech outputs
            cfg_scale: CFG scale for speech generation
            stop_check_fn: Optional callable that returns True if generation should stop
 
        Returns:
            Generated token sequences and optionally speech outputs
        r   Nparsed_scriptsall_speakers_listmax_length_times   max_new_tokensrB   r   Tr   r   r<   d   )rB   r   r   Fverbosec                 S   s   g | ]}g qS r0   r0   ).0_r0   r0   r1   
<listcomp>  s    zGVibeVoiceForConditionalGenerationInference.generate.<locals>.<listcomp>r   dimr   r   show_progress_bar
Generating)descleavez&Generation stopped externally at step finished_flagsz,Audio generation stopped externally at step set_descriptionzGeneration completez"Reached maximum generation length z, stopped it.zGenerating (active: /)r   r   r   r   )r   r   r   r   )is_encoder_decoder)copyr=   r;   )num_samplesrefresh_negative)as_tuplezSamples z reached EOS token at step .)flushz' reached max generation length at step r   )r   )cachesample_indicesr   debug)r3   r4   r5   r0   )Lpopr   rZ   rS   max_position_embeddingsr   r   r-   fullr   r?   r;   onesr   zerosboolrangesumr   r   r   rk   r   appendr9   r   minr   rJ   r	   printendanyr   allr   r   numelitemprepare_inputs_for_generationr   #_update_model_kwargs_for_generationr)   float32	do_samplerQ   
functionalsoftmaxmultinomialsqueezeargmaxcatnonzerotolistset_to_zero	enumeratezip	key_cachevalue_cacheclonerP   ro   r   r   sample_speech_tokensrb   rc   rd   decodeputre   r   r}   rf   rg   r2   )\r@   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rB   negative_kwargsnegative_generation_confignegative_model_kwargsnegative_input_idsacoustic_cachesemantic_cacher   r;   finished_tagscorrect_cntr   r   audio_chunksinitial_lengthinitial_length_per_samplevalid_tokenstoken_constraint_processor	max_stepsmax_step_per_sampler5   tqdm_fnprogress_barstepreached_samplesactive_samplesmodel_inputsprefill_inputsr   r   next_token_logitsnext_token_scoresprobsnext_tokensnegative_model_inputsnegative_outputseos_indicesnew_eos_indicesmax_length_reachednew_max_length_indicesdiffusion_end_indicesdiffusion_start_indicesi
sample_idx	layer_idxk_cachev_cachenext_inputs_embedsdiffusion_indicesnon_diffusion_masknon_diffusion_indicesstart_indicesseq_len	start_idxpositive_conditionnegative_conditionspeech_latentscaled_latentaudio_chunkidxsemantic_featuresacoustic_embedsemantic_embeddiffusion_embedsfinal_audio_outputssample_chunksconcatenated_audior0   r0   r1   generateF  s  )($











(


"

"



04

"<<$	

	
z3VibeVoiceForConditionalGenerationInference.generate      @c                 C   s
  | j j| j tj||gdd| j jj}t	|j
d | jj|}| j jjD ]P}|d t|d  }tj||gdd}| j j|||j
d ||d}tj|t|d dd\}	}
|
||	|
   }tj||gdd}| j j|||j}q*|d t|d  S )Nr   r   r   )	condition)rP   r^   set_timestepsrX   r-   r  r   ra   r;   randnr   rZ   acoustic_vae_dim	timestepslenrepeatsplitr0  prev_sample)r@   r\  neg_conditionr   speechthalfcombinedepscond_eps
uncond_epshalf_epsr0   r0   r1   r    s   $z?VibeVoiceForConditionalGenerationInference.sample_speech_tokens)NNrI   )rz   )NNNNNNNNNNNNNNr   )F)NNNNNNNNNNNNNTTr   NN)r[  )3r*   r+   r,   _tied_weights_keys_tp_planrA   propertyr^   ra   rb   rc   rd   re   rf   rg   rn   ro   rq   rt   rv   rw   ry   r   r-   r7   r   r   r   r.   r   r8   r   rJ   r   r(   r   r   r   r   r   r   r   r   r!   r"   rF   typer2   rZ  r  __classcell__r0   r0   r[   r1   rK   D   s   










 	




KH	
  vrK   )>dataclassesr   typingr   r   r   r   r   r   r	   r-   torch.nnrQ   transformers.models.autor
   r   transformers.generationr   r   r   r   r   transformers.modeling_outputsr   r   transformersr   transformers.modeling_utilsr   +transformers.modeling_flash_attention_utilsr   transformers.utilsr   modular_vibevoice_tokenizerr   r    modular_vibevoice_diffusion_headr   vibevoice.schedule.dpm_solverr   configuration_vibevoicer    modular_vibevoice_text_tokenizerr   r   modeling_vibevoicer   r    streamerr!   r"   
get_loggerr*   loggerrk   r#   r(   r2   r9   rK   register__all__r0   r0   r0   r1   <module>   sH     

     