o
    聱iT                     @   s  d dl mZ d dlmZmZmZmZmZmZ d dl	m	Z	 d dl
Z
d dlmZ d dlmZmZ d dlmZmZmZmZmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZ d dlm Z  ddl!m"Z"m#Z# ddl$m%Z% d dl&m'Z' ddl(m)Z) ddl*m+Z+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1m2Z2 e 3e4Z5e6edrej7du rg de_7eG dd deZ8eG dd deZ9G dd deZ:G dd de/eZ;e<e)e; dgZ=dS )    )	dataclass)DictListOptionalTupleUnionCallable)tqdmN)	AutoModelAutoModelForCausalLM)GenerationMixinGenerationConfigLogitsProcessorLogitsProcessorListStoppingCriteriaList)BaseModelOutputWithPastModelOutput)modeling_utils)PreTrainedModel)FlashAttentionKwargs)logging   ) VibeVoiceTokenizerStreamingCacheVibeVoiceTokenizerEncoderOutput)VibeVoiceDiffusionHead)DPMSolverMultistepScheduler)VibeVoiceConfig)VibeVoiceTextTokenizerVibeVoiceTextTokenizerFast)VibeVoiceModelVibeVoicePreTrainedModel)AudioStreamerAsyncAudioStreamerALL_PARALLEL_STYLES)tpnonecolwiserowwisec                   @   s    e Zd ZU dZeej ed< dS )VibeVoiceCausalLMOutputWithPastNlogits)__name__
__module____qualname__r)   r   torchFloatTensor__annotations__ r0   r0   W/home/ubuntu/VibeVoice-finetuning/src/vibevoice/modular/modeling_vibevoice_inference.pyr(   "   s   
 r(   c                   @   sH   e Zd ZU dZdZejed< dZe	e
ej  ed< dZe	ej ed< dS )VibeVoiceGenerationOutputaH  
    Output type for VibeVoice generation.
    
    Args:
        sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            The generated sequences. 
        speech_outputs (`List[torch.FloatTensor]`, *optional*):
            List of generated speech waveforms or latents for each speech segment.
    N	sequencesspeech_outputsreach_max_step_sample)r*   r+   r,   __doc__r3   r-   
LongTensorr/   r4   r   r   r.   r5   
BoolTensorr0   r0   r0   r1   r2   &   s
   
 	r2   c                   @   sF   e Zd ZdZddee dejfddZdej	dej
d	ej
fd
dZdS )!VibeVoiceTokenConstraintProcessorzJConstrains token generation to only valid tokens during speech generation.Nvalid_token_idsdevicec                 C   s   t j|t j|d| _d S )Ndtyper;   )r-   tensorlongr:   )selfr:   r;   r0   r0   r1   __init__8   s   z*VibeVoiceTokenConstraintProcessor.__init__	input_idsscoresreturnc                 C   s.   t |td}d|d d | jf< || }|S )Nz-infr   )r-   	full_likefloatr:   )r@   rB   rC   maskr0   r0   r1   __call__;   s   z*VibeVoiceTokenConstraintProcessor.__call__N)r*   r+   r,   r6   r   intr-   r;   rA   r7   r.   rH   r0   r0   r0   r1   r9   5   s     r9   c                &       s  e Zd ZdgZddiZ fddZedd Zedd	 Zed
d Z	edd Z
edd Zedd Zedd Zedd Zdd Zdd Zdd Zdd Zdd ZdSd!d"ZdTd#d$ZdUd&d'Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	(dVd)ejd*eej d+eej d,eeeej   d-eej d.eej d/ee d0ee d1ee d2ee d3eej d4eej d5eej d6eej d7ee e!f d8eee"f f d9d:Z#dWd<d=Z$e% 	 	 	 	 	 	 	 	 	 	 	 	 	 	>	?	 dXd@eej dAee& dBee' dCee( dDee)e ejge*e  f  dEee dFedG dHeee+e,f  dIeej dJeej d4eej d5eej d6eej dKedLe-dMee)g ef  d8eeje.f f"dNdOZ/e% dYdQdRZ0  Z1S )Z*VibeVoiceForConditionalGenerationInferencezlm_head.weightlm_headcolwise_repc                    sF   t  | t|| _tj|jj|jjdd| _	|j
j| _|   d S )NF)bias)superrA   r   modelnnLineardecoder_confighidden_size
vocab_sizerL   diffusion_head_configddpm_num_inference_stepsddpm_inference_steps	post_init)r@   config	__class__r0   r1   rA   H   s
   

z3VibeVoiceForConditionalGenerationInference.__init__c                 C      | j jS rI   )rP   noise_schedulerr@   r0   r0   r1   r^   W      z:VibeVoiceForConditionalGenerationInference.noise_schedulerc                 C   r]   rI   )rP   prediction_headr_   r0   r0   r1   ra   [   r`   z:VibeVoiceForConditionalGenerationInference.prediction_headc                 C   r]   rI   )rP   speech_scaling_factorr_   r0   r0   r1   rb   _   r`   z@VibeVoiceForConditionalGenerationInference.speech_scaling_factorc                 C   r]   rI   )rP   speech_bias_factorr_   r0   r0   r1   rc   c   r`   z=VibeVoiceForConditionalGenerationInference.speech_bias_factorc                 C   r]   rI   )rP   acoustic_tokenizerr_   r0   r0   r1   rd   g   r`   z=VibeVoiceForConditionalGenerationInference.acoustic_tokenizerc                 C   r]   rI   )rP   semantic_tokenizerr_   r0   r0   r1   re   k   r`   z=VibeVoiceForConditionalGenerationInference.semantic_tokenizerc                 C   r]   rI   )rP   acoustic_connectorr_   r0   r0   r1   rf   o   r`   z=VibeVoiceForConditionalGenerationInference.acoustic_connectorc                 C   r]   rI   )rP   semantic_connectorr_   r0   r0   r1   rg   s   r`   z=VibeVoiceForConditionalGenerationInference.semantic_connectorc                 C   sF   t | jdds	dS t| drt| jjdr!| jjjj| j_dS dS dS )zY
        Tie the weights between the input embeddings and the output embeddings.
        tie_word_embeddingsFNrL   embed_tokens)getattrrZ   hasattrrP   language_modelri   weightrL   r_   r0   r0   r1   tie_weightsw   s
   z6VibeVoiceForConditionalGenerationInference.tie_weightsc                 C   s
   | j  S rI   )rP   get_input_embeddingsr_   r0   r0   r1   ro         
z?VibeVoiceForConditionalGenerationInference.get_input_embeddingsc                 C   s   | j | d S rI   )rP   set_input_embeddings)r@   valuer0   r0   r1   rq      s   z?VibeVoiceForConditionalGenerationInference.set_input_embeddingsc                 C   s   | j S rI   rL   r_   r0   r0   r1   get_output_embeddings   s   z@VibeVoiceForConditionalGenerationInference.get_output_embeddingsc                 C   s
   || _ d S rI   rs   )r@   new_embeddingsr0   r0   r1   set_output_embeddings   rp   z@VibeVoiceForConditionalGenerationInference.set_output_embeddingsNc                 C   s   | j || dS )z@Set the speech tokenizers used for encoding and decoding speech.N)rP   set_speech_tokenizers)r@   rd   re   r0   r0   r1   rw      s   z@VibeVoiceForConditionalGenerationInference.set_speech_tokenizersc                 C   s   |p| j jj| _d S rI   )rZ   rV   rW   rX   )r@   	num_stepsr0   r0   r1   set_ddpm_inference_steps   s   zCVibeVoiceForConditionalGenerationInference.set_ddpm_inference_stepsaudioc                 C   s,  t   |dkrE| jj|d}|j| jjjdd }|| jj	|j
 | jj	|j
 }| j||  }||fW  d   S |dkrt|| jjjd}|j| jjjdd }|| jj	|j
 | jj	|j
 }| j||  }||fW  d   S td| d	1 sw   Y  dS )
z8Process speech inputs through tokenizers and connectors.rz   r   )	dist_typer   Npt)meanstdzSpeech type z not implemented)r-   no_gradrP   rd   encode	unsqueezesamplestd_dist_typerc   tor;   rb   rf   cpur   rZ   fix_stdNotImplementedError)r@   speech_tensorsspeech_masksspeech_typeencoder_outputacoustic_latentsacoustic_featuresacoustic_connectedr0   r0   r1   _process_speech_inputs   s"   
$$zAVibeVoiceForConditionalGenerationInference._process_speech_inputsr   rB   attention_maskposition_idspast_key_valuesinputs_embedslabels	use_cacheoutput_attentionsoutput_hidden_statesreturn_dictcache_positionr   r   speech_input_masklogits_to_keeprD   c                 K   s   |
dur|
n| j j}
|du r| j |}|dur1|dur1| || j|\}}|dur1|||< | jd|||||||	|
|d	|}|
sI|d n|j}t|t	rWt
| dn|}| |dd|ddf }|durotdt||j||jdS )a  
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
            speech_tensors (`torch.FloatTensor`, *optional*):
                Input speech waveforms for voice cloning or speech understanding.
            speech_masks (`torch.BoolTensor`, *optional*):
                Masks indicating valid speech frames.
            speech_input_mask (`torch.BoolTensor`, *optional*):
                Positions in the input sequence where speech embeddings should be inserted.
        
        Returns:
            `VibeVoiceCausalLMOutputWithPast` or tuple
        N)	r   r   r   r   r   r   r   r   r   r   z4Loss computation is not implemented in this version.)r)   r   last_hidden_state
attentionsr0   )rZ   use_return_dictrP   ro   r   r   r=   r   
isinstancerJ   slicerL   r   r(   r   r   )r@   rB   r   r   r   r   r   r   r   r   r   r   r   r   r   r   kwargsr   speech_embedsoutputshidden_statesslice_indicesr)   r0   r0   r1   forward   s>   #
z2VibeVoiceForConditionalGenerationInference.forwardFc              	   K   s  |d u rt |j|j|jd}nt di ||j|j|jd}| j|df|j|j|jd|\}}|j|_|j|_|j|_| ||j|\}}}|j	d }	| j
}
| j|d|
d d|_|j|d< || j
}|j	d }|dd u ow|jd u}|d	d u o|jd u}| j||||||d
}|jd }| ||d |	||
 tj||
tjd|d< | D ]\}}t|tjr|j|
d||< q|r| j|||d t |j
|d}| j|t d}|||||fS |||fS )N)bos_token_ideos_token_idpad_token_idT)speech_start_idspeech_end_idspeech_diffusion_idr   r;   r   r   
max_length
min_length)generation_confighas_default_max_lengthhas_default_min_lengthmodel_input_nameinputs_tensorinput_ids_length)r;   r=   r   )r   input_ids_seq_lengthencoder_input_idsprefix_allowed_tokens_fnlogits_processorr;   model_kwargs)r   stopping_criteriar0   )r   r   r   r   _prepare_generation_configr   r   r   _prepare_model_inputsshaper;   _prepare_special_tokensr   r   getr   r   _prepare_generated_length_prepare_cache_for_generationr-   aranger?   itemsr   Tensor_get_logits_processorr   _get_stopping_criteriar   )r@   r   inputs	tokenizerreturn_processorsr   r   r   r   
batch_sizer;   rB   r   r   r   max_cache_lengthkvr   r   r0   r0   r1   #_build_generate_config_model_kwargs   s   





	

zNVibeVoiceForConditionalGenerationInference._build_generate_config_model_kwargsT      ?r   r   r   r   r   synced_gpusassistant_modelr   audio_streamernegative_prompt_idsnegative_prompt_attention_maskreturn_speech	cfg_scalestop_check_fnc           Z   
   K   s  | dd}| dd}| dd}| dd}|dddu r.| jjj|d jd	  |d< | j|||fd
di|\}}}}}tj|d jd df|j	tj
|d jdtj|d jd dftj
|d jd|ddd}| jdd|fd
di|\}}}t }t }|jd }|j}tj|tj|d} tj|tj
|d}!d}"d}#|dd}$dd t|D }%|jd	 }&|d jd	d}'|j	|j|j|jg}(t|dr|jdur|(|j t|(|d})|du rt }||) t|j|& t||& }*t|j|' ||' 
 }+tj|tj|d},|ddrtt|*ddd}-nt|*}-|-D ]}.|durG| rG|$r;td|.d   |durD|   n{|durgt|drgt |j!rg|$rdtd|.d    n[| " rzt|-drw|-#d   nH|jd	 |jkrtd!|j d" tj$||d|   }/|/% dkrd|,|/<  nt|-dr|   & }0|-#d#|0 d$| d% | j'|fi |}1|"r|j(|d|(||(|d&}2d}"n
|1 d'd}3d'|#i}2| d7i |1|2ddddd(}4| j)|4|dd)}|4j*ddd	ddf j(dtj+|jd*}5|||5}6|j,r.t-j.j/|6d	d}7tj0|7dd+1d}8ntj2|6d	d}8|j|8| < tj3||8dddf gd	d}|d,ds| j'|fi |}9|9d' du rm|#durm|#|9d'< d|9d< | d7i |9ddddd(}:| j)|:|dd)}tj3||8dddf gd	d}|8|jk  r|8|jkj4dd-1d};|;| |;   }<|<% dkrd| |<< |$rtd.|<5  d/|.d  d0dd1 |dur||< |.|+k}=tj4|=|  @ dd-1d}>|>% dkrd| |>< d|,|>< |$r
td.|>5  d2|.d  d0dd1 |dur||> |8|jkj4dd-1d}?|?% dkr1|6|? |6|? tj$||d|  |8|j	k@  }@|@% dkr|d,drt7|@5 D ]\}A}Bd|d |Bddf< d|d |Bd	f< qTt7t8|d3 j9|d3 j:D ]?\}C\}D}E|@5 D ]3}B|D|Bdddddf ; |D|Bddd	ddf< |E|Bdddddf ; |E|Bddd	ddf< qqy|@5 D ]
}B|j	||Bd	f< q| j<= |8>d}Ftj$||d|  |8|jk@  }G|G% dkr|d,dr1| j'|fi |}9|9d' du r|#dur|#|9d'< d|9d< | d7i |9ddddd(}:| j)|:|dd)}tj3||8dddf gd	d}|  |8|jk@ }H|H  r-tj$||d|H }I|!|I }J|d jd }Kt7t8|I5 |J5 D ].\}A\}B}L|Ld |Kd k r|d |B|Ld	f ; |d |B|Ld df< d|d |B|Lf< q]t7t8|d3 j9|d3 j:D ]^\}C\}D}Et8|I5 |J5 D ]M\}B}L|Ld |Djd d k r|D|Bdd|Ld	ddf ; |D|Bdd|Ld dddf< |E|Bdd|Ld	ddf ; |E|Bdd|Ld dddf< qqt8|I5 |J5 D ]#\}B}L|Ld |jd d k r#||B|Ld	f ; ||B|Ld df< q|!|I  d7  < |4j?|Gd	ddf }M|:j?|Gd	ddf }N| j@|M|N|d4>d}O|O| j<jA(|Oj | j<jB(|Oj }P| j<jCjD|P(| j<jCj||G(| j<jCjddd5}Qt7|GD ]\}A}B|B& }R| |R s|%|R |Q|A  qz|dur|E|Q|G | j<jFjG|Q||Gddd5jH}S| j<I|O}T| j<J|S}U|T|U }V|V|F|G< |F}#q#|dur|  g }W|%D ]}X|Xrtj3|Xd	d}Y|W|Y q|Wd qtK||r|W|,d6S d|,d6S )8a+  
        Generates sequences of token ids and optionally speech outputs.
        
        Args:
            All standard generation arguments from GenerationMixin
            negative_prompt_ids: Negative prompt for CFG in speech generation
            negative_prompt_attention_mask: Attention mask for negative prompt
            speech_tensors: Input speech for voice cloning
            speech_masks: Masks for speech tensors  
            speech_input_mask: Positions to insert speech embeddings
            return_speech: Whether to decode and return speech outputs
            cfg_scale: CFG scale for speech generation
            stop_check_fn: Optional callable that returns True if generation should stop
 
        Returns:
            Generated token sequences and optionally speech outputs
        r   Nparsed_scriptsall_speakers_listmax_length_times   max_new_tokensrB   r   Tr   r   r<   d   )rB   r   r   Fverbosec                 S   s   g | ]}g qS r0   r0   ).0_r0   r0   r1   
<listcomp>  s    zGVibeVoiceForConditionalGenerationInference.generate.<locals>.<listcomp>r   dimr   r   show_progress_bar
Generating)descleavez&Generation stopped externally at step finished_flagsz,Audio generation stopped externally at step set_descriptionzGeneration completez"Reached maximum generation length z, stopped it.zGenerating (active: /))r   r   r   r   )r   r   r   r   )is_encoder_decoder)copyr=   r;   )num_samplesrefresh_negative)as_tuplezSamples z reached EOS token at step .)flushz' reached max generation length at step r   )r   )cachesample_indicesr   debug)r3   r4   r5   r0   )Lpopr   rZ   rS   max_position_embeddingsr   r   r-   fullr   r?   r;   onesr   zerosboolrangesumr   r   r   rk   r   appendr9   r   minr   rJ   r	   printendanyr   allr   r   numelitemprepare_inputs_for_generationr   #_update_model_kwargs_for_generationr)   float32	do_samplerQ   
functionalsoftmaxmultinomialsqueezeargmaxcatnonzerotolistset_to_zero	enumeratezip	key_cachevalue_cacheclonerP   ro   r   r   sample_speech_tokensrb   rc   rd   decodeputre   r   r}   rf   rg   r2   )Zr@   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rB   negative_kwargsnegative_generation_confignegative_model_kwargsnegative_input_idsacoustic_cachesemantic_cacher   r;   finished_tagscorrect_cnt
is_prefillr   r   audio_chunksinitial_lengthinitial_length_per_samplevalid_tokenstoken_constraint_processor	max_stepsmax_step_per_sampler5   progress_barstepreached_samplesactive_samplesmodel_inputsprefill_inputsr   r   next_token_logitsnext_token_scoresprobsnext_tokensnegative_model_inputsnegative_outputseos_indicesnew_eos_indicesmax_length_reachednew_max_length_indicesdiffusion_end_indicesdiffusion_start_indicesi
sample_idx	layer_idxk_cachev_cachenext_inputs_embedsdiffusion_indicesnon_diffusion_masknon_diffusion_indicesstart_indicesseq_len	start_idxpositive_conditionnegative_conditionspeech_latentscaled_latentaudio_chunkidxsemantic_featuresacoustic_embedsemantic_embeddiffusion_embedsfinal_audio_outputssample_chunksconcatenated_audior0   r0   r1   generateF  s  '($









(


"

"



04

"<<$	

	
z3VibeVoiceForConditionalGenerationInference.generate      @c                 C   s
  | j j| j tj||gdd| j jj}t	|j
d | jj|}| j jjD ]P}|d t|d  }tj||gdd}| j j|||j
d ||d}tj|t|d dd\}	}
|
||	|
   }tj||gdd}| j j|||j}q*|d t|d  S )Nr   r   r   )	condition)rP   r^   set_timestepsrX   r-   r  r   ra   r;   randnr   rZ   acoustic_vae_dim	timestepslenrepeatsplitr.  prev_sample)r@   rZ  neg_conditionr   speechthalfcombinedepscond_eps
uncond_epshalf_epsr0   r0   r1   r    s   $z?VibeVoiceForConditionalGenerationInference.sample_speech_tokens)NNrI   )rz   )NNNNNNNNNNNNNNr   )F)NNNNNNNNNNNNNTr   N)rY  )2r*   r+   r,   _tied_weights_keys_tp_planrA   propertyr^   ra   rb   rc   rd   re   rf   rg   rn   ro   rq   rt   rv   rw   ry   r   r-   r7   r   r   r   r.   r   r8   r   rJ   r   r(   r   r   r   r   r   r   r   r   r!   r"   rF   r2   rX  r  __classcell__r0   r0   r[   r1   rK   D   s
   










 	




KH	
  rrK   )>dataclassesr   typingr   r   r   r   r   r   r	   r-   torch.nnrQ   transformers.models.autor
   r   transformers.generationr   r   r   r   r   transformers.modeling_outputsr   r   transformersr   transformers.modeling_utilsr   +transformers.modeling_flash_attention_utilsr   transformers.utilsr   modular_vibevoice_tokenizerr   r    modular_vibevoice_diffusion_headr   vibevoice.schedule.dpm_solverr   configuration_vibevoicer    modular_vibevoice_text_tokenizerr   r   modeling_vibevoicer   r    streamerr!   r"   
get_loggerr*   loggerrk   r#   r(   r2   r9   rK   register__all__r0   r0   r0   r1   <module>   sH     

     