o
    ꁱiFW                     @   s  d dl mZ d dlmZmZmZmZmZmZ d dl	m	Z	 d dl
Z
d dlmZ d dlm  mZ d dlmZ d dlmZmZ d dlmZ d dlmZmZmZ d dlmZ d d	lmZ d d
l m!Z! d dl"m#Z# d dl$m%Z% ddl&m'Z'm(Z(m)Z) ddl*m+Z+ d dl,m-Z- ddl.m/Z/ e%0e1Z2e3edrej4du rg de_4eG dd deZ5eG dd deZ6G dd dej7Z8G dd de!Z9G dd de9Z:G dd de9Z;e<e/e: e<e/e; g d Z=dS )!    )	dataclass)DictListOptionalTupleUnionCallable)tqdmN)	AutoModelAutoModelForCausalLM)ACT2FN)CausalLMOutputBaseModelOutputWithPastModelOutput)LlamaRMSNorm)modeling_utils)PreTrainedModel)FlashAttentionKwargs)logging   ) VibeVoiceTokenizerStreamingCacheVibeVoiceAcousticTokenizerModelVibeVoiceSemanticTokenizerModel)VibeVoiceDiffusionHead)DPMSolverMultistepScheduler)VibeVoiceConfigALL_PARALLEL_STYLES)tpnonecolwiserowwisec                   @   s   e Zd ZU dZeej ed< dZeej ed< dZ	ee
 ed< dZejed< dZeeeej   ed< dZeeejdf  ed< dZeeejdf  ed	< dS )
VibeVoiceCausalLMOutputWithPastNlossdiffusion_lossspeech_token_numlogitspast_key_values.hidden_states
attentions)__name__
__module____qualname__r"   r   torchFloatTensor__annotations__r#   r$   intr%   r&   r   r'   r(    r0   r0   H/home/ubuntu/vibevoice-community/vibevoice/modular/modeling_vibevoice.pyr!       s   
 r!   c                   @   s6   e Zd ZU dZdZejed< dZe	e
ej  ed< dS )VibeVoiceGenerationOutputaH  
    Output type for VibeVoice generation.
    
    Args:
        sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            The generated sequences. 
        speech_outputs (`List[torch.FloatTensor]`, *optional*):
            List of generated speech waveforms or latents for each speech segment.
    N	sequencesspeech_outputs)r)   r*   r+   __doc__r3   r,   
LongTensorr.   r4   r   r   r-   r0   r0   r0   r1   r2   +   s   
 	r2   c                       s$   e Zd Z fddZdd Z  ZS )SpeechConnectorc                    s8   t    t||| _t|dd| _t||| _d S )Ngư>)eps)super__init__nnLinearfc1r   normfc2)self	input_dim
output_dim	__class__r0   r1   r:   ;   s   
zSpeechConnector.__init__c                 K   s"   |  |}| |}| |}|S N)r=   r>   r?   )r@   featureskwargsxr0   r0   r1   forwardA   s   


zSpeechConnector.forward)r)   r*   r+   r:   rI   __classcell__r0   r0   rC   r1   r7   :   s    r7   c                   @   s<   e Zd ZeZdZdZdZdZdZ	dZ
dZdZdZdd ZdS )VibeVoicePreTrainedModelmodelTr&   c                 C   s   t |tr|  d S t| jdrt| jjdr| jjj}nt| jdr1t| jjdr1| jjj}nd}t |tj	rQ|j
jjd|d |jd urO|jj  d S d S t |tjrf|j
jd |jj  d S d S )Nlanguage_model_configinitializer_rangedecoder_configg{Gz?        )meanstd      ?)
isinstancer   initialize_weightshasattrconfigrM   rN   rO   r;   r<   weightdatanormal_biaszero_	LayerNormfill_)r@   modulerR   r0   r0   r1   _init_weightsU   s"   

z&VibeVoicePreTrainedModel._init_weightsN)r)   r*   r+   r   config_classbase_model_prefixsupports_gradient_checkpointing_skip_keys_device_placement_supports_cache_class_supports_flash_attn_2_supports_sdpa_supports_quantized_cache_supports_static_cache_supports_attention_backendr`   r0   r0   r0   r1   rK   I   s    rK   c                       s   e Zd Z fddZdd Zdd Zddd	Z										dd
ejde	ej
 de	ej de	eeej   de	ej de	e de	e de	e de	e de	ej deeef fddZ  ZS )VibeVoiceModelc                    s  t  | t|dr!|jd ur!t|jtrtt|j}n|j}ntj}|j	}t
|| _t
|j|| _t
|j|| _t|j|j|| _t|j|j|| _| dttd | dttd t
|j|| _t|jj|jj|jj d| _!d S )Ntorch_dtypespeech_scaling_factornanspeech_bias_factor)num_train_timestepsbeta_scheduleprediction_type)"r9   r:   rV   rl   rT   strgetattrr,   float32rO   r
   from_configlanguage_modelacoustic_tokenizer_configtoacoustic_tokenizersemantic_tokenizer_configsemantic_tokenizerr7   acoustic_vae_dimhidden_sizeacoustic_connectorsemantic_vae_dimsemantic_connectorregister_buffertensorfloatdiffusion_head_configprediction_headr   ddpm_num_stepsddpm_beta_schedulerr   noise_scheduler)r@   rW   dtype	lm_configrC   r0   r1   r:   l   s(   zVibeVoiceModel.__init__c                 C   sL   t | jdr
| jjS | jj D ]\}}|jdkr!t| j|  S qJ d)Nembed_tokenszembed_tokens.weightFzshould not arrive here)rV   rw   r   fullmapitems	orig_namert   )r@   nameattrr0   r0   r1   get_input_embeddings   s   
z#VibeVoiceModel.get_input_embeddingsc                 C      || j _d S rE   )rw   r   r@   valuer0   r0   r1   set_input_embeddings      z#VibeVoiceModel.set_input_embeddingsNc                 C   s<   || _ || _| j dur| j   | jdur| j  dS dS )z@Set the speech tokenizers used for encoding and decoding speech.N)rz   r|   eval)r@   rz   r|   r0   r0   r1   set_speech_tokenizers   s   


z$VibeVoiceModel.set_speech_tokenizers	input_idsattention_maskposition_idsr&   inputs_embeds	use_cacheoutput_attentionsoutput_hidden_statesreturn_dictcache_positionreturnc                 K   sZ   |	d ur|	n| j j}	| jd|||||||||	|
d
|}|	s!|S t|j|j|j|jdS )N
r   r   r   r&   r   r   r   r   r   r   )last_hidden_stater&   r'   r(   r0   )rW   use_return_dictrw   r   r   r&   r'   r(   )r@   r   r   r   r&   r   r   r   r   r   r   rG   outputsr0   r0   r1   rI      s.   zVibeVoiceModel.forward)NN)
NNNNNNNNNN)r)   r*   r+   r:   r   r   r   r,   r6   r   Tensorr   r-   boolr   r   rI   rJ   r0   r0   rC   r1   rk   k   sL    $

	

rk   c                ,       sp  e Zd ZdgZddiZ fddZdd Zdd	 Zd
d Zdd Z	dd Z
dd Zdd Z				d0ddZ																		d1dejdeej deej deeej  deej deej d ee d!ee d"ee d#ee d$eej d%eej d&eej d'eej d(eej d)eej d*eej d+ed,eeeeejef f  d-eeef f(d.d/Z  ZS )2!VibeVoiceForConditionalGenerationzlm_head.weightlm_headcolwise_repc                    sD   t  | t|| _|jj| _tj|jj| jdd| _	| 
  d S )NF)r[   )r9   r:   rk   rL   rO   
vocab_sizer;   r<   r~   r   	post_init)r@   rW   rC   r0   r1   r:      s
   

z*VibeVoiceForConditionalGeneration.__init__c                 C   s
   | j  S rE   )rL   r   r@   r0   r0   r1   r      s   
z6VibeVoiceForConditionalGeneration.get_input_embeddingsc                 C   s   | j | d S rE   )rL   r   r   r0   r0   r1   r      s   z6VibeVoiceForConditionalGeneration.set_input_embeddingsc                 C   s   | j S rE   r   r   r0   r0   r1   get_output_embeddings   s   z7VibeVoiceForConditionalGeneration.get_output_embeddingsc                 C   r   rE   rL   rw   )r@   decoderr0   r0   r1   set_decoder   r   z-VibeVoiceForConditionalGeneration.set_decoderc                 C   s   | j jS rE   r   r   r0   r0   r1   get_decoder   s   z-VibeVoiceForConditionalGeneration.get_decoderc                 C   s   t | jjddrD|  }|  }t|dr|j|_n||_t |dddur>tj	|j
jd|jjd |j
jd  fdd|j
_td dS td	 dS )
zY
        Tie the weights between the input embeddings and the output embeddings.
        tie_word_embeddingsFrX   r[   Nr   constantz;Tied input and output embeddings using standard assignment.z0tie_word_embeddings is False, not tying weights.)rt   rW   rO   r   r   rV   rX   r;   
functionalpadr[   rY   shapeprint)r@   output_embeddingsinput_embeddingsr0   r0   r1   tie_weights   s   

z-VibeVoiceForConditionalGeneration.tie_weightsc                 C   s
   || _ d S rE   r   )r@   new_embeddingsr0   r0   r1   set_output_embeddings  s   
z7VibeVoiceForConditionalGeneration.set_output_embeddingsNaudioFc              	   C   s  |d u r | j jj}tdd||  j}| j	|}||fS t
  |dkrWt
  | jj|dd d }W d    n1 sGw   Y  || jjjd }	nQ|dkr| j jj}||dd|}
|
d}| jjjd }tj||
j|
jd| }|jdgdg|
 d  R  }|
|t|
j|
  }	ntd| d	t| jjst| jjr+d
|	|    }|	|    }t ! rt " rt j#|t j$j%d t j#|t j$j%d t & }| jj'||  | jj'||  t(d| jj d| jj dd n| jj'| | jj'| t(d| jj d| jj dd |	| jj | jj }W d    n	1 s@w   Y  | j	|}|rR||fS || || fS )Nr   r   r   vaeg?)r   devicezSpeech type  not implementedrS   )opz%Speech scaling factor (distributed): z, bias factor: T)flushz(Speech scaling factor (single process): ))rW   rx   vae_dimr,   zerosry   r   rX   rL   r   no_gradrz   encode	unsqueezesamplestd_dist_typereshapesizefix_stdrandnr   r   viewdimr   NotImplementedErrorisnanrm   ro   flattenrR   rQ   distis_availableis_initialized
all_reduceReduceOpSUMget_world_sizecopy_r   )r@   speech_tensorsspeech_masksspeech_typereturn_unmaskr   audio_featuresconnect_featuresframesaudio_tokensspeech_mode
batch_sizer   rR   scaling_factorbias_factor
world_sizer0   r0   r1   forward_speech_features  sN   



 " (z9VibeVoiceForConditionalGeneration.forward_speech_featuresr   r   r   r   r&   r   labelsr   r   r   r   r   r   r   speeches_loss_inputspeech_semantic_tensorsacoustic_input_maskacoustic_loss_maskddpm_batch_mulrG   r   c           ,      K   s,  |
d ur|
n| j j}
|  |}| j|}|d urX| j|d ur%||nd ||dddd\}}|d urW|d urE|| ||  ||< n|| ||< ||@ }|| }|| }n| j|d urc||nd ||ddd\}}|d urw|||< | jd ||||||d|
|d
}|j}| 	|}d }|d ur	 d }|d ur;|
  dkr;|| } |j\}!}"tj|!| |"f|j|jd	}#tjt| j jj|!| dd
|j}$|j|dd}%| j|dd}&| jj|%|#|$}'| j|'|$||&}(| j jj})|)dkr|#}*n|)dkr| jj|%|#|$}*ntd|) dtj|( |* dd}|"dkr2|dkr2||" | }n:tj d|jd}n1t
dd | jj! D d }|t
dd | jj"! D d 7 }|t
dd | jj! D d 7 }|
s||!f|# dd   }+||f|+ S t$|||d ur|!nd||j%|j&|j'dS )Nr   r   T)r   r   r   r   )r   r   r   Fr   r   )r   r   )replacement)r   epsilonv_predictionzPrediction type r   sum)	reductionrP   )r   c                 s       | ]}|  V  qd S rE   r   .0pr0   r0   r1   	<genexpr>      z<VibeVoiceForConditionalGeneration.forward.<locals>.<genexpr>c                 s   r   rE   r   r   r0   r0   r1   r     r   c                 s   r   rE   r   r   r0   r0   r1   r     r   r   )r"   r#   r$   r%   r&   r'   r(   )(rW   r   r   rL   r   r   type_asgetr   r   r   itemr   r,   r   r   r   multinomialonesr   r   ry   repeat_interleaver   	add_noiser   rr   get_velocityr   Fmse_lossr   r   
parametersr   to_tupler!   r&   r'   r(   ),r@   r   r   r   r&   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rG   rH   $semantic_speech_all_connect_featuresspeech_all_featuresspeech_all_connect_featurestarget_latent_maskspeech_featuresspeech_connect_featuresr   r'   r%   r"   r#   condition_features
speech_lenlatent_sizenoise	timestepsspeech_features_repeatedcondition_features_repeatednoisy_speech_featuresmodel_outputrr   target_for_lossoutputr0   r0   r1   rI   L  s   








""z)VibeVoiceForConditionalGeneration.forward)NNr   F)NNNNNNFNNNNNNNNNNr   )r)   r*   r+   _tied_weights_keys_tp_planr:   r   r   r   r   r   r   r   r   r,   r6   r   r   r   r-   r   
BoolTensorr/   r   rs   r   r   r!   rI   rJ   r0   r0   rC   r1   r      s    
=	

r   )rk   rK   r   r!   r2   )>dataclassesr   typingr   r   r   r   r   r   r	   r,   torch.nnr;   torch.nn.functionalr   r  torch.distributeddistributedr   transformers.models.autor
   r   transformers.activationsr   transformers.modeling_outputsr   r   r   (transformers.models.llama.modeling_llamar   transformersr   transformers.modeling_utilsr   +transformers.modeling_flash_attention_utilsr   transformers.utilsr   modular_vibevoice_tokenizerr   r   r    modular_vibevoice_diffusion_headr   vibevoice.schedule.dpm_solverr   configuration_vibevoicer   
get_loggerr)   loggerrV   r   r!   r2   Moduler7   rK   rk   r   register__all__r0   r0   r0   r1   <module>   sF     


"i  