o
    ꁱiX                     @   s&  d dl mZmZmZmZ d dlZd dlmZ d dlm	Z	m
Z
 d dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d	d
lmZmZ d	dlmZ d	dlmZmZ eeZ e!edrcej"du rhg de_"G dd deZ#G dd de#Z$G dd de#eZ%e	&ee$ e
&ee% g dZ'dS )    )ListOptionalTupleUnionN)	AutoModelAutoModelForCausalLM)CausalLMOutputBaseModelOutputWithPast)modeling_utils)PreTrainedModel)logging)GenerationMixin   ) VibeVoiceTokenizerStreamingCacheVibeVoiceTokenizerEncoderOutput)VibeVoiceASRConfig)VibeVoiceCausalLMOutputWithPastSpeechConnectorALL_PARALLEL_STYLES)tpnonecolwiserowwisec                   @   s@   e Zd ZeZdZdZdZdZdZ	dZ
dZdZdZdZdd ZdS )VibeVoiceASRPreTrainedModelmodelTpast_key_valuesc                 C   s   t | jdrt | jjdr| jjj}nt | jdr&t | jjdr&| jjj}nd}t|tjrF|jj	j
d|d |jd urD|jj	  d S d S t|tjr[|jj	d |jj	  d S d S )Nlanguage_model_configinitializer_rangedecoder_configg{Gz?g        meanstdg      ?)hasattrconfigr   r   r   
isinstancennLinearweightdatanormal_biaszero_	LayerNormfill_)selfmoduler!    r0   L/home/ubuntu/vibevoice-community/vibevoice/modular/modeling_vibevoice_asr.py_init_weights+   s   
z)VibeVoiceASRPreTrainedModel._init_weightsN)__name__
__module____qualname__r   config_classbase_model_prefixsupports_gradient_checkpointing_skip_keys_device_placement_supports_cache_class_supports_flash_attn_supports_flash_attn_2_supports_sdpa_supports_quantized_cache_supports_static_cache_supports_attention_backendr2   r0   r0   r0   r1   r      s    r   c                       s   e Zd Z fddZdd Zdd Zddd	Z										dd
ejde	ej
 de	ej de	eeej   de	ej de	e de	e de	e de	e de	ej deeef fddZ  ZS )VibeVoiceASRModelc                    s   t  | t|dr!|jd ur!t|jtrtt|j}n|j}ntj}|j	}t
|| _t
|j|| _t
|j|| _t|j|j|| _t|j|j|| _d S )Ntorch_dtype)super__init__r"   rB   r$   strgetattrtorchfloat32r   r   from_configlanguage_modelacoustic_tokenizer_configtoacoustic_tokenizersemantic_tokenizer_configsemantic_tokenizerr   acoustic_vae_dimhidden_sizeacoustic_connectorsemantic_vae_dimsemantic_connector)r.   r#   dtype	lm_config	__class__r0   r1   rD   ?   s   zVibeVoiceASRModel.__init__c                 C   sL   t | jdr
| jjS | jj D ]\}}|jdkr!t| j|  S qJ d)Nembed_tokenszembed_tokens.weightFzshould not arrive here)r"   rJ   rY   fullmapitems	orig_namerF   )r.   nameattrr0   r0   r1   get_input_embeddingsU   s   
z&VibeVoiceASRModel.get_input_embeddingsc                 C      || j _d S N)rJ   rY   r.   valuer0   r0   r1   set_input_embeddings_      z&VibeVoiceASRModel.set_input_embeddingsNc                 C   s<   || _ || _| j dur| j   | jdur| j  dS dS )z@Set the speech tokenizers used for encoding and decoding speech.N)rM   rO   eval)r.   rM   rO   r0   r0   r1   set_speech_tokenizersb   s   


z'VibeVoiceASRModel.set_speech_tokenizers	input_idsattention_maskposition_idsr   inputs_embeds	use_cacheoutput_attentionsoutput_hidden_statesreturn_dictcache_positionreturnc                 K   sZ   |	d ur|	n| j j}	| jd|||||||||	|
d
|}|	s!|S t|j|j|j|jdS )N
rh   ri   rj   r   rk   rl   rm   rn   ro   rp   )last_hidden_stater   hidden_states
attentionsr0   )r#   use_return_dictrJ   r	   rs   r   rt   ru   )r.   rh   ri   rj   r   rk   rl   rm   rn   ro   rp   kwargsoutputsr0   r0   r1   forwardn   s.   zVibeVoiceASRModel.forward)NN)
NNNNNNNNNN)r3   r4   r5   rD   r_   rd   rg   rG   
LongTensorr   Tensorr   FloatTensorboolr   r	   ry   __classcell__r0   r0   rW   r1   rA   >   sL    

	

rA   c                #       s|  e Zd ZdZdgZddiZ fddZdd Zd	d
 Zdd Z	dd Z
dd Zdd Zdd Z			d/dejdeej deej defddZ															d0deej deej deej d eeej  d!eej d"eej d#ee d$ee d%ee d&ee d'eej deej deej deej d(eej d)eeef f d*d+Z						,				d1d-d.Z  ZS )2$VibeVoiceASRForConditionalGenerationz
    VibeVoice model for Automatic Speech Recognition (ASR) with language modeling head for conditional generation.
    This class is designed for inference and generation tasks.
    zlm_head.weightlm_headcolwise_repc                    s   t  | t|| _|jj| _t|dr+|jd ur+t|jt	r't
t|j}n|j}ntj}tj|jj| jdd|| _|   d S )NrB   F)r*   )rC   rD   rA   r   r   
vocab_sizer"   rB   r$   rE   rF   rG   rH   r%   r&   rQ   rL   r   	post_init)r.   r#   rU   rW   r0   r1   rD      s   

z-VibeVoiceASRForConditionalGeneration.__init__c                 C   s
   | j  S ra   )r   r_   r.   r0   r0   r1   r_         
z9VibeVoiceASRForConditionalGeneration.get_input_embeddingsc                 C   s   | j | d S ra   )r   rd   rb   r0   r0   r1   rd      s   z9VibeVoiceASRForConditionalGeneration.set_input_embeddingsc                 C   s   | j S ra   r   r   r0   r0   r1   get_output_embeddings   s   z:VibeVoiceASRForConditionalGeneration.get_output_embeddingsc                 C   s
   || _ d S ra   r   )r.   new_embeddingsr0   r0   r1   set_output_embeddings   r   z:VibeVoiceASRForConditionalGeneration.set_output_embeddingsc                 C   r`   ra   r   rJ   )r.   decoderr0   r0   r1   set_decoder   re   z0VibeVoiceASRForConditionalGeneration.set_decoderc                 C   s   | j jS ra   r   r   r0   r0   r1   get_decoder   s   z0VibeVoiceASRForConditionalGeneration.get_decoderc                 C   sD   t | jjddr |  }|  }t|dr|j|_dS ||_dS dS )zGTie the weights between the input embeddings and the output embeddings.tie_word_embeddingsFr'   N)rF   r#   r   r   r_   r"   r'   )r.   output_embeddingsinput_embeddingsr0   r0   r1   tie_weights   s   

z0VibeVoiceASRForConditionalGeneration.tie_weightsN      N@speech_tensorsspeech_masksspeech_semantic_tensorsstreaming_segment_durationc           !   	   C   s  t | jdr | jjdur t| jjtrtt| jj}n| jj}ntj}||}|j	dkr2|
d}|j\}}d}t|| }	||	k}
t  |
s| jj|
d}|j| jjjdd }| j|}|durq| j|}n| jj|
dj}| j|}nt }t }g }g }tj||jd}dtd	tfd
d}t|||	}t|}t|D ]I\}\}}|dd||f  }| dkrq||d k}| jjj|
d||d|d}||j | jjj|
d||d|d}||j qtj |dd }t!|| jjj"d}|j| jjjdd }| j|}tj |dd }| j|}|dur7|| ||  } n|| } W d   | S W d   | S 1 sOw   Y  | S )a  
        Encode speech input into features that can be used by the language model.
        This method is called once before generation to process the speech input.
        
        For long audio (>600s by default), uses streaming processing to avoid conv overflow (>2^32).
        Segments are processed independently, then concatenated before final sampling.
        
        Args:
            speech_tensors: Input audio tensor [batch_size, samples]
            speech_masks: Optional mask for speech features
            speech_semantic_tensors: Optional pre-computed semantic tokens
            streaming_segment_duration: Segment duration in seconds for streaming processing (default: 60s)
        rB   Nr   r   i]  )	dist_typedevicetotal_lengthsegment_lengthc                 s   sH    |dkr	t dtd| |D ]}t|| | }||kr!||fV  qdS )z8Iterate over audio segments with a given segment length.r   zsegment_length must be positiveN)
ValueErrorrangemin)r   r   startendr0   r0   r1   _iter_segments  s   
zJVibeVoiceASRForConditionalGeneration.encode_speech.<locals>._iter_segmentsT)cachesample_indicesrl   is_final_chunk)dimr   )#r"   r#   rB   r$   rE   rF   rG   rH   rL   ndim	unsqueezeshapeintno_gradr   rM   encodesamplestd_dist_typerR   rT   rO   r    r   aranger   listlen	enumerate
contiguousnumelappendcatr   fix_std)!r.   r   r   r   r   rU   
batch_sizetotal_samplessample_ratesegment_samplesuse_streamingencoder_outputaudio_tokensacoustic_featuressemantic_featuressemantic_tokensacoustic_encoder_cachesemantic_encoder_cacheacoustic_mean_segmentssemantic_mean_segmentsr   r   segmentsnum_segmentsseg_idxr   r   chunkis_finalacoustic_encoder_outputsemantic_encoder_outputacoustic_mean_fullcombined_featuresr0   r0   r1   encode_speech   s   








X
XXz2VibeVoiceASRForConditionalGeneration.encode_speechrh   ri   rj   r   rk   labelsrl   rm   rn   ro   rp   acoustic_input_maskrq   c                 K   s  |dur|n| j j}|	dur|	n| j j}	|
dur|
n| j j}
|dur$|n| j j}|du r6|dur6|  |}|durJ|durJ| j|||d}|||< | jd|||||||	|
|d
}|
s_|d n|j}| 	|}d}|dur|dddddf 
 }|dddf 
 }t }|d| j}|d}||j}|||}|
s|f|dd  }|dur|f| S |S t|||j|j|jdS )	z]
        Forward pass for the model. Handles both training and generation scenarios.
        N)r   r   r   rr   r   .r   )losslogitsr   rt   ru   )r#   rm   rn   rv   rl   r_   r   r   rs   r   r   r%   CrossEntropyLossviewr   rL   r   r   r   rt   ru   )r.   rh   ri   rj   r   rk   r   rl   rm   rn   ro   rp   r   r   r   r   rw   speech_featuresrx   rt   r   r   shift_logitsshift_labelsloss_fctoutputr0   r0   r1   ry   U  s\   


z,VibeVoiceASRForConditionalGeneration.forwardTc                 K   s  |dur,t |tr|d d jd }n| }|dur,|jd |kr,|dd|df }|du r[|dur[| dd }||dkd |dur[|dur[|dd|jd  df }|du r|durg| nd}tj|||durv|jd n|jd  |dur|j	n|j	d}|dur|du rd|i}nd|i}|
|||||d	 |durt|dkr|d dkr|
||	|
|d
 n
|
ddddd
 |
| |S )a=  
        Prepare inputs for generation step. This method is called by generate() 
        for each token generation step.
        
        Following Qwen2-VL's approach: speech inputs are only forwarded on the first pass
        (when cache_position[0] == 0), and are excluded in subsequent generation steps.
        Nr      r   r   r   rk   rh   )rj   rp   r   rl   ri   )r   r   r   r   )r$   tupler   get_seq_lengthlongcumsummasked_fill_rG   r   r   updater   )r.   rh   r   ri   rk   rp   rj   rl   r   r   r   r   rw   past_lengthpast_seen_tokensmodel_inputsr0   r0   r1   prepare_inputs_for_generation  sX   

 

zBVibeVoiceASRForConditionalGeneration.prepare_inputs_for_generation)NNr   )NNNNNNNNNNNNNNN)
NNNNNTNNNN)r3   r4   r5   __doc___tied_weights_keys_tp_planrD   r_   rd   r   r   r   r   r   rG   r|   r   
BoolTensorfloatr   rz   r{   r   r}   r   r   r   ry   r   r~   r0   r0   rW   r1   r      s    
 	


Vr   )r   rA   r   )(typingr   r   r   r   rG   torch.nnr%   transformers.models.autor   r   transformers.modeling_outputsr   r	   transformersr
   transformers.modeling_utilsr   transformers.utilsr   transformers.generationr   modular_vibevoice_tokenizerr   r   configuration_vibevoicer   modeling_vibevoicer   r   
get_loggerr3   loggerr"   r   r   rA   r   register__all__r0   r0   r0   r1   <module>   s.    

 Z  k