o
    ir                     @   s  U d Z ddlZddlZddlmZmZmZ ddlmZ ddl	m
Z
mZmZmZ ddlZddlmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlm Z  ddl!m"Z"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2m3Z3 ddl4m5Z5m6Z6m7Z7m8Z8m9Z9 ddl:m;Z; ddl<m=Z= ddl>m?Z?m@Z@ ddlAmBZBmCZCmDZDmEZE ddlFmGZGmHZHmIZImJZJmKZK dZLdZMG dd de?ZNG d d! d!e?ZOeNeOB ZPeeQd"< G d#d$ d$e7ZRG d%d& d&e5eR ZSG d'd( d(e6eR ZTG d)d* d*ejUZVG d+d, d,ejUZWG d-d. d.ejUeZXG d/d0 d0eZYe+jZeTeReSd1G d2d3 d3ejUeDeEeCZ[d4ej\e]ej\ B e]e]ej\  B d5ej\fd6d7Z^dS )8zPyTorch Ultravox model.    N)IterableMappingSequence)SimpleNamespace)	AnnotatedAnyLiteral	TypeAlias)nn)
functional)BatchFeatureProcessorMixin)ModuleUtilsMixin)WhisperFeatureExtractor)WhisperEncoderWhisperEncoderLayer)
VllmConfig)BaseDummyOptions)
MulAndSilu
get_act_fn)RMSNorm)DefaultModelLoader)MultiModelKeys)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItemsNestedTensors)MultiModalDataItemsMultiModalDataParser)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdate)IntermediateTensors)UltravoxConfig)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPP)AutoWeightsLoaderWeightsMapper
flatten_bninit_vllm_registered_modelmaybe_prefix	<|audio|>   c                   @   s   e Zd ZU dZed ed< eeje	ej B e	e	ej  B e
dddf ed< eeje
df ed< 	 eeje
df ed	< 	 eeje
d
f ed< dS )UltravoxAudioFeatureInputszz
    Dimensions:
    - b: batch size
    - n: number of chunks
    - t: Time frames (M)
    - nmb: Number of mel bins
    audio_featurestypebnnmbtdatalens	token_lenn
num_chunksN__name__
__module____qualname____doc__r   __annotations__r   torchTensorlistr(    rI   rI   Y/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/model_executor/models/ultravox.pyr5   C   s   
 
r5   c                   @   sD   e Zd ZU dZed ed< eeje	ej B e
ddddf ed< d	S )
UltravoxAudioEmbeddingInputszx
    Dimensions:
    - b: batch size
    - na: number of audios
    - afs: audio feature size
    - hs: hidden size
    audio_embedsr7   bnaafshsr;   Nr@   rI   rI   rI   rJ   rK   [   s   
 rK   UltravoxAudioInputsc                   @   s`   e Zd ZdedefddZdedefddZdd Zde	fd	d
Z
deee	dB f fddZdS )UltravoxProcessingInfokwargsreturnc                 K   s.   | j jj}| j jdi |}t|_|j|_|S NrI   )ctxmodel_config	hf_configget_hf_processor_AUDIO_PLACEHOLDER_OVERRIDEaudio_token_replacementaudio_token_indexaudio_replacement_token_id)selfrS   confighf_processorrI   rI   rJ   rY   p   s
   
z'UltravoxProcessingInfo.get_hf_processorc                 K   s<   | j di |}|j}t|tr|S |j}t|tsJ |S rU   )rY   audio_processor
isinstancer   feature_extractor)r^   rS   r`   ra   rc   rI   rI   rJ   get_feature_extractor|   s   
z,UltravoxProcessingInfo.get_feature_extractorc                 C   s    |   }t|j|  |  dS )N)	target_srtarget_channelsexpected_hidden_size)rd   r   sampling_rateget_target_channels_get_expected_hidden_size)r^   rc   rI   rI   rJ   get_data_parser   s   z&UltravoxProcessingInfo.get_data_parserc                 C   s   dS )z8Return target audio channels for Ultravox models (mono).r)   rI   r^   rI   rI   rJ   ri      s   z*UltravoxProcessingInfo.get_target_channelsNc                 C   s   dd iS )NaudiorI   rl   rI   rI   rJ   get_supported_mm_limits   s   z.UltravoxProcessingInfo.get_supported_mm_limits)rA   rB   rC   objectr   rY   r   rd   rk   intri   r   strrn   rI   rI   rI   rJ   rR   o   s    	rR   c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )UltravoxDummyInputsBuilder	mm_countsrT   c                 C   s   | dd}d| S )Nrm   r   r3   )get)r^   rs   
num_audiosrI   rI   rJ   get_dummy_text   s   z)UltravoxDummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc           	      C   sP   | j  }|j}|j| t }|dd}|r|dnd }d| j|||diS )Nrm   r   )lengthru   	overrides)inford   rh   chunk_length_MAX_ENCODER_BATCH_SIZErt   _get_dummy_audios)	r^   rw   rs   rx   rc   rh   	audio_lenru   audio_overridesrI   rI   rJ   get_dummy_mm_data   s   
z,UltravoxDummyInputsBuilder.get_dummy_mm_dataN)
rA   rB   rC   r   rq   rp   rv   r   r   r   rI   rI   rI   rJ   rr      s    	
rr   c                
       s   e Zd Zdedeeef deeef deeef def
 fddZded	eeef deeef fd
dZ	de
d	eeef dedee fddZ  ZS )UltravoxMultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsrT   c           
         s   | dg s| j j|dd}| |}tt|gdddS t|}|dg }t|t	s0J | jj
di |}tdi ||jdd}tdi |d|i}|d	d  |d
d  |dd  t j||||d}	|	d|	d< |	S )NaudiosF)add_special_tokens)	input_idspt)tensor_typeT)rh   include_audio_num_chunksr   padding
truncation)r   r   r   r   audio_valuesr6   rI   )rt   r{   get_tokenizerencode_apply_hf_processor_tokens_onlyr   dictpoprb   rH   rd   rh   super_call_hf_processor)
r^   r   r   r   r   
prompt_idsr   rc   item_processor_dataoutput	__class__rI   rJ   r      s8   


z.UltravoxMultiModalProcessor._call_hf_processor	hf_inputshf_processor_mm_kwargsc                 C   sH   | dtd}ttd|td|td|tdtddS )Naudio_num_chunksr   rm   )r6   audio_token_len
audio_lensr   rL   )rt   rF   zerosr   r   flat_from_sizesbatched)r^   r   r   r?   rI   rI   rJ   _get_mm_fields_config   s   


z1UltravoxMultiModalProcessor._get_mm_fields_configmm_itemsout_mm_kwargsc                    s   | j jdi |}|j| dtd}tj|dtjd t	tj
dgtjd g dtf fdd}tdd	|d
gS )Nr   r   )dimdtyper   item_idxc                    s6    |  } | d  }d ||   }gt| S )Nr)   r   )sumrp   )r   startendr   chunks_start_idxout_mm_datareplacement_idrI   rJ   get_replacement_ultravox
  s   zQUltravoxMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_ultravoxrm   r3   )modalitytargetreplacementrI   )r{   rY   r]   get_datart   rF   r   cumsumint32cattensorrp   r#   )r^   r   r   r   r`   r?   r   rI   r   rJ   _get_prompt_updates   s"   z/UltravoxMultiModalProcessor._get_prompt_updates)rA   rB   rC   rq   r   ro   r   r   r   r   r   r   r   r   r$   r   __classcell__rI   rI   r   rJ   r      s8    


+



r   c                       s>   e Zd ZdZd
def fddZdejdejfdd	Z  Z	S )StackAudioFrameszk
    Stack the audio embedding frames to reduce the sequence length by a factor
    of `stack_factor`.
       stack_factorc                    s   t    || _d S r   )r   __init__r   )r^   r   r   rI   rJ   r     s   

zStackAudioFrames.__init__rL   rT   c                 C   sh   |j \}}}|| j d | j | j }t|ddd|| f}|j \}}}|||| j || j }|S )Nr)   r   )shaper   Fpadview)r^   rL   BTCT_padrI   rI   rJ   forward#  s   zStackAudioFrames.forward)r   )
rA   rB   rC   rD   rp   r   rF   rG   r   r   rI   rI   r   rJ   r     s    r   c                       >   e Zd Zdef fddZdejdejdejfddZ  ZS )	UltravoxFeedForwardProjectorr_   c                    s   t    |j| _t|j| _|jj|j }t|| _	t
j|| jdd| _| j}|jdkr6t | _|d }nt|j| _|jj}t
j||dd| _|jrXt|| _t
 | _d S t
 | _t|| _d S )NF)biasswiglu   )r   r   hidden_size
hidden_dimr   r   _pad_and_stackaudio_configr   ln_prer
   Linearlinear_1projector_actr   actr   text_configlinear_2projector_ln_midln_midIdentityln_post)r^   r_   dim_indim_middim_outr   rI   rJ   r   /  s$   





z%UltravoxFeedForwardProjector.__init__r6   r   rT   c                 C   sJ   |  |}| |}| |}| |}| |}| |}| |}|S r   )r   r   r   r   r   r   r   )r^   r6   r   hidden_statesrI   rI   rJ   r   J  s   






z$UltravoxFeedForwardProjector.forward	rA   rB   rC   r&   r   rF   rG   r   r   rI   rI   r   rJ   r   .  s    r   c                       r   )	UltravoxTransformerProjectorr_   c                    s   t    tdd| _t|j| _|jj|j }t	
|j t|| _t| j| _t j j| _t fddt|jD | _t j| _t j|jj| _d S )NF)
is_decoderc                    s   g | ]}t  qS rI   )r   ).0_projector_audio_configrI   rJ   
<listcomp>j  s    z9UltravoxTransformerProjector.__init__.<locals>.<listcomp>)r   r   r   r_   r   r   r   r   r   copydeepcopyr   r   r
   r   d_model	linear_in	Embeddingmax_source_positionsembed_positions
ModuleListrangenum_projector_layerslayersr   r   
linear_out)r^   r_   r   r   r   rJ   r   X  s(   




z%UltravoxTransformerProjector.__init__r6   r   rT   c                 C   s   |  |}|jd }tj||jdd d d f |d d d f }| ||j|j}| |}| 	|}| 
tj|d|jd}|| }i }dt| jd jjv rXd |d< | jD ]}	|	|fd|i|}
|
d }q[| |}| |}|S )Nr)   devicelayer_head_maskr   attention_mask)r   r   rF   aranger   ltget_extended_attention_maskr   r   r   r   sizeinspect	signaturer   r   
parametersr   r   )r^   r6   r   max_len_stackedr   extended_attention_maskr   	positionsrS   layerlayer_outputsrI   rI   rJ   r   u  s>   









z$UltravoxTransformerProjector.forwardr   rI   rI   r   rJ   r   W  s    r   c                       sh   e Zd ZdZdZ fddZedd Zdej	dB d	ej	fd
dZ
	ddej	dej	dB fddZ  ZS )ModifiedWhisperEncodera  
    Encoder portion of OpenAI's Whisper model.

    This implementation is a slightly modified version of HF Transformers'
    Whisper Encoder, with only a few fixes:
    1. base_model_prefix updated to allow for doing `.from_pretrained`
       directly on the encoder
    2. allow less than 30 second of audio padding to be passed in:
        - relaxed ValueError check for `input_features` length to be less
           than or equal to `expected_seq_length` instead of strictly equal
        - embed_pos is now sliced to match the length of `inputs_embeds`

    Original: https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/modeling_whisper.py
    See commentary: https://github.com/huggingface/transformers/issues/25744
    zmodel.encoderc                    s   t  j|i | d| j_d S )NF)r   r   r_   r   )r^   argsrS   r   rI   rJ   r     s   zModifiedWhisperEncoder.__init__c                 C   s    | j j| jjd  | jjd  S )Nr   )r_   r   conv1strideconv2rl   rI   rI   rJ   max_context_length  s   

z)ModifiedWhisperEncoder.max_context_lengthr   Nr   c                 C   s`   |du rdS |  |}|jd }tj||jddddf |dd}| j|d|jd}|S )aH  
        Create attention mask based on audio lengths to mask out padding tokens
        For each sample in batch:
        - Convert raw audio length to feature length after convolutions
        - Create bool mask: True for valid positions and False for padding
        - Convert to attention mask format expected by transformer layers
        (1.0 for positions to attend to, large negative for positions to ignore)
        This masking ensures consistent behavior between training and inference
        by preventing the model from attending to padding tokens in both cases
        Nr)   r   r   )	 _get_feat_extract_output_lengthsr   rF   r   r   r   r   r   r   )r^   r   r   audio_feature_lenmax_seq_lenr   rI   rI   rJ   get_attention_mask_by_audio_len  s   


z6ModifiedWhisperEncoder.get_attention_mask_by_audio_leninput_featuresc                 C   s   | j }|jd |krtd| d|jd  d| dtj| |}tj| |}|ddd}| j	j
d |d	 }|| }tjj|| j| jd
}| ||}i }dt| jd jjv rfd |d< | jD ]}	|	||fi |}
|
d }qi| |}|S )Nr  z7Whisper expects the mel input features to be of length z or less, but found z-. Make sure to pad the input mel features to .r   r   r)   )ptrainingr   )r  r   
ValueErrorr
   r   gelur	  r  permuter   weightr   dropoutr  r  r   r   r   r   r  
layer_norm)r^   r  r   expected_seq_lengthinputs_embeds	embed_posr   r   rS   encoder_layerr  rI   rI   rJ   r     sB   



zModifiedWhisperEncoder.forwardr   )rA   rB   rC   rD   base_model_prefixr   propertyr  rF   rG   r  r   r   rI   rI   r   rJ   r    s"    

r  )r{   dummy_inputsc                       s  e Zd Zg dddgdZeddidZeded	ed
edB fddZ	ddde
def fddZd
efddZdejdejdejd
ejfddZded
edB fddZded
eeejdf B fd d!Zded
efd"d#Z	d7dd$d%d&ejd'edB d(ejdB d)ed
ejf
 fd*d+Z		d8d&ejdB d,ejd-ejdB d.ejdB d
ejeB f
d/d0Zd1ejd
ejfd2d3Zd4eeeejf  d
ee fd5d6Z   Z!S )9UltravoxModel)q_projk_projv_proj	gate_projup_proj)qkv_projgate_up_projzaudio_tower.model.encoder.audio_tower.)orig_to_new_prefixr   irT   Nc                 C   s   | drdS td)Nrm   r3   z Only audio modality is supported)
startswithr  )clsr   r.  rI   rI   rJ   get_placeholder_str  s   
z!UltravoxModel.get_placeholder_str )prefixvllm_configr3  c                   s&  t    |jj}|jj}|| _|| _| jsJ g | _|jd ur-| j	t
j|jd dd |jd ur?| j	t
j|jd dd | |d t|j| _|jdkrWt|| _nt|| _W d    n1 sfw   Y  | | t||jt|dd| _W d    n1 sw   Y  | jj| _d S )Nr,  )model_or_pathrevisionr3  language_model.rm   r   language_model)r4  rX   r3  )r   r   rW   rX   multimodal_configr_   multi_modal_configsecondary_weightsaudio_model_idappendr   Sourcetext_model_id_mark_tower_modelr  r   audio_towerr   r   multi_modal_projectorr   _mark_language_modelr1   wrapped_model_configr2   r8  make_empty_intermediate_tensors)r^   r4  r3  r_   r9  r   rI   rJ   r     sN   






zUltravoxModel.__init__c                 C   s   t jddddS )z<
        Get the module prefix in multimodal models
        r7  zmulti_modal_projector.r,  )r8  	connectortower_model)r   from_string_fieldrl   rI   rI   rJ   get_mm_mappingL  s
   zUltravoxModel.get_mm_mappingr  r   r   c                 C   s   | | jj}|d}g }td|tD ]-}t|t |}| ||| ||| }	|	 | jj}	| |	||| }
||
 qt	j
|dd}|S )Nr   )r   )torA  r   r   r   r}   minrB  r=  rF   r   )r^   r  r   r   r6   
batch_sizeaudio_embeddingsr   r   batch_featuresbatch_embeddingsrI   rI   rJ   _audio_features_to_embeddingsV  s   
z+UltravoxModel._audio_features_to_embeddingsrS   c                 K   s   | dd }| dd }| dd }| dd }| dd }|d u r(|d u r(d S |d ur5td||||dS |d ur?td|dS td)	Nr6   rL   r   r   r   )r7   r;   r<   r=   r?   )r7   r;   z This line should be unreachable.)r   r5   rK   AssertionError)r^   rS   r6   rL   r   r   r   rI   rI   rJ   _parse_and_validate_audio_inputs  s$   z-UltravoxModel._parse_and_validate_audio_inputaudio_input.c                 C   s   |d dkr
|d S t |d }|d }|d }| |||}|jd }tj||jd|jd d	}||d d d f k }|| }	d
d ||d  D }
|	|
S )Nr7   rL   r;   r<   r=   r)   r   r   r  c                 S   s   g | ]}|   qS rI   )r   item)r   
chunk_lensrI   rI   rJ   r     s    
z6UltravoxModel._process_audio_input.<locals>.<listcomp>r?   )	pad_and_concat_to_dim3rP  r   rF   r   r   expandsplittolist)r^   rS  r6   r   r   
embeddingsmax_lenindicesmaskflattened_embeddings
embed_lensrI   rI   rJ   _process_audio_input  s$   
	

z"UltravoxModel._process_audio_inputc                 K   s*   | j di |}|d u rg S | |}|S rU   )rR  r`  )r^   rS   rS  rM  rI   rI   rJ   embed_multimodal  s
   
zUltravoxModel.embed_multimodalT)is_multimodalhandle_oov_mm_tokenr   multimodal_embeddingsrb  rc  c                   s0   |d u s|d u rt  |S t  j||||dS )N)rd  rb  rc  )r   embed_input_ids)r^   r   rd  rb  rc  r   rI   rJ   re    s   
zUltravoxModel.embed_input_idsr  intermediate_tensorsr  c                 K   s8   |durd}| j }t|dr|j }|j||||d}|S )a  Run forward pass for Ultravox

        One key thing to understand is the `input_ids` already accounts for the
        positions of the to-be-inserted audio embeddings. The to-be-inserted
        audio has a size that is essentially 6.25 tokens per second of audio.

        This way, the `positions` and `attn_metadata` are consistent
        with the `input_ids`.

        Args:
            input_ids: Flattened (concatenated) input_ids corresponding to a
                batch.
            positions: Position indices for the input tokens.
            intermediate_tensors: Intermediate tensors from prior forward pass.
            inputs_embeds: Optional tensor of input embeddings.

        Nr8  )r  )r8  hasattrmodel)r^   r   r  rf  r  rS   r8  r   rI   rI   rJ   r     s   
zUltravoxModel.forwardr   c                 C   s   | j |S r   )r8  compute_logits)r^   r   rI   rI   rJ   ri    s   zUltravoxModel.compute_logitsweightsc                 C   s   t | dgd}|j|| jdS )Nr,  )ignore_unexpected_prefixes)mapper)r.   load_weightshf_to_vllm_mapper)r^   rj  loaderrI   rI   rJ   rm    s   zUltravoxModel.load_weightsr   )NN)"rA   rB   rC   packed_modules_mappingr/   rn  classmethodrq   rp   r1  r   r   r   rI  rF   rG   rP  ro   rQ   rR  r   tupler`  r*   ra  boolre  r%   r   ri  r   setrm  r   rI   rI   r   rJ   r$    sz    0



&

&,r$  featuresrT   c                    sl   t | tjr| jdkrt| } | S dd | D } tdd | D  dd | D }  fdd| D } t| S )a  
    Pad and concatenate a list of tensors.

    output:
        Tensor of shape [B, C, M] where M is the maximum length of the input
        tensors, B is the sum of the batch sizes of the input tensors.
        C must be the same for all input tensors.
       c                 S   s   g | ]}t |qS rI   )rV  r   frI   rI   rJ   r     s    z*pad_and_concat_to_dim3.<locals>.<listcomp>c                 s   s    | ]}|j d  V  qdS )r  N)r   rw  rI   rI   rJ   	<genexpr>  s    z)pad_and_concat_to_dim3.<locals>.<genexpr>c                 S   s(   g | ]}|j d g|jdd R  qS )r  r  N)r   r   rw  rI   rI   rJ   r     s   ( c              	      s&   g | ]}t |d  |jd  fqS )r   r  )r   r   r   rw  r[  rI   rJ   r     s   & )rb   rF   rG   ndimr0   maxr   )ru  rI   rz  rJ   rV    s   

rV  )_rD   r   r   collections.abcr   r   r   typesr   typingr   r   r   r	   rF   r
   torch.nnr   r   transformersr   r   transformers.modeling_utilsr   transformers.models.whisperr   ,transformers.models.whisper.modeling_whisperr   r   vllm.configr   vllm.config.multimodalr   %vllm.model_executor.layers.activationr   r   $vllm.model_executor.layers.layernormr    vllm.model_executor.model_loaderr   )vllm.model_executor.models.module_mappingr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   r   vllm.multimodal.parser   r   vllm.multimodal.processingr    r!   r"   r#   r$   vllm.sequencer%   (vllm.transformers_utils.configs.ultravoxr&   vllm.utils.tensor_schemar'   r(   
interfacesr*   r+   r,   r-   utilsr.   r/   r0   r1   r2   rZ   r}   r5   rK   rQ   rE   rR   rr   r   Moduler   r   r   r  register_processorr$  rG   rH   rV  rI   rI   rI   rJ   <module>   sj   
*c)Fi r