o
    
۾i#C                     @   s  U d Z ddlmZmZmZ ddlmZmZmZm	Z	 ddl
Z
ddlmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZm Z  ddl!m"Z"m#Z#m$Z$m%Z%m&Z& ddl'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2 ddl3m4Z4m5Z5m6Z6 ddl7m8Z8m9Z9m:Z: G dd de1Z;G dd de1Z<e;e<B Z=e	e>d< G dd dej?Z@de
jAfddZBdeeCe
jAf fddZDG d d! d!e&ZEG d"d# d#e*ZFG d$d% d%e(eF ZGG d&d' d'e)eF ZHejIeHeFeGd(G d)d* d*ej?e5e6ZJdS )+zEInference-only Qwen2-Audio model compatible with HuggingFace weights.    )IterableMappingSequence)	AnnotatedAnyLiteral	TypeAliasN)BatchFeature)Qwen2AudioConfigQwen2AudioEncoderQwen2AudioProcessor)WhisperFeatureExtractor)
VllmConfig)BaseDummyOptions)MULTIMODAL_REGISTRY)	AudioItemModalityDataMultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)AudioProcessorItemsDictEmbeddingItemsModalityDataItemsMultiModalDataItemsMultiModalDataParser)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdatePromptUpdateDetails)IntermediateTensors)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsMultiModal
SupportsPP)AutoWeightsLoaderinit_vllm_registered_modelmaybe_prefixc                   @   sZ   e Zd ZU dZed ed< eeje	ej B e
dddf ed< eeje
ddf ed< d	S )
Qwen2AudioFeatureInputszV
    Dimensions:
        - na: Number of audios
        - nmb: Number of mel bins
    audio_featurestypenanmbi  input_featuresfeature_attention_maskN)__name__
__module____qualname____doc__r   __annotations__r   torchTensorlistr#    r:   r:   Z/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/qwen2_audio.pyr+   H   s   
 
r+   c                   @   sF   e Zd ZU dZdZed ed< eee	j
 eddddhdf ed< dS )	Qwen2AudioEmbeddingInputsz
    Dimensions:
        - bn: Batch size
        - naf: Number of audio features
        - hs: Hidden size (must match the hidden size of language model
          backbone)
    audio_embedsr-   bnnafhs)dynamic_dimsN)r2   r3   r4   r5   r-   r   r6   r   r9   r7   r8   r#   r:   r:   r:   r;   r<   [   s   
 r<   Qwen2AudioInputsc                       s.   e Zd Zdedef fddZdd Z  ZS )Qwen2AudioMultiModalProjectoraudio_hidden_sizetext_hidden_sizec                    s    t    tj||dd| _d S )NT)bias)super__init__nnLinearlinear)selfrD   rE   	__class__r:   r;   rH   r   s   
z&Qwen2AudioMultiModalProjector.__init__c                 C   s   |  |}|S N)rK   )rL   r,   hidden_statesr:   r:   r;   forwardv   s   
z%Qwen2AudioMultiModalProjector.forward)r2   r3   r4   intrH   rQ   __classcell__r:   r:   rM   r;   rC   q   s    rC   input_lengthsc                 C   s(   | d d d }|d d d }||fS )Nr$      r:   )rT   feat_lengthsoutput_lengthsr:   r:   r;    _get_feat_extract_output_lengths|   s   rX   	hf_inputsc                 C   s    t tdtdtddS )Naudio)r=   r0   r1   )dictr   batched)rY   r:   r:   r;   _qwen2audio_field_config   s
   r]   c                       sD   e Zd Zdeeejf ee B de	e
e
f dB f fddZ  ZS )Qwen2AudioMultiModalDataParserdatareturnNc                    s(   t |trt|ddhtdS t |S )NrZ   r=   )modalityrequired_fieldsfields_factory)
isinstancer[   r   r]   rG   _parse_audio_data)rL   r_   rM   r:   r;   re      s   
z0Qwen2AudioMultiModalDataParser._parse_audio_data)r2   r3   r4   r[   strr7   r8   r   r   r   r   re   rS   r:   r:   rM   r;   r^      s    r^   c                   @   sh   e Zd Zdd ZdedefddZdedefddZd	d
 Z	de
fddZdeee
dB f fddZdS )Qwen2AudioProcessingInfoc                 C   s   | j tS rO   )ctxget_hf_configr
   rL   r:   r:   r;   ri      s   z&Qwen2AudioProcessingInfo.get_hf_configkwargsr`   c                 K   s   | j jtfi |S rO   )rh   get_hf_processorr   )rL   rk   r:   r:   r;   rl      s   z)Qwen2AudioProcessingInfo.get_hf_processorc                 K   s(   | j di |}|j}t|tsJ |S Nr:   )rl   feature_extractorrd   r   )rL   rk   hf_processorrn   r:   r:   r;   get_feature_extractor   s   z.Qwen2AudioProcessingInfo.get_feature_extractorc                 C   s    |   }t|j|  |  dS )N)	target_srtarget_channelsexpected_hidden_size)rp   r^   sampling_rateget_target_channels_get_expected_hidden_size)rL   rn   r:   r:   r;   get_data_parser   s   z(Qwen2AudioProcessingInfo.get_data_parserc                 C   s   dS )z;Return target audio channels for Qwen2 Audio models (mono).r$   r:   rj   r:   r:   r;   ru      s   z,Qwen2AudioProcessingInfo.get_target_channelsNc                 C   s   dd iS )NrZ   r:   rj   r:   r:   r;   get_supported_mm_limits   s   z0Qwen2AudioProcessingInfo.get_supported_mm_limits)r2   r3   r4   ri   objectr   rl   r   rp   rw   rR   ru   r   rf   rx   r:   r:   r:   r;   rg      s    	rg   c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )Qwen2AudioDummyInputsBuilder	mm_countsr`   c                 C   s$   | dd}| j }|j}|| S )NrZ   r   )getinforl   audio_token)rL   r{   
num_audiosro   r~   r:   r:   r;   get_dummy_text   s   
z+Qwen2AudioDummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc           	      C   sL   | j  }|j}|j| }|dd}|r|dnd }d| j|||diS )NrZ   r   )lengthr   	overrides)r}   rp   rt   chunk_lengthr|   _get_dummy_audios)	rL   r   r{   r   rn   rt   	audio_lenr   audio_overridesr:   r:   r;   get_dummy_mm_data   s   

z.Qwen2AudioDummyInputsBuilder.get_dummy_mm_datarO   )
r2   r3   r4   r   rf   rR   r   r   r   r   r:   r:   r:   r;   rz      s    
rz   c                
       s   e Zd Zdedeeef deeef deeef def
 fddZded	eeef deee	f fd
dZ
ded	eeef dedee fddZ  ZS )Qwen2AudioMultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsr`   c                    s   | dg }|r||d< |dg s)| j |}| |}tt|gdddS | jjdi |}tdi |d|j	i}t
 j||||dS )	NaudiosrZ   )	input_idspt)tensor_typert   )r   r   r   r   r:   )popr|   r}   get_tokenizerencode_apply_hf_processor_tokens_onlyr	   r[   rp   rt   rG   _call_hf_processor)rL   r   r   r   r   r   
prompt_idsrn   rM   r:   r;   r      s&   

z0Qwen2AudioMultiModalProcessor._call_hf_processorrY   hf_processor_mm_kwargsc                 C   s   t |S rO   )r]   )rL   rY   r   r:   r:   r;   _get_mm_fields_config   s   z3Qwen2AudioMultiModalProcessor._get_mm_fields_configmm_itemsout_mm_kwargsc                    s   | j jdi |}| j  }| }t|dd}t|dd}t|dd}	|| ||  ||	 | d}
|
d u r@g nt|
tj	sHJ t
|
d\}}| d	tf fd
d}td||dgS )Nr~   z	<|AUDIO|>audio_bos_tokenz<|audio_bos|>audio_eos_tokenz<|audio_eos|>r1   item_idxc                    s   r|  }nd |  }t |jdksJ d|jd }|dkr4dt}|| }td| dg| }tj g| g dS )	Nr=   rU   z audio_embeds must be a 2D tensorr   rZ   zThe audio (len=z1) is too short to be represented inside the model)embed_token_id)lenshape	get_itemsr   get_audio_length
ValueErrorr    select_token_id)r   num_featuresr=   r   r   audio_tokensaudio_bos_idaudio_eos_idaudio_output_lengthsaudio_token_idr   out_mm_datar:   r;   get_replacement_qwen2_audio  s    




zVQwen2AudioMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_qwen2_audiorZ   )ra   targetreplacementr:   )r}   rl   r   	get_vocabgetattrget_datar|   rd   r7   r8   rX   sumtolistrR   r   )rL   r   r   r   	processor	tokenizervocabr~   r   r   r1   _audio_output_lensr   r:   r   r;   _get_prompt_updates   s2   

z1Qwen2AudioMultiModalProcessor._get_prompt_updates)r2   r3   r4   rf   r   ry   r   r	   r   r   r   r   r   r   r   r   rS   r:   r:   rM   r;   r      s8    


!



r   )r}   dummy_inputsc                       s  e Zd ZededededB fddZddd	ed
ef fddZde	de
dB fddZde
dejeejdf B fddZde	defddZ		d"dejdB dejdedB dejdB de	dejeB fddZdejdejdB fddZdeeeejf  dee fd d!Z  ZS )#"Qwen2AudioForConditionalGenerationra   ir`   Nc                 C   s   | drd| dS td)NrZ   zAudio z%: <|audio_bos|><|AUDIO|><|audio_eos|>z Only audio modality is supported)
startswithr   )clsra   r   r:   r:   r;   get_placeholder_strE  s   
z6Qwen2AudioForConditionalGeneration.get_placeholder_str )prefixvllm_configr   c                   s   t    |jj}|j}|jj}|| _|| _|| _| |d t|j	| _
t|j	j|jj| _W d    n1 s:w   Y  | | t||jt|ddgd| _W d    n1 s]w   Y  | jj| _d S )NrZ   language_modelQwen2ForCausalLM)r   	hf_configr   architectures)rG   rH   model_configr   quant_configmultimodal_configconfig_mark_tower_modelr   audio_configaudio_towerrC   d_modeltext_confighidden_sizemulti_modal_projector_mark_language_modelr)   r*   r   make_empty_intermediate_tensors)rL   r   r   r   r   r   rM   r:   r;   rH   L  s.   

	z+Qwen2AudioForConditionalGeneration.__init__rk   c                 K   sj   | dd }| dd }| dd }|d u r|d u rd S |d ur&td|dS |d ur1td||dS td)Nr0   r=   r1   )r-   r=   r,   )r-   r0   r1   z This line should be unreachable.)r   r<   r+   AssertionError)rL   rk   r0   r=   r1   r:   r:   r;   _parse_and_validate_audio_inputg  s    zBQwen2AudioForConditionalGeneration._parse_and_validate_audio_inputaudio_input.c                 C   sX  |d dkr|d }t |S |d }|d }| j|d\}}|j\}}}	|	d d d }
tjd|
|j|jd		d
||
}|	d
||
}||k}||dd|

|d|
|
}|j| jjjj| jjjjd	}td
||< | j||d}|j}| |}|j\}}}|	d}t|
|||j|k }|| d|}t||  S )Nr-   r=   r0   r1   r   rU   r$   r   )dtypedevicez-inf)attention_mask)tupler   rX   r   r   r7   aranger   r   	unsqueezeexpandviewtoconv1weightfloatlast_hidden_stater   splitflattenr   )rL   r   r=   r0   r1   audio_feat_lengthsr   
batch_sizer   max_mel_seq_lenmax_seq_len	seq_rangelengths_expandpadding_maskaudio_attention_mask_audio_attention_maskaudio_outputsselected_audio_featurer,   r   max_audio_tokens	embed_dimaudio_features_maskmasked_audio_featuresr:   r:   r;   _process_audio_input  sb   





z7Qwen2AudioForConditionalGeneration._process_audio_inputc                 K   s*   | j di |}|d u rg S | |}|S rm   )r   r   )rL   rk   r   r   r:   r:   r;   embed_multimodal  s
   
z3Qwen2AudioForConditionalGeneration.embed_multimodalr   	positionsintermediate_tensorsinputs_embedsc                 K   s$   |d urd }| j j||||d}|S )N)r   )r   model)rL   r   r   r   r   rk   rP   r:   r:   r;   rQ     s   z*Qwen2AudioForConditionalGeneration.forwardrP   c                 C   s   | j |S rO   )r   compute_logits)rL   rP   r:   r:   r;   r     s   z1Qwen2AudioForConditionalGeneration.compute_logitsweightsc                 C   s   t | }||S rO   )r(   load_weights)rL   r   loaderr:   r:   r;   r    s   
z/Qwen2AudioForConditionalGeneration.load_weights)NN)r2   r3   r4   classmethodrf   rR   r   r   rH   ry   rB   r   r7   r8   r   r   r%   r   r!   rQ   r   r   setr  rS   r:   r:   rM   r;   r   ?  sH    

@

,r   )Kr5   collections.abcr   r   r   typingr   r   r   r   r7   torch.nnrI   transformersr	   transformers.models.qwen2_audior
   r   r   transformers.models.whisperr   vllm.configr   vllm.config.multimodalr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   r   r   vllm.multimodal.parser   r   r   r   r   vllm.multimodal.processingr   r   r   r   r   r    vllm.sequencer!   vllm.utils.tensor_schemar"   r#   
interfacesr%   r&   r'   utilsr(   r)   r*   r+   r<   rB   r6   ModulerC   r8   rX   rf   r]   r^   rg   rz   r   register_processorr   r:   r:   r:   r;   <module>   sD    i