o
    iY                     @   s$  U d dl mZmZmZ d dlmZmZmZmZ d dl	Z	d dl
mZ d dlmZmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZm Z m!Z! d dl"m#Z#m$Z$m%Z%m&Z&m'Z' d dl(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z. d dl/m0Z0 d dl1m2Z2m3Z3 ddl4m5Z5m6Z6m7Z7m8Z8 ddl9m:Z:m;Z;m<Z< dZ=G dd de2Z>G dd de2Z?e>e?B Z@eeAd< G dd deZBG dd dejCZDG dd de'ZEG d d! d!e+ZFG d"d# d#e)eF ZGd$eeHe	jIf fd%d&ZJG d'd( d(e*eF ZKejLeKeFeGd)G d*d+ d+ejCe7e8e6ZMdS ),    )IterableMappingSequence)	AnnotatedAnyLiteral	TypeAliasN)BatchFeaturePretrainedConfig)AudioFlamingo3ConfigAudioFlamingo3Processor)Qwen2AudioEncoder)
VllmConfig)BaseDummyOptions)
get_act_fn)MultiModelKeys)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)DictEmbeddingItemsModalityDataModalityDataItemsMultiModalDataItemsMultiModalDataParser)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdatePromptUpdateDetails)IntermediateTensors)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPP)AutoWeightsLoaderinit_vllm_registered_modelmaybe_prefixiX  c                   @   sp   e Zd ZU dZed ed< eeje	ej B e
dddf ed< eeje
ddf ed< eeje
d	f ed
< dS )AudioFlamingo3FeatureInputsz
    Dimensions:
        - num_chunks: Number of audio chunks (flattened)
        - nmb: Number of mel bins
        - num_audios: Number of original audio files
    audio_featurestype
num_chunksnmbi  input_featuresfeature_attention_mask
num_audioschunk_countsN)__name__
__module____qualname____doc__r   __annotations__r   torchTensorlistr#    r=   r=   _/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/model_executor/models/audioflamingo3.pyr,   L   s$   
 
r,   c                   @   sF   e Zd ZU dZdZed ed< eee	j
 eddddhdf ed< dS )	AudioFlamingo3EmbeddingInputsz
    Dimensions:
        - bn: Batch size
        - naf: Number of audio features
        - hs: Hidden size (must match the hidden size of language model
          backbone)
    audio_embedsr.   bnnafhs)dynamic_dimsN)r5   r6   r7   r8   r.   r   r9   r   r<   r:   r;   r#   r=   r=   r=   r>   r?   e   s   
 r?   AudioFlamingo3Inputsc                       sV   e Zd Zdef fddZ	ddejeej B dejfddZd	ejfd
dZ	  Z
S )AudioFlamingo3Encoderconfigc                    sT   t  | tjddd| _t | _tt	|dd}| j
dtj|dd d S )N   )kernel_sizestridenum_mel_bins   freqsF)requires_grad)super__init__nn	AvgPool1d
avg_poolerModulepos_embr:   emptygetattrregister_parameter	Parameter)selfrG   rM   	__class__r=   r>   rP   |   s   
zAudioFlamingo3Encoder.__init__Nr1   attention_maskc                 C   s   t |tr
t|}tj| |}tj| |}|	dd}|| j
jd |dd d f  |j}| jD ]}|||d }|d }q:|ddd}| |}|ddd}| |}|S )Nr   rH   r$   )
isinstancer<   r:   stackrQ   
functionalgeluconv1conv2	transposeembed_positionsweightsizetodtypelayerspermuterS   
layer_norm)rZ   r1   r]   hidden_stateslayerlayer_outputsr=   r=   r>   forward   s$   

 



zAudioFlamingo3Encoder.forwardinput_lengthsc                 C   s(   |d d d }|d d d }||fS )z{
        Computes the output length of the convolutional layers and the output length
        of the audio encoder
        r$   rH   r=   )rZ   rs   output_lengthsr=   r=   r>    _get_feat_extract_output_lengths   s   z6AudioFlamingo3Encoder._get_feat_extract_output_lengthsN)r5   r6   r7   r
   rP   r:   r;   r<   rr   ru   __classcell__r=   r=   r[   r>   rF   {   s    
 rF   c                       s*   e Zd Zdef fddZdd Z  ZS )!AudioFlamingo3MultiModalProjectorrG   c                    sR   t    tj|jj|jj|jd| _t	|j
| _tj|jj|jj|jd| _d S )N)bias)rO   rP   rQ   Linearaudio_confighidden_sizetext_configprojector_biaslinear_1r   projector_hidden_actactlinear_2)rZ   rG   r[   r=   r>   rP      s   
z*AudioFlamingo3MultiModalProjector.__init__c                 C   s"   |  |}| |}| |}|S rv   )r   r   r   )rZ   r-   ro   r=   r=   r>   rr      s   


z)AudioFlamingo3MultiModalProjector.forward)r5   r6   r7   r
   rP   rr   rw   r=   r=   r[   r>   rx      s    rx   c                       sD   e Zd Zdeeejf ee B de	eef dB f fddZ
  ZS )"AudioFlamingo3MultiModalDataParserdatareturnNc                    s(   t |trt|ddhtdS t |S )Naudior@   )modalityrequired_fieldsfields_factory)r`   dictr   _audioflamingo3_field_configrO   _parse_audio_data)rZ   r   r[   r=   r>   r      s   
z4AudioFlamingo3MultiModalDataParser._parse_audio_data)r5   r6   r7   r   strr:   r;   r   r   r   r   rw   r=   r=   r[   r>   r      s    r   c                   @   sR   e Zd Zdd ZdefddZdefddZdd	 Zd
ee	e
dB f fddZdS )AudioFlamingo3ProcessingInfoc                 C   s   | j tS rv   )ctxget_hf_configr   rZ   r=   r=   r>   r      s   z*AudioFlamingo3ProcessingInfo.get_hf_configkwargsc                 K   s   | j jtfi |S rv   )r   get_hf_processorr   )rZ   r   r=   r=   r>   r      s   z-AudioFlamingo3ProcessingInfo.get_hf_processorc                 K   s   | j di |}|j}|S Nr=   )r   feature_extractor)rZ   r   hf_processorr   r=   r=   r>   get_feature_extractor   s   z2AudioFlamingo3ProcessingInfo.get_feature_extractorc                 C   s   |   }t|j|  dS )N)	target_srexpected_hidden_size)r   r   sampling_rate_get_expected_hidden_size)rZ   r   r=   r=   r>   get_data_parser   s
   z,AudioFlamingo3ProcessingInfo.get_data_parserr   Nc                 C   s   ddiS )Nr   r$   r=   r   r=   r=   r>   get_supported_mm_limits   s   z4AudioFlamingo3ProcessingInfo.get_supported_mm_limits)r5   r6   r7   r   objectr   r   r   r   r   intr   r=   r=   r=   r>   r      s    r   c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS ) AudioFlamingo3DummyInputsBuilder	mm_countsr   c                 C   s$   | dd}| j }|j}|| S )Nr   r   )getinfor   audio_token)rZ   r   r3   r   r   r=   r=   r>   get_dummy_text   s   
z/AudioFlamingo3DummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc           	      C   sJ   | j  }|j}t| }|dd}|r|dnd }d| j|||diS )Nr   r   )lengthr3   	overrides)r   r   r   MAX_AUDIO_LENr   _get_dummy_audios)	rZ   r   r   r   r   r   	audio_lenr3   audio_overridesr=   r=   r>   get_dummy_mm_data   s   
z2AudioFlamingo3DummyInputsBuilder.get_dummy_mm_datarv   )
r5   r6   r7   r   r   r   r   r   r   r   r=   r=   r=   r>   r      s    

r   	hf_inputsc                 C   sn   |  d}|d ur#ttdtjd|ddtjd|ddtddS ttdtdtdtddS )Nr4   r   r   dim)r@   r1   r2   r4   )r   r   r   batchedflat_from_sizes)r   r4   r=   r=   r>   r     s$   

r   c                
       s   e Zd Zdedeeef deeef deeef def
 fddZ	ded	eeef deee
f fd
dZded	eeef dedee fddZ  ZS )!AudioFlamingo3MultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsr   c                    sR  | dg }|r||d< |dg s)| j |}| |}tt|gdddS | jjdi |}tdi |d|j	i}|d}t
|tsK|g}g }	|j	}
|j}t|
| }tt| }|D ]&}t
|trlt|n|jd }td|| d | }||kr|}|	| qat j||||d	}d
|v r| d
|d< tj|	tjd|d< |S )Naudiosr   )	input_idspt)tensor_typer   r   r$   )r   r   r   r   input_features_maskr2   )rk   r4   r=   )popr   r   get_tokenizerencode_apply_hf_processor_tokens_onlyr	   r   r   r   r`   r<   chunk_lengthr   r   lenshapemaxappendrO   _call_hf_processorr:   tensorlong)rZ   r   r   r   r   r   
prompt_idsr   
audio_listr4   r   r   window_sizemax_windowsr   	n_samplesn_winoutputsr[   r=   r>   r   (  sJ   


z4AudioFlamingo3MultiModalProcessor._call_hf_processorr   hf_processor_mm_kwargsc                 C   s   t |S rv   )r   )rZ   r   r   r=   r=   r>   _get_mm_fields_configa  s   z7AudioFlamingo3MultiModalProcessor._get_mm_fields_configmm_itemsout_mm_kwargsc           	         s   | j jd
i |}| j  }| }t|dd}||  d u r$|j | dddtf fdd}t	d||d	gS )Nr   z<sound>r2   r4   item_idxc                    s<  d ur}d urPt tjr n}t|d |  }||  }|| }t trI|| }t|dkrCt |d tjrCt|}nt|}n|| }nt trZ|  }n|  	d}|d}|d d d }|d d d }	|	 
 }
nd |  }|jd }
|
dkrtd gt|
 }tj| dS )Nr   r^   r$   rH   r@   zAudio is too short)embed_token_id)r`   r:   r;   tolistsumr<   r   ra   r   	unsqueezeitemr   
ValueErrorr   r    select_token_id)r   counts	start_idxcountend_idx	mask_listmaskrs   conv_lengthsaudio_output_lengthsnum_featuresr@   audio_tokensaudio_token_idr4   r2   out_mm_datar=   r>   get_replacement_audioflamingo3|  sB   







z]AudioFlamingo3MultiModalProcessor._get_prompt_updates.<locals>.get_replacement_audioflamingo3r   )r   targetreplacementr=   )
r   r   r   	get_vocabrW   r   r   get_datar   r   )	rZ   r   r   r   	processor	tokenizervocabr   r   r=   r   r>   _get_prompt_updatesh  s"   



0z5AudioFlamingo3MultiModalProcessor._get_prompt_updates)r5   r6   r7   r   r   r   r   r   r	   r   r   r   r   r   r   r   r   rw   r=   r=   r[   r>   r   %  s8    


9



r   )r   dummy_inputsc                       s  e Zd ZdZg dddgdZdefddZd	d
dedef fddZ	de
dedB fddZdedejeejdf B fddZde
defddZ		d%dejdB dejdedB dejdB de
dejeB fddZdejdejdB fd d!Zd"eeeejf  dee fd#d$Z  ZS )&&AudioFlamingo3ForConditionalGenerationz
    AudioFlamingo3 model for conditional generation.

    This model integrates a Whisper-based audio encoder with a Qwen2 language model.
    It supports multi-chunk audio processing.
    )q_projk_projv_proj	gate_projup_proj)qkv_projgate_up_projr   c                 C   s   t jddddS )z<
        Get the module prefix in multimodal models
        zlanguage_model.zmulti_modal_projector.zaudio_tower.)language_model	connectortower_model)r   from_string_fieldr   r=   r=   r>   get_mm_mapping  s
   z5AudioFlamingo3ForConditionalGeneration.get_mm_mapping )prefixvllm_configr  c                   s   t    |jj}|j}|jj}|| _|| _|| _| |d t|j	| _
t|| _W d    n1 s5w   Y  | | t||jt|ddgd| _W d    n1 sXw   Y  | jj| _d S )Nr   r   Qwen2ForCausalLM)r  	hf_configr  architectures)rO   rP   model_configr  quant_configmultimodal_configrG   _mark_tower_modelrF   r{   audio_towerrx   multi_modal_projector_mark_language_modelr*   r}   r+   r   make_empty_intermediate_tensors)rZ   r  r  rG   r  r	  r[   r=   r>   rP     s.   

	z/AudioFlamingo3ForConditionalGeneration.__init__r   Nc                 K   sx   | dd }| dd }| dd }| dd }|d u r"|d u r"d S |d ur,td|dS |d ur8td|||dS td)	Nr1   r@   r2   r4   )r.   r@   r-   )r.   r1   r2   r4   z This line should be unreachable.)r   r?   r,   AssertionError)rZ   r   r1   r@   r2   r4   r=   r=   r>   _parse_and_validate_audio_input  s$   zFAudioFlamingo3ForConditionalGeneration._parse_and_validate_audio_inputaudio_input.c                 C   s8  |d dkr|d }t |S |d }|d }|d}t|tr.tj|dd}tj|dd}|d u r;dg|jd  }n!t|tjrF| }nt|tr\|r\t|d tjr\d	d
 |D }|	d}|d d d }|d d d }|j\}	}
}|d d d }tj
d||j|jdd|	|}|d|	|}||k}||	dd||	d||}|j| jjjj| jjjjd}td||< | j||d}| |}|j\}}}|d}t
||||j|k }|| d|}t||  }g }d}|D ]}||||  }|tj|dd ||7 }qt |S )Nr.   r@   r1   r2   r4   r   r   r$   c                 S   s   g | ]}|  qS r=   )r   ).0cr=   r=   r>   
<listcomp>  s    zOAudioFlamingo3ForConditionalGeneration._process_audio_input.<locals>.<listcomp>r^   rH   )rk   devicez-inf)r]   )tupler   r`   r<   r:   catr   r;   r   r   arangerk   r  r   expandviewrj   r  rd   rh   floatr  splitflattenr   )rZ   r  r@   r1   r2   r4   rs   r   r   
batch_size_max_mel_seq_lenmax_seq_len	seq_rangelengths_expandpadding_maskaudio_attention_mask_audio_attention_maskr-   r3   max_audio_tokens	embed_dimaudio_features_maskmasked_audio_featureschunk_embeddingsgrouped_embeddingscurrent_idxr   audio_chunksr=   r=   r>   _process_audio_input  s   









z;AudioFlamingo3ForConditionalGeneration._process_audio_inputc                 K   s*   | j di |}|d u rg S | |}|S r   )r  r/  )rZ   r   r  r*  r=   r=   r>   embed_multimodale  s
   
z7AudioFlamingo3ForConditionalGeneration.embed_multimodalr   	positionsintermediate_tensorsinputs_embedsc                 K   s$   |d urd }| j j||||d}|S )N)r3  )r   model)rZ   r   r1  r2  r3  r   ro   r=   r=   r>   rr   l  s   z.AudioFlamingo3ForConditionalGeneration.forwardro   c                 C   s   | j |S rv   )r   compute_logits)rZ   ro   r=   r=   r>   r5    s   z5AudioFlamingo3ForConditionalGeneration.compute_logitsweightsc                 C   s   t | }||S rv   )r)   load_weights)rZ   r6  loaderr=   r=   r>   r7    s   
z3AudioFlamingo3ForConditionalGeneration.load_weights)NN)r5   r6   r7   r8   packed_modules_mappingr   r   r   r   rP   r   rE   r  r:   r;   r  r/  r%   r0  r!   rr   r5  r   setr7  rw   r=   r=   r[   r>   r     sN    


^

,r   )Ncollections.abcr   r   r   typingr   r   r   r   r:   torch.nnrQ   transformersr	   r
   "transformers.models.audioflamingo3r   r   transformers.models.qwen2_audior   vllm.configr   vllm.config.multimodalr   %vllm.model_executor.layers.activationr   )vllm.model_executor.models.module_mappingr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   r   r   r   r   vllm.multimodal.processingr   r   r   r   r   r    vllm.sequencer!   vllm.utils.tensor_schemar"   r#   
interfacesr%   r&   r'   r(   utilsr)   r*   r+   r   r,   r?   rE   r9   rF   rT   rx   r   r   r   r   r;   r   r   register_processorr   r=   r=   r=   r>   <module>   sX    
9

 


