o
    -iW                     @   s$  U d dl mZmZmZ d dlmZmZmZmZ d dl	Z	d dl
mZ d dlmZmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZm Z m!Z! d dl"m#Z#m$Z$m%Z%m&Z&m'Z' d dl(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z. d dl/m0Z0 d dl1m2Z2m3Z3 ddl4m5Z5m6Z6m7Z7m8Z8 ddl9m:Z:m;Z;m<Z< dZ=G dd de2Z>G dd de2Z?e>e?B Z@eeAd< G dd deZBG dd dejCZDG dd de+ZEG d d! d!e)eE ZFd"eeGe	jHf fd#d$ZIG d%d& d&e'ZJG d'd( d(e*eE ZKejLeKeEeFd)G d*d+ d+ejCe7e8e6ZMdS ),    )IterableMappingSequence)	AnnotatedAnyLiteral	TypeAliasN)BatchFeaturePretrainedConfig)AudioFlamingo3ConfigAudioFlamingo3Processor)Qwen2AudioEncoder)
VllmConfig)BaseDummyOptions)
get_act_fn)MultiModelKeys)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)DictEmbeddingItemsModalityDataModalityDataItemsMultiModalDataItemsMultiModalDataParser)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdatePromptUpdateDetails)IntermediateTensors)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPP)AutoWeightsLoaderinit_vllm_registered_modelmaybe_prefixiX  c                   @   sp   e Zd ZU dZed ed< eeje	ej B e
dddf ed< eeje
ddf ed< eeje
d	f ed
< dS )AudioFlamingo3FeatureInputsz
    Dimensions:
        - num_chunks: Number of audio chunks (flattened)
        - nmb: Number of mel bins
        - num_audios: Number of original audio files
    audio_featurestype
num_chunksnmbi  input_featuresfeature_attention_mask
num_audioschunk_countsN)__name__
__module____qualname____doc__r   __annotations__r   torchTensorlistr#    r=   r=   f/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/model_executor/models/audioflamingo3.pyr,   L   s$   
 
r,   c                   @   sF   e Zd ZU dZdZed ed< eee	j
 eddddhdf ed< dS )	AudioFlamingo3EmbeddingInputsz
    Dimensions:
        - bn: Batch size
        - naf: Number of audio features
        - hs: Hidden size (must match the hidden size of language model
          backbone)
    audio_embedsr.   bnnafhs)dynamic_dimsN)r5   r6   r7   r8   r.   r   r9   r   r<   r:   r;   r#   r=   r=   r=   r>   r?   e   s   
 r?   AudioFlamingo3Inputsc                       sV   e Zd Zdef fddZ	ddejeej B dejfddZd	ejfd
dZ	  Z
S )AudioFlamingo3Encoderconfigc                    s    t  | tjddd| _d S )N   )kernel_sizestride)super__init__nn	AvgPool1d
avg_poolerselfrG   	__class__r=   r>   rL   |   s   zAudioFlamingo3Encoder.__init__Nr1   attention_maskc                 C   s   t |tr
t|}tj| |}tj| |}|	dd}|| j
jd |dd d f  |j}| jD ]}|||}|d }q:|ddd}| |}|ddd}| |}|S )Nr   rH   r$   )
isinstancer<   r:   stackrM   
functionalgeluconv1conv2	transposeembed_positionsweightsizetodtypelayerspermuterO   
layer_norm)rQ   r1   rT   hidden_stateslayerlayer_outputsr=   r=   r>   forward   s$   

 




zAudioFlamingo3Encoder.forwardinput_lengthsc                 C   s(   |d d d }|d d d }||fS )z{
        Computes the output length of the convolutional layers and the output length
        of the audio encoder
        r$   rH   r=   )rQ   rj   output_lengthsr=   r=   r>    _get_feat_extract_output_lengths   s   z6AudioFlamingo3Encoder._get_feat_extract_output_lengthsN)r5   r6   r7   r
   rL   r:   r;   r<   ri   rl   __classcell__r=   r=   rR   r>   rF   {   s    
rF   c                       s*   e Zd Zdef fddZdd Z  ZS )!AudioFlamingo3MultiModalProjectorrG   c                    sR   t    tj|jj|jj|jd| _t	|j
| _tj|jj|jj|jd| _d S )N)bias)rK   rL   rM   Linearaudio_confighidden_sizetext_configprojector_biaslinear_1r   projector_hidden_actactlinear_2rP   rR   r=   r>   rL      s   
z*AudioFlamingo3MultiModalProjector.__init__c                 C   s"   |  |}| |}| |}|S rm   )rv   rx   ry   )rQ   r-   rf   r=   r=   r>   ri      s   


z)AudioFlamingo3MultiModalProjector.forward)r5   r6   r7   r
   rL   ri   rn   r=   r=   rR   r>   ro      s    ro   c                   @   sJ   e Zd Zdd ZdefddZdefddZdeee	d	B f fd
dZ
d	S )AudioFlamingo3ProcessingInfoc                 C   s   | j tS rm   )ctxget_hf_configr   rQ   r=   r=   r>   r|      s   z*AudioFlamingo3ProcessingInfo.get_hf_configkwargsc                 K   s   | j jtfi |S rm   )r{   get_hf_processorr   )rQ   r~   r=   r=   r>   r      s   z-AudioFlamingo3ProcessingInfo.get_hf_processorc                 K   s   | j di |}|j}|S Nr=   )r   feature_extractor)rQ   r~   hf_processorr   r=   r=   r>   get_feature_extractor   s   z2AudioFlamingo3ProcessingInfo.get_feature_extractorreturnNc                 C   s   dd iS )Naudior=   r}   r=   r=   r>   get_supported_mm_limits   s   z4AudioFlamingo3ProcessingInfo.get_supported_mm_limits)r5   r6   r7   r|   objectr   r   r   strintr   r=   r=   r=   r>   rz      s
    rz   c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS ) AudioFlamingo3DummyInputsBuilder	mm_countsr   c                 C   s$   | dd}| j }|j}|| S )Nr   r   )getinfor   audio_token)rQ   r   r3   r   r   r=   r=   r>   get_dummy_text   s   
z/AudioFlamingo3DummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc           	      C   sJ   | j  }|j}t| }|dd}|r|dnd }d| j|||diS )Nr   r   )lengthr3   	overrides)r   r   sampling_rateMAX_AUDIO_LENr   _get_dummy_audios)	rQ   r   r   r   r   r   	audio_lenr3   audio_overridesr=   r=   r>   get_dummy_mm_data   s   
z2AudioFlamingo3DummyInputsBuilder.get_dummy_mm_datarm   )
r5   r6   r7   r   r   r   r   r   r   r   r=   r=   r=   r>   r      s    

r   	hf_inputsc                 C   sn   |  d}|d ur#ttdtjd|ddtjd|ddtddS ttdtdtdtddS )Nr4   r   r   dim)r@   r1   r2   r4   )r   dictr   batchedflat_from_sizes)r   r4   r=   r=   r>   _audioflamingo3_field_config   s$   

r   c                       sD   e Zd Zdeeejf ee B de	eef dB f fddZ
  ZS )"AudioFlamingo3MultiModalDataParserdatar   Nc                    s(   t |trt|ddhtdS t |S )Nr   r@   )modalityrequired_fieldsfields_factory)rW   r   r   r   rK   _parse_audio_data)rQ   r   rR   r=   r>   r     s   
z4AudioFlamingo3MultiModalDataParser._parse_audio_data)r5   r6   r7   r   r   r:   r;   r   r   r   r   rn   r=   r=   rR   r>   r     s    r   c                
       s   e Zd ZdefddZdedeeef deee	f deeef de
f
 fdd	Zd
e
deeef deeef fddZdedeeef dedee fddZ  ZS )!AudioFlamingo3MultiModalProcessorr   c                 C   s   | j  }t|jdS )N)	target_sr)r   r   r   r   )rQ   r   r=   r=   r>   _get_data_parser  s   
z2AudioFlamingo3MultiModalProcessor._get_data_parserpromptmm_data	mm_kwargs
tok_kwargsc                    sR  | dg }|r||d< |dg s)| j |}| |}tt|gdddS | jjdi |}tdi |d|j	i}|d}t
|tsK|g}g }	|j	}
|j}t|
| }tt| }|D ]&}t
|trlt|n|jd }td|| d | }||kr|}|	| qat j||||d	}d
|v r| d
|d< tj|	tjd|d< |S )Naudiosr   )	input_idspt)tensor_typer   r   r$   )r   r   r   r   input_features_maskr2   )rb   r4   r=   )popr   r   get_tokenizerencode_apply_hf_processor_tokens_onlyr	   r   r   r   rW   r<   chunk_lengthr   r   lenshapemaxappendrK   _call_hf_processorr:   tensorlong)rQ   r   r   r   r   r   
prompt_idsr   
audio_listr4   r   r   window_sizemax_windowsr   	n_samplesn_winoutputsrR   r=   r>   r     sJ   


z4AudioFlamingo3MultiModalProcessor._call_hf_processorr   hf_processor_mm_kwargsc                 C   s   t |S rm   )r   )rQ   r   r   r=   r=   r>   _get_mm_fields_configW  s   z7AudioFlamingo3MultiModalProcessor._get_mm_fields_configmm_itemsout_mm_kwargsc           	         s   | j jd
i |}| j  }| }t|dd}||  d u r$|j | dddtf fdd}t	d||d	gS )Nr   z<sound>r2   r4   item_idxc                    s<  d ur}d urPt tjr n}t|d |  }||  }|| }t trI|| }t|dkrCt |d tjrCt|}nt|}n|| }nt trZ|  }n|  	d}|d}|d d d }|d d d }	|	 
 }
nd |  }|jd }
|
dkrtd gt|
 }tj| dS )Nr   rU   r$   rH   r@   zAudio is too short)embed_token_id)rW   r:   r;   tolistsumr<   r   rX   r   	unsqueezeitemr   
ValueErrorr   r    select_token_id)r   counts	start_idxcountend_idx	mask_listmaskrj   conv_lengthsaudio_output_lengthsnum_featuresr@   audio_tokensaudio_token_idr4   r2   out_mm_datar=   r>   get_replacement_audioflamingo3r  sB   







z]AudioFlamingo3MultiModalProcessor._get_prompt_updates.<locals>.get_replacement_audioflamingo3r   )r   targetreplacementr=   )
r   r   r   	get_vocabgetattrr   r   get_datar   r   )	rQ   r   r   r   	processor	tokenizervocabr   r   r=   r   r>   _get_prompt_updates^  s"   



0z5AudioFlamingo3MultiModalProcessor._get_prompt_updates)r5   r6   r7   r   r   r   r   r   r   r   r	   r   r   r   r   r   r   r   r   rn   r=   r=   rR   r>   r     s:    


9



r   )r   dummy_inputsc                       s  e Zd ZdZg dddgdZdefddZd	d
dedef fddZ	de
dedB fddZdedejeejdf B fddZde
defddZ		d%dejdejdedB dejdB de
dejeB fddZdejdejdB fd d!Zd"eeeejf  dee fd#d$Z  ZS )&&AudioFlamingo3ForConditionalGenerationz
    AudioFlamingo3 model for conditional generation.

    This model integrates a Whisper-based audio encoder with a Qwen2 language model.
    It supports multi-chunk audio processing.
    )q_projk_projv_proj	gate_projup_proj)qkv_projgate_up_projr   c                 C   s   t jddddS )z<
        Get the module prefix in multimodal models
        zlanguage_model.zmulti_modal_projector.zaudio_tower.)language_model	connectortower_model)r   from_string_fieldr}   r=   r=   r>   get_mm_mapping  s
   z5AudioFlamingo3ForConditionalGeneration.get_mm_mapping )prefixvllm_configr   c                   s   t    |jj}|j}|jj}|| _|| _|| _| |d t|j	| _
t|| _W d    n1 s5w   Y  | | t||jt|ddgd| _W d    n1 sXw   Y  | jj| _d S )Nr   r   Qwen2ForCausalLM)r   	hf_configr   architectures)rK   rL   model_configr   quant_configmultimodal_configrG   _mark_tower_modelrF   rr   audio_towerro   multi_modal_projector_mark_language_modelr*   rt   r+   r   make_empty_intermediate_tensors)rQ   r   r   rG   r   r   rR   r=   r>   rL     s.   

	z/AudioFlamingo3ForConditionalGeneration.__init__r~   Nc                 K   sx   | dd }| dd }| dd }| dd }|d u r"|d u r"d S |d ur,td|dS |d ur8td|||dS td)	Nr1   r@   r2   r4   )r.   r@   r-   )r.   r1   r2   r4   z This line should be unreachable.)r   r?   r,   AssertionError)rQ   r~   r1   r@   r2   r4   r=   r=   r>   _parse_and_validate_audio_input  s$   zFAudioFlamingo3ForConditionalGeneration._parse_and_validate_audio_inputaudio_input.c                 C   s8  |d dkr|d }t |S |d }|d }|d}t|tr.tj|dd}tj|dd}|d u r;dg|jd  }n!t|tjrF| }nt|tr\|r\t|d tjr\d	d
 |D }|	d}|d d d }|d d d }|j\}	}
}|d d d }tj
d||j|jdd|	|}|d|	|}||k}||	dd||	d||}|j| jjjj| jjjjd}td||< | j||d}| |}|j\}}}|d}t
||||j|k }|| d|}t||  }g }d}|D ]}||||  }|tj|dd ||7 }qt |S )Nr.   r@   r1   r2   r4   r   r   r$   c                 S   s   g | ]}|  qS r=   )r   ).0cr=   r=   r>   
<listcomp>  s    zOAudioFlamingo3ForConditionalGeneration._process_audio_input.<locals>.<listcomp>rU   rH   )rb   devicez-inf)rT   )tupler   rW   r<   r:   catr   r;   r   r   arangerb   r  r   expandviewra   r  r[   r_   floatr  splitflattenr   )rQ   r  r@   r1   r2   r4   rj   r   r   
batch_size_max_mel_seq_lenmax_seq_len	seq_rangelengths_expandpadding_maskaudio_attention_mask_audio_attention_maskr-   r3   max_audio_tokens	embed_dimaudio_features_maskmasked_audio_featureschunk_embeddingsgrouped_embeddingscurrent_idxr   audio_chunksr=   r=   r>   _process_audio_input  s   









z;AudioFlamingo3ForConditionalGeneration._process_audio_inputc                 K   s*   | j di |}|d u rg S | |}|S r   )r  r%  )rQ   r~   r  r   r=   r=   r>   embed_multimodal[  s
   
z7AudioFlamingo3ForConditionalGeneration.embed_multimodalr   	positionsintermediate_tensorsinputs_embedsc                 K   s$   |d urd }| j j||||d}|S )N)r)  )r   model)rQ   r   r'  r(  r)  r~   rf   r=   r=   r>   ri   b  s   z.AudioFlamingo3ForConditionalGeneration.forwardrf   c                 C   s   | j |S rm   )r   compute_logits)rQ   rf   r=   r=   r>   r+  u  s   z5AudioFlamingo3ForConditionalGeneration.compute_logitsweightsc                 C   s   t | }||S rm   )r)   load_weights)rQ   r,  loaderr=   r=   r>   r-  {  s   
z3AudioFlamingo3ForConditionalGeneration.load_weights)NN)r5   r6   r7   r8   packed_modules_mappingr   r   r   r   rL   r   rE   r  r:   r;   r  r%  r%   r&  r!   ri   r+  r   setr-  rn   r=   r=   rR   r>   r     sN    


^

,r   )Ncollections.abcr   r   r   typingr   r   r   r   r:   torch.nnrM   transformersr	   r
   "transformers.models.audioflamingo3r   r   transformers.models.qwen2_audior   vllm.configr   vllm.config.multimodalr   %vllm.model_executor.layers.activationr   )vllm.model_executor.models.module_mappingr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   r   r   r   r   vllm.multimodal.processingr   r   r   r   r   r    vllm.sequencer!   vllm.utils.tensor_schemar"   r#   
interfacesr%   r&   r'   r(   utilsr)   r*   r+   r   r,   r?   rE   r9   rF   Modulero   rz   r   r   r;   r   r   r   register_processorr   r=   r=   r=   r>   <module>   sX    
2

 


