o
    
۾i                     @  s$  d Z ddlmZ ddlZddlmZmZmZ ddlm	Z	 ddl
mZ ddlZddlZddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddl m!Z!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z(m)Z)m*Z* ddl+m,Z,m-Z-m.Z. ddl/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5 ddl6m7Z7 ddl8m9Z9 ddl:m;Z;m<Z<m=Z= ddl>m?Z?m@Z@mAZA G dd dejBZCG dd dejBZDG dd dejBZEG dd  d ejBZFG d!d" d"ejBZGG d#d$ d$e2ZHG d%d& d&e0eH ZIG d'd( d(e1eH ZJe&jKeJeHeId)G d*d+ d+ejBe<e=ZLdS ),a  Inference-only FunAudioChat model compatible with HuggingFace weights.

FunAudioChat is a Qwen3 text model augmented with:
  - a continuous audio encoder (Whisper-mel frontend + transformer)
  - a discrete audio encoder (speech tokenizer + projector)

In the HF implementation, audio features are scattered into `<|AUDIO|>` token
positions via `inputs_embeds`, while `position_ids` (RoPE) remains standard 1D.
    )annotationsN)IterableMappingSequence)cached_property)Any)PreTrainedTokenizerFastWhisperFeatureExtractor)get_activation)BatchFeature)BaseModelOutput)
VllmConfig)BaseDummyOptions)MMEncoderAttention)QKVParallelLinearRowParallelLinear)default_weight_loader)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)AudioProcessorItemsMultiModalDataItemsMultiModalDataParser)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdatePromptUpdateDetails)IntermediateTensors)_has_module   )MultiModalEmbeddingsSupportsMultiModal
SupportsPP)AutoWeightsLoaderinit_vllm_registered_modelmaybe_prefixc                      s    e Zd Zd	d
 fddZ  ZS )_SinusoidsPositionEmbedding     @lengthintchannelsmax_timescalefloatc                   s   t    |d dkrtdt||d d  }t| t|d   }t|d d tj	f |tj	d d f  }| j
dtjt|t|gdddd d S )	N   r   z4SinusoidsPositionEmbedding needs even channels inputr"   positional_embeddingdimF)
persistent)super__init__
ValueErrornplogtorchexparanger/   newaxisregister_buffercatsincos)selfr+   r-   r.   log_timescale_incrementinv_timescalesscaled_time	__class__ [/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/funaudiochat.pyr6   =   s   
(
z$_SinusoidsPositionEmbedding.__init__)r*   )r+   r,   r-   r,   r.   r/   )__name__
__module____qualname__r6   __classcell__rH   rH   rF   rI   r)   <   s    r)   c                      s<   e Zd ZdZd fddZdd
dZ		ddddZ  ZS )FunAudioChatAudioAttentionz>Multi-headed attention used inside the continuous audio tower.configr   c                   s  t    t|j| _t|j| _tt|dd| _	| j| j | _
d| _|| _| j
| j | jkr=td| j d| j d| j
d | _d| _d| _d| _t| j| j
| jd	d
| _| jj| _| jj| _| j| j
 | _| j| j
 | _t| j| j
| j| jdd| _t| j| jd	d
| _d S )Nattention_dropout        r"   z8embed_dim must be divisible by num_heads (got embed_dim=z, num_heads=z).g      FTbiaszfunaudiochat_audio_tower.attn)	num_heads	head_sizescalenum_kv_headsprefix)r5   r6   r,   d_model	embed_dimencoder_attention_headstotal_num_headsr/   getattrdropouthead_dimnum_key_value_groupsrO   r7   scalingrP   
is_decoder	is_causalr   qkv_projrT   rW   q_sizekv_sizer   attnr   out_projrB   rO   rF   rH   rI   r6   S   sR   


z#FunAudioChatAudioAttention.__init__weights"Iterable[tuple[str, torch.Tensor]]returnset[str]c                 C  s   g d}t |  }t  | jjd ur| jj  W d    n1 s%w   Y  t }|D ]F\}}|D ]!\}}}	||vr?q5|||}|| }
t	|
dt
}||
||	  n|dra||vraq/|| }
t	|
dt
}||
| || q/|S )N))rd   q_projq)rd   k_projk)rd   v_projvweight_loaderz.bias)dictnamed_parametersr:   no_gradrd   rS   zero_setreplacer]   r   endswithadd)rB   rj   stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_name
shard_nameshard_idparamrt   rH   rH   rI   load_weights   s0   

z'FunAudioChatAudioAttention.load_weightsNhidden_statestorch.Tensor
cu_seqlenstorch.Tensor | Noneattention_maskkwargsobjectc                 K  s   ~~|  \}}| |\}}|j| j| j| jgdd\}}	}
d }|d ur3|dd  |d d   }| j|d|| j|	d|| j|
d|| j||d|d}| |\}}|S )Nr2   r"   )r   
max_seqlen)	sizerd   splitre   rf   maxrg   reshaperh   )rB   r   r   r   r   
seq_length_qkvquery_states
key_statesvalue_statesr   attn_outputoutputrH   rH   rI   forward   s*   z"FunAudioChatAudioAttention.forwardrO   r   rj   rk   rl   rm   NN)
r   r   r   r   r   r   r   r   rl   r   )rJ   rK   rL   __doc__r6   r   r   rM   rH   rH   rF   rI   rN   P   s    
,(rN   c                      s,   e Zd Zd fddZ	ddddZ  ZS )FunAudioChatAudioEncoderLayerrO   r   c                   s   t    t|j| _t|| _t| j| _	t
|j| _tt|j| _t
|j| _t| jt|j| _tt|j| j| _t| j| _d S N)r5   r6   r,   rY   rZ   rN   	self_attnnn	LayerNormself_attn_layer_normr/   r^   r
   stractivation_functionactivation_fnactivation_dropoutLinearencoder_ffn_dimfc1fc2final_layer_normri   rF   rH   rI   r6      s   

z&FunAudioChatAudioEncoderLayer.__init__Nr   r   r   r   r   r   r   rl   tuple[torch.Tensor]c                 K  s   |}|  |}| jd|||d|}|| }|}| |}| | |}tjj|| j| j	d}| 
|}tjj|| j| j	d}|| }|fS )N)r   r   r   )ptrainingrH   )r   r   r   r   r   r   
functionalr^   r   r   r   )rB   r   r   r   r   residualrH   rH   rI   r      s,   




z%FunAudioChatAudioEncoderLayer.forwardr   r   )
r   r   r   r   r   r   r   r   rl   r   )rJ   rK   rL   r6   r   rM   rH   rH   rF   rI   r      s    r   c                      s^   e Zd ZdZd+ fddZed,dd	Zd-ddZd.ddZ		d/d0d$d%Z	d1d)d*Z
  ZS )2FunAudioChatAudioEncoderzContinuous audio tower.rO   r   c                   s   t     | _t j}t j| _t j| _t jr"|d nd| _	t j
| _
tj| j|ddd| _tj||dddd| _t fdd	tt jD | _t|| _tjddd
| _t|t j| _t| j|| _tdt j| _d S )Ng      ?g      ?   r"   )kernel_sizepaddingr0   )r   strider   c                   s   g | ]}t  qS rH   )r   ).0r   rO   rH   rI   
<listcomp>  s    z5FunAudioChatAudioEncoder.__init__.<locals>.<listcomp>)r   )r5   r6   rO   r,   rY   num_mel_binsmax_source_positionsboolscale_embeddingembed_scalen_windowr   Conv1dconv1conv2
ModuleListrangeencoder_layerslayersr   ln_post	AvgPool1d
avg_poolerr   
output_dimprojr)   r1   	Embeddingaudio_bos_eos_token)rB   rO   rZ   rF   r   rI   r6      s*   


z!FunAudioChatAudioEncoder.__init__rl   torch.dtypec                 C  s
   | j jjS r   )r   weightdtyperB   rH   rH   rI   r     s   
zFunAudioChatAudioEncoder.dtypeinputs_tensorr   r   r   c                 C  s   t | jdddkrd S |jd }tjdd||ft|jj|j|jd}t	dt
|D ]}t||d   }t||  }d|d||||f< q+|S )N_attn_implementationeagerflash_attention_2r   r"   devicer   .)r]   rO   shaper:   fullfinfor   minr   r   lenr,   item)rB   r   r   r   r   istartendrH   rH   rI   _prepare_attention_mask  s   

z0FunAudioChatAudioEncoder._prepare_attention_maskinput_featuresfeature_lensaftercnn_lensspeech_maxlenr,   r   r   r   c           +        s  t |dkrtdstdt| jd jjddstdt |d}|j}|dk}t	
|d }	|	 dkrOt | jj}
tt	j|||
f|| jjjdd	S |j| d
d  fdd|	D }t	j|d
d}|| }|| }t	|| jd   }g }| jd }t|D ].\}}t ||  }|dkrq||g|d
   t | | }|dkr|}|| qt	j|t	j|d}|j| d
d}| j||ddd\}}}tj | !|| }tj | "|#d
d}|| j$j$d |j%d
 d d f &d'|j }|| }t	t	jd
|jt	j(d|)d
*df't	j(}| jD ]}||fd|i|\}q$|j| dd}g } g }!|D ]2}"t |"j%d }#|#dkrbtjj+|"#dd
ddd#dd
}$n|"}$| |$ |!t |$j%d  qBt	j| dd}%| | ,|%}&t-|&j|!dd}'|'rt |'d j%d nt | jj}
t	j|||
f|'r|'d jn| jjj|d}(t.|	|'D ]\})}*t/t |*j%d t |}#|*d |# |(t |)d |#f< qt|(d	S )NiL  
flash_attnzFunAudioChat long audio (~300s) requires FlashAttention-2 for the continuous audio tower, but `flash_attn` is not installed in the runtime environment.r   is_flash_attn_backendFzFunAudioChat long audio (~300s) requires FlashAttention for the continuous audio tower, but the selected MM encoder attention backend is not FlashAttention.r   )last_hidden_stater"   r2   c                   s   g | ]} t | qS rH   r,   )r   r   input_features_listrH   rI   r   U  s    z4FunAudioChatAudioEncoder.forward.<locals>.<listcomp>r0   r   r   right)padding_valuepadding_sider   )r   r   r   )0r,   r!   RuntimeErrorr]   r   r   rg   r   r   r:   wherenumelr   out_featuresr   zerosr   r   r   tolistr?   ceilr   long	enumerater   extendappendtensorpadded_and_mask_functionr   r   gelur   r   	transposer1   r   	unsqueezetoint32sumcumsum
avg_pool1dr   listzipr   )+rB   r   r   r   r   r   original_batch_sizer   
valid_maskvalid_indicesr   valid_input_features_listvalid_input_featuresvalid_feature_lensvalid_aftercnn_lens	chunk_numchunk_lengths_listfull_chunk_lenr   r+   num_chunks_for_samplelast_chunk_lenchunk_lengths
chunk_listpadded_featurepadded_maskpadded_mask_after_cnnpadded_embedr   r   encoder_layerhidden_states_listpooled_listpooled_lengthseach_audio_statesseq_lenpooledpooled_concatprocessed_concatprocessed_audio_listoutput_hidden_states	valid_idx	processedrH   r   rI   r   '  s   




 
z FunAudioChatAudioEncoder.forwardrQ   r   tensor_listSequence[torch.Tensor]
tensor_lenr   r/   r   r   /tuple[torch.Tensor, torch.Tensor, torch.Tensor]c                 C  s<  t |  }t |d jd }tjt|||f|| j|d jd}tj	t||ftj
|jd}t|D ]\}	}
t |
 }d||	d |f< ||	 ||	d d d |f< q5|d d d }t |  }tj	t||ftj
|jd}t|D ]\}	}
d||	d t |
 f< qw|dkrtd||d|j| fS )Nr   )r   
fill_valuer   r   r   r"   r0   r   z Only right padding is supported.)r,   r   r   r   r:   r   r   r   r   r   r   r   NotImplementedErrorr  r  r   )rB   r)  r+  r   r   max_lenr3   padded_tensor
batch_maskr   r+   
length_valfeature_lens_after_cnnmax_len_after_cnnbatch_mask_after_cnnrH   rH   rI   r     s<   
z1FunAudioChatAudioEncoder.padded_and_mask_functioninput_lengthstorch.LongTensor)tuple[torch.LongTensor, torch.LongTensor]c                 C  s(   |d d d }|d d d }||fS )Nr"   r0   rH   rB   r6  output_lengthsrH   rH   rI    _get_feat_extract_output_lengths  s   z9FunAudioChatAudioEncoder._get_feat_extract_output_lengthsr   )rl   r   )r   r   r   r   rl   r   )r   r   r   r   r   r   r   r,   r   r   rl   r   )rQ   r   )
r)  r*  r+  r   r   r/   r   r   rl   r,  r6  r7  rl   r8  )rJ   rK   rL   r   r6   propertyr   r   r   r   r;  rM   rH   rH   rF   rI   r      s    

 ,r   c                      s>   e Zd ZdZd fddZ			ddddZdddZ  ZS )FunAudioChatDiscreteEncoderz@Discrete audio encoder (speech tokenizer -> grouped embeddings).rO   r   c                   s   t    t|j| _t|j| _t|j| _t|dd| _	t
t|j| j| j| _t
j| j| jdd| _t
j| j| jdd| _d S )Ncontinuous_features_moder|   FrR   )r5   r6   r,   pad_token_idpadding_idx
group_sizer   hidden_sizer]   r?  r   r   codebook_sizeembed_tokensr   output_matchingcontinual_output_matchingri   rF   rH   rI   r6     s   

z$FunAudioChatDiscreteEncoder.__init__N	audio_idsr   continuous_audio_featuresr   continuous_audio_output_lengthsfeature_exist_maskrl   c                 C  s   ~|  |}||jd d| j| j }||jd d| j| jjdd}| |}|d uri||jd d| j| jjdd}| |}|d u rVtj	|jd ftj
|jd}| jdkre||  |7  < |S |||< |S )Nr   r   r0   r2   r   r|   )rE  r   r   rB  rC  meanrF  rG  r:   onesr   r   r?  )rB   rH  rI  rJ  rK  inputs_embedsr   continuous_audio_hidden_statesrH   rH   rI   r     sD   



z#FunAudioChatDiscreteEncoder.forwardr6  r7  r8  c                 C  s   || j  d | j  }||fS Nr"   )rB  r9  rH   rH   rI   r;  !  s   z<FunAudioChatDiscreteEncoder._get_feat_extract_output_lengthsr   )NNN)
rH  r   rI  r   rJ  r   rK  r   rl   r   r<  )rJ   rK   rL   r   r6   r   r;  rM   rH   rH   rF   rI   r>    s    *r>  c                   @  sz   e Zd ZU dZded< edddZed d	d
ZdddZd ddZ	dd Z
d!ddZd"ddZd#ddZd"ddZdS )$FunAudioChatProcessingInfo   r,   	token_fpsrl   r	   c                 C  s   t | jS r   )r	   from_pretrainedmodel_idr   rH   rH   rI   feature_extractor+  s   z,FunAudioChatProcessingInfo.feature_extractorr   c                 C  s   t j| jddS )Nspeech_tokenizer)	subfolder)r   rT  rU  r   rH   rH   rI   rW  /  s   z+FunAudioChatProcessingInfo.speech_tokenizerc                 C     | j S r   )rV  r   rH   rH   rI   get_feature_extractor5     z0FunAudioChatProcessingInfo.get_feature_extractorc                 C  rY  r   )rW  r   rH   rH   rI   get_speech_tokenizer8  r[  z/FunAudioChatProcessingInfo.get_speech_tokenizerc                 C  s   t t| jj|  |  dS )N)	target_srtarget_channelsexpected_hidden_size)r   r,   rV  sampling_rateget_target_channels_get_expected_hidden_sizer   rH   rH   rI   get_data_parser;  s
   
z*FunAudioChatProcessingInfo.get_data_parserMapping[str, int | None]c                 C  s   dd iS )NaudiorH   r   rH   rH   rI   get_supported_mm_limitsB  s   z2FunAudioChatProcessingInfo.get_supported_mm_limitsc                 C     dS rP  rH   r   rH   rH   rI   ra  E  s   z.FunAudioChatProcessingInfo.get_target_channelsr!  	mm_countsMapping[str, int]Mapping[str, int] | Nonec                 C  s,   |   }t|dd }tt|dd}d|iS )Naudio_configr     re  get_hf_configr]   r,   )rB   r!  rh  cfg	audio_cfgmax_audio_tokensrH   rH   rI   get_mm_max_tokens_per_itemH  s   z5FunAudioChatProcessingInfo.get_mm_max_tokens_per_itemc                 C  s$   |   }t|dd }tt|ddS )Nrk  rB     rm  )rB   ro  rp  rH   rH   rI   get_audio_group_sizeT  s   z/FunAudioChatProcessingInfo.get_audio_group_sizeN)rl   r	   )rl   r   )rl   rd  )rl   r,   )r!  r,   rh  ri  rl   rj  )rJ   rK   rL   rS  __annotations__r   rV  rW  rZ  r\  rc  rf  ra  rr  rt  rH   rH   rH   rI   rQ  (  s   
 




rQ  c                   @  s$   e Zd ZdddZ	ddddZdS )FunAudioChatDummyInputsBuilderrh  ri  rl   r   c                 C  s   | dd}dt| S )Nre  r   #<|audio_bos|><|AUDIO|><|audio_eos|>)getr,   )rB   rh  
num_audiosrH   rH   rI   get_dummy_text]  s   z-FunAudioChatDummyInputsBuilder.get_dummy_textNr!  r,   
mm_options%Mapping[str, BaseDummyOptions] | Noner   c                 C  s   | j  }t|j}| j  }t|dd }tt|dd}| j  }	tt| j dd}
td|td|	 }td|| |
 d |
 }t|dd}|rQ|dnd }d| j	|||d	iS )
Nrk  r   rl  rS  rR  r"   re  r   )r+   ry  	overrides)
inforZ  r,   r`  rn  r]   rt  r   rx  _get_dummy_audios)rB   r!  rh  r{  rV  r`  ro  rp  rq  rB  rS  target_num_frames	audio_lenry  audio_overridesrH   rH   rI   get_dummy_mm_dataa  s(   



z0FunAudioChatDummyInputsBuilder.get_dummy_mm_data)rh  ri  rl   r   r   )r!  r,   rh  ri  r{  r|  rl   r   )rJ   rK   rL   rz  r  rH   rH   rH   rI   rv  Z  s    
rv  c                   @  s4   e Zd Zdd	d
ZdddZdddZd ddZdS )!FunAudioChatMultiModalProcessorpromptr   mm_dataMapping[str, object]	mm_kwargs
tok_kwargsrl   r   c                 C  s  | j  }t|j|fi |g}|dg }|s td|iS | j  }t|j	}	tt
|ddp2d}
g }g }| j  }|jpAd}|D ]V}t|tjrT|   }tj|tjd}|
dkrv|jd |
k rvtj|d|
|jd  fdd	}|| tt|jd t|	 t| j j }||td
t|  qD| j  }||ddd|dd}|||	dddd}|d |d |d |d tjt|ftjdd}td|i|S )Naudios	input_idsn_ffti  z<|audio_pad|>r   r   constant)moder"   TFpt)return_attention_maskreturn_token_type_idsr   pad_to_multiple_ofreturn_tensors
max_length)r`  r  r   r  r   r   
speech_idsspeech_attention_maskr   feature_attention_maskrK  )r~  get_tokenizerr:   r   encoderx  r   rZ  r,   r`  r]   r\  	pad_token
isinstanceTensordetachcpunumpyr8   asarrayfloat32r   padr   r/   rS  r   rt  rM  r   r   )rB   r  r  r  r  	tokenizerr  r  rV  srmin_sampleswavsspeech_strsrW  r  re  audio_np
num_framesaudio_group_sizespeech_inputs
wav_inputs	mm_inputsrH   rH   rI   _call_hf_processor  s`   





 
		z2FunAudioChatMultiModalProcessor._call_hf_processorprompt_textmm_itemsr   hf_processor_mm_kwargstokenization_kwargsr   c                 C  rg  )NFrH   )rB   r  r  r  r  rH   rH   rI   _hf_processor_applies_updates  s   z=FunAudioChatMultiModalProcessor._hf_processor_applies_updates	hf_inputs#Mapping[str, MultiModalFieldConfig]c                 C  s.   t dt dt dt dt ddS )Nre  r  )r   batched)rB   r  r  rH   rH   rI   _get_mm_fields_config  s   z5FunAudioChatMultiModalProcessor._get_mm_fields_configout_mm_kwargsr   Sequence[PromptUpdate]c                   s   | j  }| }d}|| | }|d}|d u rg  nt|tjs'J |d}	| j 	 }
|	|
 d |
 
  d fdd}td	||d
gS )Nz	<|AUDIO|>r  r   r"   item_idxr,   c                   sZ    rt  |  nd}|dkr!dt}|| }td| dg| }tj|dS )Nr"   r   re  zThe audio (len=z1) is too short to be represented inside the model)embed_token_id)r,   	get_itemsr   get_audio_lengthr7   r   select_token_id)r  num_featuresr  r  audio_tokensaudio_output_lengthsaudio_token_idr  rH   rI   get_replacement_funaudiochat  s   


zYFunAudioChatMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_funaudiochatre  )modalitytargetreplacement)r  r,   )r~  r  	get_vocabget_datarx  r  r:   r  r  rt  r   r   )rB   r  r  r  r  vocabaudio_tokenout_mm_datar  speech_lengthsrB  r  rH   r  rI   _get_prompt_updates  s*   



z3FunAudioChatMultiModalProcessor._get_prompt_updatesN)
r  r   r  r  r  r  r  r  rl   r   )
r  r   r  r   r  r  r  r  rl   r   )r  r   r  r  rl   r  )r  r   r  r  r  r   rl   r  )rJ   rK   rL   r  r  r  r  rH   rH   rH   rI   r    s
    

C
	r  )r~  dummy_inputsc                      sj   e Zd Zed.ddZd	d
d/ fddZd0ddZd1ddZ		d2d3d$d%Zd4d'd(Z	d5d,d-Z
  ZS )6$FunAudioChatForConditionalGenerationr  r   r   r,   rl   
str | Nonec                 C  s   | drdS td)Nre  rw  z Only audio modality is supported)
startswithr7   )clsr  r   rH   rH   rI   get_placeholder_str  s   
z8FunAudioChatForConditionalGeneration.get_placeholder_str )rX   vllm_configr   rX   c                  s   t    |jj}|j}|jj}|| _|| _|| _| |d t|j	| _
t|j	| _W d    n1 s6w   Y  | | t||jt|ddgd| _W d    n1 sYw   Y  | jj| _d S )Nre  language_modelQwen3ForCausalLM)r  	hf_configrX   architectures)r5   r6   model_configr  quant_configmultimodal_configrO   _mark_tower_modelr   rk  continuous_audio_towerr>  audio_tower_mark_language_modelr'   text_configr(   r  make_empty_intermediate_tensors)rB   r  rX   rO   r  r  rF   rH   rI   r6     s*   

	z-FunAudioChatForConditionalGeneration.__init__r   r   r  r   !tuple[torch.Tensor, torch.Tensor]c           
      C  s   |  dkr6|jd |jd kr6tt|jd t|jd }|d d d |f }|d d d d d |f }tj|dd}|ddd|  dd}| j	|\}}| j||||d}	|	j
|fS )Nr   r"   r   r2   r   r0   )r   r   r   )r3   r   r   r,   r:   r  permuter   r  r;  r   )
rB   r   r  r   min_lenr   flat_featuresaudio_feat_lengthsr  audio_outputsrH   rH   rI   _get_continuous_audio_features7  s,   

zCFunAudioChatForConditionalGeneration._get_continuous_audio_featuresr   r   r#   c           !        s  | d}| d}| d}| d}| d}|d u rg S tt| jdd}t|tjst|ttfr~t	|dkr~t
dd	 |D r~g }|D ]*}	|	 d
kr\|	jd dkr\|	d}	|	 dkrmtdt|	j d||	 qHtjjj|d|d}n
tdt| d|d u r||jtjd}t|tjst|ttfrt	|dkrt
dd	 |D rg }
|D ]*}	|	 d
kr|	jd dkr|	d}	|	 dkrtdt|	j d|
|	 qtjjj|
ddd}n
tdt| dtdddk}|rtdt|j dt|j dd t| jjdd }td| dd t| jdr@| jjj}tdt |! " ddd zG| jj#d j$}t |j%j! " }t |j&j! " }t |j'j! " }t |j(j! " }td |dd!|dd"|dd#|ddd W n
 t)y   Y nw t|tjrtd$t|j dd t|tjrtd%t|j dd t| jj*}t|jd& }|| d | | }||krt| jj+}|| }tj,j-|d|f|d'}tj,j-|d|fdd'}t|jd& }d }d }|d ur%|d ur%t|tjsJ t|tjsJ | j.|||d(\}}|d u r8tj/|jd ftj0|j1d)}t|tjsAJ | j||||d* | j2|3d&\}}|4 }t fd+d	t5|D }|r
d,d- |D }td.| dd |r|d }td/|j6 d0|j1 d1t0t7|8  d2t |! " ddd td3d}|r
|jd dkr
t	|dkr
|d d4kr
tj9:|st;<||d =   > ?  td5| dd |@d6d7} |d ur
tj9:| s
t;<| |=   > ?  td8|  dd |S )9Nr  r  r   r  rK  rA  r   c                 s      | ]	}t |tjV  qd S r   r  r:   r  r   trH   rH   rI   	<genexpr>j      zHFunAudioChatForConditionalGeneration.embed_multimodal.<locals>.<genexpr>r0   r"   z@FunAudioChat speech_ids must be a 1D tensor per item (got shape=)T)batch_firstr   zGFunAudioChat speech_ids must be a Tensor or a sequence of Tensors (got r  c                 s  r  r   r  r  rH   rH   rI   r    r  zKFunAudioChat speech_attention_mask must be a 1D tensor per item (got shape=zRFunAudioChat speech_attention_mask must be a Tensor or a sequence of Tensors (got VLLM_FUN_AUDIOCHAT_DEBUGr  1z+[FunAudioChat] embed_multimodal speech_ids=z speech_attention_mask=)flushr   z[FunAudioChat] audio_attn_impl=r   z[FunAudioChat] conv1_w_norm=z.6gz[FunAudioChat] attn0_q_norm=z k_norm=z v_norm=z o_norm=z[FunAudioChat] input_features=z&[FunAudioChat] feature_attention_mask=r   )value)r   r  r   r   )rI  rJ  rK  c                 3  s(    | ]\}} |d t |f V  qd S r   r   )r   r   r+   audio_featuresrH   rI   r    s    
c                 S  s   g | ]	}t |jd  qS )r   )r,   r   r  rH   rH   rI   r     s    zIFunAudioChatForConditionalGeneration.embed_multimodal.<locals>.<listcomp>z)[FunAudioChat] embed_multimodal out_lens=z[FunAudioChat] embed0 dtype=z device=z nan=z norm=VLLM_FUN_AUDIOCHAT_DUMP_PATH
   z [FunAudioChat] dumped embeds to z.npyz	_cont.npyz$[FunAudioChat] dumped continuous to )Arx  r,   r]   r  r  r:   r  r  tupler   allr3   r   squeeze	TypeErrorr   r   utilsrnnpad_sequencetypener  int64osgetenvprintr  rO   hasattrr   r   r/   normr   r   r   rn   rp   rr   rh   	ExceptionrB  rA  r   r  r  rM  r   r   r;  r  r   r   r   isnananypathexistsr8   saver  r  r  rz   )!rB   r   r  r  r   r  rK  pad_idspeech_ids_tensorsr  mask_tensorsdebug	attn_implconv1_wattn0q_normk_normv_normo_normrB  r   
target_lenpad_lenrI  rJ  r   r  lengthsembeds
embed_lenst0	dump_path	cont_pathrH   r  rI   embed_multimodalZ  s|  













 
z5FunAudioChatForConditionalGeneration.embed_multimodalNr  	positionsintermediate_tensorsIntermediateTensors | NonerN  r   "torch.Tensor | IntermediateTensorsc                 K  s"   ~|d urd }| j j||||dS )N)rN  )r  model)rB   r  r&  r'  rN  r   rH   rH   rI   r      s   z,FunAudioChatForConditionalGeneration.forwardr   c                 C  s   | j |S r   )r  compute_logits)rB   r   rH   rH   rI   r+  3  s   z3FunAudioChatForConditionalGeneration.compute_logitsrj   rk   rm   c                 C  s   t | dgd}||S )Nzaudio_invert_tower.)skip_prefixes)r&   r   )rB   rj   loaderrH   rH   rI   r   6  s   
z1FunAudioChatForConditionalGeneration.load_weights)r  r   r   r,   rl   r  )r  r   rX   r   )r   r   r  r   r   r,   rl   r  )r   r   rl   r#   r   )r  r   r&  r   r'  r(  rN  r   r   r   rl   r)  )r   r   rl   r   r   )rJ   rK   rL   classmethodr  r6   r  r%  r   r+  r   rM   rH   rH   rF   rI   r    s    

# K
r  )Mr   
__future__r   r  collections.abcr   r   r   	functoolsr   typingr   r  r8   r:   torch.nnr   transformersr   r	   transformers.activationsr
   %transformers.feature_extraction_utilsr   transformers.modeling_outputsr   vllm.configr   vllm.config.multimodalr   9vllm.model_executor.layers.attention.mm_encoder_attentionr   !vllm.model_executor.layers.linearr   r   -vllm.model_executor.model_loader.weight_utilsr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   r   r   vllm.multimodal.processingr   r   r   r   r   r   vllm.sequencer    vllm.utils.import_utilsr!   
interfacesr#   r$   r%   r  r&   r'   r(   Moduler)   rN   r   r   r>  rQ  rv  r  register_processorr  rH   rH   rH   rI   <module>   s\   
 t- tD
2
( 