o
    
۾i[t                     @   sv  U d Z ddlmZmZmZmZ ddlmZmZm	Z	m
Z
 ddlZddlmZ ddlmZ ddlmZ ddlmZmZmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZm Z  ddl!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z' ddl(m)Z)m*Z*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7 ddl8m9Z9m:Z:m;Z; e<dZ=G dd de-Z>G dd de-Z?e>e?B Z@e
eAd< deeBejCf fddZDG dd de$ZEG dd de4ZFG d d! d!e6ZGG d"d# d#e3eG ZHG d$d% d%e5eG ZIG d&d' d'ejJZKG d(d) d)ejJZLG d*d+ d+eZMG d,d- d-ZNG d.d/ d/eNe1ZOG d0d1 d1eNe2ZPeOePd2ZQejReIeGeHd3G d4d5 d5eNe1ZSdS )6zCInference-only MiniCPM-O model compatible with HuggingFace weights.    )CallableIterableMappingSequence)	AnnotatedAnyLiteral	TypeAliasN)nn)BatchFeature)BaseModelOutputWithPast)ACT2FNWhisperAttentionWhisperConfigWhisperEncoder)
VllmConfig)BaseDummyOptions)MULTIMODAL_REGISTRYMultiModalKwargsItems)MultiModalDataDictMultiModalFieldConfigNestedTensors)	AudioItemAudioProcessorItemsDictEmbeddingItemsModalityDataModalityDataItemsMultiModalDataItems)PromptReplacementPromptUpdatePromptUpdateDetails)TensorSchemaTensorShape   )_MAX_FRAMES_PER_VIDEOMiniCPMV2_6MiniCPMV4_5MiniCPMVDummyInputsBuilderMiniCPMVMultiModalDataParserMiniCPMVMultiModalProcessorMiniCPMVProcessingInfo_minicpmv_field_config)AutoWeightsLoadercast_overflow_tensorsmaybe_prefixcpuc                   @   sp   e Zd ZU dZdZed ed< eej	e
ej	 B eddddhdf ed< 	 eej	e
ej	 B edd	f ed
< dS )MiniCPMOAudioFeatureInputsz
    Dimensions:
        - bns: Batch size * number of audios * number of slices
        - bn: Batch size * number of audios
        - c: Number of channels
        - l: Length
        - s: Number of slices
    audio_featurestypebnscldynamic_dimsbnsaudio_feature_lensN__name__
__module____qualname____doc__r2   r   __annotations__r   torchTensorlistr"    rD   rD   W/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/minicpmo.pyr0   O   s   
 	r0   c                   @   sL   e Zd ZU dZdZed ed< eej	e
ej	 B eddddhdf ed< dS )	MiniCPMOAudioEmbeddingInputsz
    Dimensions:
        - bn: Batch size * number of audios
        - s: Number of slices
        - h: Hidden size (must match language model backbone)

    Length of each slice may vary, so pass it as a list.
    audio_embedsr2   r8   r9   hr6   Nr;   rD   rD   rD   rE   rF   o   s   
 	rF   MiniCPMOAudioInputs	hf_inputsc                 C   s0   t di t| tdtdtddS )Naudio)r1   r:   rG   rD   )dictr+   r   batched)rJ   rD   rD   rE   _minicpmo_field_config   s   
rN   c                       sP   e Zd Zdeeejf deeeejf geeef f ddf fddZ	  Z
S )MiniCPMOAudioEmbeddingItemsdatafields_factoryreturnNc                    s   t  j|ddh|d d S )NimagerG   )modalityrequired_fieldsrQ   )super__init__)selfrP   rQ   	__class__rD   rE   rW      s   
z$MiniCPMOAudioEmbeddingItems.__init__)r<   r=   r>   r   strrA   rB   r   r   rW   __classcell__rD   rD   rY   rE   rO      s    
rO   c                       sD   e Zd Zdeeejf ee B de	e
e
f dB f fddZ  ZS )MiniCPMOMultiModalDataParserrP   rR   Nc                    s"   t |trt|tdS t |S )N)rQ   )
isinstancerL   rO   rN   rV   _parse_audio_data)rX   rP   rY   rD   rE   r_      s   
z.MiniCPMOMultiModalDataParser._parse_audio_data)r<   r=   r>   rL   r[   rA   rB   r   r   r   r   r_   r\   rD   rD   rY   rE   r]      s    r]   c                	       s   e Zd ZdZdd ZdeeedB f f fddZ			d"d
ede	dedefddZ
defddZdefddZdefddZdefddZdefddZdefddZdedefddZdedeeef defd d!Z  ZS )#MiniCPMOProcessingInfo(<audio>./</audio>)c                 C   s   t |  |  dS )N)	target_srexpected_hidden_size)r]   get_default_audio_sampling_rate_get_expected_hidden_sizerX   rD   rD   rE   get_data_parser   s   z&MiniCPMOProcessingInfo.get_data_parserrR   Nc                    s   i t   dd iS )NrK   )rV   get_supported_mm_limitsrf   rY   rD   rE   rh      s   z.MiniCPMOProcessingInfo.get_supported_mm_limitsTr#   
audio_lenschunk_inputchunk_lengthc                 C   s   |   }|j|||dS N)rj   rk   )get_hf_processorget_audio_placeholder)rX   ri   rj   rk   hf_processorrD   rD   rE   rn      s   z,MiniCPMOProcessingInfo.get_audio_placeholderc                 C   s   |   }t|ddS )Naudio_pool_step   )get_hf_configgetattr)rX   	hf_configrD   rD   rE   get_default_audio_pool_step   s   z2MiniCPMOProcessingInfo.get_default_audio_pool_stepc                 C      dS )Ni>  rD   rf   rD   rD   rE   rd         z6MiniCPMOProcessingInfo.get_default_audio_sampling_ratec                 C   s
   |   jS N)rr   audio_chunk_lengthrf   rD   rD   rE   get_chunk_length   s   
z'MiniCPMOProcessingInfo.get_chunk_lengthc                 C   s,   |   }d}|d d d }|| | d S )Nd   r#   rq   )ru   )rX   	pool_stepfbank_feat_in_chunkcnn_feat_in_chunkrD   rD   rE   get_max_audio_tokens_per_chunk   s   z5MiniCPMOProcessingInfo.get_max_audio_tokens_per_chunkc                 C   rv   )N   rD   rf   rD   rD   rE   'get_max_audio_chunks_with_most_features   rw   z>MiniCPMOProcessingInfo.get_max_audio_chunks_with_most_featuresc                 C   s   |   }|  | S rx   )r   r   )rX   
num_chunksrD   rD   rE   get_max_audio_tokens   s   z+MiniCPMOProcessingInfo.get_max_audio_tokensr   c                 C   s$   |   }|  }t|| | d S )Nr#   )rd   r   int)rX   r   sampling_ratenum_tokens_per_chunkrD   rD   rE   get_audio_len_by_num_chunks   s   z2MiniCPMOProcessingInfo.get_audio_len_by_num_chunksseq_len	mm_countsc           
      C   sl   | dd}| dd}| dd}|  | }|  | }| || | }t|t|d t}	t|	dS )NrS   r   videorK   r#   )getget_max_image_tokensr   get_max_video_framesminmaxr$   )
rX   r   r   
max_images
max_videos
max_audiosmax_image_tokensmax_audio_tokensmax_total_framesmax_frames_per_videorD   rD   rE   !get_num_frames_with_most_features   s   

z8MiniCPMOProcessingInfo.get_num_frames_with_most_featuresTr#   )r<   r=   r>   audio_patternrg   r   r[   r   rh   boolrn   ru   rd   rz   r   r   r   r   r   r\   rD   rD   rY   rE   r`      s:    

r`   c                	       sd   e Zd Zdeeef def fddZ	d
dedeeef deeef dB def fdd	Z	  Z
S )MiniCPMODummyInputsBuilderr   rR   c                    s(   | dd}| jj| }t || S )NrK   r   )r   infor   rV   get_dummy_text)rX   r   
num_audiosaudio_prompt_textsrY   rD   rE   r      s   z)MiniCPMODummyInputsBuilder.get_dummy_textNr   
mm_optionsc                    s^   | dd}| j | j  }|r| dnd }d| j|||di}i t ||||S )NrK   r   )lengthr   	overrides)r   r   r   rd   _get_dummy_audiosrV   get_dummy_mm_data)rX   r   r   r   r   	audio_lenaudio_overridesaudio_mm_datarY   rD   rE   r     s   z,MiniCPMODummyInputsBuilder.get_dummy_mm_datarx   )r<   r=   r>   r   r[   r   r   r   r   r   r\   rD   rD   rY   rE   r      s    
r   c                
       s   e Zd Z		ddedededefddZd	eeef d
eeef deeef deee	f fddZ
d	eeef d
eeef deeef deee	f f fddZdedeeef dedee f fddZdedeeef deeef fddZ  ZS )MiniCPMOMultiModalProcessorTr#   ri   rj   rk   rR   c                 C   s   | j j|||dS rl   )r   rn   )rX   ri   rj   rk   rD   rD   rE   get_audio_prompt_texts  s
   z2MiniCPMOMultiModalProcessor.get_audio_prompt_textsmm_data	mm_kwargs
tok_kwargsc           	      C   s   | d }d u ri S | jjd|idd}|dttf}t|tr&i }|S | j| jjgt	| ddd |D ii |ddi|d	d
hd}dd t
|d	 |d
 D }||d	< |S )NaudiosrK   F)validatec                 S   s   g | ]}|gqS rD   rD   ).0rK   rD   rD   rE   
<listcomp><  s    z>MiniCPMOMultiModalProcessor.process_audios.<locals>.<listcomp>rj   Tr1   r:   )promptsr   r   r   out_keysc                 S   s$   g | ]\}}|d d d |f qS rx   rD   )r   featfeature_lenrD   rD   rE   r   D  s    )r   r   parse_mm_data	get_itemsrO   r   r^   _base_call_hf_processorr   lenzip)	rX   r   r   r   r   mm_itemsparsed_audiosaudio_inputsunpadded_audio_featuresrD   rD   rE   process_audios)  s0   

z*MiniCPMOMultiModalProcessor.process_audiosc                    s"   i t  |||| |||S rx   )rV   process_mm_inputsr   )rX   r   r   r   rY   rD   rE   r   O  s
   z-MiniCPMOMultiModalProcessor.process_mm_inputsr   hf_processor_mm_kwargsout_mm_kwargsc                    sD   t  j ||d}jj}dtf fdd}g |td||dS )N)r   r   r   item_idxc                    s\     dttf}t|tr || d }jttt	|}n|
| }t|dS )NrK   rG   z<unk>)r   rO   r   r^   r   r   r   summapr   get_audio_lengthr    select_textr   )r   r   single_audio_embedsr   r   rX   rD   rE   get_audio_replacementh  s   

zNMiniCPMOMultiModalProcessor._get_prompt_updates.<locals>.get_audio_replacementrK   )rT   targetreplacement)rV   _get_prompt_updatesr   r   r   r   )rX   r   r   r   base_updatesaudio_placeholderr   rY   r   rE   r   Z  s    z/MiniCPMOMultiModalProcessor._get_prompt_updatesrJ   c                 C   s   t |S rx   )rN   )rX   rJ   r   rD   rD   rE   _get_mm_fields_config  s   z1MiniCPMOMultiModalProcessor._get_mm_fields_configr   )r<   r=   r>   r   r   r[   r   r   objectr   r   r   r   r   r   r   r   r   r   r   r\   rD   rD   rY   rE   r     s\    





&




)

r   c                       s<   e Zd Zdedef fddZdejdejfddZ  ZS )	MultiModalProjectorin_dimout_dimc                    s<   t    tj||dd| _t | _tj||dd| _d S )NT)in_featuresout_featuresbias)rV   rW   r
   Linearlinear1ReLUrelulinear2)rX   r   r   rY   rD   rE   rW     s   

zMultiModalProjector.__init__r1   rR   c                 C   s   |  | |}| |}|S rx   )r   r   r   )rX   r1   hidden_statesrD   rD   rE   forward  s   
zMultiModalProjector.forward)	r<   r=   r>   r   rW   rA   rB   r   r\   rD   rD   rY   rE   r     s    r   c                       sB   e Zd Zdedef fddZdejdejdejfdd	Z  Z	S )
MiniCPMWhisperEncoderLayerconfig	layer_idxc                    s   t    |j| _t| j|j|j||d| _t	| j| _
|j| _t|j | _|j| _t| j|j| _t|j| j| _t	| j| _d S )N)	embed_dim	num_headsdropoutr   r   )rV   rW   d_modelr   r   encoder_attention_headsattention_dropout	self_attnr
   	LayerNormself_attn_layer_normr   r   activation_functionactivation_fnactivation_dropoutr   encoder_ffn_dimfc1fc2final_layer_norm)rX   r   r   rY   rD   rE   rW     s    
z#MiniCPMWhisperEncoderLayer.__init__r   attention_maskrR   c                 C   s   |}|  |}| j||d\}}tjj|| j| jd}|| }|}| |}| | |}tjj|| j	| jd}| 
|}tjj|| j| jd}|| }|jtjkrWt|}|f}|S )N)r   r   ptraining)r   r   r
   
functionalr   r   r   r   r   r   r   dtyperA   float16r-   )rX   r   r   residual_outputsrD   rD   rE   r     s2   






z"MiniCPMWhisperEncoderLayer.forward)
r<   r=   r>   r   r   rW   rA   rB   r   r\   rD   rD   rY   rE   r     s    r   c                       sD   e Zd Zdef fddZ	d
dejdejdB defdd	Z  Z	S )MiniCPMWhisperEncoderr   c                    s0   t    t fddt jD | _d S )Nc                    s   g | ]}t  |d qS ))r   )r   )r   ir   rD   rE   r     s    
z2MiniCPMWhisperEncoder.__init__.<locals>.<listcomp>)rV   rW   r
   
ModuleListrangeencoder_layerslayers)rX   r   rY   r   rE   rW     s   

zMiniCPMWhisperEncoder.__init__Ninput_featuresr   rR   c                 C   s  |j | jjj| jjjd}tj| |}tj| |}|	ddd}| j
j}|d |jd d d f }|| }tjj|| j| jd}d}t| jD ](\}}||f }d}	| jrgtg }
|
| jk rgd}	|	rld	}qM|||}|d }qM| |}||f }t||d
S )Nr   devicer   rq   r#   r   rD   FT)NN)last_hidden_stater   )toconv1weightr   r  r
   r   geluconv2permuteembed_positionsshaper   r   	enumerater  rA   rand	layerdrop
layer_normr   )rX   r  r   inputs_embeds	embed_posr   encoder_statesidxencoder_layerto_dropdropout_probabilitylayer_outputsrD   rD   rE   r     sB   






zMiniCPMWhisperEncoder.forwardrx   )
r<   r=   r>   r   rW   rA   rB   r   r   r\   rD   rD   rY   rE   r     s    r   c                       sN  e Zd ZdZg dddgdZedededed	B fd
dZddde	def fddZ
ddde	defddZdeeeejf  dee fddZdedfdedededejdedejfddZd ejfd!d"Zd#edeej fd$d%Zd&eded	B fd'd(Zd&edef fd)d*Zd+edejeej B fd,d-Zd.ef fd/d0Z   Z!S )1MiniCPMOBaseModelz9Base mixin class for MiniCPM-O models with audio support.)q_projk_projv_proj	gate_projup_proj)qkv_projgate_up_projrT   r   rR   Nc                 C   s2   | drdS | drdS | drdS td)NrS   z(<image>./</image>)r   z(<video>./</video>)rK   ra   z0Only image, video or audio modality is supported)
startswith
ValueError)clsrT   r   rD   rD   rE   get_placeholder_str  s   


z%MiniCPMOBaseModel.get_placeholder_str prefixvllm_configr(  c                   X   t  j||d | |d | j|t|dd| _W d    d S 1 s%w   Y  d S Nr)  r(  rK   apmrV   rW   _mark_tower_modelinit_audio_moduler.   r-  rX   r)  r(  rY   rD   rE   rW   )     

"zMiniCPMOBaseModel.__init__c                C   sP   | j j}t|}t|jd }tj| j j| j jd| _t	|| j
d| _d| _|S )N   )stride)r   r   )r   audio_configr   r   r   r
   	AvgPool1drp   audio_avg_poolerr   r   audio_projection_layeraudio_encoder_layer)rX   r)  r(  r6  modelaudio_output_dimrD   rD   rE   r0  1  s   z#MiniCPMOBaseModel.init_audio_moduleweightsc                 C   s   t | dgd}||S )Ntts)skip_prefixes)r,   load_weights)rX   r=  loaderrD   rD   rE   r@  ?  s   
zMiniCPMOBaseModel.load_weightsr5  r   size
chunk_sizenum_left_chunksr  num_lookheadc                 C   s   t j|||t jd}t j||d}|| }|dk rt |}	nt j|| dd}
|
| }	|d }t j|| | |d}t j||dd}|	d}	|d}||	k||k @ }|S )N)r  r   )r  r   )r   r#   )r   )rA   zerosr   arange
zeros_likeclamp	unsqueeze)rX   rB  rC  rD  r  rE  retrow_indiceschunk_indicesstart_indicesstart_chunk_indicesend_chunk_indicesend_indicescol_indicesrD   rD   rE   subsequent_chunk_maskC  s    

z'MiniCPMOBaseModel.subsequent_chunk_maskinput_lengthsc                 C   s>   |d d d }|| j j | j j d }|jtjd}||fS )Nr#   rq   )r   )r   rp   r  rA   int32)rX   rT  input_lengths_after_cnninput_lengths_after_poolingrD   rD   rE    _get_feat_extract_output_lengthsc  s   
z2MiniCPMOBaseModel._get_feat_extract_output_lengthsrP   c           !   	   C   sz  | j j}|d }t|trNt|}|d jd }tdd |D }|d j}|d j}t	j
|||f||d}	t|D ]\}
}|jd }||	|
dd |f< q:n|}	|d	 }t|t	jr_|d}t	|}|	j\}}}|d
 d d
 }t	jd||j|jdd||}|d
||}||k}||d
d
||d
||}|j| jjjj| jjjjd}|dkrt|d }| j||d|jd}t	|t	|}td||< | j|	|dj| j }| |}|d
d}|  |}|d
d}| !|\}}|}tt	j  }d}t"t|D ]1}
tt	j  } t"t||
 D ]}| #||d || d d f  |d
7 }q|#t	$|  q	|S )Nr1   r   c                 s   s    | ]}|j d  V  qdS )r5  N)r  )r   itemrD   rD   rE   	<genexpr>v  s    z<MiniCPMOBaseModel.get_audio_hidden_states.<locals>.<genexpr>r  r5  .r:   r#   rq   2   )rB  rC  rD  r  z-inf)r   )%r   ry   r^   rC   r   r  r   r  r   rA   rF  r  rB   unbindhstackrG  rJ  expandviewr  r-  r  r  r   rS  
logical_orlogical_notfloatr   r:  r9  	transposer8  rX  r   appendcat)!rX   rP   rk   wavforms_rawBCLr  r   wavformsr   wavforms_itemL_itemaudio_feature_lens_rawr:   
batch_sizer   max_mel_seq_lenmax_seq_len	seq_rangelengths_expandpadding_maskaudio_attention_mask_audio_attention_maskchunk_num_frame
chunk_maskaudio_statesrG   feature_lens_after_poolingnum_audio_tokensfinal_audio_embedsr  target_audio_embeds_lstrD   rD   rE   get_audio_hidden_statesl  s   









z)MiniCPMOBaseModel.get_audio_hidden_stateskwargsc                 K   sX   | dd }| dd }|d u r|d u rd S |d ur td|dS | d}td||dS )Nr1   rG   )r2   rG   r:   )r2   r1   r:   )poprF   r0   )rX   r  r1   rG   r:   rD   rD   rE   _parse_and_validate_audio_input  s   
z1MiniCPMOBaseModel._parse_and_validate_audio_inputc                    sD   t  jdi |}|D ]}|dv rd|vr| jdi ||d< q|S )N)r1   rG   r   rD   )rV   %_parse_and_validate_multimodal_inputsr  )rX   r  
modalities	input_keyrY   rD   rE   r    s   z7MiniCPMOBaseModel._parse_and_validate_multimodal_inputsaudio_inputc                 C   s   |d dkr
|d S |  |S )Nr2   rG   )r~  )rX   r  rD   rD   rE   _process_audio_input  s   
z&MiniCPMOBaseModel._process_audio_inputr  c                    s@   t  |}|D ]}|dkr|d }| |}|t|7 }q|S )Nr   )rV   _process_multimodal_inputsr  tuple)rX   r  multimodal_embeddingsrT   r  audio_embeddingsrY   rD   rE   r    s   
z,MiniCPMOBaseModel._process_multimodal_inputs)"r<   r=   r>   r?   packed_modules_mappingclassmethodr[   r   r%  r   rW   r0  r   r  rA   rB   setr@  
CPU_DEVICEr  rS  
LongTensorrX  r0   rC   r~  r   rI   r  rL   r  r  r  r\   rD   rD   rY   rE   r    s\    
$
 	
^

	r  c                       0   e Zd ZdZdddedef fddZ  ZS )MiniCPMO2_6z(MiniCPM-O 2.6 model with Qwen2 backbone.r&  r'  r)  r(  c                   r*  r+  r.  r1  rY   rD   rE   rW     r2  zMiniCPMO2_6.__init__r<   r=   r>   r?   r   r[   rW   r\   rD   rD   rY   rE   r        $r  c                       r  )MiniCPMO4_5z(MiniCPM-O 4.5 model with Qwen3 backbone.r&  r'  r)  r(  c                   r*  r+  r.  r1  rY   rD   rE   rW     r2  zMiniCPMO4_5.__init__r  rD   rD   rY   rE   r    r  r  )rq      )r3     )r   dummy_inputsc                   @   s@   e Zd ZdZdddedefddZdddedefdd	Zd
S )MiniCPMOz
    MiniCPM-O model with audio support.
    Different versions use different LLM backbones:
    - Version 2.6: Uses Qwen2
    - Version 4.5: Uses Qwen3
    r&  r'  r)  r(  c          
   
   C   s   |j j}t|dr;zt|j}|d}tdd |d d D }W n ttfy: } z
td|j d|d }~ww d}t	
|}|d u r^d	d
d tt	 D }	td|	 d| |||dS )Nversion.c                 s   s    | ]}t |V  qd S rx   )r   )r   xrD   rD   rE   r[  7  s    z#MiniCPMO.__new__.<locals>.<genexpr>rq   z(Invalid model version format in config: z5. Expected a dot-separated version string like '4.5'.r  z, c                 S   s"   g | ]}|d   d|d  qS )r   r  r#   rD   )r   vrD   rD   rE   r   E  s   " z$MiniCPMO.__new__.<locals>.<listcomp>z+Currently, MiniCPMO only supports versions z. Got version: r,  )model_configrt   hasattrr[   r  splitr  r#  	TypeError_MINICPMO_SUPPORT_VERSIONr   joinsortedkeys)
r$  r)  r(  r   version_strversion_partsr  einstance_clssupported_versionsrD   rD   rE   __new__/  s8   



zMiniCPMO.__new__c                C   s   d S rx   rD   r1  rD   rD   rE   rW   N  s   zMiniCPMO.__init__N)r<   r=   r>   r?   r   r[   r  rW   rD   rD   rD   rE   r  "  s    r  )Tr?   collections.abcr   r   r   r   typingr   r   r   r	   rA   r
   transformersr   transformers.modeling_outputsr   ,transformers.models.whisper.modeling_whisperr   r   r   r   vllm.configr   vllm.config.multimodalr   vllm.multimodalr   r   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   r   r   r   r   r   vllm.multimodal.processingr   r   r    vllm.utils.tensor_schemar!   r"   minicpmvr$   r%   r&   r'   r(   r)   r*   r+   utilsr,   r-   r.   r  r  r0   rF   rI   r@   r[   rB   rN   rO   r]   r`   r   r   Moduler   r   r   r  r  r  r  register_processorr  rD   rD   rD   rE   <module>   sX    (

 
	L"o7@ v