o
    -i3i                     @   s8  U d Z ddlmZmZmZmZ ddlmZmZm	Z	m
Z
 ddlZddlmZ ddlmZ ddlmZ ddlmZmZmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZm Z  ddl!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z( ddl)m*Z*m+Z+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7 ddl8m9Z9m:Z:m;Z; e<dZ=G dd de.Z>G dd de.Z?e>e?B Z@e
eAd< deeBejCf fddZDG dd de$ZEG dd de4ZFG d d! d!e6ZGG d"d# d#e3eG ZHG d$d% d%e5eG ZIG d&d' d'ejJZKG d(d) d)ejJZLG d*d+ d+eZMejNeIeGeHd,G d-d. d.e2ZOdS )/zCInference-only MiniCPM-O model compatible with HuggingFace weights.    )CallableIterableMappingSequence)	AnnotatedAnyLiteral	TypeAliasN)nn)BatchFeature)BaseModelOutputWithPast)ACT2FNWhisperAttentionWhisperConfigWhisperEncoder)
VllmConfig)BaseDummyOptions)MULTIMODAL_REGISTRYMultiModalKwargsItems)MultiModalDataDictMultiModalFieldConfigNestedTensors)	AudioItemAudioProcessorItemsDictEmbeddingItemsModalityDataModalityDataItemsMultiModalDataItemsMultiModalDataParser)PromptReplacementPromptUpdatePromptUpdateDetails)TensorSchemaTensorShape   )_MAX_FRAMES_PER_VIDEOMiniCPMV2_6MiniCPMVDummyInputsBuilderMiniCPMVMultiModalDataParserMiniCPMVMultiModalProcessorMiniCPMVProcessingInfo_minicpmv_field_config)AutoWeightsLoadercast_overflow_tensorsmaybe_prefixcpuc                   @   sp   e Zd ZU dZdZed ed< eej	e
ej	 B eddddhdf ed< 	 eej	e
ej	 B edd	f ed
< dS )MiniCPMOAudioFeatureInputsz
    Dimensions:
        - bns: Batch size * number of audios * number of slices
        - bn: Batch size * number of audios
        - c: Number of channels
        - l: Length
        - s: Number of slices
    audio_featurestypebnscldynamic_dimsbnsaudio_feature_lensN__name__
__module____qualname____doc__r2   r   __annotations__r   torchTensorlistr#    rD   rD   `/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/model_executor/models/minicpmo.pyr0   O   s   
 	r0   c                   @   sL   e Zd ZU dZdZed ed< eej	e
ej	 B eddddhdf ed< dS )	MiniCPMOAudioEmbeddingInputsz
    Dimensions:
        - bn: Batch size * number of audios
        - s: Number of slices
        - h: Hidden size (must match language model backbone)

    Length of each slice may vary, so pass it as a list.
    audio_embedsr2   r8   r9   hr6   Nr;   rD   rD   rD   rE   rF   o   s   
 	rF   MiniCPMOAudioInputs	hf_inputsc                 C   s0   t di t| tdtdtddS )Naudio)r1   r:   rG   rD   )dictr+   r   batched)rJ   rD   rD   rE   _minicpmo_field_config   s   
rN   c                       sP   e Zd Zdeeejf deeeejf geeef f ddf fddZ	  Z
S )MiniCPMOAudioEmbeddingItemsdatafields_factoryreturnNc                    s   t  j|ddh|d d S )NimagerG   )modalityrequired_fieldsrQ   )super__init__)selfrP   rQ   	__class__rD   rE   rW      s   
z$MiniCPMOAudioEmbeddingItems.__init__)r<   r=   r>   r   strrA   rB   r   r   rW   __classcell__rD   rD   rY   rE   rO      s    
rO   c                       sD   e Zd Zdeeejf ee B de	e
e
f dB f fddZ  ZS )MiniCPMOMultiModalDataParserrP   rR   Nc                    s"   t |trt|tdS t |S )N)rQ   )
isinstancerL   rO   rN   rV   _parse_audio_data)rX   rP   rY   rD   rE   r_      s   
z.MiniCPMOMultiModalDataParser._parse_audio_data)r<   r=   r>   rL   r[   rA   rB   r   r   r   r   r_   r\   rD   rD   rY   rE   r]      s    r]   c                	       s   e Zd ZdZdeeedB f f fddZ		d ded	ed
edefddZ	defddZ
defddZdefddZdefddZdefddZdefddZdedefddZdedeeef defddZ  ZS )!MiniCPMOProcessingInfo(<audio>./</audio>)rR   Nc                    s   i t   dd iS )NrK   )rV   get_supported_mm_limitsrX   rY   rD   rE   rb      s   z.MiniCPMOProcessingInfo.get_supported_mm_limitsTr$   
audio_lenschunk_inputchunk_lengthc                 C   s   |   }|j|||dS N)re   rf   )get_hf_processorget_audio_placeholder)rX   rd   re   rf   hf_processorrD   rD   rE   ri      s   z,MiniCPMOProcessingInfo.get_audio_placeholderc                 C      dS )N   rD   rc   rD   rD   rE   get_default_audio_pool_step      z2MiniCPMOProcessingInfo.get_default_audio_pool_stepc                 C   rk   )Ni>  rD   rc   rD   rD   rE   get_default_audio_sampling_rate   rn   z6MiniCPMOProcessingInfo.get_default_audio_sampling_ratec                 C   s
   |   jS N)get_hf_configaudio_chunk_lengthrc   rD   rD   rE   get_chunk_length   s   
z'MiniCPMOProcessingInfo.get_chunk_lengthc                 C   s,   |   }d}|d d d }|| | d S )Nd   r$   rl   )rm   )rX   	pool_stepfbank_feat_in_chunkcnn_feat_in_chunkrD   rD   rE   get_max_audio_tokens_per_chunk   s   z5MiniCPMOProcessingInfo.get_max_audio_tokens_per_chunkc                 C   rk   )N   rD   rc   rD   rD   rE   'get_max_audio_chunks_with_most_features   rn   z>MiniCPMOProcessingInfo.get_max_audio_chunks_with_most_featuresc                 C   s   |   }|  | S rp   )rz   rx   )rX   
num_chunksrD   rD   rE   get_max_audio_tokens   s   z+MiniCPMOProcessingInfo.get_max_audio_tokensr{   c                 C   s$   |   }|  }t|| | d S )Nr$   )ro   rx   int)rX   r{   sampling_ratenum_tokens_per_chunkrD   rD   rE   get_audio_len_by_num_chunks   s   z2MiniCPMOProcessingInfo.get_audio_len_by_num_chunksseq_len	mm_countsc           
      C   sl   | dd}| dd}| dd}|  | }|  | }| || | }t|t|d t}	t|	dS )NrS   r   videorK   r$   )getget_max_image_tokensr|   get_max_video_framesminmaxr%   )
rX   r   r   
max_images
max_videos
max_audiosmax_image_tokensmax_audio_tokensmax_total_framesmax_frames_per_videorD   rD   rE   !get_num_frames_with_most_features   s   

z8MiniCPMOProcessingInfo.get_num_frames_with_most_featuresTr$   )r<   r=   r>   audio_patternr   r[   r}   rb   boolri   rm   ro   rs   rx   rz   r|   r   r   r\   rD   rD   rY   rE   r`      s8    

r`   c                	       sd   e Zd Zdeeef def fddZ	d
dedeeef deeef dB def fdd	Z	  Z
S )MiniCPMODummyInputsBuilderr   rR   c                    s(   | dd}| jj| }t || S )NrK   r   )r   infor   rV   get_dummy_text)rX   r   
num_audiosaudio_prompt_textsrY   rD   rE   r      s   z)MiniCPMODummyInputsBuilder.get_dummy_textNr   
mm_optionsc                    s^   | dd}| j | j  }|r| dnd }d| j|||di}i t ||||S )NrK   r   )lengthr   	overrides)r   r   rz   ro   _get_dummy_audiosrV   get_dummy_mm_data)rX   r   r   r   r   	audio_lenaudio_overridesaudio_mm_datarY   rD   rE   r      s   z,MiniCPMODummyInputsBuilder.get_dummy_mm_datarp   )r<   r=   r>   r   r[   r}   r   r   r   r   r\   rD   rD   rY   rE   r      s    
r   c                
       s  e Zd ZdefddZ		ddedededefd	d
Zde	ee
f de	ee
f de	ee
f de	eef fddZde	ee
f de	ee
f de	ee
f de	eef f fddZdede	ee
f dedee f fddZdede	ee
f de	eef fddZ  ZS )MiniCPMOMultiModalProcessorrR   c                 C   s   t | j dS )N)	target_sr)r]   r   ro   rc   rD   rD   rE   _get_data_parser  s   z,MiniCPMOMultiModalProcessor._get_data_parserTr$   rd   re   rf   c                 C   s   | j j|||dS rg   )r   ri   )rX   rd   re   rf   rD   rD   rE   get_audio_prompt_texts  s
   z2MiniCPMOMultiModalProcessor.get_audio_prompt_textsmm_data	mm_kwargs
tok_kwargsc                 C   s   | d }d u ri S |  d|idttf}t|tr#i }|S | j| jj	gt
| ddd |D ii |ddi|ddhd	}d
d t|d |d D }||d< |S )NaudiosrK   c                 S   s   g | ]}|gqS rD   rD   ).0rK   rD   rD   rE   
<listcomp>:  s    z>MiniCPMOMultiModalProcessor.process_audios.<locals>.<listcomp>re   Tr1   r:   )promptsr   r   r   out_keysc                 S   s$   g | ]\}}|d d d |f qS rp   rD   )r   featfeature_lenrD   rD   rE   r   B  s    )r   r   parse_mm_data	get_itemsrO   r   r^   _base_call_hf_processorr   r   lenzip)rX   r   r   r   r   parsed_audiosaudio_inputsunpadded_audio_featuresrD   rD   rE   process_audios&  s0   


z*MiniCPMOMultiModalProcessor.process_audiosc                    s"   i t  |||| |||S rp   )rV   process_mm_inputsr   )rX   r   r   r   rY   rD   rE   r   M  s
   z-MiniCPMOMultiModalProcessor.process_mm_inputsmm_itemshf_processor_mm_kwargsout_mm_kwargsc                    sD   t  j ||d}jj}dtf fdd}g |td||dS )N)r   r   r   item_idxc                    s\     dttf}t|tr || d }jttt	|}n|
| }t|dS )NrK   rG   z<unk>)r   rO   r   r^   r   r   r   summapr   get_audio_lengthr!   select_textr   )r   r   single_audio_embedsr   r   rX   rD   rE   get_audio_replacementf  s   

zNMiniCPMOMultiModalProcessor._get_prompt_updates.<locals>.get_audio_replacementrK   )rT   targetreplacement)rV   _get_prompt_updatesr   r   r}   r   )rX   r   r   r   base_updatesaudio_placeholderr   rY   r   rE   r   X  s    z/MiniCPMOMultiModalProcessor._get_prompt_updatesrJ   c                 C   s   t |S rp   )rN   )rX   rJ   r   rD   rD   rE   _get_mm_fields_config  s   z1MiniCPMOMultiModalProcessor._get_mm_fields_configr   )r<   r=   r>   r   r   r}   r   r[   r   r   objectr   r   r   r   r   r   r    r   r   r   r   r\   rD   rD   rY   rE   r     s^    





'




)

r   c                       s<   e Zd Zdedef fddZdejdejfddZ  ZS )	MultiModalProjectorin_dimout_dimc                    s<   t    tj||dd| _t | _tj||dd| _d S )NT)in_featuresout_featuresbias)rV   rW   r
   Linearlinear1ReLUrelulinear2)rX   r   r   rY   rD   rE   rW     s   

zMultiModalProjector.__init__r1   rR   c                 C   s   |  | |}| |}|S rp   )r   r   r   )rX   r1   hidden_statesrD   rD   rE   forward  s   
zMultiModalProjector.forward)	r<   r=   r>   r}   rW   rA   rB   r   r\   rD   rD   rY   rE   r     s    r   c                       sB   e Zd Zdedef fddZdejdejdejfdd	Z  Z	S )
MiniCPMWhisperEncoderLayerconfig	layer_idxc                    s   t    |j| _t| j|j|j||d| _t	| j| _
|j| _t|j | _|j| _t| j|j| _t|j| j| _t	| j| _d S )N)	embed_dim	num_headsdropoutr   r   )rV   rW   d_modelr   r   encoder_attention_headsattention_dropout	self_attnr
   	LayerNormself_attn_layer_normr   r   activation_functionactivation_fnactivation_dropoutr   encoder_ffn_dimfc1fc2final_layer_norm)rX   r   r   rY   rD   rE   rW     s    
z#MiniCPMWhisperEncoderLayer.__init__r   attention_maskrR   c                 C   s   |}|  |}| j||d\}}tjj|| j| jd}|| }|}| |}| | |}tjj|| j	| jd}| 
|}tjj|| j| jd}|| }|jtjkrWt|}|f}|S )N)r   r   ptraining)r   r   r
   
functionalr   r   r   r   r   r   r   dtyperA   float16r-   )rX   r   r   residual_outputsrD   rD   rE   r     s2   






z"MiniCPMWhisperEncoderLayer.forward)
r<   r=   r>   r   r}   rW   rA   rB   r   r\   rD   rD   rY   rE   r     s    r   c                       sD   e Zd Zdef fddZ	d
dejdejdB defdd	Z  Z	S )MiniCPMWhisperEncoderr   c                    s0   t    t fddt jD | _d S )Nc                    s   g | ]}t  |d qS ))r   )r   )r   ir   rD   rE   r     s    
z2MiniCPMWhisperEncoder.__init__.<locals>.<listcomp>)rV   rW   r
   
ModuleListrangeencoder_layerslayers)rX   r   rY   r   rE   rW     s   

zMiniCPMWhisperEncoder.__init__Ninput_featuresr   rR   c                 C   s  |j | jjj| jjjd}tj| |}tj| |}|	ddd}| j
j}|d |jd d d f }|| }tjj|| j| jd}d}t| jD ](\}}||f }d}	| jrgtg }
|
| jk rgd}	|	rld	}qM|||}|d }qM| |}||f }t||d
S )Nr   devicer   rl   r$   r   rD   FT)NN)last_hidden_stater   )toconv1weightr   r   r
   r   geluconv2permuteembed_positionsshaper   r   	enumerater   rA   rand	layerdrop
layer_normr   )rX   r   r   inputs_embeds	embed_posr   encoder_statesidxencoder_layerto_dropdropout_probabilitylayer_outputsrD   rD   rE   r     sB   






zMiniCPMWhisperEncoder.forwardrp   )
r<   r=   r>   r   rW   rA   rB   r   r   r\   rD   rD   rY   rE   r     s    r   )r   dummy_inputsc                       sJ  e Zd Zg dddgdZededededB fd	d
Zdddedef fddZ	dddedefddZ
deeeejf  dee fddZdedfdedededejdedejfddZdejfd d!Zd"edeej fd#d$Zd%ededB fd&d'Zd%edef fd(d)Zd*edejeej B fd+d,Zd-ef fd.d/Z  Z S )0MiniCPMO)q_projk_projv_proj	gate_projup_proj)qkv_projgate_up_projrT   r   rR   Nc                 C   s2   | drdS | drdS | drdS td)NrS   z(<image>./</image>)r   z(<video>./</video>)rK   ra   z0Only image, video or audio modality is supported)
startswith
ValueError)clsrT   r   rD   rD   rE   get_placeholder_str  s   


zMiniCPMO.get_placeholder_str )prefixvllm_configr"  c                   sX   t  j||d | |d | j|t|dd| _W d    d S 1 s%w   Y  d S )N)r#  r"  rK   apm)rV   rW   _mark_tower_modelinit_audio_moduler.   r$  )rX   r#  r"  rY   rD   rE   rW   *  s   

"zMiniCPMO.__init__c                C   sP   | j j}t|}t|jd }tj| j j| j jd| _t	|| j
d| _d| _|S )N   )stride)r   r   )r   audio_configr   r}   r   r
   	AvgPool1daudio_pool_stepaudio_avg_poolerr   r   audio_projection_layeraudio_encoder_layer)rX   r#  r"  r*  modelaudio_output_dimrD   rD   rE   r&  2  s   zMiniCPMO.init_audio_moduleweightsc                 C   s   t | dgd}||S )Ntts)skip_prefixes)r,   load_weights)rX   r2  loaderrD   rD   rE   r5  @  s   
zMiniCPMO.load_weightsr)  r   size
chunk_sizenum_left_chunksr   num_lookheadc                 C   s   t j|||t jd}t j||d}|| }|dk rt |}	nt j|| dd}
|
| }	|d }t j|| | |d}t j||dd}|	d}	|d}||	k||k @ }|S )N)r   r   )r   r   )r   r$   )r   )rA   zerosr   arange
zeros_likeclamp	unsqueeze)rX   r7  r8  r9  r   r:  retrow_indiceschunk_indicesstart_indicesstart_chunk_indicesend_chunk_indicesend_indicescol_indicesrD   rD   rE   subsequent_chunk_maskD  s    

zMiniCPMO.subsequent_chunk_maskinput_lengthsc                 C   s>   |d d d }|| j j | j j d }|jtjd}||fS )Nr$   rl   )r   )r   r,  r   rA   int32)rX   rI  input_lengths_after_cnninput_lengths_after_poolingrD   rD   rE    _get_feat_extract_output_lengthsd  s   
z)MiniCPMO._get_feat_extract_output_lengthsrP   c           !   	   C   sz  | j j}|d }t|trNt|}|d jd }tdd |D }|d j}|d j}t	j
|||f||d}	t|D ]\}
}|jd }||	|
dd |f< q:n|}	|d	 }t|t	jr_|d}t	|}|	j\}}}|d
 d d
 }t	jd||j|jdd||}|d
||}||k}||d
d
||d
||}|j| jjjj| jjjjd}|dkrt|d }| j||d|jd}t	|t	|}td||< | j|	|dj| j }| |}|d
d}|  |}|d
d}| !|\}}|}tt	j  }d}t"t|D ]1}
tt	j  } t"t||
 D ]}| #||d || d d f  |d
7 }q|#t	$|  q	|S )Nr1   r   c                 s   s    | ]}|j d  V  qdS )r)  N)r  )r   itemrD   rD   rE   	<genexpr>w  s    z3MiniCPMO.get_audio_hidden_states.<locals>.<genexpr>r   r)  .r:   r$   rl   2   )r7  r8  r9  r   z-inf)r   )%r   rr   r^   rC   r   r  r   r   r   rA   r;  r  rB   unbindhstackr<  r?  expandviewr   r$  r  r  r}   rH  
logical_orlogical_notfloatr   r/  r.  	transposer-  rM  r   appendcat)!rX   rP   rf   wavforms_rawBCLr   r   wavformsr   wavforms_itemL_itemaudio_feature_lens_rawr:   
batch_sizer   max_mel_seq_lenmax_seq_len	seq_rangelengths_expandpadding_maskaudio_attention_mask_audio_attention_maskchunk_num_frame
chunk_maskaudio_statesrG   feature_lens_after_poolingnum_audio_tokensfinal_audio_embedsr  target_audio_embeds_lstrD   rD   rE   get_audio_hidden_statesm  s   









z MiniCPMO.get_audio_hidden_stateskwargsc                 K   sX   | dd }| dd }|d u r|d u rd S |d ur td|dS | d}td||dS )Nr1   rG   )r2   rG   r:   )r2   r1   r:   )poprF   r0   )rX   rt  r1   rG   r:   rD   rD   rE   _parse_and_validate_audio_input  s   
z(MiniCPMO._parse_and_validate_audio_inputc                    sD   t  jdi |}|D ]}|dv rd|vr| jdi ||d< q|S )N)r1   rG   r   rD   )rV   %_parse_and_validate_multimodal_inputsrv  )rX   rt  
modalities	input_keyrY   rD   rE   rw    s   z.MiniCPMO._parse_and_validate_multimodal_inputsaudio_inputc                 C   s   |d dkr
|d S |  |S )Nr2   rG   )rs  )rX   rz  rD   rD   rE   _process_audio_input  s   
zMiniCPMO._process_audio_inputrx  c                    s@   t  |}|D ]}|dkr|d }| |}|t|7 }q|S )Nr   )rV   _process_multimodal_inputsr{  tuple)rX   rx  multimodal_embeddingsrT   rz  audio_embeddingsrY   rD   rE   r|    s   
z#MiniCPMO._process_multimodal_inputs)!r<   r=   r>   packed_modules_mappingclassmethodr[   r}   r   r   rW   r&  r   r}  rA   rB   setr5  
CPU_DEVICEr   rH  
LongTensorrM  r0   rC   rs  r   rI   rv  rL   rw  r{  r|  r\   rD   rD   rY   rE   r    sZ    
$
 	
^

	r  )Pr?   collections.abcr   r   r   r   typingr   r   r   r	   rA   r
   transformersr   transformers.modeling_outputsr   ,transformers.models.whisper.modeling_whisperr   r   r   r   vllm.configr   vllm.config.multimodalr   vllm.multimodalr   r   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   r   r   r   r   r   r   vllm.multimodal.processingr   r    r!   vllm.utils.tensor_schemar"   r#   minicpmvr%   r&   r'   r(   r)   r*   r+   utilsr,   r-   r.   r   r  r0   rF   rI   r@   r[   rB   rN   rO   r]   r`   r   r   Moduler   r   r   register_processorr  rD   rD   rD   rE   <module>   sJ   $	$	
 
	D"u7@