o
    -iX                     @   s  U d Z ddlZddlmZ ddlmZmZmZmZ ddl	m
Z
 ddlmZ ddlmZmZmZmZ ddlZddlZddlZddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddl m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/ ddl0m1Z1 ddl2m3Z3 ddl4m5Z5m6Z6m7Z7m8Z8 ddl9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZC ddlDmEZE ddlFmGZGmHZHmIZImJZJmKZKmLZLmMZM ddlNmOZO ddlPmQZQ ddlRmSZS ddlTmUZUmVZV ddlWmXZX dd lYmZZZ dd!l[m\Z\m]Z]m^Z^m_Z_ dd"l`maZambZbmcZc d#ZdG d$d% d%eUZeG d&d' d'eUZfeeefB Zgeehd(< e
ejid)d*ZjG d+d, d,e%ZkG d-d. d.ekZld/ed0emend1f fd2d3Zod4eepejqf fd5d6ZrG d7d8 d8e:ZsG d9d: d:e:ZtG d;d< d<eAZuG d=d> d>eHZved?evevd@ZwG dAdB dBeEew ZxG dCdD dDeGew ZyG dEdF dFejze^e_Z{G dGdH dHe{Z|G dIdJ dJe{e]Z}G dKdL dLe{e]Z~G dMdN dNe{e]ZG dOdP dPe{e]Ze|e}e~eedQZe3jeyevexdRG dSdT dTe{e^e]ZdS )UzCInference-only MiniCPM-V model compatible with HuggingFace weights.    N)defaultdict)CallableIterableMappingSequence)partial)chain)	AnnotatedAnyLiteral	TypeAlias)nn)trunc_normal_)BatchFeaturePretrainedConfig)TypeVar)
VllmConfig)BaseDummyOptions)QuantizationConfig)BaseResampler
Resampler2get_2d_sincos_pos_embedLlamaForCausalLMMiniCPMForCausalLM)MultiModelKeysQwen2ForCausalLMQwen3ForCausalLM)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItemsNestedTensors)
DictEmbeddingItems	ImageItemImageProcessorItems	ImageSizeModalityDataModalityDataItemsMultiModalDataItemsMultiModalDataParser	VideoItemVideoProcessorItems)BaseDummyInputsBuilder)BaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdatePromptUpdateDetailsResolvedPromptUpdate	_seq2text)current_platform)IntermediateTensors)flatten_2d_lists)TensorSchemaTensorShape)set_default_torch_dtype   )Idefics2VisionTransformer)MultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPP)AutoWeightsLoader
flatten_bnmaybe_prefix   c                	   @   sx   e Zd ZU dZdZed ed< eee	j
 eddddddhdf ed< ee	j
edd	f ed
< ee	j
edf ed< dS )MiniCPMVImagePixelInputsz
    Dimensions:
        - bns: Batch size * number of images * number of slices
        - bn: Batch size * number of images
        - c: Number of channels
        - h: Height
        - w: Width
    pixel_valuestypebnschwdynamic_dims   	tgt_sizesbn
num_slicesN)__name__
__module____qualname____doc__rJ   r   __annotations__r	   listtorchTensorr<    r]   r]   `/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/model_executor/models/minicpmv.pyrH   g   s$   
 	rH   c                   @   sH   e Zd ZU dZed ed< eeje	ej B e
ddddhdf ed< dS )	MiniCPMVImageEmbeddingInputsz
    Dimensions:
        - bn: Batch size * number of images
        - ns: Number of slices
        - hs: Hidden size (must match language model backbone)
    image_embedsrJ   rS   nshsrO   N)rU   rV   rW   rX   r   rY   r	   r[   r\   rZ   r<   r]   r]   r]   r^   r_      s   
 r_   MiniCPMVImageInputsgư>)epsc                       s   e Zd ZdedddfdededededB deegejf d	eeef d
e	dB de
ddf fddZ	dd	eeef dejjddfddZdejdejjddfddZdejdejdejfddZ  ZS )Resampler2_5NF   rg    num_queries	embed_dim	num_headskv_dim
norm_layermax_sizequant_configprefixreturnc	           	   	      s0   t  j|||||||d || _| | j d S )Nro   rp   )super__init__rn   _set_2d_pos_cache)	selfri   rj   rk   rl   rm   rn   ro   rp   	__class__r]   r^   rt      s   
zResampler2_5.__init__cpudevicec                 C   s8   t | j|dd}t| |}| jd|dd d S )NrQ      )version	pos_embedF
persistent)r   rj   r[   
from_numpyfloattoregister_buffer)rv   rn   rz   pos_embed_arrr~   r]   r]   r^   ru      s
   zResampler2_5._set_2d_pos_cacherR   c                 C   s   |d d df    }|d d df    }t|tr"t|ts$J || jd ks2|| jd krLt || jd t || jd f| _| | j| d S d S )Nr   r>   )maxitem
isinstanceintrn   ru   )rv   rR   rz   max_hmax_wr]   r]   r^   _adjust_pos_cache   s   zResampler2_5._adjust_pos_cachexc                 C   s  |j d |j d ksJ |j d }|j}|j}|d d df |d d df  }| j||d |  }t|ts;J tj	||ftj
|d}g }	t|D ]/}
||
  \}}|	| jd |d |d d f || df| d||
||
 d f< qLtjjjj|	dddddd	}	| |\}}| |ddd	}| | j}| j| ||||	 ||d
d }|ddd	}| |}|| j }|S )Nr   r>   rz   dtyperz   T        batch_firstpadding_valuerQ   key_padding_mask)shaperz   r   r   r   r   r   r   r[   zerosboolrangetolistappendr~   reshaper   r   utilsrnnpad_sequencepermutekv_projln_kvln_qqueryattn_repeatln_postproj)rv   r   rR   bsrz   r   	patch_lenmax_patch_lenr   r~   itgt_htgt_w_qoutr]   r]   r^   forward   sL   
 .




zResampler2_5.forwardry   )rU   rV   rW   
DEFAULT_LNr   r   r   	LayerNormtupler   strrt   r[   typesDeviceru   r\   r   r   __classcell__r]   r]   rw   r^   re      sR    
	


	
$re   c                       s   e Zd ZdeddddfdededededB d	eegejf d
eeef dede	dB de
ddf fddZdedejfddZ	d!dedejjddfddZ	d!dedejjfddZdejejB fddZ	d"dejdejdejfdd Z  ZS )#Resampler4_5Nrf   i  rh   ri   rj   rk   rl   rm   rn   max_temporal_sizero   rp   rq   c
           
   
      sL   t  j||||||||	d t| jdd || _| | j | | j d S )Nrr   {Gz?std)rs   rt   r   r   r   _set_temporal_pos_cacheapply_init_weights)
rv   ri   rj   rk   rl   rm   rn   r   ro   rp   rw   r]   r^   rt      s   zResampler4_5.__init__posc                 C   s~   |d dksJ t j|d t jd}||d  }dd|  }|d}t d||}t |}t |}t j||gd	d
}|S )z
        embed_dim: output dimension for each position
        pos: a list of positions to be encoded: size (M,)
        out: (M, D)
        rQ   r   r   g       @      ?i'  r   zm,d->mdr>   )axis)nparangefloat32r   einsumsincosconcatenate)rv   rj   r   omegar   emb_sinemb_cosembr]   r]   r^   *get_1d_sincos_pos_embed_from_temporal_size  s   


z7Resampler4_5.get_1d_sincos_pos_embed_from_temporal_sizery   rz   c                 C   sB   t j|t jd}t| | j| |}| j	d|dd d S )Nr   temporal_pos_embedFr   )
r   r   r   r[   r   r   rj   r   r   r   )rv   r   rz   temporal_sizer~   r]   r]   r^   r   )  s   	z$Resampler4_5._set_temporal_pos_cachec                 C   s&   || j kr|| _ | | j | d S d S N)r   r   )rv   r   rz   r]   r]   r^   _adjust_temporal_pos_cache8  s   
z'Resampler4_5._adjust_temporal_pos_cachemc                 C   s   t |tjr&t|jdd t |tjr"|jd ur$tj|jd d S d S d S t |tjr>tj|jd tj|jd d S d S )Nr   r   r   r   )	r   r   Linearr   weightbiasinit	constant_r   )rv   r   r]   r]   r^   r   ?  s   zResampler4_5._init_weightsr   rR   c                 C   sX  |j d |j d ksJ |j d }|j}|j}|d d df |d d df  }| j||d d}d }	|d urTtt|}	t|	dd}
|
dkrId}|
| jkrT| 	|
| | 
 }t|tsaJ tj||ftj|d}| |\}}| |ddd	}| | j}g }g }t|D ]O}|| \}}|r|	| dkr|tj| j||d n|| j|	|  | || jd |d |d d f || df| d|||| d f< qtjjjj|dd
dddd	}|}|| }|r|tj |dd7 }t!|}g }g }g }d}|D ]P}|t!| }||d d ||d d f ddd	d| j ||d d ||d d f ddd	d| j ||||d d f dd |}qtjjjj|dd
dddd	}tjjjj|dd
dddd	}tjjjj|ddd"d}| j#| $|||||dd }|ddd	}| %|}|| j& }|S )Nr   r>   r   F)defaultr   Tr   rQ   r   r   )dimr   )'r   rz   r   r   rZ   r   from_iterabler   r   r   r   r   r   r[   r   r   r   r   r   r   r   r   r   rj   r   r   r~   r   r   r   r   r   stacklensqueezer   r   r   r   )rv   r   rR   temporal_idsr   rz   r   r   temporal_pos_embtemporal_ids_flattenr   r   r   r   r   pos_embed_2dpos_embed_temporalr   r   r   kvmerge_kmerge_vmerge_key_padding_maskstarttpendr   r]   r]   r^   r   H  s   
 
.

,,







zResampler4_5.forwardr   r   )rU   rV   rW   r   r   r   r   r   r   r   r   rt   r   ndarrayr   r[   r   r   r   r   r   r   r\   r   r   r]   r]   rw   r^   r      sp    
	



r   configrq   .c                 C   sP   t | dd }|d u r| jdkr| jdkrdS dS t|}tdd |dD S )	Nr}    	  @   rQ   r   r{   c                 s   s    | ]}t |V  qd S r   r   .0r   r]   r]   r^   	<genexpr>  s    z(get_version_by_config.<locals>.<genexpr>.)getattrhidden_size	query_numr   r   split)r   version_floatversion_strr]   r]   r^   get_version_by_config  s   r  	hf_inputsc                 C   sH   t tdtdtdtdtdtdtdtddS )Nimagevideo)rI   image_sizesrR   r`   video_pixel_valuesvideo_image_sizesvideo_tgt_sizesvideo_embeds)dictr#   batched)r  r]   r]   r^   _minicpmv_field_config  s   r  c                       sb   e Zd Zdeeejf deeeejf geeef f ddf fddZ	de
defdd	Z  ZS )
MiniCPMVImageEmbeddingItemsdatafields_factoryrq   Nc                       t  j|dddh|d d S )Nr  r`   r  modalityrequired_fieldsr  rs   rt   rv   r  r  rw   r]   r^   rt        
z$MiniCPMVImageEmbeddingItems.__init__indexc                 C   &   |  |d  }t|d |d dS )Nr  r   r>   widthheightgetr   r)   )rv   r  
image_sizer]   r]   r^   get_image_size     z*MiniCPMVImageEmbeddingItems.get_image_size)rU   rV   rW   r   r   r[   r\   r   r#   rt   r   r)   r   r   r]   r]   rw   r^   r    s    
r  c                       st   e Zd Zdeeejf deeeejf geeef f ddf fddZ	de
defdd	Zde
de
fd
dZ  ZS )MiniCPMVVideoEmbeddingItemsr  r  rq   Nc                    r  )Nr  r
  r  r  r  r  rw   r]   r^   rt     r  z$MiniCPMVVideoEmbeddingItems.__init__r  c                 C   r  )Nr  r   r>   r  r  )rv   r  
frame_sizer]   r]   r^   get_frame_size  r!  z*MiniCPMVVideoEmbeddingItems.get_frame_sizec                 C   s   t | |d S )Nr  )r   r  )rv   r  r]   r]   r^   get_num_frames     z*MiniCPMVVideoEmbeddingItems.get_num_frames)rU   rV   rW   r   r   r[   r\   r   r#   rt   r   r)   r$  r%  r   r]   r]   rw   r^   r"    s    
r"  c                       sx   e Zd Zdeeejf ee B de	e
e
f dB f fddZdeeejf ee B de	e
e
f dB f fddZ  ZS )MiniCPMVMultiModalDataParserr  rq   Nc                    "   t |trt|tdS t |S N)r  )r   r  r  r  rs   _parse_image_datarv   r  rw   r]   r^   r*        
z.MiniCPMVMultiModalDataParser._parse_image_datac                    r(  r)  )r   r  r"  r  rs   _parse_video_datar+  rw   r]   r^   r-    r,  z.MiniCPMVMultiModalDataParser._parse_video_data)rU   rV   rW   r  r   r[   r\   r*   r'   r+   r
   r*  r.   r-  r   r]   r]   rw   r^   r'    s    r'  c                   @   sn  e Zd ZdZdZdd ZdefddZdefdd	Zd
d Z	de
eedB f fddZ			d1dedededB dedef
ddZ	d2dededB deeef dB fddZ	d2dededB defddZdefddZdefddZdefd d!Zdefd"d#Zd$ed%e
eef defd&d'Zdefd(d)Zdefd*d+Zd,edefd-d.Zd$ed%e
eef defd/d0ZdS )3MiniCPMVProcessingInfo(<image>./</image>)(<video>./</video>)c                 C   s
   | j  S r   )ctxget_hf_configrv   r]   r]   r^   r2    s   
z$MiniCPMVProcessingInfo.get_hf_configkwargsc                 K   sL   | j jdi |}|j}dD ]}t||}t|tjr#t|||  q|S )N)meanr   r]   )	r1  get_hf_processorimage_processorr   r   r   r   setattrr   )rv   r4  hf_processorr7  attrvalr]   r]   r^   r6     s   
z'MiniCPMVProcessingInfo.get_hf_processorc                 K   s   | j di |jS Nr]   )r6  r7  )rv   r4  r]   r]   r^   get_image_processor-  r&  z*MiniCPMVProcessingInfo.get_image_processorc                 C   s   t |  S r   )r  r2  r3  r]   r]   r^   get_model_version0  s   z(MiniCPMVProcessingInfo.get_model_versionrq   Nc                 C   s    dd i}|   dv rd |d< |S )Nr     rQ         r   rC  r|   r  )r>  )rv   	mm_limitsr]   r]   r^   get_supported_mm_limits3  s   z.MiniCPMVProcessingInfo.get_supported_mm_limitsr   Tr  	image_idxmax_slice_numsuse_image_idc                 C   s<   |   }|  }|dks|dkr||S |j||||dS )Nr   r{   )rG  rH  rI  )r=  r>  get_slice_image_placeholder)rv   r  rG  rH  rI  r7  r}   r]   r]   r^   rJ  :  s   
z2MiniCPMVProcessingInfo.get_slice_image_placeholderc                 C   sF   |   }|  }|dks|dkr||S |d u r|j}|j||dS )Nr   r{   rH  )r=  r>  get_sliced_gridrH  )rv   r  rH  r7  r}   r]   r]   r^   rL  O  s   
z&MiniCPMVProcessingInfo.get_sliced_gridc                 C   sB   |   }| j||d}|d u rd }}n|\}}|| d |j S )NrK  r   r>   )r=  rL  image_feature_size)rv   r  rH  r7  gridncolsnrowsr]   r]   r^   get_num_image_tokensc  s   
z+MiniCPMVProcessingInfo.get_num_image_tokensc                 C   s   |   }| |S r   )!get_image_size_with_most_featuresrQ  )rv   r  r]   r]   r^   get_max_image_tokensu     
z+MiniCPMVProcessingInfo.get_max_image_tokensc                 C   s   t |  ddS )Nmax_slice_num	   )r   r2  r3  r]   r]   r^   get_image_max_slice_numy  s   z.MiniCPMVProcessingInfo.get_image_max_slice_numc                 C   (   t |  dd}|  }t||| dS Nr  i  r  )r   r2  rW  r)   rv   r  rU  r]   r]   r^   rR  |     z8MiniCPMVProcessingInfo.get_image_size_with_most_featuresc                 C   s   |   }| j||  dS )NrK  )'get_video_frame_size_with_most_featuresrQ  get_video_max_slice_num)rv   r#  r]   r]   r^   get_max_video_frame_tokens  s
   z1MiniCPMVProcessingInfo.get_max_video_frame_tokensseq_len	mm_countsc                 C   s   |  ||}|  | }|S r   )!get_num_frames_with_most_featuresr^  )rv   r_  r`  
num_framesnum_video_tokens_totalr]   r]   r^   get_max_video_tokens  s   z+MiniCPMVProcessingInfo.get_max_video_tokensc                 C      dS )Nr>   r]   r3  r]   r]   r^   r]       z.MiniCPMVProcessingInfo.get_video_max_slice_numc                 C   rX  rY  )r   r2  r]  r)   rZ  r]   r]   r^   r\    r[  z>MiniCPMVProcessingInfo.get_video_frame_size_with_most_features
max_tokensc                 C   s   |   }|| }|S r   )r^  )rv   rg  num_frame_tokensrb  r]   r]   r^   get_max_video_frames  s   z+MiniCPMVProcessingInfo.get_max_video_framesc                 C   sP   | dd}| dd}|  | }| || }t|t|d t}t|dS )Nr  r   r  r>   )r  rS  ri  minr   _MAX_FRAMES_PER_VIDEO)rv   r_  r`  
max_images
max_videosmax_image_tokensmax_total_framesmax_frames_per_videor]   r]   r^   ra    s   
z8MiniCPMVProcessingInfo.get_num_frames_with_most_features)r   NTr   )rU   rV   rW   image_patternvideo_patternr2  objectr6  r=  r>  r   r   r   rF  r)   r   rJ  r   rL  rQ  rS  rW  rR  r^  rd  r]  r\  ri  ra  r]   r]   r]   r^   r.    sv    




	
r.  _I)boundr   c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )MiniCPMVDummyInputsBuilderr`  rq   c                 C   s8   | dd}| dd}| jj| }| jj| }|| S )Nr  r   r  )r  inforq  rr  )rv   r`  
num_images
num_videosimage_prompt_textsvideo_prompt_textsr]   r]   r^   get_dummy_text  s
   z)MiniCPMVDummyInputsBuilder.get_dummy_textNr_  
mm_optionsc                 C   s   | dd}| dd}| j \}}| j \}}	| j||}
|r(| dnd }|r1| dnd }| j||||d| j||	|
|dg| dS )Nr  r   r  )r  r  rx  	overridesr  r  )r  rw  rR  r\  ra  _get_dummy_images)rv   r_  r`  r}  rx  ry  image_widthimage_heightvideo_widthvideo_heightnum_video_framesimage_overridesvideo_overridesr]   r]   r^   get_dummy_mm_data  s2   z,MiniCPMVDummyInputsBuilder.get_dummy_mm_datar   )
rU   rV   rW   r   r   r   r|  r   r"   r  r]   r]   r]   r^   rv    s    
rv  c                       s  e Zd ZdefddZd,dededefddZded	edefd
dZ	de
eef de
eef de
eef de
eef fddZde
eef de
eef de
eef de
eef fddZde
eef de
eef de
eef de
eef fddZdee de
eee f de
eef de
eef dee deeef f fddZdede
eef de
eef de
eef def
ddZdedede
eef de
eef def
d d!Zdede
eef d"edee fd#d$Zd%ed&edef fd'd(Zd)ede
eef de
eef fd*d+Z   Z!S )-MiniCPMVMultiModalProcessorrq   c                 C   s   t  S r   )r'  r3  r]   r]   r^   _get_data_parser  s   z,MiniCPMVMultiModalProcessor._get_data_parserr   r  rG  c                 C   s   | j j||dS )N)rG  )rw  rJ  )rv   r  rG  r]   r]   r^   get_image_prompt_texts  s   z2MiniCPMVMultiModalProcessor.get_image_prompt_textsrb  c                 C   s   | j j|d| j  dd| S )Nr   F)r  rG  rH  rI  )rw  rJ  r]  )rv   r  rb  r]   r]   r^   get_video_prompt_texts  s   z2MiniCPMVMultiModalProcessor.get_video_prompt_textsmm_data	mm_kwargs
tok_kwargsc                 C   s~   | d }d u ri S |  d|idttf}t|tr#i }|S | j| jj	gt
| ddd |D i||h dd}|S )Nimagesr  c                 S   s   g | ]}|gqS r]   r]   )r   r  r]   r]   r^   
<listcomp>  s    z>MiniCPMVMultiModalProcessor.process_images.<locals>.<listcomp>>   rR   r  rI   promptsr  r  r  out_keys)r  r  parse_mm_data	get_itemsr  r(   r   _base_call_hf_processorrw  rq  r   )rv   r  r  r  r  parsed_imagesimage_inputsr]   r]   r^   process_images  s"   


z*MiniCPMVMultiModalProcessor.process_imagesc                    s   | d }d u ri S   d|idttf}t|tr"i }n  j fdd|D dt|ii |d j	
 i|h dd}d	d
 | D }|S )Nvideosr  c                    s   g | ]
} j jt| qS r]   )rw  rq  r   )r   r  r3  r]   r^   r  &  s    z>MiniCPMVMultiModalProcessor.process_videos.<locals>.<listcomp>r  rH  >   rR   r  rI   r  c                 S   s   i | ]
\}}d | |qS video_r]   r   r   r   r]   r]   r^   
<dictcomp>2      z>MiniCPMVMultiModalProcessor.process_videos.<locals>.<dictcomp>)r  r  r  r  r"  r/   r   r  rZ   rw  r]  items)rv   r  r  r  r  parsed_videosvideo_inputsr]   r3  r^   process_videos  s.   




z*MiniCPMVMultiModalProcessor.process_videosc                 C   s    i |  |||| |||S r   )r  r  )rv   r  r  r  r]   r]   r^   process_mm_inputs6  s
   z-MiniCPMVMultiModalProcessor.process_mm_inputsr  r  c          
         s   | j  dv rt j||||dnGttttj f tt	|D ]7\ }t j| fdd|
 D ||d}|
 D ]\}}	t|	dksNJ |t|	f| |	d  q<q!fdd|D S )Nr?  )promptr  r  r  c                    s   i | ]	\}}||  qS r]   r]   r  )r   r]   r^   r  X  s    zGMiniCPMVMultiModalProcessor._base_call_hf_processor.<locals>.<dictcomp>r>   r   c                    s   i | ]}| | qS r]   r]   )r   r   )inputsr]   r^   r  a  s    )rw  r>  rs   _call_hf_processorr   r   rZ   r[   r\   	enumerater  r   r   )
rv   r  r  r  r  r  r  
inputs_oner   r   rw   )r   r  r^   r  A  s(   
z3MiniCPMVMultiModalProcessor._base_call_hf_processorr  c                 C   sB   | j  }t|j|fi |g}| |||}td|i|S )N	input_ids)rw  get_tokenizerr[   tensorencoder  r   )rv   r  r  r  r  	tokenizerr  	mm_inputsr]   r]   r^   r  c  s   
z.MiniCPMVMultiModalProcessor._call_hf_processorprompt_textmm_itemshf_processor_mm_kwargstokenization_kwargsc                 C   re  )NFr]   )rv   r  r  r  r  r]   r]   r^   _hf_processor_applies_updatesv     z9MiniCPMVMultiModalProcessor._hf_processor_applies_updatesout_mm_kwargsc                    s   dj jfdj jfg}g }j  }|D ]\}}||j|dd}	|	|kr.|||	f q||7 }dtffdd}
dtffdd	}|
|d
  fdd|D S )Nr  r  F)add_special_tokensitem_idxc                    s.     dttf}|| }t|| dS )Nr  <unk>)r  r  r(   r   r5   select_textr  )r  r  r  r  rv   r]   r^   get_image_replacement  s   

zNMiniCPMVMultiModalProcessor._get_prompt_updates.<locals>.get_image_replacementc                    s8     dttf}|| }|| }t||dS )Nr  r  )r  r"  r/   r$  r%  r5   r  r  )r  r  r#  rb  r  r]   r^   get_video_replacement  s   


zNMiniCPMVMultiModalProcessor._get_prompt_updates.<locals>.get_video_replacementr  c                    s"   g | ]\}}t || | d qS ))r  targetreplacement)r3   )r   r  pattern)get_replacementr]   r^   r    s    
zCMiniCPMVMultiModalProcessor._get_prompt_updates.<locals>.<listcomp>)rw  rq  rr  r  decoder  r   r   )rv   r  r  r  placeholdersadditional_placeholdersr  r  r  sub_patternr  r  r]   )r  r  rv   r^   _get_prompt_updates  s*   



z/MiniCPMVMultiModalProcessor._get_prompt_updatescached_updatenew_item_idxc              
      s   t  ||}|jdkrT| j }| j }| j }t||jj	}|j
}|dks-|dkr4|j}	|j}
n|j}	|j}
|t||	 | |
 |	 | |
 dd}|S )Nr  r   r{   r>   r  )rs   _recompute_cached_prompt_updater  rw  r  r=  r>  r7   contentfullr  im_start_tokenim_end_tokenim_id_start	im_id_endwith_contentr5   r  replace)rv   r  r  
new_updater  r7  r}   textprev_item_idxim_startim_endrw   r]   r^   r    s4   



z;MiniCPMVMultiModalProcessor._recompute_cached_prompt_updater  c                 C   s   t |S r   )r  )rv   r  r  r]   r]   r^   _get_mm_fields_config  s   z1MiniCPMVMultiModalProcessor._get_mm_fields_config)r   )"rU   rV   rW   r-   r  r)   r   r   r  r  r   rs  r%   r  r  r  rZ   r   setr  r  r   r  r,   r   r  r$   r4   r  r6   r  r#   r  r   r]   r]   rw   r^   r    s    









#







"






	

;&

r  c                       s  e Zd ZdZdZededededB fddZd	d
de	def fddZ
dedededB fddZdedefddZdedejeej B eejdf B fddZdefddZdedefddZ		d8dejdejdedB d ejdB dedejfd!d"Zd#ejdejdB fd$d%Zd&eeeejf  dee fd'd(Zdefd)d*Z 		d9de	dede!j"fd+d,Z#		d9d-e$d.e%dB dede!j"fd/d0Z&			d:d1ed2ed.e%dB dede!j"f
d3d4Z'd5e(dejfd6d7Z)  Z*S );MiniCPMVBaseModelz_
    The abstract class of MiniCPMV can only be inherited, but cannot be
    instantiated.
    Tr  r   rq   Nc                 C   s$   | drdS | drdS td)Nr  r/  r  r0  z)Only image or video modality is supported)
startswith
ValueError)clsr  r   r]   r]   r^   get_placeholder_str  s
   

z%MiniCPMVBaseModel.get_placeholder_strrh   rp   vllm_configrp   c                   s   |j j}|j j}|j}|jdk| _t   || _|| _t	| j| _
| | | j|t|dd| _W d    n1 s=w   Y  | |ddh7 | j||t|dd| _| j
dkr`| jjn| jjj| _| jj| _| j| j| j|t|d	d
| _W d    n1 sw   Y  | jj| _d S )Nr  llmr  rp   r  r  vpmr  r   	resamplerrr   )model_config	hf_configmultimodal_configro   mm_encoder_tp_modeuse_data_parallelrs   rt   r   r  r}   _mark_language_modelinit_llmrF   r  _mark_tower_modelinit_vision_moduler  rj   
embeddings
vision_dimr   init_resamplerr  make_empty_intermediate_tensors)rv   r  rp   r   r  ro   rw   r]   r^   rt     s<   





zMiniCPMVBaseModel.__init__r4  c           	      K   s   | dd }| dd }|d u r|d u rd S |d ur td|dS | d}tdd |D }t|}t|dd}td|||d	S )
NrI   r`   )rJ   r`   rR   c                 S      g | ]}t |qS r]   )r   )r   psr]   r]   r^   r  4      zFMiniCPMVBaseModel._parse_and_validate_vision_input.<locals>.<listcomp>T)concat)rJ   rI   rR   rT   )popr_   r[   r  rE   rH   )	rv   r  r4  rI   r`   rR   num_slices_flatpixel_values_flattgt_sizes_flatr]   r]   r^    _parse_and_validate_vision_input!  s&   
z2MiniCPMVBaseModel._parse_and_validate_vision_inputc                 K   sl   i }|D ]/}|dv rd|vr| j 	di ||d< |dv r3d|vr3| j 	di dd | D |d< q|S )	N)rI   r`   r  )r  r
  r  c                 S   s   i | ]
\}}| d |qS r  )removeprefixr  r]   r]   r^   r  Q  r  zKMiniCPMVBaseModel._parse_and_validate_multimodal_inputs.<locals>.<dictcomp>)r  )r  )r  r  )rv   r4  
modalities	input_keyr]   r]   r^   %_parse_and_validate_multimodal_inputs?  s$   

z7MiniCPMVBaseModel._parse_and_validate_multimodal_inputsimage_input.c                 C   s>   |d dkr
|d S |  |}|d }dd || D S )NrJ   r`   rT   c                 S   s   g | ]}| d dqS )r   r>   )flatten)r   er]   r]   r^   r  `  s    z;MiniCPMVBaseModel._process_vision_input.<locals>.<listcomp>)get_vision_hidden_statesr   r   )rv   r  image_features_flatrT   r]   r]   r^   _process_vision_inputV  s
   
z'MiniCPMVBaseModel._process_vision_inputr  c                 C   s^   d}|D ](}|dkr|d }|  |}|t|7 }|dkr,|d }|  |}|t|7 }q|S )Nr]   r  r  )r  r   )rv   r  multimodal_embeddingsr  r  image_embeddingsvideo_inputvideo_embeddingsr]   r]   r^   _process_multimodal_inputsb  s   

z,MiniCPMVBaseModel._process_multimodal_inputsc                 K   s"   | j di |}|sg S | |S r<  )r  r	  )rv   r4  r  r]   r]   r^   embed_multimodalu  s   
z"MiniCPMVBaseModel.embed_multimodalr  	positionsintermediate_tensorsinputs_embedsc                 K   s$   |d urd }| j j||||d}|S )N)r  r  r  r  )r  model)rv   r  r  r  r  r4  hidden_statesr]   r]   r^   r   |  s   zMiniCPMVBaseModel.forwardr  c                 C   s   | j |S r   )r  compute_logits)rv   r  r]   r]   r^   r    s   z MiniCPMVBaseModel.compute_logitsweightsc                 C   s   t | }||S r   rD   load_weightsrv   r  loaderr]   r]   r^   r    rT  zMiniCPMVBaseModel.load_weightsc                 C   s   t jddddS )z<
        Get the module prefix in multimodal models
        r  r  r  )language_model	connectortower_model)r   from_string_fieldr3  r]   r]   r^   get_mm_mapping  s   z MiniCPMVBaseModel.get_mm_mappingc                 C      t r   NotImplementedErrorrv   r  rp   r]   r]   r^   r    s   zMiniCPMVBaseModel.init_llmr   ro   c                 C   r  r   r  )rv   r   ro   rp   r]   r]   r^   r    s   z$MiniCPMVBaseModel.init_vision_modulerj   r  c                 C   r  r   r  )rv   rj   r  ro   rp   r]   r]   r^   r    r  z MiniCPMVBaseModel.init_resamplerr  c                 C   r  r   r  r+  r]   r]   r^   r    rf  z*MiniCPMVBaseModel.get_vision_hidden_states)NNrh   Nrh   )+rU   rV   rW   rX   supports_encoder_tp_dataclassmethodr   r   r  r   rt   rs  rc   r  r  r  r[   r\   rZ   r   r  r	  r@   r
  r9   r
   r   r  r   r  r  r   r  r   Moduler  r   r   r  r  rH   r  r   r]   r]   rw   r^   r    s    (



$


	r  c                       s   e Zd ZdZdddedef fddZ	ddededejfd	d
Z		dde
dedB dedejfddZ		ddedededB dedejf
ddZdedejfddZ  ZS )MiniCPMV2_0Frh   r  r  rp   c                   "   t  j||d | jdksJ d S )Nr  r   rs   rt   r}   r  rw   r]   r^   rt        zMiniCPMV2_0.__init__rq   c                 C      t ||dS Nr  r   r  r]   r]   r^   r       zMiniCPMV2_0.init_llmr   ro   Nc                 C   s   zdd l }W n ty   tdtw ttj |jdddddd}W d    n1 s-w   Y  |jt d}t||j	j
rL|jd urLtj |_| jjrX|jd d |_|S )	Nr   zPlease install timm==0.9.10z#vit_so400m_patch14_siglip_384.webliFT)
pretrainednum_classesdynamic_img_sizedynamic_img_padr   r   )timmImportErrorr=   r[   float16create_modelr   get_default_dtyper   modelsVisionTransformer	attn_poolr   Identityr   drop_vision_last_layerblocks)rv   r   ro   rp   r/  r  r]   r]   r^   r    s,   
	
zMiniCPMV2_0.init_vision_modulerj   r  c                 C   sh   t tj t||d tt| jj|dd||d}W d    n1 s%w   Y  |j	t
jt dS )N   FT)rj   rk   	grid_sizerl   adaptivedo_post_projectionro   rp   rz   r   )r=   r[   r1  r   r   mathsqrtr   r   r   r8   device_typer3  rv   rj   r  ro   rp   r  r]   r]   r^   r    s   
zMiniCPMV2_0.init_resamplerr  c                 C   s   |d }| j jj\}}| j jjj}t| j dd}ttj	  }|D ]>}|d j
dd  \}	}
t|	| t|
| f}| j |d|}|dkrU|d d |d f }|| || q t|S )NrI   num_prefix_tokensr   )r  patch_embed
patch_sizer~   r  r   r   rZ   r[   r\   r   r?  ceilforward_features	unsqueezerJ   r   r  vstack)rv   r  rI   P_hP_wr   rC  respixel_valueHWtgt_sizevision_embeddingr]   r]   r^   r    s   
z$MiniCPMV2_0.get_vision_hidden_statesr  r   )rU   rV   rW   r!  r   r   rt   r   r#  r  r   r   r  r   r  rH   r[   r\   r  r   r]   r]   rw   r^   r$    sF    

&
r$  c                       s   e Zd Zg dddgdZdddedef fd	d
Z	ddededejfddZ		dde
dedB dedejfddZ		ddedededB dedejf
ddZdedejfddZ  ZS )MiniCPMV2_5q_projk_projv_proj	gate_projup_projqkv_projgate_up_projrh   r  r  rp   c                   r%  )Nr  r{   r&  r  rw   r]   r^   rt   '  r'  zMiniCPMV2_5.__init__rq   c                 C   r(  r)  r   r  r]   r]   r^   r  +  r*  zMiniCPMV2_5.init_llmr   ro   Nc                 C   4   t |j||| jd}| jjr|jjd d |j_|S N)ro   rp   r  r   r?   vision_configr  r   r8  encoderlayersrv   r   ro   rp   r  r]   r]   r^   r  2     zMiniCPMV2_5.init_vision_modulerj   r  c              	   C   Z   t tj t| jj||d |||d}W d    n1 sw   Y  |jtjt	 dS Nr:  )ri   rj   rk   rl   ro   rp   r>  
r=   r[   r1  re   r   r   r   r8   rA  r3  rB  r]   r]   r^   r  B  s   

zMiniCPMV2_5.init_resamplerr  c                 C   s  |d }|d }t |}|d jd }tdd |D }|d j}|d j}tj|d||f||d}	t|D ]\}
}|jd	 }||	|
d
d |f< q6|d	}| 	 }t
|ts[J tj||ftj|d}t|D ]\}
}d||
d |f< qj| j|	|dd d}| ||S )NrI   rR   r   rD  c                 s       | ]}|j d  V  qdS r   Nr   r   r   r]   r]   r^   r   ]      z7MiniCPMV2_5.get_vision_hidden_states.<locals>.<genexpr>   r   r   .Tr>   patch_attention_maskrR   r   r   r   rz   r   r[   r   r  prodr   r   r   r   r  rI  r  rv   r  rI   rR   BPLrz   r   all_pixel_valuesr   pixel_values_itemL_itemnum_patchesmax_patchespatch_attn_masknum_patches_itemrR  r]   r]   r^   r  W  .   



z$MiniCPMV2_5.get_vision_hidden_statesr  r   )rU   rV   rW   packed_modules_mappingr   r   rt   r   r#  r  r   r   r  r   r  rH   r[   r\   r  r   r]   r]   rw   r^   rS    sN    


rS  c                          e Zd Zg dddgdZdddedef fd	d
Z	ddededejfddZ			dde
dedB dedejfddZ		ddedededB dedejf
ddZdedejfddZdeeeejf  dee fddZ  ZS )MiniCPMV2_6rT  rX  rY  rZ  rh   r  r  rp   c                   r%  )Nr  r@  r&  r  rw   r]   r^   rt     r'  zMiniCPMV2_6.__init__rq   c                 C   r(  r)  r   r  r]   r]   r^   r    r*  zMiniCPMV2_6.init_llmNr   ro   c                 C   r]  r^  r_  rc  r]   r]   r^   r    rd  zMiniCPMV2_6.init_vision_modulerj   r  c              	   C   re  rf  rg  rB  r]   r]   r^   r       
zMiniCPMV2_6.init_resamplerr  c                 C     |d }|d }t |}|d jd }tdd |D }|d j}|d j}tj|d||f||d}	t|D ]\}
}|jd	 }||	|
d
d |f< q6|d	}| 	 }t
|ts[J tj||ftj|d}t|D ]\}
}d||
d |f< qj| j|	|d|d}| ||S )NrI   rR   r   rD  c                 s   rh  ri  rj  rk  r]   r]   r^   r     rl  z7MiniCPMV2_6.get_vision_hidden_states.<locals>.<genexpr>rm  r   r   .Tr>   rn  rp  rr  r]   r]   r^   r    r}  z$MiniCPMV2_6.get_vision_hidden_statesr  c                 C      t | g dd}||S N)zapm.audiotts)skip_prefixesr  r  r]   r]   r^   r       
zMiniCPMV2_6.load_weightsr  r   rU   rV   rW   r~  r   r   rt   r   r#  r  r   r   r  r   r  rH   r[   r\   r  r   r   r  r  r   r]   r]   rw   r^   r  w  R    



,r  c                       r  )MiniCPMV4_0rT  rX  rY  rZ  rh   r  r  rp   c                   r%  )Nr  rB  r&  r  rw   r]   r^   rt     r'  zMiniCPMV4_0.__init__rq   c                 C   r(  r)  r   r  r]   r]   r^   r    r*  zMiniCPMV4_0.init_llmNr   ro   c                 C   r]  r^  r_  rc  r]   r]   r^   r    rd  zMiniCPMV4_0.init_vision_modulerj   r  c              	   C   re  rf  rg  rB  r]   r]   r^   r    r  zMiniCPMV4_0.init_resamplerr  c                 C   r  )NrI   rR   r   rD  c                 s   rh  ri  rj  rk  r]   r]   r^   r     rl  z7MiniCPMV4_0.get_vision_hidden_states.<locals>.<genexpr>rm  r   r   .Tr>   rn  rp  rr  r]   r]   r^   r    r}  z$MiniCPMV4_0.get_vision_hidden_statesr  c                 C   r  r  r  r  r]   r]   r^   r  6  r  zMiniCPMV4_0.load_weightsr  r   r  r]   r]   rw   r^   r    r  r  c                       r  )MiniCPMV4_5rT  rX  rY  rZ  rh   r  r  rp   c                   r%  )Nr  rD  r&  r  rw   r]   r^   rt   H  r'  zMiniCPMV4_5.__init__rq   c                 C   r(  r)  r   r  r]   r]   r^   r  L  r*  zMiniCPMV4_5.init_llmNr   ro   c                 C   r]  r^  r_  rc  r]   r]   r^   r  S  rd  zMiniCPMV4_5.init_vision_modulerj   r  c              	   C   re  rf  )
r=   r[   r1  r   r   r   r   r8   rA  r3  rB  r]   r]   r^   r  c  r  zMiniCPMV4_5.init_resamplerr  c                 C   s2  |d }|d }| dd }t|}|d jd }tdd |D }|d j}|d j}	tj|d||f|	|d	}
|d u r>d nt|}t	|D ]\}}|jd
 }||
|dd |f< qF|
d
}|  }t|tskJ tj||ftj|d	}t	|D ]\}}d||d |f< qz| j|
|d|d}| |||S )NrI   rR   r   r   rD  c                 s   rh  ri  rj  rk  r]   r]   r^   r     rl  z7MiniCPMV4_5.get_vision_hidden_states.<locals>.<genexpr>rm  r   r   .Tr>   rn  )r  r   r   r   rz   r   r[   r   r:   r  rq  r   r   r   r   r  rI  r  )rv   r  rI   rR   r   rs  rt  ru  rz   r   rv  all_temporal_idsr   rw  rx  ry  rz  r{  r|  rR  r]   r]   r^   r  y  s4   



z$MiniCPMV4_5.get_vision_hidden_statesr  c                 C   r  r  r  r  r]   r]   r^   r    r  zMiniCPMV4_5.load_weightsr  r   r  r]   r]   rw   r^   r  ;  sR    



,#r  )r   r{   r@  rB  rD  )rw  dummy_inputsc                   @   s(   e Zd ZdZdddedefddZdS )	MiniCPMVz
    Different versions of MiniCPMV use different visual encoders and LLMs,
    which is not conducive to the current integration logic of LoRA and
    bitsandbytes in vLLM. Therefore, it is necessary to separate them.
    rh   r  r  rp   c                C   s   |j j}t|ds|jdkr|jdkrd}nd}nt|jd}tdd |D }t	
|}|d u rKd	d
d tt	 D }td| d| | j|j | j|j |||dS )Nr}   r   r   r   r{   r   c                 S   r  r]   r   r   r]   r]   r^   r    r  z$MiniCPMV.__new__.<locals>.<listcomp>z, c                 S   s"   g | ]}|d   d|d  qS )r   r   r>   r]   )r   r   r]   r]   r^   r    s   " z+Currently, MiniCPMV only supports versions z. Got version: r  )r  r  hasattrr   r   r   r}   r   r   _SUPPORT_VERSIONr  joinsortedkeysr  r~  updateembedding_modules)r  r  rp   r   r}   instance_clssupported_versionsr]   r]   r^   __new__  s,   

zMiniCPMV.__new__N)rU   rV   rW   rX   r   r   r  r]   r]   r]   r^   r    s    r  )rX   r?  collectionsr   collections.abcr   r   r   r   	functoolsr   	itertoolsr   typingr	   r
   r   r   numpyr   r[   torch.typesr   torch.nn.initr   transformersr   r   typing_extensionsr   vllm.configr   vllm.config.multimodalr   'vllm.model_executor.layers.quantizationr   $vllm.model_executor.layers.resamplerr   r   r    vllm.model_executor.models.llamar   "vllm.model_executor.models.minicpmr   )vllm.model_executor.models.module_mappingr    vllm.model_executor.models.qwen2r    vllm.model_executor.models.qwen3r    vllm.multimodalr!   vllm.multimodal.inputsr"   r#   r$   r%   vllm.multimodal.parser&   r'   r(   r)   r*   r+   r,   r-   r.   r/   vllm.multimodal.processingr0   $vllm.multimodal.processing.processorr1   r2   r3   r4   r5   r6   r7   vllm.platformsr8   vllm.sequencer9   vllm.utils.collection_utilsr:   vllm.utils.tensor_schemar;   r<   vllm.utils.torch_utilsr=   idefics2_vision_modelr?   
interfacesr@   rA   rB   rC   r   rD   rE   rF   rk  rH   r_   rc   rY   r   r   re   r   r   r   r  r   r\   r  r  r"  r'  r.  rt  rv  r  r#  r  r$  rS  r  r  r  r  register_processorr  r]   r]   r]   r^   <module>   s   0$	_ C /   V]]bbg	