o
    پip                     @   s  d Z ddlZddlmZ ddlmZ ddlmZmZm	Z	m
Z
mZmZmZmZmZ ddlZddlZddlZddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlm Z  ddl!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z. ddl/m0Z0m1Z1 ddl2m3Z3m4Z4 ddl5m6Z6m7Z7 ddl8m9Z9m:Z: eejej;f Z<	dAde=dej>dee=e=f dej;fddZ?	dAde=dej>dee=e=f dej;fdd Z@	!	dBde=d"ee=ee=e=f f d#eAdee=e=f dej;f
d$d%ZBG d&d' d'eZCG d(d) d)eZDeeCeDf ZEeejFd*d+ZGG d,d- d-ejHZIG d.d/ d/eIZJG d0d1 d1eIZKd2edee=d3f fd4d5ZLG d6d7 d7ejHZMG d8d9 d9eMZNG d:d; d;eMZOG d<d= d=eMZPeNeOePd>ZQG d?d@ d@ZReRZSdS )CzCInference-only MiniCPM-V model compatible with HuggingFace weights.    N)partial)chain)	AnyCallableIterableListLiteralOptionalTuple	TypedDictUnion)Image)nn)trunc_normal_)PretrainedConfig)ReplicatedLinear)LogitsProcessor)QuantizationConfig))MultiModalityDataPaddingPatternTokenPairsgeneral_mm_embed_routine)MultimodalDataItemMultimodalInputs)ForwardBatch)set_default_torch_dtype)default_weight_loader)Idefics2VisionTransformer)LlamaConfigLlamaForCausalLM)Qwen2ConfigQwen2ForCausalLM)Qwen3ConfigQwen3ForCausalLM)
add_prefixflatten_nested_list   r   	embed_dimposversionreturnc                 C   s   | d dksJ t j| d t jd}|| d  }dd|  }|dkrC|d}t d	||}t |}t |}t j||gd
d}|S t d||}t |}t |}t j||gdd}|S )z
    embed_dim: output dimension for each position
    pos: a list of positions to be encoded: size (M,) / (H, W)
    out: (M, D) / (H, W, D)
    r%   r   dtype       @      ?'  r$   m,d->md   axisz	hw,d->hwdnparangefloat32reshapeeinsumsincosconcatenate)r&   r'   r(   omegaoutemb_sinemb_cosemb rB   N/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/minicpmv.py!get_1d_sincos_pos_embed_from_gridE   s    




rD   gridc                 C   sl   | d dksJ t | d |d |}t | d |d |}|dkr+tj||gdd}|S tj||gdd}|S )Nr%   r   r1   r$   r2   r/   )rD   r5   r<   )r&   rE   r(   emb_hemb_wrA   rB   rB   rC   !get_2d_sincos_pos_embed_from_grid`   s   rH   F	grid_size	cls_tokenc           
      C   s   t |tr||}}n	|d |d }}tj|tjd}tj|tjd}t||}tj|dd}t |tjr?|jd||fksAJ |dkrf|	dd||g}t
| ||}	|rdtjtd| g|	gdd}	|	S t
| ||}	|	S )z
    grid_size: int of the grid height and width
    return:
    pos_embed: [grid_size*grid_size, embed_dim] or
                [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
    r   r1   r*   r2   r%   r$   )
isinstanceintr5   r6   r7   meshgridstackndarrayshaper8   rH   r<   zeros)
r&   rI   rJ   r(   grid_h_sizegrid_w_sizegrid_hgrid_wrE   	pos_embedrB   rB   rC   get_2d_sincos_pos_embedt   s    
 rW   c                   @   s@   e Zd ZU ed ed< eej ed< 	 ejed< 	 ejed< dS )MiniCPMVImagePixelInputspixel_valuestypedataimage_bounds	tgt_sizesN)__name__
__module____qualname__r   __annotations__r   torchTensorrB   rB   rB   rC   rX      s   
 

rX   c                   @   s0   e Zd ZU ed ed< ejed< 	 ejed< dS )MiniCPMVImageEmbeddingInputsimage_embedsrZ   r[   r\   N)r^   r_   r`   r   ra   rb   rc   rB   rB   rB   rC   rd      s   
 

rd   gư>)epsc                       s   e Zd ZdZdedddfdedededee d	eegej	f d
e
dee deddf fddZdejddfddZdefddZ  ZS )BaseResamplerz
    A 2D perceiver-resampler network with one cross attention layers by
        (grid_size**2) learnable queries and 2d sincos pos_emb.
    Outputs:
        A tensor with the shape of (grid_size**2, embed_dim)
    NT num_queriesr&   	num_headskv_dim
norm_layerdo_post_projectionquant_configprefixr)   c	           	         s   t    || _|| _|| _tt| j|| _	t
| j	dd |d ur6||kr6t||d|td|d| _ndd | _t||| _||| _||| _|| _|rU||nd | _|rjt|d t|| | _d S d | _d S )	N{Gz?stdFkv_proj)biasrn   ro   c                  _   s   t  | i |d fS N)r   Identity)argskwargsrB   rB   rC   <lambda>   s   z(BaseResampler.__init__.<locals>.<lambda>g      )super__init__ri   r&   rj   r   	Parameterrb   rQ   queryr   r   r"   rs   MultiheadAttentionattnln_qln_kvrm   ln_postrandnproj)	selfri   r&   rj   rk   rl   rm   rn   ro   	__class__rB   rC   r{      s2   


	

zBaseResampler.__init__mc                 C   s   t |tjr&t|jdd t |tjr"|jd ur$tj|jd d S d S d S t |tjr>tj|jd tj|jd d S d S )Nrp   rq   r   r-   )	rK   r   Linearr   weightrt   init	constant_	LayerNorm)r   r   rB   rB   rC   _init_weights   s   zBaseResampler._init_weightsNc                 C   s   | dd|dS )Nr1   )	unsqueezerepeat)r   r}   r   rB   rB   rC   _repeat  s   zBaseResampler._repeat)r^   r_   r`   __doc__
DEFAULT_LNrL   r	   r   r   r   boolr   strr{   Moduler   r   __classcell__rB   rB   r   rC   rg      s8    	
,	rg   c                       s   e Zd Zdedddfdedededee deegejf d	e	eef d
ee
 deddf fddZ	dd	e	eef dejjddfddZdejdejjddfddZdejdejdejfddZ  ZS )Resampler2_5NF   r   rh   ri   r&   rj   rk   rl   max_sizern   ro   r)   c	           	   	      s<   t  j|||||||d || _| | j | | j d S Nrn   ro   )rz   r{   r   _set_2d_pos_cacheapplyr   )	r   ri   r&   rj   rk   rl   r   rn   ro   r   rB   rC   r{     s   
zResampler2_5.__init__cpudevicec                 C   8   t | j|dd}t| |}| jd|dd d S Nr%      )r(   rV   F
persistentrW   r&   rb   
from_numpyfloattoregister_bufferr   r   r   pos_embed_arrrV   rB   rB   rC   r   "  
   zResampler2_5._set_2d_pos_cacher]   c                 C      |d d df    }|d d df    }t|tr"t|ts$J || jd ks2|| jd krLt || jd t || jd f| _| | j| d S d S Nr   r1   maxitemrK   rL   r   r   r   r]   r   max_hmax_wrB   rB   rC   _adjust_pos_cache+     zResampler2_5._adjust_pos_cachexc                 C   s  |j d |j d ksJ |j d }|j}|j}|d d df |d d df  }| j||d |  }t|ts;J tj	||ftj
|d}g }	t|D ]/}
||
  \}}|	| jd |d |d d f || df| d||
||
 d f< qLtjjjj|	dddddd	}	| |\}}| |ddd	}| | j}| j| ||||	 ||d
d }|ddd	}| |}|| j }|S )Nr   r1   r   r+   r   r/   T        batch_firstpadding_valuer%   key_padding_mask)rP   r   r+   r   r   r   rK   rL   rb   rQ   r   rangetolistappendrV   r8   r   r   utilsrnnpad_sequencepermuters   r   r   r}   r   r   r   r   )r   r   r]   bsr   r+   	patch_lenmax_patch_lenr   rV   itgt_htgt_w_qr>   rB   rB   rC   forward9  sP   
 .



zResampler2_5.forwardr   )r^   r_   r`   r   rL   r	   r   r   r   r
   r   r   r{   rb   typesDevicer   rc   r   r   r   rB   rB   r   rC   r     sR    
	


	
$r   c                       s  e Zd Zdeddddfdedededee d	eegejf d
e	eef dee
 deddf fddZdedejfddZ	d"d
e	eef dejjddfddZdejdejjddfddZ	d"dedejjddfddZ	d"dedejjfddZ	d#dejdejdejfd d!Z  ZS )$Resampler4_5Nr   i  rh   ri   r&   rj   rk   rl   r   rn   ro   r)   c
           
   	      sN   t  j|||||||	d || _|| _| | j | | j | | j d S r   )rz   r{   r   max_temporal_sizer   _set_temporal_pos_cacher   r   )
r   ri   r&   rj   rk   rl   r   r   rn   ro   r   rB   rC   r{   l  s   
zResampler4_5.__init__r'   c                 C   s~   |d dksJ t j|d t jd}||d  }dd|  }|d}t d||}t |}t |}t j||gd	d
}|S )z
        embed_dim: output dimension for each position
        pos: a list of positions to be encoded: size (M,)
        out: (M, D)
        r%   r   r*   r,   r-   r.   r/   r0   r1   r2   r4   )r   r&   r'   r=   r>   r?   r@   rA   rB   rB   rC   *get_1d_sincos_pos_embed_from_temporal_size  s   


z7Resampler4_5.get_1d_sincos_pos_embed_from_temporal_sizer   r   c                 C   r   r   r   r   rB   rB   rC   r     r   zResampler4_5._set_2d_pos_cacher]   c                 C   r   r   r   r   rB   rB   rC   r     r   zResampler4_5._adjust_pos_cacher   c                 C   sB   t j|t jd}t| | j| |}| j	d|dd d S )Nr*   temporal_pos_embedFr   )
r5   r6   r7   rb   r   r   r&   r   r   r   )r   r   r   temporal_sizerV   rB   rB   rC   r     s   	z$Resampler4_5._set_temporal_pos_cachec                 C   s&   || j kr|| _ | | j | d S d S ru   )r   r   )r   r   r   rB   rB   rC   _adjust_temporal_pos_cache  s   
z'Resampler4_5._adjust_temporal_pos_cacher   c                 C   sT  |j d |j d ksJ |j d }|j}|j}|d d df |d d df  }| j||d d}d }	|d urRtt|}	t|	}
|
dkrGd}|
| jkrR| 	|
| | 
 }t|ts_J tj||ftj|d}| |\}}| |ddd}| | j}g }g }t|D ]O}|| \}}|r|	| dkr|tj| j||d n|| j|	|  | || jd |d |d d f || df| d|||| d f< qtjjjj|dd	d
ddd}|}|| }|r|tj |dd7 }t!|}g }g }g }d}|D ]P}|t!| }||d d ||d d f dddd| j ||d d ||d d f dddd| j ||||d d f dd |}q	tjjjj|dd	d
ddd}tjjjj|dd	d
ddd}tjjjj|ddd
"d}| j#| $|||||dd }|ddd}| %|}|| j& }|S )Nr   r1   r   Fr/   Tr   r%   r   r   dimr   )'rP   r   r+   r   listr   from_iterabler   r   r   r   rK   rL   rb   rQ   r   rs   r   r   r   r}   r   r   r&   r   r   rV   r8   r   r   r   r   rN   lensqueezer   r   r   r   )r   r   r]   temporal_idsr   r   r+   r   temporal_pos_embtemporal_ids_flattenr   r   r   r   r   pos_embed_2dpos_embed_temporalr   r   r   kvmerge_kmerge_vmerge_key_padding_maskstarttpendr>   rB   rB   rC   r     s   
 
.
,,





zResampler4_5.forwardr   ru   )r^   r_   r`   r   rL   r	   r   r   r   tupler   r   r{   r5   rO   r   rb   r   r   r   rc   r   r   r   r   r   rB   rB   r   rC   r   j  s    
	



	


r   config.c                 C   sP   t | dd }|d u r| jdkr| jdkrdS dS t|}tdd |dD S )	Nr(   i 	  @   r$   r   c                 s   s    | ]}t |V  qd S ru   rL   .0r   rB   rB   rC   	<genexpr>I  s    z(get_version_by_config.<locals>.<genexpr>.)getattrhidden_size	query_numr   r   split)r   version_floatversion_strrB   rB   rC   get_version_by_config>  s   r   c                       s  e Zd ZdZddddedee def fdd	Z		d/d
e	j
dee dededee dee de	j
fddZd
e	j
dedee fddZd
e	j
dee dee	j
e	j
f fddZdejfddZd
e	j
de	j
dedede	j
f
ddZ		d0dedee dedejfdd Z	d1dedee dedejfd!d"Z		d0d#ed$edee dedejf
d%d&Z		d/d'ee	j
 d(ee	j
 d)ee	j
 de	j
fd*d+Zd,ee de	j
fd-d.Z  Z S )2MiniCPMBaseModelz_
    The abstract class of MiniCPMV can only be inherited, but cannot be
    instantiated.
    Nrh   r   r   rn   ro   c                   s   t    || _t| j| _| j||td|d| _| ||td|| _	| jdkr.| j	j
n| j	jj
| _| jj| _
| j| j
| j|td|d| _t|| _d S )Nllmr   rn   ro   vpmr$   	resamplerr   )rz   r{   r   r   r(   init_llmr"   r  init_vision_moduler  r&   
embeddings
vision_dimr   init_resamplerr  r   logits_processorr   r   rn   ro   r   rB   rC   r{   R  s*   


zMiniCPMBaseModel.__init__	input_ids
pad_valuesim_start_id	im_end_idslice_start_idslice_end_idr)   c                 C   sN  ||k}||k}|dur|||kO }|||kO }t |\}	|	d7 }	t |\}
t|	t|
krbt|	d t|
krb|d |v rbt|	dkrbt|
dkrb|
d |	d k rbt t jdg|	jd|	g}	tt|	t|
}|dkrwt jd|jdS g }t|D ]}|	| }|
| }||k r|	||f q}|st jd|jdS t j||jd}|S )z`
        Returns a tensor indicating the bounds (start and end token ids) of the images
        Nr1   r   r   )r   r%   )
rb   wherer   cattensorr   minrQ   r   r   )r   r  r  r  r  r  r  
start_condend_condimage_start_tokensimage_end_tokensvalid_image_numsvalid_pairsr   start_token	end_tokenvalid_pairs_tensorrB   rB   rC   _get_image_boundsw  sB   z"MiniCPMBaseModel._get_image_boundsrx   c                 K   s   | dg }| dg }| dd }| dd }| dd }| dd }| dd }	| dd }
|	d ura| j||
||||d	}t|	tjtfsPtd
t|	 t|	trZt|	}	t	||	ddS | j||
||||d	}t
|j|jd||ddS )NrY   r]   r  r  r  r  re   r  )r  r  r  r  r  r  z*Incorrect type of image embeds. Got type: )r\   r[   rZ   r   )r\   r[   r]   rZ   )popr  rK   rb   rc   r   
ValueErrorrZ   r  rd   rX   r   r   )r   r  rx   rY   r]   r  r  r  r  re   r  r\   rB   rB   rC   _parse_and_validate_inputs  sX   

z+MiniCPMBaseModel._parse_and_validate_inputsimage_inputsc              	   C   s   | j |}|d u rtjg |jd}||fS |d dkr)|d |j|j}n| |}|d }t	|dkrat
dd | D |j}|d|d	d
d
|jd	 |d	|jd	  ||fS )Nr   rZ   re   r[   r\   r   c                 S   s"   g | ]\}}t j||t jd qS )r*   )rb   r6   long)r   r   r   rB   rB   rC   
<listcomp>  s    z2MiniCPMBaseModel.get_embedding.<locals>.<listcomp>r/   r1   )r  get_input_embeddingsrb   r  r   rZ   r+   r   get_vision_hidden_statesr   rN   r   scatter_viewr   rP   )r   r  r#  vlm_embeddingvision_hidden_statesr\   image_indicesrB   rB   rC   get_embedding  s2   
zMiniCPMBaseModel.get_embeddingc                 C   
   | j  S ru   )r  r&  r   rB   rB   rC   r&    s   
z%MiniCPMBaseModel.get_input_embeddings	positionsforward_batchc                 K   s   t ||| | j|d}|S )N)r  r1  multimodal_modellanguage_modelr0  )r   r  )r   r  r0  r1  rx   hidden_statesrB   rB   rC   r     s   zMiniCPMBaseModel.forwardc                 C      t ru   NotImplementedErrorr  rB   rB   rC   r       zMiniCPMBaseModel.init_llmc                 C   r5  ru   r6  r  rB   rB   rC   r  &  r8  z#MiniCPMBaseModel.init_vision_moduler&   r  c                 C   r5  ru   r6  )r   r&   r  rn   ro   rB   rB   rC   r	  .  s   zMiniCPMBaseModel.init_resamplerrY   patch_attn_maskr]   c                 C   r5  ru   r6  )r   rY   r9  r]   rB   rB   rC   get_vision_embedding7  r8  z%MiniCPMBaseModel.get_vision_embeddingitemsc                 C   r5  ru   r6  )r   r;  rB   rB   rC   get_image_feature?  s   z"MiniCPMBaseModel.get_image_featureNNNrh   rh   )!r^   r_   r`   r   r   r	   r   r   r{   rb   rc   r   rL   r  objectMiniCPMVImageInputsr"  r
   r-  r   	Embeddingr&  r   r   r   r   r  r  r	  r:  r   r<  r   rB   rB   r   rC   r   L  s    	+
;
5
$




 r   c                       <  e Zd Zg dddgdZg dZdddd	d
dZi Zg Z		d(dede	e
 def fddZ		d(dede	e
 dedejfddZ	d)dede	e
 dedejfddZ		d(dedede	e
 dedejf
ddZ		d*deej de	ej de	ej dejfdd Zd!ee dejfd"d#Zd$ee d%efd&d'Z  ZS )+MiniCPMV2_6q_projk_projv_proj	gate_projup_projqkv_projgate_up_projfc1fc2out_projrL  o_projrM  	down_projrs   rL  r   rL  r1   rL  r%   rM  r   rM  r1   rF  rG  rH  rI  rJ  Nrh   r   rn   ro   c                    $   t  j|||d | jdksJ d S )Nr  r%      rz   r{   r(   r  r   rB   rC   r{   k     zMiniCPMV2_6.__init__r)   c                 C      t |||dS Nr  )r   r  rB   rB   rC   r  t     zMiniCPMV2_6.init_llmc                 C   P   t |j||d}| jjr|jjd d |j_t|d|jj t|d|jj	 |S Nr  r/   r&   
patch_size
r   vision_configr   drop_vision_last_layerencoderlayerssetattrr  r&   rd  r   r   rn   ro   modelrB   rB   rC   r  |     zMiniCPMV2_6.init_vision_moduler&   r  c              	   C   X   t tj t| jj||d |||d}W d    n1 sw   Y  |jdt dS N   )ri   r&   rj   rk   rn   ro   cuda)r   r+   r   rb   float16r   r   r   r   get_default_dtyper   r&   r  rn   ro   r  rB   rB   rC   r	       zMiniCPMV2_6.init_resamplerrY   r9  r]   c                 C      | j |||d}|S Npatch_attention_maskr]   r  r   rY   r9  r]   vision_embeddingrB   rB   rC   r:       z MiniCPMV2_6.get_vision_embeddingr;  c                 C     t dd |D }tjt dd |D dd}t||jd ks"J | jjjjj	}| jjjjj
}dd |D }|d d df |d d df    }t|tsRJ tjjjj|dd	d
}|j\}	}
}|ddd|	dd|
}tj|	d|ftj|d}| j|j	d}|d d df |d d df  }tj|d|j	dd|dk |d d dd d f< | j||||d}| ||S )Nc                 S      g | ]}|j qS rB   featurer   r   rB   rB   rC   r%        z1MiniCPMV2_6.get_image_feature.<locals>.<listcomp>c                 S   r  rB   tgt_sizer  rB   rB   rC   r%    r  r   r   c                 S       g | ]}|j d dd dqS r1   )end_dimr   flattenr   r   r   rB   rB   rC   r%        r1   Tr   r   r%      r/   r   r   ry  r#   rb   rN   r   rP   r  r  position_embeddingr   r   r+   r   r   rK   rL   r   r   r   r   r   r8   rQ   r   cloner   r6   sizer   rZ   r  r   r;  rY   r]   r   r+   all_pixel_values_lstmax_patchesall_pixel_valuesBLr   r9  tgt_sizes_tensormask_shapesr}  rB   rB   rC   r<    B   (
 zMiniCPMV2_6.get_image_featurer  r#  c           	      C   <   |j }|j}|j}|j}||f||fg}t|}|||S ru   r  r  r  r  r   pad_input_tokens	r   r  r#  r  r  r  r  media_token_pairspatternrB   rB   rC   pad_input_ids     zMiniCPMV2_6.pad_input_idsr>  r?  r=  )r^   r_   r`   packed_modules_mappingsupported_lora_modules#bitsandbytes_stacked_params_mappingembedding_modulesembedding_padding_modulesr   r	   r   r   r{   r   r   r   r  r  rL   r	  r   rb   rc   r:  r   r<  r   r  r   rB   rB   r   rC   rD  C      	



'rD  c                       rC  )+MiniCPMV4_0rE  rI  rJ  rK  rN  rT  rU  rV  rW  rX  rY  Nrh   r   rn   ro   c                    rZ  )Nr     r   r]  r  r   rB   rC   r{   	  r^  zMiniCPMV4_0.__init__r)   c                 C   r_  r`  )r   r  rB   rB   rC   r    ra  zMiniCPMV4_0.init_llmc                 C   rb  rc  re  rk  rB   rB   rC   r    rm  zMiniCPMV4_0.init_vision_moduler&   r  c              	   C   rn  ro  rr  ru  rB   rB   rC   r	  *  rv  zMiniCPMV4_0.init_resamplerrY   r9  r]   c                 C   rw  rx  r{  r|  rB   rB   rC   r:  >  r~  z MiniCPMV4_0.get_vision_embeddingr;  c                 C   r  )Nc                 S   r  rB   r  r  rB   rB   rC   r%  M  r  z1MiniCPMV4_0.get_image_feature.<locals>.<listcomp>c                 S   r  rB   r  r  rB   rB   rC   r%  O  r  r   r   c                 S   r  r  r  r  rB   rB   rC   r%  U  r  r1   Tr   r   r%   r  r/   r   r   ry  r  r  rB   rB   rC   r<  K  r  zMiniCPMV4_0.get_image_featurer  r#  c           	      C   r  ru   r  r  rB   rB   rC   r  r  r  zMiniCPMV4_0.pad_input_idsr>  r?  r=  )r^   r_   r`   r  r  r  r  r  r   r	   r   r   r{   r   r   r   r  r  rL   r	  r   rb   rc   r:  r   r<  r   r  r   rB   rB   r   rC   r    r  r  c                       sH  e Zd Zg dddgdZg dZdddd	d
dZi Zg Z		d*dede	e
 def fddZ		d*dede	e
 dedejfddZ	d+dede	e
 dedejfddZ		d*dedede	e
 dedejf
ddZ		d,deej de	ej de	ej dejfdd Zd!ee dejfd"d#Zd$ee d%efd&d'Z fd(d)Z  ZS )-MiniCPMV4_5rE  rI  rJ  rK  rN  rT  rU  rV  rW  rX  rY  Nrh   r   rn   ro   c                    rZ  )Nr  r  r   r]  r  r   rB   rC   r{     r^  zMiniCPMV4_5.__init__r)   c                 C   s$   t |||d}tdd ||_|S )Nr  c                 S   r.  ru   )rl  r&  r/  rB   rB   rC   ry     s   
 z&MiniCPMV4_5.init_llm.<locals>.<lambda>)r!   r   
MethodTyper&  )r   r   rn   ro   r  rB   rB   rC   r    s
   zMiniCPMV4_5.init_llmc                 C   rb  rc  re  rk  rB   rB   rC   r    rm  zMiniCPMV4_5.init_vision_moduler&   r  c              	   C   rn  ro  )r   rb   rs  r   r   r   r   rt  ru  rB   rB   rC   r	    rv  zMiniCPMV4_5.init_resamplerrY   r9  r]   c                 C   rw  rx  r{  r|  rB   rB   rC   r:    r~  z MiniCPMV4_5.get_vision_embeddingr;  c                 C   r  )Nc                 S   r  rB   r  r  rB   rB   rC   r%    r  z1MiniCPMV4_5.get_image_feature.<locals>.<listcomp>c                 S   r  rB   r  r  rB   rB   rC   r%    r  r   r   c                 S   r  r  r  r  rB   rB   rC   r%    r  r1   Tr   r   r%   r  r/   r   r   ry  r  r  rB   rB   rC   r<    r  zMiniCPMV4_5.get_image_featurer  r#  c           	      C   r  ru   r  r  rB   rB   rC   r    r  zMiniCPMV4_5.pad_input_idsc                    s   t    | S ru   )rz   evalr/  r   rB   rC   r     s   
zMiniCPMV4_5.evalr>  r?  r=  )r^   r_   r`   r  r  r  r  r  r   r	   r   r   r{   r    r   r   r  r  rL   r	  r   rb   rc   r:  r   r<  r   r  r  r   rB   rB   r   rC   r    s    	



'r  )r[  r  r  c                	       s   e Zd ZU dZi Zg Zi Zg Zej	e
d< 		ddedee deddf fd	d
Zdd Zdd Zdeeeejf  fddZ  ZS )MiniCPMVz
    Different versions of MiniCPMV use different visual encoders and LLMs,
    which is not conducive to the current integration logic of LoRA and
    bitsandbytes in SGLang. Therefore, it is necessary to separate them.
    minicpmvNrh   r   rn   ro   r)   c           	   
      s   t    t|dsd}nt|jd}tdd |D }t|}|d u r?d	dd t
t D }td| d	| z||||d
}|| _W n ty` } z	td|  |d }~ww || _d S )Nr(   r[  r   c                 S   s   g | ]}t |qS rB   r   r   rB   rB   rC   r%  D  s    z%MiniCPMV.__init__.<locals>.<listcomp>z, c                 S   s"   g | ]}|d   d|d  qS )r   r   r1   rB   )r   r   rB   rB   rC   r%  I  s   " z+Currently, MiniCPMV only supports versions z. Got version: r  z Failed to instantiate MiniCPMV: )rz   r{   hasattrr   r(   r   r   _SUPPORT_VERSIONgetjoinsortedkeysr!  r  	Exceptionprintr   )	r   r   rn   ro   r(   instance_classsupported_versionsr  er   rB   rC   r{   8  s8   




zMiniCPMV.__init__c                 C   s   |dkrd S t | j|S )Nr  )r   r  )r   namerB   rB   rC   __getattr__Z  s   zMiniCPMV.__getattr__c                 O   s   | j |i |S ru   )r  )r   rw   rx   rB   rB   rC   __call___  s   zMiniCPMV.__call__weightsc                 C   s  g d}t | j }|D ]~\}}d|v sd|v rqd|v s"d|v r#q|dr-||vr-q|dd}d	|v rG|| }t|d
t}||| q|D ](\}}	}
|	|vrSqI||	|}|drc||vrcqI|| }|j}||||
  n|dr|||vr|q|| }t|d
t}||| qd S )N))rL  rF  r   )rL  rG  r   )rL  rH  r   )rM  rI  r   )rM  rJ  r1   zrotary_emb.inv_freq~	projectorzrotary_emb.cos_cachedzrotary_emb.sin_cachedzmodel.vision_towerzself_attn.out_projzself_attn.projsamplerweight_loaderz.bias)	dictr  named_parameters
startswithreplacer   r   endswithr  )r   r  stacked_params_mappingparams_dictr  loaded_weightparamr  
param_nameweight_nameshard_idrB   rB   rC   load_weightsb  s@   	

zMiniCPMV.load_weightsr>  )r^   r_   r`   r   r  r  r  r  r   r   ra   r   r	   r   r   r{   r  r  r   r
   rb   rc   r  r   rB   rB   r   rC   r  (  s*   
 
"$r  )r$   )Fr$   )Tr   r   	functoolsr   	itertoolsr   typingr   r   r   r   r   r	   r
   r   r   numpyr5   rb   torch.typesPILr   r   torch.nn.initr   transformersr   sglang.srt.layers.linearr   "sglang.srt.layers.logits_processorr   *sglang.srt.layers.quantization.base_configr   sglang.srt.managers.mm_utilsr   r   "sglang.srt.managers.schedule_batchr   r   ,sglang.srt.model_executor.forward_batch_infor   sglang.srt.model_loader.utilsr   $sglang.srt.model_loader.weight_utilsr   sglang.srt.models.idefics2r   sglang.srt.models.llamar   r   sglang.srt.models.qwen2r   r   sglang.srt.models.qwen3r    r!   sglang.srt.utilsr"   r#   rc   RawImageTyperL   rO   rD   rH   r   rW   rX   rd   rA  r   r   r   rg   r   r   r   r   rD  r  r  r  r  
EntryClassrB   rB   rB   rC   <module>   s   ,





!Ad U x   'n