o
    پix                     @   s  d Z ddlZddlmZmZ ddlmZmZmZm	Z	m
Z
 ddlZddlZddlmZ ddlm  mZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZ ddl m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z)m*Z* ddl+m,Z,m-Z- ddl.m/Z/ ddl0m1Z1 ddl2m3Z3 ddl4m5Z5 ddl6m7Z7 e8e9Z:G dd dej;Z<G dd dej;Z=G dd dej;Z>G dd dej;Z?G dd dej;Z@G d d! d!ej;ZAee7ZBG d"d# d#ej;ZCeCgZDdS )$zDInference-only Ernie45-VL model compatible with HuggingFace weights.    N)	lru_cachepartial)IterableListOptionalTupleType)	rearrange)PretrainedConfig)	QuickGELU)VisionAttention)RMSNorm)ColumnParallelLinearRowParallelLinear)LogitsProcessor)FusedMoE)QuantizationConfig)ParallelLMHead)/MultiModalityDataPaddingPatternMultimodalTokensgeneral_mm_embed_routine)MultimodalDataItemMultimodalInputs)ForwardBatch)default_weight_loader)Ernie4_5_VLMoeModel)
add_prefix)get_processorc                       s\   e Zd Zdeddfdededeej dee	 de
f
 fdd	Zd
ejdejfddZ  ZS )Ernie4_5_VisionMLPN in_featureshidden_features	act_layerquant_configprefixc                    sF   t    t|||td|d| _| | _t|||td|d| _d S )Nfc1r"   r#   fc2)super__init__r   r   r$   actr   r&   )selfr   r    r!   r"   r#   	__class__ P/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/ernie45_vl.pyr(   5   s   
zErnie4_5_VisionMLP.__init__xreturnc                 C   s*   |  |\}}| |}| |\}}|S N)r$   r)   r&   )r*   r/   
x_parallel_r-   r-   r.   forwardL   s   
zErnie4_5_VisionMLP.forward)__name__
__module____qualname__r   intr   nnModuler   r   strr(   torchTensorr4   __classcell__r-   r-   r+   r.   r   3   s"    r   c                       sz   e Zd Zedddfdedededeej deej de	e
 d	ed
df fddZdejdejdejd
ejfddZ  ZS )Ernie4_5_VisionBlockNr   dim	num_heads	mlp_ratior!   
norm_layerr"   r#   r0   c           	   
      s|   t    |d u rttjdd}||| _||| _t|| }t|||dd|t	d|d| _
t||||t	d|d| _d S )Nư>epsTattn)	embed_dimrA   projection_sizeuse_qkv_parallelflatten_batchr"   r#   mlp)r!   r"   r#   )r'   r(   r   r9   	LayerNormnorm1norm2r8   r   r   rG   r   rL   )	r*   r@   rA   rB   r!   rC   r"   r#   mlp_hidden_dimr+   r-   r.   r(   U   s,   



	zErnie4_5_VisionBlock.__init__r/   
cu_seqlensposition_embeddingsc                 C   sN   |  |}t|d}| j|||d}t|d}|| }|| | | }|S )Nzs b ... -> b s ...rQ   rR   zb s ... -> s b ...)rN   r	   rG   rL   rO   )r*   r/   rQ   rR   hidden_statesrG   r-   r-   r.   r4   w   s   


zErnie4_5_VisionBlock.forward)r5   r6   r7   r   r8   floatr   r9   r:   r   r   r;   r(   r<   r=   r4   r>   r-   r-   r+   r.   r?   S   s>    	"r?   c                	       sL   e Zd Z			ddedededdf fd	d
ZdejdejfddZ  ZS )Ernie4_5_VisionPatchEmbed         
patch_sizein_chansrH   r0   Nc                    s:   t    || _|| _|| _tj|| | |dd| _d S )NF)bias)r'   r(   rZ   in_channelsrH   r9   Linearproj)r*   rZ   r[   rH   r+   r-   r.   r(      s
   
z"Ernie4_5_VisionPatchEmbed.__init__rT   c                 C   s"   | j jj}||}|  |}|S r1   )r_   weightdtypeto)r*   rT   target_dtyper-   r-   r.   r4      s   


z!Ernie4_5_VisionPatchEmbed.forward)rW   rX   rY   )	r5   r6   r7   r8   r(   r<   r=   r4   r>   r-   r-   r+   r.   rV      s    rV   c                       s^   e Zd Z	ddeddf fddZdd Zd	d
 Zdeeee	j
f  dee fddZ  ZS ) VariableResolutionResamplerModelr   r#   r0   Nc              	      s~  t    || _|| _|| _|| _|| _|j| _| j| j | j | _| j| j | j | j | _	t
| j| jddt|dd | dd| _t | _t
| j| jddt|dd | dd| _tj| jdd| _| jrt
| j	| jddt|dd | dd| _t | _t
| j| jddt|dd | d	d| _tj| jdd| _t
| j| jddt|dd | d
d| _t|t|ddd| _d S )NTr"   z.spatial_linear1)r\   gather_outputr"   r#   z.spatial_linear2rD   rE   z.temporal_linear1z.temporal_linear2z.mlprms_norm_eps)hidden_sizerF   )r'   r(   in_dimout_dimconfigspatial_conv_sizetemporal_conv_sizeuse_temporal_convspatial_dimtemporal_dimr   getattrspatial_linear1r9   GELUspatial_geluspatial_linear2rM   spatial_normtemporal_linear1temporal_gelutemporal_linear2temporal_normrL   r   
after_norm)r*   rh   ri   rk   rl   rj   r#   r+   r-   r.   r(      s   
	

	
	

	
	
	z)VariableResolutionResamplerModel.__init__c                 C   s$   |j \}}|d||d  g}|S )N   )shapereshape)r*   r/   rk   SCr-   r-   r.   spatial_conv_reshape   s   
z5VariableResolutionResamplerModel.spatial_conv_reshapec                    s^    fdd}d
 fdd	} fdd} fdd	}||} j r)|||}||}||}|S )Nc                    sB     |  j}  | \} } | }  | \} } | } | S r1   )r   rk   rq   rs   rt   ru   r/   r3   r*   r-   r.   fwd_spatial   s   

z=VariableResolutionResamplerModel.forward.<locals>.fwd_spatialFc              
      s  |   }|d d df |d d dd f }}|d jd  }|d jd  }tj|j|jd}d|d< | d d |dd < g }	t	|||D ]!\}
}}t
d|
dD ]}|	t|||  ||d |   qYqNttj|	dd| j}	g }t	|||D ]'\}
}}t
|
dkrdnd|
dD ]}|t|||  ||d |   qqttj|dd| j}tj| d|	d}tj| d|d}tj||gdd} | S )	Nr      r{   r|   ra   )axis)r@   indexr@   )cpunumpyprodrk   npemptysizera   cumsumziprangeappendaranger<   tensorconcatenaterb   deviceindex_selectconcat)r/   grid_thw	to_tensorgrid_thw_cpugrid_tgrid_hwgrid_hw_after_convtokens_per_img_or_vidbatch_offsetslice_offsetstemporoal_sizespatial_sizeb_offsettemp_offsetslice_offsets2x_timestep_1x_timestep_2r   r-   r.   fwd_placeholder  sZ   &

	zAVariableResolutionResamplerModel.forward.<locals>.fwd_placeholderc                    s4     | \} } | }  | \} } | } | S r1   )rv   rw   rx   ry   r   r   r-   r.   fwd_temporal8  s
   

z>VariableResolutionResamplerModel.forward.<locals>.fwd_temporalc                    s     | \} } | } | S r1   )rL   rz   r   r   r-   r.   fwd_mlp?  s   
z9VariableResolutionResamplerModel.forward.<locals>.fwd_mlpF)rm   )r*   r/   r   r   r   r   r   r-   r   r.   r4      s   
1
z(VariableResolutionResamplerModel.forwardweightsc                 C   sZ   t | jdd}t }|D ]\}}||vrq|| }t|dt}||| || q|S )NFremove_duplicateweight_loader)dictnamed_parameterssetrp   r   add)r*   r   params_dictloaded_paramsnameloaded_weightparamr   r-   r-   r.   load_weightsK  s   
z-VariableResolutionResamplerModel.load_weights)r   )r5   r6   r7   r;   r(   r   r4   r   tupler<   r=   r   r   r>   r-   r-   r+   r.   rd      s    U,Ord   c                       s@   e Zd Zddededdf fddZdedejfd	d
Z  Z	S )Ernie4_5_VisionRotaryEmbedding     @r@   thetar0   Nc                    s0   t    d|tjd|dtjd|   | _d S )Ng      ?r   r|   )startendstepra   )r'   r(   r<   r   float32inv_freq)r*   r@   r   r+   r-   r.   r(   [  s   
z'Ernie4_5_VisionRotaryEmbedding.__init__seqlenc                 C   s,   t j|| jj| jjd}t j|| jd}|S )Nr   ra   )inputvec2)r<   r   r   r   ra   outer)r*   r   seqfreqsr-   r-   r.   r4   a  s
   z&Ernie4_5_VisionRotaryEmbedding.forward)r   )
r5   r6   r7   r8   rU   r(   r<   r=   r4   r>   r-   r-   r+   r.   r   Y  s    r   c                       s   e Zd Z			ddededee deddf
 fd	d
Ze	de
jfddZe	de
jfddZde
jde
jfddZde
jde
jde
jfddZ  ZS )Ernie4_5_VisionTransformerrD   Nr   vision_confignorm_epsr"   r#   r0   c                    s   t    |j}|j}|j}|j}|j |j}	|j|j	|| _t
|| d| _ttj|d  }
t|
d | _t fddt|	D | _tj|dd| _d S )N)rZ   r[   rH   rE   r|   c                    s,   g | ]}t  td | dqS )zblocks.)r@   rA   rB   rC   r"   r#   )r?   r   ).0irH   rB   rC   rA   r#   r"   r-   r.   
<listcomp>  s    	z7Ernie4_5_VisionTransformer.__init__.<locals>.<listcomp>rD   )r'   r(   rZ   spatial_merge_sizer[   rg   rH   depthrA   rB   rV   patch_embedr   r9   rM   r   rotary_pos_emb
ModuleListr   blocksln)r*   r   r   r"   r#   rZ   r   r[   rg   r   head_dimr+   r   r.   r(   k  s0   
	z#Ernie4_5_VisionTransformer.__init__c                 C   s   | j jjjS r1   )r   r_   r`   ra   r   r-   r-   r.   ra        z Ernie4_5_VisionTransformer.dtypec                 C   s   | j d jjjjS )Nr   )r   rL   r&   r`   r   r   r-   r-   r.   r     s   z!Ernie4_5_VisionTransformer.devicer   c                 C   s  g }t |dD ]c}||  \}}}t|dd|}t|d|d}||| j | j|| j | j	dddd
 }||| j | j|| j | j	dddd
 }|tj||gdd|d q	tj|dd}|d d dd f  }	| |	}
|
| 
d}|S )Nr   r   r{   r|   rX   r   )r   r   tolistr<   r   	unsqueezeexpandr~   r   permuteflattenr   stackrepeatcatmaxr   )r*   r   pos_idsr   thwhpos_idswpos_idsmax_grid_sizerotary_pos_emb_fullr   r-   r-   r.   rot_pos_emb  s:   "

z&Ernie4_5_VisionTransformer.rot_pos_embr/   c           	      C   s   |j | j| jd}| |}| |}tj||fdd}| | f}t	|d d df |d d df  |d d df j
dtjd}t|d|g}|d}| jD ]	}||||d}qV| |}|jd	krp|jdd}|S )
Nr   r{   r   r   r|   r   )r@   ra   rS   rX   )rb   r   ra   r   r   r<   r   cossinrepeat_interleaver   int32	new_zerosr   r   r   ndimsqueeze)	r*   r/   r   r   embrR   rQ   blkfinal_outputr-   r-   r.   r4     s$   

,



z"Ernie4_5_VisionTransformer.forward)rD   Nr   )r5   r6   r7   r
   rU   r   r   r;   r(   propertyr<   ra   r   r=   r   r4   r>   r-   r-   r+   r.   r   i  s6    -!r   c                	       s"  e Zd Zg dZddddddZ			d-d
edee deddf fddZ	de
e defddZdejdejdejfddZde
e dejfddZde
e dejfddZdejdeddfddZd d! Zd"edefd#d$Z	%d.dejd&ejded'efd(d)Zd*eeeejf  fd+d,Z  ZS )/&Ernie4_5_VLMoeForConditionalGeneration)z.gate_proj.z.down_proj.z	.up_proj.z.q_proj.z.k_proj.z.v_proj.z.o_proj.)qkv_projr   )r   r   )r   r|   )gate_up_projr   )r   r   )q_projk_projv_proj	gate_projup_projNr   rj   r"   r#   r0   c              	      s.  t    || _t|jt|dd|td|d| _t||td|d| _	t
| jj| jj| jj| jj| jtd|d| _|jrD| j	j| _nt|j|j|td	|d
| _d| jjv | _t|| _t| jdd rdd | jjt| jdd t| jdd t| jdd t| jdd fD }tj|tjd| _d S d | _d S )Nrf   rD   vision_model)r   r"   r#   model)r#   resampler_model)rj   r#   lm_headr%   mrope_sectionim_patch_idc                 S   s   g | ]}|d ur|qS r1   r-   )r   token_idr-   r-   r.   r   #  s
    zCErnie4_5_VLMoeForConditionalGeneration.__init__.<locals>.<listcomp>image_start_token_idimage_end_token_idvideo_start_token_idvideo_end_token_idr   )r'   r(   rj   r   r   rp   r   r  r   r  rd   pixel_hidden_sizerg   rk   rl   r  tie_word_embeddingsembed_tokensr  r   
vocab_sizerope_scalingis_mrope_enabledr   logits_processorr  r<   r   long_visual_token_ids_tensor_cache)r*   rj   r"   r#   visual_token_idsr+   r-   r.   r(     sT   

	

z/Ernie4_5_VLMoeForConditionalGeneration.__init__	input_ids	mm_inputsc                 C   s   t  }|||S r1   )r   pad_input_tokens)r*   r  r  patternr-   r-   r.   pad_input_ids4  s   z4Ernie4_5_VLMoeForConditionalGeneration.pad_input_idspixel_valuesr   c                 C   s   |d ur?||dk }|  d dkrtd|   d|dd}tjt|d d dd f |d d df dg ddd}| ||}|S )	Nr   rX   zgrid_thw has z6 elements after filtering,which is not divisible by 3.r{   r   )r   r   r   r   )value)numel
ValueErrorr~   Fpadr<   r   r  )r*   r  r   image_featuresr-   r-   r.   _vision_forward8  s   (z6Ernie4_5_VLMoeForConditionalGeneration._vision_forwarditemsc                 C      t jdd |D dd| jj}t jdd |D dd}| dks)J | | dks5J | | j||d}| ||}|S )Nc                 S      g | ]}|j qS r-   featurer   itemr-   r-   r.   r   P      zLErnie4_5_VLMoeForConditionalGeneration.get_image_feature.<locals>.<listcomp>r   r   c                 S   r%  r-   )image_grid_thwr(  r-   r-   r.   r   S  r*  r|   r   	r<   r   typer  ra   r   r@   r"  r  )r*   r#  r  r+  image_featureimage_embedsr-   r-   r.   get_image_featureN     z8Ernie4_5_VLMoeForConditionalGeneration.get_image_featurec                 C   r$  )Nc                 S   r%  r-   r&  r(  r-   r-   r.   r   \  r*  zLErnie4_5_VLMoeForConditionalGeneration.get_video_feature.<locals>.<listcomp>r   r   c                 S   r%  r-   )video_grid_thwr(  r-   r-   r.   r   _  r*  r|   r,  r-  )r*   r#  r  r3  video_featurevideo_embedsr-   r-   r.   get_video_featureZ  r2  z8Ernie4_5_VLMoeForConditionalGeneration.get_video_featureforward_batchc           	      C   s   | j du r
d| _dS | j j|j|jd}g }t|dr7|jdur7|jD ]}|du r*q#|jD ]}||j	 q-q#t
j||jd}t
j||gdd}t
||dd| _dS )	z@Set mask for visual tokens (image/video patches and delimiters).Nr   r  )r   r   r   r{   r   )r  visual_token_maskrb   r   ra   hasattrr  mm_itemsr   	pad_valuer<   	as_tensorr   isinr~   )	r*   r  r7  visual_token_ids_tensor
pad_valuesmm_inputr)  placeholder_tensorpad_visual_token_ids_tensorr-   r-   r.   _set_visual_token_maskf  s6   


z=Ernie4_5_VLMoeForConditionalGeneration._set_visual_token_maskc                 C   s   | j jS r1   )r  r  r   r-   r-   r.   get_input_embeddings  s   z;Ernie4_5_VLMoeForConditionalGeneration.get_input_embeddingsmodule_namec                 C   s   | d S )Nr  )
startswith)r*   rE  r-   r-   r.   should_apply_lora  r   z8Ernie4_5_VLMoeForConditionalGeneration.should_apply_loraF	positionsget_embeddingc                 C   s   | j r|j}|j s'| r'| j r'|jdkr|ddks'J d|  | || | |j	d ksCJ d|j	 d|j	 dt
||| j| || jd	}d
| _| ||| j|S )a
  Run forward pass for Ernie45-VL.

        Args:
            input_ids: Flattened (concatenated) input_ids corresponding to a
                batch.
            positions: Flattened (concatenated) position ids corresponding to a
                batch.
                **NOTE**: If mrope is enabled (default setting for Qwen2-VL
                opensource models), the shape will be `(3, seq_len)`,
                otherwise it will be `(seq_len,).
                (Use input_metadata.mrope_positions to replace it)
        r|   r   rX   zMmultimodal section rotary embedding requires (3, seq_len) positions, but got r{   z
input_ids z and position_ids z should have the same length)r  r7  language_modelmultimodal_modelrH  r8  N)r  mrope_positionsforward_mode	is_decodecontains_image_inputsr   r   rC  r  r}   r   r  r8  r  r  )r*   r  rH  r7  rI  rT   r-   r-   r.   r4     s8   	
z.Ernie4_5_VLMoeForConditionalGeneration.forwardr   c              	   C   s$  g d}ddddddd}t jd	d
dt| jjd}t| jdd}|D ]j\}}d|v r.q$| jjr7d|v r7q$|D ]2\}}	}
|	|vrCq9d|v rL||vrLq9||	|}|	dr\||vr\q9|| }|j
}||||
  n#d|v rv|dd}|dr|dd}| D ]\}}||v r|||d} nqd|v rt|dd }| jjd }||d k}|r|dd}n|d| d||  }|D ]i}|\}}	}}
|	|vrqt|dd }|| jjd d k}||	|}|r|dd}n|dd}|	ds
|	d r||vrq|| v r(|| }|j
}|||||
|d! n	td"| d#  n\|	d$rC|d%d&}|j}n|	d'rR|d(d)}|j}d*|v r]|d+d}|	dsi|	d ro||vroq$|| v r|| }t|d,t}||| q$td"| d# q$d S )-N))r   r   q)r   r   k)r   r   v)r   r   r   )r   r   r   zspatial_linear1.zspatial_linear2.zspatial_norm.ztemporal_linear1.ztemporal_linear2.ztemporal_norm.)zspatial_linear.0.zspatial_linear.2.zspatial_linear.3.ztemporal_linear.0.ztemporal_linear.2.ztemporal_linear.3.r   	down_projr   )ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namenum_expertsFr   zrotary_emb.inv_freqzlm_head.weightzmlp.experts.z.biasr  z	attn.qkv.zattn.qkv_proj.zmodel.resampler_modelr  r   zmlp.experts.r   z	.experts.z.text_experts.z.vision_experts._bias)shard_id	expert_idz
Parameter z not found in params_dictzmlp.gate.weightzgate.weightztext_experts_gate.weightzmlp.gate.weight_1zgate.weight_1zvision_experts_gate.weighte_score_correction_biasz.moe_statics.r   )r   make_expert_params_mappingr   rj   moe_num_expertsr   r   r  replaceendswithr   rF  r#  r8   splitkeysloggerwarningTrp   r   )r*   r   stacked_params_mappingresampler_weight_mappingexpert_params_mappingr   r   r   
param_nameweight_namer[  r   r   old_weight_namenew_weight_name
moe_offsetvision_expert_start_idxis_text_expertmappingr\  r-   r-   r.   r     s   	




z3Ernie4_5_VLMoeForConditionalGeneration.load_weights)Nr   r   )r5   r6   r7   #default_bitsandbytes_target_modules#bitsandbytes_stacked_params_mappingr
   r   r   r;   r(   r   r8   r   r  r<   r=   r"  r   r1  r6  r   rC  rD  boolrG  r4   r   r   r   r>   r-   r-   r+   r.   r     sb    <

	
$5r   )E__doc__logging	functoolsr   r   typingr   r   r   r   r   r   r   r<   torch.nnr9   torch.nn.functional
functionalr  einopsr	   transformersr
   sglang.srt.layers.activationr   "sglang.srt.layers.attention.visionr   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r   "sglang.srt.layers.logits_processorr   ,sglang.srt.layers.moe.fused_moe_triton.layerr   *sglang.srt.layers.quantization.base_configr   *sglang.srt.layers.vocab_parallel_embeddingr   sglang.srt.managers.mm_utilsr   r   "sglang.srt.managers.schedule_batchr   r   ,sglang.srt.model_executor.forward_batch_infor   $sglang.srt.model_loader.weight_utilsr    sglang.srt.models.ernie45_moe_vlr   sglang.srt.utilsr   &sglang.srt.utils.hf_transformers_utilsr   	getLoggerr5   rd  r:   r   r?   rV   rd   r   r   cached_get_processorr   
EntryClassr-   r-   r-   r.   <module>   sL   
 7 9x  
l