o
    پiP                     @   sj  d Z ddlZddlmZ ddlmZmZmZmZm	Z	 ddl
Z
ddl
mZ ddlmZ ddlmZmZmZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZ ddlmZ ddlmZ ddl m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z)m*Z* ddl+m,Z- ddl.m/Z/m0Z0 e1e2Z3G dd dej4Z5G dd dej4Z6G dd dej4Z7G dd dej4Z8dS )zQInference-only Ernie4.5 VL model compatible with baidu/ERNIE-4.5-VL-*-PT weights.    N)islice)AnyDictOptionalTupleUnion)nn)PretrainedConfig)get_pp_group$get_tensor_model_parallel_world_size tensor_model_parallel_all_reduce)is_dp_attention_enabled)RMSNorm)QKVParallelLinearReplicatedLinearRowParallelLinear)get_moe_impl_class)TopK)QuantizationConfig)RadixAttention)Ernie4_5_VLRotaryEmbedding)PPMissingLayer)VocabParallelEmbedding)ForwardBatchPPProxyTensors)DeepseekV2MLP)
add_prefixmake_layersc                       s   e Zd Z									dd	ed
edededededeeee	f  de
dededee dede
ddf fddZdejdejdedejfddZ  ZS )Ernie4_5_VLMoeAttentionr   '  NT        Fconfighidden_size	num_headsnum_kv_headslayer_id
rope_thetarope_scalingrope_is_neox_stylefreq_allocationmax_position_embeddingsquant_configprefixbiasreturnc              
      s  t    || _t }|| _| j| dksJ | j| | _|| _| j|kr/| j| dks.J n	|| j dks8J td| j| | _t	|d| j| j | _
t	|dd}t|| j
 | _| j| j
 | _| j| j
 | _| j
d | _|| _|
| _t|| j
| j| j||td|d| _t| j| j
 |||td|d| _|	}| j
d	 |	 d	 }| j
d	 |	 d	 }t| j
| j
|
|d
t |||gd| _t| j| j
| j| j||td|d| _d S )Nr      head_dimpartial_rotary_factorg      qkv_proj)r/   r-   r.   o_proj   F)	head_size
rotary_dimr,   baseis_neox_styledtypemrope_sectionattn)r&   r'   r-   r.   )super__init__r$   r   total_num_headsr%   total_num_kv_headsmaxr&   getattrr2   intr8   q_sizekv_sizescalingr(   r,   r   r   r4   r   r5   r   torchget_default_dtype
rotary_embr   r=   )selfr#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   tp_sizer3   t_ropeh_ropew_rope	__class__ T/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/ernie45_moe_vl.pyr?   4   st   

	
		z Ernie4_5_VLMoeAttention.__init__	positionshidden_statesforward_batchc                 C   sb   |  |\}}|j| j| j| jgdd\}}}| |||\}}| ||||}	| |	\}
}|
S )N)dim)r4   splitrE   rF   rJ   r=   r5   )rK   rT   rU   rV   qkv_qkvattn_outputoutputrR   rR   rS   forward   s    zErnie4_5_VLMoeAttention.forward)	r   r   NTr    r!   Nr"   F)__name__
__module____qualname__r	   rD   floatr   r   strr   boolr   r?   rH   Tensorr   ra   __classcell__rR   rR   rP   rS   r   3   s`    	
Ur   c                	       sX   e Zd Z		ddededee def fddZd	e	j
d
e	j
dede	j
fddZ  ZS )Ernie4_5_VLMoeMoENr"   r#   r'   r-   r.   c              
      sB  t    || _t | _t|dd| _|j| _|j}t	|}| j|kr/t
d| j d| d|j}|d }|d }	|j}
t|d|jd |jd g}
|
d }|
d }|jd |jd ks_J ttjd|jd tjd	| _||ksuJ ||kr||krt|j|jd d
tj|td|d| _t|jdd
| jd d| _t||jd |j|j|jd | j|td|d| _|	|ksJ ||	kr||krt|j|jd d
tj|td|d| _t|jdd
| jd d| _t||jd |j|j|jd | j|td|d| _| jdkr|jd |j }t |j||j!|d
td|d| _"d S d S )Nmoe_num_shared_expertsr   zTensor parallel size z' is greater than the number of experts .r1   moe_layer_end_indexr6   r;   Ftext_experts_gate)r/   params_dtyper-   r.   T)top_krenormalizeuse_grouped_topkcorrection_biastext_experts)num_expertsrq   r$   intermediate_sizer'   r-   r.   vision_experts_gatevision_expertsshared_experts)r$   rw   
hidden_actr-   reduce_resultsr.   )#r>   r?   r'   r   rL   rC   rk   r$   moe_num_expertsrB   
ValueErrormoe_layer_start_indexrm   num_hidden_layersr   	ParameterrH   emptyfloat32e_score_correction_biasr   r   ro   r   moe_ktext_experts_topkr   moe_intermediate_sizeru   rx   vision_experts_topkry   Ernie4_5_VLMoeMLPr{   rz   )rK   r#   r'   r-   r.   r}   max_moe_num_expertsr   text_moe_layer_start_indexvision_moe_layer_start_indexrm   text_moe_layer_end_indexvision_moe_layer_end_indexrw   rP   rR   rS   r?      s   


	
	
zErnie4_5_VLMoeMoE.__init__rU   visual_token_maskkwargsr0   c                 K   s  | j dkr
| |nd }|j}|jd }|d|}tj }|d ur.|s.| }| }	nd}d}	|rN| 	|j
tjd\}
}| ||
}| j||d}ny|	r|d| j }| }t|}|| d| j}|| d| j}| |j
tjd\}}| ||}| j||d ||< | 	|j
tjd\}
}| ||
}| j||d ||< n| |j
tjd\}}| ||}| j||d}|d ur|| }| jdkrt|}||S )Nr   rW   Frn   )rU   topk_outputr1   )rk   rz   shapeviewrH   cudais_current_stream_capturingallanyrx   tor   r   ry   repeatr$   rg   
zeros_likereshapero   r   ru   flattenrL   r   )rK   rU   r   r   shared_output
orig_shape
hidden_dim	capturing
all_visual
any_visualvision_router_logitsr[   vision_topk_outputfinal_hidden_statestext_token_masktext_hidden_statesvision_hidden_statestext_router_logitstext_topk_outputr   rR   rR   rS   ra     s   







zErnie4_5_VLMoeMoE.forwardNr"   )rb   rc   rd   r	   rD   r   r   rf   r?   rH   rh   objectra   ri   rR   rR   rP   rS   rj      s*    trj   c                       sz   e Zd ZdZ		ddedee def fddZd	e	j
d
e	j
dedee	j
 de	j
dB dedee	j
e	j
f fddZ  ZS )Ernie4_5_VLMoeDecoderLayerzA single transformer layer.

    Transformer layer takes input with size [s, b, h] and returns an
    output of the same size.
    Nr"   r'   r-   r.   c                    sh  t    t|dd}t|dd }t|dd}t|dd}t|dd	}	t||j|j|j||||||j|td
||j	d| _
|j}
t|
}t|d|jd |jd g}t|}||ks\J |j}t|}t|dd}t|d|dk}|r|d | dkr||kr||krt|||td|d| _nt|j|j|j|td|d| _t|j|jd| _t|j|jd| _d S )Nr(   i  r)   r*   Fr+   r    r,   i   	self_attn)r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   rm   r1   moe_layer_intervaluse_moer   mlp)r#   r'   r-   r.   )r$   rw   r{   r-   r.   eps)r>   r?   rC   r   r$   num_attention_headsnum_key_value_headsr,   r   use_biasr   r   minr   rB   r}   rj   r   r   rw   r{   r   rms_norm_epsinput_layernormpost_attention_layernorm)rK   r#   r'   r-   r.   r(   r)   r*   r+   r,   r   min_moe_layer_start_indexrm   max_moe_layer_end_indexr}   r   r   r   rP   rR   rS   r?   k  sp   

z#Ernie4_5_VLMoeDecoderLayer.__init__rT   rU   rV   residualr   r   r0   c                 K   s   |d u r|}|  |}n|  ||\}}| j|||d}| ||\}}t| jtr8| j||fi |}||fS | |}||fS )N)rT   rU   rV   )r   r   r   
isinstancer   rj   )rK   rT   rU   rV   r   r   r   rR   rR   rS   ra     s   

z"Ernie4_5_VLMoeDecoderLayer.forwardr   )rb   rc   rd   __doc__rD   r   r   rf   r?   rH   rh   r   r   r   ra   ri   rR   rR   rP   rS   r   d  s4    
Gr   c                       s   e Zd Z		ddedee deddf fddZdej	fd	d
Z
e 			ddej	dej	dedej	dee dej	dB deej	ef fddZ  ZS )Ernie4_5_VLMoeModelNr"   r#   r-   r.   r0   c                    s   t     | _t | _| jjr!t j jt	  t
d|d| _nt | _t j fdd| jj| jjt
d|d\| _| _| _| jjrPt j jd| _d S tdd	| _d S )
Nembed_tokens)	enable_tpr.   c                    s   t |  |dS )N)r'   r#   r-   r.   )r   )idxr.   r#   r-   rR   rS   <lambda>  s    z.Ernie4_5_VLMoeModel.__init__.<locals>.<lambda>layers)pp_rankpp_sizer.   r   T)return_tuple)r>   r?   r#   r
   pp_groupis_first_rankr   
vocab_sizer$   r   r   r   r   r   r   rank_in_group
world_sizer   start_layer	end_layeris_last_rankr   r   norm)rK   r#   r-   r.   rP   r   rS   r?     s*   

zErnie4_5_VLMoeModel.__init__c                 C   s   | j S )N)r   )rK   rR   rR   rS   get_input_embeddings  s   z(Ernie4_5_VLMoeModel.get_input_embeddings	input_idsrT   rV   input_embedspp_proxy_tensorsr   c                 C   s   | j jr|d u r| |}n|}d }n|d usJ |d }|d }t| j| j| jD ]}	|	|||||\}}q*| j jsBt||dS |j	d dkr\|d u rT| 
|}|S | 
||\}}
|S )NrU   r   )rU   r   r   )r   r   r   r   r   r   r   r   r   r   r   )rK   r   rT   rV   r   r   r   rU   r   layerr[   rR   rR   rS   ra     s8   

zErnie4_5_VLMoeModel.forwardr   )NNN)rb   rc   rd   r	   r   r   rf   r?   rH   rh   r   no_gradr   r   r   ra   ri   rR   rR   rP   rS   r     s@    %r   )9r   logging	itertoolsr   typingr   r   r   r   r   rH   r   transformersr	   sglang.srt.distributedr
   r   r   sglang.srt.layers.dp_attentionr   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r   r   "sglang.srt.layers.moe.ep_moe.layerr   sglang.srt.layers.moe.topkr   *sglang.srt.layers.quantization.base_configr   !sglang.srt.layers.radix_attentionr   "sglang.srt.layers.rotary_embeddingr   sglang.srt.layers.utilsr   *sglang.srt.layers.vocab_parallel_embeddingr   ,sglang.srt.model_executor.forward_batch_infor   r   sglang.srt.models.deepseek_v2r   r   sglang.srt.utilsr   r   	getLoggerrb   loggerModuler   rj   r   r   rR   rR   rR   rS   <module>   s6   
d Nn