o
    پiK                     @   sn  d Z ddlZddlmZmZmZmZmZ ddlZddlm	Z	 ddl
mZ ddlmZ ddlmZ ddlmZ dd	lmZmZmZmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlm Z m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z'm(Z( e( rddl)m*Z* G dd de	j+Z,ej-fddZ.G dd de	j+Z/G dd de	j+Z0G dd de	j+Z1G dd de	j+Z2e2Z3dS )zBInference-only MiniCPM3 model compatible with HuggingFace weights.    N)AnyDictIterableOptionalTuple)nn)PretrainedConfig)$get_tensor_model_parallel_world_size)
SiluAndMul)RMSNorm)ColumnParallelLinearMergedColumnParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)RadixAttention)get_rope)ParallelLMHeadVocabParallelEmbedding)ForwardBatch)default_weight_loader)
add_prefixis_cuda)bmm_fp8c                       sH   e Zd Z		ddedededee deddf fd	d
Zdd Z  Z	S )MiniCPM3MLPN hidden_sizeintermediate_size
hidden_actquant_configprefixreturnc                    sh   t    t||gd d|td|d| _t||d|td|d| _|dkr.td| dt | _	d S )	N   Fgate_up_projbiasr    r!   	down_projsiluzUnsupported activation: z!. Only silu is supported for now.)
super__init__r   r   r$   r   r'   
ValueErrorr
   act_fn)selfr   r   r   r    r!   	__class__ N/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/minicpm3.pyr*   1   s(   

zMiniCPM3MLP.__init__c                 C   s*   |  |\}}| |}| |\}}|S N)r$   r,   r'   )r-   xgate_up_r0   r0   r1   forwardO   s   
zMiniCPM3MLP.forwardNr   )
__name__
__module____qualname__intstrr   r   r*   r6   __classcell__r0   r0   r.   r1   r   0   s"    r   c                 C   sl   t |}|  \}}t | | jdd}|j| }| | j|j|jd}||	 |
  fS )Ng-q=)min)r>   max)torchfinfoaminmaxmaximumabsclampr?   r>   to
contiguousfloat
reciprocal)r3   dtyperA   min_valmax_valamaxscale	x_scl_satr0   r0   r1   input_to_float8V   s   

rP   c                       s   e Zd Z						ddedededed	ed
ededededeeee	f  dedee
 deddf fddZdejdejdedejfddZ  ZS )MiniCPM3AttentionMLA'  N    r   configr   	num_headsqk_nope_head_dimqk_rope_head_dim
v_head_dimq_lora_rankkv_lora_rank
rope_thetarope_scalingmax_position_embeddingsr    r!   r"   c                    s  t    || _|| _|| _|| _|| | _|| _|| _|| _	|| _
t }|| dks-J || | _| jd | _|	| _|| _| jd urmt| j| jd|td|d| _t| j|jd| _t|| j
| j d|td|d| _nt| j| j
| j d|td|d| _t| j| j	| j d|td	|d| _t| j	|jd| _t| j	| j
| j| j  d|td
|d| _t| j
| j | jd|td|d| _t||||	|
d| _t | j| j	| j | jd|| j	|td|d| _!d | _"d | _#d | _$d S )Nr   g      Fq_a_projr%   epsq_b_projq_projkv_a_proj_with_mqa	kv_b_projo_proj)
rotary_dimmax_positionbaser\      attn)num_kv_headslayer_idrX   r    r!   )%r)   r*   rl   r   rV   rW   qk_head_dimrX   rY   rZ   rU   r	   num_local_headsscalingr[   r]   r   r   r^   r   rms_norm_epsq_a_layernormr   ra   rb   rc   kv_a_layernormrd   r   re   r   
rotary_embr   rj   w_kcw_vcw_scale)r-   rT   r   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r    rl   r!   tp_sizer.   r0   r1   r*   a   s   










zMiniCPM3AttentionMLA.__init__	positionshidden_statesforward_batchc              
   C   s  |j d }||| j| j| j }| jd ur1| |d }| |}| |d 	d| j| j
}n| |d 	d| j| j
}|j| j| jgdd\}}| jjtjkrkt|ddtj\}	}
t|	| j|
| jtj}nt|dd| j}|dd|dd | jf< | |d }|dd | jf }| | d}|d}||dd | jf< |d| jd f }|j |j g}| ||d|j d |j d  |d|j d |j d  \}}|	|d |	|d }}||d| jd f< ||d| jd f< | ||||}|	d| j| j}| jjtjkr-t|ddtj\}}t|| j|| jtj}nt|dd| j}|dd dd}| !|\}}|S )Nr   dimri   .r#   )"shape	new_emptyrn   rZ   rW   rY   r^   rq   ra   viewrm   rb   splitrV   rt   rJ   r@   float8_e4m3fnrP   	transposer   rv   bfloat16bmmrc   rr   rG   	unsqueezers   reshaperj   ru   flattenre   )r-   rx   ry   rz   q_lenq_inputqq_nopeq_pe
q_nope_valq_nope_scale
q_nope_outlatent_cachev_inputk_inputk_peoriginal_shapesattn_outputattn_output_valattn_output_scaleattn_bmm_outputoutputr5   r0   r0   r1   r6      sj   




zMiniCPM3AttentionMLA.forward)rR   NrS   NNr   )r8   r9   r:   r   r;   rH   r   r   r<   r   r   r*   r@   Tensorr   r6   r=   r0   r0   r.   r1   rQ   _   sZ    	
irQ   c                       sp   e Zd Z		ddededee deddf
 fdd	Zd
e	j
de	j
dedee	j
 dee	j
e	j
f f
ddZ  ZS )MiniCPM3DecoderLayerNr   rT   rl   r    r!   r"   c                    s   t    || _|j| _t|dd}t|dd }t|dd}t|| j|j|j|j| j|j t	|dr5|j
nd |j|||||td|d| _t| j|j|j|td	|d
| _t|j|jd| _t|j|jd| _d S )Nr[   rR   r\   r]   rS   rY   	self_attn)rT   r   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r    rl   r!   mlp)r   r   r   r    r!   r_   )r)   r*   rT   r   getattrrQ   num_attention_headsrV   rW   hasattrrY   rZ   r   r   r   r   r   r   r   rp   input_layernormpost_attention_layernorm)r-   rT   rl   r    r!   r[   r\   r]   r.   r0   r1   r*     sB   

zMiniCPM3DecoderLayer.__init__rx   ry   rz   residualc                 C   sz   |}|  |}| j|||d}||| jjt| jj   }|}| |}| |}||| jjt| jj   }|d fS )N)rx   ry   rz   )	r   r   rT   scale_depthmathsqrtnum_hidden_layersr   r   )r-   rx   ry   rz   r   r0   r0   r1   r6   =  s"   


zMiniCPM3DecoderLayer.forwardr7   )r8   r9   r:   r   r;   r   r   r<   r*   r@   r   r   r   r6   r=   r0   r0   r.   r1   r     s2    ,r   c                       sb   e Zd Z		ddedee deddf fddZ	dd	ej	d
ej	de
dej	dej	f
ddZ  ZS )MiniCPM3ModelNr   rT   r    r!   r"   c                    sx   t     | _ j| _ j| _t| j j jtdd| _	t
 fddt jD | _t j jd| _d S )Nembed_tokensorg_num_embeddingsr!   c              	      s(   g | ]}t  |td | dqS )zlayers.r    r!   )r   r   ).0irT   r!   r    r0   r1   
<listcomp>m  s    z*MiniCPM3Model.__init__.<locals>.<listcomp>r_   )r)   r*   rT   pad_token_idpadding_idx
vocab_sizer   r   r   r   r   
ModuleListranger   layersr   rp   normr-   rT   r    r!   r.   r   r1   r*   \  s    
zMiniCPM3Model.__init__	input_idsrx   rz   input_embedsc           	      C   sb   |d u r|  || jj }n|}d }tt| jD ]}| j| }|||||\}}q| |}|S r2   )r   rT   	scale_embr   lenr   r   )	r-   r   rx   rz   r   ry   r   r   layerr0   r0   r1   r6   y  s   


zMiniCPM3Model.forwardr7   r2   )r8   r9   r:   r   r   r   r<   r*   r@   r   r   r6   r=   r0   r0   r.   r1   r   [  s0    "r   c                       s   e Zd Z		ddedee deddf fddZe	 	dd	ej
d
ej
dedej
dej
f
ddZdeeeej
f  fddZ  ZS )MiniCPM3ForCausalLMNr   rT   r    r!   r"   c                    s   t    || _t| jdd| _|| _t||td|d| _| jj	s1t
|j|j|jtd|d| _| jj| jj | _t|| _d S )Nnum_expertsr   modelr   lm_headr   )r)   r*   rT   r   r   r    r   r   r   tie_word_embeddingsr   r   r   r   dim_model_basescale_widthr   logits_processorr   r.   r0   r1   r*     s    
zMiniCPM3ForCausalLM.__init__r   rx   rz   r   c                 C   sV   |d ur
|| j j }| ||||}|| j }| j jr | jj}n| j}| ||||S r2   )rT   r   r   r   r   r   r   r   )r-   r   rx   rz   r   ry   r   r0   r0   r1   r6     s   

zMiniCPM3ForCausalLM.forwardweightsc                 C   s  ddg}dd t | jD }t|  }|D ]\}}d|v rqd|v s'd|v r(q| jjr1d|v r1q|D ](\}}}	||vr=q3|||}|d	rM||vrMq3|| }
|
j}||
||	  n<|D ] \}}}||vrhq^|||}|| }
|
j}||
|||d
  n|d	r||vrq|| }
t	|
dt
}||
| qt | jjD ]E}| jj| j}|jjdd|j|j fj|j|jgdd\}}|dd dd|_| dd|_t|jdr|jj|_|`qd S )N)r$   	gate_projr   )r$   up_projri   c              	   S   s:   g | ]}d D ]}|dv rdndd| d| d|fqqS ))w1w2w3)r   r   wsw2szexperts..z.weightr0   )r   	expert_idweight_namer0   r0   r1   r     s    z4MiniCPM3ForCausalLM.load_weights.<locals>.<listcomp>zrotary_emb.inv_freqzrotary_emb.cos_cachedzrotary_emb.sin_cachedzlm_head.weightz.bias)r   weight_loaderr   r{   ri   r|   r#   weight_scale)r   r   dictnamed_parametersrT   r   replaceendswithr   r   r   r   r   r   r   rd   weight	unflattenrV   rX   r   r   rG   rt   ru   r   r   rv   )r-   r   stacked_params_mappingexpert_params_mappingparams_dictnameloaded_weight
param_namer   shard_idparamr   r   rl   r   rt   ru   r0   r0   r1   load_weights  sn   



z MiniCPM3ForCausalLM.load_weightsr7   r2   )r8   r9   r:   r   r   r   r<   r*   r@   no_gradr   r   r6   r   r   r   r=   r0   r0   r.   r1   r     s4    $r   )4__doc__r   typingr   r   r   r   r   r@   r   transformersr   sglang.srt.distributedr	   sglang.srt.layers.activationr
   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r   r   r   "sglang.srt.layers.logits_processorr   *sglang.srt.layers.quantization.base_configr   !sglang.srt.layers.radix_attentionr   "sglang.srt.layers.rotary_embeddingr   *sglang.srt.layers.vocab_parallel_embeddingr   r   ,sglang.srt.model_executor.forward_batch_infor   $sglang.srt.model_loader.weight_utilsr   sglang.srt.utilsr   r   
sgl_kernelr   Moduler   r   rP   rQ   r   r   r   
EntryClassr0   r0   r0   r1   <module>   s8   &	 2K7u