o
    پi4D                     @   s  d Z ddlmZmZmZmZmZ ddlZddlmZ ddl	m
Z
 ddlmZmZmZ ddlmZ ddlmZ dd	lmZmZmZmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddl m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z. G dd dej/Z0G dd dej/Z1G dd dej/Z2G dd dej/Z3G dd dej/Z4G dd  d ej/Z5e5Z6dS )!zInference-only Deepseek model.    )AnyDictIterableOptionalTupleN)nn)PretrainedConfig)get_tensor_model_parallel_rank$get_tensor_model_parallel_world_size tensor_model_parallel_all_reduce)
SiluAndMul)RMSNorm)MergedColumnParallelLinearQKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)	fused_moe)MoeRunnerConfig)TopK)QuantizationConfig)RadixAttention)get_rope)ParallelLMHeadVocabParallelEmbedding)ForwardBatch)default_weight_loader)
add_prefixc                       sN   e Zd Z			ddedededee ded	ed
df fddZdd Z	  Z
S )DeepseekMLPNT hidden_sizeintermediate_size
hidden_actquant_configreduce_resultsprefixreturnc              	      sj   t    t||gd d|td|d| _t||d||td|d| _|dkr/td| d	t | _	d S )
N   Fgate_up_projbiasr#   r%   	down_proj)r*   r#   r$   r%   siluzUnsupported activation: z!. Only silu is supported for now.)
super__init__r   r   r(   r   r+   
ValueErrorr   act_fn)selfr    r!   r"   r#   r$   r%   	__class__ N/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/deepseek.pyr.   8   s*   
	
zDeepseekMLP.__init__c                 C   s*   |  |\}}| |}| |\}}|S N)r(   r0   r+   )r1   xgate_up_r4   r4   r5   forwardX   s   
zDeepseekMLP.forward)NTr   )__name__
__module____qualname__intstrr   r   boolr.   r:   __classcell__r4   r4   r2   r5   r   6   s(     r   c                       sR   e Zd Z		ddedee def fddZdd	 Zd
e	j
de	j
fddZ  ZS )DeepseekMoENr   configr#   r%   c              	      s   t     | _t | _t | _ j| _ j| _	| j| jkr+t
d| j d| j dt| j	 jd| _t fddt| jD | _|   t j| jdd tdd	| _ jd urv j j }t j| jdtd
d| _d S d S )NzTensor parallel size z' is greater than the number of experts .)top_krenormalizec                    s2   g | ]}t  j j jd t| ddqS )Fz.expertsr    r!   r"   r#   r$   r%   )r   r    moe_intermediate_sizer"   r   ).0idxrC   r%   r#   r4   r5   
<listcomp>w   s    	z(DeepseekMoE.__init__.<locals>.<listcomp>Fgater)   shared_expertsrG   )r-   r.   rC   r	   rankr
   tp_sizen_routed_expertsnum_experts_per_tokrE   r/   r   norm_topk_probtopkr   
ModuleListrangeexpertspack_paramsr   r    r   rM   n_shared_expertsrH   r   r"   rN   )r1   rC   r#   r%   r!   r2   rK   r5   r.   a   sP   

	
zDeepseekMoE.__init__c                 C   s   g }g }| j D ]}||jj ||jj qtj|| _tj	| j|}t
||D ]\}}||_q,| jjt|g|d jR  | _tj|| _tj	| j|}t
||D ]\}}||_qX| jjt|g|d jR  | _d S )Nr   )rW   appendr(   weightr+   torch_utils_flatten_dense_tensorsw1_unflatten_dense_tensorszipdataviewlenshapew2)r1   r_   rf   expertw1srb   paramw2sr4   r4   r5   rX      s   
 $zDeepseekMoE.pack_paramshidden_statesr&   c           	      C   s   |j \}}|d|}| jjd ur| |}| |\}}| ||}tj|| j| j	|t
ddd}| jjd ur<|| }t|}|||S )NT)inplace)r_   rf   topk_outputmoe_runner_config)re   rc   rC   rY   rN   rM   rT   r   r_   rf   r   r   )	r1   rk   
num_tokens
hidden_dimshared_outputrouter_logitsr9   rn   final_hidden_statesr4   r4   r5   r:      s"   

zDeepseekMoE.forwardNr   )r;   r<   r=   r   r   r   r?   r.   rX   r\   Tensorr:   rA   r4   r4   r2   r5   rB   _   s    7rB   c                       s   e Zd Z						ddededed	ed
edeeeef  dedee	 deddf fddZ
dejdejdedejfddZ  ZS )DeepseekAttentionr   '  N    r   r    	num_headsnum_kv_headslayer_id
rope_thetarope_scalingmax_position_embeddingsr#   r%   r&   c
              
      sJ  t    || _t }
|| _| j|
 dksJ | j|
 | _|| _| j|
kr/| j|
 dks.J n	|
| j dks8J td| j|
 | _|| j | _	| j| j	 | _
| j| j	 | _| j	d | _|| _|| _t|| j	| j| jd|td|	d| _t| j| j	 |d|td|	d| _t| j	| j	|||d| _t| j| j	| j| j||td	|	d
| _d S )Nr      g      Fqkv_projr)   o_proj)
rotary_dimmax_positionbaser~   attn)r{   r|   r#   r%   )r-   r.   r    r
   total_num_headsrz   total_num_kv_headsmaxr{   head_dimq_sizekv_sizescalingr}   r   r   r   r   r   r   r   
rotary_embr   r   )r1   r    rz   r{   r|   r}   r~   r   r#   r%   rP   r2   r4   r5   r.      sb   



zDeepseekAttention.__init__	positionsrk   forward_batchc                 C   sb   |  |\}}|j| j| j| jgdd\}}}| |||\}}| ||||}	| |	\}
}|
S )Nrl   )dim)r   splitr   r   r   r   r   )r1   r   rk   r   qkvr9   qkvattn_outputoutputr4   r4   r5   r:   
  s    zDeepseekAttention.forward)r   rx   Nry   Nr   )r;   r<   r=   r>   floatr   r   r?   r   r   r.   r\   rv   r   r:   rA   r4   r4   r2   r5   rw      sJ    	
Frw   c                       sf   e Zd Z		ddededee deddf
 fdd	Zd
e	j
de	j
dedee	j
 de	j
f
ddZ  ZS )DeepseekDecoderLayerNr   rC   r|   r#   r%   r&   c                    s   t    |j| _t|dd}t|dd }t|dd}t| j|j|j|||||td|d	| _|j	d urL||j
krL||j dkrLt||td	|d
| _nt|j|j|j|td	|d| _t|j|jd| _t|j|jd| _d S )Nr}   rx   r~   r   ry   	self_attn)	r    rz   r{   r|   r}   r~   r   r#   r%   r   mlp)rC   r#   r%   )r    r!   r"   r#   r%   eps)r-   r.   r    getattrrw   num_attention_headsnum_key_value_headsr   r   rQ   first_k_dense_replacemoe_layer_freqrB   r   r   r!   r"   r   rms_norm_epsinput_layernormpost_attention_layernorm)r1   rC   r|   r#   r%   r}   r~   r   r2   r4   r5   r.     sF   



zDeepseekDecoderLayer.__init__r   rk   r   residualc                 C   sZ   |d u r|}|  |}n|  ||\}}| j|||d}| ||\}}| |}||fS )N)r   rk   r   )r   r   r   r   )r1   r   rk   r   r   r4   r4   r5   r:   H  s   
zDeepseekDecoderLayer.forwardru   )r;   r<   r=   r   r>   r   r   r?   r.   r\   rv   r   r:   rA   r4   r4   r2   r5   r     s2    .r   c                       sf   e Zd ZdZ		ddedee deddf fdd	Z	dd
e	j
de	j
dede	j
de	j
f
ddZ  ZS )DeepseekModelFNr   rC   r#   r%   r&   c                    sd   t     j| _ j| _t j j| _t	 fddt
 jD | _t j jd| _d S )Nc              	      s(   g | ]}t  |td | dqS )zlayers.r#   r%   )r   r   )rI   r|   rK   r4   r5   rL   t  s    z*DeepseekModel.__init__.<locals>.<listcomp>r   )r-   r.   pad_token_idpadding_idx
vocab_sizer   r    embed_tokensr   rU   rV   num_hidden_layerslayersr   r   normr1   rC   r#   r%   r2   rK   r5   r.   e  s   
zDeepseekModel.__init__	input_idsr   r   input_embedsc           
      C   s`   |d u r
|  |}n|}d }tt| jD ]}| j| }|||||\}}q| ||\}}	|S r6   )r   rV   rd   r   r   )
r1   r   r   r   r   rk   r   ilayerr9   r4   r4   r5   r:     s   

zDeepseekModel.forwardru   r6   )r;   r<   r=   fall_back_to_pt_during_loadr   r   r   r?   r.   r\   rv   r   r:   rA   r4   r4   r2   r5   r   a  s2     r   c                       s   e Zd Z		ddedee deddf fddZdej	fd	d
Z
e 	ddejdejdedejdejf
ddZdeeeejf  fddZ  ZS )DeepseekForCausalLMNr   rC   r#   r%   r&   c                    sV   t    || _|| _t||td|d| _t|j|j	|td|d| _
t|| _d S )Nmodel)r%   lm_headr   )r-   r.   rC   r#   r   r   r   r   r   r    r   r   logits_processorr   r2   r4   r5   r.     s   
zDeepseekForCausalLM.__init__c                 C   s   | j jS r6   )r   r   )r1   r4   r4   r5   get_input_embeddings  s   z(DeepseekForCausalLM.get_input_embeddingsr   r   r   r   c                 C   s"   |  ||||}| ||| j|S r6   )r   r   r   )r1   r   r   r   r   rk   r4   r4   r5   r:     s   
zDeepseekForCausalLM.forwardweightsc                 C   s   g d}t |  }|D ]g\}}d|v rq|D ]5\}}}||vr!q|||}|dr1||vr1qd|v s9d|v r>||vr>q|| }	|	j}
|
|	||  n&|drW||vrWqd|v s_d|v rd||vrdq|| }	t|	dt}
|
|	| qd S )N))r   q_projr   )r   k_projr   )r   v_projr   )r(   	gate_projr   )r(   up_projr   zrotary_emb.inv_freqz.biaszmlp.experts.zmlp.shared_experts.weight_loader)dictnamed_parametersreplaceendswithr   r   r   )r1   r   stacked_params_mappingparams_dictnameloaded_weight
param_nameweight_nameshard_idri   r   r4   r4   r5   load_weights  s8   	
z DeepseekForCausalLM.load_weightsru   r6   )r;   r<   r=   r   r   r   r?   r.   r   	Embeddingr   r\   no_gradrv   r   r:   r   r   r   rA   r4   r4   r2   r5   r     s6    $r   )7__doc__typingr   r   r   r   r   r\   r   transformersr   sglang.srt.distributedr	   r
   r   sglang.srt.layers.activationr   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r   r   r   "sglang.srt.layers.logits_processorr   &sglang.srt.layers.moe.fused_moe_tritonr    sglang.srt.layers.moe.moe_runnerr   sglang.srt.layers.moe.topkr   *sglang.srt.layers.quantization.base_configr   !sglang.srt.layers.radix_attentionr   "sglang.srt.layers.rotary_embeddingr   *sglang.srt.layers.vocab_parallel_embeddingr   r   ,sglang.srt.model_executor.forward_batch_infor   $sglang.srt.model_loader.weight_utilsr   sglang.srt.utilsr   Moduler   rB   rw   r   r   r   
EntryClassr4   r4   r4   r5   <module>   s6   )cVI6R