o
    پiuB                     @   sx  d Z ddlZddlmZmZmZmZ ddlZddlmZ ddl	m
Z
 ddlmZmZmZ ddlmZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlm Z  ddl!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z(m)Z) ddl*m+Z+ ddl,m-Z-m.Z. e/e0Z1G dd dej2Z3G dd dej2Z4G dd dej2Z5G dd dej2Z6G dd dej2Z7e7Z8dS )zInference-only Mixtral model.    N)IterableOptionalTupleUnion)nn)MixtralConfig)get_pp_group$get_tensor_model_parallel_world_size tensor_model_parallel_all_reduce)RMSNorm)QKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)FusedMoE)TopK)QuantizationConfig)RadixAttention)get_rope)PPMissingLayerget_layer_id)ParallelLMHeadVocabParallelEmbedding)ForwardBatchPPProxyTensors)default_weight_loader)
add_prefixmake_layersc                       st   e Zd ZdZ				ddededededed	eej d
ee dee de	f fddZ
dejdejfddZ  ZS )
MixtralMoEa  A tensor-parallel MoE implementation for Mixtral that shards each expert
    across all ranks.

    Each expert's weights are sharded across all ranks and a fused MoE
    kernel is used for the forward pass, and finally we reduce the outputs
    across ranks.
    N num_expertstop_khidden_sizeintermediate_sizelayer_idparams_dtypequant_configtp_sizeprefixc
           
         sf   t    t | _|| _t||d|d td|	d| _t|dd| _	t
|||||||td|	d| _d S )NFgate)biasr%   r&   r(   T)r!   renormalizeexperts)r    r!   r$   r"   r#   r%   r&   r(   )super__init__r	   r'   r"   r   r   r)   r   topkr   r,   )
selfr    r!   r"   r#   r$   r%   r&   r'   r(   	__class__ M/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/mixtral.pyr.   @   s2   
	zMixtralMoE.__init__hidden_statesreturnc                 C   sV   |j }|d| j}| |\}}| ||}| ||}| jdkr&t|}||S )N   )shapeviewr"   r)   r/   r,   r'   r
   )r0   r5   
orig_shaperouter_logits_topk_outputfinal_hidden_statesr3   r3   r4   forwardj   s   

zMixtralMoE.forward)NNNr   )__name__
__module____qualname____doc__intr   torchdtyper   strr.   Tensorr@   __classcell__r3   r3   r1   r4   r   7   s4    	
*r   c                       sr   e Zd Z					ddededed	ed
ededee deddf fddZde	j
de	j
dede	j
fddZ  ZS )MixtralAttentionr      '  Nr   r"   	num_headsnum_kv_headsr$   max_position
rope_thetar&   r(   r6   c	           
   
      sJ  t    || _t }	|| _| j|	 dksJ | j|	 | _|| _| j|	kr/| j|	 dks.J n	|	| j dks8J td| j|	 | _|| j | _	| j| j	 | _
| j| j	 | _| j	d | _|| _t|| j	| j| jd|td|d| _t| j| j	 |d|td|d| _t| j	| j	|t| jdd	| _t| j| j	| j| j||td
|d| _d S )Nr   r8   g      Fqkv_proj)r*   r&   r(   o_projT)
rotary_dimrP   baseis_neox_styleattn)rO   r$   r&   r(   )r-   r.   r"   r	   total_num_headsrN   total_num_kv_headsmaxrO   head_dimq_sizekv_sizescalingrQ   r   r   rR   r   rS   r   rE   
rotary_embr   rW   )
r0   r"   rN   rO   r$   rP   rQ   r&   r(   r'   r1   r3   r4   r.   x   s`   

	
zMixtralAttention.__init__	positionsr5   forward_batchc                 C   sb   |  |\}}|j| j| j| jgdd\}}}| |||\}}| ||||}	| |	\}
}|
S )Nr7   )dim)rR   splitr\   r]   r_   rW   rS   )r0   r`   r5   ra   qkvr=   qkvattn_outputoutputr3   r3   r4   r@      s    zMixtralAttention.forward)r   rL   rM   Nr   )rA   rB   rC   rE   floatr   r   rH   r.   rF   rI   r   r@   rJ   r3   r3   r1   r4   rK   w   sD    	
BrK   c                       sh   e Zd Z			ddededee deddf
 fd	d
Zde	j
de	j
dedee	j
 de	j
f
ddZ  ZS )MixtralDecoderLayerr   Nr   configr$   r&   r(   r6   c                    s   t    |j| _t|dd}t| j|j|j|j|||td|d| _	t
|j|j|j|j||td|d| _t|j|jd| _t|j|jd| _d S )NrQ   rM   	self_attn)r"   rN   rP   rO   r$   rQ   r&   r(   block_sparse_moe)r    r!   r"   r#   r$   r&   r(   eps)r-   r.   r"   getattrrK   num_attention_headsmax_position_embeddingsnum_key_value_headsr   rm   r   num_local_expertsnum_experts_per_tokr#   rn   r   rms_norm_epsinput_layernormpost_attention_layernorm)r0   rl   r$   r&   r(   rQ   r1   r3   r4   r.      s4   

	zMixtralDecoderLayer.__init__r`   r5   ra   residualc                 C   sZ   |d u r|}|  |}n|  ||\}}| j|||d}| ||\}}| |}||fS )N)r`   r5   ra   )rx   rm   ry   rn   )r0   r`   r5   ra   rz   r3   r3   r4   r@      s   
zMixtralDecoderLayer.forward)r   Nr   )rA   rB   rC   r   rE   r   r   rH   r.   rF   rI   r   r@   rJ   r3   r3   r1   r4   rk      s4    #rk   c                       st   e Zd Z		ddedee deddf fddZ		dd	ej	d
ej	de
dej	dee deej	ef fddZ  ZS )MixtralModelNr   rl   r&   r(   r6   c                    s   t     j| _ j| _t | _| jjr#t j j	t
d|d| _nt | _t j fdd| jj| jjddd\| _| _| _| jjrPt j	 jd| _d S tdd	| _d S )
Nembed_tokensr(   c                    s   t  | |dS )N)rl   r&   r$   r(   )rk   )idxr(   rl   r&   r3   r4   <lambda>  s    z'MixtralModel.__init__.<locals>.<lambda>layersT)pp_rankpp_sizer(   return_tuplero   )r   )r-   r.   pad_token_idpadding_idx
vocab_sizer   pp_groupis_first_rankr   r"   r   r|   r   r   num_hidden_layersrank_in_group
world_sizer   start_layer	end_layeris_last_rankr   rw   normr0   rl   r&   r(   r1   r   r4   r.     s,   

zMixtralModel.__init__	input_idsr`   ra   input_embedspp_proxy_tensorsc                 C   s   | j jr|d u r| |}n|}d }n|d usJ |d }|d }t| j| jD ]}| j| }	|	||||\}}q(| j jsDt||dS | 	||\}}
|S )Nr5   rz   )r5   rz   )
r   r   r|   ranger   r   r   r   r   r   )r0   r   r`   ra   r   r   r5   rz   ilayerr=   r3   r3   r4   r@   *  s*   

zMixtralModel.forwardNr   NN)rA   rB   rC   r   r   r   rH   r.   rF   rI   r   r   r   r@   rJ   r3   r3   r1   r4   r{     s6    )r{   c                       s   e Zd Z		ddedee deddf fddZe	 		dd	ej
d
ej
dedej
dee dej
fddZedd Zedd Zdeeeej
f  fddZ  ZS )MixtralForCausalLMNr   rl   r&   r(   r6   c                    s\   t    t | _|| _|| _t||td|d| _t	|j
|jtd|d| _t|| _d S )Nmodel)r&   r(   lm_headr}   )r-   r.   r   r   rl   r&   r{   r   r   r   r   r"   r   r   logits_processorr   r1   r3   r4   r.   R  s   
zMixtralForCausalLM.__init__r   r`   ra   r   r   c                 C   s2   | j |||||d}| jjr| ||| j|S |S )N)r   )r   r   r   r   r   )r0   r   r`   ra   r   r   r5   r3   r3   r4   r@   d  s   	
zMixtralForCausalLM.forwardc                 C      | j jS N)r   r   r0   r3   r3   r4   r   |     zMixtralForCausalLM.start_layerc                 C   r   r   )r   r   r   r3   r3   r4   r     r   zMixtralForCausalLM.end_layerweightsc              	   C   s  g d}t jddd| jjd}t|  }|D ]\}}t|}|d ur6t| jdr6|| jj	k s5|| jj
kr6qd|v r;q|D ]-\}}	}
|	|vrGq=||	|}|dsW|d	r\||vr\q=|| }|j}||||
  ns|D ]3}|\}}	}}
|	|vrzqm||	|}|ds|d	r||vrqm|| }|j}|||||
|d
  n=|ds|d	r||vrq|dr||vrq|d u rq|| v r|| }t|dt}||| qtd| d qd S )N))rR   q_projre   )rR   k_projrf   )rR   v_projrg   w1w2w3)ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namer    r   zrotary_emb.inv_freqz.bias_bias)shard_id	expert_idz	.kv_scaleweight_loaderz
Parameter z not found in params_dict)r   make_expert_params_mappingrl   ru   dictnamed_parametersr   hasattrr   r   r   replaceendswithr   keysrq   r   loggerwarning)r0   r   stacked_params_mappingexpert_params_mappingparams_dictnameloaded_weightr$   
param_nameweight_namer   paramr   mappingr   r3   r3   r4   load_weights  s   	
zMixtralForCausalLM.load_weightsr   r   )rA   rB   rC   r   r   r   rH   r.   rF   no_gradrI   r   r   r@   propertyr   r   r   r   r   rJ   r3   r3   r1   r4   r   P  sB    

$r   )9rD   loggingtypingr   r   r   r   rF   r   transformersr   sglang.srt.distributedr   r	   r
   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r   r   "sglang.srt.layers.logits_processorr   &sglang.srt.layers.moe.fused_moe_tritonr   sglang.srt.layers.moe.topkr   *sglang.srt.layers.quantization.base_configr   !sglang.srt.layers.radix_attentionr   "sglang.srt.layers.rotary_embeddingr   sglang.srt.layers.utilsr   r   *sglang.srt.layers.vocab_parallel_embeddingr   r   ,sglang.srt.model_executor.forward_batch_infor   r   $sglang.srt.model_loader.weight_utilsr   sglang.srt.utilsr   r   	getLoggerrA   r   Moduler   rK   rk   r{   r   
EntryClassr3   r3   r3   r4   <module>   s8   
@Q=K 