o
    پi.<                     @   s^  d Z ddlmZmZmZ ddlZddlZddlm	  m
Z ddlm	Z	 ddlmZ ddlmZmZmZ ddlmZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddl m!Z!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z( G dd de	j)Z*G dd de	j)Z+G dd de	j)Z,G dd de	j)Z-G dd de	j)Z.G dd de	j)Z/e/Z0dS )zInference-only Mixtral model.    )IterableOptionalTupleN)nn)MixtralConfig)get_tensor_model_parallel_rank$get_tensor_model_parallel_world_size tensor_model_parallel_all_reduce)RMSNorm)QKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)RadixAttention)get_rope)ParallelLMHeadVocabParallelEmbedding)ForwardBatch)default_weight_loader)
add_prefixc                       sV   e Zd Z		ddedededee deddf fd	d
Zdej	dej	fddZ
  ZS )
MixtralMLPN num_expertshidden_sizeintermediate_sizequant_configprefixreturnc                    s   t    || _|| _|| _t| j| jd|td|d| _t| j| jd|td|d| _t| j| jd|td|d| _	t
 | _d S )NFw1biasr   r   w2w3)super__init__r   ffn_dim
hidden_dimr   r   r   r"   r#   r   SiLUact_fn)selfr   r   r   r   r   	__class__ S/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/mixtral_quant.pyr%   4   s4   
	zMixtralMLP.__init__hidden_statesc                 C   s@   |  |\}}| |}| |\}}|| }| |\}}|S N)r   r)   r#   r"   )r*   r/   w1_out_w3_outcurrent_hidden_statesr-   r-   r.   forwardZ   s   
zMixtralMLP.forwardNr   )__name__
__module____qualname__intr   r   strr%   torchTensorr5   __classcell__r-   r-   r+   r.   r   3   s"    &r   c                       sJ   e Zd Z		ddedee def fddZdej	d	ej	fd
dZ
  ZS )
MixtralMoENr   configr   r   c                    s   t     _t _t _ j_ j	_
jjkr+tdj dj dttjjj  _jsGtdj dt fddtjD _t jjdd td	d
_d S )NzTensor parallel size z' is greater than the number of experts .zRank z has no experts assigned to it.c              
      s>   g | ]}|j v rtj j jtd | dndqS )zexperts.r   r   N)expert_indiciesr   num_total_expertsr   r   r   ).0idxr@   r   r   r*   r-   r.   
<listcomp>}   s    
z'MixtralMoE.__init__.<locals>.<listcomp>Fgater    )r$   r%   r@   r   rankr   tp_sizenum_local_expertsrD   num_experts_per_toktop_k
ValueErrornparray_splitrangetolistrC   r   
ModuleListexpertsr   r   r   rI   r*   r@   r   r   r+   rG   r.   r%   d   sB   

zMixtralMoE.__init__r/   r   c                 C   s   |  |\}}tj|dtjd}tj|| jdd\}}||jddd }d }| jD ]'}| j	| }||k}	||	 jddd}
||
|
}|d u rK|}q)|| q)t|S )N   )dimdtyperX   T)rX   keepdim)rI   Fsoftmaxr<   floattopkrN   sumrC   rU   mul_add_r	   )r*   r/   router_logitsr2   routing_weightsselected_expertsfinal_hidden_states
expert_idxexpert_layerexpert_maskexpert_weightsr4   r-   r-   r.   r5      s    


zMixtralMoE.forwardr6   )r7   r8   r9   r   r   r   r;   r%   r<   r=   r5   r>   r-   r-   r+   r.   r?   c   s    0r?   c                       sr   e Zd Z					ddededed	ed
ededee deddf fddZde	j
de	j
dede	j
fddZ  ZS )MixtralAttentionr      '  Nr   r   	num_headsnum_kv_headslayer_idmax_position
rope_thetar   r   r   c	           
   
      sJ  t    || _t }	|| _| j|	 dksJ | j|	 | _|| _| j|	kr/| j|	 dks.J n	|	| j dks8J td| j|	 | _|| j | _	| j| j	 | _
| j| j	 | _| j	d | _|| _t|| j	| j| jd|td|d| _t| j| j	 |d|td|d| _t| j	| j	|t| jdd	| _t| j| j	| j| j||td
|d| _d S )Nr   rW   g      Fqkv_projr    o_projT)
rotary_dimrr   baseis_neox_styleattn)rp   rq   r   r   )r$   r%   r   r   total_num_headsro   total_num_kv_headsmaxrp   head_dimq_sizekv_sizescalingrs   r   r   rt   r   ru   r   r:   
rotary_embr   ry   )
r*   r   ro   rp   rq   rr   rs   r   r   rK   r+   r-   r.   r%      s`   

	
zMixtralAttention.__init__	positionsr/   forward_batchc                 C   sb   |  |\}}|j| j| j| jgdd\}}}| |||\}}| ||||}	| |	\}
}|
S )NrZ   r[   )rt   splitr~   r   r   ry   ru   )r*   r   r/   r   qkvr2   qkvattn_outputoutputr-   r-   r.   r5      s    zMixtralAttention.forward)r   rm   rn   Nr   )r7   r8   r9   r:   r_   r   r   r;   r%   r<   r=   r   r5   r>   r-   r-   r+   r.   rl      sD    	
Brl   c                       sh   e Zd Z			ddededee deddf
 fd	d
Zde	j
de	j
dedee	j
 de	j
f
ddZ  ZS )MixtralDecoderLayerr   Nr   r@   rq   r   r   r   c                    s   t    |j| _t|dd}t| j|j|j|j|||td|d| _	t
||td|d| _t|j|jd| _t|j|jd| _d S )Nrs   rn   	self_attn)r   ro   rr   rp   rq   rs   r   r   block_sparse_moe)r@   r   r   eps)r$   r%   r   getattrrl   num_attention_headsmax_position_embeddingsnum_key_value_headsr   r   r?   r   r
   rms_norm_epsinput_layernormpost_attention_layernorm)r*   r@   rq   r   r   rs   r+   r-   r.   r%      s,   

zMixtralDecoderLayer.__init__r   r/   r   residualc                 C   sZ   |d u r|}|  |}n|  ||\}}| j|||d}| ||\}}| |}||fS )N)r   r/   r   )r   r   r   r   )r*   r   r/   r   r   r-   r-   r.   r5     s   
zMixtralDecoderLayer.forward)r   Nr   )r7   r8   r9   r   r:   r   r   r;   r%   r<   r=   r   r5   r>   r-   r-   r+   r.   r      s4    r   c                       sb   e Zd Z		ddedee deddf fddZ	dd	ej	d
ej	de
dej	dej	f
ddZ  ZS )MixtralModelNr   r@   r   r   r   c                    sn   t     j| _ j| _t j jtdd| _t	
 fddt jD | _t j jd| _d S )Nembed_tokensr   c              	      s(   g | ]}t  |td | dqS )zlayers.rB   )r   r   )rE   ir@   r   r   r-   r.   rH   G  s    z)MixtralModel.__init__.<locals>.<listcomp>r   )r$   r%   pad_token_idpadding_idx
vocab_sizer   r   r   r   r   rT   rR   num_hidden_layerslayersr
   r   normrV   r+   r   r.   r%   7  s   
zMixtralModel.__init__	input_idsr   r   input_embedsc           
      C   s`   |d u r
|  |}n|}d }tt| jD ]}| j| }|||||\}}q| ||\}}	|S r0   )r   rR   lenr   r   )
r*   r   r   r   r   r/   r   r   layerr2   r-   r-   r.   r5   S  s   

zMixtralModel.forwardr6   r0   )r7   r8   r9   r   r   r   r;   r%   r<   r=   r   r5   r>   r-   r-   r+   r.   r   6  s0    !r   c                       s   e Zd Z		ddedee deddf fddZe	 	dd	ej
d
ej
dedej
dej
f
ddZdeeeej
f  fddZ  ZS )QuantMixtralForCausalLMNr   r@   r   r   r   c                    sT   t    || _|| _t||td|d| _t|j|j	td|d| _
t|| _d S )NmodelrB   lm_headr   )r$   r%   r@   r   r   r   r   r   r   r   r   r   logits_processorrV   r+   r-   r.   r%   i  s   
z QuantMixtralForCausalLM.__init__r   r   r   r   c                 C   s"   |  ||||}| ||| j|S r0   )r   r   r   )r*   r   r   r   r   r/   r-   r-   r.   r5   z  s   
zQuantMixtralForCausalLM.forwardweightsc                 C   s   g d}t |  }|D ]`\}}d|v rq|D ]-\}}}||vr!q|||}|dr1||vr1q||vr6q|| }	|	j}
|
|	||  n'|drO||vrOqd|v rX||vrXq||vr]q|| }	t|	dt}
|
|	| qd S )N))rt   q_projr   )rt   k_projr   )rt   v_projr   zrotary_emb.inv_freqz.biaszblock_sparse_moe.experts.weight_loader)dictnamed_parametersreplaceendswithr   r   r   )r*   r   stacked_params_mappingparams_dictnameloaded_weight
param_nameweight_nameshard_idparamr   r-   r-   r.   load_weights  s8   
z$QuantMixtralForCausalLM.load_weightsr6   r0   )r7   r8   r9   r   r   r   r;   r%   r<   no_gradr=   r   r5   r   r   r   r>   r-   r-   r+   r.   r   h  s4    $r   )1__doc__typingr   r   r   numpyrP   r<   torch.nn.functionalr   
functionalr]   transformersr   sglang.srt.distributedr   r   r	   sglang.srt.layers.layernormr
   sglang.srt.layers.linearr   r   r   "sglang.srt.layers.logits_processorr   *sglang.srt.layers.quantization.base_configr   !sglang.srt.layers.radix_attentionr   "sglang.srt.layers.rotary_embeddingr   *sglang.srt.layers.vocab_parallel_embeddingr   r   ,sglang.srt.model_executor.forward_batch_infor   $sglang.srt.model_loader.weight_utilsr   sglang.srt.utilsr   Moduler   r?   rl   r   r   r   
EntryClassr-   r-   r-   r.   <module>   s2   0IQ92F