o
    پi>                     @   sR  d Z ddlmZmZmZmZmZ ddlZddlmZ ddl	m
Z
 ddlmZ ddlmZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlm Z m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z'm(Z(m)Z) G dd dej*Z+G dd dej*Z,G dd dej*Z-G dd dej*Z.G dd dej*Z/e/Z0dS )z?Inference-only OLMoE model compatible with HuggingFace weights.    )AnyDictIterableOptionalTupleN)nn)PretrainedConfig)$get_tensor_model_parallel_world_size)RMSNorm)QKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)FusedMoE)TopK)QuantizationConfig)RadixAttention)get_rope)ParallelLMHeadVocabParallelEmbedding)ForwardBatch)default_weight_loader)
add_prefixmake_layersprint_warning_oncec                       sv   e Zd ZdZ					ddedededed	eej d
ee dee dede	f fddZ
dejdejfddZ  ZS )OlmoeMoEa  A tensor-parallel MoE implementation for Olmoe that shards each expert
    across all ranks.

    Each expert's weights are sharded across all ranks and a fused MoE
    kernel is used for the forward pass, and finally we reduce the outputs
    across ranks.
    Nr    num_expertstop_khidden_sizeintermediate_sizeparams_dtypequant_configtp_sizelayer_idprefixc
           
   
      sZ   t    || _t||dd td|	d| _t|dd| _t|||d||td|	d| _	d S )NFgatebiasr"   r%   )r   renormalizeTexperts)r   r   r    reduce_resultsr"   r$   r%   )
super__init__r   r   r   r&   r   topkr   r*   )
selfr   r   r   r    r!   r"   r#   r$   r%   	__class__ K/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/olmoe.pyr-   9   s,   
zOlmoeMoE.__init__hidden_statesreturnc                 C   sD   |j }|d| j}| |\}}| ||}| ||}||S )N)shapeviewr   r&   r.   r*   )r/   r4   
orig_shaperouter_logits_topk_outputfinal_hidden_statesr2   r2   r3   forward`   s   
zOlmoeMoE.forward)NNNr   r   )__name__
__module____qualname____doc__intr   torchdtyper   strr-   Tensorr>   __classcell__r2   r2   r0   r3   r   0   s6    	
'r   c                       s   e Zd Z					ddedededed	ed
eeeef  dedee	 deddf fddZ
dejdejdedejfddZ  ZS )OlmoeAttention'  N   r   r$   r   	num_headsnum_kv_heads
rope_thetarope_scalingmax_position_embeddingsr"   r%   r5   c
              
      sh  t    || _t }
|| _| j|
 dksJ | j|
 | _|| _| j|
kr/| j|
 dks.J n	|
| j dks8J td| j|
 | _|| j | _	| j| j	 | _
| j| j	 | _| j	d | _|| _|| _t|| j	| j| jd|td|	d| _t|dd| _t|dd| _t| j| j	 |d|td	|	d| _t| j	| j	|||d
d| _t| j| j	| j|| j|td|	d| _d S )Nr      g      Fqkv_projr'   h㈵>epso_projT)
rotary_dimmax_positionbaserO   is_neox_styleattn)r$   rM   r"   r%   )r,   r-   r   r	   total_num_headsrL   total_num_kv_headsmaxrM   head_dimq_sizekv_sizescalingrN   rP   r   r   rR   r
   q_normk_normr   rV   r   
rotary_embr   r[   )r/   r$   r   rL   rM   rN   rO   rP   r"   r%   r#   r0   r2   r3   r-   m   sh   

	
zOlmoeAttention.__init__	positionsr4   forward_batchc                 C   s   |  |\}}|j| j| j| jgdd\}}}| | | | }}| |||\}}| ||||}	| 	|	\}
}|
S )Nr6   )dim)
rR   splitr`   ra   rc   
contiguousrd   re   r[   rV   )r/   rf   r4   rg   qkvr;   qkvattn_outputoutputr2   r2   r3   r>      s    zOlmoeAttention.forward)rJ   NrK   Nr   )r?   r@   rA   rC   floatr   r   rF   r   r   r-   rD   rG   r   r>   rH   r2   r2   r0   r3   rI   k   sH    	
HrI   c                       sh   e Zd Z			ddededee deddf
 fd	d
Zde	j
de	j
dedee	j
 de	j
f
ddZ  ZS )OlmoeDecoderLayerr   Nr   configr$   r"   r%   r5   c                    s   t    |j| _t|dd}t|dd }t|dd}t|| j|j|j||||td|d	| _t	|j
|j|j|j||td|d	| _t|jd
d| _t|jd
d| _d S )NrN   rJ   rO   rP   rK   	self_attn)r   rL   rM   rN   rO   rP   r"   r%   mlp)r   r   r   r    r$   r"   r%   rS   rT   )r,   r-   r   getattrrI   num_attention_headsnum_key_value_headsr   rt   r   r   num_experts_per_tokr    ru   r
   input_layernormpost_attention_layernorm)r/   rs   r$   r"   r%   rN   rO   rP   r0   r2   r3   r-      s6   
	zOlmoeDecoderLayer.__init__rf   r4   rg   residualc                 C   sZ   |d u r|}|  |}n|  ||\}}| j|||d}| ||\}}| |}||fS )N)rf   r4   rg   )rz   rt   r{   ru   )r/   rf   r4   rg   r|   r2   r2   r3   r>      s   
zOlmoeDecoderLayer.forward)r   Nr   )r?   r@   rA   r   rC   r   r   rF   r-   rD   rG   r   r>   rH   r2   r2   r0   r3   rr      s4    %rr   c                       sb   e Zd Z		ddedee deddf fddZ	dd	ej	d
ej	de
dej	dej	f
ddZ  ZS )
OlmoeModelNr   rs   r"   r%   r5   c                    sj   t     j| _ j| _t j jtd|d| _t	 j
 fddtd|d| _t jdd| _d S )Nembed_tokensr%   c                    s   t  | |dS )N)rs   r"   r$   r%   )rr   )idxr%   rs   r"   r2   r3   <lambda>  s    z%OlmoeModel.__init__.<locals>.<lambda>layersrS   rT   )r,   r-   pad_token_idpadding_idx
vocab_sizer   r   r   r~   r   num_hidden_layersr   r
   normr/   rs   r"   r%   r0   r   r3   r-     s   

zOlmoeModel.__init__	input_idsrf   rg   input_embedsc           
      C   s`   |d u r
|  |}n|}d }tt| jD ]}| j| }|||||\}}q| ||\}}	|S N)r~   rangelenr   r   )
r/   r   rf   rg   r   r4   r|   ilayerr;   r2   r2   r3   r>   "  s   

zOlmoeModel.forwardNr   r   )r?   r@   rA   r   r   r   rF   r-   rD   rG   r   r>   rH   r2   r2   r0   r3   r}     s0     r}   c                       s   e Zd ZdZ		ddedee deddf fdd	Z	dd
e	j
de	j
dede	j
de	j
f
ddZdeeee	j
f  fddZ  ZS )OlmoeForCausalLMFNr   rs   r"   r%   r5   c                    sV   t    || _|| _t||td|d| _t|j|j	|td|d| _
t|| _d S )Nmodelr   lm_head)r"   r%   )r,   r-   rs   r"   r}   r   r   r   r   r   r   r   logits_processorr   r0   r2   r3   r-   ;  s   
zOlmoeForCausalLM.__init__r   rf   rg   r   c                 C   s"   |  ||||}| ||| j|S r   )r   r   r   )r/   r   rf   rg   r   r4   r2   r2   r3   r>   O  s   
zOlmoeForCausalLM.forwardweightsc              	   C   sj  g d}t jddd| jjd}t|  }|D ]\}}d|v r q|D ]2\}}}	||vr,q"d|v r1q"|||}|drA||vrAq"||vrFq"|| }
|
j}||
||	  n]|D ]$}|\}}}}	||vrdqW|||}|| }
|
j}||
|||	|d	  n6|dr||vrq|d
r|dd}||vrt	d| d| d q|}|| }
t
|
dt}||
| qd S )N))rR   q_projrl   )rR   k_projrm   )rR   v_projrn   )gate_up_proj	gate_projr   )r   up_projrQ   r   	down_projr   )ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namer   zrotary_emb.inv_freqzmlp.expertsz.bias)shard_id	expert_idkv_scalez	.kv_scalez.attn.kv_scalez'Found kv scale in the checkpoint (e.g. z6), but not found the expected name in the model (e.g. z). kv-scale is not loaded.weight_loader)r   make_expert_params_mappingrs   r   dictnamed_parametersreplaceendswithr   r   rv   r   )r/   r   stacked_params_mappingexpert_params_mappingparams_dictnameloaded_weight
param_nameweight_namer   paramr   mappingr   remapped_kv_scale_namer2   r2   r3   load_weights[  s~   

zOlmoeForCausalLM.load_weightsr   r   )r?   r@   rA   fall_back_to_pt_during_loadr   r   r   rF   r-   rD   rG   r   r>   r   r   r   rH   r2   r2   r0   r3   r   7  s4    
$r   )1rB   typingr   r   r   r   r   rD   r   transformersr   sglang.srt.distributedr	   sglang.srt.layers.layernormr
   sglang.srt.layers.linearr   r   r   "sglang.srt.layers.logits_processorr   &sglang.srt.layers.moe.fused_moe_tritonr   sglang.srt.layers.moe.topkr   *sglang.srt.layers.quantization.base_configr   !sglang.srt.layers.radix_attentionr   "sglang.srt.layers.rotary_embeddingr   *sglang.srt.layers.vocab_parallel_embeddingr   r   ,sglang.srt.model_executor.forward_batch_infor   $sglang.srt.model_loader.weight_utilsr   sglang.srt.utilsr   r   r   Moduler   rI   rr   r}   r   
EntryClassr2   r2   r2   r3   <module>   s0   ;YA2~