o
    پi5                     @   sT  d Z ddlmZmZ ddlZddlmZ ddlmZ ddlm	Z	 ddl
mZ ddlmZmZmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddlmZ ddlmZ ddl m!Z!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z( G dd dej)Z*G dd dej)Z+G dd dej)Z,G dd dej)Z-G dd dej)Z.e.gZ/dS )z Inference-only GraniteMoe model.    )IterableOptionalN)nn)GraniteConfig)$get_tensor_model_parallel_world_size)RMSNorm)QKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessorLogitsProcessorOutput)FusedMoE)TopK)PoolerPoolingType)QuantizationConfig)RadixAttention)get_rope)ParallelLMHeadVocabParallelEmbedding)ForwardBatch)mixtral)
add_prefixc                       st   e Zd ZdZ				ddededededed	eej d
ee dee de	f fddZ
dejdejfddZ  ZS )GraniteMoeMoEa
  A tensor-parallel MoE implementation for GraniteMoe that shards each
    expert across all ranks.
    Each expert's weights are sharded across all ranks and a fused MoE
    kernel is used for the forward pass, and finally we reduce the outputs
    across ranks.
    N num_expertstop_khidden_sizeintermediate_sizelayer_idparams_dtypequant_configtp_sizeprefixc
           
         s`   t    || _t||d|d |	 dd| _t|dd| _t||||||d||	 dd	| _d S )NFz.gate)biasr    r!   r#   T)r   renormalizez.experts)	r   r   r   r   r   r    reduce_resultsr!   r#   )	super__init__r   r	   gater   topkr   experts)
selfr   r   r   r   r   r    r!   r"   r#   	__class__ P/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/granitemoe.pyr(   (   s2   
	zGraniteMoeMoE.__init__hidden_statesreturnc                 C   sD   |j }|d| j}| |\}}| ||}| ||}||S )N)shapeviewr   r)   r*   r+   )r,   r1   
orig_shaperouter_logits_topk_outputfinal_hidden_statesr/   r/   r0   forwardR   s   
zGraniteMoeMoE.forward)NNNr   )__name__
__module____qualname____doc__intr   torchdtyper   strr(   Tensorr;   __classcell__r/   r/   r-   r0   r       s4    	
*r   c                       s|   e Zd Z						ddededed	ed
ededee dee deddf fddZde	j
de	j
dede	j
fddZ  ZS )GraniteMoeAttention   r   '  Nr   r   	num_headsnum_kv_headsmax_positionr   
rope_thetar!   attention_multiplierr#   r2   c
              	      sV  t    || _t }
|| _| j|
 dksJ | j|
 | _|| _| j|
kr/| j|
 dks.J n	|
| j dks8J td| j|
 | _|| j | _	| j| j	 | _
| j| j	 | _|d ur[|n| j	d | _|| _t|| j	| j| jd||	 dd| _t| j| j	 |d||	 dd| _t| j	| j	|t| jdd	| _t| j| j	| j| j|||	 d
d| _d S )Nr      r3   Fz	.qkv_proj)r$   r!   r#   z.o_projT)
rotary_dimrK   baseis_neox_stylez.attn)rJ   r   r!   r#   )r'   r(   r   r   total_num_headsrI   total_num_kv_headsmaxrJ   head_dimq_sizekv_sizescalingrL   r   qkv_projr
   o_projr   r@   
rotary_embr   attn)r,   r   rI   rJ   rK   r   rL   r!   rM   r#   r"   r-   r/   r0   r(   ^   sf   

	
zGraniteMoeAttention.__init__	positionsr1   forward_batchc                 C   sb   |  |\}}|j| j| j| jgdd\}}}| |||\}}| ||||}	| |	\}
}|
S )Nr3   dim)rY   splitrV   rW   r[   r\   rZ   )r,   r]   r1   r^   qkvr8   qkvattn_outputoutputr/   r/   r0   r;      s    zGraniteMoeAttention.forward)rG   r   rH   NNr   )r<   r=   r>   r@   floatr   r   rC   r(   rA   rD   r   r;   rE   r/   r/   r-   r0   rF   \   sJ    	
GrF   c                       s^   e Zd Z			ddededee deddf
 fd	d
Zde	j
de	j
dede	j
fddZ  ZS )GraniteMoeDecoderLayerr   Nr   configr   r!   r#   r2   c                    s   t    |j| _t|dd}t| j|j|j|j|||| d|jd	| _	t
|j|j|j|j||| dd| _t|j|jd| _t|j|jd| _|j| _d S )NrL   rH   z
.self_attn)	r   rI   rK   rJ   rL   r   r!   r#   rM   z.block_sparse_moe)r   r   r   r   r   r!   r#   eps)r'   r(   r   getattrrF   num_attention_headsmax_position_embeddingsnum_key_value_headsrM   	self_attnr   num_local_expertsnum_experts_per_tokr   block_sparse_moer   rms_norm_epsinput_layernormpost_attention_layernormresidual_multiplier)r,   rj   r   r!   r#   rL   r-   r/   r0   r(      s8   

zGraniteMoeDecoderLayer.__init__r]   r1   r^   c                 C   sV   |}|  |}| j|||d}||| j  }|}| |}| |}||| j  }|S )N)r]   r1   r^   )rv   rq   rx   rw   rt   )r,   r]   r1   r^   residualr/   r/   r0   r;      s   


zGraniteMoeDecoderLayer.forward)r   Nr   )r<   r=   r>   r   r@   r   r   rC   r(   rA   rD   r   r;   rE   r/   r/   r-   r0   ri      s0    &ri   c                       sx   e Zd Z		ddedee def fddZdej	d	ej	fd
dZ
	ddej	dej	dedeej	 d	ej	f
ddZ  ZS )GraniteMoeModelNr   rj   r!   r#   c                    sb   t    t j j jd| _ j| _t fddt	 j
D | _t j jd| _d S )N)org_num_embeddingsc              	      s(   g | ]}t  |td | dqS )zlayers.r!   r#   )ri   r   ).0irj   r#   r!   r/   r0   
<listcomp>  s    z,GraniteMoeModel.__init__.<locals>.<listcomp>rk   )r'   r(   r   
vocab_sizer   embed_tokensembedding_multiplierr   
ModuleListrangenum_hidden_layerslayersr   ru   norm)r,   rj   r!   r#   r-   r   r0   r(      s   
zGraniteMoeModel.__init__	input_idsr2   c                 C   s
   |  |S N)r   )r,   r   r/   r/   r0   get_input_embeddings  s   
z$GraniteMoeModel.get_input_embeddingsr]   r^   inputs_embedsc                 C   sZ   |d ur|}n|  |}|| j9 }tt| jD ]}| j| }||||}q| |}|S r   )r   r   r   lenr   r   )r,   r   r]   r^   r   r1   r~   layerr/   r/   r0   r;     s   



zGraniteMoeModel.forwardNr   r   )r<   r=   r>   r   r   r   rC   r(   rA   rD   r   r   r;   rE   r/   r/   r-   r0   rz      s.    rz   c                       s   e Zd Z		ddedee def fddZe	 		dd	ej
d
ej
dedej
dedefddZdeeeej
f  dee fddZ  ZS )GraniteMoeForCausalLMNr   rj   r!   r#   c                    s   t    || _|| _t||td|d| _t|j|j	|td|d| _
|jr.| jjj| j
_t|dr9d|j }nd }t||d| _ttjdd| _d S )	Nmodelr|   lm_headlogits_scalingg      ?)logit_scaleT)pooling_type	normalize)r'   r(   rj   r!   rz   r   r   r   r   r   r   tie_word_embeddingsr   weighthasattrr   r   logits_processorr   r   LASTpooler)r,   rj   r!   r#   r   r-   r/   r0   r(   ,  s&   

zGraniteMoeForCausalLM.__init__Fr   r]   r^   input_embedsget_embeddingr2   c                 C   s6   |  ||||}|s| ||| j|}|S | ||S r   )r   r   r   r   )r,   r   r]   r^   r   r   r1   logits_processor_outputr/   r/   r0   r;   J  s   	
zGraniteMoeForCausalLM.forwardweightsc                 C   s2  i }|D ]\}}| drKt|dD ]5}|dd| d}|dd| d}|| jddd\}}	||vs;J ||vsAJ |||< |	||< qq| drst|dD ]}|dd| d	}
|| }|
|vsmJ |||
< qWq| d
r|d
d}||vsJ |||< q|||< qtj| |  d S )Nz%.block_sparse_moe.input_linear.weightr   z.block_sparse_moe.experts.z
.w1.weightz
.w3.weight   r_   z&.block_sparse_moe.output_linear.weightz
.w2.weightz%.block_sparse_moe.router.layer.weightz.block_sparse_moe.gate.weight)	endswithr   sizereplacechunkr   MixtralForCausalLMload_weightsitems)r,   r   new_weightsnpew1_namew3_namew1_paramw3_paramw2_namew2_param	gate_namer/   r/   r0   r   \  sJ   









z"GraniteMoeForCausalLM.load_weightsr   )NF)r<   r=   r>   r   r   r   rC   r(   rA   no_gradrD   r   boolr   r;   r   tuplesetr   rE   r/   r/   r-   r0   r   *  s6    ,r   )0r?   typingr   r   rA   r   transformersr   sglang.srt.distributedr   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r	   r
   "sglang.srt.layers.logits_processorr   r   &sglang.srt.layers.moe.fused_moe_tritonr   sglang.srt.layers.moe.topkr   sglang.srt.layers.poolerr   r   *sglang.srt.layers.quantization.base_configr   !sglang.srt.layers.radix_attentionr   "sglang.srt.layers.rotary_embeddingr   *sglang.srt.layers.vocab_parallel_embeddingr   r   ,sglang.srt.model_executor.forward_batch_infor   sglang.srt.modelsr   sglang.srt.utilsr   Moduler   rF   ri   rz   r   
EntryClassr/   r/   r/   r0   <module>   s2    <W?8
Y