o
    پif                     @   s  d dl mZmZ d dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
mZ d dlmZ d dlmZmZ d d	lmZ d d
lmZ d dlmZmZmZ d dlmZ d dlmZmZ d dlmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z'm(Z( d dl)m*Z*m+Z+ d dl,m-Z- d dl.m/Z/ d dl0m1Z1 ddl2m3Z3 G dd dej4Z5G dd dej4Z6G dd dej4Z7G dd  d ej4Z8e8e6d!Z9G d"d# d#ej4Z:G d$d% d%ej4Z;e;gZ<dS )&    )IterableOptionalN)nn)GraniteMoeSharedConfig)GraniteMoeHybridConfig)get_pp_group$get_tensor_model_parallel_world_size)
SiluAndMul)HybridLinearAttnBackendMamba2AttnBackend)MambaMixer2)RMSNorm)MergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)PoolerPoolingType)QuantizationConfig)RadixAttention)get_rope)PPMissingLayer)ParallelLMHeadVocabParallelEmbedding)ForwardBatchPPProxyTensors)default_weight_loader)maybe_prefix)make_layers   )GraniteMoeMoEc                       sJ   e Zd Z		ddededB def fddZdejd	ejfd
dZ	  Z
S )GraniteMoeSharedMLPN configquant_configprefixc                    s   t    |j| _|j| _t| j| jgd d|| dd| _t| j| jd|| dd| _|j	dkr<t
d|j	 d	t | _d S )
N   Fz.input_linear)
input_sizeoutput_sizesbiasr$   r%   z.output_linearr)   r$   r%   siluzUnsupported activation: z!. Only silu is supported for now.)super__init__hidden_sizer'   shared_intermediate_sizer   input_linearr   output_linear
hidden_act
ValueErrorr	   act_fnselfr#   r$   r%   	__class__ V/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/granitemoehybrid.pyr-   )   s,   


zGraniteMoeSharedMLP.__init__hidden_statesreturnc                 C   s*   |  |\}}| |}| |\}}|S N)r0   r4   r1   )r6   r;   gate_up_xr9   r9   r:   forwardH   s   
zGraniteMoeSharedMLP.forwardNr"   )__name__
__module____qualname__r   r   strr-   torchTensorrA   __classcell__r9   r9   r7   r:   r!   (   s    r!   c                       s`   e Zd Z		ddedededB deddf
 fdd	Zd
ej	dej	dej	dB de
fddZ  ZS )!GraniteMoeHybridMambaDecoderLayerNr"   r#   	layer_idxr$   r%   r<   c                    s   t    || _|| _|j| _|j| _t|j|j|j|j	|j
|j|j|| dd	| _d | _t|dddkrKt|j|j|j|j||t | dd| _t|dddkrUd n	t||| dd	| _t|j|jd
| _t|j|jd
| _d S )Nz.mixer)	cache_paramsr.   use_conv_biasuse_biasn_groupsrms_norm_eps
activationr$   r%   num_local_expertsr   .block_sparse_moenum_expertstop_kr.   intermediate_sizelayer_idr$   tp_sizer%   r/   .shared_mlpr$   r%   eps)r,   r-   r#   rK   r.   residual_multiplierr   mamba2_cache_paramsmamba_conv_biasmamba_proj_biasmamba_n_groupsrP   r2   mambablock_sparse_moegetattrr    rR   num_experts_per_tokrW   r   r!   
shared_mlpr   input_layernormpost_attention_layernormr6   r#   rK   r$   r%   r7   r9   r:   r-   P   sL   
z*GraniteMoeHybridMambaDecoderLayer.__init__	positionsr;   residualforward_batchc                 C   s   |}|  |}t|}|j}t|tsJ t|jtsJ |jj| j	| j
||dd ||| j  }|}| |}| jd u rI| jd urH| |}n| jd ur`| }| |}|| | }~n| |}||| j  }||fS )NT)mixerrX   r;   outputuse_triton_causal_conv)rh   rG   
empty_likeattn_backend
isinstancer
   linear_attn_backendr   rA   rc   rK   r^   ri   rg   rd   clone)r6   rk   r;   rl   rm   ro   rr   moe_hidden_statesr9   r9   r:   rA      s8   








z)GraniteMoeHybridMambaDecoderLayer.forwardrB   rC   rD   rE   r   intr   rF   r-   rG   rH   r   rA   rI   r9   r9   r7   r:   rJ   O   s.    3rJ   c                       sd   e Zd Z		ddedededB deddf
 fdd	Z	dd
ej	dej	de
dB dej	fddZ  ZS )GraniteMoeHybridAttentionNr"   r#   rX   r$   r%   r<   c              	      sN  t    d| _|j| _|j| _|j| _|j| _| j| j | _|j	| _
t }| j| dks/J | j| | _| j
|krD| j
| dksCJ n	|| j
 dksMJ td| j
| | _	t| j| j| j| j
| j|| dd| _t| j| j| j|| dd| _|jdkrt| j| j|j|j|jd| _nd | _t| j| j| j| j	||| d	d
| _d S )NTr   r   	.qkv_projr*   z.o_projrope)	head_size
rotary_dimmax_positionbaserope_scalingz.attn)	num_headshead_dimscalingnum_kv_headsrX   r$   r%   )r,   r-   causalr.   attention_biasattention_multipliernum_attention_headstotal_num_headsr   num_key_value_headstotal_num_kv_headsr   r   maxr   qkv_projr   o_projposition_embedding_typer   max_position_embeddings
rope_thetar   
rotary_embr   attn)r6   r#   rX   r$   r%   rY   r7   r9   r:   r-      sb   




z"GraniteMoeHybridAttention.__init__rk   r;   rm   c           	      C   s   |  |\}}|j| j| j | j| j | j| j gdd\}}}| jd ur.| |||\}}| j||||d}~~~| |d }|S )Ndim)rm   r   )r   splitr   r   r   r   r   r   )	r6   rk   r;   rm   qkvr?   querykeyvaluer9   r9   r:   rA      s   



	z!GraniteMoeHybridAttention.forwardrB   r=   rw   r9   r9   r7   r:   ry      s0    Jry   c                       sn   e Zd Z		ddedededB deddf
 fdd	Z	dd
ej	dej	dej	dB de
dB dej	f
ddZ  ZS )%GraniteMoeHybridAttentionDecoderLayerNr"   r#   rK   r$   r%   r<   c              
      s   t    |j| _|j| _t|||| dd| _d | _t|dddkr9t|j	|j
|j|j||t | dd| _t|dddkrCd n	t||| dd	| _t|j|jd
| _t|j|jd
| _d S )Nz
.self_attn)rX   r$   r%   rR   r   rS   rT   r/   rZ   r[   r\   )r,   r-   r.   r^   ry   	self_attnrd   re   r    rR   rf   rW   r   r!   rg   r   rP   rh   ri   rj   r7   r9   r:   r-     s>   
z.GraniteMoeHybridAttentionDecoderLayer.__init__rk   r;   rl   rm   c                 C   s   |}|  |}| j|||d}||| j  }|}| |}| jd u r-| jd ur,| |}n| jd urD| }| |}|| | }~n| |}||| j  }||fS )N)rk   r;   rm   )rh   r   r^   ri   rg   rd   ru   )r6   rk   r;   rl   rm   rv   r9   r9   r:   rA   =  s,   







z-GraniteMoeHybridAttentionDecoderLayer.forwardrB   r=   rw   r9   r9   r7   r:   r     s4    1r   )	attentionrc   c                       s   e Zd Z		ddededB def fddZdejfd	d
Z				dde
jdB de
jdedB de
jdB dee de
jfddZ  ZS )GraniteMoeHybridModelNr"   r#   r$   r%   c                    s   t     | _| _ j| _t | _| jjr#t| j j	 jd| _
nt | _
 j| _dtdtf fdd}t j|| jj| jj| dd\| _| _| _| jjr\t j	 jd| _ntd	d
| _g | _d S )N)org_num_embeddingsidxr%   c                    s2   t |ddd }t j|  }| ||dS )N.r   r[   )rx   rsplitALL_DECODER_LAYER_TYPESlayer_types)r   r%   rK   layer_classr#   r$   r9   r:   	get_layer  s   z1GraniteMoeHybridModel.__init__.<locals>.get_layerz.layers)pp_rankpp_sizer%   r\   T)return_tuple)r,   r-   r#   r$   
vocab_sizer   pp_groupis_first_rankr   r.   embed_tokensr   embedding_multiplierrx   rF   r   num_hidden_layersrank_in_group
world_sizelayersstart_layer	end_layeris_last_rankr   rP   normlayers_to_capture)r6   r#   r$   r%   r   r7   r   r:   r-   i  s2   



zGraniteMoeHybridModel.__init__r<   c                 C   s   | j S )z$Get input embeddings from the model.)r   r6   r9   r9   r:   get_input_embeddings  s   z*GraniteMoeHybridModel.get_input_embeddings	input_idsrk   rm   inputs_embedspp_proxy_tensorsc                 C   s   | j jr|d ur|}n
| |}|| j }d }n|d usJ |d }|d }g }t| j| jD ]}	|	| jv r=|||  | j	|	 }
|
||||\}}q/| j j
sWt||dS | ||\}}t|dkrg|S ||fS )Nr;   rl   )r;   rl   r   )r   r   r   r   ranger   r   r   appendr   r   r   r   len)r6   r   rk   rm   r   r   r;   rl   aux_hidden_statesilayerr?   r9   r9   r:   rA     s<   




zGraniteMoeHybridModel.forwardrB   )NNN)rC   rD   rE   r   r   rF   r-   r   	Embeddingr   rG   rH   r   r   r   rA   rI   r9   r9   r7   r:   r   h  s6    2r   c                       s   e Zd Zg ddgdgdgdZdddZ			
d%deded	B def fddZe	dd Z
e	dd ZdejfddZ					d&dejdejdedejdedee fddZdeeeeeef  fd d!Zd"eeeejf  dee fd#d$Z  ZS )'GraniteMoeHybridForCausalLM)q_projk_projv_projconv1din_projr0   )r   r   r   r0   input_embeddingsoutput_embeddings)r   lm_headNr"   r#   r$   r%   c                    s   t    d| _t | _|| _|| _t||t|dd| _	t
|j|j| jt|dd| _|jr6| j	jj| j_t|d| jj d| _ttjdd	| _d S )
NFmodel)r#   r$   r%   r   r[   r   )logit_scaleT)pooling_type	normalize)r,   r-   capture_aux_hidden_statesr   r   r$   r#   r   r   r   r   r   r.   r   tie_word_embeddingsr   weightr   logits_scalinglogits_processorr   r   LASTpoolerr5   r7   r9   r:   r-     s.   

z$GraniteMoeHybridForCausalLM.__init__c                 C      | j jS r=   )r   r   r   r9   r9   r:   r        z'GraniteMoeHybridForCausalLM.start_layerc                 C   r   r=   )r   r   r   r9   r9   r:   r   	  r   z%GraniteMoeHybridForCausalLM.end_layerr<   c                 C   r   r=   )r   r   r   r9   r9   r:   r     s   z0GraniteMoeHybridForCausalLM.get_input_embeddingsFr   rk   rm   input_embedsget_embeddingr   c           	      C   sT   |  |||||}d }| jr|\}}| jjr(|s"| ||| j||S | ||S |S r=   )r   r   r   r   r   r   r   )	r6   r   rk   rm   r   r   r   r;   r   r9   r9   r:   rA     s"   	
z#GraniteMoeHybridForCausalLM.forwardc                    s.   dd d| j j} fddt|D S )N	gate_proj	down_projup_projc              	      sV   g | ]'}d fd fdffD ]\}}|fv rdndd| d| d||fqqS )w1w2w3zblock_sparse_moe.experts.w13_zblock_sparse_moe.experts.w2_zblock_sparse_moe.experts.r   r9   ).0	expert_idshard_idweight_nameckpt_down_proj_nameckpt_gate_proj_nameckpt_up_proj_namer9   r:   
<listcomp>8  s     zBGraniteMoeHybridForCausalLM.get_expert_mapping.<locals>.<listcomp>)r#   rR   r   )r6   rU   r9   r   r:   get_expert_mapping/  s   z.GraniteMoeHybridForCausalLM.get_expert_mappingweightsc              	      s>  g d}t |  t |   fdd}fdd}fdd} fdd	}|D ]\}}d
|v r=|d
d}| jd urc| j| }	rc|}
|
 dkrT|
n|
d }
||	|
 |	 q/|||riq/|	dss|	drt
|dD ];}|dd| d}|dd| d}|| jddd\}}||dd||d|d ||dd||d|d qzq/|	ds|	drt
|dD ]}|dd| d}|| }||dd||d|d qq/|	dr|dd }||| q/d!}|D ]\}}}||v r||||||d" d#}q|s||| q/S )$N))rz   z.q_projq)rz   z.k_projk)rz   z.v_projvc                    s,   |  }t |dt}|||  |  d S Nweight_loaderre   r   add)npparamr   loaded_paramsparams_dictr9   r:   _loadW  s   
z7GraniteMoeHybridForCausalLM.load_weights.<locals>._loadc                    s.   |  }t |dt}||||  |  d S r   r   )r   r   r   r   r   r   r9   r:   _load_shard]  s   z=GraniteMoeHybridForCausalLM.load_weights.<locals>._load_shardc                    s4   |  }t |dt}||||||d  |  d S )Nr   r   r   r   )r   r   namer   r   r   r   r   r9   r:   _load_expertd  s   z>GraniteMoeHybridForCausalLM.load_weights.<locals>._load_expertc              	      sj    D ]0}|\}}}}|| vrq|  ||}| }|j}	d}
|	d ur,|	|||||dd}
|
r2|  S qd S )NFT)r   r   return_success)replacer   )r   loaded_weightmapping
param_namer   r   r   name_mappedr   r   success)expert_params_mappingr   r9   r:   _load_quant_expertj  s*   	zDGraniteMoeHybridForCausalLM.load_weights.<locals>._load_quant_expertA_logAr   z%.block_sparse_moe.input_linear.weightz+.block_sparse_moe.input_linear.weight_scalez.block_sparse_moe.experts.z
.w1.weightz
.w3.weightr&   r   z.input_linear.z.experts.w13_r   r   r   z&.block_sparse_moe.output_linear.weightz,.block_sparse_moe.output_linear.weight_scalez
.w2.weightz.output_linear.z.experts.w2_r   z%.block_sparse_moe.router.layer.weightz.block_sparse_moe.gate.weightF)r   T)dictnamed_parameterssetr   r  r$   get_cache_scaler   r   endswithr   sizechunk)r6   r   stacked_params_mappingr   r   r   r	  r   r   
scale_namer  ew1_namew3_namew1_paramw3_paramw2_namew2_param	gate_nameloadedr  r   r   r9   )r  r   r   r:   load_weightsL  s   












z(GraniteMoeHybridForCausalLM.load_weightsrB   )NFN)rC   rD   rE   packed_modules_mappingembedding_modulesr   r   rF   r-   propertyr   r   r   r   r   rG   rH   r   boolr   r   rA   listtuplerx   r   r   r  r  rI   r9   r9   r7   r:   r     sR    $


,r   )=typingr   r   rG   r   $transformers.models.granitemoesharedr   #sglang.srt.configs.granitemoehybridr   sglang.srt.distributedr   r   sglang.srt.layers.activationr	   6sglang.srt.layers.attention.hybrid_linear_attn_backendr
   r   'sglang.srt.layers.attention.mamba.mambar   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r   r   "sglang.srt.layers.logits_processorr   sglang.srt.layers.poolerr   r   *sglang.srt.layers.quantization.base_configr   !sglang.srt.layers.radix_attentionr   "sglang.srt.layers.rotary_embeddingr   sglang.srt.layers.utilsr   *sglang.srt.layers.vocab_parallel_embeddingr   r   ,sglang.srt.model_executor.forward_batch_infor   r   $sglang.srt.model_loader.weight_utilsr   sglang.srt.models.transformersr   sglang.srt.utilsr   
granitemoer    Moduler!   rJ   ry   r   r   r   r   
EntryClassr9   r9   r9   r:   <module>   sJ    '`aS
g  
