o
    پiv                     @   s  U d Z ddlmZ ddlmZmZ ddlZddlmZ ddlm	Z	 ddl
mZmZmZmZ ddlmZmZmZmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlmZmZm Z m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/m0Z0 ddl1m2Z2m3Z3m4Z4 ddl5m6Z6m7Z7 ddl8m9Z9m:Z:m;Z;m<Z< ddl=m>Z> ddl?m@Z@ ddlAmBZBmCZCmDZDmEZE ddlFmGZG eD ZHG dd dejIZJdaKdd ZLG d d! d!ejIZMG d"d# d#ejIZNG d$d% d%ejIZOG d&d' d'ejIZPG d(d) d)ejIZQG d*d+ d+ejIZReReNB ePB eOB ZSeeReeNeePeeOiZTeUeVeWeS f eXd,< G d-d. d.ejIZYG d/d0 d0ejIZZeZgZ[dS )1zInference-only NemotronH model.    )Iterable)OptionalUnionN)nn)NemotronHConfig)	ATTENTIONMAMBAMLPMOE)get_moe_ep_groupget_pp_group$get_tensor_model_parallel_world_size tensor_model_parallel_all_reduce)ReLU2)HybridLinearAttnBackendMamba2AttnBackend)MambaMixer2)RMSNorm)ColumnParallelLinearQKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)get_moe_impl_class)FusedMoE)TopK)QuantizationConfig)RadixAttention)PPMissingLayerget_layer_id)DEFAULT_VOCAB_PADDING_SIZEParallelLMHeadVocabParallelEmbedding)ForwardBatchPPProxyTensors)default_weight_loadermaybe_remap_kv_scale_namereplace_prefixreplace_substrings)WeightsMapper)get_global_server_args)
add_prefixget_current_device_stream_fastis_cudamake_layers)loggerc                       sX   e Zd Z				ddededee ded	ed
eddf fddZ	de
jfddZ  ZS )NemotronHMLPNFT configintermediate_sizequant_configbiasreduce_resultsprefixreturnc                    sP   t    t|j|||| dd| _t||j|||| dd| _t | _d S )Nz.up_proj
input_sizeoutput_sizer5   r4   r7   z
.down_proj)r:   r;   r5   r4   r6   r7   )	super__init__r   hidden_sizeup_projr   	down_projr   act_fn)selfr2   r3   r4   r5   r6   r7   	__class__ P/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/nemotron_h.pyr=   N   s"   
	zNemotronHMLP.__init__xc                 C   s*   |  |\}}| |}| |\}}|S N)r?   rA   r@   )rB   rG   _rE   rE   rF   forwardj   s   
zNemotronHMLP.forward)NFTr1   )__name__
__module____qualname__r   intr   r   boolstrr=   torchTensorrJ   __classcell__rE   rE   rC   rF   r0   M   s*    r0   c                 C   s   t d u r|  a t S rH   )_alt_streamStream)device_modulerE   rE   rF   _get_or_create_alt_streamt   s   rW   c                       s   e Zd Z		ddededee deddf
 fdd	Zd
e	j
dee	j
e	j
dB f fddZd
e	j
dee	j
e	j
dB f fddZd
e	j
dee	j
e	j
dB f fddZd
e	j
de	j
fddZ  ZS )NemotronHMoENr1   r2   	layer_idxr4   r7   r8   c                    s  t    t | _|j| _t | _t j	| _
| j
 | _| j
 | _|j| _|j| _t|dd d u| _| jr:|jn|j| _t|j|jdtjd | dd| _ttj|jtjd| j_t|jd|j|j |j!d| jjdd	| _"t#||jt$ j% |j| j|j&d|| d
|j'|dd
| _(|jrt)||j*|j |d| dd| _+nd | _+| jrt|j| j|j,|| dd| _-t| j|j|j,|| dd| _.d S d | _-d | _.d S )Nmoe_latent_sizeFz.gate)r5   params_dtyper4   r7   dtypeTsigmoid      ?)top_kuse_grouped_topk
topk_groupnum_expert_grouprenormalizescoring_funccorrection_biasrouted_scaling_factorz.experts)
num_expertsr`   r>   r3   r6   r4   r7   
activationlayer_idis_gatedz.shared_experts)r3   r4   r6   r7   z.fc1_latent_projr9   z.fc2_latent_proj)/r<   r=   r   tp_sizerg   rQ   get_device_modulerV   r   device_groupep_grouprankep_ranksizeep_sizen_routed_expertsn_shared_expertsgetattruse_latent_moerZ   r>   moe_hidden_sizer   float32gater   	Parameteremptye_score_correction_biasr   num_experts_per_tokrb   n_groupnorm_topk_probtopkr   r*   ep_num_redundant_expertsmoe_intermediate_sizemlp_hidden_actexpertsr0   #moe_shared_expert_intermediate_sizeshared_expertsmlp_biasfc1_latent_projfc2_latent_projrB   r2   rY   r4   r7   rC   rE   rF   r=   |   s   




	
zNemotronHMoE.__init__hidden_statesc                 C   s   t r| |S | |S rH   )_is_cuda#_forward_core_shared_routed_overlap_forward_core_normal)rB   r   rE   rE   rF   _forward_core   s   

zNemotronHMoE._forward_corec                 C   sf   |  |jtjd\}}| jd ur| |}nd }| ||}| jr)| |\}}| ||}||fS Nr\   )	rz   torQ   ry   r   r   rw   r   r   )rB   r   router_logitsrI   shared_outputtopk_outputfinal_hidden_statesrE   rE   rF   r      s   
z!NemotronHMoE._forward_core_normalc                 C   s   t | j}|t  | jd ur| |}nd }| j|* | |jtj	d\}}| 
||}| jr;| |\}}| ||}W d    n1 sKw   Y  t | ||fS r   )rW   rV   wait_streamr,   r   streamrz   r   rQ   ry   r   rw   r   r   )rB   r   
alt_streamr   r   rI   r   r   rE   rE   rF   r      s   

z0NemotronHMoE._forward_core_shared_routed_overlapc                 C   s   |j \}}| |\}}|jtjkr|| j9 }n| jd ur*|d us#J |d| j 9 }| jr4| |\}}|d ur<||7 }| j	dkrEt
|}|||S )Nr_      )shaper   r]   rQ   float16rg   r   rw   r   rl   r   view)rB   r   
num_tokens
hidden_dimr   r   rI   rE   rE   rF   rJ     s   


zNemotronHMoE.forwardNr1   )rK   rL   rM   r   rN   r   r   rP   r=   rQ   rR   tupler   r   r   rJ   rS   rE   rE   rC   rF   rX   {   s<    Y
	

rX   c                       j   e Zd Z		ddededee deddf
 fdd	Zd
e	j
dee	j
 dedee	j
e	j
f fddZ  ZS )NemotronHMLPDecoderLayerNr1   r2   rY   r4   r7   r8   c                    s   t    || _|j}|d |d  dd }t|jtr1t|jdkr+|jd }n	|j| }n|j}t	||||j
| dd| _t|j|jd| _d S )Nr   -r   .mixer)r3   r4   r5   r7   eps)r<   r=   r2   hybrid_override_patterncount
isinstancer3   listlenr0   r   mixerr   r>   layer_norm_epsilonnorm)rB   r2   rY   r4   r7   r   	mlp_indexr3   rC   rE   rF   r=     s"   
z!NemotronHMLPDecoderLayer.__init__r   residualforward_batchc                C   <   |d u r|}|  |}n|  ||\}}| j|}||fS rH   r   r   rJ   rB   r   r   r   rE   rE   rF   rJ   <     z NemotronHMLPDecoderLayer.forwardr   rK   rL   rM   r   rN   r   r   rP   r=   rQ   rR   r#   r   rJ   rS   rE   rE   rC   rF   r     s.    r   c                       r   )NemotronHMoEDecoderLayerNr1   r2   rY   r4   r7   r8   c                    8   t    t|||| dd| _t|j|jd| _d S )Nr   )rY   r4   r7   r   )r<   r=   rX   r   r   r>   r   r   r   rC   rE   rF   r=   N     
z!NemotronHMoEDecoderLayer.__init__r   r   r   c                C   r   rH   r   r   rE   rE   rF   rJ   `  r   z NemotronHMoEDecoderLayer.forwardr   r   rE   rE   rC   rF   r   M  .    r   c                       r   )NemotronHMambaDecoderLayerNr1   r2   rY   r4   r7   r8   c                    s\   t    || _|| _t|j|j|j|j|j	|j
|j|| dd	| _t|j|j
d| _d S )Nr   )	cache_paramsr>   use_conv_biasuse_biasn_groupsrms_norm_epsri   r4   r7   r   )r<   r=   r2   rj   r   mamba2_cache_paramsr>   r   r   mamba_n_groupsr   mamba_hidden_actr   r   r   r   rC   rE   rF   r=   r  s   
z#NemotronHMambaDecoderLayer.__init__r   r   r   c                C   sx   |d u r|}|  |}n|  ||\}}t|}|j}t|ts#J t|jts+J |jj| j	| j
||dd ||fS )NT)r   rj   r   outputuse_triton_causal_conv)r   rQ   
empty_likeattn_backendr   r   linear_attn_backendr   rJ   r   rj   )rB   r   r   r   r   r   rE   rE   rF   rJ     s    
z"NemotronHMambaDecoderLayer.forwardr   r   rE   rE   rC   rF   r   q  s.    r   c                       sV   e Zd Z		ddededee deddf
 fdd	Zd
e	j
dede	j
fddZ  ZS )NemotronHAttentionNr1   r2   rY   r4   r7   r8   c              
      sP  t    |j| _t }|j| _| j| dksJ | j| | _|j| _| j|kr2| j| dks1J n	|| j dks;J t	d| j| | _
t|drS|jd urS|j| _n|j| j | _| j| j | _| j
| j | _| jd | _t|j| j| j| jd|| dd| _t| j| j |jd|| dd| _t| j| j| j| j
||td	|d
| _d S )Nr   r   head_dimg      Fz	.qkv_proj)r5   r4   r7   z.o_projattn)num_kv_headsrj   r4   r7   )r<   r=   r>   r   num_attention_headstotal_num_heads	num_headsnum_key_value_headstotal_num_kv_headsmaxr   hasattrr   q_sizekv_sizescalingr   qkv_projr   o_projr   r+   r   )rB   r2   rY   r4   r7   rl   rC   rE   rF   r=     sT   


	
zNemotronHAttention.__init__r   r   c           
      C   sR   |  |\}}|j| j| j| jgdd\}}}| j||||}| |\}	}|	S )N)dim)r   splitr   r   r   rJ   r   )
rB   r   r   qkvrI   qkvattn_outputr   rE   rE   rF   rJ     s
    zNemotronHAttention.forwardr   )rK   rL   rM   r   rN   r   r   rP   r=   rQ   rR   r#   rJ   rS   rE   rE   rC   rF   r     s*    :r   c                       r   )NemotronHAttentionDecoderLayerNr1   r2   rY   r4   r7   r8   c                    r   )Nr   )r7   r   )r<   r=   r   r   r   r>   r   r   r   rC   rE   rF   r=     r   z'NemotronHAttentionDecoderLayer.__init__r   r   r   c                C   s@   |d u r|}|  |}n|  ||\}}| jj||d}||fS )N)r   r   r   r   rE   rE   rF   rJ     s   z&NemotronHAttentionDecoderLayer.forwardr   r   rE   rE   rC   rF   r     r   r   ALL_DECODER_LAYER_TYPESc                       sv   e Zd Zddddedee def fddZ		dd	ej	d
ej	de
dee deej	 deej	ef fddZ  ZS )NemotronHModelNr1   r4   r7   r2   r4   r7   c                   s   t    d } | _|r|j|jpd nd} j| | _ j| _t | _| jj	r4t
| j j jd| _nt | _dtdtf fdd}tt j|| jj| jj| dd	\| _| _| _| jjrlt j jd
| _d S tdd| _d S )Nr   r   )org_num_embeddingsidxr7   c                    s   t  j|   }| | |dS )Nr   )r   r   )r   r7   layer_classr2   r4   rE   rF   	get_layer<  s   z*NemotronHModel.__init__.<locals>.get_layerz.layers)pp_rankpp_sizer7   r   T)return_tuple)r<   r=   r2   lora_extra_vocab_size	max_loras
vocab_sizeorg_vocab_sizer   pp_groupis_first_rankr"   r>   embed_tokensr   rN   rP   r.   r   r   rank_in_group
world_sizelayersstart_layer	end_layeris_last_rankr   r   norm_f)rB   r2   r4   r7   lora_config
lora_vocabr   rC   r   rF   r=     s8   

zNemotronHModel.__init__	input_ids	positionsr   pp_proxy_tensorsinputs_embedsr8   c                 C   s   | j jr|d ur|}n| |}d }n|d usJ |d }|d }t| j| jD ]}| j| }	t|	ts=t	dt
|	 |	j|||d\}}q(| j jsSt||dS | ||\}}
|S )Nr   r   zUnknown layer type: )r   r   r   )r   r   )r   r   r   ranger   r   r   r   Layers
ValueErrortyperJ   r   r$   r   )rB   r   r   r   r   r  r   r   ilayerrI   rE   rE   rF   rJ   L  s.   


zNemotronHModel.forwardNN)rK   rL   rM   r   r   r   rP   r=   rQ   rR   r#   r$   r   rJ   rS   rE   rE   rC   rF   r     s2    2r   c                       s$  e Zd Zg dZdg diZddiZdddd	d
ZeddidZdddde	de
e def fddZ		d0de	de
e defddZdefddZe 		d1dejdejdede
ej de
e f
d d!Zd"d# Zd$efd%d&Zd'd( Zd)d* Z	+d2d,eeeejf  d-eddfd.d/Z  Z S )3NemotronHForCausalLM))r   q_projr   )r   k_projr   )r   v_projr   r   )r
  r  r  backbonemodelAr   zattn.k_scalezattn.v_scale)A_log
embeddingszk_proj.k_scalezv_proj.v_scale	backbone.zmodel.)orig_to_new_prefixNr1   r   r2   r4   r7   c             	      s2  t    d }|| _|| _| j|||d| _t | _| jjrS| jj	dkr.| jj
r.| jj| _n)|j| _|r<|  j|j7  _t| j|j|j|sGtn|j|td|d| _nt | _| jj	dkr| jj
r| jjrs| jj| jjj| jjd n| jjr| jj| jjjt| j j| jjd}| jj | t!|| _"d S )Nr2   r4   r7   r   lm_head)r   padding_sizer4   r7   )dst)rr   r]   src)#r<   r=   r2   r4   _init_modelr  r   r   r   r   tie_word_embeddingsr   r  r   unpadded_vocab_sizer   r!   r>   r    lora_vocab_padding_sizer+   r   r   sendweight	last_rankrecvr   next
parametersr]   
first_rankcopy_r   logits_processor)rB   r2   r4   r7   r   emb_token_weightrC   rE   rF   r=     sL   

zNemotronHForCausalLM.__init__c                 C   s   t ||td|dS )Nr  r  )r   r+   )rB   r2   r4   r7   rE   rE   rF   r    s   z NemotronHForCausalLM._init_modelr8   c                 C   s   | j jS rH   )r  r   rB   rE   rE   rF   get_input_embeddings  s   z)NemotronHForCausalLM.get_input_embeddingsr   r   r   input_embedsr   c                 C   s2   | j |||||}| jjr| ||| j|S |S rH   )r  rJ   r   r   r%  r  )rB   r   r   r   r)  r   r   rE   rE   rF   rJ     s   	

zNemotronHForCausalLM.forwardc                 K   s   | j j|fi |S rH   )mamba_cachecopy_inputs_before_cuda_graphs)rB   input_bufferskwargsrE   rE   rF   r+    s   z3NemotronHForCausalLM.copy_inputs_before_cuda_graphs
batch_sizec                 C   s   | j |S rH   )r*  "get_seqlen_agnostic_capture_inputs)rB   r.  rE   rE   rF   r/    s   z7NemotronHForCausalLM.get_seqlen_agnostic_capture_inputsc                 C   s   | j jj| jjfS rH   )r  r   r  r  r'  rE   rE   rF   get_embed_and_head  s   z'NemotronHForCausalLM.get_embed_and_headc                 C   s8   | j j`| j`|| j j_|| j_tj  tj  d S rH   )r  r   r  r  rQ   cudaempty_cachesynchronize)rB   embedheadrE   rE   rF   set_embed_and_head  s   

z'NemotronHForCausalLM.set_embed_and_headFweightsis_mtpc              	   C   sZ  g }|D ]\}}t || j}t|| j}|||f qtjddd| jjd}t	| 
 }|D ]\}}|rUd|vr:q/|dd}d|v rU|dd	}|d
rU|d
d}|s\d|v r\q/d|v rn||vrnt||}|d u rnq/t|}|d urt| jdr|| jjk s|| jjkrq/d|v r| jjsq/d|v sd|v r| jjsq/| jD ]-\}	}
}|
|vrq||
|	}|dr||vrq||vrq|| }|j}||||  nZd}|D ]&}|\}	}
}}|
|vrqd}||
|	}|| }|j|||||d |} n/|rq/|dr
||vr
q/|| v r!|| }t|dt}||| q/td| d q/d S )Nr?   r@   r1   )ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namerh   mtpzmtp.layers.zmodel.layers.r  zmodel.embed_tokensr  scaler   r   r   r  z.biasFT)shard_id	expert_idweight_loaderz
Parameter z not found in params_dict)r'   remap_prefixr(   remap_substrappendr   make_expert_params_mappingr2   rt   dictnamed_parametersreplace
startswithr&   r   r   r  r   r   r   r   r   stacked_params_mappingendswithr@  keysrv   r%   r/   warning)rB   r7  r8  updated_weightsnameloaded_weightexpert_params_mappingparams_dictrj   
param_nameweight_namer>  paramr@  is_expert_weightmappingr?  name_mappedrE   rE   rF   load_weights  s   


z!NemotronHForCausalLM.load_weightsr   r  )F)!rK   rL   rM   rI  packed_modules_mappingrA  rB  r)   hf_to_sglang_mapperr   r   r   rP   r=   r  r"   r(  rQ   no_gradrR   r#   r$   rJ   r+  rN   r/  r0  r6  r   r   rO   rX  rS   rE   rE   rC   rF   r	  q  sv    
:

	r	  )\__doc__collections.abcr   typingr   r   rQ   r   sglang.srt.configsr   sglang.srt.configs.nemotron_hr   r   r	   r
   sglang.srt.distributedr   r   r   r   sglang.srt.layers.activationr   6sglang.srt.layers.attention.hybrid_linear_attn_backendr   r   'sglang.srt.layers.attention.mamba.mambar   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r   r   r   "sglang.srt.layers.logits_processorr   "sglang.srt.layers.moe.ep_moe.layerr   ,sglang.srt.layers.moe.fused_moe_triton.layerr   sglang.srt.layers.moe.topkr   sglang.srt.layers.quantizationr   !sglang.srt.layers.radix_attentionr   sglang.srt.layers.utilsr   r   *sglang.srt.layers.vocab_parallel_embeddingr    r!   r"   ,sglang.srt.model_executor.forward_batch_infor#   r$   $sglang.srt.model_loader.weight_utilsr%   r&   r'   r(   sglang.srt.models.utilsr)   sglang.srt.server_argsr*   sglang.srt.utilsr+   r,   r-   r.   sglang.utilsr/   r   Moduler0   rT   rW   rX   r   r   r   r   r   r  r   rE  rP   r  __annotations__r   r	  
EntryClassrE   rE   rE   rF   <module>   sn   $ #0$4E'S 
o