o
    
۾i                     @   sj  d Z ddlZddlmZmZ ddlmZ ddlZddlmZ ddl	m
Z
 ddlmZmZmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZmZ ddlm Z  ddl!m"Z" ddl#m$Z$m%Z%m&Z&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2 ddl3m4Z4m5Z5 ddl6m7Z7m8Z8 ddl9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@ ddlAmBZBmCZCmDZDmEZEmFZFmGZGmHZH ddlImJZJ ddlKmLZL G dd dejMZNG dd dejMZOG d d! d!ejMZPG d"d# d#ejMZQG d$d% d%ejMZRG d&d' d'ejMZSG d(d) d)ejMZTeRePeTeQd*ZUe
G d+d, d,ejMZVG d-d. d.ejMe:e=e?e;e@e<e>
ZWdS )/zInference-only NemotronH model.    N)CallableIterable)islice)nn)support_torch_compile)CacheConfigModelConfig
VllmConfig)ParallelConfig)get_ep_group$get_tensor_model_parallel_world_size) tensor_model_parallel_all_gather)get_pp_group)ReLUSquaredActivation)	Attention)FusedMoESharedFusedMoE)activation_without_mul)RMSNorm)ColumnParallelLinearQKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)MambaMixer2)MambaStateCopyFuncMambaStateCopyFuncCalculatorMambaStateDtypeCalculatorMambaStateShapeCalculator)QuantizationConfig)ParallelLMHeadVocabParallelEmbedding)default_weight_loadermaybe_remap_kv_scale_name)HasInnerStateIsHybridMixtureOfExpertsSupportsLoRASupportsMambaPrefixCaching
SupportsPPSupportsQuant)AutoWeightsLoaderWeightsMapperis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixsequence_parallel_chunk)IntermediateTensors)NemotronHConfigc                       sb   e Zd Z					ddededededB d	ed
edededdf fddZde	j
fddZ  ZS )NemotronHMLPNFT confighidden_sizeintermediate_sizequant_configbiasreduce_resultsis_sequence_parallelprefixreturnc	           	   	      sP   t    t|||||| dd| _t||||||| dd| _t | _d S )Nz.up_proj
input_sizeoutput_sizer:   r9   
disable_tpr=   z
.down_proj)r@   rA   r:   r9   r;   rB   r=   )super__init__r   up_projr   	down_projr   act_fn)	selfr6   r7   r8   r9   r:   r;   r<   r=   	__class__ Y/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/nemotron_h.pyrD   U   s&   
	zNemotronHMLP.__init__xc                 C   s*   |  |\}}| |}| |\}}|S N)rE   rG   rF   )rH   rM   _rK   rK   rL   forwardu   s   
zNemotronHMLP.forward)NFTFr5   )__name__
__module____qualname__r3   intr   boolstrrD   torchTensorrP   __classcell__rK   rK   rI   rL   r4   T   s4    	
 r4   c                	       sT   e Zd Z			ddededB dedB def fddZd	ej	d
ej	fddZ
  ZS )NemotronHMoENr5   r6   r9   parallel_configr=   c              	      sj  t    t | _|j| _t j| _| j | _	| j
 | _|j| _|j| _t|dd d u| _| jr5|jn|j| _|j| _tj}t|j|jd|d | dd| _ttj|jtjd| j_|j| _|jj| _ | j| _!| j!| j  | _"| j"| j | _#| j	| j# | _$| j$| j# | _%|jd u s|jdkrd | _&n|j'|j }t(||j||d| j| dd| _&| jrt|j| j|j)|| j| d	d
| _*t| j|j|j)|| j| dd
| _+nd | _*d | _+t,d$i d| j&d|jd|j-d| jd|j.ddd|j/d|ddd|j0d|j1d| dddd| jjdt2|j3ddd| jd | j d!| jd"|d#| j*| _4d S )%Nmoe_latent_sizeFz.gate)r:   params_dtyper9   r=   dtyper   z.shared_experts)r6   r7   r8   r9   r;   r<   r=   z.fc1_latent_projr?   z.fc2_latent_projshared_expertsnum_expertstop_kr7   r8   r;   renormalizer9   use_grouped_topkTnum_expert_group
topk_groupr=   z.expertsscoring_funcsigmoide_score_correction_bias
activationis_act_and_mulenable_eplbnum_redundant_expertsr<   router_logits_dtyperouted_input_transformrK   )5rC   rD   r   tp_sizerouted_scaling_factorr   device_groupep_grouprankep_ranksizeep_sizen_routed_expertsn_shared_expertsgetattruse_latent_moer\   r7   moe_hidden_sizeuse_sequence_parallel_moer<   rW   float32r   gater   	Parameteremptyri   rl   eplb_configrm   n_redundant_expertsn_logical_expertsn_physical_expertsn_local_physical_expertsphysical_expert_startphysical_expert_endr`   #moe_shared_expert_intermediate_sizer4   mlp_biasfc1_latent_projfc2_latent_projr   num_experts_per_tokmoe_intermediate_sizenorm_topk_probn_grouprf   r   mlp_hidden_actexperts)rH   r6   r9   r[   r=   rn   r8   rI   rK   rL   rD   }   s   

	




		



zNemotronHMoE.__init__hidden_statesr>   c                 C   s   |j \}}|d|}| jrt|}| |jtjd\}}| j||d\}}|j	tj
kr3|| j9 }n| jd ur?|d| j 9 }| jrI| |\}}| jd urR||7 }| jrat|d}|d | }n| jdkrl| j|}|||S )Nr^   )r   router_logitsg      ?r      )shapeviewr<   r1   r   torW   r~   r   r_   float16rq   r`   r{   r   r   rp   &maybe_all_reduce_tensor_model_parallel)rH   r   
num_tokens
hidden_dimr   rO   shared_outputfinal_hidden_statesrK   rK   rL   rP      s4   




zNemotronHMoE.forward)NNr5   )rQ   rR   rS   r3   r   r
   rV   rD   rW   rX   rP   rY   rK   rK   rI   rL   rZ   |   s    orZ   c                       t   e Zd Z					ddedededB dedB dedB dedB d	e	d
df fddZ
dejdejdB fddZ  ZS )NemotronHMLPDecoderLayerNr5   r6   	layer_idxmodel_configcache_configr9   r[   r=   r>   c                    s   t    || _|j}|d |d  dd }	t|jtr1t|jdkr+|jd }
n	|j|	 }
n|j}
t	||j
|
||j| dd| _t|j
|jd| _d S )Nr   -r   .mixer)r7   r8   r9   r:   r=   eps)rC   rD   r6   hybrid_override_patterncount
isinstancer8   listlenr4   r7   r   mixerr   layer_norm_epsilonnorm)rH   r6   r   r   r   r9   r[   r=   r   	mlp_indexr8   rI   rK   rL   rD     s$   

	z!NemotronHMLPDecoderLayer.__init__r   residualc                 K   :   |d u r|}|  |}n|  ||\}}| |}||fS rN   r   r   rH   r   r   kwargsrK   rK   rL   rP   =     
z NemotronHMLPDecoderLayer.forwardNNNNr5   rQ   rR   rS   r3   rT   r   r   r   r
   rV   rD   rW   rX   rP   rY   rK   rK   rI   rL   r     s8    	"r   c                       r   )NemotronHMoEDecoderLayerNr5   r6   r   r   r   r9   r[   r=   r>   c           
         sZ   t    || _t|dd }|r||n|}	t|	||| dd| _t|j|jd| _	d S )Nget_nemotron_h_config_for_layerr   )r9   r[   r=   r   )
rC   rD   r6   rz   rZ   r   r   r7   r   r   
rH   r6   r   r   r   r9   r[   r=   get_layer_configlayer_configrI   rK   rL   rD   N  s   

z!NemotronHMoEDecoderLayer.__init__r   r   c                 K   r   rN   r   r   rK   rK   rL   rP   h  r   z NemotronHMoEDecoderLayer.forwardr   r   rK   rK   rI   rL   r   M  s8    	r   c                       r   )NemotronHMambaDecoderLayerNr5   r6   r   r   r   r9   r[   r=   r>   c                    sp   t    || _t|j|j|j|j|j |j	|j
|j|j|j|j|j|||| dd| _t|j|jd| _d S )Nr   )r7   ssm_state_sizeconv_kernel_sizer8   use_conv_biasuse_biasn_groups	num_headshead_dimrms_norm_epsrj   r   r   r9   r=   r   )rC   rD   r6   r   r7   r   conv_kernelmamba_num_headsmamba_head_dimr   r   r   r   mamba_hidden_actr   r   r   )rH   r6   r   r   r   r9   r[   r=   rI   rK   rL   rD   y  s(   


z#NemotronHMambaDecoderLayer.__init__r   r   c                 K   s:   |d u r|}|  |}n|  ||\}}| |}||fS rN   r   )rH   r   r   r   outputrK   rK   rL   rP     r   z"NemotronHMambaDecoderLayer.forwardr   r   rK   rK   rI   rL   r   x  s8    	 r   c                       sf   e Zd Z				ddedededB dedB dedB ded	df fd
dZ	de
jd	e
jfddZ  ZS )NemotronHAttentionNr5   r6   r   r   r   r9   r=   r>   c           	   
      s^  t    |j| _t }|j| _| j| dksJ | j| | _|j| _| j|kr2| j| dks1J n	|| j dks;J t	d| j| | _
t|drS|jd urS|j| _n|j| j | _| j| j | _| j
| j | _| jd | _t|j| j| j| jd|| dd| _t| j| j |jd|| dd| _t|d	d }t| j| j| j| j
||| d
|d| _d S )Nr   r   r   g      Fz	.qkv_proj)r:   r9   r=   z.o_projsliding_windowz.attn)num_kv_headsr   r9   r=   per_layer_sliding_window)rC   rD   r7   r   num_attention_headstotal_num_headsr   num_key_value_headstotal_num_kv_headsmaxr   hasattrr   q_sizekv_sizescalingr   qkv_projr   o_projrz   r   attn)	rH   r6   r   r   r   r9   r=   rp   r   rI   rK   rL   rD     sX   
	

	
	zNemotronHAttention.__init__r   c           
      K   sN   |  |\}}|j| j| j| jgdd\}}}| |||}| |\}	}|	S )Nr   )dim)r   splitr   r   r   r   )
rH   r   r   qkvrO   qkvattn_outputr   rK   rK   rL   rP     s
    zNemotronHAttention.forward)NNNr5   )rQ   rR   rS   r3   rT   r   r   r   rV   rD   rW   rX   rP   rY   rK   rK   rI   rL   r     s2    @r   c                       sz   e Zd Z					ddedededB dedB dedB dedB d	e	d
df fddZ
dejdejdejdB fddZ  ZS )NemotronHAttentionDecoderLayerNr5   r6   r   r   r   r9   r[   r=   r>   c           
         sX   t    t|dd }|r||n|}	t|	||||| dd| _t|j|jd| _d S )Nr   r   r=   r   )	rC   rD   rz   r   r   r   r7   r   r   r   rI   rK   rL   rD     s   

	z'NemotronHAttentionDecoderLayer.__init__	positionsr   r   c                 K   s<   |d u r|}|  |}n|  ||\}}| j|d}||fS )N)r   r   )rH   r   r   r   r   rK   rK   rL   rP     s   z&NemotronHAttentionDecoderLayer.forwardr   r   rK   rK   rI   rL   r     s<    	r   )Mr   *Ec                       s   e Zd Zdddedef fddZdejdejfd	d
Z		ddejdB dejde	dB dejdB deje	B f
ddZ
defddZdeeeeeef  fddZdeeeejf  dee fddZ  ZS )NemotronHModelr5   r   vllm_configr=   c                   s   t    |jj|j|j |j|j| _j| _t	| jj
| _djv | _dtf fdd}ttj|| dd\| _| _| _tddgj
| _tj
jd	| _d S )
Nr   r=   c              	      s8   t | ddd }tj|  }|| | dS )N.r   )r6   r   r   r   r9   r[   r=   )rT   rsplitALL_DECODER_LAYER_TYPESr   )r=   r   layer_classr   r6   r   r[   r9   rK   rL   	get_layerA  s   z*NemotronHModel.__init__.<locals>.get_layerz.layersr   r   r   r   )rC   rD   r   	hf_configr   r9   r[   r6   
vocab_sizer!   r7   embed_tokensr   has_moerV   r/   r   start_layer	end_layerlayersr.   make_empty_intermediate_tensorsr   r   norm_f)rH   r   r=   r   rI   r   rL   rD   -  s*   

zNemotronHModel.__init__	input_idsr>   c                 C   s
   |  |S rN   )r   rH   r   rK   rK   rL   embed_input_idsY  s   
zNemotronHModel.embed_input_idsNr   intermediate_tensorsinputs_embedsc           	      C   s   t  jr|d ur|}n| |}d }n|d usJ |d }|d }t| j| j| jD ]}||||d\}}q*t  jsAt||dS | 	||\}}|S )Nr   r   )r   r   r   )r   r   )
r   is_first_rankr   r   r   r   r   is_last_rankr2   r   )	rH   r   r   r   r   r   r   layerrO   rK   rK   rL   rP   \  s(   
zNemotronHModel.forwardc                 C   s   t | jdd}|dur|S d}t | jdd}|rD|D ])}t|tr2|ddkr1t||dd}qt |dddkrCt|t |dd}q|S )zGet max n_routed_experts from config or block_configs for puzzle models.

        For heterogeneous models with varying expert counts per layer,
        returns the MAX to ensure all expert weights can be loaded.
        rx   Nr   block_configs
block_typemoer5   )rz   r6   r   dictgetr   )rH   rx   max_expertsr  blockrK   rK   rL   _get_max_n_routed_experts|  s"   
z(NemotronHModel._get_max_n_routed_expertsc              
   C   s0   | j rtj| ddd|  t| ddd}|S g S )NrE   rF   r5   rm   r   )ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namera   rm   )r   r   make_expert_params_mappingr
  rz   )rH   expert_params_mappingrK   rK   rL   get_expert_mapping  s   
z!NemotronHModel.get_expert_mappingweightsc              
   C   sr  g d}|   }t|  }t }|D ]\}}d|v sd|v r)t||}|d u r)q|D ].\}}	}
|	|vr5q+||	|}|drE||vrEq+t|| rKq+|| }|j}||||
  nWd}|D ]:}|\}}	}}
|	|vrkq^d}||	|}t|| ryq^|| }t	
tdtf |j}|||||
|dd}|r|} nq^|rqt|| rq|| }t|d	t}||| || q|S )
N))r   q_projr   )r   k_projr   )r   v_projr   scale
zero_pointz.biasFT.)shard_id	expert_idreturn_successweight_loader)r  r  named_parameterssetr#   replaceendswithr-   r  typingcastr   rU   rz   r"   add)rH   r  stacked_params_mappingr  params_dictloaded_paramsnameloaded_weight
param_nameweight_namer  paramr  is_expert_weightmappingr  name_mappedsuccessrK   rK   rL   load_weights  sr   




zNemotronHModel.load_weightsNN)rQ   rR   rS   r	   rV   rD   rW   rX   r   r2   rP   rT   r
  r   tupler  r   r  r.  rY   rK   rK   rI   rL   r   +  s&    ,
 ,r   c                
       st  e Zd ZU dZeed< eddiddddZd	g d
iZdddZ	dgZ
edddeejejf fddZedddeeeef eeeef f fddZedeeef fddZdddedef fddZdededdfd d!Zd"ejdejfd#d$Z		d0d"ejdB d%ejd&edB d'ejdB fd(d)Zd*ejdejdB fd+d,Zd-eeeejf  dee fd.d/Z  Z S )1NemotronHForCausalLMTis_non_gated_moebackbonemodelAr   )A_log
embeddings)orig_to_new_prefixorig_to_new_substrr   )r  r  r  input_embeddingsoutput_embeddings)r   lm_headzmtp.r   r	   r>   c                 C   s   t |jj|jj|jjS rN   )r   mamba2_state_dtyper   r_   r   mamba_cache_dtypemamba_ssm_cache_dtype)clsr   rK   rK   rL   !get_mamba_state_dtype_from_config'  s
   z6NemotronHForCausalLM.get_mamba_state_dtype_from_configc              	   C   s>   |j }|jj}|j|j }tj||j|j|j|j|j	|j
dS )a3  Calculate shapes for Mamba's convolutional and state caches.

        Args:
            vllm_config: vLLM config

        Returns:
            Tuple containing:
            - conv_state_shape: Shape for convolutional state cache
            - temporal_state_shape: Shape for state space model cache
        )r8   tp_world_sizer   r   r   
state_sizer   )r[   r   r   r   r   r   mamba2_state_shapetensor_parallel_sizer   r   r   )r@  r   r[   r   r8   rK   rK   rL   !get_mamba_state_shape_from_config2  s   z6NemotronHForCausalLM.get_mamba_state_shape_from_configc                 C   s   t  S rN   )r   mamba2_state_copy_func)r@  rK   rK   rL   get_mamba_state_copy_funcO  s   z.NemotronHForCausalLM.get_mamba_state_copy_funcr5   r   r=   c                   s  |j j}|| _|j | _ |j}|j| _t   || _|| _t|t	|dd| _
t|j|jt	|dd| _t|j| _| j
j| _| j
jrg | _|j| _g | _d }| j
jD ]}t|tre|j}| j|jj qSt| j| _|j| _ |j!| _"|j#| _$|j%| _&|j'| _(|j)| _*d S d S )Nr4  )r   r=   r<  r   )+r   r   r   scheduler_configr9   rC   rD   r6   r   r0   r4  r    r   r7   r<  r   logits_processorr   r   expert_weightsr   num_expert_groups
moe_layersr   r   r   r   appendr   r   num_moe_layersr   num_logical_expertsr   num_physical_expertsr   num_local_physical_expertsrx   num_routed_expertsry   num_shared_expertsr   rm   )rH   r   r=   r6   rI  example_moer  rI   rK   rL   rD   S  sJ   


zNemotronHForCausalLM.__init__rQ  rR  Nc                 C   sf   | j |ksJ || _|| _ || j | _| jjD ]}t|tr0|j}||_	||_
| j|_|j  qd S rN   )rR  rQ  rP  rm   r4  r   r   r   r   r   r   r   r   update_expert_map)rH   rQ  rR  r  r  rK   rK   rL    update_physical_experts_metadata  s   

z5NemotronHForCausalLM.update_physical_experts_metadatar   c                 C   s   | j |S rN   )r4  r   r   rK   rK   rL   r     s   z$NemotronHForCausalLM.embed_input_idsr   r   r   c                 K   s   |  ||||}|S rN   )r4  )rH   r   r   r   r   r   r   rK   rK   rL   rP     s   zNemotronHForCausalLM.forwardr   c                 C   s   |  | j|}|S rN   )rJ  r<  )rH   r   logitsrK   rK   rL   compute_logits  s   z#NemotronHForCausalLM.compute_logitsr  c                 C   s   t | dgd}|j|| jdS )Nmtp)skip_prefixes)mapper)r+   r.  hf_to_vllm_mapper)rH   r  loaderrK   rK   rL   r.    s   z!NemotronHForCausalLM.load_weightsr/  )!rQ   rR   rS   r2  rU   __annotations__r,   r]  packed_modules_mappingembedding_moduleslora_skip_prefixesclassmethodr0  rW   r_   rA  rT   rF  r   rH  r	   rV   rD   rW  rX   r   r2   rP   rY  r   r  r.  rY   rK   rK   rI   rL   r1    sh   
 

2


,r1  )X__doc__r  collections.abcr   r   	itertoolsr   rW   r   vllm.compilation.decoratorsr   vllm.configr   r   r	   vllm.config.parallelr
   vllm.distributedr   r   !vllm.distributed.communication_opr   vllm.distributed.parallel_stater   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.attentionr   $vllm.model_executor.layers.fused_moer   r   *vllm.model_executor.layers.fused_moe.utilsr   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   r   +vllm.model_executor.layers.logits_processorr   -vllm.model_executor.layers.mamba.mamba_mixer2r   ,vllm.model_executor.layers.mamba.mamba_utilsr   r   r   r   'vllm.model_executor.layers.quantizationr   3vllm.model_executor.layers.vocab_parallel_embeddingr    r!   -vllm.model_executor.model_loader.weight_utilsr"   r#   %vllm.model_executor.models.interfacesr$   r%   r&   r'   r(   r)   r*    vllm.model_executor.models.utilsr+   r,   r-   r.   r/   r0   r1   vllm.sequencer2   vllm.transformers_utils.configsr3   Moduler4   rZ   r   r   r   r   r   r   r   r1  rK   rK   rK   rL   <module>   sl   $$		( 3+1M. 
Y