o
    
۾i F                     @   s  d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ ddl	m
Z
mZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZ ddlmZ ddlmZ ddlmZmZ ddl m!Z!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z(m)Z) ddl*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0 dd Z1G dd dej2Z3G dd dej4Z5G dd dej4Z6G dd dej4Z7eG d d! d!ej4Z8G d"d# d#ej4e(e)Z9dS )$zBInference-only Nemotron model compatible with HuggingFace weights.    )Iterable)isliceN)nn)support_torch_compile)CacheConfig
VllmConfig)get_pp_group$get_tensor_model_parallel_world_size)
get_act_fn)	Attention)ColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loadermaybe_remap_kv_scale_name)IntermediateTensors)NemotronConfig   )SupportsLoRA
SupportsPP)AutoWeightsLoaderPPMissingLayeris_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                  G   s$   t  s| S t jjj| dt  dS )Ncuda)device_typedtype)torchis_autocast_enabledampautocast_mode_castget_autocast_gpu_dtype)args r+   W/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/nemotron.py_cast_if_autocast_enabledJ   s
   
r-   c                	       sl   e Zd Z					ddeee B ejB dededef fdd	Z		dd
ej
dej
dB dej
fddZ  ZS )NemotronLayerNorm1Ph㈵>TNnormalized_shapeepselementwise_affinebiasc                    s   t  |||||| d S N)super__init__)selfr0   r1   r2   r3   devicer#   	__class__r+   r,   r6   T   s   	zNemotronLayerNorm1P.__init__xresidualreturnc                 C   s   |d ur
|| }|}t || j| jd | j| j}tjjddd tjj	j
| }|d u r.|n||fW  d    S 1 s<w   Y  d S )Nr   r!   F)enabled)r-   r0   weightr3   r1   r$   r&   autocastr   
functional
layer_norm)r7   r;   r<   r*   r+   r+   r,   forward_   s   $zNemotronLayerNorm1P.forward)r/   TTNNr4   )__name__
__module____qualname__intlistr$   Sizefloatboolr6   TensorrC   __classcell__r+   r+   r9   r,   r.   S   s.    r.   c                       sN   e Zd Z			ddededededB ded	ed
df fddZdd Z  Z	S )NemotronMLPNF hidden_sizeintermediate_size
hidden_actquant_configr3   prefixr=   c                    sL   t    t||||| dd| _t||||| dd| _t|| _d S )Nz.up_proj
input_sizeoutput_sizer3   rS   rT   z
.down_proj)r5   r6   r   up_projr   	down_projr
   act_fn)r7   rP   rQ   rR   rS   r3   rT   r9   r+   r,   r6   p   s    
	zNemotronMLP.__init__c                 C   s*   |  |\}}| |}| |\}}|S r4   )rX   rZ   rY   )r7   r;   up_r+   r+   r,   rC      s   
zNemotronMLP.forward)NFrO   )
rD   rE   rF   rG   strr   rK   r6   rC   rM   r+   r+   r9   r,   rN   o   s(    rN   c                       sv   e Zd Z					ddedededed	ed
edB dededB deddf fddZ	de
jde
jde
jfddZ  ZS )NemotronAttention    NFrO   configrP   	num_headsnum_kv_headsmax_position_embeddingsrS   r3   cache_configrT   r=   c
              	      sZ  t    || _t }
|| _| j|
 dksJ | j|
 | _|| _| j|
kr/| j|
 dks.J n	|
| j dks8J td| j|
 | _t	|dd | _
| j
d u rT| j| j | _
| j| j
 | _| j| j
 | _| j
d | _|| _t|| j
| j| j|||	 dd| _t| j| j
 ||||	 dd| _t| j
||jd	| _t| j| j
| j| j|||	 d
d| _d S )Nr   r   head_dimg      	.qkv_proj)rP   	head_sizetotal_num_headstotal_num_kv_headsr3   rS   rT   z.o_projrU   )max_positionrope_parametersz.attn)rb   rd   rS   rT   )r5   r6   rP   r	   rh   ra   ri   maxrb   getattrre   q_sizekv_sizescalingrc   r   qkv_projr   o_projr   rk   
rotary_embr   attn)r7   r`   rP   ra   rb   rc   rS   r3   rd   rT   tp_sizer9   r+   r,   r6      s`   


	
zNemotronAttention.__init__	positionshidden_statesc           
      C   s`   |  |\}}|j| j| j| jgdd\}}}| |||\}}| |||}| |\}	}|	S )N)dim)rq   splitrn   ro   rs   rt   rr   )
r7   rv   rw   qkvr\   qkvattn_outputoutputr+   r+   r,   rC      s    zNemotronAttention.forward)r_   NFNrO   )rD   rE   rF   r   rG   r   rK   r   r]   r6   r$   rL   rC   rM   r+   r+   r9   r,   r^      sD    	
Er^   c                       sr   e Zd Z			ddededB dedB deddf
 fdd	Zd
ej	dej	dej	dB de
ej	ej	f fddZ  ZS )NemotronDecoderLayerNrO   r`   rd   rS   rT   r=   c                    s   t    |j| _t|dd}t|ddpt|dd}t|| j|jt|d|j||||| dd	| _t| j|j|j	|t|d	d| d
d| _
t|j|jd| _t|j|jd| _d S )Nrc   r_   attention_biasFr3   num_key_value_headsz
.self_attn)	r`   rP   ra   rb   rc   rS   r3   rd   rT   mlp_biasz.mlp)rP   rQ   rR   rS   r3   rT   r1   )r5   r6   rP   rm   r^   num_attention_heads	self_attnrN   rQ   rR   mlpr.   norm_epsinput_layernormpost_attention_layernorm)r7   r`   rd   rS   rT   rc   r   r9   r+   r,   r6      sB   

zNemotronDecoderLayer.__init__rv   rw   r<   c                 C   sX   |d u r|}|  |}n|  ||\}}| j||d}| ||\}}| |}||fS )N)rv   rw   )r   r   r   r   )r7   rv   rw   r<   r+   r+   r,   rC     s   
zNemotronDecoderLayer.forward)NNrO   )rD   rE   rF   r   r   r   r]   r6   r$   rL   tuplerC   rM   r+   r+   r9   r,   r      s0    +r   c                       s   e Zd Zdddedef fddZdejdejfd	d
Z	ddejdB dejde	dB dejdB deje	B f
ddZ
deeeejf  dee fddZ  ZS )NemotronModelrO   rT   vllm_configrT   c                   s   t    |jj|j |j| _| _j| _t j	s$j
r-t jr-t| jj| _nt | _tj fdd| dd\| _| _| _t jrVtjjd| _nt | _tddgj| _d S )Nc                    s   t  | dS )N)r`   rd   rS   rT   )r   r   rd   r`   rS   r+   r,   <lambda>@  s    z(NemotronModel.__init__.<locals>.<lambda>z.layersr   r   rw   r<   )r5   r6   model_config	hf_configrd   rS   r`   
vocab_sizer   is_first_ranktie_word_embeddingsis_last_rankr   rP   embed_tokensr   r   num_hidden_layersstart_layer	end_layerlayersr.   r   normr   make_empty_intermediate_tensors)r7   r   rT   r9   r   r,   r6   )  s8   



zNemotronModel.__init__	input_idsr=   c                 C   s
   |  |S r4   )r   r7   r   r+   r+   r,   embed_input_idsP  s   
zNemotronModel.embed_input_idsNrv   intermediate_tensorsinputs_embedsc           	      C   s   t  jr|d ur|}n| |}d }n|d usJ |d }|d }t| j| j| jD ]
}||||\}}q*t  js@t||dS | 	||\}}|S )Nrw   r<   )rw   r<   )
r   r   r   r   r   r   r   r   r   r   )	r7   r   rv   r   r   rw   r<   layerr\   r+   r+   r,   rC   S  s    
zNemotronModel.forwardweightsc                 C   sF  g d}t |  }t }|D ]\}}| jd urA| j| }rA|| }t|dt}	| dkr2|n|d }|	|| || q|D ].\}
}}||vrMqC|	||
}|
dr]||vr]qCt|| rcqC|| }|j}	|	|||  n)|
dr|||vr|qt||}|d u rqt|| rq|| }t|dt}	|	|| || q|S )N))rf   z.q_projr|   )rf   z.k_projr}   )rf   z.v_projr~   weight_loaderr   z.bias)dictnamed_parameterssetrS   get_cache_scalerm   r   ry   addreplaceendswithr   r   r   )r7   r   stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
scale_nameparamr   
param_nameweight_nameshard_idr+   r+   r,   load_weightsp  sL   






zNemotronModel.load_weightsr4   )rD   rE   rF   r   r]   r6   r$   rL   r   r   rC   r   r   r   r   rM   r+   r+   r9   r,   r   '  s     '
,r   c                       s   e Zd Zdg diZdddZddded	ef fd
dZdej	dej	fddZ
		ddej	dB dej	dedB dej	dB dej	eB f
ddZdej	dej	dB fddZdeeeej	f  dee fddZ  ZS )NemotronForCausalLMrq   )q_projk_projv_projinput_embeddingsoutput_embeddings)r   lm_headrO   r   r   rT   c                   s   t    |jj}|j}t|tsJ || _|| _t|t	|dd| _
t jrNt|j|j|t	|dd| _|jr?| j
jj| j_t|dd}t|j|d| _nt | _| j
j| _d S )Nmodel)r   rT   r   )rS   rT   logit_scaleg      ?)scale)r5   r6   r   r   rS   
isinstancer   r`   r   r    r   r   r   r   r   rP   r   r   r   r?   rm   r   logits_processorr   r   )r7   r   rT   r`   rS   r   r9   r+   r,   r6     s2   


zNemotronForCausalLM.__init__r   r=   c                 C   s   | j |S r4   )r   r   r   r+   r+   r,   r     s   z#NemotronForCausalLM.embed_input_idsNrv   r   r   c                 C   s   |  ||||}|S r4   )r   )r7   r   rv   r   r   model_outputr+   r+   r,   rC     s   zNemotronForCausalLM.forwardrw   c                 C   s   |  | j|}|S r4   )r   r   )r7   rw   logitsr+   r+   r,   compute_logits  s   z"NemotronForCausalLM.compute_logitsr   c                 C   s   t | }||S r4   )r   r   )r7   r   loaderr+   r+   r,   r     s   
z NemotronForCausalLM.load_weights)NN)rD   rE   rF   packed_modules_mappingembedding_modulesr   r]   r6   r$   rL   r   r   rC   r   r   r   r   r   rM   r+   r+   r9   r,   r     s6    
#

,r   ):__doc__collections.abcr   	itertoolsr   r$   r   vllm.compilation.decoratorsr   vllm.configr   r   vllm.distributedr   r	   %vllm.model_executor.layers.activationr
   $vllm.model_executor.layers.attentionr   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   r   vllm.sequencer   vllm.transformers_utils.configsr   
interfacesr   r   utilsr   r   r   r   r   r    r-   	LayerNormr.   ModulerN   r^   r   r   r   r+   r+   r+   r,   <module>   s:    	"SC 