o
    
۾iB                     @   s  d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZmZ ddlmZmZ ddlmZm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z&m'Z'm(Z( ddl)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/ de0de1de1fddZ2de1de1de1fddZ3G dd  d eZ4G d!d" d"ej5Z6e
G d#d$ d$ej5Z7G d%d& d&ej5e'e(e&Z8dS )'z>Inference-only deci model compatible with HuggingFace weights.    )Iterable)isliceN)nn)LlamaConfig)support_torch_compile)CacheConfig
VllmConfig)get_pp_group)RMSNorm)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loadermaybe_remap_kv_scale_name)LlamaAttentionLlamaMLP)IntermediateTensors)AttentionType   )HasNoOpsSupportsLoRA
SupportsPP)AutoWeightsLoaderPPMissingLayeris_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixffn_multn_embdreturnc                 C   s   t d|  | d }t|dS )N         )int_find_multiple)r    r!   intermediate_size r)   [/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/nemotron_nas.py_ffn_mult_to_intermediate_size@   s   
r+   nkc                 C   s    | | dkr| S | | | |  S )Nr   r)   )r,   r-   r)   r)   r*   r'   F   s   r'   c                       s~   e Zd Zddddddejfdedededed	ed
edB dedede	dB de
de
ddf fddZd
edB ddfddZ  ZS )DeciLMAttention    NF confighidden_size	num_headsnum_kv_headsmax_position_embeddingsquant_configbiasbias_o_projcache_configprefix	attn_typer"   c                    s$   t  |||||||||	|
| d S N)super__init__)selfr1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   	__class__r)   r*   r>   N   s   zDeciLMAttention.__init__c                 C   s4   d}t |dr|jdv}t| j| j|j|d| _d S )NTposition_embedding_type)mistral_yarnrope_llama4)max_positionrope_parametersis_neox_style)hasattrrB   r   head_dimr5   rF   
rotary_emb)r?   r1   r6   rG   r)   r)   r*   _init_rotary_embj   s   

z DeciLMAttention._init_rotary_emb)__name__
__module____qualname__r   DECODERr   r&   r   boolr   strr>   rK   __classcell__r)   r)   r@   r*   r.   M   sL    	
r.   c                       sv   e Zd Z			ddedededB dedB deddf fd	d
Zde	j
de	j
de	j
dB dee	j
e	j
f fddZ  ZS )DeciLMDecoderLayerNr0   r1   	layer_idxr9   r6   r:   r"   c                    sB  t    |j| }|jj| _|jj| _|j| _t	|dd}t	|ddp)t	|dd}|}	t
|dr4|j}| js[|j|jj }
t|| j|j|
||||	|| dd
| _t|j|jd	| _| jst
|jd
ro|jj}t||j}n|jj}t
|jdr~|jj}n|j}t| j|||t	|dd| dd| _t|j|jd	| _d S d S )Nr5   r/   attention_biasFr7   qkv_biasz
.self_attn)
r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   epsr    
hidden_actmlp_biasz.mlp)r2   r(   rY   r6   r7   r:   )r=   r>   block_configs	attentionno_op_is_no_op_attentionffn_is_no_op_ffnr2   getattrrH   rV   num_attention_headsn_heads_in_groupr.   	self_attnr
   rms_norm_epsinput_layernormr    r+   r(   rY   r   mlppost_attention_layernorm)r?   r1   rT   r9   r6   r:   block_configr5   rU   r8   r4   r    r(   rY   r@   r)   r*   r>      sd   






zDeciLMDecoderLayer.__init__	positionshidden_statesresidualc                 C   sf   | j rn|d u r|}| |}n| ||\}}| j||d}| js/| ||\}}| |}||fS )N)rj   rk   )r^   rf   rd   r`   rh   rg   )r?   rj   rk   rl   r)   r)   r*   forward   s    
zDeciLMDecoderLayer.forward)NNr0   )rL   rM   rN   r   r&   r   r   rQ   r>   torchTensortuplerm   rR   r)   r)   r@   r*   rS      s4    ErS   c                       s   e Zd Zdeddededee f fddZdej	d	ej	fd
dZ
	ddej	dB dej	dedB dej	dB d	ej	eB f
ddZdeeeej	f  d	ee fddZ  ZS )	DeciModelr0   )r:   
layer_typevllm_configr:   rr   c                   s   t    |jj|j |j| _| _j| _j	| _	t
 js(jr3t
 jr3t| j	jd| _nt | _dtf fdd}tj|| dd\| _| _| _t
 jrbtjjd| _nt | _tdd	gj| _d S )
N)r6   r:   c                    s&   t | ddd }| | dS )N.r   r6   r:   )r&   rsplit)r:   rT   r9   r1   rr   r6   r)   r*   	get_layer  s   z%DeciModel.__init__.<locals>.get_layerz.layersr:   rW   rk   rl   )r=   r>   model_config	hf_configr9   r6   r1   pad_token_idpadding_idx
vocab_sizer	   is_first_ranktie_word_embeddingsis_last_rankr   r2   embed_tokensr   rQ   r   num_hidden_layersstart_layer	end_layerlayersr
   re   normr   make_empty_intermediate_tensors)r?   rs   r:   rr   rx   r@   rw   r*   r>      s>   




zDeciModel.__init__	input_idsr"   c                 C   s
   |  |S r<   )r   r?   r   r)   r)   r*   embed_input_ids  s   
zDeciModel.embed_input_idsNrj   intermediate_tensorsinputs_embedsc           
      C   s   t  jr|d ur|}n| |}d }n|d usJ |d }|d }d}t| j| j| jD ]}|js>||||\}}|d7 }q,||||\}}q,t  jsRt	||dS | 
||\}}	|S )Nrk   rl   r   r   )rk   rl   )r	   r   r   r   r   r   r   r^   r   r   r   )
r?   r   rj   r   r   rk   rl   kv_cache_indexlayer_r)   r)   r*   rm     s(   

zDeciModel.forwardweightsc                 C   sr  g d}t |  }t }|D ]\}}d|v rqd|v s d|v r!q| jd urO| j| }rO|| }t|dt}	| dkr@|n|d }|	|| || qd|v sWd|v rat	||}|d u raq|D ].\}
}}||vrmqc|
||
}|d	r}||vr}qct|| rqc|| }|j}	|	|||  n|d	r||vrqt|| rq|| }t|dt}	|	|| || q|S )
N))	.qkv_projz.q_projq)r   z.k_projr-   )r   z.v_projv).gate_up_projz
.gate_projr   )r   z.up_projr   zrotary_emb.inv_freqzrotary_emb.cos_cachedzrotary_emb.sin_cachedweight_loaderr   scale
zero_pointz.bias)dictnamed_parameterssetr6   get_cache_scalera   r   dimaddr   replaceendswithr   r   )r?   r   stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
scale_nameparamr   
param_nameweight_nameshard_idr)   r)   r*   load_weights@  sV   






zDeciModel.load_weightsr<   )rL   rM   rN   rS   r   rQ   typer>   rn   ro   r   r   rm   r   rp   r   r   rR   r)   r)   r@   r*   rq      s0    6
,"rq   c                       s  e Zd Zg dddgdZdddZdd	d
dddddddddddddZdddedef fddZd.dedefddZ	de
jde
jfd d!Z	"	"d/de
jd"B d#e
jd$ed"B d%e
jd"B de
jeB f
d&d'Zd(e
jde
jd"B fd)d*Zd+eeee
jf  dee fd,d-Z  ZS )0DeciLMForCausalLM)q_projk_projv_proj	gate_projup_proj)qkv_projgate_up_projinput_embeddingsoutput_embeddings)r   lm_headzmodel.layersrd   r   r   r   o_projrf   rg   	down_projrh   zmodel.embed_tokensr   z
model.norm)r   r\   wqwkwvwoattention_normfeed_forwardw1w2w3ffn_normtok_embeddingsoutputr   r0   ry   rs   r:   c                   s   t    |jj}|j}|| _| j|t|dd| _t	 j
rGt|j|j|t|dd| _|jr8| j| jj| _t|dd}t|j|d| _nt | _| jj| _d S )Nmodelrs   r:   r   ru   logit_scaleg      ?)r   )r=   r>   rz   r{   r6   r1   _init_modelr   r   r	   r   r   r~   r2   r   r   tie_weightsr   ra   r   logits_processorr   r   )r?   rs   r:   r1   r6   r   r@   r)   r*   r>     s.   


zDeciLMForCausalLM.__init__c                 C   s   t ||dS )Nr   )rq   )r?   rs   r:   r)   r)   r*   r        zDeciLMForCausalLM._init_modelr   r"   c                 C   s   | j |S r<   )r   r   r   r)   r)   r*   r     r   z!DeciLMForCausalLM.embed_input_idsNrj   r   r   c                 C   s   |  ||||}|S r<   )r   )r?   r   rj   r   r   model_outputr)   r)   r*   rm     s   zDeciLMForCausalLM.forwardrk   c                 C   s   |  | j|}|S r<   )r   r   )r?   rk   logitsr)   r)   r*   compute_logits  s   z DeciLMForCausalLM.compute_logitsr   c                 C   s$   t | | jjr	dgnd d}||S )Nzlm_head.)skip_prefixes)r   r1   r   r   )r?   r   loaderr)   r)   r*   r     s
   
zDeciLMForCausalLM.load_weights)r0   )NN)rL   rM   rN   packed_modules_mappingembedding_modulesmistral_mappingr   rQ   r>   r   rn   ro   r   r   rm   r   r   rp   r   r   rR   r)   r)   r@   r*   r     sZ     

,r   )9__doc__collections.abcr   	itertoolsr   rn   r   transformersr   vllm.compilation.decoratorsr   vllm.configr   r   vllm.distributedr	   $vllm.model_executor.layers.layernormr
   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   r    vllm.model_executor.models.llamar   r   vllm.sequencer   vllm.v1.attention.backendr   
interfacesr   r   r   utilsr   r   r   r   r   r   floatr&   r+   r'   r.   ModulerS   rq   r   r)   r)   r)   r*   <module>   s8    
2d 