"""Inference-only LLaMA model compatible with HuggingFace weights."""

from collections.abc import Iterable
from itertools import islice

import torch
from torch import nn
from transformers import LlamaConfig

from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.attention import Attention, EncoderOnlyAttention
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (
    MergedColumnParallelLinear,
    QKVParallelLinear,
    RowParallelLinear,
)
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.vocab_parallel_embedding import (
    ParallelLMHead,
    VocabParallelEmbedding,
)
from vllm.model_executor.model_loader.weight_utils import (
    default_weight_loader,
    maybe_remap_kv_scale_name,
)
from vllm.sequence import IntermediateTensors
from vllm.v1.attention.backend import AttentionType

from .adapters import as_embedding_model, as_seq_cls_model
from .interfaces import SupportsEagle, SupportsEagle3, SupportsLoRA, SupportsPP
from .utils import (
    AutoWeightsLoader,
    PPMissingLayer,
    extract_layer_index,
    is_pp_missing_parameter,
    make_empty_intermediate_tensors_factory,
    make_layers,
    maybe_prefix,
)
edededdf fddZdd Z  Z	S )LlamaMLPNF Thidden_sizeintermediate_size
hidden_actquant_configbiasprefixreduce_results
disable_tpreturnc	           	   	      sn   t    t||gd |||| dd| _t||||||| dd| _|dkr1td| dt | _d S )	N   .gate_up_proj)
input_sizeoutput_sizesr/   r.   r2   r0   z
.down_proj)r6   output_sizer/   r.   r1   r2   r0   siluzUnsupported activation: z!. Only silu is supported for now.)	super__init__r   gate_up_projr   	down_proj
ValueErrorr   act_fn)	selfr+   r,   r-   r.   r/   r0   r1   r2   	__class__ V/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/model_executor/models/llama.pyr;   Q   s.   
	
zLlamaMLP.__init__c                 C   s*   |  |\}}| |}| |\}}|S N)r<   r?   r=   )r@   x_rC   rC   rD   forwardt   s   
zLlamaMLP.forward)NFr*   TF)
__name__
__module____qualname__intstrr   boolr;   rH   __classcell__rC   rC   rA   rD   r)   P   s4    	
#r)   c                       s   e Zd Zddddddejfdedededed	ed
edB dedede	dB de
de
ddf fddZdejdejdejfddZded
edB ddfddZ  ZS )LlamaAttention    NFr*   configr+   	num_headsnum_kv_headsmax_position_embeddingsr.   r/   bias_o_projcache_configr0   	attn_typer3   c                    s  t    t|
}|| _t }|| _| j| dksJ | j| | _|| _| j|kr3| j| dks2J n	|| j dks<J td| j| | _	t
|dd }|pR| j| j | _| j| j | _| j	| j | _| jd | _|| _t|| j| j| j|||
 dd| _t| j| j ||||
 dd| _| j||d	 d }t
|d
d  }rt|dr||j }n|}|t|k sJ d| d| || dk}|r|j}|tjkrtnt}|| j| j| j| j	|	||||
 dd	| _d S )Nr   r   head_dimg      	.qkv_proj)r+   	head_sizetotal_num_headstotal_num_kv_headsr/   r.   r0   z.o_proj)r6   r8   r/   r.   r0   r.   layer_typestarget_layer_countzeffective_layer_idx: z# is out of bounds for layer_types: sliding_attentionz.attn)rT   rW   r.   per_layer_sliding_windowrX   r0   )r:   r;   r$   r+   r
   r\   rS   r]   maxrT   getattrrY   q_sizekv_sizescalingrU   r   qkv_projr   o_proj_init_rotary_embhasattrr`   lensliding_windowr   ENCODER_ONLYr   r   attn)r@   rR   r+   rS   rT   rU   r.   r/   rV   rW   r0   rX   	layer_idxtp_sizerY   rm   r_   effective_layer_idx
is_slidingattn_clsrA   rC   rD   r;   |   s~   





zLlamaAttention.__init__	positionshidden_statesc           
      C   s`   |  |\}}|j| j| j| jgdd\}}}| |||\}}| |||}| |\}	}|	S )N)dim)rh   splitre   rf   
rotary_embro   ri   )
r@   ru   rv   qkvrG   qkvattn_outputoutputrC   rC   rD   rH      s    zLlamaAttention.forwardc                 C   sH   d}|o	|  dk}|r|jdkrd}t| j| jt|dd |d| _d S )NTggufllamaFrope_parameters)max_positionr   is_neox_style)get_name
model_typer   rY   rU   rd   rz   )r@   rR   r.   r   is_ggufrC   rC   rD   rj      s   
zLlamaAttention._init_rotary_emb)rI   rJ   rK   r   DECODERr   rL   r   rN   r   rM   r;   torchTensorrH   rj   rO   rC   rC   rA   rD   rP   {   s^    	
b
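
# Worked example of the tensor-parallel head bookkeeping above (numbers are
# illustrative): with num_heads=32, num_kv_heads=8 and tp_size=4, each rank
# holds 32 // 4 = 8 query heads and 8 // 4 = 2 KV heads. With tp_size=16 >
# num_kv_heads=8, KV heads are instead replicated: max(1, 8 // 16) = 1 per
# rank, and the assert requires 16 % 8 == 0 so replication divides evenly.
#
#     tp_size, num_heads, num_kv_heads = 4, 32, 8
#     assert num_heads // tp_size == 8              # query heads per rank
#     assert max(1, num_kv_heads // tp_size) == 2   # KV heads per rank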
class LlamaDecoderLayer(nn.Module):
    def __init__(
        self,
        vllm_config: VllmConfig,
        prefix: str = "",
        config: LlamaConfig | None = None,
        attn_layer_type: type[nn.Module] = LlamaAttention,
    ) -> None:
        super().__init__()
        config = config or vllm_config.model_config.hf_config
        cache_config = vllm_config.cache_config
        quant_config = self.get_quant_config(vllm_config)
        self.hidden_size = config.hidden_size
        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
        # Some HF configs expose the attention bias as `attention_bias`,
        # others as `bias`; a few use a separate `qkv_bias` for the QKV
        # projection only.
        attention_bias = getattr(config, "attention_bias", False) or getattr(
            config, "bias", False
        )
        bias_o_proj = attention_bias
        if hasattr(config, "qkv_bias"):
            attention_bias = config.qkv_bias
        if getattr(config, "is_causal", True):
            attn_type = AttentionType.DECODER
        else:
            attn_type = AttentionType.ENCODER_ONLY
        self.self_attn = attn_layer_type(
            config=config,
            hidden_size=self.hidden_size,
            num_heads=config.num_attention_heads,
            num_kv_heads=getattr(
                config, "num_key_value_heads", config.num_attention_heads
            ),
            max_position_embeddings=max_position_embeddings,
            quant_config=quant_config,
            bias=attention_bias,
            bias_o_proj=bias_o_proj,
            cache_config=cache_config,
            prefix=f"{prefix}.self_attn",
            attn_type=attn_type,
        )
        self.mlp = LlamaMLP(
            hidden_size=self.hidden_size,
            intermediate_size=config.intermediate_size,
            hidden_act=config.hidden_act,
            quant_config=quant_config,
            bias=getattr(config, "mlp_bias", False),
            prefix=f"{prefix}.mlp",
        )
        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = RMSNorm(
            config.hidden_size, eps=config.rms_norm_eps
        )

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        residual: torch.Tensor | None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        # Self attention
        if residual is None:
            residual = hidden_states
            hidden_states = self.input_layernorm(hidden_states)
        else:
            hidden_states, residual = self.input_layernorm(hidden_states, residual)
        hidden_states = self.self_attn(
            positions=positions, hidden_states=hidden_states
        )
        # Fully connected
        hidden_states, residual = self.post_attention_layernorm(
            hidden_states, residual
        )
        hidden_states = self.mlp(hidden_states)
        return hidden_states, residual

    def get_quant_config(self, vllm_config: VllmConfig) -> QuantizationConfig | None:
        """Get quantization config for this layer. Override in subclasses."""
        return vllm_config.quant_config
def llama_model_invariants(input_ids, positions, intermediate_tensors, inputs_embeds):
    """Shape invariants for Llama model compilation; these are translated to
    runtime assertions for unbacked dynamic shapes and are compiled away for
    backed shapes."""
    if input_ids is not None:
        torch._check(input_ids.size(0) == positions.size(0))
jd	e
jfd
dZ	dde
jdB de
jdedB de
jdB d	e
jeB ee
jee
j f B f
ddZdeeee
jf  d	ee fddZ  ZS )
LlamaModelr*   r0   
layer_typer   r0   r   c                   s   t    jj}j}|| _|| _|j| _t js!|j	r,t j
r,t| j|j|d| _nt | _t|j fdd| dd\| _| _| _t j
rTt|j|jd| _nt | _ttdf  | _tdd	g|j| _d S )
Nr^   c                    s    | dS )N)r   r0   rC   r0   r   r   rC   rD   <lambda>  s    z%LlamaModel.__init__.<locals>.<lambda>z.layersr   r   .rv   r   )r:   r;   r   r   r.   rR   
vocab_sizer	   is_first_ranktie_word_embeddingsis_last_rankr   r+   embed_tokensr#   r'   num_hidden_layersstart_layer	end_layerlayersr   r   normr   rL   aux_hidden_state_layersr&   make_empty_intermediate_tensors)r@   r   r0   r   rR   r.   rA   r   rD   r;   c  s:   



zLlamaModel.__init__r   r3   c                 C   s
   |  |S rE   )r   r@   r   rC   rC   rD   embed_input_ids  s   
zLlamaModel.embed_input_idsNru   r   r   c                 K   s   t  jr|d ur|}n| |}d }n|d usJ |d }|d }g }tt| j| j| jD ]\}	}
|	| jv r>|	||  |
|||fi |\}}q.t  j
sVt||dS | ||\}}t|dkrh||fS |S )Nrv   r   )rv   r   r   )r	   r   r   	enumerater   r   r   r   r   appendr   r   r   rl   )r@   r   ru   r   r   extra_layer_kwargsrv   r   aux_hidden_statesidxlayerrG   rC   rC   rD   rH     s6   

zLlamaModel.forwardweightsc                 C   sr  g d}t |  }t }|D ]\}}d|v rqd|v s d|v r!q| jd urO| j| }rO|| }t|dt}	| dkr@|n|d }|	|| || qd|v sWd|v rat	||}|d u raq|D ].\}
}}||vrmqc|
||
}|d	r}||vr}qct|| rqc|| }|j}	|	|||  n|d	r||vrqt|| rq|| }t|dt}	|	|| || q|S )
N))rZ   z.q_projr|   )rZ   z.k_projr}   )rZ   z.v_projr~   )r5   z
.gate_projr   )r5   z.up_projr   zrotary_emb.inv_freqzrotary_emb.cos_cachedzrotary_emb.sin_cachedweight_loaderr   scale
zero_pointz.bias)dictnamed_parameterssetr.   get_cache_scalerd   r   rx   addr   replaceendswithr%   r   )r@   r   stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
scale_nameparamr   
param_nameweight_nameshard_idrC   rC   rD   load_weights  sV   






zLlamaModel.load_weightsrE   )rI   rJ   rK   r   r   rM   r   r   r   r;   r   r   r   r   r   listrH   r   r   r   rO   rC   rC   rA   rD   r   ]  s0    
+
class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle, SupportsEagle3):
    packed_modules_mapping = {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
    }
    embedding_modules = {
        "embed_tokens": "input_embeddings",
        "lm_head": "output_embeddings",
    }

    def __init__(
        self,
        *,
        vllm_config: VllmConfig,
        prefix: str = "",
        layer_type: type[nn.Module] = LlamaDecoderLayer,
    ):
        super().__init__()
        config = vllm_config.model_config.hf_config
        quant_config = vllm_config.quant_config
        self.config = config
        self.model = self._init_model(
            vllm_config=vllm_config,
            prefix=maybe_prefix(prefix, "model"),
            layer_type=layer_type,
        )
        if get_pp_group().is_last_rank:
            self.lm_head = ParallelLMHead(
                config.vocab_size,
                config.hidden_size,
                quant_config=quant_config,
                prefix=maybe_prefix(prefix, "lm_head"),
            )
            if config.tie_word_embeddings:
                self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens)
            logit_scale = getattr(config, "logit_scale", 1.0)
            self.logits_processor = LogitsProcessor(
                config.vocab_size, scale=logit_scale
            )
        else:
            self.lm_head = PPMissingLayer()
        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors
        )

    def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None:
        self.model.aux_hidden_state_layers = layers

    def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]:
        """Override to return default layers for Llama.

        Note: The GPU model runner will override this with layers from
        the speculative config if available, providing dynamic configuration.
        """
        num_layers = len(self.model.layers)
        return (2, num_layers // 2, num_layers - 3)

    def _init_model(
        self,
        vllm_config: VllmConfig,
        prefix: str = "",
        layer_type: type[nn.Module] = LlamaDecoderLayer,
    ):
        return LlamaModel(
            vllm_config=vllm_config, prefix=prefix, layer_type=layer_type
        )

    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
        return self.model.embed_input_ids(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor | None,
        positions: torch.Tensor,
        intermediate_tensors: IntermediateTensors | None = None,
        inputs_embeds: torch.Tensor | None = None,
    ) -> torch.Tensor | IntermediateTensors:
        model_output = self.model(
            input_ids, positions, intermediate_tensors, inputs_embeds
        )
        return model_output

    def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor | None:
        logits = self.logits_processor(self.lm_head, hidden_states)
        return logits

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
        loader = AutoWeightsLoader(
            self,
            skip_prefixes=(
                ["lm_head."] if self.config.tie_word_embeddings else None
            ),
        )
        return loader.load_weights(weights)
class LlamaBidirectionalForSequenceClassification(
    as_seq_cls_model(LlamaForCausalLM)
):
    pass


class LlamaBidirectionalModel(as_embedding_model(LlamaForCausalLM)):
    pass
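
# Example (illustrative): this module is normally loaded through vLLM's
# public API rather than imported directly; the checkpoint name below is a
# placeholder for any HF-format Llama model.
#
#     from vllm import LLM, SamplingParams
#
#     llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct")
#     outputs = llm.generate(
#         ["Hello, my name is"], SamplingParams(max_tokens=16)
#     )
#     print(outputs[0].outputs[0].text)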