o
    پiau                     @   s  d Z ddlZddlmZmZmZmZmZmZm	Z	 ddl
Z
ddl
mZ ddlmZ ddlmZmZmZ ddlmZ ddlmZ dd	lmZmZmZ dd
lmZmZ ddlmZmZ ddl m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z'm(Z( ddl)m*Z*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0m1Z1m2Z2 ddl3m4Z4 ddl5m6Z6m7Z7m8Z8 ddl9m:Z: e;e<Z=e7 Z>e>rddl?m@Z@ G dd dejAZBG dd dejAZCG dd dejAZDG dd dejAZEG dd  d ejAZFG d!d" d"eFZGG d#d$ d$eFZHG d%d& d&eFZIeFeGeHeIgZJdS )'z?Inference-only LLaMA model compatible with HuggingFace weights.    N)AnyDictIterableListOptionalTupleUnion)nn)LlamaConfig)get_pp_groupget_tensor_model_parallel_rank$get_tensor_model_parallel_world_size)
SiluAndMul)RMSNorm)MergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessorLogitsProcessorOutput)PoolerPoolingType)QuantizationConfig)RadixAttention)get_rope)PPMissingLayerget_layer_id)ParallelLMHeadVocabParallelEmbedding)ForwardBatchPPProxyTensors)default_weight_loaderkv_cache_scales_loadermaybe_remap_kv_scale_name)get_global_server_args)
add_prefixis_npumake_layers)get_exception_traceback)split_qkv_rmsnorm_ropec                       sn   e Zd Z					ddedededee ded	ed
ee dee ddf fddZ		ddefddZ	  Z
S )LlamaMLPN Thidden_sizeintermediate_size
hidden_actquant_configprefixreduce_resultstp_ranktp_sizereturnc	           	   
      sr   t    t||gd d|td|||d| _t||d|td||||d| _|dkr3td| d	t | _	d S )
N   Fgate_up_proj)biasr.   r/   r1   r2   	down_proj)r6   r.   r/   r0   r1   r2   siluzUnsupported activation: z!. Only silu is supported for now.)
super__init__r   r$   r5   r   r7   
ValueErrorr   act_fn)	selfr+   r,   r-   r.   r/   r0   r1   r2   	__class__ K/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/llama.pyr:   B   s2   
	

zLlamaMLP.__init__Fuse_reduce_scatterc                 C   s.   |  |\}}| |}| j||d\}}|S )N)skip_all_reduce)r5   r<   r7   )r=   xforward_batchrB   gate_up_r@   r@   rA   forwardh   s   

zLlamaMLP.forward)Nr*   TNN)NF)__name__
__module____qualname__intstrr   r   boolr:   rH   __classcell__r@   r@   r>   rA   r)   A   s<    	
)r)   c                       s   e Zd Z								d ded	ed
ededededeeee	f  de
dedee dede
ddf fddZdd Zdd ZdejdejdedejfddZ  ZS )!LlamaAttentionr   '  NT    r*   Fconfigr+   	num_headsnum_kv_headslayer_id
rope_thetarope_scalingrope_is_neox_stylemax_position_embeddingsr.   r/   r6   r3   c              
      sr  t    || _t }|| _| j| dksJ | j| | _|| _| j|kr/| j| dks.J n	|| j dks8J td| j| | _t	|d| j| j | _
t	|dd}t|| j
 | _| j| j
 | _| j| j
 | _| j
d | _|| _|	| _t|| j
| j| j||
td|d| _t| j| j
 |||
td|d| _t| j
| j|	|||d	| _t| j| j
| j| j||
td
|d| _d S )Nr      head_dimpartial_rotary_factorg      qkv_proj)r6   r.   r/   o_proj)
rotary_dimmax_positionbaserX   is_neox_styleattn)rU   rV   r.   r/   )r9   r:   r+   r   total_num_headsrT   total_num_kv_headsmaxrU   getattrr\   rL   r`   q_sizekv_sizescalingrW   rZ   r   r$   r^   r   r_   r   
rotary_embr   rd   )r=   rS   r+   rT   rU   rV   rW   rX   rY   rZ   r.   r/   r6   r2   r]   r>   r@   rA   r:   x   sl   

	
zLlamaAttention.__init__c                 C   sJ   |  |\}}|j| j| j| jgdd\}}}| |||\}}|||fS )Ndim)r^   splitri   rj   rl   )r=   	positionshidden_statesqkvrG   qkvr@   r@   rA   forward_prepare_native   s    
z%LlamaAttention.forward_prepare_nativec           	      C   sZ   |  |\}}| jj|jjkr| j| t|| jj| jj	| j
| j| j\}}}|||fS N)r^   rd   rV   token_to_kv_poolstart_layerrl   get_cos_sin_with_positionr(   position_sinposition_cosri   rj   r\   )	r=   rq   rr   rE   rs   rG   rt   ru   rv   r@   r@   rA   forward_prepare_npu   s   

z"LlamaAttention.forward_prepare_npurq   rr   rE   c           
      C   sh   t rt| jdr|j r| j||d\}}}n| j|||d\}}}| ||||}| |\}}	|S )Nr{   )rq   rr   rq   rr   rE   )	_is_npuhasattrrl   forward_mode	is_extendrw   r~   rd   r_   )
r=   rq   rr   rE   rt   ru   rv   attn_outputoutputrG   r@   r@   rA   rH      s$   
zLlamaAttention.forward)r   rQ   NTrR   Nr*   F)rI   rJ   rK   r
   rL   floatr   r   rM   r   rN   r   r:   rw   r~   torchTensorr   rH   rO   r@   r@   r>   rA   rP   w   s^    	
NrP   c                       sr   e Zd Z			ddededee deddf
 fd	d
Zde	j
de	j
dedee	j
 dee	j
e	j
f f
ddZ  ZS )LlamaDecoderLayerr   Nr*   rS   rV   r.   r/   r3   c           
         s   t    |j| _t|dd}t|dd }|d ur$t|dd r$|j|d< t|dd}t|dd}t|d	d
p;t|dd
}	t|| j|j|j||||||td||	d| _	t
| j|j|j|td|d| _t|j|jd| _t|j|jd| _d S )NrW   rQ   rX    original_max_position_embeddingsrY   TrZ   rR   attention_biasFr6   	self_attn)rS   r+   rT   rU   rV   rW   rX   rY   rZ   r.   r/   r6   mlp)r+   r,   r-   r.   r/   eps)r9   r:   r+   rh   r   rP   num_attention_headsnum_key_value_headsr$   r   r)   r,   r-   r   r   rms_norm_epsinput_layernormpost_attention_layernorm)
r=   rS   rV   r.   r/   rW   rX   rY   rZ   r   r>   r@   rA   r:      sN   

zLlamaDecoderLayer.__init__rq   rr   rE   residualc                 C   sZ   |d u r|}|  |}n|  ||\}}| j|||d}| ||\}}| |}||fS )Nr   )r   r   r   r   )r=   rq   rr   rE   r   r@   r@   rA   rH   (  s   
zLlamaDecoderLayer.forward)r   Nr*   )rI   rJ   rK   r
   rL   r   r   rM   r:   r   r   r   r   rH   rO   r@   r@   r>   rA   r      s4    2r   c                       s   e Zd Z		ddedee deddf fddZ		dd	ej	d
ej	de
dej	dee deej	eej	eej	 f ef fddZdeddfddZdejfddZ  ZS )
LlamaModelNr*   rS   r.   r/   r3   c                    s   t     | _ j| _ j| _t | _| jjr't	 j j
td|d| _nt | _t j fdd| jj| jjdd\| _| _| _| jjrRt j
 jd| _ntdd	| _g | _d S )
Nembed_tokensr.   r/   c                    s   t  | |dS )N)rS   r.   rV   r/   )r   )idxr/   rS   r.   r@   rA   <lambda>Y  s    z%LlamaModel.__init__.<locals>.<lambda>zmodel.layers)pp_rankpp_sizer/   r   T)return_tuple)r9   r:   rS   pad_token_idpadding_idx
vocab_sizer   pp_groupis_first_rankr   r+   r$   r   r   r&   num_hidden_layersrank_in_group
world_sizelayersrz   	end_layeris_last_rankr   r   normlayers_to_capturer=   rS   r.   r/   r>   r   rA   r:   B  s0   



zLlamaModel.__init__	input_idsrq   rE   input_embedspp_proxy_tensorsc                 C   s   | j jr|d u r| |}n|}d }n|d usJ |d }|d }d }g }	t| j| jD ]}
|
| jv r:|	||  | j|
 }|||||\}}q,| j j	sTt
||dS | ||\}}t|	dkrd|S ||	fS )Nrr   r   )rr   r   r   )r   r   r   rangerz   r   r   appendr   r   r   r   len)r=   r   rq   rE   r   r   rr   r   deferred_normaux_hidden_statesilayerrG   r@   r@   rA   rH   g  s<   


zLlamaModel.forwardquantization_param_pathc                 C   sv   t  }t }t|||| jj| jjjD ]%\}}t| j| t	j
s&| j| j}t|jdr5||j_||j_qtdd S )Nk_scalez8Self attention has no KV cache scaling factor attribute!)r   r   r!   rS   r   r?   
model_type
isinstancer   r	   Identityr   r   rd   r   v_scaleRuntimeError)r=   r   r2   r1   	layer_idxscaling_factorlayer_self_attnr@   r@   rA   load_kv_cache_scales  s$   
zLlamaModel.load_kv_cache_scalesc                 C   s   | j S )z$Get input embeddings from the model.)r   r=   r@   r@   rA   get_input_embeddings  s   zLlamaModel.get_input_embeddingsNr*   )NN)rI   rJ   rK   r
   r   r   rM   r:   r   r   r   r   r   r   r   rH   r   r	   	Embeddingr   rO   r@   r@   r>   rA   r   A  s:    *
3r   c                       s  e Zd Zg dZddgZdddddd	Z	
	dBdedee de	dd
f fddZ
	
	dBdedee de	fddZe 	
		
dCdejdejdedejdedee defddZe 	
dDdejdejdedeeef dejdee fddZed d! Zed"d# Zdejfd$d%Zd&d' Zd(d) Zd*eee	ejf  fd+d,Z 	.dEd/e	d0ed1edeej fd2d3Z!d4d5 Z"d6d7 Z#d8d9 Z$d:d; Z%d<e	dd
fd=d>Z&dDd?ee'e  fd@dAZ(  Z)S )FLlamaForCausalLM)z.gate_proj..down_proj.z	.up_proj.z.q_proj.z.k_proj.z.v_proj..o_proj.r   r   )	.qkv_projr   )r   r[   )r   r4   ).gate_up_projr   )r   r[   ).q_proj.k_proj.v_proj
.gate_proj.up_projNr*   rS   r.   r/   r3   c                    s   t    t | _|| _|| _| ||td|| _| jj	r$| jj
| _nt|j|j|td|t jd| _t|| _ttjdd| _g d| _d| _d S )Nmodellm_head)r.   r/   use_attn_tp_groupT)pooling_type	normalize)r   r   rt   )r   r   ru   )r   r   rv   )r   r   r   )r   r   r[   F)r9   r:   r   r   rS   r.   _init_modelr$   r   tie_word_embeddingsr   r   r   r   r+   r#   enable_dp_lm_headr   logits_processorr   r   LASTpoolerstacked_params_mappingcapture_aux_hidden_statesr   r>   r@   rA   r:     s$   



	zLlamaForCausalLM.__init__c                 C   s   t |||dS )Nr   )r   r   r@   r@   rA   r     s   zLlamaForCausalLM._init_modelFr   rq   rE   r   get_embeddingr   c           	      C   sV   | j |||||d}d }| jr|\}}| jjr)|s#| ||| j||S | ||S |S )N)r   )r   r   r   r   r   r   r   )	r=   r   rq   rE   r   r   r   rr   r   r@   r@   rA   rH     s*   
zLlamaForCausalLM.forwardsplit_intervalc                 C   s   |\}}|dkr|d u r| j ||_n||_t||D ]}| j j| }	|	||j||j\|_|_q|| j jjkrS| j |j|j\}
}|
|_| 	||j| j
|}|S d }|S )Nr   )r   r   rr   r   r   r   rS   r   r   r   r   )r=   r   rq   rE   r   r   startendr   r   rr   rG   resultr@   r@   rA   forward_split_prefill  s0   	z&LlamaForCausalLM.forward_split_prefillc                 C      | j jS rx   )r   rz   r   r@   r@   rA   rz   C     zLlamaForCausalLM.start_layerc                 C   r   rx   )r   r   r   r@   r@   rA   r   G  r   zLlamaForCausalLM.end_layerc                 C   r   rx   )r   r   r   r@   r@   rA   r   K  s   z%LlamaForCausalLM.get_input_embeddingsc                 C   sT   | j D ]\}}}}||v r|||d td  |f  S q|d td  dfS )Nz.weightr[   )r   replacer   )r=   name
param_nameweight_nameshard_id	num_shardr@   r@   rA    get_module_name_from_weight_nameN  s   z1LlamaForCausalLM.get_module_name_from_weight_namec                 C   s   t |  }t|S rx   )dictnamed_parametersr   )r=   params_dictr@   r@   rA   get_num_paramsW  s   zLlamaForCausalLM.get_num_paramsweightsc                 C   s  g d}t |  }|D ]\}}|dr|dd}|dr&|dd}t|}|d urAt| jdrA|| jjk s@|| jjkrAqd|v sId|v rJqd	|v sRd
|v rSq|	dr]||vr]q| j
jrfd|v rfqd|v rtt||}|d u rtq|D ]-\}}}	||vrqv|||}|dr||vrqv||vrqv|| }
|
j}||
||	  n3|dr||vrq|dr||vrq|| v r|| }
t|
dt}||
| qtd| d qd S )Nr   z.activation_scalez.input_scalez.weight_scale_invz.weight_scalerz   zrotary_emb.inv_freq	projectorzrotary_emb.cos_cachedzrotary_emb.sin_cachedzmodel.vision_towerlm_head.weightscalez.biasz	.kv_scaleweight_loaderz
Parameter z not found in params_dict)r   r   endswithr   r   r   r   rz   r   
startswithrS   r   r"   r   keysrh   r    loggerwarning)r=   r   r   r   r   loaded_weightrV   r   r   r   paramr   r@   r@   rA   load_weights[  sf   	



zLlamaForCausalLM.load_weightsd   r[   r   truncate_sizer2   c              	      s  z|dkr"| j jr"td | jjj t	j
  d| W S |}d}| jD ]\}}}||v r<|||}|} nq)t|  }	|	| }
|dur|dv r| j j| }| j j| }| j j| j j }|dkrnd}|| }n|dkr{|| }|| }n|dkr|| | }|| }|
jd|| n-|d	v r| j j}|| }|dkrd}|}n|d
kr|}|}|
jd|| n|
j n|
j |d
krd|v sd|v r fddt|D }t	j|  t	j|d
d   t	j
  d| W S  ty   td| dt   Y dS w )zGet the weights of the parameter by its name. Similar to `get_parameter` in Hugging Face.

        Only used for unit test with an unoptimized performance.
        For optimized performance, please use torch.save and torch.load.
        r   zTword embedding is tied for this model, return embed_tokens.weight as lm_head.weight.N)rt   ru   rv   rt   r   ru   rv   )r   r[   r[   r_   r7   c                    s   g | ]}t  qS r@   )r   
zeros_like).0rG   weightr@   rA   
<listcomp>  s    z8LlamaForCausalLM.get_weights_by_name.<locals>.<listcomp>rn   zError getting weights by name z in LlamaForCausalLM: )rS   r   r   infor   r   r  cputor   float32numpytolistr   r   r   r   r   r   r+   datanarrowr,   r   distributed
all_gathercat	Exceptionerrorr'   )r=   r   r   r2   mapped_namemapped_shard_idr   r   r   r   r   rT   rU   r\   offsetsizer,   
slice_sizegathered_weightsr@   r  rA   get_weights_by_name  sx   


"z$LlamaForCausalLM.get_weights_by_namec                 C   s   | j jj| jjfS rx   )r   r   r  r   r   r@   r@   rA   get_embed_and_head  s   z#LlamaForCausalLM.get_embed_and_headc                 C   s8   | j j`| j`|| j j_|| j_tj  tj  d S rx   )r   r   r  r   r   cudaempty_cachesynchronize)r=   embedheadr@   r@   rA   set_embed_and_head  s   

z#LlamaForCausalLM.set_embed_and_headc                 C   s
   | j jjS rx   )r   r   r  r   r@   r@   rA   	get_embed  s   
zLlamaForCausalLM.get_embedc                 C   sJ   t | jdr| jj| jjkrd S | jj`|| jj_tj	  tj
  d S )Ntarget_hidden_size)r   rS   r!  r+   r   r   r  r   r  r  r  )r=   r  r@   r@   rA   	set_embed  s   


zLlamaForCausalLM.set_embedr   c                 C   s   | j | d S rx   )r   r   )r=   r   r@   r@   rA   r     s   z%LlamaForCausalLM.load_kv_cache_scales	layer_idsc                 C   sX   | j jsd S |d u rd| _| jj}d|d |d g| j_d S d| _dd |D | j_d S )NTr4      c                 S   s   g | ]}|d  qS )r[   r@   )r  valr@   r@   rA   r    s    zALlamaForCausalLM.set_eagle3_layers_to_capture.<locals>.<listcomp>)r   r   r   rS   r   r   r   )r=   r#  
num_layersr@   r@   rA   set_eagle3_layers_to_capture
  s   z-LlamaForCausalLM.set_eagle3_layers_to_capturer   )NFNrx   )r   r[   )*rI   rJ   rK   #default_bitsandbytes_target_modulescolumn_parallel_weights_modules#bitsandbytes_stacked_params_mappingr
   r   r   rM   r:   r   r   no_gradr   r   rN   r   r   rH   r   rL   r   propertyrz   r   r	   r   r   r   r   r   r   r  r  r  r   r"  r   r   r'  rO   r@   r@   r>   rA   r     s    
'
#
(

	J
I r   c                   @      e Zd ZdS )Phi3ForCausalLMNrI   rJ   rK   r@   r@   r@   rA   r.        r.  c                   @   r-  )InternLM3ForCausalLMNr/  r@   r@   r@   rA   r1    r0  r1  c                   @   r-  )IQuestCoderForCausalLMNr/  r@   r@   r@   rA   r2  !  r0  r2  )K__doc__loggingtypingr   r   r   r   r   r   r   r   r	   transformersr
   sglang.srt.distributedr   r   r   sglang.srt.layers.activationr   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r   r   "sglang.srt.layers.logits_processorr   r   sglang.srt.layers.poolerr   r   *sglang.srt.layers.quantization.base_configr   !sglang.srt.layers.radix_attentionr   "sglang.srt.layers.rotary_embeddingr   sglang.srt.layers.utilsr   r   *sglang.srt.layers.vocab_parallel_embeddingr   r   ,sglang.srt.model_executor.forward_batch_infor   r   $sglang.srt.model_loader.weight_utilsr    r!   r"   sglang.srt.server_argsr#   sglang.srt.utilsr$   r%   r&   sglang.utilsr'   	getLoggerrI   r   r   *sgl_kernel_npu.norm.split_qkv_rmsnorm_roper(   Moduler)   rP   r   r   r   r.  r1  r2  
EntryClassr@   r@   r@   rA   <module>   sR   $
6~Ls  g