o
    پi}\                     @   s  d Z ddlZddlmZmZmZmZmZmZm	Z	 ddl
Z
ddl
mZ ddlmZmZmZ ddlmZ ddlmZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZmZ ddlm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z&m'Z' ddl(m)Z)m*Z* ddl+m,Z,m-Z- ddl.m/Z/m0Z0 ddl1m2Z2 ddl3m4Z4m5Z5 dZ6e7e8Z9G dd dej:Z;G dd dej:Z<G dd dej:Z=G dd dej:Z>G dd dej:Z?e?Z@dS )z?Inference-only Qwen2 model compatible with HuggingFace weights.    N)AnyDictIterableListOptionalTupleUnion)nn)get_pp_groupget_tensor_model_parallel_rank$get_tensor_model_parallel_world_size)
SiluAndMul)is_dp_attention_enabled)RMSNorm)MergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)PoolerPoolingType)QuantizationConfig)RadixAttention)get_rope)PPMissingLayerget_layer_id)ParallelLMHeadVocabParallelEmbedding)ForwardBatchPPProxyTensors)default_weight_loaderkv_cache_scales_loader)get_global_server_args)
add_prefixmake_layersc                       sH   e Zd Z		ddedededee deddf fd	d
Zdd Z  Z	S )Qwen2MLPN hidden_sizeintermediate_size
hidden_actquant_configprefixreturnc                    sh   t    t||gd d|td|d| _t||d|td|d| _|dkr.td| dt | _	d S )	N   Fgate_up_projbiasr)   r*   	down_projsiluzUnsupported activation: z!. Only silu is supported for now.)
super__init__r   r"   r-   r   r0   
ValueErrorr   act_fn)selfr&   r'   r(   r)   r*   	__class__ K/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/qwen2.pyr3   ?   s(   

zQwen2MLP.__init__c                 C   s>   t  jd ur
| }| |\}}| |}| |\}}|S N)r!   rl_on_policy_targetbfloat16r-   r5   r0   )r6   xgate_up_r9   r9   r:   forward]   s   
zQwen2MLP.forwardNr%   )
__name__
__module____qualname__intstrr   r   r3   rA   __classcell__r9   r9   r7   r:   r$   >   s"    r$   c                       s   e Zd Z								ddededed	ee d
ededeeeef  dedee	 dee
eef  deddf fddZdejdejdedejfddZ  ZS )Qwen2AttentionNr   @B    r%   r&   	num_headsnum_kv_headshead_dimlayer_id
rope_thetarope_scalingmax_position_embeddingsr)   dual_chunk_attention_configr*   r+   c              
      s\  t    || _t }|| _| j| dksJ | j| | _|| _| j|kr/| j| dks.J n	|| j dks8J td| j| | _|d urI|| _	n|| j | _	| j| j	 | _
| j| j	 | _| j	d | _|| _|| _t|| j	| j| jd|	td|d| _t| j| j	 |d|	td|d| _t| j	| j	||||
d	| _t| j| j	| j| j||	td
|d| _d S )Nr      g      Tqkv_projr.   Fo_proj)
rotary_dimmax_positionbaserQ   rS   attn)rM   rO   r)   r*   )r2   r3   r&   r   total_num_headsrL   total_num_kv_headsmaxrM   rN   q_sizekv_sizescalingrP   rR   r   r"   rU   r   rV   r   
rotary_embr   rZ   )r6   r&   rL   rM   rN   rO   rP   rQ   rR   r)   rS   r*   tp_sizer7   r9   r:   r3   h   sh   

	
zQwen2Attention.__init__	positionshidden_statesforward_batchc                 C   sb   |  |\}}|j| j| j| jgdd\}}}| |||\}}| ||||}	| |	\}
}|
S )N)dim)rU   splitr^   r_   ra   rZ   rV   )r6   rc   rd   re   qkvr@   qkvattn_outputoutputr9   r9   r:   rA      s    zQwen2Attention.forward)Nr   rJ   NrK   NNr%   )rC   rD   rE   rF   r   floatr   rG   r   r   dictr3   torchTensorr   rA   rH   r9   r9   r7   r:   rI   g   sV    	
KrI   c                       s   e Zd Z				ddededee dedeej	j
 d	df fd
dZdejdejdedeej d	eejejf f
ddZ  ZS )Qwen2DecoderLayerr   Nr%   configrO   r)   r*   
alt_streamr+   c                    s   t    |j| _t|dd}t|dd }t|dd}t|dd }	t|dd }
t| j|j|j|	||||||
td|d	| _t	| j|j
|j|td
|d| _t|j|jd| _t|j|jd| _d S )NrP   rJ   rQ   rR   rK   rN   rS   	self_attn)r&   rL   rM   rN   rO   rP   rQ   rR   r)   rS   r*   mlp)r&   r'   r(   r)   r*   )eps)r2   r3   r&   getattrrI   num_attention_headsnum_key_value_headsr"   rv   r$   r'   r(   rw   r   rms_norm_epsinput_layernormpost_attention_layernorm)r6   rt   rO   r)   r*   ru   rP   rQ   rR   rN   rS   r7   r9   r:   r3      sB   
zQwen2DecoderLayer.__init__rc   rd   re   residualc                 C   sZ   |d u r|}|  |}n|  ||\}}| j|||d}| ||\}}| |}||fS )N)rc   rd   re   )r}   rv   r~   rw   )r6   rc   rd   re   r   r9   r9   r:   rA      s   
zQwen2DecoderLayer.forward)r   Nr%   N)rC   rD   rE   Qwen2ConfigrF   r   r   rG   rq   cudaStreamr3   rr   r   r   rA   rH   r9   r9   r7   r:   rs      s:    
*rs   c                       s   e Zd Zddedfdedee dedee	j
 deejj ddf fd	d
ZdejdejfddZde	jfddZ		ddejdejdedejdee deejef fddZdeddfddZ  ZS )
Qwen2ModelNr%   rt   r)   r*   decoder_layer_typeru   r+   c                    s  t    | _j| _j| _t | _| jjr3t	jj
t td|t jd ur-tjnd d| _nt | _p:ttj fdd| jj| jjtd|d\| _| _| _| jjr}t jd urmttjdtjddni }tj
fd	ji|| _ntdd
| _g | _ d S )Nembed_tokens)r)   use_attn_tp_groupr*   params_dtypec                    s   | | dS )N)rO   rt   r)   r*   ru   r9   )idxr*   ru   rt   r   r)   r9   r:   <lambda>(  s    z%Qwen2Model.__init__.<locals>.<lambda>layers)pp_rankpp_sizer*   T)weight_dtypecast_x_before_out_muloverride_orig_dtypefp32_residualrx   )return_tuple)!r2   r3   rt   pad_token_idpadding_idx
vocab_sizer
   pp_groupis_first_rankr   r&   r   r"   r!   r<   rq   float32r   r   rs   r#   num_hidden_layersrank_in_group
world_sizer   start_layer	end_layeris_last_rankrp   r   r|   normlayers_to_capture)r6   rt   r)   r*   r   ru   norm_kwargsr7   r   r:   r3     sX   




zQwen2Model.__init__	input_idsc                 C   s,   t | jdr|  || jj S |  |S )N	scale_emb)hasattrrt   get_input_embeddingsr   r6   r   r9   r9   r:   get_input_embeddingG  s   zQwen2Model.get_input_embeddingc                 C   s   | j S r;   )r   r6   r9   r9   r:   r   M  s   zQwen2Model.get_input_embeddingsrc   re   input_embedspp_proxy_tensorsc                 C   s   | j jr|d u r| |}n|}d }n|d usJ |d }|d }g }t| j| jD ]"}	|	| jv r>||d ur;|| n| | j|	 }
|
||||\}}q*| j j	sXt
||dS |jd dkrq|d u ri| |}n| ||\}}t|dkry|S ||fS )Nrd   r   )rd   r   r   )r   r   r   ranger   r   r   appendr   r   r   shaper   len)r6   r   rc   re   r   r   rd   r   aux_hidden_statesilayerr@   r9   r9   r:   rA   P  sD   	


zQwen2Model.forwardquantization_param_pathc                 C   sv   t  }t }t|||| jj| jjjD ]%\}}t| j| t	j
s&| j| j}t|jdr5||j_||j_qtdd S )Nk_scalez8Self attention has no KV cache scaling factor attribute!)r   r   r    rt   r   r8   
model_type
isinstancer   r	   Identityrv   r   rZ   r   v_scaleRuntimeError)r6   r   rb   tp_rank	layer_idxscaling_factorlayer_self_attnr9   r9   r:   load_kv_cache_scales  s$   
zQwen2Model.load_kv_cache_scales)NN)rC   rD   rE   rs   r   r   r   rG   typer	   Modulerq   r   r   r3   rr   r   	Embeddingr   r   r   r   rA   r   rH   r9   r9   r7   r:   r     sH    
A
7r   c                       s^  e Zd Zg dZddddddZ			d1d
edee deddf fddZ	de
jde
jfddZdejfddZe
 			d2de
jde
jdede
jdedee de
jfddZe
 	d3de
jde
jdedeeef de
jf
ddZed d! Zed"d# Zd$eeee
jf  fd%d&Zd'd( Zd)d* Zd+eddfd,d-Zd3d.ee e  fd/d0Z!  Z"S )4Qwen2ForCausalLM)z.gate_proj.z.down_proj.z	.up_proj.z.q_proj.z.k_proj.z.v_proj.z.o_proj.)rU   r   )rU   rT   )rU   r,   )r-   r   )r-   rT   )q_projk_projv_proj	gate_projup_projNr%   rt   r)   r*   r+   c                    s   t    t | _|| _|| _t||td|d| _| jj	r<| jj
dkr-|jr-| jj| _nt|j|j|td|d| _nt | _t|| _ttjdd| _d| _d S )Nmodel)r)   r*   rT   lm_headT)pooling_type	normalizeF)r2   r3   r
   r   rt   r)   r   r"   r   r   r   tie_word_embeddingsr   r   r   r   r&   r   r   logits_processorr   r   LASTpoolercapture_aux_hidden_states)r6   rt   r)   r*   r7   r9   r:   r3     s(   



zQwen2ForCausalLM.__init__r   c                 C   s   | j |S r;   )r   r   r   r9   r9   r:   r     s   z$Qwen2ForCausalLM.get_input_embeddingc                 C      | j jS r;   )r   r   r   r9   r9   r:   r     s   z%Qwen2ForCausalLM.get_input_embeddingsFrc   re   r   get_embeddingr   c           	      C   sV   | j |||||d}d }| jr|\}}| jjr)|s#| ||| j||S | ||S |S )N)r   )r   r   r   r   r   r   r   )	r6   r   rc   re   r   r   r   rd   r   r9   r9   r:   rA     s*   
zQwen2ForCausalLM.forwardsplit_intervalc                 C   s   |\}}|dkr|d u r| j ||_n||_t||D ]}| j j| }	|	||j||j\|_|_q|| j jjkrS| j |j|j\}
}|
|_| 	||j| j
|}|S d }|S )Nr   )r   r   rd   r   r   r   rt   r   r   r   r   )r6   r   rc   re   r   r   startendr   r   rd   r@   resultr9   r9   r:   forward_split_prefill  s0   	z&Qwen2ForCausalLM.forward_split_prefillc                 C   r   r;   )r   r   r   r9   r9   r:   r   $     zQwen2ForCausalLM.start_layerc                 C   r   r;   )r   r   r   r9   r9   r:   r   (  r   zQwen2ForCausalLM.end_layerweightsc                 C   s  g d}t |  }|D ]\}}t|}|d ur+t| jdr+|| jjk s*|| jjkr+q|dkrJ| jjrJ| j	j
rJd|v rJ|d }t|dt}||| d|v sRd|v rSqd|v s[d	|v r\q|d
rf||vrfq|D ]-\}	}
}|
|vrrqh||
|	}|dr||vrqh||vrqh|| }|j}||||  n)|dr||vrq|| v r|| }t|dt}||| qtd| d qd S )N))rU   r   rj   )rU   r   rk   )rU   r   rl   )r-   r   r   )r-   r   rT   r   zmodel.embed_tokens.weightzlm_head.weightweight_loaderzrotary_emb.inv_freq	projectorzrotary_emb.cos_cachedzrotary_emb.sin_cachedzmodel.vision_towerz.biasz
Parameter z not found in params_dict)rp   named_parametersr   r   r   r   r   r   r   rt   r   ry   r   
startswithreplaceendswithr   keysloggerwarning)r6   r   stacked_params_mappingparams_dictnameloaded_weightrO   paramr   
param_nameweight_nameshard_idr9   r9   r:   load_weights,  s^   	

zQwen2ForCausalLM.load_weightsc                 C   s   | j jj| jjfS r;   )r   r   weightr   r   r9   r9   r:   get_embed_and_headp  s   z#Qwen2ForCausalLM.get_embed_and_headc                 C   s8   | j j`| j`|| j j_|| j_tj  tj  d S r;   )r   r   r   r   rq   r   empty_cachesynchronize)r6   embedheadr9   r9   r:   set_embed_and_heads  s   

z#Qwen2ForCausalLM.set_embed_and_headr   c                 C   s   | j | d S r;   )r   r   )r6   r   r9   r9   r:   r   {  s   z%Qwen2ForCausalLM.load_kv_cache_scales	layer_idsc                 C   sR   | j jsd S d| _|d u r| jj}d|d |d g| j_d S dd |D | j_d S )NTr,      c                 S   s   g | ]}|d  qS )rT   r9   ).0valr9   r9   r:   
<listcomp>  s    zAQwen2ForCausalLM.set_eagle3_layers_to_capture.<locals>.<listcomp>)r   r   r   rt   r   r   r   )r6   r   
num_layersr9   r9   r:   set_eagle3_layers_to_capture~  s   z-Qwen2ForCausalLM.set_eagle3_layers_to_capturerB   )NFNr;   )#rC   rD   rE   #default_bitsandbytes_target_modules#bitsandbytes_stacked_params_mappingr   r   r   rG   r3   rq   rr   r   r	   r   r   no_gradr   boolr   rA   r   rF   r   propertyr   r   r   r   r   r   r   r   r   rH   r9   r9   r7   r:   r     s|    ""
(

D r   )A__doc__loggingtypingr   r   r   r   r   r   r   rq   r	   sglang.srt.distributedr
   r   r   sglang.srt.layers.activationr   sglang.srt.layers.dp_attentionr   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r   r   "sglang.srt.layers.logits_processorr   sglang.srt.layers.poolerr   r   *sglang.srt.layers.quantization.base_configr   !sglang.srt.layers.radix_attentionr   "sglang.srt.layers.rotary_embeddingr   sglang.srt.layers.utilsr   r   *sglang.srt.layers.vocab_parallel_embeddingr   r   ,sglang.srt.model_executor.forward_batch_infor   r   $sglang.srt.model_loader.weight_utilsr   r    sglang.srt.server_argsr!   sglang.srt.utilsr"   r#   r   	getLoggerrC   r   r   r$   rI   rs   r   r   
EntryClassr9   r9   r9   r:   <module>   s>   $
)ZD  s