o
    پiR                     @   s  d dl Z d dlmZmZmZmZmZmZ d dlZd dlm	Z	 d dl
mZmZmZ d dlmZmZ d dlmZmZ d dlmZ d dlmZmZ d d	lmZ d d
lmZmZ d dlmZ d dl m!Z! d dl"m#Z# d dl$m%Z%m&Z& d dl'm(Z( d dl)m*Z*m+Z+ d dl,m-Z-m.Z. d dl/m0Z1 d dl/m2Z2 d dl3m4Z4 d dl5m6Z6 d dl7m8Z8m9Z9m:Z: dZ;e <e=Z>e9 Z?e: Z@e@rd dlAmBZB d dlCmDZDmEZE G dd de	jFZGG dd de	jFZHG dd de2ZIG dd  d e	jFZJeJZKdS )!    N)AnyDictIterableListOptionalTuple)nn)get_pp_groupget_tensor_model_parallel_rank$get_tensor_model_parallel_world_size)LayerCommunicatorLayerScatterModes)get_attention_tp_rankget_attention_tp_size)RMSNorm)QKVParallelLinearRowParallelLinear)LogitsProcessor)PoolerPoolingType)QuantizationConfig)RadixAttention)get_rope)PPMissingLayerget_layer_id)ParallelLMHead)ForwardBatchPPProxyTensors)default_weight_loadermaybe_remap_kv_scale_name)Qwen2MLP)
Qwen2Model)apply_qk_norm)get_global_server_args)
add_prefixis_cudais_npu)split_qkv_rmsnorm_rope)get_cmo_streamwait_cmo_streamc                       s   e Zd Z										d deded	ed
ededeeeef  dee dedee	 dede
dedeejj ddf fddZdd Zdd ZdejdejdedejfddZ  ZS )!Qwen3Attentionr   @B N   F hidden_size	num_headsnum_kv_headslayer_id
rope_thetarope_scalinghead_dimmax_position_embeddingsquant_configrms_norm_epsattention_biasprefix
alt_streamreturnc                    s  t    || _t | _|| _t }t }| j| dksJ | j| | _|| _	| j	|kr6| j	| dks5J n	|| j	 dks?J t
d| j	| | _|pN|| j | _| j| j | _| j| j | _| jd | _|| _|| _t | _t jd ur{ttjddni }t| jfd|
i|| _t| jfd|
i|| _t|| j| j| j	||	||td|d	| _t| j| j |||	||d	td
|d| _ t!| j| j|||d| _"t#| j| j| j| j|td|d| _$|| _%d S )Nr      g      T)weight_dtypecast_x_before_out_mulepsqkv_proj)biasr6   tp_ranktp_sizer9   Fo_proj)rA   r6   rB   rC   reduce_resultsr9   )
rotary_dimmax_positionbaser3   attn)r0   r1   r9   )&super__init__r.   r   rC   total_num_headsr   r   r/   total_num_kv_headsmaxr0   r4   q_sizekv_sizescalingr2   r5   r
   rB   r#   rl_on_policy_targetdicttorchfloat32r   q_normk_normr   r$   r@   r   rD   r   
rotary_embr   rI   r:   )selfr.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   attn_tp_rankattn_tp_sizenorm_kwargs	__class__ K/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/qwen3.pyrK   0   s   



zQwen3Attention.__init__c                 C   sj   |  |\}}|j| j| j| jgdd\}}}t||| j| j| j| jd\}}| 	|||\}}|||fS )N)dim)qkrV   rW   r4   r:   )
r@   splitrO   rP   r"   rV   rW   r4   r:   rX   )rY   	positionshidden_statesqkv_rc   rd   vr_   r_   r`   forward_prepare_native   s    

z%Qwen3Attention.forward_prepare_nativec           	      C   s   |  |\}}| jj|jjkr| j| t|| jj| jj	| j
| j| j| jj| jj| jjt| jdd t| jdd d\}}}|||fS )NrA   )r?   q_weightk_weightq_biask_bias)r@   rI   r1   token_to_kv_poolstart_layerrX   get_cos_sin_with_positionr'   position_sinposition_cosrO   rP   r4   rV   variance_epsilonweightrW   getattr)	rY   rf   rg   forward_batchrh   ri   rc   rd   rj   r_   r_   r`   forward_prepare_npu   s"   
z"Qwen3Attention.forward_prepare_npurf   rg   rx   c           
      C   s   t  jd ur
| }tr|j r| j||d\}}}n| j|||d\}}}t  jd ur9|t	j}|t	j}| 
||||}| |\}}	|S )N)rf   rg   rf   rg   rx   )r#   rR   bfloat16_is_npuforward_mode	is_extendrk   ry   torT   rI   rD   )
rY   rf   rg   rx   rc   rd   rj   attn_outputoutputri   r_   r_   r`   forward   s$   zQwen3Attention.forward)
r   r+   NNr,   NNFr-   N)__name__
__module____qualname__intfloatr   r   strr   r   boolrT   cudaStreamrK   rk   ry   Tensorr   r   __classcell__r_   r_   r]   r`   r*   /   sf    	

]r*   c                       s   e Zd Z				ddededee dedeej	j
 d	df fd
dZ	ddejdejdedeej deej d	eejejf fddZ  ZS )Qwen3DecoderLayerr   Nr-   configr1   r6   r9   r:   r;   c                    s   t    |j| _t|dd}t|dd }t|dd}t|dd }	t| j|j|j||||	|||j|jt	d||d| _
t| j|j|j|t	d	|d
| _t jd ur[ttjdtjddni }
t|jfd|ji|
| _t|jfd|ji|
| _tj||jdddd| _t| j| j| jd| _d S )Nr2   r+   r3   r5   r,   r4   	self_attn)r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   mlp)r.   intermediate_size
hidden_actr6   r9   T)r=   r>   override_orig_dtypefp32_residualr?   F)r1   
num_layersis_layer_sparseis_previous_layer_sparseis_next_layer_sparse)layer_scatter_modesinput_layernormpost_attention_layernorm)rJ   rK   r.   rw   r*   num_attention_headsnum_key_value_headsr7   r8   r$   r   Qwen3MLPr   r   r   r#   rR   rS   rT   rU   r   r   r   r   init_newnum_hidden_layersr   r   layer_communicator)rY   r   r1   r6   r9   r:   r2   r3   r5   r4   r\   r]   r_   r`   rK      s~   

zQwen3DecoderLayer.__init__rf   rg   rx   residualpost_residual_additionc                 C   s   | j j||||d\}}|jd dkr| j|||d}| j j|||tr?t js?t| j	j
dr?t| j	jdr?| j	j
j| j	jjgnd d\}}| 	|}trRt rRt  | j |||\}}||fS )N)r   r   rz   rv   )cache)r   prepare_attnshaper   prepare_mlpr|   r#   enable_piecewise_cuda_graphhasattrr   gate_up_proj	down_projrv   r(   r)   postprocess_layer)rY   rf   rg   rx   r   r   r_   r_   r`   r     sB   	



zQwen3DecoderLayer.forward)r   Nr-   NN)r   r   r   Qwen3Configr   r   r   r   rT   r   r   rK   r   r   r   r   r   r_   r_   r]   r`   r      s@    
Ir   c                	       s8   e Zd Z		d	dedee deddf fddZ  ZS )

Qwen3ModelNr-   r   r6   r9   r;   c                    s,   t rtj nd }t j|||t|d d S )N)r   r6   r9   decoder_layer_typer:   )_is_cudarT   r   r   rJ   rK   r   )rY   r   r6   r9   r:   r]   r_   r`   rK   A  s   
zQwen3Model.__init__Nr-   )	r   r   r   r   r   r   r   rK   r   r_   r_   r]   r`   r   @  s    r   c                       sH  e Zd Zg dZddddddZ			d/d
edee deddf fddZ	de
jfddZe 			d0dejdejdedejdedee dejfddZe 	d1dejdejdedeeef dejf
ddZedd Zed d! Zd"eeeejf  fd#d$Zd%d& Zd'd( Zd)eddfd*d+Zd1d,eee  fd-d.Z   Z!S )2Qwen3ForCausalLM)z.gate_proj.z.down_proj.z	.up_proj.z.q_proj.z.k_proj.z.v_proj.z.o_proj.)r@   r   )r@   r<   )r@      )r   r   )r   r<   )q_projk_projv_proj	gate_projup_projNr-   r   r6   r9   r;   c                    s   t    t | _|| _|| _t||td|d| _| jj	r?| jj
dkr-|jr-| jj| _nt|j|j|t jtd|d| _nt | _t|| _ttjdd| _d| _d S )	Nmodel)r6   r9   r<   lm_head)r6   use_attn_tp_groupr9   T)pooling_type	normalizeF)rJ   rK   r	   pp_groupr   r6   r   r$   r   is_last_rank
world_sizetie_word_embeddingsembed_tokensr   r   
vocab_sizer.   r#   enable_dp_lm_headr   r   logits_processorr   r   LASTpoolercapture_aux_hidden_states)rY   r   r6   r9   r]   r_   r`   rK   e  s*   

	

zQwen3ForCausalLM.__init__c                 C   s
   | j  S r   )r   get_input_embeddingsrY   r_   r_   r`   r     s   
z%Qwen3ForCausalLM.get_input_embeddingsF	input_idsrf   rx   input_embedsget_embeddingpp_proxy_tensorsc           	      C   sV   | j |||||d}d }| jr|\}}| jjr)|s#| ||| j||S | ||S |S )N)r   )r   r   r   r   r   r   r   )	rY   r   rf   rx   r   r   r   rg   aux_hidden_statesr_   r_   r`   r     s*   
zQwen3ForCausalLM.forwardsplit_intervalc                 C   s   |\}}|dkr|d u r| j ||_n||_t||D ]}| j j| }	|	||j||j\|_|_q|| j jjkrS| j |j|j\}
}|
|_| 	||j| j
|}|S d }|S )Nr   )r   r   rg   rangelayersr   r   r   normr   r   )rY   r   rf   rx   r   r   startendilayerrg   ri   resultr_   r_   r`   forward_split_prefill  s0   	z&Qwen3ForCausalLM.forward_split_prefillc                 C      | j jS r   )r   rq   r   r_   r_   r`   rq        zQwen3ForCausalLM.start_layerc                 C   r   r   )r   	end_layerr   r_   r_   r`   r     r   zQwen3ForCausalLM.end_layerweightsc                 C   s  g d}t |  }|D ]\}}|ds)|ds$|ds$|dr)t|d}|dkrH| jjrH| jjrHd|v rH|d }t|d	t	}||| t
|}|d urct| jd
rc|| jjk sb|| jjkrcqd|v skd|v rlqd|v std|v ruq|dr||vrqd|v rt||}|d u rq|D ](\}	}
}|
|vrq||
|	}|dr||vrq|| }|j}||||  n)|dr||vrq|| v r|| }t|d	t	}||| qtd| d qd S )N))r@   r   rc   )r@   r   rd   )r@   r   rj   )r   r   r   )r   r   r<   zmodel.zlayers.zembed_tokens.znorm.r   zmodel.embed_tokens.weightzlm_head.weightweight_loaderrq   zrotary_emb.inv_freq	projectorzrotary_emb.cos_cachedzrotary_emb.sin_cachedzmodel.vision_towerscalez.biasz
Parameter z not found in params_dict)rS   named_parameters
startswithr$   r   r   r   r   rw   r   r   r   r   rq   r   r   replaceendswithr   keysloggerwarning)rY   r   stacked_params_mappingparams_dictnameloaded_weightparamr   r1   
param_nameweight_nameshard_idr_   r_   r`   load_weights  sr   	




zQwen3ForCausalLM.load_weightsc                 C   s   | j jj| jjfS r   )r   r   rv   r   r   r_   r_   r`   get_embed_and_head-  s   z#Qwen3ForCausalLM.get_embed_and_headc                 C   s8   | j j`| j`|| j j_|| j_tj  tj  d S r   )r   r   rv   r   rT   r   empty_cachesynchronize)rY   embedheadr_   r_   r`   set_embed_and_head0  s   

z#Qwen3ForCausalLM.set_embed_and_headquantization_param_pathc                 C   s   | j | d S r   )r   load_kv_cache_scales)rY   r   r_   r_   r`   r   8  s   z%Qwen3ForCausalLM.load_kv_cache_scales	layer_idsc                 C   sR   | j jsd S d| _|d u r| jj}d|d |d g| j_d S dd |D | j_d S )NTr      c                 S   s   g | ]}|d  qS )r<   r_   ).0valr_   r_   r`   
<listcomp>H  s    zAQwen3ForCausalLM.set_eagle3_layers_to_capture.<locals>.<listcomp>)r   r   r   r   r   r   layers_to_capture)rY   r   r   r_   r_   r`   set_eagle3_layers_to_capture;  s   z-Qwen3ForCausalLM.set_eagle3_layers_to_capturer   )NFNr   )"r   r   r   #default_bitsandbytes_target_modules#bitsandbytes_stacked_params_mappingr   r   r   r   rK   r   	Embeddingr   rT   no_gradr   r   r   r   r   r   r   r   propertyrq   r   r   r   r   r   r   r   r  r   r_   r_   r]   r`   r   Q  sz    $#
(

L r   )Lloggingtypingr   r   r   r   r   r   rT   r   sglang.srt.distributedr	   r
   r   sglang.srt.layers.communicatorr   r   sglang.srt.layers.dp_attentionr   r   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r   "sglang.srt.layers.logits_processorr   sglang.srt.layers.poolerr   r   *sglang.srt.layers.quantization.base_configr   !sglang.srt.layers.radix_attentionr   "sglang.srt.layers.rotary_embeddingr   sglang.srt.layers.utilsr   r   *sglang.srt.layers.vocab_parallel_embeddingr   ,sglang.srt.model_executor.forward_batch_infor   r   $sglang.srt.model_loader.weight_utilsr   r   sglang.srt.models.qwen2r    r   r!   sglang.srt.models.utilsr"   sglang.srt.server_argsr#   sglang.srt.utilsr$   r%   r&   r   	getLoggerr   r   r   r|   *sgl_kernel_npu.norm.split_qkv_rmsnorm_roper'   #sglang.srt.hardware_backend.npu.cmor(   r)   Moduler*   r   r   r   
EntryClassr_   r_   r_   r`   <module>   sJ    
 s {