o
    پi                     @   sh  d Z ddlZddlmZ ddlmZmZmZmZm	Z	 ddl
Z
ddlmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddl m!Z!m"Z" ddl#m$Z$m%Z%m&Z& ddl'm(Z( ddl)m*Z*m+Z+m,Z,m-Z- ddl.m/Z/ ddl0m1Z1 ddl2m3Z3 ddl4m5Z5 ddl6m7Z7 ddl8m9Z9 ddl:m;Z; ddl<m=Z=m>Z> ddl?m@Z@mAZA ddlBmCZCmDZD ddlEmFZF ddlGmHZH ddlImJZJmKZKmLZLmMZMmNZN ddlOmPZP eQeRZSeK ZTeL ZUeePZVG dd  d ejWZXG d!d" d"ejWZYG d#d$ d$ejWZZeZeYd%Z[G d&d' d'ejWZ\G d(d) d)e\Z]G d*d+ d+eHZ^G d,d- d-eHZ_e_e^gZ`dS ).zWInference-only Qwen3.5 model and Qwen3.5 MoE model compatible with HuggingFace weights.    N)	lru_cache)IterableOptionalSetTupleUnion)	rearrange)get_forward_context)Qwen3_5ConfigQwen3_5MoeConfigQwen3_5TextConfig)get_pp_group)'get_global_expert_distribution_recorder)ModelConfigForExpertLocation)RMSNorm)mamba_v2_sharded_weight_loader)LayerCommunicatorLayerScatterModes)get_attention_tp_rankget_attention_tp_sizeis_dp_attention_enabled)GemmaRMSNorm)ColumnParallelLinearMergedColumnParallelLinearQKVParallelLinearRowParallelLinear)FusedMoE)QuantizationConfig)RadixAttention)RadixLinearAttention)get_rope)VocabParallelEmbedding)get_is_capture_mode)ForwardBatchPPProxyTensors)default_weight_loadersharded_weight_loader)Qwen2MoeMLPQwen2MoeSparseMoeBlock)gdn_with_output)Qwen3VLForConditionalGeneration)
add_prefixis_cudais_npumake_layersset_weight_attrs)get_processorc                       sz   e Zd Z			ddededee deejj	 de
ddf fd	d
Zdd ZdejdefddZdejdefddZ  ZS )Qwen3_5GatedDeltaNetN configlayer_idquant_config
alt_streamprefixreturnc           	         s  t    || _t | _t | _|j| _|j| _	|j
| _|j| _|j| _| j| j | _| j| j	 | _|| _|j| _|| _|j| _|j| _| jd | j | _t| j| jdd | j| jtd|d| _| jjj d| jj_t!| j| j| j| jgd|| j| jtd|d| _"t| j| jd|| j| jtd|d| _#t| j| j	d|| j| jtd	|d| _$t| j| j	d|| j| jtd
|d| _%| jddf}| jddf}t&| jjd t'| jjdt(|||g| j| ji t)*t+,| j	| j | _-t)*t+.| j	| j | _/t'| j/dt0di t'| j-dt0di | jj1| jj2d| jj2d}t3|| j| j | j| j | j	| j | j| j| j|| jj4| j| j/| j-d| _5t6| j| jd dt+7 8 |j9d| _:t;| j| jddd|| j| jtd|d	| _<d S )N   Fconv1d)
input_sizeoutput_sizebiasr5   tp_ranktp_sizer7      in_proj_qkv)r;   output_sizesr=   r5   r>   r?   r7   	in_proj_z	in_proj_b	in_proj_ar   weight_loader)r4   num_q_headsnum_k_headsnum_v_heads
head_q_dim
head_k_dim
head_v_dimconv_weightsr=   
activationA_logdt_biasT)eps
group_sizenorm_before_gatedevicedtypeout_proj)r=   input_is_parallelreduce_resultsr5   r>   r?   r7   )=super__init__r3   r   attn_tp_rankr   attn_tp_sizehidden_sizelinear_num_value_headsrI   linear_num_key_headsrH   linear_key_head_dimrK   linear_value_head_dimrL   key_dim	value_dimr6   linear_conv_kernel_dimconv_kernel_sizer4   
hidden_actrN   rms_norm_epslayer_norm_epsilonconv_dimr   r+   r:   weightdata	unsqueezer   rA   rC   rD   rE   delattrr/   r   nn	ParametertorchonesrP   emptyrO   r&   viewsizer   r=   attnRMSNormGatedget_device_modulecurrent_devicetorch_dtypenormr   rV   )	selfr3   r4   r5   r6   r7   query_key_settingsvalue_settingsrM   	__class__ M/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/qwen3_5.pyrZ   Z   s   
				




zQwen3_5GatedDeltaNet.__init__c                 C   s   t d)Nz8Qwen3.5 Series dont need to fix query key value ordering)NotImplementedError)r{   	mixed_qkvzbar   r   r   fix_query_key_value_ordering   s   z1Qwen3_5GatedDeltaNet.fix_query_key_value_orderinghidden_statesforward_batchc                 C   s<   t |}|j rt d urt||| j |S | ||S N)rp   
empty_likeforward_mode	is_extendr	   r)   r4   _forward)r{   r   r   outputr   r   r   forward   s   
zQwen3_5GatedDeltaNet.forwardc                 C   s   |j \}}| |\}}| |\}}||dd| j}| |\}}| |\}}| }| }| j	j
||||d}	|j }
|	d|	j d }	|d|j d }| |	|}	|	|
}	t|	d}	| |	\}}|S )z
        Forward pass with three parts:
        1. Input projection
        2. Core attention (custom op)
        3. Output projection
        r   )r   r   r   r   z... h d -> ... (h d))shaperA   rC   reshapert   rL   rD   rE   
contiguousru   r   rz   r   rV   )r{   r   r   seq_len_r   r   r   r   core_attn_out
z_shape_ogr   r   r   r   r     s,   


zQwen3_5GatedDeltaNet._forward)NNr2   )__name__
__module____qualname__r   intr   r   rp   cudaStreamstrrZ   r   Tensorr#   r   r   __classcell__r   r   r~   r   r1   Y   s:    
 
r1   c                       sh   e Zd ZdZ			ddededee dedee	j
j d	df fd
dZde	jdee	j fddZ  ZS )Qwen3_5LinearDecoderLayerz<Qwen3.5 Decoder Layer with Linear Attention (GatedDeltaNet).Nr2   r3   r4   r5   r7   r6   r8   c           
         s"  t    || _|| _|r| dkrd n|}t|||||| _|jdkr=t||||t	d|
ddd| _d}d}d}	n(|jdkr]t|j|j|j|t	d|
ddd	| _d
}d
}d
}	ntd|j tj||j|||	d| _t|j|jd| _t|j|jd| _t| j| j| jdd| _d S )Nmodelopt_fp4qwen3_5_moe_textmlpz.linear_attnr2   r4   r3   r5   r6   r7   Tqwen3_5_textr]   intermediate_sizerf   r5   r7   FInvalid model type: r4   
num_layersis_layer_sparseis_previous_layer_sparseis_next_layer_sparserQ   layer_scatter_modesinput_layernormpost_attention_layernormallow_reduce_scatter)rY   rZ   r3   r4   get_namer1   linear_attn
model_typer(   r+   replacer   r'   r]   r   rf   
ValueErrorr   init_newnum_hidden_layersr   r   rg   r   r   r   layer_communicator)
r{   r3   r4   r5   r7   r6   linear_attn_quant_configr   r   r   r~   r   r   rZ   6  sh   



z"Qwen3_5LinearDecoderLayer.__init__r   residualc                 K   s   | dd }| j|||\}}|j s| ||}| j|||\}}| j|}| |||}| j	|||\}}||fS )Nr   )
getr   prepare_attnr   is_idler   prepare_mlpshould_use_reduce_scatterr   postprocess_layer)r{   r   r   kwargsr   use_reduce_scatterr   r   r   r   y  s(   
z!Qwen3_5LinearDecoderLayer.forwardNr2   N)r   r   r   __doc__r   r   r   r   r   rp   r   r   rZ   r   r   r   r   r   r~   r   r   3  s.    
Cr   c                       s   e Zd ZdZ			ddededee dedee	j
j d	df fd
dZde	jde	jd	ee	je	jf fddZde	jde	jded	e	jfddZde	jde	jdee	j defddZ  ZS )Qwen3_5AttentionDecoderLayerz*Qwen3.5 Decoder Layer with Full Attention.Nr2   r3   r4   r5   r7   r6   r8   c           
         s6  t    || _|j| _t | _t | _|j| _	| j	| j dks"J | j	| j | _
|j| _| j| jkr>| j| j dks=J n
| j| j dksHJ td| j| j | _|jpZ| j| j
 | _| j
| j | _| j| j | _| jd | _t|dd| _t|drt|dd | _nt|dd | _| jdd	| _| jd
d| _|| _t|dd| _| jrtd t| j| j| j| j| j| jdt  d| _!|r|" dkrd n|}t#|j| j| j	d| j  | jd|| j| jt$d|d	| _%t&| j	| j |jd|d| j| jt$d|d| _'t(| j
| j| j| j|| dd| _)|j*dkr4t+|j|j,|j-|t$d|.ddd| _/d}d}d}	n&|j*dkrRt0||||t$d|.ddd| _/d}d}d}	nt1d|j* t2j3||j4|||	d | _5t6|j|j7d!| _8t6|j|j7d!| _9t6| j|j7d!| _:t6| j|j7d!| _;t<| j5| j8| j9dd"| _=|| _>d S )#Nr   r@   g      max_position_embeddingsi    rope_parametersrope_scaling
rope_thetai'  partial_rotary_factorg      ?attn_output_gateTzusing attn output gate!)	head_size
rotary_dimmax_positionr   baser   is_neox_stylerU   r   Fqkv_proj)r=   r5   r>   r?   r7   o_proj)r=   r5   rX   r>   r?   r7   z.attn)num_kv_headsr4   r7   r   r   
.self_attnr2   r   r   r   r   r   r   r   )?rY   rZ   r3   r]   r   r[   r   r\   num_attention_headstotal_num_heads	num_headsnum_key_value_headstotal_num_kv_headsmaxr   head_dimq_sizekv_sizescalinggetattrr   hasattrr   r   r   r   r4   r   loggerwarning_oncer    rp   get_default_dtype
rotary_embr   r   r+   r   r   r   r   ru   r   r'   r   rf   r   r   r(   r   r   r   r   r   r   rg   r   r   q_normk_normr   r   r6   )
r{   r3   r4   r5   r7   r6   attn_quant_configr   r   r   r~   r   r   rZ     s   





z%Qwen3_5AttentionDecoderLayer.__init__qkc                 C   s   | j durIt rItj }| j | |d| j}| |}tj	| j  |d| j}| 
|}W d   n1 s=w   Y  || j  n|d| j}| |}|d| j}| 
|}||j}||j}||fS )z9Apply Q/K normalization with optional alt_stream overlap.Nr   )r6   r"   rp   r   current_streamwait_streamr   r   r   streamr   rs   r   )r{   r   r   r   	q_by_head	k_by_headr   r   r   _apply_qk_norm,  s"   



z+Qwen3_5AttentionDecoderLayer._apply_qk_norm	positionsr   r   c                 C   s  |  |\}}| jrO|j| jd | j| jgdd\}}}|jdd }	|jg |	| jdR  }tj	|ddd\}
}|
j
g |	dR  }
|j
g |	dR  }n|j| j| j| jgdd\}
}}| |
|\}
}| ||
|\}
}| |
|||}| jrt|}|| }| |\}}|S )zFull attention forward pass.r9   r   dimN)r   r   splitr   r   r   rs   r   rp   chunkr   r   r   ru   sigmoidr   )r{   r   r   r   qkvr   q_gater   v
orig_shaper   gateattn_outputr   r   r   r   self_attentionB  s&    
z+Qwen3_5AttentionDecoderLayer.self_attentionr   c                 K   sx   | j |||\}}|j s| j|||d}| j |||\}}| j |}| |||}| j |||\}}||fS )N)r   r   r   )	r   r   r   r   r  r   r   r   r   )r{   r   r   r   r   r   r   r   r   r   r   b  s(   
z$Qwen3_5AttentionDecoderLayer.forwardr   )r   r   r   r   r   r   r   r   r   rp   r   r   rZ   r   r   r   r#   r  r   r   r   r   r~   r   r     sX    
 

 r   )	attentionlinear_attentionc                       s   e Zd ZdZ		ddedee deddf fdd	Zde	j
fd
dZe 			ddejdejdedeej dee deej deejef fddZdeeeejf  fddZ  ZS )Qwen3_5ForCausalLMz-Qwen3.5 Model with support for dense variant.Nr2   r3   r5   r7   r8   c                    s   t    | _j| _t | _trtj	 nd  | jj
r+tjjjt  d| _dtdtf fdd}tj|| dd| _| jjrStjjd| _d S d S )	N)org_num_embeddings	enable_tpidxr7   c                    sB   j |  }t| }|dkrtd|}ntd|}|| | dS )Nr  	self_attnr   )r3   r4   r5   r7   r6   )layers_block_typeALL_DECODER_LAYER_TYPESr+   )r  r7   
layer_typelayer_classr6   r3   r5   r   r   	get_layer  s   

z.Qwen3_5ForCausalLM.__init__.<locals>.get_layerz.layers)r7   r   )rY   rZ   r3   r]   r   pp_group_is_cudarp   r   r   is_first_rankr!   
vocab_sizer   embed_tokensr   r   r.   r   layersis_last_rankr   rg   rz   )r{   r3   r5   r7   r  r~   r  r   rZ     s*   
zQwen3_5ForCausalLM.__init__c                 C   s   | j S r   )r  r{   r   r   r   get_input_embeddings  s   z'Qwen3_5ForCausalLM.get_input_embeddings	input_idsr   r   input_embedspp_proxy_tensorsinput_deepstack_embedsc              	   C   s4  | j jr|d u r| |}n|}d }n|d usJ |d }|d }tt| jD ]J}	| j|	 }
t |	 |
||||d\}}W d    n1 sJw   Y  |d urr| dkrr|	dk rr| j	|	 }|
|d d ||| j	 f  q(| j js~t||dS |jd dkr|d u r| |}|S | ||\}}|S )Nr   r   )r   r   r   r   r      )r   r   )r  r  r  rangelenr  r   with_current_layernumelr]   add_r  r$   r   rz   )r{   r  r   r   r  r  r  r   r   	layer_idxlayersepr   r   r   r   r     sP   


zQwen3_5ForCausalLM.forwardweightsc                 C   s<  g d}t  }t| jdd}|D ]\}}d|v rqd|v rqd|v r$qd|v r.|dd	}d
|v r8|dd}|D ]4\}}}	||vrDq:d|v rIq:|||}|drY||vrYq:||vr^q:|| }
t|
d}||
||	  n'|dry||vryq||vrtd| d q|| }
t|
dt}||
| |	| q|S )N)r   q_projr   )r   k_projr   )r   v_projr   )gate_up_proj	gate_projr   )r*  up_projr@   Fremove_duplicaterotary_emb.inv_freqmtpvisuallanguage_modelmodel.language_model.model..self_attn.r   r2   mlp.experts.biasrF   
Parameter  not found in params_dict
setdictnamed_parametersr   endswithr   r   warningr%   addr{   r%  stacked_params_mappingloaded_paramsparams_dictnameloaded_weight
param_nameweight_nameshard_idparamrF   r   r   r   load_weights  sL   	

zQwen3_5ForCausalLM.load_weightsNr2   )NNN)r   r   r   r   r   r   r   r   rZ   rn   	Embeddingr  rp   no_gradr   r#   r$   r   r   r   r   rK  r   r   r   r~   r   r    sD    0$?r  c                	       sT   e Zd Z		ddedee deddf fddZd	ee	ee
jf  fd
dZ  ZS )Qwen3_5MoeForCausalLMNr2   r3   r5   r7   r8   c                    s   t  j|||d d S )N)r3   r5   r7   )rY   rZ   )r{   r3   r5   r7   r~   r   r   rZ   =  s   zQwen3_5MoeForCausalLM.__init__r%  c              
   C   st  g d}t jddd| jjd}d}d}dd	g}| jj}d
tdtdtjdtdtf
dd}t	 }	t| j
dd}
|D ]\}}d|v rAq8d|v rFq8d|v rKq8d|v rU|dd}d|v r_|dd}|D ]>\}}}d|v snd|v rrd}|}||vrwqad|v r|qa|||}||r||
vrqa||
vrqa|
| }|j}||||  nd}|D ]]}|\}}}}||vrqd}|||}|rd|v r|jdd d!}|||
|d" d#| |||
|d$ d%| n#|||
||| n||r||
vrq|
| }|j}||||||d& |} n0|rq8||r||
vrq8||
 v r)|
| }t|d't}||| n	td(| d) |	| q8|	S )*Nr&  r+  	down_projr,  ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namenum_experts)
r7  _bias.k_scale_k_scale.v_scale_v_scalez.weight_scale_weight_scalez.input_scale_input_scaleFzexperts.w13_weightexperts.gate_up_projr   w1zexperts.w2_weightexperts.down_projr   w2rE  rD  rF  rI  rU  c           	      S   8   ||  }|j }t|D ]}|| }|||| || qdS NTrF   r  	rE  rD  rF  rI  rU  rJ  rF   	expert_idcurr_expert_weightr   r   r   load_fused_expert_weightsn     zEQwen3_5MoeForCausalLM.load_weights.<locals>.load_fused_expert_weightsr-  r/  r0  r1  r2  r3  r4  r5  r   r2   r^  ra  Tr6  r9   r   r   r_  r@   w3rI  rg  rF   r8  r9  )r   make_expert_params_mappingr3   rU  r   r<  rp   r   r   r;  r=  r   r>  rF   r   keysr   r%   r   r?  r@  r{   r%  rB  expert_params_mappingignore_suffixesis_fused_expertfused_expert_params_mappingrU  ri  rC  rD  rE  rF  rG  rH  rI  rJ  rF   is_expert_weightmappingrg  name_mappedr   r   r   rK  E  s   

z"Qwen3_5MoeForCausalLM.load_weightsrL  )r   r   r   r   r   r   r   rZ   r   r   rp   r   rK  r   r   r   r~   r   rO  <  s    $rO  c                       sb   e Zd Zddefdedee def fddZdd	 Z	d
d Z
deeeejf  fddZ  ZS )Qwen3_5ForConditionalGenerationNr2   r3   r5   r7   c                    F   t  |||| t| jdd pt| jdi }d|v | _| jj| _d S Nr   r   mrope_sectionrY   rZ   r   r3   is_mrope_enabledr1  deepstack_visual_indexesr{   r3   r5   r7   language_model_clsrope_configr~   r   r   rZ      s   
z(Qwen3_5ForConditionalGeneration.__init__c                 C      | j jj| jjfS r   modelr  rj   lm_headr  r   r   r   get_embed_and_head     z2Qwen3_5ForConditionalGeneration.get_embed_and_headc                 C   8   | j j`| j`|| j j_|| j_tj  tj  d S r   r  r  rj   r  rp   r   empty_cachesynchronizer{   embedheadr   r   r   set_embed_and_head     

z2Qwen3_5ForConditionalGeneration.set_embed_and_headr%  c                 C   sZ  g d}t  }t| jdd}|D ]\}}d|v rqd|v rqd|v r)|dd}d	|v r3|d
d}|D ]8\}}}	||vr?q5d|v sGd|v rHq5|||}|drX||vrXq5||vr]q5|| }
t|
d}||
||	  n7d|v r~|dd}|dd}|dr||vrq||vrtd| d q|| }
t|
dt}||
| |	| q|S )Nr&  Fr-  r/  r0  r2  r3  r4  r5  r   r2   r1  r6  r7  rF   	attn.qkv.attn.qkv_proj.model.visual.visual.r8  r9  r:  rA  r   r   r   rK    sN   	

z,Qwen3_5ForConditionalGeneration.load_weights)r   r   r   r  r
   r   r   r   rZ   r  r  r   r   rp   r   rK  r   r   r   r~   r   rx    s    $rx  c                	       sv   e Zd ZdZddefdedee deddf fdd	Z	d
d Z
dd Zdeeeejf  fddZedd Z  ZS )"Qwen3_5MoeForConditionalGenerationz"Qwen3.5 MoE Vision-Language Model.Nr2   r3   r5   r7   r8   c                    ry  rz  r|  r  r~   r   r   rZ   ]  s   
z+Qwen3_5MoeForConditionalGeneration.__init__c                 C   r  r   r  r  r   r   r   r  l  r  z5Qwen3_5MoeForConditionalGeneration.get_embed_and_headc                 C   r  r   r  r  r   r   r   r  o  r  z5Qwen3_5MoeForConditionalGeneration.set_embed_and_headr%  c              
   C   s  g d}t jddd| jjd}d}d}dd	g}| jj}d
tdtdtjdtdtf
dd}t	 }	t| j
dd}
|D ]\}}d|v rBq8d|v rGq8d|v rQ|dd}d|v r[|dd}|D ]E\}}}|dsl|drpd}|}||vruq]d|v rzq]d|v rq]|||}||r||
vrq]||
vrq]|
| }|j}||||  nd}|D ]f}|\}}}}||vrqd|v s| jjrqd}|||}|rd|v r|jdd d!}|||
|d" d#| |||
|d$ d%| n#|||
||| n||r||
vrq|
| }|j}||||||d& |} nA|rq8d|v r#|d'd(}|d)d*}||r/||
vr/q8||
 v rF|
| }t|d+t}||| n	td,| d- |	| q8|	S ).Nr&  r+  rP  r,  rQ  )r7  rV  rW  rX  rY  rZ  r[  r\  Fr]  r`  rE  rD  rF  rI  rU  c           	      S   rc  rd  re  rf  r   r   r   ri    rj  zRQwen3_5MoeForConditionalGeneration.load_weights.<locals>.load_fused_expert_weightsr-  r/  r0  r2  r3  r4  r5  r   r2   r^  ra  Tr1  r6  r9   rk  r   r   r_  r@   rl  rm  r  r  r  r  rF   r8  r9  )r   rn  r3   rU  r   r<  rp   r   r   r;  r=  r   r>  rF   encoder_onlyr   ro  r   r%   r   r?  r@  rp  r   r   r   rK  w  s   


z/Qwen3_5MoeForConditionalGeneration.load_weightsc                 C   s   t |d|}t|j|jd dS )Ntext_config)r   num_logical_experts
num_groups)r   r   r   rU  )clsr3   r  r   r   r   $get_model_config_for_expert_location7  s   zGQwen3_5MoeForConditionalGeneration.get_model_config_for_expert_location)r   r   r   r   rO  r   r   r   r   rZ   r  r  r   r   rp   r   rK  classmethodr  r   r   r   r~   r   r  Z  s(     Ar  )ar   logging	functoolsr   typingr   r   r   r   r   rp   torch.nnrn   einopsr   0sglang.srt.compilation.piecewise_context_managerr	   sglang.srt.configs.qwen3_5r
   r   r   sglang.srt.distributedr   #sglang.srt.eplb.expert_distributionr   sglang.srt.eplb.expert_locationr   /sglang.srt.layers.attention.fla.layernorm_gatedr   rv   'sglang.srt.layers.attention.mamba.mambar   sglang.srt.layers.communicatorr   r   sglang.srt.layers.dp_attentionr   r   r   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r   r   r   ,sglang.srt.layers.moe.fused_moe_triton.layerr   *sglang.srt.layers.quantization.base_configr   !sglang.srt.layers.radix_attentionr   (sglang.srt.layers.radix_linear_attentionr   "sglang.srt.layers.rotary_embeddingr    *sglang.srt.layers.vocab_parallel_embeddingr!   +sglang.srt.model_executor.cuda_graph_runnerr"   ,sglang.srt.model_executor.forward_batch_infor#   r$   $sglang.srt.model_loader.weight_utilsr%   r&   sglang.srt.models.qwen2_moer'   r(   sglang.srt.models.qwen3_nextr)   sglang.srt.models.qwen3_vlr*   sglang.srt.utilsr+   r,   r-   r.   r/   &sglang.srt.utils.hf_transformers_utilsr0   	getLoggerr   r   r  _is_npucached_get_processorModuler1   r   r   r
  r  rO  rx  r  
EntryClassr   r   r   r   <module>   sh   
 [i k 2 D[ h