o
    
۾iq                     @   s  d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZmZmZmZmZ dd
lmZmZmZmZmZmZ ddlmZmZ ddlmZ ddlm Z  ddl!m"Z"m#Z# ddl$m%Z% ddl&m'Z( ddl&m)Z) ddl*m+Z+m,Z,m-Z-m.Z. ddl/m0Z0 ddl1m2Z2 ddl3m4Z4 ddl5m6Z6m7Z7m8Z8m9Z9 ddl:m;Z;m<Z< ddl=m>Z> ddl?m@Z@ ddlAmBZBmCZC ddlDmEZEmFZFmGZG ddlHmIZJ ddlKmLZL ddlMmNZN ddlOmPZP dd lQmRZR dd!lSmTZT dd"lUmVZVmWZW dd#lXmYZY dd$lZm[Z[ dd%l\m]Z] d&d'l^m_Z_m`Z`maZambZbmcZc d&d(ldmeZemfZfmgZgmhZhmiZimjZjmkZk eelZmenejoejof ZpG d)d* d*ejqZrG d+d, d,ejqe2ZsG d-d. d.ejqZtG d/d0 d0ejqZueG d1d2 d2ejqZvG d3d4 d4eaZwG d5d6 d6ejqe_ebecewe`Zxd7ejod8ejod9ejod:ejod;eyd<dfd=d>Zzd7ejod8ejod9ejod:ejod;eyd<dfd?d@Z{eYd>ezd:ge{dA eWj|dBeVj}dCeVj}dDeVj}dEeVj}fdFdGZ~	H	IdNdJejod9ejod8ejodKejodCedDed<enejoejof fdLdMZdS )OzInference-only Qwen3Next model.    )Iterable)isliceN	rearrange)nn)ACT2FN)support_torch_compile)CacheConfigModelConfigSpeculativeConfig
VllmConfigget_current_vllm_config)divideget_ep_groupget_pp_groupget_tensor_model_parallel_rank$get_tensor_model_parallel_world_size tensor_model_parallel_all_gather)ForwardContextget_forward_context)init_logger)	Attention)chunk_gated_delta_rule fused_recurrent_gated_delta_rule)SharedFusedMoE)GemmaRMSNorm)RMSNormGated)ColumnParallelLinearQKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)	MambaBase)mamba_v2_sharded_weight_loader)MambaStateCopyFuncMambaStateCopyFuncCalculatorMambaStateDtypeCalculatorMambaStateShapeCalculator)causal_conv1d_fncausal_conv1d_update)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loadermaybe_remap_kv_scale_namesharded_weight_loader)Qwen2MoeMLP)sequence_parallel_chunk)set_weight_attrs)current_platform)IntermediateTensors)Qwen3NextConfig)tltriton)direct_register_custom_op)AttentionMetadata)GDNAttentionMetadata   )HasInnerStateIsHybridMixtureOfExpertsSupportsLoRA
SupportsPP)AutoWeightsLoaderPPMissingLayerextract_layer_indexis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                       s>   e Zd Zd
dedef fddZdejdejfdd	Z  Z	S )Qwen3NextSparseMoeBlock vllm_configprefixc                    s  t    |jj}|j}|j}t | _t j	| _
t j| _| j
 | _|j| _|j| _| j|jkr>td| j d|j dt }|jj}|j| _| j| _|j| _| j| j | _| j| j | _| j| j | _| j| j | _t|j|jd|| dd| _ t|jddd | dd| _!|j"d	krt#|j|j"|j$|d| j!| d
d| _%nd | _%t&| j%| j | j|j'|j|j(d|j)|| d| j| j| jd| _*d S )NzTensor parallel size z' is greater than the number of experts .Fz.gatebiasquant_configrL   r<   z.shared_expert_gater   z.shared_expert)hidden_sizeintermediate_size
hidden_actrP   reduce_resultsexpert_gaterL   z.experts)shared_expertsgatenum_expertstop_krQ   rR   rT   renormalizerP   rL   enable_eplbnum_redundant_expertsis_sequence_parallel)+super__init__model_config	hf_configparallel_configrP   r   tp_sizer   device_groupep_grouprank_in_groupep_ranksizeep_sizerX   n_routed_expertsuse_sequence_parallel_moer]   
ValueErrorr   eplb_configr[   n_logical_expertsr\   n_redundant_expertsn_physical_expertsn_local_physical_expertsphysical_expert_startphysical_expert_endr   rQ   rW   shared_expert_gateshared_expert_intermediate_sizeQwen3NextMLPrS   shared_expertr   num_experts_per_tokmoe_intermediate_sizenorm_topk_probexperts)selfrK   rL   configrb   rP   rm   	__class__ Y/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/qwen3_next.pyr_   i   s   







z Qwen3NextSparseMoeBlock.__init__hidden_statesreturnc                 C   s   |j }|j \}}|d|}| jrt|}| jjr!| j||d}n| |\}}| j||d}| jd ur<|d |d  }| jrKt|d}|d | }n| j	dkrV| j
|}||S )N)r   router_logitsr   r<   )shapeviewr]   r2   r{   is_internal_routerrW   rw   r   rc   &maybe_all_reduce_tensor_model_parallel)r|   r   
orig_shape
num_tokens
hidden_dimfinal_hidden_statesr   _r   r   r   forward   s2   



zQwen3NextSparseMoeBlock.forwardrJ   )
__name__
__module____qualname__r   strr_   torchTensorr   __classcell__r   r   r~   r   rI   h   s    RrI   c                       s   e Zd ZedefddZdeejejf fddZ	deee
df ee
df f fddZ									
d!deded	B ded	B ded	B ded	B dedd	f fddZdd Zdd ZdejdejfddZdejdejdejdejfdd Z  ZS )"Qwen3NextGatedDeltaNetr   c                 C      dS )Ngdn_attentionr   r|   r   r   r   
mamba_type   s   z!Qwen3NextGatedDeltaNet.mamba_typec                 C   s   t | jj| jjS Nr&   gated_delta_net_state_dtyper`   dtypecache_configmamba_cache_dtyper   r   r   r   get_state_dtype   s   z&Qwen3NextGatedDeltaNet.get_state_dtype.c              	   C   s$   t | j| j| j| j| j| j| jS r   )	r'   gated_delta_net_state_shaperc   num_k_headsnum_v_heads
head_k_dim
head_v_dimconv_kernel_sizenum_specr   r   r   r   get_state_shape   s   z&Qwen3NextGatedDeltaNet.get_state_shapeNrJ   r}   r`   r   rP   speculative_configrL   c           
         s~  t    t | _t | _|j| _|j| _|j	| _
|j| _|j| _| j| j
 | _| j| j | _|j| _t|| _|j| _t|j | _|j| _|| _|| _|| _|| _|| _|| _ | j r_| j j!nd| _"| jd | j | _#t$| j| j#d| dd| _%| j%j&j'(d| j%j&_'| jd | jd  | _)| jd | _*t$| j| j)d|| dd| _+t$| j| j*d|| d	d| _,| jddf}| jddf}t-| j%j&d
 t.| j%j&d
t/|||g| j| ji t01t23| j| j | _4t01t25t6| j| j| _7t.| j7d
t8di t.| j4d
t8di t9| j| jd dt:; |j<d| _=t>| j| jdd|| dd| _?t@ jA}	||	jBv r8tCd| | |	jB|< d S )Nr      Fz.conv1d)
input_sizeoutput_sizerO   rL   r<   z.in_proj_qkvz)r   r   rO   rP   rL   z.in_proj_baweight_loaderT)eps
group_sizenorm_before_gatedevicer   z	.out_proj)rO   input_is_parallelrP   rL   zDuplicate layer name: )Dr^   r_   r   rc   r   tp_rankrQ   linear_num_value_headsr   linear_num_key_headsr   linear_key_head_dimr   linear_value_head_dimr   key_dim	value_dimlinear_conv_kernel_dimr   rD   	layer_idxrS   
activationr   actrms_norm_epslayer_norm_epsilonrL   r}   r`   r   rP   r   num_speculative_tokensr   conv_dimr   conv1dweightdata	unsqueezeprojection_size_qkvzprojection_size_bain_proj_qkvz
in_proj_badelattrr3   r#   r   	Parameterr   onesdt_biasemptyr   A_logr0   r   r4   current_devicer   normr    out_projr   compilation_configstatic_forward_contextrl   )
r|   r}   r`   r   rP   r   rL   query_key_settingsvalue_settingsr   r~   r   r   r_      s   
	

		zQwen3NextGatedDeltaNet.__init__c                 C   s`  |  dd | j| j | j| j | j| j | j | j  f }|  dd | j| j d| j | j f }|j| }|j| }| j| j| j| j | j | j| j | j g}| j| j | j| j g}tj||dd\}}}	}
tj||dd\}}|		|	 dd| j}	|
	|
 dd| j}
|	| d| j| j }|	| d| j| j }|||	|
||fS )zQ
        Derives `query`, `key` and `value` tensors from `mixed_qkvzba`.
        Nr   r   dimr   )
rh   r   rc   r   r   r   r   r   splitreshape)r|   
mixed_qkvzmixed_banew_tensor_shape_qkvznew_tensor_shape_basplit_arg_list_qkvzsplit_arg_list_baquerykeyvaluezbar   r   r   fix_query_key_value_orderingr  sB   







z3Qwen3NextGatedDeltaNet.fix_query_key_value_orderingc                    s   |d u rdS t j| j j  j j  j j gdd\}}}t fdd||f\}}t|d jd}| | | fS )N)NNNr   r   c                    s   t | d jdS )Nl (h d) -> 1 l h dd)r   r   xr   r   r   <lambda>  s    z<Qwen3NextGatedDeltaNet.rearrange_mixed_qkv.<locals>.<lambda>r   r   )	r   r   r   rc   r   mapr   r   
contiguous)r|   	mixed_qkvr   r   r   r   r   r   rearrange_mixed_qkv  s    


	
z*Qwen3NextGatedDeltaNet.rearrange_mixed_qkvr   outputc                 C   s  | d}| |\}}| |\}}| ||\}}}	}
}}tdd |||	f\}}}	tj|||	fdd}tj|| j| j	 | j
f|j|jd}tjj||||| j |
j}|d|jd }|
d|
jd }
| ||
}||}t|d}| |\|d|< }dS )	z
        Forward pass with three parts:
        1. Input projection
        2. Core attention (custom op)
        3. Output projection
        r   c                 S   s
   t | dS )Nzl p d -> l (p d)r   r   r   r   r   r     s   
 z0Qwen3NextGatedDeltaNet.forward.<locals>.<lambda>r   r   r   r   z... h d -> ... (h d)N)rh   r   r   r   r   r   catzerosr   rc   r   r   r   opsvllmgdn_attention_corerL   r   r   r   r   r   )r|   r   r   r   projected_states_qkvzr   projected_states_bar   r   r   r   r   r   r   core_attn_out
z_shape_ogr   r   r   r     s:   



zQwen3NextGatedDeltaNet.forwardr   r   r   r   c           )      C   s  t  }|j}|du rdS t|tsJ || j }t|tsJ |j}|j}|j}	|j	}
|j
}|j}|j}|j}| j|j }|d dd}|d }|j}|j}|d| }|d| }|d| }| jj| jjd| jjd}|
dur|jdkr|jdkr|}d}n|d|}|d|}nd}|}|
durt|||| jj| j|dddf d|j |||ddd
}|jdkr|dd}t||| jj| j||||	|d		dd}n|jdkrt|||| jj| j|d|j d
d}nd}| |\}}}| |\}}}t | j!||| j"\}}|
durC|jdkr*|jdkr*|} |}!d}"d}#n!|d|} |d|}!|d|}"|d|}#nd} d}!|}"|}#|
durit#|||| |!|d
|d|jd  ||d
d\}$}%nd\}$}%|jdkr|| $ }&d|&| df< t%||||"|#|&d
|	dd
d
\}'}%|%&|j'||< n"|jdkrt#||||"|#|d
|	d|jd  |d
d
\}'}%nd\}'}%|
dur|'durt(j)d|g|$j*dd R |'j'|'j+d}(|(,d||$ |(,d||' |(-d|d|< dS |
dur|$-d|d|< dS |'-d|d|< dS )zC
        Core attention computation (called by custom op).
        Nr   r   r<   r   F)conv_state_indicesnum_accepted_tokensquery_start_locmax_query_lenvalidate_data)r   conv_stateshas_initial_statecache_indicesr  metadataT)r  r  )qkvgbetainitial_stateinplace_final_state
cu_seqlensssm_state_indicesr  use_qk_l2norm_in_kernelNN.)
r
  r  r  r  r  r  output_final_stater  
head_firstr  )
r
  r  r  r  r  r  r  r  r  r  r   ).r   attn_metadata
isinstancedictrL   r;   r  spec_query_start_locnon_spec_query_start_locspec_sequence_masksspec_token_indxnon_spec_token_indxspec_state_indices_tensornon_spec_state_indices_tensorkv_cachevirtual_engine	transposenum_actual_tokensr  r   r   r   rh   num_prefillsnum_decodesindex_selectr)   rO   r   num_spec_decodesr(   r   fused_gdn_gatingr   r   r   r   r   tor   r   r   r   r   index_copy_squeeze))r|   r   r   r   r   forward_contextr  r  r  r  r  r  r  r  r   self_kv_cache
conv_state	ssm_stater$  r  conv_weightsmixed_qkv_specmixed_qkv_non_specmixed_qkv_non_spec_T
query_speckey_spec
value_specquery_non_speckey_non_specvalue_non_specr  r  g_spec	beta_spec
g_non_specbeta_non_speccore_attn_out_speclast_recurrent_stater  core_attn_out_non_spec
merged_outr   r   r   _forward_core  sB  









z$Qwen3NextGatedDeltaNet._forward_core)NNNNrJ   )r   r   r   propertyr   r   tupler   r   r   intr   r6   r
   r	   r*   r   r_   r   r   r   r   rC  r   r   r   r~   r   r      sR    &}3
9r   c                       sh   e Zd Z				ddededB dedB dedB deddf fd	d
Zde	j
de	j
de	j
fddZ  ZS )Qwen3NextAttentionNrJ   r}   r`   r   rP   rL   r   c              	      s  t    || _|j| _t }|j| _| j| dksJ | j| | _|j| _	| j	|kr5| j	| dks4J n	|| j	 dks>J t
d| j	| | _|jpO| j| j | _| j| j | _| j| j | _| jd | _t|dd | _t|dd| _t|j| j| jd| j  | j	t|dd|| d	d
| _t| j| j |jd|| dd
| _t| j|j|j| jd| _t| j| j| jf| j||| dd| jrt|| jdni | _t| j|jd| _ t| j|jd| _!d S )Nr   r<   g      dual_chunk_attention_configattn_output_gateTqkv_biasFz	.qkv_projrN   z.o_proj)	head_sizemax_positionrope_parametersrH  z.attn)num_kv_headsr   rP   rL   )r   rH  r   )"r^   r_   r}   rQ   r   num_attention_headstotal_num_heads	num_headsnum_key_value_headstotal_num_kv_headsmaxrN  head_dimq_sizekv_sizescalinggetattrrH  rI  r   qkv_projr    o_projr+   max_position_embeddingsrM  
rotary_embr   rD   attnQwen3NextRMSNormr   q_normk_norm)r|   r}   r`   r   rP   rL   rc   r~   r   r   r_     sz   




zQwen3NextAttention.__init__	positionsr   r   c                 C   s`  |  |\}}| jrO|j| jd | j| jgdd\}}}|jd d }	|jg |	| jdR  }tj	|ddd\}
}|
j
g |	dR  }
|j
g |	dR  }n|j| j| j| jgdd\}
}}| |
d| j| jd| j| j }
| |d| j| jd| j| j }| ||
|\}
}| |
||}| jrt|}|| }| |\|d d < }d S )Nr   r   r   )r[  rI  r   rW  rX  r   r   rR  r   chunkr   ra  rV  rb  rN  r^  r_  sigmoidr\  )r|   rc  r   r   qkvr   q_gater  r  r   r
  rW   attn_outputr   r   r   r     s.    
zQwen3NextAttention.forward)NNNrJ   )r   r   r   r6   r
   r	   r*   r   r_   r   r   r   r   r   r   r~   r   rG    s2    NrG  c                	       sZ   e Zd Z	ddedededdf fddZ	dd	ejd
ejdB dejdefddZ	  Z
S )Qwen3NextDecoderLayerrJ   rK   
layer_typerL   r   Nc           
         s  t    |jj}|j}|j}|j}|j}|| _t|| _	| jdkr1t
|||||| dd| _n| jdkrDt||||| dd| _ntd| j t|dsSg n|j}	| j	|	vru|jd	kru| j	d
 |j d	krut|| dd| _nt|j|j|j|| dd| _t|j|jd| _t|j|jd| _t|dd| _| jrtj !tj"d
d
|j|j#d| _$tj !tj"d
d
|j|j#d| _%d S d S )Nlinear_attentionz.linear_attn)r`   r   rP   r   rL   full_attentionz
.self_attn)r`   r   rP   rL   zInvalid layer_type mlp_only_layersr   r<   z.mlprK   rL   )rQ   rR   rS   rP   rL   rO  layer_scaleF)r   )&r^   r_   r`   ra   r   rP   r   rj  rD   r   r   linear_attnrG  	self_attnrl   hasattrrm  rX   decoder_sparse_steprI   mlprv   rQ   rR   rS   r`  r   input_layernormpost_attention_layernormrZ  ro  r   r   r   r   r   attn_layer_scaleffn_layer_scale)
r|   rK   rj  rL   r}   r`   r   rP   r   rm  r~   r   r   r_   4  s   









zQwen3NextDecoderLayer.__init__r   residualrc  kwargsc                 K   s`  |d u r|}|  |}n|  ||\}}t|}| jdkr&| j||d n| jdkr4| j|||d ntd|}| jr]t|j	dkrR|| j
|jd d  }n|| j
|jd  }| ||\}}| |}| jrt|j	dkr|| j|jd d  }||fS t|j	t| jj	ksJ d	t|j	 d
t| jj	 || j|jd  }||fS )Nrk  )r   r   rl  )r   r   rc  zInvalid layer_typer   r   r<   zshape must be the same z, )ru  r   
empty_likerj  rp  rq  rl   ro  lenr   rw  r*  r   rv  rt  rx  )r|   r   ry  rc  rz  self_attention_outputr   r   r   r     sV   




zQwen3NextDecoderLayer.forwardr   r   )r   r   r   r   r   r_   r   r   objectr   r   r   r   r~   r   ri  3  s*    Wri  c                       s   e Zd Zdddedef fddZdejdejfd	d
Z		ddejdB dejde	dB dejdB dejf
ddZ
deeeeeef  fddZdeeeejf  dee fddZ  ZS )Qwen3NextModelrJ   rL   rK   rL   c                   s   t    jj j}|j}|j| _ | _ j| _t	| j j
| _dtf fdd}t j|| dd\| _| _| _tddg j
| _t jrUt j
 jd| _d S t | _d S )	NrL   c                    s   t  jt|  | dS )N)rj  rL   )ri  layer_typesrD   r  r}   rK   r   r   	get_layer  s
   z*Qwen3NextModel.__init__.<locals>.get_layerz.layersr  r   ry  rO  )r^   r_   r`   ra   rb   rm   r\   r}   
vocab_sizer-   rQ   embed_tokensr   rG   num_hidden_layersstart_layer	end_layerlayersrF   make_empty_intermediate_tensorsr   is_last_rankr`  r   r   rC   )r|   rK   rL   rb   rm   r  r~   r  r   r_     s*   

zQwen3NextModel.__init__	input_idsr   c                 C   s
   |  |S r   )r  r|   r  r   r   r   embed_input_ids     
zQwen3NextModel.embed_input_idsNrc  intermediate_tensorsinputs_embedsc           	      C   s   t  jr|d ur|}n| |}d }n|d usJ |d }|d }t| j| j| jD ]}||||d\}}q*t  jsAt||dS | 	||\}}|S )Nr   ry  )rc  r   ry  )r   ry  )
r   is_first_rankr  r   r  r  r  r  r5   r   )	r|   r  rc  r  r  r   ry  layerr   r   r   r   r     s(   
zQwen3NextModel.forwardc                 C   s   t j| ddd| jj| jdS )N	gate_proj	down_projup_proj)ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namerX   r\   )r   make_expert_params_mappingr}   rX   r\   r   r   r   r   get_expert_mapping  s   z!Qwen3NextModel.get_expert_mappingweightsc              	   C   s  g d}t |  }t }|  }|D ]\}}d|v rq|dr"q|dr1t||}|d u r1q|D ]8\}}	}
|	|vr=q3d|v rBq3||	|}|drR||vrRq3t|| rXq3||vr]q3|| }|j	}||||
  nn|D ]>}|\}}	}}
|	|vr{qn||	|}t|| rqn|ds|dr||vrqn||vrqn|| }|j	}|||||
|d  n-|dr||vrqt|| rq||vrt
d	| d
 q|| }t|dt}||| || q|S )N))r[  q_projr
  )r[  k_projr  )r[  v_projr  )gate_up_projr  r   )r  r  r<   zrotary_emb.inv_freqmtp.scalezmlp.expertsz.bias_bias)shard_id	expert_idz
Parameter z' not found in params_dict, skip loadingr   )r  named_parameterssetr  
startswithendswithr/   replacerE   r   loggerwarning_oncerZ  r.   add)r|   r  stacked_params_mappingparams_dictloaded_paramsexpert_params_mappingnameloaded_weight
param_nameweight_namer  paramr   mappingr  r   r   r   load_weights  s   	







zQwen3NextModel.load_weightsr  )r   r   r   r   r   r_   r   r   r  r5   r   listrE  rF  r  r   r  r  r   r   r   r~   r   r    s$    %
 ,r  c                   @   s*   e Zd ZdededdfddZdd ZdS )	QwenNextMixtureOfExpertsnum_physical_expertsnum_local_physical_expertsr   Nc                 C   sh   | j |ksJ || _|| _ || j | _| jjD ]}t|jtr1|j}||_	||_
| j|_|j  qd S r   )r  r  num_logical_expertsr\   modelr  r  rt  rI   rq   rp   ro   r{   update_expert_map)r|   r  r  r  moer   r   r    update_physical_experts_metadataw  s   
z9QwenNextMixtureOfExperts.update_physical_experts_metadatac                 C   s   g | _ g | _d }| jjD ]}t|tr$t|jtr$|j}| j|jj	 q|d u r-t
dt| j| _d| _d| _|j| _|j| _|j| _|j| _|j| _d S )Nz-No Qwen3Next layer found in the model.layers.r<   r   )expert_weights
moe_layersr  r  r  ri  rt  rI   appendr{   RuntimeErrorr|  num_moe_layersnum_expert_groupsnum_shared_expertsrn   r  rp   r  rq   r  rj   num_routed_expertsro   r\   )r|   example_moer  r   r   r   set_moe_parameters  s(   z+QwenNextMixtureOfExperts.set_moe_parameters)r   r   r   rF  r  r  r   r   r   r   r  v  s    
r  c                       sP  e Zd Zg dddgdZdddedef fd	d
ZdejdejfddZ			d%dejdB dejde
dB dejdB def
ddZedddeejejf fddZedddeeeef eeef f fddZedeeef fddZdejdejdB fddZd eeeejf  dee fd!d"Zdeeeeeef  fd#d$Z  ZS )&Qwen3NextForCausalLM)r  r  r  r  r  )r[  r  rJ   r  rK   rL   c                   s   |j j}|| _|j | _ |j}|j}|jdkrtd|j| _t 	  || _
|| _t|t|dd| _t|j|jt|dd| _t|j| _| jj| _|   d S )NallzhQwen3Next currently does not support 'all' prefix caching, please use '--mamba-cache-mode=align' insteadr  rn  lm_headr  )r`   ra   rK   r   scheduler_configmamba_cache_modeNotImplementedErrorrP   r^   r_   r}   r  rH   r  r,   r  rQ   r  r!   logits_processorr  r  )r|   rK   rL   r}   r   r  r~   r   r   r_     s2   


zQwen3NextForCausalLM.__init__r  r   c                 C   s   | j |S r   )r  r  r  r   r   r   r    s   z$Qwen3NextForCausalLM.embed_input_idsNrc  r  r  rz  c                 K   s   |  ||||}|S r   )r  )r|   r  rc  r  r  rz  r   r   r   r   r     s   zQwen3NextForCausalLM.forwardr   c                 C   s   t |jj|jjS r   r   )clsrK   r   r   r   !get_mamba_state_dtype_from_config  s   z6Qwen3NextForCausalLM.get_mamba_state_dtype_from_configc              	   C   sF   |j }|jj}|j}|jr|jjnd}t||j|j	|j
|j|j|S )Nr   )rb   r`   ra   tensor_parallel_sizer   r   r'   r   r   r   r   r   r   )r  rK   rb   ra   rc   r   r   r   r   !get_mamba_state_shape_from_config  s    
z6Qwen3NextForCausalLM.get_mamba_state_shape_from_configc                 C   s   t  S r   )r%   gated_delta_net_state_copy_func)r  r   r   r   get_mamba_state_copy_func  s   z.Qwen3NextForCausalLM.get_mamba_state_copy_funcr   c                 C   s   |  | j|S r   )r  r  )r|   r   r   r   r   compute_logits	  s   z#Qwen3NextForCausalLM.compute_logitsr  c                 C   s   t | dgd}||S )Nr  )skip_prefixes)rB   r  )r|   r  loaderr   r   r   r    s
   
z!Qwen3NextForCausalLM.load_weightsc                 C   s
   | j  S r   )r  r  r   r   r   r   r    r  z'Qwen3NextForCausalLM.get_expert_mappingr  )r   r   r   packed_modules_mappingr   r   r_   r   r   r  r5   r~  r   classmethodrE  r   r  rF  r  r$   r  r  r   r  r  r  r  r   r   r   r~   r   r    sP    		"

$&r  r   r   r   r   
layer_namer   c                 C   s&   t  }|j| }|j| |||d dS )z
    Custom op for the core attention computation.
    Only handles the convolution + recurrent attention part.
    Input/output projections are handled outside this op.
    )r   r   r   r   N)r   no_compile_layersrC  )r   r   r   r   r  r-  r|   r   r   r   r     s   

r   c                 C   r   )z&Fake implementation for torch.compile.Nr   )r   r   r   r   r  r   r   r   gdn_attention_core_fake0  s   r  )op_nameop_funcmutates_args	fake_impl	NUM_HEADSr  	threshold	BLK_HEADSc                 C   sR  t dt dt d}}}||
 t d|
 }|| | ||  | }||k }t j|| |d}t j|| |d}t j|| |d}t j|| |d}|t j|t j }t || |	kd| t dt ||   |}t |t j | }t j	| | || j
j|d t |t j}t j	|| ||j
j|d d S )Nr   r<   r   )mask)r7   
program_idarangeloadr*  float32wherelogexpstorer   
element_tyre  )r  beta_outputr   r   r   r   seq_lenr  r  r  r  i_bi_si_dhead_offoffr  	blk_A_logblk_ablk_bblk_biasr   
softplus_xblk_gblk_beta_outputr   r   r   fused_gdn_gating_kernelC  s$   "*
r         ?      4@r   r   c                 C   s|   |j \}}d}||t|df}	tjd||tj|jd}
tjd|||j|jd}t|	 |
|| |||||||ddd |
|fS )z
    Fused computation of g and beta for Gated Delta Net.
    g = -self.A_log.float().exp() * F.softplus(a.float() + self.dt_bias)
    beta_output = b.sigmoid()
    TODO maybe use torch.compile to replace this triton kernel
    r<      r   )	num_warps)	r   r8   cdivr   r   r  r   r   r   )r   r   r   r   r  r  batchrR  r  gridr  r  r   r   r   r)  g  s(   
r)  )r  r  )__doc__collections.abcr   	itertoolsr   r   einopsr   r   transformers.activationsr   vllm.compilation.decoratorsr   vllm.configr	   r
   r   r   r   vllm.distributedr   r   r   r   r   r   vllm.forward_contextr   r   vllm.loggerr   $vllm.model_executor.layers.attentionr   "vllm.model_executor.layers.fla.opsr   r   $vllm.model_executor.layers.fused_moer   $vllm.model_executor.layers.layernormr   r`  r   !vllm.model_executor.layers.linearr   r   r   r    +vllm.model_executor.layers.logits_processorr!   )vllm.model_executor.layers.mamba.abstractr"   -vllm.model_executor.layers.mamba.mamba_mixer2r#   ,vllm.model_executor.layers.mamba.mamba_utilsr$   r%   r&   r'   2vllm.model_executor.layers.mamba.ops.causal_conv1dr(   r)   'vllm.model_executor.layers.quantizationr*   +vllm.model_executor.layers.rotary_embeddingr+   3vllm.model_executor.layers.vocab_parallel_embeddingr,   r-   -vllm.model_executor.model_loader.weight_utilsr.   r/   r0   $vllm.model_executor.models.qwen2_moer1   rv    vllm.model_executor.models.utilsr2   vllm.model_executor.utilsr3   vllm.platformsr4   vllm.sequencer5   vllm.transformers_utils.configsr6   vllm.triton_utilsr7   r8   vllm.utils.torch_utilsr9   vllm.v1.attention.backendr:   #vllm.v1.attention.backends.gdn_attnr;   
interfacesr=   r>   r?   r@   rA   utilsrB   rC   rD   rE   rF   rG   rH   r   r  rE  r   KVCacheModulerI   r   rG  ri  r  r  r  r   r   r  jit	constexprr   floatr)  r   r   r   r   <module>   s    $
x   au  3
,x

	
(