o
    پi$                     @   s   d Z ddlZddlmZmZmZ ddlZddlmZ ddlm	Z	 ddl
mZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddlmZ eeZ G dd deZ!e!gZ"dS )z2Inference-only Qwen3Next MTP Speculative Decoding.    N)IterableOptionalTuple)nn)PretrainedConfig)get_pp_group$get_tensor_model_parallel_world_size)GemmaRMSNorm)LogitsProcessor)QuantizationConfig)ParallelLMHead)ForwardBatch)Qwen3NextForCausalLMQwen3NextModel)get_global_server_args)
add_prefixc                       s   e Zd Z		ddedee deddfddZe	 	dd	ej
d
ej
dedeej
 fddZ	ddeeeej
f  def fddZ  ZS )Qwen3NextForCausalLMMTPN configquant_configprefixreturnc                 C   s   t j|  || _t | _|| _t | _t j	d|j
 |j
dd| _t}||j
|j| _||j
|j| _d|_d|_t||td|d| _t|j|j
|td|t jd| _t|| _d S )	N   F)bias   model)r   zmodel.shared_head.head)r   r   use_attn_tp_group)r   Module__init__r   r   tp_sizer   r   pp_groupLinearhidden_sizefcr	   rms_norm_epspre_fc_norm_embeddingpre_fc_norm_hiddennum_hidden_layersfull_attention_intervalr   r   r   r   
vocab_sizer   enable_dp_lm_headlm_headr
   logits_processor)selfr   r   r   RMSNorm_cls r/   T/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/qwen3_next_mtp.pyr   '   s0   z Qwen3NextForCausalLMMTP.__init__	input_ids	positionsforward_batchinput_embedsc                 K   st   |d u r
| j |}|jj}|j s| |}| |}| t	j
||fdd}|  ||||}| ||| j|S )N)dim)r   embed_tokens	spec_infohidden_statesforward_modeis_idler%   r&   r#   torchcatr,   r+   )r-   r1   r2   r3   r4   kwargsr9   r/   r/   r0   forwardL   s    	



zQwen3NextForCausalLMMTP.forwardFweightsis_mtpc                    s   t  j|dd d S )NT)rA   )superload_weights)r-   r@   rA   	__class__r/   r0   rC   j   s   z$Qwen3NextForCausalLMMTP.load_weights)Nr   )N)F)__name__
__module____qualname__r   r   r   strr   r<   no_gradTensorr   r?   r   r   boolrC   __classcell__r/   r/   rD   r0   r   %   s:    
%r   )#__doc__loggingtypingr   r   r   r<   r   transformersr   sglang.srt.distributedr   r   sglang.srt.layers.layernormr	   "sglang.srt.layers.logits_processorr
   *sglang.srt.layers.quantization.base_configr   *sglang.srt.layers.vocab_parallel_embeddingr   ,sglang.srt.model_executor.forward_batch_infor   sglang.srt.models.qwen3_nextr   r   sglang.srt.server_argsr   sglang.srt.utilsr   	getLoggerrF   loggerr   
EntryClassr/   r/   r/   r0   <module>   s$   

K