o
    پi2                     @   s   d Z ddlZddlmZmZmZ ddlZddlmZ ddlm	Z	 ddl
mZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ eeZG dd dej Z!e!gZ"dS )z!Inference-only Qwen3_5 MTP model.    N)IterableOptionalTuple)nn)PretrainedConfig)get_pp_group$get_tensor_model_parallel_world_size)GemmaRMSNorm)LogitsProcessor)FusedMoE)ParallelLMHead)ForwardBatch)default_weight_loader)Qwen3_5ForCausalLM)
add_prefixc                   @   s   e Zd Z		ddededdfddZdd	 Zd
d Ze	 	ddej
dej
dedeej
 fddZ	ddeeeej
f  defddZdS )Qwen3_5ForCausalLMMTPN configprefixreturnc                 C   s   t j|  t|d| _| jr|j}|r| dkrd }|| _t | _	|| _
t | _t jd|j |jdd| _t}||j|j| _||j|j| _d|_d|_t||td|d| _t jru|jrg| jj| _nt|j|j|td	|d
| _t|| _ d S )Ntext_configmodelopt_fp4   F)bias   mtp)r   lm_head)quant_configr   )!r   Module__init__hasattris_multimodalr   get_namer   r   tp_sizer   r   pp_groupLinearhidden_sizefcr	   rms_norm_epspre_fc_norm_embeddingpre_fc_norm_hiddennum_hidden_layersfull_attention_intervalr   r   modelis_last_ranktie_word_embeddingsembed_tokensr   r   
vocab_sizer
   logits_processor)selfr   r   r   RMSNorm_cls r5   Q/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/qwen3_5_mtp.pyr   '   sB   zQwen3_5ForCausalLMMTP.__init__c                 C   s   | j jj| jjfS N)r-   r0   weightr   )r3   r5   r5   r6   get_embed_and_headW   s   z(Qwen3_5ForCausalLMMTP.get_embed_and_headc                 C   s@   | j j`| jjs| j`|| j j_|| j_tj  tj	  d S r7   )
r-   r0   r8   r   r/   r   torchcudaempty_cachesynchronize)r3   embedheadr5   r5   r6   set_embed_and_headZ   s   

z(Qwen3_5ForCausalLMMTP.set_embed_and_head	input_ids	positionsforward_batchinput_embedsc                 K   s   |d u sJ |j }|j r1| r1|j s1|d usJ t|d d | j|d 	dg}|d u r;| j|}|j
j}|j sN| |}| |}tj||gdd}| |}| ||||}| ||| j|S )Nr   dim)mm_input_embedsforward_mode	is_extendcontains_mm_inputsis_draft_extendr:   catr-   r0   	unsqueeze	spec_infohidden_statesis_idler)   r*   r'   r2   r   )r3   rA   rB   rC   rD   kwargsrP   r5   r5   r6   forwardd   s:   	 




zQwen3_5ForCausalLMMTP.forwardFweightsis_mtpc              
   C   s  g d}t | jdd }|d urtjddd|d}ng }d}d}d	d
g}dtdtdtjdtdtf
dd}	t| 	 }
t
 }|D ]\}}d|v rHq>d|vrMq>|drd|dd}|dd}|dd}d|v rn|dd}|D ]C\}}}d|v s}d|v rd}|}||vrqpd|v rqp|||}||r||
vrqp||
vrqp|
| }t |d t}|||| |} nd}|D ]h}|\}}}}||vrqd}|||}|r|d urd|v r|jd!d"d#\}}|	||
|d$| |	||
|d%| n,|	||
||| n#||r||
vrq||
vr nB|
| }|j}||||||d& |} n.|r%q>||r1||
vr1q>||
v rF|
| }t |d t}||| n	td'| d( || q>|S ))N))qkv_projq_projq)rV   k_projk)rV   v_projv)gate_up_proj	gate_projr   )r]   up_projr   num_expertsr^   	down_projr_   )ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namer`   )
z.bias_biasz.k_scale_k_scalez.v_scale_v_scalez.weight_scale_weight_scalez.input_scale_input_scaleF)zexperts.w13_weightexperts.gate_up_projr   w1)zexperts.w2_weightexperts.down_projr   w2nameparams_dictloaded_weightshard_idc           	      S   s8   ||  }|j }t|D ]}|| }|||| || qdS )NT)weight_loaderrange)	rn   ro   rp   rq   r`   paramrr   	expert_idcurr_expert_weightr5   r5   r6   load_fused_expert_weights   s   zEQwen3_5ForCausalLMMTP.load_weights.<locals>.load_fused_expert_weightszrotary_emb.inv_freqr   zmtp.zmodel.zmodel.fcr'   zmodel.pre_fcpre_fcz.self_attn.z
.self_attnr   rj   rl   Tzmlp.expertsrr   r   rF   rk   w3)rq   ru   z
Parameter z' not found in params_dict, skip loading)getattrr   r   make_expert_params_mappingstrdictr:   Tensorintnamed_parametersset
startswithreplaceendswithr   chunkrr   loggerwarning_onceadd)r3   rT   rU   stacked_params_mappingr`   expert_params_mappingignore_suffixesis_fused_expertfused_expert_params_mappingrw   ro   loaded_paramsrn   rp   
param_nameweight_namerq   name_mappedrt   rr   is_expert_weightmappingru   	loaded_w1	loaded_w3r5   r5   r6   load_weights   s   


	




z"Qwen3_5ForCausalLMMTP.load_weights)Nr   r7   )F)__name__
__module____qualname__r   r}   r   r9   r@   r:   no_gradr   r   r   rS   r   r   boolr   r5   r5   r5   r6   r   %   s:    
0
,r   )#__doc__loggingtypingr   r   r   r:   r   transformersr   sglang.srt.distributedr   r   sglang.srt.layers.layernormr	   "sglang.srt.layers.logits_processorr
   ,sglang.srt.layers.moe.fused_moe_triton.layerr   *sglang.srt.layers.vocab_parallel_embeddingr   ,sglang.srt.model_executor.forward_batch_infor   $sglang.srt.model_loader.weight_utilsr   sglang.srt.models.qwen3_5r   sglang.srt.utilsr   	getLoggerr   r   r   r   
EntryClassr5   r5   r5   r6   <module>   s(   
  
8