o
    پi#                     @   sv  d Z ddlZddlmZmZmZ ddlZddlmZ ddlm	Z	 ddl
mZ ddlmZmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZmZ ddlmZmZmZ ddlmZ ddl m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1 ddl2m3Z3m4Z4m5Z5m6Z6 e7e8Z9e5 Z:e6 Z;G dd dej<Z=G dd de/Z>e>gZ?dS )z3Inference-only DeepSeek NextN Speculative Decoding.    N)IterableOptionalTuple)nn)PretrainedConfig)is_deepseek_nsa)get_pp_group$get_tensor_model_parallel_world_size)envs)'get_global_expert_distribution_recorder)can_cp_splitcp_all_gather_rerange_outputcp_split_and_rebuild_datais_nsa_enable_prefill_cpnsa_use_prefill_cpprepare_input_dp_with_cp_dsa)get_attention_cp_rankget_attention_cp_sizeis_dp_attention_enabled)RMSNorm)LogitsProcessor)	Fp8Config)QuantizationConfig)ParallelLMHeadVocabParallelEmbedding)ForwardBatch)!enable_nextn_moe_bf16_cast_to_fp8)DeepseekV2DecoderLayerDeepseekV3ForCausalLM)get_global_server_args)BumpAllocator
add_prefixis_cudais_npuc                       sb   e Zd Z		ddedee deddf fddZ	dd	ej	d
ej	de
dej	dej	f
ddZ  ZS )DeepseekModelNextNN configquant_configprefixreturnc              	      sT  t    t|rtdddgd}nd }|d ur%| dkr%td d }|j| _t|j|j	t
 td|d| _t|j	|jd| _t|j	|jd| _tjd	|j	 |j	d
d| _ts^tj rctj nd | _d}tryt jt jkrydt |j! }t"|d||dt||| jd| _#t$ | _%t|j	|jd| j%_&t' | _(| j(rt) | _*d S d | _*d S )NT   )is_checkpoint_fp8_serializedweight_block_sizemodelopt_fp4zSOverriding DeepseekV3ForCausalLMNextN quant config for modelopt_fp4 Deepseek model.embed_tokens)use_attn_tp_groupr(   )eps   F)biasdecoderzlayers.r   )r'   moe_quant_config_overrideis_nextnr(   
alt_stream)+super__init__r   r   get_nameloggerwarning
vocab_sizer   hidden_sizer   r!   r.   r   rms_norm_epsenormhnormr   Lineareh_proj_is_cudar
   SGLANG_NPU_USE_MULTI_STREAMgettorchcudaStreamr6   _is_npur   speculative_draft_model_path
model_pathstrnum_hidden_layersr   r3   Moduleshared_headnormr   nsa_enable_prefill_cpr   cp_size)selfr&   r'   r(   r4   
layer_name	__class__ T/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/deepseek_nextn.pyr8   ?   sb   



zDeepseekModelNextN.__init__	input_ids	positionsforward_batchinput_embedsc           	      C   s   t dtj|d ur|jn|jd}|d u r| |}n|}|jd dkr7| tj| || 	|j
jfdd}t|| jrBt||}d }t   | |||||\}}W d    n1 s_w   Y  |j s|d urw| j||\}}n| j|}t|| jrt|| j|tj }|S )Nr1   )buffer_sizedtypedevicer   )dim)r    rF   float32r_   r.   shaperB   catr?   r@   	spec_infohidden_statesr   rQ   r   r   disable_this_regionr3   forward_modeis_idlerO   rP   r   rR   rG   current_stream)	rS   rY   rZ   r[   r\   zero_allocatorrf   residual_rW   rW   rX   forward   sR   



	zDeepseekModelNextN.forwardNr%   N)__name__
__module____qualname__r   r   r   rL   r8   rF   Tensorr   rn   __classcell__rW   rW   rU   rX   r$   >   s0    Hr$   c                	       s|   e Zd Z		ddedee deddfddZe	 d	ej
d
ej
dedej
fddZdeeeej
f  f fddZ  ZS )DeepseekV3ForCausalLMNextNNr%   r&   r'   r(   r)   c                 C   s   t j|  || _t | _|| _t | _| 	d t
|| _t | _| jr.t | _t | _nd | _d | _t||td|d| _t|j|j|td|t jd| _t|| _d S )Nrv   model)r(   zmodel.shared_head.head)r'   r(   r/   )r   rN   r8   r&   r	   tp_sizer'   r   pp_group"determine_num_fused_shared_expertsr   use_nsar   rQ   r   cp_rankr   rR   r$   r!   rw   r   r<   r=   r   enable_dp_lm_headlm_headr   logits_processor)rS   r&   r'   r(   rW   rW   rX   r8      s0   


z#DeepseekV3ForCausalLMNextN.__init__rY   rZ   r[   c                 C   sZ   | j rtt|| j| j|rtt|| j| j|j |_	| 
|||}| ||| j|S rp   )rQ   r   lenrR   r{   r   r|   seq_lens_cputolistnsa_cp_metadatarw   r   r~   )rS   rY   rZ   r[   rf   rW   rW   rX   rn      s   
z"DeepseekV3ForCausalLMNextN.forwardweightsc                    s   t  j|dd d S )NT)r5   )r7   load_weights)rS   r   rU   rW   rX   r      s   z'DeepseekV3ForCausalLMNextN.load_weightsro   )rq   rr   rs   r   r   r   rL   r8   rF   no_gradrt   r   rn   r   r   r   ru   rW   rW   rU   rX   rv      s.    
"(rv   )@__doc__loggingtypingr   r   r   rF   r   transformersr   sglang.srt.configs.model_configr   sglang.srt.distributedr   r	   sglang.srt.environr
   #sglang.srt.eplb.expert_distributionr   %sglang.srt.layers.attention.nsa.utilsr   r   r   r   r   r   sglang.srt.layers.dp_attentionr   r   r   sglang.srt.layers.layernormr   "sglang.srt.layers.logits_processorr   sglang.srt.layers.quantizationr   *sglang.srt.layers.quantization.base_configr   *sglang.srt.layers.vocab_parallel_embeddingr   r   ,sglang.srt.model_executor.forward_batch_infor   'sglang.srt.models.deepseek_common.utilsr   sglang.srt.models.deepseek_v2r   r   sglang.srt.server_argsr   sglang.srt.utilsr    r!   r"   r#   	getLoggerrq   r:   rC   rI   rN   r$   rv   
EntryClassrW   rW   rW   rX   <module>   s:    
 
=