o
    پiz#                     @   s(  d Z ddlZddlmZmZmZ ddlZddlmZ ddlm	Z	 ddl
mZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddlmZmZ ddlmZm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z&m'Z' dZ(e)e*Z+G dd dej,Z-G dd dej,Z.e.gZ/dS )zSGLang BailingMoENextN model.    N)IterableOptionalTuple)nn)PretrainedConfig)$get_tensor_model_parallel_world_size)is_dp_attention_enabled)RMSNorm)ReplicatedLinear)LogitsProcessor)QuantizationConfig)ParallelLMHeadVocabParallelEmbedding)ForwardBatch)BailingMoEBlockBailingMoEForCausalLM)BailingMoELinearDecoderLayerBailingMoeV2_5ForCausalLM)WeightsMapper)get_global_server_args)BumpAllocator
add_prefixc                       sb   e Zd Z		ddedee deddf fddZ	dd	ej	d
ej	de
dej	dej	f
ddZ  ZS )BailingMoEModelNextNN configquant_configprefixreturnc              	      sH  t    d| _d| _d| _d| _|j| _d|_|d ur)| dkr)t	
d d }|j| _t|j|jt  td|d| _t|j|jd| _t|j|jd| _td	|j |jd
|td|j d|d| _t|don|jdk| _| jrd|_t||ddtd|j |d| _nt|d|td|d| _t | _ t|j|jd| _!d S )N   r   Tmodelopt_fp4zSOverriding DeepseekV3ForCausalLMNextN quant config for modelopt_fp4 Deepseek model.word_embeddings)	enable_tpr   )eps   Fzlayers.z.eh_proj)biasr   r   
model_typebailing_hybrid)r   layer_idis_nextnr   decoder)r   r   )"super__init__layer_group_sizestart_layer	end_layertotal_num_layers
vocab_sizefor_nextn_modelget_nameloggerwarningr   hidden_sizer   r   r    r	   rms_norm_epsenormhnormr
   num_hidden_layerseh_projhasattrr%   	is_hybridattention_typer   r)   r   r   Moduleshared_headfinal_layernormselfr   r   r   	__class__ W/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/bailing_moe_nextn.pyr+   6   s^   
	

zBailingMoEModelNextN.__init__	input_ids	positionsforward_batchinput_embedsc           
   	   C   s   |d u r
|  |}n|}|jd dkr/| tj| || |jj	| jj
jfdd\}}d }| jrV|j}t| jd |jrAdnd tj|d}	| j|||||	d\}}n
| ||||\}}|j sx|d urs| ||\}}|S | |}|S )Nr   )dimr#   r   )buffer_sizedtypedevice)hidden_statesrH   rI   residualzero_allocator)r    shaper:   torchcatr7   r8   	spec_inforP   toweightrN   r<   rO   r   r/   can_run_tbofloat32r)   forward_modeis_idler@   )
rB   rG   rH   rI   rJ   rP   _rQ   rO   rR   rE   rE   rF   forwardv   sX   

zBailingMoEModelNextN.forwardNr   N)__name__
__module____qualname__r   r   r   strr+   rT   Tensorr   r^   __classcell__rE   rE   rC   rF   r   5   s0    Er   c                	   @   s   e Zd ZddgddgdZeddidZ			
ddedee de	dd	fddZ
e dejdejdedejfddZdd Zdeee	ejf  fddZdddZd	S ) BailingMoeForCausalLMNextNq_a_projkv_a_proj_with_mqa	gate_projup_proj)fused_qkv_a_proj_with_mqagate_up_projzattention.densezattention.o_proj)orig_to_new_substrNr   r   r   r   r   c                 C   s   t j|  || _t | _|| _t| dr| d t	||t
d|d| _t|j|j|t
d|t jd| _t|| _t| jdrP|jdkrPtj| _tj| _d S tj| _tj| _d S )	N"determine_num_fused_shared_expertsrg   model)r   zmodel.shared_head.head)r   r   use_attn_tp_groupr%   r&   )r   r>   r+   r   r   tp_sizer   r;   ro   r   r   rp   r   r0   r5   r   enable_dp_lm_headlm_headr   logits_processorr%   r   load_weightsbase_load_weights_funcpost_load_weightspost_load_weights_funcr   rA   rE   rE   rF   r+      s,   


z#BailingMoeForCausalLMNextN.__init__rG   rH   rI   c                 C   s    |  |||}| ||| j|S r`   )rp   ru   rt   )rB   rG   rH   rI   rP   rE   rE   rF   r^      s   
z"BailingMoeForCausalLMNextN.forwardc                 C   s8   | j j`| j`|| j j_|| j_tj  tj  dS )zUsed by the eagle_worker.N)rp   r    rX   rt   rT   cudaempty_cachesynchronize)rB   embedheadrE   rE   rF   set_embed_and_head   s   

z-BailingMoeForCausalLMNextN.set_embed_and_headweightsc                 C   s   | j | |dd d S )NT)r(   )rw   )rB   r   rE   rE   rF   rv         z'BailingMoeForCausalLMNextN.load_weightsFc                 C   s   | j | ||d d S )N)r(   weight_names)ry   )rB   r(   r   rE   rE   rF   rx      r   z,BailingMoeForCausalLMNextN.post_load_weightsr_   )FN)ra   rb   rc   packed_modules_mappingr   hf_to_sglang_mapperr   r   r   rd   r+   rT   no_gradre   r   r^   r   r   r   rv   rx   rE   rE   rE   rF   rg      s@    	
 	rg   )0__doc__loggingtypingr   r   r   rT   r   transformersr   sglang.srt.distributedr   sglang.srt.layers.dp_attentionr   sglang.srt.layers.layernormr	   sglang.srt.layers.linearr
   "sglang.srt.layers.logits_processorr   *sglang.srt.layers.quantization.base_configr   *sglang.srt.layers.vocab_parallel_embeddingr   r   ,sglang.srt.model_executor.forward_batch_infor   sglang.srt.models.bailing_moer   r   $sglang.srt.models.bailing_moe_linearr   r   sglang.srt.models.utilsr   sglang.srt.server_argsr   sglang.srt.utilsr   r   
LoraConfig	getLoggerra   r3   r>   r   rg   
EntryClassrE   rE   rE   rF   <module>   s0   
|
I