o
    پiO                     @   s  d Z ddlZddlmZmZmZmZ ddlZddlmZ ddl	m
Z
 ddlmZmZ ddlmZ ddlmZmZ dd	lmZmZmZ dd
lmZ ddlmZmZmZ ddlmZ ddlm Z  ddl!m"Z"m#Z# ddl$m%Z% ddl&m'Z'm(Z( ddl)m*Z*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0m1Z1 ddl2m3Z3m4Z4m5Z5 ddl6m7Z7 ddl8m9Z9m:Z:m;Z; e<e=Z>e: Z?G dd dej@ZAG dd dej@ZBG dd dej@ZCG dd dej@ZDG dd  d ej@ZEeEZFdS )!z<
SGLang SDARModelLM (block diffusion / dLLM-style forward).
    N)IterableOptionalTupleUnion)nn)PretrainedConfig)get_pp_group$get_tensor_model_parallel_world_size)
SiluAndMul)LayerCommunicatorLayerScatterModes)get_attention_tp_rankget_attention_tp_sizeis_dp_attention_enabled)RMSNorm)MergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)AttentionTypeRadixAttention)get_rope)PPMissingLayerget_layer_id)ParallelLMHeadVocabParallelEmbedding)ForwardBatchPPProxyTensors)default_weight_loadermaybe_remap_kv_scale_name)apply_qk_normcreate_fused_set_kv_buffer_argenable_fused_set_kv_buffer)get_global_server_args)
add_prefixis_cudamake_layersc                       sH   e Zd Z			ddededef fddZdd
ejdefddZ	  Z
S )SDARMLPNT configreduce_resultsprefixc              	      sZ   t    t|j|jgd d|td|d| _t|j|jd||td|d| _t	 | _
d S )N   Fgate_up_proj)biasquant_configr,   	down_proj)r/   r+   r0   r,   )super__init__r   hidden_sizeintermediate_sizer%   r.   r   r1   r
   act_fn)selfr*   r0   r+   r,   	__class__ J/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/sdar.pyr3   6   s"   

zSDARMLP.__init__Fhidden_statesuse_reduce_scatterc                 C   s.   |  |\}}| |}| j||d\}}|S )N)skip_all_reduce)r.   r6   r1   )r7   r<   r=   gate_up_r:   r:   r;   forwardO   s   

zSDARMLP.forward)NTr)   )F)__name__
__module____qualname__r   boolstrr3   torchTensorrA   __classcell__r:   r:   r8   r;   r(   5   s    r(   c                       sf   e Zd Z				ddededededeej	j
 f
 fd	d
Zdd ZdejdejdefddZ  ZS )SDARAttentionNTr)   r*   layer_idr+   r,   
alt_streamc                    s  t    || _|j| _|j| _t | _t }t	 }| j| dks#J | j| | _
|j| _| j|kr<| j| dks;J n	|| j dksEJ td| j| | _t|d| j| j | _| j
| j | _| j| j | _| jd | _t| j| j| j| jt|dd|||td|d	| _t| j| j | jt|dd||||td	|d
| _t| j|jd| _t| j|jd| _t|dd}	t|dd }
t|dd}| j| _t| j| j||	|
d| _t | j
| j| j| j|t!j"td|d| _#|| _$d S )Nr      head_dimg      attention_biasFqkv_proj)r/   r0   tp_ranktp_sizer,   o_proj)r/   r0   r+   rQ   rR   r,   )eps
rope_thetag     @rope_scalingmax_position_embeddingsi   )
rotary_dimmax_positionbaserV   attn)num_kv_headsrK   	attn_typer,   )%r2   r3   rK   r4   num_attention_headstotal_num_headsr	   rR   r   r   	num_headsnum_key_value_headstotal_num_kv_headsmaxr\   getattrrN   q_sizekv_sizescaler   r%   rP   r   rS   r   rms_norm_epsq_normk_normrX   r   
rotary_embr   r   ENCODER_ONLYr[   rL   )r7   r*   rK   r0   r+   r,   rL   attn_tp_rankattn_tp_sizerU   rV   max_posr8   r:   r;   r3   Y   s   
	





	zSDARAttention.__init__c           	   	   C   s   |  |\}}|j| j| j| jgdd\}}}t||| j| j| j| jd\}}| j	|||t
|r8t|| j|dnd d\}}|||fS )Ndimqkri   rj   rN   rL   valuelayerforward_batchfused_set_kv_buffer_arg)rP   splitre   rf   r!   ri   rj   rN   rL   rk   r#   r"   r[   )	r7   	positionsr<   ry   qkvr@   rt   ru   vr:   r:   r;   forward_prepare_native   s.    


z$SDARAttention.forward_prepare_nativer}   r<   ry   c              	   C   s   t  jd ur
| }| |\}}|j| j| j| jgdd\}}}t||| j| j	| j
| jd\}}| j|||t|rBt|| j|dnd d\}}t  jd urZ|tj}|tj}| j||||t| d}	| |	\}
}|
S )Nrp   rq   rs   rv   rz   )save_kv_cache)r$   rl_on_policy_targetbfloat16rP   r|   re   rf   r!   ri   rj   rN   rL   rk   r#   r"   r[   torG   rS   )r7   r}   r<   ry   r~   r@   rt   ru   r   context_layeroutr:   r:   r;   rA      sH    

zSDARAttention.forward)NTr)   N)rB   rC   rD   r   intrE   rF   r   rG   cudaStreamr3   r   rH   r   rA   rI   r:   r:   r8   r;   rJ   X   s0    
WrJ   c                       sz   e Zd Z			ddededee dedeej	j
 f
 fdd	Zd
ejdejdedeej deejejf f
ddZ  ZS )	SDARBlockNr)   r*   rK   r0   r,   rL   c                    s   t    |j| _|| _t jd urttjdtjddni }t	| jfd|j
i|| _t	| jfd|j
i|| _t|||dtd||d| _t||dtd|d| _tj||jdddd	| _t| j| j| jdd
| _d S )NTweight_dtypecast_x_before_out_muloverride_orig_dtypefp32_residualrT   F	self_attn)rK   r*   r0   r+   r,   rL   mlp)r*   r0   r+   r,   )rK   
num_layersis_layer_sparseis_previous_layer_sparseis_next_layer_sparse)layer_scatter_modesinput_layernormpost_attention_layernormallow_reduce_scatter)r2   r3   r4   rK   r$   r   dictrG   float32r   rh   r   r   rJ   r%   r   r(   r   r   init_newnum_hidden_layersr   r   layer_communicator)r7   r*   rK   r0   r,   rL   norm_kwargsr8   r:   r;   r3      sj   
	
	zSDARBlock.__init__r}   r<   ry   residualreturnc                 C   s|   | j |||\}}|jd dkr| j|||d}| j |||\}}| j |}| j||d}| j |||\}}||fS )Nr   )r}   r<   ry   )r=   )r   prepare_attnshaper   prepare_mlpshould_use_reduce_scatterr   postprocess_layer)r7   r}   r<   ry   r   r=   r:   r:   r;   rA   8  s0   zSDARBlock.forwardNr)   N)rB   rC   rD   r   r   r   r   rF   rG   r   r   r3   rH   r   r   rA   rI   r:   r:   r8   r;   r      s4    
;r   c                       s~   e Zd Z			ddedee dedeejj	 f fddZ
		dd	ejd
ejdedejdee deejef fddZ  ZS )	SDARModelNr)   r*   r0   r,   rL   c                    s   t    | _j| _j| _t | _| jjr)t	| j| jt
 td|d| _nt | _tj fdd| jj| jjtd|d\| _| _| _| jjrot jd ur^ttjdtjddni }t| jfd	ji|| _d S tdd
| _d S )Nembed_tokensr0   use_attn_tp_groupr,   c                    s   t | | dS )N)rK   r*   r0   r,   rL   )r   )idxr,   rL   r*   r0   r:   r;   <lambda>y  s    z$SDARModel.__init__.<locals>.<lambda>layers)pp_rankpp_sizer,   Tr   rT   )return_tuple)r2   r3   r*   
vocab_sizer4   	embed_dimr   pp_groupis_first_rankr   r   r%   r   r   r'   r   rank_in_group
world_sizer   start_layer	end_layeris_last_rankr$   r   r   rG   r   r   rh   norm)r7   r*   r0   r,   rL   r   r8   r   r;   r3   _  sB   


zSDARModel.__init__	input_idsr}   ry   input_embedspp_proxy_tensorsr   c           
      C   s   | j jr|d u r| |n|}d }n|d usJ |d }|dd }t| j| jD ]}| j| }	|	||||\}}q)| j jsEt	||dS |j
 sR| ||\}}|S )Nr<   r   )r<   r   )r   r   r   getranger   r   r   r   r   forward_modeis_idler   )
r7   r   r}   ry   r   r   r<   r   irx   r:   r:   r;   rA     s&   


zSDARModel.forwardr   NN)rB   rC   rD   r   r   r   rF   rG   r   r   r3   rH   r   r   r   rA   rI   r:   r:   r8   r;   r   ^  s8    
9r   c                       s   e Zd Z		ddedee def fddZedd	 Z	ed
d Z
e 		ddejdejdedeej dee dejfddZdeeeejf  fddZ  ZS )SDARForCausalLMNr)   r*   r0   r,   c                    s   t    t | _| jjdksJ d| jj d|| _|| _tr&tj	
 nd }t||tdd|d| _| jjr`t }| jjdkrN|jrN|dkrN| jj| _nt|j|j|t jtd|d| _nt | _t|d	d
| _d S )NrM   z?SDARMoeForCausalLM does not support pipeline parallel (pp_size=z). Please set pp_size=1.modelr)   )r0   r,   rL   lm_headr   T)return_full_logits)r2   r3   r   r   r   r*   r0   _is_cudarG   r   r   r   r%   r   r   r	   tie_word_embeddingsr   r   r   r   r4   r$   enable_dp_lm_headr   r   logits_processor)r7   r*   r0   r,   rL   rR   r8   r:   r;   r3     s<   

zSDARForCausalLM.__init__c                 C      | j jS N)r   r   r7   r:   r:   r;   r        zSDARForCausalLM.start_layerc                 C   r   r   )r   r   r   r:   r:   r;   r     r   zSDARForCausalLM.end_layerr   r}   ry   r   r   r   c                 C   s2   | j |||||d}| jjr| ||| j|S |S )N)r   r}   ry   r   r   )r   r   r   r   r   )r7   r   r}   ry   r   r   r<   r:   r:   r;   rA     s   	
zSDARForCausalLM.forwardweightsc                 C   s  g d}t |  }|D ]\}}|ds)|ds$|ds$|dr)t|d}|dkrD| jjrD| jjrD|d }t|d	t	}||| t
|}|d ur_t| jd
r_|| jjk s^|| jjkr_qd|v sgd|v rhqd|v spd|v rqq|dr{||vr{qd|v rt||}|d u rq|D ](\}	}
}|
|vrq||
|	}|dr||vrq|| }|j}||||  n)|dr||vrq|| v r|| }t|d	t	}||| qtd| d qd S )N))rP   q_projrt   )rP   k_projru   )rP   v_projr   )r.   	gate_projr   )r.   up_projrM   zmodel.zlayers.zembed_tokens.znorm.r   zmodel.embed_tokens.weightzlm_head.weightweight_loaderr   zrotary_emb.inv_freq	projectorzrotary_emb.cos_cachedzrotary_emb.sin_cachedzmodel.vision_towerrg   z.biasz
Parameter z not found in params_dict)r   named_parameters
startswithr%   r   r   r*   r   rd   r   r   hasattrr   r   r   r    replaceendswithr   keysloggerwarning)r7   r   stacked_params_mappingparams_dictnameloaded_weightparamr   rK   
param_nameweight_nameshard_idr:   r:   r;   load_weights  sp   




zSDARForCausalLM.load_weights)Nr)   r   )rB   rC   rD   r   r   r   rF   r3   propertyr   r   rG   no_gradrH   r   r   rA   r   r   r   rI   r:   r:   r8   r;   r     s>    -

$r   )G__doc__loggingtypingr   r   r   r   rG   r   transformersr   sglang.srt.distributedr   r	   sglang.srt.layers.activationr
   sglang.srt.layers.communicatorr   r   sglang.srt.layers.dp_attentionr   r   r   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r   r   "sglang.srt.layers.logits_processorr   *sglang.srt.layers.quantization.base_configr   !sglang.srt.layers.radix_attentionr   r   "sglang.srt.layers.rotary_embeddingr   sglang.srt.layers.utilsr   r   *sglang.srt.layers.vocab_parallel_embeddingr   r   ,sglang.srt.model_executor.forward_batch_infor   r   $sglang.srt.model_loader.weight_utilsr   r    sglang.srt.models.utilsr!   r"   r#   sglang.srt.server_argsr$   sglang.srt.utilsr%   r&   r'   	getLoggerrB   r   r   Moduler(   rJ   r   r   r   
EntryClassr:   r:   r:   r;   <module>   sB   
# %bW 