o
    پiUN                     @   s  d Z ddlZddlmZmZmZmZmZmZ ddl	Z	ddl	m
Z
 ddlmZ ddlmZmZ ddlmZmZ ddlmZmZmZ dd	lmZ dd
lmZmZmZ ddlmZ ddlm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z*m+Z+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1m2Z2m3Z3m4Z4m5Z5 ddl6m7Z7 e4 Z8e9e:Z;G dd de
j<Z=da>dd Z?G dd de
j<Z@G dd de
j<ZAG dd de
j<ZBG dd  d e.ZCeCgZDdS )!z?Inference-only LLaMA model compatible with HuggingFace weights.    N)AnyDictListOptionalTupleUnion)nn)Llama4TextConfig)$get_tensor_model_parallel_world_size tensor_model_parallel_all_reduce)LayerCommunicatorLayerScatterModes)get_attention_tp_rankget_attention_tp_sizeis_dp_attention_enabled)RMSNorm)QKVParallelLinearReplicatedLinearRowParallelLinear)FusedMoE)TopK)QuantizationConfig)RadixAttention)get_rope)VocabParallelEmbedding)ForwardBatchForwardModePPProxyTensors)LlamaForCausalLMLlamaMLP)
add_prefix	fast_topkget_compiler_backendis_cudamake_layers)get_current_device_stream_fastc                       s   e Zd Zejde dedejdejdede	de
ejejf f
dd	Z	
	ddededee def fddZ	ddede	fddZdefddZdd Zdd Z  ZS ) 	Llama4MoETdynamicbackendhidden_statesgating_outputtopkrenormalizereturnc                 C   sF   t ||dd\}}t| | j}|d|j|tj	fS )Ndim)
r!   torchsigmoidfloattodtypeviewreshapeshapeint32)r*   r+   r,   r-   router_scores_aKrouter_indices_aK r=   L/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/llama4.pycustom_routing_functionF   s   
z!Llama4MoE.custom_routing_functionN configlayer_idquant_configprefixc                    s   t    t | _|j| _t | _|j	}t
|j|jdd td|d| _t| jdtjd| _t|j|j||d|dtd|d| _t|j|d|td	|dd
| _d S )NFrouter)biasrC   rD   )top_kr-   r?   Texperts)num_expertshidden_sizeintermediate_sizerB   reduce_resultsrC   apply_router_weight_on_inputrD   silushared_expert)rJ   rK   
hidden_actrC   rD   rL   )super__init__r
   tp_sizenum_experts_per_tokrG   r2   get_device_moduledevice_modulerK   r   rJ   num_local_expertsr    rE   r   r&   r?   r,   r   rH   r   rO   )selfrA   rB   rC   rD   intermediate_size_moe	__class__r=   r>   rR   W   sF   

zLlama4MoE.__init__Fforward_batchuse_reduce_scatterc                 C   s4   |  ||j\}}|| }| jdkr|st|}|S )N   )_forward_coreforward_moderS   r   )rX   r*   r\   r]   
shared_out
routed_outout_aDr=   r=   r>   forward   s   zLlama4MoE.forwardr`   c                 C   s   t r| |S | |S N)_is_cuda#_forward_core_shared_routed_overlap_forward_core_normal)rX   r*   r`   r=   r=   r>   r_      s   

zLlama4MoE._forward_corec                 C   s8   |  |\}}| |}| ||}| ||}||fS re   )rE   rO   r,   rH   )rX   r*   router_logits_ra   topk_outputrb   r=   r=   r>   rh      s
   
zLlama4MoE._forward_core_normalc                 C   s   t | j}|t  | |}| j| | |\}}| ||}| ||}W d    n1 s4w   Y  t | ||fS re   )	_get_or_create_alt_streamrV   wait_streamr%   rO   streamrE   r,   rH   )rX   r*   
alt_streamra   ri   rj   rk   rb   r=   r=   r>   rg      s   

z-Llama4MoE._forward_core_shared_routed_overlapNr@   )F)__name__
__module____qualname__r2   compiler"   staticmethodTensorintboolr   r?   r	   r   r   strrR   r   rd   r   r_   rh   rg   __classcell__r=   r=   rZ   r>   r&   D   sD    3
r&   c                 C   s   t d u r|  a t S re   )_alt_streamStream)rV   r=   r=   r>   rl      s   rl   c                       s   e Zd Z							d dededed	ed
ededeeee	f  dedee
 dedededdf fddZdejdejfddZejde ddd ZdejdejdedejfddZ  ZS )!Llama4Attention'  N    Fr@   rA   rB   rJ   	num_headsnum_kv_heads
rope_thetarope_scalingmax_position_embeddingsrC   rF   bias_o_projrD   r.   c                    sL  t    || _|| _|d d dk| _|jo| j| _t }t }|| _| j| dks-J | j| | _	|| _
| j
|krE| j
| dksDJ n	|| j
 dksNJ td| j
| | _|j| _| j	| j | _| j| j | _| jd | _|j| _|j| _|j| _|| _|| _| j	| j | _| jrt| j|jddnd | _|	}|	}|	rt|	dr|	jrtd||	jv rd }td	||	jv rd }t|| j| j| j
|
|td
|||d	| _t| j| j |||td	|||dd| _ d}|	o|	! dk}|r|j"dv rd}| jrt#| j| j|t$||dkr|nd |dnd | _%t&| j	| j| j| j|td|| jd| _'d S )Nr^      r   g      F)rJ   eps
has_weightignoreq_projo_projqkv_proj)	rJ   	head_sizetotal_num_headstotal_num_kv_headsrF   rC   rD   tp_rankrS   )
input_sizeoutput_sizerF   rC   rD   r   rS   rL   Tgguf)llamallama4default)
rotary_dimmax_positionbaser   is_neox_styleattn)r   rB   rD   	use_irope)(rQ   rR   rB   rJ   use_ropeuse_qk_normr   r   r   r   r   maxr   head_dimq_sizekv_sizescalingattn_temperature_tuningfloor_scale
attn_scaler   r   n_repr   rms_norm_epsqk_normhasattrr   r    r   r   r   r   get_name
model_typer   rw   
rotary_embr   r   )rX   rA   rB   rJ   r   r   r   r   r   rC   rF   r   rD   attn_tp_rankattn_tp_sizeqkv_quant_configo_quant_configr   is_ggufrZ   r=   r>   rR      s   




	zLlama4Attention.__init__	positionsc                 C   s6   t |d | j }t |d | j d }|dS )Ng      ?r/   )r2   floorr   logr   	unsqueeze)rX   r   r   r   r=   r=   r>   _get_attn_scale5  s   
zLlama4Attention._get_attn_scaleTr'   c                 C   s   |  |}|| |jS re   )r   r5   r6   )rX   r   qr   r=   r=   r>   _mul_attn_scale:  s   
zLlama4Attention._mul_attn_scaler*   r\   c                 C   s  |  |\}}|j| j| j | jgdd\}}| jd ur6|j| j| jgdd\}}	| |||	\}
}~~	~
~| jd urY|d| j 	 }| |
tj	}|d| j| j }|j| j| jgdd\}}| jrs| jss| j||d}| ||||}| |\}}|S )Nr/   r0   )r   r   )r   splitr   r   r   r   r8   r   
contiguousbfloat16r5   r2   r   r   r   r   r   )rX   r   r*   r\   qkvrj   qkvq_viewk_viewq_out_unusedk_out_unusedr   kattn_outputoutputr=   r=   r>   rd   ?  s     

zLlama4Attention.forward)r~   Nr   NFFr@   )rq   rr   rs   r	   rw   r4   r   r   ry   r   r   rx   rR   r2   rv   r   rt   r"   r   r   rd   rz   r=   r=   rZ   r>   r}      s^    		
s
r}   c                       s   e Zd Z			ddededee def fdd	Zded
e	fddZ
d
efddZdejdejdedeej d
eejejf f
ddZ  ZS )Llama4DecoderLayerr   Nr@   rA   rB   rC   rD   c                    s(  t    || _|j| _|j}|j}|j}t | _t	 | _
t||| j|j|j||||ddtd|d| _|| _| |}| |d }	| |d }
|rYt|||td|d| _nt| j|jd|td|d| _t|j|jd	| _t|j|jd	| _tj||j||	|
d
| _t| j| j| jdd| _d S )NF	self_attn)rA   rB   rJ   r   r   r   r   r   rC   rF   r   rD   r^   feed_forwardrA   rB   rC   rD   rN   )rJ   rK   rP   rC   rD   r   )rB   
num_layersis_layer_sparseis_previous_layer_sparseis_next_layer_sparseT)layer_scatter_modesinput_layernormpost_attention_layernormallow_reduce_scatter) rQ   rR   rB   rJ   r   r   r   r   r   r   r   r}   num_attention_headsnum_key_value_headsr    r   rA   _is_moe_layerr&   r   r   intermediate_size_mlpr   r   r   r   r   init_newnum_hidden_layersr   r   layer_communicator)rX   rA   rB   rC   rD   r   r   r   is_moe_layeris_previous_moe_layeris_next_moe_layerrZ   r=   r>   rR   c  sr   


zLlama4DecoderLayer.__init__r.   c                 C   s,   | j jdkr| j jdkS |d | j j dkS )Nr   r^   )rA   interleave_moe_layer_steprW   )rX   rB   r=   r=   r>   r     s   z Llama4DecoderLayer._is_moe_layerc                 C   s   t | jtr
| jjS | jjS re   )
isinstancer   r&   rA   rK   r   rX   r=   r=   r>   get_intermediate_size  s   z(Llama4DecoderLayer.get_intermediate_sizer   r*   r\   residualc                 C   s|   | j |||\}}|jd dkr| j|||d}| j |||\}}| j |}| |||}| j |||\}}||fS )Nr   )r   r*   r\   )r   prepare_attnr9   r   prepare_mlpshould_use_reduce_scatterr   postprocess_layer)rX   r   r*   r\   r   r]   r=   r=   r>   rd     s,   zLlama4DecoderLayer.forward)r   Nr@   )rq   rr   rs   r	   rw   r   r   ry   rR   rx   r   r   r2   rv   r   r   rd   rz   r=   r=   rZ   r>   r   b  s4    Fr   c                       s   e Zd Z		ddedee deddf fddZ		dd	ej	d
ej	de
dej	dee deej	eej	eej	 f f fddZ  ZS )Llama4ModelNr@   rA   rC   rD   r.   c                    s~   t     | _ j| _ j| _t j jtd|t	 d| _
t j fddtd|d| _t j jd| _g | _d S )Nembed_tokens)rC   rD   use_attn_tp_groupc                    s   t  | |dS )Nr   )r   )idxrD   rA   rC   r=   r>   <lambda>  s    z&Llama4Model.__init__.<locals>.<lambda>layers)rD   r   )rQ   rR   rA   pad_token_idpadding_idx
vocab_sizer   rJ   r    r   r   r$   r   r   r   r   normlayers_to_capturerX   rA   rC   rD   rZ   r   r>   rR     s$   

zLlama4Model.__init__	input_idsr   r\   input_embedspp_proxy_tensorsc                 C   s   |d u r
|  |}n|}d }g }tt| jD ]}	|	| jv r%|||  | j|	 }
|
||||\}}q|j sA| ||\}}t|dkrI|S ||fS )Nr   )	r   rangelenr   r   appendr`   is_idler   )rX   r   r   r\   r   r   r*   r   aux_hidden_statesilayerrj   r=   r=   r>   rd     s(   



zLlama4Model.forwardrp   )NN)rq   rr   rs   r	   r   r   ry   rR   r2   rv   r   r   r   r   r   rd   rz   r=   r=   rZ   r>   r     s6    !r   c                       sv   e Zd Zg dddgdZ		ddedee d	ef fd
dZdd Z	dd Z
		ddedee d	efddZ  ZS )Llama4ForCausalLM)r   k_projv_proj	gate_projup_proj)r   gate_up_projNr@   rA   rC   rD   c                    s   t  ||| d S re   )rQ   rR   r   rZ   r=   r>   rR     s   zLlama4ForCausalLM.__init__c                 C      | j jS re   )modelr   r   r=   r=   r>   get_input_embeddings&     z&Llama4ForCausalLM.get_input_embeddingsc                 C   r  re   )r  r   r   r=   r=   r>   
get_layers)  r  zLlama4ForCausalLM.get_layersc                 C   s   t |||dS )N)rC   rD   )r   r   r=   r=   r>   _init_model,  s   zLlama4ForCausalLM._init_modelrp   )rq   rr   rs   packed_modules_mappingr	   r   r   ry   rR   r  r  r	  rz   r=   r=   rZ   r>   r     s0    r   )E__doc__loggingtypingr   r   r   r   r   r   r2   r   transformersr	   sglang.srt.distributedr
   r   sglang.srt.layers.communicatorr   r   sglang.srt.layers.dp_attentionr   r   r   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r   r   &sglang.srt.layers.moe.fused_moe_tritonr   sglang.srt.layers.moe.topkr   *sglang.srt.layers.quantization.base_configr   !sglang.srt.layers.radix_attentionr   "sglang.srt.layers.rotary_embeddingr   *sglang.srt.layers.vocab_parallel_embeddingr   ,sglang.srt.model_executor.forward_batch_infor   r   r   sglang.srt.models.llamar   r   sglang.srt.utilsr    r!   r"   r#   r$   sglang.srt.utils.commonr%   rf   	getLoggerrq   loggerModuler&   r{   rl   r}   r   r   r   
EntryClassr=   r=   r=   r>   <module>   s@    
r #x>
