o
    ia                     @   s  d Z ddlmZ ddlZddlmZ ddlmZ ddlmZ ddl	m
Z
mZ ddlmZmZmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ ddlm Z  ddl!m"Z"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z-m.Z.m/Z/ ddl0m1Z1m2Z2m3Z3m4Z4m5Z5 ee6Z7G dd dej8Z9G dd dej8Z:G dd dej8Z;eG dd de/Z<G d d! d!e-e%Z=dS )"z?Inference-only LLaMA model compatible with HuggingFace weights.    )IterableN)nn)Llama4TextConfig)support_torch_compile)CacheConfig
VllmConfig)get_ep_group$get_tensor_model_parallel_world_size tensor_model_parallel_all_gather)init_logger)	AttentionChunkedLocalAttention)SharedFusedMoE)RMSNorm)QKVParallelLinearReplicatedLinearRowParallelLinear)QuantizationConfig)get_rope)default_weight_loadermaybe_remap_kv_scale_name)MixtureOfExperts)sequence_parallel_chunk)current_platform)is_torch_equal_or_newer   )LlamaForCausalLMLlamaMLP
LlamaModel)AutoWeightsLoaderPPMissingLayerextract_layer_index	fast_topkis_pp_missing_parameterc                       sb   e Zd Zedejdejdededeejejf f
ddZ	dd	e
d
ef fddZdd Z  ZS )	Llama4MoEhidden_statesgating_outputtopkrenormalizereturnc                 C   s0   t ||dd\}}t| }||tjfS Ndim)r"   torchsigmoidfloattoint32)r%   r&   r'   r(   router_scoresrouter_indices r5   W/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/model_executor/models/llama4.pycustom_routing_functionF   s   z!Llama4MoE.custom_routing_function vllm_configprefixc                    sF  t    |jj}|j}|j}t | _|j| _	|j
| _t j| _t j| _| j | _|j}t|j|jdd | dd| _t|j|d|d| dd| jd| _|rT|jnd }|r[|jnd| _|rc|jnd| _|j| _| j| _d| _ |j| _!| j!| j | _"| j"| j | _#t$| j|j|j|jt%j&|d	dd|| d
| j| j| jd| _'d S )NFz.router)biasquant_configr:   siluz.shared_expert)hidden_sizeintermediate_size
hidden_actr<   r;   r:   reduce_results
disable_tpr   r   Tz.experts)shared_expertsnum_expertstop_kr>   r7   r?   apply_router_weight_on_inputrA   r(   r<   r:   is_sequence_parallelenable_eplbnum_redundant_experts)(super__init__model_config	hf_configparallel_configr<   r	   tp_sizenum_experts_per_tokrE   use_sequence_parallel_moerG   r   device_groupep_grouprank_in_groupep_ranksizeep_sizer?   r   r>   num_local_expertsrouterr   shared_experteplb_configrH   rI   n_redundant_expertsn_routed_expertsn_logical_expertsn_shared_expertsn_local_expertsn_physical_expertsn_local_physical_expertsr   r$   r7   experts)selfr9   r:   configrN   r<   intermediate_size_moer[   	__class__r5   r6   rK   R   sl   


zLlama4MoE.__init__c                 C   sz   |j d }| jrt|}| |\}}| j||d\}}|| }| jr0t|d}|d | }|S | jdkr;| j|}|S )Nr   )r%   router_logitsr   )shaperG   r   rY   rc   r
   rO   &maybe_all_reduce_tensor_model_parallel)rd   r%   
num_tokensri   _
shared_out
routed_outexperts_outr5   r5   r6   forward   s$   



zLlama4MoE.forward)r8   )__name__
__module____qualname__staticmethodr.   Tensorintbooltupler7   r   strrK   rq   __classcell__r5   r5   rg   r6   r$   E   s    Ar$   c                       s   e Zd Z						ddedededed	ed
edB dedededB deddf fddZ	de
jde
jfddZde
jde
jde
jfddZ  ZS )Llama4Attention    NFr8   re   r>   	num_headsnum_kv_headsmax_position_embeddingsr<   r;   bias_o_projcache_configr:   r)   c              	      s.  t    t|
| _|| _|j| _| j| j dk| _|jo | j | _t }|| _	| j	| dks1J | j	| | _
|| _| j|krI| j| dksHJ n	|| j dksRJ td| j| | _|j| _| j
| j | _| j| j | _| jd | _| jox|j| _t|dd| _t|dd| _|| _| j
| j | _| jrt| j|jdtjd	nd | _t|| j| j	| j|||
 d
d| _t| j	| j ||||
 dd| _ d}|o|! dk}|r|j"dkrd}| jst#| j||j$|dnd | _%| j o|j&}|rt'nt(}|| j
| j| jf| j|	||
 dd|rd|j&ini | _)d S )Nr   r   g      floor_scaleg      @
attn_scaleg?F)r>   eps
has_weightdtype	.qkv_proj)r>   	head_sizetotal_num_headstotal_num_kv_headsr;   r<   r:   z.o_proj)
input_sizeoutput_sizer;   r<   r:   Tggufllama)max_positionrope_parametersis_neox_stylez.attn)r   r   r<   r:   attention_chunk_size)*rJ   rK   r!   	layer_idxr>   no_rope_layersnopeuse_qk_normr	   r   r~   r   maxr   head_dimq_sizekv_sizescalingattn_temperature_tuninggetattrr   r   r   n_repr   rms_norm_epsr.   float32qk_normr   qkv_projr   o_projget_name
model_typer   r   
rotary_embr   r   r   attn)rd   re   r>   r~   r   r   r<   r;   r   r   r:   rO   r   is_ggufuse_chunked_local_attnattn_clsrg   r5   r6   rK      s   





	
zLlama4Attention.__init__	positionsc                 C   s6   t |d | j }t |d | j d }|dS )Ng      ?r+   )r.   floorr   logr   	unsqueeze)rd   r   r   r   r5   r5   r6   _get_attn_scale  s   
zLlama4Attention._get_attn_scaler%   c                 C   s   |  |\}}|j| j| j| jgdd\}}}| jd ur%| |||\}}| jd urX|d| j}| | d| j	|j
}|d| j}| | d| j	|j
}| jrk| jrk| |}|| 	|j
}| |||}	| |	\}
}|
S r*   )r   splitr   r   r   r   reshaper   r0   r1   r   r   r   r   r   r   )rd   r   r%   qkvrm   qkvr   attn_outputoutputr5   r5   r6   rq     s    

  

zLlama4Attention.forward)r}   NFFNr8   )rr   rs   rt   r   rw   r   rx   r   rz   rK   r.   rv   r   rq   r{   r5   r5   rg   r6   r|      sL    	
gr|   c                
       sh   e Zd Z		ddedededB ddf fddZd	ejd
ejdejdB de	ejejf fddZ
  ZS )Llama4DecoderLayerr8   Nr9   r:   re   r)   c                    s   t    |p
|jj}|j}|j}t|| _|j| j dk| _	|j
| _
|j}t|| j
|j|j||dd|| dd
| _|jdkoI| jd |j dk}|rWt|| dd| _nt| j
|jd|d| dd	| _t|j
|jd
| _t|j
|jd
| _d S )Nr   Fz
.self_attn)
re   r>   r~   r   r   r<   r;   r   r   r:   r   z.feed_forward)r9   r:   r=   )r>   r?   r@   r<   r;   r:   )r   )rJ   rK   rL   rM   r   r<   r!   r   r   global_layerr>   r   r|   num_attention_headsnum_key_value_heads	self_attninterleave_moe_layer_stepr$   feed_forwardr   intermediate_size_mlpr   r   input_layernormpost_attention_layernorm)rd   r9   r:   re   r   r<   r   is_moe_layerrg   r5   r6   rK   >  sP   



zLlama4DecoderLayer.__init__r   r%   residualc                 C   sX   |d u r|}|  |}n|  ||\}}| j||d}| ||\}}| |}||fS )N)r   r%   )r   r   r   r   )rd   r   r%   r   r5   r5   r6   rq   r  s   
zLlama4DecoderLayer.forward)r8   N)rr   rs   rt   r   rz   r   rK   r.   rv   ry   rq   r{   r5   r5   rg   r6   r   =  s*    4r   c                       s   e Zd Zdeddededee f fddZ	dd	ed
ej	de
eejf dee deeeeeef  dedefddZdeeeej	f  dee fddZ  ZS )Llama4Modelr8   )r:   
layer_typer9   r:   r   c                   s.   |j jj| _|jjj| _t j	|||d d S Nr9   r:   r   )
rL   rM   rX   rD   rN   r[   rI   r\   rJ   rK   rd   r9   r:   r   rg   r5   r6   rK     s   zLlama4Model.__init__Tnameloaded_weightparams_dictloaded_paramsexpert_params_mappingfusedr)   c                 C   s  d}|r|j dkr|dd}d|v r|jddd}|D ]\}}	}
}|}|r;|	d\}}}}| d| }	| d	}|	|vr@q||	|}t|| rLq|d
sV|dr[||vr[q|| }|j}|rd|v rz|dv snJ |dkrtdnd}|| }t|}| j	| j
jj}|dur|dk  |j}|jt kp|jjo| dk}|jjdkr|rtds|tj| |j}n|| }|d  }
n	 ||||||
d || d}q|S )a,  
        Load MoE expert weights.

        Args:
            name: The name of the weight to load.
            loaded_weight: The weight to load.
            params_dict: The dictionary of module parameters.
            loaded_params: The set of already loaded parameters.
            expert_params_mapping: The mapping of expert parameters. Must be
                generated by SharedFusedMoE.make_expert_params_mapping().
            fused: Whether the expert weights are fused into a single weight
                tensor or are separate weight tensors for each expert.
                When fused is True, loaded_weight should have shape of:
                [num_experts, hidden_in, hidden_out] for gate/up/down proj and
                [hidden_out, hidden_in] for the others like router.
                When fused is False, loaded_weight should have shape of:
                [hidden_out, hidden_in].

        Returns:
            True if loaded_weight is one of MoE weights and the MoE expert
            weights are loaded successfully, False otherwise.
        F   r+   experts.gate_up_proj   r,   .weightz.bias_biasw13)w1w3r   r   r   Ncpuz2.11.0shard_id	expert_idT)ndim	transposechunkr   replacer#   endswithweight_loaderr!   layersr   rc   
expert_mapnonzeroflattenr1   devicer   r   	fp8_dtypeis_floating_pointelement_sizetyper   r.   float16itemadd)rd   r   r   r   r   r   r   expert_param_loaded
param_nameweight_namer   r   new_loaded_weighte_strrm   proj_strfull_param_nameparamr   	shard_idxr   r   local_expert_indicesis_fp8_dtyper5   r5   r6   load_moe_expert_weights  s   !




z#Llama4Model.load_moe_expert_weightsweightsc              	      s  g d}d}t j| ddd| j| jd}t j| ddddd	}t|  }t }|D ]\ }d
 v s5d v r9d}|}| jd urg| j  }	rg||	 }
t	|
dt
}| dkrX|n|d }||
| ||	 q(|D ]R\}}}| vsvd v rwqi drd v s || t | rqi drt |  d u rqi|  }
t	|
dt
}|t
kr||
| n||
|| |   n| j |||||drq(t | rq(g d}d v r*t fdd|D r*|  }
t	|
dt
}t	|ddrd v rdnd} dr|jtjkr|jdkr|dd}||
| |dd  n||
| |  q(|  }
t	|
dt
}||
| |  q(|S )!N))r   z.q_projr   )r   z.k_projr   )r   z.v_projr   ).gate_up_projz
.gate_projr   )r   z.up_projr   F	gate_proj	down_projup_proj)ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namerD   rI   gate_up_projr   )r  r  r  rD   r   zexperts.down_projTr   r   rc   )z.k_scalez.v_scaler   scale)r   )w13_input_scalew13_weight_scalew2_input_scalew2_weight_scalezexperts.c                 3   s    | ]}| v V  qd S Nr5   ).0
scale_namer   r5   r6   	<genexpr>  s    
z+Llama4Model.load_weights.<locals>.<genexpr>supports_moe_loadingw2_w2r   weight_scaler   r+   r   r   )r   make_expert_params_mappingrD   r\   dictnamed_parameterssetr<   get_cache_scaler   r   r-   r   r   r   r#   r   r   anyr   r.   float8_e4m3fnr   r   )rd   r   stacked_params_mappingfused_experts_paramsr   expert_params_mapping_fusedr   r   r   r  r   r   r   r   r   scale_namesr5   r  r6   load_weights#  s   















zLlama4Model.load_weights)T)rr   rs   rt   r   r   rz   r   rK   r.   rv   r  r   	Parameterr  listry   rw   rx   r   r   r  r{   r5   r5   rg   r6   r     s8    
 ,r   c                       s   e Zd Zg dddgdZdddedef fd	d
Zdd ZdededdfddZ	de
fdededee
 fddZdeeeejf  dee fddZdedejdeeejf fddZ  ZS )Llama4ForCausalLM)q_projk_projv_projr   r   )r   r  r8   )r:   r9   r:   c                   sT   |j  }||j j |j jdk}|d||j j_t j	||t
d |   d S )Ni   r   r   )rL   try_get_generation_configupdateoverride_generation_configmax_model_lengetrM   r   rJ   rK   r   set_moe_parameters)rd   r9   r:   
gen_configdefault_attn_temperature_tuningrg   r5   r6   rK     s   

zLlama4ForCausalLM.__init__c                 C   s   g | _ g | _d }| jjD ] }t|trqt|tsJ t|jtr,|j}| j	|jj
 q|d u rPd| _d| _d| _d| _d| _d| _d| _d| _td d S t| j| _d| _|j| _|j| _|j| _|j| _|j| _|j| _d S )Nr   z)No Llama4MoE layer found in model.layers.r   )expert_weights
moe_layersmodelr   
isinstancer    r   r   r$   appendrc   num_moe_layersnum_expert_groupsnum_logical_expertsnum_physical_expertsnum_local_physical_expertsnum_routed_expertsnum_shared_expertsrI   loggerwarninglenr^   ra   rb   r]   r_   r\   )rd   example_moelayerr5   r5   r6   r*    s:   
z$Llama4ForCausalLM.set_moe_parametersr5  r6  r)   Nc                 C   st   | j |ksJ || _|| _ || j | _| jjD ] }t|trqt|jt	r7|j}||_
||_| j|_|j  qd S r
  )r6  r5  r4  rI   r/  r   r0  r    r   r$   rb   ra   r\   rc   update_expert_map)rd   r5  r6  r=  moer5   r5   r6    update_physical_experts_metadata  s   

z2Llama4ForCausalLM.update_physical_experts_metadatar   c                 C   s   t |||dS r   )r   r   r5   r5   r6   _init_model%  s   zLlama4ForCausalLM._init_modelr   c                    s6   t   jjr	dgnd d} fdd|D }||S )Nzlm_head.)skip_prefixesc                    s   g | ]
\}}  ||qS r5   )permute_qk_weight_for_rotary)r  r   r   rd   r5   r6   
<listcomp>4  s    
z2Llama4ForCausalLM.load_weights.<locals>.<listcomp>)r   re   tie_word_embeddingsr  )rd   r   loaderr5   rD  r6   r  /  s   

zLlama4ForCausalLM.load_weightsr   r   c                    s   dt jdtdtf fdd}|d}|d dk}|d d	ko$|jt jk}|s)|rMd
|v s1d|v r=|| jj|}||fS d|v sEd|v rM|| jj	|}||fS )Nwn_headsis_weight_scalec                    s    j j| } j j}| jtjkr| jd d |kr|d }n| jtjkr3|r3| jd d |kr3|d }| ||| d d|	dd
||S )Nr   r      )re   r   r>   r   r.   uint8rj   r  viewr   r   )rH  rI  rJ  attn_inattn_outrD  r5   r6   permute@  s   
z?Llama4ForCausalLM.permute_qk_weight_for_rotary.<locals>.permuter   r+   r   r  wkr#  wqr"  )
r.   rv   rw   rx   r   r   r  re   r   r   )rd   r   r   rP  modules	is_weightis_nvfp4_weight_scaler5   rD  r6   rC  :  s(   
z.Llama4ForCausalLM.permute_qk_weight_for_rotary)rr   rs   rt   packed_modules_mappingr   rz   rK   r*  rw   r@  r   r   rA  r   ry   r.   rv   r  r  rC  r{   r5   r5   rg   r6   r!    s<    #

$
r!  )>__doc__collections.abcr   r.   r   transformersr   vllm.compilation.decoratorsr   vllm.configr   r   vllm.distributedr   r	   r
   vllm.loggerr   $vllm.model_executor.layers.attentionr   r   $vllm.model_executor.layers.fused_moer   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   -vllm.model_executor.model_loader.weight_utilsr   r   %vllm.model_executor.models.interfacesr    vllm.model_executor.models.utilsr   vllm.platformsr   vllm.utils.torch_utilsr   r   r   r   r   utilsr   r    r!   r"   r#   rr   r9  Moduler$   r|   r   r   r!  r5   r5   r5   r6   <module>   s@   f I  S