o
    -i"                  
   @   s  d dl Z d dlmZmZ d dlmZ d dlmZmZ d dl	Z	d dl	m
Z
 d dlmZmZmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZmZ d dlmZ d dlmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z&m'Z'm(Z(m)Z) d dl*m+Z+ d dl,m-Z- d dl.m/Z/ d dl0m1Z1 d dl2m3Z3 d dl4m5Z5 d dl6m7Z7 d dl8m9Z9m:Z:m;Z; d dl<m=Z=m>Z>m?Z? d dl@mAZAmBZBmCZCmDZDmEZEmFZFmGZG d dlHmIZI d dlJmKZKmLZL ddlMmNZNmOZOmPZPmQZQmRZRmSZS dd lTmUZU dd!lVmWZWmXZXmYZY dd"lZm[Z[ G d#d$ d$eKZ\G d%d& d&e
j]Z^G d'd( d(e
j]Z_d)d* Z`G d+d, d,e
j]ZaG d-d. d.e
j]ZbG d/d0 d0e
j]ZcG d1d2 d2e
j]ZdG d3d4 d4e
j]Zeed5d ie5d6G d7d8 d8e
j]ZfG d9d: d:eCZgG d;d< d<eBeg ZhG d=d> d>eAeg Zie7jjehegeid?G d@dA dAe
j]eReSeNePeQZkdS )B    N)IterableMapping)tee)	AnnotatedLiteral)nn)BatchFeatureLlama4ConfigLlama4VisionConfig)SizeDict)Llama4Processor)find_supported_resolutionsget_best_fit)support_torch_compile)
VllmConfigset_current_vllm_config)BaseDummyOptions)$get_tensor_model_parallel_world_size)set_forward_context)MMEncoderAttention)FusedMoE)ColumnParallelLinearQKVParallelLinearReplicatedLinearRowParallelLinear)QuantizationConfig)get_rope)initialize_model)default_weight_loader)MultiModelKeys)should_torch_compile_mm_vit)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)ImageProcessorItems	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoInputProcessingContextPromptReplacementPromptUpdatePromptUpdateDetails)IntermediateTensors)TensorSchemaTensorShape   )MixtureOfExpertsMultiModalEmbeddingsSupportsEagle3SupportsLoRASupportsMultiModal
SupportsPP)Llama4ForCausalLM)AutoWeightsLoaderStageMissingLayermaybe_prefix)run_dp_sharded_vision_modelc                   @   sn   e Zd ZU dZdZed ed< eej	e
ddddf ed< eej	e
df ed< 	 eej	e
dd	f ed
< dS )Llama4ImagePatchInputsz
    Dimensions:
        - batch_size: Batch size
        - total_num_chunks: Batch size * number of chunks
        - num_channels: Number of channels
        - image_size: Size of each image
    pixel_valuestypetotal_num_chunksnum_channels
image_size
batch_sizepatches_per_image   aspect_ratiosN)__name__
__module____qualname____doc__r@   r   __annotations__r   torchTensorr1    rO   rO   _/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/model_executor/models/mllama4.pyr>   U   s   
 r>   c                       s`   e Zd Z			ddededededed	edB d
edef fddZdej	dej	fddZ
  ZS )Llama4VisionMLPN F
input_sizeintermediate_sizeoutput_sizebiasoutput_activationquant_configprefixuse_data_parallelc	           	         sV   t    t||||| d|d| _t||||| d|d| _t | _|| _	d S )Nz.fc1)rS   rU   rV   rX   rY   
disable_tpz.fc2)
super__init__r   fc1r   fc2r   GELUactivation_fnrW   )	selfrS   rT   rU   rV   rW   rX   rY   rZ   	__class__rO   rP   r]   v   s&   


zLlama4VisionMLP.__init__hidden_statesreturnc                 C   s:   |  |\}}| |}| |\}}| jr| |S |S N)r^   ra   r_   rW   rb   re   _rO   rO   rP   forward   s   

zLlama4VisionMLP.forwardNrR   F)rH   rI   rJ   intboolr   strr]   rM   rN   rj   __classcell__rO   rO   rc   rP   rQ   u   s,    	rQ   c                       s8   e Zd Z		d	dedB def fddZdd Z  ZS )
Llama4MultiModalProjectorNrR   rX   rY   c                    s2   t    t|jj|jjd|d| dd| _d S )NFTz	.linear_1)rS   rU   rV   rX   gather_outputrY   )r\   r]   r   vision_configvision_output_dimtext_confighidden_sizelinear_1)rb   configrX   rY   rc   rO   rP   r]      s   
z"Llama4MultiModalProjector.__init__c                 C   s   |  |\}}|S rg   )rv   )rb   image_featuresre   ri   rO   rO   rP   rj      s   z!Llama4MultiModalProjector.forward)NrR   )rH   rI   rJ   r   rn   r]   rj   ro   rO   rO   rc   rP   rp      s    rp   c           
   	   C   s   | j \}}}tt|}| |||d} |  \}}}}| ||t|| t|| }|dddd }||t|| t|| t||d  }|dddd }||d|j d }	|	S )Nr   rF   r2      )shaperl   mathsqrtviewsizepermute
contiguous)
input_tensorshuffle_ratiorD   num_patcheschannels
patch_sizeheightwidthreshaped_tensoroutput_tensorrO   rO   rP   pixel_shuffle   s"   

r   c                       sL   e Zd Z			ddedB dedef fddZd	ejd
ejfddZ	  Z
S )Llama4VisionPixelShuffleMLPNrR   FrX   rY   rZ   c              
      s\   t    |j| _t|j| jd  | _|j| _t|j	|j|j|j
d|| d|d| _d S )NrF   T.mlprS   rT   rU   rV   rW   rX   rY   rZ   )r\   r]   pixel_shuffle_ratiorl   projector_input_dim	inner_dimprojector_output_dim
output_dimrQ   rT   multi_modal_projector_biasmlprb   rw   rX   rY   rZ   rc   rO   rP   r]      s    
z$Llama4VisionPixelShuffleMLP.__init__encoded_patchesrf   c                 C   s   t || j}| |S rg   )r   r   r   )rb   r   rO   rO   rP   rj      s   
z#Llama4VisionPixelShuffleMLP.forwardrk   )rH   rI   rJ   r   rn   rm   r]   rM   rN   rj   ro   rO   rO   rc   rP   r      s    r   c                	       N   e Zd Z		ddededB dedef fdd	Zd
ej	dej	fddZ
  ZS )Llama4VisionAttentionrR   Frw   rX   NrY   rZ   c                    sx  t    || _|rdnt | _|j| _|j| _|j| j | _	| j| j dks)J | j| j | _
| j
| j	 | _| j
| j	 | _|j| _| j	d | _t| j
| j	| j| _|r{t| j| jd| j  d|| dd| _t| j| j	 | jd|| dd| _n$t| j| j	| jd|| dd| _t| j| j	 | jdd|| dd	| _d
|jd dd}t| j	|j|j d |dtjd| _d S )Nr2   r         rF   Tz	.qkv_proj)rV   rX   rY   z.o_proj)rV   input_is_parallelrX   rY   mllama4
rope_thetag      ?)	rope_typer   partial_rotary_factorF)	head_sizemax_positionrope_parametersis_neox_styledtype)r\   r]   rw   r   tp_sizeru   	embed_dimnum_attention_heads	num_headshead_dimnum_local_headsq_sizekv_sizeattention_dropoutscalingr   attnr   qkv_projo_projr   r   r   r   rC   r   rM   	complex64
rotary_emb)rb   rw   rX   rY   rZ   r   rc   rO   rP   r]      st   




zLlama4VisionAttention.__init__re   rf   c           	      C   s   |j d d }| |\}}|j| j| j| jgdd\}}}||j d |j d | j| j}||j d |j d | j| j}| ||\}}||j d |j d d}||j d |j d d}| 	|||}|j
g |dR   }| |\}}|S )Nry   dimr   r2   )r{   r   splitr   r   r~   r   r   r   r   reshaper   r   )	rb   re   input_shapeqkvri   qkvattn_outputrO   rO   rP   rj   6  s      zLlama4VisionAttention.forwardrR   FrH   rI   rJ   r
   r   rn   rm   r]   rM   rN   rj   ro   rO   rO   rc   rP   r      s"    Jr   c                	       sH   e Zd Z		ddededB dedef fdd	Zd
ej	fddZ
  ZS )Llama4VisionEncoderLayerrR   Frw   rX   NrY   rZ   c              
      s   t    |j| _|j| _|j| _t||| d|d| _t|j|j|jdd|| d|d| _t	
|j| _t	
|j| _d S )Nz
.self_attnrX   rY   rZ   TFr   r   )r\   r]   ru   r   rT   r   	self_attnrQ   r   r   	LayerNorminput_layernormpost_attention_layernormr   rc   rO   rP   r]   N  s,   
z!Llama4VisionEncoderLayer.__init__hidden_statec                 C   sJ   |}|  |}| |}|| }|}| |}| |}|| }|f}|S rg   )r   r   r   r   )rb   r   residualoutputsrO   rO   rP   rj   n  s   



z Llama4VisionEncoderLayer.forwardr   r   rO   rO   rc   rP   r   M  s     r   c                	       r   )Llama4VisionEncoderrR   Frw   rX   NrY   rZ   c                    s:   t     | _t fddt jD | _d S )Nc                    s&   g | ]}t   d | dqS )z.layers.r   )r   ).0	layer_idxrw   rY   rX   rZ   rO   rP   
<listcomp>  s    z0Llama4VisionEncoder.__init__.<locals>.<listcomp>)r\   r]   rw   r   
ModuleListrangenum_hidden_layerslayersr   rc   r   rP   r]     s   

zLlama4VisionEncoder.__init__re   rf   c                 C   s    | j D ]
}||}|d }q|S )aR  
        Args:
            hidden_states: Input tensor of shape
                (batch_size, sequence_length, hidden_size).
                Hidden states from the model embeddings, representing
                the input tokens.
                associated vectors than the model's internal embedding
                lookup matrix.
        r   )r   )rb   re   encoder_layerlayer_outputsrO   rO   rP   rj     s   

zLlama4VisionEncoder.forwardr   r   rO   rO   rc   rP   r     s"    r   c                	       P   e Zd Z			ddededB dedef fdd	Zd
ej	dej	fddZ
  ZS )Llama4UnfoldConvolutionNrR   Frw   rX   rY   rZ   c              	      sl   t    |j}t|tr||f}tjj||jd| _t	|j
|d  |d  |jdd|| d|d| _d S )N)kernel_sizestrider   r2   FTz.linear)rS   rU   rV   rq   rX   rY   r[   )r\   r]   r   
isinstancerl   rM   r   Unfoldunfoldr   rB   ru   linear)rb   rw   rX   rY   rZ   r   rc   rO   rP   r]     s   

z Llama4UnfoldConvolution.__init__re   rf   c                 C   s*   |  |}|ddd}| |\}}|S )Nr   rF   r2   )r   r   r   rh   rO   rO   rP   rj     s   
zLlama4UnfoldConvolution.forwardrk   r   rO   rO   rc   rP   r     s    r   images_flattened)dynamic_arg_dims	enable_ifc                	       r   )Llama4VisionModelNrR   Frw   rX   rY   rZ   c                    s   t    || _|j| _|j| _|j| _|j| _| j| j d d | _|jd | _t	||| d|d| _
t| jt| j | _t| jt| j| j | _tj| jdd| _tj| jdd| _t||| d|d| _t||| d	|d
| _d S )NrF   r2   r   z.patch_embeddingr   gh㈵>)epsz.modelz.vision_adapter)rY   rZ   )r\   r]   rw   rC   r   ru   rB   r   scaler   patch_embeddingr   	ParameterrM   randnclass_embeddingpositional_embedding_vlmr   layernorm_prelayernorm_postr   modelr   vision_adapterr   rc   rO   rP   r]     s@   
zLlama4VisionModel.__init__r   rf   c                 C   s   |  |}|j\}}}| j|jd d|jd }tj||gdd}|d7 }||d||}| jj|j	|j
d}|| }| |}||d|}| |}| |}|d d d dd d f }| |}|S )Nr   r2   ry   r   )r   device)r   r{   r   expandrM   catr   r   tor   r   r   r~   r   r   r   )rb   r   r   	num_tilesr   
hidden_dimr   positional_embeddingrO   rO   rP   rj     s0   




zLlama4VisionModel.forwardrk   r   rO   rO   rc   rP   r     s$    0r   c                       s   e Zd Zdeddf fddZdefddZdedefd	d
Z	de
eedB f fddZededefddZdefddZdefddZ  ZS )Mllama4ProcessingInfoctxrf   Nc                    s   t  | d S rg   )r\   r]   )rb   r   rc   rO   rP   r]   *  s   zMllama4ProcessingInfo.__init__c                 C   s   | j tS rg   )r   get_hf_configr	   rb   rO   rO   rP   r   -  s   z#Mllama4ProcessingInfo.get_hf_configkwargsc                 K   s    | j jtfd|ddi|S )Nuse_fastT)r   get_hf_processorr   pop)rb   r   rO   rO   rP   r   0  s   
z&Mllama4ProcessingInfo.get_hf_processorc                 C   s   dd iS )NimagerO   r   rO   rO   rP   get_supported_mm_limits5  s   z-Mllama4ProcessingInfo.get_supported_mm_limitsrr   c                 C   sX   | j }| j}|| dksJ d| dd|  ttd| jd  }|| d | S )Nr   zchunk size z should be multiple of zpatch_size g      ?rF   )rC   r   rl   roundr   )rr   rC   r   ds_ratiorO   rO   rP   get_patch_per_chunk:  s   

z)Mllama4ProcessingInfo.get_patch_per_chunkc                 C   s   |   j}|jS rg   )r   image_processormax_patches)rb   r   rO   rO   rP   get_max_num_tilesG  s   
z'Mllama4ProcessingInfo.get_max_num_tilesc                 C   s$   |   j}|j}t|  | |dS )Nr   r   )r   rr   rC   r&   r  )rb   rr   rC   rO   rO   rP   !get_image_size_with_most_featuresK  s   
z7Mllama4ProcessingInfo.get_image_size_with_most_features)rH   rI   rJ   r+   r]   r	   r   objectr   r   r   rn   rl   r   staticmethodr
   r   r  r&   r  ro   rO   rO   rc   rP   r   )  s    r   c                
       s   e Zd Zdedeeef deeef deeef def
 fddZded	eeef deeef fd
dZ	de
d	eeef dedee fddZ  ZS )Mllama4MultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsrf   c                    s
  | j  }|d u r||ddS t j||||d}| j jdi |}|j | j  j}|dd urd|v s:J d|d }	| 	 
d|	idt}
|jt| j  tdd	 fd
d|
D }fdd|D }dd |D }t||d< t||d< |S )NF)add_special_tokens)r  r  r	  r
  r?   imagesz=images expected to be in mm_data when pixel_values is presentr   r  )max_num_chunksr   c                    s2   g | ]}t |jd  |jd ft jdqS )r2   r   )resize_to_max_canvas)r   r   rM   tensorr  )r   r   )r   possible_resolutionsrO   rP   r   z  s    zAMllama4MultiModalProcessor._call_hf_processor.<locals>.<listcomp>c                    s$   g | ]}|d    |d   fqS r   r2   rO   )r   rC   )	tile_sizerO   rP   r     s    c                 S   s,   g | ]\}}|| d krd nd ||  qS )r2   rO   )r   r_hr_wrO   rO   rP   r     s     rG   rE   rO   )infoget_tokenizerr\   _call_hf_processorr   r   r   rr   get_get_data_parserparse_mm_data	get_itemsr%   rC   r   r  r   rM   r  )rb   r  r  r	  r
  	tokenizerprocessed_outputs	processorrr   r  parsed_imagesbest_fit_sizesrG   rE   rc   )r   r  r  rP   r  S  sL   




	z-Mllama4MultiModalProcessor._call_hf_processor	hf_inputshf_processor_mm_kwargsc                 C   s4   | dtd}ttd|tdtddS )NrE   r   r   )r?   rE   rG   )r  rM   emptydictr#   flat_from_sizesbatched)rb   r!  r"  rE   rO   rO   rP   _get_mm_fields_config  s   z0Mllama4MultiModalProcessor._get_mm_fields_configmm_itemsout_mm_kwargsc                    sb   | j  }|j}| j || j jdi |  j} jdtf fdd}td||dgS )Nitem_idxc                    s0   d |  }|d j } j|d}t|S )Nr   rG   )aspect_rationum_patches_per_chunk)data_prompt_split_imager.   select_text)r*  out_itemr+  replhf_processorimg_patch_tokenr,  r)  rO   rP   get_replacement  s   
zGMllama4MultiModalProcessor._get_prompt_updates.<locals>.get_replacementr   )modalitytargetreplacementrO   )	r  r   rr   r   r   image_tokenr4  rl   r,   )rb   r(  r"  r)  rw   rr   r9  r5  rO   r2  rP   _get_prompt_updates  s   
z.Mllama4MultiModalProcessor._get_prompt_updates)rH   rI   rJ   rn   r   r  r   r  r#   r'  r'   r$   listr-   r:  ro   rO   rO   rc   rP   r  R  s8    


=



r  c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )Mllama4DummyInputsBuilder	mm_countsrf   c                 C   s$   | dd}| j }|j}|| S )Nr   r   )r  r  r   fake_image_token)rb   r=  
num_imagesr  r9  rO   rO   rP   get_dummy_text  s   
z(Mllama4DummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc                 C   sB   | dd}| j \}}|r| dnd }d| j||||diS )Nr   r   )r   r   r?  	overrides)r  r  r  _get_dummy_images)rb   rA  r=  rB  r?  target_widthtarget_heightimage_overridesrO   rO   rP   get_dummy_mm_data  s   z+Mllama4DummyInputsBuilder.get_dummy_mm_datarg   )
rH   rI   rJ   r   rn   rl   r@  r   r"   rH  rO   rO   rO   rP   r<    s    
r<  )r  dummy_inputsc                       s  e Zd Zg dddgdZdZedededed	B fd
dZddde	def fddZ
deedf dd	fddZdeedf fddZdejdejdejfddZdedefdd Zd!eded	B fd"d#Zd$edefd%d&Zdefd'd(Z				dId)ejd*ejd+ed	B d,ejd	B d!edejeB fd-d.Zd/ejdejd	B fd0d1Zd2eeeejf  dedeeeeejf  eeeejf  f fd3d4Zd2eeeejf  deeeejf  fd5d6Zd7edefd8d9Zd2eeeejf  deeeeejf  eeeejf  f fd:d;Z d2eeeejf  d<e!deeeeejf  e"e f fd=d>Z#d?eeeejf  d<e!d@ede"e fdAdBZ$deeeeeef  fdCdDZ%d2eeeejf  de"e fdEdFZ&de'fdGdHZ(  Z)S )JLlama4ForConditionalGeneration)q_projk_projv_proj	gate_projup_proj)r   gate_up_projTr6  irf   Nc                 C   s   | drdS td)Nr   z	<|image|>z Only image modality is supported)
startswith
ValueError)clsr6  rQ  rO   rO   rP   get_placeholder_str  s   
z2Llama4ForConditionalGeneration.get_placeholder_strrR   )rY   vllm_configrY   c             
      s  t    |jj}|j}|jj}|jdk| _|| _|| _	|| _|| _| 
|dR ddlm} t|, |ddd t|jd t|d| jd	| _W d    n1 sSw   Y  W d    n1 sbw   Y  t| j	d t|d
d| _W d    n1 s}w   Y  | | t||jdgt|dtd| _W d    n1 sw   Y  | jj| _d| _| jj| _| jj| _| jj| _| jj| _| jj | _ | jj!| _!| jj"| _"t#| j"| _$d S )Nr-  r   r   )set_model_tagr   T)
is_encodervision_model)rw   rX   rY   rZ   multi_modal_projector)rw   rX   rY   LlamaForCausalLMlanguage_model)rV  rY   model_classr2   )%r\   r]   model_config	hf_configrX   multimodal_configmm_encoder_tp_moderZ   rV  rw   _mark_tower_modelvllm.compilation.backendsrW  r   r   rr   r<   rY  rp   rZ  _mark_language_modelr   with_hf_configrt   r9   r\  make_empty_intermediate_tensorsnum_expert_groupsnum_logical_expertsnum_physical_expertsnum_local_physical_expertsnum_routed_expertsnum_shared_expertsnum_redundant_experts
moe_layerslennum_moe_layers)rb   rV  rY   rw   rX   r`  rW  rc   rO   rP   r]     sd   


 









z'Llama4ForConditionalGeneration.__init__r   .c                 C   s    t | jdsJ | j| dS )zBSet which layers should output auxiliary hidden states for EAGLE3.set_aux_hidden_state_layersN)hasattrr\  rq  )rb   r   rO   rO   rP   rq  2  s   z:Llama4ForConditionalGeneration.set_aux_hidden_state_layersc                 C   s   t | jdsJ | j S )zGet the layer indices for auxiliary hidden state outputs.

        Note: The GPU model runner will override this with layers from
        the speculative config if available, providing dynamic configuration.
        "get_eagle3_aux_hidden_state_layers)rr  r\  rs  r   rO   rO   rP   rs  8  s   
zALlama4ForConditionalGeneration.get_eagle3_aux_hidden_state_layersexpert_load_viewlogical_to_physical_maplogical_replica_countc                 C   s   | j ||| | j j| _d S rg   )r\  set_eplb_stateexpert_weights)rb   rt  ru  rv  rO   rO   rP   rw  B  s   z-Llama4ForConditionalGeneration.set_eplb_stateri  rj  c                 C   s   | j || d S rg   )r\   update_physical_experts_metadata)rb   ri  rj  rO   rO   rP   ry  M  s   z?Llama4ForConditionalGeneration.update_physical_experts_metadatar   c                 K   s<   | dd }|d u rd S | d}| d}td|||dS )Nr?   rE   rG   )r@   r?   rE   rG   )r   r>   )rb   r   r?   rE   rG   rO   rO   rP   _parse_and_validate_image_inputT  s   

z>Llama4ForConditionalGeneration._parse_and_validate_image_inputimage_inputc                 C   sd   | j r| jsJ |d }|d  }| jrt|| j }n|  |}| |}dd |j|ddD S )Nr?   rE   c                 S   s   g | ]}| d dqS r  )flatten)r   imgrO   rO   rP   r   w  s    
zGLlama4ForConditionalGeneration._process_image_input.<locals>.<listcomp>r   r   )rY  rZ  tolistrZ   r=   r   )rb   r{  r?   rE   vision_embeddings_flatrO   rO   rP   _process_image_inputf  s   

z3Llama4ForConditionalGeneration._process_image_inputc                 K   sV   | j di |}|d u rg S td | j | |W  d    S 1 s$w   Y  d S )NrO   )rz  r   rV  r  )rb   r   r{  rO   rO   rP   embed_multimodal|  s   
$z/Llama4ForConditionalGeneration.embed_multimodal	input_ids	positionsintermediate_tensorsinputs_embedsc                 K   s   |d urd }|  ||||S rg   )r\  )rb   r  r  r  r  r   rO   rO   rP   rj     s
   z&Llama4ForConditionalGeneration.forwardre   c                 C   s   | j |S rg   )r\  compute_logits)rb   re   rO   rO   rP   r    s   z-Llama4ForConditionalGeneration.compute_logitsweightsc                    s^   t |d\dttttjf  f fdd}dttttjf  f fdd}| | fS )NrF   rf   c                  3   s(    D ]\} }|   r| |fV  qd S rg   rR  namer-  )rY   weights1rO   rP   get_prefix_weights     

zKLlama4ForConditionalGeneration.separate_weights.<locals>.get_prefix_weightsc                  3   s(    D ]\} }|   s| |fV  qd S rg   r  r  )rY   weights2rO   rP   get_other_weights  r  zJLlama4ForConditionalGeneration.separate_weights.<locals>.get_other_weights)r   r   tuplern   rM   rN   )rb   r  rY   r  r  rO   )rY   r  r  rP   separate_weights  s   ""z/Llama4ForConditionalGeneration.separate_weightsc                 c   s    dddd}i }|D ]/\}}|  D ]!\}}||vrq||d}||vr-d gd ||< ||| |<  n||fV  q|  D ]\}	}
tj|
dd}|	|fV  q?d S )Nr   r2   rF   ).self_attn.q_proj.self_attn.k_proj.self_attn.v_proj.self_attn.qkv_projrz   r   )itemsreplacerM   r   )rb   r  qkv_idx_mappingsqkv_weightsr  loaded_weightweight_nameidxnew_namekeyweight
qkv_weightrO   rO   rP   _consolidate_qkv_weights  s*   
z7Llama4ForConditionalGeneration._consolidate_qkv_weightsr  c                 C   s   | ds
| drr| dr|dddn|}d|v rNd|v s$d|v rNd|v r.|ddS d	|v r8|d	d
S d|v rB|ddS d|v rL|ddS |S d|v rpd|v sZd|v rpd|v rd|ddS d|v rn|ddS |S |S | dr}|ddS |S )zKRename weights from ModelOpt llama4 fp8 checkpoints to vLLM
        format.zmodel.zlanguage_model.model.r2   feed_forward.experts._input_scale_weight_scaledown_proj_input_scalew2_input_scaledown_proj_weight_scalew2_weight_scalegate_up_proj_input_scalew13_input_scalegate_up_proj_weight_scalew13_weight_scalez
self_attn.z.k_scalez.v_scalez.k_proj.k_scalez.attn.k_scalez.v_proj.v_scalez.attn.v_scalezlm_head.weightzlanguage_model.lm_head.weight)rR  r  )rb   r  renamedrO   rO   rP   &_rename_weight_for_modelopt_checkpoint  s<   
zELlama4ForConditionalGeneration._rename_weight_for_modelopt_checkpointc                 C   sr   g }g }|D ].\}}|  |}|ddd }tt| |tr q|dr-|||f q|||f q||fS )zORename weights and separate them into language_model and other
        weights..r2   r   zlanguage_model.)r  r   r   getattrr;   rR  append)rb   r  language_model_weightsother_weightsr  r  r  attrrO   rO   rP   _separate_and_rename_weights  s   

z;Llama4ForConditionalGeneration._separate_and_rename_weightsparams_dictc           	      C   s   g }g }t  }|D ]G\}}d|v rId|v rId|vrI||v rA|| }t|drA|j dkrA| dkrA|j|  || q	|||f q	|||f q	|||fS )zHandle expert scale parameters that need broadcasting.

        ModelOpt checkpoints use a single value tensor scalar for BMM style
        experts, vLLM expects the scale to be broadcasted across all experts.
        r  r   z.shared_expertr-  r2   )setrr  r-  numelfill_itemaddr  )	rb   r  r  regular_weightsexpert_scale_weightsupdated_paramsr  r  paramrO   rO   rP   !_handle_expert_scale_broadcasting  s&   

z@Llama4ForConditionalGeneration._handle_expert_scale_broadcastingr  stacked_params_mappingc                 C   s   t  }| jr| |}|D ]A\}}|D ]&\}}}	||vs| jr q|||}|| }
|| |
j}||
||	  n|| }
t|
dt}||
| || q|S )z6Load non-language-model weights with stacking support.weight_loader)r  rZ   r  r  r  r  r  r   )rb   r  r  r  r  r  r  
param_namer  shard_idr  r  rO   rO   rP   _load_other_weights-  s&   



z2Llama4ForConditionalGeneration._load_other_weightsc                 C   s   t j| ddd| jjj| jdS )NrN  	down_projrO  )ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namenum_expertsrm  )r   make_expert_params_mappingrw   rt   num_local_expertsrm  r   rO   rO   rP   get_expert_mappingM  s   z1Llama4ForConditionalGeneration.get_expert_mappingc                 C   s   g d}t |  }t }| |\}}| ||\}}}	||	 t| }
|
|}|d us1J || |rD|
|}|rD|| || ||| |S )N))r  r  r   )r  r  r   )r  r  r   ).shared_expert.gate_up_projz.shared_expert.gate_projr   )r  z.shared_expert.up_projr2   ).feed_forward.gate_up_projz.feed_forward.gate_projr   )r  z.feed_forward.up_projr2   )	r$  named_parametersr  r  r  updater:   load_weightsr  )rb   r  r  r  r  r  r  r  r  updated_params_from_expertsloaderloaded_language_model_paramsloaded_expert_scale_paramsrO   rO   rP   r  Y  s*   





z+Llama4ForConditionalGeneration.load_weightsc                 C   s   t jddddS )z<
        Get the module prefix in multimodal models
        r\  zmulti_modal_projector.zvision_model.)r\  	connectortower_model)r   from_string_fieldr   rO   rO   rP   get_mm_mapping  s
   z-Llama4ForConditionalGeneration.get_mm_mapping)NN)*rH   rI   rJ   packed_modules_mappingsupports_encoder_tp_dataclassmethodrn   rl   rU  r   r]   r  rq  rs  rM   rN   rw  ry  r  r>   rz  r4   r  r  r/   rj   r  r   r  r  r  r;  r  r$  r  r  r  r  r  r   r  ro   rO   rO   rc   rP   rJ    s    8






&

,&

%
 $+rJ  )lr|   collections.abcr   r   	itertoolsr   typingr   r   rM   r   transformersr   r	   r
   transformers.image_utilsr   transformers.models.llama4r   7transformers.models.llama4.image_processing_llama4_fastr   r   vllm.compilation.decoratorsr   vllm.configr   r   vllm.config.multimodalr   vllm.distributedr   vllm.forward_contextr   9vllm.model_executor.layers.attention.mm_encoder_attentionr   $vllm.model_executor.layers.fused_moer   !vllm.model_executor.layers.linearr   r   r   r   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   &vllm.model_executor.model_loader.utilsr   -vllm.model_executor.model_loader.weight_utilsr   )vllm.model_executor.models.module_mappingr   !vllm.model_executor.models.visionr    vllm.multimodalr!   vllm.multimodal.inputsr"   r#   r$   vllm.multimodal.parser%   r&   r'   vllm.multimodal.processingr(   r)   r*   r+   r,   r-   r.   vllm.sequencer/   vllm.utils.tensor_schemar0   r1   
interfacesr3   r4   r5   r6   r7   r8   llama4r9   utilsr:   r;   r<   visionr=   r>   ModulerQ   rp   r   r   r   r   r   r   r   r   r  r<  register_processorrJ  rO   rO   rO   rP   <module>   s~   $	  )b5+[)n

