o
    }oi                     @   s  d dl mZmZ d dlmZ d dlmZmZmZ d dl	Z	d dl
Z
d dl	mZ d dlmZ d dlmZ d dlmZmZ d d	lmZmZ d d
lmZ d dlmZmZmZ d dlmZ d dlmZ d dl m!Z!m"Z" d dl#m$Z$m%Z% d dl&m'Z' z
d dl(m)Z) dZ*W n e+y   dZ*Y nw erd dl,mZ- d dl,m.Z. d dl/m0Z0 eG dd deZ1eG dd deZ2e!3edG dd de!j4def Z5e!6edG dd de!j4edf Z7e!j8d d!d"d#e!j9fd$d%Z:e!j8d!d&d"d#e!j9fd'd(Z;e!j8d)d*d"d#e!j9fd+d,Z<e!j8d*d)d"d#e!j9fd-d.Z=e!j8d/d0d"d#e!j9fd1d2Z>e!j8d0d/d"d#e!j9fd3d4Z?e!j8d5d6d"d#e!j9fd7d8Z@e!j8d6d5d"d#e!j9fd9d:ZAdS );    )	dataclassfield)Path)TYPE_CHECKINGDictTupleN)nn)TokenizerSpecLlama4Config)Llama4Experts16ConfigLlama4Experts128Config)Llama4OmniConfigLlama4OmniModelLlama4VisionConfig)
export_qkvexport_qkv_bias
import_qkvMultimodalProjectorConfig)load_distributed_model_weights)ioteardown)TransformFns_ModelState)logging)TransformerConfigTFLlama4ForConditionalGenerationAutoTokenizerc                   @   T   e Zd ZU dZedd dZeed< edd dZeed< edd dZ	eed	< d
S )Llama4ScoutExperts16ConfigzLlava v1.5 Config 7Bc                   C      t  S N)r    r&   r&   a/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/vlm/llama4/model/llama4_omni.py<lambda>5       z#Llama4ScoutExperts16Config.<lambda>default_factorylanguage_transformer_configc                   C   r$   r%   r   r&   r&   r&   r'   r(   6   r)   vision_transformer_configc                   C      t dddddddS Nmcore_affinei   i   Fprojector_type
input_sizehidden_sizeffn_hidden_sizebiasbias_activation_fusionr   r&   r&   r&   r'   r(   8       vision_projection_configN
__name__
__module____qualname____doc__r   r,   r   __annotations__r-   r9   r&   r&   r&   r'   r#   1      
 r#   c                   @   r"   )Llama4MaverickExperts128ConfigzLlava v1.5 Config 13Bc                   C   r$   r%   )r   r&   r&   r&   r'   r(   G   r)   z'Llama4MaverickExperts128Config.<lambda>r*   r,   c                   C   r$   r%   r   r&   r&   r&   r'   r(   H   r)   r-   c                   C   r.   r/   r   r&   r&   r&   r'   r(   J   r8   r9   Nr:   r&   r&   r&   r'   rA   C   r@   rA   hfc                   @   sl   e Zd ZdZdefddZdedefddZdd	 Ze	dddZ
dejdefddZe	defddZdS )HFLlama4OmniImporterzImporter for converting Hugging Face Llama models to NeMo format.

    This class handles the conversion of Hugging Face's LlamaForCausalLM models
    to NeMo's Llama4OmniModel format, including weight mapping and configuration translation.
    returnc                 C   s   t | j| jdS )zInitialize a NeMo Llama4OmniModel instance.

        Returns:
            Llama4OmniModel: Initialized NeMo Llama model with the appropriate configuration
                        and tokenizer.
        )	tokenizer)r   configrE   selfr&   r&   r'   init]   s   zHFLlama4OmniImporter.initoutput_pathc                 C   s   ddl m} |jt| dd}|  }| |}|tj}|tj}| 	|| | 
|| td| d|j d t|| ~~|S )zApply the conversion from HF to NeMo format.

        Args:
            output_path: Path where the converted model will be saved

        Returns:
            Path: Path to the saved NeMo model
        r   r   autotorch_dtypez.Converted Llama model to Nemo, model saved to z in .)transformersr   from_pretrainedstrrI   
nemo_setuptotorchbfloat16convert_state	nemo_saveprintdtyper   )rH   rJ   r   sourcetargettrainerr&   r&   r'   applyf   s   


zHFLlama4OmniImporter.applyc                 C   s   |  |}i ddddddddd	d
dddddddddddddddddddddd d!d"d#d$d%d&d'd(d)d*d+d,d-d.d/d0d1}tttttjd2d3tjd4tjd5d6tjd4g}tj	||||d7S )8aJ  Convert state dict from HF format to NeMo format.

        Maps the weights from the HF model to the NeMo model according to
        the appropriate mapping scheme.

        Args:
            source: Source HF model
            target: Target NeMo model

        Returns:
            The result of applying the transforms
        %vision_model.positional_embedding_vlm'vision_model.position_embeddings.weight*vision_model.patch_embedding.linear.weight!vision_model.conv1._linear.weight!vision_model.layernorm_pre.weightvision_model.ln_pre.weightvision_model.layernorm_pre.biasvision_model.ln_pre.bias"vision_model.layernorm_post.weightvision_model.ln_post.weight vision_model.layernorm_post.biasvision_model.ln_post.bias*vision_model.vision_adapter.mlp.fc1.weight2vision_model.adapter.mlp.encoder.linear_fc1.weight*vision_model.vision_adapter.mlp.fc2.weight2vision_model.adapter.mlp.encoder.linear_fc2.weight3vision_model.model.layers.*.self_attn.o_proj.weight?vision_model.decoder.layers.*.self_attention.linear_proj.weight1vision_model.model.layers.*.self_attn.o_proj.bias=vision_model.decoder.layers.*.self_attention.linear_proj.bias2vision_model.model.layers.*.input_layernorm.weightIvision_model.decoder.layers.*.self_attention.linear_qkv.layer_norm_weight0vision_model.model.layers.*.input_layernorm.biasGvision_model.decoder.layers.*.self_attention.linear_qkv.layer_norm_bias*vision_model.model.layers.*.mlp.fc1.weight3vision_model.decoder.layers.*.mlp.linear_fc1.weight(vision_model.model.layers.*.mlp.fc1.bias1vision_model.decoder.layers.*.mlp.linear_fc1.bias*vision_model.model.layers.*.mlp.fc2.weight3vision_model.decoder.layers.*.mlp.linear_fc2.weight(vision_model.model.layers.*.mlp.fc2.bias1vision_model.decoder.layers.*.mlp.linear_fc2.bias;vision_model.model.layers.*.post_attention_layernorm.weight>vision_model.decoder.layers.*.mlp.linear_fc1.layer_norm_weight<vision_model.decoder.layers.*.mlp.linear_fc1.layer_norm_bias vision_projection.encoder.weight/language_model.embedding.word_embeddings.weightAlanguage_model.decoder.layers.*.self_attention.linear_proj.weightKlanguage_model.decoder.layers.*.self_attention.linear_qkv.layer_norm_weight-language_model.decoder.final_layernorm.weight"language_model.output_layer.weight8language_model.decoder.layers.*.pre_mlp_layernorm.weight@language_model.decoder.layers.*.mlp.linear_fc1.layer_norm_weight1language_model.decoder.layers.*.mlp.router.weightDlanguage_model.decoder.layers.*.mlp.shared_experts.linear_fc2.weightz>language_model.decoder.layers.*.mlp.experts.linear_fc2.weight*z>language_model.decoder.layers.*.mlp.experts.linear_fc1.weight*5language_model.decoder.layers.*.mlp.linear_fc2.weight)9vision_model.model.layers.*.post_attention_layernorm.bias%multi_modal_projector.linear_1.weight(language_model.model.embed_tokens.weight5language_model.model.layers.*.self_attn.o_proj.weight4language_model.model.layers.*.input_layernorm.weight language_model.model.norm.weightlanguage_model.lm_head.weight=language_model.model.layers.*.post_attention_layernorm.weightClanguage_model.model.layers.*.dense-post_attention_layernorm.weight8language_model.model.layers.*.feed_forward.router.weightIlanguage_model.model.layers.*.feed_forward.shared_expert.down_proj.weightz>language_model.model.layers.*.feed_forward.experts.*.down_projzAlanguage_model.model.layers.*.feed_forward.experts.*.gate_up_proj;language_model.model.layers.*.feed_forward.down_proj.weightzIlanguage_model.model.layers.*.feed_forward.shared_expert.gate_proj.weightzGlanguage_model.model.layers.*.feed_forward.shared_expert.up_proj.weightDlanguage_model.decoder.layers.*.mlp.shared_experts.linear_fc1.weight
source_key
target_keyfnz;language_model.model.layers.*.feed_forward.gate_proj.weightz9language_model.model.layers.*.feed_forward.up_proj.weight5language_model.decoder.layers.*.mlp.linear_fc1.weightmapping
transforms)
_modify_llama4_source_state_import_cls_token_import_language_qkv_import_vision_qkv_import_vision_qkv_biasr   state_transformr   	merge_fc1apply_transforms)rH   rZ   r[   r   r   r&   r&   r'   rV      s   
	Fz"HFLlama4OmniImporter.convert_stater!   c                 C   s   ddl m} || t| S )zGet the tokenizer for the HF model.

        Returns:
            AutoTokenizer: Tokenizer instance initialized from the HF model's tokenizer
        r   r    )=nemo.collections.common.tokenizers.huggingface.auto_tokenizerr!   save_hf_tokenizer_assetsrQ   )rH   r!   r&   r&   r'   rE      s   zHFLlama4OmniImporter.tokenizerrZ   c              	   C   s8  |  }|jjj}| jj}t|jD ]}d}t|jt	r+t
|j|jks&J |j| }|r|d| d}tj||dd}t|D ]\}	}
|
 dd|d| d|	 d< qB|d| d	}tj||dd}t|D ]\}	}
|
 dd|d| d|	 d
< qmq|d| d}||d| d< qt|}|S )a  
        In Llama4, HF weight for local experts are mapped with a single tensor.
        Pre-chunk it before convert_state.
        For dense layer, we change the name for the post attention layer norm to
        avoid the many-to-one mapping in the conversion.
        Tzlanguage_model.model.layers.z".feed_forward.experts.gate_up_projr   )dim   z.feed_forward.experts.z.gate_up_projz.feed_forward.experts.down_projz
.down_projz .post_attention_layernorm.weightz&.dense-post_attention_layernorm.weight)
state_dictrF   text_confignum_local_expertsr,   range
num_layers
isinstancemoe_layer_freqlistlenpoprT   chunk	enumeratesqueeze	transposer   )rH   rZ   r   num_expertsr,   layer_iis_moe_layerweightweightsexpert_iexpert_weightr&   r&   r'   r      s4   

z0HFLlama4OmniImporter._modify_llama4_source_statec                 C   s  ddl m}m} |t| }z	|t| }W n ty#   d}Y nw dd }|j}|j}|j|j	|j
|j|jd}t|dddurY|jdd	krY|d
|jddd n|ddi t|dddkr|j|j dksrJ dg|jd  dg }	|j|j }
|d|	|
 i td|j|jt|dds|jn|j|j|j|j|j|j|jd
||jt|dd|jt|d|d|}t|jd}td|j|j|jddd}t|||d
t j!dS )zCreate a NeMo Llama4OmniConfig from the HF model config.

        Translates the HF configuration parameters to the equivalent NeMo
        configuration.

        Returns:
            Llama4OmniConfig: NeMo configuration for Llama models
        r   )
AutoConfigGenerationConfigNc                 S   s(   d}| | dkr|d }| | dks|S )N   r      r&   )
vocab_sizebaser&   r&   r'   make_vocab_size_divisible_by1  s
   zAHFLlama4OmniImporter.config.<locals>.make_vocab_size_divisible_by)moe_router_topknum_moe_experts
qk_l2_norm#moe_shared_expert_intermediate_sizemoe_ffn_hidden_sizerope_scaling	rope_typellama3Tfactorg       @)r   rope_scaling_factorFinterleave_moe_layer_stepr   r   intermediate_size_mlptie_word_embeddingshead_dim)r   r4   r5   num_attention_headsinit_method_stdlayernorm_epsilonnum_query_groups
seq_lengthrotary_basegated_linear_unitr   #share_embeddings_and_output_weightsr   kv_channelsgeneration_config)r   r0   r1   )r,   r-   r9   bf16params_dtyper&   )"rO   r   r   rP   rQ   	Exceptionr   vision_confignum_experts_per_tokr   use_qk_normintermediate_sizegetattrr   getupdatenum_hidden_layersr   Llama4TextConfigr4   r   r   initializer_rangerms_norm_epsnum_key_value_headsmax_position_embeddings
rope_thetar   r   r   
output_dimr   rT   rU   )rH   r   r   rZ   r   r   src_text_configsrc_vision_configargspatternnum_patternsr,   r-   r9   r&   r&   r'   rF     s|   


	zHFLlama4OmniImporter.configN)rD   r!   )r;   r<   r=   r>   r   rI   r   r]   rV   propertyrE   r   Moduler   r   r   rF   r&   r&   r&   r'   rC   U   s    	m
%rC   r   c                   @   sx   e Zd ZdZejfdddZedddZd	e	de	fd
dZ
dd ZedddZde	deeef fddZdd ZdS )HFLlama4OmniExportera  Exporter for converting NeMo Llama4 Omni models to Hugging Face format.

    This class handles the conversion of NeMo's Llama4OmniModel to Hugging Face's
    Llama4ForConditionalGeneration format, including weight mapping and configuration translation.
    rD   r   c                 C   sR   ddl m} ddlm} |  |j| j|dW  d   S 1 s"w   Y  dS )zInitialize a HF Llama4ForConditionalGeneration instance.

        Args:
            dtype: Data type for model parameters

        Returns:
            Llama4ForConditionalGeneration: Initialized HF Llama4 Omni model
        r   r   )no_init_weightsrL   N)rO   r   transformers.modeling_utilsr   _from_configrF   )rH   rY   r   r   r&   r&   r'   rI     s
   	$zHFLlama4OmniExporter.initHFLlama4Configc                 C   s  t jt| dd}ddlm} ddlm} ddlm} |j}|j}|j	r-|j
ddd	d
dnd}t|ddurU|j}t|trA|}	nt|trN|jt| }	n	td| d}	|d%i d|jd|jd|jd|jd|jd|jd|jd|jd|jd|jd|jd|jd|jd|jd|j d|d|	d | j!"d!gd }
||j|j#|j|j|jd"|j|j$|j%|j&|j&|j&d#}||
|| j!j'| j!j(d$}|S )&zCreate a HF LlamaOmniConfig from the NeMo model config.

        Translates the NeMo configuration parameters to the equivalent HF
        configuration.

        Returns:
            HFLlamaConfig: HF configuration for Llama4 Omni models
        zmodel.configsubpathr   r
   )r   r   g      ?g      @i    r   )r   low_freq_factorhigh_freq_factor original_max_position_embeddingsr   Nr   zUnexpected moe_layer_freq r   r   r4   r   r   r   r   r   r   r   r   r   r   r   r   r   r   pad_token_idz<|finetune_right_pad|>   )r4   
image_sizer   norm_epsr   num_channelsr   
patch_sizepixel_shuffle_ratioprojector_input_dimprojector_output_dimvision_output_dim)r   r   bos_token_ideos_token_idr&   ))r   load_contextrQ   rO   r   r   r   r,   r-   r   r   r   r   r   intr   r   sum
ValueErrorr   r4   r   r5   r   r   r   r   r   r   r   r   r   r   rE   tokens_to_idsimg_h	patch_dimr  r   bos_ideos_id)rH   rZ   r   HFLlama4TextConfigHFLlama4VisionConfigsource_languagesource_visionr   r   r   target_languagetarget_visionrF   r&   r&   r'   rF     s   



	
zHFLlama4OmniExporter.configrJ   c                 C   s   t d | | \}}t d t d | tj}t d | |||}| }|| z
| j	j	| W |S  t
yJ   t d Y |S w )zApply the conversion from NeMo to HF format.

        Args:
            output_path: Path where the converted model will be saved

        Returns:
            Path: Path to the saved HF model
        z<Loading Llama4Omni NeMo checkpoint. This may take a while...z"Llama4Omni NeMo checkpoint loaded.zInitializing the HF model..zStart Converting the model..zFailed to save tokenizer)r   info	ckpt_loadrI   rT   rU   rV   cpusave_pretrainedrE   r   warning)rH   rJ   rZ   source_configr[   r&   r&   r'   r]     s    
	



zHFLlama4OmniExporter.applyc                 C   s   |  ||}i ddddddddd	d
dddddddddddddddddddddd d!d"d#d$d%d&d'd(d)d*d+d,d-d.d/d0d1}tttttjd2d3tjd4tjd5d6tjd4g}tj	||||d7S )8a  Convert state dict from NeMo format to HF format.

        Maps the weights from the NeMo model to the HF model according to
        the appropriate mapping scheme.

        Args:
            source: Source NeMo model
            target: Target HF model
            source_config: Source NeMo Config

        Returns:
            The target model with weights transferred from source
        r_   r^   ra   r`   rc   rb   re   rd   rg   rf   ri   rh   rk   rj   rm   rl   ro   rn   rq   rp   rs   rr   ru   rt   rw   rv   ry   rx   r{   rz   r}   r|   r   r~   r   r   r   r   r   r   r   r   r   r   r   z<language_model.model.layers.*.feed_forward.experts.down_projz?language_model.model.layers.*.feed_forward.experts.gate_up_projr   )r   r   r   r   r   r   r   r   r   r   r   z=language_model.decoder.layers.*.mlp.experts.linear_fc2.weightz=language_model.decoder.layers.*.mlp.experts.linear_fc1.weightr   r   r   r   r   r   r   )
r   _export_cls_token_export_language_qkv_export_vision_qkv_export_vision_qkv_biasr   r   r   	split_fc1r   )rH   rZ   r[   r&  r   r   r&   r&   r'   rV     s   	Fz"HFLlama4OmniExporter.convert_stater	   c                 C   s   t jt| ddjS )zzGet the tokenizer from the NeMo model.

        Returns:
            TokenizerSpec: Tokenizer from the NeMo model
        modelr  )r   r  rQ   rE   rG   r&   r&   r'   rE   j  s   zHFLlama4OmniExporter.tokenizerpathc              
   C   s  |d d }|  stdt|d}t|}W d   n1 s#w   Y  |d }i }|d d d	 }|d d
 d	 }t|d }	|	D ]>\}
}d|
v rPqG|
dd}d|v r|d|ksh|d|krt	|dD ]}|| ||ddt
| < qo|||< qG||d fS )a  
        This function loads the state dict directly from a distributed checkpoint, and modify the state dict
        so that it is consistent with the key names you would get from loading the checkpoint into a model.
        This is a more memory-efficient method to obtain a state dict without initializing the nemo model.

        Args:
            path (Path): The path from which the model will be loaded.

        Returns
        -------
            Tuple[Dict, Dict]: The loaded state dict and the yaml config dict.
        contextz
model.yamlz@model.yaml is not found in the context folder of the checkpoint.rNr   rF   r,   r   r-   T_extra_statezmodule. layersr   zlayers.)existsFileNotFoundErroropenyaml	safe_loadr   itemsreplacesizer   rQ   )rH   r-  
model_yamlstreamrF   dist_ckpt_folderr   langauge_layersvision_layersdistributed_model_weightskvnew_kir&   r&   r'   r"  s  s(   $ 
zHFLlama4OmniExporter.ckpt_loadc                 C   s  t |d d D ]s}d}t|d d tr-t|d d |d d ks%J |d d | }|r`|d| d}|ddd	 |d| d
< |d| d}|ddd	 |d| d< qd| d|v sjJ |d| d}||d| d< qt|}|S )z
        For MoE layer, we transpose the gate_up_proj and down_proj to match HF implementation.
        For dense layer, we change the name for the post attention layer norm to
        avoid the many-to-one mapping in the conversion.
        r,   r   Tr   zlanguage_model.decoder.layers.z&.mlp.experts.experts.linear_fc1.weightr   r   r   z.mlp.experts.linear_fc1.weightz&.mlp.experts.experts.linear_fc2.weightz.mlp.experts.linear_fc2.weightz!.mlp.linear_fc1.layer_norm_weightz.pre_mlp_layernorm.weight)r   r   r   r   r   permute
contiguousr   )rH   r   r&  r   r   r   rZ   r&   r&   r'   r     s:   


z0HFLlama4OmniExporter._modify_llama4_source_stateN)rD   r   )rD   r   )rD   r	   )r;   r<   r=   r>   rT   rU   rI   r   rF   r   r]   rV   rE   r   r   r"  r   r&   r&   r&   r'   r   y  s    Sl$r   )vision_model.class_embeddingzvision_model.class_token)r   r   ctxc                 C   s   | dddS )Nr   )reshaperH  	cls_tokenr&   r&   r'   r     s   r   rG  c                 C   s   |  S r%   )r   rK  r&   r&   r'   r'    s   r'  )z5language_model.model.layers.*.self_attn.q_proj.weightz5language_model.model.layers.*.self_attn.k_proj.weightz5language_model.model.layers.*.self_attn.v_proj.weightz@language_model.decoder.layers.*.self_attention.linear_qkv.weightc              
   C   2   | j jj}t||||j|j|j|j |j|jdS Nhead_numr   heads_per_groupr4   	head_size)r[   rF   r,   r   r   r   r4   r   rH  qrA  rB  megatron_configr&   r&   r'   r        


r   c                 C   s.   | j jj}t||j|j|j|j |j|jdS rN  )r[   rF   r   r   r   r   r4   r   rH  qkv	hf_configr&   r&   r'   r(    s   


r(  )z3vision_model.model.layers.*.self_attn.q_proj.weightz3vision_model.model.layers.*.self_attn.k_proj.weightz3vision_model.model.layers.*.self_attn.v_proj.weightz>vision_model.decoder.layers.*.self_attention.linear_qkv.weightc              
   C   rM  rN  )r[   rF   r-   r   r   r   r4   r   rS  r&   r&   r'   r     rV  r   c                 C   s,   | j jj}t||j|jd|j|j|j dS )Nr   rO  )r[   rF   r   r   r   r4   rW  r&   r&   r'   r)    s   


r)  )z1vision_model.model.layers.*.self_attn.q_proj.biasz1vision_model.model.layers.*.self_attn.k_proj.biasz1vision_model.model.layers.*.self_attn.v_proj.biasz<vision_model.decoder.layers.*.self_attention.linear_qkv.biasc              
   C   sH   | j jj}t|d|d|d|j|j|j|j d|jddS )NrI  r   rO  )	r[   rF   r-   r   	unsqueezer   r   r   r   )rH  q_biask_biasv_biasrU  r&   r&   r'   r   )  s   


	r   c                 C   s(   | j jj}t||j|jd|j|j dS )Nr   )rP  r   rQ  rR  )r[   rF   r   r   r   r4   )rH  qkv_biasrY  r&   r&   r'   r*  @  s   


r*  )Bdataclassesr   r   pathlibr   typingr   r   r   rT   r6  r   "nemo.collections.common.tokenizersr	   nemo.collections.llmr   r   r   r   &nemo.collections.vlm.llama4.model.baser   r   (nemo.collections.vlm.llama4.model.visionr   %nemo.collections.vlm.neva.model.llavar   r   r    nemo.collections.vlm.vision.baser   .nemo.export.trt_llm.nemo_ckpt_loader.nemo_filer   nemo.lightningr   r   nemo.lightning.io.stater   r   
nemo.utilsr   ,megatron.core.transformer.transformer_configr   HAVE_TEImportErrorrO   r   r   r   r!   r#   rA   model_importerModelConnectorrC   model_exporterr   r   TransformCTXr   r'  r   r(  r   r)  r   r*  r&   r&   r&   r'   <module>   s   
  
%  G