o
    }oi)                  	   @   s  d dl Z d dlZd dlmZmZ d dlmZ d dlmZ d dl	m
Z
mZmZmZmZmZmZmZmZ d dlZd dlm  mZ d dlZd dlmZ d dlmZmZmZ d dlmZ d d	lm Z  d d
l!m"Z" d dl#m$Z$m%Z%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z,m-Z-m.Z. d dl/m0Z0 d dl1m2Z2 z
d dl3m4Z4 dZ5W n e6y   dZ5Y nw e
rd dl7mZ8 d dl9m:Z:m;Z; d dl<m=Z> d dl<m?Z? d dl@mAZA d dlBmCZC eG dd deZ=eG dd de=ZDeG dd de=ZEeG d d! d!e=ZFeG d"d# d#e=ZGeG d$d% d%eGZHeG d&d' d'eGZIeG d(d) d)eGZJeG d*d+ d+eHZKeG d,d- d-eHZLeG d.d/ d/eHZMeG d0d1 d1eHZNeG d2d3 d3eHZOeG d4d5 d5eDZPeG d6d7 d7eEZQeG d8d9 d9e=ZReG d:d; d;eFZSeG d<d= d=eGZTeG d>d? d?eTZUeG d@dA dAeTZVG dBdC dCeZWG dDdE dEeWZXe%YeWdFG dGdH dHe%jZdIeWf Z[e%\eWdFG dJdK dKe%jZeWdIf Z]e%\eWdLG dMdN dNe]Z^	O	P	Q	Rd^dSe_dTe_dUe_dVe`fdWdXZaebdYe,dZejcfd[d\Zdg d]ZedS )_    N)	dataclassfield)partial)Path)	TYPE_CHECKING	AnnotatedAnyCallableDictListOptionalTupleUnion)nn)	GPTConfigGPTModeltorch_dtype_from_mcore_configget_llama4_layer_spec)Config)load_distributed_model_weights)OptimizerModuleioteardown)ADAPTER_META_FILENAME)ckpt_to_weights_subdir)TransformCTXTransformFns_ModelState)dtype_from_hf)logging)
ModuleSpecTF)r   )AutoPeftModelForCausalLM
PeftConfigLlamaConfig)LlamaForCausalLMAutoTokenizer)TokenizerSpecc                   @   s   e Zd ZU dZdZeed< ejZ	e
ed< dZeed< dZeed< d	Zeed
< dZeed< dZeed< dZeed< d	Zeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZee ed< dS )r%   zConfiguration class for Llama models.

    Extends GPTConfig with specific settings optimized for Llama architectures.
    Includes configurations for normalization, activation functions, and various
    architecture-specific options.
    RMSNormnormalizationactivation_funcTgated_linear_unitropeposition_embedding_typeFadd_bias_linear   
seq_length        attention_dropouthidden_dropout#share_embeddings_and_output_weightsbias_activation_fusionmasked_softmax_fusionpersist_layer_normbias_dropout_fusionapply_rope_fusionNuse_transformer_engine_op_fuser)__name__
__module____qualname____doc__r+   str__annotations__Fsilur,   r	   r-   boolr/   r0   r2   intr4   floatr5   r6   r7   r8   r9   r:   r;   r<   r    rH   rH   X/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/llm/gpt/model/llama.pyr%   9   s"   
 r%   c                   @   N   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed< dZ
eed	< d
S )Llama2Config7BzConfiguration for a 7B parameter Llama 2 model.

    Specific configuration for the 7B Llama 2 model with 32 layers,
    4096 hidden size, and 32 attention heads.
        
num_layersr1   hidden_sizenum_attention_headsnum_query_groupsi +  ffn_hidden_sizeNr=   r>   r?   r@   rM   rF   rB   rN   rO   rP   rQ   rH   rH   rH   rI   rK   U      
 rK   c                   @   rJ   )Llama2Config13BzConfiguration for a 13B parameter Llama 2 model.

    Specific configuration for the 13B Llama 2 model with 40 layers,
    5120 hidden size, and 40 attention heads.
    (   rM      rN   rO   rP   i 6  rQ   NrR   rH   rH   rH   rI   rT   d   rS   rT   c                   @   sN   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed	< d
Z
eed< dS )Llama2Config70BzConfiguration for a 70B parameter Llama 2 model.

    Specific configuration for the 70B Llama 2 model with 80 layers,
    8192 hidden size, and 64 attention heads with 8 query groups.
    P   rM       rN   @   rO      rP    p  rQ   NrR   rH   rH   rH   rI   rW   s   rS   rW   c                   @   s   e Zd ZU dZdZeed< dZeed< dZ	eed< dZ
eed< d	Zeed
< dZeed< dZeed< ejZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dS )Llama3ConfigzConfiguration for Llama 3 models.

    Base configuration for Llama 3 architecture with common settings
    across different model sizes, including group query attention (GQA)
    and architecture-specific settings.
    r[   rP   r3   r5   r4   r*   r+   g{Gz?init_method_stdgh㈵>layernorm_epsilonFr0   r,   Tr-   r7   r8   r9   r:   r;   r6   r.   r/         ?rotary_percentN)r=   r>   r?   r@   rP   rF   rB   r5   rG   r4   r+   rA   r^   r_   r0   rE   rC   rD   r,   r	   r-   r7   r8   r9   r:   r;   r6   r/   ra   rH   rH   rH   rI   r]      s&   
 r]   c                       sb   e Zd ZU dZdZeed< dZeed< dZeed< dZ	e
ed	< d
Zeed< dd fddZ  ZS )Llama31ConfigzConfiguration for Llama 3.1 models.

    Extends Llama3Config with specific settings for Llama 3.1 models,
    including RoPE scaling parameters.
           @scale_factorr`   low_freq_factor      @high_freq_factorrY   old_context_leng{Gz?r^   NreturnMCoreGPTModelc                    s8   t  ||||}t|jj| j| j| j| jd|j_|S )a  Configure and instantiate a Megatron Core Llama 3.1 model.

        Extends the base configuration with Llama 3.1 specific RoPE scaling.

        Args:
            tokenizer: Tokenizer used with the model
            pre_process: Whether to include pre-processing in the model
            post_process: Whether to include post-processing in the model

        Returns:
            MCoreGPTModel: Configured Megatron Core GPT model instance
        )factorre   rg   rh   )	superconfigure_modelapply_rope_scalingrotary_pos_embinv_freqrd   re   rg   rh   )self	tokenizerpre_processpost_processvp_stagemodel	__class__rH   rI   rm      s   
zLlama31Config.configure_model)NNN)ri   rj   )r=   r>   r?   r@   rd   rG   rB   re   rg   rh   rF   r^   rm   __classcell__rH   rH   rw   rI   rb      s   
 rb   c                   @   Z   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed	< d
Z
eed< dZeed< dS )Llama3Config8BzConfiguration for an 8B parameter Llama 3 model.

    Specific configuration for the 8B Llama 3 model with 32 layers,
    4096 hidden size, and 32 attention heads.
      rotary_baserY   r2   rL   rM   r1   rN    8  rQ   rO   Nr=   r>   r?   r@   r}   rF   rB   r2   rM   rN   rQ   rO   rH   rH   rH   rI   r{      s   
 r{   c                   @   sr   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed< d	Z
eed
< dZeed< dZeed< dZeed< dS )Llama3Config70BzConfiguration for a 70B parameter Llama 3 model.

    Specific configuration for the 70B Llama 3 model with 80 layers,
    8192 hidden size, and 64 attention heads.
    r|   r}   rY   r2   rX   rM   rN   r\   rQ   rZ   rO   gܠ[;Q?r^      make_vocab_size_divisible_byN)r=   r>   r?   r@   r}   rF   rB   r2   rM   rN   rQ   rO   r^   rG   r   rH   rH   rH   rI   r      s   
 r   c                   @   rz   )Llama31Config8BzConfiguration for an 8B parameter Llama 3.1 model.

    Specific configuration for the 8B Llama 3.1 model with 32 layers,
    4096 hidden size, and 32 attention heads, supporting a longer context
    length of 131K tokens.
    r|   r}      r2   rL   rM   r1   rN   r~   rQ   rO   Nr   rH   rH   rH   rI   r      s   
 r   c                   @   f   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed	< d
Z
eed< dZeed< dZeed< dS )Llama31Config70BzConfiguration for a 70B parameter Llama 3.1 model.

    Specific configuration for the 70B Llama 3.1 model with 80 layers,
    8192 hidden size, and 64 attention heads, supporting a longer context
    length of 131K tokens.
    r|   r}   r   r2   rX   rM   rY   rN   r\   rQ   rZ   rO   r   r   Nr=   r>   r?   r@   r}   rF   rB   r2   rM   rN   rQ   rO   r   rH   rH   rH   rI   r         
 r   c                   @   sf   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed	< d
Z
eed< dZeed< dZeed< dS )Llama31Config405BzConfiguration for a 405B parameter Llama 3.1 model.

    Specific configuration for the 405B Llama 3.1 model with 126 layers,
    16384 hidden size, and 128 attention heads, supporting a longer context
    length of 131K tokens.
    r|   r}   r   r2   ~   rM    @  rN   i   rQ   r   rO   r   Nr   rH   rH   rH   rI   r     r   r   c                   @   s   e Zd ZU dZdZeed< dZeed< dZ	e
ed< dZe
ed	< d
Ze
ed< dZe
ed< dZe
ed< dZe
ed< dZe
ed< dZe
ed< dS )Llama32Config1BzConfiguration for a 1B parameter Llama 3.2 model.

    Specific configuration for the 1B Llama 3.2 model with 16 layers,
    2048 hidden size, and 32 attention heads (8 query groups).
    g      @@rd   Tr6   r|   r}   r   r2      rM   i   rN   rY   rQ   rL   rO   r[   rP   r   r   N)r=   r>   r?   r@   rd   rG   rB   r6   rE   r}   rF   r2   rM   rN   rQ   rO   rP   r   rH   rH   rH   rI   r        
 r   c                   @   s   e Zd ZU dZdZeed< dZeed< dZ	eed< dZ
eed	< d
Zeed< dZeed< dZeed< dZeed< dZeed< dZeed< dS )Llama32Config3BzConfiguration for a 3B parameter Llama 3.2 model.

    Specific configuration for the 3B Llama 3.2 model with 28 layers,
    3072 hidden size, and 24 attention heads (8 query groups).
    rL   rd   Tr6   r|   r}   r   r2      rM   i   rN   rY   rQ      rO   r[   rP   r   r   N)r=   r>   r?   r@   rd   rF   rB   r6   rE   r}   r2   rM   rN   rQ   rO   rP   r   rH   rH   rH   rI   r   1  r   r   c                   @   *   e Zd ZU dZdZeed< dZeed< dS )CodeLlamaConfig7BzConfiguration for a 7B parameter CodeLlama model.

    Extends Llama2Config7B with modified settings specifically for code generation,
    including longer context length and different rotary base.
    @B r}   r   r2   Nr=   r>   r?   r@   r}   rF   rB   r2   rH   rH   rH   rI   r   E     
 r   c                   @   r   )CodeLlamaConfig13BzConfiguration for a 13B parameter CodeLlama model.

    Extends Llama2Config13B with modified settings specifically for code generation,
    including longer context length and different rotary base.
    r   r}   r   r2   Nr   rH   rH   rH   rI   r   Q  r   r   c                   @   r   )CodeLlamaConfig34BzConfiguration for a 34B parameter CodeLlama model.

    Specific configuration for the 34B CodeLlama model with 48 layers,
    8192 hidden size, and 64 attention heads (8 query groups).
    0   rM   rY   rN   rZ   rO   r[   rP   i V  rQ   r   r}   r   r2   N)r=   r>   r?   r@   rM   rF   rB   rN   rO   rP   rQ   r}   r2   rH   rH   rH   rI   r   ]  s   
 r   c                   @   s   e Zd ZdZdS )CodeLlamaConfig70BzConfiguration for a 70B parameter CodeLlama model.

    Extends Llama2Config70B with settings specifically for code generation.
    N)r=   r>   r?   r@   rH   rH   rH   rI   r   n  s    r   c                   @   s  e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed	< d
Z
eed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< edd dZeeedgef f ed< dZeed< dZeed< dZeed< d Zeed!< dZeed"< d#Zeed$< d%Zeed&< d'Z e!e ed(< dZ"eed)< dZ#eed*< dZ$eed+< dZ%eed,< dZ&eed-< d.Z'e(ed/< dZ)eed0< d'S )1Llama4Configz2
    Configuration for Llama4 language model.
    r|   r}   rY   r2   r   rM   rV   rN   r   rQ   rU   rO   i@ 
vocab_sizeFr0   Tr-   rotary_interleavedr;      nope_layer_intervalc                   C   s   t S Nr   rH   rH   rH   rI   <lambda>  s    zLlama4Config.<lambda>default_factoryr%   transformer_layer_specmoe_grouped_gemm#moe_shared_expert_intermediate_sizemoe_ffn_hidden_size   moe_router_topkmoe_router_pre_softmaxsigmoidmoe_router_score_functionalltoallmoe_token_dispatcher_typeNmoe_router_dtypemoe_apply_probs_on_inputmoe_shared_expert_overlapmoe_permute_fusion
qk_l2_normrope_scalingrc   rope_scaling_factorattention_chunk_size)*r=   r>   r?   r@   r}   rF   rB   r2   rM   rN   rQ   rO   r   r0   rE   r-   r   r;   r   r   r   r   r!   r	   r   r   r   r   r   r   rA   r   r   r   r   r   r   r   r   r   rG   r   rH   rH   rH   rI   r   x  s@   
  r   c                   @   sB   e Zd ZU dZdZeed< dZeed< dZ	e
ed< dZeed< d	S )
Llama4Experts16Configz4
    Configuration for llama4 16-experts model.
    r   num_moe_expertsTr   rc   r   r   N)r=   r>   r?   r@   r   rF   rB   r   rE   r   rG   r   rH   rH   rH   rI   r     s   
 r   c                   @   sX   e Zd ZU dZdZeed< dZeed< e	dd dZ
eeee f ed	< dZeed
< dS )Llama4Experts128Configz5
    Configuration for llama4 128-experts model.
    r   r   Fr   c                   C   s   ddgd S )Nr   r   r   rH   rH   rH   rH   rI   r     s    zLlama4Experts128Config.<lambda>r   moe_layer_freqr   N)r=   r>   r?   r@   r   rF   rB   r   rE   r   r   r   r   r   rH   rH   rH   rI   r     s   
 "r   c                       sp   e Zd ZdZddddg fdeee ee f dee ded dee	e
jge
jf  dee f
 fd	d
Z  ZS )
LlamaModelzLlama model implementation based on the GPT model architecture.

    This class provides a high-level interface for Llama models,
    implementing the specific architecture and settings needed for Llama models.
    Nconfigoptimrr   r)   model_transformmodel_context_managersc                    s    t  j|pt ||||d d S N)r   rr   r   r   )rl   __init__r%   rq   r   r   rr   r   r   rw   rH   rI   r     s   
zLlamaModel.__init__)r=   r>   r?   r@   r   r   r%   r   r   r	   r   Moduler   r   ry   rH   rH   rw   rI   r     s$    r   c                       sf   e Zd ZdZ				d
deee ee f dee ded dee	e
jge
jf  f fdd	Z  ZS )MLPerfLoRALlamaModela|  Memory-optimized Llama model implementation for MLPerf LoRA fine-tuning.

    This class wraps LlamaModel and adds context managers around configure_model
    to reduce memory consumption during initialization. It applies techniques like
    avoiding unnecessary gradients and using FP8 parameter initialization.

    Changes made here are experimental, proceed with caution.
    Nr   r   rr   r)   r   c                    s*   t  g}t j|pt ||||d d S r   )torchno_gradrl   r   r%   r   rw   rH   rI   r     s   

zMLPerfLoRALlamaModel.__init__)NNNN)r=   r>   r?   r@   r   r   r%   r   r   r	   r   r   r   ry   rH   rH   rw   rI   r     s    r   hfc                   @   sl   e Zd ZdZdefddZdedefddZdd	 Ze	dddZ
dejdefddZe	defddZdS )HFLlamaImporterzImporter for converting Hugging Face Llama models to NeMo format.

    This class handles the conversion of Hugging Face's LlamaForCausalLM models
    to NeMo's LlamaModel format, including weight mapping and configuration translation.
    ri   c                 C   s   t | j| jdS )zInitialize a NeMo LlamaModel instance.

        Returns:
            LlamaModel: Initialized NeMo Llama model with the appropriate configuration
                        and tokenizer.
        )rr   )r   r   rr   rq   rH   rH   rI   init  s   zHFLlamaImporter.initoutput_pathc           	      C   s   ddl m}m} |t| }t|ddkr)ddl m} |jt| dd}|j}n	|jt| dd}|  }| 	|}| 
|| | || td| d	|j d
 t|| ~~|S )zApply the conversion from HF to NeMo format.

        Args:
            output_path: Path where the converted model will be saved

        Returns:
            Path: Path to the saved NeMo model
        r   )
AutoConfigAutoModelForCausalLM
model_typellama4)Llama4ForConditionalGenerationautotorch_dtypez.Converted Llama model to Nemo, model saved to z in .)transformersr   r   from_pretrainedrA   getattrr   language_modelr   
nemo_setupconvert_state	nemo_saveprintdtyper   )	rq   r   r   r   	hf_configr   sourcetargettrainerrH   rH   rI   apply  s   	

zHFLlamaImporter.applyc                 C   s   dddddd}t |jddr|d	= tjd
dtjdg}dt |jdv rN| |}dddddddd}|| |tjddtj	dtjddtj	dg n|ddd |
tjddtj	d tj||||dS )aJ  Convert state dict from HF format to NeMo format.

        Maps the weights from the HF model to the NeMo model according to
        the appropriate mapping scheme.

        Args:
            source: Source HF model
            target: Target NeMo model

        Returns:
            The result of applying the transforms
         embedding.word_embeddings.weight2decoder.layers.*.self_attention.linear_proj.weight<decoder.layers.*.self_attention.linear_qkv.layer_norm_weightdecoder.final_layernorm.weightoutput_layer.weight)model.embed_tokens.weight&model.layers.*.self_attn.o_proj.weight%model.layers.*.input_layernorm.weightmodel.norm.weightlm_head.weighttie_word_embeddingsFr   z&model.layers.*.self_attn.q_proj.weightz&model.layers.*.self_attn.k_proj.weightz&model.layers.*.self_attn.v_proj.weight1decoder.layers.*.self_attention.linear_qkv.weight
source_key
target_keyfnr   r   )decoder.layers.*.pre_mlp_layernorm.weight1decoder.layers.*.mlp.linear_fc1.layer_norm_weight"decoder.layers.*.mlp.router.weight5decoder.layers.*.mlp.shared_experts.linear_fc2.weightz/decoder.layers.*.mlp.experts.linear_fc2.weight*z/decoder.layers.*.mlp.experts.linear_fc1.weight*&decoder.layers.*.mlp.linear_fc2.weight).model.layers.*.post_attention_layernorm.weightz4model.layers.*.dense-post_attention_layernorm.weight)model.layers.*.feed_forward.router.weight:model.layers.*.feed_forward.shared_expert.down_proj.weightz/model.layers.*.feed_forward.experts.*.down_projz2model.layers.*.feed_forward.experts.*.gate_up_projz,model.layers.*.feed_forward.down_proj.weightz:model.layers.*.feed_forward.shared_expert.gate_proj.weightz8model.layers.*.feed_forward.shared_expert.up_proj.weight5decoder.layers.*.mlp.shared_experts.linear_fc1.weightz,model.layers.*.feed_forward.gate_proj.weightz*model.layers.*.feed_forward.up_proj.weight&decoder.layers.*.mlp.linear_fc1.weight)r   #model.layers.*.mlp.down_proj.weightz#model.layers.*.mlp.gate_proj.weightz!model.layers.*.mlp.up_proj.weightmapping
transforms)r   r   r   state_transformr   	merge_qkv_modify_llama4_source_stateupdateextend	merge_fc1appendapply_transforms)rq   r   r   r  r  llama4_mappingrH   rH   rI   r     sf   

zHFLlamaImporter.convert_stater(   c                 C   s   ddl m} || t| S )zGet the tokenizer for the HF model.

        Returns:
            AutoTokenizer: Tokenizer instance initialized from the HF model's tokenizer
        r   r'   )=nemo.collections.common.tokenizers.huggingface.auto_tokenizerr(   save_hf_tokenizer_assetsrA   )rq   r(   rH   rH   rI   rr     s   zHFLlamaImporter.tokenizerr   c           
   	   C   s8  |  }|jj}t| jjD ]}d}t| jjtr+t| jj| jjks%J | jj| }|r|	d| d}t
j||dd}t|D ]\}}	|	 dd|d| d| d< qB|	d| d	}t
j||dd}t|D ]\}}	|	 dd|d| d| d
< qmq|	d| d}||d| d< qt|}|S )a  
        In Llama4, HF weight for local experts are mapped with a single tensor.
        Pre-chunk it before convert_state.
        For dense layer, we change the name for the post attention layer norm to
        avoid the many-to-one mapping in the conversion.
        Tzmodel.layers.z".feed_forward.experts.gate_up_projr   dimr   z.feed_forward.experts.z.gate_up_projz.feed_forward.experts.down_projz
.down_projz .post_attention_layernorm.weightz&.dense-post_attention_layernorm.weight)
state_dictr   num_local_expertsrangerM   
isinstancer   listlenpopr   chunk	enumeratesqueeze	transposer   )
rq   r   r  num_expertslayer_iis_moe_layerweightweightsexpert_iexpert_weightrH   rH   rI   r    s.   z+HFLlamaImporter._modify_llama4_source_statec                 C   s  ddl m}m} |t| }z	|t| }W n ty#   d}Y nw dd }t|dddurD|jddkrDt	t
|jd	d
d}nt}i }dt|ddv rt}t|dddkr]|j}|j|j|j|j|jd}t|dddur|jddkr|d|jd	d
d n|ddi t|dddkr|j|j dksJ dg|jd  dg }|j|j }	|d||	 i |d*i d|jd|jdt|dds|jnT|jd|jd|jd|jd|jd|jd|jddd ||jd!t|d"dd#t|tj kd$t|tj!kd%t|d&|d'|jd(t|d)d|}
|
S d|jd|jd|jd|jd|jd|jddd ||jd!t|d"dd#t|tj kd$t|tj!kd%t|d&|d'|jd(t|d)d|}
|
S )+zCreate a NeMo LlamaConfig from the HF model config.

        Translates the HF configuration parameters to the equivalent NeMo
        configuration.

        Returns:
            LlamaConfig: NeMo configuration for Llama models
        r   )r   GenerationConfigNc                 S   s(   d}| | dkr|d }| | dks|S )Nr   r      rH   )r   baserH   rH   rI   r     s
   z<HFLlamaImporter.config.<locals>.make_vocab_size_divisible_byr   	rope_typellama3rk   rc   )rd   r   r   )r   r   r   r   r   T)r   r   Finterleave_moe_layer_stepr   r   rM   rN   rQ   intermediate_size_mlprO   r^   r_   rP   r2   r}   r-   r   r6   r   fp16bf16params_dtypegeneration_configr   kv_channelshead_dimrH   )"r   r   r$  r   rA   	Exceptionr   r   getr   rb   r%   r   text_confignum_experts_per_tokr  use_qk_normintermediate_sizer  num_hidden_layersr)  rN   r*  rO   initializer_rangerms_norm_epsnum_key_value_headsmax_position_embeddings
rope_thetar   r   r   float16bfloat16)rq   r   r$  r   r.  r   clsargspatternnum_patternsoutputrH   rH   rI   r     s   
  
	

	

zHFLlamaImporter.configN)ri   r(   )r=   r>   r?   r@   r   r   r   r   r   propertyrr   r   r   r   r  r%   r   rH   rH   rH   rI   r     s    	!f
$r   r&   c                   @   s   e Zd ZdZejfdddZdedefddZdd
dZ	e
dddZe
dddZdd Zdd Zdedeeef fddZdd Zd	S )HFLlamaExporterzExporter for converting NeMo Llama models to Hugging Face format.

    This class handles the conversion of NeMo's LlamaModel to Hugging Face's
    LlamaForCausalLM format, including weight mapping and configuration translation.
    ri   r&   c                 C   sR   ddl m} ddlm} |  |j| j|dW  d   S 1 s"w   Y  dS )zInitialize a HF LlamaForCausalLM instance.

        Args:
            dtype: Data type for model parameters

        Returns:
            LlamaForCausalLM: Initialized HF Llama model
        r   )r   )no_init_weightsr   N)r   r   transformers.modeling_utilsrF  from_configr   )rq   r   r   rF  rH   rH   rI   r     s
   	$zHFLlamaExporter.initr   c                 C   s   |   r| | \}}n| t| \}}|j}| t|}| |||}| }| jj	r?|
 }|d |j||d n|| z
| jj| W |S  ty]   td Y |S w )zApply the conversion from NeMo to HF format.

        Args:
            output_path: Path where the converted model will be saved

        Returns:
            Path: Path to the saved HF model
        r   )r  zFailed to save tokenizer)	is_llama4	ckpt_load	nemo_loadrA   r   r   r   r   cpur   r  r  save_pretrainedrr   r1  r    warning)rq   r   r   source_config_r   r  rH   rH   rI   r   !  s&   	

zHFLlamaExporter.applyNc                 C   s   |   }|r|dusJ | ||}dddddd}tjdd	tjd
tjddtjd
tjddtjd
g}| jj	sD|
tjddtjd
 |rk|d |dddddd |tjddtjd
tjddtjd
g tj||||dS )a  Convert state dict from NeMo format to HF format.

        Maps the weights from the NeMo model to the HF model according to
        the appropriate mapping scheme.

        Args:
            source: Source NeMo model
            target: Target HF model
            source_config: Source NeMo config (optional, used for Llama4)

        Returns:
            The target model with weights transferred from source
        Nr   r   r   r   r   )r   r   r   r   r   r   r   r   r   r  r   r   r   r   r   r   r   z-model.layers.*.feed_forward.experts.down_projz0model.layers.*.feed_forward.experts.gate_up_proj)r   r   r   z.decoder.layers.*.mlp.experts.linear_fc2.weightz.decoder.layers.*.mlp.experts.linear_fc1.weightr   r   r   r  )rI  r  r   r  r   	split_qkv	split_fc1prune_paddingr   r   r  r  r  r	  r  )rq   r   r   rO  rI  r  r  rH   rH   rI   r   C  s~   		
	zHFLlamaExporter.convert_stater)   c                 C   s   t jt| ddjS )zzGet the tokenizer from the NeMo model.

        Returns:
            TokenizerSpec: Tokenizer from the NeMo model
        rv   subpath)r   load_contextrA   rr   r   rH   rH   rI   rr     s   zHFLlamaExporter.tokenizerHFLlamaConfigc                 C   sV  t jt| dd}t|tr| |S ddlm} d}t|tr,|j	|j
|j|jdd}|di dd	gd
|jd|jd|jd|jd|jdurL|jn2|j|j d|jd|jd|jd|jd|jd| jjd|jd|d| jjd| jjS d|jd|jd|jd|jd|jd| jjd|jd|d| jjd| jjS )zCreate a HF LlamaConfig from the NeMo model config.

        Translates the NeMo configuration parameters to the equivalent HF
        configuration.

        Returns:
            HFLlamaConfig: HF configuration for Llama models
        model.configrT  r   r$   Nr(  rk   re   rg    original_max_position_embeddingsr'  architecturesr&   r7  rN   r6  rO   r0  r;  r8  r9  r:  r<  r   r   r   bos_token_ideos_token_idrH   )r   rV  rA   r  r   create_llama4_configr   r%   rb   rd   re   rg   rh   rM   rN   rQ   rO   r/  r2   r^   r_   rP   r}   rr   r   r6   bos_ideos_id)rq   r   rW  r   rH   rH   rI   r     s   




zHFLlamaExporter.configc                 C   s   t jt| dd}t|tS )z(Check if the model config is for Llama4.rX  rT  )r   rV  rA   r  r   )rq   r   rH   rH   rI   rI    s   
zHFLlamaExporter.is_llama4c                 C   s@  ddl m} t|tsJ |jr|jdddddnd}t|d	d}d}|durCt|tr/|}nt|tr<|j	t
| }ntd
| |d i d|jd|j	d|jd|jd|jd|jd|jd|jd|jd|jd|jd|jd|jd|jd|jd|d|d| jdgd d| jjd| jj}|S )!z8Create a HF Llama4TextConfig from the NeMo Llama4Config.r   )Llama4TextConfigr`   rf   rY   r(  rY  Nr   zUnexpected moe_layer_freq r0  r7  rN   r6  r*  rO   r4  r  r;  r8  r9  r:  r5  r<  r   r   r)  pad_token_idz<|finetune_right_pad|>r\  r]  rH   )r   ra  r  r   r   r   r   rF   r  rM   sum
ValueErrorr/  rN   r   rQ   rO   r   r   r2   r^   r_   rP   r   r}   r   rr   tokens_to_idsr_  r`  )rq   r   HFLlama4TextConfigr   r   r)  r   rH   rH   rI   r^    s|   


	
z$HFLlamaExporter.create_llama4_configpathc              
      s  |d d }|  stdt|d}t|}W d   n1 s#w   Y  |d }i } fdd  |d	 }|j}t|d
 }	|	D ]7\}
}d|
v rOqF|
dd}d|v ry|	d|kryt
|	dD ]}|| ||ddt| < qg|||< qF||fS )a  
        This function loads the state dict directly from a distributed checkpoint, and modify the state dict
        so that it is consistent with the key names you would get from loading the checkpoint into a model.
        This is a more memory-efficient method to obtain a state dict without initializing the nemo model.

        Args:
            path (Path): The path from which the model will be loaded.

        Returns
        -------
            Tuple[Dict, Any]: The loaded state dict and the yaml config object.
        contextz
model.yamlz@model.yaml is not found in the context folder of the checkpoint.rNr!  c                    s,   t | trtdd fdd|  D S | S )Nr   rH   c                    s   i | ]	\}}| |qS rH   rH   ).0kkvvdict_to_objrH   rI   
<dictcomp>4  s    z?HFLlamaExporter.ckpt_load.<locals>.<lambda>.<locals>.<dictcomp>)r  dicttypeitems)drm  rH   rI   r   3  s   ,z+HFLlamaExporter.ckpt_load.<locals>.<lambda>r   T_extra_statezmodule. layersr   zlayers.)existsFileNotFoundErroropenyaml	safe_loadrM   r   rr  replacesizer  rA   )rq   rg  
model_yamlstreamr   dist_ckpt_folderr  
config_objlangauge_layersdistributed_model_weightskvnew_kirH   rm  rI   rJ    s*    
zHFLlamaExporter.ckpt_loadc                 C   s   t |jD ]g}d}t|jtrt|j|jksJ |j| }|rQ|d| d}|ddd |d| d< |d| d}|ddd |d| d	< qd| d
|v s[J |d| d
}||d| d< qt	||}|S )z
        For MoE layer, we transpose the gate_up_proj and down_proj to match HF implementation.
        For dense layer, we change the name for the post attention layer norm to
        avoid the many-to-one mapping in the conversion confi.
        Tdecoder.layers.z&.mlp.experts.experts.linear_fc1.weightr   r%  r   z.mlp.experts.linear_fc1.weightz&.mlp.experts.experts.linear_fc2.weightz.mlp.experts.linear_fc2.weightz!.mlp.linear_fc1.layer_norm_weightz.pre_mlp_layernorm.weight)
r  rM   r  r   r  r  r  permute
contiguousr   )rq   r  rO  r  r  r   r   rH   rH   rI   r  E  s.   

z+HFLlamaExporter._modify_llama4_source_state)ri   r&   r   )ri   r)   )ri   rW  )r=   r>   r?   r@   r   r>  r   r   r   r   rD  rr   r   rI  r^  r   r
   r   rJ  r  rH   rH   rH   rI   rE  
  s    
"c26(rE  zhf-peftc                       sP   e Zd ZdZejfd fddZdedefddZd	d
 Z	e
dddZ  ZS )HFLlamaPEFTExporterzExporter for converting NeMo Llama models with PEFT adapters to Hugging Face format.

    This class extends HFLlamaExporter to handle Parameter-Efficient Fine-Tuning (PEFT)
    adapters, specifically LoRA and DoRA adapters.
    ri   r"   c                    s   ddl m} t j|d}tt| ddt }t|d}t	|d }W d   n1 s.w   Y  d	
|d	d
d |_||| jddS )zInitialize a HF PEFT model.

        Args:
            dtype: Data type for model parameters

        Returns:
            AutoPeftModelForCausalLM: Initialized HF PEFT model
        r   )get_peft_model)r   F)	is_savingri  model_ckpt_pathN/)autocast_adapter_dtype)peftr  rl   r   r   rA   r   ry  jsonloadjoinsplitname_or_pathpeft_config)rq   r   r  rv   adapter_meta_pathfr  rw   rH   rI   r   n  s   	zHFLlamaPEFTExporter.initr   c                 C   sp   ddl m}m}m} tjt| dd| _| t| \}}| 	t
|j}| ||}| }|j|dd |S )zApply the conversion from NeMo PEFT model to HF format.

        Args:
            output_path: Path where the converted model will be saved

        Returns:
            Path: Path to the saved HF PEFT model
        r   )CanonicalLoRADoRALoRAzmodel.model_transformrT  F)save_embedding_layers)nemo.collections.llm.peftr  r  r  r   rV  rA   peft_objrK  r   r   r   r   rL  rM  )rq   r   r  r  r  r   rP  r   rH   rH   rI   r     s   	zHFLlamaPEFTExporter.applyc                 C   s<  ddl m} d}d}d}d}d}d}	| d	| d
| d| d	| d| d| d	| d
| d| d	| d| di}
g }t| j|r|
| d	| d| d| d	| d| d| d	| d| d| d	| d| d| d	| d| d| d	| d| d| d	|	 d| d| d	|	 d| d| d	|	 d | d!| d	|	 d"| d#i
 nY|tj| d$| d| d| dftj	d%tj| d&| d| d| dftj
d%tj| d'| d!| dftjd%tj| d(| d#| dftjd%g tj|||
|d)S )*a  Convert state dict from NeMo PEFT model to HF PEFT format.

        Maps the weights from the NeMo model to the HF model according to
        the appropriate mapping scheme for PEFT adapters.

        Args:
            source: Source NeMo model with PEFT adapters
            target: Target HF model

        Returns:
            The target model with weights transferred from source
        r   )r  r  zbase_model.model.model.layers.z"self_attention.linear_proj.adapterzmlp.linear_fc2.adapterz!self_attention.linear_qkv.adapterzmlp.linear_fc1.adapterz*.z.linear_in.weightz(*.self_attn.o_proj.lora_A.default.weightz.linear_out.weightz(*.self_attn.o_proj.lora_B.default.weightz%*.mlp.down_proj.lora_A.default.weightz%*.mlp.down_proj.lora_B.default.weightz.adapter_q.linear_in.weightz(*.self_attn.q_proj.lora_A.default.weightz.adapter_q.linear_out.weightz(*.self_attn.q_proj.lora_B.default.weightz.adapter_k.linear_in.weightz(*.self_attn.k_proj.lora_A.default.weightz.adapter_k.linear_out.weightz(*.self_attn.k_proj.lora_B.default.weightz.adapter_v.linear_in.weightz(*.self_attn.v_proj.lora_A.default.weightz.adapter_v.linear_out.weightz(*.self_attn.v_proj.lora_B.default.weightz.adapter_up.linear_in.weightz#*.mlp.up_proj.lora_A.default.weightz.adapter_up.linear_out.weightz#*.mlp.up_proj.lora_B.default.weightz.adapter_gate.linear_in.weightz%*.mlp.gate_proj.lora_A.default.weightz.adapter_gate.linear_out.weightz%*.mlp.gate_proj.lora_B.default.weightz4*.self_attention.linear_qkv.adapter.linear_in.weightr   z5*.self_attention.linear_qkv.adapter.linear_out.weightz)*.mlp.linear_fc1.adapter.linear_in.weightz**.mlp.linear_fc1.adapter.linear_out.weightr  )r  r  r  r  r  r	  r   r  r   
duplicate3rQ  
duplicate2rR  r  )rq   r   r   r  pnphp_projp_fc2p_qkvp_fc1r  r  rH   rH   rI   r     s   	
)z!HFLlamaPEFTExporter.convert_stater#   c              
   C   s   ddl m} ddlm} | jjr| jjdksJ ddgdgdgg d	d
gdgdgddgdgd	}g }| jjD ]	}|||  q7|| jj	|| jj
| jjt| j|dS )zCreate a PEFT config for the HF model.

        Translates the NeMo PEFT configuration to the equivalent HF PEFT
        configuration.

        Returns:
            PeftConfig: HF PEFT configuration
        r   )
LoraConfig)r  prez5LoRA dropout_position must be 'pre' to convert to HF.q_projk_projv_proj)r  r  r  o_projup_proj	gate_proj	down_proj)	linear_qlinear_klinear_v
linear_qkvlinear_projlinear_fc1_uplinear_fc1_gate
linear_fc1
linear_fc2)ri  target_modules
lora_alphalora_dropoutuse_dora)r  r  r  r  r  dropoutdropout_positionr  r	  r  alphar  )rq   r  r  NEMO2HFhf_target_modulestmrH   rH   rI   r    s6   

zHFLlamaPEFTExporter.peft_config)ri   r"   )ri   r#   )r=   r>   r?   r@   r   r>  r   r   r   r   rD  r  ry   rH   rH   rw   rI   r  f  s    gr  rc   r`   rf   rY   rk   re   rg   rh   c                 C   s   t d| d| d| d| d	 || }|| }dtj |  }t||k| | | }|| | ||  }	d|	 | | |	|  }
||k  ||k  }t||
|}|S )aZ  Apply RoPE scaling for extending context length in Llama models.

    This implements the NTK-aware RoPE scaling method used in Llama 3.1 models to
    extend context length beyond the original training length.

    Args:
        inv_freq: Original inverse frequency tensor
        factor: Scaling factor for context length extension
        low_freq_factor: Factor for low frequency components
        high_freq_factor: Factor for high frequency components
        old_context_len: Original context length

    Returns:
        torch.Tensor: Modified inverse frequency tensor for extended context
    zApply rope scaling with factor=z, low_freq_factor=z, high_freq_factor=z, old_context_len=r   r%  r   )r    infomathpir   where)rp   rk   re   rg   rh   low_freq_wavelenhigh_freq_wavelenwaveleninv_freq_llamasmooth_factorsmoothed_inv_freqis_medium_freqrH   rH   rI   rn   +  s    rn   ctxtensorc                 C   s"   | j j}|j}tj||dd}|S )a!  
    Split interleave-concatenated MoE expert weights.

    Args:
        ctx: Transformation context containing model configuration.
        tensor: The tensor containing concatenated expert weights.

    Returns:
        A list of tensors, each corresponding to an expert's weights.
    r   r  )r   r   r  r   r  )r  r  megatron_configr  expert_tensorsrH   rH   rI   	split_moeV  s   r  )r%   rK   rT   rW   r{   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )rc   r`   rf   rY   )fr  r  dataclassesr   r   	functoolsr   pathlibr   typingr   r   r   r	   r
   r   r   r   r   r   torch.nn.functionalr   
functionalrC   rz  #nemo.collections.llm.gpt.model.baser   r   r   +nemo.collections.llm.gpt.model.llama4_utilsr   nemo.collections.llm.utilsr   .nemo.export.trt_llm.nemo_ckpt_loader.nemo_filer   nemo.lightningr   r   r   nemo.lightning.ckpt_utilsr   nemo.lightning.io.plr   nemo.lightning.io.stater   r   r   nemo.lightning.pytorch.utilsr   
nemo.utilsr    $megatron.core.transformer.spec_utilsr!   HAVE_TEImportError"megatron.core.models.gpt.gpt_modelrj   r  r"   r#   r   r%   rW  r&   r  r(   1nemo.collections.common.tokenizers.tokenizer_specr)   rK   rT   rW   r]   rb   r{   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   model_importerModelConnectorr   model_exporterrE  r  rG   rF   rn   staticmethodTensorr  __all__rH   rH   rH   rI   <module>   s   ,&	'
  
  
] G
+