o
    }oi|                     @   s^  d dl Z d dlmZ d dlmZ d dlmZ d dlmZm	Z	m
Z
mZmZ d dlZd dlmZ d dlmZmZmZ d dlmZmZmZmZmZ d d	lmZmZ d d
lmZ d dlmZm Z m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z- e-d\Z.Z/d dl0m1Z1 d dl2m3Z3 d dl4m5Z5 erd dl6m7Z7m8Z8 d dl9mZ: d dl9m;Z; d dl<m=Z= d dl>m?Z? eG dd deZ@G dd deZAdd d!e5fd"d#ZBeG d$d% d%ee3ZCeG d&d' d'ee3ZDG d(d) d)eZEe FeEd*G d+d, d,e jGd-eEf ZHe IeEd*G d.d/ d/e jGeEd-f ZJe IeEd0G d1d2 d2eJZKg d3ZLdS )4    N)	dataclass)partial)Path)TYPE_CHECKING	AnnotatedCallableOptionalUnion)nn)	GPTConfigGPTModeltorch_dtype_from_mcore_config)Llama31ConfigLlama31Config8BLlama31Config70BLlama31Config405BLlamaConfig)1LLAMA_31_NEMOTRON_ULTRA_253B_HETEROGENEOUS_CONFIG0LLAMA_33_NEMOTRON_SUPER_49B_HETEROGENEOUS_CONFIG)Config)OptimizerModuleioteardown)ADAPTER_META_FILENAME)ckpt_to_weights_subdir)TransformFns)dtype_from_hf)logging)safe_importtransformer_engine) get_gpt_heterogeneous_layer_spec)HeterogeneousTransformerConfig)
ModuleSpec)AutoPeftModelForCausalLM
PeftConfigr   )LlamaForCausalLMAutoTokenizer)TokenizerSpecc                   @      e Zd ZU dZdZeed< dS )Llama31NemotronNano8BConfigz1Configuration for an Llama31-Nemotron-Nano model.   kv_channelsN__name__
__module____qualname____doc__r-   int__annotations__ r5   r5   a/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/llm/gpt/model/llama_nemotron.pyr+   ;   s   
 r+   c                   @   r*   )Llama31Nemotron70BConfigz0Configuration for an Llama31-Nemotron-70B model.r,   r-   Nr.   r5   r5   r5   r6   r7   B   s   
 r7   configr   returnc                 C   s   t | tdS )a%  Determine the most appropriate layer specification based on availability.

    Uses Transformer Engine specs if available, otherwise falls back to local implementation.

    Args:
        config: GPT configuration object

    Returns:
        ModuleSpec: The selected module specification
    )use_te)r    HAVE_TE)r8   r5   r5   r6   heterogeneous_layer_specI   s   r<   c                   @   l   e Zd ZU dZdZeed< dZeed< dZeed< dZ	e
ed	< eZe
ed
< eZeeedgef f ed< dS )Llama33NemotronSuper49BConfigz2Configuration for an Llama31-Nemotron-Super model.i    hidden_size@   num_attention_headsP   
num_layersN heterogeneous_layers_config_path(heterogeneous_layers_config_encoded_jsonr   transformer_layer_spec)r/   r0   r1   r2   r?   r3   r4   rA   rC   rD   strr   rE   r<   rF   r	   r"   r   r5   r5   r5   r6   r>   W      
 "r>   c                   @   r=   )Llama31NemotronUltra253BConfigz2Configuration for an Llama31-Nemotron-Ultra model.i @  r?   r,   rA      rC   NrD   rE   r   rF   )r/   r0   r1   r2   r?   r3   r4   rA   rC   rD   rG   r   rE   r<   rF   r	   r"   r   r5   r5   r5   r6   rI   c   rH   rI   c                       sf   e Zd ZdZ				d
deee ee f dee ded dee	e
jge
jf  f fdd	Z  ZS )LlamaNemotronModelzLlama-Nemotron model implementation based on the GPT model architecture.

    This class provides a high-level interface for Llama-Nemotron models,
    implementing the specific architecture and settings needed for Llama-Nemotron models.
    Nr8   optim	tokenizerr)   model_transformc                    s   t  j|pt |||d d S )N)rL   rM   rN   )super__init__r+   )selfr8   rL   rM   rN   	__class__r5   r6   rP   v   s   
zLlamaNemotronModel.__init__)NNNN)r/   r0   r1   r2   r   r   r   r   r   r   r
   ModulerP   __classcell__r5   r5   rR   r6   rK   o   s    rK   hfc                   @   sX   e Zd ZdZdefddZdedefddZdd	 Ze	dddZ
e	defddZdS )HFLlamaNemotronImportera  Importer for converting Hugging Face Llama-Nemotron models to NeMo format.

    This class handles the conversion of Hugging Face's LlamaForCausalLM models
    to NeMo's LlamaNemotronModel format, including weight mapping and configuration translation.
    r9   c                 C   s   t | j| jdS )zInitialize a NeMo LlamaModel instance.

        Returns:
            LlamaModel: Initialized NeMo Llama model with the appropriate configuration
                        and tokenizer.
        )rM   )rK   r8   rM   rQ   r5   r5   r6   init   s   zHFLlamaNemotronImporter.initoutput_pathc                 C   s   ddl m}m} tdt|   dt| v r"|jt| dd}n
|jt| ddd}td	 |  }| |}| 	|| | 
|| td
| d|j d t|| ~~|S )zApply the conversion from HF to NeMo format.

        Args:
            output_path: Path where the converted model will be saved

        Returns:
            Path: Path to the saved NeMo model
        r   )AutoModelForCausalLMr&   zLoad HF model Nanoautotorch_dtypeTtrust_remote_coder_   z$Initialize NeMo Nemotron-Llama modelz7Converted Llama-Nemotron model to Nemo, model saved to z in .)transformersr[   r&   r   inforG   from_pretrainedrY   
nemo_setupconvert_state	nemo_saveprintdtyper   )rQ   rZ   r[   r&   sourcetargettrainerr5   r5   r6   apply   s   	


zHFLlamaNemotronImporter.applyc                 C   s^   dddddddd}t |jd	d
r|d= tjddtjdtjddtjdg}tj||||dS )aJ  Convert state dict from HF format to NeMo format.

        Maps the weights from the HF model to the NeMo model according to
        the appropriate mapping scheme.

        Args:
            source: Source HF model
            target: Target NeMo model

        Returns:
            The result of applying the transforms
         embedding.word_embeddings.weight2decoder.layers.*.self_attention.linear_proj.weight&decoder.layers.*.mlp.linear_fc2.weight<decoder.layers.*.self_attention.linear_qkv.layer_norm_weight1decoder.layers.*.mlp.linear_fc1.layer_norm_weightdecoder.final_layernorm.weightoutput_layer.weight)model.embed_tokens.weight&model.layers.*.self_attn.o_proj.weight#model.layers.*.mlp.down_proj.weight%model.layers.*.input_layernorm.weight.model.layers.*.post_attention_layernorm.weightmodel.norm.weightlm_head.weighttie_word_embeddingsFr|   z&model.layers.*.self_attn.q_proj.weightz&model.layers.*.self_attn.k_proj.weightz&model.layers.*.self_attn.v_proj.weight1decoder.layers.*.self_attention.linear_qkv.weight
source_key
target_keyfnz#model.layers.*.mlp.gate_proj.weightz!model.layers.*.mlp.up_proj.weight&decoder.layers.*.mlp.linear_fc1.weightmapping
transforms)getattrr8   r   state_transformr   	merge_qkv	merge_fc1apply_transformsrQ   rk   rl   r   r   r5   r5   r6   rg      s,   		z%HFLlamaNemotronImporter.convert_stater(   c                 C   s"   ddl m} || t| ddS )zGet the tokenizer for the HF model.

        Returns:
            AutoTokenizer: Tokenizer instance initialized from the HF model's tokenizer
        r   r'   Tra   )=nemo.collections.common.tokenizers.huggingface.auto_tokenizerr(   save_hf_tokenizer_assetsrG   )rQ   r(   r5   r5   r6   rM      s   z!HFLlamaNemotronImporter.tokenizerc           	      C   s  ddl m}m} |jt| dd}z	|t| }W n ty%   d}Y nw dd }t|dds4J d	t|d
ddur^|jdkrCtnt	}t
|| d|jdd|j|jd jj d}n|jdkretnt}t
||jd}|d%i d|jd|jd|jd|jdt|ddd|jddd|jd|jd|jd|jddd||jdt|dd d!t|tjkd"t|tjkd#t|d$|}|S )&zCreate a NeMo LlamaNemotronConfig from the HF model config.

        Translates the HF configuration parameters to the equivalent NeMo
        configuration.

        Returns:
            LlamaConfig: NeMo configuration for Llama models
        r   )
AutoConfigGenerationConfigTr   Nc                 S   s(   d}| | dkr|d }| | dks|S )Nr,   r      r5   )
vocab_sizebaser5   r5   r6   make_vocab_size_divisible_by   s
   zDHFLlamaNemotronImporter.config.<locals>.make_vocab_size_divisible_byrope_scalingz-Llama-Nemotron model should have rope scalingblock_configsrB   factorg       @)rE   rD   scale_factornum_query_groups    )r   rC   r?   ffn_hidden_sizerA   r-   head_dimr   init_method_stdlayernorm_epsilon
seq_lengthrotary_basegated_linear_unitr   #share_embeddings_and_output_weightsr}   Ffp16bf16params_dtypegeneration_configr5   ) rc   r   r   re   rG   	Exceptionr   num_hidden_layersr>   rI   r   to_json_stringr   getrA   r   	attentionn_heads_in_groupr+   r7   num_key_value_headsr?   intermediate_sizeinitializer_rangerms_norm_epsmax_position_embeddings
rope_thetar   r   torchfloat16bfloat16)	rQ   r   r   rk   r   r   target_classclsoutputr5   r5   r6   r8      sr   
	

zHFLlamaNemotronImporter.configN)r9   r(   )r/   r0   r1   r2   rK   rY   r   rn   rg   propertyrM   r   r8   r5   r5   r5   r6   rW      s    	+
rW   r&   c                   @   s\   e Zd ZdZejddfdddZddedefd	d
Zdd Z	e
dddZe
dddZdS )HFLlamaNemotronExportera  Exporter for converting NeMo Llama-Nemotron models to Hugging Face format.

    This class handles the conversion of NeMo's LlamaNemotronModel to Hugging Face's
    LlamaForCausalLM format, including weight mapping and configuration translation.
    It supports both homogeneous (Nano/70B) and heterogeneous (Super/Ultra) model architectures.

    The exporter performs the following key operations:
    1. Initializes a Hugging Face model with appropriate configuration
    2. Maps weights from NeMo format to Hugging Face format
    3. Handles special cases for heterogeneous architectures
    4. Saves the converted model and tokenizer to the specified output path

    Attributes:
        tokenizer: The tokenizer associated with the NeMo model
        config: The configuration for the Hugging Face model

    Methods:
        init: Initialize a Hugging Face model instance
        apply: Convert and save the model to Hugging Face format
        convert_state: Convert model weights from NeMo to Hugging Face format
    FNr9   r&   c           	      C   s   ddl m}m} ddlm} | 7 |r#|j| j|dW  d   S |dus)J |j|dd}|j|d|d}t|	d	 |W  d   S 1 sKw   Y  dS )
a  Initialize a Hugging Face LlamaForCausalLM model instance.

        This method creates a new Hugging Face model instance with the appropriate configuration
        and data type. It handles both homogeneous and heterogeneous model architectures.

        Args:
            dtype (torch.dtype, optional): Data type for model parameters. Defaults to torch.bfloat16.
            from_config (bool, optional): Whether to initialize from config or load from pretrained.
                Set to True for homogeneous models (Nano/70B), False for heterogeneous models (Super/Ultra).
                Defaults to False.
            model_name (str, optional): Name of the pretrained model to load for heterogeneous architectures.
                Required when from_config is False. Defaults to None.

        Returns:
            LlamaForCausalLM: Initialized Hugging Face Llama model instance

        Raises:
            AssertionError: If model_name is not provided for heterogeneous models
        r   )r   r[   )no_init_weightsr^   NTr   r`   r[   )
rc   r   r[   transformers.modeling_utilsr   from_configr8   re   typeregister_for_auto_class)	rQ   rj   r   
model_namer   r[   r   r8   hf_modelr5   r5   r6   rY   C  s    $zHFLlamaNemotronExporter.initrZ   c                 C   s   t d | t| \}}t|jt}|du r0|r0|jj}|dkr%d}n|dkr,d}ntd| j	t
|j| |d}| ||}| }|| | jj| |S )	a  Convert and save a NeMo Llama-Nemotron model to Hugging Face format.

        This method performs the complete conversion process:
        1. Loads the NeMo model checkpoint
        2. Determines the appropriate target model configuration
        3. Initializes the Hugging Face model
        4. Converts and transfers the weights
        5. Saves the converted model and tokenizer

        Args:
            output_path (Path): Directory path where the converted model will be saved
            target_model_name (str, optional): Name of the target Hugging Face model.
                Required for heterogeneous models (Super/Ultra). For homogeneous models,
                this is determined automatically. Defaults to None.

        Returns:
            Path: Path to the saved Hugging Face model directory

        Raises:
            ValueError: If the target model is not supported or if target_model_name is missing
                      for heterogeneous models
        z(Loading Llama-Nemotron NeMo checkpoint..NrB   &nvidia/Llama-3_3-Nemotron-Super-49B-v1rJ   'nvidia/Llama-3_1-Nemotron-Ultra-253B-v1^Unknown target model. Currently only support exporting Llama-Nemotron Nano/Super/Ultra models.r   r   )r   rd   	nemo_loadrG   
isinstancer8   r!   rC   
ValueErrorrY   r   rg   cpusave_pretrainedrM   )rQ   rZ   target_model_namerk   _is_heterogeneousrC   rl   r5   r5   r6   rn   n  s.   

zHFLlamaNemotronExporter.applyc                 C   sf   dddddd}t jddtjd	t jd
dtjd	t jddtjd	t jddtjd	g}t j||||dS )aZ  Convert state dict from NeMo format to HF format.

        Maps the weights from the NeMo model to the HF model according to
        the appropriate mapping scheme.

        Args:
            source: Source NeMo model
            target: Target HF model

        Returns:
            The target model with weights transferred from source
        rw   rx   ry   rz   r{   )rp   rq   rr   rs   rt   r   r~   r   r   r   ro   rv   ru   r|   r   )r   r   r   	split_qkv	split_fc1prune_paddingr   r   r5   r5   r6   rg     sB   		z%HFLlamaNemotronExporter.convert_stater)   c                 C   s   t jt| ddjS )zzGet the tokenizer from the NeMo model.

        Returns:
            TokenizerSpec: Tokenizer from the NeMo model
        modelsubpath)r   load_contextrG   rM   rX   r5   r5   r6   rM     s   z!HFLlamaNemotronExporter.tokenizerHFLlamaConfigc                 C   s   t jt| dd}t|trJ ddlm} d}t|tr)|j|j	|j
|jdd}||j|j|j|j|j|j|j|j|j|j| jj|j|| jj| jjdS )	aJ  Create a HF LlamaConfig from the NeMo model config.
        This function should only be invoked for Non-heterogeneous transformers (i.e. Nano).

        Translates the NeMo configuration parameters to the equivalent HF
        configuration.

        Returns:
            HFLlamaConfig: HF configuration for Llama models
        zmodel.configr   r   r%   Nllama3)r   low_freq_factorhigh_freq_factor original_max_position_embeddings	rope_type)r   r?   r   rA   r   r   r   r   r   r   r   r}   r   bos_token_ideos_token_id)r   r   rG   r   r!   rc   r   r   r   r   r   old_context_lenrC   r?   r   rA   r-   r   r   r   r   r   rM   r   r   bos_ideos_id)rQ   rk   r   r   r5   r5   r6   r8     s8   
zHFLlamaNemotronExporter.config)r9   r&   N)r9   r)   )r9   r   )r/   r0   r1   r2   r   r   rY   r   rn   rg   r   rM   r8   r5   r5   r5   r6   r   +  s    +57r   zhf-peftc                       sV   e Zd ZdZejddfd fddZddedefd	d
Zdd Z	e
dddZ  ZS )HFLlamaNemotronPEFTExporterzExporter for converting NeMotron Llama models with PEFT adapters to Hugging Face format.

    This class extends HFLlamaNemotronExporter to handle Parameter-Efficient Fine-Tuning (PEFT)
    adapters, specifically LoRA and DoRA adapters.
    FNr9   r#   c           	         s   ddl m} t j|||d}tt| ddt }t|d}t	|d }W d   n1 s0w   Y  d	
|d	d
d |_||| jddS )zInitialize a HF PEFT model.

        Args:
            dtype: Data type for model parameters

        Returns:
            AutoPeftModelForCausalLM: Initialized HF PEFT model
        r   )get_peft_model)rj   r   r   F)	is_savingrmodel_ckpt_pathN/)autocast_adapter_dtype)peftr   rO   rY   r   rG   r   openjsonloadjoinsplitname_or_pathpeft_config)	rQ   rj   r   r   r   r   adapter_meta_pathfr   rR   r5   r6   rY     s   	z HFLlamaNemotronPEFTExporter.initrZ   c                 C   s   ddl m}m}m} tjt| dd| _| t| \}}t	|j
t}|du r?|r?|j
j}	|	dkr4d}n|	dkr;d	}ntd
| jt|j
| |d}
| ||
}
|
 }
|
j|dd |S )zApply the conversion from NeMo PEFT model to HF format.

        Args:
            output_path: Path where the converted model will be saved

        Returns:
            Path: Path to the saved HF PEFT model
        r   )CanonicalLoRADoRALoRAzmodel.model_transformr   NrB   r   rJ   r   r   r   F)save_embedding_layers)nemo.collections.llm.peftr   r   r   r   r   rG   peft_objr   r   r8   r!   rC   r   rY   r   rg   r   r   )rQ   rZ   r   r   r   r   rk   r   r   rC   rl   r5   r5   r6   rn   .  s.   	z!HFLlamaNemotronPEFTExporter.applyc                 C   s<  ddl m} d}d}d}d}d}d}	| d	| d
| d| d	| d| d| d	| d
| d| d	| d| di}
g }t| j|r|
| d	| d| d| d	| d| d| d	| d| d| d	| d| d| d	| d| d| d	| d| d| d	|	 d| d| d	|	 d| d| d	|	 d | d!| d	|	 d"| d#i
 nY|tj| d$| d| d| dftj	d%tj| d&| d| d| dftj
d%tj| d'| d!| dftjd%tj| d(| d#| dftjd%g tj|||
|d)S )*a  Convert state dict from NeMo PEFT model to HF PEFT format.

        Maps the weights from the NeMo model to the HF model according to
        the appropriate mapping scheme for PEFT adapters.

        Args:
            source: Source NeMo model with PEFT adapters
            target: Target HF model

        Returns:
            The target model with weights transferred from source
        r   )r   zdecoder.layers.zbase_model.model.model.layers.z"self_attention.linear_proj.adapterzmlp.linear_fc2.adapterz!self_attention.linear_qkv.adapterzmlp.linear_fc1.adapterz*.z.linear_in.weightz(*.self_attn.o_proj.lora_A.default.weightz.linear_out.weightz(*.self_attn.o_proj.lora_B.default.weightz%*.mlp.down_proj.lora_A.default.weightz%*.mlp.down_proj.lora_B.default.weightz.adapter_q.linear_in.weightz(*.self_attn.q_proj.lora_A.default.weightz.adapter_q.linear_out.weightz(*.self_attn.q_proj.lora_B.default.weightz.adapter_k.linear_in.weightz(*.self_attn.k_proj.lora_A.default.weightz.adapter_k.linear_out.weightz(*.self_attn.k_proj.lora_B.default.weightz.adapter_v.linear_in.weightz(*.self_attn.v_proj.lora_A.default.weightz.adapter_v.linear_out.weightz(*.self_attn.v_proj.lora_B.default.weightz.adapter_up.linear_in.weightz#*.mlp.up_proj.lora_A.default.weightz.adapter_up.linear_out.weightz#*.mlp.up_proj.lora_B.default.weightz.adapter_gate.linear_in.weightz%*.mlp.gate_proj.lora_A.default.weightz.adapter_gate.linear_out.weightz%*.mlp.gate_proj.lora_B.default.weightz4*.self_attention.linear_qkv.adapter.linear_in.weightr   z5*.self_attention.linear_qkv.adapter.linear_out.weightz)*.mlp.linear_fc1.adapter.linear_in.weightz**.mlp.linear_fc1.adapter.linear_out.weightr   )r   r   r   r   updateextendr   r   r   
duplicate3r   
duplicate2r   r   )rQ   rk   rl   r   pnphp_projp_fc2p_qkvp_fc1r   r   r5   r5   r6   rg   V  s   	
)z)HFLlamaNemotronPEFTExporter.convert_stater$   c              
   C   s   ddl m} ddlm} | jjr| jjdksJ ddgdgdgg d	d
gdgdgddgdgd	}g }| jjD ]	}|||  q7|| jj	|| jj
| jjt| j|dS )zCreate a PEFT config for the HF model.

        Translates the NeMo PEFT configuration to the equivalent HF PEFT
        configuration.

        Returns:
            PeftConfig: HF PEFT configuration
        r   )
LoraConfig)r   prez5LoRA dropout_position must be 'pre' to convert to HF.q_projk_projv_proj)r  r	  r
  o_projup_proj	gate_proj	down_proj)	linear_qlinear_klinear_v
linear_qkvlinear_projlinear_fc1_uplinear_fc1_gate
linear_fc1
linear_fc2)r   target_modules
lora_alphalora_dropoutuse_dora)r   r  r   r   r   dropoutdropout_positionr  r   dimalphar   )rQ   r  r   NEMO2HFhf_target_modulestmr5   r5   r6   r     s6   

z'HFLlamaNemotronPEFTExporter.peft_config)r9   r#   r   )r9   r$   )r/   r0   r1   r2   r   r   rY   r   rn   rg   r   r   rU   r5   r5   rR   r6   r     s    (gr   )rK   r+   r>   rI   r7   )Mr   dataclassesr   	functoolsr   pathlibr   typingr   r   r   r   r	   r   r
   #nemo.collections.llm.gpt.model.baser   r   r   $nemo.collections.llm.gpt.model.llamar   r   r   r   r   4nemo.collections.llm.gpt.model.llama_nemotron_configr   r   nemo.collections.llm.utilsr   nemo.lightningr   r   r   nemo.lightning.ckpt_utilsr   nemo.lightning.io.plr   nemo.lightning.io.stater   nemo.lightning.pytorch.utilsr   
nemo.utilsr   nemo.utils.import_utilsr   r   r;   @megatron.core.models.gpt.heterogeneous.heterogeneous_layer_specsr    <megatron.core.transformer.heterogeneous.heterogeneous_configr!   $megatron.core.transformer.spec_utilsr"   r   r#   r$   rc   r   r&   r   r(   1nemo.collections.common.tokenizers.tokenizer_specr)   r+   r7   r<   r>   rI   rK   model_importerModelConnectorrW   model_exporterr   r   __all__r5   r5   r5   r6   <module>   s^   
 
) 
f X