o
    }oiמ                  	   @   s  d dl Z d dlmZmZ d dlmZ d dlmZmZm	Z	m
Z
mZ d dlZd dlm  mZ d dlmZ d dlmZmZmZ d dlmZ d dlmZmZmZ d d	lmZ d d
lmZmZm Z m!Z! d dl"m#Z# d dl$m%Z% z(d dl&m'Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z- d dl.m/Z0 d dl1m2Z3 dZ4W n e5e6fy   e%7d dZ4Y nw d dl8m9Z9 d dl:m;Z; d dl<m=Z= dej>fddZ?dd Z@eG dd de=e jAZBG d d! d!eZ/e Ce/d"G d#d$ d$e jDd!e/f ZEe Ce/d%G d&d' d'e jDd(e/f ZFe Ge/d%G d)d* d*e jDe/d(f ZHe jId+d,d-d.e jJfd/d0ZKe jId,d+d-d.e jJfd1d2ZLe jId3d4d-d.e jJfd5d6ZMe jId7d8d-d.e jJfd9d:ZNeG d;d< d<eBZOeG d=d> d>eBZPeG d?d@ d@eBZQeG dAdB dBeBZReG dCdD dDeBZSeG dEdF dFeBZTeG dGdH dHeBZUeG dIdJ dJeBZVeG dKdL dLeVZWeG dMdN dNeVZXeG dOdP dPeVZYeG dQdR dReVZZg dSZ[dS )T    N)	dataclassfield)Path)	AnnotatedCallableLiteralOptionalUnion)nn)
AutoConfigAutoModelForCausalLMAutoTokenizer)TokenizerSpec)GPTModelgpt_data_steptorch_dtype_from_mcore_config)Config)OptimizerModuleget_vocab_sizeioteardown)_ModelState)logging)parallel_state)load_plain_tensors)GPTInferenceWrapper)InferenceWrapperConfig)
MambaModel)mamba_stack_specTzZThe package `megatron.core` was not imported in this environment which is needed for SSMs.F)AttnBackend)
ModuleSpec)TransformerConfigreturnc                 C   s.   |d |d |d d}d|d< | di |S )aC  
    Performs a forward step for the SSM model.

    Args:
        model (torch.nn.Module): The model to perform the forward step on.
        batch (dict): A dictionary containing input tensors such as `tokens`, `position_ids`, and `labels`.

    Returns:
        torch.Tensor: The output tensor from the forward step.
    tokensposition_idslabels)	input_idsr$   r%   Nattention_mask r(   )modelbatchforward_argsr(   r(   V/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/llm/gpt/model/ssm.pyssm_forward_step3   s   r-   c           	   
   C   s  dt jd< dt jd< d}d}tjjd||d t| }t| }|D ]}d	|v r.|| q#|d
 }|d
 |d |d z|d W n	 t	yR   Y nw z|d W n	 t	yc   Y nw z|d W n	 t	yt   Y nw t
|jD ]\}}|dkrZtj|d| d |d| d |d| d |d| d |d| d gdd|d| d< |d| d |d| d |d| d |d| d |d| d tj|d| d |d| d |d| d gdd|d| d< |d| d |d| d |d| d tj|d| d |d| d |d| d gdd|d| d < |d| d |d| d |d| d qztj  ||fS )!a  
    Handles distributed checkpoint loading and processing.

    Args:
        checkpoint_dir (str): The directory containing the checkpoint files.

    Returns:
        tuple: A tuple containing the processed state dictionary and distributed checkpoint arguments.
    	localhostMASTER_ADDR12355MASTER_PORT   r   gloo)backendrank
world_size	optimizerargscheckpoint_version	iterationopt_param_scheduler$num_floating_point_operations_so_farrerun_state_machineMdecoder.layers.z.mixer.in_proj.weight.zz.mixer.in_proj.weight.xz.mixer.in_proj.weight.Bz.mixer.in_proj.weight.Cz.mixer.in_proj.weight.dt)dimz.mixer.in_proj.weightz.mixer.conv1d.weight.xz.mixer.conv1d.weight.Bz.mixer.conv1d.weight.Cz.mixer.conv1d.weightz.mixer.conv1d.bias.xz.mixer.conv1d.bias.Bz.mixer.conv1d.bias.Cz.mixer.conv1d.bias)osenvirontorchdistributedinit_process_groupr   listkeyspop	Exception	enumeratehybrid_override_patterncatdestroy_process_group)	checkpoint_dirr6   r5   
state_dictkey_listkdist_ckpt_argsisymbolr(   r(   r,   dist_ckpt_handlerG   s   








rU   c                   @   s  e Zd ZU dZdZeed< dZeed< dZeed< e	j
Ze	jed< dZeed< dZeed	< d
Zeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZed ed< dZeed< dZeed< dZe e ed < dZ!eed!< d"Z"eed#< dZ#eed$< d%Z$eed&< dZ%eed'< dZ&eed(< dZ'eed)< d*Z(eed+< dZ)eed,< e*j+Z,e*ed-< e-Z.e/ed.< e0Z1e/ed/< dZ2eed0< dZ3eed1< dZ4eed2< dZ5eed3< dZ6eed4< e7d5d6 d7Z8e9e:e/g e:f f ed8< 	d>d9e e d:d;fd<d=Z;dS )?	SSMConfigaF  
    Configuration class for the SSM model.

    Inherits from TransformerConfig and io.IOMixin to provide additional configuration options
    specific to the SSM model.

    Attributes:
        fp16_lm_cross_entropy (bool): Whether to use FP16 for cross-entropy loss.
        parallel_output (bool): Whether to enable parallel output.
        share_embeddings_and_output_weights (bool): Whether to share embeddings and output weights.
        params_dtype (torch.dtype): The data type for model parameters.
        ... (other attributes are described in the class definition)
    Ffp16_lm_cross_entropyTparallel_output#share_embeddings_and_output_weightsparams_dtypefp16bf16   
num_layers   mamba_num_groupsr2   num_attention_headsg        hybrid_attention_ratiohybrid_mlp_ratioNrK   post_processpre_process    
seq_lengthnone)learned_absoluteroperh   position_embedding_typeg      ?rotary_percenti'  rotary_baseseq_len_interpolation_factorapply_rope_fusion   make_vocab_size_divisible_bygated_linear_unitRMSNormnormalizationadd_bias_linearhidden_dropoutattention_dropoutgh㈵>layernorm_epsilonget_attention_mask_from_fusionattention_backendforward_step_fndata_step_fn
vocab_filetokenizer_model_pathdeallocate_pipeline_outputsbias_dropout_fusioncross_entropy_loss_fusionc                   C   s   t S N)default_mamba_stack_specr(   r(   r(   r,   <lambda>   s    zSSMConfig.<lambda>)default_factoryr   vp_stager"   MCoreMambaModelc                 C   s   | j }t|ts| }t| dddu r|du sJ dt| |t| |j| j| j| j	| j
| j| j| j| j| j|p:t |p?t dS )zA
        Configures the model for training or inference.
        $virtual_pipeline_model_parallel_sizeNzVirtual pipeline model parallelism is temporarily unsupported in SSM/Mamaba models due to upstream MCore MambaModel API dependency)r   
vocab_sizemax_sequence_lengthrb   rc   rK   rk   rl   rm   rn   re   rd   )r   
isinstancer    getattrr   r   r   rq   rg   rb   rc   rK   rk   rl   rm   rn   r   is_pipeline_first_stageis_pipeline_last_stage)self	tokenizerre   rd   r   r   r(   r(   r,   configure_model   s*   


zSSMConfig.configure_model)NNN)<__name__
__module____qualname____doc__rW   bool__annotations__rX   rY   rC   bfloat16rZ   dtyper[   r\   r^   intr`   ra   rb   floatrc   rK   strrd   re   rg   rk   r   rl   rm   rn   r   ro   rq   rr   rt   ru   rv   rw   rx   ry   r   flashrz   r-   r{   r   r   r|   r}   r~   r   r   r   r   r   r	   r    r   r(   r(   r(   r,   rV      s^   
 rV   c                       sz   e Zd ZdZ				ddeee ee f dee ded dee	e
jge
jf  f fdd	Z	
ddejfddZ  ZS )r   ar  
    A subclass of GPTModel that implements the Mamba architecture.

    Attributes:
        config (SSMConfig): The configuration for the Mamba model.
        optim (OptimizerModule): The optimizer module for training.
        tokenizer (TokenizerSpec): The tokenizer used for text processing.
        model_transform (Callable): A function to transform the model.
    Nconfigoptimr   r   model_transformc                    s   t  j|pt |||d dS )zu
        Initializes the MambaModel with the given configuration, optimizer, tokenizer, and model transform.
        )r   r   r   N)super__init__rV   )r   r   r   r   r   	__class__r(   r,   r     s   
zMambaModel.__init__rf   r"   c                 C   s   | j }|rt|tu rnt|dd}|s|du st|tur"tdd}| jdur.| jj}nt| jdr9| jj}ntdt	|jj
||||d}t||}|S )z>
        Returns the inference wrapper for the model.
        moduleNz@Exact MCoreMambaModel instance not found in the model structure.r   zlUnable to find vocab size. Either pass in a tokenizer with vocab size, or set vocab size in the model config)hidden_sizerZ   &inference_batch_times_seqlen_thresholdpadded_vocab_sizeinference_max_seq_length)r   typer   r   
ValueErrorr   r   hasattrr   r   r   r   )r   rZ   r   r   mcore_modelr   inference_wrapper_configmodel_inference_wrapperr(   r(   r,   get_inference_wrapper  s2   



z MambaModel.get_inference_wrapper)NNNN)rf   )r   r   r   r   r   r   rV   r   r   r   r
   Moduler   rC   Tensorr   __classcell__r(   r(   r   r,   r      s&    r   pytorchc                       st   e Zd ZdZddef fddZdefddZdd
ede	defddZ
dd Zedd ZedefddZ  ZS )PyTorchSSMImporterz
    A model importer for loading PyTorch-based SSM models.

    Attributes:
        path (str): The path to the model checkpoint.
        model_config (SSMConfig): The configuration for the model.
    Npathc                    s   t  | |}||_|S )zC
        Creates a new instance of the SSM model importer.
        )r   __new__model_config)clsr   r   instancer   r(   r,   r   >  s   zPyTorchSSMImporter.__new__r"   c                 C      t | j| jdS 3
        Initializes the model for export.
        )r   r   r   r   r   r(   r(   r,   initF     zPyTorchSSMImporter.initFoutput_pathsource_dist_ckptc                 C   s   |rt t| \}}n	tjt| dd}d|v r|d }t|}|  }| |}|| jj	 || jj	 | 
|| dt_dt_dt_dt_| || td|  t|| ~~|S )aP  
        Converts the SSM model to Nemo format and saves it to the specified path.
        Args:
            output_path (Path): The path to save the exported model.
            source_dist_ckpt (bool): Whether to load from a distributed checkpoint.
        Returns:
            output_path (Path): The path to the saved model.
        cpu)map_locationr)   r   z,Converted SSM model to Nemo, model saved to )rU   r   rC   loadr   r   
nemo_setuptor   rZ   convert_stater   _DATA_PARALLEL_GROUP_DATA_PARALLEL_GROUP_WITH_CP_MPU_DATA_PARALLEL_WORLD_SIZE_MPU_DATA_PARALLEL_RANK	nemo_saver   infor   )r   r   r   sourcerR   targettrainerr(   r(   r,   applyL  s(   	

zPyTorchSSMImporter.applyc                 C   s   | j jdkrdddddddd	d
dddd}nFd| j jv rSdddddddd	d
ddd}d| j jv r;|dd dD  n|ddi d| j jv rR|dd dD  n	td| j dtj|||dS )  
        Converts the state of the source model to match the target model.
        Args:
            source (torch.nn.Module): The source model.
            target (torch.nn.Module): The target model.
        Returns:
            torch.nn.Module: The converted target model.
        base embedding.word_embeddings.weightdecoder.layers.*.mixer.A_logdecoder.layers.*.mixer.D$decoder.layers.*.mixer.conv1d.weight"decoder.layers.*.mixer.conv1d.bias%decoder.layers.*.mixer.in_proj.weightdecoder.layers.*.mixer.dt_bias&decoder.layers.*.mixer.out_proj.weight"decoder.layers.*.mixer.norm.weight0decoder.layers.*.mixer.in_proj.layer_norm_weightdecoder.final_norm.weightoutput_layer.weight)zbackbone.embedding.weightbackbone.layers.*.mixer.A_logbackbone.layers.*.mixer.D%backbone.layers.*.mixer.conv1d.weight#backbone.layers.*.mixer.conv1d.bias&backbone.layers.*.mixer.in_proj.weightbackbone.layers.*.mixer.dt_bias'backbone.layers.*.mixer.out_proj.weight#backbone.layers.*.mixer.norm.weightzbackbone.layers.*.norm.weightbackbone.norm_f.weightlm_head.weightnvidia)r   r   r   r   r   r   r   r   r   r   r   	nemotronhc                 S      i | ]}||qS r(   r(   .0keyr(   r(   r,   
<dictcomp>      z4PyTorchSSMImporter.convert_state.<locals>.<dictcomp>)r   zdecoder.layers.*.norm.weighthybridc                 S   r   r(   r(   r   r(   r(   r,   r     s    )z1decoder.layers.*.mlp.linear_fc1.layer_norm_weight&decoder.layers.*.mlp.linear_fc1.weight&decoder.layers.*.mlp.linear_fc2.weight2decoder.layers.*.self_attention.linear_proj.weightz<decoder.layers.*.self_attention.linear_qkv.layer_norm_weight1decoder.layers.*.self_attention.linear_qkv.weightzmapping type [z] not found.)mapping)r   mapping_typeupdateAttributeErrorr   apply_transforms)r   r   r   r   r(   r(   r,   r   q  sT   	z PyTorchSSMImporter.convert_statec                 C   s2   ddl m} || jj| jj| jj| jjdd}|S )z
        Loads the tokenizer from the specified path.
        Returns:
            TokenizerSpec: The tokenizer object.
        r   )get_nmt_tokenizerT)library
model_namer}   tokenizer_modeluse_fast)3nemo.collections.nlp.modules.common.tokenizer_utilsr   r   tokenizer_librarytokenizer_namer}   r~   )r   r   r   r(   r(   r,   r     s   zPyTorchSSMImporter.tokenizerc                 C   s   | j S )
        Loads the model configuration from the specified path.
        Returns:
            SSMConfig: The model configuration object.
        )r   r   r(   r(   r,   r     s   zPyTorchSSMImporter.configr   )F)r   r   r   r   r   r   r   r   r   r   r   r   propertyr   rV   r   r   r(   r(   r   r,   r   4  s    %A
r   hfc                   @   sX   e Zd ZdZdefddZdedefddZdd	 Ze	dddZ
e	defddZdS )HFNemotronHImporterz
    A model importer for loading Hugging Face-based NemotronH models.

    Attributes:
        path (str): The path to the Hugging Face model checkpoint.
        model_config (SSMConfig): The configuration for the model.
    r"   c                 C   r   r   r   r   r(   r(   r,   r     r   zHFNemotronHImporter.initr   c                 C   sx   t jt| dd}|  }| |}|| jj}|| jj}| || | 	|| t
d|  t|| ~~|S )a  
        Converts the NemotronH model to Nemo format and saves it to the specified path.
        Args:
            output_path (Path): The path to save the exported model.
        Returns:
            output_path (Path): The path to the saved model.
        Ttrust_remote_codez9Converted NemotronH Hybrid model to Nemo, model saved to )r   from_pretrainedr   r   r   r   r   rZ   r   r   printr   )r   r   r   r   r   r(   r(   r,   r     s   

zHFNemotronHImporter.applyc                 C   s   ddddddddd	d
ddddd}t |jjD ]>\}}|dkr,d| d|d| d< q|dkr=d| d|d| d< q|dkrNd| d|d| d< qtd| dtj|||tgdS )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )backbone.embeddings.weightr   r   r   r   r   r   r   r   &backbone.layers.*.mixer.up_proj.weight(backbone.layers.*.mixer.down_proj.weight%backbone.layers.*.mixer.o_proj.weightr   r   r>   r?    .mixer.in_proj.layer_norm_weightbackbone.layers..norm.weight-!.mlp.linear_fc1.layer_norm_weight*,.self_attention.linear_qkv.layer_norm_weightlayer type  not found.r   
transforms)rJ   r   rK   r   r   r   _import_qkv)r   r   r   r   rS   
layer_typer(   r(   r,   r     s2   

z!HFNemotronHImporter.convert_stater   c                 C   s"   ddl m} || t| ddS )
        Loads the tokenizer from the specified path.
        Returns:
            AutoTokenizer: The tokenizer object.
        r   )r   Tr
  )=nemo.collections.common.tokenizers.huggingface.auto_tokenizerr   save_hf_tokenizer_assetsr   )r   r   r(   r(   r,   r     s   zHFNemotronHImporter.tokenizerc                 C   s   t jt| dd}tj|_dd }d|jv rt }|S d|jv r%t }|S d|jv r/t	 }|S d|jv r9t
 }|S td	|j )
r  Tr
  c                 S   s(   d}| | dkr|d }| | dks|S )Nrp   r   r]   r(   )r   r   r(   r(   r,   rq   4  s
   z@HFNemotronHImporter.config.<locals>.make_vocab_size_divisible_by4B8B47B56BUnsupported model size: )r   r  r   rC   r   torch_dtype_name_or_pathNemotronHConfig4BNemotronHConfig8BNemotronHConfig47BNemotronHConfig56Br   )r   r   rq   nemotron_h_configr(   r(   r,   r   )  s    




zHFNemotronHImporter.configN)r"   r   )r   r   r   r   r   r   r   r   r   r  r   rV   r   r(   r(   r(   r,   r	    s    '
r	  r   c                   @   sR   e Zd ZdZejfdddZdedefddZd	d
 Z	e
dd Ze
dd ZdS )HFNemotronHExporterz
    A model exporter for converting Mamba models to Hugging Face format.

    Attributes:
        path (str): The path to save the exported model.
        model_config (SSMConfig): The configuration for the model.
    r"   r   c                 C   sF   ddl m} |  tj| jddW  d   S 1 sw   Y  dS )r   r   )no_init_weightsTr
  N)transformers.modeling_utilsr/  r   from_configr   )r   r   r/  r(   r(   r,   r   R  s   $zHFNemotronHExporter.initr   c                 C   s   |  t| \}}|t|j}|  t|j}| ||}| }|| z	| j	| W |S  t
yB   td Y |S w )a  
        Converts the Mamba model to Hugging Face format and saves it to the specified path.
        Args:
            output_path (Path): The path to save the exported model.
        Returns:
            output_path (Path): The path to the saved model.
        zFailed to save tokenizer)	nemo_loadr   r   r   r   r   r   r   save_pretrainedr   rI   r   warning)r   r   r   _r   r(   r(   r,   r   [  s   
zHFNemotronHExporter.applyc                 C   s   ddddddddd	d
ddd}t |jjD ]>\}}|dkr*d| d|d| d< q|dkr;d| d|d| d< q|dkrLd| d|d| d< qtd| dtttg}tj||||dS )r   r   r   r   r   r   r   r   r   r  r  r  r   )r   r   r   r   r   r   r   r   r   r   r   r   r>   r  r  r?   r  r  r  r  r  r  r  r  )	rJ   r   rK   r   _export_qkv_export_embedding_export_headr   r   )r   r   r   r   rS   r  r  r(   r(   r,   r   q  s:   


z!HFNemotronHExporter.convert_statec                 C   s   t jdddS )r  nvidia/Nemotron-H-8B-Base-8KTr
  )r   r  r   r(   r(   r,   r     s   zHFNemotronHExporter.tokenizerc                 C   s   t jt| dd}t|tkrtjddd}|S t|tkr'tjddd}|S t|tkr6tjddd}|S t|t	krEtjddd}|S t
d	| )
r  zmodel.config)subpathznvidia/Nemotron-H-4B-Base-8KTr
  r9  znvidia/Nemotron-H-47B-Base-8Kznvidia/Nemotron-H-56B-Base-8Kr&  )r   load_contextr   r   r)  r   r  r*  r+  r,  r   )r   r   	hf_configr(   r(   r,   r     s   
zHFNemotronHExporter.configN)r"   r   )r   r   r   r   rC   r   r   r   r   r   r  r   r   r(   r(   r(   r,   r.  H  s    	-
	r.  )z%backbone.layers.*.mixer.q_proj.weightz%backbone.layers.*.mixer.k_proj.weightz%backbone.layers.*.mixer.v_proj.weightr   )
source_key
target_keyctxc                 C   s  | j j}|j}|j}|| }|j}|j}	| }
||	f|
dd  }||	f|
dd  }|j| }|j| }|j| }g }t|D ]<}|	||| |d | ddddf  |	|||d ddddf  |	|||d ddddf  qAt
|}|jdksJ |j|jd |d | ksJ |j|jd |	ksJ |j|jd |
d ksJ |j||	|d|   |g}|S )a  
    Transforms Q, K, and V projection weights from the source model to the target model.

    Args:
        ctx (io.TransformCTX): The transformation context.
        q (torch.Tensor): The Q projection weights.
        k (torch.Tensor): The K projection weights.
        v (torch.Tensor): The V projection weights.

    Returns:
        torch.Tensor: The transformed QKV weights.
    r2   N   r   r]   )r   r   ra   num_query_groupsr   kv_channelssizeviewrangeappendrC   rL   ndimshapereshape)r?  qrQ   vmegatron_confighead_numrA  heads_per_groupr   	head_sizeold_tensor_shapenew_q_tensor_shapenew_kv_tensor_shapeqkv_weights_lrS   qkv_weightsr(   r(   r,   r    s0   


,$&
 r  c                    s   | j j}|j}|j}||  |j}|j}|d|  }||||g}t fddt	|D }t
 | d }	t
 d | d }
|| d| }||	 d| }||
 d| }|||fS )a<  
    Transforms QKV weights from the target model back to the source model format.

    Args:
        ctx (io.TransformCTX): The transformation context.
        linear_qkv (torch.Tensor): The QKV weights from the target model.

    Returns:
        tuple: A tuple containing the transformed Q, K, and V weights.
    r]   c                    s,   g | ]}t  d  |  d  |   qS )r]   )rC   arange)r   rS   rN  r(   r,   
<listcomp>  s    z_export_qkv.<locals>.<listcomp>r2   )r   r   ra   rA  r   rB  rI  rC   rL   rE  rU  r   )r?  
linear_qkvrL  rM  rA  r   rO  qkv_total_dimq_slicek_slicev_sliceq_projk_projv_projr(   rV  r,   r6    s&   

r6  r   r  c                 C      | j j}|d|jddf S )a8  
    Transforms the embedding weights from the target model to the source model format.

    Args:
        ctx (io.TransformCTX): The transformation context.
        embedding (torch.Tensor): The embedding weights from the target model.

    Returns:
        torch.Tensor: The transformed embedding weights.
    Nr   r   r   r?  	embeddingrL  r(   r(   r,   r7  "     r7  r   r   c                 C   ra  )aA  
    Transforms the output layer weights from the target model to the source model format.

    Args:
        ctx (io.TransformCTX): The transformation context.
        embedding (torch.Tensor): The output layer weights from the target model.

    Returns:
        torch.Tensor: The transformed output layer weights.
    Nrb  rc  r(   r(   r,   r8  6  re  r8  c                   @      e Zd ZU d ZdZeed< dZeed< dZ	eed< dZ
eed< d	Zeed
< dZeed< dZeed< dZeed< dZeed< dZeed< dS )BaseMambaConfig130MMMMMMMMMMMMMMMMMMMMMMMMMrK      r^      rg   i   r   r2   r`   ffn_hidden_size   rq   huggingfacer  EleutherAI/gpt-neox-20br  r   r   Nr   r   r   r   rK   r   r   r^   r   rg   r   r`   rk  rq   r  r  r   r(   r(   r(   r,   rg  J     
 rg  c                   @   rf  )BaseMambaConfig370M0MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMrK   0   r^   rj  rg   i   r   r2   r`   rk  rl  rq   rm  r  rn  r  r   r   Nro  r(   r(   r(   r,   rq  Z  rp  rq  c                   @   rf  )BaseMambaConfig780Mrr  rK   rs  r^   rj  rg   i   r   r2   r`   rk  rl  rq   rm  r  rn  r  r   r   Nro  r(   r(   r(   r,   rt  j  rp  rt  c                   @   s   e Zd ZU d ZdZeed< dZeed< dZ	eed< dZ
eed< dZeed	< dZeed
< dZeed< dZeed< dZeed< dZeed< dS )BaseMambaConfig1_3Brr  rK   rs  r^   rj  rg   r   r2   r`   rk  rl  rq   rm  r  rn  r  r   r   Nro  r(   r(   r(   r,   ru  z  rp  ru  c                   @   rf  )BaseMambaConfig2_7B@MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMrK   @   r^   rj  rg   i 
  r   r2   r`   rk  rl  rq   rm  r  rn  r  r   r   Nro  r(   r(   r(   r,   rv    rp  rv  c                   @   s   e Zd ZU d ZdZeed< dZeed< dZ	eed< dZ
eed< dZeed	< d
Zeed< dZeed< dZeed< dZeed< dZeed< dZeed< dS )NVIDIAMambaConfig8B8MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMrK       ra   8   r^      rg   r   r_   r`   rk  rp   rq   megatronr  GPTSentencePieceTokenizerr  znvidia-purer   N)r   r   r   r   rK   r   r   ra   r   r^   rg   r   r`   rk  rq   r  r  r   r(   r(   r(   r,   ry    s   
 ry  c                   @   s   e Zd ZU d ZdZeed< dZeed< dZ	eed< dZ
eed< dZeed	< d
Zeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dS )NVIDIAMambaHybridConfig8Bz8M-M-M--M-M*-M-M-M-M--M*-M-M-M-M-M*--M-M-M-M-M*-M--M-M-M-rK   r|  r^   r}  rg   r   r_   r`   i @  rk  r{  ra   rA  rp   rq   r~  r  r  r  znvidia-hybridr   N)r   r   r   r   rK   r   r   r^   r   rg   r   r`   rk  ra   rA  rq   r  r  r   r(   r(   r(   r,   r    s   
 r  c                   @   s   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed< d	Z
eed
< dd Zeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dS )NemotronHConfigBasez-Base configuration class for NemotronH modelsrf   rg   r_   r`   rx  mamba_head_dimrA  rp   rq   c                 C   s   t t| dS )Nr]   )rC   powFrelu)xr(   r(   r,   r     r   zNemotronHConfigBase.<lambda>activation_functiktokenr  TiktokenTokenizerr  znvidia-hybrid-nemotronhr   Tmasked_softmax_fusionFapply_query_key_layer_scalingpersist_layer_normattention_softmax_in_fp32i   r   first_last_layers_bf16is_hybrid_modelN)r   r   r   r   rg   r   r   r`   r  rA  rq   r  callabler  r   r  r   r  r   r  r  r  r   r  r  r(   r(   r(   r,   r    s$   
 r  c                   @   s~   e Zd ZU d ZdZeed< dZeed< dZ	eed< dZ
eed< d	Zeed
< d	Zeed< dZeed< dZeed< dZeed< dS )r)  4M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-rK   4   r^   i   r   p   mamba_num_headsrp   rB  mamba_state_dimi 0  rk  r{  ra   Fuse_mamba_mem_eff_pathN)r   r   r   r   rK   r   r   r^   r   r   r  rB  r  rk  ra   r  r   r(   r(   r(   r,   r)    s   
 r)  c                   @   Z   e Zd ZU d ZdZeed< dZeed< dZ	eed< dZ
eed< d	Zeed
< dZeed< dS )r*  r  rK   r  r^   r}  r   rp   r  i T  rk  r{  ra   Nr   r   r   r   rK   r   r   r^   r   r   r  rk  ra   r(   r(   r(   r,   r*    s   
 r*  c                   @   r  )r+  zbM-M-M-M-M-M-M-M-M*-M-M-M-M-M-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-M-M---MM---M-M*-M-M-M-M-M-rK   b   r^   rf   r      r  i x  rk  rx  ra   Nr  r(   r(   r(   r,   r+    s   
 
r+  c                   @   r  )r,  zvM-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-rK   v   r^   rf   r   r  r  i   rk  rx  ra   Nr  r(   r(   r(   r,   r,    s   
 
r,  )rV   rg  rq  rt  ru  rv  ry  r  r  r)  r*  r+  r,  )\rA   dataclassesr   r   pathlibr   typingr   r   r   r   r	   rC   torch.nn.functionalr
   
functionalr  transformersr   r   r   1nemo.collections.common.tokenizers.tokenizer_specr   #nemo.collections.llm.gpt.model.baser   r   r   nemo.collections.llm.utilsr   nemo.lightningr   r   r   r   nemo.lightning.io.stater   
nemo.utilsr   megatron.corer   .megatron.core.dist_checkpointing.serializationr   Jmegatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapperr   Imegatron.core.inference.model_inference_wrappers.inference_wrapper_configr   megatron.core.models.mambar   r   ,megatron.core.models.mamba.mamba_layer_specsr   r   HAVE_MEGATRON_CORE_OR_TEImportErrorModuleNotFoundErrorr4  megatron.core.transformer.enumsr   $megatron.core.transformer.spec_utilsr    ,megatron.core.transformer.transformer_configr!   r   r-   rU   IOMixinrV   model_importerModelConnectorr   r	  model_exporterr.  state_transformTransformCTXr  r6  r7  r8  rg  rq  rt  ru  rv  ry  r  r  r)  r*  r+  r,  __all__r(   r(   r(   r,   <module>   s   
VX
> 

xw-%