import json
import re
from dataclasses import asdict, dataclass, field
from functools import cached_property, partial
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union

import torch
import torch.nn.functional as F
import yaml
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_decoder_block_spec
from megatron.core.transformer.identity_op import IdentityOp
from megatron.core.transformer.transformer_config import MLATransformerConfig
from safetensors.torch import load_file
from torch import nn
from transformers import AutoConfig

from nemo.collections.llm.gpt.model.base import (
    HAVE_TE,
    GPTConfig,
    GPTModel,
    gpt_data_step,
    torch_dtype_from_dict_config,
)
from nemo.export.trt_llm.nemo_ckpt_loader.nemo_file import load_distributed_model_weights
from nemo.lightning import io, teardown
from nemo.lightning.io.state import TransformFns, _ModelState
from nemo.lightning.pytorch.optim import OptimizerModule
from nemo.lightning.pytorch.utils import dtype_from_hf
from nemo.utils import logging
if TYPE_CHECKING:
    from megatron.core.transformer import ModuleSpec
    from transformers import AutoModelForCausalLM
    from transformers import DeepseekV3Config as HFDeepseekV3Config

    from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
    from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec

if HAVE_TE:
    from megatron.core.utils import is_te_min_version


@dataclass
class DeepSeekConfig(MLATransformerConfig, GPTConfig):
    """
    Base config for DeepSeek V2 and V3 models.
    """

    transformer_layer_spec: Union["ModuleSpec", Callable[["GPTConfig"], "ModuleSpec"]] = partial(
        get_gpt_decoder_block_spec, use_transformer_engine=HAVE_TE
    )

    # Model
    normalization: str = "RMSNorm"
    activation_func: Callable = F.silu
    gated_linear_unit: bool = True
    position_embedding_type: str = "rope"
    add_bias_linear: bool = False
    share_embeddings_and_output_weights: bool = False
    num_attention_heads: int = 128
    kv_channels: int = 128
    max_position_embeddings: int = 4096
    seq_length: int = 4096
    rotary_base: float = 10000.0
    make_vocab_size_divisible_by: int = 3200
    mtp_num_layers: Optional[int] = None
    mtp_loss_scaling_factor: Optional[float] = None

    # Regularization
    attention_dropout: float = 0.0
    hidden_dropout: float = 0.0
    qk_layernorm: bool = True

    # MoE
    moe_grouped_gemm: bool = True
    moe_router_pre_softmax: bool = True
    moe_token_dispatcher_type: str = "alltoall"
    moe_router_load_balancing_type: str = "seq_aux_loss"
    moe_shared_expert_overlap: bool = True
    moe_router_dtype: Optional[str] = "fp32"

    # MLA
    q_lora_rank: int = 1536
    kv_lora_rank: int = 512
    qk_head_dim: int = 128
    qk_pos_emb_head_dim: int = 64
    v_head_dim: int = 128
    rotary_scaling_factor: float = 40
    mscale: float = 1.0
    mscale_all_dim: float = 1.0

    # Initialization
    init_method_std: float = 0.006
    layernorm_epsilon: float = 1e-6
    bf16: bool = True
    params_dtype: torch.dtype = torch.bfloat16

    # Parallelism / pipeline
    async_tensor_model_parallel_allreduce: bool = True
    attention_softmax_in_fp32: bool = False
    persist_layer_norm: bool = True
    num_layers_in_first_pipeline_stage: Optional[int] = None
    num_layers_in_last_pipeline_stage: Optional[int] = None
    account_for_embedding_in_pipeline_split: bool = False
    account_for_loss_in_pipeline_split: bool = False

    # Fusions
    apply_rope_fusion: bool = False
    bias_activation_fusion: bool = True
    bias_dropout_fusion: bool = True
    masked_softmax_fusion: bool = False
    gradient_accumulation_fusion: bool = True
    cross_entropy_loss_fusion: bool = True
    cross_entropy_fusion_impl: str = "te"
    moe_permute_fusion: bool = is_te_min_version("2.1.0") if HAVE_TE else False

    def __post_init__(self):
        super().__post_init__()
        if self.mtp_num_layers is not None:
            # MTP training needs extra shifted labels, so the data step must
            # be made MTP-aware.
            self.data_step_fn = partial(gpt_data_step, use_mtp=True)
@dataclass
class DeepSeekV2Config(DeepSeekConfig):
    """
    DeepSeek-V2 Model: https://github.com/deepseek-ai/DeepSeek-V2
    """

    num_layers: int = 60
    hidden_size: int = 5120
    ffn_hidden_size: int = 12288
    num_moe_experts: int = 160
    moe_ffn_hidden_size: int = 1536
    moe_shared_expert_intermediate_size: int = 3072  # 1536 * 2 shared experts
    moe_layer_freq: Union[int, List[int]] = field(default_factory=lambda: [0] + [1] * 59)  # first layer is dense
    moe_router_topk: int = 6
    moe_router_num_groups: int = 8
    moe_router_group_topk: int = 3
    moe_router_topk_scaling_factor: float = 16.0
    moe_aux_loss_coeff: float = 1e-3
    mscale: float = 0.707
    mscale_all_dim: float = 0.707
@dataclass
class DeepSeekV2LiteConfig(DeepSeekV2Config):
    """
    DeepSeek-V2-Lite Model: https://github.com/deepseek-ai/DeepSeek-V2
    HuggingFace: https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite
    """

    num_layers: int = 27
    hidden_size: int = 2048
    ffn_hidden_size: int = 10944
    num_attention_heads: int = 16
    kv_channels: int = 16
    q_lora_rank: int = None  # V2-Lite does not compress the query projection
    num_moe_experts: int = 64
    moe_ffn_hidden_size: int = 1408
    moe_shared_expert_intermediate_size: int = 2816  # 1408 * 2 shared experts
    moe_layer_freq: Union[int, List[int]] = field(default_factory=lambda: [0] + [1] * 26)  # first layer is dense
    moe_router_topk: int = 6
    moe_router_num_groups: int = 1
    moe_router_group_topk: int = 1
    moe_router_topk_scaling_factor: float = 1.0
@dataclass
class DeepSeekV3Config(DeepSeekConfig):
    """
    DeepSeek-V3 Model: https://github.com/deepseek-ai/DeepSeek-V3
    """

    num_layers: int = 61
    hidden_size: int = 7168
    ffn_hidden_size: int = 18432
    num_moe_experts: int = 256
    moe_ffn_hidden_size: int = 2048
    moe_shared_expert_intermediate_size: int = 2048  # 2048 * 1 shared expert
    moe_layer_freq: Union[int, List[int]] = field(
        default_factory=lambda: [0] * 3 + [1] * 58
    )  # first three layers are dense
    moe_router_topk: int = 8
    moe_router_num_groups: int = 8
    moe_router_group_topk: int = 4
    moe_router_topk_scaling_factor: float = 2.5
    moe_aux_loss_coeff: float = 1e-4
    make_vocab_size_divisible_by: int = 1280
    moe_router_score_function: str = "sigmoid"
    moe_router_enable_expert_bias: bool = True
    moe_router_bias_update_rate: float = 1e-3
    mscale: float = 1.0
    mscale_all_dim: float = 1.0
class DeepSeekModel(GPTModel):
    def __init__(
        self,
        config: Optional[DeepSeekConfig] = None,
        optim: Optional[OptimizerModule] = None,
        tokenizer: Optional["TokenizerSpec"] = None,
        model_transform: Optional[Callable[[nn.Module], nn.Module]] = None,
    ):
        super().__init__(
            config or DeepSeekConfig(), optim=optim, tokenizer=tokenizer, model_transform=model_transform
        )
 Zde	j
defddZde	j
eB ddfddZdd ZedddZedefddZdS )HFDeepSeekImporterreturnc                 C   s   t | j| jdS )N)r   )r   r   r   rr   rv   rv   rw   init   s   zHFDeepSeekImporter.initFoutput_pathconvert_mtpc                 C   sz   ddl m} || _|   |jt| ddd}|  }| |}| || | 	|| t
d|  t|| ~~|S )Nr   r%   Tautotrust_remote_codetorch_dtypez1Converted DeepSeek model to Nemo, model saved to )transformersr&   r   _verify_sourcefrom_pretrainedr}   r   
nemo_setupconvert_state	nemo_saver#   infor   )rs   r   r   r&   sourcetargettrainerrv   rv   rw   apply   s   

zHFDeepSeekImporter.applyc                 C   s&   t jt| dd}d|vsJ dd S )NTr   quantization_configa  HuggingFace cannot load DeepSeek V3's FP8 checkpoint directly. You must convert the checkpoint to BF16. See NeMo documentation for more details: https://nemo-framework-tme.gitlab-master-pages.nvidia.com/documentation/user-guide/latest/llms/deepseek_v3.html#nemo-2-0-finetuning-recipes )r   r   r}   )rs   source_configrv   rv   rw   r      s   
z!HFDeepSeekImporter._verify_sourcer   c                 C   sT   |  }t| jjD ]\}}|dkr#|d| d}||d| d< q
t|}|S )a  
    def _modify_source_state(self, source: nn.Module) -> _ModelState:
        """
        In deepseek, HF weight `model.layers.*.post_attention_layernorm.weight` is mapped to mcore weight
        a) `decoder.layers.*.mlp.linear_fc1.layer_norm_weight`, if the layer is dense
        b) `decoder.layers.*.pre_mlp_layernorm.weight`, if the layer is MoE

        We rename model.layers.*.post_attention_layernorm.weight in the first case to prevent a one-to-many mapping
        """
        state_dict = source.state_dict()
        for layer_i, use_moe in enumerate(self.config.moe_layer_freq):
            if use_moe == 0:
                weight = state_dict.pop(f"model.layers.{layer_i}.post_attention_layernorm.weight")
                state_dict[f"model.layers.{layer_i}.dense-post_attention_layernorm.weight"] = weight
        modified_source = _ModelState(state_dict)
        return modified_source

    def _add_mtp_to_source(self, source: Union[nn.Module, _ModelState]) -> None:
        mtp_hf_layer_low = self.config.num_layers
        mtp_hf_layer_high = self.config.num_layers + self.config.mtp_num_layers - 1

        with open(Path(str(self)) / "model.safetensors.index.json") as file:
            manifest = json.load(file)

        safetensor_files_to_load = set()
        mtp_hf_keys = set()
        for k, fname in manifest["weight_map"].items():
            match = re.match(r".*\.layers\.(\d+)\.", k)
            if match and mtp_hf_layer_low <= int(match.group(1)) <= mtp_hf_layer_high:
                safetensor_files_to_load.add(fname)
                mtp_hf_keys.add(k)

        mtp_state_dict = {}
        for safetensor_file in safetensor_files_to_load:
            for k, v in load_file(Path(str(self)) / safetensor_file).items():
                if k in mtp_hf_keys:
                    # e.g. "model.layers.61.eh_proj.weight" -> "model.mtp.61.eh_proj.weight"
                    mtp_state_dict[k.replace(".layers.", ".mtp.")] = v
        source.update(mtp_state_dict)
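    # In the mapping used by convert_state below, "*" matches one dotted name
    # component (typically a layer index) and "**" matches any prefix; captured
    # wildcards are substituted into the target key. For example (illustrative
    # key, not an exhaustive list):
    #
    #     "model.layers.3.mlp.gate.weight" -> "decoder.layers.3.mlp.router.weight"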
dddddddddddddddddddddd d!}| j jd u rE|d= |d= d"|d#< | j jd ur[t|jjjd$ jjts[d%|d< t|jjjd$ jj	tskd&|d< t|jjjd$ j
ts|d= t|j}n| |}t| j d'r| j jr|d(d)i tjd*d+tjd,tjd-d.tjd,tjd/d0tjd,g}t| j d1d r| jr| | |d2d3d4d5d6 tj||||d7S )8Nmodel.embed_tokens.weight embedding.word_embeddings.weightz**.input_layernorm.weightz**.self_attn.o_proj.weightz$**.self_attention.linear_proj.weightz**.self_attn.q_a_proj.weightz+**.self_attention.linear_q_down_proj.weightz**.self_attn.q_b_proj.weightz)**.self_attention.linear_q_up_proj.weightz&**.self_attn.kv_a_proj_with_mqa.weightz,**.self_attention.linear_kv_down_proj.weightz**.self_attn.kv_b_proj.weightz***.self_attention.linear_kv_up_proj.weightz!**.self_attn.q_a_layernorm.weightz4**.self_attention.linear_q_up_proj.layer_norm_weightz"**.self_attn.kv_a_layernorm.weightz5**.self_attention.linear_kv_up_proj.layer_norm_weightz(**.dense-post_attention_layernorm.weightz#**.mlp.linear_fc1.layer_norm_weightz"**.post_attention_layernorm.weightz**.pre_mlp_layernorm.weightz**.mlp.down_proj.weightz**.mlp.linear_fc2.weightz**.mlp.gate.weightz**.mlp.router.weightz!**.mlp.experts.*.down_proj.weightz!**.mlp.experts.linear_fc2.weight*z&**.mlp.shared_experts.down_proj.weightz'**.mlp.shared_experts.linear_fc2.weightmodel.norm.weightdecoder.final_layernorm.weightlm_head.weightoutput_layer.weightz&**.self_attention.linear_q_proj.weightz**.self_attn.q_proj.weightr   z$**.self_attention.q_layernorm.weightz%**.self_attention.kv_layernorm.weightr   z#**.mlp.gate.e_score_correction_biasz**.mlp.router.expert_bias)z**.mlp.gate_proj.weightz**.mlp.up_proj.weightz**.mlp.linear_fc1.weight
source_key
target_keyfn)z!**.mlp.experts.*.gate_proj.weightz**.mlp.experts.*.up_proj.weightz!**.mlp.experts.linear_fc1.weight*)z&**.mlp.shared_experts.gate_proj.weightz$**.mlp.shared_experts.up_proj.weightz'**.mlp.shared_experts.linear_fc1.weightr@   zmtp.layers.*.eh_proj.weightzmtp.layers.*.enorm.weightzmtp.layers.*.hnorm.weightz$mtp.layers.*.shared_head_norm.weight)zmodel.mtp.*.eh_proj.weightzmodel.mtp.*.enorm.weightzmodel.mtp.*.hnorm.weightz#model.mtp.*.shared_head.norm.weightmapping
transforms)r   rO   
isinstancemoduledecoderlayersself_attentionq_layernormr   kv_layernormpre_mlp_layernormr    r   r   hasattrr   r   r   state_transformr   	merge_fc1getattrr   r   apply_transforms)rs   r   r   r   r   rv   rv   rw   r   ,  s   	


	z HFDeepSeekImporter.convert_stater*   c                 C   s"   ddl m} || t| ddS )Nr   r)   T)use_fast)=nemo.collections.common.tokenizers.huggingface.auto_tokenizerr*   save_hf_tokenizer_assetsr}   )rs   r*   rv   rv   rw   r     s   zHFDeepSeekImporter.tokenizerc                 C   s  ddl m} ddl m} |jt| dd}z	|t| }W n ty)   d }Y nw |j|j }|jdk}|rDdd| j	r?|j
nd d}ni }td%i d|jd	|jd
|jd|jd|jd|jd|jd|jd|j|j ddg|j dg|  d|jd|jd|jd|jdt|ddd|jd|jd|jd|jd|rdnd d!t|tjkd"t|tj kd#t|d$||S d!t|tjkd"t|tj kd#t|d$||S )&Nr   r   )GenerationConfigTr   r   )r   r   r@   r   r   r   r9   r:   rO   r   r   r   r   r   r   r   r   r   r   aux_loss_alphar   rP   rQ   rS   rT   r?   r   r>   fp16r\   r]   generation_configrv   )!r   r   r  r   r}   OSErrornum_hidden_layersfirst_k_dense_replacescoring_funcr   num_nextn_predict_layersr-   r   intermediate_sizer9   num_key_value_headsrO   n_routed_expertsmoe_intermediate_sizen_shared_expertsnum_experts_per_tokn_group
topk_grouprouted_scaling_factorr  rP   qk_nope_head_dimqk_rope_head_dimrT   r"   r   float16r   )rs   HFAutoConfigr  r   r  n_moe_layersis_v3	v3_kwargsrv   rv   rw   r     s   
	
zHFDeepSeekImporter.config)Fr   r*   )rx   ry   rz   r   r   r   r   r   r   r   r   r    r   r   r   r   r   r-   r   rv   rv   rv   rw   r      s    	cr   r&   c                   @   s   e Zd ZejdfdddZdeeef defddZ	d	e
deeef fd
dZdde
de
fddZdd Zdeeef deeef defddZedddZedddZdS )HFDeepSeekExporterdeepseek-ai/DeepSeek-V3r   r&   c                 C   sv   ddl m}m} ddlm} |   |j|dd}|j|d|d}t|d |W  d    S 1 s4w   Y  d S )Nr   )r   r&   )no_init_weightsTr   r   r&   )	r   r   r&   transformers.modeling_utilsr)  r   from_configtyperegister_for_auto_class)rs   r   
model_namer   r&   r)  r   hf_modelrv   rv   rw   r     s   $zHFDeepSeekExporter.initr   c                 C   s:   |d rd}n|d durd}nd}t d| d |S )	a  
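    # Example usage (a sketch; `llm.export_ckpt` is the public entry point that
    # dispatches to this exporter — both paths below are illustrative):
    #
    #     from pathlib import Path
    #     from nemo.collections import llm
    #
    #     llm.export_ckpt(Path("/ckpts/deepseek-v3-nemo"), target="hf",
    #                     output_path=Path("/ckpts/deepseek-v3-hf"))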
    def _detect_hf_deepseek_version(self, source_config: Dict[str, Any]) -> str:
        """
        Detect the HF DeepSeek version based on the source NeMo config.

        Args:
            source_config (Dict[str, Any]): The source NeMo model config.

        Returns:
            str: The DeepSeek version in the Hugging Face Hub convention.
        """
        if source_config["moe_router_enable_expert_bias"]:
            target_model_name = "deepseek-ai/DeepSeek-V3"
        elif source_config["q_lora_rank"] is not None:
            target_model_name = "deepseek-ai/DeepSeek-V2"
        else:
            target_model_name = "deepseek-ai/DeepSeek-V2-Lite"
        logging.info(
            f"Your model is determined to be {target_model_name} based on the config. If this is not correct, "
            f"please pass in a local HF checkpoint."
        )
        return target_model_name
    def ckpt_load(self, path: Path) -> Tuple[Dict, Dict]:
        """
        This function loads the state dict directly from a distributed checkpoint, and modify the state dict
        so that it is consistent with the key names you would get from loading the checkpoint into a model.
        This is a more memory-efficient method to obtain a state dict without initializing the nemo model.

        Args:
            path (Path): The path from which the model will be loaded.

        Returns
        -------
            Tuple[Dict, Dict]: The loaded state dict and the yaml config dict.
        """
        model_yaml = path / "context" / "model.yaml"
        if not model_yaml.exists():
            raise FileNotFoundError("model.yaml is not found in the context folder of the checkpoint.")
        with open(model_yaml, "r") as stream:
            config = yaml.safe_load(stream)

        dist_ckpt_folder = path / "weights"
        state_dict = {}
        for k, v in load_distributed_model_weights(dist_ckpt_folder, True).items():
            if "_extra_state" in k:
                continue
            new_k = k.replace("module.", "")
            if ".experts.experts." in k:
                # grouped expert weights are stored stacked; split them into
                # one tensor per expert to match the model's key names
                for i in range(v.size(0)):
                    state_dict[new_k.replace(".experts.experts.", f".experts.{i}.")] = v[i]
            else:
                state_dict[new_k] = v
        return state_dict, config["config"]

    def apply(self, output_path: Path, target_model_name=None) -> Path:
        logging.info("Loading DeepSeek NeMo checkpoint. This may take a while...")
        source, source_config = self.ckpt_load(self)
        logging.info("DeepSeek NeMo checkpoint loaded.")
        if target_model_name is None:
            logging.warning(
                "Before DeepSeek is officially supported in HF, you should pass in a local HF checkpoint using "
                "llm.export_ckpt(..., target_model_name=<local hf path>)"
            )
            target_model_name = self._detect_hf_deepseek_version(source_config)

        target = self.init(torch_dtype_from_dict_config(source_config), model_name=target_model_name)
        target = self.convert_state(source, target, source_config)

        target = target.cpu()
        logging.info(f"Converted DeepSeek model to HF, saving model to {output_path}...")
        target.save_pretrained(output_path, safe_serialization=False)
        self.tokenizer.save_pretrained(output_path)

        return output_path

    def convert_state(self, source, target, source_config=None):
        mapping = {
            "embedding.word_embeddings.weight": "model.embed_tokens.weight",
            "decoder.layers.*.input_layernorm.weight": "model.layers.*.input_layernorm.weight",
            "decoder.layers.*.self_attention.linear_proj.weight": "model.layers.*.self_attn.o_proj.weight",
            "decoder.layers.*.self_attention.linear_q_down_proj.weight": "model.layers.*.self_attn.q_a_proj.weight",
            "decoder.layers.*.self_attention.linear_q_up_proj.weight": "model.layers.*.self_attn.q_b_proj.weight",
            "decoder.layers.*.self_attention.linear_kv_down_proj.weight": (
                "model.layers.*.self_attn.kv_a_proj_with_mqa.weight"
            ),
            "decoder.layers.*.self_attention.linear_kv_up_proj.weight": "model.layers.*.self_attn.kv_b_proj.weight",
            "decoder.layers.*.self_attention.linear_q_up_proj.layer_norm_weight": (
                "model.layers.*.self_attn.q_a_layernorm.weight"
            ),
            "decoder.layers.*.self_attention.linear_kv_up_proj.layer_norm_weight": (
                "model.layers.*.self_attn.kv_a_layernorm.weight"
            ),
            "decoder.layers.*.pre_mlp_layernorm.weight": "model.layers.*.post_attention_layernorm.weight",
            "decoder.layers.*.mlp.linear_fc2.weight": "model.layers.*.mlp.down_proj.weight",
            "decoder.layers.*.mlp.router.weight": "model.layers.*.mlp.gate.weight",
            "decoder.layers.*.mlp.experts.linear_fc2.weight*": "model.layers.*.mlp.experts.*.down_proj.weight",
            "decoder.layers.*.mlp.shared_experts.linear_fc2.weight": (
                "model.layers.*.mlp.shared_experts.down_proj.weight"
            ),
            "decoder.final_layernorm.weight": "model.norm.weight",
            "output_layer.weight": "lm_head.weight",
        }
        if source_config["q_lora_rank"] is None:
            del mapping["decoder.layers.*.self_attention.linear_q_down_proj.weight"]
            del mapping["decoder.layers.*.self_attention.linear_q_up_proj.weight"]
            mapping["decoder.layers.*.self_attention.linear_q_proj.weight"] = "model.layers.*.self_attn.q_proj.weight"
        if source_config["q_lora_rank"] is not None and "decoder.layers.0.self_attention.q_layernorm.weight" in source:
            mapping["decoder.layers.*.self_attention.q_layernorm.weight"] = mapping.pop(
                "decoder.layers.*.self_attention.linear_q_up_proj.layer_norm_weight"
            )
        if "decoder.layers.0.self_attention.kv_layernorm.weight" in source:
            mapping["decoder.layers.*.self_attention.kv_layernorm.weight"] = mapping.pop(
                "decoder.layers.*.self_attention.linear_kv_up_proj.layer_norm_weight"
            )
        if source_config.get("moe_router_enable_expert_bias", False):
            mapping.update(
                {"decoder.layers.*.mlp.router.expert_bias": "model.layers.*.mlp.gate.e_score_correction_bias"}
            )

        transforms = [
            io.state_transform(
                source_key="decoder.layers.*.mlp.linear_fc1.weight",
                target_key=("model.layers.*.mlp.gate_proj.weight", "model.layers.*.mlp.up_proj.weight"),
                fn=TransformFns.split_fc1,
            ),
            io.state_transform(
                source_key="decoder.layers.*.mlp.experts.linear_fc1.weight*",
                target_key=(
                    "model.layers.*.mlp.experts.*.gate_proj.weight",
                    "model.layers.*.mlp.experts.*.up_proj.weight",
                ),
                fn=TransformFns.split_fc1,
            ),
            io.state_transform(
                source_key="decoder.layers.*.mlp.shared_experts.linear_fc1.weight",
                target_key=(
                    "model.layers.*.mlp.shared_experts.gate_proj.weight",
                    "model.layers.*.mlp.shared_experts.up_proj.weight",
                ),
                fn=TransformFns.split_fc1,
            ),
        ]
        source = self._modify_source_state(source, source_config)
        return io.apply_transforms(source, target, mapping=mapping, transforms=transforms)
    def _modify_source_state(self, source: Dict[str, Any], source_config: Dict[str, Any]) -> _ModelState:
        """
        In deepseek, HF weight `model.layers.*.post_attention_layernorm.weight` is mapped to mcore weight
        a) `decoder.layers.*.mlp.linear_fc1.layer_norm_weight`, if the layer is dense
        b) `decoder.layers.*.pre_mlp_layernorm.weight`, if the layer is MoE

        We rename decoder.layers.*.mlp.linear_fc1.layer_norm_weight in the first case to unify key names
        """
        for layer_i in range(source_config["num_layers"]):
            if f"decoder.layers.{layer_i}.mlp.linear_fc1.layer_norm_weight" in source:
                weight = source.pop(f"decoder.layers.{layer_i}.mlp.linear_fc1.layer_norm_weight")
                source[f"decoder.layers.{layer_i}.pre_mlp_layernorm.weight"] = weight
        modified_source = _ModelState(source)
        return modified_source

    @property
    def tokenizer(self) -> "AutoTokenizer":
        return io.load_context(str(self), subpath="model").tokenizer

    @property
    def config(self) -> "HFDeepseekV3Config":
        """Create a HF DeepseekV3Config from the NeMo model config.

        Translates the NeMo configuration parameters to the equivalent HF
        configuration.

        Currently only supports DeepseekV3Config based on availability
        in the Transformers library.

        Returns:
            HFDeepseekV3Config: HF configuration for DeepSeekV3 models
        """
        from transformers import DeepseekV3Config as HFDeepseekV3Config

        source: DeepSeekV3Config = io.load_context(str(self), subpath="model").config
        target_model_name = self._detect_hf_deepseek_version(asdict(source))
        if target_model_name != "deepseek-ai/DeepSeek-V3":
            raise ValueError(f"Getting config for model other than {target_model_name} is not supported.")

        # infer the number of leading dense layers from the moe_layer_freq
        # pattern, which has the form [0] * first_k_dense_replace + [1] * n_moe_layers
        first_k_dense_replace = 0
        while (
            first_k_dense_replace < len(source.moe_layer_freq)
            and source.moe_layer_freq[first_k_dense_replace] == 0
        ):
            first_k_dense_replace += 1
        assert all(x == 1 for x in source.moe_layer_freq[first_k_dense_replace:])

        return HFDeepseekV3Config(
            architectures=["DeepseekV3ForCausalLM"],
            num_hidden_layers=source.num_layers,
            hidden_size=source.hidden_size,
            intermediate_size=source.ffn_hidden_size,
            num_attention_heads=source.num_attention_heads,
            q_lora_rank=source.q_lora_rank,
            kv_lora_rank=source.kv_lora_rank,
            qk_nope_head_dim=source.qk_head_dim,
            qk_rope_head_dim=source.qk_pos_emb_head_dim,
            v_head_dim=source.v_head_dim,
            n_routed_experts=source.num_moe_experts,
            moe_intermediate_size=source.moe_ffn_hidden_size,
            n_shared_experts=source.moe_shared_expert_intermediate_size // source.moe_ffn_hidden_size,
            first_k_dense_replace=first_k_dense_replace,
            num_experts_per_tok=source.moe_router_topk,
            n_group=source.moe_router_num_groups,
            topk_group=source.moe_router_group_topk,
            routed_scaling_factor=source.moe_router_topk_scaling_factor,
            aux_loss_alpha=source.moe_aux_loss_coeff,
            max_position_embeddings=source.max_position_embeddings,
            vocab_size=self.tokenizer.vocab_size,
        )


__all__ = [
    "DeepSeekConfig",
    "DeepSeekV2Config",
    "DeepSeekV2LiteConfig",
    "DeepSeekV3Config",
    "DeepSeekModel",
]