o
    }oi=                     @   s  d dl mZ d dlmZmZ d dlmZ d dlmZm	Z	m
Z
mZ d dlZd dlmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZmZmZ d dlmZ d dlmZ ernd dlmZ d dlmZ  d dl!m"Z" d dl#m$Z$ eG dd deZeG dd deZ%eG dd deZ&eG dd deZ'eG dd deZ(eG dd deZ)eG dd deZ*eG dd  d eZ+eG d!d" d"e%Z,eG d#d$ d$e%Z-G d%d& d&eZ.e/e.d'G d(d) d)ej0d*e.f Z1e2e.d'G d+d, d,ej0e.d*f Z3g d-Z4dS ).    )	dataclass)cached_propertypartial)Path)TYPE_CHECKING	AnnotatedCallableOptionalN)nn)GPTModeltorch_dtype_from_mcore_config)Qwen2Config)Config)OptimizerModuleioteardown)TransformFns)dtype_from_hfAutoModelForCausalLMQwen3ConfigAutoTokenizer)TokenizerSpecc                   @   s^   e Zd ZU dZdZeed< dZeed< dZe	e
 ed< dZe
ed	< d
Ze
ed< dZe
ed< dS )r   z'
    Base config for Qwen 3 Models
    Fadd_qkv_biasTqk_layernorm   kv_channels   num_query_groupsi   max_position_embeddingsiQ 
vocab_sizeN)__name__
__module____qualname____doc__r   bool__annotations__r   r   r	   intr    r!   r"    r*   r*   X/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/llm/gpt/model/qwen3.pyr   &   s   
 r   c                   @   sr   e Zd ZU dZdZeed< dZeed< dZ	e
ed< dZeed	< d
Zeed< dZeed< dZeed< dZeed< dS )Qwen3MoEConfigz+
    Base config for Qwen 3 MoE Models
    r   num_moe_expertsaux_lossmoe_router_load_balancing_typegMbP?moe_aux_loss_coeffr   moe_router_topkFmoe_router_pre_softmaxTmoe_grouped_gemmalltoallmoe_token_dispatcher_typemoe_permute_fusionN)r#   r$   r%   r&   r-   r)   r(   r/   strr0   floatr1   r2   r'   r3   r5   r6   r*   r*   r*   r+   r,   4   s   
 r,   c                   @   N   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed	< d
Z
eed< dS )Qwen3Config600MzH
    Config for Qwen 3 0.6B: https://huggingface.co/Qwen/Qwen3-0.6B
       
num_layersi   hidden_size   num_attention_headsi   ffn_hidden_sizeT#share_embeddings_and_output_weightsNr#   r$   r%   r&   r<   r)   r(   r=   r?   r@   rA   r'   r*   r*   r*   r+   r:   D      
 r:   c                   @   r9   )Qwen3Config1P7BzH
    Config for Qwen 3 1.7B: https://huggingface.co/Qwen/Qwen3-1.7B
    r;   r<      r=   r>   r?      r@   TrA   NrB   r*   r*   r*   r+   rD   Q   rC   rD   c                   @   r9   )Qwen3Config4BzD
    Config for Qwen 3 4B: https://huggingface.co/Qwen/Qwen3-4B
    $   r<   i 
  r=       r?   i &  r@   TrA   NrB   r*   r*   r*   r+   rG   ^   rC   rG   c                   @   sB   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed	< d
S )Qwen3Config8BzD
    Config for Qwen 3 8B: https://huggingface.co/Qwen/Qwen3-8B
    rH   r<      r=   rI   r?    0  r@   N
r#   r$   r%   r&   r<   r)   r(   r=   r?   r@   r*   r*   r*   r+   rJ   k      
 rJ   c                   @   B   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed< d	S )
Qwen3Config14BzF
    Config for Qwen 3 14B: https://huggingface.co/Qwen/Qwen3-14B
    (   r<      r=   r?   i D  r@   NrM   r*   r*   r*   r+   rP   w   rN   rP   c                   @   rO   )
Qwen3Config32BzF
    Config for Qwen 3 32B: https://huggingface.co/Qwen/Qwen3-32B
    @   r<   rR   r=   r?   i d  r@   NrM   r*   r*   r*   r+   rS      rN   rS   c                   @   Z   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed	< d
Z
eed< dZeed< dS )Qwen3Config30B_A3BzN
    Config for Qwen 3 30B-A3B: https://huggingface.co/Qwen/Qwen3-30B-A3B
    0   r<   rE   r=   rI   r?      r    rF   r@   i   moe_ffn_hidden_sizeNr#   r$   r%   r&   r<   r)   r(   r=   r?   r    r@   rY   r*   r*   r*   r+   rV         
 rV   c                   @   rU   )Qwen3Config235B_A22BzR
    Config for Qwen 3 235B-A22B: https://huggingface.co/Qwen/Qwen3-235B-A22B
    ^   r<   rK   r=   rT   r?   rX   r    rL   r@   i   rY   NrZ   r*   r*   r*   r+   r\      r[   r\   c                       sf   e Zd ZdZ				d
deee ee f dee ded dee	e
jge
jf  f fdd	Z  ZS )
Qwen3Modelz
    Base model for Qwen 3
    Nconfigoptim	tokenizerr   model_transformc                    s   t  j|pt |||d d S )N)r`   ra   rb   )super__init__r   )selfr_   r`   ra   rb   	__class__r*   r+   rd      s   zQwen3Model.__init__)NNNN)r#   r$   r%   r&   r   r	   r   r   r   r   r
   Modulerd   __classcell__r*   r*   rf   r+   r^      s    r^   hfc                   @   sT   e Zd ZdefddZdedefddZdd Zedd
dZ	ede
fddZdS )HFQwen3Importerreturnc                 C   s   t | j| jdS )N)ra   )r^   r_   ra   re   r*   r*   r+   init   s   zHFQwen3Importer.initoutput_pathc                 C   sj   ddl m} |jt| ddd}|  }| |}| || | || td|  t	|| ~~|S )Nr   r   autoT)torch_dtypetrust_remote_codez/Converted Qwen 3 model to Nemo, model saved to )
transformersr   from_pretrainedr7   rn   
nemo_setupconvert_state	nemo_saveprintr   )re   ro   r   sourcetargettrainerr*   r*   r+   apply   s   

zHFQwen3Importer.applyc                 C   s   dddddddd}| j jd u}|r|d	d
dd n|ddd t|j ddr.|d= tjddtjd|sAtjddtjdntjddtjdg}tj	||||dS )N embedding.word_embeddings.weight$**.self_attention.linear_proj.weight$**.self_attention.q_layernorm.weight$**.self_attention.k_layernorm.weight.**.self_attention.linear_qkv.layer_norm_weightdecoder.final_layernorm.weightoutput_layer.weight)model.embed_tokens.weight**.self_attn.o_proj.weight**.self_attn.q_norm.weight**.self_attn.k_norm.weight**.input_layernorm.weightmodel.norm.weightlm_head.weight!**.mlp.experts.linear_fc2.weight***.mlp.router.weight**.pre_mlp_layernorm.weight)!**.mlp.experts.*.down_proj.weight**.mlp.gate.weight"**.post_attention_layernorm.weight**.mlp.linear_fc2.weight#**.mlp.linear_fc1.layer_norm_weight)**.mlp.down_proj.weightr   tie_word_embeddingsFr   z**.self_attn.q_proj.weightz**.self_attn.k_proj.weightz**.self_attn.v_proj.weight#**.self_attention.linear_qkv.weight
source_key
target_keyfnz**.mlp.gate_proj.weightz**.mlp.up_proj.weight**.mlp.linear_fc1.weightz!**.mlp.experts.*.gate_proj.weightz**.mlp.experts.*.up_proj.weight!**.mlp.experts.linear_fc1.weight*mapping
transforms)
r_   r-   updategetattrr   state_transformr   	merge_qkv	merge_fc1apply_transformsre   ry   rz   r   is_moer   r*   r*   r+   rv      sR   	zHFQwen3Importer.convert_stater   c                 C   s"   ddl m} || t| ddS )Nr   r   Trr   )=nemo.collections.common.tokenizers.huggingface.auto_tokenizerr   save_hf_tokenizer_assetsr7   )re   r   r*   r*   r+   ra   	  s   zHFQwen3Importer.tokenizerc                 C   s   ddl m} ddl m} |jt| dd}|t| }t|dd d u}|r.tt|jd}nt	}||j
|j|j|j|j|j|j|jd|jt|d	d
t|tjkt|tjkt||d}|S )Nr   )
AutoConfig)GenerationConfigTr   num_experts)rY   i  r   F)r<   r=   r@   r?   r    init_method_stdlayernorm_epsilonr"   make_vocab_size_divisible_byrotary_baserA   fp16bf16params_dtypegeneration_config)rs   r   r   rt   r7   r   r   r,   moe_intermediate_sizer   num_hidden_layersr=   intermediate_sizer?   num_key_value_headsinitializer_rangerms_norm_epsr"   
rope_thetar   torchfloat16bfloat16)re   HFAutoConfigr   ry   r   r   qwen3_config_clsoutputr*   r*   r+   r_     s4   
zHFQwen3Importer.configN)rl   r   )r#   r$   r%   r^   rn   r   r|   rv   r   ra   r   r_   r*   r*   r*   r+   rk      s    8rk   r   c                   @   sP   e Zd ZejfdddZdedefddZdd	 Ze	d
d Z
e	dddZdS )HFQwen3Exporterrl   r   c                 C   sT   ddl m} ddlm} |  |j| jd|dW  d    S 1 s#w   Y  d S )Nr   r   )no_init_weightsT)rr   rq   )rs   r   transformers.modeling_utilsr   from_configr_   )re   dtyper   r   r*   r*   r+   rn   4  s
   $zHFQwen3Exporter.initro   c                 C   sP   |  t| \}}| t|j}| ||}| }|| | j| |S N)		nemo_loadr7   rn   r   r_   rv   cpusave_pretrainedra   )re   ro   ry   _rz   r*   r*   r+   r|   ;  s   
zHFQwen3Exporter.applyc                 C   s   dddddd}t | jdddk}|r|d	d
dd n|ddd tjddtjd|s8tjddtjdntjddtjdtjddtjdg}| jj	sZ|
tjddtjd tj||||dS )Nr   r   r   r   r   )r~   r   r   r   r   r   r   r   r   r   )r   r   r   r   )r   r   r   r   r   r   r   r   r   r}   r   r   r   r   )r   r_   r   r   r   r   	split_qkv	split_fc1prune_paddingr   appendr   r   r*   r*   r+   rv   F  sn   zHFQwen3Exporter.convert_statec                 C   s   t t| jjjS r   )r   load_contextr7   modelra   rm   r*   r*   r+   ra     s   zHFQwen3Exporter.tokenizerHFQwen3Configc                 C   s   ddl m} ddl m} tjt| dd}|jd u}|r*t||j|j|j	|j
ddn|}|di dd	gd
|jd|jd|jd|jd|jd|jd|jd|jd|jd|jdt|d| jjdd d|jd|jddddS )Nr   r   )Qwen3MoeConfigzmodel.config)subpathT)r   r   num_experts_per_tokrouter_aux_loss_coefnorm_topk_probarchitecturesQwen3ForCausalLMr   r=   r   r?   head_dimr!   r   r   r   r   r"   sliding_windowr   max_window_layersbos_token_idi[P eos_token_idi]P r*   )rs   r   r   r   r   r7   r-   r   rY   r1   r0   r<   r=   r@   r?   r   r!   r   r   r    r   r   ra   r"   rA   )re   r   HFQwen3MoeConfigry   r   hf_config_clsr*   r*   r+   r_     sd   

		
zHFQwen3Exporter.configN)rl   r   )rl   r   )r#   r$   r%   r   r   rn   r   r|   rv   propertyra   r_   r*   r*   r*   r+   r   1  s    E
r   )
r   r:   rD   rG   rJ   rP   rS   rV   r\   r^   )5dataclassesr   	functoolsr   r   pathlibr   typingr   r   r   r	   r   r
   #nemo.collections.llm.gpt.model.baser   r   $nemo.collections.llm.gpt.model.qwen2r   nemo.collections.llm.utilsr   nemo.lightningr   r   r   nemo.lightning.io.stater   nemo.lightning.pytorch.utilsr   rs   r   r   r   r   r   1nemo.collections.common.tokenizers.tokenizer_specr   r,   r:   rD   rG   rJ   rP   rS   rV   r\   r^   model_importerModelConnectorrk   model_exporterr   __all__r*   r*   r*   r+   <module>   sX   

v 