o
    }oiS6                     @   s  d dl mZ d dlmZ d dlmZmZmZmZ d dl	Z	d dl
m  mZ d dl	mZ d dlmZmZmZ d dlmZ d dlmZmZmZ d d	lmZ d d
lmZ erkd dlmZ d dlmZ d dl m!Z! d dl"m#Z# eG dd deZeG dd deZ$eG dd de$Z%eG dd deZ&eG dd de&Z'eG dd deZ(eG dd de(Z)eG dd deZ*eG dd  d eZ+eG d!d" d"eZ,eG d#d$ d$e,Z-G d%d& d&eZ.e/e.d'G d(d) d)ej0d*e.f Z1e2e.d'G d+d, d,ej0e.d*f Z3g d-Z4dS ).    )	dataclass)Path)TYPE_CHECKING	AnnotatedCallableOptionalN)nn)	GPTConfigGPTModeltorch_dtype_from_mcore_config)Config)OptimizerModuleioteardown)TransformFns)dtype_from_hfAutoModelForCausalLMQwen2ConfigAutoTokenizer)TokenizerSpecc                   @   s   e Zd ZU dZdZeed< ejZ	e
ed< dZeed< dZeed< dZeed	< d
Zeed< dZeed< dZeed< dZeed< dZeed< dZee ed< dZeed< dZeed< dZeed< dS )r   z'
    Base config for Qwen 2 Models
    RMSNormnormalizationactivation_funcTgated_linear_unitFadd_bias_linearadd_qkv_biasi   
seq_lengthg{Gz?init_method_stdg        hidden_dropoutattention_dropoutiQ 
vocab_size#share_embeddings_and_output_weightsgư>layernorm_epsilong    .Arotary_baseropeposition_embedding_typeN)__name__
__module____qualname____doc__r   str__annotations__Fsilur   r   r   boolr   r   r   intr    r!   floatr"   r#   r$   r   r%   r&   r(    r4   r4   X/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/llm/gpt/model/qwen2.pyr   %   s    
 r   c                   @   N   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed	< d
Z
eed< dS )Qwen2Config500MzH
    Config for Qwen 2 0.5B: https://huggingface.co/Qwen/Qwen2-0.5B
       
num_layersi  hidden_size   num_attention_heads   num_query_groupsi   ffn_hidden_sizeNr)   r*   r+   r,   r9   r2   r.   r:   r<   r>   r?   r4   r4   r4   r5   r7   ;      
 r7   c                   @      e Zd ZU dZdZeed< dS )Qwen25Config500MzL
    Config for Qwen 2.5 0.5B: https://huggingface.co/Qwen/Qwen2.5-0.5B
    i   r   Nr)   r*   r+   r,   r   r2   r.   r4   r4   r4   r5   rC   H      
 rC   c                   @   r6   )Qwen2Config1P5BzH
    Config for Qwen 2 1.5B: https://huggingface.co/Qwen/Qwen2-1.5B
       r9   i   r:      r<   r=   r>   i #  r?   Nr@   r4   r4   r4   r5   rF   Q   rA   rF   c                   @   rB   )Qwen25Config1P5BzL
    Config for Qwen 2.5 1.5B: https://huggingface.co/Qwen/Qwen2.5-1.5B
       r   NrD   r4   r4   r4   r5   rI   ^   rE   rI   c                   @   sZ   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed< d	Z
eed
< dZeed< dS )Qwen2Config7BzD
    Config for Qwen 2 7B: https://huggingface.co/Qwen/Qwen2-7B
    rG   r9   i   r:   r<      r>   i J  r?    R r#   N)r)   r*   r+   r,   r9   r2   r.   r:   r<   r>   r?   r#   r4   r4   r4   r5   rK   g   s   
 rK   c                   @   rB   )Qwen25Config7BzH
    Config for Qwen 2.5 7B: https://huggingface.co/Qwen/Qwen2.5-7B
    rJ   r   NrD   r4   r4   r4   r5   rN   u   rE   rN   c                   @   r   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed	< d
Z
eed< dZeed< dZeed< dZeed< dS )Qwen25Config14BzJ
    Config for Qwen 2.5 14B: https://huggingface.co/Qwen/Qwen2.5-14B
    0   r9      r:   (   r<      r>   i 6  r?   rM   r#   h㈵>r%   rJ   r   Nr)   r*   r+   r,   r9   r2   r.   r:   r<   r>   r?   r#   r%   r3   r   r4   r4   r4   r5   rP   ~      
 rP   c                   @   rO   )Qwen25Config32BzJ
    Config for Qwen 2.5 32B: https://huggingface.co/Qwen/Qwen2.5-32B
    @   r9   rR   r:   rS   r<   rT   r>   i l  r?   rM   r#   rU   r%   rJ   r   NrV   r4   r4   r4   r5   rX      rW   rX   c                   @   sf   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed	< d
Z
eed< dZeed< dZeed< dS )Qwen2Config72BzF
    Config for Qwen 2 72B: https://huggingface.co/Qwen/Qwen2-72B
    P   r9   i    r:   rY   r<   rT   r>   is  r?   rM   r#   rU   r%   N)r)   r*   r+   r,   r9   r2   r.   r:   r<   r>   r?   r#   r%   r3   r4   r4   r4   r5   rZ      s   
 rZ   c                   @   rB   )Qwen25Config72BzJ
    Config for Qwen 2.5 72B: https://huggingface.co/Qwen/Qwen2.5-72B
    rJ   r   NrD   r4   r4   r4   r5   r\      rE   r\   c                       sf   e Zd ZdZ				d
deee ee f dee ded dee	e
jge
jf  f fdd	Z  ZS )
Qwen2Modelz
    Base model for Qwen 2
    Nconfigoptim	tokenizerr   model_transformc                    s   t  j|pt |||d d S )N)r_   r`   ra   )super__init__r   )selfr^   r_   r`   ra   	__class__r4   r5   rc      s   zQwen2Model.__init__)NNNN)r)   r*   r+   r,   r   r   r   r   r   r   r   Modulerc   __classcell__r4   r4   re   r5   r]      s    r]   hfc                   @   sT   e Zd ZdefddZdedefddZdd Zedd
dZ	ede
fddZdS )HFQwen2Importerreturnc                 C   s   t | j| jdS )N)r`   )r]   r^   r`   rd   r4   r4   r5   init   s   zHFQwen2Importer.initoutput_pathc                 C   sj   ddl m} |jt| ddd}|  }| |}| || | || td|  t	|| ~~|S )Nr   r   autoT)torch_dtypetrust_remote_codez-Converted Qwen model to Nemo, model saved to )
transformersr   from_pretrainedr-   rm   
nemo_setupconvert_state	nemo_saveprintr   )rd   rn   r   sourcetargettrainerr4   r4   r5   apply   s   

zHFQwen2Importer.applyc                 C   sZ   dddddddd}t jd	d
tjdt jddtjdt jddtjdg}t j||||dS )N embedding.word_embeddings.weight2decoder.layers.*.self_attention.linear_proj.weight&decoder.layers.*.mlp.linear_fc2.weight<decoder.layers.*.self_attention.linear_qkv.layer_norm_weight1decoder.layers.*.mlp.linear_fc1.layer_norm_weightdecoder.final_layernorm.weightoutput_layer.weight)model.embed_tokens.weight&model.layers.*.self_attn.o_proj.weight#model.layers.*.mlp.down_proj.weight%model.layers.*.input_layernorm.weight.model.layers.*.post_attention_layernorm.weightmodel.norm.weightlm_head.weightz&model.layers.*.self_attn.q_proj.weightz&model.layers.*.self_attn.k_proj.weightz&model.layers.*.self_attn.v_proj.weight1decoder.layers.*.self_attention.linear_qkv.weight
source_key
target_keyfnz$model.layers.*.self_attn.q_proj.biasz$model.layers.*.self_attn.k_proj.biasz$model.layers.*.self_attn.v_proj.bias/decoder.layers.*.self_attention.linear_qkv.biasz#model.layers.*.mlp.gate_proj.weightz!model.layers.*.mlp.up_proj.weight&decoder.layers.*.mlp.linear_fc1.weightmapping
transforms)r   state_transformr   	merge_qkvmerge_qkv_bias	merge_fc1apply_transformsrd   rx   ry   r   r   r4   r4   r5   ru      s2   		zHFQwen2Importer.convert_stater   c                 C   s"   ddl m} || t| ddS )Nr   r   Trq   )=nemo.collections.common.tokenizers.huggingface.auto_tokenizerr   save_hf_tokenizer_assetsr-   )rd   r   r4   r4   r5   r`     s   zHFQwen2Importer.tokenizerc                 C   s   ddl m} ddl m} |jt| dd}|t| }tdi d|jd|jd|jd	|j	d
|j
d|jd|jddddd|jddd|jd|jdt|tjkdt|tjkdt|d|}|S )Nr   )
AutoConfig)GenerationConfigTr   r9   r:   r?   r<   r>   r    r%   r   make_vocab_size_divisible_by   r&   r$   Fr#   r   fp16bf16params_dtypegeneration_configr4   )rr   r   r   rs   r-   r   num_hidden_layersr:   intermediate_sizer<   num_key_value_headsinitializer_rangerms_norm_eps
rope_thetar#   max_position_embeddingsr   torchfloat16bfloat16)rd   HFAutoConfigr   rx   r   outputr4   r4   r5   r^     sP   	
zHFQwen2Importer.configN)rk   r   )r)   r*   r+   r]   rm   r   r{   ru   propertyr`   r   r^   r4   r4   r4   r5   rj      s    &rj   r   c                   @   sP   e Zd ZejfdddZdedefddZdd	 Ze	d
d Z
e	dddZdS )HFQwen2Exporterrk   r   c                 C   sT   ddl m} ddlm} |  |j| jd|dW  d    S 1 s#w   Y  d S )Nr   r   )no_init_weightsT)rq   rp   )rr   r   transformers.modeling_utilsr   from_configr^   )rd   dtyper   r   r4   r4   r5   rm   )  s
   $zHFQwen2Exporter.initrn   c                 C   sP   |  t| \}}| t|j}| ||}| }|| | j| |S N)		nemo_loadr-   rm   r   r^   ru   cpusave_pretrainedr`   )rd   rn   rx   _ry   r4   r4   r5   r{   0  s   
zHFQwen2Exporter.applyc              	   C   sv   dddddd}t jddtjd	t jd
dtjd	t jddtjd	t jddtjd	t jddtjd	g}t j||||dS )Nr   r   r   r   r   )r}   r~   r   r   r   r   r   r   r   r   r   r   r|   r   r   r   r   )r   r   r   	split_qkvsplit_qkv_bias	split_fc1prune_paddingr   r   r4   r4   r5   ru   ;  sL   			#zHFQwen2Exporter.convert_statec                 C   s   t t| jjjS r   )r   load_contextr-   modelr`   rl   r4   r4   r5   r`   n  s   zHFQwen2Exporter.tokenizerHFQwen2Configc                 C   s|   ddl m} tjt| dd}|dg|j|j|j|j|j	d ur"|j	n|j|j |j
|j|j|j|jt|d| jj|j
ddS )	Nr   r   zmodel.config)subpathQwen2ForCausalLMr#   F)architecturesr   r:   r   r<   head_dimr   r   r   r   r   r#   sliding_windowtie_word_embeddings)rr   r   r   r   r-   r9   r:   r?   r<   kv_channelsr   r    r%   r>   r&   getattrr`   r#   )rd   r   rx   r4   r4   r5   r^   r  s(   

zHFQwen2Exporter.configN)rk   r   )rk   r   )r)   r*   r+   r   r   rm   r   r{   ru   r   r`   r^   r4   r4   r4   r5   r   &  s    3
r   )r   r7   rF   rK   rZ   rC   rI   rN   rP   rX   r\   r]   )5dataclassesr   pathlibr   typingr   r   r   r   r   torch.nn.functionalr   
functionalr/   #nemo.collections.llm.gpt.model.baser	   r
   r   nemo.collections.llm.utilsr   nemo.lightningr   r   r   nemo.lightning.io.stater   nemo.lightning.pytorch.utilsr   rr   r   r   r   r   r   1nemo.collections.common.tokenizers.tokenizer_specr   r7   rC   rF   rI   rK   rN   rP   rX   rZ   r\   r]   model_importerModelConnectorrj   model_exporterr   __all__r4   r4   r4   r5   <module>   sX   

`g