o
    wi7                     @   s.  d dl mZ d dlmZ d dlmZmZmZmZ d dl	Z	d dl
m  mZ d dl	mZ d dlmZmZmZ d dlmZ d dlmZmZmZ d d	lmZ d d
lmZ erkd dlmZ d dlmZ d dl m!Z! d dl"m#Z# eG dd deZeG dd deZ$eG dd de$Z%eG dd deZ&eG dd deZ'eG dd de&Z(eG dd deZ)eG dd de)Z*eG dd  d eZ+eG d!d" d"eZ,eG d#d$ d$eZ-eG d%d& d&e-Z.G d'd( d(eZ/e0e/d)G d*d+ d+ej1d,e/f Z2e3e/d)G d-d. d.ej1e/d,f Z4g d/Z5dS )0    )	dataclass)Path)TYPE_CHECKING	AnnotatedCallableOptionalN)nn)	GPTConfigGPTModeltorch_dtype_from_mcore_config)Config)OptimizerModuleioteardown)TransformFns)dtype_from_hfAutoModelForCausalLMQwen2ConfigAutoTokenizer)TokenizerSpecc                   @   s   e Zd ZU dZdZeed< ejZ	e
ed< dZeed< dZeed< dZeed	< d
Zeed< dZeed< dZeed< dZeed< dZeed< dZee ed< dZeed< dZeed< dZeed< dS )r   z'
    Base config for Qwen 2 Models
    RMSNormnormalizationactivation_funcTgated_linear_unitFadd_bias_linearadd_qkv_biasi   
seq_lengthg{Gz?init_method_stdg        hidden_dropoutattention_dropoutQ 
vocab_size#share_embeddings_and_output_weightsgư>layernorm_epsilong    .Arotary_baseropeposition_embedding_typeN)__name__
__module____qualname____doc__r   str__annotations__Fsilur   r   r   boolr   r   r   intr    r!   floatr"   r$   r%   r   r&   r'   r)    r5   r5   a/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/llm/gpt/model/qwen2.pyr   %   s    
 r   c                   @   N   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed	< d
Z
eed< dS )Qwen2Config500MzH
    Config for Qwen 2 0.5B: https://huggingface.co/Qwen/Qwen2-0.5B
       
num_layersi  hidden_size   num_attention_heads   num_query_groupsi   ffn_hidden_sizeNr*   r+   r,   r-   r:   r3   r/   r;   r=   r?   r@   r5   r5   r5   r6   r8   ;      
 r8   c                   @      e Zd ZU dZdZeed< dS )Qwen25Config500MzL
    Config for Qwen 2.5 0.5B: https://huggingface.co/Qwen/Qwen2.5-0.5B
    i   r   Nr*   r+   r,   r-   r   r3   r/   r5   r5   r5   r6   rD   H      
 rD   c                   @   r7   )Qwen2Config1P5BzH
    Config for Qwen 2 1.5B: https://huggingface.co/Qwen/Qwen2-1.5B
       r:   i   r;      r=   r>   r?   i #  r@   NrA   r5   r5   r5   r6   rG   Q   rB   rG   c                   @   f   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed	< d
Z
eed< dZeed< dZeed< dS )Qwen25Config3BzH
    Config for Qwen 2.5 3B: https://huggingface.co/Qwen/Qwen2.5-3B
    $   r:   i   r;      r=   r>   r?   i +  r@   r#   r$   Tr%   N)r*   r+   r,   r-   r:   r3   r/   r;   r=   r?   r@   r$   r%   r2   r5   r5   r5   r6   rK   ^      
 rK   c                   @   rC   )Qwen25Config1P5BzL
    Config for Qwen 2.5 1.5B: https://huggingface.co/Qwen/Qwen2.5-1.5B
       r   NrE   r5   r5   r5   r6   rO   m   rF   rO   c                   @   sZ   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed< d	Z
eed
< dZeed< dS )Qwen2Config7BzD
    Config for Qwen 2 7B: https://huggingface.co/Qwen/Qwen2-7B
    rH   r:   i   r;   r=      r?   i J  r@    R r$   N)r*   r+   r,   r-   r:   r3   r/   r;   r=   r?   r@   r$   r5   r5   r5   r6   rQ   v   s   
 rQ   c                   @   rC   )Qwen25Config7BzH
    Config for Qwen 2.5 7B: https://huggingface.co/Qwen/Qwen2.5-7B
    rP   r   NrE   r5   r5   r5   r6   rT      rF   rT   c                   @   r   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed	< d
Z
eed< dZeed< dZeed< dZeed< dS )Qwen25Config14BzJ
    Config for Qwen 2.5 14B: https://huggingface.co/Qwen/Qwen2.5-14B
    0   r:      r;   (   r=      r?   i 6  r@   rS   r$   h㈵>r&   rP   r   Nr*   r+   r,   r-   r:   r3   r/   r;   r=   r?   r@   r$   r&   r4   r   r5   r5   r5   r6   rV         
 rV   c                   @   rU   )Qwen25Config32BzJ
    Config for Qwen 2.5 32B: https://huggingface.co/Qwen/Qwen2.5-32B
    @   r:   rX   r;   rY   r=   rZ   r?   i l  r@   rS   r$   r[   r&   rP   r   Nr\   r5   r5   r5   r6   r^      r]   r^   c                   @   rJ   )Qwen2Config72BzF
    Config for Qwen 2 72B: https://huggingface.co/Qwen/Qwen2-72B
    P   r:   i    r;   r_   r=   rZ   r?   is  r@   rS   r$   r[   r&   N)r*   r+   r,   r-   r:   r3   r/   r;   r=   r?   r@   r$   r&   r4   r5   r5   r5   r6   r`      rN   r`   c                   @   rC   )Qwen25Config72BzJ
    Config for Qwen 2.5 72B: https://huggingface.co/Qwen/Qwen2.5-72B
    rP   r   NrE   r5   r5   r5   r6   rb      rF   rb   c                       sf   e Zd ZdZ				d
deee ee f dee ded dee	e
jge
jf  f fdd	Z  ZS )
Qwen2Modelz
    Base model for Qwen 2
    Nconfigoptim	tokenizerr   model_transformc                    s   t  j|pt |||d d S )N)re   rf   rg   )super__init__r   )selfrd   re   rf   rg   	__class__r5   r6   ri      s   zQwen2Model.__init__)NNNN)r*   r+   r,   r-   r   r   r   r   r   r   r   Moduleri   __classcell__r5   r5   rk   r6   rc      s    rc   hfc                   @   sT   e Zd ZdefddZdedefddZdd Zedd
dZ	ede
fddZdS )HFQwen2Importerreturnc                 C   s   t | j| jdS )N)rf   )rc   rd   rf   rj   r5   r5   r6   init   s   zHFQwen2Importer.initoutput_pathc                 C   sj   ddl m} |jt| ddd}|  }| |}| || | || td|  t	|| ~~|S )Nr   r   autoT)torch_dtypetrust_remote_codez-Converted Qwen model to Nemo, model saved to )
transformersr   from_pretrainedr.   rs   
nemo_setupconvert_state	nemo_saveprintr   )rj   rt   r   sourcetargettrainerr5   r5   r6   apply   s   

zHFQwen2Importer.applyc                 C   sZ   dddddddd}t jd	d
tjdt jddtjdt jddtjdg}t j||||dS )N embedding.word_embeddings.weight2decoder.layers.*.self_attention.linear_proj.weight&decoder.layers.*.mlp.linear_fc2.weight<decoder.layers.*.self_attention.linear_qkv.layer_norm_weight1decoder.layers.*.mlp.linear_fc1.layer_norm_weightdecoder.final_layernorm.weightoutput_layer.weight)model.embed_tokens.weight&model.layers.*.self_attn.o_proj.weight#model.layers.*.mlp.down_proj.weight%model.layers.*.input_layernorm.weight.model.layers.*.post_attention_layernorm.weightmodel.norm.weightlm_head.weightz&model.layers.*.self_attn.q_proj.weightz&model.layers.*.self_attn.k_proj.weightz&model.layers.*.self_attn.v_proj.weight1decoder.layers.*.self_attention.linear_qkv.weight
source_key
target_keyfnz$model.layers.*.self_attn.q_proj.biasz$model.layers.*.self_attn.k_proj.biasz$model.layers.*.self_attn.v_proj.bias/decoder.layers.*.self_attention.linear_qkv.biasz#model.layers.*.mlp.gate_proj.weightz!model.layers.*.mlp.up_proj.weight&decoder.layers.*.mlp.linear_fc1.weightmapping
transforms)r   state_transformr   	merge_qkvmerge_qkv_bias	merge_fc1apply_transformsrj   r~   r   r   r   r5   r5   r6   r{      s2   		zHFQwen2Importer.convert_stater   c                 C   s"   ddl m} || t| ddS )Nr   r   Trw   )=nemo.collections.common.tokenizers.huggingface.auto_tokenizerr   save_hf_tokenizer_assetsr.   )rj   r   r5   r5   r6   rf     s   zHFQwen2Importer.tokenizerc                 C   s   ddl m} ddl m} |jt| dd}|t| }tdi d|jd|jd|jd	|j	d
|j
d|jd|jddddd|jddd|jd|jdt|tjkdt|tjkdt|d|}|S )Nr   )
AutoConfig)GenerationConfigTr   r:   r;   r@   r=   r?   r    r&   r   make_vocab_size_divisible_by   r'   r%   Fr$   r   fp16bf16params_dtypegeneration_configr5   )rx   r   r   ry   r.   r   num_hidden_layersr;   intermediate_sizer=   num_key_value_headsinitializer_rangerms_norm_eps
rope_thetar$   max_position_embeddingsr   torchfloat16bfloat16)rj   HFAutoConfigr   r~   r   outputr5   r5   r6   rd     sP   	
zHFQwen2Importer.configN)rq   r   )r*   r+   r,   rc   rs   r   r   r{   propertyrf   r   rd   r5   r5   r5   r6   rp      s    &rp   r   c                   @   sP   e Zd ZejfdddZdedefddZdd	 Ze	d
d Z
e	dddZdS )HFQwen2Exporterrq   r   c                 C   sT   ddl m} ddlm} |  |j| jd|dW  d    S 1 s#w   Y  d S )Nr   r   )no_init_weightsT)rw   rv   )rx   r   transformers.modeling_utilsr   from_configrd   )rj   dtyper   r   r5   r5   r6   rs   8  s
   $zHFQwen2Exporter.initrt   c                 C   sP   |  t| \}}| t|j}| ||}| }|| | j| |S N)		nemo_loadr.   rs   r   rd   r{   cpusave_pretrainedrf   )rj   rt   r~   _r   r5   r5   r6   r   ?  s   
zHFQwen2Exporter.applyc              	   C   sv   dddddd}t jddtjd	t jd
dtjd	t jddtjd	t jddtjd	t jddtjd	g}t j||||dS )Nr   r   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )r   r   r   	split_qkvsplit_qkv_bias	split_fc1prune_paddingr   r   r5   r5   r6   r{   J  sL   			#zHFQwen2Exporter.convert_statec                 C   s   t t| jjjS r   )r   load_contextr.   modelrf   rr   r5   r5   r6   rf   }  s   zHFQwen2Exporter.tokenizerHFQwen2Configc                 C   s|   ddl m} tjt| dd}|dg|j|j|j|j|j	d ur"|j	n|j|j |j
|j|j|j|jt|d| jj|j
ddS )	Nr   r   zmodel.config)subpathQwen2ForCausalLMr$   F)architecturesr   r;   r   r=   head_dimr   r   r   r   r   r$   sliding_windowtie_word_embeddings)rx   r   r   r   r.   r:   r;   r@   r=   kv_channelsr   r    r&   r?   r'   getattrrf   r$   )rj   r   r~   r5   r5   r6   rd     s(   

zHFQwen2Exporter.configN)rq   r   )rq   r   )r*   r+   r,   r   r   rs   r   r   r{   r   rf   rd   r5   r5   r5   r6   r   5  s    3
r   )r   r8   rG   rK   rQ   r`   rD   rO   rT   rV   r^   rb   rc   )6dataclassesr   pathlibr   typingr   r   r   r   r   torch.nn.functionalr   
functionalr0   #nemo.collections.llm.gpt.model.baser	   r
   r   nemo.collections.llm.utilsr   nemo.lightningr   r   r   nemo.lightning.io.stater   nemo.lightning.pytorch.utilsr   rx   r   r   r   r   r   1nemo.collections.common.tokenizers.tokenizer_specr   r8   rD   rG   rK   rO   rQ   rT   rV   r^   r`   rb   rc   model_importerModelConnectorrp   model_exporterr   __all__r5   r5   r5   r6   <module>   s\   

`g