o
    }oi8                     @   s  d dl mZ d dlmZ d dlmZmZmZmZ d dl	Z	d dl
m  mZ d dl	mZ d dlmZmZ d dlmZmZ d dlmZ d d	lmZ er[d d
lmZ d dlmZ d dlmZ eG dd deZeG dd deZeG dd deZ eG dd deZ!G dd deZ"ej#e"ddG dd dej$de"f Z%ej&ddddej'fd d!Z(ej&d"d#ddej'fd$d%Z)e*e"dG d&d' d'ej$e"df Z+g d(Z,dS ))    )	dataclass)Path)TYPE_CHECKINGCallableOptionalUnionN)nn)	GPTConfigGPTModel)ioteardown)TransformFns)OptimizerModuleMixtralForCausalLMAutoTokenizer)TokenizerSpecc                   @   s  e Zd ZU dZdZeed< ejZ	e
ed< dZeed< dZeed< d	Zeed
< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< d Zeed!< d"Z eed#< d$Z!eed%< d&Z"eed'< d(Z#eed)< d*Z$eed+< d	Z%eed,< e&j'Z(e&j)ed-< d	Z*eed.< d	Z+eed/< d	Z,eed0< dZ-eed1< d2S )3MixtralConfigz)
    Base config for Mixtral models.
    RMSNormnormalizationactivation_funcropeposition_embedding_typeFadd_bias_linearTgated_linear_unit    
num_layers   hidden_sizenum_attention_heads   num_query_groups 8  ffn_hidden_sizemax_position_embeddings
seq_lengthg        attention_dropouthidden_dropout#share_embeddings_and_output_weightsnum_moe_expertsg{Gz?moe_aux_loss_coeff   moe_router_topkmoe_router_pre_softmaxalltoallmoe_token_dispatcher_typeaux_lossmoe_router_load_balancing_typeg{Gz?init_method_stdgh㈵>layernorm_epsilong      ?rotary_percentg    .Arotary_basebf16params_dtypeapply_rope_fusionbias_activation_fusionbias_dropout_fusionmasked_softmax_fusionN).__name__
__module____qualname____doc__r   str__annotations__Fsilur   r   r   r   boolr   r   intr   r    r"   r$   r%   r&   r'   floatr(   r)   r*   r+   r-   r.   r0   r2   r3   r4   r5   r6   r7   torchbfloat16r8   dtyper9   r:   r;   r<    rK   rK   Z/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/llm/gpt/model/mixtral.pyr   #   sB   
 r   c                   @   sZ   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed< d	Z
eed
< d	Zeed< dS )MixtralConfig8x3Bz
    NeMo's Mixtral-8x3B model variant
    https://github.com/NVIDIA/NeMo-Framework-Launcher/blob/main/launcher_scripts/conf/training/mixtral/mixtral_8x3b.yaml
    r   r   i 
  r   r    i #  r$   r   r%   r&   Nr=   r>   r?   r@   r   rF   rB   r   r    r$   r%   r&   rK   rK   rK   rL   rM   Q      
 rM   c                   @   sN   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed< dZ
eed	< d
S )MixtralConfig8x7Bzn
    Config for Mixtral-8x7B model
    Official announcement: https://mistral.ai/news/mixtral-of-experts/
    r   r   r   r   r#   r$   r%   r&   N)r=   r>   r?   r@   r   rF   rB   r   r$   r%   r&   rK   rK   rK   rL   rP   `   s   
 rP   c                   @   sZ   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed	< d
Z
eed< d
Zeed< dS )MixtralConfig8x22Bzj
    Config for Mixtral-8x22B model
    Official announcement: https://mistral.ai/news/mixtral-8x22b/
    8   r   i   r   0   r    i @  r$   r   r%   r&   NrN   rK   rK   rK   rL   rQ   n   rO   rQ   c                       sb   e Zd ZdZ				d
deeeef  dee ded dee	e
jge
jf  f fdd	Z  ZS )MixtralModelzMcore-based MixtralModelNconfigoptim	tokenizerr   model_transformc                    s   t  j|pt |||d dS )zMcore-based MixtralModel ctor)rV   rW   rX   N)super__init__rP   )selfrU   rV   rW   rX   	__class__rK   rL   rZ      s   
zMixtralModel.__init__)NNNN)r=   r>   r?   r@   r   r   rP   rQ   r   r   r   ModulerZ   __classcell__rK   rK   r\   rL   rT   }   s    rT   hf)extc                   @   s\   e Zd ZdZdefddZdedefddZdd	 Ze	dddZ
e	deeB fddZdS )HFMixtralImporterzHF to NeMo importerreturnc                 C   s   t | j| jdS )init)rW   )rT   rU   rW   r[   rK   rK   rL   rd      s   zHFMixtralImporter.initoutput_pathc                 C   s\   ddl m} |jt| ddd}|  }| |}| || | || t|| ~~|S )zImport model from HFr   r   autoT)torch_dtypeuse_safetensors)	transformersr   from_pretrainedrA   rd   
nemo_setupconvert_state	nemo_saver   )r[   rf   r   sourcetargettrainerrK   rK   rL   apply   s   

zHFMixtralImporter.applyc                 C   sL   ddddddd}t jdd	tjd
t jddtjd
ttg}t j||||dS )zState-dict converter2decoder.layers.*.self_attention.linear_proj.weight<decoder.layers.*.self_attention.linear_qkv.layer_norm_weight)decoder.layers.*.pre_mlp_layernorm.weight>decoder.layers.*.mlp.experts.local_experts.*.linear_fc2.weight"decoder.layers.*.mlp.router.weightdecoder.final_layernorm.weight)&model.layers.*.self_attn.o_proj.weight%model.layers.*.input_layernorm.weight.model.layers.*.post_attention_layernorm.weight3model.layers.*.block_sparse_moe.experts.*.w2.weight+model.layers.*.block_sparse_moe.gate.weightmodel.norm.weightz&model.layers.*.self_attn.q_proj.weightz&model.layers.*.self_attn.k_proj.weightz&model.layers.*.self_attn.v_proj.weight1decoder.layers.*.self_attention.linear_qkv.weight
source_key
target_keyfnz3model.layers.*.block_sparse_moe.experts.*.w1.weightz3model.layers.*.block_sparse_moe.experts.*.w3.weight>decoder.layers.*.mlp.experts.local_experts.*.linear_fc1.weightmapping
transforms)r   state_transformr   	merge_qkv	merge_fc1_import_embedding_import_lm_headapply_transformsr[   ro   rp   r   r   rK   rK   rL   rm      s*   	zHFMixtralImporter.convert_stater   c                 C   s   ddl m} || t| S )zConfigures tokenizerr   r   )=nemo.collections.common.tokenizers.huggingface.auto_tokenizerr   save_hf_tokenizer_assetsrA   )r[   r   rK   rK   rL   rW      s   zHFMixtralImporter.tokenizerc              	   C   s(  ddl m} ddl m} |t| }|t| }t}dt|  v r&t}|d%i dt|ddt	j
kdtjd	|jd
|jd|jdt|d|j|j d|jd|jddd|jd|jd|jd|jd|jddddd|jd|jdddd d!dd"dd#t|dt	j
d$|S )&zReturns Mcore config from HFr   )GenerationConfigr   8x22br7   rh   Nr   r   r   r$   kv_channelshead_dimr%   r&   r   r   r6   r    r"   r*   r-   r.   Fr   r   r4   r3   r   Tmake_vocab_size_divisible_by   use_cpu_initializationperform_initializationr8   generation_configrK   )rj   r   r   rk   rA   rP   lowerrQ   getattrrH   rI   rC   rD   num_hidden_layersr   intermediate_sizer    r%   
rope_thetanum_key_value_headsnum_local_expertsnum_experts_per_tokrms_norm_epsinitializer_range)r[   r   HfMixtralConfigrU   r   
config_clsrK   rK   rL   rU      sp   	zHFMixtralImporter.configN)rc   r   )r=   r>   r?   r@   rT   rd   r   rr   rm   propertyrW   rP   rQ   rU   rK   rK   rK   rL   rb      s    $rb   r   model.embed_tokens.weight embedding.word_embeddings.weight)r   r   ctxc                 C   s@   | j jjj}|jd }| jd d|ddf | | jd S )r   r   r   N)ro   modelembed_tokensweightshapetarget_statecopy_)r   	embeddingembedding_weight
vocab_sizerK   rK   rL   r      s   
 
r   lm_head.weightoutput_layer.weightc                 C   s>   | j jj}|jd }| jd d|ddf | | jd S )zimport headr   r   N)ro   lm_headr   r   r   r   )r   r   lm_head_weightr   rK   rK   rL   r   	  s   

 
r   c                   @   sN   e Zd ZdZdddZdedefddZd	d
 Zedd Z	edddZ
dS )HFMixtralExporterzNeMo to HF exporterrc   r   c                 C   sN   ddl m} ddlm} |  || jW  d   S 1 s w   Y  dS )z HFMixtralExporter initializationr   )AutoModelForCausalLM)no_init_weightsN)rj   r   transformers.modeling_utilsr   from_configrU   )r[   r   r   rK   rK   rL   rd     s
   
$zHFMixtralExporter.initrf   c                 C   sH   |   }| t| \}}| ||}| }|| | j| |S )zexport to hf format)rd   	nemo_loadrA   rm   cpusave_pretrainedrW   )r[   rf   rp   ro   _rK   rK   rL   rr   !  s   
zHFMixtralExporter.applyc                 C   sh   ddddddd}t jdd	tjd
t jddtjd
t jddtjd
t jddtjd
g}t j||||dS )zconvert statery   rz   r{   r|   r}   r~   )rs   rt   ru   rv   rw   rx   r   r   r   r   r   r   r   r   r   r   )r   r   r   prune_padding	split_qkv	split_fc1r   r   rK   rK   rL   rm   1  sD   	zHFMixtralExporter.convert_statec                 C   s   t jt| ddjS )zreturn tokenizerr   subpath)r   load_contextrA   rW   re   rK   rK   rL   rW   b  s   zHFMixtralExporter.tokenizerr   c                 C   sd   t jt| dd}ddlm} |dg|j|j|j|j|j|j	|j
|j|j|j|j|j| jj|jdS )zreturn hf-config from mcorezmodel.configr   r   r   r   )architecturesr   r   r   r%   r&   r   r    r   r   r   r   r   r   r   )r   r   rA   rj   r   r   r   r$   r%   r6   r    r"   r*   r-   r4   r3   rW   r   r   )r[   ro   r   rK   rK   rL   rU   g  s&   zHFMixtralExporter.configN)rc   r   )rc   r   )r=   r>   r?   r@   rd   r   rr   rm   r   rW   rU   rK   rK   rK   rL   r     s    
1
r   )r   rM   rP   rQ   rT   )-dataclassesr   pathlibr   typingr   r   r   r   rH   torch.nn.functionalr   
functionalrC   #nemo.collections.llm.gpt.model.baser	   r
   nemo.lightningr   r   nemo.lightning.io.stater   nemo.lightning.pytorch.optimr   rj   r   r   r   1nemo.collections.common.tokenizers.tokenizer_specr   r   rM   rP   rQ   rT   model_importerModelConnectorrb   r   TransformCTXr   r   model_exporterr   __all__rK   rK   rK   rL   <module>   sL   -o
q