o
    }oi                     @   s  d dl Z d dlmZ d dlZd dlmZ d dlmZ	 d dl
mZ d dlmZ dd Zde	jfd	d
ZeG dd dZde	jfddZdejfddZdd Zdd ZdejfddZdejfddZdd Zedkre Zejdkrye \ZZne \ZZeej Z!ej"eeej#ej$eej%ej%ej&ej%dkej'e!e e	j(eddd ej)d!d"d#d$d% ej%d&kre* + D ])\Z,Z-e.e-ej/re0e-1 Z2ng Z2d'e,vree, e3e-j e2fksJ qdS dS dS )(    N)OptimizerConfig)	lightning)llm)track_ioc                  C   s   t jdd} | jdtjddgdd | jdtd	d
d | jdtddd | jdtddd | jdtddd | jdtddd | jdddd | jdddd |  S )Nz)Finetune a small GPT model using NeMo 2.0)descriptionz--modelmistralmixtralmodel)typechoiceshelpz--max-steps	   znumber of devices)r
   defaultr   z--mbs   zmicro batch sizez--gbs   zglobal batch sizez--tp   ztensor parallel sizez--epzexpert parallel sizez
--dist-opt
store_truezuse dist opt)actionr   z--use-excludezwill use exclude_modules)argparseArgumentParseradd_argumentstrlowerint
parse_args)parser r   X/home/ubuntu/.local/lib/python3.10/site-packages/tests/collections/llm/lora_mistralai.pyget_args   s   r   returnc                 C   s:   t j|||d}t jt|||d|t jddddddd	S )N)tensor_model_parallel_sizeexpert_model_parallel_sizesequence_parallelgpuz
bf16-mixed)	precisionr   r   )	devices	max_stepsacceleratorstrategypluginslog_every_n_stepslimit_val_batchesval_check_intervalnum_sanity_val_steps)nlMegatronStrategyTrainermaxMegatronMixedPrecision)r%   tpepspr&   r(   r   r   r   trainer'   s    
r6   c                   @   s0   e Zd Zddg dfddZdd Zdd	 Zd
S )OrdTokenizeri0u     )bos_ideos_idpad_idc                 C   s(   || _ || _|| _t| j|k sJ d S N)
vocab_sizenum_reserved_tokensspecial_token_nameslen)selfr=   r>   r?   r   r   r   __init__=   s   zOrdTokenizer.__init__c                 C   s:   || j di v r| j d |S || j v r| j | S t)Nr?   )__dict__getindexAttributeError)rA   namer   r   r   __getattr__C   s
   

zOrdTokenizer.__getattr__c                    s0   t t fddt |}t| jk sJ |S )Nc                    s    j t|  S r<   )r>   ord)xrA   r   r   <lambda>L   s    z*OrdTokenizer.text_to_ids.<locals>.<lambda>)listmapr1   r=   )rA   text	token_idsr   rK   r   text_to_idsK   s   zOrdTokenizer.text_to_idsN)__name__
__module____qualname__rB   rH   rQ   r   r   r   r   r7   ;   s    r7   c                  C   s*   t jddddddd} t jddd| d d	S )
NT
   reduced_train_lossr   )	save_lastevery_n_train_stepsmonitor
save_top_ksave_on_train_epoch_endsave_optim_on_train_end
nemo2_peftz/tmp/peft_logsF)rG   log_diruse_datetime_versionckptwandb)r.   ModelCheckpoint
NeMoLogger)r`   r   r   r   loggerQ   s   	rd   c                 C   s   t jd| |ddS )N   r   )
seq_lengthmicro_batch_sizeglobal_batch_sizenum_workers)r   SquadDataModule)mbsgbsr   r   r   squadd   s   rm   c                
   C   s  i ddddgfdddgfddddgfd	ddgfd
dddgfddg fddddgfddg fdddgfddddgfdddgfddddgfddg fddddgfddg fdddgfddddgfi dddgfddddgfddg fddddgfd dg fd!ddd"gfd#ddgfd$ddd"gfd%dg fd&dddgfd'dg fd(ddgfd)dddgfd*ddgfd+dddgfd,dg fd-dddgfi d.dg fd/ddgfd0dddgfd1ddgfd2dddgfd3dg fd4dddgfd5dg fd6ddgfd7dddgfd8ddgfd9dddgfd:dg fd;dddgfd<dg fd=ddd"gfd>ddgfddd"gfdg fdddgfdg fddgfdd?gfdddgfdg fd@S )AN'module.embedding.word_embeddings.weightTensor ;     Bmodule.decoder.layers.0.self_attention.core_attention._extra_stater   9module.decoder.layers.0.self_attention.linear_proj.weightre   ?module.decoder.layers.0.self_attention.linear_proj._extra_stateKmodule.decoder.layers.0.self_attention.linear_proj.adapter.linear_in.weight    Qmodule.decoder.layers.0.self_attention.linear_proj.adapter.linear_in._extra_stateNoneTypeLmodule.decoder.layers.0.self_attention.linear_proj.adapter.linear_out.weightRmodule.decoder.layers.0.self_attention.linear_proj.adapter.linear_out._extra_stateCmodule.decoder.layers.0.self_attention.linear_qkv.layer_norm_weight8module.decoder.layers.0.self_attention.linear_qkv.weight   >module.decoder.layers.0.self_attention.linear_qkv._extra_stateJmodule.decoder.layers.0.self_attention.linear_qkv.adapter.linear_in.weight   Pmodule.decoder.layers.0.self_attention.linear_qkv.adapter.linear_in._extra_stateKmodule.decoder.layers.0.self_attention.linear_qkv.adapter.linear_out.weightQmodule.decoder.layers.0.self_attention.linear_qkv.adapter.linear_out._extra_statez8module.decoder.layers.0.mlp.linear_fc1.layer_norm_weightz-module.decoder.layers.0.mlp.linear_fc1.weight 8  z3module.decoder.layers.0.mlp.linear_fc1._extra_statez?module.decoder.layers.0.mlp.linear_fc1.adapter.linear_in.weightzEmodule.decoder.layers.0.mlp.linear_fc1.adapter.linear_in._extra_statez@module.decoder.layers.0.mlp.linear_fc1.adapter.linear_out.weightzFmodule.decoder.layers.0.mlp.linear_fc1.adapter.linear_out._extra_statez-module.decoder.layers.0.mlp.linear_fc2.weight   z3module.decoder.layers.0.mlp.linear_fc2._extra_statez?module.decoder.layers.0.mlp.linear_fc2.adapter.linear_in.weightzEmodule.decoder.layers.0.mlp.linear_fc2.adapter.linear_in._extra_statez@module.decoder.layers.0.mlp.linear_fc2.adapter.linear_out.weightzFmodule.decoder.layers.0.mlp.linear_fc2.adapter.linear_out._extra_stateBmodule.decoder.layers.1.self_attention.core_attention._extra_state9module.decoder.layers.1.self_attention.linear_proj.weight?module.decoder.layers.1.self_attention.linear_proj._extra_stateKmodule.decoder.layers.1.self_attention.linear_proj.adapter.linear_in.weightQmodule.decoder.layers.1.self_attention.linear_proj.adapter.linear_in._extra_stateLmodule.decoder.layers.1.self_attention.linear_proj.adapter.linear_out.weightRmodule.decoder.layers.1.self_attention.linear_proj.adapter.linear_out._extra_stateCmodule.decoder.layers.1.self_attention.linear_qkv.layer_norm_weight8module.decoder.layers.1.self_attention.linear_qkv.weight>module.decoder.layers.1.self_attention.linear_qkv._extra_stateJmodule.decoder.layers.1.self_attention.linear_qkv.adapter.linear_in.weightPmodule.decoder.layers.1.self_attention.linear_qkv.adapter.linear_in._extra_stateKmodule.decoder.layers.1.self_attention.linear_qkv.adapter.linear_out.weightQmodule.decoder.layers.1.self_attention.linear_qkv.adapter.linear_out._extra_statez8module.decoder.layers.1.mlp.linear_fc1.layer_norm_weightz-module.decoder.layers.1.mlp.linear_fc1.weightz3module.decoder.layers.1.mlp.linear_fc1._extra_statez?module.decoder.layers.1.mlp.linear_fc1.adapter.linear_in.weightzEmodule.decoder.layers.1.mlp.linear_fc1.adapter.linear_in._extra_statez@module.decoder.layers.1.mlp.linear_fc1.adapter.linear_out.weightzFmodule.decoder.layers.1.mlp.linear_fc1.adapter.linear_out._extra_statez-module.decoder.layers.1.mlp.linear_fc2.weightz3module.decoder.layers.1.mlp.linear_fc2._extra_stater   )z?module.decoder.layers.1.mlp.linear_fc2.adapter.linear_in.weightzEmodule.decoder.layers.1.mlp.linear_fc2.adapter.linear_in._extra_statez@module.decoder.layers.1.mlp.linear_fc2.adapter.linear_out.weightzFmodule.decoder.layers.1.mlp.linear_fc2.adapter.linear_out._extra_state%module.decoder.final_layernorm.weight+module.decoder.final_layernorm._extra_statemodule.output_layer.weight module.output_layer._extra_stater   r   r   r   r   get_mistral_expected_ckpth   s   


	






 !"#
$%
&'()*
+,
-./012
3
4

r   c                   C   s4  i ddddgfdddgfddddgfd	ddgfd
dddgfddg fddddgfddg fdddgfddddgfdddgfddddgfddg fddddgfddg fdddgfdddgfi ddddgfddddgfd ddgfd!dddgfd"dg fd#dddgfd$dg fd%ddd&gfd'ddgfd(ddd&gfd)dg fd*dddgfd+dg fd,dddgfd-ddgfd.dddgfd/dg fi d0dddgfd1dg fd2ddd&gfd3ddgfd4ddd&gfd5dg fd6dddgfd7dg fd8dddgfd9ddgfd:dddgfd;dg fd<dddgfd=dg fd>ddd&gfd?ddgfd@ddd&gfi dAdg fdBdddgfdCdg fdDdddgfdEddgfdFdddgfdGdg fdHdddgfdIdg fdJddd&gfdKddgfdLddd&gfdMdg fdNdddgfdOdg fdPdddgfdQddgfi dRdddgfdSdg fdTdddgfdUdg fdVddd&gfdWddgfdXddd&gfdYdg fdZdddgfd[dg fd\dddgfd]ddgfd^dddgfd_dg fd`dddgfdadg fdbddd&gfi dcddgfddddd&gfdedg fdfdddgfdgdg fdhdddgfdiddgfdjdddgfdkdg fdldddgfdmdg fdnddd&gfdoddgfdpddd&gfdqdg fdrdddgfdsdg fi dtdddgfduddgfdvdddgfdwdg fdxdddgfdydg fdzddd&gfd{ddgfd|ddd&gfd}dg fd~dddgfddg fdddgfddddgfdddgfddddgfddg fi ddddgfddg fdddgfddddgfdddgfddddgfddg fddddgfddg fdddgfdddgfddddgfddddgfdddgfddddgfddg fddddgfi ddg fdddd&gfdddgfdddd&gfddg fddddgfddg fddddgfdddgfddddgfddg fddddgfddg fdddd&gfdddgfdddd&gfddg fi ddddgfddg fddddgfdddgfddddgfddg fddddgfddg fdddd&gfdddgfdddd&gfddg fddddgfddg fddddgfdddgfddddgfi ddg fddddgfddg fdddd&gfdddgfdddd&gfddg fddddgfddg fddddgfdddgfddddgfddg fddddgfddg fdddd&gfdddgfi dddd&gfddg fddddgfddg fddddgfdddgfddddgfddg fddddgfddg fdddd&gfdddgfdddd&gfddg fddddgfddg fddddgfi dddgfddddgfddg fddddgfddg fdddd&gfdddgfdddd&gfddg fddddgfddg fddddgfdddgfddddgfddg fddddgfddg fddd&gfddgfddd&gfdg fdddgfdg fddgfddgfdddgfdg fd
S )Nrn   ro   rp   rq   rr   r   rs   re   rt   ru   rv   rw   rx   ry   rz   r{   r|   r}   r~   r   r   r   r   r   z0module.decoder.layers.0.pre_mlp_layernorm.weightz6module.decoder.layers.0.pre_mlp_layernorm._extra_stater   z)module.decoder.layers.0.mlp.router.weight   zEmodule.decoder.layers.0.mlp.experts.local_experts.0.linear_fc1.weightr   zKmodule.decoder.layers.0.mlp.experts.local_experts.0.linear_fc1._extra_statezWmodule.decoder.layers.0.mlp.experts.local_experts.0.linear_fc1.adapter.linear_in.weightz]module.decoder.layers.0.mlp.experts.local_experts.0.linear_fc1.adapter.linear_in._extra_statezXmodule.decoder.layers.0.mlp.experts.local_experts.0.linear_fc1.adapter.linear_out.weightz^module.decoder.layers.0.mlp.experts.local_experts.0.linear_fc1.adapter.linear_out._extra_statezEmodule.decoder.layers.0.mlp.experts.local_experts.0.linear_fc2.weightr   zKmodule.decoder.layers.0.mlp.experts.local_experts.0.linear_fc2._extra_statezWmodule.decoder.layers.0.mlp.experts.local_experts.0.linear_fc2.adapter.linear_in.weightz]module.decoder.layers.0.mlp.experts.local_experts.0.linear_fc2.adapter.linear_in._extra_statezXmodule.decoder.layers.0.mlp.experts.local_experts.0.linear_fc2.adapter.linear_out.weightz^module.decoder.layers.0.mlp.experts.local_experts.0.linear_fc2.adapter.linear_out._extra_statezEmodule.decoder.layers.0.mlp.experts.local_experts.1.linear_fc1.weightzKmodule.decoder.layers.0.mlp.experts.local_experts.1.linear_fc1._extra_statezWmodule.decoder.layers.0.mlp.experts.local_experts.1.linear_fc1.adapter.linear_in.weightz]module.decoder.layers.0.mlp.experts.local_experts.1.linear_fc1.adapter.linear_in._extra_statezXmodule.decoder.layers.0.mlp.experts.local_experts.1.linear_fc1.adapter.linear_out.weightz^module.decoder.layers.0.mlp.experts.local_experts.1.linear_fc1.adapter.linear_out._extra_statezEmodule.decoder.layers.0.mlp.experts.local_experts.1.linear_fc2.weightzKmodule.decoder.layers.0.mlp.experts.local_experts.1.linear_fc2._extra_statezWmodule.decoder.layers.0.mlp.experts.local_experts.1.linear_fc2.adapter.linear_in.weightz]module.decoder.layers.0.mlp.experts.local_experts.1.linear_fc2.adapter.linear_in._extra_statezXmodule.decoder.layers.0.mlp.experts.local_experts.1.linear_fc2.adapter.linear_out.weightz^module.decoder.layers.0.mlp.experts.local_experts.1.linear_fc2.adapter.linear_out._extra_statezEmodule.decoder.layers.0.mlp.experts.local_experts.2.linear_fc1.weightzKmodule.decoder.layers.0.mlp.experts.local_experts.2.linear_fc1._extra_statezWmodule.decoder.layers.0.mlp.experts.local_experts.2.linear_fc1.adapter.linear_in.weightz]module.decoder.layers.0.mlp.experts.local_experts.2.linear_fc1.adapter.linear_in._extra_statezXmodule.decoder.layers.0.mlp.experts.local_experts.2.linear_fc1.adapter.linear_out.weightz^module.decoder.layers.0.mlp.experts.local_experts.2.linear_fc1.adapter.linear_out._extra_statezEmodule.decoder.layers.0.mlp.experts.local_experts.2.linear_fc2.weightzKmodule.decoder.layers.0.mlp.experts.local_experts.2.linear_fc2._extra_statezWmodule.decoder.layers.0.mlp.experts.local_experts.2.linear_fc2.adapter.linear_in.weightz]module.decoder.layers.0.mlp.experts.local_experts.2.linear_fc2.adapter.linear_in._extra_statezXmodule.decoder.layers.0.mlp.experts.local_experts.2.linear_fc2.adapter.linear_out.weightz^module.decoder.layers.0.mlp.experts.local_experts.2.linear_fc2.adapter.linear_out._extra_statezEmodule.decoder.layers.0.mlp.experts.local_experts.3.linear_fc1.weightzKmodule.decoder.layers.0.mlp.experts.local_experts.3.linear_fc1._extra_statezWmodule.decoder.layers.0.mlp.experts.local_experts.3.linear_fc1.adapter.linear_in.weightz]module.decoder.layers.0.mlp.experts.local_experts.3.linear_fc1.adapter.linear_in._extra_statezXmodule.decoder.layers.0.mlp.experts.local_experts.3.linear_fc1.adapter.linear_out.weightz^module.decoder.layers.0.mlp.experts.local_experts.3.linear_fc1.adapter.linear_out._extra_statezEmodule.decoder.layers.0.mlp.experts.local_experts.3.linear_fc2.weightzKmodule.decoder.layers.0.mlp.experts.local_experts.3.linear_fc2._extra_statezWmodule.decoder.layers.0.mlp.experts.local_experts.3.linear_fc2.adapter.linear_in.weightz]module.decoder.layers.0.mlp.experts.local_experts.3.linear_fc2.adapter.linear_in._extra_statezXmodule.decoder.layers.0.mlp.experts.local_experts.3.linear_fc2.adapter.linear_out.weightz^module.decoder.layers.0.mlp.experts.local_experts.3.linear_fc2.adapter.linear_out._extra_statezEmodule.decoder.layers.0.mlp.experts.local_experts.4.linear_fc1.weightzKmodule.decoder.layers.0.mlp.experts.local_experts.4.linear_fc1._extra_statezWmodule.decoder.layers.0.mlp.experts.local_experts.4.linear_fc1.adapter.linear_in.weightz]module.decoder.layers.0.mlp.experts.local_experts.4.linear_fc1.adapter.linear_in._extra_statezXmodule.decoder.layers.0.mlp.experts.local_experts.4.linear_fc1.adapter.linear_out.weightz^module.decoder.layers.0.mlp.experts.local_experts.4.linear_fc1.adapter.linear_out._extra_statezEmodule.decoder.layers.0.mlp.experts.local_experts.4.linear_fc2.weightzKmodule.decoder.layers.0.mlp.experts.local_experts.4.linear_fc2._extra_statezWmodule.decoder.layers.0.mlp.experts.local_experts.4.linear_fc2.adapter.linear_in.weightz]module.decoder.layers.0.mlp.experts.local_experts.4.linear_fc2.adapter.linear_in._extra_statezXmodule.decoder.layers.0.mlp.experts.local_experts.4.linear_fc2.adapter.linear_out.weightz^module.decoder.layers.0.mlp.experts.local_experts.4.linear_fc2.adapter.linear_out._extra_statezEmodule.decoder.layers.0.mlp.experts.local_experts.5.linear_fc1.weightzKmodule.decoder.layers.0.mlp.experts.local_experts.5.linear_fc1._extra_statezWmodule.decoder.layers.0.mlp.experts.local_experts.5.linear_fc1.adapter.linear_in.weightz]module.decoder.layers.0.mlp.experts.local_experts.5.linear_fc1.adapter.linear_in._extra_statezXmodule.decoder.layers.0.mlp.experts.local_experts.5.linear_fc1.adapter.linear_out.weightz^module.decoder.layers.0.mlp.experts.local_experts.5.linear_fc1.adapter.linear_out._extra_statezEmodule.decoder.layers.0.mlp.experts.local_experts.5.linear_fc2.weightzKmodule.decoder.layers.0.mlp.experts.local_experts.5.linear_fc2._extra_statezWmodule.decoder.layers.0.mlp.experts.local_experts.5.linear_fc2.adapter.linear_in.weightz]module.decoder.layers.0.mlp.experts.local_experts.5.linear_fc2.adapter.linear_in._extra_statezXmodule.decoder.layers.0.mlp.experts.local_experts.5.linear_fc2.adapter.linear_out.weightz^module.decoder.layers.0.mlp.experts.local_experts.5.linear_fc2.adapter.linear_out._extra_statezEmodule.decoder.layers.0.mlp.experts.local_experts.6.linear_fc1.weightzKmodule.decoder.layers.0.mlp.experts.local_experts.6.linear_fc1._extra_statezWmodule.decoder.layers.0.mlp.experts.local_experts.6.linear_fc1.adapter.linear_in.weightz]module.decoder.layers.0.mlp.experts.local_experts.6.linear_fc1.adapter.linear_in._extra_statezXmodule.decoder.layers.0.mlp.experts.local_experts.6.linear_fc1.adapter.linear_out.weightz^module.decoder.layers.0.mlp.experts.local_experts.6.linear_fc1.adapter.linear_out._extra_statezEmodule.decoder.layers.0.mlp.experts.local_experts.6.linear_fc2.weightzKmodule.decoder.layers.0.mlp.experts.local_experts.6.linear_fc2._extra_statezWmodule.decoder.layers.0.mlp.experts.local_experts.6.linear_fc2.adapter.linear_in.weightz]module.decoder.layers.0.mlp.experts.local_experts.6.linear_fc2.adapter.linear_in._extra_statezXmodule.decoder.layers.0.mlp.experts.local_experts.6.linear_fc2.adapter.linear_out.weightz^module.decoder.layers.0.mlp.experts.local_experts.6.linear_fc2.adapter.linear_out._extra_statezEmodule.decoder.layers.0.mlp.experts.local_experts.7.linear_fc1.weightzKmodule.decoder.layers.0.mlp.experts.local_experts.7.linear_fc1._extra_statezWmodule.decoder.layers.0.mlp.experts.local_experts.7.linear_fc1.adapter.linear_in.weightz]module.decoder.layers.0.mlp.experts.local_experts.7.linear_fc1.adapter.linear_in._extra_statezXmodule.decoder.layers.0.mlp.experts.local_experts.7.linear_fc1.adapter.linear_out.weightz^module.decoder.layers.0.mlp.experts.local_experts.7.linear_fc1.adapter.linear_out._extra_statezEmodule.decoder.layers.0.mlp.experts.local_experts.7.linear_fc2.weightzKmodule.decoder.layers.0.mlp.experts.local_experts.7.linear_fc2._extra_statezWmodule.decoder.layers.0.mlp.experts.local_experts.7.linear_fc2.adapter.linear_in.weightz]module.decoder.layers.0.mlp.experts.local_experts.7.linear_fc2.adapter.linear_in._extra_statezXmodule.decoder.layers.0.mlp.experts.local_experts.7.linear_fc2.adapter.linear_out.weightz^module.decoder.layers.0.mlp.experts.local_experts.7.linear_fc2.adapter.linear_out._extra_stater   r   r   r   r   r   r   r   r   r   r   r   r   r   z0module.decoder.layers.1.pre_mlp_layernorm.weightz6module.decoder.layers.1.pre_mlp_layernorm._extra_statez)module.decoder.layers.1.mlp.router.weightzEmodule.decoder.layers.1.mlp.experts.local_experts.0.linear_fc1.weightzKmodule.decoder.layers.1.mlp.experts.local_experts.0.linear_fc1._extra_statezWmodule.decoder.layers.1.mlp.experts.local_experts.0.linear_fc1.adapter.linear_in.weightz]module.decoder.layers.1.mlp.experts.local_experts.0.linear_fc1.adapter.linear_in._extra_statezXmodule.decoder.layers.1.mlp.experts.local_experts.0.linear_fc1.adapter.linear_out.weightz^module.decoder.layers.1.mlp.experts.local_experts.0.linear_fc1.adapter.linear_out._extra_statezEmodule.decoder.layers.1.mlp.experts.local_experts.0.linear_fc2.weightzKmodule.decoder.layers.1.mlp.experts.local_experts.0.linear_fc2._extra_statezWmodule.decoder.layers.1.mlp.experts.local_experts.0.linear_fc2.adapter.linear_in.weightz]module.decoder.layers.1.mlp.experts.local_experts.0.linear_fc2.adapter.linear_in._extra_statezXmodule.decoder.layers.1.mlp.experts.local_experts.0.linear_fc2.adapter.linear_out.weightz^module.decoder.layers.1.mlp.experts.local_experts.0.linear_fc2.adapter.linear_out._extra_statezEmodule.decoder.layers.1.mlp.experts.local_experts.1.linear_fc1.weightzKmodule.decoder.layers.1.mlp.experts.local_experts.1.linear_fc1._extra_statezWmodule.decoder.layers.1.mlp.experts.local_experts.1.linear_fc1.adapter.linear_in.weightz]module.decoder.layers.1.mlp.experts.local_experts.1.linear_fc1.adapter.linear_in._extra_statezXmodule.decoder.layers.1.mlp.experts.local_experts.1.linear_fc1.adapter.linear_out.weightz^module.decoder.layers.1.mlp.experts.local_experts.1.linear_fc1.adapter.linear_out._extra_statezEmodule.decoder.layers.1.mlp.experts.local_experts.1.linear_fc2.weightzKmodule.decoder.layers.1.mlp.experts.local_experts.1.linear_fc2._extra_statezWmodule.decoder.layers.1.mlp.experts.local_experts.1.linear_fc2.adapter.linear_in.weightz]module.decoder.layers.1.mlp.experts.local_experts.1.linear_fc2.adapter.linear_in._extra_statezXmodule.decoder.layers.1.mlp.experts.local_experts.1.linear_fc2.adapter.linear_out.weightz^module.decoder.layers.1.mlp.experts.local_experts.1.linear_fc2.adapter.linear_out._extra_statezEmodule.decoder.layers.1.mlp.experts.local_experts.2.linear_fc1.weightzKmodule.decoder.layers.1.mlp.experts.local_experts.2.linear_fc1._extra_statezWmodule.decoder.layers.1.mlp.experts.local_experts.2.linear_fc1.adapter.linear_in.weightz]module.decoder.layers.1.mlp.experts.local_experts.2.linear_fc1.adapter.linear_in._extra_statezXmodule.decoder.layers.1.mlp.experts.local_experts.2.linear_fc1.adapter.linear_out.weightz^module.decoder.layers.1.mlp.experts.local_experts.2.linear_fc1.adapter.linear_out._extra_statezEmodule.decoder.layers.1.mlp.experts.local_experts.2.linear_fc2.weightzKmodule.decoder.layers.1.mlp.experts.local_experts.2.linear_fc2._extra_statezWmodule.decoder.layers.1.mlp.experts.local_experts.2.linear_fc2.adapter.linear_in.weightz]module.decoder.layers.1.mlp.experts.local_experts.2.linear_fc2.adapter.linear_in._extra_statezXmodule.decoder.layers.1.mlp.experts.local_experts.2.linear_fc2.adapter.linear_out.weightz^module.decoder.layers.1.mlp.experts.local_experts.2.linear_fc2.adapter.linear_out._extra_statezEmodule.decoder.layers.1.mlp.experts.local_experts.3.linear_fc1.weightzKmodule.decoder.layers.1.mlp.experts.local_experts.3.linear_fc1._extra_statezWmodule.decoder.layers.1.mlp.experts.local_experts.3.linear_fc1.adapter.linear_in.weightz]module.decoder.layers.1.mlp.experts.local_experts.3.linear_fc1.adapter.linear_in._extra_statezXmodule.decoder.layers.1.mlp.experts.local_experts.3.linear_fc1.adapter.linear_out.weightz^module.decoder.layers.1.mlp.experts.local_experts.3.linear_fc1.adapter.linear_out._extra_statezEmodule.decoder.layers.1.mlp.experts.local_experts.3.linear_fc2.weightzKmodule.decoder.layers.1.mlp.experts.local_experts.3.linear_fc2._extra_statezWmodule.decoder.layers.1.mlp.experts.local_experts.3.linear_fc2.adapter.linear_in.weightz]module.decoder.layers.1.mlp.experts.local_experts.3.linear_fc2.adapter.linear_in._extra_statezXmodule.decoder.layers.1.mlp.experts.local_experts.3.linear_fc2.adapter.linear_out.weightz^module.decoder.layers.1.mlp.experts.local_experts.3.linear_fc2.adapter.linear_out._extra_statezEmodule.decoder.layers.1.mlp.experts.local_experts.4.linear_fc1.weightzKmodule.decoder.layers.1.mlp.experts.local_experts.4.linear_fc1._extra_statezWmodule.decoder.layers.1.mlp.experts.local_experts.4.linear_fc1.adapter.linear_in.weightz]module.decoder.layers.1.mlp.experts.local_experts.4.linear_fc1.adapter.linear_in._extra_statezXmodule.decoder.layers.1.mlp.experts.local_experts.4.linear_fc1.adapter.linear_out.weightz^module.decoder.layers.1.mlp.experts.local_experts.4.linear_fc1.adapter.linear_out._extra_statezEmodule.decoder.layers.1.mlp.experts.local_experts.4.linear_fc2.weightzKmodule.decoder.layers.1.mlp.experts.local_experts.4.linear_fc2._extra_statezWmodule.decoder.layers.1.mlp.experts.local_experts.4.linear_fc2.adapter.linear_in.weightz]module.decoder.layers.1.mlp.experts.local_experts.4.linear_fc2.adapter.linear_in._extra_statezXmodule.decoder.layers.1.mlp.experts.local_experts.4.linear_fc2.adapter.linear_out.weightz^module.decoder.layers.1.mlp.experts.local_experts.4.linear_fc2.adapter.linear_out._extra_statezEmodule.decoder.layers.1.mlp.experts.local_experts.5.linear_fc1.weightzKmodule.decoder.layers.1.mlp.experts.local_experts.5.linear_fc1._extra_statezWmodule.decoder.layers.1.mlp.experts.local_experts.5.linear_fc1.adapter.linear_in.weightz]module.decoder.layers.1.mlp.experts.local_experts.5.linear_fc1.adapter.linear_in._extra_statezXmodule.decoder.layers.1.mlp.experts.local_experts.5.linear_fc1.adapter.linear_out.weightz^module.decoder.layers.1.mlp.experts.local_experts.5.linear_fc1.adapter.linear_out._extra_statezEmodule.decoder.layers.1.mlp.experts.local_experts.5.linear_fc2.weightzKmodule.decoder.layers.1.mlp.experts.local_experts.5.linear_fc2._extra_statezWmodule.decoder.layers.1.mlp.experts.local_experts.5.linear_fc2.adapter.linear_in.weightz]module.decoder.layers.1.mlp.experts.local_experts.5.linear_fc2.adapter.linear_in._extra_statezXmodule.decoder.layers.1.mlp.experts.local_experts.5.linear_fc2.adapter.linear_out.weightz^module.decoder.layers.1.mlp.experts.local_experts.5.linear_fc2.adapter.linear_out._extra_statezEmodule.decoder.layers.1.mlp.experts.local_experts.6.linear_fc1.weightzKmodule.decoder.layers.1.mlp.experts.local_experts.6.linear_fc1._extra_statezWmodule.decoder.layers.1.mlp.experts.local_experts.6.linear_fc1.adapter.linear_in.weightz]module.decoder.layers.1.mlp.experts.local_experts.6.linear_fc1.adapter.linear_in._extra_statezXmodule.decoder.layers.1.mlp.experts.local_experts.6.linear_fc1.adapter.linear_out.weightz^module.decoder.layers.1.mlp.experts.local_experts.6.linear_fc1.adapter.linear_out._extra_statezEmodule.decoder.layers.1.mlp.experts.local_experts.6.linear_fc2.weightzKmodule.decoder.layers.1.mlp.experts.local_experts.6.linear_fc2._extra_statezWmodule.decoder.layers.1.mlp.experts.local_experts.6.linear_fc2.adapter.linear_in.weightz]module.decoder.layers.1.mlp.experts.local_experts.6.linear_fc2.adapter.linear_in._extra_statezXmodule.decoder.layers.1.mlp.experts.local_experts.6.linear_fc2.adapter.linear_out.weightz^module.decoder.layers.1.mlp.experts.local_experts.6.linear_fc2.adapter.linear_out._extra_statezEmodule.decoder.layers.1.mlp.experts.local_experts.7.linear_fc1.weightzKmodule.decoder.layers.1.mlp.experts.local_experts.7.linear_fc1._extra_statezWmodule.decoder.layers.1.mlp.experts.local_experts.7.linear_fc1.adapter.linear_in.weightz]module.decoder.layers.1.mlp.experts.local_experts.7.linear_fc1.adapter.linear_in._extra_statezXmodule.decoder.layers.1.mlp.experts.local_experts.7.linear_fc1.adapter.linear_out.weightz^module.decoder.layers.1.mlp.experts.local_experts.7.linear_fc1.adapter.linear_out._extra_state)
zEmodule.decoder.layers.1.mlp.experts.local_experts.7.linear_fc2.weightzKmodule.decoder.layers.1.mlp.experts.local_experts.7.linear_fc2._extra_statezWmodule.decoder.layers.1.mlp.experts.local_experts.7.linear_fc2.adapter.linear_in.weightz]module.decoder.layers.1.mlp.experts.local_experts.7.linear_fc2.adapter.linear_in._extra_statezXmodule.decoder.layers.1.mlp.experts.local_experts.7.linear_fc2.adapter.linear_out.weightz^module.decoder.layers.1.mlp.experts.local_experts.7.linear_fc2.adapter.linear_out._extra_stater   r   r   r   r   r   r   r   r   get_mixtral_expected_ckpt   s  


	




!%
&'+/37
89=AEI
JKOSW[
\]aeim
nosw{ 
      
      
           $  
%  &  *  .  2  6  
7  8  <  @  D  H  
I  J  N  R  V  Z  
[  \  `  d  h  l  
m  n  r  v  z  ~  
           	            
                    #    
$    %    )    -    1    
5    6    
7    8    9    :    ;    
<    =    
>    ?    @    A    B    
C    
D    E    F    
G    H    L    P    T    X    
Y    Z    ^    b    f    j    
k    l    p    t    x    |    
}    ~                             
                              !      
"      #      '      +      /      3      
4      5      9      =      A      E      
F      G      K      O      S      W      
X      Y      ]      a      e      i      
j      k      o      s      w      {      
|      }                       
                
                                                 
!        "        &        *        .        2        
3        4        8        <        @        D        
E        F        J        N        R        
V
    r   c                  C   &   t  } tjtjdd| d}|t fS Nr   )
num_layers)	tokenizer)r7   r   MixtralModelMixtralConfig8x7Br   r   r	   r   r   r   mixtral_8x7b     
r   c                  C   r   r   )r7   r   MistralModelMistralConfig7Br   r   r   r   r   
mistral_7b  r   r   c                 C   s    | rt jjdgg dS t j S )Noutput_layer)exclude_modulestarget_modules)r   peftLoRA)use_excluder   r   r   	make_lora   s   
r   __main__r   r   adamg-C6?g\(\?g      ?T)	optimizerlr
adam_beta2use_distributed_optimizer	clip_gradbf16)config)r	   datar6   r   logoptimr   _extra_state)4r   lightning.pytorchpytorchpltorchmegatron.core.optimizerr   nemor   r.   nemo.collectionsr   nemo.lightning.io.mixinr   r   r0   r6   r7   rc   rd   LightningDataModulerm   r   r   LightningModuler   r   r   rR   argsr	   ref_ckptr   lorafinetunerk   rl   r3   r4   r&   MegatronOptimizerModuledist_opt
state_dictitemskeyval
isinstancero   rM   sizeshaper
   r   r   r   r   <module>   sp   @    p



