o
    }oic7                     @   s   d dl Z d dlZd dlmZ d dlZd dlmZ d dlm	Z	 d dl
mZmZmZ d dlmZ d dlmZ d dlmZmZmZ d d	lmZ d d
lmZ dd ZdddZdd Zdd Zedkrfee  dS dS )    NPath)DistributedDataParallelConfig)AttnBackend)MixtralConfig8x3BMixtralModelPreTrainingDataModule)trainget_nmt_tokenizer)MegatronStrategy
NeMoLoggerTrainer)MegatronOptimizerModule)OptimizerConfigc                 C   s   t dd| |dS )NmegatronGPT2BPETokenizer)
vocab_filemerges_filer
   )
vocab_pathmerges_path r   f/home/ubuntu/.local/lib/python3.10/site-packages/tests/collections/llm/megatron_mixtral_pretraining.py	tokenizer   s   r   Tc                    s~   ddl m} dd l dd lm  m} ddlm} t| |s"|| } || }| } fdd|j	
 D }|j||d |S )Nr   r   )FileSystemReaderc                    s4   i | ]\}}t |jd kr| j|j|jjdqS )TensorStorageMetadata)dtype)type__name__emptysize
propertiesr   ).0ktptorchr   r   
<dictcomp>4   s
    zload_dcp.<locals>.<dictcomp>)storage_reader)pathlibr   r&   torch.distributed.checkpointdistributed
checkpointr   
isinstanceread_metadatastate_dict_metadataitemsload)ckpt_dirtorch_tensorr   dcpr   	fs_readermetadata
state_dictr   r%   r   load_dcp(   s    

r8   c                 C   s  t | jdddtjtjtjtjtddddd dd	}td| j| jd|dd dd}t| j	d	d
dddt
| j| jd}tdsi dd
dddddddddddddddddddddd	d dd!tjd"tjd#tj}d|_tddtjd$dd%d%d&dd&ddd'}t|d(}t|||j
d)}t| jd| jd*}t| j}	|	 rJ d+|	 d,t|d |||d-|d. t| jd/ }	|	 sJ d0|	 d,|	 sJ d0|	 d1g d2}
|
D ]<}|	| }| sJ d0| d,| sJ d0| d3t|tjs	J d0| d4| j sJ d0| d5qt!|	D ]}||
v s,J d6| d7qi d8t"d9dgtjd:fd;t"g d<tjd:fd=t"d
dgtjd:fd>t"g d?tjd:fd@t"d
dgtjd:fdAt"g dBtjd:fdCt"g dDtjd:fdEt"g dFtjd:fdGt"dgtjd:fdHt"d9dgtjd:fdIt"g dJtjd:fdKt"g dJtjd:fdLt"g dJtjd:fdMt"dgtjd:fdNt"dgtjd:fdOt"dgtjd:fdPt"g dQtjd:fi dRt"g dQtjd:fdSt"g dQtjd:fdTt"g dUtjd:fdVt"g dUtjd:fdWt"g dUtjd:fdXt"g dYtjd:fdZt"g dYtjd:fd[t"g dYtjd:fd\t"g d]tjd:fd^t"g d]tjd:fd_t"g d]tjd:fd`t"g datjd:fdbt"g datjd:fdct"g datjd:fddt"g d]tjd:fdet"g d]tjd:fdft"g d]tjd:ft"g dgtjd:ft"g dgtjd:ft"g dgtjd:ft"g dJtjd:ft"g dJtjd:ft"g dJtjd:fdh}t#|	}t$|% }t$|% }t&|t&|ks,J dit&|t&||| f|' D ]\}\}}}||v sDJ d0| djt(|| tj)sUJ d0| dkt&|dkr|*dlr|| j+d|d fksJ d0| dm|| j+ dn|d  don|| j+|ksJ d0| dm|| j+ dp| || j,|ksJ d0| dq|| j, dp| t-|| j.|ksJ d0| dr|| j. dp| q0d S )tN   FT)grad_reduce_in_fp32overlap_grad_reduceuse_distributed_optimizercheck_for_nan_in_gradbucket_size)	expert_model_parallel_sizetensor_model_parallel_sizesequence_parallelcontext_parallel_sizeparams_dtypepipeline_dtypeautocast_dtype	precisionddpgpur   )log_every_n_stepsdevices	max_stepsacceleratorstrategynum_sanity_val_stepsloggerlimit_val_batchesi      z99,1,0)
seq_lengthglobal_batch_sizemicro_batch_sizenum_workerssplitr   
num_layershidden_size   num_attention_heads   num_query_groupsffn_hidden_size@  kv_channels   init_method_stdgQ?hidden_dropoutg?attention_dropoutlayernorm_epsilongh㈵>make_vocab_size_divisible_bymax_position_embeddingsbf16rC   rD   attention_backendg{Gz?g?g        )fp16rg   rC   lrweight_decay
adam_beta1
adam_beta2	clip_gradr<   min_lrlog_num_zeros_in_gradbarrier_with_L1_time)config)optimr   )nameuse_datetime_versionexplicit_log_dirzDid not expect z	 to existdata)modelresumerw   trainerlogr   rs   z>checkpoints/--None=0.0000-epoch=0-consumed_samples=8.0/weightsz	Expected z to be a directory)z__0_0.distcpz__0_1.distcpz	common.ptzmetadata.jsonz	.metadataz to be a filez to be readablez to be non-emptyzGot unexpected z in checkpoint directoryz'module.embedding.word_embeddings.weighti  cpuz7module.decoder.layers.self_attention.linear_proj.weight)rQ   rY   rY   zAmodule.decoder.layers.self_attention.linear_qkv.layer_norm_weightz6module.decoder.layers.self_attention.linear_qkv.weight)rQ   i  rY   z.module.decoder.layers.pre_mlp_layernorm.weightz'module.decoder.layers.mlp.router.weight)rQ   r[   rY   z;module.decoder.layers.mlp.experts.experts.linear_fc1.weight)rQ   r[   i  rY   z;module.decoder.layers.mlp.experts.experts.linear_fc2.weight)rQ   r[   rY   r^   z%module.decoder.final_layernorm.weightzmodule.output_layer.weightz5optimizer.state.fp32_param.module.output_layer.weight)r9   r9   i @b z2optimizer.state.exp_avg.module.output_layer.weightz5optimizer.state.exp_avg_sq.module.output_layer.weightz@optimizer.state.fp32_param.module.decoder.final_layernorm.weightz=optimizer.state.exp_avg.module.decoder.final_layernorm.weightz@optimizer.state.exp_avg_sq.module.decoder.final_layernorm.weightzVoptimizer.state.fp32_param.module.decoder.layers.mlp.experts.experts.linear_fc2.weight)rQ   r[   r9   r9      zSoptimizer.state.exp_avg.module.decoder.layers.mlp.experts.experts.linear_fc2.weightzVoptimizer.state.exp_avg_sq.module.decoder.layers.mlp.experts.experts.linear_fc2.weightzVoptimizer.state.fp32_param.module.decoder.layers.mlp.experts.experts.linear_fc1.weight)rQ   r[   rQ   r9   r}   zSoptimizer.state.exp_avg.module.decoder.layers.mlp.experts.experts.linear_fc1.weightzVoptimizer.state.exp_avg_sq.module.decoder.layers.mlp.experts.experts.linear_fc1.weightzBoptimizer.state.fp32_param.module.decoder.layers.mlp.router.weight)rQ   r9   r9   i   z?optimizer.state.exp_avg.module.decoder.layers.mlp.router.weightzBoptimizer.state.exp_avg_sq.module.decoder.layers.mlp.router.weightzIoptimizer.state.fp32_param.module.decoder.layers.pre_mlp_layernorm.weight)rQ   r9   rY   zFoptimizer.state.exp_avg.module.decoder.layers.pre_mlp_layernorm.weightzIoptimizer.state.exp_avg_sq.module.decoder.layers.pre_mlp_layernorm.weightzQoptimizer.state.fp32_param.module.decoder.layers.self_attention.linear_qkv.weight)rQ   r9   r9   i   zNoptimizer.state.exp_avg.module.decoder.layers.self_attention.linear_qkv.weightzQoptimizer.state.exp_avg_sq.module.decoder.layers.self_attention.linear_qkv.weightz\optimizer.state.fp32_param.module.decoder.layers.self_attention.linear_qkv.layer_norm_weightzYoptimizer.state.exp_avg.module.decoder.layers.self_attention.linear_qkv.layer_norm_weightz\optimizer.state.exp_avg_sq.module.decoder.layers.self_attention.linear_qkv.layer_norm_weight)rQ   r9   r9   i @  )zRoptimizer.state.fp32_param.module.decoder.layers.self_attention.linear_proj.weightzOoptimizer.state.exp_avg.module.decoder.layers.self_attention.linear_proj.weightzRoptimizer.state.exp_avg_sq.module.decoder.layers.self_attention.linear_proj.weightzBoptimizer.state.fp32_param.module.embedding.word_embeddings.weightz?optimizer.state.exp_avg.module.embedding.word_embeddings.weightzBoptimizer.state.exp_avg_sq.module.embedding.word_embeddings.weightzCheckpoint length mismatch z to be in ckptz to be a tensorzoptimizer.statez shapes to match z & (1, )z & z dtype to match z device to match r   )/r   rJ   r&   bfloat16float32McoreDDPConfigr   rK   r   	data_pathr   r   r   r   r   unfused(overlap_param_gather_with_optimizer_stepr   MegatronOptimr   r   experiment_nameexperiment_dirr   existsr	   is_diris_fileosaccessR_OKstatst_sizelistdirSizer8   setkeyslenr0   r-   Tensor
startswithshaper   strdevice)argsrM   rz   rw   mixtral_configoptim_configoptrx   nemo_loggeroutput_pathoutput_filesfilepathexpected_ckptckpt	ckpt_keysexpected_keyskeyr   r   r   r   r   r   mainA   sz  
	


 	
"',16;@EJOTY^chmrx  "
 008r   c                  C   s   t jdd} | jdtddd | jdtdd	d | jd
tddd | jdtddd | jdtdd | jdtd dd | jdtd dd |  S )Nz*Train a small Mixtral model using NeMo 2.0)descriptionz	--devicesr9   z%Number of devices to use for training)r   defaulthelpz--max-steps   zNumber of steps to train forz--experiment-dirz/tmp/exp_dirz-directory to write results and checkpoints toz--experiment-namemini_mixtral_testzname of experimentz--data-pathzPath to data file)r   r   z--vocab-pathzPath to vocab filez--merges-pathzPath to merges file)argparseArgumentParseradd_argumentintr   
parse_args)parserr   r   r   r   a  s   r   __main__)T)r   r   r)   r   r&   megatron.core.distributedr   r   megatron.core.transformer.enumsr   nemo.collections.llmr   r   r   nemo.collections.llm.apir	   3nemo.collections.nlp.modules.common.tokenizer_utilsr   nemo.lightningr   r   r   %nemo.lightning.pytorch.optim.megatronr   r   r   r   r8   r   r   r   r   r   r   r   <module>   s*   
	  "