o
    }oi`2                  "   @   s  d Z ddlmZ ddlmZ ddlmZmZmZm	Z	 ddl
mZ ddlZddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlm Z  ddl!m"Z" ddl#m$Z$ e"d\Z%Z&e"d\Z'Z(e(rzddl)m*Z* e"d\Z'Z+e"d\Z'Z,e(re+re,rddl-m.Z. erddl
mZ/ ddl0m1Z1 ddl2m3Z3 ddgZ4dej5dej5fddZ6de	ej5ej7f de	ej5ej7f fddZ8d ej9d!e:ddfd"dZ;	#	#			#	#	#	$		%			d>d&e:d'e<d(e<d)e<dB d*e<dB d+e<d,e<d-e<d.e=d/e:dB d0e=d1e>dB d2e>dB d3e>dB de?e	ej@ejAf ejBf fd4dZC	d?d ejDd!ee: d5ed6 ddeB fd7d8ZEd d9d!e:d:d;fd<d=ZFdS )@z>Utility functions for loading models with modelopt layer spec.    )nullcontextpartial)TYPE_CHECKINGCallableOptionalUnionN)_WrappingCheckpointIO)StrictHandling)	lightning)llm) _setup_trainer_and_restore_model)ckpt_to_context_subdir)ckpt_to_weights_subdir)logging)safe_import)unwrap_modelzmodelopt.torch.opttransformer_engineget_gpt_modelopt_spec	mamba_ssmcausal_conv1d)get_mamba_stack_modelopt_spec)CheckpointIO)MegatronParallel#set_modelopt_spec_if_exists_in_ckpt2setup_trainer_and_restore_model_with_modelopt_spec	model_cfgreturnc                 C   s   t d t| tjsJ dzddlm} ddlm} ||d| j	d}W n t
y=   ddlm} || jd| j	d	}Y nw || _| S )
z9Set model.config.transformer_layer_spec to modelopt spec.z@Setting model.config.transformer_layer_spec to gpt_modelopt_speczmodel_cfg must be a GPTConfigr   r   r   T)remap_te_layernorm
qk_l2_norm)get_gpt_layer_modelopt_spec)num_expertsr   r    )r   info
isinstancer   	GPTConfig	functoolsr   4megatron.core.post_training.modelopt.gpt.model_specsr   r    ImportError8megatron.core.inference.modelopt_support.gpt.model_specsr!   num_moe_expertstransformer_layer_spec)r   r   r   modelopt_specr!    r-   ]/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/llm/modelopt/model_utils.py_set_gpt_modelopt_spec:   s   


r/   c                 C   sX   t d t| tjrttdd| _| S t| tjr#tt	dd| _
| S tdt|  )a  
    Set the model layer spec to a modelopt spec variant. This function updates the model
    config with the appropriate modelopt layer specification based on the model type.

    Args:
        model_cfg (Union[llm.GPTConfig, llm.SSMConfig]): The model config.

    Returns:
        Union[llm.GPTConfig, llm.SSMConfig]: The model config updated for the modelopt layer specification.
    z<Setting model layer specification to the modelopt layer specT)r   z1No modelopt layer spec supported for config type )r   r#   r$   r   r%   r   r   r+   	SSMConfigr   mamba_stack_spec
ValueErrortype)r   r-   r-   r.   _set_gpt_mamba_modelopt_specO   s   
r4   modelpathc                 C   st   t |d}t|ddd }| rt| drdS t| tjtjfr.t	| j
 d| j
_dS tt|  d dS )zdSet model.config.transformer_layer_spec to modelopt spec if modelopt_state exists in the checkpoint.znemo://F	is_savingmodelopt_statemoduleNzI is neither a GPTModel nor MambaModel. Modelopt state will not be loaded.)strremoveprefixr   existshasattrr$   r   GPTModel
MambaModelr4   configgradient_accumulation_fusionr   warningr3   )r5   r6   modelopt_state_pathr-   r-   r.   r   g   s   
   TF
model_pathtensor_model_parallel_sizepipeline_model_parallel_size"num_layers_in_first_pipeline_stage!num_layers_in_last_pipeline_stageexpert_model_parallel_sizedevices	num_nodesinference_onlytokenizer_pathlegacy_ckptstrategy_kwargstrainer_kwargsmodel_config_overridesc                 C   s  |du ri }|du ri }|du ri }t d|  d |r6tjd|||tjdddd|
r.tjndd	|}ntjd|||tj|
rCtjndd|}tjd||d|tj	d	tjdd
dd|}tj
jt| dd}t|j | D ]\}}t d| d|  t|j|| qs|r|`|r||j_|r||j_d}|	rddlm} ||	}t| ||| d|j_t d| d ||fS )a  Loads a GPT model from a NeMo 2.0 checkpoint using modelopt layer spec.

    Args:
        model_path (str): Path to the NeMo checkpoint.
        tensor_model_parallel_size (int): Size of the tensor model parallelism.
        pipeline_model_parallel_size (int): Size of the pipeline model parallelism.
        num_layers_in_first_pipeline_stage (int): Number of layers in the first pipeline stage.
        num_layers_in_last_pipeline_stage (int): Number of layers in the last pipeline stage.
        devices (int): Number of devices on each node.
        num_nodes (int): Number of nodes being used.
        inference_only (bool): If True, loads the model for inference only w/o initializing the optimizer.
        tokenizer_path (Optional[str]): Path to the tokenizer if not using model's tokenizer.
        legacy_ckpt (bool): If True, allow loading ckpt saved with older version of TE.
        strategy_kwargs (Optional[dict]): Additional keyword arguments for nl.MegatronStrategy.
        trainer_kwargs (Optional[dict]): Additional keyword arguments for nl.Trainer.
        model_config_overrides (Optional[dict]): keyword arguments to override model config.

    Returns:
        Union[llm.GPTModel, llm.MambaModel]: The loaded model with the specified configuration.
    NzLoading model from z with modelopt layer spec...Fpytorch)	rG   rH   rK   pipeline_dtypeckpt_load_optimizerckpt_parallel_save_optimsetup_optimizersddpckpt_load_strictness)rG   rH   rK   rU   rZ   gpuz
bf16-mixedT)	precisionparams_dtypeautocast_enabledgrad_reduce_in_fp32)rL   rM   acceleratorstrategypluginsr5   )r6   subpathzOverriding model.config.z to r   )get_tokenizerzLoaded model: 
r-   )r   r#   nlMegatronStrategytorchbfloat16r
   LOG_ALLTrainerMegatronMixedPrecisionioload_contextr   r4   rA   itemssetattroptimrI   rJ   3nemo.collections.nlp.modules.common.tokenizer_utilsrd   r   ra   restore_config)rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   ra   trainerr5   kv	tokenizerrd   r-   r-   r.   r   w   sx   $
	

rt   z
pl.Trainerc                 C   s   t sdS |s|du rdS t|jjd|j}|sdS t| }tj|r*t	
d dS t|dr3| nt  tj|gt|dd W d   n1 sMw   Y  tj|r]t	
d t|dre|jS dS )	z
    Restore ModelOpt state from checkpoint.

    Args:
        model (nn.Module): The model to restore the state to.
        path (str): The path to the checkpoint.
        trainer (pl.Trainer): The trainer object, in case path not provided.
    Nr6   zAModel Optimizer state already restored from checkpoint. Skipping.hide_teacher_modelFr7   z/Restored Model Optimizer state from checkpoint.hide_loss_modules)HAVE_MODELOPTgetattrra   rs   	ckpt_pathr   mtoModeloptStateManageris_convertedr   r#   r>   rx   r   rb   restore_sharded_modelopt_stater   ry   )r5   r6   rt   
core_modelr-   r-   r.   restore_modelopt_state   s.   



r   r   checkpoint_ior   c                 C   s   t sdS t| }tj|sdS t|tr|jn|}t|ddr+t	
d |jdd t|dr4| nt  tjj|gt|dd|jd	 W d   n1 sQw   Y  t	d
 dS )a  
    Save ModelOpt state to checkpoint.

    Args:
        model (nn.Module): The MegatronParallel model to save the state from.
        path (str): The path to the checkpoint.
        checkpoint_io (CheckpointIO): The checkpoint IO object from MegatronStrategy.
    N
async_saveFzCModel-Optimizer library in use. Async checkpoint saving is blocked.T)blockingrx   r7   )sharded_strategyz,Saved Model-Optimizer state into checkpoint.)rz   r   r}   r~   r   r$   r	   r   r{   r   rC   maybe_finalize_save_checkpointr>   rx   r   rb   save_sharded_modelopt_stater   save_sharded_strategyr#   )r5   r6   r   r   ckpt_ior-   r-   r.   save_modelopt_state  s"   	

r   )rE   rE   NNrE   rE   rE   TNFNNN)NN)G__doc__
contextlibr   r&   r   typingr   r   r   r   lightning.pytorchrT   Lrh   torch.nnnn$lightning.pytorch.plugins.io.wrapperr	   +megatron.core.dist_checkpointing.validationr
   nemor   rf   nemo.collectionsr   #nemo.collections.llm.inference.baser   nemo.lightning.ckpt_utilsr   nemo.lightning.io.plr   
nemo.utilsr   nemo.utils.import_utilsr   nemo.utils.model_utilsr   r}   rz   _HAVE_TEr'   r   HAVE_MAMBA_SSMHAVE_CAUSAL_CONV1D6megatron.core.post_training.modelopt.mamba.model_specsr   pllightning.fabric.pluginsr    nemo.lightning.megatron_parallelr   __all__r%   r/   r0   r4   LightningModuler;   r   intbooldicttupler?   r@   rk   r   Moduler   r   r-   r-   r-   r.   <module>   s   
	

l
)